From a6d01009e396a9f30c5cbacf6c7fa182cf3d53bd Mon Sep 17 00:00:00 2001
From: Maximilian Attems
Date: Mon, 21 Jul 2008 14:07:51 +0000
Subject: [PATCH] add openvz patch enabled for amd64.

TODO:
- i386, ia64
- userspace depends
- proper desc

svn path=/dists/trunk/linux-2.6/; revision=11867
---
 debian/changelog                              |     1 +
 debian/config/defines                         |     2 +-
 debian/copyright                              |    12 +
 .../patches/features/all/openvz/openvz.patch  | 82828 ++++++++++++++++
 debian/patches/series/1~experimental.1-extra  |     1 +
 5 files changed, 82843 insertions(+), 1 deletion(-)
 create mode 100644 debian/patches/features/all/openvz/openvz.patch

diff --git a/debian/changelog b/debian/changelog
index 8e4b9553a..10351f426 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -70,6 +70,7 @@ linux-2.6 (2.6.26-1~experimental.1) UNRELEASED; urgency=low
   * Enable BLK_DEV_BSG for SG v4 support.
   * [amd64] Enable default disabled memtest boot param.
   * topconfig: Enable PATA_SIS instead of SATA_SIS. (closes: #485609)
+  * Add OpenVZ container flavour for amd64. (closes: #392015)
 
   [ Martin Michlmayr ]
   * [arm/orion5x] Update the config to reflect upstream renaming this
diff --git a/debian/config/defines b/debian/config/defines
index b25f3d2c9..74c8f5e42 100644
--- a/debian/config/defines
+++ b/debian/config/defines
@@ -25,7 +25,7 @@ featuresets:
  xen-vserver
 
 [featureset-openvz_base]
-enabled: false
+enabled: true
 
 [featureset-vserver_base]
 enabled: false
diff --git a/debian/copyright b/debian/copyright
index 169fdd53c..9f4f66aac 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -62,3 +62,15 @@ Xen is distributed under the following license:
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 DEALINGS IN THE SOFTWARE.
+
+The openvz patch was obtained from:
+  git://git.openvz.org/pub/linux-2.6.26-openvz
+
+OpenVZ is distributed under the GPL v2 or later license, with this notice:
+
+Nothing in this license should be construed as a grant by SWsoft of any rights
+beyond the rights specified in the GNU General Public License, and nothing in
+this license should be construed as a waiver by SWsoft of its patent, copyright
+and/or trademark rights, beyond the waiver required by the GNU General Public
+License. This license is expressly inapplicable to any product that is not
+within the scope of the GNU General Public License
diff --git a/debian/patches/features/all/openvz/openvz.patch b/debian/patches/features/all/openvz/openvz.patch
new file mode 100644
index 000000000..bf65ddb7b
--- /dev/null
+++ b/debian/patches/features/all/openvz/openvz.patch
@@ -0,0 +1,82828 @@
+commit dbe7093bdda52ce46fb80f013e6937dddb03980c
+Author: Vitaliy Gusev
+Date:   Thu Jul 17 18:49:45 2008 +0400
+
+    conntracks: register nf_conntrack_expect in each namespace
+
+    Otherwise - warning:
+
+    proc_dir_entry 'nf_conntrack_expect' already registered
+    Pid: 3955, comm: vzctl Not tainted 2.6.26 #130
+
+    Call Trace:
+     [] ? idr_get_new+0x13/0x33
+     [] proc_register+0x11b/0x151
+     [] proc_create_data+0x86/0x9f
+     [] proc_net_fops_create+0x18/0x1a
+     [] :nf_conntrack:nf_conntrack_expect_init+0x11e/0x17c
+     [] :nf_conntrack:nf_conntrack_init+0x26d/0x31c
+
+    Signed-off-by: Vitaliy Gusev
+    Signed-off-by: Pavel Emelyanov
+
+commit 9429b536e6ed239b7c1752ba611482c7ed49205f
+Author: Vitaliy Gusev
+Date:   Thu Jul 17 16:24:29 2008 +0400
+
+    Adjust Kconfig options to fix arbitrary compilation errors
+
+    xemul: Some options are incompatible or are implemented in other way in OpenVZ
+    model, so we disable them.
+
+    Signed-off-by: Vitaliy Gusev
+    Signed-off-by: Pavel Emelyanov
+
+commit 4b58555e7bf62369b6cc065343cba1df8b085c2d
+Author: Vitaliy Gusev
+Date:   Thu Jul 17 16:23:05 2008 +0400
+
+    conntracks: make nf_conntrack_proto_icmp compile and work
+
+    Signed-off-by: Vitaliy Gusev
+    Signed-off-by: Pavel Emelyanov
+
+commit 3fc2b5e403891efbb88887372e6764a4ee21c2c3
+Author: Vitaliy Gusev
+Date:   Thu Jul 17 16:22:13 2008 +0400
+
+    conntracks: make nf_conntrack_l3proto_ipv4_compat compile and work
+
+    Signed-off-by: Vitaliy Gusev
+    Signed-off-by: Pavel Emelyanov
+
+commit 3c11bdc8d0005a659a72fc0805093a7c0a83632b
+Author: Vitaliy Gusev
+Date:   Thu Jul 17 16:21:28 2008 +0400
+
+    conntracks: make nf_conntrack_l3proto_ipv4 part compile and work
+
+    Main problem - poor sysctl tables registration.
+
+    Signed-off-by: Vitaliy Gusev
+    Signed-off-by: Pavel Emelyanov
+
+commit 2655d79febc106ac47c9bf925e23d99dadb52072
+Author: Vitaliy Gusev
+Date:   Thu Jul 17 16:20:31 2008 +0400
+
+    netfilter: nf_conntrack_proto_fini() must be called before generic_sysctl_cleanup
+
+    Signed-off-by: Vitaliy Gusev
+    Signed-off-by: Pavel Emelyanov
+
+commit 2ce6bb08051038cb5e11d9523b5e1b67df03022f
+Author: Vitaliy Gusev
+Date:   Thu Jul 17 16:19:58 2008 +0400
+
+    netfilter: fix forgotten call nf_conntrack_proto_init().
+
+    Signed-off-by: Vitaliy Gusev
+    Signed-off-by: Pavel Emelyanov
+
+commit 1f7c5382a62f5518c7f26bd83fbf68131dd30aee
+Author: Pavel Emelyanov
+Date:   Wed Jul 16 17:31:55 2008 +0400
+
+    Fix __d_path codeflow wrt vfsmnt_lock locking
+
+    Signed-off-by: Pavel Emelyanov
+
+commit aafb4cebb852b897a594675389328c829e7680ea
+Author: OpenVZ team
+Date:   Wed Jul 16 17:20:32 2008 +0400
+
+    Linux 2.6.26-ovz
+
+    Netfilter in container now works! Checkpointing now compiles!
+    These were good news. Bad news is that conntracks are still
+    broken, and checkpointing can be compiled *only*
+
+    Will fix it all later :)
+diff --git a/COPYING.SWsoft b/COPYING.SWsoft
+new file mode 100644
+index 0000000..059256d
+--- /dev/null
++++ b/COPYING.SWsoft
+@@ -0,0 +1,350 @@
++
++Nothing in this license should be construed as a grant by SWsoft of any rights
++beyond the rights specified in the GNU General Public License, and nothing in
++this license should be construed as a waiver by SWsoft of its patent, copyright
++and/or trademark rights, beyond the waiver required by the GNU General Public
++License. This license is expressly inapplicable to any product that is not
++within the scope of the GNU General Public License
++
++----------------------------------------
++
++		    GNU GENERAL PUBLIC LICENSE
++		       Version 2, June 1991
++
++ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
++     59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ Everyone is permitted to copy and distribute verbatim copies
++ of this license document, but changing it is not allowed.
++
++			    Preamble
++
++  The licenses for most software are designed to take away your
++freedom to share and change it. By contrast, the GNU General Public
++License is intended to guarantee your freedom to share and change free
++software--to make sure the software is free for all its users. This
++General Public License applies to most of the Free Software
++Foundation's software and to any other program whose authors commit to
++using it. (Some other Free Software Foundation software is covered by
++the GNU Library General Public License instead.) You can apply it to
++your programs, too.
++
++  When we speak of free software, we are referring to freedom, not
++price.
Our General Public Licenses are designed to make sure that you ++have the freedom to distribute copies of free software (and charge for ++this service if you wish), that you receive source code or can get it ++if you want it, that you can change the software or use pieces of it ++in new free programs; and that you know you can do these things. ++ ++ To protect your rights, we need to make restrictions that forbid ++anyone to deny you these rights or to ask you to surrender the rights. ++These restrictions translate to certain responsibilities for you if you ++distribute copies of the software, or if you modify it. ++ ++ For example, if you distribute copies of such a program, whether ++gratis or for a fee, you must give the recipients all the rights that ++you have. You must make sure that they, too, receive or can get the ++source code. And you must show them these terms so they know their ++rights. ++ ++ We protect your rights with two steps: (1) copyright the software, and ++(2) offer you this license which gives you legal permission to copy, ++distribute and/or modify the software. ++ ++ Also, for each author's protection and ours, we want to make certain ++that everyone understands that there is no warranty for this free ++software. If the software is modified by someone else and passed on, we ++want its recipients to know that what they have is not the original, so ++that any problems introduced by others will not reflect on the original ++authors' reputations. ++ ++ Finally, any free program is threatened constantly by software ++patents. We wish to avoid the danger that redistributors of a free ++program will individually obtain patent licenses, in effect making the ++program proprietary. To prevent this, we have made it clear that any ++patent must be licensed for everyone's free use or not licensed at all. ++ ++ The precise terms and conditions for copying, distribution and ++modification follow. ++ ++ GNU GENERAL PUBLIC LICENSE ++ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION ++ ++ 0. This License applies to any program or other work which contains ++a notice placed by the copyright holder saying it may be distributed ++under the terms of this General Public License. The "Program", below, ++refers to any such program or work, and a "work based on the Program" ++means either the Program or any derivative work under copyright law: ++that is to say, a work containing the Program or a portion of it, ++either verbatim or with modifications and/or translated into another ++language. (Hereinafter, translation is included without limitation in ++the term "modification".) Each licensee is addressed as "you". ++ ++Activities other than copying, distribution and modification are not ++covered by this License; they are outside its scope. The act of ++running the Program is not restricted, and the output from the Program ++is covered only if its contents constitute a work based on the ++Program (independent of having been made by running the Program). ++Whether that is true depends on what the Program does. ++ ++ 1. You may copy and distribute verbatim copies of the Program's ++source code as you receive it, in any medium, provided that you ++conspicuously and appropriately publish on each copy an appropriate ++copyright notice and disclaimer of warranty; keep intact all the ++notices that refer to this License and to the absence of any warranty; ++and give any other recipients of the Program a copy of this License ++along with the Program. 
++ ++You may charge a fee for the physical act of transferring a copy, and ++you may at your option offer warranty protection in exchange for a fee. ++ ++ 2. You may modify your copy or copies of the Program or any portion ++of it, thus forming a work based on the Program, and copy and ++distribute such modifications or work under the terms of Section 1 ++above, provided that you also meet all of these conditions: ++ ++ a) You must cause the modified files to carry prominent notices ++ stating that you changed the files and the date of any change. ++ ++ b) You must cause any work that you distribute or publish, that in ++ whole or in part contains or is derived from the Program or any ++ part thereof, to be licensed as a whole at no charge to all third ++ parties under the terms of this License. ++ ++ c) If the modified program normally reads commands interactively ++ when run, you must cause it, when started running for such ++ interactive use in the most ordinary way, to print or display an ++ announcement including an appropriate copyright notice and a ++ notice that there is no warranty (or else, saying that you provide ++ a warranty) and that users may redistribute the program under ++ these conditions, and telling the user how to view a copy of this ++ License. (Exception: if the Program itself is interactive but ++ does not normally print such an announcement, your work based on ++ the Program is not required to print an announcement.) ++ ++These requirements apply to the modified work as a whole. If ++identifiable sections of that work are not derived from the Program, ++and can be reasonably considered independent and separate works in ++themselves, then this License, and its terms, do not apply to those ++sections when you distribute them as separate works. But when you ++distribute the same sections as part of a whole which is a work based ++on the Program, the distribution of the whole must be on the terms of ++this License, whose permissions for other licensees extend to the ++entire whole, and thus to each and every part regardless of who wrote it. ++ ++Thus, it is not the intent of this section to claim rights or contest ++your rights to work written entirely by you; rather, the intent is to ++exercise the right to control the distribution of derivative or ++collective works based on the Program. ++ ++In addition, mere aggregation of another work not based on the Program ++with the Program (or with a work based on the Program) on a volume of ++a storage or distribution medium does not bring the other work under ++the scope of this License. ++ ++ 3. You may copy and distribute the Program (or a work based on it, ++under Section 2) in object code or executable form under the terms of ++Sections 1 and 2 above provided that you also do one of the following: ++ ++ a) Accompany it with the complete corresponding machine-readable ++ source code, which must be distributed under the terms of Sections ++ 1 and 2 above on a medium customarily used for software interchange; or, ++ ++ b) Accompany it with a written offer, valid for at least three ++ years, to give any third party, for a charge no more than your ++ cost of physically performing source distribution, a complete ++ machine-readable copy of the corresponding source code, to be ++ distributed under the terms of Sections 1 and 2 above on a medium ++ customarily used for software interchange; or, ++ ++ c) Accompany it with the information you received as to the offer ++ to distribute corresponding source code. 
(This alternative is ++ allowed only for noncommercial distribution and only if you ++ received the program in object code or executable form with such ++ an offer, in accord with Subsection b above.) ++ ++The source code for a work means the preferred form of the work for ++making modifications to it. For an executable work, complete source ++code means all the source code for all modules it contains, plus any ++associated interface definition files, plus the scripts used to ++control compilation and installation of the executable. However, as a ++special exception, the source code distributed need not include ++anything that is normally distributed (in either source or binary ++form) with the major components (compiler, kernel, and so on) of the ++operating system on which the executable runs, unless that component ++itself accompanies the executable. ++ ++If distribution of executable or object code is made by offering ++access to copy from a designated place, then offering equivalent ++access to copy the source code from the same place counts as ++distribution of the source code, even though third parties are not ++compelled to copy the source along with the object code. ++ ++ 4. You may not copy, modify, sublicense, or distribute the Program ++except as expressly provided under this License. Any attempt ++otherwise to copy, modify, sublicense or distribute the Program is ++void, and will automatically terminate your rights under this License. ++However, parties who have received copies, or rights, from you under ++this License will not have their licenses terminated so long as such ++parties remain in full compliance. ++ ++ 5. You are not required to accept this License, since you have not ++signed it. However, nothing else grants you permission to modify or ++distribute the Program or its derivative works. These actions are ++prohibited by law if you do not accept this License. Therefore, by ++modifying or distributing the Program (or any work based on the ++Program), you indicate your acceptance of this License to do so, and ++all its terms and conditions for copying, distributing or modifying ++the Program or works based on it. ++ ++ 6. Each time you redistribute the Program (or any work based on the ++Program), the recipient automatically receives a license from the ++original licensor to copy, distribute or modify the Program subject to ++these terms and conditions. You may not impose any further ++restrictions on the recipients' exercise of the rights granted herein. ++You are not responsible for enforcing compliance by third parties to ++this License. ++ ++ 7. If, as a consequence of a court judgment or allegation of patent ++infringement or for any other reason (not limited to patent issues), ++conditions are imposed on you (whether by court order, agreement or ++otherwise) that contradict the conditions of this License, they do not ++excuse you from the conditions of this License. If you cannot ++distribute so as to satisfy simultaneously your obligations under this ++License and any other pertinent obligations, then as a consequence you ++may not distribute the Program at all. For example, if a patent ++license would not permit royalty-free redistribution of the Program by ++all those who receive copies directly or indirectly through you, then ++the only way you could satisfy both it and this License would be to ++refrain entirely from distribution of the Program. 
++ ++If any portion of this section is held invalid or unenforceable under ++any particular circumstance, the balance of the section is intended to ++apply and the section as a whole is intended to apply in other ++circumstances. ++ ++It is not the purpose of this section to induce you to infringe any ++patents or other property right claims or to contest validity of any ++such claims; this section has the sole purpose of protecting the ++integrity of the free software distribution system, which is ++implemented by public license practices. Many people have made ++generous contributions to the wide range of software distributed ++through that system in reliance on consistent application of that ++system; it is up to the author/donor to decide if he or she is willing ++to distribute software through any other system and a licensee cannot ++impose that choice. ++ ++This section is intended to make thoroughly clear what is believed to ++be a consequence of the rest of this License. ++ ++ 8. If the distribution and/or use of the Program is restricted in ++certain countries either by patents or by copyrighted interfaces, the ++original copyright holder who places the Program under this License ++may add an explicit geographical distribution limitation excluding ++those countries, so that distribution is permitted only in or among ++countries not thus excluded. In such case, this License incorporates ++the limitation as if written in the body of this License. ++ ++ 9. The Free Software Foundation may publish revised and/or new versions ++of the General Public License from time to time. Such new versions will ++be similar in spirit to the present version, but may differ in detail to ++address new problems or concerns. ++ ++Each version is given a distinguishing version number. If the Program ++specifies a version number of this License which applies to it and "any ++later version", you have the option of following the terms and conditions ++either of that version or of any later version published by the Free ++Software Foundation. If the Program does not specify a version number of ++this License, you may choose any version ever published by the Free Software ++Foundation. ++ ++ 10. If you wish to incorporate parts of the Program into other free ++programs whose distribution conditions are different, write to the author ++to ask for permission. For software which is copyrighted by the Free ++Software Foundation, write to the Free Software Foundation; we sometimes ++make exceptions for this. Our decision will be guided by the two goals ++of preserving the free status of all derivatives of our free software and ++of promoting the sharing and reuse of software generally. ++ ++ NO WARRANTY ++ ++ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY ++FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN ++OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES ++PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED ++OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS ++TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE ++PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, ++REPAIR OR CORRECTION. ++ ++ 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING ++WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR ++REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, ++INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING ++OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED ++TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY ++YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER ++PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE ++POSSIBILITY OF SUCH DAMAGES. ++ ++ END OF TERMS AND CONDITIONS ++ ++ How to Apply These Terms to Your New Programs ++ ++ If you develop a new program, and you want it to be of the greatest ++possible use to the public, the best way to achieve this is to make it ++free software which everyone can redistribute and change under these terms. ++ ++ To do so, attach the following notices to the program. It is safest ++to attach them to the start of each source file to most effectively ++convey the exclusion of warranty; and each file should have at least ++the "copyright" line and a pointer to where the full notice is found. ++ ++ ++ Copyright (C) ++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 2 of the License, or ++ (at your option) any later version. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software ++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ ++ ++Also add information on how to contact you by electronic and paper mail. ++ ++If the program is interactive, make it output a short notice like this ++when it starts in an interactive mode: ++ ++ Gnomovision version 69, Copyright (C) year name of author ++ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. ++ This is free software, and you are welcome to redistribute it ++ under certain conditions; type `show c' for details. ++ ++The hypothetical commands `show w' and `show c' should show the appropriate ++parts of the General Public License. Of course, the commands you use may ++be called something other than `show w' and `show c'; they could even be ++mouse-clicks or menu items--whatever suits your program. ++ ++You should also get your employer (if you work as a programmer) or your ++school, if any, to sign a "copyright disclaimer" for the program, if ++necessary. Here is a sample; alter the names: ++ ++ Yoyodyne, Inc., hereby disclaims all copyright interest in the program ++ `Gnomovision' (which makes passes at compilers) written by James Hacker. ++ ++ , 1 April 1989 ++ Ty Coon, President of Vice ++ ++This General Public License does not permit incorporating your program into ++proprietary programs. If your program is a subroutine library, you may ++consider it more useful to permit linking proprietary applications with the ++library. If this is what you want to do, use the GNU Library General ++Public License instead of this License. 
+diff --git a/Makefile b/Makefile +index e3c5eb6..62dc374 100644 +--- a/Makefile ++++ b/Makefile +@@ -2,6 +2,7 @@ VERSION = 2 + PATCHLEVEL = 6 + SUBLEVEL = 26 + EXTRAVERSION = ++VZVERSION = 036test001 + NAME = Rotary Wombat + + # *DOCUMENTATION* +@@ -339,7 +340,7 @@ KBUILD_AFLAGS := -D__ASSEMBLY__ + KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null) + KERNELVERSION = $(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) + +-export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION ++export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION VZVERSION + export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC + export CPP AR NM STRIP OBJCOPY OBJDUMP MAKE AWK GENKSYMS PERL UTS_MACHINE + export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS +@@ -973,7 +974,8 @@ define filechk_utsrelease.h + echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2; \ + exit 1; \ + fi; \ +- (echo \#define UTS_RELEASE \"$(KERNELRELEASE)\";) ++ (echo \#define UTS_RELEASE \"$(KERNELRELEASE)\"; \ ++ echo \#define VZVERSION \"$(VZVERSION)\";) + endef + + define filechk_version.h +diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c +index eefae1d..0c3a894 100644 +--- a/arch/arm/kernel/smp.c ++++ b/arch/arm/kernel/smp.c +@@ -201,7 +201,7 @@ int __cpuexit __cpu_disable(void) + local_flush_tlb_all(); + + read_lock(&tasklist_lock); +- for_each_process(p) { ++ for_each_process_all(p) { + if (p->mm) + cpu_clear(cpu, p->mm->cpu_vm_mask); + } +diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig +index 16be414..826b220 100644 +--- a/arch/ia64/Kconfig ++++ b/arch/ia64/Kconfig +@@ -611,6 +611,7 @@ source "arch/ia64/kvm/Kconfig" + + source "lib/Kconfig" + ++source "kernel/bc/Kconfig" + # + # Use the generic interrupt handling code in kernel/irq/: + # +@@ -638,6 +639,8 @@ source "arch/ia64/hp/sim/Kconfig" + + source "arch/ia64/Kconfig.debug" + ++source "kernel/Kconfig.openvz" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff --git a/arch/ia64/ia32/binfmt_elf32.c b/arch/ia64/ia32/binfmt_elf32.c +index 4f0c30c..067cb28 100644 +--- a/arch/ia64/ia32/binfmt_elf32.c ++++ b/arch/ia64/ia32/binfmt_elf32.c +@@ -17,6 +17,8 @@ + #include + #include + ++#include ++ + #include "ia32priv.h" + #include "elfcore32.h" + +@@ -132,6 +134,12 @@ ia64_elf32_init (struct pt_regs *regs) + up_write(¤t->mm->mmap_sem); + } + ++ if (ub_memory_charge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES * ++ IA32_LDT_ENTRY_SIZE), ++ VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, ++ NULL, UB_SOFT)) ++ goto skip; ++ + /* + * Install LDT as anonymous memory. This gives us all-zero segment descriptors + * until a task modifies them via modify_ldt(). +@@ -152,7 +160,12 @@ ia64_elf32_init (struct pt_regs *regs) + } + } + up_write(¤t->mm->mmap_sem); +- } ++ } else ++ ub_memory_uncharge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES * ++ IA32_LDT_ENTRY_SIZE), ++ VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, NULL); ++ ++skip: + + ia64_psr(regs)->ac = 0; /* turn off alignment checking */ + regs->loadrs = 0; +diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S +index ca2bb95..e2fa9a0 100644 +--- a/arch/ia64/kernel/entry.S ++++ b/arch/ia64/kernel/entry.S +@@ -504,6 +504,74 @@ GLOBAL_ENTRY(clone) + br.ret.sptk.many rp + END(clone) + ++GLOBAL_ENTRY(ia64_ret_from_resume) ++ PT_REGS_UNWIND_INFO(0) ++{ /* ++ * Some versions of gas generate bad unwind info if the first instruction of a ++ * procedure doesn't go into the first slot of a bundle. This is a workaround. 
++ */ ++ nop.m 0 ++ nop.i 0 ++ /* ++ * We need to call schedule_tail() to complete the scheduling process. ++ * Called by ia64_switch_to() after do_fork()->copy_thread(). r8 contains the ++ * address of the previously executing task. ++ */ ++ br.call.sptk.many rp=ia64_invoke_schedule_tail ++} ++ br.call.sptk.many rp=ia64_invoke_resume ++ ;; ++ adds sp=256,sp ++ ;; ++ /* Return from interrupt, we are all right. */ ++(pNonSys) br ia64_leave_kernel ++ ;; ++ /* Tricky part follows. We must restore correct syscall ++ * register frame before doing normal syscall exit job. ++ * It would the most natural to keep sw->ar_pfs correct, ++ * then we would be here with correct register frame. ++ * Unfortunately, IA64 has a feature. Registers were in backstore ++ * after context switch, and the first br.ret does _NOT_ fetch ++ * output registers. ++ * It is quite natural: look, if caller has output regs in his ++ * frame, they should be consumed. If callee does not have (enough of) ++ * input/local registers (1 in this case), the situation is unusual. ++ * Practical evidence: they are filled with something random crap. ++ * The only case, when this is essential in mainstream kernel ++ * is sys_clone(). The result is that new process gets some kernel ++ * information in its register frame. Which is a security problem, btw. ++ * ++ * So, we set sw->ar_pfs to pretend the whole frame is of local ++ * regs. And we have to repartition the frame it manually, using ++ * information from pt->cr_ifs (the register is invalid in this ++ * case, but it holds correct pfm). ++ */ ++ adds r3=PT(CR_IFS)+16,sp ++ ;; ++ ld8 r2=[r3],-(PT(CR_IFS)-PT(R8)) ++ ;; ++ extr.u r2=r2,0,37 ++ mov r8=ar.ec ++ ;; ++ extr.u r8=r8,0,5 ++ ;; ++ shl r8=r8,52 ++ ;; ++ or r2=r2,r8 ++ ;; ++ mov ar.pfs=r2 ++ ;; ++ movl r2=ia64_leave_syscall ++ ;; ++ mov rp=r2 ++ /* Plus, we should fetch r8 and r10 from pt_regs. Something else? */ ++ ld8 r8=[r3],PT(R10)-PT(R8) ++ ;; ++ ld8 r10=[r3] ++ ;; ++ br.ret.sptk.many rp ++END(ia64_ret_from_resume) ++ + /* + * Invoke a system call, but do some tracing before and after the call. + * We MUST preserve the current register frame throughout this routine +@@ -1236,6 +1304,34 @@ GLOBAL_ENTRY(ia64_invoke_schedule_tail) + br.ret.sptk.many rp + END(ia64_invoke_schedule_tail) + ++GLOBAL_ENTRY(ia64_invoke_resume) ++ alloc loc1=ar.pfs,0,3,1,0 ++ mov loc0=rp ++ adds out0=16,sp ++ ;; ++ ld8 r8=[out0] ++ ;; ++ cmp.eq p6,p0=r8,r0 ++ ;; ++(p6) br.cond.sptk 1f ++ ;; ++ mov loc2=gp ++ ;; ++ ld8 r10=[r8],8 ++ ;; ++ ld8 gp=[r8] ++ ;; ++ mov b7=r10 ++ ;; ++ br.call.sptk.many rp=b7 ++ ;; ++ mov gp=loc2 ++1: ++ mov ar.pfs=loc1 ++ mov rp=loc0 ++ br.ret.sptk.many rp ++END(ia64_invoke_resume) ++ + /* + * Setup stack and call do_notify_resume_user(), keeping interrupts + * disabled. 
+@@ -1664,4 +1760,17 @@ sys_call_table: + data8 sys_timerfd_settime + data8 sys_timerfd_gettime + ++.rept 1499-1313 ++ data8 sys_ni_syscall ++.endr ++ data8 sys_fairsched_vcpus ++ data8 sys_fairsched_mknod // 1500 ++ data8 sys_fairsched_rmnod ++ data8 sys_fairsched_chwt ++ data8 sys_fairsched_mvpr ++ data8 sys_fairsched_rate ++ data8 sys_getluid // 1505 ++ data8 sys_setluid ++ data8 sys_setublimit ++ data8 sys_ubstat + .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls +diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S +index c1625c7..634b102 100644 +--- a/arch/ia64/kernel/fsys.S ++++ b/arch/ia64/kernel/fsys.S +@@ -90,53 +90,6 @@ ENTRY(fsys_getpid) + FSYS_RETURN + END(fsys_getpid) + +-ENTRY(fsys_getppid) +- .prologue +- .altrp b6 +- .body +- add r17=IA64_TASK_GROUP_LEADER_OFFSET,r16 +- ;; +- ld8 r17=[r17] // r17 = current->group_leader +- add r9=TI_FLAGS+IA64_TASK_SIZE,r16 +- ;; +- +- ld4 r9=[r9] +- add r17=IA64_TASK_REAL_PARENT_OFFSET,r17 // r17 = ¤t->group_leader->real_parent +- ;; +- and r9=TIF_ALLWORK_MASK,r9 +- +-1: ld8 r18=[r17] // r18 = current->group_leader->real_parent +- ;; +- cmp.ne p8,p0=0,r9 +- add r8=IA64_TASK_TGID_OFFSET,r18 // r8 = ¤t->group_leader->real_parent->tgid +- ;; +- +- /* +- * The .acq is needed to ensure that the read of tgid has returned its data before +- * we re-check "real_parent". +- */ +- ld4.acq r8=[r8] // r8 = current->group_leader->real_parent->tgid +-#ifdef CONFIG_SMP +- /* +- * Re-read current->group_leader->real_parent. +- */ +- ld8 r19=[r17] // r19 = current->group_leader->real_parent +-(p8) br.spnt.many fsys_fallback_syscall +- ;; +- cmp.ne p6,p0=r18,r19 // did real_parent change? +- mov r19=0 // i must not leak kernel bits... +-(p6) br.cond.spnt.few 1b // yes -> redo the read of tgid and the check +- ;; +- mov r17=0 // i must not leak kernel bits... +- mov r18=0 // i must not leak kernel bits... +-#else +- mov r17=0 // i must not leak kernel bits... +- mov r18=0 // i must not leak kernel bits... +- mov r19=0 // i must not leak kernel bits... 
+-#endif +- FSYS_RETURN +-END(fsys_getppid) +- + ENTRY(fsys_set_tid_address) + .prologue + .altrp b6 +@@ -767,7 +720,7 @@ fsyscall_table: + data8 0 // chown + data8 0 // lseek // 1040 + data8 fsys_getpid // getpid +- data8 fsys_getppid // getppid ++ data8 0 // getppid + data8 0 // mount + data8 0 // umount + data8 0 // setuid // 1045 +diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S +index ddeab4e..f9b6281 100644 +--- a/arch/ia64/kernel/head.S ++++ b/arch/ia64/kernel/head.S +@@ -1031,7 +1031,7 @@ GLOBAL_ENTRY(start_kernel_thread) + mov out1 = r11;; + br.call.sptk.many rp = kernel_thread_helper;; + mov out0 = r8 +- br.call.sptk.many rp = sys_exit;; ++ br.call.sptk.many rp = do_exit;; + 1: br.sptk.few 1b // not reached + END(start_kernel_thread) + +diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c +index 6da1f20..24950d6 100644 +--- a/arch/ia64/kernel/ia64_ksyms.c ++++ b/arch/ia64/kernel/ia64_ksyms.c +@@ -75,6 +75,8 @@ EXPORT_SYMBOL(xor_ia64_4); + EXPORT_SYMBOL(xor_ia64_5); + #endif + ++EXPORT_SYMBOL(empty_zero_page); ++ + #include + EXPORT_SYMBOL(ia64_pal_call_phys_stacked); + EXPORT_SYMBOL(ia64_pal_call_phys_static); +diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c +index 705176b..b00c3af 100644 +--- a/arch/ia64/kernel/mca.c ++++ b/arch/ia64/kernel/mca.c +@@ -1608,10 +1608,10 @@ default_monarch_init_process(struct notifier_block *self, unsigned long val, voi + } + printk("\n\n"); + if (read_trylock(&tasklist_lock)) { +- do_each_thread (g, t) { ++ do_each_thread_all (g, t) { + printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm); + show_stack(t, NULL); +- } while_each_thread (g, t); ++ } while_each_thread_all (g, t); + read_unlock(&tasklist_lock); + } + /* FIXME: This will not restore zapped printk locks. */ +diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c +index 7714a97..49fb204 100644 +--- a/arch/ia64/kernel/perfmon.c ++++ b/arch/ia64/kernel/perfmon.c +@@ -4176,12 +4176,12 @@ pfm_check_task_exist(pfm_context_t *ctx) + + read_lock(&tasklist_lock); + +- do_each_thread (g, t) { ++ do_each_thread_ve (g, t) { + if (t->thread.pfm_context == ctx) { + ret = 0; + goto out; + } +- } while_each_thread (g, t); ++ } while_each_thread_ve (g, t); + out: + read_unlock(&tasklist_lock); + +diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c +index a3a34b4..54179a7 100644 +--- a/arch/ia64/kernel/process.c ++++ b/arch/ia64/kernel/process.c +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -387,6 +388,9 @@ ia64_load_extra (struct task_struct *task) + #endif + } + ++extern char ia64_ret_from_resume; ++EXPORT_SYMBOL(ia64_ret_from_resume); ++ + /* + * Copy the state of an ia-64 thread. 
+ * +@@ -460,7 +464,6 @@ copy_thread (int nr, unsigned long clone_flags, + child_ptregs->r12 = user_stack_base + user_stack_size - 16; + child_ptregs->ar_bspstore = user_stack_base; + child_ptregs->ar_rnat = 0; +- child_ptregs->loadrs = 0; + } + } else { + /* +@@ -672,16 +675,25 @@ out: + return error; + } + ++extern void start_kernel_thread (void); ++EXPORT_SYMBOL(start_kernel_thread); ++ + pid_t + kernel_thread (int (*fn)(void *), void *arg, unsigned long flags) + { +- extern void start_kernel_thread (void); + unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; + struct { + struct switch_stack sw; + struct pt_regs pt; + } regs; + ++ /* Don't allow kernel_thread() inside VE */ ++ if (!ve_allow_kthreads && !ve_is_super(get_exec_env())) { ++ printk("kernel_thread call inside container\n"); ++ dump_stack(); ++ return -EPERM; ++ } ++ + memset(®s, 0, sizeof(regs)); + regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ + regs.pt.r1 = helper_fptr[1]; /* set GP */ +diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c +index 2a9943b..e44debf 100644 +--- a/arch/ia64/kernel/ptrace.c ++++ b/arch/ia64/kernel/ptrace.c +@@ -10,6 +10,7 @@ + * Derived from the x86 and Alpha versions. + */ + #include ++#include + #include + #include + #include +@@ -105,6 +106,8 @@ ia64_get_scratch_nat_bits (struct pt_regs *pt, unsigned long scratch_unat) + + # undef GET_BITS + } ++EXPORT_SYMBOL(ia64_get_scratch_nat_bits); ++EXPORT_SYMBOL(__ia64_save_fpu); + + /* + * Set the NaT bits for the scratch registers according to NAT and +@@ -461,6 +464,7 @@ ia64_peek (struct task_struct *child, struct switch_stack *child_stack, + *val = ret; + return 0; + } ++EXPORT_SYMBOL(ia64_peek); + + long + ia64_poke (struct task_struct *child, struct switch_stack *child_stack, +@@ -525,6 +529,7 @@ ia64_get_user_rbs_end (struct task_struct *child, struct pt_regs *pt, + *cfmp = cfm; + return (unsigned long) ia64_rse_skip_regs(bspstore, ndirty); + } ++EXPORT_SYMBOL(ia64_get_user_rbs_end); + + /* + * Synchronize (i.e, write) the RSE backing store living in kernel +@@ -820,20 +825,20 @@ access_nat_bits (struct task_struct *child, struct pt_regs *pt, + if (write_access) { + nat_bits = *data; + scratch_unat = ia64_put_scratch_nat_bits(pt, nat_bits); +- if (unw_set_ar(info, UNW_AR_UNAT, scratch_unat) < 0) { +- dprintk("ptrace: failed to set ar.unat\n"); +- return -1; +- } ++ if (info->pri_unat_loc) ++ *info->pri_unat_loc = scratch_unat; ++ else ++ info->sw->caller_unat = scratch_unat; + for (regnum = 4; regnum <= 7; ++regnum) { + unw_get_gr(info, regnum, &dummy, &nat); + unw_set_gr(info, regnum, dummy, + (nat_bits >> regnum) & 1); + } + } else { +- if (unw_get_ar(info, UNW_AR_UNAT, &scratch_unat) < 0) { +- dprintk("ptrace: failed to read ar.unat\n"); +- return -1; +- } ++ if (info->pri_unat_loc) ++ scratch_unat = *info->pri_unat_loc; ++ else ++ scratch_unat = info->sw->caller_unat; + nat_bits = ia64_get_scratch_nat_bits(pt, scratch_unat); + for (regnum = 4; regnum <= 7; ++regnum) { + unw_get_gr(info, regnum, &dummy, &nat); +diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c +index 19c5a78..cc6c4e6 100644 +--- a/arch/ia64/kernel/signal.c ++++ b/arch/ia64/kernel/signal.c +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -464,6 +465,12 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall) + if (!user_mode(&scr->pt)) + return; + ++ if (try_to_freeze() && !signal_pending(current)) { ++ if ((long) scr->pt.r10 != -1) ++ restart = 0; ++ goto 
no_signal; ++ } ++ + if (current_thread_info()->status & TS_RESTORE_SIGMASK) + oldset = ¤t->saved_sigmask; + else +@@ -519,8 +526,10 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall) + if (IS_IA32_PROCESS(&scr->pt)) { + scr->pt.r8 = scr->pt.r1; + scr->pt.cr_iip -= 2; +- } else ++ } else { + ia64_decrement_ip(&scr->pt); ++ scr->pt.r10 = 0; ++ } + restart = 0; /* don't restart twice if handle_signal() fails... */ + } + } +@@ -542,6 +551,7 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall) + } + + /* Did we come from a system call? */ ++no_signal: + if (restart) { + /* Restart the system call - no handlers present */ + if (errno == ERESTARTNOHAND || errno == ERESTARTSYS || errno == ERESTARTNOINTR +@@ -561,6 +571,7 @@ ia64_do_signal (struct sigscratch *scr, long in_syscall) + ia64_decrement_ip(&scr->pt); + if (errno == ERESTART_RESTARTBLOCK) + scr->pt.r15 = __NR_restart_syscall; ++ scr->pt.r10 = 0; + } + } + } +diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c +index 1eda194..e93e7d3 100644 +--- a/arch/ia64/kernel/sys_ia64.c ++++ b/arch/ia64/kernel/sys_ia64.c +@@ -204,7 +204,7 @@ do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, un + + /* Careful about overflows.. */ + len = PAGE_ALIGN(len); +- if (!len || len > TASK_SIZE) { ++ if (len > TASK_SIZE) { + addr = -EINVAL; + goto out; + } +diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c +index aad1b7b..9194bf5 100644 +--- a/arch/ia64/kernel/time.c ++++ b/arch/ia64/kernel/time.c +@@ -40,6 +40,8 @@ struct fsyscall_gtod_data_t fsyscall_gtod_data = { + struct itc_jitter_data_t itc_jitter_data; + + volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */ ++unsigned int cpu_khz; /* TSC clocks / usec, not used here */ ++EXPORT_SYMBOL(cpu_khz); + + #ifdef CONFIG_IA64_DEBUG_IRQ + +@@ -335,6 +337,8 @@ ia64_init_itm (void) + */ + clocksource_itc.rating = 50; + ++ cpu_khz = local_cpu_data->proc_freq / 1000; ++ + /* Setup the CPU local timer tick */ + ia64_cpu_local_tick(); + +diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c +index ff0e7c1..7288a9f 100644 +--- a/arch/ia64/kernel/unaligned.c ++++ b/arch/ia64/kernel/unaligned.c +@@ -1291,7 +1291,7 @@ within_logging_rate_limit (void) + { + static unsigned long count, last_time; + +- if (time_after(jiffies, last_time + 5 * HZ)) ++ if (time_after(jiffies, last_time + 60 * HZ)) + count = 0; + if (count < 5) { + last_time = jiffies; +diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c +index 23088be..da13815 100644 +--- a/arch/ia64/mm/fault.c ++++ b/arch/ia64/mm/fault.c +@@ -148,7 +148,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re + if ((vma->vm_flags & mask) != mask) + goto bad_area; + +- survive: + /* + * If for any reason at all we couldn't handle the fault, make + * sure we exit gracefully rather than endlessly redo the +@@ -276,13 +275,13 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re + + out_of_memory: + up_read(&mm->mmap_sem); +- if (is_global_init(current)) { +- yield(); +- down_read(&mm->mmap_sem); +- goto survive; ++ if (user_mode(regs)) { ++ /* ++ * 0-order allocation always success if something really ++ * fatal not happen: beancounter overdraft or OOM. 
++ */ ++ force_sig(SIGKILL, current); ++ return; + } +- printk(KERN_CRIT "VM: killing process %s\n", current->comm); +- if (user_mode(regs)) +- do_group_exit(SIGKILL); + goto no_context; + } +diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c +index 200100e..226b5cc 100644 +--- a/arch/ia64/mm/init.c ++++ b/arch/ia64/mm/init.c +@@ -37,6 +37,8 @@ + #include + #include + ++#include ++ + DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + + extern void ia64_tlb_init (void); +@@ -111,6 +113,10 @@ ia64_init_addr_space (void) + + ia64_set_rbs_bot(); + ++ if (ub_memory_charge(current->mm, PAGE_SIZE, VM_DATA_DEFAULT_FLAGS, ++ NULL, UB_SOFT)) ++ goto skip; ++ + /* + * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore + * the problem. When the process attempts to write to the register backing store +@@ -127,11 +133,16 @@ ia64_init_addr_space (void) + if (insert_vm_struct(current->mm, vma)) { + up_write(¤t->mm->mmap_sem); + kmem_cache_free(vm_area_cachep, vma); ++ ub_memory_uncharge(current->mm, PAGE_SIZE, ++ VM_DATA_DEFAULT_FLAGS, NULL); + return; + } + up_write(¤t->mm->mmap_sem); +- } ++ } else ++ ub_memory_uncharge(current->mm, PAGE_SIZE, ++ VM_DATA_DEFAULT_FLAGS, NULL); + ++skip: + /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ + if (!(current->personality & MMAP_PAGE_ZERO)) { + vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index 3934e26..f0f8abb 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -798,8 +798,12 @@ source "arch/powerpc/sysdev/qe_lib/Kconfig" + + source "lib/Kconfig" + ++source "kernel/bc/Kconfig" ++ + source "arch/powerpc/Kconfig.debug" + ++source "kernel/Kconfig.openvz" ++ + source "security/Kconfig" + + config KEYS_COMPAT +diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S +index 89aaaa6..05a57b3 100644 +--- a/arch/powerpc/kernel/misc_32.S ++++ b/arch/powerpc/kernel/misc_32.S +@@ -835,7 +835,7 @@ _GLOBAL(abs) + * Create a kernel thread + * kernel_thread(fn, arg, flags) + */ +-_GLOBAL(kernel_thread) ++_GLOBAL(ppc_kernel_thread) + stwu r1,-16(r1) + stw r30,8(r1) + stw r31,12(r1) +diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S +index 942951e..5d704fe 100644 +--- a/arch/powerpc/kernel/misc_64.S ++++ b/arch/powerpc/kernel/misc_64.S +@@ -415,7 +415,7 @@ _GLOBAL(scom970_write) + * Create a kernel thread + * kernel_thread(fn, arg, flags) + */ +-_GLOBAL(kernel_thread) ++_GLOBAL(ppc_kernel_thread) + std r29,-24(r1) + std r30,-16(r1) + stdu r1,-STACK_FRAME_OVERHEAD(r1) +diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c +index 7de41c3..4fec839 100644 +--- a/arch/powerpc/kernel/process.c ++++ b/arch/powerpc/kernel/process.c +@@ -48,6 +48,8 @@ + #include + #endif + ++#include ++ + extern unsigned long _get_SP(void); + + #ifndef CONFIG_SMP +@@ -452,8 +454,9 @@ void show_regs(struct pt_regs * regs) + + printk("NIP: "REG" LR: "REG" CTR: "REG"\n", + regs->nip, regs->link, regs->ctr); +- printk("REGS: %p TRAP: %04lx %s (%s)\n", +- regs, regs->trap, print_tainted(), init_utsname()->release); ++ printk("REGS: %p TRAP: %04lx %s (%s %s)\n", ++ regs, regs->trap, print_tainted(), init_utsname()->release, ++ VZVERSION); + printk("MSR: "REG" ", regs->msr); + printbits(regs->msr, msr_bits); + printk(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer); +@@ -1004,6 +1007,20 @@ void dump_stack(void) + } + EXPORT_SYMBOL(dump_stack); + ++long kernel_thread(int (*fn)(void *), void *arg, unsigned 
long flags) ++{ ++ extern long ppc_kernel_thread(int (*fn)(void *), void *arg, ++ unsigned long flags); ++ ++ if (!ve_is_super(get_exec_env())) { ++ printk("kernel_thread call inside container\n"); ++ dump_stack(); ++ return -EPERM; ++ } ++ ++ return ppc_kernel_thread(fn, arg, flags); ++} ++ + #ifdef CONFIG_PPC64 + void ppc64_runlatch_on(void) + { +diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S +index 93219c3..a9e16bb 100644 +--- a/arch/powerpc/kernel/systbl.S ++++ b/arch/powerpc/kernel/systbl.S +@@ -43,5 +43,9 @@ + .p2align 3 + #endif + ++#define SYS_SKIP(from, to) .rept to - from \ ++ SYSCALL(sys_ni_syscall) \ ++ .endr ++ + _GLOBAL(sys_call_table) + #include +diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c +index 7b25107..bc28b20 100644 +--- a/arch/powerpc/mm/fault.c ++++ b/arch/powerpc/mm/fault.c +@@ -333,7 +333,6 @@ good_area: + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ +- survive: + ret = handle_mm_fault(mm, vma, address, is_write); + if (unlikely(ret & VM_FAULT_ERROR)) { + if (ret & VM_FAULT_OOM) +@@ -373,14 +372,12 @@ bad_area_nosemaphore: + */ + out_of_memory: + up_read(&mm->mmap_sem); +- if (is_global_init(current)) { +- yield(); +- down_read(&mm->mmap_sem); +- goto survive; +- } +- printk("VM: killing process %s\n", current->comm); + if (user_mode(regs)) +- do_group_exit(SIGKILL); ++ /* ++ * 0-order allocation always success if something really ++ * fatal not happen: beancounter overdraft or OOM. Den ++ */ ++ force_sig(SIGKILL, current); + return SIGKILL; + + do_sigbus: +diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c +index 6aa6537..139b841 100644 +--- a/arch/powerpc/mm/init_64.c ++++ b/arch/powerpc/mm/init_64.c +@@ -173,7 +173,7 @@ void pgtable_cache_init(void) + "for size: %08x...\n", name, i, size); + pgtable_cache[i] = kmem_cache_create(name, + size, size, +- SLAB_PANIC, ++ SLAB_PANIC|SLAB_UBC|SLAB_NO_CHARGE, + zero_ctor); + } + } +diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c +index e0ff59f..083ce8b 100644 +--- a/arch/powerpc/mm/pgtable_32.c ++++ b/arch/powerpc/mm/pgtable_32.c +@@ -83,7 +83,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) + { + pgd_t *ret; + +- ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER); ++ ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC | ++ __GFP_ZERO, PGDIR_ORDER); + return ret; + } + +@@ -117,6 +118,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) + #else + gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO; + #endif ++ flags |= (__GFP_UBC | __GFP_SOFT_UBC); + + ptepage = alloc_pages(flags, 0); + if (!ptepage) +diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c +index 19f6bfd..4f23f43 100644 +--- a/arch/powerpc/platforms/cell/spu_callbacks.c ++++ b/arch/powerpc/platforms/cell/spu_callbacks.c +@@ -46,6 +46,8 @@ static void *spu_syscall_table[] = { + #define PPC_SYS_SPU(func) ppc_##func, + #define SYSX_SPU(f, f3264, f32) f, + ++#define SYS_SKIP(from, to) [from ... 
to] = sys_ni_syscall, ++ + #include + }; + +diff --git a/arch/ppc/Kconfig b/arch/ppc/Kconfig +index 0f1863e..4849a57 100644 +--- a/arch/ppc/Kconfig ++++ b/arch/ppc/Kconfig +@@ -1181,6 +1181,10 @@ source "lib/Kconfig" + + source "arch/ppc/Kconfig.debug" + ++source "kernel/Kconfig.openvz" ++ + source "security/Kconfig" + ++source "kernel/bc/Kconfig" ++ + source "crypto/Kconfig" +diff --git a/arch/ppc/kernel/misc.S b/arch/ppc/kernel/misc.S +index d5e0dfc..b217a25 100644 +--- a/arch/ppc/kernel/misc.S ++++ b/arch/ppc/kernel/misc.S +@@ -826,7 +826,7 @@ _GLOBAL(_get_SP) + * Create a kernel thread + * kernel_thread(fn, arg, flags) + */ +-_GLOBAL(kernel_thread) ++_GLOBAL(ppc_kernel_thread) + stwu r1,-16(r1) + stw r30,8(r1) + stw r31,12(r1) +diff --git a/arch/ppc/mm/fault.c b/arch/ppc/mm/fault.c +index 36c0e75..276d861 100644 +--- a/arch/ppc/mm/fault.c ++++ b/arch/ppc/mm/fault.c +@@ -249,7 +249,6 @@ good_area: + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ +- survive: + fault = handle_mm_fault(mm, vma, address, is_write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) +@@ -290,14 +289,12 @@ bad_area: + */ + out_of_memory: + up_read(&mm->mmap_sem); +- if (is_global_init(current)) { +- yield(); +- down_read(&mm->mmap_sem); +- goto survive; +- } +- printk("VM: killing process %s\n", current->comm); + if (user_mode(regs)) +- do_group_exit(SIGKILL); ++ /* ++ * 0-order allocation always success if something really ++ * fatal not happen: beancounter overdraft or OOM. Den ++ */ ++ force_sig(SIGKILL, current); + return SIGKILL; + + do_sigbus: +diff --git a/arch/ppc/mm/pgtable.c b/arch/ppc/mm/pgtable.c +index 03a79bf..0cf0355 100644 +--- a/arch/ppc/mm/pgtable.c ++++ b/arch/ppc/mm/pgtable.c +@@ -70,7 +70,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) + { + pgd_t *ret; + +- ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER); ++ ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC | ++ __GFP_ZERO, PGDIR_ORDER); + return ret; + } + +@@ -104,6 +105,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) + #else + gfp_t flags = GFP_KERNEL | __GFP_REPEAT; + #endif ++ flags |= (__GFP_UBC | __GFP_SOFT_UBC); + + ptepage = alloc_pages(flags, 0); + if (ptepage) { +diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig +index 107e492..9477a5a 100644 +--- a/arch/s390/Kconfig ++++ b/arch/s390/Kconfig +@@ -562,6 +562,8 @@ source "fs/Kconfig" + + source "arch/s390/Kconfig.debug" + ++source "kernel/Kconfig.openvz" ++ + source "security/Kconfig" + + source "crypto/Kconfig" +diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c +index 5d4fa4b..4ac7ef3 100644 +--- a/arch/s390/kernel/smp.c ++++ b/arch/s390/kernel/smp.c +@@ -577,8 +577,19 @@ out: + */ + int __cpuinit start_secondary(void *cpuvoid) + { +- /* Setup the cpu */ +- cpu_init(); ++ /* Setup the cpu */ ++ cpu_init(); ++ ++#ifdef CONFIG_VE ++ /* TSC reset. kill whatever might rely on old values */ ++ VE_TASK_INFO(current)->wakeup_stamp = 0; ++ /* ++ * Cosmetic: sleep_time won't be changed afterwards for the idle ++ * thread; keep it 0 rather than -cycles. ++ */ ++ VE_TASK_INFO(idle)->sleep_time = 0; ++#endif ++ + preempt_disable(); + /* Enable TOD clock interrupts on the secondary cpu. */ + init_cpu_timer(); +@@ -836,6 +847,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus) + for_each_possible_cpu(cpu) + if (cpu != smp_processor_id()) + smp_create_idle(cpu); ++ ++#ifdef CONFIG_VE ++ /* TSC reset. 
kill whatever might rely on old values */ ++ VE_TASK_INFO(current)->wakeup_stamp = 0; ++#endif + } + + void __init smp_prepare_boot_cpu(void) +diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c +index 0283d81..e7815f6 100644 +--- a/arch/sh/kernel/process_64.c ++++ b/arch/sh/kernel/process_64.c +@@ -680,7 +680,7 @@ asids_proc_info(char *buf, char **start, off_t fpos, int length, int *eof, void + int len=0; + struct task_struct *p; + read_lock(&tasklist_lock); +- for_each_process(p) { ++ for_each_process_ve(p) { + int pid = p->pid; + + if (!pid) +diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig +index eb36f3b..5c8eb15 100644 +--- a/arch/sparc64/Kconfig ++++ b/arch/sparc64/Kconfig +@@ -407,8 +407,12 @@ source "fs/Kconfig" + + source "arch/sparc64/Kconfig.debug" + ++source "kernel/Kconfig.openvz" ++ + source "security/Kconfig" + + source "crypto/Kconfig" + + source "lib/Kconfig" ++ ++source "kernel/bc/Kconfig" +diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c +index 2084f81..552ed1b 100644 +--- a/arch/sparc64/kernel/process.c ++++ b/arch/sparc64/kernel/process.c +@@ -747,6 +747,13 @@ pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) + { + long retval; + ++ /* Don't allow kernel_thread() inside VE */ ++ if (!ve_is_super(get_exec_env())) { ++ printk("kernel_thread call inside container\n"); ++ dump_stack(); ++ return -EPERM; ++ } ++ + /* If the parent runs before fn(arg) is called by the child, + * the input registers of this function can be clobbered. + * So we stash 'fn' and 'arg' into global registers which +diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S +index 8b5282d..9a58e19 100644 +--- a/arch/sparc64/kernel/systbls.S ++++ b/arch/sparc64/kernel/systbls.S +@@ -83,6 +83,24 @@ sys_call_table32: + /*310*/ .word compat_sys_utimensat, compat_sys_signalfd, sys_timerfd_create, sys_eventfd, compat_sys_fallocate + .word compat_sys_timerfd_settime, compat_sys_timerfd_gettime + ++ .rept 500-317 ++ .word sys_nis_syscall ++ .endr ++ .word sys_fairsched_mknod /* 500 */ ++ .word sys_fairsched_rmnod ++ .word sys_fairsched_chwt ++ .word sys_fairsched_mvpr ++ .word sys_fairsched_rate ++ .word sys_nis_syscall /* 505 */ ++ .word sys_nis_syscall ++ .word sys_nis_syscall ++ .word sys_nis_syscall ++ .word sys_nis_syscall ++ .word sys_getluid /* 510 */ ++ .word sys_setluid ++ .word compat_sys_setublimit ++ .word compat_sys_ubstat ++ + #endif /* CONFIG_COMPAT */ + + /* Now the 64-bit native Linux syscall table. 
*/ +@@ -155,3 +173,20 @@ sys_call_table: + .word sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait + /*310*/ .word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate + .word sys_timerfd_settime, sys_timerfd_gettime ++ .rept 500-317 ++ .word sys_nis_syscall ++ .endr ++ .word sys_fairsched_mknod /* 500 */ ++ .word sys_fairsched_rmnod ++ .word sys_fairsched_chwt ++ .word sys_fairsched_mvpr ++ .word sys_fairsched_rate ++ .word sys_nis_syscall /* 505 */ ++ .word sys_nis_syscall ++ .word sys_nis_syscall ++ .word sys_nis_syscall ++ .word sys_nis_syscall ++ .word sys_getluid /* 510 */ ++ .word sys_setluid ++ .word sys_setublimit ++ .word sys_ubstat +diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c +index 3697492..a60f6dd 100644 +--- a/arch/sparc64/kernel/traps.c ++++ b/arch/sparc64/kernel/traps.c +@@ -2197,6 +2197,10 @@ void die_if_kernel(char *str, struct pt_regs *regs) + " \\__U_/\n"); + + printk("%s(%d): %s [#%d]\n", current->comm, task_pid_nr(current), str, ++die_counter); ++ printk("VE:EXCVE %d:%d, CPU %d, VCPU %d:%d\n", ++ VEID(VE_TASK_INFO(current)->owner_env), VEID(get_exec_env()), ++ smp_processor_id(), ++ task_vsched_id(current), task_cpu(current)); + notify_die(DIE_OOPS, str, regs, 0, 255, SIGSEGV); + __asm__ __volatile__("flushw"); + __show_regs(regs); +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index bf07b6f..3574b92 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1711,6 +1711,7 @@ config SYSVIPC_COMPAT + + endmenu + ++source "kernel/Kconfig.openvz" + + source "net/Kconfig" + +@@ -1729,3 +1730,5 @@ source "crypto/Kconfig" + source "arch/x86/kvm/Kconfig" + + source "lib/Kconfig" ++ ++source "kernel/bc/Kconfig" +diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S +index b5e329d..434e6cf 100644 +--- a/arch/x86/ia32/ia32entry.S ++++ b/arch/x86/ia32/ia32entry.S +@@ -517,7 +517,7 @@ ia32_sys_call_table: + .quad stub32_iopl /* 110 */ + .quad sys_vhangup + .quad quiet_ni_syscall /* old "idle" system call */ +- .quad sys32_vm86_warning /* vm86old */ ++ .quad quiet_ni_syscall /* vm86old */ + .quad compat_sys_wait4 + .quad sys_swapoff /* 115 */ + .quad compat_sys_sysinfo +@@ -570,7 +570,7 @@ ia32_sys_call_table: + .quad sys_mremap + .quad sys_setresuid16 + .quad sys_getresuid16 /* 165 */ +- .quad sys32_vm86_warning /* vm86 */ ++ .quad quiet_ni_syscall /* vm86 */ + .quad quiet_ni_syscall /* query_module */ + .quad sys_poll + .quad compat_sys_nfsservctl +diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c +index f00afdf..fd045ce 100644 +--- a/arch/x86/ia32/sys_ia32.c ++++ b/arch/x86/ia32/sys_ia32.c +@@ -817,20 +817,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, + advice); + } + +-long sys32_vm86_warning(void) +-{ +- struct task_struct *me = current; +- static char lastcomm[sizeof(me->comm)]; +- +- if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { +- compat_printk(KERN_INFO +- "%s: vm86 mode not supported on 64 bit kernel\n", +- me->comm); +- strncpy(lastcomm, me->comm, sizeof(lastcomm)); +- } +- return -ENOSYS; +-} +- + long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, + char __user *buf, size_t len) + { +diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S +index c778e4f..732716c 100644 +--- a/arch/x86/kernel/entry_32.S ++++ b/arch/x86/kernel/entry_32.S +@@ -214,6 +214,7 @@ ENTRY(ret_from_fork) + GET_THREAD_INFO(%ebp) + popl %eax + CFI_ADJUST_CFA_OFFSET -4 ++ret_from_fork_tail: + pushl $0x0202 # Reset kernel eflags + 
CFI_ADJUST_CFA_OFFSET 4 + popfl +@@ -222,6 +223,25 @@ ENTRY(ret_from_fork) + CFI_ENDPROC + END(ret_from_fork) + ++ENTRY(i386_ret_from_resume) ++ CFI_STARTPROC ++ pushl %eax ++ CFI_ADJUST_CFA_OFFSET 4 ++ call schedule_tail ++ GET_THREAD_INFO(%ebp) ++ popl %eax ++ CFI_ADJUST_CFA_OFFSET -4 ++ movl (%esp),%eax ++ testl %eax,%eax ++ jz 1f ++ pushl %esp ++ call *%eax ++ addl $4,%esp ++1: ++ addl $256,%esp ++ jmp ret_from_fork_tail ++ CFI_ENDPROC ++ + /* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to +diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S +index 556a8df..87cd7f2 100644 +--- a/arch/x86/kernel/entry_64.S ++++ b/arch/x86/kernel/entry_64.S +@@ -168,7 +168,12 @@ ENTRY(ret_from_fork) + popf # reset kernel eflags + CFI_ADJUST_CFA_OFFSET -4 + call schedule_tail ++ret_from_fork_tail: + GET_THREAD_INFO(%rcx) ++ btr $TIF_RESUME,threadinfo_flags(%rcx) ++ jc x86_64_ret_from_resume ++ ++ret_from_fork_check: + testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) + jnz rff_trace + rff_action: +@@ -184,6 +189,19 @@ rff_trace: + call syscall_trace_leave + GET_THREAD_INFO(%rcx) + jmp rff_action ++ ++x86_64_ret_from_resume: ++ movq (%rsp),%rax ++ testq %rax,%rax ++ jz 1f ++ movq %rsp,%rdi ++ call *%rax ++1: ++ addq $256,%rsp ++ cmpq $0,ORIG_RAX(%rsp) ++ jge ret_from_fork_tail ++ RESTORE_REST ++ jmp int_ret_from_sys_call + CFI_ENDPROC + END(ret_from_fork) + +@@ -992,7 +1010,7 @@ ENTRY(kernel_thread) + xorl %r9d,%r9d + + # clone now +- call do_fork ++ call do_fork_kthread + movq %rax,RAX(%rsp) + xorl %edi,%edi + +diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c +index 0224c36..e6385eb 100644 +--- a/arch/x86/kernel/ldt.c ++++ b/arch/x86/kernel/ldt.c +@@ -12,6 +12,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -19,6 +20,8 @@ + #include + #include + ++#include ++ + #ifdef CONFIG_SMP + static void flush_ldt(void *null) + { +@@ -38,9 +41,9 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) + mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) & + (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1)); + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) +- newldt = vmalloc(mincount * LDT_ENTRY_SIZE); ++ newldt = ub_vmalloc(mincount * LDT_ENTRY_SIZE); + else +- newldt = (void *)__get_free_page(GFP_KERNEL); ++ newldt = (void *)__get_free_page(GFP_KERNEL_UBC); + + if (!newldt) + return -ENOMEM; +@@ -112,6 +115,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) + } + return retval; + } ++EXPORT_SYMBOL_GPL(init_new_context); + + /* + * No need to lock the MM as we are the last user +diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c +index 84160f7..3fe4e6d 100644 +--- a/arch/x86/kernel/nmi_32.c ++++ b/arch/x86/kernel/nmi_32.c +@@ -316,6 +316,21 @@ EXPORT_SYMBOL(touch_nmi_watchdog); + + extern void die_nmi(struct pt_regs *, const char *msg); + ++void smp_show_regs(struct pt_regs *regs, void *info) ++{ ++ static DEFINE_SPINLOCK(show_regs_lock); ++ ++ if (regs == NULL) ++ return; ++ ++ spin_lock(&show_regs_lock); ++ bust_spinlocks(1); ++ printk("----------- IPI show regs -----------"); ++ show_regs(regs); ++ bust_spinlocks(0); ++ spin_unlock(&show_regs_lock); ++} ++ + notrace __kprobes int + nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) + { +diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c +index 5a29ded..e96721c 100644 +--- a/arch/x86/kernel/nmi_64.c ++++ b/arch/x86/kernel/nmi_64.c +@@ -354,10 +354,10 @@ 
nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) + if (!touched && __get_cpu_var(last_irq_sum) == sum) { + /* + * Ayiee, looks like this CPU is stuck ... +- * wait a few IRQs (5 seconds) before doing the oops ... ++ * wait a few IRQs (30 seconds) before doing the oops ... + */ + local_inc(&__get_cpu_var(alert_counter)); +- if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) ++ if (local_read(&__get_cpu_var(alert_counter)) == 30*nmi_hz) + die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, + panic_on_timeout); + } else { +@@ -385,16 +385,35 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) + + static unsigned ignore_nmis; + ++static int dummy_nmi_callback(struct pt_regs *regs, int cpu) ++{ ++ return 0; ++} ++ ++static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback; ++ + asmlinkage notrace __kprobes void + do_nmi(struct pt_regs *regs, long error_code) + { + nmi_enter(); + add_pda(__nmi_count,1); +- if (!ignore_nmis) +- default_do_nmi(regs); ++ if (!ignore_nmis) { ++ if (!nmi_ipi_callback(regs, smp_processor_id())) ++ default_do_nmi(regs); ++ } + nmi_exit(); + } + ++void set_nmi_ipi_callback(nmi_callback_t callback) ++{ ++ nmi_ipi_callback = callback; ++} ++ ++void unset_nmi_ipi_callback(void) ++{ ++ nmi_ipi_callback = dummy_nmi_callback; ++} ++ + void stop_nmi(void) + { + acpi_nmi_disable(); +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index e2db9ac..9260537 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -51,12 +52,16 @@ + #endif + + #include ++#include + + #include + #include + #include + + asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); ++EXPORT_SYMBOL(ret_from_fork); ++asmlinkage void i386_ret_from_resume(void) __asm__("i386_ret_from_resume"); ++EXPORT_SYMBOL_GPL(i386_ret_from_resume); + + static int hlt_counter; + +@@ -212,16 +217,17 @@ void __show_registers(struct pt_regs *regs, int all) + } + + printk("\n"); +- printk("Pid: %d, comm: %s %s (%s %.*s)\n", ++ printk("Pid: %d, comm: %s %s (%s %.*s %s)\n", + task_pid_nr(current), current->comm, + print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), +- init_utsname()->version); ++ init_utsname()->version, VZVERSION); + + printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", + (u16)regs->cs, regs->ip, regs->flags, + smp_processor_id()); +- print_symbol("EIP is at %s\n", regs->ip); ++ if (decode_call_traces) ++ print_symbol("EIP is at %s\n", regs->ip); + + printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + regs->ax, regs->bx, regs->cx, regs->dx); +@@ -257,6 +263,8 @@ void show_regs(struct pt_regs *regs) + { + __show_registers(regs, 1); + show_trace(NULL, regs, ®s->sp, regs->bp); ++ if (!decode_call_traces) ++ printk(" EIP: [<%08lx>]\n", regs->ip); + } + + /* +@@ -265,6 +273,7 @@ void show_regs(struct pt_regs *regs) + * the "args". 
+ */ + extern void kernel_thread_helper(void); ++EXPORT_SYMBOL(kernel_thread_helper); + + /* + * Create a kernel thread +@@ -273,6 +282,13 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) + { + struct pt_regs regs; + ++ /* Don't allow kernel_thread() inside VE */ ++ if (!ve_allow_kthreads && !ve_is_super(get_exec_env())) { ++ printk("kernel_thread call inside container\n"); ++ dump_stack(); ++ return -EPERM; ++ } ++ + memset(®s, 0, sizeof(regs)); + + regs.bx = (unsigned long) fn; +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index c6eb5c9..c303e39 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -26,8 +26,10 @@ + #include + #include + #include ++#include + #include + #include ++#include + #include + #include + #include +@@ -52,8 +54,6 @@ + #include + #include + +-asmlinkage extern void ret_from_fork(void); +- + unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; + + unsigned long boot_option_idle_override = 0; +@@ -189,13 +189,14 @@ void __show_regs(struct pt_regs * regs) + + printk("\n"); + print_modules(); +- printk("Pid: %d, comm: %.20s %s %s %.*s\n", ++ printk("Pid: %d, comm: %.20s %s %s %.*s %s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), +- init_utsname()->version); ++ init_utsname()->version, VZVERSION); + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); +- printk_address(regs->ip, 1); ++ if (decode_call_traces) ++ printk_address(regs->ip, 1); + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, + regs->flags); + printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", +@@ -243,9 +244,26 @@ void show_regs(struct pt_regs *regs) + { + printk("CPU %d:", smp_processor_id()); + __show_regs(regs); +- show_trace(NULL, regs, (void *)(regs + 1), regs->bp); ++ show_trace(NULL, regs, ®s->sp, regs->bp); ++ if (!decode_call_traces) ++ printk(" EIP: [<%08lx>]\n", regs->ip); + } + ++void smp_show_regs(struct pt_regs *regs, void *data) ++{ ++ static DEFINE_SPINLOCK(show_regs_lock); ++ ++ if (regs == NULL) ++ return; ++ ++ spin_lock(&show_regs_lock); ++ bust_spinlocks(1); ++ printk("----------- IPI show regs -----------\n"); ++ show_regs(regs); ++ bust_spinlocks(0); ++ spin_unlock(&show_regs_lock); ++ } ++ + /* + * Free current thread data structures etc.. + */ +@@ -868,3 +886,20 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; + } ++ ++long do_fork_kthread(unsigned long clone_flags, ++ unsigned long stack_start, ++ struct pt_regs *regs, ++ unsigned long stack_size, ++ int __user *parent_tidptr, ++ int __user *child_tidptr) ++{ ++ if (ve_allow_kthreads || ve_is_super(get_exec_env())) ++ return do_fork(clone_flags, stack_start, regs, stack_size, ++ parent_tidptr, child_tidptr); ++ ++ /* Don't allow kernel_thread() inside VE */ ++ printk("kernel_thread call inside container\n"); ++ dump_stack(); ++ return -EPERM; ++} +diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c +index a7835f2..595dae6 100644 +--- a/arch/x86/kernel/ptrace.c ++++ b/arch/x86/kernel/ptrace.c +@@ -1422,8 +1422,11 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit) + return 0; + + /* Fake a debug trap */ +- if (is_singlestep) ++ if (is_singlestep) { ++ set_pn_state(current, entryexit ? 
PN_STOP_LEAVE : PN_STOP_ENTRY); + send_sigtrap(current, regs, 0); ++ clear_pn_state(current); ++ } + + if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) + goto out; +diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c +index aee0e82..2dd69cc 100644 +--- a/arch/x86/kernel/setup64.c ++++ b/arch/x86/kernel/setup64.c +@@ -285,3 +285,5 @@ void __cpuinit cpu_init (void) + if (is_uv_system()) + uv_cpu_init(); + } ++ ++EXPORT_SYMBOL_GPL(cpu_gdt_descr); +diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c +index d923736..193b8bf 100644 +--- a/arch/x86/kernel/signal_32.c ++++ b/arch/x86/kernel/signal_32.c +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -593,6 +594,9 @@ static void do_signal(struct pt_regs *regs) + if (!user_mode(regs)) + return; + ++ if (try_to_freeze() && !signal_pending(current)) ++ goto no_signal; ++ + if (current_thread_info()->status & TS_RESTORE_SIGMASK) + oldset = ¤t->saved_sigmask; + else +@@ -622,6 +626,7 @@ static void do_signal(struct pt_regs *regs) + return; + } + ++no_signal: + /* Did we come from a system call? */ + if ((long)regs->orig_ax >= 0) { + /* Restart the system call - no handlers present */ +diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c +index e53b267..3319d4a 100644 +--- a/arch/x86/kernel/signal_64.c ++++ b/arch/x86/kernel/signal_64.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -427,6 +428,9 @@ static void do_signal(struct pt_regs *regs) + if (!user_mode(regs)) + return; + ++ if (try_to_freeze() && !signal_pending(current)) ++ goto no_signal; ++ + if (current_thread_info()->status & TS_RESTORE_SIGMASK) + oldset = ¤t->saved_sigmask; + else +@@ -455,6 +459,7 @@ static void do_signal(struct pt_regs *regs) + return; + } + ++no_signal: + /* Did we come from a system call? */ + if (current_syscall(regs) >= 0) { + /* Restart the system call - no handlers present */ +diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c +index 0cb7aad..5657402 100644 +--- a/arch/x86/kernel/smp.c ++++ b/arch/x86/kernel/smp.c +@@ -22,6 +22,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -249,6 +250,89 @@ native_smp_call_function_mask(cpumask_t mask, + return 0; + } + ++static DEFINE_SPINLOCK(nmi_call_lock); ++static struct nmi_call_data_struct { ++ smp_nmi_function func; ++ void *info; ++ atomic_t started; ++ atomic_t finished; ++ cpumask_t cpus_called; ++ int wait; ++} *nmi_call_data; ++ ++static int smp_nmi_callback(struct pt_regs *regs, int cpu) ++{ ++ smp_nmi_function func; ++ void *info; ++ int wait; ++ ++ func = nmi_call_data->func; ++ info = nmi_call_data->info; ++ wait = nmi_call_data->wait; ++ ack_APIC_irq(); ++ /* prevent from calling func() multiple times */ ++ if (cpu_test_and_set(cpu, nmi_call_data->cpus_called)) ++ return 0; ++ /* ++ * notify initiating CPU that I've grabbed the data and am ++ * about to execute the function ++ */ ++ mb(); ++ atomic_inc(&nmi_call_data->started); ++ /* at this point the nmi_call_data structure is out of scope */ ++ irq_enter(); ++ func(regs, info); ++ irq_exit(); ++ if (wait) ++ atomic_inc(&nmi_call_data->finished); ++ ++ return 1; ++} ++ ++/* ++ * This function tries to call func(regs, info) on each cpu. ++ * Func must be fast and non-blocking. ++ * May be called with disabled interrupts and from any context. 
++ */ ++int smp_nmi_call_function(smp_nmi_function func, void *info, int wait) ++{ ++ struct nmi_call_data_struct data; ++ int cpus; ++ ++ cpus = num_online_cpus() - 1; ++ if (!cpus) ++ return 0; ++ ++ data.func = func; ++ data.info = info; ++ data.wait = wait; ++ atomic_set(&data.started, 0); ++ atomic_set(&data.finished, 0); ++ cpus_clear(data.cpus_called); ++ /* prevent this cpu from calling func if NMI happens */ ++ cpu_set(smp_processor_id(), data.cpus_called); ++ ++ if (!spin_trylock(&nmi_call_lock)) ++ return -1; ++ ++ nmi_call_data = &data; ++ set_nmi_ipi_callback(smp_nmi_callback); ++ mb(); ++ ++ /* Send a message to all other CPUs and wait for them to respond */ ++ send_IPI_allbutself(APIC_DM_NMI); ++ while (atomic_read(&data.started) != cpus) ++ barrier(); ++ ++ unset_nmi_ipi_callback(); ++ if (wait) ++ while (atomic_read(&data.finished) != cpus) ++ barrier(); ++ spin_unlock(&nmi_call_lock); ++ ++ return 0; ++} ++ + static void stop_this_cpu(void *dummy) + { + local_irq_disable(); +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index 3e1cece..ddc677e 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -918,6 +918,13 @@ do_rest: + clear_tsk_thread_flag(c_idle.idle, TIF_FORK); + #endif + ++ ++#ifdef CONFIG_VE ++ /* Cosmetic: sleep_time won't be changed afterwards for the idle ++ * thread; keep it 0 rather than -cycles. */ ++ VE_TASK_INFO(c_idle.idle)->sleep_time = 0; ++#endif ++ + /* start_ip had better be page-aligned! */ + start_ip = setup_trampoline(); + +diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S +index adff556..7f59fad 100644 +--- a/arch/x86/kernel/syscall_table_32.S ++++ b/arch/x86/kernel/syscall_table_32.S +@@ -326,3 +326,22 @@ ENTRY(sys_call_table) + .long sys_fallocate + .long sys_timerfd_settime /* 325 */ + .long sys_timerfd_gettime ++ .rept 500-(.-sys_call_table)/4 ++ .long sys_ni_syscall ++ .endr ++ .long sys_fairsched_mknod /* 500 */ ++ .long sys_fairsched_rmnod ++ .long sys_fairsched_chwt ++ .long sys_fairsched_mvpr ++ .long sys_fairsched_rate ++ .long sys_fairsched_vcpus /* 505 */ ++ .long sys_ni_syscall ++ .long sys_ni_syscall ++ .long sys_ni_syscall ++ .long sys_ni_syscall ++ .long sys_getluid /* 510 */ ++ .long sys_setluid ++ .long sys_setublimit ++ .long sys_ubstat ++ .long sys_ni_syscall ++ .long sys_ni_syscall +diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c +index 9bb2363..318aa46 100644 +--- a/arch/x86/kernel/tlb_32.c ++++ b/arch/x86/kernel/tlb_32.c +@@ -204,6 +204,8 @@ void flush_tlb_mm(struct mm_struct *mm) + preempt_enable(); + } + ++EXPORT_SYMBOL(flush_tlb_mm); ++ + void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) + { + struct mm_struct *mm = vma->vm_mm; +diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c +index a1f07d7..8fdac14 100644 +--- a/arch/x86/kernel/tlb_64.c ++++ b/arch/x86/kernel/tlb_64.c +@@ -237,6 +237,8 @@ void flush_tlb_mm(struct mm_struct *mm) + preempt_enable(); + } + ++EXPORT_SYMBOL(flush_tlb_mm); ++ + void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) + { + struct mm_struct *mm = vma->vm_mm; +diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c +index 08d752d..c78417e 100644 +--- a/arch/x86/kernel/traps_32.c ++++ b/arch/x86/kernel/traps_32.c +@@ -222,6 +222,8 @@ print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) + { + printk(data); + print_symbol(msg, symbol); ++ if (decode_call_traces) ++ print_symbol("%s\n", symbol); + printk("\n"); + } + +@@ -259,7 
+261,10 @@ show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) + { + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); +- printk("%s =======================\n", log_lvl); ++ if (decode_call_traces) ++ printk("%s =======================\n", log_lvl); ++ else ++ printk("%s ==", log_lvl); + } + + void show_trace(struct task_struct *task, struct pt_regs *regs, +@@ -290,9 +295,14 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + printk("\n%s ", log_lvl); + printk("%08lx ", *stack++); + } +- printk("\n%sCall Trace:\n", log_lvl); ++ if (decode_call_traces) ++ printk("\n%s Call Trace:\n", log_lvl); ++ else ++ printk("\n%s Call Trace: ", log_lvl); + + show_trace_log_lvl(task, regs, sp, bp, log_lvl); ++ if (!decode_call_traces) ++ printk("\n"); + } + + void show_stack(struct task_struct *task, unsigned long *sp) +@@ -321,6 +331,8 @@ void dump_stack(void) + init_utsname()->version); + + show_trace(current, NULL, &stack, bp); ++ if (!decode_call_traces) ++ printk("\n"); + } + + EXPORT_SYMBOL(dump_stack); +@@ -332,8 +344,9 @@ void show_registers(struct pt_regs *regs) + print_modules(); + __show_registers(regs, 0); + +- printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", ++ printk(KERN_EMERG "Process %.*s (pid: %d, veid: %d, ti=%p task=%p task.ti=%p)", + TASK_COMM_LEN, current->comm, task_pid_nr(current), ++ VEID(current->ve_task_info.owner_env), + current_thread_info(), current, task_thread_info(current)); + /* + * When in-kernel, we also print out the stack and code at the +@@ -754,6 +767,21 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); + } + ++/* ++ * Voyager doesn't implement these ++ */ ++void __attribute__((weak)) smp_show_regs(struct pt_regs *regs, void *info) ++{ ++} ++ ++#ifdef CONFIG_SMP ++int __attribute__((weak)) ++smp_nmi_call_function(smp_nmi_function func, void *info, int wait) ++{ ++ return 0; ++} ++#endif ++ + static DEFINE_SPINLOCK(nmi_print_lock); + + void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg) +@@ -771,6 +799,10 @@ void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg) + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); ++ smp_nmi_call_function(smp_show_regs, NULL, 1); ++ bust_spinlocks(1); ++ if (!decode_call_traces) ++ show_registers(regs); + console_silent(); + spin_unlock(&nmi_print_lock); + bust_spinlocks(0); +@@ -787,6 +819,13 @@ void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg) + do_exit(SIGSEGV); + } + ++static int dummy_nmi_callback(struct pt_regs *regs, int cpu) ++{ ++ return 0; ++} ++ ++static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback; ++ + static notrace __kprobes void default_do_nmi(struct pt_regs *regs) + { + unsigned char reason = 0; +@@ -839,12 +878,24 @@ notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) + + ++nmi_count(cpu); + +- if (!ignore_nmis) +- default_do_nmi(regs); ++ if (!ignore_nmis) { ++ if (!nmi_ipi_callback(regs, cpu)) ++ default_do_nmi(regs); ++ } + + nmi_exit(); + } + ++void set_nmi_ipi_callback(nmi_callback_t callback) ++{ ++ nmi_ipi_callback = callback; ++} ++ ++void unset_nmi_ipi_callback(void) ++{ ++ nmi_ipi_callback = dummy_nmi_callback; ++} ++ + void stop_nmi(void) + { + acpi_nmi_disable(); +diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c +index adff76e..0bcb70c 100644 +--- 
a/arch/x86/kernel/traps_64.c ++++ b/arch/x86/kernel/traps_64.c +@@ -112,6 +112,11 @@ void printk_address(unsigned long address, int reliable) + char namebuf[KSYM_NAME_LEN]; + char reliab[4] = ""; + ++ if (!decode_call_traces) { ++ printk("[<%016lx>]", address); ++ return; ++ } ++ + symname = kallsyms_lookup(address, &symsize, &offset, + &modname, namebuf); + if (!symname) { +@@ -421,7 +426,7 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, + if (((long) stack & (THREAD_SIZE-1)) == 0) + break; + } +- if (i && ((i % 4) == 0)) ++ if (i && ((i % 4) == 0) && decode_call_traces) + printk("\n"); + printk(" %016lx", *stack++); + touch_nmi_watchdog(); +@@ -469,10 +474,12 @@ void show_registers(struct pt_regs *regs) + + sp = regs->sp; + ip = (u8 *) regs->ip - code_prologue; +- printk("CPU %d ", cpu); ++ printk("CPU: %d ", cpu); + __show_regs(regs); +- printk("Process %s (pid: %d, threadinfo %p, task %p)\n", +- cur->comm, cur->pid, task_thread_info(cur), cur); ++ printk("Process %s (pid: %d, veid=%d, threadinfo %p, task %p)\n", ++ cur->comm, cur->pid, ++ VEID(VE_TASK_INFO(current)->owner_env), ++ task_thread_info(cur), cur); + + /* + * When in-kernel, we also print out the stack and code at the +diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c +index 0577825..b7ce3ba 100644 +--- a/arch/x86/kernel/tsc_sync.c ++++ b/arch/x86/kernel/tsc_sync.c +@@ -142,6 +142,10 @@ void __cpuinit check_tsc_sync_source(int cpu) + printk(" passed.\n"); + } + ++#ifdef CONFIG_VE ++ /* TSC reset. kill whatever might rely on old values */ ++ VE_TASK_INFO(current)->wakeup_stamp = 0; ++#endif + /* + * Reset it - just in case we boot another CPU later: + */ +diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c +index f6c05d0..03fb381 100644 +--- a/arch/x86/kernel/x8664_ksyms_64.c ++++ b/arch/x86/kernel/x8664_ksyms_64.c +@@ -4,12 +4,14 @@ + #include + #include + #include ++#include + + #include + #include + #include + #include + ++EXPORT_SYMBOL(kernel_execve); + EXPORT_SYMBOL(kernel_thread); + + EXPORT_SYMBOL(__get_user_1); +diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index 8bcb6f4..3eb1991 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -402,7 +402,8 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, + printk(KERN_CONT " at %016lx\n", address); + #endif + printk(KERN_ALERT "IP:"); +- printk_address(regs->ip, 1); ++ if (decode_call_traces) ++ printk_address(regs->ip, 1); + dump_pagetable(address); + } + +@@ -568,7 +569,7 @@ static int vmalloc_fault(unsigned long address) + #endif + } + +-int show_unhandled_signals = 1; ++int show_unhandled_signals = 0; + + /* + * This routine handles page faults. It determines the address, +@@ -673,7 +674,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) + */ + if (user_mode_vm(regs)) + error_code |= PF_USER; +-again: + #endif + /* When running in the kernel we expect faults to occur only to + * addresses in user space. 
All other faults represent errors in the
+@@ -739,7 +739,6 @@ good_area:
+ }
+
+ #ifdef CONFIG_X86_32
+-survive:
+ #endif
+ /*
+ * If for any reason at all we couldn't handle the fault,
+@@ -799,7 +798,7 @@ bad_area_nosemaphore:
+
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
+- printk(
++ ve_printk(VE_LOG,
+ #ifdef CONFIG_X86_32
+ "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
+ #else
+@@ -877,19 +876,14 @@ no_context:
+ */
+ out_of_memory:
+ up_read(&mm->mmap_sem);
+- if (is_global_init(tsk)) {
+- yield();
+-#ifdef CONFIG_X86_32
+- down_read(&mm->mmap_sem);
+- goto survive;
+-#else
+- goto again;
+-#endif
++ if (error_code & PF_USER) {
++ /*
++ * A 0-order allocation always succeeds unless something really
++ * fatal has happened: beancounter overdraft or OOM.
++ */
++ force_sig(SIGKILL, tsk);
++ return;
+ }
+-
+- printk("VM: killing process %s\n", tsk->comm);
+- if (error_code & PF_USER)
+- do_group_exit(SIGKILL);
+ goto no_context;
+
+ do_sigbus:
+diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
+index 0b3d567..768e246 100644
+--- a/arch/x86/mm/hugetlbpage.c
++++ b/arch/x86/mm/hugetlbpage.c
+@@ -12,6 +12,7 @@
+ #include
+ #include
+ #include
++#include
+ #include
+ #include
+ #include
+@@ -207,6 +208,7 @@ int pmd_huge(pmd_t pmd)
+ {
+ return !!(pmd_val(pmd) & _PAGE_PSE);
+ }
++EXPORT_SYMBOL(pmd_huge);
+
+ struct page *
+ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
+index 819dad9..b07653e 100644
+--- a/arch/x86/mm/init_64.c
++++ b/arch/x86/mm/init_64.c
+@@ -113,6 +113,7 @@ void show_mem(void)
+ printk(KERN_INFO "%lu pages shared\n", shared);
+ printk(KERN_INFO "%lu pages swap cached\n", cached);
+ }
++EXPORT_SYMBOL_GPL(show_mem);
+
+ int after_bootmem;
+
+diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
+index 5015976..97ff257 100644
+--- a/arch/x86/mm/pgtable.c
++++ b/arch/x86/mm/pgtable.c
+@@ -13,9 +13,9 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+ struct page *pte;
+
+ #ifdef CONFIG_HIGHPTE
+- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
++ pte = alloc_pages(GFP_KERNEL_UBC|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+ #else
+- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
++ pte = alloc_pages(GFP_KERNEL_UBC|__GFP_REPEAT|__GFP_ZERO, 0);
+ #endif
+ if (pte)
+ pgtable_page_ctor(pte);
+@@ -210,7 +210,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
+
+ pgd_t *pgd_alloc(struct mm_struct *mm)
+ {
+- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
++ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL_UBC | __GFP_ZERO);
+
+ /* so that alloc_pmd can use it */
+ mm->pgd = pgd;
+diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
+index 369cf06..aab674a 100644
+--- a/arch/x86/mm/pgtable_32.c
++++ b/arch/x86/mm/pgtable_32.c
+@@ -66,6 +66,7 @@ void show_mem(void)
+ printk(KERN_INFO "%lu pages pagetables\n",
+ global_page_state(NR_PAGETABLE));
+ }
++EXPORT_SYMBOL_GPL(show_mem);
+
+ /*
+ * Associate a virtual page frame with a given physical page frame
+diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
+index cf058fe..4579afd 100644
+--- a/arch/x86/vdso/vdso32-setup.c
++++ b/arch/x86/vdso/vdso32-setup.c
+@@ -17,6 +17,8 @@
+ #include
+ #include
+
++#include
++
+ #include
+ #include
+ #include
+@@ -37,6 +39,8 @@ enum {
+ #else
+ #define VDSO_DEFAULT VDSO_ENABLED
+ #endif
++#undef VDSO_DEFAULT
++#define VDSO_DEFAULT 
VDSO_DISABLED + + #ifdef CONFIG_X86_64 + #define vdso_enabled sysctl_vsyscall32 +@@ -199,7 +203,8 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr) + */ + extern const char vdso32_default_start, vdso32_default_end; + extern const char vdso32_sysenter_start, vdso32_sysenter_end; +-static struct page *vdso32_pages[1]; ++struct page *vdso32_pages[1]; ++EXPORT_SYMBOL_GPL(vdso32_pages); + + #ifdef CONFIG_X86_64 + +@@ -319,16 +324,30 @@ int __init sysenter_setup(void) + return 0; + } + ++EXPORT_SYMBOL_GPL(VDSO32_SYSENTER_RETURN); ++EXPORT_SYMBOL_GPL(VDSO32_PRELINK); ++ + /* Setup a VMA at program startup for the vsyscall page */ +-int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) ++int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack, ++ unsigned long map_address) + { + struct mm_struct *mm = current->mm; +- unsigned long addr; ++ unsigned long addr = map_address; + int ret = 0; + bool compat; ++ unsigned long flags; + +- if (vdso_enabled == VDSO_DISABLED) ++ if (vdso_enabled == VDSO_DISABLED && map_address == 0) { ++ current->mm->context.vdso = NULL; + return 0; ++ } ++ ++ flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE | ++ mm->def_flags; ++ ++ ret = -ENOMEM; ++ if (ub_memory_charge(mm, PAGE_SIZE, flags, NULL, UB_SOFT)) ++ goto err_charge; + + down_write(&mm->mmap_sem); + +@@ -338,17 +357,16 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) + + map_compat_vdso(compat); + +- if (compat) +- addr = VDSO_HIGH_BASE; +- else { +- addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); ++ if (!compat || map_address) { ++ addr = get_unmapped_area(NULL, addr, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } +- } ++ } else ++ addr = VDSO_HIGH_BASE; + +- if (compat_uses_vma || !compat) { ++ if (compat_uses_vma || !compat || map_address) { + /* + * MAYWRITE to allow gdb to COW and set breakpoints + * +@@ -374,9 +392,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) + + up_fail: + up_write(&mm->mmap_sem); ++ if (ret < 0) ++ ub_memory_uncharge(mm, PAGE_SIZE, flags, NULL); ++err_charge: + + return ret; + } ++EXPORT_SYMBOL_GPL(arch_setup_additional_pages); + + #ifdef CONFIG_X86_64 + +diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c +index 3fdd514..785d7fd 100644 +--- a/arch/x86/vdso/vma.c ++++ b/arch/x86/vdso/vma.c +@@ -4,6 +4,7 @@ + * Subject to the GPL, v.2 + */ + #include ++#include + #include + #include + #include +@@ -16,7 +17,7 @@ + #include "vextern.h" /* Just for VMAGIC. */ + #undef VEXTERN + +-int vdso_enabled = 1; ++unsigned int vdso_enabled = 1; + + extern char vdso_start[], vdso_end[]; + extern unsigned short vdso_sync_cpuid; +@@ -96,18 +97,24 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) + + /* Setup a VMA at program startup for the vsyscall page. 
+ Not called for compat tasks */ +-int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) ++int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack, ++ unsigned long map_address) + { + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret; + unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE); + +- if (!vdso_enabled) ++ if (!vdso_enabled && map_address == 0) { ++ current->mm->context.vdso = NULL; + return 0; ++ } + + down_write(&mm->mmap_sem); +- addr = vdso_addr(mm->start_stack, len); ++ if (map_address) ++ addr = map_address; ++ else ++ addr = vdso_addr(mm->start_stack, len); + addr = get_unmapped_area(NULL, addr, len, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; +@@ -127,6 +134,7 @@ up_fail: + up_write(&mm->mmap_sem); + return ret; + } ++EXPORT_SYMBOL_GPL(arch_setup_additional_pages); + + static __init int vdso_setup(char *s) + { +diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c +index d01b411..497b791 100644 +--- a/block/cfq-iosched.c ++++ b/block/cfq-iosched.c +@@ -11,6 +11,11 @@ + #include + #include + #include ++#include ++#include ++#include ++#include ++#include + + /* + * tunables +@@ -26,6 +31,7 @@ static const int cfq_slice_sync = HZ / 10; + static int cfq_slice_async = HZ / 25; + static const int cfq_slice_async_rq = 2; + static int cfq_slice_idle = HZ / 125; ++static int cfq_ub_slice = HZ / 2; + + /* + * offset from end of service tree +@@ -43,13 +49,11 @@ static int cfq_slice_idle = HZ / 125; + ((struct cfq_io_context *) (rq)->elevator_private) + #define RQ_CFQQ(rq) ((rq)->elevator_private2) + +-static struct kmem_cache *cfq_pool; + static struct kmem_cache *cfq_ioc_pool; + + static DEFINE_PER_CPU(unsigned long, ioc_count); + static struct completion *ioc_gone; + +-#define CFQ_PRIO_LISTS IOPRIO_BE_NR + #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) + #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) + +@@ -58,105 +62,6 @@ static struct completion *ioc_gone; + + #define sample_valid(samples) ((samples) > 80) + +-/* +- * Most of our rbtree usage is for sorting with min extraction, so +- * if we cache the leftmost node we don't have to walk down the tree +- * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should +- * move this into the elevator for the rq sorting as well. 
+- */ +-struct cfq_rb_root { +- struct rb_root rb; +- struct rb_node *left; +-}; +-#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } +- +-/* +- * Per block device queue structure +- */ +-struct cfq_data { +- struct request_queue *queue; +- +- /* +- * rr list of queues with requests and the count of them +- */ +- struct cfq_rb_root service_tree; +- unsigned int busy_queues; +- +- int rq_in_driver; +- int sync_flight; +- int hw_tag; +- +- /* +- * idle window management +- */ +- struct timer_list idle_slice_timer; +- struct work_struct unplug_work; +- +- struct cfq_queue *active_queue; +- struct cfq_io_context *active_cic; +- +- /* +- * async queue for each priority case +- */ +- struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; +- struct cfq_queue *async_idle_cfqq; +- +- sector_t last_position; +- unsigned long last_end_request; +- +- /* +- * tunables, see top of file +- */ +- unsigned int cfq_quantum; +- unsigned int cfq_fifo_expire[2]; +- unsigned int cfq_back_penalty; +- unsigned int cfq_back_max; +- unsigned int cfq_slice[2]; +- unsigned int cfq_slice_async_rq; +- unsigned int cfq_slice_idle; +- +- struct list_head cic_list; +-}; +- +-/* +- * Per process-grouping structure +- */ +-struct cfq_queue { +- /* reference count */ +- atomic_t ref; +- /* various state flags, see below */ +- unsigned int flags; +- /* parent cfq_data */ +- struct cfq_data *cfqd; +- /* service_tree member */ +- struct rb_node rb_node; +- /* service_tree key */ +- unsigned long rb_key; +- /* sorted list of pending requests */ +- struct rb_root sort_list; +- /* if fifo isn't expired, next request to serve */ +- struct request *next_rq; +- /* requests queued in sort_list */ +- int queued[2]; +- /* currently allocated requests */ +- int allocated[2]; +- /* fifo list of requests in sort_list */ +- struct list_head fifo; +- +- unsigned long slice_end; +- long slice_resid; +- +- /* pending metadata requests */ +- int meta_pending; +- /* number of requests that are on the dispatch list or inside driver */ +- int dispatched; +- +- /* io prio of this group */ +- unsigned short ioprio, org_ioprio; +- unsigned short ioprio_class, org_ioprio_class; +- +-}; +- + enum cfqq_state_flags { + CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ + CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ +@@ -201,6 +106,67 @@ CFQ_CFQQ_FNS(sync); + static void cfq_dispatch_insert(struct request_queue *, struct request *); + static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, + struct io_context *, gfp_t); ++static void cfq_put_queue(struct cfq_queue *cfqq); ++ ++static void __cfq_put_async_queues(struct cfq_bc_data *cfq_bc) ++{ ++ int i; ++ ++ for (i = 0; i < CFQ_PRIO_LISTS; i++) { ++ if (cfq_bc->async_cfqq[0][i]) { ++ cfq_put_queue(cfq_bc->async_cfqq[0][i]); ++ cfq_bc->async_cfqq[0][i] = NULL; ++ } ++ if (cfq_bc->async_cfqq[1][i]) { ++ cfq_put_queue(cfq_bc->async_cfqq[1][i]); ++ cfq_bc->async_cfqq[1][i] = NULL; ++ } ++ } ++ if (cfq_bc->async_idle_cfqq) { ++ cfq_put_queue(cfq_bc->async_idle_cfqq); ++ cfq_bc->async_idle_cfqq = NULL; ++ } ++} ++ ++#ifdef CONFIG_BC_IO_SCHED ++static inline struct ub_iopriv *cfqq_ub_iopriv(struct cfq_data *cfqd, int sync) ++{ ++ int mode; ++ ++ mode = sync ? cfqd->virt_mode : cfqd->write_virt_mode; ++ return mode ? 
&get_io_ub()->iopriv : &get_ub0()->iopriv; ++} ++ ++static inline void cfq_put_async_queues(struct cfq_data *cfqd) ++{ ++ struct user_beancounter *ub; ++ struct cfq_bc_data *cfq_bc; ++ ++ rcu_read_lock(); ++ for_each_beancounter(ub) { ++ write_lock(&ub->iopriv.cfq_bc_list_lock); ++ cfq_bc = __find_cfq_bc(&ub->iopriv, cfqd); ++ if (!cfq_bc) { ++ write_unlock(&ub->iopriv.cfq_bc_list_lock); ++ continue; ++ } ++ __cfq_put_async_queues(cfq_bc); ++ write_unlock(&ub->iopriv.cfq_bc_list_lock); ++ } ++ rcu_read_unlock(); ++} ++#else ++static inline struct ub_iopriv *cfqq_ub_iopriv(struct cfq_data *cfqd, int sync) ++{ ++ return NULL; ++} ++ ++static inline void cfq_put_async_queues(struct cfq_data *cfqd) ++{ ++ __cfq_put_async_queues(&cfqd->cfq_bc); ++} ++#endif ++ + static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, + struct io_context *); + +@@ -287,6 +253,11 @@ static inline int cfq_slice_used(struct cfq_queue *cfqq) + return 1; + } + ++static inline struct user_beancounter *ub_by_iopriv(struct ub_iopriv *iopriv) ++{ ++ return container_of(iopriv, struct user_beancounter, iopriv); ++} ++ + /* + * Lifted from AS - choose which of rq1 and rq2 that is best served now. + * We choose the request that is closest to the head right now. Distance +@@ -450,6 +421,7 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, + static void cfq_service_tree_add(struct cfq_data *cfqd, + struct cfq_queue *cfqq, int add_front) + { ++ struct cfq_bc_data *cfq_bc = cfqq->cfq_bc; + struct rb_node **p, *parent; + struct cfq_queue *__cfqq; + unsigned long rb_key; +@@ -457,7 +429,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, + + if (cfq_class_idle(cfqq)) { + rb_key = CFQ_IDLE_DELAY; +- parent = rb_last(&cfqd->service_tree.rb); ++ parent = rb_last(&cfq_bc->service_tree.rb); + if (parent && parent != &cfqq->rb_node) { + __cfqq = rb_entry(parent, struct cfq_queue, rb_node); + rb_key += __cfqq->rb_key; +@@ -477,12 +449,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, + if (rb_key == cfqq->rb_key) + return; + +- cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); ++ cfq_rb_erase(&cfqq->rb_node, &cfq_bc->service_tree); + } + + left = 1; + parent = NULL; +- p = &cfqd->service_tree.rb.rb_node; ++ p = &cfq_bc->service_tree.rb.rb_node; + while (*p) { + struct rb_node **n; + +@@ -514,11 +486,11 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, + } + + if (left) +- cfqd->service_tree.left = &cfqq->rb_node; ++ cfq_bc->service_tree.left = &cfqq->rb_node; + + cfqq->rb_key = rb_key; + rb_link_node(&cfqq->rb_node, parent, p); +- rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); ++ rb_insert_color(&cfqq->rb_node, &cfq_bc->service_tree.rb); + } + + /* +@@ -542,6 +514,7 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) + BUG_ON(cfq_cfqq_on_rr(cfqq)); + cfq_mark_cfqq_on_rr(cfqq); + cfqd->busy_queues++; ++ bc_inc_rqnum(cfqq); + + cfq_resort_rr_list(cfqd, cfqq); + } +@@ -552,14 +525,19 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) + */ + static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) + { ++ struct cfq_bc_data *cfq_bc; ++ + BUG_ON(!cfq_cfqq_on_rr(cfqq)); + cfq_clear_cfqq_on_rr(cfqq); + ++ cfq_bc = cfqq->cfq_bc; ++ + if (!RB_EMPTY_NODE(&cfqq->rb_node)) +- cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); ++ cfq_rb_erase(&cfqq->rb_node, &cfq_bc->service_tree); + + BUG_ON(!cfqd->busy_queues); + cfqd->busy_queues--; ++ bc_dec_rqnum(cfqq); + } + + /* +@@ -675,8 +653,7 @@ static void 
cfq_remove_request(struct request *rq) + } + } + +-static int cfq_merge(struct request_queue *q, struct request **req, +- struct bio *bio) ++static int cfq_merge(struct request_queue *q, struct request **req, struct bio *bio) + { + struct cfq_data *cfqd = q->elevator->elevator_data; + struct request *__rq; +@@ -800,10 +777,16 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, int timed_out) + */ + static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) + { +- if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) ++ struct cfq_bc_data *cfq_bc; ++ ++ cfq_bc = cfqd->active_cfq_bc; ++ if (!cfq_bc) + return NULL; + +- return cfq_rb_first(&cfqd->service_tree); ++ if (RB_EMPTY_ROOT(&cfq_bc->service_tree.rb)) ++ return NULL; ++ ++ return cfq_rb_first(&cfq_bc->service_tree); + } + + /* +@@ -811,9 +794,17 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) + */ + static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) + { +- struct cfq_queue *cfqq; ++ struct cfq_queue *cfqq = NULL; ++ struct cfq_bc_data *cfq_bc; ++ ++ bc_schedule_active(cfqd); ++ ++ cfq_bc = cfqd->active_cfq_bc; ++ if (!cfq_bc) ++ goto out; + + cfqq = cfq_get_next_queue(cfqd); ++out: + __cfq_set_active_queue(cfqd, cfqq); + return cfqq; + } +@@ -904,6 +895,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) + + cfq_remove_request(rq); + cfqq->dispatched++; ++ cfqq->cfq_bc->on_dispatch++; + elv_dispatch_sort(q, rq); + + if (cfq_cfqq_sync(cfqq)) +@@ -961,7 +953,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) + /* + * The active queue has run out of time, expire it and select new. + */ +- if (cfq_slice_used(cfqq)) ++ if (cfq_slice_used(cfqq) || bc_expired(cfqd)) + goto expire; + + /* +@@ -1060,14 +1052,33 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) + * Drain our current requests. Used for barriers and when switching + * io schedulers on-the-fly. 
+ */ +-static int cfq_forced_dispatch(struct cfq_data *cfqd) ++static int __cfq_forced_dispatch(struct cfq_bc_data *cfq_bc) + { + struct cfq_queue *cfqq; + int dispatched = 0; + +- while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) ++ while ((cfqq = cfq_rb_first(&cfq_bc->service_tree)) != NULL) + dispatched += __cfq_forced_dispatch_cfqq(cfqq); + ++ return dispatched; ++} ++ ++static int cfq_forced_dispatch(struct cfq_data *cfqd) ++{ ++ struct cfq_bc_data *cfq_bc; ++ struct cfq_bc_data *cfq_bc_tmp; ++ int dispatched; ++ ++ dispatched = 0; ++ /* ++ * We use here _safe iterating, because ++ * __cfq_forced_dispatch() produces list_del() implicitly ++ */ ++ list_for_each_entry_safe(cfq_bc, cfq_bc_tmp, ++ &cfqd->act_cfq_bc_head, act_cfq_bc_list) { ++ dispatched += __cfq_forced_dispatch(cfq_bc); ++ } ++ + cfq_slice_expired(cfqd, 0); + + BUG_ON(cfqd->busy_queues); +@@ -1243,6 +1254,10 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, + if (ioc->ioc_data == cic) + rcu_assign_pointer(ioc->ioc_data, NULL); + ++ /* ++ * cic->cfqq[ASYNC] is always NULL and the put of async queues ++ * happens on appropriate bc death or device unplug ++ */ + if (cic->cfqq[ASYNC]) { + cfq_exit_cfqq(cfqd, cic->cfqq[ASYNC]); + cic->cfqq[ASYNC] = NULL; +@@ -1351,6 +1366,10 @@ static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + ++ /* ++ * cic->cfqq[ASYNC] is always NULL, ioprio change ++ * for async queues happens automatically ++ */ + cfqq = cic->cfqq[ASYNC]; + if (cfqq) { + struct cfq_queue *new_cfqq; +@@ -1380,8 +1399,11 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, int is_sync, + { + struct cfq_queue *cfqq, *new_cfqq = NULL; + struct cfq_io_context *cic; ++ struct ub_iopriv *iopriv; ++ struct cfq_bc_data *cfq_bc = NULL; + + retry: ++ iopriv = cfqq_ub_iopriv(cfqd, is_sync); + cic = cfq_cic_lookup(cfqd, ioc); + /* cic always exists here */ + cfqq = cic_to_cfqq(cic, is_sync); +@@ -1399,18 +1421,32 @@ retry: + */ + spin_unlock_irq(cfqd->queue->queue_lock); + new_cfqq = kmem_cache_alloc_node(cfq_pool, +- gfp_mask | __GFP_NOFAIL | __GFP_ZERO, ++ gfp_mask|__GFP_NOFAIL|__GFP_ZERO, + cfqd->queue->node); ++ if (new_cfqq) { ++ cfq_bc = bc_findcreate_cfq_bc(iopriv, ++ cfqd, gfp_mask); ++ if (!cfq_bc) { ++ kmem_cache_free(cfq_pool, new_cfqq); ++ new_cfqq = NULL; ++ } ++ } + spin_lock_irq(cfqd->queue->queue_lock); + goto retry; + } else { + cfqq = kmem_cache_alloc_node(cfq_pool, +- gfp_mask | __GFP_ZERO, +- cfqd->queue->node); ++ gfp_mask|__GFP_ZERO, cfqd->queue->node); + if (!cfqq) + goto out; ++ cfq_bc = bc_findcreate_cfq_bc(iopriv, cfqd, gfp_mask); ++ if (!cfq_bc) { ++ kmem_cache_free(cfq_pool, cfqq); ++ cfqq = NULL; ++ goto out; ++ } + } + ++ cfqq->cfq_bc = cfq_bc; + RB_CLEAR_NODE(&cfqq->rb_node); + INIT_LIST_HEAD(&cfqq->fifo); + +@@ -1438,15 +1474,15 @@ out: + } + + static struct cfq_queue ** +-cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) ++cfq_async_queue_prio(struct cfq_bc_data *cfq_bc, int ioprio_class, int ioprio) + { + switch (ioprio_class) { + case IOPRIO_CLASS_RT: +- return &cfqd->async_cfqq[0][ioprio]; ++ return &cfq_bc->async_cfqq[0][ioprio]; + case IOPRIO_CLASS_BE: +- return &cfqd->async_cfqq[1][ioprio]; ++ return &cfq_bc->async_cfqq[1][ioprio]; + case IOPRIO_CLASS_IDLE: +- return &cfqd->async_idle_cfqq; ++ return &cfq_bc->async_idle_cfqq; + default: + BUG(); + } +@@ -1460,9 +1496,16 @@ cfq_get_queue(struct cfq_data *cfqd, int is_sync, struct io_context *ioc, + const int ioprio_class 
= task_ioprio_class(ioc);
+ struct cfq_queue **async_cfqq = NULL;
+ struct cfq_queue *cfqq = NULL;
++ struct cfq_bc_data *cfq_bc;
++ struct ub_iopriv *iopriv;
++
++ iopriv = cfqq_ub_iopriv(cfqd, is_sync);
+
+ if (!is_sync) {
+- async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
++ cfq_bc = bc_findcreate_cfq_bc(iopriv, cfqd, gfp_mask);
++ if (!cfq_bc)
++ return NULL;
++ async_cfqq = cfq_async_queue_prio(cfq_bc, ioprio_class, ioprio);
+ cfqq = *async_cfqq;
+ }
+
+@@ -1840,6 +1883,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
+ WARN_ON(!cfqq->dispatched);
+ cfqd->rq_in_driver--;
+ cfqq->dispatched--;
++ cfqq->cfq_bc->on_dispatch--;
+
+ if (cfq_cfqq_sync(cfqq))
+ cfqd->sync_flight--;
+@@ -1952,6 +1996,7 @@ static void cfq_put_request(struct request *rq)
+ rq->elevator_private = NULL;
+ rq->elevator_private2 = NULL;
+
++ put_beancounter(ub_by_iopriv(cfqq->cfq_bc->ub_iopriv));
+ cfq_put_queue(cfqq);
+ }
+ }
+@@ -1968,14 +2013,19 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
+ const int is_sync = rq_is_sync(rq);
+ struct cfq_queue *cfqq;
+ unsigned long flags;
++ struct ub_iopriv *iopriv;
++ struct cfq_bc_data *cfq_bc = NULL;
+
+ might_sleep_if(gfp_mask & __GFP_WAIT);
+
+ cic = cfq_get_io_context(cfqd, gfp_mask);
++ iopriv = cfqq_ub_iopriv(cfqd, is_sync);
++ if (!is_sync)
++ cfq_bc = bc_findcreate_cfq_bc(iopriv, cfqd, gfp_mask);
+
+ spin_lock_irqsave(q->queue_lock, flags);
+
+- if (!cic)
++ if (!cic || (!is_sync && cfq_bc == NULL))
+ goto queue_fail;
+
+ cfqq = cic_to_cfqq(cic, is_sync);
+@@ -1996,6 +2046,7 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
+
+ rq->elevator_private = cic;
+ rq->elevator_private2 = cfqq;
++ get_beancounter(ub_by_iopriv(cfqq->cfq_bc->ub_iopriv));
+ return 0;
+
+ queue_fail:
+@@ -2070,21 +2121,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
+ kblockd_flush_work(&cfqd->unplug_work);
+ }
+
+-static void cfq_put_async_queues(struct cfq_data *cfqd)
+-{
+- int i;
+-
+- for (i = 0; i < IOPRIO_BE_NR; i++) {
+- if (cfqd->async_cfqq[0][i])
+- cfq_put_queue(cfqd->async_cfqq[0][i]);
+- if (cfqd->async_cfqq[1][i])
+- cfq_put_queue(cfqd->async_cfqq[1][i]);
+- }
+-
+- if (cfqd->async_idle_cfqq)
+- cfq_put_queue(cfqd->async_idle_cfqq);
+-}
+-
+ static void cfq_exit_queue(elevator_t *e)
+ {
+ struct cfq_data *cfqd = e->elevator_data;
+@@ -2111,6 +2147,8 @@ static void cfq_exit_queue(elevator_t *e)
+
+ cfq_shutdown_timer_wq(cfqd);
+
++ bc_cfq_exit_queue(cfqd);
++
+ kfree(cfqd);
+ }
+
+@@ -2118,11 +2156,19 @@ static void *cfq_init_queue(struct request_queue *q)
+ {
+ struct cfq_data *cfqd;
+
+- cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
++ cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL|__GFP_ZERO, q->node);
+ if (!cfqd)
+ return NULL;
+
+- cfqd->service_tree = CFQ_RB_ROOT;
++ INIT_LIST_HEAD(&cfqd->act_cfq_bc_head);
++#ifndef CONFIG_BC_IO_SCHED
++ cfq_init_cfq_bc(&cfqd->cfq_bc);
++ /*
++ * Adding ub0 to the active list in order to serve the force dispatching
++ * case uniformly. Note that nobody removes ub0 from this list. 
++ */ ++ list_add_tail(&cfqd->cfq_bc.act_cfq_bc_list, &cfqd->act_cfq_bc_head); ++#endif + INIT_LIST_HEAD(&cfqd->cic_list); + + cfqd->queue = q; +@@ -2143,6 +2189,9 @@ static void *cfq_init_queue(struct request_queue *q) + cfqd->cfq_slice[1] = cfq_slice_sync; + cfqd->cfq_slice_async_rq = cfq_slice_async_rq; + cfqd->cfq_slice_idle = cfq_slice_idle; ++ cfqd->cfq_ub_slice = cfq_ub_slice; ++ cfqd->virt_mode = 1; ++ cfqd->write_virt_mode = 1; + + return cfqd; + } +@@ -2211,6 +2260,9 @@ SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); + SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); + SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); + SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); ++SHOW_FUNCTION(cfq_ub_slice_show, cfqd->cfq_ub_slice, 1); ++SHOW_FUNCTION(cfq_virt_mode_show, cfqd->virt_mode, 0); ++SHOW_FUNCTION(cfq_write_virt_mode_show, cfqd->write_virt_mode, 0); + #undef SHOW_FUNCTION + + #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +@@ -2242,6 +2294,9 @@ STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); + STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); + STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, + UINT_MAX, 0); ++STORE_FUNCTION(cfq_ub_slice_store, &cfqd->cfq_ub_slice, 1, UINT_MAX, 1); ++STORE_FUNCTION(cfq_virt_mode_store, &cfqd->virt_mode, 0, 1, 0); ++STORE_FUNCTION(cfq_write_virt_mode_store, &cfqd->write_virt_mode, 0, 1, 0); + #undef STORE_FUNCTION + + #define CFQ_ATTR(name) \ +@@ -2257,6 +2312,9 @@ static struct elv_fs_entry cfq_attrs[] = { + CFQ_ATTR(slice_async), + CFQ_ATTR(slice_async_rq), + CFQ_ATTR(slice_idle), ++ CFQ_ATTR(ub_slice), ++ CFQ_ATTR(virt_mode), ++ CFQ_ATTR(write_virt_mode), + __ATTR_NULL + }; + +@@ -2280,6 +2338,7 @@ static struct elevator_type iosched_cfq = { + .elevator_init_fn = cfq_init_queue, + .elevator_exit_fn = cfq_exit_queue, + .trim = cfq_free_io_context, ++ .put_queue = cfq_put_queue, + }, + .elevator_attrs = cfq_attrs, + .elevator_name = "cfq", +diff --git a/block/elevator.c b/block/elevator.c +index 902dd13..7241736 100644 +--- a/block/elevator.c ++++ b/block/elevator.c +@@ -40,6 +40,9 @@ + static DEFINE_SPINLOCK(elv_list_lock); + static LIST_HEAD(elv_list); + ++struct kmem_cache *cfq_pool; ++EXPORT_SYMBOL_GPL(cfq_pool); ++ + /* + * Merge hash stuff. 
+ */ +@@ -1028,12 +1031,12 @@ void elv_unregister(struct elevator_type *e) + */ + if (e->ops.trim) { + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + task_lock(p); + if (p->io_context) + e->ops.trim(p->io_context); + task_unlock(p); +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + read_unlock(&tasklist_lock); + } + +diff --git a/block/genhd.c b/block/genhd.c +index b922d48..901cf04 100644 +--- a/block/genhd.c ++++ b/block/genhd.c +@@ -513,6 +513,7 @@ static void disk_release(struct device *dev) + struct class block_class = { + .name = "block", + }; ++EXPORT_SYMBOL(block_class); + + static struct device_type disk_type = { + .name = "disk", +diff --git a/drivers/base/class.c b/drivers/base/class.c +index e085af0..b7fbb22 100644 +--- a/drivers/base/class.c ++++ b/drivers/base/class.c +@@ -18,6 +18,8 @@ + #include + #include + #include ++#include ++#include + #include "base.h" + + #define to_class_attr(_attr) container_of(_attr, struct class_attribute, attr) +@@ -71,8 +73,14 @@ static struct kobj_type class_ktype = { + }; + + /* Hotplug events for classes go to the class_obj subsys */ +-static struct kset *class_kset; ++struct kset *class_kset; ++EXPORT_SYMBOL_GPL(class_kset); + ++#ifndef CONFIG_VE ++#define visible_class_kset class_kset ++#else ++#define visible_class_kset (get_exec_env()->class_kset) ++#endif + + int class_create_file(struct class *cls, const struct class_attribute *attr) + { +@@ -151,9 +159,9 @@ int class_register(struct class *cls) + #if defined(CONFIG_SYSFS_DEPRECATED) && defined(CONFIG_BLOCK) + /* let the block class directory show up in the root of sysfs */ + if (cls != &block_class) +- cls->subsys.kobj.kset = class_kset; ++ cls->subsys.kobj.kset = visible_class_kset; + #else +- cls->subsys.kobj.kset = class_kset; ++ cls->subsys.kobj.kset = visible_class_kset; + #endif + cls->subsys.kobj.ktype = &class_ktype; + +@@ -379,13 +387,20 @@ void class_interface_unregister(struct class_interface *class_intf) + class_put(parent); + } + +-int __init classes_init(void) ++int classes_init(void) + { +- class_kset = kset_create_and_add("class", NULL, NULL); +- if (!class_kset) ++ visible_class_kset = kset_create_and_add("class", NULL, NULL); ++ if (!visible_class_kset) + return -ENOMEM; + return 0; + } ++EXPORT_SYMBOL_GPL(classes_init); ++ ++void classes_fini(void) ++{ ++ kset_unregister(visible_class_kset); ++} ++EXPORT_SYMBOL_GPL(classes_fini); + + EXPORT_SYMBOL_GPL(class_create_file); + EXPORT_SYMBOL_GPL(class_remove_file); +diff --git a/drivers/base/core.c b/drivers/base/core.c +index ee0a51a..660ecc0 100644 +--- a/drivers/base/core.c ++++ b/drivers/base/core.c +@@ -21,6 +21,8 @@ + #include + #include + #include ++#include ++#include + + #include "base.h" + #include "power/power.h" +@@ -417,9 +419,13 @@ static ssize_t show_dev(struct device *dev, struct device_attribute *attr, + static struct device_attribute devt_attr = + __ATTR(dev, S_IRUGO, show_dev, NULL); + +-/* kset to create /sys/devices/ */ + struct kset *devices_kset; + ++/* kset to create /sys/devices/ */ ++#ifdef CONFIG_VE ++#define ve_devices_kset (get_exec_env()->devices_kset) ++#endif ++ + /** + * device_create_file - create sysfs attribute file for device. + * @dev: device. 
+@@ -529,7 +535,7 @@ static void klist_children_put(struct klist_node *n) + */ + void device_initialize(struct device *dev) + { +- dev->kobj.kset = devices_kset; ++ dev->kobj.kset = ve_devices_kset; + kobject_init(&dev->kobj, &device_ktype); + klist_init(&dev->klist_children, klist_children_get, + klist_children_put); +@@ -566,7 +572,7 @@ static struct kobject *virtual_device_parent(struct device *dev) + + if (!virtual_dir) + virtual_dir = kobject_create_and_add("virtual", +- &devices_kset->kobj); ++ &ve_devices_kset->kobj); + + return virtual_dir; + } +@@ -1069,13 +1075,23 @@ struct device *device_find_child(struct device *parent, void *data, + return child; + } + +-int __init devices_init(void) ++int devices_init(void) + { +- devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL); +- if (!devices_kset) ++ ve_devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL); ++ if (!ve_devices_kset) + return -ENOMEM; ++ if (ve_is_super(get_exec_env())) ++ devices_kset = ve_devices_kset; ++ + return 0; + } ++EXPORT_SYMBOL_GPL(devices_init); ++ ++void devices_fini(void) ++{ ++ kset_unregister(devices_kset); ++} ++EXPORT_SYMBOL_GPL(devices_fini); + + EXPORT_SYMBOL_GPL(device_for_each_child); + EXPORT_SYMBOL_GPL(device_find_child); +diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c +index d9a0a53..ab32c26 100644 +--- a/drivers/char/keyboard.c ++++ b/drivers/char/keyboard.c +@@ -160,6 +160,7 @@ unsigned char kbd_sysrq_xlate[KEY_MAX + 1] = + static int sysrq_down; + static int sysrq_alt_use; + #endif ++int sysrq_key_scancode = KEY_SYSRQ; + static int sysrq_alt; + + /* +@@ -1065,6 +1066,9 @@ static int emulate_raw(struct vc_data *vc, unsigned int keycode, + { + int code; + ++ if (keycode == sysrq_key_scancode && sysrq_alt) ++ goto sysrq; ++ + switch (keycode) { + case KEY_PAUSE: + put_queue(vc, 0xe1); +@@ -1083,6 +1087,7 @@ static int emulate_raw(struct vc_data *vc, unsigned int keycode, + break; + + case KEY_SYSRQ: ++sysrq: + /* + * Real AT keyboards (that's what we're trying + * to emulate here emit 0xe0 0x2a 0xe0 0x37 when +@@ -1179,7 +1184,8 @@ static void kbd_keycode(unsigned int keycode, int down, int hw_raw) + printk(KERN_WARNING "keyboard.c: can't emulate rawmode for keycode %d\n", keycode); + + #ifdef CONFIG_MAGIC_SYSRQ /* Handle the SysRq Hack */ +- if (keycode == KEY_SYSRQ && (sysrq_down || (down == 1 && sysrq_alt))) { ++ if ((keycode == sysrq_key_scancode || keycode == KEY_SYSRQ) && ++ (sysrq_down || (down == 1 && sysrq_alt))) { + if (!sysrq_down) { + sysrq_down = down; + sysrq_alt_use = sysrq_alt; +diff --git a/drivers/char/pty.c b/drivers/char/pty.c +index 0a05c03..9c0ccce 100644 +--- a/drivers/char/pty.c ++++ b/drivers/char/pty.c +@@ -29,16 +29,22 @@ + #include + #include + ++#include ++ + /* These are global because they are accessed in tty_io.c */ + #ifdef CONFIG_UNIX98_PTYS + struct tty_driver *ptm_driver; +-static struct tty_driver *pts_driver; ++struct tty_driver *pts_driver; ++EXPORT_SYMBOL(ptm_driver); ++EXPORT_SYMBOL(pts_driver); + #endif + + static void pty_close(struct tty_struct * tty, struct file * filp) + { + if (!tty) + return; ++ ++ ub_pty_uncharge(tty); + if (tty->driver->subtype == PTY_TYPE_MASTER) { + if (tty->count > 1) + printk("master pty_close: count = %d!!\n", tty->count); +@@ -58,8 +64,12 @@ static void pty_close(struct tty_struct * tty, struct file * filp) + if (tty->driver->subtype == PTY_TYPE_MASTER) { + set_bit(TTY_OTHER_CLOSED, &tty->flags); + #ifdef CONFIG_UNIX98_PTYS +- if (tty->driver == ptm_driver) ++ if 
(tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) { ++ struct ve_struct *old_env; ++ old_env = set_exec_env(tty->owner_env); + devpts_pty_kill(tty->index); ++ (void)set_exec_env(old_env); ++ } + #endif + tty_vhangup(tty->link); + } +@@ -212,6 +222,10 @@ static int pty_open(struct tty_struct *tty, struct file * filp) + if (tty->link->count != 1) + goto out; + ++ retval = -ENOMEM; ++ if (ub_pty_charge(tty)) ++ goto out; ++ + clear_bit(TTY_OTHER_CLOSED, &tty->link->flags); + set_bit(TTY_THROTTLED, &tty->flags); + set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); +@@ -239,7 +253,9 @@ static const struct tty_operations pty_ops = { + + /* Traditional BSD devices */ + #ifdef CONFIG_LEGACY_PTYS +-static struct tty_driver *pty_driver, *pty_slave_driver; ++struct tty_driver *pty_driver, *pty_slave_driver; ++EXPORT_SYMBOL(pty_driver); ++EXPORT_SYMBOL(pty_slave_driver); + + static int pty_bsd_ioctl(struct tty_struct *tty, struct file *file, + unsigned int cmd, unsigned long arg) +@@ -452,6 +468,9 @@ static void __init unix98_pty_init(void) + + pty_table[1].data = &ptm_driver->refcount; + register_sysctl_table(pty_root_table); ++#ifdef CONFIG_VE ++ get_ve0()->ptm_driver = ptm_driver; ++#endif + } + #else + static inline void unix98_pty_init(void) { } +diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c +index dbce126..cb46ae2 100644 +--- a/drivers/char/sysrq.c ++++ b/drivers/char/sysrq.c +@@ -36,6 +36,8 @@ + #include + #include + #include ++#include ++#include + #include + + #include +@@ -241,9 +243,16 @@ static struct sysrq_key_op sysrq_showallcpus_op = { + static void sysrq_handle_showregs(int key, struct tty_struct *tty) + { + struct pt_regs *regs = get_irq_regs(); ++ ++ bust_spinlocks(1); + if (regs) + show_regs(regs); ++ bust_spinlocks(0); ++#if defined(__i386__) || defined(__x86_64__) ++ smp_nmi_call_function(smp_show_regs, NULL, 1); ++#endif + } ++ + static struct sysrq_key_op sysrq_showregs_op = { + .handler = sysrq_handle_showregs, + .help_msg = "showPc", +@@ -277,6 +286,7 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = { + static void sysrq_handle_showmem(int key, struct tty_struct *tty) + { + show_mem(); ++ show_slab_info(); + } + static struct sysrq_key_op sysrq_showmem_op = { + .handler = sysrq_handle_showmem, +@@ -292,7 +302,7 @@ static void send_sig_all(int sig) + { + struct task_struct *p; + +- for_each_process(p) { ++ for_each_process_all(p) { + if (p->mm && !is_global_init(p)) + /* Not swapper, init nor kernel thread */ + force_sig(sig, p); +@@ -354,7 +364,267 @@ static struct sysrq_key_op sysrq_unrt_op = { + /* Key Operations table and lock */ + static DEFINE_SPINLOCK(sysrq_key_table_lock); + +-static struct sysrq_key_op *sysrq_key_table[36] = { ++#define SYSRQ_KEY_TABLE_LENGTH 37 ++static struct sysrq_key_op **sysrq_key_table; ++static struct sysrq_key_op *sysrq_default_key_table[]; ++ ++#ifdef CONFIG_SYSRQ_DEBUG ++#define SYSRQ_NAMELEN_MAX 64 ++#define SYSRQ_DUMP_LINES 32 ++ ++static struct sysrq_key_op *sysrq_debug_key_table[]; ++static struct sysrq_key_op *sysrq_input_key_table[]; ++static unsigned long *dump_address; ++static int orig_console_loglevel; ++static void (*sysrq_input_return)(char *) = NULL; ++ ++static void dump_mem(void) ++{ ++ unsigned long value[4]; ++ mm_segment_t old_fs; ++ int line, err; ++ ++ old_fs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = 0; ++ ++ for (line = 0; line < SYSRQ_DUMP_LINES; line++) { ++ err |= __get_user(value[0], dump_address++); ++ err |= __get_user(value[1], dump_address++); ++ err |= __get_user(value[2], dump_address++); ++ err |= 
__get_user(value[3], dump_address++);
++		if (err) {
++			printk("Invalid address %p\n", dump_address - 4);
++			break;
++		}
++#if BITS_PER_LONG == 32
++		printk("0x%p: %08lx %08lx %08lx %08lx\n",
++				dump_address - 4,
++				value[0], value[1], value[2], value[3]);
++#else
++		printk("0x%p: %016lx %016lx %016lx %016lx\n",
++				dump_address - 4,
++				value[0], value[1], value[2], value[3]);
++#endif
++	}
++	set_fs(old_fs);
++}
++
++static void write_mem(unsigned long val)
++{
++	mm_segment_t old_fs;
++	unsigned long old_val;
++
++	old_fs = get_fs();
++	set_fs(KERNEL_DS);
++	if (__get_user(old_val, dump_address)) {
++		printk("Invalid address %p\n", dump_address);
++		goto out;
++	}
++
++#if BITS_PER_LONG == 32
++	printk("Changing [%p] from %08lx to %08lx\n",
++			dump_address, old_val, val);
++#else
++	printk("Changing [%p] from %016lx to %016lx\n",
++			dump_address, old_val, val);
++#endif
++	__put_user(val, dump_address);
++out:
++	set_fs(old_fs);
++}
++
++static void handle_read(int key, struct tty_struct *tty)
++{
++	static int pos;
++	static int upper_case;
++	static char str[SYSRQ_NAMELEN_MAX];
++
++	if (key == 0) {
++		/* actually 0 is not shift only... */
++		upper_case = 1;
++		return;
++	}
++
++	if (key == 0x0d || pos == SYSRQ_NAMELEN_MAX - 1) {
++		/* enter */
++		sysrq_key_table = sysrq_debug_key_table;
++		str[pos] = '\0';
++		pos = upper_case = 0;
++		printk("\n");
++		if (sysrq_input_return == NULL)
++			printk("No return handler!!!\n");
++		else
++			sysrq_input_return(str);
++		return;
++	};
++
++	/* check for allowed symbols */
++	if (key == '-') {
++		if (upper_case)
++			key = '_';
++		goto correct;
++	};
++	if (key >= 'a' && key <= 'z') {
++		if (upper_case)
++			key = key - 'a' + 'A';
++		goto correct;
++	};
++	if (key >= '0' && key <= '9')
++		goto correct;
++
++	upper_case = 0;
++	return;
++
++correct:
++	str[pos] = key;
++	printk("%c", (char)key);
++	pos++;
++	upper_case = 0;
++}
++
++static struct sysrq_key_op input_read = {
++	.handler = handle_read,
++	.help_msg = "",
++	.action_msg = NULL,
++};
++
++static struct sysrq_key_op *sysrq_input_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
++	[0 ... SYSRQ_KEY_TABLE_LENGTH - 1] = &input_read,
++};
++
++static void return_dump_mem(char *str)
++{
++	unsigned long address;
++	char *end;
++
++	address = simple_strtoul(str, &end, 0);
++	if (*end != '\0') {
++		printk("Bad address [%s]\n", str);
++		return;
++	}
++
++	dump_address = (unsigned long *)address;
++	dump_mem();
++}
++
++static void handle_dump_mem(int key, struct tty_struct *tty)
++{
++	sysrq_input_return = return_dump_mem;
++	sysrq_key_table = sysrq_input_key_table;
++}
++
++static struct sysrq_key_op debug_dump_mem = {
++	.handler = handle_dump_mem,
++	.help_msg = "Dump",
++	.action_msg = "Enter address:",
++};
++
++static void return_resolve(char *str)
++{
++	unsigned long address;
++
++	address = kallsyms_lookup_name(str);
++	printk("%s : %lx\n", str, address);
++	if (address) {
++		dump_address = (unsigned long *)address;
++		printk("Now you can dump it via X\n");
++	}
++}
++
++static void handle_resolve(int key, struct tty_struct *tty)
++{
++	sysrq_input_return = return_resolve;
++	sysrq_key_table = sysrq_input_key_table;
++}
++
++static struct sysrq_key_op debug_resolve = {
++	.handler = handle_resolve,
++	.help_msg = "Resolve",
++	.action_msg = "Enter symbol name:",
++};
++
++static void return_write_mem(char *str)
++{
++	unsigned long address;
++	unsigned long value;
++	char *end;
++
++	address = simple_strtoul(str, &end, 0);
++	if (*end != '-') {
++		printk("Bad address in %s\n", str);
++		return;
++	}
++	value = simple_strtoul(end + 1, &end, 0);
++	if (*end != '\0') {
++		printk("Bad value in %s\n", str);
++		return;
++	}
++
++	dump_address = (unsigned long *)address;
++	write_mem(value);
++}
++
++static void handle_write_mem(int key, struct tty_struct *tty)
++{
++	sysrq_input_return = return_write_mem;
++	sysrq_key_table = sysrq_input_key_table;
++}
++
++static struct sysrq_key_op debug_write_mem = {
++	.handler = handle_write_mem,
++	.help_msg = "Writemem",
++	.action_msg = "Enter address-value:",
++};
++
++static void handle_next(int key, struct tty_struct *tty)
++{
++	dump_mem();
++}
++
++static struct sysrq_key_op debug_next = {
++	.handler = handle_next,
++	.help_msg = "neXt",
++	.action_msg = "continuing",
++};
++
++static void handle_quit(int key, struct tty_struct *tty)
++{
++	sysrq_key_table = sysrq_default_key_table;
++	console_loglevel = orig_console_loglevel;
++}
++
++static struct sysrq_key_op debug_quit = {
++	.handler = handle_quit,
++	.help_msg = "Quit",
++	.action_msg = "Thank you for using the debugger",
++};
++
++static struct sysrq_key_op *sysrq_debug_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
++	[13] = &debug_dump_mem,		/* d */
++	[26] = &debug_quit,		/* q */
++	[27] = &debug_resolve,		/* r */
++	[32] = &debug_write_mem,	/* w */
++	[33] = &debug_next,		/* x */
++};
++
++static void sysrq_handle_debug(int key, struct tty_struct *tty)
++{
++	orig_console_loglevel = console_loglevel;
++	console_loglevel = 8;
++	sysrq_key_table = sysrq_debug_key_table;
++	printk("Welcome to sysrq debugging mode\n"
++			"Press H for help\n");
++}
++
++static struct sysrq_key_op sysrq_debug_op = {
++	.handler = sysrq_handle_debug,
++	.help_msg = "debuG",
++	.action_msg = "Select desired action",
++};
++#endif
++
++static struct sysrq_key_op *sysrq_default_key_table[SYSRQ_KEY_TABLE_LENGTH] = {
+ 	&sysrq_loglevel_op,		/* 0 */
+ 	&sysrq_loglevel_op,		/* 1 */
+ 	&sysrq_loglevel_op,		/* 2 */
+@@ -377,7 +647,11 @@ static struct sysrq_key_op *sysrq_key_table[36] = {
+ 	&sysrq_term_op,			/* e */
+ 	&sysrq_moom_op,			/* f */
+ 	/* g: May be registered by ppc for kgdb */
++#ifdef CONFIG_SYSRQ_DEBUG
++	&sysrq_debug_op,
/* g */ ++#else + NULL, /* g */ ++#endif + NULL, /* h */ + &sysrq_kill_op, /* i */ + NULL, /* j */ +@@ -404,9 +678,12 @@ static struct sysrq_key_op *sysrq_key_table[36] = { + NULL, /* x */ + /* y: May be registered on sparc64 for global register dump */ + NULL, /* y */ +- NULL /* z */ ++ NULL, /* z */ ++ NULL, /* for debugger */ + }; + ++static struct sysrq_key_op **sysrq_key_table = sysrq_default_key_table; ++ + /* key2index calculation, -1 on invalid index */ + static int sysrq_key_table_key2index(int key) + { +@@ -416,6 +693,10 @@ static int sysrq_key_table_key2index(int key) + retval = key - '0'; + else if ((key >= 'a') && (key <= 'z')) + retval = key + 10 - 'a'; ++#ifdef CONFIG_SYSRQ_DEBUG ++ else if (key == 0 || key == 0x0d || key == '-') ++ retval = SYSRQ_KEY_TABLE_LENGTH - 1; ++#endif + else + retval = -1; + return retval; +@@ -457,7 +738,6 @@ void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) + spin_lock_irqsave(&sysrq_key_table_lock, flags); + orig_log_level = console_loglevel; + console_loglevel = 7; +- printk(KERN_INFO "SysRq : "); + + op_p = __sysrq_get_key_op(key); + if (op_p) { +@@ -466,16 +746,17 @@ void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) + * should not) and is the invoked operation enabled? + */ + if (!check_mask || sysrq_on_mask(op_p->enable_mask)) { +- printk("%s\n", op_p->action_msg); ++ if (op_p->action_msg) ++ printk("%s\n", op_p->action_msg); + console_loglevel = orig_log_level; + op_p->handler(key, tty); + } else { + printk("This sysrq operation is disabled.\n"); + } + } else { +- printk("HELP : "); ++ printk("SysRq HELP : "); + /* Only print the help msg once per handler */ +- for (i = 0; i < ARRAY_SIZE(sysrq_key_table); i++) { ++ for (i = 0; i < SYSRQ_KEY_TABLE_LENGTH; i++) { + if (sysrq_key_table[i]) { + int j; + +diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c +index 7501310..fb3a725 100644 +--- a/drivers/char/tty_io.c ++++ b/drivers/char/tty_io.c +@@ -95,6 +95,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -105,6 +107,7 @@ + + #include + #include ++#include + + #undef TTY_DEBUG_HANGUP + +@@ -129,6 +132,7 @@ EXPORT_SYMBOL(tty_std_termios); + into this file */ + + LIST_HEAD(tty_drivers); /* linked list of tty drivers */ ++EXPORT_SYMBOL(tty_drivers); + + /* Mutex to protect creating and releasing a tty. 
This is shared with
+    vt.c for deeply disgusting hack reasons */
+@@ -136,7 +140,11 @@ DEFINE_MUTEX(tty_mutex);
+ EXPORT_SYMBOL(tty_mutex);
+ 
+ #ifdef CONFIG_UNIX98_PTYS
++#ifdef CONFIG_VE
++#define ptm_driver	(get_exec_env()->ptm_driver)
++#else
+ extern struct tty_driver *ptm_driver;	/* Unix98 pty masters; for /dev/ptmx */
++#endif
+ static int ptmx_open(struct inode *, struct file *);
+ #endif
+ 
+@@ -172,7 +180,7 @@ static void proc_set_tty(struct task_struct *tsk, struct tty_struct *tty);
+ 
+ static struct tty_struct *alloc_tty_struct(void)
+ {
+-	return kzalloc(sizeof(struct tty_struct), GFP_KERNEL);
++	return kzalloc(sizeof(struct tty_struct), GFP_KERNEL_UBC);
+ }
+ 
+ static void tty_buffer_free_all(struct tty_struct *);
+@@ -1146,9 +1154,29 @@ static struct tty_driver *get_tty_driver(dev_t device, int *index)
+ 		if (device < base || device >= base + p->num)
+ 			continue;
+ 		*index = device - base;
+-		return p;
++#ifdef CONFIG_VE
++		if (in_interrupt())
++			goto found;
++		if (p->major!=PTY_MASTER_MAJOR && p->major!=PTY_SLAVE_MAJOR
++#ifdef CONFIG_UNIX98_PTYS
++		    && (p->major<UNIX98_PTY_MASTER_MAJOR ||
++			p->major>UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) &&
++		       (p->major<UNIX98_PTY_SLAVE_MAJOR ||
++			p->major>UNIX98_PTY_SLAVE_MAJOR+UNIX98_PTY_MAJOR_COUNT-1)
++#endif
++		    )
++			goto found;
++		if (ve_is_super(p->owner_env) && ve_is_super(get_exec_env()))
++			goto found;
++		if (!ve_accessible_strict(p->owner_env, get_exec_env()))
++			continue;
++#endif
++		goto found;
+ 	}
+ 	return NULL;
++
++found:
++	return p;
+ }
+ 
+ #ifdef CONFIG_CONSOLE_POLL
+@@ -2070,13 +2098,21 @@ static void tty_line_name(struct tty_driver *driver, int index, char *p)
+  */
+ 
+ static int init_dev(struct tty_driver *driver, int idx,
+-	struct tty_struct **ret_tty)
++	struct tty_struct *i_tty, struct tty_struct **ret_tty)
+ {
+ 	struct tty_struct *tty, *o_tty;
+ 	struct ktermios *tp, **tp_loc, *o_tp, **o_tp_loc;
+ 	struct ktermios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc;
++	struct ve_struct * owner;
+ 	int retval = 0;
+ 
++	owner = driver->owner_env;
++
++	if (i_tty) {
++		tty = i_tty;
++		goto fast_track;
++	}
++
+ 	/* check whether we're reopening an existing tty */
+ 	if (driver->flags & TTY_DRIVER_DEVPTS_MEM) {
+ 		tty = devpts_get_tty(idx);
+@@ -2126,6 +2162,7 @@ static int init_dev(struct tty_driver *driver, int idx,
+ 	tty->ops = driver->ops;
+ 	tty->index = idx;
+ 	tty_line_name(driver, idx, tty->name);
++	tty->owner_env = owner;
+ 
+ 	if (driver->flags & TTY_DRIVER_DEVPTS_MEM) {
+ 		tp_loc = &tty->termios;
+@@ -2136,14 +2173,14 @@ static int init_dev(struct tty_driver *driver, int idx,
+ 	}
+ 
+ 	if (!*tp_loc) {
+-		tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL);
++		tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL_UBC);
+ 		if (!tp)
+ 			goto free_mem_out;
+ 		*tp = driver->init_termios;
+ 	}
+ 
+ 	if (!*ltp_loc) {
+-		ltp = kzalloc(sizeof(struct ktermios), GFP_KERNEL);
++		ltp = kzalloc(sizeof(struct ktermios), GFP_KERNEL_UBC);
+ 		if (!ltp)
+ 			goto free_mem_out;
+ 	}
+@@ -2157,6 +2194,7 @@ static int init_dev(struct tty_driver *driver, int idx,
+ 	o_tty->ops = driver->ops;
+ 	o_tty->index = idx;
+ 	tty_line_name(driver->other, idx, o_tty->name);
++	o_tty->owner_env = owner;
+ 
+ 	if (driver->flags & TTY_DRIVER_DEVPTS_MEM) {
+ 		o_tp_loc = &o_tty->termios;
+@@ -2167,14 +2205,14 @@ static int init_dev(struct tty_driver *driver, int idx,
+ 	}
+ 
+ 	if (!*o_tp_loc) {
+-		o_tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL);
++		o_tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL_UBC);
+ 		if (!o_tp)
+ 			goto free_mem_out;
+ 		*o_tp = driver->other->init_termios;
+ 	}
+ 
+ 	if (!*o_ltp_loc) {
+-		o_ltp = kzalloc(sizeof(struct ktermios),
GFP_KERNEL); ++ o_ltp = kzalloc(sizeof(struct ktermios), GFP_KERNEL_UBC); + if (!o_ltp) + goto free_mem_out; + } +@@ -2190,6 +2228,10 @@ static int init_dev(struct tty_driver *driver, int idx, + *o_ltp_loc = o_ltp; + o_tty->termios = *o_tp_loc; + o_tty->termios_locked = *o_ltp_loc; ++#ifdef CONFIG_VE ++ if (driver->other->refcount == 0) ++ (void)get_ve(owner); ++#endif + driver->other->refcount++; + if (driver->subtype == PTY_TYPE_MASTER) + o_tty->count++; +@@ -2213,6 +2255,10 @@ static int init_dev(struct tty_driver *driver, int idx, + *ltp_loc = ltp; + tty->termios = *tp_loc; + tty->termios_locked = *ltp_loc; ++#ifdef CONFIG_VE ++ if (driver->refcount == 0) ++ (void)get_ve(owner); ++#endif + /* Compatibility until drivers always set this */ + tty->termios->c_ispeed = tty_termios_input_baud_rate(tty->termios); + tty->termios->c_ospeed = tty_termios_baud_rate(tty->termios); +@@ -2337,7 +2383,8 @@ static void release_one_tty(struct tty_struct *tty, int idx) + + tty->magic = 0; + tty->driver->refcount--; +- ++ if (tty->driver->refcount == 0) ++ put_ve(tty->owner_env); + file_list_lock(); + list_del_init(&tty->tty_files); + file_list_unlock(); +@@ -2667,7 +2714,7 @@ static void release_dev(struct file *filp) + + static int tty_open(struct inode *inode, struct file *filp) + { +- struct tty_struct *tty; ++ struct tty_struct *tty, *c_tty; + int noctty, retval; + struct tty_driver *driver; + int index; +@@ -2680,6 +2727,7 @@ retry_open: + noctty = filp->f_flags & O_NOCTTY; + index = -1; + retval = 0; ++ c_tty = NULL; + + mutex_lock(&tty_mutex); + +@@ -2691,6 +2739,7 @@ retry_open: + } + driver = tty->driver; + index = tty->index; ++ c_tty = tty; + filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ + /* noctty = 1; */ + goto got_driver; +@@ -2698,6 +2747,12 @@ retry_open: + #ifdef CONFIG_VT + if (device == MKDEV(TTY_MAJOR, 0)) { + extern struct tty_driver *console_driver; ++#ifdef CONFIG_VE ++ if (!ve_is_super(get_exec_env())) { ++ mutex_unlock(&tty_mutex); ++ return -ENODEV; ++ } ++#endif + driver = console_driver; + index = fg_console; + noctty = 1; +@@ -2705,6 +2760,12 @@ retry_open: + } + #endif + if (device == MKDEV(TTYAUX_MAJOR, 1)) { ++#ifdef CONFIG_VE ++ if (!ve_is_super(get_exec_env())) { ++ mutex_unlock(&tty_mutex); ++ return -ENODEV; ++ } ++#endif + driver = console_device(&index); + if (driver) { + /* Don't let /dev/console block */ +@@ -2722,7 +2783,7 @@ retry_open: + return -ENODEV; + } + got_driver: +- retval = init_dev(driver, index, &tty); ++ retval = init_dev(driver, index, c_tty, &tty); + mutex_unlock(&tty_mutex); + if (retval) + return retval; +@@ -2806,7 +2867,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) + return index; + + mutex_lock(&tty_mutex); +- retval = init_dev(ptm_driver, index, &tty); ++ retval = init_dev(ptm_driver, index, NULL, &tty); + mutex_unlock(&tty_mutex); + + if (retval) +@@ -3049,6 +3110,8 @@ static int tioccons(struct file *file) + { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; ++ if (!ve_is_super(get_exec_env())) ++ return -EACCES; + if (file->f_op->write == redirected_tty_write) { + struct file *f; + spin_lock(&redirect_lock); +@@ -3639,7 +3702,7 @@ void __do_SAK(struct tty_struct *tty) + /* Now kill any processes that happen to have the + * tty open. 
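+ 	 * The _all iterators below walk every task on the node, including
+ 	 * tasks inside containers, which the VE-local iterators would skip.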
+  */
+-	do_each_thread(g, p) {
++	do_each_thread_all(g, p) {
+ 		if (p->signal->tty == tty) {
+ 			printk(KERN_NOTICE "SAK: killed process %d"
+ 			    " (%s): task_session_nr(p)==tty->session\n",
+@@ -3671,7 +3734,7 @@
+ 			spin_unlock(&p->files->file_lock);
+ 		}
+ 		task_unlock(p);
+-	} while_each_thread(g, p);
++	} while_each_thread_all(g, p);
+ 	read_unlock(&tasklist_lock);
+ #endif
+ }
+@@ -4005,6 +4068,7 @@ int tty_register_driver(struct tty_driver *driver)
+ 	}
+ 
+ 	mutex_lock(&tty_mutex);
++	driver->owner_env = get_exec_env();
+ 	list_add(&driver->tty_drivers, &tty_drivers);
+ 	mutex_unlock(&tty_mutex);
+ 
+@@ -4202,3 +4266,43 @@ static int __init tty_init(void)
+ 	return 0;
+ }
+ module_init(tty_init);
++
++#ifdef CONFIG_UNIX98_PTYS
++int init_ve_tty_class(void)
++{
++	struct class * ve_tty_class;
++	struct device * ve_ptmx_dev_class;
++
++	ve_tty_class = class_create(THIS_MODULE, "tty");
++	if (IS_ERR(ve_tty_class))
++		return -ENOMEM;
++
++	ve_ptmx_dev_class = device_create(ve_tty_class, NULL,
++				MKDEV(TTYAUX_MAJOR, 2), "ptmx");
++	if (IS_ERR(ve_ptmx_dev_class)) {
++		class_destroy(ve_tty_class);
++		return PTR_ERR(ve_ptmx_dev_class);
++	}
++
++	get_exec_env()->tty_class = ve_tty_class;
++	return 0;
++}
++
++void fini_ve_tty_class(void)
++{
++	struct class *ve_tty_class = get_exec_env()->tty_class;
++
++	device_destroy(ve_tty_class, MKDEV(TTYAUX_MAJOR, 2));
++	class_destroy(ve_tty_class);
++}
++#else
++int init_ve_tty_class(void)
++{
++	return 0;
++}
++void fini_ve_tty_class(void)
++{
++}
++#endif
++EXPORT_SYMBOL(init_ve_tty_class);
++EXPORT_SYMBOL(fini_ve_tty_class);
+diff --git a/drivers/net/Makefile b/drivers/net/Makefile
+index dcbfe84..097d877 100644
+--- a/drivers/net/Makefile
++++ b/drivers/net/Makefile
+@@ -27,6 +27,10 @@ gianfar_driver-objs := gianfar.o \
+ obj-$(CONFIG_UCC_GETH) += ucc_geth_driver.o
+ ucc_geth_driver-objs := ucc_geth.o ucc_geth_mii.o ucc_geth_ethtool.o
+ 
++obj-$(CONFIG_VE_NETDEV) += vznetdev.o
++vznetdev-objs := open_vznet.o venet_core.o
++obj-$(CONFIG_VE_ETHDEV) += vzethdev.o
++
+ #
+ # link order important here
+ #
+diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
+index 41b774b..78395c0 100644
+--- a/drivers/net/loopback.c
++++ b/drivers/net/loopback.c
+@@ -134,6 +134,12 @@ static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
+ {
+ 	struct pcpu_lstats *pcpu_lstats, *lb_stats;
+ 
++#ifdef CONFIG_VE
++	if (unlikely(get_exec_env()->disable_net)) {
++		kfree_skb(skb);
++		return 0;
++	}
++#endif
+ 	skb_orphan(skb);
+ 
+ 	skb->protocol = eth_type_trans(skb,dev);
+@@ -240,7 +246,8 @@ static void loopback_setup(struct net_device *dev)
+ 		| NETIF_F_NO_CSUM
+ 		| NETIF_F_HIGHDMA
+ 		| NETIF_F_LLTX
+-		| NETIF_F_NETNS_LOCAL;
++		| NETIF_F_NETNS_LOCAL
++		| NETIF_F_VIRTUAL;
+ 	dev->ethtool_ops = &loopback_ethtool_ops;
+ 	dev->header_ops = &eth_header_ops;
+ 	dev->init = loopback_dev_init;
+diff --git a/drivers/net/open_vznet.c b/drivers/net/open_vznet.c
+new file mode 100644
+index 0000000..79bf640
+--- /dev/null
++++ b/drivers/net/open_vznet.c
+@@ -0,0 +1,244 @@
++/*
++ *  open_vznet.c
++ *
++ *  Copyright (C) 2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ * ++ */ ++ ++/* ++ * Virtual Networking device used to change VE ownership on packets ++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++void veip_stop(struct ve_struct *ve) ++{ ++ struct list_head *p, *tmp; ++ ++ write_lock_irq(&veip_hash_lock); ++ if (ve->veip == NULL) ++ goto unlock; ++ list_for_each_safe(p, tmp, &ve->veip->ip_lh) { ++ struct ip_entry_struct *ptr; ++ ptr = list_entry(p, struct ip_entry_struct, ve_list); ++ ptr->active_env = NULL; ++ list_del(&ptr->ve_list); ++ list_del(&ptr->ip_hash); ++ kfree(ptr); ++ } ++ veip_put(ve->veip); ++ ve->veip = NULL; ++ if (!ve_is_super(ve)) ++ module_put(THIS_MODULE); ++unlock: ++ write_unlock_irq(&veip_hash_lock); ++} ++ ++int veip_start(struct ve_struct *ve) ++{ ++ int err, get; ++ ++ err = 0; ++ write_lock_irq(&veip_hash_lock); ++ get = ve->veip == NULL; ++ ve->veip = veip_findcreate(ve->veid); ++ if (ve->veip == NULL) ++ err = -ENOMEM; ++ write_unlock_irq(&veip_hash_lock); ++ if (err == 0 && get && !ve_is_super(ve)) ++ __module_get(THIS_MODULE); ++ return err; ++} ++ ++int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr) ++{ ++ struct ip_entry_struct *entry, *found; ++ int err; ++ ++ entry = kzalloc(sizeof(struct ip_entry_struct), GFP_KERNEL); ++ if (entry == NULL) ++ return -ENOMEM; ++ ++ if (ve->veip == NULL) { ++ /* This can happen if we load venet AFTER ve was started */ ++ err = veip_start(ve); ++ if (err < 0) ++ goto out; ++ } ++ ++ write_lock_irq(&veip_hash_lock); ++ err = -EADDRINUSE; ++ found = venet_entry_lookup(addr); ++ if (found != NULL) ++ goto out_unlock; ++ ++ entry->active_env = ve; ++ entry->addr = *addr; ++ ip_entry_hash(entry, ve->veip); ++ ++ err = 0; ++ entry = NULL; ++out_unlock: ++ write_unlock_irq(&veip_hash_lock); ++out: ++ if (entry != NULL) ++ kfree(entry); ++ return err; ++} ++ ++int veip_entry_del(envid_t veid, struct ve_addr_struct *addr) ++{ ++ struct ip_entry_struct *found; ++ int err; ++ ++ err = -EADDRNOTAVAIL; ++ write_lock_irq(&veip_hash_lock); ++ found = venet_entry_lookup(addr); ++ if (found == NULL) ++ goto out; ++ if (found->active_env->veid != veid) ++ goto out; ++ ++ err = 0; ++ found->active_env = NULL; ++ ++ list_del(&found->ip_hash); ++ list_del(&found->ve_list); ++ kfree(found); ++out: ++ write_unlock_irq(&veip_hash_lock); ++ return err; ++} ++ ++static int skb_extract_addr(struct sk_buff *skb, ++ struct ve_addr_struct *addr, int dir) ++{ ++ switch (skb->protocol) { ++ case __constant_htons(ETH_P_IP): ++ addr->family = AF_INET; ++ addr->key[0] = 0; ++ addr->key[1] = 0; ++ addr->key[2] = 0; ++ addr->key[3] = (dir ? ip_hdr(skb)->daddr : ip_hdr(skb)->saddr); ++ return 0; ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ case __constant_htons(ETH_P_IPV6): ++ addr->family = AF_INET6; ++ memcpy(&addr->key, dir ? 
++ ipv6_hdr(skb)->daddr.s6_addr32 : ++ ipv6_hdr(skb)->saddr.s6_addr32, ++ sizeof(addr->key)); ++ return 0; ++#endif ++ } ++ ++ return -EAFNOSUPPORT; ++} ++ ++static struct ve_struct *venet_find_ve(struct sk_buff *skb, int dir) ++{ ++ struct ip_entry_struct *entry; ++ struct ve_addr_struct addr; ++ ++ if (skb_extract_addr(skb, &addr, dir) < 0) ++ return NULL; ++ ++ entry = venet_entry_lookup(&addr); ++ if (entry == NULL) ++ return NULL; ++ ++ return entry->active_env; ++} ++ ++int venet_change_skb_owner(struct sk_buff *skb) ++{ ++ struct ve_struct *ve, *ve_old; ++ ++ ve_old = skb->owner_env; ++ ++ read_lock(&veip_hash_lock); ++ if (!ve_is_super(ve_old)) { ++ /* from VE to host */ ++ ve = venet_find_ve(skb, 0); ++ if (ve == NULL) ++ goto out_drop; ++ if (!ve_accessible_strict(ve, ve_old)) ++ goto out_source; ++ skb->owner_env = get_ve0(); ++ } else { ++ /* from host to VE */ ++ ve = venet_find_ve(skb, 1); ++ if (ve == NULL) ++ goto out_drop; ++ skb->owner_env = ve; ++ } ++ read_unlock(&veip_hash_lock); ++ ++ return 0; ++ ++out_drop: ++ read_unlock(&veip_hash_lock); ++ return -ESRCH; ++ ++out_source: ++ read_unlock(&veip_hash_lock); ++ if (net_ratelimit() && skb->protocol == __constant_htons(ETH_P_IP)) { ++ printk(KERN_WARNING "Dropped packet, source wrong " ++ "veid=%u src-IP=%u.%u.%u.%u " ++ "dst-IP=%u.%u.%u.%u\n", ++ skb->owner_env->veid, ++ NIPQUAD(ip_hdr(skb)->saddr), ++ NIPQUAD(ip_hdr(skb)->daddr)); ++ } ++ return -EACCES; ++} ++ ++#ifdef CONFIG_PROC_FS ++int veip_seq_show(struct seq_file *m, void *v) ++{ ++ struct list_head *p; ++ struct ip_entry_struct *entry; ++ char s[40]; ++ ++ p = (struct list_head *)v; ++ if (p == ip_entry_hash_table) { ++ seq_puts(m, "Version: 2.5\n"); ++ return 0; ++ } ++ entry = list_entry(p, struct ip_entry_struct, ip_hash); ++ veaddr_print(s, sizeof(s), &entry->addr); ++ seq_printf(m, "%39s %10u\n", s, 0); ++ return 0; ++} ++#endif ++ ++__exit void veip_cleanup(void) ++{ ++ int i; ++ ++ write_lock_irq(&veip_hash_lock); ++ for (i = 0; i < VEIP_HASH_SZ; i++) ++ while (!list_empty(ip_entry_hash_table + i)) { ++ struct ip_entry_struct *entry; ++ ++ entry = list_first_entry(ip_entry_hash_table + i, ++ struct ip_entry_struct, ip_hash); ++ list_del(&entry->ip_hash); ++ kfree(entry); ++ } ++ write_unlock_irq(&veip_hash_lock); ++} ++ ++MODULE_AUTHOR("SWsoft "); ++MODULE_DESCRIPTION("Virtuozzo Virtual Network Device"); ++MODULE_LICENSE("GPL v2"); +diff --git a/drivers/net/tun.c b/drivers/net/tun.c +index b9018bf..d9a5222 100644 +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -82,45 +82,20 @@ static int debug; + #define DBG1( a... ) + #endif + +-struct tun_struct { +- struct list_head list; +- unsigned long flags; +- int attached; +- uid_t owner; +- gid_t group; +- +- wait_queue_head_t read_wait; +- struct sk_buff_head readq; +- +- struct net_device *dev; +- +- struct fasync_struct *fasync; +- +- unsigned long if_flags; +- u8 dev_addr[ETH_ALEN]; +- u32 chr_filter[2]; +- u32 net_filter[2]; +- +-#ifdef TUN_DEBUG +- int debug; +-#endif +-}; +- + /* Network device part of the driver */ + +-static unsigned int tun_net_id; +-struct tun_net { +- struct list_head dev_list; +-}; ++unsigned int tun_net_id; ++EXPORT_SYMBOL(tun_net_id); + + static const struct ethtool_ops tun_ethtool_ops; + + /* Net device open. */ +-static int tun_net_open(struct net_device *dev) ++int tun_net_open(struct net_device *dev) + { + netif_start_queue(dev); + return 0; + } ++EXPORT_SYMBOL(tun_net_open); + + /* Net device close. 
*/ + static int tun_net_close(struct net_device *dev) +@@ -223,7 +198,7 @@ tun_net_change_mtu(struct net_device *dev, int new_mtu) + } + + /* Initialize net device. */ +-static void tun_net_init(struct net_device *dev) ++void tun_net_init(struct net_device *dev) + { + struct tun_struct *tun = netdev_priv(dev); + +@@ -255,6 +230,7 @@ static void tun_net_init(struct net_device *dev) + break; + } + } ++EXPORT_SYMBOL(tun_net_init); + + /* Character device part */ + +@@ -477,7 +453,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, + return ret; + } + +-static void tun_setup(struct net_device *dev) ++void tun_setup(struct net_device *dev) + { + struct tun_struct *tun = netdev_priv(dev); + +@@ -494,6 +470,7 @@ static void tun_setup(struct net_device *dev) + dev->destructor = free_netdev; + dev->features |= NETIF_F_NETNS_LOCAL; + } ++EXPORT_SYMBOL(tun_setup); + + static struct tun_struct *tun_get_by_name(struct tun_net *tn, const char *name) + { +@@ -526,7 +503,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) + current->euid != tun->owner) || + (tun->group != -1 && + current->egid != tun->group)) && +- !capable(CAP_NET_ADMIN)) ++ !capable(CAP_NET_ADMIN) && ++ !capable(CAP_VE_NET_ADMIN)) + return -EPERM; + } + else if (__dev_get_by_name(net, ifr->ifr_name)) +@@ -601,6 +579,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) + file->private_data = tun; + tun->attached = 1; + get_net(dev_net(tun->dev)); ++ tun->bind_file = file; + + /* Make sure persistent devices do not get stuck in + * xoff state. +@@ -816,12 +795,13 @@ static int tun_chr_fasync(int fd, struct file *file, int on) + return 0; + } + +-static int tun_chr_open(struct inode *inode, struct file * file) ++int tun_chr_open(struct inode *inode, struct file * file) + { + DBG1(KERN_INFO "tunX: tun_chr_open\n"); + file->private_data = NULL; + return 0; + } ++EXPORT_SYMBOL(tun_chr_open); + + static int tun_chr_close(struct inode *inode, struct file *file) + { +diff --git a/drivers/net/venet_core.c b/drivers/net/venet_core.c +new file mode 100644 +index 0000000..6b21630 +--- /dev/null ++++ b/drivers/net/venet_core.c +@@ -0,0 +1,768 @@ ++/* ++ * venet_core.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++/* ++ * Common part for Virtuozzo virtual network devices ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include /* For the statistics structure. 
*/ ++#include /* For ARPHRD_ETHER */ ++#include ++#include ++#include ++#include ++#include ++ ++struct list_head ip_entry_hash_table[VEIP_HASH_SZ]; ++rwlock_t veip_hash_lock = RW_LOCK_UNLOCKED; ++LIST_HEAD(veip_lh); ++ ++#define ip_entry_hash_function(ip) (ntohl(ip) & (VEIP_HASH_SZ - 1)) ++ ++void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip) ++{ ++ list_add(&entry->ip_hash, ++ ip_entry_hash_table + ++ ip_entry_hash_function(entry->addr.key[3])); ++ list_add(&entry->ve_list, &veip->ip_lh); ++} ++ ++void veip_put(struct veip_struct *veip) ++{ ++ if (!list_empty(&veip->ip_lh)) ++ return; ++ if (!list_empty(&veip->src_lh)) ++ return; ++ if (!list_empty(&veip->dst_lh)) ++ return; ++ ++ list_del(&veip->list); ++ kfree(veip); ++} ++ ++struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *addr) ++{ ++ struct ip_entry_struct *entry; ++ ++ list_for_each_entry (entry, ip_entry_hash_table + ++ ip_entry_hash_function(addr->key[3]), ip_hash) ++ if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0) ++ return entry; ++ return NULL; ++} ++ ++struct veip_struct *veip_find(envid_t veid) ++{ ++ struct veip_struct *ptr; ++ ++ list_for_each_entry(ptr, &veip_lh, list) { ++ if (ptr->veid != veid) ++ continue; ++ return ptr; ++ } ++ return NULL; ++} ++ ++struct veip_struct *veip_findcreate(envid_t veid) ++{ ++ struct veip_struct *ptr; ++ ++ ptr = veip_find(veid); ++ if (ptr != NULL) ++ return ptr; ++ ++ ptr = kmalloc(sizeof(struct veip_struct), GFP_ATOMIC); ++ if (ptr == NULL) ++ return NULL; ++ memset(ptr, 0, sizeof(struct veip_struct)); ++ INIT_LIST_HEAD(&ptr->ip_lh); ++ INIT_LIST_HEAD(&ptr->src_lh); ++ INIT_LIST_HEAD(&ptr->dst_lh); ++ ptr->veid = veid; ++ list_add(&ptr->list, &veip_lh); ++ return ptr; ++} ++ ++static int convert_sockaddr(struct sockaddr *addr, int addrlen, ++ struct ve_addr_struct *veaddr) ++{ ++ int err; ++ ++ switch (addr->sa_family) { ++ case AF_INET: { ++ struct sockaddr_in *sin; ++ ++ err = -EINVAL; ++ if (addrlen != sizeof(struct sockaddr_in)) ++ break; ++ ++ err = 0; ++ sin = (struct sockaddr_in *)addr; ++ veaddr->family = AF_INET; ++ veaddr->key[0] = 0; ++ veaddr->key[1] = 0; ++ veaddr->key[2] = 0; ++ veaddr->key[3] = sin->sin_addr.s_addr; ++ break; ++ } ++ case AF_INET6: { ++ struct sockaddr_in6 *sin; ++ ++ err = -EINVAL; ++ if (addrlen != sizeof(struct sockaddr_in6)) ++ break; ++ ++ err = 0; ++ sin = (struct sockaddr_in6 *)addr; ++ veaddr->family = AF_INET6; ++ memcpy(veaddr->key, &sin->sin6_addr, sizeof(veaddr->key)); ++ break; ++ } ++ default: ++ err = -EAFNOSUPPORT; ++ } ++ return err; ++} ++ ++int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen, ++ struct ve_addr_struct *veaddr) ++{ ++ int err; ++ char addr[MAX_SOCK_ADDR]; ++ ++ err = move_addr_to_kernel(uaddr, addrlen, &addr); ++ if (err < 0) ++ goto out; ++ ++ err = convert_sockaddr((struct sockaddr *)&addr, addrlen, veaddr); ++out: ++ return err; ++} ++ ++void veaddr_print(char *str, int len, struct ve_addr_struct *a) ++{ ++ if (a->family == AF_INET) ++ snprintf(str, len, "%u.%u.%u.%u", NIPQUAD(a->key[3])); ++ else ++ snprintf(str, len, "%x:%x:%x:%x:%x:%x:%x:%x", ++ ntohl(a->key[0])>>16, ntohl(a->key[0])&0xFFFF, ++ ntohl(a->key[1])>>16, ntohl(a->key[1])&0xFFFF, ++ ntohl(a->key[2])>>16, ntohl(a->key[2])&0xFFFF, ++ ntohl(a->key[3])>>16, ntohl(a->key[3])&0xFFFF ++ ); ++} ++ ++/* ++ * Device functions ++ */ ++ ++static int venet_open(struct net_device *dev) ++{ ++ if (!ve_is_super(get_exec_env()) && !try_module_get(THIS_MODULE)) ++ return -EBUSY; ++ return 0; ++} ++ ++static int 
venet_close(struct net_device *master) ++{ ++ if (!ve_is_super(get_exec_env())) ++ module_put(THIS_MODULE); ++ return 0; ++} ++ ++static void venet_destructor(struct net_device *dev) ++{ ++ struct venet_stats *stats = (struct venet_stats *)dev->priv; ++ if (stats == NULL) ++ return; ++ free_percpu(stats->real_stats); ++ kfree(stats); ++ dev->priv = NULL; ++} ++ ++/* ++ * The higher levels take care of making this non-reentrant (it's ++ * called with bh's disabled). ++ */ ++static int venet_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct net_device_stats *stats; ++ struct net_device *rcv = NULL; ++ int length; ++ ++ stats = venet_stats(dev, smp_processor_id()); ++ if (unlikely(get_exec_env()->disable_net)) ++ goto outf; ++ ++ if (skb->protocol == __constant_htons(ETH_P_IP)) { ++ struct iphdr *iph; ++ iph = ip_hdr(skb); ++ if (ipv4_is_multicast(iph->daddr)) ++ goto outf; ++ } else if (skb->protocol == __constant_htons(ETH_P_IPV6)) { ++ struct ipv6hdr *ip6h; ++ ip6h = ipv6_hdr(skb); ++ if (ipv6_addr_is_multicast(&ip6h->daddr)) ++ goto outf; ++ skb_orphan(skb); ++ } else { ++ goto outf; ++ } ++ ++ if (venet_change_skb_owner(skb) < 0) ++ goto outf; ++ ++ if (unlikely(skb->owner_env->disable_net)) ++ goto outf; ++ ++ rcv = skb->owner_env->_venet_dev; ++ if (!rcv) ++ /* VE going down */ ++ goto outf; ++ ++ dev_hold(rcv); ++ ++ if (!(rcv->flags & IFF_UP)) { ++ /* Target VE does not want to receive packets */ ++ dev_put(rcv); ++ goto outf; ++ } ++ ++ skb->pkt_type = PACKET_HOST; ++ skb->dev = rcv; ++ ++ skb_reset_mac_header(skb); ++ memset(skb->data - dev->hard_header_len, 0, dev->hard_header_len); ++ ++ dst_release(skb->dst); ++ skb->dst = NULL; ++#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) ++ nf_conntrack_put(skb->nfct); ++ skb->nfct = NULL; ++#endif ++ length = skb->len; ++ ++ netif_rx(skb); ++ ++ stats->tx_bytes += length; ++ stats->tx_packets++; ++ if (rcv) { ++ struct net_device_stats *rcv_stats; ++ ++ rcv_stats = venet_stats(rcv, smp_processor_id()); ++ rcv_stats->rx_bytes += length; ++ rcv_stats->rx_packets++; ++ dev_put(rcv); ++ } ++ ++ return 0; ++ ++outf: ++ kfree_skb(skb); ++ ++stats->tx_dropped; ++ return 0; ++} ++ ++static struct net_device_stats *get_stats(struct net_device *dev) ++{ ++ int i; ++ struct venet_stats *stats; ++ ++ stats = (struct venet_stats *)dev->priv; ++ memset(&stats->stats, 0, sizeof(struct net_device_stats)); ++ for (i=0; i < NR_CPUS; i++) { ++ struct net_device_stats *dev_stats; ++ ++ if (!cpu_possible(i)) ++ continue; ++ dev_stats = venet_stats(dev, i); ++ stats->stats.rx_bytes += dev_stats->rx_bytes; ++ stats->stats.tx_bytes += dev_stats->tx_bytes; ++ stats->stats.rx_packets += dev_stats->rx_packets; ++ stats->stats.tx_packets += dev_stats->tx_packets; ++ } ++ ++ return &stats->stats; ++} ++ ++/* Initialize the rest of the LOOPBACK device. */ ++int venet_init_dev(struct net_device *dev) ++{ ++ struct venet_stats *stats; ++ ++ dev->hard_start_xmit = venet_xmit; ++ stats = kzalloc(sizeof(struct venet_stats), GFP_KERNEL); ++ if (stats == NULL) ++ goto fail; ++ stats->real_stats = alloc_percpu(struct net_device_stats); ++ if (stats->real_stats == NULL) ++ goto fail_free; ++ dev->priv = stats; ++ ++ dev->get_stats = get_stats; ++ dev->open = venet_open; ++ dev->stop = venet_close; ++ dev->destructor = venet_destructor; ++ ++ /* ++ * Fill in the generic fields of the device structure. 
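++	 * venet is a pure IP device: ARPHRD_VOID plus the IFF_NOARP and
++	 * IFF_POINTOPOINT flags below leave it with no link-layer addressing.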
++	 */
++	dev->type = ARPHRD_VOID;
++	dev->hard_header_len = ETH_HLEN;
++	dev->mtu = 1500; /* eth_mtu */
++	dev->tx_queue_len = 0;
++
++	memset(dev->broadcast, 0xFF, ETH_ALEN);
++
++	/* New-style flags. */
++	dev->flags = IFF_BROADCAST|IFF_NOARP|IFF_POINTOPOINT;
++	return 0;
++
++fail_free:
++	kfree(stats);
++fail:
++	return -ENOMEM;
++}
++
++static int
++venet_set_op(struct net_device *dev, u32 data,
++		int (*fop)(struct net_device *, u32))
++{
++
++	struct ve_struct *ve;
++	int ret = 0;
++
++	read_lock(&ve_list_lock);
++	for_each_ve(ve) {
++		struct ve_struct *ve_old;
++
++		ve_old = set_exec_env(ve);
++		read_lock(&dev_base_lock);
++		for_each_netdev(ve->ve_netns, dev) {
++			if (dev->hard_start_xmit == venet_xmit)
++				ret = fop(dev, data);
++		}
++		read_unlock(&dev_base_lock);
++		set_exec_env(ve_old);
++
++		if (ret < 0)
++			break;
++	}
++	read_unlock(&ve_list_lock);
++	return ret;
++}
++
++static unsigned long common_features;
++
++static int venet_op_set_sg(struct net_device *dev, u32 data)
++{
++	if (!ve_is_super(get_exec_env()))
++		return -EPERM;
++
++	if (data)
++		common_features |= NETIF_F_SG;
++	else
++		common_features &= ~NETIF_F_SG;
++
++	return venet_set_op(dev, data, ethtool_op_set_sg);
++}
++
++static int venet_op_set_tx_csum(struct net_device *dev, u32 data)
++{
++	if (!ve_is_super(get_exec_env()))
++		return -EPERM;
++
++	if (data)
++		common_features |= NETIF_F_IP_CSUM;
++	else
++		common_features &= ~NETIF_F_IP_CSUM;
++
++	return venet_set_op(dev, data, ethtool_op_set_tx_csum);
++}
++
++#define venet_op_set_rx_csum venet_op_set_tx_csum
++
++static struct ethtool_ops venet_ethtool_ops = {
++	.get_sg = ethtool_op_get_sg,
++	.set_sg = venet_op_set_sg,
++	.get_tx_csum = ethtool_op_get_tx_csum,
++	.set_tx_csum = venet_op_set_tx_csum,
++	.get_rx_csum = ethtool_op_get_tx_csum,
++	.set_rx_csum = venet_op_set_rx_csum,
++	.get_tso = ethtool_op_get_tso,
++};
++
++static void venet_setup(struct net_device *dev)
++{
++	dev->init = venet_init_dev;
++	/*
++	 * No other features, as they are:
++	 * - checksumming is required, and nobody else will do our job
++	 */
++	dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX |
++		NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED;
++
++	dev->features |= common_features;
++
++	SET_ETHTOOL_OPS(dev, &venet_ethtool_ops);
++}
++
++#ifdef CONFIG_PROC_FS
++static int veinfo_seq_show(struct seq_file *m, void *v)
++{
++	struct ve_struct *ve;
++	struct ip_entry_struct *entry;
++
++	ve = list_entry((struct list_head *)v, struct ve_struct, ve_list);
++
++	seq_printf(m, "%10u %5u %5u", ve->veid,
++			ve->class_id, atomic_read(&ve->pcounter));
++	read_lock(&veip_hash_lock);
++	if (ve->veip == NULL)
++		goto unlock;
++	list_for_each_entry (entry, &ve->veip->ip_lh, ve_list) {
++		char addr[40];
++
++		if (entry->active_env == NULL)
++			continue;
++
++		veaddr_print(addr, sizeof(addr), &entry->addr);
++		if (entry->addr.family == AF_INET)
++			seq_printf(m, " %15s", addr);
++		else
++			seq_printf(m, " %39s", addr);
++	}
++unlock:
++	read_unlock(&veip_hash_lock);
++	seq_putc(m, '\n');
++	return 0;
++}
++
++static struct seq_operations veinfo_seq_op = {
++	.start	= ve_seq_start,
++	.next	= ve_seq_next,
++	.stop	= ve_seq_stop,
++	.show	= veinfo_seq_show,
++};
++
++static int veinfo_open(struct inode *inode, struct file *file)
++{
++	return seq_open(file, &veinfo_seq_op);
++}
++
++static struct file_operations proc_veinfo_operations = {
++	.open		= veinfo_open,
++	.read		= seq_read,
++	.llseek		= seq_lseek,
++	.release	= seq_release,
++};
++
++static void *veip_seq_start(struct seq_file
*m, loff_t *pos) ++{ ++ loff_t l; ++ struct list_head *p; ++ int i; ++ ++ l = *pos; ++ write_lock_irq(&veip_hash_lock); ++ if (l == 0) ++ return ip_entry_hash_table; ++ for (i = 0; i < VEIP_HASH_SZ; i++) { ++ list_for_each(p, ip_entry_hash_table + i) { ++ if (--l == 0) ++ return p; ++ } ++ } ++ return NULL; ++} ++ ++static void *veip_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ struct list_head *p; ++ ++ p = (struct list_head *)v; ++ while (1) { ++ p = p->next; ++ if (p < ip_entry_hash_table || ++ p >= ip_entry_hash_table + VEIP_HASH_SZ) { ++ (*pos)++; ++ return p; ++ } ++ if (++p >= ip_entry_hash_table + VEIP_HASH_SZ) ++ return NULL; ++ } ++ return NULL; ++} ++ ++static void veip_seq_stop(struct seq_file *m, void *v) ++{ ++ write_unlock_irq(&veip_hash_lock); ++} ++ ++static struct seq_operations veip_seq_op = { ++ .start = veip_seq_start, ++ .next = veip_seq_next, ++ .stop = veip_seq_stop, ++ .show = veip_seq_show, ++}; ++ ++static int veip_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &veip_seq_op); ++} ++ ++static struct file_operations proc_veip_operations = { ++ .open = veip_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++#endif ++ ++static int real_ve_ip_map(envid_t veid, int op, struct sockaddr __user *uaddr, ++ int addrlen) ++{ ++ int err; ++ struct ve_struct *ve; ++ struct ve_addr_struct addr; ++ ++ err = -EPERM; ++ if (!capable(CAP_SETVEID)) ++ goto out; ++ ++ err = sockaddr_to_veaddr(uaddr, addrlen, &addr); ++ if (err < 0) ++ goto out; ++ ++ switch (op) ++ { ++ case VE_IP_ADD: ++ ve = get_ve_by_id(veid); ++ err = -ESRCH; ++ if (!ve) ++ goto out; ++ ++ down_read(&ve->op_sem); ++ if (ve->is_running) ++ err = veip_entry_add(ve, &addr); ++ up_read(&ve->op_sem); ++ put_ve(ve); ++ break; ++ ++ case VE_IP_DEL: ++ err = veip_entry_del(veid, &addr); ++ break; ++ default: ++ err = -EINVAL; ++ } ++ ++out: ++ return err; ++} ++ ++int venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ int err; ++ ++ err = -ENOTTY; ++ switch(cmd) { ++ case VENETCTL_VE_IP_MAP: { ++ struct vzctl_ve_ip_map s; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void __user *)arg, sizeof(s))) ++ break; ++ err = real_ve_ip_map(s.veid, s.op, s.addr, s.addrlen); ++ break; ++ } ++ } ++ return err; ++} ++ ++#ifdef CONFIG_COMPAT ++int compat_venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ int err; ++ ++ switch(cmd) { ++ case VENETCTL_COMPAT_VE_IP_MAP: { ++ struct compat_vzctl_ve_ip_map cs; ++ ++ err = -EFAULT; ++ if (copy_from_user(&cs, (void *)arg, sizeof(cs))) ++ break; ++ ++ err = real_ve_ip_map(cs.veid, cs.op, compat_ptr(cs.addr), ++ cs.addrlen); ++ break; ++ } ++ default: ++ err = venet_ioctl(file, cmd, arg); ++ break; ++ } ++ return err; ++} ++#endif ++ ++static struct vzioctlinfo venetcalls = { ++ .type = VENETCTLTYPE, ++ .ioctl = venet_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = compat_venet_ioctl, ++#endif ++ .owner = THIS_MODULE, ++}; ++ ++int venet_dev_start(struct ve_struct *ve) ++{ ++ struct net_device *dev_venet; ++ int err; ++ ++ dev_venet = alloc_netdev(0, "venet%d", venet_setup); ++ if (!dev_venet) ++ return -ENOMEM; ++ dev_net_set(dev_venet, ve->ve_netns); ++ err = dev_alloc_name(dev_venet, dev_venet->name); ++ if (err<0) ++ goto err; ++ if ((err = register_netdev(dev_venet)) != 0) ++ goto err; ++ ve->_venet_dev = dev_venet; ++ return 0; ++err: ++ free_netdev(dev_venet); ++ printk(KERN_ERR "VENET initialization error err=%d\n", err); ++ return err; ++} ++ ++static int 
venet_start(void *data) ++{ ++ struct ve_struct *env; ++ int err; ++ ++ env = (struct ve_struct *)data; ++ if (env->veip) ++ return -EEXIST; ++ ++ err = veip_start(env); ++ if (err != 0) ++ return err; ++ ++ err = venet_dev_start(env); ++ if (err) ++ goto err_free; ++ return 0; ++ ++err_free: ++ veip_stop(env); ++ return err; ++} ++ ++static void venet_stop(void *data) ++{ ++ struct ve_struct *env; ++ struct net_device *dev; ++ ++ env = (struct ve_struct *)data; ++ veip_stop(env); ++ ++ dev = env->_venet_dev; ++ if (dev == NULL) ++ return; ++ ++ unregister_netdev(dev); ++ env->_venet_dev = NULL; ++ free_netdev(dev); ++} ++ ++static struct ve_hook venet_ve_hook = { ++ .init = venet_start, ++ .fini = venet_stop, ++ .owner = THIS_MODULE, ++ .priority = HOOK_PRIO_NET, ++}; ++ ++__init int venet_init(void) ++{ ++#ifdef CONFIG_PROC_FS ++ struct proc_dir_entry *de; ++#endif ++ int i, err; ++ ++ if (get_ve0()->_venet_dev != NULL) ++ return -EEXIST; ++ ++ for (i = 0; i < VEIP_HASH_SZ; i++) ++ INIT_LIST_HEAD(ip_entry_hash_table + i); ++ ++ err = venet_start(get_ve0()); ++ if (err) ++ return err; ++ ++#ifdef CONFIG_PROC_FS ++ de = proc_create("veinfo", S_IFREG | S_IRUSR, glob_proc_vz_dir, ++ &proc_veinfo_operations); ++ if (de == NULL) ++ printk(KERN_WARNING "venet: can't make veinfo proc entry\n"); ++ ++ de = proc_create("veip", S_IFREG | S_IRUSR, proc_vz_dir, ++ &proc_veip_operations); ++ if (de == NULL) ++ printk(KERN_WARNING "venet: can't make veip proc entry\n"); ++#endif ++ ++ ve_hook_register(VE_SS_CHAIN, &venet_ve_hook); ++ vzioctl_register(&venetcalls); ++ return 0; ++} ++ ++__exit void venet_exit(void) ++{ ++ vzioctl_unregister(&venetcalls); ++ ve_hook_unregister(&venet_ve_hook); ++ ++#ifdef CONFIG_PROC_FS ++ remove_proc_entry("veip", proc_vz_dir); ++ remove_proc_entry("veinfo", glob_proc_vz_dir); ++#endif ++ venet_stop(get_ve0()); ++ veip_cleanup(); ++} ++ ++module_init(venet_init); ++module_exit(venet_exit); +diff --git a/drivers/net/vzethdev.c b/drivers/net/vzethdev.c +new file mode 100644 +index 0000000..1414618 +--- /dev/null ++++ b/drivers/net/vzethdev.c +@@ -0,0 +1,692 @@ ++/* ++ * veth.c ++ * ++ * Copyright (C) 2006 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++/* ++ * Virtual ethernet device used to change VE ownership on packets ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include /* For the statistics structure. 
*/ ++#include /* For ARPHRD_ETHER */ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++static LIST_HEAD(veth_hwaddr_list); ++static DEFINE_RWLOCK(ve_hwaddr_lock); ++static DECLARE_MUTEX(hwaddr_sem); ++ ++struct net_device * veth_dev_start(char *dev_addr, char *name); ++ ++struct veth_struct *hwaddr_entry_lookup(char *name) ++{ ++ struct veth_struct *entry; ++ ++ list_for_each_entry(entry, &veth_hwaddr_list, hwaddr_list) { ++ BUG_ON(entry->pair == NULL); ++ if (strncmp(name, entry->pair->name, IFNAMSIZ) == 0) ++ return entry; ++ } ++ return NULL; ++} ++ ++int veth_entry_add(struct ve_struct *ve, char *dev_addr, char *name, ++ char *dev_addr_ve, char *name_ve) ++{ ++ struct net_device *dev_ve; ++ struct net_device *dev_ve0; ++ struct ve_struct *old_env; ++ char dev_name[IFNAMSIZ]; ++ int err; ++ ++ down(&hwaddr_sem); ++ ++ if (name[0] == '\0') ++ snprintf(dev_name, sizeof(dev_name), "vz%d.%%d", ve->veid); ++ else { ++ memcpy(dev_name, name, IFNAMSIZ - 1); ++ dev_name[IFNAMSIZ - 1] = '\0'; ++ } ++ dev_ve0 = veth_dev_start(dev_addr, dev_name); ++ if (IS_ERR(dev_ve0)) { ++ err = PTR_ERR(dev_ve0); ++ goto err; ++ } ++ ++ old_env = set_exec_env(ve); ++ if (name_ve[0] == '\0') ++ sprintf(dev_name, "eth%%d"); ++ else { ++ memcpy(dev_name, name_ve, IFNAMSIZ - 1); ++ dev_name[IFNAMSIZ - 1] = '\0'; ++ } ++ dev_ve = veth_dev_start(dev_addr_ve, dev_name); ++ if (IS_ERR(dev_ve)) { ++ err = PTR_ERR(dev_ve); ++ goto err_ve; ++ } ++ set_exec_env(old_env); ++ veth_from_netdev(dev_ve)->pair = dev_ve0; ++ veth_from_netdev(dev_ve0)->pair = dev_ve; ++ ++ write_lock(&ve_hwaddr_lock); ++ list_add(&(veth_from_netdev(dev_ve)->hwaddr_list), &veth_hwaddr_list); ++ write_unlock(&ve_hwaddr_lock); ++ ++ up(&hwaddr_sem); ++ return 0; ++ ++err_ve: ++ set_exec_env(old_env); ++ unregister_netdev(dev_ve0); ++err: ++ up(&hwaddr_sem); ++ return err; ++} ++ ++void veth_pair_del(struct ve_struct *env, struct veth_struct *entry) ++{ ++ struct net_device *dev; ++ struct ve_struct *old_env; ++ ++ write_lock(&ve_hwaddr_lock); ++ list_del(&entry->hwaddr_list); ++ write_unlock(&ve_hwaddr_lock); ++ ++ dev = entry->pair; ++ BUG_ON(entry->pair == NULL); ++ ++ veth_from_netdev(dev)->pair = NULL; ++ entry->pair = NULL; ++ rtnl_lock(); ++ old_env = set_exec_env(dev->owner_env); ++ dev_close(dev); ++ ++ /* ++ * Now device from VE0 does not send or receive anything, ++ * i.e. dev->hard_start_xmit won't be called. 
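++	 * It is therefore safe to unregister both ends of the pair below.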
++ */ ++ set_exec_env(env); ++ unregister_netdevice(veth_to_netdev(entry)); ++ set_exec_env(dev->owner_env); ++ unregister_netdevice(dev); ++ set_exec_env(old_env); ++ rtnl_unlock(); ++} ++ ++int veth_entry_del(struct ve_struct *ve, char *name) ++{ ++ struct veth_struct *found; ++ int err; ++ ++ err = -ENODEV; ++ down(&hwaddr_sem); ++ found = hwaddr_entry_lookup(name); ++ if (found == NULL) ++ goto out; ++ if (veth_to_netdev(found)->owner_env != ve) ++ goto out; ++ ++ err = 0; ++ veth_pair_del(ve, found); ++ ++out: ++ up(&hwaddr_sem); ++ return err; ++} ++ ++int veth_allow_change_mac(envid_t veid, char *name, int allow) ++{ ++ struct ve_struct *ve; ++ struct veth_struct *found; ++ int err; ++ ++ err = -ESRCH; ++ ve = get_ve_by_id(veid); ++ if (!ve) ++ return err; ++ ++ down_read(&ve->op_sem); ++ if (!ve->is_running) ++ goto out_ve; ++ err = -ENODEV; ++ down(&hwaddr_sem); ++ found = hwaddr_entry_lookup(name); ++ if (found == NULL) ++ goto out_sem; ++ if (veth_to_netdev(found)->owner_env != ve) ++ goto out_sem; ++ ++ err = 0; ++ found->allow_mac_change = allow; ++ ++out_sem: ++ up(&hwaddr_sem); ++out_ve: ++ up_read(&ve->op_sem); ++ put_ve(ve); ++ return err; ++} ++ ++/* ++ * Device functions ++ */ ++ ++static int veth_open(struct net_device *dev) ++{ ++ return 0; ++} ++ ++static int veth_close(struct net_device *master) ++{ ++ return 0; ++} ++ ++static void veth_destructor(struct net_device *dev) ++{ ++ free_percpu(veth_from_netdev(dev)->real_stats); ++ free_netdev(dev); ++} ++ ++static struct net_device_stats *get_stats(struct net_device *dev) ++{ ++ int i; ++ struct net_device_stats *stats; ++ ++ stats = &veth_from_netdev(dev)->stats; ++ memset(stats, 0, sizeof(struct net_device_stats)); ++ for (i = 0; i < NR_CPUS; i++) { ++ struct net_device_stats *dev_stats; ++ ++ if (!cpu_possible(i)) ++ continue; ++ dev_stats = veth_stats(dev, i); ++ stats->rx_bytes += dev_stats->rx_bytes; ++ stats->tx_bytes += dev_stats->tx_bytes; ++ stats->rx_packets += dev_stats->rx_packets; ++ stats->tx_packets += dev_stats->tx_packets; ++ } ++ ++ return stats; ++} ++ ++/* ++ * The higher levels take care of making this non-reentrant (it's ++ * called with bh's disabled). 
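++ * This is also where the packet changes hands: skb->owner_env is set
++ * to the receiving device's VE before netif_rx() requeues the skb.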
++ */
++static int veth_xmit(struct sk_buff *skb, struct net_device *dev)
++{
++	struct net_device_stats *stats;
++	struct net_device *rcv = NULL;
++	struct veth_struct *entry;
++	int length;
++
++	stats = veth_stats(dev, smp_processor_id());
++	if (unlikely(get_exec_env()->disable_net))
++		goto outf;
++
++	entry = veth_from_netdev(dev);
++	rcv = entry->pair;
++	if (!rcv)
++		/* VE going down */
++		goto outf;
++
++	if (!(rcv->flags & IFF_UP)) {
++		/* Target VE does not want to receive packets */
++		goto outf;
++	}
++
++	if (unlikely(rcv->owner_env->disable_net))
++		goto outf;
++	/* Filtering */
++	if (ve_is_super(dev->owner_env) &&
++			!veth_from_netdev(rcv)->allow_mac_change) {
++		/* from VE0 to VEX */
++		if (ve_is_super(rcv->owner_env))
++			goto out;
++		if (is_multicast_ether_addr(
++					((struct ethhdr *)skb->data)->h_dest))
++			goto out;
++		if (compare_ether_addr(((struct ethhdr *)skb->data)->h_dest,
++					rcv->dev_addr))
++			goto outf;
++	} else if (!ve_is_super(dev->owner_env) &&
++			!entry->allow_mac_change) {
++		/* from VE to VE0 */
++		if (compare_ether_addr(((struct ethhdr *)skb->data)->h_source,
++					dev->dev_addr))
++			goto outf;
++	}
++
++out:
++	skb->owner_env = rcv->owner_env;
++
++	skb->dev = rcv;
++	skb->pkt_type = PACKET_HOST;
++	skb->protocol = eth_type_trans(skb, rcv);
++
++	if (skb->protocol != __constant_htons(ETH_P_IP))
++		skb_orphan(skb);
++
++	dst_release(skb->dst);
++	skb->dst = NULL;
++#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
++	nf_conntrack_put(skb->nfct);
++	skb->nfct = NULL;
++#endif
++	length = skb->len;
++
++	netif_rx(skb);
++
++	stats->tx_bytes += length;
++	stats->tx_packets++;
++	if (rcv) {
++		struct net_device_stats *rcv_stats;
++		rcv_stats = veth_stats(rcv, smp_processor_id());
++		rcv_stats->rx_bytes += length;
++		rcv_stats->rx_packets++;
++	}
++
++	return 0;
++
++outf:
++	kfree_skb(skb);
++	stats->tx_dropped++;
++	return 0;
++}
++
++static int veth_set_mac(struct net_device *dev, void *p)
++{
++	struct sockaddr *addr = p;
++
++	if (!ve_is_super(dev->owner_env) &&
++			!veth_from_netdev(dev)->allow_mac_change)
++		return -EPERM;
++	if (netif_running(dev))
++		return -EBUSY;
++	if (!is_valid_ether_addr(addr->sa_data))
++		return -EADDRNOTAVAIL;
++
++	memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
++
++	return 0;
++}
++
++int veth_init_dev(struct net_device *dev)
++{
++	dev->hard_start_xmit = veth_xmit;
++	dev->get_stats = get_stats;
++	dev->open = veth_open;
++	dev->stop = veth_close;
++	dev->destructor = veth_destructor;
++
++	ether_setup(dev);
++	dev->set_mac_address = veth_set_mac;
++
++	/* remove the change_mtu handler set by ether_setup() */
++	dev->change_mtu = NULL;
++
++	dev->tx_queue_len = 0;
++
++	veth_from_netdev(dev)->real_stats =
++		alloc_percpu(struct net_device_stats);
++	if (veth_from_netdev(dev)->real_stats == NULL)
++		return -ENOMEM;
++
++	return 0;
++}
++
++static int
++veth_set_op(struct net_device *dev, u32 data,
++		int (*fop)(struct net_device *, u32))
++{
++	struct net_device *pair;
++	int ret = 0;
++
++	ret = fop(dev, data);
++	if (ret < 0)
++		goto out;
++
++	pair = veth_from_netdev(dev)->pair;
++	if (pair)
++		ret = fop(pair, data);
++out:
++	return ret;
++}
++
++static int veth_op_set_sg(struct net_device *dev, u32 data)
++{
++	return veth_set_op(dev, data, ethtool_op_set_sg);
++}
++
++static int veth_op_set_tx_csum(struct net_device *dev, u32 data)
++{
++	return veth_set_op(dev, data, ethtool_op_set_tx_csum);
++}
++
++#define veth_op_set_rx_csum veth_op_set_tx_csum
++
++static struct ethtool_ops veth_ethtool_ops = {
++	.get_sg = ethtool_op_get_sg,
++	.set_sg = veth_op_set_sg,
++	.get_tx_csum = ethtool_op_get_tx_csum,
++	.set_tx_csum = veth_op_set_tx_csum,
++	.get_rx_csum = ethtool_op_get_tx_csum,
++	.set_rx_csum = veth_op_set_rx_csum,
++	.get_tso = ethtool_op_get_tso,
++};
++
++static void veth_setup(struct net_device *dev)
++{
++	dev->init = veth_init_dev;
++	/*
++	 * No other features, as they are:
++	 * - checksumming is required, and nobody else will do our job
++	 */
++	dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX |
++		NETIF_F_HIGHDMA;
++
++	SET_ETHTOOL_OPS(dev, &veth_ethtool_ops);
++}
++
++#ifdef CONFIG_PROC_FS
++#define ADDR_FMT "%02x:%02x:%02x:%02x:%02x:%02x"
++#define ADDR_ARG(x) (x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5]
++static int vehwaddr_seq_show(struct seq_file *m, void *v)
++{
++	struct list_head *p;
++	struct veth_struct *entry;
++
++	p = (struct list_head *)v;
++	if (p == &veth_hwaddr_list) {
++		seq_puts(m, "Version: 1.0\n");
++		return 0;
++	}
++	entry = list_entry(p, struct veth_struct, hwaddr_list);
++	seq_printf(m, ADDR_FMT " %16s ",
++			ADDR_ARG(entry->pair->dev_addr), entry->pair->name);
++	seq_printf(m, ADDR_FMT " %16s %10u %5s\n",
++			ADDR_ARG(veth_to_netdev(entry)->dev_addr),
++			veth_to_netdev(entry)->name,
++			VEID(veth_to_netdev(entry)->owner_env),
++			entry->allow_mac_change ? "allow" : "deny");
++	return 0;
++}
++
++static void *vehwaddr_seq_start(struct seq_file *m, loff_t *pos)
++{
++	read_lock(&ve_hwaddr_lock);
++	return seq_list_start_head(&veth_hwaddr_list, *pos);
++}
++
++static void *vehwaddr_seq_next(struct seq_file *m, void *v, loff_t *pos)
++{
++	return seq_list_next(v, &veth_hwaddr_list, pos);
++}
++
++static void vehwaddr_seq_stop(struct seq_file *m, void *v)
++{
++	read_unlock(&ve_hwaddr_lock);
++}
++
++static struct seq_operations vehwaddr_seq_op = {
++	.start	= vehwaddr_seq_start,
++	.next	= vehwaddr_seq_next,
++	.stop	= vehwaddr_seq_stop,
++	.show	= vehwaddr_seq_show,
++};
++
++static int vehwaddr_open(struct inode *inode, struct file *file)
++{
++	return seq_open(file, &vehwaddr_seq_op);
++}
++
++static struct file_operations proc_vehwaddr_operations = {
++	.open	= vehwaddr_open,
++	.read	= seq_read,
++	.llseek	= seq_lseek,
++	.release = seq_release,
++};
++#endif
++
++int real_ve_hwaddr(envid_t veid, int op,
++		unsigned char *dev_addr, int addrlen, char *name,
++		unsigned char *dev_addr_ve, int addrlen_ve, char *name_ve)
++{
++	int err;
++	struct ve_struct *ve;
++	char ve_addr[ETH_ALEN];
++
++	err = -EPERM;
++	if (!capable(CAP_NET_ADMIN))
++		goto out;
++
++	err = -EINVAL;
++	switch (op) {
++	case VE_ETH_ADD:
++		if (addrlen != ETH_ALEN)
++			goto out;
++		if (addrlen_ve != ETH_ALEN && addrlen_ve != 0)
++			goto out;
++		/* If ve addr is not set then we use dev_addr[3] & 0x80 for it */
++		if (addrlen_ve == 0 && (dev_addr[3] & 0x80))
++			goto out;
++		if (addrlen_ve == 0) {
++			memcpy(ve_addr, dev_addr, ETH_ALEN);
++			ve_addr[3] |= 0x80;
++		} else {
++			memcpy(ve_addr, dev_addr_ve, ETH_ALEN);
++		}
++
++		ve = get_ve_by_id(veid);
++		err = -ESRCH;
++		if (!ve)
++			goto out;
++
++		down_read(&ve->op_sem);
++		if (ve->is_running)
++			err = veth_entry_add(ve, dev_addr, name, ve_addr, name_ve);
++		up_read(&ve->op_sem);
++		put_ve(ve);
++		break;
++
++	case VE_ETH_DEL:
++		if (name[0] == '\0')
++			goto out;
++		ve = get_ve_by_id(veid);
++		err = -ESRCH;
++		if (!ve)
++			goto out;
++
++		down_read(&ve->op_sem);
++		if (ve->is_running)
++			err = veth_entry_del(ve, name);
++		up_read(&ve->op_sem);
++		put_ve(ve);
++		break;
++	case VE_ETH_ALLOW_MAC_CHANGE:
++	case 
VE_ETH_DENY_MAC_CHANGE: ++ err = veth_allow_change_mac(veid, name, ++ op == VE_ETH_ALLOW_MAC_CHANGE); ++ break; ++ } ++ ++out: ++ return err; ++} ++ ++int veth_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ int err; ++ ++ err = -ENOTTY; ++ switch(cmd) { ++ case VETHCTL_VE_HWADDR: { ++ struct vzctl_ve_hwaddr s; ++ ++ err = -EFAULT; ++ if (copy_from_user(&s, (void __user *)arg, sizeof(s))) ++ break; ++ err = real_ve_hwaddr(s.veid, s.op, s.dev_addr, s.addrlen, ++ s.dev_name, s.dev_addr_ve, s.addrlen_ve, ++ s.dev_name_ve); ++ } ++ break; ++ } ++ return err; ++} ++ ++static struct vzioctlinfo vethcalls = { ++ .type = VETHCTLTYPE, ++ .ioctl = veth_ioctl, ++ .compat_ioctl = veth_ioctl, ++ .owner = THIS_MODULE, ++}; ++ ++struct net_device * veth_dev_start(char *dev_addr, char *name) ++{ ++ struct net_device *dev; ++ int err; ++ ++ if (!is_valid_ether_addr(dev_addr)) ++ return ERR_PTR(-EADDRNOTAVAIL); ++ ++ dev = alloc_netdev(sizeof(struct veth_struct), name, veth_setup); ++ if (!dev) ++ return ERR_PTR(-ENOMEM); ++ dev->nd_net = get_exec_env()->ve_netns; ++ if (strchr(dev->name, '%')) { ++ err = dev_alloc_name(dev, dev->name); ++ if (err < 0) ++ goto err; ++ } ++ if ((err = register_netdev(dev)) != 0) ++ goto err; ++ ++ memcpy(dev->dev_addr, dev_addr, ETH_ALEN); ++ dev->addr_len = ETH_ALEN; ++ ++ return dev; ++err: ++ free_netdev(dev); ++ printk(KERN_ERR "%s initialization error err=%d\n", name, err); ++ return ERR_PTR(err); ++} ++ ++static int veth_start(void *data) ++{ ++ return 0; ++} ++ ++static void veth_stop(void *data) ++{ ++ struct ve_struct *env; ++ struct veth_struct *entry, *tmp; ++ ++ env = (struct ve_struct *)data; ++ down(&hwaddr_sem); ++ list_for_each_entry_safe(entry, tmp, &veth_hwaddr_list, hwaddr_list) ++ if (VEID(env) == VEID(veth_to_netdev(entry)->owner_env)) ++ veth_pair_del(env, entry); ++ up(&hwaddr_sem); ++} ++ ++static struct ve_hook veth_ve_hook = { ++ .init = veth_start, ++ .fini = veth_stop, ++ .owner = THIS_MODULE, ++ .priority = HOOK_PRIO_NET, ++}; ++ ++__init int veth_init(void) ++{ ++#ifdef CONFIG_PROC_FS ++ struct proc_dir_entry *de; ++ ++ de = proc_create("veth", S_IFREG|S_IRUSR, proc_vz_dir, ++ &proc_vehwaddr_operations); ++ if (de == NULL) ++ printk(KERN_WARNING "veth: can't make vehwaddr proc entry\n"); ++#endif ++ ++ ve_hook_register(VE_SS_CHAIN, &veth_ve_hook); ++ vzioctl_register(&vethcalls); ++ KSYMRESOLVE(veth_open); ++ KSYMMODRESOLVE(vzethdev); ++ return 0; ++} ++ ++__exit void veth_exit(void) ++{ ++ struct veth_struct *entry; ++ struct list_head *tmp, *n; ++ struct ve_struct *ve; ++ ++ KSYMMODUNRESOLVE(vzethdev); ++ KSYMUNRESOLVE(veth_open); ++ vzioctl_unregister(&vethcalls); ++ ve_hook_unregister(&veth_ve_hook); ++#ifdef CONFIG_PROC_FS ++ remove_proc_entry("veth", proc_vz_dir); ++#endif ++ ++ down(&hwaddr_sem); ++ list_for_each_safe(tmp, n, &veth_hwaddr_list) { ++ entry = list_entry(tmp, struct veth_struct, hwaddr_list); ++ ve = get_ve(veth_to_netdev(entry)->owner_env); ++ ++ veth_pair_del(ve, entry); ++ ++ put_ve(ve); ++ } ++ up(&hwaddr_sem); ++} ++ ++module_init(veth_init); ++module_exit(veth_exit); ++ ++MODULE_AUTHOR("Andrey Mirkin "); ++MODULE_DESCRIPTION("Virtuozzo Virtual Ethernet Device"); ++MODULE_LICENSE("GPL v2"); ++ +diff --git a/fs/Kconfig b/fs/Kconfig +index 2694648..be9d729 100644 +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -554,13 +554,22 @@ config QUOTA_NETLINK_INTERFACE + config PRINT_QUOTA_WARNING + bool "Print quota warnings to console (OBSOLETE)" + depends on QUOTA +- default y ++ default n + help + If you say Y 
here, quota warnings (about exceeding softlimit, reaching
+ hardlimit, etc.) will be printed to the process' controlling terminal.
+ Note that this behavior is currently deprecated and may go away in
+ future. Please use notification via netlink socket instead.
+
++config QUOTA_COMPAT
++ bool "Compatibility with older quotactl interface"
++ depends on QUOTA
++ help
++ This option enables a compatibility layer for the older version
++ of the quotactl interface with byte granularity (QUOTAON at 0x0100,
++ GETQUOTA at 0x0D00). Interface versions older than that one and
++ with block granularity are still not supported.
++
+ config QFMT_V1
+ tristate "Old quota format support"
+ depends on QUOTA
+@@ -576,6 +585,40 @@ config QFMT_V2
+ This quota format allows using quotas with 32-bit UIDs/GIDs. If you
+ need this functionality say Y here.
+
++config SIM_FS
++ tristate "VPS filesystem"
++ depends on VZ_QUOTA
++ default m
++ help
++ This file system is a part of Virtuozzo. It introduces a fake
++ superblock and blockdev to a VE to hide the real device and show
++ statfs results taken from quota.
++
++config VZ_QUOTA
++ tristate "Virtuozzo Disk Quota support"
++ select QUOTA
++ select QUOTA_COMPAT
++ select VZ_DEV
++ default m
++ help
++ Virtuozzo Disk Quota imposes disk quota on directories with their
++ files and subdirectories in total. Such disk quota is used to
++ account and limit disk usage by a Virtuozzo VPS, but may also be
++ used separately.
++
++config VZ_QUOTA_UNLOAD
++ bool "Unloadable Virtuozzo Disk Quota module"
++ depends on VZ_QUOTA=m
++ default n
++ help
++ Make the Virtuozzo Disk Quota module unloadable.
++ Doesn't work reliably yet.
++
++config VZ_QUOTA_UGID
++ bool "Per-user and per-group quota in Virtuozzo quota partitions"
++ depends on VZ_QUOTA!=n
++ default y
++
+ config QUOTACTL
+ bool
+ depends on XFS_QUOTA || QUOTA
+diff --git a/fs/Makefile b/fs/Makefile
+index 1e7a11b..4c87b36 100644
+--- a/fs/Makefile
++++ b/fs/Makefile
+@@ -53,9 +53,15 @@ obj-$(CONFIG_QUOTA) += dquot.o
+ obj-$(CONFIG_QFMT_V1) += quota_v1.o
+ obj-$(CONFIG_QFMT_V2) += quota_v2.o
+ obj-$(CONFIG_QUOTACTL) += quota.o
++obj-$(CONFIG_VZ_QUOTA) += vzdquota.o
++vzdquota-y += vzdquot.o vzdq_mgmt.o vzdq_ops.o vzdq_tree.o
++vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_ugid.o
++vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_file.o
+
+ obj-$(CONFIG_DNOTIFY) += dnotify.o
+
++obj-$(CONFIG_SIM_FS) += simfs.o
++
+ obj-$(CONFIG_PROC_FS) += proc/
+ obj-y += partitions/
+ obj-$(CONFIG_SYSFS) += sysfs/
+diff --git a/fs/aio.c b/fs/aio.c
+index 0fb3117..1a5d0d4 100644
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -43,13 +43,16 @@
+ #endif
+
+ /*------ sysctl variables----*/
+-static DEFINE_SPINLOCK(aio_nr_lock);
++DEFINE_SPINLOCK(aio_nr_lock);
++EXPORT_SYMBOL_GPL(aio_nr_lock);
+ unsigned long aio_nr; /* current system wide number of aio requests */
++EXPORT_SYMBOL_GPL(aio_nr);
+ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
+ /*----end sysctl variables---*/
+
+ static struct kmem_cache *kiocb_cachep;
+-static struct kmem_cache *kioctx_cachep;
++struct kmem_cache *kioctx_cachep;
++EXPORT_SYMBOL_GPL(kioctx_cachep);
+
+ static struct workqueue_struct *aio_wq;
+
+@@ -60,7 +63,7 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
+ static DEFINE_SPINLOCK(fput_lock);
+ static LIST_HEAD(fput_head);
+
+-static void aio_kick_handler(struct work_struct *);
++void aio_kick_handler(struct work_struct *);
+ static void aio_queue_work(struct kioctx *);
+
+ /* aio_setup
+@@ -327,7 +330,7 @@ static void aio_cancel_all(struct kioctx *ctx)
+ 
spin_unlock_irq(&ctx->ctx_lock); + } + +-static void wait_for_all_aios(struct kioctx *ctx) ++void wait_for_all_aios(struct kioctx *ctx) + { + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); +@@ -350,6 +353,7 @@ static void wait_for_all_aios(struct kioctx *ctx) + out: + spin_unlock_irq(&ctx->ctx_lock); + } ++EXPORT_SYMBOL_GPL(wait_for_all_aios); + + /* wait_on_sync_kiocb: + * Waits on the given sync kiocb to complete. +@@ -838,7 +842,7 @@ static inline void aio_run_all_iocbs(struct kioctx *ctx) + * space. + * Run on aiod's context. + */ +-static void aio_kick_handler(struct work_struct *work) ++void aio_kick_handler(struct work_struct *work) + { + struct kioctx *ctx = container_of(work, struct kioctx, wq.work); + mm_segment_t oldfs = get_fs(); +@@ -859,7 +863,7 @@ static void aio_kick_handler(struct work_struct *work) + if (requeue) + queue_delayed_work(aio_wq, &ctx->wq, 0); + } +- ++EXPORT_SYMBOL_GPL(aio_kick_handler); + + /* + * Called by kick_iocb to queue the kiocb for retry +diff --git a/fs/autofs/init.c b/fs/autofs/init.c +index cea5219..1217caf 100644 +--- a/fs/autofs/init.c ++++ b/fs/autofs/init.c +@@ -25,6 +25,7 @@ static struct file_system_type autofs_fs_type = { + .name = "autofs", + .get_sb = autofs_get_sb, + .kill_sb = autofs_kill_sb, ++ .fs_flags = FS_VIRTUALIZED, + }; + + static int __init init_autofs_fs(void) +diff --git a/fs/autofs/root.c b/fs/autofs/root.c +index 8aacade..f273f47 100644 +--- a/fs/autofs/root.c ++++ b/fs/autofs/root.c +@@ -362,7 +362,7 @@ static int autofs_root_unlink(struct inode *dir, struct dentry *dentry) + + /* This allows root to remove symlinks */ + lock_kernel(); +- if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) { ++ if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) { + unlock_kernel(); + return -EACCES; + } +@@ -556,7 +556,7 @@ static int autofs_root_ioctl(struct inode *inode, struct file *filp, + _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) + return -ENOTTY; + +- if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) ++ if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + + switch(cmd) { +diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c +index 723a1c5..01ac1e0 100644 +--- a/fs/autofs4/init.c ++++ b/fs/autofs4/init.c +@@ -25,6 +25,7 @@ static struct file_system_type autofs_fs_type = { + .name = "autofs", + .get_sb = autofs_get_sb, + .kill_sb = autofs4_kill_sb, ++ .fs_flags = FS_VIRTUALIZED, + }; + + static int __init init_autofs4_fs(void) +diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c +index edf5b6b..4e9cacc 100644 +--- a/fs/autofs4/root.c ++++ b/fs/autofs4/root.c +@@ -785,7 +785,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) + struct autofs_info *p_ino; + + /* This allows root to remove symlinks */ +- if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) ++ if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) + return -EACCES; + + if (atomic_dec_and_test(&ino->count)) { +@@ -1005,7 +1005,7 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp, + _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) + return -ENOTTY; + +- if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) ++ if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + + switch(cmd) { +diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c +index 75e5955..67d444c 100644 +--- a/fs/autofs4/waitq.c ++++ b/fs/autofs4/waitq.c +@@ 
-136,6 +136,16 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
+ struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
+
+ pktsz = sizeof(*packet);
++#if defined CONFIG_X86_64 && defined CONFIG_IA32_EMULATION
++ /*
++ * On x86_64 the autofs_v5_packet struct is padded with 4 bytes,
++ * which broke autofs daemons running in ia32 emulation mode.
++ *
++ * Reduce the size in 32-bit mode to match userspace expectations.
++ */
++ if (test_thread_flag(TIF_IA32))
++ pktsz -= 4;
++#endif
+
+ packet->wait_queue_token = wq->wait_queue_token;
+ packet->len = wq->len;
+diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
+index ba4cddb..8430452 100644
+--- a/fs/binfmt_aout.c
++++ b/fs/binfmt_aout.c
+@@ -375,12 +375,12 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+ if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
+ (N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
+ {
+- printk(KERN_NOTICE "executable not page aligned\n");
++ ve_printk(VE_LOG, KERN_NOTICE "executable not page aligned\n");
+ }
+
+ if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit())
+ {
+- printk(KERN_WARNING
++ ve_printk(VE_LOG, KERN_WARNING
+ "fd_offset is not page aligned. Please convert program: %s\n",
+ bprm->file->f_path.dentry->d_name.name);
+ }
+@@ -495,7 +495,7 @@ static int load_aout_library(struct file *file)
+
+ if (printk_ratelimit())
+ {
+- printk(KERN_WARNING
++ ve_printk(VE_LOG, KERN_WARNING
+ "N_TXTOFF is not page aligned. Please convert library: %s\n",
+ file->f_path.dentry->d_name.name);
+ }
+diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
+index d48ff5f..67a3eaa 100644
+--- a/fs/binfmt_elf.c
++++ b/fs/binfmt_elf.c
+@@ -403,7 +403,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
+ eppnt = elf_phdata;
+ for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
+ if (eppnt->p_type == PT_LOAD) {
+- int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
++ int elf_type = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECPRIO;
+ int elf_prot = 0;
+ unsigned long vaddr = 0;
+ unsigned long k, map_addr;
+@@ -785,7 +785,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+ if (elf_ppnt->p_flags & PF_X)
+ elf_prot |= PROT_EXEC;
+
+- elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
++ elf_flags = MAP_PRIVATE | MAP_DENYWRITE |
++ MAP_EXECUTABLE | MAP_EXECPRIO;
+
+ vaddr = elf_ppnt->p_vaddr;
+ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
+@@ -920,7 +921,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+ set_binfmt(&elf_format);
+
+ #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+- retval = arch_setup_additional_pages(bprm, executable_stack);
++ retval = arch_setup_additional_pages(bprm, executable_stack, 0);
+ if (retval < 0) {
+ send_sig(SIGKILL, current, 0);
+ goto out;
+@@ -1517,7 +1518,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
+ * Allocate a structure for each thread. 
+ */ + rcu_read_lock(); +- do_each_thread(g, p) ++ do_each_thread_ve(g, p) + if (p->mm == dump_task->mm) { + t = kzalloc(offsetof(struct elf_thread_core_info, + notes[info->thread_notes]), +@@ -1539,7 +1540,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, + info->thread->next = t; + } + } +- while_each_thread(g, p); ++ while_each_thread_ve(g, p); + rcu_read_unlock(); + + /* +@@ -1721,7 +1722,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, + if (signr) { + struct elf_thread_status *ets; + rcu_read_lock(); +- do_each_thread(g, p) ++ do_each_thread_ve(g, p) + if (current->mm == p->mm && current != p) { + ets = kzalloc(sizeof(*ets), GFP_ATOMIC); + if (!ets) { +@@ -1731,7 +1732,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, + ets->thread = p; + list_add(&ets->list, &info->thread_list); + } +- while_each_thread(g, p); ++ while_each_thread_ve(g, p); + rcu_read_unlock(); + list_for_each(t, &info->thread_list) { + int sz; +diff --git a/fs/block_dev.c b/fs/block_dev.c +index 10d8a0a..fd077d4 100644 +--- a/fs/block_dev.c ++++ b/fs/block_dev.c +@@ -1304,7 +1304,7 @@ int __invalidate_device(struct block_device *bdev) + * hold). + */ + shrink_dcache_sb(sb); +- res = invalidate_inodes(sb); ++ res = invalidate_inodes_check(sb, 1); + drop_super(sb); + } + invalidate_bdev(bdev); +diff --git a/fs/buffer.c b/fs/buffer.c +index 0f51c0f..9585ec2 100644 +--- a/fs/buffer.c ++++ b/fs/buffer.c +@@ -700,6 +700,8 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); + static int __set_page_dirty(struct page *page, + struct address_space *mapping, int warn) + { ++ int acct = 0; ++ + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + +@@ -714,12 +716,14 @@ static int __set_page_dirty(struct page *page, + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); +- task_io_account_write(PAGE_CACHE_SIZE); ++ acct = 1; + } + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + write_unlock_irq(&mapping->tree_lock); ++ if (acct) ++ task_io_account_write(page, PAGE_CACHE_SIZE, 0); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + return 1; +diff --git a/fs/char_dev.c b/fs/char_dev.c +index 68e510b..8fd1195 100644 +--- a/fs/char_dev.c ++++ b/fs/char_dev.c +@@ -22,6 +22,8 @@ + #include + #include + ++#include ++ + #ifdef CONFIG_KMOD + #include + #endif +diff --git a/fs/compat.c b/fs/compat.c +index ed43e17..9ab3698 100644 +--- a/fs/compat.c ++++ b/fs/compat.c +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -73,6 +74,18 @@ int compat_printk(const char *fmt, ...) + + #include "read_write.h" + ++int ve_compat_printk(int dst, const char *fmt, ...) ++{ ++ va_list ap; ++ int ret; ++ if (!compat_log) ++ return 0; ++ va_start(ap, fmt); ++ ret = ve_vprintk(dst, fmt, ap); ++ va_end(ap); ++ return ret; ++} ++ + /* + * Not all architectures have sys_utime, so implement this in terms + * of sys_utimes. 
+@@ -244,6 +257,8 @@ asmlinkage long compat_sys_statfs(const char __user *path, struct compat_statfs + struct kstatfs tmp; + error = vfs_statfs(nd.path.dentry, &tmp); + if (!error) ++ error = faudit_statfs(nd.path.mnt->mnt_sb, &tmp); ++ if (!error) + error = put_compat_statfs(buf, &tmp); + path_put(&nd.path); + } +@@ -262,6 +277,8 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user + goto out; + error = vfs_statfs(file->f_path.dentry, &tmp); + if (!error) ++ error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp); ++ if (!error) + error = put_compat_statfs(buf, &tmp); + fput(file); + out: +@@ -312,6 +329,8 @@ asmlinkage long compat_sys_statfs64(const char __user *path, compat_size_t sz, s + struct kstatfs tmp; + error = vfs_statfs(nd.path.dentry, &tmp); + if (!error) ++ error = faudit_statfs(nd.path.mnt->mnt_sb, &tmp); ++ if (!error) + error = put_compat_statfs64(buf, &tmp); + path_put(&nd.path); + } +@@ -333,6 +352,8 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c + goto out; + error = vfs_statfs(file->f_path.dentry, &tmp); + if (!error) ++ error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp); ++ if (!error) + error = put_compat_statfs64(buf, &tmp); + fput(file); + out: +@@ -1351,6 +1372,10 @@ int compat_do_execve(char * filename, + struct file *file; + int retval; + ++ retval = virtinfo_gencall(VIRTINFO_DOEXECVE, NULL); ++ if (retval) ++ return retval; ++ + retval = -ENOMEM; + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) +diff --git a/fs/dcache.c b/fs/dcache.c +index 6068c25..422d2b4 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -27,13 +27,20 @@ + #include + #include + #include ++#include + #include + #include + #include + #include + #include ++#include ++#include ++#include ++#include + #include "internal.h" + ++#include ++#include + + int sysctl_vfs_cache_pressure __read_mostly = 100; + EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); +@@ -43,7 +50,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); + + EXPORT_SYMBOL(dcache_lock); + +-static struct kmem_cache *dentry_cache __read_mostly; ++struct kmem_cache *dentry_cache __read_mostly; + + #define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname)) + +@@ -146,6 +153,7 @@ static struct dentry *d_kill(struct dentry *dentry) + + list_del(&dentry->d_u.d_child); + dentry_stat.nr_dentry--; /* For d_free, below */ ++ preempt_enable_no_resched(); + /*drops the locks, at that point nobody can reach this dentry */ + dentry_iput(dentry); + parent = dentry->d_parent; +@@ -184,21 +192,31 @@ static struct dentry *d_kill(struct dentry *dentry) + + void dput(struct dentry *dentry) + { ++ struct user_beancounter *ub; ++ unsigned long d_ubsize; ++ + if (!dentry) + return; + + repeat: + if (atomic_read(&dentry->d_count) == 1) + might_sleep(); +- if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) +- return; ++ preempt_disable(); ++ if (unlikely(ub_dentry_on)) { ++ spin_lock(&dcache_lock); ++ if (!atomic_dec_and_test(&dentry->d_count)) { ++ ub_dentry_uncharge_locked(dentry); ++ spin_unlock(&dcache_lock); ++ goto out_preempt; ++ } ++ } else { ++ if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock)) ++ goto out_preempt; ++ } + + spin_lock(&dentry->d_lock); +- if (atomic_read(&dentry->d_count)) { +- spin_unlock(&dentry->d_lock); +- spin_unlock(&dcache_lock); +- return; +- } ++ if (atomic_read(&dentry->d_count)) ++ goto out_unlock; + + /* + * AV: ->d_delete() is _NOT_ allowed to block now. 
+@@ -215,17 +233,30 @@ repeat: + list_add(&dentry->d_lru, &dentry_unused); + dentry_stat.nr_unused++; + } ++out_unlock: + spin_unlock(&dentry->d_lock); ++ ub_dentry_uncharge_locked(dentry); + spin_unlock(&dcache_lock); ++out_preempt: ++ preempt_enable(); + return; + + unhash_it: + __d_drop(dentry); + kill_it: + dentry_lru_remove(dentry); ++ ++ ub = dentry->dentry_bc.d_ub; ++ d_ubsize = dentry->dentry_bc.d_ubsize; + dentry = d_kill(dentry); +- if (dentry) ++ preempt_disable(); ++ if (unlikely(ub_dentry_on)) { ++ uncharge_dcache(ub, d_ubsize); ++ put_beancounter(ub); ++ } ++ if (dentry) + goto repeat; ++ preempt_enable(); + } + + /** +@@ -291,6 +322,7 @@ static inline struct dentry * __dget_locked(struct dentry *dentry) + { + atomic_inc(&dentry->d_count); + dentry_lru_remove(dentry); ++ ub_dentry_charge_nofail(dentry); + return dentry; + } + +@@ -393,6 +425,7 @@ static void prune_one_dentry(struct dentry * dentry) + __acquires(dcache_lock) + { + __d_drop(dentry); ++ preempt_disable(); + dentry = d_kill(dentry); + + /* +@@ -408,6 +441,7 @@ static void prune_one_dentry(struct dentry * dentry) + dentry->d_op->d_delete(dentry); + dentry_lru_remove(dentry); + __d_drop(dentry); ++ preempt_disable(); + dentry = d_kill(dentry); + spin_lock(&dcache_lock); + } +@@ -701,6 +735,8 @@ void shrink_dcache_for_umount(struct super_block *sb) + + dentry = sb->s_root; + sb->s_root = NULL; ++ /* "/" was also charged in d_alloc_root() */ ++ ub_dentry_uncharge(dentry); + atomic_dec(&dentry->d_count); + shrink_dcache_for_umount_subtree(dentry); + +@@ -860,12 +896,18 @@ void shrink_dcache_parent(struct dentry * parent) + */ + static int shrink_dcache_memory(int nr, gfp_t gfp_mask) + { ++ int res = -1; ++ ++ KSTAT_PERF_ENTER(shrink_dcache) + if (nr) { + if (!(gfp_mask & __GFP_FS)) +- return -1; ++ goto out; + prune_dcache(nr, NULL); + } +- return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; ++ res = (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; ++out: ++ KSTAT_PERF_LEAVE(shrink_dcache) ++ return res; + } + + static struct shrinker dcache_shrinker = { +@@ -888,21 +930,27 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) + struct dentry *dentry; + char *dname; + ++ dname = NULL; ++ if (name->len > DNAME_INLINE_LEN-1) { ++ dname = kmalloc(name->len + 1, GFP_KERNEL); ++ if (!dname) ++ goto err_name; ++ } ++ ++ ub_dentry_alloc_start(); ++ + dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); + if (!dentry) +- return NULL; ++ goto err_alloc; + +- if (name->len > DNAME_INLINE_LEN-1) { +- dname = kmalloc(name->len + 1, GFP_KERNEL); +- if (!dname) { +- kmem_cache_free(dentry_cache, dentry); +- return NULL; +- } +- } else { ++ preempt_disable(); ++ if (dname == NULL) + dname = dentry->d_iname; +- } + dentry->d_name.name = dname; + ++ if (ub_dentry_alloc(dentry)) ++ goto err_charge; ++ + dentry->d_name.len = name->len; + dentry->d_name.hash = name->hash; + memcpy(dname, name->name, name->len); +@@ -933,12 +981,27 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) + } + + spin_lock(&dcache_lock); +- if (parent) ++ if (parent) { + list_add(&dentry->d_u.d_child, &parent->d_subdirs); ++ if (parent->d_flags & DCACHE_VIRTUAL) ++ dentry->d_flags |= DCACHE_VIRTUAL; ++ } + dentry_stat.nr_dentry++; + spin_unlock(&dcache_lock); ++ preempt_enable(); ++ ub_dentry_alloc_end(); + + return dentry; ++ ++err_charge: ++ preempt_enable(); ++ kmem_cache_free(dentry_cache, dentry); ++err_alloc: ++ if (name->len > DNAME_INLINE_LEN - 1) ++ kfree(dname); ++ 
ub_dentry_alloc_end(); ++err_name: ++ return NULL; + } + + struct dentry *d_alloc_name(struct dentry *parent, const char *name) +@@ -1244,12 +1307,12 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct hlist_head *head = d_hash(parent,hash); +- struct dentry *found = NULL; + struct hlist_node *node; +- struct dentry *dentry; ++ struct dentry *dentry, *found; + + rcu_read_lock(); + ++ found = NULL; + hlist_for_each_entry_rcu(dentry, node, head, d_hash) { + struct qstr *qstr; + +@@ -1286,6 +1349,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) + if (!d_unhashed(dentry)) { + atomic_inc(&dentry->d_count); + found = dentry; ++ if (ub_dentry_charge(found)) ++ goto charge_failure; + } + spin_unlock(&dentry->d_lock); + break; +@@ -1295,6 +1360,14 @@ next: + rcu_read_unlock(); + + return found; ++ ++charge_failure: ++ spin_unlock(&found->d_lock); ++ rcu_read_unlock(); ++ /* dentry is now unhashed, just kill it */ ++ dput(found); ++ /* ... and fail lookup */ ++ return NULL; + } + + /** +@@ -1763,6 +1836,16 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) + } + + /** ++ * d_root_check - checks if dentry is accessible from current's fs root ++ * @dentry: dentry to be verified ++ * @vfsmnt: vfsmnt to which the dentry belongs ++ */ ++int d_root_check(struct path *path) ++{ ++ return PTR_ERR(d_path(path, NULL, 0)); ++} ++ ++/** + * __d_path - return the path of a dentry + * @path: the dentry/vfsmount to report + * @root: root vfsmnt/dentry (may be modified by this function) +@@ -1786,18 +1869,21 @@ char *__d_path(const struct path *path, struct path *root, + struct vfsmount *vfsmnt = path->mnt; + char *end = buffer + buflen; + char *retval; ++ int deleted; ++ struct vfsmount *oldmnt = vfsmnt; + + spin_lock(&vfsmount_lock); +- prepend(&end, &buflen, "\0", 1); +- if (!IS_ROOT(dentry) && d_unhashed(dentry) && +- (prepend(&end, &buflen, " (deleted)", 10) != 0)) ++ if (buffer) { ++ prepend(&end, &buflen, "\0", 1); ++ if (buflen < 1) + goto Elong; ++ } ++ deleted = (!IS_ROOT(dentry) && d_unhashed(dentry)); + +- if (buflen < 1) +- goto Elong; + /* Get '/' right */ + retval = end-1; +- *retval = '/'; ++ if (buffer) ++ *retval = '/'; + + for (;;) { + struct dentry * parent; +@@ -1815,20 +1901,43 @@ char *__d_path(const struct path *path, struct path *root, + } + parent = dentry->d_parent; + prefetch(parent); +- if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || +- (prepend(&end, &buflen, "/", 1) != 0)) ++ if (buffer && ((prepend_name(&end, &buflen, &dentry->d_name) != 0) || ++ (prepend(&end, &buflen, "/", 1) != 0))) + goto Elong; + retval = end; + dentry = parent; + } + + out: ++ if (deleted && buffer && ++ prepend(&end, &buflen, " (deleted)", 10) != 0) ++ goto Elong; ++out_err: + spin_unlock(&vfsmount_lock); +- return retval; ++ return buffer ? retval : NULL; + + global_root: ++ /* ++ * We traversed the tree upward and reached a root, but the given ++ * lookup terminal point wasn't encountered. It means either that the ++ * dentry is out of our scope or belongs to an abstract space like ++ * sock_mnt or pipe_mnt. Check for it. ++ * ++ * There are different options to check it. ++ * We may assume that any dentry tree is unreachable unless it's ++ * connected to `root' (defined as fs root of init aka child reaper) ++ * and expose all paths that are not connected to it. 
++ * The other option is to allow exposing of known abstract spaces ++ * explicitly and hide the path information for other cases. ++ * This approach is more safe, let's take it. 2001/04/22 SAW ++ */ ++ if (!(oldmnt->mnt_sb->s_flags & MS_NOUSER)) { ++ retval = ERR_PTR(-EINVAL); ++ goto out_err; ++ } ++ + retval += 1; /* hit the slash */ +- if (prepend_name(&retval, &buflen, &dentry->d_name) != 0) ++ if (buffer && prepend_name(&retval, &buflen, &dentry->d_name) != 0) + goto Elong; + root->mnt = vfsmnt; + root->dentry = dentry; +@@ -1836,8 +1945,9 @@ global_root: + + Elong: + retval = ERR_PTR(-ENAMETOOLONG); +- goto out; ++ goto out_err; + } ++EXPORT_SYMBOL(__d_path); + + /** + * d_path - return the path of a dentry +@@ -1864,8 +1974,11 @@ char *d_path(const struct path *path, char *buf, int buflen) + * thus don't need to be hashed. They also don't need a name until a + * user wants to identify the object in /proc/pid/fd/. The little hack + * below allows us to generate a name for these objects on demand: ++ * ++ * pipefs and socketfs methods assume valid buffer, d_root_check() ++ * supplies NULL one for access checks. + */ +- if (path->dentry->d_op && path->dentry->d_op->d_dname) ++ if (buf && path->dentry->d_op && path->dentry->d_op->d_dname) + return path->dentry->d_op->d_dname(path->dentry, buf, buflen); + + read_lock(¤t->fs->lock); +@@ -1880,6 +1993,231 @@ char *d_path(const struct path *path, char *buf, int buflen) + return res; + } + ++#ifdef CONFIG_VE ++#include ++#include ++#include ++#include ++#include ++ ++static void mark_sub_tree_virtual(struct dentry *d) ++{ ++ struct dentry *orig_root; ++ ++ orig_root = d; ++ while (1) { ++ spin_lock(&d->d_lock); ++ d->d_flags |= DCACHE_VIRTUAL; ++ spin_unlock(&d->d_lock); ++ ++ if (!list_empty(&d->d_subdirs)) { ++ d = list_entry(d->d_subdirs.next, ++ struct dentry, d_u.d_child); ++ continue; ++ } ++ if (d == orig_root) ++ break; ++ while (d == list_entry(d->d_parent->d_subdirs.prev, ++ struct dentry, d_u.d_child)) { ++ d = d->d_parent; ++ if (d == orig_root) ++ goto out; ++ } ++ d = list_entry(d->d_u.d_child.next, ++ struct dentry, d_u.d_child); ++ } ++out: ++ return; ++} ++ ++void mark_tree_virtual(struct path *path) ++{ ++ struct vfsmount *orig_rootmnt; ++ struct vfsmount *m = path->mnt; ++ struct dentry *d = path->dentry; ++ ++ spin_lock(&dcache_lock); ++ spin_lock(&vfsmount_lock); ++ orig_rootmnt = m; ++ while (1) { ++ mark_sub_tree_virtual(d); ++ if (!list_empty(&m->mnt_mounts)) { ++ m = list_entry(m->mnt_mounts.next, ++ struct vfsmount, mnt_child); ++ d = m->mnt_root; ++ continue; ++ } ++ if (m == orig_rootmnt) ++ break; ++ while (m == list_entry(m->mnt_parent->mnt_mounts.prev, ++ struct vfsmount, mnt_child)) { ++ m = m->mnt_parent; ++ if (m == orig_rootmnt) ++ goto out; ++ } ++ m = list_entry(m->mnt_child.next, ++ struct vfsmount, mnt_child); ++ d = m->mnt_root; ++ } ++out: ++ spin_unlock(&vfsmount_lock); ++ spin_unlock(&dcache_lock); ++} ++EXPORT_SYMBOL(mark_tree_virtual); ++ ++static struct vz_rate_info area_ri = { 20, 10*HZ }; ++#define VE_AREA_ACC_CHECK 0x0001 ++#define VE_AREA_ACC_DENY 0x0002 ++#define VE_AREA_EXEC_CHECK 0x0010 ++#define VE_AREA_EXEC_DENY 0x0020 ++#define VE0_AREA_ACC_CHECK 0x0100 ++#define VE0_AREA_ACC_DENY 0x0200 ++#define VE0_AREA_EXEC_CHECK 0x1000 ++#define VE0_AREA_EXEC_DENY 0x2000 ++int ve_area_access_check = 0; ++ ++static void print_connection_info(struct task_struct *tsk) ++{ ++ struct files_struct *files; ++ struct fdtable *fdt; ++ int fd; ++ ++ files = get_files_struct(tsk); ++ if (!files) ++ return; 
++ ++ spin_lock(&files->file_lock); ++ fdt = files_fdtable(files); ++ for (fd = 0; fd < fdt->max_fds; fd++) { ++ struct file *file; ++ struct inode *inode; ++ struct socket *socket; ++ struct sock *sk; ++ struct inet_sock *inet; ++ ++ file = fdt->fd[fd]; ++ if (file == NULL) ++ continue; ++ ++ inode = file->f_dentry->d_inode; ++ if (!S_ISSOCK(inode->i_mode)) ++ continue; ++ ++ socket = SOCKET_I(inode); ++ if (socket == NULL) ++ continue; ++ ++ sk = socket->sk; ++ if ((sk->sk_family != PF_INET && sk->sk_family != PF_INET6) ++ || sk->sk_type != SOCK_STREAM) ++ continue; ++ ++ inet = inet_sk(sk); ++ printk(KERN_ALERT "connection from %u.%u.%u.%u:%u to port %u\n", ++ NIPQUAD(inet->daddr), ntohs(inet->dport), ++ inet->num); ++ } ++ spin_unlock(&files->file_lock); ++ put_files_struct(files); ++} ++ ++static void check_alert(struct path *path, char *str) ++{ ++ struct task_struct *tsk; ++ unsigned long page; ++ struct super_block *sb; ++ char *p; ++ ++ if (!vz_ratelimit(&area_ri)) ++ return; ++ ++ tsk = current; ++ p = ERR_PTR(-ENOMEM); ++ page = __get_free_page(GFP_KERNEL); ++ if (page) { ++ spin_lock(&dcache_lock); ++ p = __d_path(path, &tsk->fs->root, (char *)page, PAGE_SIZE); ++ spin_unlock(&dcache_lock); ++ } ++ if (IS_ERR(p)) ++ p = "(undefined)"; ++ ++ sb = path->dentry->d_sb; ++ printk(KERN_ALERT "%s check alert! file:[%s] from %d/%s, dev%x\n" ++ "Task %d/%d[%s] from VE%d, execenv %d\n", ++ str, p, sb->s_type->owner_env->veid, ++ sb->s_type->name, sb->s_dev, ++ tsk->pid, task_pid_vnr(tsk), tsk->comm, ++ VE_TASK_INFO(tsk)->owner_env->veid, ++ get_exec_env()->veid); ++ ++ free_page(page); ++ ++ print_connection_info(tsk); ++ ++ read_lock(&tasklist_lock); ++ tsk = tsk->parent; ++ get_task_struct(tsk); ++ read_unlock(&tasklist_lock); ++ ++ printk(KERN_ALERT "Parent %d/%d[%s] from VE%d\n", ++ tsk->pid, task_pid_vnr(tsk), tsk->comm, ++ VE_TASK_INFO(tsk)->owner_env->veid); ++ ++ print_connection_info(tsk); ++ put_task_struct(tsk); ++ dump_stack(); ++} ++#endif ++ ++int check_area_access_ve(struct path *path) ++{ ++#ifdef CONFIG_VE ++ int check, alert, deny; ++ ++ if (ve_is_super(get_exec_env())) { ++ check = ve_area_access_check & VE0_AREA_ACC_CHECK; ++ alert = path->dentry->d_flags & DCACHE_VIRTUAL; ++ deny = ve_area_access_check & VE0_AREA_ACC_DENY; ++ } else { ++ check = ve_area_access_check & VE_AREA_ACC_CHECK; ++ alert = !(path->dentry->d_flags & DCACHE_VIRTUAL); ++ deny = ve_area_access_check & VE_AREA_ACC_DENY; ++ } ++ ++ if (check && alert) ++ check_alert(path, "Access"); ++ if (deny && alert) ++ return -EACCES; ++#endif ++ return 0; ++} ++ ++#if 0 ++int check_area_execute_ve(struct dentry *dentry, struct vfsmount *mnt) ++{ ++#ifdef CONFIG_VE ++ int check, alert, deny; ++ ++ if (ve_is_super(get_exec_env())) { ++ check = ve_area_access_check & VE0_AREA_EXEC_CHECK; ++ alert = dentry->d_flags & DCACHE_VIRTUAL; ++ deny = ve_area_access_check & VE0_AREA_EXEC_DENY; ++ } else { ++ check = ve_area_access_check & VE_AREA_EXEC_CHECK; ++ alert = !(dentry->d_flags & DCACHE_VIRTUAL); ++ deny = ve_area_access_check & VE_AREA_EXEC_DENY; ++ } ++ ++ if (check && alert) ++ check_alert(mnt, dentry, "Exec"); ++ if (deny && alert) ++ return -EACCES; ++#endif ++ return 0; ++} ++#endif ++ + /* + * Helper function for dentry_operations.d_dname() members + */ +@@ -2072,10 +2410,12 @@ resume: + goto repeat; + } + atomic_dec(&dentry->d_count); ++ ub_dentry_uncharge_locked(dentry); + } + if (this_parent != root) { + next = this_parent->d_u.d_child.next; + atomic_dec(&this_parent->d_count); ++ 
ub_dentry_uncharge_locked(this_parent); + this_parent = this_parent->d_parent; + goto resume; + } +diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c +index 285b64a..d89511b 100644 +--- a/fs/devpts/inode.c ++++ b/fs/devpts/inode.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + + #define DEVPTS_SUPER_MAGIC 0x1cd1 + +@@ -30,18 +31,26 @@ + + extern int pty_limit; /* Config limit on Unix98 ptys */ + static DEFINE_IDR(allocated_ptys); ++#ifdef CONFIG_VE ++#define __ve_allocated_ptys(ve) (*((ve)->allocated_ptys)) ++#define ve_allocated_ptys __ve_allocated_ptys(get_exec_env()) ++#else ++#define __ve_allocated_ptys(ve) allocated_ptys ++#define ve_allocated_ptys allocated_ptys ++#endif + static DEFINE_MUTEX(allocated_ptys_lock); + ++struct devpts_config devpts_config = {.mode = 0600}; ++ ++#ifndef CONFIG_VE + static struct vfsmount *devpts_mnt; + static struct dentry *devpts_root; +- +-static struct { +- int setuid; +- int setgid; +- uid_t uid; +- gid_t gid; +- umode_t mode; +-} config = {.mode = DEVPTS_DEFAULT_MODE}; ++#define config devpts_config ++#else ++#define devpts_mnt (get_exec_env()->devpts_mnt) ++#define devpts_root (get_exec_env()->devpts_root) ++#define config (*(get_exec_env()->devpts_config)) ++#endif + + enum { + Opt_uid, Opt_gid, Opt_mode, +@@ -93,7 +102,8 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) + config.mode = option & S_IALLUGO; + break; + default: +- printk(KERN_ERR "devpts: called with bogus options\n"); ++ ve_printk(VE_LOG, KERN_ERR ++ "devpts: called with bogus options\n"); + return -EINVAL; + } + } +@@ -157,13 +167,15 @@ static int devpts_get_sb(struct file_system_type *fs_type, + return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); + } + +-static struct file_system_type devpts_fs_type = { ++struct file_system_type devpts_fs_type = { + .owner = THIS_MODULE, + .name = "devpts", + .get_sb = devpts_get_sb, + .kill_sb = kill_anon_super, + }; + ++EXPORT_SYMBOL(devpts_fs_type); ++ + /* + * The normal naming convention is simply /dev/pts/; this conforms + * to the System V naming convention +@@ -183,12 +195,12 @@ int devpts_new_index(void) + int idr_ret; + + retry: +- if (!idr_pre_get(&allocated_ptys, GFP_KERNEL)) { ++ if (!idr_pre_get(&ve_allocated_ptys, GFP_KERNEL)) { + return -ENOMEM; + } + + mutex_lock(&allocated_ptys_lock); +- idr_ret = idr_get_new(&allocated_ptys, NULL, &index); ++ idr_ret = idr_get_new(&ve_allocated_ptys, NULL, &index); + if (idr_ret < 0) { + mutex_unlock(&allocated_ptys_lock); + if (idr_ret == -EAGAIN) +@@ -197,7 +209,7 @@ retry: + } + + if (index >= pty_limit) { +- idr_remove(&allocated_ptys, index); ++ idr_remove(&ve_allocated_ptys, index); + mutex_unlock(&allocated_ptys_lock); + return -EIO; + } +@@ -208,7 +220,7 @@ retry: + void devpts_kill_index(int idx) + { + mutex_lock(&allocated_ptys_lock); +- idr_remove(&allocated_ptys, idx); ++ idr_remove(&ve_allocated_ptys, idx); + mutex_unlock(&allocated_ptys_lock); + } + +@@ -278,6 +290,17 @@ void devpts_pty_kill(int number) + mutex_unlock(&devpts_root->d_inode->i_mutex); + } + ++void prepare_tty(void) ++{ ++#ifdef CONFIG_VE ++ get_ve0()->allocated_ptys = &allocated_ptys; ++ /* ++ * in this case, tty_register_driver() setups ++ * owner_env correctly right from the bootup ++ */ ++#endif ++} ++ + static int __init init_devpts_fs(void) + { + int err = register_filesystem(&devpts_fs_type); +@@ -286,11 +309,13 @@ static int __init init_devpts_fs(void) + if (IS_ERR(devpts_mnt)) + err = PTR_ERR(devpts_mnt); + } ++ prepare_tty(); + return err; + } + 
+ static void __exit exit_devpts_fs(void) + { ++ /* the code is never called, the argument is irrelevant */ + unregister_filesystem(&devpts_fs_type); + mntput(devpts_mnt); + } +diff --git a/fs/direct-io.c b/fs/direct-io.c +index 9e81add..f30ffef 100644 +--- a/fs/direct-io.c ++++ b/fs/direct-io.c +@@ -666,7 +666,7 @@ submit_page_section(struct dio *dio, struct page *page, + /* + * Read accounting is performed in submit_bio() + */ +- task_io_account_write(len); ++ task_io_account_write(page, len, 1); + } + + /* +diff --git a/fs/dquot.c b/fs/dquot.c +index 5ac77da..6d488bd 100644 +--- a/fs/dquot.c ++++ b/fs/dquot.c +@@ -162,7 +162,9 @@ static struct quota_format_type *find_quota_format(int id) + struct quota_format_type *actqf; + + spin_lock(&dq_list_lock); +- for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next); ++ for (actqf = quota_formats; ++ actqf && (actqf->qf_fmt_id != id || actqf->qf_ops == NULL); ++ actqf = actqf->qf_next); + if (!actqf || !try_module_get(actqf->qf_owner)) { + int qm; + +diff --git a/fs/eventpoll.c b/fs/eventpoll.c +index 990c01d..4e99696 100644 +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -102,11 +103,6 @@ + + #define EP_UNACTIVE_PTR ((void *) -1L) + +-struct epoll_filefd { +- struct file *file; +- int fd; +-}; +- + /* + * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". + * It is used to keep track on all tasks that are currently inside the wake_up() code +@@ -129,79 +125,6 @@ struct poll_safewake { + spinlock_t lock; + }; + +-/* +- * Each file descriptor added to the eventpoll interface will +- * have an entry of this type linked to the "rbr" RB tree. +- */ +-struct epitem { +- /* RB tree node used to link this structure to the eventpoll RB tree */ +- struct rb_node rbn; +- +- /* List header used to link this structure to the eventpoll ready list */ +- struct list_head rdllink; +- +- /* +- * Works together "struct eventpoll"->ovflist in keeping the +- * single linked chain of items. +- */ +- struct epitem *next; +- +- /* The file descriptor information this item refers to */ +- struct epoll_filefd ffd; +- +- /* Number of active wait queue attached to poll operations */ +- int nwait; +- +- /* List containing poll wait queues */ +- struct list_head pwqlist; +- +- /* The "container" of this item */ +- struct eventpoll *ep; +- +- /* List header used to link this item to the "struct file" items list */ +- struct list_head fllink; +- +- /* The structure that describe the interested events and the source fd */ +- struct epoll_event event; +-}; +- +-/* +- * This structure is stored inside the "private_data" member of the file +- * structure and rapresent the main data sructure for the eventpoll +- * interface. +- */ +-struct eventpoll { +- /* Protect the this structure access */ +- spinlock_t lock; +- +- /* +- * This mutex is used to ensure that files are not removed +- * while epoll is using them. This is held during the event +- * collection loop, the file cleanup path, the epoll file exit +- * code and the ctl operations. 
+- */ +- struct mutex mtx; +- +- /* Wait queue used by sys_epoll_wait() */ +- wait_queue_head_t wq; +- +- /* Wait queue used by file->poll() */ +- wait_queue_head_t poll_wait; +- +- /* List of ready file descriptors */ +- struct list_head rdllist; +- +- /* RB tree root used to store monitored fd structs */ +- struct rb_root rbr; +- +- /* +- * This is a single linked list that chains all the "struct epitem" that +- * happened while transfering ready events to userspace w/out +- * holding ->lock. +- */ +- struct epitem *ovflist; +-}; +- + /* Wait structure used by the poll hooks */ + struct eppoll_entry { + /* List header used to link this structure to the "struct epitem" */ +@@ -229,7 +152,8 @@ struct ep_pqueue { + /* + * This mutex is used to serialize ep_free() and eventpoll_release_file(). + */ +-static struct mutex epmutex; ++struct mutex epmutex; ++EXPORT_SYMBOL_GPL(epmutex); + + /* Safe wake up implementation */ + static struct poll_safewake psw; +@@ -482,10 +406,11 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) + } + + /* File callbacks that implement the eventpoll file behaviour */ +-static const struct file_operations eventpoll_fops = { ++const struct file_operations eventpoll_fops = { + .release = ep_eventpoll_release, + .poll = ep_eventpoll_poll + }; ++EXPORT_SYMBOL(eventpoll_fops); + + /* Fast test to see if the file is an evenpoll file */ + static inline int is_file_epoll(struct file *f) +@@ -557,7 +482,7 @@ static int ep_alloc(struct eventpoll **pep) + * are protected by the "mtx" mutex, and ep_find() must be called with + * "mtx" held. + */ +-static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) ++struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) + { + int kcmp; + struct rb_node *rbp; +@@ -583,6 +508,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) + + return epir; + } ++EXPORT_SYMBOL_GPL(ep_find); + + /* + * This is the callback that is passed to the wait queue wakeup +@@ -695,7 +621,7 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) + /* + * Must be called with "mtx" held. 
+ */ +-static int ep_insert(struct eventpoll *ep, struct epoll_event *event, ++int ep_insert(struct eventpoll *ep, struct epoll_event *event, + struct file *tfile, int fd) + { + int error, revents, pwake = 0; +@@ -792,6 +718,7 @@ error_unregister: + error_return: + return error; + } ++EXPORT_SYMBOL(ep_insert); + + /* + * Modify the interest event mask by dropping an event if the new mask +@@ -1078,6 +1005,7 @@ error_return: + + return fd; + } ++EXPORT_SYMBOL(sys_epoll_create); + + /* + * The following function implements the controller interface for +diff --git a/fs/exec.c b/fs/exec.c +index fd92343..2f1a48a 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -56,6 +57,8 @@ + #include + #include + ++#include ++ + #ifdef CONFIG_KMOD + #include + #endif +@@ -71,6 +74,8 @@ int suid_dumpable = 0; + + /* The maximal length of core_pattern is also specified in sysctl.c */ + ++int sysctl_at_vsyscall; ++ + static LIST_HEAD(formats); + static DEFINE_RWLOCK(binfmt_lock); + +@@ -230,9 +235,13 @@ static int __bprm_mm_init(struct linux_binprm *bprm) + struct vm_area_struct *vma = NULL; + struct mm_struct *mm = bprm->mm; + +- bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); ++ if (ub_memory_charge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags, ++ NULL, UB_SOFT)) ++ goto fail_charge; ++ ++ bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL_UBC); + if (!vma) +- goto err; ++ goto fail_alloc; + + down_write(&mm->mmap_sem); + vma->vm_mm = mm; +@@ -266,7 +275,9 @@ err: + bprm->vma = NULL; + kmem_cache_free(vm_area_cachep, vma); + } +- ++fail_alloc: ++ ub_memory_uncharge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags, NULL); ++fail_charge: + return err; + } + +@@ -709,10 +720,11 @@ int kernel_read(struct file *file, unsigned long offset, + + EXPORT_SYMBOL(kernel_read); + +-static int exec_mmap(struct mm_struct *mm) ++static int exec_mmap(struct linux_binprm *bprm) + { + struct task_struct *tsk; +- struct mm_struct * old_mm, *active_mm; ++ struct mm_struct *old_mm, *active_mm, *mm; ++ int ret; + + /* Notify parent that we're no longer interested in the old VM */ + tsk = current; +@@ -734,6 +746,10 @@ static int exec_mmap(struct mm_struct *mm) + return -EINTR; + } + } ++ ++ ret = 0; ++ mm = bprm->mm; ++ mm->vps_dumpable = 1; + task_lock(tsk); + active_mm = tsk->active_mm; + tsk->mm = mm; +@@ -742,14 +758,24 @@ static int exec_mmap(struct mm_struct *mm) + task_unlock(tsk); + mm_update_next_owner(old_mm); + arch_pick_mmap_layout(mm); ++ bprm->mm = NULL; /* We're using it now */ ++ ++#ifdef CONFIG_VZ_GENCALLS ++ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXECMMAP, ++ bprm) & NOTIFY_FAIL) { ++ /* similar to binfmt_elf */ ++ send_sig(SIGKILL, current, 0); ++ ret = -ENOMEM; ++ } ++#endif + if (old_mm) { + up_read(&old_mm->mmap_sem); + BUG_ON(active_mm != old_mm); + mmput(old_mm); +- return 0; ++ return ret; + } + mmdrop(active_mm); +- return 0; ++ return ret; + } + + /* +@@ -847,6 +873,10 @@ static int de_thread(struct task_struct *tsk) + transfer_pid(leader, tsk, PIDTYPE_PGID); + transfer_pid(leader, tsk, PIDTYPE_SID); + list_replace_rcu(&leader->tasks, &tsk->tasks); ++#ifdef CONFIG_VE ++ list_replace_rcu(&leader->ve_task_info.vetask_list, ++ &tsk->ve_task_info.vetask_list); ++#endif + + tsk->group_leader = tsk; + leader->group_leader = tsk; +@@ -964,12 +994,10 @@ int flush_old_exec(struct linux_binprm * bprm) + /* + * Release all of the old mmap stuff + */ +- retval = exec_mmap(bprm->mm); ++ 
retval = exec_mmap(bprm); + if (retval) + goto out; + +- bprm->mm = NULL; /* We're using it now */ +- + /* This is the point of no return */ + current->sas_ss_sp = current->sas_ss_size = 0; + +@@ -1275,6 +1303,10 @@ int do_execve(char * filename, + struct files_struct *displaced; + int retval; + ++ retval = virtinfo_gencall(VIRTINFO_DOEXECVE, NULL); ++ if (retval) ++ return retval; ++ + retval = unshare_files(&displaced); + if (retval) + goto out_ret; +@@ -1543,7 +1575,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, + goto done; + + rcu_read_lock(); +- for_each_process(g) { ++ for_each_process_ve(g) { + if (g == tsk->group_leader) + continue; + +@@ -1677,7 +1709,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) + /* + * If another thread got here first, or we are not dumpable, bail out. + */ +- if (mm->core_waiters || !get_dumpable(mm)) { ++ if (mm->core_waiters || !get_dumpable(mm) || mm->vps_dumpable != 1) { + up_write(&mm->mmap_sem); + goto fail; + } +diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c +index 80c97fd..c03ef38 100644 +--- a/fs/ext2/namei.c ++++ b/fs/ext2/namei.c +@@ -31,6 +31,7 @@ + */ + + #include ++#include + #include "ext2.h" + #include "xattr.h" + #include "acl.h" +@@ -257,6 +258,8 @@ static int ext2_unlink(struct inode * dir, struct dentry *dentry) + struct page * page; + int err = -ENOENT; + ++ DQUOT_INIT(inode); ++ + de = ext2_find_entry (dir, dentry, &page); + if (!de) + goto out; +@@ -299,6 +302,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, + struct ext2_dir_entry_2 * old_de; + int err = -ENOENT; + ++ if (new_inode) ++ DQUOT_INIT(new_inode); ++ + old_de = ext2_find_entry (old_dir, old_dentry, &old_page); + if (!old_de) + goto out; +diff --git a/fs/ext2/super.c b/fs/ext2/super.c +index ef50cbc..dc11cf2 100644 +--- a/fs/ext2/super.c ++++ b/fs/ext2/super.c +@@ -1400,7 +1400,7 @@ static struct file_system_type ext2_fs_type = { + .name = "ext2", + .get_sb = ext2_get_sb, + .kill_sb = kill_block_super, +- .fs_flags = FS_REQUIRES_DEV, ++ .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, + }; + + static int __init init_ext2_fs(void) +diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c +index 0d0c701..d4d3c11 100644 +--- a/fs/ext3/ioctl.c ++++ b/fs/ext3/ioctl.c +@@ -87,7 +87,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, + * the relevant capability. 
+ */ + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { +- if (!capable(CAP_SYS_RESOURCE)) { ++ if (!capable(CAP_SYS_ADMIN)) { + mutex_unlock(&inode->i_mutex); + err = -EPERM; + goto flags_out; +diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c +index 0b8cf80..c39f682 100644 +--- a/fs/ext3/namei.c ++++ b/fs/ext3/namei.c +@@ -1345,7 +1345,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, + if (err) + ext3_std_error(dir->i_sb, err); + brelse(bh); +- return 0; ++ return err; + } + + /* +diff --git a/fs/ext3/super.c b/fs/ext3/super.c +index 2845425..1682ad4 100644 +--- a/fs/ext3/super.c ++++ b/fs/ext3/super.c +@@ -2903,7 +2903,7 @@ static struct file_system_type ext3_fs_type = { + .name = "ext3", + .get_sb = ext3_get_sb, + .kill_sb = kill_block_super, +- .fs_flags = FS_REQUIRES_DEV, ++ .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, + }; + + static int __init init_ext3_fs(void) +diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c +index 7a6c2f1..ec237c2 100644 +--- a/fs/ext4/ioctl.c ++++ b/fs/ext4/ioctl.c +@@ -79,7 +79,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { +- if (!capable(CAP_SYS_RESOURCE)) ++ if (!capable(CAP_SYS_ADMIN)) + goto flags_out; + } + +diff --git a/fs/fcntl.c b/fs/fcntl.c +index bfd7765..24017e2 100644 +--- a/fs/fcntl.c ++++ b/fs/fcntl.c +@@ -181,6 +181,7 @@ out_fput: + fput(file); + goto out; + } ++EXPORT_SYMBOL_GPL(sys_dup2); + + asmlinkage long sys_dup(unsigned int fildes) + { +@@ -199,6 +200,9 @@ static int setfl(int fd, struct file * filp, unsigned long arg) + struct inode * inode = filp->f_path.dentry->d_inode; + int error = 0; + ++ if (!capable(CAP_SYS_RAWIO) && !odirect_enable) ++ arg &= ~O_DIRECT; ++ + /* + * O_APPEND cannot be cleared if the file is marked as append-only + * and the file is open for write. +diff --git a/fs/file.c b/fs/file.c +index 7b3887e..aad77ec 100644 +--- a/fs/file.c ++++ b/fs/file.c +@@ -8,6 +8,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -19,6 +20,8 @@ + #include + #include + ++#include ++ + struct fdtable_defer { + spinlock_t lock; + struct work_struct wq; +@@ -40,9 +43,9 @@ static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); + static inline void * alloc_fdmem(unsigned int size) + { + if (size <= PAGE_SIZE) +- return kmalloc(size, GFP_KERNEL); ++ return kmalloc(size, GFP_KERNEL_UBC); + else +- return vmalloc(size); ++ return ub_vmalloc(size); + } + + static inline void free_fdarr(struct fdtable *fdt) +@@ -161,7 +164,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr) + if (unlikely(nr > sysctl_nr_open)) + nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; + +- fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); ++ fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_UBC); + if (!fdt) + goto out; + fdt->max_fds = nr; +@@ -196,7 +199,7 @@ out: + * Return <0 error code on error; 1 on successful completion. + * The files->file_lock should be held on entry, and will be held on exit. + */ +-static int expand_fdtable(struct files_struct *files, int nr) ++int expand_fdtable(struct files_struct *files, int nr) + __releases(files->file_lock) + __acquires(files->file_lock) + { +@@ -236,6 +239,7 @@ static int expand_fdtable(struct files_struct *files, int nr) + } + return 1; + } ++EXPORT_SYMBOL_GPL(expand_fdtable); + + /* + * Expand files. 
+diff --git a/fs/file_table.c b/fs/file_table.c +index 8308422..7cdcec5 100644 +--- a/fs/file_table.c ++++ b/fs/file_table.c +@@ -21,9 +21,14 @@ + #include + #include + #include ++#include + + #include + ++#include ++#include ++#include ++ + /* sysctl tunables... */ + struct files_stat_struct files_stat = { + .max_files = NR_FILE +@@ -37,13 +42,16 @@ static struct percpu_counter nr_files __cacheline_aligned_in_smp; + static inline void file_free_rcu(struct rcu_head *head) + { + struct file *f = container_of(head, struct file, f_u.fu_rcuhead); ++ put_ve(f->owner_env); + kmem_cache_free(filp_cachep, f); + } + + static inline void file_free(struct file *f) + { +- percpu_counter_dec(&nr_files); + file_check_state(f); ++ if (f->f_ub == get_ub0()) ++ percpu_counter_dec(&nr_files); ++ ub_file_uncharge(f); + call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); + } + +@@ -97,11 +105,14 @@ struct file *get_empty_filp(void) + struct task_struct *tsk; + static int old_max; + struct file * f; ++ int acct; + ++ acct = (get_exec_ub() == get_ub0()); + /* + * Privileged users can go above max_files + */ +- if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { ++ if (acct && get_nr_files() >= files_stat.max_files && ++ !capable(CAP_SYS_ADMIN)) { + /* + * percpu_counters are inaccurate. Do an expensive check before + * we go and fail. +@@ -114,7 +125,13 @@ struct file *get_empty_filp(void) + if (f == NULL) + goto fail; + +- percpu_counter_inc(&nr_files); ++ if (ub_file_charge(f)) ++ goto fail_ch; ++ if (acct) ++ percpu_counter_inc(&nr_files); ++ ++ f->owner_env = get_ve(get_exec_env()); ++ + if (security_file_alloc(f)) + goto fail_sec; + +@@ -141,6 +158,10 @@ fail_sec: + file_free(f); + fail: + return NULL; ++ ++fail_ch: ++ kmem_cache_free(filp_cachep, f); ++ return NULL; + } + + EXPORT_SYMBOL(get_empty_filp); +diff --git a/fs/filesystems.c b/fs/filesystems.c +index f37f872..3dca4a7 100644 +--- a/fs/filesystems.c ++++ b/fs/filesystems.c +@@ -12,6 +12,9 @@ + #include + #include + #include ++#include /* for 'current' */ ++#include ++#include + #include + + /* +@@ -21,8 +24,8 @@ + * During the unload module must call unregister_filesystem(). + * We can access the fields of list element if: + * 1) spinlock is held or +- * 2) we hold the reference to the module. +- * The latter can be guaranteed by call of try_module_get(); if it ++ * 2) we hold the reference to the element. ++ * The latter can be guaranteed by call of try_filesystem(); if it + * returned 0 we must skip the element, otherwise we got the reference. + * Once the reference is obtained we can drop the spinlock. 
+ */ +@@ -30,24 +33,46 @@ + static struct file_system_type *file_systems; + static DEFINE_RWLOCK(file_systems_lock); + ++int try_get_filesystem(struct file_system_type *fs) ++{ ++ if (try_module_get(fs->owner)) { ++ (void)get_ve(fs->owner_env); ++ return 1; ++ } ++ return 0; ++} ++ + /* WARNING: This can be used only if we _already_ own a reference */ + void get_filesystem(struct file_system_type *fs) + { ++ (void)get_ve(fs->owner_env); + __module_get(fs->owner); + } + + void put_filesystem(struct file_system_type *fs) + { + module_put(fs->owner); ++ put_ve(fs->owner_env); ++} ++ ++static inline int check_ve_fstype(struct file_system_type *p, ++ struct ve_struct *env) ++{ ++ return ((p->fs_flags & FS_VIRTUALIZED) || ++ ve_accessible_strict(p->owner_env, env)); + } + +-static struct file_system_type **find_filesystem(const char *name, unsigned len) ++static struct file_system_type **find_filesystem(const char *name, unsigned len, ++ struct ve_struct *env) + { + struct file_system_type **p; +- for (p=&file_systems; *p; p=&(*p)->next) ++ for (p=&file_systems; *p; p=&(*p)->next) { ++ if (!check_ve_fstype(*p, env)) ++ continue; + if (strlen((*p)->name) == len && + strncmp((*p)->name, name, len) == 0) + break; ++ } + return p; + } + +@@ -73,8 +98,12 @@ int register_filesystem(struct file_system_type * fs) + if (fs->next) + return -EBUSY; + INIT_LIST_HEAD(&fs->fs_supers); ++ if (fs->owner_env == NULL) ++ fs->owner_env = get_ve0(); ++ if (fs->proto == NULL) ++ fs->proto = fs; + write_lock(&file_systems_lock); +- p = find_filesystem(fs->name, strlen(fs->name)); ++ p = find_filesystem(fs->name, strlen(fs->name), fs->owner_env); + if (*p) + res = -EBUSY; + else +@@ -118,6 +147,75 @@ int unregister_filesystem(struct file_system_type * fs) + + EXPORT_SYMBOL(unregister_filesystem); + ++#ifdef CONFIG_VE ++int register_ve_fs_type(struct ve_struct *ve, struct file_system_type *template, ++ struct file_system_type **p_fs_type, struct vfsmount **p_mnt) ++{ ++ struct vfsmount *mnt; ++ struct file_system_type *local_fs_type; ++ int ret; ++ ++ local_fs_type = kzalloc(sizeof(*local_fs_type) + sizeof(void *), ++ GFP_KERNEL); ++ if (local_fs_type == NULL) ++ return -ENOMEM; ++ ++ local_fs_type->name = template->name; ++ local_fs_type->fs_flags = template->fs_flags; ++ local_fs_type->get_sb = template->get_sb; ++ local_fs_type->kill_sb = template->kill_sb; ++ local_fs_type->owner = template->owner; ++ local_fs_type->owner_env = ve; ++ local_fs_type->proto = template; ++ ++ get_filesystem(local_fs_type); /* get_ve() inside */ ++ ++ ret = register_filesystem(local_fs_type); ++ if (ret) ++ goto reg_err; ++ ++ if (p_mnt == NULL) ++ goto done; ++ ++ mnt = vfs_kern_mount(local_fs_type, 0, local_fs_type->name, NULL); ++ if (IS_ERR(mnt)) ++ goto mnt_err; ++ ++ *p_mnt = mnt; ++done: ++ *p_fs_type = local_fs_type; ++ return 0; ++ ++mnt_err: ++ ret = PTR_ERR(mnt); ++ unregister_filesystem(local_fs_type); /* does not put */ ++ ++reg_err: ++ put_filesystem(local_fs_type); ++ kfree(local_fs_type); ++ printk(KERN_DEBUG ++ "register_ve_fs_type(\"%s\") err=%d\n", template->name, ret); ++ return ret; ++} ++ ++EXPORT_SYMBOL(register_ve_fs_type); ++ ++void unregister_ve_fs_type(struct file_system_type *local_fs_type, ++ struct vfsmount *local_fs_mount) ++{ ++ if (local_fs_mount == NULL && local_fs_type == NULL) ++ return; ++ ++ unregister_filesystem(local_fs_type); ++ umount_ve_fs_type(local_fs_type); ++ if (local_fs_mount) ++ kern_umount(local_fs_mount); /* alias to mntput, drop our ref */ ++ put_filesystem(local_fs_type); ++} ++ 
++EXPORT_SYMBOL(unregister_ve_fs_type); ++#endif ++ + static int fs_index(const char __user * __name) + { + struct file_system_type * tmp; +@@ -131,11 +229,14 @@ static int fs_index(const char __user * __name) + + err = -EINVAL; + read_lock(&file_systems_lock); +- for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { ++ for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next) { ++ if (!check_ve_fstype(tmp, get_exec_env())) ++ continue; + if (strcmp(tmp->name,name) == 0) { + err = index; + break; + } ++ index++; + } + read_unlock(&file_systems_lock); + putname(name); +@@ -148,9 +249,15 @@ static int fs_name(unsigned int index, char __user * buf) + int len, res; + + read_lock(&file_systems_lock); +- for (tmp = file_systems; tmp; tmp = tmp->next, index--) +- if (index <= 0 && try_module_get(tmp->owner)) +- break; ++ for (tmp = file_systems; tmp; tmp = tmp->next) { ++ if (!check_ve_fstype(tmp, get_exec_env())) ++ continue; ++ if (!index) { ++ if (try_get_filesystem(tmp)) ++ break; ++ } else ++ index--; ++ } + read_unlock(&file_systems_lock); + if (!tmp) + return -EINVAL; +@@ -168,8 +275,9 @@ static int fs_maxindex(void) + int index; + + read_lock(&file_systems_lock); +- for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) +- ; ++ for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next) ++ if (check_ve_fstype(tmp, get_exec_env())) ++ index++; + read_unlock(&file_systems_lock); + return index; + } +@@ -205,9 +313,10 @@ int get_filesystem_list(char * buf) + read_lock(&file_systems_lock); + tmp = file_systems; + while (tmp && len < PAGE_SIZE - 80) { +- len += sprintf(buf+len, "%s\t%s\n", +- (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", +- tmp->name); ++ if (check_ve_fstype(tmp, get_exec_env())) ++ len += sprintf(buf+len, "%s\t%s\n", ++ (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", ++ tmp->name); + tmp = tmp->next; + } + read_unlock(&file_systems_lock); +@@ -221,14 +330,14 @@ struct file_system_type *get_fs_type(const char *name) + unsigned len = dot ? dot - name : strlen(name); + + read_lock(&file_systems_lock); +- fs = *(find_filesystem(name, len)); +- if (fs && !try_module_get(fs->owner)) ++ fs = *(find_filesystem(name, len, get_exec_env())); ++ if (fs && !try_get_filesystem(fs)) + fs = NULL; + read_unlock(&file_systems_lock); + if (!fs && (request_module("%.*s", len, name) == 0)) { + read_lock(&file_systems_lock); +- fs = *(find_filesystem(name, len)); +- if (fs && !try_module_get(fs->owner)) ++ fs = *(find_filesystem(name, len, get_exec_env())); ++ if (fs && !try_get_filesystem(fs)) + fs = NULL; + read_unlock(&file_systems_lock); + } +diff --git a/fs/fuse/control.c b/fs/fuse/control.c +index 4f3cab3..755be17 100644 +--- a/fs/fuse/control.c ++++ b/fs/fuse/control.c +@@ -10,6 +10,8 @@ + + #include + #include ++#include ++#include + + #define FUSE_CTL_SUPER_MAGIC 0x65735543 + +@@ -17,7 +19,11 @@ + * This is non-NULL when the single instance of the control filesystem + * exists. 
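The fuse conversion that follows uses the patch's standard trick for containerizing a file-scope global without touching any of its users: under CONFIG_VE the identifier becomes a macro resolving to a per-VE field through get_exec_env(), so every existing reference transparently selects the calling container's instance (fuse_control_sb and fuse_conn_list are both handled this way below). The pattern in isolation; my_subsys_state, struct my_state and the _my_subsys_state field are hypothetical names:

	#ifdef CONFIG_VE
	/* per-VE copy, resolved at run time via the caller's VE */
	#define my_subsys_state	(get_exec_env()->_my_subsys_state)
	#else
	/* single host-wide instance, exactly as before the patch */
	static struct my_state *my_subsys_state;
	#endif

	/* users compile unchanged in both configurations */
	static int my_subsys_active(void)
	{
		return my_subsys_state != NULL;
	}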
Protected by fuse_mutex + */ ++#ifdef CONFIG_VE ++#define fuse_control_sb (get_exec_env()->_fuse_control_sb) ++#else + static struct super_block *fuse_control_sb; ++#endif + + static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file) + { +@@ -211,12 +217,51 @@ static struct file_system_type fuse_ctl_fs_type = { + .kill_sb = fuse_ctl_kill_sb, + }; + ++#ifdef CONFIG_VE ++static int fuse_ctl_start(void *data) ++{ ++ struct ve_struct *ve; ++ ++ ve = (struct ve_struct *)data; ++ if (ve->fuse_ctl_fs_type != NULL) ++ return -EBUSY; ++ ++ return register_ve_fs_type(ve, &fuse_ctl_fs_type, ++ &ve->fuse_ctl_fs_type, NULL); ++} ++ ++static void fuse_ctl_stop(void *data) ++{ ++ struct ve_struct *ve; ++ ++ ve = (struct ve_struct *)data; ++ if (ve->fuse_ctl_fs_type == NULL) ++ return; ++ ++ unregister_ve_fs_type(ve->fuse_ctl_fs_type, NULL); ++ ve->fuse_ctl_fs_type = NULL; ++} ++ ++static struct ve_hook fuse_ctl_ve_hook = { ++ .init = fuse_ctl_start, ++ .fini = fuse_ctl_stop, ++ .owner = THIS_MODULE, ++ .priority = HOOK_PRIO_FS, ++}; ++#endif ++ + int __init fuse_ctl_init(void) + { +- return register_filesystem(&fuse_ctl_fs_type); ++ int err; ++ ++ err = register_filesystem(&fuse_ctl_fs_type); ++ if (err == 0) ++ ve_hook_register(VE_SS_CHAIN, &fuse_ctl_ve_hook); ++ return err; + } + + void fuse_ctl_cleanup(void) + { ++ ve_hook_unregister(&fuse_ctl_ve_hook); + unregister_filesystem(&fuse_ctl_fs_type); + } +diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h +index bae9486..253017e 100644 +--- a/fs/fuse/fuse_i.h ++++ b/fs/fuse/fuse_i.h +@@ -45,7 +45,11 @@ + #define FUSE_ALLOW_OTHER (1 << 1) + + /** List of active connections */ ++#ifdef CONFIG_VE ++#define fuse_conn_list (get_exec_env()->_fuse_conn_list) ++#else + extern struct list_head fuse_conn_list; ++#endif + + /** Global mutex protecting fuse_conn_list and the control filesystem */ + extern struct mutex fuse_mutex; +diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c +index 3141690..84975e4 100644 +--- a/fs/fuse/inode.c ++++ b/fs/fuse/inode.c +@@ -18,13 +18,16 @@ + #include + #include + #include ++#include + + MODULE_AUTHOR("Miklos Szeredi "); + MODULE_DESCRIPTION("Filesystem in Userspace"); + MODULE_LICENSE("GPL"); + + static struct kmem_cache *fuse_inode_cachep; ++#ifndef CONFIG_VE + struct list_head fuse_conn_list; ++#endif + DEFINE_MUTEX(fuse_mutex); + + #define FUSE_SUPER_MAGIC 0x65735546 +@@ -858,6 +861,41 @@ static void fuse_sysfs_cleanup(void) + kobject_put(fuse_kobj); + } + ++#ifdef CONFIG_VE ++static int fuse_start(void *data) ++{ ++ struct ve_struct *ve; ++ ++ ve = (struct ve_struct *)data; ++ if (ve->fuse_fs_type != NULL) ++ return -EBUSY; ++ ++ INIT_LIST_HEAD(&ve->_fuse_conn_list); ++ return register_ve_fs_type(ve, &fuse_fs_type, &ve->fuse_fs_type, NULL); ++} ++ ++static void fuse_stop(void *data) ++{ ++ struct ve_struct *ve; ++ ++ ve = (struct ve_struct *)data; ++ if (ve->fuse_fs_type == NULL) ++ return; ++ ++ unregister_ve_fs_type(ve->fuse_fs_type, NULL); ++ kfree(ve->fuse_fs_type); ++ ve->fuse_fs_type = NULL; ++ BUG_ON(!list_empty(&ve->_fuse_conn_list)); ++} ++ ++static struct ve_hook fuse_ve_hook = { ++ .init = fuse_start, ++ .fini = fuse_stop, ++ .owner = THIS_MODULE, ++ .priority = HOOK_PRIO_FS, ++}; ++#endif ++ + static int __init fuse_init(void) + { + int res; +@@ -882,6 +920,7 @@ static int __init fuse_init(void) + if (res) + goto err_sysfs_cleanup; + ++ ve_hook_register(VE_SS_CHAIN, &fuse_ve_hook); + return 0; + + err_sysfs_cleanup: +@@ -898,6 +937,7 @@ static void __exit fuse_exit(void) + { + printk(KERN_DEBUG "fuse exit\n"); 
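Both fuse hooks above follow the VE lifecycle-hook contract: .init runs when a container starts (a nonzero return aborts the start), .fini runs on container stop, and .priority orders the chains so filesystems come up and go down at HOOK_PRIO_FS. A minimal hook for a hypothetical subsystem, the mysubsys_* names being illustrative:

	static int mysubsys_ve_init(void *data)
	{
		struct ve_struct *ve = data;

		printk(KERN_DEBUG "mysubsys: VE %d starting\n", ve->veid);
		/* allocate per-VE state hanging off ve here */
		return 0;
	}

	static void mysubsys_ve_fini(void *data)
	{
		struct ve_struct *ve = data;

		printk(KERN_DEBUG "mysubsys: VE %d stopping\n", ve->veid);
		/* tear down whatever mysubsys_ve_init() set up */
	}

	static struct ve_hook mysubsys_ve_hook = {
		.init		= mysubsys_ve_init,
		.fini		= mysubsys_ve_fini,
		.owner		= THIS_MODULE,
		.priority	= HOOK_PRIO_FS,
	};

	/* on module init: ve_hook_register(VE_SS_CHAIN, &mysubsys_ve_hook);
	 * on module exit: ve_hook_unregister(&mysubsys_ve_hook); */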
+ ++ ve_hook_unregister(&fuse_ve_hook); + fuse_ctl_cleanup(); + fuse_sysfs_cleanup(); + fuse_fs_cleanup(); +diff --git a/fs/inode.c b/fs/inode.c +index c36d948..57adf85 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -8,10 +8,13 @@ + #include + #include + #include ++#include + #include + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -22,6 +25,7 @@ + #include + #include + #include ++#include + + /* + * This is needed for the following functions: +@@ -97,7 +101,8 @@ static DEFINE_MUTEX(iprune_mutex); + */ + struct inodes_stat_t inodes_stat; + +-static struct kmem_cache * inode_cachep __read_mostly; ++struct kmem_cache * inode_cachep __read_mostly; ++ + + static void wake_up_inode(struct inode *inode) + { +@@ -108,11 +113,13 @@ static void wake_up_inode(struct inode *inode) + wake_up_bit(&inode->i_state, __I_LOCK); + } + ++static struct address_space_operations vfs_empty_aops; ++struct inode_operations vfs_empty_iops; ++static struct file_operations vfs_empty_fops; ++EXPORT_SYMBOL(vfs_empty_iops); ++ + static struct inode *alloc_inode(struct super_block *sb) + { +- static const struct address_space_operations empty_aops; +- static struct inode_operations empty_iops; +- static const struct file_operations empty_fops; + struct inode *inode; + + if (sb->s_op->alloc_inode) +@@ -127,8 +134,8 @@ static struct inode *alloc_inode(struct super_block *sb) + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); +- inode->i_op = &empty_iops; +- inode->i_fop = &empty_fops; ++ inode->i_op = &vfs_empty_iops; ++ inode->i_fop = &vfs_empty_fops; + inode->i_nlink = 1; + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; +@@ -152,15 +159,15 @@ static struct inode *alloc_inode(struct super_block *sb) + } + + spin_lock_init(&inode->i_lock); +- lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); ++ lockdep_set_class(&inode->i_lock, &sb->s_type->proto->i_lock_key); + + mutex_init(&inode->i_mutex); +- lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); ++ lockdep_set_class(&inode->i_mutex, &sb->s_type->proto->i_mutex_key); + + init_rwsem(&inode->i_alloc_sem); +- lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); ++ lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->proto->i_alloc_sem_key); + +- mapping->a_ops = &empty_aops; ++ mapping->a_ops = &vfs_empty_aops; + mapping->host = inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); +@@ -310,13 +317,76 @@ static void dispose_list(struct list_head *head) + spin_unlock(&inode_lock); + } + ++static void show_header(struct inode *inode) ++{ ++ struct super_block *sb = inode->i_sb; ++ ++ printk("VFS: Busy inodes after unmount. " ++ "sb = %p, fs type = %s, sb count = %d, " ++ "sb->s_root = %s\n", sb, ++ (sb->s_type != NULL) ? sb->s_type->name : "", ++ sb->s_count, ++ (sb->s_root != NULL) ? 
++ (char *)sb->s_root->d_name.name : ""); ++} ++ ++static void show_inode(struct inode *inode) ++{ ++ struct dentry *d; ++ struct vfsmount *mnt; ++ int i; ++ ++ printk("inode = %p, inode->i_count = %d, " ++ "inode->i_nlink = %d, " ++ "inode->i_mode = %d, " ++ "inode->i_state = %ld, " ++ "inode->i_flags = %d, " ++ "inode->i_devices.next = %p, " ++ "inode->i_devices.prev = %p, " ++ "inode->i_ino = %ld\n", ++ inode, ++ atomic_read(&inode->i_count), ++ inode->i_nlink, ++ inode->i_mode, ++ inode->i_state, ++ inode->i_flags, ++ inode->i_devices.next, ++ inode->i_devices.prev, ++ inode->i_ino); ++ printk("inode dump: "); ++ for (i = 0; i < sizeof(*inode); i++) ++ printk("%2.2x ", *((u_char *)inode + i)); ++ printk("\n"); ++ list_for_each_entry(d, &inode->i_dentry, d_alias) { ++ printk(" d_alias %s d_count=%d d_flags=%x\n", ++ d->d_name.name, atomic_read(&d->d_count), d->d_flags); ++ for (i = 0; i < sizeof(*d); i++) ++ printk("%2.2x ", *((u_char *)d + i)); ++ printk("\n"); ++ } ++ ++ spin_lock(&vfsmount_lock); ++ list_for_each_entry(mnt, &get_task_mnt_ns(current)->list, mnt_list) { ++ if (mnt->mnt_sb != inode->i_sb) ++ continue; ++ printk("mnt=%p count=%d flags=%x exp_mask=%x\n", ++ mnt, atomic_read(&mnt->mnt_count), ++ mnt->mnt_flags, ++ mnt->mnt_expiry_mark); ++ for (i = 0; i < sizeof(*mnt); i++) ++ printk("%2.2x ", *((u_char *)mnt + i)); ++ printk("\n"); ++ } ++ spin_unlock(&vfsmount_lock); ++} ++ + /* + * Invalidate all inodes for a device. + */ +-static int invalidate_list(struct list_head *head, struct list_head *dispose) ++static int invalidate_list(struct list_head *head, struct list_head *dispose, int check) + { + struct list_head *next; +- int busy = 0, count = 0; ++ int busy = 0, count = 0, once = 1; + + next = head->next; + for (;;) { +@@ -343,6 +413,14 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) + continue; + } + busy = 1; ++ ++ if (check) { ++ if (once) { ++ once = 0; ++ show_header(inode); ++ } ++ show_inode(inode); ++ } + } + /* only unused inodes may be cached with i_count zero */ + inodes_stat.nr_unused -= count; +@@ -357,7 +435,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. + */ +-int invalidate_inodes(struct super_block * sb) ++int invalidate_inodes_check(struct super_block * sb, int check) + { + int busy; + LIST_HEAD(throw_away); +@@ -365,7 +443,7 @@ int invalidate_inodes(struct super_block * sb) + mutex_lock(&iprune_mutex); + spin_lock(&inode_lock); + inotify_unmount_inodes(&sb->s_inodes); +- busy = invalidate_list(&sb->s_inodes, &throw_away); ++ busy = invalidate_list(&sb->s_inodes, &throw_away, check); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); +@@ -374,7 +452,7 @@ int invalidate_inodes(struct super_block * sb) + return busy; + } + +-EXPORT_SYMBOL(invalidate_inodes); ++EXPORT_SYMBOL(invalidate_inodes_check); + + static int can_unuse(struct inode *inode) + { +@@ -464,6 +542,7 @@ static void prune_icache(int nr_to_scan) + */ + static int shrink_icache_memory(int nr, gfp_t gfp_mask) + { ++ KSTAT_PERF_ENTER(shrink_icache) + if (nr) { + /* + * Nasty deadlock avoidance. 
We may hold various FS locks, +@@ -474,6 +553,7 @@ static int shrink_icache_memory(int nr, gfp_t gfp_mask) + return -1; + prune_icache(nr); + } ++ KSTAT_PERF_LEAVE(shrink_icache) + return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + } + +@@ -583,7 +663,7 @@ void unlock_new_inode(struct inode *inode) + */ + mutex_destroy(&inode->i_mutex); + mutex_init(&inode->i_mutex); +- lockdep_set_class(&inode->i_mutex, &type->i_mutex_dir_key); ++ lockdep_set_class(&inode->i_mutex, &type->proto->i_mutex_dir_key); + } + #endif + /* +diff --git a/fs/inotify.c b/fs/inotify.c +index 690e725..01ddb06 100644 +--- a/fs/inotify.c ++++ b/fs/inotify.c +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + + static atomic_t inotify_cookie; + +@@ -69,19 +70,6 @@ static atomic_t inotify_cookie; + * inotify_add_watch() to the final put_inotify_watch(). + */ + +-/* +- * struct inotify_handle - represents an inotify instance +- * +- * This structure is protected by the mutex 'mutex'. +- */ +-struct inotify_handle { +- struct idr idr; /* idr mapping wd -> watch */ +- struct mutex mutex; /* protects this bad boy */ +- struct list_head watches; /* list of watches */ +- atomic_t count; /* reference count */ +- u32 last_wd; /* the last wd allocated */ +- const struct inotify_operations *in_ops; /* inotify caller operations */ +-}; + + static inline void get_inotify_handle(struct inotify_handle *ih) + { +@@ -118,6 +106,9 @@ void put_inotify_watch(struct inotify_watch *watch) + struct inotify_handle *ih = watch->ih; + + iput(watch->inode); ++ path_put(&watch->path); ++ watch->path.dentry = NULL; ++ watch->path.mnt = NULL; + ih->in_ops->destroy_watch(watch); + put_inotify_handle(ih); + } +@@ -476,6 +467,8 @@ void inotify_init_watch(struct inotify_watch *watch) + INIT_LIST_HEAD(&watch->i_list); + atomic_set(&watch->count, 0); + get_inotify_watch(watch); /* initial get */ ++ watch->path.dentry = NULL; ++ watch->path.mnt = NULL; + } + EXPORT_SYMBOL_GPL(inotify_init_watch); + +@@ -616,8 +609,8 @@ EXPORT_SYMBOL_GPL(inotify_find_update_watch); + * Caller must ensure it only calls inotify_add_watch() once per watch. + * Calls inotify_handle_get_wd() so may sleep. + */ +-s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch, +- struct inode *inode, u32 mask) ++s32 __inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch, ++ struct path *path, struct inode * inode, u32 mask) + { + int ret = 0; + int newly_watched; +@@ -645,6 +638,10 @@ s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch, + * Save a reference to the inode and bump the ref count to make it + * official. We hold a reference to nameidata, which makes this safe. 
+ */ ++ if (path) { ++ path_get(path); ++ watch->path = *path; ++ } + watch->inode = igrab(inode); + + /* Add the watch to the handle's and the inode's list */ +@@ -666,6 +663,18 @@ out: + } + EXPORT_SYMBOL_GPL(inotify_add_watch); + ++s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch, ++ struct inode *inode, u32 mask) ++{ ++ return __inotify_add_watch(ih, watch, NULL, inode, mask); ++} ++ ++s32 inotify_add_watch_dget(struct inotify_handle *ih, ++ struct inotify_watch *watch, struct path *p, u32 mask) ++{ ++ return __inotify_add_watch(ih, watch, p, p->dentry->d_inode, mask); ++} ++ + /** + * inotify_clone_watch - put the watch next to existing one + * @old: already installed watch +diff --git a/fs/inotify_user.c b/fs/inotify_user.c +index 6676c06..dd51c6d 100644 +--- a/fs/inotify_user.c ++++ b/fs/inotify_user.c +@@ -20,6 +20,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -66,47 +67,6 @@ static int inotify_max_queued_events __read_mostly; + * first event, or to inotify_destroy(). + */ + +-/* +- * struct inotify_device - represents an inotify instance +- * +- * This structure is protected by the mutex 'mutex'. +- */ +-struct inotify_device { +- wait_queue_head_t wq; /* wait queue for i/o */ +- struct mutex ev_mutex; /* protects event queue */ +- struct mutex up_mutex; /* synchronizes watch updates */ +- struct list_head events; /* list of queued events */ +- atomic_t count; /* reference count */ +- struct user_struct *user; /* user who opened this dev */ +- struct inotify_handle *ih; /* inotify handle */ +- struct fasync_struct *fa; /* async notification */ +- unsigned int queue_size; /* size of the queue (bytes) */ +- unsigned int event_count; /* number of pending events */ +- unsigned int max_events; /* maximum number of events */ +-}; +- +-/* +- * struct inotify_kernel_event - An inotify event, originating from a watch and +- * queued for user-space. A list of these is attached to each instance of the +- * device. In read(), this list is walked and all events that can fit in the +- * buffer are returned. +- * +- * Protected by dev->ev_mutex of the device in which we are queued. +- */ +-struct inotify_kernel_event { +- struct inotify_event event; /* the user-space event */ +- struct list_head list; /* entry in inotify_device's list */ +- char *name; /* filename, if any */ +-}; +- +-/* +- * struct inotify_user_watch - our version of an inotify_watch, we add +- * a reference to the associated inotify_device. +- */ +-struct inotify_user_watch { +- struct inotify_device *dev; /* associated device */ +- struct inotify_watch wdata; /* inotify watch data */ +-}; + + #ifdef CONFIG_SYSCTL + +@@ -376,8 +336,7 @@ static int find_inode(const char __user *dirname, struct nameidata *nd, + * + * Callers must hold dev->up_mutex. 
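With the hunks above a watch pins the full struct path rather than the bare inode, which is what a checkpointer needs to re-resolve the watched object later; inotify_add_watch_dget() is the variant taking the path and deriving the inode itself, and put_inotify_watch() drops the path reference symmetrically. A sketch of an in-kernel caller, assuming it holds dev->up_mutex as the comment just above requires; resolving a kernel string with path_lookup() is illustrative, not taken from the patch:

	struct nameidata nd;
	int wd, err;

	err = path_lookup("/var/log/messages", LOOKUP_FOLLOW, &nd);
	if (err)
		return err;
	/* inotify_create_watch() takes its own path reference */
	wd = inotify_create_watch(dev, &nd.path, IN_MODIFY);
	path_put(&nd.path);
	return wd < 0 ? wd : 0;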
+ */ +-static int create_watch(struct inotify_device *dev, struct inode *inode, +- u32 mask) ++int inotify_create_watch(struct inotify_device *dev, struct path *p, u32 mask) + { + struct inotify_user_watch *watch; + int ret; +@@ -397,12 +356,13 @@ static int create_watch(struct inotify_device *dev, struct inode *inode, + atomic_inc(&dev->user->inotify_watches); + + inotify_init_watch(&watch->wdata); +- ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask); ++ ret = inotify_add_watch_dget(dev->ih, &watch->wdata, p, mask); + if (ret < 0) + free_inotify_user_watch(&watch->wdata); + + return ret; + } ++EXPORT_SYMBOL(inotify_create_watch); + + /* Device Interface */ + +@@ -552,7 +512,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, + return ret; + } + +-static const struct file_operations inotify_fops = { ++const struct file_operations inotify_fops = { + .poll = inotify_poll, + .read = inotify_read, + .fasync = inotify_fasync, +@@ -560,6 +520,7 @@ static const struct file_operations inotify_fops = { + .unlocked_ioctl = inotify_ioctl, + .compat_ioctl = inotify_ioctl, + }; ++EXPORT_SYMBOL(inotify_fops); + + static const struct inotify_operations inotify_user_ops = { + .handle_event = inotify_dev_queue_event, +@@ -637,6 +598,7 @@ out_put_fd: + put_unused_fd(fd); + return ret; + } ++EXPORT_SYMBOL(sys_inotify_init); + + asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) + { +@@ -673,7 +635,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) + mutex_lock(&dev->up_mutex); + ret = inotify_find_update_watch(dev->ih, inode, mask); + if (ret == -ENOENT) +- ret = create_watch(dev, inode, mask); ++ ret = inotify_create_watch(dev, &nd.path, mask); + mutex_unlock(&dev->up_mutex); + + path_put(&nd.path); +diff --git a/fs/ioprio.c b/fs/ioprio.c +index c4a1c3c..08b7f78 100644 +--- a/fs/ioprio.c ++++ b/fs/ioprio.c +@@ -26,6 +26,8 @@ + #include + #include + #include ++#include ++#include + + static int set_task_ioprio(struct task_struct *task, int ioprio) + { +@@ -71,8 +73,11 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) + int data = IOPRIO_PRIO_DATA(ioprio); + struct task_struct *p, *g; + struct user_struct *user; +- struct pid *pgrp; + int ret; ++ struct pid *pgrp; ++ ++ if (!ve_is_super(get_exec_env())) ++ return -EPERM; + + switch (class) { + case IOPRIO_CLASS_RT: +@@ -130,17 +135,23 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) + if (!user) + break; + +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + if (p->uid != who) + continue; + ret = set_task_ioprio(p, ioprio); + if (ret) + goto free_uid; +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + free_uid: + if (who) + free_uid(user); + break; ++ case IOPRIO_WHO_UBC: ++ if (class != IOPRIO_CLASS_BE) ++ return -ERANGE; ++ ++ ret = bc_set_ioprio(who, data); ++ break; + default: + ret = -EINVAL; + } +@@ -185,9 +196,9 @@ asmlinkage long sys_ioprio_get(int which, int who) + { + struct task_struct *g, *p; + struct user_struct *user; +- struct pid *pgrp; + int ret = -ESRCH; + int tmpio; ++ struct pid *pgrp; + + read_lock(&tasklist_lock); + switch (which) { +@@ -223,7 +234,7 @@ asmlinkage long sys_ioprio_get(int which, int who) + if (!user) + break; + +- do_each_thread(g, p) { ++ do_each_thread_ve(g, p) { + if (p->uid != user->uid) + continue; + tmpio = get_task_ioprio(p); +@@ -233,7 +244,7 @@ asmlinkage long sys_ioprio_get(int which, int who) + ret = tmpio; + else + ret = ioprio_best(ret, tmpio); +- } 
while_each_thread(g, p); ++ } while_each_thread_ve(g, p); + + if (who) + free_uid(user); +diff --git a/fs/locks.c b/fs/locks.c +index dce8c74..4a37766 100644 +--- a/fs/locks.c ++++ b/fs/locks.c +@@ -130,6 +130,8 @@ + + #include + ++#include ++ + #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) + #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) + #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) +@@ -146,9 +148,25 @@ static LIST_HEAD(blocked_list); + static struct kmem_cache *filelock_cache __read_mostly; + + /* Allocate an empty lock structure. */ +-static struct file_lock *locks_alloc_lock(void) ++static struct file_lock *locks_alloc_lock(int charge) + { +- return kmem_cache_alloc(filelock_cache, GFP_KERNEL); ++ struct file_lock *fl; ++ ++ fl = kmem_cache_alloc(filelock_cache, GFP_KERNEL); ++#ifdef CONFIG_BEANCOUNTERS ++ if (fl == NULL) ++ goto out; ++ fl->fl_charged = 0; ++ if (!charge) ++ goto out; ++ if (!ub_flock_charge(fl, 1)) ++ goto out; ++ ++ kmem_cache_free(filelock_cache, fl); ++ fl = NULL; ++out: ++#endif ++ return fl; + } + + static void locks_release_private(struct file_lock *fl) +@@ -173,6 +191,7 @@ static void locks_free_lock(struct file_lock *fl) + BUG_ON(!list_empty(&fl->fl_block)); + BUG_ON(!list_empty(&fl->fl_link)); + ++ ub_flock_uncharge(fl); + locks_release_private(fl); + kmem_cache_free(filelock_cache, fl); + } +@@ -276,7 +295,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock, + if (type < 0) + return type; + +- fl = locks_alloc_lock(); ++ fl = locks_alloc_lock(type != F_UNLCK); + if (fl == NULL) + return -ENOMEM; + +@@ -463,7 +482,7 @@ static int lease_init(struct file *filp, int type, struct file_lock *fl) + /* Allocate a file_lock initialised to this type of lease */ + static struct file_lock *lease_alloc(struct file *filp, int type) + { +- struct file_lock *fl = locks_alloc_lock(); ++ struct file_lock *fl = locks_alloc_lock(1); + int error = -ENOMEM; + + if (fl == NULL) +@@ -734,8 +753,13 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) + goto find_conflict; + + if (request->fl_type != F_UNLCK) { ++ /* ++ * Nont F_UNLCK request must be already charged in ++ * flock_make_lock(). Actually new_fl must be charged not the ++ * request, but we try to fail earlier. ++ */ + error = -ENOMEM; +- new_fl = locks_alloc_lock(); ++ new_fl = locks_alloc_lock(0); + if (new_fl == NULL) + goto out; + error = 0; +@@ -785,6 +809,10 @@ find_conflict: + } + if (request->fl_flags & FL_ACCESS) + goto out; ++ ++ set_flock_charged(new_fl); ++ unset_flock_charged(request); ++ + locks_copy_lock(new_fl, request); + locks_insert_lock(before, new_fl); + new_fl = NULL; +@@ -816,8 +844,11 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str + if (!(request->fl_flags & FL_ACCESS) && + (request->fl_type != F_UNLCK || + request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { +- new_fl = locks_alloc_lock(); +- new_fl2 = locks_alloc_lock(); ++ if (request->fl_type != F_UNLCK) ++ new_fl = locks_alloc_lock(1); ++ else ++ new_fl = NULL; ++ new_fl2 = locks_alloc_lock(0); + } + + lock_kernel(); +@@ -951,7 +982,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str + * bail out. 
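The locks.c rule above: only requests that can instantiate a lock are charged against the beancounter; every F_UNLCK path calls locks_alloc_lock(0), because an unlock must never fail for lack of resources. When a pre-charged new_fl replaces the caller's request, the charge is handed over with set_flock_charged()/unset_flock_charged() instead of being uncharged and re-charged. Condensed, the allocation-side decision looks like this (illustrative, mirroring flock_make_lock() above):

	/* charge the beancounter only if a new lock may be created */
	fl = locks_alloc_lock(request->fl_type != F_UNLCK);
	if (fl == NULL)
		return -ENOMEM;	/* slab failure, or the bc limit was hit */

	/* ... conflict scan elided ... */

	/* grant path: move the charge onto the lock that stays queued */
	set_flock_charged(new_fl);
	unset_flock_charged(request);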
+ */ + error = -ENOLCK; /* "no luck" */ +- if (right && left == right && !new_fl2) ++ if (right && left == right && !(request->fl_type == F_UNLCK || new_fl2)) + goto out; + + error = 0; +@@ -962,23 +993,32 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str + goto out; + } + +- if (!new_fl) { +- error = -ENOLCK; ++ error = -ENOLCK; ++ if (!new_fl) ++ goto out; ++ if (right && (left == right) && ub_flock_charge(new_fl, 1)) + goto out; +- } + locks_copy_lock(new_fl, request); + locks_insert_lock(before, new_fl); + new_fl = NULL; ++ error = 0; + } + if (right) { + if (left == right) { + /* The new lock breaks the old one in two pieces, + * so we have to use the second new lock. + */ ++ error = -ENOLCK; ++ if (added && ub_flock_charge(new_fl2, ++ request->fl_type != F_UNLCK)) ++ goto out; ++ /* FIXME move all fl_charged manipulations in ub code */ ++ set_flock_charged(new_fl2); + left = new_fl2; + new_fl2 = NULL; + locks_copy_lock(left, right); + locks_insert_lock(before, left); ++ error = 0; + } + right->fl_start = request->fl_end + 1; + locks_wake_up_blocks(right); +@@ -1365,7 +1405,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) + + if (arg != F_UNLCK) { + error = -ENOMEM; +- new_fl = locks_alloc_lock(); ++ new_fl = locks_alloc_lock(1); + if (new_fl == NULL) + goto out; + +@@ -1608,6 +1648,7 @@ asmlinkage long sys_flock(unsigned int fd, unsigned int cmd) + out: + return error; + } ++EXPORT_SYMBOL_GPL(sys_flock); + + /** + * vfs_test_lock - test file byte range lock +@@ -1744,7 +1785,7 @@ EXPORT_SYMBOL_GPL(vfs_lock_file); + int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, + struct flock __user *l) + { +- struct file_lock *file_lock = locks_alloc_lock(); ++ struct file_lock *file_lock = locks_alloc_lock(0); + struct flock flock; + struct inode *inode; + struct file *f; +@@ -1881,7 +1922,7 @@ out: + int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, + struct flock64 __user *l) + { +- struct file_lock *file_lock = locks_alloc_lock(); ++ struct file_lock *file_lock = locks_alloc_lock(0); + struct flock64 flock; + struct inode *inode; + struct file *f; +@@ -2170,6 +2211,8 @@ static int locks_show(struct seq_file *f, void *v) + struct file_lock *fl, *bfl; + + fl = list_entry(v, struct file_lock, fl_link); ++ if (!ve_accessible(fl->fl_file->owner_env, get_exec_env())) ++ goto out; + + lock_get_status(f, fl, (long)f->private, ""); + +@@ -2177,6 +2220,7 @@ static int locks_show(struct seq_file *f, void *v) + lock_get_status(f, bfl, (long)f->private, " ->"); + + f->private++; ++out: + return 0; + } + +@@ -2286,7 +2330,7 @@ EXPORT_SYMBOL(lock_may_write); + static int __init filelock_init(void) + { + filelock_cache = kmem_cache_create("file_lock_cache", +- sizeof(struct file_lock), 0, SLAB_PANIC, ++ sizeof(struct file_lock), 0, SLAB_PANIC|SLAB_UBC, + init_once); + return 0; + } +diff --git a/fs/namei.c b/fs/namei.c +index 01e67dd..4092158 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -142,6 +142,7 @@ char * getname(const char __user * filename) + { + char *tmp, *result; + ++ /*ub_dentry_checkup();*/ + result = ERR_PTR(-ENOMEM); + tmp = __getname(); + if (tmp) { +@@ -443,6 +444,21 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, + if (!dentry) + dentry = d_lookup(parent, name); + ++ /* ++ * The revalidation rules are simple: ++ * d_revalidate operation is called when we're about to use a cached ++ * dentry rather than call d_lookup. 
++ * d_revalidate method may unhash the dentry itself or return FALSE, in ++ * which case if the dentry can be released d_lookup will be called. ++ * ++ * Additionally, by request of NFS people ++ * (http://linux.bkbits.net:8080/linux-2.4/cset@1.181?nav=index.html|src/|src/fs|related/fs/namei.c) ++ * d_revalidate is called when `/', `.' or `..' are looked up. ++ * Since re-lookup is impossible on them, we introduce a hack and ++ * return an error in this case. ++ * ++ * 2003/02/19 SAW ++ */ + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) + dentry = do_revalidate(dentry, nd); + +@@ -502,6 +518,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s + struct dentry * result; + struct inode *dir = parent->d_inode; + ++repeat: + mutex_lock(&dir->i_mutex); + /* + * First re-do the cached lookup just in case it was created +@@ -540,7 +557,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s + if (result->d_op && result->d_op->d_revalidate) { + result = do_revalidate(result, nd); + if (!result) +- result = ERR_PTR(-ENOENT); ++ goto repeat; + } + return result; + } +@@ -794,6 +811,13 @@ static __always_inline void follow_dotdot(struct nameidata *nd) + read_unlock(&fs->lock); + break; + } ++#ifdef CONFIG_VE ++ if (nd->path.dentry == get_exec_env()->root_path.dentry && ++ nd->path.mnt == get_exec_env()->root_path.mnt) { ++ read_unlock(¤t->fs->lock); ++ break; ++ } ++#endif + read_unlock(&fs->lock); + spin_lock(&dcache_lock); + if (nd->path.dentry != nd->path.mnt->mnt_root) { +@@ -835,6 +859,10 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, + if (dentry->d_op && dentry->d_op->d_revalidate) + goto need_revalidate; + done: ++ if ((nd->flags & LOOKUP_STRICT) && d_mountpoint(dentry)) { ++ dput(dentry); ++ return -ENOENT; ++ } + path->mnt = mnt; + path->dentry = dentry; + __follow_mount(path); +@@ -872,6 +900,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd) + struct inode *inode; + int err; + unsigned int lookup_flags = nd->flags; ++ int real_components = 0; + + while (*name=='/') + name++; +@@ -942,6 +971,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd) + break; + } + /* This does the actual lookups.. */ ++ real_components++; + err = do_lookup(nd, &this, &next); + if (err) + break; +@@ -955,6 +985,9 @@ static int __link_path_walk(const char *name, struct nameidata *nd) + goto out_dput; + + if (inode->i_op->follow_link) { ++ err = -ENOENT; ++ if (lookup_flags & LOOKUP_STRICT) ++ goto out_dput; + err = do_follow_link(&next, nd); + if (err) + goto return_err; +@@ -1003,6 +1036,7 @@ last_component: + break; + inode = next.dentry->d_inode; + if ((lookup_flags & LOOKUP_FOLLOW) ++ && !(lookup_flags & LOOKUP_STRICT) + && inode && inode->i_op && inode->i_op->follow_link) { + err = do_follow_link(&next, nd); + if (err) +@@ -1024,27 +1058,41 @@ lookup_parent: + nd->last_type = LAST_NORM; + if (this.name[0] != '.') + goto return_base; +- if (this.len == 1) ++ if (this.len == 1) { + nd->last_type = LAST_DOT; +- else if (this.len == 2 && this.name[1] == '.') ++ goto return_reval; ++ } else if (this.len == 2 && this.name[1] == '.') { + nd->last_type = LAST_DOTDOT; +- else +- goto return_base; ++ goto return_reval; ++ } ++return_base: ++ if (!(nd->flags & LOOKUP_NOAREACHECK)) { ++ err = check_area_access_ve(&nd->path); ++ if (err) ++ break; ++ } ++ return 0; + return_reval: + /* + * We bypassed the ordinary revalidation routines. 
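Two VE-specific twists sit in the lookup code around here: follow_dotdot() gains a second stop condition, halting ".." traversal at the container's root_path so a VE task can never climb above its root independently of fs->root, and the new LOOKUP_STRICT flag makes resolution fail with -ENOENT instead of crossing a mountpoint or following a symlink. The flag's consumer is not shown in this hunk (checkpointing is a plausible guess, but that is an assumption); a hypothetical strict resolution would read:

	struct nameidata nd;
	int err;

	/* fails if any component is a mountpoint or a symlink */
	err = path_lookup(name, LOOKUP_STRICT, &nd);
	if (err)
		return err;
	/* nd.path is guaranteed to lie on the starting vfsmount */
	path_put(&nd.path);
	return 0;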
+ * We may need to check the cached dentry for staleness. + */ +- if (nd->path.dentry && nd->path.dentry->d_sb && ++ if (!real_components && nd->path.dentry && nd->path.dentry->d_sb && + (nd->path.dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { + err = -ESTALE; + /* Note: we do not d_invalidate() */ + if (!nd->path.dentry->d_op->d_revalidate( + nd->path.dentry, nd)) ++ /* ++ * This lookup is for `/' or `.' or `..'. ++ * The filesystem unhashed the dentry itself ++ * inside d_revalidate (otherwise, d_invalidate ++ * wouldn't succeed). As a special courtesy to ++ * NFS we return an error. 2003/02/19 SAW ++ */ + break; + } +-return_base: +- return 0; ++ goto return_base; + out_dput: + path_put_conditional(&next, nd); + break; +@@ -2126,6 +2174,7 @@ asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev) + { + return sys_mknodat(AT_FDCWD, filename, mode, dev); + } ++EXPORT_SYMBOL_GPL(sys_mknod); + + int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) + { +@@ -2191,6 +2240,7 @@ asmlinkage long sys_mkdir(const char __user *pathname, int mode) + { + return sys_mkdirat(AT_FDCWD, pathname, mode); + } ++EXPORT_SYMBOL_GPL(sys_mkdir); + + /* + * We try to drop the dentry early: we should have +@@ -2218,6 +2268,7 @@ void dentry_unhash(struct dentry *dentry) + spin_unlock(&dentry->d_lock); + spin_unlock(&dcache_lock); + } ++EXPORT_SYMBOL(sys_symlink); + + int vfs_rmdir(struct inode *dir, struct dentry *dentry) + { +@@ -2303,6 +2354,7 @@ asmlinkage long sys_rmdir(const char __user *pathname) + { + return do_rmdir(AT_FDCWD, pathname); + } ++EXPORT_SYMBOL_GPL(sys_rmdir); + + int vfs_unlink(struct inode *dir, struct dentry *dentry) + { +@@ -2407,6 +2459,7 @@ asmlinkage long sys_unlink(const char __user *pathname) + { + return do_unlinkat(AT_FDCWD, pathname); + } ++EXPORT_SYMBOL_GPL(sys_unlink); + + int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode) + { +@@ -2577,6 +2630,7 @@ asmlinkage long sys_link(const char __user *oldname, const char __user *newname) + { + return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); + } ++EXPORT_SYMBOL(sys_rename); + + /* + * The worst of all namespace operations - renaming directory. 
"Perverted" +@@ -2688,6 +2742,9 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); + const char *old_name; + ++ if (DQUOT_RENAME(old_dentry->d_inode, old_dir, new_dir)) ++ return -EXDEV; ++ + if (old_dentry->d_inode == new_dentry->d_inode) + return 0; + +diff --git a/fs/namespace.c b/fs/namespace.c +index 4fc302c..6873efd 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -37,6 +37,7 @@ + + /* spinlock for vfsmount related operations, inplace of dcache_lock */ + __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); ++EXPORT_SYMBOL(vfsmount_lock); + + static int event; + static DEFINE_IDA(mnt_id_ida); +@@ -44,7 +45,8 @@ static DEFINE_IDA(mnt_group_ida); + + static struct list_head *mount_hashtable __read_mostly; + static struct kmem_cache *mnt_cache __read_mostly; +-static struct rw_semaphore namespace_sem; ++struct rw_semaphore namespace_sem; ++EXPORT_SYMBOL_GPL(namespace_sem); + + /* /sys/fs */ + struct kobject *fs_kobj; +@@ -117,6 +119,7 @@ struct vfsmount *alloc_vfsmnt(const char *name) + return NULL; + } + ++ mnt->owner = VEID(get_exec_env()); + atomic_set(&mnt->mnt_count, 1); + INIT_LIST_HEAD(&mnt->mnt_hash); + INIT_LIST_HEAD(&mnt->mnt_child); +@@ -129,7 +132,7 @@ struct vfsmount *alloc_vfsmnt(const char *name) + atomic_set(&mnt->__mnt_writers, 0); + if (name) { + int size = strlen(name) + 1; +- char *newname = kmalloc(size, GFP_KERNEL); ++ char *newname = kmalloc(size, GFP_KERNEL_UBC); + if (newname) { + memcpy(newname, name, size); + mnt->mnt_devname = newname; +@@ -794,15 +797,48 @@ static void show_type(struct seq_file *m, struct super_block *sb) + } + } + ++static int prepare_mnt_root_mangle(struct path *path, ++ char **path_buf, char **ret_path) ++{ ++ /* skip FS_NOMOUNT mounts (rootfs) */ ++ if (path->mnt->mnt_sb->s_flags & MS_NOUSER) ++ return -EACCES; ++ ++ *path_buf = (char *)__get_free_page(GFP_KERNEL); ++ if (!*path_buf) ++ return -ENOMEM; ++ ++ *ret_path = d_path(path, *path_buf, PAGE_SIZE); ++ if (IS_ERR(*ret_path)) { ++ free_page((unsigned long)*path_buf); ++ /* ++ * This means that the file position will be incremented, i.e. ++ * the total number of "invisible" vfsmnt will leak. ++ */ ++ return -EACCES; ++ } ++ return 0; ++} ++ + static int show_vfsmnt(struct seq_file *m, void *v) + { + struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); +- int err = 0; ++ int err; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; ++ char *path_buf, *path; + +- mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); ++ err = prepare_mnt_root_mangle(&mnt_path, &path_buf, &path); ++ if (err < 0) ++ return (err == -EACCES ? 0 : err); ++ ++ if (ve_is_super(get_exec_env()) || ++ !(mnt->mnt_sb->s_type->fs_flags & FS_MANGLE_PROC)) ++ mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); ++ else ++ mangle(m, mnt->mnt_sb->s_type->name); + seq_putc(m, ' '); +- seq_path(m, &mnt_path, " \t\n\\"); ++ mangle(m, path); ++ free_page((unsigned long) path_buf); + seq_putc(m, ' '); + show_type(m, mnt->mnt_sb); + seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); +@@ -883,18 +919,27 @@ static int show_vfsstat(struct seq_file *m, void *v) + { + struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; +- int err = 0; ++ char *path_buf, *path; ++ int err; ++ ++ err = prepare_mnt_root_mangle(&mnt_path, &path_buf, &path); ++ if (err < 0) ++ return (err == -EACCES ? 
0 : err); + + /* device */ + if (mnt->mnt_devname) { + seq_puts(m, "device "); +- mangle(m, mnt->mnt_devname); ++ if (ve_is_super(get_exec_env())) ++ mangle(m, mnt->mnt_devname); ++ else ++ mangle(m, mnt->mnt_sb->s_type->name); + } else + seq_puts(m, "no device"); + + /* mount point */ + seq_puts(m, " mounted on "); +- seq_path(m, &mnt_path, " \t\n\\"); ++ mangle(m, path); ++ free_page((unsigned long)path_buf); + seq_putc(m, ' '); + + /* file system type */ +@@ -1111,6 +1156,34 @@ static int do_umount(struct vfsmount *mnt, int flags) + return retval; + } + ++#ifdef CONFIG_VE ++void umount_ve_fs_type(struct file_system_type *local_fs_type) ++{ ++ struct vfsmount *mnt; ++ struct list_head *p, *q; ++ LIST_HEAD(kill); ++ LIST_HEAD(umount_list); ++ ++ down_write(&namespace_sem); ++ spin_lock(&vfsmount_lock); ++ list_for_each_safe(p, q, ¤t->nsproxy->mnt_ns->list) { ++ mnt = list_entry(p, struct vfsmount, mnt_list); ++ if (mnt->mnt_sb->s_type != local_fs_type) ++ continue; ++ list_del(p); ++ list_add(p, &kill); ++ } ++ ++ while (!list_empty(&kill)) { ++ mnt = list_entry(kill.next, struct vfsmount, mnt_list); ++ umount_tree(mnt, 1, &umount_list); ++ } ++ spin_unlock(&vfsmount_lock); ++ up_write(&namespace_sem); ++ release_mounts(&umount_list); ++} ++#endif ++ + /* + * Now umount can handle mount points as well as block devices. + * This is important for filesystems which use unnamed block devices. +@@ -1134,7 +1207,7 @@ asmlinkage long sys_umount(char __user * name, int flags) + goto dput_and_out; + + retval = -EPERM; +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + goto dput_and_out; + + retval = do_umount(nd.path.mnt, flags); +@@ -1160,7 +1233,7 @@ asmlinkage long sys_oldumount(char __user * name) + + static int mount_is_safe(struct nameidata *nd) + { +- if (capable(CAP_SYS_ADMIN)) ++ if (capable(CAP_VE_SYS_ADMIN)) + return 0; + return -EPERM; + #ifdef notyet +@@ -1430,6 +1503,8 @@ static noinline int do_change_type(struct nameidata *nd, int flag) + + if (nd->path.dentry != nd->path.mnt->mnt_root) + return -EINVAL; ++ if (!ve_accessible_veid(nd->path.mnt->owner, get_exec_env()->veid)) ++ return -EPERM; + + down_write(&namespace_sem); + if (type == MS_SHARED) { +@@ -1453,7 +1528,7 @@ static noinline int do_change_type(struct nameidata *nd, int flag) + * noinline this do_mount helper to save do_mount stack space. 
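Throughout namespace.c the patch demotes CAP_SYS_ADMIN checks to CAP_VE_SYS_ADMIN, which a container's root carries, so mount(2) and umount(2) keep working inside a VE; actual isolation is then enforced per mount through the owner field set in alloc_vfsmnt(). The combined guard a VE-aware mount operation performs, condensed from do_remount() and do_change_type() in this file:

	if (!capable(CAP_VE_SYS_ADMIN))
		return -EPERM;	/* no admin rights even inside the VE */
	if (!ve_accessible_veid(nd->path.mnt->owner,
				get_exec_env()->veid))
		return -EPERM;	/* the mount belongs to another VE */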
+ */ + static noinline int do_loopback(struct nameidata *nd, char *old_name, +- int recurse) ++ int recurse, int mnt_flags) + { + struct nameidata old_nd; + struct vfsmount *mnt = NULL; +@@ -1483,6 +1558,7 @@ static noinline int do_loopback(struct nameidata *nd, char *old_name, + if (!mnt) + goto out; + ++ mnt->mnt_flags |= mnt_flags; + err = graft_tree(mnt, &nd->path); + if (err) { + LIST_HEAD(umount_list); +@@ -1527,7 +1603,7 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags, + int err; + struct super_block *sb = nd->path.mnt->mnt_sb; + +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + + if (!check_mnt(nd->path.mnt)) +@@ -1536,6 +1612,9 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags, + if (nd->path.dentry != nd->path.mnt->mnt_root) + return -EINVAL; + ++ if (!ve_accessible_veid(nd->path.mnt->owner, get_exec_env()->veid)) ++ return -EPERM; ++ + down_write(&sb->s_umount); + if (flags & MS_BIND) + err = change_mount_flags(nd->path.mnt, flags); +@@ -1568,7 +1647,7 @@ static noinline int do_move_mount(struct nameidata *nd, char *old_name) + struct path parent_path; + struct vfsmount *p; + int err = 0; +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + if (!old_name || !*old_name) + return -EINVAL; +@@ -1576,6 +1655,10 @@ static noinline int do_move_mount(struct nameidata *nd, char *old_name) + if (err) + return err; + ++ err = -EPERM; ++ if (!ve_accessible_veid(old_nd.path.mnt->owner, get_exec_env()->veid)) ++ goto out_nosem; ++ + down_write(&namespace_sem); + while (d_mountpoint(nd->path.dentry) && + follow_down(&nd->path.mnt, &nd->path.dentry)) +@@ -1633,6 +1716,7 @@ out: + up_write(&namespace_sem); + if (!err) + path_put(&parent_path); ++out_nosem: + path_put(&old_nd.path); + return err; + } +@@ -1651,7 +1735,7 @@ static noinline int do_new_mount(struct nameidata *nd, char *type, int flags, + return -EINVAL; + + /* we need capabilities... */ +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + + mnt = do_kern_mount(type, flags, name, data); +@@ -1690,6 +1774,11 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, + goto unlock; + + newmnt->mnt_flags = mnt_flags; ++ ++ /* make this before graft_tree reveals mnt_root to the world... */ ++ if (nd->path.dentry->d_flags & DCACHE_VIRTUAL) ++ newmnt->mnt_root->d_flags |= DCACHE_VIRTUAL; ++ + if ((err = graft_tree(newmnt, &nd->path))) + goto unlock; + +@@ -1944,7 +2033,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, + retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, + data_page); + else if (flags & MS_BIND) +- retval = do_loopback(&nd, dev_name, flags & MS_REC); ++ retval = do_loopback(&nd, dev_name, flags & MS_REC, mnt_flags); + else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + retval = do_change_type(&nd, flags); + else if (flags & MS_MOVE) +@@ -2086,6 +2175,7 @@ out1: + free_page(type_page); + return retval; + } ++EXPORT_SYMBOL_GPL(sys_mount); + + /* + * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. 
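The task-iteration substitutions seen in ioprio.c earlier and in chroot_fs_refs() just below are another recurring pattern: do_each_thread_ve()/while_each_thread_ve() walk only the calling container's threads, while the _all variants deliberately scan the whole host (as sys_ioprio_set does). Choosing the wrong flavour either leaks other containers' tasks into a VE-visible result or misses tasks a host-wide operation must touch. A sketch of a VE-scoped walk; the counting is illustrative:

	struct task_struct *g, *p;
	int nr_tasks = 0;

	read_lock(&tasklist_lock);
	do_each_thread_ve(g, p) {
		nr_tasks++;	/* only tasks of the current VE */
	} while_each_thread_ve(g, p);
	read_unlock(&tasklist_lock);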
+@@ -2128,7 +2218,7 @@ static void chroot_fs_refs(struct path *old_root, struct path *new_root) + struct fs_struct *fs; + + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_ve(g, p) { + task_lock(p); + fs = p->fs; + if (fs) { +@@ -2143,7 +2233,7 @@ static void chroot_fs_refs(struct path *old_root, struct path *new_root) + put_fs_struct(fs); + } else + task_unlock(p); +- } while_each_thread(g, p); ++ } while_each_thread_ve(g, p); + read_unlock(&tasklist_lock); + } + +@@ -2314,7 +2404,7 @@ void __init mnt_init(void) + init_rwsem(&namespace_sem); + + mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), +- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); ++ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC, NULL); + + mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); + +@@ -2351,3 +2441,4 @@ void __put_mnt_ns(struct mnt_namespace *ns) + release_mounts(&umount_list); + kfree(ns); + } ++EXPORT_SYMBOL_GPL(__put_mnt_ns); +diff --git a/fs/open.c b/fs/open.c +index a99ad09..2165ec5 100644 +--- a/fs/open.c ++++ b/fs/open.c +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -51,7 +52,21 @@ int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) + + EXPORT_SYMBOL(vfs_statfs); + +-static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) ++int faudit_statfs(struct super_block *sb, struct kstatfs *buf) ++{ ++ struct faudit_statfs_arg arg; ++ ++ arg.sb = sb; ++ arg.stat = buf; ++ ++ if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STATFS, &arg) ++ != NOTIFY_DONE) ++ return arg.err; ++ return 0; ++} ++ ++static int vfs_statfs_native(struct dentry *dentry, struct vfsmount *mnt, ++ struct statfs *buf) + { + struct kstatfs st; + int retval; +@@ -60,6 +75,10 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) + if (retval) + return retval; + ++ retval = faudit_statfs(mnt->mnt_sb, &st); ++ if (retval) ++ return retval; ++ + if (sizeof(*buf) == sizeof(st)) + memcpy(buf, &st, sizeof(st)); + else { +@@ -94,7 +113,8 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) + return 0; + } + +-static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) ++static int vfs_statfs64(struct dentry *dentry, struct vfsmount *mnt, ++ struct statfs64 *buf) + { + struct kstatfs st; + int retval; +@@ -103,6 +123,10 @@ static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) + if (retval) + return retval; + ++ retval = faudit_statfs(mnt->mnt_sb, &st); ++ if (retval) ++ return retval; ++ + if (sizeof(*buf) == sizeof(st)) + memcpy(buf, &st, sizeof(st)); + else { +@@ -129,7 +153,7 @@ asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf) + error = user_path_walk(path, &nd); + if (!error) { + struct statfs tmp; +- error = vfs_statfs_native(nd.path.dentry, &tmp); ++ error = vfs_statfs_native(nd.path.dentry, nd.path.mnt, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + path_put(&nd.path); +@@ -148,7 +172,7 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz, struct statfs64 + error = user_path_walk(path, &nd); + if (!error) { + struct statfs64 tmp; +- error = vfs_statfs64(nd.path.dentry, &tmp); ++ error = vfs_statfs64(nd.path.dentry, nd.path.mnt, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + path_put(&nd.path); +@@ -167,7 +191,7 @@ asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user * buf) + file = fget(fd); + if (!file) + goto out; +- 
error = vfs_statfs_native(file->f_path.dentry, &tmp); ++ error = vfs_statfs_native(file->f_path.dentry, file->f_path.mnt, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + fput(file); +@@ -188,7 +212,7 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user + file = fget(fd); + if (!file) + goto out; +- error = vfs_statfs64(file->f_path.dentry, &tmp); ++ error = vfs_statfs64(file->f_path.dentry, file->f_path.mnt, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + fput(file); +@@ -701,6 +725,7 @@ out_release: + out: + return error; + } ++EXPORT_SYMBOL_GPL(sys_chown); + + asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, + gid_t group, int flag) +@@ -939,6 +964,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags) + return filp; + } + ++int odirect_enable = 0; + /* + * dentry_open() will have done dput(dentry) and mntput(mnt) if it returns an + * error. +@@ -960,6 +986,9 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) + return ERR_PTR(-EINVAL); + } + ++ if (!capable(CAP_SYS_RAWIO) && !odirect_enable) ++ flags &= ~O_DIRECT; ++ + error = -ENFILE; + f = get_empty_filp(); + if (f == NULL) { +@@ -1115,6 +1144,7 @@ asmlinkage long sys_open(const char __user *filename, int flags, int mode) + asmlinkage_protect(3, ret, filename, flags, mode); + return ret; + } ++EXPORT_SYMBOL_GPL(sys_open); + + asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, + int mode) +diff --git a/fs/partitions/check.c b/fs/partitions/check.c +index 6149e4b..c904faa 100644 +--- a/fs/partitions/check.c ++++ b/fs/partitions/check.c +@@ -131,6 +131,7 @@ char *disk_name(struct gendisk *hd, int part, char *buf) + + return buf; + } ++EXPORT_SYMBOL(disk_name); + + const char *bdevname(struct block_device *bdev, char *buf) + { +diff --git a/fs/pipe.c b/fs/pipe.c +index 700f4e0..77ca617 100644 +--- a/fs/pipe.c ++++ b/fs/pipe.c +@@ -22,6 +22,8 @@ + #include + #include + ++#include ++ + /* + * We use a start+len construction, which provides full use of the + * allocated memory. +@@ -478,7 +480,7 @@ redo1: + int error, atomic = 1; + + if (!page) { +- page = alloc_page(GFP_HIGHUSER); ++ page = alloc_page(GFP_HIGHUSER | __GFP_UBC); + if (unlikely(!page)) { + ret = ret ? 
: -ENOMEM; + break; +@@ -856,7 +858,7 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) + { + struct pipe_inode_info *pipe; + +- pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); ++ pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_UBC); + if (pipe) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; +@@ -1073,6 +1075,7 @@ int do_pipe(int *fd) + free_write_pipe(fw); + return error; + } ++EXPORT_SYMBOL_GPL(do_pipe); + + /* + * sys_pipe() is the normal C calling standard for creating +diff --git a/fs/proc/array.c b/fs/proc/array.c +index 797d775..6fd6695 100644 +--- a/fs/proc/array.c ++++ b/fs/proc/array.c +@@ -81,6 +81,8 @@ + #include + #include + ++#include ++ + #include + #include + #include "internal.h" +@@ -203,6 +205,15 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, + put_group_info(group_info); + + seq_printf(m, "\n"); ++ ++#ifdef CONFIG_VE ++ if (ve_is_super(get_exec_env())) { ++ seq_printf(m, "envID:\t%d\nVPid:\t%d\n", ++ p->ve_task_info.owner_env->veid, task_pid_vnr(p)); ++ seq_printf(m, "PNState:\t%u\nStopState:\t%u\n", ++ p->pn_state, p->stopped_state); ++ } ++#endif + } + + static void render_sigset_t(struct seq_file *m, const char *header, +@@ -242,10 +253,10 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, + } + } + +-static inline void task_sig(struct seq_file *m, struct task_struct *p) ++void task_sig(struct seq_file *m, struct task_struct *p) + { + unsigned long flags; +- sigset_t pending, shpending, blocked, ignored, caught; ++ sigset_t pending, shpending, blocked, ignored, caught, saved; + int num_threads = 0; + unsigned long qsize = 0; + unsigned long qlim = 0; +@@ -255,12 +266,14 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) + sigemptyset(&blocked); + sigemptyset(&ignored); + sigemptyset(&caught); ++ sigemptyset(&saved); + + rcu_read_lock(); + if (lock_task_sighand(p, &flags)) { + pending = p->pending.signal; + shpending = p->signal->shared_pending.signal; + blocked = p->blocked; ++ saved = p->saved_sigmask; + collect_sigign_sigcatch(p, &ignored, &caught); + num_threads = atomic_read(&p->signal->count); + qsize = atomic_read(&p->user->sigpending); +@@ -278,6 +291,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) + render_sigset_t(m, "SigBlk:\t", &blocked); + render_sigset_t(m, "SigIgn:\t", &ignored); + render_sigset_t(m, "SigCgt:\t", &caught); ++ render_sigset_t(m, "SigSvd:\t", &saved); + } + + static void render_cap_t(struct seq_file *m, const char *header, +@@ -301,6 +315,20 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) + render_cap_t(m, "CapBnd:\t", &p->cap_bset); + } + ++#ifdef CONFIG_BEANCOUNTERS ++static inline void ub_dump_task_info(struct task_struct *tsk, ++ char *stsk, int ltsk, char *smm, int lmm) ++{ ++ print_ub_uid(tsk->task_bc.task_ub, stsk, ltsk); ++ task_lock(tsk); ++ if (tsk->mm) ++ print_ub_uid(tsk->mm->mm_ub, smm, lmm); ++ else ++ strncpy(smm, "N/A", lmm); ++ task_unlock(tsk); ++} ++#endif ++ + static inline void task_context_switch_counts(struct seq_file *m, + struct task_struct *p) + { +@@ -314,6 +342,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) + { + struct mm_struct *mm = get_task_mm(task); ++#ifdef CONFIG_BEANCOUNTERS ++ char tsk_ub_info[64], mm_ub_info[64]; ++#endif + + task_name(m, task); + task_state(m, ns, pid, task); +@@ -329,6 +360,14 @@ int proc_pid_status(struct seq_file 
*m, struct pid_namespace *ns, + task_show_regs(m, task); + #endif + task_context_switch_counts(m, task); ++#ifdef CONFIG_BEANCOUNTERS ++ ub_dump_task_info(task, ++ tsk_ub_info, sizeof(tsk_ub_info), ++ mm_ub_info, sizeof(mm_ub_info)); ++ ++ seq_printf(m, "TaskUB:\t%s\n", tsk_ub_info); ++ seq_printf(m, "MMUB:\t%s\n", mm_ub_info); ++#endif + return 0; + } + +@@ -410,6 +449,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, + unsigned long rsslim = 0; + char tcomm[sizeof(task->comm)]; + unsigned long flags; ++#ifdef CONFIG_BEANCOUNTERS ++ char ub_task_info[64]; ++ char ub_mm_info[64]; ++#endif + + state = *get_task_state(task); + vsize = eip = esp = 0; +@@ -488,6 +531,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, + priority = task_prio(task); + nice = task_nice(task); + ++#ifndef CONFIG_VE + /* Temporary variable needed for gcc-2.96 */ + /* convert timespec -> nsec*/ + start_time = +@@ -495,10 +539,25 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, + + task->real_start_time.tv_nsec; + /* convert nsec -> ticks */ + start_time = nsec_to_clock_t(start_time); ++#else ++ start_time = ve_relative_clock(&task->start_time); ++#endif ++ ++#ifdef CONFIG_BEANCOUNTERS ++ ub_dump_task_info(task, ub_task_info, sizeof(ub_task_info), ++ ub_mm_info, sizeof(ub_mm_info)); ++#endif + + seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ + %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ +-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", ++%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld" ++#ifdef CONFIG_VE ++ " 0 0 0 0 0 0 0 %d %u" ++#endif ++#ifdef CONFIG_BEANCOUNTERS ++ " %s %s" ++#endif ++ "\n", + pid_nr_ns(pid, ns), + tcomm, + state, +@@ -545,7 +604,16 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, + task->policy, + (unsigned long long)delayacct_blkio_ticks(task), + cputime_to_clock_t(gtime), +- cputime_to_clock_t(cgtime)); ++ cputime_to_clock_t(cgtime) ++#ifdef CONFIG_VE ++ , task_pid_vnr(task), ++ VEID(VE_TASK_INFO(task)->owner_env) ++#endif ++#ifdef CONFIG_BEANCOUNTERS ++ , ub_task_info, ++ ub_mm_info ++#endif ++ ); + if (mm) + mmput(mm); + return 0; +diff --git a/fs/proc/base.c b/fs/proc/base.c +index 3b45537..fb40acb 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -185,10 +185,12 @@ static int proc_cwd_link(struct inode *inode, struct path *path) + } + if (fs) { + read_lock(&fs->lock); +- *path = fs->pwd; +- path_get(&fs->pwd); ++ result = d_root_check(&fs->pwd); ++ if (result == 0) { ++ *path = fs->pwd; ++ path_get(&fs->pwd); ++ } + read_unlock(&fs->lock); +- result = 0; + put_fs_struct(fs); + } + return result; +@@ -511,17 +513,31 @@ static int proc_pid_limits(struct task_struct *task, char *buffer) + static int proc_fd_access_allowed(struct inode *inode) + { + struct task_struct *task; +- int allowed = 0; ++ int err; ++ + /* Allow access to a task's file descriptors if it is us or we + * may use ptrace attach to the process and find out that + * information. + */ ++ err = -ENOENT; + task = get_proc_task(inode); + if (task) { +- allowed = ptrace_may_attach(task); ++ if (ptrace_may_attach(task)) ++ err = 0; ++ else ++ /* ++ * This clever ptrace_may_attach() may play a trick ++ * on us. If the task is zombie it will consider this ++ * task to be not dumpable at all and will deny any ++ * ptracing in VE. Not a big deal for ptrace(), but ++ * following the link will fail with the -EACCESS ++ * reason. 
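proc/base.c guards every place a /proc/PID symlink would expose a path with d_root_check(); the helper is not defined in this hunk, but its use implies it fails when the target is not reachable from the caller's VE root, so the cwd, exe and fd links cannot leak host dentries into a container. The guard pattern, factored out for a hypothetical new link handler (d_root_check() semantics as implied above):

	static int proc_expose_path(struct path *src, struct path *dst)
	{
		int err;

		err = d_root_check(src);
		if (err)
			return err;	/* hidden from this container */
		*dst = *src;
		path_get(dst);
		return 0;
	}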
Some software is unable to stand such a ++ * swindle and refuses to work :( ++ */ ++ err = (task->mm ? -EACCES : -ENOENT); + put_task_struct(task); + } +- return allowed; ++ return err; + } + + static int proc_setattr(struct dentry *dentry, struct iattr *attr) +@@ -996,6 +1012,8 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, + if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && + oom_adjust != OOM_DISABLE) + return -EINVAL; ++ if (oom_adjust == OOM_DISABLE && !ve_is_super(get_exec_env())) ++ return -EPERM; + if (*end == '\n') + end++; + task = get_proc_task(file->f_path.dentry->d_inode); +@@ -1288,10 +1306,15 @@ static int proc_exe_link(struct inode *inode, struct path *exe_path) + exe_file = get_mm_exe_file(mm); + mmput(mm); + if (exe_file) { +- *exe_path = exe_file->f_path; +- path_get(&exe_file->f_path); ++ int result; ++ ++ result = d_root_check(&exe_file->f_path); ++ if (result == 0) { ++ *exe_path = exe_file->f_path; ++ path_get(&exe_file->f_path); ++ } + fput(exe_file); +- return 0; ++ return result; + } else + return -ENOENT; + } +@@ -1299,13 +1322,14 @@ static int proc_exe_link(struct inode *inode, struct path *exe_path) + static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) + { + struct inode *inode = dentry->d_inode; +- int error = -EACCES; ++ int error; + + /* We don't need a base pointer in the /proc filesystem */ + path_put(&nd->path); + + /* Are we allowed to snoop on the tasks file descriptors? */ +- if (!proc_fd_access_allowed(inode)) ++ error = proc_fd_access_allowed(inode); ++ if (error < 0) + goto out; + + error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); +@@ -1340,12 +1364,13 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) + + static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) + { +- int error = -EACCES; ++ int error; + struct inode *inode = dentry->d_inode; + struct path path; + + /* Are we allowed to snoop on the tasks file descriptors? 
*/ +- if (!proc_fd_access_allowed(inode)) ++ error = proc_fd_access_allowed(inode); ++ if (error < 0) + goto out; + + error = PROC_I(inode)->op.proc_get_link(inode, &path); +@@ -1586,6 +1611,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) + struct files_struct *files = NULL; + struct file *file; + int fd = proc_fd(inode); ++ int err = -ENOENT; + + if (task) { + files = get_files_struct(task); +@@ -1598,7 +1624,8 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) + */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); +- if (file) { ++ err = -EACCES; ++ if (file && !d_root_check(&file->f_path)) { + if (path) { + *path = file->f_path; + path_get(&file->f_path); +@@ -1616,7 +1643,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) + spin_unlock(&files->file_lock); + put_files_struct(files); + } +- return -ENOENT; ++ return err; + } + + static int proc_fd_link(struct inode *inode, struct path *path) +diff --git a/fs/proc/generic.c b/fs/proc/generic.c +index 43e54e8..76240e2 100644 +--- a/fs/proc/generic.c ++++ b/fs/proc/generic.c +@@ -228,6 +228,10 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) + struct proc_dir_entry *de = PDE(inode); + int error; + ++ if ((iattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) && ++ LPDE(inode) == PDE(inode)) ++ return -EPERM; ++ + error = inode_change_ok(inode, iattr); + if (error) + goto out; +@@ -236,9 +240,12 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) + if (error) + goto out; + +- de->uid = inode->i_uid; +- de->gid = inode->i_gid; +- de->mode = inode->i_mode; ++ if (iattr->ia_valid & ATTR_UID) ++ de->uid = inode->i_uid; ++ if (iattr->ia_valid & ATTR_GID) ++ de->gid = inode->i_gid; ++ if (iattr->ia_valid & ATTR_MODE) ++ de->mode = inode->i_mode; + out: + return error; + } +@@ -371,29 +378,61 @@ static struct dentry_operations proc_dentry_operations = + .d_delete = proc_delete_dentry, + }; + ++static struct proc_dir_entry *__proc_lookup(struct proc_dir_entry *dir, ++ const char *name, int namelen) ++{ ++ struct proc_dir_entry *de; ++ ++ for (de = dir->subdir; de ; de = de->next) { ++ if (de->namelen != namelen) ++ continue; ++ if (memcmp(de->name, name, namelen)) ++ continue; ++ break; ++ } ++ return de; ++} ++ + /* + * Don't create negative dentries here, return -ENOENT by hand + * instead. 
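[ With the VE split, a /proc directory may carry two entry lists: the global tree shared by all containers and a per-VE local one. __proc_lookup() above is the single-list scan; proc_lookup_de() below runs it over both lists, and when both trees have an entry the global one supplies the inode while the local one rides along (the de/lde table in the hunk spells out the four cases). A standalone model of that lookup, using reduced illustrative types:

    #include <stddef.h>
    #include <string.h>

    struct pde {                    /* reduced proc_dir_entry */
            const char *name;
            size_t namelen;
            struct pde *next;
    };

    /* same loop as __proc_lookup(): linear scan of a sibling list */
    static struct pde *scan(struct pde *head, const char *name, size_t len)
    {
            struct pde *de;

            for (de = head; de; de = de->next)
                    if (de->namelen == len && !memcmp(de->name, name, len))
                            return de;
            return NULL;
    }

    /* consult both trees; the global entry wins when both exist */
    static struct pde *lookup_two(struct pde *glob, struct pde *local,
                                  const char *name, size_t len)
    {
            struct pde *de = scan(glob, name, len);

            return de ? de : scan(local, name, len);
    }
]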
+ */ +-struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, +- struct dentry *dentry) ++struct dentry *proc_lookup_de(struct proc_dir_entry *de, ++ struct proc_dir_entry *lde, ++ struct inode *dir, struct dentry *dentry) + { + struct inode *inode = NULL; + int error = -ENOENT; + + lock_kernel(); + spin_lock(&proc_subdir_lock); +- for (de = de->subdir; de ; de = de->next) { +- if (de->namelen != dentry->d_name.len) +- continue; +- if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { ++ de = __proc_lookup(de, dentry->d_name.name, dentry->d_name.len); ++ if (lde != NULL) ++ lde = __proc_lookup(lde, dentry->d_name.name, ++ dentry->d_name.len); ++ ++ if (de == NULL) ++ de = lde; ++ ++ if (de != NULL) { ++ /* ++ * de lde meaning inode(g,l) ++ * ------------------------------------ ++ * NULL NULL -ENOENT * ++ * X NULL global X NULL ++ * NULL X local X X ++ * X Y both X Y ++ */ ++ { + unsigned int ino; + + ino = de->low_ino; + de_get(de); ++ if (lde != NULL) ++ de_get(lde); + spin_unlock(&proc_subdir_lock); + error = -EINVAL; +- inode = proc_get_inode(dir->i_sb, ino, de); ++ inode = proc_get_inode(dir->i_sb, ino, de, lde); + goto out_unlock; + } + } +@@ -408,13 +447,15 @@ out_unlock: + } + if (de) + de_put(de); ++ if (lde) ++ de_put(lde); + return ERR_PTR(error); + } + + struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) + { +- return proc_lookup_de(PDE(dir), dir, dentry); ++ return proc_lookup_de(PDE(dir), LPDE(dir), dir, dentry); + } + + /* +@@ -426,13 +467,14 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, + * value of the readdir() call, as long as it's non-negative + * for success.. + */ +-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, +- filldir_t filldir) ++int proc_readdir_de(struct proc_dir_entry *de, struct proc_dir_entry *lde, ++ struct file *filp, void *dirent, filldir_t filldir) + { + unsigned int ino; + int i; + struct inode *inode = filp->f_path.dentry->d_inode; + int ret = 0; ++ struct proc_dir_entry *ode = de, *fde = NULL; + + lock_kernel(); + +@@ -455,25 +497,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, + /* fall through */ + default: + spin_lock(&proc_subdir_lock); +- de = de->subdir; + i -= 2; +- for (;;) { +- if (!de) { +- ret = 1; +- spin_unlock(&proc_subdir_lock); +- goto out; +- } +- if (!i) +- break; +- de = de->next; +- i--; +- } +- +- do { ++repeat: ++ de = de->subdir; ++ while (de != NULL) { + struct proc_dir_entry *next; + +- /* filldir passes info to user space */ + de_get(de); ++ if (i-- > 0 || (fde != NULL && ++ __proc_lookup(fde, ++ de->name, de->namelen))) ++ goto skip; ++ ++ /* filldir passes info to user space */ + spin_unlock(&proc_subdir_lock); + if (filldir(dirent, de->name, de->namelen, filp->f_pos, + de->low_ino, de->mode >> 12) < 0) { +@@ -482,10 +518,17 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, + } + spin_lock(&proc_subdir_lock); + filp->f_pos++; ++skip: + next = de->next; + de_put(de); + de = next; +- } while (de); ++ } ++ ++ if (fde == NULL && lde != NULL && lde != ode) { ++ de = lde; ++ fde = ode; ++ goto repeat; ++ } + spin_unlock(&proc_subdir_lock); + } + ret = 1; +@@ -497,7 +540,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) + { + struct inode *inode = filp->f_path.dentry->d_inode; + +- return proc_readdir_de(PDE(inode), filp, dirent, filldir); ++ return proc_readdir_de(PDE(inode), LPDE(inode), filp, dirent, 
filldir); + } + + /* +diff --git a/fs/proc/inode.c b/fs/proc/inode.c +index b08d100..4076902 100644 +--- a/fs/proc/inode.c ++++ b/fs/proc/inode.c +@@ -385,7 +385,7 @@ static const struct file_operations proc_reg_file_ops_no_compat = { + #endif + + struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, +- struct proc_dir_entry *de) ++ struct proc_dir_entry *de, struct proc_dir_entry *lde) + { + struct inode * inode; + +@@ -399,6 +399,9 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + PROC_I(inode)->fd = 0; + PROC_I(inode)->pde = de; ++#ifdef CONFIG_VE ++ PROC_I(inode)->lpde = lde; ++#endif + + if (de->mode) { + inode->i_mode = de->mode; +@@ -445,9 +448,11 @@ int proc_fill_super(struct super_block *s) + s->s_magic = PROC_SUPER_MAGIC; + s->s_op = &proc_sops; + s->s_time_gran = 1; +- +- de_get(&proc_root); +- root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); ++ ++ de_get(get_exec_env()->proc_root); ++ de_get(&glob_proc_root); ++ root_inode = proc_get_inode(s, PROC_ROOT_INO, ++ &glob_proc_root, get_exec_env()->proc_root); + if (!root_inode) + goto out_no_root; + root_inode->i_uid = 0; +diff --git a/fs/proc/internal.h b/fs/proc/internal.h +index 28cbca8..e7825f8 100644 +--- a/fs/proc/internal.h ++++ b/fs/proc/internal.h +@@ -12,6 +12,12 @@ + #include + + extern struct proc_dir_entry proc_root; ++#ifdef CONFIG_VE ++extern struct proc_dir_entry glob_proc_root; ++#else ++#define glob_proc_root proc_root ++#endif ++ + #ifdef CONFIG_PROC_SYSCTL + extern int proc_sys_init(void); + #else +@@ -84,7 +90,8 @@ static inline int proc_fd(struct inode *inode) + return PROC_I(inode)->fd; + } + +-struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, ++struct dentry *proc_lookup_de(struct proc_dir_entry *de, ++ struct proc_dir_entry *lpde, struct inode *ino, + struct dentry *dentry); +-int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, +- filldir_t filldir); ++int proc_readdir_de(struct proc_dir_entry *de, struct proc_dir_entry *lpde, ++ struct file *filp, void *dirent, filldir_t filldir); +diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c +index ff3b90b..8e70bca 100644 +--- a/fs/proc/kmsg.c ++++ b/fs/proc/kmsg.c +@@ -11,6 +11,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -40,7 +42,7 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, + + static unsigned int kmsg_poll(struct file *file, poll_table *wait) + { +- poll_wait(file, &log_wait, wait); ++ poll_wait(file, &ve_log_wait, wait); + if (do_syslog(9, NULL, 0)) + return POLLIN | POLLRDNORM; + return 0; +@@ -53,3 +55,4 @@ const struct file_operations proc_kmsg_operations = { + .open = kmsg_open, + .release = kmsg_release, + }; ++EXPORT_SYMBOL_GPL(proc_kmsg_operations); +diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c +index 7e277f2..3a7f65e 100644 +--- a/fs/proc/proc_misc.c ++++ b/fs/proc/proc_misc.c +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -48,6 +49,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -86,19 +88,39 @@ static int loadavg_read_proc(char *page, char **start, off_t off, + int a, b, c; + int len; + unsigned long seq; ++ long running, threads; ++ struct ve_struct *ve; + ++ ve = get_exec_env(); + do { + seq = read_seqbegin(&xtime_lock); +- a = avenrun[0] + (FIXED_1/200); +- b = avenrun[1] + (FIXED_1/200); +- c = avenrun[2] + 
(FIXED_1/200); ++ if (ve_is_super(ve)) { ++ a = avenrun[0] + (FIXED_1/200); ++ b = avenrun[1] + (FIXED_1/200); ++ c = avenrun[2] + (FIXED_1/200); ++#ifdef CONFIG_VE ++ } else { ++ a = ve->avenrun[0] + (FIXED_1/200); ++ b = ve->avenrun[1] + (FIXED_1/200); ++ c = ve->avenrun[2] + (FIXED_1/200); ++#endif ++ } + } while (read_seqretry(&xtime_lock, seq)); ++ if (ve_is_super(ve)) { ++ running = nr_running(); ++ threads = nr_threads; ++#ifdef CONFIG_VE ++ } else { ++ running = nr_running_ve(ve); ++ threads = atomic_read(&ve->pcounter); ++#endif ++ } + +- len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", ++ len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%ld %d\n", + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), +- nr_running(), nr_threads, ++ running, threads, + task_active_pid_ns(current)->last_pid); + return proc_calc_metrics(page, start, off, count, eof, len); + } +@@ -113,6 +135,13 @@ static int uptime_read_proc(char *page, char **start, off_t off, + + do_posix_clock_monotonic_gettime(&uptime); + monotonic_to_bootbased(&uptime); ++#ifdef CONFIG_VE ++ if (!ve_is_super(get_exec_env())) { ++ set_normalized_timespec(&uptime, ++ uptime.tv_sec - get_exec_env()->start_timespec.tv_sec, ++ uptime.tv_nsec - get_exec_env()->start_timespec.tv_nsec); ++ } ++#endif + cputime_to_timespec(idletime, &idle); + len = sprintf(page,"%lu.%02lu %lu.%02lu\n", + (unsigned long) uptime.tv_sec, +@@ -126,29 +155,50 @@ static int uptime_read_proc(char *page, char **start, off_t off, + static int meminfo_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) + { +- struct sysinfo i; ++ struct meminfo mi; + int len; +- unsigned long committed; +- unsigned long allowed; ++ unsigned long dummy; + struct vmalloc_info vmi; +- long cached; ++ ++ get_zone_counts(&mi.active, &mi.inactive, &dummy); + + /* + * display in kilobytes. 
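[ The K() macro just below converts page counts to kilobytes with a shift: a page holds 2^PAGE_SHIFT bytes and a kB is 2^10 bytes, so x pages are x << (PAGE_SHIFT - 10) kB. A quick self-check under the common 4 kB page size:

    #include <assert.h>

    #define PAGE_SHIFT 12                      /* 4 kB pages, for example */
    #define K(x) ((x) << (PAGE_SHIFT - 10))    /* pages -> kB */

    int main(void)
    {
            assert(K(25) == 100);   /* 25 pages x 4 kB = 100 kB */
            return 0;
    }
]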
+ */ + #define K(x) ((x) << (PAGE_SHIFT - 10)) +- si_meminfo(&i); +- si_swapinfo(&i); +- committed = atomic_long_read(&vm_committed_space); +- allowed = ((totalram_pages - hugetlb_total_pages()) ++ si_meminfo(&mi.si); ++ si_swapinfo(&mi.si); ++ mi.committed_space = atomic_read(&vm_committed_space); ++ mi.swapcache = total_swapcache_pages; ++ mi.allowed = ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100) + total_swap_pages; + +- cached = global_page_state(NR_FILE_PAGES) - +- total_swapcache_pages - i.bufferram; +- if (cached < 0) +- cached = 0; ++ mi.cache = global_page_state(NR_FILE_PAGES) - ++ total_swapcache_pages - mi.si.bufferram; ++ if (mi.cache < 0) ++ mi.cache = 0; + + get_vmalloc_info(&vmi); ++ mi.vmalloc_used = vmi.used >> PAGE_SHIFT; ++ mi.vmalloc_largest = vmi.largest_chunk >> PAGE_SHIFT; ++ mi.vmalloc_total = VMALLOC_TOTAL >> PAGE_SHIFT; ++ ++ mi.pi.nr_file_dirty = global_page_state(NR_FILE_DIRTY); ++ mi.pi.nr_writeback = global_page_state(NR_WRITEBACK); ++ mi.pi.nr_anon_pages = global_page_state(NR_ANON_PAGES); ++ mi.pi.nr_file_mapped = global_page_state(NR_FILE_MAPPED); ++ mi.pi.nr_slab_rec = global_page_state(NR_SLAB_RECLAIMABLE); ++ mi.pi.nr_slab_unrec = global_page_state(NR_SLAB_UNRECLAIMABLE); ++ mi.pi.nr_pagetable = global_page_state(NR_PAGETABLE); ++ mi.pi.nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS); ++ mi.pi.nr_bounce = global_page_state(NR_BOUNCE); ++ mi.pi.nr_writeback_temp = global_page_state(NR_WRITEBACK_TEMP); ++ ++#ifdef CONFIG_BEANCOUNTERS ++ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi) ++ & NOTIFY_FAIL) ++ return -ENOMSG; ++#endif + + /* + * Tagged format, for easy grepping and expansion. +@@ -185,38 +235,38 @@ static int meminfo_read_proc(char *page, char **start, off_t off, + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n", +- K(i.totalram), +- K(i.freeram), +- K(i.bufferram), +- K(cached), +- K(total_swapcache_pages), +- K(global_page_state(NR_ACTIVE)), +- K(global_page_state(NR_INACTIVE)), ++ K(mi.si.totalram), ++ K(mi.si.freeram), ++ K(mi.si.bufferram), ++ K(mi.cache), ++ K(mi.swapcache), ++ K(mi.active), ++ K(mi.inactive), + #ifdef CONFIG_HIGHMEM +- K(i.totalhigh), +- K(i.freehigh), +- K(i.totalram-i.totalhigh), +- K(i.freeram-i.freehigh), ++ K(mi.si.totalhigh), ++ K(mi.si.freehigh), ++ K(mi.si.totalram-mi.si.totalhigh), ++ K(mi.si.freeram-mi.si.freehigh), + #endif +- K(i.totalswap), +- K(i.freeswap), +- K(global_page_state(NR_FILE_DIRTY)), +- K(global_page_state(NR_WRITEBACK)), +- K(global_page_state(NR_ANON_PAGES)), +- K(global_page_state(NR_FILE_MAPPED)), +- K(global_page_state(NR_SLAB_RECLAIMABLE) + +- global_page_state(NR_SLAB_UNRECLAIMABLE)), +- K(global_page_state(NR_SLAB_RECLAIMABLE)), +- K(global_page_state(NR_SLAB_UNRECLAIMABLE)), +- K(global_page_state(NR_PAGETABLE)), +- K(global_page_state(NR_UNSTABLE_NFS)), +- K(global_page_state(NR_BOUNCE)), +- K(global_page_state(NR_WRITEBACK_TEMP)), +- K(allowed), +- K(committed), +- (unsigned long)VMALLOC_TOTAL >> 10, +- vmi.used >> 10, +- vmi.largest_chunk >> 10 ++ K(mi.si.totalswap), ++ K(mi.si.freeswap), ++ K(mi.pi.nr_file_dirty), ++ K(mi.pi.nr_writeback), ++ K(mi.pi.nr_anon_pages), ++ K(mi.pi.nr_file_mapped), ++ K(mi.pi.nr_slab_rec + ++ mi.pi.nr_slab_unrec), ++ K(mi.pi.nr_slab_rec), ++ K(mi.pi.nr_slab_unrec), ++ K(mi.pi.nr_pagetable), ++ K(mi.pi.nr_unstable_nfs), ++ K(mi.pi.nr_bounce), ++ K(mi.pi.nr_writeback_temp), ++ K(mi.allowed), ++ K(mi.committed_space), ++ K(mi.vmalloc_total), ++ K(mi.vmalloc_used), ++ 
K(mi.vmalloc_largest) + ); + + len += hugetlb_report_meminfo(page + len); +@@ -472,25 +522,21 @@ static const struct file_operations proc_vmalloc_operations = { + }; + #endif + +-static int show_stat(struct seq_file *p, void *v) ++static void show_stat_ve0(struct seq_file *p) + { + int i; +- unsigned long jif; + cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; + cputime64_t guest; + u64 sum = 0; +- struct timespec boottime; + unsigned int *per_irq_sum; + + per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL); + if (!per_irq_sum) +- return -ENOMEM; ++ return; + + user = nice = system = idle = iowait = + irq = softirq = steal = cputime64_zero; + guest = cputime64_zero; +- getboottime(&boottime); +- jif = boottime.tv_sec; + + for_each_possible_cpu(i) { + int j; +@@ -550,9 +596,85 @@ static int show_stat(struct seq_file *p, void *v) + + for (i = 0; i < NR_IRQS; i++) + seq_printf(p, " %u", per_irq_sum[i]); ++ kfree(per_irq_sum); ++ seq_printf(p, "\nswap %lu %lu\n", ++ vm_events(PSWPIN), vm_events(PSWPOUT)); ++} ++ ++#ifdef CONFIG_VE ++static void show_stat_ve(struct seq_file *p, struct ve_struct *ve) ++{ ++ int i; ++ u64 user, nice, system; ++ cycles_t idle, iowait; ++ cpumask_t ve_cpus; ++ ++ ve_cpu_online_map(ve, &ve_cpus); ++ ++ user = nice = system = idle = iowait = 0; ++ for_each_cpu_mask(i, ve_cpus) { ++ user += VE_CPU_STATS(ve, i)->user; ++ nice += VE_CPU_STATS(ve, i)->nice; ++ system += VE_CPU_STATS(ve, i)->system; ++ idle += ve_sched_get_idle_time(ve, i); ++ iowait += ve_sched_get_iowait_time(ve, i); ++ } ++ ++ seq_printf(p, "cpu %llu %llu %llu %llu %llu 0 0 0\n", ++ (unsigned long long)cputime64_to_clock_t(user), ++ (unsigned long long)cputime64_to_clock_t(nice), ++ (unsigned long long)cputime64_to_clock_t(system), ++ (unsigned long long)cycles_to_clocks(idle), ++ (unsigned long long)cycles_to_clocks(iowait)); ++ ++ for_each_cpu_mask(i, ve_cpus) { ++ user = VE_CPU_STATS(ve, i)->user; ++ nice = VE_CPU_STATS(ve, i)->nice; ++ system = VE_CPU_STATS(ve, i)->system; ++ idle = ve_sched_get_idle_time(ve, i); ++ iowait = ve_sched_get_iowait_time(ve, i); ++ seq_printf(p, "cpu%d %llu %llu %llu %llu %llu 0 0 0\n", ++ i, ++ (unsigned long long)cputime64_to_clock_t(user), ++ (unsigned long long)cputime64_to_clock_t(nice), ++ (unsigned long long)cputime64_to_clock_t(system), ++ (unsigned long long)cycles_to_clocks(idle), ++ (unsigned long long)cycles_to_clocks(iowait)); ++ } ++ seq_printf(p, "intr 0\nswap 0 0\n"); ++} ++#endif ++ ++int show_stat(struct seq_file *p, void *v) ++{ ++ extern unsigned long total_forks; ++ unsigned long seq, jif; ++ struct ve_struct *env; ++ unsigned long __nr_running, __nr_iowait; ++ ++ do { ++ seq = read_seqbegin(&xtime_lock); ++ jif = - wall_to_monotonic.tv_sec; ++ if (wall_to_monotonic.tv_nsec) ++ --jif; ++ } while (read_seqretry(&xtime_lock, seq)); ++ ++ env = get_exec_env(); ++ if (ve_is_super(env)) { ++ show_stat_ve0(p); ++ __nr_running = nr_running(); ++ __nr_iowait = nr_iowait(); ++ } ++#ifdef CONFIG_VE ++ else { ++ show_stat_ve(p, env); ++ __nr_running = nr_running_ve(env); ++ __nr_iowait = nr_iowait_ve(env); ++ } ++#endif + + seq_printf(p, +- "\nctxt %llu\n" ++ "ctxt %llu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" +@@ -560,10 +682,9 @@ static int show_stat(struct seq_file *p, void *v) + nr_context_switches(), + (unsigned long)jif, + total_forks, +- nr_running(), +- nr_iowait()); ++ __nr_running, ++ __nr_iowait); + +- kfree(per_irq_sum); + return 0; + } + +@@ -650,7 +771,8 @@ static int cmdline_read_proc(char 
*page, char **start, off_t off, + { + int len; + +- len = sprintf(page, "%s\n", saved_command_line); ++ len = sprintf(page, "%s\n", ++ ve_is_super(get_exec_env()) ? saved_command_line : "quiet"); + return proc_calc_metrics(page, start, off, count, eof, len); + } + +@@ -681,11 +803,16 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) + { + if (count) { +- char c; ++ int i, cnt; ++ char c[32]; + +- if (get_user(c, buf)) ++ cnt = min(count, sizeof(c)); ++ if (copy_from_user(c, buf, cnt)) + return -EFAULT; +- __handle_sysrq(c, NULL, 0); ++ ++ ++ for (i = 0; i < cnt && c[i] != '\n'; i++) ++ __handle_sysrq(c[i], NULL, 0); + } + return count; + } +@@ -833,38 +960,39 @@ void __init proc_misc_init(void) + static struct { + char *name; + int (*read_proc)(char*,char**,off_t,int,int*,void*); ++ struct proc_dir_entry *parent; + } *p, simple_ones[] = { +- {"loadavg", loadavg_read_proc}, +- {"uptime", uptime_read_proc}, +- {"meminfo", meminfo_read_proc}, +- {"version", version_read_proc}, ++ {"loadavg", loadavg_read_proc, &glob_proc_root}, ++ {"uptime", uptime_read_proc, &glob_proc_root}, ++ {"meminfo", meminfo_read_proc, &glob_proc_root}, ++ {"version", version_read_proc, &glob_proc_root}, + #ifdef CONFIG_PROC_HARDWARE + {"hardware", hardware_read_proc}, + #endif + #ifdef CONFIG_STRAM_PROC + {"stram", stram_read_proc}, + #endif +- {"filesystems", filesystems_read_proc}, +- {"cmdline", cmdline_read_proc}, ++ {"filesystems", filesystems_read_proc, &glob_proc_root}, ++ {"cmdline", cmdline_read_proc, &glob_proc_root}, + {"execdomains", execdomains_read_proc}, + {NULL,} + }; + for (p = simple_ones; p->name; p++) +- create_proc_read_entry(p->name, 0, NULL, p->read_proc, NULL); ++ create_proc_read_entry(p->name, 0, p->parent, p->read_proc, NULL); + +- proc_symlink("mounts", NULL, "self/mounts"); ++ proc_symlink("mounts", &glob_proc_root, "self/mounts"); + + /* And now for trickier ones */ + #ifdef CONFIG_PRINTK + proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); + #endif +- proc_create("locks", 0, NULL, &proc_locks_operations); ++ proc_create("locks", 0, &glob_proc_root, &proc_locks_operations); + proc_create("devices", 0, NULL, &proc_devinfo_operations); +- proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); ++ proc_create("cpuinfo", 0, &glob_proc_root, &proc_cpuinfo_operations); + #ifdef CONFIG_BLOCK + proc_create("partitions", 0, NULL, &proc_partitions_operations); + #endif +- proc_create("stat", 0, NULL, &proc_stat_operations); ++ proc_create("stat", 0, &glob_proc_root, &proc_stat_operations); + proc_create("interrupts", 0, NULL, &proc_interrupts_operations); + #ifdef CONFIG_SLABINFO + proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); +@@ -877,13 +1005,13 @@ void __init proc_misc_init(void) + #endif + proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); + proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); +- proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); ++ proc_create("vmstat", S_IRUGO, &glob_proc_root, &proc_vmstat_file_operations); + proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); + #ifdef CONFIG_BLOCK + proc_create("diskstats", 0, NULL, &proc_diskstats_operations); + #endif + #ifdef CONFIG_MODULES +- proc_create("modules", 0, NULL, &proc_modules_operations); ++ proc_create("modules", 0, &glob_proc_root, &proc_modules_operations); + #endif + #ifdef CONFIG_SCHEDSTATS + proc_create("schedstat", 0, NULL, 
&proc_schedstat_operations); +diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c +index 83f357b..108d432 100644 +--- a/fs/proc/proc_net.c ++++ b/fs/proc/proc_net.c +@@ -90,7 +90,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, + de = ERR_PTR(-ENOENT); + net = get_proc_task_net(dir); + if (net != NULL) { +- de = proc_lookup_de(net->proc_net, dir, dentry); ++ de = proc_lookup_de(net->proc_net, NULL, dir, dentry); + put_net(net); + } + return de; +@@ -128,7 +128,8 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent, + ret = -EINVAL; + net = get_proc_task_net(filp->f_path.dentry->d_inode); + if (net != NULL) { +- ret = proc_readdir_de(net->proc_net, filp, dirent, filldir); ++ ret = proc_readdir_de(net->proc_net, NULL, ++ filp, dirent, filldir); + put_net(net); + } + return ret; +@@ -203,7 +204,7 @@ static struct pernet_operations __net_initdata proc_net_ns_ops = { + + int __init proc_net_init(void) + { +- proc_symlink("net", NULL, "self/net"); ++ proc_symlink("net", &glob_proc_root, "self/net"); + + return register_pernet_subsys(&proc_net_ns_ops); + } +diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c +index 5acc001..37a80ee 100644 +--- a/fs/proc/proc_sysctl.c ++++ b/fs/proc/proc_sysctl.c +@@ -442,7 +442,7 @@ static struct proc_dir_entry *proc_sys_root; + + int proc_sys_init(void) + { +- proc_sys_root = proc_mkdir("sys", NULL); ++ proc_sys_root = proc_mkdir("sys", &glob_proc_root); + proc_sys_root->proc_iops = &proc_sys_inode_operations; + proc_sys_root->proc_fops = &proc_sys_file_operations; + proc_sys_root->nlink = 0; +diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c +index 21f490f..dba8a27 100644 +--- a/fs/proc/proc_tty.c ++++ b/fs/proc/proc_tty.c +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + + /* +@@ -70,6 +71,9 @@ static int show_tty_driver(struct seq_file *m, void *v) + dev_t from = MKDEV(p->major, p->minor_start); + dev_t to = from + p->num; + ++ if (!ve_accessible_strict(p->owner_env, get_exec_env())) ++ goto out; ++ + if (&p->tty_drivers == tty_drivers.next) { + /* pseudo-drivers first */ + seq_printf(m, "%-20s /dev/%-8s ", "/dev/tty", "tty"); +@@ -97,6 +101,7 @@ static int show_tty_driver(struct seq_file *m, void *v) + } + if (from != to) + show_tty_range(m, p, from, to - from); ++out: + return 0; + } + +diff --git a/fs/proc/root.c b/fs/proc/root.c +index 9511753..e2390df 100644 +--- a/fs/proc/root.c ++++ b/fs/proc/root.c +@@ -43,6 +43,9 @@ static int proc_get_sb(struct file_system_type *fs_type, + struct super_block *sb; + struct pid_namespace *ns; + struct proc_inode *ei; ++#ifdef CONFIG_VE ++ struct vfsmount *proc_mnt = fs_type->owner_env->proc_mnt; ++#endif + + if (proc_mnt) { + /* Seed the root directory with a pid so it doesn't need +@@ -96,11 +99,12 @@ static void proc_kill_sb(struct super_block *sb) + put_pid_ns(ns); + } + +-static struct file_system_type proc_fs_type = { ++struct file_system_type proc_fs_type = { + .name = "proc", + .get_sb = proc_get_sb, + .kill_sb = proc_kill_sb, + }; ++EXPORT_SYMBOL(proc_fs_type); + + void __init proc_root_init(void) + { +@@ -110,6 +114,11 @@ void __init proc_root_init(void) + err = register_filesystem(&proc_fs_type); + if (err) + return; ++ ++#ifdef CONFIG_VE ++ get_ve0()->proc_root = &proc_root; ++#endif ++ + proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); + err = PTR_ERR(proc_mnt); + if (IS_ERR(proc_mnt)) { +@@ -117,16 +126,20 @@ void __init proc_root_init(void) + return; + } + ++#ifdef CONFIG_VE ++ get_ve0()->proc_mnt = proc_mnt; ++#endif ++ + 
proc_misc_init(); + + proc_net_init(); + + #ifdef CONFIG_SYSVIPC +- proc_mkdir("sysvipc", NULL); ++ proc_mkdir("sysvipc", &glob_proc_root); + #endif +- proc_mkdir("fs", NULL); ++ proc_mkdir("fs", &glob_proc_root); + proc_mkdir("driver", NULL); +- proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */ ++ proc_mkdir("fs/nfsd", &glob_proc_root); /* somewhere for the nfsd filesystem to be mounted */ + #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) + /* just give it a mountpoint */ + proc_mkdir("openprom", NULL); +@@ -211,6 +224,22 @@ struct proc_dir_entry proc_root = { + .parent = &proc_root, + }; + ++#ifdef CONFIG_VE ++struct proc_dir_entry glob_proc_root = { ++ .low_ino = PROC_ROOT_INO, ++ .namelen = 5, ++ .name = "/proc", ++ .mode = S_IFDIR | S_IRUGO | S_IXUGO, ++ .nlink = 2, ++ .count = ATOMIC_INIT(1), ++ .proc_iops = &proc_root_inode_operations, ++ .proc_fops = &proc_root_operations, ++ .parent = &glob_proc_root, ++}; ++ ++EXPORT_SYMBOL(glob_proc_root); ++#endif ++ + int pid_ns_prepare_proc(struct pid_namespace *ns) + { + struct vfsmount *mnt; +diff --git a/fs/quota.c b/fs/quota.c +index db1cc9f..e4fe2a6 100644 +--- a/fs/quota.c ++++ b/fs/quota.c +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + + /* Check validity of generic quotactl commands */ + static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id) +@@ -81,11 +82,11 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid + if (cmd == Q_GETQUOTA) { + if (((type == USRQUOTA && current->euid != id) || + (type == GRPQUOTA && !in_egroup_p(id))) && +- !capable(CAP_SYS_ADMIN)) ++ !capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + } + else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + + return 0; +@@ -132,10 +133,10 @@ static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t i + if (cmd == Q_XGETQUOTA) { + if (((type == XQM_USRQUOTA && current->euid != id) || + (type == XQM_GRPQUOTA && !in_egroup_p(id))) && +- !capable(CAP_SYS_ADMIN)) ++ !capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) { +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + } + +@@ -177,6 +178,8 @@ static void quota_sync_sb(struct super_block *sb, int type) + continue; + if (!sb_has_quota_enabled(sb, cnt)) + continue; ++ if (!sb_dqopt(sb)->files[cnt]) ++ continue; + mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA); + truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); + mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex); +@@ -207,7 +210,7 @@ restart: + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); +- if (sb->s_root && sb->s_qcop->quota_sync) ++ if (sb->s_root && sb->s_qcop && sb->s_qcop->quota_sync) + quota_sync_sb(sb, type); + up_read(&sb->s_umount); + spin_lock(&sb_lock); +@@ -338,6 +341,7 @@ static inline struct super_block *quotactl_block(const char __user *special) + struct block_device *bdev; + struct super_block *sb; + char *tmp = getname(special); ++ int error; + + if (IS_ERR(tmp)) + return ERR_CAST(tmp); +@@ -345,6 +349,13 @@ static inline struct super_block *quotactl_block(const char __user *special) + putname(tmp); + if (IS_ERR(bdev)) + return ERR_CAST(bdev); ++ ++ error = devcgroup_inode_permission(bdev->bd_inode, MAY_QUOTACTL); ++ if (error) { ++ bdput(bdev); ++ return ERR_PTR(error); ++ } 
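[ The ordering in the quotactl_block() hunk above is the point: the device-cgroup check runs before get_super(), so a container that may not touch the block device never reaches its superblock at all. MAY_QUOTACTL is a permission bit introduced elsewhere in this patch. From inside a VE whose device cgroup denies the device, an unmodified quota tool should now fail cleanly, roughly like this (device path illustrative):

    #include <sys/types.h>
    #include <sys/quota.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t fmt;

            if (quotactl(QCMD(Q_GETFMT, USRQUOTA), "/dev/sda1", 0,
                         (caddr_t)&fmt) < 0)
                    perror("quotactl");   /* typically EPERM when denied */
            return 0;
    }
]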
++ + sb = get_super(bdev); + bdput(bdev); + if (!sb) +@@ -356,6 +367,215 @@ static inline struct super_block *quotactl_block(const char __user *special) + #endif + } + ++#ifdef CONFIG_QUOTA_COMPAT ++ ++#define QC_QUOTAON 0x0100 /* enable quotas */ ++#define QC_QUOTAOFF 0x0200 /* disable quotas */ ++/* GETQUOTA, SETQUOTA and SETUSE which were at 0x0300-0x0500 has now other parameteres */ ++#define QC_SYNC 0x0600 /* sync disk copy of a filesystems quotas */ ++#define QC_SETQLIM 0x0700 /* set limits */ ++/* GETSTATS at 0x0800 is now longer... */ ++#define QC_GETINFO 0x0900 /* get info about quotas - graces, flags... */ ++#define QC_SETINFO 0x0A00 /* set info about quotas */ ++#define QC_SETGRACE 0x0B00 /* set inode and block grace */ ++#define QC_SETFLAGS 0x0C00 /* set flags for quota */ ++#define QC_GETQUOTA 0x0D00 /* get limits and usage */ ++#define QC_SETQUOTA 0x0E00 /* set limits and usage */ ++#define QC_SETUSE 0x0F00 /* set usage */ ++/* 0x1000 used by old RSQUASH */ ++#define QC_GETSTATS 0x1100 /* get collected stats */ ++ ++struct compat_dqblk { ++ unsigned int dqb_ihardlimit; ++ unsigned int dqb_isoftlimit; ++ unsigned int dqb_curinodes; ++ unsigned int dqb_bhardlimit; ++ unsigned int dqb_bsoftlimit; ++ qsize_t dqb_curspace; ++ __kernel_time_t dqb_btime; ++ __kernel_time_t dqb_itime; ++}; ++ ++struct compat_dqinfo { ++ unsigned int dqi_bgrace; ++ unsigned int dqi_igrace; ++ unsigned int dqi_flags; ++ unsigned int dqi_blocks; ++ unsigned int dqi_free_blk; ++ unsigned int dqi_free_entry; ++}; ++ ++struct compat_dqstats { ++ __u32 lookups; ++ __u32 drops; ++ __u32 reads; ++ __u32 writes; ++ __u32 cache_hits; ++ __u32 allocated_dquots; ++ __u32 free_dquots; ++ __u32 syncs; ++ __u32 version; ++}; ++ ++asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr); ++static long compat_quotactl(unsigned int cmds, unsigned int type, ++ const char __user *special, qid_t id, ++ void __user *addr) ++{ ++ struct super_block *sb; ++ long ret; ++ ++ sb = NULL; ++ switch (cmds) { ++ case QC_QUOTAON: ++ return sys_quotactl(QCMD(Q_QUOTAON, type), ++ special, id, addr); ++ ++ case QC_QUOTAOFF: ++ return sys_quotactl(QCMD(Q_QUOTAOFF, type), ++ special, id, addr); ++ ++ case QC_SYNC: ++ return sys_quotactl(QCMD(Q_SYNC, type), ++ special, id, addr); ++ ++ case QC_GETQUOTA: { ++ struct if_dqblk idq; ++ struct compat_dqblk cdq; ++ ++ sb = quotactl_block(special); ++ ret = PTR_ERR(sb); ++ if (IS_ERR(sb)) ++ break; ++ ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id); ++ if (ret) ++ break; ++ ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); ++ if (ret) ++ break; ++ cdq.dqb_ihardlimit = idq.dqb_ihardlimit; ++ cdq.dqb_isoftlimit = idq.dqb_isoftlimit; ++ cdq.dqb_curinodes = idq.dqb_curinodes; ++ cdq.dqb_bhardlimit = idq.dqb_bhardlimit; ++ cdq.dqb_bsoftlimit = idq.dqb_bsoftlimit; ++ cdq.dqb_curspace = idq.dqb_curspace; ++ cdq.dqb_btime = idq.dqb_btime; ++ cdq.dqb_itime = idq.dqb_itime; ++ ret = 0; ++ if (copy_to_user(addr, &cdq, sizeof(cdq))) ++ ret = -EFAULT; ++ break; ++ } ++ ++ case QC_SETQUOTA: ++ case QC_SETUSE: ++ case QC_SETQLIM: { ++ struct if_dqblk idq; ++ struct compat_dqblk cdq; ++ ++ sb = quotactl_block(special); ++ ret = PTR_ERR(sb); ++ if (IS_ERR(sb)) ++ break; ++ ret = check_quotactl_valid(sb, type, Q_SETQUOTA, id); ++ if (ret) ++ break; ++ ret = -EFAULT; ++ if (copy_from_user(&cdq, addr, sizeof(cdq))) ++ break; ++ idq.dqb_ihardlimit = cdq.dqb_ihardlimit; ++ idq.dqb_isoftlimit = cdq.dqb_isoftlimit; ++ idq.dqb_curinodes = cdq.dqb_curinodes; ++ 
idq.dqb_bhardlimit = cdq.dqb_bhardlimit; ++ idq.dqb_bsoftlimit = cdq.dqb_bsoftlimit; ++ idq.dqb_curspace = cdq.dqb_curspace; ++ idq.dqb_valid = 0; ++ if (cmds == QC_SETQUOTA || cmds == QC_SETQLIM) ++ idq.dqb_valid |= QIF_LIMITS; ++ if (cmds == QC_SETQUOTA || cmds == QC_SETUSE) ++ idq.dqb_valid |= QIF_USAGE; ++ ret = sb->s_qcop->set_dqblk(sb, type, id, &idq); ++ break; ++ } ++ ++ case QC_GETINFO: { ++ struct if_dqinfo iinf; ++ struct compat_dqinfo cinf; ++ ++ sb = quotactl_block(special); ++ ret = PTR_ERR(sb); ++ if (IS_ERR(sb)) ++ break; ++ ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id); ++ if (ret) ++ break; ++ ret = sb->s_qcop->get_info(sb, type, &iinf); ++ if (ret) ++ break; ++ cinf.dqi_bgrace = iinf.dqi_bgrace; ++ cinf.dqi_igrace = iinf.dqi_igrace; ++ cinf.dqi_flags = 0; ++ if (iinf.dqi_flags & DQF_INFO_DIRTY) ++ cinf.dqi_flags |= 0x0010; ++ cinf.dqi_blocks = 0; ++ cinf.dqi_free_blk = 0; ++ cinf.dqi_free_entry = 0; ++ ret = 0; ++ if (copy_to_user(addr, &cinf, sizeof(cinf))) ++ ret = -EFAULT; ++ break; ++ } ++ ++ case QC_SETINFO: ++ case QC_SETGRACE: ++ case QC_SETFLAGS: { ++ struct if_dqinfo iinf; ++ struct compat_dqinfo cinf; ++ ++ sb = quotactl_block(special); ++ ret = PTR_ERR(sb); ++ if (IS_ERR(sb)) ++ break; ++ ret = check_quotactl_valid(sb, type, Q_SETINFO, id); ++ if (ret) ++ break; ++ ret = -EFAULT; ++ if (copy_from_user(&cinf, addr, sizeof(cinf))) ++ break; ++ iinf.dqi_bgrace = cinf.dqi_bgrace; ++ iinf.dqi_igrace = cinf.dqi_igrace; ++ iinf.dqi_flags = cinf.dqi_flags; ++ iinf.dqi_valid = 0; ++ if (cmds == QC_SETINFO || cmds == QC_SETGRACE) ++ iinf.dqi_valid |= IIF_BGRACE | IIF_IGRACE; ++ if (cmds == QC_SETINFO || cmds == QC_SETFLAGS) ++ iinf.dqi_valid |= IIF_FLAGS; ++ ret = sb->s_qcop->set_info(sb, type, &iinf); ++ break; ++ } ++ ++ case QC_GETSTATS: { ++ struct compat_dqstats stat; ++ ++ memset(&stat, 0, sizeof(stat)); ++ stat.version = 6*10000+5*100+0; ++ ret = 0; ++ if (copy_to_user(addr, &stat, sizeof(stat))) ++ ret = -EFAULT; ++ break; ++ } ++ ++ default: ++ ret = -ENOSYS; ++ break; ++ } ++ if (sb && !IS_ERR(sb)) ++ drop_super(sb); ++ return ret; ++} ++ ++#endif ++ + /* + * This is the system call interface. This communicates with + * the user-level programs. 
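[ The dispatch added below routes a whole window of command numbers, cmds in [0x0100, 0x3000), to compat_quotactl(). That window covers exactly the old-style QC_* codes tabulated above, while the current Q_* commands decode far above it (Q_SYNC is 0x800001 and up in 2.6), so the two namespaces cannot collide. A quick check of the predicate, with the two sample values taken from those tables:

    #include <assert.h>

    static int is_compat_cmd(unsigned int cmds)
    {
            return cmds >= 0x0100 && cmds < 0x3000;
    }

    int main(void)
    {
            assert(is_compat_cmd(0x0D00));      /* QC_GETQUOTA: compat   */
            assert(!is_compat_cmd(0x800007));   /* Q_GETQUOTA: new-style */
            return 0;
    }
]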
Currently this only supports diskquota +@@ -371,6 +591,11 @@ asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t + cmds = cmd >> SUBCMDSHIFT; + type = cmd & SUBCMDMASK; + ++#ifdef CONFIG_QUOTA_COMPAT ++ if (cmds >= 0x0100 && cmds < 0x3000) ++ return compat_quotactl(cmds, type, special, id, addr); ++#endif ++ + if (cmds != Q_SYNC || special) { + sb = quotactl_block(special); + if (IS_ERR(sb)) +diff --git a/fs/read_write.c b/fs/read_write.c +index f0d1240..b9fbf10 100644 +--- a/fs/read_write.c ++++ b/fs/read_write.c +@@ -21,6 +21,8 @@ + #include + #include + ++#include ++ + const struct file_operations generic_ro_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, +@@ -350,6 +352,29 @@ static inline void file_pos_write(struct file *file, loff_t pos) + file->f_pos = pos; + } + ++static inline void bc_acct_write(size_t bytes) ++{ ++ struct user_beancounter *ub; ++ ++ if (bytes > 0) { ++ ub = get_exec_ub(); ++ ub_percpu_inc(ub, write); ++ ub_percpu_add(ub, wchar, bytes); ++ } ++} ++ ++static inline void bc_acct_read(size_t bytes) ++{ ++ struct user_beancounter *ub; ++ ++ if (bytes > 0) { ++ ub = get_exec_ub(); ++ ub_percpu_inc(ub, read); ++ ub_percpu_add(ub, rchar, bytes); ++ } ++} ++ ++ + asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) + { + struct file *file; +@@ -362,6 +387,8 @@ asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) + ret = vfs_read(file, buf, count, &pos); + file_pos_write(file, pos); + fput_light(file, fput_needed); ++ ++ bc_acct_read(ret); + } + + return ret; +@@ -379,6 +406,8 @@ asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t co + ret = vfs_write(file, buf, count, &pos); + file_pos_write(file, pos); + fput_light(file, fput_needed); ++ ++ bc_acct_write(ret); + } + + return ret; +@@ -400,6 +429,8 @@ asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf, + if (file->f_mode & FMODE_PREAD) + ret = vfs_read(file, buf, count, &pos); + fput_light(file, fput_needed); ++ ++ bc_acct_read(ret); + } + + return ret; +@@ -421,6 +452,8 @@ asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf, + if (file->f_mode & FMODE_PWRITE) + ret = vfs_write(file, buf, count, &pos); + fput_light(file, fput_needed); ++ ++ bc_acct_write(ret); + } + + return ret; +@@ -666,6 +699,8 @@ sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) + ret = vfs_readv(file, vec, vlen, &pos); + file_pos_write(file, pos); + fput_light(file, fput_needed); ++ ++ bc_acct_read(ret); + } + + if (ret > 0) +@@ -687,6 +722,8 @@ sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen) + ret = vfs_writev(file, vec, vlen, &pos); + file_pos_write(file, pos); + fput_light(file, fput_needed); ++ ++ bc_acct_write(ret); + } + + if (ret > 0) +diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c +index c1add28..3ca5049 100644 +--- a/fs/reiserfs/namei.c ++++ b/fs/reiserfs/namei.c +@@ -859,6 +859,9 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + ++ inode = dentry->d_inode; ++ DQUOT_INIT(inode); ++ + /* we will be doing 2 balancings and update 2 stat data, we change quotas + * of the owner of the directory and of the owner of the parent directory. 
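[ The reiserfs hunks here hoist DQUOT_INIT(inode) on the victim inode to the very top of rmdir (and add it to unlink and rename), before any journal transaction is opened -- presumably because dquot initialization can read the quota file and block, which must not happen once the transaction has started. The resulting shape, sketched:

    /* ordering these hunks establish */
    inode = dentry->d_inode;
    DQUOT_INIT(inode);          /* may do quota-file I/O: do it now */

    /* ...only then journal_begin(), remove the entry, journal_end()... */
]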
+ * The quota structure is possibly deleted only on last iput => outside +@@ -883,8 +886,6 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) + goto end_rmdir; + } + +- inode = dentry->d_inode; +- + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + +@@ -947,6 +948,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) + unsigned long savelink; + + inode = dentry->d_inode; ++ DQUOT_INIT(inode); + + /* in this transaction we can be doing at max two balancings and update + * two stat datas, we change quotas of the owner of the directory and of +@@ -1254,6 +1256,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, + + old_inode = old_dentry->d_inode; + new_dentry_inode = new_dentry->d_inode; ++ if (new_dentry_inode) ++ DQUOT_INIT(new_dentry_inode); + + // make sure, that oldname still exists and points to an object we + // are going to rename +diff --git a/fs/select.c b/fs/select.c +index da0e882..e0eb1cd 100644 +--- a/fs/select.c ++++ b/fs/select.c +@@ -27,6 +27,8 @@ + + #include + ++#include ++ + struct poll_table_page { + struct poll_table_page * next; + struct poll_table_entry * entry; +@@ -332,7 +334,8 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, + if (size > sizeof(stack_fds) / 6) { + /* Not enough space in on-stack array; must use kmalloc */ + ret = -ENOMEM; +- bits = kmalloc(6 * size, GFP_KERNEL); ++ bits = kmalloc(6 * size, size > PAGE_SIZE / 6 ? ++ GFP_KERNEL_UBC : GFP_KERNEL); + if (!bits) + goto out_nofds; + } +@@ -678,7 +681,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout) + + len = min(todo, POLLFD_PER_PAGE); + size = sizeof(struct poll_list) + sizeof(struct pollfd) * len; +- walk = walk->next = kmalloc(size, GFP_KERNEL); ++ walk = walk->next = kmalloc(size, GFP_KERNEL_UBC); + if (!walk) { + err = -ENOMEM; + goto out_fds; +@@ -710,7 +713,7 @@ out_fds: + return err; + } + +-static long do_restart_poll(struct restart_block *restart_block) ++long do_restart_poll(struct restart_block *restart_block) + { + struct pollfd __user *ufds = (struct pollfd __user*)restart_block->arg0; + int nfds = restart_block->arg1; +@@ -726,6 +729,7 @@ static long do_restart_poll(struct restart_block *restart_block) + } + return ret; + } ++EXPORT_SYMBOL_GPL(do_restart_poll); + + asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, + long timeout_msecs) +diff --git a/fs/seq_file.c b/fs/seq_file.c +index 3f54dbd..4d8b86a 100644 +--- a/fs/seq_file.c ++++ b/fs/seq_file.c +@@ -32,7 +32,7 @@ int seq_open(struct file *file, const struct seq_operations *op) + struct seq_file *p = file->private_data; + + if (!p) { +- p = kmalloc(sizeof(*p), GFP_KERNEL); ++ p = kmalloc(sizeof(*p), GFP_KERNEL_UBC); + if (!p) + return -ENOMEM; + file->private_data = p; +@@ -87,7 +87,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) + m->version = file->f_version; + /* grab buffer if we didn't have one */ + if (!m->buf) { +- m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); ++ m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL_UBC); + if (!m->buf) + goto Enomem; + } +@@ -123,7 +123,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) + goto Fill; + m->op->stop(m, p); + kfree(m->buf); +- m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); ++ m->buf = kmalloc(m->size <<= 1, GFP_KERNEL_UBC); + if (!m->buf) + goto Enomem; + m->count = 0; +@@ -193,7 +193,7 @@ static int traverse(struct seq_file *m, 
loff_t offset) + return 0; + } + if (!m->buf) { +- m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); ++ m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL_UBC); + if (!m->buf) + return -ENOMEM; + } +@@ -232,7 +232,7 @@ static int traverse(struct seq_file *m, loff_t offset) + Eoverflow: + m->op->stop(m, p); + kfree(m->buf); +- m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); ++ m->buf = kmalloc(m->size <<= 1, GFP_KERNEL_UBC); + return !m->buf ? -ENOMEM : -EAGAIN; + } + +@@ -378,6 +378,8 @@ int seq_path(struct seq_file *m, struct path *path, char *esc) + if (m->count < m->size) { + char *s = m->buf + m->count; + char *p = d_path(path, s, m->size - m->count); ++ if (IS_ERR(p) && PTR_ERR(p) != -ENAMETOOLONG) ++ return 0; + if (!IS_ERR(p)) { + s = mangle_path(s, p, esc); + if (s) { +@@ -461,7 +463,7 @@ static void single_stop(struct seq_file *p, void *v) + int single_open(struct file *file, int (*show)(struct seq_file *, void *), + void *data) + { +- struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL); ++ struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_UBC); + int res = -ENOMEM; + + if (op) { +@@ -505,7 +507,7 @@ void *__seq_open_private(struct file *f, const struct seq_operations *ops, + void *private; + struct seq_file *seq; + +- private = kzalloc(psize, GFP_KERNEL); ++ private = kzalloc(psize, GFP_KERNEL_UBC); + if (private == NULL) + goto out; + +diff --git a/fs/simfs.c b/fs/simfs.c +new file mode 100644 +index 0000000..366a3ed +--- /dev/null ++++ b/fs/simfs.c +@@ -0,0 +1,332 @@ ++/* ++ * fs/simfs.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#define SIMFS_GET_LOWER_FS_SB(sb) sb->s_root->d_sb ++ ++static struct super_operations sim_super_ops; ++ ++static int sim_getattr(struct vfsmount *mnt, struct dentry *dentry, ++ struct kstat *stat) ++{ ++ struct super_block *sb; ++ struct inode *inode; ++ ++ inode = dentry->d_inode; ++ if (!inode->i_op->getattr) { ++ generic_fillattr(inode, stat); ++ if (!stat->blksize) { ++ unsigned blocks; ++ ++ sb = inode->i_sb; ++ blocks = (stat->size + sb->s_blocksize-1) >> ++ sb->s_blocksize_bits; ++ stat->blocks = (sb->s_blocksize / 512) * blocks; ++ stat->blksize = sb->s_blocksize; ++ } ++ } else { ++ int err; ++ ++ err = inode->i_op->getattr(mnt, dentry, stat); ++ if (err) ++ return err; ++ } ++ ++ sb = mnt->mnt_sb; ++ if (sb->s_op == &sim_super_ops) ++ stat->dev = sb->s_dev; ++ return 0; ++} ++ ++static void quota_get_stat(struct super_block *sb, struct kstatfs *buf) ++{ ++ int err; ++ struct dq_stat qstat; ++ struct virt_info_quota q; ++ long free_file, adj_file; ++ s64 blk, free_blk, adj_blk; ++ int bsize_bits; ++ ++ q.super = sb; ++ q.qstat = &qstat; ++ err = virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_GETSTAT, &q); ++ if (err != NOTIFY_OK) ++ return; ++ ++ bsize_bits = ffs(buf->f_bsize) - 1; ++ ++ if (qstat.bsoftlimit > qstat.bcurrent) ++ free_blk = (qstat.bsoftlimit - qstat.bcurrent) >> bsize_bits; ++ else ++ free_blk = 0; ++ /* ++ * In the regular case, we always set buf->f_bfree and buf->f_blocks to ++ * the values reported by quota. In case of real disk space shortage, ++ * we adjust the values. We want this adjustment to look as if the ++ * total disk space were reduced, not as if the usage were increased. 
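[ Concretely: suppose the quota allows 1000 blocks, 900 are charged, and the host filesystem only has 60 blocks really free. Quota alone would report 100 free, overstating reality, so the code clips f_bfree to 60 and shrinks f_blocks by the same 40-block adjustment to 960; the implied usage, 960 - 60, still reads as the honest 900. A standalone check of that arithmetic:

    #include <assert.h>

    int main(void)
    {
            long soft = 1000, cur = 900;    /* quota: 100 blocks left   */
            long f_bfree = 60;              /* disk: only 60 truly free */
            long free_blk = soft - cur;

            long adj = f_bfree < free_blk ? free_blk - f_bfree : 0;
            long bfree  = free_blk - adj;   /* 60  */
            long blocks = soft - adj;       /* 960 */

            assert(bfree == 60 && blocks == 960);
            assert(blocks - bfree == cur);  /* usage still reads 900 */
            return 0;
    }
]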
++ * -- SAW ++ */ ++ adj_blk = 0; ++ if (buf->f_bfree < free_blk) ++ adj_blk = free_blk - buf->f_bfree; ++ buf->f_bfree = free_blk - adj_blk; ++ ++ if (free_blk < buf->f_bavail) ++ buf->f_bavail = free_blk; ++ ++ blk = (qstat.bsoftlimit >> bsize_bits) - adj_blk; ++ buf->f_blocks = blk > LONG_MAX ? LONG_MAX : blk; ++ ++ free_file = qstat.isoftlimit - qstat.icurrent; ++ if (free_file < 0) ++ free_file = 0; ++ if (buf->f_type == REISERFS_SUPER_MAGIC) ++ /* ++ * reiserfs doesn't initialize f_ffree and f_files values of ++ * kstatfs because it doesn't have an inode limit. ++ */ ++ buf->f_ffree = free_file; ++ adj_file = 0; ++ if (buf->f_ffree < free_file) ++ adj_file = free_file - buf->f_ffree; ++ buf->f_ffree = free_file - adj_file; ++ buf->f_files = qstat.isoftlimit - adj_file; ++} ++ ++static int sim_statfs(struct super_block *sb, struct kstatfs *buf) ++{ ++ int err; ++ struct super_block *lsb; ++ struct kstatfs statbuf; ++ ++ err = 0; ++ if (sb->s_op != &sim_super_ops) ++ return 0; ++ ++ memset(&statbuf, 0, sizeof(statbuf)); ++ lsb = SIMFS_GET_LOWER_FS_SB(sb); ++ ++ err = -ENOSYS; ++ if (lsb && lsb->s_op && lsb->s_op->statfs) ++ err = lsb->s_op->statfs(lsb->s_root, &statbuf); ++ if (err) ++ return err; ++ ++ quota_get_stat(sb, &statbuf); ++ ++ buf->f_files = statbuf.f_files; ++ buf->f_ffree = statbuf.f_ffree; ++ buf->f_blocks = statbuf.f_blocks; ++ buf->f_bfree = statbuf.f_bfree; ++ buf->f_bavail = statbuf.f_bavail; ++ return 0; ++} ++ ++static int sim_systemcall(struct vnotifier_block *me, unsigned long n, ++ void *d, int old_ret) ++{ ++ int err; ++ ++ switch (n) { ++ case VIRTINFO_FAUDIT_STAT: { ++ struct faudit_stat_arg *arg; ++ ++ arg = (struct faudit_stat_arg *)d; ++ err = sim_getattr(arg->mnt, arg->dentry, arg->stat); ++ arg->err = err; ++ } ++ break; ++ case VIRTINFO_FAUDIT_STATFS: { ++ struct faudit_statfs_arg *arg; ++ ++ arg = (struct faudit_statfs_arg *)d; ++ err = sim_statfs(arg->sb, arg->stat); ++ arg->err = err; ++ } ++ break; ++ default: ++ return old_ret; ++ } ++ return (err ? NOTIFY_BAD : NOTIFY_OK); ++} ++ ++static struct inode *sim_quota_root(struct super_block *sb) ++{ ++ return sb->s_root->d_inode; ++} ++ ++/* ++ * NOTE: We need to setup s_bdev field on super block, since sys_quotactl() ++ * does lookup_bdev() and get_super() which are comparing sb->s_bdev. 
++ * so this is a MUST if we want unmodified sys_quotactl ++ * to work correctly on /dev/simfs inside VE ++ */ ++static int sim_init_blkdev(struct super_block *sb) ++{ ++ static struct hd_struct fake_hd; ++ struct block_device *blkdev; ++ ++ blkdev = bdget(sb->s_dev); ++ if (blkdev == NULL) ++ return -ENOMEM; ++ ++ blkdev->bd_part = &fake_hd; /* required for bdev_read_only() */ ++ sb->s_bdev = blkdev; ++ ++ return 0; ++} ++ ++static void sim_free_blkdev(struct super_block *sb) ++{ ++ /* set bd_part back to NULL */ ++ sb->s_bdev->bd_part = NULL; ++ bdput(sb->s_bdev); ++} ++ ++static void sim_quota_init(struct super_block *sb) ++{ ++ struct virt_info_quota viq; ++ ++ viq.super = sb; ++ virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_ON, &viq); ++} ++ ++static void sim_quota_free(struct super_block *sb) ++{ ++ struct virt_info_quota viq; ++ ++ viq.super = sb; ++ virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_OFF, &viq); ++} ++ ++static struct super_operations sim_super_ops = { ++ .get_quota_root = sim_quota_root, ++}; ++ ++static int sim_fill_super(struct super_block *s, void *data) ++{ ++ int err; ++ struct nameidata *nd; ++ ++ err = set_anon_super(s, NULL); ++ if (err) ++ goto out; ++ ++ err = 0; ++ nd = (struct nameidata *)data; ++ s->s_fs_info = mntget(nd->path.mnt); ++ s->s_root = dget(nd->path.dentry); ++ s->s_op = &sim_super_ops; ++out: ++ return err; ++} ++ ++static int sim_get_sb(struct file_system_type *type, int flags, ++ const char *dev_name, void *opt, struct vfsmount *mnt) ++{ ++ int err; ++ struct nameidata nd; ++ struct super_block *sb; ++ ++ err = -EINVAL; ++ if (opt == NULL) ++ goto out; ++ ++ err = path_lookup(opt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); ++ if (err) ++ goto out; ++ ++ sb = sget(type, NULL, sim_fill_super, &nd); ++ err = PTR_ERR(sb); ++ if (IS_ERR(sb)) ++ goto out_path; ++ ++ err = sim_init_blkdev(sb); ++ if (err) ++ goto out_killsb; ++ ++ sim_quota_init(sb); ++ ++ path_put(&nd.path); ++ return simple_set_mnt(mnt, sb); ++ ++out_killsb: ++ up_write(&sb->s_umount); ++ deactivate_super(sb); ++out_path: ++ path_put(&nd.path); ++out: ++ return err; ++} ++ ++static void sim_kill_sb(struct super_block *sb) ++{ ++ dput(sb->s_root); ++ sb->s_root = NULL; ++ mntput((struct vfsmount *)(sb->s_fs_info)); ++ ++ sim_quota_free(sb); ++ sim_free_blkdev(sb); ++ ++ kill_anon_super(sb); ++} ++ ++static struct file_system_type sim_fs_type = { ++ .owner = THIS_MODULE, ++ .name = "simfs", ++ .get_sb = sim_get_sb, ++ .kill_sb = sim_kill_sb, ++ .fs_flags = FS_MANGLE_PROC, ++}; ++ ++static struct vnotifier_block sim_syscalls = { ++ .notifier_call = sim_systemcall, ++}; ++ ++static int __init init_simfs(void) ++{ ++ int err; ++ ++ err = register_filesystem(&sim_fs_type); ++ if (err) ++ return err; ++ ++ virtinfo_notifier_register(VITYPE_FAUDIT, &sim_syscalls); ++ return 0; ++} ++ ++static void __exit exit_simfs(void) ++{ ++ virtinfo_notifier_unregister(VITYPE_FAUDIT, &sim_syscalls); ++ unregister_filesystem(&sim_fs_type); ++} ++ ++MODULE_AUTHOR("SWsoft "); ++MODULE_DESCRIPTION("Open Virtuozzo Simulation of File System"); ++MODULE_LICENSE("GPL v2"); ++ ++module_init(init_simfs); ++module_exit(exit_simfs); +diff --git a/fs/smbfs/sock.c b/fs/smbfs/sock.c +index e37fe4d..1992fc0 100644 +--- a/fs/smbfs/sock.c ++++ b/fs/smbfs/sock.c +@@ -99,6 +99,7 @@ smb_close_socket(struct smb_sb_info *server) + + VERBOSE("closing socket %p\n", sock); + sock->sk->sk_data_ready = server->data_ready; ++ sock->sk->sk_user_data = NULL; + server->sock_file = NULL; + fput(file); + } +diff --git 
a/fs/stat.c b/fs/stat.c +index 9cf41f7..4d53945 100644 +--- a/fs/stat.c ++++ b/fs/stat.c +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -41,11 +42,19 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) + { + struct inode *inode = dentry->d_inode; + int retval; ++ struct faudit_stat_arg arg; + + retval = security_inode_getattr(mnt, dentry); + if (retval) + return retval; + ++ arg.mnt = mnt; ++ arg.dentry = dentry; ++ arg.stat = stat; ++ if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STAT, &arg) ++ != NOTIFY_DONE) ++ return arg.err; ++ + if (inode->i_op->getattr) + return inode->i_op->getattr(mnt, dentry, stat); + +diff --git a/fs/super.c b/fs/super.c +index 453877c..55ce500 100644 +--- a/fs/super.c ++++ b/fs/super.c +@@ -38,6 +38,7 @@ + #include + #include + #include ++#include + #include + #include "internal.h" + +@@ -72,13 +73,15 @@ static struct super_block *alloc_super(struct file_system_type *type) + INIT_LIST_HEAD(&s->s_inodes); + init_rwsem(&s->s_umount); + mutex_init(&s->s_lock); +- lockdep_set_class(&s->s_umount, &type->s_umount_key); ++ lockdep_set_class(&s->s_umount, ++ &type->proto->s_umount_key); + /* + * The locking rules for s_lock are up to the + * filesystem. For example ext3fs has different + * lock ordering than usbfs: + */ +- lockdep_set_class(&s->s_lock, &type->s_lock_key); ++ lockdep_set_class(&s->s_lock, ++ &type->proto->s_lock_key); + down_write(&s->s_umount); + s->s_count = S_BIAS; + atomic_set(&s->s_active, 1); +@@ -303,7 +306,7 @@ void generic_shutdown_super(struct super_block *sb) + sop->put_super(sb); + + /* Forget any remaining inodes */ +- if (invalidate_inodes(sb)) { ++ if (invalidate_inodes_check(sb, 1)) { + printk("VFS: Busy inodes after unmount of %s. " + "Self-destruct in 5 seconds. 
Have a nice day...\n", + sb->s_id); +@@ -532,17 +535,26 @@ rescan: + spin_unlock(&sb_lock); + return NULL; + } ++EXPORT_SYMBOL(user_get_super); + + asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf) + { ++ dev_t kdev; + struct super_block *s; + struct ustat tmp; + struct kstatfs sbuf; +- int err = -EINVAL; ++ int err; ++ ++ kdev = new_decode_dev(dev); ++ err = get_device_perms_ve(S_IFBLK, kdev, FMODE_READ); ++ if (err) ++ goto out; ++ ++ err = -EINVAL; ++ s = user_get_super(kdev); ++ if (s == NULL) ++ goto out; + +- s = user_get_super(new_decode_dev(dev)); +- if (s == NULL) +- goto out; + err = vfs_statfs(s->s_root, &sbuf); + drop_super(s); + if (err) +@@ -684,6 +696,13 @@ void emergency_remount(void) + static struct idr unnamed_dev_idr; + static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ + ++/* for compatibility with coreutils still unaware of new minor sizes */ ++int unnamed_dev_majors[] = { ++ 0, 144, 145, 146, 242, 243, 244, 245, ++ 246, 247, 248, 249, 250, 251, 252, 253 ++}; ++EXPORT_SYMBOL(unnamed_dev_majors); ++ + int set_anon_super(struct super_block *s, void *data) + { + int dev; +@@ -701,13 +720,13 @@ int set_anon_super(struct super_block *s, void *data) + else if (error) + return -EAGAIN; + +- if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { ++ if ((dev & MAX_ID_MASK) >= (1 << MINORBITS)) { + spin_lock(&unnamed_dev_lock); + idr_remove(&unnamed_dev_idr, dev); + spin_unlock(&unnamed_dev_lock); + return -EMFILE; + } +- s->s_dev = MKDEV(0, dev & MINORMASK); ++ s->s_dev = make_unnamed_dev(dev); + return 0; + } + +@@ -715,8 +734,9 @@ EXPORT_SYMBOL(set_anon_super); + + void kill_anon_super(struct super_block *sb) + { +- int slot = MINOR(sb->s_dev); ++ int slot; + ++ slot = unnamed_dev_idx(sb->s_dev); + generic_shutdown_super(sb); + spin_lock(&unnamed_dev_lock); + idr_remove(&unnamed_dev_idr, slot); +diff --git a/fs/sync.c b/fs/sync.c +index 228e17b..32ad4fc 100644 +--- a/fs/sync.c ++++ b/fs/sync.c +@@ -14,6 +14,8 @@ + #include + #include + ++#include ++ + #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ + SYNC_FILE_RANGE_WAIT_AFTER) + +@@ -38,7 +40,14 @@ static void do_sync(unsigned long wait) + + asmlinkage long sys_sync(void) + { ++ struct user_beancounter *ub; ++ ++ ub = get_exec_ub(); ++ ub_percpu_inc(ub, sync); ++ + do_sync(1); ++ ++ ub_percpu_inc(ub, sync_done); + return 0; + } + +@@ -80,6 +89,7 @@ long do_fsync(struct file *file, int datasync) + int ret; + int err; + struct address_space *mapping = file->f_mapping; ++ struct user_beancounter *ub; + + if (!file->f_op || !file->f_op->fsync) { + /* Why? 
We can still call filemap_fdatawrite */ +@@ -87,6 +97,12 @@ long do_fsync(struct file *file, int datasync) + goto out; + } + ++ ub = get_exec_ub(); ++ if (datasync) ++ ub_percpu_inc(ub, fdsync); ++ else ++ ub_percpu_inc(ub, fsync); ++ + ret = filemap_fdatawrite(mapping); + + /* +@@ -101,6 +117,11 @@ long do_fsync(struct file *file, int datasync) + err = filemap_fdatawait(mapping); + if (!ret) + ret = err; ++ ++ if (datasync) ++ ub_percpu_inc(ub, fdsync_done); ++ else ++ ub_percpu_inc(ub, fsync_done); + out: + return ret; + } +@@ -251,12 +272,16 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset, + loff_t endbyte, unsigned int flags) + { + int ret; ++ struct user_beancounter *ub; + + if (!mapping) { + ret = -EINVAL; +- goto out; ++ goto out_noacct; + } + ++ ub = get_exec_ub(); ++ ub_percpu_inc(ub, frsync); ++ + ret = 0; + if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { + ret = wait_on_page_writeback_range(mapping, +@@ -279,6 +304,8 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset, + endbyte >> PAGE_CACHE_SHIFT); + } + out: ++ ub_percpu_inc(ub, frsync_done); ++out_noacct: + return ret; + } + EXPORT_SYMBOL_GPL(do_sync_mapping_range); +diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c +index 006fc64..9aec999 100644 +--- a/fs/sysfs/bin.c ++++ b/fs/sysfs/bin.c +@@ -177,6 +177,9 @@ static int open(struct inode * inode, struct file * file) + struct bin_buffer *bb = NULL; + int error; + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + /* binary file operations requires both @sd and its parent */ + if (!sysfs_get_active_two(attr_sd)) + return -ENODEV; +@@ -238,6 +241,9 @@ const struct file_operations bin_fops = { + + int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) + { ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + BUG_ON(!kobj || !kobj->sd || !attr); + + return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR); +@@ -252,6 +258,8 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) + + void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) + { ++ if (!ve_sysfs_alowed()) ++ return; + sysfs_hash_and_remove(kobj->sd, attr->attr.name); + } + +diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c +index 8c0e4b9..38c93f2 100644 +--- a/fs/sysfs/dir.c ++++ b/fs/sysfs/dir.c +@@ -478,6 +478,9 @@ static void sysfs_drop_dentry(struct sysfs_dirent *sd) + struct inode *inode; + struct dentry *dentry; + ++ if (!ve_sysfs_alowed()) ++ return; ++ + inode = ilookup(sysfs_sb, sd->s_ino); + if (!inode) + return; +@@ -649,12 +652,15 @@ int sysfs_create_dir(struct kobject * kobj) + struct sysfs_dirent *parent_sd, *sd; + int error = 0; + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + BUG_ON(!kobj); + + if (kobj->parent) + parent_sd = kobj->parent->sd; + else +- parent_sd = &sysfs_root; ++ parent_sd = ve_sysfs_root; + + error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd); + if (!error) +@@ -755,6 +761,9 @@ void sysfs_remove_dir(struct kobject * kobj) + { + struct sysfs_dirent *sd = kobj->sd; + ++ if (!ve_sysfs_alowed()) ++ return; ++ + spin_lock(&sysfs_assoc_lock); + kobj->sd = NULL; + spin_unlock(&sysfs_assoc_lock); +@@ -770,6 +779,9 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name) + const char *dup_name = NULL; + int error; + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + mutex_lock(&sysfs_rename_mutex); + + error = 0; +@@ -838,7 +850,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj) + + mutex_lock(&sysfs_rename_mutex); + BUG_ON(!sd->s_parent); +- 
new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; ++ new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : ve_sysfs_root; + + error = 0; + if (sd->s_parent == new_parent_sd) +diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c +index e7735f6..fed6ceb 100644 +--- a/fs/sysfs/file.c ++++ b/fs/sysfs/file.c +@@ -516,6 +516,8 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, + + int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) + { ++ if (!ve_sysfs_alowed()) ++ return 0; + BUG_ON(!kobj || !kobj->sd || !attr); + + return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR); +@@ -612,6 +614,8 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); + + void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) + { ++ if (!ve_sysfs_alowed()) ++ return; + sysfs_hash_and_remove(kobj->sd, attr->name); + } + +diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c +index eeba384..c302cbe 100644 +--- a/fs/sysfs/group.c ++++ b/fs/sysfs/group.c +@@ -62,6 +62,8 @@ static int internal_create_group(struct kobject *kobj, int update, + struct sysfs_dirent *sd; + int error; + ++ if (!ve_sysfs_alowed()) ++ return 0; + BUG_ON(!kobj || (!update && !kobj->sd)); + + /* Updates may happen before the object has been instantiated */ +@@ -131,6 +133,9 @@ void sysfs_remove_group(struct kobject * kobj, + struct sysfs_dirent *dir_sd = kobj->sd; + struct sysfs_dirent *sd; + ++ if (!ve_sysfs_alowed()) ++ return; ++ + if (grp->name) { + sd = sysfs_get_dirent(dir_sd, grp->name); + if (!sd) { +diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c +index eb53c63..a09bfa5 100644 +--- a/fs/sysfs/inode.c ++++ b/fs/sysfs/inode.c +@@ -20,8 +20,6 @@ + #include + #include "sysfs.h" + +-extern struct super_block * sysfs_sb; +- + static const struct address_space_operations sysfs_aops = { + .readpage = simple_readpage, + .write_begin = simple_write_begin, +diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c +index 14f0023..974bf82 100644 +--- a/fs/sysfs/mount.c ++++ b/fs/sysfs/mount.c +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + + #include "sysfs.h" +@@ -22,8 +23,11 @@ + /* Random magic number */ + #define SYSFS_MAGIC 0x62656572 + +-static struct vfsmount *sysfs_mount; ++#ifndef CONFIG_VE ++struct vfsmount *sysfs_mount; + struct super_block * sysfs_sb = NULL; ++#endif ++ + struct kmem_cache *sysfs_dir_cachep; + + static const struct super_operations sysfs_ops = { +@@ -39,6 +43,13 @@ struct sysfs_dirent sysfs_root = { + .s_ino = 1, + }; + ++static void init_ve0_sysfs_root(void) ++{ ++#ifdef CONFIG_VE ++ get_ve0()->_sysfs_root = &sysfs_root; ++#endif ++} ++ + static int sysfs_fill_super(struct super_block *sb, void *data, int silent) + { + struct inode *inode; +@@ -52,7 +63,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) + sysfs_sb = sb; + + /* get root inode, initialize and unlock it */ +- inode = sysfs_get_inode(&sysfs_root); ++ inode = sysfs_get_inode(ve_sysfs_root); + if (!inode) { + pr_debug("sysfs: could not get root inode\n"); + return -ENOMEM; +@@ -65,7 +76,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) + iput(inode); + return -ENOMEM; + } +- root->d_fsdata = &sysfs_root; ++ root->d_fsdata = ve_sysfs_root; + sb->s_root = root; + return 0; + } +@@ -76,16 +87,19 @@ static int sysfs_get_sb(struct file_system_type *fs_type, + return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); + } + +-static struct file_system_type sysfs_fs_type = { ++struct file_system_type 
sysfs_fs_type = { + .name = "sysfs", + .get_sb = sysfs_get_sb, + .kill_sb = kill_anon_super, + }; + ++EXPORT_SYMBOL(sysfs_fs_type); ++ + int __init sysfs_init(void) + { + int err = -ENOMEM; + ++ init_ve0_sysfs_root(); + sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", + sizeof(struct sysfs_dirent), + 0, 0, NULL); +diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c +index 817f596..1e3e7e7 100644 +--- a/fs/sysfs/symlink.c ++++ b/fs/sysfs/symlink.c +@@ -33,10 +33,13 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char + struct sysfs_addrm_cxt acxt; + int error; + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + BUG_ON(!name); + + if (!kobj) +- parent_sd = &sysfs_root; ++ parent_sd = ve_sysfs_root; + else + parent_sd = kobj->sd; + +@@ -89,8 +92,11 @@ void sysfs_remove_link(struct kobject * kobj, const char * name) + { + struct sysfs_dirent *parent_sd = NULL; + ++ if(!ve_sysfs_alowed()) ++ return; ++ + if (!kobj) +- parent_sd = &sysfs_root; ++ parent_sd = ve_sysfs_root; + else + parent_sd = kobj->sd; + +diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h +index ce4e15f..11e8464 100644 +--- a/fs/sysfs/sysfs.h ++++ b/fs/sysfs/sysfs.h +@@ -8,67 +8,17 @@ + * This file is released under the GPLv2. + */ + +-struct sysfs_open_dirent; +- +-/* type-specific structures for sysfs_dirent->s_* union members */ +-struct sysfs_elem_dir { +- struct kobject *kobj; +- /* children list starts here and goes through sd->s_sibling */ +- struct sysfs_dirent *children; +-}; +- +-struct sysfs_elem_symlink { +- struct sysfs_dirent *target_sd; +-}; +- +-struct sysfs_elem_attr { +- struct attribute *attr; +- struct sysfs_open_dirent *open; +-}; +- +-struct sysfs_elem_bin_attr { +- struct bin_attribute *bin_attr; +-}; +- +-/* +- * sysfs_dirent - the building block of sysfs hierarchy. Each and +- * every sysfs node is represented by single sysfs_dirent. +- * +- * As long as s_count reference is held, the sysfs_dirent itself is +- * accessible. Dereferencing s_elem or any other outer entity +- * requires s_active reference. 
+- */ +-struct sysfs_dirent { +- atomic_t s_count; +- atomic_t s_active; +- struct sysfs_dirent *s_parent; +- struct sysfs_dirent *s_sibling; +- const char *s_name; +- +- union { +- struct sysfs_elem_dir s_dir; +- struct sysfs_elem_symlink s_symlink; +- struct sysfs_elem_attr s_attr; +- struct sysfs_elem_bin_attr s_bin_attr; +- }; +- +- unsigned int s_flags; +- ino_t s_ino; +- umode_t s_mode; +- struct iattr *s_iattr; +-}; +- +-#define SD_DEACTIVATED_BIAS INT_MIN +- +-#define SYSFS_TYPE_MASK 0x00ff +-#define SYSFS_DIR 0x0001 +-#define SYSFS_KOBJ_ATTR 0x0002 +-#define SYSFS_KOBJ_BIN_ATTR 0x0004 +-#define SYSFS_KOBJ_LINK 0x0008 +-#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) +- +-#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK +-#define SYSFS_FLAG_REMOVED 0x0200 ++#ifndef CONFIG_VE ++extern struct vfsmount *sysfs_mount; ++extern struct super_block *sysfs_sb; ++#define ve_sysfs_alowed() 1 ++#else ++#include ++#include ++#define sysfs_mount (get_exec_env()->sysfs_mnt) ++#define sysfs_sb (get_exec_env()->sysfs_sb) ++#define ve_sysfs_alowed() (sysfs_sb != NULL) ++#endif + + static inline unsigned int sysfs_type(struct sysfs_dirent *sd) + { +@@ -88,8 +38,12 @@ struct sysfs_addrm_cxt { + /* + * mount.c + */ ++#ifdef CONFIG_VE ++#define ve_sysfs_root (get_exec_env()->_sysfs_root) ++#else + extern struct sysfs_dirent sysfs_root; +-extern struct super_block *sysfs_sb; ++#define ve_sysfs_root (&sysfs_root) ++#endif + extern struct kmem_cache *sysfs_dir_cachep; + + /* +diff --git a/fs/vzdq_file.c b/fs/vzdq_file.c +new file mode 100644 +index 0000000..4d814d9 +--- /dev/null ++++ b/fs/vzdq_file.c +@@ -0,0 +1,923 @@ ++/* ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * This file contains Virtuozzo quota files as proc entry implementation. ++ * It is required for std quota tools to work correctly as they are expecting ++ * aquota.user and aquota.group files. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++/* ---------------------------------------------------------------------- ++ * ++ * File read operation ++ * ++ * FIXME: functions in this section (as well as many functions in vzdq_ugid.c, ++ * perhaps) abuse vz_quota_sem. ++ * Taking a global semaphore for lengthy and user-controlled operations inside ++ * VPSs is not a good idea in general. ++ * In this case, the reasons for taking this semaphore are completely unclear, ++ * especially taking into account that the only function that has comments ++ * about the necessity to be called under this semaphore ++ * (create_proc_quotafile) is actually called OUTSIDE it. 
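++ * Note that read_proc_quotafile() below nevertheless holds both
++ * vz_quota_sem and the per-master dq_sem around its whole
++ * block-copy loop.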
++ * ++ * --------------------------------------------------------------------- */ ++ ++#define DQBLOCK_SIZE 1024 ++#define DQUOTBLKNUM 21U ++#define DQTREE_DEPTH 4 ++#define TREENUM_2_BLKNUM(num) (((num) + 1) << 1) ++#define ISINDBLOCK(num) ((num)%2 != 0) ++#define FIRST_DATABLK 2 /* first even number */ ++#define LAST_IND_LEVEL (DQTREE_DEPTH - 1) ++#define CONVERT_LEVEL(level) ((level) * (QUOTAID_EBITS/QUOTAID_BBITS)) ++#define GETLEVINDX(ind, lev) (((ind) >> QUOTAID_BBITS*(lev)) \ ++ & QUOTATREE_BMASK) ++ ++#if (QUOTAID_EBITS / QUOTAID_BBITS) != (QUOTATREE_DEPTH / DQTREE_DEPTH) ++#error xBITS and DQTREE_DEPTH does not correspond ++#endif ++ ++#define BLOCK_NOT_FOUND 1 ++ ++/* data for quota file -- one per proc entry */ ++struct quotatree_data { ++ struct list_head list; ++ struct vz_quota_master *qmblk; ++ int type; /* type of the tree */ ++}; ++ ++/* serialized by vz_quota_sem */ ++static LIST_HEAD(qf_data_head); ++ ++static const u_int32_t vzquota_magics[] = V2_INITQMAGICS; ++static const u_int32_t vzquota_versions[] = V2_INITQVERSIONS; ++static const char aquota_user[] = "aquota.user"; ++static const char aquota_group[] = "aquota.group"; ++ ++ ++static inline loff_t get_depoff(int depth) ++{ ++ loff_t res = 1; ++ while (depth) { ++ res += (1 << ((depth - 1)*QUOTAID_EBITS + 1)); ++ depth--; ++ } ++ return res; ++} ++ ++static inline loff_t get_blknum(loff_t num, int depth) ++{ ++ loff_t res; ++ res = (num << 1) + get_depoff(depth); ++ return res; ++} ++ ++static int get_depth(loff_t num) ++{ ++ int i; ++ for (i = 0; i < DQTREE_DEPTH; i++) { ++ if (num >= get_depoff(i) && (i == DQTREE_DEPTH - 1 ++ || num < get_depoff(i + 1))) ++ return i; ++ } ++ return -1; ++} ++ ++static inline loff_t get_offset(loff_t num) ++{ ++ loff_t res, tmp; ++ ++ tmp = get_depth(num); ++ if (tmp < 0) ++ return -1; ++ num -= get_depoff(tmp); ++ BUG_ON(num < 0); ++ res = num >> 1; ++ ++ return res; ++} ++ ++static inline loff_t get_quot_blk_num(struct quotatree_tree *tree, int level) ++{ ++ /* return maximum available block num */ ++ return tree->levels[level].freenum; ++} ++ ++static inline loff_t get_block_num(struct quotatree_tree *tree) ++{ ++ loff_t ind_blk_num, quot_blk_num, max_ind, max_quot; ++ ++ quot_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH) - 1); ++ max_quot = TREENUM_2_BLKNUM(quot_blk_num); ++ ind_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH - 1)); ++ max_ind = (quot_blk_num) ? get_blknum(ind_blk_num, LAST_IND_LEVEL) ++ : get_blknum(ind_blk_num, 0); ++ ++ return (max_ind > max_quot) ? 
max_ind + 1 : max_quot + 1; ++} ++ ++/* Write quota file header */ ++static int read_header(void *buf, struct quotatree_tree *tree, ++ struct dq_info *dq_ugid_info, int type) ++{ ++ struct v2_disk_dqheader *dqh; ++ struct v2_disk_dqinfo *dq_disk_info; ++ ++ dqh = buf; ++ dq_disk_info = buf + sizeof(struct v2_disk_dqheader); ++ ++ dqh->dqh_magic = vzquota_magics[type]; ++ dqh->dqh_version = vzquota_versions[type]; ++ ++ dq_disk_info->dqi_bgrace = dq_ugid_info[type].bexpire; ++ dq_disk_info->dqi_igrace = dq_ugid_info[type].iexpire; ++ dq_disk_info->dqi_flags = 0; /* no flags */ ++ dq_disk_info->dqi_blocks = get_block_num(tree); ++ dq_disk_info->dqi_free_blk = 0; /* first block in the file */ ++ dq_disk_info->dqi_free_entry = FIRST_DATABLK; ++ ++ return 0; ++} ++ ++static int get_block_child(int depth, struct quotatree_node *p, u_int32_t *buf) ++{ ++ int i, j, lev_num; ++ ++ lev_num = QUOTATREE_DEPTH/DQTREE_DEPTH - 1; ++ for (i = 0; i < BLOCK_SIZE/sizeof(u_int32_t); i++) { ++ struct quotatree_node *next, *parent; ++ ++ parent = p; ++ next = p; ++ for (j = lev_num; j >= 0; j--) { ++ if (!next->blocks[GETLEVINDX(i,j)]) { ++ buf[i] = 0; ++ goto bad_branch; ++ } ++ parent = next; ++ next = next->blocks[GETLEVINDX(i,j)]; ++ } ++ buf[i] = (depth == DQTREE_DEPTH - 1) ? ++ TREENUM_2_BLKNUM(parent->num) ++ : get_blknum(next->num, depth + 1); ++ ++ bad_branch: ++ ; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Write index block to disk (or buffer) ++ * @buf has length 256*sizeof(u_int32_t) bytes ++ */ ++static int read_index_block(int num, u_int32_t *buf, ++ struct quotatree_tree *tree) ++{ ++ struct quotatree_node *p; ++ u_int32_t index; ++ loff_t off; ++ int depth, res; ++ ++ res = BLOCK_NOT_FOUND; ++ index = 0; ++ depth = get_depth(num); ++ off = get_offset(num); ++ if (depth < 0 || off < 0) ++ return -EINVAL; ++ ++ list_for_each_entry(p, &tree->levels[CONVERT_LEVEL(depth)].usedlh, ++ list) { ++ if (p->num >= off) ++ res = 0; ++ if (p->num != off) ++ continue; ++ get_block_child(depth, p, buf); ++ break; ++ } ++ ++ return res; ++} ++ ++static inline void convert_quot_format(struct v2_disk_dqblk *dq, ++ struct vz_quota_ugid *vzq) ++{ ++ dq->dqb_id = vzq->qugid_id; ++ dq->dqb_ihardlimit = vzq->qugid_stat.ihardlimit; ++ dq->dqb_isoftlimit = vzq->qugid_stat.isoftlimit; ++ dq->dqb_curinodes = vzq->qugid_stat.icurrent; ++ dq->dqb_bhardlimit = vzq->qugid_stat.bhardlimit / QUOTABLOCK_SIZE; ++ dq->dqb_bsoftlimit = vzq->qugid_stat.bsoftlimit / QUOTABLOCK_SIZE; ++ dq->dqb_curspace = vzq->qugid_stat.bcurrent; ++ dq->dqb_btime = vzq->qugid_stat.btime; ++ dq->dqb_itime = vzq->qugid_stat.itime; ++} ++ ++static int read_dquot(loff_t num, void *buf, struct quotatree_tree *tree) ++{ ++ int res, i, entries = 0; ++ struct v2_disk_dqdbheader *dq_header; ++ struct quotatree_node *p; ++ struct v2_disk_dqblk *blk = buf + sizeof(struct v2_disk_dqdbheader); ++ ++ res = BLOCK_NOT_FOUND; ++ dq_header = buf; ++ memset(dq_header, 0, sizeof(*dq_header)); ++ ++ list_for_each_entry(p, &(tree->levels[QUOTATREE_DEPTH - 1].usedlh), ++ list) { ++ if (TREENUM_2_BLKNUM(p->num) >= num) ++ res = 0; ++ if (TREENUM_2_BLKNUM(p->num) != num) ++ continue; ++ ++ for (i = 0; i < QUOTATREE_BSIZE; i++) { ++ if (!p->blocks[i]) ++ continue; ++ convert_quot_format(blk + entries, ++ (struct vz_quota_ugid *)p->blocks[i]); ++ entries++; ++ res = 0; ++ } ++ break; ++ } ++ dq_header->dqdh_entries = entries; ++ ++ return res; ++} ++ ++static int read_block(int num, void *buf, struct quotatree_tree *tree, ++ struct dq_info *dq_ugid_info, int magic) ++{ ++ int res; 
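++	/* block 0 is the file header; odd block numbers are tree index
++	 * blocks, even ones carry dquot data -- see ISINDBLOCK() and
++	 * FIRST_DATABLK above */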
++ ++ memset(buf, 0, DQBLOCK_SIZE); ++ if (!num) ++ res = read_header(buf, tree, dq_ugid_info, magic); ++ else if (ISINDBLOCK(num)) ++ res = read_index_block(num, (u_int32_t*)buf, tree); ++ else ++ res = read_dquot(num, buf, tree); ++ ++ return res; ++} ++ ++/* ++ * FIXME: this function can handle quota files up to 2GB only. ++ */ ++static int read_proc_quotafile(char *page, char **start, off_t off, int count, ++ int *eof, void *data) ++{ ++ off_t blk_num, blk_off, buf_off; ++ char *tmp; ++ size_t buf_size; ++ struct quotatree_data *qtd; ++ struct quotatree_tree *tree; ++ struct dq_info *dqi; ++ int res; ++ ++ *start = NULL; ++ tmp = kmalloc(DQBLOCK_SIZE, GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ ++ qtd = data; ++ down(&vz_quota_sem); ++ down(&qtd->qmblk->dq_sem); ++ ++ res = 0; ++ tree = QUGID_TREE(qtd->qmblk, qtd->type); ++ if (!tree) { ++ *eof = 1; ++ goto out_dq; ++ } ++ ++ dqi = &qtd->qmblk->dq_ugid_info[qtd->type]; ++ ++ buf_off = 0; ++ buf_size = count; ++ blk_num = off / DQBLOCK_SIZE; ++ blk_off = off % DQBLOCK_SIZE; ++ ++ while (buf_size > 0) { ++ off_t len; ++ ++ len = min((size_t)(DQBLOCK_SIZE-blk_off), buf_size); ++ res = read_block(blk_num, tmp, tree, dqi, qtd->type); ++ if (res < 0) ++ goto out_err; ++ if (res == BLOCK_NOT_FOUND) { ++ *eof = 1; ++ break; ++ } ++ memcpy(page + buf_off, tmp + blk_off, len); ++ ++ blk_num++; ++ buf_size -= len; ++ blk_off = 0; ++ buf_off += len; ++ } ++ res = buf_off; ++ ++out_err: ++ *start += count; ++out_dq: ++ up(&qtd->qmblk->dq_sem); ++ up(&vz_quota_sem); ++ kfree(tmp); ++ ++ return res; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * /proc/vz/vzaquota/QID/aquota.* files ++ * ++ * FIXME: this code lacks serialization of read/readdir/lseek. ++ * However, this problem should be fixed after the mainstream issue of what ++ * appears to be non-atomic read and update of file position in sys_read. 
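++ * (sys_read() samples f_pos and writes it back without locking, so
++ * two concurrent readers of one open file may start from the same
++ * offset.)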
++ * ++ * --------------------------------------------------------------------- */ ++ ++static inline unsigned long vzdq_aquot_getino(dev_t dev) ++{ ++ return 0xec000000UL + dev; ++} ++ ++static inline dev_t vzdq_aquot_getidev(struct inode *inode) ++{ ++ return (dev_t)(unsigned long)PROC_I(inode)->op.proc_get_link; ++} ++ ++static inline void vzdq_aquot_setidev(struct inode *inode, dev_t dev) ++{ ++ PROC_I(inode)->op.proc_get_link = (void *)(unsigned long)dev; ++} ++ ++static ssize_t vzdq_aquotf_read(struct file *file, ++ char __user *buf, size_t size, loff_t *ppos) ++{ ++ char *page; ++ size_t bufsize; ++ ssize_t l, l2, copied; ++ char *start; ++ struct inode *inode; ++ struct block_device *bdev; ++ struct super_block *sb; ++ struct quotatree_data data; ++ int eof, err; ++ ++ err = -ENOMEM; ++ page = (char *)__get_free_page(GFP_KERNEL); ++ if (page == NULL) ++ goto out_err; ++ ++ err = -ENODEV; ++ inode = file->f_dentry->d_inode; ++ bdev = bdget(vzdq_aquot_getidev(inode)); ++ if (bdev == NULL) ++ goto out_err; ++ sb = get_super(bdev); ++ bdput(bdev); ++ if (sb == NULL) ++ goto out_err; ++ data.qmblk = vzquota_find_qmblk(sb); ++ data.type = PROC_I(inode)->fd - 1; ++ drop_super(sb); ++ if (data.qmblk == NULL || data.qmblk == VZ_QUOTA_BAD) ++ goto out_err; ++ ++ copied = 0; ++ l = l2 = 0; ++ while (1) { ++ bufsize = min(size, (size_t)PAGE_SIZE); ++ if (bufsize <= 0) ++ break; ++ ++ l = read_proc_quotafile(page, &start, *ppos, bufsize, ++ &eof, &data); ++ if (l <= 0) ++ break; ++ ++ l2 = copy_to_user(buf, page, l); ++ copied += l - l2; ++ if (l2) ++ break; ++ ++ buf += l; ++ size -= l; ++ *ppos += (unsigned long)start; ++ l = l2 = 0; ++ } ++ ++ qmblk_put(data.qmblk); ++ free_page((unsigned long)page); ++ if (copied) ++ return copied; ++ else if (l2) /* last copy_to_user failed */ ++ return -EFAULT; ++ else /* read error or EOF */ ++ return l; ++ ++out_err: ++ if (page != NULL) ++ free_page((unsigned long)page); ++ return err; ++} ++ ++static struct file_operations vzdq_aquotf_file_operations = { ++ .read = &vzdq_aquotf_read, ++}; ++ ++static struct inode_operations vzdq_aquotf_inode_operations = { ++}; ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * /proc/vz/vzaquota/QID directory ++ * ++ * --------------------------------------------------------------------- */ ++ ++static int vzdq_aquotq_readdir(struct file *file, void *data, filldir_t filler) ++{ ++ loff_t n; ++ int err; ++ ++ n = file->f_pos; ++ for (err = 0; !err; n++) { ++ /* ppc32 can't cmp 2 long long's in switch, calls __cmpdi2() */ ++ switch ((unsigned long)n) { ++ case 0: ++ err = (*filler)(data, ".", 1, n, ++ file->f_dentry->d_inode->i_ino, ++ DT_DIR); ++ break; ++ case 1: ++ err = (*filler)(data, "..", 2, n, ++ parent_ino(file->f_dentry), DT_DIR); ++ break; ++ case 2: ++ err = (*filler)(data, aquota_user, ++ sizeof(aquota_user)-1, n, ++ file->f_dentry->d_inode->i_ino ++ + USRQUOTA + 1, ++ DT_REG); ++ break; ++ case 3: ++ err = (*filler)(data, aquota_group, ++ sizeof(aquota_group)-1, n, ++ file->f_dentry->d_inode->i_ino ++ + GRPQUOTA + 1, ++ DT_REG); ++ break; ++ default: ++ goto out; ++ } ++ } ++out: ++ file->f_pos = n; ++ return err; ++} ++ ++struct vzdq_aquotq_lookdata { ++ dev_t dev; ++ int type; ++ struct vz_quota_master *qmblk; ++}; ++ ++static int vzdq_aquotq_looktest(struct inode *inode, void *data) ++{ ++ struct vzdq_aquotq_lookdata *d; ++ ++ d = data; ++ return inode->i_op == &vzdq_aquotf_inode_operations && ++ vzdq_aquot_getidev(inode) == d->dev && ++ PROC_I(inode)->fd == 
d->type + 1; ++} ++ ++static int vzdq_aquotq_lookset(struct inode *inode, void *data) ++{ ++ struct vzdq_aquotq_lookdata *d; ++ struct super_block *sb; ++ struct quotatree_data qtd; ++ struct quotatree_tree *tree; ++ ++ d = data; ++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++ inode->i_ino = vzdq_aquot_getino(d->dev) + d->type + 1; ++ inode->i_mode = S_IFREG | S_IRUSR; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_op = &vzdq_aquotf_inode_operations; ++ inode->i_fop = &vzdq_aquotf_file_operations; ++ PROC_I(inode)->fd = d->type + 1; ++ vzdq_aquot_setidev(inode, d->dev); ++ ++ /* Setting size */ ++ sb = user_get_super(d->dev); ++ if (sb == NULL) ++ return -ENODEV; ++ qtd.qmblk = vzquota_find_qmblk(sb); ++ drop_super(sb); ++ ++ if (qtd.qmblk == NULL) ++ return -ESRCH; ++ if (qtd.qmblk == VZ_QUOTA_BAD) ++ return -EIO; ++ ++ qtd.type = PROC_I(inode)->fd - 1; ++ tree = QUGID_TREE(qtd.qmblk, qtd.type); ++ inode->i_size = get_block_num(tree) * 1024; ++ return 0; ++} ++ ++static int vzdq_aquotq_revalidate(struct dentry *vdentry, struct nameidata *nd) ++{ ++ return 0; ++} ++ ++static struct dentry_operations vzdq_aquotq_dentry_operations = { ++ .d_revalidate = &vzdq_aquotq_revalidate, ++}; ++ ++static struct vz_quota_master *find_qmblk_by_dev(dev_t dev) ++{ ++ struct super_block *sb; ++ struct vz_quota_master *qmblk; ++ ++ qmblk = NULL; ++ sb = user_get_super(dev); ++ if (sb != NULL) { ++ qmblk = vzquota_find_qmblk(sb); ++ drop_super(sb); ++ ++ if (qmblk == VZ_QUOTA_BAD) ++ qmblk = NULL; ++ } ++ ++ return qmblk; ++} ++ ++static struct dentry *vzdq_aquotq_lookup(struct inode *dir, ++ struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct inode *inode; ++ struct vzdq_aquotq_lookdata d; ++ int k; ++ ++ if (dentry->d_name.len == sizeof(aquota_user)-1) { ++ if (memcmp(dentry->d_name.name, aquota_user, ++ sizeof(aquota_user)-1)) ++ goto out; ++ k = USRQUOTA; ++ } else if (dentry->d_name.len == sizeof(aquota_group)-1) { ++ if (memcmp(dentry->d_name.name, aquota_group, ++ sizeof(aquota_group)-1)) ++ goto out; ++ k = GRPQUOTA; ++ } else ++ goto out; ++ d.dev = vzdq_aquot_getidev(dir); ++ d.type = k; ++ d.qmblk = find_qmblk_by_dev(d.dev); ++ if (d.qmblk == NULL) ++ goto out; ++ ++ inode = iget5_locked(dir->i_sb, dir->i_ino + k + 1, ++ vzdq_aquotq_looktest, vzdq_aquotq_lookset, &d); ++ if (inode == NULL) ++ goto out; ++ unlock_new_inode(inode); ++ dentry->d_op = &vzdq_aquotq_dentry_operations; ++ d_add(dentry, inode); ++ return NULL; ++ ++out: ++ return ERR_PTR(-ENOENT); ++} ++ ++static struct file_operations vzdq_aquotq_file_operations = { ++ .read = &generic_read_dir, ++ .readdir = &vzdq_aquotq_readdir, ++}; ++ ++static struct inode_operations vzdq_aquotq_inode_operations = { ++ .lookup = &vzdq_aquotq_lookup, ++}; ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * /proc/vz/vzaquota directory ++ * ++ * --------------------------------------------------------------------- */ ++ ++struct vzdq_aquot_de { ++ struct list_head list; ++ struct vfsmount *mnt; ++}; ++ ++static int vzdq_aquot_buildmntlist(struct ve_struct *ve, ++ struct list_head *head) ++{ ++ struct vfsmount *rmnt, *mnt; ++ struct vzdq_aquot_de *p; ++ int err; ++ ++#ifdef CONFIG_VE ++ rmnt = mntget(ve->root_path.mnt); ++#else ++ read_lock(¤t->fs->lock); ++ rmnt = mntget(current->fs->rootmnt); ++ read_unlock(¤t->fs->lock); ++#endif ++ mnt = rmnt; ++ spin_lock(&vfsmount_lock); ++ while (1) { ++ list_for_each_entry(p, head, list) { ++ if (p->mnt->mnt_sb 
== mnt->mnt_sb) ++ goto skip; ++ } ++ ++ err = -ENOMEM; ++ p = kmalloc(sizeof(*p), GFP_ATOMIC); ++ if (p == NULL) ++ goto out; ++ p->mnt = mntget(mnt); ++ list_add_tail(&p->list, head); ++ ++skip: ++ err = 0; ++ if (list_empty(&mnt->mnt_mounts)) { ++ while (1) { ++ if (mnt == rmnt) ++ goto out; ++ if (mnt->mnt_child.next != ++ &mnt->mnt_parent->mnt_mounts) ++ break; ++ mnt = mnt->mnt_parent; ++ } ++ mnt = list_entry(mnt->mnt_child.next, ++ struct vfsmount, mnt_child); ++ } else ++ mnt = list_entry(mnt->mnt_mounts.next, ++ struct vfsmount, mnt_child); ++ } ++out: ++ spin_unlock(&vfsmount_lock); ++ mntput(rmnt); ++ return err; ++} ++ ++static void vzdq_aquot_releasemntlist(struct ve_struct *ve, ++ struct list_head *head) ++{ ++ struct vzdq_aquot_de *p; ++ ++ while (!list_empty(head)) { ++ p = list_entry(head->next, typeof(*p), list); ++ mntput(p->mnt); ++ list_del(&p->list); ++ kfree(p); ++ } ++} ++ ++static int vzdq_aquotd_readdir(struct file *file, void *data, filldir_t filler) ++{ ++ struct ve_struct *ve, *old_ve; ++ struct list_head mntlist; ++ struct vzdq_aquot_de *de; ++ struct super_block *sb; ++ struct vz_quota_master *qmblk; ++ loff_t i, n; ++ char buf[24]; ++ int l, err; ++ ++ i = 0; ++ n = file->f_pos; ++ ve = file->f_dentry->d_sb->s_type->owner_env; ++ old_ve = set_exec_env(ve); ++ ++ INIT_LIST_HEAD(&mntlist); ++#ifdef CONFIG_VE ++ /* ++ * The only reason of disabling readdir for the host system is that ++ * this readdir can be slow and CPU consuming with large number of VPSs ++ * (or just mount points). ++ */ ++ err = ve_is_super(ve); ++#else ++ err = 0; ++#endif ++ if (!err) { ++ err = vzdq_aquot_buildmntlist(ve, &mntlist); ++ if (err) ++ goto out_err; ++ } ++ ++ if (i >= n) { ++ if ((*filler)(data, ".", 1, i, ++ file->f_dentry->d_inode->i_ino, DT_DIR)) ++ goto out_fill; ++ } ++ i++; ++ ++ if (i >= n) { ++ if ((*filler)(data, "..", 2, i, ++ parent_ino(file->f_dentry), DT_DIR)) ++ goto out_fill; ++ } ++ i++; ++ ++ list_for_each_entry (de, &mntlist, list) { ++ sb = de->mnt->mnt_sb; ++ if (get_device_perms_ve(S_IFBLK, sb->s_dev, FMODE_QUOTACTL)) ++ continue; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ if (qmblk == NULL || qmblk == VZ_QUOTA_BAD) ++ continue; ++ ++ qmblk_put(qmblk); ++ i++; ++ if (i <= n) ++ continue; ++ ++ l = sprintf(buf, "%08x", new_encode_dev(sb->s_dev)); ++ if ((*filler)(data, buf, l, i - 1, ++ vzdq_aquot_getino(sb->s_dev), DT_DIR)) ++ break; ++ } ++ ++out_fill: ++ err = 0; ++ file->f_pos = i; ++out_err: ++ vzdq_aquot_releasemntlist(ve, &mntlist); ++ (void)set_exec_env(old_ve); ++ return err; ++} ++ ++static int vzdq_aquotd_looktest(struct inode *inode, void *data) ++{ ++ return inode->i_op == &vzdq_aquotq_inode_operations && ++ vzdq_aquot_getidev(inode) == (dev_t)(unsigned long)data; ++} ++ ++static int vzdq_aquotd_lookset(struct inode *inode, void *data) ++{ ++ dev_t dev; ++ ++ dev = (dev_t)(unsigned long)data; ++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++ inode->i_ino = vzdq_aquot_getino(dev); ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 2; ++ inode->i_op = &vzdq_aquotq_inode_operations; ++ inode->i_fop = &vzdq_aquotq_file_operations; ++ vzdq_aquot_setidev(inode, dev); ++ return 0; ++} ++ ++static struct dentry *vzdq_aquotd_lookup(struct inode *dir, ++ struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct ve_struct *ve, *old_ve; ++ const unsigned char *s; ++ int l; ++ dev_t dev; ++ struct inode *inode; ++ ++ ve = dir->i_sb->s_type->owner_env; ++ old_ve = 
set_exec_env(ve); ++#ifdef CONFIG_VE ++ /* ++ * Lookup is much lighter than readdir, so it can be allowed for the ++ * host system. But it would be strange to be able to do lookup only ++ * without readdir... ++ */ ++ if (ve_is_super(ve)) ++ goto out; ++#endif ++ ++ dev = 0; ++ l = dentry->d_name.len; ++ if (l <= 0) ++ goto out; ++ for (s = dentry->d_name.name; l > 0; s++, l--) { ++ if (!isxdigit(*s)) ++ goto out; ++ if (dev & ~(~0UL >> 4)) ++ goto out; ++ dev <<= 4; ++ if (isdigit(*s)) ++ dev += *s - '0'; ++ else if (islower(*s)) ++ dev += *s - 'a' + 10; ++ else ++ dev += *s - 'A' + 10; ++ } ++ dev = new_decode_dev(dev); ++ ++ if (get_device_perms_ve(S_IFBLK, dev, FMODE_QUOTACTL)) ++ goto out; ++ ++ inode = iget5_locked(dir->i_sb, vzdq_aquot_getino(dev), ++ vzdq_aquotd_looktest, vzdq_aquotd_lookset, ++ (void *)(unsigned long)dev); ++ if (inode == NULL) ++ goto out; ++ unlock_new_inode(inode); ++ ++ d_add(dentry, inode); ++ (void)set_exec_env(old_ve); ++ return NULL; ++ ++out: ++ (void)set_exec_env(old_ve); ++ return ERR_PTR(-ENOENT); ++} ++ ++static struct file_operations vzdq_aquotd_file_operations = { ++ .read = &generic_read_dir, ++ .readdir = &vzdq_aquotd_readdir, ++}; ++ ++static struct inode_operations vzdq_aquotd_inode_operations = { ++ .lookup = &vzdq_aquotd_lookup, ++}; ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Initialization and deinitialization ++ * ++ * --------------------------------------------------------------------- */ ++static int fake_data; ++static struct ctl_table fake_table[] = { ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = ".fake", ++ .mode = 0600, ++ .proc_handler = proc_dointvec, ++ .data = &fake_data, ++ .maxlen = sizeof(int), ++ }, ++ { } ++}; ++ ++static struct ctl_path fake_path[] = { ++ { .ctl_name = CTL_FS, .procname = "fs", }, ++ { .ctl_name = FS_DQSTATS, .procname = "quota", }, ++ { } ++}; ++ ++/* ++ * FIXME: creation of proc entries here is unsafe with respect to module ++ * unloading. ++ */ ++void vzaquota_init(void) ++{ ++ struct proc_dir_entry *de; ++ ++ de = proc_create("vzaquota", S_IFDIR | S_IRUSR | S_IXUSR, ++ glob_proc_vz_dir, &vzdq_aquotd_file_operations); ++ if (de != NULL) ++ de->proc_iops = &vzdq_aquotd_inode_operations; ++ else ++ printk("VZDQ: vz/vzaquota creation failed\n"); ++ ++ register_sysctl_glob_paths(fake_path, fake_table, 1); ++} ++ ++void vzaquota_fini(void) ++{ ++ remove_proc_entry("vz/vzaquota", NULL); ++} +diff --git a/fs/vzdq_mgmt.c b/fs/vzdq_mgmt.c +new file mode 100644 +index 0000000..a1e92e2 +--- /dev/null ++++ b/fs/vzdq_mgmt.c +@@ -0,0 +1,754 @@ ++/* ++ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++/* ---------------------------------------------------------------------- ++ * Switching quota on. 
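++ * (vzquota_create() allocates a master block and inserts it into the
++ * hash; vzquota_on() then binds the block to a directory tree and
++ * starts accounting.)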
++ * --------------------------------------------------------------------- */ ++ ++/* ++ * check limits copied from user ++ */ ++int vzquota_check_sane_limits(struct dq_stat *qstat) ++{ ++ int err; ++ ++ err = -EINVAL; ++ ++ /* softlimit must be less then hardlimit */ ++ if (qstat->bsoftlimit > qstat->bhardlimit) ++ goto out; ++ ++ if (qstat->isoftlimit > qstat->ihardlimit) ++ goto out; ++ ++ err = 0; ++out: ++ return err; ++} ++ ++/* ++ * check usage values copied from user ++ */ ++int vzquota_check_sane_values(struct dq_stat *qstat) ++{ ++ int err; ++ ++ err = -EINVAL; ++ ++ /* expiration time must not be set if softlimit was not exceeded */ ++ if (qstat->bcurrent < qstat->bsoftlimit && qstat->btime != 0) ++ goto out; ++ ++ if (qstat->icurrent < qstat->isoftlimit && qstat->itime != 0) ++ goto out; ++ ++ err = vzquota_check_sane_limits(qstat); ++out: ++ return err; ++} ++ ++/* ++ * create new quota master block ++ * this function should: ++ * - copy limits and usage parameters from user buffer; ++ * - allock, initialize quota block and insert it to hash; ++ */ ++static int vzquota_create(unsigned int quota_id, ++ struct vz_quota_stat __user *u_qstat, int compat) ++{ ++ int err; ++ struct vz_quota_stat qstat; ++ struct vz_quota_master *qmblk; ++ ++ down(&vz_quota_sem); ++ ++ err = -EFAULT; ++ if (!compat) { ++ if (copy_from_user(&qstat, u_qstat, sizeof(qstat))) ++ goto out; ++ } else { ++#ifdef CONFIG_COMPAT ++ struct compat_vz_quota_stat cqstat; ++ if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat))) ++ goto out; ++ compat_dqstat2dqstat(&cqstat.dq_stat, &qstat.dq_stat); ++ compat_dqinfo2dqinfo(&cqstat.dq_info, &qstat.dq_info); ++#endif ++ } ++ ++ err = -EINVAL; ++ if (quota_id == 0) ++ goto out; ++ ++ if (vzquota_check_sane_values(&qstat.dq_stat)) ++ goto out; ++ err = 0; ++ qmblk = vzquota_alloc_master(quota_id, &qstat); ++ ++ if (IS_ERR(qmblk)) /* ENOMEM or EEXIST */ ++ err = PTR_ERR(qmblk); ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++/** ++ * vzquota_on - turn quota on ++ * ++ * This function should: ++ * - find and get refcnt of directory entry for quota root and corresponding ++ * mountpoint; ++ * - find corresponding quota block and mark it with given path; ++ * - check quota tree; ++ * - initialize quota for the tree root. ++ */ ++static int vzquota_on(unsigned int quota_id, const char __user *quota_root, ++ char __user *buf) ++{ ++ int err; ++ struct nameidata nd; ++ struct vz_quota_master *qmblk; ++ struct super_block *dqsb; ++ ++ dqsb = NULL; ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EBUSY; ++ if (qmblk->dq_state != VZDQ_STARTING) ++ goto out; ++ ++ err = user_path_walk(quota_root, &nd); ++ if (err) ++ goto out; ++ /* init path must be a directory */ ++ err = -ENOTDIR; ++ if (!S_ISDIR(nd.path.dentry->d_inode->i_mode)) ++ goto out_path; ++ ++ qmblk->dq_root_path = nd.path; ++ qmblk->dq_sb = nd.path.dentry->d_inode->i_sb; ++ err = vzquota_get_super(qmblk->dq_sb); ++ if (err) ++ goto out_super; ++ ++ /* ++ * Serialization with quota initialization and operations is performed ++ * through generation check: generation is memorized before qmblk is ++ * found and compared under inode_qmblk_lock with assignment. ++ * ++ * Note that the dentry tree is shrunk only for high-level logical ++ * serialization, purely as a courtesy to the user: to have consistent ++ * quota statistics, files should be closed etc. on quota on. 
++ */ ++ err = vzquota_on_qmblk(qmblk->dq_sb, qmblk->dq_root_path.dentry->d_inode, ++ qmblk, buf); ++ if (err) ++ goto out_init; ++ qmblk->dq_state = VZDQ_WORKING; ++ ++ up(&vz_quota_sem); ++ return 0; ++ ++out_init: ++ dqsb = qmblk->dq_sb; ++out_super: ++ /* clear for qmblk_put/quota_free_master */ ++ qmblk->dq_sb = NULL; ++ qmblk->dq_root_path.dentry = NULL; ++ qmblk->dq_root_path.mnt = NULL; ++out_path: ++ path_put(&nd.path); ++out: ++ if (dqsb) ++ vzquota_put_super(dqsb); ++ up(&vz_quota_sem); ++ return err; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * Switching quota off. ++ * --------------------------------------------------------------------- */ ++ ++/* ++ * destroy quota block by ID ++ */ ++static int vzquota_destroy(unsigned int quota_id) ++{ ++ int err; ++ struct vz_quota_master *qmblk; ++ struct path root; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EBUSY; ++ if (qmblk->dq_state == VZDQ_WORKING) ++ goto out; /* quota_off first */ ++ ++ list_del_init(&qmblk->dq_hash); ++ root = qmblk->dq_root_path; ++ qmblk->dq_root_path.dentry = NULL; ++ qmblk->dq_root_path.mnt = NULL; ++ ++ if (qmblk->dq_sb) ++ vzquota_put_super(qmblk->dq_sb); ++ up(&vz_quota_sem); ++ ++ qmblk_put(qmblk); ++ path_put(&root); ++ return 0; ++ ++out: ++ up(&vz_quota_sem); ++ return err; ++} ++ ++/** ++ * vzquota_off - turn quota off ++ */ ++ ++static int __vzquota_sync_list(struct list_head *lh, ++ struct vz_quota_master *qmblk, ++ enum writeback_sync_modes sync_mode) ++{ ++ struct writeback_control wbc; ++ LIST_HEAD(list); ++ struct vz_quota_ilink *qlnk; ++ struct inode *inode; ++ int err, ret; ++ ++ memset(&wbc, 0, sizeof(wbc)); ++ wbc.sync_mode = sync_mode; ++ ++ err = ret = 0; ++ while (!list_empty(lh)) { ++ if (need_resched()) { ++ inode_qmblk_unlock(qmblk->dq_sb); ++ schedule(); ++ inode_qmblk_lock(qmblk->dq_sb); ++ continue; ++ } ++ ++ qlnk = list_first_entry(lh, struct vz_quota_ilink, list); ++ list_move(&qlnk->list, &list); ++ ++ inode = igrab(QLNK_INODE(qlnk)); ++ if (!inode) ++ continue; ++ ++ inode_qmblk_unlock(qmblk->dq_sb); ++ ++ wbc.nr_to_write = LONG_MAX; ++ ret = sync_inode(inode, &wbc); ++ if (ret) ++ err = ret; ++ iput(inode); ++ ++ inode_qmblk_lock(qmblk->dq_sb); ++ } ++ ++ list_splice(&list, lh); ++ return err; ++} ++ ++static int vzquota_sync_list(struct list_head *lh, ++ struct vz_quota_master *qmblk) ++{ ++ (void)__vzquota_sync_list(lh, qmblk, WB_SYNC_NONE); ++ return __vzquota_sync_list(lh, qmblk, WB_SYNC_ALL); ++} ++ ++static int vzquota_sync_inodes(struct vz_quota_master *qmblk) ++{ ++ int err; ++ LIST_HEAD(qlnk_list); ++ ++ list_splice_init(&qmblk->dq_ilink_list, &qlnk_list); ++ err = vzquota_sync_list(&qlnk_list, qmblk); ++ if (!err && !list_empty(&qmblk->dq_ilink_list)) ++ err = -EBUSY; ++ list_splice(&qlnk_list, &qmblk->dq_ilink_list); ++ ++ return err; ++} ++ ++static int vzquota_off(unsigned int quota_id, char __user *buf, int force) ++{ ++ int err, ret; ++ struct vz_quota_master *qmblk; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EALREADY; ++ if (qmblk->dq_state != VZDQ_WORKING) ++ goto out; ++ ++ inode_qmblk_lock(qmblk->dq_sb); /* protects dq_ilink_list also */ ++ ret = vzquota_sync_inodes(qmblk); ++ inode_qmblk_unlock(qmblk->dq_sb); ++ ++ err = vzquota_off_qmblk(qmblk->dq_sb, qmblk, buf, force); ++ if (err) ++ goto out; ++ ++ err = ret; ++ /* 
vzquota_destroy will free resources */ ++ qmblk->dq_state = VZDQ_STOPING; ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * Other VZQUOTA ioctl's. ++ * --------------------------------------------------------------------- */ ++ ++/* ++ * this function should: ++ * - set new limits/buffer under quota master block lock ++ * - if new softlimit less then usage, then set expiration time ++ * - no need to alloc ugid hash table - we'll do that on demand ++ */ ++int vzquota_update_limit(struct dq_stat *_qstat, ++ struct dq_stat *qstat) ++{ ++ int err; ++ ++ err = -EINVAL; ++ if (vzquota_check_sane_limits(qstat)) ++ goto out; ++ ++ err = 0; ++ ++ /* limits */ ++ _qstat->bsoftlimit = qstat->bsoftlimit; ++ _qstat->bhardlimit = qstat->bhardlimit; ++ /* ++ * If the soft limit is exceeded, administrator can override the moment ++ * when the grace period for limit exceeding ends. ++ * Specifying the moment may be useful if the soft limit is set to be ++ * lower than the current usage. In the latter case, if the grace ++ * period end isn't specified, the grace period will start from the ++ * moment of the first write operation. ++ * There is a race with the user level. Soft limit may be already ++ * exceeded before the limit change, and grace period end calculated by ++ * the kernel will be overriden. User level may check if the limit is ++ * already exceeded, but check and set calls are not atomic. ++ * This race isn't dangerous. Under normal cicrumstances, the ++ * difference between the grace period end calculated by the kernel and ++ * the user level should be not greater than as the difference between ++ * the moments of check and set calls, i.e. not bigger than the quota ++ * timer resolution - 1 sec. ++ */ ++ if (qstat->btime != (time_t)0 && ++ _qstat->bcurrent >= _qstat->bsoftlimit) ++ _qstat->btime = qstat->btime; ++ ++ _qstat->isoftlimit = qstat->isoftlimit; ++ _qstat->ihardlimit = qstat->ihardlimit; ++ if (qstat->itime != (time_t)0 && ++ _qstat->icurrent >= _qstat->isoftlimit) ++ _qstat->itime = qstat->itime; ++ ++out: ++ return err; ++} ++ ++/* ++ * set new quota limits. ++ * this function should: ++ * copy new limits from user level ++ * - find quota block ++ * - set new limits and flags. ++ */ ++static int vzquota_setlimit(unsigned int quota_id, ++ struct vz_quota_stat __user *u_qstat, int compat) ++{ ++ int err; ++ struct vz_quota_stat qstat; ++ struct vz_quota_master *qmblk; ++ ++ down(&vz_quota_sem); /* for hash list protection */ ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EFAULT; ++ if (!compat) { ++ if (copy_from_user(&qstat, u_qstat, sizeof(qstat))) ++ goto out; ++ } else { ++#ifdef CONFIG_COMPAT ++ struct compat_vz_quota_stat cqstat; ++ if (copy_from_user(&cqstat, u_qstat, sizeof(cqstat))) ++ goto out; ++ compat_dqstat2dqstat(&cqstat.dq_stat, &qstat.dq_stat); ++ compat_dqinfo2dqinfo(&cqstat.dq_info, &qstat.dq_info); ++#endif ++ } ++ ++ qmblk_data_write_lock(qmblk); ++ err = vzquota_update_limit(&qmblk->dq_stat, &qstat.dq_stat); ++ if (err == 0) ++ qmblk->dq_info = qstat.dq_info; ++ qmblk_data_write_unlock(qmblk); ++ ++out: ++ up(&vz_quota_sem); ++ return err; ++} ++ ++/* ++ * get quota limits. 
++ * very simple - just return stat buffer to user ++ */ ++static int vzquota_getstat(unsigned int quota_id, ++ struct vz_quota_stat __user *u_qstat, int compat) ++{ ++ int err; ++ struct vz_quota_stat qstat; ++ struct vz_quota_master *qmblk; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ qmblk_data_read_lock(qmblk); ++ /* copy whole buffer under lock */ ++ memcpy(&qstat.dq_stat, &qmblk->dq_stat, sizeof(qstat.dq_stat)); ++ memcpy(&qstat.dq_info, &qmblk->dq_info, sizeof(qstat.dq_info)); ++ qmblk_data_read_unlock(qmblk); ++ ++ if (!compat) ++ err = copy_to_user(u_qstat, &qstat, sizeof(qstat)); ++ else { ++#ifdef CONFIG_COMPAT ++ struct compat_vz_quota_stat cqstat; ++ dqstat2compat_dqstat(&qstat.dq_stat, &cqstat.dq_stat); ++ dqinfo2compat_dqinfo(&qstat.dq_info, &cqstat.dq_info); ++ err = copy_to_user(u_qstat, &cqstat, sizeof(cqstat)); ++#endif ++ } ++ if (err) ++ err = -EFAULT; ++ ++out: ++ up(&vz_quota_sem); ++ return err; ++} ++ ++/* ++ * This is a system call to turn per-VE disk quota on. ++ * Note this call is allowed to run ONLY from VE0 ++ */ ++long do_vzquotactl(int cmd, unsigned int quota_id, ++ struct vz_quota_stat __user *qstat, const char __user *ve_root, ++ int compat) ++{ ++ int ret; ++ int force = 0; ++ ++ ret = -EPERM; ++ /* access allowed only from root of VE0 */ ++ if (!capable(CAP_SYS_RESOURCE) || ++ !capable(CAP_SYS_ADMIN)) ++ goto out; ++ ++ switch (cmd) { ++ case VZ_DQ_CREATE: ++ ret = vzquota_create(quota_id, qstat, compat); ++ break; ++ case VZ_DQ_DESTROY: ++ ret = vzquota_destroy(quota_id); ++ break; ++ case VZ_DQ_ON: ++ /* ++ * qstat is just a pointer to userspace buffer to ++ * store busy files path in case of vzquota_on fail ++ */ ++ ret = vzquota_on(quota_id, ve_root, (char *)qstat); ++ break; ++ case VZ_DQ_OFF_FORCED: ++ force = 1; ++ case VZ_DQ_OFF: ++ /* ++ * ve_root is just a pointer to userspace buffer to ++ * store busy files path in case of vzquota_off fail ++ */ ++ ret = vzquota_off(quota_id, (char *)ve_root, force); ++ break; ++ case VZ_DQ_SETLIMIT: ++ ret = vzquota_setlimit(quota_id, qstat, compat); ++ break; ++ case VZ_DQ_GETSTAT: ++ ret = vzquota_getstat(quota_id, qstat, compat); ++ break; ++ ++ default: ++ ret = -EINVAL; ++ goto out; ++ } ++ ++out: ++ return ret; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * Proc filesystem routines ++ * ---------------------------------------------------------------------*/ ++ ++#if defined(CONFIG_PROC_FS) ++ ++#define QUOTA_UINT_LEN 15 ++#define QUOTA_TIME_LEN_FMT_UINT "%11u" ++#define QUOTA_NUM_LEN_FMT_UINT "%15u" ++#define QUOTA_NUM_LEN_FMT_ULL "%15Lu" ++#define QUOTA_TIME_LEN_FMT_STR "%11s" ++#define QUOTA_NUM_LEN_FMT_STR "%15s" ++#define QUOTA_PROC_MAX_LINE_LEN 2048 ++ ++/* ++ * prints /proc/ve_dq header line ++ */ ++static int print_proc_header(char * buffer) ++{ ++ return sprintf(buffer, ++ "%-11s" ++ QUOTA_NUM_LEN_FMT_STR ++ QUOTA_NUM_LEN_FMT_STR ++ QUOTA_NUM_LEN_FMT_STR ++ QUOTA_TIME_LEN_FMT_STR ++ QUOTA_TIME_LEN_FMT_STR ++ "\n", ++ "qid: path", ++ "usage", "softlimit", "hardlimit", "time", "expire"); ++} ++ ++/* ++ * prints proc master record id, dentry path ++ */ ++static int print_proc_master_id(char * buffer, char * path_buf, ++ struct vz_quota_master * qp) ++{ ++ char *path; ++ int over; ++ ++ path = NULL; ++ switch (qp->dq_state) { ++ case VZDQ_WORKING: ++ if (!path_buf) { ++ path = ""; ++ break; ++ } ++ path = d_path(&qp->dq_root_path, path_buf, PAGE_SIZE); ++ if 
(IS_ERR(path)) { ++ path = ""; ++ break; ++ } ++ /* do not print large path, truncate it */ ++ over = strlen(path) - ++ (QUOTA_PROC_MAX_LINE_LEN - 3 - 3 - ++ QUOTA_UINT_LEN); ++ if (over > 0) { ++ path += over - 3; ++ path[0] = path[1] = path[3] = '.'; ++ } ++ break; ++ case VZDQ_STARTING: ++ path = "-- started --"; ++ break; ++ case VZDQ_STOPING: ++ path = "-- stopped --"; ++ break; ++ } ++ ++ return sprintf(buffer, "%u: %s\n", qp->dq_id, path); ++} ++ ++/* ++ * prints struct vz_quota_stat data ++ */ ++static int print_proc_stat(char * buffer, struct dq_stat *qs, ++ struct dq_info *qi) ++{ ++ return sprintf(buffer, ++ "%11s" ++ QUOTA_NUM_LEN_FMT_ULL ++ QUOTA_NUM_LEN_FMT_ULL ++ QUOTA_NUM_LEN_FMT_ULL ++ QUOTA_TIME_LEN_FMT_UINT ++ QUOTA_TIME_LEN_FMT_UINT ++ "\n" ++ "%11s" ++ QUOTA_NUM_LEN_FMT_UINT ++ QUOTA_NUM_LEN_FMT_UINT ++ QUOTA_NUM_LEN_FMT_UINT ++ QUOTA_TIME_LEN_FMT_UINT ++ QUOTA_TIME_LEN_FMT_UINT ++ "\n", ++ "1k-blocks", ++ (unsigned long long)qs->bcurrent >> 10, ++ (unsigned long long)qs->bsoftlimit >> 10, ++ (unsigned long long)qs->bhardlimit >> 10, ++ (unsigned int)qs->btime, ++ (unsigned int)qi->bexpire, ++ "inodes", ++ qs->icurrent, ++ qs->isoftlimit, ++ qs->ihardlimit, ++ (unsigned int)qs->itime, ++ (unsigned int)qi->iexpire); ++} ++ ++ ++/* ++ * for /proc filesystem output ++ */ ++static int vzquota_read_proc(char *page, char **start, off_t off, int count, ++ int *eof, void *data) ++{ ++ int len, i; ++ off_t printed = 0; ++ char *p = page; ++ struct vz_quota_master *qp; ++ struct vz_quota_ilink *ql2; ++ struct list_head *listp; ++ char *path_buf; ++ ++ path_buf = (char*)__get_free_page(GFP_KERNEL); ++ if (path_buf == NULL) ++ return -ENOMEM; ++ ++ len = print_proc_header(p); ++ printed += len; ++ if (off < printed) /* keep header in output */ { ++ *start = p + off; ++ p += len; ++ } ++ ++ down(&vz_quota_sem); ++ ++ /* traverse master hash table for all records */ ++ for (i = 0; i < vzquota_hash_size; i++) { ++ list_for_each(listp, &vzquota_hash_table[i]) { ++ qp = list_entry(listp, ++ struct vz_quota_master, dq_hash); ++ ++ /* Skip other VE's information if not root of VE0 */ ++ if ((!capable(CAP_SYS_ADMIN) || ++ !capable(CAP_SYS_RESOURCE))) { ++ ql2 = INODE_QLNK(current->fs->root.dentry->d_inode); ++ if (ql2 == NULL || qp != ql2->qmblk) ++ continue; ++ } ++ /* ++ * Now print the next record ++ */ ++ len = 0; ++ /* we print quotaid and path only in VE0 */ ++ if (capable(CAP_SYS_ADMIN)) ++ len += print_proc_master_id(p+len,path_buf, qp); ++ len += print_proc_stat(p+len, &qp->dq_stat, ++ &qp->dq_info); ++ printed += len; ++ /* skip unnecessary lines */ ++ if (printed <= off) ++ continue; ++ p += len; ++ /* provide start offset */ ++ if (*start == NULL) ++ *start = p + (off - printed); ++ /* have we printed all requested size? 
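++			 * (i.e. the next record might not fit into the page
++			 * buffer, or we have already produced 'count' bytes
++			 * past *start)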
*/ ++ if (PAGE_SIZE - (p - page) < QUOTA_PROC_MAX_LINE_LEN || ++ (p - *start) >= count) ++ goto out; ++ } ++ } ++ ++ *eof = 1; /* checked all hash */ ++out: ++ up(&vz_quota_sem); ++ ++ len = 0; ++ if (*start != NULL) { ++ len = (p - *start); ++ if (len > count) ++ len = count; ++ } ++ ++ if (path_buf) ++ free_page((unsigned long) path_buf); ++ ++ return len; ++} ++ ++/* ++ * Register procfs read callback ++ */ ++int vzquota_proc_init(void) ++{ ++ struct proc_dir_entry *de; ++ ++ de = proc_create("vzquota", S_IFREG|S_IRUSR, proc_vz_dir, NULL); ++ if (de == NULL) ++ return -EBUSY; ++ ++ de->read_proc = vzquota_read_proc; ++ de->data = NULL; ++ return 0; ++} ++ ++void vzquota_proc_release(void) ++{ ++ /* Unregister procfs read callback */ ++ remove_proc_entry("vzquota", proc_vz_dir); ++} ++ ++#endif +diff --git a/fs/vzdq_ops.c b/fs/vzdq_ops.c +new file mode 100644 +index 0000000..5eb7d84 +--- /dev/null ++++ b/fs/vzdq_ops.c +@@ -0,0 +1,633 @@ ++/* ++ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++/* ---------------------------------------------------------------------- ++ * Quota superblock operations - helper functions. ++ * --------------------------------------------------------------------- */ ++ ++static inline void vzquota_incr_inodes(struct dq_stat *dqstat, ++ unsigned long number) ++{ ++ dqstat->icurrent += number; ++} ++ ++static inline void vzquota_incr_space(struct dq_stat *dqstat, ++ __u64 number) ++{ ++ dqstat->bcurrent += number; ++} ++ ++static inline void vzquota_decr_inodes(struct dq_stat *dqstat, ++ unsigned long number) ++{ ++ if (dqstat->icurrent > number) ++ dqstat->icurrent -= number; ++ else ++ dqstat->icurrent = 0; ++ if (dqstat->icurrent < dqstat->isoftlimit) ++ dqstat->itime = (time_t) 0; ++} ++ ++static inline void vzquota_decr_space(struct dq_stat *dqstat, ++ __u64 number) ++{ ++ if (dqstat->bcurrent > number) ++ dqstat->bcurrent -= number; ++ else ++ dqstat->bcurrent = 0; ++ if (dqstat->bcurrent < dqstat->bsoftlimit) ++ dqstat->btime = (time_t) 0; ++} ++ ++/* ++ * better printk() message or use /proc/vzquotamsg interface ++ * similar to /proc/kmsg ++ */ ++static inline void vzquota_warn(struct dq_info *dq_info, int dq_id, int flag, ++ const char *fmt) ++{ ++ if (dq_info->flags & flag) /* warning already printed for this ++ masterblock */ ++ return; ++ printk(fmt, dq_id); ++ dq_info->flags |= flag; ++} ++ ++/* ++ * ignore_hardlimit - ++ * ++ * Intended to allow superuser of VE0 to overwrite hardlimits. 
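++ * (The check, now under #if 0 below, required ve_is_super() plus
++ * CAP_SYS_RESOURCE with the VZ_QUOTA_OPT_RSQUASH option set.)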
++ * ++ * ignore_hardlimit() has a very bad feature: ++ * ++ * writepage() operation for writable mapping of a file with holes ++ * may trigger get_block() with wrong current and as a consequence, ++ * opens a possibility to overcommit hardlimits ++ */ ++/* for the reason above, it is disabled now */ ++static inline int ignore_hardlimit(struct dq_info *dqstat) ++{ ++#if 0 ++ return ve_is_super(get_exec_env()) && ++ capable(CAP_SYS_RESOURCE) && ++ (dqstat->options & VZ_QUOTA_OPT_RSQUASH); ++#else ++ return 0; ++#endif ++} ++ ++static int vzquota_check_inodes(struct dq_info *dq_info, ++ struct dq_stat *dqstat, ++ unsigned long number, int dq_id) ++{ ++ if (number == 0) ++ return QUOTA_OK; ++ ++ if (dqstat->icurrent + number > dqstat->ihardlimit && ++ !ignore_hardlimit(dq_info)) { ++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES, ++ "VZ QUOTA: file hardlimit reached for id=%d\n"); ++ return NO_QUOTA; ++ } ++ ++ if (dqstat->icurrent + number > dqstat->isoftlimit) { ++ if (dqstat->itime == (time_t)0) { ++ vzquota_warn(dq_info, dq_id, 0, ++ "VZ QUOTA: file softlimit exceeded " ++ "for id=%d\n"); ++ dqstat->itime = CURRENT_TIME_SECONDS + ++ dq_info->iexpire; ++ } else if (CURRENT_TIME_SECONDS >= dqstat->itime && ++ !ignore_hardlimit(dq_info)) { ++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES, ++ "VZ QUOTA: file softlimit expired " ++ "for id=%d\n"); ++ return NO_QUOTA; ++ } ++ } ++ ++ return QUOTA_OK; ++} ++ ++static int vzquota_check_space(struct dq_info *dq_info, ++ struct dq_stat *dqstat, ++ __u64 number, int dq_id, char prealloc) ++{ ++ if (number == 0) ++ return QUOTA_OK; ++ ++ if (prealloc == DQUOT_CMD_FORCE) ++ return QUOTA_OK; ++ ++ if (dqstat->bcurrent + number > dqstat->bhardlimit && ++ !ignore_hardlimit(dq_info)) { ++ if (!prealloc) ++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE, ++ "VZ QUOTA: disk hardlimit reached " ++ "for id=%d\n"); ++ return NO_QUOTA; ++ } ++ ++ if (dqstat->bcurrent + number > dqstat->bsoftlimit) { ++ if (dqstat->btime == (time_t)0) { ++ if (!prealloc) { ++ vzquota_warn(dq_info, dq_id, 0, ++ "VZ QUOTA: disk softlimit exceeded " ++ "for id=%d\n"); ++ dqstat->btime = CURRENT_TIME_SECONDS ++ + dq_info->bexpire; ++ } else { ++ /* ++ * Original Linux quota doesn't allow ++ * preallocation to exceed softlimit so ++ * exceeding will be always printed ++ */ ++ return NO_QUOTA; ++ } ++ } else if (CURRENT_TIME_SECONDS >= dqstat->btime && ++ !ignore_hardlimit(dq_info)) { ++ if (!prealloc) ++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE, ++ "VZ QUOTA: disk quota " ++ "softlimit expired " ++ "for id=%d\n"); ++ return NO_QUOTA; ++ } ++ } ++ ++ return QUOTA_OK; ++} ++ ++#ifdef CONFIG_VZ_QUOTA_UGID ++static int vzquota_check_ugid_inodes(struct vz_quota_master *qmblk, ++ struct vz_quota_ugid *qugid[], ++ int type, unsigned long number) ++{ ++ struct dq_info *dqinfo; ++ struct dq_stat *dqstat; ++ ++ if (qugid[type] == NULL) ++ return QUOTA_OK; ++ if (qugid[type] == VZ_QUOTA_UGBAD) ++ return NO_QUOTA; ++ ++ if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA)) ++ return QUOTA_OK; ++ if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA)) ++ return QUOTA_OK; ++ if (number == 0) ++ return QUOTA_OK; ++ ++ dqinfo = &qmblk->dq_ugid_info[type]; ++ dqstat = &qugid[type]->qugid_stat; ++ ++ if (dqstat->ihardlimit != 0 && ++ dqstat->icurrent + number > dqstat->ihardlimit) ++ return NO_QUOTA; ++ ++ if (dqstat->isoftlimit != 0 && ++ dqstat->icurrent + number > dqstat->isoftlimit) { ++ if (dqstat->itime == (time_t)0) ++ dqstat->itime = CURRENT_TIME_SECONDS + ++ dqinfo->iexpire; ++ else 
if (CURRENT_TIME_SECONDS >= dqstat->itime) ++ return NO_QUOTA; ++ } ++ ++ return QUOTA_OK; ++} ++ ++static int vzquota_check_ugid_space(struct vz_quota_master *qmblk, ++ struct vz_quota_ugid *qugid[], ++ int type, __u64 number, char prealloc) ++{ ++ struct dq_info *dqinfo; ++ struct dq_stat *dqstat; ++ ++ if (prealloc == DQUOT_CMD_FORCE) ++ return QUOTA_OK; ++ ++ if (qugid[type] == NULL) ++ return QUOTA_OK; ++ if (qugid[type] == VZ_QUOTA_UGBAD) ++ return NO_QUOTA; ++ ++ if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA)) ++ return QUOTA_OK; ++ if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA)) ++ return QUOTA_OK; ++ if (number == 0) ++ return QUOTA_OK; ++ ++ dqinfo = &qmblk->dq_ugid_info[type]; ++ dqstat = &qugid[type]->qugid_stat; ++ ++ if (dqstat->bhardlimit != 0 && ++ dqstat->bcurrent + number > dqstat->bhardlimit) ++ return NO_QUOTA; ++ ++ if (dqstat->bsoftlimit != 0 && ++ dqstat->bcurrent + number > dqstat->bsoftlimit) { ++ if (dqstat->btime == (time_t)0) { ++ if (!prealloc) ++ dqstat->btime = CURRENT_TIME_SECONDS ++ + dqinfo->bexpire; ++ else ++ /* ++ * Original Linux quota doesn't allow ++ * preallocation to exceed softlimit so ++ * exceeding will be always printed ++ */ ++ return NO_QUOTA; ++ } else if (CURRENT_TIME_SECONDS >= dqstat->btime) ++ return NO_QUOTA; ++ } ++ ++ return QUOTA_OK; ++} ++#endif ++ ++/* ---------------------------------------------------------------------- ++ * Quota superblock operations ++ * --------------------------------------------------------------------- */ ++ ++/* ++ * S_NOQUOTA note. ++ * In the current kernel (2.6.8.1), S_NOQUOTA flag is set only for ++ * - quota file (absent in our case) ++ * - after explicit DQUOT_DROP (earlier than clear_inode) in functions like ++ * filesystem-specific new_inode, before the inode gets outside links. ++ * For the latter case, the only quota operation where care about S_NOQUOTA ++ * might be required is vzquota_drop, but there S_NOQUOTA has already been ++ * checked in DQUOT_DROP(). ++ * So, S_NOQUOTA may be ignored for now in the VZDQ code. ++ * ++ * The above note is not entirely correct. ++ * Both for ext2 and ext3 filesystems, DQUOT_FREE_INODE is called from ++ * delete_inode if new_inode fails (for example, because of inode quota ++ * limits), so S_NOQUOTA check is needed in free_inode. ++ * This seems to be the dark corner of the current quota API. ++ */ ++ ++/* ++ * Initialize quota operations for the specified inode. ++ */ ++static int vzquota_initialize(struct inode *inode, int type) ++{ ++ vzquota_inode_init_call(inode); ++ return 0; /* ignored by caller */ ++} ++ ++/* ++ * Release quota for the specified inode. ++ */ ++static int vzquota_drop(struct inode *inode) ++{ ++ vzquota_inode_drop_call(inode); ++ return 0; /* ignored by caller */ ++} ++ ++/* ++ * Allocate block callback. ++ * ++ * If (prealloc) disk quota exceeding warning is not printed. ++ * See Linux quota to know why. 
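++ * prealloc == DQUOT_CMD_FORCE bypasses both the per-master and the
++ * per-ugid checks entirely.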
++ * ++ * Return: ++ * QUOTA_OK == 0 on SUCCESS ++ * NO_QUOTA == 1 if allocation should fail ++ */ ++static int vzquota_alloc_space(struct inode *inode, ++ qsize_t number, int prealloc) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ int ret = QUOTA_OK; ++ ++ qmblk = vzquota_inode_data(inode, &data); ++ if (qmblk == VZ_QUOTA_BAD) ++ return NO_QUOTA; ++ if (qmblk != NULL) { ++#ifdef CONFIG_VZ_QUOTA_UGID ++ int cnt; ++ struct vz_quota_ugid * qugid[MAXQUOTAS]; ++#endif ++ ++ /* checking first */ ++ ret = vzquota_check_space(&qmblk->dq_info, &qmblk->dq_stat, ++ number, qmblk->dq_id, prealloc); ++ if (ret == NO_QUOTA) ++ goto no_quota; ++#ifdef CONFIG_VZ_QUOTA_UGID ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ qugid[cnt] = INODE_QLNK(inode)->qugid[cnt]; ++ ret = vzquota_check_ugid_space(qmblk, qugid, ++ cnt, number, prealloc); ++ if (ret == NO_QUOTA) ++ goto no_quota; ++ } ++ /* check ok, may increment */ ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ if (qugid[cnt] == NULL) ++ continue; ++ vzquota_incr_space(&qugid[cnt]->qugid_stat, number); ++ } ++#endif ++ vzquota_incr_space(&qmblk->dq_stat, number); ++ vzquota_data_unlock(inode, &data); ++ } ++ ++ inode_add_bytes(inode, number); ++ might_sleep(); ++ return QUOTA_OK; ++ ++no_quota: ++ vzquota_data_unlock(inode, &data); ++ return NO_QUOTA; ++} ++ ++/* ++ * Allocate inodes callback. ++ * ++ * Return: ++ * QUOTA_OK == 0 on SUCCESS ++ * NO_QUOTA == 1 if allocation should fail ++ */ ++static int vzquota_alloc_inode(const struct inode *inode, unsigned long number) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ int ret = QUOTA_OK; ++ ++ qmblk = vzquota_inode_data((struct inode *)inode, &data); ++ if (qmblk == VZ_QUOTA_BAD) ++ return NO_QUOTA; ++ if (qmblk != NULL) { ++#ifdef CONFIG_VZ_QUOTA_UGID ++ int cnt; ++ struct vz_quota_ugid *qugid[MAXQUOTAS]; ++#endif ++ ++ /* checking first */ ++ ret = vzquota_check_inodes(&qmblk->dq_info, &qmblk->dq_stat, ++ number, qmblk->dq_id); ++ if (ret == NO_QUOTA) ++ goto no_quota; ++#ifdef CONFIG_VZ_QUOTA_UGID ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ qugid[cnt] = INODE_QLNK(inode)->qugid[cnt]; ++ ret = vzquota_check_ugid_inodes(qmblk, qugid, ++ cnt, number); ++ if (ret == NO_QUOTA) ++ goto no_quota; ++ } ++ /* check ok, may increment */ ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ if (qugid[cnt] == NULL) ++ continue; ++ vzquota_incr_inodes(&qugid[cnt]->qugid_stat, number); ++ } ++#endif ++ vzquota_incr_inodes(&qmblk->dq_stat, number); ++ vzquota_data_unlock((struct inode *)inode, &data); ++ } ++ ++ might_sleep(); ++ return QUOTA_OK; ++ ++no_quota: ++ vzquota_data_unlock((struct inode *)inode, &data); ++ return NO_QUOTA; ++} ++ ++/* ++ * Free space callback. 
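++ * Decrements usage on the master block and on any per-UID/GID
++ * records; the return value is not checked by the caller.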
++ */ ++static int vzquota_free_space(struct inode *inode, qsize_t number) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ ++ qmblk = vzquota_inode_data(inode, &data); ++ if (qmblk == VZ_QUOTA_BAD) ++ return NO_QUOTA; /* isn't checked by the caller */ ++ if (qmblk != NULL) { ++#ifdef CONFIG_VZ_QUOTA_UGID ++ int cnt; ++ struct vz_quota_ugid * qugid; ++#endif ++ ++ vzquota_decr_space(&qmblk->dq_stat, number); ++#ifdef CONFIG_VZ_QUOTA_UGID ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ qugid = INODE_QLNK(inode)->qugid[cnt]; ++ if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) ++ continue; ++ vzquota_decr_space(&qugid->qugid_stat, number); ++ } ++#endif ++ vzquota_data_unlock(inode, &data); ++ } ++ inode_sub_bytes(inode, number); ++ might_sleep(); ++ return QUOTA_OK; ++} ++ ++/* ++ * Free inodes callback. ++ */ ++static int vzquota_free_inode(const struct inode *inode, unsigned long number) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ ++ qmblk = vzquota_inode_data((struct inode *)inode, &data); ++ if (qmblk == VZ_QUOTA_BAD) ++ return NO_QUOTA; ++ if (qmblk != NULL) { ++#ifdef CONFIG_VZ_QUOTA_UGID ++ int cnt; ++ struct vz_quota_ugid * qugid; ++#endif ++ ++ vzquota_decr_inodes(&qmblk->dq_stat, number); ++#ifdef CONFIG_VZ_QUOTA_UGID ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ qugid = INODE_QLNK(inode)->qugid[cnt]; ++ if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) ++ continue; ++ vzquota_decr_inodes(&qugid->qugid_stat, number); ++ } ++#endif ++ vzquota_data_unlock((struct inode *)inode, &data); ++ } ++ might_sleep(); ++ return QUOTA_OK; ++} ++ ++void vzquota_inode_off(struct inode * inode) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ ++ /* The call is made through virtinfo, it can be an inode ++ * not controlled by vzquota. ++ */ ++ if (inode->i_sb->dq_op != &vz_quota_operations) ++ return; ++ ++ qmblk = vzquota_inode_data(inode, &data); ++ if (qmblk == VZ_QUOTA_BAD) ++ return; ++ ++ if (qmblk == NULL) { ++ /* Tricky place. If qmblk == NULL, it means that this inode ++ * is not in area controlled by vzquota (except for rare ++ * case of already set S_NOQUOTA). But we have to set ++ * S_NOQUOTA in any case because vzquota can be turned ++ * on later, when this inode is invalid from viewpoint ++ * of vzquota. ++ * ++ * To be safe, we reacquire vzquota lock. 
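++	 * (Reacquiring inode_qmblk_lock serializes the S_NOQUOTA update
++	 * with qmblk attach/detach on this inode.)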
++	 */
++		inode_qmblk_lock(inode->i_sb);
++		inode->i_flags |= S_NOQUOTA;
++		inode_qmblk_unlock(inode->i_sb);
++		return;
++	} else {
++		loff_t bytes = inode_get_bytes(inode);
++#ifdef CONFIG_VZ_QUOTA_UGID
++		int cnt;
++		struct vz_quota_ugid * qugid;
++#endif
++
++		inode->i_flags |= S_NOQUOTA;
++
++		vzquota_decr_space(&qmblk->dq_stat, bytes);
++		vzquota_decr_inodes(&qmblk->dq_stat, 1);
++#ifdef CONFIG_VZ_QUOTA_UGID
++		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
++			qugid = INODE_QLNK(inode)->qugid[cnt];
++			if (qugid == NULL || qugid == VZ_QUOTA_UGBAD)
++				continue;
++			vzquota_decr_space(&qugid->qugid_stat, bytes);
++			vzquota_decr_inodes(&qugid->qugid_stat, 1);
++		}
++#endif
++
++		vzquota_data_unlock(inode, &data);
++
++		vzquota_inode_drop_call(inode);
++	}
++}
++
++
++#ifdef CONFIG_VZ_QUOTA_UGID
++
++/*
++ * Helper function for quota transfer:
++ * check that we can add the inode to this quota_id.
++ */
++static int vzquota_transfer_check(struct vz_quota_master *qmblk,
++		struct vz_quota_ugid *qugid[],
++		unsigned int type, __u64 size)
++{
++	if (vzquota_check_ugid_space(qmblk, qugid, type, size, 0) != QUOTA_OK ||
++	    vzquota_check_ugid_inodes(qmblk, qugid, type, 1) != QUOTA_OK)
++		return -1;
++	return 0;
++}
++
++int vzquota_transfer_usage(struct inode *inode,
++		int mask,
++		struct vz_quota_ilink *qlnk)
++{
++	struct vz_quota_ugid *qugid_old;
++	__u64 space;
++	int i;
++
++	space = inode_get_bytes(inode);
++	for (i = 0; i < MAXQUOTAS; i++) {
++		if (!(mask & (1 << i)))
++			continue;
++		/*
++		 * Do not permit chowning a file if its owner does not have
++		 * a ugid record. This might happen if we somehow exceeded
++		 * the UID/GID limit (e.g. set uglimit to less than the
++		 * number of users).
++		 */
++		if (INODE_QLNK(inode)->qugid[i] == VZ_QUOTA_UGBAD)
++			return -1;
++		if (vzquota_transfer_check(qlnk->qmblk, qlnk->qugid, i, space))
++			return -1;
++	}
++
++	for (i = 0; i < MAXQUOTAS; i++) {
++		if (!(mask & (1 << i)))
++			continue;
++		qugid_old = INODE_QLNK(inode)->qugid[i];
++		vzquota_decr_space(&qugid_old->qugid_stat, space);
++		vzquota_decr_inodes(&qugid_old->qugid_stat, 1);
++		vzquota_incr_space(&qlnk->qugid[i]->qugid_stat, space);
++		vzquota_incr_inodes(&qlnk->qugid[i]->qugid_stat, 1);
++	}
++	return 0;
++}
++
++/*
++ * Transfer the inode between different user/group quotas.
++ */
++static int vzquota_transfer(struct inode *inode, struct iattr *iattr)
++{
++	return vzquota_inode_transfer_call(inode, iattr) ?
++			NO_QUOTA : QUOTA_OK;
++}
++
++#else /* CONFIG_VZ_QUOTA_UGID */
++
++static int vzquota_transfer(struct inode *inode, struct iattr *iattr)
++{
++	return QUOTA_OK;
++}
++
++#endif
++
++/*
++ * Called under the following semaphores:
++ *	old_d->d_inode->i_sb->s_vfs_rename_sem
++ *	old_d->d_inode->i_sem
++ *	new_d->d_inode->i_sem
++ * [not verified --SAW]
++ */
++static int vzquota_rename(struct inode *inode,
++		struct inode *old_dir, struct inode *new_dir)
++{
++	return vzquota_rename_check(inode, old_dir, new_dir) ?
++			NO_QUOTA : QUOTA_OK;
++}
++
++/*
++ * Structure of superblock diskquota operations.
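++ * Installed as sb->dq_op for superblocks under VZ quota control
++ * (see vzquota_get_super() in fs/vzdquot.c).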
++ */ ++struct dquot_operations vz_quota_operations = { ++ .initialize = vzquota_initialize, ++ .drop = vzquota_drop, ++ .alloc_space = vzquota_alloc_space, ++ .alloc_inode = vzquota_alloc_inode, ++ .free_space = vzquota_free_space, ++ .free_inode = vzquota_free_inode, ++ .transfer = vzquota_transfer, ++ .rename = vzquota_rename, ++}; +diff --git a/fs/vzdq_tree.c b/fs/vzdq_tree.c +new file mode 100644 +index 0000000..f4f2152 +--- /dev/null ++++ b/fs/vzdq_tree.c +@@ -0,0 +1,286 @@ ++/* ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * This file contains Virtuozzo quota tree implementation ++ */ ++ ++#include ++#include ++#include ++ ++struct quotatree_tree *quotatree_alloc(void) ++{ ++ int l; ++ struct quotatree_tree *tree; ++ ++ tree = kmalloc(sizeof(struct quotatree_tree), GFP_KERNEL); ++ if (tree == NULL) ++ goto out; ++ ++ for (l = 0; l < QUOTATREE_DEPTH; l++) { ++ INIT_LIST_HEAD(&tree->levels[l].usedlh); ++ INIT_LIST_HEAD(&tree->levels[l].freelh); ++ tree->levels[l].freenum = 0; ++ } ++ tree->root = NULL; ++ tree->leaf_num = 0; ++out: ++ return tree; ++} ++ ++static struct quotatree_node * ++quotatree_follow(struct quotatree_tree *tree, quotaid_t id, int level, ++ struct quotatree_find_state *st) ++{ ++ void **block; ++ struct quotatree_node *parent; ++ int l, index; ++ ++ parent = NULL; ++ block = (void **)&tree->root; ++ l = 0; ++ while (l < level && *block != NULL) { ++ index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK; ++ parent = *block; ++ block = parent->blocks + index; ++ l++; ++ } ++ if (st != NULL) { ++ st->block = block; ++ st->level = l; ++ } ++ ++ return parent; ++} ++ ++void *quotatree_find(struct quotatree_tree *tree, quotaid_t id, ++ struct quotatree_find_state *st) ++{ ++ quotatree_follow(tree, id, QUOTATREE_DEPTH, st); ++ if (st->level == QUOTATREE_DEPTH) ++ return *st->block; ++ else ++ return NULL; ++} ++ ++void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index) ++{ ++ int i, count; ++ struct quotatree_node *p; ++ void *leaf; ++ ++ if (QTREE_LEAFNUM(tree) <= index) ++ return NULL; ++ ++ count = 0; ++ list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) { ++ for (i = 0; i < QUOTATREE_BSIZE; i++) { ++ leaf = p->blocks[i]; ++ if (leaf == NULL) ++ continue; ++ if (count == index) ++ return leaf; ++ count++; ++ } ++ } ++ return NULL; ++} ++ ++/* returns data leaf (vz_quota_ugid) after _existent_ ugid (@id) ++ * in the tree... 
*/ ++void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id) ++{ ++ int off; ++ struct quotatree_node *parent, *p; ++ struct list_head *lh; ++ ++ /* get parent refering correct quota tree node of the last level */ ++ parent = quotatree_follow(tree, id, QUOTATREE_DEPTH, NULL); ++ if (!parent) ++ return NULL; ++ ++ off = (id & QUOTATREE_BMASK) + 1; /* next ugid */ ++ lh = &parent->list; ++ do { ++ p = list_entry(lh, struct quotatree_node, list); ++ for ( ; off < QUOTATREE_BSIZE; off++) ++ if (p->blocks[off]) ++ return p->blocks[off]; ++ off = 0; ++ lh = lh->next; ++ } while (lh != &QTREE_LEAFLVL(tree)->usedlh); ++ ++ return NULL; ++} ++ ++int quotatree_insert(struct quotatree_tree *tree, quotaid_t id, ++ struct quotatree_find_state *st, void *data) ++{ ++ struct quotatree_node *p; ++ int l, index; ++ ++ while (st->level < QUOTATREE_DEPTH) { ++ l = st->level; ++ if (!list_empty(&tree->levels[l].freelh)) { ++ p = list_entry(tree->levels[l].freelh.next, ++ struct quotatree_node, list); ++ list_del(&p->list); ++ } else { ++ p = kmalloc(sizeof(struct quotatree_node), GFP_NOFS | __GFP_NOFAIL); ++ if (p == NULL) ++ return -ENOMEM; ++ /* save block number in the l-level ++ * it uses for quota file generation */ ++ p->num = tree->levels[l].freenum++; ++ } ++ list_add(&p->list, &tree->levels[l].usedlh); ++ memset(p->blocks, 0, sizeof(p->blocks)); ++ *st->block = p; ++ ++ index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK; ++ st->block = p->blocks + index; ++ st->level++; ++ } ++ tree->leaf_num++; ++ *st->block = data; ++ ++ return 0; ++} ++ ++static struct quotatree_node * ++quotatree_remove_ptr(struct quotatree_tree *tree, quotaid_t id, ++ int level) ++{ ++ struct quotatree_node *parent; ++ struct quotatree_find_state st; ++ ++ parent = quotatree_follow(tree, id, level, &st); ++ if (st.level == QUOTATREE_DEPTH) ++ tree->leaf_num--; ++ *st.block = NULL; ++ return parent; ++} ++ ++void quotatree_remove(struct quotatree_tree *tree, quotaid_t id) ++{ ++ struct quotatree_node *p; ++ int level, i; ++ ++ p = quotatree_remove_ptr(tree, id, QUOTATREE_DEPTH); ++ for (level = QUOTATREE_DEPTH - 1; level >= QUOTATREE_CDEPTH; level--) { ++ for (i = 0; i < QUOTATREE_BSIZE; i++) ++ if (p->blocks[i] != NULL) ++ return; ++ list_move(&p->list, &tree->levels[level].freelh); ++ p = quotatree_remove_ptr(tree, id, level); ++ } ++} ++ ++#if 0 ++static void quotatree_walk(struct quotatree_tree *tree, ++ struct quotatree_node *node_start, ++ quotaid_t id_start, ++ int level_start, int level_end, ++ int (*callback)(struct quotatree_tree *, ++ quotaid_t id, ++ int level, ++ void *ptr, ++ void *data), ++ void *data) ++{ ++ struct quotatree_node *p; ++ int l, shift, index; ++ quotaid_t id; ++ struct quotatree_find_state st; ++ ++ p = node_start; ++ l = level_start; ++ shift = (QUOTATREE_DEPTH - l) * QUOTAID_BBITS; ++ id = id_start; ++ index = 0; ++ ++ /* ++ * Invariants: ++ * shift == (QUOTATREE_DEPTH - l) * QUOTAID_BBITS; ++ * id & ((1 << shift) - 1) == 0 ++ * p is l-level node corresponding to id ++ */ ++ do { ++ if (!p) ++ break; ++ ++ if (l < level_end) { ++ for (; index < QUOTATREE_BSIZE; index++) ++ if (p->blocks[index] != NULL) ++ break; ++ if (index < QUOTATREE_BSIZE) { ++ /* descend */ ++ p = p->blocks[index]; ++ l++; ++ shift -= QUOTAID_BBITS; ++ id += (quotaid_t)index << shift; ++ index = 0; ++ continue; ++ } ++ } ++ ++ if ((*callback)(tree, id, l, p, data)) ++ break; ++ ++ /* ascend and to the next node */ ++ p = quotatree_follow(tree, id, l, &st); ++ ++ index = ((id >> shift) & QUOTATREE_BMASK) + 1; ++ 
l--;
++		shift += QUOTAID_BBITS;
++		id &= ~(((quotaid_t)1 << shift) - 1);
++	} while (l >= level_start);
++}
++#endif
++
++static void free_list(struct list_head *node_list)
++{
++	struct quotatree_node *p, *tmp;
++
++	list_for_each_entry_safe(p, tmp, node_list, list) {
++		list_del(&p->list);
++		kfree(p);
++	}
++}
++
++static inline void quotatree_free_nodes(struct quotatree_tree *tree)
++{
++	int i;
++
++	for (i = 0; i < QUOTATREE_DEPTH; i++) {
++		free_list(&tree->levels[i].usedlh);
++		free_list(&tree->levels[i].freelh);
++	}
++}
++
++static void quotatree_free_leafs(struct quotatree_tree *tree,
++		void (*dtor)(void *))
++{
++	int i;
++	struct quotatree_node *p;
++
++	list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) {
++		for (i = 0; i < QUOTATREE_BSIZE; i++) {
++			if (p->blocks[i] == NULL)
++				continue;
++
++			dtor(p->blocks[i]);
++		}
++	}
++}
++
++void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *))
++{
++	quotatree_free_leafs(tree, dtor);
++	quotatree_free_nodes(tree);
++	kfree(tree);
++}
+diff --git a/fs/vzdq_ugid.c b/fs/vzdq_ugid.c
+new file mode 100644
+index 0000000..1031149
+--- /dev/null
++++ b/fs/vzdq_ugid.c
+@@ -0,0 +1,1221 @@
++/*
++ * Copyright (C) 2002 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * This file contains Virtuozzo UID/GID disk quota implementation
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include
++#include
++#include
++
++/*
++ * XXX
++ * maybe something is needed for sb->s_dquot->info[]?
++ */
++
++#define USRQUOTA_MASK		(1 << USRQUOTA)
++#define GRPQUOTA_MASK		(1 << GRPQUOTA)
++#define QTYPE2MASK(type)	(1 << (type))
++
++static struct kmem_cache *vz_quota_ugid_cachep;
++
++/* guard to protect vz_quota_master from being destroyed in quota_on/off.
++ * Also protects the list on the hash table */
++extern struct semaphore vz_quota_sem;
++
++inline struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid)
++{
++	if (qugid != VZ_QUOTA_UGBAD)
++		atomic_inc(&qugid->qugid_count);
++	return qugid;
++}
++
++/* we don't limit users with zero limits */
++static inline int vzquota_fake_stat(struct dq_stat *stat)
++{
++	return stat->bhardlimit == 0 && stat->bsoftlimit == 0 &&
++		stat->ihardlimit == 0 && stat->isoftlimit == 0;
++}
++
++/* callback function for quotatree_free() */
++static inline void vzquota_free_qugid(void *ptr)
++{
++	kmem_cache_free(vz_quota_ugid_cachep, ptr);
++}
++
++/*
++ * destroy the ugid if it has zero refcount, limits and usage;
++ * must be called under qmblk->dq_sem
++ */
++void vzquota_put_ugid(struct vz_quota_master *qmblk,
++		struct vz_quota_ugid *qugid)
++{
++	if (qugid == VZ_QUOTA_UGBAD)
++		return;
++	qmblk_data_read_lock(qmblk);
++	if (atomic_dec_and_test(&qugid->qugid_count) &&
++	    (qmblk->dq_flags & VZDQUG_FIXED_SET) == 0 &&
++	    vzquota_fake_stat(&qugid->qugid_stat) &&
++	    qugid->qugid_stat.bcurrent == 0 &&
++	    qugid->qugid_stat.icurrent == 0) {
++		quotatree_remove(QUGID_TREE(qmblk, qugid->qugid_type),
++				qugid->qugid_id);
++		qmblk->dq_ugid_count--;
++		vzquota_free_qugid(qugid);
++	}
++	qmblk_data_read_unlock(qmblk);
++}
++
++/*
++ * Get the ugid block by its index, as if the leaves formed an array.
++ * In reality, this is not an array - it is the chain of leaf blocks
++ * of the tree.
++ * Returns NULL if the index is out of range.
++ * The qmblk semaphore is required to protect the tree.
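++ *
++ * Typical iteration over the whole "virtual array", as done in
++ * do_quota_ugid_getstat() below:
++ *
++ *	for (qugid = vzquota_get_byindex(qmblk, index, type);
++ *	     qugid != NULL;
++ *	     qugid = vzquota_get_next(qmblk, qugid))
++ *		...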
++ */ ++static inline struct vz_quota_ugid * ++vzquota_get_byindex(struct vz_quota_master *qmblk, unsigned int index, int type) ++{ ++ return quotatree_leaf_byindex(QUGID_TREE(qmblk, type), index); ++} ++ ++/* ++ * get next element from ugid "virtual array" ++ * ugid must be in current array and this array may not be changed between ++ * two accesses (quaranteed by "stopped" quota state and quota semaphore) ++ * qmblk semaphore is required to protect the tree ++ */ ++static inline struct vz_quota_ugid * ++vzquota_get_next(struct vz_quota_master *qmblk, struct vz_quota_ugid *qugid) ++{ ++ return quotatree_get_next(QUGID_TREE(qmblk, qugid->qugid_type), ++ qugid->qugid_id); ++} ++ ++/* ++ * requires dq_sem ++ */ ++struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk, ++ unsigned int quota_id, int type, int flags) ++{ ++ struct vz_quota_ugid *qugid; ++ struct quotatree_tree *tree; ++ struct quotatree_find_state st; ++ ++ tree = QUGID_TREE(qmblk, type); ++ qugid = quotatree_find(tree, quota_id, &st); ++ if (qugid) ++ goto success; ++ ++ /* caller does not want alloc */ ++ if (flags & VZDQUG_FIND_DONT_ALLOC) ++ goto fail; ++ ++ if (flags & VZDQUG_FIND_FAKE) ++ goto doit; ++ ++ /* check limit */ ++ if (qmblk->dq_ugid_count >= qmblk->dq_ugid_max) ++ goto fail; ++ ++ /* see comment at VZDQUG_FIXED_SET define */ ++ if (qmblk->dq_flags & VZDQUG_FIXED_SET) ++ goto fail; ++ ++doit: ++ /* alloc new structure */ ++ qugid = kmem_cache_alloc(vz_quota_ugid_cachep, ++ GFP_NOFS | __GFP_NOFAIL); ++ if (qugid == NULL) ++ goto fail; ++ ++ /* initialize new structure */ ++ qugid->qugid_id = quota_id; ++ memset(&qugid->qugid_stat, 0, sizeof(qugid->qugid_stat)); ++ qugid->qugid_type = type; ++ atomic_set(&qugid->qugid_count, 0); ++ ++ /* insert in tree */ ++ if (quotatree_insert(tree, quota_id, &st, qugid) < 0) ++ goto fail_insert; ++ qmblk->dq_ugid_count++; ++ ++success: ++ vzquota_get_ugid(qugid); ++ return qugid; ++ ++fail_insert: ++ vzquota_free_qugid(qugid); ++fail: ++ return VZ_QUOTA_UGBAD; ++} ++ ++/* ++ * takes dq_sem, may schedule ++ */ ++struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk, ++ unsigned int quota_id, int type, int flags) ++{ ++ struct vz_quota_ugid *qugid; ++ ++ down(&qmblk->dq_sem); ++ qugid = __vzquota_find_ugid(qmblk, quota_id, type, flags); ++ up(&qmblk->dq_sem); ++ ++ return qugid; ++} ++ ++/* ++ * destroy all ugid records on given quota master ++ */ ++void vzquota_kill_ugid(struct vz_quota_master *qmblk) ++{ ++ BUG_ON((qmblk->dq_gid_tree == NULL && qmblk->dq_uid_tree != NULL) || ++ (qmblk->dq_uid_tree == NULL && qmblk->dq_gid_tree != NULL)); ++ ++ if (qmblk->dq_uid_tree != NULL) { ++ quotatree_free(qmblk->dq_uid_tree, vzquota_free_qugid); ++ quotatree_free(qmblk->dq_gid_tree, vzquota_free_qugid); ++ } ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * Management interface to ugid quota for (super)users. 
++ * --------------------------------------------------------------------- */ ++ ++static int vzquota_initialize2(struct inode *inode, int type) ++{ ++ return QUOTA_OK; ++} ++ ++static int vzquota_drop2(struct inode *inode) ++{ ++ return QUOTA_OK; ++} ++ ++static int vzquota_alloc_space2(struct inode *inode, ++ qsize_t number, int prealloc) ++{ ++ inode_add_bytes(inode, number); ++ return QUOTA_OK; ++} ++ ++static int vzquota_alloc_inode2(const struct inode *inode, unsigned long number) ++{ ++ return QUOTA_OK; ++} ++ ++static int vzquota_free_space2(struct inode *inode, qsize_t number) ++{ ++ inode_sub_bytes(inode, number); ++ return QUOTA_OK; ++} ++ ++static int vzquota_free_inode2(const struct inode *inode, unsigned long number) ++{ ++ return QUOTA_OK; ++} ++ ++static int vzquota_transfer2(struct inode *inode, struct iattr *iattr) ++{ ++ return QUOTA_OK; ++} ++ ++struct dquot_operations vz_quota_operations2 = { ++ .initialize = vzquota_initialize2, ++ .drop = vzquota_drop2, ++ .alloc_space = vzquota_alloc_space2, ++ .alloc_inode = vzquota_alloc_inode2, ++ .free_space = vzquota_free_space2, ++ .free_inode = vzquota_free_inode2, ++ .transfer = vzquota_transfer2, ++}; ++ ++ ++asmlinkage long sys_unlink(const char __user * pathname); ++asmlinkage long sys_rename(const char __user * oldname, ++ const char __user * newname); ++asmlinkage long sys_symlink(const char __user * oldname, ++ const char __user * newname); ++ ++/* called under sb->s_umount semaphore */ ++static int vz_restore_symlink(struct super_block *sb, char *path, int type) ++{ ++ mm_segment_t oldfs; ++ char *newpath; ++ char dest[64]; ++ const char *names[] = { ++ [USRQUOTA] "aquota.user", ++ [GRPQUOTA] "aquota.group" ++ }; ++ int err; ++ ++ newpath = kmalloc(strlen(path) + sizeof(".new"), GFP_KERNEL); ++ if (newpath == NULL) ++ return -ENOMEM; ++ ++ strcpy(newpath, path); ++ strcat(newpath, ".new"); ++ ++ sprintf(dest, "/proc/vz/vzaquota/%08x/%s", ++ new_encode_dev(sb->s_dev), names[type]); ++ ++ /* ++ * Lockdep will learn unneeded dependency while unlink(2): ++ * ->s_umount => ->i_mutex/1 => ->i_mutex ++ * Reverse dependency is, ++ * open_namei() => ->i_mutex => lookup_hash() => __lookup_hash() ++ * => ->lookup() \eq vzdq_aquotq_lookup() => find_qmblk_by_dev() ++ * => user_get_super() => ->s_umount ++ * ++ * However, first set of ->i_mutex'es belong to /, second to /proc . ++ * Right fix is to get rid of vz_restore_symlink(), of course. ++ */ ++ up_read(&sb->s_umount); ++ ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = sys_unlink(newpath); ++ if (err < 0 && err != -ENOENT) ++ goto out_restore; ++ err = sys_symlink(dest, newpath); ++ if (err < 0) ++ goto out_restore; ++ err = sys_rename(newpath, path); ++out_restore: ++ set_fs(oldfs); ++ ++ down_read(&sb->s_umount); ++ /* umounted meanwhile? 
*/ ++ if (err == 0 && !sb->s_root) ++ err = -ENODEV; ++ ++ kfree(newpath); ++ return err; ++} ++ ++/* called under sb->s_umount semaphore */ ++static int vz_quota_on(struct super_block *sb, int type, ++ int format_id, char *path, int remount) ++{ ++ struct vz_quota_master *qmblk; ++ int mask, mask2; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ ++ err = vz_restore_symlink(sb, path, type); ++ if (err < 0) ++ goto out_put; ++ ++ down(&vz_quota_sem); ++ mask = 0; ++ mask2 = 0; ++ sb->dq_op = &vz_quota_operations2; ++ sb->s_qcop = &vz_quotactl_operations; ++ if (type == USRQUOTA) { ++ mask = DQUOT_USR_ENABLED; ++ mask2 = VZDQ_USRQUOTA; ++ } ++ if (type == GRPQUOTA) { ++ mask = DQUOT_GRP_ENABLED; ++ mask2 = VZDQ_GRPQUOTA; ++ } ++ err = -EBUSY; ++ if (qmblk->dq_flags & mask2) ++ goto out_sem; ++ ++ err = 0; ++ qmblk->dq_flags |= mask2; ++ sb->s_dquot.flags |= mask; ++ ++out_sem: ++ up(&vz_quota_sem); ++out_put: ++ qmblk_put(qmblk); ++out: ++ return err; ++} ++ ++static int vz_quota_off(struct super_block *sb, int type, int remount) ++{ ++ struct vz_quota_master *qmblk; ++ int mask2; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ ++ mask2 = 0; ++ if (type == USRQUOTA) ++ mask2 = VZDQ_USRQUOTA; ++ if (type == GRPQUOTA) ++ mask2 = VZDQ_GRPQUOTA; ++ err = -EINVAL; ++ if (!(qmblk->dq_flags & mask2)) ++ goto out; ++ ++ qmblk->dq_flags &= ~mask2; ++ err = 0; ++ ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++static int vz_quota_sync(struct super_block *sb, int type) ++{ ++ return 0; /* vz quota is always uptodate */ ++} ++ ++static int vz_get_dqblk(struct super_block *sb, int type, ++ qid_t id, struct if_dqblk *di) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid *ugid; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ ++ err = 0; ++ ugid = vzquota_find_ugid(qmblk, id, type, VZDQUG_FIND_DONT_ALLOC); ++ if (ugid != VZ_QUOTA_UGBAD) { ++ qmblk_data_read_lock(qmblk); ++ di->dqb_bhardlimit = ugid->qugid_stat.bhardlimit >> 10; ++ di->dqb_bsoftlimit = ugid->qugid_stat.bsoftlimit >> 10; ++ di->dqb_curspace = ugid->qugid_stat.bcurrent; ++ di->dqb_ihardlimit = ugid->qugid_stat.ihardlimit; ++ di->dqb_isoftlimit = ugid->qugid_stat.isoftlimit; ++ di->dqb_curinodes = ugid->qugid_stat.icurrent; ++ di->dqb_btime = ugid->qugid_stat.btime; ++ di->dqb_itime = ugid->qugid_stat.itime; ++ qmblk_data_read_unlock(qmblk); ++ di->dqb_valid = QIF_ALL; ++ vzquota_put_ugid(qmblk, ugid); ++ } else { ++ memset(di, 0, sizeof(*di)); ++ di->dqb_valid = QIF_ALL; ++ } ++ ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++/* must be called under vz_quota_sem */ ++static int __vz_set_dqblk(struct vz_quota_master *qmblk, ++ int type, qid_t id, struct if_dqblk *di) ++{ ++ struct vz_quota_ugid *ugid; ++ ++ ugid = vzquota_find_ugid(qmblk, id, type, 0); ++ if (ugid == VZ_QUOTA_UGBAD) ++ return -ESRCH; ++ ++ qmblk_data_write_lock(qmblk); ++ /* ++ * Subtle compatibility breakage. ++ * ++ * Some old non-vz kernel quota didn't start grace period ++ * if the new soft limit happens to be below the usage. 
++ * Non-vz kernel quota in 2.4.20 starts the grace period ++ * (if it hasn't been started). ++ * Current non-vz kernel performs even more complicated ++ * manipulations... ++ * ++ * Also, current non-vz kernels have inconsistency related to ++ * the grace time start. In regular operations the grace period ++ * is started if the usage is greater than the soft limit (and, ++ * strangely, is cancelled if the usage is less). ++ * However, set_dqblk starts the grace period if the usage is greater ++ * or equal to the soft limit. ++ * ++ * Here we try to mimic the behavior of the current non-vz kernel. ++ */ ++ if (di->dqb_valid & QIF_BLIMITS) { ++ ugid->qugid_stat.bhardlimit = ++ (__u64)di->dqb_bhardlimit << 10; ++ ugid->qugid_stat.bsoftlimit = ++ (__u64)di->dqb_bsoftlimit << 10; ++ if (di->dqb_bsoftlimit == 0 || ++ ugid->qugid_stat.bcurrent < ugid->qugid_stat.bsoftlimit) ++ ugid->qugid_stat.btime = 0; ++ else if (!(di->dqb_valid & QIF_BTIME)) ++ ugid->qugid_stat.btime = CURRENT_TIME_SECONDS ++ + qmblk->dq_ugid_info[type].bexpire; ++ else ++ ugid->qugid_stat.btime = di->dqb_btime; ++ } ++ if (di->dqb_valid & QIF_ILIMITS) { ++ ugid->qugid_stat.ihardlimit = di->dqb_ihardlimit; ++ ugid->qugid_stat.isoftlimit = di->dqb_isoftlimit; ++ if (di->dqb_isoftlimit == 0 || ++ ugid->qugid_stat.icurrent < ugid->qugid_stat.isoftlimit) ++ ugid->qugid_stat.itime = 0; ++ else if (!(di->dqb_valid & QIF_ITIME)) ++ ugid->qugid_stat.itime = CURRENT_TIME_SECONDS ++ + qmblk->dq_ugid_info[type].iexpire; ++ else ++ ugid->qugid_stat.itime = di->dqb_itime; ++ } ++ qmblk_data_write_unlock(qmblk); ++ vzquota_put_ugid(qmblk, ugid); ++ ++ return 0; ++} ++ ++static int vz_set_dqblk(struct super_block *sb, int type, ++ qid_t id, struct if_dqblk *di) ++{ ++ struct vz_quota_master *qmblk; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ err = __vz_set_dqblk(qmblk, type, id, di); ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++static int vz_get_dqinfo(struct super_block *sb, int type, ++ struct if_dqinfo *ii) ++{ ++ struct vz_quota_master *qmblk; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ ++ err = 0; ++ ii->dqi_bgrace = qmblk->dq_ugid_info[type].bexpire; ++ ii->dqi_igrace = qmblk->dq_ugid_info[type].iexpire; ++ ii->dqi_flags = 0; ++ ii->dqi_valid = IIF_ALL; ++ ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++/* must be called under vz_quota_sem */ ++static int __vz_set_dqinfo(struct vz_quota_master *qmblk, ++ int type, struct if_dqinfo *ii) ++{ ++ if (ii->dqi_valid & IIF_FLAGS) ++ if (ii->dqi_flags & DQF_MASK) ++ return -EINVAL; ++ ++ if (ii->dqi_valid & IIF_BGRACE) ++ qmblk->dq_ugid_info[type].bexpire = ii->dqi_bgrace; ++ if (ii->dqi_valid & IIF_IGRACE) ++ qmblk->dq_ugid_info[type].iexpire = ii->dqi_igrace; ++ return 0; ++} ++ ++static int vz_set_dqinfo(struct super_block *sb, int type, ++ struct if_dqinfo *ii) ++{ ++ struct vz_quota_master *qmblk; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ err = __vz_set_dqinfo(qmblk, type, ii); ++out: ++ up(&vz_quota_sem); ++ if 
(qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++#ifdef CONFIG_QUOTA_COMPAT ++ ++#define Q_GETQUOTI_SIZE 1024 ++ ++#define UGID2DQBLK(dst, src) \ ++ do { \ ++ (dst)->dqb_ihardlimit = (src)->qugid_stat.ihardlimit; \ ++ (dst)->dqb_isoftlimit = (src)->qugid_stat.isoftlimit; \ ++ (dst)->dqb_curinodes = (src)->qugid_stat.icurrent; \ ++ /* in 1K blocks */ \ ++ (dst)->dqb_bhardlimit = (src)->qugid_stat.bhardlimit >> 10; \ ++ /* in 1K blocks */ \ ++ (dst)->dqb_bsoftlimit = (src)->qugid_stat.bsoftlimit >> 10; \ ++ /* in bytes, 64 bit */ \ ++ (dst)->dqb_curspace = (src)->qugid_stat.bcurrent; \ ++ (dst)->dqb_btime = (src)->qugid_stat.btime; \ ++ (dst)->dqb_itime = (src)->qugid_stat.itime; \ ++ } while (0) ++ ++static int vz_get_quoti(struct super_block *sb, int type, qid_t idx, ++ struct v2_disk_dqblk __user *dqblk) ++{ ++ struct vz_quota_master *qmblk; ++ struct v2_disk_dqblk *data, *kbuf; ++ struct vz_quota_ugid *ugid; ++ int count; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ ++ err = -ENOMEM; ++ kbuf = vmalloc(Q_GETQUOTI_SIZE * sizeof(*kbuf)); ++ if (!kbuf) ++ goto out; ++ ++ down(&vz_quota_sem); ++ down(&qmblk->dq_sem); ++ for (ugid = vzquota_get_byindex(qmblk, idx, type), count = 0; ++ ugid != NULL && count < Q_GETQUOTI_SIZE; ++ count++) ++ { ++ data = kbuf + count; ++ qmblk_data_read_lock(qmblk); ++ UGID2DQBLK(data, ugid); ++ qmblk_data_read_unlock(qmblk); ++ data->dqb_id = ugid->qugid_id; ++ ++ /* Find next entry */ ++ ugid = vzquota_get_next(qmblk, ugid); ++ BUG_ON(ugid != NULL && ugid->qugid_type != type); ++ } ++ up(&qmblk->dq_sem); ++ up(&vz_quota_sem); ++ ++ err = count; ++ if (copy_to_user(dqblk, kbuf, count * sizeof(*kbuf))) ++ err = -EFAULT; ++ ++ vfree(kbuf); ++out: ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ ++ return err; ++} ++ ++#endif ++ ++struct quotactl_ops vz_quotactl_operations = { ++ .quota_on = vz_quota_on, ++ .quota_off = vz_quota_off, ++ .quota_sync = vz_quota_sync, ++ .get_info = vz_get_dqinfo, ++ .set_info = vz_set_dqinfo, ++ .get_dqblk = vz_get_dqblk, ++ .set_dqblk = vz_set_dqblk, ++#ifdef CONFIG_QUOTA_COMPAT ++ .get_quoti = vz_get_quoti, ++#endif ++}; ++ ++ ++/* ---------------------------------------------------------------------- ++ * Management interface for host system admins. 
++ * --------------------------------------------------------------------- */ ++ ++static int quota_ugid_addstat(unsigned int quota_id, unsigned int ugid_size, ++ struct vz_quota_iface __user *u_ugid_buf, int compat) ++{ ++ struct vz_quota_master *qmblk; ++ int ret; ++ ++ down(&vz_quota_sem); ++ ++ ret = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ ret = -EBUSY; ++ if (qmblk->dq_state != VZDQ_STARTING) ++ goto out; /* working quota doesn't accept new ugids */ ++ ++ ret = 0; ++ /* start to add ugids */ ++ for (ret = 0; ret < ugid_size; ret++) { ++ struct vz_quota_iface ugid_buf; ++ struct vz_quota_ugid *ugid; ++ ++ if (!compat) { ++ if (copy_from_user(&ugid_buf, u_ugid_buf, ++ sizeof(ugid_buf))) ++ break; ++ u_ugid_buf++; /* next user buffer */ ++ } else { ++#ifdef CONFIG_COMPAT ++ struct compat_vz_quota_iface oqif; ++ if (copy_from_user(&oqif, u_ugid_buf, ++ sizeof(oqif))) ++ break; ++ ugid_buf.qi_id = oqif.qi_id; ++ ugid_buf.qi_type = oqif.qi_type; ++ compat_dqstat2dqstat(&oqif.qi_stat, &ugid_buf.qi_stat); ++ u_ugid_buf = (struct vz_quota_iface __user *) ++ (((void *)u_ugid_buf) + sizeof(oqif)); ++#endif ++ } ++ ++ if (ugid_buf.qi_type >= MAXQUOTAS) ++ break; /* bad quota type - this is the only check */ ++ ++ ugid = vzquota_find_ugid(qmblk, ++ ugid_buf.qi_id, ugid_buf.qi_type, 0); ++ if (ugid == VZ_QUOTA_UGBAD) { ++ qmblk->dq_flags |= VZDQUG_FIXED_SET; ++ break; /* limit reached */ ++ } ++ ++ /* update usage/limits ++ * we can copy the data without the lock, because the data ++ * cannot be modified in VZDQ_STARTING state */ ++ ugid->qugid_stat = ugid_buf.qi_stat; ++ ++ vzquota_put_ugid(qmblk, ugid); ++ } ++out: ++ up(&vz_quota_sem); ++ ++ return ret; ++} ++ ++static int quota_ugid_setgrace(unsigned int quota_id, ++ struct dq_info __user u_dq_info[], int compat) ++{ ++ struct vz_quota_master *qmblk; ++ struct dq_info dq_info[MAXQUOTAS]; ++ struct dq_info *target; ++ int err, type; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EBUSY; ++ if (qmblk->dq_state != VZDQ_STARTING) ++ goto out; /* working quota doesn't accept changing options */ ++ ++ err = -EFAULT; ++ if (!compat) { ++ if (copy_from_user(dq_info, u_dq_info, sizeof(dq_info))) ++ goto out; ++ } else { ++#ifdef CONFIG_COMPAT ++ struct compat_dq_info odqi[MAXQUOTAS]; ++ if (copy_from_user(odqi, u_dq_info, sizeof(odqi))) ++ goto out; ++ for (type = 0; type < MAXQUOTAS; type++) ++ compat_dqinfo2dqinfo(&odqi[type], &dq_info[type]); ++#endif ++ } ++ ++ err = 0; ++ ++ /* update in qmblk */ ++ for (type = 0; type < MAXQUOTAS; type++) { ++ target = &qmblk->dq_ugid_info[type]; ++ target->bexpire = dq_info[type].bexpire; ++ target->iexpire = dq_info[type].iexpire; ++ } ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++static int do_quota_ugid_getstat(struct vz_quota_master *qmblk, int index, int size, ++ struct vz_quota_iface *u_ugid_buf) ++{ ++ int type, count; ++ struct vz_quota_ugid *ugid; ++ ++ if (QTREE_LEAFNUM(qmblk->dq_uid_tree) + ++ QTREE_LEAFNUM(qmblk->dq_gid_tree) ++ <= index) ++ return 0; ++ ++ count = 0; ++ ++ type = index < QTREE_LEAFNUM(qmblk->dq_uid_tree) ? 
USRQUOTA : GRPQUOTA; ++ if (type == GRPQUOTA) ++ index -= QTREE_LEAFNUM(qmblk->dq_uid_tree); ++ ++ /* loop through ugid and then qgid quota */ ++repeat: ++ for (ugid = vzquota_get_byindex(qmblk, index, type); ++ ugid != NULL && count < size; ++ ugid = vzquota_get_next(qmblk, ugid), count++) ++ { ++ struct vz_quota_iface ugid_buf; ++ ++ /* form interface buffer and send in to user-level */ ++ qmblk_data_read_lock(qmblk); ++ memcpy(&ugid_buf.qi_stat, &ugid->qugid_stat, ++ sizeof(ugid_buf.qi_stat)); ++ qmblk_data_read_unlock(qmblk); ++ ugid_buf.qi_id = ugid->qugid_id; ++ ugid_buf.qi_type = ugid->qugid_type; ++ ++ memcpy(u_ugid_buf, &ugid_buf, sizeof(ugid_buf)); ++ u_ugid_buf++; /* next portion of user buffer */ ++ } ++ ++ if (type == USRQUOTA && count < size) { ++ type = GRPQUOTA; ++ index = 0; ++ goto repeat; ++ } ++ ++ return count; ++} ++ ++static int quota_ugid_getstat(unsigned int quota_id, ++ int index, int size, struct vz_quota_iface __user *u_ugid_buf, ++ int compat) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_iface *k_ugid_buf; ++ int err; ++ ++ if (index < 0 || size < 0) ++ return -EINVAL; ++ ++ if (size > INT_MAX / sizeof(struct vz_quota_iface)) ++ return -EINVAL; ++ ++ k_ugid_buf = vmalloc(size * sizeof(struct vz_quota_iface)); ++ if (k_ugid_buf == NULL) ++ return -ENOMEM; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ down(&qmblk->dq_sem); ++ err = do_quota_ugid_getstat(qmblk, index, size, k_ugid_buf); ++ up(&qmblk->dq_sem); ++ if (err < 0) ++ goto out; ++ ++ if (!compat) { ++ if (copy_to_user(u_ugid_buf, k_ugid_buf, ++ err * sizeof(struct vz_quota_iface))) ++ err = -EFAULT; ++ } else { ++#ifdef CONFIG_COMPAT ++ struct compat_vz_quota_iface oqif; ++ int i; ++ for (i = 0; i < err; i++) { ++ oqif.qi_id = k_ugid_buf[i].qi_id; ++ oqif.qi_type = k_ugid_buf[i].qi_type; ++ dqstat2compat_dqstat(&k_ugid_buf[i].qi_stat, ++ &oqif.qi_stat); ++ if (copy_to_user(u_ugid_buf, &oqif, sizeof(oqif))) ++ err = -EFAULT; ++ u_ugid_buf = (struct vz_quota_iface __user *) ++ (((void *)u_ugid_buf) + sizeof(oqif)); ++ } ++#endif ++ } ++ ++out: ++ up(&vz_quota_sem); ++ vfree(k_ugid_buf); ++ return err; ++} ++ ++static int quota_ugid_getgrace(unsigned int quota_id, ++ struct dq_info __user u_dq_info[], int compat) ++{ ++ struct vz_quota_master *qmblk; ++ struct dq_info dq_info[MAXQUOTAS]; ++ struct dq_info *target; ++ int err, type; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = 0; ++ /* update from qmblk */ ++ for (type = 0; type < MAXQUOTAS; type ++) { ++ target = &qmblk->dq_ugid_info[type]; ++ dq_info[type].bexpire = target->bexpire; ++ dq_info[type].iexpire = target->iexpire; ++ dq_info[type].flags = target->flags; ++ } ++ ++ if (!compat) { ++ if (copy_to_user(u_dq_info, dq_info, sizeof(dq_info))) ++ err = -EFAULT; ++ } else { ++#ifdef CONFIG_COMPAT ++ struct compat_dq_info odqi[MAXQUOTAS]; ++ for (type = 0; type < MAXQUOTAS; type ++) ++ dqinfo2compat_dqinfo(&dq_info[type], &odqi[type]); ++ if (copy_to_user(u_dq_info, odqi, sizeof(odqi))) ++ err = -EFAULT; ++#endif ++ } ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++static int quota_ugid_getconfig(unsigned int quota_id, ++ struct vz_quota_ugid_stat __user *info) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid_stat kinfo; ++ int err; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ 
goto out; ++ ++ err = 0; ++ kinfo.limit = qmblk->dq_ugid_max; ++ kinfo.count = qmblk->dq_ugid_count; ++ kinfo.flags = qmblk->dq_flags; ++ ++ if (copy_to_user(info, &kinfo, sizeof(kinfo))) ++ err = -EFAULT; ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++static int quota_ugid_setconfig(unsigned int quota_id, ++ struct vz_quota_ugid_stat __user *info) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid_stat kinfo; ++ int err; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EFAULT; ++ if (copy_from_user(&kinfo, info, sizeof(kinfo))) ++ goto out; ++ ++ err = 0; ++ qmblk->dq_ugid_max = kinfo.limit; ++ if (qmblk->dq_state == VZDQ_STARTING) { ++ qmblk->dq_flags = kinfo.flags; ++ if (qmblk->dq_flags & VZDQUG_ON) ++ qmblk->dq_flags |= VZDQ_USRQUOTA | VZDQ_GRPQUOTA; ++ } ++ ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++static int quota_ugid_setlimit(unsigned int quota_id, ++ struct vz_quota_ugid_setlimit __user *u_lim) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid_setlimit lim; ++ int err; ++ ++ down(&vz_quota_sem); ++ ++ err = -ESRCH; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EFAULT; ++ if (copy_from_user(&lim, u_lim, sizeof(lim))) ++ goto out; ++ ++ err = __vz_set_dqblk(qmblk, lim.type, lim.id, &lim.dqb); ++ ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++static int quota_ugid_setinfo(unsigned int quota_id, ++ struct vz_quota_ugid_setinfo __user *u_info) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid_setinfo info; ++ int err; ++ ++ down(&vz_quota_sem); ++ ++ err = -ESRCH; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EFAULT; ++ if (copy_from_user(&info, u_info, sizeof(info))) ++ goto out; ++ ++ err = __vz_set_dqinfo(qmblk, info.type, &info.dqi); ++ ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++/* ++ * This is a system call to maintain UGID quotas ++ * Note this call is allowed to run ONLY from VE0 ++ */ ++long do_vzquotaugidctl(int cmd, unsigned int quota_id, ++ unsigned int ugid_index, unsigned int ugid_size, ++ void *addr, int compat) ++{ ++ int ret; ++ ++ ret = -EPERM; ++ /* access allowed only from root of VE0 */ ++ if (!capable(CAP_SYS_RESOURCE) || ++ !capable(CAP_SYS_ADMIN)) ++ goto out; ++ ++ switch (cmd) { ++ case VZ_DQ_UGID_GETSTAT: ++ ret = quota_ugid_getstat(quota_id, ++ ugid_index, ugid_size, ++ (struct vz_quota_iface __user *)addr, ++ compat); ++ break; ++ case VZ_DQ_UGID_ADDSTAT: ++ ret = quota_ugid_addstat(quota_id, ugid_size, ++ (struct vz_quota_iface __user *) addr, ++ compat); ++ break; ++ case VZ_DQ_UGID_GETGRACE: ++ ret = quota_ugid_getgrace(quota_id, ++ (struct dq_info __user *)addr, compat); ++ break; ++ case VZ_DQ_UGID_SETGRACE: ++ ret = quota_ugid_setgrace(quota_id, ++ (struct dq_info __user *)addr, compat); ++ break; ++ case VZ_DQ_UGID_GETCONFIG: ++ ret = quota_ugid_getconfig(quota_id, ++ (struct vz_quota_ugid_stat __user *) ++ addr); ++ break; ++ case VZ_DQ_UGID_SETCONFIG: ++ ret = quota_ugid_setconfig(quota_id, ++ (struct vz_quota_ugid_stat __user *) ++ addr); ++ break; ++ case VZ_DQ_UGID_SETLIMIT: ++ ret = quota_ugid_setlimit(quota_id, ++ (struct vz_quota_ugid_setlimit __user *) ++ addr); ++ break; ++ case VZ_DQ_UGID_SETINFO: ++ ret = quota_ugid_setinfo(quota_id, ++ (struct vz_quota_ugid_setinfo __user *) ++ addr); ++ break; ++ default: ++ ret = -EINVAL; ++ goto out; ++ } ++out: ++ return ret; ++} ++ ++static void 
ugid_quota_on_sb(struct super_block *sb) ++{ ++ struct super_block *real_sb; ++ struct vz_quota_master *qmblk; ++ ++ if (!sb->s_op->get_quota_root) ++ return; ++ ++ real_sb = sb->s_op->get_quota_root(sb)->i_sb; ++ if (real_sb->dq_op != &vz_quota_operations) ++ return; ++ ++ sb->dq_op = &vz_quota_operations2; ++ sb->s_qcop = &vz_quotactl_operations; ++ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); ++ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); ++ sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format; ++ sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ if ((qmblk == NULL) || (qmblk == VZ_QUOTA_BAD)) ++ return; ++ down(&vz_quota_sem); ++ if (qmblk->dq_flags & VZDQ_USRQUOTA) ++ sb->s_dquot.flags |= DQUOT_USR_ENABLED; ++ if (qmblk->dq_flags & VZDQ_GRPQUOTA) ++ sb->s_dquot.flags |= DQUOT_GRP_ENABLED; ++ up(&vz_quota_sem); ++ qmblk_put(qmblk); ++} ++ ++static void ugid_quota_off_sb(struct super_block *sb) ++{ ++ /* can't make quota off on mounted super block */ ++ BUG_ON(sb->s_root != NULL); ++} ++ ++static int ugid_notifier_call(struct vnotifier_block *self, ++ unsigned long n, void *data, int old_ret) ++{ ++ struct virt_info_quota *viq; ++ ++ viq = (struct virt_info_quota *)data; ++ ++ switch (n) { ++ case VIRTINFO_QUOTA_ON: ++ ugid_quota_on_sb(viq->super); ++ break; ++ case VIRTINFO_QUOTA_OFF: ++ ugid_quota_off_sb(viq->super); ++ break; ++ case VIRTINFO_QUOTA_GETSTAT: ++ break; ++ default: ++ return old_ret; ++ } ++ return NOTIFY_OK; ++} ++ ++static struct vnotifier_block ugid_notifier_block = { ++ .notifier_call = ugid_notifier_call, ++}; ++ ++/* ---------------------------------------------------------------------- ++ * Init/exit. ++ * --------------------------------------------------------------------- */ ++ ++int vzquota_ugid_init(void) ++{ ++ int err; ++ ++ vz_quota_ugid_cachep = kmem_cache_create("vz_quota_ugid", ++ sizeof(struct vz_quota_ugid), ++ 0, SLAB_HWCACHE_ALIGN, NULL); ++ if (vz_quota_ugid_cachep == NULL) ++ goto err_slab; ++ ++ err = register_quota_format(&vz_quota_empty_v2_format); ++ if (err) ++ goto err_reg; ++ ++ virtinfo_notifier_register(VITYPE_QUOTA, &ugid_notifier_block); ++ return 0; ++ ++err_reg: ++ kmem_cache_destroy(vz_quota_ugid_cachep); ++ return err; ++ ++err_slab: ++ printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); ++ return -ENOMEM; ++} ++ ++void vzquota_ugid_release(void) ++{ ++ virtinfo_notifier_unregister(VITYPE_QUOTA, &ugid_notifier_block); ++ unregister_quota_format(&vz_quota_empty_v2_format); ++ ++ kmem_cache_destroy(vz_quota_ugid_cachep); ++} +diff --git a/fs/vzdquot.c b/fs/vzdquot.c +new file mode 100644 +index 0000000..c13bec2 +--- /dev/null ++++ b/fs/vzdquot.c +@@ -0,0 +1,1955 @@ ++/* ++ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * This file contains the core of Virtuozzo disk quota implementation: ++ * maintenance of VZDQ information in inodes, ++ * external interfaces, ++ * module entry. 
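++ *
++ * The lock hierarchy used throughout this file is documented in the
++ * "Locking" section below.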
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Locking ++ * ++ * ---------------------------------------------------------------------- */ ++ ++/* ++ * Serializes on/off and all other do_vzquotactl operations. ++ * Protects qmblk hash. ++ */ ++struct semaphore vz_quota_sem; ++ ++/* ++ * Data access locks ++ * inode_qmblk ++ * protects qmblk pointers in all inodes and qlnk content in general ++ * (but not qmblk content); ++ * also protects related qmblk invalidation procedures; ++ * can't be per-inode because of vzquota_dtree_qmblk complications ++ * and problems with serialization with quota_on, ++ * but can be per-superblock; ++ * qmblk_data ++ * protects qmblk fields (such as current usage) ++ * quota_data ++ * protects charge/uncharge operations, thus, implies ++ * qmblk_data lock and, if CONFIG_VZ_QUOTA_UGID, inode_qmblk lock ++ * (to protect ugid pointers). ++ * ++ * Lock order: ++ * inode_qmblk_lock -> dcache_lock ++ * inode_qmblk_lock -> qmblk_data ++ */ ++static DEFINE_SPINLOCK(vzdq_qmblk_lock); ++ ++inline void inode_qmblk_lock(struct super_block *sb) ++{ ++ spin_lock(&vzdq_qmblk_lock); ++} ++ ++inline void inode_qmblk_unlock(struct super_block *sb) ++{ ++ spin_unlock(&vzdq_qmblk_lock); ++} ++ ++inline void qmblk_data_read_lock(struct vz_quota_master *qmblk) ++{ ++ spin_lock(&qmblk->dq_data_lock); ++} ++ ++inline void qmblk_data_read_unlock(struct vz_quota_master *qmblk) ++{ ++ spin_unlock(&qmblk->dq_data_lock); ++} ++ ++inline void qmblk_data_write_lock(struct vz_quota_master *qmblk) ++{ ++ spin_lock(&qmblk->dq_data_lock); ++} ++ ++inline void qmblk_data_write_unlock(struct vz_quota_master *qmblk) ++{ ++ spin_unlock(&qmblk->dq_data_lock); ++} ++ ++struct quota_format_type vz_quota_empty_v2_format = { ++ .qf_fmt_id = QFMT_VFS_V0, ++ .qf_ops = NULL, ++ .qf_owner = THIS_MODULE, ++}; ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Master hash table handling. ++ * ++ * SMP not safe, serialied by vz_quota_sem within quota syscalls ++ * ++ * --------------------------------------------------------------------- */ ++ ++static struct kmem_cache *vzquota_cachep; ++ ++/* ++ * Hash function. 
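++ *
++ * The id's high bits are folded (xor-ed) into the low QHASH_BITS bits
++ * and masked; e.g. with QHASH_BITS == 6, qid 0x45 hashes to
++ * ((0x45 >> 6) ^ 0x45) & 0x3f == 0x04.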
++ */ ++#define QHASH_BITS 6 ++#define VZ_QUOTA_HASH_SIZE (1 << QHASH_BITS) ++#define QHASH_MASK (VZ_QUOTA_HASH_SIZE - 1) ++ ++struct list_head vzquota_hash_table[VZ_QUOTA_HASH_SIZE]; ++int vzquota_hash_size = VZ_QUOTA_HASH_SIZE; ++ ++static inline int vzquota_hash_func(unsigned int qid) ++{ ++ return (((qid >> QHASH_BITS) ^ qid) & QHASH_MASK); ++} ++ ++/** ++ * vzquota_alloc_master - alloc and instantiate master quota record ++ * ++ * Returns: ++ * pointer to newly created record if SUCCESS ++ * -ENOMEM if out of memory ++ * -EEXIST if record with given quota_id already exist ++ */ ++struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id, ++ struct vz_quota_stat *qstat) ++{ ++ int err; ++ struct vz_quota_master *qmblk; ++ ++ err = -EEXIST; ++ if (vzquota_find_master(quota_id) != NULL) ++ goto out; ++ ++ err = -ENOMEM; ++ qmblk = kmem_cache_alloc(vzquota_cachep, GFP_KERNEL); ++ if (qmblk == NULL) ++ goto out; ++#ifdef CONFIG_VZ_QUOTA_UGID ++ qmblk->dq_uid_tree = quotatree_alloc(); ++ if (!qmblk->dq_uid_tree) ++ goto out_free; ++ ++ qmblk->dq_gid_tree = quotatree_alloc(); ++ if (!qmblk->dq_gid_tree) ++ goto out_free_tree; ++#endif ++ ++ qmblk->dq_state = VZDQ_STARTING; ++ init_MUTEX(&qmblk->dq_sem); ++ spin_lock_init(&qmblk->dq_data_lock); ++ ++ qmblk->dq_id = quota_id; ++ qmblk->dq_stat = qstat->dq_stat; ++ qmblk->dq_info = qstat->dq_info; ++ qmblk->dq_root_path.dentry = NULL; ++ qmblk->dq_root_path.mnt = NULL; ++ qmblk->dq_sb = NULL; ++ qmblk->dq_ugid_count = 0; ++ qmblk->dq_ugid_max = 0; ++ qmblk->dq_flags = 0; ++ memset(qmblk->dq_ugid_info, 0, sizeof(qmblk->dq_ugid_info)); ++ INIT_LIST_HEAD(&qmblk->dq_ilink_list); ++ ++ atomic_set(&qmblk->dq_count, 1); ++ ++ /* insert in hash chain */ ++ list_add(&qmblk->dq_hash, ++ &vzquota_hash_table[vzquota_hash_func(quota_id)]); ++ ++ /* success */ ++ return qmblk; ++ ++#ifdef CONFIG_VZ_QUOTA_UGID ++out_free_tree: ++ quotatree_free(qmblk->dq_uid_tree, NULL); ++out_free: ++ kmem_cache_free(vzquota_cachep, qmblk); ++#endif ++out: ++ return ERR_PTR(err); ++} ++ ++static struct vz_quota_master *vzquota_alloc_fake(void) ++{ ++ struct vz_quota_master *qmblk; ++ ++ qmblk = kmem_cache_alloc(vzquota_cachep, GFP_KERNEL); ++ if (qmblk == NULL) ++ return NULL; ++ memset(qmblk, 0, sizeof(*qmblk)); ++ qmblk->dq_state = VZDQ_STOPING; ++ qmblk->dq_flags = VZDQ_NOQUOT; ++ spin_lock_init(&qmblk->dq_data_lock); ++ INIT_LIST_HEAD(&qmblk->dq_ilink_list); ++ atomic_set(&qmblk->dq_count, 1); ++ return qmblk; ++} ++ ++/** ++ * vzquota_find_master - find master record with given id ++ * ++ * Returns qmblk without touching its refcounter. ++ * Called under vz_quota_sem. ++ */ ++struct vz_quota_master *vzquota_find_master(unsigned int quota_id) ++{ ++ int i; ++ struct vz_quota_master *qp; ++ ++ i = vzquota_hash_func(quota_id); ++ list_for_each_entry(qp, &vzquota_hash_table[i], dq_hash) { ++ if (qp->dq_id == quota_id) ++ return qp; ++ } ++ return NULL; ++} ++ ++/** ++ * vzquota_free_master - release resources taken by qmblk, freeing memory ++ * ++ * qmblk is assumed to be already taken out from the hash. ++ * Should be called outside vz_quota_sem. 
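++ * By this point the qmblk's dq_ilink_list must be empty; this is
++ * BUG_ON-checked below.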
++ */ ++void vzquota_free_master(struct vz_quota_master *qmblk) ++{ ++#ifdef CONFIG_VZ_QUOTA_UGID ++ vzquota_kill_ugid(qmblk); ++#endif ++ BUG_ON(!list_empty(&qmblk->dq_ilink_list)); ++ kmem_cache_free(vzquota_cachep, qmblk); ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Passing quota information through current ++ * ++ * Used in inode -> qmblk lookup at inode creation stage (since at that ++ * time there are no links between the inode being created and its parent ++ * directory). ++ * ++ * --------------------------------------------------------------------- */ ++ ++#define VZDQ_CUR_MAGIC 0x57d0fee2 ++ ++static inline int vzquota_cur_qmblk_check(void) ++{ ++ return current->magic == VZDQ_CUR_MAGIC; ++} ++ ++static inline struct inode *vzquota_cur_qmblk_fetch(void) ++{ ++ return current->ino; ++} ++ ++static inline void vzquota_cur_qmblk_set(struct inode *data) ++{ ++ struct task_struct *tsk; ++ ++ tsk = current; ++ tsk->magic = VZDQ_CUR_MAGIC; ++ tsk->ino = data; ++} ++ ++#if 0 ++static inline void vzquota_cur_qmblk_reset(void) ++{ ++ current->magic = 0; ++} ++#endif ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Superblock quota operations ++ * ++ * --------------------------------------------------------------------- */ ++ ++/* ++ * Kernel structure abuse. ++ * We use files[0] pointer as an int variable: ++ * reference counter of how many quota blocks uses this superblock. ++ * files[1] is used for generations structure which helps us to track ++ * when traversing of dentries is really required. ++ */ ++#define __VZ_QUOTA_NOQUOTA(sb) sb->s_dquot.vzdq_master ++#define __VZ_QUOTA_TSTAMP(sb) ((struct timeval *)\ ++ &sb->s_dquot.dqio_mutex) ++ ++#if defined(VZ_QUOTA_UNLOAD) ++ ++#define __VZ_QUOTA_SBREF(sb) sb->s_dquot.vzdq_count ++ ++struct dquot_operations *orig_dq_op; ++struct quotactl_ops *orig_dq_cop; ++ ++/** ++ * quota_get_super - account for new a quoted tree under the superblock ++ * ++ * One superblock can have multiple directory subtrees with different VZ ++ * quotas. We keep a counter of such subtrees and set VZ quota operations or ++ * reset the default ones. ++ * ++ * Called under vz_quota_sem (from quota_on). ++ */ ++int vzquota_get_super(struct super_block *sb) ++{ ++ if (sb->dq_op != &vz_quota_operations) { ++ down(&sb->s_dquot.dqonoff_sem); ++ if (sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) { ++ up(&sb->s_dquot.dqonoff_sem); ++ return -EEXIST; ++ } ++ if (orig_dq_op == NULL && sb->dq_op != NULL) ++ orig_dq_op = sb->dq_op; ++ sb->dq_op = &vz_quota_operations; ++ if (orig_dq_cop == NULL && sb->s_qcop != NULL) ++ orig_dq_cop = sb->s_qcop; ++ /* XXX this may race with sys_quotactl */ ++#ifdef CONFIG_VZ_QUOTA_UGID ++ sb->s_qcop = &vz_quotactl_operations; ++#else ++ sb->s_qcop = NULL; ++#endif ++ do_gettimeofday(__VZ_QUOTA_TSTAMP(sb)); ++ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); ++ ++ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); ++ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); ++ sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format; ++ sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format; ++ /* ++ * To get quotaops.h call us we need to mark superblock ++ * as having quota. These flags mark the moment when ++ * our dq_op start to be called. ++ * ++ * The ordering of dq_op and s_dquot.flags assignment ++ * needs to be enforced, but other CPUs do not do rmb() ++ * between s_dquot.flags and dq_op accesses. 
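++ * The wmb() + synchronize_sched() pair below stands in for that:
++ * the barrier orders the stores, and synchronize_sched() waits until
++ * every CPU has passed a quiescent state and so can observe the new
++ * dq_op before the enabled flags are published.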
++ */ ++ wmb(); synchronize_sched(); ++ sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED; ++ __module_get(THIS_MODULE); ++ up(&sb->s_dquot.dqonoff_sem); ++ } ++ /* protected by vz_quota_sem */ ++ __VZ_QUOTA_SBREF(sb)++; ++ return 0; ++} ++ ++/** ++ * quota_put_super - release superblock when one quota tree goes away ++ * ++ * Called under vz_quota_sem. ++ */ ++void vzquota_put_super(struct super_block *sb) ++{ ++ int count; ++ ++ count = --__VZ_QUOTA_SBREF(sb); ++ if (count == 0) { ++ down(&sb->s_dquot.dqonoff_sem); ++ sb->s_dquot.flags = 0; ++ wmb(); synchronize_sched(); ++ sema_init(&sb->s_dquot.dqio_sem, 1); ++ sb->s_qcop = orig_dq_cop; ++ sb->dq_op = orig_dq_op; ++ inode_qmblk_lock(sb); ++ quota_gen_put(SB_QGEN(sb)); ++ SB_QGEN(sb) = NULL; ++ /* release qlnk's without qmblk */ ++ remove_inode_quota_links_list(&non_vzquota_inodes_lh, ++ sb, NULL); ++ /* ++ * Races with quota initialization: ++ * after this inode_qmblk_unlock all inode's generations are ++ * invalidated, quota_inode_qmblk checks superblock operations. ++ */ ++ inode_qmblk_unlock(sb); ++ /* ++ * Module refcounting: in theory, this is the best place ++ * to call module_put(THIS_MODULE). ++ * In reality, it can't be done because we can't be sure that ++ * other CPUs do not enter our code segment through dq_op ++ * cached long time ago. Quotaops interface isn't supposed to ++ * go into modules currently (that is, into unloadable ++ * modules). By omitting module_put, our module isn't ++ * unloadable. ++ */ ++ up(&sb->s_dquot.dqonoff_sem); ++ } ++} ++ ++#else ++ ++struct vzquota_new_sop { ++ struct super_operations new_op; ++ const struct super_operations *old_op; ++}; ++ ++/** ++ * vzquota_shutdown_super - callback on umount ++ */ ++void vzquota_shutdown_super(struct super_block *sb) ++{ ++ struct vz_quota_master *qmblk; ++ struct vzquota_new_sop *sop; ++ ++ qmblk = __VZ_QUOTA_NOQUOTA(sb); ++ __VZ_QUOTA_NOQUOTA(sb) = NULL; ++ if (qmblk != NULL) ++ qmblk_put(qmblk); ++ sop = container_of(sb->s_op, struct vzquota_new_sop, new_op); ++ sb->s_op = sop->old_op; ++ kfree(sop); ++ if (sb->s_op->put_super != NULL) ++ (*sb->s_op->put_super)(sb); ++} ++ ++/** ++ * vzquota_get_super - account for new a quoted tree under the superblock ++ * ++ * One superblock can have multiple directory subtrees with different VZ ++ * quotas. ++ * ++ * Called under vz_quota_sem (from vzquota_on). ++ */ ++int vzquota_get_super(struct super_block *sb) ++{ ++ struct vz_quota_master *qnew; ++ struct vzquota_new_sop *sop; ++ int err; ++ ++ mutex_lock(&sb->s_dquot.dqonoff_mutex); ++ err = -EEXIST; ++ if ((sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) && ++ sb->dq_op != &vz_quota_operations) ++ goto out_up; ++ ++ /* ++ * This allocation code should be under sb->dq_op check below, but ++ * it doesn't really matter... 
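++ * (The NULL check keeps the fake qmblk allocation one-shot per
++ * superblock, so performing it outside the dq_op test is merely
++ * untidy, not incorrect.)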
++ */ ++ if (__VZ_QUOTA_NOQUOTA(sb) == NULL) { ++ qnew = vzquota_alloc_fake(); ++ if (qnew == NULL) ++ goto out_up; ++ __VZ_QUOTA_NOQUOTA(sb) = qnew; ++ } ++ ++ if (sb->dq_op != &vz_quota_operations) { ++ sop = kmalloc(sizeof(*sop), GFP_KERNEL); ++ if (sop == NULL) { ++ vzquota_free_master(__VZ_QUOTA_NOQUOTA(sb)); ++ __VZ_QUOTA_NOQUOTA(sb) = NULL; ++ goto out_up; ++ } ++ memcpy(&sop->new_op, sb->s_op, sizeof(sop->new_op)); ++ sop->new_op.put_super = &vzquota_shutdown_super; ++ sop->old_op = sb->s_op; ++ sb->s_op = &sop->new_op; ++ ++ sb->dq_op = &vz_quota_operations; ++#ifdef CONFIG_VZ_QUOTA_UGID ++ sb->s_qcop = &vz_quotactl_operations; ++#else ++ sb->s_qcop = NULL; ++#endif ++ do_gettimeofday(__VZ_QUOTA_TSTAMP(sb)); ++ ++ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); ++ /* these 2 list heads are checked in sync_dquots() */ ++ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); ++ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); ++ sb->s_dquot.info[USRQUOTA].dqi_format = ++ &vz_quota_empty_v2_format; ++ sb->s_dquot.info[GRPQUOTA].dqi_format = ++ &vz_quota_empty_v2_format; ++ ++ /* ++ * To get quotaops.h to call us we need to mark superblock ++ * as having quota. These flags mark the moment when ++ * our dq_op start to be called. ++ * ++ * The ordering of dq_op and s_dquot.flags assignment ++ * needs to be enforced, but other CPUs do not do rmb() ++ * between s_dquot.flags and dq_op accesses. ++ */ ++ wmb(); synchronize_sched(); ++ sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED; ++ } ++ err = 0; ++ ++out_up: ++ mutex_unlock(&sb->s_dquot.dqonoff_mutex); ++ return err; ++} ++ ++/** ++ * vzquota_put_super - one quota tree less on this superblock ++ * ++ * Called under vz_quota_sem. ++ */ ++void vzquota_put_super(struct super_block *sb) ++{ ++ /* ++ * Even if this put is the last one, ++ * sb->s_dquot.flags can't be cleared, because otherwise vzquota_drop ++ * won't be called and the remaining qmblk references won't be put. 
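++ * The fake "noquota" qmblk itself is released at umount time, from
++ * vzquota_shutdown_super().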
++ */ ++} ++ ++#endif ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Helpers for inode -> qmblk link maintenance ++ * ++ * --------------------------------------------------------------------- */ ++ ++#define __VZ_QUOTA_EMPTY ((void *)0xbdbdbdbd) ++#define VZ_QUOTA_IS_NOQUOTA(qm, sb) ((qm)->dq_flags & VZDQ_NOQUOT) ++#define VZ_QUOTA_EMPTY_IOPS (&vfs_empty_iops) ++extern struct inode_operations vfs_empty_iops; ++ ++static int VZ_QUOTA_IS_ACTUAL(struct inode *inode) ++{ ++ struct vz_quota_master *qmblk; ++ ++ qmblk = INODE_QLNK(inode)->qmblk; ++ if (qmblk == VZ_QUOTA_BAD) ++ return 1; ++ if (qmblk == __VZ_QUOTA_EMPTY) ++ return 0; ++ if (qmblk->dq_flags & VZDQ_NOACT) ++ /* not actual (invalidated) qmblk */ ++ return 0; ++ return 1; ++} ++ ++static inline int vzquota_qlnk_is_empty(struct vz_quota_ilink *qlnk) ++{ ++ return qlnk->qmblk == __VZ_QUOTA_EMPTY; ++} ++ ++static inline void set_qlnk_origin(struct vz_quota_ilink *qlnk, ++ unsigned char origin) ++{ ++ qlnk->origin[0] = qlnk->origin[1]; ++ qlnk->origin[1] = origin; ++} ++ ++static inline void vzquota_qlnk_set_empty(struct vz_quota_ilink *qlnk) ++{ ++ qlnk->qmblk = __VZ_QUOTA_EMPTY; ++ set_qlnk_origin(qlnk, VZ_QUOTAO_SETE); ++} ++ ++void vzquota_qlnk_init(struct vz_quota_ilink *qlnk) ++{ ++ memset(qlnk, 0, sizeof(*qlnk)); ++ INIT_LIST_HEAD(&qlnk->list); ++ vzquota_qlnk_set_empty(qlnk); ++ set_qlnk_origin(qlnk, VZ_QUOTAO_INIT); ++} ++ ++void vzquota_qlnk_destroy(struct vz_quota_ilink *qlnk) ++{ ++ might_sleep(); ++ if (vzquota_qlnk_is_empty(qlnk)) ++ return; ++#if defined(CONFIG_VZ_QUOTA_UGID) ++ if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) { ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid *quid, *qgid; ++ qmblk = qlnk->qmblk; ++ quid = qlnk->qugid[USRQUOTA]; ++ qgid = qlnk->qugid[GRPQUOTA]; ++ if (quid != NULL || qgid != NULL) { ++ down(&qmblk->dq_sem); ++ if (qgid != NULL) ++ vzquota_put_ugid(qmblk, qgid); ++ if (quid != NULL) ++ vzquota_put_ugid(qmblk, quid); ++ up(&qmblk->dq_sem); ++ } ++ } ++#endif ++ if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qlnk->qmblk); ++ set_qlnk_origin(qlnk, VZ_QUOTAO_DESTR); ++} ++ ++/** ++ * vzquota_qlnk_swap - swap inode's and temporary vz_quota_ilink contents ++ * @qlt: temporary ++ * @qli: inode's ++ * ++ * Locking is provided by the caller (depending on the context). ++ * After swap, @qli is inserted into the corresponding dq_ilink_list, ++ * @qlt list is reinitialized. ++ */ ++static void vzquota_qlnk_swap(struct vz_quota_ilink *qlt, ++ struct vz_quota_ilink *qli) ++{ ++ struct vz_quota_master *qb; ++ struct vz_quota_ugid *qu; ++ int i; ++ ++ qb = qlt->qmblk; ++ qlt->qmblk = qli->qmblk; ++ qli->qmblk = qb; ++ list_del_init(&qli->list); ++ if (qb != __VZ_QUOTA_EMPTY && qb != VZ_QUOTA_BAD) ++ list_add(&qli->list, &qb->dq_ilink_list); ++ INIT_LIST_HEAD(&qlt->list); ++ set_qlnk_origin(qli, VZ_QUOTAO_SWAP); ++ ++ for (i = 0; i < MAXQUOTAS; i++) { ++ qu = qlt->qugid[i]; ++ qlt->qugid[i] = qli->qugid[i]; ++ qli->qugid[i] = qu; ++ } ++} ++ ++/** ++ * vzquota_qlnk_reinit_locked - destroy qlnk content, called under locks ++ * ++ * Called under dcache_lock and inode_qmblk locks. ++ * Returns 1 if locks were dropped inside, 0 if atomic. 
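++ *
++ * (Editorial note, not from the patch.) set_qlnk_origin() above keeps a
++ * two-slot breadcrumb of qlnk state transitions; the error printks
++ * further down dump both slots as "orig {%u, %u}". Distilled, with
++ * illustrative names:
++ */
++struct origin_trail { unsigned char origin[2]; };
++
++static inline void trail_push(struct origin_trail *t, unsigned char o)
++{
++	t->origin[0] = t->origin[1];	/* forget the oldest step  */
++	t->origin[1] = o;		/* remember the newest one */
++}
++/*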
++ */ ++static int vzquota_qlnk_reinit_locked(struct vz_quota_ilink *qlnk, ++ struct inode *inode) ++{ ++ if (vzquota_qlnk_is_empty(qlnk)) ++ return 0; ++ if (qlnk->qmblk == VZ_QUOTA_BAD) { ++ vzquota_qlnk_set_empty(qlnk); ++ set_qlnk_origin(qlnk, VZ_QUOTAO_RE_LOCK); ++ return 0; ++ } ++ spin_unlock(&dcache_lock); ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(qlnk); ++ vzquota_qlnk_init(qlnk); ++ inode_qmblk_lock(inode->i_sb); ++ spin_lock(&dcache_lock); ++ return 1; ++} ++ ++#if defined(CONFIG_VZ_QUOTA_UGID) ++/** ++ * vzquota_qlnk_reinit_attr - destroy and reinit qlnk content ++ * ++ * Similar to vzquota_qlnk_reinit_locked, called under different locks. ++ */ ++static int vzquota_qlnk_reinit_attr(struct vz_quota_ilink *qlnk, ++ struct inode *inode, ++ struct vz_quota_master *qmblk) ++{ ++ if (vzquota_qlnk_is_empty(qlnk)) ++ return 0; ++ /* may be optimized if qlnk->qugid all NULLs */ ++ qmblk_data_write_unlock(qmblk); ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(qlnk); ++ vzquota_qlnk_init(qlnk); ++ inode_qmblk_lock(inode->i_sb); ++ qmblk_data_write_lock(qmblk); ++ return 1; ++} ++#endif ++ ++/** ++ * vzquota_qlnk_fill - fill vz_quota_ilink content ++ * @qlnk: vz_quota_ilink to fill ++ * @inode: inode for which @qlnk is filled (i_sb, i_uid, i_gid) ++ * @qmblk: qmblk to which this @qlnk will belong ++ * ++ * Called under dcache_lock and inode_qmblk locks. ++ * Returns 1 if locks were dropped inside, 0 if atomic. ++ * @qlnk is expected to be empty. ++ */ ++static int vzquota_qlnk_fill(struct vz_quota_ilink *qlnk, ++ struct inode *inode, ++ struct vz_quota_master *qmblk) ++{ ++ if (qmblk != VZ_QUOTA_BAD) ++ qmblk_get(qmblk); ++ qlnk->qmblk = qmblk; ++ ++#if defined(CONFIG_VZ_QUOTA_UGID) ++ if (qmblk != VZ_QUOTA_BAD && ++ !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) && ++ (qmblk->dq_flags & VZDQUG_ON)) { ++ struct vz_quota_ugid *quid, *qgid; ++ ++ spin_unlock(&dcache_lock); ++ inode_qmblk_unlock(inode->i_sb); ++ ++ down(&qmblk->dq_sem); ++ quid = __vzquota_find_ugid(qmblk, inode->i_uid, USRQUOTA, 0); ++ qgid = __vzquota_find_ugid(qmblk, inode->i_gid, GRPQUOTA, 0); ++ up(&qmblk->dq_sem); ++ ++ inode_qmblk_lock(inode->i_sb); ++ spin_lock(&dcache_lock); ++ qlnk->qugid[USRQUOTA] = quid; ++ qlnk->qugid[GRPQUOTA] = qgid; ++ return 1; ++ } ++#endif ++ ++ return 0; ++} ++ ++#if defined(CONFIG_VZ_QUOTA_UGID) ++/** ++ * vzquota_qlnk_fill_attr - fill vz_quota_ilink content for uid, gid ++ * ++ * This function is a helper for vzquota_transfer, and differs from ++ * vzquota_qlnk_fill only by locking. 
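++ *
++ * (Editorial sketch, not from the patch.) Both reinit helpers share one
++ * contract: return 1 when the locks had to be dropped, so every caller
++ * knows its cached state is stale. Callers propagate the 1 upward until
++ * some outer loop restarts with fresh state:
++ */
++static int recheck_step(struct vz_quota_ilink *qlnk, struct inode *inode)
++{
++	if (vzquota_qlnk_reinit_locked(qlnk, inode))
++		return 1;	/* slept: caller must revalidate */
++	/* work done here is still covered by the original locks */
++	return 0;
++}
++/*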
++ */
++static int vzquota_qlnk_fill_attr(struct vz_quota_ilink *qlnk,
++		struct inode *inode,
++		struct iattr *iattr,
++		int mask,
++		struct vz_quota_master *qmblk)
++{
++	qmblk_get(qmblk);
++	qlnk->qmblk = qmblk;
++
++	if (mask) {
++		struct vz_quota_ugid *quid, *qgid;
++
++		quid = qgid = NULL; /* to make gcc happy */
++		if (!(mask & (1 << USRQUOTA)))
++			quid = vzquota_get_ugid(INODE_QLNK(inode)->
++						qugid[USRQUOTA]);
++		if (!(mask & (1 << GRPQUOTA)))
++			qgid = vzquota_get_ugid(INODE_QLNK(inode)->
++						qugid[GRPQUOTA]);
++
++		qmblk_data_write_unlock(qmblk);
++		inode_qmblk_unlock(inode->i_sb);
++
++		down(&qmblk->dq_sem);
++		if (mask & (1 << USRQUOTA))
++			quid = __vzquota_find_ugid(qmblk, iattr->ia_uid,
++					USRQUOTA, 0);
++		if (mask & (1 << GRPQUOTA))
++			qgid = __vzquota_find_ugid(qmblk, iattr->ia_gid,
++					GRPQUOTA, 0);
++		up(&qmblk->dq_sem);
++
++		inode_qmblk_lock(inode->i_sb);
++		qmblk_data_write_lock(qmblk);
++		qlnk->qugid[USRQUOTA] = quid;
++		qlnk->qugid[GRPQUOTA] = qgid;
++		return 1;
++	}
++
++	return 0;
++}
++#endif
++
++/**
++ * __vzquota_inode_init - make sure inode's qlnk is initialized
++ *
++ * May be called if qlnk is already initialized, detects this situation itself.
++ * Called under inode_qmblk_lock.
++ */
++static void __vzquota_inode_init(struct inode *inode, unsigned char origin)
++{
++	if (inode->i_dquot[USRQUOTA] == NODQUOT) {
++		vzquota_qlnk_init(INODE_QLNK(inode));
++		inode->i_dquot[USRQUOTA] = (void *)~(unsigned long)NODQUOT;
++	}
++	set_qlnk_origin(INODE_QLNK(inode), origin);
++}
++
++/**
++ * vzquota_inode_drop - destroy VZ quota information in the inode
++ *
++ * Inode must not be externally accessible or dirty.
++ */
++static void vzquota_inode_drop(struct inode *inode)
++{
++	struct vz_quota_ilink qlnk;
++
++	vzquota_qlnk_init(&qlnk);
++	inode_qmblk_lock(inode->i_sb);
++	vzquota_qlnk_swap(&qlnk, INODE_QLNK(inode));
++	set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DRCAL);
++	inode->i_dquot[USRQUOTA] = NODQUOT;
++	inode_qmblk_unlock(inode->i_sb);
++	vzquota_qlnk_destroy(&qlnk);
++}
++
++/**
++ * vzquota_inode_qmblk_set - initialize inode's qlnk
++ * @inode: inode to be initialized
++ * @qmblk: quota master block to which this inode should belong (may be BAD)
++ * @qlnk: placeholder to store data to resolve locking issues
++ *
++ * Returns 1 if locks were dropped and rechecks possibly needed, 0 otherwise.
++ * Called under dcache_lock and inode_qmblk locks.
++ * @qlnk will be destroyed in the caller chain.
++ *
++ * It is not mandatory to restart parent checks since quota on/off currently
++ * shrinks the dentry tree and checks that there are no outside references.
++ * But if at some time that shrink is removed, restarts will be required.
++ * Additionally, the restarts prevent inconsistencies if the dentry tree
++ * changes (inode is moved). This is not a big deal, but anyway...
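++ *
++ * (Editorial sketch, not from the patch.) vzquota_inode_drop() above is
++ * the swap-then-destroy idiom in pure form: steal the payload into a
++ * stack placeholder while holding the lock, then do the possibly
++ * sleeping teardown on the stolen copy after unlocking:
++ */
++static void drop_sketch(struct inode *inode)
++{
++	struct vz_quota_ilink tmp;
++
++	vzquota_qlnk_init(&tmp);	/* empty placeholder        */
++	inode_qmblk_lock(inode->i_sb);
++	vzquota_qlnk_swap(&tmp, INODE_QLNK(inode));
++	inode_qmblk_unlock(inode->i_sb);
++	vzquota_qlnk_destroy(&tmp);	/* may sleep, no locks held */
++}
++/*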
++ */ ++static int vzquota_inode_qmblk_set(struct inode *inode, ++ struct vz_quota_master *qmblk, ++ struct vz_quota_ilink *qlnk) ++{ ++ if (qmblk == NULL) { ++ printk(KERN_ERR "VZDQ: NULL in set, orig {%u, %u}, " ++ "dev %s, inode %lu, fs %s\n", ++ INODE_QLNK(inode)->origin[0], ++ INODE_QLNK(inode)->origin[1], ++ inode->i_sb->s_id, inode->i_ino, ++ inode->i_sb->s_type->name); ++ printk(KERN_ERR "current %d (%s), VE %d\n", ++ current->pid, current->comm, ++ VEID(get_exec_env())); ++ dump_stack(); ++ qmblk = VZ_QUOTA_BAD; ++ } ++ while (1) { ++ if (vzquota_qlnk_is_empty(qlnk) && ++ vzquota_qlnk_fill(qlnk, inode, qmblk)) ++ return 1; ++ if (qlnk->qmblk == qmblk) ++ break; ++ if (vzquota_qlnk_reinit_locked(qlnk, inode)) ++ return 1; ++ } ++ vzquota_qlnk_swap(qlnk, INODE_QLNK(inode)); ++ set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_QSET); ++ return 0; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * vzquota_inode_qmblk (inode -> qmblk lookup) parts ++ * ++ * --------------------------------------------------------------------- */ ++ ++static int vzquota_dparents_check_attach(struct inode *inode) ++{ ++ if (!list_empty(&inode->i_dentry)) ++ return 0; ++ printk(KERN_ERR "VZDQ: no parent for " ++ "dev %s, inode %lu, fs %s\n", ++ inode->i_sb->s_id, ++ inode->i_ino, ++ inode->i_sb->s_type->name); ++ return -1; ++} ++ ++static struct inode *vzquota_dparents_check_actual(struct inode *inode) ++{ ++ struct dentry *de; ++ ++ list_for_each_entry(de, &inode->i_dentry, d_alias) { ++ if (de->d_parent == de) /* detached dentry, perhaps */ ++ continue; ++ /* first access to parent, make sure its qlnk initialized */ ++ __vzquota_inode_init(de->d_parent->d_inode, VZ_QUOTAO_ACT); ++ if (!VZ_QUOTA_IS_ACTUAL(de->d_parent->d_inode)) ++ return de->d_parent->d_inode; ++ } ++ return NULL; ++} ++ ++static struct vz_quota_master *vzquota_dparents_check_same(struct inode *inode) ++{ ++ struct dentry *de; ++ struct vz_quota_master *qmblk; ++ ++ qmblk = NULL; ++ list_for_each_entry(de, &inode->i_dentry, d_alias) { ++ if (de->d_parent == de) /* detached dentry, perhaps */ ++ continue; ++ if (qmblk == NULL) { ++ qmblk = INODE_QLNK(de->d_parent->d_inode)->qmblk; ++ continue; ++ } ++ if (INODE_QLNK(de->d_parent->d_inode)->qmblk != qmblk) { ++ printk(KERN_WARNING "VZDQ: multiple quotas for " ++ "dev %s, inode %lu, fs %s\n", ++ inode->i_sb->s_id, ++ inode->i_ino, ++ inode->i_sb->s_type->name); ++ qmblk = VZ_QUOTA_BAD; ++ break; ++ } ++ } ++ if (qmblk == NULL) { ++ printk(KERN_WARNING "VZDQ: not attached to tree, " ++ "dev %s, inode %lu, fs %s\n", ++ inode->i_sb->s_id, ++ inode->i_ino, ++ inode->i_sb->s_type->name); ++ qmblk = VZ_QUOTA_BAD; ++ } ++ return qmblk; ++} ++ ++static void vzquota_dbranch_actualize(struct inode *inode, ++ struct inode *refinode) ++{ ++ struct inode *pinode; ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ilink qlnk; ++ ++ vzquota_qlnk_init(&qlnk); ++ ++start: ++ if (inode == inode->i_sb->s_root->d_inode) { ++ /* filesystem root */ ++ atomic_inc(&inode->i_count); ++ do { ++ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); ++ } while (vzquota_inode_qmblk_set(inode, qmblk, &qlnk)); ++ goto out; ++ } ++ ++ if (!vzquota_dparents_check_attach(inode)) { ++ pinode = vzquota_dparents_check_actual(inode); ++ if (pinode != NULL) { ++ inode = pinode; ++ goto start; ++ } ++ } ++ ++ atomic_inc(&inode->i_count); ++ while (1) { ++ if (VZ_QUOTA_IS_ACTUAL(inode)) /* actualized without us */ ++ break; ++ /* ++ * Need to check parents again if we have slept inside ++ * 
vzquota_inode_qmblk_set() in the loop. ++ * If the state of parents is different, just return and repeat ++ * the actualizing process again from the inode passed to ++ * vzquota_inode_qmblk_recalc(). ++ */ ++ if (!vzquota_dparents_check_attach(inode)) { ++ if (vzquota_dparents_check_actual(inode) != NULL) ++ break; ++ qmblk = vzquota_dparents_check_same(inode); ++ } else ++ qmblk = VZ_QUOTA_BAD; ++ if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)){/* success */ ++ set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ACT); ++ break; ++ } ++ } ++ ++out: ++ spin_unlock(&dcache_lock); ++ inode_qmblk_unlock(refinode->i_sb); ++ vzquota_qlnk_destroy(&qlnk); ++ iput(inode); ++ inode_qmblk_lock(refinode->i_sb); ++ spin_lock(&dcache_lock); ++} ++ ++static void vzquota_dtree_qmblk_recalc(struct inode *inode, ++ struct vz_quota_ilink *qlnk) ++{ ++ struct inode *pinode; ++ struct vz_quota_master *qmblk; ++ ++ if (inode == inode->i_sb->s_root->d_inode) { ++ /* filesystem root */ ++ do { ++ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); ++ } while (vzquota_inode_qmblk_set(inode, qmblk, qlnk)); ++ return; ++ } ++ ++start: ++ if (VZ_QUOTA_IS_ACTUAL(inode)) ++ return; ++ /* ++ * Here qmblk is (re-)initialized for all ancestors. ++ * This is not a very efficient procedure, but it guarantees that ++ * the quota tree is consistent (that is, the inode doesn't have two ++ * ancestors with different qmblk). ++ */ ++ if (!vzquota_dparents_check_attach(inode)) { ++ pinode = vzquota_dparents_check_actual(inode); ++ if (pinode != NULL) { ++ vzquota_dbranch_actualize(pinode, inode); ++ goto start; ++ } ++ qmblk = vzquota_dparents_check_same(inode); ++ } else ++ qmblk = VZ_QUOTA_BAD; ++ ++ if (vzquota_inode_qmblk_set(inode, qmblk, qlnk)) ++ goto start; ++ set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DTREE); ++} ++ ++static void vzquota_det_qmblk_recalc(struct inode *inode, ++ struct vz_quota_ilink *qlnk) ++{ ++ struct inode *parent; ++ struct vz_quota_master *qmblk; ++ char *msg; ++ int cnt; ++ time_t timeout; ++ ++ cnt = 0; ++ parent = NULL; ++start: ++ /* ++ * qmblk of detached inodes shouldn't be considered as not actual. ++ * They are not in any dentry tree, so quota on/off shouldn't affect ++ * them. ++ */ ++ if (!vzquota_qlnk_is_empty(INODE_QLNK(inode))) ++ return; ++ ++ timeout = 3; ++ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); ++ /* ++ * Scenario: ++ * open ++ * unlink ++ * quotaon ++ * generic_delete_inode ++ * ++ * This is the first time vzquota sees inode. inode is outside of ++ * vzquota area of interest, otherwise quotaon would have got -EBUSY ++ * due to shrink_dcache_parent(). ++ * inode is almost completely destroyed, so don't intervene. ++ * ++ * dev@: ++ * However, there is a small race here... ++ * dput() first removes itself from all the lists, ++ * so shrink_dcache_parent() can succeed while dentry_iput is not ++ * done yet. 
++ */ ++ if (inode->i_state & I_FREEING) ++ goto set; ++ ++ msg = "detached inode not in creation"; ++ if (inode->i_op != VZ_QUOTA_EMPTY_IOPS) ++ goto fail; ++ qmblk = VZ_QUOTA_BAD; ++ msg = "unexpected creation context"; ++ if (!vzquota_cur_qmblk_check()) ++ goto fail; ++ timeout = 0; ++ parent = vzquota_cur_qmblk_fetch(); ++ msg = "uninitialized parent"; ++ if (vzquota_qlnk_is_empty(INODE_QLNK(parent))) ++ goto fail; ++ msg = "parent not in tree"; ++ if (list_empty(&parent->i_dentry)) ++ goto fail; ++ msg = "parent has 0 refcount"; ++ if (!atomic_read(&parent->i_count)) ++ goto fail; ++ msg = "parent has different sb"; ++ if (parent->i_sb != inode->i_sb) ++ goto fail; ++ if (!VZ_QUOTA_IS_ACTUAL(parent)) { ++ vzquota_dbranch_actualize(parent, inode); ++ goto start; ++ } ++ ++ qmblk = INODE_QLNK(parent)->qmblk; ++set: ++ if (vzquota_inode_qmblk_set(inode, qmblk, qlnk)) ++ goto start; ++ set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_DET); ++ return; ++ ++fail: ++ { ++ struct timeval tv, tvo; ++ do_gettimeofday(&tv); ++ memcpy(&tvo, __VZ_QUOTA_TSTAMP(inode->i_sb), sizeof(tvo)); ++ tv.tv_sec -= tvo.tv_sec; ++ if (tv.tv_usec < tvo.tv_usec) { ++ tv.tv_sec--; ++ tv.tv_usec += USEC_PER_SEC - tvo.tv_usec; ++ } else ++ tv.tv_usec -= tvo.tv_usec; ++ if (tv.tv_sec < timeout) ++ goto set; ++ printk(KERN_ERR "VZDQ: %s, orig {%u, %u}," ++ " dev %s, inode %lu, fs %s\n", ++ msg, ++ INODE_QLNK(inode)->origin[0], ++ INODE_QLNK(inode)->origin[1], ++ inode->i_sb->s_id, inode->i_ino, ++ inode->i_sb->s_type->name); ++ printk(KERN_ERR "i_count %u, ", atomic_read(&inode->i_count)); ++ printk(KERN_ERR "i_mode %o, ", inode->i_mode); ++ printk(KERN_ERR "i_state %lx, ", inode->i_state); ++ printk(KERN_ERR "i_flags %x\n", inode->i_flags); ++ printk(KERN_ERR "i_op %p, vfs_empty_iops %p, " ++ "i_fop %p, i_mapping %p\n", ++ inode->i_op, &vfs_empty_iops, ++ inode->i_fop, inode->i_mapping); ++ if (!cnt++) { ++ printk(KERN_ERR "current %d (%s), VE %d," ++ " time %ld.%06ld\n", ++ current->pid, current->comm, ++ VEID(get_exec_env()), ++ tv.tv_sec, (long)tv.tv_usec); ++ dump_stack(); ++ } ++ if (parent != NULL) ++ printk(KERN_ERR "VZDQ: parent of %lu is %lu\n", ++ inode->i_ino, parent->i_ino); ++ } ++ goto set; ++} ++ ++static void vzquota_inode_qmblk_recalc(struct inode *inode, ++ struct vz_quota_ilink *qlnk) ++{ ++ spin_lock(&dcache_lock); ++ if (!list_empty(&inode->i_dentry)) ++ vzquota_dtree_qmblk_recalc(inode, qlnk); ++ else ++ vzquota_det_qmblk_recalc(inode, qlnk); ++ spin_unlock(&dcache_lock); ++} ++ ++/** ++ * vzquota_inode_qmblk - obtain inode's qmblk ++ * ++ * Returns qmblk with refcounter taken, %NULL if not under ++ * VZ quota or %VZ_QUOTA_BAD. ++ * ++ * FIXME: This function should be removed when vzquota_find_qmblk / ++ * get_quota_root / vzquota_dstat code is cleaned up. 
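++ *
++ * (Editorial note, not from the patch.) The fail path above subtracts
++ * two struct timeval values by hand, borrowing a second when the
++ * microseconds underflow. Worked example: 5.000200 - 3.000700 gives
++ * tv_sec = 5 - 3 - 1 = 1 and tv_usec = 200 + 1000000 - 700 = 999500,
++ * i.e. 1.999500 s elapsed; the report is suppressed until that value
++ * reaches the chosen timeout.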
++ */ ++struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ilink qlnk; ++ ++ might_sleep(); ++ ++ if (inode->i_sb->dq_op != &vz_quota_operations) ++ return NULL; ++#if defined(VZ_QUOTA_UNLOAD) ++#error Make sure qmblk does not disappear ++#endif ++ ++ vzquota_qlnk_init(&qlnk); ++ inode_qmblk_lock(inode->i_sb); ++ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); ++ ++ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || ++ !VZ_QUOTA_IS_ACTUAL(inode)) ++ vzquota_inode_qmblk_recalc(inode, &qlnk); ++ ++ qmblk = INODE_QLNK(inode)->qmblk; ++ if (qmblk != VZ_QUOTA_BAD) { ++ if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) ++ qmblk_get(qmblk); ++ else ++ qmblk = NULL; ++ } ++ ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(&qlnk); ++ return qmblk; ++} ++ ++/** ++ * vzquota_find_qmblk - helper to emulate quota on virtual filesystems ++ * ++ * This function finds a quota master block corresponding to the root of ++ * a virtual filesystem. ++ * Returns a quota master block with reference taken, or %NULL if not under ++ * quota, or %VZ_QUOTA_BAD if quota inconsistency is found (and all allocation ++ * operations will fail). ++ * ++ * Note: this function uses vzquota_inode_qmblk(). ++ * The latter is a rather confusing function: it returns qmblk that used to be ++ * on the inode some time ago (without guarantee that it still has any ++ * relations to the inode). So, vzquota_find_qmblk() leaves it up to the ++ * caller to think whether the inode could have changed its qmblk and what to ++ * do in that case. ++ * Currently, the callers appear to not care :( ++ */ ++struct vz_quota_master *vzquota_find_qmblk(struct super_block *sb) ++{ ++ struct inode *qrinode; ++ struct vz_quota_master *qmblk; ++ ++ qmblk = NULL; ++ qrinode = NULL; ++ if (sb->s_op->get_quota_root != NULL) ++ qrinode = sb->s_op->get_quota_root(sb); ++ if (qrinode != NULL) ++ qmblk = vzquota_inode_qmblk(qrinode); ++ return qmblk; ++} ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Calls from quota operations ++ * ++ * --------------------------------------------------------------------- */ ++ ++/** ++ * vzquota_inode_init_call - call from DQUOT_INIT ++ */ ++void vzquota_inode_init_call(struct inode *inode) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ ++ /* initializes inode's quota inside */ ++ qmblk = vzquota_inode_data(inode, &data); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ vzquota_data_unlock(inode, &data); ++ ++ /* ++ * The check is needed for repeated new_inode() calls from a single ++ * ext3 call like create or mkdir in case of -ENOSPC. ++ */ ++ spin_lock(&dcache_lock); ++ if (!list_empty(&inode->i_dentry)) ++ vzquota_cur_qmblk_set(inode); ++ spin_unlock(&dcache_lock); ++} ++ ++/** ++ * vzquota_inode_drop_call - call from DQUOT_DROP ++ */ ++void vzquota_inode_drop_call(struct inode *inode) ++{ ++ vzquota_inode_drop(inode); ++} ++ ++/** ++ * vzquota_inode_data - initialize (if nec.) and lock inode quota ptrs ++ * @inode: the inode ++ * @data: storage space ++ * ++ * Returns: qmblk is NULL or VZ_QUOTA_BAD or actualized qmblk. ++ * On return if qmblk is neither NULL nor VZ_QUOTA_BAD: ++ * qmblk in inode's qlnk is the same as returned, ++ * ugid pointers inside inode's qlnk are valid, ++ * some locks are taken (and should be released by vzquota_data_unlock). ++ * If qmblk is NULL or VZ_QUOTA_BAD, locks are NOT taken. 
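++ *
++ * (Editorial sketch of the contract above, not from the patch;
++ * vzquota_inode_init_call() earlier follows exactly this shape.)
++ */
++static void quota_op_sketch(struct inode *inode)
++{
++	struct vz_quota_datast data;
++	struct vz_quota_master *qmblk;
++
++	qmblk = vzquota_inode_data(inode, &data);
++	if (qmblk == NULL || qmblk == VZ_QUOTA_BAD)
++		return;		/* no locks are held on this path */
++	/* ... update usage under the qmblk data-write lock ... */
++	vzquota_data_unlock(inode, &data);
++}
++/*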
++ */ ++struct vz_quota_master *vzquota_inode_data(struct inode *inode, ++ struct vz_quota_datast *data) ++{ ++ struct vz_quota_master *qmblk; ++ ++ might_sleep(); ++ ++ vzquota_qlnk_init(&data->qlnk); ++ inode_qmblk_lock(inode->i_sb); ++ if (unlikely(inode->i_flags & S_NOQUOTA)) { ++ inode_qmblk_unlock(inode->i_sb); ++ return NULL; ++ } ++ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); ++ ++ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || ++ !VZ_QUOTA_IS_ACTUAL(inode)) ++ vzquota_inode_qmblk_recalc(inode, &data->qlnk); ++ ++ qmblk = INODE_QLNK(inode)->qmblk; ++ if (qmblk != VZ_QUOTA_BAD) { ++ if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) { ++ /* ++ * Note that in the current implementation, ++ * inode_qmblk_lock can theoretically be dropped here. ++ * This place is serialized with quota_off because ++ * quota_off fails when there are extra dentry ++ * references and syncs inodes before removing quota ++ * information from them. ++ * However, quota usage information should stop being ++ * updated immediately after vzquota_off. ++ */ ++ qmblk_data_write_lock(qmblk); ++ } else { ++ inode_qmblk_unlock(inode->i_sb); ++ qmblk = NULL; ++ } ++ } else { ++ inode_qmblk_unlock(inode->i_sb); ++ } ++ return qmblk; ++} ++ ++void vzquota_data_unlock(struct inode *inode, ++ struct vz_quota_datast *data) ++{ ++ qmblk_data_write_unlock(INODE_QLNK(inode)->qmblk); ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(&data->qlnk); ++} ++ ++#if defined(CONFIG_VZ_QUOTA_UGID) ++/** ++ * vzquota_inode_transfer_call - call from vzquota_transfer ++ */ ++int vzquota_inode_transfer_call(struct inode *inode, struct iattr *iattr) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ struct vz_quota_ilink qlnew; ++ int mask; ++ int ret; ++ ++ might_sleep(); ++ vzquota_qlnk_init(&qlnew); ++start: ++ qmblk = vzquota_inode_data(inode, &data); ++ ret = NO_QUOTA; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out_destr; ++ ret = QUOTA_OK; ++ if (qmblk == NULL) ++ goto out_destr; ++ qmblk_get(qmblk); ++ ++ ret = QUOTA_OK; ++ if (!(qmblk->dq_flags & VZDQUG_ON)) ++ /* no ugid quotas */ ++ goto out_unlock; ++ ++ mask = 0; ++ if ((iattr->ia_valid & ATTR_UID) && iattr->ia_uid != inode->i_uid) ++ mask |= 1 << USRQUOTA; ++ if ((iattr->ia_valid & ATTR_GID) && iattr->ia_gid != inode->i_gid) ++ mask |= 1 << GRPQUOTA; ++ while (1) { ++ if (vzquota_qlnk_is_empty(&qlnew) && ++ vzquota_qlnk_fill_attr(&qlnew, inode, iattr, mask, qmblk)) ++ break; ++ if (qlnew.qmblk == INODE_QLNK(inode)->qmblk && ++ qlnew.qmblk == qmblk) ++ goto finish; ++ if (vzquota_qlnk_reinit_attr(&qlnew, inode, qmblk)) ++ break; ++ } ++ ++ /* prepare for restart */ ++ vzquota_data_unlock(inode, &data); ++ qmblk_put(qmblk); ++ goto start; ++ ++finish: ++ /* all references obtained successfully */ ++ ret = vzquota_transfer_usage(inode, mask, &qlnew); ++ if (!ret) { ++ vzquota_qlnk_swap(&qlnew, INODE_QLNK(inode)); ++ set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_TRANS); ++ } ++out_unlock: ++ vzquota_data_unlock(inode, &data); ++ qmblk_put(qmblk); ++out_destr: ++ vzquota_qlnk_destroy(&qlnew); ++ return ret; ++} ++#endif ++ ++int vzquota_rename_check(struct inode *inode, ++ struct inode *old_dir, struct inode *new_dir) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ilink qlnk1, qlnk2, qlnk3; ++ int c, ret; ++ ++ if (inode->i_sb != old_dir->i_sb || inode->i_sb != new_dir->i_sb) ++ return -1; ++ ++ might_sleep(); ++ ++ vzquota_qlnk_init(&qlnk1); ++ vzquota_qlnk_init(&qlnk2); ++ vzquota_qlnk_init(&qlnk3); ++ inode_qmblk_lock(inode->i_sb); ++ 
__vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
++	__vzquota_inode_init(old_dir, VZ_QUOTAO_INICAL);
++	__vzquota_inode_init(new_dir, VZ_QUOTAO_INICAL);
++
++	do {
++		c = 0;
++		if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) ||
++		    !VZ_QUOTA_IS_ACTUAL(inode)) {
++			vzquota_inode_qmblk_recalc(inode, &qlnk1);
++			c++;
++		}
++		if (vzquota_qlnk_is_empty(INODE_QLNK(new_dir)) ||
++		    !VZ_QUOTA_IS_ACTUAL(new_dir)) {
++			vzquota_inode_qmblk_recalc(new_dir, &qlnk2);
++			c++;
++		}
++	} while (c);
++
++	ret = 0;
++	qmblk = INODE_QLNK(inode)->qmblk;
++	if (qmblk != INODE_QLNK(new_dir)->qmblk) {
++		ret = -1;
++		while (vzquota_qlnk_is_empty(INODE_QLNK(old_dir)) ||
++		       !VZ_QUOTA_IS_ACTUAL(old_dir))
++			vzquota_inode_qmblk_recalc(old_dir, &qlnk3);
++		if (qmblk != VZ_QUOTA_BAD &&
++		    !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) &&
++		    qmblk->dq_root_path.dentry->d_inode == inode &&
++		    VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(new_dir)->qmblk,
++					inode->i_sb) &&
++		    VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(old_dir)->qmblk,
++					inode->i_sb))
++			/* quota root rename is allowed */
++			ret = 0;
++	}
++
++	inode_qmblk_unlock(inode->i_sb);
++	vzquota_qlnk_destroy(&qlnk3);
++	vzquota_qlnk_destroy(&qlnk2);
++	vzquota_qlnk_destroy(&qlnk1);
++	return ret;
++}
++
++/*
++ * Scan the parent's subdirectories and record the paths of busy dentries.
++ * @parent: parent dentry
++ * @buf: buffer to store the paths.
++ */
++static void vzdquota_read_busy_dentries(struct path *parent,
++		char *buf, int buflen)
++{
++	struct dentry *this_parent = parent->dentry;
++	struct list_head *next;
++	char *res, *end, *start;
++	struct path root, path;
++	int len;
++
++	if (!buf || buflen <= 0)
++		return;
++
++	path.mnt = parent->mnt;
++	/* From d_path() ... */
++	read_lock(&current->fs->lock);
++	path_get(&current->fs->root);
++	root = current->fs->root;
++	read_unlock(&current->fs->lock);
++
++	spin_lock(&dcache_lock);
++
++	end = buf + buflen;
++	start = buf;
++repeat:
++	next = this_parent->d_subdirs.next;
++resume:
++	while (next != &this_parent->d_subdirs) {
++		struct list_head *tmp = next;
++		struct dentry *dentry;
++		int subdirs;
++
++		dentry = list_entry(tmp, struct dentry, d_u.d_child);
++		next = tmp->next;
++		subdirs = !list_empty(&dentry->d_subdirs);
++
++		if (atomic_read(&dentry->d_count) && !subdirs) {
++			if (!buflen)
++				goto out;
++			/*
++			 * Note: __d_path will store the filename at the
++			 * end of buf.
++			 */
++			path.dentry = dentry;
++			res = __d_path(&path, &root, buf, buflen);
++			/* Exit if the name is too long */
++			if (IS_ERR(res))
++				goto out;
++
++			/*
++			 * Move the string obtained by __d_path behind the
++			 * last dentry path already packed in buf.
++			 */
++			len = end - res;
++			BUG_ON(len <= 0);
++
++			memmove(buf, res, len);
++
++			/* Trick: replace \0 by \n */
++			if (buf != start)
++				*(char *)(buf - 1) = '\n';
++
++			buf += len;
++			buflen -= len;
++		}
++
++		/*
++		 * Descend a level if the d_subdirs list is non-empty.
++		 */
++		if (subdirs) {
++			this_parent = dentry;
++			goto repeat;
++		}
++	}
++	/*
++	 * All done at this level ... ascend and resume the search.
++	 */
++	if (this_parent != parent->dentry) {
++		next = this_parent->d_u.d_child.next;
++		this_parent = this_parent->d_parent;
++		goto resume;
++	}
++out:
++	/* From d_path() ...
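++ *
++ * (Editorial walk-through, not from the patch.) __d_path() writes each
++ * path at the end of the buffer; the loop above then packs the results
++ * toward the front:
++ *
++ *	after __d_path:  [ .........free......... "/a/busy1" ]  <- res
++ *	after memmove:   [ "/a/busy1" .........free......... ]  buf += len
++ *	next entry:      the previous terminator becomes '\n', so the page
++ *			 ends up holding "/a/busy1\n/b/busy2..." for userspace.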
*/ ++ spin_unlock(&dcache_lock); ++ path_put(&root); ++} ++ ++/* ---------------------------------------------------------------------- ++ * ++ * qmblk-related parts of on/off operations ++ * ++ * --------------------------------------------------------------------- */ ++ ++/** ++ * vzquota_check_dtree - check dentry tree if quota on/off is allowed ++ * ++ * This function doesn't allow quota to be turned on/off if some dentries in ++ * the tree have external references. ++ * In addition to technical reasons, it enforces user-space correctness: ++ * current usage (taken from or reported to the user space) can be meaningful ++ * and accurate only if the tree is not being modified. ++ * Side effect: additional vfsmount structures referencing the tree (bind ++ * mounts of tree nodes to some other places) are not allowed at on/off time. ++ * ++ * Store busy dentries path to the buf (if passed) in case of vzquota_off ++ * ioctl fail. ++ */ ++int vzquota_check_dtree(struct vz_quota_master *qmblk, int off, ++ char *buf, int buflen) ++{ ++ struct dentry *dentry; ++ int err, count; ++ ++ err = -EBUSY; ++ dentry = qmblk->dq_root_path.dentry; ++ ++ if (d_unhashed(dentry) && dentry != dentry->d_sb->s_root) ++ goto unhashed; ++ ++ /* attempt to shrink */ ++ if (!list_empty(&dentry->d_subdirs)) { ++ spin_unlock(&dcache_lock); ++ inode_qmblk_unlock(dentry->d_sb); ++ shrink_dcache_parent(dentry); ++ inode_qmblk_lock(dentry->d_sb); ++ spin_lock(&dcache_lock); ++ if (!list_empty(&dentry->d_subdirs)) { ++ spin_unlock(&dcache_lock); ++ vzdquota_read_busy_dentries(&qmblk->dq_root_path, ++ buf, buflen); ++ spin_lock(&dcache_lock); ++ goto out; ++ } ++ ++ count = 1; ++ if (dentry == dentry->d_sb->s_root) ++ count += 2; /* sb and mnt refs */ ++ if (atomic_read(&dentry->d_count) < count) { ++ printk(KERN_ERR "%s: too small count %d vs %d.\n", ++ __FUNCTION__, ++ atomic_read(&dentry->d_count), count); ++ goto out; ++ } ++ if (atomic_read(&dentry->d_count) > count) ++ goto out; ++ } ++ ++ err = 0; ++out: ++ return err; ++ ++unhashed: ++ /* ++ * Quota root is removed. ++ * Allow to turn quota off, but not on. ++ */ ++ if (off) ++ err = 0; ++ goto out; ++} ++ ++int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, ++ struct vz_quota_master *qmblk, char __user *ubuf) ++{ ++ struct vz_quota_ilink qlnk; ++ struct vz_quota_master *qold, *qnew; ++ int err; ++ char *buf; ++ ++ buf = (ubuf != NULL) ? (char *)__get_free_page(GFP_KERNEL) : NULL; ++ ++ might_sleep(); ++ ++ qold = NULL; ++ qnew = vzquota_alloc_fake(); ++ if (qnew == NULL) { ++ free_page((unsigned long)buf); ++ return -ENOMEM; ++ } ++ ++ vzquota_qlnk_init(&qlnk); ++ inode_qmblk_lock(sb); ++ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); ++ ++ spin_lock(&dcache_lock); ++ while (1) { ++ err = vzquota_check_dtree(qmblk, 0, buf, PAGE_SIZE); ++ if (err) ++ break; ++ if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)) ++ break; ++ } ++ set_qlnk_origin(INODE_QLNK(inode), VZ_QUOTAO_ON); ++ spin_unlock(&dcache_lock); ++ ++ if (!err) { ++ qold = __VZ_QUOTA_NOQUOTA(sb); ++ qold->dq_flags |= VZDQ_NOACT; ++ __VZ_QUOTA_NOQUOTA(sb) = qnew; ++ } ++ ++ inode_qmblk_unlock(sb); ++ vzquota_qlnk_destroy(&qlnk); ++ if (qold != NULL) ++ qmblk_put(qold); ++ ++ if (buf) { ++ if (copy_to_user(ubuf, buf, PAGE_SIZE)) ++ ; ++ free_page((unsigned long)buf); ++ } ++ return err; ++} ++ ++int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk, ++ char __user *ubuf, int force) ++{ ++ int ret; ++ char *buf; ++ ++ buf = (ubuf != NULL) ? 
(char *)__get_free_page(GFP_KERNEL) : NULL; ++ ++ ret = 0; ++ inode_qmblk_lock(sb); ++ ++ spin_lock(&dcache_lock); ++ if (vzquota_check_dtree(qmblk, 1, buf, PAGE_SIZE) && !force) ++ ret = -EBUSY; ++ spin_unlock(&dcache_lock); ++ ++ if (!ret) ++ qmblk->dq_flags |= VZDQ_NOACT | VZDQ_NOQUOT; ++ inode_qmblk_unlock(sb); ++ ++ if (buf) { ++ if (copy_to_user(ubuf, buf, PAGE_SIZE)) ++ ; ++ free_page((unsigned long)buf); ++ } ++ return ret; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * External interfaces ++ * ++ * ---------------------------------------------------------------------*/ ++ ++static int vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ int err; ++ ++ switch (cmd) { ++ case VZCTL_QUOTA_NEW_CTL: { ++ struct vzctl_quotactl qb; ++ ++ err = -EFAULT; ++ if (copy_from_user(&qb, (void __user *)arg, sizeof(qb))) ++ break; ++ err = do_vzquotactl(qb.cmd, qb.quota_id, ++ qb.qstat, qb.ve_root, 0); ++ break; ++ } ++#ifdef CONFIG_VZ_QUOTA_UGID ++ case VZCTL_QUOTA_UGID_CTL: { ++ struct vzctl_quotaugidctl qub; ++ ++ err = -EFAULT; ++ if (copy_from_user(&qub, (void __user *)arg, sizeof(qub))) ++ break; ++ err = do_vzquotaugidctl(qub.cmd, qub.quota_id, ++ qub.ugid_index, qub.ugid_size, qub.addr, 0); ++ break; ++ } ++#endif ++ default: ++ err = -ENOTTY; ++ } ++ return err; ++} ++ ++#ifdef CONFIG_COMPAT ++static int compat_vzquota_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ int err; ++ ++ switch (cmd) { ++ case VZCTL_COMPAT_QUOTA_CTL: { ++ struct compat_vzctl_quotactl cs; ++ ++ err = -EFAULT; ++ if (copy_from_user(&cs, (void *)arg, sizeof(cs))) ++ break; ++ err = do_vzquotactl(cs.cmd, cs.quota_id, ++ compat_ptr(cs.qstat), ++ compat_ptr(cs.ve_root), 1); ++ break; ++ } ++#ifdef CONFIG_VZ_QUOTA_UGID ++ case VZCTL_COMPAT_QUOTA_UGID_CTL: { ++ struct compat_vzctl_quotaugidctl cs; ++ ++ err = -EFAULT; ++ if (copy_from_user(&cs, (void *)arg, sizeof(cs))) ++ break; ++ ++ err = do_vzquotaugidctl(cs.cmd, cs.quota_id, cs.ugid_index, ++ cs.ugid_size, compat_ptr(cs.addr), 1); ++ break; ++ } ++#endif ++ default: ++ err = -ENOIOCTLCMD; ++ } ++ return err; ++} ++#endif ++ ++static struct vzioctlinfo vzdqcalls = { ++ .type = VZDQCTLTYPE, ++ .ioctl = vzquota_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = compat_vzquota_ioctl, ++#endif ++ .owner = THIS_MODULE, ++}; ++ ++/** ++ * vzquota_dstat - get quota usage info for virtual superblock ++ */ ++static int vzquota_dstat(struct super_block *super, struct dq_stat *qstat) ++{ ++ struct vz_quota_master *qmblk; ++ ++ qmblk = vzquota_find_qmblk(super); ++ if (qmblk == NULL) ++ return -ENOENT; ++ if (qmblk == VZ_QUOTA_BAD) { ++ memset(qstat, 0, sizeof(*qstat)); ++ return 0; ++ } ++ ++ qmblk_data_read_lock(qmblk); ++ memcpy(qstat, &qmblk->dq_stat, sizeof(*qstat)); ++ qmblk_data_read_unlock(qmblk); ++ qmblk_put(qmblk); ++ return 0; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Init/exit helpers ++ * ++ * ---------------------------------------------------------------------*/ ++ ++static int vzquota_cache_init(void) ++{ ++ int i; ++ ++ vzquota_cachep = kmem_cache_create("vz_quota_master", ++ sizeof(struct vz_quota_master), ++ 0, SLAB_HWCACHE_ALIGN, NULL); ++ if (vzquota_cachep == NULL) { ++ printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); ++ goto nomem2; ++ } ++ for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&vzquota_hash_table[i]); ++ ++ return 0; ++ ++nomem2: ++ return -ENOMEM; ++} ++ ++static void 
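++/*
++ * (Editorial note, not from the patch.) The CONFIG_COMPAT path above is
++ * the usual 32-bit-userland thunk: pointer fields inside the copied
++ * struct are 32-bit user addresses, so each one is widened with
++ * compat_ptr() before use, e.g.:
++ *
++ *	struct compat_vzctl_quotaugidctl cs;
++ *	if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
++ *		break;
++ *	do_vzquotaugidctl(cs.cmd, cs.quota_id, cs.ugid_index,
++ *			cs.ugid_size, compat_ptr(cs.addr), 1);
++ *
++ * The trailing 1 (vs. 0 on the native path) presumably tells the
++ * handler to expect compat-layout payloads.
++ */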
vzquota_cache_release(void)
++{
++	int i;
++
++	/* sanity check */
++	for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++)
++		if (!list_empty(&vzquota_hash_table[i]))
++			BUG();
++
++	/* release caches */
++	kmem_cache_destroy(vzquota_cachep);
++	vzquota_cachep = NULL;
++}
++
++static int quota_notifier_call(struct vnotifier_block *self,
++		unsigned long n, void *data, int err)
++{
++	struct virt_info_quota *viq;
++	struct super_block *sb;
++
++	viq = (struct virt_info_quota *)data;
++	switch (n) {
++	case VIRTINFO_QUOTA_ON:
++		err = NOTIFY_BAD;
++		if (!try_module_get(THIS_MODULE))
++			break;
++		sb = viq->super;
++		memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
++		INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
++		INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
++		err = NOTIFY_OK;
++		break;
++	case VIRTINFO_QUOTA_OFF:
++		module_put(THIS_MODULE);
++		err = NOTIFY_OK;
++		break;
++	case VIRTINFO_QUOTA_GETSTAT:
++		err = NOTIFY_BAD;
++		if (vzquota_dstat(viq->super, viq->qstat))
++			break;
++		err = NOTIFY_OK;
++		break;
++	case VIRTINFO_QUOTA_DISABLE:
++		err = NOTIFY_OK;
++		vzquota_inode_off((struct inode *)data);
++		break;
++	}
++	return err;
++}
++
++struct vnotifier_block quota_notifier_block = {
++	.notifier_call = quota_notifier_call,
++	.priority = INT_MAX,
++};
++
++/* ----------------------------------------------------------------------
++ *
++ * Init/exit procedures
++ *
++ * ---------------------------------------------------------------------*/
++
++static int __init vzquota_init(void)
++{
++	int err;
++
++	if ((err = vzquota_cache_init()) != 0)
++		goto out_cache;
++
++	if ((err = vzquota_proc_init()) != 0)
++		goto out_proc;
++
++#ifdef CONFIG_VZ_QUOTA_UGID
++	if ((err = vzquota_ugid_init()) != 0)
++		goto out_ugid;
++#endif
++
++	init_MUTEX(&vz_quota_sem);
++	vzioctl_register(&vzdqcalls);
++	virtinfo_notifier_register(VITYPE_QUOTA, &quota_notifier_block);
++#if defined(CONFIG_VZ_QUOTA_UGID) && defined(CONFIG_PROC_FS)
++	vzaquota_init();
++#endif
++
++	return 0;
++
++#ifdef CONFIG_VZ_QUOTA_UGID
++out_ugid:
++	vzquota_proc_release();
++#endif
++out_proc:
++	vzquota_cache_release();
++out_cache:
++	return err;
++}
++
++#if defined(VZ_QUOTA_UNLOAD)
++static void __exit vzquota_release(void)
++{
++	virtinfo_notifier_unregister(VITYPE_QUOTA, &quota_notifier_block);
++	vzioctl_unregister(&vzdqcalls);
++#ifdef CONFIG_VZ_QUOTA_UGID
++#ifdef CONFIG_PROC_FS
++	vzaquota_fini();
++#endif
++	vzquota_ugid_release();
++#endif
++	vzquota_proc_release();
++	vzquota_cache_release();
++}
++#endif
++
++MODULE_AUTHOR("SWsoft ");
++MODULE_DESCRIPTION("Virtuozzo Disk Quota");
++MODULE_LICENSE("GPL v2");
++
++module_init(vzquota_init)
++#if defined(VZ_QUOTA_UNLOAD)
++module_exit(vzquota_release)
++#endif
+diff --git a/include/asm-ia64/mman.h b/include/asm-ia64/mman.h
+index c73b878..849dbe9 100644
+--- a/include/asm-ia64/mman.h
++++ b/include/asm-ia64/mman.h
+@@ -18,6 +18,7 @@
+ #define MAP_NORESERVE	0x04000	/* don't check for reservations */
+ #define MAP_POPULATE	0x08000	/* populate (prefault) pagetables */
+ #define MAP_NONBLOCK	0x10000	/* do not block on IO */
++#define MAP_EXECPRIO	0x20000	/* soft ubc charge */
+
+ #define MCL_CURRENT	1	/* lock all current mappings */
+ #define MCL_FUTURE	2	/* lock all future mappings */
+diff --git a/include/asm-ia64/pgalloc.h b/include/asm-ia64/pgalloc.h
+index b9ac1a6..9504729 100644
+--- a/include/asm-ia64/pgalloc.h
++++ b/include/asm-ia64/pgalloc.h
+@@ -20,11 +20,13 @@
+ #include
+ #include
+
++#include
++
+ #include
+
+ static inline pgd_t
*pgd_alloc(struct mm_struct *mm) + { +- return quicklist_alloc(0, GFP_KERNEL, NULL); ++ return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL); + } + + static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) +@@ -41,7 +43,7 @@ pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud) + + static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) + { +- return quicklist_alloc(0, GFP_KERNEL, NULL); ++ return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL); + } + + static inline void pud_free(struct mm_struct *mm, pud_t *pud) +@@ -59,7 +61,7 @@ pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd) + + static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) + { +- return quicklist_alloc(0, GFP_KERNEL, NULL); ++ return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL); + } + + static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +@@ -87,7 +89,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr) + struct page *page; + void *pg; + +- pg = quicklist_alloc(0, GFP_KERNEL, NULL); ++ pg = quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_SOFT_UBC, NULL); + if (!pg) + return NULL; + page = virt_to_page(pg); +diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h +index 6aff126..c148ef8 100644 +--- a/include/asm-ia64/processor.h ++++ b/include/asm-ia64/processor.h +@@ -361,7 +361,7 @@ struct thread_struct { + regs->loadrs = 0; \ + regs->r8 = get_dumpable(current->mm); /* set "don't zap registers" flag */ \ + regs->r12 = new_sp - 16; /* allocate 16 byte scratch area */ \ +- if (unlikely(!get_dumpable(current->mm))) { \ ++ if (unlikely(!get_dumpable(current->mm) || !current->mm->vps_dumpable)) { \ + /* \ + * Zap scratch regs to avoid leaking bits between processes with different \ + * uid/privileges. \ +diff --git a/include/asm-ia64/timex.h b/include/asm-ia64/timex.h +index 05a6baf..b2dc7e7 100644 +--- a/include/asm-ia64/timex.h ++++ b/include/asm-ia64/timex.h +@@ -10,6 +10,7 @@ + * Also removed cacheflush_time as it's entirely unused. 
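+ *
+ * (Editorial note, not in the patch.) The pgalloc hunks above swap
+ * GFP_KERNEL for GFP_KERNEL_UBC | __GFP_SOFT_UBC. The flag definitions
+ * are outside this excerpt; judging from thread_info_64.h below, which
+ * spells the same request as THREAD_FLAGS | __GFP_UBC, GFP_KERNEL_UBC is
+ * presumably GFP_KERNEL plus a "charge this allocation to the caller's
+ * beancounter" bit, with __GFP_SOFT_UBC selecting the softer UB_SOFT
+ * severity from enum ub_severity in include/bc/beancounter.h below.
+ * Net effect: a container's page tables are billed to that container,
+ * e.g.
+ *
+ *	quicklist_alloc(0, GFP_KERNEL_UBC | __GFP_SOFT_UBC, NULL);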
+ */ + ++#ifdef __KERNEL__ + #include + #include + +@@ -39,4 +40,8 @@ get_cycles (void) + return ret; + } + ++extern unsigned int cpu_khz; ++ ++#endif ++ + #endif /* _ASM_IA64_TIMEX_H */ +diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h +index e603147..f5af201 100644 +--- a/include/asm-ia64/unistd.h ++++ b/include/asm-ia64/unistd.h +@@ -302,6 +302,16 @@ + #define __NR_timerfd_create 1310 + #define __NR_timerfd_settime 1311 + #define __NR_timerfd_gettime 1312 ++#define __NR_fairsched_vcpus 1499 ++#define __NR_fairsched_mknod 1500 ++#define __NR_fairsched_rmnod 1501 ++#define __NR_fairsched_chwt 1502 ++#define __NR_fairsched_mvpr 1503 ++#define __NR_fairsched_rate 1504 ++#define __NR_getluid 1505 ++#define __NR_setluid 1506 ++#define __NR_setublimit 1507 ++#define __NR_ubstat 1508 + + #ifdef __KERNEL__ + +diff --git a/include/asm-powerpc/mman.h b/include/asm-powerpc/mman.h +index 24cf664..0d4a60f 100644 +--- a/include/asm-powerpc/mman.h ++++ b/include/asm-powerpc/mman.h +@@ -23,5 +23,6 @@ + + #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ + #define MAP_NONBLOCK 0x10000 /* do not block on IO */ ++#define MAP_EXECPRIO 0x20000 /* do soft ubc charge */ + + #endif /* _ASM_POWERPC_MMAN_H */ +diff --git a/include/asm-powerpc/pgalloc-64.h b/include/asm-powerpc/pgalloc-64.h +index 6898099..49075a6 100644 +--- a/include/asm-powerpc/pgalloc-64.h ++++ b/include/asm-powerpc/pgalloc-64.h +@@ -26,7 +26,8 @@ extern struct kmem_cache *pgtable_cache[]; + + static inline pgd_t *pgd_alloc(struct mm_struct *mm) + { +- return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL); ++ return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], ++ GFP_KERNEL_UBC | __GFP_SOFT_UBC); + } + + static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) +@@ -42,7 +43,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) + static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) + { + return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM], +- GFP_KERNEL|__GFP_REPEAT); ++ GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_REPEAT); + } + + static inline void pud_free(struct mm_struct *mm, pud_t *pud) +@@ -88,10 +89,15 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) + kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd); + } + ++static inline pte_t *do_pte_alloc(gfp_t flags) ++{ ++ return (pte_t *)__get_free_page(flags); ++} ++ + static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long address) + { +- return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); ++ return do_pte_alloc(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); + } + + static inline pgtable_t pte_alloc_one(struct mm_struct *mm, +@@ -100,7 +106,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, + struct page *page; + pte_t *pte; + +- pte = pte_alloc_one_kernel(mm, address); ++ pte = do_pte_alloc(GFP_KERNEL_UBC | __GFP_REPEAT | __GFP_ZERO); + if (!pte) + return NULL; + page = virt_to_page(pte); +diff --git a/include/asm-powerpc/systbl.h b/include/asm-powerpc/systbl.h +index ae7085c..7ad02b9 100644 +--- a/include/asm-powerpc/systbl.h ++++ b/include/asm-powerpc/systbl.h +@@ -316,3 +316,19 @@ COMPAT_SYS(fallocate) + SYSCALL(subpage_prot) + COMPAT_SYS_SPU(timerfd_settime) + COMPAT_SYS_SPU(timerfd_gettime) ++SYS_SKIP(313, 400) ++SYSCALL(ni_syscall) ++SYS_SKIP_END() ++SYSCALL(fairsched_mknod) /* 400 */ ++SYSCALL(fairsched_rmnod) ++SYSCALL(fairsched_chwt) ++SYSCALL(fairsched_mvpr) ++SYSCALL(fairsched_rate) ++SYSCALL(fairsched_vcpus) ++SYS_SKIP(406, 410) 
++SYSCALL(ni_syscall) ++SYS_SKIP_END() ++SYSCALL(getluid) /* 410 */ ++SYSCALL(setluid) ++SYSCALL(setublimit) ++SYSCALL(ubstat) +diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h +index ce91bb6..dac5902 100644 +--- a/include/asm-powerpc/unistd.h ++++ b/include/asm-powerpc/unistd.h +@@ -336,9 +336,14 @@ + #define __NR_timerfd_settime 311 + #define __NR_timerfd_gettime 312 + ++#define __NR_getluid 410 ++#define __NR_setluid 411 ++#define __NR_setublimit 412 ++#define __NR_ubstat 413 ++ + #ifdef __KERNEL__ + +-#define __NR_syscalls 313 ++#define __NR_syscalls 414 + + #define __NR__exit __NR_exit + #define NR_syscalls __NR_syscalls +diff --git a/include/asm-sparc64/mman.h b/include/asm-sparc64/mman.h +index d2ae67c..51103a7 100644 +--- a/include/asm-sparc64/mman.h ++++ b/include/asm-sparc64/mman.h +@@ -20,6 +20,7 @@ + + #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ + #define MAP_NONBLOCK 0x10000 /* do not block on IO */ ++#define MAP_EXECPRIO 0x20000 /* do soft ubc charge */ + + #ifdef __KERNEL__ + #ifndef __ASSEMBLY__ +diff --git a/include/asm-sparc64/pgalloc.h b/include/asm-sparc64/pgalloc.h +index 326de10..f556b52 100644 +--- a/include/asm-sparc64/pgalloc.h ++++ b/include/asm-sparc64/pgalloc.h +@@ -16,7 +16,7 @@ + + static inline pgd_t *pgd_alloc(struct mm_struct *mm) + { +- return quicklist_alloc(0, GFP_KERNEL, NULL); ++ return quicklist_alloc(0, GFP_KERNEL_UBC, NULL); + } + + static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) +@@ -28,7 +28,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) + + static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) + { +- return quicklist_alloc(0, GFP_KERNEL, NULL); ++ return quicklist_alloc(0, GFP_KERNEL_UBC|__GFP_REPEAT, NULL); + } + + static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +@@ -48,7 +48,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, + struct page *page; + void *pg; + +- pg = quicklist_alloc(0, GFP_KERNEL, NULL); ++ pg = quicklist_alloc(0, GFP_KERNEL_UBC, NULL); + if (!pg) + return NULL; + page = virt_to_page(pg); +diff --git a/include/asm-sparc64/thread_info.h b/include/asm-sparc64/thread_info.h +index e5873e3..4d21580 100644 +--- a/include/asm-sparc64/thread_info.h ++++ b/include/asm-sparc64/thread_info.h +@@ -161,14 +161,14 @@ register struct thread_info *current_thread_info_reg asm("g6"); + struct thread_info *ret; \ + \ + ret = (struct thread_info *) \ +- __get_free_pages(GFP_KERNEL, __THREAD_INFO_ORDER); \ ++ __get_free_pages(GFP_KERNEL_UBC, __THREAD_INFO_ORDER);\ + if (ret) \ + memset(ret, 0, PAGE_SIZE<<__THREAD_INFO_ORDER); \ + ret; \ + }) + #else + #define alloc_thread_info(tsk) \ +- ((struct thread_info *)__get_free_pages(GFP_KERNEL, __THREAD_INFO_ORDER)) ++ ((struct thread_info *)__get_free_pages(GFP_KERNEL_UBC, __THREAD_INFO_ORDER)) + #endif + + #define free_thread_info(ti) \ +@@ -235,6 +235,7 @@ register struct thread_info *current_thread_info_reg asm("g6"); + #define TIF_ABI_PENDING 12 + #define TIF_MEMDIE 13 + #define TIF_POLLING_NRFLAG 14 ++#define TIF_FREEZE 15 /* Freeze request (atomic PF_FREEZE) */ + + #define _TIF_SYSCALL_TRACE (1< 2 + static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) + { +- return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); ++ return (pmd_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT); + } + + static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +@@ -94,7 +94,7 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) + + 
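+/*
+ * (Editorial bookkeeping for the powerpc syscall hunks above, not from
+ * the patch.) The new entries keep systbl.h and unistd.h in step:
+ *
+ *	313..399  padded with ni_syscall via SYS_SKIP(313, 400)
+ *	400..405  fairsched_mknod .. fairsched_vcpus
+ *	406..409  padded via SYS_SKIP(406, 410)
+ *	410..413  getluid, setluid, setublimit, ubstat
+ *
+ * hence __NR_syscalls grows from 313 to 414, i.e. __NR_ubstat (413) + 1.
+ */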
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) + { +- return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); ++ return (pud_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT); + } + + static inline void pud_free(struct mm_struct *mm, pud_t *pud) +diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h +index 5591052..5fd8bee 100644 +--- a/include/asm-x86/processor.h ++++ b/include/asm-x86/processor.h +@@ -878,8 +878,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); + /* This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +-#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ +- 0xc0000000 : 0xFFFFe000) ++#define IA32_PAGE_OFFSET 0xc0000000 + + #define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ + IA32_PAGE_OFFSET : TASK_SIZE64) +diff --git a/include/asm-x86/thread_info_32.h b/include/asm-x86/thread_info_32.h +index b633882..2776f8d 100644 +--- a/include/asm-x86/thread_info_32.h ++++ b/include/asm-x86/thread_info_32.h +@@ -96,10 +96,10 @@ static inline struct thread_info *current_thread_info(void) + /* thread information allocation */ + #ifdef CONFIG_DEBUG_STACK_USAGE + #define alloc_thread_info(tsk) ((struct thread_info *) \ +- __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(THREAD_SIZE))) ++ __get_free_pages(GFP_KERNEL_UBC | __GFP_ZERO, get_order(THREAD_SIZE))) + #else + #define alloc_thread_info(tsk) ((struct thread_info *) \ +- __get_free_pages(GFP_KERNEL, get_order(THREAD_SIZE))) ++ __get_free_pages(GFP_KERNEL_UBC, get_order(THREAD_SIZE))) + #endif + + #else /* !__ASSEMBLY__ */ +diff --git a/include/asm-x86/thread_info_64.h b/include/asm-x86/thread_info_64.h +index cb69f70..a787bec 100644 +--- a/include/asm-x86/thread_info_64.h ++++ b/include/asm-x86/thread_info_64.h +@@ -83,7 +83,8 @@ static inline struct thread_info *stack_thread_info(void) + #endif + + #define alloc_thread_info(tsk) \ +- ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) ++ ((struct thread_info *)__get_free_pages(THREAD_FLAGS | __GFP_UBC,\ ++ THREAD_ORDER)) + + #else /* !__ASSEMBLY__ */ + +@@ -124,6 +125,7 @@ static inline struct thread_info *stack_thread_info(void) + #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ + #define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ + #define TIF_NOTSC 28 /* TSC is not accessible in userland */ ++#define TIF_RESUME 29 + + #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) + #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) +@@ -145,6 +147,7 @@ static inline struct thread_info *stack_thread_info(void) + #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) + #define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) + #define _TIF_NOTSC (1 << TIF_NOTSC) ++#define _TIF_RESUME (1< ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form. 
++ */ ++#define UB_MAXVALUE ( (1UL << (sizeof(unsigned long)*8-1)) - 1) ++ ++ ++/* ++ * Resource management structures ++ * Serialization issues: ++ * beancounter list management is protected via ub_hash_lock ++ * task pointers are set only for current task and only once ++ * refcount is managed atomically ++ * value and limit comparison and change are protected by per-ub spinlock ++ */ ++ ++struct page_beancounter; ++struct task_beancounter; ++struct sock_beancounter; ++ ++struct page_private { ++ unsigned long ubp_unused_privvmpages; ++ unsigned long ubp_tmpfs_respages; ++ unsigned long ubp_swap_pages; ++ unsigned long long ubp_held_pages; ++}; ++ ++struct sock_private { ++ unsigned long ubp_rmem_thres; ++ unsigned long ubp_wmem_pressure; ++ unsigned long ubp_maxadvmss; ++ unsigned long ubp_rmem_pressure; ++ int ubp_tw_count; ++#define UB_RMEM_EXPAND 0 ++#define UB_RMEM_KEEP 1 ++#define UB_RMEM_SHRINK 2 ++ struct list_head ubp_other_socks; ++ struct list_head ubp_tcp_socks; ++ atomic_t ubp_orphan_count; ++}; ++ ++struct ub_percpu_struct { ++ unsigned long unmap; ++ unsigned long swapin; ++#ifdef CONFIG_BC_IO_ACCOUNTING ++ unsigned long long bytes_wrote; ++ unsigned long long bytes_read; ++ unsigned long long bytes_cancelled; ++#endif ++#ifdef CONFIG_BC_DEBUG_KMEM ++ long pages_charged; ++ long vmalloc_charged; ++ long pbcs; ++#endif ++ unsigned long sync; ++ unsigned long sync_done; ++ ++ unsigned long fsync; ++ unsigned long fsync_done; ++ ++ unsigned long fdsync; ++ unsigned long fdsync_done; ++ ++ unsigned long frsync; ++ unsigned long frsync_done; ++ ++ unsigned long write; ++ unsigned long read; ++ unsigned long long wchar; ++ unsigned long long rchar; ++}; ++ ++struct user_beancounter ++{ ++ unsigned long ub_magic; ++ atomic_t ub_refcount; ++ struct list_head ub_list; ++ struct hlist_node ub_hash; ++ ++ union { ++ struct rcu_head rcu; ++ struct execute_work cleanup; ++ }; ++ ++ spinlock_t ub_lock; ++ uid_t ub_uid; ++ ++ struct ub_rate_info ub_limit_rl; ++ int ub_oom_noproc; ++ ++ struct page_private ppriv; ++#define ub_unused_privvmpages ppriv.ubp_unused_privvmpages ++#define ub_tmpfs_respages ppriv.ubp_tmpfs_respages ++#define ub_swap_pages ppriv.ubp_swap_pages ++#define ub_held_pages ppriv.ubp_held_pages ++ struct sock_private spriv; ++#define ub_rmem_thres spriv.ubp_rmem_thres ++#define ub_maxadvmss spriv.ubp_maxadvmss ++#define ub_rmem_pressure spriv.ubp_rmem_pressure ++#define ub_wmem_pressure spriv.ubp_wmem_pressure ++#define ub_tcp_sk_list spriv.ubp_tcp_socks ++#define ub_other_sk_list spriv.ubp_other_socks ++#define ub_orphan_count spriv.ubp_orphan_count ++#define ub_tw_count spriv.ubp_tw_count ++ struct ub_iopriv iopriv; ++ ++ struct user_beancounter *parent; ++ void *private_data; ++ unsigned long ub_aflags; ++ ++#ifdef CONFIG_PROC_FS ++ struct proc_dir_entry *proc; ++#endif ++ ++ /* resources statistic and settings */ ++ struct ubparm ub_parms[UB_RESOURCES]; ++ /* resources statistic for last interval */ ++ struct ubparm ub_store[UB_RESOURCES]; ++ ++ struct ub_percpu_struct *ub_percpu; ++#ifdef CONFIG_BC_IO_ACCOUNTING ++ /* these are protected with pb_lock */ ++ unsigned long long bytes_wrote; ++ unsigned long long bytes_dirtied; ++ unsigned long long bytes_dirty_missed; ++ unsigned long io_pb_held; ++#endif ++#ifdef CONFIG_BC_DEBUG_KMEM ++ struct list_head ub_cclist; ++#endif ++}; ++ ++enum ub_severity { UB_HARD, UB_SOFT, UB_FORCE }; ++ ++#define UB_AFLAG_NOTIF_PAGEIN 0 ++ ++static inline ++struct user_beancounter *top_beancounter(struct user_beancounter *ub) ++{ ++ 
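++	/*
++	 * (Editorial comment, not in the patch.) Beancounters form a
++	 * parent-linked hierarchy (see get_subbeancounter_byid below);
++	 * this walk resolves any sub-beancounter to its top-level one,
++	 * which the _notop charge helpers below treat specially:
++	 */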
while (ub->parent != NULL) ++ ub = ub->parent; ++ return ub; ++} ++ ++static inline int ub_barrier_hit(struct user_beancounter *ub, int resource) ++{ ++ return ub->ub_parms[resource].held > ub->ub_parms[resource].barrier; ++} ++ ++static inline int ub_hfbarrier_hit(struct user_beancounter *ub, int resource) ++{ ++ return (ub->ub_parms[resource].held > ++ ((ub->ub_parms[resource].barrier) >> 1)); ++} ++ ++static inline int ub_barrier_farnr(struct user_beancounter *ub, int resource) ++{ ++ struct ubparm *p; ++ p = ub->ub_parms + resource; ++ return p->held <= (p->barrier >> 3); ++} ++ ++static inline int ub_barrier_farsz(struct user_beancounter *ub, int resource) ++{ ++ struct ubparm *p; ++ p = ub->ub_parms + resource; ++ return p->held <= (p->barrier >> 3) && p->barrier >= 1024 * 1024; ++} ++ ++#ifndef CONFIG_BEANCOUNTERS ++ ++#define ub_percpu_add(ub, f, v) do { } while (0) ++#define ub_percpu_sub(ub, f, v) do { } while (0) ++#define ub_percpu_inc(ub, f) do { } while (0) ++#define ub_percpu_dec(ub, f) do { } while (0) ++ ++#define mm_ub(mm) (NULL) ++ ++extern inline struct user_beancounter *get_beancounter_byuid ++ (uid_t uid, int create) { return NULL; } ++extern inline struct user_beancounter *get_beancounter ++ (struct user_beancounter *ub) { return NULL; } ++extern inline void put_beancounter(struct user_beancounter *ub) { } ++ ++static inline void ub_init_late(void) { }; ++static inline void ub_init_early(void) { }; ++ ++static inline int charge_beancounter(struct user_beancounter *ub, ++ int resource, unsigned long val, ++ enum ub_severity strict) { return 0; } ++static inline void uncharge_beancounter(struct user_beancounter *ub, ++ int resource, unsigned long val) { } ++ ++#else /* CONFIG_BEANCOUNTERS */ ++ ++#define ub_percpu_add(ub, field, v) do { \ ++ per_cpu_ptr(ub->ub_percpu, get_cpu())->field += (v); \ ++ put_cpu(); \ ++ } while (0) ++#define ub_percpu_inc(ub, field) ub_percpu_add(ub, field, 1) ++ ++#define ub_percpu_sub(ub, field, v) do { \ ++ per_cpu_ptr(ub->ub_percpu, get_cpu())->field -= (v); \ ++ put_cpu(); \ ++ } while (0) ++#define ub_percpu_dec(ub, field) ub_percpu_sub(ub, field, 1) ++ ++#define mm_ub(mm) ((mm)->mm_ub) ++/* ++ * Charge/uncharge operations ++ */ ++ ++extern int __charge_beancounter_locked(struct user_beancounter *ub, ++ int resource, unsigned long val, enum ub_severity strict); ++ ++extern void __uncharge_beancounter_locked(struct user_beancounter *ub, ++ int resource, unsigned long val); ++ ++extern void put_beancounter_safe(struct user_beancounter *ub); ++extern void __put_beancounter(struct user_beancounter *ub); ++ ++extern void uncharge_warn(struct user_beancounter *ub, int resource, ++ unsigned long val, unsigned long held); ++ ++extern const char *ub_rnames[]; ++/* ++ * Put a beancounter reference ++ */ ++ ++static inline void put_beancounter(struct user_beancounter *ub) ++{ ++ if (unlikely(ub == NULL)) ++ return; ++ ++ /* FIXME - optimize not to disable interrupts and make call */ ++ __put_beancounter(ub); ++} ++ ++/* fast put, refcount can't reach zero */ ++static inline void __put_beancounter_batch(struct user_beancounter *ub, int n) ++{ ++ atomic_sub(n, &ub->ub_refcount); ++} ++ ++static inline void put_beancounter_batch(struct user_beancounter *ub, int n) ++{ ++ if (n > 1) ++ __put_beancounter_batch(ub, n - 1); ++ __put_beancounter(ub); ++} ++ ++/* ++ * Create a new beancounter reference ++ */ ++extern struct user_beancounter *get_beancounter_byuid(uid_t uid, int create); ++ ++static inline ++struct user_beancounter *get_beancounter(struct 
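++/*
++ * (Editorial note, not from the patch.) ub_percpu_add() above pins the
++ * caller to one CPU for the duration of the update:
++ *
++ *	cpu = get_cpu();			 disables preemption
++ *	per_cpu_ptr(ub->ub_percpu, cpu)->f += v;
++ *	put_cpu();				 re-enables preemption
++ *
++ * Each increment therefore lands in exactly one CPU's slot without any
++ * locking; readers summing the slots may see slightly stale totals.
++ */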
user_beancounter *ub) ++{ ++ if (unlikely(ub == NULL)) ++ return NULL; ++ ++ atomic_inc(&ub->ub_refcount); ++ return ub; ++} ++ ++static inline ++struct user_beancounter *get_beancounter_rcu(struct user_beancounter *ub) ++{ ++ return atomic_inc_not_zero(&ub->ub_refcount) ? ub : NULL; ++} ++ ++static inline void get_beancounter_batch(struct user_beancounter *ub, int n) ++{ ++ atomic_add(n, &ub->ub_refcount); ++} ++ ++extern struct user_beancounter *get_subbeancounter_byid( ++ struct user_beancounter *, ++ int id, int create); ++ ++extern void ub_init_late(void); ++extern void ub_init_early(void); ++ ++extern int print_ub_uid(struct user_beancounter *ub, char *buf, int size); ++ ++/* ++ * Resource charging ++ * Change user's account and compare against limits ++ */ ++ ++static inline void ub_adjust_maxheld(struct user_beancounter *ub, int resource) ++{ ++ if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held) ++ ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held; ++ if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held) ++ ub->ub_parms[resource].minheld = ub->ub_parms[resource].held; ++} ++ ++int charge_beancounter(struct user_beancounter *ub, int resource, ++ unsigned long val, enum ub_severity strict); ++void uncharge_beancounter(struct user_beancounter *ub, int resource, ++ unsigned long val); ++void __charge_beancounter_notop(struct user_beancounter *ub, int resource, ++ unsigned long val); ++void __uncharge_beancounter_notop(struct user_beancounter *ub, int resource, ++ unsigned long val); ++ ++static inline void charge_beancounter_notop(struct user_beancounter *ub, ++ int resource, unsigned long val) ++{ ++ if (ub->parent != NULL) ++ __charge_beancounter_notop(ub, resource, val); ++} ++ ++static inline void uncharge_beancounter_notop(struct user_beancounter *ub, ++ int resource, unsigned long val) ++{ ++ if (ub->parent != NULL) ++ __uncharge_beancounter_notop(ub, resource, val); ++} ++ ++#endif /* CONFIG_BEANCOUNTERS */ ++ ++#ifndef CONFIG_BC_RSS_ACCOUNTING ++static inline void ub_ini_pbc(void) { } ++#else ++extern void ub_init_pbc(void); ++#endif ++#endif /* __KERNEL__ */ ++#endif /* _LINUX_BEANCOUNTER_H */ +diff --git a/include/bc/dcache.h b/include/bc/dcache.h +new file mode 100644 +index 0000000..5ebefff +--- /dev/null ++++ b/include/bc/dcache.h +@@ -0,0 +1,47 @@ ++/* ++ * include/bc/dcache.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
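ub_adjust_maxheld above is plain watermark bookkeeping: every call widens maxheld and minheld so they bracket the highest and lowest held values seen for a resource. A minimal stand-alone sketch of the same logic, using an illustrative struct rather than the kernel's ubparm layout:

    #include <stdio.h>

    struct parm_demo {
        unsigned long held, maxheld, minheld;
    };

    static void adjust_watermarks(struct parm_demo *p)
    {
        if (p->maxheld < p->held)
            p->maxheld = p->held;   /* new high-water mark */
        if (p->minheld > p->held)
            p->minheld = p->held;   /* new low-water mark */
    }

    int main(void)
    {
        struct parm_demo p = { .held = 10, .maxheld = 10, .minheld = 10 };

        p.held = 25; adjust_watermarks(&p);
        p.held = 4;  adjust_watermarks(&p);
        printf("max=%lu min=%lu\n", p.maxheld, p.minheld); /* max=25 min=4 */
        return 0;
    }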
++ * ++ */ ++ ++#ifndef __BC_DCACHE_H_ ++#define __BC_DCACHE_H_ ++ ++#include ++ ++/* ++ * UB_DCACHESIZE accounting ++ */ ++ ++struct dentry_beancounter ++{ ++ /* ++ * d_inuse = ++ * + ++ * ++ * ++ * d_inuse == -1 means that dentry is unused ++ * state change -1 => 0 causes charge ++ * state change 0 => -1 causes uncharge ++ */ ++ atomic_t d_inuse; ++ /* charged size, including name length if name is not inline */ ++ unsigned long d_ubsize; ++ struct user_beancounter *d_ub; ++}; ++ ++#ifdef CONFIG_BEANCOUNTERS ++#define ub_dget_testone(d) (atomic_inc_and_test(&(d)->dentry_bc.d_inuse)) ++#define ub_dput_testzero(d) (atomic_add_negative(-1, &(d)->dentry_bc.d_inuse)) ++#define INUSE_INIT 0 ++ ++extern int ub_dentry_on; ++#else ++#define ub_dget_testone(d) (0) ++#define ub_dput_testzero(d) (0) ++#endif ++#endif +diff --git a/include/bc/dcache_op.h b/include/bc/dcache_op.h +new file mode 100644 +index 0000000..23306e9 +--- /dev/null ++++ b/include/bc/dcache_op.h +@@ -0,0 +1,102 @@ ++/* ++ * include/bc/dcache_op.h ++ * ++ * Copyright (C) 2006 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __BC_DCACHE_OP_H_ ++#define __BC_DCACHE_OP_H_ ++ ++struct dentry; ++ ++#ifdef CONFIG_BEANCOUNTERS ++ ++#include ++#include ++#include ++ ++extern int ub_dentry_alloc_barrier; ++extern spinlock_t dcache_lock; ++ ++static inline int ub_dentry_alloc(struct dentry *d) ++{ ++ extern int __ub_dentry_alloc(struct dentry *); ++ ++ if (!ub_dentry_on) ++ return 0; ++ return __ub_dentry_alloc(d); ++} ++ ++static inline void ub_dentry_alloc_start(void) ++{ ++ extern void __ub_dentry_alloc_start(void); ++ ++ if (ub_dentry_alloc_barrier) ++ __ub_dentry_alloc_start(); ++} ++ ++static inline void ub_dentry_alloc_end(void) ++{ ++ extern void __ub_dentry_alloc_end(void); ++ ++ if (current->task_bc.dentry_alloc) ++ __ub_dentry_alloc_end(); ++} ++ ++static inline int ub_dentry_charge(struct dentry *d) ++{ ++ extern int __ub_dentry_charge(struct dentry *); ++ ++ if (!ub_dentry_on) ++ return 0; ++ return __ub_dentry_charge(d); ++} ++ ++static inline void ub_dentry_charge_nofail(struct dentry *d) ++{ ++ extern void __ub_dentry_charge_nofail(struct dentry *); ++ ++ if (!ub_dentry_on) ++ return; ++ __ub_dentry_charge_nofail(d); ++} ++ ++static inline void ub_dentry_uncharge_locked(struct dentry *d) ++{ ++ extern void __ub_dentry_uncharge(struct dentry *); ++ ++ if (!ub_dentry_on) ++ return; ++ __ub_dentry_uncharge(d); ++} ++ ++static inline void ub_dentry_uncharge(struct dentry *d) ++{ ++ extern void __ub_dentry_uncharge(struct dentry *); ++ ++ if (!ub_dentry_on) ++ return; ++ spin_lock(&dcache_lock); ++ __ub_dentry_uncharge(d); ++ spin_unlock(&dcache_lock); ++} ++ ++void uncharge_dcache(struct user_beancounter *ub, unsigned long size); ++#else /* CONFIG_BEANCOUNTERS */ ++ ++static inline int ub_dentry_alloc(struct dentry *d) { return 0; } ++static inline void ub_dentry_alloc_start(void) { } ++static inline void ub_dentry_alloc_end(void) { } ++static inline int ub_dentry_charge(struct dentry *d) { return 0; } ++static inline void ub_dentry_charge_nofail(struct dentry *d) { } ++static inline void ub_dentry_uncharge_locked(struct dentry *d) { } ++static inline void ub_dentry_uncharge(struct dentry *d) { } ++static inline void uncharge_dcache(struct user_beancounter *ub, unsigned long size) { } ++ ++#endif /* CONFIG_BEANCOUNTERS */ ++ ++#endif /* __dcache_op.h_ */ +diff --git a/include/bc/debug.h b/include/bc/debug.h +new file mode 100644 +index 0000000..7b1feb6 +--- 
/dev/null ++++ b/include/bc/debug.h +@@ -0,0 +1,109 @@ ++/* ++ * include/bc/debug.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __BC_DEBUG_H_ ++#define __BC_DEBUG_H_ ++ ++/* ++ * general debugging ++ */ ++ ++#define UBD_ALLOC 0x1 ++#define UBD_CHARGE 0x2 ++#define UBD_LIMIT 0x4 ++#define UBD_TRACE 0x8 ++ ++/* ++ * ub_net debugging ++ */ ++ ++#define UBD_NET_SOCKET 0x10 ++#define UBD_NET_SLEEP 0x20 ++#define UBD_NET_SEND 0x40 ++#define UBD_NET_RECV 0x80 ++ ++/* ++ * Main routines ++ */ ++ ++#define UB_DEBUG (0) ++#define DEBUG_RESOURCE (0ULL) ++ ++#define ub_dbg_cond(__cond, __str, args...) \ ++ do { \ ++ if ((__cond) != 0) \ ++ printk(__str, ##args); \ ++ } while(0) ++ ++#define ub_debug(__section, __str, args...) \ ++ ub_dbg_cond(UB_DEBUG & (__section), __str, ##args) ++ ++#define ub_debug_resource(__resource, __str, args...) \ ++ ub_dbg_cond((UB_DEBUG & UBD_CHARGE) && \ ++ (DEBUG_RESOURCE & (1 << (__resource))), \ ++ __str, ##args) ++ ++#if UB_DEBUG & UBD_TRACE ++#define ub_debug_trace(__cond, __b, __r) \ ++ do { \ ++ static struct ub_rate_info ri = { __b, __r }; \ ++ if ((__cond) != 0 && ub_ratelimit(&ri)) \ ++ dump_stack(); \ ++ } while(0) ++#else ++#define ub_debug_trace(__cond, __burst, __rate) ++#endif ++ ++#ifdef CONFIG_BC_DEBUG_KMEM ++#include ++ ++struct user_beancounter; ++struct ub_cache_counter { ++ struct list_head ulist; ++ struct ub_cache_counter *next; ++ struct user_beancounter *ub; ++ struct kmem_cache *cachep; ++ unsigned long counter; ++}; ++ ++extern spinlock_t cc_lock; ++extern void init_cache_counters(void); ++extern void ub_free_counters(struct user_beancounter *); ++extern void ub_kmemcache_free(struct kmem_cache *cachep); ++ ++struct vm_struct; ++#define inc_vmalloc_charged(vm, flags) do { \ ++ if (flags & __GFP_UBC) \ ++ ub_percpu_add(get_exec_ub(), vmalloc_charged, \ ++ vm->nr_pages); \ ++ } while (0) ++#define dec_vmalloc_charged(vm) do { \ ++ struct user_beancounter *ub; \ ++ ub = page_ub(vm->pages[0]); \ ++ if (ub != NULL) \ ++ ub_percpu_sub(ub, vmalloc_charged, \ ++ vm->nr_pages); \ ++ } while (0) ++ ++#define inc_pbc_count(ub) ub_percpu_inc(ub, pbcs) ++#define dec_pbc_count(ub) ub_percpu_dec(ub, pbcs) ++#else ++#define init_cache_counters() do { } while (0) ++#define inc_vmalloc_charged(vm, f) do { } while (0) ++#define dec_vmalloc_charged(vm) do { } while (0) ++ ++#define inc_pbc_count(ub) do { } while (0) ++#define dec_pbc_count(ub) do { } while (0) ++ ++#define ub_free_counters(ub) do { } while (0) ++#define ub_kmemcache_free(cachep) do { } while (0) ++#endif ++ ++#endif +diff --git a/include/bc/decl.h b/include/bc/decl.h +new file mode 100644 +index 0000000..6dd4cb9 +--- /dev/null ++++ b/include/bc/decl.h +@@ -0,0 +1,41 @@ ++/* ++ * include/bc/decl.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
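The ub_debug machinery above is gated by the compile-time UB_DEBUG mask: a debug class whose bit is clear leaves only a constant-false test that the compiler folds away, so disabled classes cost nothing at run time. A compilable sketch of the same mask-gated logging; the mask and section values are illustrative, and the GNU ## variadic style matches what the patch itself uses:

    #include <stdio.h>

    #define DBG_ALLOC  0x1
    #define DBG_CHARGE 0x2
    #define DEBUG_MASK (DBG_CHARGE)      /* which classes are compiled in */

    #define dbg(section, fmt, ...) \
        do { \
            if (DEBUG_MASK & (section)) \
                printf(fmt, ##__VA_ARGS__); \
        } while (0)

    int main(void)
    {
        dbg(DBG_ALLOC,  "alloc: %d\n", 1);  /* constant-folded away: bit clear */
        dbg(DBG_CHARGE, "charge: %d\n", 2); /* printed */
        return 0;
    }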
++ * ++ */ ++ ++#ifndef __BC_DECL_H_ ++#define __BC_DECL_H_ ++ ++#ifdef __KERNEL__ ++ ++/* ++ * Naming convension: ++ * ub__ ++ */ ++ ++#ifdef CONFIG_BEANCOUNTERS ++ ++#define UB_DECLARE_FUNC(ret_type, decl) extern ret_type decl; ++#define UB_DECLARE_VOID_FUNC(decl) extern void decl; ++ ++#else /* CONFIG_BEANCOUNTERS */ ++ ++#define UB_DECLARE_FUNC(ret_type, decl) \ ++ static inline ret_type decl \ ++ { \ ++ return (ret_type)0; \ ++ } ++#define UB_DECLARE_VOID_FUNC(decl) \ ++ static inline void decl \ ++ { \ ++ } ++ ++#endif /* CONFIG_BEANCOUNTERS */ ++#endif ++ ++#endif +diff --git a/include/bc/hash.h b/include/bc/hash.h +new file mode 100644 +index 0000000..b2afb69 +--- /dev/null ++++ b/include/bc/hash.h +@@ -0,0 +1,36 @@ ++/* ++ * include/bc/hash.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef _LINUX_UBHASH_H ++#define _LINUX_UBHASH_H ++ ++#ifdef __KERNEL__ ++ ++#define UB_HASH_SIZE 256 ++ ++extern struct hlist_head ub_hash[]; ++extern spinlock_t ub_hash_lock; ++extern struct list_head ub_list_head; ++ ++#ifdef CONFIG_BEANCOUNTERS ++ ++/* ++ * Iterate over beancounters ++ * @__ubp - beancounter ptr ++ * Can use break :) ++ */ ++#define for_each_beancounter(__ubp) \ ++ list_for_each_entry_rcu(__ubp, &ub_list_head, ub_list) \ ++ ++#define bc_hash_entry(ptr) hlist_entry(ptr, struct user_beancounter, ub_hash) ++ ++#endif /* CONFIG_BEANCOUNTERS */ ++#endif /* __KERNEL__ */ ++#endif /* _LINUX_UBHASH_H */ +diff --git a/include/bc/io_acct.h b/include/bc/io_acct.h +new file mode 100644 +index 0000000..d84bf5a +--- /dev/null ++++ b/include/bc/io_acct.h +@@ -0,0 +1,113 @@ ++/* ++ * include/bc/io_acct.h ++ * ++ * Copyright (C) 2006 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * Pavel Emelianov ++ * ++ */ ++ ++#ifndef __UB_IO_ACCT_H_ ++#define __UB_IO_ACCT_H_ ++ ++#ifdef CONFIG_BC_IO_ACCOUNTING ++#include ++#include ++ ++#define page_iopb(page) ({ \ ++ struct page_beancounter *pb; \ ++ pb = page_pbc(page); \ ++ rmb(); \ ++ pb; \ ++ }) ++ ++/* ++ * IO ub is required in task context only, so if exec_ub is set ++ * to NULL this means that uses doesn't need to charge some ++ * resources. nevertheless IO activity must be accounted, so we ++ * account it to current's task beancounter. 
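UB_DECLARE_FUNC and UB_DECLARE_VOID_FUNC above let every beancounter entry point be declared exactly once: with CONFIG_BEANCOUNTERS the macro emits an extern prototype, without it an inline stub returning zero, so call sites need no #ifdefs. A compilable sketch of the trick, where FEATURE stands in for the config option:

    #include <stdio.h>

    #define FEATURE 0   /* 0: stubbed out, as with CONFIG_BEANCOUNTERS=n */

    #if FEATURE
    #define DECLARE_FUNC(ret, decl) extern ret decl;
    #else
    #define DECLARE_FUNC(ret, decl) static inline ret decl { return (ret)0; }
    #endif

    DECLARE_FUNC(int, demo_charge(int resource, unsigned long val))

    int main(void)
    {
        /* With FEATURE off this is an inline no-op that returns 0. */
        printf("%d\n", demo_charge(1, 4096));
        return 0;
    }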
++ */ ++ ++static inline struct user_beancounter *get_io_ub(void) ++{ ++ struct user_beancounter *ub; ++ ++ ub = get_exec_ub(); ++ if (unlikely(ub == NULL)) ++ ub = get_task_ub(current); ++ ++ return top_beancounter(ub); ++} ++ ++extern struct page_beancounter **page_pblist(struct page *); ++ ++extern void ub_io_save_context(struct page *, size_t); ++extern void ub_io_release_context(struct page *pg, size_t size); ++ ++#define PAGE_IO_MARK (0x1UL) ++ ++static inline struct page_beancounter *iopb_to_pb(struct page_beancounter *pb) ++{ ++ if (!((unsigned long)pb & PAGE_IO_MARK)) ++ return NULL; ++ ++ return (struct page_beancounter *)((unsigned long)pb & ~PAGE_IO_MARK); ++} ++ ++static inline void ub_io_account_read(size_t bytes) ++{ ++ ub_percpu_add(get_io_ub(), bytes_read, bytes); ++} ++ ++static inline void ub_io_account_write(size_t bytes) ++{ ++ ub_percpu_add(get_io_ub(), bytes_wrote, bytes); ++} ++ ++static inline void ub_io_account_dirty(struct page *page, size_t bytes) ++{ ++ ub_io_save_context(page, bytes); ++} ++ ++static inline void ub_io_account_write_cancelled(size_t bytes) ++{ ++ ub_percpu_add(get_io_ub(), bytes_cancelled, bytes); ++} ++ ++void ub_init_io(struct kmem_cache *); ++#else /* BC_IO_ACCOUNTING */ ++#define page_iopb(page) (NULL) ++#define page_pblist(page) (&page_pbc(page)) ++ ++static inline void ub_io_release_context(struct page *pg, size_t bytes) ++{ ++} ++ ++static inline void ub_io_account_dirty(struct page *p, size_t bytes) ++{ ++} ++ ++static inline void ub_io_account_read(size_t bytes) ++{ ++} ++ ++static inline void ub_io_account_write(size_t bytes) ++{ ++} ++ ++static inline void ub_io_account_write_cancelled(size_t bytes) ++{ ++} ++ ++static inline void ub_init_io(struct kmem_cache *pb_cachep) { }; ++#endif ++ ++#ifdef CONFIG_BC_DEBUG_IO ++extern void ub_io_release_debug(struct page *pg); ++#else ++#define ub_io_release_debug(pg) do { } while (0) ++#endif ++#endif +diff --git a/include/bc/io_prio.h b/include/bc/io_prio.h +new file mode 100644 +index 0000000..8c1d1e3 +--- /dev/null ++++ b/include/bc/io_prio.h +@@ -0,0 +1,82 @@ ++/* ++ * include/bc/io_prio.h ++ * ++ * Copyright (C) 2007 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
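PAGE_IO_MARK above tags a struct page_beancounter pointer by setting its low bit, which is free because the structures are at least 2-byte aligned; iopb_to_pb returns NULL unless the bit is set, then strips it to recover the real pointer. The same low-bit tagging as a self-contained sketch (names are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    #define IO_MARK 0x1UL

    struct pb_demo { int dummy; };

    static void *mark(struct pb_demo *pb)
    {
        /* works because pb is at least 2-byte aligned: bit 0 is spare */
        return (void *)((uintptr_t)pb | IO_MARK);
    }

    static struct pb_demo *unmark(void *tagged)
    {
        if (!((uintptr_t)tagged & IO_MARK))
            return NULL;            /* not an IO-marked pointer */
        return (struct pb_demo *)((uintptr_t)tagged & ~IO_MARK);
    }

    int main(void)
    {
        struct pb_demo pb;
        void *t = mark(&pb);
        printf("%d\n", unmark(t) == &pb); /* 1 */
        return 0;
    }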
++ * ++ * Vasily Tarasov ++ * ++ */ ++ ++#ifndef _UB_IO_PRIO_H ++#define _UB_IO_PRIO_H ++ ++#include ++#include ++#include ++ ++#define UB_IOPRIO_MIN 0 ++#define UB_IOPRIO_MAX IOPRIO_BE_NR ++#define UB_IOPRIO_BASE 4 ++ ++struct ub_iopriv { ++ struct list_head cfq_bc_head; ++ rwlock_t cfq_bc_list_lock; ++ ++ unsigned int ioprio; ++}; ++ ++struct cfq_data; ++struct cfq_queue; ++ ++#ifdef CONFIG_BC_IO_SCHED ++extern void bc_init_ioprio(struct ub_iopriv *); ++extern void bc_fini_ioprio(struct ub_iopriv *); ++extern struct cfq_bc_data * bc_find_cfq_bc(struct ub_iopriv *, ++ struct cfq_data *); ++extern struct cfq_bc_data * bc_findcreate_cfq_bc(struct ub_iopriv *, ++ struct cfq_data *, gfp_t gfp_mask); ++extern void bc_cfq_exit_queue(struct cfq_data *); ++extern int bc_expired(struct cfq_data *); ++extern void bc_schedule_active(struct cfq_data *); ++extern void bc_inc_rqnum(struct cfq_queue *); ++extern void bc_dec_rqnum(struct cfq_queue *); ++extern unsigned long bc_set_ioprio(int, int); ++extern struct cfq_bc_data * ++__find_cfq_bc(struct ub_iopriv *iopriv, struct cfq_data *cfqd); ++extern struct user_beancounter *bc_io_switch_context(struct page *); ++extern void bc_io_restore_context(struct user_beancounter *); ++#else ++#include ++static inline void bc_init_ioprio(struct ub_iopriv *iopriv) { ; } ++static inline void bc_fini_ioprio(struct ub_iopriv *iopriv) { ; } ++static inline struct cfq_bc_data * ++bc_findcreate_cfq_bc(struct ub_iopriv *iopriv, ++ struct cfq_data *cfqd, gfp_t mask) ++{ ++ return &cfqd->cfq_bc; ++} ++static inline void bc_cfq_exit_queue(struct cfq_data *cfqd) { ; } ++static inline int bc_expired(struct cfq_data *cfqd) { return 0; } ++static inline void bc_schedule_active(struct cfq_data *cfqd) ++{ ++ cfqd->active_cfq_bc = &cfqd->cfq_bc; ++} ++static inline void bc_inc_rqnum(struct cfq_queue *cfqq) { ; } ++static inline void bc_dec_rqnum(struct cfq_queue *cfqq) { ; } ++static inline unsigned long bc_set_ioprio(int ubid, int ioprio) ++{ ++ return -EINVAL; ++} ++static inline struct cfq_bc_data * ++__find_cfq_bc(struct ub_iopriv *iopriv, struct cfq_data *cfqd) ++{ ++ return &cfqd->cfq_bc; ++} ++static inline struct user_beancounter * ++bc_io_switch_context(struct page *page) { return NULL; } ++static inline void bc_io_restore_context(struct user_beancounter *ub) { ; } ++#endif /* CONFIG_BC_IO_SCHED */ ++#endif /* _UB_IO_PRIO_H */ +diff --git a/include/bc/kmem.h b/include/bc/kmem.h +new file mode 100644 +index 0000000..c0ea26a +--- /dev/null ++++ b/include/bc/kmem.h +@@ -0,0 +1,69 @@ ++/* ++ * include/bc/kmem.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
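Note the shape of the !CONFIG_BC_IO_SCHED stubs above: lookups such as __find_cfq_bc and bc_findcreate_cfq_bc all return the single cfq_bc embedded in struct cfq_data, so the CFQ code can always dereference a valid accounting slot without branching on the config option. A minimal sketch of that embed-one-instance fallback, under illustrative names:

    #include <stdio.h>

    struct bc_data { unsigned long rqnum; };

    struct queue_demo {
        /* feature off: exactly one accounting slot lives inside the queue */
        struct bc_data embedded_bc;
    };

    /* Stub lookup: ignores who is asking and returns the embedded slot. */
    static struct bc_data *find_bc(struct queue_demo *q)
    {
        return &q->embedded_bc;
    }

    int main(void)
    {
        struct queue_demo q = { { 0 } };
        find_bc(&q)->rqnum++;
        printf("%lu\n", q.embedded_bc.rqnum); /* 1 */
        return 0;
    }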
++ * ++ */ ++ ++#ifndef __UB_SLAB_H_ ++#define __UB_SLAB_H_ ++ ++#include ++#include ++ ++/* ++ * UB_KMEMSIZE accounting ++ */ ++ ++#ifdef CONFIG_BC_DEBUG_ITEMS ++#define CHARGE_ORDER(__o) (1 << (__o)) ++#define CHARGE_SIZE(__s) 1 ++#else ++#define CHARGE_ORDER(__o) (PAGE_SIZE << (__o)) ++#define CHARGE_SIZE(__s) (__s) ++#endif ++ ++#ifdef CONFIG_BEANCOUNTERS ++#define page_ub(__page) ((__page)->bc.page_ub) ++#else ++#define page_ub(__page) NULL ++#endif ++ ++struct mm_struct; ++struct page; ++struct kmem_cache; ++ ++UB_DECLARE_FUNC(struct user_beancounter *, vmalloc_ub(void *obj)) ++UB_DECLARE_FUNC(struct user_beancounter *, mem_ub(void *obj)) ++ ++UB_DECLARE_FUNC(int, ub_kmemsize_charge(struct user_beancounter *ub, ++ unsigned long size, enum ub_severity strict)) ++UB_DECLARE_VOID_FUNC(ub_kmemsize_uncharge(struct user_beancounter *ub, ++ unsigned long size)) ++ ++UB_DECLARE_FUNC(int, ub_page_charge(struct page *page, int order, gfp_t mask)) ++UB_DECLARE_VOID_FUNC(ub_page_uncharge(struct page *page, int order)) ++UB_DECLARE_FUNC(int, ub_slab_charge(struct kmem_cache *cachep, ++ void *objp, gfp_t flags)) ++UB_DECLARE_VOID_FUNC(ub_slab_uncharge(struct kmem_cache *cachep, void *obj)) ++ ++#ifdef CONFIG_BEANCOUNTERS ++static inline int should_charge(struct kmem_cache *cachep, gfp_t flags) ++{ ++ if (!(cachep->flags & SLAB_UBC)) ++ return 0; ++ if ((cachep->flags & SLAB_NO_CHARGE) && !(flags & __GFP_UBC)) ++ return 0; ++ return 1; ++} ++ ++#define should_uncharge(cachep) should_charge(cachep, __GFP_UBC) ++#else ++#define should_charge(cache, f) 0 ++#define should_uncharge(cache) 0 ++#endif ++ ++#endif /* __UB_SLAB_H_ */ +diff --git a/include/bc/misc.h b/include/bc/misc.h +new file mode 100644 +index 0000000..84082b2 +--- /dev/null ++++ b/include/bc/misc.h +@@ -0,0 +1,55 @@ ++/* ++ * include/bc/misc.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __BC_MISC_H_ ++#define __BC_MISC_H_ ++ ++#include ++ ++struct tty_struct; ++struct file; ++struct file_lock; ++struct sigqueue; ++ ++UB_DECLARE_FUNC(int, ub_file_charge(struct file *f)) ++UB_DECLARE_VOID_FUNC(ub_file_uncharge(struct file *f)) ++UB_DECLARE_FUNC(int, ub_flock_charge(struct file_lock *fl, int hard)) ++UB_DECLARE_VOID_FUNC(ub_flock_uncharge(struct file_lock *fl)) ++UB_DECLARE_FUNC(int, ub_siginfo_charge(struct sigqueue *q, ++ struct user_beancounter *ub)) ++UB_DECLARE_VOID_FUNC(ub_siginfo_uncharge(struct sigqueue *q)) ++UB_DECLARE_FUNC(int, ub_task_charge(struct task_struct *parent, ++ struct task_struct *task)) ++UB_DECLARE_VOID_FUNC(ub_task_uncharge(struct task_struct *task)) ++UB_DECLARE_VOID_FUNC(ub_task_put(struct task_struct *task)) ++UB_DECLARE_FUNC(int, ub_pty_charge(struct tty_struct *tty)) ++UB_DECLARE_VOID_FUNC(ub_pty_uncharge(struct tty_struct *tty)) ++ ++#ifdef CONFIG_BEANCOUNTERS ++#define set_flock_charged(fl) do { (fl)->fl_charged = 1; } while (0) ++#define unset_flock_charged(fl) do { \ ++ WARN_ON((fl)->fl_charged == 0); \ ++ (fl)->fl_charged = 0; \ ++ } while (0) ++#define set_mm_ub(mm, tsk) do { \ ++ (mm)->mm_ub = get_beancounter(tsk != current ? 
\ ++ tsk->task_bc.task_ub : get_exec_ub()); \ ++ } while (0) ++#define put_mm_ub(mm) do { \ ++ put_beancounter((mm)->mm_ub); \ ++ (mm)->mm_ub = NULL; \ ++ } while (0) ++#else ++#define set_flock_charged(fl) do { } while (0) ++#define unset_flock_charged(fl) do { } while (0) ++#define set_mm_ub(mm, tsk) do { } while (0) ++#define put_mm_ub(mm) do { } while (0) ++#endif ++#endif +diff --git a/include/bc/net.h b/include/bc/net.h +new file mode 100644 +index 0000000..5330a88 +--- /dev/null ++++ b/include/bc/net.h +@@ -0,0 +1,215 @@ ++/* ++ * include/bc/net.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __BC_NET_H_ ++#define __BC_NET_H_ ++ ++/* ++ * UB_NUMXXXSOCK, UB_XXXBUF accounting ++ */ ++ ++#include ++#include ++#include ++ ++#define bid2sid(__bufid) \ ++ ((__bufid) == UB_TCPSNDBUF ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK) ++ ++#define SOCK_MIN_UBCSPACE ((int)((2048 - sizeof(struct skb_shared_info)) & \ ++ ~(SMP_CACHE_BYTES-1))) ++#define SOCK_MIN_UBCSPACE_CH skb_charge_size(SOCK_MIN_UBCSPACE) ++ ++static inline int ub_skb_alloc_bc(struct sk_buff *skb, gfp_t gfp_mask) ++{ ++#ifdef CONFIG_BEANCOUNTERS ++ memset(skb_bc(skb), 0, sizeof(struct skb_beancounter)); ++#endif ++ return 0; ++} ++ ++static inline void ub_skb_free_bc(struct sk_buff *skb) ++{ ++} ++ ++#define IS_TCP_SOCK(__family, __type) \ ++ (((__family) == PF_INET || (__family) == PF_INET6) && (__type) == SOCK_STREAM) ++ ++/* number of sockets */ ++UB_DECLARE_FUNC(int, ub_sock_charge(struct sock *sk, int family, int type)) ++UB_DECLARE_FUNC(int, ub_tcp_sock_charge(struct sock *sk)) ++UB_DECLARE_FUNC(int, ub_other_sock_charge(struct sock *sk)) ++UB_DECLARE_VOID_FUNC(ub_sock_uncharge(struct sock *sk)) ++ ++/* management of queue for send space */ ++UB_DECLARE_FUNC(long, ub_sock_wait_for_space(struct sock *sk, long timeo, ++ unsigned long size)) ++UB_DECLARE_VOID_FUNC(ub_sock_snd_queue_add(struct sock *sk, int resource, ++ unsigned long size)) ++UB_DECLARE_VOID_FUNC(ub_sock_sndqueuedel(struct sock *sk)) ++ ++/* send space */ ++UB_DECLARE_FUNC(int, ub_sock_make_wreserv(struct sock *sk, int bufid, ++ unsigned long size)) ++UB_DECLARE_FUNC(int, ub_sock_get_wreserv(struct sock *sk, int bufid, ++ unsigned long size)) ++UB_DECLARE_VOID_FUNC(ub_sock_ret_wreserv(struct sock *sk, int bufid, ++ unsigned long size, unsigned long ressize)) ++UB_DECLARE_FUNC(int, ub_sock_tcp_chargesend(struct sock *sk, ++ struct sk_buff *skb, enum ub_severity strict)) ++UB_DECLARE_VOID_FUNC(ub_sock_tcp_unchargesend(struct sock *sk, ++ unsigned long size)) ++UB_DECLARE_FUNC(int, ub_sock_tcp_chargepage(struct sock *sk)) ++UB_DECLARE_VOID_FUNC(ub_sock_tcp_detachpage(struct sock *sk)) ++ ++UB_DECLARE_FUNC(int, ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk)) ++ ++/* receive space */ ++UB_DECLARE_FUNC(int, ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb)) ++UB_DECLARE_FUNC(int, ub_sock_tcp_chargerecv(struct sock *sk, ++ struct sk_buff *skb, enum ub_severity strict)) ++ ++/* skb destructor */ ++UB_DECLARE_VOID_FUNC(ub_skb_uncharge(struct sk_buff *skb)) ++ ++static inline int ub_sock_makewres_other(struct sock *sk, unsigned long size) ++{ ++ return ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size); ++} ++ ++static inline int ub_sock_makewres_tcp(struct sock *sk, unsigned long size) ++{ ++ return ub_sock_make_wreserv(sk, UB_TCPSNDBUF, size); ++} ++ ++UB_DECLARE_FUNC(int, ub_sock_getwres_other(struct sock *sk, ++ unsigned long size)) ++ ++static inline int 
ub_sock_getwres_tcp(struct sock *sk, unsigned long size) ++{ ++ return ub_sock_get_wreserv(sk, UB_TCPSNDBUF, size); ++} ++ ++UB_DECLARE_VOID_FUNC(ub_sock_retwres_other(struct sock *sk, ++ unsigned long size, unsigned long ressize)) ++ ++static inline void ub_sock_retwres_tcp(struct sock *sk, unsigned long size, ++ unsigned long ressize) ++{ ++ ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, ressize); ++} ++ ++static inline void ub_sock_sndqueueadd_other(struct sock *sk, unsigned long sz) ++{ ++ ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, sz); ++} ++ ++static inline void ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz) ++{ ++ ub_sock_snd_queue_add(sk, UB_TCPSNDBUF, sz); ++} ++ ++static inline int ub_tcpsndbuf_charge(struct sock *sk, ++ struct sk_buff *skb) ++{ ++ return ub_sock_tcp_chargesend(sk, skb, UB_HARD); ++} ++ ++static inline int ub_tcpsndbuf_charge_forced(struct sock *sk, ++ struct sk_buff *skb) ++{ ++ return ub_sock_tcp_chargesend(sk, skb, UB_FORCE); ++} ++ ++static inline int ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb) ++{ ++ return ub_sock_tcp_chargerecv(sk, skb, UB_SOFT); ++} ++ ++static inline int ub_tcprcvbuf_charge_forced(struct sock *sk, ++ struct sk_buff *skb) ++{ ++ return ub_sock_tcp_chargerecv(sk, skb, UB_FORCE); ++} ++ ++/* Charge size */ ++static inline unsigned long skb_charge_datalen(unsigned long chargesize) ++{ ++#ifdef CONFIG_BEANCOUNTERS ++ unsigned long slabsize; ++ ++ chargesize -= sizeof(struct sk_buff); ++ slabsize = 64; ++ do { ++ slabsize <<= 1; ++ } while (slabsize <= chargesize); ++ ++ slabsize >>= 1; ++ return (slabsize - sizeof(struct skb_shared_info)) & ++ ~(SMP_CACHE_BYTES-1); ++#else ++ return 0; ++#endif ++} ++ ++static inline unsigned long skb_charge_size_gen(unsigned long size) ++{ ++#ifdef CONFIG_BEANCOUNTERS ++ unsigned int slabsize; ++ ++ size = SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info); ++ slabsize = 32; /* min size is 64 because of skb_shared_info */ ++ do { ++ slabsize <<= 1; ++ } while (slabsize < size); ++ ++ return slabsize + sizeof(struct sk_buff); ++#else ++ return 0; ++#endif ++ ++} ++ ++static inline unsigned long skb_charge_size_const(unsigned long size) ++{ ++#ifdef CONFIG_BEANCOUNTERS ++ unsigned int ret; ++ if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 64) ++ ret = 64 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 128) ++ ret = 128 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 256) ++ ret = 256 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 512) ++ ret = 512 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 1024) ++ ret = 1024 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 2048) ++ ret = 2048 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 4096) ++ ret = 4096 + sizeof(struct sk_buff); ++ else ++ ret = skb_charge_size_gen(size); ++ return ret; ++#else ++ return 0; ++#endif ++} ++ ++ ++#define skb_charge_size(__size) \ ++ (__builtin_constant_p(__size) ? 
\ ++ skb_charge_size_const(__size) : \ ++ skb_charge_size_gen(__size)) ++ ++UB_DECLARE_FUNC(int, skb_charge_fullsize(struct sk_buff *skb)) ++UB_DECLARE_VOID_FUNC(ub_skb_set_charge(struct sk_buff *skb, ++ struct sock *sk, unsigned long size, int res)) ++ ++#endif +diff --git a/include/bc/oom_kill.h b/include/bc/oom_kill.h +new file mode 100644 +index 0000000..c07608f +--- /dev/null ++++ b/include/bc/oom_kill.h +@@ -0,0 +1,26 @@ ++#include ++#include ++ ++UB_DECLARE_FUNC(int, ub_oom_lock(void)) ++UB_DECLARE_FUNC(struct user_beancounter *, ub_oom_select_worst(void)) ++UB_DECLARE_VOID_FUNC(ub_oom_mm_killed(struct user_beancounter *ub)) ++UB_DECLARE_VOID_FUNC(ub_oom_unlock(void)) ++UB_DECLARE_VOID_FUNC(ub_out_of_memory(struct user_beancounter *ub)) ++UB_DECLARE_VOID_FUNC(ub_oom_task_dead(struct task_struct *tsk)) ++UB_DECLARE_FUNC(int, ub_oom_task_skip(struct user_beancounter *ub, ++ struct task_struct *tsk)) ++ ++#ifdef CONFIG_BEANCOUNTERS ++extern int oom_generation; ++extern int oom_kill_counter; ++#define ub_oom_start() do { \ ++ current->task_bc.oom_generation = oom_generation; \ ++ } while (0) ++#define ub_oom_task_killed(p) do { \ ++ oom_kill_counter++; \ ++ wake_up_process(p); \ ++ } while (0) ++#else ++#define ub_oom_start() do { } while (0) ++#define ub_oom_task_killed(p) do { } while (0) ++#endif +diff --git a/include/bc/proc.h b/include/bc/proc.h +new file mode 100644 +index 0000000..f244523 +--- /dev/null ++++ b/include/bc/proc.h +@@ -0,0 +1,40 @@ ++/* ++ * include/bc/proc.h ++ * ++ * Copyright (C) 2006 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __UB_PROC_H_ ++#define __UB_PROC_H_ ++ ++#include ++ ++struct bc_proc_entry { ++ char *name; ++ union { ++ int (*show)(struct seq_file *, void *); ++ struct file_operations *fops; ++ } u; ++ struct bc_proc_entry *next; ++ int cookie; ++}; ++ ++struct user_beancounter; ++ ++void bc_register_proc_entry(struct bc_proc_entry *); ++void bc_register_proc_root_entry(struct bc_proc_entry *); ++ ++static inline struct user_beancounter *seq_beancounter(struct seq_file *f) ++{ ++ return (struct user_beancounter *)(f->private); ++} ++ ++extern const char *bc_proc_lu_fmt; ++extern const char *bc_proc_lu_lfmt; ++extern const char *bc_proc_llu_fmt; ++extern const char *bc_proc_lu_lu_fmt; ++#endif +diff --git a/include/bc/rss_pages.h b/include/bc/rss_pages.h +new file mode 100644 +index 0000000..b195961 +--- /dev/null ++++ b/include/bc/rss_pages.h +@@ -0,0 +1,57 @@ ++/* ++ * include/bc/rss_pages.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
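The skb_charge_size helpers above charge for what the slab allocator really hands out: the aligned data size plus skb_shared_info is rounded up to the next power-of-two slab (64 bytes minimum), then sizeof(struct sk_buff) is added; the _const variant merely unrolls the same table for compile-time-constant sizes. A stand-alone replica of the rounding, with illustrative stand-ins for the real struct sizes and for SKB_DATA_ALIGN:

    #include <stdio.h>

    #define SKB_OVERHEAD   192   /* stand-in for sizeof(struct sk_buff) */
    #define SHINFO_SIZE    256   /* stand-in for sizeof(struct skb_shared_info) */
    #define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

    static unsigned long charge_size(unsigned long size)
    {
        unsigned long slab = 32;   /* doubles to 64 first: the minimum slab */

        size = ALIGN_UP(size, 16) + SHINFO_SIZE;
        do {
            slab <<= 1;            /* 64, 128, 256, ... first slab >= size */
        } while (slab < size);

        return slab + SKB_OVERHEAD;
    }

    int main(void)
    {
        printf("%lu\n", charge_size(1000)); /* 2240: 1264 rounds to 2048 slab */
        return 0;
    }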
++ * ++ */ ++ ++#ifndef __RSS_PAGES_H_ ++#define __RSS_PAGES_H_ ++ ++/* ++ * Page_beancounters ++ */ ++ ++struct page; ++struct user_beancounter; ++ ++#define PB_MAGIC 0x62700001UL ++ ++struct page_beancounter { ++ unsigned long pb_magic; ++ struct page *page; ++ struct user_beancounter *ub; ++ union { ++ struct page_beancounter *next_hash; ++ struct page_beancounter *page_pb_list; ++ }; ++ union { ++ unsigned refcount; ++ unsigned io_debug; ++ }; ++ union { ++ struct list_head page_list; ++ struct list_head io_list; ++ }; ++}; ++ ++#define PB_REFCOUNT_BITS 24 ++#define PB_SHIFT_GET(c) ((c) >> PB_REFCOUNT_BITS) ++#define PB_SHIFT_INC(c) ((c) += (1 << PB_REFCOUNT_BITS)) ++#define PB_SHIFT_DEC(c) ((c) -= (1 << PB_REFCOUNT_BITS)) ++#define PB_COUNT_GET(c) ((c) & ((1 << PB_REFCOUNT_BITS) - 1)) ++#define PB_COUNT_INC(c) ((c)++) ++#define PB_COUNT_DEC(c) ((c)--) ++#define PB_REFCOUNT_MAKE(s, c) (((s) << PB_REFCOUNT_BITS) + (c)) ++ ++#define page_pbc(__page) ((__page)->bc.page_pb) ++ ++extern spinlock_t pb_lock; ++ ++struct address_space; ++extern int is_shmem_mapping(struct address_space *); ++ ++#endif +diff --git a/include/bc/sock.h b/include/bc/sock.h +new file mode 100644 +index 0000000..b314c9b +--- /dev/null ++++ b/include/bc/sock.h +@@ -0,0 +1,47 @@ ++/* ++ * include/bc/sock.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __BC_SOCK_H_ ++#define __BC_SOCK_H_ ++ ++#include ++ ++struct sock; ++struct sk_buff; ++ ++struct skb_beancounter { ++ struct user_beancounter *ub; ++ unsigned long charged:27, resource:5; ++}; ++ ++struct sock_beancounter { ++ struct user_beancounter *ub; ++ /* ++ * poll_reserv accounts space already charged for future sends. ++ * It is required to make poll agree with sendmsg. ++ * Additionally, it makes real charges (with taking bc spinlock) ++ * in the send path rarer, speeding networking up. ++ * For TCP (only): changes are protected by socket lock (not bc!) ++ * For all proto: may be read without serialization in poll. ++ */ ++ unsigned long poll_reserv; ++ unsigned long forw_space; ++ /* fields below are protected by bc spinlock */ ++ unsigned long ub_waitspc; /* space waiting for */ ++ unsigned long ub_wcharged; ++ struct list_head ub_sock_list; ++}; ++ ++#define sock_bc(__sk) (&(__sk)->sk_bc) ++#define skb_bc(__skb) (&(__skb)->skb_bc) ++#define skbc_sock(__skbc) (container_of(__skbc, struct sock, sk_bc)) ++#define sock_has_ubc(__sk) (sock_bc(__sk)->ub != NULL) ++ ++#endif +diff --git a/include/bc/sock_orphan.h b/include/bc/sock_orphan.h +new file mode 100644 +index 0000000..038d52b +--- /dev/null ++++ b/include/bc/sock_orphan.h +@@ -0,0 +1,106 @@ ++/* ++ * include/bc/sock_orphan.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
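The PB_* macros above pack two counters into one word of struct page_beancounter: the low 24 bits hold a reference count and the high bits a shift value, so both can be updated without growing the structure. The packing, exercised stand-alone:

    #include <stdio.h>

    #define REFCOUNT_BITS 24
    #define SHIFT_GET(c)  ((c) >> REFCOUNT_BITS)
    #define COUNT_GET(c)  ((c) & ((1UL << REFCOUNT_BITS) - 1))
    #define MAKE(s, c)    (((unsigned long)(s) << REFCOUNT_BITS) + (c))

    int main(void)
    {
        unsigned long c = MAKE(3, 100);

        c += 1UL << REFCOUNT_BITS;   /* like PB_SHIFT_INC */
        c++;                         /* like PB_COUNT_INC */
        printf("shift=%lu count=%lu\n", SHIFT_GET(c), COUNT_GET(c)); /* 4 101 */
        return 0;
    }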
++ * ++ */ ++ ++#ifndef __BC_SOCK_ORPHAN_H_ ++#define __BC_SOCK_ORPHAN_H_ ++ ++#include ++ ++#include "bc/beancounter.h" ++#include "bc/net.h" ++ ++ ++static inline atomic_t *__ub_get_orphan_count_ptr(struct sock *sk) ++{ ++#ifdef CONFIG_BEANCOUNTERS ++ if (sock_has_ubc(sk)) ++ return &sock_bc(sk)->ub->ub_orphan_count; ++#endif ++ return sk->sk_prot->orphan_count; ++} ++ ++static inline void ub_inc_orphan_count(struct sock *sk) ++{ ++ atomic_inc(__ub_get_orphan_count_ptr(sk)); ++} ++ ++static inline void ub_dec_orphan_count(struct sock *sk) ++{ ++ atomic_dec(__ub_get_orphan_count_ptr(sk)); ++} ++ ++static inline int ub_get_orphan_count(struct sock *sk) ++{ ++ return atomic_read(__ub_get_orphan_count_ptr(sk)); ++} ++ ++extern int __ub_too_many_orphans(struct sock *sk, int count); ++static inline int ub_too_many_orphans(struct sock *sk, int count) ++{ ++#ifdef CONFIG_BEANCOUNTERS ++ if (__ub_too_many_orphans(sk, count)) ++ return 1; ++#endif ++ return (ub_get_orphan_count(sk) > sysctl_tcp_max_orphans || ++ (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && ++ atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])); ++} ++ ++#include ++ ++struct inet_timewait_sock; ++ ++static inline void ub_timewait_mod(struct inet_timewait_sock *tw, int incdec) ++{ ++#ifdef CONFIG_BEANCOUNTERS ++ struct user_beancounter *ub; ++ ++ ub = slab_ub(tw); ++ if (ub != NULL) ++ ub->ub_tw_count += incdec; ++#endif ++} ++ ++static inline int __ub_timewait_check(struct sock *sk) ++{ ++#ifdef CONFIG_BEANCOUNTERS ++ struct user_beancounter *ub; ++ unsigned long mem_max, mem; ++ int tw_count; ++ ++ ub = sock_bc(sk)->ub; ++ if (ub == NULL) ++ return 1; ++ ++ tw_count = ub->ub_tw_count; ++ mem_max = sysctl_tcp_max_tw_kmem_fraction * ++ ((ub->ub_parms[UB_KMEMSIZE].limit >> 10) + 1); ++ mem = kmem_cache_objuse(sk->sk_prot_creator->twsk_prot->twsk_slab); ++ mem *= tw_count; ++ return tw_count < sysctl_tcp_max_tw_buckets_ub && mem < mem_max; ++#else ++ return 1; ++#endif ++} ++ ++#define ub_timewait_inc(tw, twdr) do { \ ++ if ((twdr)->ub_managed) \ ++ ub_timewait_mod(tw, 1); \ ++ } while (0) ++ ++#define ub_timewait_dec(tw, twdr) do { \ ++ if ((twdr)->ub_managed) \ ++ ub_timewait_mod(tw, -1); \ ++ } while (0) ++ ++#define ub_timewait_check(sk, twdr) ((!(twdr)->ub_managed) || \ ++ __ub_timewait_check(sk)) ++ ++#endif +diff --git a/include/bc/statd.h b/include/bc/statd.h +new file mode 100644 +index 0000000..9dafc5e +--- /dev/null ++++ b/include/bc/statd.h +@@ -0,0 +1,70 @@ ++/* ++ * include/bc/statd.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
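__ub_timewait_check above admits a new TIME-WAIT socket only while the per-beancounter bucket count stays under its sysctl and the estimated slab memory (object count times per-object size) stays under a fraction derived from the UB_KMEMSIZE limit. The same arithmetic in a compilable sketch; every constant here is an illustrative stand-in, not the kernel's value:

    #include <stdio.h>

    /* Illustrative limits: max buckets, kmem fraction, per-object slab cost. */
    #define MAX_TW_BUCKETS  2000
    #define KMEM_FRACTION   128
    #define TW_OBJ_SIZE     256UL

    static int timewait_ok(unsigned long kmem_limit, int tw_count)
    {
        unsigned long mem_max = KMEM_FRACTION * ((kmem_limit >> 10) + 1);
        unsigned long mem     = TW_OBJ_SIZE * tw_count;

        return tw_count < MAX_TW_BUCKETS && mem < mem_max;
    }

    int main(void)
    {
        /* 1MB kmem limit, 100 buckets: 25600 < 131200, so admitted */
        printf("%d\n", timewait_ok(1UL << 20, 100)); /* 1 */
        return 0;
    }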
++ * ++ */ ++ ++#ifndef __BC_STATD_H_ ++#define __BC_STATD_H_ ++ ++/* sys_ubstat commands list */ ++#define UBSTAT_READ_ONE 0x010000 ++#define UBSTAT_READ_ALL 0x020000 ++#define UBSTAT_READ_FULL 0x030000 ++#define UBSTAT_UBLIST 0x040000 ++#define UBSTAT_UBPARMNUM 0x050000 ++#define UBSTAT_GETTIME 0x060000 ++ ++#define UBSTAT_CMD(func) ((func) & 0xF0000) ++#define UBSTAT_PARMID(func) ((func) & 0x0FFFF) ++ ++#define TIME_MAX_SEC (LONG_MAX / HZ) ++#define TIME_MAX_JIF (TIME_MAX_SEC * HZ) ++ ++typedef unsigned long ubstattime_t; ++ ++typedef struct { ++ ubstattime_t start_time; ++ ubstattime_t end_time; ++ ubstattime_t cur_time; ++} ubgettime_t; ++ ++typedef struct { ++ long maxinterval; ++ int signum; ++} ubnotifrq_t; ++ ++typedef struct { ++ unsigned long maxheld; ++ unsigned long failcnt; ++} ubstatparm_t; ++ ++typedef struct { ++ unsigned long barrier; ++ unsigned long limit; ++ unsigned long held; ++ unsigned long maxheld; ++ unsigned long minheld; ++ unsigned long failcnt; ++ unsigned long __unused1; ++ unsigned long __unused2; ++} ubstatparmf_t; ++ ++typedef struct { ++ ubstattime_t start_time; ++ ubstattime_t end_time; ++ ubstatparmf_t param[0]; ++} ubstatfull_t; ++ ++#ifdef __KERNEL__ ++struct ub_stat_notify { ++ struct list_head list; ++ struct task_struct *task; ++ int signum; ++}; ++#endif ++#endif +diff --git a/include/bc/task.h b/include/bc/task.h +new file mode 100644 +index 0000000..f5a2915 +--- /dev/null ++++ b/include/bc/task.h +@@ -0,0 +1,69 @@ ++/* ++ * include/bc/task.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __BC_TASK_H_ ++#define __BC_TASK_H_ ++ ++struct user_beancounter; ++ ++ ++#ifdef CONFIG_BEANCOUNTERS ++struct task_beancounter { ++ struct user_beancounter *exec_ub; ++ struct user_beancounter *saved_ub; ++ struct user_beancounter *task_ub; ++ struct user_beancounter *fork_sub; ++ unsigned long file_precharged, file_quant, file_count; ++ unsigned long kmem_precharged; ++ char dentry_alloc, pgfault_handle; ++ void *task_fnode, *task_freserv; ++ unsigned long oom_generation; ++ unsigned long task_data[4]; ++ unsigned long pgfault_allot; ++}; ++ ++#define get_task_ub(__task) ((__task)->task_bc.task_ub) ++ ++extern struct user_beancounter ub0; ++#define get_ub0() (&ub0) ++ ++#define ub_save_context(t) do { \ ++ t->task_bc.saved_ub = t->task_bc.exec_ub; \ ++ t->task_bc.exec_ub = get_ub0(); \ ++ } while (0) ++#define ub_restore_context(t) do { \ ++ t->task_bc.exec_ub = t->task_bc.saved_ub; \ ++ } while (0) ++ ++#define get_exec_ub() (current->task_bc.exec_ub) ++#define set_exec_ub(__newub) \ ++({ \ ++ struct user_beancounter *old; \ ++ struct task_beancounter *tbc; \ ++ \ ++ tbc = ¤t->task_bc; \ ++ old = tbc->exec_ub; \ ++ tbc->exec_ub = __newub; \ ++ old; \ ++}) ++ ++void ub_init_task_bc(struct task_beancounter *); ++ ++#else /* CONFIG_BEANCOUNTERS */ ++ ++#define get_ub0() (NULL) ++#define get_exec_ub() (NULL) ++#define get_task_ub(task) (NULL) ++#define set_exec_ub(__ub) (NULL) ++#define ub_save_context(t) do { } while (0) ++#define ub_restore_context(t) do { } while (0) ++ ++#endif /* CONFIG_BEANCOUNTERS */ ++#endif /* __task.h_ */ +diff --git a/include/bc/tcp.h b/include/bc/tcp.h +new file mode 100644 +index 0000000..d2bf748 +--- /dev/null ++++ b/include/bc/tcp.h +@@ -0,0 +1,76 @@ ++/* ++ * include/bc/tcp.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
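The sys_ubstat interface above multiplexes one integer argument: bits 16 and up select the command, the low 16 bits carry the parameter id, and UBSTAT_CMD/UBSTAT_PARMID split them back out. A tiny demonstration of the encoding:

    #include <stdio.h>

    #define READ_ONE     0x010000
    #define CMD(func)    ((func) & 0xF0000)
    #define PARMID(func) ((func) & 0x0FFFF)

    int main(void)
    {
        unsigned int func = READ_ONE | 7;   /* read parameter #7 */
        printf("cmd=%#x parm=%u\n", CMD(func), PARMID(func)); /* 0x10000 7 */
        return 0;
    }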
++ * ++ */ ++ ++#ifndef __BC_TCP_H_ ++#define __BC_TCP_H_ ++ ++/* ++ * UB_NUMXXXSOCK, UB_XXXBUF accounting ++ */ ++ ++#include ++#include ++ ++static inline void ub_tcp_update_maxadvmss(struct sock *sk) ++{ ++#ifdef CONFIG_BEANCOUNTERS ++ if (!sock_has_ubc(sk)) ++ return; ++ if (sock_bc(sk)->ub->ub_maxadvmss >= tcp_sk(sk)->advmss) ++ return; ++ ++ sock_bc(sk)->ub->ub_maxadvmss = ++ skb_charge_size(MAX_HEADER + sizeof(struct iphdr) ++ + sizeof(struct tcphdr) + tcp_sk(sk)->advmss); ++#endif ++} ++ ++static inline int ub_tcp_rmem_allows_expand(struct sock *sk) ++{ ++ if (tcp_memory_pressure) ++ return 0; ++#ifdef CONFIG_BEANCOUNTERS ++ if (sock_has_ubc(sk)) { ++ struct user_beancounter *ub; ++ ++ ub = sock_bc(sk)->ub; ++ if (ub->ub_rmem_pressure == UB_RMEM_EXPAND) ++ return 1; ++ if (ub->ub_rmem_pressure == UB_RMEM_SHRINK) ++ return 0; ++ return sk->sk_rcvbuf <= ub->ub_rmem_thres; ++ } ++#endif ++ return 1; ++} ++ ++static inline int ub_tcp_memory_pressure(struct sock *sk) ++{ ++ if (tcp_memory_pressure) ++ return 1; ++#ifdef CONFIG_BEANCOUNTERS ++ if (sock_has_ubc(sk)) ++ return sock_bc(sk)->ub->ub_rmem_pressure != UB_RMEM_EXPAND; ++#endif ++ return 0; ++} ++ ++static inline int ub_tcp_shrink_rcvbuf(struct sock *sk) ++{ ++ if (tcp_memory_pressure) ++ return 1; ++#ifdef CONFIG_BEANCOUNTERS ++ if (sock_has_ubc(sk)) ++ return sock_bc(sk)->ub->ub_rmem_pressure == UB_RMEM_SHRINK; ++#endif ++ return 0; ++} ++ ++#endif +diff --git a/include/bc/vmpages.h b/include/bc/vmpages.h +new file mode 100644 +index 0000000..09642e3 +--- /dev/null ++++ b/include/bc/vmpages.h +@@ -0,0 +1,152 @@ ++/* ++ * include/bc/vmpages.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __UB_PAGES_H_ ++#define __UB_PAGES_H_ ++ ++#include ++#include ++#include ++ ++/* ++ * Check whether vma has private or copy-on-write mapping. ++ * Should match checks in ub_protected_charge(). ++ */ ++#define VM_UB_PRIVATE(__flags, __file) \ ++ ( ((__flags) & VM_WRITE) ? 
\ ++ (__file) == NULL || !((__flags) & VM_SHARED) : \ ++ 0 \ ++ ) ++ ++/* Mprotect charging result */ ++#define PRIVVM_ERROR -1 ++#define PRIVVM_NO_CHARGE 0 /* UB_DECLARE_FUNC retval with ubc off */ ++#define PRIVVM_TO_PRIVATE 1 ++#define PRIVVM_TO_SHARED 2 ++ ++UB_DECLARE_FUNC(int, ub_protected_charge(struct mm_struct *mm, ++ unsigned long size, ++ unsigned long newflags, ++ struct vm_area_struct *vma)) ++ ++UB_DECLARE_VOID_FUNC(ub_unused_privvm_add(struct mm_struct *mm, ++ struct vm_area_struct *vma, ++ unsigned long num)) ++#define ub_unused_privvm_inc(mm, vma) ub_unused_privvm_add(mm, vma, 1) ++UB_DECLARE_VOID_FUNC(ub_unused_privvm_sub(struct mm_struct *mm, ++ struct vm_area_struct *vma, ++ unsigned long num)) ++#define ub_unused_privvm_dec(mm, vma) ub_unused_privvm_sub(mm, vma, 1) ++ ++UB_DECLARE_VOID_FUNC(__ub_unused_privvm_dec(struct mm_struct *mm, ++ long sz)) ++ ++UB_DECLARE_FUNC(int, ub_memory_charge(struct mm_struct *mm, ++ unsigned long size, ++ unsigned vm_flags, ++ struct file *vm_file, ++ int strict)) ++UB_DECLARE_VOID_FUNC(ub_memory_uncharge(struct mm_struct *mm, ++ unsigned long size, ++ unsigned vm_flags, ++ struct file *vm_file)) ++ ++struct shmem_inode_info; ++UB_DECLARE_FUNC(int, ub_shmpages_charge(struct shmem_inode_info *i, ++ unsigned long sz)) ++UB_DECLARE_VOID_FUNC(ub_shmpages_uncharge(struct shmem_inode_info *i, ++ unsigned long sz)) ++UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_inc(struct shmem_inode_info *shi)) ++UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_sub(struct shmem_inode_info *shi, ++ unsigned long size)) ++#define ub_tmpfs_respages_dec(shi) ub_tmpfs_respages_sub(shi, 1) ++ ++#ifdef CONFIG_BEANCOUNTERS ++#define shmi_ub_set(shi, ub) do { \ ++ (shi)->shmi_ub = get_beancounter(ub); \ ++ } while (0) ++#define shmi_ub_put(shi) do { \ ++ put_beancounter((shi)->shmi_ub); \ ++ (shi)->shmi_ub = NULL; \ ++ } while (0) ++#else ++#define shmi_ub_set(shi, ub) do { } while (0) ++#define shmi_ub_put(shi) do { } while (0) ++#endif ++ ++UB_DECLARE_FUNC(int, ub_locked_charge(struct mm_struct *mm, ++ unsigned long size)) ++UB_DECLARE_VOID_FUNC(ub_locked_uncharge(struct mm_struct *mm, ++ unsigned long size)) ++UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi, ++ unsigned long size)) ++UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi, ++ unsigned long size)) ++ ++UB_DECLARE_FUNC(unsigned long, pages_in_vma_range(struct vm_area_struct *vma, ++ unsigned long addr, unsigned long end)) ++#define pages_in_vma(vma) (pages_in_vma_range(vma, \ ++ vma->vm_start, vma->vm_end)) ++ ++#define UB_PAGE_WEIGHT_SHIFT 24 ++#define UB_PAGE_WEIGHT (1 << UB_PAGE_WEIGHT_SHIFT) ++ ++struct page_beancounter; ++#define PBC_COPY_SAME ((struct page_beancounter *) 1) ++ ++/* Mprotect charging result */ ++#define PRIVVM_ERROR -1 ++#define PRIVVM_NO_CHARGE 0 ++#define PRIVVM_TO_PRIVATE 1 ++#define PRIVVM_TO_SHARED 2 ++ ++extern void __ub_update_physpages(struct user_beancounter *ub); ++extern void __ub_update_oomguarpages(struct user_beancounter *ub); ++extern void __ub_update_privvm(struct user_beancounter *ub); ++ ++#ifdef CONFIG_BC_RSS_ACCOUNTING ++#define PB_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) ++#define PB_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) ++#else ++#define PB_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} ++#define PB_DECLARE_VOID_FUNC(decl) static inline void decl { } ++#endif ++ ++PB_DECLARE_FUNC(int, pb_alloc(struct page_beancounter **pbc)) ++PB_DECLARE_FUNC(int, pb_alloc_list(struct page_beancounter **pbc, int 
num)) ++PB_DECLARE_FUNC(int, pb_alloc_all(struct page_beancounter **pbc)) ++PB_DECLARE_VOID_FUNC(pb_add_ref(struct page *page, ++ struct mm_struct *mm, ++ struct page_beancounter **pbc)) ++PB_DECLARE_VOID_FUNC(pb_dup_ref(struct page *page, ++ struct mm_struct *mm, ++ struct page_beancounter **pbc)) ++PB_DECLARE_VOID_FUNC(pb_free_list(struct page_beancounter **pb)) ++PB_DECLARE_VOID_FUNC(pb_free(struct page_beancounter **pb)) ++PB_DECLARE_VOID_FUNC(pb_remove_ref(struct page *page, ++ struct mm_struct *mm)) ++ ++PB_DECLARE_FUNC(struct user_beancounter *, pb_grab_page_ub(struct page *page)) ++#endif ++ ++#ifdef CONFIG_BC_SWAP_ACCOUNTING ++#define SWP_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) ++#define SWP_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) ++#else ++#define SWP_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} ++#define SWP_DECLARE_VOID_FUNC(decl) static inline void decl { } ++#endif ++ ++struct swap_info_struct; ++SWP_DECLARE_FUNC(int, ub_swap_init(struct swap_info_struct *si, pgoff_t n)) ++SWP_DECLARE_VOID_FUNC(ub_swap_fini(struct swap_info_struct *si)) ++SWP_DECLARE_VOID_FUNC(ub_swapentry_inc(struct swap_info_struct *si, pgoff_t n, ++ struct user_beancounter *ub)) ++SWP_DECLARE_VOID_FUNC(ub_swapentry_dec(struct swap_info_struct *si, pgoff_t n)) +diff --git a/include/linux/aio.h b/include/linux/aio.h +index b51ddd2..6fb5195 100644 +--- a/include/linux/aio.h ++++ b/include/linux/aio.h +@@ -225,4 +225,8 @@ static inline struct kiocb *list_kiocb(struct list_head *h) + extern unsigned long aio_nr; + extern unsigned long aio_max_nr; + ++void wait_for_all_aios(struct kioctx *ctx); ++extern struct kmem_cache *kioctx_cachep; ++extern void aio_kick_handler(struct work_struct *); ++ + #endif /* __LINUX__AIO_H */ +diff --git a/include/linux/capability.h b/include/linux/capability.h +index 0267384..ab16bc6 100644 +--- a/include/linux/capability.h ++++ b/include/linux/capability.h +@@ -186,12 +186,9 @@ typedef struct kernel_cap_struct { + + #define CAP_NET_BROADCAST 11 + +-/* Allow interface configuration */ + /* Allow administration of IP firewall, masquerading and accounting */ + /* Allow setting debug option on sockets */ + /* Allow modification of routing tables */ +-/* Allow setting arbitrary process / process group ownership on +- sockets */ + /* Allow binding to any address for transparent proxying */ + /* Allow setting TOS (type of service) */ + /* Allow setting promiscuous mode */ +@@ -221,6 +218,7 @@ typedef struct kernel_cap_struct { + #define CAP_SYS_MODULE 16 + + /* Allow ioperm/iopl access */ ++/* Allow O_DIRECT access */ + /* Allow sending USB messages to any device via /proc/bus/usb */ + + #define CAP_SYS_RAWIO 17 +@@ -239,24 +237,19 @@ typedef struct kernel_cap_struct { + + /* Allow configuration of the secure attention key */ + /* Allow administration of the random device */ +-/* Allow examination and configuration of disk quotas */ + /* Allow configuring the kernel's syslog (printk behaviour) */ + /* Allow setting the domainname */ + /* Allow setting the hostname */ + /* Allow calling bdflush() */ +-/* Allow mount() and umount(), setting up new smb connection */ ++/* Allow setting up new smb connection */ + /* Allow some autofs root ioctls */ + /* Allow nfsservctl */ + /* Allow VM86_REQUEST_IRQ */ + /* Allow to read/write pci config on alpha */ + /* Allow irix_prctl on mips (setstacksize) */ + /* Allow flushing all cache on m68k (sys_cacheflush) */ +-/* Allow removing semaphores */ +-/* Used instead of CAP_CHOWN to "chown" IPC message queues, 
semaphores +- and shared memory */ + /* Allow locking/unlocking of shared memory segment */ + /* Allow turning swap on/off */ +-/* Allow forged pids on socket credentials passing */ + /* Allow setting readahead and flushing buffers on block devices */ + /* Allow setting geometry in floppy driver */ + /* Allow turning DMA on/off in xd driver */ +@@ -329,6 +322,50 @@ typedef struct kernel_cap_struct { + + #define CAP_SETFCAP 31 + ++#ifdef __KERNEL__ ++/* ++ * Important note: VZ capabilities do intersect with CAP_AUDIT ++ * this is due to compatibility reasons. Nothing bad. ++ * Both VZ and Audit/SELinux caps are disabled in VPSs. ++ */ ++ ++/* Allow access to all information. In the other case some structures will be ++ hiding to ensure different Virtual Environment non-interaction on the same ++ node */ ++#define CAP_SETVEID 29 ++ ++#define CAP_VE_ADMIN 30 ++ ++#ifdef CONFIG_VE ++ ++/* Replacement for CAP_NET_ADMIN: ++ delegated rights to the Virtual environment of its network administration. ++ For now the following rights have been delegated: ++ ++ Allow setting arbitrary process / process group ownership on sockets ++ Allow interface configuration ++ */ ++#define CAP_VE_NET_ADMIN CAP_VE_ADMIN ++ ++/* Replacement for CAP_SYS_ADMIN: ++ delegated rights to the Virtual environment of its administration. ++ For now the following rights have been delegated: ++ */ ++/* Allow mount/umount/remount */ ++/* Allow examination and configuration of disk quotas */ ++/* Allow removing semaphores */ ++/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores ++ and shared memory */ ++/* Allow locking/unlocking of shared memory segment */ ++/* Allow forged pids on socket credentials passing */ ++ ++#define CAP_VE_SYS_ADMIN CAP_VE_ADMIN ++#else ++#define CAP_VE_NET_ADMIN CAP_NET_ADMIN ++#define CAP_VE_SYS_ADMIN CAP_SYS_ADMIN ++#endif ++#endif ++ + /* Override MAC access. + The base kernel enforces no MAC policy. + An LSM may enforce a MAC policy, and if it does and it chooses +@@ -390,7 +427,16 @@ typedef struct kernel_cap_struct { + #define CAP_INIT_INH_SET CAP_EMPTY_SET + + # define cap_clear(c) do { (c) = __cap_empty_set; } while (0) ++#ifndef CONFIG_VE + # define cap_set_full(c) do { (c) = __cap_full_set; } while (0) ++#else ++# define cap_set_full(c) do { \ ++ if (ve_is_super(get_exec_env())) \ ++ (c) = __cap_full_set; \ ++ else \ ++ (c) = get_exec_env()->ve_cap_bset;\ ++ } while (0) ++#endif + # define cap_set_init_eff(c) do { (c) = __cap_init_eff_set; } while (0) + + #define cap_raise(c, flag) ((c).cap[CAP_TO_INDEX(flag)] |= CAP_TO_MASK(flag)) +@@ -503,6 +549,10 @@ extern const kernel_cap_t __cap_init_eff_set; + + kernel_cap_t cap_set_effective(const kernel_cap_t pE_new); + ++#include ++ ++extern spinlock_t task_capability_lock; ++ + int capable(int cap); + int __capable(struct task_struct *t, int cap); + +diff --git a/include/linux/cfq-iosched.h b/include/linux/cfq-iosched.h +new file mode 100644 +index 0000000..b414c4a +--- /dev/null ++++ b/include/linux/cfq-iosched.h +@@ -0,0 +1,148 @@ ++#ifndef _LINUX_CFQ_IOSCHED_H ++#define _LINUX_CFQ_IOSCHED_H ++ ++#include ++#include ++#include ++ ++extern struct kmem_cache *cfq_pool; ++ ++#define CFQ_PRIO_LISTS IOPRIO_BE_NR ++ ++/* ++ * Most of our rbtree usage is for sorting with min extraction, so ++ * if we cache the leftmost node we don't have to walk down the tree ++ * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should ++ * move this into the elevator for the rq sorting as well. 
++ */ ++struct cfq_rb_root { ++ struct rb_root rb; ++ struct rb_node *left; ++}; ++#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } ++ ++/* ++ * Per (Device, UBC) queue data ++ */ ++struct cfq_bc_data { ++ /* for ub.iopriv->cfq_bc_head */ ++ struct list_head cfq_bc_list; ++ /* for cfqd->act_cfq_bc_head */ ++ struct list_head act_cfq_bc_list; ++ ++ struct cfq_data *cfqd; ++ struct ub_iopriv *ub_iopriv; ++ ++ /* ++ * rr list of queues with requests and the count of them ++ */ ++ struct cfq_rb_root service_tree; ++ ++ int cur_prio; ++ int cur_end_prio; ++ ++ unsigned long rqnum; ++ unsigned long on_dispatch; ++ ++ /* ++ * async queue for each priority case ++ */ ++ struct cfq_queue *async_cfqq[2][CFQ_PRIO_LISTS]; ++ struct cfq_queue *async_idle_cfqq; ++}; ++ ++/* ++ * Per block device queue structure ++ */ ++struct cfq_data { ++ struct request_queue *queue; ++ ++#ifndef CONFIG_BC_IO_SCHED ++ struct cfq_bc_data cfq_bc; ++#endif ++ unsigned int busy_queues; ++ ++ int rq_in_driver; ++ int sync_flight; ++ int hw_tag; ++ ++ /* ++ * idle window management ++ */ ++ struct timer_list idle_slice_timer; ++ struct work_struct unplug_work; ++ ++ struct cfq_queue *active_queue; ++ struct cfq_io_context *active_cic; ++ ++ sector_t last_position; ++ unsigned long last_end_request; ++ ++ /* ++ * tunables, see top of file ++ */ ++ unsigned int cfq_quantum; ++ unsigned int cfq_fifo_expire[2]; ++ unsigned int cfq_back_penalty; ++ unsigned int cfq_back_max; ++ unsigned int cfq_slice[2]; ++ unsigned int cfq_slice_async_rq; ++ unsigned int cfq_slice_idle; ++ ++ struct list_head cic_list; ++ ++ /* list of ub that have requests */ ++ struct list_head act_cfq_bc_head; ++ /* ub that owns a timeslice at the moment */ ++ struct cfq_bc_data *active_cfq_bc; ++ unsigned int cfq_ub_slice; ++ unsigned long slice_end; ++ int virt_mode; ++ int write_virt_mode; ++}; ++ ++/* ++ * Per process-grouping structure ++ */ ++struct cfq_queue { ++ /* reference count */ ++ atomic_t ref; ++ /* various state flags, see below */ ++ unsigned int flags; ++ /* parent cfq_data */ ++ struct cfq_data *cfqd; ++ /* service_tree member */ ++ struct rb_node rb_node; ++ /* service_tree key */ ++ unsigned long rb_key; ++ /* sorted list of pending requests */ ++ struct rb_root sort_list; ++ /* if fifo isn't expired, next request to serve */ ++ struct request *next_rq; ++ /* requests queued in sort_list */ ++ int queued[2]; ++ /* currently allocated requests */ ++ int allocated[2]; ++ /* fifo list of requests in sort_list */ ++ struct list_head fifo; ++ ++ unsigned long slice_end; ++ long slice_resid; ++ ++ /* pending metadata requests */ ++ int meta_pending; ++ /* number of requests that are on the dispatch list or inside driver */ ++ int dispatched; ++ ++ /* io prio of this group */ ++ unsigned short ioprio, org_ioprio; ++ unsigned short ioprio_class, org_ioprio_class; ++ ++ struct cfq_bc_data *cfq_bc; ++}; ++ ++static void inline cfq_init_cfq_bc(struct cfq_bc_data *cfq_bc) ++{ ++ cfq_bc->service_tree = CFQ_RB_ROOT; ++} ++#endif /* _LINUX_CFQ_IOSCHED_H */ +diff --git a/include/linux/compat.h b/include/linux/compat.h +index cf8d11c..3c778e2 100644 +--- a/include/linux/compat.h ++++ b/include/linux/compat.h +@@ -238,6 +238,7 @@ extern int put_compat_itimerspec(struct compat_itimerspec __user *dst, + asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp); + + extern int compat_printk(const char *fmt, ...); ++extern int ve_compat_printk(int dst, const char *fmt, ...); + extern void sigset_from_compat(sigset_t *set, compat_sigset_t 
*compat); + + asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, +diff --git a/include/linux/cpt_image.h b/include/linux/cpt_image.h +new file mode 100644 +index 0000000..ae331be +--- /dev/null ++++ b/include/linux/cpt_image.h +@@ -0,0 +1,1762 @@ ++/* ++ * ++ * include/linux/cpt_image.h ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __CPT_IMAGE_H_ ++#define __CPT_IMAGE_H_ 1 ++ ++#define CPT_NULL (~0ULL) ++#define CPT_NOINDEX (~0U) ++ ++/* ++ * Image file layout. ++ * ++ * - major header ++ * - sections[] ++ * ++ * Each section is: ++ * - section header ++ * - array of objects ++ * ++ * All data records are arch independent, 64 bit aligned. ++ */ ++ ++enum _cpt_object_type ++{ ++ CPT_OBJ_TASK = 0, ++ CPT_OBJ_MM, ++ CPT_OBJ_FS, ++ CPT_OBJ_FILES, ++ CPT_OBJ_FILE, ++ CPT_OBJ_SIGHAND_STRUCT, ++ CPT_OBJ_SIGNAL_STRUCT, ++ CPT_OBJ_TTY, ++ CPT_OBJ_SOCKET, ++ CPT_OBJ_SYSVSEM_UNDO, ++ CPT_OBJ_NAMESPACE, ++ CPT_OBJ_SYSV_SHM, ++ CPT_OBJ_INODE, ++ CPT_OBJ_UBC, ++ CPT_OBJ_SLM_SGREG, ++ CPT_OBJ_SLM_REGOBJ, ++ CPT_OBJ_SLM_MM, ++ CPT_OBJ_MAX, ++ /* The objects above are stored in memory while checkpointing */ ++ ++ CPT_OBJ_VMA = 1024, ++ CPT_OBJ_FILEDESC, ++ CPT_OBJ_SIGHANDLER, ++ CPT_OBJ_SIGINFO, ++ CPT_OBJ_LASTSIGINFO, ++ CPT_OBJ_SYSV_SEM, ++ CPT_OBJ_SKB, ++ CPT_OBJ_FLOCK, ++ CPT_OBJ_OPENREQ, ++ CPT_OBJ_VFSMOUNT, ++ CPT_OBJ_TRAILER, ++ CPT_OBJ_SYSVSEM_UNDO_REC, ++ CPT_OBJ_NET_DEVICE, ++ CPT_OBJ_NET_IFADDR, ++ CPT_OBJ_NET_ROUTE, ++ CPT_OBJ_NET_CONNTRACK, ++ CPT_OBJ_NET_CONNTRACK_EXPECT, ++ CPT_OBJ_AIO_CONTEXT, ++ CPT_OBJ_VEINFO, ++ CPT_OBJ_EPOLL, ++ CPT_OBJ_EPOLL_FILE, ++ CPT_OBJ_SKFILTER, ++ CPT_OBJ_SIGALTSTACK, ++ CPT_OBJ_SOCK_MCADDR, ++ CPT_OBJ_BIND_MNT, ++ CPT_OBJ_SYSVMSG, ++ CPT_OBJ_SYSVMSG_MSG, ++ ++ CPT_OBJ_X86_REGS = 4096, ++ CPT_OBJ_X86_64_REGS, ++ CPT_OBJ_PAGES, ++ CPT_OBJ_COPYPAGES, ++ CPT_OBJ_REMAPPAGES, ++ CPT_OBJ_LAZYPAGES, ++ CPT_OBJ_NAME, ++ CPT_OBJ_BITS, ++ CPT_OBJ_REF, ++ CPT_OBJ_ITERPAGES, ++ CPT_OBJ_ITERYOUNGPAGES, ++ CPT_OBJ_VSYSCALL, ++ CPT_OBJ_IA64_REGS, ++ CPT_OBJ_INOTIFY, ++ CPT_OBJ_INOTIFY_WATCH, ++ CPT_OBJ_INOTIFY_EVENT, ++ CPT_OBJ_TASK_AUX, ++ CPT_OBJ_NET_TUNTAP, ++ CPT_OBJ_NET_HWADDR, ++ CPT_OBJ_NET_VETH, ++ CPT_OBJ_NET_STATS, ++}; ++ ++#define CPT_ALIGN(n) (((n)+7)&~7) ++ ++struct cpt_major_hdr ++{ ++ __u8 cpt_signature[4]; /* Magic number */ ++ __u16 cpt_hdrlen; /* Length of this header */ ++ __u16 cpt_image_version; /* Format of this file */ ++#define CPT_VERSION_MINOR(a) ((a) & 0xf) ++#define CPT_VERSION_8 0 ++#define CPT_VERSION_9 0x100 ++#define CPT_VERSION_9_1 0x101 ++#define CPT_VERSION_9_2 0x102 ++#define CPT_VERSION_16 0x200 ++#define CPT_VERSION_18 0x300 ++#define CPT_VERSION_18_1 0x301 ++#define CPT_VERSION_20 0x400 ++#define CPT_VERSION_24 0x500 ++#define CPT_VERSION_26 0x600 ++ __u16 cpt_os_arch; /* Architecture */ ++#define CPT_OS_ARCH_I386 0 ++#define CPT_OS_ARCH_EMT64 1 ++#define CPT_OS_ARCH_IA64 2 ++ __u16 __cpt_pad1; ++ __u32 cpt_ve_features; /* VE features */ ++ __u32 cpt_ve_features2; /* VE features */ ++ __u16 cpt_pagesize; /* Page size used by OS */ ++ __u16 cpt_hz; /* HZ used by OS */ ++ __u64 cpt_start_jiffies64; /* Jiffies */ ++ __u32 cpt_start_sec; /* Seconds */ ++ __u32 cpt_start_nsec; /* Nanoseconds */ ++ __u32 cpt_cpu_caps[4]; /* CPU capabilities */ ++ __u32 cpt_kernel_config[4]; /* Kernel config */ ++ __u64 cpt_iptables_mask; /* Used netfilter modules */ ++} __attribute__ ((aligned (8))); ++ ++#define CPT_SIGNATURE0 0x79 ++#define 
CPT_SIGNATURE1 0x1c ++#define CPT_SIGNATURE2 0x01 ++#define CPT_SIGNATURE3 0x63 ++ ++/* CPU capabilities */ ++#define CPT_CPU_X86_CMOV 0 ++#define CPT_CPU_X86_FXSR 1 ++#define CPT_CPU_X86_SSE 2 ++#define CPT_CPU_X86_SSE2 3 ++#define CPT_CPU_X86_MMX 4 ++#define CPT_CPU_X86_3DNOW 5 ++#define CPT_CPU_X86_3DNOW2 6 ++#define CPT_CPU_X86_SEP 7 ++#define CPT_CPU_X86_EMT64 8 ++#define CPT_CPU_X86_IA64 9 ++#define CPT_CPU_X86_SYSCALL 10 ++#define CPT_CPU_X86_SYSCALL32 11 ++#define CPT_CPU_X86_SEP32 12 ++ ++/* Unsupported features */ ++#define CPT_EXTERNAL_PROCESS 16 ++#define CPT_NAMESPACES 17 ++#define CPT_SCHEDULER_POLICY 18 ++#define CPT_PTRACED_FROM_VE0 19 ++#define CPT_UNSUPPORTED_FSTYPE 20 ++#define CPT_BIND_MOUNT 21 ++#define CPT_UNSUPPORTED_NETDEV 22 ++#define CPT_UNSUPPORTED_MISC 23 ++ ++/* This mask is used to determine whether VE ++ has some unsupported features or not */ ++#define CPT_UNSUPPORTED_MASK 0xffff0000UL ++ ++#define CPT_KERNEL_CONFIG_PAE 0 ++ ++struct cpt_section_hdr ++{ ++ __u64 cpt_next; ++ __u32 cpt_section; ++ __u16 cpt_hdrlen; ++ __u16 cpt_align; ++} __attribute__ ((aligned (8))); ++ ++enum ++{ ++ CPT_SECT_ERROR, /* Error section, content is string */ ++ CPT_SECT_VEINFO, ++ CPT_SECT_FILES, /* Files. Content is array of file objects */ ++ CPT_SECT_TASKS, ++ CPT_SECT_MM, ++ CPT_SECT_FILES_STRUCT, ++ CPT_SECT_FS, ++ CPT_SECT_SIGHAND_STRUCT, ++ CPT_SECT_TTY, ++ CPT_SECT_SOCKET, ++ CPT_SECT_NAMESPACE, ++ CPT_SECT_SYSVSEM_UNDO, ++ CPT_SECT_INODE, /* Inodes with i->i_nlink==0 and ++ * deleted dentires with inodes not ++ * referenced inside dumped process. ++ */ ++ CPT_SECT_SYSV_SHM, ++ CPT_SECT_SYSV_SEM, ++ CPT_SECT_ORPHANS, ++ CPT_SECT_NET_DEVICE, ++ CPT_SECT_NET_IFADDR, ++ CPT_SECT_NET_ROUTE, ++ CPT_SECT_NET_IPTABLES, ++ CPT_SECT_NET_CONNTRACK, ++ CPT_SECT_NET_CONNTRACK_VE0, ++ CPT_SECT_UTSNAME, ++ CPT_SECT_TRAILER, ++ CPT_SECT_UBC, ++ CPT_SECT_SLM_SGREGS, ++ CPT_SECT_SLM_REGOBJS, ++/* Due to silly mistake we cannot index sections beyond this value */ ++#define CPT_SECT_MAX_INDEX (CPT_SECT_SLM_REGOBJS+1) ++ CPT_SECT_EPOLL, ++ CPT_SECT_VSYSCALL, ++ CPT_SECT_INOTIFY, ++ CPT_SECT_SYSV_MSG, ++ CPT_SECT_MAX ++}; ++ ++struct cpt_major_tail ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_lazypages; ++ __u32 cpt_64bit; ++ __u64 cpt_sections[CPT_SECT_MAX_INDEX]; ++ __u32 cpt_nsect; ++ __u8 cpt_signature[4]; /* Magic number */ ++} __attribute__ ((aligned (8))); ++ ++ ++/* Common object header. 
*/ ++struct cpt_object_hdr ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++} __attribute__ ((aligned (8))); ++ ++enum _cpt_content_type { ++ CPT_CONTENT_VOID, ++ CPT_CONTENT_ARRAY, ++ CPT_CONTENT_DATA, ++ CPT_CONTENT_NAME, ++ ++ CPT_CONTENT_STACK, ++ CPT_CONTENT_X86_FPUSTATE_OLD, ++ CPT_CONTENT_X86_FPUSTATE, ++ CPT_CONTENT_MM_CONTEXT, ++ CPT_CONTENT_SEMARRAY, ++ CPT_CONTENT_SEMUNDO, ++ CPT_CONTENT_NLMARRAY, ++ CPT_CONTENT_MAX ++}; ++ ++/* CPT_OBJ_BITS: encode array of bytes */ ++struct cpt_obj_bits ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_size; ++ __u32 __cpt_pad1; ++} __attribute__ ((aligned (8))); ++ ++/* CPT_OBJ_REF: a reference to another object */ ++struct cpt_obj_ref ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_pos; ++} __attribute__ ((aligned (8))); ++ ++/* CPT_OBJ_VEINFO: various ve specific data */ ++struct cpt_veinfo_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ /* ipc ctls */ ++ __u32 shm_ctl_max; ++ __u32 shm_ctl_all; ++ __u32 shm_ctl_mni; ++ __u32 msg_ctl_max; ++ __u32 msg_ctl_mni; ++ __u32 msg_ctl_mnb; ++ __u32 sem_ctl_arr[4]; ++ ++ /* start time */ ++ __u64 start_timespec_delta; ++ __u64 start_jiffies_delta; ++ ++ /* later extension */ ++ __u32 last_pid; ++ __u32 pad1; ++ __u64 reserved[8]; ++} __attribute__ ((aligned (8))); ++ ++/* CPT_OBJ_FILE: one struct file */ ++struct cpt_file_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_flags; ++ __u32 cpt_mode; ++ __u64 cpt_pos; ++ __u32 cpt_uid; ++ __u32 cpt_gid; ++ ++ __u32 cpt_i_mode; ++ __u32 cpt_lflags; ++#define CPT_DENTRY_DELETED 1 ++#define CPT_DENTRY_ROOT 2 ++#define CPT_DENTRY_CLONING 4 ++#define CPT_DENTRY_PROC 8 ++#define CPT_DENTRY_EPOLL 0x10 ++#define CPT_DENTRY_REPLACED 0x20 ++#define CPT_DENTRY_INOTIFY 0x40 ++#define CPT_DENTRY_FUTEX 0x80 ++#define CPT_DENTRY_TUNTAP 0x100 ++ __u64 cpt_inode; ++ __u64 cpt_priv; ++ ++ __u32 cpt_fown_fd; ++ __u32 cpt_fown_pid; ++#define CPT_FOWN_STRAY_PID 0 ++ __u32 cpt_fown_uid; ++ __u32 cpt_fown_euid; ++ __u32 cpt_fown_signo; ++ __u32 __cpt_pad1; ++} __attribute__ ((aligned (8))); ++/* Followed by file name, encoded as CPT_OBJ_NAME */ ++ ++struct cpt_epoll_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_file; ++} __attribute__ ((aligned (8))); ++/* Followed by array of struct cpt_epoll_file */ ++ ++struct cpt_epoll_file_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_file; ++ __u32 cpt_fd; ++ __u32 cpt_events; ++ __u64 cpt_data; ++ __u32 cpt_revents; ++ __u32 cpt_ready; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_inotify_wd_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_wd; ++ __u32 cpt_mask; ++} __attribute__ ((aligned (8))); ++/* Followed by cpt_file_image of inode to watch */ ++ ++struct cpt_inotify_ev_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_wd; ++ __u32 cpt_mask; ++ __u32 cpt_cookie; ++ __u32 cpt_namelen; ++} __attribute__ ((aligned (8))); ++/* Followed by name */ ++ ++struct cpt_inotify_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_file; ++ __u32 cpt_user; ++ __u32 cpt_max_events; ++ __u32 
cpt_last_wd; ++ __u32 __cpt_pad1; ++} __attribute__ ((aligned (8))); ++/* Followed by array of struct cpt_inotify_wd_image and cpt_inotify_ev_image */ ++ ++ ++/* CPT_OBJ_FILEDESC: one file descriptor */ ++struct cpt_fd_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_fd; ++ __u32 cpt_flags; ++#define CPT_FD_FLAG_CLOSEEXEC 1 ++ __u64 cpt_file; ++} __attribute__ ((aligned (8))); ++ ++/* CPT_OBJ_FILES: one files_struct */ ++struct cpt_files_struct_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_index; ++ __u32 cpt_max_fds; ++ __u32 cpt_next_fd; ++ __u32 __cpt_pad1; ++} __attribute__ ((aligned (8))); ++/* Followed by array of cpt_fd_image */ ++ ++/* CPT_OBJ_FS: one fs_struct */ ++struct cpt_fs_struct_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_umask; ++ __u32 __cpt_pad1; ++} __attribute__ ((aligned (8))); ++/* Followed by two/three CPT_OBJ_FILENAME for root, pwd and, optionally, altroot */ ++ ++/* CPT_OBJ_INODE: one struct inode */ ++struct cpt_inode_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_dev; ++ __u64 cpt_ino; ++ __u32 cpt_mode; ++ __u32 cpt_nlink; ++ __u32 cpt_uid; ++ __u32 cpt_gid; ++ __u64 cpt_rdev; ++ __u64 cpt_size; ++ __u64 cpt_blksize; ++ __u64 cpt_atime; ++ __u64 cpt_mtime; ++ __u64 cpt_ctime; ++ __u64 cpt_blocks; ++ __u32 cpt_sb; ++ __u32 __cpt_pad1; ++} __attribute__ ((aligned (8))); ++ ++/* CPT_OBJ_VFSMOUNT: one vfsmount */ ++struct cpt_vfsmount_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_mntflags; ++#define CPT_MNT_BIND 0x80000000 ++#define CPT_MNT_EXT 0x40000000 ++ __u32 cpt_flags; ++} __attribute__ ((aligned (8))); ++ ++ ++struct cpt_flock_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_owner; ++ __u32 cpt_pid; ++ __u64 cpt_start; ++ __u64 cpt_end; ++ __u32 cpt_flags; ++ __u32 cpt_type; ++} __attribute__ ((aligned (8))); ++ ++ ++struct cpt_tty_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_flags; ++ __u32 cpt_link; ++ __u32 cpt_index; ++ __u32 cpt_drv_type; ++ __u32 cpt_drv_subtype; ++ __u32 cpt_drv_flags; ++ __u8 cpt_packet; ++ __u8 cpt_stopped; ++ __u8 cpt_hw_stopped; ++ __u8 cpt_flow_stopped; ++ ++ __u32 cpt_canon_data; ++ __u32 cpt_canon_head; ++ __u32 cpt_canon_column; ++ __u32 cpt_column; ++ __u8 cpt_ctrl_status; ++ __u8 cpt_erasing; ++ __u8 cpt_lnext; ++ __u8 cpt_icanon; ++ __u8 cpt_raw; ++ __u8 cpt_real_raw; ++ __u8 cpt_closing; ++ __u8 __cpt_pad1; ++ __u16 cpt_minimum_to_wake; ++ __u16 __cpt_pad2; ++ __u32 cpt_pgrp; ++ __u32 cpt_session; ++ __u32 cpt_c_line; ++ __u8 cpt_name[64]; ++ __u16 cpt_ws_row; ++ __u16 cpt_ws_col; ++ __u16 cpt_ws_prow; ++ __u16 cpt_ws_pcol; ++ __u8 cpt_c_cc[32]; ++ __u32 cpt_c_iflag; ++ __u32 cpt_c_oflag; ++ __u32 cpt_c_cflag; ++ __u32 cpt_c_lflag; ++ __u32 cpt_read_flags[4096/32]; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_sock_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_file; ++ __u32 cpt_parent; ++ __u32 cpt_index; ++ ++ __u64 cpt_ssflags; ++ __u16 cpt_type; ++ __u16 cpt_family; ++ __u8 cpt_sstate; ++ __u8 cpt_passcred; ++ __u8 cpt_state; ++ __u8 cpt_reuse; ++ ++ __u8 cpt_zapped; ++ __u8 cpt_shutdown; ++ __u8 cpt_userlocks; ++ __u8 
cpt_no_check; ++ __u8 cpt_debug; ++ __u8 cpt_rcvtstamp; ++ __u8 cpt_localroute; ++ __u8 cpt_protocol; ++ ++ __u32 cpt_err; ++ __u32 cpt_err_soft; ++ ++ __u16 cpt_max_ack_backlog; ++ __u16 __cpt_pad1; ++ __u32 cpt_priority; ++ ++ __u32 cpt_rcvlowat; ++ __u32 cpt_bound_dev_if; ++ ++ __u64 cpt_rcvtimeo; ++ __u64 cpt_sndtimeo; ++ __u32 cpt_rcvbuf; ++ __u32 cpt_sndbuf; ++ __u64 cpt_flags; ++ __u64 cpt_lingertime; ++ __u32 cpt_peer_pid; ++ __u32 cpt_peer_uid; ++ ++ __u32 cpt_peer_gid; ++ __u32 cpt_laddrlen; ++ __u32 cpt_laddr[128/4]; ++ __u32 cpt_raddrlen; ++ __u32 cpt_raddr[128/4]; ++ /* AF_UNIX */ ++ __u32 cpt_peer; ++ ++ __u8 cpt_socketpair; ++ __u8 cpt_deleted; ++ __u16 __cpt_pad4; ++ __u32 __cpt_pad5; ++/* ++ struct sk_filter *sk_filter; ++ */ ++ ++ __u64 cpt_stamp; ++ __u32 cpt_daddr; ++ __u16 cpt_dport; ++ __u16 cpt_sport; ++ ++ __u32 cpt_saddr; ++ __u32 cpt_rcv_saddr; ++ ++ __u32 cpt_uc_ttl; ++ __u32 cpt_tos; ++ ++ __u32 cpt_cmsg_flags; ++ __u32 cpt_mc_index; ++ ++ __u32 cpt_mc_addr; ++/* ++ struct ip_options *opt; ++ */ ++ __u8 cpt_hdrincl; ++ __u8 cpt_mc_ttl; ++ __u8 cpt_mc_loop; ++ __u8 cpt_pmtudisc; ++ ++ __u8 cpt_recverr; ++ __u8 cpt_freebind; ++ __u16 cpt_idcounter; ++ __u32 cpt_cork_flags; ++ ++ __u32 cpt_cork_fragsize; ++ __u32 cpt_cork_length; ++ __u32 cpt_cork_addr; ++ __u32 cpt_cork_saddr; ++ __u32 cpt_cork_daddr; ++ __u32 cpt_cork_oif; ++ ++ __u32 cpt_udp_pending; ++ __u32 cpt_udp_corkflag; ++ __u16 cpt_udp_encap; ++ __u16 cpt_udp_len; ++ __u32 __cpt_pad7; ++ ++ __u64 cpt_saddr6[2]; ++ __u64 cpt_rcv_saddr6[2]; ++ __u64 cpt_daddr6[2]; ++ __u32 cpt_flow_label6; ++ __u32 cpt_frag_size6; ++ __u32 cpt_hop_limit6; ++ __u32 cpt_mcast_hops6; ++ ++ __u32 cpt_mcast_oif6; ++ __u8 cpt_rxopt6; ++ __u8 cpt_mc_loop6; ++ __u8 cpt_recverr6; ++ __u8 cpt_sndflow6; ++ ++ __u8 cpt_pmtudisc6; ++ __u8 cpt_ipv6only6; ++ __u8 cpt_mapped; ++ __u8 __cpt_pad8; ++ __u32 cpt_pred_flags; ++ ++ __u32 cpt_rcv_nxt; ++ __u32 cpt_snd_nxt; ++ ++ __u32 cpt_snd_una; ++ __u32 cpt_snd_sml; ++ ++ __u32 cpt_rcv_tstamp; ++ __u32 cpt_lsndtime; ++ ++ __u8 cpt_tcp_header_len; ++ __u8 cpt_ack_pending; ++ __u8 cpt_quick; ++ __u8 cpt_pingpong; ++ __u8 cpt_blocked; ++ __u8 __cpt_pad9; ++ __u16 __cpt_pad10; ++ ++ __u32 cpt_ato; ++ __u32 cpt_ack_timeout; ++ ++ __u32 cpt_lrcvtime; ++ __u16 cpt_last_seg_size; ++ __u16 cpt_rcv_mss; ++ ++ __u32 cpt_snd_wl1; ++ __u32 cpt_snd_wnd; ++ ++ __u32 cpt_max_window; ++ __u32 cpt_pmtu_cookie; ++ ++ __u32 cpt_mss_cache; ++ __u16 cpt_mss_cache_std; ++ __u16 cpt_mss_clamp; ++ ++ __u16 cpt_ext_header_len; ++ __u16 cpt_ext2_header_len; ++ __u8 cpt_ca_state; ++ __u8 cpt_retransmits; ++ __u8 cpt_reordering; ++ __u8 cpt_frto_counter; ++ ++ __u32 cpt_frto_highmark; ++ __u8 cpt_adv_cong; ++ __u8 cpt_defer_accept; ++ __u8 cpt_backoff; ++ __u8 __cpt_pad11; ++ ++ __u32 cpt_srtt; ++ __u32 cpt_mdev; ++ ++ __u32 cpt_mdev_max; ++ __u32 cpt_rttvar; ++ ++ __u32 cpt_rtt_seq; ++ __u32 cpt_rto; ++ ++ __u32 cpt_packets_out; ++ __u32 cpt_left_out; ++ ++ __u32 cpt_retrans_out; ++ __u32 cpt_snd_ssthresh; ++ ++ __u32 cpt_snd_cwnd; ++ __u16 cpt_snd_cwnd_cnt; ++ __u16 cpt_snd_cwnd_clamp; ++ ++ __u32 cpt_snd_cwnd_used; ++ __u32 cpt_snd_cwnd_stamp; ++ ++ __u32 cpt_timeout; ++ __u32 cpt_ka_timeout; ++ ++ __u32 cpt_rcv_wnd; ++ __u32 cpt_rcv_wup; ++ ++ __u32 cpt_write_seq; ++ __u32 cpt_pushed_seq; ++ ++ __u32 cpt_copied_seq; ++ __u8 cpt_tstamp_ok; ++ __u8 cpt_wscale_ok; ++ __u8 cpt_sack_ok; ++ __u8 cpt_saw_tstamp; ++ ++ __u8 cpt_snd_wscale; ++ __u8 cpt_rcv_wscale; ++ __u8 cpt_nonagle; ++ __u8 cpt_keepalive_probes; ++ __u32 
cpt_rcv_tsval; ++ ++ __u32 cpt_rcv_tsecr; ++ __u32 cpt_ts_recent; ++ ++ __u64 cpt_ts_recent_stamp; ++ __u16 cpt_user_mss; ++ __u8 cpt_dsack; ++ __u8 cpt_eff_sacks; ++ __u32 cpt_sack_array[2*5]; ++ __u32 cpt_window_clamp; ++ ++ __u32 cpt_rcv_ssthresh; ++ __u8 cpt_probes_out; ++ __u8 cpt_num_sacks; ++ __u16 cpt_advmss; ++ ++ __u8 cpt_syn_retries; ++ __u8 cpt_ecn_flags; ++ __u16 cpt_prior_ssthresh; ++ __u32 cpt_lost_out; ++ ++ __u32 cpt_sacked_out; ++ __u32 cpt_fackets_out; ++ ++ __u32 cpt_high_seq; ++ __u32 cpt_retrans_stamp; ++ ++ __u32 cpt_undo_marker; ++ __u32 cpt_undo_retrans; ++ ++ __u32 cpt_urg_seq; ++ __u16 cpt_urg_data; ++ __u8 cpt_pending; ++ __u8 cpt_urg_mode; ++ ++ __u32 cpt_snd_up; ++ __u32 cpt_keepalive_time; ++ ++ __u32 cpt_keepalive_intvl; ++ __u32 cpt_linger2; ++ ++ __u32 cpt_rcvrtt_rtt; ++ __u32 cpt_rcvrtt_seq; ++ ++ __u32 cpt_rcvrtt_time; ++ __u32 __cpt_pad12; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_sockmc_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u16 cpt_family; ++ __u16 cpt_mode; ++ __u32 cpt_ifindex; ++ __u32 cpt_mcaddr[4]; ++} __attribute__ ((aligned (8))); ++/* Followed by array of source addresses, each zero padded to 16 bytes */ ++ ++struct cpt_openreq_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_rcv_isn; ++ __u32 cpt_snt_isn; ++ ++ __u16 cpt_rmt_port; ++ __u16 cpt_mss; ++ __u8 cpt_family; ++ __u8 cpt_retrans; ++ __u8 cpt_snd_wscale; ++ __u8 cpt_rcv_wscale; ++ ++ __u8 cpt_tstamp_ok; ++ __u8 cpt_sack_ok; ++ __u8 cpt_wscale_ok; ++ __u8 cpt_ecn_ok; ++ __u8 cpt_acked; ++ __u8 __cpt_pad1; ++ __u16 __cpt_pad2; ++ ++ __u32 cpt_window_clamp; ++ __u32 cpt_rcv_wnd; ++ __u32 cpt_ts_recent; ++ __u32 cpt_iif; ++ __u64 cpt_expires; ++ ++ __u64 cpt_loc_addr[2]; ++ __u64 cpt_rmt_addr[2]; ++/* ++ struct ip_options *opt; ++ */ ++ ++} __attribute__ ((aligned (8))); ++ ++struct cpt_skb_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_owner; ++ __u32 cpt_queue; ++#define CPT_SKB_NQ 0 ++#define CPT_SKB_RQ 1 ++#define CPT_SKB_WQ 2 ++#define CPT_SKB_OFOQ 3 ++ ++ __u64 cpt_stamp; ++ __u32 cpt_len; ++ __u32 cpt_hspace; ++ __u32 cpt_tspace; ++ __u32 cpt_h; ++ __u32 cpt_nh; ++ __u32 cpt_mac; ++ ++ __u64 cpt_cb[5]; ++ __u32 cpt_mac_len; ++ __u32 cpt_csum; ++ __u8 cpt_local_df; ++ __u8 cpt_pkt_type; ++ __u8 cpt_ip_summed; ++ __u8 __cpt_pad1; ++ __u32 cpt_priority; ++ __u16 cpt_protocol; ++ __u16 cpt_security; ++ __u16 cpt_gso_segs; ++ __u16 cpt_gso_size; ++} __attribute__ ((aligned (8))); ++ ++ ++struct cpt_sysvshm_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_key; ++ __u64 cpt_uid; ++ __u64 cpt_gid; ++ __u64 cpt_cuid; ++ __u64 cpt_cgid; ++ __u64 cpt_mode; ++ __u64 cpt_seq; ++ ++ __u32 cpt_id; ++ __u32 cpt_mlockuser; ++ __u64 cpt_segsz; ++ __u64 cpt_atime; ++ __u64 cpt_ctime; ++ __u64 cpt_dtime; ++ __u64 cpt_creator; ++ __u64 cpt_last; ++} __attribute__ ((aligned (8))); ++ ++ ++struct cpt_sysvsem_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_key; ++ __u64 cpt_uid; ++ __u64 cpt_gid; ++ __u64 cpt_cuid; ++ __u64 cpt_cgid; ++ __u64 cpt_mode; ++ __u64 cpt_seq; ++ __u32 cpt_id; ++ __u32 __cpt_pad1; ++ ++ __u64 cpt_otime; ++ __u64 cpt_ctime; ++} __attribute__ ((aligned (8))); ++/* Content is array of pairs semval/sempid */ ++ ++struct cpt_sysvsem_undo_image ++{ ++ __u64 cpt_next; ++ __u32 
cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_id; ++ __u32 cpt_nsem; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_sysvmsg_msg_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_type; ++ __u64 cpt_size; ++} __attribute__ ((aligned (8))); ++ ++ ++struct cpt_sysvmsg_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_key; ++ __u64 cpt_uid; ++ __u64 cpt_gid; ++ __u64 cpt_cuid; ++ __u64 cpt_cgid; ++ __u64 cpt_mode; ++ __u64 cpt_seq; ++ __u32 cpt_id; ++ __u32 __cpt_pad1; ++ ++ __u64 cpt_stime; ++ __u64 cpt_rtime; ++ __u64 cpt_ctime; ++ __u64 cpt_last_sender; ++ __u64 cpt_last_receiver; ++ __u64 cpt_qbytes; ++} __attribute__ ((aligned (8))); ++/* Content is array of sysv msg */ ++ ++ ++struct cpt_mm_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_start_code; ++ __u64 cpt_end_code; ++ __u64 cpt_start_data; ++ __u64 cpt_end_data; ++ __u64 cpt_start_brk; ++ __u64 cpt_brk; ++ __u64 cpt_start_stack; ++ __u64 cpt_start_arg; ++ __u64 cpt_end_arg; ++ __u64 cpt_start_env; ++ __u64 cpt_end_env; ++ __u64 cpt_def_flags; ++ __u64 cpt_mmub; ++ __u8 cpt_dumpable; ++ __u8 cpt_vps_dumpable; ++ __u8 cpt_used_hugetlb; ++ __u8 __cpt_pad; ++ __u32 cpt_vdso; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_page_block ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_start; ++ __u64 cpt_end; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_remappage_block ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_start; ++ __u64 cpt_end; ++ __u64 cpt_pgoff; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_copypage_block ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_start; ++ __u64 cpt_end; ++ __u64 cpt_source; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_lazypage_block ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_start; ++ __u64 cpt_end; ++ __u64 cpt_index; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_iterpage_block ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_start; ++ __u64 cpt_end; ++} __attribute__ ((aligned (8))); ++/* Followed by array of PFNs */ ++ ++struct cpt_vma_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_file; ++ __u32 cpt_type; ++#define CPT_VMA_TYPE_0 0 ++#define CPT_VMA_TYPE_SHM 1 ++#define CPT_VMA_VDSO 2 ++ __u32 cpt_anonvma; ++ __u64 cpt_anonvmaid; ++ ++ __u64 cpt_start; ++ __u64 cpt_end; ++ __u64 cpt_flags; ++ __u64 cpt_pgprot; ++ __u64 cpt_pgoff; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_aio_ctx_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_max_reqs; ++ __u32 cpt_ring_pages; ++ __u32 cpt_tail; ++ __u32 cpt_nr; ++ __u64 cpt_mmap_base; ++ /* Data (io_event's) and struct aio_ring are stored in user space VM */ ++} __attribute__ ((aligned (8))); ++ ++ ++/* Format of MM section. ++ * ++ * It is array of MM objects (mm_struct). Each MM object is ++ * header, encoding mm_struct, followed by array of VMA objects. 
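Every record in the image, section headers and objects alike, starts with the same cpt_next/cpt_object/cpt_hdrlen/cpt_content quartet, so a dump can be traversed generically. A minimal user-space sketch of such a walk; it assumes the image has been mmap()ed, that cpt_next holds the absolute file offset of the next record with CPT_NULL closing a chain, and the struct below is a hand-rolled mirror of the header above rather than the kernel definition:

#include <stdint.h>
#include <stdio.h>

#define CPT_NULL (~0ULL)

/* Hand-rolled mirror of cpt_object_hdr, for illustration only. */
struct obj_hdr {
	uint64_t cpt_next;
	uint32_t cpt_object;
	uint16_t cpt_hdrlen;
	uint16_t cpt_content;
} __attribute__ ((aligned (8)));

static void walk_objects(const char *image, uint64_t pos, uint64_t end)
{
	while (pos != CPT_NULL && pos < end) {
		const struct obj_hdr *h = (const void *)(image + pos);

		printf("object %u, hdrlen %u, content %u at offset %llu\n",
		       (unsigned)h->cpt_object, (unsigned)h->cpt_hdrlen,
		       (unsigned)h->cpt_content, (unsigned long long)pos);
		pos = h->cpt_next;	/* assumed: absolute file offset */
	}
}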
++ * Each VMA consists of VMA header, encoding vm_area_struct, and ++ * if the VMA contains copied pages, the header is followed by ++ * array of tuples start-end each followed by data. ++ * ++ * ATTN: no block/page alignment. Only 64bit alignment. This might be not good? ++ */ ++ ++struct cpt_restart_block { ++ __u64 fn; ++#define CPT_RBL_0 0 ++#define CPT_RBL_NANOSLEEP 1 ++#define CPT_RBL_COMPAT_NANOSLEEP 2 ++#define CPT_RBL_POLL 3 ++#define CPT_RBL_FUTEX_WAIT 4 ++ __u64 arg0; ++ __u64 arg1; ++ __u64 arg2; ++ __u64 arg3; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_siginfo_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_qflags; ++ __u32 cpt_signo; ++ __u32 cpt_errno; ++ __u32 cpt_code; ++ ++ __u64 cpt_sigval; ++ __u32 cpt_pid; ++ __u32 cpt_uid; ++ __u64 cpt_utime; ++ __u64 cpt_stime; ++ ++ __u64 cpt_user; ++} __attribute__ ((aligned (8))); ++ ++/* Portable presentaions for segment registers */ ++ ++#define CPT_SEG_ZERO 0 ++#define CPT_SEG_TLS1 1 ++#define CPT_SEG_TLS2 2 ++#define CPT_SEG_TLS3 3 ++#define CPT_SEG_USER32_DS 4 ++#define CPT_SEG_USER32_CS 5 ++#define CPT_SEG_USER64_DS 6 ++#define CPT_SEG_USER64_CS 7 ++#define CPT_SEG_LDT 256 ++ ++struct cpt_x86_regs ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_debugreg[8]; ++ __u32 cpt_fs; ++ __u32 cpt_gs; ++ ++ __u32 cpt_ebx; ++ __u32 cpt_ecx; ++ __u32 cpt_edx; ++ __u32 cpt_esi; ++ __u32 cpt_edi; ++ __u32 cpt_ebp; ++ __u32 cpt_eax; ++ __u32 cpt_xds; ++ __u32 cpt_xes; ++ __u32 cpt_orig_eax; ++ __u32 cpt_eip; ++ __u32 cpt_xcs; ++ __u32 cpt_eflags; ++ __u32 cpt_esp; ++ __u32 cpt_xss; ++ __u32 pad; ++}; ++ ++struct cpt_x86_64_regs ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_debugreg[8]; ++ ++ __u64 cpt_fsbase; ++ __u64 cpt_gsbase; ++ __u32 cpt_fsindex; ++ __u32 cpt_gsindex; ++ __u32 cpt_ds; ++ __u32 cpt_es; ++ ++ __u64 cpt_r15; ++ __u64 cpt_r14; ++ __u64 cpt_r13; ++ __u64 cpt_r12; ++ __u64 cpt_rbp; ++ __u64 cpt_rbx; ++ __u64 cpt_r11; ++ __u64 cpt_r10; ++ __u64 cpt_r9; ++ __u64 cpt_r8; ++ __u64 cpt_rax; ++ __u64 cpt_rcx; ++ __u64 cpt_rdx; ++ __u64 cpt_rsi; ++ __u64 cpt_rdi; ++ __u64 cpt_orig_rax; ++ __u64 cpt_rip; ++ __u64 cpt_cs; ++ __u64 cpt_eflags; ++ __u64 cpt_rsp; ++ __u64 cpt_ss; ++}; ++ ++struct cpt_ia64_regs ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 gr[128]; ++ __u64 fr[256]; ++ __u64 br[8]; ++ __u64 nat[2]; ++ ++ __u64 ar_bspstore; ++ __u64 num_regs; ++ __u64 loadrs; ++ __u64 ar_bsp; ++ __u64 ar_unat; ++ __u64 ar_pfs; ++ __u64 ar_ccv; ++ __u64 ar_fpsr; ++ __u64 ar_csd; ++ __u64 ar_ssd; ++ __u64 ar_ec; ++ __u64 ar_lc; ++ __u64 ar_rsc; ++ __u64 ar_rnat; ++ ++ __u64 cr_iip; ++ __u64 cr_ipsr; ++ ++ __u64 cfm; ++ __u64 pr; ++ ++ __u64 ibr[8]; ++ __u64 dbr[8]; ++}; ++ ++ ++struct cpt_task_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_state; ++ __u64 cpt_flags; ++ __u64 cpt_ptrace; ++ __u32 cpt_prio; ++ __u32 cpt_static_prio; ++ __u32 cpt_policy; ++ __u32 cpt_rt_priority; ++ ++ /* struct thread_info */ ++ __u64 cpt_exec_domain; ++ __u64 cpt_thrflags; ++ __u64 cpt_thrstatus; ++ __u64 cpt_addr_limit; ++ ++ __u64 cpt_personality; ++ ++ __u64 cpt_mm; ++ __u64 cpt_files; ++ __u64 cpt_fs; ++ __u64 cpt_signal; ++ __u64 cpt_sighand; ++ __u64 cpt_sigblocked; ++ __u64 cpt_sigrblocked; ++ __u64 cpt_sigpending; ++ __u64 cpt_namespace; ++ __u64 cpt_sysvsem_undo; 
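A task's register state is stored as a separate object whose type field selects among the three layouts above (CPT_OBJ_X86_REGS, CPT_OBJ_X86_64_REGS, CPT_OBJ_IA64_REGS). A rough sketch of the dispatch a restore path needs; the restore_* helpers are hypothetical stand-ins, not functions from this patch:

#include <linux/errno.h>
#include <linux/cpt_image.h>

/* Hypothetical per-format helpers, named here for illustration only. */
extern int restore_x86_regs(struct cpt_x86_regs *r);
extern int restore_x86_64_regs(struct cpt_x86_64_regs *r);
extern int restore_ia64_regs(struct cpt_ia64_regs *r);

static int restore_regs(struct cpt_object_hdr *h)
{
	switch (h->cpt_object) {
	case CPT_OBJ_X86_REGS:		/* 32-bit task */
		return restore_x86_regs((struct cpt_x86_regs *)h);
	case CPT_OBJ_X86_64_REGS:	/* 64-bit task */
		return restore_x86_64_regs((struct cpt_x86_64_regs *)h);
	case CPT_OBJ_IA64_REGS:		/* ia64 task */
		return restore_ia64_regs((struct cpt_ia64_regs *)h);
	default:
		return -EINVAL;
	}
}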
++ __u32 cpt_pid; ++ __u32 cpt_tgid; ++ __u32 cpt_ppid; ++ __u32 cpt_rppid; ++ __u32 cpt_pgrp; ++ __u32 cpt_session; ++ __u32 cpt_old_pgrp; ++ __u32 __cpt_pad; ++ __u32 cpt_leader; ++ __u8 cpt_pn_state; ++ __u8 cpt_stopped_state; ++ __u8 cpt_sigsuspend_state; ++ __u8 cpt_64bit; ++ __u64 cpt_set_tid; ++ __u64 cpt_clear_tid; ++ __u32 cpt_exit_code; ++ __u32 cpt_exit_signal; ++ __u32 cpt_pdeath_signal; ++ __u32 cpt_user; ++ __u32 cpt_uid; ++ __u32 cpt_euid; ++ __u32 cpt_suid; ++ __u32 cpt_fsuid; ++ __u32 cpt_gid; ++ __u32 cpt_egid; ++ __u32 cpt_sgid; ++ __u32 cpt_fsgid; ++ __u32 cpt_ngids; ++ __u32 cpt_gids[32]; ++ __u8 cpt_prctl_uac; ++ __u8 cpt_prctl_fpemu; ++ __u16 __cpt_pad1; ++ __u64 cpt_ecap; ++ __u64 cpt_icap; ++ __u64 cpt_pcap; ++ __u8 cpt_comm[16]; ++ __u64 cpt_tls[3]; ++ struct cpt_restart_block cpt_restart; ++ __u64 cpt_it_real_value; /* V8: jiffies, V9..: nsec */ ++ __u64 cpt_it_real_incr; /* V8: jiffies, V9..: nsec */ ++ __u64 cpt_it_prof_value; ++ __u64 cpt_it_prof_incr; ++ __u64 cpt_it_virt_value; ++ __u64 cpt_it_virt_incr; ++ ++ __u16 cpt_used_math; ++ __u8 cpt_keepcap; ++ __u8 cpt_did_exec; ++ __u32 cpt_ptrace_message; ++ ++ __u64 cpt_utime; ++ __u64 cpt_stime; ++ __u64 cpt_starttime; /* V8: jiffies, V9...: timespec */ ++ __u64 cpt_nvcsw; ++ __u64 cpt_nivcsw; ++ __u64 cpt_min_flt; ++ __u64 cpt_maj_flt; ++ ++ __u64 cpt_sigsuspend_blocked; ++ __u64 cpt_cutime, cpt_cstime; ++ __u64 cpt_cnvcsw, cpt_cnivcsw; ++ __u64 cpt_cmin_flt, cpt_cmaj_flt; ++ ++#define CPT_RLIM_NLIMITS 16 ++ __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS]; ++ __u64 cpt_rlim_max[CPT_RLIM_NLIMITS]; ++ ++ __u64 cpt_task_ub; ++ __u64 cpt_exec_ub; ++ __u64 cpt_mm_ub; ++ __u64 cpt_fork_sub; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_sigaltstack_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_stack; ++ __u32 cpt_stacksize; ++ __u32 __cpt_pad1; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_task_aux_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_robust_list; ++ __u64 __cpt_future[16]; ++} __attribute__ ((aligned (8))); ++ ++ ++struct cpt_signal_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_leader; ++ __u8 cpt_pgrp_type; ++ __u8 cpt_old_pgrp_type; ++ __u8 cpt_session_type; ++#define CPT_PGRP_NORMAL 0 ++#define CPT_PGRP_ORPHAN 1 ++#define CPT_PGRP_STRAY 2 ++ __u8 __cpt_pad1; ++ __u64 cpt_pgrp; ++ __u64 cpt_old_pgrp; ++ __u64 cpt_session; ++ __u64 cpt_sigpending; ++ __u64 cpt_ctty; ++ ++ __u32 cpt_curr_target; ++ __u32 cpt_group_exit; ++ __u32 cpt_group_exit_code; ++ __u32 cpt_group_exit_task; ++ __u32 cpt_notify_count; ++ __u32 cpt_group_stop_count; ++ __u32 cpt_stop_state; ++ __u32 __cpt_pad2; ++ ++ __u64 cpt_utime, cpt_stime, cpt_cutime, cpt_cstime; ++ __u64 cpt_nvcsw, cpt_nivcsw, cpt_cnvcsw, cpt_cnivcsw; ++ __u64 cpt_min_flt, cpt_maj_flt, cpt_cmin_flt, cpt_cmaj_flt; ++ ++ __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS]; ++ __u64 cpt_rlim_max[CPT_RLIM_NLIMITS]; ++} __attribute__ ((aligned (8))); ++/* Followed by list of posix timers. */ ++ ++struct cpt_sighand_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++} __attribute__ ((aligned (8))); ++/* Followed by list of sighandles. 
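The resource limits in cpt_task_image and cpt_signal_image above are flattened into two __u64 arrays of CPT_RLIM_NLIMITS entries each, so restoring them is a plain copy back into kernel form. A sketch assuming the 2.6.26 layout, where the shared limits live in signal_struct:

#include <linux/sched.h>
#include <linux/cpt_image.h>

/* Illustrative only: unpack cpt_rlim_cur[]/cpt_rlim_max[]. */
static void restore_rlimits(struct task_struct *tsk,
			    const __u64 *cur, const __u64 *max)
{
	int i;

	for (i = 0; i < CPT_RLIM_NLIMITS && i < RLIM_NLIMITS; i++) {
		tsk->signal->rlim[i].rlim_cur = cur[i];
		tsk->signal->rlim[i].rlim_max = max[i];
	}
}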
*/ ++ ++struct cpt_sighandler_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_signo; ++ __u32 __cpt_pad1; ++ __u64 cpt_handler; ++ __u64 cpt_restorer; ++ __u64 cpt_flags; ++ __u64 cpt_mask; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_netdev_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_index; ++ __u32 cpt_flags; ++ __u8 cpt_name[16]; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_tuntap_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_owner; ++ __u32 cpt_attached; ++ __u64 cpt_flags; ++ __u64 cpt_bindfile; ++ __u64 cpt_if_flags; ++ __u8 cpt_dev_addr[6]; ++ __u16 cpt_pad; ++ __u32 cpt_chr_filter[2]; ++ __u32 cpt_net_filter[2]; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_veth_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_allow_mac_change; ++ __u32 __cpt_pad; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_hwaddr_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u8 cpt_dev_addr[32]; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_netstats_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_rx_packets; ++ __u64 cpt_tx_packets; ++ __u64 cpt_rx_bytes; ++ __u64 cpt_tx_bytes; ++ __u64 cpt_rx_errors; ++ __u64 cpt_tx_errors; ++ __u64 cpt_rx_dropped; ++ __u64 cpt_tx_dropped; ++ __u64 cpt_multicast; ++ __u64 cpt_collisions; ++ __u64 cpt_rx_length_errors; ++ __u64 cpt_rx_over_errors; ++ __u64 cpt_rx_crc_errors; ++ __u64 cpt_rx_frame_errors; ++ __u64 cpt_rx_fifo_errors; ++ __u64 cpt_rx_missed_errors; ++ __u64 cpt_tx_aborted_errors; ++ __u64 cpt_tx_carrier_errors; ++ __u64 cpt_tx_fifo_errors; ++ __u64 cpt_tx_heartbeat_errors; ++ __u64 cpt_tx_window_errors; ++ __u64 cpt_rx_compressed; ++ __u64 cpt_tx_compressed; ++ __u64 pad[4]; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_ifaddr_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_index; ++ __u8 cpt_family; ++ __u8 cpt_masklen; ++ __u8 cpt_flags; ++ __u8 cpt_scope; ++ __u32 cpt_address[4]; ++ __u32 cpt_peer[4]; ++ __u32 cpt_broadcast[4]; ++ __u8 cpt_label[16]; ++ __u32 cpt_valid_lft; ++ __u32 cpt_prefered_lft; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_ipct_tuple ++{ ++ __u32 cpt_src; ++ __u16 cpt_srcport; ++ __u16 __cpt_pad1; ++ ++ __u32 cpt_dst; ++ __u16 cpt_dstport; ++ __u8 cpt_protonum; ++ __u8 cpt_dir; /* TEMPORARY HACK TO VALIDATE CODE */ ++} __attribute__ ((aligned (8))); ++ ++struct cpt_nat_manip ++{ ++ __u8 cpt_direction; ++ __u8 cpt_hooknum; ++ __u8 cpt_maniptype; ++ __u8 __cpt_pad1; ++ ++ __u32 cpt_manip_addr; ++ __u16 cpt_manip_port; ++ __u16 __cpt_pad2; ++ __u32 __cpt_pad3; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_nat_seq ++{ ++ __u32 cpt_correction_pos; ++ __u32 cpt_offset_before; ++ __u32 cpt_offset_after; ++ __u32 __cpt_pad1; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_ip_connexpect_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_timeout; ++ __u32 cpt_sibling_conntrack; /* Index of child conntrack */ ++ __u32 cpt_seq; /* id in 2.6.15 */ ++ ++ struct cpt_ipct_tuple cpt_ct_tuple; /* NU 2.6.15 */ ++ struct cpt_ipct_tuple cpt_tuple; ++ struct cpt_ipct_tuple cpt_mask; ++ ++ /* union ip_conntrack_expect_help. 
Used by ftp, irc, amanda */ ++ __u32 cpt_help[3]; /* NU 2.6.15 */ ++ __u16 cpt_manip_proto; ++ __u8 cpt_dir; ++ __u8 cpt_flags; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_ip_conntrack_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ struct cpt_ipct_tuple cpt_tuple[2]; ++ __u64 cpt_status; ++ __u64 cpt_timeout; ++ __u32 cpt_index; ++ __u8 cpt_ct_helper; ++ __u8 cpt_nat_helper; ++ __u16 cpt_pad1; ++ ++ /* union ip_conntrack_proto. Used by tcp and icmp. */ ++ __u32 cpt_proto_data[12]; ++ ++ /* union ip_conntrack_help. Used by ftp and pptp helper. ++ * We do not support pptp... ++ */ ++ __u32 cpt_help_data[6]; ++ ++ /* nat info */ ++ __u32 cpt_initialized; /* NU 2.6.15 */ ++ __u32 cpt_num_manips; /* NU 2.6.15 */ ++ struct cpt_nat_manip cpt_nat_manips[6]; /* NU 2.6.15 */ ++ ++ struct cpt_nat_seq cpt_nat_seq[2]; ++ ++ __u32 cpt_masq_index; ++ __u32 cpt_id; ++ __u32 cpt_mark; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_ubparm ++{ ++ __u64 barrier; ++ __u64 limit; ++ __u64 held; ++ __u64 maxheld; ++ __u64 minheld; ++ __u64 failcnt; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_beancounter_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_parent; ++ __u32 cpt_id; ++ __u32 __cpt_pad; ++ struct cpt_ubparm cpt_parms[32 * 2]; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_slm_sgreg_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_size; ++ __u32 __cpt_pad1; ++ __u32 cpt_id; ++ __u16 cpt_resource; ++ __u8 cpt_regname[32]; ++ __u8 __cpt_pad2[2]; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_slm_obj_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_size; ++ __u32 __cpt_pad1; ++} __attribute__ ((aligned (8))); ++ ++#ifdef __KERNEL__ ++ ++static inline void __user * cpt_ptr_import(__u64 ptr) ++{ ++ return (void*)(unsigned long)ptr; ++} ++ ++static inline __u64 cpt_ptr_export(void __user *ptr) ++{ ++ return (__u64)(unsigned long)ptr; ++} ++ ++static inline void cpt_sigset_import(sigset_t *sig, __u64 ptr) ++{ ++ memcpy(sig, &ptr, sizeof(*sig)); ++} ++ ++static inline __u64 cpt_sigset_export(sigset_t *sig) ++{ ++ return *(__u64*)sig; ++} ++ ++static inline __u64 cpt_timespec_export(struct timespec *tv) ++{ ++ return (((u64)tv->tv_sec) << 32) + tv->tv_nsec; ++} ++ ++static inline void cpt_timespec_import(struct timespec *tv, __u64 val) ++{ ++ tv->tv_sec = val>>32; ++ tv->tv_nsec = (val&0xFFFFFFFF); ++} ++ ++static inline __u64 cpt_timeval_export(struct timeval *tv) ++{ ++ return (((u64)tv->tv_sec) << 32) + tv->tv_usec; ++} ++ ++static inline void cpt_timeval_import(struct timeval *tv, __u64 val) ++{ ++ tv->tv_sec = val>>32; ++ tv->tv_usec = (val&0xFFFFFFFF); ++} ++ ++#endif ++ ++#endif /* __CPT_IMAGE_H_ */ +diff --git a/include/linux/cpt_ioctl.h b/include/linux/cpt_ioctl.h +new file mode 100644 +index 0000000..b8e83cc +--- /dev/null ++++ b/include/linux/cpt_ioctl.h +@@ -0,0 +1,43 @@ ++/* ++ * ++ * include/linux/cpt_ioctl.h ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
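The cpt_timespec_export()/cpt_timeval_export() helpers at the end of cpt_image.h pack the seconds into the high 32 bits of a single __u64 and the sub-second part into the low 32 bits; note that this truncates a 64-bit tv_sec. A self-contained user-space round trip of the same packing, for illustration only:

#include <assert.h>
#include <stdint.h>

static uint64_t ts_export(uint32_t sec, uint32_t nsec)
{
	return ((uint64_t)sec << 32) + nsec;	/* same packing as above */
}

static void ts_import(uint64_t val, uint32_t *sec, uint32_t *nsec)
{
	*sec = val >> 32;
	*nsec = val & 0xFFFFFFFF;
}

int main(void)
{
	uint32_t sec, nsec;

	ts_import(ts_export(1216303785, 999999999), &sec, &nsec);
	assert(sec == 1216303785 && nsec == 999999999);
	return 0;
}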
++ * ++ */ ++ ++#ifndef _CPT_IOCTL_H_ ++#define _CPT_IOCTL_H_ 1 ++ ++#include ++#include ++ ++#define CPTCTLTYPE '-' ++#define CPT_SET_DUMPFD _IOW(CPTCTLTYPE, 1, int) ++#define CPT_SET_STATUSFD _IOW(CPTCTLTYPE, 2, int) ++#define CPT_SET_LOCKFD _IOW(CPTCTLTYPE, 3, int) ++#define CPT_SET_VEID _IOW(CPTCTLTYPE, 4, int) ++#define CPT_SUSPEND _IO(CPTCTLTYPE, 5) ++#define CPT_DUMP _IO(CPTCTLTYPE, 6) ++#define CPT_UNDUMP _IO(CPTCTLTYPE, 7) ++#define CPT_RESUME _IO(CPTCTLTYPE, 8) ++#define CPT_KILL _IO(CPTCTLTYPE, 9) ++#define CPT_JOIN_CONTEXT _IO(CPTCTLTYPE, 10) ++#define CPT_GET_CONTEXT _IOW(CPTCTLTYPE, 11, unsigned int) ++#define CPT_PUT_CONTEXT _IO(CPTCTLTYPE, 12) ++#define CPT_SET_PAGEINFDIN _IOW(CPTCTLTYPE, 13, int) ++#define CPT_SET_PAGEINFDOUT _IOW(CPTCTLTYPE, 14, int) ++#define CPT_PAGEIND _IO(CPTCTLTYPE, 15) ++#define CPT_VMPREP _IOW(CPTCTLTYPE, 16, int) ++#define CPT_SET_LAZY _IOW(CPTCTLTYPE, 17, int) ++#define CPT_SET_CPU_FLAGS _IOW(CPTCTLTYPE, 18, unsigned int) ++#define CPT_TEST_CAPS _IOW(CPTCTLTYPE, 19, unsigned int) ++#define CPT_TEST_VECAPS _IOW(CPTCTLTYPE, 20, unsigned int) ++#define CPT_SET_ERRORFD _IOW(CPTCTLTYPE, 21, int) ++ ++#define CPT_ITER _IOW(CPTCTLTYPE, 23, int) ++ ++#endif +diff --git a/include/linux/dcache.h b/include/linux/dcache.h +index d982eb8..2fceb53 100644 +--- a/include/linux/dcache.h ++++ b/include/linux/dcache.h +@@ -7,6 +7,8 @@ + #include + #include + ++#include ++ + struct nameidata; + struct path; + struct vfsmount; +@@ -110,6 +112,9 @@ struct dentry { + struct dcookie_struct *d_cookie; /* cookie, if any */ + #endif + int d_mounted; ++#ifdef CONFIG_BEANCOUNTERS ++ struct dentry_beancounter dentry_bc; ++#endif + unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ + }; + +@@ -173,9 +178,13 @@ d_iput: no no no yes + + #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ + #define DCACHE_UNHASHED 0x0010 ++#define DCACHE_VIRTUAL 0x0100 /* ve accessible */ ++ ++extern void mark_tree_virtual(struct path *path); + + #define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched */ + ++extern struct kmem_cache *dentry_cache; + extern spinlock_t dcache_lock; + extern seqlock_t rename_lock; + +@@ -302,6 +311,7 @@ extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...); + extern char *__d_path(const struct path *path, struct path *root, char *, int); + extern char *d_path(const struct path *, char *, int); + extern char *dentry_path(struct dentry *, char *, int); ++extern int d_root_check(struct path *path); + + /* Allocation counts.. 
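The ioctls above are the whole user-space interface to checkpointing: the control file descriptors are handed in first, then the state-machine commands are issued. A minimal suspend-and-dump sketch with error handling elided; the /proc/cpt device node is an assumption of this example, not something the header defines:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/cpt_ioctl.h>

static int checkpoint_ve(unsigned int veid, int dumpfd)
{
	int fd = open("/proc/cpt", O_RDWR);	/* assumed device node */

	if (fd < 0)
		return -1;
	ioctl(fd, CPT_SET_VEID, veid);		/* pick the container */
	ioctl(fd, CPT_SET_DUMPFD, dumpfd);	/* image goes to this fd */
	ioctl(fd, CPT_SUSPEND, 0);		/* freeze every task in it */
	ioctl(fd, CPT_DUMP, 0);			/* write the image */
	ioctl(fd, CPT_KILL, 0);			/* or CPT_RESUME to thaw */
	close(fd);
	return 0;
}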
*/ + +@@ -321,6 +331,12 @@ extern char *dentry_path(struct dentry *, char *, int); + static inline struct dentry *dget(struct dentry *dentry) + { + if (dentry) { ++#ifdef CONFIG_BEANCOUNTERS ++ preempt_disable(); ++ if (ub_dentry_on && ub_dget_testone(dentry)) ++ BUG(); ++ preempt_enable_no_resched(); ++#endif + BUG_ON(!atomic_read(&dentry->d_count)); + atomic_inc(&dentry->d_count); + } +@@ -363,4 +379,5 @@ extern struct dentry *lookup_create(struct nameidata *nd, int is_dir); + + extern int sysctl_vfs_cache_pressure; + ++extern int check_area_access_ve(struct path *); + #endif /* __LINUX_DCACHE_H */ +diff --git a/include/linux/device.h b/include/linux/device.h +index 6a2d04c..72a6aa3 100644 +--- a/include/linux/device.h ++++ b/include/linux/device.h +@@ -234,6 +234,15 @@ extern void class_interface_unregister(struct class_interface *); + extern struct class *class_create(struct module *owner, const char *name); + extern void class_destroy(struct class *cls); + ++extern struct class net_class; ++extern struct kset *class_kset; ++ ++int classes_init(void); ++void classes_fini(void); ++ ++int devices_init(void); ++void devices_fini(void); ++ + /* + * The type of device, "struct device" is embedded in. A class + * or bus can contain devices of different types +diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h +index 154769c..ee767ed 100644 +--- a/include/linux/devpts_fs.h ++++ b/include/linux/devpts_fs.h +@@ -23,6 +23,16 @@ int devpts_pty_new(struct tty_struct *tty); /* mknod in devpts */ + struct tty_struct *devpts_get_tty(int number); /* get tty structure */ + void devpts_pty_kill(int number); /* unlink */ + ++struct devpts_config { ++ int setuid; ++ int setgid; ++ uid_t uid; ++ gid_t gid; ++ umode_t mode; ++}; ++ ++extern struct devpts_config devpts_config; ++extern struct file_system_type devpts_fs_type; + #else + + /* Dummy stubs in the no-pty case */ +diff --git a/include/linux/elevator.h b/include/linux/elevator.h +index 639624b..be231eb 100644 +--- a/include/linux/elevator.h ++++ b/include/linux/elevator.h +@@ -56,6 +56,11 @@ struct elevator_ops + elevator_init_fn *elevator_init_fn; + elevator_exit_fn *elevator_exit_fn; + void (*trim)(struct io_context *); ++ /* In original cfq design task holds a cfqq refcount and puts it ++ * on exit via io context. Now async cfqqs are hold by UB, ++ * so we need somehow to put these queues. Use this function. ++ */ ++ void (*put_queue)(struct cfq_queue *); + }; + + #define ELV_NAME_MAX (16) +diff --git a/include/linux/elf.h b/include/linux/elf.h +index ff9fbed..f7f8507 100644 +--- a/include/linux/elf.h ++++ b/include/linux/elf.h +@@ -403,4 +403,6 @@ extern int elf_coredump_extra_notes_size(void); + extern int elf_coredump_extra_notes_write(struct file *file, loff_t *foffset); + #endif + ++extern int sysctl_at_vsyscall; ++ + #endif /* _LINUX_ELF_H */ +diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h +index cf79853..919ebfd 100644 +--- a/include/linux/eventpoll.h ++++ b/include/linux/eventpoll.h +@@ -15,6 +15,7 @@ + #define _LINUX_EVENTPOLL_H + + #include ++#include + + + /* Valid opcodes to issue to sys_epoll_ctl() */ +@@ -60,6 +61,88 @@ static inline void eventpoll_init_file(struct file *file) + spin_lock_init(&file->f_ep_lock); + } + ++struct epoll_filefd { ++ struct file *file; ++ int fd; ++}; ++ ++/* ++ * This structure is stored inside the "private_data" member of the file ++ * structure and rapresent the main data sructure for the eventpoll ++ * interface. 
++ */ ++struct eventpoll { ++ /* Protect the this structure access */ ++ spinlock_t lock; ++ ++ /* ++ * This mutex is used to ensure that files are not removed ++ * while epoll is using them. This is held during the event ++ * collection loop, the file cleanup path, the epoll file exit ++ * code and the ctl operations. ++ */ ++ struct mutex mtx; ++ ++ /* Wait queue used by sys_epoll_wait() */ ++ wait_queue_head_t wq; ++ ++ /* Wait queue used by file->poll() */ ++ wait_queue_head_t poll_wait; ++ ++ /* List of ready file descriptors */ ++ struct list_head rdllist; ++ ++ /* RB tree root used to store monitored fd structs */ ++ struct rb_root rbr; ++ ++ /* ++ * This is a single linked list that chains all the "struct epitem" that ++ * happened while transfering ready events to userspace w/out ++ * holding ->lock. ++ */ ++ struct epitem *ovflist; ++}; ++ ++/* ++ * Each file descriptor added to the eventpoll interface will ++ * have an entry of this type linked to the "rbr" RB tree. ++ */ ++struct epitem { ++ /* RB tree node used to link this structure to the eventpoll RB tree */ ++ struct rb_node rbn; ++ ++ /* List header used to link this structure to the eventpoll ready list */ ++ struct list_head rdllink; ++ ++ /* ++ * Works together "struct eventpoll"->ovflist in keeping the ++ * single linked chain of items. ++ */ ++ struct epitem *next; ++ ++ /* The file descriptor information this item refers to */ ++ struct epoll_filefd ffd; ++ ++ /* Number of active wait queue attached to poll operations */ ++ int nwait; ++ ++ /* List containing poll wait queues */ ++ struct list_head pwqlist; ++ ++ /* The "container" of this item */ ++ struct eventpoll *ep; ++ ++ /* List header used to link this item to the "struct file" items list */ ++ struct list_head fllink; ++ ++ /* The structure that describe the interested events and the source fd */ ++ struct epoll_event event; ++}; ++ ++extern struct semaphore epsem; ++struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); ++int ep_insert(struct eventpoll *ep, struct epoll_event *event, ++ struct file *tfile, int fd); + + /* Used to release the epoll bits inside the "struct file" */ + void eventpoll_release_file(struct file *file); +@@ -92,6 +175,8 @@ static inline void eventpoll_release(struct file *file) + eventpoll_release_file(file); + } + ++extern struct mutex epmutex; ++ + #else + + static inline void eventpoll_init_file(struct file *file) {} +diff --git a/include/linux/fairsched.h b/include/linux/fairsched.h +new file mode 100644 +index 0000000..e08c84d +--- /dev/null ++++ b/include/linux/fairsched.h +@@ -0,0 +1,86 @@ ++/* ++ * Fair Scheduler ++ * ++ * Copyright (C) 2000-2008 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
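struct eventpoll and struct epitem normally live privately in fs/eventpoll.c; exposing them in the header as above lets the checkpointer enumerate what an epoll instance is watching. A sketch of such a walk over the rbr tree, with all locking deliberately elided:

#include <linux/kernel.h>
#include <linux/rbtree.h>
#include <linux/eventpoll.h>

/* Illustrative only: print every (fd, event mask) pair of one instance. */
static void ep_dump_watches(struct eventpoll *ep)
{
	struct rb_node *n;

	for (n = rb_first(&ep->rbr); n; n = rb_next(n)) {
		struct epitem *epi = rb_entry(n, struct epitem, rbn);

		printk(KERN_DEBUG "epoll: fd %d mask %x\n",
		       epi->ffd.fd, epi->event.events);
	}
}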
++ * ++ */ ++ ++#ifndef __LINUX_FAIRSCHED_H__ ++#define __LINUX_FAIRSCHED_H__ ++ ++#define FAIRSCHED_SET_RATE 0 ++#define FAIRSCHED_DROP_RATE 1 ++#define FAIRSCHED_GET_RATE 2 ++ ++#ifdef __KERNEL__ ++ ++/* refcnt change protected with tasklist write lock */ ++struct fairsched_node { ++ struct task_group *tg; ++ int refcnt; ++ unsigned id; ++ struct list_head nodelist; ++ ++ unsigned weight; ++ unsigned char rate_limited; ++ unsigned rate; ++#ifdef CONFIG_VE ++ struct ve_struct *owner_env; ++#endif ++}; ++ ++#ifdef CONFIG_VZ_FAIRSCHED ++ ++#define FAIRSCHED_INIT_NODE_ID INT_MAX ++ ++extern struct fairsched_node fairsched_init_node; ++ ++void fairsched_init_early(void); ++void fairsched_init_late(void); ++ ++static inline int task_fairsched_node_id(struct task_struct *p) ++{ ++ return p->fsched_node->id; ++} ++ ++/* must called with tasklist write locked */ ++static inline void get_task_fairsched_node(struct task_struct *p) ++{ ++ p->fsched_node->refcnt++; ++} ++static inline void put_task_fairsched_node(struct task_struct *p) ++{ ++ p->fsched_node->refcnt--; ++} ++ ++#define INIT_VZ_FAIRSCHED .fsched_node = &fairsched_init_node, ++ ++#define FSCHWEIGHT_MAX ((1 << 16) - 1) ++#define FSCHRATE_SHIFT 10 ++#define FSCH_TIMESLICE 16 ++ ++asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight, ++ unsigned int newid); ++asmlinkage int sys_fairsched_rmnod(unsigned int id); ++asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid); ++asmlinkage int sys_fairsched_vcpus(unsigned int id, unsigned int vcpus); ++asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned int weight); ++asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate); ++ ++#else /* CONFIG_VZ_FAIRSCHED */ ++ ++static inline void fairsched_init_early(void) { } ++static inline void fairsched_init_late(void) { } ++static inline int task_fairsched_node_id(struct task_struct *p) { return 0; } ++static inline void get_task_fairsched_node(struct task_struct *p) { } ++static inline void put_task_fairsched_node(struct task_struct *p) { } ++ ++#define INIT_VZ_FAIRSCHED ++ ++#endif /* CONFIG_VZ_FAIRSCHED */ ++#endif /* __KERNEL__ */ ++ ++#endif /* __LINUX_FAIRSCHED_H__ */ +diff --git a/include/linux/faudit.h b/include/linux/faudit.h +new file mode 100644 +index 0000000..631c42e +--- /dev/null ++++ b/include/linux/faudit.h +@@ -0,0 +1,45 @@ ++/* ++ * include/linux/faudit.h ++ * ++ * Copyright (C) 2005 SWSoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
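The sys_fairsched_* entry points above let management tools carve out per-container CPU nodes, move tasks into them, and cap their rate. A hedged user-space sketch; the wrapper functions, the parent id 0 and the weight 500 are all assumptions of this example, since the syscall numbers are assigned elsewhere in the patch:

#include <sys/types.h>
#include <linux/fairsched.h>

/* Assumed wrappers around the sys_fairsched_* calls declared above. */
extern int fairsched_mknod(unsigned int parent, unsigned int weight,
			   unsigned int newid);
extern int fairsched_mvpr(pid_t pid, unsigned int nodeid);
extern int fairsched_rate(unsigned int id, int op, unsigned int rate);

static int cap_ve_cpu(unsigned int veid, pid_t ve_init, unsigned int rate)
{
	if (fairsched_mknod(0, 500, veid) < 0)	/* one node per container */
		return -1;
	if (fairsched_mvpr(ve_init, veid) < 0)	/* pull VE init into it */
		return -1;
	return fairsched_rate(veid, FAIRSCHED_SET_RATE, rate);
}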
++ * ++ */ ++ ++#ifndef __FAUDIT_H_ ++#define __FAUDIT_H_ ++ ++#include ++ ++struct vfsmount; ++struct dentry; ++struct super_block; ++struct kstatfs; ++struct kstat; ++struct pt_regs; ++ ++struct faudit_regs_arg { ++ int err; ++ struct pt_regs *regs; ++}; ++ ++struct faudit_stat_arg { ++ int err; ++ struct vfsmount *mnt; ++ struct dentry *dentry; ++ struct kstat *stat; ++}; ++ ++struct faudit_statfs_arg { ++ int err; ++ struct super_block *sb; ++ struct kstatfs *stat; ++}; ++ ++#define VIRTINFO_FAUDIT (0) ++#define VIRTINFO_FAUDIT_STAT (VIRTINFO_FAUDIT + 0) ++#define VIRTINFO_FAUDIT_STATFS (VIRTINFO_FAUDIT + 1) ++ ++#endif +diff --git a/include/linux/fs.h b/include/linux/fs.h +index d8e2762..f2c30f6 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -50,6 +50,7 @@ extern struct inodes_stat_t inodes_stat; + + extern int leases_enable, lease_break_time; + ++extern int odirect_enable; + #ifdef CONFIG_DNOTIFY + extern int dir_notify_enable; + #endif +@@ -60,6 +61,7 @@ extern int dir_notify_enable; + #define MAY_WRITE 2 + #define MAY_READ 4 + #define MAY_APPEND 8 ++#define MAY_QUOTACTL 16 /* for devgroup-vs-openvz only */ + + #define FMODE_READ 1 + #define FMODE_WRITE 2 +@@ -68,6 +70,7 @@ extern int dir_notify_enable; + #define FMODE_LSEEK 4 + #define FMODE_PREAD 8 + #define FMODE_PWRITE FMODE_PREAD /* These go hand in hand */ ++#define FMODE_QUOTACTL 4 + + /* File is being opened for execution. Primary users of this flag are + distributed filesystems that can use it to achieve correct ETXTBUSY +@@ -94,6 +97,8 @@ extern int dir_notify_enable; + #define FS_REQUIRES_DEV 1 + #define FS_BINARY_MOUNTDATA 2 + #define FS_HAS_SUBTYPE 4 ++#define FS_VIRTUALIZED 64 /* Can mount this fstype inside ve */ ++#define FS_MANGLE_PROC 128 /* hide some /proc/mounts info inside VE */ + #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ + #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() + * during rename() internally. +@@ -366,6 +371,9 @@ struct iattr { + * Includes for diskquotas. 
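The faudit_*_arg structures carry stat/statfs results out to a notifier chain so a VE can rewrite what its processes see (device numbers, free space and so on). A sketch of the calling side; virtinfo_notifier_call(), VITYPE_FAUDIT and the NOTIFY_DONE convention are assumed from the wider OpenVZ patch, they are not defined in this hunk:

#include <linux/faudit.h>
#include <linux/notifier.h>

static int faudit_stat_result(struct vfsmount *mnt, struct dentry *dentry,
			      struct kstat *stat)
{
	struct faudit_stat_arg arg = {
		.mnt	= mnt,
		.dentry	= dentry,
		.stat	= stat,
	};

	/* assumed hook: gives the VE a chance to mangle the result */
	if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STAT,
				   &arg) != NOTIFY_DONE)
		return arg.err;
	return 0;
}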
+ */ + #include ++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) ++#include ++#endif + + /** + * enum positive_aop_returns - aop return codes with specific semantics +@@ -625,6 +633,9 @@ struct inode { + #ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; + #endif ++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) ++ struct vz_quota_ilink i_qlnk; ++#endif + struct list_head i_devices; + union { + struct pipe_inode_info *i_pipe; +@@ -680,6 +691,8 @@ enum inode_i_mutex_lock_class + extern void inode_double_lock(struct inode *inode1, struct inode *inode2); + extern void inode_double_unlock(struct inode *inode1, struct inode *inode2); + ++extern struct kmem_cache *inode_cachep; ++ + /* + * NOTE: in a 32bit arch with a preemptable kernel and + * an UP compile the i_size_read/write must be atomic +@@ -799,6 +812,7 @@ struct file { + struct fown_struct f_owner; + unsigned int f_uid, f_gid; + struct file_ra_state f_ra; ++ struct user_beancounter *f_ub; + + u64 f_version; + #ifdef CONFIG_SECURITY +@@ -816,6 +830,7 @@ struct file { + #ifdef CONFIG_DEBUG_WRITECOUNT + unsigned long f_mnt_write_state; + #endif ++ struct ve_struct *owner_env; + }; + extern spinlock_t files_lock; + #define file_list_lock() spin_lock(&files_lock); +@@ -924,6 +939,9 @@ struct file_lock { + struct file *fl_file; + unsigned char fl_flags; + unsigned char fl_type; ++#ifdef CONFIG_BEANCOUNTERS ++ unsigned char fl_charged; ++#endif + loff_t fl_start; + loff_t fl_end; + +@@ -1245,6 +1263,7 @@ struct file_operations { + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); + ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); + int (*setlease)(struct file *, long, struct file_lock **); ++ struct file * (*get_host)(struct file *); + }; + + struct inode_operations { +@@ -1311,6 +1330,7 @@ struct super_operations { + #ifdef CONFIG_QUOTA + ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); + ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); ++ struct inode *(*get_quota_root)(struct super_block *); + #endif + }; + +@@ -1487,8 +1507,14 @@ struct file_system_type { + struct lock_class_key i_mutex_key; + struct lock_class_key i_mutex_dir_key; + struct lock_class_key i_alloc_sem_key; ++ ++ struct file_system_type *proto; ++ struct ve_struct *owner_env; + }; + ++void get_filesystem(struct file_system_type *fs); ++void put_filesystem(struct file_system_type *fs); ++ + extern int get_sb_bdev(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int), +@@ -1528,6 +1554,11 @@ extern int register_filesystem(struct file_system_type *); + extern int unregister_filesystem(struct file_system_type *); + extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data); + #define kern_mount(type) kern_mount_data(type, NULL) ++extern int register_ve_fs_type(struct ve_struct *, struct file_system_type *, ++ struct file_system_type **, struct vfsmount **); ++extern void unregister_ve_fs_type(struct file_system_type *, struct vfsmount *); ++extern void umount_ve_fs_type(struct file_system_type *local_fs_type); ++#define kern_umount mntput + extern int may_umount_tree(struct vfsmount *); + extern int may_umount(struct vfsmount *); + extern long do_mount(char *, char *, char *, unsigned long, void *); +@@ -1535,6 +1566,7 @@ extern struct vfsmount *collect_mounts(struct vfsmount *, struct 
dentry *); + extern void drop_collected_mounts(struct vfsmount *); + + extern int vfs_statfs(struct dentry *, struct kstatfs *); ++extern int faudit_statfs(struct super_block *, struct kstatfs *); + + /* /sys/fs */ + extern struct kobject *fs_kobj; +@@ -1707,7 +1739,8 @@ extern int check_disk_change(struct block_device *); + extern int __invalidate_device(struct block_device *); + extern int invalidate_partition(struct gendisk *, int); + #endif +-extern int invalidate_inodes(struct super_block *); ++extern int invalidate_inodes_check(struct super_block *, int check); ++#define invalidate_inodes(sb) invalidate_inodes_check(sb, 0) + unsigned long __invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end, + bool be_atomic); +@@ -2128,6 +2161,17 @@ static inline void free_secdata(void *secdata) + { } + #endif /* CONFIG_SECURITY */ + ++static inline void *file_private(struct file *file) ++{ ++ struct file *host = file; ++ ++ while (host->f_op->get_host) { ++ host = host->f_op->get_host(host); ++ BUG_ON(host->f_mapping != file->f_mapping); ++ } ++ return host->private_data; ++} ++ + struct ctl_table; + int proc_nr_files(struct ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); +diff --git a/include/linux/futex.h b/include/linux/futex.h +index 586ab56..9bf4c37 100644 +--- a/include/linux/futex.h ++++ b/include/linux/futex.h +@@ -124,7 +124,7 @@ struct robust_list_head { + #ifdef __KERNEL__ + long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout, + u32 __user *uaddr2, u32 val2, u32 val3); +- ++long futex_wait_restart(struct restart_block *restart); + extern int + handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi); + +diff --git a/include/linux/gfp.h b/include/linux/gfp.h +index b414be3..fb1ad5a 100644 +--- a/include/linux/gfp.h ++++ b/include/linux/gfp.h +@@ -50,20 +50,25 @@ struct vm_area_struct; + #define __GFP_THISNODE ((__force gfp_t)0x40000u)/* No fallback, no policies */ + #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */ + #define __GFP_MOVABLE ((__force gfp_t)0x100000u) /* Page is movable */ ++#define __GFP_UBC ((__force gfp_t)0x200000u)/* charge kmem in buddy and slab */ ++#define __GFP_SOFT_UBC ((__force gfp_t)0x400000u)/* use soft charging */ + +-#define __GFP_BITS_SHIFT 21 /* Room for 21 __GFP_FOO bits */ ++#define __GFP_BITS_SHIFT 23 /* Room for __GFP_FOO bits */ + #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) + + /* This equals 0, but use constants in case they ever change */ + #define GFP_NOWAIT (GFP_ATOMIC & ~__GFP_HIGH) + /* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */ + #define GFP_ATOMIC (__GFP_HIGH) ++#define GFP_ATOMIC_UBC (__GFP_HIGH | __GFP_UBC) + #define GFP_NOIO (__GFP_WAIT) + #define GFP_NOFS (__GFP_WAIT | __GFP_IO) + #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) ++#define GFP_KERNEL_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_UBC) + #define GFP_TEMPORARY (__GFP_WAIT | __GFP_IO | __GFP_FS | \ + __GFP_RECLAIMABLE) + #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) ++#define GFP_USER_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_UBC) + #define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \ + __GFP_HIGHMEM) + #define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \ +diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h +index 181006c..5d48dcb 100644 +--- a/include/linux/hardirq.h ++++ 
b/include/linux/hardirq.h +@@ -7,6 +7,9 @@ + #include + #include + ++#include ++#include ++ + /* + * We put the hardirq and softirq counter into the preemption + * counter. The bitmask has the following meaning: +@@ -126,6 +129,24 @@ extern void rcu_irq_exit(void); + # define rcu_irq_exit() do { } while (0) + #endif /* CONFIG_PREEMPT_RCU */ + ++#define save_context() do { \ ++ struct task_struct *tsk; \ ++ if (hardirq_count() == HARDIRQ_OFFSET) { \ ++ tsk = current; \ ++ ve_save_context(tsk); \ ++ ub_save_context(tsk); \ ++ } \ ++ } while (0) ++ ++#define restore_context() do { \ ++ struct task_struct *tsk; \ ++ if (hardirq_count() == HARDIRQ_OFFSET) { \ ++ tsk = current; \ ++ ve_restore_context(tsk); \ ++ ub_restore_context(tsk); \ ++ } \ ++ } while (0) ++ + /* + * It is safe to do non-atomic ops on ->hardirq_context, + * because NMI handlers may not preempt and the ops are +@@ -137,6 +158,7 @@ extern void rcu_irq_exit(void); + rcu_irq_enter(); \ + account_system_vtime(current); \ + add_preempt_count(HARDIRQ_OFFSET); \ ++ save_context(); \ + trace_hardirq_enter(); \ + } while (0) + +@@ -152,6 +174,7 @@ extern void irq_enter(void); + do { \ + trace_hardirq_exit(); \ + account_system_vtime(current); \ ++ restore_context(); \ + sub_preempt_count(HARDIRQ_OFFSET); \ + rcu_irq_exit(); \ + } while (0) +diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h +index 6d93dce..b2c6f88 100644 +--- a/include/linux/hrtimer.h ++++ b/include/linux/hrtimer.h +@@ -342,6 +342,9 @@ extern long hrtimer_nanosleep(struct timespec *rqtp, + const enum hrtimer_mode mode, + const clockid_t clockid); + extern long hrtimer_nanosleep_restart(struct restart_block *restart_block); ++#ifdef CONFIG_COMPAT ++long compat_nanosleep_restart(struct restart_block *restart); ++#endif + + extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, + struct task_struct *tsk); +diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h +index 950e13d..0ca89e7 100644 +--- a/include/linux/if_bridge.h ++++ b/include/linux/if_bridge.h +@@ -44,6 +44,7 @@ + #define BRCTL_SET_PORT_PRIORITY 16 + #define BRCTL_SET_PATH_COST 17 + #define BRCTL_GET_FDB_ENTRIES 18 ++#define BRCTL_SET_VIA_ORIG_DEV 19 + + #define BR_STATE_DISABLED 0 + #define BR_STATE_LISTENING 1 +@@ -72,6 +73,7 @@ struct __bridge_info + __u32 tcn_timer_value; + __u32 topology_change_timer_value; + __u32 gc_timer_value; ++ __u8 via_phys_dev; + }; + + struct __port_info +@@ -106,9 +108,12 @@ struct __fdb_entry + + #include + ++#define BR_ALREADY_SEEN 1 ++ + extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); + extern struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, + struct sk_buff *skb); ++extern int (*br_hard_xmit_hook)(struct sk_buff *skb, struct net_bridge_port *port); + extern int (*br_should_route_hook)(struct sk_buff *skb); + + #endif +diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h +index 8c71fe2..df256e8 100644 +--- a/include/linux/if_tun.h ++++ b/include/linux/if_tun.h +@@ -19,6 +19,7 @@ + #define __IF_TUN_H + + #include ++#include + + /* Read queue size */ + #define TUN_READQ_SIZE 500 +@@ -55,4 +56,40 @@ struct tun_pi { + }; + #define TUN_PKT_STRIP 0x0001 + ++struct sk_buff_head; ++struct tun_struct { ++ struct list_head list; ++ unsigned long flags; ++ int attached; ++ void *bind_file; ++ uid_t owner; ++ gid_t group; ++ ++ wait_queue_head_t read_wait; ++ struct sk_buff_head readq; ++ ++ struct net_device *dev; ++ ++ struct fasync_struct *fasync; ++ ++ unsigned long if_flags; ++ u8 
dev_addr[ETH_ALEN]; ++ u32 chr_filter[2]; ++ u32 net_filter[2]; ++ ++#ifdef TUN_DEBUG ++ int debug; ++#endif ++}; ++ ++struct tun_net { ++ struct list_head dev_list; ++}; ++ ++extern int tun_net_open(struct net_device *dev); ++extern int tun_chr_open(struct inode *inode, struct file * file); ++extern void tun_net_init(struct net_device *dev); ++extern void tun_setup(struct net_device *dev); ++extern struct list_head tun_dev_list; ++ + #endif /* __IF_TUN_H */ +diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h +index 15ace02..ea02489 100644 +--- a/include/linux/if_vlan.h ++++ b/include/linux/if_vlan.h +@@ -88,6 +88,9 @@ struct vlan_group { + struct hlist_node hlist; /* linked list */ + struct net_device **vlan_devices_arrays[VLAN_GROUP_ARRAY_SPLIT_PARTS]; + struct rcu_head rcu; ++#ifdef CONFIG_VE ++ struct ve_struct *owner; ++#endif + }; + + static inline struct net_device *vlan_group_get_device(struct vlan_group *vg, +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 9927a88..f235468 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + + extern struct files_struct init_files; + +@@ -49,10 +50,17 @@ extern struct files_struct init_files; + .rlim = INIT_RLIMITS, \ + } + ++#ifdef CONFIG_VE ++/* one for ve0, one for init_task */ ++#define INIT_NSPROXY_COUNT ATOMIC_INIT(2) ++#else ++#define INIT_NSPROXY_COUNT ATOMIC_INIT(1) ++#endif ++ + extern struct nsproxy init_nsproxy; + #define INIT_NSPROXY(nsproxy) { \ + .pid_ns = &init_pid_ns, \ +- .count = ATOMIC_INIT(1), \ ++ .count = INIT_NSPROXY_COUNT, \ + .uts_ns = &init_uts_ns, \ + .mnt_ns = NULL, \ + INIT_NET_NS(net_ns) \ +@@ -179,6 +187,7 @@ extern struct group_info init_groups; + INIT_IDS \ + INIT_TRACE_IRQFLAGS \ + INIT_LOCKDEP \ ++ INIT_VZ_FAIRSCHED \ + } + + +diff --git a/include/linux/inotify.h b/include/linux/inotify.h +index 742b917..a935bb3 100644 +--- a/include/linux/inotify.h ++++ b/include/linux/inotify.h +@@ -67,6 +67,7 @@ struct inotify_event { + + #include + #include ++#include + + /* + * struct inotify_watch - represents a watch request on a specific inode +@@ -84,6 +85,7 @@ struct inotify_watch { + struct list_head i_list; /* entry in inode's list */ + atomic_t count; /* reference count */ + struct inotify_handle *ih; /* associated inotify handle */ ++ struct path path; + struct inode *inode; /* associated inode */ + __s32 wd; /* watch descriptor */ + __u32 mask; /* event mask for this watch */ +@@ -120,6 +122,8 @@ extern __s32 inotify_find_update_watch(struct inotify_handle *, struct inode *, + u32); + extern __s32 inotify_add_watch(struct inotify_handle *, struct inotify_watch *, + struct inode *, __u32); ++extern __s32 inotify_add_watch_dget(struct inotify_handle *, struct inotify_watch *, ++ struct path *, __u32); + extern __s32 inotify_clone_watch(struct inotify_watch *, struct inotify_watch *); + extern void inotify_evict_watch(struct inotify_watch *); + extern int inotify_rm_watch(struct inotify_handle *, struct inotify_watch *); +@@ -129,6 +133,66 @@ extern void inotify_remove_watch_locked(struct inotify_handle *, + extern void get_inotify_watch(struct inotify_watch *); + extern void put_inotify_watch(struct inotify_watch *); + ++/* ++ * struct inotify_handle - represents an inotify instance ++ * ++ * This structure is protected by the mutex 'mutex'. 
++ */ ++struct inotify_handle { ++ struct idr idr; /* idr mapping wd -> watch */ ++ struct mutex mutex; /* protects this bad boy */ ++ struct list_head watches; /* list of watches */ ++ atomic_t count; /* reference count */ ++ u32 last_wd; /* the last wd allocated */ ++ const struct inotify_operations *in_ops; /* inotify caller operations */ ++}; ++ ++ ++/* ++ * struct inotify_device - represents an inotify instance ++ * ++ * This structure is protected by the mutex 'mutex'. ++ */ ++struct inotify_device { ++ wait_queue_head_t wq; /* wait queue for i/o */ ++ struct mutex ev_mutex; /* protects event queue */ ++ struct mutex up_mutex; /* synchronizes watch updates */ ++ struct list_head events; /* list of queued events */ ++ atomic_t count; /* reference count */ ++ struct user_struct *user; /* user who opened this dev */ ++ struct inotify_handle *ih; /* inotify handle */ ++ struct fasync_struct *fa; /* async notification */ ++ unsigned int queue_size; /* size of the queue (bytes) */ ++ unsigned int event_count; /* number of pending events */ ++ unsigned int max_events; /* maximum number of events */ ++}; ++ ++/* ++ * struct inotify_kernel_event - An inotify event, originating from a watch and ++ * queued for user-space. A list of these is attached to each instance of the ++ * device. In read(), this list is walked and all events that can fit in the ++ * buffer are returned. ++ * ++ * Protected by dev->ev_mutex of the device in which we are queued. ++ */ ++struct inotify_kernel_event { ++ struct inotify_event event; /* the user-space event */ ++ struct list_head list; /* entry in inotify_device's list */ ++ char *name; /* filename, if any */ ++}; ++ ++/* ++ * struct inotify_user_watch - our version of an inotify_watch, we add ++ * a reference to the associated inotify_device. 
++ */ ++struct inotify_user_watch { ++ struct inotify_device *dev; /* associated device */ ++ struct inotify_watch wdata; /* inotify watch data */ ++}; ++ ++int inotify_create_watch(struct inotify_device *dev, struct path *p, u32 mask); ++ ++ + #else + + static inline void inotify_d_instantiate(struct dentry *dentry, +@@ -198,6 +262,13 @@ static inline __s32 inotify_add_watch(struct inotify_handle *ih, + return -EOPNOTSUPP; + } + ++static inline __s32 inotify_add_watch_dget(struct inotify_handle *h, ++ struct inotify_watch *w, ++ struct path *p, __u32 mask) ++{ ++ return -EOPNOTSUPP; ++} ++ + static inline int inotify_rm_watch(struct inotify_handle *ih, + struct inotify_watch *watch) + { +diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h +index f98a656..2d86ade 100644 +--- a/include/linux/ioprio.h ++++ b/include/linux/ioprio.h +@@ -39,6 +39,7 @@ enum { + IOPRIO_WHO_PROCESS = 1, + IOPRIO_WHO_PGRP, + IOPRIO_WHO_USER, ++ IOPRIO_WHO_UBC = 1000, + }; + + /* +diff --git a/include/linux/ipc.h b/include/linux/ipc.h +index b882610..67d186c 100644 +--- a/include/linux/ipc.h ++++ b/include/linux/ipc.h +@@ -81,6 +81,7 @@ struct ipc_kludge { + + #include + #include ++#include + + #define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ + +@@ -100,6 +101,15 @@ struct kern_ipc_perm + void *security; + }; + ++struct ipc_ids; ++ ++struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int); ++static inline void ipc_unlock(struct kern_ipc_perm *perm) ++{ ++ spin_unlock(&perm->lock); ++ rcu_read_unlock(); ++} ++ + #endif /* __KERNEL__ */ + + #endif /* _LINUX_IPC_H */ +diff --git a/include/linux/kdev_t.h b/include/linux/kdev_t.h +index 2dacab8..91783a7 100644 +--- a/include/linux/kdev_t.h ++++ b/include/linux/kdev_t.h +@@ -87,6 +87,57 @@ static inline unsigned sysv_minor(u32 dev) + return dev & 0x3ffff; + } + ++#define UNNAMED_MAJOR_COUNT 16 ++ ++#if UNNAMED_MAJOR_COUNT > 1 ++ ++extern int unnamed_dev_majors[UNNAMED_MAJOR_COUNT]; ++ ++static inline dev_t make_unnamed_dev(int idx) ++{ ++ /* ++ * Here we transfer bits from 8 to 8+log2(UNNAMED_MAJOR_COUNT) of the ++ * unnamed device index into major number. ++ */ ++ return MKDEV(unnamed_dev_majors[(idx >> 8) & (UNNAMED_MAJOR_COUNT - 1)], ++ idx & ~((UNNAMED_MAJOR_COUNT - 1) << 8)); ++} ++ ++static inline int unnamed_dev_idx(dev_t dev) ++{ ++ int i; ++ for (i = 0; i < UNNAMED_MAJOR_COUNT && ++ MAJOR(dev) != unnamed_dev_majors[i]; i++); ++ return MINOR(dev) | (i << 8); ++} ++ ++static inline int is_unnamed_dev(dev_t dev) ++{ ++ int i; ++ for (i = 0; i < UNNAMED_MAJOR_COUNT && ++ MAJOR(dev) != unnamed_dev_majors[i]; i++); ++ return i < UNNAMED_MAJOR_COUNT; ++} ++ ++#else /* UNNAMED_MAJOR_COUNT */ ++ ++static inline dev_t make_unnamed_dev(int idx) ++{ ++ return MKDEV(0, idx); ++} ++ ++static inline int unnamed_dev_idx(dev_t dev) ++{ ++ return MINOR(dev); ++} ++ ++static inline int is_unnamed_dev(dev_t dev) ++{ ++ return MAJOR(dev) == 0; ++} ++ ++#endif /* UNNAMED_MAJOR_COUNT */ ++ + #else /* __KERNEL__ */ + + /* +diff --git a/include/linux/kernel.h b/include/linux/kernel.h +index 2e70006..5112a04 100644 +--- a/include/linux/kernel.h ++++ b/include/linux/kernel.h +@@ -191,6 +191,12 @@ extern int log_buf_get_len(void); + extern int log_buf_read(int idx); + extern int log_buf_copy(char *dest, int idx, int len); + ++asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args) ++ __attribute__ ((format (printf, 2, 0))); ++asmlinkage int ve_printk(int, const char * fmt, ...) 
++ __attribute__ ((format (printf, 2, 3))); ++void prepare_printk(void); ++ + extern int printk_ratelimit_jiffies; + extern int printk_ratelimit_burst; + extern int printk_ratelimit(void); +@@ -208,6 +214,15 @@ static inline int __cold printk(const char *s, ...) { return 0; } + static inline int log_buf_get_len(void) { return 0; } + static inline int log_buf_read(int idx) { return 0; } + static inline int log_buf_copy(char *dest, int idx, int len) { return 0; } ++static inline int ve_printk(int d, const char *s, ...) ++ __attribute__ ((format (printf, 2, 3))); ++static inline int ve_printk(int d, const char *s, ...) ++{ ++ return 0; ++} ++static inline void prepare_printk(void) ++{ ++} + static inline int printk_ratelimit(void) { return 0; } + static inline int __printk_ratelimit(int ratelimit_jiffies, \ + int ratelimit_burst) { return 0; } +@@ -216,14 +231,23 @@ static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ + { return false; } + #endif + ++#define VE0_LOG 1 ++#define VE_LOG 2 ++#define VE_LOG_BOTH (VE0_LOG | VE_LOG) ++ + extern void __attribute__((format(printf, 1, 2))) + early_printk(const char *fmt, ...); + + unsigned long int_sqrt(unsigned long); + ++extern int console_silence_loglevel; ++ + static inline void console_silent(void) + { +- console_loglevel = 0; ++ if (console_loglevel > console_silence_loglevel) { ++ printk(KERN_EMERG "console shuts up ...\n"); ++ console_loglevel = 0; ++ } + } + + static inline void console_verbose(void) +@@ -237,6 +261,7 @@ extern void wake_up_klogd(void); + extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ + extern int panic_timeout; + extern int panic_on_oops; ++extern int decode_call_traces; + extern int panic_on_unrecovered_nmi; + extern int tainted; + extern const char *print_tainted(void); +diff --git a/include/linux/kobject.h b/include/linux/kobject.h +index 39e709f..74ee66d 100644 +--- a/include/linux/kobject.h ++++ b/include/linux/kobject.h +@@ -52,6 +52,8 @@ enum kobject_action { + KOBJ_REMOVE, + KOBJ_CHANGE, + KOBJ_MOVE, ++ KOBJ_START, ++ KOBJ_STOP, + KOBJ_ONLINE, + KOBJ_OFFLINE, + KOBJ_MAX +diff --git a/include/linux/major.h b/include/linux/major.h +index 0cb9805..93c234c 100644 +--- a/include/linux/major.h ++++ b/include/linux/major.h +@@ -170,4 +170,7 @@ + + #define VIOTAPE_MAJOR 230 + ++#define UNNAMED_EXTRA_MAJOR 130 ++#define UNNAMED_EXTRA_MAJOR_COUNT 120 ++ + #endif +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 586a943..484bd0b 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -702,15 +702,7 @@ static inline int page_mapped(struct page *page) + + extern void show_free_areas(void); + +-#ifdef CONFIG_SHMEM +-int shmem_lock(struct file *file, int lock, struct user_struct *user); +-#else +-static inline int shmem_lock(struct file *file, int lock, +- struct user_struct *user) +-{ +- return 0; +-} +-#endif ++#define shmem_nopage filemap_nopage + struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags); + + int shmem_zero_setup(struct vm_area_struct *); +@@ -776,7 +768,9 @@ void free_pgd_range(struct mmu_gather **tlb, unsigned long addr, + void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, + unsigned long floor, unsigned long ceiling); + int copy_page_range(struct mm_struct *dst, struct mm_struct *src, +- struct vm_area_struct *vma); ++ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); ++int __copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *vma, ++ unsigned long 
addr, size_t size); + void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows); + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 02a27ae..b9c983e 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -94,6 +94,14 @@ struct page { + #ifdef CONFIG_CGROUP_MEM_RES_CTLR + unsigned long page_cgroup; + #endif ++#ifdef CONFIG_BEANCOUNTERS ++ /* FIXME: switch to mainline memcgroup */ ++ union { ++ struct user_beancounter *page_ub; ++ struct page_beancounter *page_pb; ++ struct user_beancounter **slub_ubs; ++ } bc; ++#endif + }; + + /* +@@ -219,12 +227,18 @@ struct mm_struct { + + unsigned long flags; /* Must use atomic bitops to access the bits */ + ++ unsigned int vps_dumpable:2; ++ unsigned int oom_killed:1; ++ + /* coredumping support */ + struct completion *core_startup_done, core_done; + + /* aio bits */ + rwlock_t ioctx_list_lock; /* aio lock */ + struct kioctx *ioctx_list; ++#ifdef CONFIG_BEANCOUNTERS ++ struct user_beancounter *mm_ub; ++#endif + #ifdef CONFIG_MM_OWNER + /* + * "owner" points to a task that is regarded as the canonical +diff --git a/include/linux/mman.h b/include/linux/mman.h +index dab8892..a8528e1 100644 +--- a/include/linux/mman.h ++++ b/include/linux/mman.h +@@ -61,6 +61,9 @@ static inline unsigned long + calc_vm_flag_bits(unsigned long flags) + { + return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | ++#ifdef MAP_GROWSUP ++ _calc_vm_trans(flags, MAP_GROWSUP, VM_GROWSUP ) | ++#endif + _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | + _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) | + _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); +diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h +index 830bbcd..fdc1225 100644 +--- a/include/linux/mnt_namespace.h ++++ b/include/linux/mnt_namespace.h +@@ -24,6 +24,8 @@ struct proc_mounts { + + extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, + struct fs_struct *); ++extern struct rw_semaphore namespace_sem; ++ + extern void __put_mnt_ns(struct mnt_namespace *ns); + + static inline void put_mnt_ns(struct mnt_namespace *ns) +diff --git a/include/linux/mount.h b/include/linux/mount.h +index 4374d1a..af1d137 100644 +--- a/include/linux/mount.h ++++ b/include/linux/mount.h +@@ -71,6 +71,7 @@ struct vfsmount { + * are held, and all mnt_writer[]s on this mount have 0 as their ->count + */ + atomic_t __mnt_writers; ++ unsigned owner; + }; + + static inline struct vfsmount *mntget(struct vfsmount *mnt) +diff --git a/include/linux/msg.h b/include/linux/msg.h +index 56abf15..050f740 100644 +--- a/include/linux/msg.h ++++ b/include/linux/msg.h +@@ -107,6 +107,14 @@ extern long do_msgsnd(int msqid, long mtype, void __user *mtext, + extern long do_msgrcv(int msqid, long *pmtype, void __user *mtext, + size_t msgsz, long msgtyp, int msgflg); + ++int sysvipc_walk_msg(int (*func)(int, struct msg_queue*, void *), void *arg); ++int sysvipc_setup_msg(key_t key, int msqid, int msgflg); ++int sysv_msg_store(struct msg_msg *msg, ++ int (*store)(void * src, int len, int offset, void * data), ++ int len, void * data); ++struct msg_msg *sysv_msg_load(int (*load)(void * dst, int len, int offset, ++ void * data), int len, void * data); ++ + #endif /* __KERNEL__ */ + + #endif /* _LINUX_MSG_H */ +diff --git a/include/linux/namei.h b/include/linux/namei.h +index 24d88e9..5ac5b00 100644 +--- a/include/linux/namei.h ++++ b/include/linux/namei.h +@@ -56,6 +56,8 @@ enum {LAST_NORM, LAST_ROOT, 
LAST_DOT, LAST_DOTDOT, LAST_BIND}; + #define LOOKUP_CREATE (0x0200) + #define LOOKUP_ACCESS (0x0400) + #define LOOKUP_CHDIR (0x0800) ++#define LOOKUP_NOAREACHECK (0x1000) /* no area check on lookup */ ++#define LOOKUP_STRICT (0x2000) /* no symlinks or other filesystems */ + + extern int __user_walk(const char __user *, unsigned, struct nameidata *); + extern int __user_walk_fd(int dfd, const char __user *, unsigned, struct nameidata *); +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index 25f8710..bacc0a0 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -291,6 +291,11 @@ enum netdev_state_t + __LINK_STATE_QDISC_RUNNING, + }; + ++struct netdev_bc { ++ struct user_beancounter *exec_ub, *owner_ub; ++}; ++ ++#define netdev_bc(dev) (&(dev)->dev_bc) + + /* + * This structure holds at boot time configured netdevice settings. They +@@ -527,6 +532,10 @@ struct net_device + #define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT) + #define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT) + #define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT) ++/* device is venet device */ ++#define NETIF_F_VENET (1 << (NETIF_F_GSO_SHIFT - 1)) ++/* can be registered inside VE */ ++#define NETIF_F_VIRTUAL (1 << (NETIF_F_GSO_SHIFT - 2)) + + /* List of features with software fallbacks. */ + #define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6) +@@ -741,6 +750,9 @@ struct net_device + /* macvlan */ + struct macvlan_port *macvlan_port; + ++ struct ve_struct *owner_env; /* Owner VE of the interface */ ++ struct netdev_bc dev_bc; ++ + /* class/net/name entry */ + struct device dev; + /* space for optional statistics and wireless sysfs groups */ +@@ -762,6 +774,20 @@ struct net_device + }; + #define to_net_dev(d) container_of(d, struct net_device, dev) + ++#define NETDEV_HASHBITS 8 ++#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) ++ ++static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) ++{ ++ unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); ++ return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; ++} ++ ++static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) ++{ ++ return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; ++} ++ + #define NETDEV_ALIGN 32 + #define NETDEV_ALIGN_CONST (NETDEV_ALIGN - 1) + +@@ -1148,6 +1174,9 @@ extern int dev_ethtool(struct net *net, struct ifreq *); + extern unsigned dev_get_flags(const struct net_device *); + extern int dev_change_flags(struct net_device *, unsigned); + extern int dev_change_name(struct net_device *, char *); ++int __dev_change_net_namespace(struct net_device *, struct net *, const char *, ++ struct ve_struct *src_ve, struct ve_struct *dst_ve, ++ struct user_beancounter *exec_ub); + extern int dev_change_net_namespace(struct net_device *, + struct net *, const char *); + extern int dev_set_mtu(struct net_device *, int); +@@ -1513,6 +1542,18 @@ extern void linkwatch_run_queue(void); + + extern int netdev_compute_features(unsigned long all, unsigned long one); + ++#if defined(CONFIG_VE) && defined(CONFIG_NET) ++static inline int ve_is_dev_movable(struct net_device *dev) ++{ ++ return !(dev->features & (NETIF_F_VIRTUAL | NETIF_F_NETNS_LOCAL)); ++} ++#else ++static inline int ve_is_dev_movable(struct net_device *dev) ++{ ++ return 0; ++} ++#endif ++ + static inline int net_gso_ok(int features, int gso_type) + { + int feature = gso_type << NETIF_F_GSO_SHIFT; +diff --git a/include/linux/netfilter.h 
b/include/linux/netfilter.h
+index 0c5eb7e..8d41ea4 100644
+--- a/include/linux/netfilter.h
++++ b/include/linux/netfilter.h
+@@ -394,5 +394,24 @@ static inline struct net *nf_post_routing_net(const struct net_device *in,
+ #endif
+ }
+ 
++#ifdef CONFIG_VE_IPTABLES
++#include
++
++#define net_ipt_module_permitted(netns, ipt) \
++ (VE_IPT_CMP((netns)->owner_ve->ipt_mask, ipt) && \
++ VE_IPT_CMP((netns)->owner_ve->_iptables_modules, \
++ (ipt) & ~(ipt##_MOD)))
++
++#define net_ipt_module_set(netns, ipt) ({ \
++ (netns)->owner_ve->_iptables_modules |= ipt##_MOD; \
++ })
++#define net_is_ipt_module_set(netns, ipt) ( \
++ (netns)->owner_ve->_iptables_modules & (ipt##_MOD))
++#else
++#define net_ipt_module_permitted(netns, ipt) (1)
++#define net_ipt_module_set(netns, ipt)
++#define net_is_ipt_module_set(netns, ipt) (1)
++#endif
++
+ #endif /*__KERNEL__*/
+ #endif /*__LINUX_NETFILTER_H*/
+diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
+index 2326296..7a66377 100644
+--- a/include/linux/netfilter/x_tables.h
++++ b/include/linux/netfilter/x_tables.h
+@@ -302,6 +302,7 @@ struct xt_table_info
+ {
+ /* Size per table */
+ unsigned int size;
++ unsigned int alloc_size;
+ /* Number of entries: FIXME. --RR */
+ unsigned int number;
+ /* Initial number of entries. Needed for module usage count */
+diff --git a/include/linux/netfilter/xt_hashlimit.h b/include/linux/netfilter/xt_hashlimit.h
+index 51b18d8..439da56 100644
+--- a/include/linux/netfilter/xt_hashlimit.h
++++ b/include/linux/netfilter/xt_hashlimit.h
+@@ -63,4 +63,11 @@ struct xt_hashlimit_mtinfo1 {
+ struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
+ };
+ 
++#ifdef __KERNEL__
++struct ve_xt_hashlimit {
++ struct hlist_head hashlimit_htables;
++ struct proc_dir_entry *hashlimit_procdir4;
++ struct proc_dir_entry *hashlimit_procdir6;
++};
++#endif
+ #endif /*_XT_HASHLIMIT_H*/
+diff --git a/include/linux/netfilter_ipv4/ipt_recent.h b/include/linux/netfilter_ipv4/ipt_recent.h
+index 6508a45..3b9a1e8 100644
+--- a/include/linux/netfilter_ipv4/ipt_recent.h
++++ b/include/linux/netfilter_ipv4/ipt_recent.h
+@@ -24,4 +24,12 @@ struct ipt_recent_info {
+ u_int8_t side;
+ };
+ 
++#ifdef __KERNEL__
++struct ve_ipt_recent {
++ struct list_head tables;
++#ifdef CONFIG_PROC_FS
++ struct proc_dir_entry *proc_dir;
++#endif
++};
++#endif
+ #endif /*_IPT_RECENT_H*/
+diff --git a/include/linux/nfcalls.h b/include/linux/nfcalls.h
+new file mode 100644
+index 0000000..f968054
+--- /dev/null
++++ b/include/linux/nfcalls.h
+@@ -0,0 +1,172 @@
++/*
++ * include/linux/nfcalls.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _LINUX_NFCALLS_H
++#define _LINUX_NFCALLS_H
++
++#include
++
++#ifdef CONFIG_MODULES
++extern struct module no_module;
++
++#define DECL_KSYM_MODULE(name) \
++ extern struct module *vz_mod_##name
++
++#define INIT_KSYM_MODULE(name) \
++ struct module *vz_mod_##name = &no_module; \
++ EXPORT_SYMBOL(vz_mod_##name)
++
++static inline void __vzksym_modresolve(struct module **modp, struct module *mod)
++{
++ /*
++ * we want to be sure that pointer updates are visible first:
++ * 1. wmb() is here just to be on the safe side
++ * (note, no rmb() in KSYMSAFECALL)
++ * 2. synchronize_sched() guarantees that updates are visible
++ * on all cpus and allows us to remove rmb() in KSYMSAFECALL
++ */
++ wmb(); synchronize_sched();
++ *modp = mod;
++ /* just to be sure, our changes are visible as soon as possible */
++ wmb(); synchronize_sched();
++}
++
++static inline void __vzksym_modunresolve(struct module **modp)
++{
++ /*
++ * try_module_get() in KSYMSAFECALL should fail at this moment since
++ * THIS_MODULE is in unloading state (we should be called from fini),
++ * no need to synchronize pointers/ve_module updates.
++ */
++ *modp = &no_module;
++ /*
++ * synchronize_sched() guarantees here that we see
++ * updated module pointer before the module really gets away
++ */
++ synchronize_sched();
++}
++
++static inline int __vzksym_module_get(struct module *mod)
++{
++ /*
++ * we want to avoid rmb(), so use synchronize_sched() in KSYMUNRESOLVE
++ * and smp_read_barrier_depends() here...
++ */
++ smp_read_barrier_depends(); /* for module loading */
++ if (!try_module_get(mod))
++ return -EBUSY;
++
++ return 0;
++}
++
++static inline void __vzksym_module_put(struct module *mod)
++{
++ module_put(mod);
++}
++#else
++#define DECL_KSYM_MODULE(name)
++#define INIT_KSYM_MODULE(name)
++#define __vzksym_modresolve(modp, mod)
++#define __vzksym_modunresolve(modp)
++#define __vzksym_module_get(mod) 0
++#define __vzksym_module_put(mod)
++#endif
++
++#define __KSYMERRCALL(err, type, mod, name, args) \
++({ \
++ type ret = (type)err; \
++ if (!__vzksym_module_get(vz_mod_##mod)) { \
++ if (vz_##name) \
++ ret = ((*vz_##name)args); \
++ __vzksym_module_put(vz_mod_##mod); \
++ } \
++ ret; \
++})
++
++#define __KSYMSAFECALL_VOID(mod, name, args) \
++ do { \
++ if (!__vzksym_module_get(vz_mod_##mod)) { \
++ if (vz_##name) \
++ ((*vz_##name)args); \
++ __vzksym_module_put(vz_mod_##mod); \
++ } \
++ } while (0)
++
++#define DECL_KSYM_CALL(type, name, args) \
++ extern type (*vz_##name) args
++#define INIT_KSYM_CALL(type, name, args) \
++ type (*vz_##name) args; \
++EXPORT_SYMBOL(vz_##name)
++
++#define KSYMERRCALL(err, mod, name, args) \
++ __KSYMERRCALL(err, int, mod, name, args)
++#define KSYMSAFECALL(type, mod, name, args) \
++ __KSYMERRCALL(0, type, mod, name, args)
++#define KSYMSAFECALL_VOID(mod, name, args) \
++ __KSYMSAFECALL_VOID(mod, name, args)
++#define KSYMREF(name) vz_##name
++
++/* should be called _after_ KSYMRESOLVE's */
++#define KSYMMODRESOLVE(name) \
++ __vzksym_modresolve(&vz_mod_##name, THIS_MODULE)
++#define KSYMMODUNRESOLVE(name) \
++ __vzksym_modunresolve(&vz_mod_##name)
++
++#define KSYMRESOLVE(name) \
++ vz_##name = &name
++#define KSYMUNRESOLVE(name) \
++ vz_##name = NULL
++
++#if defined(CONFIG_VE)
++DECL_KSYM_MODULE(ip_tables);
++DECL_KSYM_MODULE(ip6_tables);
++DECL_KSYM_MODULE(iptable_filter);
++DECL_KSYM_MODULE(ip6table_filter);
++DECL_KSYM_MODULE(iptable_mangle);
++DECL_KSYM_MODULE(ip6table_mangle);
++DECL_KSYM_MODULE(ip_conntrack);
++DECL_KSYM_MODULE(nf_conntrack);
++DECL_KSYM_MODULE(nf_conntrack_ipv4);
++DECL_KSYM_MODULE(nf_conntrack_ipv6);
++DECL_KSYM_MODULE(xt_conntrack);
++DECL_KSYM_MODULE(ip_nat);
++DECL_KSYM_MODULE(nf_nat);
++DECL_KSYM_MODULE(iptable_nat);
++
++struct sk_buff;
++
++DECL_KSYM_CALL(int, init_iptable_conntrack, (void));
++DECL_KSYM_CALL(int, nf_conntrack_init_ve, (void));
++DECL_KSYM_CALL(int, init_nf_ct_l3proto_ipv4, (void));
++DECL_KSYM_CALL(int, init_nf_ct_l3proto_ipv6, (void));
++DECL_KSYM_CALL(int, nf_nat_init, (void));
++DECL_KSYM_CALL(int, init_nftable_nat, (void));
++DECL_KSYM_CALL(int, nf_nat_init, (void));
++DECL_KSYM_CALL(void, fini_nftable_nat, (void)); ++DECL_KSYM_CALL(void, nf_nat_cleanup, (void)); ++DECL_KSYM_CALL(void, fini_iptable_conntrack, (void)); ++DECL_KSYM_CALL(void, nf_conntrack_cleanup_ve, (void)); ++DECL_KSYM_CALL(void, fini_nf_ct_l3proto_ipv4, (void)); ++DECL_KSYM_CALL(void, fini_nf_ct_l3proto_ipv6, (void)); ++ ++#include ++#endif ++ ++#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) ++DECL_KSYM_MODULE(vzethdev); ++DECL_KSYM_CALL(int, veth_open, (struct net_device *dev)); ++#endif ++ ++#if defined(CONFIG_VE_CALLS) || defined(CONFIG_VE_CALLS_MODULE) ++DECL_KSYM_MODULE(vzmon); ++DECL_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env)); ++#endif ++ ++#endif /* _LINUX_NFCALLS_H */ +diff --git a/include/linux/notifier.h b/include/linux/notifier.h +index 0ff6224..1e22bad 100644 +--- a/include/linux/notifier.h ++++ b/include/linux/notifier.h +@@ -153,8 +153,9 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, + + #define NOTIFY_DONE 0x0000 /* Don't care */ + #define NOTIFY_OK 0x0001 /* Suits me */ ++#define NOTIFY_FAIL 0x0002 /* Reject */ + #define NOTIFY_STOP_MASK 0x8000 /* Don't call further */ +-#define NOTIFY_BAD (NOTIFY_STOP_MASK|0x0002) ++#define NOTIFY_BAD (NOTIFY_STOP_MASK|NOTIFY_FAIL) + /* Bad/Veto action */ + /* + * Clean way to return from the notifier and stop further calls. +diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h +index 0e66b57..dd6d50f 100644 +--- a/include/linux/nsproxy.h ++++ b/include/linux/nsproxy.h +@@ -66,6 +66,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk); + void exit_task_namespaces(struct task_struct *tsk); + void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); + void free_nsproxy(struct nsproxy *ns); ++struct mnt_namespace * get_task_mnt_ns(struct task_struct *tsk); + int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, + struct fs_struct *); + +@@ -76,9 +77,10 @@ static inline void put_nsproxy(struct nsproxy *ns) + } + } + +-static inline void get_nsproxy(struct nsproxy *ns) ++static inline struct nsproxy *get_nsproxy(struct nsproxy *ns) + { + atomic_inc(&ns->count); ++ return ns; + } + + #ifdef CONFIG_CGROUP_NS +diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h +index f31debf..5b44dd6 100644 +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -157,6 +157,7 @@ PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) + __PAGEFLAG(Slab, slab) + PAGEFLAG(Checked, owner_priv_1) /* Used by some filesystems */ + PAGEFLAG(Pinned, owner_priv_1) TESTSCFLAG(Pinned, owner_priv_1) /* Xen */ ++PAGEFLAG(Checkpointed, owner_priv_1) + PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) + PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private) + __SETPAGEFLAG(Private, private) +diff --git a/include/linux/percpu.h b/include/linux/percpu.h +index 4cdd393..5ac97e1 100644 +--- a/include/linux/percpu.h ++++ b/include/linux/percpu.h +@@ -74,6 +74,13 @@ struct percpu_data { + (__typeof__(ptr))__p->ptrs[(cpu)]; \ + }) + ++#define static_percpu_ptr(sptr, sptrs) ({ \ ++ int i; \ ++ for (i = 0; i < NR_CPUS; i++) \ ++ (sptr)->ptrs[i] = &(sptrs)[i]; \ ++ (__typeof__(&sptrs[0]))__percpu_disguise(sptr);\ ++ }) ++ + extern void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu); + extern void percpu_depopulate(void *__pdata, int cpu); + extern int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, +@@ -85,6 +92,7 @@ extern void percpu_free(void *__pdata); + #else /* CONFIG_SMP */ + + 
#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
++#define static_percpu_ptr(sptr, sptrs) (&sptrs[0])
+ 
+ static inline void percpu_depopulate(void *__pdata, int cpu)
+ {
+diff --git a/include/linux/pid.h b/include/linux/pid.h
+index c21c7e8..4331c6b 100644
+--- a/include/linux/pid.h
++++ b/include/linux/pid.h
+@@ -59,6 +59,9 @@ struct pid
+ atomic_t count;
+ /* lists of tasks that use this pid */
+ struct hlist_head tasks[PIDTYPE_MAX];
++#ifdef CONFIG_BEANCOUNTERS
++ struct user_beancounter *ub;
++#endif
+ struct rcu_head rcu;
+ unsigned int level;
+ struct upid numbers[1];
+@@ -96,6 +99,11 @@ extern void change_pid(struct task_struct *task, enum pid_type,
+ struct pid *pid);
+ extern void transfer_pid(struct task_struct *old, struct task_struct *new,
+ enum pid_type);
++extern void reattach_pid(struct task_struct *, enum pid_type, struct pid *);
++extern int alloc_pidmap(struct pid_namespace *pid_ns);
++extern int set_pidmap(struct pid_namespace *pid_ns, pid_t pid);
++
++extern spinlock_t pidmap_lock;
+ 
+ struct pid_namespace;
+ extern struct pid_namespace init_pid_ns;
+@@ -121,8 +129,11 @@ extern struct pid *find_get_pid(int nr);
+ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
+ int next_pidmap(struct pid_namespace *pid_ns, int last);
+ 
+-extern struct pid *alloc_pid(struct pid_namespace *ns);
++extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t vpid);
+ extern void free_pid(struct pid *pid);
++extern int pid_ns_attach_init(struct pid_namespace *, struct task_struct *);
++extern int pid_ns_attach_task(struct pid_namespace *, struct task_struct *);
++pid_t pid_to_vpid(pid_t nr);
+ 
+ /*
+ * the helpers to get the pid's id seen from different namespaces
+diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
+index caff528..82514f2 100644
+--- a/include/linux/pid_namespace.h
++++ b/include/linux/pid_namespace.h
+@@ -14,6 +14,14 @@ struct pidmap {
+ 
+ #define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8)
+ 
++/* pid namespace flags */
++
++/* if set, newly created child pid namespaces get the PID_NS_HIDE_CHILD flag */
++#define PID_NS_HIDE_CHILD 0x00000001
++
++/* if set, newly created processes are invisible from the parent ns */
++#define PID_NS_HIDDEN 0x00000002
++
+ struct pid_namespace {
+ struct kref kref;
+ struct pidmap pidmap[PIDMAP_ENTRIES];
+@@ -22,6 +30,7 @@ struct pid_namespace {
+ struct kmem_cache *pid_cachep;
+ unsigned int level;
+ struct pid_namespace *parent;
++ unsigned flags;
+ #ifdef CONFIG_PROC_FS
+ struct vfsmount *proc_mnt;
+ #endif
+diff --git a/include/linux/poll.h b/include/linux/poll.h
+index ef45382..c1bf82a 100644
+--- a/include/linux/poll.h
++++ b/include/linux/poll.h
+@@ -119,6 +119,7 @@ extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds,
+ s64 *timeout);
+ extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
+ fd_set __user *exp, s64 *timeout);
++long do_restart_poll(struct restart_block *restart_block);
+ 
+ #endif /* KERNEL */
+ 
+diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
+index fff1d27..62e16d4 100644
+--- a/include/linux/proc_fs.h
++++ b/include/linux/proc_fs.h
+@@ -125,7 +125,10 @@ extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent);
+ extern struct vfsmount *proc_mnt;
+ struct pid_namespace;
+ extern int proc_fill_super(struct super_block *);
+-extern struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
++extern struct inode *proc_get_inode(struct super_block *, unsigned int,
++ struct
proc_dir_entry *glob, struct proc_dir_entry *loc); ++ ++extern struct file_system_type proc_fs_type; + + /* + * These are generic /proc routines that use the internal +@@ -174,6 +177,8 @@ extern struct proc_dir_entry *proc_mkdir(const char *,struct proc_dir_entry *); + extern struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, + struct proc_dir_entry *parent); + ++extern struct proc_dir_entry glob_proc_root; ++ + static inline struct proc_dir_entry *proc_create(const char *name, mode_t mode, + struct proc_dir_entry *parent, const struct file_operations *proc_fops) + { +@@ -287,6 +292,9 @@ struct proc_inode { + int fd; + union proc_op op; + struct proc_dir_entry *pde; ++#ifdef CONFIG_VE ++ struct proc_dir_entry *lpde; ++#endif + struct inode vfs_inode; + }; + +@@ -300,6 +308,15 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode) + return PROC_I(inode)->pde; + } + ++static inline struct proc_dir_entry *LPDE(const struct inode *inode) ++{ ++#ifdef CONFIG_VE ++ return PROC_I(inode)->lpde; ++#else ++ return NULL; ++#endif ++} ++ + static inline struct net *PDE_NET(struct proc_dir_entry *pde) + { + return pde->parent->data; +diff --git a/include/linux/quota.h b/include/linux/quota.h +index dcddfb2..97dacfd 100644 +--- a/include/linux/quota.h ++++ b/include/linux/quota.h +@@ -166,6 +166,10 @@ enum { + #include + #include + ++#include ++ ++extern spinlock_t dq_data_lock; ++ + #include + #include + #include +@@ -282,6 +286,8 @@ struct quota_format_ops { + int (*release_dqblk)(struct dquot *dquot); /* Called when last reference to dquot is being dropped */ + }; + ++struct inode; ++struct iattr; + /* Operations working with dquots */ + struct dquot_operations { + int (*initialize) (struct inode *, int); +@@ -296,9 +302,11 @@ struct dquot_operations { + int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */ + int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */ + int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */ ++ int (*rename) (struct inode *, struct inode *, struct inode *); + }; + + /* Operations handling requests from userspace */ ++struct v2_disk_dqblk; + struct quotactl_ops { + int (*quota_on)(struct super_block *, int, int, char *, int); + int (*quota_off)(struct super_block *, int, int); +@@ -311,6 +319,10 @@ struct quotactl_ops { + int (*set_xstate)(struct super_block *, unsigned int, int); + int (*get_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); + int (*set_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); ++#ifdef CONFIG_QUOTA_COMPAT ++ int (*get_quoti)(struct super_block *, int, unsigned int, ++ struct v2_disk_dqblk __user *); ++#endif + }; + + struct quota_format_type { +@@ -335,6 +347,10 @@ struct quota_info { + struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ + struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ + struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ ++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) ++ struct vz_quota_master *vzdq_master; ++ int vzdq_count; ++#endif + }; + + #define sb_has_quota_enabled(sb, type) ((type)==USRQUOTA ? 
\ +diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h +index f867020..a1bcf67 100644 +--- a/include/linux/quotaops.h ++++ b/include/linux/quotaops.h +@@ -170,6 +170,19 @@ static inline int DQUOT_TRANSFER(struct inode *inode, struct iattr *iattr) + return 0; + } + ++static __inline__ int DQUOT_RENAME(struct inode *inode, ++ struct inode *old_dir, struct inode *new_dir) ++{ ++ struct dquot_operations *q_op; ++ ++ q_op = inode->i_sb->dq_op; ++ if (q_op && q_op->rename) { ++ if (q_op->rename(inode, old_dir, new_dir) == NO_QUOTA) ++ return 1; ++ } ++ return 0; ++} ++ + /* The following two functions cannot be called inside a transaction */ + static inline void DQUOT_SYNC(struct super_block *sb) + { +@@ -244,6 +257,12 @@ static inline int DQUOT_TRANSFER(struct inode *inode, struct iattr *iattr) + return 0; + } + ++static inline int DQUOT_RENAME(struct inode *inode, struct inode *old_dir, ++ struct inode *new_dir) ++{ ++ return 0; ++} ++ + static inline int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) + { + inode_add_bytes(inode, nr); +diff --git a/include/linux/rmap.h b/include/linux/rmap.h +index 1383692..be68e7a 100644 +--- a/include/linux/rmap.h ++++ b/include/linux/rmap.h +@@ -74,6 +74,8 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); + void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); + void page_add_file_rmap(struct page *); + void page_remove_rmap(struct page *, struct vm_area_struct *); ++struct anon_vma *page_lock_anon_vma(struct page *page); ++void page_unlock_anon_vma(struct anon_vma *anon_vma); + + #ifdef CONFIG_DEBUG_VM + void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address); +diff --git a/include/linux/sched.h b/include/linux/sched.h +index c5d3f84..272da80 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -29,6 +29,10 @@ + #define CLONE_NEWNET 0x40000000 /* New network namespace */ + #define CLONE_IO 0x80000000 /* Clone io context */ + ++/* mask of clones which are disabled in OpenVZ VEs */ ++#define CLONE_NAMESPACES_MASK (CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | \ ++ CLONE_NEWPID | CLONE_NEWNET) ++ + /* + * Scheduling policies + */ +@@ -90,6 +94,8 @@ struct sched_param { + + #include + ++#include ++ + struct mem_cgroup; + struct exec_domain; + struct futex_pi_state; +@@ -126,15 +132,38 @@ extern unsigned long avenrun[]; /* Load averages */ + load += n*(FIXED_1-exp); \ + load >>= FSHIFT; + ++#define LOAD_INT(x) ((x) >> FSHIFT) ++#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) ++ + extern unsigned long total_forks; + extern int nr_threads; + DECLARE_PER_CPU(unsigned long, process_counts); + extern int nr_processes(void); + extern unsigned long nr_running(void); ++extern unsigned long nr_sleeping(void); ++extern unsigned long nr_stopped(void); + extern unsigned long nr_uninterruptible(void); + extern unsigned long nr_active(void); + extern unsigned long nr_iowait(void); + extern unsigned long weighted_cpuload(const int cpu); ++extern atomic_t nr_dead; ++extern unsigned long nr_zombie; ++ ++#ifdef CONFIG_VE ++struct ve_struct; ++extern unsigned long nr_running_ve(struct ve_struct *); ++extern unsigned long nr_iowait_ve(struct ve_struct *); ++extern unsigned long nr_uninterruptible_ve(struct ve_struct *); ++extern cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu); ++extern cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu); ++void ve_sched_attach(struct ve_struct *envid); ++#else ++#define 
nr_running_ve(ve) 0
++#define nr_iowait_ve(ve) 0
++#define nr_uninterruptible_ve(ve) 0
++#define ve_sched_get_idle_time(ve, cpu) 0
++#define ve_sched_get_iowait_time(ve, cpu) 0
++#endif
+ 
+ struct seq_file;
+ struct cfs_rq;
+@@ -269,6 +298,7 @@ static inline void show_state(void)
+ }
+ 
+ extern void show_regs(struct pt_regs *);
++extern void smp_show_regs(struct pt_regs *, void *);
+ 
+ /*
+ * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
+@@ -423,6 +453,9 @@ struct pacct_struct {
+ unsigned long ac_minflt, ac_majflt;
+ };
+ 
++#include
++#include
++
+ /*
+ * NOTE! "signal_struct" does not have it's own
+ * locking, because a shared signal_struct always
+@@ -1092,6 +1125,7 @@ struct task_struct {
+ /* ??? */
+ unsigned int personality;
+ unsigned did_exec:1;
++ unsigned did_ve_enter:1;
+ pid_t pid;
+ pid_t tgid;
+ 
+@@ -1289,6 +1323,14 @@ struct task_struct {
+ struct rcu_head rcu;
+ 
+ /*
++ * state tracking for suspend
++ * FIXME - ptrace is completely rewritten in this kernel
++ * so set_pn_state() is not set in many places correctly
++ */
++ __u8 pn_state;
++ __u8 stopped_state:1;
++
++ /*
+ * cache last used pipe for splice
+ */
+ struct pipe_inode_info *splice_pipe;
+@@ -1303,6 +1345,19 @@ struct task_struct {
+ int latency_record_count;
+ struct latency_record latency_record[LT_SAVECOUNT];
+ #endif
++#ifdef CONFIG_BEANCOUNTERS
++ struct task_beancounter task_bc;
++#endif
++#ifdef CONFIG_VE
++ struct ve_task_info ve_task_info;
++#endif
++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
++ unsigned long magic;
++ struct inode *ino;
++#endif
++#ifdef CONFIG_VZ_FAIRSCHED
++ struct fairsched_node *fsched_node;
++#endif
+ };
+ 
+ /*
+@@ -1477,6 +1532,43 @@ static inline void put_task_struct(struct task_struct *t)
+ __put_task_struct(t);
+ }
+ 
++#ifndef CONFIG_VE
++#define set_pn_state(tsk, state) do { } while(0)
++#define clear_pn_state(tsk) do { } while(0)
++#define set_stop_state(tsk) do { } while(0)
++#define clear_stop_state(tsk) do { } while(0)
++#else
++#define PN_STOP_TF 1 /* was not in 2.6.8 */
++#define PN_STOP_TF_RT 2 /* was not in 2.6.8 */
++#define PN_STOP_ENTRY 3
++#define PN_STOP_FORK 4
++#define PN_STOP_VFORK 5
++#define PN_STOP_SIGNAL 6
++#define PN_STOP_EXIT 7
++#define PN_STOP_EXEC 8
++#define PN_STOP_LEAVE 9
++
++static inline void set_pn_state(struct task_struct *tsk, int state)
++{
++ tsk->pn_state = state;
++}
++
++static inline void clear_pn_state(struct task_struct *tsk)
++{
++ tsk->pn_state = 0;
++}
++
++static inline void set_stop_state(struct task_struct *tsk)
++{
++ tsk->stopped_state = 1;
++}
++
++static inline void clear_stop_state(struct task_struct *tsk)
++{
++ tsk->stopped_state = 0;
++}
++#endif
++
+ /*
+ * Per process flags
+ */
+@@ -1493,6 +1585,7 @@ static inline void put_task_struct(struct task_struct *t)
+ #define PF_MEMALLOC 0x00000800 /* Allocating memory */
+ #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */
+ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */
++#define PF_EXIT_RESTART 0x00004000 /* do_exit() restarted, see do_exit() */
+ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */
+ #define PF_FROZEN 0x00010000 /* frozen for system suspend */
+ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
+@@ -1590,6 +1683,21 @@ extern unsigned long long cpu_clock(int cpu);
+ extern unsigned long long
+ task_sched_runtime(struct task_struct *task);
+ 
++static inline unsigned long cycles_to_clocks(cycles_t cycles)
++{
++ extern unsigned long cycles_per_clock; ++ do_div(cycles, cycles_per_clock); ++ return cycles; ++} ++ ++static inline u64 cycles_to_jiffies(cycles_t cycles) ++{ ++ extern unsigned long cycles_per_jiffy; ++ do_div(cycles, cycles_per_jiffy); ++ return cycles; ++} ++ ++ + /* sched_exec is called by processes performing an exec */ + #ifdef CONFIG_SMP + extern void sched_exec(void); +@@ -1727,6 +1835,7 @@ static inline struct user_struct *get_uid(struct user_struct *u) + extern void free_uid(struct user_struct *); + extern void switch_uid(struct user_struct *); + extern void release_uids(struct user_namespace *ns); ++extern int set_user(uid_t uid, int dumpclear); + + #include + +@@ -1859,6 +1968,13 @@ extern int disallow_signal(int); + + extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); + extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); ++extern long do_fork_pid(unsigned long clone_flags, ++ unsigned long stack_start, ++ struct pt_regs *regs, ++ unsigned long stack_size, ++ int __user *parent_tidptr, ++ int __user *child_tidptr, ++ long pid0); + struct task_struct *fork_idle(int); + + extern void set_task_comm(struct task_struct *tsk, char *from); +@@ -1873,19 +1989,19 @@ extern void wait_task_inactive(struct task_struct * p); + #define remove_parent(p) list_del_init(&(p)->sibling) + #define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children) + +-#define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) ++#define next_task_all(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) + +-#define for_each_process(p) \ +- for (p = &init_task ; (p = next_task(p)) != &init_task ; ) ++#define for_each_process_all(p) \ ++ for (p = &init_task ; (p = next_task_all(p)) != &init_task ; ) + + /* + * Careful: do_each_thread/while_each_thread is a double loop so + * 'break' will not work as expected - use goto instead. + */ +-#define do_each_thread(g, t) \ +- for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do ++#define do_each_thread_all(g, t) \ ++ for (g = t = &init_task ; (g = t = next_task_all(g)) != &init_task ; ) do + +-#define while_each_thread(g, t) \ ++#define while_each_thread_all(g, t) \ + while ((t = next_thread(t)) != g) + + /* de_thread depends on thread_group_leader not being a pid based check */ +@@ -1910,8 +2026,15 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2) + + static inline struct task_struct *next_thread(const struct task_struct *p) + { +- return list_entry(rcu_dereference(p->thread_group.next), ++ struct task_struct *tsk; ++ ++ tsk = list_entry(rcu_dereference(p->thread_group.next), + struct task_struct, thread_group); ++#ifdef CONFIG_VE ++ /* all threads should belong to ONE ve! 
*/ ++ BUG_ON(VE_TASK_INFO(tsk)->owner_env != VE_TASK_INFO(p)->owner_env); ++#endif ++ return tsk; + } + + static inline int thread_group_empty(struct task_struct *p) +@@ -1951,6 +2074,98 @@ static inline void unlock_task_sighand(struct task_struct *tsk, + spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); + } + ++#ifndef CONFIG_VE ++ ++#define for_each_process_ve(p) for_each_process_all(p) ++#define do_each_thread_ve(g, t) do_each_thread_all(g, t) ++#define while_each_thread_ve(g, t) while_each_thread_all(g, t) ++#define first_task_ve() next_task_ve(&init_task) ++#define __first_task_ve(owner) next_task_ve(&init_task) ++#define __next_task_ve(owner, p) next_task_ve(p) ++#define next_task_ve(p) \ ++ (next_task_all(p) != &init_task ? next_task_all(p) : NULL) ++ ++#define ve_is_super(env) 1 ++#define ve_accessible(target, owner) 1 ++#define ve_accessible_strict(target, owner) 1 ++#define ve_accessible_veid(target, owner) 1 ++#define ve_accessible_strict_veid(target, owner) 1 ++ ++#define VEID(ve) 0 ++ ++#else /* CONFIG_VE */ ++ ++#include ++ ++#define ve_is_super(env) ((env) == get_ve0()) ++ ++#define ve_accessible_strict(target, owner) ((target) == (owner)) ++static inline int ve_accessible(struct ve_struct *target, ++ struct ve_struct *owner) ++{ ++ return ve_is_super(owner) || ve_accessible_strict(target, owner); ++} ++ ++#define ve_accessible_strict_veid(target, owner) ((target) == (owner)) ++static inline int ve_accessible_veid(envid_t target, envid_t owner) ++{ ++ return get_ve0()->veid == owner || ++ ve_accessible_strict_veid(target, owner); ++} ++ ++#define VEID(ve) (ve->veid) ++ ++static inline struct task_struct *ve_lh2task(struct ve_struct *ve, ++ struct list_head *lh) ++{ ++ return lh == &ve->vetask_lh ? NULL : ++ list_entry(lh, struct task_struct, ve_task_info.vetask_list); ++} ++ ++static inline struct task_struct *__first_task_ve(struct ve_struct *ve) ++{ ++ struct task_struct *tsk; ++ ++ if (unlikely(ve_is_super(ve))) { ++ tsk = next_task_all(&init_task); ++ if (tsk == &init_task) ++ tsk = NULL; ++ } else { ++ tsk = ve_lh2task(ve, rcu_dereference(ve->vetask_lh.next)); ++ } ++ return tsk; ++} ++ ++static inline struct task_struct *__next_task_ve(struct ve_struct *ve, ++ struct task_struct *tsk) ++{ ++ if (unlikely(ve_is_super(ve))) { ++ tsk = next_task_all(tsk); ++ if (tsk == &init_task) ++ tsk = NULL; ++ } else { ++ BUG_ON(tsk->ve_task_info.owner_env != ve); ++ tsk = ve_lh2task(ve, rcu_dereference(tsk-> ++ ve_task_info.vetask_list.next)); ++ } ++ return tsk; ++} ++ ++#define first_task_ve() __first_task_ve(get_exec_env()) ++#define next_task_ve(p) __next_task_ve(get_exec_env(), p) ++/* no one uses prev_task_ve(), copy next_task_ve() if needed */ ++ ++#define for_each_process_ve(p) \ ++ for (p = first_task_ve(); p != NULL ; p = next_task_ve(p)) ++ ++#define do_each_thread_ve(g, t) \ ++ for (g = t = first_task_ve() ; g != NULL; g = t = next_task_ve(g)) do ++ ++#define while_each_thread_ve(g, t) \ ++ while ((t = next_thread(t)) != g) ++ ++#endif /* CONFIG_VE */ ++ + #ifndef __HAVE_THREAD_FUNCTIONS + + #define task_thread_info(task) ((struct thread_info *)(task)->stack) +diff --git a/include/linux/sem.h b/include/linux/sem.h +index c8eaad9..380e1d1 100644 +--- a/include/linux/sem.h ++++ b/include/linux/sem.h +@@ -154,6 +154,9 @@ static inline void exit_sem(struct task_struct *tsk) + } + #endif + ++int sysvipc_walk_sem(int (*func)(int, struct sem_array*, void *), void *arg); ++int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg); ++ + #endif /* __KERNEL__ */ + 
+ #endif /* _LINUX_SEM_H */ +diff --git a/include/linux/shm.h b/include/linux/shm.h +index eca6235..c2b3bb5 100644 +--- a/include/linux/shm.h ++++ b/include/linux/shm.h +@@ -83,6 +83,22 @@ struct shm_info { + }; + + #ifdef __KERNEL__ ++ ++#include ++ ++#define IPC_SEM_IDS 0 ++#define IPC_MSG_IDS 1 ++#define IPC_SHM_IDS 2 ++ ++struct shm_file_data { ++ int id; ++ struct ipc_namespace *ns; ++ struct file *file; ++ const struct vm_operations_struct *vm_ops; ++}; ++#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) ++#define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) ++ + struct shmid_kernel /* private to the kernel */ + { + struct kern_ipc_perm shm_perm; +@@ -97,6 +113,23 @@ struct shmid_kernel /* private to the kernel */ + struct user_struct *mlock_user; + }; + ++/* ++ * shm_lock_(check_) routines are called in the paths where the rw_mutex ++ * is not held. ++ */ ++static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) ++{ ++ struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id); ++ ++ if (IS_ERR(ipcp)) ++ return (struct shmid_kernel *)ipcp; ++ ++ return container_of(ipcp, struct shmid_kernel, shm_perm); ++} ++ ++#define shm_unlock(shp) \ ++ ipc_unlock(&(shp)->shm_perm) ++ + /* shm_mode upper byte flags */ + #define SHM_DEST 01000 /* segment will be destroyed on last detach */ + #define SHM_LOCKED 02000 /* segment will not be swapped */ +@@ -118,6 +151,12 @@ static inline int is_file_shm_hugepages(struct file *file) + } + #endif + ++int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg); ++struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg); ++extern const struct file_operations shmem_file_operations; ++extern const struct file_operations shm_file_operations; ++ ++extern struct file_system_type tmpfs_fs_type; + #endif /* __KERNEL__ */ + + #endif /* _LINUX_SHM_H_ */ +diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h +index f2d12d5..c4d6482 100644 +--- a/include/linux/shmem_fs.h ++++ b/include/linux/shmem_fs.h +@@ -23,6 +23,9 @@ struct shmem_inode_info { + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; + #endif ++#ifdef CONFIG_BEANCOUNTERS ++ struct user_beancounter *shmi_ub; ++#endif + }; + + struct shmem_sb_info { +@@ -62,4 +65,7 @@ static inline void shmem_acl_destroy_inode(struct inode *inode) + } + #endif /* CONFIG_TMPFS_POSIX_ACL */ + ++int shmem_insertpage(struct inode * inode, unsigned long index, ++ swp_entry_t swap); ++ + #endif +diff --git a/include/linux/signal.h b/include/linux/signal.h +index 84f997f..5adb84b 100644 +--- a/include/linux/signal.h ++++ b/include/linux/signal.h +@@ -6,6 +6,8 @@ + + #ifdef __KERNEL__ + #include ++#include ++#include + + /* + * Real Time signals may be queued. +@@ -16,6 +18,9 @@ struct sigqueue { + int flags; + siginfo_t info; + struct user_struct *user; ++#ifdef CONFIG_BEANCOUNTERS ++ struct user_beancounter *sig_ub; ++#endif + }; + + /* flags values. */ +@@ -372,6 +377,8 @@ int unhandled_signal(struct task_struct *tsk, int sig); + + void signals_init(void); + ++extern struct kmem_cache *sigqueue_cachep; ++ + #endif /* __KERNEL__ */ + + #endif /* _LINUX_SIGNAL_H */ +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index 299ec4b..a05f088 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -248,6 +248,8 @@ typedef unsigned char *sk_buff_data_t; + * @secmark: security marking + */ + ++#include ++ + struct sk_buff { + /* These two members must be first. 
*/ + struct sk_buff *next; +@@ -294,7 +296,13 @@ struct sk_buff { + peeked:1, + nf_trace:1; + __be16 protocol; +- ++#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) ++ __u8 brmark; ++#endif ++#ifdef CONFIG_VE ++ unsigned int accounted:1; ++ unsigned int redirected:1; ++#endif + void (*destructor)(struct sk_buff *skb); + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + struct nf_conntrack *nfct; +@@ -338,6 +346,8 @@ struct sk_buff { + *data; + unsigned int truesize; + atomic_t users; ++ struct skb_beancounter skb_bc; ++ struct ve_struct *owner_env; + }; + + #ifdef __KERNEL__ +@@ -345,6 +355,7 @@ struct sk_buff { + * Handling routines are only of interest to the kernel + */ + #include ++#include + + #include + +@@ -1171,6 +1182,8 @@ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len) + */ + static inline void skb_orphan(struct sk_buff *skb) + { ++ ub_skb_uncharge(skb); ++ + if (skb->destructor) + skb->destructor(skb); + skb->destructor = NULL; +@@ -1669,6 +1682,26 @@ static inline void skb_init_secmark(struct sk_buff *skb) + { } + #endif + ++#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) ++static inline void skb_copy_brmark(struct sk_buff *to, const struct sk_buff *from) ++{ ++ to->brmark = from->brmark; ++} ++ ++static inline void skb_init_brmark(struct sk_buff *skb) ++{ ++ skb->brmark = 0; ++} ++#else ++static inline void skb_copy_brmark(struct sk_buff *to, const struct sk_buff *from) ++{ ++} ++ ++static inline void skb_init_brmark(struct sk_buff *skb) ++{ ++} ++#endif ++ + static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping) + { + #ifdef CONFIG_NETDEVICES_MULTIQUEUE +diff --git a/include/linux/slab.h b/include/linux/slab.h +index 9aa90a6..588eea8 100644 +--- a/include/linux/slab.h ++++ b/include/linux/slab.h +@@ -51,6 +51,26 @@ + (unsigned long)ZERO_SIZE_PTR) + + /* ++ * allocation rules: __GFP_UBC 0 ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * cache (SLAB_UBC) charge charge ++ * (usual caches: mm, vma, task_struct, ...) ++ * ++ * cache (SLAB_UBC | SLAB_NO_CHARGE) charge --- ++ * (ub_kmalloc) (kmalloc) ++ * ++ * cache (no UB flags) BUG() --- ++ * (nonub caches, mempools) ++ * ++ * pages charge --- ++ * (ub_vmalloc, (vmalloc, ++ * poll, fdsets, ...) non-ub allocs) ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ */ ++#define SLAB_UBC 0x10000000UL /* alloc space for ubs ... */ ++#define SLAB_NO_CHARGE 0x20000000UL /* ... but don't charge */ ++ ++/* + * struct kmem_cache related prototypes + */ + void __init kmem_cache_init(void); +@@ -65,7 +85,20 @@ void kmem_cache_free(struct kmem_cache *, void *); + unsigned int kmem_cache_size(struct kmem_cache *); + const char *kmem_cache_name(struct kmem_cache *); + int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr); ++extern void show_slab_info(void); ++int kmem_cache_objuse(struct kmem_cache *cachep); ++int kmem_obj_objuse(void *obj); ++int kmem_dname_objuse(void *obj); ++unsigned long ub_cache_growth(struct kmem_cache *cachep); + ++#ifdef CONFIG_BEANCOUNTERS ++void kmem_mark_nocharge(struct kmem_cache *cachep); ++struct user_beancounter **ub_slab_ptr(struct kmem_cache *cachep, void *obj); ++struct user_beancounter *slab_ub(void *obj); ++#else ++static inline void kmem_mark_nocharge(struct kmem_cache *cachep) { } ++static inline struct user_beancounter *slab_ub(void *obj) { return NULL; } ++#endif + /* + * Please use this macro to create slab caches. 
Simply specify the + * name of the structure and maybe some flags that are listed above. +diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h +index 39c3a5e..6be00b2 100644 +--- a/include/linux/slab_def.h ++++ b/include/linux/slab_def.h +@@ -15,6 +15,111 @@ + #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ + #include + ++/* ++ * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. ++ * 0 for faster, smaller code (especially in the critical paths). ++ * ++ * STATS - 1 to collect stats for /proc/slabinfo. ++ * 0 for faster, smaller code (especially in the critical paths). ++ * ++ * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) ++ */ ++ ++#ifdef CONFIG_DEBUG_SLAB ++#define SLAB_DEBUG 1 ++#define SLAB_STATS 1 ++#define SLAB_FORCED_DEBUG 1 ++#else ++#define SLAB_DEBUG 0 ++#define SLAB_STATS 0 ++#define SLAB_FORCED_DEBUG 0 ++#endif ++ ++/* ++ * struct kmem_cache ++ * ++ * manages a cache. ++ */ ++ ++struct kmem_cache { ++/* 1) per-cpu data, touched during every alloc/free */ ++ struct array_cache *array[NR_CPUS]; ++/* 2) Cache tunables. Protected by cache_chain_mutex */ ++ unsigned int batchcount; ++ unsigned int limit; ++ unsigned int shared; ++ ++ unsigned int buffer_size; ++ u32 reciprocal_buffer_size; ++/* 3) touched by every alloc & free from the backend */ ++ ++ unsigned int flags; /* constant flags */ ++ unsigned int num; /* # of objs per slab */ ++ ++/* 4) cache_grow/shrink */ ++ /* order of pgs per slab (2^n) */ ++ unsigned int gfporder; ++ ++ /* force GFP flags, e.g. GFP_DMA */ ++ gfp_t gfpflags; ++ ++ size_t colour; /* cache colouring range */ ++ unsigned int colour_off; /* colour offset */ ++ struct kmem_cache *slabp_cache; ++ unsigned int slab_size; ++ unsigned int dflags; /* dynamic flags */ ++ ++ /* constructor func */ ++ void (*ctor) (struct kmem_cache *, void *); ++ ++/* 5) cache creation/removal */ ++ const char *name; ++ struct list_head next; ++ ++/* 6) statistics */ ++ unsigned long grown; ++ unsigned long reaped; ++ unsigned long shrunk; ++#if SLAB_STATS ++ unsigned long num_active; ++ unsigned long num_allocations; ++ unsigned long high_mark; ++ unsigned long errors; ++ unsigned long max_freeable; ++ unsigned long node_allocs; ++ unsigned long node_frees; ++ unsigned long node_overflow; ++ atomic_t allochit; ++ atomic_t allocmiss; ++ atomic_t freehit; ++ atomic_t freemiss; ++#endif ++#if SLAB_DEBUG ++ /* ++ * If debugging is enabled, then the allocator can add additional ++ * fields and/or padding to every object. buffer_size contains the total ++ * object size including these internal fields, the following two ++ * variables contain the offset to the user object and its size. ++ */ ++ int obj_offset; ++ int obj_size; ++#endif ++#ifdef CONFIG_BEANCOUNTERS ++ int objuse; ++#endif ++ /* ++ * We put nodelists[] at the end of kmem_cache, because we want to size ++ * this array to nr_node_ids slots instead of MAX_NUMNODES ++ * (see kmem_cache_init()) ++ * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache ++ * is statically defined, so we reserve the max number of nodes. ++ */ ++ struct kmem_list3 *nodelists[MAX_NUMNODES]; ++ /* ++ * Do not add fields after nodelists[] ++ */ ++}; ++ + /* Size description struct for general caches. 
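Judging by the objuse field added to struct kmem_cache above and the helpers
exported from slab.h, the per-object accounting can be inspected roughly like
this (a sketch; the exact meaning of objuse is inferred, not documented in
these hunks):

    struct user_beancounter *ub = slab_ub(obj); /* NULL w/o CONFIG_BEANCOUNTERS */
    int bytes = kmem_cache_objuse(cachep);      /* accounted size per object */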
*/ + struct cache_sizes { + size_t cs_size; +@@ -24,6 +129,7 @@ struct cache_sizes { + #endif + }; + extern struct cache_sizes malloc_sizes[]; ++extern int malloc_cache_num; + + void *kmem_cache_alloc(struct kmem_cache *, gfp_t); + void *__kmalloc(size_t size, gfp_t flags); +@@ -48,6 +154,8 @@ static inline void *kmalloc(size_t size, gfp_t flags) + __you_cannot_kmalloc_that_much(); + } + found: ++ if (flags & __GFP_UBC) ++ i += malloc_cache_num; + #ifdef CONFIG_ZONE_DMA + if (flags & GFP_DMA) + return kmem_cache_alloc(malloc_sizes[i].cs_dmacachep, +diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h +index d117ea2..c6187b4 100644 +--- a/include/linux/slub_def.h ++++ b/include/linux/slub_def.h +@@ -94,6 +94,10 @@ struct kmem_cache { + struct kobject kobj; /* For sysfs */ + #endif + ++#ifdef CONFIG_BEANCOUNTERS ++ atomic_t grown; ++ int objuse; ++#endif + #ifdef CONFIG_NUMA + /* + * Defragmentation by allocating from a remote node. +@@ -125,6 +129,19 @@ struct kmem_cache { + */ + extern struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1]; + ++#ifdef CONFIG_BEANCOUNTERS ++extern struct kmem_cache ub_kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; ++static inline struct kmem_cache *__kmalloc_cache(gfp_t f, int idx) ++{ ++ return (f & __GFP_UBC) ? &ub_kmalloc_caches[idx] : &kmalloc_caches[idx]; ++} ++#else ++static inline struct kmem_cache *__kmalloc_cache(gfp_t flags, int idx) ++{ ++ return &kmalloc_caches[idx]; ++} ++#endif ++ + /* + * Sorry that the following has to be that ugly but some versions of GCC + * have trouble with constant propagation and loops. +@@ -183,14 +200,14 @@ static __always_inline int kmalloc_index(size_t size) + * This ought to end up with a global pointer to the right cache + * in kmalloc_caches. + */ +-static __always_inline struct kmem_cache *kmalloc_slab(size_t size) ++static __always_inline struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) + { + int index = kmalloc_index(size); + + if (index == 0) + return NULL; + +- return &kmalloc_caches[index]; ++ return __kmalloc_cache(flags, index); + } + + #ifdef CONFIG_ZONE_DMA +@@ -215,7 +232,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) + return kmalloc_large(size, flags); + + if (!(flags & SLUB_DMA)) { +- struct kmem_cache *s = kmalloc_slab(size); ++ struct kmem_cache *s = kmalloc_slab(size, flags); + + if (!s) + return ZERO_SIZE_PTR; +@@ -234,7 +251,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) + { + if (__builtin_constant_p(size) && + size <= PAGE_SIZE && !(flags & SLUB_DMA)) { +- struct kmem_cache *s = kmalloc_slab(size); ++ struct kmem_cache *s = kmalloc_slab(size, flags); + + if (!s) + return ZERO_SIZE_PTR; +diff --git a/include/linux/smp.h b/include/linux/smp.h +index 55232cc..8491af0 100644 +--- a/include/linux/smp.h ++++ b/include/linux/smp.h +@@ -10,6 +10,9 @@ + + extern void cpu_idle(void); + ++struct pt_regs; ++typedef void (*smp_nmi_function)(struct pt_regs *regs, void *info); ++ + #ifdef CONFIG_SMP + + #include +@@ -49,6 +52,8 @@ extern int __cpu_up(unsigned int cpunum); + */ + extern void smp_cpus_done(unsigned int max_cpus); + ++extern int smp_nmi_call_function(smp_nmi_function func, void *info, int wait); ++ + /* + * Call a function on all other processors + */ +@@ -113,6 +118,12 @@ static inline void smp_send_reschedule(int cpu) { } + #define smp_call_function_mask(mask, func, info, wait) \ + (up_smp_call_function(func, info)) + ++static inline int smp_nmi_call_function(smp_nmi_function func, ++ void *info, int wait) ++{ ++ return 0; ++} 
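Both allocators now key the kmalloc cache choice off __GFP_UBC: SLAB keeps a
second run of general caches at malloc_sizes[i + malloc_cache_num], SLUB a
parallel ub_kmalloc_caches[] array. Same size class, different accounting:

    void *a = kmalloc(128, GFP_KERNEL);              /* plain cache         */
    void *b = kmalloc(128, GFP_KERNEL | __GFP_UBC);  /* beancounter-charged */

    kfree(b);
    kfree(a);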
++ + #endif /* !SMP */ + + /* +diff --git a/include/linux/socket.h b/include/linux/socket.h +index bd2b30a..675ee51 100644 +--- a/include/linux/socket.h ++++ b/include/linux/socket.h +@@ -298,6 +298,16 @@ struct ucred { + #define IPX_TYPE 1 + + #ifdef __KERNEL__ ++ ++#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - ++ 16 for IP, 16 for IPX, ++ 24 for IPv6, ++ about 80 for AX.25 ++ must be at least one bigger than ++ the AF_UNIX size (see net/unix/af_unix.c ++ :unix_mkname()). ++ */ ++ + extern int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); + extern int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, + int offset, int len); +@@ -311,6 +321,8 @@ extern int memcpy_toiovec(struct iovec *v, unsigned char *kdata, int len); + extern int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ulen); + extern int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr); + extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); ++extern int vz_security_family_check(int family); ++extern int vz_security_protocol_check(int protocol); + + #endif + #endif /* not kernel and not glibc */ +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 0b33776..7e449b8 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -18,6 +18,7 @@ struct bio; + #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ + #define SWAP_FLAG_PRIO_MASK 0x7fff + #define SWAP_FLAG_PRIO_SHIFT 0 ++#define SWAP_FLAG_READONLY 0x40000000 /* set if swap is read-only */ + + static inline int current_is_kswapd(void) + { +@@ -93,6 +94,7 @@ struct address_space; + struct sysinfo; + struct writeback_control; + struct zone; ++struct user_beancounter; + + /* + * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of +@@ -122,6 +124,7 @@ enum { + SWP_ACTIVE = (SWP_USED | SWP_WRITEOK), + /* add others here before... */ + SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ ++ SWP_READONLY = (1 << 2), + }; + + #define SWAP_CLUSTER_MAX 32 +@@ -132,6 +135,7 @@ enum { + /* + * The in-memory structure used to track swap areas. + */ ++struct user_beancounter; + struct swap_info_struct { + unsigned int flags; + int prio; /* swap priority */ +@@ -149,6 +153,9 @@ struct swap_info_struct { + unsigned int max; + unsigned int inuse_pages; + int next; /* next entry on swap list */ ++#ifdef CONFIG_BC_SWAP_ACCOUNTING ++ struct user_beancounter **swap_ubs; ++#endif + }; + + struct swap_list_t { +@@ -156,9 +163,21 @@ struct swap_list_t { + int next; /* swapfile to be used next */ + }; + ++extern struct swap_list_t swap_list; ++extern struct swap_info_struct swap_info[MAX_SWAPFILES]; ++ + /* Swap 50% full? Release swapcache more aggressively.. 
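With swap_info[] and the per-slot swap_ubs array now exported, per-beancounter
swap ownership can be read back; a sketch with locking (swap_lock) and range
checks elided:

    static struct user_beancounter *slot_owner(unsigned type, pgoff_t offset)
    {
    #ifdef CONFIG_BC_SWAP_ACCOUNTING
            struct swap_info_struct *si = &swap_info[type];

            return si->swap_ubs ? si->swap_ubs[offset] : NULL;
    #else
            return NULL;
    #endif
    }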
*/ + #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) + ++/* linux/mm/oom_kill.c */ ++extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order); ++extern int register_oom_notifier(struct notifier_block *nb); ++extern int unregister_oom_notifier(struct notifier_block *nb); ++extern int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, ++ struct mem_cgroup *mem, const char *message); ++extern struct task_struct *select_bad_process(struct user_beancounter *ub, ++ struct mem_cgroup *memcg); ++ + /* linux/mm/page_alloc.c */ + extern unsigned long totalram_pages; + extern unsigned long totalreserve_pages; +@@ -226,6 +245,8 @@ extern void show_swap_cache_info(void); + extern int add_to_swap(struct page *, gfp_t); + extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); + extern void __delete_from_swap_cache(struct page *); ++extern int __add_to_swap_cache(struct page *page, ++ swp_entry_t entry, gfp_t gfp_mask); + extern void delete_from_swap_cache(struct page *); + extern void free_page_and_swap_cache(struct page *); + extern void free_pages_and_swap_cache(struct page **, int); +@@ -239,7 +260,7 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t, + extern long total_swap_pages; + extern unsigned int nr_swapfiles; + extern void si_swapinfo(struct sysinfo *); +-extern swp_entry_t get_swap_page(void); ++extern swp_entry_t get_swap_page(struct user_beancounter *); + extern swp_entry_t get_swap_page_of_type(int); + extern int swap_duplicate(swp_entry_t); + extern int valid_swaphandles(swp_entry_t, unsigned long *); +@@ -252,6 +273,7 @@ extern sector_t swapdev_block(int, pgoff_t); + extern struct swap_info_struct *get_swap_info_struct(unsigned); + extern int can_share_swap_page(struct page *); + extern int remove_exclusive_swap_page(struct page *); ++extern int try_to_remove_exclusive_swap_page(struct page *); + struct backing_dev_info; + + extern spinlock_t swap_lock; +@@ -342,7 +364,7 @@ static inline int remove_exclusive_swap_page(struct page *p) + return 0; + } + +-static inline swp_entry_t get_swap_page(void) ++static inline swp_entry_t get_swap_page(struct user_beancounter *ub) + { + swp_entry_t entry; + entry.val = 0; +diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h +index 24141b4..7caace5 100644 +--- a/include/linux/sysctl.h ++++ b/include/linux/sysctl.h +@@ -1081,10 +1081,15 @@ struct ctl_table_header *__register_sysctl_paths( + struct ctl_table_header *register_sysctl_table(struct ctl_table * table); + struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table); ++struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *, int); ++struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *, ++ struct ctl_table *, int); + + void unregister_sysctl_table(struct ctl_table_header * table); + int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table); + ++extern int ve_allow_kthreads; ++ + #endif /* __KERNEL__ */ + + #endif /* _LINUX_SYSCTL_H */ +diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h +index 7858eac..3887cd6 100644 +--- a/include/linux/sysfs.h ++++ b/include/linux/sysfs.h +@@ -19,6 +19,7 @@ + + struct kobject; + struct module; ++struct sysfs_open_dirent; + + /* FIXME + * The *owner field is no longer used, but leave around +@@ -78,6 +79,66 @@ struct sysfs_ops { + ssize_t (*store)(struct kobject *,struct attribute *,const char *, size_t); + }; + ++/* type-specific structures for sysfs_dirent->s_* union members */ 
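select_bad_process() growing a user_beancounter argument suggests an OOM pass
scoped to one container; a hedged sketch of the caller side, assuming mainline's
busy-kill return convention carries over:

    struct task_struct *p;

    p = select_bad_process(ub, NULL);       /* NULL: no memcg restriction */
    if (p && !IS_ERR(p))
            oom_kill_process(p, gfp_mask, order, NULL,
                             "Out of memory in beancounter");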
++struct sysfs_elem_dir { ++ struct kobject *kobj; ++ /* children list starts here and goes through sd->s_sibling */ ++ struct sysfs_dirent *children; ++}; ++ ++struct sysfs_elem_symlink { ++ struct sysfs_dirent *target_sd; ++}; ++ ++struct sysfs_elem_attr { ++ struct attribute *attr; ++ struct sysfs_open_dirent *open; ++}; ++ ++struct sysfs_elem_bin_attr { ++ struct bin_attribute *bin_attr; ++}; ++ ++/* ++ * sysfs_dirent - the building block of sysfs hierarchy. Each and ++ * every sysfs node is represented by single sysfs_dirent. ++ * ++ * As long as s_count reference is held, the sysfs_dirent itself is ++ * accessible. Dereferencing s_elem or any other outer entity ++ * requires s_active reference. ++ */ ++struct sysfs_dirent { ++ atomic_t s_count; ++ atomic_t s_active; ++ struct sysfs_dirent *s_parent; ++ struct sysfs_dirent *s_sibling; ++ const char *s_name; ++ ++ union { ++ struct sysfs_elem_dir s_dir; ++ struct sysfs_elem_symlink s_symlink; ++ struct sysfs_elem_attr s_attr; ++ struct sysfs_elem_bin_attr s_bin_attr; ++ }; ++ ++ unsigned int s_flags; ++ ino_t s_ino; ++ umode_t s_mode; ++ struct iattr *s_iattr; ++}; ++ ++#define SD_DEACTIVATED_BIAS INT_MIN ++ ++#define SYSFS_TYPE_MASK 0x00ff ++#define SYSFS_DIR 0x0001 ++#define SYSFS_KOBJ_ATTR 0x0002 ++#define SYSFS_KOBJ_BIN_ATTR 0x0004 ++#define SYSFS_KOBJ_LINK 0x0008 ++#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK) ++ ++#define SYSFS_FLAG_MASK ~SYSFS_TYPE_MASK ++#define SYSFS_FLAG_REMOVED 0x0200 ++ + #ifdef CONFIG_SYSFS + + int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), +@@ -118,6 +179,8 @@ void sysfs_notify(struct kobject *kobj, char *dir, char *attr); + + extern int __must_check sysfs_init(void); + ++extern struct file_system_type sysfs_fs_type; ++ + #else /* CONFIG_SYSFS */ + + static inline int sysfs_schedule_callback(struct kobject *kobj, +diff --git a/include/linux/task_io_accounting_ops.h b/include/linux/task_io_accounting_ops.h +index ff46c6f..205f82e 100644 +--- a/include/linux/task_io_accounting_ops.h ++++ b/include/linux/task_io_accounting_ops.h +@@ -5,10 +5,12 @@ + #define __TASK_IO_ACCOUNTING_OPS_INCLUDED + + #include ++#include + + #ifdef CONFIG_TASK_IO_ACCOUNTING + static inline void task_io_account_read(size_t bytes) + { ++ ub_io_account_read(bytes); + current->ioac.read_bytes += bytes; + } + +@@ -21,8 +23,14 @@ static inline unsigned long task_io_get_inblock(const struct task_struct *p) + return p->ioac.read_bytes >> 9; + } + +-static inline void task_io_account_write(size_t bytes) ++static inline void task_io_account_write(struct page *page, size_t bytes, ++ int sync) + { ++ if (sync) ++ ub_io_account_write(bytes); ++ else ++ ub_io_account_dirty(page, bytes); ++ + current->ioac.write_bytes += bytes; + } + +@@ -37,6 +45,7 @@ static inline unsigned long task_io_get_oublock(const struct task_struct *p) + + static inline void task_io_account_cancelled_write(size_t bytes) + { ++ ub_io_account_write_cancelled(bytes); + current->ioac.cancelled_write_bytes += bytes; + } + +@@ -56,7 +65,8 @@ static inline unsigned long task_io_get_inblock(const struct task_struct *p) + return 0; + } + +-static inline void task_io_account_write(size_t bytes) ++static inline void task_io_account_write(struct page *page, size_t bytes, ++ int sync) + { + } + +diff --git a/include/linux/tty.h b/include/linux/tty.h +index 324a3b2..fb4d996 100644 +--- a/include/linux/tty.h ++++ b/include/linux/tty.h +@@ -241,6 +241,7 @@ struct tty_struct { + spinlock_t read_lock; + /* If the tty has a pending do_SAK, queue it here - 
akpm */ + struct work_struct SAK_work; ++ struct ve_struct *owner_env; + }; + + /* tty magic number */ +@@ -270,6 +271,7 @@ struct tty_struct { + #define TTY_HUPPED 18 /* Post driver->hangup() */ + #define TTY_FLUSHING 19 /* Flushing to ldisc in progress */ + #define TTY_FLUSHPENDING 20 /* Queued buffer flush pending */ ++#define TTY_CHARGED 21 /* Charged as ub resource */ + + #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty)) + +diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h +index d2a0035..fe1f025 100644 +--- a/include/linux/tty_driver.h ++++ b/include/linux/tty_driver.h +@@ -242,8 +242,19 @@ struct tty_driver { + + const struct tty_operations *ops; + struct list_head tty_drivers; ++ struct ve_struct *owner_env; + }; + ++#ifdef CONFIG_UNIX98_PTYS ++extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ ++extern struct tty_driver *pts_driver; /* Unix98 pty slaves; for /dev/ptmx */ ++#endif ++ ++#ifdef CONFIG_LEGACY_PTYS ++extern struct tty_driver *pty_driver; ++extern struct tty_driver *pty_slave_driver; ++#endif ++ + extern struct list_head tty_drivers; + + struct tty_driver *alloc_tty_driver(int lines); +@@ -252,6 +263,9 @@ void tty_set_operations(struct tty_driver *driver, + const struct tty_operations *op); + extern struct tty_driver *tty_find_polling_driver(char *name, int *line); + ++int init_ve_tty_class(void); ++void fini_ve_tty_class(void); ++ + /* tty driver magic number */ + #define TTY_DRIVER_MAGIC 0x5402 + +diff --git a/include/linux/types.h b/include/linux/types.h +index d4a9ce6..dcdaf75 100644 +--- a/include/linux/types.h ++++ b/include/linux/types.h +@@ -29,6 +29,11 @@ typedef __kernel_timer_t timer_t; + typedef __kernel_clockid_t clockid_t; + typedef __kernel_mqd_t mqd_t; + ++#ifndef __ENVID_T_DEFINED__ ++typedef unsigned envid_t; ++#define __ENVID_T_DEFINED__ ++#endif ++ + #ifdef __KERNEL__ + typedef _Bool bool; + +diff --git a/include/linux/utsname.h b/include/linux/utsname.h +index 1123267..ec24d89 100644 +--- a/include/linux/utsname.h ++++ b/include/linux/utsname.h +@@ -43,6 +43,7 @@ struct uts_namespace { + struct new_utsname name; + }; + extern struct uts_namespace init_uts_ns; ++extern struct new_utsname virt_utsname; + + #ifdef CONFIG_UTS_NS + static inline void get_uts_ns(struct uts_namespace *ns) +diff --git a/include/linux/ve.h b/include/linux/ve.h +new file mode 100644 +index 0000000..7025716 +--- /dev/null ++++ b/include/linux/ve.h +@@ -0,0 +1,344 @@ ++/* ++ * include/linux/ve.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef _LINUX_VE_H ++#define _LINUX_VE_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef VZMON_DEBUG ++# define VZTRACE(fmt,args...) \ ++ printk(KERN_DEBUG fmt, ##args) ++#else ++# define VZTRACE(fmt,args...) 
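VZTRACE() is a compile-time tracing switch: a KERN_DEBUG printk when
VZMON_DEBUG is defined, an empty statement otherwise, so call sites cost
nothing in production builds. Typical use (veid as in struct ve_struct below):

    VZTRACE("ve %u: monitor state change\n", ve->veid);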
++#endif /* VZMON_DEBUG */ ++ ++struct tty_driver; ++struct devpts_config; ++struct task_struct; ++struct new_utsname; ++struct file_system_type; ++struct icmp_mib; ++struct ip_mib; ++struct tcp_mib; ++struct udp_mib; ++struct linux_mib; ++struct fib_info; ++struct fib_rule; ++struct veip_struct; ++struct ve_monitor; ++struct nsproxy; ++ ++#if defined(CONFIG_VE) && defined(CONFIG_INET) ++struct fib_table; ++#ifdef CONFIG_VE_IPTABLES ++struct xt_table; ++struct nf_conn; ++ ++#define FRAG6Q_HASHSZ 64 ++ ++struct ve_nf_conntrack { ++ struct hlist_head *_bysource; ++ struct nf_nat_protocol **_nf_nat_protos; ++ int _nf_nat_vmalloced; ++ struct xt_table *_nf_nat_table; ++ struct nf_conntrack_l3proto *_nf_nat_l3proto; ++ atomic_t _nf_conntrack_count; ++ int _nf_conntrack_max; ++ struct hlist_head *_nf_conntrack_hash; ++ int _nf_conntrack_checksum; ++ int _nf_conntrack_vmalloc; ++ struct hlist_head _unconfirmed; ++ struct hlist_head *_nf_ct_expect_hash; ++ unsigned int _nf_ct_expect_vmalloc; ++ unsigned int _nf_ct_expect_count; ++ unsigned int _nf_ct_expect_max; ++ struct hlist_head *_nf_ct_helper_hash; ++ unsigned int _nf_ct_helper_vmalloc; ++#ifdef CONFIG_SYSCTL ++ /* l4 stuff: */ ++ unsigned long _nf_ct_icmp_timeout; ++ unsigned long _nf_ct_icmpv6_timeout; ++ unsigned int _nf_ct_udp_timeout; ++ unsigned int _nf_ct_udp_timeout_stream; ++ unsigned int _nf_ct_generic_timeout; ++ unsigned int _nf_ct_log_invalid; ++ unsigned int _nf_ct_tcp_timeout_max_retrans; ++ int _nf_ct_tcp_be_liberal; ++ int _nf_ct_tcp_loose; ++ int _nf_ct_tcp_max_retrans; ++ unsigned int _nf_ct_tcp_timeouts[10]; ++ struct ctl_table_header *_icmp_sysctl_header; ++ unsigned int _tcp_sysctl_table_users; ++ struct ctl_table_header *_tcp_sysctl_header; ++ unsigned int _udp_sysctl_table_users; ++ struct ctl_table_header *_udp_sysctl_header; ++ struct ctl_table_header *_icmpv6_sysctl_header; ++ struct ctl_table_header *_generic_sysctl_header; ++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT ++ struct ctl_table_header *_icmp_compat_sysctl_header; ++ struct ctl_table_header *_tcp_compat_sysctl_header; ++ struct ctl_table_header *_udp_compat_sysctl_header; ++ struct ctl_table_header *_generic_compat_sysctl_header; ++#endif ++ /* l4 protocols sysctl tables: */ ++ struct nf_conntrack_l4proto *_nf_conntrack_l4proto_icmp; ++ struct nf_conntrack_l4proto *_nf_conntrack_l4proto_tcp4; ++ struct nf_conntrack_l4proto *_nf_conntrack_l4proto_icmpv6; ++ struct nf_conntrack_l4proto *_nf_conntrack_l4proto_tcp6; ++ struct nf_conntrack_l4proto *_nf_conntrack_l4proto_udp4; ++ struct nf_conntrack_l4proto *_nf_conntrack_l4proto_udp6; ++ struct nf_conntrack_l4proto *_nf_conntrack_l4proto_generic; ++ struct nf_conntrack_l4proto **_nf_ct_protos[PF_MAX]; ++ /* l3 protocols sysctl tables: */ ++ struct nf_conntrack_l3proto *_nf_conntrack_l3proto_ipv4; ++ struct nf_conntrack_l3proto *_nf_conntrack_l3proto_ipv6; ++ struct nf_conntrack_l3proto *_nf_ct_l3protos[AF_MAX]; ++ /* sysctl standalone stuff: */ ++ struct ctl_table_header *_nf_ct_sysctl_header; ++ ctl_table *_nf_ct_sysctl_table; ++ ctl_table *_nf_ct_netfilter_table; ++ ctl_table *_nf_ct_net_table; ++ ctl_table *_ip_ct_netfilter_table; ++ struct ctl_table_header *_ip_ct_sysctl_header; ++ int _nf_ct_log_invalid_proto_min; ++ int _nf_ct_log_invalid_proto_max; ++#endif /* CONFIG_SYSCTL */ ++}; ++#endif ++#endif ++ ++struct ve_cpu_stats { ++ cycles_t idle_time; ++ cycles_t iowait_time; ++ cycles_t strt_idle_time; ++ cycles_t used_time; ++ seqcount_t stat_lock; ++ int nr_running; ++ int nr_unint; ++ int nr_iowait; ++ 
cputime64_t user; ++ cputime64_t nice; ++ cputime64_t system; ++} ____cacheline_aligned; ++ ++struct ve_ipt_recent; ++struct ve_xt_hashlimit; ++ ++struct cgroup; ++struct css_set; ++ ++struct ve_struct { ++ struct list_head ve_list; ++ ++ envid_t veid; ++ struct list_head vetask_lh; ++ /* capability bounding set */ ++ kernel_cap_t ve_cap_bset; ++ atomic_t pcounter; ++ /* ref counter to ve from ipc */ ++ atomic_t counter; ++ unsigned int class_id; ++ struct rw_semaphore op_sem; ++ int is_running; ++ int is_locked; ++ atomic_t suspend; ++ /* see vzcalluser.h for VE_FEATURE_XXX definitions */ ++ __u64 features; ++ ++/* VE's root */ ++ struct path root_path; ++ ++ struct file_system_type *proc_fstype; ++ struct vfsmount *proc_mnt; ++ struct proc_dir_entry *proc_root; ++ ++/* BSD pty's */ ++#ifdef CONFIG_LEGACY_PTYS ++ struct tty_driver *pty_driver; ++ struct tty_driver *pty_slave_driver; ++#endif ++#ifdef CONFIG_UNIX98_PTYS ++ struct tty_driver *ptm_driver; ++ struct tty_driver *pts_driver; ++ struct idr *allocated_ptys; ++ struct file_system_type *devpts_fstype; ++ struct vfsmount *devpts_mnt; ++ struct dentry *devpts_root; ++ struct devpts_config *devpts_config; ++#endif ++ ++ struct file_system_type *shmem_fstype; ++ struct vfsmount *shmem_mnt; ++#ifdef CONFIG_SYSFS ++ struct file_system_type *sysfs_fstype; ++ struct vfsmount *sysfs_mnt; ++ struct super_block *sysfs_sb; ++ struct sysfs_dirent *_sysfs_root; ++#endif ++#ifndef CONFIG_SYSFS_DEPRECATED ++ struct kobject *_virtual_dir; ++#endif ++ struct kset *class_kset; ++ struct kset *devices_kset; ++ struct class *tty_class; ++ struct class *mem_class; ++ ++#ifdef CONFIG_NET ++ struct class *net_class; ++#ifdef CONFIG_INET ++ unsigned long rt_flush_required; ++#endif ++#endif ++#if defined(CONFIG_VE_NETDEV) || defined (CONFIG_VE_NETDEV_MODULE) ++ struct veip_struct *veip; ++ struct net_device *_venet_dev; ++#endif ++ ++/* per VE CPU stats*/ ++ struct timespec start_timespec; ++ u64 start_jiffies; /* Deprecated */ ++ cycles_t start_cycles; ++ unsigned long avenrun[3]; /* loadavg data */ ++ ++ cycles_t cpu_used_ve; ++ struct kstat_lat_pcpu_struct sched_lat_ve; ++ ++#ifdef CONFIG_INET ++ struct icmp_mib *_icmp_statistics[2]; ++ struct icmpmsg_mib *_icmpmsg_statistics[2]; ++ struct ipstats_mib *_ip_statistics[2]; ++ struct tcp_mib *_tcp_statistics[2]; ++ struct udp_mib *_udp_statistics[2]; ++ struct udp_mib *_udplite_statistics[2]; ++ struct linux_mib *_net_statistics[2]; ++ struct venet_stat *stat; ++#ifdef CONFIG_VE_IPTABLES ++/* core/netfilter.c virtualization */ ++ struct xt_table *_ve_ipt_filter_pf; /* packet_filter struct */ ++ struct xt_table *_ve_ip6t_filter_pf; ++ struct xt_table *_ipt_mangle_table; ++ struct xt_table *_ip6t_mangle_table; ++ struct list_head _xt_tables[NPROTO]; ++ ++ __u64 ipt_mask; ++ __u64 _iptables_modules; ++ struct ve_nf_conntrack *_nf_conntrack; ++ struct ve_ipt_recent *_ipt_recent; ++ struct ve_xt_hashlimit *_xt_hashlimit; ++#endif /* CONFIG_VE_IPTABLES */ ++ ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ struct ipstats_mib *_ipv6_statistics[2]; ++ struct icmpv6_mib *_icmpv6_statistics[2]; ++ struct icmpv6msg_mib *_icmpv6msg_statistics[2]; ++ struct udp_mib *_udp_stats_in6[2]; ++ struct udp_mib *_udplite_stats_in6[2]; ++#endif ++#endif ++ wait_queue_head_t *_log_wait; ++ unsigned *_log_start; ++ unsigned *_log_end; ++ unsigned *_logged_chars; ++ char *log_buf; ++#define VE_DEFAULT_LOG_BUF_LEN 4096 ++ ++ struct ve_cpu_stats *cpu_stats; ++ unsigned long down_at; ++ struct list_head cleanup_list; ++#if 
defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE) ++ struct list_head _fuse_conn_list; ++ struct super_block *_fuse_control_sb; ++ ++ struct file_system_type *fuse_fs_type; ++ struct file_system_type *fuse_ctl_fs_type; ++#endif ++ unsigned long jiffies_fixup; ++ unsigned char disable_net; ++ struct ve_monitor *monitor; ++ struct proc_dir_entry *monitor_proc; ++ unsigned long meminfo_val; ++ ++ struct nsproxy *ve_ns; ++ struct net *ve_netns; ++ struct cgroup *ve_cgroup; ++ struct css_set *ve_css_set; ++}; ++ ++int init_ve_cgroups(struct ve_struct *ve); ++void fini_ve_cgroups(struct ve_struct *ve); ++ ++#define VE_CPU_STATS(ve, cpu) (per_cpu_ptr((ve)->cpu_stats, cpu)) ++ ++extern int nr_ve; ++extern struct proc_dir_entry *proc_vz_dir; ++extern struct proc_dir_entry *glob_proc_vz_dir; ++ ++#ifdef CONFIG_VE ++ ++void do_update_load_avg_ve(void); ++void do_env_free(struct ve_struct *ptr); ++ ++static inline struct ve_struct *get_ve(struct ve_struct *ptr) ++{ ++ if (ptr != NULL) ++ atomic_inc(&ptr->counter); ++ return ptr; ++} ++ ++static inline void put_ve(struct ve_struct *ptr) ++{ ++ if (ptr && atomic_dec_and_test(&ptr->counter)) { ++ BUG_ON(atomic_read(&ptr->pcounter) > 0); ++ BUG_ON(ptr->is_running); ++ do_env_free(ptr); ++ } ++} ++ ++static inline void pget_ve(struct ve_struct *ptr) ++{ ++ atomic_inc(&ptr->pcounter); ++} ++ ++void ve_cleanup_schedule(struct ve_struct *); ++static inline void pput_ve(struct ve_struct *ptr) ++{ ++ if (unlikely(atomic_dec_and_test(&ptr->pcounter))) ++ ve_cleanup_schedule(ptr); ++} ++ ++extern spinlock_t ve_cleanup_lock; ++extern struct list_head ve_cleanup_list; ++extern struct task_struct *ve_cleanup_thread; ++ ++extern unsigned long long ve_relative_clock(struct timespec * ts); ++ ++#ifdef CONFIG_FAIRSCHED ++#define ve_cpu_online_map(ve, mask) fairsched_cpu_online_map(ve->veid, mask) ++#else ++#define ve_cpu_online_map(ve, mask) do { *(mask) = cpu_online_map; } while (0) ++#endif ++#else /* CONFIG_VE */ ++#define ve_utsname system_utsname ++#define get_ve(ve) (NULL) ++#define put_ve(ve) do { } while (0) ++#define pget_ve(ve) do { } while (0) ++#define pput_ve(ve) do { } while (0) ++#endif /* CONFIG_VE */ ++ ++#endif /* _LINUX_VE_H */ +diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h +new file mode 100644 +index 0000000..26ca897 +--- /dev/null ++++ b/include/linux/ve_proto.h +@@ -0,0 +1,89 @@ ++/* ++ * include/linux/ve_proto.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __VE_H__ ++#define __VE_H__ ++ ++#ifdef CONFIG_VE ++ ++struct ve_struct; ++ ++#ifdef CONFIG_INET ++void tcp_v4_kill_ve_sockets(struct ve_struct *envid); ++#ifdef CONFIG_VE_NETDEV ++int venet_init(void); ++#endif ++#endif ++ ++extern struct list_head ve_list_head; ++#define for_each_ve(ve) list_for_each_entry((ve), &ve_list_head, ve_list) ++extern rwlock_t ve_list_lock; ++extern struct ve_struct *get_ve_by_id(envid_t); ++extern struct ve_struct *__find_ve_by_id(envid_t); ++ ++struct env_create_param3; ++extern int real_env_create(envid_t veid, unsigned flags, u32 class_id, ++ struct env_create_param3 *data, int datalen); ++extern void ve_move_task(struct task_struct *, struct ve_struct *); ++ ++int set_device_perms_ve(struct ve_struct *, unsigned, dev_t, unsigned); ++int get_device_perms_ve(int dev_type, dev_t dev, int access_mode); ++int devperms_seq_show(struct seq_file *m, void *v); ++ ++enum { ++ VE_SS_CHAIN, ++ ++ VE_MAX_CHAINS ++}; ++ ++typedef int ve_hook_init_fn(void *data); ++typedef void ve_hook_fini_fn(void *data); ++ ++struct ve_hook ++{ ++ ve_hook_init_fn *init; ++ ve_hook_fini_fn *fini; ++ struct module *owner; ++ ++ /* Functions are called in ascending priority */ ++ int priority; ++ ++ /* Private part */ ++ struct list_head list; ++}; ++ ++enum { ++ HOOK_PRIO_DEFAULT = 0, ++ ++ HOOK_PRIO_FS = HOOK_PRIO_DEFAULT, ++ ++ HOOK_PRIO_NET_PRE, ++ HOOK_PRIO_NET, ++ HOOK_PRIO_NET_POST, ++ ++ HOOK_PRIO_AFTERALL = INT_MAX ++}; ++ ++void *ve_seq_start(struct seq_file *m, loff_t *pos); ++void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos); ++void ve_seq_stop(struct seq_file *m, void *v); ++ ++extern int ve_hook_iterate_init(int chain, void *data); ++extern void ve_hook_iterate_fini(int chain, void *data); ++ ++extern void ve_hook_register(int chain, struct ve_hook *vh); ++extern void ve_hook_unregister(struct ve_hook *vh); ++#else /* CONFIG_VE */ ++#define ve_hook_register(ch, vh) do { } while (0) ++#define ve_hook_unregister(ve) do { } while (0) ++ ++#define get_device_perms_ve(t, d, a) (0) ++#endif /* CONFIG_VE */ ++#endif +diff --git a/include/linux/ve_task.h b/include/linux/ve_task.h +new file mode 100644 +index 0000000..4b7d722 +--- /dev/null ++++ b/include/linux/ve_task.h +@@ -0,0 +1,73 @@ ++/* ++ * include/linux/ve_task.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
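Registration against the VE_SS_CHAIN start/stop chain then looks like the
sketch below; that *data is the VE being started is an assumption drawn from
ve_hook_iterate_init()'s signature, not spelled out here:

    static int my_init(void *data)
    {
            struct ve_struct *ve = data;    /* assumed payload */

            VZTRACE("ve %u: my_init\n", ve->veid);
            return 0;                       /* nonzero presumably fails VE start */
    }

    static void my_fini(void *data)
    {
    }

    static struct ve_hook my_hook = {
            .init           = my_init,
            .fini           = my_fini,
            .owner          = THIS_MODULE,
            .priority       = HOOK_PRIO_DEFAULT,
    };

    ve_hook_register(VE_SS_CHAIN, &my_hook);        /* from module init */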
++ * ++ */ ++ ++#ifndef __VE_TASK_H__ ++#define __VE_TASK_H__ ++ ++#include ++#include ++ ++struct ve_task_info { ++/* virtualization */ ++ struct ve_struct *owner_env; ++ struct ve_struct *exec_env; ++ struct ve_struct *saved_env; ++ struct list_head vetask_list; ++ struct dentry *glob_proc_dentry; ++/* statistics: scheduling latency */ ++ cycles_t sleep_time; ++ cycles_t sched_time; ++ cycles_t sleep_stamp; ++ cycles_t wakeup_stamp; ++ seqcount_t wakeup_lock; ++}; ++ ++#define VE_TASK_INFO(task) (&(task)->ve_task_info) ++#define VE_TASK_LIST_2_TASK(lh) \ ++ list_entry(lh, struct task_struct, ve_task_info.vetask_list) ++ ++#ifdef CONFIG_VE ++extern struct ve_struct ve0; ++#define get_ve0() (&ve0) ++ ++#define ve_save_context(t) do { \ ++ t->ve_task_info.saved_env = \ ++ t->ve_task_info.exec_env; \ ++ t->ve_task_info.exec_env = get_ve0(); \ ++ } while (0) ++#define ve_restore_context(t) do { \ ++ t->ve_task_info.exec_env = \ ++ t->ve_task_info.saved_env; \ ++ } while (0) ++ ++#define get_exec_env() (current->ve_task_info.exec_env) ++#define set_exec_env(ve) ({ \ ++ struct ve_task_info *vi; \ ++ struct ve_struct *old, *new; \ ++ \ ++ vi = ¤t->ve_task_info; \ ++ old = vi->exec_env; \ ++ new = ve; \ ++ if (unlikely(new == NULL)) { \ ++ printk("%s: NULL exec env (%s)\n", __func__, #ve);\ ++ new = get_ve0(); \ ++ } \ ++ vi->exec_env = new; \ ++ old; \ ++ }) ++#else ++#define get_ve0() (NULL) ++#define get_exec_env() (NULL) ++#define set_exec_env(new_env) (NULL) ++#define ve_save_context(t) do { } while (0) ++#define ve_restore_context(t) do { } while (0) ++#endif ++ ++#endif /* __VE_TASK_H__ */ +diff --git a/include/linux/veip.h b/include/linux/veip.h +new file mode 100644 +index 0000000..745f1ec +--- /dev/null ++++ b/include/linux/veip.h +@@ -0,0 +1,15 @@ ++#ifndef __VE_IP_H_ ++#define __VE_IP_H_ ++ ++struct ve_addr_struct { ++ int family; ++ __u32 key[4]; ++}; ++ ++struct sockaddr; ++ ++extern void veaddr_print(char *, int, struct ve_addr_struct *); ++extern int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen, ++ struct ve_addr_struct *veaddr); ++ ++#endif +diff --git a/include/linux/venet.h b/include/linux/venet.h +new file mode 100644 +index 0000000..14cf89e +--- /dev/null ++++ b/include/linux/venet.h +@@ -0,0 +1,86 @@ ++/* ++ * include/linux/venet.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
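set_exec_env() returns the previous context, giving the usual
save/switch/restore pattern (and falling back to ve0 on a NULL target rather
than oopsing):

    struct ve_struct *old_env;

    old_env = set_exec_env(ve);     /* run with ve as current's exec context */
    /* ... code that must observe ve's namespaces/accounting ... */
    set_exec_env(old_env);          /* always restore on the way out */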
++ * ++ */ ++ ++#ifndef _VENET_H ++#define _VENET_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#define VEIP_HASH_SZ 512 ++ ++struct ve_struct; ++struct venet_stat; ++struct venet_stats { ++ struct net_device_stats stats; ++ struct net_device_stats *real_stats; ++}; ++ ++struct ip_entry_struct ++{ ++ struct ve_addr_struct addr; ++ struct ve_struct *active_env; ++ struct venet_stat *stat; ++ struct veip_struct *veip; ++ struct list_head ip_hash; ++ struct list_head ve_list; ++}; ++ ++struct veip_struct ++{ ++ struct list_head src_lh; ++ struct list_head dst_lh; ++ struct list_head ip_lh; ++ struct list_head list; ++ envid_t veid; ++}; ++ ++static inline struct net_device_stats * ++venet_stats(struct net_device *dev, int cpu) ++{ ++ struct venet_stats *stats; ++ stats = (struct venet_stats*)dev->priv; ++ return per_cpu_ptr(stats->real_stats, cpu); ++} ++ ++/* veip_hash_lock should be taken for write by caller */ ++void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip); ++/* veip_hash_lock should be taken for write by caller */ ++void ip_entry_unhash(struct ip_entry_struct *entry); ++/* veip_hash_lock should be taken for read by caller */ ++struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *); ++ ++/* veip_hash_lock should be taken for read by caller */ ++struct veip_struct *veip_find(envid_t veid); ++/* veip_hash_lock should be taken for write by caller */ ++struct veip_struct *veip_findcreate(envid_t veid); ++/* veip_hash_lock should be taken for write by caller */ ++void veip_put(struct veip_struct *veip); ++ ++extern struct list_head veip_lh; ++ ++int veip_start(struct ve_struct *ve); ++void veip_stop(struct ve_struct *ve); ++__exit void veip_cleanup(void); ++int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr); ++int veip_entry_del(envid_t veid, struct ve_addr_struct *addr); ++int venet_change_skb_owner(struct sk_buff *skb); ++ ++extern struct list_head ip_entry_hash_table[]; ++extern rwlock_t veip_hash_lock; ++ ++#ifdef CONFIG_PROC_FS ++int veip_seq_show(struct seq_file *m, void *v); ++#endif ++ ++#endif +diff --git a/include/linux/veprintk.h b/include/linux/veprintk.h +new file mode 100644 +index 0000000..5669d7b +--- /dev/null ++++ b/include/linux/veprintk.h +@@ -0,0 +1,38 @@ ++/* ++ * include/linux/veprintk.h ++ * ++ * Copyright (C) 2006 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __VE_PRINTK_H__ ++#define __VE_PRINTK_H__ ++ ++#ifdef CONFIG_VE ++ ++#define ve_log_wait (*(get_exec_env()->_log_wait)) ++#define ve_log_start (*(get_exec_env()->_log_start)) ++#define ve_log_end (*(get_exec_env()->_log_end)) ++#define ve_logged_chars (*(get_exec_env()->_logged_chars)) ++#define ve_log_buf (get_exec_env()->log_buf) ++#define ve_log_buf_len (ve_is_super(get_exec_env()) ? 
\ ++ log_buf_len : VE_DEFAULT_LOG_BUF_LEN) ++#define VE_LOG_BUF_MASK (ve_log_buf_len - 1) ++#define VE_LOG_BUF(idx) (ve_log_buf[(idx) & VE_LOG_BUF_MASK]) ++ ++#else ++ ++#define ve_log_wait log_wait ++#define ve_log_start log_start ++#define ve_log_end log_end ++#define ve_logged_chars logged_chars ++#define ve_log_buf log_buf ++#define ve_log_buf_len log_buf_len ++#define VE_LOG_BUF_MASK LOG_BUF_MASK ++#define VE_LOG_BUF(idx) LOG_BUF(idx) ++ ++#endif /* CONFIG_VE */ ++#endif /* __VE_PRINTK_H__ */ +diff --git a/include/linux/veth.h b/include/linux/veth.h +index 3354c1e..34cfe2b 100644 +--- a/include/linux/veth.h ++++ b/include/linux/veth.h +@@ -1,3 +1,12 @@ ++/* ++ * include/linux/veth.h ++ * ++ * Copyright (C) 2007 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ + #ifndef __NET_VETH_H_ + #define __NET_VETH_H_ + +@@ -9,4 +18,28 @@ enum { + #define VETH_INFO_MAX (__VETH_INFO_MAX - 1) + }; + ++#ifdef __KERNEL__ ++struct veth_struct ++{ ++ struct net_device_stats stats; ++ struct net_device *pair; ++ struct list_head hwaddr_list; ++ struct net_device_stats *real_stats; ++ int allow_mac_change; ++}; ++ ++#define veth_from_netdev(dev) \ ++ ((struct veth_struct *)(netdev_priv(dev))) ++static inline struct net_device * veth_to_netdev(struct veth_struct *veth) ++{ ++ return (struct net_device *)((char *)veth - ((sizeof(struct net_device) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST)); ++} ++#endif ++ ++static inline struct net_device_stats * ++veth_stats(struct net_device *dev, int cpuid) ++{ ++ return per_cpu_ptr(veth_from_netdev(dev)->real_stats, cpuid); ++} ++ + #endif +diff --git a/include/linux/virtinfo.h b/include/linux/virtinfo.h +new file mode 100644 +index 0000000..b0dad07 +--- /dev/null ++++ b/include/linux/virtinfo.h +@@ -0,0 +1,100 @@ ++/* ++ * include/linux/virtinfo.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
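The veth accessors pair a net_device with its veth_struct private area and
per-cpu stats; a fragmentary sketch of the fast-path usage they enable
(preemption safety around smp_processor_id() elided):

    struct veth_struct *veth = veth_from_netdev(dev);
    struct net_device_stats *s = veth_stats(dev, smp_processor_id());

    if (!veth->allow_mac_change)
            return -EPERM;          /* hypothetical MAC-change gate */
    s->tx_packets++;                /* lock-free: this cpu's copy */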
++ * ++ */ ++ ++#ifndef __LINUX_VIRTINFO_H ++#define __LINUX_VIRTINFO_H ++ ++#include ++#include ++#include ++ ++struct vnotifier_block ++{ ++ int (*notifier_call)(struct vnotifier_block *self, ++ unsigned long, void *, int); ++ struct vnotifier_block *next; ++ int priority; ++}; ++ ++extern struct semaphore virtinfo_sem; ++void __virtinfo_notifier_register(int type, struct vnotifier_block *nb); ++void virtinfo_notifier_register(int type, struct vnotifier_block *nb); ++void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb); ++int virtinfo_notifier_call(int type, unsigned long n, void *data); ++ ++struct page_info { ++ unsigned long nr_file_dirty; ++ unsigned long nr_writeback; ++ unsigned long nr_anon_pages; ++ unsigned long nr_file_mapped; ++ unsigned long nr_slab_rec; ++ unsigned long nr_slab_unrec; ++ unsigned long nr_pagetable; ++ unsigned long nr_unstable_nfs; ++ unsigned long nr_bounce; ++ unsigned long nr_writeback_temp; ++}; ++ ++struct meminfo { ++ struct sysinfo si; ++ struct page_info pi; ++ unsigned long active, inactive; ++ unsigned long cache, swapcache; ++ unsigned long committed_space; ++ unsigned long allowed; ++ unsigned long vmalloc_total, vmalloc_used, vmalloc_largest; ++}; ++ ++#define VIRTINFO_MEMINFO 0 ++#define VIRTINFO_ENOUGHMEM 1 ++#define VIRTINFO_DOFORK 2 ++#define VIRTINFO_DOEXIT 3 ++#define VIRTINFO_DOEXECVE 4 ++#define VIRTINFO_DOFORKRET 5 ++#define VIRTINFO_DOFORKPOST 6 ++#define VIRTINFO_EXIT 7 ++#define VIRTINFO_EXITMMAP 8 ++#define VIRTINFO_EXECMMAP 9 ++#define VIRTINFO_OUTOFMEM 10 ++#define VIRTINFO_PAGEIN 11 ++#define VIRTINFO_SYSINFO 12 ++#define VIRTINFO_NEWUBC 13 ++#define VIRTINFO_VMSTAT 14 ++ ++enum virt_info_types { ++ VITYPE_GENERAL, ++ VITYPE_FAUDIT, ++ VITYPE_QUOTA, ++ VITYPE_SCP, ++ ++ VIRT_TYPES ++}; ++ ++#ifdef CONFIG_VZ_GENCALLS ++ ++static inline int virtinfo_gencall(unsigned long n, void *data) ++{ ++ int r; ++ ++ r = virtinfo_notifier_call(VITYPE_GENERAL, n, data); ++ if (r & NOTIFY_FAIL) ++ return -ENOBUFS; ++ if (r & NOTIFY_OK) ++ return -ERESTARTNOINTR; ++ return 0; ++} ++ ++#else ++ ++#define virtinfo_gencall(n, data) 0 ++ ++#endif ++ ++#endif /* __LINUX_VIRTINFO_H */ +diff --git a/include/linux/virtinfoscp.h b/include/linux/virtinfoscp.h +new file mode 100644 +index 0000000..9e7584f +--- /dev/null ++++ b/include/linux/virtinfoscp.h +@@ -0,0 +1,21 @@ ++#ifndef __VIRTINFO_SCP_H__ ++#define __VIRTINFO_SCP_H__ ++ ++/* ++ * Dump and restore operations are non-symmetric. ++ * With respect to finish/fail hooks, 2 dump hooks are called from ++ * different proc operations, but restore hooks are called from a single one. 
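A hedged sketch of a consumer of the chain above: observe VIRTINFO_MEMINFO on
the general chain and pass the accumulated verdict through (the old_ret
pass-through convention is inferred from how virtinfo_gencall() tests the
result):

    static int meminfo_call(struct vnotifier_block *self,
                    unsigned long event, void *data, int old_ret)
    {
            struct meminfo *mi = data;

            if (event == VIRTINFO_MEMINFO)
                    printk(KERN_DEBUG "VE meminfo: totalram %lu\n",
                                    mi->si.totalram);
            return old_ret;
    }

    static struct vnotifier_block meminfo_nb = {
            .notifier_call  = meminfo_call,
            .priority       = 0,
    };

    virtinfo_notifier_register(VITYPE_GENERAL, &meminfo_nb);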
++ */ ++#define VIRTINFO_SCP_COLLECT 0x10 ++#define VIRTINFO_SCP_DUMP 0x11 ++#define VIRTINFO_SCP_DMPFIN 0x12 ++#define VIRTINFO_SCP_RSTCHECK 0x13 ++#define VIRTINFO_SCP_RESTORE 0x14 ++#define VIRTINFO_SCP_RSTFAIL 0x15 ++ ++#define VIRTINFO_SCP_RSTTSK 0x20 ++#define VIRTINFO_SCP_RSTMM 0x21 ++ ++#define VIRTNOTIFY_CHANGE 0x100 ++ ++#endif /* __VIRTINFO_SCP_H__ */ +diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h +index 364789a..3de21b7 100644 +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -22,6 +22,10 @@ struct vm_area_struct; + #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ + #endif + ++/* align size to 2^n page boundary */ ++#define POWER2_PAGE_ALIGN(size) \ ++ ((typeof(size))(1UL << (PAGE_SHIFT + get_order(size)))) ++ + struct vm_struct { + /* keep next,addr,size together to speedup lookups */ + struct vm_struct *next; +@@ -38,12 +42,16 @@ struct vm_struct { + * Highlevel APIs for driver use + */ + extern void *vmalloc(unsigned long size); ++extern void *ub_vmalloc(unsigned long size); + extern void *vmalloc_user(unsigned long size); + extern void *vmalloc_node(unsigned long size, int node); ++extern void *ub_vmalloc_node(unsigned long size, int node); + extern void *vmalloc_exec(unsigned long size); + extern void *vmalloc_32(unsigned long size); + extern void *vmalloc_32_user(unsigned long size); + extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); ++extern void *vmalloc_best(unsigned long size); ++extern void *ub_vmalloc_best(unsigned long size); + extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, + pgprot_t prot); + extern void vfree(const void *addr); +@@ -71,6 +79,9 @@ extern struct vm_struct *get_vm_area_caller(unsigned long size, + unsigned long flags, void *caller); + extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end); ++extern struct vm_struct * get_vm_area_best(unsigned long size, ++ unsigned long flags); ++extern void vprintstat(void); + extern struct vm_struct *get_vm_area_node(unsigned long size, + unsigned long flags, int node, + gfp_t gfp_mask); +diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h +index e83b693..fec874f 100644 +--- a/include/linux/vmstat.h ++++ b/include/linux/vmstat.h +@@ -92,6 +92,7 @@ static inline void vm_events_fold_cpu(int cpu) + } + #endif + ++extern unsigned long vm_events(enum vm_event_item i); + #else + + /* Disable counters */ +@@ -114,6 +115,7 @@ static inline void vm_events_fold_cpu(int cpu) + { + } + ++static inline unsigned long vm_events(enum vm_event_item i) { return 0; } + #endif /* CONFIG_VM_EVENT_COUNTERS */ + + #define __count_zone_vm_events(item, zone, delta) \ +diff --git a/include/linux/vzcalluser.h b/include/linux/vzcalluser.h +new file mode 100644 +index 0000000..9736479 +--- /dev/null ++++ b/include/linux/vzcalluser.h +@@ -0,0 +1,195 @@ ++/* ++ * include/linux/vzcalluser.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
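POWER2_PAGE_ALIGN() rounds a size up to a full power-of-two block of pages:
with 4k pages, 12345 gives get_order() of 2, hence 1UL << (12 + 2) = 16384.
The ub_* entry points are the charged twins of the stock allocators (a sketch;
the "best" placement semantics are not spelled out in this hunk):

    size = POWER2_PAGE_ALIGN(size);         /* e.g. 12345 -> 16384 */
    ptr = ub_vmalloc(size);                 /* charged to current's UB */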
++ * ++ */ ++ ++#ifndef _LINUX_VZCALLUSER_H ++#define _LINUX_VZCALLUSER_H ++ ++#include ++#include ++#include ++ ++#define KERN_VZ_PRIV_RANGE 51 ++ ++#ifndef __ENVID_T_DEFINED__ ++typedef unsigned envid_t; ++#define __ENVID_T_DEFINED__ ++#endif ++ ++#ifndef __KERNEL__ ++#define __user ++#endif ++ ++/* ++ * VE management ioctls ++ */ ++ ++struct vzctl_old_env_create { ++ envid_t veid; ++ unsigned flags; ++#define VE_CREATE 1 /* Create VE, VE_ENTER added automatically */ ++#define VE_EXCLUSIVE 2 /* Fail if exists */ ++#define VE_ENTER 4 /* Enter existing VE */ ++#define VE_TEST 8 /* Test if VE exists */ ++#define VE_LOCK 16 /* Do not allow entering created VE */ ++#define VE_SKIPLOCK 32 /* Allow entering embrion VE */ ++ __u32 addr; ++}; ++ ++struct vzctl_mark_env_to_down { ++ envid_t veid; ++}; ++ ++struct vzctl_setdevperms { ++ envid_t veid; ++ unsigned type; ++#define VE_USE_MAJOR 010 /* Test MAJOR supplied in rule */ ++#define VE_USE_MINOR 030 /* Test MINOR supplied in rule */ ++#define VE_USE_MASK 030 /* Testing mask, VE_USE_MAJOR|VE_USE_MINOR */ ++ unsigned dev; ++ unsigned mask; ++}; ++ ++struct vzctl_ve_netdev { ++ envid_t veid; ++ int op; ++#define VE_NETDEV_ADD 1 ++#define VE_NETDEV_DEL 2 ++ char __user *dev_name; ++}; ++ ++struct vzctl_ve_meminfo { ++ envid_t veid; ++ unsigned long val; ++}; ++ ++struct vzctl_env_create_cid { ++ envid_t veid; ++ unsigned flags; ++ __u32 class_id; ++}; ++ ++struct vzctl_env_create { ++ envid_t veid; ++ unsigned flags; ++ __u32 class_id; ++}; ++ ++struct env_create_param { ++ __u64 iptables_mask; ++}; ++ ++#define VZCTL_ENV_CREATE_DATA_MINLEN sizeof(struct env_create_param) ++ ++struct env_create_param2 { ++ __u64 iptables_mask; ++ __u64 feature_mask; ++ __u32 total_vcpus; /* 0 - don't care, same as in host */ ++}; ++ ++struct env_create_param3 { ++ __u64 iptables_mask; ++ __u64 feature_mask; ++ __u32 total_vcpus; ++ __u32 pad; ++ __u64 known_features; ++}; ++ ++#define VE_FEATURE_SYSFS (1ULL << 0) ++#define VE_FEATURE_DEF_PERMS (1ULL << 2) ++ ++#define VE_FEATURES_OLD (VE_FEATURE_SYSFS) ++#define VE_FEATURES_DEF (VE_FEATURE_SYSFS | \ ++ VE_FEATURE_DEF_PERMS) ++ ++typedef struct env_create_param3 env_create_param_t; ++#define VZCTL_ENV_CREATE_DATA_MAXLEN sizeof(env_create_param_t) ++ ++struct vzctl_env_create_data { ++ envid_t veid; ++ unsigned flags; ++ __u32 class_id; ++ env_create_param_t __user *data; ++ int datalen; ++}; ++ ++struct vz_load_avg { ++ int val_int; ++ int val_frac; ++}; ++ ++struct vz_cpu_stat { ++ unsigned long user_jif; ++ unsigned long nice_jif; ++ unsigned long system_jif; ++ unsigned long uptime_jif; ++ __u64 idle_clk; ++ __u64 strv_clk; ++ __u64 uptime_clk; ++ struct vz_load_avg avenrun[3]; /* loadavg data */ ++}; ++ ++struct vzctl_cpustatctl { ++ envid_t veid; ++ struct vz_cpu_stat __user *cpustat; ++}; ++ ++#define VZCTLTYPE '.' 
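For reference, a hedged userspace sketch of driving the VZCTL_ENV_CREATE_DATA
ioctl defined just below; the /dev/vzctl node name comes from the vzctl
tooling and is an assumption, not part of this header:

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/vzcalluser.h>

    int create_ve(envid_t veid)
    {
            struct env_create_param3 prm;
            struct vzctl_env_create_data cd;
            int fd, ret;

            fd = open("/dev/vzctl", O_RDWR);        /* assumed node name */
            if (fd < 0)
                    return -1;

            memset(&prm, 0, sizeof(prm));
            memset(&cd, 0, sizeof(cd));
            cd.veid = veid;
            cd.flags = VE_CREATE | VE_EXCLUSIVE;
            cd.data = &prm;
            cd.datalen = sizeof(prm);

            ret = ioctl(fd, VZCTL_ENV_CREATE_DATA, &cd);
            close(fd);
            return ret;
    }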
++#define VZCTL_OLD_ENV_CREATE _IOW(VZCTLTYPE, 0, \ ++ struct vzctl_old_env_create) ++#define VZCTL_MARK_ENV_TO_DOWN _IOW(VZCTLTYPE, 1, \ ++ struct vzctl_mark_env_to_down) ++#define VZCTL_SETDEVPERMS _IOW(VZCTLTYPE, 2, \ ++ struct vzctl_setdevperms) ++#define VZCTL_ENV_CREATE_CID _IOW(VZCTLTYPE, 4, \ ++ struct vzctl_env_create_cid) ++#define VZCTL_ENV_CREATE _IOW(VZCTLTYPE, 5, \ ++ struct vzctl_env_create) ++#define VZCTL_GET_CPU_STAT _IOW(VZCTLTYPE, 6, \ ++ struct vzctl_cpustatctl) ++#define VZCTL_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10, \ ++ struct vzctl_env_create_data) ++#define VZCTL_VE_NETDEV _IOW(VZCTLTYPE, 11, \ ++ struct vzctl_ve_netdev) ++#define VZCTL_VE_MEMINFO _IOW(VZCTLTYPE, 13, \ ++ struct vzctl_ve_meminfo) ++ ++#ifdef __KERNEL__ ++#ifdef CONFIG_COMPAT ++#include ++ ++struct compat_vzctl_ve_netdev { ++ envid_t veid; ++ int op; ++ compat_uptr_t dev_name; ++}; ++ ++struct compat_vzctl_ve_meminfo { ++ envid_t veid; ++ compat_ulong_t val; ++}; ++ ++struct compat_vzctl_env_create_data { ++ envid_t veid; ++ unsigned flags; ++ __u32 class_id; ++ compat_uptr_t data; ++ int datalen; ++}; ++ ++#define VZCTL_COMPAT_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10, \ ++ struct compat_vzctl_env_create_data) ++#define VZCTL_COMPAT_VE_NETDEV _IOW(VZCTLTYPE, 11, \ ++ struct compat_vzctl_ve_netdev) ++#define VZCTL_COMPAT_VE_MEMINFO _IOW(VZCTLTYPE, 13, \ ++ struct compat_vzctl_ve_meminfo) ++#endif ++#endif ++ ++#endif +diff --git a/include/linux/vzctl.h b/include/linux/vzctl.h +new file mode 100644 +index 0000000..ad967ed +--- /dev/null ++++ b/include/linux/vzctl.h +@@ -0,0 +1,30 @@ ++/* ++ * include/linux/vzctl.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef _LINUX_VZCTL_H ++#define _LINUX_VZCTL_H ++ ++#include ++ ++struct module; ++struct inode; ++struct file; ++struct vzioctlinfo { ++ unsigned type; ++ int (*ioctl)(struct file *, unsigned int, unsigned long); ++ int (*compat_ioctl)(struct file *, unsigned int, unsigned long); ++ struct module *owner; ++ struct list_head list; ++}; ++ ++extern void vzioctl_register(struct vzioctlinfo *inf); ++extern void vzioctl_unregister(struct vzioctlinfo *inf); ++ ++#endif +diff --git a/include/linux/vzctl_quota.h b/include/linux/vzctl_quota.h +new file mode 100644 +index 0000000..6d36cdd +--- /dev/null ++++ b/include/linux/vzctl_quota.h +@@ -0,0 +1,74 @@ ++/* ++ * include/linux/vzctl_quota.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __LINUX_VZCTL_QUOTA_H__ ++#define __LINUX_VZCTL_QUOTA_H__ ++ ++#include ++ ++#ifndef __KERNEL__ ++#define __user ++#endif ++ ++/* ++ * Quota management ioctl ++ */ ++ ++struct vz_quota_stat; ++struct vzctl_quotactl { ++ int cmd; ++ unsigned int quota_id; ++ struct vz_quota_stat __user *qstat; ++ char __user *ve_root; ++}; ++ ++struct vzctl_quotaugidctl { ++ int cmd; /* subcommand */ ++ unsigned int quota_id; /* quota id where it applies to */ ++ unsigned int ugid_index;/* for reading statistic. 
index of first ++ uid/gid record to read */ ++ unsigned int ugid_size; /* size of ugid_buf array */ ++ void *addr; /* user-level buffer */ ++}; ++ ++#define VZDQCTLTYPE '+' ++#define VZCTL_QUOTA_DEPR_CTL _IOWR(VZDQCTLTYPE, 1, \ ++ struct vzctl_quotactl) ++#define VZCTL_QUOTA_NEW_CTL _IOWR(VZDQCTLTYPE, 2, \ ++ struct vzctl_quotactl) ++#define VZCTL_QUOTA_UGID_CTL _IOWR(VZDQCTLTYPE, 3, \ ++ struct vzctl_quotaugidctl) ++ ++#ifdef __KERNEL__ ++#ifdef CONFIG_COMPAT ++struct compat_vzctl_quotactl { ++ int cmd; ++ unsigned int quota_id; ++ compat_uptr_t qstat; ++ compat_uptr_t ve_root; ++}; ++ ++struct compat_vzctl_quotaugidctl { ++ int cmd; /* subcommand */ ++ unsigned int quota_id; /* quota id where it applies to */ ++ unsigned int ugid_index;/* for reading statistic. index of first ++ uid/gid record to read */ ++ unsigned int ugid_size; /* size of ugid_buf array */ ++ compat_uptr_t addr; /* user-level buffer */ ++}; ++ ++#define VZCTL_COMPAT_QUOTA_CTL _IOWR(VZDQCTLTYPE, 2, \ ++ struct compat_vzctl_quotactl) ++#define VZCTL_COMPAT_QUOTA_UGID_CTL _IOWR(VZDQCTLTYPE, 3, \ ++ struct compat_vzctl_quotaugidctl) ++#endif ++#endif ++ ++#endif /* __LINUX_VZCTL_QUOTA_H__ */ +diff --git a/include/linux/vzctl_venet.h b/include/linux/vzctl_venet.h +new file mode 100644 +index 0000000..4797a50 +--- /dev/null ++++ b/include/linux/vzctl_venet.h +@@ -0,0 +1,51 @@ ++/* ++ * include/linux/vzctl_venet.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef _VZCTL_VENET_H ++#define _VZCTL_VENET_H ++ ++#include ++#include ++#include ++ ++#ifndef __ENVID_T_DEFINED__ ++typedef unsigned envid_t; ++#define __ENVID_T_DEFINED__ ++#endif ++ ++struct vzctl_ve_ip_map { ++ envid_t veid; ++ int op; ++#define VE_IP_ADD 1 ++#define VE_IP_DEL 2 ++ struct sockaddr *addr; ++ int addrlen; ++}; ++ ++#define VENETCTLTYPE '(' ++ ++#define VENETCTL_VE_IP_MAP _IOW(VENETCTLTYPE, 3, \ ++ struct vzctl_ve_ip_map) ++ ++#ifdef __KERNEL__ ++#ifdef CONFIG_COMPAT ++struct compat_vzctl_ve_ip_map { ++ envid_t veid; ++ int op; ++ compat_uptr_t addr; ++ int addrlen; ++}; ++ ++#define VENETCTL_COMPAT_VE_IP_MAP _IOW(VENETCTLTYPE, 3, \ ++ struct compat_vzctl_ve_ip_map) ++#endif ++#endif ++ ++#endif +diff --git a/include/linux/vzctl_veth.h b/include/linux/vzctl_veth.h +new file mode 100644 +index 0000000..1480c5b +--- /dev/null ++++ b/include/linux/vzctl_veth.h +@@ -0,0 +1,42 @@ ++/* ++ * include/linux/vzctl_veth.h ++ * ++ * Copyright (C) 2006 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef _VZCTL_VETH_H ++#define _VZCTL_VETH_H ++ ++#include ++#include ++ ++#ifndef __ENVID_T_DEFINED__ ++typedef unsigned envid_t; ++#define __ENVID_T_DEFINED__ ++#endif ++ ++struct vzctl_ve_hwaddr { ++ envid_t veid; ++ int op; ++#define VE_ETH_ADD 1 ++#define VE_ETH_DEL 2 ++#define VE_ETH_ALLOW_MAC_CHANGE 3 ++#define VE_ETH_DENY_MAC_CHANGE 4 ++ unsigned char dev_addr[6]; ++ int addrlen; ++ char dev_name[16]; ++ unsigned char dev_addr_ve[6]; ++ int addrlen_ve; ++ char dev_name_ve[16]; ++}; ++ ++#define VETHCTLTYPE '[' ++ ++#define VETHCTL_VE_HWADDR _IOW(VETHCTLTYPE, 3, \ ++ struct vzctl_ve_hwaddr) ++ ++#endif +diff --git a/include/linux/vzdq_tree.h b/include/linux/vzdq_tree.h +new file mode 100644 +index 0000000..c019e09 +--- /dev/null ++++ b/include/linux/vzdq_tree.h +@@ -0,0 +1,99 @@ ++/* ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ * This file contains Virtuozzo disk quota tree definition ++ */ ++ ++#ifndef _VZDQ_TREE_H ++#define _VZDQ_TREE_H ++ ++#include ++#include ++ ++typedef unsigned int quotaid_t; ++#define QUOTAID_BITS 32 ++#define QUOTAID_BBITS 4 ++#define QUOTAID_EBITS 8 ++ ++#if QUOTAID_EBITS % QUOTAID_BBITS ++#error Quota bit assumption failure ++#endif ++ ++#define QUOTATREE_BSIZE (1 << QUOTAID_BBITS) ++#define QUOTATREE_BMASK (QUOTATREE_BSIZE - 1) ++#define QUOTATREE_DEPTH ((QUOTAID_BITS + QUOTAID_BBITS - 1) \ ++ / QUOTAID_BBITS) ++#define QUOTATREE_EDEPTH ((QUOTAID_BITS + QUOTAID_EBITS - 1) \ ++ / QUOTAID_EBITS) ++#define QUOTATREE_BSHIFT(lvl) ((QUOTATREE_DEPTH - (lvl) - 1) * QUOTAID_BBITS) ++ ++/* ++ * Depth of keeping unused node (not inclusive). ++ * 0 means release all nodes including root, ++ * QUOTATREE_DEPTH means never release nodes. ++ * Current value: release all nodes strictly after QUOTATREE_EDEPTH ++ * (measured in external shift units). ++ */ ++#define QUOTATREE_CDEPTH (QUOTATREE_DEPTH \ ++ - 2 * QUOTATREE_DEPTH / QUOTATREE_EDEPTH \ ++ + 1) ++ ++/* ++ * Levels 0..(QUOTATREE_DEPTH-1) are tree nodes. ++ * On level i the maximal number of nodes is 2^(i*QUOTAID_BBITS), ++ * and each node contains 2^QUOTAID_BBITS pointers. ++ * Level 0 is a (single) tree root node. ++ * ++ * Nodes of level (QUOTATREE_DEPTH-1) contain pointers to caller's data. ++ * Nodes of lower levels contain pointers to nodes. ++ * ++ * Double pointer in array of i-level node, pointing to a (i+1)-level node ++ * (such as inside quotatree_find_state) are marked by level (i+1), not i. ++ * Level 0 double pointer is a pointer to root inside tree struct. ++ * ++ * The tree is permanent, i.e. all index blocks allocated are keeped alive to ++ * preserve the blocks numbers in the quota file tree to keep its changes ++ * locally. ++ */ ++struct quotatree_node { ++ struct list_head list; ++ quotaid_t num; ++ void *blocks[QUOTATREE_BSIZE]; ++}; ++ ++struct quotatree_level { ++ struct list_head usedlh, freelh; ++ quotaid_t freenum; ++}; ++ ++struct quotatree_tree { ++ struct quotatree_level levels[QUOTATREE_DEPTH]; ++ struct quotatree_node *root; ++ unsigned int leaf_num; ++}; ++ ++struct quotatree_find_state { ++ void **block; ++ int level; ++}; ++ ++/* number of leafs (objects) and leaf level of the tree */ ++#define QTREE_LEAFNUM(tree) ((tree)->leaf_num) ++#define QTREE_LEAFLVL(tree) (&(tree)->levels[QUOTATREE_DEPTH - 1]) ++ ++struct quotatree_tree *quotatree_alloc(void); ++void *quotatree_find(struct quotatree_tree *tree, quotaid_t id, ++ struct quotatree_find_state *st); ++int quotatree_insert(struct quotatree_tree *tree, quotaid_t id, ++ struct quotatree_find_state *st, void *data); ++void quotatree_remove(struct quotatree_tree *tree, quotaid_t id); ++void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *)); ++void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id); ++void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index); ++ ++#endif /* _VZDQ_TREE_H */ ++ +diff --git a/include/linux/vzevent.h b/include/linux/vzevent.h +new file mode 100644 +index 0000000..1a67297 +--- /dev/null ++++ b/include/linux/vzevent.h +@@ -0,0 +1,13 @@ ++#ifndef __LINUX_VZ_EVENT_H__ ++#define __LINUX_VZ_EVENT_H__ ++ ++#if defined(CONFIG_VZ_EVENT) || defined(CONFIG_VZ_EVENT_MODULE) ++extern int vzevent_send(int msg, const char *attrs_fmt, ...); ++#else ++static inline int vzevent_send(int msg, const char *attrs_fmt, ...) 
+diff --git a/include/linux/vzevent.h b/include/linux/vzevent.h
+new file mode 100644
+index 0000000..1a67297
+--- /dev/null
++++ b/include/linux/vzevent.h
+@@ -0,0 +1,13 @@
++#ifndef __LINUX_VZ_EVENT_H__
++#define __LINUX_VZ_EVENT_H__
++
++#if defined(CONFIG_VZ_EVENT) || defined(CONFIG_VZ_EVENT_MODULE)
++extern int vzevent_send(int msg, const char *attrs_fmt, ...);
++#else
++static inline int vzevent_send(int msg, const char *attrs_fmt, ...)
++{
++ return 0;
++}
++#endif
++
++#endif /* __LINUX_VZ_EVENT_H__ */
+diff --git a/include/linux/vziptable_defs.h b/include/linux/vziptable_defs.h
+new file mode 100644
+index 0000000..ec7586f
+--- /dev/null
++++ b/include/linux/vziptable_defs.h
+@@ -0,0 +1,51 @@
++#ifndef _LINUX_VZIPTABLE_DEFS_H
++#define _LINUX_VZIPTABLE_DEFS_H
++
++/* these masks represent modules */
++#define VE_IP_IPTABLES_MOD (1U<<0)
++#define VE_IP_FILTER_MOD (1U<<1)
++#define VE_IP_MANGLE_MOD (1U<<2)
++#define VE_IP_CONNTRACK_MOD (1U<<14)
++#define VE_IP_CONNTRACK_FTP_MOD (1U<<15)
++#define VE_IP_CONNTRACK_IRC_MOD (1U<<16)
++#define VE_IP_NAT_MOD (1U<<20)
++#define VE_IP_NAT_FTP_MOD (1U<<21)
++#define VE_IP_NAT_IRC_MOD (1U<<22)
++#define VE_IP_IPTABLES6_MOD (1U<<26)
++#define VE_IP_FILTER6_MOD (1U<<27)
++#define VE_IP_MANGLE6_MOD (1U<<28)
++#define VE_IP_IPTABLE_NAT_MOD (1U<<29)
++#define VE_NF_CONNTRACK_MOD (1U<<30)
++
++/* these masks represent modules with their dependencies */
++#define VE_IP_IPTABLES (VE_IP_IPTABLES_MOD)
++#define VE_IP_FILTER (VE_IP_FILTER_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_MANGLE (VE_IP_MANGLE_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_IPTABLES6 (VE_IP_IPTABLES6_MOD)
++#define VE_IP_FILTER6 (VE_IP_FILTER6_MOD | VE_IP_IPTABLES6)
++#define VE_IP_MANGLE6 (VE_IP_MANGLE6_MOD | VE_IP_IPTABLES6)
++#define VE_NF_CONNTRACK (VE_NF_CONNTRACK_MOD | VE_IP_IPTABLES)
++#define VE_IP_CONNTRACK (VE_IP_CONNTRACK_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_CONNTRACK_FTP (VE_IP_CONNTRACK_FTP_MOD \
++ | VE_IP_CONNTRACK)
++#define VE_IP_CONNTRACK_IRC (VE_IP_CONNTRACK_IRC_MOD \
++ | VE_IP_CONNTRACK)
++#define VE_IP_NAT (VE_IP_NAT_MOD \
++ | VE_IP_CONNTRACK)
++#define VE_IP_NAT_FTP (VE_IP_NAT_FTP_MOD \
++ | VE_IP_NAT | VE_IP_CONNTRACK_FTP)
++#define VE_IP_NAT_IRC (VE_IP_NAT_IRC_MOD \
++ | VE_IP_NAT | VE_IP_CONNTRACK_IRC)
++#define VE_IP_IPTABLE_NAT (VE_IP_IPTABLE_NAT_MOD | VE_IP_CONNTRACK)
++
++/* safe iptables mask to be used by default */
++#define VE_IP_DEFAULT \
++ (VE_IP_IPTABLES | \
++ VE_IP_FILTER | VE_IP_MANGLE)
++
++#define VE_IPT_CMP(x, y) (((x) & (y)) == (y))
++
++#endif /* _LINUX_VZIPTABLE_DEFS_H */
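The distinction between the *_MOD bits and the unsuffixed masks is that the latter fold in every prerequisite, so a single VE_IPT_CMP() answers "is this feature, and everything it depends on, enabled for this container". A small illustration (not part of the patch; it assumes the header above is visible to the compiler, and the mask value is arbitrary):

#include <stdio.h>
#include <linux/vziptable_defs.h>

int main(void)
{
	unsigned int ve_mask = VE_IP_DEFAULT | VE_IP_NAT;

	/* VE_IP_NAT expands to NAT + conntrack + iptables core: all present */
	printf("NAT usable:     %d\n", VE_IPT_CMP(ve_mask, VE_IP_NAT));
	/* VE_IP_NAT_FTP additionally needs the FTP NAT/conntrack helper bits */
	printf("FTP NAT usable: %d\n", VE_IPT_CMP(ve_mask, VE_IP_NAT_FTP));
	return 0;
}

The first test yields 1; the second yields 0, because VE_IP_NAT_FTP_MOD and VE_IP_CONNTRACK_FTP_MOD are not in the mask.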
+diff --git a/include/linux/vzquota.h b/include/linux/vzquota.h
+new file mode 100644
+index 0000000..18668e6
+--- /dev/null
++++ b/include/linux/vzquota.h
+@@ -0,0 +1,379 @@
++/*
++ *
++ * Copyright (C) 2001-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * This file contains the Virtuozzo disk quota implementation
++ */
++
++#ifndef _VZDQUOTA_H
++#define _VZDQUOTA_H
++
++#include
++#include
++
++/* vzquotactl syscall commands */
++#define VZ_DQ_CREATE 5 /* create quota master block */
++#define VZ_DQ_DESTROY 6 /* destroy qmblk */
++#define VZ_DQ_ON 7 /* mark dentry with already created qmblk */
++#define VZ_DQ_OFF 8 /* remove mark, don't destroy qmblk */
++#define VZ_DQ_SETLIMIT 9 /* set new limits */
++#define VZ_DQ_GETSTAT 10 /* get usage statistic */
++#define VZ_DQ_OFF_FORCED 11 /* forced off */
++/* set of syscalls to maintain UGID quotas */
++#define VZ_DQ_UGID_GETSTAT 1 /* get usage/limits for ugid(s) */
++#define VZ_DQ_UGID_ADDSTAT 2 /* set usage/limits statistic for ugid(s) */
++#define VZ_DQ_UGID_GETGRACE 3 /* get expire times */
++#define VZ_DQ_UGID_SETGRACE 4 /* set expire times */
++#define VZ_DQ_UGID_GETCONFIG 5 /* get ugid_max limit, cnt, flags of qmblk */
++#define VZ_DQ_UGID_SETCONFIG 6 /* set ugid_max limit, flags of qmblk */
++#define VZ_DQ_UGID_SETLIMIT 7 /* set ugid B/I limits */
++#define VZ_DQ_UGID_SETINFO 8 /* set ugid info */
++
++/* common structure for vz and ugid quota */
++struct dq_stat {
++ /* blocks limits */
++ __u64 bhardlimit; /* absolute limit in bytes */
++ __u64 bsoftlimit; /* preferred limit in bytes */
++ time_t btime; /* time limit for excessive disk use */
++ __u64 bcurrent; /* current bytes count */
++ /* inodes limits */
++ __u32 ihardlimit; /* absolute limit on allocated inodes */
++ __u32 isoftlimit; /* preferred inode limit */
++ time_t itime; /* time limit for excessive inode use */
++ __u32 icurrent; /* current # allocated inodes */
++};
++
++/* One second resolution for grace times */
++#define CURRENT_TIME_SECONDS (get_seconds())
++
++/* Values for dq_info->flags */
++#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */
++#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */
++
++struct dq_info {
++ time_t bexpire; /* expire timeout for excessive disk use */
++ time_t iexpire; /* expire timeout for excessive inode use */
++ unsigned flags; /* see previous defines */
++};
++
++struct vz_quota_stat {
++ struct dq_stat dq_stat;
++ struct dq_info dq_info;
++};
++
++/* UID/GID interface record - for user-kernel level exchange */
++struct vz_quota_iface {
++ unsigned int qi_id; /* UID/GID this applies to */
++ unsigned int qi_type; /* USRQUOTA|GRPQUOTA */
++ struct dq_stat qi_stat; /* limits, options, usage stats */
++};
++
++#ifdef CONFIG_COMPAT
++#include
++struct compat_dq_stat {
++ /* blocks limits */
++ __u64 bhardlimit; /* absolute limit in bytes */
++ __u64 bsoftlimit; /* preferred limit in bytes */
++ compat_time_t btime; /* time limit for excessive disk use */
++ __u64 bcurrent; /* current bytes count */
++ /* inodes limits */
++ __u32 ihardlimit; /* absolute limit on allocated inodes */
++ __u32 isoftlimit; /* preferred inode limit */
++ compat_time_t itime; /* time limit for excessive inode use */
++ __u32 icurrent; /* current # allocated inodes */
++};
++
++struct compat_dq_info {
++ compat_time_t bexpire; /* expire timeout for excessive disk use */
++ compat_time_t iexpire; /* expire timeout for excessive inode use */
++ unsigned flags; /* see previous defines */
++};
++
++struct compat_vz_quota_stat {
++ struct compat_dq_stat dq_stat;
++ struct compat_dq_info dq_info;
++};
++
++struct compat_vz_quota_iface {
++ unsigned int qi_id; /* UID/GID this applies to */
++ unsigned int qi_type; /* USRQUOTA|GRPQUOTA */
++ struct compat_dq_stat qi_stat; /* limits, options,
usage stats */ ++}; ++ ++static inline void compat_dqstat2dqstat(struct compat_dq_stat *odqs, ++ struct dq_stat *dqs) ++{ ++ dqs->bhardlimit = odqs->bhardlimit; ++ dqs->bsoftlimit = odqs->bsoftlimit; ++ dqs->bcurrent = odqs->bcurrent; ++ dqs->btime = odqs->btime; ++ ++ dqs->ihardlimit = odqs->ihardlimit; ++ dqs->isoftlimit = odqs->isoftlimit; ++ dqs->icurrent = odqs->icurrent; ++ dqs->itime = odqs->itime; ++} ++ ++static inline void compat_dqinfo2dqinfo(struct compat_dq_info *odqi, ++ struct dq_info *dqi) ++{ ++ dqi->bexpire = odqi->bexpire; ++ dqi->iexpire = odqi->iexpire; ++ dqi->flags = odqi->flags; ++} ++ ++static inline void dqstat2compat_dqstat(struct dq_stat *dqs, ++ struct compat_dq_stat *odqs) ++{ ++ odqs->bhardlimit = dqs->bhardlimit; ++ odqs->bsoftlimit = dqs->bsoftlimit; ++ odqs->bcurrent = dqs->bcurrent; ++ odqs->btime = (compat_time_t)dqs->btime; ++ ++ odqs->ihardlimit = dqs->ihardlimit; ++ odqs->isoftlimit = dqs->isoftlimit; ++ odqs->icurrent = dqs->icurrent; ++ odqs->itime = (compat_time_t)dqs->itime; ++} ++ ++static inline void dqinfo2compat_dqinfo(struct dq_info *dqi, ++ struct compat_dq_info *odqi) ++{ ++ odqi->bexpire = (compat_time_t)dqi->bexpire; ++ odqi->iexpire = (compat_time_t)dqi->iexpire; ++ odqi->flags = dqi->flags; ++} ++#endif ++ ++/* values for flags and dq_flags */ ++/* this flag is set if the userspace has been unable to provide usage ++ * information about all ugids ++ * if the flag is set, we don't allocate new UG quota blocks (their ++ * current usage is unknown) or free existing UG quota blocks (not to ++ * lose information that this block is ok) */ ++#define VZDQUG_FIXED_SET 0x01 ++/* permit to use ugid quota */ ++#define VZDQUG_ON 0x02 ++#define VZDQ_USRQUOTA 0x10 ++#define VZDQ_GRPQUOTA 0x20 ++#define VZDQ_NOACT 0x1000 /* not actual */ ++#define VZDQ_NOQUOT 0x2000 /* not under quota tree */ ++ ++struct vz_quota_ugid_stat { ++ unsigned int limit; /* max amount of ugid records */ ++ unsigned int count; /* amount of ugid records */ ++ unsigned int flags; ++}; ++ ++struct vz_quota_ugid_setlimit { ++ unsigned int type; /* quota type (USR/GRP) */ ++ unsigned int id; /* ugid */ ++ struct if_dqblk dqb; /* limits info */ ++}; ++ ++struct vz_quota_ugid_setinfo { ++ unsigned int type; /* quota type (USR/GRP) */ ++ struct if_dqinfo dqi; /* grace info */ ++}; ++ ++#ifdef __KERNEL__ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* Values for dq_info flags */ ++#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */ ++#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */ ++ ++/* values for dq_state */ ++#define VZDQ_STARTING 0 /* created, not turned on yet */ ++#define VZDQ_WORKING 1 /* quota created, turned on */ ++#define VZDQ_STOPING 2 /* created, turned on and off */ ++ ++/* master quota record - one per veid */ ++struct vz_quota_master { ++ struct list_head dq_hash; /* next quota in hash list */ ++ atomic_t dq_count; /* inode reference count */ ++ unsigned int dq_flags; /* see VZDQUG_FIXED_SET */ ++ unsigned int dq_state; /* see values above */ ++ unsigned int dq_id; /* VEID this applies to */ ++ struct dq_stat dq_stat; /* limits, grace, usage stats */ ++ struct dq_info dq_info; /* grace times and flags */ ++ spinlock_t dq_data_lock; /* for dq_stat */ ++ ++ struct semaphore dq_sem; /* semaphore to protect ++ ugid tree */ ++ ++ struct list_head dq_ilink_list; /* list of vz_quota_ilink */ ++ struct quotatree_tree *dq_uid_tree; /* vz_quota_ugid tree for UIDs */ ++ struct quotatree_tree *dq_gid_tree; /* vz_quota_ugid tree 
for GIDs */ ++ unsigned int dq_ugid_count; /* amount of ugid records */ ++ unsigned int dq_ugid_max; /* max amount of ugid records */ ++ struct dq_info dq_ugid_info[MAXQUOTAS]; /* ugid grace times */ ++ ++ struct path dq_root_path; /* path of fs tree */ ++ struct super_block *dq_sb; /* superblock of our quota root */ ++}; ++ ++/* UID/GID quota record - one per pair (quota_master, uid or gid) */ ++struct vz_quota_ugid { ++ unsigned int qugid_id; /* UID/GID this applies to */ ++ struct dq_stat qugid_stat; /* limits, options, usage stats */ ++ int qugid_type; /* USRQUOTA|GRPQUOTA */ ++ atomic_t qugid_count; /* reference count */ ++}; ++ ++#define VZ_QUOTA_UGBAD ((struct vz_quota_ugid *)0xfeafea11) ++ ++struct vz_quota_datast { ++ struct vz_quota_ilink qlnk; ++}; ++ ++#define VIRTINFO_QUOTA_GETSTAT 0 ++#define VIRTINFO_QUOTA_ON 1 ++#define VIRTINFO_QUOTA_OFF 2 ++#define VIRTINFO_QUOTA_DISABLE 3 ++ ++struct virt_info_quota { ++ struct super_block *super; ++ struct dq_stat *qstat; ++}; ++ ++/* ++ * Interface to VZ quota core ++ */ ++#define INODE_QLNK(inode) (&(inode)->i_qlnk) ++#define QLNK_INODE(qlnk) container_of((qlnk), struct inode, i_qlnk) ++ ++#define VZ_QUOTA_BAD ((struct vz_quota_master *)0xefefefef) ++ ++#define VZ_QUOTAO_SETE 1 ++#define VZ_QUOTAO_INIT 2 ++#define VZ_QUOTAO_DESTR 3 ++#define VZ_QUOTAO_SWAP 4 ++#define VZ_QUOTAO_INICAL 5 ++#define VZ_QUOTAO_DRCAL 6 ++#define VZ_QUOTAO_QSET 7 ++#define VZ_QUOTAO_TRANS 8 ++#define VZ_QUOTAO_ACT 9 ++#define VZ_QUOTAO_DTREE 10 ++#define VZ_QUOTAO_DET 11 ++#define VZ_QUOTAO_ON 12 ++#define VZ_QUOTAO_RE_LOCK 13 ++ ++#define DQUOT_CMD_ALLOC 0 ++#define DQUOT_CMD_PREALLOC 1 ++#define DQUOT_CMD_CHECK 12 ++#define DQUOT_CMD_FORCE 13 ++ ++extern struct semaphore vz_quota_sem; ++void inode_qmblk_lock(struct super_block *sb); ++void inode_qmblk_unlock(struct super_block *sb); ++void qmblk_data_read_lock(struct vz_quota_master *qmblk); ++void qmblk_data_read_unlock(struct vz_quota_master *qmblk); ++void qmblk_data_write_lock(struct vz_quota_master *qmblk); ++void qmblk_data_write_unlock(struct vz_quota_master *qmblk); ++ ++/* for quota operations */ ++void vzquota_inode_init_call(struct inode *inode); ++void vzquota_inode_drop_call(struct inode *inode); ++int vzquota_inode_transfer_call(struct inode *, struct iattr *); ++struct vz_quota_master *vzquota_inode_data(struct inode *inode, ++ struct vz_quota_datast *); ++void vzquota_data_unlock(struct inode *inode, struct vz_quota_datast *); ++int vzquota_rename_check(struct inode *inode, ++ struct inode *old_dir, struct inode *new_dir); ++struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode); ++/* for second-level quota */ ++struct vz_quota_master *vzquota_find_qmblk(struct super_block *); ++/* for management operations */ ++struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id, ++ struct vz_quota_stat *qstat); ++void vzquota_free_master(struct vz_quota_master *); ++struct vz_quota_master *vzquota_find_master(unsigned int quota_id); ++int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, ++ struct vz_quota_master *qmblk, char __user *buf); ++int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk, ++ char __user *buf, int force); ++int vzquota_get_super(struct super_block *sb); ++void vzquota_put_super(struct super_block *sb); ++ ++static inline struct vz_quota_master *qmblk_get(struct vz_quota_master *qmblk) ++{ ++ if (!atomic_read(&qmblk->dq_count)) ++ BUG(); ++ atomic_inc(&qmblk->dq_count); ++ return qmblk; ++} ++ ++static inline void 
__qmblk_put(struct vz_quota_master *qmblk) ++{ ++ atomic_dec(&qmblk->dq_count); ++} ++ ++static inline void qmblk_put(struct vz_quota_master *qmblk) ++{ ++ if (!atomic_dec_and_test(&qmblk->dq_count)) ++ return; ++ vzquota_free_master(qmblk); ++} ++ ++extern struct list_head vzquota_hash_table[]; ++extern int vzquota_hash_size; ++ ++/* ++ * Interface to VZ UGID quota ++ */ ++extern struct quotactl_ops vz_quotactl_operations; ++extern struct dquot_operations vz_quota_operations2; ++extern struct quota_format_type vz_quota_empty_v2_format; ++ ++#define QUGID_TREE(qmblk, type) (((type) == USRQUOTA) ? \ ++ qmblk->dq_uid_tree : \ ++ qmblk->dq_gid_tree) ++ ++#define VZDQUG_FIND_DONT_ALLOC 1 ++#define VZDQUG_FIND_FAKE 2 ++struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk, ++ unsigned int quota_id, int type, int flags); ++struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk, ++ unsigned int quota_id, int type, int flags); ++struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid); ++void vzquota_put_ugid(struct vz_quota_master *qmblk, ++ struct vz_quota_ugid *qugid); ++void vzquota_kill_ugid(struct vz_quota_master *qmblk); ++int vzquota_ugid_init(void); ++void vzquota_ugid_release(void); ++int vzquota_transfer_usage(struct inode *inode, int mask, ++ struct vz_quota_ilink *qlnk); ++void vzquota_inode_off(struct inode *inode); ++ ++long do_vzquotaugidctl(int cmd, unsigned int quota_id, ++ unsigned int ugid_index, unsigned int ugid_size, ++ void *addr, int compat); ++ ++/* ++ * Other VZ quota parts ++ */ ++extern struct dquot_operations vz_quota_operations; ++ ++long do_vzquotactl(int cmd, unsigned int quota_id, ++ struct vz_quota_stat __user *qstat, const char __user *ve_root, ++ int compat); ++int vzquota_proc_init(void); ++void vzquota_proc_release(void); ++struct vz_quota_master *vzquota_find_qmblk(struct super_block *); ++extern struct semaphore vz_quota_sem; ++ ++void vzaquota_init(void); ++void vzaquota_fini(void); ++ ++#endif /* __KERNEL__ */ ++ ++#endif /* _VZDQUOTA_H */ +diff --git a/include/linux/vzquota_qlnk.h b/include/linux/vzquota_qlnk.h +new file mode 100644 +index 0000000..2788c41 +--- /dev/null ++++ b/include/linux/vzquota_qlnk.h +@@ -0,0 +1,25 @@ ++/* ++ * include/linux/vzquota_qlnk.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef _VZDQUOTA_QLNK_H ++#define _VZDQUOTA_QLNK_H ++ ++struct vz_quota_master; ++struct vz_quota_ugid; ++ ++/* inode link, used to track inodes using quota via dq_ilink_list */ ++struct vz_quota_ilink { ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid *qugid[MAXQUOTAS]; ++ struct list_head list; ++ unsigned char origin[2]; ++}; ++ ++#endif /* _VZDQUOTA_QLNK_H */ +diff --git a/include/linux/vzratelimit.h b/include/linux/vzratelimit.h +new file mode 100644 +index 0000000..f26baad +--- /dev/null ++++ b/include/linux/vzratelimit.h +@@ -0,0 +1,28 @@ ++/* ++ * include/linux/vzratelimit.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __VZ_RATELIMIT_H__ ++#define __VZ_RATELIMIT_H__ ++ ++/* ++ * Generic ratelimiting stuff. ++ */ ++ ++struct vz_rate_info { ++ int burst; ++ int interval; /* jiffy_t per event */ ++ int bucket; /* kind of leaky bucket */ ++ unsigned long last; /* last event */ ++}; ++ ++/* Return true if rate limit permits. 
*/
++int vz_ratelimit(struct vz_rate_info *p);
++
++#endif /* __VZ_RATELIMIT_H__ */
+diff --git a/include/linux/vzstat.h b/include/linux/vzstat.h
+new file mode 100644
+index 0000000..5c23ea4
+--- /dev/null
++++ b/include/linux/vzstat.h
+@@ -0,0 +1,182 @@
++/*
++ * include/linux/vzstat.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __VZSTAT_H__
++#define __VZSTAT_H__
++
++struct swap_cache_info_struct {
++ unsigned long add_total;
++ unsigned long del_total;
++ unsigned long find_success;
++ unsigned long find_total;
++ unsigned long noent_race;
++ unsigned long exist_race;
++ unsigned long remove_race;
++};
++
++struct kstat_lat_snap_struct {
++ cycles_t maxlat, totlat;
++ unsigned long count;
++};
++struct kstat_lat_pcpu_snap_struct {
++ cycles_t maxlat, totlat;
++ unsigned long count;
++ seqcount_t lock;
++} ____cacheline_aligned_in_smp;
++
++struct kstat_lat_struct {
++ struct kstat_lat_snap_struct cur, last;
++ cycles_t avg[3];
++};
++struct kstat_lat_pcpu_struct {
++ struct kstat_lat_pcpu_snap_struct cur[NR_CPUS];
++ cycles_t max_snap;
++ struct kstat_lat_snap_struct last;
++ cycles_t avg[3];
++};
++
++struct kstat_perf_snap_struct {
++ cycles_t wall_tottime, cpu_tottime;
++ cycles_t wall_maxdur, cpu_maxdur;
++ unsigned long count;
++};
++struct kstat_perf_struct {
++ struct kstat_perf_snap_struct cur, last;
++};
++
++struct kstat_zone_avg {
++ unsigned long free_pages_avg[3],
++ nr_active_avg[3],
++ nr_inactive_avg[3];
++};
++
++#define KSTAT_ALLOCSTAT_NR 5
++
++struct kernel_stat_glob {
++ unsigned long nr_unint_avg[3];
++
++ unsigned long alloc_fails[KSTAT_ALLOCSTAT_NR];
++ struct kstat_lat_struct alloc_lat[KSTAT_ALLOCSTAT_NR];
++ struct kstat_lat_pcpu_struct sched_lat;
++ struct kstat_lat_struct swap_in;
++
++ struct kstat_perf_struct ttfp, cache_reap,
++ refill_inact, shrink_icache, shrink_dcache;
++
++ struct kstat_zone_avg zone_avg[3]; /* MAX_NR_ZONES */
++} ____cacheline_aligned;
++
++extern struct kernel_stat_glob kstat_glob ____cacheline_aligned;
++extern spinlock_t kstat_glb_lock;
++
++#ifdef CONFIG_VE
++#define KSTAT_PERF_ENTER(name) \
++ unsigned long flags; \
++ cycles_t start, sleep_time; \
++ \
++ start = get_cycles(); \
++ sleep_time = VE_TASK_INFO(current)->sleep_time; \
++
++#define KSTAT_PERF_LEAVE(name) \
++ spin_lock_irqsave(&kstat_glb_lock, flags); \
++ kstat_glob.name.cur.count++; \
++ start = get_cycles() - start; \
++ if (kstat_glob.name.cur.wall_maxdur < start) \
++ kstat_glob.name.cur.wall_maxdur = start;\
++ kstat_glob.name.cur.wall_tottime += start; \
++ start -= VE_TASK_INFO(current)->sleep_time - \
++ sleep_time; \
++ if (kstat_glob.name.cur.cpu_maxdur < start) \
++ kstat_glob.name.cur.cpu_maxdur = start; \
++ kstat_glob.name.cur.cpu_tottime += start; \
++ spin_unlock_irqrestore(&kstat_glb_lock, flags); \
++
++#else
++#define KSTAT_PERF_ENTER(name)
++#define KSTAT_PERF_LEAVE(name)
++#endif
++
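The seqcount in kstat_lat_pcpu_snap_struct lets hot paths record latencies without taking kstat_glb_lock: a writer touches only its own CPU's snapshot, and the periodic folding pass re-reads any snapshot it raced with. A usage sketch of the helpers defined just below (illustration only, not part of the patch; both function names are invented, and preemption is assumed to be disabled around smp_processor_id()):

static void record_sched_delay(cycles_t delay)
{
	/* lock-free fast path: writes only this CPU's snapshot,
	 * guarded by that snapshot's seqcount against readers */
	KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat, smp_processor_id(), delay);
}

/* called from a single place periodically, e.g. a once-per-second
 * worker; serializing concurrent folds is the caller's responsibility */
static void fold_sched_lat(void)
{
	KSTAT_LAT_PCPU_UPDATE(&kstat_glob.sched_lat);
}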
++/*
++ * Add another statistics reading.
++ * Serialization is the caller's responsibility.
++ */
++static inline void KSTAT_LAT_ADD(struct kstat_lat_struct *p,
++ cycles_t dur)
++{
++ p->cur.count++;
++ if (p->cur.maxlat < dur)
++ p->cur.maxlat = dur;
++ p->cur.totlat += dur;
++}
++
++static inline void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu,
++ cycles_t dur)
++{
++ struct kstat_lat_pcpu_snap_struct *cur;
++
++ cur = &p->cur[cpu];
++ write_seqcount_begin(&cur->lock);
++ cur->count++;
++ if (cur->maxlat < dur)
++ cur->maxlat = dur;
++ cur->totlat += dur;
++ write_seqcount_end(&cur->lock);
++}
++
++/*
++ * Move current statistics to last and reset the current maximum.
++ * Serialization is the caller's responsibility.
++ */
++static inline void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p)
++{
++ cycles_t m;
++ memcpy(&p->last, &p->cur, sizeof(p->last));
++ p->cur.maxlat = 0;
++ m = p->last.maxlat;
++ CALC_LOAD(p->avg[0], EXP_1, m)
++ CALC_LOAD(p->avg[1], EXP_5, m)
++ CALC_LOAD(p->avg[2], EXP_15, m)
++}
++
++static inline void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p)
++{
++ unsigned i, cpu;
++ struct kstat_lat_pcpu_snap_struct snap, *cur;
++ cycles_t m;
++
++ memset(&p->last, 0, sizeof(p->last));
++ for (cpu = 0; cpu < NR_CPUS; cpu++) {
++ cur = &p->cur[cpu];
++ do {
++ i = read_seqcount_begin(&cur->lock);
++ memcpy(&snap, cur, sizeof(snap));
++ } while (read_seqcount_retry(&cur->lock, i));
++ /*
++ * read above and this update of maxlat is not atomic,
++ * but this is OK, since it happens rarely and losing
++ * a couple of peaks is not essential. xemul
++ */
++ cur->maxlat = 0;
++
++ p->last.count += snap.count;
++ p->last.totlat += snap.totlat;
++ if (p->last.maxlat < snap.maxlat)
++ p->last.maxlat = snap.maxlat;
++ }
++
++ m = (p->last.maxlat > p->max_snap ? p->last.maxlat : p->max_snap);
++ CALC_LOAD(p->avg[0], EXP_1, m);
++ CALC_LOAD(p->avg[1], EXP_5, m);
++ CALC_LOAD(p->avg[2], EXP_15, m);
++ /* reset max_snap to calculate it correctly next time */
++ p->max_snap = 0;
++}
++
++#endif /* __VZSTAT_H__ */
+diff --git a/include/net/addrconf.h b/include/net/addrconf.h
+index bbd3d58..22e57e7 100644
+--- a/include/net/addrconf.h
++++ b/include/net/addrconf.h
+@@ -258,5 +258,9 @@ extern int if6_proc_init(void);
+ extern void if6_proc_exit(void);
+ #endif
+
++int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx,
++ unsigned int plen, __u8 ifa_flags, __u32 prefered_lft,
++ __u32 valid_lft);
++
+ #endif
+ #endif
+diff --git a/include/net/af_unix.h b/include/net/af_unix.h
+index 2dfa96b..10183b3 100644
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -9,6 +9,7 @@
+ extern void unix_inflight(struct file *fp);
+ extern void unix_notinflight(struct file *fp);
+ extern void unix_gc(void);
++extern void unix_destruct_fds(struct sk_buff *skb);
+
+ #define UNIX_HASH_SIZE 256
+
+diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
+index a5c6ccc..c2bb5ca 100644
+--- a/include/net/fib_rules.h
++++ b/include/net/fib_rules.h
+@@ -62,7 +62,7 @@ struct fib_rules_ops
+
+ /* Called after modifications to the rules set, must flush
+ * the route cache if one exists.
*/ +- void (*flush_cache)(void); ++ void (*flush_cache)(struct fib_rules_ops *ops); + + int nlgroup; + const struct nla_policy *policy; +diff --git a/include/net/flow.h b/include/net/flow.h +index ad16e00..bcf2002 100644 +--- a/include/net/flow.h ++++ b/include/net/flow.h +@@ -10,6 +10,7 @@ + #include + #include + ++struct ve_struct; + struct flowi { + int oif; + int iif; +@@ -76,6 +77,9 @@ struct flowi { + #define fl_icmp_code uli_u.icmpt.code + #define fl_ipsec_spi uli_u.spi + #define fl_mh_type uli_u.mht.type ++#ifdef CONFIG_VE ++ struct ve_struct *owner_env; ++#endif + __u32 secid; /* used by xfrm; see secid.txt */ + } __attribute__((__aligned__(BITS_PER_LONG/8))); + +diff --git a/include/net/icmp.h b/include/net/icmp.h +index dddb839..c0362a4 100644 +--- a/include/net/icmp.h ++++ b/include/net/icmp.h +@@ -31,15 +31,24 @@ struct icmp_err { + extern struct icmp_err icmp_err_convert[]; + DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics); + DECLARE_SNMP_STAT(struct icmpmsg_mib, icmpmsg_statistics); +-#define ICMP_INC_STATS(field) SNMP_INC_STATS(icmp_statistics, field) +-#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmp_statistics, field) +-#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmp_statistics, field) +-#define ICMPMSGOUT_INC_STATS(field) SNMP_INC_STATS(icmpmsg_statistics, field+256) +-#define ICMPMSGOUT_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmpmsg_statistics, field+256) +-#define ICMPMSGOUT_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmpmsg_statistics, field+256) +-#define ICMPMSGIN_INC_STATS(field) SNMP_INC_STATS(icmpmsg_statistics, field) +-#define ICMPMSGIN_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmpmsg_statistics, field) +-#define ICMPMSGIN_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmpmsg_statistics, field) ++ ++#if defined(CONFIG_VE) && defined(CONFIG_INET) ++#define ve_icmp_statistics (get_exec_env()->_icmp_statistics) ++#define ve_icmpmsg_statistics (get_exec_env()->_icmpmsg_statistics) ++#else ++#define ve_icmp_statistics icmp_statistics ++#define ve_icmpmsg_statistics icmpmsg_statistics ++#endif ++ ++#define ICMP_INC_STATS(field) SNMP_INC_STATS(ve_icmp_statistics, field) ++#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_icmp_statistics, field) ++#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_icmp_statistics, field) ++#define ICMPMSGOUT_INC_STATS(field) SNMP_INC_STATS(ve_icmpmsg_statistics, field+256) ++#define ICMPMSGOUT_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_icmpmsg_statistics, field+256) ++#define ICMPMSGOUT_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_icmpmsg_statistics, field+256) ++#define ICMPMSGIN_INC_STATS(field) SNMP_INC_STATS(ve_icmpmsg_statistics, field) ++#define ICMPMSGIN_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_icmpmsg_statistics, field) ++#define ICMPMSGIN_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_icmpmsg_statistics, field) + + struct dst_entry; + struct net_proto_family; +diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h +index 62a5b69..4b5375b 100644 +--- a/include/net/inet6_hashtables.h ++++ b/include/net/inet6_hashtables.h +@@ -29,9 +29,10 @@ struct inet_hashinfo; + + /* I have no idea if this is a good hash for v6 or not. 
-DaveM */ + static inline unsigned int inet6_ehashfn(const struct in6_addr *laddr, const u16 lport, +- const struct in6_addr *faddr, const __be16 fport) ++ const struct in6_addr *faddr, const __be16 fport, ++ const envid_t veid) + { +- u32 ports = (lport ^ (__force u16)fport); ++ u32 ports = (lport ^ (__force u16)fport) ^ (veid ^ (veid >> 16)); + + return jhash_3words((__force u32)laddr->s6_addr32[3], + (__force u32)faddr->s6_addr32[3], +@@ -46,7 +47,7 @@ static inline int inet6_sk_ehashfn(const struct sock *sk) + const struct in6_addr *faddr = &np->daddr; + const __u16 lport = inet->num; + const __be16 fport = inet->dport; +- return inet6_ehashfn(laddr, lport, faddr, fport); ++ return inet6_ehashfn(laddr, lport, faddr, fport, VEID(sk->owner_env)); + } + + extern void __inet6_hash(struct sock *sk); +diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h +index e081eef..7a554cc 100644 +--- a/include/net/inet_frag.h ++++ b/include/net/inet_frag.h +@@ -15,6 +15,9 @@ struct netns_frags { + struct inet_frag_queue { + struct hlist_node list; + struct netns_frags *net; ++#ifdef CONFIG_VE ++ struct ve_struct *owner_ve; ++#endif + struct list_head lru_list; /* lru list member */ + spinlock_t lock; + atomic_t refcnt; +diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h +index 735b926..30a2741 100644 +--- a/include/net/inet_hashtables.h ++++ b/include/net/inet_hashtables.h +@@ -74,6 +74,7 @@ struct inet_ehash_bucket { + * ports are created in O(1) time? I thought so. ;-) -DaveM + */ + struct inet_bind_bucket { ++ struct ve_struct *owner_env; + struct net *ib_net; + unsigned short port; + signed short fastreuse; +@@ -197,27 +198,29 @@ extern struct inet_bind_bucket * + inet_bind_bucket_create(struct kmem_cache *cachep, + struct net *net, + struct inet_bind_hashbucket *head, +- const unsigned short snum); ++ const unsigned short snum, ++ struct ve_struct *env); + extern void inet_bind_bucket_destroy(struct kmem_cache *cachep, + struct inet_bind_bucket *tb); + +-static inline int inet_bhashfn(const __u16 lport, const int bhash_size) ++static inline int inet_bhashfn(const __u16 lport, const int bhash_size, ++ unsigned veid) + { +- return lport & (bhash_size - 1); ++ return ((lport + (veid ^ (veid >> 16))) & (bhash_size - 1)); + } + + extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, + const unsigned short snum); + + /* These can have wildcards, don't try too hard. */ +-static inline int inet_lhashfn(const unsigned short num) ++static inline int inet_lhashfn(const unsigned short num, unsigned veid) + { +- return num & (INET_LHTABLE_SIZE - 1); ++ return ((num + (veid ^ (veid >> 16))) & (INET_LHTABLE_SIZE - 1)); + } + + static inline int inet_sk_listen_hashfn(const struct sock *sk) + { +- return inet_lhashfn(inet_sk(sk)->num); ++ return inet_lhashfn(inet_sk(sk)->num, VEID(sk->owner_env)); + } + + /* Caller must disable local BH processing. 
*/ +@@ -372,7 +375,8 @@ static inline struct sock *inet_lookup(struct net *net, + extern int __inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk, u32 port_offset, + int (*check_established)(struct inet_timewait_death_row *, +- struct sock *, __u16, struct inet_timewait_sock **), ++ struct sock *, __u16, struct inet_timewait_sock **, ++ struct ve_struct *), + void (*hash)(struct sock *sk)); + extern int inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk); +diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h +index 9fabe5b..1ea3392 100644 +--- a/include/net/inet_sock.h ++++ b/include/net/inet_sock.h +@@ -172,12 +172,13 @@ extern u32 inet_ehash_secret; + extern void build_ehash_secret(void); + + static inline unsigned int inet_ehashfn(const __be32 laddr, const __u16 lport, +- const __be32 faddr, const __be16 fport) ++ const __be32 faddr, const __be16 fport, ++ const envid_t veid) + { + return jhash_3words((__force __u32) laddr, + (__force __u32) faddr, + ((__u32) lport) << 16 | (__force __u32)fport, +- inet_ehash_secret); ++ inet_ehash_secret ^ (veid ^ (veid >> 16))); + } + + static inline int inet_sk_ehashfn(const struct sock *sk) +@@ -187,8 +188,9 @@ static inline int inet_sk_ehashfn(const struct sock *sk) + const __u16 lport = inet->num; + const __be32 faddr = inet->daddr; + const __be16 fport = inet->dport; ++ envid_t veid = VEID(sk->owner_env); + +- return inet_ehashfn(laddr, lport, faddr, fport); ++ return inet_ehashfn(laddr, lport, faddr, fport, veid); + } + + +diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h +index 95c660c..3102e7b 100644 +--- a/include/net/inet_timewait_sock.h ++++ b/include/net/inet_timewait_sock.h +@@ -81,6 +81,7 @@ struct inet_timewait_death_row { + struct inet_hashinfo *hashinfo; + int sysctl_tw_recycle; + int sysctl_max_tw_buckets; ++ int ub_managed; + }; + + extern void inet_twdr_hangman(unsigned long data); +@@ -134,6 +135,7 @@ struct inet_timewait_sock { + unsigned long tw_ttd; + struct inet_bind_bucket *tw_tb; + struct hlist_node tw_death_node; ++ envid_t tw_owner_env; + }; + + static inline void inet_twsk_add_node(struct inet_timewait_sock *tw, +diff --git a/include/net/ip.h b/include/net/ip.h +index 3b40bc2..3a8d5f4 100644 +--- a/include/net/ip.h ++++ b/include/net/ip.h +@@ -157,16 +157,31 @@ struct ipv4_config + + extern struct ipv4_config ipv4_config; + DECLARE_SNMP_STAT(struct ipstats_mib, ip_statistics); +-#define IP_INC_STATS(field) SNMP_INC_STATS(ip_statistics, field) +-#define IP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ip_statistics, field) +-#define IP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ip_statistics, field) +-#define IP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(ip_statistics, field, val) ++ ++#ifdef CONFIG_VE ++#define ve_ip_statistics (get_exec_env()->_ip_statistics) ++#else ++#define ve_ip_statistics ip_statistics ++#endif ++#define IP_INC_STATS(field) SNMP_INC_STATS(ve_ip_statistics, field) ++#define IP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_ip_statistics, field) ++#define IP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_ip_statistics, field) ++#define IP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(ve_ip_statistics, field, val) ++ + DECLARE_SNMP_STAT(struct linux_mib, net_statistics); +-#define NET_INC_STATS(field) SNMP_INC_STATS(net_statistics, field) +-#define NET_INC_STATS_BH(field) SNMP_INC_STATS_BH(net_statistics, field) +-#define NET_INC_STATS_USER(field) SNMP_INC_STATS_USER(net_statistics, field) +-#define NET_ADD_STATS_BH(field, 
adnd) SNMP_ADD_STATS_BH(net_statistics, field, adnd) +-#define NET_ADD_STATS_USER(field, adnd) SNMP_ADD_STATS_USER(net_statistics, field, adnd) ++#if defined(CONFIG_VE) && defined(CONFIG_INET) ++#define ve_net_statistics (get_exec_env()->_net_statistics) ++ ++extern int init_ipv4_mibs(void); ++extern void cleanup_ipv4_mibs(void); ++#else ++#define ve_net_statistics net_statistics ++#endif ++#define NET_INC_STATS(field) SNMP_INC_STATS(ve_net_statistics, field) ++#define NET_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_net_statistics, field) ++#define NET_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_net_statistics, field) ++#define NET_ADD_STATS_BH(field, adnd) SNMP_ADD_STATS_BH(ve_net_statistics, field, adnd) ++#define NET_ADD_STATS_USER(field, adnd) SNMP_ADD_STATS_USER(ve_net_statistics, field, adnd) + + extern unsigned long snmp_fold_field(void *mib[], int offt); + extern int snmp_mib_init(void *ptr[2], size_t mibsize); +diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h +index 7c5c0f7..6d549ac 100644 +--- a/include/net/ip6_fib.h ++++ b/include/net/ip6_fib.h +@@ -156,6 +156,7 @@ struct fib6_table { + u32 tb6_id; + rwlock_t tb6_lock; + struct fib6_node tb6_root; ++ struct ve_struct *owner_env; + }; + + #define RT6_TABLE_UNSPEC RT_TABLE_UNSPEC +diff --git a/include/net/ipv6.h b/include/net/ipv6.h +index f422f72..185a018 100644 +--- a/include/net/ipv6.h ++++ b/include/net/ipv6.h +@@ -117,7 +117,7 @@ extern struct ctl_path net_ipv6_ctl_path[]; + struct inet6_dev *_idev = (idev); \ + if (likely(_idev != NULL)) \ + SNMP_INC_STATS##modifier((_idev)->stats.statname, (field)); \ +- SNMP_INC_STATS##modifier(statname##_statistics, (field)); \ ++ SNMP_INC_STATS##modifier(ve_##statname##_statistics, (field)); \ + }) + + #define _DEVADD(statname, modifier, idev, field, val) \ +@@ -125,9 +125,22 @@ extern struct ctl_path net_ipv6_ctl_path[]; + struct inet6_dev *_idev = (idev); \ + if (likely(_idev != NULL)) \ + SNMP_ADD_STATS##modifier((_idev)->stats.statname, (field), (val)); \ +- SNMP_ADD_STATS##modifier(statname##_statistics, (field), (val));\ ++ SNMP_ADD_STATS##modifier(ve_##statname##_statistics, (field), (val));\ + }) + ++#ifdef CONFIG_VE ++#define ve_ipv6_statistics (get_exec_env()->_ipv6_statistics) ++#define ve_icmpv6_statistics (get_exec_env()->_icmpv6_statistics) ++#define ve_icmpv6msg_statistics (get_exec_env()->_icmpv6msg_statistics) ++ ++extern int init_ipv6_mibs(void); ++extern void cleanup_ipv6_mibs(void); ++#else ++#define ve_ipv6_statistics ipv6_statistics ++#define ve_icmpv6_statistics icmpv6_statistics ++#define ve_icmpv6msg_statistics icmpv6msg_statistics ++#endif ++ + /* MIBs */ + DECLARE_SNMP_STAT(struct ipstats_mib, ipv6_statistics); + +diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h +index d9dd0f7..eca50f8 100644 +--- a/include/net/net_namespace.h ++++ b/include/net/net_namespace.h +@@ -45,6 +45,13 @@ struct net { + struct hlist_head *dev_name_head; + struct hlist_head *dev_index_head; + ++ int ifindex; ++ ++#ifdef CONFIG_VE ++ struct completion *sysfs_completion; ++ struct ve_struct *owner_ve; ++#endif ++ + /* core fib_rules */ + struct list_head rules_ops; + spinlock_t rules_mod_lock; +diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +index 9bf0598..c94a355 100644 +--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h ++++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +@@ -18,8 +18,18 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4; + extern struct nf_conntrack_l4proto 
nf_conntrack_l4proto_udp4; + extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; + ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) + extern int nf_conntrack_ipv4_compat_init(void); + extern void nf_conntrack_ipv4_compat_fini(void); ++#else ++static inline int nf_conntrack_ipv4_compat_init(void) ++{ ++ return 0; ++} ++static inline void nf_conntrack_ipv4_compat_fini(void) ++{ ++} ++#endif + + extern void need_ipv4_conntrack(void); + +diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h +index 2dbd6c0..9563d32 100644 +--- a/include/net/netfilter/nf_conntrack.h ++++ b/include/net/netfilter/nf_conntrack.h +@@ -28,6 +28,10 @@ + + #include + ++#ifdef CONFIG_VE_IPTABLES ++#include ++#endif ++ + /* per conntrack: protocol private data */ + union nf_conntrack_proto { + /* insert conntrack proto private data here */ +@@ -131,6 +135,10 @@ struct nf_conn + struct nf_ct_ext *ext; + + struct rcu_head rcu; ++ ++#ifdef CONFIG_VE_IPTABLES ++ struct ve_struct *ct_owner_env; ++#endif + }; + + static inline struct nf_conn * +@@ -194,6 +202,11 @@ extern void nf_conntrack_hash_insert(struct nf_conn *ct); + + extern void nf_conntrack_flush(void); + ++struct nf_conntrack_helper * nf_ct_helper_find_get( const struct nf_conntrack_tuple *tuple); ++void nf_ct_helper_put(struct nf_conntrack_helper *helper); ++ ++struct nf_conntrack_helper * __nf_conntrack_helper_find_byname(const char *name); ++ + extern bool nf_ct_get_tuplepr(const struct sk_buff *skb, + unsigned int nhoff, u_int16_t l3num, + struct nf_conntrack_tuple *tuple); +@@ -239,7 +252,8 @@ nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data); + extern void nf_conntrack_free(struct nf_conn *ct); + extern struct nf_conn * + nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, +- const struct nf_conntrack_tuple *repl); ++ const struct nf_conntrack_tuple *repl, ++ struct user_beancounter *); + + /* It's confirmed if it is, or has been in the hash table. 
*/ + static inline int nf_ct_is_confirmed(struct nf_conn *ct) +@@ -262,6 +276,8 @@ extern unsigned int nf_conntrack_htable_size; + extern int nf_conntrack_checksum; + extern atomic_t nf_conntrack_count; + extern int nf_conntrack_max; ++extern int nf_conntrack_disable_ve0; ++extern int ip_conntrack_disable_ve0; + + DECLARE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); + #define NF_CT_STAT_INC(count) (__get_cpu_var(nf_conntrack_stat).count++) +diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h +index a817712..30831ef 100644 +--- a/include/net/netfilter/nf_conntrack_core.h ++++ b/include/net/netfilter/nf_conntrack_core.h +@@ -52,6 +52,42 @@ nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple); + + extern int __nf_conntrack_confirm(struct sk_buff *skb); + ++#if defined(CONFIG_VE_IPTABLES) ++#include ++#define ve_nf_conntrack_hash (get_exec_env()->_nf_conntrack->_nf_conntrack_hash) ++#define ve_nf_conntrack_vmalloc (get_exec_env()->_nf_conntrack->_nf_conntrack_vmalloc) ++#define ve_unconfirmed (get_exec_env()->_nf_conntrack->_unconfirmed) ++#else ++#define ve_nf_conntrack_hash nf_conntrack_hash ++#define ve_nf_conntrack_vmalloc nf_conntrack_vmalloc ++#define ve_unconfirmed unconfirmed ++#endif /* CONFIG_VE_IPTABLES */ ++ ++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) ++#define ve_nf_ct_sysctl_header \ ++ (get_exec_env()->_nf_conntrack->_nf_ct_sysctl_header) ++#define ve_nf_ct_sysctl_table \ ++ (get_exec_env()->_nf_conntrack->_nf_ct_sysctl_table) ++#define ve_nf_ct_netfilter_table \ ++ (get_exec_env()->_nf_conntrack->_nf_ct_netfilter_table) ++#define ve_nf_ct_net_table \ ++ (get_exec_env()->_nf_conntrack->_nf_ct_net_table) ++extern void nf_ct_proto_generic_sysctl_cleanup(void); ++extern int nf_ct_proto_generic_sysctl_init(void); ++#else ++#define ve_nf_ct_sysctl_header nf_ct_sysctl_header ++#define ve_nf_ct_sysctl_table nf_ct_sysctl_table ++#define ve_nf_ct_netfilter_table nf_ct_netfilter_table ++#define ve_nf_ct_net_table nf_ct_net_table ++static inline int nf_ct_proto_generic_sysctl_init(void) ++{ ++ return 0; ++} ++static inline void nf_ct_proto_generic_sysctl_cleanup(void) ++{ ++} ++#endif /* CONFIG_VE_IPTABLES */ ++ + /* Confirm a connection: returns NF_DROP if packet must be dropped. 
*/ + static inline int nf_conntrack_confirm(struct sk_buff *skb) + { +@@ -71,7 +107,9 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_l3proto *l3proto, + const struct nf_conntrack_l4proto *proto); + ++#ifndef CONFIG_VE_IPTABLES + extern struct hlist_head *nf_conntrack_hash; ++#endif + extern spinlock_t nf_conntrack_lock ; + extern struct hlist_head unconfirmed; + +diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h +index f0b9078..4bcf1bd 100644 +--- a/include/net/netfilter/nf_conntrack_ecache.h ++++ b/include/net/netfilter/nf_conntrack_ecache.h +@@ -34,6 +34,9 @@ nf_conntrack_event_cache(enum ip_conntrack_events event, + struct nf_conn *ct = (struct nf_conn *)skb->nfct; + struct nf_conntrack_ecache *ecache; + ++ if (!ve_is_super(get_exec_env())) ++ return; ++ + local_bh_disable(); + ecache = &__get_cpu_var(nf_conntrack_ecache); + if (ct != ecache->ct) +@@ -45,7 +48,7 @@ nf_conntrack_event_cache(enum ip_conntrack_events event, + static inline void nf_conntrack_event(enum ip_conntrack_events event, + struct nf_conn *ct) + { +- if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) ++ if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && ve_is_super(get_exec_env())) + atomic_notifier_call_chain(&nf_conntrack_chain, event, ct); + } + +@@ -57,7 +60,8 @@ static inline void + nf_ct_expect_event(enum ip_conntrack_expect_events event, + struct nf_conntrack_expect *exp) + { +- atomic_notifier_call_chain(&nf_ct_expect_chain, event, exp); ++ if (ve_is_super(get_exec_env())) ++ atomic_notifier_call_chain(&nf_ct_expect_chain, event, exp); + } + + #else /* CONFIG_NF_CONNTRACK_EVENTS */ +diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h +index dfdf4b4..4175cdf 100644 +--- a/include/net/netfilter/nf_conntrack_expect.h ++++ b/include/net/netfilter/nf_conntrack_expect.h +@@ -6,9 +6,17 @@ + #define _NF_CONNTRACK_EXPECT_H + #include + +-extern struct hlist_head *nf_ct_expect_hash; + extern unsigned int nf_ct_expect_hsize; + extern unsigned int nf_ct_expect_max; ++#ifdef CONFIG_VE_IPTABLES ++#include ++#define ve_nf_ct_expect_hash (get_exec_env()->_nf_conntrack->_nf_ct_expect_hash) ++#define ve_nf_ct_expect_max (get_exec_env()->_nf_conntrack->_nf_ct_expect_max) ++#else ++extern struct hlist_head *nf_ct_expect_hash; ++#define ve_nf_ct_expect_hash nf_ct_expect_hash ++#define ve_nf_ct_expect_max nf_ct_expect_max ++#endif + + struct nf_conntrack_expect + { +@@ -73,6 +81,8 @@ void nf_conntrack_expect_fini(void); + struct nf_conntrack_expect * + __nf_ct_expect_find(const struct nf_conntrack_tuple *tuple); + ++void nf_ct_expect_insert(struct nf_conntrack_expect *exp); ++ + struct nf_conntrack_expect * + nf_ct_expect_find_get(const struct nf_conntrack_tuple *tuple); + +diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h +index 0378676..6b1f720 100644 +--- a/include/net/netfilter/nf_conntrack_l3proto.h ++++ b/include/net/netfilter/nf_conntrack_l3proto.h +@@ -42,6 +42,9 @@ struct nf_conntrack_l3proto + int (*print_tuple)(struct seq_file *s, + const struct nf_conntrack_tuple *); + ++ /* Called when a conntrack entry is destroyed */ ++ void (*destroy)(struct nf_conn *conntrack); ++ + /* + * Called before tracking. + * *dataoff: offset of protocol header (TCP, UDP,...) 
in skb +@@ -67,6 +70,33 @@ struct nf_conntrack_l3proto + struct module *me; + }; + ++/* virtualization of l3 protocol's sysctl tables: */ ++#if defined(CONFIG_VE_IPTABLES) ++#include ++#define ve_nf_ct3 (get_exec_env()->_nf_conntrack) ++#endif ++ ++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) ++#define ve_nf_ct_l3protos ve_nf_ct3->_nf_ct_l3protos ++#define ve_nf_conntrack_l3proto_ipv4 (ve_nf_ct3->_nf_conntrack_l3proto_ipv4) ++#define ve_nf_conntrack_l3proto_ipv6 (ve_nf_ct3->_nf_conntrack_l3proto_ipv6) ++#define ve_nf_conntrack_max (ve_nf_ct3->_nf_conntrack_max) ++#define ve_nf_conntrack_count (ve_nf_ct3->_nf_conntrack_count) ++#define ve_nf_conntrack_checksum (ve_nf_ct3->_nf_conntrack_checksum) ++#else /* !CONFIG_VE_IPTABLES || !CONFIG_SYSCTL: */ ++#define ve_nf_ct_l3protos nf_ct_l3protos ++#define ve_nf_conntrack_l3proto_ipv4 &nf_conntrack_l3proto_ipv4 ++#define ve_nf_conntrack_l3proto_ipv6 &nf_conntrack_l3proto_ipv6 ++#define ve_nf_conntrack_max nf_conntrack_max ++#define ve_nf_conntrack_count nf_conntrack_count ++#define ve_nf_conntrack_checksum nf_conntrack_checksum ++#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ ++ ++extern int init_nf_ct_l3proto_ipv4(void); ++extern void fini_nf_ct_l3proto_ipv4(void); ++extern int init_nf_ct_l3proto_ipv6(void); ++extern void fini_nf_ct_l3proto_ipv6(void); ++ + extern struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX]; + + /* Protocol registration. */ +@@ -83,7 +113,11 @@ __nf_ct_l3proto_find(u_int16_t l3proto) + { + if (unlikely(l3proto >= AF_MAX)) + return &nf_conntrack_l3proto_generic; +- return rcu_dereference(nf_ct_l3protos[l3proto]); ++#ifdef CONFIG_VE_IPTABLES ++ if (!get_exec_env()->_nf_conntrack) ++ return &nf_conntrack_l3proto_generic; ++#endif ++ return rcu_dereference(ve_nf_ct_l3protos[l3proto]); + } + + #endif /*_NF_CONNTRACK_L3PROTO_H*/ +diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h +index 723df9d..43ecaf7 100644 +--- a/include/net/netfilter/nf_conntrack_l4proto.h ++++ b/include/net/netfilter/nf_conntrack_l4proto.h +@@ -97,6 +97,7 @@ extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6; + extern struct nf_conntrack_l4proto nf_conntrack_l4proto_generic; + + #define MAX_NF_CT_PROTO 256 ++extern struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX]; + + extern struct nf_conntrack_l4proto * + __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto); +@@ -117,16 +118,142 @@ extern int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], + struct nf_conntrack_tuple *t); + extern const struct nla_policy nf_ct_port_nla_policy[]; + ++#ifdef CONFIG_SYSCTL + /* Log invalid packets */ + extern unsigned int nf_ct_log_invalid; ++#endif ++ ++#ifdef CONFIG_VE_IPTABLES ++#include ++#define ve_nf_ct4 (get_exec_env()->_nf_conntrack) ++#endif ++ ++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) ++ ++#define ve_nf_ct_protos (ve_nf_ct4->_nf_ct_protos) ++#define ve_nf_conntrack_l4proto_icmp (ve_nf_ct4->_nf_conntrack_l4proto_icmp) ++#define ve_nf_conntrack_l4proto_icmpv6 \ ++ (ve_nf_ct4->_nf_conntrack_l4proto_icmpv6) ++#define ve_nf_conntrack_l4proto_tcp4 (ve_nf_ct4->_nf_conntrack_l4proto_tcp4) ++#define ve_nf_conntrack_l4proto_tcp6 (ve_nf_ct4->_nf_conntrack_l4proto_tcp6) ++#define ve_nf_conntrack_l4proto_udp4 (ve_nf_ct4->_nf_conntrack_l4proto_udp4) ++#define ve_nf_conntrack_l4proto_udp6 (ve_nf_ct4->_nf_conntrack_l4proto_udp6) ++#define ve_nf_conntrack_l4proto_generic \ ++ (ve_nf_ct4->_nf_conntrack_l4proto_generic) ++#define ve_nf_ct_log_invalid 
(ve_nf_ct4->_nf_ct_log_invalid) ++/* TCP: */ ++#define ve_nf_ct_tcp_timeouts (ve_nf_ct4->_nf_ct_tcp_timeouts) ++#define ve_nf_ct_tcp_timeout_max_retrans \ ++ (ve_nf_ct4->_nf_ct_tcp_timeout_max_retrans) ++#define ve_nf_ct_tcp_max_retrans (ve_nf_ct4->_nf_ct_tcp_max_retrans) ++#define ve_nf_ct_tcp_loose (ve_nf_ct4->_nf_ct_tcp_loose) ++#define ve_nf_ct_tcp_be_liberal (ve_nf_ct4->_nf_ct_tcp_be_liberal) ++#define ve_tcp_sysctl_table_users (ve_nf_ct4->_tcp_sysctl_table_users) ++#define ve_tcp_sysctl_header (ve_nf_ct4->_tcp_sysctl_header) ++#define ve_tcp_compat_sysctl_header (ve_nf_ct4->_tcp_compat_sysctl_header) ++/* UDP: */ ++#define ve_nf_ct_udp_timeout (ve_nf_ct4->_nf_ct_udp_timeout) ++#define ve_nf_ct_udp_timeout_stream (ve_nf_ct4->_nf_ct_udp_timeout_stream) ++#define ve_udp_sysctl_table_users (ve_nf_ct4->_udp_sysctl_table_users) ++#define ve_udp_sysctl_header (ve_nf_ct4->_udp_sysctl_header) ++#define ve_udp_compat_sysctl_header (ve_nf_ct4->_udp_compat_sysctl_header) ++/* ICMP: */ ++#define ve_nf_ct_icmp_timeout (ve_nf_ct4->_nf_ct_icmp_timeout) ++#define ve_icmp_sysctl_header (ve_nf_ct4->_icmp_sysctl_header) ++#define ve_icmp_compat_sysctl_header (ve_nf_ct4->_icmp_compat_sysctl_header) ++/* ICMPV6: */ ++#define ve_nf_ct_icmpv6_timeout (ve_nf_ct4->_nf_ct_icmpv6_timeout) ++#define ve_icmpv6_sysctl_header (ve_nf_ct4->_icmpv6_sysctl_header) ++/* GENERIC: */ ++#define ve_nf_ct_generic_timeout (ve_nf_ct4->_nf_ct_generic_timeout) ++#define ve_generic_sysctl_header (ve_nf_ct4->_generic_sysctl_header) ++#define ve_generic_compat_sysctl_header (ve_nf_ct4->_generic_compat_sysctl_header) ++ ++extern void nf_ct_proto_icmp_sysctl_cleanup(void); ++extern int nf_ct_proto_icmp_sysctl_init(void); ++extern void nf_ct_proto_icmpv6_sysctl_cleanup(void); ++extern int nf_ct_proto_icmpv6_sysctl_init(void); ++extern void nf_ct_proto_tcp_sysctl_cleanup(void); ++extern int nf_ct_proto_tcp_sysctl_init(void); ++extern void nf_ct_proto_udp_sysctl_cleanup(void); ++extern int nf_ct_proto_udp_sysctl_init(void); ++ ++#else /* !CONFIG_VE_IPTABLES || !CONFIG_SYSCTL: */ ++ ++#define ve_nf_ct_protos nf_ct_protos ++#define ve_nf_conntrack_l4proto_icmp &nf_conntrack_l4proto_icmp ++#define ve_nf_conntrack_l4proto_icmpv6 &nf_conntrack_l4proto_icmpv6 ++#define ve_nf_conntrack_l4proto_tcp4 &nf_conntrack_l4proto_tcp4 ++#define ve_nf_conntrack_l4proto_tcp6 &nf_conntrack_l4proto_tcp6 ++#define ve_nf_conntrack_l4proto_udp4 &nf_conntrack_l4proto_udp4 ++#define ve_nf_conntrack_l4proto_udp6 &nf_conntrack_l4proto_udp6 ++#define ve_nf_conntrack_l4proto_generic &nf_conntrack_l4proto_generic ++ ++#if defined(CONFIG_SYSCTL) ++ ++#define ve_nf_ct_log_invalid nf_ct_log_invalid ++/* TCP: */ ++#define ve_nf_ct_tcp_timeouts *tcp_timeouts ++#define ve_nf_ct_tcp_timeout_max_retrans \ ++ nf_ct_tcp_timeout_max_retrans ++#define ve_nf_ct_tcp_max_retrans nf_ct_tcp_max_retrans ++#define ve_nf_ct_tcp_loose nf_ct_tcp_loose ++#define ve_nf_ct_tcp_be_liberal nf_ct_tcp_be_liberal ++#define ve_tcp_sysctl_table_users tcp_sysctl_table_users ++#define ve_tcp_sysctl_header tcp_sysctl_header ++/* UDP:*/ ++#define ve_nf_ct_udp_timeout nf_ct_udp_timeout ++#define ve_nf_ct_udp_timeout_stream nf_ct_udp_timeout_stream ++#define ve_udp_sysctl_table_users udp_sysctl_table_users ++#define ve_udp_sysctl_header udp_sysctl_header ++/* ICMP: */ ++#define ve_nf_ct_icmp_timeout nf_ct_icmp_timeout ++#define ve_icmp_sysctl_header icmp_sysctl_header ++/* ICMPV6: */ ++#define ve_nf_ct_icmpv6_timeout nf_ct_icmpv6_timeout ++#define ve_icmpv6_sysctl_header icmpv6_sysctl_header ++/* 
GENERIC: */ ++#define ve_nf_ct_generic_timeout nf_ct_generic_timeout ++#define ve_generic_sysctl_header generic_sysctl_header ++#endif /* CONFIG_SYSCTL */ ++ ++static inline int nf_ct_proto_icmp_sysctl_init(void) ++{ ++ return 0; ++} ++static inline void nf_ct_proto_icmp_sysctl_cleanup(void) ++{ ++} ++static inline int nf_ct_proto_tcp_sysctl_init(void) ++{ ++ return 0; ++} ++static inline void nf_ct_proto_tcp_sysctl_cleanup(void) ++{ ++} ++static inline int nf_ct_proto_udp_sysctl_init(void) ++{ ++ return 0; ++} ++static inline void nf_ct_proto_udp_sysctl_cleanup(void) ++{ ++} ++static inline int nf_ct_proto_icmpv6_sysctl_init(void) ++{ ++ return 0; ++} ++static inline void nf_ct_proto_icmpv6_sysctl_cleanup(void) ++{ ++} ++#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ + + #ifdef CONFIG_SYSCTL + #ifdef DEBUG_INVALID_PACKETS + #define LOG_INVALID(proto) \ +- (nf_ct_log_invalid == (proto) || nf_ct_log_invalid == IPPROTO_RAW) ++ (ve_nf_ct_log_invalid == (proto) || ve_nf_ct_log_invalid == IPPROTO_RAW) + #else + #define LOG_INVALID(proto) \ +- ((nf_ct_log_invalid == (proto) || nf_ct_log_invalid == IPPROTO_RAW) \ ++ ((ve_nf_ct_log_invalid == (proto) || ve_nf_ct_log_invalid == IPPROTO_RAW) \ + && net_ratelimit()) + #endif + #else +diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h +index 9dc1039..bfa9069 100644 +--- a/include/net/netfilter/nf_nat.h ++++ b/include/net/netfilter/nf_nat.h +@@ -77,6 +77,8 @@ struct nf_conn_nat + #endif + }; + ++void nf_nat_hash_conntrack(struct nf_conn *ct); ++ + /* Set up the info structure to map into this range. */ + extern unsigned int nf_nat_setup_info(struct nf_conn *ct, + const struct nf_nat_range *range, +@@ -85,6 +87,7 @@ extern unsigned int nf_nat_setup_info(struct nf_conn *ct, + /* Is this tuple already taken? 
(not by us)*/ + extern int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, + const struct nf_conn *ignored_conntrack); ++extern void ip_nat_hash_conntrack(struct nf_conn *ct); + + static inline struct nf_conn_nat *nfct_nat(const struct nf_conn *ct) + { +diff --git a/include/net/netfilter/nf_nat_rule.h b/include/net/netfilter/nf_nat_rule.h +index e4a18ae..8bb00da 100644 +--- a/include/net/netfilter/nf_nat_rule.h ++++ b/include/net/netfilter/nf_nat_rule.h +@@ -4,7 +4,7 @@ + #include + #include + +-extern int nf_nat_rule_init(void) __init; ++extern int nf_nat_rule_init(void); + extern void nf_nat_rule_cleanup(void); + extern int nf_nat_rule_find(struct sk_buff *skb, + unsigned int hooknum, +diff --git a/include/net/netlink_sock.h b/include/net/netlink_sock.h +new file mode 100644 +index 0000000..ce4701a +--- /dev/null ++++ b/include/net/netlink_sock.h +@@ -0,0 +1,23 @@ ++#ifndef __NET_NETLINK_SOCK_H ++#define __NET_NETLINK_SOCK_H ++ ++struct netlink_sock { ++ /* struct sock has to be the first member of netlink_sock */ ++ struct sock sk; ++ u32 pid; ++ u32 dst_pid; ++ u32 dst_group; ++ u32 flags; ++ u32 subscriptions; ++ u32 ngroups; ++ unsigned long *groups; ++ unsigned long state; ++ wait_queue_head_t wait; ++ struct netlink_callback *cb; ++ struct mutex *cb_mutex; ++ struct mutex cb_def_mutex; ++ void (*netlink_rcv)(struct sk_buff *skb); ++ struct module *module; ++}; ++ ++#endif /* __NET_NETLINK_SOCK_H */ +diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h +index 34ee348..d8588d5 100644 +--- a/include/net/netns/ipv4.h ++++ b/include/net/netns/ipv4.h +@@ -18,6 +18,7 @@ struct netns_ipv4 { + struct ctl_table_header *forw_hdr; + struct ctl_table_header *frags_hdr; + struct ctl_table_header *ipv4_hdr; ++ struct ctl_table_header *route_hdr; + #endif + struct ipv4_devconf *devconf_all; + struct ipv4_devconf *devconf_dflt; +@@ -44,5 +45,8 @@ struct netns_ipv4 { + int sysctl_icmp_ratelimit; + int sysctl_icmp_ratemask; + int sysctl_icmp_errors_use_inbound_ifaddr; ++ ++ struct timer_list rt_secret_timer; ++ atomic_t rt_genid; + }; + #endif +diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h +index ac053be..c368713 100644 +--- a/include/net/netns/ipv6.h ++++ b/include/net/netns/ipv6.h +@@ -13,6 +13,7 @@ struct netns_sysctl_ipv6 { + #ifdef CONFIG_SYSCTL + struct ctl_table_header *table; + struct ctl_table_header *frags_hdr; ++ struct ctl_table_header *nf_frags_hdr; + #endif + int bindv6only; + int flush_delay; +@@ -31,6 +32,7 @@ struct netns_ipv6 { + struct ipv6_devconf *devconf_all; + struct ipv6_devconf *devconf_dflt; + struct netns_frags frags; ++ struct netns_frags ct_frags; + #ifdef CONFIG_NETFILTER + struct xt_table *ip6table_filter; + struct xt_table *ip6table_mangle; +@@ -54,5 +56,7 @@ struct netns_ipv6 { + struct sock *ndisc_sk; + struct sock *tcp_sk; + struct sock *igmp_sk; ++ ++ struct proc_dir_entry *proc_dev_snmp; + }; + #endif +diff --git a/include/net/route.h b/include/net/route.h +index fc836ff..2ed2c29 100644 +--- a/include/net/route.h ++++ b/include/net/route.h +@@ -111,7 +111,7 @@ struct in_device; + extern int ip_rt_init(void); + extern void ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw, + __be32 src, struct net_device *dev); +-extern void rt_cache_flush(int how); ++extern void rt_cache_flush(struct net *net, int how); + extern int __ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp); + extern int ip_route_output_key(struct net *, struct rtable **, struct flowi *flp); + extern int ip_route_output_flow(struct 
net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags); +@@ -138,6 +138,7 @@ static inline void ip_rt_put(struct rtable * rt) + #define IPTOS_RT_MASK (IPTOS_TOS_MASK & ~3) + + extern const __u8 ip_tos2prio[16]; ++extern int ip_rt_src_check; + + static inline char rt_tos2priority(u8 tos) + { +diff --git a/include/net/sock.h b/include/net/sock.h +index dc42b44..873caf6 100644 +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -57,6 +57,8 @@ + #include + #include + ++#include ++ + /* + * This structure really needs to be cleaned up. + * Most of it is for TCP, and not used by any of +@@ -279,6 +281,8 @@ struct sock { + int (*sk_backlog_rcv)(struct sock *sk, + struct sk_buff *skb); + void (*sk_destruct)(struct sock *sk); ++ struct sock_beancounter sk_bc; ++ struct ve_struct *owner_env; + }; + + /* +@@ -495,6 +499,8 @@ static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb) + }) + + extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p); ++extern int __sk_stream_wait_memory(struct sock *sk, long *timeo_p, ++ unsigned long amount); + extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); + extern void sk_stream_wait_close(struct sock *sk, long timeo_p); + extern int sk_stream_error(struct sock *sk, int flags, int err); +@@ -729,7 +735,8 @@ static inline int sk_has_account(struct sock *sk) + return !!sk->sk_prot->memory_allocated; + } + +-static inline int sk_wmem_schedule(struct sock *sk, int size) ++static inline int sk_wmem_schedule(struct sock *sk, int size, ++ struct sk_buff *skb) + { + if (!sk_has_account(sk)) + return 1; +@@ -737,12 +744,15 @@ static inline int sk_wmem_schedule(struct sock *sk, int size) + __sk_mem_schedule(sk, size, SK_MEM_SEND); + } + +-static inline int sk_rmem_schedule(struct sock *sk, int size) ++static inline int sk_rmem_schedule(struct sock *sk, struct sk_buff *skb) + { + if (!sk_has_account(sk)) + return 1; +- return size <= sk->sk_forward_alloc || +- __sk_mem_schedule(sk, size, SK_MEM_RECV); ++ if (!(skb->truesize <= sk->sk_forward_alloc || ++ __sk_mem_schedule(sk, skb->truesize, SK_MEM_RECV))) ++ return 0; ++ ++ return !ub_sockrcvbuf_charge(sk, skb); + } + + static inline void sk_mem_reclaim(struct sock *sk) +@@ -862,6 +872,11 @@ extern struct sk_buff *sock_alloc_send_skb(struct sock *sk, + unsigned long size, + int noblock, + int *errcode); ++extern struct sk_buff *sock_alloc_send_skb2(struct sock *sk, ++ unsigned long size, ++ unsigned long size2, ++ int noblock, ++ int *errcode); + extern void *sock_kmalloc(struct sock *sk, int size, + gfp_t priority); + extern void sock_kfree_s(struct sock *sk, void *mem, int size); +@@ -1119,6 +1134,7 @@ static inline int skb_copy_to_page(struct sock *sk, char __user *from, + + static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) + { ++ WARN_ON(skb->destructor); + sock_hold(sk); + skb->sk = sk; + skb->destructor = sock_wfree; +@@ -1127,6 +1143,7 @@ static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) + + static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) + { ++ WARN_ON(skb->destructor); + skb->sk = sk; + skb->destructor = sock_rfree; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); +diff --git a/include/net/tcp.h b/include/net/tcp.h +index cf54034..4fa0bca 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -43,6 +43,13 @@ + #include + + #include ++#include ++ ++#define TCP_PAGE(sk) (sk->sk_sndmsg_page) ++#define TCP_OFF(sk) (sk->sk_sndmsg_off) ++ ++#define TW_WSCALE_MASK 0x0f ++#define 
TW_WSCALE_SPEC 0x10 + + extern struct inet_hashinfo tcp_hashinfo; + +@@ -219,7 +226,9 @@ extern int sysctl_tcp_mem[3]; + extern int sysctl_tcp_wmem[3]; + extern int sysctl_tcp_rmem[3]; + extern int sysctl_tcp_app_win; ++#ifndef sysctl_tcp_adv_win_scale + extern int sysctl_tcp_adv_win_scale; ++#endif + extern int sysctl_tcp_tw_reuse; + extern int sysctl_tcp_frto; + extern int sysctl_tcp_frto_response; +@@ -234,6 +243,10 @@ extern int sysctl_tcp_base_mss; + extern int sysctl_tcp_workaround_signed_windows; + extern int sysctl_tcp_slow_start_after_idle; + extern int sysctl_tcp_max_ssthresh; ++extern int sysctl_tcp_use_sg; ++extern int sysctl_tcp_max_tw_kmem_fraction; ++extern int sysctl_tcp_max_tw_buckets_ub; ++ + + extern atomic_t tcp_memory_allocated; + extern atomic_t tcp_sockets_allocated; +@@ -266,12 +279,17 @@ static inline int tcp_too_many_orphans(struct sock *sk, int num) + extern struct proto tcp_prot; + + DECLARE_SNMP_STAT(struct tcp_mib, tcp_statistics); +-#define TCP_INC_STATS(field) SNMP_INC_STATS(tcp_statistics, field) +-#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(tcp_statistics, field) +-#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(tcp_statistics, field) +-#define TCP_DEC_STATS(field) SNMP_DEC_STATS(tcp_statistics, field) +-#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(tcp_statistics, field, val) +-#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(tcp_statistics, field, val) ++#if defined(CONFIG_VE) && defined(CONFIG_INET) ++#define ve_tcp_statistics (get_exec_env()->_tcp_statistics) ++#else ++#define ve_tcp_statistics tcp_statistics ++#endif ++#define TCP_INC_STATS(field) SNMP_INC_STATS(ve_tcp_statistics, field) ++#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_tcp_statistics, field) ++#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_tcp_statistics, field) ++#define TCP_DEC_STATS(field) SNMP_DEC_STATS(ve_tcp_statistics, field) ++#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(ve_tcp_statistics, field, val) ++#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(ve_tcp_statistics, field, val) + + extern void tcp_v4_err(struct sk_buff *skb, u32); + +@@ -545,7 +563,11 @@ extern u32 __tcp_select_window(struct sock *sk); + * to use only the low 32-bits of jiffies and hide the ugly + * casts with the following macro. 
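++ *
++ * (With CONFIG_VE, the variant below additionally adds the
++ * per-container jiffies_fixup offset, so each VE carries its own
++ * timestamp base.)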
+ */ ++#ifdef CONFIG_VE ++#define tcp_time_stamp ((__u32)(jiffies + get_exec_env()->jiffies_fixup)) ++#else + #define tcp_time_stamp ((__u32)(jiffies)) ++#endif + + /* This is what the send packet queuing engine uses to pass + * TCP per-packet control information to the transmission +diff --git a/include/net/udp.h b/include/net/udp.h +index ccce837..62d3396 100644 +--- a/include/net/udp.h ++++ b/include/net/udp.h +@@ -148,6 +148,11 @@ extern int udp_lib_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, int optlen, + int (*push_pending_frames)(struct sock *)); + ++static inline int udp_hashfn(u16 num, unsigned veid) ++{ ++ return ((num + (veid ^ (veid >> 16))) & (UDP_HTABLE_SIZE - 1)); ++} ++ + DECLARE_SNMP_STAT(struct udp_mib, udp_statistics); + DECLARE_SNMP_STAT(struct udp_mib, udp_stats_in6); + +@@ -158,19 +163,31 @@ DECLARE_SNMP_STAT(struct udp_mib, udplite_stats_in6); + /* + * SNMP statistics for UDP and UDP-Lite + */ ++#ifdef CONFIG_VE ++#define ve_udp_statistics (get_exec_env()->_udp_statistics) ++#define ve_udplite_statistics (get_exec_env()->_udplite_statistics) ++#define ve_udp_stats_in6 (get_exec_env()->_udp_stats_in6) ++#define ve_udplite_stats_in6 (get_exec_env()->_udplite_stats_in6) ++#else ++#define ve_udp_statistics udp_statistics ++#define ve_udplite_statistics udplite_statistics ++#define ve_udp_stats_in6 udp_stats_in6 ++#define ve_udplite_stats_in6 udplite_stats_in6 ++#endif ++ + #define UDP_INC_STATS_USER(field, is_udplite) do { \ +- if (is_udplite) SNMP_INC_STATS_USER(udplite_statistics, field); \ +- else SNMP_INC_STATS_USER(udp_statistics, field); } while(0) ++ if (is_udplite) SNMP_INC_STATS_USER(ve_udplite_statistics, field); \ ++ else SNMP_INC_STATS_USER(ve_udp_statistics, field); } while(0) + #define UDP_INC_STATS_BH(field, is_udplite) do { \ +- if (is_udplite) SNMP_INC_STATS_BH(udplite_statistics, field); \ +- else SNMP_INC_STATS_BH(udp_statistics, field); } while(0) ++ if (is_udplite) SNMP_INC_STATS_BH(ve_udplite_statistics, field); \ ++ else SNMP_INC_STATS_BH(ve_udp_statistics, field); } while(0) + + #define UDP6_INC_STATS_BH(field, is_udplite) do { \ +- if (is_udplite) SNMP_INC_STATS_BH(udplite_stats_in6, field); \ +- else SNMP_INC_STATS_BH(udp_stats_in6, field); } while(0) ++ if (is_udplite) SNMP_INC_STATS_BH(ve_udplite_stats_in6, field); \ ++ else SNMP_INC_STATS_BH(ve_udp_stats_in6, field); } while(0) + #define UDP6_INC_STATS_USER(field, is_udplite) do { \ +- if (is_udplite) SNMP_INC_STATS_USER(udplite_stats_in6, field); \ +- else SNMP_INC_STATS_USER(udp_stats_in6, field); } while(0) ++ if (is_udplite) SNMP_INC_STATS_USER(ve_udplite_stats_in6, field); \ ++ else SNMP_INC_STATS_USER(ve_udp_stats_in6, field); } while(0) + + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + #define UDPX_INC_STATS_BH(sk, field) \ +diff --git a/init/Kconfig b/init/Kconfig +index 6199d11..d0807fe 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -208,7 +208,7 @@ config TASK_XACCT + + config TASK_IO_ACCOUNTING + bool "Enable per-task storage I/O accounting (EXPERIMENTAL)" +- depends on TASK_XACCT ++ depends on TASK_XACCT && BEANCOUNTERS + help + Collect information on the number of bytes of storage I/O which this + task has caused. 
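The udp_hashfn() helper introduced in the include/net/udp.h hunk above folds the VE id into the UDP hash-table index, so the same port number bound in different containers tends to land in different hash buckets. A minimal standalone sketch of that mixing (the userspace harness is ours, and UDP_HTABLE_SIZE is assumed to be 128, its value in kernels of this era):

    #include <stdio.h>

    #define UDP_HTABLE_SIZE 128   /* assumed; matches kernels of this era */

    /* same arithmetic as the kernel helper above */
    static unsigned int udp_hashfn(unsigned short num, unsigned int veid)
    {
            return (num + (veid ^ (veid >> 16))) & (UDP_HTABLE_SIZE - 1);
    }

    int main(void)
    {
            unsigned int veid;

            /* port 53 hashes to a different bucket in each container */
            for (veid = 0; veid < 4; veid++)
                    printf("veid %u: port 53 -> bucket %u\n",
                           veid, udp_hashfn(53, veid));
            return 0;
    }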
+@@ -292,7 +292,7 @@ config CGROUP_DEBUG + + config CGROUP_NS + bool "Namespace cgroup subsystem" +- depends on CGROUPS ++ depends on CGROUPS && !VE + help + Provides a simple namespace cgroup subsystem to + provide hierarchical naming of sets of namespaces, +@@ -308,7 +308,7 @@ config CGROUP_DEVICE + + config CPUSETS + bool "Cpuset support" +- depends on SMP && CGROUPS ++ depends on SMP && CGROUPS && !VE + help + This option will let you create and manage CPUSETs which + allow dynamically partitioning a system into sets of CPUs and +@@ -352,17 +352,18 @@ config RT_GROUP_SCHED + choice + depends on GROUP_SCHED + prompt "Basis for grouping tasks" +- default USER_SCHED ++ default VZ_FAIRSCHED + + config USER_SCHED + bool "user id" ++ depends on !VE + help + This option will choose userid as the basis for grouping + tasks, thus providing equal CPU bandwidth to each user. + + config CGROUP_SCHED + bool "Control groups" +- depends on CGROUPS ++ depends on CGROUPS && !VE + help + This option allows you to create arbitrary task groups + using the "cgroup" pseudo filesystem and control +@@ -370,6 +371,12 @@ config CGROUP_SCHED + Refer to Documentation/cgroups.txt for more information + on "cgroup" pseudo filesystem. + ++config VZ_FAIRSCHED ++ bool "OpenVZ groups" ++ help ++ This option add customizable task groups with OpenVZ compatible ++ syscall and procfs interface. ++ + endchoice + + config CGROUP_CPUACCT +diff --git a/init/calibrate.c b/init/calibrate.c +index ecb3822..474a8ca 100644 +--- a/init/calibrate.c ++++ b/init/calibrate.c +@@ -8,6 +8,7 @@ + #include + #include + #include ++#include + + unsigned long preset_lpj; + static int __init lpj_setup(char *str) +@@ -104,6 +105,60 @@ static unsigned long __cpuinit calibrate_delay_direct(void) + static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;} + #endif + ++unsigned long cycles_per_jiffy, cycles_per_clock; ++ ++static __devinit void calibrate_cycles(void) ++{ ++ unsigned long ticks; ++ cycles_t time; ++ ++ ticks = jiffies; ++ while (ticks == jiffies) ++ /* nothing */; ++ time = get_cycles(); ++ ticks = jiffies; ++ while (ticks == jiffies) ++ /* nothing */; ++ ++ time = get_cycles() - time; ++ cycles_per_jiffy = time; ++ if ((time >> 32) != 0) { ++ printk("CPU too fast! timings are incorrect\n"); ++ cycles_per_jiffy = -1; ++ } ++} ++ ++EXPORT_SYMBOL(cycles_per_jiffy); ++EXPORT_SYMBOL(cycles_per_clock); ++ ++static __devinit void calc_cycles_per_jiffy(void) ++{ ++#if 0 ++ extern unsigned long fast_gettimeoffset_quotient; ++ unsigned long low, high; ++ ++ if (fast_gettimeoffset_quotient != 0) { ++ __asm__("divl %2" ++ :"=a" (low), "=d" (high) ++ :"r" (fast_gettimeoffset_quotient), ++ "0" (0), "1" (1000000/HZ)); ++ ++ cycles_per_jiffy = low; ++ } ++#endif ++ if (cycles_per_jiffy == 0) ++ calibrate_cycles(); ++ ++ if (cycles_per_jiffy == 0) { ++ printk(KERN_WARNING "Cycles are stuck! " ++ "Some statistics will not be available."); ++ /* to prevent division by zero in cycles_to_(clocks|jiffies) */ ++ cycles_per_jiffy = 1; ++ cycles_per_clock = 1; ++ } else ++ cycles_per_clock = cycles_per_jiffy * (HZ / CLOCKS_PER_SEC); ++} ++ + /* + * This is the number of bits of precision for the loops_per_jiffy. Each + * bit takes on average 1.5/HZ seconds. 
This (like the original) is a little +@@ -169,4 +224,5 @@ void __cpuinit calibrate_delay(void) + loops_per_jiffy); + } + ++ calc_cycles_per_jiffy(); + } +diff --git a/init/main.c b/init/main.c +index f7fb200..25c009c 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -60,6 +60,9 @@ + #include + #include + #include ++#include ++ ++#include + + #include + #include +@@ -107,6 +110,16 @@ extern void tc_init(void); + enum system_states system_state; + EXPORT_SYMBOL(system_state); + ++#ifdef CONFIG_VE ++extern void init_ve_system(void); ++extern void init_ve0(void); ++extern void prepare_ve0_process(struct task_struct *tsk); ++#else ++#define init_ve_system() do { } while (0) ++#define init_ve0() do { } while (0) ++#define prepare_ve0_process(tsk) do { } while (0) ++#endif ++ + /* + * Boot command-line arguments + */ +@@ -538,6 +551,9 @@ asmlinkage void __init start_kernel(void) + + smp_setup_processor_id(); + ++ prepare_ve0_process(&init_task); ++ init_ve0(); ++ + /* + * Need to run as early as possible, to initialize the + * lockdep hash: +@@ -556,6 +572,7 @@ asmlinkage void __init start_kernel(void) + * enable them + */ + lock_kernel(); ++ ub_init_early(); + tick_init(); + boot_cpu_init(); + page_address_init(); +@@ -659,6 +676,7 @@ asmlinkage void __init start_kernel(void) + thread_info_cache_init(); + fork_init(num_physpages); + proc_caches_init(); ++ ub_init_late(); + buffer_init(); + unnamed_dev_init(); + key_init(); +@@ -680,6 +698,10 @@ asmlinkage void __init start_kernel(void) + + acpi_early_init(); /* before LAPIC and SMP init */ + ++#ifdef CONFIG_BC_RSS_ACCOUNTING ++ ub_init_pbc(); ++#endif ++ + /* Do the rest non-__init'ed, we're now alive */ + rest_init(); + } +@@ -758,6 +780,8 @@ static void __init do_initcalls(void) + */ + static void __init do_basic_setup(void) + { ++ init_ve_system(); ++ + /* drivers will send hotplug events */ + init_workqueues(); + usermodehelper_init(); +@@ -859,6 +883,7 @@ static int __init kernel_init(void * unused) + do_pre_smp_initcalls(); + + smp_init(); ++ fairsched_init_late(); + sched_init_smp(); + + cpuset_init_smp(); +diff --git a/init/version.c b/init/version.c +index 9d17d70..ce53b6b 100644 +--- a/init/version.c ++++ b/init/version.c +@@ -33,6 +33,12 @@ struct uts_namespace init_uts_ns = { + }; + EXPORT_SYMBOL_GPL(init_uts_ns); + ++struct new_utsname virt_utsname = { ++ /* we need only this field */ ++ .release = UTS_RELEASE, ++}; ++EXPORT_SYMBOL(virt_utsname); ++ + /* FIXED STRINGS! Don't touch! 
*/ + const char linux_banner[] = + "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" +diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c +index d349746..7e5dde4 100644 +--- a/ipc/ipc_sysctl.c ++++ b/ipc/ipc_sysctl.c +@@ -225,19 +225,14 @@ static struct ctl_table ipc_kern_table[] = { + {} + }; + +-static struct ctl_table ipc_root_table[] = { +- { +- .ctl_name = CTL_KERN, +- .procname = "kernel", +- .mode = 0555, +- .child = ipc_kern_table, +- }, ++static struct ctl_path ipc_path[] = { ++ { .ctl_name = CTL_KERN, .procname = "kernel", }, + {} + }; + + static int __init ipc_sysctl_init(void) + { +- register_sysctl_table(ipc_root_table); ++ register_sysctl_glob_paths(ipc_path, ipc_kern_table, 1); + return 0; + } + +diff --git a/ipc/msg.c b/ipc/msg.c +index b4eee1c..4fb6c0f 100644 +--- a/ipc/msg.c ++++ b/ipc/msg.c +@@ -183,6 +183,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) + int id, retval; + key_t key = params->key; + int msgflg = params->flg; ++ int msqid = params->id; + + msq = ipc_rcu_alloc(sizeof(*msq)); + if (!msq) +@@ -201,7 +202,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) + /* + * ipc_addid() locks msq + */ +- id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); ++ id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni, msqid); + if (id < 0) { + security_msg_queue_free(msq); + ipc_rcu_putref(msq); +@@ -323,6 +324,7 @@ asmlinkage long sys_msgget(key_t key, int msgflg) + + msg_params.key = key; + msg_params.flg = msgflg; ++ msg_params.id = -1; + + return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params); + } +@@ -942,3 +944,55 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it) + msq->q_ctime); + } + #endif ++ ++#ifdef CONFIG_VE ++#include ++ ++int sysvipc_setup_msg(key_t key, int msqid, int msgflg) ++{ ++ struct ipc_namespace *ns; ++ struct ipc_ops msg_ops; ++ struct ipc_params msg_params; ++ ++ ns = current->nsproxy->ipc_ns; ++ ++ msg_ops.getnew = newque; ++ msg_ops.associate = msg_security; ++ msg_ops.more_checks = NULL; ++ ++ msg_params.key = key; ++ msg_params.flg = msgflg | IPC_CREAT; ++ msg_params.id = msqid; ++ ++ return ipcget(ns, &msg_ids(ns), &msg_ops, &msg_params); ++} ++EXPORT_SYMBOL_GPL(sysvipc_setup_msg); ++ ++int sysvipc_walk_msg(int (*func)(int i, struct msg_queue*, void *), void *arg) ++{ ++ int err = 0; ++ struct msg_queue * msq; ++ struct ipc_namespace *ns; ++ int next_id; ++ int total, in_use; ++ ++ ns = current->nsproxy->ipc_ns; ++ ++ down_write(&msg_ids(ns).rw_mutex); ++ in_use = msg_ids(ns).in_use; ++ for (total = 0, next_id = 0; total < in_use; next_id++) { ++ msq = idr_find(&msg_ids(ns).ipcs_idr, next_id); ++ if (msq == NULL) ++ continue; ++ ipc_lock_by_ptr(&msq->q_perm); ++ err = func(ipc_buildid(next_id, msq->q_perm.seq), msq, arg); ++ msg_unlock(msq); ++ if (err) ++ break; ++ total++; ++ } ++ up_write(&msg_ids(ns).rw_mutex); ++ return err; ++} ++EXPORT_SYMBOL_GPL(sysvipc_walk_msg); ++#endif +diff --git a/ipc/msgutil.c b/ipc/msgutil.c +index c82c215..d058294 100644 +--- a/ipc/msgutil.c ++++ b/ipc/msgutil.c +@@ -8,6 +8,7 @@ + * See the file COPYING for more details. 
+ */ + ++#include + #include + #include + #include +@@ -17,6 +18,8 @@ + + #include "util.h" + ++#include ++ + struct msg_msgseg { + struct msg_msgseg* next; + /* the next part of the message follows immediately */ +@@ -25,52 +28,53 @@ struct msg_msgseg { + #define DATALEN_MSG (PAGE_SIZE-sizeof(struct msg_msg)) + #define DATALEN_SEG (PAGE_SIZE-sizeof(struct msg_msgseg)) + +-struct msg_msg *load_msg(const void __user *src, int len) ++struct msg_msg *sysv_msg_load(int (*load)(void * dst, int len, int offset, ++ void * data), int len, void * data) + { + struct msg_msg *msg; + struct msg_msgseg **pseg; + int err; + int alen; ++ int offset = 0; + + alen = len; + if (alen > DATALEN_MSG) + alen = DATALEN_MSG; + +- msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL); ++ msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_UBC); + if (msg == NULL) + return ERR_PTR(-ENOMEM); + + msg->next = NULL; + msg->security = NULL; + +- if (copy_from_user(msg + 1, src, alen)) { ++ if (load(msg + 1, alen, offset, data)) { + err = -EFAULT; + goto out_err; + } + + len -= alen; +- src = ((char __user *)src) + alen; ++ offset += alen; + pseg = &msg->next; + while (len > 0) { + struct msg_msgseg *seg; + alen = len; + if (alen > DATALEN_SEG) + alen = DATALEN_SEG; +- seg = kmalloc(sizeof(*seg) + alen, +- GFP_KERNEL); ++ seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_UBC); + if (seg == NULL) { + err = -ENOMEM; + goto out_err; + } + *pseg = seg; + seg->next = NULL; +- if (copy_from_user(seg + 1, src, alen)) { ++ if (load(seg + 1, alen, offset, data)) { + err = -EFAULT; + goto out_err; + } + pseg = &seg->next; + len -= alen; +- src = ((char __user *)src) + alen; ++ offset += alen; + } + + err = security_msg_msg_alloc(msg); +@@ -83,33 +87,58 @@ out_err: + free_msg(msg); + return ERR_PTR(err); + } ++EXPORT_SYMBOL_GPL(sysv_msg_load); + +-int store_msg(void __user *dest, struct msg_msg *msg, int len) ++static int do_load_msg(void * dst, int len, int offset, void * data) ++{ ++ return copy_from_user(dst, data + offset, len); ++} ++ ++struct msg_msg *load_msg(const void __user *src, int len) ++{ ++ return sysv_msg_load(do_load_msg, len, (void*)src); ++} ++ ++int sysv_msg_store(struct msg_msg *msg, ++ int (*store)(void * src, int len, int offset, void * data), ++ int len, void * data) + { + int alen; ++ int offset = 0; + struct msg_msgseg *seg; +- ++ + alen = len; + if (alen > DATALEN_MSG) + alen = DATALEN_MSG; +- if (copy_to_user(dest, msg + 1, alen)) ++ if (store(msg + 1, alen, offset, data)) + return -1; + + len -= alen; +- dest = ((char __user *)dest) + alen; ++ offset += alen; + seg = msg->next; + while (len > 0) { + alen = len; + if (alen > DATALEN_SEG) + alen = DATALEN_SEG; +- if (copy_to_user(dest, seg + 1, alen)) ++ if (store(seg + 1, alen, offset, data)) + return -1; + len -= alen; +- dest = ((char __user *)dest) + alen; ++ offset += alen; + seg = seg->next; + } + return 0; + } ++EXPORT_SYMBOL_GPL(sysv_msg_store); ++ ++static int do_store_msg(void * src, int len, int offset, void * data) ++{ ++ return copy_to_user(data + offset, src, len); ++} ++ ++int store_msg(void __user *dest, struct msg_msg *msg, int len) ++{ ++ return sysv_msg_store(msg, do_store_msg, len, dest); ++} + + void free_msg(struct msg_msg *msg) + { +diff --git a/ipc/sem.c b/ipc/sem.c +index e9418df..2786746 100644 +--- a/ipc/sem.c ++++ b/ipc/sem.c +@@ -87,6 +87,8 @@ + #include + #include "util.h" + ++#include ++ + #define sem_ids(ns) ((ns)->ids[IPC_SEM_IDS]) + + #define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) +@@ -240,6 +242,7 @@ static int newary(struct 
ipc_namespace *ns, struct ipc_params *params) + key_t key = params->key; + int nsems = params->u.nsems; + int semflg = params->flg; ++ int semid = params->id; + + if (!nsems) + return -EINVAL; +@@ -263,7 +266,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) + return retval; + } + +- id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); ++ id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni, semid); + if (id < 0) { + security_sem_free(sma); + ipc_rcu_putref(sma); +@@ -327,6 +330,7 @@ asmlinkage long sys_semget(key_t key, int nsems, int semflg) + sem_params.key = key; + sem_params.flg = semflg; + sem_params.u.nsems = nsems; ++ sem_params.id = -1; + + return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); + } +@@ -947,7 +951,7 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp) + + undo_list = current->sysvsem.undo_list; + if (!undo_list) { +- undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL); ++ undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_UBC); + if (undo_list == NULL) + return -ENOMEM; + spin_lock_init(&undo_list->lock); +@@ -1004,7 +1008,8 @@ static struct sem_undo *find_undo(struct ipc_namespace *ns, int semid) + nsems = sma->sem_nsems; + sem_getref_and_unlock(sma); + +- new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); ++ new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, ++ GFP_KERNEL_UBC); + if (!new) { + sem_putref(sma); + return ERR_PTR(-ENOMEM); +@@ -1059,7 +1064,7 @@ asmlinkage long sys_semtimedop(int semid, struct sembuf __user *tsops, + if (nsops > ns->sc_semopm) + return -E2BIG; + if(nsops > SEMOPM_FAST) { +- sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL); ++ sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL_UBC); + if(sops==NULL) + return -ENOMEM; + } +@@ -1341,3 +1346,57 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) + sma->sem_ctime); + } + #endif ++ ++#ifdef CONFIG_VE ++#include ++ ++int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg) ++{ ++ struct ipc_namespace *ns; ++ struct ipc_ops sem_ops; ++ struct ipc_params sem_params; ++ ++ ns = current->nsproxy->ipc_ns; ++ ++ sem_ops.getnew = newary; ++ sem_ops.associate = sem_security; ++ sem_ops.more_checks = sem_more_checks; ++ ++ sem_params.key = key; ++ sem_params.flg = semflg | IPC_CREAT; ++ sem_params.u.nsems = size; ++ sem_params.id = semid; ++ ++ return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); ++} ++EXPORT_SYMBOL_GPL(sysvipc_setup_sem); ++ ++int sysvipc_walk_sem(int (*func)(int i, struct sem_array*, void *), void *arg) ++{ ++ int err = 0; ++ struct sem_array *sma; ++ struct ipc_namespace *ns; ++ int next_id; ++ int total, in_use; ++ ++ ns = current->nsproxy->ipc_ns; ++ ++ down_write(&sem_ids(ns).rw_mutex); ++ in_use = sem_ids(ns).in_use; ++ for (total = 0, next_id = 0; total < in_use; next_id++) { ++ sma = idr_find(&sem_ids(ns).ipcs_idr, next_id); ++ if (sma == NULL) ++ continue; ++ ipc_lock_by_ptr(&sma->sem_perm); ++ err = func(ipc_buildid(next_id, sma->sem_perm.seq), sma, arg); ++ sem_unlock(sma); ++ if (err) ++ break; ++ total++; ++ } ++ up_write(&sem_ids(ns).rw_mutex); ++ return err; ++} ++EXPORT_SYMBOL_GPL(sysvipc_walk_sem); ++EXPORT_SYMBOL_GPL(exit_sem); ++#endif +diff --git a/ipc/shm.c b/ipc/shm.c +index 790240c..e9ff453 100644 +--- a/ipc/shm.c ++++ b/ipc/shm.c +@@ -39,27 +39,17 @@ + #include + #include + #include ++#include + + #include + +-#include "util.h" +- +-struct shm_file_data { +- int id; +- struct ipc_namespace *ns; +- struct file *file; +- const struct 
vm_operations_struct *vm_ops; +-}; ++#include ++#include + +-#define shm_file_data(file) (*((struct shm_file_data **)&(file)->private_data)) ++#include "util.h" + +-static const struct file_operations shm_file_operations; + static struct vm_operations_struct shm_vm_ops; + +-#define shm_ids(ns) ((ns)->ids[IPC_SHM_IDS]) +- +-#define shm_unlock(shp) \ +- ipc_unlock(&(shp)->shm_perm) + + static int newseg(struct ipc_namespace *, struct ipc_params *); + static void shm_open(struct vm_area_struct *vma); +@@ -126,20 +116,6 @@ static inline struct shmid_kernel *shm_lock_down(struct ipc_namespace *ns, + return container_of(ipcp, struct shmid_kernel, shm_perm); + } + +-/* +- * shm_lock_(check_) routines are called in the paths where the rw_mutex +- * is not held. +- */ +-static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) +-{ +- struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id); +- +- if (IS_ERR(ipcp)) +- return (struct shmid_kernel *)ipcp; +- +- return container_of(ipcp, struct shmid_kernel, shm_perm); +-} +- + static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, + int id) + { +@@ -172,6 +148,48 @@ static void shm_open(struct vm_area_struct *vma) + shm_unlock(shp); + } + ++static int shmem_lock(struct shmid_kernel *shp, int lock, ++ struct user_struct *user) ++{ ++ struct file *file = shp->shm_file; ++ struct inode *inode = file->f_path.dentry->d_inode; ++ struct shmem_inode_info *info = SHMEM_I(inode); ++ unsigned long size; ++ ++ size = shp->shm_segsz + PAGE_SIZE - 1; ++ ++#ifdef CONFIG_SHMEM ++ spin_lock(&info->lock); ++ if (lock && !(info->flags & VM_LOCKED)) { ++ if (ub_lockedshm_charge(info, size) < 0) ++ goto out_ch; ++ ++ if (!user_shm_lock(inode->i_size, user)) ++ goto out_user; ++ info->flags |= VM_LOCKED; ++ } ++ if (!lock && (info->flags & VM_LOCKED) && user) { ++ ub_lockedshm_uncharge(info, size); ++ user_shm_unlock(inode->i_size, user); ++ info->flags &= ~VM_LOCKED; ++ } ++ spin_unlock(&info->lock); ++ return 0; ++ ++out_user: ++ ub_lockedshm_uncharge(info, size); ++out_ch: ++ spin_unlock(&info->lock); ++ return -ENOMEM; ++#else ++ if (lock && ub_lockedshm_charge(info, size)) ++ return -ENOMEM; ++ if (!lock) ++ ub_lockedshm_uncharge(info, size); ++ return 0; ++#endif ++} ++ + /* + * shm_destroy - free the struct shmid_kernel + * +@@ -187,7 +205,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) + shm_rmid(ns, shp); + shm_unlock(shp); + if (!is_file_hugepages(shp->shm_file)) +- shmem_lock(shp->shm_file, 0, shp->mlock_user); ++ shmem_lock(shp, 0, shp->mlock_user); + else + user_shm_unlock(shp->shm_file->f_path.dentry->d_inode->i_size, + shp->mlock_user); +@@ -319,12 +337,13 @@ int is_file_shm_hugepages(struct file *file) + return ret; + } + +-static const struct file_operations shm_file_operations = { ++const struct file_operations shm_file_operations = { + .mmap = shm_mmap, + .fsync = shm_fsync, + .release = shm_release, + .get_unmapped_area = shm_get_unmapped_area, + }; ++EXPORT_SYMBOL_GPL(shm_file_operations); + + static struct vm_operations_struct shm_vm_ops = { + .open = shm_open, /* callback for a new vm-area open */ +@@ -349,11 +368,12 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) + key_t key = params->key; + int shmflg = params->flg; + size_t size = params->u.size; ++ int shmid = params->id; + int error; + struct shmid_kernel *shp; + int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; + struct file * file; +- char name[13]; ++ char name[64]; + int id; + + if (size < 
SHMMIN || size > ns->shm_ctlmax) +@@ -377,7 +397,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) + return error; + } + +- sprintf (name, "SYSV%08x", key); ++ snprintf (name, sizeof(name), "VE%d-SYSV%08x", VEID(get_exec_env()), key); + if (shmflg & SHM_HUGETLB) { + /* hugetlb_file_setup takes care of mlock user accounting */ + file = hugetlb_file_setup(name, size); +@@ -397,7 +417,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) + if (IS_ERR(file)) + goto no_file; + +- id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); ++ id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni, shmid); + if (id < 0) { + error = id; + goto no_id; +@@ -470,6 +490,7 @@ asmlinkage long sys_shmget (key_t key, size_t size, int shmflg) + shm_params.key = key; + shm_params.flg = shmflg; + shm_params.u.size = size; ++ shm_params.id = -1; + + return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); + } +@@ -778,14 +799,14 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) + if(cmd==SHM_LOCK) { + struct user_struct * user = current->user; + if (!is_file_hugepages(shp->shm_file)) { +- err = shmem_lock(shp->shm_file, 1, user); ++ err = shmem_lock(shp, 1, user); + if (!err && !(shp->shm_perm.mode & SHM_LOCKED)){ + shp->shm_perm.mode |= SHM_LOCKED; + shp->mlock_user = user; + } + } + } else if (!is_file_hugepages(shp->shm_file)) { +- shmem_lock(shp->shm_file, 0, shp->mlock_user); ++ shmem_lock(shp, 0, shp->mlock_user); + shp->shm_perm.mode &= ~SHM_LOCKED; + shp->mlock_user = NULL; + } +@@ -1084,3 +1105,67 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it) + shp->shm_ctim); + } + #endif ++ ++#ifdef CONFIG_VE ++#include ++ ++struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg) ++{ ++ struct ipc_namespace *ns; ++ struct ipc_ops shm_ops; ++ struct ipc_params shm_params; ++ struct shmid_kernel *shp; ++ struct file *file; ++ int rv; ++ ++ ns = current->nsproxy->ipc_ns; ++ ++ shm_ops.getnew = newseg; ++ shm_ops.associate = shm_security; ++ shm_ops.more_checks = shm_more_checks; ++ ++ shm_params.key = key; ++ shm_params.flg = shmflg | IPC_CREAT; ++ shm_params.u.size = size; ++ shm_params.id = shmid; ++ ++ rv = ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params); ++ if (rv < 0) ++ return ERR_PTR(rv); ++ shp = shm_lock(ns, rv); ++ BUG_ON(IS_ERR(shp)); ++ file = shp->shm_file; ++ get_file(file); ++ shm_unlock(shp); ++ return file; ++} ++EXPORT_SYMBOL_GPL(sysvipc_setup_shm); ++ ++int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg) ++{ ++ int err = 0; ++ struct shmid_kernel* shp; ++ struct ipc_namespace *ns; ++ int next_id; ++ int total, in_use; ++ ++ ns = current->nsproxy->ipc_ns; ++ ++ down_write(&shm_ids(ns).rw_mutex); ++ in_use = shm_ids(ns).in_use; ++ for (total = 0, next_id = 0; total < in_use; next_id++) { ++ shp = idr_find(&shm_ids(ns).ipcs_idr, next_id); ++ if (shp == NULL) ++ continue; ++ ipc_lock_by_ptr(&shp->shm_perm); ++ err = func(shp, arg); ++ shm_unlock(shp); ++ if (err) ++ break; ++ total++; ++ } ++ up_write(&shm_ids(ns).rw_mutex); ++ return err; ++} ++EXPORT_SYMBOL_GPL(sysvipc_walk_shm); ++#endif +diff --git a/ipc/util.c b/ipc/util.c +index 3339177..5f1b3a2 100644 +--- a/ipc/util.c ++++ b/ipc/util.c +@@ -38,6 +38,8 @@ + + #include + ++#include ++ + #include "util.h" + + struct ipc_proc_iface { +@@ -247,6 +249,7 @@ int ipc_get_maxid(struct ipc_ids *ids) + * @ids: IPC identifier set + * @new: new IPC permission set + * @size: limit for the number of used ids ++ * 
@reqid: if >= 0, get this id exactly. If -1 -- don't care. + * + * Add an entry 'new' to the IPC ids idr. The permissions object is + * initialised and the first free entry is set up and the id assigned +@@ -256,10 +259,18 @@ int ipc_get_maxid(struct ipc_ids *ids) + * Called with ipc_ids.rw_mutex held as a writer. + */ + +-int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) ++int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size, int reqid) + { + int id, err; + ++ if (reqid >= 0) { ++ id = reqid % SEQ_MULTIPLIER; ++ err = idr_get_new_above(&ids->ipcs_idr, new, id, &id); ++ if (err || id != (reqid % SEQ_MULTIPLIER)) ++ return -EEXIST; ++ goto found; ++ } ++ + if (size > IPCMNI) + size = IPCMNI; + +@@ -270,14 +281,19 @@ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) + if (err) + return err; + ++found: + ids->in_use++; + + new->cuid = new->uid = current->euid; + new->gid = new->cgid = current->egid; + +- new->seq = ids->seq++; +- if(ids->seq > ids->seq_max) +- ids->seq = 0; ++ if (reqid >= 0) { ++ new->seq = reqid/SEQ_MULTIPLIER; ++ } else { ++ new->seq = ids->seq++; ++ if(ids->seq > ids->seq_max) ++ ids->seq = 0; ++ } + + new->id = ipc_buildid(id, new->seq); + spin_lock_init(&new->lock); +@@ -445,9 +461,9 @@ void* ipc_alloc(int size) + { + void* out; + if(size > PAGE_SIZE) +- out = vmalloc(size); ++ out = ub_vmalloc(size); + else +- out = kmalloc(size, GFP_KERNEL); ++ out = kmalloc(size, GFP_KERNEL_UBC); + return out; + } + +@@ -530,14 +546,14 @@ void* ipc_rcu_alloc(int size) + * workqueue if necessary (for vmalloc). + */ + if (rcu_use_vmalloc(size)) { +- out = vmalloc(HDRLEN_VMALLOC + size); ++ out = ub_vmalloc(HDRLEN_VMALLOC + size); + if (out) { + out += HDRLEN_VMALLOC; + container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1; + container_of(out, struct ipc_rcu_hdr, data)->refcount = 1; + } + } else { +- out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL); ++ out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL_UBC); + if (out) { + out += HDRLEN_KMALLOC; + container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0; +@@ -724,6 +740,7 @@ struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id) + + return out; + } ++EXPORT_SYMBOL_GPL(ipc_lock); + + /** + * ipc_lock_down - Lock an ipc structure with rw_sem held +@@ -863,7 +880,7 @@ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd, + goto out_unlock; + } + if (current->euid == ipcp->cuid || +- current->euid == ipcp->uid || capable(CAP_SYS_ADMIN)) ++ current->euid == ipcp->uid || capable(CAP_VE_SYS_ADMIN)) + return ipcp; + + err = -EPERM; +diff --git a/ipc/util.h b/ipc/util.h +index cdb966a..e45893d 100644 +--- a/ipc/util.h ++++ b/ipc/util.h +@@ -39,6 +39,7 @@ struct ipc_params { + size_t size; /* for shared memories */ + int nsems; /* for semaphores */ + } u; /* holds the getnew() specific param */ ++ int id; + }; + + /* +@@ -68,14 +69,10 @@ void __init ipc_init_proc_interface(const char *path, const char *header, + #define ipc_init_proc_interface(path, header, ids, show) do {} while (0) + #endif + +-#define IPC_SEM_IDS 0 +-#define IPC_MSG_IDS 1 +-#define IPC_SHM_IDS 2 +- + #define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) + + /* must be called with ids->rw_mutex acquired for writing */ +-int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); ++int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int, int); + + /* must be called with ids->rw_mutex acquired for reading */ + int ipc_get_maxid(struct ipc_ids *); +@@ -107,7 +104,6 @@ void 
ipc_rcu_putref(void *ptr); + * ipc_lock: called without that lock held + */ + struct kern_ipc_perm *ipc_lock_down(struct ipc_ids *, int); +-struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int); + + void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out); + void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out); +@@ -149,12 +145,6 @@ static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm) + spin_lock(&perm->lock); + } + +-static inline void ipc_unlock(struct kern_ipc_perm *perm) +-{ +- spin_unlock(&perm->lock); +- rcu_read_unlock(); +-} +- + struct kern_ipc_perm *ipc_lock_check_down(struct ipc_ids *ids, int id); + struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id); + int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, +diff --git a/kernel/Kconfig.openvz b/kernel/Kconfig.openvz +new file mode 100644 +index 0000000..dfd54fd +--- /dev/null ++++ b/kernel/Kconfig.openvz +@@ -0,0 +1,91 @@ ++# Copyright (C) 2005 SWsoft ++# All rights reserved. ++# Licensing governed by "linux/COPYING.SWsoft" file. ++ ++menu "OpenVZ" ++ ++config VE ++ bool "Virtual Environment support" ++ default y ++ select NAMESPACES ++ select PID_NS ++ select IPC_NS ++ select UTS_NS ++ select NET_NS ++ select USER_NS ++ select CGROUPS ++ select CGROUP_DEVICE ++ select GROUP_SCHED ++ select FAIR_GROUP_SCHED ++ help ++ This option adds support of virtual Linux running on the original box ++ with fully supported virtual network driver, tty subsystem and ++ configurable access for hardware and other resources. ++ ++config VE_CALLS ++ tristate "VE calls interface" ++ depends on VE ++ select VZ_DEV ++ default m ++ help ++ This option controls how to build vzmon code containing VE calls. ++ By default it's build in module vzmon.o ++ ++config VZ_GENCALLS ++ bool ++ default y ++ ++config VE_NETDEV ++ tristate "VE network device" ++ depends on VE_CALLS && NET ++ select VZ_DEV ++ default m ++ help ++ This option controls whether to build venet device. This is a ++ common interface for networking in VE. ++ ++config VE_ETHDEV ++ tristate "Virtual ethernet device" ++ depends on VE_CALLS && NET ++ select VZ_DEV ++ default m ++ help ++ This option controls whether to build virtual ethernet device. ++ ++config VZ_DEV ++ tristate "VE device" ++ default m ++ help ++ This option adds support of vzdev device, which is used by ++ user-space applications to control Virtual Environments. ++ ++config VE_IPTABLES ++ bool "VE netfiltering" ++ depends on VE && VE_NETDEV && INET && NETFILTER ++ default y ++ help ++ This option controls whether to build VE netfiltering code. ++ ++config VZ_WDOG ++ tristate "VE watchdog module" ++ depends on VE_CALLS ++ default m ++ help ++ This option controls building of vzwdog module, which dumps ++ a lot of useful system info on console periodically. ++ ++config VZ_CHECKPOINT ++ tristate "Checkpointing & restoring Virtual Environments" ++ depends on VE_CALLS && INET ++ select PM ++ select PM_SLEEP ++ select TUN ++ select VE_ETHDEV ++ select VE_NETDEV ++ default n ++ help ++ This option adds two modules, "cpt" and "rst", which allow ++ to save a running Virtual Environment and restore it ++ on another host (live migration) or on the same host (checkpointing). 
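++#
++# With the defaults above, a fresh configuration would produce roughly
++# the following fragment (a sketch; distribution configs such as
++# Debian's may override any of these, and only VZ_CHECKPOINT
++# defaults to off):
++#
++#	CONFIG_VE=y
++#	CONFIG_VE_CALLS=m
++#	CONFIG_VZ_GENCALLS=y
++#	CONFIG_VE_NETDEV=m
++#	CONFIG_VE_ETHDEV=m
++#	CONFIG_VZ_DEV=m
++#	CONFIG_VE_IPTABLES=y
++#	CONFIG_VZ_WDOG=m
++#	# CONFIG_VZ_CHECKPOINT is not set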
++ ++endmenu +diff --git a/kernel/Makefile b/kernel/Makefile +index 1c9938a..d16fa33 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -14,6 +14,10 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ + obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o + obj-$(CONFIG_STACKTRACE) += stacktrace.o + obj-y += time/ ++obj-$(CONFIG_BEANCOUNTERS) += bc/ ++obj-y += ve/ ++obj-$(CONFIG_VZ_CHECKPOINT) += cpt/ ++ + obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o + obj-$(CONFIG_LOCKDEP) += lockdep.o + ifeq ($(CONFIG_PROC_FS),y) +@@ -38,7 +42,11 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o + obj-$(CONFIG_KEXEC) += kexec.o + obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o + obj-$(CONFIG_COMPAT) += compat.o ++ifeq ($(CONFIG_VE),n) + obj-$(CONFIG_CGROUPS) += cgroup.o ++else ++obj-$(CONFIG_CGROUPS) += cgroup_lite.o ++endif + obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o + obj-$(CONFIG_CPUSETS) += cpuset.o + obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o +@@ -69,6 +77,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o + obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o + obj-$(CONFIG_MARKERS) += marker.o + obj-$(CONFIG_LATENCYTOP) += latencytop.o ++obj-$(CONFIG_VZ_FAIRSCHED) += fairsched.o + + ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) + # According to Alan Modra , the -fno-omit-frame-pointer is +diff --git a/kernel/audit.c b/kernel/audit.c +index e092f1c..ddc9b19 100644 +--- a/kernel/audit.c ++++ b/kernel/audit.c +@@ -666,6 +666,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) + char *ctx = NULL; + u32 len; + ++ if (!ve_is_super(skb->owner_env)) ++ return -ECONNREFUSED; ++ + err = audit_netlink_ok(skb, msg_type); + if (err) + return err; +diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c +index 98c50cc..450142b 100644 +--- a/kernel/auditfilter.c ++++ b/kernel/auditfilter.c +@@ -164,8 +164,8 @@ static struct audit_parent *audit_init_parent(struct nameidata *ndp) + inotify_init_watch(&parent->wdata); + /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ + get_inotify_watch(&parent->wdata); +- wd = inotify_add_watch(audit_ih, &parent->wdata, +- ndp->path.dentry->d_inode, AUDIT_IN_WATCH); ++ wd = inotify_add_watch_dget(audit_ih, &parent->wdata, ++ &ndp->path, AUDIT_IN_WATCH); + if (wd < 0) { + audit_free_parent(&parent->wdata); + return ERR_PTR(wd); +diff --git a/kernel/bc/Kconfig b/kernel/bc/Kconfig +new file mode 100644 +index 0000000..2c3de4a +--- /dev/null ++++ b/kernel/bc/Kconfig +@@ -0,0 +1,111 @@ ++# ++# User resources part (UBC) ++# ++# Copyright (C) 2005 SWsoft ++# All rights reserved. ++# ++# Licensing governed by "linux/COPYING.SWsoft" file. ++ ++menu "User resources" ++ ++config BEANCOUNTERS ++ bool "Enable user resource accounting" ++ default y ++ help ++ This patch provides accounting and allows to configure ++ limits for user's consumption of exhaustible system resources. ++ The most important resource controlled by this patch is unswappable ++ memory (either mlock'ed or used by internal kernel structures and ++ buffers). The main goal of this patch is to protect processes ++ from running short of important resources because of an accidental ++ misbehavior of processes or malicious activity aiming to ``kill'' ++ the system. It's worth to mention that resource limits configured ++ by setrlimit(2) do not give an acceptable level of protection ++ because they cover only small fraction of resources and work on a ++ per-process basis. 
Per-process accounting doesn't prevent malicious ++ users from spawning a lot of resource-consuming processes. ++ ++config BC_RSS_ACCOUNTING ++ bool "Account physical memory usage" ++ default y ++ depends on BEANCOUNTERS ++ help ++ This allows to estimate per beancounter physical memory usage. ++ Implemented alghorithm accounts shared pages of memory as well, ++ dividing them by number of beancounter which use the page. ++ ++config BC_IO_ACCOUNTING ++ bool "Account disk IO" ++ default y ++ depends on BC_RSS_ACCOUNTING ++ help ++ When on this option allows seeing disk IO activity caused by ++ tasks from each UB ++ ++config BC_IO_SCHED ++ bool "UBC I/O priority" ++ default y ++ depends on BC_IO_ACCOUNTING && IOSCHED_CFQ ++ help ++ This option controls whether to build CFQ I/O scheduler ++ with support of UBC I/O priority. ++ ++config BC_SWAP_ACCOUNTING ++ bool "Account swap usage" ++ default y ++ depends on BEANCOUNTERS ++ help ++ This allows accounting of swap usage. ++ ++config BC_PROC ++ bool "Report resource usage in /proc" ++ default y ++ depends on BEANCOUNTERS ++ help ++ Allows a system administrator to inspect resource accounts and limits. ++ ++config BC_DEBUG ++ bool "User resources debug features" ++ default n ++ depends on BEANCOUNTERS ++ help ++ Enables to setup debug features for user resource accounting ++ ++config BC_DEBUG_IO ++ bool "Debug IO accounting" ++ default y ++ depends on BC_DEBUG && BC_IO_ACCOUNTING ++ help ++ Debugging for IO accointing. ++ ++config BC_DEBUG_KMEM ++ bool "Debug kmemsize with cache counters" ++ default n ++ depends on BC_DEBUG ++ help ++ Adds /proc/user_beancounters_debug entry to get statistics ++ about cache usage of each beancounter ++ ++config BC_KEEP_UNUSED ++ bool "Keep unused beancounter alive" ++ default y ++ depends on BC_DEBUG ++ help ++ If on, unused beancounters are kept on the hash and maxheld value ++ can be looked through. ++ ++config BC_DEBUG_ITEMS ++ bool "Account resources in items rather than in bytes" ++ default y ++ depends on BC_DEBUG ++ help ++ When true some of the resources (e.g. kmemsize) are accounted ++ in items instead of bytes. ++ ++config BC_UNLIMITED ++ bool "Use unlimited ubc settings" ++ default y ++ depends on BC_DEBUG ++ help ++ When ON all limits and barriers are set to max values. ++endmenu +diff --git a/kernel/bc/Makefile b/kernel/bc/Makefile +new file mode 100644 +index 0000000..e0e6529 +--- /dev/null ++++ b/kernel/bc/Makefile +@@ -0,0 +1,16 @@ ++# ++# User resources part (UBC) ++# ++# Copyright (C) 2005 SWsoft ++# All rights reserved. ++# ++# Licensing governed by "linux/COPYING.SWsoft" file. ++ ++obj-y := sys.o beancounter.o dcache.o kmem.o misc.o \ ++ vm_pages.o statd.o oom_kill.o ++ ++obj-$(CONFIG_NET) += net.o ++obj-$(CONFIG_BC_RSS_ACCOUNTING) += rss_pages.o ++obj-$(CONFIG_BC_PROC) += proc.o ++obj-$(CONFIG_BC_IO_ACCOUNTING) += io_acct.o ++obj-$(CONFIG_BC_IO_SCHED) += io_prio.o +diff --git a/kernel/bc/beancounter.c b/kernel/bc/beancounter.c +new file mode 100644 +index 0000000..48fa1cc +--- /dev/null ++++ b/kernel/bc/beancounter.c +@@ -0,0 +1,676 @@ ++/* ++ * linux/kernel/bc/beancounter.c ++ * ++ * Copyright (C) 1998 Alan Cox ++ * 1998-2000 Andrey V. Savochkin ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
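++ *
++ * This file implements the core beancounter object: a hash keyed by
++ * (parent, uid) with lookup/create helpers, reference counting with
++ * delayed release, and the generic charge/uncharge primitives that
++ * walk the parent chain and roll the charge back on failure.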
++ * ++ * TODO: ++ * - more intelligent limit check in mremap(): currently the new size is ++ * charged and _then_ old size is uncharged ++ * (almost done: !move_vma case is completely done, ++ * move_vma in its current implementation requires too many conditions to ++ * do things right, because it may be not only expansion, but shrinking ++ * also, plus do_munmap will require an additional parameter...) ++ * - problem: bad pmd page handling ++ * - consider /proc redesign ++ * - TCP/UDP ports ++ * + consider whether __charge_beancounter_locked should be inline ++ * ++ * Changes: ++ * 1999/08/17 Marcelo Tosatti ++ * - Set "barrier" and "limit" parts of limits atomically. ++ * 1999/10/06 Marcelo Tosatti ++ * - setublimit system call. ++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++static struct kmem_cache *ub_cachep; ++static struct user_beancounter default_beancounter; ++struct user_beancounter ub0; ++EXPORT_SYMBOL_GPL(ub0); ++ ++const char *ub_rnames[] = { ++ "kmemsize", /* 0 */ ++ "lockedpages", ++ "privvmpages", ++ "shmpages", ++ "dummy", ++ "numproc", /* 5 */ ++ "physpages", ++ "vmguarpages", ++ "oomguarpages", ++ "numtcpsock", ++ "numflock", /* 10 */ ++ "numpty", ++ "numsiginfo", ++ "tcpsndbuf", ++ "tcprcvbuf", ++ "othersockbuf", /* 15 */ ++ "dgramrcvbuf", ++ "numothersock", ++ "dcachesize", ++ "numfile", ++ "dummy", /* 20 */ ++ "dummy", ++ "dummy", ++ "numiptent", ++ "unused_privvmpages", /* UB_RESOURCES */ ++ "tmpfs_respages", ++ "swap_pages", ++ "held_pages", ++}; ++ ++static void init_beancounter_struct(struct user_beancounter *ub); ++static void init_beancounter_store(struct user_beancounter *ub); ++static void init_beancounter_nolimits(struct user_beancounter *ub); ++ ++int print_ub_uid(struct user_beancounter *ub, char *buf, int size) ++{ ++ if (ub->parent != NULL) ++ return snprintf(buf, size, "%u.%u", ++ ub->parent->ub_uid, ub->ub_uid); ++ else ++ return snprintf(buf, size, "%u", ub->ub_uid); ++} ++EXPORT_SYMBOL(print_ub_uid); ++ ++#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1)) ++#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17) ++struct hlist_head ub_hash[UB_HASH_SIZE]; ++DEFINE_SPINLOCK(ub_hash_lock); ++LIST_HEAD(ub_list_head); /* protected by ub_hash_lock */ ++EXPORT_SYMBOL(ub_hash); ++EXPORT_SYMBOL(ub_hash_lock); ++EXPORT_SYMBOL(ub_list_head); ++ ++/* ++ * Per user resource beancounting. Resources are tied to their luid. ++ * The resource structure itself is tagged both to the process and ++ * the charging resources (a socket doesn't want to have to search for ++ * things at irq time for example). Reference counters keep things in ++ * hand. ++ * ++ * The case where a user creates resource, kills all his processes and ++ * then starts new ones is correctly handled this way. The refcounters ++ * will mean the old entry is still around with resource tied to it. 
++ */ ++ ++static inline void free_ub(struct user_beancounter *ub) ++{ ++ free_percpu(ub->ub_percpu); ++ kmem_cache_free(ub_cachep, ub); ++} ++ ++static inline struct user_beancounter *bc_lookup_hash(struct hlist_head *hash, ++ uid_t uid, struct user_beancounter *parent) ++{ ++ struct user_beancounter *ub; ++ struct hlist_node *ptr; ++ ++ hlist_for_each_entry (ub, ptr, hash, ub_hash) ++ if (ub->ub_uid == uid && ub->parent == parent) ++ return get_beancounter(ub); ++ ++ return NULL; ++} ++ ++struct user_beancounter *get_beancounter_byuid(uid_t uid, int create) ++{ ++ struct user_beancounter *new_ub, *ub; ++ unsigned long flags; ++ struct hlist_head *hash; ++ ++ hash = &ub_hash[ub_hash_fun(uid)]; ++ new_ub = NULL; ++retry: ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ ub = bc_lookup_hash(hash, uid, NULL); ++ if (ub != NULL) { ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ++ if (new_ub != NULL) ++ free_ub(new_ub); ++ return ub; ++ } ++ ++ if (!create) { ++ /* no ub found */ ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return NULL; ++ } ++ ++ if (new_ub != NULL) { ++ list_add_rcu(&new_ub->ub_list, &ub_list_head); ++ hlist_add_head(&new_ub->ub_hash, hash); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return new_ub; ++ } ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ++ /* alloc new ub */ ++ new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, ++ GFP_KERNEL); ++ if (new_ub == NULL) ++ return NULL; ++ ++ ub_debug(UBD_ALLOC, "Creating ub %p\n", new_ub); ++ memcpy(new_ub, &default_beancounter, sizeof(*new_ub)); ++ init_beancounter_struct(new_ub); ++ new_ub->ub_percpu = alloc_percpu(struct ub_percpu_struct); ++ if (new_ub->ub_percpu == NULL) ++ goto fail_free; ++ new_ub->ub_uid = uid; ++ goto retry; ++ ++fail_free: ++ kmem_cache_free(ub_cachep, new_ub); ++ return NULL; ++} ++EXPORT_SYMBOL(get_beancounter_byuid); ++ ++struct user_beancounter *get_subbeancounter_byid(struct user_beancounter *p, ++ int id, int create) ++{ ++ struct user_beancounter *new_ub, *ub; ++ unsigned long flags; ++ struct hlist_head *hash; ++ ++ hash = &ub_hash[ub_subhash_fun(p, id)]; ++ new_ub = NULL; ++retry: ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ ub = bc_lookup_hash(hash, id, p); ++ if (ub != NULL) { ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ++ if (new_ub != NULL) { ++ put_beancounter(new_ub->parent); ++ free_ub(new_ub); ++ } ++ return ub; ++ } ++ ++ if (!create) { ++ /* no ub found */ ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return NULL; ++ } ++ ++ if (new_ub != NULL) { ++ list_add_rcu(&new_ub->ub_list, &ub_list_head); ++ hlist_add_head(&new_ub->ub_hash, hash); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return new_ub; ++ } ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ++ /* alloc new ub */ ++ new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, ++ GFP_KERNEL); ++ if (new_ub == NULL) ++ return NULL; ++ ++ ub_debug(UBD_ALLOC, "Creating sub %p\n", new_ub); ++ memset(new_ub, 0, sizeof(*new_ub)); ++ init_beancounter_nolimits(new_ub); ++ init_beancounter_store(new_ub); ++ init_beancounter_struct(new_ub); ++ new_ub->ub_percpu = alloc_percpu(struct ub_percpu_struct); ++ if (new_ub->ub_percpu == NULL) ++ goto fail_free; ++ new_ub->ub_uid = id; ++ new_ub->parent = get_beancounter(p); ++ goto retry; ++ ++fail_free: ++ kmem_cache_free(ub_cachep, new_ub); ++ return NULL; ++} ++EXPORT_SYMBOL(get_subbeancounter_byid); ++ ++static void put_warn(struct user_beancounter *ub) ++{ ++ char id[64]; ++ ++ print_ub_uid(ub, id, sizeof(id)); ++ printk(KERN_ERR 
"UB: Bad refcount (%d) on put of %s (%p)\n", ++ atomic_read(&ub->ub_refcount), id, ub); ++} ++ ++#ifdef CONFIG_BC_KEEP_UNUSED ++#define release_beancounter(ub) do { } while (0) ++#else ++static int verify_res(struct user_beancounter *ub, int resource, ++ unsigned long held) ++{ ++ char id[64]; ++ ++ if (likely(held == 0)) ++ return 1; ++ ++ print_ub_uid(ub, id, sizeof(id)); ++ printk(KERN_WARNING "Ub %s helds %lu in %s on put\n", ++ id, held, ub_rnames[resource]); ++ return 0; ++} ++ ++static inline void bc_verify_held(struct user_beancounter *ub) ++{ ++ int i, clean; ++ ++ clean = 1; ++ for (i = 0; i < UB_RESOURCES; i++) ++ clean &= verify_res(ub, i, ub->ub_parms[i].held); ++ ++ clean &= verify_res(ub, UB_UNUSEDPRIVVM, ub->ub_unused_privvmpages); ++ clean &= verify_res(ub, UB_TMPFSPAGES, ub->ub_tmpfs_respages); ++ clean &= verify_res(ub, UB_SWAPPAGES, ub->ub_swap_pages); ++ clean &= verify_res(ub, UB_HELDPAGES, (unsigned long)ub->ub_held_pages); ++ ++ ub_debug_trace(!clean, 5, 60*HZ); ++} ++ ++static void bc_free_rcu(struct rcu_head *rcu) ++{ ++ struct user_beancounter *ub; ++ ++ ub = container_of(rcu, struct user_beancounter, rcu); ++ free_ub(ub); ++} ++ ++static void delayed_release_beancounter(struct work_struct *w) ++{ ++ struct user_beancounter *ub, *parent; ++ unsigned long flags; ++ ++ ub = container_of(w, struct user_beancounter, cleanup.work); ++again: ++ local_irq_save(flags); ++ if (!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock)) { ++ /* raced with get_beancounter_byuid */ ++ local_irq_restore(flags); ++ return; ++ } ++ ++ hlist_del(&ub->ub_hash); ++ list_del_rcu(&ub->ub_list); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ++ bc_verify_held(ub); ++ ub_free_counters(ub); ++ bc_fini_ioprio(&ub->iopriv); ++ parent = ub->parent; ++ ++ call_rcu(&ub->rcu, bc_free_rcu); ++ if (parent) { ++ ub = parent; ++ goto again; ++ } ++} ++ ++static inline void release_beancounter(struct user_beancounter *ub) ++{ ++ struct execute_work *ew; ++ ++ ew = &ub->cleanup; ++ INIT_WORK(&ew->work, delayed_release_beancounter); ++ schedule_work(&ew->work); ++} ++#endif ++ ++void __put_beancounter(struct user_beancounter *ub) ++{ ++ unsigned long flags; ++ ++ /* equevalent to atomic_dec_and_lock_irqsave() */ ++ local_irq_save(flags); ++ if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) { ++ if (unlikely(atomic_read(&ub->ub_refcount) < 0)) ++ put_warn(ub); ++ local_irq_restore(flags); ++ return; ++ } ++ ++ if (unlikely(ub == get_ub0())) { ++ printk(KERN_ERR "Trying to put ub0\n"); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return; ++ } ++ ++ /* prevent get_beancounter_byuid + put_beancounter() reentrance */ ++ atomic_inc(&ub->ub_refcount); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ++ release_beancounter(ub); ++} ++EXPORT_SYMBOL(__put_beancounter); ++ ++void put_beancounter_safe(struct user_beancounter *ub) ++{ ++ synchronize_rcu(); ++ __put_beancounter(ub); ++} ++EXPORT_SYMBOL(put_beancounter_safe); ++ ++/* ++ * Generic resource charging stuff ++ */ ++ ++int __charge_beancounter_locked(struct user_beancounter *ub, ++ int resource, unsigned long val, enum ub_severity strict) ++{ ++ ub_debug_resource(resource, "Charging %lu for %d of %p with %lu\n", ++ val, resource, ub, ub->ub_parms[resource].held); ++ /* ++ * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and only one addition ++ * at the moment is possible so an overflow is impossible. 
++ */ ++ ub->ub_parms[resource].held += val; ++ ++ switch (strict) { ++ case UB_HARD: ++ if (ub->ub_parms[resource].held > ++ ub->ub_parms[resource].barrier) ++ break; ++ case UB_SOFT: ++ if (ub->ub_parms[resource].held > ++ ub->ub_parms[resource].limit) ++ break; ++ case UB_FORCE: ++ ub_adjust_maxheld(ub, resource); ++ return 0; ++ default: ++ BUG(); ++ } ++ ++ if (strict == UB_SOFT && ub_ratelimit(&ub->ub_limit_rl)) ++ printk(KERN_INFO "Fatal resource shortage: %s, UB %d.\n", ++ ub_rnames[resource], ub->ub_uid); ++ ub->ub_parms[resource].failcnt++; ++ ub->ub_parms[resource].held -= val; ++ return -ENOMEM; ++} ++ ++int charge_beancounter(struct user_beancounter *ub, ++ int resource, unsigned long val, enum ub_severity strict) ++{ ++ int retval; ++ struct user_beancounter *p, *q; ++ unsigned long flags; ++ ++ retval = -EINVAL; ++ if (val > UB_MAXVALUE) ++ goto out; ++ ++ local_irq_save(flags); ++ for (p = ub; p != NULL; p = p->parent) { ++ spin_lock(&p->ub_lock); ++ retval = __charge_beancounter_locked(p, resource, val, strict); ++ spin_unlock(&p->ub_lock); ++ if (retval) ++ goto unroll; ++ } ++out_restore: ++ local_irq_restore(flags); ++out: ++ return retval; ++ ++unroll: ++ for (q = ub; q != p; q = q->parent) { ++ spin_lock(&q->ub_lock); ++ __uncharge_beancounter_locked(q, resource, val); ++ spin_unlock(&q->ub_lock); ++ } ++ goto out_restore; ++} ++ ++EXPORT_SYMBOL(charge_beancounter); ++ ++void __charge_beancounter_notop(struct user_beancounter *ub, ++ int resource, unsigned long val) ++{ ++ struct user_beancounter *p; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ for (p = ub; p->parent != NULL; p = p->parent) { ++ spin_lock(&p->ub_lock); ++ __charge_beancounter_locked(p, resource, val, UB_FORCE); ++ spin_unlock(&p->ub_lock); ++ } ++ local_irq_restore(flags); ++} ++ ++EXPORT_SYMBOL(__charge_beancounter_notop); ++ ++void uncharge_warn(struct user_beancounter *ub, int resource, ++ unsigned long val, unsigned long held) ++{ ++ char id[64]; ++ ++ print_ub_uid(ub, id, sizeof(id)); ++ printk(KERN_ERR "Uncharging too much %lu h %lu, res %s ub %s\n", ++ val, held, ub_rnames[resource], id); ++ ub_debug_trace(1, 10, 10*HZ); ++} ++ ++void __uncharge_beancounter_locked(struct user_beancounter *ub, ++ int resource, unsigned long val) ++{ ++ ub_debug_resource(resource, "Uncharging %lu for %d of %p with %lu\n", ++ val, resource, ub, ub->ub_parms[resource].held); ++ if (ub->ub_parms[resource].held < val) { ++ uncharge_warn(ub, resource, ++ val, ub->ub_parms[resource].held); ++ val = ub->ub_parms[resource].held; ++ } ++ ub->ub_parms[resource].held -= val; ++} ++ ++void uncharge_beancounter(struct user_beancounter *ub, ++ int resource, unsigned long val) ++{ ++ unsigned long flags; ++ struct user_beancounter *p; ++ ++ for (p = ub; p != NULL; p = p->parent) { ++ spin_lock_irqsave(&p->ub_lock, flags); ++ __uncharge_beancounter_locked(p, resource, val); ++ spin_unlock_irqrestore(&p->ub_lock, flags); ++ } ++} ++ ++EXPORT_SYMBOL(uncharge_beancounter); ++ ++void __uncharge_beancounter_notop(struct user_beancounter *ub, ++ int resource, unsigned long val) ++{ ++ struct user_beancounter *p; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ for (p = ub; p->parent != NULL; p = p->parent) { ++ spin_lock(&p->ub_lock); ++ __uncharge_beancounter_locked(p, resource, val); ++ spin_unlock(&p->ub_lock); ++ } ++ local_irq_restore(flags); ++} ++ ++EXPORT_SYMBOL(__uncharge_beancounter_notop); ++ ++ ++/* ++ * Rate limiting stuff. 
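++ *
++ * ub_ratelimit() below implements a token bucket: up to "burst"
++ * events are allowed within one "interval" of jiffies, and one
++ * token drains from the bucket for every full interval that has
++ * elapsed since the previous event.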
++ */ ++int ub_ratelimit(struct ub_rate_info *p) ++{ ++ unsigned long cjif, djif; ++ unsigned long flags; ++ static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; ++ long new_bucket; ++ ++ spin_lock_irqsave(&ratelimit_lock, flags); ++ cjif = jiffies; ++ djif = cjif - p->last; ++ if (djif < p->interval) { ++ if (p->bucket >= p->burst) { ++ spin_unlock_irqrestore(&ratelimit_lock, flags); ++ return 0; ++ } ++ p->bucket++; ++ } else { ++ new_bucket = p->bucket - (djif / (unsigned)p->interval); ++ if (new_bucket < 0) ++ new_bucket = 0; ++ p->bucket = new_bucket + 1; ++ } ++ p->last = cjif; ++ spin_unlock_irqrestore(&ratelimit_lock, flags); ++ return 1; ++} ++EXPORT_SYMBOL(ub_ratelimit); ++ ++ ++/* ++ * Initialization ++ * ++ * struct user_beancounter contains ++ * - limits and other configuration settings, ++ * with a copy stored for accounting purposes, ++ * - structural fields: lists, spinlocks and so on. ++ * ++ * Before these parts are initialized, the structure should be memset ++ * to 0 or copied from a known clean structure. That takes care of a lot ++ * of fields not initialized explicitly. ++ */ ++ ++static void init_beancounter_struct(struct user_beancounter *ub) ++{ ++ ub->ub_magic = UB_MAGIC; ++ atomic_set(&ub->ub_refcount, 1); ++ spin_lock_init(&ub->ub_lock); ++ INIT_LIST_HEAD(&ub->ub_tcp_sk_list); ++ INIT_LIST_HEAD(&ub->ub_other_sk_list); ++#ifdef CONFIG_BC_DEBUG_KMEM ++ INIT_LIST_HEAD(&ub->ub_cclist); ++#endif ++ bc_init_ioprio(&ub->iopriv); ++} ++ ++static void init_beancounter_store(struct user_beancounter *ub) ++{ ++ int k; ++ ++ for (k = 0; k < UB_RESOURCES; k++) { ++ memcpy(&ub->ub_store[k], &ub->ub_parms[k], ++ sizeof(struct ubparm)); ++ } ++} ++ ++static void init_beancounter_nolimits(struct user_beancounter *ub) ++{ ++ int k; ++ ++ for (k = 0; k < UB_RESOURCES; k++) { ++ ub->ub_parms[k].limit = UB_MAXVALUE; ++ /* FIXME: whether this is right for physpages and guarantees? */ ++ ub->ub_parms[k].barrier = UB_MAXVALUE; ++ } ++ ++ /* FIXME: set unlimited rate? */ ++ ub->ub_limit_rl.burst = 4; ++ ub->ub_limit_rl.interval = 300*HZ; ++} ++ ++static void init_beancounter_syslimits(struct user_beancounter *ub) ++{ ++ unsigned long mp; ++ extern int max_threads; ++ int k; ++ ++ mp = num_physpages; ++ ub->ub_parms[UB_KMEMSIZE].limit = ++ mp > (192*1024*1024 >> PAGE_SHIFT) ? 
++ 32*1024*1024 : (mp << PAGE_SHIFT) / 6; ++ ub->ub_parms[UB_LOCKEDPAGES].limit = 8; ++ ub->ub_parms[UB_PRIVVMPAGES].limit = UB_MAXVALUE; ++ ub->ub_parms[UB_SHMPAGES].limit = 64; ++ ub->ub_parms[UB_NUMPROC].limit = max_threads / 2; ++ ub->ub_parms[UB_NUMTCPSOCK].limit = 1024; ++ ub->ub_parms[UB_TCPSNDBUF].limit = 1024*4*1024; /* 4k per socket */ ++ ub->ub_parms[UB_TCPRCVBUF].limit = 1024*6*1024; /* 6k per socket */ ++ ub->ub_parms[UB_NUMOTHERSOCK].limit = 256; ++ ub->ub_parms[UB_DGRAMRCVBUF].limit = 256*4*1024; /* 4k per socket */ ++ ub->ub_parms[UB_OTHERSOCKBUF].limit = 256*8*1024; /* 8k per socket */ ++ ub->ub_parms[UB_NUMFLOCK].limit = 1024; ++ ub->ub_parms[UB_NUMPTY].limit = 16; ++ ub->ub_parms[UB_NUMSIGINFO].limit = 1024; ++ ub->ub_parms[UB_DCACHESIZE].limit = 1024*1024; ++ ub->ub_parms[UB_NUMFILE].limit = 1024; ++ ++ for (k = 0; k < UB_RESOURCES; k++) ++ ub->ub_parms[k].barrier = ub->ub_parms[k].limit; ++ ++ ub->ub_limit_rl.burst = 4; ++ ub->ub_limit_rl.interval = 300*HZ; ++} ++ ++#ifdef CONFIG_SMP ++static struct percpu_data ub0_percpu; ++#endif ++static struct ub_percpu_struct ub0_percpu_data[NR_CPUS]; ++ ++void __init ub_init_early(void) ++{ ++ struct user_beancounter *ub; ++ ++ init_cache_counters(); ++ ub = get_ub0(); ++ memset(ub, 0, sizeof(*ub)); ++ ub->ub_uid = 0; ++ init_beancounter_nolimits(ub); ++ init_beancounter_store(ub); ++ init_beancounter_struct(ub); ++ ub->ub_percpu = static_percpu_ptr(&ub0_percpu, ub0_percpu_data); ++ ++ memset(¤t->task_bc, 0, sizeof(struct task_beancounter)); ++ (void)set_exec_ub(ub); ++ current->task_bc.task_ub = get_beancounter(ub); ++ __charge_beancounter_locked(ub, UB_NUMPROC, 1, UB_FORCE); ++ current->task_bc.fork_sub = get_beancounter(ub); ++ ub_init_task_bc(¤t->task_bc); ++ init_mm.mm_ub = get_beancounter(ub); ++ ++ hlist_add_head(&ub->ub_hash, &ub_hash[ub->ub_uid]); ++ list_add(&ub->ub_list, &ub_list_head); ++} ++ ++void __init ub_init_late(void) ++{ ++ ub_cachep = kmem_cache_create("user_beancounters", ++ sizeof(struct user_beancounter), ++ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); ++ ++ memset(&default_beancounter, 0, sizeof(default_beancounter)); ++#ifdef CONFIG_BC_UNLIMITED ++ init_beancounter_nolimits(&default_beancounter); ++#else ++ init_beancounter_syslimits(&default_beancounter); ++#endif ++ init_beancounter_store(&default_beancounter); ++ init_beancounter_struct(&default_beancounter); ++} +diff --git a/kernel/bc/dcache.c b/kernel/bc/dcache.c +new file mode 100644 +index 0000000..2242d64 +--- /dev/null ++++ b/kernel/bc/dcache.c +@@ -0,0 +1,399 @@ ++/* ++ * kernel/bc/dcache.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * Locking ++ * traverse dcache_lock d_lock ++ * ub_dentry_charge + - + ++ * ub_dentry_uncharge + + - ++ * ub_dentry_charge_nofail + + - ++ * ++ * d_inuse changes are atomic, with special handling of "not in use" <-> ++ * "in use" (-1 <-> 0) transitions. We have two sources of non-atomicity ++ * here: (1) in many operations we need to change d_inuse of both dentry and ++ * its parent, and (2) on state transitions we need to adjust the account. 
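++ * (a charge or uncharge of UB_DCACHESIZE and UB_KMEMSIZE on each
++ * "not in use" <-> "in use" transition).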
++ *
++ * Regarding (1): we do not have (and do not want) a single lock covering all
++ * operations, so in general it's impossible to get a consistent view of
++ * a tree with respect to d_inuse counters (except by swsuspend).  It also
++ * means that if a dentry with d_inuse of 0 gets one new in-use child and
++ * loses one, its d_inuse counter will follow either the 0 -> 1 -> 0 path or
++ * the 0 -> -1 -> 0 one, and we can't say which way.
++ * Note that the path -1 -> 0 -> -1 can't turn into -1 -> -2 -> -1, since
++ * uncharge can be done only after return from charge (with d_genocide being
++ * the only apparent exception).
++ * Regarding (2): there is a similar uncertainty with the dcache account.
++ * If the account is at the limit while one more dentry starts being used
++ * and another one is put, the account will either hit the limit (and an
++ * error will be returned), or the decrement will happen before the
++ * increment.
++ *
++ * These races do not really matter.
++ * The only things we want are:
++ *  - if a system is suspended with no in-use dentries, all d_inuse counters
++ *    should be correct (-1);
++ *  - d_inuse counters should always be >= -1.
++ * This holds if ->parent references are accessed and maintained properly.
++ * In subtle moments (like d_move) dentries exchanging their parents should
++ * both be in-use.  At d_genocide time, lookups and charges are assumed to
++ * be impossible.
++ */
++
++/*
++ * Hierarchical accounting
++ * UB argument must NOT be NULL
++ */
++
++static int do_charge_dcache(struct user_beancounter *ub, unsigned long size,
++		enum ub_severity sv)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	if (__charge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size), sv))
++		goto out_mem;
++	if (__charge_beancounter_locked(ub, UB_DCACHESIZE, size, sv))
++		goto out_dcache;
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++	return 0;
++
++out_dcache:
++	__uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size));
++out_mem:
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++	return -ENOMEM;
++}
++
++static void do_uncharge_dcache(struct user_beancounter *ub,
++		unsigned long size)
++{
++	unsigned long flags;
++
++	spin_lock_irqsave(&ub->ub_lock, flags);
++	__uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size));
++	__uncharge_beancounter_locked(ub, UB_DCACHESIZE, size);
++	spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++static int charge_dcache(struct user_beancounter *ub, unsigned long size,
++		enum ub_severity sv)
++{
++	struct user_beancounter *p, *q;
++
++	for (p = ub; p != NULL; p = p->parent) {
++		if (do_charge_dcache(p, size, sv))
++			goto unroll;
++	}
++	return 0;
++
++unroll:
++	for (q = ub; q != p; q = q->parent)
++		do_uncharge_dcache(q, size);
++	return -ENOMEM;
++}
++
++void uncharge_dcache(struct user_beancounter *ub, unsigned long size)
++{
++	for (; ub != NULL; ub = ub->parent)
++		do_uncharge_dcache(ub, size);
++}
++
++/*
++ * Simple helpers to maintain the account and the d_ub field.
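++ * d_charge() charges against the current execution beancounter with
++ * UB_SOFT severity and may fail; d_forced_charge() uses UB_FORCE and
++ * hence cannot.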
++ */
++
++static inline int d_charge(struct dentry_beancounter *d_bc)
++{
++	struct user_beancounter *ub;
++
++	ub = get_beancounter(get_exec_ub());
++	if (charge_dcache(ub, d_bc->d_ubsize, UB_SOFT)) {
++		put_beancounter(ub);
++		return -1;
++	}
++	d_bc->d_ub = ub;
++	return 0;
++}
++
++static inline void d_forced_charge(struct dentry_beancounter *d_bc)
++{
++	struct user_beancounter *ub;
++
++	ub = get_beancounter(get_exec_ub());
++	charge_dcache(ub, d_bc->d_ubsize, UB_FORCE);
++	d_bc->d_ub = ub;
++}
++
++/*
++ * Minor helpers
++ */
++
++extern struct kmem_cache *dentry_cache;
++extern struct kmem_cache *inode_cachep;
++static struct rw_semaphore ub_dentry_alloc_sem;
++
++static inline unsigned long d_charge_size(struct dentry *dentry)
++{
++	/* dentry's d_name is already set to the appropriate value (see d_alloc) */
++	return kmem_cache_objuse(inode_cachep) + kmem_cache_objuse(dentry_cache) +
++		(dname_external(dentry) ?
++			kmem_dname_objuse((void *)dentry->d_name.name) : 0);
++}
++
++/*
++ * Entry points from dcache.c
++ */
++
++/*
++ * Set the initial d_inuse on d_alloc.
++ * Called with no locks, preemption disabled.
++ */
++int __ub_dentry_alloc(struct dentry *dentry)
++{
++	struct dentry_beancounter *d_bc;
++
++	d_bc = &dentry->dentry_bc;
++	d_bc->d_ub = get_beancounter(get_exec_ub());
++	atomic_set(&d_bc->d_inuse, INUSE_INIT); /* see comment in dcache.h */
++	d_bc->d_ubsize = d_charge_size(dentry);
++
++	if (charge_dcache(d_bc->d_ub, d_bc->d_ubsize, UB_HARD))
++		goto failure;
++	return 0;
++
++failure:
++	put_beancounter(d_bc->d_ub);
++	d_bc->d_ub = NULL;
++	return -ENOMEM;
++}
++
++void __ub_dentry_alloc_start(void)
++{
++	down_read(&ub_dentry_alloc_sem);
++	current->task_bc.dentry_alloc = 1;
++}
++
++void __ub_dentry_alloc_end(void)
++{
++	current->task_bc.dentry_alloc = 0;
++	up_read(&ub_dentry_alloc_sem);
++}
++
++/*
++ * It is assumed that the parent is already in use, so the upward traversal
++ * is limited to one ancestor only.
++ * Called under d_lock and rcu_read_lock.
++ */
++int __ub_dentry_charge(struct dentry *dentry)
++{
++	struct dentry_beancounter *d_bc;
++	struct dentry *parent;
++	int ret;
++
++	if (ub_dget_testone(dentry)) {
++		d_bc = &dentry->dentry_bc;
++		/* state transition -1 => 0 */
++		if (d_charge(d_bc))
++			goto failure;
++
++		if (dentry != dentry->d_parent) {
++			parent = dentry->d_parent;
++			if (ub_dget_testone(parent))
++				BUG();
++		}
++	}
++	return 0;
++
++failure:
++	/*
++	 * Here we would like to fail the lookup.
++	 * It is not easy: if d_lookup fails, callers expect that a dentry
++	 * with the given name doesn't exist, and create a new one.
++	 * So, first we forcibly charge for this dentry.
++	 * Then we try to remove it from the cache safely.  If that turns
++	 * out to be possible, we can return an error.
++	 */
++	d_forced_charge(d_bc);
++
++	if (dentry != dentry->d_parent) {
++		parent = dentry->d_parent;
++		if (ub_dget_testone(parent))
++			BUG();
++	}
++
++	ret = 0;
++	if (spin_trylock(&dcache_lock)) {
++		if (!list_empty(&dentry->d_subdirs)) {
++			spin_unlock(&dentry->d_lock);
++			spin_unlock(&dcache_lock);
++			rcu_read_unlock();
++			shrink_dcache_parent(dentry);
++			rcu_read_lock();
++			spin_lock(&dcache_lock);
++			spin_lock(&dentry->d_lock);
++		}
++		if (atomic_read(&dentry->d_count) == 1) {
++			__d_drop(dentry);
++			ret = -1;
++		}
++		spin_unlock(&dcache_lock);
++	}
++
++	return ret;
++}
++
++/*
++ * Go up in the tree decreasing d_inuse.
++ * Called under dcache_lock.
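++ * Each 0 => -1 transition uncharges the dentry's d_ubsize from the
++ * beancounter it was charged to and drops the reference taken at
++ * charge time.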
++ */
++void __ub_dentry_uncharge(struct dentry *dentry)
++{
++	struct dentry *parent;
++	struct user_beancounter *ub;
++	unsigned long size;
++
++	/* go up until the state doesn't change or the root is reached */
++	size = dentry->dentry_bc.d_ubsize;
++	ub = dentry->dentry_bc.d_ub;
++	while (ub_dput_testzero(dentry)) {
++		/* state transition 0 => -1 */
++		uncharge_dcache(ub, size);
++		put_beancounter(ub);
++
++		parent = dentry->d_parent;
++		if (dentry == parent)
++			break;
++
++		dentry = parent;
++		size = dentry->dentry_bc.d_ubsize;
++		ub = dentry->dentry_bc.d_ub;
++	}
++}
++
++/*
++ * Forced charge for __dget_locked, where the API doesn't allow returning
++ * an error.
++ * Called under dcache_lock.
++ */
++void __ub_dentry_charge_nofail(struct dentry *dentry)
++{
++	struct dentry *parent;
++
++	while (ub_dget_testone(dentry)) {
++		/* state transition -1 => 0 */
++		d_forced_charge(&dentry->dentry_bc);
++
++		parent = dentry->d_parent;
++		if (dentry == parent)
++			break;
++		dentry = parent;
++	}
++}
++
++/*
++ * Adaptive accounting
++ */
++
++int ub_dentry_on = 1;
++int ub_dentry_alloc_barrier;
++EXPORT_SYMBOL(ub_dentry_on);
++
++static unsigned long checklowat = 0;
++static unsigned long checkhiwat = ULONG_MAX;
++
++static int sysctl_ub_dentry_chk = 10;
++#define sysctl_ub_lowat	sysctl_ub_watermark[0]
++#define sysctl_ub_hiwat	sysctl_ub_watermark[1]
++static DECLARE_RWSEM(ub_dentry_alloc_sem);
++/* 1024th of lowmem size */
++static unsigned int sysctl_ub_watermark[2] = {0, 100};
++
++static void ub_dentry_set_limits(unsigned long pages, unsigned long cap)
++{
++	down_write(&ub_dentry_alloc_sem);
++	preempt_disable();
++	checklowat = (pages >> 10) * sysctl_ub_lowat;
++	checkhiwat = (pages >> 10) * sysctl_ub_hiwat;
++	if (checkhiwat > cap) {
++		checkhiwat = cap;
++		checklowat = cap / sysctl_ub_hiwat * sysctl_ub_lowat;
++	}
++	preempt_enable();
++	up_write(&ub_dentry_alloc_sem);
++}
++
++static int ub_dentry_proc_handler(ctl_table *ctl, int write, struct file *filp,
++		void __user *buffer, size_t *lenp, loff_t *ppos)
++{
++	int r;
++
++	r = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
++	if (!r && write)
++		ub_dentry_set_limits(totalram_pages - totalhigh_pages,
++				ULONG_MAX);
++	return r;
++}
++
++static ctl_table ub_dentry_sysctl_table[] = {
++	{
++		.procname	= "dentry_check",
++		.data		= &sysctl_ub_dentry_chk,
++		.maxlen		= sizeof(sysctl_ub_dentry_chk),
++		.mode		= 0644,
++		.proc_handler	= proc_dointvec,
++	},
++	{
++		.procname	= "dentry_watermark",
++		.data		= &sysctl_ub_lowat,
++		.maxlen		= sizeof(sysctl_ub_lowat) * 2,
++		.mode		= 0644,
++		.proc_handler	= ub_dentry_proc_handler,
++	},
++	{ .ctl_name = 0 }
++};
++static ctl_table ub_dentry_sysctl_root[] = {
++	{
++		.procname	= "ubc",
++		.mode		= 0555,
++		.child		= ub_dentry_sysctl_table,
++	},
++	{ .ctl_name = 0 }
++};
++
++static int __init ub_dentry_init(void)
++{
++	/*
++	 * Initial watermarks are limited, to limit walk time.
++	 * 384MB translates into 0.8 sec on PIII 866MHz.
++	 */
++	ub_dentry_set_limits(totalram_pages - totalhigh_pages,
++			384 * 1024 * 1024 / PAGE_SIZE);
++	if (register_sysctl_table(ub_dentry_sysctl_root) == NULL)
++		return -ENOMEM;
++	return 0;
++}
++__initcall(ub_dentry_init);
+diff --git a/kernel/bc/io_acct.c b/kernel/bc/io_acct.c
+new file mode 100644
+index 0000000..e8d6c38
+--- /dev/null
++++ b/kernel/bc/io_acct.c
+@@ -0,0 +1,500 @@
++/*
++ * kernel/bc/io_acct.c
++ *
++ * Copyright (C) 2006 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * Pavel Emelianov
++ *
++ */
++
++#include
++#include
++#include
++#include
++#include
++
++#include
++#include
++#include
++#include
++#include
++
++static struct mempool_s *pb_pool;
++
++#define PB_MIN_IO	(1024)
++
++static inline struct page_beancounter *io_pb_alloc(void)
++{
++	return mempool_alloc(pb_pool, GFP_ATOMIC);
++}
++
++static inline void io_pb_free(struct page_beancounter *pb)
++{
++	mempool_free(pb, pb_pool);
++}
++
++struct page_beancounter **page_pblist(struct page *page)
++{
++	struct page_beancounter **pb, *iopb;
++
++	pb = &page_pbc(page);
++	iopb = iopb_to_pb(*pb);
++
++	return iopb == NULL ? pb : &iopb->page_pb_list;
++}
++
++/*
++ * We save the context the page was dirtied in, to use it later
++ * when the real write starts.  If the page is mapped then the
++ * IO pb is stored like this:
++ *
++ * Before saving:
++ *
++ *	+- page -------+
++ *	| ...          |
++ *	| page_pb     +---+
++ *	+--------------+  |  +-----+   +-----+          +-----+
++ *	                  +->| pb1 | ->| pb2 | - ... -> | pbN | -+
++ *	                     +-----+   +-----+          +-----+  |
++ *	                        ^                                |
++ *	                        +--------------------------------+
++ *
++ * After saving:
++ *
++ *	+- page -------+     +- io pb ------+
++ *	| ...          |     | ...          |
++ *	| page_pb     +----->| page_pb_list +-+
++ *	+--------------+     +--------------+ |
++ *	                                      |
++ *	          +---------------------------+
++ *	          |
++ *	          |  +-----+   +-----+          +-----+
++ *	          +->| pb1 | ->| pb2 | - ... -> | pbN | -+
++ *	             +-----+   +-----+          +-----+  |
++ *	                ^                                 |
++ *	                +---------------------------------+
++ *
++ * And the page_pblist(...) function returns a pointer to the place that
++ * points to this pbX ring.
++ */
++
++#ifdef CONFIG_BC_DEBUG_IO
++static LIST_HEAD(pb_io_list);
++static unsigned long anon_pages, not_released;
++
++static inline void io_debug_save(struct page_beancounter *pb,
++		struct page_beancounter *mpb)
++{
++	pb->io_debug = (mpb == NULL);
++	list_add(&pb->io_list, &pb_io_list);
++}
++
++static inline void io_debug_release(struct page_beancounter *pb)
++{
++	list_del(&pb->io_list);
++}
++
++void ub_io_release_debug(struct page *page)
++{
++	struct page_beancounter *pb;
++	static int once = 0;
++
++	pb = page_pbc(page);
++	if (likely(iopb_to_pb(pb) == NULL))
++		return;
++
++	if (!once) {
++		printk("BUG: Page has an IO bc but is not expected to\n");
++		dump_stack();
++		once = 1;
++	}
++
++	spin_lock(&pb_lock);
++	not_released++;
++	pb = iopb_to_pb(pb);
++	page_pbc(page) = NULL;
++	io_debug_release(pb);
++	pb->ub->io_pb_held--;
++	spin_unlock(&pb_lock);
++
++	put_beancounter(pb->ub);
++	io_pb_free(pb);
++}
++
++static inline int io_debug_precheck_save(struct page *page)
++{
++	if (unlikely(PageAnon(page))) {
++		anon_pages++;
++		return 1;
++	}
++
++	return 0;
++}
++
++static inline int io_debug_precheck_release(struct page *page)
++{
++	return 0;
++}
++#else
++#define io_debug_save(pb, mpb)		do { } while (0)
++#define io_debug_release(pb)		do { } while (0)
++#define io_debug_precheck_save(page)	(0)
++#define io_debug_precheck_release(p)	(0)
++#endif
++
++static inline void set_page_io(struct page *page, struct page_beancounter *pb,
++		struct page_beancounter *mapped_pb)
++{
++	unsigned long val;
++
++	val = (unsigned long)pb | PAGE_IO_MARK;
++	pb->page = page;
++
++	page_pbc(page) = (struct page_beancounter *)val;
++	io_debug_save(pb, mapped_pb);
++	pb->ub->io_pb_held++;
++}
++
++static inline void put_page_io(struct page *page, struct page_beancounter *pb)
++{
++	pb->ub->io_pb_held--;
++	io_debug_release(pb);
++	page_pbc(page) = pb->page_pb_list;
++}
++
++void ub_io_save_context(struct page *page, size_t bytes_dirtied)
++{
++	struct user_beancounter *ub;
++	struct page_beancounter *pb, *mapped_pb, *io_pb;
++
++	if (unlikely(in_interrupt())) {
++		WARN_ON_ONCE(1);
++		return;
++	}
++
++	/*
++	 * FIXME - this can happen from atomic context and
++	 * it's probably not that good to lose some requests
++	 */
++
++	pb = io_pb_alloc();
++	io_pb = NULL;
++
++	spin_lock(&pb_lock);
++	if (io_debug_precheck_save(page))
++		goto out_unlock;
++
++	mapped_pb = page_pbc(page);
++	io_pb = iopb_to_pb(mapped_pb);
++	if (io_pb != NULL) {
++		/*
++		 * this page has an IO - release it and force a new one
++		 * We could also race with page cleaning - see below
++		 */
++		mapped_pb = io_pb->page_pb_list;
++		put_page_io(page, io_pb);
++	}
++
++	/*
++	 * If the page is mapped we must save the context
++	 * it maps to.  If the page isn't mapped we use the current
++	 * context as this is a regular write.
++	 */
++
++	if (mapped_pb != NULL)
++		ub = top_beancounter(mapped_pb->ub);
++	else
++		ub = get_io_ub();
++
++	if (!PageDirty(page)) {
++		/*
++		 * race with clear_page_dirty(_for_io) - account
++		 * writes for ub_io_release_context()
++		 */
++		if (io_pb != NULL)
++			io_pb->ub->bytes_wrote += PAGE_CACHE_SIZE;
++		if (pb != NULL)
++			io_pb_free(pb);
++		goto out_unlock;
++	}
++
++	if (pb == NULL) {
++		ub->bytes_dirty_missed += bytes_dirtied;
++		goto out_unlock;
++	}
++
++	/*
++	 * the page may become clean here, but the context will be seen
++	 * in ub_io_release_context()
++	 */
++
++	pb->ub = get_beancounter(ub);
++	pb->page_pb_list = mapped_pb;
++	ub->bytes_dirtied += bytes_dirtied;
++
++	set_page_io(page, pb, mapped_pb);
++
++out_unlock:
++	spin_unlock(&pb_lock);
++
++	if (io_pb != NULL) {
++		put_beancounter(io_pb->ub);
++		io_pb_free(io_pb);
++	}
++}
++
++void ub_io_release_context(struct page *page, size_t wrote)
++{
++	struct page_beancounter *pb;
++
++	if (io_debug_precheck_release(page))
++		return;
++
++	if (unlikely(in_interrupt())) {
++		WARN_ON_ONCE(1);
++		return;
++	}
++
++	spin_lock(&pb_lock);
++	pb = iopb_to_pb(page_pbc(page));
++	if (unlikely(pb == NULL))
++		/*
++		 * this may happen if we failed to allocate
++		 * context in ub_io_save_context or raced with it
++		 */
++		goto out_unlock;
++
++	if (wrote)
++		pb->ub->bytes_wrote += wrote;
++
++	put_page_io(page, pb);
++out_unlock:
++	spin_unlock(&pb_lock);
++
++	if (pb != NULL) {
++		put_beancounter(pb->ub);
++		io_pb_free(pb);
++	}
++}
++
++void __init ub_init_io(struct kmem_cache *pb_cachep)
++{
++	pb_pool = mempool_create_slab_pool(PB_MIN_IO, pb_cachep);
++	if (pb_pool == NULL)
++		panic("Can't create pb_pool");
++}
++
++#ifdef CONFIG_PROC_FS
++#define in_flight(var)	(var > var##_done ? var - var##_done : 0)
++
++static int bc_ioacct_show(struct seq_file *f, void *v)
++{
++	int i;
++	unsigned long long read, write, cancel;
++	unsigned long sync, sync_done;
++	unsigned long fsync, fsync_done;
++	unsigned long fdsync, fdsync_done;
++	unsigned long frsync, frsync_done;
++	unsigned long reads, writes;
++	unsigned long long rchar, wchar;
++	struct user_beancounter *ub;
++
++	ub = seq_beancounter(f);
++
++	read = write = cancel = 0;
++	sync = sync_done = fsync = fsync_done =
++		fdsync = fdsync_done = frsync = frsync_done = 0;
++	reads = writes = 0;
++	rchar = wchar = 0;
++	for_each_online_cpu(i) {
++		struct ub_percpu_struct *ub_percpu;
++		ub_percpu = per_cpu_ptr(ub->ub_percpu, i);
++
++		read += ub_percpu->bytes_read;
++		write += ub_percpu->bytes_wrote;
++		cancel += ub_percpu->bytes_cancelled;
++
++		sync += ub_percpu->sync;
++		fsync += ub_percpu->fsync;
++		fdsync += ub_percpu->fdsync;
++		frsync += ub_percpu->frsync;
++		sync_done += ub_percpu->sync_done;
++		fsync_done += ub_percpu->fsync_done;
++		fdsync_done += ub_percpu->fdsync_done;
++		frsync_done += ub_percpu->frsync_done;
++
++		reads += ub_percpu->read;
++		writes += ub_percpu->write;
++		rchar += ub_percpu->rchar;
++		wchar += ub_percpu->wchar;
++	}
++
++	seq_printf(f, bc_proc_llu_fmt, "read", read);
++	seq_printf(f, bc_proc_llu_fmt, "write", ub->bytes_wrote + write);
++	seq_printf(f, bc_proc_llu_fmt, "dirty", ub->bytes_dirtied);
++	seq_printf(f, bc_proc_llu_fmt, "cancel", cancel);
++	seq_printf(f, bc_proc_llu_fmt, "missed", ub->bytes_dirty_missed);
++
++	seq_printf(f, bc_proc_lu_lfmt, "syncs_total", sync);
++	seq_printf(f, bc_proc_lu_lfmt, "fsyncs_total", fsync);
++	seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_total", fdsync);
++	seq_printf(f, bc_proc_lu_lfmt, "range_syncs_total", frsync);
++
++	seq_printf(f, bc_proc_lu_lfmt, "syncs_active", in_flight(sync));
++	seq_printf(f, bc_proc_lu_lfmt, "fsyncs_active", in_flight(fsync));
++	seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_active", in_flight(fdsync));
++	seq_printf(f, bc_proc_lu_lfmt, "range_syncs_active", in_flight(frsync));
++
++	seq_printf(f, bc_proc_lu_lfmt, "vfs_reads", reads);
++	seq_printf(f, bc_proc_llu_fmt, "vfs_read_chars", rchar);
++	seq_printf(f, bc_proc_lu_lfmt, "vfs_writes", writes);
++	seq_printf(f, bc_proc_llu_fmt, "vfs_write_chars", wchar);
++
++	seq_printf(f, bc_proc_lu_lfmt, "io_pbs", ub->io_pb_held);
++	return 0;
++}
++
++static struct bc_proc_entry bc_ioacct_entry = {
++	.name = "ioacct",
++	.u.show = bc_ioacct_show,
++};
++
++#ifdef CONFIG_BC_DEBUG_IO
++#define PTR_SIZE	(int)(sizeof(void *) * 2)
++#define INT_SIZE	(int)(sizeof(int) * 2)
++
++static int bc_io_show(struct seq_file *f, void *v)
++{
++	struct list_head *lh;
++	struct page_beancounter *pb;
++	struct page *pg;
++
++	lh = (struct list_head *)v;
++	if (lh == &pb_io_list) {
++		seq_printf(f, "Races: anon %lu missed %lu\n",
++				anon_pages, not_released);
++
++		seq_printf(f, "%-*s %-1s %-*s %-4s %*s %*s "
++				"%-*s %-*s %-1s %-*s %-*s\n",
++				PTR_SIZE, "pb", "",
++				PTR_SIZE, "page", "flg",
++				INT_SIZE, "cnt", INT_SIZE, "mcnt",
++				PTR_SIZE, "pb_list",
++				PTR_SIZE, "page_pb", "",
++				PTR_SIZE, "mapping",
++				INT_SIZE, "ub");
++		return 0;
++	}
++
++	pb = list_entry(lh, struct page_beancounter, io_list);
++	pg = pb->page;
++	seq_printf(f, "%p %c %p %c%c%c%c %*d %*d %p %p %c %p %d\n",
++			pb, pb->io_debug ? 'e' : 'm', pg,
++			PageDirty(pg) ? 'D' : 'd',
++			PageAnon(pg) ? 'A' : 'a',
++			PageWriteback(pg) ? 'W' : 'w',
++			PageLocked(pg) ?
'L' : 'l', ++ INT_SIZE, page_count(pg), ++ INT_SIZE, page_mapcount(pg), ++ pb->page_pb_list, page_pbc(pg), ++ iopb_to_pb(page_pbc(pg)) == pb ? ' ' : '!', ++ pg->mapping, pb->ub->ub_uid); ++ return 0; ++} ++ ++static void *bc_io_start(struct seq_file *f, loff_t *ppos) ++{ ++ spin_lock(&pb_lock); ++ return seq_list_start_head(&pb_io_list, *ppos); ++} ++ ++static void *bc_io_next(struct seq_file *f, void *v, loff_t *ppos) ++{ ++ return seq_list_next(v, &pb_io_list, ppos); ++} ++ ++static void bc_io_stop(struct seq_file *f, void *v) ++{ ++ spin_unlock(&pb_lock); ++} ++ ++static struct seq_operations bc_io_seq_ops = { ++ .start = bc_io_start, ++ .next = bc_io_next, ++ .stop = bc_io_stop, ++ .show = bc_io_show, ++}; ++ ++static int bc_io_open(struct inode *inode, struct file *filp) ++{ ++ if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) ++ return -EACCES; ++ ++ return seq_open(filp, &bc_io_seq_ops); ++} ++static struct file_operations bc_io_debug_ops = { ++ .open = bc_io_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static struct bc_proc_entry bc_ioacct_debug_entry = { ++ .name = "ioacct_debug", ++ .u.fops = &bc_io_debug_ops, ++}; ++#endif ++ ++static int bc_ioacct_notify(struct vnotifier_block *self, ++ unsigned long event, void *arg, int old_ret) ++{ ++ struct user_beancounter *ub; ++ unsigned long *vm_events; ++ unsigned long long bin, bout; ++ int i; ++ ++ if (event != VIRTINFO_VMSTAT) ++ return old_ret; ++ ++ ub = top_beancounter(get_exec_ub()); ++ if (ub == get_ub0()) ++ return old_ret; ++ ++ /* Think over: do we need to account here bytes_dirty_missed? */ ++ bout = ub->bytes_wrote; ++ bin = 0; ++ for_each_online_cpu(i) { ++ bout += per_cpu_ptr(ub->ub_percpu, i)->bytes_wrote; ++ bin += per_cpu_ptr(ub->ub_percpu, i)->bytes_read; ++ } ++ ++ /* convert to Kbytes */ ++ bout >>= 10; ++ bin >>= 10; ++ ++ vm_events = ((unsigned long *)arg) + NR_VM_ZONE_STAT_ITEMS; ++ vm_events[PGPGOUT] = (unsigned long)bout; ++ vm_events[PGPGIN] = (unsigned long)bin; ++ return NOTIFY_OK; ++} ++ ++static struct vnotifier_block bc_ioacct_nb = { ++ .notifier_call = bc_ioacct_notify, ++}; ++ ++static int __init bc_ioacct_init(void) ++{ ++#ifdef CONFIG_BC_DEBUG_IO ++ bc_register_proc_root_entry(&bc_ioacct_debug_entry); ++#endif ++ bc_register_proc_entry(&bc_ioacct_entry); ++ ++ virtinfo_notifier_register(VITYPE_GENERAL, &bc_ioacct_nb); ++ return 0; ++} ++ ++late_initcall(bc_ioacct_init); ++#endif +diff --git a/kernel/bc/io_prio.c b/kernel/bc/io_prio.c +new file mode 100644 +index 0000000..20aa133 +--- /dev/null ++++ b/kernel/bc/io_prio.c +@@ -0,0 +1,288 @@ ++/* ++ * kernel/bc/io_prio.c ++ * ++ * Copyright (C) 2007 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ *
++ * Vasily Tarasov
++ *
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++struct cfq_bc_data *__find_cfq_bc(struct ub_iopriv *iopriv,
++		struct cfq_data *cfqd)
++{
++	struct cfq_bc_data *cfq_bc;
++
++	list_for_each_entry(cfq_bc, &iopriv->cfq_bc_head, cfq_bc_list)
++		if (cfq_bc->cfqd == cfqd)
++			return cfq_bc;
++
++	return NULL;
++}
++
++struct cfq_bc_data *bc_find_cfq_bc(struct ub_iopriv *iopriv,
++		struct cfq_data *cfqd)
++{
++	struct cfq_bc_data *cfq_bc;
++	unsigned long flags;
++
++	read_lock_irqsave(&iopriv->cfq_bc_list_lock, flags);
++	cfq_bc = __find_cfq_bc(iopriv, cfqd);
++	read_unlock_irqrestore(&iopriv->cfq_bc_list_lock, flags);
++	return cfq_bc;
++}
++
++struct cfq_bc_data *bc_findcreate_cfq_bc(struct ub_iopriv *iopriv,
++		struct cfq_data *cfqd, gfp_t gfp_mask)
++{
++	struct cfq_bc_data *cfq_bc_new;
++	struct cfq_bc_data *cfq_bc;
++	unsigned long flags;
++
++	cfq_bc = bc_find_cfq_bc(iopriv, cfqd);
++	if (cfq_bc)
++		return cfq_bc;
++
++	cfq_bc_new = kzalloc(sizeof(*cfq_bc_new), gfp_mask);
++	if (!cfq_bc_new)
++		return NULL;
++
++	cfq_init_cfq_bc(cfq_bc_new);
++	cfq_bc_new->cfqd = cfqd;
++	cfq_bc_new->ub_iopriv = iopriv;
++
++	write_lock_irqsave(&iopriv->cfq_bc_list_lock, flags);
++	cfq_bc = __find_cfq_bc(iopriv, cfqd);
++	if (cfq_bc)
++		kfree(cfq_bc_new);
++	else {
++		list_add_tail(&cfq_bc_new->cfq_bc_list,
++				&iopriv->cfq_bc_head);
++		cfq_bc = cfq_bc_new;
++	}
++	write_unlock_irqrestore(&iopriv->cfq_bc_list_lock, flags);
++
++	return cfq_bc;
++}
++
++void bc_init_ioprio(struct ub_iopriv *iopriv)
++{
++	INIT_LIST_HEAD(&iopriv->cfq_bc_head);
++	rwlock_init(&iopriv->cfq_bc_list_lock);
++	iopriv->ioprio = UB_IOPRIO_BASE;
++}
++
++static inline void bc_cfq_bc_check_empty(struct cfq_bc_data *cfq_bc)
++{
++	BUG_ON(!RB_EMPTY_ROOT(&cfq_bc->service_tree.rb));
++}
++
++static void bc_release_cfq_bc(struct cfq_bc_data *cfq_bc)
++{
++	struct cfq_data *cfqd;
++	elevator_t *eq;
++	int i;
++
++	cfqd = cfq_bc->cfqd;
++	eq = cfqd->queue->elevator;
++
++	for (i = 0; i < CFQ_PRIO_LISTS; i++) {
++		if (cfq_bc->async_cfqq[0][i]) {
++			eq->ops->put_queue(cfq_bc->async_cfqq[0][i]);
++			cfq_bc->async_cfqq[0][i] = NULL;
++		}
++		if (cfq_bc->async_cfqq[1][i]) {
++			eq->ops->put_queue(cfq_bc->async_cfqq[1][i]);
++			cfq_bc->async_cfqq[1][i] = NULL;
++		}
++	}
++	if (cfq_bc->async_idle_cfqq) {
++		eq->ops->put_queue(cfq_bc->async_idle_cfqq);
++		cfq_bc->async_idle_cfqq = NULL;
++	}
++	/*
++	 * Note: this cfq_bc is already not in the active list,
++	 * but can still be pointed to from cfqd as active.
++	 */
++	cfqd->active_cfq_bc = NULL;
++
++	bc_cfq_bc_check_empty(cfq_bc);
++	list_del(&cfq_bc->cfq_bc_list);
++	kfree(cfq_bc);
++}
++
++void bc_fini_ioprio(struct ub_iopriv *iopriv)
++{
++	struct cfq_bc_data *cfq_bc;
++	struct cfq_bc_data *cfq_bc_tmp;
++	unsigned long flags;
++	spinlock_t *queue_lock;
++
++	/*
++	 * Don't take cfq_bc_list_lock since the ub is already dead,
++	 * but the async cfqqs are still in the hash list, consequently
++	 * queue_lock should be held.
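++ * bc_release_cfq_bc() puts the async cfqqs through the elevator's
++ * put_queue() op, which is what that lock protects.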
++ */ ++ list_for_each_entry_safe(cfq_bc, cfq_bc_tmp, ++ &iopriv->cfq_bc_head, cfq_bc_list) { ++ queue_lock = cfq_bc->cfqd->queue->queue_lock; ++ spin_lock_irqsave(queue_lock, flags); ++ bc_release_cfq_bc(cfq_bc); ++ spin_unlock_irqrestore(queue_lock, flags); ++ } ++} ++ ++void bc_cfq_exit_queue(struct cfq_data *cfqd) ++{ ++ struct cfq_bc_data *cfq_bc; ++ struct user_beancounter *ub; ++ ++ local_irq_disable(); ++ for_each_beancounter(ub) { ++ write_lock(&ub->iopriv.cfq_bc_list_lock); ++ cfq_bc = __find_cfq_bc(&ub->iopriv, cfqd); ++ if (!cfq_bc) { ++ write_unlock(&ub->iopriv.cfq_bc_list_lock); ++ continue; ++ } ++ bc_release_cfq_bc(cfq_bc); ++ write_unlock(&ub->iopriv.cfq_bc_list_lock); ++ } ++ local_irq_enable(); ++} ++ ++int bc_expired(struct cfq_data *cfqd) ++{ ++ return time_after(jiffies, cfqd->slice_end) ? 1 : 0; ++} ++ ++static inline int bc_empty(struct cfq_bc_data *cfq_bc) ++{ ++ /* ++ * consider BC as empty only if there is no requests ++ * in elevator _and_ in driver ++ */ ++ if (!cfq_bc->rqnum && !cfq_bc->on_dispatch) ++ return 1; ++ ++ return 0; ++} ++ ++static inline unsigned long bc_time_slice_by_ioprio(unsigned int ioprio, ++ unsigned int base_slice) ++{ ++ return base_slice + ++ (base_slice * (ioprio - UB_IOPRIO_MIN)) ++ / (UB_IOPRIO_MAX - UB_IOPRIO_MIN - 1); ++} ++ ++static inline void bc_set_active(struct cfq_data *cfqd) ++{ ++ if (list_empty(&cfqd->act_cfq_bc_head)) { ++ cfqd->active_cfq_bc = NULL; ++ return; ++ } ++ ++ cfqd->active_cfq_bc = list_first_entry(&cfqd->act_cfq_bc_head, ++ struct cfq_bc_data, act_cfq_bc_list); ++ list_move_tail(&cfqd->active_cfq_bc->act_cfq_bc_list, ++ &cfqd->act_cfq_bc_head); ++ cfqd->slice_end = jiffies + ++ bc_time_slice_by_ioprio(cfqd->active_cfq_bc->ub_iopriv->ioprio, ++ cfqd->cfq_ub_slice); ++} ++ ++void bc_schedule_active(struct cfq_data *cfqd) ++{ ++ if (bc_expired(cfqd) || !cfqd->active_cfq_bc || ++ bc_empty(cfqd->active_cfq_bc)) ++ bc_set_active(cfqd); ++} ++ ++void bc_inc_rqnum(struct cfq_queue *cfqq) ++{ ++ struct cfq_bc_data *cfq_bc; ++ ++ cfq_bc = cfqq->cfq_bc; ++ ++ if (!cfq_bc->rqnum) ++ list_add_tail(&cfq_bc->act_cfq_bc_list, ++ &cfqq->cfqd->act_cfq_bc_head); ++ ++ cfq_bc->rqnum++; ++} ++ ++void bc_dec_rqnum(struct cfq_queue *cfqq) ++{ ++ struct cfq_bc_data *cfq_bc; ++ ++ cfq_bc = cfqq->cfq_bc; ++ ++ cfq_bc->rqnum--; ++ ++ if (!cfq_bc->rqnum) ++ list_del(&cfq_bc->act_cfq_bc_list); ++} ++ ++unsigned long bc_set_ioprio(int ubid, int ioprio) ++{ ++ struct user_beancounter *ub; ++ ++ if (ioprio < UB_IOPRIO_MIN || ioprio >= UB_IOPRIO_MAX) ++ return -ERANGE; ++ ++ ub = get_beancounter_byuid(ubid, 0); ++ if (!ub) ++ return -ESRCH; ++ ++ ub->iopriv.ioprio = ioprio; ++ put_beancounter(ub); ++ ++ return 0; ++} ++ ++struct user_beancounter *bc_io_switch_context(struct page *page) ++{ ++ struct page_beancounter *pb; ++ struct user_beancounter *old_ub = NULL; ++ ++ pb = page_iopb(page); ++ pb = iopb_to_pb(pb); ++ if (pb) { ++ get_beancounter(pb->ub); ++ old_ub = set_exec_ub(pb->ub); ++ } ++ ++ return old_ub; ++} ++ ++void bc_io_restore_context(struct user_beancounter *ub) ++{ ++ struct user_beancounter *old_ub; ++ ++ if (ub) { ++ old_ub = set_exec_ub(ub); ++ put_beancounter(old_ub); ++ } ++} ++ ++EXPORT_SYMBOL(bc_io_switch_context); ++EXPORT_SYMBOL(bc_io_restore_context); ++EXPORT_SYMBOL(__find_cfq_bc); ++EXPORT_SYMBOL(bc_fini_ioprio); ++EXPORT_SYMBOL(bc_init_ioprio); ++EXPORT_SYMBOL(bc_findcreate_cfq_bc); ++EXPORT_SYMBOL(bc_cfq_exit_queue); ++EXPORT_SYMBOL(bc_expired); ++EXPORT_SYMBOL(bc_schedule_active); ++EXPORT_SYMBOL(bc_inc_rqnum); 
++EXPORT_SYMBOL(bc_dec_rqnum); +diff --git a/kernel/bc/kmem.c b/kernel/bc/kmem.c +new file mode 100644 +index 0000000..74c4179 +--- /dev/null ++++ b/kernel/bc/kmem.c +@@ -0,0 +1,406 @@ ++/* ++ * kernel/bc/kmem.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * Initialization ++ */ ++ ++/* ++ * Slab accounting ++ */ ++ ++#ifdef CONFIG_BC_DEBUG_KMEM ++ ++#define CC_HASH_SIZE 1024 ++static struct ub_cache_counter *cc_hash[CC_HASH_SIZE]; ++spinlock_t cc_lock; ++ ++static void __free_cache_counters(struct user_beancounter *ub, ++ struct kmem_cache *cachep) ++{ ++ struct ub_cache_counter *cc, **pprev, *del; ++ int i; ++ unsigned long flags; ++ ++ del = NULL; ++ spin_lock_irqsave(&cc_lock, flags); ++ for (i = 0; i < CC_HASH_SIZE; i++) { ++ pprev = &cc_hash[i]; ++ cc = cc_hash[i]; ++ while (cc != NULL) { ++ if (cc->ub != ub && cc->cachep != cachep) { ++ pprev = &cc->next; ++ cc = cc->next; ++ continue; ++ } ++ ++ list_del(&cc->ulist); ++ *pprev = cc->next; ++ cc->next = del; ++ del = cc; ++ cc = *pprev; ++ } ++ } ++ spin_unlock_irqrestore(&cc_lock, flags); ++ ++ while (del != NULL) { ++ cc = del->next; ++ kfree(del); ++ del = cc; ++ } ++} ++ ++void ub_free_counters(struct user_beancounter *ub) ++{ ++ __free_cache_counters(ub, NULL); ++} ++ ++void ub_kmemcache_free(struct kmem_cache *cachep) ++{ ++ __free_cache_counters(NULL, cachep); ++} ++ ++void __init init_cache_counters(void) ++{ ++ memset(cc_hash, 0, CC_HASH_SIZE * sizeof(cc_hash[0])); ++ spin_lock_init(&cc_lock); ++} ++ ++#define cc_hash_fun(ub, cachep) ( \ ++ (((unsigned long)(ub) >> L1_CACHE_SHIFT) ^ \ ++ ((unsigned long)(ub) >> (BITS_PER_LONG / 2)) ^ \ ++ ((unsigned long)(cachep) >> L1_CACHE_SHIFT) ^ \ ++ ((unsigned long)(cachep) >> (BITS_PER_LONG / 2)) \ ++ ) & (CC_HASH_SIZE - 1)) ++ ++static int change_slab_charged(struct user_beancounter *ub, ++ struct kmem_cache *cachep, long val) ++{ ++ struct ub_cache_counter *cc, *new_cnt, **pprev; ++ unsigned long flags; ++ ++ new_cnt = NULL; ++again: ++ spin_lock_irqsave(&cc_lock, flags); ++ cc = cc_hash[cc_hash_fun(ub, cachep)]; ++ while (cc) { ++ if (cc->ub == ub && cc->cachep == cachep) ++ goto found; ++ cc = cc->next; ++ } ++ ++ if (new_cnt != NULL) ++ goto insert; ++ ++ spin_unlock_irqrestore(&cc_lock, flags); ++ ++ new_cnt = kmalloc(sizeof(*new_cnt), GFP_ATOMIC); ++ if (new_cnt == NULL) ++ return -ENOMEM; ++ ++ new_cnt->counter = 0; ++ new_cnt->ub = ub; ++ new_cnt->cachep = cachep; ++ goto again; ++ ++insert: ++ pprev = &cc_hash[cc_hash_fun(ub, cachep)]; ++ new_cnt->next = *pprev; ++ *pprev = new_cnt; ++ list_add(&new_cnt->ulist, &ub->ub_cclist); ++ cc = new_cnt; ++ new_cnt = NULL; ++ ++found: ++ cc->counter += val; ++ spin_unlock_irqrestore(&cc_lock, flags); ++ if (new_cnt) ++ kfree(new_cnt); ++ return 0; ++} ++ ++static inline int inc_slab_charged(struct user_beancounter *ub, ++ struct kmem_cache *cachep) ++{ ++ return change_slab_charged(ub, cachep, 1); ++} ++ ++static inline void dec_slab_charged(struct user_beancounter *ub, ++ struct kmem_cache *cachep) ++{ ++ if (change_slab_charged(ub, cachep, -1) < 0) ++ BUG(); ++} ++ ++#include ++ ++#define inc_pages_charged(ub, order) ub_percpu_add(ub, \ ++ pages_charged, 1 << order) ++#define dec_pages_charged(ub, order) ub_percpu_sub(ub, \ ++ pages_charged, 1 << order) ++ ++#ifdef 
CONFIG_PROC_FS ++static int bc_kmem_debug_show(struct seq_file *f, void *v) ++{ ++ struct user_beancounter *ub; ++ struct ub_cache_counter *cc; ++ long pages, vmpages, pbc; ++ int i; ++ ++ ub = seq_beancounter(f); ++ ++ pages = vmpages = pbc = 0; ++ for_each_online_cpu(i) { ++ pages += per_cpu_ptr(ub->ub_percpu, i)->pages_charged; ++ vmpages += per_cpu_ptr(ub->ub_percpu, i)->vmalloc_charged; ++ pbc += per_cpu_ptr(ub->ub_percpu, i)->pbcs; ++ } ++ if (pages < 0) ++ pages = 0; ++ if (vmpages < 0) ++ vmpages = 0; ++ ++ seq_printf(f, bc_proc_lu_lu_fmt, "pages", pages, PAGE_SIZE); ++ seq_printf(f, bc_proc_lu_lu_fmt, "vmalloced", vmpages, PAGE_SIZE); ++ seq_printf(f, bc_proc_lu_lu_fmt, "pbcs", pbc, ++ sizeof(struct page_beancounter)); ++ ++ spin_lock_irq(&cc_lock); ++ list_for_each_entry (cc, &ub->ub_cclist, ulist) { ++ struct kmem_cache *cachep; ++ ++ cachep = cc->cachep; ++ seq_printf(f, bc_proc_lu_lu_fmt, ++ kmem_cache_name(cachep), ++ cc->counter, ++ kmem_cache_objuse(cachep)); ++ } ++ spin_unlock_irq(&cc_lock); ++ return 0; ++} ++ ++static struct bc_proc_entry bc_kmem_debug_entry = { ++ .name = "kmem_debug", ++ .u.show = bc_kmem_debug_show, ++}; ++ ++static int __init bc_kmem_debug_init(void) ++{ ++ bc_register_proc_entry(&bc_kmem_debug_entry); ++ return 0; ++} ++ ++late_initcall(bc_kmem_debug_init); ++#endif ++ ++#else ++#define inc_slab_charged(ub, cache) (0) ++#define dec_slab_charged(ub, cache) do { } while (0) ++#define inc_pages_charged(ub, cache) do { } while (0) ++#define dec_pages_charged(ub, cache) do { } while (0) ++#endif ++ ++#define UB_KMEM_QUANT (PAGE_SIZE * 4) ++ ++/* called with IRQ disabled */ ++int ub_kmemsize_charge(struct user_beancounter *ub, ++ unsigned long size, ++ enum ub_severity strict) ++{ ++ struct task_beancounter *tbc; ++ ++ tbc = ¤t->task_bc; ++ if (ub != tbc->task_ub || size > UB_KMEM_QUANT) ++ goto just_charge; ++ if (tbc->kmem_precharged >= size) { ++ tbc->kmem_precharged -= size; ++ return 0; ++ } ++ ++ if (charge_beancounter(ub, UB_KMEMSIZE, UB_KMEM_QUANT, UB_HARD) == 0) { ++ tbc->kmem_precharged += UB_KMEM_QUANT - size; ++ return 0; ++ } ++ ++just_charge: ++ return charge_beancounter(ub, UB_KMEMSIZE, size, strict); ++} ++ ++/* called with IRQ disabled */ ++void ub_kmemsize_uncharge(struct user_beancounter *ub, ++ unsigned long size) ++{ ++ struct task_beancounter *tbc; ++ ++ if (size > UB_MAXVALUE) { ++ printk("ub_kmemsize_uncharge: size %lu\n", size); ++ dump_stack(); ++ } ++ ++ tbc = ¤t->task_bc; ++ if (ub != tbc->task_ub) ++ goto just_uncharge; ++ ++ tbc->kmem_precharged += size; ++ if (tbc->kmem_precharged < UB_KMEM_QUANT * 2) ++ return; ++ size = tbc->kmem_precharged - UB_KMEM_QUANT; ++ tbc->kmem_precharged -= size; ++ ++just_uncharge: ++ uncharge_beancounter(ub, UB_KMEMSIZE, size); ++} ++ ++/* called with IRQ disabled */ ++int ub_slab_charge(struct kmem_cache *cachep, void *objp, gfp_t flags) ++{ ++ unsigned int size; ++ struct user_beancounter *ub; ++ ++ ub = get_beancounter(get_exec_ub()); ++ if (ub == NULL) ++ return 0; ++ ++ size = CHARGE_SIZE(kmem_cache_objuse(cachep)); ++ if (ub_kmemsize_charge(ub, size, ++ (flags & __GFP_SOFT_UBC ? 
UB_SOFT : UB_HARD))) ++ goto out_err; ++ ++ if (inc_slab_charged(ub, cachep) < 0) { ++ ub_kmemsize_uncharge(ub, size); ++ goto out_err; ++ } ++ *ub_slab_ptr(cachep, objp) = ub; ++ return 0; ++ ++out_err: ++ put_beancounter(ub); ++ return -ENOMEM; ++} ++ ++/* called with IRQ disabled */ ++void ub_slab_uncharge(struct kmem_cache *cachep, void *objp) ++{ ++ unsigned int size; ++ struct user_beancounter **ub_ref; ++ ++ ub_ref = ub_slab_ptr(cachep, objp); ++ if (*ub_ref == NULL) ++ return; ++ ++ dec_slab_charged(*ub_ref, cachep); ++ size = CHARGE_SIZE(kmem_cache_objuse(cachep)); ++ ub_kmemsize_uncharge(*ub_ref, size); ++ put_beancounter(*ub_ref); ++ *ub_ref = NULL; ++} ++ ++/* ++ * Pages accounting ++ */ ++ ++int ub_page_charge(struct page *page, int order, gfp_t mask) ++{ ++ struct user_beancounter *ub; ++ unsigned long flags; ++ ++ ub = NULL; ++ if (!(mask & __GFP_UBC)) ++ goto out; ++ ++ ub = get_beancounter(get_exec_ub()); ++ if (ub == NULL) ++ goto out; ++ ++ local_irq_save(flags); ++ if (ub_kmemsize_charge(ub, CHARGE_ORDER(order), ++ (mask & __GFP_SOFT_UBC ? UB_SOFT : UB_HARD))) ++ goto err; ++ ++ inc_pages_charged(ub, order); ++ local_irq_restore(flags); ++out: ++ BUG_ON(page_ub(page) != NULL); ++ page_ub(page) = ub; ++ return 0; ++ ++err: ++ local_irq_restore(flags); ++ BUG_ON(page_ub(page) != NULL); ++ put_beancounter(ub); ++ return -ENOMEM; ++} ++ ++void ub_page_uncharge(struct page *page, int order) ++{ ++ struct user_beancounter *ub; ++ unsigned long flags; ++ ++ ub = page_ub(page); ++ if (ub == NULL) ++ return; ++ ++ BUG_ON(ub->ub_magic != UB_MAGIC); ++ dec_pages_charged(ub, order); ++ local_irq_save(flags); ++ ub_kmemsize_uncharge(ub, CHARGE_ORDER(order)); ++ local_irq_restore(flags); ++ put_beancounter(ub); ++ page_ub(page) = NULL; ++} ++ ++/* ++ * takes init_mm.page_table_lock ++ * some outer lock to protect pages from vmalloced area must be held ++ */ ++struct user_beancounter *vmalloc_ub(void *obj) ++{ ++ struct page *pg; ++ ++ pg = vmalloc_to_page(obj); ++ if (pg == NULL) ++ return NULL; ++ ++ return page_ub(pg); ++} ++ ++EXPORT_SYMBOL(vmalloc_ub); ++ ++struct user_beancounter *mem_ub(void *obj) ++{ ++ struct user_beancounter *ub; ++ ++ if ((unsigned long)obj >= VMALLOC_START && ++ (unsigned long)obj < VMALLOC_END) ++ ub = vmalloc_ub(obj); ++ else ++ ub = slab_ub(obj); ++ ++ return ub; ++} ++ ++EXPORT_SYMBOL(mem_ub); +diff --git a/kernel/bc/misc.c b/kernel/bc/misc.c +new file mode 100644 +index 0000000..20c28a7 +--- /dev/null ++++ b/kernel/bc/misc.c +@@ -0,0 +1,455 @@ ++/* ++ * kernel/bc/misc.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ *
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include
++#include
++#include
++
++#define UB_FILE_MINQUANT	3
++#define UB_FILE_MAXQUANT	10
++#define UB_FILE_INIQUANT	4
++
++static unsigned long ub_file_precharge(struct task_beancounter *task_bc,
++		struct user_beancounter *ub, unsigned long *kmemsize);
++
++extern struct kmem_cache *filp_cachep;
++
++static inline unsigned long ub_file_kmemsize(unsigned long nr)
++{
++	return CHARGE_SIZE(kmem_cache_objuse(filp_cachep)) * nr;
++}
++
++/*
++ * Task stuff
++ */
++
++static void init_task_sub(struct task_struct *parent,
++		struct task_struct *tsk,
++		struct task_beancounter *old_bc)
++{
++	struct task_beancounter *new_bc;
++	struct user_beancounter *sub;
++
++	new_bc = &tsk->task_bc;
++	sub = old_bc->fork_sub;
++	new_bc->fork_sub = get_beancounter(sub);
++	new_bc->task_fnode = NULL;
++	new_bc->task_freserv = old_bc->task_freserv;
++	old_bc->task_freserv = NULL;
++	memset(&new_bc->task_data, 0, sizeof(new_bc->task_data));
++	new_bc->pgfault_handle = 0;
++	new_bc->pgfault_allot = 0;
++}
++
++void ub_init_task_bc(struct task_beancounter *tbc)
++{
++	tbc->file_precharged = 0;
++	tbc->file_quant = UB_FILE_INIQUANT;
++	tbc->file_count = 0;
++
++	tbc->kmem_precharged = 0;
++	tbc->dentry_alloc = 0;
++}
++
++int ub_task_charge(struct task_struct *parent, struct task_struct *task)
++{
++	struct task_beancounter *old_bc;
++	struct task_beancounter *new_bc;
++	struct user_beancounter *ub, *pub;
++	unsigned long file_nr, kmemsize;
++	unsigned long flags;
++
++	old_bc = &parent->task_bc;
++	ub = old_bc->fork_sub;
++	new_bc = &task->task_bc;
++	new_bc->task_ub = get_beancounter(ub);
++	new_bc->exec_ub = get_beancounter(ub);
++
++	pub = top_beancounter(ub);
++	spin_lock_irqsave(&pub->ub_lock, flags);
++	if (unlikely(__charge_beancounter_locked(pub, UB_NUMPROC,
++					1, UB_HARD) < 0))
++		goto out_numproc;
++
++	ub_init_task_bc(new_bc);
++	file_nr = ub_file_precharge(new_bc, pub, &kmemsize);
++	spin_unlock_irqrestore(&pub->ub_lock, flags);
++
++	charge_beancounter_notop(ub, UB_NUMPROC, 1);
++	if (likely(file_nr)) {
++		charge_beancounter_notop(ub, UB_NUMFILE, file_nr);
++		charge_beancounter_notop(ub, UB_KMEMSIZE, kmemsize);
++	}
++
++	init_task_sub(parent, task, old_bc);
++	return 0;
++
++out_numproc:
++	spin_unlock_irqrestore(&pub->ub_lock, flags);
++	__put_beancounter_batch(ub, 2);
++	return -ENOMEM;
++}
++
++extern atomic_t dbgpre;
++
++void ub_task_uncharge(struct task_struct *task)
++{
++	struct task_beancounter *task_bc;
++	struct user_beancounter *pub;
++	unsigned long file_nr, file_kmemsize;
++	unsigned long flags;
++
++	task_bc = &task->task_bc;
++	pub = top_beancounter(task_bc->task_ub);
++	spin_lock_irqsave(&pub->ub_lock, flags);
++	__uncharge_beancounter_locked(pub, UB_NUMPROC, 1);
++	file_nr = task_bc->file_precharged;
++	if (likely(file_nr))
++		__uncharge_beancounter_locked(pub,
++				UB_NUMFILE, file_nr);
++
++	/* see comment in ub_file_charge */
++	task_bc->file_precharged = 0;
++	file_kmemsize = ub_file_kmemsize(file_nr);
++	if (likely(file_kmemsize))
++		__uncharge_beancounter_locked(pub,
++				UB_KMEMSIZE, file_kmemsize);
++	spin_unlock_irqrestore(&pub->ub_lock, flags);
++
++	uncharge_beancounter_notop(task_bc->task_ub, UB_NUMPROC, 1);
++	if (likely(file_nr)) {
++		uncharge_beancounter_notop(task_bc->task_ub,
++				UB_NUMFILE, file_nr);
++		__put_beancounter_batch(task_bc->task_ub, file_nr);
++	}
++	if (likely(file_kmemsize))
++		uncharge_beancounter_notop(task_bc->task_ub,
++				UB_KMEMSIZE, file_kmemsize);
++}
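++
++/*
++ * ub_task_put() below runs at final task destruction: it returns the
++ * remaining kmem precharge and drops the beancounter references; the
++ * 0xdead... poison values help catch late accesses to a dead task_bc.
++ */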
++ ++void ub_task_put(struct task_struct *task) ++{ ++ struct task_beancounter *task_bc; ++ struct user_beancounter *pub; ++ unsigned long kmemsize, flags; ++ ++ task_bc = &task->task_bc; ++ ++ pub = top_beancounter(task_bc->task_ub); ++ spin_lock_irqsave(&pub->ub_lock, flags); ++ kmemsize = task_bc->kmem_precharged; ++ task_bc->kmem_precharged = 0; ++ if (likely(kmemsize)) ++ __uncharge_beancounter_locked(pub, UB_KMEMSIZE, kmemsize); ++ spin_unlock_irqrestore(&pub->ub_lock, flags); ++ if (likely(kmemsize)) ++ uncharge_beancounter_notop(task_bc->task_ub, UB_KMEMSIZE, kmemsize); ++ ++ put_beancounter(task_bc->exec_ub); ++ put_beancounter(task_bc->task_ub); ++ put_beancounter(task_bc->fork_sub); ++ /* can't be freed elsewhere, failures possible in the middle of fork */ ++ if (task_bc->task_freserv != NULL) ++ kfree(task_bc->task_freserv); ++ ++ task_bc->exec_ub = (struct user_beancounter *)0xdeadbcbc; ++ task_bc->task_ub = (struct user_beancounter *)0xdead100c; ++ BUG_ON(task_bc->kmem_precharged != 0); ++} ++ ++/* ++ * Files and file locks. ++ */ ++/* ++ * For NUMFILE, we do not take a lock and call charge function ++ * for every file. We try to charge in batches, keeping local reserve on ++ * task. For experimental purposes, batch size is adaptive and depends ++ * on numfile barrier, number of processes, and the history of successes and ++ * failures of batch charges. ++ * ++ * Per-task fields have the following meaning ++ * file_precharged number of files charged to beancounter in advance, ++ * file_quant logarithm of batch size ++ * file_count counter of charge successes, to reduce batch size ++ * fluctuations. ++ */ ++static unsigned long ub_file_precharge(struct task_beancounter *task_bc, ++ struct user_beancounter *ub, unsigned long *kmemsize) ++{ ++ unsigned long n, kmem; ++ ++ n = 1UL << task_bc->file_quant; ++ if (ub->ub_parms[UB_NUMPROC].held > ++ (ub->ub_parms[UB_NUMFILE].barrier >> ++ task_bc->file_quant)) ++ goto nopre; ++ if (unlikely(__charge_beancounter_locked(ub, UB_NUMFILE, n, UB_HARD))) ++ goto nopre; ++ kmem = ub_file_kmemsize(n); ++ if (unlikely(__charge_beancounter_locked(ub, UB_KMEMSIZE, ++ kmem, UB_HARD))) ++ goto nopre_kmem; ++ ++ task_bc->file_precharged += n; ++ get_beancounter_batch(task_bc->task_ub, n); ++ task_bc->file_count++; ++ if (task_bc->file_quant < UB_FILE_MAXQUANT && ++ task_bc->file_count >= task_bc->file_quant) { ++ task_bc->file_quant++; ++ task_bc->file_count = 0; ++ } ++ *kmemsize = kmem; ++ return n; ++ ++nopre_kmem: ++ __uncharge_beancounter_locked(ub, UB_NUMFILE, n); ++nopre: ++ if (task_bc->file_quant > UB_FILE_MINQUANT) ++ task_bc->file_quant--; ++ task_bc->file_count = 0; ++ return 0; ++} ++ ++int ub_file_charge(struct file *f) ++{ ++ struct user_beancounter *ub, *pub; ++ struct task_beancounter *task_bc; ++ unsigned long file_nr, kmem; ++ unsigned long flags; ++ int err; ++ ++ task_bc = ¤t->task_bc; ++ ub = get_exec_ub(); ++ if (unlikely(ub != task_bc->task_ub)) ++ goto just_charge; ++ ++ if (likely(task_bc->file_precharged > 0)) { ++ /* ++ * files are put via RCU in 2.6.16 so during ++ * this decrement an IRQ can happen and called ++ * ub_files_uncharge() will mess file_precharged ++ * ++ * ub_task_uncharge() is called via RCU also so no ++ * protection is needed there ++ * ++ * Xemul ++ */ ++ ++ local_irq_save(flags); ++ task_bc->file_precharged--; ++ local_irq_restore(flags); ++ ++ f->f_ub = ub; ++ return 0; ++ } ++ ++ pub = top_beancounter(ub); ++ spin_lock_irqsave(&pub->ub_lock, flags); ++ file_nr = ub_file_precharge(task_bc, pub, 
&kmem); ++ if (unlikely(!file_nr)) ++ goto last_try; ++ spin_unlock(&pub->ub_lock); ++ task_bc->file_precharged--; ++ local_irq_restore(flags); ++ ++ charge_beancounter_notop(ub, UB_NUMFILE, file_nr); ++ charge_beancounter_notop(ub, UB_KMEMSIZE, kmem); ++ f->f_ub = ub; ++ return 0; ++ ++just_charge: ++ pub = top_beancounter(ub); ++ spin_lock_irqsave(&pub->ub_lock, flags); ++last_try: ++ kmem = ub_file_kmemsize(1); ++ err = __charge_beancounter_locked(pub, UB_NUMFILE, 1, UB_HARD); ++ if (likely(!err)) { ++ err = __charge_beancounter_locked(pub, UB_KMEMSIZE, ++ kmem, UB_HARD); ++ if (unlikely(err)) ++ __uncharge_beancounter_locked(pub, UB_NUMFILE, 1); ++ } ++ spin_unlock_irqrestore(&pub->ub_lock, flags); ++ if (likely(!err)) { ++ charge_beancounter_notop(ub, UB_NUMFILE, 1); ++ charge_beancounter_notop(ub, UB_KMEMSIZE, kmem); ++ f->f_ub = get_beancounter(ub); ++ } ++ return err; ++} ++ ++void ub_file_uncharge(struct file *f) ++{ ++ struct user_beancounter *ub, *pub; ++ struct task_beancounter *task_bc; ++ unsigned long nr; ++ ++ ub = f->f_ub; ++ task_bc = ¤t->task_bc; ++ if (likely(ub == task_bc->task_ub)) { ++ task_bc->file_precharged++; ++ pub = top_beancounter(ub); ++ if (ub_barrier_farnr(pub, UB_NUMFILE) && ++ ub_barrier_farsz(pub, UB_KMEMSIZE)) ++ return; ++ if (task_bc->file_precharged < (1UL << task_bc->file_quant)) ++ return; ++ nr = task_bc->file_precharged ++ - (1UL << (task_bc->file_quant - 1)); ++ task_bc->file_precharged -= nr; ++ __put_beancounter_batch(ub, nr); ++ uncharge_beancounter(ub, UB_NUMFILE, nr); ++ uncharge_beancounter(ub, UB_KMEMSIZE, ub_file_kmemsize(nr)); ++ } else { ++ uncharge_beancounter(ub, UB_NUMFILE, 1); ++ uncharge_beancounter(ub, UB_KMEMSIZE, ub_file_kmemsize(1)); ++ put_beancounter(ub); ++ } ++} ++ ++int ub_flock_charge(struct file_lock *fl, int hard) ++{ ++ struct user_beancounter *ub; ++ int err; ++ ++ /* No need to get_beancounter here since it's already got in slab */ ++ ub = slab_ub(fl); ++ if (ub == NULL) ++ return 0; ++ ++ err = charge_beancounter(ub, UB_NUMFLOCK, 1, hard ? 
UB_HARD : UB_SOFT); ++ if (!err) ++ fl->fl_charged = 1; ++ return err; ++} ++ ++void ub_flock_uncharge(struct file_lock *fl) ++{ ++ struct user_beancounter *ub; ++ ++ /* Ub will be put in slab */ ++ ub = slab_ub(fl); ++ if (ub == NULL || !fl->fl_charged) ++ return; ++ ++ uncharge_beancounter(ub, UB_NUMFLOCK, 1); ++ fl->fl_charged = 0; ++} ++ ++/* ++ * Signal handling ++ */ ++ ++static int do_ub_siginfo_charge(struct user_beancounter *ub, ++ unsigned long size) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (__charge_beancounter_locked(ub, UB_KMEMSIZE, size, UB_HARD)) ++ goto out_kmem; ++ ++ if (__charge_beancounter_locked(ub, UB_NUMSIGINFO, 1, UB_HARD)) ++ goto out_num; ++ ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return 0; ++ ++out_num: ++ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); ++out_kmem: ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return -ENOMEM; ++} ++ ++static void do_ub_siginfo_uncharge(struct user_beancounter *ub, ++ unsigned long size) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); ++ __uncharge_beancounter_locked(ub, UB_NUMSIGINFO, 1); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++int ub_siginfo_charge(struct sigqueue *sq, struct user_beancounter *ub) ++{ ++ unsigned long size; ++ struct user_beancounter *p, *q; ++ ++ size = CHARGE_SIZE(kmem_obj_objuse(sq)); ++ for (p = ub; p != NULL; p = p->parent) { ++ if (do_ub_siginfo_charge(p, size)) ++ goto unroll; ++ } ++ ++ sq->sig_ub = get_beancounter(ub); ++ return 0; ++ ++unroll: ++ for (q = ub; q != p; q = q->parent) ++ do_ub_siginfo_uncharge(q, size); ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(ub_siginfo_charge); ++ ++void ub_siginfo_uncharge(struct sigqueue *sq) ++{ ++ unsigned long size; ++ struct user_beancounter *ub, *p; ++ ++ p = ub = sq->sig_ub; ++ sq->sig_ub = NULL; ++ size = CHARGE_SIZE(kmem_obj_objuse(sq)); ++ for (; ub != NULL; ub = ub->parent) ++ do_ub_siginfo_uncharge(ub, size); ++ put_beancounter(p); ++} ++ ++/* ++ * PTYs ++ */ ++ ++int ub_pty_charge(struct tty_struct *tty) ++{ ++ struct user_beancounter *ub; ++ int retval; ++ ++ ub = slab_ub(tty); ++ retval = 0; ++ if (ub && tty->driver->subtype == PTY_TYPE_MASTER && ++ !test_bit(TTY_CHARGED, &tty->flags)) { ++ retval = charge_beancounter(ub, UB_NUMPTY, 1, UB_HARD); ++ if (!retval) ++ set_bit(TTY_CHARGED, &tty->flags); ++ } ++ return retval; ++} ++ ++void ub_pty_uncharge(struct tty_struct *tty) ++{ ++ struct user_beancounter *ub; ++ ++ ub = slab_ub(tty); ++ if (ub && tty->driver->subtype == PTY_TYPE_MASTER && ++ test_bit(TTY_CHARGED, &tty->flags)) { ++ uncharge_beancounter(ub, UB_NUMPTY, 1); ++ clear_bit(TTY_CHARGED, &tty->flags); ++ } ++} +diff --git a/kernel/bc/net.c b/kernel/bc/net.c +new file mode 100644 +index 0000000..e0244b4 +--- /dev/null ++++ b/kernel/bc/net.c +@@ -0,0 +1,1160 @@ ++/* ++ * linux/kernel/bc/net.c ++ * ++ * Copyright (C) 1998-2004 Andrey V. Savochkin ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ *
++ * TODO:
++ *   - sizeof(struct inode) charge
++ *   = tcp_mem_schedule() feedback based on ub limits
++ *   + measures so that one socket won't exhaust all send buffers,
++ *     see bug in bugzilla
++ *   = sk->socket check for NULL in snd_wakeups
++ *     (tcp_write_space checks for NULL itself)
++ *   + in tcp_close(), orphaned socket abortion should be based on ubc
++ *     resources (same in tcp_out_of_resources)
++ *     Beancounter should also have separate orphaned socket counter...
++ *   + for rcv, in-order segment should be accepted
++ *     if only barrier is exceeded
++ *   = tcp_rmem_schedule() feedback based on ub limits
++ *   - repair forward_alloc mechanism for receive buffers
++ *     Its idea is that some buffer space is pre-charged so that the receive
++ *     fast path doesn't need to take spinlocks and do other heavy stuff
++ *   + tcp_prune_queue actions based on ub limits
++ *   + window adjustments depending on available buffers for receive
++ *   - window adjustments depending on available buffers for send
++ *   + race around usewreserv
++ *   + avoid allocating new page for each tiny-gram, see letter from ANK
++ *   + rename ub_sock_lock
++ *   + sk->sleep wait queue probably can be used for all wakeups, and
++ *     sk->ub_wait is unnecessary
++ *   + for UNIX sockets, the current algorithm will lead to
++ *     UB_UNIX_MINBUF-sized messages only for non-blocking case
++ *   - charge for af_packet sockets
++ *   + all datagram sockets should be charged to NUMUNIXSOCK
++ *   - we do not charge for skb copies and clones staying in device queues
++ *   + live-lock if number of sockets is big and buffer limits are small
++ *     [diff-ubc-dbllim3]
++ *   - check that multiple readers/writers on the same socket won't cause
++ *     fatal consequences
++ *   - check allocation/charge orders
++ *   + There is a potential problem with callback_lock.  In *snd_wakeup we
++ *     take the beancounter first, in sock_def_error_report - callback_lock
++ *     first, then the beancounter.  This is not a problem if callback_lock
++ *     is taken readonly, but anyway...
++ *   - SKB_CHARGE_SIZE doesn't include the space wasted by the slab allocator
++ * General kernel problems:
++ *   - in tcp_sendmsg(), if allocation fails, non-blocking sockets with ASYNC
++ *     notification won't get signals
++ *   - datagram_poll looks racy
++ *
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include
++#include
++
++#include
++#include
++#include
++
++/* For some reason it is not used currently */
++#define UB_SOCK_MAINTAIN_WMEMPRESSURE	0
++
++
++/* Skb truesize definition.  Bad place.
Den */ ++ ++static inline int skb_chargesize_head(struct sk_buff *skb) ++{ ++ return skb_charge_size(skb_end_pointer(skb) - skb->head + ++ sizeof(struct skb_shared_info)); ++} ++ ++int skb_charge_fullsize(struct sk_buff *skb) ++{ ++ int chargesize; ++ struct sk_buff *skbfrag; ++ ++ chargesize = skb_chargesize_head(skb) + ++ PAGE_SIZE * skb_shinfo(skb)->nr_frags; ++ if (likely(skb_shinfo(skb)->frag_list == NULL)) ++ return chargesize; ++ for (skbfrag = skb_shinfo(skb)->frag_list; ++ skbfrag != NULL; ++ skbfrag = skbfrag->next) { ++ chargesize += skb_charge_fullsize(skbfrag); ++ } ++ return chargesize; ++} ++EXPORT_SYMBOL(skb_charge_fullsize); ++ ++static int ub_sock_makewreserv_locked(struct sock *sk, ++ int bufid, unsigned long size); ++ ++int __ub_too_many_orphans(struct sock *sk, int count) ++{ ++ struct user_beancounter *ub; ++ ++ if (sock_has_ubc(sk)) { ++ ub = top_beancounter(sock_bc(sk)->ub); ++ if (count >= ub->ub_parms[UB_NUMTCPSOCK].barrier >> 2) ++ return 1; ++ } ++ return 0; ++} ++ ++/* ++ * Queueing ++ */ ++ ++static void ub_sock_snd_wakeup(struct user_beancounter *ub) ++{ ++ struct list_head *p; ++ struct sock *sk; ++ struct sock_beancounter *skbc; ++ struct socket *sock; ++ unsigned long added; ++ ++ while (!list_empty(&ub->ub_other_sk_list)) { ++ p = ub->ub_other_sk_list.next; ++ skbc = list_entry(p, struct sock_beancounter, ub_sock_list); ++ sk = skbc_sock(skbc); ++ ++ added = 0; ++ sock = sk->sk_socket; ++ if (sock == NULL) { ++ /* sk being destroyed */ ++ list_del_init(&skbc->ub_sock_list); ++ continue; ++ } ++ ++ ub_debug(UBD_NET_SLEEP, ++ "Checking queue, waiting %lu, reserv %lu\n", ++ skbc->ub_waitspc, skbc->poll_reserv); ++ added = -skbc->poll_reserv; ++ if (ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, ++ skbc->ub_waitspc)) ++ break; ++ added += skbc->poll_reserv; ++ ++ list_del_init(&skbc->ub_sock_list); ++ ++ /* ++ * See comments in ub_tcp_snd_wakeup. ++ * Locking note: both unix_write_space and ++ * sock_def_write_space take callback_lock themselves. ++ * We take it here just to be on the safe side and to ++ * act the same way as ub_tcp_snd_wakeup does. ++ */ ++ sock_hold(sk); ++ read_lock(&sk->sk_callback_lock); ++ spin_unlock(&ub->ub_lock); ++ ++ sk->sk_write_space(sk); ++ read_unlock(&sk->sk_callback_lock); ++ ++ if (skbc->ub != ub && added) ++ charge_beancounter_notop(skbc->ub, ++ UB_OTHERSOCKBUF, added); ++ sock_put(sk); ++ ++ spin_lock(&ub->ub_lock); ++ } ++} ++ ++static void ub_tcp_snd_wakeup(struct user_beancounter *ub) ++{ ++ struct list_head *p; ++ struct sock *sk; ++ struct sock_beancounter *skbc; ++ struct socket *sock; ++ unsigned long added; ++ ++ while (!list_empty(&ub->ub_tcp_sk_list)) { ++ p = ub->ub_tcp_sk_list.next; ++ skbc = list_entry(p, struct sock_beancounter, ub_sock_list); ++ sk = skbc_sock(skbc); ++ ++ added = 0; ++ sock = sk->sk_socket; ++ if (sock == NULL) { ++ /* sk being destroyed */ ++ list_del_init(&skbc->ub_sock_list); ++ continue; ++ } ++ ++ ub_debug(UBD_NET_SLEEP, ++ "Checking queue, waiting %lu, reserv %lu\n", ++ skbc->ub_waitspc, skbc->poll_reserv); ++ added = -skbc->poll_reserv; ++ if (ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, ++ skbc->ub_waitspc)) ++ break; ++ added += skbc->poll_reserv; ++ ++ list_del_init(&skbc->ub_sock_list); ++ ++ /* ++ * Send async notifications and wake up. ++ * Locking note: we get callback_lock here because ++ * tcp_write_space is over-optimistic about calling context ++ * (socket lock is presumed). So we get the lock here although ++ * it belongs to the callback. 
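++ * Note that ub->ub_lock is dropped for the duration of the wakeup
++ * and re-acquired before the next iteration of the loop.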
++ */ ++ sock_hold(sk); ++ read_lock(&sk->sk_callback_lock); ++ spin_unlock(&ub->ub_lock); ++ ++ sk->sk_write_space(sk); ++ read_unlock(&sk->sk_callback_lock); ++ ++ if (skbc->ub != ub && added) ++ charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, added); ++ sock_put(sk); ++ ++ spin_lock(&ub->ub_lock); ++ } ++} ++ ++void ub_sock_snd_queue_add(struct sock *sk, int res, unsigned long size) ++{ ++ unsigned long flags; ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ unsigned long added_reserv; ++ ++ if (!sock_has_ubc(sk)) ++ return; ++ ++ skbc = sock_bc(sk); ++ ub = top_beancounter(skbc->ub); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub_debug(UBD_NET_SLEEP, "attempt to charge for %lu\n", size); ++ added_reserv = -skbc->poll_reserv; ++ if (!ub_sock_makewreserv_locked(sk, res, size)) { ++ /* ++ * It looks a bit hackish, but it is compatible with both ++ * wait_for_xx_ubspace and poll. ++ * This __set_current_state is equivalent to a wakeup event ++ * right after spin_unlock_irqrestore. ++ */ ++ __set_current_state(TASK_RUNNING); ++ added_reserv += skbc->poll_reserv; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ if (added_reserv) ++ charge_beancounter_notop(skbc->ub, res, added_reserv); ++ return; ++ } ++ ++ ub_debug(UBD_NET_SLEEP, "Adding sk to queue\n"); ++ skbc->ub_waitspc = size; ++ if (!list_empty(&skbc->ub_sock_list)) { ++ ub_debug(UBD_NET_SOCKET, ++ "re-adding socket to beancounter %p.\n", ub); ++ goto out; ++ } ++ ++ switch (res) { ++ case UB_TCPSNDBUF: ++ list_add_tail(&skbc->ub_sock_list, ++ &ub->ub_tcp_sk_list); ++ break; ++ case UB_OTHERSOCKBUF: ++ list_add_tail(&skbc->ub_sock_list, ++ &ub->ub_other_sk_list); ++ break; ++ default: ++ BUG(); ++ } ++out: ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++EXPORT_SYMBOL(ub_sock_snd_queue_add); ++ ++long ub_sock_wait_for_space(struct sock *sk, long timeo, unsigned long size) ++{ ++ DECLARE_WAITQUEUE(wait, current); ++ ++ add_wait_queue(sk->sk_sleep, &wait); ++ for (;;) { ++ if (signal_pending(current)) ++ break; ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (!ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size)) ++ break; ++ ++ if (sk->sk_shutdown & SEND_SHUTDOWN) ++ break; ++ if (sk->sk_err) ++ break; ++ ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, size); ++ timeo = schedule_timeout(timeo); ++ } ++ __set_current_state(TASK_RUNNING); ++ remove_wait_queue(sk->sk_sleep, &wait); ++ return timeo; ++} ++ ++void ub_sock_sndqueuedel(struct sock *sk) ++{ ++ struct user_beancounter *ub; ++ struct sock_beancounter *skbc; ++ unsigned long flags; ++ ++ if (!sock_has_ubc(sk)) ++ return; ++ skbc = sock_bc(sk); ++ ++ /* race with write_space callback of other socket */ ++ ub = top_beancounter(skbc->ub); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ list_del_init(&skbc->ub_sock_list); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++/* ++ * Helpers ++ */ ++ ++static inline void __ub_skb_set_charge(struct sk_buff *skb, struct sock *sk, ++ unsigned long size, int resource) ++{ ++ WARN_ON_ONCE(skb_bc(skb)->ub != NULL); ++ ++ skb_bc(skb)->ub = sock_bc(sk)->ub; ++ skb_bc(skb)->charged = size; ++ skb_bc(skb)->resource = resource; ++} ++ ++void ub_skb_set_charge(struct sk_buff *skb, struct sock *sk, ++ unsigned long size, int resource) ++{ ++ if (!sock_has_ubc(sk)) ++ return; ++ ++ if (sock_bc(sk)->ub == NULL) ++ BUG(); ++ ++ __ub_skb_set_charge(skb, sk, size, resource); ++ ++ /* Ugly. Ugly. 
Skb in sk writequeue can live without ref to sk */ ++ if (skb->sk == NULL) ++ skb->sk = sk; ++} ++ ++EXPORT_SYMBOL(ub_skb_set_charge); ++ ++static inline void ub_skb_set_uncharge(struct sk_buff *skb) ++{ ++ skb_bc(skb)->ub = NULL; ++ skb_bc(skb)->charged = 0; ++ skb_bc(skb)->resource = 0; ++} ++ ++static void ub_update_rmem_thres(struct sock_beancounter *skub) ++{ ++ struct user_beancounter *ub; ++ ++ if (skub && skub->ub) { ++ ub = top_beancounter(skub->ub); ++ ub->ub_rmem_thres = ub->ub_parms[UB_TCPRCVBUF].barrier / ++ (ub->ub_parms[UB_NUMTCPSOCK].held + 1); ++ } ++} ++ ++static inline void ub_sock_wcharge_dec(struct sock *sk, ++ unsigned long chargesize) ++{ ++ /* The check sk->sk_family != PF_NETLINK is made as the skb is ++ * queued to the kernel end of socket while changed to the user one. ++ * Den */ ++ if (unlikely(sock_bc(sk)->ub_wcharged) && sk->sk_family != PF_NETLINK) { ++ if (sock_bc(sk)->ub_wcharged > chargesize) ++ sock_bc(sk)->ub_wcharged -= chargesize; ++ else ++ sock_bc(sk)->ub_wcharged = 0; ++ } ++} ++ ++/* ++ * Charge socket number ++ */ ++ ++static inline void sk_alloc_beancounter(struct sock *sk) ++{ ++ struct sock_beancounter *skbc; ++ ++ skbc = sock_bc(sk); ++ memset(skbc, 0, sizeof(struct sock_beancounter)); ++} ++ ++static inline void sk_free_beancounter(struct sock *sk) ++{ ++} ++ ++static int __sock_charge(struct sock *sk, int res) ++{ ++ struct sock_beancounter *skbc; ++ struct user_beancounter *cub, *ub; ++ unsigned long added_reserv, added_forw; ++ unsigned long flags; ++ ++ cub = get_exec_ub(); ++ if (unlikely(cub == NULL)) ++ return 0; ++ ++ sk_alloc_beancounter(sk); ++ skbc = sock_bc(sk); ++ INIT_LIST_HEAD(&skbc->ub_sock_list); ++ ++ ub = top_beancounter(cub); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (unlikely(__charge_beancounter_locked(ub, res, 1, UB_HARD) < 0)) ++ goto out_limit; ++ ++ added_reserv = 0; ++ added_forw = 0; ++ if (res == UB_NUMTCPSOCK) { ++ added_reserv = skb_charge_size(MAX_TCP_HEADER + ++ 1500 - sizeof(struct iphdr) - ++ sizeof(struct tcphdr)); ++ added_reserv *= 4; ++ ub->ub_parms[UB_TCPSNDBUF].held += added_reserv; ++ if (!ub_barrier_farsz(ub, UB_TCPSNDBUF)) { ++ ub->ub_parms[UB_TCPSNDBUF].held -= added_reserv; ++ added_reserv = 0; ++ } ++ skbc->poll_reserv = added_reserv; ++ ++ added_forw = SK_MEM_QUANTUM * 4; ++ ub->ub_parms[UB_TCPRCVBUF].held += added_forw; ++ if (!ub_barrier_farsz(ub, UB_TCPRCVBUF)) { ++ ub->ub_parms[UB_TCPRCVBUF].held -= added_forw; ++ added_forw = 0; ++ } ++ skbc->forw_space = added_forw; ++ } ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ charge_beancounter_notop(cub, res, 1); ++ if (added_reserv) ++ charge_beancounter_notop(cub, UB_TCPSNDBUF, added_reserv); ++ if (added_forw) ++ charge_beancounter_notop(cub, UB_TCPRCVBUF, added_forw); ++ ++ skbc->ub = get_beancounter(cub); ++ return 0; ++ ++out_limit: ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ sk_free_beancounter(sk); ++ return -ENOMEM; ++} ++ ++int ub_tcp_sock_charge(struct sock *sk) ++{ ++ int ret; ++ ++ ret = __sock_charge(sk, UB_NUMTCPSOCK); ++ ub_update_rmem_thres(sock_bc(sk)); ++ ++ return ret; ++} ++ ++int ub_other_sock_charge(struct sock *sk) ++{ ++ return __sock_charge(sk, UB_NUMOTHERSOCK); ++} ++ ++EXPORT_SYMBOL(ub_other_sock_charge); ++ ++int ub_sock_charge(struct sock *sk, int family, int type) ++{ ++ return (IS_TCP_SOCK(family, type) ? 
++ ub_tcp_sock_charge(sk) : ub_other_sock_charge(sk));
++}
++
++EXPORT_SYMBOL(ub_sock_charge);
++
++/*
++ * Uncharge socket number
++ */
++
++void ub_sock_uncharge(struct sock *sk)
++{
++ int is_tcp_sock;
++ unsigned long flags;
++ struct sock_beancounter *skbc;
++ struct user_beancounter *ub;
++ unsigned long reserv, forw;
++
++ if (unlikely(!sock_has_ubc(sk)))
++ return;
++
++ is_tcp_sock = IS_TCP_SOCK(sk->sk_family, sk->sk_type);
++ skbc = sock_bc(sk);
++ ub_debug(UBD_NET_SOCKET, "Calling ub_sock_uncharge on %p\n", sk);
++
++ ub = top_beancounter(skbc->ub);
++
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ if (!list_empty(&skbc->ub_sock_list)) {
++ ub_debug(UBD_NET_SOCKET,
++ "ub_sock_uncharge: removing from ub(%p) queue.\n",
++ skbc);
++ list_del_init(&skbc->ub_sock_list);
++ }
++
++ reserv = skbc->poll_reserv;
++ forw = skbc->forw_space;
++ __uncharge_beancounter_locked(ub,
++ (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF),
++ reserv);
++ if (forw)
++ __uncharge_beancounter_locked(ub,
++ (is_tcp_sock ? UB_TCPRCVBUF : UB_DGRAMRCVBUF),
++ forw);
++ __uncharge_beancounter_locked(ub,
++ (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1);
++
++ ub_sock_wcharge_dec(sk, reserv);
++ if (unlikely(skbc->ub_wcharged))
++ printk(KERN_WARNING
++ "ub_sock_uncharge: wch=%lu for ub %p (%d).\n",
++ skbc->ub_wcharged, skbc->ub, skbc->ub->ub_uid);
++ skbc->poll_reserv = 0;
++ skbc->forw_space = 0;
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++ uncharge_beancounter_notop(skbc->ub,
++ (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF),
++ reserv);
++ if (forw)
++ uncharge_beancounter_notop(skbc->ub,
++ (is_tcp_sock ? UB_TCPRCVBUF : UB_DGRAMRCVBUF),
++ forw);
++ uncharge_beancounter_notop(skbc->ub,
++ (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1);
++
++ put_beancounter(skbc->ub);
++ sk_free_beancounter(sk);
++}
++
++/*
++ * Special case for netlink_dump - (un)charges precalculated size
++ */
++
++int ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk)
++{
++ int ret;
++ unsigned long chargesize;
++
++ if (unlikely(!sock_has_ubc(sk)))
++ return 0;
++
++ chargesize = skb_charge_fullsize(skb);
++ ret = charge_beancounter(sock_bc(sk)->ub,
++ UB_OTHERSOCKBUF, chargesize, UB_HARD);
++ if (ret < 0)
++ return ret;
++ ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF);
++ return ret;
++}
++
++/*
++ * Poll reserve accounting
++ *
++ * This is the core of socket buffer management (along with queueing/wakeup
++ * functions). The rest of buffer accounting either calls these functions, or
++ * repeats parts of their logic for some simpler cases.
++ */ ++ ++static int ub_sock_makewreserv_locked(struct sock *sk, ++ int bufid, unsigned long size) ++{ ++ unsigned long wcharge_added; ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ ++ skbc = sock_bc(sk); ++ if (skbc->poll_reserv >= size) /* no work to be done */ ++ goto out; ++ ++ ub = top_beancounter(skbc->ub); ++ ub->ub_parms[bufid].held += size - skbc->poll_reserv; ++ ++ wcharge_added = 0; ++ /* ++ * Logic: ++ * 1) when used memory hits barrier, we set wmem_pressure; ++ * wmem_pressure is reset under barrier/2; ++ * between barrier/2 and barrier we limit per-socket buffer growth; ++ * 2) each socket is guaranteed to get (limit-barrier)/maxsockets ++ * calculated on the base of memory eaten after the barrier is hit ++ */ ++ skbc = sock_bc(sk); ++#if UB_SOCK_MAINTAIN_WMEMPRESSURE ++ if (!ub_hfbarrier_hit(ub, bufid)) { ++ if (ub->ub_wmem_pressure) ++ ub_debug(UBD_NET_SEND, "makewres: pressure -> 0 " ++ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", ++ sk, size, skbc->poll_reserv, ++ ub->ub_parms[bufid].held, ++ skbc->ub_wcharged, sk->sk_sndbuf); ++ ub->ub_wmem_pressure = 0; ++ } ++#endif ++ if (ub_barrier_hit(ub, bufid)) { ++#if UB_SOCK_MAINTAIN_WMEMPRESSURE ++ if (!ub->ub_wmem_pressure) ++ ub_debug(UBD_NET_SEND, "makewres: pressure -> 1 " ++ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", ++ sk, size, skbc->poll_reserv, ++ ub->ub_parms[bufid].held, ++ skbc->ub_wcharged, sk->sk_sndbuf); ++ ub->ub_wmem_pressure = 1; ++#endif ++ if (sk->sk_family == PF_NETLINK) ++ goto unroll; ++ wcharge_added = size - skbc->poll_reserv; ++ skbc->ub_wcharged += wcharge_added; ++ if (skbc->ub_wcharged * ub->ub_parms[bid2sid(bufid)].limit + ++ ub->ub_parms[bufid].barrier > ++ ub->ub_parms[bufid].limit) ++ goto unroll_wch; ++ } ++ if (ub->ub_parms[bufid].held > ub->ub_parms[bufid].limit) ++ goto unroll; ++ ++ ub_adjust_maxheld(ub, bufid); ++ skbc->poll_reserv = size; ++out: ++ return 0; ++ ++unroll_wch: ++ skbc->ub_wcharged -= wcharge_added; ++unroll: ++ ub_debug(UBD_NET_SEND, ++ "makewres: deny " ++ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", ++ sk, size, skbc->poll_reserv, ub->ub_parms[bufid].held, ++ skbc->ub_wcharged, sk->sk_sndbuf); ++ ub->ub_parms[bufid].failcnt++; ++ ub->ub_parms[bufid].held -= size - skbc->poll_reserv; ++ ++ if (sk->sk_socket != NULL) { ++ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); ++ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); ++ } ++ return -ENOMEM; ++} ++ ++int ub_sock_make_wreserv(struct sock *sk, int bufid, unsigned long size) ++{ ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ unsigned long flags; ++ unsigned long added_reserv; ++ int err; ++ ++ skbc = sock_bc(sk); ++ ++ /* ++ * This function provides that there is sufficient reserve upon return ++ * only if sk has only one user. We can check poll_reserv without ++ * serialization and avoid locking if the reserve already exists. 
++ */ ++ if (unlikely(!sock_has_ubc(sk)) || likely(skbc->poll_reserv >= size)) ++ return 0; ++ ++ ub = top_beancounter(skbc->ub); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ added_reserv = -skbc->poll_reserv; ++ err = ub_sock_makewreserv_locked(sk, bufid, size); ++ added_reserv += skbc->poll_reserv; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ if (added_reserv) ++ charge_beancounter_notop(skbc->ub, bufid, added_reserv); ++ ++ return err; ++} ++ ++EXPORT_SYMBOL(ub_sock_make_wreserv); ++ ++int ub_sock_get_wreserv(struct sock *sk, int bufid, unsigned long size) ++{ ++ struct sock_beancounter *skbc; ++ ++ if (unlikely(!sock_has_ubc(sk))) ++ return 0; ++ ++ /* optimize for the case if socket has sufficient reserve */ ++ ub_sock_make_wreserv(sk, bufid, size); ++ skbc = sock_bc(sk); ++ if (likely(skbc->poll_reserv >= size)) { ++ skbc->poll_reserv -= size; ++ return 0; ++ } ++ return -ENOMEM; ++} ++ ++EXPORT_SYMBOL(ub_sock_get_wreserv); ++ ++static void ub_sock_do_ret_wreserv(struct sock *sk, int bufid, ++ unsigned long size, unsigned long ressize) ++{ ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ unsigned long extra; ++ unsigned long flags; ++ ++ skbc = sock_bc(sk); ++ ub = top_beancounter(skbc->ub); ++ ++ extra = 0; ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ skbc->poll_reserv += size; ++ if (skbc->poll_reserv > ressize) { ++ extra = skbc->poll_reserv - ressize; ++ ub_sock_wcharge_dec(sk, extra); ++ skbc->poll_reserv = ressize; ++ ++ __uncharge_beancounter_locked(ub, bufid, extra); ++ if (bufid == UB_TCPSNDBUF) ++ ub_tcp_snd_wakeup(ub); ++ else ++ ub_sock_snd_wakeup(ub); ++ } ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ if (extra) ++ uncharge_beancounter_notop(skbc->ub, bufid, extra); ++} ++ ++void ub_sock_ret_wreserv(struct sock *sk, int bufid, ++ unsigned long size, unsigned long ressize) ++{ ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ ++ if (unlikely(!sock_has_ubc(sk))) ++ return; ++ ++ skbc = sock_bc(sk); ++ ub = top_beancounter(skbc->ub); ++ /* check if the reserve can be kept */ ++ if (ub_barrier_farsz(ub, bufid)) { ++ skbc->poll_reserv += size; ++ return; ++ } ++ ub_sock_do_ret_wreserv(sk, bufid, size, ressize); ++} ++ ++/* ++ * UB_DGRAMRCVBUF ++ */ ++ ++static int ub_dgramrcvbuf_charge(struct sock *sk, struct sk_buff *skb) ++{ ++ unsigned long chargesize; ++ ++ chargesize = skb_charge_fullsize(skb); ++ if (charge_beancounter(sock_bc(sk)->ub, UB_DGRAMRCVBUF, ++ chargesize, UB_HARD)) ++ return -ENOMEM; ++ ++ ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF); ++ return 0; ++} ++ ++int ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb) ++{ ++ if (unlikely(!sock_has_ubc(sk))) ++ return 0; ++ ++ if (IS_TCP_SOCK(sk->sk_family, sk->sk_type)) ++ return ub_tcprcvbuf_charge(sk, skb); ++ else ++ return ub_dgramrcvbuf_charge(sk, skb); ++} ++ ++EXPORT_SYMBOL(ub_sockrcvbuf_charge); ++ ++static void ub_sockrcvbuf_uncharge(struct sk_buff *skb) ++{ ++ uncharge_beancounter(skb_bc(skb)->ub, UB_DGRAMRCVBUF, ++ skb_bc(skb)->charged); ++ ub_skb_set_uncharge(skb); ++} ++ ++/* ++ * UB_TCPRCVBUF ++ */ ++ ++int ub_sock_tcp_chargerecv(struct sock *sk, struct sk_buff *skb, ++ enum ub_severity strict) ++{ ++ int retval; ++ unsigned long flags; ++ struct user_beancounter *ub; ++ struct sock_beancounter *skbc; ++ unsigned long chargesize; ++ ++ if (unlikely(!sock_has_ubc(sk))) ++ return 0; ++ skbc = sock_bc(sk); ++ ++ chargesize = skb_charge_fullsize(skb); ++ if (likely(skbc->forw_space >= chargesize)) { ++ skbc->forw_space -= 
chargesize; ++ __ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF); ++ return 0; ++ } ++ ++ /* ++ * Memory pressure reactions: ++ * 1) set UB_RMEM_KEEP (clearing UB_RMEM_EXPAND) ++ * 2) set UB_RMEM_SHRINK and tcp_clamp_window() ++ * tcp_collapse_queues() if rmem_alloc > rcvbuf ++ * 3) drop OFO, tcp_purge_ofo() ++ * 4) drop all. ++ * Currently, we do #2 and #3 at once (which means that current ++ * collapsing of OFO queue in tcp_collapse_queues() is a waste of time, ++ * for example...) ++ * On memory pressure we jump from #0 to #3, and when the pressure ++ * subsides, to #1. ++ */ ++ retval = 0; ++ ub = top_beancounter(sock_bc(sk)->ub); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_parms[UB_TCPRCVBUF].held += chargesize; ++ if (ub->ub_parms[UB_TCPRCVBUF].held > ++ ub->ub_parms[UB_TCPRCVBUF].barrier && ++ strict != UB_FORCE) ++ goto excess; ++ ub_adjust_maxheld(ub, UB_TCPRCVBUF); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++out: ++ if (retval == 0) { ++ charge_beancounter_notop(sock_bc(sk)->ub, UB_TCPRCVBUF, ++ chargesize); ++ ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF); ++ } ++ return retval; ++ ++excess: ++ ub->ub_rmem_pressure = UB_RMEM_SHRINK; ++ if (strict == UB_HARD) ++ retval = -ENOMEM; ++ if (ub->ub_parms[UB_TCPRCVBUF].held > ub->ub_parms[UB_TCPRCVBUF].limit) ++ retval = -ENOMEM; ++ /* ++ * We try to leave numsock*maxadvmss as a reserve for sockets not ++ * queueing any data yet (if the difference between the barrier and the ++ * limit is enough for this reserve). ++ */ ++ if (ub->ub_parms[UB_TCPRCVBUF].held + ++ ub->ub_parms[UB_NUMTCPSOCK].limit * ub->ub_maxadvmss ++ > ub->ub_parms[UB_TCPRCVBUF].limit && ++ atomic_read(&sk->sk_rmem_alloc)) ++ retval = -ENOMEM; ++ if (retval) { ++ ub->ub_parms[UB_TCPRCVBUF].held -= chargesize; ++ ub->ub_parms[UB_TCPRCVBUF].failcnt++; ++ } ++ ub_adjust_maxheld(ub, UB_TCPRCVBUF); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ goto out; ++} ++EXPORT_SYMBOL(ub_sock_tcp_chargerecv); ++ ++static void ub_tcprcvbuf_uncharge(struct sk_buff *skb) ++{ ++ unsigned long flags; ++ unsigned long held, bar; ++ int prev_pres; ++ struct user_beancounter *ub; ++ ++ ub = top_beancounter(skb_bc(skb)->ub); ++ if (ub_barrier_farsz(ub, UB_TCPRCVBUF)) { ++ sock_bc(skb->sk)->forw_space += skb_bc(skb)->charged; ++ ub_skb_set_uncharge(skb); ++ return; ++ } ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (ub->ub_parms[UB_TCPRCVBUF].held < skb_bc(skb)->charged) { ++ printk(KERN_ERR "Uncharging %d for tcprcvbuf of %p with %lu\n", ++ skb_bc(skb)->charged, ++ ub, ub->ub_parms[UB_TCPRCVBUF].held); ++ /* ass-saving bung */ ++ skb_bc(skb)->charged = ub->ub_parms[UB_TCPRCVBUF].held; ++ } ++ ub->ub_parms[UB_TCPRCVBUF].held -= skb_bc(skb)->charged; ++ held = ub->ub_parms[UB_TCPRCVBUF].held; ++ bar = ub->ub_parms[UB_TCPRCVBUF].barrier; ++ prev_pres = ub->ub_rmem_pressure; ++ if (held <= bar - (bar >> 2)) ++ ub->ub_rmem_pressure = UB_RMEM_EXPAND; ++ else if (held <= bar) ++ ub->ub_rmem_pressure = UB_RMEM_KEEP; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ uncharge_beancounter_notop(skb_bc(skb)->ub, UB_TCPRCVBUF, ++ skb_bc(skb)->charged); ++ ub_skb_set_uncharge(skb); ++} ++ ++ ++/* ++ * UB_OTHERSOCKBUF and UB_TCPSNDBUF ++ */ ++ ++static void ub_socksndbuf_uncharge(struct sk_buff *skb) ++{ ++ unsigned long flags; ++ struct user_beancounter *ub, *cub; ++ unsigned long chargesize; ++ ++ cub = skb_bc(skb)->ub; ++ ub = top_beancounter(cub); ++ chargesize = skb_bc(skb)->charged; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ 
__uncharge_beancounter_locked(ub, UB_OTHERSOCKBUF, chargesize);
++ if (skb->sk != NULL && sock_has_ubc(skb->sk))
++ ub_sock_wcharge_dec(skb->sk, chargesize);
++ ub_sock_snd_wakeup(ub);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++ uncharge_beancounter_notop(cub, UB_OTHERSOCKBUF, chargesize);
++ ub_skb_set_uncharge(skb);
++}
++
++/* expected to be called under socket lock */
++static void ub_tcpsndbuf_uncharge(struct sk_buff *skb)
++{
++ /*
++ * The ub_sock_ret_wreserv call is abused here; we just want to
++ * uncharge the skb size. However, to reduce duplication of the code
++ * doing the ub_hfbarrier_hit check, ub_wcharged reduction, and wakeup
++ * we call a function that already does all of this. 2006/04/27 SAW
++ */
++ ub_sock_ret_wreserv(skb->sk, UB_TCPSNDBUF, skb_bc(skb)->charged,
++ sock_bc(skb->sk)->poll_reserv);
++ ub_skb_set_uncharge(skb);
++}
++
++void ub_skb_uncharge(struct sk_buff *skb)
++{
++ switch (skb_bc(skb)->resource) {
++ case UB_TCPSNDBUF:
++ ub_tcpsndbuf_uncharge(skb);
++ break;
++ case UB_TCPRCVBUF:
++ ub_tcprcvbuf_uncharge(skb);
++ break;
++ case UB_DGRAMRCVBUF:
++ ub_sockrcvbuf_uncharge(skb);
++ break;
++ case UB_OTHERSOCKBUF:
++ ub_socksndbuf_uncharge(skb);
++ break;
++ }
++}
++
++EXPORT_SYMBOL(ub_skb_uncharge); /* due to skb_orphan()/conntracks */
++
++/*
++ * Other sock reserve management
++ */
++
++int ub_sock_getwres_other(struct sock *sk, unsigned long size)
++{
++ struct sock_beancounter *skbc;
++ struct user_beancounter *ub;
++ unsigned long flags;
++ unsigned long added_reserv;
++ int err;
++
++ if (unlikely(!sock_has_ubc(sk)))
++ return 0;
++
++ /*
++ * Nothing except the beancounter lock protects skbc->poll_reserv.
++ * So, take the lock and do the job.
++ * Dances with added_reserv repeat ub_sock_make_wreserv.
++ */
++ skbc = sock_bc(sk);
++ ub = top_beancounter(skbc->ub);
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ added_reserv = -skbc->poll_reserv;
++ err = ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, size);
++ added_reserv += skbc->poll_reserv;
++ if (!err)
++ skbc->poll_reserv -= size;
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++ if (added_reserv)
++ charge_beancounter_notop(skbc->ub, UB_OTHERSOCKBUF, added_reserv);
++
++ return err;
++}
++EXPORT_SYMBOL(ub_sock_getwres_other);
++
++void ub_sock_retwres_other(struct sock *sk,
++ unsigned long size, unsigned long ressize)
++{
++ if (unlikely(!sock_has_ubc(sk)))
++ return;
++
++ ub_sock_do_ret_wreserv(sk, UB_OTHERSOCKBUF, size, ressize);
++}
++
++/*
++ * TCP send buffers accounting. Paged part
++ */
++
++int ub_sock_tcp_chargepage(struct sock *sk)
++{
++ struct sock_beancounter *skbc;
++ unsigned long extra;
++ int err;
++
++ if (unlikely(!sock_has_ubc(sk)))
++ return 0;
++
++ skbc = sock_bc(sk);
++ ub_sock_make_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE);
++ if (likely(skbc->poll_reserv >= PAGE_SIZE)) {
++ skbc->poll_reserv -= PAGE_SIZE;
++ return 0;
++ }
++
++ /*
++ * Ok, a full page is not available.
++ * However, this function must succeed if poll previously indicated
++ * that write is possible. We'd better make a forced charge here
++ * than reserve a whole page in poll.
++ */
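++ /*
++ * For illustration (the numbers are an example, not from the patch):
++ * with PAGE_SIZE == 4096 and poll_reserv == 1024 after the
++ * SOCK_MIN_UBCSPACE reservation below, the remaining
++ * 4096 - 1024 == 3072 bytes are charged with UB_FORCE, which may
++ * push the beancounter past its barrier but keeps the promise that
++ * a socket reported writable by poll() can actually write.
++ */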
++ err = ub_sock_make_wreserv(sk, UB_TCPSNDBUF, SOCK_MIN_UBCSPACE);
++ if (unlikely(err < 0))
++ goto out;
++ if (skbc->poll_reserv < PAGE_SIZE) {
++ extra = PAGE_SIZE - skbc->poll_reserv;
++ err = charge_beancounter(skbc->ub, UB_TCPSNDBUF, extra,
++ UB_FORCE);
++ if (err < 0)
++ goto out;
++ skbc->poll_reserv += extra;
++ }
++ skbc->poll_reserv -= PAGE_SIZE;
++ return 0;
++
++out:
++ return err;
++}
++
++void ub_sock_tcp_detachpage(struct sock *sk)
++{
++ struct sk_buff *skb;
++
++ if (unlikely(!sock_has_ubc(sk)))
++ return;
++
++ /* The page has just been detached from the socket. The last skb in
++ the queue with a paged part holds a reference to it */
++ skb = skb_peek_tail(&sk->sk_write_queue);
++ if (skb == NULL) {
++ /* If the queue is empty - all data is sent and the page is
++ about to be freed */
++ ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, PAGE_SIZE,
++ sock_bc(sk)->poll_reserv);
++ } else {
++ /* The last skb is a good approximation for the last skb with
++ a paged part */
++ skb_bc(skb)->charged += PAGE_SIZE;
++ }
++}
++
++/*
++ * The TCPSNDBUF charge functions below are called in the following cases:
++ * - sending of SYN, SYN-ACK, FIN, the latter charge is forced for
++ * some technical reasons in the TCP code;
++ * - fragmentation of TCP packets.
++ * These functions are allowed but not required to use poll_reserv.
++ * Originally, these functions didn't do that, since it didn't make
++ * any sense. Now that poll_reserv serves as a general reserve,
++ * they use it.
++ */
++int ub_sock_tcp_chargesend(struct sock *sk, struct sk_buff *skb,
++ enum ub_severity strict)
++{
++ int ret;
++ unsigned long chargesize;
++ struct sock_beancounter *skbc;
++ struct user_beancounter *ub;
++ unsigned long flags;
++
++ if (unlikely(!sock_has_ubc(sk)))
++ return 0;
++
++ skbc = sock_bc(sk);
++ chargesize = skb_charge_fullsize(skb);
++ if (likely(skbc->poll_reserv >= chargesize)) {
++ skbc->poll_reserv -= chargesize;
++ __ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF);
++ /* XXX hack, see ub_skb_set_charge */
++ skb->sk = sk;
++ return 0;
++ }
++
++ ub = top_beancounter(skbc->ub);
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ ret = __charge_beancounter_locked(ub, UB_TCPSNDBUF,
++ chargesize, strict);
++ /*
++ * Note: this check is not equivalent to the corresponding check
++ * in makewreserv. It's similar in spirit, but an equivalent check
++ * would be too long and complicated here.
++ */
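++ /*
++ * ub_wcharged records how much was charged after the barrier was
++ * hit; makewreserv uses it to cap per-socket growth between the
++ * barrier and the limit (roughly (limit - barrier) per socket out
++ * of the socket-number limit), so charges taken here past the
++ * barrier must be recorded as well, and ub_sock_wcharge_dec()
++ * undoes them on uncharge.
++ */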
++ if (!ret && ub_barrier_hit(ub, UB_TCPSNDBUF))
++ skbc->ub_wcharged += chargesize;
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++ if (likely(!ret)) {
++ charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, chargesize);
++ ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF);
++ }
++ return ret;
++}
++EXPORT_SYMBOL(ub_sock_tcp_chargesend);
++
++void ub_sock_tcp_unchargesend(struct sock *sk, unsigned long size)
++{
++ if (unlikely(!sock_has_ubc(sk)))
++ return;
++ /* see ub_tcpsndbuf_uncharge */
++ ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, sock_bc(sk)->poll_reserv);
++}
++
++/*
++ * Initialization
++ */
++
++int __init skbc_cache_init(void)
++{
++ return 0;
++}
+diff --git a/kernel/bc/oom_kill.c b/kernel/bc/oom_kill.c
+new file mode 100644
+index 0000000..c79e826
+--- /dev/null
++++ b/kernel/bc/oom_kill.c
+@@ -0,0 +1,200 @@
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++
++#include 
++#include 
++#include 
++
++#define UB_OOM_TIMEOUT (5 * HZ)
++
++int oom_generation;
++int oom_kill_counter;
++static DEFINE_SPINLOCK(oom_lock);
++static DECLARE_WAIT_QUEUE_HEAD(oom_wq);
++
++static inline int ub_oom_completed(struct task_struct *tsk)
++{
++ if (test_tsk_thread_flag(tsk, TIF_MEMDIE))
++ /* we were oom killed - just die */
++ return 1;
++ if (tsk->task_bc.oom_generation != oom_generation)
++ /* some task was successfully killed */
++ return 1;
++ return 0;
++}
++
++static void ub_clear_oom(void)
++{
++ struct user_beancounter *ub;
++
++ rcu_read_lock();
++ for_each_beancounter(ub)
++ ub->ub_oom_noproc = 0;
++ rcu_read_unlock();
++}
++
++/* Called with cpuset_lock held */
++int ub_oom_lock(void)
++{
++ int timeout;
++ DEFINE_WAIT(oom_w);
++ struct task_struct *tsk;
++
++ tsk = current;
++
++ spin_lock(&oom_lock);
++ if (!oom_kill_counter)
++ goto out_do_oom;
++
++ timeout = UB_OOM_TIMEOUT;
++ while (1) {
++ if (ub_oom_completed(tsk)) {
++ spin_unlock(&oom_lock);
++ return -EINVAL;
++ }
++
++ if (timeout == 0)
++ break;
++
++ __set_current_state(TASK_UNINTERRUPTIBLE);
++ add_wait_queue(&oom_wq, &oom_w);
++ spin_unlock(&oom_lock);
++ cpuset_unlock();
++
++ timeout = schedule_timeout(timeout);
++
++ cpuset_lock();
++ spin_lock(&oom_lock);
++ remove_wait_queue(&oom_wq, &oom_w);
++ }
++
++out_do_oom:
++ ub_clear_oom();
++ return 0;
++}
++
++static inline long ub_current_overdraft(struct user_beancounter *ub)
++{
++ return ub->ub_parms[UB_OOMGUARPAGES].held +
++ ((ub->ub_parms[UB_KMEMSIZE].held
++ + ub->ub_parms[UB_TCPSNDBUF].held
++ + ub->ub_parms[UB_TCPRCVBUF].held
++ + ub->ub_parms[UB_OTHERSOCKBUF].held
++ + ub->ub_parms[UB_DGRAMRCVBUF].held)
++ >> PAGE_SHIFT) - ub->ub_parms[UB_OOMGUARPAGES].barrier;
++}
++
++int ub_oom_task_skip(struct user_beancounter *ub, struct task_struct *tsk)
++{
++ struct user_beancounter *mm_ub;
++
++ if (ub == NULL)
++ return 0;
++
++ task_lock(tsk);
++ if (tsk->mm == NULL)
++ mm_ub = NULL;
++ else
++ mm_ub = tsk->mm->mm_ub;
++
++ while (mm_ub != NULL && mm_ub != ub)
++ mm_ub = mm_ub->parent;
++ task_unlock(tsk);
++
++ return mm_ub != ub;
++}
++
++struct user_beancounter *ub_oom_select_worst(void)
++{
++ struct user_beancounter *ub, *walkp;
++ long ub_maxover;
++
++ ub_maxover = 0;
++ ub = NULL;
++
++ rcu_read_lock();
++ for_each_beancounter (walkp) {
++ long ub_overdraft;
++
++ if (walkp->parent != NULL)
++ continue;
++ if (walkp->ub_oom_noproc)
++ continue;
++
++ ub_overdraft = ub_current_overdraft(walkp);
++ if (ub_overdraft > ub_maxover && get_beancounter_rcu(walkp)) {
++ put_beancounter(ub);
++ ub = walkp;
++ ub_maxover = ub_overdraft;
++ }
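++ /*
++ * The loop keeps the top-level beancounter with the largest
++ * overdraft, i.e. the one furthest above its OOM guarantee;
++ * get_beancounter_rcu() ensures the current candidate cannot be
++ * freed while it is held as the worst one.
++ */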
++ } ++ ++ if (ub) ++ ub->ub_oom_noproc = 1; ++ rcu_read_unlock(); ++ ++ return ub; ++} ++ ++void ub_oom_mm_killed(struct user_beancounter *ub) ++{ ++ static struct ub_rate_info ri = { 5, 60*HZ }; ++ ++ /* increment is serialized with oom_lock */ ++ ub->ub_parms[UB_OOMGUARPAGES].failcnt++; ++ ++ if (ub_ratelimit(&ri)) ++ show_mem(); ++} ++ ++void ub_oom_unlock(void) ++{ ++ spin_unlock(&oom_lock); ++} ++ ++void ub_oom_task_dead(struct task_struct *tsk) ++{ ++ spin_lock(&oom_lock); ++ oom_kill_counter = 0; ++ oom_generation++; ++ ++ printk("OOM killed process %s (pid=%d, ve=%d) exited, " ++ "free=%lu gen=%d.\n", ++ tsk->comm, tsk->pid, VEID(tsk->ve_task_info.owner_env), ++ nr_free_pages(), oom_generation); ++ /* if there is time to sleep in ub_oom_lock -> sleep will continue */ ++ wake_up_all(&oom_wq); ++ spin_unlock(&oom_lock); ++} ++ ++void ub_out_of_memory(struct user_beancounter *scope) ++{ ++ struct user_beancounter *ub; ++ struct task_struct *p; ++ ++ cpuset_lock(); ++ spin_lock(&oom_lock); ++ ub_clear_oom(); ++ ub = get_beancounter(scope); ++ ++ read_lock(&tasklist_lock); ++retry: ++ p = select_bad_process(ub, NULL); ++ if (p == NULL || PTR_ERR(p) == -1UL) ++ goto unlock; ++ ++ if (oom_kill_process(p, (gfp_t)-1, -1, NULL, "UB Out of memory")) ++ goto retry; ++ ++ put_beancounter(ub); ++ ++unlock: ++ read_unlock(&tasklist_lock); ++ spin_unlock(&oom_lock); ++ cpuset_unlock(); ++} ++EXPORT_SYMBOL(ub_out_of_memory); +diff --git a/kernel/bc/proc.c b/kernel/bc/proc.c +new file mode 100644 +index 0000000..5b1ae4b +--- /dev/null ++++ b/kernel/bc/proc.c +@@ -0,0 +1,682 @@ ++/* ++ * kernel/bc/proc.c ++ * ++ * Copyright (C) 2006 OpenVZ. SWsoft Inc. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++/* Generic output formats */ ++#if BITS_PER_LONG == 32 ++const char *bc_proc_lu_fmt = "\t%-20s %10lu\n"; ++const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n"; ++const char *bc_proc_llu_fmt = "\t%-20s %21llu\n"; ++const char *bc_proc_lu_lu_fmt = "\t%-20s %10lu %10lu\n"; ++#else ++const char *bc_proc_lu_fmt = "\t%-20s %21lu\n"; ++const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n"; ++const char *bc_proc_llu_fmt = "\t%-20s %21llu\n"; ++const char *bc_proc_lu_lu_fmt = "\t%-20s %21lu %21lu\n"; ++#endif ++ ++#if BITS_PER_LONG == 32 ++static const char *head_fmt = "%10s %-12s %10s %10s %10s %10s %10s\n"; ++static const char *res_fmt = "%10s %-12s %10lu %10lu %10lu %10lu %10lu\n"; ++#else ++static const char *head_fmt = "%10s %-12s %20s %20s %20s %20s %20s\n"; ++static const char *res_fmt = "%10s %-12s %20lu %20lu %20lu %20lu %20lu\n"; ++#endif ++ ++static void ub_show_res(struct seq_file *f, struct user_beancounter *ub, ++ int r, int show_uid) ++{ ++ int len; ++ char ub_uid[64]; ++ ++ if (show_uid && r == 0) { ++ len = print_ub_uid(ub, ub_uid, sizeof(ub_uid) - 2); ++ ub_uid[len] = ':'; ++ ub_uid[len + 1] = '\0'; ++ } else ++ strcpy(ub_uid, ""); ++ ++ seq_printf(f, res_fmt, ub_uid, ub_rnames[r], ++ ub->ub_parms[r].held, ++ ub->ub_parms[r].maxheld, ++ ub->ub_parms[r].barrier, ++ ub->ub_parms[r].limit, ++ ub->ub_parms[r].failcnt); ++} ++ ++static void __show_resources(struct seq_file *f, struct user_beancounter *ub, ++ int show_uid) ++{ ++ int i; ++ ++ for (i = 0; i < UB_RESOURCES_COMPAT; i++) ++ if (strcmp(ub_rnames[i], "dummy") != 0) ++ ub_show_res(f, ub, i, show_uid); ++ ++ for (i = UB_RESOURCES_COMPAT; i < UB_RESOURCES; i++) ++ ub_show_res(f, ub, i, show_uid); ++} ++ ++static int bc_resources_show(struct seq_file *f, void *v) ++{ ++ 
__show_resources(f, seq_beancounter(f), 0); ++ return 0; ++} ++ ++static struct bc_proc_entry bc_resources_entry = { ++ .name = "resources", ++ .u.show = bc_resources_show, ++}; ++ ++#ifdef CONFIG_UBC_DEBUG ++static int bc_debug_show(struct seq_file *f, void *v) ++{ ++ struct user_beancounter *ub; ++ char buf[64]; ++ ++ ub = seq_beancounter(f); ++ print_ub_uid(ub, buf, sizeof(buf)); ++ seq_printf(f, "uid: %s\n", buf); ++ seq_printf(f, "ref: %d\n", atomic_read(&ub->ub_refcount)); ++ ++ seq_printf(f, "bc: %p\n", ub); ++ seq_printf(f, "par: %p\n", ub->parent); ++ seq_printf(f, "priv: %p\n", ub->private_data); ++ return 0; ++} ++ ++static struct bc_proc_entry bc_debug_entry = { ++ .name = "debug", ++ .u.show = bc_debug_show, ++}; ++#endif ++ ++static int ub_show(struct seq_file *f, void *v) ++{ ++ int i; ++ ++ for (i = 0; i < UB_RESOURCES_COMPAT; i++) ++ ub_show_res(f, (struct user_beancounter *)v, i, 1); ++ return 0; ++} ++ ++static int res_show(struct seq_file *f, void *v) ++{ ++ __show_resources(f, (struct user_beancounter *)v, 1); ++ return 0; ++} ++ ++static int ub_accessible(struct user_beancounter *exec, ++ struct user_beancounter *target) ++{ ++ struct user_beancounter *p, *q; ++ ++ p = top_beancounter(exec); ++ q = top_beancounter(target); ++ ++ return (p == get_ub0() || p == q); ++} ++ ++static void ub_show_header(struct seq_file *f) ++{ ++ seq_printf(f, "Version: 2.5\n"); ++ seq_printf(f, head_fmt, "uid", "resource", ++ "held", "maxheld", "barrier", "limit", "failcnt"); ++} ++ ++static void *ub_start(struct seq_file *f, loff_t *ppos) ++{ ++ struct user_beancounter *ub; ++ struct user_beancounter *exec_ub; ++ unsigned long pos; ++ ++ pos = *ppos; ++ if (pos == 0) ++ ub_show_header(f); ++ ++ exec_ub = get_exec_ub(); ++ ++ rcu_read_lock(); ++ for_each_beancounter(ub) { ++ if (ub->parent != NULL) ++ continue; ++ if (!ub_accessible(exec_ub, ub)) ++ continue; ++ if (pos-- == 0) ++ return ub; ++ } ++ return NULL; ++} ++ ++static void *ub_next(struct seq_file *f, void *v, loff_t *ppos) ++{ ++ struct user_beancounter *ub; ++ struct list_head *entry; ++ struct user_beancounter *exec_ub; ++ ++ exec_ub = get_exec_ub(); ++ ub = (struct user_beancounter *)v; ++ ++ entry = &ub->ub_list; ++ ++ list_for_each_continue_rcu(entry, &ub_list_head) { ++ ub = list_entry(entry, struct user_beancounter, ub_list); ++ if (ub->parent != NULL) ++ continue; ++ if (!ub_accessible(exec_ub, ub)) ++ continue; ++ ++ (*ppos)++; ++ return ub; ++ } ++ return NULL; ++} ++ ++static void ub_stop(struct seq_file *f, void *v) ++{ ++ rcu_read_unlock(); ++} ++ ++static struct seq_operations ub_seq_ops = { ++ .start = ub_start, ++ .next = ub_next, ++ .stop = ub_stop, ++ .show = ub_show, ++}; ++ ++static int ub_open(struct inode *inode, struct file *filp) ++{ ++ if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) ++ return -EACCES; ++ ++ return seq_open(filp, &ub_seq_ops); ++} ++ ++static struct file_operations ub_file_operations = { ++ .open = ub_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static struct seq_operations res_seq_ops = { ++ .start = ub_start, ++ .next = ub_next, ++ .stop = ub_stop, ++ .show = res_show, ++}; ++ ++static int res_open(struct inode *inode, struct file *filp) ++{ ++ if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) ++ return -EACCES; ++ ++ return seq_open(filp, &res_seq_ops); ++} ++ ++static struct file_operations resources_operations = { ++ .open = res_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, 
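++ /*
++ * No write method: the file is read-only and follows the usual
++ * seq_file pattern, with the capability checks done once in open.
++ */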
++}; ++ ++static struct bc_proc_entry bc_all_resources_entry = { ++ .name = "resources", ++ .u.fops = &resources_operations, ++}; ++ ++/* ++ * Generic showing stuff ++ */ ++ ++static int cookies, num_entries; ++static struct bc_proc_entry *bc_entries __read_mostly; ++static struct bc_proc_entry *bc_root_entries __read_mostly; ++static DEFINE_SPINLOCK(bc_entries_lock); ++static struct proc_dir_entry *bc_proc_root; ++ ++void bc_register_proc_entry(struct bc_proc_entry *e) ++{ ++ spin_lock(&bc_entries_lock); ++ e->cookie = ++cookies; ++ e->next = bc_entries; ++ bc_entries = e; ++ num_entries++; ++ spin_unlock(&bc_entries_lock); ++} ++ ++EXPORT_SYMBOL(bc_register_proc_entry); ++ ++void bc_register_proc_root_entry(struct bc_proc_entry *e) ++{ ++ spin_lock(&bc_entries_lock); ++ e->cookie = ++cookies; ++ e->next = bc_root_entries; ++ bc_root_entries = e; ++ bc_proc_root->nlink++; ++ spin_unlock(&bc_entries_lock); ++} ++ ++EXPORT_SYMBOL(bc_register_proc_root_entry); ++ ++/* ++ * small helpers ++ */ ++ ++static inline unsigned long bc_make_ino(struct user_beancounter *ub) ++{ ++ unsigned long ret; ++ ++ ret = 0xbc000000; ++ if (ub->parent) ++ ret |= ((ub->parent->ub_uid) << 4); ++ ret |= (ub->ub_uid + 1); ++ return ret; ++} ++ ++static inline unsigned long bc_make_file_ino(struct bc_proc_entry *de) ++{ ++ return 0xbe000000 + de->cookie; ++} ++ ++static int bc_d_delete(struct dentry *d) ++{ ++ return 1; ++} ++ ++static void bc_d_release(struct dentry *d) ++{ ++ put_beancounter((struct user_beancounter *)d->d_fsdata); ++} ++ ++static struct inode_operations bc_entry_iops; ++static struct file_operations bc_entry_fops; ++static struct dentry_operations bc_dentry_ops = { ++ .d_delete = bc_d_delete, ++ .d_release = bc_d_release, ++}; ++ ++/* ++ * common directory operations' helpers ++ */ ++ ++static int bc_readdir(struct file *file, filldir_t filler, void *data, ++ struct user_beancounter *parent) ++{ ++ int err = 0; ++ loff_t pos, filled; ++ struct user_beancounter *ub, *prev; ++ struct bc_proc_entry *pde; ++ ++ if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) ++ return -EPERM; ++ ++ pos = file->f_pos; ++ if (pos == 0) { ++ err = (*filler)(data, ".", 1, pos, ++ file->f_dentry->d_inode->i_ino, DT_DIR); ++ if (err < 0) { ++ err = 0; ++ goto out; ++ } ++ pos++; ++ } ++ ++ if (pos == 1) { ++ err = (*filler)(data, "..", 2, pos, ++ parent_ino(file->f_dentry), DT_DIR); ++ if (err < 0) { ++ err = 0; ++ goto out; ++ } ++ pos++; ++ } ++ ++ filled = 2; ++ for (pde = (parent == NULL ? 
bc_root_entries : bc_entries); ++ pde != NULL; pde = pde->next) { ++ if (filled++ < pos) ++ continue; ++ ++ err = (*filler)(data, pde->name, strlen(pde->name), pos, ++ bc_make_file_ino(pde), DT_REG); ++ if (err < 0) { ++ err = 0; ++ goto out; ++ } ++ pos++; ++ } ++ ++ rcu_read_lock(); ++ prev = NULL; ++ ub = list_entry(&ub_list_head, struct user_beancounter, ub_list); ++ while (1) { ++ int len; ++ unsigned long ino; ++ char buf[64]; ++ ++ ub = list_entry(rcu_dereference(ub->ub_list.next), ++ struct user_beancounter, ub_list); ++ if (&ub->ub_list == &ub_list_head) ++ break; ++ ++ if (ub->parent != parent) ++ continue; ++ ++ if (filled++ < pos) ++ continue; ++ ++ if (!get_beancounter_rcu(ub)) ++ continue; ++ ++ rcu_read_unlock(); ++ put_beancounter(prev); ++ ++ len = print_ub_uid(ub, buf, sizeof(buf)); ++ ino = bc_make_ino(ub); ++ ++ err = (*filler)(data, buf, len, pos, ino, DT_DIR); ++ if (err < 0) { ++ err = 0; ++ put_beancounter(ub); ++ goto out; ++ } ++ ++ rcu_read_lock(); ++ prev = ub; ++ pos++; ++ } ++ rcu_read_unlock(); ++ put_beancounter(prev); ++out: ++ file->f_pos = pos; ++ return err; ++} ++ ++static int bc_looktest(struct inode *ino, void *data) ++{ ++ return ino->i_op == &bc_entry_iops && ino->i_private == data; ++} ++ ++static int bc_lookset(struct inode *ino, void *data) ++{ ++ struct user_beancounter *ub; ++ ++ ub = (struct user_beancounter *)data; ++ ino->i_private = data; ++ ino->i_ino = bc_make_ino(ub); ++ ino->i_fop = &bc_entry_fops; ++ ino->i_op = &bc_entry_iops; ++ ino->i_mode = S_IFDIR | S_IRUSR | S_IXUGO; ++ /* subbeancounters are not included, but who cares? */ ++ ino->i_nlink = num_entries + 2; ++ ino->i_gid = 0; ++ ino->i_uid = 0; ++ return 0; ++} ++ ++static struct dentry *bc_lookup(struct user_beancounter *ub, struct inode *dir, ++ struct dentry *dentry) ++{ ++ struct inode *ino; ++ ++ ino = iget5_locked(dir->i_sb, ub->ub_uid, bc_looktest, bc_lookset, ub); ++ if (ino == NULL) ++ goto out_put; ++ ++ unlock_new_inode(ino); ++ dentry->d_op = &bc_dentry_ops; ++ dentry->d_fsdata = ub; ++ d_add(dentry, ino); ++ return NULL; ++ ++out_put: ++ put_beancounter(ub); ++ return ERR_PTR(-ENOENT); ++} ++ ++/* ++ * files (bc_proc_entry) manipulations ++ */ ++ ++static struct dentry *bc_lookup_file(struct inode *dir, ++ struct dentry *dentry, struct bc_proc_entry *root, ++ int (*test)(struct inode *, void *), ++ int (*set)(struct inode *, void *)) ++{ ++ struct bc_proc_entry *pde; ++ struct inode *ino; ++ ++ for (pde = root; pde != NULL; pde = pde->next) ++ if (strcmp(pde->name, dentry->d_name.name) == 0) ++ break; ++ ++ if (pde == NULL) ++ return ERR_PTR(-ESRCH); ++ ++ ino = iget5_locked(dir->i_sb, pde->cookie, test, set, pde); ++ if (ino == NULL) ++ return ERR_PTR(-ENOENT); ++ ++ unlock_new_inode(ino); ++ dentry->d_op = &bc_dentry_ops; ++ d_add(dentry, ino); ++ return NULL; ++} ++ ++static int bc_file_open(struct inode *ino, struct file *filp) ++{ ++ struct bc_proc_entry *de; ++ struct user_beancounter *ub; ++ ++ de = (struct bc_proc_entry *)ino->i_private; ++ ub = (struct user_beancounter *)filp->f_dentry->d_parent->d_fsdata; ++ BUG_ON(ub->ub_magic != UB_MAGIC); ++ ++ /* ++ * ub can't disappear: we hold d_parent, he holds the beancounter ++ */ ++ return single_open(filp, de->u.show, ub); ++} ++ ++static struct file_operations bc_file_ops = { ++ .open = bc_file_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++ ++static int bc_looktest_entry(struct inode *ino, void *data) ++{ ++ return ino->i_fop == &bc_file_ops && ino->i_private == data; 
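++ /*
++ * Serves as the "test" callback of iget5_locked(): an inode
++ * matches only if it was set up by bc_lookset_entry() for the
++ * same bc_proc_entry, so repeated lookups share one inode.
++ */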
++} ++ ++static int bc_lookset_entry(struct inode *ino, void *data) ++{ ++ struct bc_proc_entry *de; ++ ++ de = (struct bc_proc_entry *)data; ++ ino->i_private = data; ++ ino->i_ino = bc_make_file_ino(de); ++ ino->i_fop = &bc_file_ops, ++ ino->i_mode = S_IFREG | S_IRUSR; ++ ino->i_nlink = 1; ++ ino->i_gid = 0; ++ ino->i_uid = 0; ++ return 0; ++} ++ ++static inline struct dentry *bc_lookup_files(struct inode *dir, ++ struct dentry *de) ++{ ++ return bc_lookup_file(dir, de, bc_entries, ++ bc_looktest_entry, bc_lookset_entry); ++} ++ ++static int bc_looktest_root_entry(struct inode *ino, void *data) ++{ ++ struct bc_proc_entry *de; ++ ++ de = (struct bc_proc_entry *)data; ++ return ino->i_fop == de->u.fops && ino->i_private == data; ++} ++ ++static int bc_lookset_root_entry(struct inode *ino, void *data) ++{ ++ struct bc_proc_entry *de; ++ ++ de = (struct bc_proc_entry *)data; ++ ino->i_private = data; ++ ino->i_ino = bc_make_file_ino(de); ++ ino->i_fop = de->u.fops; ++ ino->i_mode = S_IFREG | S_IRUSR; ++ ino->i_nlink = 1; ++ ino->i_gid = 0; ++ ino->i_uid = 0; ++ return 0; ++} ++ ++static inline struct dentry *bc_lookup_root_files(struct inode *dir, ++ struct dentry *de) ++{ ++ return bc_lookup_file(dir, de, bc_root_entries, ++ bc_looktest_root_entry, bc_lookset_root_entry); ++} ++ ++/* ++ * /proc/bc/.../ directory operations ++ */ ++ ++static int bc_entry_readdir(struct file *file, void *data, filldir_t filler) ++{ ++ return bc_readdir(file, filler, data, ++ (struct user_beancounter *)file->f_dentry->d_fsdata); ++} ++ ++static struct dentry *bc_entry_lookup(struct inode *dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ int id; ++ char *end; ++ struct user_beancounter *par, *ub; ++ struct dentry *de; ++ ++ if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) ++ return ERR_PTR(-EPERM); ++ ++ de = bc_lookup_files(dir, dentry); ++ if (de != ERR_PTR(-ESRCH)) ++ return de; ++ ++ id = simple_strtol(dentry->d_name.name, &end, 10); ++ if (*end != '.') ++ return ERR_PTR(-ENOENT); ++ ++ par = (struct user_beancounter *)dir->i_private; ++ if (par->ub_uid != id) ++ return ERR_PTR(-ENOENT); ++ ++ id = simple_strtol(end + 1, &end, 10); ++ if (*end != '\0') ++ return ERR_PTR(-ENOENT); ++ ++ ub = get_subbeancounter_byid(par, id, 0); ++ if (ub == NULL) ++ return ERR_PTR(-ENOENT); ++ ++ return bc_lookup(ub, dir, dentry); ++} ++ ++static struct file_operations bc_entry_fops = { ++ .read = generic_read_dir, ++ .readdir = bc_entry_readdir, ++}; ++ ++static struct inode_operations bc_entry_iops = { ++ .lookup = bc_entry_lookup, ++}; ++ ++/* ++ * /proc/bc directory operations ++ */ ++ ++static int bc_root_readdir(struct file *file, void *data, filldir_t filler) ++{ ++ return bc_readdir(file, filler, data, NULL); ++} ++ ++static struct dentry *bc_root_lookup(struct inode *dir, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ int id; ++ char *end; ++ struct user_beancounter *ub; ++ struct dentry *de; ++ ++ if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH))) ++ return ERR_PTR(-EPERM); ++ ++ de = bc_lookup_root_files(dir, dentry); ++ if (de != ERR_PTR(-ESRCH)) ++ return de; ++ ++ id = simple_strtol(dentry->d_name.name, &end, 10); ++ if (*end != '\0') ++ return ERR_PTR(-ENOENT); ++ ++ ub = get_beancounter_byuid(id, 0); ++ if (ub == NULL) ++ return ERR_PTR(-ENOENT); ++ ++ return bc_lookup(ub, dir, dentry); ++} ++ ++static struct file_operations bc_root_fops = { ++ .read = generic_read_dir, ++ .readdir = bc_root_readdir, ++}; ++ ++static struct inode_operations bc_root_iops = { 
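++ /*
++ * Only lookup is needed here: bc_root_lookup() resolves named
++ * root entries (e.g. "resources") and numeric top-level uids like
++ * "101"; nested "101.2" names are handled one level down by
++ * bc_entry_lookup().
++ */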
++ .lookup = bc_root_lookup, ++}; ++ ++static int __init ub_init_proc(void) ++{ ++ struct proc_dir_entry *entry; ++ ++ bc_proc_root = create_proc_entry("bc", ++ S_IFDIR | S_IRUGO | S_IXUGO, NULL); ++ if (bc_proc_root == NULL) ++ panic("Can't create /proc/bc entry"); ++ ++ bc_proc_root->proc_fops = &bc_root_fops; ++ bc_proc_root->proc_iops = &bc_root_iops; ++ ++ bc_register_proc_entry(&bc_resources_entry); ++#ifdef CONFIG_UBC_DEBUG ++ bc_register_proc_entry(&bc_debug_entry); ++#endif ++ bc_register_proc_root_entry(&bc_all_resources_entry); ++ ++ entry = proc_create("user_beancounters", ++ S_IRUGO, &glob_proc_root, &ub_file_operations); ++ return 0; ++} ++ ++core_initcall(ub_init_proc); +diff --git a/kernel/bc/rss_pages.c b/kernel/bc/rss_pages.c +new file mode 100644 +index 0000000..391585e +--- /dev/null ++++ b/kernel/bc/rss_pages.c +@@ -0,0 +1,437 @@ ++/* ++ * kernel/bc/rss_pages.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++static struct kmem_cache *pb_cachep; ++spinlock_t pb_lock = SPIN_LOCK_UNLOCKED; ++static struct page_beancounter **pb_hash_table; ++static unsigned int pb_hash_mask; ++ ++/* ++ * Auxiliary staff ++ */ ++ ++static inline struct page_beancounter *next_page_pb(struct page_beancounter *p) ++{ ++ return list_entry(p->page_list.next, struct page_beancounter, ++ page_list); ++} ++ ++static inline struct page_beancounter *prev_page_pb(struct page_beancounter *p) ++{ ++ return list_entry(p->page_list.prev, struct page_beancounter, ++ page_list); ++} ++ ++/* ++ * Held pages manipulation ++ */ ++static inline void set_held_pages(struct user_beancounter *bc) ++{ ++ /* all three depend on ub_held_pages */ ++ __ub_update_physpages(bc); ++ __ub_update_oomguarpages(bc); ++ __ub_update_privvm(bc); ++} ++ ++static inline void do_dec_held_pages(struct user_beancounter *ub, int value) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_held_pages -= value; ++ set_held_pages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++static void dec_held_pages(struct user_beancounter *ub, int value) ++{ ++ for (; ub != NULL; ub = ub->parent) ++ do_dec_held_pages(ub, value); ++} ++ ++static inline void do_inc_held_pages(struct user_beancounter *ub, int value) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_held_pages += value; ++ set_held_pages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++static void inc_held_pages(struct user_beancounter *ub, int value) ++{ ++ for (; ub != NULL; ub = ub->parent) ++ do_inc_held_pages(ub, value); ++} ++ ++/* ++ * Alloc - free ++ */ ++ ++inline int pb_alloc(struct page_beancounter **pbc) ++{ ++ *pbc = kmem_cache_alloc(pb_cachep, GFP_KERNEL); ++ if (*pbc != NULL) { ++ (*pbc)->next_hash = NULL; ++ (*pbc)->pb_magic = PB_MAGIC; ++ } ++ return (*pbc == NULL); ++} ++ ++inline void pb_free(struct page_beancounter **pb) ++{ ++ if (*pb != NULL) { ++ kmem_cache_free(pb_cachep, *pb); ++ *pb = NULL; ++ } ++} ++ ++void pb_free_list(struct page_beancounter **p_pb) ++{ ++ struct page_beancounter *list, *pb; ++ ++ list = *p_pb; ++ if (list == PBC_COPY_SAME) ++ return; ++ ++ while (list) { ++ pb = list; ++ list = list->next_hash; ++ pb_free(&pb); ++ } ++ *p_pb = NULL; ++} ++ ++/* ++ * head -> -> -> ... 
++ */ ++static int __alloc_list(struct page_beancounter **head, int num) ++{ ++ struct page_beancounter *pb; ++ ++ while (num > 0) { ++ if (pb_alloc(&pb)) ++ return -1; ++ pb->next_hash = *head; ++ *head = pb; ++ num--; ++ } ++ ++ return num; ++} ++ ++/* ++ * Ensure that the list contains at least num elements. ++ * p_pb points to an initialized list, may be of the zero length. ++ * ++ * mm->page_table_lock should be held ++ */ ++int pb_alloc_list(struct page_beancounter **p_pb, int num) ++{ ++ struct page_beancounter *list; ++ ++ for (list = *p_pb; list != NULL && num; list = list->next_hash, num--); ++ if (!num) ++ return 0; ++ ++ /* ++ * *p_pb(after) *p_pb (before) ++ * \ \ ++ * -...-> -> ... ++ */ ++ if (__alloc_list(p_pb, num) < 0) ++ goto nomem; ++ return 0; ++ ++nomem: ++ pb_free_list(p_pb); ++ return -ENOMEM; ++} ++ ++/* ++ * Allocates a page_beancounter for each ++ * user_beancounter in a hash ++ */ ++int pb_alloc_all(struct page_beancounter **pbs) ++{ ++ int need_alloc; ++ struct user_beancounter *ub; ++ ++ need_alloc = 0; ++ rcu_read_lock(); ++ for_each_beancounter(ub) ++ need_alloc++; ++ rcu_read_unlock(); ++ ++ if (!__alloc_list(pbs, need_alloc)) ++ return 0; ++ ++ pb_free_list(pbs); ++ return -ENOMEM; ++} ++ ++/* ++ * Hash routines ++ */ ++ ++static inline int pb_hash(struct user_beancounter *ub, struct page *page) ++{ ++ return (page_to_pfn(page) + (ub->ub_uid << 10)) & pb_hash_mask; ++} ++ ++/* pb_lock should be held */ ++static inline void insert_pb(struct page_beancounter *p, struct page *page, ++ struct user_beancounter *ub, int hash) ++{ ++ p->page = page; ++ p->ub = get_beancounter(ub); ++ p->next_hash = pb_hash_table[hash]; ++ pb_hash_table[hash] = p; ++ inc_pbc_count(ub); ++} ++ ++/* ++ * Heart ++ */ ++ ++static int __pb_dup_ref(struct page *page, struct user_beancounter *bc, ++ int hash) ++{ ++ struct page_beancounter *p; ++ ++ for (p = pb_hash_table[hash]; ++ p != NULL && (p->page != page || p->ub != bc); ++ p = p->next_hash); ++ if (p == NULL) ++ return -1; ++ ++ PB_COUNT_INC(p->refcount); ++ return 0; ++} ++ ++static void __pb_add_ref(struct page *page, struct user_beancounter *bc, ++ struct page_beancounter **ppb, int hash) ++{ ++ struct page_beancounter *head, *p, **hp; ++ int shift; ++ ++ p = *ppb; ++ *ppb = p->next_hash; ++ ++ insert_pb(p, page, bc, hash); ++ hp = page_pblist(page); ++ head = *hp; ++ ++ if (head != NULL) { ++ /* ++ * Move the first element to the end of the list. ++ * List head (pb_head) is set to the next entry. ++ * Note that this code works even if head is the only element ++ * on the list (because it's cyclic). ++ */ ++ BUG_ON(head->pb_magic != PB_MAGIC); ++ *hp = next_page_pb(head); ++ PB_SHIFT_INC(head->refcount); ++ shift = PB_SHIFT_GET(head->refcount); ++ /* ++ * Update user beancounter, the share of head has been changed. ++ * Note that the shift counter is taken after increment. 
++ */ ++ dec_held_pages(head->ub, UB_PAGE_WEIGHT >> shift); ++ /* add the new page beancounter to the end of the list */ ++ head = *hp; ++ list_add_tail(&p->page_list, &head->page_list); ++ } else { ++ *hp = p; ++ shift = 0; ++ INIT_LIST_HEAD(&p->page_list); ++ } ++ ++ p->refcount = PB_REFCOUNT_MAKE(shift, 1); ++ /* update user beancounter for the new page beancounter */ ++ inc_held_pages(bc, UB_PAGE_WEIGHT >> shift); ++} ++ ++void pb_add_ref(struct page *page, struct mm_struct *mm, ++ struct page_beancounter **p_pb) ++{ ++ int hash; ++ struct user_beancounter *bc; ++ ++ bc = mm->mm_ub; ++ if (bc == NULL) ++ return; ++ ++ if (!PageAnon(page) && is_shmem_mapping(page->mapping)) ++ return; ++ ++ hash = pb_hash(bc, page); ++ ++ spin_lock(&pb_lock); ++ if (__pb_dup_ref(page, bc, hash)) ++ __pb_add_ref(page, bc, p_pb, hash); ++ spin_unlock(&pb_lock); ++} ++ ++void pb_dup_ref(struct page *page, struct mm_struct *mm, ++ struct page_beancounter **p_pb) ++{ ++ int hash; ++ struct user_beancounter *bc; ++ ++ bc = mm->mm_ub; ++ if (bc == NULL) ++ return; ++ ++ if (!PageAnon(page) && is_shmem_mapping(page->mapping)) ++ return; ++ ++ hash = pb_hash(bc, page); ++ ++ spin_lock(&pb_lock); ++ if (*page_pblist(page) == NULL) ++ /* ++ * pages like ZERO_PAGE must not be accounted in pbc ++ * so on fork we just skip them ++ */ ++ goto out_unlock; ++ ++ if (unlikely(*p_pb != PBC_COPY_SAME)) ++ __pb_add_ref(page, bc, p_pb, hash); ++ else if (unlikely(__pb_dup_ref(page, bc, hash))) ++ WARN_ON(1); ++out_unlock: ++ spin_unlock(&pb_lock); ++} ++ ++void pb_remove_ref(struct page *page, struct mm_struct *mm) ++{ ++ int hash; ++ struct user_beancounter *bc; ++ struct page_beancounter *p, **q, *f; ++ int shift, shiftt; ++ ++ bc = mm->mm_ub; ++ if (bc == NULL) ++ return; ++ ++ if (!PageAnon(page) && is_shmem_mapping(page->mapping)) ++ return; ++ ++ hash = pb_hash(bc, page); ++ ++ spin_lock(&pb_lock); ++ for (q = pb_hash_table + hash, p = *q; ++ p != NULL && (p->page != page || p->ub != bc); ++ q = &p->next_hash, p = *q); ++ if (p == NULL) ++ goto out_unlock; ++ ++ PB_COUNT_DEC(p->refcount); ++ if (PB_COUNT_GET(p->refcount)) ++ /* ++ * More references from the same user beancounter exist. ++ * Nothing needs to be done. ++ */ ++ goto out_unlock; ++ ++ /* remove from the hash list */ ++ f = p; ++ *q = p->next_hash; ++ ++ shift = PB_SHIFT_GET(p->refcount); ++ ++ dec_held_pages(p->ub, UB_PAGE_WEIGHT >> shift); ++ ++ q = page_pblist(page); ++ if (*q == p) { ++ if (list_empty(&p->page_list)) { ++ *q = NULL; ++ goto out_free; ++ } ++ ++ *q = next_page_pb(p); ++ } ++ list_del(&p->page_list); ++ ++ /* Now balance the list. Move the tail and adjust its shift counter. */ ++ p = prev_page_pb(*q); ++ shiftt = PB_SHIFT_GET(p->refcount); ++ *q = p; ++ PB_SHIFT_DEC(p->refcount); ++ ++ inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); ++ ++ /* ++ * If the shift counter of the moved beancounter is different from the ++ * removed one's, repeat the procedure for one more tail beancounter ++ */ ++ if (shiftt > shift) { ++ p = prev_page_pb(*q); ++ *q = p; ++ PB_SHIFT_DEC(p->refcount); ++ inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); ++ } ++out_free: ++ dec_pbc_count(f->ub); ++ spin_unlock(&pb_lock); ++ ++ put_beancounter(f->ub); ++ pb_free(&f); ++ return; ++ ++out_unlock: ++ spin_unlock(&pb_lock); ++} ++ ++struct user_beancounter *pb_grab_page_ub(struct page *page) ++{ ++ struct page_beancounter *pb; ++ struct user_beancounter *ub; ++ ++ spin_lock(&pb_lock); ++ pb = *page_pblist(page); ++ ub = (pb == NULL ? 
ERR_PTR(-EINVAL) : ++ get_beancounter(pb->ub)); ++ spin_unlock(&pb_lock); ++ return ub; ++} ++ ++void __init ub_init_pbc(void) ++{ ++ unsigned long hash_size; ++ ++ pb_cachep = kmem_cache_create("page_beancounter", ++ sizeof(struct page_beancounter), 0, ++ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); ++ hash_size = num_physpages >> 2; ++ for (pb_hash_mask = 1; ++ (hash_size & pb_hash_mask) != hash_size; ++ pb_hash_mask = (pb_hash_mask << 1) + 1); ++ hash_size = pb_hash_mask + 1; ++ printk(KERN_INFO "Page beancounter hash is %lu entries.\n", hash_size); ++ pb_hash_table = vmalloc(hash_size * sizeof(struct page_beancounter *)); ++ memset(pb_hash_table, 0, hash_size * sizeof(struct page_beancounter *)); ++ ++ ub_init_io(pb_cachep); ++} +diff --git a/kernel/bc/statd.c b/kernel/bc/statd.c +new file mode 100644 +index 0000000..bf6354b +--- /dev/null ++++ b/kernel/bc/statd.c +@@ -0,0 +1,453 @@ ++/* ++ * kernel/bc/statd.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++#include ++#include ++ ++static spinlock_t ubs_notify_lock = SPIN_LOCK_UNLOCKED; ++static LIST_HEAD(ubs_notify_list); ++static long ubs_min_interval; ++static ubstattime_t ubs_start_time, ubs_end_time; ++static struct timer_list ubs_timer; ++ ++static int ubstat_get_list(void __user *buf, long size) ++{ ++ int retval; ++ struct user_beancounter *ub, *ubp; ++ long *page, *ptr, *end; ++ int len; ++ ++ page = (long *)__get_free_page(GFP_KERNEL); ++ if (page == NULL) ++ return -ENOMEM; ++ ++ retval = 0; ++ ubp = NULL; ++ ptr = page; ++ end = page + PAGE_SIZE / sizeof(*ptr); ++ ++ spin_lock_irq(&ub_hash_lock); ++ for_each_beancounter(ub) { ++ if (ub->parent != NULL) ++ continue; ++ *ptr++ = ub->ub_uid; ++ if (ptr != end) ++ continue; ++ ++ get_beancounter(ub); ++ spin_unlock_irq(&ub_hash_lock); ++ ++ put_beancounter(ubp); ++ ubp = ub; ++ ++ len = min_t(long, (ptr - page) * sizeof(*ptr), size); ++ if (copy_to_user(buf, page, len)) { ++ retval = -EFAULT; ++ goto out_put; ++ } ++ retval += len; ++ if (len < PAGE_SIZE) ++ goto out_put; ++ buf += len; ++ size -= len; ++ ++ ptr = page; ++ end = page + PAGE_SIZE / sizeof(*ptr); ++ ++ spin_lock_irq(&ub_hash_lock); ++ } ++ spin_unlock_irq(&ub_hash_lock); ++ ++ put_beancounter(ubp); ++ size = min_t(long, (ptr - page) * sizeof(*ptr), size); ++ if (size > 0 && copy_to_user(buf, page, size)) { ++ retval = -EFAULT; ++ goto out_put; ++ } ++ retval += size; ++ ++out_put: ++ put_beancounter(ubp); ++ free_page((unsigned long)page); ++ return retval; ++} ++ ++static int ubstat_gettime(void __user *buf, long size) ++{ ++ ubgettime_t data; ++ int retval; ++ ++ spin_lock(&ubs_notify_lock); ++ data.start_time = ubs_start_time; ++ data.end_time = ubs_end_time; ++ data.cur_time = ubs_start_time + (jiffies - ubs_start_time * HZ) / HZ; ++ spin_unlock(&ubs_notify_lock); ++ ++ retval = min_t(long, sizeof(data), size); ++ if (copy_to_user(buf, &data, retval)) ++ retval = -EFAULT; ++ return retval; ++} ++ ++static int ubstat_do_read_one(struct user_beancounter *ub, int res, void *kbuf) ++{ ++ struct { ++ ubstattime_t start_time; ++ ubstattime_t end_time; ++ ubstatparm_t param[1]; ++ } *data; ++ ++ data = kbuf; ++ data->start_time = ubs_start_time; ++ data->end_time = ubs_end_time; ++ ++ data->param[0].maxheld = ub->ub_store[res].maxheld; ++ data->param[0].failcnt = ub->ub_store[res].failcnt; ++ ++ return 
sizeof(*data);
++}
++
++static int ubstat_do_read_all(struct user_beancounter *ub, void *kbuf, int size)
++{
++	int wrote;
++	struct {
++		ubstattime_t start_time;
++		ubstattime_t end_time;
++		ubstatparm_t param[UB_RESOURCES];
++	} *data;
++	int resource;
++
++	data = kbuf;
++	data->start_time = ubs_start_time;
++	data->end_time = ubs_end_time;
++	wrote = sizeof(data->start_time) + sizeof(data->end_time);
++
++	for (resource = 0; resource < UB_RESOURCES; resource++) {
++		if (size < wrote + sizeof(data->param[resource]))
++			break;
++		data->param[resource].maxheld = ub->ub_store[resource].maxheld;
++		data->param[resource].failcnt = ub->ub_store[resource].failcnt;
++		wrote += sizeof(data->param[resource]);
++	}
++
++	return wrote;
++}
++
++static int ubstat_do_read_full(struct user_beancounter *ub, void *kbuf,
++		int size)
++{
++	int wrote;
++	struct {
++		ubstattime_t start_time;
++		ubstattime_t end_time;
++		ubstatparmf_t param[UB_RESOURCES];
++	} *data;
++	int resource;
++
++	data = kbuf;
++	data->start_time = ubs_start_time;
++	data->end_time = ubs_end_time;
++	wrote = sizeof(data->start_time) + sizeof(data->end_time);
++
++	for (resource = 0; resource < UB_RESOURCES; resource++) {
++		if (size < wrote + sizeof(data->param[resource]))
++			break;
++		/* The beginning of ubstatparmf_t matches struct ubparm. */
++		memcpy(&data->param[resource], &ub->ub_store[resource],
++				sizeof(ub->ub_store[resource]));
++		data->param[resource].__unused1 = 0;
++		data->param[resource].__unused2 = 0;
++		wrote += sizeof(data->param[resource]);
++	}
++	return wrote;
++}
++
++static int ubstat_get_stat(struct user_beancounter *ub, long cmd,
++		void __user *buf, long size)
++{
++	void *kbuf;
++	int retval;
++
++	kbuf = (void *)__get_free_page(GFP_KERNEL);
++	if (kbuf == NULL)
++		return -ENOMEM;
++
++	spin_lock(&ubs_notify_lock);
++	switch (UBSTAT_CMD(cmd)) {
++	case UBSTAT_READ_ONE:
++		retval = -EINVAL;
++		if (UBSTAT_PARMID(cmd) >= UB_RESOURCES)
++			break;
++		retval = ubstat_do_read_one(ub,
++				UBSTAT_PARMID(cmd), kbuf);
++		break;
++	case UBSTAT_READ_ALL:
++		retval = ubstat_do_read_all(ub, kbuf, PAGE_SIZE);
++		break;
++	case UBSTAT_READ_FULL:
++		retval = ubstat_do_read_full(ub, kbuf, PAGE_SIZE);
++		break;
++	default:
++		retval = -EINVAL;
++	}
++	spin_unlock(&ubs_notify_lock);
++
++	if (retval > 0) {
++		retval = min_t(long, retval, size);
++		if (copy_to_user(buf, kbuf, retval))
++			retval = -EFAULT;
++	}
++
++	free_page((unsigned long)kbuf);
++	return retval;
++}
++
++static int ubstat_handle_notifrq(ubnotifrq_t *req)
++{
++	int retval;
++	struct ub_stat_notify *new_notify;
++	struct list_head *entry;
++	struct task_struct *tsk_to_free;
++
++	/* allocate the full structure, not just a pointer's worth */
++	new_notify = kmalloc(sizeof(*new_notify), GFP_KERNEL);
++	if (new_notify == NULL)
++		return -ENOMEM;
++
++	tsk_to_free = NULL;
++	INIT_LIST_HEAD(&new_notify->list);
++
++	spin_lock(&ubs_notify_lock);
++	list_for_each(entry, &ubs_notify_list) {
++		struct ub_stat_notify *notify;
++
++		notify = list_entry(entry, struct ub_stat_notify, list);
++		if (notify->task == current) {
++			kfree(new_notify);
++			new_notify = notify;
++			break;
++		}
++	}
++
++	retval = -EINVAL;
++	if (req->maxinterval < 1)
++		goto out_unlock;
++	if (req->maxinterval > TIME_MAX_SEC)
++		req->maxinterval = TIME_MAX_SEC;
++	if (req->maxinterval < ubs_min_interval) {
++		unsigned long dif;
++
++		ubs_min_interval = req->maxinterval;
++		dif = (ubs_timer.expires - jiffies + HZ - 1) / HZ;
++		if (dif > req->maxinterval)
++			mod_timer(&ubs_timer,
++					ubs_timer.expires -
++					(dif - req->maxinterval) * HZ);
++	}
++
++	if (entry != 
&ubs_notify_list) { ++ list_del(&new_notify->list); ++ tsk_to_free = new_notify->task; ++ } ++ if (req->signum) { ++ new_notify->task = current; ++ get_task_struct(new_notify->task); ++ new_notify->signum = req->signum; ++ list_add(&new_notify->list, &ubs_notify_list); ++ } else ++ kfree(new_notify); ++ retval = 0; ++out_unlock: ++ spin_unlock(&ubs_notify_lock); ++ if (tsk_to_free != NULL) ++ put_task_struct(tsk_to_free); ++ return retval; ++} ++ ++/* ++ * former sys_ubstat ++ */ ++long do_ubstat(int func, unsigned long arg1, unsigned long arg2, ++ void __user *buf, long size) ++{ ++ int retval; ++ struct user_beancounter *ub; ++ ++ if (func == UBSTAT_UBPARMNUM) ++ return UB_RESOURCES; ++ if (func == UBSTAT_UBLIST) ++ return ubstat_get_list(buf, size); ++ if (!(capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))) ++ return -EPERM; ++ ++ if (func == UBSTAT_GETTIME) { ++ retval = ubstat_gettime(buf, size); ++ goto notify; ++ } ++ ++ ub = get_exec_ub(); ++ if (ub != NULL && ub->ub_uid == arg1) ++ get_beancounter(ub); ++ else /* FIXME must be if (ve_is_super) */ ++ ub = get_beancounter_byuid(arg1, 0); ++ ++ if (ub == NULL) ++ return -ESRCH; ++ ++ retval = ubstat_get_stat(ub, func, buf, size); ++ put_beancounter(ub); ++notify: ++ /* Handle request for notification */ ++ if (retval >= 0) { ++ ubnotifrq_t notifrq; ++ int err; ++ ++ err = -EFAULT; ++ if (!copy_from_user(¬ifrq, (void __user *)arg2, ++ sizeof(notifrq))) ++ err = ubstat_handle_notifrq(¬ifrq); ++ if (err) ++ retval = err; ++ } ++ ++ return retval; ++} ++ ++static void ubstat_save_onestat(struct user_beancounter *ub) ++{ ++ int resource; ++ ++ /* called with local irq disabled */ ++ spin_lock(&ub->ub_lock); ++ for (resource = 0; resource < UB_RESOURCES; resource++) { ++ memcpy(&ub->ub_store[resource], &ub->ub_parms[resource], ++ sizeof(struct ubparm)); ++ ub->ub_parms[resource].minheld = ++ ub->ub_parms[resource].maxheld = ++ ub->ub_parms[resource].held; ++ } ++ spin_unlock(&ub->ub_lock); ++} ++ ++static void ubstat_save_statistics(void) ++{ ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ local_irq_save(flags); ++ for_each_beancounter (ub) ++ ubstat_save_onestat(ub); ++ local_irq_restore(flags); ++} ++ ++static void ubstatd_timeout(unsigned long __data) ++{ ++ struct task_struct *p; ++ ++ p = (struct task_struct *) __data; ++ wake_up_process(p); ++} ++ ++/* ++ * Safe wrapper for send_sig. It prevents a race with release_task ++ * for sighand. ++ * Should be called under tasklist_lock. 
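++ *
++ * The intended calling pattern (a sketch of what do_notifies() below
++ * actually does) is roughly:
++ *
++ *	read_lock(&tasklist_lock);
++ *	task_send_sig(notify);	(notify->task->sighand cannot be freed here)
++ *	read_unlock(&tasklist_lock);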
++ */ ++static void task_send_sig(struct ub_stat_notify *notify) ++{ ++ if (likely(notify->task->sighand != NULL)) ++ send_sig(notify->signum, notify->task, 1); ++} ++ ++static inline void do_notifies(void) ++{ ++ LIST_HEAD(notif_free_list); ++ struct ub_stat_notify *notify; ++ struct ub_stat_notify *tmp; ++ ++ spin_lock(&ubs_notify_lock); ++ ubs_start_time = ubs_end_time; ++ /* ++ * the expression below relies on time being unsigned long and ++ * arithmetic promotion rules ++ */ ++ ubs_end_time += (ubs_timer.expires - ubs_start_time * HZ) / HZ; ++ mod_timer(&ubs_timer, ubs_timer.expires + ubs_min_interval * HZ); ++ ubs_min_interval = TIME_MAX_SEC; ++ /* save statistics accumulated for the interval */ ++ ubstat_save_statistics(); ++ /* send signals */ ++ read_lock(&tasklist_lock); ++ while (!list_empty(&ubs_notify_list)) { ++ notify = list_entry(ubs_notify_list.next, ++ struct ub_stat_notify, list); ++ task_send_sig(notify); ++ list_del(¬ify->list); ++ list_add(¬ify->list, ¬if_free_list); ++ } ++ read_unlock(&tasklist_lock); ++ spin_unlock(&ubs_notify_lock); ++ ++ list_for_each_entry_safe(notify, tmp, ¬if_free_list, list) { ++ put_task_struct(notify->task); ++ kfree(notify); ++ } ++} ++ ++/* ++ * Kernel thread ++ */ ++static int ubstatd(void *unused) ++{ ++ /* daemonize call will take care of signals */ ++ daemonize("ubstatd"); ++ ++ ubs_timer.data = (unsigned long)current; ++ ubs_timer.function = ubstatd_timeout; ++ add_timer(&ubs_timer); ++ ++ while (1) { ++ set_task_state(current, TASK_INTERRUPTIBLE); ++ if (time_after(ubs_timer.expires, jiffies)) { ++ schedule(); ++ try_to_freeze(); ++ continue; ++ } ++ ++ __set_task_state(current, TASK_RUNNING); ++ do_notifies(); ++ } ++ return 0; ++} ++ ++static int __init ubstatd_init(void) ++{ ++ init_timer(&ubs_timer); ++ ubs_timer.expires = TIME_MAX_JIF; ++ ubs_min_interval = TIME_MAX_SEC; ++ ubs_start_time = ubs_end_time = 0; ++ ++ kernel_thread(ubstatd, NULL, 0); ++ return 0; ++} ++ ++module_init(ubstatd_init); +diff --git a/kernel/bc/sys.c b/kernel/bc/sys.c +new file mode 100644 +index 0000000..798166b +--- /dev/null ++++ b/kernel/bc/sys.c +@@ -0,0 +1,173 @@ ++/* ++ * kernel/bc/sys.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include ++ ++/* ++ * The (rather boring) getluid syscall ++ */ ++asmlinkage long sys_getluid(void) ++{ ++ struct user_beancounter *ub; ++ ++ ub = get_exec_ub(); ++ if (ub == NULL) ++ return -EINVAL; ++ ++ return ub->ub_uid; ++} ++ ++/* ++ * The setluid syscall ++ */ ++asmlinkage long sys_setluid(uid_t uid) ++{ ++ struct user_beancounter *ub; ++ struct task_beancounter *task_bc; ++ int error; ++ ++ task_bc = ¤t->task_bc; ++ ++ /* You may not disown a setluid */ ++ error = -EINVAL; ++ if (uid == (uid_t)-1) ++ goto out; ++ ++ /* You may only set an ub as root */ ++ error = -EPERM; ++ if (!capable(CAP_SETUID)) ++ goto out; ++ /* ++ * The ub once set is irrevocable to all ++ * unless it's set from ve0. 
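++	 *
++	 * A hypothetical usage sketch from ve0, assuming the OpenVZ
++	 * syscall numbers are wired up on this architecture:
++	 *
++	 *	syscall(__NR_setluid, 123);	(bind current task to UB 123)
++	 *	... then fork()/exec() the container payload, which
++	 *	inherits beancounter 123 ...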
++ */ ++ if (!ve_is_super(get_exec_env())) ++ goto out; ++ ++ /* Ok - set up a beancounter entry for this user */ ++ error = -ENOBUFS; ++ ub = get_beancounter_byuid(uid, 1); ++ if (ub == NULL) ++ goto out; ++ ++ ub_debug(UBD_ALLOC | UBD_LIMIT, "setluid, bean %p (count %d) " ++ "for %.20s pid %d\n", ++ ub, atomic_read(&ub->ub_refcount), ++ current->comm, current->pid); ++ /* install bc */ ++ error = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_NEWUBC, ub); ++ if (!(error & NOTIFY_FAIL)) { ++ put_beancounter(task_bc->exec_ub); ++ task_bc->exec_ub = ub; ++ if (!(error & NOTIFY_OK)) { ++ put_beancounter(task_bc->fork_sub); ++ task_bc->fork_sub = get_beancounter(ub); ++ } ++ error = 0; ++ } else { ++ put_beancounter(ub); ++ error = -ENOBUFS; ++ } ++out: ++ return error; ++} ++ ++long do_setublimit(uid_t uid, unsigned long resource, ++ unsigned long *new_limits) ++{ ++ int error; ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ error = -EPERM; ++ if(!capable(CAP_SYS_RESOURCE)) ++ goto out; ++ ++ if (!ve_is_super(get_exec_env())) ++ goto out; ++ ++ error = -EINVAL; ++ if (resource >= UB_RESOURCES) ++ goto out; ++ ++ error = -EINVAL; ++ if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE) ++ goto out; ++ ++ error = -ENOENT; ++ ub = get_beancounter_byuid(uid, 0); ++ if (ub == NULL) { ++ ub_debug(UBD_LIMIT, "No login bc for uid %d\n", uid); ++ goto out; ++ } ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_parms[resource].barrier = new_limits[0]; ++ ub->ub_parms[resource].limit = new_limits[1]; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ put_beancounter(ub); ++ ++ error = 0; ++out: ++ return error; ++} ++ ++/* ++ * The setbeanlimit syscall ++ */ ++asmlinkage long sys_setublimit(uid_t uid, unsigned long resource, ++ unsigned long __user *limits) ++{ ++ unsigned long new_limits[2]; ++ ++ if (copy_from_user(&new_limits, limits, sizeof(new_limits))) ++ return -EFAULT; ++ ++ return do_setublimit(uid, resource, new_limits); ++} ++ ++extern long do_ubstat(int func, unsigned long arg1, unsigned long arg2, ++ void __user *buf, long size); ++asmlinkage long sys_ubstat(int func, unsigned long arg1, unsigned long arg2, ++ void __user *buf, long size) ++{ ++ if (!ve_is_super(get_exec_env())) ++ return -EPERM; ++ ++ return do_ubstat(func, arg1, arg2, buf, size); ++} ++ ++#ifdef CONFIG_COMPAT ++asmlinkage long compat_sys_setublimit(uid_t uid, int resource, ++ unsigned int __user *limits) ++{ ++ unsigned int u_new_limits[2]; ++ unsigned long new_limits[2]; ++ ++ if (copy_from_user(&u_new_limits, limits, sizeof(u_new_limits))) ++ return -EFAULT; ++ ++ new_limits[0] = u_new_limits[0]; ++ new_limits[1] = u_new_limits[1]; ++ ++ return do_setublimit(uid, resource, new_limits); ++} ++ ++asmlinkage long compat_sys_ubstat(int func, unsigned int arg1, ++ unsigned int arg2, compat_uptr_t *buf, long size) ++{ ++ return sys_ubstat(func, arg1, arg2, buf, size); ++} ++#endif +diff --git a/kernel/bc/vm_pages.c b/kernel/bc/vm_pages.c +new file mode 100644 +index 0000000..e98134b +--- /dev/null ++++ b/kernel/bc/vm_pages.c +@@ -0,0 +1,549 @@ ++/* ++ * kernel/bc/vm_pages.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++#include ++#include ++ ++static inline unsigned long pages_in_pte_range(struct vm_area_struct *vma, ++ pmd_t *pmd, unsigned long addr, unsigned long end, ++ unsigned long *ret) ++{ ++ pte_t *pte; ++ spinlock_t *ptl; ++ ++ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); ++ do { ++ if (!pte_none(*pte) && pte_present(*pte)) ++ (*ret)++; ++ } while (pte++, addr += PAGE_SIZE, (addr != end)); ++ pte_unmap_unlock(pte - 1, ptl); ++ ++ return addr; ++} ++ ++static inline unsigned long pages_in_pmd_range(struct vm_area_struct *vma, ++ pud_t *pud, unsigned long addr, unsigned long end, ++ unsigned long *ret) ++{ ++ pmd_t *pmd; ++ unsigned long next; ++ ++ pmd = pmd_offset(pud, addr); ++ do { ++ next = pmd_addr_end(addr, end); ++ if (pmd_none_or_clear_bad(pmd)) ++ continue; ++ next = pages_in_pte_range(vma, pmd, addr, next, ret); ++ } while (pmd++, addr = next, (addr != end)); ++ ++ return addr; ++} ++ ++static inline unsigned long pages_in_pud_range(struct vm_area_struct *vma, ++ pgd_t *pgd, unsigned long addr, unsigned long end, ++ unsigned long *ret) ++{ ++ pud_t *pud; ++ unsigned long next; ++ ++ pud = pud_offset(pgd, addr); ++ do { ++ next = pud_addr_end(addr, end); ++ if (pud_none_or_clear_bad(pud)) ++ continue; ++ next = pages_in_pmd_range(vma, pud, addr, next, ret); ++ } while (pud++, addr = next, (addr != end)); ++ ++ return addr; ++} ++ ++unsigned long pages_in_vma_range(struct vm_area_struct *vma, ++ unsigned long addr, unsigned long end) ++{ ++ pgd_t *pgd; ++ unsigned long next; ++ unsigned long ret; ++ ++ ret = 0; ++ BUG_ON(addr >= end); ++ pgd = pgd_offset(vma->vm_mm, addr); ++ do { ++ next = pgd_addr_end(addr, end); ++ if (pgd_none_or_clear_bad(pgd)) ++ continue; ++ next = pages_in_pud_range(vma, pgd, addr, next, &ret); ++ } while (pgd++, addr = next, (addr != end)); ++ return ret; ++} ++ ++void __ub_update_physpages(struct user_beancounter *ub) ++{ ++ ub->ub_parms[UB_PHYSPAGES].held = ub->ub_tmpfs_respages ++ + (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT); ++ ub_adjust_maxheld(ub, UB_PHYSPAGES); ++} ++ ++void __ub_update_oomguarpages(struct user_beancounter *ub) ++{ ++ ub->ub_parms[UB_OOMGUARPAGES].held = ++ ub->ub_parms[UB_PHYSPAGES].held + ub->ub_swap_pages; ++ ub_adjust_maxheld(ub, UB_OOMGUARPAGES); ++} ++ ++void __ub_update_privvm(struct user_beancounter *ub) ++{ ++ ub->ub_parms[UB_PRIVVMPAGES].held = ++ (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT) ++ + ub->ub_unused_privvmpages ++ + ub->ub_parms[UB_SHMPAGES].held; ++ ub_adjust_maxheld(ub, UB_PRIVVMPAGES); ++} ++ ++static inline int __charge_privvm_locked(struct user_beancounter *ub, ++ unsigned long s, enum ub_severity strict) ++{ ++ if (__charge_beancounter_locked(ub, UB_PRIVVMPAGES, s, strict) < 0) ++ return -ENOMEM; ++ ++ ub->ub_unused_privvmpages += s; ++ return 0; ++} ++ ++static void __unused_privvm_dec_locked(struct user_beancounter *ub, ++ long size) ++{ ++ /* catch possible overflow */ ++ if (ub->ub_unused_privvmpages < size) { ++ uncharge_warn(ub, UB_UNUSEDPRIVVM, ++ size, ub->ub_unused_privvmpages); ++ size = ub->ub_unused_privvmpages; ++ } ++ ub->ub_unused_privvmpages -= size; ++ __ub_update_privvm(ub); ++} ++ ++void __ub_unused_privvm_dec(struct mm_struct *mm, long size) ++{ ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return; ++ ++ ub = top_beancounter(ub); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ 
__unused_privvm_dec_locked(ub, size); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++void ub_unused_privvm_sub(struct mm_struct *mm, ++ struct vm_area_struct *vma, unsigned long count) ++{ ++ if (VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) ++ __ub_unused_privvm_dec(mm, count); ++} ++ ++void ub_unused_privvm_add(struct mm_struct *mm, ++ struct vm_area_struct *vma, unsigned long size) ++{ ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL || !VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) ++ return; ++ ++ ub = top_beancounter(ub); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_unused_privvmpages += size; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++int ub_protected_charge(struct mm_struct *mm, unsigned long size, ++ unsigned long newflags, struct vm_area_struct *vma) ++{ ++ unsigned long flags; ++ struct file *file; ++ struct user_beancounter *ub; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return PRIVVM_NO_CHARGE; ++ ++ flags = vma->vm_flags; ++ if (!((newflags ^ flags) & VM_WRITE)) ++ return PRIVVM_NO_CHARGE; ++ ++ file = vma->vm_file; ++ if (!VM_UB_PRIVATE(newflags | VM_WRITE, file)) ++ return PRIVVM_NO_CHARGE; ++ ++ if (flags & VM_WRITE) ++ return PRIVVM_TO_SHARED; ++ ++ ub = top_beancounter(ub); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (__charge_privvm_locked(ub, size, UB_SOFT) < 0) ++ goto err; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return PRIVVM_TO_PRIVATE; ++ ++err: ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return PRIVVM_ERROR; ++} ++ ++int ub_memory_charge(struct mm_struct *mm, unsigned long size, ++ unsigned vm_flags, struct file *vm_file, int sv) ++{ ++ struct user_beancounter *ub, *ubl; ++ unsigned long flags; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return 0; ++ ++ size >>= PAGE_SHIFT; ++ if (size > UB_MAXVALUE) ++ return -EINVAL; ++ ++ BUG_ON(sv != UB_SOFT && sv != UB_HARD); ++ ++ if (vm_flags & VM_LOCKED) { ++ if (charge_beancounter(ub, UB_LOCKEDPAGES, size, sv)) ++ goto out_err; ++ } ++ if (VM_UB_PRIVATE(vm_flags, vm_file)) { ++ ubl = top_beancounter(ub); ++ spin_lock_irqsave(&ubl->ub_lock, flags); ++ if (__charge_privvm_locked(ubl, size, sv)) ++ goto out_private; ++ spin_unlock_irqrestore(&ubl->ub_lock, flags); ++ } ++ return 0; ++ ++out_private: ++ spin_unlock_irqrestore(&ubl->ub_lock, flags); ++ if (vm_flags & VM_LOCKED) ++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size); ++out_err: ++ return -ENOMEM; ++} ++ ++void ub_memory_uncharge(struct mm_struct *mm, unsigned long size, ++ unsigned vm_flags, struct file *vm_file) ++{ ++ struct user_beancounter *ub; ++ unsigned long flags; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return; ++ ++ size >>= PAGE_SHIFT; ++ ++ if (vm_flags & VM_LOCKED) ++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size); ++ if (VM_UB_PRIVATE(vm_flags, vm_file)) { ++ ub = top_beancounter(ub); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __unused_privvm_dec_locked(ub, size); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ } ++} ++ ++int ub_locked_charge(struct mm_struct *mm, unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return 0; ++ ++ return charge_beancounter(ub, UB_LOCKEDPAGES, ++ size >> PAGE_SHIFT, UB_HARD); ++} ++ ++void ub_locked_uncharge(struct mm_struct *mm, unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return; ++ ++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); ++} ++ ++int ub_lockedshm_charge(struct shmem_inode_info *shi, 
unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ ub = shi->shmi_ub; ++ if (ub == NULL) ++ return 0; ++ ++ return charge_beancounter(ub, UB_LOCKEDPAGES, ++ size >> PAGE_SHIFT, UB_HARD); ++} ++ ++void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ ub = shi->shmi_ub; ++ if (ub == NULL) ++ return; ++ ++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); ++} ++ ++ ++static inline void do_ub_tmpfs_respages_inc(struct user_beancounter *ub) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_tmpfs_respages++; ++ __ub_update_physpages(ub); ++ __ub_update_oomguarpages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++void ub_tmpfs_respages_inc(struct shmem_inode_info *shi) ++{ ++ struct user_beancounter *ub; ++ ++ for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) ++ do_ub_tmpfs_respages_inc(ub); ++} ++ ++static inline void do_ub_tmpfs_respages_sub(struct user_beancounter *ub, ++ unsigned long size) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ /* catch possible overflow */ ++ if (ub->ub_tmpfs_respages < size) { ++ uncharge_warn(ub, UB_TMPFSPAGES, ++ size, ub->ub_tmpfs_respages); ++ size = ub->ub_tmpfs_respages; ++ } ++ ub->ub_tmpfs_respages -= size; ++ /* update values what is the most interesting */ ++ __ub_update_physpages(ub); ++ __ub_update_oomguarpages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++void ub_tmpfs_respages_sub(struct shmem_inode_info *shi, ++ unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) ++ do_ub_tmpfs_respages_sub(ub, size); ++} ++ ++int ub_shmpages_charge(struct shmem_inode_info *shi, unsigned long size) ++{ ++ int ret; ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ ub = shi->shmi_ub; ++ if (ub == NULL) ++ return 0; ++ ++ ub = top_beancounter(ub); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ret = __charge_beancounter_locked(ub, UB_SHMPAGES, size, UB_HARD); ++ if (ret == 0) ++ __ub_update_privvm(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return ret; ++} ++ ++void ub_shmpages_uncharge(struct shmem_inode_info *shi, unsigned long size) ++{ ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ ub = shi->shmi_ub; ++ if (ub == NULL) ++ return; ++ ++ ub = top_beancounter(ub); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __uncharge_beancounter_locked(ub, UB_SHMPAGES, size); ++ __ub_update_privvm(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++#ifdef CONFIG_BC_SWAP_ACCOUNTING ++static inline void do_ub_swapentry_inc(struct user_beancounter *ub) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_swap_pages++; ++ __ub_update_oomguarpages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++void ub_swapentry_inc(struct swap_info_struct *si, pgoff_t num, ++ struct user_beancounter *ub) ++{ ++ si->swap_ubs[num] = get_beancounter(ub); ++ for (; ub != NULL; ub = ub->parent) ++ do_ub_swapentry_inc(ub); ++} ++EXPORT_SYMBOL(ub_swapentry_inc); ++ ++static inline void do_ub_swapentry_dec(struct user_beancounter *ub) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (ub->ub_swap_pages <= 0) ++ uncharge_warn(ub, UB_SWAPPAGES, 1, ub->ub_swap_pages); ++ else ++ ub->ub_swap_pages--; ++ __ub_update_oomguarpages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++void ub_swapentry_dec(struct swap_info_struct *si, pgoff_t num) 
++{ ++ struct user_beancounter *ub, *ubp; ++ ++ ub = si->swap_ubs[num]; ++ si->swap_ubs[num] = NULL; ++ for (ubp = ub; ubp != NULL; ubp = ubp->parent) ++ do_ub_swapentry_dec(ubp); ++ put_beancounter(ub); ++} ++EXPORT_SYMBOL(ub_swapentry_dec); ++ ++int ub_swap_init(struct swap_info_struct *si, pgoff_t num) ++{ ++ struct user_beancounter **ubs; ++ ++ ubs = vmalloc(num * sizeof(struct user_beancounter *)); ++ if (ubs == NULL) ++ return -ENOMEM; ++ ++ memset(ubs, 0, num * sizeof(struct user_beancounter *)); ++ si->swap_ubs = ubs; ++ return 0; ++} ++ ++void ub_swap_fini(struct swap_info_struct *si) ++{ ++ if (si->swap_ubs) { ++ vfree(si->swap_ubs); ++ si->swap_ubs = NULL; ++ } ++} ++#endif ++ ++static int vmguar_enough_memory(struct vnotifier_block *self, ++ unsigned long event, void *arg, int old_ret) ++{ ++ struct user_beancounter *ub; ++ ++ if (event != VIRTINFO_ENOUGHMEM) ++ return old_ret; ++ /* ++ * If it's a kernel thread, don't care about it. ++ * Added in order aufsd to run smoothly over ramfs. ++ */ ++ if (!current->mm) ++ return NOTIFY_DONE; ++ ++ ub = top_beancounter(current->mm->mm_ub); ++ if (ub->ub_parms[UB_PRIVVMPAGES].held > ++ ub->ub_parms[UB_VMGUARPAGES].barrier) ++ return old_ret; ++ ++ return NOTIFY_OK; ++} ++ ++static struct vnotifier_block vmguar_notifier_block = { ++ .notifier_call = vmguar_enough_memory ++}; ++ ++static int __init init_vmguar_notifier(void) ++{ ++ virtinfo_notifier_register(VITYPE_GENERAL, &vmguar_notifier_block); ++ return 0; ++} ++ ++static void __exit fini_vmguar_notifier(void) ++{ ++ virtinfo_notifier_unregister(VITYPE_GENERAL, &vmguar_notifier_block); ++} ++ ++module_init(init_vmguar_notifier); ++module_exit(fini_vmguar_notifier); ++ ++#ifdef CONFIG_PROC_FS ++static int bc_vmaux_show(struct seq_file *f, void *v) ++{ ++ struct user_beancounter *ub; ++ unsigned long swap, unmap; ++ int i; ++ ++ ub = seq_beancounter(f); ++ ++ swap = unmap = 0; ++ for_each_online_cpu(i) { ++ swap += per_cpu_ptr(ub->ub_percpu, i)->swapin; ++ unmap += per_cpu_ptr(ub->ub_percpu, i)->unmap; ++ } ++ ++ seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_UNUSEDPRIVVM], ++ ub->ub_unused_privvmpages); ++ seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_TMPFSPAGES], ++ ub->ub_tmpfs_respages); ++ seq_printf(f, bc_proc_lu_fmt, ub_rnames[UB_SWAPPAGES], ++ ub->ub_swap_pages); ++ ++ seq_printf(f, bc_proc_lu_fmt, "swapin", swap); ++ seq_printf(f, bc_proc_lu_fmt, "unmap", unmap); ++ return 0; ++} ++static struct bc_proc_entry bc_vmaux_entry = { ++ .name = "vmaux", ++ .u.show = bc_vmaux_show, ++}; ++ ++static int __init bc_vmaux_init(void) ++{ ++ bc_register_proc_entry(&bc_vmaux_entry); ++ return 0; ++} ++ ++late_initcall(bc_vmaux_init); ++#endif +diff --git a/kernel/capability.c b/kernel/capability.c +index 901e0fd..6618a51 100644 +--- a/kernel/capability.c ++++ b/kernel/capability.c +@@ -19,7 +19,8 @@ + * This lock protects task->cap_* for all tasks including current. + * Locking rule: acquire this prior to tasklist_lock. 
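+ *
+ * That is, the only safe nesting is:
+ *
+ *	spin_lock(&task_capability_lock);
+ *	read_lock(&tasklist_lock);
+ *	...
+ *	read_unlock(&tasklist_lock);
+ *	spin_unlock(&task_capability_lock);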
+ */ +-static DEFINE_SPINLOCK(task_capability_lock); ++DEFINE_SPINLOCK(task_capability_lock); ++EXPORT_SYMBOL(task_capability_lock); + + /* + * Leveraged for setting/resetting capabilities +@@ -242,7 +243,7 @@ static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective, + pgrp = find_vpid(pgrp_nr); + do_each_pid_task(pgrp, PIDTYPE_PGID, g) { + target = g; +- while_each_thread(g, target) { ++ while_each_thread_ve(g, target) { + if (!security_capset_check(target, effective, + inheritable, + permitted)) { +@@ -272,7 +273,7 @@ static inline int cap_set_all(kernel_cap_t *effective, + int ret = -EPERM; + int found = 0; + +- do_each_thread(g, target) { ++ do_each_thread_ve(g, target) { + if (target == current || is_container_init(target->group_leader)) + continue; + found = 1; +@@ -281,7 +282,7 @@ static inline int cap_set_all(kernel_cap_t *effective, + continue; + ret = 0; + security_capset_set(target, effective, inheritable, permitted); +- } while_each_thread(g, target); ++ } while_each_thread_ve(g, target); + + if (!found) + ret = 0; +diff --git a/kernel/cgroup.c b/kernel/cgroup.c +index 15ac0e1..e2735e2 100644 +--- a/kernel/cgroup.c ++++ b/kernel/cgroup.c +@@ -1809,7 +1809,7 @@ static void cgroup_enable_task_cg_lists(void) + struct task_struct *p, *g; + write_lock(&css_set_lock); + use_task_css_set_links = 1; +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + task_lock(p); + /* + * We should check if the process is exiting, otherwise +@@ -1819,7 +1819,7 @@ static void cgroup_enable_task_cg_lists(void) + if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) + list_add(&p->cg_list, &p->cgroups->tasks); + task_unlock(p); +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + write_unlock(&css_set_lock); + } + +@@ -2894,9 +2894,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) + again: + root = subsys->root; + if (root == &rootnode) { +- printk(KERN_INFO +- "Not cloning cgroup for unused subsystem %s\n", +- subsys->name); + mutex_unlock(&cgroup_mutex); + return 0; + } +diff --git a/kernel/cgroup_lite.c b/kernel/cgroup_lite.c +new file mode 100644 +index 0000000..92c0f76 +--- /dev/null ++++ b/kernel/cgroup_lite.c +@@ -0,0 +1,221 @@ ++/* ++ * lite cgroups engine ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define SUBSYS(_x) &_x ## _subsys, ++ ++static struct cgroup_subsys *subsys[] = { ++#include ++}; ++ ++static struct css_set init_css_set; ++static struct cgroup init_cgroup; ++static struct cftype *subsys_cftypes[CGROUP_SUBSYS_COUNT]; ++ ++static int init_css_set_subsystems(struct cgroup *g, struct css_set *set) ++{ ++ int i; ++ struct cgroup_subsys_state *ss; ++ ++ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ++ struct cgroup_subsys *cs = subsys[i]; ++ ++ ss = cs->create(cs, g); ++ if (IS_ERR(ss)) ++ goto destroy; ++ ++ g->subsys[i] = ss; ++ set->subsys[i] = ss; ++ atomic_set(&ss->refcnt, 0); ++ ss->cgroup = g; ++ } ++ return 0; ++ ++destroy: ++ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ++ struct cgroup_subsys *cs = subsys[i]; ++ ++ if (g->subsys[i]) ++ cs->destroy(cs, g); ++ } ++ return PTR_ERR(ss); ++} ++ ++int init_ve_cgroups(struct ve_struct *ve) ++{ ++ int err = -ENOMEM; ++ struct cgroup *g; ++ struct css_set *cs; ++ ++ g = kzalloc(sizeof(struct cgroup), GFP_KERNEL); ++ if (g == NULL) ++ goto err_galloc; ++ ++ cs = kzalloc(sizeof(struct css_set), GFP_KERNEL); ++ if (cs == NULL) ++ goto err_calloc; ++ ++ g->parent = &init_cgroup; ++ err = init_css_set_subsystems(g, cs); ++ if (err) ++ goto err_subsys; 
++ ++ g->parent = &init_cgroup; ++ ve->ve_cgroup = g; ++ ve->ve_css_set = cs; ++ return 0; ++ ++err_subsys: ++ kfree(cs); ++err_calloc: ++ kfree(g); ++err_galloc: ++ return err; ++} ++EXPORT_SYMBOL(init_ve_cgroups); ++ ++void fini_ve_cgroups(struct ve_struct *ve) ++{ ++ int i; ++ struct cgroup *g = ve->ve_cgroup; ++ struct css_set *css = ve->ve_css_set; ++ ++ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ++ struct cgroup_subsys *cs = subsys[i]; ++ struct cgroup_subsys_state *ss = css->subsys[i]; ++ ++ BUG_ON(ss != g->subsys[i]); ++ ++ if (cs->pre_destroy) ++ cs->pre_destroy(cs, g); ++ ++ if (atomic_read(&ss->refcnt)) ++ printk(KERN_ERR "CG: leaking %d/%s subsys\n", ++ ve->veid, subsys[i]->name); ++ else ++ cs->destroy(cs, g); ++ } ++ ++ kfree(g); ++ kfree(css); ++ ve->ve_cgroup = NULL; ++ ve->ve_css_set = NULL; ++} ++EXPORT_SYMBOL(fini_ve_cgroups); ++ ++/* ++ * task lifecycle ++ */ ++ ++void cgroup_fork(struct task_struct *child) ++{ ++ child->cgroups = current->cgroups; ++} ++ ++void cgroup_fork_callbacks(struct task_struct *child) ++{ ++} ++ ++void cgroup_post_fork(struct task_struct *child) ++{ ++} ++ ++void cgroup_exit(struct task_struct *tsk, int dummy) ++{ ++ tsk->cgroups = &init_css_set; ++} ++ ++int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ++{ ++ return -ENODATA; ++} ++ ++/* ++ * proc struts ++ */ ++ ++static int proc_cgroup_show(struct seq_file *m, void *v) ++{ ++ struct task_struct *tsk; ++ ++ tsk = pid_task((struct pid *)m->private, PIDTYPE_PID); ++ seq_printf(m, "%p\n", tsk->cgroups); ++ return 0; ++} ++ ++static int cgroup_open(struct inode *inode, struct file *file) ++{ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ return single_open(file, proc_cgroup_show, PROC_I(inode)->pid); ++} ++ ++struct file_operations proc_cgroup_operations = { ++ .open = cgroup_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++ ++/* ++ * cgroups misc struts ++ */ ++ ++int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, ++ const struct cftype cft[], int count) ++{ ++ int idx = subsys->subsys_id; ++ static DEFINE_SPINLOCK(add_files_lock); ++ ++ if (unlikely(subsys_cftypes[idx] == NULL)) { ++ spin_lock(&add_files_lock); ++ if (subsys_cftypes[idx] == NULL) ++ subsys_cftypes[idx] = (struct cftype *)cft; ++ spin_unlock(&add_files_lock); ++ } ++ ++ BUG_ON(subsys_cftypes[idx] != cft); ++ return 0; ++} ++ ++void cgroup_lock(void) ++{ ++} ++ ++void cgroup_unlock(void) ++{ ++} ++ ++ ++int cgroup_is_removed(const struct cgroup *cgrp) ++{ ++ return 0; ++} ++ ++int __init cgroup_init_early(void) ++{ ++ int i; ++ ++ init_task.cgroups = &init_css_set; ++ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) ++ BUG_ON(subsys[i]->early_init); ++ ++ return 0; ++} ++ ++int __init cgroup_init(void) ++{ ++ get_ve0()->ve_cgroup = &init_cgroup; ++ get_ve0()->ve_css_set = &init_css_set; ++ if (init_css_set_subsystems(&init_cgroup, &init_css_set) != 0) ++ panic("CG: Can't init initial set\n"); ++ return 0; ++} +diff --git a/kernel/compat.c b/kernel/compat.c +index 32c254a..58506ef 100644 +--- a/kernel/compat.c ++++ b/kernel/compat.c +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -40,7 +41,7 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user + __put_user(ts->tv_nsec, &cts->tv_nsec)) ? 
-EFAULT : 0; + } + +-static long compat_nanosleep_restart(struct restart_block *restart) ++long compat_nanosleep_restart(struct restart_block *restart) + { + struct compat_timespec __user *rmtp; + struct timespec rmt; +@@ -62,6 +63,7 @@ static long compat_nanosleep_restart(struct restart_block *restart) + + return ret; + } ++EXPORT_SYMBOL_GPL(compat_nanosleep_restart); + + asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, + struct compat_timespec __user *rmtp) +diff --git a/kernel/cpt/Makefile b/kernel/cpt/Makefile +new file mode 100644 +index 0000000..d97cc31 +--- /dev/null ++++ b/kernel/cpt/Makefile +@@ -0,0 +1,53 @@ ++# ++# ++# kernel/cpt/Makefile ++# ++# Copyright (C) 2000-2005 SWsoft ++# All rights reserved. ++# ++# Licensing governed by "linux/COPYING.SWsoft" file. ++ ++obj-$(CONFIG_VZ_CHECKPOINT) += vzcpt.o vzrst.o ++ ++vzcpt-objs := cpt_proc.o cpt_dump.o cpt_obj.o cpt_context.o cpt_process.o \ ++ cpt_mm.o cpt_files.o cpt_kernel.o \ ++ cpt_socket.o cpt_socket_in.o cpt_tty.o cpt_sysvipc.o cpt_net.o \ ++ cpt_conntrack.o cpt_epoll.o ++ ++vzrst-objs := rst_proc.o rst_undump.o rst_context.o rst_process.o \ ++ rst_mm.o rst_files.o \ ++ rst_socket.o rst_socket_in.o rst_tty.o rst_sysvipc.o rst_net.o \ ++ rst_conntrack.o rst_epoll.o ++ ++ifeq ($(CONFIG_BEANCOUNTERS), y) ++vzcpt-objs += cpt_ubc.o ++vzrst-objs += rst_ubc.o ++endif ++ ++ifeq ($(CONFIG_INOTIFY_USER), y) ++vzcpt-objs += cpt_inotify.o ++vzrst-objs += rst_inotify.o ++endif ++ ++vzrst-objs += cpt_exports.o ++ ++ifeq ($(CONFIG_VZ_CHECKPOINT), m) ++vzrst-objs += cpt_obj.o cpt_kernel.o ++endif ++ ++ifeq ($(CONFIG_VZ_CHECKPOINT_ITER), y) ++vzcpt-objs += cpt_iterative.o ++vzrst-objs += rst_iterative.o ++endif ++ ++ifeq ($(CONFIG_VZ_CHECKPOINT_LAZY), y) ++vzcpt-objs += cpt_pagein.o ++vzrst-objs += rst_pagein.o ++endif ++ ++ifeq ($(CONFIG_X86_64), y) ++vzcpt-objs += cpt_x8664.o ++ifeq ($(CONFIG_VZ_CHECKPOINT), m) ++vzrst-objs += cpt_x8664.o ++endif ++endif +diff --git a/kernel/cpt/cpt_conntrack.c b/kernel/cpt/cpt_conntrack.c +new file mode 100644 +index 0000000..19dcf32 +--- /dev/null ++++ b/kernel/cpt/cpt_conntrack.c +@@ -0,0 +1,365 @@ ++/* ++ * ++ * kernel/cpt/cpt_conntrack.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#if defined(CONFIG_VE_IPTABLES) && \ ++ (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++ ++/* How does it work? ++ * ++ * Network is disabled, so new conntrack entries will not appear. ++ * However, some of them can disappear because of timeouts. ++ * ++ * So, we take read_lock, collect all required information atomically, ++ * essentially, creating parallel "refcount" structures holding pointers. ++ * We delete conntrack timers as well, so the structures cannot disappear ++ * after releasing the lock. Now, after releasing lock we can dump everything ++ * safely. And on exit we restore timers to their original values. ++ * ++ * Note, this approach is not going to work in VE0. 
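++ *
++ * In pseudo-code, the pass implemented by cpt_dump_ip_conntrack()
++ * below is roughly:
++ *
++ *	read_lock_bh(&ip_conntrack_lock);
++ *	for each original-direction tuple hash h
++ *		if (del_timer(&tuplehash_to_ctrack(h)->timeout))
++ *			remember h in a preallocated ct_holder;
++ *	read_unlock_bh(&ip_conntrack_lock);
++ *	dump every remembered conntrack without the lock;
++ *	on exit, add_timer() puts each deleted timer back.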
++ */
++
++struct ct_holder
++{
++	struct ct_holder *next;
++	struct ip_conntrack_tuple_hash *cth;
++	int index;
++};
++
++static void encode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple)
++{
++	v->cpt_dst = tuple->dst.ip;
++	v->cpt_dstport = tuple->dst.u.all;
++	v->cpt_protonum = tuple->dst.protonum;
++	v->cpt_dir = tuple->dst.dir;
++
++	v->cpt_src = tuple->src.ip;
++	v->cpt_srcport = tuple->src.u.all;
++}
++
++static int dump_one_expect(struct cpt_ip_connexpect_image *v,
++			   struct ip_conntrack_expect *exp,
++			   int sibling, cpt_context_t *ctx)
++{
++	int err = 0;
++
++	v->cpt_next = sizeof(*v);
++	v->cpt_object = CPT_OBJ_NET_CONNTRACK_EXPECT;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_VOID;
++
++	encode_tuple(&v->cpt_tuple, &exp->tuple);
++	encode_tuple(&v->cpt_mask, &exp->mask);
++	v->cpt_sibling_conntrack = sibling;
++	v->cpt_flags = exp->flags;
++	v->cpt_seq = exp->id;
++	v->cpt_dir = 0;
++	v->cpt_manip_proto = 0;
++#ifdef CONFIG_IP_NF_NAT_NEEDED
++	v->cpt_manip_proto = exp->saved_proto.all;
++	v->cpt_dir = exp->dir;
++#endif
++	v->cpt_timeout = 0;
++	if (exp->master->helper->timeout)
++		v->cpt_timeout = exp->timeout.expires - jiffies;
++	return err;
++}
++
++/* NOTE. We use one page to dump the list of expectations. In theory this
++ * may not be enough. In practice there is only one expectation per
++ * conntrack record. Moreover, since _ALL_ expectations are kept on one
++ * global list, which is searched for each incoming/outgoing packet, the
++ * system would be dead long before a single conntrack could accumulate
++ * enough expectations to overflow this page. In short, I am not going to
++ * fix this.
++ */
++
++static int dump_expect_list(struct ip_conntrack *ct, struct ct_holder *list,
++			    cpt_context_t *ctx)
++{
++	int err = 0;
++	unsigned long pg;
++	struct cpt_ip_connexpect_image *v;
++	struct ip_conntrack_expect *exp;
++
++	if (ct->expecting == 0)
++		return err;
++	if (ct->expecting * sizeof(struct cpt_ip_connexpect_image) > PAGE_SIZE)
++		return -ENOBUFS;
++
++	pg = __get_free_page(GFP_KERNEL);
++	if (!pg)
++		return -ENOMEM;
++	v = (struct cpt_ip_connexpect_image *)pg;
++
++	read_lock_bh(&ip_conntrack_lock);
++	list_for_each_entry(exp, &ve_ip_conntrack_expect_list, list) {
++		int sibling;
++
++		if (exp->master != ct)
++			continue;
++
++		if (ct->helper == NULL) {
++			eprintk_ctx("conntrack: no helper and non-trivial expectation\n");
++			err = -EINVAL;
++			break;
++		}
++
++		sibling = 0;
++#if 0
++		/* That's all? No need to calculate sibling? */
++		if (exp->sibling) {
++			struct ct_holder *c;
++			for (c = list; c; c = c->next) {
++				if (tuplehash_to_ctrack(c->cth) == exp->sibling) {
++					sibling = c->index;
++					break;
++				}
++			}
++			/* NOTE: exp->sibling may not be "confirmed" yet and,
++			 * hence, not in the hash table. We should just ignore
++			 * such a sibling; the connection is going to be
++			 * retried, the packet apparently was lost somewhere.
++			 */
++			if (sibling == 0)
++				dprintk_ctx("sibling conntrack is not found\n");
++		}
++#endif
++
++		/* If the expectation still does not have exp->sibling
++		 * and its timer is not running, it is about to die on
++		 * another cpu. Skip it.
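++		 * (The ct->helper->timeout && !timer_pending() test
++		 * just below is what detects this case.)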
*/ ++ if (!sibling && ++ ct->helper->timeout && ++ !timer_pending(&exp->timeout)) { ++ dprintk_ctx("conntrack: expectation: no timer\n"); ++ continue; ++ } ++ ++ err = dump_one_expect(v, exp, sibling, ctx); ++ if (err) ++ break; ++ ++ v++; ++ } ++ read_unlock_bh(&ip_conntrack_lock); ++ ++ if (err == 0 && (unsigned long)v != pg) ++ ctx->write((void*)pg, (unsigned long)v - pg, ctx); ++ ++ free_page(pg); ++ return err; ++} ++ ++static int dump_one_ct(struct ct_holder *c, struct ct_holder *list, ++ cpt_context_t *ctx) ++{ ++ struct ip_conntrack_tuple_hash *h = c->cth; ++ struct ip_conntrack *ct = tuplehash_to_ctrack(h); ++ struct cpt_ip_conntrack_image v; ++ int err = 0; ++ ++ if (sizeof(v.cpt_proto_data) != sizeof(ct->proto)) { ++ eprintk_ctx("conntrack module ct->proto version mismatch\n"); ++ return -EINVAL; ++ } ++ ++ cpt_open_object(NULL, ctx); ++ ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_NET_CONNTRACK; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_ARRAY; ++ ++ read_lock_bh(&ip_conntrack_lock); ++ v.cpt_status = ct->status; ++ v.cpt_timeout = ct->timeout.expires - jiffies; ++ v.cpt_ct_helper = (ct->helper != NULL); ++ v.cpt_index = c->index; ++ v.cpt_id = ct->id; ++ v.cpt_mark = 0; ++#if defined(CONFIG_IP_NF_CONNTRACK_MARK) ++ v.cpt_mark = ct->mark; ++#endif ++ encode_tuple(&v.cpt_tuple[0], &ct->tuplehash[0].tuple); ++ encode_tuple(&v.cpt_tuple[1], &ct->tuplehash[1].tuple); ++ memcpy(&v.cpt_proto_data, &ct->proto, sizeof(v.cpt_proto_data)); ++ memcpy(&v.cpt_help_data, &ct->help, sizeof(v.cpt_help_data)); ++ ++ v.cpt_masq_index = 0; ++ v.cpt_initialized = 0; ++ v.cpt_num_manips = 0; ++ v.cpt_nat_helper = 0; ++#ifdef CONFIG_IP_NF_NAT_NEEDED ++#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ ++ defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) ++ v.cpt_masq_index = ct->nat.masq_index; ++#endif ++ /* "help" data is used by pptp, difficult to support */ ++ v.cpt_nat_seq[0].cpt_correction_pos = ct->nat.info.seq[0].correction_pos; ++ v.cpt_nat_seq[0].cpt_offset_before = ct->nat.info.seq[0].offset_before; ++ v.cpt_nat_seq[0].cpt_offset_after = ct->nat.info.seq[0].offset_after; ++ v.cpt_nat_seq[1].cpt_correction_pos = ct->nat.info.seq[1].correction_pos; ++ v.cpt_nat_seq[1].cpt_offset_before = ct->nat.info.seq[1].offset_before; ++ v.cpt_nat_seq[1].cpt_offset_after = ct->nat.info.seq[1].offset_after; ++#endif ++ read_unlock_bh(&ip_conntrack_lock); ++ ++ ctx->write(&v, sizeof(v), ctx); ++ ++ err = dump_expect_list(ct, list, ctx); ++ ++ cpt_close_object(ctx); ++ return err; ++} ++ ++int cpt_dump_ip_conntrack(cpt_context_t * ctx) ++{ ++ struct ct_holder *ct_list = NULL; ++ struct ct_holder *c, **cp; ++ int err = 0; ++ int index = 0; ++ int idx; ++ ++ if (get_exec_env()->_ip_conntrack == NULL) ++ return 0; ++ ++ for (idx = atomic_read(&(get_exec_env()->_ip_conntrack->_ip_conntrack_count)); idx >= 0; idx--) { ++ c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL); ++ if (c == NULL) { ++ err = -ENOMEM; ++ goto done; ++ } ++ memset(c, 0, sizeof(struct ct_holder)); ++ c->next = ct_list; ++ ct_list = c; ++ } ++ ++ c = ct_list; ++ ++ read_lock_bh(&ip_conntrack_lock); ++ for (idx = 0; idx < ip_conntrack_htable_size; idx++) { ++ struct ip_conntrack_tuple_hash *h; ++ list_for_each_entry(h, &ve_ip_conntrack_hash[idx], list) { ++ /* Skip reply tuples, they are covered by original ++ * direction. */ ++ if (DIRECTION(h)) ++ continue; ++ ++ /* Oops, we have not enough of holders... ++ * It is impossible. 
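++			 * (The holders were preallocated from the value
++			 * of _ip_conntrack_count above; with the VE
++			 * network quiesced, entries can only expire,
++			 * never appear, hence "impossible".)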
*/ ++ if (unlikely(c == NULL)) { ++ read_unlock_bh(&ip_conntrack_lock); ++ eprintk_ctx("unexpected conntrack appeared\n"); ++ err = -ENOMEM; ++ goto done; ++ } ++ ++ /* If timer is not running, it means that it ++ * has just been scheduled on another cpu. ++ * We should skip this conntrack, it is about to be ++ * destroyed. */ ++ if (!del_timer(&tuplehash_to_ctrack(h)->timeout)) { ++ dprintk_ctx("conntrack: no timer\n"); ++ continue; ++ } ++ ++ /* Timer is deleted. refcnt is _not_ decreased. ++ * We are going to restore the timer on exit ++ * from this function. */ ++ c->cth = h; ++ c->index = ++index; ++ c = c->next; ++ } ++ } ++ read_unlock_bh(&ip_conntrack_lock); ++ ++ /* No conntracks? Good. */ ++ if (index == 0) ++ goto done; ++ ++ /* Comb the list a little. */ ++ cp = &ct_list; ++ while ((c = *cp) != NULL) { ++ /* Discard unused entries; they can appear, if some ++ * entries were timed out since we preallocated the list. ++ */ ++ if (c->cth == NULL) { ++ *cp = c->next; ++ kfree(c); ++ continue; ++ } ++ ++ /* Move conntracks attached to expectations to the beginning ++ * of the list. */ ++ if (tuplehash_to_ctrack(c->cth)->master && c != ct_list) { ++ *cp = c->next; ++ c->next = ct_list; ++ ct_list = c; ++ dprintk_ctx("conntrack: %d moved in list\n", c->index); ++ continue; ++ } ++ cp = &c->next; ++ } ++ ++ cpt_open_section(ctx, CPT_SECT_NET_CONNTRACK); ++ ++ for (c = ct_list; c; c = c->next) { ++ err = dump_one_ct(c, ct_list, ctx); ++ if (err) ++ goto done; ++ } ++ ++ cpt_close_section(ctx); ++ ++done: ++ while ((c = ct_list) != NULL) { ++ ct_list = c->next; ++ if (c->cth) { ++ /* Restore timer. refcnt is preserved. */ ++ add_timer(&tuplehash_to_ctrack(c->cth)->timeout); ++ } ++ kfree(c); ++ } ++ return err; ++} ++ ++#endif +diff --git a/kernel/cpt/cpt_context.c b/kernel/cpt/cpt_context.c +new file mode 100644 +index 0000000..58a8069 +--- /dev/null ++++ b/kernel/cpt/cpt_context.c +@@ -0,0 +1,257 @@ ++/* ++ * ++ * kernel/cpt/cpt_context.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++ ++static void file_write(const void *addr, size_t count, struct cpt_context *ctx) ++{ ++ mm_segment_t oldfs; ++ ssize_t err = -EBADF; ++ struct file *file = ctx->file; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ if (file) ++ err = file->f_op->write(file, addr, count, &file->f_pos); ++ set_fs(oldfs); ++ if (err != count && !ctx->write_error) ++ ctx->write_error = err < 0 ? err : -EIO; ++} ++ ++static void file_pwrite(void *addr, size_t count, struct cpt_context *ctx, loff_t pos) ++{ ++ mm_segment_t oldfs; ++ ssize_t err = -EBADF; ++ struct file *file = ctx->file; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ if (file) ++ err = file->f_op->write(file, addr, count, &pos); ++ set_fs(oldfs); ++ if (err != count && !ctx->write_error) ++ ctx->write_error = err < 0 ? 
err : -EIO; ++} ++ ++static void file_align(struct cpt_context *ctx) ++{ ++ struct file *file = ctx->file; ++ ++ if (file) ++ file->f_pos = CPT_ALIGN(file->f_pos); ++} ++ ++void cpt_context_init(struct cpt_context *ctx) ++{ ++ int i; ++ ++ memset(ctx, 0, sizeof(*ctx)); ++ ++ init_MUTEX(&ctx->main_sem); ++ ctx->refcount = 1; ++ ++ ctx->current_section = -1; ++ ctx->current_object = -1; ++ ctx->pagesize = PAGE_SIZE; ++ ctx->write = file_write; ++ ctx->pwrite = file_pwrite; ++ ctx->align = file_align; ++ for (i=0; i < CPT_SECT_MAX; i++) ++ ctx->sections[i] = CPT_NULL; ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ init_completion(&ctx->pgin_notify); ++#endif ++ cpt_object_init(ctx); ++} ++ ++int cpt_open_dumpfile(struct cpt_context *ctx) ++{ ++ ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL); ++ if (ctx->tmpbuf == NULL) ++ return -ENOMEM; ++ __cpt_release_buf(ctx); ++ return 0; ++} ++ ++int cpt_close_dumpfile(struct cpt_context *ctx) ++{ ++ if (ctx->file) { ++ fput(ctx->file); ++ ctx->file = NULL; ++ } ++ if (ctx->tmpbuf) { ++ free_page((unsigned long)ctx->tmpbuf); ++ ctx->tmpbuf = NULL; ++ } ++ if (ctx->write_error) ++ eprintk_ctx("error while writing dump file: %d\n", ctx->write_error); ++ return ctx->write_error; ++} ++ ++int cpt_major_hdr_out(struct cpt_context *ctx) ++{ ++ struct cpt_major_hdr hdr; ++ ++ if (ctx->file == NULL) ++ return 0; ++ ++ memset(&hdr, 0, sizeof(hdr)); ++ hdr.cpt_signature[0] = CPT_SIGNATURE0; ++ hdr.cpt_signature[1] = CPT_SIGNATURE1; ++ hdr.cpt_signature[2] = CPT_SIGNATURE2; ++ hdr.cpt_signature[3] = CPT_SIGNATURE3; ++ hdr.cpt_hdrlen = sizeof(hdr); ++ hdr.cpt_image_version = CPT_VERSION_20; ++#ifdef CONFIG_X86_64 ++ hdr.cpt_os_arch = CPT_OS_ARCH_EMT64; ++#elif defined(CONFIG_X86_32) ++ hdr.cpt_os_arch = CPT_OS_ARCH_I386; ++#elif defined(CONFIG_IA64) ++ hdr.cpt_os_arch = CPT_OS_ARCH_IA64; ++#else ++#error Arch is not supported ++#endif ++ hdr.cpt_ve_features = (__u32)ctx->features; ++ hdr.cpt_ve_features2 = (__u32)(ctx->features>>32); ++ hdr.cpt_pagesize = (__u16)PAGE_SIZE; ++ hdr.cpt_hz = HZ; ++ hdr.cpt_start_jiffies64 = ctx->virt_jiffies64; ++ hdr.cpt_start_sec = ctx->start_time.tv_sec; ++ hdr.cpt_start_nsec = ctx->start_time.tv_nsec; ++ hdr.cpt_cpu_caps[0] = ctx->src_cpu_flags; ++ hdr.cpt_kernel_config[0] = ctx->kernel_config_flags; ++ hdr.cpt_iptables_mask = ctx->iptables_mask; ++ ++ ctx->write(&hdr, sizeof(hdr), ctx); ++ return 0; ++} ++ ++int cpt_close_section(struct cpt_context *ctx) ++{ ++ if (ctx->file && ctx->current_section >= 0) { ++ __u64 next = ctx->file->f_pos - ctx->current_section; ++ ctx->pwrite(&next, 8, ctx, ctx->current_section); ++ ctx->current_section = -1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(cpt_close_section); ++ ++int cpt_open_section(struct cpt_context *ctx, __u32 type) ++{ ++ struct cpt_section_hdr hdr; ++ ++ if (ctx->file == NULL) ++ return 0; ++ ++ cpt_close_section(ctx); ++ ++ ctx->current_section = ctx->file->f_pos; ++ ctx->sections[type] = ctx->current_section; ++ ++ hdr.cpt_next = 0; ++ hdr.cpt_section = type; ++ hdr.cpt_hdrlen = sizeof(hdr); ++ hdr.cpt_align = 0; ++ ctx->write(&hdr, sizeof(hdr), ctx); ++ ++ return 0; ++} ++EXPORT_SYMBOL(cpt_open_section); ++ ++ ++int cpt_close_object(struct cpt_context *ctx) ++{ ++ if (ctx->file && ctx->current_object >= 0) { ++ __u64 next = ctx->file->f_pos - ctx->current_object; ++ ctx->pwrite(&next, 8, ctx, ctx->current_object); ++ ctx->current_object = -1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(cpt_close_object); ++ ++int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ if 
(ctx->file == NULL) ++ return 0; ++ ++ cpt_close_object(ctx); ++ ++ ctx->current_object = ctx->file->f_pos; ++ if (obj) ++ cpt_obj_setpos(obj, ctx->current_object, ctx); ++ ++ return 0; ++} ++EXPORT_SYMBOL(cpt_open_object); ++ ++int cpt_push_object(loff_t *saved, struct cpt_context *ctx) ++{ ++ if (ctx->file) { ++ *saved = ctx->current_object; ++ ctx->current_object = ctx->file->f_pos; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(cpt_push_object); ++ ++int cpt_pop_object(loff_t *saved, struct cpt_context *ctx) ++{ ++ ctx->current_object = *saved; ++ return 0; ++} ++EXPORT_SYMBOL(cpt_pop_object); ++ ++int cpt_dump_tail(struct cpt_context *ctx) ++{ ++ struct cpt_major_tail hdr; ++ int i; ++ ++ if (ctx->file == NULL) ++ return 0; ++ ++ cpt_open_section(ctx, CPT_SECT_TRAILER); ++ memset(&hdr, 0, sizeof(hdr)); ++ hdr.cpt_next = sizeof(hdr); ++ hdr.cpt_object = CPT_OBJ_TRAILER; ++ hdr.cpt_hdrlen = sizeof(hdr); ++ hdr.cpt_content = CPT_CONTENT_VOID; ++ hdr.cpt_lazypages = 0; ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ hdr.cpt_lazypages = ctx->lazypages; ++#endif ++ hdr.cpt_64bit = ctx->tasks64; ++ hdr.cpt_signature[0] = CPT_SIGNATURE0; ++ hdr.cpt_signature[1] = CPT_SIGNATURE1; ++ hdr.cpt_signature[2] = CPT_SIGNATURE2; ++ hdr.cpt_signature[3] = CPT_SIGNATURE3; ++ hdr.cpt_nsect = CPT_SECT_MAX_INDEX; ++ for (i = 0; i < CPT_SECT_MAX_INDEX; i++) ++ hdr.cpt_sections[i] = ctx->sections[i]; ++ ++ ctx->write(&hdr, sizeof(hdr), ctx); ++ cpt_close_section(ctx); ++ return 0; ++} +diff --git a/kernel/cpt/cpt_context.h b/kernel/cpt/cpt_context.h +new file mode 100644 +index 0000000..e4f82f9 +--- /dev/null ++++ b/kernel/cpt/cpt_context.h +@@ -0,0 +1,215 @@ ++#include ++#include ++#include ++ ++#define CPT_CTX_ERROR -1 ++#define CPT_CTX_IDLE 0 ++#define CPT_CTX_SUSPENDING 1 ++#define CPT_CTX_SUSPENDED 2 ++#define CPT_CTX_DUMPING 3 ++#define CPT_CTX_UNDUMPING 4 ++#define CPT_CTX_UNDUMPED 5 ++ ++#define CPT_TID(tsk) task_pid_nr(tsk), task_pid_vnr(tsk), (tsk)->comm ++#define CPT_FID "%d,%d(%s)" ++ ++ ++typedef struct cpt_context ++{ ++ struct list_head ctx_list; ++ int refcount; ++ int ctx_state; ++ int objcount; ++ int sticky; ++ struct semaphore main_sem; ++ ++ struct file *errorfile; ++ struct file *statusfile; ++ struct file *lockfile; ++ ++ int errno; ++ char *error_msg; ++ loff_t err_offset; ++ ++ struct file *file; ++ char *tmpbuf; ++ int pagesize; ++#ifdef CONFIG_VZ_CHECKPOINT_ITER ++ int iter_done; ++ void *iter_dir; ++ struct user_beancounter *iter_ub; ++#endif ++ loff_t current_section; ++ loff_t current_object; ++ ++ loff_t sections[CPT_SECT_MAX]; ++ ++ __u32 errormask; ++ __u32 write_error; ++ ++ struct list_head object_array[CPT_OBJ_MAX]; ++ ++ void (*write)(const void *addr, size_t count, struct cpt_context *ctx); ++ void (*pwrite)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos); ++ ssize_t (*read)(void *addr, size_t count, struct cpt_context *ctx); ++ ssize_t (*pread)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos); ++ void (*align)(struct cpt_context *ctx); ++ int ve_id; ++ int contextid; ++ struct timespec cpt_monotonic_time; /* Host monotonic time at the moment of cpt/rst ++ * corresponging to start_time */ ++ __u64 virt_jiffies64; /* Virtual jiffies64. It is == cpt_jiffies64 when ++ * VE did not migrate. */ ++ struct timespec start_time; ++ struct timespec delta_time; ++ __s64 delta_nsec; ++ int image_version; ++ __u16 image_arch; ++ __u64 iptables_mask; ++ __u64 features; ++ ++#define CPT_ANONVMA_HBITS (sizeof(void*) == 4 ? 
10 : 9) ++#define CPT_ANONVMA_HSIZE (1<ve_id, ##arg) ++ ++#define wprintk(a...) cpt_printk(2, "CPT WRN: " a) ++#define wprintk_ctx(f, arg...) wprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg) ++ ++#define eprintk(a...) cpt_printk(1, "CPT ERR: " a) ++#define eprintk_ctx(f, arg...) \ ++do { \ ++ eprintk("%p,%u :" f, ctx, ctx->ve_id, ##arg); \ ++ if (ctx->error_msg && ctx->err_offset < PAGE_SIZE) \ ++ ctx->err_offset += snprintf((char*)(ctx->error_msg + \ ++ ctx->err_offset), \ ++ PAGE_SIZE - ctx->err_offset, \ ++ "Error: " f, ##arg); \ ++} while(0) ++ ++#define CPT_TMPBUF_FREE 0x789adf12 ++#define CPT_TMPBUF_BUSY 0xabcd9876 ++ ++static inline void *cpt_get_buf(cpt_context_t *ctx) ++{ ++ void *buf = ctx->tmpbuf; ++ ++ BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_FREE); ++ *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_BUSY; ++ return buf; ++} ++ ++static inline void __cpt_release_buf(cpt_context_t *ctx) ++{ ++ void *buf = ctx->tmpbuf; ++ ++ *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE; ++} ++ ++static inline void cpt_release_buf(cpt_context_t *ctx) ++{ ++ void *buf = ctx->tmpbuf; ++ ++ BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_BUSY); ++ *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE; ++} ++ ++static inline void cpt_flush_error(cpt_context_t *ctx) ++{ ++ mm_segment_t oldfs; ++ ++ if (ctx->errorfile && ctx->error_msg && ctx->err_offset) { ++ if (ctx->errorfile->f_op && ctx->errorfile->f_op->write) { ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ ctx->errorfile->f_op->write(ctx->errorfile, ++ ctx->error_msg, ctx->err_offset, ++ &ctx->errorfile->f_pos); ++ set_fs(oldfs); ++ } ++ ctx->error_msg[0] = 0; ++ ctx->err_offset = 0; ++ } ++} +diff --git a/kernel/cpt/cpt_dump.c b/kernel/cpt/cpt_dump.c +new file mode 100644 +index 0000000..de2364b +--- /dev/null ++++ b/kernel/cpt/cpt_dump.c +@@ -0,0 +1,1247 @@ ++/* ++ * ++ * kernel/cpt/cpt_dump.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_dump.h" ++#include "cpt_files.h" ++#include "cpt_mm.h" ++#include "cpt_process.h" ++#include "cpt_net.h" ++#include "cpt_socket.h" ++#include "cpt_ubc.h" ++#include "cpt_kernel.h" ++ ++ ++static int vps_child_level(struct task_struct *root, struct task_struct *c) ++{ ++ int level = 0; ++ int veid = VE_TASK_INFO(c)->owner_env->veid; ++ ++ while (VE_TASK_INFO(c)->owner_env->veid == veid) { ++ if (c->pid != c->tgid) ++ c = c->group_leader; ++ if (c == root) ++ return level; ++ ++ c = c->parent; ++ level++; ++ } ++ return -1; ++} ++ ++static inline int freezable(struct task_struct * p) ++{ ++ if (p->exit_state) ++ return 0; ++ ++ switch (p->state) { ++ case EXIT_ZOMBIE: ++ case EXIT_DEAD: ++ case TASK_STOPPED: ++#if TASK_TRACED != TASK_STOPPED ++ case TASK_TRACED: ++#endif ++ return 0; ++ default: ++ return 1; ++ } ++} ++ ++static void wake_ve(cpt_context_t *ctx) ++{ ++ struct task_struct *p, *g; ++ ++ do_each_thread_ve(g, p) { ++ spin_lock_irq(&p->sighand->siglock); ++ if (p->flags & PF_FROZEN) { ++ p->flags &= ~PF_FROZEN; ++ wake_up_process(p); ++ } ++ spin_unlock_irq(&p->sighand->siglock); ++ } while_each_thread_ve(g, p); ++} ++ ++/* ++ * Some comment is necessary about PF_FREEZE,PF_FROZEN,TIF_FREEZE... ++ * ++ * SWSUSP uses PF_FREEZE flag in tsk->flags raising it in context ++ * of another process. Apparently, it is unacceptable on SMP. ++ * Let's take freeze_processes() in kernel/power/process.c as an example. ++ * Unserialized modifications tsk->flags easily ++ * (believe or not, but it happens with probability of almost 100% :-)) ++ * creates the situation when setting PF_FREEZE in freeze_processes(), ++ * which quickly spins raising PF_FREEZE of all the processes, ++ * _clears_ PF_FROZEN just set in refrigerator(), so that suspend deadlocks. ++ * ++ * So, to make things clean, we require that those flags may be modified ++ * only under tsk->sighand->siglock, which is quite natural because PF_FREEZE ++ * is just a kind of signal. ++ * ++ * It is not enough, because we are still not allowed to change tsk->flags ++ * in context of another process, we can corrupt another flags, when the process ++ * running on another cpu modifies them. So, we use TIF_FREEZE in thread flags, ++ * which can be changed atomically. ++ * ++ * PF_FROZEN also changes in context of another process, but this happens ++ * only when the process is already in refrigerator() which does not modify ++ * tsk->flags. 
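++ *
++ * The resulting freeze request, exactly as issued in vps_stop_tasks()
++ * below, is therefore:
++ *
++ *	spin_lock_irq(&p->sighand->siglock);
++ *	if (!(p->flags & PF_FROZEN)) {
++ *		set_tsk_thread_flag(p, TIF_FREEZE);
++ *		signal_wake_up(p, 0);
++ *	}
++ *	spin_unlock_irq(&p->sighand->siglock);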
++ */ ++ ++static int check_process_external(struct task_struct *p) ++{ ++ if (pid_alive(p)) { ++ if (p->pids[PIDTYPE_PID].pid->level == 0) ++ return PIDTYPE_PID; ++ if (p->pids[PIDTYPE_PGID].pid->level == 0) ++ return PIDTYPE_PGID; ++ if (p->pids[PIDTYPE_SID].pid->level == 0) ++ return PIDTYPE_SID; ++ } ++ ++ return PIDTYPE_MAX; ++} ++ ++enum ++{ ++ OBSTACLE_NOGO = -1, ++ OBSTACLE_TIMEOUT = -2, ++ OBSTACLE_TRYAGAIN = -3, ++}; ++ ++#define SUSPEND_TIMEOUT (10UL*HZ) ++ ++static int vps_stop_tasks(struct cpt_context *ctx) ++{ ++ unsigned long start_time = jiffies; ++ unsigned long target, timeout; ++ struct task_struct *p, *g; ++ int todo; ++ int round = 0; ++ ++ do_gettimespec(&ctx->start_time); ++ do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time); ++ ctx->virt_jiffies64 = get_jiffies_64() + get_exec_env()->jiffies_fixup; ++ ++ read_lock(&tasklist_lock); ++ ++ atomic_inc(&get_exec_env()->suspend); ++ timeout = HZ/5; ++ target = jiffies + timeout; ++ ++ for(;;) { ++ struct task_struct *root; ++ todo = 0; ++ ++ root = find_task_by_vpid(1); ++ if (!root) { ++ read_unlock(&tasklist_lock); ++ eprintk_ctx("cannot find ve init\n"); ++ atomic_dec(&get_exec_env()->suspend); ++ return -ESRCH; ++ } ++ ++ do_each_thread_ve(g, p) { ++ if (vps_child_level(root, p) >= 0) { ++ switch (check_process_external(p)) { ++ case PIDTYPE_PID: ++ eprintk_ctx("external process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n", ++ task_pid_vnr(p), p->pid, p->comm); ++ todo = OBSTACLE_NOGO; ++ goto out; ++ case PIDTYPE_PGID: ++ eprintk_ctx("external process group %d/%d(%s) inside CT " ++ "(e.g. vzctl enter or vzctl exec).\n", ++ task_pgrp_vnr(p), p->pid, p->comm); ++ todo = OBSTACLE_NOGO; ++ goto out; ++ case PIDTYPE_SID: ++ eprintk_ctx("external process session %d/%d(%s) inside CT " ++ "(e.g. vzctl enter or vzctl exec).\n", ++ task_session_vnr(p), p->pid, p->comm); ++ todo = OBSTACLE_NOGO; ++ goto out; ++ } ++ if (p->vfork_done) { ++ /* Task between vfork()...exec() ++ * cannot be frozen, because parent ++ * wait in uninterruptible state. ++ * So, we do nothing, waiting for ++ * exec(), unless: ++ */ ++ if (p->state == TASK_STOPPED || ++ p->state == TASK_TRACED) { ++ eprintk_ctx("task " CPT_FID " is stopped while vfork(). " ++ "Checkpointing is impossible.\n", ++ CPT_TID(p)); ++ todo = OBSTACLE_NOGO; ++ /* It is fatal, _user_ stopped ++ * vfork()ing task, so that we ++ * cannot suspend now. ++ */ ++ } else { ++ todo = OBSTACLE_TRYAGAIN; ++ } ++ goto out; ++ } ++ if (p->signal->group_exit_task && ++ p->signal->notify_count) { ++ /* exec() waits for threads' death */ ++ wprintk_ctx("task " CPT_FID " waits for threads' death\n", CPT_TID(p)); ++ todo = OBSTACLE_TRYAGAIN; ++ goto out; ++ } ++ if (p->state == TASK_TRACED ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) ++ && !p->stopped_state ++#endif ++ ) { ++ int ptrace_id = p->pn_state; ++ /* Debugger waits for signal. */ ++ switch (ptrace_id) { ++ case PN_STOP_TF: ++ case PN_STOP_TF_RT: ++ case PN_STOP_ENTRY: ++ case PN_STOP_FORK: ++ case PN_STOP_VFORK: ++ case PN_STOP_SIGNAL: ++ case PN_STOP_EXIT: ++ case PN_STOP_LEAVE: ++ break; ++ default: ++ eprintk_ctx("task " CPT_FID " is stopped by debugger while %d.\n", CPT_TID(p), ptrace_id); ++ todo = OBSTACLE_NOGO; ++ goto out; ++ } ++ } ++#ifdef CONFIG_UTRACE ++ if (check_utrace(p, root, ctx)) { ++ eprintk_ctx("task " CPT_FID " is utraced. 
Checkpointing is impossible.\n", CPT_TID(p)); ++ todo = OBSTACLE_NOGO; ++ goto out; ++ } ++#endif ++ if (p->flags & PF_NOFREEZE) { ++ eprintk_ctx("task " CPT_FID " is unfreezable. Checkpointing is impossible.\n", CPT_TID(p)); ++ todo = OBSTACLE_NOGO; ++ goto out; ++ } ++ ++ if (!freezable(p)) ++ continue; ++ ++ spin_lock_irq(&p->sighand->siglock); ++ if (!(p->flags & PF_FROZEN)) { ++ set_tsk_thread_flag(p, TIF_FREEZE); ++ signal_wake_up(p, 0); ++ } ++ spin_unlock_irq(&p->sighand->siglock); ++ ++ if (p->flags & PF_FROZEN) { ++ if (p->state != TASK_UNINTERRUPTIBLE) ++ printk("Holy Crap 1 %ld " CPT_FID "\n", p->state, CPT_TID(p)); ++ continue; ++ } ++ ++ if (round == 10) ++ wprintk_ctx(CPT_FID " is running\n", CPT_TID(p)); ++ ++ todo++; ++ } else { ++ if (p != current) { ++ eprintk_ctx("foreign process %d/%d(%s) inside CT (e.g. vzctl enter or vzctl exec).\n", ++ task_pid_vnr(p), task_pid_nr(p), p->comm); ++ todo = OBSTACLE_NOGO; ++ goto out; ++ } ++ } ++ } while_each_thread_ve(g, p); ++ ++ if (todo > 0) { ++ /* No visible obstacles, but VE did not freeze ++ * for timeout. Interrupt suspend, if it is major ++ * timeout or signal; if it is minor timeout ++ * we will wake VE and restart suspend. ++ */ ++ if (time_after(jiffies, start_time + SUSPEND_TIMEOUT) ++ || signal_pending(current)) ++ todo = OBSTACLE_TIMEOUT; ++ else if (time_after(jiffies, target)) ++ todo = OBSTACLE_TRYAGAIN; ++ } ++ ++out: ++ if (todo < 0) { ++ atomic_dec(&get_exec_env()->suspend); ++ ++ wake_ve(ctx); ++ ++#if 0 ++ /* This is sign of failure of printk(), which is not ++ * ours. So, no prefixes. */ ++ printk(">\n"); ++#endif ++ } ++ ++ read_unlock(&tasklist_lock); ++ ++ if (!todo) { ++ atomic_dec(&get_exec_env()->suspend); ++ return 0; ++ } ++ ++ switch (todo) { ++ case OBSTACLE_NOGO: ++ eprintk_ctx("suspend is impossible now.\n"); ++ return -EAGAIN; ++ ++ case OBSTACLE_TIMEOUT: ++ eprintk_ctx("interrupted or timed out.\n"); ++ return -EINTR; ++ ++ case OBSTACLE_TRYAGAIN: ++ if (time_after(jiffies, start_time + SUSPEND_TIMEOUT) || ++ signal_pending(current)) { ++ wprintk_ctx("suspend timed out\n"); ++ return -EAGAIN; ++ } ++ ++ wprintk_ctx("minor suspend timeout (%lu) expired, " ++ "trying again\n", timeout); ++ ++ /* Try again. VE is awake, give it some time to run. */ ++ current->state = TASK_INTERRUPTIBLE; ++ schedule_timeout(HZ); ++ ++ /* After a short wait restart suspend ++ * with longer timeout */ ++ atomic_inc(&get_exec_env()->suspend); ++ timeout = min(timeout<<1, SUSPEND_TIMEOUT); ++ target = jiffies + timeout; ++ break; ++ ++ default: ++ if (round > 0) { ++ /* VE is partially frozen, give processes ++ * a chance to enter to refrigerator(). 
*/
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(HZ/20);
++ } else {
++ yield();
++ }
++ }
++
++ read_lock(&tasklist_lock);
++ round++;
++ }
++}
++
++static int cpt_unlock_ve(struct cpt_context *ctx)
++{
++ struct ve_struct *env;
++
++ env = get_ve_by_id(ctx->ve_id);
++ if (!env)
++ return -ESRCH;
++ down_write(&env->op_sem);
++ env->is_locked = 0;
++ up_write(&env->op_sem);
++ put_ve(env);
++ return 0;
++}
++
++int cpt_resume(struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_DMPFIN, ctx);
++
++ cpt_unlock_sockets(ctx);
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ if (ctx->pgin_task) {
++ wait_for_completion(&ctx->pgin_notify);
++ put_task_struct(ctx->pgin_task);
++ ctx->pgin_task = NULL;
++ }
++#endif
++
++ for_each_object(obj, CPT_OBJ_TASK) {
++ struct task_struct *tsk = obj->o_obj;
++
++ spin_lock_irq(&tsk->sighand->siglock);
++ if (tsk->flags & PF_FROZEN) {
++ tsk->flags &= ~PF_FROZEN;
++ wake_up_process(tsk);
++ } else if (freezable(tsk)) {
++ eprintk_ctx("strange, %s not frozen\n", tsk->comm );
++ }
++ spin_unlock_irq(&tsk->sighand->siglock);
++ put_task_struct(tsk);
++ }
++
++ cpt_resume_network(ctx);
++
++ cpt_unlock_ve(ctx);
++
++ cpt_finish_ubc(ctx);
++ cpt_object_destroy(ctx);
++ return 0;
++}
++
++int cpt_kill(struct cpt_context *ctx)
++{
++ int err = 0;
++ struct ve_struct *env;
++ cpt_object_t *obj;
++ struct task_struct *root_task = NULL;
++ long delay;
++
++ if (!ctx->ve_id)
++ return -EINVAL;
++
++ env = get_ve_by_id(ctx->ve_id);
++ if (!env)
++ return -ESRCH;
++
++ /* from here cpt_kill succeeds */
++ virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_DMPFIN, ctx);
++
++ if (current->ve_task_info.owner_env == env) {
++ wprintk_ctx("attempt to kill ve from inside, escaping...\n");
++ ve_move_task(current, get_ve0());
++ }
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ if (ctx->pgin_task) {
++ wait_for_completion(&ctx->pgin_notify);
++ put_task_struct(ctx->pgin_task);
++ ctx->pgin_task = NULL;
++ }
++#endif
++
++ cpt_kill_sockets(ctx);
++
++ for_each_object(obj, CPT_OBJ_TASK) {
++ struct task_struct *tsk = obj->o_obj;
++
++ if (tsk->exit_state) {
++ put_task_struct(tsk);
++ continue;
++ }
++
++ if (task_pid_vnr(tsk) == 1) {
++ root_task = tsk;
++ continue;
++ }
++
++ tsk->robust_list = NULL;
++#ifdef CONFIG_COMPAT
++ tsk->compat_robust_list = NULL;
++#endif
++ tsk->clear_child_tid = NULL;
++
++ if (tsk->ptrace) {
++ write_lock_irq(&tasklist_lock);
++ tsk->ptrace = 0;
++ if (!list_empty(&tsk->ptrace_list)) {
++ list_del_init(&tsk->ptrace_list);
++ remove_parent(tsk);
++ tsk->parent = tsk->real_parent;
++ add_parent(tsk);
++ }
++ write_unlock_irq(&tasklist_lock);
++ }
++
++ send_sig(SIGKILL, tsk, 1);
++
++ spin_lock_irq(&tsk->sighand->siglock);
++ sigfillset(&tsk->blocked);
++ sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
++ set_tsk_thread_flag(tsk, TIF_SIGPENDING);
++ if (tsk->flags & PF_FROZEN)
++ tsk->flags &= ~PF_FROZEN;
++ spin_unlock_irq(&tsk->sighand->siglock);
++
++ wake_up_process(tsk);
++ put_task_struct(tsk);
++ }
++
++ yield();
++
++ if (root_task != NULL) {
++ send_sig(SIGKILL, root_task, 1);
++
++ spin_lock_irq(&root_task->sighand->siglock);
++ sigfillset(&root_task->blocked);
++ sigdelsetmask(&root_task->blocked, sigmask(SIGKILL));
++ set_tsk_thread_flag(root_task, TIF_SIGPENDING);
++ clear_tsk_thread_flag(root_task, TIF_FREEZE);
++ if (root_task->flags & PF_FROZEN)
++ root_task->flags &= ~PF_FROZEN;
++ spin_unlock_irq(&root_task->sighand->siglock);
++
++ wake_up_process(root_task);
++ put_task_struct(root_task);
++ }
++
++ cpt_finish_ubc(ctx);
++ cpt_object_destroy(ctx);
++
++ delay = 1;
++ while (atomic_read(&env->counter) != 1) {
++ if (signal_pending(current))
++ break;
++ current->state = TASK_INTERRUPTIBLE;
++ delay = (delay < HZ) ? (delay << 1) : HZ;
++ schedule_timeout(delay);
++ }
++ put_ve(env);
++
++ return err;
++}
++
++#ifdef CONFIG_BEANCOUNTERS
++static void collect_task_ubc(struct task_struct *t, struct cpt_context *ctx)
++{
++ struct task_beancounter *tbc;
++
++ tbc = &(t->task_bc);
++ cpt_add_ubc(tbc->exec_ub, ctx);
++ cpt_add_ubc(tbc->task_ub, ctx);
++ cpt_add_ubc(tbc->fork_sub, ctx);
++}
++#else
++static inline void collect_task_ubc(struct task_struct *t,
++ struct cpt_context *ctx)
++{ return; }
++#endif
++
++static cpt_object_t * remember_task(struct task_struct * child,
++ cpt_object_t * head, cpt_context_t * ctx)
++{
++ cpt_object_t *cobj;
++
++ if (freezable(child) && !(child->flags&PF_FROZEN)) {
++ eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(child));
++ put_task_struct(child);
++ return NULL;
++ }
++
++ if (lookup_cpt_object(CPT_OBJ_TASK, child, ctx)) BUG();
++ if ((cobj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) {
++ put_task_struct(child);
++ return NULL;
++ }
++ cobj->o_count = 1;
++ cpt_obj_setobj(cobj, child, ctx);
++ insert_cpt_object(CPT_OBJ_TASK, cobj, head, ctx);
++ collect_task_ubc(child, ctx);
++ return cobj;
++}
++
++static int vps_collect_tasks(struct cpt_context *ctx)
++{
++ int err = -ESRCH;
++ cpt_object_t *obj;
++ struct task_struct *root;
++ read_lock(&tasklist_lock);
++ root = find_task_by_vpid(1);
++ if (root)
++ get_task_struct(root);
++ read_unlock(&tasklist_lock);
++
++ if (!root) {
++ err = -ESRCH;
++ eprintk_ctx("vps_collect_tasks: cannot find root\n");
++ goto out;
++ }
++
++ if ((obj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) {
++ put_task_struct(root);
++ return -ENOMEM;
++ }
++ obj->o_count = 1;
++ cpt_obj_setobj(obj, root, ctx);
++ intern_cpt_object(CPT_OBJ_TASK, obj, ctx);
++ collect_task_ubc(root, ctx);
++
++ /* Collect process subtree recursively */
++ for_each_object(obj, CPT_OBJ_TASK) {
++ cpt_object_t *head = obj;
++ struct task_struct *tsk = obj->o_obj;
++ struct task_struct *child;
++
++ if (freezable(tsk) && !(tsk->flags&PF_FROZEN)) {
++ eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(tsk));
++ err = -EINVAL;
++ goto out;
++ }
++
++ if (tsk->state == TASK_RUNNING)
++ printk("Holy Crap 2 %ld " CPT_FID "\n", tsk->state, CPT_TID(tsk));
++
++ wait_task_inactive(tsk);
++
++ err = check_task_state(tsk, ctx);
++ if (err)
++ goto out;
++
++ if (tsk->pid == tsk->tgid) {
++ child = tsk;
++ for (;;) {
++ read_lock(&tasklist_lock);
++ child = next_thread(child);
++ if (child != tsk)
++ get_task_struct(child);
++ read_unlock(&tasklist_lock);
++
++ if (child == tsk)
++ break;
++
++ if (child->parent != tsk->parent) {
++ put_task_struct(child);
++ eprintk_ctx("illegal thread structure, kernel bug\n");
++ err = -EINVAL;
++ goto out;
++ }
++
++ if ((head = remember_task(child, head, ctx)) == NULL) {
++ eprintk_ctx("task obj allocation failure\n");
++ err = -ENOMEM;
++ goto out;
++ }
++ }
++ }
++
++ /* About locking. VE is frozen. But lists of children
++ * may change at least for init, when entered task reparents
++ * to init and when reparented task exits. If we take care
++ * of this case, we still can unlock while scanning
++ * tasklists.
++ */ ++ read_lock(&tasklist_lock); ++ list_for_each_entry(child, &tsk->children, sibling) { ++ if (child->parent != tsk) ++ continue; ++ if (child->pid != child->tgid) ++ continue; ++ get_task_struct(child); ++ read_unlock(&tasklist_lock); ++ ++ if ((head = remember_task(child, head, ctx)) == NULL) { ++ eprintk_ctx("task obj allocation failure\n"); ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ read_lock(&tasklist_lock); ++ } ++ ++ list_for_each_entry(child, &tsk->ptrace_children, ptrace_list) { ++ if (child->parent != tsk) ++ continue; ++ if (child->pid != child->tgid) ++ continue; ++ get_task_struct(child); ++ read_unlock(&tasklist_lock); ++ ++ if ((head = remember_task(child, head, ctx)) == NULL) { ++ eprintk_ctx("task obj allocation failure\n"); ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ read_lock(&tasklist_lock); ++ } ++ read_unlock(&tasklist_lock); ++ } ++ ++ return 0; ++ ++out: ++ while (!list_empty(&ctx->object_array[CPT_OBJ_TASK])) { ++ struct list_head *head = ctx->object_array[CPT_OBJ_TASK].next; ++ cpt_object_t *obj = list_entry(head, cpt_object_t, o_list); ++ struct task_struct *tsk; ++ ++ list_del(head); ++ tsk = obj->o_obj; ++ put_task_struct(tsk); ++ free_cpt_object(obj, ctx); ++ } ++ return err; ++} ++ ++static int cpt_collect(struct cpt_context *ctx) ++{ ++ int err; ++ ++ if ((err = cpt_collect_mm(ctx)) != 0) ++ return err; ++ ++ if ((err = cpt_collect_sysv(ctx)) != 0) ++ return err; ++ ++ if ((err = cpt_collect_files(ctx)) != 0) ++ return err; ++ ++ if ((err = cpt_collect_fs(ctx)) != 0) ++ return err; ++ ++ if ((err = cpt_collect_namespace(ctx)) != 0) ++ return err; ++ ++ if ((err = cpt_collect_signals(ctx)) != 0) ++ return err; ++ ++ if (virtinfo_notifier_call(VITYPE_SCP, ++ VIRTINFO_SCP_COLLECT, ctx) & NOTIFY_FAIL) ++ return -ECHRNG; ++ ++ return 0; ++} ++ ++static int cpt_dump_veinfo(cpt_context_t *ctx) ++{ ++ struct cpt_veinfo_image *i = cpt_get_buf(ctx); ++ struct ve_struct *ve; ++ struct timespec delta; ++ struct ipc_namespace *ns; ++ ++ cpt_open_section(ctx, CPT_SECT_VEINFO); ++ cpt_open_object(NULL, ctx); ++ ++ memset(i, 0, sizeof(*i)); ++ ++ i->cpt_next = CPT_NULL; ++ i->cpt_object = CPT_OBJ_VEINFO; ++ i->cpt_hdrlen = sizeof(*i); ++ i->cpt_content = CPT_CONTENT_VOID; ++ ++ ve = get_exec_env(); ++ ns = ve->ve_ns->ipc_ns; ++ ++ if (ns->shm_ctlall > 0xFFFFFFFFU) ++ i->shm_ctl_all = 0xFFFFFFFFU; ++ if (ns->shm_ctlmax > 0xFFFFFFFFU) ++ i->shm_ctl_max = 0xFFFFFFFFU; ++ i->shm_ctl_mni = ns->shm_ctlmni; ++ ++ i->msg_ctl_max = ns->msg_ctlmax; ++ i->msg_ctl_mni = ns->msg_ctlmni; ++ i->msg_ctl_mnb = ns->msg_ctlmnb; ++ ++ BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i->sem_ctl_arr)); ++ i->sem_ctl_arr[0] = ns->sem_ctls[0]; ++ i->sem_ctl_arr[1] = ns->sem_ctls[1]; ++ i->sem_ctl_arr[2] = ns->sem_ctls[2]; ++ i->sem_ctl_arr[3] = ns->sem_ctls[3]; ++ ++ do_posix_clock_monotonic_gettime(&delta); ++ _set_normalized_timespec(&delta, ++ delta.tv_sec - ve->start_timespec.tv_sec, ++ delta.tv_nsec - ve->start_timespec.tv_nsec); ++ i->start_timespec_delta = cpt_timespec_export(&delta); ++ i->start_jiffies_delta = get_jiffies_64() - ve->start_jiffies; ++ ++ i->last_pid = ve->ve_ns->pid_ns->last_pid; ++ ++ ctx->write(i, sizeof(*i), ctx); ++ cpt_release_buf(ctx); ++ cpt_close_object(ctx); ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++static int cpt_dump_utsname(cpt_context_t *ctx) ++{ ++ int len; ++ struct cpt_object_hdr o; ++ struct ve_struct *ve; ++ struct uts_namespace *ns; ++ ++ cpt_open_section(ctx, CPT_SECT_UTSNAME); ++ ++ ve = get_exec_env(); ++ ns = ve->ve_ns->uts_ns; ++ ++ 
cpt_open_object(NULL, ctx); ++ len = strlen(ns->name.nodename); ++ o.cpt_next = CPT_NULL; ++ o.cpt_object = CPT_OBJ_NAME; ++ o.cpt_hdrlen = sizeof(o); ++ o.cpt_content = CPT_CONTENT_NAME; ++ ++ ctx->write(&o, sizeof(o), ctx); ++ ctx->write(ns->name.nodename, len+1, ctx); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ ++ cpt_open_object(NULL, ctx); ++ len = strlen(ns->name.domainname); ++ o.cpt_next = CPT_NULL; ++ o.cpt_object = CPT_OBJ_NAME; ++ o.cpt_hdrlen = sizeof(o); ++ o.cpt_content = CPT_CONTENT_NAME; ++ ++ ctx->write(&o, sizeof(o), ctx); ++ ctx->write(ns->name.domainname, len+1, ctx); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++#ifndef CONFIG_IA64 ++static int cpt_dump_vsyscall(cpt_context_t *ctx) ++{ ++ struct cpt_page_block *pgb = cpt_get_buf(ctx); ++ ++ cpt_open_section(ctx, CPT_SECT_VSYSCALL); ++ cpt_open_object(NULL, ctx); ++ ++ pgb->cpt_next = CPT_NULL; ++ pgb->cpt_object = CPT_OBJ_VSYSCALL; ++ pgb->cpt_hdrlen = sizeof(*pgb); ++ pgb->cpt_content = CPT_CONTENT_DATA; ++ pgb->cpt_start = cpt_ptr_export(vsyscall_addr); ++ pgb->cpt_end = pgb->cpt_start + PAGE_SIZE; ++ ++ ctx->write(pgb, sizeof(*pgb), ctx); ++ cpt_release_buf(ctx); ++ ++ ctx->write(vsyscall_addr, PAGE_SIZE, ctx); ++ ++ cpt_close_object(ctx); ++ cpt_close_section(ctx); ++ return 0; ++} ++#endif ++ ++int cpt_dump(struct cpt_context *ctx) ++{ ++ struct ve_struct *oldenv, *env; ++ struct nsproxy *old_ns; ++ int err, err2 = 0; ++ ++ if (!ctx->ve_id) ++ return -EINVAL; ++ ++ env = get_ve_by_id(ctx->ve_id); ++ if (!env) ++ return -ESRCH; ++ ++ down_read(&env->op_sem); ++ err = -ESRCH; ++ if (!env->is_running) ++ goto out_noenv; ++ if (!env->is_locked) ++ goto out_noenv; ++ err = -EINVAL; ++ if (env->ve_ns->pid_ns->flags & PID_NS_HIDDEN) { ++ printk(KERN_WARNING "CT: checkpointing not supported yet" ++ " for hidden pid namespaces.\n"); ++ goto out_noenv; ++ } ++ ++ oldenv = set_exec_env(env); ++ old_ns = current->nsproxy; ++ current->nsproxy = env->ve_ns; ++ ++ /* Phase 2: real checkpointing */ ++ err = cpt_open_dumpfile(ctx); ++ if (err) ++ goto out; ++ ++ cpt_major_hdr_out(ctx); ++ ++ if (!err) ++ err = cpt_dump_veinfo(ctx); ++ if (!err) ++ err = cpt_dump_ubc(ctx); ++ if (!err) ++ err = cpt_dump_files(ctx); ++ if (!err) ++ err = cpt_dump_files_struct(ctx); ++ if (!err) ++ err = cpt_dump_fs_struct(ctx); ++ /* netdevices should be dumped after dumping open files ++ as we need to restore netdevice binding to /dev/net/tun file */ ++ if (!err) ++ err = cpt_dump_ifinfo(ctx); ++ if (!err) ++ err = cpt_dump_namespace(ctx); ++ if (!err) ++ err = cpt_dump_sighand(ctx); ++ if (!err) ++ err = cpt_dump_vm(ctx); ++ if (!err) ++ err = cpt_dump_sysvsem(ctx); ++ if (!err) ++ err = cpt_dump_sysvmsg(ctx); ++ if (!err) ++ err = cpt_dump_tasks(ctx); ++ if (!err) ++ err = cpt_dump_orphaned_sockets(ctx); ++#if defined(CONFIG_VE_IPTABLES) && \ ++ (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) ++ if (!err) ++ err = cpt_dump_ip_conntrack(ctx); ++#endif ++ if (!err) { ++ if (virtinfo_notifier_call(VITYPE_SCP, ++ VIRTINFO_SCP_DUMP, ctx) & NOTIFY_FAIL) ++ err = -ECHRNG; ++ } ++ if (!err) ++ err = cpt_dump_utsname(ctx); ++ ++#ifndef CONFIG_IA64 ++ if (!err) ++ err = cpt_dump_vsyscall(ctx); ++#endif ++ ++ if (!err) ++ err = cpt_dump_tail(ctx); ++ ++ err2 = cpt_close_dumpfile(ctx); ++ ++out: ++ current->nsproxy = old_ns; ++ set_exec_env(oldenv); ++out_noenv: ++ up_read(&env->op_sem); ++ put_ve(env); ++ return err ? 
: err2; ++} ++ ++int cpt_vps_suspend(struct cpt_context *ctx) ++{ ++ struct ve_struct *oldenv, *env; ++ struct nsproxy *old_ns; ++ int err = 0; ++ ++ ctx->kernel_config_flags = test_kernel_config(); ++ cpt_object_init(ctx); ++ ++ if (!ctx->ve_id) { ++ env = get_exec_env(); ++ if (env == get_ve0()) ++ return -EINVAL; ++ wprintk("undefined ve_id\n"); ++ ctx->ve_id = env->veid; ++ get_ve(env); ++ } else { ++ env = get_ve_by_id(ctx->ve_id); ++ if (!env) ++ return -ESRCH; ++ } ++ ++#ifdef CONFIG_VE_IPTABLES ++ ctx->iptables_mask = env->_iptables_modules; ++#endif ++ ctx->features = env->features; ++ ++ down_write(&env->op_sem); ++ err = -ESRCH; ++ if (!env->is_running) ++ goto out_noenv; ++ ++ err = -EBUSY; ++ if (env->is_locked) ++ goto out_noenv; ++ env->is_locked = 1; ++ downgrade_write(&env->op_sem); ++ ++ oldenv = set_exec_env(env); ++ old_ns = current->nsproxy; ++ current->nsproxy = env->ve_ns; ++ ++ /* Phase 0: find and stop all the tasks */ ++ if ((err = vps_stop_tasks(ctx)) != 0) ++ goto out; ++ ++ if ((err = cpt_suspend_network(ctx)) != 0) ++ goto out_wake; ++ ++ /* At the moment all the state is frozen. We do not need to lock ++ * the state, which can be changed only if the tasks are running. ++ */ ++ ++ /* Phase 1: collect task tree */ ++ if ((err = vps_collect_tasks(ctx)) != 0) ++ goto out_wake; ++ ++ /* Phase 1': collect all the resources */ ++ if ((err = cpt_collect(ctx)) != 0) ++ goto out; ++ ++out: ++ current->nsproxy = old_ns; ++ set_exec_env(oldenv); ++ up_read(&env->op_sem); ++ put_ve(env); ++ return err; ++ ++out_noenv: ++ up_write(&env->op_sem); ++ put_ve(env); ++ return err; ++ ++out_wake: ++ read_lock(&tasklist_lock); ++ wake_ve(ctx); ++ read_unlock(&tasklist_lock); ++ goto out; ++} ++ ++static void check_unsupported_netdevices(struct cpt_context *ctx, __u32 *caps) ++{ ++ struct net *net = get_exec_env()->ve_netns; ++ struct net_device *dev; ++ ++ read_lock(&dev_base_lock); ++ for_each_netdev(net, dev) { ++ if (dev != net->loopback_dev ++#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) ++ && !(KSYMREF(veth_open) && dev->open == KSYMREF(veth_open)) ++#endif ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ && dev != get_exec_env()->_venet_dev ++#endif ++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) ++ && dev->open != tun_net_open ++#endif ++ ) { ++ eprintk_ctx("unsupported netdevice %s\n", dev->name); ++ *caps |= (1<flags & _TIF_IA32)) ++ *caps |= flags & ((1<mm && p->mm->context.vdso) { ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) ++ *caps |= flags & (1<mm && p->mm->context.vdso) ++ *caps |= flags & (1<= 0) { ++ switch (check_process_external(p)) { ++ case PIDTYPE_PID: ++ eprintk_ctx("external process %d/%d(%s) inside CT (e.g. 
vzctl enter or vzctl exec).\n", task_pid_vnr(p), p->pid, p->comm); ++ *caps |= (1<pid, p->comm); ++ *caps |= (1<pid, p->comm); ++ *caps |= (1<pid, p->comm); ++ *caps |= (1<nsproxy) { ++ ns = p->nsproxy->mnt_ns; ++ if (ns) ++ get_mnt_ns(ns); ++ } ++ task_unlock(p); ++ if (ns) { ++ if (ns != current->nsproxy->mnt_ns) { ++ eprintk_ctx("namespaces are not supported: process %d/%d(%s)\n", task_pid_vnr(p), p->pid, p->comm); ++ *caps |= (1<policy != SCHED_NORMAL) { ++ eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", task_pid_vnr(p), p->pid, p->comm); ++ *caps |= (1<pid, virt_pid(p), p->comm); ++ *caps |= (1<list) { ++ struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list); ++ struct path p; ++ ++ p.dentry = mnt->mnt_root; ++ p.mnt = mnt; ++ path = __d_path(&p, &env->root_path, ++ path_buf, PAGE_SIZE); ++ if (IS_ERR(path)) ++ continue; ++ ++ if (check_one_vfsmount(mnt)) { ++ eprintk_ctx("Unsupported filesystem %s\n", mnt->mnt_sb->s_type->name); ++ *caps |= (1<ve_id) ++ return -EINVAL; ++ ++ env = get_ve_by_id(ctx->ve_id); ++ if (env == NULL) ++ return -ESRCH; ++ ++ *caps = flags & (1<nsproxy; ++ current->nsproxy = env->ve_ns; ++ ++ check_unsupported_netdevices(ctx, caps); ++ ++ read_lock(&tasklist_lock); ++ root = find_task_by_vpid(1); ++ if (!root) { ++ read_unlock(&tasklist_lock); ++ eprintk_ctx("cannot find ve init\n"); ++ err = -ESRCH; ++ goto out; ++ } ++ get_task_struct(root); ++ for (p = __first_task_ve(env); p != NULL ; p = __next_task_ve(env, p)) ++ check_one_process(ctx, caps, flags, env, root, p); ++ read_unlock(&tasklist_lock); ++ ++ task_lock(root); ++ n = NULL; ++ if (root->nsproxy) { ++ n = root->nsproxy->mnt_ns; ++ if (n) ++ get_mnt_ns(n); ++ } ++ task_unlock(root); ++ if (n) { ++ char *path_buf; ++ ++ path_buf = (char *) __get_free_page(GFP_KERNEL); ++ if (!path_buf) { ++ put_mnt_ns(n); ++ err = -ENOMEM; ++ goto out_root; ++ } ++ ++ check_unsupported_mounts(ctx, caps, env, n, path_buf); ++ ++ free_page((unsigned long) path_buf); ++ put_mnt_ns(n); ++ } ++ ++ err = 0; ++ ++out_root: ++ put_task_struct(root); ++out: ++ current->nsproxy = old_ns; ++ set_exec_env(old_env); ++ put_ve(env); ++ ++ return err; ++} +diff --git a/kernel/cpt/cpt_dump.h b/kernel/cpt/cpt_dump.h +new file mode 100644 +index 0000000..71f6d94 +--- /dev/null ++++ b/kernel/cpt/cpt_dump.h +@@ -0,0 +1,16 @@ ++int cpt_dump(struct cpt_context *cpt); ++int rst_undump(struct cpt_context *cpt); ++int cpt_suspend(struct cpt_context *cpt); ++int cpt_resume(struct cpt_context *cpt); ++int cpt_kill(struct cpt_context *cpt); ++int rst_clean(struct cpt_context *cpt); ++int rst_resume(struct cpt_context *cpt); ++int rst_kill(struct cpt_context *cpt); ++ ++int cpt_freeze_one(pid_t pid, int freeze); ++int cpt_vps_suspend(struct cpt_context *ctx); ++int vps_rst_undump(struct cpt_context *ctx); ++ ++int cpt_vps_caps(struct cpt_context *ctx, __u32 *caps); ++ ++int cpt_check_unsupported(struct task_struct *tsk, struct cpt_context *ctx); +diff --git a/kernel/cpt/cpt_epoll.c b/kernel/cpt/cpt_epoll.c +new file mode 100644 +index 0000000..81d2b98 +--- /dev/null ++++ b/kernel/cpt/cpt_epoll.c +@@ -0,0 +1,113 @@ ++/* ++ * ++ * kernel/cpt/cpt_epoll.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_files.h" ++#include "cpt_kernel.h" ++#include "cpt_fsmagic.h" ++#include "cpt_syscalls.h" ++ ++int cpt_dump_epolldev(cpt_object_t *obj, cpt_context_t *ctx) ++{ ++ int err = 0; ++ struct file *file = obj->o_obj; ++ struct eventpoll *ep; ++ struct rb_node *rbp; ++ struct cpt_epoll_image ei; ++ ++ if (file->f_op != &eventpoll_fops) { ++ eprintk_ctx("bad epoll file\n"); ++ return -EINVAL; ++ } ++ ++ ep = file->private_data; ++ ++ /* eventpoll.c does not protect open /proc/N/fd, silly. ++ * Opener will get an invalid file with uninitialized private_data ++ */ ++ if (unlikely(ep == NULL)) { ++ eprintk_ctx("bad epoll device\n"); ++ return -EINVAL; ++ } ++ ++ cpt_open_object(NULL, ctx); ++ ++ ei.cpt_next = CPT_NULL; ++ ei.cpt_object = CPT_OBJ_EPOLL; ++ ei.cpt_hdrlen = sizeof(ei); ++ ei.cpt_content = CPT_CONTENT_ARRAY; ++ ei.cpt_file = obj->o_pos; ++ ++ ctx->write(&ei, sizeof(ei), ctx); ++ ++ mutex_lock(&epmutex); ++ for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { ++ loff_t saved_obj; ++ cpt_object_t *tobj; ++ struct cpt_epoll_file_image efi; ++ struct epitem *epi; ++ epi = rb_entry(rbp, struct epitem, rbn); ++ tobj = lookup_cpt_object(CPT_OBJ_FILE, epi->ffd.file, ctx); ++ if (tobj == NULL) { ++ eprintk_ctx("epoll device refers to an external file\n"); ++ err = -EBUSY; ++ break; ++ } ++ cpt_push_object(&saved_obj, ctx); ++ cpt_open_object(NULL, ctx); ++ ++ efi.cpt_next = CPT_NULL; ++ efi.cpt_object = CPT_OBJ_EPOLL_FILE; ++ efi.cpt_hdrlen = sizeof(efi); ++ efi.cpt_content = CPT_CONTENT_VOID; ++ efi.cpt_file = tobj->o_pos; ++ efi.cpt_fd = epi->ffd.fd; ++ efi.cpt_events = epi->event.events; ++ efi.cpt_data = epi->event.data; ++ efi.cpt_revents = 0; ++ efi.cpt_ready = 0; ++ if (!list_empty(&epi->rdllink)) ++ efi.cpt_ready = 1; ++ ++ ctx->write(&efi, sizeof(efi), ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ mutex_unlock(&epmutex); ++ ++ cpt_close_object(ctx); ++ ++ return err; ++} ++ +diff --git a/kernel/cpt/cpt_exports.c b/kernel/cpt/cpt_exports.c +new file mode 100644 +index 0000000..f492331 +--- /dev/null ++++ b/kernel/cpt/cpt_exports.c +@@ -0,0 +1,13 @@ ++#include ++#include ++ ++#include "cpt_obj.h" ++ ++EXPORT_SYMBOL(alloc_cpt_object); ++EXPORT_SYMBOL(intern_cpt_object); ++EXPORT_SYMBOL(insert_cpt_object); ++EXPORT_SYMBOL(__cpt_object_add); ++EXPORT_SYMBOL(cpt_object_add); ++EXPORT_SYMBOL(cpt_object_get); ++EXPORT_SYMBOL(lookup_cpt_object); ++EXPORT_SYMBOL(lookup_cpt_obj_bypos); +diff --git a/kernel/cpt/cpt_files.c b/kernel/cpt/cpt_files.c +new file mode 100644 +index 0000000..1a2dd15 +--- /dev/null ++++ b/kernel/cpt/cpt_files.c +@@ -0,0 +1,1634 @@ ++/* ++ * ++ * kernel/cpt/cpt_files.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_files.h" ++#include "cpt_socket.h" ++#include "cpt_kernel.h" ++#include "cpt_fsmagic.h" ++#include "cpt_syscalls.h" ++ ++void cpt_printk_dentry(struct dentry *d, struct vfsmount *mnt) ++{ ++ char *path; ++ struct path p; ++ unsigned long pg = __get_free_page(GFP_KERNEL); ++ ++ if (!pg) ++ return; ++ ++ p.dentry = d; ++ p.mnt = mnt; ++ path = d_path(&p, (char *)pg, PAGE_SIZE); ++ ++ if (!IS_ERR(path)) ++ eprintk("<%s>", path); ++ free_page(pg); ++} ++ ++int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, ++ cpt_context_t *ctx) ++{ ++ if (path[0] == '/' && !(!IS_ROOT(d) && d_unhashed(d))) { ++ struct nameidata nd; ++ if (path_lookup(path, 0, &nd)) { ++ eprintk_ctx("d_path cannot be looked up %s\n", path); ++ return -EINVAL; ++ } ++ if (nd.path.dentry != d || nd.path.mnt != mnt) { ++ eprintk_ctx("d_path is invisible %s\n", path); ++ path_put(&nd.path); ++ return -EINVAL; ++ } ++ path_put(&nd.path); ++ } ++ return 0; ++} ++ ++static int ++cpt_replaced(struct dentry * de, struct vfsmount *mnt, cpt_context_t * ctx) ++{ ++ int result = 0; ++ ++#if defined(CONFIG_VZFS_FS) || defined(CONFIG_VZFS_FS_MODULE) ++ char *path; ++ unsigned long pg; ++ struct dentry * renamed_dentry; ++ struct path p; ++ ++ if (de->d_sb->s_magic != FSMAGIC_VEFS) ++ return 0; ++ if (de->d_inode->i_nlink != 0 || ++ atomic_read(&de->d_inode->i_writecount) > 0) ++ return 0; ++ ++ renamed_dentry = vefs_replaced_dentry(de); ++ if (renamed_dentry == NULL) ++ return 0; ++ ++ pg = __get_free_page(GFP_KERNEL); ++ if (!pg) ++ return 0; ++ ++ p.dentry = de; ++ p.mnt = mnt; ++ path = d_path(&p, (char *)pg, PAGE_SIZE); ++ if (!IS_ERR(path)) { ++ int len; ++ struct nameidata nd; ++ ++ len = pg + PAGE_SIZE - 1 - (unsigned long)path; ++ if (len >= sizeof("(deleted) ") - 1 && ++ !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) { ++ len -= sizeof("(deleted) ") - 1; ++ path += sizeof("(deleted) ") - 1; ++ } ++ ++ if (path_lookup(path, 0, &nd) == 0) { ++ if (mnt == nd.path.mnt && ++ vefs_is_renamed_dentry(nd.path.dentry, renamed_dentry)) ++ result = 1; ++ path_put(&nd.path); ++ } ++ } ++ free_page(pg); ++#endif ++ return result; ++} ++ ++static int cpt_dump_dentry(struct dentry *d, struct vfsmount *mnt, ++ int replaced, cpt_context_t *ctx) ++{ ++ int len; ++ char *path; ++ struct path p; ++ char *pg = cpt_get_buf(ctx); ++ loff_t saved; ++ ++ p.dentry = d; ++ p.mnt = mnt; ++ path = d_path(&p, pg, PAGE_SIZE); ++ len = PTR_ERR(path); ++ ++ if (IS_ERR(path)) { ++ struct cpt_object_hdr o; ++ char tmp[1]; ++ ++ /* VZ changes d_path() to return EINVAL, when path ++ * is not supposed to be visible inside VE. ++ * This changes behaviour of d_path() comparing ++ * to mainstream kernel, f.e. d_path() fails ++ * on any kind of shared memory. Maybe, there are ++ * another cases, but I am aware only about this one. ++ * So, we just ignore error on shmem mounts and proceed. ++ * Otherwise, checkpointing is prohibited because ++ * of reference to an invisible file. 
++ */ ++ if (len != -EINVAL || ++ mnt != get_exec_env()->shmem_mnt) ++ eprintk_ctx("d_path err=%d\n", len); ++ else ++ len = 0; ++ ++ cpt_push_object(&saved, ctx); ++ cpt_open_object(NULL, ctx); ++ o.cpt_next = CPT_NULL; ++ o.cpt_object = CPT_OBJ_NAME; ++ o.cpt_hdrlen = sizeof(o); ++ o.cpt_content = CPT_CONTENT_NAME; ++ tmp[0] = 0; ++ ++ ctx->write(&o, sizeof(o), ctx); ++ ctx->write(tmp, 1, ctx); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved, ctx); ++ ++ __cpt_release_buf(ctx); ++ return len; ++ } else { ++ struct cpt_object_hdr o; ++ ++ len = pg + PAGE_SIZE - 1 - path; ++ if (replaced && ++ len >= sizeof("(deleted) ") - 1 && ++ !memcmp(path, "(deleted) ", sizeof("(deleted) ") - 1)) { ++ len -= sizeof("(deleted) ") - 1; ++ path += sizeof("(deleted) ") - 1; ++ } ++ o.cpt_next = CPT_NULL; ++ o.cpt_object = CPT_OBJ_NAME; ++ o.cpt_hdrlen = sizeof(o); ++ o.cpt_content = CPT_CONTENT_NAME; ++ path[len] = 0; ++ ++ if (cpt_verify_overmount(path, d, mnt, ctx)) { ++ __cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ ++ cpt_push_object(&saved, ctx); ++ cpt_open_object(NULL, ctx); ++ ctx->write(&o, sizeof(o), ctx); ++ ctx->write(path, len+1, ctx); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved, ctx); ++ __cpt_release_buf(ctx); ++ } ++ return 0; ++} ++ ++int cpt_dump_string(const char *s, struct cpt_context *ctx) ++{ ++ int len; ++ struct cpt_object_hdr o; ++ ++ cpt_open_object(NULL, ctx); ++ len = strlen(s); ++ o.cpt_next = CPT_NULL; ++ o.cpt_object = CPT_OBJ_NAME; ++ o.cpt_hdrlen = sizeof(o); ++ o.cpt_content = CPT_CONTENT_NAME; ++ ++ ctx->write(&o, sizeof(o), ctx); ++ ctx->write(s, len+1, ctx); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ return 0; ++} ++ ++static int ++cpt_dump_filename(struct file *file, int replaced, cpt_context_t *ctx) ++{ ++ return cpt_dump_dentry(file->f_dentry, file->f_vfsmnt, replaced, ctx); ++} ++ ++int cpt_dump_inode(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_inode_image *v = cpt_get_buf(ctx); ++ struct kstat sbuf; ++ ++ v->cpt_next = sizeof(*v); ++ v->cpt_object = CPT_OBJ_INODE; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ if ((err = vfs_getattr(mnt, d, &sbuf)) != 0) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ ++ v->cpt_dev = d->d_inode->i_sb->s_dev; ++ v->cpt_ino = d->d_inode->i_ino; ++ v->cpt_mode = sbuf.mode; ++ v->cpt_nlink = sbuf.nlink; ++ v->cpt_uid = sbuf.uid; ++ v->cpt_gid = sbuf.gid; ++ v->cpt_rdev = d->d_inode->i_rdev; ++ v->cpt_size = sbuf.size; ++ v->cpt_atime = cpt_timespec_export(&sbuf.atime); ++ v->cpt_mtime = cpt_timespec_export(&sbuf.mtime); ++ v->cpt_ctime = cpt_timespec_export(&sbuf.ctime); ++ v->cpt_blksize = sbuf.blksize; ++ v->cpt_blocks = sbuf.blocks; ++ v->cpt_sb = d->d_inode->i_sb->s_magic; ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ return 0; ++} ++ ++int cpt_collect_files(cpt_context_t * ctx) ++{ ++ int err; ++ cpt_object_t *obj; ++ int index = 0; ++ ++ /* Collect process fd sets */ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ struct task_struct *tsk = obj->o_obj; ++ if (tsk->files && cpt_object_add(CPT_OBJ_FILES, tsk->files, ctx) == NULL) ++ return -ENOMEM; ++ } ++ ++ /* Collect files from fd sets */ ++ for_each_object(obj, CPT_OBJ_FILES) { ++ int fd; ++ struct files_struct *f = obj->o_obj; ++ ++ cpt_obj_setindex(obj, index++, ctx); ++ ++ if (obj->o_count != atomic_read(&f->count)) { ++ eprintk_ctx("files_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&f->count)); ++ 
return -EBUSY; ++ } ++ ++ for (fd = 0; fd < f->fdt->max_fds; fd++) { ++ struct file *file = fcheck_files(f, fd); ++ if (file && cpt_object_add(CPT_OBJ_FILE, file, ctx) == NULL) ++ return -ENOMEM; ++ } ++ } ++ ++ /* Collect files queued by AF_UNIX sockets. */ ++ if ((err = cpt_collect_passedfds(ctx)) < 0) ++ return err; ++ ++ /* OK. At this point we should count all the references. */ ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file = obj->o_obj; ++ struct file *parent; ++ cpt_object_t *ino_obj; ++ ++ if (obj->o_count != atomic_read(&file->f_count)) { ++ eprintk_ctx("file struct is referenced outside %d %d\n", obj->o_count, atomic_read(&file->f_count)); ++ cpt_printk_dentry(file->f_dentry, file->f_vfsmnt); ++ return -EBUSY; ++ } ++ ++ switch (file->f_dentry->d_inode->i_sb->s_magic) { ++ case FSMAGIC_FUTEX: ++ case FSMAGIC_MQUEUE: ++ case FSMAGIC_BDEV: ++#ifndef CONFIG_INOTIFY_USER ++ case FSMAGIC_INOTIFY: ++#endif ++ eprintk_ctx("file on unsupported FS: magic %08lx\n", file->f_dentry->d_inode->i_sb->s_magic); ++ return -EBUSY; ++ } ++ ++ /* Collect inode. It is necessary mostly to resolve deleted ++ * hard links. */ ++ ino_obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); ++ if (ino_obj == NULL) ++ return -ENOMEM; ++ ++ parent = ino_obj->o_parent; ++ if (!parent || (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) ++ ino_obj->o_parent = file; ++ ++ if (S_ISCHR(file->f_dentry->d_inode->i_mode)) { ++ int maj = imajor(file->f_dentry->d_inode); ++ if (maj == PTY_MASTER_MAJOR || ++ (maj >= UNIX98_PTY_MASTER_MAJOR && ++ maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) || ++ maj == PTY_SLAVE_MAJOR || ++ maj == UNIX98_PTY_SLAVE_MAJOR || ++ maj == TTYAUX_MAJOR) { ++ err = cpt_collect_tty(file, ctx); ++ if (err) ++ return err; ++ } ++ } ++ ++ if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) { ++ err = cpt_collect_socket(file, ctx); ++ if (err) ++ return err; ++ } ++ } ++ ++ err = cpt_index_sockets(ctx); ++ ++ return err; ++} ++ ++/* /dev/ptmx is special, all the files share one inode, but real tty backend ++ * is attached via file->private_data. 
++ */ ++ ++static inline int is_cloning_inode(struct inode *ino) ++{ ++ return S_ISCHR(ino->i_mode) && ++ ino->i_rdev == MKDEV(TTYAUX_MAJOR,2); ++} ++ ++static int dump_one_flock(struct file_lock *fl, int owner, struct cpt_context *ctx) ++{ ++ pid_t pid; ++ struct cpt_flock_image *v = cpt_get_buf(ctx); ++ ++ v->cpt_next = sizeof(*v); ++ v->cpt_object = CPT_OBJ_FLOCK; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_VOID; ++ ++ v->cpt_owner = owner; ++ ++ pid = fl->fl_pid; ++ if (pid) { ++ pid = pid_to_vpid(fl->fl_pid); ++ if (pid == -1) { ++ if (!(fl->fl_flags&FL_FLOCK)) { ++ eprintk_ctx("posix lock from another container?\n"); ++ cpt_release_buf(ctx); ++ return -EBUSY; ++ } ++ pid = 0; ++ } ++ } ++ ++ v->cpt_pid = pid; ++ v->cpt_start = fl->fl_start; ++ v->cpt_end = fl->fl_end; ++ v->cpt_flags = fl->fl_flags; ++ v->cpt_type = fl->fl_type; ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ return 0; ++} ++ ++ ++int cpt_dump_flock(struct file *file, struct cpt_context *ctx) ++{ ++ int err = 0; ++ struct file_lock *fl; ++ ++ lock_kernel(); ++ for (fl = file->f_dentry->d_inode->i_flock; ++ fl; fl = fl->fl_next) { ++ if (file != fl->fl_file) ++ continue; ++ if (fl->fl_flags & FL_LEASE) { ++ eprintk_ctx("lease lock is not supported\n"); ++ err = -EINVAL; ++ break; ++ } ++ if (fl->fl_flags & FL_POSIX) { ++ cpt_object_t *obj; ++ obj = lookup_cpt_object(CPT_OBJ_FILES, fl->fl_owner, ctx); ++ if (obj) { ++ dump_one_flock(fl, obj->o_index, ctx); ++ continue; ++ } else { ++ eprintk_ctx("unknown lock owner %p\n", fl->fl_owner); ++ err = -EINVAL; ++ } ++ } ++ if (fl->fl_flags & FL_FLOCK) { ++ dump_one_flock(fl, -1, ctx); ++ continue; ++ } ++ } ++ unlock_kernel(); ++ return err; ++} ++ ++static int dump_one_file(cpt_object_t *obj, struct file *file, cpt_context_t *ctx) ++{ ++ int err = 0; ++ cpt_object_t *iobj; ++ struct cpt_file_image *v = cpt_get_buf(ctx); ++ struct kstat sbuf; ++ int replaced = 0; ++ ++ cpt_open_object(obj, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_FILE; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_flags = file->f_flags; ++ v->cpt_mode = file->f_mode; ++ v->cpt_pos = file->f_pos; ++ v->cpt_uid = file->f_uid; ++ v->cpt_gid = file->f_gid; ++ ++ vfs_getattr(file->f_vfsmnt, file->f_dentry, &sbuf); ++ ++ v->cpt_i_mode = sbuf.mode; ++ v->cpt_lflags = 0; ++ if (IS_ROOT(file->f_dentry)) ++ v->cpt_lflags |= CPT_DENTRY_ROOT; ++ else if (d_unhashed(file->f_dentry)) { ++ if (cpt_replaced(file->f_dentry, file->f_vfsmnt, ctx)) { ++ v->cpt_lflags |= CPT_DENTRY_REPLACED; ++ replaced = 1; ++ } else { ++ v->cpt_lflags |= CPT_DENTRY_DELETED; ++ } ++ } ++ if (is_cloning_inode(file->f_dentry->d_inode)) ++ v->cpt_lflags |= CPT_DENTRY_CLONING; ++ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_PROC) ++ v->cpt_lflags |= CPT_DENTRY_PROC; ++ v->cpt_inode = CPT_NULL; ++ if (!(v->cpt_lflags & CPT_DENTRY_REPLACED)) { ++ iobj = lookup_cpt_object(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); ++ if (iobj) ++ v->cpt_inode = iobj->o_pos; ++ } ++ v->cpt_priv = CPT_NULL; ++ v->cpt_fown_fd = -1; ++ if (S_ISCHR(v->cpt_i_mode)) { ++ iobj = lookup_cpt_object(CPT_OBJ_TTY, file->private_data, ctx); ++ if (iobj) { ++ v->cpt_priv = iobj->o_pos; ++ if (file->f_flags&FASYNC) ++ v->cpt_fown_fd = cpt_tty_fasync(file, ctx); ++ } ++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) ++ if (file->f_op && file->f_op->open == tun_chr_open) ++ v->cpt_lflags |= CPT_DENTRY_TUNTAP; ++#endif ++ } ++ if (S_ISSOCK(v->cpt_i_mode)) { ++ if 
(obj->o_index < 0) { ++ eprintk_ctx("BUG: no socket index\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ v->cpt_priv = obj->o_index; ++ if (file->f_flags&FASYNC) ++ v->cpt_fown_fd = cpt_socket_fasync(file, ctx); ++ } ++ if (file->f_op == &eventpoll_fops) { ++ v->cpt_priv = file->f_dentry->d_inode->i_ino; ++ v->cpt_lflags |= CPT_DENTRY_EPOLL; ++ } ++ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) { ++ v->cpt_priv = file->f_dentry->d_inode->i_ino; ++ v->cpt_lflags |= CPT_DENTRY_INOTIFY; ++ } ++ ++ v->cpt_fown_pid = (file->f_owner.pid == NULL ? ++ CPT_FOWN_STRAY_PID : pid_vnr(file->f_owner.pid)); ++ v->cpt_fown_uid = file->f_owner.uid; ++ v->cpt_fown_euid = file->f_owner.euid; ++ v->cpt_fown_signo = file->f_owner.signum; ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ if (!S_ISSOCK(v->cpt_i_mode)) { ++ err = cpt_dump_filename(file, replaced, ctx); ++ if (err) ++ return err; ++ if ((file->f_mode & FMODE_WRITE) && ++ file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_VEFS) ++ vefs_track_notify(file->f_dentry, 1); ++ } ++ ++ if (file->f_dentry->d_inode->i_flock) ++ err = cpt_dump_flock(file, ctx); ++ ++ cpt_close_object(ctx); ++ ++ return err; ++} ++ ++/* About this weird function... Crappy code dealing with SYSV shared memory ++ * defines TMPFS inode and file with f_op doing only mmap. So... ++ * Maybe, this is wrong and leaks something. It is clear access to ++ * SYSV shmem via mmap is quite unusual and impossible from user space. ++ */ ++static int dump_content_shm(struct file *file, struct cpt_context *ctx) ++{ ++ struct cpt_obj_bits *v; ++ loff_t saved_pos; ++ unsigned long addr; ++ ++ addr = do_mmap_pgoff(file, 0, file->f_dentry->d_inode->i_size, ++ PROT_READ, MAP_SHARED, 0); ++ if (IS_ERR((void*)addr)) ++ return PTR_ERR((void*)addr); ++ ++ cpt_push_object(&saved_pos, ctx); ++ cpt_open_object(NULL, ctx); ++ v = cpt_get_buf(ctx); ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_BITS; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_DATA; ++ v->cpt_size = file->f_dentry->d_inode->i_size; ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ctx->write((void*)addr, file->f_dentry->d_inode->i_size, ctx); ++ ctx->align(ctx); ++ do_munmap(current->mm, addr, file->f_dentry->d_inode->i_size); ++ ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_pos, ctx); ++ return 0; ++} ++ ++static int data_is_zero(char *addr, int len) ++{ ++ int i; ++ unsigned long zerolong = 0; ++ ++ for (i=0; if_op == NULL) ++ return -EINVAL; ++ ++ do_read = file->f_op->read; ++ if (file->f_op == &shm_file_operations) { ++ struct shm_file_data *sfd = file->private_data; ++ ++ cpt_dump_content_sysvshm(sfd->file, ctx); ++ ++ return 0; ++ } ++ if (file->f_op == &shmem_file_operations) { ++ do_read = file->f_dentry->d_inode->i_fop->read; ++ cpt_dump_content_sysvshm(file, ctx); ++ if (!do_read) { ++ wprintk_ctx("TMPFS is not configured?\n"); ++ return dump_content_shm(file, ctx); ++ } ++ } ++ ++ if (!(file->f_mode & FMODE_READ) || ++ (file->f_flags & O_DIRECT)) { ++ file = dentry_open(dget(file->f_dentry), ++ mntget(file->f_vfsmnt), O_RDONLY); ++ if (IS_ERR(file)) { ++ cpt_printk_dentry(file->f_dentry, file->f_vfsmnt); ++ eprintk_ctx("cannot reopen file for read %ld\n", PTR_ERR(file)); ++ return PTR_ERR(file); ++ } ++ } else { ++ atomic_inc(&file->f_count); ++ } ++ ++ for (;;) { ++ mm_segment_t oldfs; ++ int err; ++ ++ (void)cpt_get_buf(ctx); ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ err = do_read(file, ctx->tmpbuf, PAGE_SIZE, &pos); ++ set_fs(oldfs); 
++ if (err < 0) { ++ eprintk_ctx("dump_content_regular: do_read: %d", err); ++ fput(file); ++ __cpt_release_buf(ctx); ++ return err; ++ } ++ if (err == 0) { ++ __cpt_release_buf(ctx); ++ break; ++ } ++ if (data_is_zero(ctx->tmpbuf, err)) { ++ if (obj_opened != CPT_NULL) { ++ ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end)); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_pos, ctx); ++ obj_opened = CPT_NULL; ++ } ++ } else { ++ if (obj_opened == CPT_NULL) { ++ cpt_push_object(&saved_pos, ctx); ++ cpt_open_object(NULL, ctx); ++ obj_opened = ctx->file->f_pos; ++ pgb.cpt_next = CPT_NULL; ++ pgb.cpt_object = CPT_OBJ_PAGES; ++ pgb.cpt_hdrlen = sizeof(pgb); ++ pgb.cpt_content = CPT_CONTENT_DATA; ++ pgb.cpt_start = pos - err; ++ pgb.cpt_end = pgb.cpt_start; ++ ctx->write(&pgb, sizeof(pgb), ctx); ++ } ++ ctx->write(ctx->tmpbuf, err, ctx); ++ pgb.cpt_end += err; ++ } ++ __cpt_release_buf(ctx); ++ } ++ ++ fput(file); ++ ++ if (obj_opened != CPT_NULL) { ++ ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end)); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_pos, ctx); ++ obj_opened = CPT_NULL; ++ } ++ return 0; ++} ++ ++ ++static int dump_content_chrdev(struct file *file, struct cpt_context *ctx) ++{ ++ struct inode *ino = file->f_dentry->d_inode; ++ int maj; ++ ++ maj = imajor(ino); ++ if (maj == MEM_MAJOR) { ++ /* Well, OK. */ ++ return 0; ++ } ++ if (maj == PTY_MASTER_MAJOR || ++ (maj >= UNIX98_PTY_MASTER_MAJOR && ++ maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) || ++ maj == PTY_SLAVE_MAJOR || ++ maj == UNIX98_PTY_SLAVE_MAJOR || ++ maj == TTYAUX_MAJOR) { ++ return cpt_dump_content_tty(file, ctx); ++ } ++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) ++ if (file->f_op && file->f_op->open == tun_chr_open) ++ return 0; ++#endif ++ eprintk_ctx("unsupported chrdev %d/%d\n", maj, iminor(ino)); ++ return -EINVAL; ++} ++ ++static int dump_content_blkdev(struct file *file, struct cpt_context *ctx) ++{ ++ struct inode *ino = file->f_dentry->d_inode; ++ ++ /* We are not going to transfer them. */ ++ eprintk_ctx("unsupported blkdev %d/%d\n", imajor(ino), iminor(ino)); ++ return -EINVAL; ++} ++ ++static int dump_content_fifo(struct file *file, struct cpt_context *ctx) ++{ ++ struct inode *ino = file->f_dentry->d_inode; ++ cpt_object_t *obj; ++ loff_t saved_pos; ++ int readers; ++ int writers; ++ int anon = 0; ++ ++ mutex_lock(&ino->i_mutex); ++ readers = ino->i_pipe->readers; ++ writers = ino->i_pipe->writers; ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file1 = obj->o_obj; ++ if (file1->f_dentry->d_inode == ino) { ++ if (file1->f_mode & FMODE_READ) ++ readers--; ++ if (file1->f_mode & FMODE_WRITE) ++ writers--; ++ } ++ } ++ mutex_unlock(&ino->i_mutex); ++ if (readers || writers) { ++ struct dentry *dr = file->f_dentry->d_sb->s_root; ++ if (dr->d_name.len == 7 && memcmp(dr->d_name.name,"pipefs:",7) == 0) ++ anon = 1; ++ ++ if (anon) { ++ eprintk_ctx("pipe has %d/%d external readers/writers\n", readers, writers); ++ return -EBUSY; ++ } ++ /* If fifo has external readers/writers, we are in troubles. ++ * If the buffer is not empty, we must move its content. ++ * But if the fifo is owned by a service, we cannot do ++ * this. See? ++ * ++ * For now we assume, that if fifo is opened by another ++ * process, we do not own it and, hence, migrate without ++ * data. ++ */ ++ return 0; ++ } ++ ++ /* OK, we must save fifo state. No semaphores required. 
*/ ++ ++ if (ino->i_pipe->nrbufs) { ++ struct cpt_obj_bits *v = cpt_get_buf(ctx); ++ struct pipe_inode_info *info; ++ int count, buf, nrbufs; ++ ++ mutex_lock(&ino->i_mutex); ++ info = ino->i_pipe; ++ count = 0; ++ buf = info->curbuf; ++ nrbufs = info->nrbufs; ++ while (--nrbufs >= 0) { ++ if (!info->bufs[buf].ops->can_merge) { ++ mutex_unlock(&ino->i_mutex); ++ eprintk_ctx("unknown format of pipe buffer\n"); ++ return -EINVAL; ++ } ++ count += info->bufs[buf].len; ++ buf = (buf+1) & (PIPE_BUFFERS-1); ++ } ++ ++ if (!count) { ++ mutex_unlock(&ino->i_mutex); ++ return 0; ++ } ++ ++ cpt_push_object(&saved_pos, ctx); ++ cpt_open_object(NULL, ctx); ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_BITS; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_DATA; ++ v->cpt_size = count; ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ count = 0; ++ buf = info->curbuf; ++ nrbufs = info->nrbufs; ++ while (--nrbufs >= 0) { ++ struct pipe_buffer *b = info->bufs + buf; ++ /* need to ->pin first? */ ++ void * addr = b->ops->map(info, b, 0); ++ ctx->write(addr + b->offset, b->len, ctx); ++ b->ops->unmap(info, b, addr); ++ buf = (buf+1) & (PIPE_BUFFERS-1); ++ } ++ ++ mutex_unlock(&ino->i_mutex); ++ ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_pos, ctx); ++ } ++ ++ return 0; ++} ++ ++static int dump_content_socket(struct file *file, struct cpt_context *ctx) ++{ ++ return 0; ++} ++ ++struct cpt_dirent { ++ unsigned long ino; ++ char *name; ++ int namelen; ++ int found; ++}; ++ ++static int cpt_filldir(void * __buf, const char * name, int namelen, ++ loff_t offset, u64 ino, unsigned int d_type) ++{ ++ struct cpt_dirent * dirent = __buf; ++ ++ if ((ino == dirent->ino) && (namelen < PAGE_SIZE - 1)) { ++ memcpy(dirent->name, name, namelen); ++ dirent->name[namelen] = '\0'; ++ dirent->namelen = namelen; ++ dirent->found = 1; ++ return 1; ++ } ++ return 0; ++} ++ ++static int find_linked_dentry(struct dentry *d, struct vfsmount *mnt, ++ struct inode *ino, struct cpt_context *ctx) ++{ ++ int err = -EBUSY; ++ struct file *f = NULL; ++ struct cpt_dirent entry; ++ struct dentry *de, *found = NULL; ++ ++ dprintk_ctx("deleted reference to existing inode, try to find file\n"); ++ /* 1. Try to find not deleted dentry in ino->i_dentry list */ ++ spin_lock(&dcache_lock); ++ list_for_each_entry(de, &ino->i_dentry, d_alias) { ++ if (!IS_ROOT(de) && d_unhashed(de)) ++ continue; ++ found = de; ++ dget_locked(found); ++ break; ++ } ++ spin_unlock(&dcache_lock); ++ if (found) { ++ err = cpt_dump_dentry(found, mnt, 0, ctx); ++ dput(found); ++ if (!err) { ++ dprintk_ctx("dentry found in aliases\n"); ++ return 0; ++ } ++ } ++ ++ /* 2. Try to find file in current dir */ ++ de = dget_parent(d); ++ if (!de) ++ return -EINVAL; ++ ++ mntget(mnt); ++ f = dentry_open(de, mnt, O_RDONLY); ++ if (IS_ERR(f)) ++ return PTR_ERR(f); ++ ++ entry.ino = ino->i_ino; ++ entry.name = cpt_get_buf(ctx); ++ entry.found = 0; ++ err = vfs_readdir(f, cpt_filldir, &entry); ++ if (err || !entry.found) { ++ err = err ? 
err : -ENOENT; ++ goto err_readdir; ++ } ++ ++ found = lookup_one_len(entry.name, de, entry.namelen); ++ if (IS_ERR(found)) { ++ err = PTR_ERR(found); ++ goto err_readdir; ++ } ++ ++ err = -ENOENT; ++ if (found->d_inode != ino) ++ goto err_lookup; ++ ++ dprintk_ctx("dentry found in dir\n"); ++ __cpt_release_buf(ctx); ++ err = cpt_dump_dentry(found, mnt, 0, ctx); ++ ++err_lookup: ++ dput(found); ++err_readdir: ++ fput(f); ++ __cpt_release_buf(ctx); ++ return err; ++} ++ ++static int dump_one_inode(struct file *file, struct dentry *d, ++ struct vfsmount *mnt, struct cpt_context *ctx) ++{ ++ int err = 0; ++ struct inode *ino = d->d_inode; ++ cpt_object_t *iobj; ++ int dump_it = 0; ++ ++ iobj = lookup_cpt_object(CPT_OBJ_INODE, ino, ctx); ++ if (!iobj) ++ return -EINVAL; ++ ++ if (iobj->o_pos >= 0) ++ return 0; ++ ++ if ((!IS_ROOT(d) && d_unhashed(d)) && ++ !cpt_replaced(d, mnt, ctx)) ++ dump_it = 1; ++ if (!S_ISREG(ino->i_mode) && !S_ISDIR(ino->i_mode)) { ++ if (file->f_op == &eventpoll_fops) ++ return 0; ++ dump_it = 1; ++ } ++ ++ if (!dump_it) ++ return 0; ++ ++ cpt_open_object(iobj, ctx); ++ cpt_dump_inode(d, mnt, ctx); ++ ++ if (!IS_ROOT(d) && d_unhashed(d)) { ++ struct file *parent; ++ parent = iobj->o_parent; ++ if (!parent || ++ (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) { ++ /* Inode is not deleted, but it does not ++ * have references from inside checkpointed ++ * process group. */ ++ if (ino->i_nlink != 0) { ++ err = find_linked_dentry(d, mnt, ino, ctx); ++ if (err) { ++ eprintk_ctx("deleted reference to existing inode, checkpointing is impossible: %d\n", err); ++ return -EBUSY; ++ } ++ if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode)) ++ dump_it = 0; ++ } ++ } else { ++ /* Refer to _another_ file name. */ ++ err = cpt_dump_filename(parent, 0, ctx); ++ if (err) ++ return err; ++ if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode)) ++ dump_it = 0; ++ } ++ } ++ if (dump_it) { ++ if (S_ISREG(ino->i_mode)) { ++ if ((err = dump_content_regular(file, ctx)) != 0) { ++ eprintk_ctx("dump_content_regular "); ++ cpt_printk_dentry(d, mnt); ++ } ++ } else if (S_ISDIR(ino->i_mode)) { ++ /* We cannot do anything. The directory should be ++ * empty, so it is not a big deal. 
++ */ ++ } else if (S_ISCHR(ino->i_mode)) { ++ err = dump_content_chrdev(file, ctx); ++ } else if (S_ISBLK(ino->i_mode)) { ++ err = dump_content_blkdev(file, ctx); ++ } else if (S_ISFIFO(ino->i_mode)) { ++ err = dump_content_fifo(file, ctx); ++ } else if (S_ISSOCK(ino->i_mode)) { ++ err = dump_content_socket(file, ctx); ++ } else { ++ eprintk_ctx("unknown inode mode %o, magic 0x%lx\n", ino->i_mode & S_IFMT, ino->i_sb->s_magic); ++ err = -EINVAL; ++ } ++ } ++ cpt_close_object(ctx); ++ ++ return err; ++} ++ ++int cpt_dump_files(struct cpt_context *ctx) ++{ ++ int epoll_nr, inotify_nr; ++ cpt_object_t *obj; ++ ++ cpt_open_section(ctx, CPT_SECT_TTY); ++ for_each_object(obj, CPT_OBJ_TTY) { ++ int err; ++ ++ if ((err = cpt_dump_tty(obj, ctx)) != 0) ++ return err; ++ } ++ cpt_close_section(ctx); ++ ++ cpt_open_section(ctx, CPT_SECT_INODE); ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file = obj->o_obj; ++ int err; ++ ++ if ((err = dump_one_inode(file, file->f_dentry, ++ file->f_vfsmnt, ctx)) != 0) ++ return err; ++ } ++ for_each_object(obj, CPT_OBJ_FS) { ++ struct fs_struct *fs = obj->o_obj; ++ int err; ++ ++ if (fs->root.dentry && ++ (err = dump_one_inode(NULL, fs->root.dentry, fs->root.mnt, ctx)) != 0) ++ return err; ++ if (fs->pwd.dentry && ++ (err = dump_one_inode(NULL, fs->pwd.dentry, fs->pwd.mnt, ctx)) != 0) ++ return err; ++ if (fs->altroot.dentry && ++ (err = dump_one_inode(NULL, fs->altroot.dentry, fs->altroot.mnt, ctx)) != 0) ++ return err; ++ } ++ cpt_close_section(ctx); ++ ++ epoll_nr = 0; ++ inotify_nr = 0; ++ cpt_open_section(ctx, CPT_SECT_FILES); ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file = obj->o_obj; ++ int err; ++ ++ if ((err = dump_one_file(obj, file, ctx)) != 0) ++ return err; ++ if (file->f_op == &eventpoll_fops) ++ epoll_nr++; ++ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) ++ inotify_nr++; ++ } ++ cpt_close_section(ctx); ++ ++ if (epoll_nr) { ++ cpt_open_section(ctx, CPT_SECT_EPOLL); ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file = obj->o_obj; ++ if (file->f_op == &eventpoll_fops) { ++ int err; ++ if ((err = cpt_dump_epolldev(obj, ctx)) != 0) ++ return err; ++ } ++ } ++ cpt_close_section(ctx); ++ } ++ ++ if (inotify_nr) { ++ cpt_open_section(ctx, CPT_SECT_INOTIFY); ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file = obj->o_obj; ++ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_INOTIFY) { ++ int err = -EINVAL; ++#ifdef CONFIG_INOTIFY_USER ++ if ((err = cpt_dump_inotify(obj, ctx)) != 0) ++#endif ++ return err; ++ } ++ } ++ cpt_close_section(ctx); ++ } ++ ++ cpt_open_section(ctx, CPT_SECT_SOCKET); ++ for_each_object(obj, CPT_OBJ_SOCKET) { ++ int err; ++ ++ if ((err = cpt_dump_socket(obj, obj->o_obj, obj->o_index, -1, ctx)) != 0) ++ return err; ++ } ++ cpt_close_section(ctx); ++ ++ return 0; ++} ++ ++static int dump_filedesc(int fd, struct file *file, ++ struct files_struct *f, struct cpt_context *ctx) ++{ ++ struct cpt_fd_image *v = cpt_get_buf(ctx); ++ cpt_object_t *obj; ++ ++ cpt_open_object(NULL, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_FILEDESC; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_VOID; ++ ++ v->cpt_fd = fd; ++ obj = lookup_cpt_object(CPT_OBJ_FILE, file, ctx); ++ if (!obj) BUG(); ++ v->cpt_file = obj->o_pos; ++ v->cpt_flags = 0; ++ if (FD_ISSET(fd, f->fdt->close_on_exec)) ++ v->cpt_flags = CPT_FD_FLAG_CLOSEEXEC; ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ cpt_close_object(ctx); ++ ++ return 0; ++} ++ ++static int 
dump_one_file_struct(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct files_struct *f = obj->o_obj; ++ struct cpt_files_struct_image *v = cpt_get_buf(ctx); ++ int fd; ++ loff_t saved_obj; ++ ++ cpt_open_object(obj, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_FILES; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_index = obj->o_index; ++ v->cpt_max_fds = f->fdt->max_fds; ++ v->cpt_next_fd = f->next_fd; ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ for (fd = 0; fd < f->fdt->max_fds; fd++) { ++ struct file *file = fcheck_files(f, fd); ++ if (file) ++ dump_filedesc(fd, file, f, ctx); ++ } ++ cpt_pop_object(&saved_obj, ctx); ++ ++ cpt_close_object(ctx); ++ ++ return 0; ++} ++ ++int cpt_dump_files_struct(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ cpt_open_section(ctx, CPT_SECT_FILES_STRUCT); ++ ++ for_each_object(obj, CPT_OBJ_FILES) { ++ int err; ++ ++ if ((err = dump_one_file_struct(obj, ctx)) != 0) ++ return err; ++ } ++ ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++int cpt_collect_fs(cpt_context_t * ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ struct task_struct *tsk = obj->o_obj; ++ if (tsk->fs) { ++ if (cpt_object_add(CPT_OBJ_FS, tsk->fs, ctx) == NULL) ++ return -ENOMEM; ++ if (tsk->fs->pwd.dentry && ++ cpt_object_add(CPT_OBJ_INODE, tsk->fs->pwd.dentry->d_inode, ctx) == NULL) ++ return -ENOMEM; ++ if (tsk->fs->root.dentry && ++ cpt_object_add(CPT_OBJ_INODE, tsk->fs->root.dentry->d_inode, ctx) == NULL) ++ return -ENOMEM; ++ if (tsk->fs->altroot.dentry && ++ cpt_object_add(CPT_OBJ_INODE, tsk->fs->altroot.dentry->d_inode, ctx) == NULL) ++ return -ENOMEM; ++ } ++ } ++ return 0; ++} ++ ++int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) ++{ ++ struct file file; ++ ++ memset(&file, 0, sizeof(file)); ++ ++ file.f_dentry = d; ++ file.f_vfsmnt = mnt; ++ file.f_mode = FMODE_READ|FMODE_PREAD|FMODE_LSEEK; ++ return dump_one_file(NULL, &file, ctx); ++} ++ ++static int dump_one_fs(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct fs_struct *fs = obj->o_obj; ++ struct cpt_fs_struct_image *v = cpt_get_buf(ctx); ++ loff_t saved_obj; ++ int err; ++ ++ cpt_open_object(obj, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_FS; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_umask = fs->umask; ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ err = cpt_dump_dir(fs->root.dentry, fs->root.mnt, ctx); ++ if (!err) ++ err = cpt_dump_dir(fs->pwd.dentry, fs->pwd.mnt, ctx); ++ if (!err && fs->altroot.dentry) ++ err = cpt_dump_dir(fs->altroot.dentry, fs->altroot.mnt, ctx); ++ ++ cpt_pop_object(&saved_obj, ctx); ++ ++ cpt_close_object(ctx); ++ ++ return err; ++} ++ ++int cpt_dump_fs_struct(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ cpt_open_section(ctx, CPT_SECT_FS); ++ ++ for_each_object(obj, CPT_OBJ_FS) { ++ int err; ++ ++ if ((err = dump_one_fs(obj, ctx)) != 0) ++ return err; ++ } ++ ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++static int check_one_namespace(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ int err = 0; ++ struct mnt_namespace *n = obj->o_obj; ++ struct list_head *p; ++ char *path_buf, *path; ++ ++ path_buf = (char *) __get_free_page(GFP_KERNEL); ++ if (!path_buf) ++ return -ENOMEM; ++ ++ down_read(&namespace_sem); ++ list_for_each(p, &n->list) { ++ struct path pt; ++ 
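
dump_one_file_struct() and dump_filedesc() above walk the fd table up to max_fds and record the close-on-exec bit per descriptor. The same per-fd state can be enumerated from userspace; an illustrative sketch (the cap of 64 stands in for f->fdt->max_fds and is arbitrary):

#include <stdio.h>
#include <fcntl.h>

/* Walk the first 'max' descriptors, mirroring the max_fds loop above. */
static void list_fds(int max)
{
	int fd;

	for (fd = 0; fd < max; fd++) {
		int flags = fcntl(fd, F_GETFD);

		if (flags < 0)
			continue;	/* fd not open, like fcheck_files() == NULL */
		printf("fd %d%s\n", fd,
		       (flags & FD_CLOEXEC) ? " (close-on-exec)" : "");
	}
}

int main(void)
{
	list_fds(64);
	return 0;
}
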
struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list); ++ ++ pt.dentry = mnt->mnt_root; ++ pt.mnt = mnt; ++ path = d_path(&pt, path_buf, PAGE_SIZE); ++ if (IS_ERR(path)) ++ continue; ++ ++ if (check_one_vfsmount(mnt)) { ++ eprintk_ctx("unsupported fs type %s\n", mnt->mnt_sb->s_type->name); ++ err = -EINVAL; ++ break; ++ } ++ } ++ up_read(&namespace_sem); ++ ++ free_page((unsigned long) path_buf); ++ ++ return err; ++} ++ ++int cpt_collect_namespace(cpt_context_t * ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ struct task_struct *tsk = obj->o_obj; ++ if (tsk->nsproxy && tsk->nsproxy->mnt_ns && ++ cpt_object_add(CPT_OBJ_NAMESPACE, ++ tsk->nsproxy->mnt_ns, ctx) == NULL) ++ return -ENOMEM; ++ } ++ ++ for_each_object(obj, CPT_OBJ_NAMESPACE) { ++ int err; ++ if ((err = check_one_namespace(obj, ctx)) != 0) ++ return err; ++ } ++ ++ return 0; ++} ++ ++struct args_t ++{ ++ int* pfd; ++ char* path; ++}; ++ ++static int dumptmpfs(void *arg) ++{ ++ int i; ++ struct args_t *args = arg; ++ int *pfd = args->pfd; ++ int fd0, fd2; ++ char *path = args->path; ++ char *argv[] = { "tar", "-c", "-S", "--numeric-owner", path, NULL }; ++ ++ i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0); ++ if (i < 0) { ++ eprintk("cannot enter ve to dump tmpfs\n"); ++ module_put(THIS_MODULE); ++ return 255 << 8; ++ } ++ ++ if (pfd[1] != 1) ++ sc_dup2(pfd[1], 1); ++ set_fs(KERNEL_DS); ++ fd0 = sc_open("/dev/null", O_RDONLY, 0); ++ fd2 = sc_open("/dev/null", O_WRONLY, 0); ++ if (fd0 < 0 || fd2 < 0) { ++ eprintk("can not open /dev/null for tar: %d %d\n", fd0, fd2); ++ module_put(THIS_MODULE); ++ return 255 << 8; ++ } ++ if (fd0 != 0) ++ sc_dup2(fd0, 0); ++ if (fd2 != 2) ++ sc_dup2(fd2, 2); ++ ++ for (i = 3; i < current->files->fdt->max_fds; i++) { ++ sc_close(i); ++ } ++ ++ module_put(THIS_MODULE); ++ ++ i = sc_execve("/bin/tar", argv, NULL); ++ eprintk("failed to exec /bin/tar: %d\n", i); ++ return 255 << 8; ++} ++ ++static int cpt_dump_tmpfs(char *path, struct cpt_context *ctx) ++{ ++ int err; ++ int pid; ++ int pfd[2]; ++ struct file *f; ++ struct cpt_object_hdr v; ++ char buf[16]; ++ int n; ++ loff_t saved_obj; ++ struct args_t args; ++ int status; ++ mm_segment_t oldfs; ++ sigset_t ignore, blocked; ++ ++ err = sc_pipe(pfd); ++ if (err < 0) ++ return err; ++ args.pfd = pfd; ++ args.path = path; ++ ignore.sig[0] = CPT_SIG_IGNORE_MASK; ++ sigprocmask(SIG_BLOCK, &ignore, &blocked); ++ err = pid = local_kernel_thread(dumptmpfs, (void*)&args, ++ SIGCHLD | CLONE_VFORK, 0); ++ if (err < 0) { ++ eprintk_ctx("tmpfs local_kernel_thread: %d\n", err); ++ goto out; ++ } ++ f = fget(pfd[0]); ++ sc_close(pfd[1]); ++ sc_close(pfd[0]); ++ ++ cpt_push_object(&saved_obj, ctx); ++ cpt_open_object(NULL, ctx); ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_NAME; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_NAME; ++ ++ ctx->write(&v, sizeof(v), ctx); ++ ++ do { ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos); ++ set_fs(oldfs); ++ if (n > 0) ++ ctx->write(buf, n, ctx); ++ } while (n > 0); ++ ++ fput(f); ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ if ((err = sc_waitx(pid, 0, &status)) < 0) ++ eprintk_ctx("wait4: %d\n", err); ++ else if ((status & 0x7f) == 0) { ++ err = (status & 0xff00) >> 8; ++ if (err != 0) { ++ eprintk_ctx("tar exited with %d\n", err); ++ err = -EINVAL; ++ } ++ } else { ++ eprintk_ctx("tar terminated\n"); ++ err = -EINVAL; ++ } ++ set_fs(oldfs); ++ sigprocmask(SIG_SETMASK, &blocked, NULL); ++ ++ 
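
cpt_dump_tmpfs()/dumptmpfs() above implement, with in-kernel sc_* syscall wrappers and a kernel thread, what is essentially a pipe-plus-exec of tar. A userspace sketch of the same shape (the path /tmp is chosen arbitrarily; WIFEXITED/WEXITSTATUS are the portable spelling of the (status & 0x7f) == 0 and (status & 0xff00) >> 8 tests used around sc_waitx() above):

#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

/* Userspace analog of dumptmpfs(): run "tar -c -S --numeric-owner <path>"
 * with stdout redirected into a pipe that the parent drains. */
int main(void)
{
	int pfd[2];
	int status;
	pid_t pid;
	char buf[4096];
	ssize_t n;

	if (pipe(pfd) < 0)
		return 1;
	pid = fork();
	if (pid == 0) {
		char *argv[] = { "tar", "-c", "-S", "--numeric-owner", "/tmp", NULL };

		dup2(pfd[1], 1);	/* like sc_dup2(pfd[1], 1) above */
		close(pfd[0]);
		close(pfd[1]);
		execvp("tar", argv);
		_exit(255);		/* exec failed, cf. "failed to exec /bin/tar" */
	}
	close(pfd[1]);
	while ((n = read(pfd[0], buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);	/* ctx->write() in the patch */
	close(pfd[0]);
	if (waitpid(pid, &status, 0) > 0 && WIFEXITED(status))
		return WEXITSTATUS(status);
	return 1;
}
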
buf[0] = 0; ++ ctx->write(buf, 1, ctx); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ return n ? : err; ++ ++out: ++ if (pfd[1] >= 0) ++ sc_close(pfd[1]); ++ if (pfd[0] >= 0) ++ sc_close(pfd[0]); ++ sigprocmask(SIG_SETMASK, &blocked, NULL); ++ return err; ++} ++ ++static int loopy_root(struct vfsmount *mnt) ++{ ++ struct list_head *p; ++ ++ list_for_each(p, &mnt->mnt_ns->list) { ++ struct vfsmount * m = list_entry(p, struct vfsmount, mnt_list); ++ if (m == mnt) ++ return 0; ++ if (m->mnt_sb == mnt->mnt_sb) ++ return 1; ++ } ++ /* Cannot happen */ ++ return 0; ++} ++ ++static int cpt_dump_bind_mnt(struct vfsmount * mnt, cpt_context_t * ctx) ++{ ++ struct list_head *p; ++ int err = -EINVAL; ++ ++ /* One special case: mount --bind /a /a */ ++ if (mnt->mnt_root == mnt->mnt_mountpoint) ++ return cpt_dump_dentry(mnt->mnt_root, mnt, 0, ctx); ++ ++ list_for_each_prev(p, &mnt->mnt_list) { ++ struct vfsmount * m; ++ ++ if (p == &mnt->mnt_ns->list) ++ break; ++ ++ m = list_entry(p, struct vfsmount, mnt_list); ++ ++ if (m->mnt_sb != mnt->mnt_sb) ++ continue; ++ ++ err = cpt_dump_dentry(mnt->mnt_root, m, 0, ctx); ++ if (err == 0) ++ break; ++ } ++ return err; ++} ++ ++static int dump_vfsmount(struct vfsmount *mnt, struct cpt_context *ctx) ++{ ++ int err = 0; ++ struct cpt_vfsmount_image v; ++ loff_t saved_obj; ++ char *path_buf, *path; ++ struct path p; ++ ++ path_buf = (char *) __get_free_page(GFP_KERNEL); ++ if (!path_buf) ++ return -ENOMEM; ++ ++ p.dentry = mnt->mnt_root; ++ p.mnt = mnt; ++ path = d_path(&p, path_buf, PAGE_SIZE); ++ if (IS_ERR(path)) { ++ free_page((unsigned long) path_buf); ++ return PTR_ERR(path) == -EINVAL ? 0 : PTR_ERR(path); ++ } ++ ++ cpt_open_object(NULL, ctx); ++ ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_VFSMOUNT; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_ARRAY; ++ ++ v.cpt_mntflags = mnt->mnt_flags; ++ if (top_beancounter(slab_ub(mnt)) != top_beancounter(get_exec_ub())) { ++ v.cpt_mntflags |= CPT_MNT_EXT; ++ } else { ++ if (mnt->mnt_root != mnt->mnt_sb->s_root || loopy_root(mnt)) ++ v.cpt_mntflags |= CPT_MNT_BIND; ++ } ++ v.cpt_flags = mnt->mnt_sb->s_flags; ++ ++ ctx->write(&v, sizeof(v), ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ cpt_dump_string(mnt->mnt_devname ? 
: "none", ctx); ++ cpt_dump_string(path, ctx); ++ cpt_dump_string(mnt->mnt_sb->s_type->name, ctx); ++ ++ if (v.cpt_mntflags & CPT_MNT_BIND) ++ err = cpt_dump_bind_mnt(mnt, ctx); ++ else if (!(v.cpt_mntflags & CPT_MNT_EXT) && ++ strcmp(mnt->mnt_sb->s_type->name, "tmpfs") == 0) { ++ mntget(mnt); ++ up_read(&namespace_sem); ++ err = cpt_dump_tmpfs(path, ctx); ++ down_read(&namespace_sem); ++ if (!err) { ++ if (list_empty(&mnt->mnt_list)) ++ err = -EBUSY; ++ } ++ mntput(mnt); ++ } ++ ++ cpt_pop_object(&saved_obj, ctx); ++ ++ cpt_close_object(ctx); ++ if (!err && mnt->mnt_sb->s_magic == FSMAGIC_VEFS) ++ vefs_track_force_stop(mnt->mnt_sb); ++ ++ free_page((unsigned long) path_buf); ++ ++ return err; ++} ++ ++static int dump_one_namespace(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct mnt_namespace *n = obj->o_obj; ++ struct cpt_object_hdr v; ++ struct list_head *p; ++ loff_t saved_obj; ++ int err = 0; ++ ++ cpt_open_object(obj, ctx); ++ ++ v.cpt_next = -1; ++ v.cpt_object = CPT_OBJ_NAMESPACE; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_ARRAY; ++ ++ ctx->write(&v, sizeof(v), ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ ++ down_read(&namespace_sem); ++ list_for_each(p, &n->list) { ++ err = dump_vfsmount(list_entry(p, struct vfsmount, mnt_list), ctx); ++ if (err) ++ break; ++ } ++ up_read(&namespace_sem); ++ ++ cpt_pop_object(&saved_obj, ctx); ++ ++ cpt_close_object(ctx); ++ ++ return err; ++} ++ ++int cpt_dump_namespace(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ cpt_open_section(ctx, CPT_SECT_NAMESPACE); ++ ++ for_each_object(obj, CPT_OBJ_NAMESPACE) { ++ int err; ++ ++ if ((err = dump_one_namespace(obj, ctx)) != 0) ++ return err; ++ } ++ ++ cpt_close_section(ctx); ++ return 0; ++} +diff --git a/kernel/cpt/cpt_files.h b/kernel/cpt/cpt_files.h +new file mode 100644 +index 0000000..7770ab2 +--- /dev/null ++++ b/kernel/cpt/cpt_files.h +@@ -0,0 +1,71 @@ ++int cpt_collect_files(cpt_context_t *); ++int cpt_collect_fs(cpt_context_t *); ++int cpt_collect_namespace(cpt_context_t *); ++int cpt_collect_sysvsem_undo(cpt_context_t *); ++int cpt_collect_tty(struct file *, cpt_context_t *); ++int cpt_dump_files(struct cpt_context *ctx); ++int cpt_dump_files_struct(struct cpt_context *ctx); ++int cpt_dump_fs_struct(struct cpt_context *ctx); ++int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx); ++int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx); ++int cpt_dump_tty(cpt_object_t *, struct cpt_context *ctx); ++struct file * rst_sysv_shm_vma(struct cpt_vma_image *vmai, struct cpt_context *ctx); ++struct file * rst_sysv_shm_itself(loff_t pos, struct cpt_context *ctx); ++struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, unsigned flags, struct cpt_context *ctx); ++__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx); ++ ++int rst_posix_locks(struct cpt_context *ctx); ++ ++struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx); ++int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx); ++__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx); ++int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx); ++int rst_restore_fs(struct cpt_context *ctx); ++ ++int cpt_collect_sysv(cpt_context_t *); ++int cpt_dump_sysvsem(struct cpt_context *ctx); ++int cpt_dump_sysvmsg(struct cpt_context *ctx); ++int rst_sysv_ipc(struct cpt_context *ctx); ++int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx); ++__u32 
rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx); ++ ++int cpt_dump_namespace(struct cpt_context *ctx); ++int rst_root_namespace(struct cpt_context *ctx); ++ ++int rst_stray_files(struct cpt_context *ctx); ++int rst_tty_jobcontrol(struct cpt_context *ctx); ++ ++void rst_flush_filejobs(struct cpt_context *); ++int rst_do_filejobs(struct cpt_context *); ++ ++extern struct file_operations eventpoll_fops; ++int rst_eventpoll(struct cpt_context *); ++struct file *cpt_open_epolldev(struct cpt_file_image *fi, ++ unsigned flags, ++ struct cpt_context *ctx); ++int cpt_dump_epolldev(cpt_object_t *obj, struct cpt_context *); ++ ++int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx); ++int cpt_get_dentry(struct dentry **dp, struct vfsmount **mp, ++ loff_t *pos, struct cpt_context *ctx); ++ ++int cpt_dump_inotify(cpt_object_t *obj, cpt_context_t *ctx); ++int rst_inotify(cpt_context_t *ctx); ++struct file *rst_open_inotify(struct cpt_file_image *fi, ++ unsigned flags, ++ struct cpt_context *ctx); ++ ++ ++int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, ++ cpt_context_t *ctx); ++ ++#define check_one_vfsmount(mnt) \ ++ (strcmp(mnt->mnt_sb->s_type->name, "rootfs") != 0 && \ ++ strcmp(mnt->mnt_sb->s_type->name, "ext3") != 0 && \ ++ strcmp(mnt->mnt_sb->s_type->name, "ext2") != 0 && \ ++ strcmp(mnt->mnt_sb->s_type->name, "simfs") != 0 && \ ++ strcmp(mnt->mnt_sb->s_type->name, "unionfs") != 0 && \ ++ strcmp(mnt->mnt_sb->s_type->name, "tmpfs") != 0 && \ ++ strcmp(mnt->mnt_sb->s_type->name, "devpts") != 0 && \ ++ strcmp(mnt->mnt_sb->s_type->name, "proc") != 0 && \ ++ strcmp(mnt->mnt_sb->s_type->name, "sysfs") != 0) +diff --git a/kernel/cpt/cpt_fsmagic.h b/kernel/cpt/cpt_fsmagic.h +new file mode 100644 +index 0000000..142e539 +--- /dev/null ++++ b/kernel/cpt/cpt_fsmagic.h +@@ -0,0 +1,16 @@ ++/* Collected from kernel sources. */ ++ ++#define FSMAGIC_TMPFS 0x01021994 ++#define FSMAGIC_PIPEFS 0x50495045 ++#define FSMAGIC_SOCKFS 0x534F434B ++#define FSMAGIC_PFMFS 0xa0b4d889 ++#define FSMAGIC_BDEV 0x62646576 ++#define FSMAGIC_FUTEX 0x0BAD1DEA ++#define FSMAGIC_INOTIFY 0x2BAD1DEA ++#define FSMAGIC_MQUEUE 0x19800202 ++#define FSMAGIC_PROC 0x9fa0 ++#define FSMAGIC_DEVPTS 0x1CD1 ++#define FSMAGIC_AUTOFS 0x0187 ++#define FSMAGIC_EXT2 0xEF53 ++#define FSMAGIC_REISER 0x52654973 ++#define FSMAGIC_VEFS 0x565a4653 +diff --git a/kernel/cpt/cpt_inotify.c b/kernel/cpt/cpt_inotify.c +new file mode 100644 +index 0000000..4d4637e +--- /dev/null ++++ b/kernel/cpt/cpt_inotify.c +@@ -0,0 +1,144 @@ ++/* ++ * ++ * kernel/cpt/cpt_inotify.c ++ * ++ * Copyright (C) 2000-2007 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
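
The FSMAGIC_* constants in cpt_fsmagic.h above are superblock magic numbers, and userspace can observe the same values through statfs(2)'s f_type field. A small illustrative classifier (constants copied from the header above; not part of the patch):

#include <stdio.h>
#include <sys/vfs.h>

#define FSMAGIC_TMPFS	0x01021994
#define FSMAGIC_PROC	0x9fa0
#define FSMAGIC_EXT2	0xEF53	/* shared by ext2 and ext3 */

int main(int argc, char **argv)
{
	struct statfs s;

	if (argc != 2 || statfs(argv[1], &s) < 0) {
		fprintf(stderr, "usage: %s path\n", argv[0]);
		return 1;
	}
	switch (s.f_type) {
	case FSMAGIC_TMPFS: puts("tmpfs"); break;
	case FSMAGIC_PROC:  puts("proc");  break;
	case FSMAGIC_EXT2:  puts("ext2/ext3"); break;
	default: printf("0x%lx\n", (unsigned long) s.f_type);
	}
	return 0;
}
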
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_files.h" ++#include "cpt_kernel.h" ++#include "cpt_fsmagic.h" ++#include "cpt_syscalls.h" ++ ++extern struct file_operations inotify_fops; ++ ++int cpt_dump_inotify(cpt_object_t *obj, cpt_context_t *ctx) ++{ ++ int err = 0; ++ struct file *file = obj->o_obj; ++ struct inotify_device *dev; ++ struct inotify_watch *watch; ++ struct inotify_kernel_event *kev; ++ struct cpt_inotify_image ii; ++ ++ if (file->f_op != &inotify_fops) { ++ eprintk_ctx("bad inotify file\n"); ++ return -EINVAL; ++ } ++ ++ dev = file->private_data; ++ ++ /* inotify_user.c does not protect open /proc/N/fd, silly. ++ * Opener will get an invalid file with uninitialized private_data ++ */ ++ if (unlikely(dev == NULL)) { ++ eprintk_ctx("bad inotify dev\n"); ++ return -EINVAL; ++ } ++ ++ cpt_open_object(NULL, ctx); ++ ++ ii.cpt_next = CPT_NULL; ++ ii.cpt_object = CPT_OBJ_INOTIFY; ++ ii.cpt_hdrlen = sizeof(ii); ++ ii.cpt_content = CPT_CONTENT_ARRAY; ++ ii.cpt_file = obj->o_pos; ++ ii.cpt_user = dev->user->uid; ++ ii.cpt_max_events = dev->max_events; ++ ii.cpt_last_wd = dev->ih->last_wd; ++ ++ ctx->write(&ii, sizeof(ii), ctx); ++ ++ mutex_lock(&dev->ih->mutex); ++ list_for_each_entry(watch, &dev->ih->watches, h_list) { ++ loff_t saved_obj; ++ loff_t saved_obj2; ++ struct cpt_inotify_wd_image wi; ++ ++ cpt_push_object(&saved_obj, ctx); ++ cpt_open_object(NULL, ctx); ++ ++ wi.cpt_next = CPT_NULL; ++ wi.cpt_object = CPT_OBJ_INOTIFY_WATCH; ++ wi.cpt_hdrlen = sizeof(wi); ++ wi.cpt_content = CPT_CONTENT_ARRAY; ++ wi.cpt_wd = watch->wd; ++ wi.cpt_mask = watch->mask; ++ ++ ctx->write(&wi, sizeof(wi), ctx); ++ ++ cpt_push_object(&saved_obj2, ctx); ++ err = cpt_dump_dir(watch->path.dentry, watch->path.mnt, ctx); ++ cpt_pop_object(&saved_obj2, ctx); ++ if (err) ++ break; ++ ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ mutex_unlock(&dev->ih->mutex); ++ ++ if (err) ++ return err; ++ ++ mutex_lock(&dev->ev_mutex); ++ list_for_each_entry(kev, &dev->events, list) { ++ loff_t saved_obj; ++ struct cpt_inotify_ev_image ei; ++ ++ cpt_push_object(&saved_obj, ctx); ++ cpt_open_object(NULL, ctx); ++ ++ ei.cpt_next = CPT_NULL; ++ ei.cpt_object = CPT_OBJ_INOTIFY_EVENT; ++ ei.cpt_hdrlen = sizeof(ei); ++ ei.cpt_content = CPT_CONTENT_NAME; ++ ei.cpt_wd = kev->event.wd; ++ ei.cpt_mask = kev->event.mask; ++ ei.cpt_cookie = kev->event.cookie; ++ ei.cpt_namelen = kev->name ? strlen(kev->name) : 0; ++ ++ ctx->write(&ei, sizeof(ei), ctx); ++ ++ if (kev->name) { ++ ctx->write(kev->name, ei.cpt_namelen+1, ctx); ++ ctx->align(ctx); ++ } ++ ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ mutex_unlock(&dev->ev_mutex); ++ ++ cpt_close_object(ctx); ++ ++ return err; ++} +diff --git a/kernel/cpt/cpt_kernel.c b/kernel/cpt/cpt_kernel.c +new file mode 100644 +index 0000000..5eb7f1c +--- /dev/null ++++ b/kernel/cpt/cpt_kernel.c +@@ -0,0 +1,177 @@ ++/* ++ * ++ * kernel/cpt/cpt_kernel.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
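
cpt_dump_inotify() above serializes one record per watch (wd, mask, plus the watched path) and one per queued event (wd, mask, cookie, name). Userspace sees exactly these fields when reading an inotify descriptor; a self-contained sketch (watching /tmp is an arbitrary choice; the read() blocks until an event arrives):

#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096];
	int fd = inotify_init();
	int wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);
	ssize_t n, off;

	if (fd < 0 || wd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf));	/* blocks for one event batch */
	for (off = 0; off < n; ) {
		struct inotify_event *ev = (struct inotify_event *)(buf + off);

		/* The same quadruple cpt_dump_inotify() writes per event. */
		printf("wd=%d mask=%#x cookie=%u name=%s\n",
		       ev->wd, ev->mask, ev->cookie, ev->len ? ev->name : "");
		off += sizeof(*ev) + ev->len;
	}
	inotify_rm_watch(fd, wd);
	close(fd);
	return 0;
}
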
++ * ++ */ ++ ++#define __KERNEL_SYSCALLS__ 1 ++ ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_X86 ++#include ++#endif ++#include ++ ++#include "cpt_kernel.h" ++#include "cpt_syscalls.h" ++ ++int debug_level = 1; ++ ++#ifdef CONFIG_X86_32 ++ ++/* ++ * Create a kernel thread ++ */ ++extern void kernel_thread_helper(void); ++int asm_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid) ++{ ++ struct pt_regs regs; ++ ++ memset(®s, 0, sizeof(regs)); ++ ++ regs.bx = (unsigned long) fn; ++ regs.dx = (unsigned long) arg; ++ ++ regs.ds = __USER_DS; ++ regs.es = __USER_DS; ++ regs.fs = __KERNEL_PERCPU; ++ regs.orig_ax = -1; ++ regs.ip = (unsigned long) kernel_thread_helper; ++ regs.cs = __KERNEL_CS | get_kernel_rpl(); ++ regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; ++ ++ /* Ok, create the new process.. */ ++ return do_fork_pid(flags | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL, pid); ++} ++#endif ++ ++#ifdef CONFIG_IA64 ++pid_t ++asm_kernel_thread (int (*fn)(void *), void *arg, unsigned long flags, pid_t pid) ++{ ++ extern void start_kernel_thread (void); ++ unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; ++ struct { ++ struct switch_stack sw; ++ struct pt_regs pt; ++ } regs; ++ ++ memset(®s, 0, sizeof(regs)); ++ regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ ++ regs.pt.r1 = helper_fptr[1]; /* set GP */ ++ regs.pt.r9 = (unsigned long) fn; /* 1st argument */ ++ regs.pt.r11 = (unsigned long) arg; /* 2nd argument */ ++ /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */ ++ regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; ++ regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */ ++ regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR); ++ regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET; ++ regs.sw.pr = (1 << 2 /*PRED_KERNEL_STACK*/); ++ return do_fork_pid(flags | CLONE_UNTRACED, 0, ®s.pt, 0, NULL, NULL, pid); ++} ++#endif ++ ++int local_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid) ++{ ++ pid_t ret; ++ ++ if (current->fs == NULL) { ++ /* do_fork_pid() hates processes without fs, oopses. 
*/ ++ printk("CPT BUG: local_kernel_thread: current->fs==NULL\n"); ++ return -EINVAL; ++ } ++ if (!try_module_get(THIS_MODULE)) ++ return -EBUSY; ++ ret = asm_kernel_thread(fn, arg, flags, pid); ++ if (ret < 0) ++ module_put(THIS_MODULE); ++ return ret; ++} ++ ++#ifdef __i386__ ++int __execve(const char *file, char **argv, char **envp) ++{ ++ long res; ++ __asm__ volatile ("int $0x80" ++ : "=a" (res) ++ : "0" (__NR_execve),"b" ((long)(file)),"c" ((long)(argv)), ++ "d" ((long)(envp)) : "memory"); ++ return (int)res; ++} ++#endif ++ ++int sc_execve(char *cmd, char **argv, char **env) ++{ ++ int ret; ++#ifndef __i386__ ++ ret = kernel_execve(cmd, argv, env); ++#else ++ ret = __execve(cmd, argv, env); ++#endif ++ return ret; ++} ++ ++unsigned int test_cpu_caps(void) ++{ ++ unsigned int flags = 0; ++ ++#ifdef CONFIG_X86 ++ if (boot_cpu_has(X86_FEATURE_CMOV)) ++ flags |= 1 << CPT_CPU_X86_CMOV; ++ if (cpu_has_fxsr) ++ flags |= 1 << CPT_CPU_X86_FXSR; ++ if (cpu_has_xmm) ++ flags |= 1 << CPT_CPU_X86_SSE; ++#ifndef CONFIG_X86_64 ++ if (cpu_has_xmm2) ++#endif ++ flags |= 1 << CPT_CPU_X86_SSE2; ++ if (cpu_has_mmx) ++ flags |= 1 << CPT_CPU_X86_MMX; ++ if (boot_cpu_has(X86_FEATURE_3DNOW)) ++ flags |= 1 << CPT_CPU_X86_3DNOW; ++ if (boot_cpu_has(X86_FEATURE_3DNOWEXT)) ++ flags |= 1 << CPT_CPU_X86_3DNOW2; ++ if (boot_cpu_has(X86_FEATURE_SYSCALL)) ++ flags |= 1 << CPT_CPU_X86_SYSCALL; ++#ifdef CONFIG_X86_64 ++ if (boot_cpu_has(X86_FEATURE_SYSCALL) && ++ boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ++ flags |= 1 << CPT_CPU_X86_SYSCALL32; ++#endif ++ if (boot_cpu_has(X86_FEATURE_SEP) ++#ifdef CONFIG_X86_64 ++ && boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ++#endif ++ ) ++ flags |= ((1 << CPT_CPU_X86_SEP) | (1 << CPT_CPU_X86_SEP32)); ++#ifdef CONFIG_X86_64 ++ flags |= 1 << CPT_CPU_X86_EMT64; ++#endif ++#endif ++#ifdef CONFIG_IA64 ++ flags |= 1 << CPT_CPU_X86_IA64; ++ flags |= 1 << CPT_CPU_X86_FXSR; ++#endif ++ return flags; ++} ++ ++unsigned int test_kernel_config(void) ++{ ++ unsigned int flags = 0; ++#ifdef CONFIG_X86 ++#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) ++ flags |= 1 << CPT_KERNEL_CONFIG_PAE; ++#endif ++#endif ++ return flags; ++} +diff --git a/kernel/cpt/cpt_kernel.h b/kernel/cpt/cpt_kernel.h +new file mode 100644 +index 0000000..9254778 +--- /dev/null ++++ b/kernel/cpt/cpt_kernel.h +@@ -0,0 +1,99 @@ ++/* Interface to kernel vars which we had to _add_. 
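
test_cpu_caps()/test_kernel_config() in cpt_kernel.c above reduce the source host to a feature bitmask; on restore, the test_one_flag macros that follow refuse migration when the destination lacks a bit the source used. A standalone sketch of that check (the bit positions in the enum are invented for the example; the real CPT_CPU_X86_* values come from the patch headers):

#include <stdio.h>

enum { CPT_CPU_X86_CMOV, CPT_CPU_X86_FXSR, CPT_CPU_X86_SSE, CPT_CPU_X86_SSE2 };

/* Returns nonzero if the source image used a capability the
 * destination cannot provide, in which case restore must refuse. */
static int caps_incompatible(unsigned int src, unsigned int dst)
{
	unsigned int missing = src & ~dst;

	if (missing & (1 << CPT_CPU_X86_SSE2))
		fprintf(stderr, "Destination cpu does not have SSE2\n");
	return missing != 0;
}

int main(void)
{
	unsigned int src = (1 << CPT_CPU_X86_SSE) | (1 << CPT_CPU_X86_SSE2);
	unsigned int dst = (1 << CPT_CPU_X86_SSE);

	return caps_incompatible(src, dst);	/* exits 1: SSE2 missing */
}
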
*/ ++ ++#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) ++#define TASK_TRACED TASK_STOPPED ++#define unix_peer(sk) ((sk)->sk_pair) ++#define page_mapcount(pg) ((pg)->mapcount) ++#else ++#define unix_peer(sk) (unix_sk(sk)->peer) ++#endif ++ ++#ifdef CONFIG_IA64 ++#define cpu_has_fxsr 1 ++#endif ++ ++#define CPT_SIG_IGNORE_MASK (\ ++ (1 << (SIGCONT - 1)) | (1 << (SIGCHLD - 1)) | \ ++ (1 << (SIGWINCH - 1)) | (1 << (SIGURG - 1))) ++ ++static inline void do_gettimespec(struct timespec *ts) ++{ ++ struct timeval tv; ++ do_gettimeofday(&tv); ++ ts->tv_sec = tv.tv_sec; ++ ts->tv_nsec = tv.tv_usec*1000; ++} ++ ++int local_kernel_thread(int (*fn)(void *), ++ void * arg, ++ unsigned long flags, ++ pid_t pid); ++int asm_kernel_thread(int (*fn)(void *), ++ void * arg, ++ unsigned long flags, ++ pid_t pid); ++ ++#if defined(CONFIG_VZFS_FS) || defined(CONFIG_VZFS_FS_MODULE) ++void vefs_track_force_stop(struct super_block *super); ++ ++void vefs_track_notify(struct dentry *vdentry, int track_cow); ++ ++struct dentry * vefs_replaced_dentry(struct dentry *de); ++int vefs_is_renamed_dentry(struct dentry *vde, struct dentry *pde); ++#else ++static inline void vefs_track_force_stop(struct super_block *super) { }; ++ ++static inline void vefs_track_notify(struct dentry *vdentry, int track_cow) { }; ++#endif ++ ++unsigned int test_cpu_caps(void); ++unsigned int test_kernel_config(void); ++ ++#define test_one_flag_old(src, dst, flag, message, ret) \ ++if (src & (1 << flag)) \ ++ if (!(dst & (1 << flag))) { \ ++ wprintk("Destination cpu does not have " message "\n"); \ ++ ret = 1; \ ++ } ++#define test_one_flag(src, dst, flag, message, ret) \ ++if (src & (1 << flag)) \ ++ if (!(dst & (1 << flag))) { \ ++ eprintk_ctx("Destination cpu does not have " message "\n"); \ ++ ret = 1; \ ++ } ++ ++static inline void ++_set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) ++{ ++ while (nsec >= NSEC_PER_SEC) { ++ nsec -= NSEC_PER_SEC; ++ ++sec; ++ } ++ while (nsec < 0) { ++ nsec += NSEC_PER_SEC; ++ --sec; ++ } ++ ts->tv_sec = sec; ++ ts->tv_nsec = nsec; ++} ++ ++static inline struct timespec ++_ns_to_timespec(const s64 nsec) ++{ ++ struct timespec ts; ++ s32 rem; ++ ++ if (!nsec) ++ return (struct timespec) {0, 0}; ++ ++ ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); ++ if (unlikely(rem < 0)) { ++ ts.tv_sec--; ++ rem += NSEC_PER_SEC; ++ } ++ ts.tv_nsec = rem; ++ ++ return ts; ++} +diff --git a/kernel/cpt/cpt_mm.c b/kernel/cpt/cpt_mm.c +new file mode 100644 +index 0000000..a3d8c8e +--- /dev/null ++++ b/kernel/cpt/cpt_mm.c +@@ -0,0 +1,918 @@ ++/* ++ * ++ * kernel/cpt/cpt_mm.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
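
_ns_to_timespec() above normalizes a signed nanosecond count so that the remainder is always non-negative. The same arithmetic in plain C, with div_s64_rem() replaced by ordinary division (illustrative only):

#include <stdio.h>

#define NSEC_PER_SEC 1000000000LL

/* Split a signed nanosecond count into seconds plus a remainder
 * in [0, NSEC_PER_SEC), as _ns_to_timespec() does. */
static void ns_to_ts(long long nsec, long long *sec, long *rem)
{
	*sec = nsec / NSEC_PER_SEC;
	*rem = nsec % NSEC_PER_SEC;
	if (*rem < 0) {		/* C99 division truncates toward zero */
		(*sec)--;
		*rem += NSEC_PER_SEC;
	}
}

int main(void)
{
	long long sec;
	long rem;

	ns_to_ts(-1500000000LL, &sec, &rem);
	printf("%lld s + %ld ns\n", sec, rem);	/* -2 s + 500000000 ns */
	return 0;
}
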
++ *
++ */
++
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#ifdef CONFIG_X86
++#include 
++#endif
++#include 
++#include 
++#include 
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_kernel.h"
++#include "cpt_fsmagic.h"
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++#include "cpt_pagein.h"
++#endif
++#include "cpt_ubc.h"
++
++static int collect_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx,
++ cpt_context_t *ctx)
++{
++ if (!list_empty(&aio_ctx->run_list)) {
++ /* This is impossible at least with kernel 2.6.8.1 or 2.6.16 */
++ eprintk_ctx("run list is not empty, cannot suspend AIO\n");
++ return -EBUSY;
++ }
++
++ /* Wait for pending IOCBs. Linux AIO is mostly _fake_.
++ * It is actually synchronous, except for direct IO and
++ * some funny raw USB things, which cannot happen inside VE.
++ * However, we do this for the future.
++ *
++ * Later note: in 2.6.16 we may allow O_DIRECT, so that
++ * it is not meaningless code.
++ */
++ wait_for_all_aios(aio_ctx);
++
++ if (!list_empty(&aio_ctx->run_list) ||
++ !list_empty(&aio_ctx->active_reqs) ||
++ aio_ctx->reqs_active) {
++ eprintk_ctx("were not able to suspend AIO\n");
++ return -EBUSY;
++ }
++
++ return 0;
++}
++
++static int collect_one_mm(struct mm_struct *mm, cpt_context_t * ctx)
++{
++ struct vm_area_struct *vma;
++
++ for (vma = mm->mmap; vma; vma = vma->vm_next) {
++ if (vma->vm_file) {
++ if (cpt_object_add(CPT_OBJ_FILE, vma->vm_file, ctx) == NULL)
++ return -ENOMEM;
++ }
++ }
++#ifdef CONFIG_BEANCOUNTERS
++ if (cpt_add_ubc(mm->mm_ub, ctx) == NULL)
++ return -ENOMEM;
++#endif
++
++ if (mm->ioctx_list) {
++ struct kioctx *aio_ctx;
++ int err;
++
++ for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next)
++ if ((err = collect_one_aio_ctx(mm, aio_ctx, ctx)) != 0)
++ return err;
++ }
++
++ return 0;
++}
++
++int cpt_collect_mm(cpt_context_t * ctx)
++{
++ cpt_object_t *obj;
++ int err;
++ int index;
++
++ for_each_object(obj, CPT_OBJ_TASK) {
++ struct task_struct *tsk = obj->o_obj;
++ if (tsk->mm && cpt_object_add(CPT_OBJ_MM, tsk->mm, ctx) == NULL)
++ return -ENOMEM;
++ }
++
++ index = 1;
++ for_each_object(obj, CPT_OBJ_MM) {
++ struct mm_struct *mm = obj->o_obj;
++ if (obj->o_count != atomic_read(&mm->mm_users)) {
++ eprintk_ctx("mm_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&mm->mm_users));
++ return -EAGAIN;
++ }
++ cpt_obj_setindex(obj, index++, ctx);
++
++ if ((err = collect_one_mm(mm, ctx)) != 0)
++ return err;
++ }
++
++ return 0;
++}
++
++static int zcnt, scnt, scnt0, ucnt;
++
++/* Function where_is_anon_page() returns the address of an anonymous page in
++ * the mm of an already dumped process. This happens f.e. after fork(). We do
++ * not use this right now, just keep statistics; it is difficult to restore
++ * such state, but the most direct use is to save space in the dumped image.
++ */
++
++
++static inline unsigned long
++vma_address0(struct page *page, struct vm_area_struct *vma)
++{
++ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
++ unsigned long address;
++
++ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
++ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
++ address |= 1;
++ return address;
++}
++
++static int really_this_one(struct vm_area_struct *vma, unsigned long address,
++ struct page *page)
++{
++ struct mm_struct *mm = vma->vm_mm;
++ pgd_t *pgd;
++ pud_t *pud;
++ pmd_t *pmd;
++ pte_t *pte;
++ spinlock_t *ptl;
++ int result;
++
++ pgd = pgd_offset(mm, address);
++ if (unlikely(!pgd_present(*pgd)))
++ return 0;
++
++ pud = pud_offset(pgd, address);
++ if (!pud_present(*pud))
++ return 0;
++
++ pmd = pmd_offset(pud, address);
++ if (unlikely(!pmd_present(*pmd)))
++ return 0;
++
++ result = 0;
++ pte = pte_offset_map(pmd, address);
++ if (!pte_present(*pte)) {
++ pte_unmap(pte);
++ return 0;
++ }
++
++ ptl = pte_lockptr(mm, pmd);
++ spin_lock(ptl);
++ if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte))
++ result = 1;
++ pte_unmap_unlock(pte, ptl);
++ return result;
++}
++
++static loff_t where_is_anon_page(cpt_object_t *mmobj, unsigned long mapaddr,
++ struct page *page, cpt_context_t * ctx)
++{
++ loff_t mmptr = CPT_NULL;
++ struct anon_vma *anon_vma;
++ struct vm_area_struct *vma;
++ int idx = mmobj->o_index;
++
++ if (!PageAnon(page))
++ return CPT_NULL;
++
++ anon_vma = page_lock_anon_vma(page);
++ if (!anon_vma)
++ return CPT_NULL;
++
++ list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
++ unsigned long addr = vma_address0(page, vma);
++ cpt_object_t *obj;
++
++ /* We do not try to support mremapped regions (addr != mapaddr),
++ * only mmaps directly inherited via fork().
++ * With this limitation we may check self-consistency of
++ * vmas (vm_start, vm_pgoff, anon_vma) before
++ * doing __copy_page_range() in rst_mm.
++ */
++ if (mmobj->o_obj != vma->vm_mm && addr == mapaddr) {
++ obj = lookup_cpt_object(CPT_OBJ_MM, vma->vm_mm, ctx);
++ if (obj && obj->o_pos != CPT_NULL && obj->o_index < idx) {
++ if (really_this_one(vma, addr, page)) {
++ mmptr = obj->o_pos;
++ idx = obj->o_index;
++ }
++ }
++ }
++ }
++ page_unlock_anon_vma(anon_vma);
++
++ return mmptr;
++}
++
++struct page_area
++{
++ int type;
++ unsigned long start;
++ unsigned long end;
++ pgoff_t pgoff;
++ loff_t mm;
++ __u64 list[16];
++};
++
++struct page_desc
++{
++ int type;
++ pgoff_t index;
++ loff_t mm;
++ int shared;
++};
++
++enum {
++ PD_ABSENT,
++ PD_COPY,
++ PD_ZERO,
++ PD_CLONE,
++ PD_FUNKEY,
++ PD_LAZY,
++ PD_ITER,
++ PD_ITERYOUNG,
++};
++
++/* 0: page can be obtained from backstore, or is a still unmapped anonymous
++ page, or something else which does not require a copy.
++ 1: page requires a copy
++ 2: page requires a copy but its content is zero. Quite useless.
++ 3: a write-protected page is shared after fork(). It is to be COWed when modified.
++ 4: page is something unsupported... We copy it right now.
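
The PD_* classes above are per-page verdicts; later in this file dump_one_vma() coalesces consecutive pages with the same verdict into one page_area and writes a single block per run (the can_expand() test). The run-building pattern in isolation, as a minimal standalone sketch (page size and the reduced PD set are hard-coded for the example):

#include <stdio.h>

enum { PD_ABSENT, PD_COPY, PD_LAZY };

/* Flush one [start, end) run of identically-typed pages; stands in
 * for the dump_page_block()/dump_lazypage_block() writers below. */
static void flush_run(int type, unsigned long start, unsigned long end)
{
	printf("type %d: %#lx-%#lx (%lu pages)\n",
	       type, start, end, (end - start) / 4096);
}

int main(void)
{
	/* Per-page classifications, as page_get_desc() would return them. */
	int desc[] = { PD_COPY, PD_COPY, PD_ABSENT, PD_LAZY, PD_LAZY, PD_LAZY };
	unsigned long base = 0x400000, addr;
	unsigned long run_start = base;
	int i, run_type = desc[0];

	for (i = 1; i < 6; i++) {
		addr = base + i * 4096UL;
		if (desc[i] != run_type) {	/* cf. can_expand() failing */
			flush_run(run_type, run_start, addr);
			run_start = addr;
			run_type = desc[i];
		}
	}
	flush_run(run_type, run_start, base + 6 * 4096UL);	/* final run */
	return 0;
}
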
++ */ ++ ++ ++ ++static void page_get_desc(cpt_object_t *mmobj, ++ struct vm_area_struct *vma, unsigned long addr, ++ struct page_desc *pdesc, cpt_context_t * ctx) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *ptep, pte; ++ spinlock_t *ptl; ++ struct page *pg = NULL; ++ pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff; ++ ++ pdesc->index = linear_index; ++ pdesc->shared = 0; ++ pdesc->mm = CPT_NULL; ++ ++ if (vma->vm_flags & VM_IO) { ++ pdesc->type = PD_ABSENT; ++ return; ++ } ++ ++ pgd = pgd_offset(mm, addr); ++ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) ++ goto out_absent; ++ pud = pud_offset(pgd, addr); ++ if (pud_none(*pud) || unlikely(pud_bad(*pud))) ++ goto out_absent; ++ pmd = pmd_offset(pud, addr); ++ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) ++ goto out_absent; ++#ifdef CONFIG_X86 ++ if (pmd_huge(*pmd)) { ++ eprintk_ctx("page_huge\n"); ++ goto out_unsupported; ++ } ++#endif ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++retry: ++#endif ++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); ++ pte = *ptep; ++ pte_unmap(ptep); ++ ++ if (pte_none(pte)) ++ goto out_absent_unlock; ++ ++ if (!pte_present(pte)) { ++ if (pte_file(pte)) { ++ pdesc->index = pte_to_pgoff(pte); ++ goto out_absent_unlock; ++ } ++ if (vma->vm_flags & VM_SHARED) { ++ /* It is impossible: shared mappings cannot be in swap */ ++ eprintk_ctx("shared mapping is not present: %08lx@%Ld\n", addr, mmobj->o_pos); ++ goto out_unsupported_unlock; ++ } ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ /* Otherwise it is in swap. */ ++ if (!ctx->lazy_vm) { ++ int err; ++ /* If lazy transfer is not enabled, ++ * raise it from swap now, so that we ++ * save at least when the page is shared. ++ */ ++ spin_unlock(ptl); ++ err = handle_mm_fault(mm, vma, addr, 0); ++ if (err == VM_FAULT_SIGBUS) ++ goto out_absent; ++ if (err == VM_FAULT_OOM) ++ goto out_absent; ++ err = 0; ++ goto retry; ++ } ++#endif ++ pdesc->type = PD_LAZY; ++ goto out_unlock; ++ } ++ ++ if ((pg = vm_normal_page(vma, addr, pte)) == NULL) { ++ pdesc->type = PD_COPY; ++ goto out_unlock; ++ } ++ ++ get_page(pg); ++ spin_unlock(ptl); ++ ++ if (pg->mapping && !PageAnon(pg)) { ++ if (vma->vm_file == NULL) { ++ eprintk_ctx("pg->mapping!=NULL for fileless vma: %08lx\n", addr); ++ goto out_unsupported; ++ } ++ if (vma->vm_file->f_mapping != pg->mapping) { ++ eprintk_ctx("pg->mapping!=f_mapping: %08lx %p %p %Ld\n", ++ addr, vma->vm_file->f_mapping, pg->mapping, ++ mmobj->o_pos); ++ goto out_unsupported; ++ } ++ pdesc->index = (pg->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); ++ /* Page is in backstore. For us it is like ++ * it is not present. ++ */ ++ goto out_absent; ++ } ++ ++ if (PageReserved(pg)) { ++ /* Special case: ZERO_PAGE is used, when an ++ * anonymous page is accessed but not written. 
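
On the ZERO_PAGE path here, a page that was only ever read needs no image data: a restored anonymous page reads back as zeroes anyway. A userspace illustration of the same skip-all-zero-pages idea (the kernel check above works on the shared ZERO_PAGE mapping rather than on content; this sketch is content-based and illustrative only):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PGSZ 4096

/* All-zero pages can be omitted from a dump; an untouched anonymous
 * page reads back as zeroes after restore. */
static int page_is_zero(const unsigned char *p)
{
	return p[0] == 0 && memcmp(p, p + 1, PGSZ - 1) == 0;
}

int main(void)
{
	unsigned char *buf = calloc(4, PGSZ);
	int i;

	if (!buf)
		return 1;
	buf[2 * PGSZ] = 1;	/* dirty the third page */
	for (i = 0; i < 4; i++)
		printf("page %d: %s\n", i,
		       page_is_zero(buf + i * PGSZ) ? "skip" : "dump");
	free(buf);
	return 0;
}
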
*/ ++ if (pg == ZERO_PAGE(addr)) { ++ if (pte_write(pte)) { ++ eprintk_ctx("not funny already, writable ZERO_PAGE\n"); ++ goto out_unsupported; ++ } ++ zcnt++; ++ goto out_absent; ++ } ++ eprintk_ctx("reserved page %lu at %08lx@%Ld\n", pg->index, ++ addr, mmobj->o_pos); ++ goto out_unsupported; ++ } ++ ++ if (pg == ZERO_PAGE(addr)) { ++ wprintk_ctx("that's how it works now\n"); ++ } ++ ++ if (!pg->mapping) { ++ eprintk_ctx("page without mapping at %08lx@%Ld\n", addr, ++ mmobj->o_pos); ++ goto out_unsupported; ++ } ++ ++ if (pg->mapping && page_mapcount(pg) > 1) { ++ pdesc->shared = 1; ++ pdesc->mm = where_is_anon_page(mmobj, addr, pg, ctx); ++ if (pdesc->mm != CPT_NULL) { ++ scnt0++; ++ pdesc->type = PD_CLONE; ++ goto out_put; ++ } else { ++ scnt++; ++ } ++ } ++#ifdef CONFIG_VZ_CHECKPOINT_ITER ++ if (ctx->iter_done && ++ test_bit(PG_checkpointed, &pg->flags)) { ++ if (pte_write(pte)) { ++ wprintk_ctx("writable PG_checkpointed page\n"); ++ } ++ pdesc->index = page_to_pfn(pg); ++ pdesc->type = pte_young(pte) ? PD_ITERYOUNG : PD_ITER; ++ goto out_put; ++ } ++#endif ++ pdesc->type = pte_young(pte) ? PD_COPY : PD_LAZY; ++ ++out_put: ++ if (pg) ++ put_page(pg); ++ return; ++ ++out_unlock: ++ spin_unlock(ptl); ++ goto out_put; ++ ++out_absent_unlock: ++ spin_unlock(ptl); ++out_absent: ++ pdesc->type = PD_ABSENT; ++ goto out_put; ++ ++out_unsupported_unlock: ++ spin_unlock(ptl); ++out_unsupported: ++ ucnt++; ++ pdesc->type = PD_FUNKEY; ++ goto out_put; ++} ++ ++/* ATTN: We give "current" to get_user_pages(). This is wrong, but get_user_pages() ++ * does not really need this thing. It just stores some page fault stats there. ++ * ++ * BUG: some archs (f.e. sparc64, but not Intel*) require flush cache pages ++ * before accessing vma. ++ */ ++void dump_pages(struct vm_area_struct *vma, unsigned long start, ++ unsigned long end, struct cpt_context *ctx) ++{ ++#define MAX_PAGE_BATCH 16 ++ struct page *pg[MAX_PAGE_BATCH]; ++ int npages = (end - start)/PAGE_SIZE; ++ int count = 0; ++ ++ while (count < npages) { ++ int copy = npages - count; ++ int n; ++ ++ if (copy > MAX_PAGE_BATCH) ++ copy = MAX_PAGE_BATCH; ++ n = get_user_pages(current, vma->vm_mm, start, copy, ++ 0, 1, pg, NULL); ++ if (n == copy) { ++ int i; ++ for (i=0; iwrite(maddr, PAGE_SIZE, ctx); ++ kunmap(pg[i]); ++ } ++ } else { ++ eprintk_ctx("get_user_pages fault"); ++ for ( ; n > 0; n--) ++ page_cache_release(pg[n-1]); ++ return; ++ } ++ start += n*PAGE_SIZE; ++ count += n; ++ for ( ; n > 0; n--) ++ page_cache_release(pg[n-1]); ++ } ++ return; ++} ++ ++int dump_page_block(struct vm_area_struct *vma, struct cpt_page_block *pgb, ++ int copy, ++ struct cpt_context *ctx) ++{ ++ loff_t saved_object; ++ ++ cpt_push_object(&saved_object, ctx); ++ ++ pgb->cpt_object = (copy != PD_LAZY) ? CPT_OBJ_PAGES : CPT_OBJ_LAZYPAGES; ++ pgb->cpt_hdrlen = sizeof(*pgb); ++ pgb->cpt_content = (copy == PD_COPY || copy == PD_LAZY) ? 
CPT_CONTENT_DATA : CPT_CONTENT_VOID; ++ ++ ctx->write(pgb, sizeof(*pgb), ctx); ++ if (copy == PD_COPY || copy == PD_LAZY) ++ dump_pages(vma, pgb->cpt_start, pgb->cpt_end, ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_object, ctx); ++ return 0; ++} ++ ++int dump_remappage_block(struct vm_area_struct *vma, struct page_area *pa, ++ struct cpt_context *ctx) ++{ ++ struct cpt_remappage_block pgb; ++ loff_t saved_object; ++ ++ cpt_push_object(&saved_object, ctx); ++ ++ pgb.cpt_object = CPT_OBJ_REMAPPAGES; ++ pgb.cpt_hdrlen = sizeof(pgb); ++ pgb.cpt_content = CPT_CONTENT_VOID; ++ pgb.cpt_start = pa->start; ++ pgb.cpt_end = pa->end; ++ pgb.cpt_pgoff = pa->pgoff - (pa->end-pa->start)/PAGE_SIZE + 1; ++ ++ ctx->write(&pgb, sizeof(pgb), ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_object, ctx); ++ return 0; ++} ++ ++int dump_copypage_block(struct vm_area_struct *vma, struct page_area *pa, ++ struct cpt_context *ctx) ++{ ++ struct cpt_copypage_block pgb; ++ loff_t saved_object; ++ ++ cpt_push_object(&saved_object, ctx); ++ ++ pgb.cpt_object = CPT_OBJ_COPYPAGES; ++ pgb.cpt_hdrlen = sizeof(pgb); ++ pgb.cpt_content = CPT_CONTENT_VOID; ++ pgb.cpt_start = pa->start; ++ pgb.cpt_end = pa->end; ++ pgb.cpt_source = pa->mm; ++ ++ ctx->write(&pgb, sizeof(pgb), ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_object, ctx); ++ return 0; ++} ++ ++int dump_lazypage_block(struct vm_area_struct *vma, struct page_area *pa, ++ cpt_context_t *ctx) ++{ ++ struct cpt_lazypage_block pgb; ++ loff_t saved_object; ++ ++ cpt_push_object(&saved_object, ctx); ++ ++ pgb.cpt_object = CPT_OBJ_LAZYPAGES; ++ pgb.cpt_hdrlen = sizeof(pgb); ++ pgb.cpt_content = CPT_CONTENT_VOID; ++ pgb.cpt_start = pa->start; ++ pgb.cpt_end = pa->end; ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ pgb.cpt_index = cpt_alloc_pgin_index(vma, pa->start, ++ (pa->end-pa->start)/PAGE_SIZE, ctx); ++#endif ++ ctx->write(&pgb, sizeof(pgb), ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_object, ctx); ++ return 0; ++} ++ ++int dump_iterpage_block(struct vm_area_struct *vma, struct page_area *pa, ++ cpt_context_t *ctx) ++{ ++ struct cpt_iterpage_block pgb; ++ loff_t saved_object; ++ ++ cpt_push_object(&saved_object, ctx); ++ ++ pgb.cpt_object = pa->type == PD_ITER ? 
CPT_OBJ_ITERPAGES : ++ CPT_OBJ_ITERYOUNGPAGES; ++ pgb.cpt_hdrlen = sizeof(pgb); ++ pgb.cpt_content = CPT_CONTENT_VOID; ++ pgb.cpt_start = pa->start; ++ pgb.cpt_end = pa->end; ++ ctx->write(&pgb, sizeof(pgb), ctx); ++ ++ ctx->write(pa->list, 8*((pa->end-pa->start)/PAGE_SIZE), ctx); ++ ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_object, ctx); ++ return 0; ++} ++ ++ ++static int can_expand(struct page_area *pa, struct page_desc *pd) ++{ ++ if (pa->start == pa->end) ++ return 1; ++ if (pa->type != pd->type) ++ return 0; ++ if (pa->type == PD_ITER || pa->type == PD_ITERYOUNG) { ++ if (pa->end - pa->start >= PAGE_SIZE*16) ++ return 0; ++ pa->list[(pa->end - pa->start)/PAGE_SIZE] = pd->index; ++ } ++ if (pa->type == PD_ABSENT) ++ return pd->index == pa->pgoff + 1; ++ if (pa->type == PD_CLONE) ++ return pd->mm == pa->mm; ++ return 1; ++} ++ ++static int dump_one_vma(cpt_object_t *mmobj, ++ struct vm_area_struct *vma, struct cpt_context *ctx) ++{ ++ struct cpt_vma_image *v = cpt_get_buf(ctx); ++ unsigned long addr; ++ loff_t saved_object; ++ struct cpt_page_block pgb; ++ struct page_area pa; ++ int cloned_pages = 0; ++ ++ cpt_push_object(&saved_object, ctx); ++ ++ v->cpt_object = CPT_OBJ_VMA; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_start = vma->vm_start; ++ v->cpt_end = vma->vm_end; ++ v->cpt_flags = vma->vm_flags; ++ if (vma->vm_flags&VM_HUGETLB) { ++ eprintk_ctx("huge TLB VMAs are still not supported\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ v->cpt_pgprot = vma->vm_page_prot.pgprot; ++ v->cpt_pgoff = vma->vm_pgoff; ++ v->cpt_file = CPT_NULL; ++#ifndef CONFIG_IA64 ++ if ((void *)vma->vm_start == vma->vm_mm->context.vdso && ++ vma->vm_ops == &special_mapping_vmops) ++ v->cpt_type = CPT_VMA_VDSO; ++ else ++#endif ++ v->cpt_type = CPT_VMA_TYPE_0; ++ v->cpt_anonvma = 0; ++ ++ /* We have to remember what VMAs are bound to one anon_vma. ++ * So, we store an identifier of group of VMAs. It is handy ++ * to use absolute address of anon_vma as this identifier. 
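
dump_one_vma() below ends a run twice with the same five-way if/else chain, once when can_expand() fails mid-scan and once for the final run after the loop. A hedged refactoring sketch that both call sites could share, using only the page_area type and dump_*_block helpers this file already defines (flush_page_area itself is hypothetical, not in the patch):

/* Sketch only: collapse the duplicated run-flush logic in dump_one_vma().
 * Assumes the kernel context of this file; not standalone code. */
static int flush_page_area(struct vm_area_struct *vma, struct page_area *pa,
			   int *cloned_pages, struct cpt_context *ctx)
{
	struct cpt_page_block pgb;

	if (pa->type == PD_COPY || pa->type == PD_ZERO) {
		pgb.cpt_start = pa->start;
		pgb.cpt_end = pa->end;
		return dump_page_block(vma, &pgb, pa->type, ctx);
	}
	if (pa->type == PD_CLONE) {
		(*cloned_pages)++;
		return dump_copypage_block(vma, pa, ctx);
	}
	if (pa->type == PD_LAZY)
		return dump_lazypage_block(vma, pa, ctx);
	if (pa->type == PD_ITER || pa->type == PD_ITERYOUNG) {
		(*cloned_pages)++;
		return dump_iterpage_block(vma, pa, ctx);
	}
	if (pa->type == PD_ABSENT &&
	    pa->pgoff != (pa->end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1)
		return dump_remappage_block(vma, pa, ctx);
	return 0;
}
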
*/ ++ v->cpt_anonvmaid = (unsigned long)vma->anon_vma; ++ ++ if (vma->vm_file) { ++ struct file *filp; ++ cpt_object_t *obj = lookup_cpt_object(CPT_OBJ_FILE, vma->vm_file, ctx); ++ if (obj == NULL) BUG(); ++ filp = obj->o_obj; ++ if (filp->f_op == &shm_file_operations) { ++ struct shm_file_data *sfd = filp->private_data; ++ ++ v->cpt_type = CPT_VMA_TYPE_SHM; ++ obj = lookup_cpt_object(CPT_OBJ_FILE, sfd->file, ctx); ++ } ++ v->cpt_file = obj->o_pos; ++ } ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ if (v->cpt_type == CPT_VMA_VDSO) ++ goto out; ++ ++ pa.type = PD_ABSENT; ++ pa.pgoff = vma->vm_pgoff; ++ pa.mm = CPT_NULL; ++ pa.start = vma->vm_start; ++ pa.end = vma->vm_start; ++ ++ for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { ++ struct page_desc pd; ++ ++ page_get_desc(mmobj, vma, addr, &pd, ctx); ++ cloned_pages += pd.shared; ++ ++ if (pd.type == PD_FUNKEY) { ++ eprintk_ctx("dump_one_vma: funkey page\n"); ++ return -EINVAL; ++ } ++ ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ if (pd.type == PD_LAZY && ++ (ctx->lazy_vm == 0 || (vma->vm_flags&VM_LOCKED))) ++ pd.type = PD_COPY; ++#else ++ if (pd.type == PD_LAZY) ++ pd.type = PD_COPY; ++#endif ++ ++ if (!can_expand(&pa, &pd)) { ++ if (pa.type == PD_COPY || ++ pa.type == PD_ZERO) { ++ pgb.cpt_start = pa.start; ++ pgb.cpt_end = pa.end; ++ dump_page_block(vma, &pgb, pa.type, ctx); ++ } else if (pa.type == PD_CLONE) { ++ dump_copypage_block(vma, &pa, ctx); ++ cloned_pages++; ++ } else if (pa.type == PD_LAZY) { ++ dump_lazypage_block(vma, &pa, ctx); ++ } else if (pa.type == PD_ITER || pa.type == PD_ITERYOUNG) { ++ dump_iterpage_block(vma, &pa, ctx); ++ cloned_pages++; ++ } else if (pa.type == PD_ABSENT && ++ pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) { ++ dump_remappage_block(vma, &pa, ctx); ++ } ++ pa.start = addr; ++ } ++ pa.type = pd.type; ++ pa.end = addr + PAGE_SIZE; ++ pa.pgoff = pd.index; ++ if (addr == pa.start) ++ pa.list[0] = pd.index; ++ pa.mm = pd.mm; ++ } ++ ++ if (pa.end > pa.start) { ++ if (pa.type == PD_COPY || ++ pa.type == PD_ZERO) { ++ pgb.cpt_start = pa.start; ++ pgb.cpt_end = pa.end; ++ dump_page_block(vma, &pgb, pa.type, ctx); ++ } else if (pa.type == PD_CLONE) { ++ dump_copypage_block(vma, &pa, ctx); ++ cloned_pages++; ++ } else if (pa.type == PD_LAZY) { ++ dump_lazypage_block(vma, &pa, ctx); ++ } else if (pa.type == PD_ITER || pa.type == PD_ITERYOUNG) { ++ dump_iterpage_block(vma, &pa, ctx); ++ cloned_pages++; ++ } else if (pa.type == PD_ABSENT && ++ pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) { ++ dump_remappage_block(vma, &pa, ctx); ++ } ++ } ++ ++ if (cloned_pages) { ++ __u32 anonvma = 1; ++ loff_t anonpos = ctx->current_object + offsetof(struct cpt_vma_image, cpt_anonvma); ++ ctx->pwrite(&anonvma, 4, ctx, anonpos); ++ } ++ ++out: ++ cpt_close_object(ctx); ++ ++ cpt_pop_object(&saved_object, ctx); ++ ++ return 0; ++} ++ ++static int dump_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx, ++ cpt_context_t *ctx) ++{ ++ loff_t saved_object; ++ struct cpt_aio_ctx_image aimg; ++ ++ if (!list_empty(&aio_ctx->run_list) || ++ !list_empty(&aio_ctx->active_reqs) || ++ aio_ctx->reqs_active) { ++ eprintk_ctx("AIO is active after suspend\n"); ++ return -EBUSY; ++ } ++ ++ cpt_push_object(&saved_object, ctx); ++ ++ aimg.cpt_next = CPT_ALIGN(sizeof(aimg)); ++ aimg.cpt_object = CPT_OBJ_AIO_CONTEXT; ++ aimg.cpt_hdrlen = sizeof(aimg); ++ aimg.cpt_content = CPT_CONTENT_ARRAY; ++ ++ aimg.cpt_max_reqs = aio_ctx->max_reqs; ++ aimg.cpt_ring_pages = 
aio_ctx->ring_info.nr_pages; ++ aimg.cpt_nr = aio_ctx->ring_info.nr; ++ aimg.cpt_tail = aio_ctx->ring_info.tail; ++ aimg.cpt_mmap_base = aio_ctx->ring_info.mmap_base; ++ ++ ctx->write(&aimg, sizeof(aimg), ctx); ++ ++ cpt_pop_object(&saved_object, ctx); ++ return 0; ++} ++ ++static int dump_one_mm(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct mm_struct *mm = obj->o_obj; ++ struct vm_area_struct *vma; ++ struct cpt_mm_image *v = cpt_get_buf(ctx); ++ ++ cpt_open_object(obj, ctx); ++ ++ v->cpt_next = -1; ++ v->cpt_object = CPT_OBJ_MM; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_start_code = mm->start_code; ++ v->cpt_end_code = mm->end_code; ++ v->cpt_start_data = mm->start_data; ++ v->cpt_end_data = mm->end_data; ++ v->cpt_start_brk = mm->start_brk; ++ v->cpt_brk = mm->brk; ++ v->cpt_start_stack = mm->start_stack; ++ v->cpt_start_arg = mm->arg_start; ++ v->cpt_end_arg = mm->arg_end; ++ v->cpt_start_env = mm->env_start; ++ v->cpt_end_env = mm->env_end; ++ v->cpt_def_flags = mm->def_flags; ++#ifdef CONFIG_BEANCOUNTERS ++ v->cpt_mmub = cpt_lookup_ubc(mm->mm_ub, ctx); ++#endif ++ /* FIXME when coredump mask exceeds 8 bits */ ++ WARN_ON(mm->flags >> 8); ++ v->cpt_dumpable = mm->flags; ++ v->cpt_vps_dumpable = mm->vps_dumpable; ++ v->cpt_used_hugetlb = 0; /* not used */ ++#ifndef CONFIG_IA64 ++ v->cpt_vdso = (__u32)(unsigned long)mm->context.vdso; ++#endif ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++#ifdef CONFIG_X86 ++ if (mm->context.size) { ++ loff_t saved_object; ++ struct cpt_obj_bits b; ++ int size; ++ ++ dprintk_ctx("nontrivial LDT\n"); ++ ++ cpt_push_object(&saved_object, ctx); ++ ++ cpt_open_object(NULL, ctx); ++ b.cpt_next = CPT_NULL; ++ b.cpt_object = CPT_OBJ_BITS; ++ b.cpt_hdrlen = sizeof(b); ++ b.cpt_content = CPT_CONTENT_MM_CONTEXT; ++ b.cpt_size = mm->context.size*LDT_ENTRY_SIZE; ++ ++ ctx->write(&b, sizeof(b), ctx); ++ ++ size = mm->context.size*LDT_ENTRY_SIZE; ++ ++#if defined(CONFIG_X86_64) || defined(CONFIG_XEN) || \ ++ LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,19) ++ ctx->write(mm->context.ldt, size, ctx); ++#else ++ for (i = 0; i < size; i += PAGE_SIZE) { ++ int nr = i / PAGE_SIZE, bytes; ++ char *kaddr = kmap(mm->context.ldt_pages[nr]); ++ ++ bytes = size - i; ++ if (bytes > PAGE_SIZE) ++ bytes = PAGE_SIZE; ++ ctx->write(kaddr, bytes, ctx); ++ kunmap(mm->context.ldt_pages[nr]); ++ } ++#endif ++ ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_object, ctx); ++ } ++#endif ++ ++ for (vma = mm->mmap; vma; vma = vma->vm_next) { ++ int err; ++ ++ if ((err = dump_one_vma(obj, vma, ctx)) != 0) ++ return err; ++ } ++ ++ if (mm->ioctx_list) { ++ struct kioctx *aio_ctx; ++ int err; ++ ++ for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next) ++ if ((err = dump_one_aio_ctx(mm, aio_ctx, ctx)) != 0) ++ return err; ++ } ++ ++ cpt_close_object(ctx); ++ ++ return 0; ++} ++ ++int cpt_dump_vm(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ scnt = scnt0 = zcnt = 0; ++ ++ cpt_open_section(ctx, CPT_SECT_MM); ++ ++ for_each_object(obj, CPT_OBJ_MM) { ++ int err; ++ ++ if ((err = dump_one_mm(obj, ctx)) != 0) ++ return err; ++ } ++ ++ cpt_close_section(ctx); ++ ++ if (scnt) ++ dprintk_ctx("cpt_dump_vm: %d shared private anon pages\n", scnt); ++ if (scnt0) ++ dprintk_ctx("cpt_dump_vm: %d anon pages are cloned\n", scnt0); ++ if (zcnt) ++ dprintk_ctx("cpt_dump_vm: %d silly pages canceled\n", zcnt); ++ return 0; ++} +diff --git a/kernel/cpt/cpt_mm.h b/kernel/cpt/cpt_mm.h +new file mode 100644 +index 
0000000..dc2c483 +--- /dev/null ++++ b/kernel/cpt/cpt_mm.h +@@ -0,0 +1,35 @@ ++int cpt_collect_mm(cpt_context_t *); ++ ++int cpt_dump_vm(struct cpt_context *ctx); ++ ++__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx); ++int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx); ++int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx); ++ ++int cpt_mm_prepare(unsigned long veid); ++ ++int cpt_free_pgin_dir(struct cpt_context *); ++int cpt_start_pagein(struct cpt_context *); ++int rst_setup_pagein(struct cpt_context *); ++int rst_complete_pagein(struct cpt_context *, int); ++int rst_pageind(struct cpt_context *); ++int cpt_iteration(cpt_context_t *ctx); ++int rst_iteration(cpt_context_t *ctx); ++void rst_drop_iter_dir(cpt_context_t *ctx); ++int rst_iter(struct vm_area_struct *vma, u64 pfn, ++ unsigned long addr, cpt_context_t * ctx); ++ ++int rst_swapoff(struct cpt_context *); ++ ++#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES ++struct linux_binprm; ++extern int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack, ++ unsigned long map_address); ++#endif ++ ++#ifdef CONFIG_X86 ++extern struct page *vdso32_pages[1]; ++#define vsyscall_addr page_address(vdso32_pages[0]) ++#endif ++ ++extern struct vm_operations_struct special_mapping_vmops; +diff --git a/kernel/cpt/cpt_net.c b/kernel/cpt/cpt_net.c +new file mode 100644 +index 0000000..373db60 +--- /dev/null ++++ b/kernel/cpt/cpt_net.c +@@ -0,0 +1,610 @@ ++/* ++ * ++ * kernel/cpt/cpt_net.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_kernel.h" ++#include "cpt_syscalls.h" ++ ++static void cpt_dump_veth(struct net_device *dev, struct cpt_context * ctx) ++{ ++#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) ++ struct cpt_veth_image v; ++ struct veth_struct *veth; ++ ++ if (!KSYMREF(veth_open) || dev->open != KSYMREF(veth_open)) ++ return; ++ ++ veth = veth_from_netdev(dev); ++ cpt_open_object(NULL, ctx); ++ ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_NET_VETH; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_VOID; ++ ++ v.cpt_allow_mac_change = veth->allow_mac_change; ++ ++ ctx->write(&v, sizeof(v), ctx); ++ cpt_close_object(ctx); ++#endif ++ return; ++} ++ ++static void cpt_dump_netstats(struct net_device *dev, struct cpt_context * ctx) ++{ ++ struct cpt_netstats_image *n; ++ struct net_device_stats *stats; ++ ++ if (!dev->get_stats) ++ return; ++ ++ n = cpt_get_buf(ctx); ++ stats = dev->get_stats(dev); ++ cpt_open_object(NULL, ctx); ++ ++ n->cpt_next = CPT_NULL; ++ n->cpt_object = CPT_OBJ_NET_STATS; ++ n->cpt_hdrlen = sizeof(*n); ++ n->cpt_content = CPT_CONTENT_VOID; ++ ++ n->cpt_rx_packets = stats->rx_packets; ++ n->cpt_tx_packets = stats->tx_packets; ++ n->cpt_rx_bytes = stats->rx_bytes; ++ n->cpt_tx_bytes = stats->tx_bytes; ++ n->cpt_rx_errors = stats->rx_errors; ++ n->cpt_tx_errors = stats->tx_errors; ++ n->cpt_rx_dropped = stats->rx_dropped; ++ n->cpt_tx_dropped = stats->tx_dropped; ++ n->cpt_multicast = stats->multicast; ++ n->cpt_collisions = stats->collisions; ++ n->cpt_rx_length_errors = 
stats->rx_length_errors; ++ n->cpt_rx_over_errors = stats->rx_over_errors; ++ n->cpt_rx_crc_errors = stats->rx_crc_errors; ++ n->cpt_rx_frame_errors = stats->rx_frame_errors; ++ n->cpt_rx_fifo_errors = stats->rx_fifo_errors; ++ n->cpt_rx_missed_errors = stats->rx_missed_errors; ++ n->cpt_tx_aborted_errors = stats->tx_aborted_errors; ++ n->cpt_tx_carrier_errors = stats->tx_carrier_errors; ++ n->cpt_tx_fifo_errors = stats->tx_fifo_errors; ++ n->cpt_tx_heartbeat_errors = stats->tx_heartbeat_errors; ++ n->cpt_tx_window_errors = stats->tx_window_errors; ++ n->cpt_rx_compressed = stats->rx_compressed; ++ n->cpt_tx_compressed = stats->tx_compressed; ++ ++ ctx->write(n, sizeof(*n), ctx); ++ cpt_close_object(ctx); ++ cpt_release_buf(ctx); ++ return; ++} ++ ++static void cpt_dump_tuntap(struct net_device *dev, struct cpt_context * ctx) ++{ ++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) ++ struct cpt_tuntap_image v; ++ struct tun_struct *tun; ++ cpt_object_t *obj; ++ ++ if (dev->open != tun_net_open) ++ return; ++ ++ tun = netdev_priv(dev); ++ cpt_open_object(NULL, ctx); ++ ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_NET_TUNTAP; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_VOID; ++ ++ v.cpt_owner = tun->owner; ++ v.cpt_flags = tun->flags; ++ v.cpt_attached = tun->attached; ++ ++ if (tun->bind_file) { ++ obj = lookup_cpt_object(CPT_OBJ_FILE, tun->bind_file, ctx); ++ BUG_ON(!obj); ++ v.cpt_bindfile = obj->o_pos; ++ } ++ ++ v.cpt_if_flags = tun->if_flags; ++ BUG_ON(sizeof(v.cpt_dev_addr) != sizeof(tun->dev_addr)); ++ memcpy(v.cpt_dev_addr, tun->dev_addr, sizeof(v.cpt_dev_addr)); ++ BUG_ON(sizeof(v.cpt_chr_filter) != sizeof(tun->chr_filter)); ++ memcpy(v.cpt_chr_filter, tun->chr_filter, sizeof(v.cpt_chr_filter)); ++ BUG_ON(sizeof(v.cpt_net_filter) != sizeof(tun->net_filter)); ++ memcpy(v.cpt_net_filter, tun->net_filter, sizeof(v.cpt_net_filter)); ++ ctx->write(&v, sizeof(v), ctx); ++ cpt_close_object(ctx); ++#endif ++ return; ++} ++ ++int cpt_dump_link(struct cpt_context * ctx) ++{ ++ struct net *net = get_exec_env()->ve_netns; ++ struct net_device *dev; ++ ++ cpt_open_section(ctx, CPT_SECT_NET_DEVICE); ++ for_each_netdev(net, dev) { ++ struct cpt_netdev_image v; ++ struct cpt_hwaddr_image hw; ++ loff_t saved_obj; ++ ++ cpt_open_object(NULL, ctx); ++ ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_NET_DEVICE; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_ARRAY; ++ ++ v.cpt_index = dev->ifindex; ++ v.cpt_flags = dev->flags; ++ memcpy(v.cpt_name, dev->name, IFNAMSIZ); ++ ctx->write(&v, sizeof(v), ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ ++ cpt_dump_tuntap(dev, ctx); ++ ++ cpt_dump_veth(dev, ctx); ++ ++ /* Dump hardware address */ ++ cpt_open_object(NULL, ctx); ++ hw.cpt_next = CPT_NULL; ++ hw.cpt_object = CPT_OBJ_NET_HWADDR; ++ hw.cpt_hdrlen = sizeof(hw); ++ hw.cpt_content = CPT_CONTENT_VOID; ++ BUG_ON(sizeof(hw.cpt_dev_addr) != sizeof(dev->dev_addr)); ++ memcpy(hw.cpt_dev_addr, dev->dev_addr, sizeof(hw.cpt_dev_addr)); ++ ctx->write(&hw, sizeof(hw), ctx); ++ cpt_close_object(ctx); ++ ++ cpt_dump_netstats(dev, ctx); ++ ++ cpt_pop_object(&saved_obj, ctx); ++ ++ cpt_close_object(ctx); ++ ++ if (dev != net->loopback_dev ++#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) ++ && !(KSYMREF(veth_open) && dev->open == KSYMREF(veth_open)) ++#endif ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ && dev != get_exec_env()->_venet_dev ++#endif ++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) ++ && dev->open != 
tun_net_open ++#endif ++ ) { ++ eprintk_ctx("unsupported netdevice %s\n", dev->name); ++ cpt_close_section(ctx); ++ return -EBUSY; ++ } ++ } ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++int cpt_suspend_network(struct cpt_context *ctx) ++{ ++ get_exec_env()->disable_net = 1; ++ synchronize_net(); ++ return 0; ++} ++ ++int cpt_resume_network(struct cpt_context *ctx) ++{ ++ struct ve_struct *env; ++ env = get_ve_by_id(ctx->ve_id); ++ if (!env) ++ return -ESRCH; ++ env->disable_net = 0; ++ put_ve(env); ++ return 0; ++} ++ ++int cpt_dump_ifaddr(struct cpt_context * ctx) ++{ ++ struct net *net = get_exec_env()->ve_netns; ++ struct net_device *dev; ++ ++ cpt_open_section(ctx, CPT_SECT_NET_IFADDR); ++ for_each_netdev(net, dev) { ++ struct in_device *idev = in_dev_get(dev); ++ struct in_ifaddr *ifa; ++ ++ if (!idev) ++ continue; ++ ++ for (ifa = idev->ifa_list; ifa; ifa = ifa->ifa_next) { ++ struct cpt_ifaddr_image v; ++ cpt_open_object(NULL, ctx); ++ ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_NET_IFADDR; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_VOID; ++ ++ v.cpt_index = dev->ifindex; ++ v.cpt_family = AF_INET; ++ v.cpt_masklen = ifa->ifa_prefixlen; ++ v.cpt_flags = ifa->ifa_flags; ++ v.cpt_scope = ifa->ifa_scope; ++ memset(&v.cpt_address, 0, sizeof(v.cpt_address)); ++ memset(&v.cpt_peer, 0, sizeof(v.cpt_peer)); ++ memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast)); ++ v.cpt_address[0] = ifa->ifa_local; ++ v.cpt_peer[0] = ifa->ifa_address; ++ v.cpt_broadcast[0] = ifa->ifa_broadcast; ++ memcpy(v.cpt_label, ifa->ifa_label, IFNAMSIZ); ++ ctx->write(&v, sizeof(v), ctx); ++ cpt_close_object(ctx); ++ } ++ in_dev_put(idev); ++ } ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ for_each_netdev(net, dev) { ++ struct inet6_dev *idev = in6_dev_get(dev); ++ struct inet6_ifaddr *ifa; ++ ++ if (!idev) ++ continue; ++ ++ for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) { ++ struct cpt_ifaddr_image v; ++ ++ if (dev == net->loopback_dev && ++ ifa->prefix_len == 128 && ++ ifa->addr.s6_addr32[0] == 0 && ++ ifa->addr.s6_addr32[1] == 0 && ++ ifa->addr.s6_addr32[2] == 0 && ++ ifa->addr.s6_addr32[3] == htonl(1)) ++ continue; ++ ++ cpt_open_object(NULL, ctx); ++ ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_NET_IFADDR; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_VOID; ++ ++ v.cpt_index = dev->ifindex; ++ v.cpt_family = AF_INET6; ++ v.cpt_masklen = ifa->prefix_len; ++ v.cpt_flags = ifa->flags; ++ v.cpt_scope = ifa->scope; ++ v.cpt_valid_lft = ifa->valid_lft; ++ v.cpt_prefered_lft = ifa->prefered_lft; ++ memcpy(&v.cpt_address, &ifa->addr, 16); ++ memcpy(&v.cpt_peer, &ifa->addr, 16); ++ memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast)); ++ memcpy(v.cpt_label, dev->name, IFNAMSIZ); ++ ctx->write(&v, sizeof(v), ctx); ++ cpt_close_object(ctx); ++ } ++ in6_dev_put(idev); ++ } ++#endif ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++static int cpt_dump_route(struct cpt_context * ctx) ++{ ++ int err; ++ struct socket *sock; ++ struct msghdr msg; ++ struct iovec iov; ++ struct { ++ struct nlmsghdr nlh; ++ struct rtgenmsg g; ++ } req; ++ struct sockaddr_nl nladdr; ++ struct cpt_object_hdr v; ++ mm_segment_t oldfs; ++ char *pg; ++ ++ err = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock); ++ if (err) ++ return err; ++ ++ memset(&nladdr, 0, sizeof(nladdr)); ++ nladdr.nl_family = AF_NETLINK; ++ ++ req.nlh.nlmsg_len = sizeof(req); ++ req.nlh.nlmsg_type = RTM_GETROUTE; ++ req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST; ++ 
req.nlh.nlmsg_pid = 0;
++	req.g.rtgen_family = AF_INET;
++
++	iov.iov_base=&req;
++	iov.iov_len=sizeof(req);
++	msg.msg_name=&nladdr;
++	msg.msg_namelen=sizeof(nladdr);
++	msg.msg_iov=&iov;
++	msg.msg_iovlen=1;
++	msg.msg_control=NULL;
++	msg.msg_controllen=0;
++	msg.msg_flags=MSG_DONTWAIT;
++
++	oldfs = get_fs(); set_fs(KERNEL_DS);
++	err = sock_sendmsg(sock, &msg, sizeof(req));
++	set_fs(oldfs);
++
++	if (err < 0)
++		goto out_sock;
++
++	pg = (char*)__get_free_page(GFP_KERNEL);
++	if (pg == NULL) {
++		err = -ENOMEM;
++		goto out_sock;
++	}
++
++	cpt_open_section(ctx, CPT_SECT_NET_ROUTE);
++	cpt_open_object(NULL, ctx);
++	v.cpt_next = CPT_NULL;
++	v.cpt_object = CPT_OBJ_NET_ROUTE;
++	v.cpt_hdrlen = sizeof(v);
++	v.cpt_content = CPT_CONTENT_NLMARRAY;
++
++	ctx->write(&v, sizeof(v), ctx);
++
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++restart:
++#endif
++	for (;;) {
++		struct nlmsghdr *h;
++
++		iov.iov_base = pg;
++		iov.iov_len = PAGE_SIZE;
++
++		oldfs = get_fs(); set_fs(KERNEL_DS);
++		err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT);
++		set_fs(oldfs);
++
++		if (err < 0)
++			goto out_sock_pg;
++		if (msg.msg_flags & MSG_TRUNC) {
++			err = -ENOBUFS;
++			goto out_sock_pg;
++		}
++
++		h = (struct nlmsghdr*)pg;
++		while (NLMSG_OK(h, err)) {
++			if (h->nlmsg_type == NLMSG_DONE) {
++				err = 0;
++				goto done;
++			}
++			if (h->nlmsg_type == NLMSG_ERROR) {
++				struct nlmsgerr *errm = (struct nlmsgerr*)NLMSG_DATA(h);
++				err = errm->error;
++				eprintk_ctx("NLMSG error: %d\n", errm->error);
++				goto done;
++			}
++			if (h->nlmsg_type != RTM_NEWROUTE) {
++				eprintk_ctx("NLMSG: %d\n", h->nlmsg_type);
++				err = -EINVAL;
++				goto done;
++			}
++			ctx->write(h, NLMSG_ALIGN(h->nlmsg_len), ctx);
++			h = NLMSG_NEXT(h, err);
++		}
++		if (err) {
++			eprintk_ctx("!!!Remnant of size %d %d %d\n", err, h->nlmsg_len, h->nlmsg_type);
++			err = -EINVAL;
++			break;
++		}
++	}
++done:
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++	if (!err && req.g.rtgen_family == AF_INET) {
++		req.g.rtgen_family = AF_INET6;
++		iov.iov_base=&req;
++		iov.iov_len=sizeof(req);
++		msg.msg_name=&nladdr;
++		msg.msg_namelen=sizeof(nladdr);
++		msg.msg_iov=&iov;
++		msg.msg_iovlen=1;
++		msg.msg_control=NULL;
++		msg.msg_controllen=0;
++		msg.msg_flags=MSG_DONTWAIT;
++
++		oldfs = get_fs(); set_fs(KERNEL_DS);
++		err = sock_sendmsg(sock, &msg, sizeof(req));
++		set_fs(oldfs);
++
++		if (err > 0)
++			goto restart;
++	}
++#endif
++	ctx->align(ctx);
++	cpt_close_object(ctx);
++	cpt_close_section(ctx);
++
++out_sock_pg:
++	free_page((unsigned long)pg);
++out_sock:
++	sock_release(sock);
++	return err;
++}
++
++static int dumpfn(void *arg)
++{
++	int i;
++	int *pfd = arg;
++	char *argv[] = { "iptables-save", "-c", NULL };
++
++	i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0);
++	if (i < 0) {
++		eprintk("cannot enter ve to dump iptables\n");
++		module_put(THIS_MODULE);
++		return 255 << 8;
++	}
++
++	if (pfd[1] != 1)
++		sc_dup2(pfd[1], 1);
++
++	for (i=0; i<current->files->fdt->max_fds; i++) {
++		if (i != 1)
++			sc_close(i);
++	}
++
++	module_put(THIS_MODULE);
++
++	set_fs(KERNEL_DS);
++	i = sc_execve("/sbin/iptables-save", argv, NULL);
++	if (i == -ENOENT)
++		i = sc_execve("/usr/sbin/iptables-save", argv, NULL);
++	eprintk("failed to exec iptables-save: %d\n", i);
++	return 255 << 8;
++}
++
++
++static int cpt_dump_iptables(struct cpt_context * ctx)
++{
++	int err = 0;
++#ifdef CONFIG_VE_IPTABLES
++	int pid;
++	int pfd[2];
++	struct file *f;
++	struct cpt_object_hdr v;
++	char buf[16];
++	loff_t pos;
++	int n;
++	int status;
++	
mm_segment_t oldfs; ++ sigset_t ignore, blocked; ++ ++ if (!(get_exec_env()->_iptables_modules & VE_IP_IPTABLES_MOD)) ++ return 0; ++ ++ err = sc_pipe(pfd); ++ if (err < 0) { ++ eprintk_ctx("sc_pipe: %d\n", err); ++ return err; ++ } ++ ignore.sig[0] = CPT_SIG_IGNORE_MASK; ++ sigprocmask(SIG_BLOCK, &ignore, &blocked); ++ err = pid = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0); ++ if (err < 0) { ++ eprintk_ctx("local_kernel_thread: %d\n", err); ++ goto out; ++ } ++ ++ f = fget(pfd[0]); ++ sc_close(pfd[1]); ++ sc_close(pfd[0]); ++ ++ cpt_open_section(ctx, CPT_SECT_NET_IPTABLES); ++ ++ cpt_open_object(NULL, ctx); ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_NAME; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_NAME; ++ ++ ctx->write(&v, sizeof(v), ctx); ++ ++ pos = ctx->file->f_pos; ++ do { ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos); ++ set_fs(oldfs); ++ if (n > 0) ++ ctx->write(buf, n, ctx); ++ } while (n > 0); ++ ++ if (n < 0) ++ eprintk_ctx("read: %d\n", n); ++ ++ fput(f); ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ if ((err = sc_waitx(pid, 0, &status)) < 0) ++ eprintk_ctx("wait4: %d\n", err); ++ else if ((status & 0x7f) == 0) { ++ err = (status & 0xff00) >> 8; ++ if (err != 0) { ++ eprintk_ctx("iptables-save exited with %d\n", err); ++ err = -EINVAL; ++ } ++ } else { ++ eprintk_ctx("iptables-save terminated\n"); ++ err = -EINVAL; ++ } ++ set_fs(oldfs); ++ sigprocmask(SIG_SETMASK, &blocked, NULL); ++ ++ if (ctx->file->f_pos != pos) { ++ buf[0] = 0; ++ ctx->write(buf, 1, ctx); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ cpt_close_section(ctx); ++ } else { ++ pos = ctx->current_section; ++ cpt_close_object(ctx); ++ cpt_close_section(ctx); ++ ctx->sections[CPT_SECT_NET_IPTABLES] = CPT_NULL; ++ ctx->file->f_pos = pos; ++ } ++ return n ? : err; ++ ++out: ++ if (pfd[1] >= 0) ++ sc_close(pfd[1]); ++ if (pfd[0] >= 0) ++ sc_close(pfd[0]); ++ sigprocmask(SIG_SETMASK, &blocked, NULL); ++#endif ++ return err; ++} ++ ++int cpt_dump_ifinfo(struct cpt_context * ctx) ++{ ++ int err; ++ ++ rtnl_lock(); ++ err = cpt_dump_link(ctx); ++ if (!err) ++ err = cpt_dump_ifaddr(ctx); ++ rtnl_unlock(); ++ if (!err) ++ err = cpt_dump_route(ctx); ++ if (!err) ++ err = cpt_dump_iptables(ctx); ++ return err; ++} +diff --git a/kernel/cpt/cpt_net.h b/kernel/cpt/cpt_net.h +new file mode 100644 +index 0000000..5d33877 +--- /dev/null ++++ b/kernel/cpt/cpt_net.h +@@ -0,0 +1,7 @@ ++int cpt_dump_ifinfo(struct cpt_context *ctx); ++int rst_restore_net(struct cpt_context *ctx); ++int cpt_suspend_network(struct cpt_context *ctx); ++int cpt_resume_network(struct cpt_context *ctx); ++int rst_resume_network(struct cpt_context *ctx); ++int cpt_dump_ip_conntrack(struct cpt_context *ctx); ++int rst_restore_ip_conntrack(struct cpt_context * ctx); +diff --git a/kernel/cpt/cpt_obj.c b/kernel/cpt/cpt_obj.c +new file mode 100644 +index 0000000..7ab23d7 +--- /dev/null ++++ b/kernel/cpt/cpt_obj.c +@@ -0,0 +1,162 @@ ++/* ++ * ++ * kernel/cpt/cpt_obj.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ *
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	obj = kmalloc(sizeof(cpt_object_t), gfp);
++	if (obj) {
++		INIT_LIST_HEAD(&obj->o_list);
++		INIT_LIST_HEAD(&obj->o_hash);
++		INIT_LIST_HEAD(&obj->o_alist);
++		obj->o_count = 1;
++		obj->o_pos = CPT_NULL;
++		obj->o_lock = 0;
++		obj->o_parent = NULL;
++		obj->o_index = CPT_NOINDEX;
++		obj->o_obj = NULL;
++		obj->o_image = NULL;
++		ctx->objcount++;
++	}
++	return obj;
++}
++
++void free_cpt_object(cpt_object_t *obj, cpt_context_t *ctx)
++{
++	list_del(&obj->o_alist);
++	kfree(obj);
++	ctx->objcount--;
++}
++
++void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_context_t *ctx)
++{
++	list_add_tail(&obj->o_list, &ctx->object_array[type]);
++}
++
++void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj,
++		cpt_object_t *head, cpt_context_t *ctx)
++{
++	list_add(&obj->o_list, &head->o_list);
++}
++
++cpt_object_t * __cpt_object_add(enum _cpt_object_type type, void *p,
++		unsigned gfp_mask, cpt_context_t *ctx)
++{
++	cpt_object_t *obj;
++
++	obj = lookup_cpt_object(type, p, ctx);
++
++	if (obj) {
++		obj->o_count++;
++		return obj;
++	}
++
++	if ((obj = alloc_cpt_object(gfp_mask, ctx)) != NULL) {
++		if (p)
++			cpt_obj_setobj(obj, p, ctx);
++		intern_cpt_object(type, obj, ctx);
++		return obj;
++	}
++	return NULL;
++}
++
++cpt_object_t * cpt_object_add(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
++{
++	return __cpt_object_add(type, p, GFP_KERNEL, ctx);
++}
++
++cpt_object_t * cpt_object_get(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
++{
++	cpt_object_t *obj;
++
++	obj = lookup_cpt_object(type, p, ctx);
++
++	if (obj)
++		obj->o_count++;
++
++	return obj;
++}
++
++int cpt_object_init(cpt_context_t *ctx)
++{
++	int i;
++
++	for (i=0; i<CPT_OBJ_MAX; i++) {
++		INIT_LIST_HEAD(&ctx->object_array[i]);
++	}
++	return 0;
++}
++
++int cpt_object_destroy(cpt_context_t *ctx)
++{
++	int i;
++
++	for (i=0; i<CPT_OBJ_MAX; i++) {
++		while (!list_empty(&ctx->object_array[i])) {
++			struct list_head *head = ctx->object_array[i].next;
++			cpt_object_t *obj = list_entry(head, cpt_object_t, o_list);
++			list_del(head);
++			if (obj->o_image)
++				kfree(obj->o_image);
++			free_cpt_object(obj, ctx);
++		}
++	}
++	if (ctx->objcount != 0)
++		eprintk_ctx("BUG: ctx->objcount=%d\n", ctx->objcount);
++	return 0;
++}
++
++cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, type) {
++		if (obj->o_obj == p)
++			return obj;
++	}
++	return NULL;
++}
++
++cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, type) {
++		if (obj->o_pos == pos)
++			return obj;
++	}
++	return NULL;
++}
++
++cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, type) {
++		if (obj->o_index == index)
++			return obj;
++	}
++	return NULL;
++}
+diff --git a/kernel/cpt/cpt_obj.h b/kernel/cpt/cpt_obj.h
+new file mode 100644
+index 0000000..7762623
+--- /dev/null
++++ b/kernel/cpt/cpt_obj.h
+@@ -0,0 +1,62 @@
++#ifndef __CPT_OBJ_H_
++#define __CPT_OBJ_H_ 1
++
++#include
++#include
++
++typedef struct _cpt_object
++{
++	struct list_head o_list;
++	struct list_head o_hash;
++	int o_count;
++	int o_index;
++	int o_lock;
++	loff_t o_pos;
++	loff_t o_ppos;
++	void 
*o_obj; ++ void *o_image; ++ void *o_parent; ++ struct list_head o_alist; ++} cpt_object_t; ++ ++struct cpt_context; ++ ++#define for_each_object(obj, type) list_for_each_entry(obj, &ctx->object_array[type], o_list) ++ ++ ++extern cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx); ++extern void free_cpt_object(cpt_object_t *obj, struct cpt_context *ctx); ++ ++cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx); ++cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx); ++cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx); ++ ++static inline void cpt_obj_setpos(cpt_object_t *cpt, loff_t pos, struct cpt_context *ctx) ++{ ++ cpt->o_pos = pos; ++ /* Add to pos hash table */ ++} ++ ++static inline void cpt_obj_setobj(cpt_object_t *cpt, void *ptr, struct cpt_context *ctx) ++{ ++ cpt->o_obj = ptr; ++ /* Add to hash table */ ++} ++ ++static inline void cpt_obj_setindex(cpt_object_t *cpt, __u32 index, struct cpt_context *ctx) ++{ ++ cpt->o_index = index; ++ /* Add to index hash table */ ++} ++ ++ ++extern void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, struct cpt_context *ctx); ++extern void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_object_t *head, struct cpt_context *ctx); ++extern cpt_object_t *cpt_object_add(enum _cpt_object_type type, void *p, struct cpt_context *ctx); ++extern cpt_object_t *__cpt_object_add(enum _cpt_object_type type, void *p, unsigned int gfp_mask, struct cpt_context *ctx); ++extern cpt_object_t *cpt_object_get(enum _cpt_object_type type, void *p, struct cpt_context *ctx); ++ ++extern int cpt_object_init(struct cpt_context *ctx); ++extern int cpt_object_destroy(struct cpt_context *ctx); ++ ++#endif /* __CPT_OBJ_H_ */ +diff --git a/kernel/cpt/cpt_proc.c b/kernel/cpt/cpt_proc.c +new file mode 100644 +index 0000000..08d5fd4 +--- /dev/null ++++ b/kernel/cpt/cpt_proc.c +@@ -0,0 +1,595 @@ ++/* ++ * ++ * kernel/cpt/cpt_proc.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_dump.h" ++#include "cpt_mm.h" ++#include "cpt_kernel.h" ++ ++MODULE_AUTHOR("Alexey Kuznetsov "); ++MODULE_LICENSE("GPL"); ++ ++/* List of contexts and lock protecting the list */ ++static struct list_head cpt_context_list; ++static spinlock_t cpt_context_lock; ++ ++static int proc_read(char *buffer, char **start, off_t offset, ++ int length, int *eof, void *data) ++{ ++ off_t pos = 0; ++ off_t begin = 0; ++ int len = 0; ++ cpt_context_t *ctx; ++ ++ len += sprintf(buffer, "Ctx Id VE State\n"); ++ ++ spin_lock(&cpt_context_lock); ++ ++ list_for_each_entry(ctx, &cpt_context_list, ctx_list) { ++ len += sprintf(buffer+len,"%p %08x %-8u %d", ++ ctx, ++ ctx->contextid, ++ ctx->ve_id, ++ ctx->ctx_state ++ ); ++ ++ buffer[len++] = '\n'; ++ ++ pos = begin+len; ++ if (pos < offset) { ++ len = 0; ++ begin = pos; ++ } ++ if (pos > offset+length) ++ goto done; ++ } ++ *eof = 1; ++ ++done: ++ spin_unlock(&cpt_context_lock); ++ *start = buffer + (offset - begin); ++ len -= (offset - begin); ++ if(len > length) ++ len = length; ++ if(len < 0) ++ len = 0; ++ return len; ++} ++ ++void cpt_context_release(cpt_context_t *ctx) ++{ ++ list_del(&ctx->ctx_list); ++ spin_unlock(&cpt_context_lock); ++ ++ if (ctx->ctx_state > 0) ++ cpt_resume(ctx); ++ ctx->ctx_state = CPT_CTX_ERROR; ++ ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ if (ctx->pgin_task) ++ put_task_struct(ctx->pgin_task); ++ if (ctx->pgin_dir) ++ cpt_free_pgin_dir(ctx); ++ if (ctx->pagein_file_out) ++ fput(ctx->pagein_file_out); ++ if (ctx->pagein_file_in) ++ fput(ctx->pagein_file_in); ++#endif ++ if (ctx->objcount) ++ eprintk_ctx("%d objects leaked\n", ctx->objcount); ++ if (ctx->file) ++ fput(ctx->file); ++ cpt_flush_error(ctx); ++ if (ctx->errorfile) { ++ fput(ctx->errorfile); ++ ctx->errorfile = NULL; ++ } ++ if (ctx->error_msg) { ++ free_page((unsigned long)ctx->error_msg); ++ ctx->error_msg = NULL; ++ } ++ if (ctx->statusfile) ++ fput(ctx->statusfile); ++ if (ctx->lockfile) ++ fput(ctx->lockfile); ++ kfree(ctx); ++ ++ spin_lock(&cpt_context_lock); ++} ++ ++static void __cpt_context_put(cpt_context_t *ctx) ++{ ++ if (!--ctx->refcount) ++ cpt_context_release(ctx); ++} ++ ++static void cpt_context_put(cpt_context_t *ctx) ++{ ++ spin_lock(&cpt_context_lock); ++ __cpt_context_put(ctx); ++ spin_unlock(&cpt_context_lock); ++} ++ ++cpt_context_t * cpt_context_open(void) ++{ ++ cpt_context_t *ctx; ++ ++ if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) { ++ cpt_context_init(ctx); ++ spin_lock(&cpt_context_lock); ++ list_add_tail(&ctx->ctx_list, &cpt_context_list); ++ spin_unlock(&cpt_context_lock); ++ ctx->error_msg = (char*)__get_free_page(GFP_KERNEL); ++ if (ctx->error_msg != NULL) ++ ctx->error_msg[0] = 0; ++ } ++ return ctx; ++} ++ ++static cpt_context_t * cpt_context_lookup(unsigned int contextid) ++{ ++ cpt_context_t *ctx; ++ ++ spin_lock(&cpt_context_lock); ++ list_for_each_entry(ctx, &cpt_context_list, ctx_list) { ++ if (ctx->contextid == contextid) { ++ ctx->refcount++; ++ spin_unlock(&cpt_context_lock); ++ return ctx; ++ } ++ } ++ spin_unlock(&cpt_context_lock); ++ return NULL; ++} ++ ++int cpt_context_lookup_veid(unsigned int veid) ++{ ++ cpt_context_t *ctx; ++ ++ spin_lock(&cpt_context_lock); ++ list_for_each_entry(ctx, &cpt_context_list, ctx_list) { ++ if (ctx->ve_id == veid && ctx->ctx_state > 0) { ++ 
spin_unlock(&cpt_context_lock); ++ return 1; ++ } ++ } ++ spin_unlock(&cpt_context_lock); ++ return 0; ++} ++ ++static int cpt_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg) ++{ ++ int err = 0; ++ cpt_context_t *ctx; ++ struct file *dfile = NULL; ++ int try; ++ ++ unlock_kernel(); ++ ++ if (cmd == CPT_VMPREP) { ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ err = cpt_mm_prepare(arg); ++#else ++ err = -EINVAL; ++#endif ++ goto out_lock; ++ } ++ ++ if (cmd == CPT_TEST_CAPS) { ++ unsigned int src_flags, dst_flags = arg; ++ ++ err = 0; ++ src_flags = test_cpu_caps(); ++ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err); ++ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err); ++ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err); ++ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err); ++ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err); ++ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err); ++ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err); ++ test_one_flag_old(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err); ++ goto out_lock; ++ } ++ ++ if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) { ++ cpt_context_t *old_ctx; ++ ++ ctx = NULL; ++ if (cmd == CPT_JOIN_CONTEXT) { ++ err = -ENOENT; ++ ctx = cpt_context_lookup(arg); ++ if (!ctx) ++ goto out_lock; ++ } ++ ++ spin_lock(&cpt_context_lock); ++ old_ctx = (cpt_context_t*)file->private_data; ++ file->private_data = ctx; ++ ++ if (old_ctx) { ++ if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) { ++ old_ctx->sticky = 0; ++ old_ctx->refcount--; ++ } ++ __cpt_context_put(old_ctx); ++ } ++ spin_unlock(&cpt_context_lock); ++ err = 0; ++ goto out_lock; ++ } ++ ++ spin_lock(&cpt_context_lock); ++ ctx = (cpt_context_t*)file->private_data; ++ if (ctx) ++ ctx->refcount++; ++ spin_unlock(&cpt_context_lock); ++ ++ if (!ctx) { ++ cpt_context_t *old_ctx; ++ ++ err = -ENOMEM; ++ ctx = cpt_context_open(); ++ if (!ctx) ++ goto out_lock; ++ ++ spin_lock(&cpt_context_lock); ++ old_ctx = (cpt_context_t*)file->private_data; ++ if (!old_ctx) { ++ ctx->refcount++; ++ file->private_data = ctx; ++ } else { ++ old_ctx->refcount++; ++ } ++ if (old_ctx) { ++ __cpt_context_put(ctx); ++ ctx = old_ctx; ++ } ++ spin_unlock(&cpt_context_lock); ++ } ++ ++ if (cmd == CPT_GET_CONTEXT) { ++ unsigned int contextid = (unsigned int)arg; ++ ++ if (ctx->contextid && ctx->contextid != contextid) { ++ err = -EINVAL; ++ goto out_nosem; ++ } ++ if (!ctx->contextid) { ++ cpt_context_t *c1 = cpt_context_lookup(contextid); ++ if (c1) { ++ cpt_context_put(c1); ++ err = -EEXIST; ++ goto out_nosem; ++ } ++ ctx->contextid = contextid; ++ } ++ spin_lock(&cpt_context_lock); ++ if (!ctx->sticky) { ++ ctx->sticky = 1; ++ ctx->refcount++; ++ } ++ spin_unlock(&cpt_context_lock); ++ goto out_nosem; ++ } ++ ++ down(&ctx->main_sem); ++ ++ err = -EBUSY; ++ if (ctx->ctx_state < 0) ++ goto out; ++ ++ err = 0; ++ switch (cmd) { ++ case CPT_SET_DUMPFD: ++ if (ctx->ctx_state == CPT_CTX_DUMPING) { ++ err = -EBUSY; ++ break; ++ } ++ if (arg >= 0) { ++ err = -EBADF; ++ dfile = fget(arg); ++ if (dfile == NULL) ++ break; ++ if (dfile->f_op == NULL || ++ dfile->f_op->write == NULL) { ++ fput(dfile); ++ break; ++ } ++ err = 0; ++ } ++ if (ctx->file) ++ fput(ctx->file); ++ ctx->file = dfile; ++ break; ++ case CPT_SET_ERRORFD: ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (dfile == NULL) { ++ err = -EBADF; ++ break; ++ } ++ } ++ if 
(ctx->errorfile) ++ fput(ctx->errorfile); ++ ctx->errorfile = dfile; ++ break; ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ case CPT_SET_PAGEINFDIN: ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (dfile == NULL) { ++ err = -EBADF; ++ break; ++ } ++ } ++ if (ctx->pagein_file_in) ++ fput(ctx->pagein_file_in); ++ ctx->pagein_file_in = dfile; ++ break; ++ case CPT_SET_PAGEINFDOUT: ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (dfile == NULL) { ++ err = -EBADF; ++ break; ++ } ++ } ++ if (ctx->pagein_file_out) ++ fput(ctx->pagein_file_out); ++ ctx->pagein_file_out = dfile; ++ break; ++ case CPT_SET_LAZY: ++ ctx->lazy_vm = arg; ++ break; ++ case CPT_ITER: ++ err = cpt_iteration(ctx); ++ break; ++ case CPT_PAGEIND: ++ err = cpt_start_pagein(ctx); ++ break; ++#endif ++ case CPT_SET_VEID: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ ctx->ve_id = arg; ++ break; ++ case CPT_SET_CPU_FLAGS: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ ctx->dst_cpu_flags = arg; ++ ctx->src_cpu_flags = test_cpu_caps(); ++ break; ++ case CPT_SUSPEND: ++ if (cpt_context_lookup_veid(ctx->ve_id) || ++ ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ ctx->ctx_state = CPT_CTX_SUSPENDING; ++ try = 0; ++ do { ++ err = cpt_vps_suspend(ctx); ++ if (err) ++ cpt_resume(ctx); ++ if (err == -EAGAIN) ++ msleep(1000); ++ try++; ++ } while (err == -EAGAIN && try < 3); ++ if (err) { ++ ctx->ctx_state = CPT_CTX_IDLE; ++ } else { ++ ctx->ctx_state = CPT_CTX_SUSPENDED; ++ } ++ break; ++ case CPT_DUMP: ++ if (!ctx->ctx_state) { ++ err = -ENOENT; ++ break; ++ } ++ if (!ctx->file) { ++ err = -EBADF; ++ break; ++ } ++ err = cpt_dump(ctx); ++ break; ++ case CPT_RESUME: ++ if (ctx->ctx_state == CPT_CTX_IDLE) { ++ err = -ENOENT; ++ break; ++ } ++ err = cpt_resume(ctx); ++ if (!err) ++ ctx->ctx_state = CPT_CTX_IDLE; ++ break; ++ case CPT_KILL: ++ if (ctx->ctx_state == CPT_CTX_IDLE) { ++ err = -ENOENT; ++ break; ++ } ++ err = cpt_kill(ctx); ++ if (!err) ++ ctx->ctx_state = CPT_CTX_IDLE; ++ break; ++ case CPT_TEST_VECAPS: ++ { ++ __u32 dst_flags = arg; ++ __u32 src_flags; ++ ++ err = cpt_vps_caps(ctx, &src_flags); ++ if (err) ++ break; ++ ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_EMT64, "emt64", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_IA64, "ia64", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SYSCALL, "syscall", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SYSCALL32, "syscall32", err); ++ if (src_flags & CPT_UNSUPPORTED_MASK) ++ err = 2; ++ break; ++ } ++ default: ++ err = -EINVAL; ++ break; ++ } ++ ++out: ++ cpt_flush_error(ctx); ++ up(&ctx->main_sem); ++out_nosem: ++ cpt_context_put(ctx); ++out_lock: ++ lock_kernel(); ++ if (err == -ERESTARTSYS || err == -ERESTARTNOINTR || ++ err == -ERESTARTNOHAND || err == -ERESTART_RESTARTBLOCK) ++ err = -EINTR; ++ return err; ++} ++ ++static int cpt_open(struct inode *inode, struct file *file) ++{ ++ if (!try_module_get(THIS_MODULE)) ++ return -EBUSY; ++ ++ 
return 0; ++} ++ ++static int cpt_release(struct inode * inode, struct file * file) ++{ ++ cpt_context_t *ctx; ++ ++ spin_lock(&cpt_context_lock); ++ ctx = (cpt_context_t*)file->private_data; ++ file->private_data = NULL; ++ ++ if (ctx) ++ __cpt_context_put(ctx); ++ spin_unlock(&cpt_context_lock); ++ ++ module_put(THIS_MODULE); ++ return 0; ++} ++ ++ ++static struct file_operations cpt_fops = { ++ .owner = THIS_MODULE, ++ .open = cpt_open, ++ .release = cpt_release, ++ .ioctl = cpt_ioctl, ++}; ++ ++static struct proc_dir_entry *proc_ent; ++ ++static struct ctl_table_header *ctl_header; ++ ++static ctl_table debug_table[] = { ++ { ++ .procname = "cpt", ++ .data = &debug_level, ++ .maxlen = sizeof(debug_level), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { .ctl_name = 0 } ++}; ++static ctl_table root_table[] = { ++ { ++ .ctl_name = CTL_DEBUG, ++ .procname = "debug", ++ .mode = 0555, ++ .child = debug_table, ++ }, ++ { .ctl_name = 0 } ++}; ++ ++static int __init init_cpt(void) ++{ ++ int err; ++ ++ err = -ENOMEM; ++ ctl_header = register_sysctl_table(root_table); ++ if (!ctl_header) ++ goto err_mon; ++ ++ spin_lock_init(&cpt_context_lock); ++ INIT_LIST_HEAD(&cpt_context_list); ++ ++ err = -EINVAL; ++ proc_ent = proc_create("cpt", 0600, NULL, NULL); ++ if (!proc_ent) ++ goto err_out; ++ ++ cpt_fops.read = proc_ent->proc_fops->read; ++ cpt_fops.write = proc_ent->proc_fops->write; ++ cpt_fops.llseek = proc_ent->proc_fops->llseek; ++ proc_ent->proc_fops = &cpt_fops; ++ ++ proc_ent->read_proc = proc_read; ++ proc_ent->data = NULL; ++ proc_ent->owner = THIS_MODULE; ++ return 0; ++ ++err_out: ++ unregister_sysctl_table(ctl_header); ++err_mon: ++ return err; ++} ++module_init(init_cpt); ++ ++static void __exit exit_cpt(void) ++{ ++ remove_proc_entry("cpt", NULL); ++ unregister_sysctl_table(ctl_header); ++ ++ spin_lock(&cpt_context_lock); ++ while (!list_empty(&cpt_context_list)) { ++ cpt_context_t *ctx; ++ ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list); ++ ++ if (!ctx->sticky) ++ ctx->refcount++; ++ ctx->sticky = 0; ++ ++ BUG_ON(ctx->refcount != 1); ++ ++ __cpt_context_put(ctx); ++ } ++ spin_unlock(&cpt_context_lock); ++} ++module_exit(exit_cpt); +diff --git a/kernel/cpt/cpt_process.c b/kernel/cpt/cpt_process.c +new file mode 100644 +index 0000000..4ceb351 +--- /dev/null ++++ b/kernel/cpt/cpt_process.c +@@ -0,0 +1,1366 @@ ++/* ++ * ++ * kernel/cpt/cpt_process.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ *
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_ubc.h"
++#include "cpt_process.h"
++#include "cpt_kernel.h"
++
++#ifdef CONFIG_X86_32
++#undef task_pt_regs
++#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.sp0) - 1)
++#endif
++
++int check_task_state(struct task_struct *tsk, struct cpt_context *ctx)
++{
++#ifdef CONFIG_X86_64
++	if (!(task_thread_info(tsk)->flags&_TIF_IA32)) {
++		if (task_pt_regs(tsk)->ip >= VSYSCALL_START &&
++		    task_pt_regs(tsk)->ip < VSYSCALL_END) {
++			eprintk_ctx(CPT_FID "cannot be checkpointed while in vsyscall, try later\n", CPT_TID(tsk));
++			return -EAGAIN;
++		}
++	}
++#endif
++	return 0;
++}
++
++#ifdef CONFIG_X86
++
++static u32 encode_segment(u32 segreg)
++{
++	segreg &= 0xFFFF;
++
++	if (segreg == 0)
++		return CPT_SEG_ZERO;
++	if ((segreg & 3) != 3) {
++		wprintk("Invalid RPL of a segment reg %x\n", segreg);
++		return CPT_SEG_ZERO;
++	}
++
++	/* LDT descriptor, it is just an index to LDT array */
++	if (segreg & 4)
++		return CPT_SEG_LDT + (segreg >> 3);
++
++	/* TLS descriptor. */
++	if ((segreg >> 3) >= GDT_ENTRY_TLS_MIN &&
++	    (segreg >> 3) <= GDT_ENTRY_TLS_MAX)
++		return CPT_SEG_TLS1 + ((segreg>>3) - GDT_ENTRY_TLS_MIN);
++
++	/* One of standard descriptors */
++#ifdef CONFIG_X86_64
++	if (segreg == __USER32_DS)
++		return CPT_SEG_USER32_DS;
++	if (segreg == __USER32_CS)
++		return CPT_SEG_USER32_CS;
++	if (segreg == __USER_DS)
++		return CPT_SEG_USER64_DS;
++	if (segreg == __USER_CS)
++		return CPT_SEG_USER64_CS;
++#else
++	if (segreg == __USER_DS)
++		return CPT_SEG_USER32_DS;
++	if (segreg == __USER_CS)
++		return CPT_SEG_USER32_CS;
++#endif
++	wprintk("Invalid segment reg %x\n", segreg);
++	return CPT_SEG_ZERO;
++}
++
++#ifdef CONFIG_X86_64
++static void xlate_ptregs_64_to_32(struct cpt_x86_regs *d, struct pt_regs *s,
++		struct task_struct *tsk)
++{
++	d->cpt_ebp = s->bp;
++	d->cpt_ebx = s->bx;
++	d->cpt_eax = s->ax;
++	d->cpt_ecx = s->cx;
++	d->cpt_edx = s->dx;
++	d->cpt_esi = s->si;
++	d->cpt_edi = s->di;
++	d->cpt_orig_eax = s->orig_ax;
++	d->cpt_eip = s->ip;
++	d->cpt_xcs = encode_segment(s->cs);
++	d->cpt_eflags = s->flags;
++	d->cpt_esp = s->sp;
++	d->cpt_xss = encode_segment(s->ss);
++	d->cpt_xds = encode_segment(tsk->thread.ds);
++	d->cpt_xes = encode_segment(tsk->thread.es);
++}
++
++static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx)
++{
++	cpt_open_object(NULL, ctx);
++
++	if (task_thread_info(tsk)->flags & _TIF_IA32) {
++		struct cpt_x86_regs ri;
++		ri.cpt_next = sizeof(ri);
++		ri.cpt_object = CPT_OBJ_X86_REGS;
++		ri.cpt_hdrlen = sizeof(ri);
++		ri.cpt_content = CPT_CONTENT_VOID;
++
++		ri.cpt_debugreg[0] = tsk->thread.debugreg0;
++		ri.cpt_debugreg[1] = tsk->thread.debugreg1;
++		ri.cpt_debugreg[2] = tsk->thread.debugreg2;
++		ri.cpt_debugreg[3] = tsk->thread.debugreg3;
++		ri.cpt_debugreg[4] = 0;
++		ri.cpt_debugreg[5] = 0;
++		ri.cpt_debugreg[6] = tsk->thread.debugreg6;
++		ri.cpt_debugreg[7] = tsk->thread.debugreg7;
++		ri.cpt_fs = encode_segment(tsk->thread.fsindex);
++		ri.cpt_gs = encode_segment(tsk->thread.gsindex);
++
++		xlate_ptregs_64_to_32(&ri, task_pt_regs(tsk), tsk);
++
++		ctx->write(&ri, sizeof(ri), ctx);
++	} else {
++		struct cpt_x86_64_regs ri;
++		ri.cpt_next = sizeof(ri);
++		ri.cpt_object = CPT_OBJ_X86_64_REGS;
++		ri.cpt_hdrlen = sizeof(ri);
++		ri.cpt_content = CPT_CONTENT_VOID;
++
++		ri.cpt_fsbase = tsk->thread.fs;
++		ri.cpt_gsbase 
= tsk->thread.gs; ++ ri.cpt_fsindex = encode_segment(tsk->thread.fsindex); ++ ri.cpt_gsindex = encode_segment(tsk->thread.gsindex); ++ ri.cpt_ds = encode_segment(tsk->thread.ds); ++ ri.cpt_es = encode_segment(tsk->thread.es); ++ ri.cpt_debugreg[0] = tsk->thread.debugreg0; ++ ri.cpt_debugreg[1] = tsk->thread.debugreg1; ++ ri.cpt_debugreg[2] = tsk->thread.debugreg2; ++ ri.cpt_debugreg[3] = tsk->thread.debugreg3; ++ ri.cpt_debugreg[4] = 0; ++ ri.cpt_debugreg[5] = 0; ++ ri.cpt_debugreg[6] = tsk->thread.debugreg6; ++ ri.cpt_debugreg[7] = tsk->thread.debugreg7; ++ ++ memcpy(&ri.cpt_r15, task_pt_regs(tsk), sizeof(struct pt_regs)); ++ ++ ri.cpt_cs = encode_segment(task_pt_regs(tsk)->cs); ++ ri.cpt_ss = encode_segment(task_pt_regs(tsk)->ss); ++ ++ ctx->write(&ri, sizeof(ri), ctx); ++ ++ } ++ cpt_close_object(ctx); ++ ++ return 0; ++} ++ ++#else ++ ++static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx) ++{ ++ struct cpt_x86_regs ri; ++ struct pt_regs *pt_regs; ++ ++ cpt_open_object(NULL, ctx); ++ ++ ri.cpt_next = sizeof(ri); ++ ri.cpt_object = CPT_OBJ_X86_REGS; ++ ri.cpt_hdrlen = sizeof(ri); ++ ri.cpt_content = CPT_CONTENT_VOID; ++ ++ ri.cpt_debugreg[0] = tsk->thread.debugreg0; ++ ri.cpt_debugreg[1] = tsk->thread.debugreg1; ++ ri.cpt_debugreg[2] = tsk->thread.debugreg2; ++ ri.cpt_debugreg[3] = tsk->thread.debugreg3; ++ ri.cpt_debugreg[6] = tsk->thread.debugreg6; ++ ri.cpt_debugreg[7] = tsk->thread.debugreg7; ++ ++ pt_regs = task_pt_regs(tsk); ++ ++ ri.cpt_fs = encode_segment(pt_regs->fs); ++ ri.cpt_gs = encode_segment(tsk->thread.gs); ++ ++ ri.cpt_ebx = pt_regs->bx; ++ ri.cpt_ecx = pt_regs->cx; ++ ri.cpt_edx = pt_regs->dx; ++ ri.cpt_esi = pt_regs->si; ++ ri.cpt_edi = pt_regs->di; ++ ri.cpt_ebp = pt_regs->bp; ++ ri.cpt_eax = pt_regs->ax; ++ ri.cpt_xds = pt_regs->ds; ++ ri.cpt_xes = pt_regs->es; ++ ri.cpt_orig_eax = pt_regs->orig_ax; ++ ri.cpt_eip = pt_regs->ip; ++ ri.cpt_xcs = pt_regs->cs; ++ ri.cpt_eflags = pt_regs->flags; ++ ri.cpt_esp = pt_regs->sp; ++ ri.cpt_xss = pt_regs->ss; ++ ++ ri.cpt_xcs = encode_segment(pt_regs->cs); ++ ri.cpt_xss = encode_segment(pt_regs->ss); ++ ri.cpt_xds = encode_segment(pt_regs->ds); ++ ri.cpt_xes = encode_segment(pt_regs->es); ++ ++ ctx->write(&ri, sizeof(ri), ctx); ++ cpt_close_object(ctx); ++ ++ return 0; ++} ++#endif ++#endif ++ ++#ifdef CONFIG_IA64 ++ ++/* ++ PMD? ++ */ ++ ++#define _C(x) do { if ((err = (x)) < 0) { printk("atm:" CPT_FID #x " %d\n", \ ++ CPT_TID(tsk), err); return -EINVAL; } } while (0) ++ ++static int ass_to_mouth(struct cpt_ia64_regs *r, struct task_struct *tsk, ++ struct cpt_context *ctx) ++{ ++ int err; ++ struct unw_frame_info info; ++ struct ia64_fpreg fpval; ++ int i; ++ ++ unw_init_from_blocked_task(&info, tsk); ++ _C(unw_unwind_to_user(&info)); ++ ++ /* NAT_BITS */ ++ do { ++ unsigned long scratch_unat; ++ ++ scratch_unat = info.sw->caller_unat; ++ if (info.pri_unat_loc) ++ scratch_unat = *info.pri_unat_loc; ++ ++ r->nat[0] = ia64_get_scratch_nat_bits(task_pt_regs(tsk), scratch_unat); ++ /* Just to be on safe side. 
*/ ++ r->nat[0] &= 0xFFFFFFFFUL; ++ } while (0); ++ ++ /* R4-R7 */ ++ for (i = 4; i <= 7; i++) { ++ char nat = 0; ++ _C(unw_access_gr(&info, i, &r->gr[i], &nat, 0)); ++ r->nat[0] |= (nat != 0) << i; ++ } ++ ++ /* B1-B5 */ ++ for (i = 1; i <= 5; i++) { ++ _C(unw_access_br(&info, i, &r->br[i], 0)); ++ } ++ ++ /* AR_EC, AR_LC */ ++ _C(unw_access_ar(&info, UNW_AR_EC, &r->ar_ec, 0)); ++ _C(unw_access_ar(&info, UNW_AR_LC, &r->ar_lc, 0)); ++ ++ /* F2..F5, F16..F31 */ ++ for (i = 2; i <= 5; i++) { ++ _C(unw_get_fr(&info, i, &fpval)); ++ memcpy(&r->fr[i*2], &fpval, 16); ++ } ++ for (i = 16; i <= 31; i++) { ++ _C(unw_get_fr(&info, i, &fpval)); ++ memcpy(&r->fr[i*2], &fpval, 16); ++ } ++ return 0; ++} ++ ++#undef _C ++ ++static int dump_registers(struct task_struct *tsk, struct cpt_context *ctx) ++{ ++ int err; ++ unsigned long pg; ++ struct cpt_ia64_regs *r; ++ struct ia64_psr *psr; ++ struct switch_stack *sw; ++ struct pt_regs *pt; ++ void *krbs = (void *)tsk + IA64_RBS_OFFSET; ++ unsigned long reg; ++ ++ if (tsk->exit_state) ++ return 0; ++ ++ pt = task_pt_regs(tsk); ++ ++ sw = (struct switch_stack *) (tsk->thread.ksp + 16); ++ ++ if ((pg = __get_free_page(GFP_KERNEL)) == 0) ++ return -ENOMEM; ++ ++ r = (void*)pg; ++ /* To catch if we forgot some register */ ++ memset(r, 0xA5, sizeof(*r)); ++ ++ r->gr[0] = 0; ++ r->fr[0] = r->fr[1] = 0; ++ r->fr[2] = 0x8000000000000000UL; ++ r->fr[3] = 0xffff; ++ ++ r->nat[0] = r->nat[1] = 0; ++ ++ err = ass_to_mouth(r, tsk, ctx); ++ if (err) { ++ printk("ass_to_mouth error %d\n", err); ++ goto out; ++ } ++ ++ /* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */ ++ memcpy(&r->gr[1], &pt->r1, 8*(2-1)); ++ memcpy(&r->gr[2], &pt->r2, 8*(4-2)); ++ memcpy(&r->gr[8], &pt->r8, 8*(12-8)); ++ memcpy(&r->gr[12], &pt->r12, 8*(14-12)); ++ memcpy(&r->gr[14], &pt->r14, 8*(15-14)); ++ memcpy(&r->gr[15], &pt->r15, 8*(16-15)); ++ memcpy(&r->gr[16], &pt->r16, 8*(32-16)); ++ ++ r->br[0] = pt->b0; ++ r->br[6] = pt->b6; ++ r->br[7] = pt->b7; ++ ++ r->ar_bspstore = pt->ar_bspstore; ++ r->ar_unat = pt->ar_unat; ++ r->ar_pfs = pt->ar_pfs; ++ r->ar_ccv = pt->ar_ccv; ++ r->ar_fpsr = pt->ar_fpsr; ++ r->ar_csd = pt->ar_csd; ++ r->ar_ssd = pt->ar_ssd; ++ r->ar_rsc = pt->ar_rsc; ++ ++ r->cr_iip = pt->cr_iip; ++ r->cr_ipsr = pt->cr_ipsr; ++ ++ r->pr = pt->pr; ++ ++ r->cfm = pt->cr_ifs; ++ r->ar_rnat = pt->ar_rnat; ++ ++ /* fpregs 6..9,10..11 are in pt_regs */ ++ memcpy(&r->fr[2*6], &pt->f6, 16*(10-6)); ++ memcpy(&r->fr[2*10], &pt->f10, 16*(12-10)); ++ /* fpreg 12..15 are on switch stack */ ++ memcpy(&r->fr[2*12], &sw->f12, 16*(16-12)); ++ /* fpregs 32...127 */ ++ psr = ia64_psr(task_pt_regs(tsk)); ++ preempt_disable(); ++ if (ia64_is_local_fpu_owner(tsk) && psr->mfh) { ++ psr->mfh = 0; ++ tsk->thread.flags |= IA64_THREAD_FPH_VALID; ++ ia64_save_fpu(&tsk->thread.fph[0]); ++ } ++ preempt_enable(); ++ memcpy(&r->fr[32*2], tsk->thread.fph, 16*(128-32)); ++ ++ if (tsk->thread.flags & IA64_THREAD_DBG_VALID) { ++ memcpy(r->ibr, tsk->thread.ibr, sizeof(r->ibr)); ++ memcpy(r->dbr, tsk->thread.dbr, sizeof(r->ibr)); ++ } else { ++ memset(r->ibr, 0, sizeof(r->ibr)); ++ memset(r->dbr, 0, sizeof(r->dbr)); ++ } ++ ++ r->loadrs = pt->loadrs; ++ r->num_regs = ia64_rse_num_regs(krbs, krbs + 8*(pt->loadrs >> 19)); ++ if ((long)pt->cr_ifs > 0) ++ r->num_regs += (pt->cr_ifs & 0x7f); ++ ++ if (r->num_regs > 96) { ++ eprintk_ctx(CPT_FID " too much RSE regs %lu\n", ++ CPT_TID(tsk), r->num_regs); ++ return -EINVAL; ++ } ++ ++ for (reg = 0; reg < r->num_regs; reg++) { ++ unsigned long *ptr = ia64_rse_skip_regs(krbs, 
reg);
++		unsigned long *rnatp = ia64_rse_rnat_addr(ptr);
++
++		r->gr[32+reg] = *ptr;
++
++		if ((unsigned long)rnatp >= sw->ar_bspstore)
++			rnatp = &sw->ar_rnat;
++		if (*rnatp & (1UL<<ia64_rse_slot_num(ptr))) {
++			if (reg < 32)
++				r->nat[0] |= (1UL<<(reg+32));
++			else
++				r->nat[1] |= (1UL<<(reg-32));
++		}
++	}
++	if (r->nat[0] | r->nat[1])
++		wprintk_ctx(CPT_FID " nat bits %lx%016lx\n", CPT_TID(tsk),
++				r->nat[1], r->nat[0]);
++
++	cpt_open_object(NULL, ctx);
++	r->cpt_next = sizeof(*r);
++	r->cpt_object = CPT_OBJ_IA64_REGS;
++	r->cpt_hdrlen = sizeof(*r);
++	r->cpt_content = CPT_CONTENT_VOID;
++	ctx->write(r, sizeof(*r), ctx);
++	cpt_close_object(ctx);
++	err = 0;
++
++out:
++	free_page(pg);
++	return err;
++}
++#endif
++
++static int dump_kstack(struct task_struct *tsk, struct cpt_context *ctx)
++{
++	struct cpt_obj_bits hdr;
++	unsigned long size;
++	void *start;
++
++	cpt_open_object(NULL, ctx);
++
++#ifdef CONFIG_X86_64
++	size = tsk->thread.sp0 - tsk->thread.sp;
++	start = (void*)tsk->thread.sp;
++#elif defined(CONFIG_X86_32)
++	size = tsk->thread.sp0 - tsk->thread.sp;
++	start = (void*)tsk->thread.sp;
++#elif defined(CONFIG_IA64)
++	size = (unsigned long)(task_pt_regs(tsk)+1) - tsk->thread.ksp;
++	start = (void*)tsk->thread.ksp;
++#else
++#error Arch is not supported
++#endif
++
++	hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size);
++	hdr.cpt_object = CPT_OBJ_BITS;
++	hdr.cpt_hdrlen = sizeof(hdr);
++	hdr.cpt_content = CPT_CONTENT_STACK;
++	hdr.cpt_size = size;
++
++	ctx->write(&hdr, sizeof(hdr), ctx);
++	ctx->write(start, size, ctx);
++	ctx->align(ctx);
++	cpt_close_object(ctx);
++	return 0;
++}
++
++#ifdef CONFIG_X86
++/* Formats of i387_fxsave_struct are the same for x86_64
++ * and i386. Plain luck. */
++
++static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx)
++{
++	struct cpt_obj_bits hdr;
++	unsigned long size;
++	int type;
++
++	cpt_open_object(NULL, ctx);
++
++	type = CPT_CONTENT_X86_FPUSTATE;
++	size = sizeof(struct i387_fxsave_struct);
++#ifndef CONFIG_X86_64
++	if (!cpu_has_fxsr) {
++		size = sizeof(struct i387_fsave_struct);
++		type = CPT_CONTENT_X86_FPUSTATE_OLD;
++	}
++#endif
++
++	hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size);
++	hdr.cpt_object = CPT_OBJ_BITS;
++	hdr.cpt_hdrlen = sizeof(hdr);
++	hdr.cpt_content = type;
++	hdr.cpt_size = size;
++
++	ctx->write(&hdr, sizeof(hdr), ctx);
++	ctx->write(&tsk->thread.xstate, size, ctx);
++	ctx->align(ctx);
++	cpt_close_object(ctx);
++	return 0;
++}
++#endif
++
++#ifdef CONFIG_IA64
++
++static int dump_fpustate(struct task_struct *tsk, struct cpt_context *ctx)
++{
++	return 0;
++}
++#endif
++
++static int encode_siginfo(struct cpt_siginfo_image *si, siginfo_t *info)
++{
++	si->cpt_signo = info->si_signo;
++	si->cpt_errno = info->si_errno;
++	si->cpt_code = info->si_code;
++
++	switch(si->cpt_code & __SI_MASK) {
++	case __SI_TIMER:
++		si->cpt_pid = info->si_tid;
++		si->cpt_uid = info->si_overrun;
++		si->cpt_sigval = cpt_ptr_export(info->_sifields._timer._sigval.sival_ptr);
++		si->cpt_utime = info->si_sys_private;
++		break;
++	case __SI_POLL:
++		si->cpt_pid = info->si_band;
++		si->cpt_uid = info->si_fd;
++		break;
++	case __SI_FAULT:
++		si->cpt_sigval = cpt_ptr_export(info->si_addr);
++#ifdef __ARCH_SI_TRAPNO
++		si->cpt_pid = info->si_trapno;
++#endif
++		break;
++	case __SI_CHLD:
++		si->cpt_pid = info->si_pid;
++		si->cpt_uid = info->si_uid;
++		si->cpt_sigval = info->si_status;
++		si->cpt_stime = info->si_stime;
++		si->cpt_utime = info->si_utime;
++		break;
++	case __SI_KILL:
++	case __SI_RT:
++	case __SI_MESGQ:
++	default:
++		si->cpt_pid = info->si_pid;
++		si->cpt_uid 
= info->si_uid; ++ si->cpt_sigval = cpt_ptr_export(info->si_ptr); ++ break; ++ } ++ return 0; ++} ++ ++static int dump_sigqueue(struct sigpending *list, struct cpt_context *ctx) ++{ ++ struct sigqueue *q; ++ loff_t saved_obj; ++ ++ if (list_empty(&list->list)) ++ return 0; ++ ++ cpt_push_object(&saved_obj, ctx); ++ list_for_each_entry(q, &list->list, list) { ++ struct cpt_siginfo_image si; ++ ++ si.cpt_next = sizeof(si); ++ si.cpt_object = CPT_OBJ_SIGINFO; ++ si.cpt_hdrlen = sizeof(si); ++ si.cpt_content = CPT_CONTENT_VOID; ++ ++ si.cpt_qflags = q->flags; ++ si.cpt_user = q->user->uid; ++ ++ if (encode_siginfo(&si, &q->info)) ++ return -EINVAL; ++ ++ ctx->write(&si, sizeof(si), ctx); ++ } ++ cpt_pop_object(&saved_obj, ctx); ++ return 0; ++} ++ ++ ++ ++static int dump_one_signal_struct(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct signal_struct *sig = obj->o_obj; ++ struct cpt_signal_image *v = cpt_get_buf(ctx); ++ struct task_struct *tsk; ++ int i; ++ ++ cpt_open_object(obj, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_SIGNAL_STRUCT; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ if (sig->__pgrp <= 0) { ++ eprintk_ctx("bad pgid\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ v->cpt_pgrp_type = CPT_PGRP_NORMAL; ++ read_lock(&tasklist_lock); ++ tsk = find_task_by_pid(sig->__pgrp); ++ if (tsk == NULL) ++ v->cpt_pgrp_type = CPT_PGRP_ORPHAN; ++ read_unlock(&tasklist_lock); ++ v->cpt_pgrp = pid_to_vpid(sig->__pgrp); ++ ++ v->cpt_old_pgrp = 0; ++/* if (!sig->tty_old_pgrp) { ++ eprintk_ctx("bad tty_old_pgrp\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ }*/ ++ if (sig->tty_old_pgrp) { ++ v->cpt_old_pgrp_type = CPT_PGRP_NORMAL; ++ read_lock(&tasklist_lock); ++ tsk = pid_task(sig->tty_old_pgrp, PIDTYPE_PID); ++ if (tsk == NULL) { ++ v->cpt_old_pgrp_type = CPT_PGRP_ORPHAN; ++ tsk = pid_task(sig->tty_old_pgrp, PIDTYPE_PGID); ++ } ++ read_unlock(&tasklist_lock); ++ if (tsk == NULL) { ++ eprintk_ctx("tty_old_pgrp does not exist anymore\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ v->cpt_old_pgrp = pid_vnr(sig->tty_old_pgrp); ++ if ((int)v->cpt_old_pgrp < 0) { ++ dprintk_ctx("stray tty_old_pgrp %d\n", pid_nr(sig->tty_old_pgrp)); ++ v->cpt_old_pgrp = -1; ++ v->cpt_old_pgrp_type = CPT_PGRP_STRAY; ++ } ++ } ++ ++ if (sig->__session <= 0) { ++ eprintk_ctx("bad session\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ v->cpt_session_type = CPT_PGRP_NORMAL; ++ read_lock(&tasklist_lock); ++ tsk = find_task_by_pid(sig->__session); ++ if (tsk == NULL) ++ v->cpt_session_type = CPT_PGRP_ORPHAN; ++ read_unlock(&tasklist_lock); ++ v->cpt_session = pid_to_vpid(sig->__session); ++ ++ v->cpt_leader = sig->leader; ++ v->cpt_ctty = CPT_NULL; ++ if (sig->tty) { ++ cpt_object_t *cobj = lookup_cpt_object(CPT_OBJ_TTY, sig->tty, ctx); ++ if (cobj) ++ v->cpt_ctty = cobj->o_pos; ++ else { ++ eprintk_ctx("controlling tty is not found\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ } ++ memcpy(&v->cpt_sigpending, &sig->shared_pending.signal, 8); ++ ++ v->cpt_curr_target = 0; ++ if (sig->curr_target) ++ v->cpt_curr_target = task_pid_vnr(sig->curr_target); ++ v->cpt_group_exit = ((sig->flags & SIGNAL_GROUP_EXIT) != 0); ++ v->cpt_group_exit_code = sig->group_exit_code; ++ v->cpt_group_exit_task = 0; ++ if (sig->group_exit_task) ++ v->cpt_group_exit_task = task_pid_vnr(sig->group_exit_task); ++ v->cpt_notify_count = sig->notify_count; ++ v->cpt_group_stop_count = sig->group_stop_count; ++ ++#if LINUX_VERSION_CODE > 
KERNEL_VERSION(2,6,8)
++	v->cpt_utime = sig->utime;
++	v->cpt_stime = sig->stime;
++	v->cpt_cutime = sig->cutime;
++	v->cpt_cstime = sig->cstime;
++	v->cpt_nvcsw = sig->nvcsw;
++	v->cpt_nivcsw = sig->nivcsw;
++	v->cpt_cnvcsw = sig->cnvcsw;
++	v->cpt_cnivcsw = sig->cnivcsw;
++	v->cpt_min_flt = sig->min_flt;
++	v->cpt_maj_flt = sig->maj_flt;
++	v->cpt_cmin_flt = sig->cmin_flt;
++	v->cpt_cmaj_flt = sig->cmaj_flt;
++
++	if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
++		__asm__("undefined\n");
++
++	for (i=0; i<CPT_RLIM_NLIMITS; i++) {
++		if (i < RLIM_NLIMITS) {
++			v->cpt_rlim_cur[i] = sig->rlim[i].rlim_cur;
++			v->cpt_rlim_max[i] = sig->rlim[i].rlim_max;
++		} else {
++			v->cpt_rlim_cur[i] = CPT_NULL;
++			v->cpt_rlim_max[i] = CPT_NULL;
++		}
++	}
++#endif
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++
++	dump_sigqueue(&sig->shared_pending, ctx);
++
++	cpt_close_object(ctx);
++	return 0;
++}
++
++int cpt_check_unsupported(struct task_struct *tsk, cpt_context_t *ctx)
++{
++	if (tsk->splice_pipe) {
++		eprintk_ctx("splice is used by " CPT_FID "\n", CPT_TID(tsk));
++		return -EBUSY;
++	}
++#ifdef CONFIG_KEYS
++	if (tsk->request_key_auth || tsk->thread_keyring) {
++		eprintk_ctx("keys are used by " CPT_FID "\n", CPT_TID(tsk));
++		return -EBUSY;
++	}
++#endif
++#ifdef CONFIG_NUMA
++	if (tsk->mempolicy) {
++		eprintk_ctx("NUMA mempolicy is used by " CPT_FID "\n", CPT_TID(tsk));
++		return -EBUSY;
++	}
++#endif
++#ifdef CONFIG_TUX
++	if (tsk->tux_info) {
++		eprintk_ctx("TUX is used by " CPT_FID "\n", CPT_TID(tsk));
++		return -EBUSY;
++	}
++#endif
++	return 0;
++}
++
++static int dump_one_process(cpt_object_t *obj, struct cpt_context *ctx)
++{
++	struct task_struct *tsk = obj->o_obj;
++	int last_thread;
++	struct cpt_task_image *v = cpt_get_buf(ctx);
++	cpt_object_t *tobj;
++	cpt_object_t *tg_obj;
++	loff_t saved_obj;
++	int i;
++	int err;
++	struct timespec delta;
++	struct mm_struct * tsk_mm;
++	struct files_struct * tsk_files;
++	struct fs_struct * tsk_fs;
++	struct mnt_namespace * tsk_ns;
++
++	cpt_open_object(obj, ctx);
++
++	v->cpt_signal = CPT_NULL;
++	tg_obj = lookup_cpt_object(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx);
++	if (!tg_obj) BUG();
++
++	v->cpt_next = CPT_NULL;
++	v->cpt_object = CPT_OBJ_TASK;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	v->cpt_state = tsk->state;
++	if (tsk->state == EXIT_ZOMBIE) {
++		eprintk_ctx("invalid zombie state on" CPT_FID "\n", CPT_TID(tsk));
++		cpt_release_buf(ctx);
++		return -EINVAL;
++	} else if (tsk->state == EXIT_DEAD) {
++		if (tsk->exit_state != EXIT_DEAD &&
++		    tsk->exit_state != EXIT_ZOMBIE) {
++			eprintk_ctx("invalid exit_state %d on" CPT_FID "\n", tsk->exit_state, CPT_TID(tsk));
++			cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++	}
++	if (tsk->exit_state) {
++		v->cpt_state = tsk->exit_state;
++		if (tsk->state != EXIT_DEAD) {
++			eprintk_ctx("invalid tsk->state %ld/%d on" CPT_FID "\n",
++				tsk->state, tsk->exit_state, CPT_TID(tsk));
++			cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++	}
++	if (cpt_check_unsupported(tsk, ctx)) {
++		cpt_release_buf(ctx);
++		return -EBUSY;
++	}
++
++	v->cpt_flags = tsk->flags&~(PF_FROZEN|PF_EXIT_RESTART);
++	v->cpt_ptrace = tsk->ptrace;
++	v->cpt_prio = tsk->prio;
++	v->cpt_exit_code = tsk->exit_code;
++	v->cpt_exit_signal = tsk->exit_signal;
++	v->cpt_pdeath_signal = tsk->pdeath_signal;
++	v->cpt_static_prio = tsk->static_prio;
++	v->cpt_rt_priority = tsk->rt_priority;
++	v->cpt_policy = tsk->policy;
++	if (v->cpt_policy != SCHED_NORMAL) {
++		eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", task_pid_vnr(tsk), tsk->pid, tsk->comm);
++		
cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ ++ /* Unpleasant moment. When leader of thread group exits, ++ * it remains in zombie state until all the group exits. ++ * We save not-NULL pointers to process mm/files/fs, so ++ * that we can restore this thread group. ++ */ ++ tsk_mm = tsk->mm; ++ tsk_files = tsk->files; ++ tsk_fs = tsk->fs; ++ tsk_ns = tsk->nsproxy ? tsk->nsproxy->mnt_ns : NULL; ++ ++ if (tsk->exit_state && !thread_group_empty(tsk) && ++ thread_group_leader(tsk)) { ++ struct task_struct * p = tsk; ++ ++ read_lock(&tasklist_lock); ++ do { ++ if (p->mm) ++ tsk_mm = p->mm; ++ if (p->files) ++ tsk_files = p->files; ++ if (p->fs) ++ tsk_fs = p->fs; ++ if (p->nsproxy && p->nsproxy->mnt_ns) ++ tsk_ns = p->nsproxy->mnt_ns; ++ p = next_thread(p); ++ } while (p != tsk); ++ read_unlock(&tasklist_lock); ++ } ++ ++ v->cpt_mm = CPT_NULL; ++ if (tsk_mm) { ++ tobj = lookup_cpt_object(CPT_OBJ_MM, tsk_mm, ctx); ++ if (!tobj) BUG(); ++ v->cpt_mm = tobj->o_pos; ++ } ++ v->cpt_files = CPT_NULL; ++ if (tsk_files) { ++ tobj = lookup_cpt_object(CPT_OBJ_FILES, tsk_files, ctx); ++ if (!tobj) BUG(); ++ v->cpt_files = tobj->o_pos; ++ } ++ v->cpt_fs = CPT_NULL; ++ if (tsk_fs) { ++ tobj = lookup_cpt_object(CPT_OBJ_FS, tsk_fs, ctx); ++ if (!tobj) BUG(); ++ v->cpt_fs = tobj->o_pos; ++ } ++ v->cpt_namespace = CPT_NULL; ++ if (tsk_ns) { ++ tobj = lookup_cpt_object(CPT_OBJ_NAMESPACE, tsk_ns, ctx); ++ if (!tobj) BUG(); ++ v->cpt_namespace = tobj->o_pos; ++ ++ if (tsk_ns != current->nsproxy->mnt_ns) ++ eprintk_ctx("namespaces are not supported:" ++ "process " CPT_FID "\n", CPT_TID(tsk)); ++ } ++ v->cpt_sysvsem_undo = CPT_NULL; ++ if (tsk->sysvsem.undo_list && !tsk->exit_state) { ++ tobj = lookup_cpt_object(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx); ++ if (!tobj) BUG(); ++ v->cpt_sysvsem_undo = tobj->o_pos; ++ } ++ v->cpt_sighand = CPT_NULL; ++ if (tsk->sighand) { ++ tobj = lookup_cpt_object(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx); ++ if (!tobj) BUG(); ++ v->cpt_sighand = tobj->o_pos; ++ } ++ v->cpt_sigblocked = cpt_sigset_export(&tsk->blocked); ++ v->cpt_sigrblocked = cpt_sigset_export(&tsk->real_blocked); ++ v->cpt_sigsuspend_blocked = cpt_sigset_export(&tsk->saved_sigmask); ++ ++ v->cpt_pid = task_pid_vnr(tsk); ++ v->cpt_tgid = task_tgid_vnr(tsk); ++ v->cpt_ppid = 0; ++ if (tsk->parent) { ++ if (tsk->parent != tsk->real_parent && ++ !lookup_cpt_object(CPT_OBJ_TASK, tsk->parent, ctx)) { ++ eprintk_ctx("task %d/%d(%s) is ptraced from ve0\n", tsk->pid, task_pid_vnr(tsk), tsk->comm); ++ cpt_release_buf(ctx); ++ return -EBUSY; ++ } ++ v->cpt_ppid = task_pid_vnr(tsk->parent); ++ } ++ v->cpt_rppid = tsk->real_parent ? task_pid_vnr(tsk->real_parent) : 0; ++ v->cpt_pgrp = task_pgrp_vnr(tsk); ++ v->cpt_session = task_session_vnr(tsk); ++ v->cpt_old_pgrp = 0; ++ if (tsk->signal->tty_old_pgrp) ++ v->cpt_old_pgrp = pid_vnr(tsk->signal->tty_old_pgrp); ++ v->cpt_leader = tsk->group_leader ? task_pid_vnr(tsk->group_leader) : 0; ++ v->cpt_set_tid = (unsigned long)tsk->set_child_tid; ++ v->cpt_clear_tid = (unsigned long)tsk->clear_child_tid; ++ memcpy(v->cpt_comm, tsk->comm, 16); ++ v->cpt_user = tsk->user->uid; ++ v->cpt_uid = tsk->uid; ++ v->cpt_euid = tsk->euid; ++ v->cpt_suid = tsk->suid; ++ v->cpt_fsuid = tsk->fsuid; ++ v->cpt_gid = tsk->gid; ++ v->cpt_egid = tsk->egid; ++ v->cpt_sgid = tsk->sgid; ++ v->cpt_fsgid = tsk->fsgid; ++ v->cpt_ngids = 0; ++ if (tsk->group_info && tsk->group_info->ngroups != 0) { ++ int i = tsk->group_info->ngroups; ++ if (i > 32) { ++ /* Shame... 
I did a simplified version and _forgot_
++			 * about this. Later, later. */
++			eprintk_ctx("too many of groups " CPT_FID "\n", CPT_TID(tsk));
++			return -EINVAL;
++		}
++		v->cpt_ngids = i;
++		for (i--; i>=0; i--)
++			v->cpt_gids[i] = tsk->group_info->small_block[i];
++	}
++	v->cpt_prctl_uac = 0;
++	v->cpt_prctl_fpemu = 0;
++	v->__cpt_pad1 = 0;
++#ifdef CONFIG_IA64
++	v->cpt_prctl_uac = (tsk->thread.flags & IA64_THREAD_UAC_MASK) >> IA64_THREAD_UAC_SHIFT;
++	v->cpt_prctl_fpemu = (tsk->thread.flags & IA64_THREAD_FPEMU_MASK) >> IA64_THREAD_FPEMU_SHIFT;
++#endif
++	memcpy(&v->cpt_ecap, &tsk->cap_effective, 8);
++	memcpy(&v->cpt_icap, &tsk->cap_inheritable, 8);
++	memcpy(&v->cpt_pcap, &tsk->cap_permitted, 8);
++	v->cpt_keepcap = tsk->securebits;
++
++	v->cpt_did_exec = tsk->did_exec;
++	v->cpt_exec_domain = -1;
++	v->cpt_thrflags = task_thread_info(tsk)->flags & ~(1<<TIF_FREEZE);
++	v->cpt_64bit = 0;
++#ifdef CONFIG_X86_64
++	/* Clear x86_64 specific flags */
++	v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32);
++	if (!(task_thread_info(tsk)->flags & _TIF_IA32)) {
++		ctx->tasks64++;
++		v->cpt_64bit = 1;
++	}
++#endif
++#ifdef CONFIG_IA64
++	/* Clear ia64 specific flags */
++	//// v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32);
++	if (!IS_IA32_PROCESS(task_pt_regs(tsk))) {
++		ctx->tasks64++;
++		v->cpt_64bit = 1;
++	}
++#endif
++	v->cpt_thrstatus = task_thread_info(tsk)->status;
++	v->cpt_addr_limit = -1;
++
++	v->cpt_personality = tsk->personality;
++
++#ifdef CONFIG_X86
++	for (i=0; i<GDT_ENTRY_TLS_ENTRIES; i++) {
++		if (i>=3) {
++			eprintk_ctx("too many tls descs\n");
++			cpt_release_buf(ctx);
++			return -EINVAL;
++		}
++		v->cpt_tls[i] = (((u64)tsk->thread.tls_array[i].b)<<32) + tsk->thread.tls_array[i].a;
++	}
++#endif
++
++	v->cpt_restart.fn = CPT_RBL_0;
++	if (task_thread_info(tsk)->restart_block.fn != task_thread_info(current)->restart_block.fn) {
++		struct restart_block *rb = &task_thread_info(tsk)->restart_block;
++		ktime_t e;
++
++		if (rb->fn == hrtimer_nanosleep_restart) {
++			v->cpt_restart.fn = CPT_RBL_NANOSLEEP;
++
++			e.tv64 = ((u64)rb->arg3 << 32) | (u64)rb->arg2;
++			e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
++			v->cpt_restart.arg0 = rb->arg0;
++			v->cpt_restart.arg1 = rb->arg1;
++			v->cpt_restart.arg2 = ktime_to_ns(e);
++			v->cpt_restart.arg3 = 0;
++			dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0);
++			goto continue_dump;
++		}
++#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT)
++		if (rb->fn == compat_nanosleep_restart) {
++			v->cpt_restart.fn = CPT_RBL_COMPAT_NANOSLEEP;
++
++			e.tv64 = ((u64)rb->arg3 << 32) | (u64)rb->arg2;
++			e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
++			v->cpt_restart.arg0 = rb->arg0;
++			v->cpt_restart.arg1 = rb->arg1;
++			v->cpt_restart.arg2 = ktime_to_ns(e);
++			v->cpt_restart.arg3 = 0;
++			dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0);
++			goto continue_dump;
++		}
++#endif
++		if (rb->fn == do_restart_poll) {
++			u64 timeout_jiffies;
++
++			timeout_jiffies = ((u64)rb->arg3 << 32)|(u64)rb->arg2;
++			e.tv64 = timeout_jiffies * TICK_NSEC;
++
++			v->cpt_restart.fn = CPT_RBL_POLL;
++			v->cpt_restart.arg0 = rb->arg0;
++			v->cpt_restart.arg1 = rb->arg1;
++			v->cpt_restart.arg2 = ktime_to_ns(e);
++			v->cpt_restart.arg3 = 0;
++			dprintk_ctx(CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_restart.arg0);
++			goto continue_dump;
++		}
++		if (rb->fn == futex_wait_restart) {
++			v->cpt_restart.fn = CPT_RBL_FUTEX_WAIT;
++
++			e.tv64 = rb->futex.time;
++			e = ktime_sub(e, timespec_to_ktime(ctx->cpt_monotonic_time));
++			
v->cpt_restart.arg0 = (unsigned long)rb->futex.uaddr;
++			v->cpt_restart.arg1 = rb->futex.val;
++			v->cpt_restart.arg2 = ktime_to_ns(e);
++			v->cpt_restart.arg3 = rb->futex.flags;
++			goto continue_dump;
++		}
++		eprintk_ctx("unknown restart block %p\n", rb->fn);
++		return -EINVAL;
++	}
++
++continue_dump:
++	v->cpt_it_real_incr = 0;
++	v->cpt_it_prof_incr = 0;
++	v->cpt_it_virt_incr = 0;
++	v->cpt_it_real_value = 0;
++	v->cpt_it_prof_value = 0;
++	v->cpt_it_virt_value = 0;
++	if (thread_group_leader(tsk) && tsk->exit_state == 0) {
++		ktime_t rem;
++
++		v->cpt_it_real_incr = ktime_to_ns(tsk->signal->it_real_incr);
++		v->cpt_it_prof_incr = tsk->signal->it_prof_incr;
++		v->cpt_it_virt_incr = tsk->signal->it_virt_incr;
++
++		rem = hrtimer_get_remaining(&tsk->signal->real_timer);
++
++		if (hrtimer_active(&tsk->signal->real_timer)) {
++			if (rem.tv64 <= 0)
++				rem.tv64 = NSEC_PER_USEC;
++			v->cpt_it_real_value = ktime_to_ns(rem);
++			dprintk("cpt itimer " CPT_FID " %Lu\n", CPT_TID(tsk), (unsigned long long)v->cpt_it_real_value);
++		}
++		v->cpt_it_prof_value = tsk->signal->it_prof_expires;
++		v->cpt_it_virt_value = tsk->signal->it_virt_expires;
++	}
++	v->cpt_used_math = (tsk_used_math(tsk) != 0);
++
++	if (tsk->notifier) {
++		eprintk_ctx("task notifier is in use: process %d/%d(%s)\n", task_pid_vnr(tsk), tsk->pid, tsk->comm);
++		cpt_release_buf(ctx);
++		return -EINVAL;
++	}
++
++	v->cpt_utime = tsk->utime;
++	v->cpt_stime = tsk->stime;
++	delta = tsk->start_time;
++	_set_normalized_timespec(&delta,
++			delta.tv_sec - get_exec_env()->start_timespec.tv_sec,
++			delta.tv_nsec - get_exec_env()->start_timespec.tv_nsec);
++	v->cpt_starttime = cpt_timespec_export(&delta);
++	v->cpt_nvcsw = tsk->nvcsw;
++	v->cpt_nivcsw = tsk->nivcsw;
++	v->cpt_min_flt = tsk->min_flt;
++	v->cpt_maj_flt = tsk->maj_flt;
++
++#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8)
++	v->cpt_cutime = tsk->cutime;
++	v->cpt_cstime = tsk->cstime;
++	v->cpt_cnvcsw = tsk->cnvcsw;
++	v->cpt_cnivcsw = tsk->cnivcsw;
++	v->cpt_cmin_flt = tsk->cmin_flt;
++	v->cpt_cmaj_flt = tsk->cmaj_flt;
++
++	if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
++		__asm__("undefined\n");
++
++	for (i=0; i<CPT_RLIM_NLIMITS; i++) {
++		if (i < RLIM_NLIMITS) {
++			v->cpt_rlim_cur[i] = tsk->rlim[i].rlim_cur;
++			v->cpt_rlim_max[i] = tsk->rlim[i].rlim_max;
++		} else {
++			v->cpt_rlim_cur[i] = CPT_NULL;
++			v->cpt_rlim_max[i] = CPT_NULL;
++		}
++	}
++#else
++	v->cpt_cutime = tsk->signal->cutime;
++	v->cpt_cstime = tsk->signal->cstime;
++	v->cpt_cnvcsw = tsk->signal->cnvcsw;
++	v->cpt_cnivcsw = tsk->signal->cnivcsw;
++	v->cpt_cmin_flt = tsk->signal->cmin_flt;
++	v->cpt_cmaj_flt = tsk->signal->cmaj_flt;
++
++	if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
++		__asm__("undefined\n");
++
++	for (i=0; i<CPT_RLIM_NLIMITS; i++) {
++		if (i < RLIM_NLIMITS) {
++			v->cpt_rlim_cur[i] = tsk->signal->rlim[i].rlim_cur;
++			v->cpt_rlim_max[i] = tsk->signal->rlim[i].rlim_max;
++		} else {
++			v->cpt_rlim_cur[i] = CPT_NULL;
++			v->cpt_rlim_max[i] = CPT_NULL;
++		}
++	}
++#endif
++
++#ifdef CONFIG_BEANCOUNTERS
++	if (tsk->mm)
++		v->cpt_mm_ub = cpt_lookup_ubc(tsk->mm->mm_ub, ctx);
++	else
++		v->cpt_mm_ub = CPT_NULL;
++	v->cpt_task_ub = cpt_lookup_ubc(tsk->task_bc.task_ub, ctx);
++	v->cpt_exec_ub = cpt_lookup_ubc(tsk->task_bc.exec_ub, ctx);
++	v->cpt_fork_sub = cpt_lookup_ubc(tsk->task_bc.fork_sub, ctx);
++#endif
++
++	v->cpt_ptrace_message = tsk->ptrace_message;
++	v->cpt_pn_state = tsk->pn_state;
++	v->cpt_stopped_state = tsk->stopped_state;
++	v->cpt_sigsuspend_state = 0;
++
++#ifdef CONFIG_X86_32
++	if (tsk->thread.vm86_info) {
++		eprintk_ctx("vm86 task is running\n");
++		cpt_release_buf(ctx);
++		return -EBUSY;
++	}
++#endif
++
++	
v->cpt_sigpending = cpt_sigset_export(&tsk->pending.signal); ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ dump_kstack(tsk, ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ err = dump_registers(tsk, ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ if (err) ++ return err; ++ ++ if (tsk_used_math(tsk)) { ++ cpt_push_object(&saved_obj, ctx); ++ dump_fpustate(tsk, ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ ++ if (tsk->last_siginfo) { ++ struct cpt_siginfo_image si; ++ cpt_push_object(&saved_obj, ctx); ++ ++ si.cpt_next = sizeof(si); ++ si.cpt_object = CPT_OBJ_LASTSIGINFO; ++ si.cpt_hdrlen = sizeof(si); ++ si.cpt_content = CPT_CONTENT_VOID; ++ ++ if (encode_siginfo(&si, tsk->last_siginfo)) ++ return -EINVAL; ++ ++ ctx->write(&si, sizeof(si), ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ ++ if (tsk->sas_ss_size) { ++ struct cpt_sigaltstack_image si; ++ cpt_push_object(&saved_obj, ctx); ++ ++ si.cpt_next = sizeof(si); ++ si.cpt_object = CPT_OBJ_SIGALTSTACK; ++ si.cpt_hdrlen = sizeof(si); ++ si.cpt_content = CPT_CONTENT_VOID; ++ ++ si.cpt_stack = tsk->sas_ss_sp; ++ si.cpt_stacksize = tsk->sas_ss_size; ++ ++ ctx->write(&si, sizeof(si), ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ ++ if (tsk->robust_list ++#ifdef CONFIG_COMPAT ++ || tsk->compat_robust_list ++#endif ++ ) { ++ struct cpt_task_aux_image ai; ++ cpt_push_object(&saved_obj, ctx); ++ ++ ai.cpt_next = sizeof(ai); ++ ai.cpt_object = CPT_OBJ_TASK_AUX; ++ ai.cpt_hdrlen = sizeof(ai); ++ ai.cpt_content = CPT_CONTENT_VOID; ++ ++ ai.cpt_robust_list = (unsigned long)tsk->robust_list; ++#ifdef CONFIG_X86_64 ++#ifdef CONFIG_COMPAT ++ if (task_thread_info(tsk)->flags & _TIF_IA32) ++ ai.cpt_robust_list = (unsigned long)tsk->compat_robust_list; ++#endif ++#endif ++ ctx->write(&ai, sizeof(ai), ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ ++ dump_sigqueue(&tsk->pending, ctx); ++ ++ last_thread = 1; ++ read_lock(&tasklist_lock); ++ do { ++ struct task_struct * next = next_thread(tsk); ++ if (next != tsk && !thread_group_leader(next)) ++ last_thread = 0; ++ } while (0); ++ read_unlock(&tasklist_lock); ++ ++ if (last_thread) { ++ struct task_struct *prev_tsk; ++ int err; ++ loff_t pos = ctx->file->f_pos; ++ ++ cpt_push_object(&saved_obj, ctx); ++ err = dump_one_signal_struct(tg_obj, ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ if (err) ++ return err; ++ ++ prev_tsk = tsk; ++ for (;;) { ++ if (prev_tsk->tgid == tsk->tgid) { ++ loff_t tg_pos; ++ ++ tg_pos = obj->o_pos + offsetof(struct cpt_task_image, cpt_signal); ++ ctx->pwrite(&pos, sizeof(pos), ctx, tg_pos); ++ if (thread_group_leader(prev_tsk)) ++ break; ++ } ++ ++ if (obj->o_list.prev == &ctx->object_array[CPT_OBJ_TASK]) { ++ eprintk_ctx("bug: thread group leader is lost\n"); ++ return -EINVAL; ++ } ++ ++ obj = list_entry(obj->o_list.prev, cpt_object_t, o_list); ++ prev_tsk = obj->o_obj; ++ } ++ } ++ ++ cpt_close_object(ctx); ++ return 0; ++} ++ ++int cpt_dump_tasks(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ cpt_open_section(ctx, CPT_SECT_TASKS); ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ int err; ++ ++ if ((err = dump_one_process(obj, ctx)) != 0) ++ return err; ++ } ++ ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++int cpt_collect_signals(cpt_context_t *ctx) ++{ ++ cpt_object_t *obj; ++ ++ /* Collect process fd sets */ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ struct task_struct *tsk = obj->o_obj; ++ if (tsk->signal && !list_empty(&tsk->signal->posix_timers)) { ++ 
eprintk_ctx("task %d/%d(%s) uses posix timers\n", tsk->pid, task_pid_vnr(tsk), tsk->comm); ++ return -EBUSY; ++ } ++ if (tsk->signal && cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx) == NULL) ++ return -ENOMEM; ++ if (tsk->sighand && cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx) == NULL) ++ return -ENOMEM; ++ } ++ return 0; ++} ++ ++ ++static int dump_one_sighand_struct(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct sighand_struct *sig = obj->o_obj; ++ struct cpt_sighand_image *v = cpt_get_buf(ctx); ++ int i; ++ ++ cpt_open_object(obj, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_SIGHAND_STRUCT; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ for (i=0; i< _NSIG; i++) { ++ if (sig->action[i].sa.sa_handler != SIG_DFL || ++ sig->action[i].sa.sa_flags) { ++ loff_t saved_obj; ++ struct cpt_sighandler_image *o = cpt_get_buf(ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ cpt_open_object(NULL, ctx); ++ ++ o->cpt_next = CPT_NULL; ++ o->cpt_object = CPT_OBJ_SIGHANDLER; ++ o->cpt_hdrlen = sizeof(*o); ++ o->cpt_content = CPT_CONTENT_VOID; ++ ++ o->cpt_signo = i; ++ o->cpt_handler = (unsigned long)sig->action[i].sa.sa_handler; ++ o->cpt_restorer = 0; ++#ifdef CONFIG_X86 ++ o->cpt_restorer = (unsigned long)sig->action[i].sa.sa_restorer; ++#endif ++ o->cpt_flags = sig->action[i].sa.sa_flags; ++ memcpy(&o->cpt_mask, &sig->action[i].sa.sa_mask, 8); ++ ctx->write(o, sizeof(*o), ctx); ++ cpt_release_buf(ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ } ++ ++ cpt_close_object(ctx); ++ return 0; ++} ++ ++int cpt_dump_sighand(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ cpt_open_section(ctx, CPT_SECT_SIGHAND_STRUCT); ++ ++ for_each_object(obj, CPT_OBJ_SIGHAND_STRUCT) { ++ int err; ++ ++ if ((err = dump_one_sighand_struct(obj, ctx)) != 0) ++ return err; ++ } ++ ++ cpt_close_section(ctx); ++ return 0; ++} +diff --git a/kernel/cpt/cpt_process.h b/kernel/cpt/cpt_process.h +new file mode 100644 +index 0000000..b9f28af +--- /dev/null ++++ b/kernel/cpt/cpt_process.h +@@ -0,0 +1,13 @@ ++int cpt_collect_signals(cpt_context_t *); ++int cpt_dump_signal(struct cpt_context *); ++int cpt_dump_sighand(struct cpt_context *); ++int cpt_dump_tasks(struct cpt_context *); ++ ++int rst_signal_complete(struct cpt_task_image *ti, int *exiting, struct cpt_context *ctx); ++__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx); ++ ++int rst_restore_process(struct cpt_context *ctx); ++int rst_process_linkage(struct cpt_context *ctx); ++ ++int check_task_state(struct task_struct *tsk, struct cpt_context *ctx); ++struct pid *alloc_vpid_safe(pid_t vnr); +diff --git a/kernel/cpt/cpt_socket.c b/kernel/cpt/cpt_socket.c +new file mode 100644 +index 0000000..4878df1 +--- /dev/null ++++ b/kernel/cpt/cpt_socket.c +@@ -0,0 +1,790 @@ ++/* ++ * ++ * kernel/cpt/cpt_socket.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ *
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_socket.h"
++#include "cpt_files.h"
++#include "cpt_kernel.h"
++
++static int dump_rqueue(int owner, struct sock *sk, struct cpt_context *ctx);
++
++
++/* Sockets are quite different from other kinds of files.
++ * There is one simplification: only one struct file can refer to a socket,
++ * so we can store information about a socket directly in section FILES as
++ * a description of a file, and append e.g. the array of not-yet-accepted
++ * connections of a listening socket as auxiliary data.
++ *
++ * Complications are:
++ * 1. TCP sockets can be orphans. We have to relocate orphans as well,
++ *    so we have to create a special section for orphans.
++ * 2. AF_UNIX sockets are distinguished objects: the set of links between
++ *    AF_UNIX sockets is quite arbitrary.
++ *    A. Each socket can refer to many files due to FD passing.
++ *    B. Each socket except for connected ones can have skbs in its queue
++ *       sent by any other socket.
++ *
++ * 2A is relatively easy: after our tasks are frozen we make an additional
++ * recursive pass through the set of collected files and pick up the
++ * FD-passed files they reference. After the recursion ends, all the files
++ * are treated in the same way and are stored in section FILES.
++ *
++ * 2B. We have to resolve all those references at some point.
++ *     This is where the pipe-like approach to the image fails.
++ *
++ * All this makes socket checkpointing quite cumbersome.
++ * Right now we collect all the sockets and assign some numeric index value
++ * to each of them. The socket section is separate and put after section FILES,
++ * so section FILES refers to sockets by index, while section SOCKET refers to
++ * FILES as usual by position in the image. All the refs inside the socket
++ * section are by index. When restoring we read the socket section and create
++ * objects to hold the index <-> pos mappings. In the second pass we open
++ * sockets (simultaneously with their pairs) and create FILE objects.
++ */
++
++
++/* ====== FD passing ====== */
++
++/* Almost nobody does FD passing via AF_UNIX sockets; nevertheless we
++ * have to implement this. A problem is that in the general case we receive
++ * skbs from an unknown context, so new files can arrive to the checkpointed
++ * set of processes even after they are stopped. We are simply going to
++ * ignore unknown fds while doing the real checkpointing. This is fair,
++ * because links outside the checkpointed set are going to fail anyway.
++ *
++ * ATTN: the procedure is recursive. We linearize the recursion by adding
++ * newly found files to the end of the file list, so they will be analyzed
++ * in the same loop.
++ */
++
++static int collect_one_passedfd(struct file *file, cpt_context_t * ctx)
++{
++	struct inode *inode = file->f_dentry->d_inode;
++	struct socket *sock;
++	struct sock *sk;
++	struct sk_buff *skb;
++
++	if (!S_ISSOCK(inode->i_mode))
++		return -ENOTSOCK;
++
++	sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket;
++
++	if (sock->ops->family != AF_UNIX)
++		return 0;
++
++	sk = sock->sk;
++
++	/* Subtle locking issue. skbs cannot be removed while
++	 * we are scanning, because all the processes are stopped.
++	 * They still can be added to the tail of the queue. Locking while
++	 * we dereference skb->next is enough to resolve this.
++	 * See above about the collision with skbs added after we started
++	 * checkpointing.
++	 */
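The receive-queue walk that follows, like dump_rqueue(), dump_wqueue() and the out-of-order queue walk later in this file, uses the same guarded iteration. A standalone sketch of the pattern, with an illustrative callback instead of the real dump calls:

    /* Frozen tasks cannot consume skbs, so the queue can only grow at
     * the tail; taking the queue lock only around the step to skb->next
     * is enough to walk it safely. */
    static void walk_queue(struct sk_buff_head *q,
                           void (*cb)(struct sk_buff *skb, void *arg), void *arg)
    {
            struct sk_buff *skb = skb_peek(q);

            while (skb && skb != (struct sk_buff *)q) {
                    cb(skb, arg);
                    spin_lock_irq(&q->lock);
                    skb = skb->next;
                    spin_unlock_irq(&q->lock);
            }
    }
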
++
++	skb = skb_peek(&sk->sk_receive_queue);
++	while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
++		if (UNIXCB(skb).fp && skb->sk &&
++		    (!sock_flag(skb->sk, SOCK_DEAD) || unix_peer(sk) == skb->sk)) {
++			struct scm_fp_list *fpl = UNIXCB(skb).fp;
++			int i;
++
++			for (i = fpl->count-1; i >= 0; i--) {
++				if (cpt_object_add(CPT_OBJ_FILE, fpl->fp[i], ctx) == NULL)
++					return -ENOMEM;
++			}
++		}
++
++		spin_lock_irq(&sk->sk_receive_queue.lock);
++		skb = skb->next;
++		spin_unlock_irq(&sk->sk_receive_queue.lock);
++	}
++
++	return 0;
++}
++
++int cpt_collect_passedfds(cpt_context_t * ctx)
++{
++	cpt_object_t *obj;
++
++	for_each_object(obj, CPT_OBJ_FILE) {
++		struct file *file = obj->o_obj;
++
++		if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) {
++			int err;
++
++			if ((err = collect_one_passedfd(file, ctx)) < 0)
++				return err;
++		}
++	}
++
++	return 0;
++}
++
++/* ====== End of FD passing ====== */
++
++/* Must be called under bh_lock_sock() */
++
++void clear_backlog(struct sock *sk)
++{
++	struct sk_buff *skb = sk->sk_backlog.head;
++
++	sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
++	while (skb) {
++		struct sk_buff *next = skb->next;
++
++		skb->next = NULL;
++		kfree_skb(skb);
++		skb = next;
++	}
++}
++
++void release_sock_nobacklog(struct sock *sk)
++{
++	spin_lock_bh(&(sk->sk_lock.slock));
++	clear_backlog(sk);
++	sk->sk_lock.owned = 0;
++	if (waitqueue_active(&(sk->sk_lock.wq)))
++		wake_up(&(sk->sk_lock.wq));
++	spin_unlock_bh(&(sk->sk_lock.slock));
++}
++
++int cpt_dump_skb(int type, int owner, struct sk_buff *skb,
++		 struct cpt_context *ctx)
++{
++	struct cpt_skb_image *v = cpt_get_buf(ctx);
++	loff_t saved_obj;
++	struct timeval tmptv;
++
++	cpt_push_object(&saved_obj, ctx);
++	cpt_open_object(NULL, ctx);
++
++	v->cpt_next = CPT_NULL;
++	v->cpt_object = CPT_OBJ_SKB;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	v->cpt_owner = owner;
++	v->cpt_queue = type;
++	skb_get_timestamp(skb, &tmptv);
++	v->cpt_stamp = cpt_timeval_export(&tmptv);
++	v->cpt_hspace = skb->data - skb->head;
++	v->cpt_tspace = skb->end - skb->tail;
++	v->cpt_h = skb_transport_header(skb) - skb->head;
++	v->cpt_nh = skb_network_header(skb) - skb->head;
++	v->cpt_mac = skb_mac_header(skb) - skb->head;
++	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v->cpt_cb));
++	memcpy(v->cpt_cb, skb->cb, sizeof(v->cpt_cb));
++	if (sizeof(skb->cb) > sizeof(v->cpt_cb)) {
++		int i;
++		for (i=sizeof(v->cpt_cb); i<sizeof(skb->cb); i++) {
++			if (skb->cb[i]) {
++				wprintk_ctx("dirty skb cb");
++				break;
++			}
++		}
++	}
++	v->cpt_len = skb->len;
++	v->cpt_mac_len = skb->mac_len;
++	v->cpt_csum = skb->csum;
++	v->cpt_local_df = skb->local_df;
++	v->cpt_pkt_type = skb->pkt_type;
++	v->cpt_ip_summed = skb->ip_summed;
++	v->cpt_priority = skb->priority;
++	v->cpt_protocol = skb->protocol;
++	v->cpt_security = 0;
++	v->cpt_gso_segs = skb_shinfo(skb)->gso_segs;
++	v->cpt_gso_size = skb_shinfo(skb)->gso_size;
++	if (skb_shinfo(skb)->gso_type) {
++		eprintk_ctx("skb ufo is not supported\n");
++		return -EINVAL;
++	}
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++
++	if (skb->len + (skb->data - skb->head) > 0) {
++		struct cpt_obj_bits ob;
++		loff_t saved_obj2;
++
++		cpt_push_object(&saved_obj2, ctx);
++		cpt_open_object(NULL, ctx);
++		ob.cpt_next = CPT_NULL;
++		ob.cpt_object = CPT_OBJ_BITS;
++		ob.cpt_hdrlen = sizeof(ob);
++		ob.cpt_content = CPT_CONTENT_DATA;
++		ob.cpt_size = skb->len + v->cpt_hspace;
++
++		ctx->write(&ob,
sizeof(ob), ctx); ++ ++ ctx->write(skb->head, (skb->data-skb->head) + (skb->len-skb->data_len), ctx); ++ if (skb->data_len) { ++ int offset = skb->len - skb->data_len; ++ while (offset < skb->len) { ++ int copy = skb->len - offset; ++ if (copy > PAGE_SIZE) ++ copy = PAGE_SIZE; ++ (void)cpt_get_buf(ctx); ++ if (skb_copy_bits(skb, offset, ctx->tmpbuf, copy)) ++ BUG(); ++ ctx->write(ctx->tmpbuf, copy, ctx); ++ __cpt_release_buf(ctx); ++ offset += copy; ++ } ++ } ++ ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj2, ctx); ++ } ++ ++ if (skb->sk && skb->sk->sk_family == AF_UNIX) { ++ struct scm_fp_list *fpl = UNIXCB(skb).fp; ++ ++ if (fpl) { ++ int i; ++ ++ for (i = 0; i < fpl->count; i++) { ++ struct cpt_fd_image v; ++ cpt_object_t *obj; ++ loff_t saved_obj2; ++ ++ obj = lookup_cpt_object(CPT_OBJ_FILE, fpl->fp[i], ctx); ++ ++ if (!obj) { ++ eprintk_ctx("lost passed FD\n"); ++ return -EINVAL; ++ } ++ ++ cpt_push_object(&saved_obj2, ctx); ++ cpt_open_object(NULL, ctx); ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_FILEDESC; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_VOID; ++ ++ v.cpt_fd = i; ++ v.cpt_file = obj->o_pos; ++ v.cpt_flags = 0; ++ ctx->write(&v, sizeof(v), ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj2, ctx); ++ } ++ } ++ } ++ ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ return 0; ++} ++ ++static int dump_rqueue(int idx, struct sock *sk, struct cpt_context *ctx) ++{ ++ struct sk_buff *skb; ++ struct sock *sk_cache = NULL; ++ ++ skb = skb_peek(&sk->sk_receive_queue); ++ while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { ++ int err; ++ ++ if (sk->sk_family == AF_UNIX) { ++ cpt_object_t *obj; ++ if (skb->sk != sk_cache) { ++ idx = -1; ++ sk_cache = NULL; ++ obj = lookup_cpt_object(CPT_OBJ_SOCKET, skb->sk, ctx); ++ if (obj) { ++ idx = obj->o_index; ++ sk_cache = skb->sk; ++ } else if (unix_peer(sk) != skb->sk) ++ goto next_skb; ++ } ++ } ++ ++ err = cpt_dump_skb(CPT_SKB_RQ, idx, skb, ctx); ++ if (err) ++ return err; ++ ++next_skb: ++ spin_lock_irq(&sk->sk_receive_queue.lock); ++ skb = skb->next; ++ spin_unlock_irq(&sk->sk_receive_queue.lock); ++ } ++ return 0; ++} ++ ++static int dump_wqueue(int idx, struct sock *sk, struct cpt_context *ctx) ++{ ++ struct sk_buff *skb; ++ ++ skb = skb_peek(&sk->sk_write_queue); ++ while (skb && skb != (struct sk_buff*)&sk->sk_write_queue) { ++ int err = cpt_dump_skb(CPT_SKB_WQ, idx, skb, ctx); ++ if (err) ++ return err; ++ ++ spin_lock_irq(&sk->sk_write_queue.lock); ++ skb = skb->next; ++ spin_unlock_irq(&sk->sk_write_queue.lock); ++ } ++ return 0; ++} ++ ++void cpt_dump_sock_attr(struct sock *sk, cpt_context_t *ctx) ++{ ++ loff_t saved_obj; ++ if (sk->sk_filter) { ++ struct cpt_obj_bits v; ++ ++ cpt_push_object(&saved_obj, ctx); ++ cpt_open_object(NULL, ctx); ++ ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_SKFILTER; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_DATA; ++ v.cpt_size = sk->sk_filter->len*sizeof(struct sock_filter); ++ ++ ctx->write(&v, sizeof(v), ctx); ++ ctx->write(sk->sk_filter->insns, v.cpt_size, ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) { ++ cpt_push_object(&saved_obj, ctx); ++ cpt_dump_mcfilter(sk, ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++} ++ ++/* Dump socket content */ ++ ++int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx) ++{ ++ struct cpt_sock_image *v = 
cpt_get_buf(ctx); ++ struct socket *sock; ++ struct timeval tmptv; ++ ++ cpt_open_object(obj, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_SOCKET; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_file = CPT_NULL; ++ sock = sk->sk_socket; ++ if (sock && sock->file) { ++ cpt_object_t *tobj; ++ tobj = lookup_cpt_object(CPT_OBJ_FILE, sock->file, ctx); ++ if (tobj) ++ v->cpt_file = tobj->o_pos; ++ } ++ v->cpt_index = index; ++ v->cpt_parent = parent; ++ ++ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) { ++ if (sock && !obj->o_lock) { ++ lockdep_off(); ++ lock_sock(sk); ++ lockdep_on(); ++ obj->o_lock = 1; ++ } ++ } ++ ++ /* Some bits stored in inode */ ++ v->cpt_ssflags = sock ? sock->flags : 0; ++ v->cpt_sstate = sock ? sock->state : 0; ++ v->cpt_passcred = sock ? test_bit(SOCK_PASSCRED, &sock->flags) : 0; ++ ++ /* Common data */ ++ v->cpt_family = sk->sk_family; ++ v->cpt_type = sk->sk_type; ++ v->cpt_state = sk->sk_state; ++ v->cpt_reuse = sk->sk_reuse; ++ v->cpt_zapped = sock_flag(sk, SOCK_ZAPPED); ++ v->cpt_shutdown = sk->sk_shutdown; ++ v->cpt_userlocks = sk->sk_userlocks; ++ v->cpt_no_check = sk->sk_no_check; ++ v->cpt_zapped = sock_flag(sk, SOCK_DBG); ++ v->cpt_rcvtstamp = sock_flag(sk, SOCK_RCVTSTAMP); ++ v->cpt_localroute = sock_flag(sk, SOCK_LOCALROUTE); ++ v->cpt_protocol = sk->sk_protocol; ++ v->cpt_err = sk->sk_err; ++ v->cpt_err_soft = sk->sk_err_soft; ++ v->cpt_max_ack_backlog = sk->sk_max_ack_backlog; ++ v->cpt_priority = sk->sk_priority; ++ v->cpt_rcvlowat = sk->sk_rcvlowat; ++ v->cpt_rcvtimeo = CPT_NULL; ++ if (sk->sk_rcvtimeo != MAX_SCHEDULE_TIMEOUT) ++ v->cpt_rcvtimeo = sk->sk_rcvtimeo > INT_MAX ? INT_MAX : sk->sk_rcvtimeo; ++ v->cpt_sndtimeo = CPT_NULL; ++ if (sk->sk_sndtimeo != MAX_SCHEDULE_TIMEOUT) ++ v->cpt_sndtimeo = sk->sk_sndtimeo > INT_MAX ? INT_MAX : sk->sk_sndtimeo; ++ v->cpt_rcvbuf = sk->sk_rcvbuf; ++ v->cpt_sndbuf = sk->sk_sndbuf; ++ v->cpt_bound_dev_if = sk->sk_bound_dev_if; ++ v->cpt_flags = sk->sk_flags; ++ v->cpt_lingertime = CPT_NULL; ++ if (sk->sk_lingertime != MAX_SCHEDULE_TIMEOUT) ++ v->cpt_lingertime = sk->sk_lingertime > INT_MAX ? 
INT_MAX : sk->sk_lingertime; ++ v->cpt_peer_pid = sk->sk_peercred.pid; ++ v->cpt_peer_uid = sk->sk_peercred.uid; ++ v->cpt_peer_gid = sk->sk_peercred.gid; ++ tmptv = ktime_to_timeval(sk->sk_stamp); ++ v->cpt_stamp = cpt_timeval_export(&tmptv); ++ ++ v->cpt_peer = -1; ++ v->cpt_socketpair = 0; ++ v->cpt_deleted = 0; ++ ++ v->cpt_laddrlen = 0; ++ if (sock) { ++ int alen = sizeof(v->cpt_laddr); ++ int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_laddr, &alen, 0); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ v->cpt_laddrlen = alen; ++ } ++ v->cpt_raddrlen = 0; ++ if (sock) { ++ int alen = sizeof(v->cpt_raddr); ++ int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_raddr, &alen, 2); ++ if (!err) ++ v->cpt_raddrlen = alen; ++ } ++ ++ if (sk->sk_family == AF_UNIX) { ++ if (unix_sk(sk)->dentry) { ++ struct dentry *d = unix_sk(sk)->dentry; ++ v->cpt_deleted = !IS_ROOT(d) && d_unhashed(d); ++ if (!v->cpt_deleted) { ++ int err = 0; ++ char *path; ++ struct path p; ++ unsigned long pg = __get_free_page(GFP_KERNEL); ++ ++ if (!pg) { ++ cpt_release_buf(ctx); ++ return -ENOMEM; ++ } ++ ++ p.dentry = d; ++ p.mnt = unix_sk(sk)->mnt; ++ path = d_path(&p, (char *)pg, PAGE_SIZE); ++ ++ if (!IS_ERR(path)) { ++ int len = strlen(path); ++ if (len < 126) { ++ strcpy(((char*)v->cpt_laddr)+2, path); ++ v->cpt_laddrlen = len + 2; ++ } else { ++ wprintk_ctx("af_unix path is too long: %s (%s)\n", path, ((char*)v->cpt_laddr)+2); ++ } ++ err = cpt_verify_overmount(path, d, unix_sk(sk)->mnt, ctx); ++ } else { ++ eprintk_ctx("cannot get path of an af_unix socket\n"); ++ err = PTR_ERR(path); ++ } ++ free_page(pg); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ } ++ } ++ ++ /* If the socket is connected, find its peer. If peer is not ++ * in our table, the socket is connected to external process ++ * and we consider it disconnected. ++ */ ++ if (unix_peer(sk)) { ++ cpt_object_t *pobj; ++ pobj = lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(sk), ctx); ++ if (pobj) ++ v->cpt_peer = pobj->o_index; ++ else ++ v->cpt_shutdown = SHUTDOWN_MASK; ++ ++ if (unix_peer(unix_peer(sk)) == sk) ++ v->cpt_socketpair = 1; ++ } ++ ++ /* If the socket shares address with another socket it is ++ * child of some listening socket. Find and record it. 
++	 */
++	if (unix_sk(sk)->addr &&
++	    atomic_read(&unix_sk(sk)->addr->refcnt) > 1 &&
++	    sk->sk_state != TCP_LISTEN) {
++		cpt_object_t *pobj;
++		for_each_object(pobj, CPT_OBJ_SOCKET) {
++			struct sock *psk = pobj->o_obj;
++			if (psk->sk_family == AF_UNIX &&
++			    psk->sk_state == TCP_LISTEN &&
++			    unix_sk(psk)->addr == unix_sk(sk)->addr) {
++				v->cpt_parent = pobj->o_index;
++				break;
++			}
++		}
++	}
++	}
++
++	if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
++		cpt_dump_socket_in(v, sk, ctx);
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++
++	cpt_dump_sock_attr(sk, ctx);
++
++	dump_rqueue(index, sk, ctx);
++	if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
++		dump_wqueue(index, sk, ctx);
++		cpt_dump_ofo_queue(index, sk, ctx);
++	}
++
++	if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
++	    && sk->sk_state == TCP_LISTEN)
++		cpt_dump_synwait_queue(sk, index, ctx);
++
++	cpt_close_object(ctx);
++
++	if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
++	    && sk->sk_state == TCP_LISTEN)
++		cpt_dump_accept_queue(sk, index, ctx);
++
++	return 0;
++}
++
++int cpt_dump_orphaned_sockets(struct cpt_context *ctx)
++{
++	int i;
++
++	cpt_open_section(ctx, CPT_SECT_ORPHANS);
++
++	for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
++		struct sock *sk;
++		struct hlist_node *node;
++		rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, i);
++retry:
++		read_lock_bh(lock);
++		sk_for_each(sk, node, &tcp_hashinfo.ehash[i].chain) {
++
++			if (sk->owner_env != get_exec_env())
++				continue;
++			if (sk->sk_socket)
++				continue;
++			if (!sock_flag(sk, SOCK_DEAD))
++				continue;
++			if (lookup_cpt_object(CPT_OBJ_SOCKET, sk, ctx))
++				continue;
++			sock_hold(sk);
++			read_unlock_bh(lock);
++
++			local_bh_disable();
++			bh_lock_sock(sk);
++			if (sock_owned_by_user(sk))
++				eprintk_ctx("BUG: sk locked by whom?\n");
++			sk->sk_lock.owned = 1;
++			bh_unlock_sock(sk);
++			local_bh_enable();
++
++			cpt_dump_socket(NULL, sk, -1, -1, ctx);
++
++			local_bh_disable();
++			bh_lock_sock(sk);
++			sk->sk_lock.owned = 0;
++			clear_backlog(sk);
++			tcp_done(sk);
++			bh_unlock_sock(sk);
++			local_bh_enable();
++			sock_put(sk);
++
++			goto retry;
++		}
++		read_unlock_bh(lock);
++	}
++	cpt_close_section(ctx);
++	return 0;
++}
++
++static int can_dump(struct sock *sk, cpt_context_t *ctx)
++{
++	switch (sk->sk_family) {
++	case AF_NETLINK:
++		if (((struct netlink_sock *)sk)->cb) {
++			eprintk_ctx("netlink socket has active callback\n");
++			return 0;
++		}
++		break;
++	}
++	return 1;
++}
++
++/* We are not going to block suspend when we have external AF_UNIX
++ * connections. But we cannot stop the feed of new packets/connections
++ * into our environment from outside. Taking into account that it is
++ * intrinsically unreliable, we collect some amount of data, but when
++ * checkpointing/restoring we drop everything that does not make sense
++ * anymore: skbs sent by outside processes, connections from outside
++ * etc. etc.
++ */
++
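Both collect_one_passedfd() above and cpt_collect_socket() below recover the struct socket from a file through its inode. As a minimal standalone sketch (the helper name is illustrative; the patch open-codes the expression at each site):

    /* A socket's inode is embedded in struct socket_alloc right next to
     * the struct socket itself, so container_of() gets from one to the
     * other without any table lookup. */
    static struct socket *file_to_socket(struct file *file)
    {
            struct inode *inode = file->f_dentry->d_inode;

            if (!S_ISSOCK(inode->i_mode))
                    return NULL;
            return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
    }
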
++/* The first pass. When we see a socket referenced by a file, we just
++ * add it to the socket table. */
++int cpt_collect_socket(struct file *file, cpt_context_t * ctx)
++{
++	cpt_object_t *obj;
++	struct socket *sock;
++	struct sock *sk;
++
++	if (!S_ISSOCK(file->f_dentry->d_inode->i_mode))
++		return -ENOTSOCK;
++	sock = &container_of(file->f_dentry->d_inode, struct socket_alloc, vfs_inode)->socket;
++	sk = sock->sk;
++	if (!can_dump(sk, ctx))
++		return -EAGAIN;
++	if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sk, ctx)) == NULL)
++		return -ENOMEM;
++	obj->o_parent = file;
++
++	return 0;
++}
++
++/*
++ * We should end up with a table containing:
++ * * all the sockets opened by our processes;
++ * * all the sockets queued in listening queues on _our_ listening sockets,
++ *   which are connected to our opened sockets.
++ */
++
++static int collect_one_unix_listening_sock(cpt_object_t *obj, cpt_context_t * ctx)
++{
++	struct sock *sk = obj->o_obj;
++	cpt_object_t *cobj;
++	struct sk_buff *skb;
++
++	skb = skb_peek(&sk->sk_receive_queue);
++	while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
++		struct sock *lsk = skb->sk;
++		if (unix_peer(lsk) &&
++		    lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(lsk), ctx)) {
++			if ((cobj = cpt_object_add(CPT_OBJ_SOCKET, lsk, ctx)) == NULL)
++				return -ENOMEM;
++			cobj->o_parent = obj->o_parent;
++		}
++		spin_lock_irq(&sk->sk_receive_queue.lock);
++		skb = skb->next;
++		spin_unlock_irq(&sk->sk_receive_queue.lock);
++	}
++
++	return 0;
++}
++
++int cpt_index_sockets(cpt_context_t * ctx)
++{
++	cpt_object_t *obj;
++	unsigned long index = 0;
++
++	/* Collect not-yet-accepted children of listening sockets. */
++	for_each_object(obj, CPT_OBJ_SOCKET) {
++		struct sock *sk = obj->o_obj;
++
++		if (sk->sk_state != TCP_LISTEN)
++			continue;
++
++		if (sk->sk_family == AF_UNIX)
++			collect_one_unix_listening_sock(obj, ctx);
++	}
++
++	/* Assign indices to all the sockets.
*/ ++ for_each_object(obj, CPT_OBJ_SOCKET) { ++ struct sock *sk = obj->o_obj; ++ cpt_obj_setindex(obj, index++, ctx); ++ ++ if (sk->sk_socket && sk->sk_socket->file) { ++ cpt_object_t *tobj; ++ tobj = lookup_cpt_object(CPT_OBJ_FILE, sk->sk_socket->file, ctx); ++ if (tobj) ++ cpt_obj_setindex(tobj, obj->o_index, ctx); ++ } ++ } ++ ++ return 0; ++} ++ ++void cpt_unlock_sockets(cpt_context_t * ctx) ++{ ++ cpt_object_t *obj; ++ ++ lockdep_off(); ++ for_each_object(obj, CPT_OBJ_SOCKET) { ++ struct sock *sk = obj->o_obj; ++ if (sk && obj->o_lock) { ++ if (sk->sk_socket) ++ release_sock(sk); ++ } ++ } ++ lockdep_on(); ++} ++ ++void cpt_kill_sockets(cpt_context_t * ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_SOCKET) { ++ struct sock *sk = obj->o_obj; ++ if (sk && obj->o_lock) { ++ struct ve_struct *old_env; ++ old_env = set_exec_env(sk->owner_env); ++ cpt_kill_socket(sk, ctx); ++ if (sk->sk_socket) ++ release_sock_nobacklog(sk); ++ set_exec_env(old_env); ++ } ++ } ++} ++ ++__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx) ++{ ++ struct fasync_struct *fa; ++ struct inode *inode = file->f_dentry->d_inode; ++ struct socket *sock; ++ ++ sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket; ++ ++ for (fa = sock->fasync_list; fa; fa = fa->fa_next) { ++ if (fa->fa_file == file) ++ return fa->fa_fd; ++ } ++ return -1; ++} +diff --git a/kernel/cpt/cpt_socket.h b/kernel/cpt/cpt_socket.h +new file mode 100644 +index 0000000..6489184 +--- /dev/null ++++ b/kernel/cpt/cpt_socket.h +@@ -0,0 +1,33 @@ ++struct sock; ++ ++int cpt_collect_passedfds(cpt_context_t *); ++int cpt_index_sockets(cpt_context_t *); ++int cpt_collect_socket(struct file *, cpt_context_t *); ++int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx); ++int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx); ++int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx); ++int rst_sockets(struct cpt_context *ctx); ++int rst_sockets_complete(struct cpt_context *ctx); ++int cpt_dump_orphaned_sockets(struct cpt_context *ctx); ++ ++int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx); ++struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx); ++ ++void cpt_unlock_sockets(cpt_context_t *); ++void cpt_kill_sockets(cpt_context_t *); ++ ++ ++int cpt_kill_socket(struct sock *, cpt_context_t *); ++int cpt_dump_socket_in(struct cpt_sock_image *, struct sock *, struct cpt_context*); ++int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *, struct cpt_context *ctx); ++__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx); ++int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *); ++int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, loff_t pos, struct cpt_context *ctx); ++int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx); ++int cpt_dump_skb(int type, int owner, struct sk_buff *skb, struct cpt_context *ctx); ++int cpt_dump_mcfilter(struct sock *sk, struct cpt_context *ctx); ++ ++int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v, ++ loff_t pos, cpt_context_t *ctx); ++int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v, ++ loff_t pos, cpt_context_t *ctx); +diff --git a/kernel/cpt/cpt_socket_in.c b/kernel/cpt/cpt_socket_in.c +new file mode 100644 +index 0000000..c02d459 +--- /dev/null ++++ b/kernel/cpt/cpt_socket_in.c +@@ -0,0 +1,450 @@ ++/* 
++ * ++ * kernel/cpt/cpt_socket_in.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_socket.h" ++#include "cpt_kernel.h" ++ ++static inline __u32 jiffies_export(unsigned long tmo) ++{ ++ __s32 delta = (long)(tmo - jiffies); ++ return delta; ++} ++ ++static inline __u32 tcp_jiffies_export(__u32 tmo) ++{ ++ __s32 delta = tmo - tcp_time_stamp; ++ return delta; ++} ++ ++int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx) ++{ ++ struct sk_buff *skb; ++ struct tcp_sock *tp; ++ ++ if (sk->sk_type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP) ++ return 0; ++ ++ tp = tcp_sk(sk); ++ ++ skb = skb_peek(&tp->out_of_order_queue); ++ while (skb && skb != (struct sk_buff*)&tp->out_of_order_queue) { ++ int err; ++ ++ err = cpt_dump_skb(CPT_SKB_OFOQ, idx, skb, ctx); ++ if (err) ++ return err; ++ ++ spin_lock_irq(&tp->out_of_order_queue.lock); ++ skb = skb->next; ++ spin_unlock_irq(&tp->out_of_order_queue.lock); ++ } ++ return 0; ++} ++ ++static int cpt_dump_socket_tcp(struct cpt_sock_image *si, struct sock *sk, ++ struct cpt_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ si->cpt_pred_flags = tp->pred_flags; ++ si->cpt_rcv_nxt = tp->rcv_nxt; ++ si->cpt_snd_nxt = tp->snd_nxt; ++ si->cpt_snd_una = tp->snd_una; ++ si->cpt_snd_sml = tp->snd_sml; ++ si->cpt_rcv_tstamp = tcp_jiffies_export(tp->rcv_tstamp); ++ si->cpt_lsndtime = tcp_jiffies_export(tp->lsndtime); ++ si->cpt_tcp_header_len = tp->tcp_header_len; ++ si->cpt_ack_pending = inet_csk(sk)->icsk_ack.pending; ++ si->cpt_quick = inet_csk(sk)->icsk_ack.quick; ++ si->cpt_pingpong = inet_csk(sk)->icsk_ack.pingpong; ++ si->cpt_blocked = inet_csk(sk)->icsk_ack.blocked; ++ si->cpt_ato = inet_csk(sk)->icsk_ack.ato; ++ si->cpt_ack_timeout = jiffies_export(inet_csk(sk)->icsk_ack.timeout); ++ si->cpt_lrcvtime = tcp_jiffies_export(inet_csk(sk)->icsk_ack.lrcvtime); ++ si->cpt_last_seg_size = inet_csk(sk)->icsk_ack.last_seg_size; ++ si->cpt_rcv_mss = inet_csk(sk)->icsk_ack.rcv_mss; ++ si->cpt_snd_wl1 = tp->snd_wl1; ++ si->cpt_snd_wnd = tp->snd_wnd; ++ si->cpt_max_window = tp->max_window; ++ si->cpt_pmtu_cookie = inet_csk(sk)->icsk_pmtu_cookie; ++ si->cpt_mss_cache = tp->mss_cache; ++ si->cpt_mss_cache_std = tp->mss_cache; /* FIXMW was tp->mss_cache_std */ ++ si->cpt_mss_clamp = tp->rx_opt.mss_clamp; ++ si->cpt_ext_header_len = inet_csk(sk)->icsk_ext_hdr_len; ++ si->cpt_ext2_header_len = 0; ++ si->cpt_ca_state = inet_csk(sk)->icsk_ca_state; ++ si->cpt_retransmits = inet_csk(sk)->icsk_retransmits; ++ si->cpt_reordering = tp->reordering; ++ si->cpt_frto_counter = tp->frto_counter; ++ si->cpt_frto_highmark = tp->frto_highmark; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) ++ // // si->cpt_adv_cong = tp->adv_cong; ++#endif ++ si->cpt_defer_accept = inet_csk(sk)->icsk_accept_queue.rskq_defer_accept; ++ si->cpt_backoff = inet_csk(sk)->icsk_backoff; ++ si->cpt_srtt = tp->srtt; ++ si->cpt_mdev = tp->mdev; ++ si->cpt_mdev_max = tp->mdev_max; ++ si->cpt_rttvar = tp->rttvar; ++ si->cpt_rtt_seq = tp->rtt_seq; ++ si->cpt_rto = inet_csk(sk)->icsk_rto; ++ si->cpt_packets_out = tp->packets_out; ++ si->cpt_left_out = tp->sacked_out + tp->lost_out; ++ si->cpt_retrans_out = 
tp->retrans_out; ++ si->cpt_lost_out = tp->lost_out; ++ si->cpt_sacked_out = tp->sacked_out; ++ si->cpt_fackets_out = tp->fackets_out; ++ si->cpt_snd_ssthresh = tp->snd_ssthresh; ++ si->cpt_snd_cwnd = tp->snd_cwnd; ++ si->cpt_snd_cwnd_cnt = tp->snd_cwnd_cnt; ++ si->cpt_snd_cwnd_clamp = tp->snd_cwnd_clamp; ++ si->cpt_snd_cwnd_used = tp->snd_cwnd_used; ++ si->cpt_snd_cwnd_stamp = tcp_jiffies_export(tp->snd_cwnd_stamp); ++ si->cpt_timeout = jiffies_export(inet_csk(sk)->icsk_timeout); ++ si->cpt_ka_timeout = 0; ++ si->cpt_rcv_wnd = tp->rcv_wnd; ++ si->cpt_rcv_wup = tp->rcv_wup; ++ si->cpt_write_seq = tp->write_seq; ++ si->cpt_pushed_seq = tp->pushed_seq; ++ si->cpt_copied_seq = tp->copied_seq; ++ si->cpt_tstamp_ok = tp->rx_opt.tstamp_ok; ++ si->cpt_wscale_ok = tp->rx_opt.wscale_ok; ++ si->cpt_sack_ok = tp->rx_opt.sack_ok; ++ si->cpt_saw_tstamp = tp->rx_opt.saw_tstamp; ++ si->cpt_snd_wscale = tp->rx_opt.snd_wscale; ++ si->cpt_rcv_wscale = tp->rx_opt.rcv_wscale; ++ si->cpt_nonagle = tp->nonagle; ++ si->cpt_keepalive_probes = tp->keepalive_probes; ++ si->cpt_rcv_tsval = tp->rx_opt.rcv_tsval; ++ si->cpt_rcv_tsecr = tp->rx_opt.rcv_tsecr; ++ si->cpt_ts_recent = tp->rx_opt.ts_recent; ++ si->cpt_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; ++ si->cpt_user_mss = tp->rx_opt.user_mss; ++ si->cpt_dsack = tp->rx_opt.dsack; ++ si->cpt_eff_sacks = tp->rx_opt.eff_sacks; ++ si->cpt_sack_array[0] = tp->duplicate_sack[0].start_seq; ++ si->cpt_sack_array[1] = tp->duplicate_sack[0].end_seq; ++ si->cpt_sack_array[2] = tp->selective_acks[0].start_seq; ++ si->cpt_sack_array[3] = tp->selective_acks[0].end_seq; ++ si->cpt_sack_array[4] = tp->selective_acks[1].start_seq; ++ si->cpt_sack_array[5] = tp->selective_acks[1].end_seq; ++ si->cpt_sack_array[6] = tp->selective_acks[2].start_seq; ++ si->cpt_sack_array[7] = tp->selective_acks[2].end_seq; ++ si->cpt_sack_array[8] = tp->selective_acks[3].start_seq; ++ si->cpt_sack_array[9] = tp->selective_acks[3].end_seq; ++ si->cpt_window_clamp = tp->window_clamp; ++ si->cpt_rcv_ssthresh = tp->rcv_ssthresh; ++ si->cpt_probes_out = inet_csk(sk)->icsk_probes_out; ++ si->cpt_num_sacks = tp->rx_opt.num_sacks; ++ si->cpt_advmss = tp->advmss; ++ si->cpt_syn_retries = inet_csk(sk)->icsk_syn_retries; ++ si->cpt_ecn_flags = tp->ecn_flags; ++ si->cpt_prior_ssthresh = tp->prior_ssthresh; ++ si->cpt_high_seq = tp->high_seq; ++ si->cpt_retrans_stamp = tp->retrans_stamp; ++ si->cpt_undo_marker = tp->undo_marker; ++ si->cpt_undo_retrans = tp->undo_retrans; ++ si->cpt_urg_seq = tp->urg_seq; ++ si->cpt_urg_data = tp->urg_data; ++ si->cpt_pending = inet_csk(sk)->icsk_pending; ++ si->cpt_urg_mode = tp->urg_mode; ++ si->cpt_snd_up = tp->snd_up; ++ si->cpt_keepalive_time = tp->keepalive_time; ++ si->cpt_keepalive_intvl = tp->keepalive_intvl; ++ si->cpt_linger2 = tp->linger2; ++ ++ if (sk->sk_state != TCP_LISTEN && ++ sk->sk_state != TCP_CLOSE && ++ sock_flag(sk, SOCK_KEEPOPEN)) { ++ si->cpt_ka_timeout = jiffies_export(sk->sk_timer.expires); ++ } ++ ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ { ++ extern struct inet_connection_sock_af_ops ipv6_mapped; ++ if (sk->sk_family == AF_INET6 && ++ inet_csk(sk)->icsk_af_ops == &ipv6_mapped) ++ si->cpt_mapped = 1; ++ } ++#endif ++ ++ return 0; ++} ++ ++ ++int cpt_dump_socket_in(struct cpt_sock_image *si, struct sock *sk, ++ struct cpt_context *ctx) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ struct ipv6_pinfo *np = inet6_sk(sk); ++ ++ if (sk->sk_family == AF_INET) { ++ struct sockaddr_in *sin = ((struct sockaddr_in*)si->cpt_laddr); ++ 
sin->sin_family = AF_INET; ++ sin->sin_port = inet->sport; ++ sin->sin_addr.s_addr = inet->rcv_saddr; ++ si->cpt_laddrlen = sizeof(*sin); ++ } else if (sk->sk_family == AF_INET6) { ++ struct sockaddr_in6 *sin6 = ((struct sockaddr_in6*)si->cpt_laddr); ++ sin6->sin6_family = AF_INET6; ++ sin6->sin6_port = inet->sport; ++ memcpy(&sin6->sin6_addr, &np->rcv_saddr, 16); ++ si->cpt_laddrlen = sizeof(*sin6); ++ } ++ if (!inet->num) ++ si->cpt_laddrlen = 0; ++ ++ si->cpt_daddr = inet->daddr; ++ si->cpt_dport = inet->dport; ++ si->cpt_saddr = inet->saddr; ++ si->cpt_rcv_saddr = inet->rcv_saddr; ++ si->cpt_sport = inet->sport; ++ si->cpt_uc_ttl = inet->uc_ttl; ++ si->cpt_tos = inet->tos; ++ si->cpt_cmsg_flags = inet->cmsg_flags; ++ si->cpt_mc_index = inet->mc_index; ++ si->cpt_mc_addr = inet->mc_addr; ++ si->cpt_hdrincl = inet->hdrincl; ++ si->cpt_mc_ttl = inet->mc_ttl; ++ si->cpt_mc_loop = inet->mc_loop; ++ si->cpt_pmtudisc = inet->pmtudisc; ++ si->cpt_recverr = inet->recverr; ++ si->cpt_freebind = inet->freebind; ++ si->cpt_idcounter = inet->id; ++ ++ si->cpt_cork_flags = inet->cork.flags; ++ si->cpt_cork_fragsize = 0; ++ si->cpt_cork_length = inet->cork.length; ++ si->cpt_cork_addr = inet->cork.addr; ++ si->cpt_cork_saddr = inet->cork.fl.fl4_src; ++ si->cpt_cork_daddr = inet->cork.fl.fl4_dst; ++ si->cpt_cork_oif = inet->cork.fl.oif; ++ if (inet->cork.dst) { ++ struct rtable *rt = (struct rtable *)inet->cork.dst; ++ si->cpt_cork_fragsize = inet->cork.fragsize; ++ si->cpt_cork_saddr = rt->fl.fl4_src; ++ si->cpt_cork_daddr = rt->fl.fl4_dst; ++ si->cpt_cork_oif = rt->fl.oif; ++ } ++ ++ if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) { ++ struct udp_sock *up = udp_sk(sk); ++ si->cpt_udp_pending = up->pending; ++ si->cpt_udp_corkflag = up->corkflag; ++ si->cpt_udp_encap = up->encap_type; ++ si->cpt_udp_len = up->len; ++ } ++ ++ if (sk->sk_family == AF_INET6) { ++ memcpy(si->cpt_saddr6, &np->saddr, 16); ++ memcpy(si->cpt_rcv_saddr6, &np->rcv_saddr, 16); ++ memcpy(si->cpt_daddr6, &np->daddr, 16); ++ si->cpt_flow_label6 = np->flow_label; ++ si->cpt_frag_size6 = np->frag_size; ++ si->cpt_hop_limit6 = np->hop_limit; ++ si->cpt_mcast_hops6 = np->mcast_hops; ++ si->cpt_mcast_oif6 = np->mcast_oif; ++ si->cpt_rxopt6 = np->rxopt.all; ++ si->cpt_mc_loop6 = np->mc_loop; ++ si->cpt_recverr6 = np->recverr; ++ si->cpt_sndflow6 = np->sndflow; ++ si->cpt_pmtudisc6 = np->pmtudisc; ++ si->cpt_ipv6only6 = np->ipv6only; ++ si->cpt_mapped = 0; ++ } ++ ++ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) ++ cpt_dump_socket_tcp(si, sk, ctx); ++ ++ return 0; ++} ++ ++int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx) ++{ ++ struct request_sock *req; ++ ++ for (req=inet_csk(sk)->icsk_accept_queue.rskq_accept_head; req; req=req->dl_next) ++ cpt_dump_socket(NULL, req->sk, -1, index, ctx); ++ return 0; ++} ++ ++ ++static int dump_openreq(struct request_sock *req, struct sock *sk, int index, ++ struct cpt_context *ctx) ++{ ++ struct cpt_openreq_image *v = cpt_get_buf(ctx); ++ ++ cpt_open_object(NULL, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_OPENREQ; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_VOID; ++ ++ v->cpt_rcv_isn = tcp_rsk(req)->rcv_isn; ++ v->cpt_snt_isn = tcp_rsk(req)->snt_isn; ++ v->cpt_rmt_port = inet_rsk(req)->rmt_port; ++ v->cpt_mss = req->mss; ++ // // v->cpt_family = (req->class == &or_ipv4 ? 
AF_INET : AF_INET6); ++ v->cpt_retrans = req->retrans; ++ v->cpt_snd_wscale = inet_rsk(req)->snd_wscale; ++ v->cpt_rcv_wscale = inet_rsk(req)->rcv_wscale; ++ v->cpt_tstamp_ok = inet_rsk(req)->tstamp_ok; ++ v->cpt_sack_ok = inet_rsk(req)->sack_ok; ++ v->cpt_wscale_ok = inet_rsk(req)->wscale_ok; ++ v->cpt_ecn_ok = inet_rsk(req)->ecn_ok; ++ v->cpt_acked = inet_rsk(req)->acked; ++ v->cpt_window_clamp = req->window_clamp; ++ v->cpt_rcv_wnd = req->rcv_wnd; ++ v->cpt_ts_recent = req->ts_recent; ++ v->cpt_expires = jiffies_export(req->expires); ++ ++ if (v->cpt_family == AF_INET) { ++ memcpy(v->cpt_loc_addr, &inet_rsk(req)->loc_addr, 4); ++ memcpy(v->cpt_rmt_addr, &inet_rsk(req)->rmt_addr, 4); ++ } else { ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ memcpy(v->cpt_loc_addr, &inet6_rsk(req)->loc_addr, 16); ++ memcpy(v->cpt_rmt_addr, &inet6_rsk(req)->rmt_addr, 16); ++ v->cpt_iif = inet6_rsk(req)->iif; ++#endif ++ } ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ cpt_close_object(ctx); ++ return 0; ++} ++ ++int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx) ++{ ++ struct inet_connection_sock *icsk; ++ struct listen_sock *lopt; ++ struct request_sock *req; ++ int nr_entries; ++ int i; ++ ++ icsk = inet_csk(sk); ++ lopt = icsk->icsk_accept_queue.listen_opt; ++ nr_entries = icsk->icsk_accept_queue.listen_opt->nr_table_entries; ++ ++ for (i=0; i < nr_entries; i++) { ++ for (req=lopt->syn_table[i]; req; req=req->dl_next) { ++ loff_t saved_obj; ++ cpt_push_object(&saved_obj, ctx); ++ dump_openreq(req, sk, index, ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ } ++ return 0; ++} ++ ++ ++int cpt_kill_socket(struct sock *sk, cpt_context_t * ctx) ++{ ++ if (sk->sk_state != TCP_CLOSE && ++ (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) && ++ sk->sk_protocol == IPPROTO_TCP) { ++ if (sk->sk_state != TCP_LISTEN) ++ tcp_set_state(sk, TCP_CLOSE); ++ else ++ sk->sk_prot->disconnect(sk, 0); ++ } ++ return 0; ++} ++ ++int cpt_dump_mcfilter(struct sock *sk, cpt_context_t *ctx) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ struct ip_mc_socklist *iml; ++ ++ for (iml = inet->mc_list; iml; iml = iml->next) { ++ struct cpt_sockmc_image smi; ++ int scnt = 0; ++ int i; ++ ++ if (iml->sflist) ++ scnt = iml->sflist->sl_count*16; ++ ++ smi.cpt_next = sizeof(smi) + scnt; ++ smi.cpt_object = CPT_OBJ_SOCK_MCADDR; ++ smi.cpt_hdrlen = sizeof(smi); ++ smi.cpt_content = CPT_CONTENT_DATA; ++ ++ smi.cpt_family = AF_INET; ++ smi.cpt_mode = iml->sfmode; ++ smi.cpt_ifindex = iml->multi.imr_ifindex; ++ memset(&smi.cpt_mcaddr, 0, sizeof(smi.cpt_mcaddr)); ++ smi.cpt_mcaddr[0] = iml->multi.imr_multiaddr.s_addr; ++ ++ ctx->write(&smi, sizeof(smi), ctx); ++ ++ for (i = 0; i < scnt; i++) { ++ u32 addr[4]; ++ memset(&addr, 0, sizeof(addr)); ++ addr[0] = iml->sflist->sl_addr[i]; ++ ctx->write(&addr, sizeof(addr), ctx); ++ } ++ } ++ ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ if (sk->sk_family == AF_INET6) { ++ struct ipv6_mc_socklist *mcl; ++ struct ipv6_pinfo *np = inet6_sk(sk); ++ ++ for (mcl = np->ipv6_mc_list; mcl; mcl = mcl->next) { ++ struct cpt_sockmc_image smi; ++ int scnt = 0; ++ int i; ++ ++ if (mcl->sflist) ++ scnt = mcl->sflist->sl_count*16; ++ ++ smi.cpt_next = sizeof(smi) + scnt; ++ smi.cpt_object = CPT_OBJ_SOCK_MCADDR; ++ smi.cpt_hdrlen = sizeof(smi); ++ smi.cpt_content = CPT_CONTENT_DATA; ++ ++ smi.cpt_family = AF_INET6; ++ smi.cpt_mode = mcl->sfmode; ++ smi.cpt_ifindex = mcl->ifindex; ++ memcpy(&smi.cpt_mcaddr, &mcl->addr, 
sizeof(smi.cpt_mcaddr));
++
++			ctx->write(&smi, sizeof(smi), ctx);
++			for (i = 0; i < scnt; i++)
++				ctx->write(&mcl->sflist->sl_addr[i], 16, ctx);
++		}
++	}
++#endif
++	return 0;
++}
+diff --git a/kernel/cpt/cpt_syscalls.h b/kernel/cpt/cpt_syscalls.h
+new file mode 100644
+index 0000000..ba69cb5
+--- /dev/null
++++ b/kernel/cpt/cpt_syscalls.h
+@@ -0,0 +1,101 @@
++#include
++#include
++#include
++#include
++
++#define WRAP(c, args) return sys_##c args
++#define WRAP2(c, args) int err; mm_segment_t oldfs; \
++		oldfs = get_fs(); set_fs(KERNEL_DS); \
++		err = sys_##c args ;\
++		set_fs(oldfs); \
++		return err
++
++static inline int sc_close(int fd)
++{
++	WRAP(close, (fd));
++}
++
++static inline int sc_dup2(int fd1, int fd2)
++{
++	WRAP(dup2, (fd1, fd2));
++}
++
++static inline int sc_unlink(char *name)
++{
++	WRAP2(unlink, (name));
++}
++
++static inline int sc_pipe(int *pfd)
++{
++	return do_pipe(pfd);
++}
++
++static inline int sc_mknod(char *name, int mode, int dev)
++{
++	WRAP2(mknod, (name, mode, dev));
++}
++
++static inline int sc_chmod(char *name, int mode)
++{
++	WRAP2(chmod, (name, mode));
++}
++
++static inline int sc_chown(char *name, int uid, int gid)
++{
++	WRAP2(chown, (name, uid, gid));
++}
++
++static inline int sc_mkdir(char *name, int mode)
++{
++	WRAP2(mkdir, (name, mode));
++}
++
++static inline int sc_rmdir(char *name)
++{
++	WRAP2(rmdir, (name));
++}
++
++static inline int sc_mount(char *mntdev, char *mntpnt, char *type, unsigned long flags)
++{
++	WRAP2(mount, (mntdev ? : "none", mntpnt, type, flags, NULL));
++}
++
++static inline int sc_mprotect(unsigned long start, size_t len,
++			      unsigned long prot)
++{
++	WRAP(mprotect, (start, len, prot));
++}
++
++static inline int sc_mlock(unsigned long start, size_t len)
++{
++	WRAP(mlock, (start, len));
++}
++
++static inline int sc_munlock(unsigned long start, size_t len)
++{
++	WRAP(munlock, (start, len));
++}
++
++static inline int sc_remap_file_pages(unsigned long start, size_t len,
++				      unsigned long prot, unsigned long pgoff,
++				      unsigned long flags)
++{
++	WRAP(remap_file_pages, (start, len, prot, pgoff, flags));
++}
++
++static inline int sc_waitx(int pid, int opt, int *stat_addr)
++{
++	WRAP(wait4, (pid, stat_addr, opt, NULL));
++}
++
++static inline int sc_flock(int fd, int flags)
++{
++	WRAP(flock, (fd, flags));
++}
++
++static inline int sc_open(char* path, int flags, int mode)
++{
++	WRAP(open, (path, flags, mode));
++}
++
++extern int sc_execve(char *cms, char **argv, char **env);
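A note on the WRAP2() macro above: it widens the user-copy address limit so the in-kernel checkpointer can pass kernel pointers to syscalls that expect user memory. Expanded by hand for sc_unlink(), it is equivalent to the following sketch (the expanded function name is illustrative):

    /* Equivalent of sc_unlink() with WRAP2() expanded: the dump code holds
     * kernel-space strings, so get_fs()/set_fs(KERNEL_DS) temporarily lets
     * sys_unlink() accept them, and the old limit is restored afterwards. */
    static inline int sc_unlink_expanded(char *name)
    {
            int err;
            mm_segment_t oldfs;

            oldfs = get_fs();
            set_fs(KERNEL_DS);
            err = sys_unlink(name);
            set_fs(oldfs);
            return err;
    }
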
+diff --git a/kernel/cpt/cpt_sysvipc.c b/kernel/cpt/cpt_sysvipc.c
+new file mode 100644
+index 0000000..8117307
+--- /dev/null
++++ b/kernel/cpt/cpt_sysvipc.c
+@@ -0,0 +1,403 @@
++/*
++ *
++ *  kernel/cpt/cpt_sysvipc.c
++ *
++ *  Copyright (C) 2000-2005  SWsoft
++ *  All rights reserved.
++ *
++ *  Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_kernel.h"
++
++struct _warg {
++	struct file *file;
++	struct cpt_sysvshm_image *v;
++};
++
++static int dump_one_shm(struct shmid_kernel *shp, void *arg)
++{
++	struct _warg *warg = arg;
++	struct cpt_sysvshm_image *v = (struct cpt_sysvshm_image *)warg->v;
++
++	if (shp->shm_file != warg->file)
++		return 0;
++
++	v->cpt_key = shp->shm_perm.key;
++	v->cpt_uid = shp->shm_perm.uid;
++	v->cpt_gid = shp->shm_perm.gid;
++	v->cpt_cuid = shp->shm_perm.cuid;
++	v->cpt_cgid = shp->shm_perm.cgid;
++	v->cpt_mode = shp->shm_perm.mode;
++	v->cpt_seq = shp->shm_perm.seq;
++
++	v->cpt_id = shp->shm_perm.id;
++	v->cpt_segsz = shp->shm_segsz;
++	v->cpt_atime = shp->shm_atim;
++	v->cpt_ctime = shp->shm_ctim;
++	v->cpt_dtime = shp->shm_dtim;
++	v->cpt_creator = shp->shm_cprid;
++	v->cpt_last = shp->shm_lprid;
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
++	v->cpt_mlockuser = shp->mlock_user ? shp->mlock_user->uid : -1;
++#else
++	v->cpt_mlockuser = -1;
++#endif
++	return 1;
++}
++
++int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx)
++{
++	struct cpt_sysvshm_image *v = cpt_get_buf(ctx);
++	struct _warg warg;
++
++	v->cpt_next = sizeof(*v);
++	v->cpt_object = CPT_OBJ_SYSV_SHM;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_VOID;
++
++	warg.file = file;
++	warg.v = v;
++	if (sysvipc_walk_shm(dump_one_shm, &warg) == 0) {
++		cpt_release_buf(ctx);
++		return -ESRCH;
++	}
++
++	ctx->write(v, sizeof(*v), ctx);
++	cpt_release_buf(ctx);
++	return 0;
++}
++
++
++int match_sem(int id, struct sem_array *sema, void *arg)
++{
++	if (id != (unsigned long)arg)
++		return 0;
++	return sema->sem_nsems + 1;
++}
++
++static int get_sem_nsem(int id, cpt_context_t *ctx)
++{
++	int res;
++	res = sysvipc_walk_sem(match_sem, (void*)(unsigned long)id);
++	if (res > 0)
++		return res - 1;
++	eprintk_ctx("get_sem_nsem: SYSV semaphore %d not found\n", id);
++	return -ESRCH;
++}
++
++static int dump_one_semundo(struct sem_undo *su, struct cpt_context *ctx)
++{
++	struct cpt_sysvsem_undo_image v;
++	loff_t saved_obj;
++
++	cpt_open_object(NULL, ctx);
++
++	v.cpt_next = CPT_NULL;
++	v.cpt_object = CPT_OBJ_SYSVSEM_UNDO_REC;
++	v.cpt_hdrlen = sizeof(v);
++	v.cpt_content = CPT_CONTENT_SEMUNDO;
++	v.cpt_id = su->semid;
++	v.cpt_nsem = get_sem_nsem(su->semid, ctx);
++	if ((int)v.cpt_nsem < 0)
++		return -ESRCH;
++
++	ctx->write(&v, sizeof(v), ctx);
++
++	cpt_push_object(&saved_obj, ctx);
++	ctx->write(su->semadj, v.cpt_nsem*sizeof(short), ctx);
++	cpt_pop_object(&saved_obj, ctx);
++
++	cpt_close_object(ctx);
++	return 0;
++}
++
++struct sem_warg {
++	int last_id;
++	struct cpt_sysvsem_image *v;
++};
++
++static int dump_one_sem(int id, struct sem_array *sma, void *arg)
++{
++	struct sem_warg * warg = (struct sem_warg *)arg;
++	struct cpt_sysvsem_image *v = warg->v;
++	int i;
++
++	if (warg->last_id != -1) {
++		if ((id % IPCMNI) <= warg->last_id)
++			return 0;
++	}
++
++	v->cpt_next = sizeof(*v);
++	v->cpt_object = CPT_OBJ_SYSV_SEM;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_SEMARRAY;
++
++	v->cpt_key = sma->sem_perm.key;
++	v->cpt_uid = sma->sem_perm.uid;
++	v->cpt_gid = sma->sem_perm.gid;
++	v->cpt_cuid = sma->sem_perm.cuid;
++	v->cpt_cgid = sma->sem_perm.cgid;
++	v->cpt_mode = sma->sem_perm.mode;
++	v->cpt_seq = sma->sem_perm.seq;
++
++	v->cpt_id = id;
++	v->cpt_ctime = sma->sem_ctime;
++	v->cpt_otime = sma->sem_otime;
++
++	for (i=0; i<sma->sem_nsems; i++) {
++		struct {
++			__u32 semval;
++			__u32 sempid;
++		} *s = (void*)v + v->cpt_next;
++		if (v->cpt_next >= PAGE_SIZE - sizeof(*s))
++			return -EINVAL;
++		s->semval = sma->sem_base[i].semval;
++		s->sempid = sma->sem_base[i].sempid;
++		v->cpt_next += sizeof(*s);
++	}
++
++	warg->last_id = id % IPCMNI;
++	return 1;
++}
++
++
++int cpt_dump_sysvsem(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++	struct sem_warg warg;
++
++	/* Dumping semaphores is quite tricky because we cannot
++	 * write to the dump file under the lock inside sysvipc_walk_sem().
++	 */
++	cpt_open_section(ctx, CPT_SECT_SYSV_SEM);
++	warg.last_id = -1;
++	warg.v = cpt_get_buf(ctx);
++	for (;;) {
++		if (sysvipc_walk_sem(dump_one_sem, &warg) <= 0)
++			break;
++		ctx->write(warg.v, warg.v->cpt_next, ctx);
++	}
++	cpt_release_buf(ctx);
++	cpt_close_section(ctx);
++
++	cpt_open_section(ctx, CPT_SECT_SYSVSEM_UNDO);
++	for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) {
++		struct sem_undo_list *semu = obj->o_obj;
++		struct sem_undo *su;
++		struct cpt_object_hdr v;
++		loff_t saved_obj;
++
++		cpt_open_object(obj, ctx);
++
++		v.cpt_next = CPT_NULL;
++		v.cpt_object = CPT_OBJ_SYSVSEM_UNDO;
++		v.cpt_hdrlen = sizeof(v);
++		v.cpt_content = CPT_CONTENT_ARRAY;
++
++		ctx->write(&v, sizeof(v), ctx);
++
++		cpt_push_object(&saved_obj, ctx);
++		for (su = semu->proc_list; su; su = su->proc_next) {
++			if (su->semid != -1) {
++				int err;
++				err = dump_one_semundo(su, ctx);
++				if (err < 0)
++					return err;
++			}
++		}
++		cpt_pop_object(&saved_obj, ctx);
++
++		cpt_close_object(ctx);
++	}
++	cpt_close_section(ctx);
++	return 0;
++}
++
++struct msg_warg {
++	int last_id;
++	struct msg_queue *msq;
++	struct cpt_sysvmsg_image *v;
++};
++
++static int dump_one_msg(int id, struct msg_queue *msq, void *arg)
++{
++	struct msg_warg * warg = (struct msg_warg *)arg;
++	struct cpt_sysvmsg_image *v = warg->v;
++
++	if (warg->last_id != -1) {
++		if ((id % IPCMNI) <= warg->last_id)
++			return 0;
++	}
++
++	v->cpt_next = sizeof(*v);
++	v->cpt_object = CPT_OBJ_SYSVMSG;
++	v->cpt_hdrlen = sizeof(*v);
++	v->cpt_content = CPT_CONTENT_ARRAY;
++
++	v->cpt_key = msq->q_perm.key;
++	v->cpt_uid = msq->q_perm.uid;
++	v->cpt_gid = msq->q_perm.gid;
++	v->cpt_cuid = msq->q_perm.cuid;
++	v->cpt_cgid = msq->q_perm.cgid;
++	v->cpt_mode = msq->q_perm.mode;
++	v->cpt_seq = msq->q_perm.seq;
++
++	v->cpt_id = id;
++	v->cpt_stime = msq->q_stime;
++	v->cpt_rtime = msq->q_rtime;
++	v->cpt_ctime = msq->q_ctime;
++	v->cpt_last_sender = msq->q_lspid;
++	v->cpt_last_receiver = msq->q_lrpid;
++	v->cpt_qbytes = msq->q_qbytes;
++
++	warg->msq = msq;
++	warg->last_id = id % IPCMNI;
++	return 1;
++}
++
++static int do_store(void * src, int len, int offset, void * data)
++{
++	cpt_context_t * ctx = data;
++	ctx->write(src, len, ctx);
++	return 0;
++}
++
++static void cpt_dump_one_sysvmsg(struct msg_msg *m, cpt_context_t * ctx)
++{
++	loff_t saved_obj;
++	struct cpt_sysvmsg_msg_image mv;
++
++	cpt_open_object(NULL, ctx);
++	mv.cpt_next = CPT_NULL;
++	mv.cpt_object = CPT_OBJ_SYSVMSG_MSG;
++	mv.cpt_hdrlen = sizeof(mv);
++	mv.cpt_content = CPT_CONTENT_DATA;
++
++	mv.cpt_type = m->m_type;
++	mv.cpt_size = m->m_ts;
++
++	ctx->write(&mv, sizeof(mv), ctx);
++
++	cpt_push_object(&saved_obj, ctx);
++	sysv_msg_store(m, do_store, m->m_ts, ctx);
++	cpt_pop_object(&saved_obj, ctx);
++	cpt_close_object(ctx);
++}
++
++int cpt_dump_sysvmsg(struct cpt_context *ctx)
++{
++	struct msg_warg warg;
++
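As the comment below explains, these walkers cannot write to the dump file while the IPC lock is held. The cursor trick shared by dump_one_sem() and dump_one_msg() is worth spelling out as a sketch (names here are illustrative, not from the patch):

    /* Each sysvipc_walk_*() pass runs under the IPC lock, so the callback
     * only snapshots the first object whose id lies past the cursor and
     * stops the walk; the caller writes the snapshot outside the lock and
     * restarts the walk from scratch. That is O(n^2) scanning, but n is
     * small and the container is frozen anyway. */
    struct dump_cursor { int last_id; };

    static int want_this_one(int id, struct dump_cursor *c)
    {
            if (c->last_id != -1 && (id % IPCMNI) <= c->last_id)
                    return 0;               /* already dumped, keep scanning */
            c->last_id = id % IPCMNI;
            return 1;                       /* stop; caller writes it out */
    }
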
/* Dumping msg queues is tricky because we cannot ++ * write to dump file under lock inside sysvipc_walk_msg(). ++ * ++ * And even worse, we have to access msg list in an unserialized ++ * context. It is fragile. But VE is still frozen, remember? ++ */ ++ cpt_open_section(ctx, CPT_SECT_SYSV_MSG); ++ warg.last_id = -1; ++ warg.v = cpt_get_buf(ctx); ++ for (;;) { ++ loff_t saved_obj; ++ struct msg_msg * m; ++ ++ if (sysvipc_walk_msg(dump_one_msg, &warg) <= 0) ++ break; ++ ++ cpt_open_object(NULL, ctx); ++ ++ ctx->write(warg.v, warg.v->cpt_next, ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ list_for_each_entry(m, &warg.msq->q_messages, m_list) { ++ cpt_dump_one_sysvmsg(m, ctx); ++ } ++ cpt_pop_object(&saved_obj, ctx); ++ ++ cpt_close_object(ctx); ++ } ++ cpt_release_buf(ctx); ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++static int cpt_collect_sysvsem_undo(cpt_context_t *ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ struct task_struct *tsk = obj->o_obj; ++ if (tsk->exit_state) { ++ /* ipc/sem.c forgets to clear tsk->sysvsem.undo_list ++ * on exit. Grrr... */ ++ continue; ++ } ++ if (tsk->sysvsem.undo_list && ++ cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx) == NULL) ++ return -ENOMEM; ++ } ++ ++ for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) { ++ struct sem_undo_list *semu = obj->o_obj; ++ ++ if (atomic_read(&semu->refcnt) != obj->o_count) { ++ eprintk_ctx("sem_undo_list is referenced outside %d %d\n", obj->o_count, atomic_read(&semu->refcnt)); ++ return -EBUSY; ++ } ++ } ++ return 0; ++} ++ ++static int collect_one_shm(struct shmid_kernel *shp, void *arg) ++{ ++ cpt_context_t *ctx = arg; ++ ++ if (__cpt_object_add(CPT_OBJ_FILE, shp->shm_file, GFP_ATOMIC, ctx) == NULL) ++ return -ENOMEM; ++ return 0; ++} ++ ++int cpt_collect_sysvshm(cpt_context_t * ctx) ++{ ++ int err; ++ ++ err = sysvipc_walk_shm(collect_one_shm, ctx); ++ ++ return err < 0 ? err : 0; ++} ++ ++int cpt_collect_sysv(cpt_context_t * ctx) ++{ ++ int err; ++ ++ err = cpt_collect_sysvsem_undo(ctx); ++ if (err) ++ return err; ++ err = cpt_collect_sysvshm(ctx); ++ if (err) ++ return err; ++ ++ return 0; ++} +diff --git a/kernel/cpt/cpt_tty.c b/kernel/cpt/cpt_tty.c +new file mode 100644 +index 0000000..8ac9417 +--- /dev/null ++++ b/kernel/cpt/cpt_tty.c +@@ -0,0 +1,215 @@ ++/* ++ * ++ * kernel/cpt/cpt_tty.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++/* We must support at least N_TTY. 
*/ ++ ++int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx) ++{ ++ struct tty_struct *tty = file->private_data; ++ cpt_object_t *obj; ++ struct cpt_obj_ref o; ++ loff_t saved_pos; ++ ++ obj = lookup_cpt_object(CPT_OBJ_TTY, tty, ctx); ++ if (!obj) ++ return -EINVAL; ++ ++ cpt_push_object(&saved_pos, ctx); ++ ++ o.cpt_next = sizeof(o); ++ o.cpt_object = CPT_OBJ_REF; ++ o.cpt_hdrlen = sizeof(o); ++ o.cpt_content = CPT_CONTENT_VOID; ++ o.cpt_pos = obj->o_pos; ++ ctx->write(&o, sizeof(o), ctx); ++ ++ cpt_pop_object(&saved_pos, ctx); ++ ++ return 0; ++} ++ ++int cpt_collect_tty(struct file *file, cpt_context_t * ctx) ++{ ++ struct tty_struct *tty = file->private_data; ++ ++ if (tty) { ++ if (cpt_object_add(CPT_OBJ_TTY, tty, ctx) == NULL) ++ return -ENOMEM; ++ if (tty->link) { ++ cpt_object_t *obj; ++ ++ obj = cpt_object_add(CPT_OBJ_TTY, tty->link, ctx); ++ if (obj == NULL) ++ return -ENOMEM; ++ /* Undo o_count, tty->link is not a reference */ ++ obj->o_count--; ++ } ++ } ++ return 0; ++} ++ ++int cpt_dump_tty(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct tty_struct *tty = obj->o_obj; ++ struct cpt_tty_image *v; ++ ++ if (tty->link) { ++ if (lookup_cpt_object(CPT_OBJ_TTY, tty->link, ctx) == NULL) { ++ eprintk_ctx("orphan pty %s %d\n", tty->name, tty->driver->subtype == PTY_TYPE_SLAVE); ++ return -EINVAL; ++ } ++ if (tty->link->link != tty) { ++ eprintk_ctx("bad pty pair\n"); ++ return -EINVAL; ++ } ++ if (tty->driver->type == TTY_DRIVER_TYPE_PTY && ++ tty->driver->subtype == PTY_TYPE_SLAVE && ++ tty->link->count) ++ obj->o_count++; ++ } ++ if (obj->o_count != tty->count) { ++ eprintk_ctx("tty %s is referenced outside %d %d\n", tty->name, obj->o_count, tty->count); ++ return -EBUSY; ++ } ++ ++ cpt_open_object(obj, ctx); ++ ++ v = cpt_get_buf(ctx); ++ v->cpt_next = -1; ++ v->cpt_object = CPT_OBJ_TTY; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_index = tty->index; ++ v->cpt_link = -1; ++ if (tty->link) ++ v->cpt_link = tty->link->index; ++ v->cpt_drv_type = tty->driver->type; ++ v->cpt_drv_subtype = tty->driver->subtype; ++ v->cpt_drv_flags = tty->driver->flags; ++ v->cpt_packet = tty->packet; ++ v->cpt_stopped = tty->stopped; ++ v->cpt_hw_stopped = tty->hw_stopped; ++ v->cpt_flow_stopped = tty->flow_stopped; ++ v->cpt_flags = tty->flags; ++ v->cpt_ctrl_status = tty->ctrl_status; ++ v->cpt_canon_data = tty->canon_data; ++ v->cpt_canon_head = tty->canon_head - tty->read_tail; ++ v->cpt_canon_column = tty->canon_column; ++ v->cpt_column = tty->column; ++ v->cpt_erasing = tty->erasing; ++ v->cpt_lnext = tty->lnext; ++ v->cpt_icanon = tty->icanon; ++ v->cpt_raw = tty->raw; ++ v->cpt_real_raw = tty->real_raw; ++ v->cpt_closing = tty->closing; ++ v->cpt_minimum_to_wake = tty->minimum_to_wake; ++ v->cpt_pgrp = 0; ++ if (tty->pgrp) { ++ v->cpt_pgrp = pid_vnr(tty->pgrp); ++ if ((int)v->cpt_pgrp < 0) { ++ dprintk_ctx("cannot map tty->pgrp %d -> %d\n", pid_vnr(tty->pgrp), (int)v->cpt_pgrp); ++ v->cpt_pgrp = -1; ++ } ++ } ++ v->cpt_session = 0; ++ if (tty->session) { ++ v->cpt_session = pid_vnr(tty->session); ++ if ((int)v->cpt_session < 0) { ++ eprintk_ctx("cannot map tty->session %d -> %d\n", pid_nr(tty->session), (int)v->cpt_session); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ } ++ memcpy(v->cpt_name, tty->name, 64); ++ v->cpt_ws_row = tty->winsize.ws_row; ++ v->cpt_ws_col = tty->winsize.ws_col; ++ v->cpt_ws_prow = tty->winsize.ws_ypixel; ++ v->cpt_ws_pcol = tty->winsize.ws_xpixel; ++ if (tty->termios == NULL) { ++ eprintk_ctx("NULL 
termios"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ v->cpt_c_line = tty->termios->c_line; ++ v->cpt_c_iflag = tty->termios->c_iflag; ++ v->cpt_c_oflag = tty->termios->c_oflag; ++ v->cpt_c_cflag = tty->termios->c_cflag; ++ v->cpt_c_lflag = tty->termios->c_lflag; ++ memcpy(v->cpt_c_cc, tty->termios->c_cc, NCCS); ++ if (NCCS < 32) ++ memset(v->cpt_c_cc + NCCS, 255, 32 - NCCS); ++ memcpy(v->cpt_read_flags, tty->read_flags, sizeof(v->cpt_read_flags)); ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ if (tty->read_buf && tty->read_cnt) { ++ struct cpt_obj_bits *v = cpt_get_buf(ctx); ++ loff_t saved_pos; ++ ++ cpt_push_object(&saved_pos, ctx); ++ cpt_open_object(NULL, ctx); ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_BITS; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_DATA; ++ v->cpt_size = tty->read_cnt; ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ if (tty->read_cnt) { ++ int n = min(tty->read_cnt, N_TTY_BUF_SIZE - tty->read_tail); ++ ctx->write(tty->read_buf + tty->read_tail, n, ctx); ++ if (tty->read_cnt > n) ++ ctx->write(tty->read_buf, tty->read_cnt-n, ctx); ++ ctx->align(ctx); ++ } ++ ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_pos, ctx); ++ } ++ ++ cpt_close_object(ctx); ++ ++ return 0; ++} ++ ++__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx) ++{ ++ struct tty_struct * tty; ++ struct fasync_struct *fa; ++ ++ tty = (struct tty_struct *)file->private_data; ++ ++ for (fa = tty->fasync; fa; fa = fa->fa_next) { ++ if (fa->fa_file == file) ++ return fa->fa_fd; ++ } ++ return -1; ++} +diff --git a/kernel/cpt/cpt_ubc.c b/kernel/cpt/cpt_ubc.c +new file mode 100644 +index 0000000..fc27e74 +--- /dev/null ++++ b/kernel/cpt/cpt_ubc.c +@@ -0,0 +1,132 @@ ++/* ++ * ++ * kernel/cpt/cpt_ubc.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ obj = cpt_object_add(CPT_OBJ_UBC, bc, ctx); ++ if (obj != NULL) { ++ if (obj->o_count == 1) ++ get_beancounter(bc); ++ if (bc->parent != NULL && obj->o_parent == NULL) ++ obj->o_parent = cpt_add_ubc(bc->parent, ctx); ++ } ++ return obj; ++} ++ ++__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ obj = lookup_cpt_object(CPT_OBJ_UBC, bc, ctx); ++ if (obj == NULL) { ++ char buf[48]; ++ print_ub_uid(bc, buf, sizeof(buf)); ++ eprintk("CPT: unknown ub %s (%p)\n", buf, bc); ++ dump_stack(); ++ return CPT_NULL; ++ } ++ return obj->o_pos; ++} ++ ++static void dump_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm, ++ int held) ++{ ++ dmp->barrier = (prm->barrier < UB_MAXVALUE ? prm->barrier : CPT_NULL); ++ dmp->limit = (prm->limit < UB_MAXVALUE ? prm->limit : CPT_NULL); ++ dmp->held = (held ? 
prm->held : CPT_NULL); ++ dmp->maxheld = prm->maxheld; ++ dmp->minheld = prm->minheld; ++ dmp->failcnt = prm->failcnt; ++} ++ ++static int dump_one_bc(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct user_beancounter *bc; ++ struct cpt_beancounter_image *v; ++ int i; ++ ++ bc = obj->o_obj; ++ v = cpt_get_buf(ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_UBC; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_VOID; ++ ++ if (obj->o_parent != NULL) ++ v->cpt_parent = ((cpt_object_t *)obj->o_parent)->o_pos; ++ else ++ v->cpt_parent = CPT_NULL; ++ v->cpt_id = (obj->o_parent != NULL) ? bc->ub_uid : 0; ++ for (i = 0; i < UB_RESOURCES; i++) { ++ dump_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0); ++ dump_one_bc_parm(v->cpt_parms + i * 2 + 1, bc->ub_store + i, 1); ++ } ++ memset(v->cpt_parms + UB_RESOURCES * 2, 0, ++ sizeof(v->cpt_parms) ++ - UB_RESOURCES * 2 * sizeof(v->cpt_parms[0])); ++ ++ cpt_open_object(obj, ctx); ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_close_object(ctx); ++ ++ cpt_release_buf(ctx); ++ return 0; ++} ++ ++int cpt_dump_ubc(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ int skipped; ++ int top; ++ ++ cpt_open_section(ctx, CPT_SECT_UBC); ++ ++ do { ++ skipped = 0; ++ top = 0; ++ for_each_object(obj, CPT_OBJ_UBC) { ++ if (obj->o_parent == NULL) ++ top++; ++ if (obj->o_pos != CPT_NULL) ++ continue; ++ if (obj->o_parent != NULL && ++ ((cpt_object_t *)obj->o_parent)->o_pos == CPT_NULL) ++ skipped++; ++ else ++ dump_one_bc(obj, ctx); ++ } ++ } while (skipped && (top < 2)); ++ ++ cpt_close_section(ctx); ++ if (top > 1) { ++ eprintk_ctx("More than one top level ub exist"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void cpt_finish_ubc(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_UBC) ++ put_beancounter(obj->o_obj); ++} +diff --git a/kernel/cpt/cpt_ubc.h b/kernel/cpt/cpt_ubc.h +new file mode 100644 +index 0000000..645ba79 +--- /dev/null ++++ b/kernel/cpt/cpt_ubc.h +@@ -0,0 +1,23 @@ ++#ifdef CONFIG_BEANCOUNTERS ++cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx); ++__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx); ++int cpt_dump_ubc(struct cpt_context *ctx); ++ ++struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx); ++int rst_undump_ubc(struct cpt_context *ctx); ++ ++void cpt_finish_ubc(struct cpt_context *ctx); ++void rst_finish_ubc(struct cpt_context *ctx); ++void copy_one_ubparm(struct ubparm *from, struct ubparm *to, int bc_parm_id); ++void set_one_ubparm_to_max(struct ubparm *ubprm, int bc_parm_id); ++#else ++static int inline cpt_dump_ubc(struct cpt_context *ctx) ++{ return 0; } ++static int inline rst_undump_ubc(struct cpt_context *ctx) ++{ return 0; } ++static void inline cpt_finish_ubc(struct cpt_context *ctx) ++{ return; } ++static void inline rst_finish_ubc(struct cpt_context *ctx) ++{ return; } ++#endif ++ +diff --git a/kernel/cpt/cpt_x8664.S b/kernel/cpt/cpt_x8664.S +new file mode 100644 +index 0000000..0d5e361 +--- /dev/null ++++ b/kernel/cpt/cpt_x8664.S +@@ -0,0 +1,67 @@ ++#define ASSEMBLY 1 ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ .code64 ++ ++ .macro FAKE_STACK_FRAME child_rip ++ /* push in order ss, rsp, eflags, cs, rip */ ++ xorq %rax, %rax ++ pushq %rax /* ss */ ++ pushq %rax /* rsp */ ++ pushq $(1<<9) /* eflags - interrupts on */ ++ pushq $__KERNEL_CS /* cs */ ++ pushq \child_rip /* rip */ ++ pushq 
%rax /* orig rax */ ++ .endm ++ ++ .macro UNFAKE_STACK_FRAME ++ addq $8*6, %rsp ++ .endm ++ ++ENTRY(asm_kernel_thread) ++ CFI_STARTPROC ++ FAKE_STACK_FRAME $child_rip ++ SAVE_ALL ++ ++ # rdi: flags, rsi: usp, rdx: will be &pt_regs ++ movq %rdx,%rdi ++ orq $0x00800000,%rdi ++ movq $-1, %rsi ++ movq %rsp, %rdx ++ ++ xorl %r8d,%r8d ++ xorl %r9d,%r9d ++ pushq %rcx ++ call do_fork_pid ++ addq $8, %rsp ++ /* call do_fork */ ++ movq %rax,RAX(%rsp) ++ xorl %edi,%edi ++ RESTORE_ALL ++ UNFAKE_STACK_FRAME ++ ret ++ CFI_ENDPROC ++ENDPROC(asm_kernel_thread) ++ ++child_rip: ++ pushq $0 # fake return address ++ CFI_STARTPROC ++ movq %rdi, %rax ++ movq %rsi, %rdi ++ call *%rax ++ movq %rax, %rdi ++ call do_exit ++ CFI_ENDPROC ++ENDPROC(child_rip) ++ +diff --git a/kernel/cpt/rst_conntrack.c b/kernel/cpt/rst_conntrack.c +new file mode 100644 +index 0000000..4c31f32 +--- /dev/null ++++ b/kernel/cpt/rst_conntrack.c +@@ -0,0 +1,283 @@ ++/* ++ * ++ * kernel/cpt/rst_conntrack.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#if defined(CONFIG_VE_IPTABLES) && \ ++ (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define ASSERT_READ_LOCK(x) do { } while (0) ++#define ASSERT_WRITE_LOCK(x) do { } while (0) ++ ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++struct ct_holder ++{ ++ struct ct_holder *next; ++ struct ip_conntrack *ct; ++ int index; ++}; ++ ++static void decode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple, int dir) ++{ ++ tuple->dst.ip = v->cpt_dst; ++ tuple->dst.u.all = v->cpt_dstport; ++ tuple->dst.protonum = v->cpt_protonum; ++ tuple->dst.dir = v->cpt_dir; ++ if (dir != tuple->dst.dir) ++ wprintk("dir != tuple->dst.dir\n"); ++ ++ tuple->src.ip = v->cpt_src; ++ tuple->src.u.all = v->cpt_srcport; ++} ++ ++ ++static int undump_expect_list(struct ip_conntrack *ct, ++ struct cpt_ip_conntrack_image *ci, ++ loff_t pos, struct ct_holder *ct_list, ++ cpt_context_t *ctx) ++{ ++ loff_t end; ++ int err; ++ ++ end = pos + ci->cpt_next; ++ pos += ci->cpt_hdrlen; ++ while (pos < end) { ++ struct cpt_ip_connexpect_image v; ++ struct ip_conntrack_expect *exp; ++ struct ip_conntrack *sibling; ++ ++ err = rst_get_object(CPT_OBJ_NET_CONNTRACK_EXPECT, pos, &v, ctx); ++ if (err) ++ return err; ++ ++ sibling = NULL; ++ if (v.cpt_sibling_conntrack) { ++ struct ct_holder *c; ++ ++ for (c = ct_list; c; c = c->next) { ++ if (c->index == v.cpt_sibling_conntrack) { ++ sibling = c->ct; ++ break; ++ } ++ } ++ if (!sibling) { ++ eprintk_ctx("lost sibling of expectation\n"); ++ return -EINVAL; ++ } ++ } ++ ++ write_lock_bh(&ip_conntrack_lock); ++ ++ /* It is possible. Helper module could be just unregistered, ++ * if expectation were on the list, it would be destroyed. */ ++ if (ct->helper == NULL) { ++ write_unlock_bh(&ip_conntrack_lock); ++ dprintk_ctx("conntrack: no helper and non-trivial expectation\n"); ++ continue; ++ } ++ ++ exp = ip_conntrack_expect_alloc(NULL); ++ if (exp == NULL) { ++ write_unlock_bh(&ip_conntrack_lock); ++ return -ENOMEM; ++ } ++ ++ if (ct->helper->timeout && !del_timer(&exp->timeout)) { ++ /* Dying already. We can do nothing. 
*/ ++ write_unlock_bh(&ip_conntrack_lock); ++ dprintk_ctx("conntrack expectation is dying\n"); ++ continue; ++ } ++ ++ decode_tuple(&v.cpt_tuple, &exp->tuple, 0); ++ decode_tuple(&v.cpt_mask, &exp->mask, 0); ++ ++ exp->master = ct; ++ nf_conntrack_get(&ct->ct_general); ++ ip_conntrack_expect_insert(exp); ++#if 0 ++ if (sibling) { ++ exp->sibling = sibling; ++ sibling->master = exp; ++ LIST_DELETE(&ve_ip_conntrack_expect_list, exp); ++ ct->expecting--; ++ nf_conntrack_get(&master_ct(sibling)->infos[0]); ++ } else ++#endif ++ if (ct->helper->timeout) { ++ exp->timeout.expires = jiffies + v.cpt_timeout; ++ add_timer(&exp->timeout); ++ } ++ write_unlock_bh(&ip_conntrack_lock); ++ ++ pos += v.cpt_next; ++ } ++ return 0; ++} ++ ++static int undump_one_ct(struct cpt_ip_conntrack_image *ci, loff_t pos, ++ struct ct_holder **ct_list, cpt_context_t *ctx) ++{ ++ int err = 0; ++ struct ip_conntrack *conntrack; ++ struct ct_holder *c; ++ struct ip_conntrack_tuple orig, repl; ++ ++ c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL); ++ if (c == NULL) ++ return -ENOMEM; ++ ++ decode_tuple(&ci->cpt_tuple[0], &orig, 0); ++ decode_tuple(&ci->cpt_tuple[1], &repl, 1); ++ ++ conntrack = ip_conntrack_alloc(&orig, &repl, get_exec_env()->_ip_conntrack->ub); ++ if (!conntrack || IS_ERR(conntrack)) { ++ kfree(c); ++ return -ENOMEM; ++ } ++ ++ c->ct = conntrack; ++ c->next = *ct_list; ++ *ct_list = c; ++ c->index = ci->cpt_index; ++ ++ decode_tuple(&ci->cpt_tuple[0], &conntrack->tuplehash[0].tuple, 0); ++ decode_tuple(&ci->cpt_tuple[1], &conntrack->tuplehash[1].tuple, 1); ++ ++ conntrack->status = ci->cpt_status; ++ ++ memcpy(&conntrack->proto, ci->cpt_proto_data, sizeof(conntrack->proto)); ++ memcpy(&conntrack->help, ci->cpt_help_data, sizeof(conntrack->help)); ++ ++#ifdef CONFIG_IP_NF_NAT_NEEDED ++#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ ++ defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) ++ conntrack->nat.masq_index = ci->cpt_masq_index; ++#endif ++ if (ci->cpt_initialized) { ++ conntrack->nat.info.seq[0].correction_pos = ci->cpt_nat_seq[0].cpt_correction_pos; ++ conntrack->nat.info.seq[0].offset_before = ci->cpt_nat_seq[0].cpt_offset_before; ++ conntrack->nat.info.seq[0].offset_after = ci->cpt_nat_seq[0].cpt_offset_after; ++ conntrack->nat.info.seq[1].correction_pos = ci->cpt_nat_seq[1].cpt_correction_pos; ++ conntrack->nat.info.seq[1].offset_before = ci->cpt_nat_seq[1].cpt_offset_before; ++ conntrack->nat.info.seq[1].offset_after = ci->cpt_nat_seq[1].cpt_offset_after; ++ } ++ if (conntrack->status & IPS_NAT_DONE_MASK) ++ ip_nat_hash_conntrack(conntrack); ++#endif ++ ++ if (ci->cpt_ct_helper) { ++ conntrack->helper = ip_conntrack_helper_find_get(&conntrack->tuplehash[1].tuple); ++ if (conntrack->helper == NULL) { ++ eprintk_ctx("conntrack: cannot find helper, some module is not loaded\n"); ++ err = -EINVAL; ++ } ++ } ++ ++ ip_conntrack_hash_insert(conntrack); ++ conntrack->timeout.expires = jiffies + ci->cpt_timeout; ++ ++ if (err == 0 && ci->cpt_next > ci->cpt_hdrlen) ++ err = undump_expect_list(conntrack, ci, pos, *ct_list, ctx); ++ ++ return err; ++} ++ ++int rst_restore_ip_conntrack(struct cpt_context * ctx) ++{ ++ int err = 0; ++ loff_t sec = ctx->sections[CPT_SECT_NET_CONNTRACK]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_ip_conntrack_image ci; ++ struct ct_holder *c; ++ struct ct_holder *ct_list = NULL; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ if (sizeof(ci.cpt_proto_data) != sizeof(union ip_conntrack_proto)) { ++ eprintk_ctx("conntrack module ct->proto version mismatch\n"); ++ 
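++		/* undump_one_ct() above restores ct->proto with a raw memcpy
++		 * of cpt_proto_data, so an image whose blob size differs from
++		 * union ip_conntrack_proto cannot be restored safely. */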
return -EINVAL; ++ } ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_NET_CONNTRACK || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ err = rst_get_object(CPT_OBJ_NET_CONNTRACK, sec, &ci, ctx); ++ if (err) ++ break; ++ err = undump_one_ct(&ci, sec, &ct_list, ctx); ++ if (err) ++ break; ++ sec += ci.cpt_next; ++ } ++ ++ while ((c = ct_list) != NULL) { ++ ct_list = c->next; ++ if (c->ct) ++ add_timer(&c->ct->timeout); ++ kfree(c); ++ } ++ ++ return err; ++} ++ ++#else ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++int rst_restore_ip_conntrack(struct cpt_context * ctx) ++{ ++ if (ctx->sections[CPT_SECT_NET_CONNTRACK] != CPT_NULL) ++ return -EINVAL; ++ return 0; ++} ++ ++#endif +diff --git a/kernel/cpt/rst_context.c b/kernel/cpt/rst_context.c +new file mode 100644 +index 0000000..47e4f35 +--- /dev/null ++++ b/kernel/cpt/rst_context.c +@@ -0,0 +1,323 @@ ++/* ++ * ++ * kernel/cpt/rst_context.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++static ssize_t file_read(void *addr, size_t count, struct cpt_context *ctx) ++{ ++ mm_segment_t oldfs; ++ ssize_t err = -EBADF; ++ struct file *file = ctx->file; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ if (file) ++ err = file->f_op->read(file, addr, count, &file->f_pos); ++ set_fs(oldfs); ++ if (err != count) ++ return err >= 0 ? -EIO : err; ++ return 0; ++} ++ ++static ssize_t file_pread(void *addr, size_t count, struct cpt_context *ctx, loff_t pos) ++{ ++ mm_segment_t oldfs; ++ ssize_t err = -EBADF; ++ struct file *file = ctx->file; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ if (file) ++ err = file->f_op->read(file, addr, count, &pos); ++ set_fs(oldfs); ++ if (err != count) ++ return err >= 0 ? 
-EIO : err; ++ return 0; ++} ++ ++static void file_align(struct cpt_context *ctx) ++{ ++ struct file *file = ctx->file; ++ ++ if (file) ++ file->f_pos = CPT_ALIGN(file->f_pos); ++} ++ ++int rst_get_section(int type, struct cpt_context *ctx, loff_t *start, loff_t *end) ++{ ++ struct cpt_section_hdr hdr; ++ int err; ++ loff_t pos; ++ ++ pos = ctx->sections[type]; ++ *start = *end = pos; ++ ++ if (pos != CPT_NULL) { ++ if ((err = ctx->pread(&hdr, sizeof(hdr), ctx, pos)) != 0) ++ return err; ++ if (hdr.cpt_section != type || hdr.cpt_hdrlen < sizeof(hdr)) ++ return -EINVAL; ++ *start = pos + hdr.cpt_hdrlen; ++ *end = pos + hdr.cpt_next; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(rst_get_section); ++ ++void rst_context_init(struct cpt_context *ctx) ++{ ++ int i; ++ ++ memset(ctx, 0, sizeof(*ctx)); ++ ++ init_MUTEX(&ctx->main_sem); ++ ctx->refcount = 1; ++ ++ ctx->current_section = -1; ++ ctx->current_object = -1; ++ ctx->pagesize = PAGE_SIZE; ++ ctx->read = file_read; ++ ctx->pread = file_pread; ++ ctx->align = file_align; ++ for (i=0; i < CPT_SECT_MAX; i++) ++ ctx->sections[i] = CPT_NULL; ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ init_completion(&ctx->pgin_notify); ++#endif ++ cpt_object_init(ctx); ++} ++ ++static int parse_sections(loff_t start, loff_t end, cpt_context_t *ctx) ++{ ++ struct cpt_section_hdr h; ++ ++ while (start < end) { ++ int err; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, start); ++ if (err) ++ return err; ++ if (h.cpt_hdrlen < sizeof(h) || ++ h.cpt_next < h.cpt_hdrlen || ++ start + h.cpt_next > end) ++ return -EINVAL; ++ if (h.cpt_section >= CPT_SECT_MAX) ++ return -EINVAL; ++ ctx->sections[h.cpt_section] = start; ++ start += h.cpt_next; ++ } ++ return 0; ++} ++ ++int rst_open_dumpfile(struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_major_tail *v; ++ struct cpt_major_hdr h; ++ unsigned long size; ++ ++ err = -EBADF; ++ if (!ctx->file) ++ goto err_out; ++ ++ err = -ENOMEM; ++ ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL); ++ if (ctx->tmpbuf == NULL) ++ goto err_out; ++ __cpt_release_buf(ctx); ++ ++ size = ctx->file->f_dentry->d_inode->i_size; ++ ++ if (size & 7) { ++ err = -EINVAL; ++ goto err_out; ++ } ++ if (size < sizeof(struct cpt_major_hdr) + ++ sizeof(struct cpt_major_tail)) { ++ err = -EINVAL; ++ goto err_out; ++ } ++ err = ctx->pread(&h, sizeof(h), ctx, 0); ++ if (err) { ++ eprintk_ctx("too short image 1 %d\n", err); ++ goto err_out; ++ } ++ if (h.cpt_signature[0] != CPT_SIGNATURE0 || ++ h.cpt_signature[1] != CPT_SIGNATURE1 || ++ h.cpt_signature[2] != CPT_SIGNATURE2 || ++ h.cpt_signature[3] != CPT_SIGNATURE3) { ++ err = -EINVAL; ++ goto err_out; ++ } ++ if (h.cpt_hz != HZ) { ++ err = -EINVAL; ++ eprintk_ctx("HZ mismatch: %d != %d\n", h.cpt_hz, HZ); ++ goto err_out; ++ } ++ ctx->virt_jiffies64 = h.cpt_start_jiffies64; ++ ctx->start_time.tv_sec = h.cpt_start_sec; ++ ctx->start_time.tv_nsec = h.cpt_start_nsec; ++ ctx->kernel_config_flags = h.cpt_kernel_config[0]; ++ ctx->iptables_mask = h.cpt_iptables_mask; ++ if (h.cpt_image_version > CPT_VERSION_20 || ++ CPT_VERSION_MINOR(h.cpt_image_version) > 1) { ++ eprintk_ctx("Unknown image version: %x. 
Can't restore.\n", ++ h.cpt_image_version); ++ err = -EINVAL; ++ goto err_out; ++ } ++ ctx->image_version = h.cpt_image_version; ++ ctx->features = (__u64)((__u64)h.cpt_ve_features2<<32 | h.cpt_ve_features); ++ ctx->image_arch = h.cpt_os_arch; ++ ++ v = cpt_get_buf(ctx); ++ err = ctx->pread(v, sizeof(*v), ctx, size - sizeof(*v)); ++ if (err) { ++ eprintk_ctx("too short image 2 %d\n", err); ++ cpt_release_buf(ctx); ++ goto err_out; ++ } ++ if (v->cpt_signature[0] != CPT_SIGNATURE0 || ++ v->cpt_signature[1] != CPT_SIGNATURE1 || ++ v->cpt_signature[2] != CPT_SIGNATURE2 || ++ v->cpt_signature[3] != CPT_SIGNATURE3 || ++ v->cpt_nsect != CPT_SECT_MAX_INDEX) { ++ err = -EINVAL; ++ cpt_release_buf(ctx); ++ goto err_out; ++ } ++ if ((err = parse_sections(h.cpt_hdrlen, size - sizeof(*v) - sizeof(struct cpt_section_hdr), ctx)) < 0) { ++ cpt_release_buf(ctx); ++ goto err_out; ++ } ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ ctx->lazypages = v->cpt_lazypages; ++#endif ++ ctx->tasks64 = v->cpt_64bit; ++ cpt_release_buf(ctx); ++ return 0; ++ ++err_out: ++ if (ctx->tmpbuf) { ++ free_page((unsigned long)ctx->tmpbuf); ++ ctx->tmpbuf = NULL; ++ } ++ return err; ++} ++ ++void rst_close_dumpfile(struct cpt_context *ctx) ++{ ++ if (ctx->file) { ++ fput(ctx->file); ++ ctx->file = NULL; ++ } ++ if (ctx->tmpbuf) { ++ free_page((unsigned long)ctx->tmpbuf); ++ ctx->tmpbuf = NULL; ++ } ++} ++ ++int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_object_hdr *hdr = tmp; ++ err = ctx->pread(hdr, sizeof(struct cpt_object_hdr), ctx, pos); ++ if (err) ++ return err; ++ if (type > 0 && type != hdr->cpt_object) ++ return -EINVAL; ++ if (hdr->cpt_hdrlen > hdr->cpt_next) ++ return -EINVAL; ++ if (hdr->cpt_hdrlen < sizeof(struct cpt_object_hdr)) ++ return -EINVAL; ++ if (size < sizeof(*hdr)) ++ return -EINVAL; ++ if (size > hdr->cpt_hdrlen) ++ size = hdr->cpt_hdrlen; ++ if (size > sizeof(*hdr)) ++ err = ctx->pread(hdr+1, size - sizeof(*hdr), ++ ctx, pos + sizeof(*hdr)); ++ return err; ++} ++EXPORT_SYMBOL(_rst_get_object); ++ ++void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx) ++{ ++ int err; ++ void *tmp; ++ struct cpt_object_hdr hdr; ++ err = ctx->pread(&hdr, sizeof(hdr), ctx, pos); ++ if (err) ++ return NULL; ++ if (type > 0 && type != hdr.cpt_object) ++ return NULL; ++ if (hdr.cpt_hdrlen > hdr.cpt_next) ++ return NULL; ++ if (hdr.cpt_hdrlen < sizeof(struct cpt_object_hdr)) ++ return NULL; ++ tmp = kmalloc(hdr.cpt_hdrlen, GFP_KERNEL); ++ if (!tmp) ++ return NULL; ++ err = ctx->pread(tmp, hdr.cpt_hdrlen, ctx, pos); ++ if (!err) ++ return tmp; ++ kfree(tmp); ++ return NULL; ++} ++EXPORT_SYMBOL(__rst_get_object); ++ ++__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_object_hdr hdr; ++ __u8 *name; ++ ++ err = rst_get_object(CPT_OBJ_NAME, *pos_p, &hdr, ctx); ++ if (err) ++ return NULL; ++ if (hdr.cpt_next - hdr.cpt_hdrlen > PAGE_SIZE) ++ return NULL; ++ name = (void*)__get_free_page(GFP_KERNEL); ++ if (!name) ++ return NULL; ++ err = ctx->pread(name, hdr.cpt_next - hdr.cpt_hdrlen, ++ ctx, *pos_p + hdr.cpt_hdrlen); ++ if (err) { ++ free_page((unsigned long)name); ++ return NULL; ++ } ++ *pos_p += hdr.cpt_next; ++ return name; ++} ++ ++__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx) ++{ ++ return __rst_get_name(&pos, ctx); ++} ++ ++void rst_put_name(__u8 *name, struct cpt_context *ctx) ++{ ++ unsigned long addr = (unsigned long)name; ++ ++ if (addr) ++ free_page(addr&~(PAGE_SIZE-1)); ++} +diff --git 
a/kernel/cpt/rst_epoll.c b/kernel/cpt/rst_epoll.c
+new file mode 100644
+index 0000000..0ac4cae
+--- /dev/null
++++ b/kernel/cpt/rst_epoll.c
+@@ -0,0 +1,169 @@
++/*
++ *
++ * kernel/cpt/rst_epoll.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++#include 
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_files.h"
++#include "cpt_kernel.h"
++#include "cpt_fsmagic.h"
++#include "cpt_syscalls.h"
++
++/* Those functions are static in fs/eventpoll.c */
++extern int ep_insert(struct eventpoll *ep, struct epoll_event *event,
++		     struct file *tfile, int fd);
++extern struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
++extern void ep_release_epitem(struct epitem *epi);
++
++
++struct file *cpt_open_epolldev(struct cpt_file_image *fi,
++			       unsigned flags,
++			       struct cpt_context *ctx)
++{
++	struct file *file;
++	int efd;
++
++	/* Argument "size" is ignored, use just 1 */
++	efd = sys_epoll_create(1);
++	if (efd < 0)
++		return ERR_PTR(efd);
++
++	file = fget(efd);
++	sys_close(efd);
++	return file;
++}
++
++static int restore_one_epoll(cpt_object_t *obj,
++			     loff_t pos,
++			     struct cpt_epoll_image *ebuf,
++			     cpt_context_t *ctx)
++{
++	int err = 0;
++	loff_t endpos;
++	struct file *file = obj->o_obj;
++	struct eventpoll *ep;
++
++	if (file->f_op != &eventpoll_fops) {
++		eprintk_ctx("bad epoll file\n");
++		return -EINVAL;
++	}
++
++	ep = file->private_data;
++
++	if (unlikely(ep == NULL)) {
++		eprintk_ctx("bad epoll device\n");
++		return -EINVAL;
++	}
++
++	endpos = pos + ebuf->cpt_next;
++	pos += ebuf->cpt_hdrlen;
++	while (pos < endpos) {
++		struct cpt_epoll_file_image efi;
++		struct epoll_event epds;
++
++		cpt_object_t *tobj;
++
++		err = rst_get_object(CPT_OBJ_EPOLL_FILE, pos, &efi, ctx);
++		if (err)
++			return err;
++		tobj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, efi.cpt_file, ctx);
++		if (!tobj) {
++			eprintk_ctx("epoll file not found\n");
++			return -EINVAL;
++		}
++		epds.events = efi.cpt_events;
++		epds.data = efi.cpt_data;
++		mutex_lock(&ep->mtx);
++		err = ep_insert(ep, &epds, tobj->o_obj, efi.cpt_fd);
++		if (!err) {
++			struct epitem *epi;
++			epi = ep_find(ep, tobj->o_obj, efi.cpt_fd);
++			if (epi) {
++				if (efi.cpt_ready) {
++					unsigned long flags;
++					spin_lock_irqsave(&ep->lock, flags);
++					if (list_empty(&epi->rdllink))
++						list_add_tail(&epi->rdllink, &ep->rdllist);
++					spin_unlock_irqrestore(&ep->lock, flags);
++				}
++			}
++		}
++		mutex_unlock(&ep->mtx);
++		if (err)
++			break;
++		pos += efi.cpt_next;
++	}
++	return err;
++}
++
++int rst_eventpoll(cpt_context_t *ctx)
++{
++	int err;
++	loff_t sec = ctx->sections[CPT_SECT_EPOLL];
++	loff_t endsec;
++	struct cpt_section_hdr h;
++
++	if (sec == CPT_NULL)
++		return 0;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_EPOLL || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
++	while (sec < endsec) {
++		cpt_object_t *obj;
++		struct cpt_epoll_image *ebuf = cpt_get_buf(ctx);
++		err = rst_get_object(CPT_OBJ_EPOLL, sec, ebuf, ctx);
++		if (err) {
++			cpt_release_buf(ctx);
++			return err;
++		}
++		obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ebuf->cpt_file, ctx);
++		if (obj == NULL) {
++			eprintk_ctx("cannot find 
epoll file object\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ err = restore_one_epoll(obj, sec, ebuf, ctx); ++ cpt_release_buf(ctx); ++ if (err) ++ return err; ++ sec += ebuf->cpt_next; ++ } ++ ++ return 0; ++ ++} +diff --git a/kernel/cpt/rst_files.c b/kernel/cpt/rst_files.c +new file mode 100644 +index 0000000..534ea3a +--- /dev/null ++++ b/kernel/cpt/rst_files.c +@@ -0,0 +1,1661 @@ ++/* ++ * ++ * kernel/cpt/rst_files.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_files.h" ++#include "cpt_kernel.h" ++#include "cpt_fsmagic.h" ++ ++#include "cpt_syscalls.h" ++ ++ ++struct filejob { ++ struct filejob *next; ++ int pid; ++ loff_t fdi; ++}; ++ ++static int rst_filejob_queue(loff_t pos, cpt_context_t *ctx) ++{ ++ struct filejob *j; ++ ++ j = kmalloc(sizeof(*j), GFP_KERNEL); ++ if (j == NULL) ++ return -ENOMEM; ++ j->pid = current->pid; ++ j->fdi = pos; ++ j->next = ctx->filejob_queue; ++ ctx->filejob_queue = j; ++ return 0; ++} ++ ++static void _anon_pipe_buf_release(struct pipe_inode_info *pipe, ++ struct pipe_buffer *buf) ++{ ++ struct page *page = buf->page; ++ ++ /* ++ * If nobody else uses this page, and we don't already have a ++ * temporary page, let's keep track of it as a one-deep ++ * allocation cache. (Otherwise just release our reference to it) ++ */ ++ if (page_count(page) == 1 && !pipe->tmp_page) ++ pipe->tmp_page = page; ++ else ++ page_cache_release(page); ++ ++ module_put(THIS_MODULE); ++} ++ ++static void *_anon_pipe_buf_map(struct pipe_inode_info *pipe, ++ struct pipe_buffer *buf, int atomic) ++{ ++ if (atomic) { ++ buf->flags |= PIPE_BUF_FLAG_ATOMIC; ++ return kmap_atomic(buf->page, KM_USER0); ++ } ++ ++ return kmap(buf->page); ++} ++ ++static void _anon_pipe_buf_unmap(struct pipe_inode_info *pipe, ++ struct pipe_buffer *buf, void *map_data) ++{ ++ if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { ++ buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; ++ kunmap_atomic(map_data, KM_USER0); ++ } else ++ kunmap(buf->page); ++} ++ ++static int _anon_pipe_buf_steal(struct pipe_inode_info *pipe, ++ struct pipe_buffer *buf) ++{ ++ struct page *page = buf->page; ++ ++ if (page_count(page) == 1) { ++ lock_page(page); ++ return 0; ++ } ++ ++ return 1; ++} ++ ++static void _anon_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) ++{ ++ page_cache_get(buf->page); ++} ++ ++static int _anon_pipe_buf_confirm(struct pipe_inode_info *info, struct pipe_buffer *buf) ++{ ++ return 0; ++} ++ ++static struct pipe_buf_operations _anon_pipe_buf_ops = { ++ .can_merge = 1, ++ .map = _anon_pipe_buf_map, ++ .unmap = _anon_pipe_buf_unmap, ++ .release = _anon_pipe_buf_release, ++ .confirm = _anon_pipe_buf_confirm, ++ .get = _anon_pipe_buf_get, ++ .steal = _anon_pipe_buf_steal, ++}; ++ ++/* Sorta ugly... Multiple readers/writers of named pipe rewrite buffer ++ * many times. We need to mark it in CPT_OBJ_INODE table in some way. 
++ */ ++static int fixup_pipe_data(struct file *file, struct cpt_file_image *fi, ++ struct cpt_context *ctx) ++{ ++ struct inode *ino = file->f_dentry->d_inode; ++ struct cpt_inode_image ii; ++ struct cpt_obj_bits b; ++ struct pipe_inode_info *info; ++ int err; ++ int count; ++ ++ if (!S_ISFIFO(ino->i_mode)) { ++ eprintk_ctx("fixup_pipe_data: not a pipe %Ld\n", (long long)fi->cpt_inode); ++ return -EINVAL; ++ } ++ if (fi->cpt_inode == CPT_NULL) ++ return 0; ++ ++ err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx); ++ if (err) ++ return err; ++ ++ if (ii.cpt_next <= ii.cpt_hdrlen) ++ return 0; ++ ++ err = rst_get_object(CPT_OBJ_BITS, fi->cpt_inode + ii.cpt_hdrlen, &b, ctx); ++ if (err) ++ return err; ++ ++ if (b.cpt_size == 0) ++ return 0; ++ ++ mutex_lock(&ino->i_mutex); ++ info = ino->i_pipe; ++ if (info->nrbufs) { ++ mutex_unlock(&ino->i_mutex); ++ eprintk("pipe buffer is restored already\n"); ++ return -EINVAL; ++ } ++ info->curbuf = 0; ++ count = 0; ++ while (count < b.cpt_size) { ++ struct pipe_buffer *buf = info->bufs + info->nrbufs; ++ void * addr; ++ int chars; ++ ++ chars = b.cpt_size - count; ++ if (chars > PAGE_SIZE) ++ chars = PAGE_SIZE; ++ if (!try_module_get(THIS_MODULE)) { ++ err = -EBUSY; ++ break; ++ } ++ ++ buf->page = alloc_page(GFP_HIGHUSER); ++ if (buf->page == NULL) { ++ err = -ENOMEM; ++ break; ++ } ++ buf->ops = &_anon_pipe_buf_ops; ++ buf->offset = 0; ++ buf->len = chars; ++ info->nrbufs++; ++ addr = kmap(buf->page); ++ err = ctx->pread(addr, chars, ctx, ++ fi->cpt_inode + ii.cpt_hdrlen + b.cpt_hdrlen + count); ++ if (err) ++ break; ++ count += chars; ++ } ++ mutex_unlock(&ino->i_mutex); ++ ++ return err; ++} ++ ++static int make_flags(struct cpt_file_image *fi) ++{ ++ int flags = O_NOFOLLOW; ++ switch (fi->cpt_mode&(FMODE_READ|FMODE_WRITE)) { ++ case FMODE_READ|FMODE_WRITE: ++ flags |= O_RDWR; break; ++ case FMODE_WRITE: ++ flags |= O_WRONLY; break; ++ case FMODE_READ: ++ flags |= O_RDONLY; break; ++ default: break; ++ } ++ flags |= fi->cpt_flags&~(O_ACCMODE|O_CREAT|O_TRUNC|O_EXCL|FASYNC); ++ flags |= O_NONBLOCK|O_NOCTTY; ++ return flags; ++} ++ ++static struct file *open_pipe(char *name, ++ struct cpt_file_image *fi, ++ unsigned flags, ++ struct cpt_context *ctx) ++{ ++ int err; ++ cpt_object_t *obj; ++ struct cpt_inode_image ii; ++ struct file *rf, *wf; ++ ++ err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx); ++ if (err) ++ return ERR_PTR(err); ++ ++ if (ii.cpt_sb == FSMAGIC_PIPEFS) { ++ int pfd[2]; ++ ++ if ((err = sc_pipe(pfd)) < 0) ++ return ERR_PTR(err); ++ ++ rf = fcheck(pfd[0]); ++ wf = fcheck(pfd[1]); ++ get_file(rf); ++ get_file(wf); ++ sc_close(pfd[0]); ++ sc_close(pfd[1]); ++ ++ if (fi->cpt_mode&FMODE_READ) { ++ struct file *tf; ++ tf = wf; wf = rf; rf = tf; ++ } ++ } else { ++ if (fi->cpt_mode&FMODE_READ) { ++ rf = filp_open(name, flags, 0); ++ if (IS_ERR(rf)) { ++ dprintk_ctx("filp_open\n"); ++ return rf; ++ } ++ dprintk_ctx(CPT_FID "open RDONLY fifo ino %Ld %p %x\n", CPT_TID(current), ++ (long long)fi->cpt_inode, rf, rf->f_dentry->d_inode->i_mode); ++ return rf; ++ } ++ ++ dprintk_ctx(CPT_FID "open WRONLY fifo ino %Ld\n", CPT_TID(current), (long long)fi->cpt_inode); ++ ++ rf = filp_open(name, O_RDWR|O_NONBLOCK, 0); ++ if (IS_ERR(rf)) ++ return rf; ++ wf = dentry_open(dget(rf->f_dentry), ++ mntget(rf->f_vfsmnt), flags); ++ } ++ ++ /* Add pipe inode to obj table. 
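++	 *
++	 * The inode object is keyed by its position in the image
++	 * (fi->cpt_inode), and o_parent pins the read side so that later
++	 * rst_file() calls can reopen this pipe via dentry_open().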
++	 */
++	obj = cpt_object_add(CPT_OBJ_INODE, wf->f_dentry->d_inode, ctx);
++	if (obj == NULL) {
++		fput(rf); fput(wf);
++		return ERR_PTR(-ENOMEM);
++	}
++	cpt_obj_setpos(obj, fi->cpt_inode, ctx);
++	obj->o_parent = rf;
++
++	/* Add another side of pipe to obj table, it will not be used
++	 * (o_pos = PT_NULL), other processes opening the pipe will find
++	 * the inode and open it with dentry_open(). */
++	obj = cpt_object_add(CPT_OBJ_FILE, rf, ctx);
++	if (obj == NULL) {
++		fput(wf);
++		return ERR_PTR(-ENOMEM);
++	}
++	return wf;
++}
++
++static struct file *open_special(struct cpt_file_image *fi,
++				 unsigned flags,
++				 int deleted,
++				 struct cpt_context *ctx)
++{
++	struct cpt_inode_image *ii;
++	struct file *file;
++
++	/* Directories and named pipes are not special actually */
++	if (S_ISDIR(fi->cpt_i_mode) || S_ISFIFO(fi->cpt_i_mode))
++		return NULL;
++
++	/* No support for block devices at the moment. */
++	if (S_ISBLK(fi->cpt_i_mode))
++		return ERR_PTR(-EINVAL);
++
++	if (S_ISSOCK(fi->cpt_i_mode)) {
++		eprintk_ctx("bug: socket is not open\n");
++		return ERR_PTR(-EINVAL);
++	}
++
++	/* Support only (some) character devices at the moment. */
++	if (!S_ISCHR(fi->cpt_i_mode))
++		return ERR_PTR(-EINVAL);
++
++	ii = __rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, ctx);
++	if (ii == NULL)
++		return ERR_PTR(-ENOMEM);
++
++	/* Do not worry about this right now. /dev/null,zero,*random are here.
++	 * To prohibit at least /dev/mem?
++	 */
++	if (MAJOR(ii->cpt_rdev) == MEM_MAJOR) {
++		kfree(ii);
++		return NULL;
++	}
++
++	/* /dev/net/tun will be opened by caller */
++	if (fi->cpt_lflags & CPT_DENTRY_TUNTAP) {
++		kfree(ii);
++		return NULL;
++	}
++
++	file = rst_open_tty(fi, ii, flags, ctx);
++	kfree(ii);
++	return file;
++}
++
++static int restore_posix_lock(struct file *file, struct cpt_flock_image *fli, cpt_context_t *ctx)
++{
++	struct file_lock lock;
++	cpt_object_t *obj;
++
++	memset(&lock, 0, sizeof(lock));
++	lock.fl_type = fli->cpt_type;
++	lock.fl_flags = fli->cpt_flags & ~FL_SLEEP;
++	lock.fl_start = fli->cpt_start;
++	lock.fl_end = fli->cpt_end;
++	obj = lookup_cpt_obj_byindex(CPT_OBJ_FILES, fli->cpt_owner, ctx);
++	if (!obj) {
++		eprintk_ctx("unknown lock owner %d\n", (int)fli->cpt_owner);
++		return -EINVAL;
++	}
++	lock.fl_owner = obj->o_obj;
++	lock.fl_pid = vpid_to_pid(fli->cpt_pid);
++	if (lock.fl_pid < 0) {
++		eprintk_ctx("unknown lock pid %d\n", lock.fl_pid);
++		return -EINVAL;
++	}
++	lock.fl_file = file;
++
++	if (lock.fl_owner == NULL)
++		eprintk_ctx("no lock owner\n");
++	return posix_lock_file(file, &lock, NULL);
++}
++
++static int restore_flock(struct file *file, struct cpt_flock_image *fli,
++			 cpt_context_t *ctx)
++{
++	int cmd, err, fd;
++	fd = get_unused_fd();
++	if (fd < 0) {
++		eprintk_ctx("BSD flock cannot be restored\n");
++		return fd;
++	}
++	get_file(file);
++	fd_install(fd, file);
++	if (fli->cpt_type == F_RDLCK) {
++		cmd = LOCK_SH;
++	} else if (fli->cpt_type == F_WRLCK) {
++		cmd = LOCK_EX;
++	} else {
++		eprintk_ctx("flock flavor is unknown: %u\n", fli->cpt_type);
++		sc_close(fd);
++		return -EINVAL;
++	}
++
++	err = sc_flock(fd, LOCK_NB | cmd);
++	sc_close(fd);
++	return err;
++}
++
++
++static int fixup_posix_locks(struct file *file,
++			     struct cpt_file_image *fi,
++			     loff_t pos, struct cpt_context *ctx)
++{
++	int err;
++	loff_t end;
++	struct cpt_flock_image fli;
++
++	end = pos + fi->cpt_next;
++	pos += fi->cpt_hdrlen;
++	while (pos < end) {
++		err = rst_get_object(-1, pos, &fli, ctx);
++		if (err)
++			return err;
++		if (fli.cpt_object == CPT_OBJ_FLOCK &&
++
(fli.cpt_flags&FL_POSIX)) { ++ err = restore_posix_lock(file, &fli, ctx); ++ if (err) ++ return err; ++ dprintk_ctx("posix lock restored\n"); ++ } ++ pos += fli.cpt_next; ++ } ++ return 0; ++} ++ ++int rst_posix_locks(struct cpt_context *ctx) ++{ ++ int err; ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file = obj->o_obj; ++ struct cpt_file_image fi; ++ ++ if (obj->o_pos == CPT_NULL) ++ continue; ++ ++ err = rst_get_object(CPT_OBJ_FILE, obj->o_pos, &fi, ctx); ++ if (err < 0) ++ return err; ++ if (fi.cpt_next > fi.cpt_hdrlen) ++ fixup_posix_locks(file, &fi, obj->o_pos, ctx); ++ } ++ return 0; ++} ++ ++static int fixup_flocks(struct file *file, ++ struct cpt_file_image *fi, ++ loff_t pos, struct cpt_context *ctx) ++{ ++ int err; ++ loff_t end; ++ struct cpt_flock_image fli; ++ ++ end = pos + fi->cpt_next; ++ pos += fi->cpt_hdrlen; ++ while (pos < end) { ++ err = rst_get_object(-1, pos, &fli, ctx); ++ if (err) ++ return err; ++ if (fli.cpt_object == CPT_OBJ_FLOCK && ++ (fli.cpt_flags&FL_FLOCK)) { ++ err = restore_flock(file, &fli, ctx); ++ if (err) ++ return err; ++ dprintk_ctx("bsd lock restored\n"); ++ } ++ pos += fli.cpt_next; ++ } ++ return 0; ++} ++ ++ ++static int fixup_reg_data(struct file *file, loff_t pos, loff_t end, ++ struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_page_block pgb; ++ ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos); ++ ++ do_write = file->f_op->write; ++ if (do_write == NULL) { ++ eprintk_ctx("no write method. Cannot restore contents of the file.\n"); ++ return -EINVAL; ++ } ++ ++ atomic_inc(&file->f_count); ++ ++ while (pos < end) { ++ loff_t opos; ++ loff_t ipos; ++ int count; ++ ++ err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx); ++ if (err) ++ goto out; ++ dprintk_ctx("restoring file data block: %08x-%08x\n", ++ (__u32)pgb.cpt_start, (__u32)pgb.cpt_end); ++ ipos = pos + pgb.cpt_hdrlen; ++ opos = pgb.cpt_start; ++ count = pgb.cpt_end-pgb.cpt_start; ++ while (count > 0) { ++ mm_segment_t oldfs; ++ int copy = count; ++ ++ if (copy > PAGE_SIZE) ++ copy = PAGE_SIZE; ++ (void)cpt_get_buf(ctx); ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos); ++ set_fs(oldfs); ++ if (err) { ++ __cpt_release_buf(ctx); ++ goto out; ++ } ++ if (!(file->f_mode & FMODE_WRITE) || ++ (file->f_flags&O_DIRECT)) { ++ fput(file); ++ file = dentry_open(dget(file->f_dentry), ++ mntget(file->f_vfsmnt), O_WRONLY); ++ if (IS_ERR(file)) { ++ __cpt_release_buf(ctx); ++ return PTR_ERR(file); ++ } ++ } ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ ipos += copy; ++ err = do_write(file, ctx->tmpbuf, copy, &opos); ++ set_fs(oldfs); ++ __cpt_release_buf(ctx); ++ if (err != copy) { ++ if (err >= 0) ++ err = -EIO; ++ goto out; ++ } ++ count -= copy; ++ } ++ pos += pgb.cpt_next; ++ } ++ err = 0; ++ ++out: ++ fput(file); ++ return err; ++} ++ ++ ++static int fixup_file_content(struct file **file_p, struct cpt_file_image *fi, ++ struct cpt_inode_image *ii, ++ struct cpt_context *ctx) ++{ ++ int err; ++ struct file *file = *file_p; ++ struct iattr newattrs; ++ ++ if (!S_ISREG(fi->cpt_i_mode)) ++ return 0; ++ ++ if (file == NULL) { ++ file = shmem_file_setup("dev/zero", ii->cpt_size, 0); ++ if (IS_ERR(file)) ++ return PTR_ERR(file); ++ *file_p = file; ++ } ++ ++ if (ii->cpt_next > ii->cpt_hdrlen) { ++ struct cpt_object_hdr hdr; ++ err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr), ctx, fi->cpt_inode+ii->cpt_hdrlen); ++ if (err) ++ return err; ++ if (hdr.cpt_object == CPT_OBJ_PAGES) { ++ err = 
fixup_reg_data(file, fi->cpt_inode+ii->cpt_hdrlen, ++ fi->cpt_inode+ii->cpt_next, ctx); ++ if (err) ++ return err; ++ } ++ } ++ ++ mutex_lock(&file->f_dentry->d_inode->i_mutex); ++ /* stage 1 - update size like do_truncate does */ ++ newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; ++ newattrs.ia_size = ii->cpt_size; ++ cpt_timespec_import(&newattrs.ia_ctime, ii->cpt_ctime); ++ err = notify_change(file->f_dentry, &newattrs); ++ if (err) ++ goto out; ++ ++ /* stage 2 - update times, owner and mode */ ++ newattrs.ia_valid = ATTR_MTIME | ATTR_ATIME | ++ ATTR_ATIME_SET | ATTR_MTIME_SET | ++ ATTR_MODE | ATTR_UID | ATTR_GID; ++ newattrs.ia_uid = ii->cpt_uid; ++ newattrs.ia_gid = ii->cpt_gid; ++ newattrs.ia_mode = file->f_dentry->d_inode->i_mode & S_IFMT; ++ newattrs.ia_mode |= (ii->cpt_mode & ~S_IFMT); ++ cpt_timespec_import(&newattrs.ia_atime, ii->cpt_atime); ++ cpt_timespec_import(&newattrs.ia_mtime, ii->cpt_mtime); ++ err = notify_change(file->f_dentry, &newattrs); ++ ++out: ++ mutex_unlock(&file->f_dentry->d_inode->i_mutex); ++ return err; ++} ++ ++static int fixup_file_flags(struct file *file, struct cpt_file_image *fi, ++ int was_dentry_open, loff_t pos, ++ cpt_context_t *ctx) ++{ ++ if (fi->cpt_pos != file->f_pos) { ++ int err = -ESPIPE; ++ if (file->f_op->llseek) ++ err = file->f_op->llseek(file, fi->cpt_pos, 0); ++ if (err < 0) { ++ dprintk_ctx("file %Ld lseek %Ld - %Ld\n", ++ (long long)pos, ++ (long long)file->f_pos, ++ (long long)fi->cpt_pos); ++ file->f_pos = fi->cpt_pos; ++ } ++ } ++ file->f_uid = fi->cpt_uid; ++ file->f_gid = fi->cpt_gid; ++ file->f_owner.pid = 0; ++ if (fi->cpt_fown_pid != CPT_FOWN_STRAY_PID) { ++ file->f_owner.pid = find_get_pid(fi->cpt_fown_pid); ++ if (file->f_owner.pid == NULL) { ++ wprintk_ctx("fixup_file_flags: owner %d does not exist anymore\n", ++ fi->cpt_fown_pid); ++ return -EINVAL; ++ } ++ } ++ file->f_owner.uid = fi->cpt_fown_uid; ++ file->f_owner.euid = fi->cpt_fown_euid; ++ file->f_owner.signum = fi->cpt_fown_signo; ++ ++ if (file->f_mode != fi->cpt_mode) { ++ if (was_dentry_open && ++ ((file->f_mode^fi->cpt_mode)&(FMODE_PREAD|FMODE_LSEEK))) { ++ file->f_mode &= ~(FMODE_PREAD|FMODE_LSEEK); ++ file->f_mode |= fi->cpt_mode&(FMODE_PREAD|FMODE_LSEEK); ++ } ++ if (file->f_mode != fi->cpt_mode) ++ wprintk_ctx("file %ld mode mismatch %08x %08x\n", (long)pos, file->f_mode, fi->cpt_mode); ++ } ++ if (file->f_flags != fi->cpt_flags) { ++ if (!(fi->cpt_flags&O_NOFOLLOW)) ++ file->f_flags &= ~O_NOFOLLOW; ++ if ((file->f_flags^fi->cpt_flags)&O_NONBLOCK) { ++ file->f_flags &= ~O_NONBLOCK; ++ file->f_flags |= fi->cpt_flags&O_NONBLOCK; ++ } ++ if (fi->cpt_flags&FASYNC) { ++ if (fi->cpt_fown_fd == -1) { ++ wprintk_ctx("No fd for FASYNC\n"); ++ return -EINVAL; ++ } else if (file->f_op && file->f_op->fasync) { ++ if (file->f_op->fasync(fi->cpt_fown_fd, file, 1) < 0) { ++ wprintk_ctx("FASYNC problem\n"); ++ return -EINVAL; ++ } else { ++ file->f_flags |= FASYNC; ++ } ++ } ++ } ++ if (file->f_flags != fi->cpt_flags) { ++ eprintk_ctx("file %ld flags mismatch %08x %08x\n", (long)pos, file->f_flags, fi->cpt_flags); ++ return -EINVAL; ++ } ++ } ++ return 0; ++} ++ ++static struct file * ++open_deleted(char *name, unsigned flags, struct cpt_file_image *fi, ++ struct cpt_inode_image *ii, cpt_context_t *ctx) ++{ ++ struct file * file; ++ char *suffix = NULL; ++ int attempt = 0; ++ int tmp_pass = 0; ++ mode_t mode = fi->cpt_i_mode; ++ ++ /* Strip (deleted) part... 
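++	 * Two forms occur: a trailing " (deleted)" suffix is cut off in
++	 * place, a leading "(deleted) " prefix is shifted away. In both
++	 * cases "suffix" ends up pointing at the spot where the retry
++	 * loop below appends a ".%08x" uniquifier.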
*/ ++ if (strlen(name) > strlen(" (deleted)")) { ++ if (strcmp(name + strlen(name) - strlen(" (deleted)"), " (deleted)") == 0) { ++ suffix = &name[strlen(name) - strlen(" (deleted)")]; ++ *suffix = 0; ++ } else if (memcmp(name, "(deleted) ", strlen("(deleted) ")) == 0) { ++ memmove(name, name + strlen("(deleted) "), strlen(name) - strlen(" (deleted)") + 1); ++ suffix = name + strlen(name); ++ } ++ } ++ ++try_again: ++ for (;;) { ++ if (attempt) { ++ if (attempt > 1000) { ++ eprintk_ctx("open_deleted: failed after %d attempts\n", attempt); ++ return ERR_PTR(-EEXIST); ++ } ++ if (suffix == NULL) { ++ eprintk_ctx("open_deleted: no suffix\n"); ++ return ERR_PTR(-EEXIST); ++ } ++ sprintf(suffix, ".%08x", (unsigned)((xtime.tv_nsec>>10)+attempt)); ++ } ++ attempt++; ++ ++ if (S_ISFIFO(mode)) { ++ int err; ++ err = sc_mknod(name, S_IFIFO|(mode&017777), 0); ++ if (err == -EEXIST) ++ continue; ++ if (err < 0 && !tmp_pass) ++ goto change_dir; ++ if (err < 0) ++ return ERR_PTR(err); ++ file = open_pipe(name, fi, flags, ctx); ++ sc_unlink(name); ++ } else if (S_ISCHR(mode)) { ++ int err; ++ err = sc_mknod(name, S_IFCHR|(mode&017777), new_encode_dev(ii->cpt_rdev)); ++ if (err == -EEXIST) ++ continue; ++ if (err < 0 && !tmp_pass) ++ goto change_dir; ++ if (err < 0) ++ return ERR_PTR(err); ++ file = filp_open(name, flags, mode&017777); ++ sc_unlink(name); ++ } else if (S_ISDIR(mode)) { ++ int err; ++ err = sc_mkdir(name, mode&017777); ++ if (err == -EEXIST) ++ continue; ++ if (err < 0 && !tmp_pass) ++ goto change_dir; ++ if (err < 0) ++ return ERR_PTR(err); ++ file = filp_open(name, flags, mode&017777); ++ sc_rmdir(name); ++ } else { ++ file = filp_open(name, O_CREAT|O_EXCL|flags, mode&017777); ++ if (IS_ERR(file)) { ++ if (PTR_ERR(file) == -EEXIST) ++ continue; ++ if (!tmp_pass) ++ goto change_dir; ++ } else { ++ sc_unlink(name); ++ } ++ } ++ break; ++ } ++ ++ if (IS_ERR(file)) { ++ eprintk_ctx("filp_open %s: %ld\n", name, PTR_ERR(file)); ++ return file; ++ } else { ++ dprintk_ctx("deleted file created as %s, %p, %x\n", name, file, file->f_dentry->d_inode->i_mode); ++ } ++ return file; ++ ++change_dir: ++ sprintf(name, "/tmp/rst%u", current->pid); ++ suffix = name + strlen(name); ++ attempt = 1; ++ tmp_pass = 1; ++ goto try_again; ++} ++ ++struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx) ++{ ++ int err; ++ int was_dentry_open = 0; ++ cpt_object_t *obj; ++ cpt_object_t *iobj; ++ struct cpt_file_image fi; ++ __u8 *name = NULL; ++ struct file *file; ++ int flags; ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, pos, ctx); ++ if (obj) { ++ file = obj->o_obj; ++ if (obj->o_index >= 0) { ++ dprintk_ctx("file is attached to a socket\n"); ++ err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx); ++ if (err < 0) ++ goto err_out; ++ fixup_file_flags(file, &fi, 0, pos, ctx); ++ } ++ get_file(file); ++ return file; ++ } ++ ++ err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx); ++ if (err < 0) ++ goto err_out; ++ ++ flags = make_flags(&fi); ++ ++ /* Easy way, inode has been already open. 
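++	 * The cached file's dentry and vfsmount are taken with
++	 * dget()/mntget() and handed to dentry_open(), so no path
++	 * lookup is needed.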
*/ ++ if (fi.cpt_inode != CPT_NULL && ++ !(fi.cpt_lflags & CPT_DENTRY_CLONING) && ++ (iobj = lookup_cpt_obj_bypos(CPT_OBJ_INODE, fi.cpt_inode, ctx)) != NULL && ++ iobj->o_parent) { ++ struct file *filp = iobj->o_parent; ++ file = dentry_open(dget(filp->f_dentry), ++ mntget(filp->f_vfsmnt), flags); ++ dprintk_ctx("rst_file: file obtained by dentry_open\n"); ++ was_dentry_open = 1; ++ goto map_file; ++ } ++ ++ if (fi.cpt_next > fi.cpt_hdrlen) ++ name = rst_get_name(pos + sizeof(fi), ctx); ++ ++ if (!name) { ++ eprintk_ctx("no name for file?\n"); ++ err = -EINVAL; ++ goto err_out; ++ } ++ ++ if (fi.cpt_lflags & CPT_DENTRY_DELETED) { ++ struct cpt_inode_image ii; ++ if (fi.cpt_inode == CPT_NULL) { ++ eprintk_ctx("deleted file and no inode.\n"); ++ err = -EINVAL; ++ goto err_out; ++ } ++ ++ err = rst_get_object(CPT_OBJ_INODE, fi.cpt_inode, &ii, ctx); ++ if (err) ++ goto err_out; ++ ++ if (ii.cpt_next > ii.cpt_hdrlen) { ++ struct cpt_object_hdr hdr; ++ err = ctx->pread(&hdr, sizeof(hdr), ctx, ++ fi.cpt_inode + ii.cpt_hdrlen); ++ if (err) ++ goto err_out; ++ if (hdr.cpt_object == CPT_OBJ_NAME) { ++ rst_put_name(name, ctx); ++ name = rst_get_name(fi.cpt_inode+ii.cpt_hdrlen, ++ ctx); ++ if (!name) { ++ eprintk_ctx("no name for link?\n"); ++ err = -EINVAL; ++ goto err_out; ++ } ++ goto open_file; ++ } ++ } ++ ++ /* One very special case... */ ++ if (S_ISREG(fi.cpt_i_mode) && ++ (!name[0] || strcmp(name, "/dev/zero (deleted)") == 0)) { ++ /* MAP_ANON|MAP_SHARED mapping. ++ * kernel makes this damn ugly way, when file which ++ * is passed to mmap by user does not match ++ * file finally attached to VMA. Ok, rst_mm ++ * has to take care of this. Otherwise, it will fail. ++ */ ++ file = NULL; ++ } else if (S_ISREG(fi.cpt_i_mode) || ++ S_ISCHR(fi.cpt_i_mode) || ++ S_ISFIFO(fi.cpt_i_mode) || ++ S_ISDIR(fi.cpt_i_mode)) { ++ if (S_ISCHR(fi.cpt_i_mode)) { ++ file = open_special(&fi, flags, 1, ctx); ++ if (file != NULL) ++ goto map_file; ++ } ++ file = open_deleted(name, flags, &fi, &ii, ctx); ++ if (IS_ERR(file)) ++ goto out; ++ } else { ++ eprintk_ctx("not a regular deleted file.\n"); ++ err = -EINVAL; ++ goto err_out; ++ } ++ ++ err = fixup_file_content(&file, &fi, &ii, ctx); ++ if (err) ++ goto err_put; ++ goto map_file; ++ } else { ++open_file: ++ if (!name[0]) { ++ eprintk_ctx("empty name for file?\n"); ++ err = -EINVAL; ++ goto err_out; ++ } ++ if ((fi.cpt_lflags & CPT_DENTRY_EPOLL) && ++ (file = cpt_open_epolldev(&fi, flags, ctx)) != NULL) ++ goto map_file; ++#ifdef CONFIG_INOTIFY_USER ++ if ((fi.cpt_lflags & CPT_DENTRY_INOTIFY) && ++ (file = rst_open_inotify(&fi, flags, ctx)) != NULL) ++ goto map_file; ++#else ++ if (fi.cpt_lflags & CPT_DENTRY_INOTIFY) { ++ err = -EINVAL; ++ goto err_out; ++ } ++#endif ++ if (S_ISFIFO(fi.cpt_i_mode) && ++ (file = open_pipe(name, &fi, flags, ctx)) != NULL) ++ goto map_file; ++ if (!S_ISREG(fi.cpt_i_mode) && ++ (file = open_special(&fi, flags, 0, ctx)) != NULL) ++ goto map_file; ++ } ++ ++ file = filp_open(name, flags, 0); ++ ++map_file: ++ if (!IS_ERR(file)) { ++ fixup_file_flags(file, &fi, was_dentry_open, pos, ctx); ++ ++ if (S_ISFIFO(fi.cpt_i_mode) && !was_dentry_open) { ++ err = fixup_pipe_data(file, &fi, ctx); ++ if (err) ++ goto err_put; ++ } ++ ++ /* This is very special hack. Logically, cwd/root are ++ * nothing but open directories. Nevertheless, this causes ++ * failures of restores, when number of open files in VE ++ * is close to limit. 
So, if it is rst_file() of cwd/root ++ * (fd = -2) and the directory is not deleted, we skip ++ * adding files to object table. If the directory is ++ * not unlinked, this cannot cause any problems. ++ */ ++ if (fd != -2 || ++ !S_ISDIR(file->f_dentry->d_inode->i_mode) || ++ (fi.cpt_lflags & CPT_DENTRY_DELETED)) { ++ obj = cpt_object_get(CPT_OBJ_FILE, file, ctx); ++ if (!obj) { ++ obj = cpt_object_add(CPT_OBJ_FILE, file, ctx); ++ if (obj) ++ get_file(file); ++ } ++ if (obj) ++ cpt_obj_setpos(obj, pos, ctx); ++ ++ obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); ++ if (obj) { ++ cpt_obj_setpos(obj, fi.cpt_inode, ctx); ++ if (!obj->o_parent || !(fi.cpt_lflags & CPT_DENTRY_DELETED)) ++ obj->o_parent = file; ++ } ++ } ++ ++ if (fi.cpt_next > fi.cpt_hdrlen) { ++ err = fixup_flocks(file, &fi, pos, ctx); ++ if (err) ++ goto err_put; ++ } ++ } else { ++ if (fi.cpt_lflags & CPT_DENTRY_PROC) { ++ dprintk_ctx("rst_file /proc delayed\n"); ++ file = NULL; ++ } else if (name) ++ eprintk_ctx("can't open file %s\n", name); ++ } ++ ++out: ++ if (name) ++ rst_put_name(name, ctx); ++ return file; ++ ++err_put: ++ if (file) ++ fput(file); ++err_out: ++ if (name) ++ rst_put_name(name, ctx); ++ return ERR_PTR(err); ++} ++ ++ ++__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ __u32 flag = 0; ++ ++ if (ti->cpt_files == CPT_NULL || ++ lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx)) ++ flag |= CLONE_FILES; ++ if (ti->cpt_fs == CPT_NULL || ++ lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx)) ++ flag |= CLONE_FS; ++ return flag; ++} ++ ++static void local_close_files(struct files_struct * files) ++{ ++ int i, j; ++ ++ j = 0; ++ for (;;) { ++ unsigned long set; ++ i = j * __NFDBITS; ++ if (i >= files->fdt->max_fds) ++ break; ++ set = files->fdt->open_fds->fds_bits[j]; ++ while (set) { ++ if (set & 1) { ++ struct file * file = xchg(&files->fdt->fd[i], NULL); ++ if (file) ++ filp_close(file, files); ++ } ++ i++; ++ set >>= 1; ++ } ++ files->fdt->open_fds->fds_bits[j] = 0; ++ files->fdt->close_on_exec->fds_bits[j] = 0; ++ j++; ++ } ++} ++ ++extern int expand_fdtable(struct files_struct *files, int nr); ++ ++ ++int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ struct cpt_files_struct_image fi; ++ struct files_struct *f = current->files; ++ cpt_object_t *obj; ++ loff_t pos, endpos; ++ int err; ++ ++ if (ti->cpt_files == CPT_NULL) { ++ current->files = NULL; ++ if (f) ++ put_files_struct(f); ++ return 0; ++ } ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx); ++ if (obj) { ++ if (obj->o_obj != f) { ++ put_files_struct(f); ++ f = obj->o_obj; ++ atomic_inc(&f->count); ++ current->files = f; ++ } ++ return 0; ++ } ++ ++ err = rst_get_object(CPT_OBJ_FILES, ti->cpt_files, &fi, ctx); ++ if (err) ++ return err; ++ ++ local_close_files(f); ++ ++ if (fi.cpt_max_fds > f->fdt->max_fds) { ++ spin_lock(&f->file_lock); ++ err = expand_fdtable(f, fi.cpt_max_fds-1); ++ spin_unlock(&f->file_lock); ++ if (err < 0) ++ return err; ++ } ++ ++ pos = ti->cpt_files + fi.cpt_hdrlen; ++ endpos = ti->cpt_files + fi.cpt_next; ++ while (pos < endpos) { ++ struct cpt_fd_image fdi; ++ struct file *filp; ++ ++ err = rst_get_object(CPT_OBJ_FILEDESC, pos, &fdi, ctx); ++ if (err) ++ return err; ++ filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx); ++ if (IS_ERR(filp)) { ++ eprintk_ctx("rst_file: %ld %Lu\n", PTR_ERR(filp), ++ (long long)fdi.cpt_file); ++ return PTR_ERR(filp); ++ } ++ if (filp == NULL) { ++ int err = rst_filejob_queue(pos, ctx); ++ if (err) 
++ return err; ++ } else { ++ if (fdi.cpt_fd >= f->fdt->max_fds) BUG(); ++ f->fdt->fd[fdi.cpt_fd] = filp; ++ FD_SET(fdi.cpt_fd, f->fdt->open_fds); ++ if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC) ++ FD_SET(fdi.cpt_fd, f->fdt->close_on_exec); ++ } ++ pos += fdi.cpt_next; ++ } ++ f->next_fd = fi.cpt_next_fd; ++ ++ obj = cpt_object_add(CPT_OBJ_FILES, f, ctx); ++ if (obj) { ++ cpt_obj_setpos(obj, ti->cpt_files, ctx); ++ cpt_obj_setindex(obj, fi.cpt_index, ctx); ++ } ++ return 0; ++} ++ ++int rst_do_filejobs(cpt_context_t *ctx) ++{ ++ struct filejob *j; ++ ++ while ((j = ctx->filejob_queue) != NULL) { ++ int err; ++ struct task_struct *tsk; ++ struct cpt_fd_image fdi; ++ struct file *filp; ++ ++ read_lock(&tasklist_lock); ++ tsk = find_task_by_vpid(j->pid); ++ if (tsk) ++ get_task_struct(tsk); ++ read_unlock(&tasklist_lock); ++ if (!tsk) ++ return -EINVAL; ++ ++ err = rst_get_object(CPT_OBJ_FILEDESC, j->fdi, &fdi, ctx); ++ if (err) { ++ put_task_struct(tsk); ++ return err; ++ } ++ ++ if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG(); ++ if (tsk->files->fdt->fd[fdi.cpt_fd] || ++ FD_ISSET(fdi.cpt_fd, tsk->files->fdt->open_fds)) { ++ eprintk_ctx("doing filejob %Ld: fd is busy\n", j->fdi); ++ put_task_struct(tsk); ++ return -EBUSY; ++ } ++ ++ filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx); ++ if (IS_ERR(filp)) { ++ eprintk_ctx("rst_do_filejobs: 1: %ld %Lu\n", PTR_ERR(filp), (unsigned long long)fdi.cpt_file); ++ put_task_struct(tsk); ++ return PTR_ERR(filp); ++ } ++ if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG(); ++ tsk->files->fdt->fd[fdi.cpt_fd] = filp; ++ FD_SET(fdi.cpt_fd, tsk->files->fdt->open_fds); ++ if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC) ++ FD_SET(fdi.cpt_fd, tsk->files->fdt->close_on_exec); ++ ++ dprintk_ctx("filejob %Ld done\n", j->fdi); ++ ++ put_task_struct(tsk); ++ ctx->filejob_queue = j->next; ++ kfree(j); ++ } ++ return 0; ++} ++ ++void rst_flush_filejobs(cpt_context_t *ctx) ++{ ++ struct filejob *j; ++ ++ while ((j = ctx->filejob_queue) != NULL) { ++ ctx->filejob_queue = j->next; ++ kfree(j); ++ } ++} ++ ++int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ struct fs_struct *f = current->fs; ++ cpt_object_t *obj; ++ ++ if (ti->cpt_fs == CPT_NULL) { ++ exit_fs(current); ++ return 0; ++ } ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx); ++ if (obj) { ++ if (obj->o_obj != f) { ++ exit_fs(current); ++ f = obj->o_obj; ++ atomic_inc(&f->count); ++ current->fs = f; ++ } ++ return 0; ++ } ++ ++ /* Do _not_ restore root. Image contains absolute pathnames. ++ * So, we fix it in context of rst process. 
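++	 * rst_restore_fs() re-reads root/cwd/altroot dentries from the
++	 * image with cpt_get_dentry() and installs them explicitly.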
++ */ ++ ++ obj = cpt_object_add(CPT_OBJ_FS, f, ctx); ++ if (obj) ++ cpt_obj_setpos(obj, ti->cpt_fs, ctx); ++ ++ return 0; ++} ++ ++int cpt_get_dentry(struct dentry **dp, struct vfsmount **mp, ++ loff_t *pos, struct cpt_context *ctx) ++{ ++ struct cpt_file_image fi; ++ struct file * file; ++ int err; ++ ++ err = rst_get_object(CPT_OBJ_FILE, *pos, &fi, ctx); ++ if (err) ++ return err; ++ ++ file = rst_file(*pos, -2, ctx); ++ if (IS_ERR(file)) ++ return PTR_ERR(file); ++ ++ *dp = dget(file->f_dentry); ++ *mp = mntget(file->f_vfsmnt); ++ *pos += fi.cpt_next; ++ fput(file); ++ return 0; ++} ++ ++static void __set_fs_root(struct fs_struct *fs, struct vfsmount *mnt, ++ struct dentry *dentry) ++{ ++ struct dentry *old_root; ++ struct vfsmount *old_rootmnt; ++ write_lock(&fs->lock); ++ old_root = fs->root.dentry; ++ old_rootmnt = fs->root.mnt; ++ fs->root.mnt = mnt; ++ fs->root.dentry = dentry; ++ write_unlock(&fs->lock); ++ if (old_root) { ++ dput(old_root); ++ mntput(old_rootmnt); ++ } ++} ++ ++static void __set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, ++ struct dentry *dentry) ++{ ++ struct dentry *old_pwd; ++ struct vfsmount *old_pwdmnt; ++ ++ write_lock(&fs->lock); ++ old_pwd = fs->pwd.dentry; ++ old_pwdmnt = fs->pwd.mnt; ++ fs->pwd.mnt = mnt; ++ fs->pwd.dentry = dentry; ++ write_unlock(&fs->lock); ++ ++ if (old_pwd) { ++ dput(old_pwd); ++ mntput(old_pwdmnt); ++ } ++} ++ ++ ++int rst_restore_fs(struct cpt_context *ctx) ++{ ++ loff_t pos; ++ cpt_object_t *obj; ++ int err = 0; ++ ++ for_each_object(obj, CPT_OBJ_FS) { ++ struct cpt_fs_struct_image fi; ++ struct fs_struct *fs = obj->o_obj; ++ int i; ++ struct dentry *d[3]; ++ struct vfsmount *m[3]; ++ ++ err = rst_get_object(CPT_OBJ_FS, obj->o_pos, &fi, ctx); ++ if (err) ++ return err; ++ ++ fs->umask = fi.cpt_umask; ++ ++ pos = obj->o_pos + fi.cpt_hdrlen; ++ d[0] = d[1] = d[2] = NULL; ++ m[0] = m[1] = m[2] = NULL; ++ i = 0; ++ while (pos < obj->o_pos + fi.cpt_next && i<3) { ++ err = cpt_get_dentry(d+i, m+i, &pos, ctx); ++ if (err) { ++ eprintk_ctx("cannot get_dir: %d", err); ++ for (--i; i >= 0; i--) { ++ if (d[i]) ++ dput(d[i]); ++ if (m[i]) ++ mntput(m[i]); ++ } ++ return err; ++ } ++ i++; ++ } ++ if (d[0]) ++ __set_fs_root(fs, m[0], d[0]); ++ if (d[1]) ++ __set_fs_pwd(fs, m[1], d[1]); ++ if (d[2]) { ++ struct dentry *olddentry; ++ struct vfsmount *oldmnt; ++ write_lock(&fs->lock); ++ oldmnt = fs->altroot.mnt; ++ olddentry = fs->altroot.dentry; ++ fs->altroot.mnt = m[2]; ++ fs->altroot.dentry = d[2]; ++ write_unlock(&fs->lock); ++ ++ if (olddentry) { ++ dput(olddentry); ++ mntput(oldmnt); ++ } ++ } ++ } ++ return err; ++} ++ ++int do_one_mount(char *mntpnt, char *mnttype, char *mntbind, ++ unsigned long flags, unsigned long mnt_flags, ++ struct cpt_context *ctx) ++{ ++ int err; ++ ++ if (mntbind && (strcmp(mntbind, "/") == 0 || strcmp(mntbind, "") == 0)) ++ mntbind = NULL; ++ ++ if (mntbind) ++ flags |= MS_BIND; ++ /* Join per-mountpoint flags with global flags */ ++ if (mnt_flags & MNT_NOSUID) ++ flags |= MS_NOSUID; ++ if (mnt_flags & MNT_NODEV) ++ flags |= MS_NODEV; ++ if (mnt_flags & MNT_NOEXEC) ++ flags |= MS_NOEXEC; ++ ++ err = sc_mount(mntbind, mntpnt, mnttype, flags); ++ if (err < 0) { ++ eprintk_ctx("%d mounting %s %s %08lx\n", err, mntpnt, mnttype, flags); ++ return err; ++ } ++ return 0; ++} ++ ++static int undumptmpfs(void *arg) ++{ ++ int i; ++ int *pfd = arg; ++ int fd1, fd2, err; ++ char *argv[] = { "tar", "x", "-C", "/", "-S", NULL }; ++ ++ if (pfd[0] != 0) ++ sc_dup2(pfd[0], 0); ++ ++ set_fs(KERNEL_DS); ++ fd1 = 
sc_open("/dev/null", O_WRONLY, 0); ++ fd2 = sc_open("/dev/null", O_WRONLY, 0); ++try: ++ if (fd1 < 0 || fd2 < 0) { ++ if (fd1 == -ENOENT && fd2 == -ENOENT) { ++ err = sc_mknod("/dev/null", S_IFCHR|0666, ++ new_encode_dev((MEM_MAJOR<files->fdt->max_fds; i++) ++ sc_close(i); ++ ++ module_put(THIS_MODULE); ++ ++ i = sc_execve("/bin/tar", argv, NULL); ++ eprintk("failed to exec /bin/tar: %d\n", i); ++ return 255 << 8; ++} ++ ++static int rst_restore_tmpfs(loff_t *pos, struct cpt_context * ctx) ++{ ++ int err; ++ int pfd[2]; ++ struct file *f; ++ struct cpt_object_hdr v; ++ int n; ++ loff_t end; ++ int pid; ++ int status; ++ mm_segment_t oldfs; ++ sigset_t ignore, blocked; ++ ++ err = rst_get_object(CPT_OBJ_NAME, *pos, &v, ctx); ++ if (err < 0) ++ return err; ++ ++ err = sc_pipe(pfd); ++ if (err < 0) ++ return err; ++ ignore.sig[0] = CPT_SIG_IGNORE_MASK; ++ sigprocmask(SIG_BLOCK, &ignore, &blocked); ++ pid = err = local_kernel_thread(undumptmpfs, (void*)pfd, SIGCHLD, 0); ++ if (err < 0) { ++ eprintk_ctx("tmpfs local_kernel_thread: %d\n", err); ++ goto out; ++ } ++ f = fget(pfd[1]); ++ sc_close(pfd[1]); ++ sc_close(pfd[0]); ++ ++ ctx->file->f_pos = *pos + v.cpt_hdrlen; ++ end = *pos + v.cpt_next; ++ *pos += v.cpt_next; ++ do { ++ char buf[16]; ++ ++ n = end - ctx->file->f_pos; ++ if (n > sizeof(buf)) ++ n = sizeof(buf); ++ ++ if (ctx->read(buf, n, ctx)) ++ break; ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ f->f_op->write(f, buf, n, &f->f_pos); ++ set_fs(oldfs); ++ } while (ctx->file->f_pos < end); ++ ++ fput(f); ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ if ((err = sc_waitx(pid, 0, &status)) < 0) ++ eprintk_ctx("wait4: %d\n", err); ++ else if ((status & 0x7f) == 0) { ++ err = (status & 0xff00) >> 8; ++ if (err != 0) { ++ eprintk_ctx("tar exited with %d\n", err); ++ err = -EINVAL; ++ } ++ } else { ++ eprintk_ctx("tar terminated\n"); ++ err = -EINVAL; ++ } ++ set_fs(oldfs); ++ sigprocmask(SIG_SETMASK, &blocked, NULL); ++ ++ return err; ++ ++out: ++ if (pfd[1] >= 0) ++ sc_close(pfd[1]); ++ if (pfd[0] >= 0) ++ sc_close(pfd[0]); ++ sigprocmask(SIG_SETMASK, &blocked, NULL); ++ return err; ++} ++ ++int check_ext_mount(char *mntpnt, char *mnttype, struct cpt_context *ctx) ++{ ++ struct mnt_namespace *n; ++ struct list_head *p; ++ struct vfsmount *t; ++ char *path, *path_buf; ++ int ret; ++ ++ n = current->nsproxy->mnt_ns; ++ ret = -ENOENT; ++ path_buf = cpt_get_buf(ctx); ++ down_read(&namespace_sem); ++ list_for_each(p, &n->list) { ++ struct path pt; ++ t = list_entry(p, struct vfsmount, mnt_list); ++ pt.dentry = t->mnt_root; ++ pt.mnt = t; ++ path = d_path(&pt, path_buf, PAGE_SIZE); ++ if (IS_ERR(path)) ++ continue; ++ if (!strcmp(path, mntpnt) && ++ !strcmp(t->mnt_sb->s_type->name, mnttype)) { ++ ret = 0; ++ break; ++ } ++ } ++ up_read(&namespace_sem); ++ __cpt_release_buf(ctx); ++ return ret; ++} ++ ++int restore_one_vfsmount(struct cpt_vfsmount_image *mi, loff_t pos, struct cpt_context *ctx) ++{ ++ int err; ++ loff_t endpos; ++ ++ endpos = pos + mi->cpt_next; ++ pos += mi->cpt_hdrlen; ++ ++ while (pos < endpos) { ++ char *mntdev; ++ char *mntpnt; ++ char *mnttype; ++ char *mntbind; ++ ++ mntdev = __rst_get_name(&pos, ctx); ++ mntpnt = __rst_get_name(&pos, ctx); ++ mnttype = __rst_get_name(&pos, ctx); ++ mntbind = NULL; ++ if (mi->cpt_mntflags & CPT_MNT_BIND) ++ mntbind = __rst_get_name(&pos, ctx); ++ err = -EINVAL; ++ if (mnttype && mntpnt) { ++ err = 0; ++ if (!(mi->cpt_mntflags & CPT_MNT_EXT) && ++ strcmp(mntpnt, "/")) { ++ err = do_one_mount(mntpnt, mnttype, mntbind, ++ mi->cpt_flags, ++ 
mi->cpt_mntflags, ctx); ++ if (!err && ++ strcmp(mnttype, "tmpfs") == 0 && ++ !(mi->cpt_mntflags & (CPT_MNT_BIND))) ++ err = rst_restore_tmpfs(&pos, ctx); ++ } else if (mi->cpt_mntflags & CPT_MNT_EXT) { ++ err = check_ext_mount(mntpnt, mnttype, ctx); ++ if (err) ++ eprintk_ctx("mount point is missing: %s\n", mntpnt); ++ } ++ } ++ if (mntdev) ++ rst_put_name(mntdev, ctx); ++ if (mntpnt) ++ rst_put_name(mntpnt, ctx); ++ if (mnttype) ++ rst_put_name(mnttype, ctx); ++ if (mntbind) ++ rst_put_name(mntbind, ctx); ++ if (err) ++ return err; ++ } ++ return 0; ++} ++ ++int restore_one_namespace(loff_t pos, loff_t endpos, struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_vfsmount_image mi; ++ ++ while (pos < endpos) { ++ err = rst_get_object(CPT_OBJ_VFSMOUNT, pos, &mi, ctx); ++ if (err) ++ return err; ++ err = restore_one_vfsmount(&mi, pos, ctx); ++ if (err) ++ return err; ++ pos += mi.cpt_next; ++ } ++ return 0; ++} ++ ++int rst_root_namespace(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_NAMESPACE]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_object_hdr sbuf; ++ int done = 0; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_NAMESPACE || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ err = rst_get_object(CPT_OBJ_NAMESPACE, sec, &sbuf, ctx); ++ if (err) ++ return err; ++ if (done) { ++ eprintk_ctx("multiple namespaces are not supported\n"); ++ break; ++ } ++ done++; ++ err = restore_one_namespace(sec+sbuf.cpt_hdrlen, sec+sbuf.cpt_next, ctx); ++ if (err) ++ return err; ++ sec += sbuf.cpt_next; ++ } ++ ++ return 0; ++} ++ ++int rst_stray_files(struct cpt_context *ctx) ++{ ++ int err = 0; ++ loff_t sec = ctx->sections[CPT_SECT_FILES]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_FILES || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ struct cpt_object_hdr sbuf; ++ cpt_object_t *obj; ++ ++ err = _rst_get_object(CPT_OBJ_FILE, sec, &sbuf, sizeof(sbuf), ctx); ++ if (err) ++ break; ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, sec, ctx); ++ if (!obj) { ++ struct file *file; ++ ++ dprintk_ctx("stray file %Ld\n", sec); ++ ++ file = rst_sysv_shm_itself(sec, ctx); ++ ++ if (IS_ERR(file)) { ++ eprintk_ctx("rst_stray_files: %ld\n", PTR_ERR(file)); ++ return PTR_ERR(file); ++ } else { ++ fput(file); ++ } ++ } ++ sec += sbuf.cpt_next; ++ } ++ ++ return err; ++} +diff --git a/kernel/cpt/rst_inotify.c b/kernel/cpt/rst_inotify.c +new file mode 100644 +index 0000000..0dcaf47 +--- /dev/null ++++ b/kernel/cpt/rst_inotify.c +@@ -0,0 +1,196 @@ ++/* ++ * ++ * kernel/cpt/rst_inotify.c ++ * ++ * Copyright (C) 2000-2007 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
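++ *
++ * Re-creates inotify instances: a fresh inotify descriptor per
++ * saved instance, then its watches and queued events.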
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_files.h" ++#include "cpt_kernel.h" ++#include "cpt_fsmagic.h" ++#include "cpt_syscalls.h" ++ ++extern struct file_operations inotify_fops; ++ ++struct file *rst_open_inotify(struct cpt_file_image *fi, ++ unsigned flags, ++ struct cpt_context *ctx) ++{ ++ struct file *file; ++ int fd; ++ ++ fd = sys_inotify_init(); ++ if (fd < 0) ++ return ERR_PTR(fd); ++ ++ file = fget(fd); ++ sys_close(fd); ++ return file; ++} ++ ++static int restore_one_inotify(cpt_object_t *obj, ++ loff_t pos, ++ struct cpt_inotify_image *ibuf, ++ cpt_context_t *ctx) ++{ ++ int err = 0; ++ loff_t endpos; ++ struct file *file = obj->o_obj; ++ struct inotify_device *dev; ++ ++ if (file->f_op != &inotify_fops) { ++ eprintk_ctx("bad inotify file\n"); ++ return -EINVAL; ++ } ++ ++ dev = file->private_data; ++ ++ if (unlikely(dev == NULL)) { ++ eprintk_ctx("bad inotify device\n"); ++ return -EINVAL; ++ } ++ ++ endpos = pos + ibuf->cpt_next; ++ pos += ibuf->cpt_hdrlen; ++ while (pos < endpos) { ++ union { ++ struct cpt_inotify_wd_image wi; ++ struct cpt_inotify_ev_image ei; ++ } u; ++ ++ err = rst_get_object(-1, pos, &u, ctx); ++ if (err) { ++ eprintk_ctx("rst_get_object: %d\n", err); ++ return err; ++ } ++ if (u.wi.cpt_object == CPT_OBJ_INOTIFY_WATCH) { ++ struct path p; ++ loff_t fpos = pos + u.wi.cpt_hdrlen; ++ ++ err = cpt_get_dentry(&p.dentry, &p.mnt, &fpos, ctx); ++ if (err) { ++ eprintk_ctx("cpt_get_dentry: %d\n", err); ++ return err; ++ } ++ ++ mutex_lock(&dev->up_mutex); ++ dev->ih->last_wd = u.wi.cpt_wd - 1; ++ err = inotify_create_watch(dev, &p, u.wi.cpt_mask); ++ dev->ih->last_wd = ibuf->cpt_last_wd; ++ if (err != u.wi.cpt_wd) { ++ eprintk_ctx("wrong inotify descriptor %u %u\n", err, u.wi.cpt_wd); ++ if (err >= 0) ++ err = -EINVAL; ++ } else ++ err = 0; ++ mutex_unlock(&dev->up_mutex); ++ path_put(&p); ++ if (err) ++ break; ++ } else if (u.wi.cpt_object == CPT_OBJ_INOTIFY_EVENT) { ++ struct inotify_user_watch dummy_watch; ++ struct inotify_watch *w; ++ char *name = NULL; ++ ++ if (u.ei.cpt_namelen) { ++ name = kmalloc(u.ei.cpt_namelen+1, GFP_KERNEL); ++ if (name == NULL) { ++ err = -ENOMEM; ++ break; ++ } ++ name[u.ei.cpt_namelen] = 0; ++ err = ctx->pread(name, u.ei.cpt_namelen, ctx, pos + u.ei.cpt_hdrlen); ++ if (err) { ++ kfree(name); ++ break; ++ } ++ } ++ ++ w = &dummy_watch.wdata; ++ dummy_watch.dev = dev; ++ atomic_set(&w->count, 2); ++ ++ /* Trick to avoid destruction due to exit event */ ++ if (u.ei.cpt_mask & (IN_IGNORED | IN_ONESHOT)) ++ atomic_inc(&w->count); ++ dev->ih->in_ops->handle_event(w, u.ei.cpt_wd, u.ei.cpt_mask, ++ u.ei.cpt_cookie, name, NULL); ++ if (name) ++ kfree(name); ++ } else { ++ eprintk_ctx("bad object: %u\n", u.wi.cpt_object); ++ err = -EINVAL; ++ break; ++ } ++ pos += u.wi.cpt_next; ++ } ++ return err; ++} ++ ++int rst_inotify(cpt_context_t *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_INOTIFY]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_INOTIFY || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ cpt_object_t *obj; ++ 
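/* each CPT_SECT_INOTIFY entry is a cpt_inotify_image; the
++ * file object it refers to must already be restored */
++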
struct cpt_inotify_image ibuf; ++ ++ err = rst_get_object(CPT_OBJ_INOTIFY, sec, &ibuf, ctx); ++ if (err) ++ return err; ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ibuf.cpt_file, ctx); ++ if (obj == NULL) { ++ eprintk_ctx("cannot find inotify file object\n"); ++ return -EINVAL; ++ } ++ err = restore_one_inotify(obj, sec, &ibuf, ctx); ++ if (err) ++ return err; ++ sec += ibuf.cpt_next; ++ } ++ ++ return 0; ++ ++} +diff --git a/kernel/cpt/rst_mm.c b/kernel/cpt/rst_mm.c +new file mode 100644 +index 0000000..380b382 +--- /dev/null ++++ b/kernel/cpt/rst_mm.c +@@ -0,0 +1,1145 @@ ++/* ++ * ++ * kernel/cpt/rst_mm.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_X86 ++#include ++#include ++#endif ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_VE ++#include ++#include ++#endif ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_files.h" ++#include "cpt_ubc.h" ++#include "cpt_mm.h" ++#include "cpt_kernel.h" ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++#include "cpt_pagein.h" ++#endif ++ ++#include "cpt_syscalls.h" ++ ++#define __PAGE_NX (1ULL<<63) ++ ++static unsigned long make_prot(struct cpt_vma_image *vmai) ++{ ++ unsigned long prot = 0; ++ ++ if (vmai->cpt_flags&VM_READ) ++ prot |= PROT_READ; ++ if (vmai->cpt_flags&VM_WRITE) ++ prot |= PROT_WRITE; ++ if (vmai->cpt_flags&VM_EXEC) ++ prot |= PROT_EXEC; ++ if (vmai->cpt_flags&VM_GROWSDOWN) ++ prot |= PROT_GROWSDOWN; ++ if (vmai->cpt_flags&VM_GROWSUP) ++ prot |= PROT_GROWSUP; ++ return prot; ++} ++ ++static unsigned long make_flags(struct cpt_vma_image *vmai) ++{ ++ unsigned long flags = MAP_FIXED; ++ ++ if (vmai->cpt_flags&(VM_SHARED|VM_MAYSHARE)) ++ flags |= MAP_SHARED; ++ else ++ flags |= MAP_PRIVATE; ++ ++ if (vmai->cpt_file == CPT_NULL) ++ flags |= MAP_ANONYMOUS; ++ if (vmai->cpt_flags&VM_GROWSDOWN) ++ flags |= MAP_GROWSDOWN; ++#ifdef MAP_GROWSUP ++ if (vmai->cpt_flags&VM_GROWSUP) ++ flags |= MAP_GROWSUP; ++#endif ++ if (vmai->cpt_flags&VM_DENYWRITE) ++ flags |= MAP_DENYWRITE; ++ if (vmai->cpt_flags&VM_EXECUTABLE) ++ flags |= MAP_EXECUTABLE; ++ if (!(vmai->cpt_flags&VM_ACCOUNT)) ++ flags |= MAP_NORESERVE; ++ return flags; ++} ++ ++#ifdef CONFIG_X86 ++#if !defined(CONFIG_X86_64) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) \ ++ && !defined(CONFIG_XEN) ++static int __alloc_ldt(mm_context_t *pc, int mincount) ++{ ++ int oldsize, newsize, nr; ++ ++ if (mincount <= pc->size) ++ return 0; ++ /* ++ * LDT got larger - reallocate if necessary. 
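++ * The size is rounded up to a multiple of 512 entries and the
++ * storage is allocated page by page, so a partial allocation
++ * can be unwound cleanly on failure.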
++ */ ++ oldsize = pc->size; ++ mincount = (mincount+511)&(~511); ++ newsize = mincount*LDT_ENTRY_SIZE; ++ for (nr = 0; nr * PAGE_SIZE < newsize; nr++) { ++ BUG_ON(nr * PAGE_SIZE >= 64*1024); ++ if (!pc->ldt_pages[nr]) { ++ pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER|__GFP_UBC); ++ if (!pc->ldt_pages[nr]) ++ goto nomem; ++ clear_highpage(pc->ldt_pages[nr]); ++ } ++ } ++ pc->size = mincount; ++ return 0; ++ ++nomem: ++ while (--nr >= 0) ++ __free_page(pc->ldt_pages[nr]); ++ pc->size = 0; ++ return -ENOMEM; ++} ++ ++static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx) ++{ ++ struct mm_struct *mm = current->mm; ++ int i; ++ int err; ++ int size; ++ ++ err = __alloc_ldt(&mm->context, li->cpt_size/LDT_ENTRY_SIZE); ++ if (err) ++ return err; ++ ++ size = mm->context.size*LDT_ENTRY_SIZE; ++ ++ for (i = 0; i < size; i += PAGE_SIZE) { ++ int nr = i / PAGE_SIZE, bytes; ++ char *kaddr = kmap(mm->context.ldt_pages[nr]); ++ ++ bytes = size - i; ++ if (bytes > PAGE_SIZE) ++ bytes = PAGE_SIZE; ++ err = ctx->pread(kaddr, bytes, ctx, pos + li->cpt_hdrlen + i); ++ kunmap(mm->context.ldt_pages[nr]); ++ if (err) ++ return err; ++ } ++ ++ load_LDT(&mm->context); ++ return 0; ++} ++ ++#else ++ ++static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx) ++{ ++ struct mm_struct *mm = current->mm; ++ int oldsize = mm->context.size; ++ void *oldldt; ++ void *newldt; ++ int err; ++ ++ if (li->cpt_size > PAGE_SIZE) ++ newldt = vmalloc(li->cpt_size); ++ else ++ newldt = kmalloc(li->cpt_size, GFP_KERNEL); ++ ++ if (!newldt) ++ return -ENOMEM; ++ ++ err = ctx->pread(newldt, li->cpt_size, ctx, pos + li->cpt_hdrlen); ++ if (err) ++ return err; ++ ++ oldldt = mm->context.ldt; ++ mm->context.ldt = newldt; ++ mm->context.size = li->cpt_size/LDT_ENTRY_SIZE; ++ ++ load_LDT(&mm->context); ++ ++ if (oldsize) { ++ if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) ++ vfree(oldldt); ++ else ++ kfree(oldldt); ++ } ++ return 0; ++} ++#endif ++#endif ++ ++static int ++restore_aio_ring(struct kioctx *aio_ctx, struct cpt_aio_ctx_image *aimg) ++{ ++ struct aio_ring_info *info = &aio_ctx->ring_info; ++ unsigned nr_events = aio_ctx->max_reqs; ++ unsigned long size; ++ int nr_pages; ++ ++ /* We recalculate parameters of the ring exactly like ++ * fs/aio.c does and then compare calculated values ++ * with ones, stored in dump. They must be the same. */ ++ ++ nr_events += 2; ++ ++ size = sizeof(struct aio_ring); ++ size += sizeof(struct io_event) * nr_events; ++ nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; ++ ++ if (nr_pages != aimg->cpt_ring_pages) ++ return -EINVAL; ++ ++ info->nr_pages = nr_pages; ++ ++ nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); ++ ++ if (nr_events != aimg->cpt_nr) ++ return -EINVAL; ++ ++ info->nr = 0; ++ info->ring_pages = info->internal_pages; ++ if (nr_pages > AIO_RING_PAGES) { ++ info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL); ++ if (!info->ring_pages) ++ return -ENOMEM; ++ memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages); ++ } ++ ++ info->mmap_size = nr_pages * PAGE_SIZE; ++ ++ /* This piece of shit is not entirely my fault. Kernel aio.c makes ++ * something odd mmap()ping some pages and then pinning them. ++ * I guess it is just some mud remained of failed attempt to show ring ++ * to user space. The result is odd. :-) Immediately after ++ * creation of AIO context, kernel shares those pages with user ++ * and user can read and even write there. 
But after the first
++ * fork, pages are marked COW with evident consequences.
++ * I remember, I did the same mistake in the first version
++ * of mmapped packet socket, luckily that crap never reached
++ * mainstream.
++ *
++ * So, what are we going to do? I can simulate this odd behaviour
++ * exactly, but I am not insane yet. For now just take the pages
++ * from user space. Alternatively, we could keep kernel copy
++ * in AIO context image, which would be more correct.
++ *
++ * What is wrong now? If the pages are COWed, ring is transferred
++ * incorrectly.
++ */
++ down_read(&current->mm->mmap_sem);
++ info->mmap_base = aimg->cpt_mmap_base;
++ info->nr_pages = get_user_pages(current, current->mm,
++ info->mmap_base, nr_pages,
++ 1, 0, info->ring_pages, NULL);
++ up_read(&current->mm->mmap_sem);
++
++ if (unlikely(info->nr_pages != nr_pages)) {
++ int i;
++
++ for (i=0; i<info->nr_pages; i++)
++ put_page(info->ring_pages[i]);
++ if (info->ring_pages && info->ring_pages != info->internal_pages)
++ kfree(info->ring_pages);
++ return -EFAULT;
++ }
++
++ aio_ctx->user_id = info->mmap_base;
++
++ info->nr = nr_events;
++ info->tail = aimg->cpt_tail;
++
++ return 0;
++}
++
++static int do_rst_aio(struct cpt_aio_ctx_image *aimg, loff_t pos, cpt_context_t *ctx)
++{
++ int err;
++ struct kioctx *aio_ctx;
++ extern spinlock_t aio_nr_lock;
++
++ aio_ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL);
++ if (!aio_ctx)
++ return -ENOMEM;
++
++ memset(aio_ctx, 0, sizeof(*aio_ctx));
++ aio_ctx->max_reqs = aimg->cpt_max_reqs;
++
++ if ((err = restore_aio_ring(aio_ctx, aimg)) < 0) {
++ kmem_cache_free(kioctx_cachep, aio_ctx);
++ eprintk_ctx("AIO %Ld restore_aio_ring: %d\n", pos, err);
++ return err;
++ }
++
++ aio_ctx->mm = current->mm;
++ atomic_inc(&aio_ctx->mm->mm_count);
++ atomic_set(&aio_ctx->users, 1);
++ spin_lock_init(&aio_ctx->ctx_lock);
++ spin_lock_init(&aio_ctx->ring_info.ring_lock);
++ init_waitqueue_head(&aio_ctx->wait);
++ INIT_LIST_HEAD(&aio_ctx->active_reqs);
++ INIT_LIST_HEAD(&aio_ctx->run_list);
++ INIT_WORK(&aio_ctx->wq.work, aio_kick_handler);
++
++ spin_lock(&aio_nr_lock);
++ aio_nr += aio_ctx->max_reqs;
++ spin_unlock(&aio_nr_lock);
++
++ write_lock(&aio_ctx->mm->ioctx_list_lock);
++ aio_ctx->next = aio_ctx->mm->ioctx_list;
++ aio_ctx->mm->ioctx_list = aio_ctx;
++ write_unlock(&aio_ctx->mm->ioctx_list_lock);
++
++ return 0;
++}
++
++struct anonvma_map
++{
++ struct hlist_node list;
++ struct anon_vma *avma;
++ __u64 id;
++};
++
++static int verify_create_anonvma(struct mm_struct *mm,
++ struct cpt_vma_image *vmai,
++ cpt_context_t *ctx)
++{
++ struct anon_vma *avma = NULL;
++ struct anon_vma *new_avma;
++ struct vm_area_struct *vma;
++ int h;
++
++ if (!ctx->anonvmas) {
++ if (CPT_ANONVMA_HSIZE*sizeof(struct hlist_head) > PAGE_SIZE)
++ return -EINVAL;
++ if ((ctx->anonvmas = (void*)__get_free_page(GFP_KERNEL)) == NULL)
++ return -ENOMEM;
++ for (h = 0; h < CPT_ANONVMA_HSIZE; h++)
++ INIT_HLIST_HEAD(&ctx->anonvmas[h]);
++ } else {
++ struct anonvma_map *map;
++ struct hlist_node *elem;
++
++ h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS);
++ hlist_for_each_entry(map, elem, &ctx->anonvmas[h], list) {
++ if (map->id == vmai->cpt_anonvmaid) {
++ avma = map->avma;
++ break;
++ }
++ }
++ }
++
++ down_read(&mm->mmap_sem);
++ if ((vma = find_vma(mm, vmai->cpt_start)) == NULL) {
++ up_read(&mm->mmap_sem);
++ return -ESRCH;
++ }
++ if (vma->vm_start != vmai->cpt_start) {
++ up_read(&mm->mmap_sem);
++ eprintk_ctx("vma start mismatch\n");
++ return -EINVAL;
++ }
++ if (vma->vm_pgoff !=
vmai->cpt_pgoff) { ++ dprintk_ctx("vma pgoff mismatch, fixing\n"); ++ if (vma->vm_file || (vma->vm_flags&(VM_SHARED|VM_MAYSHARE))) { ++ eprintk_ctx("cannot fixup vma pgoff\n"); ++ up_read(&mm->mmap_sem); ++ return -EINVAL; ++ } ++ vma->vm_pgoff = vmai->cpt_pgoff; ++ } ++ ++ if (!vma->anon_vma) { ++ if (avma) { ++ vma->anon_vma = avma; ++ anon_vma_link(vma); ++ } else { ++ int err; ++ ++ err = anon_vma_prepare(vma); ++ ++ if (err) { ++ up_read(&mm->mmap_sem); ++ return err; ++ } ++ } ++ } else { ++ /* Note, we _can_ arrive to the situation, when two ++ * different anonvmaid's point to one anon_vma, this happens ++ * f.e. when mmap() merged new area to previous one and ++ * they will share one anon_vma even if they did not on ++ * original host. ++ * ++ * IT IS OK. To all that I understand, we may merge all ++ * the anon_vma's and rmap can scan all the huge list of vmas ++ * searching for page. It is just "suboptimal". ++ * ++ * Real disaster would happen, if vma already got an anon_vma ++ * with different id. It is very rare case, kernel does the ++ * best efforts to merge anon_vmas when some attributes are ++ * different. In this case we will fall to copying memory. ++ */ ++ if (avma && vma->anon_vma != avma) { ++ up_read(&mm->mmap_sem); ++ wprintk_ctx("anon_vma mismatch\n"); ++ return 0; ++ } ++ } ++ ++ new_avma = vma->anon_vma; ++ up_read(&mm->mmap_sem); ++ ++ if (!avma) { ++ struct anonvma_map *map; ++ ++ if (!new_avma) ++ return -EINVAL; ++ ++ if ((map = kmalloc(sizeof(*map), GFP_KERNEL)) == NULL) ++ return -ENOMEM; ++ ++ map->id = vmai->cpt_anonvmaid; ++ map->avma = new_avma; ++ h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS); ++ hlist_add_head(&map->list, &ctx->anonvmas[h]); ++ } ++ return 0; ++} ++ ++static int copy_mm_pages(struct mm_struct *src, unsigned long start, ++ unsigned long end) ++{ ++ int err; ++ ++ for (; start < end; start += PAGE_SIZE) { ++ struct page *page; ++ struct page *spage; ++ void *maddr, *srcaddr; ++ ++ err = get_user_pages(current, current->mm, ++ start, 1, 1, 1, &page, NULL); ++ if (err == 0) ++ err = -EFAULT; ++ if (err < 0) ++ return err; ++ ++ err = get_user_pages(current, src, ++ start, 1, 0, 1, &spage, NULL); ++ ++ if (err == 0) ++ err = -EFAULT; ++ if (err < 0) { ++ page_cache_release(page); ++ return err; ++ } ++ ++ srcaddr = kmap(spage); ++ maddr = kmap(page); ++ memcpy(maddr, srcaddr, PAGE_SIZE); ++ set_page_dirty_lock(page); ++ kunmap(page); ++ kunmap(spage); ++ page_cache_release(page); ++ page_cache_release(spage); ++ } ++ return 0; ++} ++ ++static int do_rst_vma(struct cpt_vma_image *vmai, loff_t vmapos, loff_t mmpos, struct cpt_context *ctx) ++{ ++ int err = 0; ++ unsigned long addr; ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ struct file *file = NULL; ++ unsigned long prot; ++ int checked = 0; ++ ++ if (vmai->cpt_type == CPT_VMA_VDSO) { ++ if (ctx->vdso == NULL) { ++#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES ++ err = arch_setup_additional_pages(NULL, 0, ++ vmai->cpt_start); ++#endif ++ goto out; ++ } ++ } ++ ++ prot = make_prot(vmai); ++ ++ if (vmai->cpt_file != CPT_NULL) { ++ if (vmai->cpt_type == CPT_VMA_TYPE_0) { ++ file = rst_file(vmai->cpt_file, -1, ctx); ++ if (IS_ERR(file)) { ++ eprintk_ctx("do_rst_vma: rst_file: %Ld\n", (unsigned long long)vmai->cpt_file); ++ return PTR_ERR(file); ++ } ++ } else if (vmai->cpt_type == CPT_VMA_TYPE_SHM) { ++ file = rst_sysv_shm_vma(vmai, ctx); ++ if (IS_ERR(file)) ++ return PTR_ERR(file); ++ } ++ } ++ ++ down_write(&mm->mmap_sem); ++ addr = 
do_mmap_pgoff(file, vmai->cpt_start, ++ vmai->cpt_end-vmai->cpt_start, ++ prot, make_flags(vmai), ++ vmai->cpt_pgoff); ++ ++ if (addr != vmai->cpt_start) { ++ up_write(&mm->mmap_sem); ++ ++ err = -EINVAL; ++ if (IS_ERR((void*)addr)) ++ err = addr; ++ goto out; ++ } ++ ++ vma = find_vma(mm, vmai->cpt_start); ++ if (vma == NULL) { ++ up_write(&mm->mmap_sem); ++ eprintk_ctx("cannot find mmapped vma\n"); ++ err = -ESRCH; ++ goto out; ++ } ++ ++ /* do_mmap_pgoff() can merge new area to previous one (not to the next, ++ * we mmap in order, the rest of mm is still unmapped). This can happen ++ * f.e. if flags are to be adjusted later, or if we had different ++ * anon_vma on two adjacent regions. Split it by brute force. */ ++ if (vma->vm_start != vmai->cpt_start) { ++ dprintk_ctx("vma %Ld merged, split\n", vmapos); ++ err = split_vma(mm, vma, (unsigned long)vmai->cpt_start, 0); ++ if (err) { ++ up_write(&mm->mmap_sem); ++ eprintk_ctx("cannot split vma\n"); ++ goto out; ++ } ++ } ++ up_write(&mm->mmap_sem); ++ ++ if (vmai->cpt_anonvma && vmai->cpt_anonvmaid) { ++ err = verify_create_anonvma(mm, vmai, ctx); ++ if (err) { ++ eprintk_ctx("cannot verify_create_anonvma %Ld\n", vmapos); ++ goto out; ++ } ++ } ++ ++ if (vmai->cpt_type == CPT_VMA_VDSO) { ++ struct page *page; ++ void *maddr; ++ ++ err = get_user_pages(current, current->mm, ++ (unsigned long)vmai->cpt_start, ++ 1, 1, 1, &page, NULL); ++ if (err == 0) ++ err = -EFAULT; ++ if (err < 0) { ++ eprintk_ctx("can't get vdso: get_user_pages: %d\n", err); ++ goto out; ++ } ++ err = 0; ++ maddr = kmap(page); ++ memcpy(maddr, ctx->vdso, PAGE_SIZE); ++ set_page_dirty_lock(page); ++ kunmap(page); ++ page_cache_release(page); ++ goto out; ++ } ++ ++ if (vmai->cpt_next > vmai->cpt_hdrlen) { ++ loff_t offset = vmapos + vmai->cpt_hdrlen; ++ ++ do { ++ union { ++ struct cpt_page_block pb; ++ struct cpt_remappage_block rpb; ++ struct cpt_copypage_block cpb; ++ struct cpt_lazypage_block lpb; ++ struct cpt_iterpage_block ipb; ++ } u; ++ loff_t pos; ++ ++ err = rst_get_object(-1, offset, &u, ctx); ++ if (err) { ++ eprintk_ctx("vma fix object: %d\n", err); ++ goto out; ++ } ++ if (u.rpb.cpt_object == CPT_OBJ_REMAPPAGES) { ++ err = sc_remap_file_pages(u.rpb.cpt_start, ++ u.rpb.cpt_end-u.rpb.cpt_start, ++ 0, u.rpb.cpt_pgoff, 0); ++ if (err < 0) { ++ eprintk_ctx("remap_file_pages: %d (%08x,%u,%u)\n", err, ++ (__u32)u.rpb.cpt_start, (__u32)(u.rpb.cpt_end-u.rpb.cpt_start), ++ (__u32)u.rpb.cpt_pgoff); ++ goto out; ++ } ++ offset += u.rpb.cpt_next; ++ continue; ++ } else if (u.cpb.cpt_object == CPT_OBJ_LAZYPAGES) { ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ unsigned long ptr = u.lpb.cpt_start; ++ ++ down_read(&mm->mmap_sem); ++ if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) { ++ up_read(&mm->mmap_sem); ++ eprintk_ctx("lost vm_area_struct\n"); ++ err = -ESRCH; ++ goto out; ++ } ++ err = anon_vma_prepare(vma); ++ if (err) { ++ up_read(&mm->mmap_sem); ++ goto out; ++ } ++ while (ptr < u.lpb.cpt_end) { ++ err = rst_pagein(vma, u.lpb.cpt_index + (ptr-u.lpb.cpt_start)/PAGE_SIZE, ++ ptr, ctx); ++ if (err) ++ break; ++ ptr += PAGE_SIZE; ++ } ++ up_read(&mm->mmap_sem); ++#else ++ err = -EINVAL; ++#endif ++ if (err) ++ goto out; ++ offset += u.cpb.cpt_next; ++ continue; ++ } else if (u.cpb.cpt_object == CPT_OBJ_COPYPAGES) { ++ struct vm_area_struct *vma, *vma1; ++ struct mm_struct *src; ++ struct anon_vma *src_anon; ++ cpt_object_t *mobj; ++ ++ if (!vmai->cpt_anonvmaid) { ++ err = -EINVAL; ++ eprintk_ctx("CPT_OBJ_COPYPAGES in !anonvma\n"); ++ goto out; ++ } ++ ++ mobj = 
lookup_cpt_obj_bypos(CPT_OBJ_MM, u.cpb.cpt_source, ctx); ++ if (!mobj) { ++ eprintk_ctx("lost mm_struct to clone pages from\n"); ++ err = -ESRCH; ++ goto out; ++ } ++ src = mobj->o_obj; ++ ++ down_read(&src->mmap_sem); ++ src_anon = NULL; ++ vma1 = find_vma(src, u.cpb.cpt_start); ++ if (vma1) ++ src_anon = vma1->anon_vma; ++ up_read(&src->mmap_sem); ++ ++ if (!vma1) { ++ eprintk_ctx("lost src vm_area_struct\n"); ++ err = -ESRCH; ++ goto out; ++ } ++ ++ down_read(&mm->mmap_sem); ++ if ((vma = find_vma(mm, u.cpb.cpt_start)) == NULL) { ++ up_read(&mm->mmap_sem); ++ eprintk_ctx("lost vm_area_struct\n"); ++ err = -ESRCH; ++ goto out; ++ } ++ ++ if (!src_anon || ++ !vma->anon_vma || ++ vma->anon_vma != src_anon || ++ vma->vm_start - vma1->vm_start != ++ (vma->vm_pgoff - vma1->vm_pgoff) << PAGE_SHIFT) { ++ up_read(&mm->mmap_sem); ++ wprintk_ctx("anon_vma mismatch in vm_area_struct %Ld\n", vmapos); ++ err = copy_mm_pages(mobj->o_obj, ++ u.cpb.cpt_start, ++ u.cpb.cpt_end); ++ } else { ++ err = __copy_page_range(vma, vma1, ++ u.cpb.cpt_start, ++ u.cpb.cpt_end-u.cpb.cpt_start); ++ up_read(&mm->mmap_sem); ++ } ++ if (err) { ++ eprintk_ctx("clone_page_range: %d (%08x,%u,%ld)\n", err, ++ (__u32)u.cpb.cpt_start, (__u32)(u.cpb.cpt_end-u.cpb.cpt_start), ++ (long)u.cpb.cpt_source); ++ goto out; ++ } ++ ++ offset += u.cpb.cpt_next; ++ continue; ++ } else if (u.pb.cpt_object == CPT_OBJ_ITERPAGES || ++ u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES ++ ) { ++#ifdef CONFIG_VZ_CHECKPOINT_ITER ++ unsigned long ptr = u.lpb.cpt_start; ++ u64 page_pos[16]; ++ pos = offset + sizeof(u.pb); ++ ++ err = ctx->pread(&page_pos, ++ 8*(u.lpb.cpt_end-ptr)/PAGE_SIZE, ++ ctx, ++ pos); ++ if (err) { ++ eprintk_ctx("Oops\n"); ++ goto out; ++ } ++ ++ down_read(&mm->mmap_sem); ++ if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) { ++ up_read(&mm->mmap_sem); ++ eprintk_ctx("lost vm_area_struct\n"); ++ err = -ESRCH; ++ goto out; ++ } ++ err = anon_vma_prepare(vma); ++ if (err) { ++ up_read(&mm->mmap_sem); ++ goto out; ++ } ++ while (ptr < u.lpb.cpt_end) { ++ err = rst_iter(vma, ++ page_pos[(ptr-u.lpb.cpt_start)/PAGE_SIZE], ++ ptr, ++ ctx); ++ if (err) ++ break; ++ ptr += PAGE_SIZE; ++ } ++ if (u.pb.cpt_object == CPT_OBJ_ITERYOUNGPAGES) { ++ make_pages_present((unsigned long)u.lpb.cpt_start, ++ (unsigned long)u.lpb.cpt_end); ++ } ++ up_read(&mm->mmap_sem); ++#else ++ err = -EINVAL; ++#endif ++ if (err) ++ goto out; ++ offset += u.cpb.cpt_next; ++ continue; ++ } ++ if (u.pb.cpt_object != CPT_OBJ_PAGES) { ++ eprintk_ctx("unknown vma fix object %d\n", u.pb.cpt_object); ++ err = -EINVAL; ++ goto out; ++ } ++ pos = offset + sizeof(u.pb); ++ if (!(vmai->cpt_flags&VM_ACCOUNT) && !(prot&PROT_WRITE)) { ++ /* I guess this is get_user_pages() messed things, ++ * this happens f.e. when gdb inserts breakpoints. 
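++ * Writing through get_user_pages() breaks COW page by page,
++ * so the mapping never has to be made PROT_WRITE.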
++ */ ++ int i; ++ for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/PAGE_SIZE; i++) { ++ struct page *page; ++ void *maddr; ++ err = get_user_pages(current, current->mm, ++ (unsigned long)u.pb.cpt_start + i*PAGE_SIZE, ++ 1, 1, 1, &page, NULL); ++ if (err == 0) ++ err = -EFAULT; ++ if (err < 0) { ++ eprintk_ctx("get_user_pages: %d\n", err); ++ goto out; ++ } ++ err = 0; ++ maddr = kmap(page); ++ if (u.pb.cpt_content == CPT_CONTENT_VOID) { ++ memset(maddr, 0, PAGE_SIZE); ++ } else if (u.pb.cpt_content == CPT_CONTENT_DATA) { ++ err = ctx->pread(maddr, PAGE_SIZE, ++ ctx, pos + i*PAGE_SIZE); ++ if (err) { ++ kunmap(page); ++ goto out; ++ } ++ } else { ++ err = -EINVAL; ++ kunmap(page); ++ goto out; ++ } ++ set_page_dirty_lock(page); ++ kunmap(page); ++ page_cache_release(page); ++ } ++ } else { ++ if (!(prot&PROT_WRITE)) ++ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE); ++ if (u.pb.cpt_content == CPT_CONTENT_VOID) { ++ int i; ++ for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/sizeof(unsigned long); i++) { ++ err = __put_user(0UL, ((unsigned long __user*)(unsigned long)u.pb.cpt_start) + i); ++ if (err) { ++ eprintk_ctx("__put_user 2 %d\n", err); ++ goto out; ++ } ++ } ++ } else if (u.pb.cpt_content == CPT_CONTENT_DATA) { ++ loff_t tpos = pos; ++ err = ctx->file->f_op->read(ctx->file, cpt_ptr_import(u.pb.cpt_start), ++ u.pb.cpt_end-u.pb.cpt_start, ++ &tpos); ++ if (err != u.pb.cpt_end-u.pb.cpt_start) { ++ if (err >= 0) ++ err = -EIO; ++ goto out; ++ } ++ } else { ++ err = -EINVAL; ++ goto out; ++ } ++ if (!(prot&PROT_WRITE)) ++ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot); ++ } ++ err = 0; ++ offset += u.pb.cpt_next; ++ } while (offset < vmapos + vmai->cpt_next); ++ } ++ ++check: ++ do { ++ struct vm_area_struct *vma; ++ down_read(&mm->mmap_sem); ++ vma = find_vma(mm, addr); ++ if (vma) { ++ if ((vma->vm_flags^vmai->cpt_flags)&VM_READHINTMASK) { ++ VM_ClearReadHint(vma); ++ vma->vm_flags |= vmai->cpt_flags&VM_READHINTMASK; ++ } ++ if ((vma->vm_flags^vmai->cpt_flags)&VM_LOCKED) { ++ dprintk_ctx("fixing up VM_LOCKED %Ld\n", vmapos); ++ up_read(&mm->mmap_sem); ++ if (vma->vm_flags&VM_LOCKED) ++ err = sc_munlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start); ++ else ++ err = sc_mlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start); ++ /* When mlock fails with EFAULT, it means ++ * that it could not bring in pages. ++ * It can happen after mlock() on unreadable ++ * VMAs. But VMA is correctly locked, ++ * so that this error can be ignored. */ ++ if (err == -EFAULT) ++ err = 0; ++ if (err) ++ goto out; ++ goto check; ++ } ++ if ((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&~__PAGE_NX) ++ wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos, ++ (unsigned long long)vma->vm_page_prot.pgprot, ++ (unsigned long long)vmai->cpt_pgprot); ++#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) ++ if (((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&__PAGE_NX) && ++ (ctx->kernel_config_flags&CPT_KERNEL_CONFIG_PAE)) ++ wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos, ++ (__u64)vma->vm_page_prot.pgprot, (__u64)vmai->cpt_pgprot); ++#endif ++ if (vma->vm_flags != vmai->cpt_flags) { ++ unsigned long x = vma->vm_flags ^ vmai->cpt_flags; ++ if (x & VM_EXEC) { ++ /* Crap. On i386 this is OK. ++ * It is impossible to make via mmap/mprotect ++ * exec.c clears VM_EXEC on stack. 
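++ * Dropping the flag here mirrors that behaviour, so this
++ * particular mismatch is expected and harmless.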
*/ ++ vma->vm_flags &= ~VM_EXEC; ++ } else if ((x & VM_ACCOUNT) && !checked) { ++ checked = 1; ++ if (!(prot&PROT_WRITE)) { ++ up_read(&mm->mmap_sem); ++ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE); ++ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot); ++ goto check; ++ } ++ wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos, ++ (__u32)vma->vm_flags, (__u32)vmai->cpt_flags); ++ } else { ++ wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos, ++ (__u32)vma->vm_flags, (__u32)vmai->cpt_flags); ++ } ++ } ++ } else { ++ wprintk_ctx("no VMA for %08lx@%ld\n", addr, (long)vmapos); ++ } ++ up_read(&mm->mmap_sem); ++ } while (0); ++ ++out: ++ if (file) ++ fput(file); ++ return err; ++} ++ ++#ifndef CONFIG_IA64 ++#define TASK_UNMAP_START 0 ++#else ++/* On IA64 the first page is a special VM_IO|VM_RESERVED mapping ++ * used to accelerate speculative dereferences of NULL pointer. */ ++#define TASK_UNMAP_START PAGE_SIZE ++#endif ++ ++static int do_rst_mm(struct cpt_mm_image *vmi, loff_t pos, struct cpt_context *ctx) ++{ ++ int err = 0; ++ unsigned int def_flags; ++ struct mm_struct *mm = current->mm; ++#ifdef CONFIG_BEANCOUNTERS ++ struct user_beancounter *bc; ++#endif ++ ++ down_write(&mm->mmap_sem); ++ do_munmap(mm, TASK_UNMAP_START, TASK_SIZE-TASK_UNMAP_START); ++ ++#ifdef CONFIG_BEANCOUNTERS ++ /* ++ * MM beancounter is usually correct from the fork time, ++ * but not for init, for example. ++ * Luckily, mm_ub can be changed for a completely empty MM. ++ */ ++ bc = rst_lookup_ubc(vmi->cpt_mmub, ctx); ++ err = virtinfo_notifier_call(VITYPE_SCP, VIRTINFO_SCP_RSTMM, bc); ++ if (err & NOTIFY_FAIL) { ++ up_write(&mm->mmap_sem); ++ return -ECHRNG; ++ } ++ if ((err & VIRTNOTIFY_CHANGE) && bc != mm->mm_ub) { ++ struct user_beancounter *old_bc; ++ ++ old_bc = mm->mm_ub; ++ mm->mm_ub = bc; ++ bc = old_bc; ++ } ++ err = 0; ++ put_beancounter(bc); ++#endif ++ ++ mm->start_code = vmi->cpt_start_code; ++ mm->end_code = vmi->cpt_end_code; ++ mm->start_data = vmi->cpt_start_data; ++ mm->end_data = vmi->cpt_end_data; ++ mm->start_brk = vmi->cpt_start_brk; ++ mm->brk = vmi->cpt_brk; ++ mm->start_stack = vmi->cpt_start_stack; ++ mm->arg_start = vmi->cpt_start_arg; ++ mm->arg_end = vmi->cpt_end_arg; ++ mm->env_start = vmi->cpt_start_env; ++ mm->env_end = vmi->cpt_end_env; ++ mm->def_flags = 0; ++ def_flags = vmi->cpt_def_flags; ++ ++ mm->flags = vmi->cpt_dumpable; ++ if (ctx->image_version < CPT_VERSION_24) ++ mm->flags |= MMF_DUMP_FILTER_DEFAULT << MMF_DUMPABLE_BITS; ++ ++ mm->vps_dumpable = vmi->cpt_vps_dumpable; ++#ifndef CONFIG_IA64 ++ if (ctx->image_version >= CPT_VERSION_9) { ++ mm->context.vdso = cpt_ptr_import(vmi->cpt_vdso); ++ current_thread_info()->sysenter_return = ++ VDSO32_SYMBOL(mm->context.vdso, SYSENTER_RETURN); ++ } ++#endif ++ ++#if 0 /* def CONFIG_HUGETLB_PAGE*/ ++/* NB: ? */ ++ int used_hugetlb; ++#endif ++ up_write(&mm->mmap_sem); ++ ++ if (vmi->cpt_next > vmi->cpt_hdrlen) { ++ loff_t offset = pos + vmi->cpt_hdrlen; ++ do { ++ union { ++ struct cpt_vma_image vmai; ++ struct cpt_aio_ctx_image aioi; ++ struct cpt_obj_bits bits; ++ } u; ++ err = rst_get_object(-1, offset, &u, ctx); ++ if (err) ++ goto out; ++ if (u.vmai.cpt_object == CPT_OBJ_VMA) { ++#ifdef CONFIG_IA64 ++ //// Later... 
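++ /* address 0 on IA64 is the special mapping that was
++ * deliberately not unmapped (see TASK_UNMAP_START),
++ * so never restore a VMA on top of it */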
++ if (u.vmai.cpt_start) ++#endif ++ err = do_rst_vma(&u.vmai, offset, pos, ctx); ++ if (err) ++ goto out; ++#ifdef CONFIG_X86 ++ } else if (u.bits.cpt_object == CPT_OBJ_BITS && ++ u.bits.cpt_content == CPT_CONTENT_MM_CONTEXT) { ++ err = do_rst_ldt(&u.bits, offset, ctx); ++ if (err) ++ goto out; ++#endif ++ } else if (u.aioi.cpt_object == CPT_OBJ_AIO_CONTEXT) { ++ err = do_rst_aio(&u.aioi, offset, ctx); ++ if (err) ++ goto out; ++ } else { ++ eprintk_ctx("unknown object %u in mm image\n", u.vmai.cpt_object); ++ err = -EINVAL; ++ goto out; ++ } ++ offset += u.vmai.cpt_next; ++ } while (offset < pos + vmi->cpt_next); ++ } ++ ++ down_write(&mm->mmap_sem); ++ mm->def_flags = def_flags; ++ up_write(&mm->mmap_sem); ++ ++ ++out: ++ return err; ++} ++ ++extern void exit_mm(struct task_struct * tsk); ++ ++int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ int err = 0; ++ cpt_object_t *mobj; ++ void *tmp = (void*)__get_free_page(GFP_KERNEL); ++ struct cpt_mm_image *vmi = (struct cpt_mm_image *)tmp; ++ ++ if (!tmp) ++ return -ENOMEM; ++ ++ if (ti->cpt_mm == CPT_NULL) { ++ if (current->mm) { ++ virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXIT, ++ current); ++ exit_mm(current); ++ } ++ goto out; ++ } ++ ++ mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx); ++ if (mobj) { ++ if (current->mm != mobj->o_obj) BUG(); ++ goto out; ++ } ++ ++ if (current->mm == NULL) { ++ struct mm_struct *mm = mm_alloc(); ++ if (mm == NULL) { ++ err = -ENOMEM; ++ goto out; ++ } ++ err = init_new_context(current, mm); ++ if (err) { ++ mmdrop(mm); ++ goto out; ++ } ++ current->mm = mm; ++ } ++ ++ if ((err = rst_get_object(CPT_OBJ_MM, ti->cpt_mm, vmi, ctx)) != 0) ++ goto out; ++ if ((err = do_rst_mm(vmi, ti->cpt_mm, ctx)) != 0) { ++ eprintk_ctx("do_rst_mm %Ld\n", (unsigned long long)ti->cpt_mm); ++ goto out; ++ } ++ err = -ENOMEM; ++ mobj = cpt_object_add(CPT_OBJ_MM, current->mm, ctx); ++ if (mobj != NULL) { ++ err = 0; ++ cpt_obj_setpos(mobj, ti->cpt_mm, ctx); ++ } ++ ++out: ++ if (tmp) ++ free_page((unsigned long)tmp); ++ return err; ++} ++ ++/* This is part of mm setup, made in parent context. Mostly, it is the place, ++ * where we graft mm of another process to child. ++ */ ++ ++int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ struct task_struct *tsk = obj->o_obj; ++ cpt_object_t *mobj; ++ ++ /* Task without mm. Just get rid of this. */ ++ if (ti->cpt_mm == CPT_NULL) { ++ if (tsk->mm) { ++ virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_EXIT, ++ tsk); ++ mmput(tsk->mm); ++ tsk->mm = NULL; ++ } ++ return 0; ++ } ++ ++ mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx); ++ if (mobj) { ++ struct mm_struct *newmm = mobj->o_obj; ++ /* Good, the MM is already created. */ ++ if (newmm == tsk->mm) { ++ /* Already done by clone(). */ ++ return 0; ++ } ++ mmput(tsk->mm); ++ atomic_inc(&newmm->mm_users); ++ tsk->mm = newmm; ++ tsk->active_mm = newmm; ++ } ++ return 0; ++} ++ ++/* We use CLONE_VM when mm of child is going to be shared with parent. ++ * Otherwise mm is copied. ++ */ ++ ++__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ if (ti->cpt_mm == CPT_NULL || ++ lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx)) ++ return CLONE_VM; ++ return 0; ++} +diff --git a/kernel/cpt/rst_net.c b/kernel/cpt/rst_net.c +new file mode 100644 +index 0000000..b246ddb +--- /dev/null ++++ b/kernel/cpt/rst_net.c +@@ -0,0 +1,746 @@ ++/* ++ * ++ * kernel/cpt/rst_net.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. 
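++ *
++ * Restores per-container network state: devices, interface
++ * addresses, routes, iptables rules and conntracks.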
++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_kernel.h" ++#include "cpt_net.h" ++#include "cpt_files.h" ++ ++#include "cpt_syscalls.h" ++ ++extern struct in_ifaddr *inet_alloc_ifa(void); ++extern int inet_insert_ifa(struct in_ifaddr *ifa); ++extern struct in_device *inetdev_init(struct net_device *dev); ++ ++int rst_restore_ifaddr(struct cpt_context *ctx) ++{ ++ struct net *net = get_exec_env()->ve_netns; ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_NET_IFADDR]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_ifaddr_image di; ++ struct net_device *dev; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_NET_IFADDR || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ int cindex = -1; ++ int err; ++ err = rst_get_object(CPT_OBJ_NET_IFADDR, sec, &di, ctx); ++ if (err) ++ return err; ++ cindex = di.cpt_index; ++ rtnl_lock(); ++ dev = __dev_get_by_index(net, cindex); ++ if (dev && di.cpt_family == AF_INET) { ++ struct in_device *in_dev; ++ struct in_ifaddr *ifa; ++ if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) ++ in_dev = inetdev_init(dev); ++ ifa = inet_alloc_ifa(); ++ if (ifa) { ++ ifa->ifa_local = di.cpt_address[0]; ++ ifa->ifa_address = di.cpt_peer[0]; ++ ifa->ifa_broadcast = di.cpt_broadcast[0]; ++ ifa->ifa_prefixlen = di.cpt_masklen; ++ ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); ++ ifa->ifa_flags = di.cpt_flags; ++ ifa->ifa_scope = di.cpt_scope; ++ memcpy(ifa->ifa_label, di.cpt_label, IFNAMSIZ); ++ in_dev_hold(in_dev); ++ ifa->ifa_dev = in_dev; ++ err = inet_insert_ifa(ifa); ++ if (err && err != -EEXIST) { ++ rtnl_unlock(); ++ eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label); ++ return err; ++ } ++ } ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ } else if (dev && di.cpt_family == AF_INET6) { ++ __u32 prefered_lft; ++ __u32 valid_lft; ++ struct net *net = get_exec_env()->ve_ns->net_ns; ++ prefered_lft = (di.cpt_flags & IFA_F_DEPRECATED) ? ++ 0 : di.cpt_prefered_lft; ++ valid_lft = (di.cpt_flags & IFA_F_PERMANENT) ? 
++ 0xFFFFFFFF : di.cpt_valid_lft; ++ err = inet6_addr_add(net, dev->ifindex, ++ (struct in6_addr *)di.cpt_address, ++ di.cpt_masklen, 0, ++ prefered_lft, ++ valid_lft); ++ if (err && err != -EEXIST) { ++ rtnl_unlock(); ++ eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label); ++ return err; ++ } ++#endif ++ } else { ++ rtnl_unlock(); ++ eprintk_ctx("unknown ifaddr 2 for %d\n", di.cpt_index); ++ return -EINVAL; ++ } ++ rtnl_unlock(); ++ sec += di.cpt_next; ++ } ++ return 0; ++} ++ ++static int rewrite_rtmsg(struct nlmsghdr *nlh, struct cpt_context *ctx) ++{ ++ int min_len = NLMSG_LENGTH(sizeof(struct rtmsg)); ++ struct rtmsg *rtm = NLMSG_DATA(nlh); ++ __u32 prefix0 = 0; ++ ++ if (nlh->nlmsg_len > min_len) { ++ int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); ++ struct rtattr *rta = (void*)nlh + NLMSG_ALIGN(min_len); ++ ++ while (RTA_OK(rta, attrlen)) { ++ if (rta->rta_type == RTA_DST) { ++ prefix0 = *(__u32*)RTA_DATA(rta); ++ } ++ rta = RTA_NEXT(rta, attrlen); ++ } ++ } ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ if (rtm->rtm_family == AF_INET6) { ++ if (rtm->rtm_type == RTN_LOCAL) ++ return 2; ++ if (rtm->rtm_flags & RTM_F_CLONED) ++ return 2; ++ if (rtm->rtm_protocol == RTPROT_UNSPEC || ++ rtm->rtm_protocol == RTPROT_RA || ++ rtm->rtm_protocol == RTPROT_REDIRECT || ++ rtm->rtm_protocol == RTPROT_KERNEL) ++ return 2; ++ if (rtm->rtm_protocol == RTPROT_BOOT && ++ ((rtm->rtm_dst_len == 8 && prefix0 == htonl(0xFF000000)) || ++ (rtm->rtm_dst_len == 64 && prefix0 == htonl(0xFE800000)))) ++ return 2; ++ } ++#endif ++ return rtm->rtm_protocol == RTPROT_KERNEL; ++} ++ ++int rst_restore_route(struct cpt_context *ctx) ++{ ++ int err; ++ struct socket *sock; ++ struct msghdr msg; ++ struct iovec iov; ++ struct sockaddr_nl nladdr; ++ mm_segment_t oldfs; ++ loff_t sec = ctx->sections[CPT_SECT_NET_ROUTE]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_object_hdr v; ++ char *pg; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_NET_ROUTE || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ if (h.cpt_hdrlen >= h.cpt_next) ++ return 0; ++ ++ sec += h.cpt_hdrlen; ++ err = rst_get_object(CPT_OBJ_NET_ROUTE, sec, &v, ctx); ++ if (err < 0) ++ return err; ++ ++ err = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock); ++ if (err) ++ return err; ++ ++ pg = (char*)__get_free_page(GFP_KERNEL); ++ if (pg == NULL) { ++ err = -ENOMEM; ++ goto out_sock; ++ } ++ ++ memset(&nladdr, 0, sizeof(nladdr)); ++ nladdr.nl_family = AF_NETLINK; ++ ++ endsec = sec + v.cpt_next; ++ sec += v.cpt_hdrlen; ++ ++ while (sec < endsec) { ++ struct nlmsghdr *n; ++ struct nlmsghdr nh; ++ int kernel_flag; ++ ++ if (endsec - sec < sizeof(nh)) ++ break; ++ ++ err = ctx->pread(&nh, sizeof(nh), ctx, sec); ++ if (err) ++ goto out_sock_pg; ++ if (nh.nlmsg_len < sizeof(nh) || nh.nlmsg_len > PAGE_SIZE || ++ endsec - sec < nh.nlmsg_len) { ++ err = -EINVAL; ++ goto out_sock_pg; ++ } ++ err = ctx->pread(pg, nh.nlmsg_len, ctx, sec); ++ if (err) ++ goto out_sock_pg; ++ ++ n = (struct nlmsghdr*)pg; ++ n->nlmsg_flags = NLM_F_REQUEST|NLM_F_APPEND|NLM_F_CREATE; ++ ++ err = rewrite_rtmsg(n, ctx); ++ if (err < 0) ++ goto out_sock_pg; ++ kernel_flag = err; ++ ++ if (kernel_flag == 2) ++ goto do_next; ++ ++ iov.iov_base=n; ++ iov.iov_len=nh.nlmsg_len; ++ msg.msg_name=&nladdr; ++ msg.msg_namelen=sizeof(nladdr); ++ msg.msg_iov=&iov; ++ msg.msg_iovlen=1; ++ msg.msg_control=NULL; ++ 
msg.msg_controllen=0; ++ msg.msg_flags=MSG_DONTWAIT; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ err = sock_sendmsg(sock, &msg, nh.nlmsg_len); ++ set_fs(oldfs); ++ ++ if (err < 0) ++ goto out_sock_pg; ++ err = 0; ++ ++ iov.iov_base=pg; ++ iov.iov_len=PAGE_SIZE; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT); ++ set_fs(oldfs); ++ if (err != -EAGAIN) { ++ if (err == NLMSG_LENGTH(sizeof(struct nlmsgerr)) && ++ n->nlmsg_type == NLMSG_ERROR) { ++ struct nlmsgerr *e = NLMSG_DATA(n); ++ if (e->error != -EEXIST || !kernel_flag) ++ eprintk_ctx("NLMERR: %d\n", e->error); ++ } else { ++ eprintk_ctx("Res: %d %d\n", err, n->nlmsg_type); ++ } ++ } ++do_next: ++ err = 0; ++ sec += NLMSG_ALIGN(nh.nlmsg_len); ++ } ++ ++out_sock_pg: ++ free_page((unsigned long)pg); ++out_sock: ++ sock_release(sock); ++ return err; ++} ++ ++int rst_resume_network(struct cpt_context *ctx) ++{ ++ struct ve_struct *env; ++ ++ env = get_ve_by_id(ctx->ve_id); ++ if (!env) ++ return -ESRCH; ++ env->disable_net = 0; ++ put_ve(env); ++ return 0; ++} ++ ++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) ++extern unsigned int tun_net_id; ++#endif ++ ++/* We do not restore skb queue, just reinit it */ ++static int rst_restore_tuntap(loff_t start, struct cpt_netdev_image *di, ++ struct cpt_context *ctx) ++{ ++ int err = -ENODEV; ++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) ++ struct cpt_tuntap_image ti; ++ struct net_device *dev; ++ struct file *bind_file = NULL; ++ struct net *net; ++ struct tun_struct *tun; ++ struct tun_net *tn; ++ loff_t pos; ++ ++ pos = start + di->cpt_hdrlen; ++ err = rst_get_object(CPT_OBJ_NET_TUNTAP, pos, &ti, ctx); ++ if (err) ++ return err; ++ ++ pos += ti.cpt_next; ++ if (ti.cpt_bindfile) { ++ bind_file = rst_file(ti.cpt_bindfile, -1, ctx); ++ if (IS_ERR(bind_file)) { ++ eprintk_ctx("rst_restore_tuntap:" ++ "rst_file: %Ld\n", ++ (unsigned long long)ti.cpt_bindfile); ++ return PTR_ERR(bind_file); ++ } ++ } ++ ++ rtnl_lock(); ++ err = -ENOMEM; ++ dev = alloc_netdev(sizeof(struct tun_struct), di->cpt_name, tun_setup); ++ if (!dev) ++ goto out; ++ ++ tun = netdev_priv(dev); ++ ++ tun->dev = dev; ++ tun->owner = ti.cpt_owner; ++ tun->flags = ti.cpt_flags; ++ tun->attached = ti.cpt_attached; ++ tun->if_flags = ti.cpt_if_flags; ++ tun_net_init(dev); ++ BUG_ON(sizeof(ti.cpt_dev_addr) != sizeof(tun->dev_addr)); ++ memcpy(tun->dev_addr, ti.cpt_dev_addr, sizeof(ti.cpt_dev_addr)); ++ BUG_ON(sizeof(ti.cpt_chr_filter) != sizeof(tun->chr_filter)); ++ memcpy(tun->chr_filter, ti.cpt_chr_filter, sizeof(ti.cpt_chr_filter)); ++ BUG_ON(sizeof(ti.cpt_net_filter) != sizeof(tun->net_filter)); ++ memcpy(tun->net_filter, ti.cpt_net_filter, sizeof(ti.cpt_net_filter)); ++ ++ err = register_netdevice(dev); ++ if (err < 0) { ++ free_netdev(dev); ++ eprintk_ctx("failed to register tun/tap net device\n"); ++ goto out; ++ } ++ if (pos < start + di->cpt_next) { ++ struct cpt_hwaddr_image hw; ++ /* Restore hardware address */ ++ err = rst_get_object(CPT_OBJ_NET_HWADDR, pos, ++ &hw, ctx); ++ if (err) ++ goto out; ++ BUG_ON(sizeof(hw.cpt_dev_addr) != sizeof(dev->dev_addr)); ++ memcpy(dev->dev_addr, hw.cpt_dev_addr, ++ sizeof(hw.cpt_dev_addr)); ++ } ++ net = get_exec_env()->ve_ns->net_ns; ++ tn = net_generic(net, tun_net_id); ++ list_add(&tun->list, &tn->dev_list); ++ ++ bind_file->private_data = tun; ++ tun->bind_file = bind_file; ++ ++out: ++ fput(bind_file); ++ rtnl_unlock(); ++#endif ++ return err; ++} ++ ++static int rst_restore_veth(loff_t pos, struct net_device *dev, 
++ struct cpt_context *ctx) ++{ ++ int err = -ENODEV; ++#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) ++ struct cpt_veth_image vi; ++ struct veth_struct *veth; ++ ++ if (!KSYMREF(veth_open) || dev->open != KSYMREF(veth_open)) { ++ eprintk_ctx("Module vzethdev is not loaded, " ++ "or device %s is not a veth device\n", dev->name); ++ return -EINVAL; ++ } ++ err = rst_get_object(CPT_OBJ_NET_VETH, pos, &vi, ctx); ++ if (err) ++ return err; ++ veth = veth_from_netdev(dev); ++ veth->allow_mac_change = vi.cpt_allow_mac_change; ++#endif ++ return err; ++} ++ ++static int rst_restore_netstats(loff_t pos, struct net_device *dev, ++ struct cpt_context * ctx) ++{ ++ struct cpt_netstats_image *n; ++ struct net_device_stats *stats = NULL; ++ struct net_device *lo = get_exec_env()->ve_netns->loopback_dev; ++ int err; ++ ++ if (!dev->get_stats) ++ return 0; ++ ++ n = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_NET_STATS, pos, n, ctx); ++ if (err) ++ goto out; ++ BUG_ON(sizeof(struct cpt_netstats_image) != n->cpt_hdrlen); ++ preempt_disable(); ++ if (dev == lo) ++ stats = &lo->stats; ++#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) ++ else if (KSYMREF(veth_open) && dev->open == KSYMREF(veth_open)) ++ stats = veth_stats(dev, smp_processor_id()); ++#endif ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ else if (dev == get_exec_env()->_venet_dev) ++ stats = venet_stats(dev, smp_processor_id()); ++#endif ++#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) ++ if (dev->open == tun_net_open) ++ stats = &dev->stats; ++#endif ++ if (!stats) { ++ err = -ENODEV; ++ eprintk_ctx("Network device %s is not supported\n", dev->name); ++ goto out; ++ } ++ ++ stats->rx_packets = n->cpt_rx_packets; ++ stats->tx_packets = n->cpt_tx_packets; ++ stats->rx_bytes = n->cpt_rx_bytes; ++ stats->tx_bytes = n->cpt_tx_bytes; ++ stats->rx_errors = n->cpt_rx_errors; ++ stats->tx_errors = n->cpt_tx_errors; ++ stats->rx_dropped = n->cpt_rx_dropped; ++ stats->tx_dropped = n->cpt_tx_dropped; ++ stats->multicast = n->cpt_multicast; ++ stats->collisions = n->cpt_collisions; ++ stats->rx_length_errors = n->cpt_rx_length_errors; ++ stats->rx_over_errors = n->cpt_rx_over_errors; ++ stats->rx_crc_errors = n->cpt_rx_crc_errors; ++ stats->rx_frame_errors = n->cpt_rx_frame_errors; ++ stats->rx_fifo_errors = n->cpt_rx_fifo_errors; ++ stats->rx_missed_errors = n->cpt_rx_missed_errors; ++ stats->tx_aborted_errors = n->cpt_tx_aborted_errors; ++ stats->tx_carrier_errors = n->cpt_tx_carrier_errors; ++ stats->tx_fifo_errors = n->cpt_tx_fifo_errors; ++ stats->tx_heartbeat_errors = n->cpt_tx_heartbeat_errors; ++ stats->tx_window_errors = n->cpt_tx_window_errors; ++ stats->rx_compressed = n->cpt_rx_compressed; ++ stats->tx_compressed = n->cpt_tx_compressed; ++ ++out: ++ preempt_enable(); ++ cpt_release_buf(ctx); ++ return err; ++} ++ ++int rst_restore_netdev(struct cpt_context *ctx) ++{ ++ struct net *net = get_exec_env()->ve_netns; ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_NET_DEVICE]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_netdev_image di; ++ struct net_device *dev; ++ ++ get_exec_env()->disable_net = 1; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_NET_DEVICE || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ loff_t pos; ++ struct net_device *dev_new; ++ err = 
rst_get_object(CPT_OBJ_NET_DEVICE, sec, &di, ctx); ++ if (err) ++ return err; ++ ++ pos = sec + di.cpt_hdrlen; ++ if (di.cpt_next > sizeof(di)) { ++ struct cpt_object_hdr hdr; ++ err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr), ++ ctx, sec + di.cpt_hdrlen); ++ if (err) ++ return err; ++ if (hdr.cpt_object == CPT_OBJ_NET_TUNTAP) { ++ err = rst_restore_tuntap(sec, &di, ctx); ++ if (err) { ++ eprintk_ctx("restore tuntap %s: %d\n", ++ di.cpt_name, err); ++ return err; ++ } ++ pos += hdr.cpt_next; ++ } ++ } ++ ++ rtnl_lock(); ++ dev = __dev_get_by_name(net, di.cpt_name); ++ if (dev) { ++ if (dev->ifindex != di.cpt_index) { ++ dev_new = __dev_get_by_index(net, di.cpt_index); ++ if (!dev_new) { ++ write_lock_bh(&dev_base_lock); ++ hlist_del(&dev->index_hlist); ++ if (dev->iflink == dev->ifindex) ++ dev->iflink = di.cpt_index; ++ dev->ifindex = di.cpt_index; ++ hlist_add_head(&dev->index_hlist, ++ dev_index_hash(net, dev->ifindex)); ++ write_unlock_bh(&dev_base_lock); ++ } else { ++ write_lock_bh(&dev_base_lock); ++ hlist_del(&dev->index_hlist); ++ hlist_del(&dev_new->index_hlist); ++ if (dev_new->iflink == dev_new->ifindex) ++ dev_new->iflink = dev->ifindex; ++ dev_new->ifindex = dev->ifindex; ++ if (dev->iflink == dev->ifindex) ++ dev->iflink = di.cpt_index; ++ dev->ifindex = di.cpt_index; ++ hlist_add_head(&dev->index_hlist, ++ dev_index_hash(net, dev->ifindex)); ++ hlist_add_head(&dev_new->index_hlist, ++ dev_index_hash(net, dev_new->ifindex)); ++ write_unlock_bh(&dev_base_lock); ++ } ++ } ++ if (di.cpt_flags^dev->flags) { ++ err = dev_change_flags(dev, di.cpt_flags); ++ if (err) ++ eprintk_ctx("dev_change_flags err: %d\n", err); ++ } ++ while (pos < sec + di.cpt_next) { ++ struct cpt_object_hdr hdr; ++ err = ctx->pread(&hdr, sizeof(struct cpt_object_hdr), ++ ctx, pos); ++ if (err) ++ goto out; ++ if (hdr.cpt_object == CPT_OBJ_NET_VETH) { ++ err = rst_restore_veth(pos, dev, ctx); ++ if (err) { ++ eprintk_ctx("restore veth %s: %d\n", ++ di.cpt_name, err); ++ goto out; ++ } ++ } else if (hdr.cpt_object == CPT_OBJ_NET_HWADDR) { ++ /* Restore hardware address */ ++ struct cpt_hwaddr_image hw; ++ err = rst_get_object(CPT_OBJ_NET_HWADDR, ++ pos, &hw, ctx); ++ if (err) ++ goto out; ++ BUG_ON(sizeof(hw.cpt_dev_addr) != ++ sizeof(dev->dev_addr)); ++ memcpy(dev->dev_addr, hw.cpt_dev_addr, ++ sizeof(hw.cpt_dev_addr)); ++ } else if (hdr.cpt_object == CPT_OBJ_NET_STATS) { ++ err = rst_restore_netstats(pos, dev, ctx); ++ if (err) { ++ eprintk_ctx("rst stats %s: %d\n", ++ di.cpt_name, err); ++ goto out; ++ } ++ } ++ pos += hdr.cpt_next; ++ } ++ } else { ++ eprintk_ctx("unknown interface 2 %s\n", di.cpt_name); ++ } ++ rtnl_unlock(); ++ sec += di.cpt_next; ++ } ++ return 0; ++out: ++ rtnl_unlock(); ++ return err; ++} ++ ++static int dumpfn(void *arg) ++{ ++ int i; ++ int *pfd = arg; ++ char *argv[] = { "iptables-restore", "-c", NULL }; ++ ++ if (pfd[0] != 0) ++ sc_dup2(pfd[0], 0); ++ ++ for (i=1; i<current->files->fdt->max_fds; i++) ++ sc_close(i); ++ ++ module_put(THIS_MODULE); ++ ++ set_fs(KERNEL_DS); ++ i = sc_execve("/sbin/iptables-restore", argv, NULL); ++ if (i == -ENOENT) ++ i = sc_execve("/usr/sbin/iptables-restore", argv, NULL); ++ eprintk("failed to exec iptables-restore: %d\n", i); ++ return 255 << 8; ++} ++ ++static int rst_restore_iptables(struct cpt_context * ctx) ++{ ++ int err; ++ int pfd[2]; ++ struct file *f; ++ struct cpt_object_hdr v; ++ int n; ++ struct cpt_section_hdr h; ++ loff_t sec = ctx->sections[CPT_SECT_NET_IPTABLES]; ++ loff_t end; ++ int pid; ++ int status; ++ mm_segment_t oldfs; ++
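++ /* The ruleset is not parsed in the kernel: dumpfn() above execs the
++  * userspace iptables-restore binary with the read end of the pipe
++  * created below wired to its stdin, and the dumped image is then
++  * streamed into the write end in small chunks. */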
sigset_t ignore, blocked; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_NET_IPTABLES || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ if (h.cpt_hdrlen == h.cpt_next) ++ return 0; ++ if (h.cpt_hdrlen > h.cpt_next) ++ return -EINVAL; ++ sec += h.cpt_hdrlen; ++ err = rst_get_object(CPT_OBJ_NAME, sec, &v, ctx); ++ if (err < 0) ++ return err; ++ ++ err = sc_pipe(pfd); ++ if (err < 0) ++ return err; ++ ignore.sig[0] = CPT_SIG_IGNORE_MASK; ++ sigprocmask(SIG_BLOCK, &ignore, &blocked); ++ pid = err = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0); ++ if (err < 0) { ++ eprintk_ctx("iptables local_kernel_thread: %d\n", err); ++ goto out; ++ } ++ f = fget(pfd[1]); ++ sc_close(pfd[1]); ++ sc_close(pfd[0]); ++ ++ ctx->file->f_pos = sec + v.cpt_hdrlen; ++ end = sec + v.cpt_next; ++ do { ++ char *p; ++ char buf[16]; ++ ++ n = end - ctx->file->f_pos; ++ if (n > sizeof(buf)) ++ n = sizeof(buf); ++ ++ if (ctx->read(buf, n, ctx)) ++ break; ++ if ((p = memchr(buf, 0, n)) != NULL) ++ n = p - buf; ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ f->f_op->write(f, buf, n, &f->f_pos); ++ set_fs(oldfs); ++ } while (ctx->file->f_pos < end); ++ ++ fput(f); ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ if ((err = sc_waitx(pid, 0, &status)) < 0) ++ eprintk_ctx("wait4: %d\n", err); ++ else if ((status & 0x7f) == 0) { ++ err = (status & 0xff00) >> 8; ++ if (err != 0) { ++ eprintk_ctx("iptables-restore exited with %d\n", err); ++ err = -EINVAL; ++ } ++ } else { ++ eprintk_ctx("iptables-restore terminated\n"); ++ err = -EINVAL; ++ } ++ set_fs(oldfs); ++ sigprocmask(SIG_SETMASK, &blocked, NULL); ++ ++ return err; ++ ++out: ++ if (pfd[1] >= 0) ++ sc_close(pfd[1]); ++ if (pfd[0] >= 0) ++ sc_close(pfd[0]); ++ sigprocmask(SIG_SETMASK, &blocked, NULL); ++ return err; ++} ++ ++int rst_restore_net(struct cpt_context *ctx) ++{ ++ int err; ++ ++ err = rst_restore_netdev(ctx); ++ if (!err) ++ err = rst_restore_ifaddr(ctx); ++ if (!err) ++ err = rst_restore_route(ctx); ++ if (!err) ++ err = rst_restore_iptables(ctx); ++ if (!err) ++ err = rst_restore_ip_conntrack(ctx); ++ return err; ++} +diff --git a/kernel/cpt/rst_proc.c b/kernel/cpt/rst_proc.c +new file mode 100644 +index 0000000..189649f +--- /dev/null ++++ b/kernel/cpt/rst_proc.c +@@ -0,0 +1,580 @@ ++/* ++ * ++ * kernel/cpt/rst_proc.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_dump.h" ++#include "cpt_files.h" ++#include "cpt_mm.h" ++#include "cpt_kernel.h" ++ ++MODULE_AUTHOR("Alexey Kuznetsov "); ++MODULE_LICENSE("GPL"); ++ ++/* List of contexts and lock protecting the list */ ++static struct list_head cpt_context_list; ++static spinlock_t cpt_context_lock; ++ ++static int proc_read(char *buffer, char **start, off_t offset, ++ int length, int *eof, void *data) ++{ ++ off_t pos = 0; ++ off_t begin = 0; ++ int len = 0; ++ cpt_context_t *ctx; ++ ++ len += sprintf(buffer, "Ctx Id VE State\n"); ++ ++ spin_lock(&cpt_context_lock); ++ ++ list_for_each_entry(ctx, &cpt_context_list, ctx_list) { ++ len += sprintf(buffer+len,"%p %08x %-8u %d", ++ ctx, ++ ctx->contextid, ++ ctx->ve_id, ++ ctx->ctx_state ++ ); ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ len += pagein_info_printf(buffer+len, ctx); ++#endif ++ ++ buffer[len++] = '\n'; ++ ++ pos = begin+len; ++ if (pos < offset) { ++ len = 0; ++ begin = pos; ++ } ++ if (pos > offset+length) ++ goto done; ++ } ++ *eof = 1; ++ ++done: ++ spin_unlock(&cpt_context_lock); ++ *start = buffer + (offset - begin); ++ len -= (offset - begin); ++ if(len > length) ++ len = length; ++ if(len < 0) ++ len = 0; ++ return len; ++} ++ ++void rst_context_release(cpt_context_t *ctx) ++{ ++ list_del(&ctx->ctx_list); ++ spin_unlock(&cpt_context_lock); ++ ++ if (ctx->ctx_state > 0) ++ rst_resume(ctx); ++ ctx->ctx_state = CPT_CTX_ERROR; ++ ++ rst_close_dumpfile(ctx); ++ ++ if (ctx->anonvmas) { ++ int h; ++ for (h = 0; h < CPT_ANONVMA_HSIZE; h++) { ++ while (!hlist_empty(&ctx->anonvmas[h])) { ++ struct hlist_node *elem = ctx->anonvmas[h].first; ++ hlist_del(elem); ++ kfree(elem); ++ } ++ } ++ free_page((unsigned long)ctx->anonvmas); ++ } ++ cpt_flush_error(ctx); ++ if (ctx->errorfile) { ++ fput(ctx->errorfile); ++ ctx->errorfile = NULL; ++ } ++ if (ctx->error_msg) { ++ free_page((unsigned long)ctx->error_msg); ++ ctx->error_msg = NULL; ++ } ++#ifdef CONFIG_VZ_CHECKPOINT_ITER ++ rst_drop_iter_dir(ctx); ++#endif ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ if (ctx->pagein_file_out) ++ fput(ctx->pagein_file_out); ++ if (ctx->pagein_file_in) ++ fput(ctx->pagein_file_in); ++ if (ctx->pgin_task) ++ put_task_struct(ctx->pgin_task); ++#endif ++ if (ctx->filejob_queue) ++ rst_flush_filejobs(ctx); ++ if (ctx->vdso) ++ free_page((unsigned long)ctx->vdso); ++ if (ctx->objcount) ++ eprintk_ctx("%d objects leaked\n", ctx->objcount); ++ kfree(ctx); ++ ++ spin_lock(&cpt_context_lock); ++} ++ ++static void __cpt_context_put(cpt_context_t *ctx) ++{ ++ if (!--ctx->refcount) ++ rst_context_release(ctx); ++} ++ ++static void cpt_context_put(cpt_context_t *ctx) ++{ ++ spin_lock(&cpt_context_lock); ++ __cpt_context_put(ctx); ++ spin_unlock(&cpt_context_lock); ++} ++ ++cpt_context_t * rst_context_open(void) ++{ ++ cpt_context_t *ctx; ++ ++ if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) { ++ rst_context_init(ctx); ++ spin_lock(&cpt_context_lock); ++ list_add_tail(&ctx->ctx_list, &cpt_context_list); ++ spin_unlock(&cpt_context_lock); ++ ctx->error_msg = (char*)__get_free_page(GFP_KERNEL); ++ if (ctx->error_msg != NULL) ++ ctx->error_msg[0] = 0; ++ } ++ return ctx; ++} ++ ++void rst_report_error(int err, cpt_context_t *ctx) ++{ ++ if (ctx->statusfile) { ++ mm_segment_t oldfs; ++ int status = 7 /* VZ_ENVCREATE_ERROR */; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); 
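++ /* set_fs(KERNEL_DS) widens the address limit so that the
++  * copy_from_user() done inside ->write() accepts this kernel-space
++  * &status buffer; the old limit is restored right after the call. */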
++ if (ctx->statusfile->f_op && ctx->statusfile->f_op->write) ++ ctx->statusfile->f_op->write(ctx->statusfile, (char*)&status, sizeof(status), &ctx->statusfile->f_pos); ++ set_fs(oldfs); ++ fput(ctx->statusfile); ++ ctx->statusfile = NULL; ++ } ++} ++ ++ ++static cpt_context_t * cpt_context_lookup(unsigned int ctxid) ++{ ++ cpt_context_t *ctx; ++ ++ spin_lock(&cpt_context_lock); ++ list_for_each_entry(ctx, &cpt_context_list, ctx_list) { ++ if (ctx->contextid == ctxid) { ++ ctx->refcount++; ++ spin_unlock(&cpt_context_lock); ++ return ctx; ++ } ++ } ++ spin_unlock(&cpt_context_lock); ++ return NULL; ++} ++ ++static int rst_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg) ++{ ++ int err = 0; ++ cpt_context_t *ctx; ++ struct file *dfile = NULL; ++ ++ unlock_kernel(); ++ ++ if (cmd == CPT_TEST_CAPS) { ++ err = test_cpu_caps(); ++ goto out_lock; ++ } ++ ++ if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) { ++ cpt_context_t *old_ctx; ++ ++ ctx = NULL; ++ if (cmd == CPT_JOIN_CONTEXT) { ++ err = -ENOENT; ++ ctx = cpt_context_lookup(arg); ++ if (!ctx) ++ goto out_lock; ++ } ++ ++ spin_lock(&cpt_context_lock); ++ old_ctx = (cpt_context_t*)file->private_data; ++ file->private_data = ctx; ++ ++ if (old_ctx) { ++ if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) { ++ old_ctx->sticky = 0; ++ old_ctx->refcount--; ++ } ++ __cpt_context_put(old_ctx); ++ } ++ spin_unlock(&cpt_context_lock); ++ err = 0; ++ goto out_lock; ++ } ++ ++ spin_lock(&cpt_context_lock); ++ ctx = (cpt_context_t*)file->private_data; ++ if (ctx) ++ ctx->refcount++; ++ spin_unlock(&cpt_context_lock); ++ ++ if (!ctx) { ++ cpt_context_t *old_ctx; ++ ++ err = -ENOMEM; ++ ctx = rst_context_open(); ++ if (!ctx) ++ goto out_lock; ++ ++ spin_lock(&cpt_context_lock); ++ old_ctx = (cpt_context_t*)file->private_data; ++ if (!old_ctx) { ++ ctx->refcount++; ++ file->private_data = ctx; ++ } else { ++ old_ctx->refcount++; ++ } ++ if (old_ctx) { ++ __cpt_context_put(ctx); ++ ctx = old_ctx; ++ } ++ spin_unlock(&cpt_context_lock); ++ } ++ ++ if (cmd == CPT_GET_CONTEXT) { ++ unsigned int contextid = (unsigned int)arg; ++ ++ err = -EINVAL; ++ if (ctx->contextid && ctx->contextid != contextid) ++ goto out_nosem; ++ if (!ctx->contextid) { ++ cpt_context_t *c1 = cpt_context_lookup(contextid); ++ if (c1) { ++ cpt_context_put(c1); ++ err = -EEXIST; ++ goto out_nosem; ++ } ++ ctx->contextid = contextid; ++ } ++ spin_lock(&cpt_context_lock); ++ if (!ctx->sticky) { ++ ctx->sticky = 1; ++ ctx->refcount++; ++ } ++ spin_unlock(&cpt_context_lock); ++ err = 0; ++ goto out_nosem; ++ } ++ ++ down(&ctx->main_sem); ++ ++ err = -EBUSY; ++ if (ctx->ctx_state < 0) ++ goto out; ++ ++ err = 0; ++ switch (cmd) { ++ case CPT_SET_DUMPFD: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ if (arg >= 0) { ++ err = -EBADF; ++ dfile = fget(arg); ++ if (dfile == NULL) ++ break; ++ if (dfile->f_op == NULL || ++ dfile->f_op->read == NULL) { ++ fput(dfile); ++ break; ++ } ++ err = 0; ++ } ++ if (ctx->file) ++ fput(ctx->file); ++ ctx->file = dfile; ++ break; ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ case CPT_SET_PAGEINFDIN: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (dfile == NULL) { ++ err = -EBADF; ++ break; ++ } ++ } ++ if (ctx->pagein_file_in) ++ fput(ctx->pagein_file_in); ++ ctx->pagein_file_in = dfile; ++ break; ++ case CPT_SET_PAGEINFDOUT: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (dfile == NULL) { ++ err 
= -EBADF; ++ break; ++ } ++ } ++ if (ctx->pagein_file_out) ++ fput(ctx->pagein_file_out); ++ ctx->pagein_file_out = dfile; ++ break; ++ case CPT_PAGEIND: ++ err = rst_pageind(ctx); ++ break; ++#endif ++#ifdef CONFIG_VZ_CHECKPOINT_ITER ++ case CPT_ITER: ++ err = rst_iteration(ctx); ++ break; ++#endif ++ case CPT_SET_LOCKFD: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (dfile == NULL) { ++ err = -EBADF; ++ break; ++ } ++ } ++ if (ctx->lockfile) ++ fput(ctx->lockfile); ++ ctx->lockfile = dfile; ++ break; ++ case CPT_SET_STATUSFD: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (dfile == NULL) { ++ err = -EBADF; ++ break; ++ } ++ } ++ if (ctx->statusfile) ++ fput(ctx->statusfile); ++ ctx->statusfile = dfile; ++ break; ++ case CPT_SET_ERRORFD: ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (dfile == NULL) { ++ err = -EBADF; ++ break; ++ } ++ } ++ if (ctx->errorfile) ++ fput(ctx->errorfile); ++ ctx->errorfile = dfile; ++ break; ++ case CPT_SET_VEID: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ ctx->ve_id = arg; ++ break; ++ case CPT_UNDUMP: ++ if (ctx->ctx_state > 0) { ++ err = -ENOENT; ++ break; ++ } ++ ctx->ctx_state = CPT_CTX_UNDUMPING; ++ err = vps_rst_undump(ctx); ++ if (err) { ++ rst_report_error(err, ctx); ++ if (rst_kill(ctx) == 0) ++ ctx->ctx_state = CPT_CTX_IDLE; ++ } else { ++ ctx->ctx_state = CPT_CTX_UNDUMPED; ++ } ++ break; ++ case CPT_RESUME: ++ if (!ctx->ctx_state) { ++ err = -ENOENT; ++ break; ++ } ++ err = rst_resume(ctx); ++ if (!err) ++ ctx->ctx_state = CPT_CTX_IDLE; ++ break; ++ case CPT_KILL: ++ if (!ctx->ctx_state) { ++ err = -ENOENT; ++ break; ++ } ++ err = rst_kill(ctx); ++ if (!err) ++ ctx->ctx_state = CPT_CTX_IDLE; ++ break; ++ default: ++ err = -EINVAL; ++ break; ++ } ++ ++out: ++ cpt_flush_error(ctx); ++ up(&ctx->main_sem); ++out_nosem: ++ cpt_context_put(ctx); ++out_lock: ++ lock_kernel(); ++ if (err == -ERESTARTSYS || err == -ERESTARTNOINTR || ++ err == -ERESTARTNOHAND || err == -ERESTART_RESTARTBLOCK) ++ err = -EINTR; ++ return err; ++} ++ ++static int rst_open(struct inode * inode, struct file * file) ++{ ++ if (!try_module_get(THIS_MODULE)) ++ return -EBUSY; ++ ++ return 0; ++} ++ ++static int rst_release(struct inode * inode, struct file * file) ++{ ++ cpt_context_t *ctx; ++ ++ spin_lock(&cpt_context_lock); ++ ctx = (cpt_context_t*)file->private_data; ++ file->private_data = NULL; ++ if (ctx) ++ __cpt_context_put(ctx); ++ spin_unlock(&cpt_context_lock); ++ ++ ++ module_put(THIS_MODULE); ++ return 0; ++} ++ ++static struct file_operations rst_fops = ++{ ++ .owner = THIS_MODULE, ++ .ioctl = rst_ioctl, ++ .open = rst_open, ++ .release = rst_release, ++}; ++ ++ ++static struct proc_dir_entry *proc_ent; ++extern void *schedule_tail_p; ++extern void schedule_tail_hook(void); ++ ++static struct ctl_table_header *ctl_header; ++ ++static ctl_table debug_table[] = { ++ { ++ .procname = "rst", ++ .data = &debug_level, ++ .maxlen = sizeof(debug_level), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { .ctl_name = 0 } ++}; ++static ctl_table root_table[] = { ++ { ++ .ctl_name = CTL_DEBUG, ++ .procname = "debug", ++ .mode = 0555, ++ .child = debug_table, ++ }, ++ { .ctl_name = 0 } ++}; ++ ++static int __init init_rst(void) ++{ ++ int err; ++ ++ err = -ENOMEM; ++ ctl_header = register_sysctl_table(root_table); ++ if (!ctl_header) ++ goto err_mon; ++ ++ spin_lock_init(&cpt_context_lock); ++ INIT_LIST_HEAD(&cpt_context_list); 
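++ /* /proc/rst is created with the default proc fops first; its read,
++  * write and llseek handlers are then copied into rst_fops and the
++  * entry is switched over to rst_fops, so the same file serves both
++  * the status text and the CPT_* ioctl interface. */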
++ ++ err = -EINVAL; ++ proc_ent = proc_create("rst", 0600, NULL, NULL); ++ if (!proc_ent) ++ goto err_out; ++ ++ rst_fops.read = proc_ent->proc_fops->read; ++ rst_fops.write = proc_ent->proc_fops->write; ++ rst_fops.llseek = proc_ent->proc_fops->llseek; ++ proc_ent->proc_fops = &rst_fops; ++ ++ proc_ent->read_proc = proc_read; ++ proc_ent->data = NULL; ++ proc_ent->owner = THIS_MODULE; ++ return 0; ++ ++err_out: ++ unregister_sysctl_table(ctl_header); ++err_mon: ++ return err; ++} ++module_init(init_rst); ++ ++static void __exit exit_rst(void) ++{ ++ remove_proc_entry("rst", NULL); ++ unregister_sysctl_table(ctl_header); ++ ++ spin_lock(&cpt_context_lock); ++ while (!list_empty(&cpt_context_list)) { ++ cpt_context_t *ctx; ++ ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list); ++ ++ if (!ctx->sticky) ++ ctx->refcount++; ++ ctx->sticky = 0; ++ ++ BUG_ON(ctx->refcount != 1); ++ ++ __cpt_context_put(ctx); ++ } ++ spin_unlock(&cpt_context_lock); ++} ++module_exit(exit_rst); +diff --git a/kernel/cpt/rst_process.c b/kernel/cpt/rst_process.c +new file mode 100644 +index 0000000..0f60a06 +--- /dev/null ++++ b/kernel/cpt/rst_process.c +@@ -0,0 +1,1630 @@ ++/* ++ * ++ * kernel/cpt/rst_process.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_X86 ++#include ++#endif ++#include ++ ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_files.h" ++#include "cpt_mm.h" ++#include "cpt_ubc.h" ++#include "cpt_process.h" ++#include "cpt_kernel.h" ++ ++ ++#define HOOK_RESERVE 256 ++ ++struct resume_info ++{ ++ asmlinkage void (*hook)(struct resume_info *); ++ unsigned long hooks; ++#define HOOK_TID 0 ++#define HOOK_CONT 1 ++#define HOOK_LSI 2 ++#define HOOK_RESTART 3 ++ unsigned long tid_ptrs[2]; ++ siginfo_t last_siginfo; ++}; ++ ++#ifdef CONFIG_X86_32 ++ ++#define IN_SYSCALL(regs) ((long)(regs)->orig_ax >= 0) ++#define IN_ERROR(regs) ((long)(regs)->ax < 0) ++#define SYSCALL_ERRNO(regs) (-(long)((regs)->ax)) ++#define SYSCALL_RETVAL(regs) ((regs)->ax) ++#define SYSCALL_NR(regs) ((regs)->orig_ax) ++ ++#define SYSCALL_SETRET(regs,val) do { (regs)->ax = (val); } while (0) ++ ++#define SYSCALL_RESTART2(regs,new) do { (regs)->ax = (new); \ ++ (regs)->ip -= 2; } while (0) ++ ++#define syscall_is(tsk,regs,name) (SYSCALL_NR(regs) == __NR_##name) ++ ++/* In new kernels task_pt_regs() is define to something inappropriate */ ++#undef task_pt_regs ++#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.sp0) - 1) ++ ++#elif defined(CONFIG_X86_64) ++ ++#define IN_SYSCALL(regs) ((long)(regs)->orig_ax >= 0) ++#define IN_ERROR(regs) ((long)(regs)->ax < 0) ++#define SYSCALL_ERRNO(regs) (-(long)((regs)->ax)) ++#define SYSCALL_RETVAL(regs) ((regs)->ax) ++#define SYSCALL_NR(regs) ((regs)->orig_ax) ++ ++#define SYSCALL_SETRET(regs,val) do { (regs)->ax = (val); } while (0) ++ ++#define SYSCALL_RESTART2(regs,new) do { (regs)->ax = (new); \ ++ (regs)->ip -= 2; } while (0) ++ ++#define __NR32_restart_syscall 0 ++#define __NR32_rt_sigtimedwait 177 ++#define __NR32_pause 29 ++#define __NR32_futex 240 ++ ++#define syscall_is(tsk,regs,name) ((!(task_thread_info(tsk)->flags&_TIF_IA32) && \ ++ SYSCALL_NR(regs) == __NR_##name) || \ ++ ((task_thread_info(tsk)->flags&_TIF_IA32) && \ ++ SYSCALL_NR(regs) == 
__NR32_##name)) ++ ++#elif defined (CONFIG_IA64) ++ ++#define IN_SYSCALL(regs) ((long)(regs)->cr_ifs >= 0) ++#define IN_ERROR(regs) ((long)(regs)->r10 == -1) ++#define SYSCALL_ERRNO(regs) ((regs)->r10 == -1 ? (long)((regs)->r8) : 0) ++#define SYSCALL_RETVAL(regs) ((regs)->r8) ++#define SYSCALL_NR(regs) ((regs)->cr_ifs >= 0 ? (regs)->r15 : -1) ++ ++#define SYSCALL_SETRET(regs,val) do { (regs)->r8 = (val); } while (0) ++ ++#define SYSCALL_RESTART2(regs,new) do { (regs)->r15 = (new); \ ++ (regs)->r10 = 0; \ ++ ia64_decrement_ip(regs); } while (0) ++ ++#define syscall_is(tsk,regs,name) (SYSCALL_NR(regs) == __NR_##name) ++ ++#else ++ ++#error This arch is not supported ++ ++#endif ++ ++#define SYSCALL_RESTART(regs) SYSCALL_RESTART2(regs, SYSCALL_NR(regs)) ++ ++pid_t vpid_to_pid(pid_t nr) ++{ ++ pid_t vnr; ++ struct pid *pid; ++ ++ rcu_read_lock(); ++ pid = find_vpid(nr); ++ vnr = (pid == NULL ? -1 : pid->numbers[0].nr); ++ rcu_read_unlock(); ++ return vnr; ++} ++ ++static void decode_siginfo(siginfo_t *info, struct cpt_siginfo_image *si) ++{ ++ memset(info, 0, sizeof(*info)); ++ switch(si->cpt_code & __SI_MASK) { ++ case __SI_TIMER: ++ info->si_tid = si->cpt_pid; ++ info->si_overrun = si->cpt_uid; ++ info->_sifields._timer._sigval.sival_ptr = cpt_ptr_import(si->cpt_sigval); ++ info->si_sys_private = si->cpt_utime; ++ break; ++ case __SI_POLL: ++ info->si_band = si->cpt_pid; ++ info->si_fd = si->cpt_uid; ++ break; ++ case __SI_FAULT: ++ info->si_addr = cpt_ptr_import(si->cpt_sigval); ++#ifdef __ARCH_SI_TRAPNO ++ info->si_trapno = si->cpt_pid; ++#endif ++ break; ++ case __SI_CHLD: ++ info->si_pid = si->cpt_pid; ++ info->si_uid = si->cpt_uid; ++ info->si_status = si->cpt_sigval; ++ info->si_stime = si->cpt_stime; ++ info->si_utime = si->cpt_utime; ++ break; ++ case __SI_KILL: ++ case __SI_RT: ++ case __SI_MESGQ: ++ default: ++ info->si_pid = si->cpt_pid; ++ info->si_uid = si->cpt_uid; ++ info->si_ptr = cpt_ptr_import(si->cpt_sigval); ++ break; ++ } ++ info->si_signo = si->cpt_signo; ++ info->si_errno = si->cpt_errno; ++ info->si_code = si->cpt_code; ++} ++ ++static int restore_sigqueue(struct task_struct *tsk, ++ struct sigpending *queue, unsigned long start, ++ unsigned long end) ++{ ++ while (start < end) { ++ struct cpt_siginfo_image *si = (struct cpt_siginfo_image *)start; ++ if (si->cpt_object == CPT_OBJ_SIGINFO) { ++ struct sigqueue *q = NULL; ++ struct user_struct *up; ++ ++ up = alloc_uid(get_exec_env()->ve_ns->user_ns, si->cpt_user); ++ if (!up) ++ return -ENOMEM; ++ q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC); ++ if (!q) { ++ free_uid(up); ++ return -ENOMEM; ++ } ++ if (ub_siginfo_charge(q, get_exec_ub())) { ++ kmem_cache_free(sigqueue_cachep, q); ++ free_uid(up); ++ return -ENOMEM; ++ } ++ ++ INIT_LIST_HEAD(&q->list); ++ /* Preallocated elements (posix timers) are not ++ * supported yet. It is safe to replace them with ++ * a private one. 
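++ * (With q->flags cleared the element is treated as an ordinary
++ * queued siginfo and freed back to sigqueue_cachep on delivery, so
++ * the substitution should not leak anything.)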
*/ ++ q->flags = 0; ++ q->user = up; ++ atomic_inc(&q->user->sigpending); ++ ++ decode_siginfo(&q->info, si); ++ list_add_tail(&q->list, &queue->list); ++ } ++ start += si->cpt_next; ++ } ++ return 0; ++} ++ ++int rst_process_linkage(cpt_context_t *ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ struct task_struct *tsk = obj->o_obj; ++ struct cpt_task_image *ti = obj->o_image; ++ ++ if (tsk == NULL) { ++ eprintk_ctx("task %u(%s) is missing\n", ti->cpt_pid, ti->cpt_comm); ++ return -EINVAL; ++ } ++ ++ if (task_pgrp_vnr(tsk) != ti->cpt_pgrp) { ++ struct pid *pid; ++ ++ rcu_read_lock(); ++ pid = find_vpid(ti->cpt_pgrp); ++ if (!pid) { ++ eprintk_ctx("illegal PGRP " CPT_FID "\n", CPT_TID(tsk)); ++ return -EINVAL; ++ } ++ ++ write_lock_irq(&tasklist_lock); ++ if (task_pgrp_nr(tsk) != pid_nr(pid)) { ++ detach_pid(tsk, PIDTYPE_PGID); ++ set_task_pgrp(tsk, pid_nr(pid)); ++ if (thread_group_leader(tsk)) ++ attach_pid(tsk, PIDTYPE_PGID, pid); ++ } ++ write_unlock_irq(&tasklist_lock); ++ if (task_pgrp_nr(tsk) != pid_nr(pid)) { ++ eprintk_ctx("cannot set PGRP " CPT_FID "\n", CPT_TID(tsk)); ++ return -EINVAL; ++ } ++ rcu_read_unlock(); ++ } ++ if (task_session_vnr(tsk) != ti->cpt_session) { ++ struct pid *pid; ++ ++ rcu_read_lock(); ++ pid = find_vpid(ti->cpt_session); ++ if (!pid) { ++ eprintk_ctx("illegal SID " CPT_FID "\n", CPT_TID(tsk)); ++ return -EINVAL; ++ } ++ ++ write_lock_irq(&tasklist_lock); ++ if (task_session_nr(tsk) != pid_nr(pid)) { ++ detach_pid(tsk, PIDTYPE_SID); ++ set_task_session(tsk, pid_nr(pid)); ++ if (thread_group_leader(tsk)) ++ attach_pid(tsk, PIDTYPE_SID, pid); ++ } ++ write_unlock_irq(&tasklist_lock); ++ if (task_session_nr(tsk) != pid_nr(pid)) { ++ eprintk_ctx("cannot set SID " CPT_FID "\n", CPT_TID(tsk)); ++ return -EINVAL; ++ } ++ rcu_read_unlock(); ++ } ++ if (ti->cpt_old_pgrp > 0 && !tsk->signal->tty_old_pgrp) { ++ struct pid *pid; ++ ++ rcu_read_lock(); ++ pid = get_pid(find_vpid(ti->cpt_old_pgrp)); ++ if (!pid) { ++ eprintk_ctx("illegal OLD_PGRP " CPT_FID "\n", CPT_TID(tsk)); ++ return -EINVAL; ++ } ++ tsk->signal->tty_old_pgrp = pid; ++ rcu_read_unlock(); ++ } ++ } ++ ++ return 0; ++} ++ ++struct pid *alloc_vpid_safe(pid_t vnr) ++{ ++ struct pid *pid; ++ ++ pid = alloc_pid(current->nsproxy->pid_ns, vnr); ++ if (!pid) ++ pid = find_vpid(vnr); ++ return pid; ++} ++ ++static int ++restore_one_signal_struct(struct cpt_task_image *ti, int *exiting, cpt_context_t *ctx) ++{ ++ int err; ++ struct cpt_signal_image *si = cpt_get_buf(ctx); ++ ++ current->signal->tty = NULL; ++ ++ err = rst_get_object(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, si, ctx); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ ++ if (task_pgrp_vnr(current) != si->cpt_pgrp) { ++ struct pid * pid = NULL, *free = NULL; ++ ++ rcu_read_lock(); ++ if (si->cpt_pgrp_type == CPT_PGRP_ORPHAN) { ++#if 0 ++ if (!is_virtual_pid(si->cpt_pgrp)) { ++ eprintk_ctx("external process group " CPT_FID, CPT_TID(current)); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++#endif ++ pid = alloc_vpid_safe(si->cpt_pgrp); ++ free = pid; ++ } ++ write_lock_irq(&tasklist_lock); ++ if (pid != NULL) { ++ if (task_pgrp_nr(current) != pid_nr(pid)) { ++ detach_pid(current, PIDTYPE_PGID); ++ set_task_pgrp(current, pid_nr(pid)); ++ if (thread_group_leader(current)) { ++ attach_pid(current, PIDTYPE_PGID, pid); ++ free = NULL; ++ } ++ } ++ } ++ write_unlock_irq(&tasklist_lock); ++ if (free != NULL) ++ free_pid(free); ++ rcu_read_unlock(); ++ } ++ ++ current->signal->tty_old_pgrp = NULL; ++ if 
((int)si->cpt_old_pgrp > 0) { ++ if (si->cpt_old_pgrp_type == CPT_PGRP_STRAY) { ++ current->signal->tty_old_pgrp = ++ alloc_pid(current->nsproxy->pid_ns, 0); ++ if (!current->signal->tty_old_pgrp) { ++ eprintk_ctx("failed to allocate stray tty_old_pgrp\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ } else { ++ rcu_read_lock(); ++ current->signal->tty_old_pgrp = ++ get_pid(alloc_vpid_safe(si->cpt_old_pgrp)); ++ rcu_read_unlock(); ++ if (!current->signal->tty_old_pgrp) { ++ dprintk_ctx("forward old tty PGID\n"); ++ current->signal->tty_old_pgrp = NULL; ++ } ++ } ++ } ++ ++ if (task_session_vnr(current) != si->cpt_session) { ++ struct pid * pid = NULL, *free = NULL; ++ ++ rcu_read_lock(); ++ if (si->cpt_session_type == CPT_PGRP_ORPHAN) { ++#if 0 ++ if (!is_virtual_pid(si->cpt_session)) { ++ eprintk_ctx("external process session " CPT_FID, CPT_TID(current)); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++#endif ++ pid = alloc_vpid_safe(si->cpt_session); ++ free = pid; ++ } ++ write_lock_irq(&tasklist_lock); ++ if (pid == NULL) ++ pid = find_vpid(si->cpt_session); ++ if (pid != NULL) { ++ if (task_session_nr(current) != pid_nr(pid)) { ++ detach_pid(current, PIDTYPE_SID); ++ set_task_session(current, pid_nr(pid)); ++ if (thread_group_leader(current)) { ++ attach_pid(current, PIDTYPE_SID, pid); ++ free = NULL; ++ } ++ } ++ } ++ write_unlock_irq(&tasklist_lock); ++ if (free != NULL) ++ free_pid(free); ++ rcu_read_unlock(); ++ } ++ ++ cpt_sigset_import(&current->signal->shared_pending.signal, si->cpt_sigpending); ++ current->signal->leader = si->cpt_leader; ++ if (si->cpt_ctty != CPT_NULL) { ++ cpt_object_t *obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, si->cpt_ctty, ctx); ++ if (obj) { ++ struct tty_struct *tty = obj->o_obj; ++ if (!tty->session || tty->session == ++ task_session(current)) { ++ tty->session = task_session(current); ++ current->signal->tty = tty; ++ } else { ++ wprintk_ctx("tty session mismatch\n"); ++ } ++ } ++ } ++ ++ if (si->cpt_curr_target) ++ current->signal->curr_target = find_task_by_vpid(si->cpt_curr_target); ++ current->signal->flags = 0; ++ *exiting = si->cpt_group_exit; ++ current->signal->group_exit_code = si->cpt_group_exit_code; ++ if (si->cpt_group_exit_task) { ++ current->signal->group_exit_task = find_task_by_vpid(si->cpt_group_exit_task); ++ if (current->signal->group_exit_task == NULL) { ++ eprintk_ctx("oops, group_exit_task=NULL, pid=%u\n", si->cpt_group_exit_task); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ } ++ current->signal->notify_count = si->cpt_notify_count; ++ current->signal->group_stop_count = si->cpt_group_stop_count; ++ ++ if (si->cpt_next > si->cpt_hdrlen) { ++ char *buf = kmalloc(si->cpt_next - si->cpt_hdrlen, GFP_KERNEL); ++ if (buf == NULL) { ++ cpt_release_buf(ctx); ++ return -ENOMEM; ++ } ++ err = ctx->pread(buf, si->cpt_next - si->cpt_hdrlen, ctx, ++ ti->cpt_signal + si->cpt_hdrlen); ++ if (err) { ++ kfree(buf); ++ cpt_release_buf(ctx); ++ return err; ++ } ++ restore_sigqueue(current, ++ &current->signal->shared_pending, (unsigned long)buf, ++ (unsigned long)buf + si->cpt_next - si->cpt_hdrlen); ++ kfree(buf); ++ } ++ cpt_release_buf(ctx); ++ return 0; ++} ++ ++int restore_one_sighand_struct(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_sighand_image si; ++ int i; ++ loff_t pos, endpos; ++ ++ err = rst_get_object(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, &si, ctx); ++ if (err) ++ return err; ++ ++ for (i=0; i<_NSIG; i++) { ++ current->sighand->action[i].sa.sa_handler = SIG_DFL; ++#ifndef CONFIG_IA64 ++
current->sighand->action[i].sa.sa_restorer = 0; ++#endif ++ current->sighand->action[i].sa.sa_flags = 0; ++ memset(&current->sighand->action[i].sa.sa_mask, 0, sizeof(sigset_t)); ++ } ++ ++ pos = ti->cpt_sighand + si.cpt_hdrlen; ++ endpos = ti->cpt_sighand + si.cpt_next; ++ while (pos < endpos) { ++ struct cpt_sighandler_image shi; ++ ++ err = rst_get_object(CPT_OBJ_SIGHANDLER, pos, &shi, ctx); ++ if (err) ++ return err; ++ current->sighand->action[shi.cpt_signo].sa.sa_handler = (void*)(unsigned long)shi.cpt_handler; ++#ifndef CONFIG_IA64 ++ current->sighand->action[shi.cpt_signo].sa.sa_restorer = (void*)(unsigned long)shi.cpt_restorer; ++#endif ++ current->sighand->action[shi.cpt_signo].sa.sa_flags = shi.cpt_flags; ++ cpt_sigset_import(&current->sighand->action[shi.cpt_signo].sa.sa_mask, shi.cpt_mask); ++ pos += shi.cpt_next; ++ } ++ ++ return 0; ++} ++ ++ ++__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ __u32 flag = 0; ++ ++ if (lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx)) ++ flag |= CLONE_THREAD; ++ if (ti->cpt_sighand == CPT_NULL || ++ lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx)) ++ flag |= CLONE_SIGHAND; ++ return flag; ++} ++ ++int ++rst_signal_complete(struct cpt_task_image *ti, int * exiting, cpt_context_t *ctx) ++{ ++ int err; ++ cpt_object_t *obj; ++ ++ if (ti->cpt_signal == CPT_NULL || ti->cpt_sighand == CPT_NULL) { ++ return -EINVAL; ++ } ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx); ++ if (obj) { ++ struct sighand_struct *sig = current->sighand; ++ if (obj->o_obj != sig) { ++ return -EINVAL; ++ } ++ } else { ++ obj = cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, current->sighand, ctx); ++ if (obj == NULL) ++ return -ENOMEM; ++ cpt_obj_setpos(obj, ti->cpt_sighand, ctx); ++ err = restore_one_sighand_struct(ti, ctx); ++ if (err) ++ return err; ++ } ++ ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx); ++ if (obj) { ++ struct signal_struct *sig = current->signal; ++ if (obj->o_obj != sig) { ++ return -EINVAL; ++ } ++/* if (current->signal) { ++ pid_t session; ++ ++ session = process_session(current); ++ set_process_vgroup(current, session); ++ set_signal_vsession(current->signal, session); ++ }*/ ++ } else { ++ obj = cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, current->signal, ctx); ++ if (obj == NULL) ++ return -ENOMEM; ++ cpt_obj_setpos(obj, ti->cpt_signal, ctx); ++ err = restore_one_signal_struct(ti, exiting, ctx); ++ if (err) ++ return err; ++ } ++ ++ return 0; ++} ++ ++#ifdef CONFIG_X86 ++static u32 decode_segment(u32 segid) ++{ ++ if (segid == CPT_SEG_ZERO) ++ return 0; ++ ++ /* TLS descriptors */ ++ if (segid <= CPT_SEG_TLS3) ++ return ((GDT_ENTRY_TLS_MIN + segid-CPT_SEG_TLS1)<<3) + 3; ++ ++ /* LDT descriptor, it is just an index to LDT array */ ++ if (segid >= CPT_SEG_LDT) ++ return ((segid - CPT_SEG_LDT) << 3) | 7; ++ ++ /* Check for one of standard descriptors */ ++#ifdef CONFIG_X86_64 ++ if (segid == CPT_SEG_USER32_DS) ++ return __USER32_DS; ++ if (segid == CPT_SEG_USER32_CS) ++ return __USER32_CS; ++ if (segid == CPT_SEG_USER64_DS) ++ return __USER_DS; ++ if (segid == CPT_SEG_USER64_CS) ++ return __USER_CS; ++#else ++ if (segid == CPT_SEG_USER32_DS) ++ return __USER_DS; ++ if (segid == CPT_SEG_USER32_CS) ++ return __USER_CS; ++#endif ++ wprintk("Invalid segment reg %d\n", segid); ++ return 0; ++} ++#endif ++ ++#if defined (CONFIG_IA64) ++void ia64_decrement_ip (struct pt_regs *regs) ++{ ++ unsigned long w0, ri = ia64_psr(regs)->ri - 1; ++ ++ if 
(ia64_psr(regs)->ri == 0) { ++ regs->cr_iip -= 16; ++ ri = 2; ++ get_user(w0, (char __user *) regs->cr_iip + 0); ++ if (((w0 >> 1) & 0xf) == 2) { ++ /* ++ * rfi'ing to slot 2 of an MLX bundle causes ++ * an illegal operation fault. We don't want ++ * that to happen... ++ */ ++ ri = 1; ++ } ++ } ++ ia64_psr(regs)->ri = ri; ++} ++#endif ++ ++static void rst_child_tid(unsigned long *child_tids) ++{ ++ dprintk("rct: " CPT_FID "\n", CPT_TID(current)); ++ current->clear_child_tid = (void*)child_tids[0]; ++ current->set_child_tid = (void*)child_tids[1]; ++} ++ ++static void rst_last_siginfo(void) ++{ ++ int signr; ++ siginfo_t *info = current->last_siginfo; ++ struct pt_regs *regs = task_pt_regs(current); ++ struct k_sigaction *ka; ++ int ptrace_id; ++ ++ dprintk("rlsi: " CPT_FID "\n", CPT_TID(current)); ++ ++ spin_lock_irq(&current->sighand->siglock); ++ current->last_siginfo = NULL; ++ recalc_sigpending(); ++ ++ ptrace_id = current->pn_state; ++ clear_pn_state(current); ++ ++ switch (ptrace_id) { ++ case PN_STOP_TF: ++ case PN_STOP_TF_RT: ++ /* frame_*signal */ ++ dprintk("SIGTRAP %u/%u(%s) %u/%u %u %ld %u %lu\n", ++ task_pid_vnr(current), current->pid, current->comm, ++ info->si_signo, info->si_code, ++ current->exit_code, SYSCALL_NR(regs), ++ current->ptrace, current->ptrace_message); ++ goto out; ++ case PN_STOP_ENTRY: ++ case PN_STOP_LEAVE: ++ /* do_syscall_trace */ ++ spin_unlock_irq(&current->sighand->siglock); ++ dprintk("ptrace do_syscall_trace: %d %d\n", ptrace_id, current->exit_code); ++ if (current->exit_code) { ++ send_sig(current->exit_code, current, 1); ++ current->exit_code = 0; ++ } ++ if (IN_SYSCALL(regs)) { ++ if (ptrace_id == PN_STOP_ENTRY ++#ifdef CONFIG_X86 ++ && SYSCALL_ERRNO(regs) == ENOSYS ++#endif ++ ) ++ SYSCALL_RESTART(regs); ++ else if (IN_ERROR(regs) && ++ syscall_is(current, regs, rt_sigtimedwait) && ++ (SYSCALL_ERRNO(regs) == EAGAIN || ++ SYSCALL_ERRNO(regs) == EINTR)) ++ SYSCALL_RESTART(regs); ++ } ++ return; ++ case PN_STOP_FORK: ++ /* fork */ ++ SYSCALL_SETRET(regs, current->ptrace_message); ++ dprintk("ptrace fork returns pid %ld\n", SYSCALL_RETVAL(regs)); ++ goto out; ++ case PN_STOP_VFORK: ++ /* after vfork */ ++ SYSCALL_SETRET(regs, current->ptrace_message); ++ dprintk("ptrace after vfork returns pid %ld\n", SYSCALL_RETVAL(regs)); ++ goto out; ++ case PN_STOP_SIGNAL: ++ /* normal case : dequeue signal */ ++ break; ++ case PN_STOP_EXIT: ++ dprintk("ptrace exit caught\n"); ++ current->ptrace &= ~PT_TRACE_EXIT; ++ spin_unlock_irq(&current->sighand->siglock); ++ module_put(THIS_MODULE); ++ complete_and_exit(NULL, current->ptrace_message); ++ BUG(); ++ case PN_STOP_EXEC: ++ eprintk("ptrace after exec caught: must not happen\n"); ++ BUG(); ++ default: ++ eprintk("ptrace with unknown identity %d\n", ptrace_id); ++ BUG(); ++ } ++ ++ signr = current->exit_code; ++ if (signr == 0) { ++ dprintk("rlsi: canceled signal %d\n", info->si_signo); ++ goto out; ++ } ++ current->exit_code = 0; ++ ++ if (signr != info->si_signo) { ++ info->si_signo = signr; ++ info->si_errno = 0; ++ info->si_code = SI_USER; ++ info->si_pid = task_pid_vnr(current->parent); ++ info->si_uid = current->parent->uid; ++ } ++ ++ /* If the (new) signal is now blocked, requeue it. 
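++ * (Requeueing goes through send_sig_info() below, i.e. the normal
++ * delivery path, so the signal is delivered once the task unblocks
++ * it, just as if it had arrived after the restore.)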
*/ ++ if (sigismember(&current->blocked, signr)) { ++ dprintk("going to requeue signal %d\n", signr); ++ goto out_resend_sig; ++ } ++ ++ ka = &current->sighand->action[signr-1]; ++ if (ka->sa.sa_handler == SIG_IGN) { ++ dprintk("going to resend signal %d (ignored)\n", signr); ++ goto out; ++ } ++ if (ka->sa.sa_handler != SIG_DFL) { ++ dprintk("going to resend signal %d (not SIG_DFL)\n", signr); ++ goto out_resend_sig; ++ } ++ if (signr == SIGCONT || ++ signr == SIGCHLD || ++ signr == SIGWINCH || ++ signr == SIGURG || ++ current->pid == 1) ++ goto out; ++ ++ /* All the rest, which we cannot handle are requeued. */ ++ dprintk("going to resend signal %d (sigh)\n", signr); ++out_resend_sig: ++ spin_unlock_irq(&current->sighand->siglock); ++ send_sig_info(signr, info, current); ++ return; ++ ++out: ++ spin_unlock_irq(&current->sighand->siglock); ++} ++ ++static void rst_finish_stop(void) ++{ ++ /* ... ++ * do_signal() -> ++ * get_signal_to_deliver() -> ++ * do_signal_stop() -> ++ * finish_stop() ++ * ++ * Normally after SIGCONT it will dequeue the next signal. If no signal ++ * is found, do_signal restarts syscall unconditionally. ++ * Otherwise signal handler is pushed on user stack. ++ */ ++ ++ dprintk("rfs: " CPT_FID "\n", CPT_TID(current)); ++ ++ clear_stop_state(current); ++ current->exit_code = 0; ++} ++ ++static void rst_restart_sys(void) ++{ ++ struct pt_regs *regs = task_pt_regs(current); ++ ++ /* This hook is supposed to be executed, when we have ++ * to complete some interrupted syscall. ++ */ ++ dprintk("rrs: " CPT_FID "\n", CPT_TID(current)); ++ ++ if (!IN_SYSCALL(regs) || !IN_ERROR(regs)) ++ return; ++ ++#ifdef __NR_pause ++ if (syscall_is(current,regs,pause)) { ++ if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) { ++ current->state = TASK_INTERRUPTIBLE; ++ schedule(); ++ } ++ } else ++#else ++ /* On this arch pause() is simulated with sigsuspend(). 
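++ * (Presumably ia64, the one supported arch without __NR_pause: an
++ * interrupted pause then shows up as an ERESTARTNOHAND rt_sigsuspend
++ * and is completed the same way, by going back to interruptible
++ * sleep.)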
*/ ++ if (syscall_is(current,regs,rt_sigsuspend)) { ++ if (SYSCALL_ERRNO(regs) == ERESTARTNOHAND) { ++ current->state = TASK_INTERRUPTIBLE; ++ schedule(); ++ } ++ } else ++#endif ++ if (syscall_is(current,regs,rt_sigtimedwait)) { ++ if (SYSCALL_ERRNO(regs) == EAGAIN || ++ SYSCALL_ERRNO(regs) == EINTR) { ++ SYSCALL_RESTART(regs); ++ } ++ } else if (syscall_is(current,regs,futex)) { ++ if (SYSCALL_ERRNO(regs) == EINTR && ++ !signal_pending(current)) { ++ SYSCALL_RESTART(regs); ++ } ++ } ++ ++ if (!signal_pending(current) && ++ !(current_thread_info()->status & TS_RESTORE_SIGMASK)) { ++ if (SYSCALL_ERRNO(regs) == ERESTARTSYS || ++ SYSCALL_ERRNO(regs) == ERESTARTNOINTR || ++ SYSCALL_ERRNO(regs) == ERESTARTNOHAND) { ++ SYSCALL_RESTART(regs); ++ } else if (SYSCALL_ERRNO(regs) == ERESTART_RESTARTBLOCK) { ++ int new = __NR_restart_syscall; ++#ifdef CONFIG_X86_64 ++ if (task_thread_info(current)->flags&_TIF_IA32) ++ new = __NR32_restart_syscall; ++#endif ++ SYSCALL_RESTART2(regs, new); ++ } ++ } ++} ++ ++#ifdef CONFIG_X86_32 ++ ++static int restore_registers(struct task_struct *tsk, struct pt_regs *regs, ++ struct cpt_task_image *ti, struct cpt_x86_regs *b, ++ struct resume_info **rip, struct cpt_context *ctx) ++{ ++ extern char i386_ret_from_resume; ++ ++ if (b->cpt_object != CPT_OBJ_X86_REGS) ++ return -EINVAL; ++ ++ tsk->thread.sp = (unsigned long) regs; ++ tsk->thread.sp0 = (unsigned long) (regs+1); ++ tsk->thread.ip = (unsigned long) &i386_ret_from_resume; ++ ++ tsk->thread.gs = decode_segment(b->cpt_gs); ++ tsk->thread.debugreg0 = b->cpt_debugreg[0]; ++ tsk->thread.debugreg1 = b->cpt_debugreg[1]; ++ tsk->thread.debugreg2 = b->cpt_debugreg[2]; ++ tsk->thread.debugreg3 = b->cpt_debugreg[3]; ++ tsk->thread.debugreg6 = b->cpt_debugreg[6]; ++ tsk->thread.debugreg7 = b->cpt_debugreg[7]; ++ ++ regs->bx = b->cpt_ebx; ++ regs->cx = b->cpt_ecx; ++ regs->dx = b->cpt_edx; ++ regs->si = b->cpt_esi; ++ regs->di = b->cpt_edi; ++ regs->bp = b->cpt_ebp; ++ regs->ax = b->cpt_eax; ++ regs->ds = b->cpt_xds; ++ regs->es = b->cpt_xes; ++ regs->orig_ax = b->cpt_orig_eax; ++ regs->ip = b->cpt_eip; ++ regs->cs = b->cpt_xcs; ++ regs->flags = b->cpt_eflags; ++ regs->sp = b->cpt_esp; ++ regs->ss = b->cpt_xss; ++ ++ regs->cs = decode_segment(b->cpt_xcs); ++ regs->ss = decode_segment(b->cpt_xss); ++ regs->ds = decode_segment(b->cpt_xds); ++ regs->es = decode_segment(b->cpt_xes); ++ regs->fs = decode_segment(b->cpt_fs); ++ ++ tsk->thread.sp -= HOOK_RESERVE; ++ memset((void*)tsk->thread.sp, 0, HOOK_RESERVE); ++ *rip = (void*)tsk->thread.sp; ++ ++ return 0; ++} ++ ++#elif defined(CONFIG_X86_64) ++ ++static void xlate_ptregs_32_to_64(struct pt_regs *d, struct cpt_x86_regs *s) ++{ ++ memset(d, 0, sizeof(struct pt_regs)); ++ d->bp = s->cpt_ebp; ++ d->bx = s->cpt_ebx; ++ d->ax = (s32)s->cpt_eax; ++ d->cx = s->cpt_ecx; ++ d->dx = s->cpt_edx; ++ d->si = s->cpt_esi; ++ d->di = s->cpt_edi; ++ d->orig_ax = (s32)s->cpt_orig_eax; ++ d->ip = s->cpt_eip; ++ d->cs = s->cpt_xcs; ++ d->flags = s->cpt_eflags; ++ d->sp = s->cpt_esp; ++ d->ss = s->cpt_xss; ++} ++ ++static int restore_registers(struct task_struct *tsk, struct pt_regs *regs, ++ struct cpt_task_image *ti, struct cpt_obj_bits *hdr, ++ struct resume_info **rip, struct cpt_context *ctx) ++{ ++ if (hdr->cpt_object == CPT_OBJ_X86_64_REGS) { ++ struct cpt_x86_64_regs *b = (void*)hdr; ++ ++ tsk->thread.sp = (unsigned long) regs; ++ tsk->thread.sp0 = (unsigned long) (regs+1); ++ ++ tsk->thread.fs = b->cpt_fsbase; ++ tsk->thread.gs = b->cpt_gsbase; ++ tsk->thread.fsindex = 
decode_segment(b->cpt_fsindex); ++ tsk->thread.gsindex = decode_segment(b->cpt_gsindex); ++ tsk->thread.ds = decode_segment(b->cpt_ds); ++ tsk->thread.es = decode_segment(b->cpt_es); ++ tsk->thread.debugreg0 = b->cpt_debugreg[0]; ++ tsk->thread.debugreg1 = b->cpt_debugreg[1]; ++ tsk->thread.debugreg2 = b->cpt_debugreg[2]; ++ tsk->thread.debugreg3 = b->cpt_debugreg[3]; ++ tsk->thread.debugreg6 = b->cpt_debugreg[6]; ++ tsk->thread.debugreg7 = b->cpt_debugreg[7]; ++ ++ memcpy(regs, &b->cpt_r15, sizeof(struct pt_regs)); ++ ++ tsk->thread.usersp = regs->sp; ++ regs->cs = decode_segment(b->cpt_cs); ++ regs->ss = decode_segment(b->cpt_ss); ++ } else if (hdr->cpt_object == CPT_OBJ_X86_REGS) { ++ struct cpt_x86_regs *b = (void*)hdr; ++ ++ tsk->thread.sp = (unsigned long) regs; ++ tsk->thread.sp0 = (unsigned long) (regs+1); ++ ++ tsk->thread.fs = 0; ++ tsk->thread.gs = 0; ++ tsk->thread.fsindex = decode_segment(b->cpt_fs); ++ tsk->thread.gsindex = decode_segment(b->cpt_gs); ++ tsk->thread.debugreg0 = b->cpt_debugreg[0]; ++ tsk->thread.debugreg1 = b->cpt_debugreg[1]; ++ tsk->thread.debugreg2 = b->cpt_debugreg[2]; ++ tsk->thread.debugreg3 = b->cpt_debugreg[3]; ++ tsk->thread.debugreg6 = b->cpt_debugreg[6]; ++ tsk->thread.debugreg7 = b->cpt_debugreg[7]; ++ ++ xlate_ptregs_32_to_64(regs, b); ++ ++ tsk->thread.usersp = regs->sp; ++ regs->cs = decode_segment(b->cpt_xcs); ++ regs->ss = decode_segment(b->cpt_xss); ++ tsk->thread.ds = decode_segment(b->cpt_xds); ++ tsk->thread.es = decode_segment(b->cpt_xes); ++ } else { ++ return -EINVAL; ++ } ++ ++ tsk->thread.sp -= HOOK_RESERVE; ++ memset((void*)tsk->thread.sp, 0, HOOK_RESERVE); ++ *rip = (void*)tsk->thread.sp; ++ return 0; ++} ++ ++#elif defined(CONFIG_IA64) ++ ++#define MASK(nbits) ((1UL << (nbits)) - 1) /* mask with NBITS bits set */ ++ ++#define PUT_BITS(first, last, nat) \ ++ ({ \ ++ unsigned long bit = ia64_unat_pos(&pt->r##first); \ ++ unsigned long nbits = (last - first + 1); \ ++ unsigned long mask = MASK(nbits) << first; \ ++ long dist; \ ++ if (bit < first) \ ++ dist = 64 + bit - first; \ ++ else \ ++ dist = bit - first; \ ++ ia64_rotl(nat & mask, dist); \ ++ }) ++ ++unsigned long ++ia64_put_scratch_nat_bits (struct pt_regs *pt, unsigned long nat) ++{ ++ unsigned long scratch_unat; ++ ++ /* ++ * Registers that are stored consecutively in struct pt_regs ++ * can be handled in parallel. If the register order in ++ * struct_pt_regs changes, this code MUST be updated. 
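++ * (In effect, each PUT_BITS(first, last, nat) picks NaT bits
++ * first..last out of the saved nat word and rotates them to the bit
++ * position that ia64_unat_pos() reports for r<first> in pt_regs,
++ * which is why the calls below need not be in register order.)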
++ */ ++ scratch_unat = PUT_BITS( 1, 1, nat); ++ scratch_unat |= PUT_BITS( 2, 3, nat); ++ scratch_unat |= PUT_BITS(12, 13, nat); ++ scratch_unat |= PUT_BITS(14, 14, nat); ++ scratch_unat |= PUT_BITS(15, 15, nat); ++ scratch_unat |= PUT_BITS( 8, 11, nat); ++ scratch_unat |= PUT_BITS(16, 31, nat); ++ ++ return scratch_unat; ++ ++} ++ ++static unsigned long ++ia64_put_saved_nat_bits (struct switch_stack *pt, unsigned long nat) ++{ ++ unsigned long scratch_unat; ++ ++ scratch_unat = PUT_BITS( 4, 7, nat); ++ ++ return scratch_unat; ++ ++} ++ ++#undef PUT_BITS ++ ++ ++static int restore_registers(struct task_struct *tsk, struct pt_regs *pt, ++ struct cpt_task_image *ti, ++ struct cpt_ia64_regs *r, ++ struct resume_info **rip, ++ struct cpt_context *ctx) ++{ ++ extern char ia64_ret_from_resume; ++ struct switch_stack *sw; ++ struct resume_info *ri; ++ struct ia64_psr *psr = ia64_psr(pt); ++ void *krbs = (void *)tsk + IA64_RBS_OFFSET; ++ unsigned long reg; ++ ++ if (r->cpt_object != CPT_OBJ_IA64_REGS) ++ return -EINVAL; ++ ++ if (r->num_regs > 96) { ++ eprintk(CPT_FID " too much RSE regs %lu\n", ++ CPT_TID(tsk), r->num_regs); ++ return -EINVAL; ++ } ++ ++ *rip = ri = ((void*)pt) - HOOK_RESERVE; ++ sw = ((struct switch_stack *) ri) - 1; ++ ++ memmove(sw, (void*)tsk->thread.ksp + 16, sizeof(struct switch_stack)); ++ memset(ri, 0, HOOK_RESERVE); ++ ++ /* gr 1,2-3,8-11,12-13,14,15,16-31 are on pt_regs */ ++ memcpy(&pt->r1, &r->gr[1], 8*(2-1)); ++ memcpy(&pt->r2, &r->gr[2], 8*(4-2)); ++ memcpy(&pt->r8, &r->gr[8], 8*(12-8)); ++ memcpy(&pt->r12, &r->gr[12], 8*(14-12)); ++ memcpy(&pt->r14, &r->gr[14], 8*(15-14)); ++ memcpy(&pt->r15, &r->gr[15], 8*(16-15)); ++ memcpy(&pt->r16, &r->gr[16], 8*(32-16)); ++ ++ pt->b0 = r->br[0]; ++ pt->b6 = r->br[6]; ++ pt->b7 = r->br[7]; ++ ++ pt->ar_bspstore = r->ar_bspstore; ++ pt->ar_unat = r->ar_unat; ++ pt->ar_pfs = r->ar_pfs; ++ pt->ar_ccv = r->ar_ccv; ++ pt->ar_fpsr = r->ar_fpsr; ++ pt->ar_csd = r->ar_csd; ++ pt->ar_ssd = r->ar_ssd; ++ pt->ar_rsc = r->ar_rsc; ++ ++ pt->cr_iip = r->cr_iip; ++ pt->cr_ipsr = r->cr_ipsr; ++ ++ pt->pr = r->pr; ++ ++ pt->cr_ifs = r->cfm; ++ ++ /* fpregs 6..9,10..11 are in pt_regs */ ++ memcpy(&pt->f6, &r->fr[2*6], 16*(10-6)); ++ memcpy(&pt->f10, &r->fr[2*10], 16*(12-10)); ++ /* fpreg 12..15 are on switch stack */ ++ memcpy(&sw->f12, &r->fr[2*12], 16*(16-12)); ++ /* fpregs 32...127 */ ++ tsk->thread.flags |= IA64_THREAD_FPH_VALID; ++ memcpy(tsk->thread.fph, &r->fr[32*2], 16*(128-32)); ++ ia64_drop_fpu(tsk); ++ psr->dfh = 1; ++ ++ memcpy(&sw->r4, &r->gr[4], 8*(8-4)); ++ memcpy(&sw->b1, &r->br[1], 8*(6-1)); ++ sw->ar_lc = r->ar_lc; ++ ++ memcpy(&sw->f2, &r->fr[2*2], 16*(6-2)); ++ memcpy(&sw->f16, &r->fr[2*16], 16*(32-16)); ++ ++ sw->caller_unat = 0; ++ sw->ar_fpsr = pt->ar_fpsr; ++ sw->ar_unat = 0; ++ if (r->nat[0] & 0xFFFFFF0FUL) ++ sw->caller_unat = ia64_put_scratch_nat_bits(pt, r->nat[0]); ++ if (r->nat[0] & 0xF0) ++ sw->ar_unat = ia64_put_saved_nat_bits(sw, r->nat[0]); ++ ++ sw->ar_bspstore = (unsigned long)ia64_rse_skip_regs(krbs, r->num_regs); ++ memset(krbs, 0, (void*)sw->ar_bspstore - krbs); ++ sw->ar_rnat = 0; ++ sw->ar_pfs = 0; ++ ++ /* This is tricky. When we are in syscall, we have frame ++ * of output register (sometimes, plus one input reg sometimes). ++ * It is not so easy to restore such frame, RSE optimizes ++ * and does not fetch those regs from backstore. So, we restore ++ * the whole frame as local registers, and then repartition it ++ * in ia64_ret_from_resume(). 
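++ * (Concretely: ar.pfs is forged below as out | (out << 7), i.e. the
++ * frame and its locals are both declared to be the old output area,
++ * so the br.ret in ia64_ret_from_resume shrinks the frame back to
++ * exactly the registers the interrupted syscall had as outputs.)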
++ */ ++ if ((long)pt->cr_ifs >= 0) { ++ unsigned long out = (r->cfm&0x7F) - ((r->cfm>>7)&0x7F); ++ sw->ar_pfs = out | (out<<7); ++ } ++ if (r->ar_ec) ++ sw->ar_pfs |= (r->ar_ec & 0x3F) << 52; ++ ++ for (reg = 0; reg < r->num_regs; reg++) { ++ unsigned long *ptr = ia64_rse_skip_regs(krbs, reg); ++ unsigned long *rnatp; ++ unsigned long set_rnat = 0; ++ ++ *ptr = r->gr[32+reg]; ++ ++ if (reg < 32) ++ set_rnat = (r->nat[0] & (1UL<<(reg+32))); ++ else ++ set_rnat = (r->nat[1] & (1UL<<(reg-32))); ++ ++ if (set_rnat) { ++ rnatp = ia64_rse_rnat_addr(ptr); ++ if ((unsigned long)rnatp >= sw->ar_bspstore) ++ rnatp = &sw->ar_rnat; ++ *rnatp |= (1UL<<ia64_rse_slot_num(ptr)); ++ } ++ } ++ ++ pt->b0 = (unsigned long) &ia64_ret_from_resume; ++ tsk->thread.ksp = (unsigned long) sw - 16; ++ ++#define PRED_LEAVE_SYSCALL 1 /* TRUE iff leave from syscall */ ++#define PRED_KERNEL_STACK 2 /* returning to kernel-stacks? */ ++#define PRED_USER_STACK 3 /* returning to user-stacks? */ ++#define PRED_SYSCALL 4 /* inside a system call? */ ++#define PRED_NON_SYSCALL 5 /* complement of PRED_SYSCALL */ ++ ++ pt->loadrs = r->loadrs; ++ sw->pr = 0; ++ sw->pr &= ~(1UL << PRED_LEAVE_SYSCALL); ++ sw->pr &= ~((1UL << PRED_SYSCALL) | (1UL << PRED_NON_SYSCALL)); ++ sw->pr &= ~(1UL << PRED_KERNEL_STACK); ++ sw->pr |= (1UL << PRED_USER_STACK); ++ if ((long)pt->cr_ifs < 0) { ++ sw->pr |= (1UL << PRED_NON_SYSCALL); ++ } else { ++ sw->pr |= ((1UL << PRED_SYSCALL) | (1UL << PRED_LEAVE_SYSCALL)); ++ } ++ ++ return 0; ++} ++#endif ++ ++asmlinkage void rst_resume_work(struct resume_info *ri) ++{ ++ if (ri->hooks & (1<<HOOK_TID)) ++ rst_child_tid(ri->tid_ptrs); ++ if (ri->hooks & (1<<HOOK_CONT)) ++ rst_finish_stop(); ++ if (ri->hooks & (1<<HOOK_LSI)) ++ rst_last_siginfo(); ++ if (ri->hooks & (1<<HOOK_RESTART)) ++ rst_restart_sys(); ++ module_put(THIS_MODULE); ++} ++ ++static void rst_apply_mxcsr_mask(struct task_struct *tsk) ++{ ++#ifdef CONFIG_X86_32 ++ /* if the cpu lacks SSE2, the DAZ bit must not be set in MXCSR */ ++ if (!(test_cpu_caps() & (1 << CPT_CPU_X86_SSE2))) ++ tsk->thread.xstate->fxsave.mxcsr &= 0x0000ffbf; ++#endif ++} ++ ++int rst_restore_process(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ struct task_struct *tsk = obj->o_obj; ++ struct cpt_task_image *ti = obj->o_image; ++ struct pt_regs * regs; ++ struct cpt_object_hdr *b; ++ struct cpt_siginfo_image *lsi = NULL; ++ struct group_info *gids, *ogids; ++ struct resume_info *ri = NULL; ++ int i; ++ int err = 0; ++#ifdef CONFIG_BEANCOUNTERS ++ struct task_beancounter *tbc; ++ struct user_beancounter *new_bc, *old_bc; ++#endif ++ ++ if (tsk == NULL) { ++ eprintk_ctx("oops, task %d/%s is missing\n", ti->cpt_pid, ti->cpt_comm); ++ return -EFAULT; ++ } ++ ++ wait_task_inactive(tsk); ++#ifdef CONFIG_BEANCOUNTERS ++ tbc = &tsk->task_bc; ++ new_bc = rst_lookup_ubc(ti->cpt_exec_ub, ctx); ++ err = virtinfo_notifier_call(VITYPE_SCP, ++ VIRTINFO_SCP_RSTTSK, new_bc); ++ if (err & NOTIFY_FAIL) { ++ put_beancounter(new_bc); ++ return -ECHRNG; ++ } ++ old_bc = tbc->exec_ub; ++ if ((err & VIRTNOTIFY_CHANGE) && old_bc != new_bc) { ++ dprintk(" *** replacing ub %p by %p for %p (%d %s)\n", ++ old_bc, new_bc, tsk, ++ tsk->pid, tsk->comm); ++ tbc->exec_ub = new_bc; ++ new_bc = old_bc; ++ } ++ put_beancounter(new_bc); ++#endif ++ regs = task_pt_regs(tsk); ++ ++ if (!tsk->exit_state) { ++ tsk->lock_depth = -1; ++#ifdef CONFIG_PREEMPT ++ task_thread_info(tsk)->preempt_count--; ++#endif ++ } ++ ++ if (tsk->static_prio != ti->cpt_static_prio) ++ set_user_nice(tsk, PRIO_TO_NICE((s32)ti->cpt_static_prio)); ++ ++ cpt_sigset_import(&tsk->blocked, ti->cpt_sigblocked); ++ cpt_sigset_import(&tsk->real_blocked, ti->cpt_sigrblocked); ++ cpt_sigset_import(&tsk->saved_sigmask, ti->cpt_sigsuspend_blocked); ++ cpt_sigset_import(&tsk->pending.signal, ti->cpt_sigpending); ++ ++ tsk->uid = ti->cpt_uid; ++ tsk->euid = ti->cpt_euid; ++ tsk->suid = ti->cpt_suid; ++ tsk->fsuid = ti->cpt_fsuid; ++ tsk->gid = 
ti->cpt_gid; ++ tsk->egid = ti->cpt_egid; ++ tsk->sgid = ti->cpt_sgid; ++ tsk->fsgid = ti->cpt_fsgid; ++#ifdef CONFIG_IA64 ++ SET_UNALIGN_CTL(tsk, ti->cpt_prctl_uac); ++ SET_FPEMU_CTL(tsk, ti->cpt_prctl_fpemu); ++#endif ++ memcpy(&tsk->cap_effective, &ti->cpt_ecap, sizeof(tsk->cap_effective)); ++ memcpy(&tsk->cap_inheritable, &ti->cpt_icap, sizeof(tsk->cap_inheritable)); ++ memcpy(&tsk->cap_permitted, &ti->cpt_pcap, sizeof(tsk->cap_permitted)); ++ if (ctx->image_version < CPT_VERSION_26) ++ tsk->securebits = (ti->cpt_keepcap != 0) ? ++ issecure_mask(SECURE_KEEP_CAPS) : 0; ++ else ++ tsk->securebits = ti->cpt_keepcap; ++ tsk->did_exec = (ti->cpt_did_exec != 0); ++ gids = groups_alloc(ti->cpt_ngids); ++ ogids = tsk->group_info; ++ if (gids) { ++ int i; ++ for (i=0; i<32; i++) ++ gids->small_block[i] = ti->cpt_gids[i]; ++ tsk->group_info = gids; ++ } ++ if (ogids) ++ put_group_info(ogids); ++ tsk->utime = ti->cpt_utime; ++ tsk->stime = ti->cpt_stime; ++ if (ctx->image_version == CPT_VERSION_8) ++ tsk->start_time = _ns_to_timespec(ti->cpt_starttime*TICK_NSEC); ++ else ++ cpt_timespec_import(&tsk->start_time, ti->cpt_starttime); ++ _set_normalized_timespec(&tsk->start_time, ++ tsk->start_time.tv_sec + ++ VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_sec, ++ tsk->start_time.tv_nsec + ++ VE_TASK_INFO(tsk)->owner_env->start_timespec.tv_nsec); ++ ++ tsk->nvcsw = ti->cpt_nvcsw; ++ tsk->nivcsw = ti->cpt_nivcsw; ++ tsk->min_flt = ti->cpt_min_flt; ++ tsk->maj_flt = ti->cpt_maj_flt; ++ ++#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8) ++ tsk->cutime = ti->cpt_cutime; ++ tsk->cstime = ti->cpt_cstime; ++ tsk->cnvcsw = ti->cpt_cnvcsw; ++ tsk->cnivcsw = ti->cpt_cnivcsw; ++ tsk->cmin_flt = ti->cpt_cmin_flt; ++ tsk->cmaj_flt = ti->cpt_cmaj_flt; ++ ++ BUILD_BUG_ON(RLIM_NLIMITS > CPT_RLIM_NLIMITS); ++ ++ for (i=0; i<RLIM_NLIMITS; i++) { ++ tsk->rlim[i].rlim_cur = ti->cpt_rlim_cur[i]; ++ tsk->rlim[i].rlim_max = ti->cpt_rlim_max[i]; ++ } ++#else ++ if (thread_group_leader(tsk) && tsk->signal) { ++ tsk->signal->utime = ti->cpt_utime; ++ tsk->signal->stime = ti->cpt_stime; ++ tsk->signal->cutime = ti->cpt_cutime; ++ tsk->signal->cstime = ti->cpt_cstime; ++ tsk->signal->nvcsw = ti->cpt_nvcsw; ++ tsk->signal->nivcsw = ti->cpt_nivcsw; ++ tsk->signal->cnvcsw = ti->cpt_cnvcsw; ++ tsk->signal->cnivcsw = ti->cpt_cnivcsw; ++ tsk->signal->min_flt = ti->cpt_min_flt; ++ tsk->signal->maj_flt = ti->cpt_maj_flt; ++ tsk->signal->cmin_flt = ti->cpt_cmin_flt; ++ tsk->signal->cmaj_flt = ti->cpt_cmaj_flt; ++ ++ for (i=0; i<RLIM_NLIMITS; i++) { ++ tsk->signal->rlim[i].rlim_cur = ti->cpt_rlim_cur[i]; ++ tsk->signal->rlim[i].rlim_max = ti->cpt_rlim_max[i]; ++ } ++ } ++#endif ++ ++#ifdef CONFIG_X86 ++ for (i=0; i<3; i++) { ++ if (i >= GDT_ENTRY_TLS_ENTRIES) { ++ eprintk_ctx("too many tls descs\n"); ++ } else { ++ tsk->thread.tls_array[i].a = ti->cpt_tls[i]&0xFFFFFFFF; ++ tsk->thread.tls_array[i].b = ti->cpt_tls[i]>>32; ++ } ++ } ++#endif ++ ++ clear_stopped_child_used_math(tsk); ++ ++ b = (void *)(ti+1); ++ while ((void*)b < ((void*)ti) + ti->cpt_next) { ++ /* Siginfo objects are at the end of obj array */ ++ if (b->cpt_object == CPT_OBJ_SIGINFO) { ++ struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env); ++ restore_sigqueue(tsk, &tsk->pending, (unsigned long)b, (unsigned long)ti + ti->cpt_next); ++ set_exec_env(env); ++ break; ++ } ++ ++ switch (b->cpt_object) { ++#ifdef CONFIG_X86 ++ case CPT_OBJ_BITS: ++ if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE && ++ cpu_has_fxsr) { ++ memcpy(&tsk->thread.xstate, ++ (void*)b + b->cpt_hdrlen, ++ sizeof(struct i387_fxsave_struct)); ++ 
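++ /* Sanitise the restored MXCSR: bits the target CPU does not
++  * implement must not be set in the fxsave image, otherwise the
++  * first FXRSTOR for this task would raise #GP. */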
rst_apply_mxcsr_mask(tsk); ++ if (ti->cpt_used_math) ++ set_stopped_child_used_math(tsk); ++ } ++#ifndef CONFIG_X86_64 ++ else if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE_OLD && ++ !cpu_has_fxsr) { ++ memcpy(&tsk->thread.xstate, ++ (void*)b + b->cpt_hdrlen, ++ sizeof(struct i387_fsave_struct)); ++ if (ti->cpt_used_math) ++ set_stopped_child_used_math(tsk); ++ } ++#endif ++ break; ++#endif ++ case CPT_OBJ_LASTSIGINFO: ++ lsi = (void*)b; ++ break; ++ case CPT_OBJ_X86_REGS: ++ case CPT_OBJ_X86_64_REGS: ++ case CPT_OBJ_IA64_REGS: ++ if (restore_registers(tsk, regs, ti, (void*)b, &ri, ctx)) { ++ eprintk_ctx("cannot restore registers: image is corrupted\n"); ++ return -EINVAL; ++ } ++ break; ++ case CPT_OBJ_SIGALTSTACK: { ++ struct cpt_sigaltstack_image *sas; ++ sas = (struct cpt_sigaltstack_image *)b; ++ tsk->sas_ss_sp = sas->cpt_stack; ++ tsk->sas_ss_size = sas->cpt_stacksize; ++ break; ++ } ++ case CPT_OBJ_TASK_AUX: { ++ struct cpt_task_aux_image *ai; ++ ai = (struct cpt_task_aux_image *)b; ++ tsk->robust_list = cpt_ptr_import(ai->cpt_robust_list); ++#ifdef CONFIG_X86_64 ++#ifdef CONFIG_COMPAT ++ if (task_thread_info(tsk)->flags&_TIF_IA32) { ++ tsk->robust_list = (void __user *)NULL; ++ tsk->compat_robust_list = cpt_ptr_import(ai->cpt_robust_list); ++ } ++#endif ++#endif ++ break; ++ } ++ } ++ b = ((void*)b) + b->cpt_next; ++ } ++ ++ if (ri == NULL && !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { ++ eprintk_ctx("missing register info\n"); ++ return -EINVAL; ++ } ++ ++ if (ti->cpt_ppid != ti->cpt_rppid) { ++ struct task_struct *parent; ++ struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env); ++ write_lock_irq(&tasklist_lock); ++ parent = find_task_by_vpid(ti->cpt_ppid); ++ if (parent && parent != tsk->parent) { ++ list_add(&tsk->ptrace_list, &tsk->parent->ptrace_children); ++ remove_parent(tsk); ++ tsk->parent = parent; ++ add_parent(tsk); ++ } ++ write_unlock_irq(&tasklist_lock); ++ set_exec_env(env); ++ } ++ ++ tsk->ptrace_message = ti->cpt_ptrace_message; ++ tsk->pn_state = ti->cpt_pn_state; ++ tsk->stopped_state = ti->cpt_stopped_state; ++ task_thread_info(tsk)->flags = ti->cpt_thrflags; ++ ++ /* The image was created with kernel < 2.6.16, while ++ * task hanged in sigsuspend -> do_signal. ++ * ++ * FIXME! This needs more brain efforts... 
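++ * (set_restore_sigmask() below at least routes this through the
++ * modern TS_RESTORE_SIGMASK path, so tsk->saved_sigmask - imported
++ * above from cpt_sigsuspend_blocked - is reinstated on the way out
++ * of the syscall.)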
++ */ ++ if (ti->cpt_sigsuspend_state) { ++ set_restore_sigmask(); ++ } ++ ++#ifdef CONFIG_X86_64 ++ task_thread_info(tsk)->flags |= _TIF_FORK | _TIF_RESUME; ++ if (!ti->cpt_64bit) ++ task_thread_info(tsk)->flags |= _TIF_IA32; ++#endif ++ ++#ifdef CONFIG_X86_32 ++ do { ++ if (regs->orig_ax == __NR__newselect && regs->di) { ++ struct timeval tv; ++ if (access_process_vm(tsk, regs->di, &tv, ++ sizeof(tv), 0) != sizeof(tv)) { ++ wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm: edi %ld\n", ++ task_pid_vnr(tsk), tsk->pid, tsk->comm, ++ regs->di); ++ break; ++ } ++ dprintk_ctx("task %d/%d(%s): Old timeval in newselect: %ld.%ld\n", ++ task_pid_vnr(tsk), tsk->pid, tsk->comm, ++ tv.tv_sec, tv.tv_usec); ++ tv.tv_sec -= ctx->delta_time.tv_sec; ++ if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) { ++ tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000; ++ tv.tv_sec--; ++ } else { ++ tv.tv_usec -= ctx->delta_time.tv_nsec / 1000; ++ } ++ if (tv.tv_sec < 0) { ++ tv.tv_sec = 0; ++ tv.tv_usec = 0; ++ } ++ dprintk_ctx("task %d/%d(%s): New timeval in newselect: %ld.%ld\n", ++ task_pid_vnr(tsk), tsk->pid, tsk->comm, ++ tv.tv_sec, tv.tv_usec); ++ if (access_process_vm(tsk, regs->di, &tv, ++ sizeof(tv), 1) != sizeof(tv)) { ++ wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm write: edi %ld\n", ++ task_pid_vnr(tsk), tsk->pid, tsk->comm, regs->di); ++ } ++ ++ } else if (regs->orig_ax == __NR_select && regs->di) { ++ struct { ++ unsigned long n; ++ fd_set __user *inp, *outp, *exp; ++ struct timeval __user *tvp; ++ } a; ++ struct timeval tv; ++ if (access_process_vm(tsk, regs->bx, &a, ++ sizeof(a), 0) != sizeof(a)) { ++ wprintk_ctx("task %d: Error 2 in access_process_vm\n", tsk->pid); ++ break; ++ } ++ if (access_process_vm(tsk, (unsigned long)a.tvp, ++ &tv, sizeof(tv), 0) != sizeof(tv)) { ++ wprintk_ctx("task %d: Error 3 in access_process_vm\n", tsk->pid); ++ break; ++ } ++ dprintk_ctx("task %d: Old timeval in select: %ld.%ld\n", ++ tsk->pid, tv.tv_sec, tv.tv_usec); ++ tv.tv_sec -= ctx->delta_time.tv_sec; ++ if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) { ++ tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000; ++ tv.tv_sec--; ++ } else { ++ tv.tv_usec -= ctx->delta_time.tv_nsec / 1000; ++ } ++ if (tv.tv_sec < 0) { ++ tv.tv_sec = 0; ++ tv.tv_usec = 0; ++ } ++ dprintk_ctx("task %d: New timeval in select: %ld.%ld\n", ++ tsk->pid, tv.tv_sec, tv.tv_usec); ++ if (access_process_vm(tsk, (unsigned long)a.tvp, ++ &tv, sizeof(tv), 1) != sizeof(tv)) { ++ wprintk_ctx("task %d: Error 3 in access_process_vm write\n", tsk->pid); ++ } ++ } ++ } while (0); ++#endif ++ ++ if (ri && IN_SYSCALL(regs) && IN_ERROR(regs)) { ++ switch (SYSCALL_ERRNO(regs)) { ++ case ERESTARTSYS: ++ case ERESTARTNOINTR: ++ case ERESTARTNOHAND: ++ case ERESTART_RESTARTBLOCK: ++ case EAGAIN: ++ case EINTR: ++ ri->hooks |= (1<pn_state)) { ++ /* ... -> ptrace_notify() ++ * or ++ * ... 
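[editor's note] The two branches above repeat the same timeout rewind for __NR__newselect and __NR_select: subtract the time the container spent checkpointed, borrow a second when the microseconds underflow, and clamp at zero so an already-expired timeout fires immediately on resume. A self-contained sketch of that arithmetic (rewind_timeout is an invented helper; the patch performs this on a struct timeval fetched from the tracee with access_process_vm()):

#include <stdio.h>
#include <sys/time.h>
#include <time.h>

/* Rewind a remaining select() timeout by `delta` (the downtime,
 * a timespec like ctx->delta_time), borrowing from tv_sec when the
 * microseconds underflow and clamping at zero. */
static void rewind_timeout(struct timeval *tv, const struct timespec *delta)
{
	long delta_usec = delta->tv_nsec / 1000;

	tv->tv_sec -= delta->tv_sec;
	if (tv->tv_usec < delta_usec) {
		tv->tv_usec += 1000000 - delta_usec;
		tv->tv_sec--;
	} else {
		tv->tv_usec -= delta_usec;
	}
	if (tv->tv_sec < 0) {		/* timeout already expired */
		tv->tv_sec = 0;
		tv->tv_usec = 0;
	}
}

int main(void)
{
	struct timeval tv = { .tv_sec = 5, .tv_usec = 100000 };
	struct timespec delta = { .tv_sec = 2, .tv_nsec = 300000000 };

	rewind_timeout(&tv, &delta);
	printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);	/* 2.800000 */
	return 0;
}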
-> do_signal() -> get_signal_to_deliver() -> ++ * ptrace stop ++ */ ++ tsk->last_siginfo = &ri->last_siginfo; ++ ri->hooks |= (1<last_siginfo, lsi); ++ } ++ ++ tsk->ptrace = ti->cpt_ptrace; ++ tsk->flags = ti->cpt_flags & ~PF_FROZEN; ++ clear_tsk_thread_flag(tsk, TIF_FREEZE); ++ tsk->exit_signal = ti->cpt_exit_signal; ++ ++ if (ri && tsk->stopped_state) { ++ dprintk_ctx("finish_stop\n"); ++ if (ti->cpt_state != TASK_STOPPED) ++ eprintk_ctx("Hellooo, state is %u\n", (unsigned)ti->cpt_state); ++ ri->hooks |= (1<cpt_set_tid || ti->cpt_clear_tid)) { ++ ri->hooks |= (1<tid_ptrs[0] = ti->cpt_clear_tid; ++ ri->tid_ptrs[1] = ti->cpt_set_tid; ++ dprintk_ctx("settids\n"); ++ } ++ ++ if (ri && ri->hooks && ++ !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { ++ if (try_module_get(THIS_MODULE)) ++ ri->hook = rst_resume_work; ++ } ++ ++ if (ti->cpt_state == TASK_TRACED) ++ tsk->state = TASK_TRACED; ++ else if (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD)) { ++ tsk->signal->it_virt_expires = 0; ++ tsk->signal->it_prof_expires = 0; ++ if (tsk->state != EXIT_DEAD) ++ eprintk_ctx("oops, schedule() did not make us dead\n"); ++ } ++ ++ if (thread_group_leader(tsk) && ++ ti->cpt_it_real_value && ++ !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { ++ ktime_t val; ++ s64 nsec; ++ ++ nsec = ti->cpt_it_real_value; ++ val.tv64 = 0; ++ ++ if (ctx->image_version < CPT_VERSION_9) ++ nsec *= TICK_NSEC; ++ ++ val = ktime_add_ns(val, nsec - ctx->delta_nsec); ++ if (val.tv64 <= 0) ++ val.tv64 = NSEC_PER_USEC; ++ dprintk("rst itimer " CPT_FID " +%Ld %Lu\n", CPT_TID(tsk), ++ (long long)val.tv64, ++ (unsigned long long)ti->cpt_it_real_value); ++ ++ spin_lock_irq(&tsk->sighand->siglock); ++ if (hrtimer_try_to_cancel(&tsk->signal->real_timer) >= 0) { ++ /* FIXME. Check!!!! */ ++ hrtimer_start(&tsk->signal->real_timer, val, HRTIMER_MODE_REL); ++ } else { ++ wprintk_ctx("Timer clash. Impossible?\n"); ++ } ++ spin_unlock_irq(&tsk->sighand->siglock); ++ ++ dprintk_ctx("itimer " CPT_FID " +%Lu\n", CPT_TID(tsk), ++ (unsigned long long)val.tv64); ++ } ++ ++ module_put(THIS_MODULE); ++ } ++ return 0; ++} +diff --git a/kernel/cpt/rst_socket.c b/kernel/cpt/rst_socket.c +new file mode 100644 +index 0000000..d90488e +--- /dev/null ++++ b/kernel/cpt/rst_socket.c +@@ -0,0 +1,918 @@ ++/* ++ * ++ * kernel/cpt/rst_socket.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
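[editor's note] The itimer restore above applies the same rebase in nanoseconds: shift the saved it_real expiry by delta_nsec and clamp to a small positive value (NSEC_PER_USEC) so the timer is rearmed rather than silently lost. Sketch of just that arithmetic, with an invented helper name:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000LL

/* Rebase a saved it_real expiry (ns until it fires) by the downtime;
 * never let it go non-positive, or the timer would be dropped instead
 * of firing immediately after restore. */
static int64_t rebase_itimer(int64_t saved_ns, int64_t delta_ns)
{
	int64_t val = saved_ns - delta_ns;

	return val <= 0 ? NSEC_PER_USEC : val;
}

int main(void)
{
	/* expiry already passed during the checkpoint: fire ASAP */
	printf("%lld\n", (long long)rebase_itimer(5000000, 7000000)); /* 1000 */
	return 0;
}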
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_files.h" ++#include "cpt_socket.h" ++#include "cpt_kernel.h" ++ ++#include "cpt_syscalls.h" ++ ++ ++static int setup_sock_common(struct sock *sk, struct cpt_sock_image *si, ++ loff_t pos, struct cpt_context *ctx) ++{ ++ struct timeval tmptv; ++ ++ if (sk->sk_socket) { ++ sk->sk_socket->flags = si->cpt_ssflags; ++ sk->sk_socket->state = si->cpt_sstate; ++ } ++ sk->sk_reuse = si->cpt_reuse; ++ sk->sk_shutdown = si->cpt_shutdown; ++ sk->sk_userlocks = si->cpt_userlocks; ++ sk->sk_no_check = si->cpt_no_check; ++ sock_reset_flag(sk, SOCK_DBG); ++ if (si->cpt_debug) ++ sock_set_flag(sk, SOCK_DBG); ++ sock_reset_flag(sk, SOCK_RCVTSTAMP); ++ if (si->cpt_rcvtstamp) ++ sock_set_flag(sk, SOCK_RCVTSTAMP); ++ sock_reset_flag(sk, SOCK_LOCALROUTE); ++ if (si->cpt_localroute) ++ sock_set_flag(sk, SOCK_LOCALROUTE); ++ sk->sk_protocol = si->cpt_protocol; ++ sk->sk_err = si->cpt_err; ++ sk->sk_err_soft = si->cpt_err_soft; ++ sk->sk_priority = si->cpt_priority; ++ sk->sk_rcvlowat = si->cpt_rcvlowat; ++ sk->sk_rcvtimeo = si->cpt_rcvtimeo; ++ if (si->cpt_rcvtimeo == CPT_NULL) ++ sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; ++ sk->sk_sndtimeo = si->cpt_sndtimeo; ++ if (si->cpt_sndtimeo == CPT_NULL) ++ sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; ++ sk->sk_rcvbuf = si->cpt_rcvbuf; ++ sk->sk_sndbuf = si->cpt_sndbuf; ++ sk->sk_bound_dev_if = si->cpt_bound_dev_if; ++ sk->sk_flags = si->cpt_flags; ++ sk->sk_lingertime = si->cpt_lingertime; ++ if (si->cpt_lingertime == CPT_NULL) ++ sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; ++ sk->sk_peercred.pid = si->cpt_peer_pid; ++ sk->sk_peercred.uid = si->cpt_peer_uid; ++ sk->sk_peercred.gid = si->cpt_peer_gid; ++ cpt_timeval_import(&tmptv, si->cpt_stamp); ++ sk->sk_stamp = timeval_to_ktime(tmptv); ++ return 0; ++} ++ ++static struct file *sock_mapfile(struct socket *sock) ++{ ++ int fd = sock_map_fd(sock); ++ ++ if (fd >= 0) { ++ struct file *file = sock->file; ++ get_file(file); ++ sc_close(fd); ++ return file; ++ } ++ return ERR_PTR(fd); ++} ++ ++/* Assumption is that /tmp exists and writable. ++ * In previous versions we assumed that listen() will autobind ++ * the socket. It does not do this for AF_UNIX by evident reason: ++ * socket in abstract namespace is accessible, unlike socket bound ++ * to deleted FS object. 
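[editor's note] The helpers that follow implement that strategy in-kernel: probe random /tmp/SOCK.%08x names until one is unused (the kernel code checks with path_lookup()), bind there, then unlink so the restored socket is once again bound to a deleted inode. The same idea from userspace, purely as an illustration (function name invented; here a failing bind() stands in for the existence check):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

/* Bind a unix socket to a fresh /tmp/SOCK.%08x name, then unlink it,
 * leaving a bound-but-deleted socket -- the state the restore code
 * must recreate for checkpointed sockets whose path was removed. */
int bind_deleted_unix_socket(int fd)
{
	struct sockaddr_un sun;
	int i;

	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_UNIX;

	for (i = 0; i < 100; i++) {
		snprintf(sun.sun_path, sizeof(sun.sun_path),
			 "/tmp/SOCK.%08x", (unsigned)rand());
		if (bind(fd, (struct sockaddr *)&sun, sizeof(sun)) == 0) {
			unlink(sun.sun_path);	/* socket stays usable */
			return 0;
		}
	}
	return -1;	/* mirrors the -ELOOP give-up after 100 tries */
}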
++ */ ++ ++static int ++select_deleted_name(char * name, cpt_context_t *ctx) ++{ ++ int i; ++ ++ for (i=0; i<100; i++) { ++ struct nameidata nd; ++ unsigned int rnd = net_random(); ++ ++ sprintf(name, "/tmp/SOCK.%08x", rnd); ++ ++ if (path_lookup(name, 0, &nd) != 0) ++ return 0; ++ ++ path_put(&nd.path); ++ } ++ ++ eprintk_ctx("failed to allocate deleted socket inode\n"); ++ return -ELOOP; ++} ++ ++static int ++bind_unix_socket(struct socket *sock, struct cpt_sock_image *si, ++ cpt_context_t *ctx) ++{ ++ int err; ++ char *name; ++ struct sockaddr* addr; ++ int addrlen; ++ struct sockaddr_un sun; ++ struct nameidata nd; ++ ++ if ((addrlen = si->cpt_laddrlen) <= 2) ++ return 0; ++ ++ nd.path.dentry = NULL; ++ name = ((char*)si->cpt_laddr) + 2; ++ addr = (struct sockaddr *)si->cpt_laddr; ++ ++ if (name[0]) { ++ if (path_lookup(name, 0, &nd)) ++ nd.path.dentry = NULL; ++ ++ if (si->cpt_deleted) { ++ if (nd.path.dentry == NULL && ++ sock->ops->bind(sock, addr, addrlen) == 0) { ++ sc_unlink(name); ++ return 0; ++ } ++ ++ addr = (struct sockaddr*)&sun; ++ addr->sa_family = AF_UNIX; ++ name = ((char*)addr) + 2; ++ err = select_deleted_name(name, ctx); ++ if (err) ++ goto out; ++ addrlen = 2 + strlen(name); ++ } else if (nd.path.dentry) { ++ if (!S_ISSOCK(nd.path.dentry->d_inode->i_mode)) { ++ eprintk_ctx("bind_unix_socket: not a socket dentry\n"); ++ err = -EINVAL; ++ goto out; ++ } ++ sc_unlink(name); ++ } ++ } ++ ++ err = sock->ops->bind(sock, addr, addrlen); ++ ++ if (!err && name[0]) { ++ if (nd.path.dentry) { ++ sc_chown(name, nd.path.dentry->d_inode->i_uid, ++ nd.path.dentry->d_inode->i_gid); ++ sc_chmod(name, nd.path.dentry->d_inode->i_mode); ++ } ++ if (si->cpt_deleted) ++ sc_unlink(name); ++ } ++ ++out: ++ if (nd.path.dentry) ++ path_put(&nd.path); ++ return err; ++} ++ ++static int fixup_unix_address(struct socket *sock, struct cpt_sock_image *si, ++ struct cpt_context *ctx) ++{ ++ struct sock *sk = sock->sk; ++ cpt_object_t *obj; ++ struct sock *parent; ++ ++ if (sk->sk_family != AF_UNIX || sk->sk_state == TCP_LISTEN) ++ return 0; ++ ++ if (si->cpt_parent == -1) ++ return bind_unix_socket(sock, si, ctx); ++ ++ obj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); ++ if (!obj) ++ return 0; ++ ++ parent = obj->o_obj; ++ if (unix_sk(parent)->addr) { ++ if (unix_sk(sk)->addr && ++ atomic_dec_and_test(&unix_sk(sk)->addr->refcnt)) ++ kfree(unix_sk(sk)->addr); ++ atomic_inc(&unix_sk(parent)->addr->refcnt); ++ unix_sk(sk)->addr = unix_sk(parent)->addr; ++ } ++ return 0; ++} ++ ++static int generic_restore_queues(struct sock *sk, struct cpt_sock_image *si, ++ loff_t pos, struct cpt_context *ctx) ++{ ++ loff_t endpos; ++ ++ pos = pos + si->cpt_hdrlen; ++ endpos = pos + si->cpt_next; ++ while (pos < endpos) { ++ struct sk_buff *skb; ++ __u32 type; ++ ++ skb = rst_skb(&pos, NULL, &type, ctx); ++ if (IS_ERR(skb)) { ++ if (PTR_ERR(skb) == -EINVAL) { ++ int err; ++ ++ err = rst_sock_attr(&pos, sk, ctx); ++ if (err) ++ return err; ++ } ++ return PTR_ERR(skb); ++ } ++ ++ if (type == CPT_SKB_RQ) { ++ skb_set_owner_r(skb, sk); ++ skb_queue_tail(&sk->sk_receive_queue, skb); ++ } else { ++ wprintk_ctx("strange socket queue type %u\n", type); ++ kfree_skb(skb); ++ } ++ } ++ return 0; ++} ++ ++static int open_socket(cpt_object_t *obj, struct cpt_sock_image *si, ++ struct cpt_context *ctx) ++{ ++ int err; ++ struct socket *sock; ++ struct socket *sock2 = NULL; ++ struct file *file; ++ cpt_object_t *fobj; ++ cpt_object_t *pobj = NULL; ++ ++ err = sock_create_kern(si->cpt_family, si->cpt_type, 
si->cpt_protocol, ++ &sock); ++ if (err) ++ return err; ++ ++ if (si->cpt_socketpair) { ++ err = sock_create_kern(si->cpt_family, si->cpt_type, ++ si->cpt_protocol, &sock2); ++ if (err) ++ goto err_out; ++ ++ err = sock->ops->socketpair(sock, sock2); ++ if (err < 0) ++ goto err_out; ++ ++ /* Socketpair with a peer outside our environment. ++ * So, we create real half-open pipe and do not worry ++ * about dead end anymore. */ ++ if (si->cpt_peer == -1) { ++ sock_release(sock2); ++ sock2 = NULL; ++ } ++ } ++ ++ cpt_obj_setobj(obj, sock->sk, ctx); ++ ++ if (si->cpt_file != CPT_NULL) { ++ file = sock_mapfile(sock); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto err_out; ++ ++ err = -ENOMEM; ++ ++ obj->o_parent = file; ++ ++ if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL) ++ goto err_out; ++ cpt_obj_setpos(fobj, si->cpt_file, ctx); ++ cpt_obj_setindex(fobj, si->cpt_index, ctx); ++ } ++ ++ if (sock2) { ++ struct file *file2; ++ ++ pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_peer, ctx); ++ if (!pobj) BUG(); ++ if (pobj->o_obj) BUG(); ++ cpt_obj_setobj(pobj, sock2->sk, ctx); ++ ++ if (pobj->o_ppos != CPT_NULL) { ++ file2 = sock_mapfile(sock2); ++ err = PTR_ERR(file2); ++ if (IS_ERR(file2)) ++ goto err_out; ++ ++ err = -ENOMEM; ++ if ((fobj = cpt_object_add(CPT_OBJ_FILE, file2, ctx)) == NULL) ++ goto err_out; ++ cpt_obj_setpos(fobj, pobj->o_ppos, ctx); ++ cpt_obj_setindex(fobj, si->cpt_peer, ctx); ++ ++ pobj->o_parent = file2; ++ } ++ } ++ ++ setup_sock_common(sock->sk, si, obj->o_pos, ctx); ++ if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) { ++ int saved_reuse = sock->sk->sk_reuse; ++ ++ inet_sk(sock->sk)->freebind = 1; ++ sock->sk->sk_reuse = 2; ++ if (si->cpt_laddrlen) { ++ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); ++ if (err) { ++ dprintk_ctx("binding failed: %d, do not worry\n", err); ++ } ++ } ++ sock->sk->sk_reuse = saved_reuse; ++ rst_socket_in(si, obj->o_pos, sock->sk, ctx); ++ } else if (sock->sk->sk_family == AF_NETLINK) { ++ struct sockaddr_nl *nl = (struct sockaddr_nl *)&si->cpt_laddr; ++ if (nl->nl_pid) { ++ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); ++ if (err) { ++ eprintk_ctx("AF_NETLINK binding failed: %d\n", err); ++ } ++ } ++ if (si->cpt_raddrlen && nl->nl_pid) { ++ err = sock->ops->connect(sock, (struct sockaddr *)&si->cpt_raddr, si->cpt_raddrlen, O_NONBLOCK); ++ if (err) { ++ eprintk_ctx("oops, AF_NETLINK connect failed: %d\n", err); ++ } ++ } ++ generic_restore_queues(sock->sk, si, obj->o_pos, ctx); ++ } else if (sock->sk->sk_family == PF_PACKET) { ++ struct sockaddr_ll *ll = (struct sockaddr_ll *)&si->cpt_laddr; ++ if (ll->sll_protocol || ll->sll_ifindex) { ++ int alen = si->cpt_laddrlen; ++ if (alen < sizeof(struct sockaddr_ll)) ++ alen = sizeof(struct sockaddr_ll); ++ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, alen); ++ if (err) { ++ eprintk_ctx("AF_PACKET binding failed: %d\n", err); ++ } ++ } ++ generic_restore_queues(sock->sk, si, obj->o_pos, ctx); ++ } ++ fixup_unix_address(sock, si, ctx); ++ ++ if (sock2) { ++ err = rst_get_object(CPT_OBJ_SOCKET, pobj->o_pos, si, ctx); ++ if (err) ++ return err; ++ setup_sock_common(sock2->sk, si, pobj->o_pos, ctx); ++ fixup_unix_address(sock2, si, ctx); ++ } ++ ++ if ((sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) ++ && (int)si->cpt_parent != -1) { ++ cpt_object_t *lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); ++ if (lobj && 
cpt_attach_accept(lobj->o_obj, sock->sk, ctx) == 0) ++ sock->sk = NULL; ++ } ++ ++ ++ if (si->cpt_file == CPT_NULL && sock->sk && ++ sock->sk->sk_family == AF_INET) { ++ struct sock *sk = sock->sk; ++ ++ if (sk) { ++ sock->sk = NULL; ++ ++ local_bh_disable(); ++ bh_lock_sock(sk); ++ if (sock_owned_by_user(sk)) ++ eprintk_ctx("oops, sock is locked by user\n"); ++ ++ sock_hold(sk); ++ sock_orphan(sk); ++ ub_inc_orphan_count(sk); ++ bh_unlock_sock(sk); ++ local_bh_enable(); ++ sock_put(sk); ++ dprintk_ctx("orphaning socket %p\n", sk); ++ } ++ } ++ ++ if (si->cpt_file == CPT_NULL && sock->sk == NULL) ++ sock_release(sock); ++ ++ return 0; ++ ++err_out: ++ if (sock2) ++ sock_release(sock2); ++ sock_release(sock); ++ return err; ++} ++ ++static int open_listening_socket(loff_t pos, struct cpt_sock_image *si, ++ struct cpt_context *ctx) ++{ ++ int err; ++ struct socket *sock; ++ struct file *file; ++ cpt_object_t *obj, *fobj; ++ ++ err = sock_create_kern(si->cpt_family, si->cpt_type, si->cpt_protocol, ++ &sock); ++ if (err) { ++ eprintk_ctx("open_listening_socket: sock_create_kern: %d\n", err); ++ return err; ++ } ++ ++ sock->sk->sk_reuse = 2; ++ sock->sk->sk_bound_dev_if = si->cpt_bound_dev_if; ++ ++ if (sock->sk->sk_family == AF_UNIX) { ++ err = bind_unix_socket(sock, si, ctx); ++ } else if (si->cpt_laddrlen) { ++ if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) ++ inet_sk(sock->sk)->freebind = 1; ++ ++ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); ++ ++ if (err) { ++ eprintk_ctx("open_listening_socket: bind: %d\n", err); ++ goto err_out; ++ } ++ } ++ ++ err = sock->ops->listen(sock, si->cpt_max_ack_backlog); ++ if (err) { ++ eprintk_ctx("open_listening_socket: listen: %d, %Ld, %d\n", err, pos, si->cpt_deleted); ++ goto err_out; ++ } ++ ++ /* Now we may access socket body directly and fixup all the things. 
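[editor's note] open_listening_socket() above recreates a listener in the plain create/bind/listen order, forcing freebind so bind() succeeds even if the checkpointed address is not configured yet, and reusing the saved backlog. Roughly the userspace equivalent, as an illustration only (SO_REUSEADDR and IP_FREEBIND stand in for the direct sk_reuse = 2 and inet_sk(sk)->freebind = 1 pokes):

#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Recreate a listener the way the restore path does: allow address
 * reuse, allow binding a not-yet-configured address, bind the saved
 * local address, then listen with the saved backlog. */
static int recreate_listener(const struct sockaddr_in *saved, int backlog)
{
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
	setsockopt(fd, IPPROTO_IP, IP_FREEBIND, &one, sizeof(one));
	if (bind(fd, (const struct sockaddr *)saved, sizeof(*saved)) ||
	    listen(fd, backlog)) {
		close(fd);
		return -1;
	}
	return fd;
}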
*/ ++ ++ file = sock_mapfile(sock); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) { ++ eprintk_ctx("open_listening_socket: map: %d\n", err); ++ goto err_out; ++ } ++ ++ err = -ENOMEM; ++ if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL) ++ goto err_out; ++ if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sock->sk, ctx)) == NULL) ++ goto err_out; ++ cpt_obj_setpos(obj, pos, ctx); ++ cpt_obj_setindex(obj, si->cpt_index, ctx); ++ obj->o_parent = file; ++ cpt_obj_setpos(fobj, si->cpt_file, ctx); ++ cpt_obj_setindex(fobj, si->cpt_index, ctx); ++ ++ setup_sock_common(sock->sk, si, pos, ctx); ++ ++ if (si->cpt_family == AF_INET || si->cpt_family == AF_INET6) ++ rst_restore_synwait_queue(sock->sk, si, pos, ctx); ++ ++ return 0; ++ ++err_out: ++ sock_release(sock); ++ return err; ++} ++ ++static int ++rst_sock_attr_mcfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) ++{ ++ int err; ++ loff_t pos = *pos_p; ++ struct cpt_sockmc_image v; ++ ++ err = rst_get_object(CPT_OBJ_SOCK_MCADDR, pos, &v, ctx); ++ if (err) ++ return err; ++ ++ *pos_p += v.cpt_next; ++ ++ if (v.cpt_family == AF_INET) ++ return rst_sk_mcfilter_in(sk, &v, pos, ctx); ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ else if (v.cpt_family == AF_INET6) ++ return rst_sk_mcfilter_in6(sk, &v, pos, ctx); ++#endif ++ else ++ return -EAFNOSUPPORT; ++} ++ ++ ++static int ++rst_sock_attr_skfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) ++{ ++ int err; ++ struct sk_filter *fp, *old_fp; ++ loff_t pos = *pos_p; ++ struct cpt_obj_bits v; ++ ++ err = rst_get_object(CPT_OBJ_SKFILTER, pos, &v, ctx); ++ if (err) ++ return err; ++ ++ *pos_p += v.cpt_next; ++ ++ if (v.cpt_size % sizeof(struct sock_filter)) ++ return -EINVAL; ++ ++ fp = sock_kmalloc(sk, v.cpt_size+sizeof(*fp), GFP_KERNEL_UBC); ++ if (fp == NULL) ++ return -ENOMEM; ++ atomic_set(&fp->refcnt, 1); ++ fp->len = v.cpt_size/sizeof(struct sock_filter); ++ ++ err = ctx->pread(fp->insns, v.cpt_size, ctx, pos+v.cpt_hdrlen); ++ if (err) { ++ sk_filter_uncharge(sk, fp); ++ return err; ++ } ++ ++ old_fp = sk->sk_filter; ++ sk->sk_filter = fp; ++ if (old_fp) ++ sk_filter_uncharge(sk, old_fp); ++ return 0; ++} ++ ++ ++int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) ++{ ++ int err; ++ loff_t pos = *pos_p; ++ ++ err = rst_sock_attr_skfilter(pos_p, sk, ctx); ++ if (err && pos == *pos_p) ++ err = rst_sock_attr_mcfilter(pos_p, sk, ctx); ++ return err; ++} ++ ++struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx) ++{ ++ int err; ++ struct sk_buff *skb; ++ struct cpt_skb_image v; ++ loff_t pos = *pos_p; ++ struct scm_fp_list *fpl = NULL; ++ struct timeval tmptv; ++ ++ err = rst_get_object(CPT_OBJ_SKB, pos, &v, ctx); ++ if (err) ++ return ERR_PTR(err); ++ *pos_p = pos + v.cpt_next; ++ ++ if (owner) ++ *owner = v.cpt_owner; ++ if (queue) ++ *queue = v.cpt_queue; ++ ++ skb = alloc_skb(v.cpt_len + v.cpt_hspace + v.cpt_tspace, GFP_KERNEL); ++ if (skb == NULL) ++ return ERR_PTR(-ENOMEM); ++ skb_reserve(skb, v.cpt_hspace); ++ skb_put(skb, v.cpt_len); ++#ifdef NET_SKBUFF_DATA_USES_OFFSET ++ skb->transport_header = v.cpt_h; ++ skb->network_header = v.cpt_nh; ++ skb->mac_header = v.cpt_mac; ++#else ++ skb->transport_header = skb->head + v.cpt_h; ++ skb->network_header = skb->head + v.cpt_nh; ++ skb->mac_header = skb->head + v.cpt_mac; ++#endif ++ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(v.cpt_cb)); ++ memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb)); ++ skb->mac_len = v.cpt_mac_len; ++ ++ skb->csum = v.cpt_csum; ++ skb->local_df = 
v.cpt_local_df; ++ skb->pkt_type = v.cpt_pkt_type; ++ skb->ip_summed = v.cpt_ip_summed; ++ skb->priority = v.cpt_priority; ++ skb->protocol = v.cpt_protocol; ++ cpt_timeval_import(&tmptv, v.cpt_stamp); ++ skb->tstamp = timeval_to_ktime(tmptv); ++ ++ skb_shinfo(skb)->gso_segs = v.cpt_gso_segs; ++ skb_shinfo(skb)->gso_size = v.cpt_gso_size; ++ if (ctx->image_version == 0) { ++ skb_shinfo(skb)->gso_segs = 1; ++ skb_shinfo(skb)->gso_size = 0; ++ } ++ ++ if (v.cpt_next > v.cpt_hdrlen) { ++ pos = pos + v.cpt_hdrlen; ++ while (pos < *pos_p) { ++ union { ++ struct cpt_obj_bits b; ++ struct cpt_fd_image f; ++ } u; ++ ++ err = rst_get_object(-1, pos, &u, ctx); ++ if (err) { ++ kfree_skb(skb); ++ return ERR_PTR(err); ++ } ++ if (u.b.cpt_object == CPT_OBJ_BITS) { ++ if (u.b.cpt_size != v.cpt_hspace + skb->len) { ++ eprintk_ctx("invalid skb image %u != %u + %u\n", u.b.cpt_size, v.cpt_hspace, skb->len); ++ kfree_skb(skb); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ err = ctx->pread(skb->head, u.b.cpt_size, ctx, pos+u.b.cpt_hdrlen); ++ if (err) { ++ kfree_skb(skb); ++ return ERR_PTR(err); ++ } ++ } else if (u.f.cpt_object == CPT_OBJ_FILEDESC) { ++ if (!fpl) { ++ fpl = kmalloc(sizeof(struct scm_fp_list), ++ GFP_KERNEL_UBC); ++ if (!fpl) { ++ kfree_skb(skb); ++ return ERR_PTR(-ENOMEM); ++ } ++ fpl->count = 0; ++ UNIXCB(skb).fp = fpl; ++ } ++ fpl->fp[fpl->count] = rst_file(u.f.cpt_file, -1, ctx); ++ if (!IS_ERR(fpl->fp[fpl->count])) ++ fpl->count++; ++ } ++ pos += u.b.cpt_next; ++ } ++ } ++ ++ return skb; ++} ++ ++static int restore_unix_rqueue(struct sock *sk, struct cpt_sock_image *si, ++ loff_t pos, struct cpt_context *ctx) ++{ ++ loff_t endpos; ++ ++ pos = pos + si->cpt_hdrlen; ++ endpos = pos + si->cpt_next; ++ while (pos < endpos) { ++ struct sk_buff *skb; ++ struct sock *owner_sk; ++ __u32 owner; ++ ++ skb = rst_skb(&pos, &owner, NULL, ctx); ++ if (IS_ERR(skb)) { ++ if (PTR_ERR(skb) == -EINVAL) { ++ int err; ++ ++ err = rst_sock_attr(&pos, sk, ctx); ++ if (err) ++ return err; ++ } ++ return PTR_ERR(skb); ++ } ++ ++ owner_sk = unix_peer(sk); ++ if (owner != -1) { ++ cpt_object_t *pobj; ++ pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, owner, ctx); ++ if (pobj == NULL) { ++ eprintk_ctx("orphan af_unix skb?\n"); ++ kfree_skb(skb); ++ continue; ++ } ++ owner_sk = pobj->o_obj; ++ } ++ if (owner_sk == NULL) { ++ dprintk_ctx("orphan af_unix skb 2?\n"); ++ kfree_skb(skb); ++ continue; ++ } ++ skb_set_owner_w(skb, owner_sk); ++ if (UNIXCB(skb).fp) ++ skb->destructor = unix_destruct_fds; ++ skb_queue_tail(&sk->sk_receive_queue, skb); ++ if (sk->sk_state == TCP_LISTEN) { ++ struct socket *sock = skb->sk->sk_socket; ++ if (sock == NULL) BUG(); ++ if (sock->file) BUG(); ++ skb->sk->sk_socket = NULL; ++ skb->sk->sk_sleep = NULL; ++ sock->sk = NULL; ++ sock_release(sock); ++ } ++ } ++ return 0; ++} ++ ++ ++/* All the sockets are created before we start to open files */ ++ ++int rst_sockets(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_SOCKET]; ++ loff_t endsec; ++ cpt_object_t *obj; ++ struct cpt_section_hdr h; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) { ++ eprintk_ctx("rst_sockets: ctx->pread: %d\n", err); ++ return err; ++ } ++ if (h.cpt_section != CPT_SECT_SOCKET || h.cpt_hdrlen < sizeof(h)) { ++ eprintk_ctx("rst_sockets: hdr err\n"); ++ return -EINVAL; ++ } ++ ++ /* The first pass: we create socket index and open listening sockets. 
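[editor's note] rst_skb() above rebuilds each buffer from three saved numbers: allocate hspace + len + tspace bytes, skb_reserve() the headroom, skb_put() the payload, then reapply header offsets relative to head. The layout arithmetic, modeled in plain C (struct and field names invented):

#include <assert.h>
#include <stdio.h>

/* Offsets into the flat buffer rst_skb() allocates; all relative to
 * head, which is how the offsets are stored in the image. */
struct skb_layout {
	unsigned data;	/* payload start: set by skb_reserve(hspace) */
	unsigned tail;	/* payload end:   advanced by skb_put(len)   */
	unsigned end;	/* buffer end:    hspace + len + tspace      */
};

static struct skb_layout rebuild_layout(unsigned hspace, unsigned len,
					unsigned tspace)
{
	struct skb_layout l;

	l.end = hspace + len + tspace;	/* alloc_skb() size */
	l.data = hspace;		/* skb_reserve(skb, hspace) */
	l.tail = hspace + len;		/* skb_put(skb, len) */
	return l;
}

int main(void)
{
	struct skb_layout l = rebuild_layout(64, 1400, 0);

	assert(l.tail - l.data == 1400 && l.tail <= l.end);
	printf("headroom %u, payload %u\n", l.data, l.tail - l.data);
	return 0;
}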
*/ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ struct cpt_sock_image *sbuf = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx); ++ if (err) { ++ eprintk_ctx("rst_sockets: rst_get_object: %d\n", err); ++ cpt_release_buf(ctx); ++ return err; ++ } ++ if (sbuf->cpt_state == TCP_LISTEN) { ++ err = open_listening_socket(sec, sbuf, ctx); ++ cpt_release_buf(ctx); ++ if (err) { ++ eprintk_ctx("rst_sockets: open_listening_socket: %d\n", err); ++ return err; ++ } ++ } else { ++ cpt_release_buf(ctx); ++ obj = alloc_cpt_object(GFP_KERNEL, ctx); ++ if (obj == NULL) ++ return -ENOMEM; ++ cpt_obj_setindex(obj, sbuf->cpt_index, ctx); ++ cpt_obj_setpos(obj, sec, ctx); ++ obj->o_ppos = sbuf->cpt_file; ++ intern_cpt_object(CPT_OBJ_SOCKET, obj, ctx); ++ } ++ sec += sbuf->cpt_next; ++ } ++ ++ /* Pass 2: really restore sockets */ ++ for_each_object(obj, CPT_OBJ_SOCKET) { ++ struct cpt_sock_image *sbuf; ++ if (obj->o_obj != NULL) ++ continue; ++ sbuf = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); ++ if (err) { ++ eprintk_ctx("rst_sockets: rst_get_object: %d\n", err); ++ cpt_release_buf(ctx); ++ return err; ++ } ++ if (sbuf->cpt_state == TCP_LISTEN) BUG(); ++ err = open_socket(obj, sbuf, ctx); ++ cpt_release_buf(ctx); ++ if (err) { ++ eprintk_ctx("rst_sockets: open_socket: %d\n", err); ++ return err; ++ } ++ } ++ ++ return 0; ++} ++ ++int rst_orphans(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_ORPHANS]; ++ loff_t endsec; ++ cpt_object_t *obj; ++ struct cpt_section_hdr h; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_ORPHANS || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ struct cpt_sock_image *sbuf = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ obj = alloc_cpt_object(GFP_KERNEL, ctx); ++ if (obj == NULL) { ++ cpt_release_buf(ctx); ++ return -ENOMEM; ++ } ++ obj->o_pos = sec; ++ obj->o_ppos = sbuf->cpt_file; ++ err = open_socket(obj, sbuf, ctx); ++ dprintk_ctx("Restoring orphan: %d\n", err); ++ free_cpt_object(obj, ctx); ++ cpt_release_buf(ctx); ++ if (err) ++ return err; ++ sec += sbuf->cpt_next; ++ } ++ ++ return 0; ++} ++ ++ ++/* Pass 3: I understand, this is not funny already :-), ++ * but we have to do another pass to establish links between ++ * not-paired AF_UNIX SOCK_DGRAM sockets and to restore AF_UNIX ++ * skb queues with proper skb->sk links. ++ * ++ * This could be made at the end of rst_sockets(), but we defer ++ * restoring af_unix queues up to the end of restoring files to ++ * make restoring passed FDs cleaner. 
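[editor's note] rst_sockets() and rst_orphans() above, like every other section reader in this patch, share one traversal shape: read the section header, compute endsec from cpt_next, skip cpt_hdrlen, then hop from object to object via each record's own cpt_next. That pattern, reduced to a userspace sketch over an in-memory image (struct layouts abbreviated; the callback is invented):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Minimal stand-ins for the image records walked by rst_sockets() etc. */
struct section_hdr { uint32_t cpt_next; uint16_t cpt_section; uint16_t cpt_hdrlen; };
struct obj_hdr     { uint32_t cpt_next; uint32_t cpt_object; };

/* Walk one section of an image held in memory, invoking cb() on every
 * object.  Mirrors the sec/endsec loop used throughout the patch. */
static int walk_section(const char *img, size_t sec,
			int (*cb)(const struct obj_hdr *))
{
	struct section_hdr h;
	size_t endsec;

	memcpy(&h, img + sec, sizeof(h));	/* ctx->pread(&h, ...) */
	if (h.cpt_hdrlen < sizeof(h))
		return -1;

	endsec = sec + h.cpt_next;
	sec += h.cpt_hdrlen;
	while (sec < endsec) {
		struct obj_hdr o;

		memcpy(&o, img + sec, sizeof(o));
		if (cb(&o))
			return -1;
		sec += o.cpt_next;		/* objects are self-sized */
	}
	return 0;
}

static int print_obj(const struct obj_hdr *o)
{
	printf("object %u, %u bytes\n", (unsigned)o->cpt_object,
	       (unsigned)o->cpt_next);
	return 0;
}

int main(void)
{
	/* one section holding two 16-byte objects */
	char img[64] = { 0 };
	struct section_hdr h = { .cpt_next = 8 + 32, .cpt_hdrlen = 8 };
	struct obj_hdr a = { .cpt_next = 16, .cpt_object = 1 };
	struct obj_hdr b = { .cpt_next = 16, .cpt_object = 2 };

	memcpy(img, &h, sizeof(h));
	memcpy(img + 8, &a, sizeof(a));
	memcpy(img + 24, &b, sizeof(b));
	return walk_section(img, 0, print_obj);
}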
++ */ ++ ++int rst_sockets_complete(struct cpt_context *ctx) ++{ ++ int err; ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_SOCKET) { ++ struct cpt_sock_image *sbuf; ++ struct sock *sk = obj->o_obj; ++ struct sock *peer; ++ ++ if (!sk) BUG(); ++ ++ if (sk->sk_family != AF_UNIX) ++ continue; ++ ++ sbuf = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ ++ if (sbuf->cpt_next > sbuf->cpt_hdrlen) ++ restore_unix_rqueue(sk, sbuf, obj->o_pos, ctx); ++ ++ cpt_release_buf(ctx); ++ ++ if (sk->sk_type == SOCK_DGRAM && unix_peer(sk) == NULL) { ++ cpt_object_t *pobj; ++ ++ sbuf = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ ++ if (sbuf->cpt_peer != -1) { ++ pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, sbuf->cpt_peer, ctx); ++ if (pobj) { ++ peer = pobj->o_obj; ++ sock_hold(peer); ++ unix_peer(sk) = peer; ++ } ++ } ++ cpt_release_buf(ctx); ++ } ++ } ++ ++ rst_orphans(ctx); ++ ++ return 0; ++} ++ +diff --git a/kernel/cpt/rst_socket_in.c b/kernel/cpt/rst_socket_in.c +new file mode 100644 +index 0000000..ddc2d5a +--- /dev/null ++++ b/kernel/cpt/rst_socket_in.c +@@ -0,0 +1,489 @@ ++/* ++ * ++ * kernel/cpt/rst_socket_in.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_socket.h" ++#include "cpt_kernel.h" ++ ++static inline unsigned long jiffies_import(__u32 tmo) ++{ ++ __s32 delta = tmo; ++ return jiffies + (long)delta; ++} ++ ++static inline __u32 tcp_jiffies_import(__u32 tmo) ++{ ++ return ((__u32)jiffies) + tmo; ++} ++ ++ ++static int restore_queues(struct sock *sk, struct cpt_sock_image *si, ++ loff_t pos, struct cpt_context *ctx) ++{ ++ loff_t endpos; ++ ++ pos = pos + si->cpt_hdrlen; ++ endpos = pos + si->cpt_next; ++ while (pos < endpos) { ++ struct sk_buff *skb; ++ __u32 type; ++ ++ skb = rst_skb(&pos, NULL, &type, ctx); ++ if (IS_ERR(skb)) { ++ if (PTR_ERR(skb) == -EINVAL) { ++ int err; ++ ++ err = rst_sock_attr(&pos, sk, ctx); ++ if (err) ++ return err; ++ } ++ return PTR_ERR(skb); ++ } ++ ++ if (sk->sk_type == SOCK_STREAM) { ++ if (type == CPT_SKB_RQ) { ++ skb_set_owner_r(skb, sk); ++ ub_tcprcvbuf_charge_forced(sk, skb); ++ skb_queue_tail(&sk->sk_receive_queue, skb); ++ } else if (type == CPT_SKB_OFOQ) { ++ struct tcp_sock *tp = tcp_sk(sk); ++ skb_set_owner_r(skb, sk); ++ ub_tcprcvbuf_charge_forced(sk, skb); ++ skb_queue_tail(&tp->out_of_order_queue, skb); ++ } else if (type == CPT_SKB_WQ) { ++ sk->sk_wmem_queued += skb->truesize; ++ sk->sk_forward_alloc -= skb->truesize; ++ ub_tcpsndbuf_charge_forced(sk, skb); ++ skb_queue_tail(&sk->sk_write_queue, skb); ++ } else { ++ wprintk_ctx("strange stream queue type %u\n", type); ++ kfree_skb(skb); ++ } ++ } else { ++ if (type == CPT_SKB_RQ) { ++ skb_set_owner_r(skb, sk); ++ skb_queue_tail(&sk->sk_receive_queue, skb); ++ } else if (type == CPT_SKB_WQ) { ++ struct inet_sock *inet = inet_sk(sk); ++ if (inet->cork.fragsize) { ++ skb_set_owner_w(skb, sk); ++ skb_queue_tail(&sk->sk_write_queue, skb); ++ } else { ++ eprintk_ctx("cork skb is dropped\n"); ++ 
kfree_skb(skb); ++ } ++ } else { ++ wprintk_ctx("strange dgram queue type %u\n", type); ++ kfree_skb(skb); ++ } ++ } ++ } ++ return 0; ++} ++ ++static struct sock *find_parent(__u16 sport, cpt_context_t *ctx) ++{ ++ cpt_object_t *obj; ++ for_each_object(obj, CPT_OBJ_SOCKET) { ++ struct sock *sk = obj->o_obj; ++ if (sk && ++ sk->sk_state == TCP_LISTEN && ++ (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) && ++ inet_sk(sk)->sport == sport) ++ return sk; ++ } ++ return NULL; ++} ++ ++static int rst_socket_tcp(struct cpt_sock_image *si, loff_t pos, struct sock *sk, ++ struct cpt_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct sk_buff *skb; ++ tp->pred_flags = si->cpt_pred_flags; ++ tp->rcv_nxt = si->cpt_rcv_nxt; ++ tp->snd_nxt = si->cpt_snd_nxt; ++ tp->snd_una = si->cpt_snd_una; ++ tp->snd_sml = si->cpt_snd_sml; ++ tp->rcv_tstamp = tcp_jiffies_import(si->cpt_rcv_tstamp); ++ tp->lsndtime = tcp_jiffies_import(si->cpt_lsndtime); ++ tp->tcp_header_len = si->cpt_tcp_header_len; ++ inet_csk(sk)->icsk_ack.pending = si->cpt_ack_pending; ++ inet_csk(sk)->icsk_ack.quick = si->cpt_quick; ++ inet_csk(sk)->icsk_ack.pingpong = si->cpt_pingpong; ++ inet_csk(sk)->icsk_ack.blocked = si->cpt_blocked; ++ inet_csk(sk)->icsk_ack.ato = si->cpt_ato; ++ inet_csk(sk)->icsk_ack.timeout = jiffies_import(si->cpt_ack_timeout); ++ inet_csk(sk)->icsk_ack.lrcvtime = tcp_jiffies_import(si->cpt_lrcvtime); ++ inet_csk(sk)->icsk_ack.last_seg_size = si->cpt_last_seg_size; ++ inet_csk(sk)->icsk_ack.rcv_mss = si->cpt_rcv_mss; ++ tp->snd_wl1 = si->cpt_snd_wl1; ++ tp->snd_wnd = si->cpt_snd_wnd; ++ tp->max_window = si->cpt_max_window; ++ inet_csk(sk)->icsk_pmtu_cookie = si->cpt_pmtu_cookie; ++ tp->mss_cache = si->cpt_mss_cache; ++ tp->rx_opt.mss_clamp = si->cpt_mss_clamp; ++ inet_csk(sk)->icsk_ext_hdr_len = si->cpt_ext_header_len; ++ inet_csk(sk)->icsk_ca_state = si->cpt_ca_state; ++ inet_csk(sk)->icsk_retransmits = si->cpt_retransmits; ++ tp->reordering = si->cpt_reordering; ++ tp->frto_counter = si->cpt_frto_counter; ++ tp->frto_highmark = si->cpt_frto_highmark; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) ++ // // tp->adv_cong = si->cpt_adv_cong; ++#endif ++ inet_csk(sk)->icsk_accept_queue.rskq_defer_accept = si->cpt_defer_accept; ++ inet_csk(sk)->icsk_backoff = si->cpt_backoff; ++ tp->srtt = si->cpt_srtt; ++ tp->mdev = si->cpt_mdev; ++ tp->mdev_max = si->cpt_mdev_max; ++ tp->rttvar = si->cpt_rttvar; ++ tp->rtt_seq = si->cpt_rtt_seq; ++ inet_csk(sk)->icsk_rto = si->cpt_rto; ++ tp->packets_out = si->cpt_packets_out; ++ tp->retrans_out = si->cpt_retrans_out; ++ tp->lost_out = si->cpt_lost_out; ++ tp->sacked_out = si->cpt_sacked_out; ++ tp->fackets_out = si->cpt_fackets_out; ++ tp->snd_ssthresh = si->cpt_snd_ssthresh; ++ tp->snd_cwnd = si->cpt_snd_cwnd; ++ tp->snd_cwnd_cnt = si->cpt_snd_cwnd_cnt; ++ tp->snd_cwnd_clamp = si->cpt_snd_cwnd_clamp; ++ tp->snd_cwnd_used = si->cpt_snd_cwnd_used; ++ tp->snd_cwnd_stamp = tcp_jiffies_import(si->cpt_snd_cwnd_stamp); ++ inet_csk(sk)->icsk_timeout = tcp_jiffies_import(si->cpt_timeout); ++ tp->rcv_wnd = si->cpt_rcv_wnd; ++ tp->rcv_wup = si->cpt_rcv_wup; ++ tp->write_seq = si->cpt_write_seq; ++ tp->pushed_seq = si->cpt_pushed_seq; ++ tp->copied_seq = si->cpt_copied_seq; ++ tp->rx_opt.tstamp_ok = si->cpt_tstamp_ok; ++ tp->rx_opt.wscale_ok = si->cpt_wscale_ok; ++ tp->rx_opt.sack_ok = si->cpt_sack_ok; ++ tp->rx_opt.saw_tstamp = si->cpt_saw_tstamp; ++ tp->rx_opt.snd_wscale = si->cpt_snd_wscale; ++ tp->rx_opt.rcv_wscale = si->cpt_rcv_wscale; ++ tp->nonagle = 
si->cpt_nonagle; ++ tp->keepalive_probes = si->cpt_keepalive_probes; ++ tp->rx_opt.rcv_tsval = si->cpt_rcv_tsval; ++ tp->rx_opt.rcv_tsecr = si->cpt_rcv_tsecr; ++ tp->rx_opt.ts_recent = si->cpt_ts_recent; ++ tp->rx_opt.ts_recent_stamp = si->cpt_ts_recent_stamp; ++ tp->rx_opt.user_mss = si->cpt_user_mss; ++ tp->rx_opt.dsack = si->cpt_dsack; ++ tp->rx_opt.eff_sacks = si->cpt_num_sacks; ++ tp->duplicate_sack[0].start_seq = si->cpt_sack_array[0]; ++ tp->duplicate_sack[0].end_seq = si->cpt_sack_array[1]; ++ tp->selective_acks[0].start_seq = si->cpt_sack_array[2]; ++ tp->selective_acks[0].end_seq = si->cpt_sack_array[3]; ++ tp->selective_acks[1].start_seq = si->cpt_sack_array[4]; ++ tp->selective_acks[1].end_seq = si->cpt_sack_array[5]; ++ tp->selective_acks[2].start_seq = si->cpt_sack_array[6]; ++ tp->selective_acks[2].end_seq = si->cpt_sack_array[7]; ++ tp->selective_acks[3].start_seq = si->cpt_sack_array[8]; ++ tp->selective_acks[3].end_seq = si->cpt_sack_array[9]; ++ ++ tp->window_clamp = si->cpt_window_clamp; ++ tp->rcv_ssthresh = si->cpt_rcv_ssthresh; ++ inet_csk(sk)->icsk_probes_out = si->cpt_probes_out; ++ tp->rx_opt.num_sacks = si->cpt_num_sacks; ++ tp->advmss = si->cpt_advmss; ++ inet_csk(sk)->icsk_syn_retries = si->cpt_syn_retries; ++ tp->ecn_flags = si->cpt_ecn_flags; ++ tp->prior_ssthresh = si->cpt_prior_ssthresh; ++ tp->high_seq = si->cpt_high_seq; ++ tp->retrans_stamp = si->cpt_retrans_stamp; ++ tp->undo_marker = si->cpt_undo_marker; ++ tp->undo_retrans = si->cpt_undo_retrans; ++ tp->urg_seq = si->cpt_urg_seq; ++ tp->urg_data = si->cpt_urg_data; ++ inet_csk(sk)->icsk_pending = si->cpt_pending; ++ tp->urg_mode = si->cpt_urg_mode; ++ tp->snd_up = si->cpt_snd_up; ++ tp->keepalive_time = si->cpt_keepalive_time; ++ tp->keepalive_intvl = si->cpt_keepalive_intvl; ++ tp->linger2 = si->cpt_linger2; ++ ++ sk->sk_send_head = NULL; ++ for (skb = skb_peek(&sk->sk_write_queue); ++ skb && skb != (struct sk_buff*)&sk->sk_write_queue; ++ skb = skb->next) { ++ if (!after(tp->snd_nxt, TCP_SKB_CB(skb)->seq)) { ++ sk->sk_send_head = skb; ++ break; ++ } ++ } ++ ++ if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) { ++ struct inet_sock *inet = inet_sk(sk); ++ if (inet->num == 0) { ++ cpt_object_t *lobj = NULL; ++ ++ if ((int)si->cpt_parent != -1) ++ lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); ++ ++ if (lobj && lobj->o_obj) { ++ inet->num = ntohs(inet->sport); ++ local_bh_disable(); ++ __inet_inherit_port(lobj->o_obj, sk); ++ local_bh_enable(); ++ dprintk_ctx("port inherited from parent\n"); ++ } else { ++ struct sock *lsk = find_parent(inet->sport, ctx); ++ if (lsk) { ++ inet->num = ntohs(inet->sport); ++ local_bh_disable(); ++ __inet_inherit_port(lsk, sk); ++ local_bh_enable(); ++ dprintk_ctx("port inherited\n"); ++ } else { ++ eprintk_ctx("we are kinda lost...\n"); ++ } ++ } ++ } ++ ++ sk->sk_prot->hash(sk); ++ ++ if (inet_csk(sk)->icsk_ack.pending&ICSK_ACK_TIMER) ++ sk_reset_timer(sk, &inet_csk(sk)->icsk_delack_timer, inet_csk(sk)->icsk_ack.timeout); ++ if (inet_csk(sk)->icsk_pending) ++ sk_reset_timer(sk, &inet_csk(sk)->icsk_retransmit_timer, ++ inet_csk(sk)->icsk_timeout); ++ if (sock_flag(sk, SOCK_KEEPOPEN)) { ++ unsigned long expires = jiffies_import(si->cpt_ka_timeout); ++ if (time_after(jiffies, expires)) ++ expires = jiffies + HZ; ++ sk_reset_timer(sk, &sk->sk_timer, expires); ++ } ++ } ++ ++ return 0; ++} ++ ++ ++int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *sk, ++ struct cpt_context *ctx) ++{ ++ struct inet_sock *inet = inet_sk(sk); 
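[editor's note] After refilling the write queue, the code above re-derives sk_send_head instead of checkpointing it: the send head is simply the first queued segment that has not been sent yet, i.e. the first whose seq is not before snd_nxt, using a wraparound-safe comparison. A sketch of that selection over a plain array, with the kernel's after() idiom:

#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe "s1 is after s2" for 32-bit TCP sequence numbers,
 * the same idea as the kernel's after() macro. */
static int seq_after(uint32_t s1, uint32_t s2)
{
	return (int32_t)(s1 - s2) > 0;
}

/* Return the index of the first segment not yet sent -- the first one
 * whose seq is not before snd_nxt -- i.e. the restored sk_send_head. */
static int find_send_head(const uint32_t *seq, int n, uint32_t snd_nxt)
{
	int i;

	for (i = 0; i < n; i++)
		if (!seq_after(snd_nxt, seq[i]))
			return i;
	return -1;			/* everything already sent */
}

int main(void)
{
	uint32_t seqs[] = { 1000, 1500, 2000, 2500 };

	printf("%d\n", find_send_head(seqs, 4, 2000));	/* -> 2 */
	return 0;
}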
++ struct net *net = get_exec_env()->ve_ns->net_ns; ++ ++ lock_sock(sk); ++ ++ sk->sk_state = si->cpt_state; ++ ++ inet->daddr = si->cpt_daddr; ++ inet->dport = si->cpt_dport; ++ inet->saddr = si->cpt_saddr; ++ inet->rcv_saddr = si->cpt_rcv_saddr; ++ inet->sport = si->cpt_sport; ++ inet->uc_ttl = si->cpt_uc_ttl; ++ inet->tos = si->cpt_tos; ++ inet->cmsg_flags = si->cpt_cmsg_flags; ++ inet->mc_index = si->cpt_mc_index; ++ inet->mc_addr = si->cpt_mc_addr; ++ inet->hdrincl = si->cpt_hdrincl; ++ inet->mc_ttl = si->cpt_mc_ttl; ++ inet->mc_loop = si->cpt_mc_loop; ++ inet->pmtudisc = si->cpt_pmtudisc; ++ inet->recverr = si->cpt_recverr; ++ inet->freebind = si->cpt_freebind; ++ inet->id = si->cpt_idcounter; ++ ++ inet->cork.flags = si->cpt_cork_flags; ++ inet->cork.fragsize = si->cpt_cork_fragsize; ++ inet->cork.length = si->cpt_cork_length; ++ inet->cork.addr = si->cpt_cork_addr; ++ inet->cork.fl.fl4_src = si->cpt_cork_saddr; ++ inet->cork.fl.fl4_dst = si->cpt_cork_daddr; ++ inet->cork.fl.oif = si->cpt_cork_oif; ++ if (inet->cork.fragsize) { ++ if (ip_route_output_key(net, (struct rtable **)&inet->cork.dst, &inet->cork.fl)) { ++ eprintk_ctx("failed to restore cork route\n"); ++ inet->cork.fragsize = 0; ++ } ++ } ++ ++ if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) { ++ struct udp_sock *up = udp_sk(sk); ++ up->pending = si->cpt_udp_pending; ++ up->corkflag = si->cpt_udp_corkflag; ++ up->encap_type = si->cpt_udp_encap; ++ up->len = si->cpt_udp_len; ++ } ++ ++ if (sk->sk_family == AF_INET6) { ++ struct ipv6_pinfo *np = inet6_sk(sk); ++ ++ memcpy(&np->saddr, si->cpt_saddr6, 16); ++ memcpy(&np->rcv_saddr, si->cpt_rcv_saddr6, 16); ++ memcpy(&np->daddr, si->cpt_daddr6, 16); ++ np->flow_label = si->cpt_flow_label6; ++ np->frag_size = si->cpt_frag_size6; ++ np->hop_limit = si->cpt_hop_limit6; ++ np->mcast_hops = si->cpt_mcast_hops6; ++ np->mcast_oif = si->cpt_mcast_oif6; ++ np->rxopt.all = si->cpt_rxopt6; ++ np->mc_loop = si->cpt_mc_loop6; ++ np->recverr = si->cpt_recverr6; ++ np->sndflow = si->cpt_sndflow6; ++ np->pmtudisc = si->cpt_pmtudisc6; ++ np->ipv6only = si->cpt_ipv6only6; ++ ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ if (si->cpt_mapped) { ++ extern struct inet_connection_sock_af_ops ipv6_mapped; ++ if (sk->sk_type == SOCK_STREAM && ++ sk->sk_protocol == IPPROTO_TCP) { ++ inet_csk(sk)->icsk_af_ops = &ipv6_mapped; ++ sk->sk_backlog_rcv = tcp_v4_do_rcv; ++ } ++ } ++#endif ++ } ++ ++ restore_queues(sk, si, pos, ctx); ++ ++ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) ++ rst_socket_tcp(si, pos, sk, ctx); ++ ++ release_sock(sk); ++ return 0; ++} ++ ++int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *ctx) ++{ ++ struct request_sock *req; ++ ++ if (lsk->sk_state != TCP_LISTEN) ++ return -EINVAL; ++ ++ req = reqsk_alloc(&tcp_request_sock_ops); ++ if (!req) ++ return -ENOMEM; ++ ++ sk->sk_socket = NULL; ++ sk->sk_sleep = NULL; ++ inet_csk_reqsk_queue_add(lsk, req, sk); ++ return 0; ++} ++ ++int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, ++ loff_t pos, struct cpt_context *ctx) ++{ ++ int err; ++ loff_t end = si->cpt_next; ++ ++ pos += si->cpt_hdrlen; ++ while (pos < end) { ++ struct cpt_openreq_image oi; ++ ++ err = rst_get_object(CPT_OBJ_OPENREQ, pos, &oi, ctx); ++ if (err) { ++ err = rst_sock_attr(&pos, sk, ctx); ++ if (err) ++ return err; ++ continue; ++ } ++ ++ if (oi.cpt_object == CPT_OBJ_OPENREQ) { ++ struct request_sock *req = reqsk_alloc(&tcp_request_sock_ops); ++ if (req == NULL) ++ return 
-ENOMEM; ++ ++ memset(req, 0, sizeof(*req)); ++ tcp_rsk(req)->rcv_isn = oi.cpt_rcv_isn; ++ tcp_rsk(req)->snt_isn = oi.cpt_snt_isn; ++ inet_rsk(req)->rmt_port = oi.cpt_rmt_port; ++ req->mss = oi.cpt_mss; ++ req->retrans = oi.cpt_retrans; ++ inet_rsk(req)->snd_wscale = oi.cpt_snd_wscale; ++ inet_rsk(req)->rcv_wscale = oi.cpt_rcv_wscale; ++ inet_rsk(req)->tstamp_ok = oi.cpt_tstamp_ok; ++ inet_rsk(req)->sack_ok = oi.cpt_sack_ok; ++ inet_rsk(req)->wscale_ok = oi.cpt_wscale_ok; ++ inet_rsk(req)->ecn_ok = oi.cpt_ecn_ok; ++ inet_rsk(req)->acked = oi.cpt_acked; ++ req->window_clamp = oi.cpt_window_clamp; ++ req->rcv_wnd = oi.cpt_rcv_wnd; ++ req->ts_recent = oi.cpt_ts_recent; ++ req->expires = jiffies_import(oi.cpt_expires); ++ ++ if (oi.cpt_family == AF_INET) { ++ memcpy(&inet_rsk(req)->loc_addr, oi.cpt_loc_addr, 4); ++ memcpy(&inet_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 4); ++ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); ++ } else { ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ memcpy(&inet6_rsk(req)->loc_addr, oi.cpt_loc_addr, 16); ++ memcpy(&inet6_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 16); ++ inet6_rsk(req)->iif = oi.cpt_iif; ++ inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); ++#endif ++ } ++ } ++ pos += oi.cpt_next; ++ } ++ return 0; ++} ++ ++int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v, ++ loff_t pos, cpt_context_t *ctx) ++{ ++ struct ip_mreqn imr; ++ ++ if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) { ++ eprintk_ctx("IGMPv3 is still not supported\n"); ++ return -EINVAL; ++ } ++ ++ memset(&imr, 0, sizeof(imr)); ++ imr.imr_ifindex = v->cpt_ifindex; ++ imr.imr_multiaddr.s_addr = v->cpt_mcaddr[0]; ++ return ip_mc_join_group(sk, &imr); ++} ++ ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v, ++ loff_t pos, cpt_context_t *ctx) ++{ ++ ++ if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) { ++ eprintk_ctx("IGMPv3 is still not supported\n"); ++ return -EINVAL; ++ } ++ ++ return ipv6_sock_mc_join(sk, v->cpt_ifindex, ++ (struct in6_addr*)v->cpt_mcaddr); ++} ++#endif +diff --git a/kernel/cpt/rst_sysvipc.c b/kernel/cpt/rst_sysvipc.c +new file mode 100644 +index 0000000..8803de5 +--- /dev/null ++++ b/kernel/cpt/rst_sysvipc.c +@@ -0,0 +1,636 @@ ++/* ++ * ++ * kernel/cpt/rst_sysvipc.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_kernel.h" ++ ++struct _warg { ++ struct file *file; ++ struct cpt_sysvshm_image *v; ++}; ++ ++static int fixup_one_shm(struct shmid_kernel *shp, void *arg) ++{ ++ struct _warg *warg = arg; ++ ++ if (shp->shm_file != warg->file) ++ return 0; ++ if (shp->shm_nattch) ++ return -EEXIST; ++ ++ shp->shm_perm.uid = warg->v->cpt_uid; ++ shp->shm_perm.gid = warg->v->cpt_gid; ++ shp->shm_perm.cuid = warg->v->cpt_cuid; ++ shp->shm_perm.cgid = warg->v->cpt_cgid; ++ shp->shm_perm.mode = warg->v->cpt_mode; ++ ++ shp->shm_atim = warg->v->cpt_atime; ++ shp->shm_dtim = warg->v->cpt_dtime; ++ shp->shm_ctim = warg->v->cpt_ctime; ++ shp->shm_cprid = warg->v->cpt_creator; ++ shp->shm_lprid = warg->v->cpt_last; ++ ++ /* TODO: fix shp->mlock_user? 
*/ ++ return 1; ++} ++ ++static int fixup_shm(struct file *file, struct cpt_sysvshm_image *v) ++{ ++ struct _warg warg; ++ ++ warg.file = file; ++ warg.v = v; ++ ++ return sysvipc_walk_shm(fixup_one_shm, &warg); ++} ++ ++static int fixup_shm_data(struct file *file, loff_t pos, loff_t end, ++ struct cpt_context *ctx) ++{ ++ struct cpt_page_block pgb; ++ ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos); ++ ++ do_write = file->f_dentry->d_inode->i_fop->write; ++ if (do_write == NULL) { ++ eprintk_ctx("No TMPFS? Cannot restore content of SYSV SHM\n"); ++ return -EINVAL; ++ } ++ ++ while (pos < end) { ++ loff_t opos; ++ loff_t ipos; ++ int count; ++ int err; ++ ++ err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx); ++ if (err) ++ return err; ++ dprintk_ctx("restoring SHM block: %08x-%08x\n", ++ (__u32)pgb.cpt_start, (__u32)pgb.cpt_end); ++ ipos = pos + pgb.cpt_hdrlen; ++ opos = pgb.cpt_start; ++ count = pgb.cpt_end-pgb.cpt_start; ++ while (count > 0) { ++ mm_segment_t oldfs; ++ int copy = count; ++ ++ if (copy > PAGE_SIZE) ++ copy = PAGE_SIZE; ++ (void)cpt_get_buf(ctx); ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos); ++ set_fs(oldfs); ++ if (err) { ++ __cpt_release_buf(ctx); ++ return err; ++ } ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ ipos += copy; ++ err = do_write(file, ctx->tmpbuf, copy, &opos); ++ set_fs(oldfs); ++ __cpt_release_buf(ctx); ++ if (err != copy) { ++ eprintk_ctx("write() failure\n"); ++ if (err >= 0) ++ err = -EIO; ++ return err; ++ } ++ count -= copy; ++ } ++ pos += pgb.cpt_next; ++ } ++ return 0; ++} ++ ++struct file * rst_sysv_shm_itself(loff_t pos, struct cpt_context *ctx) ++{ ++ struct file *file; ++ int err; ++ loff_t dpos, epos; ++ union { ++ struct cpt_file_image fi; ++ struct cpt_sysvshm_image shmi; ++ struct cpt_inode_image ii; ++ } u; ++ ++ err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx); ++ if (err < 0) ++ goto err_out; ++ pos = u.fi.cpt_inode; ++ err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx); ++ if (err < 0) ++ goto err_out; ++ dpos = pos + u.ii.cpt_hdrlen; ++ epos = pos + u.ii.cpt_next; ++ err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx); ++ if (err < 0) ++ goto err_out; ++ dpos += u.shmi.cpt_next; ++ ++ file = sysvipc_setup_shm(u.shmi.cpt_key, u.shmi.cpt_id, ++ u.shmi.cpt_segsz, u.shmi.cpt_mode); ++ if (!IS_ERR(file)) { ++ err = fixup_shm(file, &u.shmi); ++ if (err != -EEXIST && dpos < epos) ++ err = fixup_shm_data(file, dpos, epos, ctx); ++ } else if (IS_ERR(file) && PTR_ERR(file) == -EEXIST) { ++ struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; ++ struct shmid_kernel *shp; ++ ++ shp = shm_lock(ipc_ns, u.shmi.cpt_id); ++ BUG_ON(IS_ERR(shp)); ++ get_file(shp->shm_file); ++ file = shp->shm_file; ++ shm_unlock(shp); ++ } ++ return file; ++ ++err_out: ++ return ERR_PTR(err); ++} ++ ++struct file * rst_sysv_shm_vma(struct cpt_vma_image *vmai, struct cpt_context *ctx) ++{ ++ struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; ++ struct file *file; ++ union { ++ struct cpt_file_image fi; ++ struct cpt_inode_image ii; ++ struct cpt_sysvshm_image shmi; ++ } u; ++ struct shmid_kernel *shp; ++ struct shm_file_data *sfd; ++ struct path path; ++ mode_t f_mode; ++ loff_t pos; ++ int err; ++ ++ pos = vmai->cpt_file; ++ file = rst_sysv_shm_itself(pos, ctx); ++ if (IS_ERR(file) && PTR_ERR(file) != -EEXIST) ++ return file; ++ fput(file); ++ ++ err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx); ++ if (err < 0) ++ goto err_out; ++ pos = u.fi.cpt_inode; ++ 
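[editor's note] fixup_shm_data() above streams the saved segment back through the tmpfs file's write op in page-sized chunks, tracking independent input (image) and output (segment) offsets. A userspace rendition of the same copy loop, as an illustration (pread/pwrite stand in for ctx->pread and the set_fs(KERNEL_DS) in-kernel write; error handling trimmed):

#include <sys/types.h>
#include <unistd.h>

#define CHUNK 4096		/* PAGE_SIZE in the kernel code */

/* Copy [start, end) of a checkpointed SysV SHM segment from the image
 * file back into the (tmpfs-backed) segment file, chunk by chunk. */
static int restore_shm_block(int img_fd, off_t ipos,
			     int shm_fd, off_t start, off_t end)
{
	char buf[CHUNK];
	off_t opos = start;

	while (opos < end) {
		ssize_t copy = end - opos;

		if (copy > CHUNK)
			copy = CHUNK;
		if (pread(img_fd, buf, copy, ipos) != copy)
			return -1;
		if (pwrite(shm_fd, buf, copy, opos) != copy)
			return -1;
		ipos += copy;
		opos += copy;
	}
	return 0;
}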
err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx); ++ if (err < 0) ++ goto err_out; ++ err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx); ++ if (err < 0) ++ goto err_out; ++ ++ shp = shm_lock(ipc_ns, u.shmi.cpt_id); ++ BUG_ON(IS_ERR(shp)); ++ path.dentry = dget(shp->shm_file->f_path.dentry); ++ path.mnt = shp->shm_file->f_path.mnt; ++ shm_unlock(shp); ++ ++ err = -ENOMEM; ++ sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); ++ if (!sfd) ++ goto out_put_dentry; ++ ++ f_mode = 0; ++ if (vmai->cpt_flags & VM_READ) ++ f_mode |= FMODE_READ; ++ if (vmai->cpt_flags & VM_WRITE) ++ f_mode |= FMODE_WRITE; ++ if (vmai->cpt_flags & VM_EXEC) ++ f_mode |= FMODE_EXEC; ++ ++ err = -ENOMEM; ++ file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations); ++ if (!file) ++ goto out_free; ++ ++ file->private_data = sfd; ++ file->f_mapping = shp->shm_file->f_mapping; ++ sfd->id = shp->shm_perm.id; ++ sfd->ns = get_ipc_ns(ipc_ns); ++ sfd->file = shp->shm_file; ++ sfd->vm_ops = NULL; ++ ++ return file; ++ ++out_free: ++ kfree(sfd); ++out_put_dentry: ++ dput(path.dentry); ++err_out: ++ return ERR_PTR(err); ++} ++ ++static int attach_one_undo(int semid, struct sem_array *sma, void *arg) ++{ ++ struct sem_undo *su = arg; ++ struct sem_undo_list *undo_list = current->sysvsem.undo_list; ++ ++ if (semid != su->semid) ++ return 0; ++ ++ su->proc_next = undo_list->proc_list; ++ undo_list->proc_list = su; ++ ++ su->id_next = sma->undo; ++ sma->undo = su; ++ ++ return 1; ++} ++ ++static int attach_undo(struct sem_undo *su) ++{ ++ return sysvipc_walk_sem(attach_one_undo, su); ++} ++ ++static int do_rst_semundo(struct cpt_object_hdr *sui, loff_t pos, struct cpt_context *ctx) ++{ ++ int err; ++ struct sem_undo_list *undo_list; ++ ++ if (current->sysvsem.undo_list) { ++ eprintk_ctx("Funny undo_list\n"); ++ return 0; ++ } ++ ++ undo_list = kzalloc(sizeof(struct sem_undo_list), GFP_KERNEL_UBC); ++ if (undo_list == NULL) ++ return -ENOMEM; ++ ++ atomic_set(&undo_list->refcnt, 1); ++ spin_lock_init(&undo_list->lock); ++ current->sysvsem.undo_list = undo_list; ++ ++ if (sui->cpt_next > sui->cpt_hdrlen) { ++ loff_t offset = pos + sui->cpt_hdrlen; ++ do { ++ struct sem_undo *new; ++ struct cpt_sysvsem_undo_image spi; ++ err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO_REC, offset, &spi, ctx); ++ if (err) ++ goto out; ++ new = kmalloc(sizeof(struct sem_undo) + ++ sizeof(short)*spi.cpt_nsem, ++ GFP_KERNEL_UBC); ++ if (!new) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ memset(new, 0, sizeof(struct sem_undo) + sizeof(short)*spi.cpt_nsem); ++ new->semadj = (short *) &new[1]; ++ new->semid = spi.cpt_id; ++ err = ctx->pread(new->semadj, spi.cpt_nsem*sizeof(short), ctx, offset + spi.cpt_hdrlen); ++ if (err) { ++ kfree(new); ++ goto out; ++ } ++ err = attach_undo(new); ++ if (err <= 0) { ++ if (err == 0) ++ err = -ENOENT; ++ kfree(new); ++ goto out; ++ } ++ offset += spi.cpt_next; ++ } while (offset < pos + sui->cpt_next); ++ } ++ err = 0; ++ ++out: ++ return err; ++} ++ ++__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ __u32 flag = 0; ++ ++#if 0 ++ if (ti->cpt_sysvsem_undo == CPT_NULL || ++ lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo)) ++ flag |= CLONE_SYSVSEM; ++#endif ++ return flag; ++} ++ ++int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ int err; ++ struct sem_undo_list *f = current->sysvsem.undo_list; ++ cpt_object_t *obj; ++ struct cpt_object_hdr sui; ++ ++ if (ti->cpt_sysvsem_undo == CPT_NULL) { ++ exit_sem(current); ++ 
return 0; ++ } ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, ctx); ++ if (obj) { ++ if (obj->o_obj != f) { ++ exit_sem(current); ++ f = obj->o_obj; ++ atomic_inc(&f->refcnt); ++ current->sysvsem.undo_list = f; ++ } ++ return 0; ++ } ++ ++ if ((err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, &sui, ctx)) != 0) ++ goto out; ++ ++ if ((err = do_rst_semundo(&sui, ti->cpt_sysvsem_undo, ctx)) != 0) ++ goto out; ++ ++ err = -ENOMEM; ++ obj = cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, f, ctx); ++ if (obj) { ++ err = 0; ++ cpt_obj_setpos(obj, ti->cpt_sysvsem_undo, ctx); ++ } ++ ++ return 0; ++ ++out: ++ return err; ++} ++ ++struct _sarg { ++ int semid; ++ struct cpt_sysvsem_image *v; ++ __u32 *arr; ++}; ++ ++static int fixup_one_sem(int semid, struct sem_array *sma, void *arg) ++{ ++ struct _sarg *warg = arg; ++ ++ if (semid != warg->semid) ++ return 0; ++ ++ sma->sem_perm.uid = warg->v->cpt_uid; ++ sma->sem_perm.gid = warg->v->cpt_gid; ++ sma->sem_perm.cuid = warg->v->cpt_cuid; ++ sma->sem_perm.cgid = warg->v->cpt_cgid; ++ sma->sem_perm.mode = warg->v->cpt_mode; ++ sma->sem_perm.seq = warg->v->cpt_seq; ++ ++ sma->sem_ctime = warg->v->cpt_ctime; ++ sma->sem_otime = warg->v->cpt_otime; ++ memcpy(sma->sem_base, warg->arr, sma->sem_nsems*8); ++ return 1; ++} ++ ++static int fixup_sem(int semid, struct cpt_sysvsem_image *v, __u32 *arr) ++{ ++ struct _sarg warg; ++ ++ warg.semid = semid; ++ warg.v = v; ++ warg.arr = arr; ++ ++ return sysvipc_walk_sem(fixup_one_sem, &warg); ++} ++ ++ ++static int restore_sem(loff_t pos, struct cpt_sysvsem_image *si, ++ struct cpt_context *ctx) ++{ ++ int err; ++ __u32 *arr; ++ int nsems = (si->cpt_next - si->cpt_hdrlen)/8; ++ ++ arr = kmalloc(nsems*8, GFP_KERNEL); ++ if (!arr) ++ return -ENOMEM; ++ ++ err = ctx->pread(arr, nsems*8, ctx, pos+si->cpt_hdrlen); ++ if (err) ++ goto out; ++ err = sysvipc_setup_sem(si->cpt_key, si->cpt_id, nsems, si->cpt_mode); ++ if (err < 0) { ++ eprintk_ctx("SEM 3\n"); ++ goto out; ++ } ++ err = fixup_sem(si->cpt_id, si, arr); ++ if (err == 0) ++ err = -ESRCH; ++ if (err > 0) ++ err = 0; ++out: ++ kfree(arr); ++ return err; ++} ++ ++static int rst_sysv_sem(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_SYSV_SEM]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_sysvsem_image sbuf; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_SYSV_SEM || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ int err; ++ err = rst_get_object(CPT_OBJ_SYSV_SEM, sec, &sbuf, ctx); ++ if (err) ++ return err; ++ err = restore_sem(sec, &sbuf, ctx); ++ if (err) ++ return err; ++ sec += sbuf.cpt_next; ++ } ++ return 0; ++} ++ ++struct _marg { ++ int msqid; ++ struct cpt_sysvmsg_image *v; ++ struct msg_queue *m; ++}; ++ ++static int fixup_one_msg(int msqid, struct msg_queue *msq, void *arg) ++{ ++ struct _marg *warg = arg; ++ ++ if (msqid != warg->msqid) ++ return 0; ++ ++ msq->q_perm.uid = warg->v->cpt_uid; ++ msq->q_perm.gid = warg->v->cpt_gid; ++ msq->q_perm.cuid = warg->v->cpt_cuid; ++ msq->q_perm.cgid = warg->v->cpt_cgid; ++ msq->q_perm.mode = warg->v->cpt_mode; ++ msq->q_perm.seq = warg->v->cpt_seq; ++ ++ msq->q_stime = warg->v->cpt_stime; ++ msq->q_rtime = warg->v->cpt_rtime; ++ msq->q_ctime = warg->v->cpt_ctime; ++ msq->q_lspid = warg->v->cpt_last_sender; ++ msq->q_lrpid = warg->v->cpt_last_receiver; 
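[editor's note] fixup_one_sem() above and fixup_one_msg() below both follow the sysvipc_walk_* callback contract: return 0 for a non-matching id so the walk continues, apply the checkpointed attributes and return 1 on the match, negative to abort; the callers then turn a 0 result into -ESRCH/-ENOENT ("nothing matched"). A generic sketch of that contract (all names invented):

struct ipc_obj { int id; int uid; };

/* Walker contract used by sysvipc_walk_sem/shm/msg in this patch:
 * 0 = keep walking, 1 = matched (stop), <0 = error (stop). */
static int walk(struct ipc_obj *objs, int n,
		int (*cb)(struct ipc_obj *, void *), void *arg)
{
	int i, err;

	for (i = 0; i < n; i++) {
		err = cb(&objs[i], arg);
		if (err)
			return err;
	}
	return 0;			/* no object matched */
}

struct fixup_arg { int id; int new_uid; };

static int fixup_one(struct ipc_obj *obj, void *arg)
{
	struct fixup_arg *w = arg;

	if (obj->id != w->id)
		return 0;		/* not ours, continue */
	obj->uid = w->new_uid;		/* apply checkpointed attributes */
	return 1;			/* found and fixed up */
}

int main(void)
{
	struct ipc_obj objs[] = { { 10, 0 }, { 20, 0 } };
	struct fixup_arg w = { .id = 20, .new_uid = 500 };

	return walk(objs, 2, fixup_one, &w) == 1 ? 0 : 1;
}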
++ msq->q_qbytes = warg->v->cpt_qbytes; ++ ++ warg->m = msq; ++ return 1; ++} ++ ++struct _larg ++{ ++ cpt_context_t * ctx; ++ loff_t pos; ++}; ++ ++static int do_load_msg(void * dst, int len, int offset, void * data) ++{ ++ struct _larg * arg = data; ++ return arg->ctx->pread(dst, len, arg->ctx, arg->pos + offset); ++} ++ ++static int fixup_msg(int msqid, struct cpt_sysvmsg_image *v, loff_t pos, ++ cpt_context_t * ctx) ++{ ++ int err; ++ struct _marg warg; ++ loff_t endpos = pos + v->cpt_next; ++ struct ipc_namespace *ns = current->nsproxy->ipc_ns; ++ ++ pos += v->cpt_hdrlen; ++ ++ warg.msqid = msqid; ++ warg.v = v; ++ ++ err = sysvipc_walk_msg(fixup_one_msg, &warg); ++ if (err <= 0) ++ return err; ++ ++ while (pos < endpos) { ++ struct cpt_sysvmsg_msg_image mi; ++ struct msg_msg *m; ++ struct _larg data = { ++ .ctx = ctx ++ }; ++ ++ err = rst_get_object(CPT_OBJ_SYSVMSG_MSG, pos, &mi, ctx); ++ if (err) ++ return err; ++ data.pos = pos + mi.cpt_hdrlen; ++ m = sysv_msg_load(do_load_msg, mi.cpt_size, &data); ++ if (IS_ERR(m)) ++ return PTR_ERR(m); ++ m->m_type = mi.cpt_type; ++ m->m_ts = mi.cpt_size; ++ list_add_tail(&m->m_list, &warg.m->q_messages); ++ warg.m->q_cbytes += m->m_ts; ++ warg.m->q_qnum++; ++ atomic_add(m->m_ts, &ns->msg_bytes); ++ atomic_inc(&ns->msg_hdrs); ++ ++ pos += mi.cpt_next; ++ } ++ return 1; ++} ++ ++static int restore_msg(loff_t pos, struct cpt_sysvmsg_image *si, ++ struct cpt_context *ctx) ++{ ++ int err; ++ ++ err = sysvipc_setup_msg(si->cpt_key, si->cpt_id, si->cpt_mode); ++ if (err < 0) { ++ eprintk_ctx("MSG 3\n"); ++ goto out; ++ } ++ err = fixup_msg(si->cpt_id, si, pos, ctx); ++ if (err == 0) ++ err = -ESRCH; ++ if (err > 0) ++ err = 0; ++out: ++ return err; ++} ++ ++static int rst_sysv_msg(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_SYSV_MSG]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_sysvmsg_image sbuf; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_SYSV_MSG || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ int err; ++ err = rst_get_object(CPT_OBJ_SYSVMSG, sec, &sbuf, ctx); ++ if (err) ++ return err; ++ err = restore_msg(sec, &sbuf, ctx); ++ if (err) ++ return err; ++ sec += sbuf.cpt_next; ++ } ++ return 0; ++} ++ ++ ++int rst_sysv_ipc(struct cpt_context *ctx) ++{ ++ int err; ++ ++ err = rst_sysv_sem(ctx); ++ if (!err) ++ err = rst_sysv_msg(ctx); ++ ++ return err; ++} +diff --git a/kernel/cpt/rst_tty.c b/kernel/cpt/rst_tty.c +new file mode 100644 +index 0000000..48bc4ce +--- /dev/null ++++ b/kernel/cpt/rst_tty.c +@@ -0,0 +1,384 @@ ++/* ++ * ++ * kernel/cpt/rst_tty.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
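[editor's note] sysv_msg_load() above is handed do_load_msg() plus a struct _larg closure so the generic IPC code can pull message bytes without knowing anything about checkpoint images. The idiom in miniature (userspace, invented names; a string buffer stands in for the image):

#include <stdio.h>
#include <string.h>

/* Closure passed through the void* argument, as struct _larg is. */
struct load_arg {
	const char *image;	/* stands in for cpt_context_t */
	size_t pos;		/* base offset of the message body */
};

static int do_load(void *dst, int len, int offset, void *data)
{
	struct load_arg *arg = data;

	memcpy(dst, arg->image + arg->pos + offset, len);
	return 0;
}

int main(void)
{
	struct load_arg arg = { .image = "hello, queue", .pos = 7 };
	char out[6] = { 0 };

	do_load(out, 5, 0, &arg);	/* reads "queue" */
	return puts(out) >= 0 ? 0 : 1;
}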
++ *
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_process.h"
++#include "cpt_files.h"
++#include "cpt_kernel.h"
++
++static int pty_setup(struct tty_struct *stty, loff_t pos,
++		     struct cpt_tty_image *pi, struct cpt_context *ctx)
++{
++	unsigned long flags;
++
++	stty->pgrp = NULL;
++	stty->session = NULL;
++	stty->packet = pi->cpt_packet;
++	stty->stopped = pi->cpt_stopped;
++	stty->hw_stopped = pi->cpt_hw_stopped;
++	stty->flow_stopped = pi->cpt_flow_stopped;
++#define DONOT_CHANGE ((1<<TTY_EXCLUSIVE)|(1<<TTY_DEBUG)|(1<<TTY_DO_WRITE_WAKEUP))
++	flags = stty->flags & DONOT_CHANGE;
++	stty->flags = flags | (pi->cpt_flags & ~DONOT_CHANGE);
++	stty->ctrl_status = pi->cpt_ctrl_status;
++	stty->winsize.ws_row = pi->cpt_ws_row;
++	stty->winsize.ws_col = pi->cpt_ws_col;
++	stty->winsize.ws_ypixel = pi->cpt_ws_prow;
++	stty->winsize.ws_xpixel = pi->cpt_ws_pcol;
++	stty->canon_column = pi->cpt_canon_column;
++	stty->column = pi->cpt_column;
++	stty->raw = pi->cpt_raw;
++	stty->real_raw = pi->cpt_real_raw;
++	stty->erasing = pi->cpt_erasing;
++	stty->lnext = pi->cpt_lnext;
++	stty->icanon = pi->cpt_icanon;
++	stty->closing = pi->cpt_closing;
++	stty->minimum_to_wake = pi->cpt_minimum_to_wake;
++
++	stty->termios->c_iflag = pi->cpt_c_iflag;
++	stty->termios->c_oflag = pi->cpt_c_oflag;
++	stty->termios->c_lflag = pi->cpt_c_lflag;
++	stty->termios->c_cflag = pi->cpt_c_cflag;
++	memcpy(&stty->termios->c_cc, &pi->cpt_c_cc, NCCS);
++	memcpy(stty->read_flags, pi->cpt_read_flags, sizeof(stty->read_flags));
++
++	if (pi->cpt_next > pi->cpt_hdrlen) {
++		int err;
++		struct cpt_obj_bits b;
++		err = rst_get_object(CPT_OBJ_BITS, pos + pi->cpt_hdrlen, &b, ctx);
++		if (err)
++			return err;
++		if (b.cpt_size == 0)
++			return 0;
++		err = ctx->pread(stty->read_buf, b.cpt_size, ctx, pos + pi->cpt_hdrlen + b.cpt_hdrlen);
++		if (err)
++			return err;
++
++		spin_lock_irq(&stty->read_lock);
++		stty->read_tail = 0;
++		stty->read_cnt = b.cpt_size;
++		stty->read_head = b.cpt_size;
++		stty->canon_head = stty->read_tail + pi->cpt_canon_head;
++		stty->canon_data = pi->cpt_canon_data;
++		spin_unlock_irq(&stty->read_lock);
++	}
++
++	return 0;
++}
++
++/* Find slave/master tty in image, when we already know master/slave.
++ * It might be optimized, of course.
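++ * As it stands, every call rescans the whole CPT_SECT_TTY section
++ * from its header, so restoring N pty pairs costs O(N^2) image reads.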
*/ ++static loff_t find_pty_pair(struct tty_struct *stty, loff_t pos, struct cpt_tty_image *pi, struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_TTY]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_tty_image *pibuf; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return CPT_NULL; ++ if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h)) ++ return CPT_NULL; ++ pibuf = kmalloc(sizeof(*pibuf), GFP_KERNEL); ++ if (pibuf == NULL) { ++ eprintk_ctx("cannot allocate buffer\n"); ++ return CPT_NULL; ++ } ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) ++ return CPT_NULL; ++ if (pibuf->cpt_index == pi->cpt_index && ++ !((pi->cpt_drv_flags^pibuf->cpt_drv_flags)&TTY_DRIVER_DEVPTS_MEM) && ++ pos != sec) { ++ pty_setup(stty, sec, pibuf, ctx); ++ return sec; ++ } ++ sec += pibuf->cpt_next; ++ } ++ kfree(pibuf); ++ return CPT_NULL; ++} ++ ++static int fixup_tty_attrs(struct cpt_inode_image *ii, struct file *master, ++ struct cpt_context *ctx) ++{ ++ int err; ++ struct iattr newattrs; ++ struct dentry *d = master->f_dentry; ++ ++ newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; ++ newattrs.ia_uid = ii->cpt_uid; ++ newattrs.ia_gid = ii->cpt_gid; ++ newattrs.ia_mode = ii->cpt_mode; ++ ++ mutex_lock(&d->d_inode->i_mutex); ++ err = notify_change(d, &newattrs); ++ mutex_unlock(&d->d_inode->i_mutex); ++ ++ return err; ++} ++ ++/* NOTE: "portable", but ugly thing. To allocate /dev/pts/N, we open ++ * /dev/ptmx until we get pty with desired index. ++ */ ++ ++struct file *ptmx_open(int index, unsigned int flags) ++{ ++ struct file *file; ++ struct file **stack = NULL; ++ int depth = 0; ++ ++ for (;;) { ++ struct tty_struct *tty; ++ ++ file = filp_open("/dev/ptmx", flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); ++ if (IS_ERR(file)) ++ break; ++ tty = file->private_data; ++ if (tty->index == index) ++ break; ++ ++ if (depth == PAGE_SIZE/sizeof(struct file *)) { ++ fput(file); ++ file = ERR_PTR(-EBUSY); ++ break; ++ } ++ if (stack == NULL) { ++ stack = (struct file **)__get_free_page(GFP_KERNEL); ++ if (!stack) { ++ fput(file); ++ file = ERR_PTR(-ENOMEM); ++ break; ++ } ++ } ++ stack[depth] = file; ++ depth++; ++ } ++ while (depth > 0) { ++ depth--; ++ fput(stack[depth]); ++ } ++ if (stack) ++ free_page((unsigned long)stack); ++ return file; ++} ++ ++ ++struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, ++ unsigned flags, struct cpt_context *ctx) ++{ ++ int err; ++ cpt_object_t *obj; ++ struct file *master, *slave; ++ struct tty_struct *stty; ++ struct cpt_tty_image *pi; ++ static char *a = "pqrstuvwxyzabcde"; ++ static char *b = "0123456789abcdef"; ++ char pairname[16]; ++ unsigned master_flags, slave_flags; ++ ++ if (fi->cpt_priv == CPT_NULL) ++ return ERR_PTR(-EINVAL); ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, fi->cpt_priv, ctx); ++ if (obj && obj->o_parent) { ++ dprintk_ctx("obtained pty as pair to existing\n"); ++ master = obj->o_parent; ++ stty = master->private_data; ++ ++ if (stty->driver->subtype == PTY_TYPE_MASTER && ++ (stty->driver->flags&TTY_DRIVER_DEVPTS_MEM)) { ++ wprintk_ctx("cloning ptmx\n"); ++ get_file(master); ++ return master; ++ } ++ ++ master = dentry_open(dget(master->f_dentry), ++ mntget(master->f_vfsmnt), flags); ++ if (!IS_ERR(master)) { ++ stty = master->private_data; ++ if (stty->driver->subtype != PTY_TYPE_MASTER) ++ fixup_tty_attrs(ii, master, ctx); ++ } ++ return master; ++ } ++ ++ pi = cpt_get_buf(ctx); ++ err 
= rst_get_object(CPT_OBJ_TTY, fi->cpt_priv, pi, ctx); ++ if (err) { ++ cpt_release_buf(ctx); ++ return ERR_PTR(err); ++ } ++ ++ master_flags = slave_flags = 0; ++ if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) ++ master_flags = flags; ++ else ++ slave_flags = flags; ++ ++ /* ++ * Open pair master/slave. ++ */ ++ if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) { ++ master = ptmx_open(pi->cpt_index, master_flags); ++ } else { ++ sprintf(pairname, "/dev/pty%c%c", a[pi->cpt_index/16], b[pi->cpt_index%16]); ++ master = filp_open(pairname, master_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); ++ } ++ if (IS_ERR(master)) { ++ eprintk_ctx("filp_open master: %Ld %ld\n", (long long)fi->cpt_priv, PTR_ERR(master)); ++ cpt_release_buf(ctx); ++ return master; ++ } ++ stty = master->private_data; ++ clear_bit(TTY_PTY_LOCK, &stty->flags); ++ if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) ++ sprintf(pairname, "/dev/pts/%d", stty->index); ++ else ++ sprintf(pairname, "/dev/tty%c%c", a[stty->index/16], b[stty->index%16]); ++ slave = filp_open(pairname, slave_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); ++ if (IS_ERR(slave)) { ++ eprintk_ctx("filp_open slave %s: %ld\n", pairname, PTR_ERR(slave)); ++ fput(master); ++ cpt_release_buf(ctx); ++ return slave; ++ } ++ ++ if (pi->cpt_drv_subtype != PTY_TYPE_MASTER) ++ fixup_tty_attrs(ii, slave, ctx); ++ ++ cpt_object_add(CPT_OBJ_TTY, master->private_data, ctx); ++ cpt_object_add(CPT_OBJ_TTY, slave->private_data, ctx); ++ cpt_object_add(CPT_OBJ_FILE, master, ctx); ++ cpt_object_add(CPT_OBJ_FILE, slave, ctx); ++ ++ if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) { ++ loff_t pos; ++ obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx); ++ obj->o_parent = master; ++ cpt_obj_setpos(obj, fi->cpt_priv, ctx); ++ pty_setup(stty, fi->cpt_priv, pi, ctx); ++ ++ obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx); ++ obj->o_parent = slave; ++ pos = find_pty_pair(stty->link, fi->cpt_priv, pi, ctx); ++ cpt_obj_setpos(obj, pos, ctx); ++ ++ obj = lookup_cpt_object(CPT_OBJ_FILE, slave, ctx); ++ cpt_obj_setpos(obj, CPT_NULL, ctx); ++ get_file(master); ++ cpt_release_buf(ctx); ++ return master; ++ } else { ++ loff_t pos; ++ obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx); ++ obj->o_parent = slave; ++ cpt_obj_setpos(obj, fi->cpt_priv, ctx); ++ pty_setup(stty->link, fi->cpt_priv, pi, ctx); ++ ++ obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx); ++ obj->o_parent = master; ++ pos = find_pty_pair(stty, fi->cpt_priv, pi, ctx); ++ cpt_obj_setpos(obj, pos, ctx); ++ ++ obj = lookup_cpt_object(CPT_OBJ_FILE, master, ctx); ++ cpt_obj_setpos(obj, CPT_NULL, ctx); ++ get_file(slave); ++ cpt_release_buf(ctx); ++ return slave; ++ } ++} ++ ++int rst_tty_jobcontrol(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_TTY]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ cpt_object_t *obj; ++ struct cpt_tty_image *pibuf = cpt_get_buf(ctx); ++ ++ if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) { ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, sec, ctx); ++ if (obj) { ++ struct tty_struct *stty = obj->o_obj; ++ if ((int)pibuf->cpt_pgrp > 0) { ++ rcu_read_lock(); ++ stty->pgrp = get_pid(alloc_vpid_safe(pibuf->cpt_pgrp)); ++ rcu_read_unlock(); ++ if (!stty->pgrp) ++ 
dprintk_ctx("unknown tty pgrp %d\n", pibuf->cpt_pgrp); ++ } else if (pibuf->cpt_pgrp) { ++ stty->pgrp = alloc_pid(current->nsproxy->pid_ns, ++ 0); ++ if (!stty->pgrp) { ++ eprintk_ctx("cannot allocate stray tty->pgrp"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ } ++ if ((int)pibuf->cpt_session > 0) { ++ struct pid *sess; ++ ++ rcu_read_lock(); ++ sess = get_pid(alloc_vpid_safe(pibuf->cpt_session)); ++ rcu_read_unlock(); ++ if (!sess) { ++ dprintk_ctx("unknown tty session %d\n", pibuf->cpt_session); ++ } else if (!stty->session) { ++ stty->session = sess; ++ } ++ } ++ } ++ sec += pibuf->cpt_next; ++ cpt_release_buf(ctx); ++ } ++ return 0; ++} +diff --git a/kernel/cpt/rst_ubc.c b/kernel/cpt/rst_ubc.c +new file mode 100644 +index 0000000..a39ae28 +--- /dev/null ++++ b/kernel/cpt/rst_ubc.c +@@ -0,0 +1,131 @@ ++/* ++ * ++ * kernel/cpt/rst_ubc.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, pos, ctx); ++ if (obj == NULL) { ++ eprintk("RST: unknown ub @%Ld\n", (long long)pos); ++ return get_beancounter(get_exec_ub()); ++ } ++ return get_beancounter(obj->o_obj); ++} ++ ++void copy_one_ubparm(struct ubparm *from, struct ubparm *to, int bc_parm_id) ++{ ++ to[bc_parm_id].barrier = from[bc_parm_id].barrier; ++ to[bc_parm_id].limit = from[bc_parm_id].limit; ++} ++ ++void set_one_ubparm_to_max(struct ubparm *ubprm, int bc_parm_id) ++{ ++ ubprm[bc_parm_id].barrier = UB_MAXVALUE; ++ ubprm[bc_parm_id].limit = UB_MAXVALUE; ++} ++ ++static void restore_one_bc_parm(struct cpt_ubparm *dmp, struct ubparm *prm, ++ int held) ++{ ++ prm->barrier = (dmp->barrier == CPT_NULL ? UB_MAXVALUE : dmp->barrier); ++ prm->limit = (dmp->limit == CPT_NULL ? 
UB_MAXVALUE : dmp->limit); ++ if (held) ++ prm->held = dmp->held; ++ prm->maxheld = dmp->maxheld; ++ prm->minheld = dmp->minheld; ++ prm->failcnt = dmp->failcnt; ++} ++ ++static int restore_one_bc(struct cpt_beancounter_image *v, ++ cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct user_beancounter *bc; ++ cpt_object_t *pobj; ++ int i; ++ ++ if (v->cpt_parent != CPT_NULL) { ++ pobj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, v->cpt_parent, ctx); ++ if (pobj == NULL) ++ return -ESRCH; ++ bc = get_subbeancounter_byid(pobj->o_obj, v->cpt_id, 1); ++ } else { ++ bc = get_exec_ub(); ++ while (bc->parent) ++ bc = bc->parent; ++ get_beancounter(bc); ++ } ++ if (bc == NULL) ++ return -ENOMEM; ++ obj->o_obj = bc; ++ ++ if (ctx->image_version < CPT_VERSION_18 && ++ CPT_VERSION_MINOR(ctx->image_version) < 1) ++ goto out; ++ ++ for (i = 0; i < UB_RESOURCES; i++) { ++ restore_one_bc_parm(v->cpt_parms + i * 2, bc->ub_parms + i, 0); ++ restore_one_bc_parm(v->cpt_parms + i * 2 + 1, ++ bc->ub_store + i, 1); ++ } ++ ++out: ++ if (!bc->parent) ++ for (i = 0; i < UB_RESOURCES; i++) ++ copy_one_ubparm(bc->ub_parms, ctx->saved_ubc, i); ++ ++ return 0; ++} ++ ++int rst_undump_ubc(struct cpt_context *ctx) ++{ ++ loff_t start, end; ++ struct cpt_beancounter_image *v; ++ cpt_object_t *obj; ++ int err; ++ ++ err = rst_get_section(CPT_SECT_UBC, ctx, &start, &end); ++ if (err) ++ return err; ++ ++ while (start < end) { ++ v = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_UBC, start, v, ctx); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ ++ obj = alloc_cpt_object(GFP_KERNEL, ctx); ++ cpt_obj_setpos(obj, start, ctx); ++ intern_cpt_object(CPT_OBJ_UBC, obj, ctx); ++ ++ restore_one_bc(v, obj, ctx); ++ ++ cpt_release_buf(ctx); ++ start += v->cpt_next; ++ } ++ return 0; ++} ++ ++void rst_finish_ubc(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_UBC) ++ put_beancounter(obj->o_obj); ++} +diff --git a/kernel/cpt/rst_undump.c b/kernel/cpt/rst_undump.c +new file mode 100644 +index 0000000..1a002d5 +--- /dev/null ++++ b/kernel/cpt/rst_undump.c +@@ -0,0 +1,1007 @@ ++/* ++ * ++ * kernel/cpt/rst_undump.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
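++ *
++ * Top-level restore path: reads task images from the dump, recreates
++ * the process tree with kernel threads standing in for the restored
++ * tasks, and finally resumes them or kills the half-restored
++ * container on failure.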
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_X86 ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_files.h" ++#include "cpt_mm.h" ++#include "cpt_process.h" ++#include "cpt_socket.h" ++#include "cpt_net.h" ++#include "cpt_ubc.h" ++#include "cpt_kernel.h" ++ ++static int rst_utsname(cpt_context_t *ctx); ++ ++ ++struct thr_context { ++ struct completion init_complete; ++ struct completion task_done; ++ int error; ++ struct cpt_context *ctx; ++ cpt_object_t *tobj; ++}; ++ ++static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx); ++ ++static int vps_rst_veinfo(struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_veinfo_image *i; ++ struct ve_struct *ve; ++ struct timespec delta; ++ loff_t start, end; ++ struct ipc_namespace *ns; ++ ++ err = rst_get_section(CPT_SECT_VEINFO, ctx, &start, &end); ++ if (err) ++ goto out; ++ ++ i = cpt_get_buf(ctx); ++ memset(i, 0, sizeof(*i)); ++ err = rst_get_object(CPT_OBJ_VEINFO, start, i, ctx); ++ if (err) ++ goto out_rel; ++ ++ ve = get_exec_env(); ++ ns = ve->ve_ns->ipc_ns; ++ ++ /* Damn. Fatal mistake, these two values are size_t! */ ++ ns->shm_ctlall = i->shm_ctl_all ? : 0xFFFFFFFFU; ++ ns->shm_ctlmax = i->shm_ctl_max ? : 0xFFFFFFFFU; ++ ns->shm_ctlmni = i->shm_ctl_mni; ++ ++ ns->msg_ctlmax = i->msg_ctl_max; ++ ns->msg_ctlmni = i->msg_ctl_mni; ++ ns->msg_ctlmnb = i->msg_ctl_mnb; ++ ++ BUILD_BUG_ON(sizeof(ns->sem_ctls) != sizeof(i->sem_ctl_arr)); ++ ns->sem_ctls[0] = i->sem_ctl_arr[0]; ++ ns->sem_ctls[1] = i->sem_ctl_arr[1]; ++ ns->sem_ctls[2] = i->sem_ctl_arr[2]; ++ ns->sem_ctls[3] = i->sem_ctl_arr[3]; ++ ++ cpt_timespec_import(&delta, i->start_timespec_delta); ++ _set_normalized_timespec(&ve->start_timespec, ++ ve->start_timespec.tv_sec - delta.tv_sec, ++ ve->start_timespec.tv_nsec - delta.tv_nsec); ++ ve->start_jiffies -= i->start_jiffies_delta; ++ // // FIXME: what??? ++ // // ve->start_cycles -= (s64)i->start_jiffies_delta * cycles_per_jiffy; ++ ++ ctx->last_vpid = i->last_pid; ++ ++ err = 0; ++out_rel: ++ cpt_release_buf(ctx); ++out: ++ return err; ++} ++ ++static int vps_rst_reparent_root(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ int err; ++ struct env_create_param3 param; ++ ++ do_posix_clock_monotonic_gettime(&ctx->cpt_monotonic_time); ++ do_gettimespec(&ctx->delta_time); ++ ++ _set_normalized_timespec(&ctx->delta_time, ++ ctx->delta_time.tv_sec - ctx->start_time.tv_sec, ++ ctx->delta_time.tv_nsec - ctx->start_time.tv_nsec); ++ ctx->delta_nsec = (s64)ctx->delta_time.tv_sec*NSEC_PER_SEC + ctx->delta_time.tv_nsec; ++ if (ctx->delta_nsec < 0) { ++ wprintk_ctx("Wall time is behind source by %Ld ns, " ++ "time sensitive applications can misbehave\n", (long long)-ctx->delta_nsec); ++ } ++ ++ _set_normalized_timespec(&ctx->cpt_monotonic_time, ++ ctx->cpt_monotonic_time.tv_sec - ctx->delta_time.tv_sec, ++ ctx->cpt_monotonic_time.tv_nsec - ctx->delta_time.tv_nsec); ++ ++ memset(¶m, 0, sizeof(param)); ++ param.iptables_mask = ctx->iptables_mask; ++ param.feature_mask = ctx->features; ++ ++ /* feature_mask is set as required - pretend we know everything */ ++ param.known_features = (ctx->image_version < CPT_VERSION_18) ? 
++ VE_FEATURES_OLD : ~(__u64)0; ++ ++ err = real_env_create(ctx->ve_id, VE_CREATE|VE_LOCK, 2, ++ ¶m, sizeof(param)); ++ if (err < 0) ++ eprintk_ctx("real_env_create: %d\n", err); ++ ++ get_exec_env()->jiffies_fixup = ++ (ctx->delta_time.tv_sec < 0 ? ++ 0 : timespec_to_jiffies(&ctx->delta_time)) - ++ (unsigned long)(get_jiffies_64() - ctx->virt_jiffies64); ++ dprintk_ctx("JFixup %ld %Ld\n", get_exec_env()->jiffies_fixup, ++ (long long)ctx->delta_nsec); ++ return err < 0 ? err : 0; ++} ++ ++static int hook(void *arg) ++{ ++ struct thr_context *thr_ctx = arg; ++ struct cpt_context *ctx; ++ cpt_object_t *tobj; ++ struct cpt_task_image *ti; ++ int err = 0; ++ int exiting = 0; ++ ++ current->state = TASK_UNINTERRUPTIBLE; ++ complete(&thr_ctx->init_complete); ++ schedule(); ++ ++ ctx = thr_ctx->ctx; ++ tobj = thr_ctx->tobj; ++ ti = tobj->o_image; ++ ++ current->fs->umask = 0; ++ ++ if (ti->cpt_pid == 1) { ++#ifdef CONFIG_BEANCOUNTERS ++ struct user_beancounter *bc; ++#endif ++ ++ err = vps_rst_reparent_root(tobj, ctx); ++ ++ if (err) { ++ rst_report_error(err, ctx); ++ goto out; ++ } ++ ++ memcpy(&get_exec_env()->ve_cap_bset, &ti->cpt_ecap, sizeof(kernel_cap_t)); ++ ++ if (ctx->statusfile) { ++ fput(ctx->statusfile); ++ ctx->statusfile = NULL; ++ } ++ ++ if (ctx->lockfile) { ++ char b; ++ mm_segment_t oldfs; ++ err = -EINVAL; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ if (ctx->lockfile->f_op && ctx->lockfile->f_op->read) ++ err = ctx->lockfile->f_op->read(ctx->lockfile, &b, 1, &ctx->lockfile->f_pos); ++ set_fs(oldfs); ++ fput(ctx->lockfile); ++ ctx->lockfile = NULL; ++ } ++ ++ if (err) { ++ eprintk_ctx("CPT: lock fd is closed incorrectly: %d\n", err); ++ goto out; ++ } ++ err = vps_rst_veinfo(ctx); ++ if (err) { ++ eprintk_ctx("rst_veinfo: %d\n", err); ++ goto out; ++ } ++ ++ err = rst_utsname(ctx); ++ if (err) { ++ eprintk_ctx("rst_utsname: %d\n", err); ++ goto out; ++ } ++ ++ err = rst_root_namespace(ctx); ++ if (err) { ++ eprintk_ctx("rst_namespace: %d\n", err); ++ goto out; ++ } ++ ++ if ((err = rst_restore_net(ctx)) != 0) { ++ eprintk_ctx("rst_restore_net: %d\n", err); ++ goto out; ++ } ++ ++ err = rst_sockets(ctx); ++ if (err) { ++ eprintk_ctx("rst_sockets: %d\n", err); ++ goto out; ++ } ++ err = rst_sysv_ipc(ctx); ++ if (err) { ++ eprintk_ctx("rst_sysv_ipc: %d\n", err); ++ goto out; ++ } ++#ifdef CONFIG_BEANCOUNTERS ++ bc = get_exec_ub(); ++ set_one_ubparm_to_max(bc->ub_parms, UB_KMEMSIZE); ++ set_one_ubparm_to_max(bc->ub_parms, UB_NUMPROC); ++ set_one_ubparm_to_max(bc->ub_parms, UB_NUMFILE); ++ set_one_ubparm_to_max(bc->ub_parms, UB_DCACHESIZE); ++#endif ++ } ++ ++ do { ++ if (current->user->uid != ti->cpt_user) { ++ struct user_struct *u; ++ ++ u = alloc_uid(get_exec_env()->ve_ns->user_ns, ti->cpt_user); ++ if (!u) { ++ eprintk_ctx("alloc_user\n"); ++ } else { ++ switch_uid(u); ++ } ++ } ++ } while (0); ++ ++ if ((err = rst_mm_complete(ti, ctx)) != 0) { ++ eprintk_ctx("rst_mm: %d\n", err); ++ goto out; ++ } ++ ++ if ((err = rst_files_complete(ti, ctx)) != 0) { ++ eprintk_ctx("rst_files: %d\n", err); ++ goto out; ++ } ++ ++ if ((err = rst_fs_complete(ti, ctx)) != 0) { ++ eprintk_ctx("rst_fs: %d\n", err); ++ goto out; ++ } ++ ++ if ((err = rst_semundo_complete(ti, ctx)) != 0) { ++ eprintk_ctx("rst_semundo: %d\n", err); ++ goto out; ++ } ++ ++ if ((err = rst_signal_complete(ti, &exiting, ctx)) != 0) { ++ eprintk_ctx("rst_signal: %d\n", err); ++ goto out; ++ } ++ ++ if (ti->cpt_personality != 0) ++ __set_personality(ti->cpt_personality); ++ ++#ifdef CONFIG_X86_64 ++ /* 32bit app from 
32bit OS, won't have PER_LINUX32 set... :/ */ ++ if (!ti->cpt_64bit) ++ __set_personality(PER_LINUX32); ++#endif ++ ++ current->set_child_tid = NULL; ++ current->clear_child_tid = NULL; ++ current->flags &= ~(PF_FORKNOEXEC|PF_SUPERPRIV); ++ current->flags |= ti->cpt_flags&(PF_FORKNOEXEC|PF_SUPERPRIV); ++ current->exit_code = ti->cpt_exit_code; ++ current->pdeath_signal = ti->cpt_pdeath_signal; ++ ++ if (ti->cpt_restart.fn != CPT_RBL_0) { ++ if (ti->cpt_restart.fn == CPT_RBL_NANOSLEEP ++#ifdef CONFIG_COMPAT ++ || ti->cpt_restart.fn == CPT_RBL_COMPAT_NANOSLEEP ++#endif ++ ) { ++ struct restart_block *rb; ++ ktime_t e; ++ ++ e.tv64 = 0; ++ ++ if (ctx->image_version >= CPT_VERSION_20) ++ e = ktime_add_ns(e, ti->cpt_restart.arg2); ++ else if (ctx->image_version >= CPT_VERSION_9) ++ e = ktime_add_ns(e, ti->cpt_restart.arg0); ++ else ++ e = ktime_add_ns(e, ti->cpt_restart.arg0*TICK_NSEC); ++ if (e.tv64 < 0) ++ e.tv64 = TICK_NSEC; ++ e = ktime_add(e, timespec_to_ktime(ctx->cpt_monotonic_time)); ++ ++ rb = &task_thread_info(current)->restart_block; ++ if (ti->cpt_restart.fn == CPT_RBL_NANOSLEEP) ++ rb->fn = hrtimer_nanosleep_restart; ++#ifdef CONFIG_COMPAT ++ else ++ rb->fn = compat_nanosleep_restart; ++#endif ++ if (ctx->image_version >= CPT_VERSION_20) { ++ rb->arg0 = ti->cpt_restart.arg0; ++ rb->arg1 = ti->cpt_restart.arg1; ++ rb->arg2 = e.tv64 & 0xFFFFFFFF; ++ rb->arg3 = e.tv64 >> 32; ++ } else if (ctx->image_version >= CPT_VERSION_9) { ++ rb->arg0 = ti->cpt_restart.arg2; ++ rb->arg1 = ti->cpt_restart.arg3; ++ rb->arg2 = e.tv64 & 0xFFFFFFFF; ++ rb->arg3 = e.tv64 >> 32; ++ } else { ++ rb->arg0 = ti->cpt_restart.arg1; ++ rb->arg1 = CLOCK_MONOTONIC; ++ rb->arg2 = e.tv64 & 0xFFFFFFFF; ++ rb->arg3 = e.tv64 >> 32; ++ } ++ } else if (ti->cpt_restart.fn == CPT_RBL_POLL) { ++ struct restart_block *rb; ++ ktime_t e; ++ struct timespec ts; ++ unsigned long timeout_jiffies; ++ ++ e.tv64 = 0; ++ e = ktime_add_ns(e, ti->cpt_restart.arg2); ++ e = ktime_sub(e, timespec_to_ktime(ctx->delta_time)); ++ ts = ns_to_timespec(ktime_to_ns(e)); ++ timeout_jiffies = timespec_to_jiffies(&ts); ++ ++ rb = &task_thread_info(current)->restart_block; ++ rb->fn = do_restart_poll; ++ rb->arg0 = ti->cpt_restart.arg0; ++ rb->arg1 = ti->cpt_restart.arg1; ++ rb->arg2 = timeout_jiffies & 0xFFFFFFFF; ++ rb->arg3 = (u64)timeout_jiffies >> 32; ++ } else if (ti->cpt_restart.fn == CPT_RBL_FUTEX_WAIT) { ++ struct restart_block *rb; ++ ktime_t e; ++ ++ e.tv64 = 0; ++ e = ktime_add_ns(e, ti->cpt_restart.arg2); ++ e = ktime_add(e, timespec_to_ktime(ctx->cpt_monotonic_time)); ++ ++ rb = &task_thread_info(current)->restart_block; ++ rb->fn = futex_wait_restart; ++ rb->futex.uaddr = (void *)(unsigned long)ti->cpt_restart.arg0; ++ rb->futex.val = ti->cpt_restart.arg1; ++ rb->futex.time = e.tv64; ++ rb->futex.flags = ti->cpt_restart.arg3; ++ } else ++ eprintk_ctx("unknown restart block\n"); ++ } ++ ++ if (thread_group_leader(current)) { ++ current->signal->it_real_incr.tv64 = 0; ++ if (ctx->image_version >= CPT_VERSION_9) { ++ current->signal->it_real_incr = ++ ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr); ++ } else { ++ current->signal->it_real_incr = ++ ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr*TICK_NSEC); ++ } ++ current->signal->it_prof_incr = ti->cpt_it_prof_incr; ++ current->signal->it_virt_incr = ti->cpt_it_virt_incr; ++ current->signal->it_prof_expires = ti->cpt_it_prof_value; ++ current->signal->it_virt_expires = ti->cpt_it_virt_value; ++ } ++ ++ err = rst_clone_children(tobj, ctx); ++ if (err) 
{ ++ eprintk_ctx("rst_clone_children\n"); ++ goto out; ++ } ++ ++ if (exiting) ++ current->signal->flags |= SIGNAL_GROUP_EXIT; ++ ++ if (ti->cpt_pid == 1) { ++ if ((err = rst_process_linkage(ctx)) != 0) { ++ eprintk_ctx("rst_process_linkage: %d\n", err); ++ goto out; ++ } ++ if ((err = rst_do_filejobs(ctx)) != 0) { ++ eprintk_ctx("rst_do_filejobs: %d\n", err); ++ goto out; ++ } ++ if ((err = rst_eventpoll(ctx)) != 0) { ++ eprintk_ctx("rst_eventpoll: %d\n", err); ++ goto out; ++ } ++#ifdef CONFIG_INOTIFY_USER ++ if ((err = rst_inotify(ctx)) != 0) { ++ eprintk_ctx("rst_inotify: %d\n", err); ++ goto out; ++ } ++#endif ++ if ((err = rst_sockets_complete(ctx)) != 0) { ++ eprintk_ctx("rst_sockets_complete: %d\n", err); ++ goto out; ++ } ++ if ((err = rst_stray_files(ctx)) != 0) { ++ eprintk_ctx("rst_stray_files: %d\n", err); ++ goto out; ++ } ++ if ((err = rst_posix_locks(ctx)) != 0) { ++ eprintk_ctx("rst_posix_locks: %d\n", err); ++ goto out; ++ } ++ if ((err = rst_tty_jobcontrol(ctx)) != 0) { ++ eprintk_ctx("rst_tty_jobcontrol: %d\n", err); ++ goto out; ++ } ++ if ((err = rst_restore_fs(ctx)) != 0) { ++ eprintk_ctx("rst_restore_fs: %d\n", err); ++ goto out; ++ } ++ if (virtinfo_notifier_call(VITYPE_SCP, ++ VIRTINFO_SCP_RESTORE, ctx) & NOTIFY_FAIL) { ++ err = -ECHRNG; ++ eprintk_ctx("scp_restore failed\n"); ++ goto out; ++ } ++ if (ctx->last_vpid) ++ get_exec_env()->ve_ns->pid_ns->last_pid = ++ ctx->last_vpid; ++ } ++ ++out: ++ thr_ctx->error = err; ++ complete(&thr_ctx->task_done); ++ ++ if (!err && (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { ++ current->flags |= PF_EXIT_RESTART; ++ do_exit(ti->cpt_exit_code); ++ } else { ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ } ++ ++ schedule(); ++ ++ dprintk_ctx("leaked through %d/%d %p\n", task_pid_nr(current), task_pid_vnr(current), current->mm); ++ ++ module_put(THIS_MODULE); ++ complete_and_exit(NULL, 0); ++ return 0; ++} ++ ++#if 0 ++static void set_task_ubs(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ struct task_beancounter *tbc; ++ ++ tbc = task_bc(current); ++ ++ put_beancounter(tbc->fork_sub); ++ tbc->fork_sub = rst_lookup_ubc(ti->cpt_task_ub, ctx); ++ if (ti->cpt_mm_ub != CPT_NULL) { ++ put_beancounter(tbc->exec_ub); ++ tbc->exec_ub = rst_lookup_ubc(ti->cpt_mm_ub, ctx); ++ } ++} ++#endif ++ ++static int create_root_task(cpt_object_t *obj, struct cpt_context *ctx, ++ struct thr_context *thr_ctx) ++{ ++ struct task_struct *tsk; ++ int pid; ++ ++ thr_ctx->ctx = ctx; ++ thr_ctx->error = 0; ++ init_completion(&thr_ctx->init_complete); ++ init_completion(&thr_ctx->task_done); ++#if 0 ++ set_task_ubs(obj->o_image, ctx); ++#endif ++ ++ pid = local_kernel_thread(hook, thr_ctx, 0, 0); ++ if (pid < 0) ++ return pid; ++ read_lock(&tasklist_lock); ++ tsk = find_task_by_vpid(pid); ++ if (tsk) ++ get_task_struct(tsk); ++ read_unlock(&tasklist_lock); ++ if (tsk == NULL) ++ return -ESRCH; ++ cpt_obj_setobj(obj, tsk, ctx); ++ thr_ctx->tobj = obj; ++ return 0; ++} ++ ++static int rst_basic_init_task(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct task_struct *tsk = obj->o_obj; ++ struct cpt_task_image *ti = obj->o_image; ++ ++ memcpy(tsk->comm, ti->cpt_comm, sizeof(tsk->comm)); ++ rst_mm_basic(obj, ti, ctx); ++ return 0; ++} ++ ++static int make_baby(cpt_object_t *cobj, ++ struct cpt_task_image *pi, ++ struct cpt_context *ctx) ++{ ++ unsigned long flags; ++ struct cpt_task_image *ci = cobj->o_image; ++ struct thr_context thr_ctx; ++ struct task_struct *tsk; ++ pid_t pid; ++ struct fs_struct *tfs = NULL; ++ ++ flags = 
rst_mm_flag(ci, ctx) | rst_files_flag(ci, ctx) ++ | rst_signal_flag(ci, ctx) | rst_semundo_flag(ci, ctx); ++ if (ci->cpt_rppid != pi->cpt_pid) { ++ flags |= CLONE_THREAD|CLONE_PARENT; ++ if (ci->cpt_signal != pi->cpt_signal || ++ !(flags&CLONE_SIGHAND) || ++ (!(flags&CLONE_VM) && pi->cpt_mm != CPT_NULL)) { ++ eprintk_ctx("something is wrong with threads: %d %d %d %Ld %Ld %08lx\n", ++ (int)ci->cpt_pid, (int)ci->cpt_rppid, (int)pi->cpt_pid, ++ (long long)ci->cpt_signal, (long long)pi->cpt_signal, flags ++ ); ++ return -EINVAL; ++ } ++ } ++ ++ thr_ctx.ctx = ctx; ++ thr_ctx.error = 0; ++ init_completion(&thr_ctx.init_complete); ++ init_completion(&thr_ctx.task_done); ++ thr_ctx.tobj = cobj; ++ ++#if 0 ++ set_task_ubs(ci, ctx); ++#endif ++ ++ if (current->fs == NULL) { ++ tfs = get_exec_env()->ve_ns->pid_ns->child_reaper->fs; ++ if (tfs == NULL) ++ return -EINVAL; ++ atomic_inc(&tfs->count); ++ current->fs = tfs; ++ } ++ pid = local_kernel_thread(hook, &thr_ctx, flags, ci->cpt_pid); ++ if (tfs) { ++ current->fs = NULL; ++ atomic_dec(&tfs->count); ++ } ++ if (pid < 0) ++ return pid; ++ ++ read_lock(&tasklist_lock); ++ tsk = find_task_by_vpid(pid); ++ if (tsk) ++ get_task_struct(tsk); ++ read_unlock(&tasklist_lock); ++ if (tsk == NULL) ++ return -ESRCH; ++ cpt_obj_setobj(cobj, tsk, ctx); ++ thr_ctx.tobj = cobj; ++ wait_for_completion(&thr_ctx.init_complete); ++ wait_task_inactive(cobj->o_obj); ++ rst_basic_init_task(cobj, ctx); ++ ++ /* clone() increases group_stop_count if it was not zero and ++ * CLONE_THREAD was asked. Undo. ++ */ ++ if (current->signal->group_stop_count && (flags & CLONE_THREAD)) { ++ if (tsk->signal != current->signal) BUG(); ++ current->signal->group_stop_count--; ++ } ++ ++ wake_up_process(tsk); ++ wait_for_completion(&thr_ctx.task_done); ++ wait_task_inactive(tsk); ++ ++ return thr_ctx.error; ++} ++ ++static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ int err = 0; ++ struct cpt_task_image *ti = obj->o_image; ++ cpt_object_t *cobj; ++ ++ for_each_object(cobj, CPT_OBJ_TASK) { ++ struct cpt_task_image *ci = cobj->o_image; ++ if (cobj == obj) ++ continue; ++ if ((ci->cpt_rppid == ti->cpt_pid && ci->cpt_tgid == ci->cpt_pid) || ++ (ci->cpt_leader == ti->cpt_pid && ++ ci->cpt_tgid != ci->cpt_pid && ci->cpt_pid != 1)) { ++ err = make_baby(cobj, ti, ctx); ++ if (err) { ++ eprintk_ctx("make_baby: %d\n", err); ++ return err; ++ } ++ } ++ } ++ return 0; ++} ++ ++static int read_task_images(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t start, end; ++ ++ err = rst_get_section(CPT_SECT_TASKS, ctx, &start, &end); ++ if (err) ++ return err; ++ ++ while (start < end) { ++ cpt_object_t *obj; ++ struct cpt_task_image *ti = cpt_get_buf(ctx); ++ ++ err = rst_get_object(CPT_OBJ_TASK, start, ti, ctx); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++#if 0 ++ if (ti->cpt_pid != 1 && !__is_virtual_pid(ti->cpt_pid)) { ++ eprintk_ctx("BUG: pid %d is not virtual\n", ti->cpt_pid); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++#endif ++ obj = alloc_cpt_object(GFP_KERNEL, ctx); ++ cpt_obj_setpos(obj, start, ctx); ++ intern_cpt_object(CPT_OBJ_TASK, obj, ctx); ++ obj->o_image = kmalloc(ti->cpt_next, GFP_KERNEL); ++ if (obj->o_image == NULL) { ++ cpt_release_buf(ctx); ++ return -ENOMEM; ++ } ++ memcpy(obj->o_image, ti, sizeof(*ti)); ++ err = ctx->pread(obj->o_image + sizeof(*ti), ++ ti->cpt_next - sizeof(*ti), ctx, start + sizeof(*ti)); ++ cpt_release_buf(ctx); ++ if (err) ++ return err; ++ start += ti->cpt_next; ++ } ++ return 0; ++} ++ ++ ++static int 
vps_rst_restore_tree(struct cpt_context *ctx) ++{ ++ int err; ++ cpt_object_t *obj; ++ struct thr_context thr_ctx_root; ++ ++ err = read_task_images(ctx); ++ if (err) ++ return err; ++ ++ err = rst_undump_ubc(ctx); ++ if (err) ++ return err; ++ ++ if (virtinfo_notifier_call(VITYPE_SCP, ++ VIRTINFO_SCP_RSTCHECK, ctx) & NOTIFY_FAIL) ++ return -ECHRNG; ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ err = rst_setup_pagein(ctx); ++ if (err) ++ return err; ++#endif ++ for_each_object(obj, CPT_OBJ_TASK) { ++ err = create_root_task(obj, ctx, &thr_ctx_root); ++ if (err) ++ return err; ++ ++ wait_for_completion(&thr_ctx_root.init_complete); ++ wait_task_inactive(obj->o_obj); ++ rst_basic_init_task(obj, ctx); ++ ++ wake_up_process(obj->o_obj); ++ wait_for_completion(&thr_ctx_root.task_done); ++ wait_task_inactive(obj->o_obj); ++ err = thr_ctx_root.error; ++ if (err) ++ return err; ++ break; ++ } ++ ++ return err; ++} ++ ++#ifndef CONFIG_IA64 ++int rst_read_vdso(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t start, end; ++ struct cpt_page_block *pgb; ++ ++ ctx->vdso = NULL; ++ err = rst_get_section(CPT_SECT_VSYSCALL, ctx, &start, &end); ++ if (err) ++ return err; ++ if (start == CPT_NULL) ++ return 0; ++ if (end < start + sizeof(*pgb) + PAGE_SIZE) ++ return -EINVAL; ++ ++ pgb = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_VSYSCALL, start, pgb, ctx); ++ if (err) { ++ goto err_buf; ++ } ++ ctx->vdso = (char*)__get_free_page(GFP_KERNEL); ++ if (ctx->vdso == NULL) { ++ err = -ENOMEM; ++ goto err_buf; ++ } ++ err = ctx->pread(ctx->vdso, PAGE_SIZE, ctx, start + sizeof(*pgb)); ++ if (err) ++ goto err_page; ++ if (!memcmp(ctx->vdso, vsyscall_addr, PAGE_SIZE)) { ++ free_page((unsigned long)ctx->vdso); ++ ctx->vdso = NULL; ++ } ++ ++ cpt_release_buf(ctx); ++ return 0; ++err_page: ++ free_page((unsigned long)ctx->vdso); ++ ctx->vdso = NULL; ++err_buf: ++ cpt_release_buf(ctx); ++ return err; ++} ++#endif ++ ++int vps_rst_undump(struct cpt_context *ctx) ++{ ++ int err; ++ unsigned long umask; ++ ++ err = rst_open_dumpfile(ctx); ++ if (err) ++ return err; ++ ++ if (ctx->tasks64) { ++#if defined(CONFIG_IA64) ++ if (ctx->image_arch != CPT_OS_ARCH_IA64) ++#elif defined(CONFIG_X86_64) ++ if (ctx->image_arch != CPT_OS_ARCH_EMT64) ++#else ++ if (1) ++#endif ++ { ++ eprintk_ctx("Cannot restore 64 bit container on this architecture\n"); ++ return -EINVAL; ++ } ++ } ++ ++ umask = current->fs->umask; ++ current->fs->umask = 0; ++ ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ err = rst_setup_pagein(ctx); ++#endif ++#ifndef CONFIG_IA64 ++ if (err == 0) ++ err = rst_read_vdso(ctx); ++#endif ++ if (err == 0) ++ err = vps_rst_restore_tree(ctx); ++ ++ if (err == 0) ++ err = rst_restore_process(ctx); ++ ++ if (err) ++ virtinfo_notifier_call(VITYPE_SCP, ++ VIRTINFO_SCP_RSTFAIL, ctx); ++ ++ current->fs->umask = umask; ++ ++ return err; ++} ++ ++static int rst_unlock_ve(struct cpt_context *ctx) ++{ ++ struct ve_struct *env; ++ ++ env = get_ve_by_id(ctx->ve_id); ++ if (!env) ++ return -ESRCH; ++ down_write(&env->op_sem); ++ env->is_locked = 0; ++ up_write(&env->op_sem); ++ put_ve(env); ++ return 0; ++} ++ ++int recalc_sigpending_tsk(struct task_struct *t); ++ ++int rst_resume(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ int err = 0; ++#ifdef CONFIG_BEANCOUNTERS ++ struct user_beancounter *bc; ++#endif ++ ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file = obj->o_obj; ++ ++ fput(file); ++ } ++ ++#ifdef CONFIG_BEANCOUNTERS ++ bc = get_beancounter_byuid(ctx->ve_id, 0); ++ BUG_ON(!bc); ++ copy_one_ubparm(ctx->saved_ubc, 
bc->ub_parms, UB_KMEMSIZE);
++	copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_NUMPROC);
++	copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_NUMFILE);
++	copy_one_ubparm(ctx->saved_ubc, bc->ub_parms, UB_DCACHESIZE);
++	put_beancounter(bc);
++#endif
++
++	rst_resume_network(ctx);
++
++	for_each_object(obj, CPT_OBJ_TASK) {
++		struct task_struct *tsk = obj->o_obj;
++		struct cpt_task_image *ti = obj->o_image;
++
++		if (!tsk)
++			continue;
++
++		if (ti->cpt_state == TASK_UNINTERRUPTIBLE) {
++			dprintk_ctx("task %d/%d(%s) is started\n", task_pid_vnr(tsk), tsk->pid, tsk->comm);
++
++			/* Weird... If a signal is sent to a stopped task,
++			 * nobody calls recalc_sigpending(). We have to do
++			 * this by hand after wake_up_process(). If we did
++			 * it earlier, a signal could arrive before
++			 * wake_up_process() and stall.
++			 */
++			spin_lock_irq(&tsk->sighand->siglock);
++			if (!signal_pending(tsk))
++				recalc_sigpending_tsk(tsk);
++			spin_unlock_irq(&tsk->sighand->siglock);
++
++			wake_up_process(tsk);
++		} else {
++			if (ti->cpt_state == TASK_STOPPED ||
++			    ti->cpt_state == TASK_TRACED) {
++				set_task_state(tsk, ti->cpt_state);
++			}
++		}
++		put_task_struct(tsk);
++	}
++
++	rst_unlock_ve(ctx);
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	rst_complete_pagein(ctx, 0);
++#endif
++
++	rst_finish_ubc(ctx);
++	cpt_object_destroy(ctx);
++
++	return err;
++}
++
++int rst_kill(struct cpt_context *ctx)
++{
++	cpt_object_t *obj;
++	int err = 0;
++
++	for_each_object(obj, CPT_OBJ_FILE) {
++		struct file *file = obj->o_obj;
++
++		fput(file);
++	}
++
++	for_each_object(obj, CPT_OBJ_TASK) {
++		struct task_struct *tsk = obj->o_obj;
++
++		if (tsk == NULL)
++			continue;
++
++		if (tsk->exit_state == 0) {
++			send_sig(SIGKILL, tsk, 1);
++
++			spin_lock_irq(&tsk->sighand->siglock);
++			sigfillset(&tsk->blocked);
++			sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
++			set_tsk_thread_flag(tsk, TIF_SIGPENDING);
++			clear_tsk_thread_flag(tsk, TIF_FREEZE);
++			if (tsk->flags & PF_FROZEN)
++				tsk->flags &= ~PF_FROZEN;
++			spin_unlock_irq(&tsk->sighand->siglock);
++
++			wake_up_process(tsk);
++		}
++
++		put_task_struct(tsk);
++	}
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++	rst_complete_pagein(ctx, 1);
++#endif
++
++	rst_finish_ubc(ctx);
++	cpt_object_destroy(ctx);
++
++	return err;
++}
++
++static int rst_utsname(cpt_context_t *ctx)
++{
++	int err;
++	loff_t sec = ctx->sections[CPT_SECT_UTSNAME];
++	loff_t endsec;
++	struct cpt_section_hdr h;
++	struct cpt_object_hdr o;
++	struct ve_struct *ve;
++	struct uts_namespace *ns;
++	int i;
++
++	if (sec == CPT_NULL)
++		return 0;
++
++	err = ctx->pread(&h, sizeof(h), ctx, sec);
++	if (err)
++		return err;
++	if (h.cpt_section != CPT_SECT_UTSNAME || h.cpt_hdrlen < sizeof(h))
++		return -EINVAL;
++
++	ve = get_exec_env();
++	ns = ve->ve_ns->uts_ns;
++
++	i = 0;
++	endsec = sec + h.cpt_next;
++	sec += h.cpt_hdrlen;
++	while (sec < endsec) {
++		int len;
++		char *ptr;
++		err = rst_get_object(CPT_OBJ_NAME, sec, &o, ctx);
++		if (err)
++			return err;
++		len = o.cpt_next - o.cpt_hdrlen;
++		if (len > __NEW_UTS_LEN + 1)
++			return -ENAMETOOLONG;
++		switch (i) {
++		case 0:
++			ptr = ns->name.nodename; break;
++		case 1:
++			ptr = ns->name.domainname; break;
++		default:
++			return -EINVAL;
++		}
++		err = ctx->pread(ptr, len, ctx, sec+o.cpt_hdrlen);
++		if (err)
++			return err;
++		i++;
++		sec += o.cpt_next;
++	}
++
++	return 0;
++}
+diff --git a/kernel/cpu.c b/kernel/cpu.c
+index c77bc3a..1cd334c 100644
+--- a/kernel/cpu.c
++++ b/kernel/cpu.c
+@@ -152,7 +152,7 @@ static inline void check_for_tasks(int cpu)
+ 	struct task_struct *p;
+ 
+ 
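++	/* for_each_process_all() walks every task on the node,
++	 * including tasks living inside VEs */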
write_lock_irq(&tasklist_lock);
+-	for_each_process(p) {
++	for_each_process_all(p) {
+ 		if (task_cpu(p) == cpu &&
+ 		    (!cputime_eq(p->utime, cputime_zero) ||
+ 		     !cputime_eq(p->stime, cputime_zero)))
+diff --git a/kernel/exit.c b/kernel/exit.c
+index 8f6185e..dcc5665 100644
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -22,6 +22,7 @@
+ #include
+ #include
+ #include
++#include
+ #include
+ #include
+ #include
+@@ -45,13 +46,18 @@
+ #include
+ #include
+ #include
++#include
++#include
++
++#include
++#include
+ 
+ #include
+ #include
+ #include
+ #include
+ 
+-static void exit_mm(struct task_struct * tsk);
++void exit_mm(struct task_struct * tsk);
+ 
+ static inline int task_detached(struct task_struct *p)
+ {
+@@ -67,6 +73,9 @@ static void __unhash_process(struct task_struct *p)
+ 		detach_pid(p, PIDTYPE_SID);
+ 
+ 		list_del_rcu(&p->tasks);
++#ifdef CONFIG_VE
++		list_del_rcu(&p->ve_task_info.vetask_list);
++#endif
+ 		__get_cpu_var(process_counts)--;
+ 	}
+ 	list_del_rcu(&p->thread_group);
+@@ -162,6 +171,8 @@ repeat:
+ 	ptrace_unlink(p);
+ 	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
+ 	__exit_signal(p);
++	nr_zombie--;
++	atomic_inc(&nr_dead);
+ 
+ 	/*
+ 	 * If we are the last non-leader member of the thread
+@@ -183,9 +194,12 @@ repeat:
+ 		 */
+ 		zap_leader = task_detached(leader);
+ 	}
++	put_task_fairsched_node(p);
+ 
+ 	write_unlock_irq(&tasklist_lock);
+ 	release_thread(p);
++	ub_task_uncharge(p);
++	pput_ve(p->ve_task_info.owner_env);
+ 	call_rcu(&p->rcu, delayed_put_task_struct);
+ 
+ 	p = leader;
+@@ -515,6 +529,7 @@ void put_files_struct(struct files_struct *files)
+ 			free_fdtable(fdt);
+ 		}
+ 	}
++EXPORT_SYMBOL_GPL(put_files_struct);
+ 
+ void reset_files_struct(struct files_struct *files)
+ {
+@@ -652,13 +667,17 @@ assign_new_owner:
+  * Turn us into a lazy TLB process if we
+  * aren't already..
+  */
+-static void exit_mm(struct task_struct * tsk)
++void exit_mm(struct task_struct * tsk)
+ {
+ 	struct mm_struct *mm = tsk->mm;
+ 
+ 	mm_release(tsk, mm);
+ 	if (!mm)
+ 		return;
++
++	if (test_tsk_thread_flag(tsk, TIF_MEMDIE))
++		mm->oom_killed = 1;
++
+ 	/*
+ 	 * Serialize with any possible pending coredump.
+ 	 * We must hold mmap_sem around checking core_waiters
+@@ -690,6 +709,7 @@ static void exit_mm(struct task_struct * tsk)
+ 	mm_update_next_owner(mm);
+ 	mmput(mm);
+ }
++EXPORT_SYMBOL_GPL(exit_mm);
+ 
+ static void
+ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
+@@ -864,6 +884,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
+ 	    !capable(CAP_KILL))
+ 		tsk->exit_signal = SIGCHLD;
+ 
++	if (tsk->exit_signal != -1 && tsk == init_pid_ns.child_reaper)
++		/* We don't want people slaying init. */
++		tsk->exit_signal = SIGCHLD;
++
+ 	/* If something other than our normal parent is ptracing us, then
+ 	 * send it a SIGCHLD instead of honoring exit_signal.  exit_signal
+ 	 * only has special meaning to our real parent.
+@@ -880,6 +904,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
+ 	if (task_detached(tsk) && likely(!tsk->ptrace))
+ 		state = EXIT_DEAD;
+ 	tsk->exit_state = state;
++	nr_zombie++;
+ 
+ 	/* mt-exec, de_thread() is waiting for us */
+ 	if (thread_group_leader(tsk) &&
+@@ -953,6 +978,7 @@ static inline void exit_child_reaper(struct task_struct *tsk)
+ 	 * perform the role of the child_reaper.
+ */ + zap_pid_ns_processes(tsk->nsproxy->pid_ns); ++ (void)virtinfo_gencall(VIRTINFO_DOEXIT, NULL); + } + + NORET_TYPE void do_exit(long code) +@@ -1023,12 +1049,14 @@ NORET_TYPE void do_exit(long code) + } + acct_collect(code, group_dead); + #ifdef CONFIG_FUTEX +- if (unlikely(tsk->robust_list)) +- exit_robust_list(tsk); ++ if (!(tsk->flags & PF_EXIT_RESTART)) { ++ if (unlikely(tsk->robust_list)) ++ exit_robust_list(tsk); + #ifdef CONFIG_COMPAT +- if (unlikely(tsk->compat_robust_list)) +- compat_exit_robust_list(tsk); ++ if (unlikely(tsk->compat_robust_list)) ++ compat_exit_robust_list(tsk); + #endif ++ } + #endif + if (group_dead) + tty_audit_exit(); +@@ -1057,8 +1085,16 @@ NORET_TYPE void do_exit(long code) + if (tsk->binfmt) + module_put(tsk->binfmt->module); + +- proc_exit_connector(tsk); +- exit_notify(tsk, group_dead); ++ if (!(tsk->flags & PF_EXIT_RESTART)) { ++ proc_exit_connector(tsk); ++ exit_notify(tsk, group_dead); ++ } else { ++ write_lock_irq(&tasklist_lock); ++ tsk->exit_state = EXIT_ZOMBIE; ++ nr_zombie++; ++ write_unlock_irq(&tasklist_lock); ++ exit_task_namespaces(tsk); ++ } + #ifdef CONFIG_NUMA + mpol_put(tsk->mempolicy); + tsk->mempolicy = NULL; +@@ -1719,6 +1755,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, + asmlinkage_protect(4, ret, upid, stat_addr, options, ru); + return ret; + } ++EXPORT_SYMBOL_GPL(sys_wait4); + + #ifdef __ARCH_WANT_SYS_WAITPID + +diff --git a/kernel/fairsched.c b/kernel/fairsched.c +new file mode 100644 +index 0000000..80eebdf +--- /dev/null ++++ b/kernel/fairsched.c +@@ -0,0 +1,633 @@ ++/* ++ * Fair Scheduler ++ * ++ * Copyright (C) 2000-2008 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++struct fairsched_node fairsched_init_node = { ++ .id = FAIRSCHED_INIT_NODE_ID, ++ .tg = &init_task_group, ++#ifdef CONFIG_VE ++ .owner_env = get_ve0(), ++#endif ++ .weight = 1, ++}; ++ ++static DEFINE_MUTEX(fairsched_mutex); ++ ++/* list protected with fairsched_mutex */ ++static LIST_HEAD(fairsched_node_head); ++static int fairsched_nr_nodes; ++ ++void __init fairsched_init_early(void) ++{ ++ list_add(&fairsched_init_node.nodelist, &fairsched_node_head); ++ fairsched_nr_nodes++; ++} ++ ++#define FSCHWEIGHT_BASE 512000 ++ ++/****************************************************************************** ++ * cfs group shares = FSCHWEIGHT_BASE / fairsched weight ++ * ++ * vzctl cpuunits default 1000 ++ * cfs shares default value is 1024 (see init_task_group_load in sched.c) ++ * cpuunits = 1000 --> weight = 500000 / cpuunits = 500 --> shares = 1024 ++ * ^--- from vzctl ++ * weight in 1..65535 --> shares in 7..512000 ++ * shares should be >1 (see comment in sched_group_set_shares function) ++ *****************************************************************************/ ++ ++static struct fairsched_node *fairsched_find(unsigned int id) ++{ ++ struct fairsched_node *p; ++ list_for_each_entry(p, &fairsched_node_head, nodelist) { ++ if (p->id == id) ++ return p; ++ } ++ return NULL; ++} ++ ++/****************************************************************************** ++ * System calls ++ * ++ * All do_xxx functions are called under fairsched mutex and after ++ * capability check. ++ * ++ * The binary interfaces follow some other Fair Scheduler implementations ++ * (although some system call arguments are not needed for our implementation). 
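++ *
++ * fairsched_mknod creates a node backed by a cfs task group,
++ * fairsched_rmnod destroys an unused one, fairsched_chwt changes a
++ * node's weight, fairsched_rate stores the rate value (reported back
++ * but not enforced in this file) and fairsched_mvpr moves a process
++ * into a node; all of them require CAP_SETVEID. Illustrative
++ * vzctl-style sequence:
++ *	sys_fairsched_mknod(0, 500, veid);
++ *	sys_fairsched_mvpr(pid, veid);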
++ *****************************************************************************/ ++ ++static int do_fairsched_mknod(unsigned int parent, unsigned int weight, ++ unsigned int newid) ++{ ++ struct fairsched_node *node; ++ int retval; ++ ++ retval = -EINVAL; ++ if (weight < 1 || weight > FSCHWEIGHT_MAX) ++ goto out; ++ if (newid < 0 || newid > INT_MAX) ++ goto out; ++ ++ retval = -EBUSY; ++ if (fairsched_find(newid) != NULL) ++ goto out; ++ ++ retval = -ENOMEM; ++ node = kzalloc(sizeof(*node), GFP_KERNEL); ++ if (node == NULL) ++ goto out; ++ ++ node->tg = sched_create_group(&init_task_group); ++ if (IS_ERR(node->tg)) ++ goto out_free; ++ ++ node->id = newid; ++ node->weight = weight; ++ sched_group_set_shares(node->tg, FSCHWEIGHT_BASE / weight); ++#ifdef CONFIG_VE ++ node->owner_env = get_exec_env(); ++#endif ++ list_add(&node->nodelist, &fairsched_node_head); ++ fairsched_nr_nodes++; ++ ++ retval = newid; ++out: ++ return retval; ++ ++out_free: ++ kfree(node); ++ return retval; ++} ++ ++asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight, ++ unsigned int newid) ++{ ++ int retval; ++ ++ if (!capable(CAP_SETVEID)) ++ return -EPERM; ++ ++ mutex_lock(&fairsched_mutex); ++ retval = do_fairsched_mknod(parent, weight, newid); ++ mutex_unlock(&fairsched_mutex); ++ ++ return retval; ++} ++EXPORT_SYMBOL(sys_fairsched_mknod); ++ ++static int do_fairsched_rmnod(unsigned int id) ++{ ++ struct fairsched_node *node; ++ int retval; ++ ++ retval = -EINVAL; ++ node = fairsched_find(id); ++ if (node == NULL) ++ goto out; ++ if (node == &fairsched_init_node) ++ goto out; ++ ++ retval = -EBUSY; ++ if (node->refcnt) ++ goto out; ++ ++ list_del(&node->nodelist); ++ fairsched_nr_nodes--; ++ ++ sched_destroy_group(node->tg); ++ kfree(node); ++ retval = 0; ++out: ++ return retval; ++} ++ ++asmlinkage int sys_fairsched_rmnod(unsigned int id) ++{ ++ int retval; ++ ++ if (!capable(CAP_SETVEID)) ++ return -EPERM; ++ ++ mutex_lock(&fairsched_mutex); ++ retval = do_fairsched_rmnod(id); ++ mutex_unlock(&fairsched_mutex); ++ ++ return retval; ++} ++EXPORT_SYMBOL(sys_fairsched_rmnod); ++ ++static int do_fairsched_chwt(unsigned int id, unsigned weight) ++{ ++ struct fairsched_node *node; ++ ++ if (id == 0) ++ return -EINVAL; ++ if (weight < 1 || weight > FSCHWEIGHT_MAX) ++ return -EINVAL; ++ ++ node = fairsched_find(id); ++ if (node == NULL) ++ return -ENOENT; ++ ++ node->weight = weight; ++ sched_group_set_shares(node->tg, FSCHWEIGHT_BASE / weight); ++ ++ return 0; ++} ++ ++asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned weight) ++{ ++ int retval; ++ ++ if (!capable(CAP_SETVEID)) ++ return -EPERM; ++ ++ mutex_lock(&fairsched_mutex); ++ retval = do_fairsched_chwt(id, weight); ++ mutex_unlock(&fairsched_mutex); ++ ++ return retval; ++} ++ ++static int do_fairsched_vcpus(unsigned int id, unsigned int vcpus) ++{ ++ struct fairsched_node *node; ++ ++ if (id == 0) ++ return -EINVAL; ++ ++ node = fairsched_find(id); ++ if (node == NULL) ++ return -ENOENT; ++ ++ return 0; ++} ++ ++asmlinkage int sys_fairsched_vcpus(unsigned int id, unsigned int vcpus) ++{ ++ int retval; ++ ++ if (!capable(CAP_SETVEID)) ++ return -EPERM; ++ ++ mutex_lock(&fairsched_mutex); ++ retval = do_fairsched_vcpus(id, vcpus); ++ mutex_unlock(&fairsched_mutex); ++ ++ return retval; ++} ++EXPORT_SYMBOL(sys_fairsched_vcpus); ++ ++static int do_fairsched_rate(unsigned int id, int op, unsigned rate) ++{ ++ struct fairsched_node *node; ++ int retval; ++ ++ if (id == 0) ++ return -EINVAL; ++ if (op == FAIRSCHED_SET_RATE && (rate < 
1 || rate >= (1UL << 31))) ++ return -EINVAL; ++ ++ node = fairsched_find(id); ++ if (node == NULL) ++ return -ENOENT; ++ ++ retval = -EINVAL; ++ switch (op) { ++ case FAIRSCHED_SET_RATE: ++ node->rate = rate; ++ node->rate_limited = 1; ++ retval = rate; ++ break; ++ case FAIRSCHED_DROP_RATE: ++ node->rate = 0; ++ node->rate_limited = 0; ++ retval = 0; ++ break; ++ case FAIRSCHED_GET_RATE: ++ if (node->rate_limited) ++ retval = node->rate; ++ else ++ retval = -ENODATA; ++ break; ++ } ++ return retval; ++} ++ ++asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate) ++{ ++ int retval; ++ ++ if (!capable(CAP_SETVEID)) ++ return -EPERM; ++ ++ mutex_lock(&fairsched_mutex); ++ retval = do_fairsched_rate(id, op, rate); ++ mutex_unlock(&fairsched_mutex); ++ ++ return retval; ++} ++ ++static int do_fairsched_mvpr(pid_t pid, unsigned int nodeid) ++{ ++ struct task_struct *p; ++ struct fairsched_node *node; ++ int retval; ++ ++ retval = -ENOENT; ++ node = fairsched_find(nodeid); ++ if (node == NULL) ++ goto out; ++ ++ write_lock_irq(&tasklist_lock); ++ retval = -ESRCH; ++ p = find_task_by_pid(pid); ++ if (p == NULL) ++ goto out_unlock; ++ ++ get_task_struct(p); ++ put_task_fairsched_node(p); ++ p->fsched_node = node; ++ get_task_fairsched_node(p); ++ write_unlock_irq(&tasklist_lock); ++ ++ smp_wmb(); ++ sched_move_task(p); ++ put_task_struct(p); ++ return 0; ++ ++out_unlock: ++ write_unlock_irq(&tasklist_lock); ++out: ++ return retval; ++} ++ ++asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid) ++{ ++ int retval; ++ ++ if (!capable(CAP_SETVEID)) ++ return -EPERM; ++ ++ mutex_lock(&fairsched_mutex); ++ retval = do_fairsched_mvpr(pid, nodeid); ++ mutex_unlock(&fairsched_mutex); ++ ++ return retval; ++} ++EXPORT_SYMBOL(sys_fairsched_mvpr); ++ ++#ifdef CONFIG_PROC_FS ++ ++/*********************************************************************/ ++/* ++ * proc interface ++ */ ++/*********************************************************************/ ++ ++#include ++#include ++#include ++ ++struct fairsched_node_dump { ++ int id; ++ unsigned weight; ++ unsigned rate; ++ int rate_limited; ++ int nr_pcpu; ++ int nr_tasks, nr_runtasks; ++}; ++ ++struct fairsched_dump { ++ int len; ++ struct fairsched_node_dump nodes[0]; ++}; ++ ++static struct fairsched_dump *fairsched_do_dump(int compat) ++{ ++ int nr_nodes; ++ int len; ++ struct fairsched_dump *dump; ++ struct fairsched_node *node; ++ struct fairsched_node_dump *p; ++ ++ mutex_lock(&fairsched_mutex); ++ nr_nodes = (ve_is_super(get_exec_env()) ? fairsched_nr_nodes + 16 : 1); ++ len = sizeof(*dump) + nr_nodes * sizeof(dump->nodes[0]); ++ dump = ub_vmalloc(len); ++ if (dump == NULL) ++ goto out; ++ ++ p = dump->nodes; ++ list_for_each_entry_reverse(node, &fairsched_node_head, nodelist) { ++ if ((char *)p - (char *)dump >= len) ++ break; ++ p->nr_tasks = 0; ++ p->nr_runtasks = 0; ++#ifdef CONFIG_VE ++ if (!ve_accessible(node->owner_env, get_exec_env())) ++ continue; ++ p->nr_tasks = atomic_read(&node->owner_env->pcounter); ++ p->nr_runtasks = nr_running_ve(node->owner_env); ++#endif ++ p->id = node->id; ++ p->weight = node->weight; ++ p->rate = node->rate; ++ p->rate_limited = node->rate_limited; ++ p->nr_pcpu = num_online_cpus(); ++ p++; ++ } ++ dump->len = p - dump->nodes; ++out: ++ mutex_unlock(&fairsched_mutex); ++ return dump; ++} ++ ++#define FAIRSCHED_PROC_HEADLINES 2 ++ ++#define FAIRSHED_DEBUG " debug" ++ ++#ifdef CONFIG_VE ++/* ++ * File format is dictated by compatibility reasons. 
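++ * Each node may be emitted up to three times: the two low bits that
++ * fairsched_seq_start() packs into the iterator cookie select which
++ * of the veid/id/parent columns are filled in on a given pass.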
++ */ ++static int fairsched_seq_show(struct seq_file *m, void *v) ++{ ++ struct fairsched_dump *dump; ++ struct fairsched_node_dump *p; ++ unsigned vid, nid, pid, r; ++ ++ dump = m->private; ++ p = (struct fairsched_node_dump *)((unsigned long)v & ~3UL); ++ if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) { ++ if (p == dump->nodes) ++ seq_printf(m, "Version: 2.6 debug\n"); ++ else if (p == dump->nodes + 1) ++ seq_printf(m, ++ " veid " ++ " id " ++ " parent " ++ "weight " ++ " rate " ++ "tasks " ++ " run " ++ "cpus" ++ " " ++ "flg " ++ "ready " ++ " start_tag " ++ " value " ++ " delay" ++ "\n"); ++ } else { ++ p -= FAIRSCHED_PROC_HEADLINES; ++ vid = nid = pid = 0; ++ r = (unsigned long)v & 3; ++ if (p == dump->nodes) { ++ if (r == 2) ++ nid = p->id; ++ } else { ++ if (!r) ++ nid = p->id; ++ else if (r == 1) ++ vid = pid = p->id; ++ else ++ vid = p->id, nid = 1; ++ } ++ seq_printf(m, ++ "%10u " ++ "%10u %10u %6u %5u %5u %5u %4u" ++ " " ++ " %c%c %5u %20Lu %20Lu %20Lu" ++ "\n", ++ vid, ++ nid, ++ pid, ++ p->weight, ++ p->rate, ++ p->nr_tasks, ++ p->nr_runtasks, ++ p->nr_pcpu, ++ p->rate_limited ? 'L' : '.', ++ '.', ++ p->nr_runtasks, ++ 0ll, 0ll, 0ll); ++ } ++ ++ return 0; ++} ++ ++static void *fairsched_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ struct fairsched_dump *dump; ++ unsigned long l; ++ ++ dump = m->private; ++ if (*pos >= dump->len * 3 - 1 + FAIRSCHED_PROC_HEADLINES) ++ return NULL; ++ if (*pos < FAIRSCHED_PROC_HEADLINES) ++ return dump->nodes + *pos; ++ /* guess why... */ ++ l = (unsigned long)(dump->nodes + ++ ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) / 3); ++ l |= ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) % 3; ++ return (void *)l; ++} ++static void *fairsched_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ ++*pos; ++ return fairsched_seq_start(m, pos); ++} ++#endif /* CONFIG_VE */ ++ ++static int fairsched2_seq_show(struct seq_file *m, void *v) ++{ ++ struct fairsched_dump *dump; ++ struct fairsched_node_dump *p; ++ ++ dump = m->private; ++ p = v; ++ if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) { ++ if (p == dump->nodes) ++ seq_printf(m, "Version: 2.7" FAIRSHED_DEBUG "\n"); ++ else if (p == dump->nodes + 1) ++ seq_printf(m, ++ " id " ++ "weight " ++ " rate " ++ " run " ++ "cpus" ++#ifdef FAIRSHED_DEBUG ++ " " ++ "flg " ++ "ready " ++ " start_tag " ++ " value " ++ " delay" ++#endif ++ "\n"); ++ } else { ++ p -= FAIRSCHED_PROC_HEADLINES; ++ seq_printf(m, ++ "%10u %6u %5u %5u %4u" ++#ifdef FAIRSHED_DEBUG ++ " " ++ " %c%c %5u %20Lu %20Lu %20Lu" ++#endif ++ "\n", ++ p->id, ++ p->weight, ++ p->rate, ++ p->nr_runtasks, ++ p->nr_pcpu ++#ifdef FAIRSHED_DEBUG ++ , ++ p->rate_limited ? 
'L' : '.', ++ '.', ++ p->nr_runtasks, ++ 0ll, 0ll, 0ll ++#endif ++ ); ++ } ++ ++ return 0; ++} ++ ++static void *fairsched2_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ struct fairsched_dump *dump; ++ ++ dump = m->private; ++ if (*pos >= dump->len + FAIRSCHED_PROC_HEADLINES) ++ return NULL; ++ return dump->nodes + *pos; ++} ++static void *fairsched2_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ ++*pos; ++ return fairsched2_seq_start(m, pos); ++} ++static void fairsched2_seq_stop(struct seq_file *m, void *v) ++{ ++} ++ ++#ifdef CONFIG_VE ++static struct seq_operations fairsched_seq_op = { ++ .start = fairsched_seq_start, ++ .next = fairsched_seq_next, ++ .stop = fairsched2_seq_stop, ++ .show = fairsched_seq_show ++}; ++#endif ++static struct seq_operations fairsched2_seq_op = { ++ .start = fairsched2_seq_start, ++ .next = fairsched2_seq_next, ++ .stop = fairsched2_seq_stop, ++ .show = fairsched2_seq_show ++}; ++static int fairsched_seq_open(struct inode *inode, struct file *file) ++{ ++ int ret; ++ struct seq_file *m; ++ int compat; ++ ++#ifdef CONFIG_VE ++ compat = (file->f_dentry->d_name.len == sizeof("fairsched") - 1); ++ ret = seq_open(file, compat ? &fairsched_seq_op : &fairsched2_seq_op); ++#else ++ compat = 0; ++ ret = seq_open(file, &fairsched2_seq_op); ++#endif ++ if (ret) ++ return ret; ++ m = file->private_data; ++ m->private = fairsched_do_dump(compat); ++ if (m->private == NULL) { ++ seq_release(inode, file); ++ ret = -ENOMEM; ++ } ++ return ret; ++} ++static int fairsched_seq_release(struct inode *inode, struct file *file) ++{ ++ struct seq_file *m; ++ struct fairsched_dump *dump; ++ ++ m = file->private_data; ++ dump = m->private; ++ m->private = NULL; ++ vfree(dump); ++ seq_release(inode, file); ++ return 0; ++} ++static struct file_operations proc_fairsched_operations = { ++ .open = fairsched_seq_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = fairsched_seq_release ++}; ++ ++void __init fairsched_init_late(void) ++{ ++ proc_create("fairsched", S_IRUGO, &glob_proc_root, ++ &proc_fairsched_operations); ++ proc_create("fairsched2", S_IRUGO, &glob_proc_root, ++ &proc_fairsched_operations); ++} ++ ++#else ++ ++void __init fairsched_init_late(void) { } ++ ++#endif /* CONFIG_PROC_FS */ +diff --git a/kernel/fork.c b/kernel/fork.c +index 19908b2..f366869 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -54,6 +55,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -62,17 +64,23 @@ + #include + #include + ++#include ++#include ++#include ++ + /* + * Protected counters by write_lock_irq(&tasklist_lock) + */ + unsigned long total_forks; /* Handle normal Linux uptimes. */ + int nr_threads; /* The idle threads do not count.. 
*/ ++EXPORT_SYMBOL_GPL(nr_threads); + + int max_threads; /* tunable limit on nr_threads */ + + DEFINE_PER_CPU(unsigned long, process_counts) = 0; + + __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ ++EXPORT_SYMBOL(tasklist_lock); + + int nr_processes(void) + { +@@ -124,14 +132,20 @@ void __put_task_struct(struct task_struct *tsk) + WARN_ON(atomic_read(&tsk->usage)); + WARN_ON(tsk == current); + ++ ub_task_put(tsk); + security_task_free(tsk); + free_uid(tsk->user); + put_group_info(tsk->group_info); + delayacct_tsk_free(tsk); + ++#ifdef CONFIG_VE ++ put_ve(VE_TASK_INFO(tsk)->owner_env); ++ atomic_dec(&nr_dead); ++#endif + if (!profile_handoff_task(tsk)) + free_task(tsk); + } ++EXPORT_SYMBOL_GPL(__put_task_struct); + + /* + * macro override instead of weak attribute alias, to workaround +@@ -150,7 +164,7 @@ void __init fork_init(unsigned long mempages) + /* create a slab on which task_structs can be allocated */ + task_struct_cachep = + kmem_cache_create("task_struct", sizeof(struct task_struct), +- ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); ++ ARCH_MIN_TASKALIGN, SLAB_PANIC|SLAB_UBC, NULL); + #endif + + /* do the arch specific task caches init */ +@@ -270,6 +284,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) + continue; + } + charge = 0; ++ if (ub_memory_charge(mm, mpnt->vm_end - mpnt->vm_start, ++ mpnt->vm_flags & ~VM_LOCKED, ++ mpnt->vm_file, UB_HARD)) ++ goto fail_noch; + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + if (security_vm_enough_memory(len)) +@@ -316,7 +334,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) + rb_parent = &tmp->vm_rb; + + mm->map_count++; +- retval = copy_page_range(mm, oldmm, mpnt); ++ retval = copy_page_range(mm, oldmm, tmp, mpnt); + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); +@@ -335,6 +353,9 @@ out: + fail_nomem_policy: + kmem_cache_free(vm_area_cachep, tmp); + fail_nomem: ++ ub_memory_uncharge(mm, mpnt->vm_end - mpnt->vm_start, ++ mpnt->vm_flags & ~VM_LOCKED, mpnt->vm_file); ++fail_noch: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto out; +@@ -383,12 +404,22 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) + mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->cached_hole_size = ~0UL; + mm_init_owner(mm, p); ++ /* ++ * This looks ugly, buy when we came from ++ * sys_execve -> mm_alloc -> here ++ * we need to get exec_ub, not task_ub. But when ++ * we're here like this ++ * sys_fork() -> dup_mm -> here ++ * we need task_ub, not the exec one... 
xemul ++ */ ++ set_mm_ub(mm, p); + + if (likely(!mm_alloc_pgd(mm))) { + mm->def_flags = 0; + return mm; + } + ++ put_mm_ub(mm); + free_mm(mm); + return NULL; + } +@@ -407,6 +438,7 @@ struct mm_struct * mm_alloc(void) + } + return mm; + } ++EXPORT_SYMBOL_GPL(mm_alloc); + + /* + * Called when the last reference to the mm +@@ -418,6 +450,7 @@ void __mmdrop(struct mm_struct *mm) + BUG_ON(mm == &init_mm); + mm_free_pgd(mm); + destroy_context(mm); ++ put_mm_ub(mm); + free_mm(mm); + } + EXPORT_SYMBOL_GPL(__mmdrop); +@@ -439,6 +472,9 @@ void mmput(struct mm_struct *mm) + spin_unlock(&mmlist_lock); + } + put_swap_token(mm); ++ (void) virtinfo_gencall(VIRTINFO_EXITMMAP, mm); ++ if (mm->oom_killed) ++ ub_oom_task_dead(current); + mmdrop(mm); + } + } +@@ -568,6 +604,7 @@ fail_nocontext: + * because it calls destroy_context() + */ + mm_free_pgd(mm); ++ put_mm_ub(mm); + free_mm(mm); + return NULL; + } +@@ -874,14 +911,19 @@ static struct task_struct *copy_process(unsigned long clone_flags, + struct pt_regs *regs, + unsigned long stack_size, + int __user *child_tidptr, +- struct pid *pid) ++ struct pid *pid, pid_t vpid) + { + int retval; + struct task_struct *p; + int cgroup_callbacks_done = 0; + ++#ifdef CONFIG_VE ++ if (clone_flags & CLONE_NAMESPACES_MASK) ++ return ERR_PTR(-EINVAL); ++#else + if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) + return ERR_PTR(-EINVAL); ++#endif + + /* + * Thread groups must share signals as well, and detached threads +@@ -909,6 +951,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, + + rt_mutex_init_task(p); + ++ if (ub_task_charge(current, p)) ++ goto bad_fork_charge; ++ + #ifdef CONFIG_TRACE_IRQFLAGS + DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); + DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); +@@ -1064,7 +1109,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, + + if (pid != &init_struct_pid) { + retval = -ENOMEM; +- pid = alloc_pid(task_active_pid_ns(p)); ++ pid = alloc_pid(task_active_pid_ns(p), vpid); + if (!pid) + goto bad_fork_cleanup_io; + +@@ -1072,6 +1117,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, + retval = pid_ns_prepare_proc(task_active_pid_ns(p)); + if (retval < 0) + goto bad_fork_free_pid; ++ if (task_active_pid_ns(current)->flags & PID_NS_HIDE_CHILD) ++ task_active_pid_ns(p)->flags |= PID_NS_HIDDEN; + } + } + +@@ -1169,7 +1216,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, + * thread can't slip out of an OOM kill (or normal SIGKILL). 
+ */ + recalc_sigpending(); +- if (signal_pending(current)) { ++ if (signal_pending(current) && !vpid) { + spin_unlock(¤t->sighand->siglock); + write_unlock_irq(&tasklist_lock); + retval = -ERESTARTNOINTR; +@@ -1212,14 +1259,24 @@ static struct task_struct *copy_process(unsigned long clone_flags, + attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); + attach_pid(p, PIDTYPE_SID, task_session(current)); + list_add_tail_rcu(&p->tasks, &init_task.tasks); ++#ifdef CONFIG_VE ++ list_add_tail_rcu(&p->ve_task_info.vetask_list, ++ &p->ve_task_info.owner_env->vetask_lh); ++#endif + __get_cpu_var(process_counts)++; + } + attach_pid(p, PIDTYPE_PID, pid); + nr_threads++; + } ++ (void)get_ve(p->ve_task_info.owner_env); ++ pget_ve(p->ve_task_info.owner_env); + ++#ifdef CONFIG_VE ++ seqcount_init(&p->ve_task_info.wakeup_lock); ++#endif + total_forks++; + spin_unlock(¤t->sighand->siglock); ++ get_task_fairsched_node(p); + write_unlock_irq(&tasklist_lock); + proc_fork_connector(p); + cgroup_post_fork(p); +@@ -1267,6 +1324,9 @@ bad_fork_cleanup_count: + atomic_dec(&p->user->processes); + free_uid(p->user); + bad_fork_free: ++ ub_task_uncharge(p); ++ ub_task_put(p); ++bad_fork_charge: + free_task(p); + fork_out: + return ERR_PTR(retval); +@@ -1284,7 +1344,7 @@ struct task_struct * __cpuinit fork_idle(int cpu) + struct pt_regs regs; + + task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, +- &init_struct_pid); ++ &init_struct_pid, 0); + if (!IS_ERR(task)) + init_idle(task, cpu); + +@@ -1313,12 +1373,13 @@ static int fork_traceflag(unsigned clone_flags) + * It copies the process, and if successful kick-starts + * it and waits for it to finish using the VM if required. + */ +-long do_fork(unsigned long clone_flags, ++long do_fork_pid(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, +- int __user *child_tidptr) ++ int __user *child_tidptr, ++ long vpid) + { + struct task_struct *p; + int trace = 0; +@@ -1341,6 +1402,10 @@ long do_fork(unsigned long clone_flags, + } + } + ++ nr = virtinfo_gencall(VIRTINFO_DOFORK, (void *)clone_flags); ++ if (nr) ++ return nr; ++ + if (unlikely(current->ptrace)) { + trace = fork_traceflag (clone_flags); + if (trace) +@@ -1348,7 +1413,7 @@ long do_fork(unsigned long clone_flags, + } + + p = copy_process(clone_flags, stack_start, regs, stack_size, +- child_tidptr, NULL); ++ child_tidptr, NULL, vpid); + /* + * Do this prior waking up the new thread - the thread pointer + * might get invalid after that point, if the thread exits quickly. 
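
The plumbing above threads an optional vpid down through do_fork_pid() and copy_process() so that a restore path can recreate a task under a fixed PID, while vpid == 0 keeps the stock do_fork() behaviour; the matching pidmap side appears in the kernel/pid.c hunks further down, where set_pidmap() claims one specific bit and alloc_pidmap() scans for any free one. A minimal userspace sketch of that allocation contract, with purely illustrative names and sizes (nothing here is taken from the patch), could look like this:

/* vpid == 0: allocate any free ID; non-zero: claim exactly that slot
 * or fail, the way set_pidmap() returns -EBUSY on a taken bit. */
#include <stdio.h>

#define MAP_BITS 4096
#define WORD_BITS (8 * sizeof(unsigned long))

static unsigned long bitmap[MAP_BITS / WORD_BITS];

static int test_and_set(int id)
{
	unsigned long *w = &bitmap[id / WORD_BITS];
	unsigned long m = 1UL << (id % WORD_BITS);
	int was_set = (*w & m) != 0;
	*w |= m;
	return was_set;
}

static int alloc_any(void)		/* ~alloc_pidmap(): scan for a free bit */
{
	for (int id = 1; id < MAP_BITS; id++)
		if (!test_and_set(id))
			return id;
	return -1;			/* map exhausted */
}

static int claim(int id)		/* ~set_pidmap(): pin one specific bit */
{
	return test_and_set(id) ? -1 : id;
}

static int alloc_id(int vpid)		/* ~alloc_pid(ns, vpid) dispatch */
{
	return vpid ? claim(vpid) : alloc_any();
}

int main(void)
{
	printf("%d\n", alloc_id(0));	/* 1: first free slot */
	printf("%d\n", alloc_id(300));	/* 300: pinned, e.g. on restore */
	printf("%d\n", alloc_id(300));	/* -1: slot already taken */
	return 0;
}

The -1 on the second claim of 300 mirrors the -EBUSY that set_pidmap() returns when the requested slot is occupied, which is what stops a restore from silently landing on a reused PID.
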
+@@ -1374,6 +1439,8 @@ long do_fork(unsigned long clone_flags, + set_tsk_thread_flag(p, TIF_SIGPENDING); + } + ++ (void)virtinfo_gencall(VIRTINFO_DOFORKRET, p); ++ + if (!(clone_flags & CLONE_STOPPED)) + wake_up_new_task(p, clone_flags); + else +@@ -1396,6 +1463,8 @@ long do_fork(unsigned long clone_flags, + } else { + nr = PTR_ERR(p); + } ++ ++ (void)virtinfo_gencall(VIRTINFO_DOFORKPOST, (void *)(long)nr); + return nr; + } + +@@ -1411,27 +1480,40 @@ static void sighand_ctor(struct kmem_cache *cachep, void *data) + init_waitqueue_head(&sighand->signalfd_wqh); + } + ++EXPORT_SYMBOL(do_fork_pid); ++ ++long do_fork(unsigned long clone_flags, ++ unsigned long stack_start, ++ struct pt_regs *regs, ++ unsigned long stack_size, ++ int __user *parent_tidptr, ++ int __user *child_tidptr) ++{ ++ return do_fork_pid(clone_flags, stack_start, regs, stack_size, ++ parent_tidptr, child_tidptr, 0); ++} ++ + void __init proc_caches_init(void) + { + sighand_cachep = kmem_cache_create("sighand_cache", + sizeof(struct sighand_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|SLAB_UBC, + sighand_ctor); + signal_cachep = kmem_cache_create("signal_cache", + sizeof(struct signal_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); + files_cachep = kmem_cache_create("files_cache", + sizeof(struct files_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); + fs_cachep = kmem_cache_create("fs_cache", + sizeof(struct fs_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), 0, +- SLAB_PANIC, NULL); ++ SLAB_PANIC|SLAB_UBC, NULL); + mm_cachep = kmem_cache_create("mm_struct", + sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); + } + + /* +@@ -1569,6 +1651,10 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| + CLONE_NEWNET)) + goto bad_unshare_out; ++#ifdef CONFIG_VE ++ if (unshare_flags & CLONE_NAMESPACES_MASK) ++ goto bad_unshare_out; ++#endif + + /* + * CLONE_NEWIPC must also detach from the undolist: after switching +@@ -1587,9 +1673,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) + goto bad_unshare_cleanup_sigh; + if ((err = unshare_fd(unshare_flags, &new_fd))) + goto bad_unshare_cleanup_vm; ++#ifndef CONFIG_VE + if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, + new_fs))) + goto bad_unshare_cleanup_fd; ++#endif + + if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { + if (do_sysvsem) { +@@ -1633,7 +1721,9 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) + if (new_nsproxy) + put_nsproxy(new_nsproxy); + ++#ifndef CONFIG_VE + bad_unshare_cleanup_fd: ++#endif + if (new_fd) + put_files_struct(new_fd); + +diff --git a/kernel/futex.c b/kernel/futex.c +index 7d1136e..a02be16 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -1198,8 +1198,6 @@ handle_fault: + */ + #define FLAGS_SHARED 1 + +-static long futex_wait_restart(struct restart_block *restart); +- + static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, + u32 val, ktime_t *abs_time, u32 bitset) + { +@@ -1365,7 +1363,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, + } + + +-static long futex_wait_restart(struct restart_block 
*restart) ++long futex_wait_restart(struct restart_block *restart) + { + u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; + struct rw_semaphore *fshared = NULL; +@@ -1378,6 +1376,7 @@ static long futex_wait_restart(struct restart_block *restart) + return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, + restart->futex.bitset); + } ++EXPORT_SYMBOL_GPL(futex_wait_restart); + + + /* +diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c +index ab80515..fd9ae33 100644 +--- a/kernel/hrtimer.c ++++ b/kernel/hrtimer.c +@@ -1524,6 +1524,7 @@ out: + destroy_hrtimer_on_stack(&t.timer); + return ret; + } ++EXPORT_SYMBOL_GPL(hrtimer_nanosleep_restart); + + long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, + const enum hrtimer_mode mode, const clockid_t clockid) +diff --git a/kernel/kmod.c b/kernel/kmod.c +index 8df97d3..4c93a6c 100644 +--- a/kernel/kmod.c ++++ b/kernel/kmod.c +@@ -78,6 +78,10 @@ int request_module(const char *fmt, ...) + #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ + static int kmod_loop_msg; + ++ /* Don't allow request_module() inside VE. */ ++ if (!ve_is_super(get_exec_env())) ++ return -EPERM; ++ + va_start(args, fmt); + ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); + va_end(args); +@@ -451,6 +455,9 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, + DECLARE_COMPLETION_ONSTACK(done); + int retval = 0; + ++ if (!ve_is_super(get_exec_env())) ++ return -EPERM; ++ + helper_lock(); + if (sub_info->path[0] == '\0') + goto out; +diff --git a/kernel/kprobes.c b/kernel/kprobes.c +index 1485ca8..5b4040a 100644 +--- a/kernel/kprobes.c ++++ b/kernel/kprobes.c +@@ -118,14 +118,14 @@ static int __kprobes check_safety(void) + ret = freeze_processes(); + if (ret == 0) { + struct task_struct *p, *q; +- do_each_thread(p, q) { ++ do_each_thread_all(p, q) { + if (p != current && p->state == TASK_RUNNING && + p->pid != 0) { + printk("Check failed: %s is running\n",p->comm); + ret = -1; + goto loop_end; + } +- } while_each_thread(p, q); ++ } while_each_thread_all(p, q); + } + loop_end: + thaw_processes(); +diff --git a/kernel/lockdep.c b/kernel/lockdep.c +index 81a4e4a..f59230a 100644 +--- a/kernel/lockdep.c ++++ b/kernel/lockdep.c +@@ -3182,7 +3182,7 @@ retry: + if (count != 10) + printk(" locked it.\n"); + +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + /* + * It's not reliable to print a task's held locks + * if it's not sleeping (or if it's not the current +@@ -3195,7 +3195,7 @@ retry: + if (!unlock) + if (read_trylock(&tasklist_lock)) + unlock = 1; +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + + printk("\n"); + printk("=============================================\n\n"); +diff --git a/kernel/module.c b/kernel/module.c +index 5f80478..131c925 100644 +--- a/kernel/module.c ++++ b/kernel/module.c +@@ -2463,6 +2463,8 @@ unsigned long module_kallsyms_lookup_name(const char *name) + static void *m_start(struct seq_file *m, loff_t *pos) + { + mutex_lock(&module_mutex); ++ if (!ve_is_super(get_exec_env())) ++ return NULL; + return seq_list_start(&modules, *pos); + } + +diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c +index adc7851..1c0848f 100644 +--- a/kernel/nsproxy.c ++++ b/kernel/nsproxy.c +@@ -27,6 +27,14 @@ static struct kmem_cache *nsproxy_cachep; + + struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); + ++void get_task_namespaces(struct task_struct *tsk) ++{ ++ struct nsproxy *ns = tsk->nsproxy; ++ if (ns) { ++ get_nsproxy(ns); ++ } ++} ++ + /* + * creates a copy of "orig" 
with refcount 1. + */ +@@ -134,10 +142,12 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) + CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET))) + return 0; + ++#ifndef CONFIG_VE + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } ++#endif + + /* + * CLONE_NEWIPC must detach from the undolist: after switching +@@ -169,6 +179,7 @@ out: + put_nsproxy(old_ns); + return err; + } ++EXPORT_SYMBOL(copy_namespaces); + + void free_nsproxy(struct nsproxy *ns) + { +@@ -185,6 +196,22 @@ void free_nsproxy(struct nsproxy *ns) + put_net(ns->net_ns); + kmem_cache_free(nsproxy_cachep, ns); + } ++EXPORT_SYMBOL(free_nsproxy); ++ ++struct mnt_namespace * get_task_mnt_ns(struct task_struct *tsk) ++{ ++ struct mnt_namespace *mnt_ns = NULL; ++ ++ task_lock(tsk); ++ if (tsk->nsproxy) ++ mnt_ns = tsk->nsproxy->mnt_ns; ++ if (mnt_ns) ++ get_mnt_ns(mnt_ns); ++ task_unlock(tsk); ++ ++ return mnt_ns; ++} ++EXPORT_SYMBOL(get_task_mnt_ns); + + /* + * Called from unshare. Unshare all the namespaces part of nsproxy. +diff --git a/kernel/pid.c b/kernel/pid.c +index 20d59fa..833f5b4 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -109,7 +110,7 @@ EXPORT_SYMBOL(is_container_init); + * For now it is easier to be safe than to prove it can't happen. + */ + +-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); ++__cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); + + static void free_pidmap(struct upid *upid) + { +@@ -120,8 +121,9 @@ static void free_pidmap(struct upid *upid) + clear_bit(offset, map->page); + atomic_inc(&map->nr_free); + } ++EXPORT_SYMBOL_GPL(free_pidmap); + +-static int alloc_pidmap(struct pid_namespace *pid_ns) ++int alloc_pidmap(struct pid_namespace *pid_ns) + { + int i, offset, max_scan, pid, last = pid_ns->last_pid; + struct pidmap *map; +@@ -181,6 +183,36 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) + return -1; + } + ++int set_pidmap(struct pid_namespace *pid_ns, pid_t pid) ++{ ++ int offset; ++ struct pidmap *map; ++ ++ offset = pid & BITS_PER_PAGE_MASK; ++ map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; ++ if (unlikely(!map->page)) { ++ void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); ++ /* ++ * Free the page if someone raced with us ++ * installing it: ++ */ ++ spin_lock_irq(&pidmap_lock); ++ if (map->page) ++ kfree(page); ++ else ++ map->page = page; ++ spin_unlock_irq(&pidmap_lock); ++ if (unlikely(!map->page)) ++ return -ENOMEM; ++ } ++ ++ if (test_and_set_bit(offset, map->page)) ++ return -EBUSY; ++ ++ atomic_dec(&map->nr_free); ++ return pid; ++} ++ + int next_pidmap(struct pid_namespace *pid_ns, int last) + { + int offset; +@@ -226,25 +258,33 @@ void free_pid(struct pid *pid) + /* We can be called with write_lock_irq(&tasklist_lock) held */ + int i; + unsigned long flags; ++ struct upid *upid; + + spin_lock_irqsave(&pidmap_lock, flags); +- for (i = 0; i <= pid->level; i++) +- hlist_del_rcu(&pid->numbers[i].pid_chain); +- spin_unlock_irqrestore(&pidmap_lock, flags); ++ for (i = 0; i <= pid->level; i++) { ++ upid = &pid->numbers[i]; ++ if (!hlist_unhashed(&upid->pid_chain)) ++ hlist_del_rcu(&upid->pid_chain); ++ } ++ spin_unlock(&pidmap_lock); ++ ub_kmemsize_uncharge(pid->ub, pid->numbers[pid->level].ns->pid_cachep->objuse); ++ local_irq_restore(flags); + + for (i = 0; i <= pid->level; i++) + free_pidmap(pid->numbers + i); +- ++ put_beancounter(pid->ub); + call_rcu(&pid->rcu, delayed_put_pid); + } ++EXPORT_SYMBOL_GPL(free_pid); + +-struct pid 
*alloc_pid(struct pid_namespace *ns) ++struct pid *alloc_pid(struct pid_namespace *ns, pid_t vpid) + { + struct pid *pid; + enum pid_type type; + int i, nr; + struct pid_namespace *tmp; + struct upid *upid; ++ struct user_beancounter *ub; + + pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); + if (!pid) +@@ -252,7 +292,10 @@ struct pid *alloc_pid(struct pid_namespace *ns) + + tmp = ns; + for (i = ns->level; i >= 0; i--) { +- nr = alloc_pidmap(tmp); ++ if (vpid != 0 && i == ns->level) ++ nr = set_pidmap(tmp, vpid); ++ else ++ nr = alloc_pidmap(tmp); + if (nr < 0) + goto out_free; + +@@ -267,17 +310,32 @@ struct pid *alloc_pid(struct pid_namespace *ns) + for (type = 0; type < PIDTYPE_MAX; ++type) + INIT_HLIST_HEAD(&pid->tasks[type]); + ++#ifdef CONFIG_BEANCOUNTERS ++ ub = get_exec_ub(); ++ local_irq_disable(); ++ if (ub_kmemsize_charge(ub, ns->pid_cachep->objuse, UB_HARD)) ++ goto out_enable; ++ pid->ub = get_beancounter(ub); ++ spin_lock(&pidmap_lock); ++#else + spin_lock_irq(&pidmap_lock); ++#endif + for (i = ns->level; i >= 0; i--) { + upid = &pid->numbers[i]; + hlist_add_head_rcu(&upid->pid_chain, + &pid_hash[pid_hashfn(upid->nr, upid->ns)]); ++ if (upid->ns->flags & PID_NS_HIDDEN) ++ while (i--) ++ INIT_HLIST_NODE(&pid->numbers[i].pid_chain); + } + spin_unlock_irq(&pidmap_lock); + + out: + return pid; + ++out_enable: ++ local_irq_enable(); ++ put_pid_ns(ns); + out_free: + while (++i <= ns->level) + free_pidmap(pid->numbers + i); +@@ -286,6 +344,7 @@ out_free: + pid = NULL; + goto out; + } ++EXPORT_SYMBOL_GPL(alloc_pid); + + struct pid *find_pid_ns(int nr, struct pid_namespace *ns) + { +@@ -314,6 +373,45 @@ struct pid *find_pid(int nr) + } + EXPORT_SYMBOL_GPL(find_pid); + ++void reattach_pid(struct task_struct *tsk, enum pid_type type, ++ struct pid *pid) ++{ ++ int i; ++ struct pid *old_pid; ++ struct pid_link *link; ++ struct upid *upid; ++ ++ link = &tsk->pids[type]; ++ old_pid = link->pid; ++ ++ hlist_del_rcu(&link->node); ++ link->pid = pid; ++ hlist_add_head_rcu(&link->node, &pid->tasks[type]); ++ ++ if (type != PIDTYPE_PID) { ++ for (i = PIDTYPE_MAX; --i >= 0; ) ++ if (!hlist_empty(&old_pid->tasks[i])) ++ return; ++ ++ for (i = 0; i < pid->level; i++) ++ hlist_del_rcu(&old_pid->numbers[i].pid_chain); ++ } else { ++ for (i = PIDTYPE_MAX; --i >= 0; ) ++ if (!hlist_empty(&old_pid->tasks[i])) ++ BUG(); ++ ++ for (i = 0; i < pid->level; i++) ++ hlist_replace_rcu(&old_pid->numbers[i].pid_chain, ++ &pid->numbers[i].pid_chain); ++ ++ upid = &pid->numbers[pid->level]; ++ hlist_add_head_rcu(&upid->pid_chain, ++ &pid_hash[pid_hashfn(upid->nr, upid->ns)]); ++ } ++ ++ call_rcu(&old_pid->rcu, delayed_put_pid); ++} ++ + /* + * attach_pid() must be called with the tasklist_lock write-held. 
+ */ +@@ -326,6 +424,7 @@ void attach_pid(struct task_struct *task, enum pid_type type, + link->pid = pid; + hlist_add_head_rcu(&link->node, &pid->tasks[type]); + } ++EXPORT_SYMBOL_GPL(attach_pid); + + static void __change_pid(struct task_struct *task, enum pid_type type, + struct pid *new) +@@ -346,6 +445,7 @@ static void __change_pid(struct task_struct *task, enum pid_type type, + + free_pid(pid); + } ++EXPORT_SYMBOL_GPL(detach_pid); + + void detach_pid(struct task_struct *task, enum pid_type type) + { +@@ -498,6 +598,17 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) + } + EXPORT_SYMBOL_GPL(find_get_pid); + ++pid_t pid_to_vpid(pid_t nr) ++{ ++ struct pid *pid; ++ ++ pid = find_pid(nr); ++ if (pid) ++ return pid->numbers[pid->level].nr; ++ return -1; ++} ++EXPORT_SYMBOL_GPL(pid_to_vpid); ++ + /* + * The pid hash table is scaled according to the amount of memory in the + * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or +diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c +index 98702b4..c478b80 100644 +--- a/kernel/pid_namespace.c ++++ b/kernel/pid_namespace.c +@@ -13,6 +13,8 @@ + #include + #include + ++#include ++ + #define BITS_PER_PAGE (PAGE_SIZE*8) + + struct pid_cache { +@@ -87,6 +89,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level) + ns->last_pid = 0; + ns->child_reaper = NULL; + ns->level = level; ++ ns->flags = 0; + + set_bit(0, ns->pidmap[0].page); + atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); +@@ -151,6 +154,160 @@ void free_pid_ns(struct kref *kref) + put_pid_ns(parent); + } + ++/* ++ * this is a dirty ugly hack. ++ */ ++ ++static int __pid_ns_attach_task(struct pid_namespace *ns, ++ struct task_struct *tsk, pid_t nr) ++{ ++ struct pid *pid; ++ enum pid_type type; ++ unsigned long old_size, new_size; ++ ++ pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); ++ if (!pid) ++ goto out; ++ ++ if (nr == 0) ++ nr = alloc_pidmap(ns); ++ else ++ nr = set_pidmap(ns, nr); ++ ++ if (nr < 0) ++ goto out_free; ++ ++ memcpy(pid, task_pid(tsk), ++ sizeof(struct pid) + (ns->level - 1) * sizeof(struct upid)); ++ get_pid_ns(ns); ++ pid->level++; ++ BUG_ON(pid->level != ns->level); ++ pid->numbers[pid->level].nr = nr; ++ pid->numbers[pid->level].ns = ns; ++ atomic_set(&pid->count, 1); ++ for (type = 0; type < PIDTYPE_MAX; ++type) ++ INIT_HLIST_HEAD(&pid->tasks[type]); ++ ++ old_size = pid->numbers[pid->level - 1].ns->pid_cachep->objuse; ++ new_size = pid->numbers[pid->level].ns->pid_cachep->objuse; ++ local_irq_disable(); ++ /* ++ * Depending on sizeof(struct foo), cache flags (redzoning, etc) ++ * and actual CPU (cacheline_size() jump from 64 to 128 bytes after ++ * CPU detection) new size can very well be smaller than old size. 
++ */ ++ if (new_size > old_size) { ++ if (ub_kmemsize_charge(pid->ub, new_size - old_size, UB_HARD) < 0) ++ goto out_enable; ++ } else ++ ub_kmemsize_uncharge(pid->ub, old_size - new_size); ++ ++ write_lock(&tasklist_lock); ++ ++ spin_lock(&pidmap_lock); ++ reattach_pid(tsk, PIDTYPE_SID, pid); ++ set_task_session(tsk, pid_nr(pid)); ++ reattach_pid(tsk, PIDTYPE_PGID, pid); ++ tsk->signal->__pgrp = pid_nr(pid); ++ current->signal->tty_old_pgrp = NULL; ++ ++ reattach_pid(tsk, PIDTYPE_PID, pid); ++ spin_unlock(&pidmap_lock); ++ ++ write_unlock_irq(&tasklist_lock); ++ ++ return 0; ++ ++out_enable: ++ local_irq_enable(); ++ put_pid_ns(ns); ++out_free: ++ kmem_cache_free(ns->pid_cachep, pid); ++out: ++ return -ENOMEM; ++} ++ ++int pid_ns_attach_task(struct pid_namespace *ns, struct task_struct *tsk) ++{ ++ return __pid_ns_attach_task(ns, tsk, 0); ++} ++EXPORT_SYMBOL_GPL(pid_ns_attach_task); ++ ++int pid_ns_attach_init(struct pid_namespace *ns, struct task_struct *tsk) ++{ ++ int err; ++ ++ err = __pid_ns_attach_task(ns, tsk, 1); ++ if (err < 0) ++ return err; ++ ++ ns->child_reaper = tsk; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(pid_ns_attach_init); ++ ++#ifdef CONFIG_VE ++static noinline void show_lost_task(struct task_struct *p) ++{ ++ char buf[512] = "N/A"; ++#ifdef CONFIG_PROC_FS ++ extern char * task_sig(struct task_struct *p, char *buffer); ++ ++ task_sig(p, buf); ++#endif ++ printk("Lost task: %d/%s/%p\nSignals:%s\n", p->pid, p->comm, p, buf); ++} ++ ++static void zap_ve_processes(struct ve_struct *env) ++{ ++ /* ++ * Here the VE changes its state into "not running". ++ * op_sem taken for write is a barrier to all VE manipulations from ++ * ioctl: it waits for operations currently in progress and blocks all ++ * subsequent operations until is_running is set to 0 and op_sem is ++ * released. ++ */ ++ down_write(&env->op_sem); ++ env->is_running = 0; ++ up_write(&env->op_sem); ++ ++ /* wait for all init childs exit */ ++ while (atomic_read(&env->pcounter) > 1) { ++ struct task_struct *g, *p; ++ long delay = 1; ++ ++ if (sys_wait4(-1, NULL, __WALL | WNOHANG, NULL) > 0) ++ continue; ++ /* it was ENOCHLD or no more children somehow */ ++ if (atomic_read(&env->pcounter) == 1) ++ break; ++ ++ /* clear all signals to avoid wakeups */ ++ if (signal_pending(current)) ++ flush_signals(current); ++ /* we have child without signal sent */ ++ __set_current_state(TASK_INTERRUPTIBLE); ++ schedule_timeout(delay); ++ delay = (delay < HZ) ? (delay << 1) : HZ; ++ read_lock(&tasklist_lock); ++ do_each_thread_ve(g, p) { ++ if (p != current) { ++ /* ++ * by that time no processes other then entered ++ * may exist in the VE. 
if some were missed by ++ * zap_pid_ns_processes() this was a BUG ++ */ ++ if (!p->did_ve_enter) ++ show_lost_task(p); ++ ++ force_sig_specific(SIGKILL, p); ++ } ++ } while_each_thread_ve(g, p); ++ read_unlock(&tasklist_lock); ++ } ++} ++#endif ++ + void zap_pid_ns_processes(struct pid_namespace *pid_ns) + { + int nr; +@@ -183,6 +340,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) + } while (rc != -ECHILD); + + ++#ifdef CONFIG_VE ++ zap_ve_processes(get_exec_env()); ++#endif + /* Child reaper for the pid namespace is going away */ + pid_ns->child_reaper = NULL; + return; +diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c +index dbd8398..f331727 100644 +--- a/kernel/posix-timers.c ++++ b/kernel/posix-timers.c +@@ -31,6 +31,8 @@ + * POSIX clocks & timers + */ + #include ++#include ++#include + #include + #include + #include +@@ -46,6 +48,9 @@ + #include + #include + #include ++#include ++ ++#include + + /* + * Management arrays for POSIX timers. Timers are kept in slab memory +@@ -240,8 +245,8 @@ static __init int init_posix_timers(void) + register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); + + posix_timers_cache = kmem_cache_create("posix_timers_cache", +- sizeof (struct k_itimer), 0, SLAB_PANIC, +- NULL); ++ sizeof (struct k_itimer), 0, ++ SLAB_PANIC|SLAB_UBC, NULL); + idr_init(&posix_timers_id); + return 0; + } +@@ -298,6 +303,13 @@ void do_schedule_next_timer(struct siginfo *info) + + int posix_timer_event(struct k_itimer *timr,int si_private) + { ++ int ret; ++ struct ve_struct *ve; ++ struct user_beancounter *ub; ++ ++ ve = set_exec_env(timr->it_process->ve_task_info.owner_env); ++ ub = set_exec_ub(timr->it_process->task_bc.task_ub); ++ + memset(&timr->sigq->info, 0, sizeof(siginfo_t)); + timr->sigq->info.si_sys_private = si_private; + /* Send signal to the process that owns this timer.*/ +@@ -310,10 +322,10 @@ int posix_timer_event(struct k_itimer *timr,int si_private) + + if (timr->it_sigev_notify & SIGEV_THREAD_ID) { + struct task_struct *leader; +- int ret = send_sigqueue(timr->sigq, timr->it_process, 0); ++ ret = send_sigqueue(timr->sigq, timr->it_process, 0); + + if (likely(ret >= 0)) +- return ret; ++ goto out; + + timr->it_sigev_notify = SIGEV_SIGNAL; + leader = timr->it_process->group_leader; +@@ -321,7 +333,11 @@ int posix_timer_event(struct k_itimer *timr,int si_private) + timr->it_process = leader; + } + +- return send_sigqueue(timr->sigq, timr->it_process, 1); ++ ret = send_sigqueue(timr->sigq, timr->it_process, 1); ++out: ++ (void)set_exec_ub(ub); ++ (void)set_exec_env(ve); ++ return ret; + } + EXPORT_SYMBOL_GPL(posix_timer_event); + +diff --git a/kernel/power/process.c b/kernel/power/process.c +index f1d0b34..6bfb79f 100644 +--- a/kernel/power/process.c ++++ b/kernel/power/process.c +@@ -14,6 +14,8 @@ + #include + #include + ++static atomic_t global_suspend = ATOMIC_INIT(0); ++ + /* + * Timeout for stopping processes + */ +@@ -26,7 +28,9 @@ static inline int freezeable(struct task_struct * p) + { + if ((p == current) || + (p->flags & PF_NOFREEZE) || +- (p->exit_state != 0)) ++ (p->exit_state != 0) || ++ (p->state == TASK_STOPPED) || ++ (p->state == TASK_TRACED)) + return 0; + return 1; + } +@@ -50,6 +54,28 @@ void refrigerator(void) + processes around? 
*/ + long save; + ++#if defined(CONFIG_VZ_CHECKPOINT) || defined(CONFIG_VZ_CHECKPOINT_MODULE) ++ save = current->state; ++ current->state = TASK_UNINTERRUPTIBLE; ++ ++ spin_lock_irq(¤t->sighand->siglock); ++ if (test_and_clear_thread_flag(TIF_FREEZE)) { ++ recalc_sigpending(); /* We sent fake signal, clean it up */ ++ if (atomic_read(&global_suspend) || ++ atomic_read(&get_exec_env()->suspend)) ++ current->flags |= PF_FROZEN; ++ else ++ current->state = save; ++ } else { ++ /* Freeze request could be canceled before we entered ++ * refrigerator(). In this case we do nothing. */ ++ current->state = save; ++ } ++ spin_unlock_irq(¤t->sighand->siglock); ++ ++ while (current->flags & PF_FROZEN) ++ schedule(); ++#else + task_lock(current); + if (freezing(current)) { + frozen_process(); +@@ -71,6 +97,7 @@ void refrigerator(void) + break; + schedule(); + } ++#endif + pr_debug("%s left refrigerator\n", current->comm); + __set_current_state(save); + } +@@ -171,7 +198,7 @@ static int try_to_freeze_tasks(int freeze_user_space) + do { + todo = 0; + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + if (frozen(p) || !freezeable(p)) + continue; + +@@ -187,7 +214,7 @@ static int try_to_freeze_tasks(int freeze_user_space) + if (!task_is_stopped_or_traced(p) && + !freezer_should_skip(p)) + todo++; +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + read_unlock(&tasklist_lock); + yield(); /* Yield is okay here */ + if (time_after(jiffies, end_time)) +@@ -211,13 +238,13 @@ static int try_to_freeze_tasks(int freeze_user_space) + elapsed_csecs / 100, elapsed_csecs % 100, todo); + show_state(); + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + task_lock(p); + if (freezing(p) && !freezer_should_skip(p)) + printk(KERN_ERR " %s\n", p->comm); + cancel_freezing(p); + task_unlock(p); +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + read_unlock(&tasklist_lock); + } else { + printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, +@@ -234,6 +261,7 @@ int freeze_processes(void) + { + int error; + ++ atomic_inc(&global_suspend); + printk("Freezing user space processes ... "); + error = try_to_freeze_tasks(FREEZER_USER_SPACE); + if (error) +@@ -248,6 +276,7 @@ int freeze_processes(void) + Exit: + BUG_ON(in_atomic()); + printk("\n"); ++ atomic_dec(&global_suspend); + return error; + } + +@@ -256,15 +285,17 @@ static void thaw_tasks(int thaw_user_space) + struct task_struct *g, *p; + + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + if (!freezeable(p)) + continue; + + if (!p->mm == thaw_user_space) + continue; + +- thaw_process(p); +- } while_each_thread(g, p); ++ if (!thaw_process(p)) ++ printk(KERN_WARNING " Strange, %s not stopped\n", ++ p->comm ); ++ } while_each_thread_all(g, p); + read_unlock(&tasklist_lock); + } + +diff --git a/kernel/printk.c b/kernel/printk.c +index e2129e8..f472b1a 100644 +--- a/kernel/printk.c ++++ b/kernel/printk.c +@@ -31,7 +31,9 @@ + #include + #include + #include ++#include + #include ++#include + + #include + +@@ -90,7 +92,7 @@ static int console_locked, console_suspended; + * It is also used in interesting ways to provide interlocking in + * release_console_sem(). 
+ */ +-static DEFINE_SPINLOCK(logbuf_lock); ++DEFINE_SPINLOCK(logbuf_lock); + + #define LOG_BUF_MASK (log_buf_len-1) + #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) +@@ -124,6 +126,7 @@ static int preferred_console = -1; + + /* Flag: console code may call schedule() */ + static int console_may_schedule; ++int console_silence_loglevel; + + #ifdef CONFIG_PRINTK + +@@ -132,6 +135,19 @@ static char *log_buf = __log_buf; + static int log_buf_len = __LOG_BUF_LEN; + static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ + ++static int __init setup_console_silencelevel(char *str) ++{ ++ int level; ++ ++ if (get_option(&str, &level) != 1) ++ return 0; ++ ++ console_silence_loglevel = level; ++ return 1; ++} ++ ++__setup("silencelevel=", setup_console_silencelevel); ++ + static int __init log_buf_len_setup(char *str) + { + unsigned size = memparse(str, &str); +@@ -302,6 +318,9 @@ int do_syslog(int type, char __user *buf, int len) + char c; + int error = 0; + ++ if (!ve_is_super(get_exec_env()) && (type == 6 || type == 7)) ++ goto out; ++ + error = security_syslog(type); + if (error) + return error; +@@ -322,15 +341,15 @@ int do_syslog(int type, char __user *buf, int len) + error = -EFAULT; + goto out; + } +- error = wait_event_interruptible(log_wait, +- (log_start - log_end)); ++ error = wait_event_interruptible(ve_log_wait, ++ (ve_log_start - ve_log_end)); + if (error) + goto out; + i = 0; + spin_lock_irq(&logbuf_lock); +- while (!error && (log_start != log_end) && i < len) { +- c = LOG_BUF(log_start); +- log_start++; ++ while (!error && (ve_log_start != ve_log_end) && i < len) { ++ c = VE_LOG_BUF(ve_log_start); ++ ve_log_start++; + spin_unlock_irq(&logbuf_lock); + error = __put_user(c,buf); + buf++; +@@ -356,15 +375,17 @@ int do_syslog(int type, char __user *buf, int len) + error = -EFAULT; + goto out; + } ++ if (ve_log_buf == NULL) ++ goto out; + count = len; +- if (count > log_buf_len) +- count = log_buf_len; ++ if (count > ve_log_buf_len) ++ count = ve_log_buf_len; + spin_lock_irq(&logbuf_lock); +- if (count > logged_chars) +- count = logged_chars; ++ if (count > ve_logged_chars) ++ count = ve_logged_chars; + if (do_clear) +- logged_chars = 0; +- limit = log_end; ++ ve_logged_chars = 0; ++ limit = ve_log_end; + /* + * __put_user() could sleep, and while we sleep + * printk() could overwrite the messages +@@ -373,9 +394,9 @@ int do_syslog(int type, char __user *buf, int len) + */ + for (i = 0; i < count && !error; i++) { + j = limit-1-i; +- if (j + log_buf_len < log_end) ++ if (j + ve_log_buf_len < ve_log_end) + break; +- c = LOG_BUF(j); ++ c = VE_LOG_BUF(j); + spin_unlock_irq(&logbuf_lock); + error = __put_user(c,&buf[count-1-i]); + cond_resched(); +@@ -399,7 +420,7 @@ int do_syslog(int type, char __user *buf, int len) + } + break; + case 5: /* Clear ring buffer */ +- logged_chars = 0; ++ ve_logged_chars = 0; + break; + case 6: /* Disable logging to console */ + console_loglevel = minimum_console_loglevel; +@@ -411,16 +432,19 @@ int do_syslog(int type, char __user *buf, int len) + error = -EINVAL; + if (len < 1 || len > 8) + goto out; ++ error = 0; ++ /* VE has no console, so return success */ ++ if (!ve_is_super(get_exec_env())) ++ goto out; + if (len < minimum_console_loglevel) + len = minimum_console_loglevel; + console_loglevel = len; +- error = 0; + break; + case 9: /* Number of chars in the log buffer */ +- error = log_end - log_start; ++ error = ve_log_end - ve_log_start; + break; + case 10: /* Size of the log buffer */ +- error = log_buf_len; 
++ error = ve_log_buf_len; + break; + default: + error = -EINVAL; +@@ -531,14 +555,14 @@ static void call_console_drivers(unsigned start, unsigned end) + + static void emit_log_char(char c) + { +- LOG_BUF(log_end) = c; +- log_end++; +- if (log_end - log_start > log_buf_len) +- log_start = log_end - log_buf_len; +- if (log_end - con_start > log_buf_len) +- con_start = log_end - log_buf_len; +- if (logged_chars < log_buf_len) +- logged_chars++; ++ VE_LOG_BUF(ve_log_end) = c; ++ ve_log_end++; ++ if (ve_log_end - ve_log_start > ve_log_buf_len) ++ ve_log_start = ve_log_end - ve_log_buf_len; ++ if (ve_is_super(get_exec_env()) && ve_log_end - con_start > ve_log_buf_len) ++ con_start = ve_log_end - ve_log_buf_len; ++ if (ve_logged_chars < ve_log_buf_len) ++ ve_logged_chars++; + } + + /* +@@ -604,6 +628,30 @@ static int have_callable_console(void) + * printf(3) + */ + ++static inline int ve_log_init(void) ++{ ++#ifdef CONFIG_VE ++ if (ve_log_buf != NULL) ++ return 0; ++ ++ if (ve_is_super(get_exec_env())) { ++ ve0._log_wait = &log_wait; ++ ve0._log_start = &log_start; ++ ve0._log_end = &log_end; ++ ve0._logged_chars = &logged_chars; ++ ve0.log_buf = log_buf; ++ return 0; ++ } ++ ++ ve_log_buf = kmalloc(ve_log_buf_len, GFP_ATOMIC); ++ if (!ve_log_buf) ++ return -ENOMEM; ++ ++ memset(ve_log_buf, 0, ve_log_buf_len); ++#endif ++ return 0; ++} ++ + asmlinkage int printk(const char *fmt, ...) + { + va_list args; +@@ -670,7 +718,7 @@ static const char printk_recursion_bug_msg [] = + KERN_CRIT "BUG: recent printk recursion!\n"; + static int printk_recursion_bug; + +-asmlinkage int vprintk(const char *fmt, va_list args) ++asmlinkage int __vprintk(const char *fmt, va_list args) + { + static int log_level_unknown = 1; + static char printk_buf[1024]; +@@ -679,6 +727,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) + int printed_len = 0; + int this_cpu; + char *p; ++ int err, need_wake; + + boot_delay_msec(); + +@@ -709,6 +758,12 @@ asmlinkage int vprintk(const char *fmt, va_list args) + spin_lock(&logbuf_lock); + printk_cpu = this_cpu; + ++ err = ve_log_init(); ++ if (err) { ++ spin_unlock_irqrestore(&logbuf_lock, flags); ++ return err; ++ } ++ + if (printk_recursion_bug) { + printk_recursion_bug = 0; + strcpy(printk_buf, printk_recursion_bug_msg); +@@ -785,7 +840,12 @@ asmlinkage int vprintk(const char *fmt, va_list args) + * will release 'logbuf_lock' regardless of whether it + * actually gets the semaphore or not. + */ +- if (acquire_console_semaphore_for_printk(this_cpu)) ++ if (!ve_is_super(get_exec_env())) { ++ need_wake = (ve_log_start != ve_log_end); ++ spin_unlock_irqrestore(&logbuf_lock, flags); ++ if (!oops_in_progress && need_wake) ++ wake_up_interruptible(&ve_log_wait); ++ } else if (acquire_console_semaphore_for_printk(this_cpu)) + release_console_sem(); + + lockdep_on(); +@@ -798,6 +858,41 @@ out_restore_irqs: + EXPORT_SYMBOL(printk); + EXPORT_SYMBOL(vprintk); + ++asmlinkage int vprintk(const char *fmt, va_list args) ++{ ++ int i; ++ struct ve_struct *env; ++ ++ env = set_exec_env(get_ve0()); ++ i = __vprintk(fmt, args); ++ (void)set_exec_env(env); ++ return i; ++} ++ ++asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args) ++{ ++ int printed_len; ++ ++ printed_len = 0; ++ if (ve_is_super(get_exec_env()) || (dst & VE0_LOG)) ++ printed_len = vprintk(fmt, args); ++ if (!ve_is_super(get_exec_env()) && (dst & VE_LOG)) ++ printed_len = __vprintk(fmt, args); ++ return printed_len; ++} ++ ++asmlinkage int ve_printk(int dst, const char *fmt, ...) 
++{ ++ va_list args; ++ int printed_len; ++ ++ va_start(args, fmt); ++ printed_len = ve_vprintk(dst, fmt, args); ++ va_end(args); ++ return printed_len; ++} ++EXPORT_SYMBOL(ve_printk); ++ + #else + + asmlinkage long sys_syslog(int type, char __user *buf, int len) +@@ -1346,6 +1441,36 @@ int printk_ratelimit(void) + } + EXPORT_SYMBOL(printk_ratelimit); + ++/* ++ * Rate limiting stuff. ++ */ ++int vz_ratelimit(struct vz_rate_info *p) ++{ ++ unsigned long cjif, djif; ++ unsigned long flags; ++ static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; ++ long new_bucket; ++ ++ spin_lock_irqsave(&ratelimit_lock, flags); ++ cjif = jiffies; ++ djif = cjif - p->last; ++ if (djif < p->interval) { ++ if (p->bucket >= p->burst) { ++ spin_unlock_irqrestore(&ratelimit_lock, flags); ++ return 0; ++ } ++ p->bucket++; ++ } else { ++ new_bucket = p->bucket - (djif / (unsigned)p->interval); ++ if (new_bucket < 0) ++ new_bucket = 0; ++ p->bucket = new_bucket + 1; ++ } ++ p->last = cjif; ++ spin_unlock_irqrestore(&ratelimit_lock, flags); ++ return 1; ++} ++ + /** + * printk_timed_ratelimit - caller-controlled printk ratelimiting + * @caller_jiffies: pointer to caller's state +diff --git a/kernel/ptrace.c b/kernel/ptrace.c +index 6c19e94..df8f075 100644 +--- a/kernel/ptrace.c ++++ b/kernel/ptrace.c +@@ -132,6 +132,8 @@ int __ptrace_may_attach(struct task_struct *task) + * or halting the specified task is impossible. + */ + int dumpable = 0; ++ int vps_dumpable = 0; ++ + /* Don't let security modules deny introspection */ + if (task == current) + return 0; +@@ -143,11 +145,17 @@ int __ptrace_may_attach(struct task_struct *task) + (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) + return -EPERM; + smp_rmb(); +- if (task->mm) ++ if (task->mm) { + dumpable = get_dumpable(task->mm); ++ vps_dumpable = (task->mm->vps_dumpable == 1); ++ } ++ + if (!dumpable && !capable(CAP_SYS_PTRACE)) + return -EPERM; +- ++ if (!vps_dumpable && !ve_is_super(get_exec_env())) ++ return -EPERM; ++ if (!ve_accessible(VE_TASK_INFO(task)->owner_env, get_exec_env())) ++ return -EPERM; + return security_ptrace(current, task); + } + +@@ -198,6 +206,8 @@ repeat: + retval = __ptrace_may_attach(task); + if (retval) + goto bad; ++ if (task->mm->vps_dumpable == 2) ++ goto bad; + + /* Go */ + task->ptrace |= PT_PTRACED; +@@ -291,6 +301,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds + } + return copied; + } ++EXPORT_SYMBOL_GPL(access_process_vm); + + static int ptrace_setoptions(struct task_struct *child, long data) + { +diff --git a/kernel/sched.c b/kernel/sched.c +index 4e2f603..57a7d99 100644 +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -70,6 +70,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -340,6 +341,8 @@ static inline struct task_group *task_group(struct task_struct *p) + #elif defined(CONFIG_CGROUP_SCHED) + tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), + struct task_group, css); ++#elif defined(CONFIG_VZ_FAIRSCHED) ++ tg = p->fsched_node->tg; + #else + tg = &init_task_group; + #endif +@@ -509,6 +512,9 @@ struct rq { + */ + unsigned long nr_uninterruptible; + ++ unsigned long nr_sleeping; ++ unsigned long nr_stopped; ++ + struct task_struct *curr, *idle; + unsigned long next_balance; + struct mm_struct *prev_mm; +@@ -578,6 +584,11 @@ static inline int cpu_of(struct rq *rq) + #endif + } + ++struct kernel_stat_glob kstat_glob; ++DEFINE_SPINLOCK(kstat_glb_lock); ++EXPORT_SYMBOL(kstat_glob); ++EXPORT_SYMBOL(kstat_glb_lock); ++ + /* + * The domain tree 
(rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. +@@ -981,6 +992,217 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) + spin_unlock_irqrestore(&rq->lock, *flags); + } + ++#ifdef CONFIG_VE ++static inline void ve_nr_iowait_inc(struct ve_struct *ve, int cpu) ++{ ++ VE_CPU_STATS(ve, cpu)->nr_iowait++; ++} ++ ++static inline void ve_nr_iowait_dec(struct ve_struct *ve, int cpu) ++{ ++ VE_CPU_STATS(ve, cpu)->nr_iowait--; ++} ++ ++static inline void ve_nr_unint_inc(struct ve_struct *ve, int cpu) ++{ ++ VE_CPU_STATS(ve, cpu)->nr_unint++; ++} ++ ++static inline void ve_nr_unint_dec(struct ve_struct *ve, int cpu) ++{ ++ VE_CPU_STATS(ve, cpu)->nr_unint--; ++} ++ ++#define cycles_after(a, b) ((long long)(b) - (long long)(a) < 0) ++ ++cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu) ++{ ++ struct ve_cpu_stats *ve_stat; ++ unsigned v; ++ cycles_t strt, ret, cycles; ++ ++ ve_stat = VE_CPU_STATS(ve, cpu); ++ do { ++ v = read_seqcount_begin(&ve_stat->stat_lock); ++ ret = ve_stat->idle_time; ++ strt = ve_stat->strt_idle_time; ++ if (strt && nr_uninterruptible_ve(ve) == 0) { ++ cycles = get_cycles(); ++ if (cycles_after(cycles, strt)) ++ ret += cycles - strt; ++ } ++ } while (read_seqcount_retry(&ve_stat->stat_lock, v)); ++ return ret; ++} ++EXPORT_SYMBOL(ve_sched_get_idle_time); ++ ++cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu) ++{ ++ struct ve_cpu_stats *ve_stat; ++ unsigned v; ++ cycles_t strt, ret, cycles; ++ ++ ve_stat = VE_CPU_STATS(ve, cpu); ++ do { ++ v = read_seqcount_begin(&ve_stat->stat_lock); ++ ret = ve_stat->iowait_time; ++ strt = ve_stat->strt_idle_time; ++ if (strt && nr_iowait_ve(ve) > 0) { ++ cycles = get_cycles(); ++ if (cycles_after(cycles, strt)) ++ ret += cycles - strt; ++ } ++ } while (read_seqcount_retry(&ve_stat->stat_lock, v)); ++ return ret; ++} ++EXPORT_SYMBOL(ve_sched_get_iowait_time); ++ ++static void ve_stop_idle(struct ve_struct *ve, unsigned int cpu, cycles_t cycles) ++{ ++ struct ve_cpu_stats *ve_stat; ++ ++ ve_stat = VE_CPU_STATS(ve, cpu); ++ ++ write_seqcount_begin(&ve_stat->stat_lock); ++ if (ve_stat->strt_idle_time) { ++ if (cycles_after(cycles, ve_stat->strt_idle_time)) { ++ if (nr_iowait_ve(ve) == 0) ++ ve_stat->idle_time += ++ cycles - ve_stat->strt_idle_time; ++ else ++ ve_stat->iowait_time += ++ cycles - ve_stat->strt_idle_time; ++ } ++ ve_stat->strt_idle_time = 0; ++ } ++ write_seqcount_end(&ve_stat->stat_lock); ++} ++ ++static void ve_strt_idle(struct ve_struct *ve, unsigned int cpu, cycles_t cycles) ++{ ++ struct ve_cpu_stats *ve_stat; ++ ++ ve_stat = VE_CPU_STATS(ve, cpu); ++ ++ write_seqcount_begin(&ve_stat->stat_lock); ++ ve_stat->strt_idle_time = cycles; ++ write_seqcount_end(&ve_stat->stat_lock); ++} ++ ++static inline void ve_nr_running_inc(struct ve_struct *ve, int cpu, cycles_t cycles) ++{ ++ if (++VE_CPU_STATS(ve, cpu)->nr_running == 1) ++ ve_stop_idle(ve, cpu, cycles); ++} ++ ++static inline void ve_nr_running_dec(struct ve_struct *ve, int cpu, cycles_t cycles) ++{ ++ if (--VE_CPU_STATS(ve, cpu)->nr_running == 0) ++ ve_strt_idle(ve, cpu, cycles); ++} ++ ++void ve_sched_attach(struct ve_struct *target_ve) ++{ ++ struct task_struct *tsk; ++ unsigned int cpu; ++ cycles_t cycles; ++ ++ tsk = current; ++ preempt_disable(); ++ cycles = get_cycles(); ++ cpu = task_cpu(tsk); ++ ve_nr_running_dec(VE_TASK_INFO(tsk)->owner_env, cpu, cycles); ++ ve_nr_running_inc(target_ve, cpu, cycles); ++ preempt_enable(); ++} 
++EXPORT_SYMBOL(ve_sched_attach); ++ ++static inline void write_wakeup_stamp(struct task_struct *p, cycles_t cyc) ++{ ++ struct ve_task_info *ti; ++ ++ ti = VE_TASK_INFO(p); ++ write_seqcount_begin(&ti->wakeup_lock); ++ ti->wakeup_stamp = cyc; ++ write_seqcount_end(&ti->wakeup_lock); ++} ++ ++static inline void update_sched_lat(struct task_struct *t, cycles_t cycles) ++{ ++ int cpu; ++ cycles_t ve_wstamp; ++ ++ /* safe due to runqueue lock */ ++ cpu = smp_processor_id(); ++ ve_wstamp = t->ve_task_info.wakeup_stamp; ++ ++ if (ve_wstamp && cycles > ve_wstamp) { ++ KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat, ++ cpu, cycles - ve_wstamp); ++ KSTAT_LAT_PCPU_ADD(&t->ve_task_info.exec_env->sched_lat_ve, ++ cpu, cycles - ve_wstamp); ++ } ++} ++ ++static inline void update_ve_task_info(struct task_struct *prev, cycles_t cycles) ++{ ++#ifdef CONFIG_FAIRSCHED ++ if (prev != this_pcpu()->idle) { ++#else ++ if (prev != this_rq()->idle) { ++#endif ++ VE_CPU_STATS(prev->ve_task_info.owner_env, ++ smp_processor_id())->used_time += ++ cycles - prev->ve_task_info.sched_time; ++ ++ prev->ve_task_info.sched_time = cycles; ++ } ++} ++#else ++static inline void ve_nr_running_inc(struct ve_struct, int cpu, cycles_t cycles) ++{ ++} ++ ++static inline void ve_nr_running_dec(struct ve_struct, int cpu, cycles_t cycles) ++{ ++} ++ ++static inline void ve_nr_iowait_inc(struct ve_struct *ve, int cpu) ++{ ++} ++ ++static inline void ve_nr_iowait_dec(struct ve_struct *ve, int cpu) ++{ ++} ++ ++static inline void ve_nr_unint_inc(struct ve_struct *ve, int cpu) ++{ ++} ++ ++static inline void ve_nr_unint_dec(struct ve_struct *ve, int cpu) ++{ ++} ++ ++static inline void update_ve_task_info(struct task_struct *prev, cycles_t cycles) ++{ ++} ++#endif ++ ++struct task_nrs_struct { ++ long nr_running; ++ long nr_unint; ++ long nr_stopped; ++ long nr_sleeping; ++ long nr_iowait; ++ long long nr_switches; ++} ____cacheline_aligned_in_smp; ++ ++unsigned long nr_zombie = 0; /* protected by tasklist_lock */ ++EXPORT_SYMBOL(nr_zombie); ++ ++atomic_t nr_dead = ATOMIC_INIT(0); ++EXPORT_SYMBOL(nr_dead); ++ + /* + * this_rq_lock - lock this runqueue and disable interrupts. 
+ */ +@@ -1608,11 +1830,21 @@ static int effective_prio(struct task_struct *p) + */ + static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) + { +- if (task_contributes_to_load(p)) ++ cycles_t cycles; ++ ++#ifdef CONFIG_VE ++ cycles = get_cycles(); ++ write_wakeup_stamp(p, cycles); ++ p->ve_task_info.sleep_time += cycles; ++#endif ++ if (task_contributes_to_load(p)) { + rq->nr_uninterruptible--; ++ ve_nr_unint_dec(VE_TASK_INFO(p)->owner_env, task_cpu(p)); ++ } + + enqueue_task(rq, p, wakeup); + inc_nr_running(p, rq); ++ ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, task_cpu(p), cycles); + } + + /* +@@ -1620,6 +1852,30 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) + */ + static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) + { ++ cycles_t cycles; ++#ifdef CONFIG_VE ++ unsigned int cpu, pcpu; ++ struct ve_struct *ve; ++ ++ cycles = get_cycles(); ++ cpu = task_cpu(p); ++ pcpu = smp_processor_id(); ++ ve = p->ve_task_info.owner_env; ++ ++ p->ve_task_info.sleep_time -= cycles; ++#endif ++ if (p->state == TASK_UNINTERRUPTIBLE) { ++ ve_nr_unint_inc(ve, cpu); ++ } ++ if (p->state == TASK_INTERRUPTIBLE) { ++ rq->nr_sleeping++; ++ } ++ if (p->state == TASK_STOPPED) { ++ rq->nr_stopped++; ++ } ++ ++ ve_nr_running_dec(VE_TASK_INFO(p)->owner_env, cpu, cycles); ++ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible++; + +@@ -1843,6 +2099,7 @@ void wait_task_inactive(struct task_struct *p) + break; + } + } ++EXPORT_SYMBOL_GPL(wait_task_inactive); + + /*** + * kick_process - kick a running thread to enter/exit the kernel +@@ -2248,6 +2505,10 @@ void sched_fork(struct task_struct *p, int clone_flags) + /* Want to start with kernel preemption disabled. */ + task_thread_info(p)->preempt_count = 1; + #endif ++#ifdef CONFIG_VE ++ /* cosmetic: sleep till wakeup below */ ++ p->ve_task_info.sleep_time -= get_cycles(); ++#endif + put_cpu(); + } + +@@ -2278,6 +2539,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) + */ + p->sched_class->task_new(rq, p); + inc_nr_running(p, rq); ++ ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, task_cpu(p), ++ get_cycles()); + } + check_preempt_curr(rq, p); + #ifdef CONFIG_SMP +@@ -2439,6 +2702,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) + if (current->set_child_tid) + put_user(task_pid_vnr(current), current->set_child_tid); + } ++EXPORT_SYMBOL_GPL(schedule_tail); + + /* + * context_switch - switch to the new MM and the new +@@ -2509,6 +2773,7 @@ unsigned long nr_running(void) + + return sum; + } ++EXPORT_SYMBOL_GPL(nr_running); + + unsigned long nr_uninterruptible(void) + { +@@ -2526,6 +2791,7 @@ unsigned long nr_uninterruptible(void) + + return sum; + } ++EXPORT_SYMBOL_GPL(nr_uninterruptible); + + unsigned long long nr_context_switches(void) + { +@@ -2563,6 +2829,72 @@ unsigned long nr_active(void) + return running + uninterruptible; + } + ++unsigned long nr_stopped(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_stopped; ++ if (unlikely((long)sum < 0)) ++ sum = 0; ++ return sum; ++} ++EXPORT_SYMBOL(nr_stopped); ++ ++unsigned long nr_sleeping(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_sleeping; ++ if (unlikely((long)sum < 0)) ++ sum = 0; ++ return sum; ++} ++EXPORT_SYMBOL(nr_sleeping); ++ ++#ifdef CONFIG_VE ++unsigned long nr_running_ve(struct ve_struct *ve) ++{ ++ int i; ++ long sum = 0; ++ cpumask_t ve_cpus; ++ ++ ve_cpu_online_map(ve, &ve_cpus); ++ 
for_each_cpu_mask(i, ve_cpus) ++ sum += VE_CPU_STATS(ve, i)->nr_running; ++ return (unsigned long)(sum < 0 ? 0 : sum); ++} ++EXPORT_SYMBOL(nr_running_ve); ++ ++unsigned long nr_uninterruptible_ve(struct ve_struct *ve) ++{ ++ int i; ++ long sum = 0; ++ cpumask_t ve_cpus; ++ ++ sum = 0; ++ ve_cpu_online_map(ve, &ve_cpus); ++ for_each_cpu_mask(i, ve_cpus) ++ sum += VE_CPU_STATS(ve, i)->nr_unint; ++ return (unsigned long)(sum < 0 ? 0 : sum); ++} ++EXPORT_SYMBOL(nr_uninterruptible_ve); ++ ++unsigned long nr_iowait_ve(struct ve_struct *ve) ++{ ++ int i; ++ long sum = 0; ++ cpumask_t ve_cpus; ++ ++ ve_cpu_online_map(ve, &ve_cpus); ++ for_each_cpu_mask(i, ve_cpus) ++ sum += VE_CPU_STATS(ve, i)->nr_iowait; ++ return (unsigned long)(sum < 0 ? 0 : sum); ++} ++EXPORT_SYMBOL(nr_iowait_ve); ++#endif ++ + /* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). +@@ -2593,6 +2925,16 @@ static void update_cpu_load(struct rq *this_rq) + } + } + ++#ifdef CONFIG_VE ++#define update_ve_cpu_time(p, time, tick) \ ++ do { \ ++ VE_CPU_STATS((p)->ve_task_info.owner_env, \ ++ task_cpu(p))->time += tick; \ ++ } while (0) ++#else ++#define update_ve_cpu_time(p, time, tick) do { } while (0) ++#endif ++ + #ifdef CONFIG_SMP + + /* +@@ -2720,8 +3062,15 @@ void sched_exec(void) + static void pull_task(struct rq *src_rq, struct task_struct *p, + struct rq *this_rq, int this_cpu) + { ++ struct ve_struct *ve; ++ cycles_t cycles = get_cycles(); ++ ++ ve = VE_TASK_INFO(p)->owner_env; ++ + deactivate_task(src_rq, p, 0); ++ ve_nr_running_dec(ve, task_cpu(p), cycles); + set_task_cpu(p, this_cpu); ++ ve_nr_running_inc(ve, task_cpu(p), cycles); + activate_task(this_rq, p, 0); + /* + * Note that idle threads have a prio of MAX_PRIO, for this test +@@ -3891,10 +4240,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime) + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); +- if (TASK_NICE(p) > 0) ++ if (TASK_NICE(p) > 0) { + cpustat->nice = cputime64_add(cpustat->nice, tmp); +- else ++ update_ve_cpu_time(p, nice, tmp); ++ } else { + cpustat->user = cputime64_add(cpustat->user, tmp); ++ update_ve_cpu_time(p, user, tmp); ++ } + } + + /* +@@ -3948,6 +4300,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, + + /* Add system time to cpustat. 
*/ + tmp = cputime_to_cputime64(cputime); ++ update_ve_cpu_time(p, system, tmp); + if (hardirq_count() - hardirq_offset) + cpustat->irq = cputime64_add(cpustat->irq, tmp); + else if (softirq_count()) +@@ -4188,12 +4541,30 @@ need_resched_nonpreemptible: + next = pick_next_task(rq, prev); + + if (likely(prev != next)) { ++ cycles_t cycles = get_cycles(); ++ + sched_info_switch(prev, next); + + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + ++#ifdef CONFIG_VE ++ prev->ve_task_info.sleep_stamp = cycles; ++ if (prev->state == TASK_RUNNING && prev != this_rq()->idle) ++ write_wakeup_stamp(prev, cycles); ++ update_sched_lat(next, cycles); ++ ++ /* because next & prev are protected with ++ * runqueue lock we may not worry about ++ * wakeup_stamp and sched_time protection ++ * (same thing in 'else' branch below) ++ */ ++ update_ve_task_info(prev, cycles); ++ next->ve_task_info.sched_time = cycles; ++ write_wakeup_stamp(next, 0); ++#endif ++ + context_switch(rq, prev, next); /* unlocks the rq */ + /* + * the context switch might have flipped the stack from under +@@ -4201,8 +4572,10 @@ need_resched_nonpreemptible: + */ + cpu = smp_processor_id(); + rq = cpu_rq(cpu); +- } else ++ } else { ++ update_ve_task_info(prev, get_cycles()); + spin_unlock_irq(&rq->lock); ++ } + + hrtick_set(rq); + +@@ -4785,7 +5158,7 @@ recheck: + /* + * Allow unprivileged RT tasks to decrease priority: + */ +- if (!capable(CAP_SYS_NICE)) { ++ if (!capable(CAP_SYS_ADMIN)) { + if (rt_policy(policy)) { + unsigned long rlim_rtprio; + +@@ -5257,10 +5630,15 @@ EXPORT_SYMBOL(yield); + void __sched io_schedule(void) + { + struct rq *rq = &__raw_get_cpu_var(runqueues); ++#ifdef CONFIG_VE ++ struct ve_struct *ve = current->ve_task_info.owner_env; ++#endif + + delayacct_blkio_start(); + atomic_inc(&rq->nr_iowait); ++ ve_nr_iowait_inc(ve, task_cpu(current)); + schedule(); ++ ve_nr_iowait_dec(ve, task_cpu(current)); + atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); + } +@@ -5270,10 +5648,15 @@ long __sched io_schedule_timeout(long timeout) + { + struct rq *rq = &__raw_get_cpu_var(runqueues); + long ret; ++#ifdef CONFIG_VE ++ struct ve_struct *ve = current->ve_task_info.owner_env; ++#endif + + delayacct_blkio_start(); + atomic_inc(&rq->nr_iowait); ++ ve_nr_iowait_inc(ve, task_cpu(current)); + ret = schedule_timeout(timeout); ++ ve_nr_iowait_dec(ve, task_cpu(current)); + atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); + return ret; +@@ -5394,17 +5777,7 @@ void sched_show_task(struct task_struct *p) + state = p->state ? __ffs(p->state) + 1 : 0; + printk(KERN_INFO "%-13.13s %c", p->comm, + state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); +-#if BITS_PER_LONG == 32 +- if (state == TASK_RUNNING) +- printk(KERN_CONT " running "); +- else +- printk(KERN_CONT " %08lx ", thread_saved_pc(p)); +-#else +- if (state == TASK_RUNNING) +- printk(KERN_CONT " running task "); +- else +- printk(KERN_CONT " %016lx ", thread_saved_pc(p)); +-#endif ++ printk(KERN_CONT " %p ", p); + #ifdef CONFIG_DEBUG_STACK_USAGE + { + unsigned long *n = end_of_stack(p); +@@ -5425,13 +5798,13 @@ void show_state_filter(unsigned long state_filter) + + #if BITS_PER_LONG == 32 + printk(KERN_INFO +- " task PC stack pid father\n"); ++ " task taskaddr stack pid father\n"); + #else + printk(KERN_INFO +- " task PC stack pid father\n"); ++ " task taskaddr stack pid father\n"); + #endif + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take alot of time: +@@ -5439,7 +5812,7 @@ void show_state_filter(unsigned long state_filter) + touch_nmi_watchdog(); + if (!state_filter || (p->state & state_filter)) + sched_show_task(p); +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + + touch_all_softlockup_watchdogs(); + +@@ -5795,13 +6168,13 @@ static void migrate_live_tasks(int src_cpu) + + read_lock(&tasklist_lock); + +- do_each_thread(t, p) { ++ do_each_thread_all(t, p) { + if (p == current) + continue; + + if (task_cpu(p) == src_cpu) + move_task_off_dead_cpu(src_cpu, p); +- } while_each_thread(t, p); ++ } while_each_thread_all(t, p); + + read_unlock(&tasklist_lock); + } +@@ -7753,7 +8126,7 @@ void __init sched_init(void) + #ifdef CONFIG_FAIR_GROUP_SCHED + init_task_group.shares = init_task_group_load; + INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); +-#ifdef CONFIG_CGROUP_SCHED ++#if defined(CONFIG_CGROUP_SCHED) || defined(CONFIG_VZ_FAIRSCHED) + /* + * How much cpu bandwidth does init_task_group get? 
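
The substitutions above are systematic: host-wide paths such as show_state_filter() and migrate_live_tasks() switch to do_each_thread_all(), which walks every task on the machine, while per-user paths such as sys_setpriority() further down use do_each_thread_ve(), which is limited to the calling container's tasks. The two scopes in miniature (the array stands in for the kernel's task lists):

	#include <stdio.h>

	struct task { int veid; const char *comm; };

	static struct task tasks[] = {
		{ 0, "init" }, { 0, "sshd" }, { 101, "ve-init" }, { 101, "httpd" },
	};

	/* _all scope: sysrq-t and the softlockup checker must see everything */
	static void show_all(void)
	{
		int i;

		for (i = 0; i < 4; i++)
			printf("%s\n", tasks[i].comm);
	}

	/* _ve scope: a container's setpriority() must only reach its own tasks */
	static void show_ve(int veid)
	{
		int i;

		for (i = 0; i < 4; i++)
			if (tasks[i].veid == veid)
				printf("%s\n", tasks[i].comm);
	}

	int main(void)
	{
		show_all();	/* init sshd ve-init httpd */
		show_ve(101);	/* ve-init httpd */
		return 0;
	}
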
+ * +@@ -7799,7 +8172,7 @@ void __init sched_init(void) + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; + #ifdef CONFIG_RT_GROUP_SCHED + INIT_LIST_HEAD(&rq->leaf_rt_rq_list); +-#ifdef CONFIG_CGROUP_SCHED ++#if defined(CONFIG_CGROUP_SCHED) || defined(CONFIG_VZ_FAIRSCHED) + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); + #elif defined CONFIG_USER_SCHED + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); +@@ -7858,6 +8231,7 @@ void __init sched_init(void) + * During early bootup we pretend to be a normal task: + */ + current->sched_class = &fair_sched_class; ++ fairsched_init_early(); + + scheduler_running = 1; + } +@@ -7910,7 +8284,7 @@ void normalize_rt_tasks(void) + struct rq *rq; + + read_lock_irqsave(&tasklist_lock, flags); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + /* + * Only normalize user tasks: + */ +@@ -7941,7 +8315,7 @@ void normalize_rt_tasks(void) + + __task_rq_unlock(rq); + spin_unlock(&p->pi_lock); +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + + read_unlock_irqrestore(&tasklist_lock, flags); + } +diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c +index 8bb7130..e891f48 100644 +--- a/kernel/sched_debug.c ++++ b/kernel/sched_debug.c +@@ -101,12 +101,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) + + read_lock_irqsave(&tasklist_lock, flags); + +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + if (!p->se.on_rq || task_cpu(p) != rq_cpu) + continue; + + print_task(m, rq, p); +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + + read_unlock_irqrestore(&tasklist_lock, flags); + } +diff --git a/kernel/signal.c b/kernel/signal.c +index 6c0958e..fd916a1 100644 +--- a/kernel/signal.c ++++ b/kernel/signal.c +@@ -31,13 +31,32 @@ + #include + #include + #include ++#include + #include "audit.h" /* audit_signal_info() */ + + /* + * SLAB caches for signal bits. 
+ */ + +-static struct kmem_cache *sigqueue_cachep; ++struct kmem_cache *sigqueue_cachep; ++EXPORT_SYMBOL(sigqueue_cachep); ++ ++static int sig_ve_ignored(int sig, struct siginfo *info, struct task_struct *t) ++{ ++ struct ve_struct *ve; ++ ++ /* always allow signals from the kernel */ ++ if (info == SEND_SIG_FORCED || ++ (!is_si_special(info) && SI_FROMKERNEL(info))) ++ return 0; ++ ++ ve = current->ve_task_info.owner_env; ++ if (ve->ve_ns->pid_ns->child_reaper != t) ++ return 0; ++ if (ve_is_super(get_exec_env())) ++ return 0; ++ return !sig_user_defined(t, sig) || sig_kernel_only(sig); ++} + + static int __sig_ignored(struct task_struct *t, int sig) + { +@@ -101,7 +120,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) + + #define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) + +-static int recalc_sigpending_tsk(struct task_struct *t) ++int recalc_sigpending_tsk(struct task_struct *t) + { + if (t->signal->group_stop_count > 0 || + PENDING(&t->pending, &t->blocked) || +@@ -126,6 +145,7 @@ void recalc_sigpending_and_wake(struct task_struct *t) + if (recalc_sigpending_tsk(t)) + signal_wake_up(t, 0); + } ++EXPORT_SYMBOL_GPL(recalc_sigpending_tsk); + + void recalc_sigpending(void) + { +@@ -184,8 +204,13 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, + atomic_inc(&user->sigpending); + if (override_rlimit || + atomic_read(&user->sigpending) <= +- t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) ++ t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { + q = kmem_cache_alloc(sigqueue_cachep, flags); ++ if (q && ub_siginfo_charge(q, get_task_ub(t))) { ++ kmem_cache_free(sigqueue_cachep, q); ++ q = NULL; ++ } ++ } + if (unlikely(q == NULL)) { + atomic_dec(&user->sigpending); + } else { +@@ -202,6 +227,7 @@ static void __sigqueue_free(struct sigqueue *q) + return; + atomic_dec(&q->user->sigpending); + free_uid(q->user); ++ ub_siginfo_uncharge(q); + kmem_cache_free(sigqueue_cachep, q); + } + +@@ -384,7 +410,18 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) + static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, + siginfo_t *info) + { +- int sig = next_signal(pending, mask); ++ int sig = 0; ++ ++ /* SIGKILL must have priority, otherwise it is quite easy ++ * to create an unkillable process, sending sig < SIGKILL ++ * to self */ ++ if (unlikely(sigismember(&pending->signal, SIGKILL))) { ++ if (!sigismember(mask, SIGKILL)) ++ sig = SIGKILL; ++ } ++ ++ if (likely(!sig)) ++ sig = next_signal(pending, mask); + + if (sig) { + if (current->notifier) { +@@ -509,6 +546,7 @@ void signal_wake_up(struct task_struct *t, int resume) + if (!wake_up_state(t, mask)) + kick_process(t); + } ++EXPORT_SYMBOL_GPL(signal_wake_up); + + /* + * Remove signals in mask from the pending set and queue. +@@ -630,7 +668,7 @@ static int prepare_signal(int sig, struct task_struct *p) + t = p; + do { + rm_from_queue(sigmask(SIGCONT), &t->pending); +- } while_each_thread(p, t); ++ } while_each_thread_all(p, t); + } else if (sig == SIGCONT) { + unsigned int why; + /* +@@ -662,7 +700,7 @@ static int prepare_signal(int sig, struct task_struct *p) + state |= TASK_INTERRUPTIBLE; + } + wake_up_state(t, state); +- } while_each_thread(p, t); ++ } while_each_thread_all(p, t); + + /* + * Notify the parent with CLD_CONTINUED if we were stopped. 
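
The __dequeue_signal() hunk above closes a subtle hole: next_signal() prefers low signal numbers, so a process flooding itself with a low-numbered signal could keep a queued SIGKILL from ever being dequeued, leaving it unkillable. The check in miniature (bit numbering simplified; the kernel uses sigismember() on real sigsets):

	#include <stdio.h>

	#define SIGKILL 9

	/* next_signal() normally returns the lowest pending signal number,
	 * so the SIGKILL bit must be checked first. */
	static int dequeue(unsigned long pending, unsigned long blocked)
	{
		int sig;

		if ((pending & (1UL << SIGKILL)) && !(blocked & (1UL << SIGKILL)))
			return SIGKILL;
		for (sig = 1; sig < 64; sig++)
			if ((pending & (1UL << sig)) && !(blocked & (1UL << sig)))
				return sig;
		return 0;
	}

	int main(void)
	{
		/* SIGHUP (1) and SIGKILL (9) both pending: SIGKILL wins. */
		printf("%d\n", dequeue((1UL << 1) | (1UL << SIGKILL), 0));
		return 0;
	}
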
+@@ -783,7 +821,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) + do { + sigaddset(&t->pending.signal, SIGKILL); + signal_wake_up(t, 1); +- } while_each_thread(p, t); ++ } while_each_thread_all(p, t); + return; + } + } +@@ -1019,7 +1057,8 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) + if (!ret && sig) { + ret = -ESRCH; + if (lock_task_sighand(p, &flags)) { +- ret = __group_send_sig_info(sig, info, p); ++ ret = sig_ve_ignored(sig, info, p) ? 0 : ++ __group_send_sig_info(sig, info, p); + unlock_task_sighand(p, &flags); + } + } +@@ -1144,7 +1183,7 @@ static int kill_something_info(int sig, struct siginfo *info, int pid) + int retval = 0, count = 0; + struct task_struct * p; + +- for_each_process(p) { ++ for_each_process_ve(p) { + if (p->pid > 1 && !same_thread_group(p, current)) { + int err = group_send_sig_info(sig, info, p); + ++count; +@@ -1359,6 +1398,14 @@ void do_notify_parent(struct task_struct *tsk, int sig) + BUG_ON(!tsk->ptrace && + (tsk->group_leader != tsk || !thread_group_empty(tsk))); + ++#ifdef CONFIG_VE ++ /* Allow to send only SIGCHLD from VE */ ++ if (sig != SIGCHLD && ++ tsk->ve_task_info.owner_env != ++ tsk->parent->ve_task_info.owner_env) ++ sig = SIGCHLD; ++#endif ++ + info.si_signo = sig; + info.si_errno = 0; + /* +@@ -1630,7 +1677,9 @@ finish_stop(int stop_count) + } + + do { ++ set_stop_state(current); + schedule(); ++ clear_stop_state(current); + } while (try_to_freeze()); + /* + * Now we don't run again until continued. +@@ -1683,6 +1732,7 @@ static int do_signal_stop(int signr) + sig->group_stop_count = stop_count; + } + ++ clear_pn_state(current); + if (stop_count == 0) + sig->flags = SIGNAL_STOP_STOPPED; + current->exit_code = sig->group_exit_code; +@@ -1746,8 +1796,6 @@ relock: + * Now that we woke up, it's crucial if we're supposed to be + * frozen that we freeze now before running anything substantial. + */ +- try_to_freeze(); +- + spin_lock_irq(&sighand->siglock); + /* + * Every stopped thread goes here after wakeup. Check to see if +@@ -2236,7 +2284,8 @@ static int do_tkill(int tgid, int pid, int sig) + * signal is private anyway. + */ + if (!error && sig && lock_task_sighand(p, &flags)) { +- error = specific_send_sig_info(sig, &info, p); ++ if (!sig_ve_ignored(sig, &info, p)) ++ error = specific_send_sig_info(sig, &info, p); + unlock_task_sighand(p, &flags); + } + } +@@ -2592,5 +2641,5 @@ __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) + + void __init signals_init(void) + { +- sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); ++ sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC|SLAB_UBC); + } +diff --git a/kernel/softirq.c b/kernel/softirq.c +index 36e0617..a74d919 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -22,6 +22,8 @@ + #include + #include + ++#include ++ + #include + /* + - No shared variables, all the data are CPU local. 
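
sig_ve_ignored(), added above and consulted from group_send_sig_info() and do_tkill(), shields a container's init from signals it could not survive: kernel-generated signals always pass, the host (ve0) may always kill, but from inside the container only signals that init has installed a handler for, and that are not kernel-only, get through. The decision as a compilable truth table (the bool parameters abstract the kernel predicates used above):

	#include <stdbool.h>
	#include <stdio.h>

	static bool ve_init_drops_signal(bool from_kernel, bool target_is_ve_init,
					 bool sender_in_ve0, bool handler_installed,
					 bool kernel_only_sig)
	{
		if (from_kernel)		/* kernel-generated signals pass */
			return false;
		if (!target_is_ve_init)		/* only the VE's init is shielded */
			return false;
		if (sender_in_ve0)		/* the host may still kill it */
			return false;
		return !handler_installed || kernel_only_sig;
	}

	int main(void)
	{
		/* unhandled TERM from inside the VE: dropped */
		printf("%d\n", ve_init_drops_signal(false, true, false, false, false));
		/* same signal sent from ve0: delivered */
		printf("%d\n", ve_init_drops_signal(false, true, true, false, false));
		return 0;
	}
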
+@@ -209,10 +211,14 @@ EXPORT_SYMBOL(local_bh_enable_ip); + + asmlinkage void __do_softirq(void) + { ++ struct user_beancounter *ub; + struct softirq_action *h; + __u32 pending; + int max_restart = MAX_SOFTIRQ_RESTART; + int cpu; ++ struct ve_struct *envid; ++ ++ envid = set_exec_env(get_ve0()); + + pending = local_softirq_pending(); + account_system_vtime(current); +@@ -229,6 +235,7 @@ restart: + + h = softirq_vec; + ++ ub = set_exec_ub(get_ub0()); + do { + if (pending & 1) { + h->action(h); +@@ -237,6 +244,7 @@ restart: + h++; + pending >>= 1; + } while (pending); ++ (void)set_exec_ub(ub); + + local_irq_disable(); + +@@ -250,6 +258,7 @@ restart: + trace_softirq_exit(); + + account_system_vtime(current); ++ (void)set_exec_env(envid); + _local_bh_enable(); + } + +@@ -305,6 +314,7 @@ void irq_exit(void) + { + account_system_vtime(current); + trace_hardirq_exit(); ++ restore_context(); + sub_preempt_count(IRQ_EXIT_OFFSET); + if (!in_interrupt() && local_softirq_pending()) + invoke_softirq(); +diff --git a/kernel/softlockup.c b/kernel/softlockup.c +index a272d78..5332252 100644 +--- a/kernel/softlockup.c ++++ b/kernel/softlockup.c +@@ -199,12 +199,12 @@ static void check_hung_uninterruptible_tasks(int this_cpu) + return; + + read_lock(&tasklist_lock); +- do_each_thread(g, t) { ++ do_each_thread_all(g, t) { + if (!--max_count) + goto unlock; + if (t->state & TASK_UNINTERRUPTIBLE) + check_hung_task(t, now); +- } while_each_thread(g, t); ++ } while_each_thread_all(g, t); + unlock: + read_unlock(&tasklist_lock); + } +diff --git a/kernel/sys.c b/kernel/sys.c +index 14e9728..34a0c70 100644 +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -33,6 +34,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -112,6 +114,102 @@ EXPORT_SYMBOL(cad_pid); + + void (*pm_power_off_prepare)(void); + ++DECLARE_MUTEX(virtinfo_sem); ++EXPORT_SYMBOL(virtinfo_sem); ++static struct vnotifier_block *virtinfo_chain[VIRT_TYPES]; ++ ++void __virtinfo_notifier_register(int type, struct vnotifier_block *nb) ++{ ++ struct vnotifier_block **p; ++ ++ for (p = &virtinfo_chain[type]; ++ *p != NULL && nb->priority < (*p)->priority; ++ p = &(*p)->next); ++ nb->next = *p; ++ smp_wmb(); ++ *p = nb; ++} ++ ++EXPORT_SYMBOL(__virtinfo_notifier_register); ++ ++void virtinfo_notifier_register(int type, struct vnotifier_block *nb) ++{ ++ down(&virtinfo_sem); ++ __virtinfo_notifier_register(type, nb); ++ up(&virtinfo_sem); ++} ++ ++EXPORT_SYMBOL(virtinfo_notifier_register); ++ ++struct virtinfo_cnt_struct { ++ volatile unsigned long exit[NR_CPUS]; ++ volatile unsigned long entry; ++}; ++static DEFINE_PER_CPU(struct virtinfo_cnt_struct, virtcnt); ++ ++void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb) ++{ ++ struct vnotifier_block **p; ++ int entry_cpu, exit_cpu; ++ unsigned long cnt, ent; ++ ++ down(&virtinfo_sem); ++ for (p = &virtinfo_chain[type]; *p != nb; p = &(*p)->next); ++ *p = nb->next; ++ smp_mb(); ++ ++ for_each_cpu_mask(entry_cpu, cpu_possible_map) { ++ while (1) { ++ cnt = 0; ++ for_each_cpu_mask(exit_cpu, cpu_possible_map) ++ cnt += ++ per_cpu(virtcnt, entry_cpu).exit[exit_cpu]; ++ smp_rmb(); ++ ent = per_cpu(virtcnt, entry_cpu).entry; ++ if (cnt == ent) ++ break; ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_timeout(HZ / 100); ++ } ++ } ++ up(&virtinfo_sem); ++} ++ ++EXPORT_SYMBOL(virtinfo_notifier_unregister); ++ ++int virtinfo_notifier_call(int type, unsigned long n, void *data) 
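
__virtinfo_notifier_register() above splices the new block in front of the first element with a lower priority, keeping the chain sorted descending so high-priority consumers run first; readers traverse the chain with no lock, which is why unregistration must spin until the per-cpu entry/exit counters agree. The ordering logic in miniature:

	#include <stdio.h>
	#include <stddef.h>

	struct vnb {
		struct vnb *next;
		int priority;
		const char *name;
	};

	static struct vnb *chain;

	/* Walk until an element with lower priority is found and link the
	 * new block in front of it. */
	static void register_vnb(struct vnb *nb)
	{
		struct vnb **p;

		for (p = &chain; *p != NULL && nb->priority < (*p)->priority;
		     p = &(*p)->next)
			;
		nb->next = *p;
		*p = nb;	/* the kernel version issues smp_wmb() first */
	}

	int main(void)
	{
		struct vnb a = { NULL, 10, "a" };
		struct vnb b = { NULL, 30, "b" };
		struct vnb c = { NULL, 20, "c" };
		struct vnb *v;

		register_vnb(&a);
		register_vnb(&b);
		register_vnb(&c);
		for (v = chain; v; v = v->next)
			printf("%s(%d) ", v->name, v->priority);
		printf("\n");	/* b(30) c(20) a(10) */
		return 0;
	}
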
++{ ++ int ret; ++ int entry_cpu, exit_cpu; ++ struct vnotifier_block *nb; ++ ++ entry_cpu = get_cpu(); ++ per_cpu(virtcnt, entry_cpu).entry++; ++ smp_wmb(); ++ put_cpu(); ++ ++ nb = virtinfo_chain[type]; ++ ret = NOTIFY_DONE; ++ while (nb) ++ { ++ ret = nb->notifier_call(nb, n, data, ret); ++ if(ret & NOTIFY_STOP_MASK) { ++ ret &= ~NOTIFY_STOP_MASK; ++ break; ++ } ++ nb = nb->next; ++ } ++ ++ exit_cpu = get_cpu(); ++ smp_wmb(); ++ per_cpu(virtcnt, entry_cpu).exit[exit_cpu]++; ++ put_cpu(); ++ ++ return ret; ++} ++ ++EXPORT_SYMBOL(virtinfo_notifier_call); ++ + static int set_one_prio(struct task_struct *p, int niceval, int error) + { + int no_nice; +@@ -181,10 +279,10 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) + if ((who != current->uid) && !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ + +- do_each_thread(g, p) ++ do_each_thread_ve(g, p) + if (p->uid == who) + error = set_one_prio(p, niceval, error); +- while_each_thread(g, p); ++ while_each_thread_ve(g, p); + if (who != current->uid) + free_uid(user); /* For find_user() */ + break; +@@ -243,13 +341,13 @@ asmlinkage long sys_getpriority(int which, int who) + if ((who != current->uid) && !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ + +- do_each_thread(g, p) ++ do_each_thread_ve(g, p) + if (p->uid == who) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } +- while_each_thread(g, p); ++ while_each_thread_ve(g, p); + if (who != current->uid) + free_uid(user); /* for find_user() */ + break; +@@ -383,6 +481,25 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user + magic2 != LINUX_REBOOT_MAGIC2C)) + return -EINVAL; + ++#ifdef CONFIG_VE ++ if (!ve_is_super(get_exec_env())) ++ switch (cmd) { ++ case LINUX_REBOOT_CMD_RESTART: ++ case LINUX_REBOOT_CMD_HALT: ++ case LINUX_REBOOT_CMD_POWER_OFF: ++ case LINUX_REBOOT_CMD_RESTART2: ++ force_sig(SIGKILL, ++ get_exec_env()->ve_ns->pid_ns->child_reaper); ++ ++ case LINUX_REBOOT_CMD_CAD_ON: ++ case LINUX_REBOOT_CMD_CAD_OFF: ++ return 0; ++ ++ default: ++ return -EINVAL; ++ } ++#endif ++ + /* Instead of trying to make the power_off code look like + * halt when pm_power_off is not set do it the easy way. + */ +@@ -564,7 +681,7 @@ asmlinkage long sys_setgid(gid_t gid) + return 0; + } + +-static int set_user(uid_t new_ruid, int dumpclear) ++int set_user(uid_t new_ruid, int dumpclear) + { + struct user_struct *new_user; + +@@ -868,8 +985,27 @@ asmlinkage long sys_setfsgid(gid_t gid) + return old_fsgid; + } + ++#ifdef CONFIG_VE ++unsigned long long ve_relative_clock(struct timespec * ts) ++{ ++ unsigned long long offset = 0; ++ ++ if (ts->tv_sec > get_exec_env()->start_timespec.tv_sec || ++ (ts->tv_sec == get_exec_env()->start_timespec.tv_sec && ++ ts->tv_nsec >= get_exec_env()->start_timespec.tv_nsec)) ++ offset = (unsigned long long)(ts->tv_sec - ++ get_exec_env()->start_timespec.tv_sec) * NSEC_PER_SEC ++ + ts->tv_nsec - get_exec_env()->start_timespec.tv_nsec; ++ return nsec_to_clock_t(offset); ++} ++#endif ++ + asmlinkage long sys_times(struct tms __user * tbuf) + { ++#ifdef CONFIG_VE ++ struct timespec now; ++#endif ++ + /* + * In the SMP world we might just be unlucky and have one of + * the times increment as we use it. 
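
ve_relative_clock() above turns an absolute monotonic timestamp into ticks since the container started, clamped at zero for timestamps that predate the start. A compilable miniature (the struct and constants are inlined here; the real code calls nsec_to_clock_t()):

	#include <stdio.h>

	#define NSEC_PER_SEC 1000000000ULL
	#define USER_HZ 100

	struct timespec_s { long long tv_sec, tv_nsec; };

	static unsigned long long ve_relative_clock(struct timespec_s now,
						    struct timespec_s start)
	{
		unsigned long long ns = 0;

		if (now.tv_sec > start.tv_sec ||
		    (now.tv_sec == start.tv_sec && now.tv_nsec >= start.tv_nsec))
			ns = (unsigned long long)(now.tv_sec - start.tv_sec)
				* NSEC_PER_SEC + now.tv_nsec - start.tv_nsec;
		return ns / (NSEC_PER_SEC / USER_HZ);	/* nsec_to_clock_t() */
	}

	int main(void)
	{
		struct timespec_s start = { 100, 500000000 };
		struct timespec_s now   = { 163, 0 };
		struct timespec_s early = {  99, 0 };

		printf("%llu ticks\n", ve_relative_clock(now, start));	/* 6250 */
		printf("%llu ticks\n", ve_relative_clock(early, start));/* 0 */
		return 0;
	}
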
Since the value is an +@@ -903,7 +1039,13 @@ asmlinkage long sys_times(struct tms __user * tbuf) + if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) + return -EFAULT; + } ++#ifndef CONFIG_VE + return (long) jiffies_64_to_clock_t(get_jiffies_64()); ++#else ++ /* Compare to calculation in fs/proc/array.c */ ++ do_posix_clock_monotonic_gettime(&now); ++ return ve_relative_clock(&now); ++#endif + } + + /* +@@ -1077,6 +1219,7 @@ asmlinkage long sys_setsid(void) + + spin_lock(&group_leader->sighand->siglock); + group_leader->signal->tty = NULL; ++ group_leader->signal->tty_old_pgrp = 0; + spin_unlock(&group_leader->sighand->siglock); + + err = session; +@@ -1361,7 +1504,7 @@ asmlinkage long sys_sethostname(char __user *name, int len) + int errno; + char tmp[__NEW_UTS_LEN]; + +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; +@@ -1406,7 +1549,7 @@ asmlinkage long sys_setdomainname(char __user *name, int len) + int errno; + char tmp[__NEW_UTS_LEN]; + +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index 5b9b467..7717f4d 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -161,3 +161,15 @@ cond_syscall(sys_timerfd_gettime); + cond_syscall(compat_sys_timerfd_settime); + cond_syscall(compat_sys_timerfd_gettime); + cond_syscall(sys_eventfd); ++cond_syscall(sys_getluid); ++cond_syscall(sys_setluid); ++cond_syscall(sys_setublimit); ++cond_syscall(sys_ubstat); ++ ++/* fairsched compat */ ++cond_syscall(sys_fairsched_mknod); ++cond_syscall(sys_fairsched_rmnod); ++cond_syscall(sys_fairsched_mvpr); ++cond_syscall(sys_fairsched_vcpus); ++cond_syscall(sys_fairsched_chwt); ++cond_syscall(sys_fairsched_rate); +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 2911665..3692411 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -80,6 +80,7 @@ extern int percpu_pagelist_fraction; + extern int compat_log; + extern int maps_protect; + extern int sysctl_stat_interval; ++extern int ve_area_access_check; /* fs/namei.c */ + extern int latencytop_enabled; + extern int sysctl_nr_open_min, sysctl_nr_open_max; + +@@ -106,6 +107,13 @@ static int min_percpu_pagelist_fract = 8; + + static int ngroups_max = NGROUPS_MAX; + ++int ve_allow_kthreads = 1; ++EXPORT_SYMBOL(ve_allow_kthreads); ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++extern int sysrq_key_scancode; ++#endif ++ + #ifdef CONFIG_KMOD + extern char modprobe_path[]; + #endif +@@ -119,6 +127,8 @@ extern int stop_a_enabled; + extern int scons_pwroff; + #endif + ++extern int alloc_fail_warn; ++ + #ifdef __hppa__ + extern int pwrsw_enabled; + extern int unaligned_enabled; +@@ -133,6 +143,7 @@ extern int spin_retry; + #endif + + extern int sysctl_hz_timer; ++int decode_call_traces = 1; + + #ifdef CONFIG_BSD_PROCESS_ACCT + extern int acct_parm[]; +@@ -141,6 +152,10 @@ extern int acct_parm[]; + #ifdef CONFIG_IA64 + extern int no_unaligned_warning; + #endif ++#ifdef CONFIG_VE ++int glob_ve_meminfo = 0; ++EXPORT_SYMBOL(glob_ve_meminfo); ++#endif + + #ifdef CONFIG_RT_MUTEXES + extern int max_lock_depth; +@@ -160,9 +175,59 @@ static struct ctl_table_header root_table_header = { + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list), + .root = &sysctl_table_root, + }; ++ ++#ifdef CONFIG_VE ++static LIST_HEAD(empty_list); ++ ++static struct list_head *sysctl_default_lookup(struct ctl_table_root *r, ++ struct nsproxy *namespaces) ++{ ++ if 
(ve_is_super(get_exec_env())) ++ return &r->header_list; ++ ++ BUG_ON(!list_empty(&empty_list)); ++ return &empty_list; ++} ++ ++/* ++ * default root: ++ * all new tables go to one by default ++ * visible rw in ve0 only ++ */ ++static struct ctl_table_root sysctl_default_root = { ++ .header_list = LIST_HEAD_INIT(sysctl_default_root.header_list), ++ .lookup = sysctl_default_lookup, ++}; ++ ++/* ++ * virtual root: ++ * visible rw everywhere (glob 1) ++ */ ++static struct ctl_table_root sysctl_virt_root = { ++ .header_list = LIST_HEAD_INIT(sysctl_virt_root.header_list), ++}; ++ ++static int sysctl_root_perms(struct ctl_table_root *root, ++ struct nsproxy *namespaces, struct ctl_table *table) ++{ ++ if (ve_is_super(get_exec_env())) ++ return table->mode; ++ else ++ return table->mode & ~0222; ++} ++#else ++#define sysctl_default_root sysctl_table_root ++#define sysctl_root_perms NULL ++#endif ++ ++/* ++ * classical root: ++ * visible ro in ve and rw in ve0 (glob 0 && root_table_header) ++ */ + static struct ctl_table_root sysctl_table_root = { + .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), + .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry), ++ .permissions = sysctl_root_perms, + }; + + static struct ctl_table kern_table[]; +@@ -429,6 +494,20 @@ static struct ctl_table kern_table[] = { + .proc_handler = &proc_dointvec, + }, + #endif ++ { ++ .procname = "silence-level", ++ .data = &console_silence_loglevel, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++ { ++ .procname = "alloc_fail_warn", ++ .data = &alloc_fail_warn, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, + #ifdef __hppa__ + { + .ctl_name = KERN_HPPA_PWRSW, +@@ -593,6 +672,24 @@ static struct ctl_table kern_table[] = { + .extra1 = &pid_max_min, + .extra2 = &pid_max_max, + }, ++#ifdef CONFIG_VE ++ { ++ .procname = "ve_meminfo", ++ .data = &glob_ve_meminfo, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif ++#ifdef CONFIG_MAGIC_SYSRQ ++ { ++ .procname = "sysrq-key", ++ .data = &sysrq_key_scancode, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + { + .ctl_name = KERN_PANIC_ON_OOPS, + .procname = "panic_on_oops", +@@ -1140,6 +1237,21 @@ static struct ctl_table vm_table[] = { + .extra2 = &one, + }, + #endif ++ { ++ .procname = "vsyscall", ++ .data = &sysctl_at_vsyscall, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = CTL_UNNUMBERED, ++ .procname = "odirect_enable", ++ .data = &odirect_enable, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, + /* + * NOTE: do not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt +@@ -1302,6 +1414,13 @@ static struct ctl_table fs_table[] = { + }; + + static struct ctl_table debug_table[] = { ++ { ++ .procname = "decode_call_traces", ++ .data = &decode_call_traces, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, + #if defined(CONFIG_X86) || defined(CONFIG_PPC) + { + .ctl_name = CTL_UNNUMBERED, +@@ -1598,6 +1717,10 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) + + static __init int sysctl_init(void) + { ++#ifdef CONFIG_VE ++ register_sysctl_root(&sysctl_default_root); ++ register_sysctl_root(&sysctl_virt_root); ++#endif + sysctl_set_parent(NULL, root_table); + #ifdef CONFIG_SYSCTL_SYSCALL_CHECK + { +@@ -1758,10 +1881,18 @@ 
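
Together the three roots above give every sysctl table one of three visibility classes: the classical root stays shared, but sysctl_root_perms() strips the write bits outside ve0; the default root's lookup hands non-ve0 callers an empty header list, so its entries do not exist there at all; the virt root has no callbacks and is read-write everywhere. The permission masking in miniature:

	#include <stdio.h>
	#include <stdbool.h>

	/* sysctl_root_perms() above: outside ve0 a 0644 table degrades to 0444. */
	static int root_perms(bool in_ve0, int mode)
	{
		return in_ve0 ? mode : mode & ~0222;
	}

	int main(void)
	{
		printf("ve0: %04o  ve: %04o\n",
		       root_perms(true, 0644), root_perms(false, 0644));
		return 0;
	}
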
struct ctl_table_header *__register_sysctl_paths( + struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table) + { +- return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, ++ return __register_sysctl_paths(&sysctl_default_root, current->nsproxy, + path, table); + } + ++struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *path, ++ struct ctl_table *table, int virtual_handler) ++{ ++ return __register_sysctl_paths(virtual_handler ? ++ &sysctl_virt_root : &sysctl_table_root, ++ current->nsproxy, path, table); ++} ++ + /** + * register_sysctl_table - register a sysctl table hierarchy + * @table: the top-level table structure +@@ -1778,6 +1909,14 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table) + return register_sysctl_paths(null_path, table); + } + ++struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *table, ++ int virtual_handler) ++{ ++ static const struct ctl_path null_path[] = { {} }; ++ ++ return register_sysctl_glob_paths(null_path, table, virtual_handler); ++} ++ + /** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table +@@ -1810,6 +1949,18 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + return NULL; + } + ++struct ctl_table_header *register_sysctl_glob_table(struct ctl_table *table, ++ int vh) ++{ ++ return NULL; ++} ++ ++struct ctl_table_header *register_sysctl_glob_paths(const struct ctl_path *path, ++ struct ctl_table *table, int vh) ++{ ++ return NULL; ++} ++ + void unregister_sysctl_table(struct ctl_table_header * table) + { + } +@@ -2829,6 +2980,57 @@ static int deprecated_sysctl_warning(struct __sysctl_args *args) + return 0; + } + ++#ifdef CONFIG_PID_NS ++#include ++ ++static int proc_pid_ns_hide_child(struct ctl_table *table, int write, ++ struct file *filp, void __user *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ int tmp, res; ++ ++ tmp = (current->nsproxy->pid_ns->flags & PID_NS_HIDE_CHILD) ? 
1 : 0; ++ ++ res = __do_proc_dointvec(&tmp, table, write, filp, buffer, ++ lenp, ppos, NULL, NULL); ++ if (res || !write) ++ return res; ++ ++ if (tmp) ++ current->nsproxy->pid_ns->flags |= PID_NS_HIDE_CHILD; ++ else ++ current->nsproxy->pid_ns->flags &= ~PID_NS_HIDE_CHILD; ++ return 0; ++} ++ ++static struct ctl_table pid_ns_kern_table[] = { ++ { ++ .procname = "pid_ns_hide_child", ++ .maxlen = sizeof(int), ++ .mode = 0600, ++ .proc_handler = proc_pid_ns_hide_child, ++ }, ++ {} ++}; ++ ++static struct ctl_table pid_ns_root_table[] = { ++ { ++ .ctl_name = CTL_KERN, ++ .procname = "kernel", ++ .mode = 0555, ++ .child = pid_ns_kern_table, ++ }, ++ {} ++}; ++ ++static __init int pid_ns_sysctl_init(void) ++{ ++ register_sysctl_table(pid_ns_root_table); ++ return 0; ++} ++postcore_initcall(pid_ns_sysctl_init); ++#endif /* CONFIG_PID_NS */ ++ + /* + * No sense putting this after each symbol definition, twice, + * exception granted :-) +@@ -2842,7 +3044,9 @@ EXPORT_SYMBOL(proc_dostring); + EXPORT_SYMBOL(proc_doulongvec_minmax); + EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); + EXPORT_SYMBOL(register_sysctl_table); ++EXPORT_SYMBOL(register_sysctl_glob_table); + EXPORT_SYMBOL(register_sysctl_paths); ++EXPORT_SYMBOL(register_sysctl_glob_paths); + EXPORT_SYMBOL(sysctl_intvec); + EXPORT_SYMBOL(sysctl_jiffies); + EXPORT_SYMBOL(sysctl_ms_jiffies); +diff --git a/kernel/taskstats.c b/kernel/taskstats.c +index 4a23517..590d37c 100644 +--- a/kernel/taskstats.c ++++ b/kernel/taskstats.c +@@ -254,7 +254,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, + + stats->nvcsw += tsk->nvcsw; + stats->nivcsw += tsk->nivcsw; +- } while_each_thread(first, tsk); ++ } while_each_thread_all(first, tsk); + + unlock_task_sighand(first, &flags); + rc = 0; +diff --git a/kernel/time.c b/kernel/time.c +index 6a08660..c986346 100644 +--- a/kernel/time.c ++++ b/kernel/time.c +@@ -601,10 +601,12 @@ EXPORT_SYMBOL(jiffies_to_clock_t); + unsigned long clock_t_to_jiffies(unsigned long x) + { + #if (HZ % USER_HZ)==0 ++ WARN_ON((long)x < 0); + if (x >= ~0UL / (HZ / USER_HZ)) + return ~0UL; + return x * (HZ / USER_HZ); + #else ++ WARN_ON((long)x < 0); + /* Don't worry about loss of precision here .. */ + if (x >= ~0UL / HZ * USER_HZ) + return ~0UL; +@@ -617,6 +619,7 @@ EXPORT_SYMBOL(clock_t_to_jiffies); + + u64 jiffies_64_to_clock_t(u64 x) + { ++ WARN_ON((s64)x < 0); + #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 + # if HZ < USER_HZ + x = div_u64(x * USER_HZ, HZ); +@@ -639,6 +642,7 @@ EXPORT_SYMBOL(jiffies_64_to_clock_t); + + u64 nsec_to_clock_t(u64 x) + { ++ WARN_ON((s64)x < 0); + #if (NSEC_PER_SEC % USER_HZ) == 0 + return div_u64(x, NSEC_PER_SEC / USER_HZ); + #elif (USER_HZ % 512) == 0 +diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c +index e91c29f..3db0c59 100644 +--- a/kernel/time/timekeeping.c ++++ b/kernel/time/timekeeping.c +@@ -43,6 +43,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); + * used instead. 
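
proc_pid_ns_hide_child() above is the standard trick for exposing one flag bit through the generic integer parser: copy the bit into a plain int, let __do_proc_dointvec() read or update it, and fold the result back into the flags word on write. A user-space analogue, with sscanf/snprintf standing in for the proc parser:

	#include <stdio.h>

	#define FLAG_HIDE_CHILD 0x1

	static unsigned flags;

	static int handle(int write, const char *buf, char *out, size_t outlen)
	{
		int tmp = (flags & FLAG_HIDE_CHILD) ? 1 : 0;

		if (!write)
			return snprintf(out, outlen, "%d\n", tmp);
		if (sscanf(buf, "%d", &tmp) != 1)
			return -1;
		if (tmp)
			flags |= FLAG_HIDE_CHILD;
		else
			flags &= ~FLAG_HIDE_CHILD;
		return 0;
	}

	int main(void)
	{
		char out[16];

		handle(1, "1", NULL, 0);		/* echo 1 > ... */
		handle(0, NULL, out, sizeof(out));	/* cat ... */
		fputs(out, stdout);			/* prints 1 */
		return 0;
	}
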
+ */ + struct timespec xtime __attribute__ ((aligned (16))); ++EXPORT_SYMBOL_GPL(xtime); + struct timespec wall_to_monotonic __attribute__ ((aligned (16))); + static unsigned long total_sleep_time; /* seconds */ + +diff --git a/kernel/timer.c b/kernel/timer.c +index ceacc66..83d6963 100644 +--- a/kernel/timer.c ++++ b/kernel/timer.c +@@ -37,6 +37,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -795,7 +797,11 @@ static inline void __run_timers(struct tvec_base *base) + spin_unlock_irq(&base->lock); + { + int preempt_count = preempt_count(); ++ struct ve_struct *ve; ++ ++ ve = set_exec_env(get_ve0()); + fn(data); ++ (void)set_exec_env(ve); + if (preempt_count != preempt_count()) { + printk(KERN_ERR "huh, entered %p " + "with preempt_count %08x, exited" +@@ -1014,6 +1020,37 @@ EXPORT_SYMBOL(avenrun); + * calc_load - given tick count, update the avenrun load estimates. + * This is called while holding a write_lock on xtime_lock. + */ ++ ++ ++#ifdef CONFIG_VE ++static void calc_load_ve(void) ++{ ++ unsigned long flags, nr_unint, nr_active; ++ struct ve_struct *ve; ++ ++ read_lock(&ve_list_lock); ++ for_each_ve(ve) { ++ nr_active = nr_running_ve(ve) + nr_uninterruptible_ve(ve); ++ nr_active *= FIXED_1; ++ ++ CALC_LOAD(ve->avenrun[0], EXP_1, nr_active); ++ CALC_LOAD(ve->avenrun[1], EXP_5, nr_active); ++ CALC_LOAD(ve->avenrun[2], EXP_15, nr_active); ++ } ++ read_unlock(&ve_list_lock); ++ ++ nr_unint = nr_uninterruptible() * FIXED_1; ++ spin_lock_irqsave(&kstat_glb_lock, flags); ++ CALC_LOAD(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint); ++ CALC_LOAD(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint); ++ CALC_LOAD(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint); ++ spin_unlock_irqrestore(&kstat_glb_lock, flags); ++ ++} ++#else ++#define calc_load_ve() do { } while (0) ++#endif ++ + static inline void calc_load(unsigned long ticks) + { + unsigned long active_tasks; /* fixed-point */ +@@ -1026,6 +1063,7 @@ static inline void calc_load(unsigned long ticks) + CALC_LOAD(avenrun[0], EXP_1, active_tasks); + CALC_LOAD(avenrun[1], EXP_5, active_tasks); + CALC_LOAD(avenrun[2], EXP_15, active_tasks); ++ calc_load_ve(); + count += LOAD_FREQ; + } while (count < 0); + } +@@ -1275,11 +1313,12 @@ int do_sysinfo(struct sysinfo *info) + unsigned long mem_total, sav_total; + unsigned int mem_unit, bitcount; + unsigned long seq; ++ unsigned long *__avenrun; ++ struct timespec tp; + + memset(info, 0, sizeof(struct sysinfo)); + + do { +- struct timespec tp; + seq = read_seqbegin(&xtime_lock); + + /* +@@ -1297,18 +1336,34 @@ int do_sysinfo(struct sysinfo *info) + tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; + tp.tv_sec++; + } +- info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); +- +- info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); +- info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); +- info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); ++ } while (read_seqretry(&xtime_lock, seq)); + ++ if (ve_is_super(get_exec_env())) { ++ info->uptime = tp.tv_sec + (tp.tv_nsec ? 
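
calc_load_ve() above maintains per-container load averages with the same fixed-point exponential decay the global avenrun[] uses. The arithmetic in miniature, with the kernel's actual constants (FIXED_1 is 1.0 in 11-bit fixed point, EXP_1 is exp(-5s/1min) in the same representation):

	#include <stdio.h>

	#define FSHIFT	11
	#define FIXED_1	(1 << FSHIFT)
	#define EXP_1	1884	/* 1/exp(5sec/1min) */

	#define CALC_LOAD(load, exp, n) \
		load *= exp; \
		load += n * (FIXED_1 - exp); \
		load >>= FSHIFT;

	int main(void)
	{
		unsigned long avenrun = 0;
		int i;

		/* a container with two runnable tasks, sampled every 5s
		 * for one minute */
		for (i = 0; i < 12; i++) {
			CALC_LOAD(avenrun, EXP_1, 2UL * FIXED_1);
		}

		/* prints load1 ~ 1.26, converging toward 2.00 */
		printf("load1 ~ %lu.%02lu\n", avenrun >> FSHIFT,
		       ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
		return 0;
	}
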
1 : 0); ++ __avenrun = &avenrun[0]; + info->procs = nr_threads; +- } while (read_seqretry(&xtime_lock, seq)); ++ } ++#ifdef CONFIG_VE ++ else { ++ struct ve_struct *ve; ++ ve = get_exec_env(); ++ __avenrun = &ve->avenrun[0]; ++ info->procs = atomic_read(&ve->pcounter); ++ info->uptime = tp.tv_sec - ve->start_timespec.tv_sec; ++ } ++#endif ++ info->loads[0] = __avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); ++ info->loads[1] = __avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); ++ info->loads[2] = __avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + + si_meminfo(info); + si_swapinfo(info); + ++#ifdef CONFIG_BEANCOUNTERS ++ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_SYSINFO, info) ++ & NOTIFY_FAIL) ++ return -ENOMSG; ++#endif + /* + * If the sum of all the available memory (i.e. ram + swap) + * is less than can be stored in a 32 bit unsigned long then +diff --git a/kernel/user.c b/kernel/user.c +index 865ecf5..b1139b3 100644 +--- a/kernel/user.c ++++ b/kernel/user.c +@@ -314,6 +314,7 @@ static void remove_user_sysfs_dir(struct work_struct *w) + done: + uids_mutex_unlock(); + } ++EXPORT_SYMBOL_GPL(free_uid); + + /* IRQs are disabled and uidhash_lock is held upon function entry. + * IRQ state (as stored in flags) is restored and uidhash_lock released +@@ -383,6 +384,7 @@ void free_uid(struct user_struct *up) + else + local_irq_restore(flags); + } ++EXPORT_SYMBOL_GPL(free_uid); + + struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) + { +@@ -447,6 +449,7 @@ out_unlock: + uids_mutex_unlock(); + return NULL; + } ++EXPORT_SYMBOL_GPL(alloc_uid); + + void switch_uid(struct user_struct *new_user) + { +@@ -477,6 +480,7 @@ void switch_uid(struct user_struct *new_user) + free_uid(old_user); + suid_keys(current); + } ++EXPORT_SYMBOL_GPL(switch_uid); + + #ifdef CONFIG_USER_NS + void release_uids(struct user_namespace *ns) +@@ -510,7 +514,7 @@ static int __init uid_cache_init(void) + int n; + + uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), +- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); + + for(n = 0; n < UIDHASH_SZ; ++n) + INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); +diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c +index fe3a56c..22d14c2 100644 +--- a/kernel/utsname_sysctl.c ++++ b/kernel/utsname_sysctl.c +@@ -27,6 +27,10 @@ static void *get_uts(ctl_table *table, int write) + down_read(&uts_sem); + else + down_write(&uts_sem); ++ ++ if (strcmp(table->procname, "virt_osrelease") == 0) ++ return virt_utsname.release; ++ + return which; + } + +@@ -128,19 +132,27 @@ static struct ctl_table uts_kern_table[] = { + {} + }; + +-static struct ctl_table uts_root_table[] = { ++static struct ctl_table uts_virt_osrelease_table[] = { + { +- .ctl_name = CTL_KERN, +- .procname = "kernel", +- .mode = 0555, +- .child = uts_kern_table, ++ .procname = "virt_osrelease", ++ .data = virt_utsname.release, ++ .maxlen = sizeof(virt_utsname.release), ++ .mode = 0644, ++ .proc_handler = &proc_do_uts_string, ++ .strategy = sysctl_uts_string, + }, + {} + }; + ++static struct ctl_path uts_path[] = { ++ { .ctl_name = CTL_KERN, .procname = "kernel", }, ++ { } ++}; ++ + static int __init utsname_sysctl_init(void) + { +- register_sysctl_table(uts_root_table); ++ register_sysctl_glob_paths(uts_path, uts_kern_table, 1); ++ register_sysctl_paths(uts_path, uts_virt_osrelease_table); + return 0; + } + +diff --git a/kernel/ve/Makefile b/kernel/ve/Makefile +new file mode 100644 +index 0000000..9d60161 +--- /dev/null ++++ b/kernel/ve/Makefile +@@ -0,0 
+1,16 @@ ++# ++# ++# kernel/ve/Makefile ++# ++# Copyright (C) 2000-2005 SWsoft ++# All rights reserved. ++# ++# Licensing governed by "linux/COPYING.SWsoft" file. ++ ++obj-$(CONFIG_VE) = ve.o veowner.o hooks.o ++obj-$(CONFIG_VZ_WDOG) += vzwdog.o ++obj-$(CONFIG_VE_CALLS) += vzmon.o ++ ++vzmon-objs = vecalls.o ++ ++obj-$(CONFIG_VZ_DEV) += vzdev.o +diff --git a/kernel/ve/hooks.c b/kernel/ve/hooks.c +new file mode 100644 +index 0000000..1b82c35 +--- /dev/null ++++ b/kernel/ve/hooks.c +@@ -0,0 +1,114 @@ ++/* ++ * linux/kernel/ve/hooks.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct list_head ve_hooks[VE_MAX_CHAINS]; ++static DECLARE_RWSEM(ve_hook_sem); ++ ++void ve_hook_register(int chain, struct ve_hook *vh) ++{ ++ struct list_head *lh; ++ struct ve_hook *tmp; ++ ++ BUG_ON(chain > VE_MAX_CHAINS); ++ ++ down_write(&ve_hook_sem); ++ list_for_each(lh, &ve_hooks[chain]) { ++ tmp = list_entry(lh, struct ve_hook, list); ++ if (vh->priority < tmp->priority) ++ break; ++ } ++ ++ list_add_tail(&vh->list, lh); ++ up_write(&ve_hook_sem); ++} ++ ++EXPORT_SYMBOL(ve_hook_register); ++ ++void ve_hook_unregister(struct ve_hook *vh) ++{ ++ down_write(&ve_hook_sem); ++ list_del(&vh->list); ++ up_write(&ve_hook_sem); ++} ++ ++EXPORT_SYMBOL(ve_hook_unregister); ++ ++static inline int ve_hook_init(struct ve_hook *vh, struct ve_struct *ve) ++{ ++ int err; ++ ++ err = 0; ++ if (try_module_get(vh->owner)) { ++ err = vh->init(ve); ++ module_put(vh->owner); ++ } ++ return err; ++} ++ ++static inline void ve_hook_fini(struct ve_hook *vh, struct ve_struct *ve) ++{ ++ if (vh->fini != NULL && try_module_get(vh->owner)) { ++ vh->fini(ve); ++ module_put(vh->owner); ++ } ++} ++ ++int ve_hook_iterate_init(int chain, void *ve) ++{ ++ struct ve_hook *vh; ++ int err; ++ ++ err = 0; ++ ++ down_read(&ve_hook_sem); ++ list_for_each_entry(vh, &ve_hooks[chain], list) ++ if ((err = ve_hook_init(vh, ve)) < 0) ++ break; ++ ++ if (err) ++ list_for_each_entry_continue_reverse(vh, &ve_hooks[chain], list) ++ ve_hook_fini(vh, ve); ++ ++ up_read(&ve_hook_sem); ++ return err; ++} ++ ++EXPORT_SYMBOL(ve_hook_iterate_init); ++ ++void ve_hook_iterate_fini(int chain, void *ve) ++{ ++ struct ve_hook *vh; ++ ++ down_read(&ve_hook_sem); ++ list_for_each_entry_reverse(vh, &ve_hooks[chain], list) ++ ve_hook_fini(vh, ve); ++ up_read(&ve_hook_sem); ++} ++ ++EXPORT_SYMBOL(ve_hook_iterate_fini); ++ ++static int __init ve_hooks_init(void) ++{ ++ int i; ++ ++ for (i = 0; i < VE_MAX_CHAINS; i++) ++ INIT_LIST_HEAD(&ve_hooks[i]); ++ return 0; ++} ++ ++core_initcall(ve_hooks_init); ++ +diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c +new file mode 100644 +index 0000000..d4ba7b3 +--- /dev/null ++++ b/kernel/ve/ve.c +@@ -0,0 +1,150 @@ ++/* ++ * linux/kernel/ve/ve.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
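
ve_hook_iterate_init() above has the classic partial-failure contract: hooks run in priority order, and if one init fails, fini is invoked on exactly the hooks that already succeeded, in reverse order (that is what list_for_each_entry_continue_reverse() buys). The contract in miniature:

	#include <stdio.h>

	#define NHOOKS 3

	static int init_hook(int i)
	{
		printf("init %d\n", i);
		return i == 2 ? -1 : 0;	/* simulate failure on the last hook */
	}

	static void fini_hook(int i)
	{
		printf("fini %d\n", i);
	}

	static int iterate_init(void)
	{
		int i, err = 0;

		for (i = 0; i < NHOOKS; i++)
			if ((err = init_hook(i)) < 0)
				break;
		if (err)
			while (--i >= 0)	/* unwind only what succeeded */
				fini_hook(i);
		return err;
	}

	int main(void)
	{
		/* init 0, init 1, init 2, fini 1, fini 0 */
		return iterate_init() ? 1 : 0;
	}
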
++ * ++ */ ++ ++/* ++ * 've.c' helper file performing VE sub-system initialization ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++unsigned long vz_rstamp = 0x37e0f59d; ++ ++#ifdef CONFIG_MODULES ++struct module no_module = { .state = MODULE_STATE_GOING }; ++EXPORT_SYMBOL(no_module); ++#endif ++ ++INIT_KSYM_MODULE(ip_tables); ++INIT_KSYM_MODULE(ip6_tables); ++INIT_KSYM_MODULE(iptable_filter); ++INIT_KSYM_MODULE(ip6table_filter); ++INIT_KSYM_MODULE(iptable_mangle); ++INIT_KSYM_MODULE(ip6table_mangle); ++INIT_KSYM_MODULE(ip_conntrack); ++INIT_KSYM_MODULE(nf_conntrack); ++INIT_KSYM_MODULE(nf_conntrack_ipv4); ++INIT_KSYM_MODULE(nf_conntrack_ipv6); ++INIT_KSYM_MODULE(ip_nat); ++INIT_KSYM_MODULE(nf_nat); ++INIT_KSYM_MODULE(iptable_nat); ++ ++INIT_KSYM_CALL(int, init_iptable_conntrack, (void)); ++INIT_KSYM_CALL(int, nf_conntrack_init_ve, (void)); ++INIT_KSYM_CALL(int, init_nf_ct_l3proto_ipv4, (void)); ++INIT_KSYM_CALL(int, init_nf_ct_l3proto_ipv6, (void)); ++INIT_KSYM_CALL(int, nf_nat_init, (void)); ++INIT_KSYM_CALL(int, init_iptable_nat, (void)); ++INIT_KSYM_CALL(void, fini_iptable_nat, (void)); ++INIT_KSYM_CALL(int, init_nftable_nat, (void)); ++INIT_KSYM_CALL(void, fini_nftable_nat, (void)); ++INIT_KSYM_CALL(void, nf_nat_cleanup, (void)); ++INIT_KSYM_CALL(void, fini_iptable_conntrack, (void)); ++INIT_KSYM_CALL(void, nf_conntrack_cleanup_ve, (void)); ++INIT_KSYM_CALL(void, fini_nf_ct_l3proto_ipv4, (void)); ++INIT_KSYM_CALL(void, fini_nf_ct_l3proto_ipv6, (void)); ++ ++#if defined(CONFIG_VE_CALLS_MODULE) || defined(CONFIG_VE_CALLS) ++INIT_KSYM_MODULE(vzmon); ++INIT_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env)); ++ ++void do_env_free(struct ve_struct *env) ++{ ++ KSYMSAFECALL_VOID(vzmon, real_do_env_free, (env)); ++} ++EXPORT_SYMBOL(do_env_free); ++#endif ++ ++#if defined(CONFIG_VE_ETHDEV) || defined(CONFIG_VE_ETHDEV_MODULE) ++INIT_KSYM_MODULE(vzethdev); ++INIT_KSYM_CALL(int, veth_open, (struct net_device *dev)); ++#endif ++ ++struct ve_struct ve0 = { ++ .counter = ATOMIC_INIT(1), ++ .pcounter = ATOMIC_INIT(1), ++ .ve_list = LIST_HEAD_INIT(ve0.ve_list), ++ .vetask_lh = LIST_HEAD_INIT(ve0.vetask_lh), ++ .start_jiffies = INITIAL_JIFFIES, ++#ifdef CONFIG_UNIX98_PTYS ++ .devpts_config = &devpts_config, ++#endif ++ .ve_ns = &init_nsproxy, ++ .ve_netns = &init_net, ++ .is_running = 1, ++ .op_sem = __RWSEM_INITIALIZER(ve0.op_sem), ++#ifdef CONFIG_VE_IPTABLES ++ .ipt_mask = ~0ULL, ++#endif ++}; ++ ++EXPORT_SYMBOL(ve0); ++ ++#ifdef CONFIG_SMP ++static struct { ++ void *ptrs[NR_CPUS]; ++} ve0_cpu_stats; ++#endif ++static struct ve_cpu_stats ve0_cpu_stats_data[NR_CPUS]; ++ ++LIST_HEAD(ve_list_head); ++rwlock_t ve_list_lock = RW_LOCK_UNLOCKED; ++ ++LIST_HEAD(ve_cleanup_list); ++DEFINE_SPINLOCK(ve_cleanup_lock); ++struct task_struct *ve_cleanup_thread; ++ ++EXPORT_SYMBOL(ve_list_lock); ++EXPORT_SYMBOL(ve_list_head); ++EXPORT_SYMBOL(ve_cleanup_lock); ++EXPORT_SYMBOL(ve_cleanup_list); ++EXPORT_SYMBOL(ve_cleanup_thread); ++ ++void init_ve0(void) ++{ ++ struct ve_struct *ve; ++ ++ ve = get_ve0(); ++ ve->cpu_stats = static_percpu_ptr(&ve0_cpu_stats, ve0_cpu_stats_data); ++ list_add(&ve->ve_list, &ve_list_head); ++} ++ ++void ve_cleanup_schedule(struct ve_struct *ve) ++{ ++ BUG_ON(ve_cleanup_thread == NULL); ++ ++ spin_lock(&ve_cleanup_lock); ++ list_add_tail(&ve->cleanup_list, 
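
ve_cleanup_schedule() above is the producer half of a deferred-teardown scheme: final destruction is queued and handed to ve_cleanup_thread instead of running in whatever context dropped the last reference. The consumer loop is not part of this excerpt; a plausible sketch of it, assuming real_do_env_free() does the actual work, could look like this:

	/* Hypothetical consumer for ve_cleanup_thread; the real loop lives
	 * in a part of the patch not shown here. */
	static int ve_cleanup_thread_fn(void *unused)
	{
		while (!kthread_should_stop()) {
			struct ve_struct *ve = NULL;

			/* set state before checking the list so a wakeup
			 * between check and sleep is not lost */
			set_current_state(TASK_INTERRUPTIBLE);
			spin_lock(&ve_cleanup_lock);
			if (!list_empty(&ve_cleanup_list)) {
				ve = list_first_entry(&ve_cleanup_list,
						struct ve_struct, cleanup_list);
				list_del(&ve->cleanup_list);
			}
			spin_unlock(&ve_cleanup_lock);

			if (ve == NULL) {
				schedule();	/* until ve_cleanup_schedule() */
				continue;
			}
			__set_current_state(TASK_RUNNING);
			real_do_env_free(ve);	/* heavy work, outside the lock */
		}
		return 0;
	}
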
&ve_cleanup_list); ++ spin_unlock(&ve_cleanup_lock); ++ ++ wake_up_process(ve_cleanup_thread); ++} +diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c +new file mode 100644 +index 0000000..5aab66c +--- /dev/null ++++ b/kernel/ve/vecalls.c +@@ -0,0 +1,2422 @@ ++/* ++ * linux/kernel/ve/vecalls.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ */ ++ ++/* ++ * 'vecalls.c' is file with basic VE support. It provides basic primities ++ * along with initialization script ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#ifdef CONFIG_VZ_FAIRSCHED ++#include ++#endif ++ ++#include ++#include ++#include ++#include ++ ++int nr_ve = 1; /* One VE always exists. Compatibility with vestat */ ++EXPORT_SYMBOL(nr_ve); ++ ++static int do_env_enter(struct ve_struct *ve, unsigned int flags); ++static int alloc_ve_tty_drivers(struct ve_struct* ve); ++static void free_ve_tty_drivers(struct ve_struct* ve); ++static int register_ve_tty_drivers(struct ve_struct* ve); ++static void unregister_ve_tty_drivers(struct ve_struct* ve); ++static int init_ve_tty_drivers(struct ve_struct *); ++static void fini_ve_tty_drivers(struct ve_struct *); ++static void clear_termios(struct tty_driver* driver ); ++ ++static void vecalls_exit(void); ++ ++struct ve_struct *__find_ve_by_id(envid_t veid) ++{ ++ struct ve_struct *ve; ++ ++ for_each_ve(ve) { ++ if (ve->veid == veid) ++ return ve; ++ } ++ return NULL; ++} ++EXPORT_SYMBOL(__find_ve_by_id); ++ ++struct ve_struct *get_ve_by_id(envid_t veid) ++{ ++ struct ve_struct *ve; ++ read_lock(&ve_list_lock); ++ ve = __find_ve_by_id(veid); ++ get_ve(ve); ++ read_unlock(&ve_list_lock); ++ return ve; ++} ++EXPORT_SYMBOL(get_ve_by_id); ++ ++/* ++ * real_put_ve() MUST be used instead of put_ve() inside vecalls. 
++ */ ++void real_do_env_free(struct ve_struct *ve); ++static inline void real_put_ve(struct ve_struct *ve) ++{ ++ if (ve && atomic_dec_and_test(&ve->counter)) { ++ BUG_ON(atomic_read(&ve->pcounter) > 0); ++ BUG_ON(ve->is_running); ++ real_do_env_free(ve); ++ } ++} ++ ++static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat __user *buf) ++{ ++ struct ve_struct *ve; ++ struct vz_cpu_stat *vstat; ++ int retval; ++ int i, cpu; ++ unsigned long tmp; ++ ++ if (!ve_is_super(get_exec_env()) && (veid != get_exec_env()->veid)) ++ return -EPERM; ++ if (veid == 0) ++ return -ESRCH; ++ ++ vstat = kzalloc(sizeof(*vstat), GFP_KERNEL); ++ if (!vstat) ++ return -ENOMEM; ++ ++ retval = -ESRCH; ++ read_lock(&ve_list_lock); ++ ve = __find_ve_by_id(veid); ++ if (ve == NULL) ++ goto out_unlock; ++ for_each_online_cpu(cpu) { ++ struct ve_cpu_stats *st; ++ ++ st = VE_CPU_STATS(ve, cpu); ++ vstat->user_jif += (unsigned long)cputime64_to_clock_t(st->user); ++ vstat->nice_jif += (unsigned long)cputime64_to_clock_t(st->nice); ++ vstat->system_jif += (unsigned long)cputime64_to_clock_t(st->system); ++ vstat->idle_clk += ve_sched_get_idle_time(ve, cpu); ++ } ++ vstat->uptime_clk = get_cycles() - ve->start_cycles; ++ vstat->uptime_jif = (unsigned long)cputime64_to_clock_t( ++ get_jiffies_64() - ve->start_jiffies); ++ for (i = 0; i < 3; i++) { ++ tmp = ve->avenrun[i] + (FIXED_1/200); ++ vstat->avenrun[i].val_int = LOAD_INT(tmp); ++ vstat->avenrun[i].val_frac = LOAD_FRAC(tmp); ++ } ++ read_unlock(&ve_list_lock); ++ ++ retval = 0; ++ if (copy_to_user(buf, vstat, sizeof(*vstat))) ++ retval = -EFAULT; ++out_free: ++ kfree(vstat); ++ return retval; ++ ++out_unlock: ++ read_unlock(&ve_list_lock); ++ goto out_free; ++} ++ ++static int real_setdevperms(envid_t veid, unsigned type, ++ dev_t dev, unsigned mask) ++{ ++ struct ve_struct *ve; ++ int err; ++ ++ if (!capable(CAP_SETVEID) || veid == 0) ++ return -EPERM; ++ ++ if ((ve = get_ve_by_id(veid)) == NULL) ++ return -ESRCH; ++ ++ down_read(&ve->op_sem); ++ err = -ESRCH; ++ if (ve->is_running) ++ err = set_device_perms_ve(ve, type, dev, mask); ++ up_read(&ve->op_sem); ++ real_put_ve(ve); ++ return err; ++} ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * VE start: subsystems ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++#ifdef CONFIG_INET ++#include ++#include ++#include ++#include ++ ++static int init_fini_ve_mibs(struct ve_struct *ve, int fini) ++{ ++ if (fini) ++ goto fini; ++ if (init_ipv4_mibs()) ++ goto err_ipv4; ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ if (init_ipv6_mibs()) ++ goto err_ipv6; ++#endif ++ return 0; ++ ++fini: ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ cleanup_ipv6_mibs(); ++err_ipv6: ++#endif ++ cleanup_ipv4_mibs(); ++err_ipv4: ++ return -ENOMEM; ++} ++ ++static inline int init_ve_mibs(struct ve_struct *ve) ++{ ++ return init_fini_ve_mibs(ve, 0); ++} ++ ++static inline void fini_ve_mibs(struct ve_struct *ve) ++{ ++ (void)init_fini_ve_mibs(ve, 1); ++} ++#else ++#define init_ve_mibs(ve) (0) ++#define fini_ve_mibs(ve) do { } while (0) ++#endif ++ ++static int prepare_proc_root(struct ve_struct *ve) ++{ ++ struct proc_dir_entry *de; ++ ++ de = kzalloc(sizeof(struct proc_dir_entry) + 6, GFP_KERNEL); ++ if (de == NULL) ++ return -ENOMEM; ++ ++ memcpy(de + 1, "/proc", 6); ++ de->name = (char *)(de + 
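
ve_get_cpu_stat() above formats the per-VE avenrun values the same way /proc/loadavg does: add FIXED_1/200 (that is, 0.005) for rounding, then split into an integer part and a two-digit fraction. The split in miniature:

	#include <stdio.h>

	#define FSHIFT	11
	#define FIXED_1	(1 << FSHIFT)
	#define LOAD_INT(x)  ((x) >> FSHIFT)
	#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

	int main(void)
	{
		/* an avenrun value of 1.5 in fixed point, rounded as above */
		unsigned long tmp = (3 * FIXED_1) / 2 + FIXED_1 / 200;

		printf("%lu.%02lu\n", LOAD_INT(tmp), LOAD_FRAC(tmp));	/* 1.50 */
		return 0;
	}
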
1); ++ de->namelen = 5; ++ de->mode = S_IFDIR | S_IRUGO | S_IXUGO; ++ de->nlink = 2; ++ atomic_set(&de->count, 1); ++ ++ ve->proc_root = de; ++ return 0; ++} ++ ++#ifdef CONFIG_PROC_FS ++static int init_ve_proc(struct ve_struct *ve) ++{ ++ int err; ++ ++ err = prepare_proc_root(ve); ++ if (err) ++ goto out_root; ++ ++ err = register_ve_fs_type(ve, &proc_fs_type, ++ &ve->proc_fstype, &ve->proc_mnt); ++ if (err) ++ goto out_reg; ++ ++#ifdef CONFIG_PRINTK ++ proc_create("kmsg", S_IRUSR, ve->proc_root, &proc_kmsg_operations); ++#endif ++ proc_mkdir("vz", ve->proc_root); ++ ++ ve->ve_ns->pid_ns->proc_mnt = mntget(ve->proc_mnt); ++ return 0; ++ ++out_reg: ++ /* proc_fstype and proc_root are freed in real_put_ve -> free_ve_proc */ ++ ; ++out_root: ++ return err; ++} ++ ++static void fini_ve_proc(struct ve_struct *ve) ++{ ++ remove_proc_entry("vz", ve->proc_root); ++ remove_proc_entry("kmsg", ve->proc_root); ++ unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt); ++ ve->proc_mnt = NULL; ++} ++ ++static void free_ve_proc(struct ve_struct *ve) ++{ ++ /* proc filesystem frees proc_dir_entries on remove_proc_entry() only, ++ so we check that everything was removed and not lost */ ++ if (ve->proc_root && ve->proc_root->subdir) { ++ struct proc_dir_entry *p = ve->proc_root; ++ printk(KERN_WARNING "CT: %d: proc entry /proc", ve->veid); ++ while ((p = p->subdir) != NULL) ++ printk("/%s", p->name); ++ printk(" is not removed!\n"); ++ } ++ ++ kfree(ve->proc_root); ++ kfree(ve->proc_fstype); ++ ++ ve->proc_fstype = NULL; ++ ve->proc_root = NULL; ++} ++#else ++#define init_ve_proc(ve) (0) ++#define fini_ve_proc(ve) do { } while (0) ++#define free_ve_proc(ve) do { } while (0) ++#endif ++ ++#ifdef CONFIG_UNIX98_PTYS ++#include ++ ++/* ++ * DEVPTS needs a virtualization: each environment should see each own list of ++ * pseudo-terminals. ++ * To implement it we need to have separate devpts superblocks for each ++ * VE, and each VE should mount its own one. ++ * Thus, separate vfsmount structures are required. ++ * To minimize intrusion into vfsmount lookup code, separate file_system_type ++ * structures are created. ++ * ++ * In addition to this, patch fo character device itself is required, as file ++ * system itself is used only for MINOR/MAJOR lookup. 
++ */ ++ ++static int init_ve_devpts(struct ve_struct *ve) ++{ ++ int err; ++ ++ err = -ENOMEM; ++ ve->devpts_config = kzalloc(sizeof(struct devpts_config), GFP_KERNEL); ++ if (ve->devpts_config == NULL) ++ goto out; ++ ++ ve->devpts_config->mode = 0600; ++ err = register_ve_fs_type(ve, &devpts_fs_type, ++ &ve->devpts_fstype, &ve->devpts_mnt); ++ if (err) { ++ kfree(ve->devpts_config); ++ ve->devpts_config = NULL; ++ } ++out: ++ return err; ++} ++ ++static void fini_ve_devpts(struct ve_struct *ve) ++{ ++ unregister_ve_fs_type(ve->devpts_fstype, ve->devpts_mnt); ++ /* devpts_fstype is freed in real_put_ve -> free_ve_filesystems */ ++ ve->devpts_mnt = NULL; ++ kfree(ve->devpts_config); ++ ve->devpts_config = NULL; ++} ++#else ++#define init_ve_devpts(ve) (0) ++#define fini_ve_devpts(ve) do { } while (0) ++#endif ++ ++static int init_ve_shmem(struct ve_struct *ve) ++{ ++ return register_ve_fs_type(ve, ++ &tmpfs_fs_type, ++ &ve->shmem_fstype, ++ &ve->shmem_mnt); ++} ++ ++static void fini_ve_shmem(struct ve_struct *ve) ++{ ++ unregister_ve_fs_type(ve->shmem_fstype, ve->shmem_mnt); ++ /* shmem_fstype is freed in real_put_ve -> free_ve_filesystems */ ++ ve->shmem_mnt = NULL; ++} ++ ++#ifdef CONFIG_SYSFS ++static int init_ve_sysfs_root(struct ve_struct *ve) ++{ ++ struct sysfs_dirent *sysfs_root; ++ ++ sysfs_root = kzalloc(sizeof(struct sysfs_dirent), GFP_KERNEL); ++ if (sysfs_root == NULL) ++ return -ENOMEM; ++ sysfs_root->s_name = ""; ++ atomic_set(&sysfs_root->s_count, 1); ++ sysfs_root->s_flags = SYSFS_DIR; ++ sysfs_root->s_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; ++ sysfs_root->s_ino = 1; ++ ++ ve->_sysfs_root = sysfs_root; ++ return 0; ++} ++#endif ++ ++#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) ++extern struct device_attribute ve_net_class_attributes[]; ++static inline int init_ve_netclass(void) ++{ ++ struct class *nc; ++ int err; ++ ++ nc = kzalloc(sizeof(*nc), GFP_KERNEL); ++ if (!nc) ++ return -ENOMEM; ++ ++ nc->name = net_class.name; ++ nc->dev_release = net_class.dev_release; ++ nc->dev_uevent = net_class.dev_uevent; ++ nc->dev_attrs = ve_net_class_attributes; ++ ++ err = class_register(nc); ++ if (!err) { ++ get_exec_env()->net_class = nc; ++ return 0; ++ } ++ kfree(nc); ++ return err; ++} ++ ++static inline void fini_ve_netclass(void) ++{ ++ struct ve_struct *ve = get_exec_env(); ++ ++ class_unregister(ve->net_class); ++ kfree(ve->net_class); ++ ve->net_class = NULL; ++} ++#else ++static inline int init_ve_netclass(void) { return 0; } ++static inline void fini_ve_netclass(void) { ; } ++#endif ++ ++extern struct kset devices_subsys; ++ ++static const struct { ++ unsigned minor; ++ char *name; ++} mem_class_devices [] = { ++ {3, "null"}, ++ {5, "zero"}, ++ {7, "full"}, ++ {8, "random"}, ++ {9, "urandom"}, ++ {0, NULL}, ++}; ++ ++static int init_ve_mem_class(void) ++{ ++ int i; ++ struct class *ve_mem_class; ++ ++ ve_mem_class = class_create(THIS_MODULE, "mem"); ++ if (IS_ERR(ve_mem_class)) ++ return -ENOMEM; ++ ++ for (i = 0; mem_class_devices[i].name; i++) ++ device_create(ve_mem_class, NULL, ++ MKDEV(MEM_MAJOR, mem_class_devices[i].minor), ++ mem_class_devices[i].name); ++ ++ get_exec_env()->mem_class = ve_mem_class; ++ return 0; ++} ++ ++ ++void fini_ve_mem_class(void) ++{ ++ int i; ++ struct class *ve_mem_class = get_exec_env()->mem_class; ++ ++ for (i = 0; mem_class_devices[i].name; i++) ++ device_destroy(ve_mem_class, ++ MKDEV(MEM_MAJOR, mem_class_devices[i].minor)); ++ class_destroy(ve_mem_class); ++} ++ ++static int init_ve_sysfs(struct ve_struct *ve) ++{ ++ int 
err; ++ ++#ifdef CONFIG_SYSFS ++ err = 0; ++ if (ve->features & VE_FEATURE_SYSFS) { ++ err = init_ve_sysfs_root(ve); ++ if (err != 0) ++ goto out; ++ err = register_ve_fs_type(ve, ++ &sysfs_fs_type, ++ &ve->sysfs_fstype, ++ &ve->sysfs_mnt); ++ if (err != 0) ++ goto out_fs_type; ++ } ++#endif ++ ++ err = classes_init(); ++ if (err != 0) ++ goto err_classes; ++ ++ err = devices_init(); ++ if (err != 0) ++ goto err_devices; ++ ++ err = init_ve_netclass(); ++ if (err != 0) ++ goto err_net; ++ ++ err = init_ve_tty_class(); ++ if (err != 0) ++ goto err_tty; ++ ++ err = init_ve_mem_class(); ++ if (err != 0) ++ goto err_mem; ++ ++ return 0; ++ ++err_mem: ++ fini_ve_tty_class(); ++err_tty: ++ fini_ve_netclass(); ++err_net: ++ devices_fini(); ++err_devices: ++ classes_fini(); ++err_classes: ++#ifdef CONFIG_SYSFS ++ unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); ++ /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ ++out_fs_type: ++ kfree(ve->_sysfs_root); ++ ve->_sysfs_root = NULL; ++out: ++#endif ++ return err; ++} ++ ++static void fini_ve_sysfs(struct ve_struct *ve) ++{ ++ fini_ve_mem_class(); ++ fini_ve_tty_class(); ++ fini_ve_netclass(); ++ devices_fini(); ++ classes_fini(); ++#ifdef CONFIG_SYSFS ++ unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); ++ ve->sysfs_mnt = NULL; ++ kfree(ve->_sysfs_root); ++ ve->_sysfs_root = NULL; ++ /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ ++#endif ++} ++ ++static void free_ve_filesystems(struct ve_struct *ve) ++{ ++#ifdef CONFIG_SYSFS ++ kfree(ve->sysfs_fstype); ++ ve->sysfs_fstype = NULL; ++#endif ++ kfree(ve->shmem_fstype); ++ ve->shmem_fstype = NULL; ++ ++ kfree(ve->devpts_fstype); ++ ve->devpts_fstype = NULL; ++ ++ free_ve_proc(ve); ++} ++ ++static int init_printk(struct ve_struct *ve) ++{ ++ struct ve_prep_printk { ++ wait_queue_head_t log_wait; ++ unsigned log_start; ++ unsigned log_end; ++ unsigned logged_chars; ++ } *tmp; ++ ++ tmp = kzalloc(sizeof(struct ve_prep_printk), GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ ++ init_waitqueue_head(&tmp->log_wait); ++ ve->_log_wait = &tmp->log_wait; ++ ve->_log_start = &tmp->log_start; ++ ve->_log_end = &tmp->log_end; ++ ve->_logged_chars = &tmp->logged_chars; ++ /* ve->log_buf will be initialized later by ve_log_init() */ ++ return 0; ++} ++ ++static void fini_printk(struct ve_struct *ve) ++{ ++ /* ++ * there is no spinlock protection here because nobody can use ++ * log_buf at the moments when this code is called. ++ */ ++ kfree(ve->log_buf); ++ kfree(ve->_log_wait); ++} ++ ++static void fini_venet(struct ve_struct *ve) ++{ ++#ifdef CONFIG_INET ++ tcp_v4_kill_ve_sockets(ve); ++ synchronize_net(); ++#endif ++} ++ ++static int init_ve_sched(struct ve_struct *ve) ++{ ++#ifdef CONFIG_VZ_FAIRSCHED ++ int err; ++ ++ /* ++ * We refuse to switch to an already existing node since nodes ++ * keep a pointer to their ve_struct... 
++ */ ++ err = sys_fairsched_mknod(0, 1, ve->veid); ++ if (err < 0) { ++ printk(KERN_WARNING "Can't create fairsched node %d\n", ++ ve->veid); ++ return err; ++ } ++ err = sys_fairsched_mvpr(current->pid, ve->veid); ++ if (err) { ++ printk(KERN_WARNING "Can't switch to fairsched node %d\n", ++ ve->veid); ++ if (sys_fairsched_rmnod(ve->veid)) ++ printk(KERN_ERR "Can't clean fairsched node %d\n", ++ ve->veid); ++ return err; ++ } ++#endif ++ ve_sched_attach(ve); ++ return 0; ++} ++ ++static void fini_ve_sched(struct ve_struct *ve) ++{ ++#ifdef CONFIG_VZ_FAIRSCHED ++ if (task_fairsched_node_id(current) == ve->veid) ++ if (sys_fairsched_mvpr(current->pid, FAIRSCHED_INIT_NODE_ID)) ++ printk(KERN_WARNING "Can't leave fairsched node %d\n", ++ ve->veid); ++ if (sys_fairsched_rmnod(ve->veid)) ++ printk(KERN_ERR "Can't remove fairsched node %d\n", ++ ve->veid); ++#endif ++} ++ ++/* ++ * Namespaces ++ */ ++ ++static inline int init_ve_namespaces(struct ve_struct *ve, ++ struct nsproxy **old) ++{ ++ int err; ++ struct task_struct *tsk; ++ struct nsproxy *cur; ++ ++ tsk = current; ++ cur = tsk->nsproxy; ++ ++ err = copy_namespaces(CLONE_NAMESPACES_MASK & ~CLONE_NEWNET, tsk); ++ if (err < 0) ++ return err; ++ ++ ve->ve_ns = get_nsproxy(tsk->nsproxy); ++ memcpy(ve->ve_ns->uts_ns->name.release, virt_utsname.release, ++ sizeof(virt_utsname.release)); ++ ++ if (cur->pid_ns->flags & PID_NS_HIDE_CHILD) ++ ve->ve_ns->pid_ns->flags |= PID_NS_HIDDEN; ++ ++ *old = cur; ++ return 0; ++} ++ ++static inline void fini_ve_namespaces(struct ve_struct *ve, ++ struct nsproxy *old) ++{ ++ struct task_struct *tsk = current; ++ struct nsproxy *tmp; ++ ++ if (old) { ++ tmp = tsk->nsproxy; ++ tsk->nsproxy = get_nsproxy(old); ++ put_nsproxy(tmp); ++ tmp = ve->ve_ns; ++ ve->ve_ns = get_nsproxy(old); ++ put_nsproxy(tmp); ++ } else { ++ put_nsproxy(ve->ve_ns); ++ ve->ve_ns = NULL; ++ } ++} ++ ++static int init_ve_netns(struct ve_struct *ve, struct nsproxy **old) ++{ ++ int err; ++ struct task_struct *tsk; ++ struct nsproxy *cur; ++ ++ tsk = current; ++ cur = tsk->nsproxy; ++ ++ err = copy_namespaces(CLONE_NEWNET, tsk); ++ if (err < 0) ++ return err; ++ ++ put_nsproxy(ve->ve_ns); ++ ve->ve_ns = get_nsproxy(tsk->nsproxy); ++ ve->ve_netns = get_net(ve->ve_ns->net_ns); ++ *old = cur; ++ return 0; ++} ++ ++static inline void switch_ve_namespaces(struct ve_struct *ve, ++ struct task_struct *tsk) ++{ ++ struct nsproxy *old_ns; ++ struct nsproxy *new_ns; ++ ++ BUG_ON(tsk != current); ++ old_ns = tsk->nsproxy; ++ new_ns = ve->ve_ns; ++ ++ if (old_ns != new_ns) { ++ tsk->nsproxy = get_nsproxy(new_ns); ++ put_nsproxy(old_ns); ++ } ++} ++ ++static __u64 get_ve_features(env_create_param_t *data, int datalen) ++{ ++ __u64 known_features; ++ ++ if (datalen < sizeof(struct env_create_param3)) ++ /* this version of vzctl is aware of VE_FEATURES_OLD only */ ++ known_features = VE_FEATURES_OLD; ++ else ++ known_features = data->known_features; ++ ++ /* ++ * known features are set as required ++ * yet unknown features are set as in VE_FEATURES_DEF ++ */ ++ return (data->feature_mask & known_features) | ++ (VE_FEATURES_DEF & ~known_features); ++} ++ ++static int init_ve_struct(struct ve_struct *ve, envid_t veid, ++ u32 class_id, env_create_param_t *data, int datalen) ++{ ++ (void)get_ve(ve); ++ ve->veid = veid; ++ ve->class_id = class_id; ++ ve->features = get_ve_features(data, datalen); ++ INIT_LIST_HEAD(&ve->vetask_lh); ++ init_rwsem(&ve->op_sem); ++ ++ ve->start_timespec = current->start_time; ++ /* The value is wrong, but it is never compared to 
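
get_ve_features() above merges the userspace request with kernel defaults so old vzctl binaries stay compatible: bits the tool declared knowledge of are honored as requested, while bits it has never heard of fall back to VE_FEATURES_DEF. The merge in miniature (the mask values here are illustrative, not the real ones):

	#include <stdio.h>

	typedef unsigned long long u64;

	#define VE_FEATURES_OLD 0x0fULL	/* illustrative */
	#define VE_FEATURES_DEF 0x15ULL	/* illustrative */

	/* Known bits come from the request; unknown bits from the default. */
	static u64 ve_features(u64 requested, u64 known)
	{
		return (requested & known) | (VE_FEATURES_DEF & ~known);
	}

	int main(void)
	{
		/* old tool requests 0x3 and knows only the low four bits:
		 * bit 4 is silently taken from the default -> 0x13 */
		printf("%#llx\n", ve_features(0x3, VE_FEATURES_OLD));
		return 0;
	}
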
process ++ * start times */ ++ ve->start_jiffies = get_jiffies_64(); ++ ve->start_cycles = get_cycles(); ++ ++ return 0; ++} ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * /proc/meminfo virtualization ++ * ++ ********************************************************************** ++ **********************************************************************/ ++static int ve_set_meminfo(envid_t veid, unsigned long val) ++{ ++#ifdef CONFIG_BEANCOUNTERS ++ struct ve_struct *ve; ++ ++ ve = get_ve_by_id(veid); ++ if (!ve) ++ return -EINVAL; ++ ++ ve->meminfo_val = val; ++ real_put_ve(ve); ++ return 0; ++#else ++ return -ENOTTY; ++#endif ++} ++ ++static int init_ve_meminfo(struct ve_struct *ve) ++{ ++ ve->meminfo_val = 0; ++ return 0; ++} ++ ++static inline void fini_ve_meminfo(struct ve_struct *ve) ++{ ++} ++ ++static void set_ve_root(struct ve_struct *ve, struct task_struct *tsk) ++{ ++ read_lock(&tsk->fs->lock); ++ ve->root_path = tsk->fs->root; ++ read_unlock(&tsk->fs->lock); ++ mark_tree_virtual(&ve->root_path); ++} ++ ++static void set_ve_caps(struct ve_struct *ve, struct task_struct *tsk) ++{ ++ /* required for real_setdevperms from register_ve_ above */ ++ memcpy(&ve->ve_cap_bset, &tsk->cap_effective, sizeof(kernel_cap_t)); ++ cap_lower(ve->ve_cap_bset, CAP_SETVEID); ++} ++ ++static int ve_list_add(struct ve_struct *ve) ++{ ++ write_lock_irq(&ve_list_lock); ++ if (__find_ve_by_id(ve->veid) != NULL) ++ goto err_exists; ++ ++ list_add(&ve->ve_list, &ve_list_head); ++ nr_ve++; ++ write_unlock_irq(&ve_list_lock); ++ return 0; ++ ++err_exists: ++ write_unlock_irq(&ve_list_lock); ++ return -EEXIST; ++} ++ ++static void ve_list_del(struct ve_struct *ve) ++{ ++ write_lock_irq(&ve_list_lock); ++ list_del(&ve->ve_list); ++ nr_ve--; ++ write_unlock_irq(&ve_list_lock); ++} ++ ++static void set_task_ve_caps(struct task_struct *tsk, struct ve_struct *ve) ++{ ++ kernel_cap_t bset; ++ ++ spin_lock(&task_capability_lock); ++ bset = ve->ve_cap_bset; ++ tsk->cap_effective = cap_intersect(tsk->cap_effective, bset); ++ tsk->cap_inheritable = cap_intersect(tsk->cap_inheritable, bset); ++ tsk->cap_permitted = cap_intersect(tsk->cap_permitted, bset); ++ spin_unlock(&task_capability_lock); ++} ++ ++void ve_move_task(struct task_struct *tsk, struct ve_struct *new) ++{ ++ struct ve_struct *old; ++ ++ might_sleep(); ++ BUG_ON(tsk != current); ++ BUG_ON(!(thread_group_leader(tsk) && thread_group_empty(tsk))); ++ ++ /* this probihibts ptracing of task entered to VE from host system */ ++ tsk->mm->vps_dumpable = 0; ++ /* setup capabilities before enter */ ++ set_task_ve_caps(tsk, new); ++ ++ old = tsk->ve_task_info.owner_env; ++ tsk->ve_task_info.owner_env = new; ++ tsk->ve_task_info.exec_env = new; ++ ++ write_lock_irq(&tasklist_lock); ++ list_del_rcu(&tsk->ve_task_info.vetask_list); ++ write_unlock_irq(&tasklist_lock); ++ ++ synchronize_rcu(); ++ ++ write_lock_irq(&tasklist_lock); ++ list_add_tail_rcu(&tsk->ve_task_info.vetask_list, ++ &new->vetask_lh); ++ write_unlock_irq(&tasklist_lock); ++ ++ atomic_dec(&old->pcounter); ++ real_put_ve(old); ++ ++ atomic_inc(&new->pcounter); ++ get_ve(new); ++ ++ tsk->cgroups = new->ve_css_set; ++} ++ ++EXPORT_SYMBOL(ve_move_task); ++ ++#ifdef CONFIG_VE_IPTABLES ++ ++#define KSYMIPTINIT(mask, ve, full_mask, mod, name, args) \ ++({ \ ++ int ret = 0; \ ++ if (VE_IPT_CMP(mask, full_mask) && \ ++ VE_IPT_CMP((ve)->_iptables_modules, \ ++ full_mask & ~(full_mask##_MOD))) { \ ++ 
ret = KSYMERRCALL(1, mod, name, args); \ ++ if (ret == 0) \ ++ (ve)->_iptables_modules |= \ ++ full_mask##_MOD; \ ++ if (ret == 1) \ ++ ret = 0; \ ++ } \ ++ ret; \ ++}) ++ ++#define KSYMIPTFINI(mask, full_mask, mod, name, args) \ ++({ \ ++ if (VE_IPT_CMP(mask, full_mask##_MOD)) \ ++ KSYMSAFECALL_VOID(mod, name, args); \ ++}) ++ ++ ++static int do_ve_iptables(struct ve_struct *ve, __u64 init_mask, ++ int init_or_cleanup) ++{ ++ int err; ++ ++ /* Remove when userspace will start supplying IPv6-related bits. */ ++ init_mask &= ~VE_IP_IPTABLES6; ++ init_mask &= ~VE_IP_FILTER6; ++ init_mask &= ~VE_IP_MANGLE6; ++ init_mask &= ~VE_IP_IPTABLE_NAT_MOD; ++ init_mask &= ~VE_NF_CONNTRACK_MOD; ++ if ((init_mask & VE_IP_IPTABLES) == VE_IP_IPTABLES) ++ init_mask |= VE_IP_IPTABLES6; ++ if ((init_mask & VE_IP_FILTER) == VE_IP_FILTER) ++ init_mask |= VE_IP_FILTER6; ++ if ((init_mask & VE_IP_MANGLE) == VE_IP_MANGLE) ++ init_mask |= VE_IP_MANGLE6; ++ if ((init_mask & VE_IP_NAT) == VE_IP_NAT) ++ init_mask |= VE_IP_IPTABLE_NAT; ++ ++ if ((init_mask & VE_IP_CONNTRACK) == VE_IP_CONNTRACK) ++ init_mask |= VE_NF_CONNTRACK; ++ ++ err = 0; ++ if (!init_or_cleanup) ++ goto cleanup; ++ ++ /* init part */ ++#if defined(CONFIG_NF_CONNTRACK_IPV4) || \ ++ defined(CONFIG_NF_CONNTRACK_IPV4_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_NF_CONNTRACK, ++ nf_conntrack, nf_conntrack_init_ve, ()); ++ if (err < 0) ++ goto err_nf_conntrack; ++ ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK, ++ nf_conntrack_ipv4, init_nf_ct_l3proto_ipv4, ()); ++ if (err < 0) ++ goto err_nf_conntrack_ipv4; ++#endif ++#if defined(CONFIG_NF_NAT) || \ ++ defined(CONFIG_NF_NAT_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT, ++ nf_nat, nf_nat_init, ()); ++ if (err < 0) ++ goto err_nftable_nat; ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLE_NAT, ++ iptable_nat, init_nftable_nat, ()); ++ if (err < 0) ++ goto err_nftable_nat2; ++#endif ++ return 0; ++ ++/* ------------------------------------------------------------------------- */ ++ ++cleanup: ++#if defined(CONFIG_NF_NAT) || \ ++ defined(CONFIG_NF_NAT_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLE_NAT, ++ iptable_nat, fini_nftable_nat, ()); ++err_nftable_nat2: ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT, ++ nf_nat, nf_nat_cleanup, ()); ++err_nftable_nat: ++#endif ++#if defined(CONFIG_NF_CONNTRACK_IPV4) || \ ++ defined(CONFIG_NF_CONNTRACK_IPV4_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK, ++ nf_conntrack_ipv4, fini_nf_ct_l3proto_ipv4, ()); ++err_nf_conntrack_ipv4: ++ KSYMIPTFINI(ve->_iptables_modules, VE_NF_CONNTRACK, ++ nf_conntrack, nf_conntrack_cleanup_ve, ()); ++err_nf_conntrack: ++#endif ++ /* Do not reset _iptables_modules as ++ * net hooks used one ++ */ ++ return err; ++} ++ ++static inline int init_ve_iptables(struct ve_struct *ve, __u64 init_mask) ++{ ++ return do_ve_iptables(ve, init_mask, 1); ++} ++ ++static inline void fini_ve_iptables(struct ve_struct *ve, __u64 init_mask) ++{ ++ (void)do_ve_iptables(ve, init_mask, 0); ++} ++ ++#else ++#define init_ve_iptables(x, y) (0) ++#define fini_ve_iptables(x, y) do { } while (0) ++#endif ++ ++static inline int init_ve_cpustats(struct ve_struct *ve) ++{ ++ ve->cpu_stats = alloc_percpu(struct ve_cpu_stats); ++ return ve->cpu_stats == NULL ? 
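[Editor's note] The mask fixup at the top of do_ve_iptables() can be read in isolation: userspace supplies only the IPv4-related bits, and the kernel derives the IPv6 companions before running the init chain. A compilable sketch with invented bit values, showing two of the pairs:

/* build: cc iptmask.c */
#include <stdio.h>
#include <stdint.h>

#define VE_IP_IPTABLES  (1ULL << 0)
#define VE_IP_IPTABLES6 (1ULL << 1)
#define VE_IP_FILTER    (1ULL << 2)
#define VE_IP_FILTER6   (1ULL << 3)

static uint64_t promote(uint64_t m)
{
        /* drop whatever v6 bits the caller passed, then re-derive
         * them from the v4 bits, as do_ve_iptables() does */
        m &= ~(VE_IP_IPTABLES6 | VE_IP_FILTER6);
        if ((m & VE_IP_IPTABLES) == VE_IP_IPTABLES)
                m |= VE_IP_IPTABLES6;
        if ((m & VE_IP_FILTER) == VE_IP_FILTER)
                m |= VE_IP_FILTER6;
        return m;
}

int main(void)
{
        /* caller asks for v4 iptables only; v6 bit is added for it */
        printf("%#llx\n", (unsigned long long)promote(VE_IP_IPTABLES));
        return 0;
}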
-ENOMEM : 0; ++} ++ ++static inline void free_ve_cpustats(struct ve_struct *ve) ++{ ++ free_percpu(ve->cpu_stats); ++ ve->cpu_stats = NULL; ++} ++ ++static int alone_in_pgrp(struct task_struct *tsk) ++{ ++ struct task_struct *p; ++ int alone = 0; ++ ++ read_lock(&tasklist_lock); ++ do_each_pid_task(task_pid(tsk), PIDTYPE_PGID, p) { ++ if (p != tsk) ++ goto out; ++ } while_each_pid_task(task_pid(tsk), PIDTYPE_PGID, p); ++ do_each_pid_task(task_pid(tsk), PIDTYPE_SID, p) { ++ if (p != tsk) ++ goto out; ++ } while_each_pid_task(task_pid(tsk), PIDTYPE_SID, p); ++ alone = 1; ++out: ++ read_unlock(&tasklist_lock); ++ return alone; ++} ++ ++static int do_env_create(envid_t veid, unsigned int flags, u32 class_id, ++ env_create_param_t *data, int datalen) ++{ ++ struct task_struct *tsk; ++ struct ve_struct *old; ++ struct ve_struct *old_exec; ++ struct ve_struct *ve; ++ __u64 init_mask; ++ int err; ++ struct nsproxy *old_ns, *old_ns_net; ++ DECLARE_COMPLETION_ONSTACK(sysfs_completion); ++ ++ tsk = current; ++ old = VE_TASK_INFO(tsk)->owner_env; ++ ++ if (!thread_group_leader(tsk) || !thread_group_empty(tsk)) ++ return -EINVAL; ++ ++ if (tsk->signal->tty) { ++ printk("ERR: CT init has controlling terminal\n"); ++ return -EINVAL; ++ } ++ if (task_pgrp(tsk) != task_pid(tsk) || ++ task_session(tsk) != task_pid(tsk)) { ++ int may_setsid; ++ ++ read_lock(&tasklist_lock); ++ may_setsid = !tsk->signal->leader && ++ !find_task_by_pid_type_ns(PIDTYPE_PGID, task_pid_nr(tsk), &init_pid_ns); ++ read_unlock(&tasklist_lock); ++ ++ if (!may_setsid) { ++ printk("ERR: CT init is process group leader\n"); ++ return -EINVAL; ++ } ++ } ++ /* Check that the process is not a leader of non-empty group/session. ++ * If it is, we cannot virtualize its PID and must fail. */ ++ if (!alone_in_pgrp(tsk)) { ++ printk("ERR: CT init is not alone in process group\n"); ++ return -EINVAL; ++ } ++ ++ ++ VZTRACE("%s: veid=%d classid=%d pid=%d\n", ++ __FUNCTION__, veid, class_id, current->pid); ++ ++ err = -ENOMEM; ++ ve = kzalloc(sizeof(struct ve_struct), GFP_KERNEL); ++ if (ve == NULL) ++ goto err_struct; ++ ++ init_ve_struct(ve, veid, class_id, data, datalen); ++ __module_get(THIS_MODULE); ++ down_write(&ve->op_sem); ++ if (flags & VE_LOCK) ++ ve->is_locked = 1; ++ ++ /* ++ * this should be done before adding to list ++ * because if calc_load_ve finds this ve in ++ * list it will be very surprised ++ */ ++ if ((err = init_ve_cpustats(ve)) < 0) ++ goto err_cpu_stats; ++ ++ if ((err = ve_list_add(ve)) < 0) ++ goto err_exist; ++ ++ /* this should be done before context switching */ ++ if ((err = init_printk(ve)) < 0) ++ goto err_log_wait; ++ ++ old_exec = set_exec_env(ve); ++ ++ if ((err = init_ve_sched(ve)) < 0) ++ goto err_sched; ++ ++ set_ve_root(ve, tsk); ++ ++ if ((err = init_ve_sysfs(ve))) ++ goto err_sysfs; ++ ++ if ((err = init_ve_mibs(ve))) ++ goto err_mibs; ++ ++ if ((err = init_ve_namespaces(ve, &old_ns))) ++ goto err_ns; ++ ++ if ((err = init_ve_proc(ve))) ++ goto err_proc; ++ ++ ++ init_mask = data ? 
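[Editor's note] The leadership checks above mirror what a container launcher has to arrange from userspace: fork first, then setsid() in the child, because setsid() fails with EPERM for a process-group leader — the same corner case do_env_create() probes with may_setsid. A minimal sketch of that convention:

/* build: cc leader.c */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        pid_t pid = fork();

        if (pid != 0)           /* parent: nothing more to do */
                return pid < 0;

        /* a fresh child is never a group leader, so this cannot
         * fail with EPERM */
        if (setsid() < 0) {
                perror("setsid");
                _exit(1);
        }
        /* now pid == pgrp == sid: alone in its group and session */
        printf("pid=%d pgrp=%d sid=%d\n",
               (int)getpid(), (int)getpgrp(), (int)getsid(0));
        return 0;
}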
data->iptables_mask : VE_IP_DEFAULT; ++ ++#ifdef CONFIG_VE_IPTABLES ++ /* Set up ipt_mask as it will be used during ++ * net namespace initialization ++ */ ++ ve->ipt_mask = init_mask; ++#endif ++ ++ if ((err = init_ve_netns(ve, &old_ns_net))) ++ goto err_netns; ++ ++ if ((err = init_ve_cgroups(ve))) ++ goto err_cgroup; ++ ++ if ((err = init_ve_tty_drivers(ve)) < 0) ++ goto err_tty; ++ ++ if ((err = init_ve_shmem(ve))) ++ goto err_shmem; ++ ++ if ((err = init_ve_devpts(ve))) ++ goto err_devpts; ++ ++ if((err = init_ve_meminfo(ve))) ++ goto err_meminf; ++ ++ set_ve_caps(ve, tsk); ++ ++ /* It is safe to initialize netfilter here as routing initialization and ++ interface setup will be done below. This means that NO skb can be ++ passed inside. Den */ ++ /* iptables ve initialization for non ve0; ++ ve0 init is in module_init */ ++ ++ if ((err = init_ve_iptables(ve, init_mask)) < 0) ++ goto err_iptables; ++ ++ if ((err = pid_ns_attach_init(ve->ve_ns->pid_ns, tsk)) < 0) ++ goto err_vpid; ++ ++ if ((err = ve_hook_iterate_init(VE_SS_CHAIN, ve)) < 0) ++ goto err_ve_hook; ++ ++ put_nsproxy(old_ns); ++ put_nsproxy(old_ns_net); ++ ++ /* finally: set vpids and move inside */ ++ ve_move_task(tsk, ve); ++ ++ ve->is_running = 1; ++ up_write(&ve->op_sem); ++ ++ printk(KERN_INFO "CT: %d: started\n", veid); ++ return veid; ++ ++err_ve_hook: ++ mntget(ve->proc_mnt); ++err_vpid: ++ fini_venet(ve); ++ fini_ve_iptables(ve, init_mask); ++err_iptables: ++ fini_ve_meminfo(ve); ++err_meminf: ++ fini_ve_devpts(ve); ++err_devpts: ++ fini_ve_shmem(ve); ++err_shmem: ++ fini_ve_tty_drivers(ve); ++err_tty: ++ fini_ve_cgroups(ve); ++err_cgroup: ++ fini_ve_namespaces(ve, old_ns_net); ++ put_nsproxy(old_ns_net); ++ ve->ve_netns->sysfs_completion = &sysfs_completion; ++ put_net(ve->ve_netns); ++ wait_for_completion(&sysfs_completion); ++err_netns: ++ /* ++ * If process hasn't become VE's init, proc_mnt won't be put during ++ * pidns death, so this mntput by hand is needed. If it has, we ++ * compensate with mntget above. ++ */ ++ mntput(ve->proc_mnt); ++ fini_ve_proc(ve); ++err_proc: ++ /* free_ve_utsname() is called inside real_put_ve() */ ++ fini_ve_namespaces(ve, old_ns); ++ put_nsproxy(old_ns); ++ /* ++ * We need to compensate, because fini_ve_namespaces() assumes ++ * ve->ve_ns will continue to be used after, but VE will be freed soon ++ * (in kfree() sense). ++ */ ++ put_nsproxy(ve->ve_ns); ++err_ns: ++ fini_ve_mibs(ve); ++err_mibs: ++ fini_ve_sysfs(ve); ++err_sysfs: ++ /* It is safe to restore current->envid here because ++ * ve_fairsched_detach does not use current->envid. */ ++ /* Really fairsched code uses current->envid in sys_fairsched_mknod ++ * only. It is correct if sys_fairsched_mknod is called from ++ * userspace. If sys_fairsched_mknod is called from ++ * ve_fairsched_attach, then node->envid and node->parent_node->envid ++ * are explicitly set to valid value after the call. 
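[Editor's note] The err_* ladder above follows the usual kernel convention: every init stage gets a matching label, and a failure jumps to the label that unwinds all previously completed stages in reverse order. A toy standalone version of the pattern, with invented stage names:

/* build: cc unwind.c */
#include <stdio.h>

static int init_a(void)  { puts("init a"); return 0; }
static int init_b(void)  { puts("init b"); return 0; }
static int init_c(void)  { puts("init c"); return -1; } /* simulate failure */
static void fini_b(void) { puts("fini b"); }
static void fini_a(void) { puts("fini a"); }

int main(void)
{
        int err;

        if ((err = init_a()) < 0)
                goto err_a;
        if ((err = init_b()) < 0)
                goto err_b;
        if ((err = init_c()) < 0)
                goto err_c;
        return 0;

        /* labels fall through: undo b, then a, newest first */
err_c:
        fini_b();
err_b:
        fini_a();
err_a:
        fprintf(stderr, "start failed, err=%d\n", err);
        return 1;
}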
*/ ++ /* FIXME */ ++ VE_TASK_INFO(tsk)->owner_env = old; ++ VE_TASK_INFO(tsk)->exec_env = old_exec; ++ ++ fini_ve_sched(ve); ++err_sched: ++ (void)set_exec_env(old_exec); ++ ++ /* we can jump here having incorrect envid */ ++ VE_TASK_INFO(tsk)->owner_env = old; ++ fini_printk(ve); ++err_log_wait: ++ /* cpustats will be freed in do_env_free */ ++ ve_list_del(ve); ++ up_write(&ve->op_sem); ++ ++ real_put_ve(ve); ++err_struct: ++ printk(KERN_INFO "CT: %d: failed to start with err=%d\n", veid, err); ++ return err; ++ ++err_exist: ++ free_ve_cpustats(ve); ++err_cpu_stats: ++ kfree(ve); ++ goto err_struct; ++} ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * VE start/stop callbacks ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++int real_env_create(envid_t veid, unsigned flags, u32 class_id, ++ env_create_param_t *data, int datalen) ++{ ++ int status; ++ struct ve_struct *ve; ++ ++ if (!flags) { ++ status = get_exec_env()->veid; ++ goto out; ++ } ++ ++ status = -EPERM; ++ if (!capable(CAP_SETVEID)) ++ goto out; ++ ++ status = -EINVAL; ++ if ((flags & VE_TEST) && (flags & (VE_ENTER|VE_CREATE))) ++ goto out; ++ ++ status = -EINVAL; ++ ve = get_ve_by_id(veid); ++ if (ve) { ++ if (flags & VE_TEST) { ++ status = 0; ++ goto out_put; ++ } ++ if (flags & VE_EXCLUSIVE) { ++ status = -EACCES; ++ goto out_put; ++ } ++ if (flags & VE_CREATE) { ++ flags &= ~VE_CREATE; ++ flags |= VE_ENTER; ++ } ++ } else { ++ if (flags & (VE_TEST|VE_ENTER)) { ++ status = -ESRCH; ++ goto out; ++ } ++ } ++ ++ if (flags & VE_CREATE) { ++ status = do_env_create(veid, flags, class_id, data, datalen); ++ goto out; ++ } else if (flags & VE_ENTER) ++ status = do_env_enter(ve, flags); ++ ++ /* else: returning EINVAL */ ++ ++out_put: ++ real_put_ve(ve); ++out: ++ return status; ++} ++EXPORT_SYMBOL(real_env_create); ++ ++static int do_env_enter(struct ve_struct *ve, unsigned int flags) ++{ ++ struct task_struct *tsk = current; ++ int err; ++ ++ VZTRACE("%s: veid=%d\n", __FUNCTION__, ve->veid); ++ ++ err = -EBUSY; ++ down_read(&ve->op_sem); ++ if (!ve->is_running) ++ goto out_up; ++ if (ve->is_locked && !(flags & VE_SKIPLOCK)) ++ goto out_up; ++ err = -EINVAL; ++ if (!thread_group_leader(tsk) || !thread_group_empty(tsk)) ++ goto out_up; ++ ++#ifdef CONFIG_VZ_FAIRSCHED ++ err = sys_fairsched_mvpr(current->pid, ve->veid); ++ if (err) ++ goto out_up; ++#endif ++ ve_sched_attach(ve); ++ switch_ve_namespaces(ve, tsk); ++ ve_move_task(current, ve); ++ ++ /* Check that the process is not a leader of non-empty group/session. ++ * If it is, we cannot virtualize its PID. Do not fail, just leave ++ * it non-virtual. ++ */ ++ if (alone_in_pgrp(tsk) && !(flags & VE_SKIPLOCK)) ++ pid_ns_attach_task(ve->ve_ns->pid_ns, tsk); ++ ++ /* Unlike VE_CREATE, we do not setsid() in VE_ENTER. ++ * Process is allowed to be in an external group/session. ++ * If user space callers wants, it will do setsid() after ++ * VE_ENTER. 
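[Editor's note] real_env_create()'s flag handling is easiest to see as a small decision table: VE_TEST probes for existence, VE_CREATE degrades to VE_ENTER when the container is already running, and VE_ENTER alone fails when it is not. An illustrative sketch — the constants and return values are stand-ins, not the real ABI:

/* build: cc flags.c */
#include <stdio.h>

#define VE_CREATE 1
#define VE_ENTER  2
#define VE_TEST   4

static int env_create(int exists, int flags)
{
        if (exists) {
                if (flags & VE_TEST)
                        return 0;               /* found */
                if (flags & VE_CREATE)
                        flags = VE_ENTER;       /* join the running CT */
        } else if (flags & (VE_TEST | VE_ENTER)) {
                return -3;                      /* -ESRCH: no such CT */
        }
        if (flags & VE_CREATE)
                return 100;                     /* pretend: started, veid */
        if (flags & VE_ENTER)
                return 100;                     /* pretend: entered, veid */
        return -22;                             /* -EINVAL */
}

int main(void)
{
        printf("%d %d %d\n",
               env_create(0, VE_TEST),          /* -3: not running  */
               env_create(0, VE_CREATE),        /* 100: starts      */
               env_create(1, VE_CREATE));       /* 100: enters      */
        return 0;
}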
++ */ ++ err = VE_TASK_INFO(tsk)->owner_env->veid; ++ tsk->did_ve_enter = 1; ++ ++out_up: ++ up_read(&ve->op_sem); ++ return err; ++} ++ ++static void env_cleanup(struct ve_struct *ve) ++{ ++ struct ve_struct *old_ve; ++ DECLARE_COMPLETION_ONSTACK(sysfs_completion); ++ ++ VZTRACE("real_do_env_cleanup\n"); ++ ++ down_read(&ve->op_sem); ++ old_ve = set_exec_env(ve); ++ ++ ve_hook_iterate_fini(VE_SS_CHAIN, ve); ++ ++ fini_venet(ve); ++ ++ /* no new packets in flight beyond this point */ ++ ++ /* kill iptables */ ++ /* No skb belonging to VE can exist at this point as unregister_netdev ++ is an operation awaiting until ALL skb's gone */ ++ fini_ve_iptables(ve, ve->_iptables_modules); ++ ++ fini_ve_sched(ve); ++ ++ fini_ve_devpts(ve); ++ fini_ve_shmem(ve); ++ unregister_ve_tty_drivers(ve); ++ fini_ve_meminfo(ve); ++ ++ fini_ve_cgroups(ve); ++ ++ fini_ve_namespaces(ve, NULL); ++ ve->ve_netns->sysfs_completion = &sysfs_completion; ++ put_net(ve->ve_netns); ++ wait_for_completion(&sysfs_completion); ++ fini_ve_mibs(ve); ++ fini_ve_proc(ve); ++ fini_ve_sysfs(ve); ++ ++ (void)set_exec_env(old_ve); ++ fini_printk(ve); /* no printk can happen in ve context anymore */ ++ ++ ve_list_del(ve); ++ up_read(&ve->op_sem); ++ ++ real_put_ve(ve); ++} ++ ++static DECLARE_COMPLETION(vzmond_complete); ++static volatile int stop_vzmond; ++ ++static int vzmond_helper(void *arg) ++{ ++ char name[18]; ++ struct ve_struct *ve; ++ ++ ve = (struct ve_struct *)arg; ++ snprintf(name, sizeof(name), "vzmond/%d", ve->veid); ++ daemonize(name); ++ env_cleanup(ve); ++ module_put_and_exit(0); ++} ++ ++static void do_pending_env_cleanups(void) ++{ ++ int err; ++ struct ve_struct *ve; ++ ++ spin_lock(&ve_cleanup_lock); ++ while (1) { ++ if (list_empty(&ve_cleanup_list) || need_resched()) ++ break; ++ ++ ve = list_first_entry(&ve_cleanup_list, ++ struct ve_struct, cleanup_list); ++ list_del(&ve->cleanup_list); ++ spin_unlock(&ve_cleanup_lock); ++ ++ __module_get(THIS_MODULE); ++ err = kernel_thread(vzmond_helper, (void *)ve, 0); ++ if (err < 0) { ++ env_cleanup(ve); ++ module_put(THIS_MODULE); ++ } ++ ++ spin_lock(&ve_cleanup_lock); ++ } ++ spin_unlock(&ve_cleanup_lock); ++} ++ ++static inline int have_pending_cleanups(void) ++{ ++ return !list_empty(&ve_cleanup_list); ++} ++ ++static int vzmond(void *arg) ++{ ++ daemonize("vzmond"); ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ while (!stop_vzmond || have_pending_cleanups()) { ++ schedule(); ++ try_to_freeze(); ++ if (signal_pending(current)) ++ flush_signals(current); ++ ++ do_pending_env_cleanups(); ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (have_pending_cleanups()) ++ __set_current_state(TASK_RUNNING); ++ } ++ ++ __set_task_state(current, TASK_RUNNING); ++ complete_and_exit(&vzmond_complete, 0); ++} ++ ++static int __init init_vzmond(void) ++{ ++ int pid; ++ struct task_struct *tsk; ++ ++ pid = kernel_thread(vzmond, NULL, 0); ++ if (pid > 0) { ++ tsk = find_task_by_pid(pid); ++ BUG_ON(tsk == NULL); ++ ve_cleanup_thread = tsk; ++ } ++ return pid; ++} ++ ++static void fini_vzmond(void) ++{ ++ stop_vzmond = 1; ++ wake_up_process(ve_cleanup_thread); ++ wait_for_completion(&vzmond_complete); ++ ve_cleanup_thread = NULL; ++ WARN_ON(!list_empty(&ve_cleanup_list)); ++} ++ ++void real_do_env_free(struct ve_struct *ve) ++{ ++ VZTRACE("real_do_env_free\n"); ++ ++ free_ve_tty_drivers(ve); ++ free_ve_filesystems(ve); ++ free_ve_cpustats(ve); ++ printk(KERN_INFO "CT: %d: stopped\n", VEID(ve)); ++ kfree(ve); ++ ++ module_put(THIS_MODULE); ++} ++EXPORT_SYMBOL(real_do_env_free); ++ ++ 
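[Editor's note] do_pending_env_cleanups() above uses the classic drain pattern: pop one entry while holding the lock, release the lock for the slow cleanup, then reacquire to fetch the next, so the lock is never held across expensive work. A userspace analogue with a pthread mutex:

/* build: cc -pthread drain.c */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item { struct item *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *head;

static void drain(void)
{
        pthread_mutex_lock(&lock);
        while (head) {
                struct item *it = head;
                head = it->next;
                pthread_mutex_unlock(&lock);    /* slow work unlocked */
                printf("cleaning %d\n", it->id);
                free(it);
                pthread_mutex_lock(&lock);      /* retake for next pop */
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct item *it = malloc(sizeof(*it));
                it->id = i;
                it->next = head;
                head = it;
        }
        drain();
        return 0;
}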
++/********************************************************************** ++ ********************************************************************** ++ * ++ * VE TTY handling ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++static struct tty_driver *alloc_ve_tty_driver(struct tty_driver *base, ++ struct ve_struct *ve) ++{ ++ size_t size; ++ struct tty_driver *driver; ++ ++ /* FIXME: make it a normal way (or wait till ms version) */ ++ ++ driver = kmalloc(sizeof(struct tty_driver), GFP_KERNEL_UBC); ++ if (!driver) ++ goto out; ++ ++ memcpy(driver, base, sizeof(struct tty_driver)); ++ ++ driver->driver_state = NULL; ++ ++ size = base->num * 3 * sizeof(void *); ++ if (!(driver->flags & TTY_DRIVER_DEVPTS_MEM)) { ++ void **p; ++ p = kzalloc(size, GFP_KERNEL_UBC); ++ if (!p) ++ goto out_free; ++ ++ driver->ttys = (struct tty_struct **)p; ++ driver->termios = (struct ktermios **)(p + driver->num); ++ driver->termios_locked = (struct ktermios **) ++ (p + driver->num * 2); ++ } else { ++ driver->ttys = NULL; ++ driver->termios = NULL; ++ driver->termios_locked = NULL; ++ } ++ ++ driver->owner_env = ve; ++ driver->flags |= TTY_DRIVER_INSTALLED; ++ driver->refcount = 0; ++ ++ return driver; ++ ++out_free: ++ kfree(driver); ++out: ++ return NULL; ++} ++ ++static void free_ve_tty_driver(struct tty_driver *driver) ++{ ++ if (!driver) ++ return; ++ ++ clear_termios(driver); ++ kfree(driver->ttys); ++ kfree(driver); ++} ++ ++static int alloc_ve_tty_drivers(struct ve_struct* ve) ++{ ++#ifdef CONFIG_LEGACY_PTYS ++ /* Traditional BSD devices */ ++ ve->pty_driver = alloc_ve_tty_driver(pty_driver, ve); ++ if (!ve->pty_driver) ++ goto out_mem; ++ ++ ve->pty_slave_driver = alloc_ve_tty_driver(pty_slave_driver, ve); ++ if (!ve->pty_slave_driver) ++ goto out_mem; ++ ++ ve->pty_driver->other = ve->pty_slave_driver; ++ ve->pty_slave_driver->other = ve->pty_driver; ++#endif ++ ++#ifdef CONFIG_UNIX98_PTYS ++ ve->ptm_driver = alloc_ve_tty_driver(ptm_driver, ve); ++ if (!ve->ptm_driver) ++ goto out_mem; ++ ++ ve->pts_driver = alloc_ve_tty_driver(pts_driver, ve); ++ if (!ve->pts_driver) ++ goto out_mem; ++ ++ ve->ptm_driver->other = ve->pts_driver; ++ ve->pts_driver->other = ve->ptm_driver; ++ ++ ve->allocated_ptys = kmalloc(sizeof(*ve->allocated_ptys), ++ GFP_KERNEL_UBC); ++ if (!ve->allocated_ptys) ++ goto out_mem; ++ idr_init(ve->allocated_ptys); ++#endif ++ return 0; ++ ++out_mem: ++ free_ve_tty_drivers(ve); ++ return -ENOMEM; ++} ++ ++static void free_ve_tty_drivers(struct ve_struct* ve) ++{ ++#ifdef CONFIG_LEGACY_PTYS ++ free_ve_tty_driver(ve->pty_driver); ++ free_ve_tty_driver(ve->pty_slave_driver); ++ ve->pty_driver = ve->pty_slave_driver = NULL; ++#endif ++#ifdef CONFIG_UNIX98_PTYS ++ free_ve_tty_driver(ve->ptm_driver); ++ free_ve_tty_driver(ve->pts_driver); ++ kfree(ve->allocated_ptys); ++ ve->ptm_driver = ve->pts_driver = NULL; ++ ve->allocated_ptys = NULL; ++#endif ++} ++ ++static inline void __register_tty_driver(struct tty_driver *driver) ++{ ++ list_add(&driver->tty_drivers, &tty_drivers); ++} ++ ++static inline void __unregister_tty_driver(struct tty_driver *driver) ++{ ++ if (!driver) ++ return; ++ list_del(&driver->tty_drivers); ++} ++ ++static int register_ve_tty_drivers(struct ve_struct* ve) ++{ ++ mutex_lock(&tty_mutex); ++#ifdef CONFIG_UNIX98_PTYS ++ __register_tty_driver(ve->ptm_driver); ++ __register_tty_driver(ve->pts_driver); ++#endif ++#ifdef CONFIG_LEGACY_PTYS ++ 
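[Editor's note] alloc_ve_tty_driver() below carves the three parallel arrays (ttys, termios, termios_locked) out of a single allocation, so one kfree(driver->ttys) releases all of them. The same slicing, standalone:

/* build: cc slices.c */
#include <stdio.h>
#include <stdlib.h>

struct demo {
        int num;
        void **ttys, **termios, **termios_locked;
};

static int demo_alloc(struct demo *d, int num)
{
        void **p = calloc(3 * num, sizeof(void *));
        if (!p)
                return -1;
        d->num = num;
        d->ttys = p;                            /* [0, num)       */
        d->termios = p + num;                   /* [num, 2*num)   */
        d->termios_locked = p + 2 * num;        /* [2*num, 3*num) */
        return 0;
}

int main(void)
{
        struct demo d;

        if (demo_alloc(&d, 4))
                return 1;
        printf("one block, three views: %p %p %p\n",
               (void *)d.ttys, (void *)d.termios,
               (void *)d.termios_locked);
        free(d.ttys);           /* one free releases all three arrays */
        return 0;
}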
__register_tty_driver(ve->pty_driver); ++ __register_tty_driver(ve->pty_slave_driver); ++#endif ++ mutex_unlock(&tty_mutex); ++ ++ return 0; ++} ++ ++static void unregister_ve_tty_drivers(struct ve_struct* ve) ++{ ++ VZTRACE("unregister_ve_tty_drivers\n"); ++ ++ mutex_lock(&tty_mutex); ++#ifdef CONFIG_LEGACY_PTYS ++ __unregister_tty_driver(ve->pty_driver); ++ __unregister_tty_driver(ve->pty_slave_driver); ++#endif ++#ifdef CONFIG_UNIX98_PTYS ++ __unregister_tty_driver(ve->ptm_driver); ++ __unregister_tty_driver(ve->pts_driver); ++#endif ++ mutex_unlock(&tty_mutex); ++} ++ ++static int init_ve_tty_drivers(struct ve_struct *ve) ++{ ++ int err; ++ ++ if ((err = alloc_ve_tty_drivers(ve))) ++ goto err_ttyalloc; ++ if ((err = register_ve_tty_drivers(ve))) ++ goto err_ttyreg; ++ return 0; ++ ++err_ttyreg: ++ free_ve_tty_drivers(ve); ++err_ttyalloc: ++ return err; ++} ++ ++static void fini_ve_tty_drivers(struct ve_struct *ve) ++{ ++ unregister_ve_tty_drivers(ve); ++ free_ve_tty_drivers(ve); ++} ++ ++/* ++ * Free the termios and termios_locked structures because ++ * we don't want to get memory leaks when modular tty ++ * drivers are removed from the kernel. ++ */ ++static void clear_termios(struct tty_driver *driver) ++{ ++ int i; ++ struct ktermios *tp; ++ ++ if (driver->termios == NULL) ++ return; ++ for (i = 0; i < driver->num; i++) { ++ tp = driver->termios[i]; ++ if (tp) { ++ driver->termios[i] = NULL; ++ kfree(tp); ++ } ++ tp = driver->termios_locked[i]; ++ if (tp) { ++ driver->termios_locked[i] = NULL; ++ kfree(tp); ++ } ++ } ++} ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * Pieces of VE network ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++#ifdef CONFIG_NET ++#include ++#include ++#include ++#include ++#include ++#include ++#endif ++ ++static int ve_dev_add(envid_t veid, char *dev_name) ++{ ++ struct net_device *dev; ++ struct ve_struct *dst_ve; ++ struct net *dst_net; ++ int err = -ESRCH; ++ ++ dst_ve = get_ve_by_id(veid); ++ if (dst_ve == NULL) ++ goto out; ++ ++ dst_net = dst_ve->ve_netns; ++ ++ rtnl_lock(); ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_name(&init_net, dev_name); ++ read_unlock(&dev_base_lock); ++ if (dev == NULL) ++ goto out_unlock; ++ ++ err = __dev_change_net_namespace(dev, dst_net, dev_name, ++ get_ve0(), dst_ve, get_exec_ub()); ++out_unlock: ++ rtnl_unlock(); ++ real_put_ve(dst_ve); ++ ++ if (dev == NULL) ++ printk(KERN_WARNING "%s: device %s not found\n", ++ __func__, dev_name); ++out: ++ return err; ++} ++ ++static int ve_dev_del(envid_t veid, char *dev_name) ++{ ++ struct net_device *dev; ++ struct ve_struct *src_ve; ++ struct net *src_net; ++ int err = -ESRCH; ++ ++ src_ve = get_ve_by_id(veid); ++ if (src_ve == NULL) ++ goto out; ++ ++ src_net = src_ve->ve_netns; ++ ++ rtnl_lock(); ++ ++ read_lock(&dev_base_lock); ++ dev = __dev_get_by_name(src_net, dev_name); ++ read_unlock(&dev_base_lock); ++ if (dev == NULL) ++ goto out_unlock; ++ ++ err = __dev_change_net_namespace(dev, &init_net, dev_name, ++ src_ve, get_ve0(), netdev_bc(dev)->owner_ub); ++out_unlock: ++ rtnl_unlock(); ++ real_put_ve(src_ve); ++ ++ if (dev == NULL) ++ printk(KERN_WARNING "%s: device %s not found\n", ++ __func__, dev_name); ++out: ++ return err; ++} ++ ++int real_ve_dev_map(envid_t veid, int op, char *dev_name) ++{ ++ if (!capable(CAP_SETVEID)) ++ return 
-EPERM; ++ switch (op) { ++ case VE_NETDEV_ADD: ++ return ve_dev_add(veid, dev_name); ++ case VE_NETDEV_DEL: ++ return ve_dev_del(veid, dev_name); ++ default: ++ return -EINVAL; ++ } ++} ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * VE information via /proc ++ * ++ ********************************************************************** ++ **********************************************************************/ ++#ifdef CONFIG_PROC_FS ++#if BITS_PER_LONG == 32 ++#define VESTAT_LINE_WIDTH (6 * 11 + 6 * 21) ++#define VESTAT_LINE_FMT "%10u %10lu %10lu %10lu %10Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %10lu\n" ++#define VESTAT_HEAD_FMT "%10s %10s %10s %10s %10s %20s %20s %20s %20s %20s %20s %10s\n" ++#else ++#define VESTAT_LINE_WIDTH (12 * 21) ++#define VESTAT_LINE_FMT "%20u %20lu %20lu %20lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20lu\n" ++#define VESTAT_HEAD_FMT "%20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n" ++#endif ++ ++static int vestat_seq_show(struct seq_file *m, void *v) ++{ ++ struct list_head *entry; ++ struct ve_struct *ve; ++ struct ve_struct *curve; ++ int cpu; ++ unsigned long user_ve, nice_ve, system_ve; ++ unsigned long long uptime; ++ cycles_t uptime_cycles, idle_time, strv_time, used; ++ ++ entry = (struct list_head *)v; ++ ve = list_entry(entry, struct ve_struct, ve_list); ++ ++ curve = get_exec_env(); ++ if (entry == ve_list_head.next || ++ (!ve_is_super(curve) && ve == curve)) { ++ /* print header */ ++ seq_printf(m, "%-*s\n", ++ VESTAT_LINE_WIDTH - 1, ++ "Version: 2.2"); ++ seq_printf(m, VESTAT_HEAD_FMT, "VEID", ++ "user", "nice", "system", ++ "uptime", "idle", ++ "strv", "uptime", "used", ++ "maxlat", "totlat", "numsched"); ++ } ++ ++ if (ve == get_ve0()) ++ return 0; ++ ++ user_ve = nice_ve = system_ve = 0; ++ idle_time = strv_time = used = 0; ++ ++ for_each_online_cpu(cpu) { ++ struct ve_cpu_stats *st; ++ ++ st = VE_CPU_STATS(ve, cpu); ++ user_ve += st->user; ++ nice_ve += st->nice; ++ system_ve += st->system; ++ used += st->used_time; ++ idle_time += ve_sched_get_idle_time(ve, cpu); ++ } ++ uptime_cycles = get_cycles() - ve->start_cycles; ++ uptime = get_jiffies_64() - ve->start_jiffies; ++ ++ seq_printf(m, VESTAT_LINE_FMT, ve->veid, ++ user_ve, nice_ve, system_ve, ++ (unsigned long long)uptime, ++ (unsigned long long)idle_time, ++ (unsigned long long)strv_time, ++ (unsigned long long)uptime_cycles, ++ (unsigned long long)used, ++ (unsigned long long)ve->sched_lat_ve.last.maxlat, ++ (unsigned long long)ve->sched_lat_ve.last.totlat, ++ ve->sched_lat_ve.last.count); ++ return 0; ++} ++ ++void *ve_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ struct ve_struct *curve; ++ ++ curve = get_exec_env(); ++ read_lock(&ve_list_lock); ++ if (!ve_is_super(curve)) { ++ if (*pos != 0) ++ return NULL; ++ return curve; ++ } ++ ++ return seq_list_start(&ve_list_head, *pos); ++} ++EXPORT_SYMBOL(ve_seq_start); ++ ++void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ if (!ve_is_super(get_exec_env())) ++ return NULL; ++ else ++ return seq_list_next(v, &ve_list_head, pos); ++} ++EXPORT_SYMBOL(ve_seq_next); ++ ++void ve_seq_stop(struct seq_file *m, void *v) ++{ ++ read_unlock(&ve_list_lock); ++} ++EXPORT_SYMBOL(ve_seq_stop); ++ ++static struct seq_operations vestat_seq_op = { ++ .start = ve_seq_start, ++ .next = ve_seq_next, ++ .stop = ve_seq_stop, ++ .show = vestat_seq_show ++}; ++ ++static int vestat_open(struct inode *inode, struct file 
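[Editor's note] vestat_seq_show() below folds per-CPU counters into per-container totals before emitting one line. The accumulation itself, reduced to plain C with a fixed CPU count:

/* build: cc percpu.c */
#include <stdio.h>

#define NCPU 4

struct cpu_stats { unsigned long user, nice, system; };

int main(void)
{
        /* stand-in for the per-cpu VE_CPU_STATS(ve, cpu) slots */
        struct cpu_stats st[NCPU] = {
                { 10, 0, 5 }, { 7, 1, 2 }, { 0, 0, 0 }, { 3, 2, 8 },
        };
        unsigned long user = 0, nice = 0, system = 0;

        for (int cpu = 0; cpu < NCPU; cpu++) {
                user += st[cpu].user;
                nice += st[cpu].nice;
                system += st[cpu].system;
        }
        printf("user=%lu nice=%lu system=%lu\n", user, nice, system);
        return 0;
}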
*file) ++{ ++ return seq_open(file, &vestat_seq_op); ++} ++ ++static struct file_operations proc_vestat_operations = { ++ .open = vestat_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release ++}; ++ ++static struct seq_operations devperms_seq_op = { ++ .start = ve_seq_start, ++ .next = ve_seq_next, ++ .stop = ve_seq_stop, ++ .show = devperms_seq_show, ++}; ++ ++static int devperms_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &devperms_seq_op); ++} ++ ++static struct file_operations proc_devperms_ops = { ++ .open = devperms_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static int vz_version_show(struct seq_file *file, void* v) ++{ ++ static const char ver[] = VZVERSION "\n"; ++ ++ return seq_puts(file, ver); ++} ++ ++static int vz_version_open(struct inode *inode, struct file *file) ++{ ++ return single_open(file, vz_version_show, NULL); ++} ++ ++static struct file_operations proc_vz_version_oparations = { ++ .open = vz_version_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++ ++static inline unsigned long ve_used_mem(struct user_beancounter *ub) ++{ ++ extern int glob_ve_meminfo; ++ return glob_ve_meminfo ? ub->ub_parms[UB_OOMGUARPAGES].held : ++ ub->ub_parms[UB_PRIVVMPAGES].held ; ++} ++ ++static inline void ve_mi_replace(struct meminfo *mi) ++{ ++#ifdef CONFIG_BEANCOUNTERS ++ struct user_beancounter *ub; ++ unsigned long meminfo_val; ++ unsigned long nodettram; ++ unsigned long usedmem; ++ ++ meminfo_val = get_exec_env()->meminfo_val; ++ ++ if(!meminfo_val) ++ return; /* No virtualization */ ++ ++ nodettram = mi->si.totalram; ++ ub = current->mm->mm_ub; ++ usedmem = ve_used_mem(ub); ++ ++ memset(mi, 0, sizeof(*mi)); ++ ++ mi->si.totalram = (meminfo_val > nodettram) ? ++ nodettram : meminfo_val; ++ mi->si.freeram = (mi->si.totalram > usedmem) ? 
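[Editor's note] The meminfo virtualization in ve_mi_replace() is two clamps: the advertised total is capped by the configured meminfo_val, and free memory never goes negative. The arithmetic as a standalone sketch:

/* build: cc meminfo.c */
#include <stdio.h>

static void virt_meminfo(unsigned long *total, unsigned long *freep,
                         unsigned long limit, unsigned long used)
{
        if (!limit)
                return;         /* 0 means "no virtualization" */
        if (limit < *total)
                *total = limit; /* cap total at the configured value */
        /* free memory is what remains, clamped at zero */
        *freep = (*total > used) ? *total - used : 0;
}

int main(void)
{
        unsigned long total = 8192, freemem = 0;

        virt_meminfo(&total, &freemem, 1024, 300);
        printf("total=%lu free=%lu\n", total, freemem); /* 1024 724 */
        return 0;
}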
++ (mi->si.totalram - usedmem) : 0; ++#else ++ return; ++#endif ++} ++ ++static int meminfo_call(struct vnotifier_block *self, ++ unsigned long event, void *arg, int old_ret) ++{ ++ if (event != VIRTINFO_MEMINFO) ++ return old_ret; ++ ++ ve_mi_replace((struct meminfo *)arg); ++ ++ return NOTIFY_OK; ++} ++ ++ ++static struct vnotifier_block meminfo_notifier_block = { ++ .notifier_call = meminfo_call ++}; ++ ++static int __init init_vecalls_proc(void) ++{ ++ struct proc_dir_entry *de; ++ ++ de = proc_create("vestat", S_IFREG | S_IRUSR, proc_vz_dir, ++ &proc_vestat_operations); ++ if (!de) ++ printk(KERN_WARNING "VZMON: can't make vestat proc entry\n"); ++ ++ de = proc_create("devperms", S_IFREG | S_IRUSR, proc_vz_dir, ++ &proc_devperms_ops); ++ if (!de) ++ printk(KERN_WARNING "VZMON: can't make devperms proc entry\n"); ++ ++ de = proc_create("version", S_IFREG | S_IRUGO, proc_vz_dir, ++ &proc_vz_version_oparations); ++ if (!de) ++ printk(KERN_WARNING "VZMON: can't make version proc entry\n"); ++ ++ virtinfo_notifier_register(VITYPE_GENERAL, &meminfo_notifier_block); ++ return 0; ++} ++ ++static void fini_vecalls_proc(void) ++{ ++ remove_proc_entry("version", proc_vz_dir); ++ remove_proc_entry("devperms", proc_vz_dir); ++ remove_proc_entry("vestat", proc_vz_dir); ++ virtinfo_notifier_unregister(VITYPE_GENERAL, &meminfo_notifier_block); ++} ++#else ++#define init_vecalls_proc() (0) ++#define fini_vecalls_proc() do { } while (0) ++#endif /* CONFIG_PROC_FS */ ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * User ctl ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++int vzcalls_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ int err; ++ ++ err = -ENOTTY; ++ switch(cmd) { ++ case VZCTL_MARK_ENV_TO_DOWN: { ++ /* Compatibility issue */ ++ err = 0; ++ } ++ break; ++ case VZCTL_SETDEVPERMS: { ++ /* Device type was mistakenly declared as dev_t ++ * in the old user-kernel interface. ++ * That's wrong, dev_t is a kernel internal type. ++ * I use `unsigned' not having anything better in mind. 
++ * 2001/08/11 SAW */ ++ struct vzctl_setdevperms s; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void __user *)arg, sizeof(s))) ++ break; ++ err = real_setdevperms(s.veid, s.type, ++ new_decode_dev(s.dev), s.mask); ++ } ++ break; ++#ifdef CONFIG_INET ++ case VZCTL_VE_NETDEV: { ++ struct vzctl_ve_netdev d; ++ char *s; ++ err = -EFAULT; ++ if (copy_from_user(&d, (void __user *)arg, sizeof(d))) ++ break; ++ err = -ENOMEM; ++ s = kmalloc(IFNAMSIZ+1, GFP_KERNEL); ++ if (s == NULL) ++ break; ++ err = -EFAULT; ++ if (strncpy_from_user(s, d.dev_name, IFNAMSIZ) > 0) { ++ s[IFNAMSIZ] = 0; ++ err = real_ve_dev_map(d.veid, d.op, s); ++ } ++ kfree(s); ++ } ++ break; ++#endif ++ case VZCTL_ENV_CREATE: { ++ struct vzctl_env_create s; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void __user *)arg, sizeof(s))) ++ break; ++ err = real_env_create(s.veid, s.flags, s.class_id, ++ NULL, 0); ++ } ++ break; ++ case VZCTL_ENV_CREATE_DATA: { ++ struct vzctl_env_create_data s; ++ env_create_param_t *data; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void __user *)arg, sizeof(s))) ++ break; ++ err=-EINVAL; ++ if (s.datalen < VZCTL_ENV_CREATE_DATA_MINLEN || ++ s.datalen > VZCTL_ENV_CREATE_DATA_MAXLEN || ++ s.data == 0) ++ break; ++ err = -ENOMEM; ++ data = kzalloc(sizeof(*data), GFP_KERNEL); ++ if (!data) ++ break; ++ ++ err = -EFAULT; ++ if (copy_from_user(data, (void __user *)s.data, ++ s.datalen)) ++ goto free_data; ++ err = real_env_create(s.veid, s.flags, s.class_id, ++ data, s.datalen); ++free_data: ++ kfree(data); ++ } ++ break; ++ case VZCTL_GET_CPU_STAT: { ++ struct vzctl_cpustatctl s; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void __user *)arg, sizeof(s))) ++ break; ++ err = ve_get_cpu_stat(s.veid, s.cpustat); ++ } ++ break; ++ case VZCTL_VE_MEMINFO: { ++ struct vzctl_ve_meminfo s; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void __user *)arg, sizeof(s))) ++ break; ++ err = ve_set_meminfo(s.veid, s.val); ++ } ++ break; ++ } ++ return err; ++} ++ ++#ifdef CONFIG_COMPAT ++int compat_vzcalls_ioctl(struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err; ++ ++ switch(cmd) { ++ case VZCTL_GET_CPU_STAT: { ++ /* FIXME */ ++ } ++ case VZCTL_COMPAT_ENV_CREATE_DATA: { ++ struct compat_vzctl_env_create_data cs; ++ struct vzctl_env_create_data __user *s; ++ ++ s = compat_alloc_user_space(sizeof(*s)); ++ err = -EFAULT; ++ if (copy_from_user(&cs, (void *)arg, sizeof(cs))) ++ break; ++ ++ if (put_user(cs.veid, &s->veid) || ++ put_user(cs.flags, &s->flags) || ++ put_user(cs.class_id, &s->class_id) || ++ put_user(compat_ptr(cs.data), &s->data) || ++ put_user(cs.datalen, &s->datalen)) ++ break; ++ err = vzcalls_ioctl(file, VZCTL_ENV_CREATE_DATA, ++ (unsigned long)s); ++ break; ++ } ++#ifdef CONFIG_NET ++ case VZCTL_COMPAT_VE_NETDEV: { ++ struct compat_vzctl_ve_netdev cs; ++ struct vzctl_ve_netdev __user *s; ++ ++ s = compat_alloc_user_space(sizeof(*s)); ++ err = -EFAULT; ++ if (copy_from_user(&cs, (void *)arg, sizeof(cs))) ++ break; ++ ++ if (put_user(cs.veid, &s->veid) || ++ put_user(cs.op, &s->op) || ++ put_user(compat_ptr(cs.dev_name), &s->dev_name)) ++ break; ++ err = vzcalls_ioctl(file, VZCTL_VE_NETDEV, (unsigned long)s); ++ break; ++ } ++#endif ++ case VZCTL_COMPAT_VE_MEMINFO: { ++ struct compat_vzctl_ve_meminfo cs; ++ err = -EFAULT; ++ if (copy_from_user(&cs, (void *)arg, sizeof(cs))) ++ break; ++ err = ve_set_meminfo(cs.veid, cs.val); ++ break; ++ } ++ default: ++ err = vzcalls_ioctl(file, cmd, arg); ++ break; ++ } ++ return err; ++} ++#endif ++ ++static struct vzioctlinfo vzcalls = { ++ .type 
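[Editor's note] The compat ioctl path exists because 32-bit and 64-bit ABIs lay structs out differently (alignment and pointer width), so the handler re-marshals field by field instead of copying raw bytes. An illustrative pair of layouts — these are not the real vzctl structs:

/* build: cc compat.c (on a 64-bit host) */
#include <stdio.h>
#include <stdint.h>

struct compat_ve_meminfo {      /* 32-bit ABI: u64 packed to 4 bytes */
        uint32_t veid;
        uint64_t val;
} __attribute__((packed, aligned(4)));

struct ve_meminfo {             /* native ABI: u64 aligned to 8 */
        uint32_t veid;
        uint64_t val;
};

int main(void)
{
        struct compat_ve_meminfo cs = { 101, 1 << 20 };
        /* field-by-field copy, never a memcpy of the raw bytes */
        struct ve_meminfo s = { .veid = cs.veid, .val = cs.val };

        printf("compat size=%zu native size=%zu veid=%u val=%llu\n",
               sizeof(cs), sizeof(s), s.veid,
               (unsigned long long)s.val);
        return 0;
}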
= VZCTLTYPE, ++ .ioctl = vzcalls_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = compat_vzcalls_ioctl, ++#endif ++ .owner = THIS_MODULE, ++}; ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * Init/exit stuff ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++static int __init init_vecalls_symbols(void) ++{ ++ KSYMRESOLVE(real_do_env_free); ++ KSYMMODRESOLVE(vzmon); ++ return 0; ++} ++ ++static void fini_vecalls_symbols(void) ++{ ++ KSYMMODUNRESOLVE(vzmon); ++ KSYMUNRESOLVE(real_do_env_free); ++} ++ ++static inline __init int init_vecalls_ioctls(void) ++{ ++ vzioctl_register(&vzcalls); ++ return 0; ++} ++ ++static inline void fini_vecalls_ioctls(void) ++{ ++ vzioctl_unregister(&vzcalls); ++} ++ ++#ifdef CONFIG_SYSCTL ++static struct ctl_table_header *table_header; ++ ++static ctl_table kernel_table[] = { ++ { ++ .procname = "ve_allow_kthreads", ++ .data = &ve_allow_kthreads, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { 0 } ++}; ++ ++static ctl_table root_table[] = { ++ {CTL_KERN, "kernel", NULL, 0, 0555, kernel_table}, ++ { 0 } ++}; ++ ++static int init_vecalls_sysctl(void) ++{ ++ table_header = register_sysctl_table(root_table); ++ if (!table_header) ++ return -ENOMEM ; ++ return 0; ++} ++ ++static void fini_vecalls_sysctl(void) ++{ ++ unregister_sysctl_table(table_header); ++} ++#else ++static int init_vecalls_sysctl(void) { return 0; } ++static void fini_vecalls_sysctl(void) { ; } ++#endif ++ ++static int __init vecalls_init(void) ++{ ++ int err; ++ ++ err = init_vecalls_sysctl(); ++ if (err) ++ goto out_vzmond; ++ ++ err = init_vzmond(); ++ if (err < 0) ++ goto out_sysctl; ++ ++ err = init_vecalls_symbols(); ++ if (err < 0) ++ goto out_sym; ++ ++ err = init_vecalls_proc(); ++ if (err < 0) ++ goto out_proc; ++ ++ err = init_vecalls_ioctls(); ++ if (err < 0) ++ goto out_ioctls; ++ ++ return 0; ++ ++out_ioctls: ++ fini_vecalls_proc(); ++out_proc: ++ fini_vecalls_symbols(); ++out_sym: ++ fini_vzmond(); ++out_sysctl: ++ fini_vecalls_sysctl(); ++out_vzmond: ++ return err; ++} ++ ++static void vecalls_exit(void) ++{ ++ fini_vecalls_ioctls(); ++ fini_vecalls_proc(); ++ fini_vecalls_symbols(); ++ fini_vzmond(); ++ fini_vecalls_sysctl(); ++} ++ ++MODULE_AUTHOR("SWsoft "); ++MODULE_DESCRIPTION("Virtuozzo Control"); ++MODULE_LICENSE("GPL v2"); ++ ++module_init(vecalls_init) ++module_exit(vecalls_exit) +diff --git a/kernel/ve/veowner.c b/kernel/ve/veowner.c +new file mode 100644 +index 0000000..8774e9c +--- /dev/null ++++ b/kernel/ve/veowner.c +@@ -0,0 +1,149 @@ ++/* ++ * kernel/ve/veowner.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++void prepare_ve0_process(struct task_struct *tsk) ++{ ++ VE_TASK_INFO(tsk)->exec_env = get_ve0(); ++ VE_TASK_INFO(tsk)->owner_env = get_ve0(); ++ VE_TASK_INFO(tsk)->sleep_time = 0; ++ VE_TASK_INFO(tsk)->wakeup_stamp = 0; ++ VE_TASK_INFO(tsk)->sched_time = 0; ++ seqcount_init(&VE_TASK_INFO(tsk)->wakeup_lock); ++ ++ if (tsk->pid) { ++ list_add_rcu(&tsk->ve_task_info.vetask_list, ++ &get_ve0()->vetask_lh); ++ atomic_inc(&get_ve0()->pcounter); ++ } ++} ++ ++/* ++ * ------------------------------------------------------------------------ ++ * proc entries ++ * ------------------------------------------------------------------------ ++ */ ++ ++#ifdef CONFIG_PROC_FS ++struct proc_dir_entry *proc_vz_dir; ++EXPORT_SYMBOL(proc_vz_dir); ++ ++struct proc_dir_entry *glob_proc_vz_dir; ++EXPORT_SYMBOL(glob_proc_vz_dir); ++ ++static void prepare_proc(void) ++{ ++ proc_vz_dir = proc_mkdir("vz", NULL); ++ if (!proc_vz_dir) ++ panic("Can't create /proc/vz dir\n"); ++ ++ glob_proc_vz_dir = proc_mkdir("vz", &glob_proc_root); ++ if (!proc_vz_dir) ++ panic("Can't create /proc/vz dir\n"); ++} ++#endif ++ ++/* ++ * ------------------------------------------------------------------------ ++ * OpenVZ sysctl ++ * ------------------------------------------------------------------------ ++ */ ++extern int ve_area_access_check; ++ ++#ifdef CONFIG_INET ++static struct ctl_table vz_ipv4_route_table[] = { ++ { ++ .procname = "src_check", ++ .data = &ip_rt_src_check, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++ { 0 } ++}; ++ ++static struct ctl_path net_ipv4_route_path[] = { ++ { .ctl_name = CTL_NET, .procname = "net", }, ++ { .ctl_name = NET_IPV4, .procname = "ipv4", }, ++ { .ctl_name = NET_IPV4_ROUTE, .procname = "route", }, ++ { } ++}; ++#endif ++ ++static struct ctl_table vz_fs_table[] = { ++ { ++ .procname = "ve-area-access-check", ++ .data = &ve_area_access_check, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++ { 0 } ++}; ++ ++static struct ctl_path fs_path[] = { ++ { .ctl_name = CTL_FS, .procname = "fs", }, ++ { } ++}; ++ ++static void prepare_sysctl(void) ++{ ++#ifdef CONFIG_INET ++ register_sysctl_paths(net_ipv4_route_path, vz_ipv4_route_table); ++#endif ++ register_sysctl_paths(fs_path, vz_fs_table); ++} ++ ++/* ++ * ------------------------------------------------------------------------ ++ * XXX init_ve_system ++ * ------------------------------------------------------------------------ ++ */ ++ ++void init_ve_system(void) ++{ ++ struct task_struct *init_entry; ++ struct ve_struct *ve; ++ ++ ve = get_ve0(); ++ ++ init_entry = init_pid_ns.child_reaper; ++ /* if ve_move_task to VE0 (e.g. in cpt code) * ++ * occurs, ve_cap_bset on VE0 is required */ ++ ve->ve_cap_bset = CAP_INIT_EFF_SET; ++ ++ read_lock(&init_entry->fs->lock); ++ ve->root_path = init_entry->fs->root; ++ read_unlock(&init_entry->fs->lock); ++ ++#ifdef CONFIG_PROC_FS ++ prepare_proc(); ++#endif ++ prepare_sysctl(); ++} +diff --git a/kernel/ve/vzdev.c b/kernel/ve/vzdev.c +new file mode 100644 +index 0000000..b2f010c +--- /dev/null ++++ b/kernel/ve/vzdev.c +@@ -0,0 +1,154 @@ ++/* ++ * kernel/ve/vzdev.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define VZCTL_MAJOR 126 ++#define VZCTL_NAME "vzctl" ++ ++MODULE_AUTHOR("SWsoft "); ++MODULE_DESCRIPTION("Virtuozzo Interface"); ++MODULE_LICENSE("GPL v2"); ++ ++static LIST_HEAD(ioctls); ++static spinlock_t ioctl_lock = SPIN_LOCK_UNLOCKED; ++ ++static struct vzioctlinfo *vzctl_get_handler(unsigned int cmd) ++{ ++ struct vzioctlinfo *h; ++ ++ spin_lock(&ioctl_lock); ++ list_for_each_entry(h, &ioctls, list) { ++ if (h->type == _IOC_TYPE(cmd)) ++ goto found; ++ } ++ h = NULL; ++found: ++ if (h && !try_module_get(h->owner)) ++ h = NULL; ++ spin_unlock(&ioctl_lock); ++ return h; ++} ++ ++static void vzctl_put_handler(struct vzioctlinfo *h) ++{ ++ if (!h) ++ return; ++ ++ module_put(h->owner); ++} ++ ++long vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ struct vzioctlinfo *h; ++ int err; ++ ++ err = -ENOTTY; ++ h = vzctl_get_handler(cmd); ++ if (h && h->ioctl) ++ err = (*h->ioctl)(file, cmd, arg); ++ vzctl_put_handler(h); ++ ++ return err; ++} ++ ++long compat_vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ struct vzioctlinfo *h; ++ int err; ++ ++ err = -ENOIOCTLCMD; ++ h = vzctl_get_handler(cmd); ++ if (h && h->compat_ioctl) ++ err = (*h->compat_ioctl)(file, cmd, arg); ++ vzctl_put_handler(h); ++ ++ return err; ++} ++ ++void vzioctl_register(struct vzioctlinfo *inf) ++{ ++ spin_lock(&ioctl_lock); ++ list_add(&inf->list, &ioctls); ++ spin_unlock(&ioctl_lock); ++} ++EXPORT_SYMBOL(vzioctl_register); ++ ++void vzioctl_unregister(struct vzioctlinfo *inf) ++{ ++ spin_lock(&ioctl_lock); ++ list_del_init(&inf->list); ++ spin_unlock(&ioctl_lock); ++} ++EXPORT_SYMBOL(vzioctl_unregister); ++ ++/* ++ * Init/exit stuff. ++ */ ++static struct file_operations vzctl_fops = { ++ .owner = THIS_MODULE, ++ .unlocked_ioctl = vzctl_ioctl, ++ .compat_ioctl = compat_vzctl_ioctl, ++}; ++ ++static struct class *vzctl_class; ++ ++static void __exit vzctl_exit(void) ++{ ++ device_destroy(vzctl_class, MKDEV(VZCTL_MAJOR, 0)); ++ class_destroy(vzctl_class); ++ unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME); ++} ++ ++static int __init vzctl_init(void) ++{ ++ int ret; ++ struct device *class_err; ++ ++ ret = register_chrdev(VZCTL_MAJOR, VZCTL_NAME, &vzctl_fops); ++ if (ret < 0) ++ goto out; ++ ++ vzctl_class = class_create(THIS_MODULE, "vzctl"); ++ if (IS_ERR(vzctl_class)) { ++ ret = PTR_ERR(vzctl_class); ++ goto out_cleandev; ++ } ++ ++ class_err = device_create(vzctl_class, NULL, ++ MKDEV(VZCTL_MAJOR, 0), VZCTL_NAME); ++ if (IS_ERR(class_err)) { ++ ret = PTR_ERR(class_err); ++ goto out_rmclass; ++ } ++ ++ goto out; ++ ++out_rmclass: ++ class_destroy(vzctl_class); ++out_cleandev: ++ unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME); ++out: ++ return ret; ++} ++ ++module_init(vzctl_init) ++module_exit(vzctl_exit); +diff --git a/kernel/ve/vzevent.c b/kernel/ve/vzevent.c +new file mode 100644 +index 0000000..554f169 +--- /dev/null ++++ b/kernel/ve/vzevent.c +@@ -0,0 +1,125 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define NETLINK_UEVENT 31 ++#define VZ_EVGRP_ALL 0x01 ++ ++/* ++ * NOTE: the original idea was to send events via kobject_uevent(), ++ * however, it turns out that it has negative consequences like ++ * start of /sbin/hotplug which tries to react on our events in inadequate manner. 
++ */ ++ ++static struct sock *vzev_sock; ++ ++static char *action_to_string(int action) ++{ ++ switch (action) { ++ case KOBJ_MOUNT: ++ return "ve-mount"; ++ case KOBJ_UMOUNT: ++ return "ve-umount"; ++ case KOBJ_START: ++ return "ve-start"; ++ case KOBJ_STOP: ++ return "ve-stop"; ++ default: ++ return NULL; ++ } ++} ++ ++static int do_vzevent_send(int event, char *msg, int len) ++{ ++ struct sk_buff *skb; ++ char *buf, *action; ++ int alen; ++ ++ action = action_to_string(event); ++ alen = strlen(action); ++ ++ skb = alloc_skb(len + 1 + alen, GFP_KERNEL); ++ if (!skb) ++ return -ENOMEM; ++ ++ buf = skb_put(skb, len + 1 + alen); ++ memcpy(buf, action, alen); ++ buf[alen] = '@'; ++ memcpy(buf + alen + 1, msg, len); ++ (void)netlink_broadcast(vzev_sock, skb, 0, VZ_EVGRP_ALL, GFP_KERNEL); ++ return 0; ++} ++ ++int vzevent_send(int event, const char *attrs_fmt, ...) ++{ ++ va_list args; ++ int len, err; ++ struct ve_struct *ve; ++ char *page; ++ ++ err = -ENOMEM; ++ page = (char *)__get_free_page(GFP_KERNEL); ++ if (!page) ++ goto out; ++ ++ va_start(args, attrs_fmt); ++ len = vscnprintf(page, PAGE_SIZE, attrs_fmt, args); ++ va_end(args); ++ ++ ve = set_exec_env(get_ve0()); ++ err = do_vzevent_send(event, page, len); ++ (void)set_exec_env(ve); ++ free_page((unsigned long)page); ++out: ++ return err; ++} ++EXPORT_SYMBOL(vzevent_send); ++ ++static int ve_start(void *data) ++{ ++ struct ve_struct *ve; ++ ++ ve = (struct ve_struct *)data; ++ vzevent_send(KOBJ_START, "%d", ve->veid); ++ return 0; ++} ++ ++static void ve_stop(void *data) ++{ ++ struct ve_struct *ve; ++ ++ ve = (struct ve_struct *)data; ++ vzevent_send(KOBJ_STOP, "%d", ve->veid); ++} ++ ++static struct ve_hook ve_start_stop_hook = { ++ .init = ve_start, ++ .fini = ve_stop, ++ .owner = THIS_MODULE, ++ .priority = HOOK_PRIO_AFTERALL, ++}; ++ ++static int __init init_vzevent(void) ++{ ++ vzev_sock = netlink_kernel_create(NETLINK_UEVENT, 0, NULL, THIS_MODULE); ++ if (vzev_sock == NULL) ++ return -ENOMEM; ++ ve_hook_register(VE_SS_CHAIN, &ve_start_stop_hook); ++ return 0; ++} ++ ++static void __exit exit_vzevent(void) ++{ ++ ve_hook_unregister(&ve_start_stop_hook); ++ sock_release(vzev_sock->sk_socket); ++} ++ ++MODULE_LICENSE("GPL"); ++ ++module_init(init_vzevent); ++module_exit(exit_vzevent); +diff --git a/kernel/ve/vzwdog.c b/kernel/ve/vzwdog.c +new file mode 100644 +index 0000000..7117365 +--- /dev/null ++++ b/kernel/ve/vzwdog.c +@@ -0,0 +1,283 @@ ++/* ++ * kernel/ve/vzwdog.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
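[Editor's note] do_vzevent_send() below formats each event as "<action>@<payload>" — e.g. "ve-start@101" — before broadcasting it to the VZ_EVGRP_ALL netlink group. Building the same message in plain C:

/* build: cc event.c */
#include <stdio.h>

int main(void)
{
        const char *action = "ve-start";
        char payload[16], msg[64];
        int veid = 101;

        /* the payload is whatever vzevent_send()'s format produced */
        snprintf(payload, sizeof(payload), "%d", veid);
        /* action and payload are joined with a single '@' */
        snprintf(msg, sizeof(msg), "%s@%s", action, payload);
        printf("%s\n", msg);    /* ve-start@101 */
        return 0;
}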
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* Staff regading kernel thread polling VE validity */ ++static int sleep_timeout = 60; ++static struct task_struct *wdog_thread_tsk; ++ ++extern void show_mem(void); ++ ++static struct file *intr_file; ++static char page[PAGE_SIZE]; ++ ++static void parse_irq_list(int len) ++{ ++ int i, k, skip; ++ for (i = 0; i < len; ) { ++ k = i; ++ while (i < len && page[i] != '\n' && page[i] != ':') ++ i++; ++ skip = 0; ++ if (i < len && page[i] != '\n') { ++ i++; /* skip ':' */ ++ while (i < len && (page[i] == ' ' || page[i] == '0')) ++ i++; ++ skip = (i < len && (page[i] < '0' || page[i] > '9')); ++ while (i < len && page[i] != '\n') ++ i++; ++ } ++ if (!skip) ++ printk("%.*s\n", i - k, page + k); ++ if (i < len) ++ i++; /* skip '\n' */ ++ } ++} ++ ++extern loff_t vfs_llseek(struct file *file, loff_t, int); ++extern ssize_t vfs_read(struct file *file, char __user *, size_t, loff_t *); ++extern struct file *filp_open(const char *filename, int flags, int mode); ++extern int filp_close(struct file *filp, fl_owner_t id); ++static void show_irq_list(void) ++{ ++ mm_segment_t fs; ++ int r; ++ ++ fs = get_fs(); ++ set_fs(KERNEL_DS); ++ vfs_llseek(intr_file, 0, 0); ++ r = vfs_read(intr_file, (void __user *)page, sizeof(page), ++ &intr_file->f_pos); ++ set_fs(fs); ++ ++ if (r > 0) ++ parse_irq_list(r); ++} ++ ++static void show_alloc_latency(void) ++{ ++ static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = { ++ "A0", ++ "L0", ++ "H0", ++ "L1", ++ "H1" ++ }; ++ int i; ++ ++ printk("lat: "); ++ for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) { ++ struct kstat_lat_struct *p; ++ cycles_t maxlat, avg0, avg1, avg2; ++ ++ p = &kstat_glob.alloc_lat[i]; ++ spin_lock_irq(&kstat_glb_lock); ++ maxlat = p->last.maxlat; ++ avg0 = p->avg[0]; ++ avg1 = p->avg[1]; ++ avg2 = p->avg[2]; ++ spin_unlock_irq(&kstat_glb_lock); ++ ++ printk("%s %Lu (%Lu %Lu %Lu)", ++ alloc_descr[i], ++ (unsigned long long)maxlat, ++ (unsigned long long)avg0, ++ (unsigned long long)avg1, ++ (unsigned long long)avg2); ++ } ++ printk("\n"); ++} ++ ++static void show_schedule_latency(void) ++{ ++ struct kstat_lat_pcpu_struct *p; ++ cycles_t maxlat, totlat, avg0, avg1, avg2; ++ unsigned long count; ++ ++ p = &kstat_glob.sched_lat; ++ spin_lock_irq(&kstat_glb_lock); ++ maxlat = p->last.maxlat; ++ totlat = p->last.totlat; ++ count = p->last.count; ++ avg0 = p->avg[0]; ++ avg1 = p->avg[1]; ++ avg2 = p->avg[2]; ++ spin_unlock_irq(&kstat_glb_lock); ++ ++ printk("sched lat: %Lu/%Lu/%lu (%Lu %Lu %Lu)\n", ++ (unsigned long long)maxlat, ++ (unsigned long long)totlat, ++ count, ++ (unsigned long long)avg0, ++ (unsigned long long)avg1, ++ (unsigned long long)avg2); ++} ++ ++static void show_header(void) ++{ ++ struct timeval tv; ++ ++ do_gettimeofday(&tv); ++ preempt_disable(); ++ printk("*** VZWDOG 1.14: time %lu.%06lu uptime %Lu CPU %d ***\n", ++ tv.tv_sec, (long)tv.tv_usec, ++ (unsigned long long)get_jiffies_64(), ++ smp_processor_id()); ++#ifdef CONFIG_FAIRSCHED ++ printk("*** cycles_per_jiffy %lu jiffies_per_second %u ***\n", ++ cycles_per_jiffy, HZ); ++#else ++ printk("*** jiffies_per_second %u ***\n", HZ); ++#endif ++ preempt_enable(); ++} ++ ++static void show_pgdatinfo(void) ++{ ++ pg_data_t *pgdat; ++ ++ printk("pgdat:"); ++ for_each_online_pgdat(pgdat) { ++ printk(" %d: %lu,%lu,%lu", ++ pgdat->node_id, ++ pgdat->node_start_pfn, ++ 
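[Editor's note] parse_irq_list() above keeps only the /proc/interrupts lines whose counters are not all zero, plus the header. The same filter as a small Linux-only tool, under the assumption that skipping spaces and '0' characters after the colon and then testing for a digit is a faithful reading of the kernel loop:

/* build: cc irqfilter.c (Linux only) */
#include <stdio.h>
#include <string.h>
#include <ctype.h>

int main(void)
{
        char line[512];
        FILE *f = fopen("/proc/interrupts", "r");

        if (!f) {
                perror("/proc/interrupts");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                char *p = strchr(line, ':');

                if (!p) {                       /* header line: keep */
                        fputs(line, stdout);
                        continue;
                }
                /* skip spaces and zero counters after the colon */
                for (p++; *p == ' ' || *p == '0'; p++)
                        ;
                if (isdigit((unsigned char)*p)) /* nonzero count seen */
                        fputs(line, stdout);
        }
        fclose(f);
        return 0;
}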
pgdat->node_present_pages, ++ pgdat->node_spanned_pages); ++#ifdef CONFIG_FLAT_NODE_MEM_MAP ++ printk(",%p", pgdat->node_mem_map); ++#endif ++ } ++ printk("\n"); ++} ++ ++static void show_diskio(void) ++{ ++ struct device *dev; ++ char buf[BDEVNAME_SIZE]; ++ ++ printk("disk_io: "); ++ ++ list_for_each_entry(dev, &block_class.devices, node) { ++ char *name; ++ struct gendisk *gd = dev_to_disk(dev); ++ ++ name = disk_name(gd, 0, buf); ++ if ((strlen(name) > 4) && (strncmp(name, "loop", 4) == 0) && ++ isdigit(name[4])) ++ continue; ++ if ((strlen(name) > 3) && (strncmp(name, "ram", 3) == 0) && ++ isdigit(name[3])) ++ continue; ++ printk("(%u,%u) %s r(%lu %lu %lu) w(%lu %lu %lu)\n", ++ gd->major, gd->first_minor, ++ name, ++ disk_stat_read(gd, ios[READ]), ++ disk_stat_read(gd, sectors[READ]), ++ disk_stat_read(gd, merges[READ]), ++ disk_stat_read(gd, ios[WRITE]), ++ disk_stat_read(gd, sectors[WRITE]), ++ disk_stat_read(gd, merges[WRITE])); ++ } ++ ++ printk("\n"); ++} ++ ++static void show_nrprocs(void) ++{ ++ unsigned long _nr_running, _nr_sleeping, ++ _nr_unint, _nr_zombie, _nr_dead, _nr_stopped; ++ ++ _nr_running = nr_running(); ++ _nr_unint = nr_uninterruptible(); ++ _nr_sleeping = nr_sleeping(); ++ _nr_zombie = nr_zombie; ++ _nr_dead = atomic_read(&nr_dead); ++ _nr_stopped = nr_stopped(); ++ ++ printk("VEnum: %d, proc R %lu, S %lu, D %lu, " ++ "Z %lu, X %lu, T %lu (tot %d)\n", ++ nr_ve, _nr_running, _nr_sleeping, _nr_unint, ++ _nr_zombie, _nr_dead, _nr_stopped, nr_threads); ++} ++ ++static void wdog_print(void) ++{ ++ show_header(); ++ show_irq_list(); ++ show_pgdatinfo(); ++ show_mem(); ++ show_diskio(); ++ show_schedule_latency(); ++ show_alloc_latency(); ++ show_nrprocs(); ++} ++ ++static int wdog_loop(void* data) ++{ ++ while (1) { ++ wdog_print(); ++ try_to_freeze(); ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ if (kthread_should_stop()) ++ break; ++ schedule_timeout(sleep_timeout*HZ); ++ } ++ return 0; ++} ++ ++static int __init wdog_init(void) ++{ ++ struct file *file; ++ ++ file = filp_open("/proc/interrupts", 0, 0); ++ if (IS_ERR(file)) ++ return PTR_ERR(file); ++ intr_file = file; ++ ++ wdog_thread_tsk = kthread_run(wdog_loop, NULL, "vzwdog"); ++ if (IS_ERR(wdog_thread_tsk)) { ++ filp_close(intr_file, NULL); ++ return -EBUSY; ++ } ++ return 0; ++} ++ ++static void __exit wdog_exit(void) ++{ ++ kthread_stop(wdog_thread_tsk); ++ filp_close(intr_file, NULL); ++} ++ ++module_param(sleep_timeout, int, 0660); ++MODULE_AUTHOR("SWsoft "); ++MODULE_DESCRIPTION("Virtuozzo WDOG"); ++MODULE_LICENSE("GPL v2"); ++ ++module_init(wdog_init) ++module_exit(wdog_exit) +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index d2099f4..eefdf6c 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -125,6 +125,15 @@ config DEBUG_SECTION_MISMATCH + - Enable verbose reporting from modpost to help solving + the section mismatches reported. + ++config SYSRQ_DEBUG ++ bool "Debugging via sysrq keys" ++ depends on MAGIC_SYSRQ ++ default y ++ help ++ Say Y if you want to extend functionality of magic key. It will ++ provide you with some debugging facilities such as dumping and ++ writing memory, resolving symbols and some other. 
++ + config DEBUG_KERNEL + bool "Kernel debugging" + help +diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c +index 2fa545a..fe9fa6a 100644 +--- a/lib/kobject_uevent.c ++++ b/lib/kobject_uevent.c +@@ -38,6 +38,8 @@ static const char *kobject_actions[] = { + [KOBJ_REMOVE] = "remove", + [KOBJ_CHANGE] = "change", + [KOBJ_MOVE] = "move", ++ [KOBJ_START] = "start", ++ [KOBJ_STOP] = "stop", + [KOBJ_ONLINE] = "online", + [KOBJ_OFFLINE] = "offline", + }; +diff --git a/mm/filemap.c b/mm/filemap.c +index 1e6a7d3..a49f9ea 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -42,6 +42,8 @@ + + #include + ++#include ++ + static ssize_t + generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs); +@@ -121,6 +123,7 @@ void __remove_from_page_cache(struct page *page) + mem_cgroup_uncharge_page(page); + radix_tree_delete(&mapping->page_tree, page->index); + page->mapping = NULL; ++ ub_io_release_debug(page); + mapping->nrpages--; + __dec_zone_page_state(page, NR_FILE_PAGES); + BUG_ON(page_mapped(page)); +diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c +index 3e744ab..974fe0d 100644 +--- a/mm/filemap_xip.c ++++ b/mm/filemap_xip.c +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + + /* + * We do use our own empty page to avoid interference with other users +@@ -190,6 +191,8 @@ __xip_unmap (struct address_space * mapping, + flush_cache_page(vma, address, pte_pfn(*pte)); + pteval = ptep_clear_flush(vma, address, pte); + page_remove_rmap(page, vma); ++ pb_remove_ref(page, mm); ++ ub_unused_privvm_inc(mm, vma); + dec_mm_counter(mm, file_rss); + BUG_ON(pte_dirty(pteval)); + pte_unmap_unlock(pte, ptl); +diff --git a/mm/fremap.c b/mm/fremap.c +index 07a9c82..e2733ba 100644 +--- a/mm/fremap.c ++++ b/mm/fremap.c +@@ -20,6 +20,8 @@ + #include + #include + ++#include ++ + static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) + { +@@ -35,6 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, + if (pte_dirty(pte)) + set_page_dirty(page); + page_remove_rmap(page, vma); ++ pb_remove_ref(page, mm); + page_cache_release(page); + update_hiwater_rss(mm); + dec_mm_counter(mm, file_rss); +@@ -61,8 +64,10 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, + if (!pte) + goto out; + +- if (!pte_none(*pte)) ++ if (!pte_none(*pte)) { + zap_pte(mm, vma, addr, pte); ++ ub_unused_privvm_inc(mm, vma); ++ } + + set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); + /* +@@ -237,4 +242,5 @@ out: + + return err; + } ++EXPORT_SYMBOL_GPL(sys_remap_file_pages); + +diff --git a/mm/memory.c b/mm/memory.c +index 2302d22..06180fe 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -42,6 +42,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -51,6 +52,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -61,6 +63,11 @@ + #include + #include + ++#include ++#include ++#include ++#include ++ + #ifndef CONFIG_NEED_MULTIPLE_NODES + /* use the per-pgdat data instead for discontigmem - mbligh */ + unsigned long max_mapnr; +@@ -115,18 +122,21 @@ void pgd_clear_bad(pgd_t *pgd) + pgd_ERROR(*pgd); + pgd_clear(pgd); + } ++EXPORT_SYMBOL_GPL(pgd_clear_bad); + + void pud_clear_bad(pud_t *pud) + { + pud_ERROR(*pud); + pud_clear(pud); + } ++EXPORT_SYMBOL_GPL(pud_clear_bad); + + void pmd_clear_bad(pmd_t *pmd) + { + pmd_ERROR(*pmd); + pmd_clear(pmd); + } ++EXPORT_SYMBOL_GPL(pmd_clear_bad); + + /* + * Note: this doesn't free the actual 
pages themselves. That +@@ -337,6 +347,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) + pte_free(mm, new); + return 0; + } ++EXPORT_SYMBOL_GPL(__pte_alloc); + + int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) + { +@@ -477,6 +488,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + out: + return pfn_to_page(pfn); + } ++EXPORT_SYMBOL_GPL(vm_normal_page); + + /* + * copy one vm_area from one task to the other. Assumes the page tables +@@ -487,7 +499,7 @@ out: + static inline void + copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, +- unsigned long addr, int *rss) ++ unsigned long addr, int *rss, struct page_beancounter **pbc) + { + unsigned long vm_flags = vma->vm_flags; + pte_t pte = *src_pte; +@@ -542,6 +554,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + if (page) { + get_page(page); + page_dup_rmap(page, vma, addr); ++ pb_dup_ref(page, dst_mm, pbc); + rss[!!PageAnon(page)]++; + } + +@@ -549,20 +562,35 @@ out_set_pte: + set_pte_at(dst_mm, addr, dst_pte, pte); + } + ++#define pte_ptrs(a) (PTRS_PER_PTE - ((a >> PAGE_SHIFT)&(PTRS_PER_PTE - 1))) ++#ifdef CONFIG_BEANCOUNTERS ++#define same_ub(mm1, mm2) ((mm1)->mm_ub == (mm2)->mm_ub) ++#else ++#define same_ub(mm1, mm2) 1 ++#endif ++ + static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +- pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, ++ pmd_t *dst_pmd, pmd_t *src_pmd, ++ struct vm_area_struct *dst_vma, ++ struct vm_area_struct *vma, + unsigned long addr, unsigned long end) + { + pte_t *src_pte, *dst_pte; + spinlock_t *src_ptl, *dst_ptl; + int progress = 0; +- int rss[2]; ++ int rss[2], rss_tot; ++ struct page_beancounter *pbc; ++ int err; + ++ err = -ENOMEM; ++ pbc = same_ub(src_mm, dst_mm) ? 
PBC_COPY_SAME : NULL; + again: ++ if (pbc != PBC_COPY_SAME && pb_alloc_list(&pbc, pte_ptrs(addr))) ++ goto out; + rss[1] = rss[0] = 0; + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); + if (!dst_pte) +- return -ENOMEM; ++ goto out; + src_pte = pte_offset_map_nested(src_pmd, addr); + src_ptl = pte_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); +@@ -583,23 +611,32 @@ again: + progress++; + continue; + } +- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); ++ copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss, ++ &pbc); + progress += 8; + } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + + arch_leave_lazy_mmu_mode(); + spin_unlock(src_ptl); + pte_unmap_nested(src_pte - 1); ++ rss_tot = rss[0] + rss[1]; ++ ub_unused_privvm_sub(dst_mm, dst_vma, rss_tot); + add_mm_rss(dst_mm, rss[0], rss[1]); + pte_unmap_unlock(dst_pte - 1, dst_ptl); + cond_resched(); + if (addr != end) + goto again; +- return 0; ++ ++ err = 0; ++out: ++ pb_free_list(&pbc); ++ return err; + } + + static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +- pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, ++ pud_t *dst_pud, pud_t *src_pud, ++ struct vm_area_struct *dst_vma, ++ struct vm_area_struct *vma, + unsigned long addr, unsigned long end) + { + pmd_t *src_pmd, *dst_pmd; +@@ -614,14 +651,16 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src + if (pmd_none_or_clear_bad(src_pmd)) + continue; + if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, +- vma, addr, next)) ++ dst_vma, vma, addr, next)) + return -ENOMEM; + } while (dst_pmd++, src_pmd++, addr = next, addr != end); + return 0; + } + + static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +- pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, ++ pgd_t *dst_pgd, pgd_t *src_pgd, ++ struct vm_area_struct *dst_vma, ++ struct vm_area_struct *vma, + unsigned long addr, unsigned long end) + { + pud_t *src_pud, *dst_pud; +@@ -636,19 +675,21 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src + if (pud_none_or_clear_bad(src_pud)) + continue; + if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, +- vma, addr, next)) ++ dst_vma, vma, addr, next)) + return -ENOMEM; + } while (dst_pud++, src_pud++, addr = next, addr != end); + return 0; + } + +-int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +- struct vm_area_struct *vma) ++int __copy_page_range(struct vm_area_struct *dst_vma, ++ struct vm_area_struct *vma, ++ unsigned long addr, size_t size) + { ++ struct mm_struct *dst_mm = dst_vma->vm_mm; ++ struct mm_struct *src_mm = vma->vm_mm; + pgd_t *src_pgd, *dst_pgd; + unsigned long next; +- unsigned long addr = vma->vm_start; +- unsigned long end = vma->vm_end; ++ unsigned long end = addr + size; + + /* + * Don't copy ptes where a page fault will fill them correctly. 
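The copy_pte_range() rework above has one structural motive: pb_dup_ref() runs under the pte spinlocks, where sleeping allocations are forbidden, so page_beancounter objects are preallocated with pb_alloc_list() for at most pte_ptrs(addr) entries before the locks are taken, and leftovers are released with pb_free_list() afterwards. A generic sketch of this preallocate-outside-the-lock pattern follows; struct tracker, tracker_use() and fill_table() are hypothetical names, not OpenVZ API:

#include <linux/slab.h>
#include <linux/spinlock.h>

struct tracker {
        struct tracker *next;
};

static DEFINE_SPINLOCK(table_lock);
static struct tracker *table;           /* protected by table_lock */

/* Pop one preallocated node and link it into the locked table */
static void tracker_use(struct tracker **pool)
{
        struct tracker *t = *pool;

        *pool = t->next;
        t->next = table;
        table = t;
}

static int fill_table(int nr_entries)
{
        struct tracker *pool = NULL, *t;
        int i;

        /* Sleeping allocations are still legal here, before the lock */
        for (i = 0; i < nr_entries; i++) {
                t = kmalloc(sizeof(*t), GFP_KERNEL);
                if (!t)
                        goto free_pool;
                t->next = pool;
                pool = t;
        }

        spin_lock(&table_lock);
        for (i = 0; i < nr_entries; i++)
                tracker_use(&pool);     /* never sleeps: pool is prefilled */
        spin_unlock(&table_lock);
        return 0;

free_pool:
        while ((t = pool) != NULL) {
                pool = t->next;
                kfree(t);
        }
        return -ENOMEM;
}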
+@@ -671,11 +712,22 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + if (pgd_none_or_clear_bad(src_pgd)) + continue; + if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, +- vma, addr, next)) ++ dst_vma, vma, addr, next)) + return -ENOMEM; + } while (dst_pgd++, src_pgd++, addr = next, addr != end); + return 0; + } ++EXPORT_SYMBOL_GPL(__copy_page_range); ++ ++int copy_page_range(struct mm_struct *dst, struct mm_struct *src, ++ struct vm_area_struct *dst_vma, struct vm_area_struct *vma) ++{ ++ if (dst_vma->vm_mm != dst) ++ BUG(); ++ if (vma->vm_mm != src) ++ BUG(); ++ return __copy_page_range(dst_vma, vma, vma->vm_start, vma->vm_end-vma->vm_start); ++} + + static unsigned long zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, +@@ -687,6 +739,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, + spinlock_t *ptl; + int file_rss = 0; + int anon_rss = 0; ++ int rss; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); +@@ -741,6 +794,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, + file_rss--; + } + page_remove_rmap(page, vma); ++ pb_remove_ref(page, mm); + tlb_remove_page(tlb, page); + continue; + } +@@ -755,6 +809,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); + ++ rss = -(file_rss + anon_rss); ++ ub_unused_privvm_add(mm, vma, rss); + add_mm_rss(mm, file_rss, anon_rss); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); +@@ -1695,6 +1751,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + int reuse = 0, ret = 0; + int page_mkwrite = 0; + struct page *dirty_page = NULL; ++ struct page_beancounter *pbc; + + old_page = vm_normal_page(vma, address, orig_pte); + if (!old_page) { +@@ -1766,6 +1823,7 @@ reuse: + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = pte_mkyoung(orig_pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); ++ ClearPageCheckpointed(old_page); + if (ptep_set_access_flags(vma, address, page_table, entry,1)) + update_mmu_cache(vma, address, entry); + ret |= VM_FAULT_WRITE; +@@ -1779,6 +1837,9 @@ reuse: + gotten: + pte_unmap_unlock(page_table, ptl); + ++ if (unlikely(pb_alloc(&pbc))) ++ goto oom_nopb; ++ + if (unlikely(anon_vma_prepare(vma))) + goto oom; + VM_BUG_ON(old_page == ZERO_PAGE(0)); +@@ -1797,12 +1858,15 @@ gotten: + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (likely(pte_same(*page_table, orig_pte))) { + if (old_page) { ++ pb_remove_ref(old_page, mm); + if (!PageAnon(old_page)) { + dec_mm_counter(mm, file_rss); + inc_mm_counter(mm, anon_rss); + } +- } else ++ } else { ++ ub_unused_privvm_dec(mm, vma); + inc_mm_counter(mm, anon_rss); ++ } + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = mk_pte(new_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); +@@ -1817,6 +1881,7 @@ gotten: + update_mmu_cache(vma, address, entry); + lru_cache_add_active(new_page); + page_add_new_anon_rmap(new_page, vma, address); ++ pb_add_ref(new_page, mm, &pbc); + + if (old_page) { + /* +@@ -1854,6 +1919,7 @@ gotten: + page_cache_release(new_page); + if (old_page) + page_cache_release(old_page); ++ pb_free(&pbc); + unlock: + pte_unmap_unlock(page_table, ptl); + if (dirty_page) { +@@ -1876,6 +1942,8 @@ unlock: + oom_free_new: + page_cache_release(new_page); + oom: ++ pb_free(&pbc); ++oom_nopb: + if (old_page) + 
page_cache_release(old_page); + return VM_FAULT_OOM; +@@ -2183,10 +2251,16 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, + swp_entry_t entry; + pte_t pte; + int ret = 0; ++ struct page_beancounter *pbc; ++ cycles_t start; + + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) +- goto out; ++ goto out_nostat; + ++ if (unlikely(pb_alloc(&pbc))) ++ return VM_FAULT_OOM; ++ ++ start = get_cycles(); + entry = pte_to_swp_entry(orig_pte); + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); +@@ -2240,6 +2314,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, + /* The page isn't present yet, go ahead with the fault. */ + + inc_mm_counter(mm, anon_rss); ++ ub_percpu_inc(mm->mm_ub, swapin); + pte = mk_pte(page, vma->vm_page_prot); + if (write_access && can_share_swap_page(page)) { + pte = maybe_mkwrite(pte_mkdirty(pte), vma); +@@ -2249,10 +2324,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, + flush_icache_page(vma, page); + set_pte_at(mm, address, page_table, pte); + page_add_anon_rmap(page, vma, address); ++ pb_add_ref(page, mm, &pbc); ++ ub_unused_privvm_dec(mm, vma); + + swap_free(entry); +- if (vm_swap_full()) +- remove_exclusive_swap_page(page); ++ try_to_remove_exclusive_swap_page(page); + unlock_page(page); + + if (write_access) { +@@ -2267,10 +2343,16 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, + unlock: + pte_unmap_unlock(page_table, ptl); + out: ++ pb_free(&pbc); ++ spin_lock_irq(&kstat_glb_lock); ++ KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start); ++ spin_unlock_irq(&kstat_glb_lock); ++out_nostat: + return ret; + out_nomap: + mem_cgroup_uncharge_page(page); + pte_unmap_unlock(page_table, ptl); ++ pb_free(&pbc); + unlock_page(page); + page_cache_release(page); + return ret; +@@ -2288,10 +2370,14 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page; + spinlock_t *ptl; + pte_t entry; ++ struct page_beancounter *pbc; + + /* Allocate our own private page. 
*/ + pte_unmap(page_table); + ++ if (unlikely(pb_alloc(&pbc))) ++ goto oom_nopb; ++ + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_zeroed_user_highpage_movable(vma, address); +@@ -2311,11 +2397,14 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + inc_mm_counter(mm, anon_rss); + lru_cache_add_active(page); + page_add_new_anon_rmap(page, vma, address); ++ pb_add_ref(page, mm, &pbc); ++ ub_unused_privvm_dec(mm, vma); + set_pte_at(mm, address, page_table, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, entry); + unlock: ++ pb_free(&pbc); + pte_unmap_unlock(page_table, ptl); + return 0; + release: +@@ -2325,6 +2414,8 @@ release: + oom_free_page: + page_cache_release(page); + oom: ++ pb_free(&pbc); ++oom_nopb: + return VM_FAULT_OOM; + } + +@@ -2351,6 +2442,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, + pte_t entry; + int anon = 0; + struct page *dirty_page = NULL; ++ struct page_beancounter *pbc; + struct vm_fault vmf; + int ret; + int page_mkwrite = 0; +@@ -2360,9 +2452,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, + vmf.flags = flags; + vmf.page = NULL; + ++ ret = VM_FAULT_OOM; ++ if (unlikely(pb_alloc(&pbc))) ++ goto oom_nopb; ++ + ret = vma->vm_ops->fault(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) +- return ret; ++ goto out_fault; + + /* + * For consistency in subsequent calls, make the faulted page always +@@ -2443,6 +2539,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, + */ + /* Only go through if we didn't race with anybody else... */ + if (likely(pte_same(*page_table, orig_pte))) { ++ struct user_beancounter *ub; ++ + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); + if (flags & FAULT_FLAG_WRITE) +@@ -2460,6 +2558,25 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, + get_page(dirty_page); + } + } ++ ub = page_ub(page); ++ if (ub != NULL && ++#ifdef CONFIG_BC_IO_ACCOUNTING ++ !((unsigned long)ub & PAGE_IO_MARK) && ++#endif ++ ub->ub_magic == UB_MAGIC) { ++ /* ++ * WOW: Page was already charged as page_ub. This may ++ * happen, for example, when some driver exports its low ++ * memory pages to user space. We can't account page as ++ * page_ub and page_bp at the same time. So uncharge ++ * page from UB counter.
++ */ ++ WARN_ON_ONCE(1); ++ ub_page_uncharge(page, 0); ++ } ++ ++ pb_add_ref(page, mm, &pbc); ++ ub_unused_privvm_dec(mm, vma); + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, entry); +@@ -2485,7 +2602,9 @@ out_unlocked: + set_page_dirty_balance(dirty_page, page_mkwrite); + put_page(dirty_page); + } +- ++out_fault: ++ pb_free(&pbc); ++oom_nopb: + return ret; + } + +@@ -2667,6 +2786,27 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd; + pte_t *pte; + ++#ifdef CONFIG_VZ_GENCALLS ++ do { ++ int ret; ++#ifdef CONFIG_BEANCOUNTERS ++ struct task_beancounter *tbc; ++ ++ tbc = ¤t->task_bc; ++ if (!test_bit(UB_AFLAG_NOTIF_PAGEIN, &mm->mm_ub->ub_aflags) && ++ tbc->pgfault_allot) { ++ tbc->pgfault_allot--; ++ break; /* skip notifier */ ++ } ++#endif ++ ret = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_PAGEIN, ++ (void *)1); ++ if (ret & NOTIFY_FAIL) ++ return VM_FAULT_SIGBUS; ++ if (ret & NOTIFY_OK) ++ return VM_FAULT_MINOR; /* retry */ ++ } while (0); ++#endif + __set_current_state(TASK_RUNNING); + + count_vm_event(PGFAULT); +@@ -2711,6 +2851,8 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) + } + #endif /* __PAGETABLE_PUD_FOLDED */ + ++EXPORT_SYMBOL_GPL(__pud_alloc); ++ + #ifndef __PAGETABLE_PMD_FOLDED + /* + * Allocate page middle directory. +@@ -2741,6 +2883,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) + } + #endif /* __PAGETABLE_PMD_FOLDED */ + ++EXPORT_SYMBOL_GPL(__pmd_alloc); ++ + int make_pages_present(unsigned long addr, unsigned long end) + { + int ret, len, write; +@@ -2760,6 +2904,8 @@ int make_pages_present(unsigned long addr, unsigned long end) + return ret == len ? 0 : -1; + } + ++EXPORT_SYMBOL(make_pages_present); ++ + #if !defined(__HAVE_ARCH_GATE_AREA) + + #if defined(AT_SYSINFO_EHDR) +diff --git a/mm/mempool.c b/mm/mempool.c +index a46eb1b..0e1a6bf 100644 +--- a/mm/mempool.c ++++ b/mm/mempool.c +@@ -77,6 +77,8 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, + init_waitqueue_head(&pool->wait); + pool->alloc = alloc_fn; + pool->free = free_fn; ++ if (alloc_fn == mempool_alloc_slab) ++ kmem_mark_nocharge((struct kmem_cache *)pool_data); + + /* + * First pre-allocate the guaranteed number of buffers. 
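The mempool hunks above apply a single rule: an emergency reserve must never fail because of a per-container limit, so slab-backed pools are marked no-charge at creation time (kmem_mark_nocharge()) and __GFP_UBC is stripped from every gfp_mask the pool later uses. A small helper in the same spirit is sketched below; MY_GFP_ACCOUNTED and reserve_gfp() are made-up names standing in for __GFP_UBC and the open-coded masking in the patch:

#include <linux/gfp.h>

/* Illustrative stand-in for the patch's __GFP_UBC accounting flag */
#define MY_GFP_ACCOUNTED        ((gfp_t)0x01000000u)

static inline gfp_t reserve_gfp(gfp_t gfp_mask)
{
        /*
         * Refilling an emergency reserve must not be charged to the
         * caller's container and must not spam the log on failure.
         */
        gfp_mask &= ~MY_GFP_ACCOUNTED;
        gfp_mask |= __GFP_NOWARN;
        return gfp_mask;
}

Routing every pool-internal allocation through one such helper keeps the flag hygiene in a single place instead of scattering the mask across call sites.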
+@@ -118,6 +120,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) + unsigned long flags; + + BUG_ON(new_min_nr <= 0); ++ gfp_mask &= ~__GFP_UBC; + + spin_lock_irqsave(&pool->lock, flags); + if (new_min_nr <= pool->min_nr) { +@@ -211,6 +214,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) + gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ + gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ + gfp_mask |= __GFP_NOWARN; /* failures are OK */ ++ gfp_mask &= ~__GFP_UBC; + + gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); + +diff --git a/mm/mlock.c b/mm/mlock.c +index 7b26560..ea357f4 100644 +--- a/mm/mlock.c ++++ b/mm/mlock.c +@@ -8,10 +8,12 @@ + #include + #include + #include ++#include + #include + #include + #include + #include ++#include + + int can_do_mlock(void) + { +@@ -36,6 +38,14 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, + goto out; + } + ++ if (newflags & VM_LOCKED) { ++ ret = ub_locked_charge(mm, end - start); ++ if (ret < 0) { ++ *prev = vma; ++ goto out; ++ } ++ } ++ + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma)); +@@ -49,13 +59,13 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, + if (start != vma->vm_start) { + ret = split_vma(mm, vma, start, 1); + if (ret) +- goto out; ++ goto out_uncharge; + } + + if (end != vma->vm_end) { + ret = split_vma(mm, vma, end, 0); + if (ret) +- goto out; ++ goto out_uncharge; + } + + success: +@@ -74,13 +84,19 @@ success: + pages = -pages; + if (!(newflags & VM_IO)) + ret = make_pages_present(start, end); +- } ++ } else ++ ub_locked_uncharge(mm, end - start); + + mm->locked_vm -= pages; + out: + if (ret == -ENOMEM) + ret = -EAGAIN; + return ret; ++ ++out_uncharge: ++ if (newflags & VM_LOCKED) ++ ub_locked_uncharge(mm, end - start); ++ goto out; + } + + static int do_mlock(unsigned long start, size_t len, int on) +@@ -157,6 +173,7 @@ asmlinkage long sys_mlock(unsigned long start, size_t len) + up_write(¤t->mm->mmap_sem); + return error; + } ++EXPORT_SYMBOL_GPL(sys_mlock); + + asmlinkage long sys_munlock(unsigned long start, size_t len) + { +@@ -169,6 +186,7 @@ asmlinkage long sys_munlock(unsigned long start, size_t len) + up_write(¤t->mm->mmap_sem); + return ret; + } ++EXPORT_SYMBOL_GPL(sys_munlock); + + static int do_mlockall(int flags) + { +diff --git a/mm/mmap.c b/mm/mmap.c +index 3354fdd..89b2ef2 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -36,10 +37,13 @@ + #define arch_mmap_check(addr, len, flags) (0) + #endif + ++#include ++ + #ifndef arch_rebalance_pgtables + #define arch_rebalance_pgtables(addr, len) (addr) + #endif + ++static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft); + static void unmap_region(struct mm_struct *mm, + struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long start, unsigned long end); +@@ -104,6 +108,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) + + vm_acct_memory(pages); + ++#ifdef CONFIG_BEANCOUNTERS ++ switch (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_ENOUGHMEM, ++ (void *)pages) ++ & (NOTIFY_OK | NOTIFY_FAIL)) { ++ case NOTIFY_OK: ++ return 0; ++ case NOTIFY_FAIL: ++ vm_unacct_memory(pages); ++ return -ENOMEM; ++ } ++#endif ++ + /* + * Sometimes we want to use more memory than we have + 
*/ +@@ -228,6 +244,9 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) + struct vm_area_struct *next = vma->vm_next; + + might_sleep(); ++ ++ ub_memory_uncharge(vma->vm_mm, vma->vm_end - vma->vm_start, ++ vma->vm_flags, vma->vm_file); + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) { +@@ -285,7 +304,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk) + goto out; + + /* Ok, looks good - let it rip. */ +- if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) ++ if (__do_brk(oldbrk, newbrk-oldbrk, UB_HARD) != oldbrk) + goto out; + set_brk: + mm->brk = brk; +@@ -927,7 +946,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, + prot |= PROT_EXEC; + + if (!len) +- return -EINVAL; ++ return addr; + + if (!(flags & MAP_FIXED)) + addr = round_hint_to_min(addr); +@@ -1092,6 +1111,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, + struct rb_node **rb_link, *rb_parent; + unsigned long charged = 0; + struct inode *inode = file ? file->f_path.dentry->d_inode : NULL; ++ unsigned long ub_charged = 0; + + /* Clear old maps */ + error = -ENOMEM; +@@ -1123,6 +1143,11 @@ munmap_back: + } + } + ++ if (ub_memory_charge(mm, len, vm_flags, file, ++ (flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD))) ++ goto charge_error; ++ ub_charged = 1; ++ + /* + * Can we just expand an old private anonymous mapping? + * The VM_SHARED test is necessary because shmem_zero_setup +@@ -1138,7 +1163,8 @@ munmap_back: + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ +- vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); ++ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL | ++ (flags & MAP_EXECPRIO ? __GFP_SOFT_UBC : 0)); + if (!vma) { + error = -ENOMEM; + goto unacct_error; +@@ -1168,6 +1194,19 @@ munmap_back: + goto unmap_and_free_vma; + if (vm_flags & VM_EXECUTABLE) + added_exe_file_vma(mm); ++ if (vm_flags != vma->vm_flags) { ++ /* ++ * ->vm_flags has been changed in f_op->mmap method. ++ * We have to recharge ub memory. ++ */ ++ ub_memory_uncharge(mm, len, vm_flags, file); ++ if (ub_memory_charge(mm, len, vma->vm_flags, file, ++ (flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD))) { ++ ub_charged = 0; ++ error = -ENOMEM; ++ goto unmap_and_free_vma; ++ } ++ } + } else if (vm_flags & VM_SHARED) { + error = shmem_zero_setup(vma); + if (error) +@@ -1232,6 +1271,9 @@ unmap_and_free_vma: + free_vma: + kmem_cache_free(vm_area_cachep, vma); + unacct_error: ++ if (ub_charged) ++ ub_memory_uncharge(mm, len, vm_flags, file); ++charge_error: + if (charged) + vm_unacct_memory(charged); + return error; +@@ -1554,12 +1596,16 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un + if (is_hugepage_only_range(vma->vm_mm, new_start, size)) + return -EFAULT; + ++ if (ub_memory_charge(mm, grow << PAGE_SHIFT, vma->vm_flags, ++ vma->vm_file, UB_SOFT)) ++ goto fail_charge; ++ + /* + * Overcommit.. This must be the final test, as it will + * update security statistics. 
+ */ + if (security_vm_enough_memory(grow)) +- return -ENOMEM; ++ goto fail_sec; + + /* Ok, everything looks good - let it rip */ + mm->total_vm += grow; +@@ -1567,6 +1613,11 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); + return 0; ++ ++fail_sec: ++ ub_memory_uncharge(mm, grow << PAGE_SHIFT, vma->vm_flags, vma->vm_file); ++fail_charge: ++ return -ENOMEM; + } + + #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) +@@ -1850,6 +1901,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, + + return 0; + } ++EXPORT_SYMBOL_GPL(split_vma); + + /* Munmap is split into 2 main parts -- this part which finds + * what needs doing, and the areas themselves, which do the +@@ -1943,7 +1995,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) + * anonymous maps. eventually we may be able to do some + * brk-specific accounting here. + */ +-unsigned long do_brk(unsigned long addr, unsigned long len) ++static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft) + { + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; +@@ -2009,8 +2061,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len) + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + ++ if (ub_memory_charge(mm, len, flags, NULL, soft)) ++ goto fail_charge; ++ + if (security_vm_enough_memory(len >> PAGE_SHIFT)) +- return -ENOMEM; ++ goto fail_sec; + + /* Can we just expand an old private anonymous mapping? */ + if (vma_merge(mm, prev, addr, addr + len, flags, +@@ -2020,11 +2075,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len) + /* + * create a vma struct for an anonymous mapping + */ +- vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); +- if (!vma) { +- vm_unacct_memory(len >> PAGE_SHIFT); +- return -ENOMEM; +- } ++ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL | ++ (soft == UB_SOFT ? __GFP_SOFT_UBC : 0)); ++ if (!vma) ++ goto fail_alloc; + + vma->vm_mm = mm; + vma->vm_start = addr; +@@ -2040,8 +2094,19 @@ out: + make_pages_present(addr, addr + len); + } + return addr; ++ ++fail_alloc: ++ vm_unacct_memory(len >> PAGE_SHIFT); ++fail_sec: ++ ub_memory_uncharge(mm, len, flags, NULL); ++fail_charge: ++ return -ENOMEM; + } + ++unsigned long do_brk(unsigned long addr, unsigned long len) ++{ ++ return __do_brk(addr, len, UB_SOFT); ++} + EXPORT_SYMBOL(do_brk); + + /* Release all mmaps. */ +@@ -2218,10 +2283,11 @@ static void special_mapping_close(struct vm_area_struct *vma) + { + } + +-static struct vm_operations_struct special_mapping_vmops = { ++struct vm_operations_struct special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, + }; ++EXPORT_SYMBOL_GPL(special_mapping_vmops); + + /* + * Called with mm->mmap_sem held for writing. 
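All of the mm/mmap.c hunks above share one error-handling shape: ub_memory_charge() comes first, the existing overcommit check (security_vm_enough_memory()) second, and every failure path unwinds exactly the steps that already succeeded, in reverse order, through goto labels such as fail_sec and fail_charge. A distilled, self-contained sketch of that staged-rollback idiom, with stub helpers in place of the beancounter and security calls (none of these names come from the patch):

#include <linux/errno.h>

/* Stubs standing in for ub_memory_charge() and the overcommit check */
static int charge_quota(unsigned long len)      { return 0; }
static void uncharge_quota(unsigned long len)   { }
static int reserve_memory(unsigned long len)    { return 0; }

static long map_something(unsigned long len)
{
        int err;

        err = charge_quota(len);        /* step 1: per-container charge */
        if (err)
                goto fail_charge;

        err = reserve_memory(len);      /* step 2: global overcommit check */
        if (err)
                goto fail_reserve;

        /*
         * Step 3 would build the mapping here; any later failure must
         * jump to a label that undoes steps 2 and 1 in reverse order.
         */
        return 0;

fail_reserve:
        uncharge_quota(len);            /* step 2 charged nothing: undo step 1 */
fail_charge:
        return err;
}

The label chain guarantees that each exit path releases precisely what was acquired, which is why the patch can add a new first step (the UB charge) without touching the success path at all.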
+diff --git a/mm/mmzone.c b/mm/mmzone.c +index 486ed59..8cd9f7a 100644 +--- a/mm/mmzone.c ++++ b/mm/mmzone.c +@@ -13,6 +13,7 @@ struct pglist_data *first_online_pgdat(void) + { + return NODE_DATA(first_online_node); + } ++EXPORT_SYMBOL_GPL(first_online_pgdat); + + struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) + { +@@ -22,6 +23,7 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) + return NULL; + return NODE_DATA(nid); + } ++EXPORT_SYMBOL_GPL(next_online_pgdat); + + /* + * next_zone - helper magic for for_each_zone() +diff --git a/mm/mprotect.c b/mm/mprotect.c +index a5bf31c..e0073cd 100644 +--- a/mm/mprotect.c ++++ b/mm/mprotect.c +@@ -9,6 +9,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -26,6 +27,8 @@ + #include + #include + ++#include ++ + #ifndef pgprot_modify + static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) + { +@@ -144,6 +147,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long charged = 0; + pgoff_t pgoff; + int error; ++ unsigned long ch_size; ++ int ch_dir; + int dirty_accountable = 0; + + if (newflags == oldflags) { +@@ -151,6 +156,12 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, + return 0; + } + ++ error = -ENOMEM; ++ ch_size = nrpages - pages_in_vma_range(vma, start, end); ++ ch_dir = ub_protected_charge(mm, ch_size, newflags, vma); ++ if (ch_dir == PRIVVM_ERROR) ++ goto fail_ch; ++ + /* + * If we make a private mapping writable we increase our commit; + * but (without finer accounting) cannot reduce our commit if we +@@ -163,7 +174,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { + charged = nrpages; + if (security_vm_enough_memory(charged)) +- return -ENOMEM; ++ goto fail_sec; + newflags |= VM_ACCOUNT; + } + } +@@ -213,10 +224,16 @@ success: + change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); + vm_stat_account(mm, newflags, vma->vm_file, nrpages); ++ if (ch_dir == PRIVVM_TO_SHARED) ++ __ub_unused_privvm_dec(mm, ch_size); + return 0; + + fail: + vm_unacct_memory(charged); ++fail_sec: ++ if (ch_dir == PRIVVM_TO_PRIVATE) ++ __ub_unused_privvm_dec(mm, ch_size); ++fail_ch: + return error; + } + +@@ -318,3 +335,4 @@ out: + up_write(¤t->mm->mmap_sem); + return error; + } ++EXPORT_SYMBOL_GPL(sys_mprotect); +diff --git a/mm/mremap.c b/mm/mremap.c +index 08e3c7f..67511b9 100644 +--- a/mm/mremap.c ++++ b/mm/mremap.c +@@ -23,6 +23,8 @@ + #include + #include + ++#include ++ + static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) + { + pgd_t *pgd; +@@ -167,17 +169,21 @@ static unsigned long move_vma(struct vm_area_struct *vma, + unsigned long hiwater_vm; + int split = 0; + ++ if (ub_memory_charge(mm, new_len, vm_flags, ++ vma->vm_file, UB_HARD)) ++ goto err; ++ + /* + * We'd prefer to avoid failure later on in do_munmap: + * which may split one vma into three before unmapping. 
+ */ + if (mm->map_count >= sysctl_max_map_count - 3) +- return -ENOMEM; ++ goto err_nomem; + + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); + if (!new_vma) +- return -ENOMEM; ++ goto err_nomem; + + moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); + if (moved_len < old_len) { +@@ -236,7 +242,13 @@ static unsigned long move_vma(struct vm_area_struct *vma, + new_addr + new_len); + } + +- return new_addr; ++ if (new_addr != -ENOMEM) ++ return new_addr; ++ ++err_nomem: ++ ub_memory_uncharge(mm, new_len, vm_flags, vma->vm_file); ++err: ++ return -ENOMEM; + } + + /* +@@ -364,7 +376,15 @@ unsigned long do_mremap(unsigned long addr, + max_addr = vma->vm_next->vm_start; + /* can we just expand the current mapping? */ + if (max_addr - addr >= new_len) { +- int pages = (new_len - old_len) >> PAGE_SHIFT; ++ unsigned long len; ++ int pages; ++ ++ len = new_len - old_len; ++ pages = len >> PAGE_SHIFT; ++ ret = -ENOMEM; ++ if (ub_memory_charge(mm, len, vma->vm_flags, ++ vma->vm_file, UB_HARD)) ++ goto out; + + vma_adjust(vma, vma->vm_start, + addr + new_len, vma->vm_pgoff, NULL); +diff --git a/mm/oom_kill.c b/mm/oom_kill.c +index 8a5467e..916243b 100644 +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -19,6 +19,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -27,6 +29,9 @@ + #include + #include + ++#include ++#include ++ + int sysctl_panic_on_oom; + int sysctl_oom_kill_allocating_task; + int sysctl_oom_dump_tasks; +@@ -198,16 +203,16 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, + * + * (not docbooked, we don't want this one cluttering up the manual) + */ +-static struct task_struct *select_bad_process(unsigned long *ppoints, ++struct task_struct *select_bad_process(struct user_beancounter *ub, + struct mem_cgroup *mem) + { + struct task_struct *g, *p; + struct task_struct *chosen = NULL; + struct timespec uptime; +- *ppoints = 0; ++ unsigned long chosen_points = 0; + + do_posix_clock_monotonic_gettime(&uptime); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + unsigned long points; + + /* +@@ -221,6 +226,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, + continue; + if (mem && !task_in_mem_cgroup(p, mem)) + continue; ++ if (ub_oom_task_skip(ub, p)) ++ continue; + + /* + * This task already has access to memory reserves and is +@@ -249,18 +256,18 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, + return ERR_PTR(-1UL); + + chosen = p; +- *ppoints = ULONG_MAX; ++ chosen_points = ULONG_MAX; + } + + if (p->oomkilladj == OOM_DISABLE) + continue; + + points = badness(p, uptime.tv_sec); +- if (points > *ppoints || !chosen) { ++ if (points > chosen_points || !chosen) { + chosen = p; +- *ppoints = points; ++ chosen_points = points; + } +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + + return chosen; + } +@@ -284,7 +291,7 @@ static void dump_tasks(const struct mem_cgroup *mem) + + printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " + "name\n"); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + /* + * total_vm and rss sizes do not exist for tasks with a + * detached mm so there's no need to report them. 
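The repeated s/do_each_thread/do_each_thread_all/ substitution is the heart of these OOM hunks: inside a VE the plain iterator only sees that container's task list, while a global out-of-memory event has to scan every task on the machine. The victim scan itself is just a max-search under tasklist_lock; below is a hedged sketch using the stock for_each_process iterator, with score() as a placeholder for the kernel's badness() heuristic (the unlocked get_mm_rss() read is acceptable for an illustration but racy in real code):

#include <linux/sched.h>
#include <linux/mm.h>

/* Placeholder for badness(): higher means a better victim */
static unsigned long score(struct task_struct *p)
{
        return p->mm ? get_mm_rss(p->mm) : 0;
}

static struct task_struct *pick_victim(void)
{
        struct task_struct *p, *chosen = NULL;
        unsigned long best = 0, points;

        read_lock(&tasklist_lock);
        for_each_process(p) {
                if (!p->mm)             /* kernel threads are immune */
                        continue;
                points = score(p);
                if (!chosen || points > best) {
                        chosen = p;
                        best = points;
                }
        }
        read_unlock(&tasklist_lock);
        return chosen;
}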
+@@ -300,7 +307,7 @@ static void dump_tasks(const struct mem_cgroup *mem) + get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj, + p->comm); + task_unlock(p); +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + } + + /* +@@ -335,13 +342,16 @@ static void __oom_kill_task(struct task_struct *p, int verbose) + set_tsk_thread_flag(p, TIF_MEMDIE); + + force_sig(SIGKILL, p); ++ ub_oom_task_killed(p); + } + + static int oom_kill_task(struct task_struct *p) + { + struct mm_struct *mm; ++ struct user_beancounter *ub; + struct task_struct *g, *q; + ++ task_lock(p); + mm = p->mm; + + /* WARNING: mm may not be dereferenced since we did not obtain its +@@ -353,16 +363,21 @@ static int oom_kill_task(struct task_struct *p) + * However, this is of no concern to us. + */ + +- if (mm == NULL) ++ if (mm == NULL) { ++ task_unlock(p); + return 1; ++ } ++ ++ ub = get_beancounter(mm_ub(mm)); ++ task_unlock(p); + + /* + * Don't kill the process if any threads are set to OOM_DISABLE + */ +- do_each_thread(g, q) { ++ do_each_thread_all(g, q) { + if (q->mm == mm && q->oomkilladj == OOM_DISABLE) + return 1; +- } while_each_thread(g, q); ++ } while_each_thread_all(g, q); + + __oom_kill_task(p, 1); + +@@ -371,17 +386,18 @@ static int oom_kill_task(struct task_struct *p) + * but are in a different thread group. Don't let them have access + * to memory reserves though, otherwise we might deplete all memory. + */ +- do_each_thread(g, q) { ++ do_each_thread_all(g, q) { + if (q->mm == mm && !same_thread_group(q, p)) + force_sig(SIGKILL, q); +- } while_each_thread(g, q); ++ } while_each_thread_all(g, q); + ++ ub_oom_mm_killed(ub); ++ put_beancounter(ub); + return 0; + } + +-static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, +- unsigned long points, struct mem_cgroup *mem, +- const char *message) ++int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, ++ struct mem_cgroup *mem, const char *message) + { + struct task_struct *c; + +@@ -404,8 +420,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, + return 0; + } + +- printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", +- message, task_pid_nr(p), p->comm, points); ++ printk(KERN_ERR "%s: kill process %d (%s) or a child\n", ++ message, task_pid_nr(p), p->comm); + + /* Try to kill a child first */ + list_for_each_entry(c, &p->children, sibling) { +@@ -520,9 +536,9 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) + void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) + { + struct task_struct *p; +- unsigned long points = 0; + unsigned long freed = 0; + enum oom_constraint constraint; ++ struct user_beancounter *ub; + + blocking_notifier_call_chain(&oom_notify_list, 0, &freed); + if (freed > 0) +@@ -532,16 +548,34 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) + if (sysctl_panic_on_oom == 2) + panic("out of memory. 
Compulsory panic_on_oom is selected.\n"); + ++ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_OUTOFMEM, NULL) ++ & (NOTIFY_OK | NOTIFY_FAIL)) ++ return; ++ ++ ub = NULL; ++ if (ub_oom_lock()) ++ goto out_oom_lock; ++ ++ read_lock(&tasklist_lock); ++ ++ if (printk_ratelimit()) { ++ printk(KERN_WARNING "%s invoked oom-killer: " ++ "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", ++ current->comm, gfp_mask, order, current->oomkilladj); ++ dump_stack(); ++ show_mem(); ++ show_slab_info(); ++ } ++ + /* + * Check if there were limitations on the allocation (only relevant for + * NUMA) that may require different handling. + */ + constraint = constrained_alloc(zonelist, gfp_mask); +- read_lock(&tasklist_lock); + + switch (constraint) { + case CONSTRAINT_MEMORY_POLICY: +- oom_kill_process(current, gfp_mask, order, points, NULL, ++ oom_kill_process(current, gfp_mask, order, NULL, + "No available memory (MPOL_BIND)"); + break; + +@@ -551,27 +585,33 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) + /* Fall-through */ + case CONSTRAINT_CPUSET: + if (sysctl_oom_kill_allocating_task) { +- oom_kill_process(current, gfp_mask, order, points, NULL, ++ oom_kill_process(current, gfp_mask, order, NULL, + "Out of memory (oom_kill_allocating_task)"); + break; + } + retry: ++ put_beancounter(ub); ++ + /* + * Rambo mode: Shoot down a process and hope it solves whatever + * issues we may have. + */ +- p = select_bad_process(&points, NULL); ++ ub = ub_oom_select_worst(); ++ p = select_bad_process(ub, NULL); + + if (PTR_ERR(p) == -1UL) + goto out; + + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (!p) { ++ if (ub != NULL) ++ goto retry; + read_unlock(&tasklist_lock); ++ ub_oom_unlock(); + panic("Out of memory and no killable processes...\n"); + } + +- if (oom_kill_process(p, gfp_mask, order, points, NULL, ++ if (oom_kill_process(p, gfp_mask, order, NULL, + "Out of memory")) + goto retry; + +@@ -580,7 +620,10 @@ retry: + + out: + read_unlock(&tasklist_lock); ++ ub_oom_unlock(); ++ put_beancounter(ub); + ++out_oom_lock: + /* + * Give "p" a good chance of killing itself before we + * retry to allocate memory unless "p" is current +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 789b6ad..e1883c0 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -35,6 +35,9 @@ + #include + #include + ++#include ++#include ++ + /* + * The maximum number of pages to writeout in a single bdflush/kupdate + * operation. 
We do this so we don't hold I_SYNC against an inode for +@@ -899,6 +902,7 @@ retry: + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; ++ struct user_beancounter *old_ub; + + /* + * At this point we hold neither mapping->tree_lock nor +@@ -929,7 +933,9 @@ retry: + continue; + } + ++ old_ub = bc_io_switch_context(page); + ret = (*writepage)(page, wbc, data); ++ bc_io_restore_context(old_ub); + + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { + unlock_page(page); +@@ -1025,12 +1031,15 @@ int write_one_page(struct page *page, int wait) + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + }; ++ struct user_beancounter *old_ub; + + BUG_ON(!PageLocked(page)); + + if (wait) + wait_on_page_writeback(page); + ++ old_ub = bc_io_switch_context(page); ++ + if (clear_page_dirty_for_io(page)) { + page_cache_get(page); + ret = mapping->a_ops->writepage(page, &wbc); +@@ -1043,6 +1052,9 @@ int write_one_page(struct page *page, int wait) + } else { + unlock_page(page); + } ++ ++ bc_io_restore_context(old_ub); ++ + return ret; + } + EXPORT_SYMBOL(write_one_page); +@@ -1074,6 +1086,9 @@ int __set_page_dirty_no_writeback(struct page *page) + */ + int __set_page_dirty_nobuffers(struct page *page) + { ++ int acct; ++ ++ acct = 0; + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page_mapping(page); + struct address_space *mapping2; +@@ -1081,6 +1096,7 @@ int __set_page_dirty_nobuffers(struct page *page) + if (!mapping) + return 1; + ++ acct = 0; + write_lock_irq(&mapping->tree_lock); + mapping2 = page_mapping(page); + if (mapping2) { /* Race with truncate? */ +@@ -1090,12 +1106,14 @@ int __set_page_dirty_nobuffers(struct page *page) + __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); +- task_io_account_write(PAGE_CACHE_SIZE); ++ acct = 1; + } + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + write_unlock_irq(&mapping->tree_lock); ++ if (acct) ++ task_io_account_write(page, PAGE_CACHE_SIZE, 0); + if (mapping->host) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); +@@ -1234,6 +1252,7 @@ int clear_page_dirty_for_io(struct page *page) + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(mapping->backing_dev_info, + BDI_RECLAIMABLE); ++ ub_io_release_context(page, PAGE_CACHE_SIZE); + return 1; + } + return 0; +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index f32fae3..1d172d0 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -51,6 +51,9 @@ + #include + #include "internal.h" + ++#include ++#include ++ + /* + * Array of node states. + */ +@@ -102,6 +105,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { + 32, + }; + ++EXPORT_SYMBOL(nr_swap_pages); + EXPORT_SYMBOL(totalram_pages); + + static char * const zone_names[MAX_NR_ZONES] = { +@@ -456,8 +460,11 @@ static inline int free_pages_check(struct page *page) + (page_count(page) != 0) | + (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) + bad_page(page); +- if (PageDirty(page)) ++ if (PageDirty(page)) { ++ ub_io_release_context(page, 0); + __ClearPageDirty(page); ++ } else ++ ub_io_release_debug(page); + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not free the page. 
But we shall soon need +@@ -523,6 +530,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) + arch_free_page(page, order); + kernel_map_pages(page, 1 << order, 0); + ++ ub_page_uncharge(page, order); + local_irq_save(flags); + __count_vm_events(PGFREE, 1 << order); + free_one_page(page_zone(page), page, order); +@@ -979,6 +987,7 @@ static void free_hot_cold_page(struct page *page, int cold) + kernel_map_pages(page, 1, 0); + + pcp = &zone_pcp(zone, get_cpu())->pcp; ++ ub_page_uncharge(page, 0); + local_irq_save(flags); + __count_vm_event(PGFREE); + if (cold) +@@ -1426,6 +1435,31 @@ try_next_zone: + return page; + } + ++extern unsigned long cycles_per_jiffy; ++static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order, ++ struct page *page, cycles_t time) ++{ ++#ifdef CONFIG_VE ++ int ind; ++ unsigned long flags; ++ ++ time = (jiffies - time) * cycles_per_jiffy; ++ if (!(gfp_mask & __GFP_WAIT)) ++ ind = 0; ++ else if (!(gfp_mask & __GFP_HIGHMEM)) ++ ind = (order > 0 ? 2 : 1); ++ else ++ ind = (order > 0 ? 4 : 3); ++ spin_lock_irqsave(&kstat_glb_lock, flags); ++ KSTAT_LAT_ADD(&kstat_glob.alloc_lat[ind], time); ++ if (!page) ++ kstat_glob.alloc_fails[ind]++; ++ spin_unlock_irqrestore(&kstat_glb_lock, flags); ++#endif ++} ++ ++int alloc_fail_warn; ++ + /* + * This is the 'heart' of the zoned buddy allocator. + */ +@@ -1444,6 +1478,7 @@ __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, + int alloc_flags; + unsigned long did_some_progress; + unsigned long pages_reclaimed = 0; ++ cycles_t start; + + might_sleep_if(wait); + +@@ -1461,6 +1496,7 @@ restart: + return NULL; + } + ++ start = jiffies; + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, + zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); + if (page) +@@ -1617,19 +1653,32 @@ nofail_alloc: + do_retry = 1; + } + if (do_retry) { ++ if (total_swap_pages > 0 && nr_swap_pages == 0) { ++ out_of_memory(zonelist, gfp_mask, order); ++ goto restart; ++ } + congestion_wait(WRITE, HZ/50); + goto rebalance; + } + + nopage: +- if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { ++ __alloc_collect_stats(gfp_mask, order, NULL, start); ++ if (alloc_fail_warn && !(gfp_mask & __GFP_NOWARN) && ++ printk_ratelimit()) { + printk(KERN_WARNING "%s: page allocation failure." 
+ " order:%d, mode:0x%x\n", + p->comm, order, gfp_mask); + dump_stack(); + show_mem(); + } ++ return NULL; ++ + got_pg: ++ __alloc_collect_stats(gfp_mask, order, page, start); ++ if (ub_page_charge(page, order, gfp_mask)) { ++ __free_pages(page, order); ++ page = NULL; ++ } + return page; + } + +diff --git a/mm/rmap.c b/mm/rmap.c +index bf0a5b7..679a575 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -50,6 +50,9 @@ + #include + #include + ++#include ++#include ++ + #include + + struct kmem_cache *anon_vma_cachep; +@@ -93,6 +96,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) + } + return 0; + } ++EXPORT_SYMBOL_GPL(anon_vma_prepare); + + void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) + { +@@ -118,6 +122,7 @@ void anon_vma_link(struct vm_area_struct *vma) + spin_unlock(&anon_vma->lock); + } + } ++EXPORT_SYMBOL_GPL(anon_vma_link); + + void anon_vma_unlink(struct vm_area_struct *vma) + { +@@ -149,14 +154,14 @@ static void anon_vma_ctor(struct kmem_cache *cachep, void *data) + void __init anon_vma_init(void) + { + anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), +- 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); ++ 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_UBC, anon_vma_ctor); + } + + /* + * Getting a lock on a stable anon_vma from a page off the LRU is + * tricky: page_lock_anon_vma rely on RCU to guard against the races. + */ +-static struct anon_vma *page_lock_anon_vma(struct page *page) ++struct anon_vma *page_lock_anon_vma(struct page *page) + { + struct anon_vma *anon_vma; + unsigned long anon_mapping; +@@ -175,12 +180,14 @@ out: + rcu_read_unlock(); + return NULL; + } ++EXPORT_SYMBOL_GPL(page_lock_anon_vma); + +-static void page_unlock_anon_vma(struct anon_vma *anon_vma) ++void page_unlock_anon_vma(struct anon_vma *anon_vma) + { + spin_unlock(&anon_vma->lock); + rcu_read_unlock(); + } ++EXPORT_SYMBOL_GPL(page_unlock_anon_vma); + + /* + * At what user virtual address is page expected in @vma? +@@ -684,6 +691,12 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) + } + mem_cgroup_uncharge_page(page); + ++ /* ++ * Well, when a page is unmapped, we cannot keep PG_checkpointed ++ * flag, it is not accessible via process VM and we have no way ++ * to reset its state ++ */ ++ ClearPageCheckpointed(page); + __dec_zone_page_state(page, + PageAnon(page) ? 
NR_ANON_PAGES : NR_FILE_MAPPED); + } +@@ -775,6 +788,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, + + + page_remove_rmap(page, vma); ++ ub_unused_privvm_inc(mm, vma); ++ ub_percpu_inc(mm->mm_ub, unmap); ++ pb_remove_ref(page, mm); + page_cache_release(page); + + out_unmap: +@@ -865,6 +881,9 @@ static void try_to_unmap_cluster(unsigned long cursor, + set_page_dirty(page); + + page_remove_rmap(page, vma); ++ ub_percpu_inc(mm->mm_ub, unmap); ++ pb_remove_ref(page, mm); ++ ub_unused_privvm_inc(mm, vma); + page_cache_release(page); + dec_mm_counter(mm, file_rss); + (*mapcount)--; +diff --git a/mm/shmem.c b/mm/shmem.c +index e2a6ae1..c7dc238 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -55,6 +55,8 @@ + #include + #include + ++#include ++ + /* This magic number is used in glibc for posix shared memory */ + #define TMPFS_MAGIC 0x01021994 + +@@ -193,7 +195,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) + + static const struct super_operations shmem_ops; + static const struct address_space_operations shmem_aops; +-static const struct file_operations shmem_file_operations; ++const struct file_operations shmem_file_operations; + static const struct inode_operations shmem_inode_operations; + static const struct inode_operations shmem_dir_inode_operations; + static const struct inode_operations shmem_special_inode_operations; +@@ -256,7 +258,7 @@ static void shmem_free_inode(struct super_block *sb) + * + * It has to be called with the spinlock held. + */ +-static void shmem_recalc_inode(struct inode *inode) ++static void shmem_recalc_inode(struct inode *inode, long swp_freed) + { + struct shmem_inode_info *info = SHMEM_I(inode); + long freed; +@@ -266,6 +268,8 @@ static void shmem_recalc_inode(struct inode *inode) + info->alloced -= freed; + shmem_unacct_blocks(info->flags, freed); + shmem_free_blocks(inode, freed); ++ if (freed > swp_freed) ++ ub_tmpfs_respages_sub(info, freed - swp_freed); + } + } + +@@ -370,6 +374,11 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns + struct page *page = kmap_atomic_to_page(entry); + set_page_private(page, page_private(page) + incdec); + } ++ ++ if (incdec == 1) ++ ub_tmpfs_respages_dec(info); ++ else ++ ub_tmpfs_respages_inc(info); + } + + /** +@@ -386,14 +395,24 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct page *page = NULL; + swp_entry_t *entry; ++ unsigned long ub_val; + + if (sgp != SGP_WRITE && + ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + return ERR_PTR(-EINVAL); + ++ ub_val = 0; ++ if (info->next_index <= index) { ++ ub_val = index + 1 - info->next_index; ++ if (ub_shmpages_charge(info, ub_val)) ++ return ERR_PTR(-ENOSPC); ++ } ++ + while (!(entry = shmem_swp_entry(info, index, &page))) { +- if (sgp == SGP_READ) +- return shmem_swp_map(ZERO_PAGE(0)); ++ if (sgp == SGP_READ) { ++ entry = shmem_swp_map(ZERO_PAGE(0)); ++ goto out; ++ } + /* + * Test free_blocks against 1 not 0, since we have 1 data + * page (and perhaps indirect index pages) yet to allocate: +@@ -403,7 +422,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long + spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_blocks <= 1) { + spin_unlock(&sbinfo->stat_lock); +- return ERR_PTR(-ENOSPC); ++ entry = ERR_PTR(-ENOSPC); ++ goto out; + } + sbinfo->free_blocks--; + inode->i_blocks += BLOCKS_PER_PAGE; +@@ -411,31 +431,43 @@ static swp_entry_t 
*shmem_swp_alloc(struct shmem_inode_info *info, unsigned long + } + + spin_unlock(&info->lock); +- page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); ++ page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | ++ __GFP_UBC); + if (page) + set_page_private(page, 0); + spin_lock(&info->lock); + + if (!page) { +- shmem_free_blocks(inode, 1); +- return ERR_PTR(-ENOMEM); ++ entry = ERR_PTR(-ENOMEM); ++ goto out_block; + } + if (sgp != SGP_WRITE && + ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + entry = ERR_PTR(-EINVAL); +- break; ++ goto out_dir; + } +- if (info->next_index <= index) ++ if (info->next_index <= index) { ++ ub_val = 0; + info->next_index = index + 1; ++ } + } + if (page) { + /* another task gave its page, or truncated the file */ + shmem_free_blocks(inode, 1); + shmem_dir_free(page); + } +- if (info->next_index <= index && !IS_ERR(entry)) ++ if (info->next_index <= index) + info->next_index = index + 1; + return entry; ++ ++out_dir: ++ shmem_dir_free(page); ++out_block: ++ shmem_free_blocks(inode, 1); ++out: ++ if (ub_val) ++ ub_shmpages_uncharge(info, ub_val); ++ return entry; + } + + /** +@@ -543,6 +575,7 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) + return; + + spin_lock(&info->lock); ++ ub_shmpages_uncharge(info, info->next_index - idx); + info->flags |= SHMEM_TRUNCATE; + if (likely(end == (loff_t) -1)) { + limit = info->next_index; +@@ -729,7 +762,7 @@ done2: + info->swapped -= nr_swaps_freed; + if (nr_pages_to_free) + shmem_free_blocks(inode, nr_pages_to_free); +- shmem_recalc_inode(inode); ++ shmem_recalc_inode(inode, nr_swaps_freed); + spin_unlock(&info->lock); + + /* +@@ -812,6 +845,7 @@ static void shmem_delete_inode(struct inode *inode) + } + } + BUG_ON(inode->i_blocks); ++ shmi_ub_put(info); + shmem_free_inode(inode->i_sb); + clear_inode(inode); + } +@@ -991,6 +1025,12 @@ int shmem_unuse(swp_entry_t entry, struct page *page) + out: return found; /* 0 or 1 or -ENOMEM */ + } + ++#ifdef CONFIG_BEANCOUNTERS ++#define shm_get_swap_page(info) (get_swap_page((info)->shmi_ub)) ++#else ++#define shm_get_swap_page(info) (get_swap_page(NULL)) ++#endif ++ + /* + * Move the page from the page cache to the swap cache. + */ +@@ -1021,7 +1061,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) + * want to check if there's a redundant swappage to be discarded. + */ + if (wbc->for_reclaim) +- swap = get_swap_page(); ++ swap = shm_get_swap_page(info); + else + swap.val = 0; + +@@ -1039,7 +1079,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) + free_swap_and_cache(*entry); + shmem_swp_set(info, entry, 0); + } +- shmem_recalc_inode(inode); ++ shmem_recalc_inode(inode, 0); + + if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { + remove_from_page_cache(page); +@@ -1077,6 +1117,54 @@ redirty: + return 0; + } + ++/* Insert a swap entry to shmem inode address space. 
*/ ++int shmem_insertpage(struct inode * inode, unsigned long index, ++ swp_entry_t swap) ++{ ++ struct shmem_inode_info *info; ++ swp_entry_t *entry; ++ int err; ++ ++ info = SHMEM_I(inode); ++ ++ spin_lock(&info->lock); ++ shmem_recalc_inode(inode, 0); ++ entry = shmem_swp_alloc(info, index, SGP_WRITE); ++ err = PTR_ERR(entry); ++ if (IS_ERR(entry)) ++ goto unlock; ++ ++ err = -EBUSY; ++ if (entry->val) ++ goto unlock_unmap; ++ ++ err = -EINVAL; ++ if (!swap_duplicate(swap)) ++ goto unlock_unmap; ++ ++ info->alloced++; ++ ub_tmpfs_respages_inc(info); ++ inode->i_blocks += BLOCKS_PER_PAGE; ++ shmem_swp_set(info, entry, swap.val); ++ shmem_swp_unmap(entry); ++ spin_unlock(&info->lock); ++ if (list_empty(&info->swaplist)) { ++ mutex_lock(&shmem_swaplist_mutex); ++ /* move instead of add in case we're racing */ ++ list_move_tail(&info->swaplist, &shmem_swaplist); ++ mutex_unlock(&shmem_swaplist_mutex); ++ } ++ return 0; ++ ++unlock_unmap: ++ shmem_swp_unmap(entry); ++unlock: ++ spin_unlock(&info->lock); ++ return err; ++} ++EXPORT_SYMBOL(shmem_insertpage); ++ ++ + #ifdef CONFIG_NUMA + #ifdef CONFIG_TMPFS + static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) +@@ -1219,7 +1307,7 @@ repeat: + } + + spin_lock(&info->lock); +- shmem_recalc_inode(inode); ++ shmem_recalc_inode(inode, 0); + entry = shmem_swp_alloc(info, idx, sgp); + if (IS_ERR(entry)) { + spin_unlock(&info->lock); +@@ -1407,6 +1495,7 @@ repeat: + clear_highpage(filepage); + flush_dcache_page(filepage); + SetPageUptodate(filepage); ++ ub_tmpfs_respages_inc(info); + if (sgp == SGP_DIRTY) + set_page_dirty(filepage); + } +@@ -1509,6 +1598,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) + inode->i_generation = get_seconds(); + info = SHMEM_I(inode); + memset(info, 0, (char *)inode - (char *)info); ++ shmi_ub_set(info, get_exec_ub()); + spin_lock_init(&info->lock); + INIT_LIST_HEAD(&info->swaplist); + +@@ -2365,7 +2455,7 @@ static const struct address_space_operations shmem_aops = { + .migratepage = migrate_page, + }; + +-static const struct file_operations shmem_file_operations = { ++const struct file_operations shmem_file_operations = { + .mmap = shmem_mmap, + #ifdef CONFIG_TMPFS + .llseek = generic_file_llseek, +@@ -2377,6 +2467,7 @@ static const struct file_operations shmem_file_operations = { + .splice_write = generic_file_splice_write, + #endif + }; ++EXPORT_SYMBOL_GPL(shmem_file_operations); + + static const struct inode_operations shmem_inode_operations = { + .truncate = shmem_truncate, +@@ -2446,6 +2537,10 @@ static struct vm_operations_struct shmem_vm_ops = { + #endif + }; + ++int is_shmem_mapping(struct address_space *map) ++{ ++ return (map != NULL && map->a_ops == &shmem_aops); ++} + + static int shmem_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) +@@ -2453,13 +2548,19 @@ static int shmem_get_sb(struct file_system_type *fs_type, + return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); + } + +-static struct file_system_type tmpfs_fs_type = { ++struct file_system_type tmpfs_fs_type = { + .owner = THIS_MODULE, + .name = "tmpfs", + .get_sb = shmem_get_sb, + .kill_sb = kill_litter_super, + }; ++EXPORT_SYMBOL(tmpfs_fs_type); ++ ++#ifdef CONFIG_VE ++#define shm_mnt (get_exec_env()->shmem_mnt) ++#else + static struct vfsmount *shm_mnt; ++#endif + + static int __init init_tmpfs(void) + { +@@ -2500,6 +2601,36 @@ out4: + } + module_init(init_tmpfs) + ++static inline int shm_charge_ahead(struct inode *inode) ++{ ++#ifdef 
CONFIG_BEANCOUNTERS ++ struct shmem_inode_info *info = SHMEM_I(inode); ++ unsigned long idx; ++ swp_entry_t *entry; ++ ++ if (!inode->i_size) ++ return 0; ++ idx = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; ++ /* ++ * Just touch info to allocate space for entry and ++ * make all UBC checks ++ */ ++ spin_lock(&info->lock); ++ entry = shmem_swp_alloc(info, idx, SGP_CACHE); ++ if (IS_ERR(entry)) ++ goto err; ++ shmem_swp_unmap(entry); ++ spin_unlock(&info->lock); ++ return 0; ++ ++err: ++ spin_unlock(&info->lock); ++ return PTR_ERR(entry); ++#else ++ return 0; ++#endif ++} ++ + /** + * shmem_file_setup - get an unlinked file living in tmpfs + * @name: name for dentry (to be seen in /proc//maps +@@ -2546,6 +2677,9 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) + d_instantiate(dentry, inode); + inode->i_size = size; + inode->i_nlink = 0; /* It is unlinked */ ++ error = shm_charge_ahead(inode); ++ if (error) ++ goto close_file; + init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, + &shmem_file_operations); + return file; +@@ -2558,6 +2692,7 @@ put_memory: + shmem_unacct_size(flags, size); + return ERR_PTR(error); + } ++EXPORT_SYMBOL_GPL(shmem_file_setup); + + /** + * shmem_zero_setup - setup a shared anonymous mapping +@@ -2574,6 +2709,8 @@ int shmem_zero_setup(struct vm_area_struct *vma) + + if (vma->vm_file) + fput(vma->vm_file); ++ else if (vma->vm_flags & VM_WRITE) ++ __ub_unused_privvm_dec(vma->vm_mm, size >> PAGE_SHIFT); + vma->vm_file = file; + vma->vm_ops = &shmem_vm_ops; + return 0; +diff --git a/mm/slab.c b/mm/slab.c +index 046607f..bf82112 100644 +--- a/mm/slab.c ++++ b/mm/slab.c +@@ -111,30 +111,14 @@ + #include + #include + #include ++#include ++#include + + #include + #include + #include + +-/* +- * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. +- * 0 for faster, smaller code (especially in the critical paths). +- * +- * STATS - 1 to collect stats for /proc/slabinfo. +- * 0 for faster, smaller code (especially in the critical paths). +- * +- * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) +- */ +- +-#ifdef CONFIG_DEBUG_SLAB +-#define DEBUG 1 +-#define STATS 1 +-#define FORCED_DEBUG 1 +-#else +-#define DEBUG 0 +-#define STATS 0 +-#define FORCED_DEBUG 0 +-#endif ++#include + + /* Shouldn't this be in a header file somewhere? */ + #define BYTES_PER_WORD sizeof(void *) +@@ -169,19 +153,21 @@ + #endif + + /* Legal flag mask for kmem_cache_create(). */ +-#if DEBUG ++#if SLAB_DEBUG + # define CREATE_MASK (SLAB_RED_ZONE | \ + SLAB_POISON | SLAB_HWCACHE_ALIGN | \ + SLAB_CACHE_DMA | \ + SLAB_STORE_USER | \ + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ ++ SLAB_UBC | SLAB_NO_CHARGE) + SLAB_DEBUG_OBJECTS) + #else + # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ + SLAB_CACHE_DMA | \ + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ + SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ ++ SLAB_UBC | SLAB_NO_CHARGE | \ + SLAB_DEBUG_OBJECTS) + #endif + +@@ -371,87 +357,6 @@ static void kmem_list3_init(struct kmem_list3 *parent) + MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ + } while (0) + +-/* +- * struct kmem_cache +- * +- * manages a cache. +- */ +- +-struct kmem_cache { +-/* 1) per-cpu data, touched during every alloc/free */ +- struct array_cache *array[NR_CPUS]; +-/* 2) Cache tunables. 
Protected by cache_chain_mutex */ +- unsigned int batchcount; +- unsigned int limit; +- unsigned int shared; +- +- unsigned int buffer_size; +- u32 reciprocal_buffer_size; +-/* 3) touched by every alloc & free from the backend */ +- +- unsigned int flags; /* constant flags */ +- unsigned int num; /* # of objs per slab */ +- +-/* 4) cache_grow/shrink */ +- /* order of pgs per slab (2^n) */ +- unsigned int gfporder; +- +- /* force GFP flags, e.g. GFP_DMA */ +- gfp_t gfpflags; +- +- size_t colour; /* cache colouring range */ +- unsigned int colour_off; /* colour offset */ +- struct kmem_cache *slabp_cache; +- unsigned int slab_size; +- unsigned int dflags; /* dynamic flags */ +- +- /* constructor func */ +- void (*ctor)(struct kmem_cache *, void *); +- +-/* 5) cache creation/removal */ +- const char *name; +- struct list_head next; +- +-/* 6) statistics */ +-#if STATS +- unsigned long num_active; +- unsigned long num_allocations; +- unsigned long high_mark; +- unsigned long grown; +- unsigned long reaped; +- unsigned long errors; +- unsigned long max_freeable; +- unsigned long node_allocs; +- unsigned long node_frees; +- unsigned long node_overflow; +- atomic_t allochit; +- atomic_t allocmiss; +- atomic_t freehit; +- atomic_t freemiss; +-#endif +-#if DEBUG +- /* +- * If debugging is enabled, then the allocator can add additional +- * fields and/or padding to every object. buffer_size contains the total +- * object size including these internal fields, the following two +- * variables contain the offset to the user object and its size. +- */ +- int obj_offset; +- int obj_size; +-#endif +- /* +- * We put nodelists[] at the end of kmem_cache, because we want to size +- * this array to nr_node_ids slots instead of MAX_NUMNODES +- * (see kmem_cache_init()) +- * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache +- * is statically defined, so we reserve the max number of nodes. 
+- */ +- struct kmem_list3 *nodelists[MAX_NUMNODES]; +- /* +- * Do not add fields after nodelists[] +- */ +-}; +- + #define CFLGS_OFF_SLAB (0x80000000UL) + #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) + +@@ -466,12 +371,14 @@ struct kmem_cache { + #define REAPTIMEOUT_CPUC (2*HZ) + #define REAPTIMEOUT_LIST3 (4*HZ) + +-#if STATS ++#define STATS_INC_GROWN(x) ((x)->grown++) ++#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) ++#define STATS_INC_SHRUNK(x) ((x)->shrunk++) ++ ++#if SLAB_STATS + #define STATS_INC_ACTIVE(x) ((x)->num_active++) + #define STATS_DEC_ACTIVE(x) ((x)->num_active--) + #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) +-#define STATS_INC_GROWN(x) ((x)->grown++) +-#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) + #define STATS_SET_HIGH(x) \ + do { \ + if ((x)->num_active > (x)->high_mark) \ +@@ -494,8 +401,6 @@ struct kmem_cache { + #define STATS_INC_ACTIVE(x) do { } while (0) + #define STATS_DEC_ACTIVE(x) do { } while (0) + #define STATS_INC_ALLOCED(x) do { } while (0) +-#define STATS_INC_GROWN(x) do { } while (0) +-#define STATS_ADD_REAPED(x,y) do { } while (0) + #define STATS_SET_HIGH(x) do { } while (0) + #define STATS_INC_ERR(x) do { } while (0) + #define STATS_INC_NODEALLOCS(x) do { } while (0) +@@ -508,7 +413,7 @@ struct kmem_cache { + #define STATS_INC_FREEMISS(x) do { } while (0) + #endif + +-#if DEBUG ++#if SLAB_DEBUG + + /* + * memory layout of objects: +@@ -640,6 +545,8 @@ struct cache_sizes malloc_sizes[] = { + #define CACHE(x) { .cs_size = (x) }, + #include + CACHE(ULONG_MAX) ++#include ++ CACHE(ULONG_MAX) + #undef CACHE + }; + EXPORT_SYMBOL(malloc_sizes); +@@ -653,10 +560,17 @@ struct cache_names { + static struct cache_names __initdata cache_names[] = { + #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, + #include ++ {NULL,}, ++#undef CACHE ++#define CACHE(x) { .name = "size-" #x "(UBC)", .name_dma = "size-" #x "(DMA,UBC)" }, ++#include + {NULL,} + #undef CACHE + }; + ++int malloc_cache_num; ++EXPORT_SYMBOL(malloc_cache_num); ++ + static struct arraycache_init initarray_cache __initdata = + { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; + static struct arraycache_init initarray_generic = +@@ -733,6 +647,7 @@ static inline void init_lock_keys(void) + */ + static DEFINE_MUTEX(cache_chain_mutex); + static struct list_head cache_chain; ++static spinlock_t cache_chain_lock; + + /* + * chicken and egg problem: delay the per-cpu array allocation +@@ -765,7 +680,9 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, + { + struct cache_sizes *csizep = malloc_sizes; + +-#if DEBUG ++ if (gfpflags & __GFP_UBC) ++ csizep += malloc_cache_num; ++#if SLAB_DEBUG + /* This happens if someone tries to call + * kmem_cache_create(), or __kmalloc(), before + * the generic caches are initialized. +@@ -795,9 +712,98 @@ static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) + return __find_general_cachep(size, gfpflags); + } + +-static size_t slab_mgmt_size(size_t nr_objs, size_t align) ++static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) ++{ ++ return (kmem_bufctl_t *) (slabp + 1); ++} ++ ++#ifdef CONFIG_BEANCOUNTERS ++#define init_slab_ubps(cachep, slabp) do { \ ++ if (!((cachep)->flags & SLAB_UBC)) \ ++ break; \ ++ memset(slab_ubcs(cachep, slabp), 0, \ ++ (cachep)->num * sizeof(void *)); \ ++ } while (0) ++ ++#define UB_ALIGN(flags) (flags & SLAB_UBC ? sizeof(void *) : 1) ++#define UB_EXTRA(flags) (flags & SLAB_UBC ? 
sizeof(void *) : 0) ++#define set_cache_objuse(cachep) do { \ ++ (cachep)->objuse = ((PAGE_SIZE << (cachep)->gfporder) + \ ++ (cachep)->num - 1) / (cachep)->num; \ ++ if (!OFF_SLAB(cachep)) \ ++ break; \ ++ (cachep)->objuse += ((cachep)->slabp_cache->objuse + \ ++ (cachep)->num - 1) / (cachep)->num; \ ++ } while (0) ++ ++void kmem_mark_nocharge(struct kmem_cache *cachep) ++{ ++ cachep->flags |= SLAB_NO_CHARGE; ++} ++ ++int kmem_cache_objuse(struct kmem_cache *cachep) ++{ ++ return cachep->objuse; ++} ++ ++EXPORT_SYMBOL(kmem_cache_objuse); ++ ++int kmem_obj_objuse(void *obj) ++{ ++ return virt_to_cache(obj)->objuse; ++} ++ ++int kmem_dname_objuse(void *obj) ++{ ++ return virt_to_cache(obj)->objuse; ++} ++ ++unsigned long ub_cache_growth(struct kmem_cache *cachep) ++{ ++ return (cachep->grown - cachep->reaped - cachep->shrunk) ++ << cachep->gfporder; ++} ++ ++#define slab_ubcs(cachep, slabp) ((struct user_beancounter **)\ ++ (ALIGN((unsigned long)(slab_bufctl(slabp) + (cachep)->num),\ ++ sizeof(void *)))) ++ ++struct user_beancounter **ub_slab_ptr(struct kmem_cache *cachep, void *obj) ++{ ++ struct slab *slabp; ++ int objnr; ++ ++ BUG_ON(!(cachep->flags & SLAB_UBC)); ++ slabp = virt_to_slab(obj); ++ objnr = (obj - slabp->s_mem) / cachep->buffer_size; ++ return slab_ubcs(cachep, slabp) + objnr; ++} ++ ++struct user_beancounter *slab_ub(void *obj) + { +- return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); ++ return *ub_slab_ptr(virt_to_cache(obj), obj); ++} ++ ++EXPORT_SYMBOL(slab_ub); ++ ++#else ++#define UB_ALIGN(flags) 1 ++#define UB_EXTRA(flags) 0 ++#define set_cache_objuse(c) do { } while (0) ++#define init_slab_ubps(c, s) do { } while (0) ++#endif ++ ++static size_t slab_mgmt_size_noalign(size_t nr_objs, int flags) ++{ ++ size_t size_noub; ++ ++ size_noub = sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t); ++ return ALIGN(size_noub, UB_ALIGN(flags)) + nr_objs * UB_EXTRA(flags); ++} ++ ++static size_t slab_mgmt_size(size_t nr_objs, size_t align, int flags) ++{ ++ return ALIGN(slab_mgmt_size_noalign(nr_objs, flags), align); + } + + /* +@@ -842,20 +848,23 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, + * into account. + */ + nr_objs = (slab_size - sizeof(struct slab)) / +- (buffer_size + sizeof(kmem_bufctl_t)); ++ (buffer_size + sizeof(kmem_bufctl_t) + ++ UB_EXTRA(flags)); + + /* + * This calculated number will be either the right + * amount, or one greater than what we want. 
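+ * The first estimate ignores the alignment padding of the
+ * management area, which for SLAB_UBC caches also carries one
+ * user_beancounter pointer per object (UB_EXTRA), so the recheck
+ * below drops one object if the aligned slab_mgmt_size() no
+ * longer fits, and the BUG_ON asserts the corrected fit.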
+ */ +- if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size +- > slab_size) ++ if (slab_mgmt_size(nr_objs, align, flags) + ++ nr_objs * buffer_size > slab_size) + nr_objs--; ++ BUG_ON(slab_mgmt_size(nr_objs, align, flags) + ++ nr_objs * buffer_size > slab_size); + + if (nr_objs > SLAB_LIMIT) + nr_objs = SLAB_LIMIT; + +- mgmt_size = slab_mgmt_size(nr_objs, align); ++ mgmt_size = slab_mgmt_size(nr_objs, align, flags); + } + *num = nr_objs; + *left_over = slab_size - nr_objs*buffer_size - mgmt_size; +@@ -1403,6 +1412,7 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, + cachep->nodelists[nodeid] = ptr; + local_irq_enable(); + } ++static int offslab_limit; + + /* + * For setting up all the kmem_list3s for cache whose buffer_size is same as +@@ -1476,6 +1486,7 @@ void __init kmem_cache_init(void) + + /* 1) create the cache_cache */ + INIT_LIST_HEAD(&cache_chain); ++ spin_lock_init(&cache_chain_lock); + list_add(&cache_cache.next, &cache_chain); + cache_cache.colour_off = cache_line_size(); + cache_cache.array[smp_processor_id()] = &initarray_cache.cache; +@@ -1487,7 +1498,7 @@ void __init kmem_cache_init(void) + */ + cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + + nr_node_ids * sizeof(struct kmem_list3 *); +-#if DEBUG ++#if SLAB_DEBUG + cache_cache.obj_size = cache_cache.buffer_size; + #endif + cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, +@@ -1534,6 +1545,7 @@ void __init kmem_cache_init(void) + + slab_early_init = 0; + ++ for (i = 0; i < 2; i++) { + while (sizes->cs_size != ULONG_MAX) { + /* + * For performance, all the general caches are L1 aligned. +@@ -1546,21 +1558,30 @@ void __init kmem_cache_init(void) + sizes->cs_cachep = kmem_cache_create(names->name, + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, +- ARCH_KMALLOC_FLAGS|SLAB_PANIC, ++ ARCH_KMALLOC_FLAGS|SLAB_PANIC| ++ (i ? SLAB_UBC : 0)|SLAB_NO_CHARGE, + NULL); + } ++ if (!(OFF_SLAB(sizes->cs_cachep))) ++ offslab_limit = sizes->cs_size; + #ifdef CONFIG_ZONE_DMA +- sizes->cs_dmacachep = kmem_cache_create( +- names->name_dma, ++ sizes->cs_dmacachep = kmem_cache_create(names->name_dma, + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| ++ (i ? 
SLAB_UBC : 0) | SLAB_NO_CHARGE| + SLAB_PANIC, + NULL); + #endif + sizes++; + names++; + } ++ ++ sizes++; ++ names++; ++ if (!i) ++ malloc_cache_num = sizes - malloc_sizes; ++ } + /* 4) Replace the bootstrap head arrays */ + { + struct array_cache *ptr; +@@ -1730,7 +1751,7 @@ static void kmem_rcu_free(struct rcu_head *head) + kmem_cache_free(cachep->slabp_cache, slab_rcu); + } + +-#if DEBUG ++#if SLAB_DEBUG + + #ifdef CONFIG_DEBUG_PAGEALLOC + static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, +@@ -1807,7 +1828,7 @@ static void dump_line(char *data, int offset, int limit) + } + #endif + +-#if DEBUG ++#if SLAB_DEBUG + + static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) + { +@@ -1900,7 +1921,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) + } + #endif + +-#if DEBUG ++#if SLAB_DEBUG + /** + * slab_destroy_objs - destroy a slab and its objects + * @cachep: cache pointer being destroyed +@@ -2008,7 +2029,6 @@ static void __kmem_cache_destroy(struct kmem_cache *cachep) + static size_t calculate_slab_order(struct kmem_cache *cachep, + size_t size, size_t align, unsigned long flags) + { +- unsigned long offslab_limit; + size_t left_over = 0; + int gfporder; + +@@ -2021,15 +2041,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, + continue; + + if (flags & CFLGS_OFF_SLAB) { +- /* +- * Max number of objs-per-slab for caches which +- * use off-slab slabs. Needed to avoid a possible +- * looping condition in cache_grow(). +- */ +- offslab_limit = size - sizeof(struct slab); +- offslab_limit /= sizeof(kmem_bufctl_t); ++ int slab_size; + +- if (num > offslab_limit) ++ slab_size = slab_mgmt_size_noalign(num, flags); ++ if (slab_size > offslab_limit) + break; + } + +@@ -2193,9 +2208,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, + } + } + +-#if DEBUG ++#if SLAB_DEBUG + WARN_ON(strchr(name, ' ')); /* It confuses parsers */ +-#if FORCED_DEBUG ++#if SLAB_FORCED_DEBUG + /* + * Enable redzoning and last user accounting, except for caches with + * large objects, if the increased size would increase the object size +@@ -2280,7 +2295,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, + if (!cachep) + goto oops; + +-#if DEBUG ++#if SLAB_DEBUG + cachep->obj_size = size; + + /* +@@ -2302,7 +2317,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, + else + size += BYTES_PER_WORD; + } +-#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) ++#if SLAB_FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) + if (size >= malloc_sizes[INDEX_L3 + 1].cs_size + && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - size; +@@ -2334,8 +2349,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, + cachep = NULL; + goto oops; + } +- slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) +- + sizeof(struct slab), align); ++ slab_size = slab_mgmt_size(cachep->num, align, flags); + + /* + * If the slab has been placed off-slab, and we have enough space then +@@ -2348,8 +2362,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, + + if (flags & CFLGS_OFF_SLAB) { + /* really off slab. 
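+ * The descriptor lives in a separate slabp_cache and, for
+ * SLAB_UBC caches, slab_mgmt_size_noalign() reserves the
+ * per-object beancounter pointers as well.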
No need for manual alignment */ +- slab_size = +- cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); ++ slab_size = slab_mgmt_size_noalign(cachep->num, flags); + } + + cachep->colour_off = cache_line_size(); +@@ -2386,7 +2399,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, + } + + /* cache setup completed, link it into the list */ ++ spin_lock(&cache_chain_lock); + list_add(&cachep->next, &cache_chain); ++ spin_unlock(&cache_chain_lock); ++ set_cache_objuse(cachep); + oops: + if (!cachep && (flags & SLAB_PANIC)) + panic("kmem_cache_create(): failed to create slab `%s'\n", +@@ -2397,7 +2413,7 @@ oops: + } + EXPORT_SYMBOL(kmem_cache_create); + +-#if DEBUG ++#if SLAB_DEBUG + static void check_irq_off(void) + { + BUG_ON(!irqs_disabled()); +@@ -2493,10 +2509,11 @@ static int drain_freelist(struct kmem_cache *cache, + } + + slabp = list_entry(p, struct slab, list); +-#if DEBUG ++#if SLAB_DEBUG + BUG_ON(slabp->inuse); + #endif + list_del(&slabp->list); ++ STATS_INC_SHRUNK(cache); + /* + * Safe to drop the lock. The slab is no longer linked + * to the cache. +@@ -2579,10 +2596,14 @@ void kmem_cache_destroy(struct kmem_cache *cachep) + /* + * the chain is never empty, cache_cache is never destroyed + */ ++ spin_lock(&cache_chain_lock); + list_del(&cachep->next); ++ spin_unlock(&cache_chain_lock); + if (__cache_shrink(cachep)) { + slab_error(cachep, "Can't free all objects"); ++ spin_lock(&cache_chain_lock); + list_add(&cachep->next, &cache_chain); ++ spin_unlock(&cache_chain_lock); + mutex_unlock(&cache_chain_mutex); + put_online_cpus(); + return; +@@ -2591,6 +2612,8 @@ void kmem_cache_destroy(struct kmem_cache *cachep) + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) + synchronize_rcu(); + ++ ++ ub_kmemcache_free(cachep); + __kmem_cache_destroy(cachep); + mutex_unlock(&cache_chain_mutex); + put_online_cpus(); +@@ -2617,7 +2640,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, + if (OFF_SLAB(cachep)) { + /* Slab management obj is off-slab. */ + slabp = kmem_cache_alloc_node(cachep->slabp_cache, +- local_flags & ~GFP_THISNODE, nodeid); ++ local_flags & (~(__GFP_UBC | GFP_THISNODE)), ++ nodeid); + if (!slabp) + return NULL; + } else { +@@ -2629,14 +2653,10 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, + slabp->s_mem = objp + colour_off; + slabp->nodeid = nodeid; + slabp->free = 0; ++ init_slab_ubps(cachep, slabp); + return slabp; + } + +-static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) +-{ +- return (kmem_bufctl_t *) (slabp + 1); +-} +- + static void cache_init_objs(struct kmem_cache *cachep, + struct slab *slabp) + { +@@ -2644,7 +2664,7 @@ static void cache_init_objs(struct kmem_cache *cachep, + + for (i = 0; i < cachep->num; i++) { + void *objp = index_to_obj(cachep, slabp, i); +-#if DEBUG ++#if SLAB_DEBUG + /* need to poison the objs? 
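+ * (SLAB_POISON fills free objects with POISON_FREE so a
+ * use-after-free shows up as a poison-check failure.)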
*/ + if (cachep->flags & SLAB_POISON) + poison_obj(cachep, objp, POISON_FREE); +@@ -2702,7 +2722,7 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, + + slabp->inuse++; + next = slab_bufctl(slabp)[slabp->free]; +-#if DEBUG ++#if SLAB_DEBUG + slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; + WARN_ON(slabp->nodeid != nodeid); + #endif +@@ -2716,7 +2736,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, + { + unsigned int objnr = obj_to_index(cachep, slabp, objp); + +-#if DEBUG ++#if SLAB_DEBUG + /* Verify that the slab belongs to the intended node */ + WARN_ON(slabp->nodeid != nodeid); + +@@ -2804,7 +2824,7 @@ static int cache_grow(struct kmem_cache *cachep, + * 'nodeid'. + */ + if (!objp) +- objp = kmem_getpages(cachep, local_flags, nodeid); ++ objp = kmem_getpages(cachep, local_flags & ~__GFP_UBC, nodeid); + if (!objp) + goto failed; + +@@ -2837,7 +2857,7 @@ failed: + return 0; + } + +-#if DEBUG ++#if SLAB_DEBUG + + /* + * Perform extra freeing checks: +@@ -3050,12 +3070,12 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, + gfp_t flags) + { + might_sleep_if(flags & __GFP_WAIT); +-#if DEBUG ++#if SLAB_DEBUG + kmem_flagcheck(cachep, flags); + #endif + } + +-#if DEBUG ++#if SLAB_DEBUG + static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, + gfp_t flags, void *objp, void *caller) + { +@@ -3471,9 +3491,14 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) + cache_alloc_debugcheck_before(cachep, flags); + local_irq_save(save_flags); + objp = __do_cache_alloc(cachep, flags); +- local_irq_restore(save_flags); + objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); + prefetchw(objp); ++ if (objp && should_charge(cachep, flags) && ++ ub_slab_charge(cachep, objp, flags)) { ++ kmem_cache_free(cachep, objp); ++ objp = NULL; ++ } ++ local_irq_restore(save_flags); + + if (unlikely((flags & __GFP_ZERO) && objp)) + memset(objp, 0, obj_size(cachep)); +@@ -3507,6 +3532,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, + /* fixup slab chains */ + if (slabp->inuse == 0) { + if (l3->free_objects > l3->free_limit) { ++ STATS_INC_SHRUNK(cachep); + l3->free_objects -= cachep->num; + /* No need to drop any previously held + * lock here, even if we have a off-slab slab +@@ -3535,7 +3561,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) + int node = numa_node_id(); + + batchcount = ac->batchcount; +-#if DEBUG ++#if SLAB_DEBUG + BUG_ON(!batchcount || batchcount > ac->avail); + #endif + check_irq_off(); +@@ -3556,7 +3582,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) + + free_block(cachep, ac->entry, batchcount, node); + free_done: +-#if STATS ++#if SLAB_STATS + { + int i = 0; + struct list_head *p; +@@ -3590,6 +3616,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) + check_irq_off(); + objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + ++ if (should_uncharge(cachep)) ++ ub_slab_uncharge(cachep, objp); ++ + /* + * Skip calling cache_free_alien() when the platform is not numa. 
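+ * (Note: the should_uncharge()/ub_slab_uncharge() pair above has
+ * already run at this point, in both the NUMA and the non-NUMA
+ * case.)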
+ * This will avoid cache misses that happen while accessing slabp (which +@@ -3998,7 +4027,7 @@ static int enable_cpucache(struct kmem_cache *cachep) + if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) + shared = 8; + +-#if DEBUG ++#if SLAB_DEBUG + /* + * With debugging enabled, large batchcount lead to excessively long + * periods with disabled local interrupts. Limit the batchcount +@@ -4066,6 +4095,7 @@ static void cache_reap(struct work_struct *w) + /* Give up. Setup the next iteration. */ + goto out; + ++ {KSTAT_PERF_ENTER(cache_reap) + list_for_each_entry(searchp, &cache_chain, next) { + check_irq_on(); + +@@ -4106,6 +4136,7 @@ next: + check_irq_on(); + mutex_unlock(&cache_chain_mutex); + next_reap_node(); ++ KSTAT_PERF_LEAVE(cache_reap)} + out: + /* Set up the next iteration */ + schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); +@@ -4119,7 +4150,7 @@ static void print_slabinfo_header(struct seq_file *m) + * Output format version, so at least we can change it + * without _too_ many complaints. + */ +-#if STATS ++#if SLAB_STATS + seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); + #else + seq_puts(m, "slabinfo - version: 2.1\n"); +@@ -4128,14 +4159,82 @@ static void print_slabinfo_header(struct seq_file *m) + " "); + seq_puts(m, " : tunables "); + seq_puts(m, " : slabdata "); +-#if STATS ++#if SLAB_STATS + seq_puts(m, " : globalstat " +- " "); ++ " "); + seq_puts(m, " : cpustat "); + #endif + seq_putc(m, '\n'); + } + ++#define SHOW_TOP_SLABS 10 ++ ++static unsigned long get_cache_size(struct kmem_cache *cachep) ++{ ++ unsigned long flags; ++ unsigned long slabs; ++ struct kmem_list3 *l3; ++ struct list_head *lh; ++ int node; ++ ++ slabs = 0; ++ ++ for_each_online_node (node) { ++ l3 = cachep->nodelists[node]; ++ if (l3 == NULL) ++ continue; ++ ++ spin_lock_irqsave(&l3->list_lock, flags); ++ list_for_each (lh, &l3->slabs_full) ++ slabs++; ++ list_for_each (lh, &l3->slabs_partial) ++ slabs++; ++ list_for_each (lh, &l3->slabs_free) ++ slabs++; ++ spin_unlock_irqrestore(&l3->list_lock, flags); ++ } ++ ++ return slabs * (PAGE_SIZE << cachep->gfporder) + ++ (OFF_SLAB(cachep) ? 
++ cachep->slabp_cache->buffer_size * slabs : 0); ++} ++ ++void show_slab_info(void) ++{ ++ int i, j; ++ unsigned long size; ++ struct kmem_cache *ptr; ++ unsigned long sizes[SHOW_TOP_SLABS]; ++ struct kmem_cache *top[SHOW_TOP_SLABS]; ++ ++ memset(top, 0, sizeof(top)); ++ memset(sizes, 0, sizeof(sizes)); ++ ++ printk("Top %d caches:\n", SHOW_TOP_SLABS); ++ ++ spin_lock(&cache_chain_lock); ++ list_for_each_entry (ptr, &cache_chain, next) { ++ size = get_cache_size(ptr); ++ ++ j = 0; ++ for (i = 1; i < SHOW_TOP_SLABS; i++) ++ if (sizes[i] < sizes[j]) ++ j = i; ++ ++ if (size > sizes[j]) { ++ sizes[j] = size; ++ top[j] = ptr; ++ } ++ } ++ ++ for (i = 0; i < SHOW_TOP_SLABS; i++) ++ if (top[i]) ++ printk("%-21s: size %10lu objsize %10u\n", ++ top[i]->name, sizes[i], ++ top[i]->buffer_size); ++ spin_unlock(&cache_chain_lock); ++} ++ + static void *s_start(struct seq_file *m, loff_t *pos) + { + loff_t n = *pos; +@@ -4214,19 +4313,20 @@ static int s_show(struct seq_file *m, void *p) + if (error) + printk(KERN_ERR "slab: cache %s error: %s\n", name, error); + +- seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", ++ seq_printf(m, "%-21s %6lu %6lu %6u %4u %4d", + name, active_objs, num_objs, cachep->buffer_size, + cachep->num, (1 << cachep->gfporder)); + seq_printf(m, " : tunables %4u %4u %4u", + cachep->limit, cachep->batchcount, cachep->shared); + seq_printf(m, " : slabdata %6lu %6lu %6lu", + active_slabs, num_slabs, shared_avail); +-#if STATS ++#if SLAB_STATS + { /* list3 stats */ + unsigned long high = cachep->high_mark; + unsigned long allocs = cachep->num_allocations; + unsigned long grown = cachep->grown; + unsigned long reaped = cachep->reaped; ++ unsigned long shrunk = cachep->shrunk; + unsigned long errors = cachep->errors; + unsigned long max_freeable = cachep->max_freeable; + unsigned long node_allocs = cachep->node_allocs; +@@ -4234,9 +4334,10 @@ static int s_show(struct seq_file *m, void *p) + unsigned long overflows = cachep->node_overflow; + + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ +- %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, ++ %4lu %4lu %4lu %4lu %4lu %4lu", ++ allocs, high, grown, + reaped, errors, max_freeable, node_allocs, +- node_frees, overflows); ++ node_frees, overflows, shrunk); + } + /* cpu stats */ + { +diff --git a/mm/slub.c b/mm/slub.c +index 315c392..ad802eb 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -24,6 +24,8 @@ + #include + #include + ++#include ++ + /* + * Lock order: + * 1. slab_lock(page) +@@ -169,9 +171,11 @@ static inline void ClearSlabDebug(struct page *page) + + /* + * Set of flags that will prevent slab merging ++ * ++ * FIXME - think over how to allow merging accountable slubs + */ + #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ +- SLAB_TRACE | SLAB_DESTROY_BY_RCU) ++ SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_UBC) + + #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_CACHE_DMA) +@@ -337,6 +341,95 @@ static inline int oo_objects(struct kmem_cache_order_objects x) + return x.x & ((1 << 16) - 1); + } + ++#ifdef CONFIG_BEANCOUNTERS ++static inline void inc_cache_grown(struct kmem_cache *s) ++{ ++ atomic_inc(&s->grown); ++} ++ ++static inline void dec_cache_grown(struct kmem_cache *s) ++{ ++ atomic_dec(&s->grown); ++} ++ ++unsigned long ub_cache_growth(struct kmem_cache *cachep) ++{ ++ return atomic_read(&cachep->grown) << cachep->oo.x; /* XXX huh? 
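++ * oo.x packs the page order and the object count into a single
++ * word (oo_objects() above masks the low 16 bits), so shifting
++ * the grown counter by the raw oo.x value instead of just the
++ * order looks wrong; presumably that is what the XXX is about.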
*/ ++} ++ ++static void __flush_cpu_slab(struct kmem_cache *s, int cpu); ++ ++int kmem_cache_objuse(struct kmem_cache *cachep) ++{ ++ return cachep->objuse; ++} ++ ++EXPORT_SYMBOL(kmem_cache_objuse); ++ ++int kmem_obj_objuse(void *obj) ++{ ++ return kmem_cache_objuse(virt_to_head_page(obj)->slab); ++} ++ ++EXPORT_SYMBOL(kmem_obj_objuse); ++ ++int kmem_dname_objuse(void *obj) ++{ ++ struct kmem_cache *s; ++ ++ /* ++ * Allocations larger than PAGE_SIZE/2 go directly through ++ * __get_free_pages() and aren't associated with any cache. ++ */ ++ s = virt_to_head_page(obj)->slab; ++ if (!s) ++ return PAGE_SIZE; ++ return kmem_cache_objuse(s); ++} ++ ++#define page_ubs(pg) (pg->bc.slub_ubs) ++ ++struct user_beancounter **ub_slab_ptr(struct kmem_cache *s, void *obj) ++{ ++ struct page *pg; ++ ++ BUG_ON(!(s->flags & SLAB_UBC)); ++ pg = virt_to_head_page(obj); ++ return page_ubs(pg) + slab_index(obj, s, page_address(pg)); ++} ++ ++EXPORT_SYMBOL(ub_slab_ptr); ++ ++struct user_beancounter *slab_ub(void *obj) ++{ ++ struct page *pg; ++ ++ pg = virt_to_head_page(obj); ++ BUG_ON(!(pg->slab->flags & SLAB_UBC)); ++ return page_ubs(pg)[slab_index(obj, pg->slab, page_address(pg))]; ++} ++ ++EXPORT_SYMBOL(slab_ub); ++ ++void kmem_mark_nocharge(struct kmem_cache *cachep) ++{ ++ cachep->flags |= SLAB_NO_CHARGE; ++} ++#else ++static inline void inc_cache_grown(struct kmem_cache *s) ++{ ++} ++ ++static inline void dec_cache_grown(struct kmem_cache *s) ++{ ++} ++#endif ++ ++void show_slab_info(void) ++{ ++ /* FIXME - show it */ ++} ++ + #ifdef CONFIG_SLUB_DEBUG + /* + * Debug settings: +@@ -1106,6 +1199,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) + struct page *page; + struct kmem_cache_order_objects oo = s->oo; + ++ flags &= ~__GFP_UBC; + flags |= s->allocflags; + + page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node, +@@ -1128,9 +1222,12 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + 1 << oo_order(oo)); + ++ inc_cache_grown(s); + return page; + } + ++static void __free_slab(struct kmem_cache *s, struct page *page); ++ + static void setup_object(struct kmem_cache *s, struct page *page, + void *object) + { +@@ -1153,6 +1250,18 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) + if (!page) + goto out; + ++#ifdef CONFIG_BEANCOUNTERS ++ if (s->flags & SLAB_UBC) { ++ BUG_ON(page_ubs(page) != NULL); ++ page_ubs(page) = kzalloc(page->objects * sizeof(void *), ++ flags & ~__GFP_UBC); ++ if (page_ubs(page) == NULL) { ++ __free_slab(s, page); ++ page = NULL; ++ goto out; ++ } ++ } ++#endif + inc_slabs_node(s, page_to_nid(page), page->objects); + page->slab = s; + page->flags |= 1 << PG_slab; +@@ -1202,6 +1311,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page) + + __ClearPageSlab(page); + reset_page_mapcount(page); ++#ifdef CONFIG_BEANCOUNTERS ++ if (page_ubs(page) != NULL) { ++ BUG_ON(!(s->flags & SLAB_UBC)); ++ kfree(page_ubs(page)); ++ page_ubs(page) = NULL; ++ } ++#endif + __free_pages(page, order); + } + +@@ -1224,6 +1340,8 @@ static void free_slab(struct kmem_cache *s, struct page *page) + call_rcu(head, rcu_free_slab); + } else + __free_slab(s, page); ++ ++ dec_cache_grown(s); + } + + static void discard_slab(struct kmem_cache *s, struct page *page) +@@ -1642,6 +1760,13 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, + c->freelist = object[c->offset]; + stat(c, ALLOC_FASTPATH); + } ++ ++ if (object && should_charge(s, 
gfpflags) && ++ ub_slab_charge(s, object, gfpflags)) { ++ kmem_cache_free(s, object); ++ object = NULL; ++ } ++ + local_irq_restore(flags); + + if (unlikely((gfpflags & __GFP_ZERO) && object)) +@@ -1752,6 +1877,9 @@ static __always_inline void slab_free(struct kmem_cache *s, + local_irq_save(flags); + c = get_cpu_slab(s, smp_processor_id()); + debug_check_no_locks_freed(object, c->objsize); ++ ++ if (should_uncharge(s)) ++ ub_slab_uncharge(s, x); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(object, s->objsize); + if (likely(page == c->page && c->node >= 0)) { +@@ -2342,6 +2470,9 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, + #ifdef CONFIG_NUMA + s->remote_node_defrag_ratio = 100; + #endif ++#ifdef CONFIG_BEANCOUNTERS ++ s->objuse = s->size + (sizeof(struct page) / oo_objects(s->oo)); ++#endif + if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + goto error; + +@@ -2496,6 +2627,10 @@ EXPORT_SYMBOL(kmem_cache_destroy); + + struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned; + EXPORT_SYMBOL(kmalloc_caches); ++#ifdef CONFIG_BEANCOUNTERS ++struct kmem_cache ub_kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; ++EXPORT_SYMBOL(ub_kmalloc_caches); ++#endif + + static int __init setup_slub_min_order(char *str) + { +@@ -2537,6 +2672,11 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, + { + unsigned int flags = 0; + ++ if (gfp_flags & __GFP_UBC) { ++ flags = SLAB_UBC | SLAB_NO_CHARGE; ++ gfp_flags &= ~__GFP_UBC; ++ } ++ + if (gfp_flags & SLUB_DMA) + flags = SLAB_CACHE_DMA; + +@@ -2666,11 +2806,14 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) + index = fls(size - 1); + + #ifdef CONFIG_ZONE_DMA +- if (unlikely((flags & SLUB_DMA))) ++ if (unlikely((flags & SLUB_DMA))) { ++ BUG_ON(flags & __GFP_UBC); + return dma_kmalloc_cache(index, flags); ++ } + + #endif +- return &kmalloc_caches[index]; ++ ++ return __kmalloc_cache(flags, index); + } + + void *__kmalloc(size_t size, gfp_t flags) +@@ -2984,6 +3127,11 @@ void __init kmem_cache_init(void) + create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", + sizeof(struct kmem_cache_node), GFP_KERNEL); + kmalloc_caches[0].refcount = -1; ++#ifdef CONFIG_BEANCOUNTERS ++ create_kmalloc_cache(&ub_kmalloc_caches[0], "kmem_cache_node_ubc", ++ sizeof(struct kmem_cache_node), GFP_KERNEL_UBC); ++ ub_kmalloc_caches[0].refcount = -1; ++#endif + caches++; + + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); +@@ -2996,15 +3144,27 @@ void __init kmem_cache_init(void) + if (KMALLOC_MIN_SIZE <= 64) { + create_kmalloc_cache(&kmalloc_caches[1], + "kmalloc-96", 96, GFP_KERNEL); ++#ifdef CONFIG_BEANCOUNTERS ++ create_kmalloc_cache(&ub_kmalloc_caches[1], ++ "kmalloc-96-ubc", 96, GFP_KERNEL_UBC); ++#endif + caches++; + create_kmalloc_cache(&kmalloc_caches[2], + "kmalloc-192", 192, GFP_KERNEL); ++#ifdef CONFIG_BEANCOUNTERS ++ create_kmalloc_cache(&ub_kmalloc_caches[2], ++ "kmalloc-192-ubc", 192, GFP_KERNEL_UBC); ++#endif + caches++; + } + + for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) { + create_kmalloc_cache(&kmalloc_caches[i], + "kmalloc", 1 << i, GFP_KERNEL); ++#ifdef CONFIG_BEANCOUNTERS ++ create_kmalloc_cache(&ub_kmalloc_caches[i], ++ "kmalloc-ubc", 1 << i, GFP_KERNEL_UBC); ++#endif + caches++; + } + +@@ -3039,9 +3199,14 @@ void __init kmem_cache_init(void) + slab_state = UP; + + /* Provide the correct kmalloc names now that the caches are up */ +- for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) ++ for (i = KMALLOC_SHIFT_LOW; i <= 
PAGE_SHIFT; i++) { + kmalloc_caches[i]. name = + kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); ++#ifdef CONFIG_BEANCOUNTERS ++ ub_kmalloc_caches[i].name = ++ kasprintf(GFP_KERNEL, "kmalloc-%d-ubc", 1 << i); ++#endif ++ } + + #ifdef CONFIG_SMP + register_cpu_notifier(&slab_notifier); +@@ -4308,6 +4473,8 @@ static char *create_unique_id(struct kmem_cache *s) + *p++ = 'a'; + if (s->flags & SLAB_DEBUG_FREE) + *p++ = 'F'; ++ if (s->flags & SLAB_UBC) ++ *p++ = 'b'; + if (p != name + 1) + *p++ = '-'; + p += sprintf(p, "%07d", s->size); +diff --git a/mm/swap.c b/mm/swap.c +index 45c9f25..8160a2e 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -209,6 +209,7 @@ void lru_cache_add_active(struct page *page) + __pagevec_lru_add_active(pvec); + put_cpu_var(lru_add_active_pvecs); + } ++EXPORT_SYMBOL(lru_cache_add_active); + + /* + * Drain pages out of the cpu's pagevecs. +@@ -244,6 +245,8 @@ void lru_add_drain(void) + put_cpu(); + } + ++EXPORT_SYMBOL(lru_add_drain); ++ + #ifdef CONFIG_NUMA + static void lru_add_drain_per_cpu(struct work_struct *dummy) + { +diff --git a/mm/swap_state.c b/mm/swap_state.c +index d8aadaf..46cb3df 100644 +--- a/mm/swap_state.c ++++ b/mm/swap_state.c +@@ -20,6 +20,9 @@ + + #include + ++#include ++#include ++ + /* + * swapper_space is a fiction, retained to simplify the path through + * vmscan's shrink_page_list, to make sync_page look nicer, and to allow +@@ -44,6 +47,7 @@ struct address_space swapper_space = { + .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), + .backing_dev_info = &swap_backing_dev_info, + }; ++EXPORT_SYMBOL(swapper_space); + + #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) + +@@ -93,6 +97,8 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) + return error; + } + ++EXPORT_SYMBOL(add_to_swap_cache); ++ + /* + * This must be called only on pages that have + * been verified to be in the swap cache. 
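+ */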
+@@ -129,7 +135,14 @@ int add_to_swap(struct page * page, gfp_t gfp_mask) + BUG_ON(!PageUptodate(page)); + + for (;;) { +- entry = get_swap_page(); ++ struct user_beancounter *ub; ++ ++ ub = pb_grab_page_ub(page); ++ if (IS_ERR(ub)) ++ return 0; ++ ++ entry = get_swap_page(ub); ++ put_beancounter(ub); + if (!entry.val) + return 0; + +@@ -313,6 +326,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + return found_page; + } + ++EXPORT_SYMBOL(read_swap_cache_async); ++ + /** + * swapin_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory +diff --git a/mm/swapfile.c b/mm/swapfile.c +index bd1bb59..019db42 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -33,6 +33,8 @@ + #include + #include + ++#include ++ + DEFINE_SPINLOCK(swap_lock); + unsigned int nr_swapfiles; + long total_swap_pages; +@@ -44,8 +46,12 @@ static const char Bad_offset[] = "Bad swap offset entry "; + static const char Unused_offset[] = "Unused swap offset entry "; + + struct swap_list_t swap_list = {-1, -1}; ++struct swap_info_struct swap_info[MAX_SWAPFILES]; + +-static struct swap_info_struct swap_info[MAX_SWAPFILES]; ++EXPORT_SYMBOL(total_swap_pages); ++EXPORT_SYMBOL(swap_lock); ++EXPORT_SYMBOL(swap_list); ++EXPORT_SYMBOL(swap_info); + + static DEFINE_MUTEX(swapon_mutex); + +@@ -172,7 +178,7 @@ no_page: + return 0; + } + +-swp_entry_t get_swap_page(void) ++swp_entry_t get_swap_page(struct user_beancounter *ub) + { + struct swap_info_struct *si; + pgoff_t offset; +@@ -193,6 +199,8 @@ swp_entry_t get_swap_page(void) + wrapped++; + } + ++ if (si->flags & SWP_READONLY) ++ continue; + if (!si->highest_bit) + continue; + if (!(si->flags & SWP_WRITEOK)) +@@ -202,6 +210,7 @@ swp_entry_t get_swap_page(void) + offset = scan_swap_map(si); + if (offset) { + spin_unlock(&swap_lock); ++ ub_swapentry_inc(si, offset, ub); + return swp_entry(type, offset); + } + next = swap_list.next; +@@ -213,6 +222,8 @@ noswap: + return (swp_entry_t) {0}; + } + ++EXPORT_SYMBOL(get_swap_page); ++ + swp_entry_t get_swap_page_of_type(int type) + { + struct swap_info_struct *si; +@@ -220,7 +231,7 @@ swp_entry_t get_swap_page_of_type(int type) + + spin_lock(&swap_lock); + si = swap_info + type; +- if (si->flags & SWP_WRITEOK) { ++ if (si->flags & SWP_WRITEOK && !(si->flags & SWP_READONLY)) { + nr_swap_pages--; + offset = scan_swap_map(si); + if (offset) { +@@ -277,6 +288,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) + count--; + p->swap_map[offset] = count; + if (!count) { ++ ub_swapentry_dec(p, offset); + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) +@@ -305,6 +317,8 @@ void swap_free(swp_entry_t entry) + } + } + ++EXPORT_SYMBOL(swap_free); ++ + /* + * How many references to page are currently swapped out? + */ +@@ -386,6 +400,55 @@ int remove_exclusive_swap_page(struct page *page) + return retval; + } + ++int try_to_remove_exclusive_swap_page(struct page *page) ++{ ++ int retval; ++ struct swap_info_struct * p; ++ swp_entry_t entry; ++ ++ BUG_ON(PagePrivate(page)); ++ BUG_ON(!PageLocked(page)); ++ ++ if (!PageSwapCache(page)) ++ return 0; ++ if (PageWriteback(page)) ++ return 0; ++ if (page_count(page) != 2) /* 2: us + cache */ ++ return 0; ++ ++ entry.val = page->private; ++ p = swap_info_get(entry); ++ if (!p) ++ return 0; ++ ++ if (!vm_swap_full() && ++ (p->flags & (SWP_ACTIVE|SWP_READONLY)) == SWP_ACTIVE) { ++ spin_unlock(&swap_lock); ++ return 0; ++ } ++ ++ /* Is the only swap cache user the cache itself? 
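++ * A swap_map count of 1 means the swap cache holds the only
++ * reference; the page count is then re-checked under the
++ * swapper_space tree_lock before the cache entry is dropped and
++ * the page re-dirtied.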
*/ ++ retval = 0; ++ if (p->swap_map[swp_offset(entry)] == 1) { ++ /* Recheck the page count with the swapcache lock held.. */ ++ write_lock_irq(&swapper_space.tree_lock); ++ if ((page_count(page) == 2) && !PageWriteback(page)) { ++ __delete_from_swap_cache(page); ++ SetPageDirty(page); ++ retval = 1; ++ } ++ write_unlock_irq(&swapper_space.tree_lock); ++ } ++ spin_unlock(&swap_lock); ++ ++ if (retval) { ++ swap_free(entry); ++ page_cache_release(page); ++ } ++ ++ return retval; ++} ++ + /* + * Free the swap entry like above, but also try to + * free the page cache entry if it is the last user. +@@ -425,6 +488,7 @@ void free_swap_and_cache(swp_entry_t entry) + page_cache_release(page); + } + } ++EXPORT_SYMBOL(free_swap_and_cache); + + #ifdef CONFIG_HIBERNATION + /* +@@ -508,11 +572,13 @@ unsigned int count_swap_pages(int type, int free) + * force COW, vm_page_prot omits write permission from any private vma. + */ + static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, +- unsigned long addr, swp_entry_t entry, struct page *page) ++ unsigned long addr, swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + spinlock_t *ptl; + pte_t *pte; + int ret = 1; ++ struct mm_struct *mm = vma->vm_mm; + + if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) + ret = -ENOMEM; +@@ -525,9 +591,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, + goto out; + } + +- inc_mm_counter(vma->vm_mm, anon_rss); ++ inc_mm_counter(mm, anon_rss); ++ ub_unused_privvm_dec(mm, vma); ++ pb_add_ref(page, mm, pb); + get_page(page); +- set_pte_at(vma->vm_mm, addr, pte, ++ set_pte_at(mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); + page_add_anon_rmap(page, vma, addr); + swap_free(entry); +@@ -543,7 +611,8 @@ out: + + static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, +- swp_entry_t entry, struct page *page) ++ swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + pte_t swp_pte = swp_entry_to_pte(entry); + pte_t *pte; +@@ -566,7 +635,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + */ + if (unlikely(pte_same(*pte, swp_pte))) { + pte_unmap(pte); +- ret = unuse_pte(vma, pmd, addr, entry, page); ++ ret = unuse_pte(vma, pmd, addr, entry, page, pb); + if (ret) + goto out; + pte = pte_offset_map(pmd, addr); +@@ -579,7 +648,8 @@ out: + + static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, +- swp_entry_t entry, struct page *page) ++ swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + pmd_t *pmd; + unsigned long next; +@@ -590,7 +660,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; +- ret = unuse_pte_range(vma, pmd, addr, next, entry, page); ++ ret = unuse_pte_range(vma, pmd, addr, next, entry, page, pb); + if (ret) + return ret; + } while (pmd++, addr = next, addr != end); +@@ -599,7 +669,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, + + static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, +- swp_entry_t entry, struct page *page) ++ swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + pud_t *pud; + unsigned long next; +@@ -610,7 +681,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + next = pud_addr_end(addr, end); + if 
(pud_none_or_clear_bad(pud)) + continue; +- ret = unuse_pmd_range(vma, pud, addr, next, entry, page); ++ ret = unuse_pmd_range(vma, pud, addr, next, entry, page, pb); + if (ret) + return ret; + } while (pud++, addr = next, addr != end); +@@ -618,7 +689,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + } + + static int unuse_vma(struct vm_area_struct *vma, +- swp_entry_t entry, struct page *page) ++ swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + pgd_t *pgd; + unsigned long addr, end, next; +@@ -640,7 +712,7 @@ static int unuse_vma(struct vm_area_struct *vma, + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; +- ret = unuse_pud_range(vma, pgd, addr, next, entry, page); ++ ret = unuse_pud_range(vma, pgd, addr, next, entry, page, pb); + if (ret) + return ret; + } while (pgd++, addr = next, addr != end); +@@ -648,7 +720,8 @@ static int unuse_vma(struct vm_area_struct *vma, + } + + static int unuse_mm(struct mm_struct *mm, +- swp_entry_t entry, struct page *page) ++ swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + struct vm_area_struct *vma; + int ret = 0; +@@ -664,7 +737,7 @@ static int unuse_mm(struct mm_struct *mm, + lock_page(page); + } + for (vma = mm->mmap; vma; vma = vma->vm_next) { +- if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) ++ if (vma->anon_vma && (ret = unuse_vma(vma, entry, page, pb))) + break; + } + up_read(&mm->mmap_sem); +@@ -726,6 +799,7 @@ static int try_to_unuse(unsigned int type) + int retval = 0; + int reset_overflow = 0; + int shmem; ++ struct page_beancounter *pb; + + /* + * When searching mms for an entry, a good strategy is to +@@ -778,6 +852,13 @@ static int try_to_unuse(unsigned int type) + break; + } + ++ pb = NULL; ++ if (pb_alloc_all(&pb)) { ++ page_cache_release(page); ++ retval = -ENOMEM; ++ break; ++ } ++ + /* + * Don't hold on to start_mm if it looks like exiting. + */ +@@ -800,6 +881,20 @@ static int try_to_unuse(unsigned int type) + lock_page(page); + wait_on_page_writeback(page); + ++ /* If read failed we cannot map not-uptodate page to ++ * user space. Actually, we are in serious troubles, ++ * we do not even know what process to kill. So, the only ++ * variant remains: to stop swapoff() and allow someone ++ * to kill processes to zap invalid pages. ++ */ ++ if (unlikely(!PageUptodate(page))) { ++ pb_free_list(&pb); ++ unlock_page(page); ++ page_cache_release(page); ++ retval = -EIO; ++ break; ++ } ++ + /* + * Remove all references to entry. 
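+ * (The page_beancounter list preallocated with pb_alloc_all()
+ * above is threaded down through unuse_mm(), apparently so that
+ * unuse_pte() can attach a UB reference to each re-established
+ * mapping via pb_add_ref() without allocating under the page
+ * table lock.)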
+ * Whenever we reach init_mm, there's no address space +@@ -811,7 +906,7 @@ static int try_to_unuse(unsigned int type) + if (start_mm == &init_mm) + shmem = shmem_unuse(entry, page); + else +- retval = unuse_mm(start_mm, entry, page); ++ retval = unuse_mm(start_mm, entry, page, &pb); + } + if (*swap_map > 1) { + int set_start_mm = (*swap_map >= swcount); +@@ -841,7 +936,7 @@ static int try_to_unuse(unsigned int type) + set_start_mm = 1; + shmem = shmem_unuse(entry, page); + } else +- retval = unuse_mm(mm, entry, page); ++ retval = unuse_mm(mm, entry, page, &pb); + if (set_start_mm && *swap_map < swcount) { + mmput(new_start_mm); + atomic_inc(&mm->mm_users); +@@ -862,6 +957,8 @@ static int try_to_unuse(unsigned int type) + retval = shmem; + break; + } ++ ++ pb_free_list(&pb); + if (retval) { + unlock_page(page); + page_cache_release(page); +@@ -1214,6 +1311,10 @@ asmlinkage long sys_swapoff(const char __user * specialfile) + int i, type, prev; + int err; + ++ /* VE admin check is just to be on the safe side, the admin may affect ++ * swaps only if he has access to special, i.e. if he has been granted ++ * access to the block device or if the swap file is in the area ++ * visible to him. */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + +@@ -1313,6 +1414,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) + spin_unlock(&swap_lock); + mutex_unlock(&swapon_mutex); + vfree(swap_map); ++ ub_swap_fini(p); + inode = mapping->host; + if (S_ISBLK(inode->i_mode)) { + struct block_device *bdev = I_BDEV(inode); +@@ -1332,6 +1434,8 @@ out: + return err; + } + ++EXPORT_SYMBOL(sys_swapoff); ++ + #ifdef CONFIG_PROC_FS + /* iterator */ + static void *swap_start(struct seq_file *swap, loff_t *pos) +@@ -1426,7 +1530,7 @@ static const struct file_operations proc_swaps_operations = { + + static int __init procswaps_init(void) + { +- proc_create("swaps", 0, NULL, &proc_swaps_operations); ++ proc_create("swaps", 0, &glob_proc_root, &proc_swaps_operations); + return 0; + } + __initcall(procswaps_init); +@@ -1670,9 +1774,16 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) + goto bad_swap; + } + ++ if (ub_swap_init(p, maxpages)) { ++ error = -ENOMEM; ++ goto bad_swap; ++ } ++ + mutex_lock(&swapon_mutex); + spin_lock(&swap_lock); + p->flags = SWP_ACTIVE; ++ if (swap_flags & SWAP_FLAG_READONLY) ++ p->flags |= SWP_READONLY; + nr_swap_pages += nr_good_pages; + total_swap_pages += nr_good_pages; + +@@ -1732,6 +1843,8 @@ out: + return error; + } + ++EXPORT_SYMBOL(sys_swapon); ++ + void si_swapinfo(struct sysinfo *val) + { + unsigned int i; +@@ -1791,6 +1904,8 @@ bad_file: + goto out; + } + ++EXPORT_SYMBOL(swap_duplicate); ++ + struct swap_info_struct * + get_swap_info_struct(unsigned type) + { +diff --git a/mm/truncate.c b/mm/truncate.c +index b8961cb..d2f3e40 100644 +--- a/mm/truncate.c ++++ b/mm/truncate.c +@@ -77,6 +77,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) + BDI_RECLAIMABLE); + if (account_size) + task_io_account_cancelled_write(account_size); ++ ub_io_release_context(page, account_size); + } + } + } +diff --git a/mm/vmalloc.c b/mm/vmalloc.c +index 6e45b0f..555f735 100644 +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -22,6 +22,9 @@ + #include + #include + ++#include ++#include ++ + + DEFINE_RWLOCK(vmlist_lock); + struct vm_struct *vmlist; +@@ -334,6 +337,70 @@ static struct vm_struct *__find_vm_area(const void *addr) + return tmp; + } + ++struct vm_struct * get_vm_area_best(unsigned long size, unsigned long flags) ++{ ++ unsigned long 
addr, best_addr, delta, best_delta; ++ struct vm_struct **p, **best_p, *tmp, *area; ++ ++ area = kmalloc(sizeof(*area), GFP_KERNEL); ++ if (!area) ++ return NULL; ++ ++ size += PAGE_SIZE; /* one-page gap at the end */ ++ addr = VMALLOC_START; ++ best_addr = 0UL; ++ best_p = NULL; ++ best_delta = PAGE_ALIGN(VMALLOC_END) - VMALLOC_START; ++ ++ write_lock(&vmlist_lock); ++ for (p = &vmlist; (tmp = *p) && ++ (tmp->addr <= (void *)PAGE_ALIGN(VMALLOC_END)); ++ p = &tmp->next) { ++ if ((unsigned long)tmp->addr < addr) ++ continue; ++ if ((size + addr) < addr) ++ break; ++ delta = (unsigned long) tmp->addr - (size + addr); ++ if (delta < best_delta) { ++ best_delta = delta; ++ best_addr = addr; ++ best_p = p; ++ } ++ addr = tmp->size + (unsigned long)tmp->addr; ++ if (addr > VMALLOC_END-size) ++ break; ++ } ++ ++ if (!tmp || (tmp->addr > (void *)PAGE_ALIGN(VMALLOC_END))) { ++ /* check free area after list end */ ++ delta = (unsigned long) PAGE_ALIGN(VMALLOC_END) - (size + addr); ++ if (delta < best_delta) { ++ best_delta = delta; ++ best_addr = addr; ++ best_p = p; ++ } ++ } ++ if (best_addr) { ++ area->flags = flags; ++ /* allocate at the end of this area */ ++ area->addr = (void *)(best_addr + best_delta); ++ area->size = size; ++ area->next = *best_p; ++ area->pages = NULL; ++ area->nr_pages = 0; ++ area->phys_addr = 0; ++ *best_p = area; ++ /* check like in __vunmap */ ++ WARN_ON((PAGE_SIZE - 1) & (unsigned long)area->addr); ++ } else { ++ kfree(area); ++ area = NULL; ++ } ++ write_unlock(&vmlist_lock); ++ ++ return area; ++} ++ + /* Caller must hold vmlist_lock */ + static struct vm_struct *__remove_vm_area(const void *addr) + { +@@ -373,7 +440,7 @@ struct vm_struct *remove_vm_area(const void *addr) + return v; + } + +-static void __vunmap(const void *addr, int deallocate_pages) ++static void __vunmap(const void *addr, int deallocate_pages, int uncharge) + { + struct vm_struct *area; + +@@ -400,6 +467,8 @@ static void __vunmap(const void *addr, int deallocate_pages) + if (deallocate_pages) { + int i; + ++ if (uncharge) ++ dec_vmalloc_charged(area); + for (i = 0; i < area->nr_pages; i++) { + struct page *page = area->pages[i]; + +@@ -430,7 +499,7 @@ static void __vunmap(const void *addr, int deallocate_pages) + void vfree(const void *addr) + { + BUG_ON(in_interrupt()); +- __vunmap(addr, 1); ++ __vunmap(addr, 1, 1); + } + EXPORT_SYMBOL(vfree); + +@@ -446,7 +515,7 @@ EXPORT_SYMBOL(vfree); + void vunmap(const void *addr) + { + BUG_ON(in_interrupt()); +- __vunmap(addr, 0); ++ __vunmap(addr, 0, 0); + } + EXPORT_SYMBOL(vunmap); + +@@ -528,10 +597,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, + + if (map_vm_area(area, prot, &pages)) + goto fail; ++ ++ inc_vmalloc_charged(area, gfp_mask); + return area->addr; + + fail: +- vfree(area->addr); ++ __vunmap(area->addr, 1, 0); + return NULL; + } + +@@ -578,6 +649,22 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) + } + EXPORT_SYMBOL(__vmalloc); + ++static void *____vmalloc(unsigned long size, gfp_t mask, pgprot_t prot, ++ void *caller) ++{ ++ struct vm_struct *area; ++ ++ size = PAGE_ALIGN(size); ++ if (!size || (size >> PAGE_SHIFT) > num_physpages) ++ return NULL; ++ ++ area = get_vm_area_best(size, VM_ALLOC); ++ if (!area) ++ return NULL; ++ ++ return __vmalloc_area_node(area, mask, prot, -1, caller); ++} ++ + /** + * vmalloc - allocate virtually contiguous memory + * @size: allocation size +@@ -594,6 +681,28 @@ void *vmalloc(unsigned long size) + } + EXPORT_SYMBOL(vmalloc); + ++void 
*ub_vmalloc(unsigned long size) ++{ ++ return __vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL); ++} ++EXPORT_SYMBOL(ub_vmalloc); ++ ++void *vmalloc_best(unsigned long size) ++{ ++ return ____vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, ++ __builtin_return_address(0)); ++} ++ ++EXPORT_SYMBOL(vmalloc_best); ++ ++void *ub_vmalloc_best(unsigned long size) ++{ ++ return ____vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL, ++ __builtin_return_address(0)); ++} ++ ++EXPORT_SYMBOL(ub_vmalloc_best); ++ + /** + * vmalloc_user - allocate zeroed virtually contiguous memory for userspace + * @size: allocation size +@@ -635,6 +744,13 @@ void *vmalloc_node(unsigned long size, int node) + } + EXPORT_SYMBOL(vmalloc_node); + ++void *ub_vmalloc_node(unsigned long size, int node) ++{ ++ return __vmalloc_node(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL, ++ node, __builtin_return_address(0)); ++} ++EXPORT_SYMBOL(ub_vmalloc_node); ++ + #ifndef PAGE_KERNEL_EXEC + # define PAGE_KERNEL_EXEC PAGE_KERNEL + #endif +@@ -898,6 +1014,39 @@ void free_vm_area(struct vm_struct *area) + } + EXPORT_SYMBOL_GPL(free_vm_area); + ++void vprintstat(void) ++{ ++ struct vm_struct *p, *last_p = NULL; ++ unsigned long addr, size, free_size, max_free_size; ++ int num; ++ ++ addr = VMALLOC_START; ++ size = max_free_size = 0; ++ num = 0; ++ ++ read_lock(&vmlist_lock); ++ for (p = vmlist; p; p = p->next) { ++ free_size = (unsigned long)p->addr - addr; ++ if (free_size > max_free_size) ++ max_free_size = free_size; ++ addr = (unsigned long)p->addr + p->size; ++ size += p->size; ++ ++num; ++ last_p = p; ++ } ++ if (last_p) { ++ free_size = VMALLOC_END - ++ ((unsigned long)last_p->addr + last_p->size); ++ if (free_size > max_free_size) ++ max_free_size = free_size; ++ } ++ read_unlock(&vmlist_lock); ++ ++ printk("VMALLOC Used: %luKB Total: %luKB Entries: %d\n" ++ " Max_Free: %luKB Start: %lx End: %lx\n", ++ size/1024, (VMALLOC_END - VMALLOC_START)/1024, num, ++ max_free_size/1024, VMALLOC_START, VMALLOC_END); ++} + + #ifdef CONFIG_PROC_FS + static void *s_start(struct seq_file *m, loff_t *pos) +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 967d30c..8e05cd3 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -39,10 +39,14 @@ + #include + #include + ++#include ++#include ++ + #include + #include + + #include ++#include + + #include "internal.h" + +@@ -177,6 +181,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, + if (scanned == 0) + scanned = SWAP_CLUSTER_MAX; + ++ if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) ++ return 1; ++ + if (!down_read_trylock(&shrinker_rwsem)) + return 1; /* Assume we'll be able to shrink next time */ + +@@ -211,6 +218,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, + int shrink_ret; + int nr_before; + ++ if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) ++ goto done; ++ + nr_before = (*shrinker->shrink)(0, gfp_mask); + shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); + if (shrink_ret == -1) +@@ -225,6 +235,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, + + shrinker->nr += total_scan; + } ++done: + up_read(&shrinker_rwsem); + return ret; + } +@@ -338,6 +349,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, + */ + if (PagePrivate(page)) { + if (try_to_free_buffers(page)) { ++ ub_io_release_context(page, 0); + ClearPageDirty(page); + printk("%s: orphaned page\n", __func__); + return PAGE_CLEAN; +@@ -1073,6 +1085,7 @@ static void shrink_active_list(unsigned long nr_pages, 
struct zone *zone, + if (sc->may_swap) + reclaim_mapped = calc_reclaim_mapped(sc, zone, priority); + ++ {KSTAT_PERF_ENTER(refill_inact) + lru_add_drain(); + spin_lock_irq(&zone->lru_lock); + pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, +@@ -1162,6 +1175,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, + spin_unlock_irq(&zone->lru_lock); + + pagevec_release(&pvec); ++ KSTAT_PERF_LEAVE(refill_inact)} + } + + /* +@@ -1214,6 +1228,8 @@ static unsigned long shrink_zone(int priority, struct zone *zone, + nr_to_scan = min(nr_active, + (unsigned long)sc->swap_cluster_max); + nr_active -= nr_to_scan; ++ if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) ++ goto done; + shrink_active_list(nr_to_scan, zone, sc, priority); + } + +@@ -1221,12 +1237,15 @@ static unsigned long shrink_zone(int priority, struct zone *zone, + nr_to_scan = min(nr_inactive, + (unsigned long)sc->swap_cluster_max); + nr_inactive -= nr_to_scan; ++ if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) ++ goto done; + nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, + sc); + } + } + + throttle_vm_writeout(sc->gfp_mask); ++done: + return nr_reclaimed; + } + +@@ -1282,6 +1301,9 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, + } + + nr_reclaimed += shrink_zone(priority, zone, sc); ++ ++ if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) ++ break; + } + + return nr_reclaimed; +@@ -1316,8 +1338,11 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, + struct zone *zone; + enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); + ++ KSTAT_PERF_ENTER(ttfp); + if (scan_global_lru(sc)) + count_vm_event(ALLOCSTALL); ++ ++ ub_oom_start(); + /* + * mem_cgroup will not do shrink_slab. 
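KSTAT_PERF_ENTER/KSTAT_PERF_LEAVE above are OpenVZ latency probes around the active-list refill; the literal braces at the call sites exist because ENTER declares a local start timestamp, which C90 only permits at the top of a block. A rough idea of such a macro pair (this expansion and kstat_lat_add() are guesses for illustration, not the real kstat implementation):

	/* Assumed shape only: ENTER declares the start stamp that
	 * LEAVE reads, hence the {...} wrapper at each use site.
	 */
	#define KSTAT_PERF_ENTER(ev)	cycles_t __kstat_##ev = get_cycles();
	#define KSTAT_PERF_LEAVE(ev)	kstat_lat_add(ev, get_cycles() - __kstat_##ev);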
+ */ +@@ -1367,6 +1392,11 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, + sc->may_writepage = 1; + } + ++ if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE))) { ++ ret = 1; ++ goto out; ++ } ++ + /* Take a nap, wait for some writeback to complete */ + if (sc->nr_scanned && priority < DEF_PRIORITY - 2) + congestion_wait(WRITE, HZ/10); +@@ -1396,6 +1426,7 @@ out: + } else + mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); + ++ KSTAT_PERF_LEAVE(ttfp); + return ret; + } + +diff --git a/mm/vmstat.c b/mm/vmstat.c +index db9eabb..fa80228 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -14,6 +14,40 @@ + #include + #include + #include ++#include ++ ++void __get_zone_counts(unsigned long *active, unsigned long *inactive, ++ unsigned long *free, struct pglist_data *pgdat) ++{ ++ struct zone *zones = pgdat->node_zones; ++ int i; ++ ++ *active = 0; ++ *inactive = 0; ++ *free = 0; ++ for (i = 0; i < MAX_NR_ZONES; i++) { ++ *active += zone_page_state(&zones[i], NR_ACTIVE); ++ *inactive += zone_page_state(&zones[i], NR_INACTIVE); ++ *free += zone_page_state(&zones[i], NR_FREE_PAGES); ++ } ++} ++ ++void get_zone_counts(unsigned long *active, ++ unsigned long *inactive, unsigned long *free) ++{ ++ struct pglist_data *pgdat; ++ ++ *active = 0; ++ *inactive = 0; ++ *free = 0; ++ for_each_online_pgdat(pgdat) { ++ unsigned long l, m, n; ++ __get_zone_counts(&l, &m, &n, pgdat); ++ *active += l; ++ *inactive += m; ++ *free += n; ++ } ++} + + #ifdef CONFIG_VM_EVENT_COUNTERS + DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; +@@ -34,6 +68,20 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) + } + } + ++unsigned long vm_events(enum vm_event_item i) ++{ ++ int cpu; ++ unsigned long sum; ++ struct vm_event_state *st; ++ ++ sum = 0; ++ for_each_online_cpu(cpu) { ++ st = &per_cpu(vm_event_states, cpu); ++ sum += st->event[i]; ++ } ++ ++ return (sum < 0 ? 0 : sum); ++} + /* + * Accumulate the vm event counters across all CPUs. 
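vm_events() added above folds one event counter across the online CPUs using the standard per-CPU summation pattern. Note that since sum is an unsigned long, the trailing (sum < 0 ? 0 : sum) in the patch can never trigger and is dead code. A cleaner equivalent:

	/* Equivalent fold without the impossible negative check:
	 * an unsigned accumulator cannot drop below zero.
	 */
	unsigned long vm_events(enum vm_event_item i)
	{
		unsigned long sum = 0;
		int cpu;

		for_each_online_cpu(cpu)
			sum += per_cpu(vm_event_states, cpu).event[i];
		return sum;
	}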
+ * The result is unavoidably approximate - it can change +@@ -745,30 +793,40 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) + unsigned long *v; + #ifdef CONFIG_VM_EVENT_COUNTERS + unsigned long *e; ++#define VMSTAT_BUFSIZE (NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + \ ++ sizeof(struct vm_event_state)) ++#else ++#define VMSTAT_BUFSIZE (NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)) + #endif + int i; + + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + +-#ifdef CONFIG_VM_EVENT_COUNTERS +- v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +- + sizeof(struct vm_event_state), GFP_KERNEL); +-#else +- v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), +- GFP_KERNEL); +-#endif ++ v = kmalloc(VMSTAT_BUFSIZE, GFP_KERNEL); + m->private = v; + if (!v) + return ERR_PTR(-ENOMEM); +- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) +- v[i] = global_page_state(i); ++ ++ if (ve_is_super(get_exec_env())) { ++ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) ++ v[i] = global_page_state(i); + #ifdef CONFIG_VM_EVENT_COUNTERS +- e = v + NR_VM_ZONE_STAT_ITEMS; +- all_vm_events(e); +- e[PGPGIN] /= 2; /* sectors -> kbytes */ +- e[PGPGOUT] /= 2; ++ e = v + NR_VM_ZONE_STAT_ITEMS; ++ all_vm_events(e); ++ e[PGPGIN] /= 2; /* sectors -> kbytes */ ++ e[PGPGOUT] /= 2; + #endif ++ } else ++ memset(v, 0, VMSTAT_BUFSIZE); ++ ++ if (virtinfo_notifier_call(VITYPE_GENERAL, ++ VIRTINFO_VMSTAT, v) & NOTIFY_FAIL) { ++ kfree(v); ++ m->private = NULL; ++ return ERR_PTR(-ENOMSG); ++ } ++ + return v + *pos; + } + +diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c +index ab2225d..27de02f 100644 +--- a/net/8021q/vlan.c ++++ b/net/8021q/vlan.c +@@ -107,7 +107,7 @@ static struct vlan_group *vlan_group_alloc(struct net_device *real_dev) + { + struct vlan_group *grp; + +- grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL); ++ grp = kzalloc(sizeof(struct vlan_group), GFP_KERNEL_UBC); + if (!grp) + return NULL; + +@@ -129,7 +129,7 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg, int vid) + return 0; + + size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; +- array = kzalloc(size, GFP_KERNEL); ++ array = kzalloc(size, GFP_KERNEL_UBC); + if (array == NULL) + return -ENOBUFS; + +@@ -148,6 +148,7 @@ void unregister_vlan_dev(struct net_device *dev) + struct net_device *real_dev = vlan->real_dev; + struct vlan_group *grp; + unsigned short vlan_id = vlan->vlan_id; ++ struct ve_struct *env; + + ASSERT_RTNL(); + +@@ -179,7 +180,9 @@ void unregister_vlan_dev(struct net_device *dev) + /* Get rid of the vlan's reference to real_dev */ + dev_put(real_dev); + ++ env = set_exec_env(dev->owner_env); + unregister_netdevice(dev); ++ set_exec_env(env); + } + + static void vlan_transfer_operstate(const struct net_device *dev, +@@ -527,6 +530,17 @@ static struct notifier_block vlan_notifier_block __read_mostly = { + .notifier_call = vlan_device_event, + }; + ++static inline int vlan_check_caps(void) ++{ ++ if (capable(CAP_NET_ADMIN)) ++ return 1; ++#ifdef CONFIG_VE ++ if (capable(CAP_VE_NET_ADMIN)) ++ return 1; ++#endif ++ return 0; ++} ++ + /* + * VLAN IOCTL handler. 
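vlan_check_caps() just above is the template for a change repeated throughout this patch: paths that used to demand the global CAP_NET_ADMIN now also accept CAP_VE_NET_ADMIN, OpenVZ's per-container network-admin capability, so a container's root can manage its own VLANs without host-wide privilege. The same test appears open-coded in later hunks:

	/* Recurring capability check; CAP_VE_NET_ADMIN exists only
	 * under CONFIG_VE, hence the #ifdef in vlan_check_caps().
	 */
	if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
		return -EPERM;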
+ * o execute requested action or pass command to the device driver +@@ -570,7 +584,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) + switch (args.cmd) { + case SET_VLAN_INGRESS_PRIORITY_CMD: + err = -EPERM; +- if (!capable(CAP_NET_ADMIN)) ++ if (!vlan_check_caps()) + break; + vlan_dev_set_ingress_priority(dev, + args.u.skb_priority, +@@ -580,7 +594,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) + + case SET_VLAN_EGRESS_PRIORITY_CMD: + err = -EPERM; +- if (!capable(CAP_NET_ADMIN)) ++ if (!vlan_check_caps()) + break; + err = vlan_dev_set_egress_priority(dev, + args.u.skb_priority, +@@ -589,7 +603,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) + + case SET_VLAN_FLAG_CMD: + err = -EPERM; +- if (!capable(CAP_NET_ADMIN)) ++ if (!vlan_check_caps()) + break; + err = vlan_dev_set_vlan_flag(dev, + args.u.flag, +@@ -598,7 +612,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) + + case SET_VLAN_NAME_TYPE_CMD: + err = -EPERM; +- if (!capable(CAP_NET_ADMIN)) ++ if (!vlan_check_caps()) + break; + if ((args.u.name_type >= 0) && + (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) { +@@ -614,14 +628,14 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg) + + case ADD_VLAN_CMD: + err = -EPERM; +- if (!capable(CAP_NET_ADMIN)) ++ if (!vlan_check_caps()) + break; + err = register_vlan_device(dev, args.u.VID); + break; + + case DEL_VLAN_CMD: + err = -EPERM; +- if (!capable(CAP_NET_ADMIN)) ++ if (!vlan_check_caps()) + break; + unregister_vlan_dev(dev); + err = 0; +diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c +index 5d055c2..a05a47e 100644 +--- a/net/8021q/vlan_dev.c ++++ b/net/8021q/vlan_dev.c +@@ -361,6 +361,7 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, + + static int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) + { ++ struct ve_struct *env; + struct net_device_stats *stats = &dev->stats; + struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); + +@@ -413,7 +414,10 @@ static int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) + stats->tx_bytes += skb->len; + + skb->dev = vlan_dev_info(dev)->real_dev; ++ skb->owner_env = skb->dev->owner_env; ++ env = set_exec_env(skb->owner_env); + dev_queue_xmit(skb); ++ set_exec_env(env); + + return 0; + } +@@ -421,6 +425,7 @@ static int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) + static int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, + struct net_device *dev) + { ++ struct ve_struct *env; + struct net_device_stats *stats = &dev->stats; + unsigned short veth_TCI; + +@@ -438,7 +443,10 @@ static int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, + stats->tx_bytes += skb->len; + + skb->dev = vlan_dev_info(dev)->real_dev; ++ skb->owner_env = skb->dev->owner_env; ++ env = set_exec_env(skb->owner_env); + dev_queue_xmit(skb); ++ set_exec_env(env); + + return 0; + } +@@ -725,4 +733,6 @@ void vlan_setup(struct net_device *dev) + dev->destructor = free_netdev; + + memset(dev->broadcast, 0, ETH_ALEN); ++ if (!ve_is_super(get_exec_env())) ++ dev->features |= NETIF_F_VIRTUAL; + } +diff --git a/net/Kconfig b/net/Kconfig +index acbf7c6..9aad03b 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -30,7 +30,7 @@ menu "Networking options" + config NET_NS + bool "Network namespace support" + default n +- depends on EXPERIMENTAL && !SYSFS && NAMESPACES ++ depends on EXPERIMENTAL && NAMESPACES + help + Allow user space to create what appear to be multiple 
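The VLAN xmit hunks above follow a save/restore discipline, analogous to local_irq_save()/restore(): the skb is re-tagged with the real device's owner VE, the execution environment is switched for the duration of dev_queue_xmit(), then restored, so accounting and netfilter on the lower device run in the correct container context:

	/* Save/restore of the per-task execution environment around
	 * a call into a device that may belong to another VE.
	 */
	struct ve_struct *env;

	skb->owner_env = skb->dev->owner_env;
	env = set_exec_env(skb->owner_env);
	dev_queue_xmit(skb);
	set_exec_env(env);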
instances + of the network stack. +diff --git a/net/bridge/br.c b/net/bridge/br.c +index 8f3c58e..8e51412 100644 +--- a/net/bridge/br.c ++++ b/net/bridge/br.c +@@ -55,6 +55,7 @@ static int __init br_init(void) + + brioctl_set(br_ioctl_deviceless_stub); + br_handle_frame_hook = br_handle_frame; ++ br_hard_xmit_hook = br_xmit; + + br_fdb_get_hook = br_fdb_get; + br_fdb_put_hook = br_fdb_put; +@@ -89,6 +90,7 @@ static void __exit br_deinit(void) + br_fdb_put_hook = NULL; + + br_handle_frame_hook = NULL; ++ br_hard_xmit_hook = NULL; + br_fdb_fini(); + } + +diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c +index bf77873..38f3441 100644 +--- a/net/bridge/br_device.c ++++ b/net/bridge/br_device.c +@@ -40,16 +40,47 @@ int br_dev_xmit(struct sk_buff *skb, struct net_device *dev) + skb_reset_mac_header(skb); + skb_pull(skb, ETH_HLEN); + ++ skb->brmark = BR_ALREADY_SEEN; ++ + if (dest[0] & 1) + br_flood_deliver(br, skb); + else if ((dst = __br_fdb_get(br, dest)) != NULL) +- br_deliver(dst->dst, skb); ++ br_deliver(dst->dst, skb, 1); + else + br_flood_deliver(br, skb); + + return 0; + } + ++int br_xmit(struct sk_buff *skb, struct net_bridge_port *port) ++{ ++ struct net_bridge *br = port->br; ++ const unsigned char *dest = skb->data; ++ struct net_bridge_fdb_entry *dst; ++ ++ if (!br->via_phys_dev) ++ return 0; ++ ++ br->statistics.tx_packets++; ++ br->statistics.tx_bytes += skb->len; ++ ++ skb_reset_mac_header(skb); ++ skb_pull(skb, ETH_HLEN); ++ ++ skb->brmark = BR_ALREADY_SEEN; ++ ++ if (dest[0] & 1) ++ br_xmit_deliver(br, port, skb); ++ else if ((dst = __br_fdb_get(br, dest)) != NULL) ++ br_deliver(dst->dst, skb, 0); ++ else ++ br_xmit_deliver(br, port, skb); ++ ++ skb_push(skb, ETH_HLEN); ++ ++ return 0; ++} ++ + static int br_dev_open(struct net_device *dev) + { + struct net_bridge *br = netdev_priv(dev); +diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c +index bdd7c35..2517cd4 100644 +--- a/net/bridge/br_forward.c ++++ b/net/bridge/br_forward.c +@@ -78,14 +78,24 @@ static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) + } + + /* called with rcu_read_lock */ +-void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) ++void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb, int free) + { + if (should_deliver(to, skb)) { ++ if (!free) { ++ struct sk_buff *skb2; ++ ++ if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) { ++ to->br->statistics.tx_dropped++; ++ return; ++ } ++ skb = skb2; ++ } + __br_deliver(to, skb); + return; + } + +- kfree_skb(skb); ++ if (free) ++ kfree_skb(skb); + } + + /* called with rcu_read_lock */ +@@ -101,6 +111,7 @@ void br_forward(const struct net_bridge_port *to, struct sk_buff *skb) + + /* called under bridge lock */ + static void br_flood(struct net_bridge *br, struct sk_buff *skb, ++ int free, + void (*__packet_hook)(const struct net_bridge_port *p, + struct sk_buff *skb)) + { +@@ -132,18 +143,41 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, + return; + } + +- kfree_skb(skb); ++ if (free) ++ kfree_skb(skb); + } + + + /* called with rcu_read_lock */ + void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb) + { +- br_flood(br, skb, __br_deliver); ++ br_flood(br, skb, 1, __br_deliver); ++} ++ ++/* called with rcu_read_lock */ ++void br_xmit_deliver(struct net_bridge *br, struct net_bridge_port *port, ++ struct sk_buff *skb) ++{ ++ struct net_bridge_port *p; ++ ++ list_for_each_entry_rcu(p, &br->port_list, list) { ++ if (p == port) ++ continue; ++ if 
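The new third argument to br_deliver() encodes skb ownership: with free == 1 the function consumes the skb as before, with free == 0 the caller retains it, so delivery must work on a clone. The contract, condensed:

	/* free == 1: br_deliver() owns skb and eventually frees it.
	 * free == 0: caller keeps skb; deliver a clone or drop.
	 */
	if (!free) {
		skb = skb_clone(skb, GFP_ATOMIC);
		if (skb == NULL)
			return;		/* clone failed: counted tx_dropped */
	}
	__br_deliver(to, skb);		/* consumes its argument */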
(should_deliver(p, skb)) { ++ struct sk_buff *skb2; ++ ++ if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) { ++ br->statistics.tx_dropped++; ++ return; ++ } ++ __br_deliver(p, skb2); ++ } ++ } + } + + /* called under bridge lock */ + void br_flood_forward(struct net_bridge *br, struct sk_buff *skb) + { +- br_flood(br, skb, __br_forward); ++ skb->brmark = BR_ALREADY_SEEN; ++ br_flood(br, skb, 1, __br_forward); + } +diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c +index f38cc53..3dac8fc 100644 +--- a/net/bridge/br_if.c ++++ b/net/bridge/br_if.c +@@ -14,6 +14,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -160,6 +161,11 @@ static void del_br(struct net_bridge *br) + { + struct net_bridge_port *p, *n; + ++ if (br->master_dev) { ++ dev_put(br->master_dev); ++ br->master_dev = NULL; ++ } ++ + list_for_each_entry_safe(p, n, &br->port_list, list) { + del_nbp(p); + } +@@ -299,7 +305,7 @@ int br_del_bridge(const char *name) + int ret = 0; + + rtnl_lock(); +- dev = __dev_get_by_name(&init_net, name); ++ dev = __dev_get_by_name(current->nsproxy->net_ns, name); + if (dev == NULL) + ret = -ENXIO; /* Could not find device */ + +@@ -400,6 +406,10 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) + if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) && + (br->dev->flags & IFF_UP)) + br_stp_enable_port(p); ++ if (!(dev->features & NETIF_F_VIRTUAL)) { ++ dev_hold(dev); ++ br->master_dev = dev; ++ } + spin_unlock_bh(&br->lock); + + br_ifinfo_notify(RTM_NEWLINK, p); +@@ -435,6 +445,16 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) + spin_lock_bh(&br->lock); + br_stp_recalculate_bridge_id(br); + br_features_recompute(br); ++ if (br->master_dev == dev) { ++ br->master_dev = NULL; ++ dev_put(dev); ++ list_for_each_entry(p, &br->port_list, list) ++ if (!(p->dev->features & NETIF_F_VIRTUAL)) { ++ dev_hold(p->dev); ++ br->master_dev = p->dev; ++ break; ++ } ++ } + spin_unlock_bh(&br->lock); + + return 0; +@@ -446,7 +466,7 @@ void __exit br_cleanup_bridges(void) + + rtnl_lock(); + restart: +- for_each_netdev(&init_net, dev) { ++ for_each_netdev(current->nsproxy->net_ns, dev) { + if (dev->priv_flags & IFF_EBRIDGE) { + del_br(dev->priv); + goto restart; +diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c +index 255c00f..8809156 100644 +--- a/net/bridge/br_input.c ++++ b/net/bridge/br_input.c +@@ -24,13 +24,20 @@ const u8 br_group_address[ETH_ALEN] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; + + static void br_pass_frame_up(struct net_bridge *br, struct sk_buff *skb) + { +- struct net_device *indev; ++ struct net_device *indev, *outdev; + + br->statistics.rx_packets++; + br->statistics.rx_bytes += skb->len; + + indev = skb->dev; +- skb->dev = br->dev; ++ if (!br->via_phys_dev) ++ skb->dev = br->dev; ++ else { ++ skb->brmark = BR_ALREADY_SEEN; ++ outdev = br->master_dev; ++ if (outdev) ++ skb->dev = outdev; ++ } + + NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL, + netif_receive_skb); +@@ -58,7 +65,7 @@ int br_handle_frame_finish(struct sk_buff *skb) + /* The packet skb2 goes to the local host (NULL to skip). 
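skb->brmark set to BR_ALREADY_SEEN is the loop breaker for the via_phys_dev mode: every frame the bridge handles is stamped, and both the receive and transmit hooks skip stamped frames, so traffic cannot ping-pong between the bridge device and its master physical device. In guard form:

	/* First visit: stamp and bridge. Later visits: pass through. */
	if (skb->brmark == BR_ALREADY_SEEN)
		return 0;
	skb->brmark = BR_ALREADY_SEEN;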
*/ + skb2 = NULL; + +- if (br->dev->flags & IFF_PROMISC) ++ if ((br->dev->flags & IFF_PROMISC) && !br->via_phys_dev) + skb2 = skb; + + dst = NULL; +@@ -147,6 +154,8 @@ struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb) + } + + switch (p->state) { ++ struct net_device *out; ++ + case BR_STATE_FORWARDING: + rhook = rcu_dereference(br_should_route_hook); + if (rhook != NULL) { +@@ -156,7 +165,12 @@ struct sk_buff *br_handle_frame(struct net_bridge_port *p, struct sk_buff *skb) + } + /* fall through */ + case BR_STATE_LEARNING: +- if (!compare_ether_addr(p->br->dev->dev_addr, dest)) ++ if (skb->brmark == BR_ALREADY_SEEN) ++ return 0; ++ ++ out = p->br->via_phys_dev ? p->br->master_dev : p->br->dev; ++ ++ if (out && !compare_ether_addr(p->br->dev->dev_addr, dest)) + skb->pkt_type = PACKET_HOST; + + NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, +diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c +index 0655a5f..be53554 100644 +--- a/net/bridge/br_ioctl.c ++++ b/net/bridge/br_ioctl.c +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -28,7 +29,7 @@ static int get_bridge_ifindices(int *indices, int num) + struct net_device *dev; + int i = 0; + +- for_each_netdev(&init_net, dev) { ++ for_each_netdev(current->nsproxy->net_ns, dev) { + if (i >= num) + break; + if (dev->priv_flags & IFF_EBRIDGE) +@@ -91,7 +92,7 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + +- dev = dev_get_by_index(&init_net, ifindex); ++ dev = dev_get_by_index(current->nsproxy->net_ns, ifindex); + if (dev == NULL) + return -EINVAL; + +@@ -142,6 +143,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) + b.root_port = br->root_port; + + b.stp_enabled = (br->stp_enabled != BR_NO_STP); ++ b.via_phys_dev = br->via_phys_dev; + b.ageing_time = jiffies_to_clock_t(br->ageing_time); + b.hello_timer_value = br_timer_value(&br->hello_timer); + b.tcn_timer_value = br_timer_value(&br->tcn_timer); +@@ -258,6 +260,13 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) + br_stp_set_enabled(br, args[1]); + return 0; + ++ case BRCTL_SET_VIA_ORIG_DEV: ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ br->via_phys_dev = args[1] ? 
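BRCTL_SET_VIA_ORIG_DEV above is wired into the legacy per-bridge ioctl, so userspace can flip the mode the same way brctl issues its other commands: SIOCDEVPRIVATE on the bridge device with an args[4] block. A hedged userspace sketch (headers, socket setup and error paths elided; "br0" and sock_fd are example names, not from the patch):

	/* Sketch: enable via_phys_dev on bridge br0 through the
	 * args[4] ABI that old_dev_ioctl() above dispatches on.
	 */
	unsigned long args[4] = { BRCTL_SET_VIA_ORIG_DEV, 1, 0, 0 };
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "br0", IFNAMSIZ);
	ifr.ifr_data = (char *)args;
	if (ioctl(sock_fd, SIOCDEVPRIVATE, &ifr) < 0)
		perror("BRCTL_SET_VIA_ORIG_DEV");

The same bit is also exposed through the via_phys_dev sysfs attribute added further down in this patch.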
1 : 0; ++ return 0; ++ + case BRCTL_SET_BRIDGE_PRIORITY: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; +diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c +index f155e6c..e7a1b78 100644 +--- a/net/bridge/br_netlink.c ++++ b/net/bridge/br_netlink.c +@@ -11,6 +11,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -97,10 +98,11 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port) + kfree_skb(skb); + goto errout; + } +- err = rtnl_notify(skb, &init_net,0, RTNLGRP_LINK, NULL, GFP_ATOMIC); ++ err = rtnl_notify(skb, dev_net(port->dev),0, RTNLGRP_LINK, ++ NULL, GFP_ATOMIC); + errout: + if (err < 0) +- rtnl_set_sk_err(&init_net, RTNLGRP_LINK, err); ++ rtnl_set_sk_err(dev_net(port->dev), RTNLGRP_LINK, err); + } + + /* +@@ -112,11 +114,8 @@ static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) + struct net_device *dev; + int idx; + +- if (net != &init_net) +- return 0; +- + idx = 0; +- for_each_netdev(&init_net, dev) { ++ for_each_netdev(net, dev) { + /* not a bridge port */ + if (dev->br_port == NULL || idx < cb->args[0]) + goto skip; +@@ -165,7 +164,7 @@ static int br_rtm_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) + if (new_state > BR_STATE_BLOCKING) + return -EINVAL; + +- dev = __dev_get_by_index(&init_net, ifm->ifi_index); ++ dev = __dev_get_by_index(current->nsproxy->net_ns, ifm->ifi_index); + if (!dev) + return -ENODEV; + +diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c +index 00644a5..7484a56 100644 +--- a/net/bridge/br_notify.c ++++ b/net/bridge/br_notify.c +@@ -37,9 +37,6 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v + struct net_bridge_port *p = dev->br_port; + struct net_bridge *br; + +- if (dev_net(dev) != &init_net) +- return NOTIFY_DONE; +- + /* not a port of a bridge */ + if (p == NULL) + return NOTIFY_DONE; +diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h +index c11b554..5acb2e4 100644 +--- a/net/bridge/br_private.h ++++ b/net/bridge/br_private.h +@@ -90,6 +90,8 @@ struct net_bridge + spinlock_t lock; + struct list_head port_list; + struct net_device *dev; ++ struct net_device *master_dev; ++ unsigned char via_phys_dev; + struct net_device_stats statistics; + spinlock_t hash_lock; + struct hlist_head hash[BR_HASH_SIZE]; +@@ -139,6 +141,7 @@ static inline int br_is_root_bridge(const struct net_bridge *br) + /* br_device.c */ + extern void br_dev_setup(struct net_device *dev); + extern int br_dev_xmit(struct sk_buff *skb, struct net_device *dev); ++extern int br_xmit(struct sk_buff *skb, struct net_bridge_port *port); + + /* br_fdb.c */ + extern int br_fdb_init(void); +@@ -165,12 +168,13 @@ extern void br_fdb_update(struct net_bridge *br, + + /* br_forward.c */ + extern void br_deliver(const struct net_bridge_port *to, +- struct sk_buff *skb); ++ struct sk_buff *skb, int free); + extern int br_dev_queue_push_xmit(struct sk_buff *skb); + extern void br_forward(const struct net_bridge_port *to, + struct sk_buff *skb); + extern int br_forward_finish(struct sk_buff *skb); + extern void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb); ++extern void br_xmit_deliver(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb); + extern void br_flood_forward(struct net_bridge *br, struct sk_buff *skb); + + /* br_if.c */ +diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c +index ddeb6e5..e9f6b7e 100644 +--- a/net/bridge/br_stp_bpdu.c ++++ b/net/bridge/br_stp_bpdu.c +@@ -142,9 +142,6 @@ int 
br_stp_rcv(struct sk_buff *skb, struct net_device *dev, + struct net_bridge *br; + const unsigned char *buf; + +- if (dev_net(dev) != &init_net) +- goto err; +- + if (!p) + goto err; + +diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c +index 27d6a51..0661fd0 100644 +--- a/net/bridge/br_sysfs_br.c ++++ b/net/bridge/br_sysfs_br.c +@@ -172,6 +172,27 @@ static ssize_t store_stp_state(struct device *d, + static DEVICE_ATTR(stp_state, S_IRUGO | S_IWUSR, show_stp_state, + store_stp_state); + ++static ssize_t show_via_phys_dev_state(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ struct net_bridge *br = to_bridge(cd); ++ return sprintf(buf, "%d\n", br->via_phys_dev); ++} ++ ++static void set_via_phys_dev_state(struct net_bridge *br, unsigned long val) ++{ ++ br->via_phys_dev = val; ++} ++ ++static ssize_t store_via_phys_dev_state(struct device *cd, ++ struct device_attribute *attr, const char *buf, size_t len) ++{ ++ return store_bridge_parm(cd, buf, len, set_via_phys_dev_state); ++} ++ ++static DEVICE_ATTR(via_phys_dev, S_IRUGO | S_IWUSR, show_via_phys_dev_state, ++ store_via_phys_dev_state); ++ + static ssize_t show_priority(struct device *d, struct device_attribute *attr, + char *buf) + { +@@ -340,6 +361,7 @@ static struct attribute *bridge_attrs[] = { + &dev_attr_max_age.attr, + &dev_attr_ageing_time.attr, + &dev_attr_stp_state.attr, ++ &dev_attr_via_phys_dev.attr, + &dev_attr_priority.attr, + &dev_attr_bridge_id.attr, + &dev_attr_root_id.attr, +diff --git a/net/core/datagram.c b/net/core/datagram.c +index 8a28fc9..d9e576c 100644 +--- a/net/core/datagram.c ++++ b/net/core/datagram.c +@@ -56,6 +56,8 @@ + #include + #include + ++#include ++ + /* + * Is a socket 'connection oriented' ? + */ +@@ -522,6 +524,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, + { + struct sock *sk = sock->sk; + unsigned int mask; ++ int no_ubc_space; + + poll_wait(file, sk->sk_sleep, wait); + mask = 0; +@@ -531,8 +534,14 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, + mask |= POLLERR; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLRDHUP; +- if (sk->sk_shutdown == SHUTDOWN_MASK) ++ if (sk->sk_shutdown == SHUTDOWN_MASK) { ++ no_ubc_space = 0; + mask |= POLLHUP; ++ } else { ++ no_ubc_space = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH); ++ if (no_ubc_space) ++ ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH); ++ } + + /* readable? */ + if (!skb_queue_empty(&sk->sk_receive_queue) || +@@ -549,7 +558,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, + } + + /* writable? */ +- if (sock_writeable(sk)) ++ if (!no_ubc_space && sock_writeable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + else + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); +diff --git a/net/core/dev.c b/net/core/dev.c +index fca23a3..26b529f 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -123,6 +123,9 @@ + + #include "net-sysfs.h" + ++#include ++#include ++ + /* + * The list of packet types we will receive (as opposed to discard) + * and the routines to invoke. 
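datagram_poll() above gains a second writability gate: besides the socket's own buffer state, the container's beancounter must be able to reserve SOCK_MIN_UBCSPACE_CH, otherwise the task is queued on the UB waitqueue and POLLOUT is withheld. Condensed decision logic (the helper name is invented; the patch open-codes this inside datagram_poll()):

	/* POLLOUT only when both the sndbuf and the UB reservation
	 * say yes; a fully shut-down socket reports POLLHUP instead.
	 */
	static int dgram_pollout(struct sock *sk)
	{
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			return 0;
		if (ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH))
			return 0;	/* no beancounter space */
		return sock_writeable(sk);
	}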
+@@ -200,20 +203,6 @@ DEFINE_RWLOCK(dev_base_lock); + + EXPORT_SYMBOL(dev_base_lock); + +-#define NETDEV_HASHBITS 8 +-#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) +- +-static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) +-{ +- unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); +- return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)]; +-} +- +-static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) +-{ +- return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)]; +-} +- + /* Device list insertion */ + static int list_netdevice(struct net_device *dev) + { +@@ -1566,6 +1555,23 @@ static int dev_gso_segment(struct sk_buff *skb) + return 0; + } + ++#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) ++int (*br_hard_xmit_hook)(struct sk_buff *skb, struct net_bridge_port *port); ++static __inline__ int bridge_hard_start_xmit(struct sk_buff *skb, ++ struct net_device *dev) ++{ ++ struct net_bridge_port *port; ++ ++ if (((port = rcu_dereference(dev->br_port)) == NULL) || ++ (skb->brmark == BR_ALREADY_SEEN)) ++ return 0; ++ ++ return br_hard_xmit_hook(skb, port); ++} ++#else ++#define bridge_hard_start_xmit(skb, dev) (0) ++#endif ++ + int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) + { + if (likely(!skb->next)) { +@@ -1579,6 +1585,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) + goto gso; + } + ++ bridge_hard_start_xmit(skb, dev); ++ + return dev->hard_start_xmit(skb, dev); + } + +@@ -1589,6 +1597,9 @@ gso: + + skb->next = nskb->next; + nskb->next = NULL; ++ ++ bridge_hard_start_xmit(skb, dev); ++ + rc = dev->hard_start_xmit(nskb, dev); + if (unlikely(rc)) { + nskb->next = skb->next; +@@ -2051,6 +2062,7 @@ int netif_receive_skb(struct sk_buff *skb) + struct net_device *orig_dev; + int ret = NET_RX_DROP; + __be16 type; ++ struct ve_struct *old_ve; + + /* if we've gotten here through NAPI, check netpoll */ + if (netpoll_receive_skb(skb)) +@@ -2073,6 +2085,16 @@ int netif_receive_skb(struct sk_buff *skb) + skb_reset_transport_header(skb); + skb->mac_len = skb->network_header - skb->mac_header; + ++#ifdef CONFIG_VE ++ /* ++ * Skb might be alloced in another VE context, than its device works. ++ * So, set the correct owner_env. ++ */ ++ skb->owner_env = skb->dev->owner_env; ++ BUG_ON(skb->owner_env == NULL); ++#endif ++ old_ve = set_exec_env(skb->owner_env); ++ + pt_prev = NULL; + + rcu_read_lock(); +@@ -2133,6 +2155,7 @@ ncls: + + out: + rcu_read_unlock(); ++ (void)set_exec_env(old_ve); + return ret; + } + +@@ -2779,8 +2802,11 @@ static void __dev_set_promiscuity(struct net_device *dev, int inc) + dev->flags &= ~IFF_PROMISC; + else + dev->flags |= IFF_PROMISC; ++ /* Promiscous mode on these devices does not mean anything */ ++ if (dev->flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) ++ return; + if (dev->flags != old_flags) { +- printk(KERN_INFO "device %s %s promiscuous mode\n", ++ ve_printk(VE_LOG, KERN_INFO "device %s %s promiscuous mode\n", + dev->name, (dev->flags & IFF_PROMISC) ? "entered" : + "left"); + if (audit_enabled) +@@ -3543,11 +3569,20 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) + * - require strict serialization. 
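bridge_hard_start_xmit() above is the egress half of via_phys_dev: just before a frame is handed to the driver, if the transmitting device happens to be an enslaved bridge port and the frame has not been through the bridge yet, it is diverted to br_hard_xmit_hook so the bridge can learn and flood on it too. The guard reads:

	/* Divert only frames that (a) leave through a bridge port and
	 * (b) have not already been processed by the bridge.
	 */
	struct net_bridge_port *port;

	port = rcu_dereference(dev->br_port);
	if (port == NULL || skb->brmark == BR_ALREADY_SEEN)
		return 0;
	return br_hard_xmit_hook(skb, port);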
+ * - do not return a value + */ ++ case SIOCSIFMTU: ++ case SIOCSIFHWADDR: + case SIOCSIFFLAGS: ++ if (!capable(CAP_NET_ADMIN) && ++ !capable(CAP_VE_NET_ADMIN)) ++ return -EPERM; ++ dev_load(net, ifr.ifr_name); ++ rtnl_lock(); ++ ret = dev_ifsioc(net, &ifr, cmd); ++ rtnl_unlock(); ++ return ret; ++ + case SIOCSIFMETRIC: +- case SIOCSIFMTU: + case SIOCSIFMAP: +- case SIOCSIFHWADDR: + case SIOCSIFSLAVE: + case SIOCADDMULTI: + case SIOCDELMULTI: +@@ -3614,12 +3649,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) + */ + static int dev_new_index(struct net *net) + { +- static int ifindex; + for (;;) { +- if (++ifindex <= 0) +- ifindex = 1; +- if (!__dev_get_by_index(net, ifindex)) +- return ifindex; ++ if (++net->ifindex <= 0) ++ net->ifindex = 1; ++ if (!__dev_get_by_index(net, net->ifindex)) ++ return net->ifindex; + } + } + +@@ -3722,6 +3756,10 @@ int register_netdevice(struct net_device *dev) + BUG_ON(!dev_net(dev)); + net = dev_net(dev); + ++ ret = -EPERM; ++ if (!ve_is_super(get_exec_env()) && ve_is_dev_movable(dev)) ++ goto out; ++ + spin_lock_init(&dev->queue_lock); + spin_lock_init(&dev->_xmit_lock); + netdev_set_lockdep_class(&dev->_xmit_lock, dev->type); +@@ -3819,6 +3857,10 @@ int register_netdevice(struct net_device *dev) + + set_bit(__LINK_STATE_PRESENT, &dev->state); + ++ dev->owner_env = get_exec_env(); ++ netdev_bc(dev)->owner_ub = get_beancounter(get_exec_ub()); ++ netdev_bc(dev)->exec_ub = get_beancounter(get_exec_ub()); ++ + dev_init_scheduler(dev); + dev_hold(dev); + list_netdevice(dev); +@@ -3952,6 +3994,7 @@ static DEFINE_MUTEX(net_todo_run_mutex); + void netdev_run_todo(void) + { + struct list_head list; ++ struct ve_struct *old_ve; + + /* Need to guard against multiple cpu's getting out of order. */ + mutex_lock(&net_todo_run_mutex); +@@ -3969,6 +4012,7 @@ void netdev_run_todo(void) + list_replace_init(&net_todo_list, &list); + spin_unlock(&net_todo_list_lock); + ++ old_ve = get_exec_env(); + while (!list_empty(&list)) { + struct net_device *dev + = list_entry(list.next, struct net_device, todo_list); +@@ -3981,6 +4025,7 @@ void netdev_run_todo(void) + continue; + } + ++ (void)set_exec_env(dev->owner_env); + dev->reg_state = NETREG_UNREGISTERED; + + netdev_wait_allrefs(dev); +@@ -3991,13 +4036,21 @@ void netdev_run_todo(void) + BUG_TRAP(!dev->ip6_ptr); + BUG_TRAP(!dev->dn_ptr); + ++ put_beancounter(netdev_bc(dev)->exec_ub); ++ put_beancounter(netdev_bc(dev)->owner_ub); ++ netdev_bc(dev)->exec_ub = NULL; ++ netdev_bc(dev)->owner_ub = NULL; ++ ++ /* It must be the very last action, ++ * after this 'dev' may point to freed up memory. ++ */ + if (dev->destructor) + dev->destructor(dev); + + /* Free network device */ + kobject_put(&dev->dev.kobj); + } +- ++ (void)set_exec_env(old_ve); + out: + mutex_unlock(&net_todo_run_mutex); + } +@@ -4037,7 +4090,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, + /* ensure 32-byte alignment of whole construct */ + alloc_size += NETDEV_ALIGN_CONST; + +- p = kzalloc(alloc_size, GFP_KERNEL); ++ p = kzalloc(alloc_size, GFP_KERNEL_UBC); + if (!p) { + printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n"); + return NULL; +@@ -4153,11 +4206,15 @@ EXPORT_SYMBOL(unregister_netdev); + * Callers must hold the rtnl semaphore. 
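register_netdevice() above stamps each device with its owning VE and takes two beancounter references (owner_ub for accounting, exec_ub for the current execution context); netdev_run_todo() later drops both before invoking the destructor, and deliberately does so first, since after ->destructor() the device memory may already be freed. The lifetime pairing:

	/* acquire (register_netdevice) */
	dev->owner_env = get_exec_env();
	netdev_bc(dev)->owner_ub = get_beancounter(get_exec_ub());
	netdev_bc(dev)->exec_ub  = get_beancounter(get_exec_ub());

	/* release (netdev_run_todo), before ->destructor() runs */
	put_beancounter(netdev_bc(dev)->exec_ub);
	put_beancounter(netdev_bc(dev)->owner_ub);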
+ */ + +-int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) ++int __dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat, ++ struct ve_struct *src_ve, struct ve_struct *dst_ve, ++ struct user_beancounter *exec_ub) + { + char buf[IFNAMSIZ]; + const char *destname; + int err; ++ struct ve_struct *cur_ve; ++ struct user_beancounter *tmp_ub; + + ASSERT_RTNL(); + +@@ -4208,6 +4265,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char + err = -ENODEV; + unlist_netdevice(dev); + ++ dev->owner_env = dst_ve; ++ tmp_ub = netdev_bc(dev)->exec_ub; ++ netdev_bc(dev)->exec_ub = get_beancounter(exec_ub); ++ put_beancounter(tmp_ub); ++ + synchronize_net(); + + /* Shutdown queueing discipline. */ +@@ -4216,7 +4278,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + */ ++ cur_ve = set_exec_env(src_ve); + call_netdevice_notifiers(NETDEV_UNREGISTER, dev); ++ (void)set_exec_env(cur_ve); + + /* + * Flush the unicast and multicast chains +@@ -4247,7 +4311,9 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char + list_netdevice(dev); + + /* Notify protocols, that a new device appeared. */ ++ cur_ve = set_exec_env(dst_ve); + call_netdevice_notifiers(NETDEV_REGISTER, dev); ++ (void)set_exec_env(cur_ve); + + synchronize_net(); + err = 0; +@@ -4255,6 +4321,14 @@ out: + return err; + } + ++int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) ++{ ++ struct ve_struct *ve = get_exec_env(); ++ struct user_beancounter *ub = get_exec_ub(); ++ ++ return __dev_change_net_namespace(dev, net, pat, ve, ve, ub); ++} ++ + static int dev_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *ocpu) +@@ -4460,7 +4534,7 @@ static struct hlist_head *netdev_create_hash(void) + int i; + struct hlist_head *hash; + +- hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); ++ hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL_UBC); + if (hash != NULL) + for (i = 0; i < NETDEV_HASHENTRIES; i++) + INIT_HLIST_HEAD(&hash[i]); +@@ -4604,6 +4678,7 @@ EXPORT_SYMBOL(__dev_remove_pack); + EXPORT_SYMBOL(dev_valid_name); + EXPORT_SYMBOL(dev_add_pack); + EXPORT_SYMBOL(dev_alloc_name); ++EXPORT_SYMBOL(__dev_change_net_namespace); + EXPORT_SYMBOL(dev_close); + EXPORT_SYMBOL(dev_get_by_flags); + EXPORT_SYMBOL(dev_get_by_index); +@@ -4635,6 +4710,7 @@ EXPORT_SYMBOL(dev_get_flags); + + #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) + EXPORT_SYMBOL(br_handle_frame_hook); ++EXPORT_SYMBOL(br_hard_xmit_hook); + EXPORT_SYMBOL(br_fdb_get_hook); + EXPORT_SYMBOL(br_fdb_put_hook); + #endif +diff --git a/net/core/dst.c b/net/core/dst.c +index fe03266..ce92751 100644 +--- a/net/core/dst.c ++++ b/net/core/dst.c +@@ -308,6 +308,7 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event, void + switch (event) { + case NETDEV_UNREGISTER: + case NETDEV_DOWN: ++ dst_gc_task(NULL); + mutex_lock(&dst_gc_mutex); + for (dst = dst_busy_list; dst; dst = dst->next) { + last = dst; +diff --git a/net/core/ethtool.c b/net/core/ethtool.c +index 0133b5e..770e607 100644 +--- a/net/core/ethtool.c ++++ b/net/core/ethtool.c +@@ -828,7 +828,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) + case ETHTOOL_GPFLAGS: + break; + default: +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_NET_ADMIN) && 
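__dev_change_net_namespace() above delivers its two notifier storms in different contexts on purpose: NETDEV_UNREGISTER runs while executing as the source VE and NETDEV_REGISTER as the destination VE, so each side's per-VE subsystems observe the event in their own environment. The old single-namespace entry point survives as the degenerate case:

	/* Back-compat wrapper: move within the caller's own VE and
	 * charge the caller's own beancounter.
	 */
	int dev_change_net_namespace(struct net_device *dev,
				     struct net *net, const char *pat)
	{
		struct ve_struct *ve = get_exec_env();

		return __dev_change_net_namespace(dev, net, pat,
						  ve, ve, get_exec_ub());
	}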
!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + } + +diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c +index 277a230..ee4499f 100644 +--- a/net/core/fib_rules.c ++++ b/net/core/fib_rules.c +@@ -20,7 +20,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, + { + struct fib_rule *r; + +- r = kzalloc(ops->rule_size, GFP_KERNEL); ++ r = kzalloc(ops->rule_size, GFP_KERNEL_UBC); + if (r == NULL) + return -ENOMEM; + +@@ -69,7 +69,7 @@ static void rules_ops_put(struct fib_rules_ops *ops) + static void flush_route_cache(struct fib_rules_ops *ops) + { + if (ops->flush_cache) +- ops->flush_cache(); ++ ops->flush_cache(ops); + } + + int fib_rules_register(struct fib_rules_ops *ops) +@@ -238,7 +238,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) + if (err < 0) + goto errout; + +- rule = kzalloc(ops->rule_size, GFP_KERNEL); ++ rule = kzalloc(ops->rule_size, GFP_KERNEL_UBC); + if (rule == NULL) { + err = -ENOMEM; + goto errout; +diff --git a/net/core/filter.c b/net/core/filter.c +index df37443..8a4933c 100644 +--- a/net/core/filter.c ++++ b/net/core/filter.c +@@ -478,7 +478,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) + if (fprog->filter == NULL) + return -EINVAL; + +- fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); ++ fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL_UBC); + if (!fp) + return -ENOMEM; + if (copy_from_user(fp->insns, fprog->filter, fsize)) { +diff --git a/net/core/neighbour.c b/net/core/neighbour.c +index 65f01f7..a423038 100644 +--- a/net/core/neighbour.c ++++ b/net/core/neighbour.c +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + #ifdef CONFIG_SYSCTL + #include + #endif +@@ -35,6 +36,7 @@ + #include + #include + #include ++#include + + #define NEIGH_DEBUG 1 + +@@ -264,6 +266,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) + int entries; + + entries = atomic_inc_return(&tbl->entries) - 1; ++ n = ERR_PTR(-ENOBUFS); + if (entries >= tbl->gc_thresh3 || + (entries >= tbl->gc_thresh2 && + time_after(now, tbl->last_flush + 5 * HZ))) { +@@ -274,7 +277,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) + + n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC); + if (!n) +- goto out_entries; ++ goto out_nomem; + + skb_queue_head_init(&n->arp_queue); + rwlock_init(&n->lock); +@@ -291,6 +294,8 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) + out: + return n; + ++out_nomem: ++ n = ERR_PTR(-ENOMEM); + out_entries: + atomic_dec(&tbl->entries); + goto out; +@@ -409,12 +414,11 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, + u32 hash_val; + int key_len = tbl->key_len; + int error; +- struct neighbour *n1, *rc, *n = neigh_alloc(tbl); ++ struct neighbour *n1, *rc, *n; + +- if (!n) { +- rc = ERR_PTR(-ENOBUFS); ++ rc = n = neigh_alloc(tbl); ++ if (IS_ERR(n)) + goto out; +- } + + memcpy(n->primary_key, pkey, key_len); + n->dev = dev; +@@ -736,10 +740,21 @@ static void neigh_periodic_timer(unsigned long arg) + if (atomic_read(&n->refcnt) == 1 && + (state == NUD_FAILED || + time_after(now, n->used + n->parms->gc_staletime))) { ++ struct net_device *dev = n->dev; ++ struct ve_struct *ve; ++ struct user_beancounter *ub; ++ + *np = n->next; + n->dead = 1; + write_unlock(&n->lock); ++ ++ ve = set_exec_env(dev->owner_env); ++ ub = set_exec_ub(netdev_bc(dev)->owner_ub); ++ + neigh_cleanup_and_release(n); ++ ++ set_exec_ub(ub); ++ set_exec_env(ve); + continue; + } + write_unlock(&n->lock); +@@ -781,6 +796,11 @@ static void 
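neigh_alloc() above now distinguishes its failure modes, ERR_PTR(-ENOBUFS) when the table is over its GC thresholds and ERR_PTR(-ENOMEM) when the slab allocation itself fails, so neigh_create() switches from a NULL test to the usual ERR_PTR protocol:

	/* Caller-side pattern after the change. */
	rc = n = neigh_alloc(tbl);
	if (IS_ERR(n))
		goto out;	/* rc already carries -ENOBUFS/-ENOMEM */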
neigh_timer_handler(unsigned long arg) + struct neighbour *neigh = (struct neighbour *)arg; + unsigned state; + int notify = 0; ++ struct ve_struct *env; ++ struct user_beancounter *ub; ++ ++ env = set_exec_env(neigh->dev->owner_env); ++ ub = set_exec_ub(netdev_bc(neigh->dev)->exec_ub); + + write_lock(&neigh->lock); + +@@ -884,6 +904,8 @@ out: + neigh_update_notify(neigh); + + neigh_release(neigh); ++ (void)set_exec_ub(ub); ++ (void)set_exec_env(env); + } + + int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +@@ -1272,9 +1294,16 @@ static void neigh_proxy_process(unsigned long arg) + if (tdif <= 0) { + struct net_device *dev = back->dev; + __skb_unlink(back, &tbl->proxy_queue); +- if (tbl->proxy_redo && netif_running(dev)) ++ if (tbl->proxy_redo && netif_running(dev)) { ++ struct ve_struct *ve; ++ struct user_beancounter *ub; ++ ++ ve = set_exec_env(dev->owner_env); ++ ub = set_exec_ub(netdev_bc(dev)->owner_ub); + tbl->proxy_redo(back); +- else ++ set_exec_ub(ub); ++ set_exec_env(ve); ++ } else + kfree_skb(back); + + dev_put(dev); +diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c +index 90e2177..1c484ee 100644 +--- a/net/core/net-sysfs.c ++++ b/net/core/net-sysfs.c +@@ -229,6 +229,27 @@ static struct device_attribute net_class_attributes[] = { + {} + }; + ++#ifdef CONFIG_VE ++struct device_attribute ve_net_class_attributes[] = { ++ __ATTR(addr_len, S_IRUGO, show_addr_len, NULL), ++ __ATTR(iflink, S_IRUGO, show_iflink, NULL), ++ __ATTR(ifindex, S_IRUGO, show_ifindex, NULL), ++ __ATTR(features, S_IRUGO, show_features, NULL), ++ __ATTR(type, S_IRUGO, show_type, NULL), ++ __ATTR(link_mode, S_IRUGO, show_link_mode, NULL), ++ __ATTR(address, S_IRUGO, show_address, NULL), ++ __ATTR(broadcast, S_IRUGO, show_broadcast, NULL), ++ __ATTR(carrier, S_IRUGO, show_carrier, NULL), ++ __ATTR(dormant, S_IRUGO, show_dormant, NULL), ++ __ATTR(operstate, S_IRUGO, show_operstate, NULL), ++ __ATTR(mtu, S_IRUGO, show_mtu, NULL), ++ __ATTR(flags, S_IRUGO, show_flags, NULL), ++ __ATTR(tx_queue_len, S_IRUGO, show_tx_queue_len, NULL), ++ {} ++}; ++EXPORT_SYMBOL(ve_net_class_attributes); ++#endif ++ + /* Show a given an attribute in the statistics group */ + static ssize_t netstat_show(const struct device *d, + struct device_attribute *attr, char *buf, +@@ -421,7 +442,7 @@ static void netdev_release(struct device *d) + kfree((char *)dev - dev->padded); + } + +-static struct class net_class = { ++struct class net_class = { + .name = "net", + .dev_release = netdev_release, + #ifdef CONFIG_SYSFS +@@ -431,6 +452,13 @@ static struct class net_class = { + .dev_uevent = netdev_uevent, + #endif + }; ++EXPORT_SYMBOL(net_class); ++ ++#ifndef CONFIG_VE ++#define visible_net_class net_class ++#else ++#define visible_net_class (*get_exec_env()->net_class) ++#endif + + /* Delete sysfs entries but hold kobject reference until after all + * netdev references are gone. 
+@@ -449,7 +477,7 @@ int netdev_register_kobject(struct net_device *net) + struct device *dev = &(net->dev); + struct attribute_group **groups = net->sysfs_groups; + +- dev->class = &net_class; ++ dev->class = &visible_net_class; + dev->platform_data = net; + dev->groups = groups; + +@@ -475,7 +503,15 @@ void netdev_initialize_kobject(struct net_device *net) + device_initialize(device); + } + ++void prepare_sysfs_netdev(void) ++{ ++#ifdef CONFIG_VE ++ get_ve0()->net_class = &net_class; ++#endif ++} ++ + int netdev_kobject_init(void) + { ++ prepare_sysfs_netdev(); + return class_register(&net_class); + } +diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c +index 7c52fe2..e1aeb0e 100644 +--- a/net/core/net_namespace.c ++++ b/net/core/net_namespace.c +@@ -1,6 +1,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -34,6 +35,10 @@ static __net_init int setup_net(struct net *net) + int error; + struct net_generic *ng; + ++#ifdef CONFIG_VE ++ net->owner_ve = get_exec_env(); ++#endif ++ + atomic_set(&net->count, 1); + #ifdef NETNS_REFCNT_DEBUG + atomic_set(&net->use_count, 0); +@@ -85,6 +90,8 @@ static struct net *net_alloc(void) + + static void net_free(struct net *net) + { ++ struct completion *sysfs_completion; ++ + if (!net) + return; + +@@ -96,7 +103,10 @@ static void net_free(struct net *net) + } + #endif + ++ sysfs_completion = net->sysfs_completion; + kmem_cache_free(net_cachep, net); ++ if (sysfs_completion) ++ complete(sysfs_completion); + } + + struct net *copy_net_ns(unsigned long flags, struct net *old_net) +@@ -139,6 +149,7 @@ static void cleanup_net(struct work_struct *work) + { + struct pernet_operations *ops; + struct net *net; ++ struct ve_struct *old_ve; + + /* Be very certain incoming network packets will not find us */ + rcu_barrier(); +@@ -152,11 +163,13 @@ static void cleanup_net(struct work_struct *work) + list_del(&net->list); + rtnl_unlock(); + ++ old_ve = set_exec_env(net->owner_ve); + /* Run all of the network namespace exit methods */ + list_for_each_entry_reverse(ops, &pernet_list, list) { + if (ops->exit) + ops->exit(net); + } ++ (void)set_exec_env(old_ve); + + mutex_unlock(&net_mutex); + +diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c +index a9a7721..7e10b49 100644 +--- a/net/core/rtnetlink.c ++++ b/net/core/rtnetlink.c +@@ -1205,6 +1205,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) + if (rtnl_msg_handlers[idx] == NULL || + rtnl_msg_handlers[idx][type].dumpit == NULL) + continue; ++ if (vz_security_family_check(idx)) ++ continue; + if (idx > s_idx) + memset(&cb->args[0], 0, sizeof(cb->args)); + if (rtnl_msg_handlers[idx][type].dumpit(skb, cb)) +@@ -1265,13 +1267,13 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) + return 0; + + family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; +- if (family >= NPROTO) ++ if (family >= NPROTO || vz_security_family_check(family)) + return -EAFNOSUPPORT; + + sz_idx = type>>2; + kind = type&3; + +- if (kind != 2 && security_netlink_recv(skb, CAP_NET_ADMIN)) ++ if (kind != 2 && security_netlink_recv(skb, CAP_VE_NET_ADMIN)) + return -EPERM; + + if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { +diff --git a/net/core/scm.c b/net/core/scm.c +index 10f5c65..65e0983 100644 +--- a/net/core/scm.c ++++ b/net/core/scm.c +@@ -36,6 +36,7 @@ + #include + #include + ++#include + + /* + * Only allow a user to send credentials, that they could set with +@@ -44,7 +45,9 @@ + + static __inline__ int scm_check_creds(struct ucred 
*creds) + { +- if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) && ++ if ((creds->pid == task_tgid_vnr(current) || ++ creds->pid == current->tgid || ++ capable(CAP_VE_SYS_ADMIN)) && + ((creds->uid == current->uid || creds->uid == current->euid || + creds->uid == current->suid) || capable(CAP_SETUID)) && + ((creds->gid == current->gid || creds->gid == current->egid || +@@ -71,7 +74,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) + + if (!fpl) + { +- fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); ++ fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_UBC); + if (!fpl) + return -ENOMEM; + *fplp = fpl; +@@ -282,7 +285,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) + if (!fpl) + return NULL; + +- new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); ++ new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL_UBC); + if (new_fpl) { + for (i=fpl->count-1; i>=0; i--) + get_file(fpl->fp[i]); +diff --git a/net/core/skbuff.c b/net/core/skbuff.c +index 3666216..b82442c 100644 +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -67,6 +67,8 @@ + #include + #include + ++#include ++ + #include "kmap_skb.h" + + static struct kmem_cache *skbuff_head_cache __read_mostly; +@@ -193,6 +195,10 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, + if (!skb) + goto out; + ++ if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA)) ++ goto nobc; ++ ++ /* Get the DATA. Size must match skb_add_mtu(). */ + size = SKB_DATA_ALIGN(size); + data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info), + gfp_mask, node); +@@ -211,6 +217,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; ++ skb->owner_env = get_exec_env(); + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + atomic_set(&shinfo->dataref, 1); +@@ -233,6 +240,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, + out: + return skb; + nodata: ++ ub_skb_free_bc(skb); ++nobc: + kmem_cache_free(cache, skb); + skb = NULL; + goto out; +@@ -339,6 +348,7 @@ static void kfree_skbmem(struct sk_buff *skb) + struct sk_buff *other; + atomic_t *fclone_ref; + ++ ub_skb_free_bc(skb); + switch (skb->fclone) { + case SKB_FCLONE_UNAVAILABLE: + kmem_cache_free(skbuff_head_cache, skb); +@@ -372,6 +382,7 @@ static void skb_release_all(struct sk_buff *skb) + #ifdef CONFIG_XFRM + secpath_put(skb->sp); + #endif ++ ub_skb_uncharge(skb); + if (skb->destructor) { + WARN_ON(in_irq()); + skb->destructor(skb); +@@ -461,6 +472,11 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) + new->tc_verd = old->tc_verd; + #endif + #endif ++#ifdef CONFIG_VE ++ new->accounted = old->accounted; ++ new->redirected = old->redirected; ++#endif ++ skb_copy_brmark(new, old); + skb_copy_secmark(new, old); + } + +@@ -478,6 +494,10 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) + n->hdr_len = skb->nohdr ? 
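The __alloc_skb() hunks above layer acquisition strictly: the beancounter charge (ub_skb_alloc_bc) is taken before the data allocation, so the error unwind releases in reverse order, and kfree_skbmem()/skb_release_all() drop the charge on every free path. The unwind shape, condensed:

	/* Each label undoes exactly what was acquired after the
	 * previous one: charge first, then data.
	 */
	if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA))
		goto nobc;
	size = SKB_DATA_ALIGN(size);
	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
					 gfp_mask, node);
	if (data == NULL)
		goto nodata;
	return skb;			/* success: charge + data held */
nodata:
	ub_skb_free_bc(skb);		/* undo the beancounter charge */
nobc:
	kmem_cache_free(cache, skb);
	return NULL;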
skb_headroom(skb) : skb->hdr_len; + n->cloned = 1; + n->nohdr = 0; ++ C(owner_env); ++#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) ++ C(brmark); ++#endif + n->destructor = NULL; + C(iif); + C(tail); +@@ -487,6 +507,11 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) + C(truesize); + atomic_set(&n->users, 1); + ++#ifdef CONFIG_VE ++ C(accounted); ++ C(redirected); ++#endif ++ + atomic_inc(&(skb_shinfo(skb)->dataref)); + skb->cloned = 1; + +@@ -542,6 +567,10 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) + n->fclone = SKB_FCLONE_UNAVAILABLE; + } + ++ if (ub_skb_alloc_bc(n, gfp_mask)) { ++ kmem_cache_free(skbuff_head_cache, n); ++ return NULL; ++ } + return __skb_clone(n, skb); + } + +diff --git a/net/core/sock.c b/net/core/sock.c +index 88094cb..bb59e40 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -125,6 +125,9 @@ + #include + #include + ++#include ++#include ++ + #include + + #ifdef CONFIG_INET +@@ -250,7 +253,7 @@ static void sock_warn_obsolete_bsdism(const char *name) + static char warncomm[TASK_COMM_LEN]; + if (strcmp(warncomm, current->comm) && warned < 5) { + strcpy(warncomm, current->comm); +- printk(KERN_WARNING "process `%s' is using obsolete " ++ ve_printk(VE_LOG, KERN_WARNING "process `%s' is using obsolete " + "%s SO_BSDCOMPAT\n", warncomm, name); + warned++; + } +@@ -283,7 +286,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) + if (err) + goto out; + +- if (!sk_rmem_schedule(sk, skb->truesize)) { ++ if (!sk_rmem_schedule(sk, skb)) { + err = -ENOBUFS; + goto out; + } +@@ -921,6 +924,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) + slab = prot->slab; + + security_sk_free(sk); ++ ub_sock_uncharge(sk); + if (slab != NULL) + kmem_cache_free(slab, sk); + else +@@ -949,6 +953,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, + */ + sk->sk_prot = sk->sk_prot_creator = prot; + sock_lock_init(sk); ++ sk->owner_env = get_exec_env(); + sock_net_set(sk, get_net(net)); + } + +@@ -1043,14 +1048,11 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) + if (filter != NULL) + sk_filter_charge(newsk, filter); + +- if (unlikely(xfrm_sk_clone_policy(newsk))) { +- /* It is still raw copy of parent, so invalidate +- * destructor and make plain sk_free() */ +- newsk->sk_destruct = NULL; +- sk_free(newsk); +- newsk = NULL; +- goto out; +- } ++ if (ub_sock_charge(newsk, newsk->sk_family, newsk->sk_type) < 0) ++ goto out_err; ++ ++ if (unlikely(xfrm_sk_clone_policy(newsk))) ++ goto out_err; + + newsk->sk_err = 0; + newsk->sk_priority = 0; +@@ -1074,14 +1076,23 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) + if (newsk->sk_prot->sockets_allocated) + atomic_inc(newsk->sk_prot->sockets_allocated); + } +-out: + return newsk; ++ ++out_err: ++ /* It is still raw copy of parent, so invalidate ++ * destructor and make plain sk_free() */ ++ sock_reset_flag(newsk, SOCK_TIMESTAMP); ++ newsk->sk_destruct = NULL; ++ sk_free(newsk); ++ return NULL; + } + + EXPORT_SYMBOL_GPL(sk_clone); + + void sk_setup_caps(struct sock *sk, struct dst_entry *dst) + { ++ extern int sysctl_tcp_use_sg; ++ + __sk_dst_set(sk, dst); + sk->sk_route_caps = dst->dev->features; + if (sk->sk_route_caps & NETIF_F_GSO) +@@ -1094,6 +1105,8 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) + sk->sk_gso_max_size = dst->dev->gso_max_size; + } + } ++ if (!sysctl_tcp_use_sg) ++ sk->sk_route_caps &= ~NETIF_F_SG; + } + EXPORT_SYMBOL_GPL(sk_setup_caps); + +@@ -1254,11 
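sk_clone() above gains a second failure source (ub_sock_charge) next to xfrm_sk_clone_policy(), so both now funnel into one out_err block. Its two resets matter: newsk is at that point still a raw copy of the parent, so sk_destruct must be cleared to keep sk_free() from tearing the parent's destructor state down twice, and SOCK_TIMESTAMP is reset, presumably so a timestamping reference the clone never took is not released:

	out_err:
		/* still a raw copy of the parent: make sk_free() plain */
		sock_reset_flag(newsk, SOCK_TIMESTAMP);
		newsk->sk_destruct = NULL;
		sk_free(newsk);
		return NULL;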
+1267,9 @@ static long sock_wait_for_wmem(struct sock * sk, long timeo) + /* + * Generic send/receive buffer handlers + */ +- +-static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, +- unsigned long header_len, +- unsigned long data_len, +- int noblock, int *errcode) ++struct sk_buff *sock_alloc_send_skb2(struct sock *sk, unsigned long size, ++ unsigned long size2, int noblock, ++ int *errcode) + { + struct sk_buff *skb; + gfp_t gfp_mask; +@@ -1279,46 +1290,35 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, + if (sk->sk_shutdown & SEND_SHUTDOWN) + goto failure; + +- if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { +- skb = alloc_skb(header_len, gfp_mask); +- if (skb) { +- int npages; +- int i; +- +- /* No pages, we're done... */ +- if (!data_len) +- break; +- +- npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; +- skb->truesize += data_len; +- skb_shinfo(skb)->nr_frags = npages; +- for (i = 0; i < npages; i++) { +- struct page *page; +- skb_frag_t *frag; +- +- page = alloc_pages(sk->sk_allocation, 0); +- if (!page) { +- err = -ENOBUFS; +- skb_shinfo(skb)->nr_frags = i; +- kfree_skb(skb); +- goto failure; +- } +- +- frag = &skb_shinfo(skb)->frags[i]; +- frag->page = page; +- frag->page_offset = 0; +- frag->size = (data_len >= PAGE_SIZE ? +- PAGE_SIZE : +- data_len); +- data_len -= PAGE_SIZE; +- } ++ if (ub_sock_getwres_other(sk, skb_charge_size(size))) { ++ if (size2 < size) { ++ size = size2; ++ continue; ++ } ++ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); ++ err = -EAGAIN; ++ if (!timeo) ++ goto failure; ++ if (signal_pending(current)) ++ goto interrupted; ++ timeo = ub_sock_wait_for_space(sk, timeo, ++ skb_charge_size(size)); ++ continue; ++ } + ++ if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { ++ skb = alloc_skb(size, gfp_mask); ++ if (skb) + /* Full success... 
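sock_alloc_send_skb2() above takes a preferred and a fallback size: if the beancounter cannot reserve the charge for the first, it retries with the smaller second before sleeping on UB space, and the classic sock_alloc_send_skb() becomes the degenerate call with both sizes equal. Call-side view (the local names are illustrative):

	/* Ask for a large skb, accept a small one under UB pressure. */
	skb = sock_alloc_send_skb2(sk, want_bytes, min_bytes,
				   noblock, &err);
	if (skb == NULL)
		return err;	/* err set by the allocator */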
*/ + break; +- } ++ ub_sock_retwres_other(sk, skb_charge_size(size), ++ SOCK_MIN_UBCSPACE_CH); + err = -ENOBUFS; + goto failure; + } ++ ub_sock_retwres_other(sk, ++ skb_charge_size(size), ++ SOCK_MIN_UBCSPACE_CH); + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; +@@ -1329,6 +1329,7 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, + timeo = sock_wait_for_wmem(sk, timeo); + } + ++ ub_skb_set_charge(skb, sk, skb_charge_size(size), UB_OTHERSOCKBUF); + skb_set_owner_w(skb, sk); + return skb; + +@@ -1338,11 +1339,12 @@ failure: + *errcode = err; + return NULL; + } ++EXPORT_SYMBOL(sock_alloc_send_skb2); + + struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, + int noblock, int *errcode) + { +- return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); ++ return sock_alloc_send_skb2(sk, size, size, noblock, errcode); + } + + static void __lock_sock(struct sock *sk) +@@ -1752,10 +1754,12 @@ void lock_sock_nested(struct sock *sk, int subclass) + __lock_sock(sk); + sk->sk_lock.owned = 1; + spin_unlock(&sk->sk_lock.slock); ++#if !defined(CONFIG_VZ_CHECKPOINT) && !defined(CONFIG_VZ_CHECKPOINT_MODULE) + /* + * The sk_lock has mutex_lock() semantics here: + */ + mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); ++#endif + local_bh_enable(); + } + +@@ -1763,11 +1767,12 @@ EXPORT_SYMBOL(lock_sock_nested); + + void release_sock(struct sock *sk) + { ++#if !defined(CONFIG_VZ_CHECKPOINT) && !defined(CONFIG_VZ_CHECKPOINT_MODULE) + /* + * The sk_lock has mutex_unlock() semantics: + */ + mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); +- ++#endif + spin_lock_bh(&sk->sk_lock.slock); + if (sk->sk_backlog.tail) + __release_sock(sk); +@@ -2041,7 +2046,7 @@ int proto_register(struct proto *prot, int alloc_slab) + + if (alloc_slab) { + prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, +- SLAB_HWCACHE_ALIGN, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); + + if (prot->slab == NULL) { + printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", +@@ -2059,7 +2064,7 @@ int proto_register(struct proto *prot, int alloc_slab) + sprintf(request_sock_slab_name, mask, prot->name); + prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name, + prot->rsk_prot->obj_size, 0, +- SLAB_HWCACHE_ALIGN, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); + + if (prot->rsk_prot->slab == NULL) { + printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n", +@@ -2080,7 +2085,7 @@ int proto_register(struct proto *prot, int alloc_slab) + prot->twsk_prot->twsk_slab = + kmem_cache_create(timewait_sock_slab_name, + prot->twsk_prot->twsk_obj_size, +- 0, SLAB_HWCACHE_ALIGN, ++ 0, SLAB_HWCACHE_ALIGN|SLAB_UBC, + NULL); + if (prot->twsk_prot->twsk_slab == NULL) + goto out_free_timewait_sock_slab_name; +@@ -2237,10 +2242,26 @@ static const struct file_operations proto_seq_fops = { + .release = seq_release, + }; + ++static int proto_net_init(struct net *net) ++{ ++ if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops)) ++ return -ENOBUFS; ++ return 0; ++} ++ ++static void proto_net_exit(struct net *net) ++{ ++ proc_net_remove(net, "protocols"); ++} ++ ++static struct pernet_operations proto_net_ops = { ++ .init = proto_net_init, ++ .exit = proto_net_exit, ++}; ++ + static int __init proto_init(void) + { +- /* register /proc/net/protocols */ +- return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? 
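The proto_init() conversion above is the standard pernet_operations pattern this patch applies to many /proc/net files: create the entry in ->init for every namespace (present and future), remove it in ->exit, and register the pair once instead of creating the file only in init_net. As a generic template ("myfile" and its ops are placeholders, not from the patch):

	static int myfile_net_init(struct net *net)
	{
		if (!proc_net_fops_create(net, "myfile", S_IRUGO,
					  &myfile_fops))
			return -ENOBUFS;
		return 0;
	}

	static void myfile_net_exit(struct net *net)
	{
		proc_net_remove(net, "myfile");
	}

	static struct pernet_operations myfile_net_ops = {
		.init = myfile_net_init,
		.exit = myfile_net_exit,
	};

SLAB_UBC on the three protocol caches, in turn, routes all socket, request-sock and timewait-sock objects through beancounter accounting.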
-ENOBUFS : 0; ++ return register_pernet_subsys(&proto_net_ops); + } + + subsys_initcall(proto_init); +diff --git a/net/core/stream.c b/net/core/stream.c +index 4a0ad15..5c39418 100644 +--- a/net/core/stream.c ++++ b/net/core/stream.c +@@ -111,8 +111,10 @@ EXPORT_SYMBOL(sk_stream_wait_close); + * sk_stream_wait_memory - Wait for more memory for a socket + * @sk: socket to wait for memory + * @timeo_p: for how long ++ * @amount - amount of memory to wait for (in UB space!) + */ +-int sk_stream_wait_memory(struct sock *sk, long *timeo_p) ++int __sk_stream_wait_memory(struct sock *sk, long *timeo_p, ++ unsigned long amount) + { + int err = 0; + long vm_wait = 0; +@@ -134,8 +136,11 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) + if (signal_pending(current)) + goto do_interrupted; + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); +- if (sk_stream_memory_free(sk) && !vm_wait) +- break; ++ if (amount == 0) { ++ if (sk_stream_memory_free(sk) && !vm_wait) ++ break; ++ } else ++ ub_sock_sndqueueadd_tcp(sk, amount); + + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk->sk_write_pending++; +@@ -144,6 +149,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) + sk_stream_memory_free(sk) && + vm_wait); + sk->sk_write_pending--; ++ if (amount > 0) ++ ub_sock_sndqueuedel(sk); + + if (vm_wait) { + vm_wait -= current_timeo; +@@ -170,6 +177,10 @@ do_interrupted: + goto out; + } + ++int sk_stream_wait_memory(struct sock *sk, long *timeo_p) ++{ ++ return __sk_stream_wait_memory(sk, timeo_p, 0); ++} + EXPORT_SYMBOL(sk_stream_wait_memory); + + int sk_stream_error(struct sock *sk, int flags, int err) +diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c +index f7fe2a5..024413f 100644 +--- a/net/dccp/ipv6.c ++++ b/net/dccp/ipv6.c +@@ -567,6 +567,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, + __ip6_dst_store(newsk, dst, NULL, NULL); + newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | + NETIF_F_TSO); ++ if (!sysctl_tcp_use_sg) ++ newsk->sk_route_caps &= ~NETIF_F_SG; + newdp6 = (struct dccp6_sock *)newsk; + newinet = inet_sk(newsk); + newinet->pinet6 = &newdp6->inet6; +diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c +index 66dca5b..e7802a4 100644 +--- a/net/dccp/minisocks.c ++++ b/net/dccp/minisocks.c +@@ -19,6 +19,8 @@ + #include + #include + ++#include ++ + #include "ackvec.h" + #include "ccid.h" + #include "dccp.h" +@@ -56,7 +58,8 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) + { + struct inet_timewait_sock *tw = NULL; + +- if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets) ++ if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets && ++ ub_timewait_check(sk, &dccp_death_row)) + tw = inet_twsk_alloc(sk, state); + + if (tw != NULL) { +diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c +index 5b7539b..14fbca5 100644 +--- a/net/decnet/dn_rules.c ++++ b/net/decnet/dn_rules.c +@@ -229,7 +229,7 @@ static u32 dn_fib_rule_default_pref(struct fib_rules_ops *ops) + return 0; + } + +-static void dn_fib_rule_flush_cache(void) ++static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops) + { + dn_rt_cache_flush(-1); + } +diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c +index 6d2bd32..45567e3 100644 +--- a/net/decnet/netfilter/dn_rtmsg.c ++++ b/net/decnet/netfilter/dn_rtmsg.c +@@ -107,7 +107,7 @@ static inline void dnrmg_receive_user_skb(struct sk_buff *skb) + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return; + +- if (security_netlink_recv(skb, 
CAP_NET_ADMIN)) ++ if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) + RCV_SKB_FAIL(-EPERM); + + /* Eventually we might send routing messages too */ +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c +index 24eca23..dcaebf0 100644 +--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -115,6 +115,7 @@ + #ifdef CONFIG_IP_MROUTE + #include + #endif ++#include + + DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly; + +@@ -330,6 +331,10 @@ lookup_protocol: + goto out_rcu_unlock; + } + ++ err = vz_security_protocol_check(answer->protocol); ++ if (err < 0) ++ goto out_rcu_unlock; ++ + err = -EPERM; + if (answer->capability > 0 && !capable(answer->capability)) + goto out_rcu_unlock; +@@ -351,6 +356,13 @@ lookup_protocol: + if (sk == NULL) + goto out; + ++ err = -ENOBUFS; ++ if (ub_sock_charge(sk, PF_INET, sock->type)) ++ goto out_sk_free; ++ /* if charge was successful, sock_init_data() MUST be called to ++ * set sk->sk_type. otherwise sk will be uncharged to wrong resource ++ */ ++ + err = 0; + sk->sk_no_check = answer_no_check; + if (INET_PROTOSW_REUSE & answer_flags) +@@ -408,6 +420,9 @@ out: + out_rcu_unlock: + rcu_read_unlock(); + goto out; ++out_sk_free: ++ sk_free(sk); ++ return err; + } + + +@@ -422,6 +437,9 @@ int inet_release(struct socket *sock) + + if (sk) { + long timeout; ++ struct ve_struct *saved_env; ++ ++ saved_env = set_exec_env(sk->owner_env); + + /* Applications forget to leave groups before exiting */ + ip_mc_drop_socket(sk); +@@ -439,6 +457,8 @@ int inet_release(struct socket *sock) + timeout = sk->sk_lingertime; + sock->sk = NULL; + sk->sk_prot->close(sk, timeout); ++ ++ (void)set_exec_env(saved_env); + } + return 0; + } +@@ -1341,27 +1361,27 @@ static struct net_protocol icmp_protocol = { + .netns_ok = 1, + }; + +-static int __init init_ipv4_mibs(void) ++int init_ipv4_mibs(void) + { +- if (snmp_mib_init((void **)net_statistics, ++ if (snmp_mib_init((void **)ve_net_statistics, + sizeof(struct linux_mib)) < 0) + goto err_net_mib; +- if (snmp_mib_init((void **)ip_statistics, ++ if (snmp_mib_init((void **)ve_ip_statistics, + sizeof(struct ipstats_mib)) < 0) + goto err_ip_mib; +- if (snmp_mib_init((void **)icmp_statistics, ++ if (snmp_mib_init((void **)ve_icmp_statistics, + sizeof(struct icmp_mib)) < 0) + goto err_icmp_mib; +- if (snmp_mib_init((void **)icmpmsg_statistics, ++ if (snmp_mib_init((void **)ve_icmpmsg_statistics, + sizeof(struct icmpmsg_mib)) < 0) + goto err_icmpmsg_mib; +- if (snmp_mib_init((void **)tcp_statistics, ++ if (snmp_mib_init((void **)ve_tcp_statistics, + sizeof(struct tcp_mib)) < 0) + goto err_tcp_mib; +- if (snmp_mib_init((void **)udp_statistics, ++ if (snmp_mib_init((void **)ve_udp_statistics, + sizeof(struct udp_mib)) < 0) + goto err_udp_mib; +- if (snmp_mib_init((void **)udplite_statistics, ++ if (snmp_mib_init((void **)ve_udplite_statistics, + sizeof(struct udp_mib)) < 0) + goto err_udplite_mib; + +@@ -1370,20 +1390,33 @@ static int __init init_ipv4_mibs(void) + return 0; + + err_udplite_mib: +- snmp_mib_free((void **)udp_statistics); ++ snmp_mib_free((void **)ve_udp_statistics); + err_udp_mib: +- snmp_mib_free((void **)tcp_statistics); ++ snmp_mib_free((void **)ve_tcp_statistics); + err_tcp_mib: +- snmp_mib_free((void **)icmpmsg_statistics); ++ snmp_mib_free((void **)ve_icmpmsg_statistics); + err_icmpmsg_mib: +- snmp_mib_free((void **)icmp_statistics); ++ snmp_mib_free((void **)ve_icmp_statistics); + err_icmp_mib: +- snmp_mib_free((void **)ip_statistics); ++ snmp_mib_free((void **)ve_ip_statistics); + err_ip_mib: +- 
snmp_mib_free((void **)net_statistics); ++ snmp_mib_free((void **)ve_net_statistics); + err_net_mib: + return -ENOMEM; + } ++EXPORT_SYMBOL(init_ipv4_mibs); ++ ++void cleanup_ipv4_mibs(void) ++{ ++ snmp_mib_free((void **)ve_udplite_statistics); ++ snmp_mib_free((void **)ve_udp_statistics); ++ snmp_mib_free((void **)ve_tcp_statistics); ++ snmp_mib_free((void **)ve_icmpmsg_statistics); ++ snmp_mib_free((void **)ve_icmp_statistics); ++ snmp_mib_free((void **)ve_ip_statistics); ++ snmp_mib_free((void **)ve_net_statistics); ++} ++EXPORT_SYMBOL(cleanup_ipv4_mibs); + + static int ipv4_proc_init(void); + +diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c +index 9b539fa..afd5dfb 100644 +--- a/net/ipv4/arp.c ++++ b/net/ipv4/arp.c +@@ -1137,7 +1137,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) + switch (cmd) { + case SIOCDARP: + case SIOCSARP: +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_NET_ADMIN) && ++ !capable(CAP_VE_NET_ADMIN)) + return -EPERM; + case SIOCGARP: + err = copy_from_user(&r, arg, sizeof(struct arpreq)); +@@ -1199,7 +1200,7 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, vo + switch (event) { + case NETDEV_CHANGEADDR: + neigh_changeaddr(&arp_tbl, dev); +- rt_cache_flush(0); ++ rt_cache_flush(dev_net(dev), 0); + break; + default: + break; +diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c +index 79a7ef6..46a3f5c 100644 +--- a/net/ipv4/devinet.c ++++ b/net/ipv4/devinet.c +@@ -112,9 +112,9 @@ static inline void devinet_sysctl_unregister(struct in_device *idev) + + /* Locks all the inet devices. */ + +-static struct in_ifaddr *inet_alloc_ifa(void) ++struct in_ifaddr *inet_alloc_ifa(void) + { +- struct in_ifaddr *ifa = kzalloc(sizeof(*ifa), GFP_KERNEL); ++ struct in_ifaddr *ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_UBC); + + if (ifa) { + INIT_RCU_HEAD(&ifa->rcu_head); +@@ -122,6 +122,7 @@ static struct in_ifaddr *inet_alloc_ifa(void) + + return ifa; + } ++EXPORT_SYMBOL_GPL(inet_alloc_ifa); + + static void inet_rcu_free_ifa(struct rcu_head *head) + { +@@ -154,7 +155,7 @@ void in_dev_finish_destroy(struct in_device *idev) + } + } + +-static struct in_device *inetdev_init(struct net_device *dev) ++struct in_device *inetdev_init(struct net_device *dev) + { + struct in_device *in_dev; + +@@ -189,6 +190,7 @@ out_kfree: + in_dev = NULL; + goto out; + } ++EXPORT_SYMBOL_GPL(inetdev_init); + + static void in_dev_rcu_put(struct rcu_head *head) + { +@@ -382,7 +384,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, + return 0; + } + +-static int inet_insert_ifa(struct in_ifaddr *ifa) ++int inet_insert_ifa(struct in_ifaddr *ifa) + { + return __inet_insert_ifa(ifa, NULL, 0); + } +@@ -433,6 +435,7 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, + } endfor_ifa(in_dev); + return NULL; + } ++EXPORT_SYMBOL_GPL(inet_insert_ifa); + + static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) + { +@@ -633,7 +636,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) + + case SIOCSIFFLAGS: + ret = -EACCES; +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + goto out; + break; + case SIOCSIFADDR: /* Set interface address (and family) */ +@@ -641,7 +644,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) + case SIOCSIFDSTADDR: /* Set the destination address */ + case SIOCSIFNETMASK: /* Set the netmask for the interface */ + ret = -EACCES; +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + goto 
out; + ret = -EINVAL; + if (sin->sin_family != AF_INET) +@@ -1249,7 +1252,7 @@ static void inet_forward_change(struct net *net) + } + read_unlock(&dev_base_lock); + +- rt_cache_flush(0); ++ rt_cache_flush(net, 0); + } + + static int devinet_conf_proc(ctl_table *ctl, int write, +@@ -1338,7 +1341,7 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, + if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) + inet_forward_change(net); + else if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) +- rt_cache_flush(0); ++ rt_cache_flush(net, 0); + } + + return ret; +@@ -1351,9 +1354,10 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write, + int *valp = ctl->data; + int val = *valp; + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); ++ struct net *net = ctl->extra2; + + if (write && *valp != val) +- rt_cache_flush(0); ++ rt_cache_flush(net, 0); + + return ret; + } +@@ -1364,9 +1368,10 @@ int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen, + { + int ret = devinet_conf_sysctl(table, name, nlen, oldval, oldlenp, + newval, newlen); ++ struct net *net = table->extra2; + + if (ret == 1) +- rt_cache_flush(0); ++ rt_cache_flush(net, 0); + + return ret; + } +diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c +index 0b2ac6a..2dcec65 100644 +--- a/net/ipv4/fib_frontend.c ++++ b/net/ipv4/fib_frontend.c +@@ -146,7 +146,7 @@ static void fib_flush(struct net *net) + } + + if (flushed) +- rt_cache_flush(-1); ++ rt_cache_flush(net, -1); + } + + /* +@@ -260,7 +260,8 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, + net = dev_net(dev); + if (fib_lookup(net, &fl, &res)) + goto last_resort; +- if (res.type != RTN_UNICAST) ++ if (res.type != RTN_UNICAST && ++ (!(dev->features & NETIF_F_VENET) || res.type != RTN_LOCAL)) + goto e_inval_res; + *spec_dst = FIB_RES_PREFSRC(res); + fib_combine_itag(itag, &res); +@@ -462,7 +463,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) + switch (cmd) { + case SIOCADDRT: /* Add a route */ + case SIOCDELRT: /* Delete a route */ +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&rt, arg, sizeof(rt))) +@@ -899,21 +900,22 @@ static void fib_disable_ip(struct net_device *dev, int force) + { + if (fib_sync_down_dev(dev, force)) + fib_flush(dev_net(dev)); +- rt_cache_flush(0); ++ rt_cache_flush(dev_net(dev), 0); + arp_ifdown(dev); + } + + static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) + { + struct in_ifaddr *ifa = (struct in_ifaddr*)ptr; ++ struct net_device *dev = ifa->ifa_dev->dev; + + switch (event) { + case NETDEV_UP: + fib_add_ifaddr(ifa); + #ifdef CONFIG_IP_ROUTE_MULTIPATH +- fib_sync_up(ifa->ifa_dev->dev); ++ fib_sync_up(dev); + #endif +- rt_cache_flush(-1); ++ rt_cache_flush(dev_net(dev), -1); + break; + case NETDEV_DOWN: + fib_del_ifaddr(ifa); +@@ -921,9 +923,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, + /* Last address was deleted from this interface. + Disable IP. 
+ */ +- fib_disable_ip(ifa->ifa_dev->dev, 1); ++ fib_disable_ip(dev, 1); + } else { +- rt_cache_flush(-1); ++ rt_cache_flush(dev_net(dev), -1); + } + break; + } +@@ -951,14 +953,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo + #ifdef CONFIG_IP_ROUTE_MULTIPATH + fib_sync_up(dev); + #endif +- rt_cache_flush(-1); ++ rt_cache_flush(dev_net(dev), -1); + break; + case NETDEV_DOWN: + fib_disable_ip(dev, 0); + break; + case NETDEV_CHANGEMTU: + case NETDEV_CHANGE: +- rt_cache_flush(0); ++ rt_cache_flush(dev_net(dev), 0); + break; + } + return NOTIFY_DONE; +diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c +index 2e2fc33..ccb3830 100644 +--- a/net/ipv4/fib_hash.c ++++ b/net/ipv4/fib_hash.c +@@ -474,7 +474,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) + + fib_release_info(fi_drop); + if (state & FA_S_ACCESSED) +- rt_cache_flush(-1); ++ rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id, + &cfg->fc_nlinfo, NLM_F_REPLACE); + return 0; +@@ -534,7 +534,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) + + if (new_f) + fz->fz_nent++; +- rt_cache_flush(-1); ++ rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + + rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id, + &cfg->fc_nlinfo, 0); +@@ -616,7 +616,7 @@ static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg) + write_unlock_bh(&fib_hash_lock); + + if (fa->fa_state & FA_S_ACCESSED) +- rt_cache_flush(-1); ++ rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + fn_free_alias(fa, f); + if (kill_fn) { + fn_free_node(f); +@@ -772,10 +772,10 @@ static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin + void __init fib_hash_init(void) + { + fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node), +- 0, SLAB_PANIC, NULL); ++ 0, SLAB_PANIC | SLAB_UBC, NULL); + + fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), +- 0, SLAB_PANIC, NULL); ++ 0, SLAB_PANIC | SLAB_UBC, NULL); + + } + +diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c +index 1fb5687..6080d71 100644 +--- a/net/ipv4/fib_rules.c ++++ b/net/ipv4/fib_rules.c +@@ -258,9 +258,9 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) + + nla_total_size(4); /* flow */ + } + +-static void fib4_rule_flush_cache(void) ++static void fib4_rule_flush_cache(struct fib_rules_ops *ops) + { +- rt_cache_flush(-1); ++ rt_cache_flush(ops->fro_net, -1); + } + + static struct fib_rules_ops fib4_rules_ops_template = { +diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c +index e1600ad..be16529 100644 +--- a/net/ipv4/fib_trie.c ++++ b/net/ipv4/fib_trie.c +@@ -1273,7 +1273,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) + + fib_release_info(fi_drop); + if (state & FA_S_ACCESSED) +- rt_cache_flush(-1); ++ rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, + tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); + +@@ -1318,7 +1318,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) + list_add_tail_rcu(&new_fa->fa_list, + (fa ? 
&fa->fa_list : fa_head)); + +- rt_cache_flush(-1); ++ rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, + &cfg->fc_nlinfo, 0); + succeeded: +@@ -1661,7 +1661,7 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg) + trie_leaf_remove(t, l); + + if (fa->fa_state & FA_S_ACCESSED) +- rt_cache_flush(-1); ++ rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + + fib_release_info(fa->fa_info); + alias_free_mem_rcu(fa); +diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c +index 2769dc4..03a7004 100644 +--- a/net/ipv4/igmp.c ++++ b/net/ipv4/igmp.c +@@ -83,6 +83,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -2317,7 +2318,7 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) + struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); + + state->in_dev = NULL; +- for_each_netdev(&init_net, state->dev) { ++ for_each_netdev(get_exec_env()->ve_netns, state->dev) { + struct in_device *in_dev; + in_dev = in_dev_get(state->dev); + if (!in_dev) +@@ -2466,7 +2467,7 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) + + state->idev = NULL; + state->im = NULL; +- for_each_netdev(&init_net, state->dev) { ++ for_each_netdev(get_exec_env()->ve_netns, state->dev) { + struct in_device *idev; + idev = in_dev_get(state->dev); + if (unlikely(idev == NULL)) +@@ -2609,11 +2610,34 @@ static const struct file_operations igmp_mcf_seq_fops = { + .release = seq_release_private, + }; + +-int __init igmp_mc_proc_init(void) ++static int igmp_net_init(struct net *net) + { +- proc_net_fops_create(&init_net, "igmp", S_IRUGO, &igmp_mc_seq_fops); +- proc_net_fops_create(&init_net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops); ++ if (!proc_net_fops_create(net, "igmp", S_IRUGO, &igmp_mc_seq_fops)) ++ goto out_igmp; ++ if (!proc_net_fops_create(net, "mcfilter", S_IRUGO, &igmp_mcf_seq_fops)) ++ goto out_mcfilter; + return 0; ++ ++out_mcfilter: ++ proc_net_remove(net, "igmp"); ++out_igmp: ++ return -ENOMEM; ++} ++ ++static void igmp_net_exit(struct net *net) ++{ ++ proc_net_remove(net, "igmp"); ++ proc_net_remove(net, "mcfilter"); ++} ++ ++static struct pernet_operations igmp_net_ops = { ++ .init = igmp_net_init, ++ .exit = igmp_net_exit, ++}; ++ ++int __init igmp_mc_proc_init(void) ++{ ++ return register_pernet_subsys(&igmp_net_ops); + } + #endif + +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index ec83448..5bfa408 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -24,6 +24,9 @@ + #include + #include + ++#include ++#include ++ + #ifdef INET_CSK_DEBUG + const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; + EXPORT_SYMBOL(inet_csk_timer_bug_msg); +@@ -93,6 +96,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) + struct inet_bind_bucket *tb; + int ret; + struct net *net = sock_net(sk); ++ struct ve_struct *env = sk->owner_env; + + local_bh_disable(); + if (!snum) { +@@ -103,7 +107,8 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) + rover = net_random() % remaining + low; + + do { +- head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; ++ head = &hashinfo->bhash[inet_bhashfn(rover, ++ hashinfo->bhash_size, VEID(env))]; + spin_lock(&head->lock); + inet_bind_bucket_for_each(tb, node, &head->chain) + if (tb->ib_net == net && tb->port == rover) +@@ -130,7 +135,8 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) + */ + snum = rover; + } else 
{ +- head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; ++ head = &hashinfo->bhash[inet_bhashfn(snum, ++ hashinfo->bhash_size, VEID(env))]; + spin_lock(&head->lock); + inet_bind_bucket_for_each(tb, node, &head->chain) + if (tb->ib_net == net && tb->port == snum) +@@ -152,7 +158,7 @@ tb_found: + tb_not_found: + ret = 1; + if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, +- net, head, snum)) == NULL) ++ net, head, snum, env)) == NULL) + goto fail_unlock; + if (hlist_empty(&tb->owners)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) +@@ -553,7 +559,7 @@ void inet_csk_destroy_sock(struct sock *sk) + + sk_refcnt_debug_release(sk); + +- atomic_dec(sk->sk_prot->orphan_count); ++ ub_dec_orphan_count(sk); + sock_put(sk); + } + +@@ -633,7 +639,7 @@ void inet_csk_listen_stop(struct sock *sk) + + sock_orphan(child); + +- atomic_inc(sk->sk_prot->orphan_count); ++ ub_inc_orphan_count(sk); + + inet_csk_destroy_sock(child); + +diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c +index da97695..58408eb 100644 +--- a/net/ipv4/inet_diag.c ++++ b/net/ipv4/inet_diag.c +@@ -708,6 +708,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + const struct inet_diag_handler *handler; + struct inet_hashinfo *hashinfo; ++ struct ve_struct *ve = get_exec_env(); + + handler = inet_diag_lock_handler(cb->nlh->nlmsg_type); + if (IS_ERR(handler)) +@@ -731,6 +732,8 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) + sk_for_each(sk, node, &hashinfo->listening_hash[i]) { + struct inet_sock *inet = inet_sk(sk); + ++ if (!ve_accessible(sk->owner_env, ve)) ++ continue; + if (num < s_num) { + num++; + continue; +@@ -792,6 +795,8 @@ skip_listen_ht: + sk_for_each(sk, node, &head->chain) { + struct inet_sock *inet = inet_sk(sk); + ++ if (!ve_accessible(sk->owner_env, ve)) ++ continue; + if (num < s_num) + goto next_normal; + if (!(r->idiag_states & (1 << sk->sk_state))) +@@ -816,6 +821,8 @@ next_normal: + inet_twsk_for_each(tw, node, + &head->twchain) { + ++ if (!ve_accessible_veid(tw->tw_owner_env, VEID(ve))) ++ continue; + if (num < s_num) + goto next_dying; + if (r->id.idiag_sport != tw->tw_sport && +diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c +index 0546a0b..a967588 100644 +--- a/net/ipv4/inet_fragment.c ++++ b/net/ipv4/inet_fragment.c +@@ -249,6 +249,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, + spin_lock_init(&q->lock); + atomic_set(&q->refcnt, 1); + q->net = nf; ++#ifdef CONFIG_VE ++ q->owner_ve = get_exec_env(); ++#endif + + return q; + } +diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c +index 2023d37..af2a58b 100644 +--- a/net/ipv4/inet_hashtables.c ++++ b/net/ipv4/inet_hashtables.c +@@ -30,7 +30,8 @@ + struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, + struct net *net, + struct inet_bind_hashbucket *head, +- const unsigned short snum) ++ const unsigned short snum, ++ struct ve_struct *ve) + { + struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); + +@@ -39,6 +40,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, + tb->port = snum; + tb->fastreuse = 0; + INIT_HLIST_HEAD(&tb->owners); ++ tb->owner_env = ve; + hlist_add_head(&tb->node, &head->chain); + } + return tb; +@@ -70,10 +72,13 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, + static void __inet_put_port(struct sock *sk) + { + struct inet_hashinfo 
*hashinfo = sk->sk_prot->h.hashinfo; +- const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); +- struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; ++ int bhash; ++ struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + ++ bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size, ++ VEID(sk->owner_env)); ++ head = &hashinfo->bhash[bhash]; + spin_lock(&head->lock); + tb = inet_csk(sk)->icsk_bind_hash; + __sk_del_bind_node(sk); +@@ -95,7 +100,8 @@ EXPORT_SYMBOL(inet_put_port); + void __inet_inherit_port(struct sock *sk, struct sock *child) + { + struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; +- const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size); ++ const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size, ++ VEID(child->owner_env)); + struct inet_bind_hashbucket *head = &table->bhash[bhash]; + struct inet_bind_bucket *tb; + +@@ -190,9 +196,11 @@ struct sock *__inet_lookup_listener(struct net *net, + { + struct sock *sk = NULL; + const struct hlist_head *head; ++ struct ve_struct *env; + ++ env = get_exec_env(); + read_lock(&hashinfo->lhash_lock); +- head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; ++ head = &hashinfo->listening_hash[inet_lhashfn(hnum, VEID(env))]; + if (!hlist_empty(head)) { + const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); + +@@ -225,7 +233,8 @@ struct sock * __inet_lookup_established(struct net *net, + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ +- unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport); ++ struct ve_struct *ve = get_exec_env(); ++ unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport, VEID(ve)); + struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); + +@@ -256,7 +265,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_established); + /* called with local bh disabled */ + static int __inet_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, __u16 lport, +- struct inet_timewait_sock **twp) ++ struct inet_timewait_sock **twp, ++ struct ve_struct *ve) + { + struct inet_hashinfo *hinfo = death_row->hashinfo; + struct inet_sock *inet = inet_sk(sk); +@@ -265,7 +275,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, + int dif = sk->sk_bound_dev_if; + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); +- unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); ++ unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, VEID(ve)); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hinfo, hash); + struct sock *sk2; +@@ -415,7 +425,8 @@ EXPORT_SYMBOL_GPL(inet_unhash); + int __inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk, u32 port_offset, + int (*check_established)(struct inet_timewait_death_row *, +- struct sock *, __u16, struct inet_timewait_sock **), ++ struct sock *, __u16, struct inet_timewait_sock **, ++ struct ve_struct *), + void (*hash)(struct sock *sk)) + { + struct inet_hashinfo *hinfo = death_row->hashinfo; +@@ -424,6 +435,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, + struct inet_bind_bucket *tb; + int ret; + struct net *net = sock_net(sk); ++ struct ve_struct *ve = sk->owner_env; + + if (!snum) { + int i, remaining, low, high, port; +@@ -438,7 +450,7 @@ int __inet_hash_connect(struct 
inet_timewait_death_row *death_row, + local_bh_disable(); + for (i = 1; i <= remaining; i++) { + port = low + (i + offset) % remaining; +- head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; ++ head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size, VEID(ve))]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, +@@ -451,14 +463,14 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, + if (tb->fastreuse >= 0) + goto next_port; + if (!check_established(death_row, sk, +- port, &tw)) ++ port, &tw, ve)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, +- net, head, port); ++ net, head, port, ve); + if (!tb) { + spin_unlock(&head->lock); + break; +@@ -493,7 +505,7 @@ ok: + goto out; + } + +- head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; ++ head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size, VEID(ve))]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { +@@ -503,7 +515,7 @@ ok: + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ +- ret = check_established(death_row, sk, snum, NULL); ++ ret = check_established(death_row, sk, snum, NULL, ve); + out: + local_bh_enable(); + return ret; +diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c +index ce16e9a..bd67750 100644 +--- a/net/ipv4/inet_timewait_sock.c ++++ b/net/ipv4/inet_timewait_sock.c +@@ -13,6 +13,8 @@ + #include + #include + ++#include ++ + /* Must be called with locally disabled BHs. */ + static void __inet_twsk_kill(struct inet_timewait_sock *tw, + struct inet_hashinfo *hashinfo) +@@ -32,7 +34,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, + write_unlock(lock); + + /* Disassociate with bind bucket. */ +- bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; ++ bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, ++ hashinfo->bhash_size, tw->tw_owner_env)]; + spin_lock(&bhead->lock); + tb = tw->tw_tb; + __hlist_del(&tw->tw_bind_node); +@@ -81,7 +84,8 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, + Note, that any socket with inet->num != 0 MUST be bound in + binding cache, even if it is closed. 
+ */ +- bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; ++ bhead = &hashinfo->bhash[inet_bhashfn(inet->num, ++ hashinfo->bhash_size, tw->tw_owner_env)]; + spin_lock(&bhead->lock); + tw->tw_tb = icsk->icsk_bind_hash; + BUG_TRAP(icsk->icsk_bind_hash); +@@ -105,9 +109,14 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); + + struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) + { +- struct inet_timewait_sock *tw = +- kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, +- GFP_ATOMIC); ++ struct user_beancounter *ub; ++ struct inet_timewait_sock *tw; ++ ++ ub = set_exec_ub(sock_bc(sk)->ub); ++ tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, ++ GFP_ATOMIC); ++ (void)set_exec_ub(ub); ++ + if (tw != NULL) { + const struct inet_sock *inet = inet_sk(sk); + +@@ -156,6 +165,7 @@ static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, + rescan: + inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { + __inet_twsk_del_dead_node(tw); ++ ub_timewait_dec(tw, twdr); + spin_unlock(&twdr->death_lock); + __inet_twsk_kill(tw, twdr->hashinfo); + inet_twsk_put(tw); +@@ -252,6 +262,7 @@ void inet_twsk_deschedule(struct inet_timewait_sock *tw, + { + spin_lock(&twdr->death_lock); + if (inet_twsk_del_dead_node(tw)) { ++ ub_timewait_dec(tw, twdr); + inet_twsk_put(tw); + if (--twdr->tw_count == 0) + del_timer(&twdr->tw_timer); +@@ -298,9 +309,10 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, + spin_lock(&twdr->death_lock); + + /* Unlink it, if it was scheduled */ +- if (inet_twsk_del_dead_node(tw)) ++ if (inet_twsk_del_dead_node(tw)) { ++ ub_timewait_dec(tw, twdr); + twdr->tw_count--; +- else ++ } else + atomic_inc(&tw->tw_refcnt); + + if (slot >= INET_TWDR_RECYCLE_SLOTS) { +@@ -336,6 +348,7 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, + + hlist_add_head(&tw->tw_death_node, list); + ++ ub_timewait_inc(tw, twdr); + if (twdr->tw_count++ == 0) + mod_timer(&twdr->tw_timer, jiffies + twdr->period); + spin_unlock(&twdr->death_lock); +@@ -370,6 +383,7 @@ void inet_twdr_twcal_tick(unsigned long data) + &twdr->twcal_row[slot]) { + __inet_twsk_del_dead_node(tw); + __inet_twsk_kill(tw, twdr->hashinfo); ++ ub_timewait_dec(tw, twdr); + inet_twsk_put(tw); + killed++; + } +diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c +index 4813c39..d9af146 100644 +--- a/net/ipv4/ip_forward.c ++++ b/net/ipv4/ip_forward.c +@@ -93,6 +93,24 @@ int ip_forward(struct sk_buff *skb) + goto drop; + } + ++ /* ++ * We try to optimize forwarding of VE packets: ++ * do not decrement TTL (and so save skb_cow) ++ * during forwarding of outgoing pkts from VE. ++ * For incoming pkts we still do ttl decr, ++ * since such skb is not cloned and does not require ++ * actual cow. So, there is at least one place ++ * in pkts path with mandatory ttl decr, that is ++ * sufficient to prevent routing loops. ++ */ ++ iph = ip_hdr(skb); ++ if ( ++#ifdef CONFIG_IP_ROUTE_NAT ++ (rt->rt_flags & RTCF_NAT) == 0 && /* no NAT mangling expected */ ++#endif /* and */ ++ (skb->dev->features & NETIF_F_VENET)) /* src is VENET device */ ++ goto no_ttl_decr; ++ + /* We are about to mangle packet. Copy it! */ + if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) + goto drop; +@@ -101,6 +119,8 @@ int ip_forward(struct sk_buff *skb) + /* Decrease ttl after skb cow done */ + ip_decrease_ttl(iph); + ++no_ttl_decr: ++ + /* + * We now generate an ICMP HOST REDIRECT giving the route + * we calculated. 
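The ip_forward() change above rests on the invariant its comment spells out: TTL must be decremented (with the skb_cow() copy that editing the header requires) on at least one hop of any possible path, so routing loops still terminate even when outgoing VE packets skip the decrement. The saving is really the avoided skb_cow(): per the comment, outgoing skbs from a VE are cloned, and copying them just to edit one header byte is the expensive part. What follows is a minimal user-space sketch of that decision only; the flag values and the pkt type are illustrative stand-ins, not the kernel's definitions, and just the skip condition mirrors the hunk.

#include <stdbool.h>
#include <stdio.h>

#define NETIF_F_VENET 0x1	/* stand-in value for the kernel feature flag */
#define RTCF_NAT      0x2	/* stand-in value for the route-cache flag    */

struct pkt {
	unsigned dev_features;	/* features of the source device */
	unsigned rt_flags;	/* flags of the matched route    */
	unsigned char ttl;
};

/* True when the skb_cow()+ip_decrease_ttl() step may be skipped:
 * the packet left a VENET device and no NAT mangling is expected,
 * so the shared data need not be copied just to edit the TTL. */
static bool skip_ttl_decr(const struct pkt *p)
{
	return !(p->rt_flags & RTCF_NAT) &&
	       (p->dev_features & NETIF_F_VENET);
}

int main(void)
{
	struct pkt from_ve  = { NETIF_F_VENET, 0, 64 };
	struct pkt external = { 0,             0, 64 };

	if (!skip_ttl_decr(&from_ve))
		from_ve.ttl--;
	if (!skip_ttl_decr(&external))
		external.ttl--;

	/* prints: ve=64 ext=63 */
	printf("ve=%u ext=%u\n", from_ve.ttl, external.ttl);
	return 0;
}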
+diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c +index 37221f6..b74fe5e 100644 +--- a/net/ipv4/ip_fragment.c ++++ b/net/ipv4/ip_fragment.c +@@ -188,9 +188,12 @@ static void ip_evictor(struct net *net) + */ + static void ip_expire(unsigned long arg) + { ++ struct inet_frag_queue *q = (struct inet_frag_queue *)arg; + struct ipq *qp; ++ struct ve_struct *old_ve; + +- qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); ++ qp = container_of(q, struct ipq, q); ++ old_ve = set_exec_env(q->owner_ve); + + spin_lock(&qp->q.lock); + +@@ -216,6 +219,8 @@ static void ip_expire(unsigned long arg) + out: + spin_unlock(&qp->q.lock); + ipq_put(qp); ++ ++ (void)set_exec_env(old_ve); + } + + /* Find the correct entry in the "incomplete datagrams" queue for +@@ -523,6 +528,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, + clone->csum = 0; + clone->ip_summed = head->ip_summed; + atomic_add(clone->truesize, &qp->q.net->mem); ++ clone->owner_env = head->owner_env; + } + + skb_shinfo(head)->frag_list = head->next; +diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c +index ff77a4a..6408845 100644 +--- a/net/ipv4/ip_input.c ++++ b/net/ipv4/ip_input.c +@@ -201,6 +201,8 @@ static int ip_local_deliver_finish(struct sk_buff *skb) + { + struct net *net = dev_net(skb->dev); + ++ if (skb->destructor) ++ skb_orphan(skb); + __skb_pull(skb, ip_hdrlen(skb)); + + /* Point into the IP datagram, just past the header. */ +diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c +index e527628..adc6892 100644 +--- a/net/ipv4/ip_output.c ++++ b/net/ipv4/ip_output.c +@@ -1346,12 +1346,13 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar + char data[40]; + } replyopts; + struct ipcm_cookie ipc; +- __be32 daddr; ++ __be32 saddr, daddr; + struct rtable *rt = skb->rtable; + + if (ip_options_echo(&replyopts.opt, skb)) + return; + ++ saddr = ip_hdr(skb)->daddr; + daddr = ipc.addr = rt->rt_src; + ipc.opt = NULL; + +@@ -1366,7 +1367,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar + struct flowi fl = { .oif = arg->bound_dev_if, + .nl_u = { .ip4_u = + { .daddr = daddr, +- .saddr = rt->rt_spec_dst, ++ .saddr = saddr, + .tos = RT_TOS(ip_hdr(skb)->tos) } }, + /* Not quite clean, but right. */ + .uli_u = { .ports = +diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c +index ed45037..6fab8a3 100644 +--- a/net/ipv4/ipconfig.c ++++ b/net/ipv4/ipconfig.c +@@ -189,19 +189,20 @@ static int __init ic_open_devs(void) + struct ic_device *d, **last; + struct net_device *dev; + unsigned short oflags; ++ struct net *net = get_exec_env()->ve_netns; + + last = &ic_first_dev; + rtnl_lock(); + + /* bring loopback device up first */ +- for_each_netdev(&init_net, dev) { ++ for_each_netdev(net, dev) { + if (!(dev->flags & IFF_LOOPBACK)) + continue; + if (dev_change_flags(dev, dev->flags | IFF_UP) < 0) + printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); + } + +- for_each_netdev(&init_net, dev) { ++ for_each_netdev(net, dev) { + if (dev->flags & IFF_LOOPBACK) + continue; + if (user_dev_name[0] ? 
!strcmp(dev->name, user_dev_name) : +@@ -434,9 +435,6 @@ ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt + unsigned char *sha, *tha; /* s for "source", t for "target" */ + struct ic_device *d; + +- if (dev_net(dev) != &init_net) +- goto drop; +- + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + return NET_RX_DROP; + +@@ -854,9 +852,6 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str + struct ic_device *d; + int len, ext_len; + +- if (dev_net(dev) != &init_net) +- goto drop; +- + /* Perform verifications before taking the lock. */ + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; +diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c +index 11700a4..247d1cc 100644 +--- a/net/ipv4/ipmr.c ++++ b/net/ipv4/ipmr.c +@@ -42,6 +42,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -123,9 +124,10 @@ static struct timer_list ipmr_expire_timer; + static + struct net_device *ipmr_new_tunnel(struct vifctl *v) + { ++ struct net *net = get_exec_env()->ve_netns; + struct net_device *dev; + +- dev = __dev_get_by_name(&init_net, "tunl0"); ++ dev = __dev_get_by_name(net, "tunl0"); + + if (dev) { + int err; +@@ -149,7 +151,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) + + dev = NULL; + +- if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) { ++ if (err == 0 && (dev = __dev_get_by_name(net, p.name)) != NULL) { + dev->flags |= IFF_MULTICAST; + + in_dev = __in_dev_get_rtnl(dev); +@@ -1089,9 +1091,6 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v + struct vif_device *v; + int ct; + +- if (dev_net(dev) != &init_net) +- return NOTIFY_DONE; +- + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + v=&vif_table[0]; +diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c +index 65f1ba1..5f223e8 100644 +--- a/net/ipv4/ipvs/ip_vs_conn.c ++++ b/net/ipv4/ipvs/ip_vs_conn.c +@@ -981,7 +981,7 @@ int ip_vs_conn_init(void) + /* Allocate ip_vs_conn slab cache */ + ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", + sizeof(struct ip_vs_conn), 0, +- SLAB_HWCACHE_ALIGN, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_UBC, NULL); + if (!ip_vs_conn_cachep) { + vfree(ip_vs_conn_tab); + return -ENOMEM; +diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c +index eff54ef..f045d56 100644 +--- a/net/ipv4/ipvs/ip_vs_sync.c ++++ b/net/ipv4/ipvs/ip_vs_sync.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -464,7 +465,8 @@ static int set_mcast_if(struct sock *sk, char *ifname) + struct net_device *dev; + struct inet_sock *inet = inet_sk(sk); + +- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) ++ dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname); ++ if (!dev) + return -ENODEV; + + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) +@@ -485,11 +487,12 @@ static int set_mcast_if(struct sock *sk, char *ifname) + */ + static int set_sync_mesg_maxlen(int sync_state) + { ++ struct net *net = get_exec_env()->ve_netns; + struct net_device *dev; + int num; + + if (sync_state == IP_VS_STATE_MASTER) { +- if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) ++ if ((dev = __dev_get_by_name(net, ip_vs_master_mcast_ifn)) == NULL) + return -ENODEV; + + num = (dev->mtu - sizeof(struct iphdr) - +@@ -500,7 +503,7 @@ static int set_sync_mesg_maxlen(int sync_state) + IP_VS_DBG(7, "setting the maximum length of sync sending " + "message %d.\n", sync_send_mesg_maxlen); + } 
else if (sync_state == IP_VS_STATE_BACKUP) { +- if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) ++ if ((dev = __dev_get_by_name(net, ip_vs_backup_mcast_ifn)) == NULL) + return -ENODEV; + + sync_recv_mesg_maxlen = dev->mtu - +@@ -528,7 +531,8 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) + memset(&mreq, 0, sizeof(mreq)); + memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); + +- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) ++ dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname); ++ if (!dev) + return -ENODEV; + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; +@@ -549,7 +553,8 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) + __be32 addr; + struct sockaddr_in sin; + +- if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) ++ dev = __dev_get_by_name(get_exec_env()->ve_netns, ifname); ++ if (!dev) + return -ENODEV; + + addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); +diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c +index 26a37ce..5ac56af 100644 +--- a/net/ipv4/netfilter/ip_queue.c ++++ b/net/ipv4/netfilter/ip_queue.c +@@ -439,7 +439,7 @@ __ipq_rcv_skb(struct sk_buff *skb) + if (type <= IPQM_BASE) + return; + +- if (security_netlink_recv(skb, CAP_NET_ADMIN)) ++ if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) + RCV_SKB_FAIL(-EPERM); + + write_lock_bh(&queue_lock); +@@ -469,8 +469,12 @@ __ipq_rcv_skb(struct sk_buff *skb) + static void + ipq_rcv_skb(struct sk_buff *skb) + { ++ struct ve_struct *old_ve; ++ + mutex_lock(&ipqnl_mutex); ++ old_ve = set_exec_env(skb->owner_env); + __ipq_rcv_skb(skb); ++ (void)set_exec_env(old_ve); + mutex_unlock(&ipqnl_mutex); + } + +@@ -480,9 +484,6 @@ ipq_rcv_dev_event(struct notifier_block *this, + { + struct net_device *dev = ptr; + +- if (dev_net(dev) != &init_net) +- return NOTIFY_DONE; +- + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + ipq_dev_drop(dev->ifindex); +@@ -502,7 +503,7 @@ ipq_rcv_nl_event(struct notifier_block *this, + if (event == NETLINK_URELEASE && + n->protocol == NETLINK_FIREWALL && n->pid) { + write_lock_bh(&queue_lock); +- if ((n->net == &init_net) && (n->pid == peer_pid)) ++ if (n->pid == peer_pid) + __ipq_reset(); + write_unlock_bh(&queue_lock); + } +diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c +index 4e7c719..18e2717 100644 +--- a/net/ipv4/netfilter/ip_tables.c ++++ b/net/ipv4/netfilter/ip_tables.c +@@ -337,6 +337,9 @@ ipt_do_table(struct sk_buff *skb, + struct ipt_entry *e, *back; + struct xt_table_info *private; + ++ if (!table) /* VE is not allowed to have this xtable */ ++ return NF_ACCEPT; ++ + /* Initialization */ + ip = ip_hdr(skb); + datalen = skb->len - ip->ihl * 4; +@@ -488,8 +491,8 @@ mark_source_chains(struct xt_table_info *newinfo, + int visited = e->comefrom & (1 << hook); + + if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { +- printk("iptables: loop hook %u pos %u %08X.\n", +- hook, pos, e->comefrom); ++ ve_printk(VE_LOG, "iptables: loop hook %u pos " ++ "%u %08X.\n", hook, pos, e->comefrom); + return 0; + } + e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); +@@ -932,7 +935,7 @@ static struct xt_counters * alloc_counters(struct xt_table *table) + (other than comefrom, which userspace doesn't care + about). 
*/ + countersize = sizeof(struct xt_counters) * private->number; +- counters = vmalloc_node(countersize, numa_node_id()); ++ counters = ub_vmalloc_node(countersize, numa_node_id()); + + if (counters == NULL) + return ERR_PTR(-ENOMEM); +@@ -1202,7 +1205,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, + void *loc_cpu_old_entry; + + ret = 0; +- counters = vmalloc(num_counters * sizeof(struct xt_counters)); ++ counters = ub_vmalloc_best(num_counters * sizeof(struct xt_counters)); + if (!counters) { + ret = -ENOMEM; + goto out; +@@ -1374,7 +1377,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat + if (len != size + num_counters * sizeof(struct xt_counters)) + return -EINVAL; + +- paddc = vmalloc_node(len - size, numa_node_id()); ++ paddc = ub_vmalloc_node(len - size, numa_node_id()); + if (!paddc) + return -ENOMEM; + +@@ -1841,13 +1844,15 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) + return ret; + } + ++static int do_ipt_set_ctl(struct sock *, int, void __user *, unsigned int); ++ + static int + compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, + unsigned int len) + { + int ret; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) + return -EPERM; + + switch (cmd) { +@@ -1860,8 +1865,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, + break; + + default: +- duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); +- ret = -EINVAL; ++ ret = do_ipt_set_ctl(sk, cmd, user, len); + } + + return ret; +@@ -1958,7 +1962,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) + { + int ret; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) + return -EPERM; + + switch (cmd) { +@@ -1980,7 +1984,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) + { + int ret; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) + return -EPERM; + + switch (cmd) { +@@ -2005,7 +2009,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) + { + int ret; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) + return -EPERM; + + switch (cmd) { +@@ -2057,7 +2061,7 @@ struct xt_table *ipt_register_table(struct net *net, struct xt_table *table, + int ret; + struct xt_table_info *newinfo; + struct xt_table_info bootstrap +- = { 0, 0, 0, { 0 }, { 0 }, { } }; ++ = { 0, 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; + struct xt_table *new_table; + +@@ -2216,11 +2220,22 @@ static struct xt_match icmp_matchstruct __read_mostly = { + + static int __net_init ip_tables_net_init(struct net *net) + { +- return xt_proto_init(net, AF_INET); ++ int res; ++ ++ if (!net_ipt_module_permitted(net, VE_IP_IPTABLES)) ++ return 0; ++ ++ res = xt_proto_init(net, AF_INET); ++ if (!res) ++ net_ipt_module_set(net, VE_IP_IPTABLES); ++ return res; + } + + static void __net_exit ip_tables_net_exit(struct net *net) + { ++ if (!net_is_ipt_module_set(net, VE_IP_IPTABLES)) ++ return; ++ + xt_proto_fini(net, AF_INET); + } + +diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c +index 1819ad7..25223a8 100644 +--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c ++++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -388,7 +389,8 @@ clusterip_tg_check(const char *tablename, 
const void *e_void, + return false; + } + +- dev = dev_get_by_name(&init_net, e->ip.iniface); ++ dev = dev_get_by_name(get_exec_env()->ve_netns, ++ e->ip.iniface); + if (!dev) { + printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface); + return false; +diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c +index 0af1413..08a4bcd 100644 +--- a/net/ipv4/netfilter/ipt_LOG.c ++++ b/net/ipv4/netfilter/ipt_LOG.c +@@ -47,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info, + + ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); + if (ih == NULL) { +- printk("TRUNCATED"); ++ ve_printk(VE_LOG, "TRUNCATED"); + return; + } + + /* Important fields: + * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ + /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ +- printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", ++ ve_printk(VE_LOG, "SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", + NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ +- printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", ++ ve_printk(VE_LOG, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + + /* Max length: 6 "CE DF MF " */ + if (ntohs(ih->frag_off) & IP_CE) +- printk("CE "); ++ ve_printk(VE_LOG, "CE "); + if (ntohs(ih->frag_off) & IP_DF) +- printk("DF "); ++ ve_printk(VE_LOG, "DF "); + if (ntohs(ih->frag_off) & IP_MF) +- printk("MF "); ++ ve_printk(VE_LOG, "MF "); + + /* Max length: 11 "FRAG:65535 " */ + if (ntohs(ih->frag_off) & IP_OFFSET) +- printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); ++ ve_printk(VE_LOG, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + + if ((logflags & IPT_LOG_IPOPT) + && ih->ihl * 4 > sizeof(struct iphdr)) { +@@ -84,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info, + op = skb_header_pointer(skb, iphoff+sizeof(_iph), + optsize, _opt); + if (op == NULL) { +- printk("TRUNCATED"); ++ ve_printk(VE_LOG, "TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ +- printk("OPT ("); ++ ve_printk(VE_LOG, "OPT ("); + for (i = 0; i < optsize; i++) +- printk("%02X", op[i]); +- printk(") "); ++ ve_printk(VE_LOG, "%02X", op[i]); ++ ve_printk(VE_LOG, ") "); + } + + switch (ih->protocol) { +@@ -101,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info, + const struct tcphdr *th; + + /* Max length: 10 "PROTO=TCP " */ +- printk("PROTO=TCP "); ++ ve_printk(VE_LOG, "PROTO=TCP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; +@@ -110,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info, + th = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) { +- printk("INCOMPLETE [%u bytes] ", ++ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ +- printk("SPT=%u DPT=%u ", ++ ve_printk(VE_LOG, "SPT=%u DPT=%u ", + ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (logflags & IPT_LOG_TCPSEQ) +- printk("SEQ=%u ACK=%u ", ++ ve_printk(VE_LOG, "SEQ=%u ACK=%u ", + ntohl(th->seq), ntohl(th->ack_seq)); + /* Max length: 13 "WINDOW=65535 " */ +- printk("WINDOW=%u ", ntohs(th->window)); ++ ve_printk(VE_LOG, "WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3F " */ +- printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); ++ ve_printk(VE_LOG, "RES=0x%02x ", 
(u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) +- printk("CWR "); ++ ve_printk(VE_LOG, "CWR "); + if (th->ece) +- printk("ECE "); ++ ve_printk(VE_LOG, "ECE "); + if (th->urg) +- printk("URG "); ++ ve_printk(VE_LOG, "URG "); + if (th->ack) +- printk("ACK "); ++ ve_printk(VE_LOG, "ACK "); + if (th->psh) +- printk("PSH "); ++ ve_printk(VE_LOG, "PSH "); + if (th->rst) +- printk("RST "); ++ ve_printk(VE_LOG, "RST "); + if (th->syn) +- printk("SYN "); ++ ve_printk(VE_LOG, "SYN "); + if (th->fin) +- printk("FIN "); ++ ve_printk(VE_LOG, "FIN "); + /* Max length: 11 "URGP=65535 " */ +- printk("URGP=%u ", ntohs(th->urg_ptr)); ++ ve_printk(VE_LOG, "URGP=%u ", ntohs(th->urg_ptr)); + + if ((logflags & IPT_LOG_TCPOPT) + && th->doff * 4 > sizeof(struct tcphdr)) { +@@ -157,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info, + iphoff+ih->ihl*4+sizeof(_tcph), + optsize, _opt); + if (op == NULL) { +- printk("TRUNCATED"); ++ ve_printk(VE_LOG, "TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ +- printk("OPT ("); ++ ve_printk(VE_LOG, "OPT ("); + for (i = 0; i < optsize; i++) +- printk("%02X", op[i]); +- printk(") "); ++ ve_printk(VE_LOG, "%02X", op[i]); ++ ve_printk(VE_LOG, ") "); + } + break; + } +@@ -176,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info, + + if (ih->protocol == IPPROTO_UDP) + /* Max length: 10 "PROTO=UDP " */ +- printk("PROTO=UDP " ); ++ ve_printk(VE_LOG, "PROTO=UDP " ); + else /* Max length: 14 "PROTO=UDPLITE " */ +- printk("PROTO=UDPLITE "); ++ ve_printk(VE_LOG, "PROTO=UDPLITE "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; +@@ -187,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info, + uh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) { +- printk("INCOMPLETE [%u bytes] ", ++ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ +- printk("SPT=%u DPT=%u LEN=%u ", ++ ve_printk(VE_LOG, "SPT=%u DPT=%u LEN=%u ", + ntohs(uh->source), ntohs(uh->dest), + ntohs(uh->len)); + break; +@@ -220,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info, + [ICMP_ADDRESSREPLY] = 12 }; + + /* Max length: 11 "PROTO=ICMP " */ +- printk("PROTO=ICMP "); ++ ve_printk(VE_LOG, "PROTO=ICMP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; +@@ -229,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info, + ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_icmph), &_icmph); + if (ich == NULL) { +- printk("INCOMPLETE [%u bytes] ", ++ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ +- printk("TYPE=%u CODE=%u ", ich->type, ich->code); ++ ve_printk(VE_LOG, "TYPE=%u CODE=%u ", ich->type, ich->code); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (ich->type <= NR_ICMP_TYPES + && required_len[ich->type] + && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { +- printk("INCOMPLETE [%u bytes] ", ++ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } +@@ -250,19 +250,19 @@ static void dump_packet(const struct nf_loginfo *info, + case ICMP_ECHOREPLY: + case ICMP_ECHO: + /* Max length: 19 "ID=65535 SEQ=65535 " */ +- printk("ID=%u SEQ=%u ", ++ ve_printk(VE_LOG, "ID=%u SEQ=%u ", + ntohs(ich->un.echo.id), + ntohs(ich->un.echo.sequence)); + break; + + case 
ICMP_PARAMETERPROB: + /* Max length: 14 "PARAMETER=255 " */ +- printk("PARAMETER=%u ", ++ ve_printk(VE_LOG, "PARAMETER=%u ", + ntohl(ich->un.gateway) >> 24); + break; + case ICMP_REDIRECT: + /* Max length: 24 "GATEWAY=255.255.255.255 " */ +- printk("GATEWAY=%u.%u.%u.%u ", ++ ve_printk(VE_LOG, "GATEWAY=%u.%u.%u.%u ", + NIPQUAD(ich->un.gateway)); + /* Fall through */ + case ICMP_DEST_UNREACH: +@@ -270,16 +270,16 @@ static void dump_packet(const struct nf_loginfo *info, + case ICMP_TIME_EXCEEDED: + /* Max length: 3+maxlen */ + if (!iphoff) { /* Only recurse once. */ +- printk("["); ++ ve_printk(VE_LOG, "["); + dump_packet(info, skb, + iphoff + ih->ihl*4+sizeof(_icmph)); +- printk("] "); ++ ve_printk(VE_LOG, "] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ich->type == ICMP_DEST_UNREACH + && ich->code == ICMP_FRAG_NEEDED) +- printk("MTU=%u ", ntohs(ich->un.frag.mtu)); ++ ve_printk(VE_LOG, "MTU=%u ", ntohs(ich->un.frag.mtu)); + } + break; + } +@@ -292,19 +292,19 @@ static void dump_packet(const struct nf_loginfo *info, + break; + + /* Max length: 9 "PROTO=AH " */ +- printk("PROTO=AH "); ++ ve_printk(VE_LOG, "PROTO=AH "); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ah = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { +- printk("INCOMPLETE [%u bytes] ", ++ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ +- printk("SPI=0x%x ", ntohl(ah->spi)); ++ ve_printk(VE_LOG, "SPI=0x%x ", ntohl(ah->spi)); + break; + } + case IPPROTO_ESP: { +@@ -312,7 +312,7 @@ static void dump_packet(const struct nf_loginfo *info, + const struct ip_esp_hdr *eh; + + /* Max length: 10 "PROTO=ESP " */ +- printk("PROTO=ESP "); ++ ve_printk(VE_LOG, "PROTO=ESP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; +@@ -321,25 +321,25 @@ static void dump_packet(const struct nf_loginfo *info, + eh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_esph), &_esph); + if (eh == NULL) { +- printk("INCOMPLETE [%u bytes] ", ++ ve_printk(VE_LOG, "INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ +- printk("SPI=0x%x ", ntohl(eh->spi)); ++ ve_printk(VE_LOG, "SPI=0x%x ", ntohl(eh->spi)); + break; + } + /* Max length: 10 "PROTO 255 " */ + default: +- printk("PROTO=%u ", ih->protocol); ++ ve_printk(VE_LOG, "PROTO=%u ", ih->protocol); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) +- printk("UID=%u GID=%u ", ++ ve_printk(VE_LOG, "UID=%u GID=%u ", + skb->sk->sk_socket->file->f_uid, + skb->sk->sk_socket->file->f_gid); + read_unlock_bh(&skb->sk->sk_callback_lock); +@@ -387,7 +387,7 @@ ipt_log_packet(unsigned int pf, + loginfo = &default_loginfo; + + spin_lock_bh(&log_lock); +- printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, ++ ve_printk(VE_LOG, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + prefix, + in ? in->name : "", + out ? 
out->name : ""); +@@ -398,30 +398,30 @@ ipt_log_packet(unsigned int pf, + + physindev = skb->nf_bridge->physindev; + if (physindev && in != physindev) +- printk("PHYSIN=%s ", physindev->name); ++ ve_printk(VE_LOG, "PHYSIN=%s ", physindev->name); + physoutdev = skb->nf_bridge->physoutdev; + if (physoutdev && out != physoutdev) +- printk("PHYSOUT=%s ", physoutdev->name); ++ ve_printk(VE_LOG, "PHYSOUT=%s ", physoutdev->name); + } + #endif + + if (in && !out) { + /* MAC logging for input chain only. */ +- printk("MAC="); ++ ve_printk(VE_LOG, "MAC="); + if (skb->dev && skb->dev->hard_header_len + && skb->mac_header != skb->network_header) { + int i; + const unsigned char *p = skb_mac_header(skb); + for (i = 0; i < skb->dev->hard_header_len; i++,p++) +- printk("%02x%c", *p, ++ ve_printk(VE_LOG, "%02x%c", *p, + i==skb->dev->hard_header_len - 1 + ? ' ':':'); + } else +- printk(" "); ++ ve_printk(VE_LOG, " "); + } + + dump_packet(loginfo, skb, 0); +- printk("\n"); ++ ve_printk(VE_LOG, "\n"); + spin_unlock_bh(&log_lock); + } + +diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c +index 84c26dd..85e4a69 100644 +--- a/net/ipv4/netfilter/ipt_MASQUERADE.c ++++ b/net/ipv4/netfilter/ipt_MASQUERADE.c +@@ -98,6 +98,7 @@ masquerade_tg(struct sk_buff *skb, const struct net_device *in, + return nf_nat_setup_info(ct, &newrange, IP_NAT_MANIP_SRC); + } + ++#if 0 + static int + device_cmp(struct nf_conn *i, void *ifindex) + { +@@ -120,9 +121,6 @@ static int masq_device_event(struct notifier_block *this, + { + const struct net_device *dev = ptr; + +- if (dev_net(dev) != &init_net) +- return NOTIFY_DONE; +- + if (event == NETDEV_DOWN) { + /* Device was downed. Search entire table for + conntracks which were associated with that device, +@@ -150,6 +148,7 @@ static struct notifier_block masq_dev_notifier = { + static struct notifier_block masq_inet_notifier = { + .notifier_call = masq_inet_event, + }; ++#endif + + static struct xt_target masquerade_tg_reg __read_mostly = { + .name = "MASQUERADE", +@@ -168,12 +167,16 @@ static int __init masquerade_tg_init(void) + + ret = xt_register_target(&masquerade_tg_reg); + ++#if 0 ++/* These notifiers are unnecessary and may ++ lead to oops in virtual environments */ + if (ret == 0) { + /* Register for device down reports */ + register_netdevice_notifier(&masq_dev_notifier); + /* Register IP address change reports */ + register_inetaddr_notifier(&masq_inet_notifier); + } ++#endif + + return ret; + } +@@ -181,8 +184,8 @@ static int __init masquerade_tg_init(void) + static void __exit masquerade_tg_exit(void) + { + xt_unregister_target(&masquerade_tg_reg); +- unregister_netdevice_notifier(&masq_dev_notifier); +- unregister_inetaddr_notifier(&masq_inet_notifier); ++/* unregister_netdevice_notifier(&masq_dev_notifier); ++ unregister_inetaddr_notifier(&masq_inet_notifier);*/ + } + + module_init(masquerade_tg_init); +diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c +index 5c62924..99dfc92 100644 +--- a/net/ipv4/netfilter/ipt_REDIRECT.c ++++ b/net/ipv4/netfilter/ipt_REDIRECT.c +@@ -72,8 +72,13 @@ redirect_tg(struct sk_buff *skb, const struct net_device *in, + + rcu_read_lock(); + indev = __in_dev_get_rcu(skb->dev); +- if (indev && (ifa = indev->ifa_list)) ++ if (indev && (ifa = indev->ifa_list)) { ++ /* because of venet device specific, we should use ++ * second ifa in the list */ ++ if (IN_LOOPBACK(ntohl(ifa->ifa_local)) && ifa->ifa_next) ++ ifa = ifa->ifa_next; + newdst = ifa->ifa_local; ++ } + rcu_read_unlock(); + 
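
/*
 * The REDIRECT hunk above compensates for venet's address ordering:
 * inside a container the first entry on ifa_list is usually the
 * 127.0.0.1 alias, so naively taking ifa_list->ifa_local would
 * redirect traffic to loopback.  A minimal sketch of that selection
 * rule, assuming mainline 2.6.26 inetdevice types and a caller that
 * already holds rcu_read_lock() as redirect_tg() does; the helper
 * name is illustrative, not part of this patch.
 */
static __be32 venet_pick_local(const struct in_device *indev)
{
	const struct in_ifaddr *ifa = indev->ifa_list;

	if (ifa == NULL)
		return 0;
	/* venet lists 127.0.0.1 first; prefer the next, real address */
	if (IN_LOOPBACK(ntohl(ifa->ifa_local)) && ifa->ifa_next)
		ifa = ifa->ifa_next;
	return ifa->ifa_local;
}
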
+ if (!newdst) +diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c +index 2639872..6b1fcf8 100644 +--- a/net/ipv4/netfilter/ipt_REJECT.c ++++ b/net/ipv4/netfilter/ipt_REJECT.c +@@ -186,13 +186,13 @@ reject_tg_check(const char *tablename, const void *e_void, + const struct ipt_entry *e = e_void; + + if (rejinfo->with == IPT_ICMP_ECHOREPLY) { +- printk("ipt_REJECT: ECHOREPLY no longer supported.\n"); ++ ve_printk(VE_LOG, "ipt_REJECT: ECHOREPLY no longer supported.\n"); + return false; + } else if (rejinfo->with == IPT_TCP_RESET) { + /* Must specify that it's a TCP packet */ + if (e->ip.proto != IPPROTO_TCP + || (e->ip.invflags & XT_INV_PROTO)) { +- printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n"); ++ ve_printk(VE_LOG, "ipt_REJECT: TCP_RESET invalid for non-tcp\n"); + return false; + } + } +diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c +index 21cb053..43d5667 100644 +--- a/net/ipv4/netfilter/ipt_recent.c ++++ b/net/ipv4/netfilter/ipt_recent.c +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -52,6 +53,19 @@ MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files"); + MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files"); + MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files"); + ++#include ++ ++#if defined(CONFIG_VE_IPTABLES) ++#define tables (get_exec_env()->_ipt_recent->tables) ++#define proc_dir (get_exec_env()->_ipt_recent->proc_dir) ++#else ++static LIST_HEAD(tables); ++static struct proc_dir_entry *proc_dir; ++#endif /* CONFIG_VE_IPTABLES */ ++ ++static int init_ipt_recent(struct ve_struct *ve); ++static void fini_ipt_recent(struct ve_struct *ve); ++ + struct recent_entry { + struct list_head list; + struct list_head lru_list; +@@ -74,12 +88,10 @@ struct recent_table { + struct list_head iphash[0]; + }; + +-static LIST_HEAD(tables); + static DEFINE_SPINLOCK(recent_lock); + static DEFINE_MUTEX(recent_mutex); + + #ifdef CONFIG_PROC_FS +-static struct proc_dir_entry *proc_dir; + static const struct file_operations recent_fops; + #endif + +@@ -258,6 +270,9 @@ recent_mt_check(const char *tablename, const void *ip, + strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) + return false; + ++ if (init_ipt_recent(get_exec_env())) ++ return 0; ++ + mutex_lock(&recent_mutex); + t = recent_table_lookup(info->name); + if (t != NULL) { +@@ -298,6 +313,13 @@ static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) + { + const struct ipt_recent_info *info = matchinfo; + struct recent_table *t; ++ struct ve_struct *ve; ++ ++ ve = get_exec_env(); ++#ifdef CONFIG_VE_IPTABLES ++ if (!ve->_ipt_recent) ++ return; ++#endif + + mutex_lock(&recent_mutex); + t = recent_table_lookup(info->name); +@@ -312,6 +334,8 @@ static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) + kfree(t); + } + mutex_unlock(&recent_mutex); ++ if (!ve_is_super(ve) && list_empty(&tables)) ++ fini_ipt_recent(ve); + } + + #ifdef CONFIG_PROC_FS +@@ -467,6 +491,49 @@ static struct xt_match recent_mt_reg __read_mostly = { + .me = THIS_MODULE, + }; + ++static int init_ipt_recent(struct ve_struct *ve) ++{ ++ int err = 0; ++ ++#ifdef CONFIG_VE_IPTABLES ++ if (ve->_ipt_recent) ++ return 0; ++ ++ ve->_ipt_recent = kzalloc(sizeof(struct ve_ipt_recent), GFP_KERNEL); ++ if (!ve->_ipt_recent) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ INIT_LIST_HEAD(&tables); ++#endif ++#ifdef CONFIG_PROC_FS ++ if (err) ++ return err; ++ 
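
/*
 * Here ipt_recent's former globals become per-VE: the tables and
 * proc_dir macros above resolve into a small container hung off
 * get_exec_env(), allocated lazily by the first "-m recent" rule
 * loaded in a VE and torn down when the last one is destroyed.
 * A hedged sketch of that lazy setup, reusing the field names from
 * the hunk; the helper name and GFP context are assumptions.
 */
static int ve_recent_alloc(struct ve_struct *ve)
{
	if (ve->_ipt_recent != NULL)	/* already initialized */
		return 0;
	ve->_ipt_recent = kzalloc(sizeof(struct ve_ipt_recent), GFP_KERNEL);
	if (ve->_ipt_recent == NULL)
		return -ENOMEM;
	INIT_LIST_HEAD(&ve->_ipt_recent->tables);
	return 0;
}
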
proc_dir = proc_mkdir("ipt_recent", ve->ve_netns->proc_net); ++ if (proc_dir == NULL) { ++ err = -ENOMEM; ++ goto out_mem; ++ } ++#endif ++out: ++ return err; ++out_mem: ++#ifdef CONFIG_VE_IPTABLES ++ kfree(ve->_ipt_recent); ++#endif ++ goto out; ++} ++ ++static void fini_ipt_recent(struct ve_struct *ve) ++{ ++ remove_proc_entry("ipt_recent", ve->ve_netns->proc_net); ++#ifdef CONFIG_VE_IPTABLES ++ kfree(ve->_ipt_recent); ++ ve->_ipt_recent = NULL; ++#endif ++} ++ + static int __init recent_mt_init(void) + { + int err; +@@ -476,25 +543,24 @@ static int __init recent_mt_init(void) + ip_list_hash_size = 1 << fls(ip_list_tot); + + err = xt_register_match(&recent_mt_reg); +-#ifdef CONFIG_PROC_FS + if (err) + return err; +- proc_dir = proc_mkdir("ipt_recent", init_net.proc_net); +- if (proc_dir == NULL) { ++ ++ err = init_ipt_recent(&ve0); ++ if (err) { + xt_unregister_match(&recent_mt_reg); +- err = -ENOMEM; ++ return err; + } +-#endif +- return err; ++ ++ return 0; + } + + static void __exit recent_mt_exit(void) + { + BUG_ON(!list_empty(&tables)); ++ ++ fini_ipt_recent(&ve0); + xt_unregister_match(&recent_mt_reg); +-#ifdef CONFIG_PROC_FS +- remove_proc_entry("ipt_recent", init_net.proc_net); +-#endif + } + + module_init(recent_mt_init); +diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c +index 1ea677d..12c4c2b 100644 +--- a/net/ipv4/netfilter/iptable_filter.c ++++ b/net/ipv4/netfilter/iptable_filter.c +@@ -134,16 +134,24 @@ module_param(forward, bool, 0000); + + static int __net_init iptable_filter_net_init(struct net *net) + { ++ if (!net_ipt_module_permitted(net, VE_IP_FILTER)) ++ return 0; ++ + /* Register table */ + net->ipv4.iptable_filter = + ipt_register_table(net, &packet_filter, &initial_table.repl); + if (IS_ERR(net->ipv4.iptable_filter)) + return PTR_ERR(net->ipv4.iptable_filter); ++ ++ net_ipt_module_set(net, VE_IP_FILTER); + return 0; + } + + static void __net_exit iptable_filter_net_exit(struct net *net) + { ++ if (!net_is_ipt_module_set(net, VE_IP_FILTER)) ++ return; ++ + ipt_unregister_table(net->ipv4.iptable_filter); + } + +diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c +index da59182..f6343d8 100644 +--- a/net/ipv4/netfilter/iptable_mangle.c ++++ b/net/ipv4/netfilter/iptable_mangle.c +@@ -203,16 +203,24 @@ static struct nf_hook_ops ipt_ops[] __read_mostly = { + + static int __net_init iptable_mangle_net_init(struct net *net) + { ++ if (!net_ipt_module_permitted(net, VE_IP_MANGLE)) ++ return 0; ++ + /* Register table */ + net->ipv4.iptable_mangle = + ipt_register_table(net, &packet_mangler, &initial_table.repl); + if (IS_ERR(net->ipv4.iptable_mangle)) + return PTR_ERR(net->ipv4.iptable_mangle); ++ ++ net_ipt_module_set(net, VE_IP_MANGLE); + return 0; + } + + static void __net_exit iptable_mangle_net_exit(struct net *net) + { ++ if (!net_is_ipt_module_set(net, VE_IP_MANGLE)) ++ return; ++ + ipt_unregister_table(net->ipv4.iptable_mangle); + } + +diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +index 5a955c4..dca8da7 100644 +--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c ++++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -417,66 +418,226 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET)); + MODULE_ALIAS("ip_conntrack"); + MODULE_LICENSE("GPL"); + +-static int __init nf_conntrack_l3proto_ipv4_init(void) ++#ifdef CONFIG_VE_IPTABLES ++#if 
defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) ++static int nf_ct_proto_ipv4_sysctl_init(void) + { +- int ret = 0; ++ struct nf_conntrack_l3proto *ipv4 = ve_nf_conntrack_l3proto_ipv4; ++ struct ctl_table *ct_table; ++ struct net *net = get_exec_env()->ve_netns; + +- need_conntrack(); ++ ct_table = ip_ct_sysctl_table; + +- ret = nf_register_sockopt(&so_getorigdst); +- if (ret < 0) { +- printk(KERN_ERR "Unable to register netfilter socket option\n"); +- return ret; ++ if (net != &init_net) { ++ ct_table = kmemdup(ct_table, sizeof(ip_ct_sysctl_table), ++ GFP_KERNEL); ++ if (!ct_table) ++ return -ENOMEM; ++ } ++ ++ ipv4->ctl_table_header = NULL; ++ ipv4->ctl_table_path = nf_net_ipv4_netfilter_sysctl_path; ++ ipv4->ctl_table = ct_table; ++ ++ ipv4->ctl_table[0].data = &ve_nf_conntrack_max; ++ ipv4->ctl_table[1].data = &ve_nf_conntrack_count; ++ ipv4->ctl_table[3].data = &ve_nf_conntrack_checksum; ++ ipv4->ctl_table[4].data = &ve_nf_ct_log_invalid; ++ ++ return 0; ++} ++ ++static void nf_ct_proto_ipv4_sysctl_cleanup(void) ++{ ++ struct net *net = get_exec_env()->ve_netns; ++ ++ if (net != &init_net) { ++ kfree(ve_nf_conntrack_l3proto_ipv4->ctl_table); + } ++} ++#else ++static inline int nf_ct_proto_ipv4_sysctl_init(void) ++{ ++ return 0; ++} ++static inline void nf_ct_proto_ipv4_sysctl_cleanup(void) ++{ ++} ++#endif /* SYSCTL && NF_CONNTRACK_PROC_COMPAT */ ++ ++/* ++ * Functions init/fini_nf_ct_l3proto_ipv4 glue distributed nf_conntrack ++ * virtualization efforts. They are to be called from 2 places: ++ * ++ * 1) on loading/unloading module nf_conntrack_ipv4 from ++ * nf_conntrack_l3proto_ipv4_init/fini ++ * 2) on start/stop ve - from do_ve_iptables ++ */ ++static int nf_ct_proto_ipv4_init(void) ++{ ++ struct nf_conntrack_l3proto *ipv4; ++ ++ if (ve_is_super(get_exec_env())) { ++ ipv4 = &nf_conntrack_l3proto_ipv4; ++ goto out; ++ } ++ ipv4 = kmemdup(&nf_conntrack_l3proto_ipv4, ++ sizeof(struct nf_conntrack_l3proto), GFP_KERNEL); ++ if (!ipv4) ++ return -ENOMEM; ++out: ++ ve_nf_conntrack_l3proto_ipv4 = ipv4; ++ return 0; ++} ++ ++static void nf_ct_proto_ipv4_fini(void) ++{ ++ if (!ve_is_super(get_exec_env())) ++ kfree(ve_nf_conntrack_l3proto_ipv4); ++} ++#endif ++ ++int init_nf_ct_l3proto_ipv4(void) ++{ ++ int ret = -ENOMEM; ++ int do_hooks = ve_is_super(get_exec_env()); ++ ++#ifdef CONFIG_VE_IPTABLES ++ if (!ve_is_super(get_exec_env())) ++ __module_get(THIS_MODULE); ++ ++ ret = nf_ct_proto_ipv4_init(); ++ if (ret < 0) ++ goto err_out; ++ ret = nf_ct_proto_ipv4_sysctl_init(); ++ if (ret < 0) ++ goto no_mem_ipv4; ++ ret = nf_ct_proto_tcp_sysctl_init(); ++ if (ret < 0) ++ goto no_mem_tcp; ++ ret = nf_ct_proto_udp_sysctl_init(); ++ if (ret < 0) ++ goto no_mem_udp; ++ ret = nf_ct_proto_icmp_sysctl_init(); ++ if (ret < 0) ++ goto no_mem_icmp; ++#endif /* CONFIG_VE_IPTABLES */ + +- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp4); ++ ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_tcp4); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register tcp.\n"); +- goto cleanup_sockopt; ++ goto cleanup_sys; + } + +- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp4); ++ ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_udp4); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register udp.\n"); +- goto cleanup_tcp; ++ goto unreg_tcp; + } + +- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmp); ++ ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_icmp); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register 
icmp.\n"); +- goto cleanup_udp; ++ goto unreg_udp; + } + +- ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv4); ++ ret = nf_conntrack_l3proto_register(ve_nf_conntrack_l3proto_ipv4); + if (ret < 0) { + printk("nf_conntrack_ipv4: can't register ipv4\n"); +- goto cleanup_icmp; ++ goto unreg_icmp; + } + +- ret = nf_register_hooks(ipv4_conntrack_ops, +- ARRAY_SIZE(ipv4_conntrack_ops)); +- if (ret < 0) { +- printk("nf_conntrack_ipv4: can't register hooks.\n"); +- goto cleanup_ipv4; ++ if (do_hooks) { ++ ret = nf_register_hooks(ipv4_conntrack_ops, ++ ARRAY_SIZE(ipv4_conntrack_ops)); ++ if (ret < 0) { ++ printk("nf_conntrack_ipv4: can't register hooks.\n"); ++ goto unreg_ipv4; ++ } + } +-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) + ret = nf_conntrack_ipv4_compat_init(); + if (ret < 0) +- goto cleanup_hooks; +-#endif ++ goto unreg_hooks; ++ return 0; ++ ++unreg_hooks: ++ if (do_hooks) ++ nf_unregister_hooks(ipv4_conntrack_ops, ++ ARRAY_SIZE(ipv4_conntrack_ops)); ++unreg_ipv4: ++ nf_conntrack_l3proto_unregister(ve_nf_conntrack_l3proto_ipv4); ++unreg_icmp: ++ nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmp); ++unreg_udp: ++ nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp4); ++unreg_tcp: ++ nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp4); ++cleanup_sys: ++#ifdef CONFIG_VE_IPTABLES ++no_mem_icmp: ++ nf_ct_proto_udp_sysctl_cleanup(); ++no_mem_udp: ++ nf_ct_proto_tcp_sysctl_cleanup(); ++no_mem_tcp: ++ nf_ct_proto_ipv4_sysctl_cleanup(); ++no_mem_ipv4: ++ nf_ct_proto_ipv4_fini(); ++err_out: ++ if (!ve_is_super(get_exec_env())) ++ module_put(THIS_MODULE); ++#endif /* CONFIG_VE_IPTABLES */ + return ret; +-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) +- cleanup_hooks: +- nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); +-#endif +- cleanup_ipv4: +- nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); +- cleanup_icmp: +- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); +- cleanup_udp: +- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); +- cleanup_tcp: +- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); ++} ++EXPORT_SYMBOL(init_nf_ct_l3proto_ipv4); ++ ++void fini_nf_ct_l3proto_ipv4(void) ++{ ++ int do_hooks = ve_is_super(get_exec_env()); ++ ++ nf_conntrack_ipv4_compat_fini(); ++ if (do_hooks) ++ nf_unregister_hooks(ipv4_conntrack_ops, ++ ARRAY_SIZE(ipv4_conntrack_ops)); ++ ++ nf_conntrack_l3proto_unregister(ve_nf_conntrack_l3proto_ipv4); ++ nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmp); ++ nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp4); ++ nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp4); ++ ++#ifdef CONFIG_VE_IPTABLES ++ nf_ct_proto_icmp_sysctl_cleanup(); ++ nf_ct_proto_udp_sysctl_cleanup(); ++ nf_ct_proto_tcp_sysctl_cleanup(); ++ nf_ct_proto_ipv4_sysctl_cleanup(); ++ nf_ct_proto_ipv4_fini(); ++ if (!ve_is_super(get_exec_env())) ++ module_put(THIS_MODULE); ++#endif /* CONFIG_VE_IPTABLES */ ++} ++EXPORT_SYMBOL(fini_nf_ct_l3proto_ipv4); ++ ++static int __init nf_conntrack_l3proto_ipv4_init(void) ++{ ++ int ret = 0; ++ ++ need_conntrack(); ++ ++ ret = nf_register_sockopt(&so_getorigdst); ++ if (ret < 0) { ++ printk(KERN_ERR "Unable to register netfilter socket option\n"); ++ return ret; ++ } ++ ++ ret = init_nf_ct_l3proto_ipv4(); ++ if (ret < 0) { ++ printk(KERN_ERR "Unable to initialize netfilter protocols\n"); ++ goto cleanup_sockopt; ++ } ++ KSYMRESOLVE(init_nf_ct_l3proto_ipv4); ++ 
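
/*
 * init_nf_ct_l3proto_ipv4()/fini_nf_ct_l3proto_ipv4() above are
 * deliberately callable twice over: once at module load for VE0 and
 * once per container start/stop, with non-VE0 callers pinning the
 * module via __module_get().  The per-VE protocol descriptors are
 * kmemdup'd so each VE can re-point ->ctl_table and its .data slots
 * without touching the shared template.  A sketch of that guard;
 * the helper and its ve_ptr parameter are illustrative.
 */
static int ve_dup_l3proto(struct nf_conntrack_l3proto **ve_ptr,
			  struct nf_conntrack_l3proto *tmpl)
{
	struct nf_conntrack_l3proto *p = tmpl;

	if (!ve_is_super(get_exec_env())) {
		p = kmemdup(tmpl, sizeof(*p), GFP_KERNEL);
		if (p == NULL)
			return -ENOMEM;
	}
	*ve_ptr = p;	/* VE0 keeps the static template */
	return 0;
}
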
KSYMRESOLVE(fini_nf_ct_l3proto_ipv4); ++ KSYMMODRESOLVE(nf_conntrack_ipv4); ++ return ret; ++ + cleanup_sockopt: + nf_unregister_sockopt(&so_getorigdst); + return ret; +@@ -485,14 +646,12 @@ static int __init nf_conntrack_l3proto_ipv4_init(void) + static void __exit nf_conntrack_l3proto_ipv4_fini(void) + { + synchronize_net(); +-#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT) +- nf_conntrack_ipv4_compat_fini(); +-#endif +- nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); +- nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv4); +- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmp); +- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp4); +- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp4); ++ ++ KSYMMODUNRESOLVE(nf_conntrack_ipv4); ++ KSYMUNRESOLVE(init_nf_ct_l3proto_ipv4); ++ KSYMUNRESOLVE(fini_nf_ct_l3proto_ipv4); ++ ++ fini_nf_ct_l3proto_ipv4(); + nf_unregister_sockopt(&so_getorigdst); + } + +diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +index 40a46d4..f73ad01 100644 +--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c ++++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +@@ -9,7 +9,9 @@ + */ + #include + #include ++#include + #include ++#include + #include + #include + +@@ -44,7 +46,7 @@ static struct hlist_node *ct_get_first(struct seq_file *seq) + for (st->bucket = 0; + st->bucket < nf_conntrack_htable_size; + st->bucket++) { +- n = rcu_dereference(nf_conntrack_hash[st->bucket].first); ++ n = rcu_dereference(ve_nf_conntrack_hash[st->bucket].first); + if (n) + return n; + } +@@ -60,7 +62,7 @@ static struct hlist_node *ct_get_next(struct seq_file *seq, + while (head == NULL) { + if (++st->bucket >= nf_conntrack_htable_size) + return NULL; +- head = rcu_dereference(nf_conntrack_hash[st->bucket].first); ++ head = rcu_dereference(ve_nf_conntrack_hash[st->bucket].first); + } + return head; + } +@@ -193,7 +195,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) + struct hlist_node *n; + + for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { +- n = rcu_dereference(nf_ct_expect_hash[st->bucket].first); ++ n = rcu_dereference(ve_nf_ct_expect_hash[st->bucket].first); + if (n) + return n; + } +@@ -209,7 +211,7 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, + while (head == NULL) { + if (++st->bucket >= nf_ct_expect_hsize) + return NULL; +- head = rcu_dereference(nf_ct_expect_hash[st->bucket].first); ++ head = rcu_dereference(ve_nf_ct_expect_hash[st->bucket].first); + } + return head; + } +@@ -326,7 +328,7 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) + + static int ct_cpu_seq_show(struct seq_file *seq, void *v) + { +- unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); ++ unsigned int nr_conntracks = atomic_read(&ve_nf_conntrack_count); + const struct ip_conntrack_stat *st = v; + + if (v == SEQ_START_TOKEN) { +@@ -377,36 +379,91 @@ static const struct file_operations ct_cpu_seq_fops = { + .release = seq_release, + }; + +-int __init nf_conntrack_ipv4_compat_init(void) ++#ifdef CONFIG_VE_IPTABLES ++#define ve_ip_ct_netfilter_table (get_exec_env()->_nf_conntrack->_ip_ct_netfilter_table) ++#define ve_ip_ct_sysctl_header (get_exec_env()->_nf_conntrack->_ip_ct_sysctl_header) ++#else ++#define ve_ip_ct_netfilter_table ip_ct_netfilter_table ++#define ve_ip_ct_sysctl_header ip_ct_sysctl_header ++#endif ++ ++static ctl_table 
ip_ct_netfilter_table[] = { ++ { ++ .procname = "ip_conntrack_max", ++ .data = &nf_conntrack_max, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++ {} ++}; ++ ++static struct ctl_path ip_ct_net_table_path[] = { ++ { .procname = "net", .ctl_name = CTL_NET, }, ++ { .procname = "ipv4", .ctl_name = NET_IPV4, }, ++ {}, ++}; ++ ++int nf_conntrack_ipv4_compat_init(void) + { ++ struct net *net = get_exec_env()->ve_netns; + struct proc_dir_entry *proc, *proc_exp, *proc_stat; ++ static ctl_table *table; + +- proc = proc_net_fops_create(&init_net, "ip_conntrack", 0440, &ct_file_ops); ++ proc = proc_net_fops_create(net, "ip_conntrack", 0440, &ct_file_ops); + if (!proc) + goto err1; + +- proc_exp = proc_net_fops_create(&init_net, "ip_conntrack_expect", 0440, ++ proc_exp = proc_net_fops_create(net, "ip_conntrack_expect", 0440, + &ip_exp_file_ops); + if (!proc_exp) + goto err2; + + proc_stat = proc_create("ip_conntrack", S_IRUGO, +- init_net.proc_net_stat, &ct_cpu_seq_fops); ++ net->proc_net_stat, &ct_cpu_seq_fops); + if (!proc_stat) + goto err3; ++ ++ table = ip_ct_netfilter_table; ++ if (net != &init_net) { ++ table = kmemdup(table, ++ sizeof(ip_ct_netfilter_table), ++ GFP_KERNEL); ++ if (!table) ++ goto err4; ++ } ++ ++ table[0].data = &ve_nf_conntrack_max; ++ ve_ip_ct_sysctl_header = register_net_sysctl_table(net, ++ ip_ct_net_table_path, ++ table); ++ if (!ve_ip_ct_sysctl_header) ++ goto err5; ++ + return 0; + ++err5: ++ if (net != &init_net) ++ kfree(table); ++err4: ++ remove_proc_entry("ip_conntrack", net->proc_net_stat); + err3: +- proc_net_remove(&init_net, "ip_conntrack_expect"); ++ proc_net_remove(net, "ip_conntrack_expect"); + err2: +- proc_net_remove(&init_net, "ip_conntrack"); ++ proc_net_remove(net, "ip_conntrack"); + err1: + return -ENOMEM; + } + +-void __exit nf_conntrack_ipv4_compat_fini(void) ++void nf_conntrack_ipv4_compat_fini(void) + { +- remove_proc_entry("ip_conntrack", init_net.proc_net_stat); +- proc_net_remove(&init_net, "ip_conntrack_expect"); +- proc_net_remove(&init_net, "ip_conntrack"); ++ struct net *net = get_exec_env()->ve_netns; ++ struct ctl_table *table = ve_ip_ct_sysctl_header->ctl_table_arg; ++ ++ unregister_net_sysctl_table(ve_ip_ct_sysctl_header); ++ if (net != &init_net) ++ kfree(table); ++ remove_proc_entry("ip_conntrack", net->proc_net_stat); ++ proc_net_remove(net, "ip_conntrack_expect"); ++ proc_net_remove(net, "ip_conntrack"); + } +diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +index 78ab19a..f510c45 100644 +--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c ++++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +@@ -7,6 +7,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -20,7 +21,7 @@ + #include + #include + +-static unsigned long nf_ct_icmp_timeout __read_mostly = 30*HZ; ++unsigned long nf_ct_icmp_timeout __read_mostly = 30*HZ; + + static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) +@@ -93,7 +94,7 @@ static int icmp_packet(struct nf_conn *ct, + } else { + atomic_inc(&ct->proto.icmp.count); + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); +- nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmp_timeout); ++ nf_ct_refresh_acct(ct, ctinfo, skb, ve_nf_ct_icmp_timeout); + } + + return NF_ACCEPT; +@@ -149,7 +150,7 @@ icmp_error_message(struct sk_buff *skb, + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. 
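
/*
 * nf_conntrack_ipv4_compat_init() above registers the legacy
 * /proc/net/ip_conntrack* files and the ip_conntrack_max sysctl in
 * the VE's own net namespace, kmemdup'ing the ctl_table for every
 * net except init_net so that .data can point at per-VE counters.
 * The same duplicate-then-repoint shape recurs throughout these
 * hunks; a condensed sketch under those assumptions (the helper is
 * illustrative, and error unwinding of the dup on failure is
 * trimmed).
 */
static struct ctl_table_header *
ve_register_net_sysctl(struct net *net, struct ctl_path *path,
		       ctl_table *tmpl, size_t tmpl_size, void *ve_data)
{
	ctl_table *t = tmpl;

	if (net != &init_net) {
		t = kmemdup(tmpl, tmpl_size, GFP_KERNEL);
		if (t == NULL)
			return NULL;
	}
	t[0].data = ve_data;	/* e.g. &ve_nf_conntrack_max */
	return register_net_sysctl_table(net, path, t);
}
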
*/ + if (!nf_ct_invert_tuple(&innertuple, &origtuple, +- &nf_conntrack_l3proto_ipv4, innerproto)) { ++ ve_nf_conntrack_l3proto_ipv4, innerproto)) { + pr_debug("icmp_error_message: no match\n"); + return -NF_ACCEPT; + } +@@ -321,3 +322,64 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly = + #endif + #endif + }; ++ ++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) ++int nf_ct_proto_icmp_sysctl_init(void) ++{ ++ struct nf_conntrack_l4proto *icmp; ++ ++ if (ve_is_super(get_exec_env())) { ++ icmp = &nf_conntrack_l4proto_icmp; ++ goto out; ++ } ++ ++ icmp = kmemdup(&nf_conntrack_l4proto_icmp, ++ sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); ++ if (!icmp) ++ goto no_mem_ct; ++ ++ icmp->ctl_table_header = &ve_icmp_sysctl_header; ++ icmp->ctl_table = kmemdup(icmp_sysctl_table, ++ sizeof(icmp_sysctl_table), GFP_KERNEL); ++ if (icmp->ctl_table == NULL) ++ goto no_mem_sys; ++ icmp->ctl_table[0].data = &ve_nf_ct_icmp_timeout; ++ ++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT ++ icmp->ctl_compat_table_header = ve_icmp_compat_sysctl_header; ++ icmp->ctl_compat_table = kmemdup(icmp_compat_sysctl_table, ++ sizeof(icmp_compat_sysctl_table), ++ GFP_KERNEL); ++ if (icmp->ctl_compat_table == NULL) ++ goto no_mem_compat; ++ icmp->ctl_compat_table[0].data = &ve_nf_ct_icmp_timeout; ++#endif ++out: ++ ve_nf_ct_icmp_timeout = nf_ct_icmp_timeout; ++ ++ ve_nf_conntrack_l4proto_icmp = icmp; ++ return 0; ++ ++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT ++no_mem_compat: ++ kfree(icmp->ctl_table); ++#endif ++no_mem_sys: ++ kfree(icmp); ++no_mem_ct: ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(nf_ct_proto_icmp_sysctl_init); ++ ++void nf_ct_proto_icmp_sysctl_cleanup(void) ++{ ++ if (!ve_is_super(get_exec_env())) { ++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT ++ kfree(ve_nf_conntrack_l4proto_icmp->ctl_compat_table); ++#endif ++ kfree(ve_nf_conntrack_l4proto_icmp->ctl_table); ++ kfree(ve_nf_conntrack_l4proto_icmp); ++ } ++} ++EXPORT_SYMBOL(nf_ct_proto_icmp_sysctl_cleanup); ++#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ +diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c +index d2a887f..f7f832b 100644 +--- a/net/ipv4/netfilter/nf_nat_core.c ++++ b/net/ipv4/netfilter/nf_nat_core.c +@@ -19,6 +19,8 @@ + #include + #include + #include ++#include ++#include + + #include + #include +@@ -33,22 +35,34 @@ + + static DEFINE_SPINLOCK(nf_nat_lock); + +-static struct nf_conntrack_l3proto *l3proto __read_mostly; + + /* Calculated at init based on memory size */ + static unsigned int nf_nat_htable_size __read_mostly; +-static int nf_nat_vmalloced; + ++#define MAX_IP_NAT_PROTO 256 ++ ++#ifdef CONFIG_VE_IPTABLES ++#define ve_nf_nat_protos (get_exec_env()->_nf_conntrack->_nf_nat_protos) ++#define ve_nf_nat_l3proto (get_exec_env()->_nf_conntrack->_nf_nat_l3proto) ++#define ve_bysource (get_exec_env()->_nf_conntrack->_bysource) ++#define ve_nf_nat_vmalloced (get_exec_env()->_nf_conntrack->_nf_nat_vmalloced) ++#else ++static struct nf_conntrack_l3proto *l3proto __read_mostly; ++static int nf_nat_vmalloced; + static struct hlist_head *bysource __read_mostly; + +-#define MAX_IP_NAT_PROTO 256 + static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] + __read_mostly; ++#define ve_nf_nat_protos nf_nat_protos ++#define ve_nf_nat_l3proto l3proto ++#define ve_bysource bysource ++#define ve_nf_nat_vmalloced nf_nat_vmalloced ++#endif + + static inline const struct nf_nat_protocol * + __nf_nat_proto_find(u_int8_t protonum) + { +- return rcu_dereference(nf_nat_protos[protonum]); ++ return 
rcu_dereference(ve_nf_nat_protos[protonum]); + } + + const struct nf_nat_protocol * +@@ -155,7 +169,7 @@ find_appropriate_src(const struct nf_conntrack_tuple *tuple, + const struct hlist_node *n; + + rcu_read_lock(); +- hlist_for_each_entry_rcu(nat, n, &bysource[h], bysource) { ++ hlist_for_each_entry_rcu(nat, n, &ve_bysource[h], bysource) { + ct = nat->ct; + if (same_src(ct, tuple)) { + /* Copy source part from reply tuple. */ +@@ -278,6 +292,22 @@ out: + rcu_read_unlock(); + } + ++void nf_nat_hash_conntrack(struct nf_conn *ct) ++{ ++ struct nf_conn_nat *nat; ++ unsigned int srchash; ++ ++ srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); ++ spin_lock_bh(&nf_nat_lock); ++ /* nf_conntrack_alter_reply might re-allocate exntension aera */ ++ nat = nfct_nat(ct); ++ nat->ct = ct; ++ hlist_add_head_rcu(&nat->bysource, &ve_bysource[srchash]); ++ spin_unlock_bh(&nf_nat_lock); ++ ++} ++EXPORT_SYMBOL_GPL(nf_nat_hash_conntrack); ++ + unsigned int + nf_nat_setup_info(struct nf_conn *ct, + const struct nf_nat_range *range, +@@ -326,17 +356,8 @@ nf_nat_setup_info(struct nf_conn *ct, + } + + /* Place in source hash if this is the first time. */ +- if (have_to_hash) { +- unsigned int srchash; +- +- srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); +- spin_lock_bh(&nf_nat_lock); +- /* nf_conntrack_alter_reply might re-allocate exntension aera */ +- nat = nfct_nat(ct); +- nat->ct = ct; +- hlist_add_head_rcu(&nat->bysource, &bysource[srchash]); +- spin_unlock_bh(&nf_nat_lock); +- } ++ if (have_to_hash) ++ nf_nat_hash_conntrack(ct); + + /* It's done. */ + if (maniptype == IP_NAT_MANIP_DST) +@@ -426,7 +447,6 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, + struct icmphdr icmp; + struct iphdr ip; + } *inside; +- const struct nf_conntrack_l4proto *l4proto; + struct nf_conntrack_tuple inner, target; + int hdrlen = ip_hdrlen(skb); + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); +@@ -463,16 +483,14 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, + "dir %s\n", skb, manip, + dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); + +- /* rcu_read_lock()ed by nf_hook_slow */ +- l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); +- + if (!nf_ct_get_tuple(skb, + ip_hdrlen(skb) + sizeof(struct icmphdr), + (ip_hdrlen(skb) + + sizeof(struct icmphdr) + inside->ip.ihl * 4), + (u_int16_t)AF_INET, + inside->ip.protocol, +- &inner, l3proto, l4proto)) ++ &inner, ve_nf_nat_l3proto, ++ __nf_ct_l4proto_find(PF_INET, inside->ip.protocol))) + return 0; + + /* Change inner back to look like incoming packet. 
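
/*
 * nf_nat_hash_conntrack() above is the by-source hash insertion
 * factored out of nf_nat_setup_info() and exported; the point of
 * the split is to let code outside nf_nat_core (for example a
 * checkpoint/restore path) re-insert a rebuilt conntrack into the
 * per-VE ve_bysource table.  A hypothetical usage sketch, assuming
 * the conntrack already carries a NAT extension; the caller below
 * is not a function from this patch.
 */
static void restore_nat_binding(struct nf_conn *ct)
{
	struct nf_conn_nat *nat = nfct_nat(ct);

	if (nat == NULL)		/* never NATed, nothing to hash */
		return;
	nf_nat_hash_conntrack(ct);	/* takes nf_nat_lock internally */
}
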
We do the +@@ -522,11 +540,11 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto) + int ret = 0; + + spin_lock_bh(&nf_nat_lock); +- if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { ++ if (ve_nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { + ret = -EBUSY; + goto out; + } +- rcu_assign_pointer(nf_nat_protos[proto->protonum], proto); ++ rcu_assign_pointer(ve_nf_nat_protos[proto->protonum], proto); + out: + spin_unlock_bh(&nf_nat_lock); + return ret; +@@ -537,7 +555,7 @@ EXPORT_SYMBOL(nf_nat_protocol_register); + void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto) + { + spin_lock_bh(&nf_nat_lock); +- rcu_assign_pointer(nf_nat_protos[proto->protonum], ++ rcu_assign_pointer(ve_nf_nat_protos[proto->protonum], + &nf_nat_unknown_protocol); + spin_unlock_bh(&nf_nat_lock); + synchronize_rcu(); +@@ -583,47 +601,62 @@ static struct nf_ct_ext_type nat_extend __read_mostly = { + .flags = NF_CT_EXT_F_PREALLOC, + }; + +-static int __init nf_nat_init(void) ++int nf_nat_init(void) + { + size_t i; + int ret; + + need_ipv4_conntrack(); + +- ret = nf_ct_extend_register(&nat_extend); +- if (ret < 0) { +- printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); +- return ret; ++ if (ve_is_super(get_exec_env())) { ++ ret = nf_ct_extend_register(&nat_extend); ++ if (ret < 0) { ++ printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); ++ return ret; ++ } + } + + /* Leave them the same for the moment. */ + nf_nat_htable_size = nf_conntrack_htable_size; + +- bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, +- &nf_nat_vmalloced); +- if (!bysource) { ++ ve_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, ++ &ve_nf_nat_vmalloced); ++ if (!ve_bysource) { + ret = -ENOMEM; + goto cleanup_extend; + } + ++#ifdef CONFIG_VE_IPTABLES ++ ve_nf_nat_protos = kcalloc(MAX_IP_NAT_PROTO, sizeof(void *), GFP_KERNEL); ++ if (!ve_nf_nat_protos) { ++ ret = -ENOMEM; ++ goto cleanup_hash; ++ } ++#endif + /* Sew in builtin protocols. 
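
/*
 * Under CONFIG_VE_IPTABLES the 256-slot NAT protocol table is no
 * longer a file-scope array but is kcalloc'd per VE in nf_nat_init()
 * above, then seeded with the builtin handlers by the loop just
 * below.  A sketch of that seeding step in isolation; the caller is
 * assumed to hold nf_nat_lock, as nf_nat_init() does, and the helper
 * name is illustrative.
 */
static void seed_nat_protos(const struct nf_nat_protocol **protos)
{
	int i;

	/* everything defaults to the unknown-protocol handler */
	for (i = 0; i < MAX_IP_NAT_PROTO; i++)
		rcu_assign_pointer(protos[i], &nf_nat_unknown_protocol);
	/* then the builtin TCP/UDP/ICMP manipulations are slotted in */
	rcu_assign_pointer(protos[IPPROTO_TCP], &nf_nat_protocol_tcp);
	rcu_assign_pointer(protos[IPPROTO_UDP], &nf_nat_protocol_udp);
	rcu_assign_pointer(protos[IPPROTO_ICMP], &nf_nat_protocol_icmp);
}
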
*/ + spin_lock_bh(&nf_nat_lock); + for (i = 0; i < MAX_IP_NAT_PROTO; i++) +- rcu_assign_pointer(nf_nat_protos[i], &nf_nat_unknown_protocol); +- rcu_assign_pointer(nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp); +- rcu_assign_pointer(nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp); +- rcu_assign_pointer(nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp); ++ rcu_assign_pointer(ve_nf_nat_protos[i], &nf_nat_unknown_protocol); ++ rcu_assign_pointer(ve_nf_nat_protos[IPPROTO_TCP], &nf_nat_protocol_tcp); ++ rcu_assign_pointer(ve_nf_nat_protos[IPPROTO_UDP], &nf_nat_protocol_udp); ++ rcu_assign_pointer(ve_nf_nat_protos[IPPROTO_ICMP], &nf_nat_protocol_icmp); + spin_unlock_bh(&nf_nat_lock); + +- /* Initialize fake conntrack so that NAT will skip it */ +- nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK; ++ if (ve_is_super(get_exec_env())) { ++ /* Initialize fake conntrack so that NAT will skip it */ ++ nf_conntrack_untracked.status |= IPS_NAT_DONE_MASK; ++ } + +- l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); ++ ve_nf_nat_l3proto = nf_ct_l3proto_find_get((u_int16_t)AF_INET); + + BUG_ON(nf_nat_seq_adjust_hook != NULL); + rcu_assign_pointer(nf_nat_seq_adjust_hook, nf_nat_seq_adjust); + return 0; + ++#ifdef CONFIG_VE_IPTABLES ++cleanup_hash: ++#endif ++ nf_ct_free_hashtable(ve_bysource, ve_nf_nat_vmalloced, nf_nat_htable_size); + cleanup_extend: + nf_ct_extend_unregister(&nat_extend); + return ret; +@@ -641,18 +674,45 @@ static int clean_nat(struct nf_conn *i, void *data) + return 0; + } + +-static void __exit nf_nat_cleanup(void) ++void nf_nat_cleanup(void) + { + nf_ct_iterate_cleanup(&clean_nat, NULL); + synchronize_rcu(); +- nf_ct_free_hashtable(bysource, nf_nat_vmalloced, nf_nat_htable_size); +- nf_ct_l3proto_put(l3proto); +- nf_ct_extend_unregister(&nat_extend); ++ nf_ct_free_hashtable(ve_bysource, ve_nf_nat_vmalloced, nf_nat_htable_size); ++ nf_ct_l3proto_put(ve_nf_nat_l3proto); ++#ifdef CONFIG_VE_IPTABLES ++ kfree(ve_nf_nat_protos); ++#endif ++ if (ve_is_super(get_exec_env())) ++ nf_ct_extend_unregister(&nat_extend); + rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); + synchronize_net(); + } + ++static int __init init(void) ++{ ++ int rv; ++ ++ rv = nf_nat_init(); ++ if (rv < 0) ++ return rv; ++ ++ KSYMRESOLVE(nf_nat_init); ++ KSYMRESOLVE(nf_nat_cleanup); ++ KSYMMODRESOLVE(nf_nat); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(nf_nat); ++ KSYMUNRESOLVE(nf_nat_cleanup); ++ KSYMUNRESOLVE(nf_nat_init); ++ ++ nf_nat_cleanup(); ++} ++ + MODULE_LICENSE("GPL"); + +-module_init(nf_nat_init); +-module_exit(nf_nat_cleanup); ++module_init(init); ++module_exit(fini); +diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c +index e8b4d0d..f301178 100644 +--- a/net/ipv4/netfilter/nf_nat_rule.c ++++ b/net/ipv4/netfilter/nf_nat_rule.c +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -33,7 +34,7 @@ static struct + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +-} nat_initial_table __initdata = { ++} nat_initial_table = { + .repl = { + .name = "nat", + .valid_hooks = NAT_VALID_HOOKS, +@@ -65,7 +66,12 @@ static struct xt_table __nat_table = { + .me = THIS_MODULE, + .af = AF_INET, + }; ++#ifdef CONFIG_VE_IPTABLES ++#define nat_table \ ++ (get_exec_env()->_nf_conntrack->_nf_nat_table) ++#else + static struct xt_table *nat_table; ++#endif + + /* Source NAT */ + static unsigned int ipt_snat_target(struct sk_buff *skb, +@@ -226,14 +232,20 @@ static struct 
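
/*
 * The init()/fini() wrappers above show OpenVZ's KSYMRESOLVE
 * convention: a module runs its setup for VE0 at load time, then
 * publishes the per-VE entry points so the container start/stop
 * code can invoke them by name without a hard symbol dependency.
 * A pattern sketch with placeholder names; my_feature_* are
 * hypothetical, not symbols from this patch.
 */
static int __init my_feature_module_init(void)
{
	int err;

	err = my_feature_init();	/* runs in VE0 context now */
	if (err)
		return err;
	KSYMRESOLVE(my_feature_init);	/* called on VE start */
	KSYMRESOLVE(my_feature_fini);	/* called on VE stop */
	KSYMMODRESOLVE(my_feature);
	return 0;
}
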
xt_target ipt_dnat_reg __read_mostly = { + .family = AF_INET, + }; + +-int __init nf_nat_rule_init(void) ++int nf_nat_rule_init(void) + { + int ret; ++ struct net *net = get_exec_env()->ve_netns; + +- nat_table = ipt_register_table(&init_net, &__nat_table, ++ nat_table = ipt_register_table(net, &__nat_table, + &nat_initial_table.repl); + if (IS_ERR(nat_table)) + return PTR_ERR(nat_table); ++ ++ ret = 0; ++ if (!ve_is_super(get_exec_env())) ++ goto done; ++ + ret = xt_register_target(&ipt_snat_reg); + if (ret != 0) + goto unregister_table; +@@ -242,19 +254,26 @@ int __init nf_nat_rule_init(void) + if (ret != 0) + goto unregister_snat; + ++done: + return ret; + + unregister_snat: + xt_unregister_target(&ipt_snat_reg); + unregister_table: + ipt_unregister_table(nat_table); ++ nat_table = NULL; + + return ret; + } + + void nf_nat_rule_cleanup(void) + { ++ if (!ve_is_super(get_exec_env())) ++ goto skip; ++ + xt_unregister_target(&ipt_dnat_reg); + xt_unregister_target(&ipt_snat_reg); ++skip: + ipt_unregister_table(nat_table); ++ nat_table = NULL; + } +diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c +index b7dd695..9aec464 100644 +--- a/net/ipv4/netfilter/nf_nat_standalone.c ++++ b/net/ipv4/netfilter/nf_nat_standalone.c +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -282,30 +283,64 @@ static struct nf_hook_ops nf_nat_ops[] __read_mostly = { + }, + }; + +-static int __init nf_nat_standalone_init(void) ++int init_nftable_nat(void) + { +- int ret = 0; ++ int ret; + +- need_ipv4_conntrack(); ++ if (!ve_is_super(get_exec_env())) ++ __module_get(THIS_MODULE); + +-#ifdef CONFIG_XFRM +- BUG_ON(ip_nat_decode_session != NULL); +- rcu_assign_pointer(ip_nat_decode_session, nat_decode_session); +-#endif + ret = nf_nat_rule_init(); + if (ret < 0) { + printk("nf_nat_init: can't setup rules.\n"); +- goto cleanup_decode_session; ++ goto out_modput; + } + ret = nf_register_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); + if (ret < 0) { + printk("nf_nat_init: can't register hooks.\n"); + goto cleanup_rule_init; + } ++ return 0; ++ ++cleanup_rule_init: ++ nf_nat_rule_cleanup(); ++out_modput: ++ if (!ve_is_super(get_exec_env())) ++ module_put(THIS_MODULE); + return ret; ++} + +- cleanup_rule_init: ++void fini_nftable_nat(void) ++{ ++ nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); + nf_nat_rule_cleanup(); ++ if (!ve_is_super(get_exec_env())) ++ module_put(THIS_MODULE); ++} ++ ++static int __init nf_nat_standalone_init(void) ++{ ++ int ret = 0; ++ ++ need_ipv4_conntrack(); ++ ++#ifdef CONFIG_XFRM ++ BUG_ON(ip_nat_decode_session != NULL); ++ rcu_assign_pointer(ip_nat_decode_session, nat_decode_session); ++#endif ++ ++ if (!ip_conntrack_disable_ve0) { ++ ret = init_nftable_nat(); ++ if (ret < 0) ++ goto cleanup_decode_session; ++ } ++ ++ KSYMRESOLVE(init_nftable_nat); ++ KSYMRESOLVE(fini_nftable_nat); ++ KSYMMODRESOLVE(iptable_nat); ++ ++ return ret; ++ + cleanup_decode_session: + #ifdef CONFIG_XFRM + rcu_assign_pointer(ip_nat_decode_session, NULL); +@@ -316,8 +351,12 @@ static int __init nf_nat_standalone_init(void) + + static void __exit nf_nat_standalone_fini(void) + { +- nf_unregister_hooks(nf_nat_ops, ARRAY_SIZE(nf_nat_ops)); +- nf_nat_rule_cleanup(); ++ KSYMMODUNRESOLVE(iptable_nat); ++ KSYMUNRESOLVE(init_nftable_nat); ++ KSYMUNRESOLVE(fini_nftable_nat); ++ ++ if (!ip_conntrack_disable_ve0) ++ fini_nftable_nat(); + #ifdef CONFIG_XFRM + rcu_assign_pointer(ip_nat_decode_session, NULL); + synchronize_net(); +diff --git 
a/net/ipv4/proc.c b/net/ipv4/proc.c +index 552169b..bf8e34e 100644 +--- a/net/ipv4/proc.c ++++ b/net/ipv4/proc.c +@@ -53,6 +53,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) + { + struct net *net = seq->private; + ++ if (!ve_is_super(get_exec_env())) ++ return 0; ++ + socket_seq_show(seq); + seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", + sock_prot_inuse_get(net, &tcp_prot), +@@ -272,7 +275,7 @@ static void icmpmsg_put(struct seq_file *seq) + count = 0; + for (i = 0; i < ICMPMSG_MIB_MAX; i++) { + +- if (snmp_fold_field((void **) icmpmsg_statistics, i)) ++ if (snmp_fold_field((void **) ve_icmpmsg_statistics, i)) + out[count++] = i; + if (count < PERLINE) + continue; +@@ -284,7 +287,7 @@ static void icmpmsg_put(struct seq_file *seq) + seq_printf(seq, "\nIcmpMsg: "); + for (j = 0; j < PERLINE; ++j) + seq_printf(seq, " %lu", +- snmp_fold_field((void **) icmpmsg_statistics, ++ snmp_fold_field((void **) ve_icmpmsg_statistics, + out[j])); + seq_putc(seq, '\n'); + } +@@ -296,7 +299,7 @@ static void icmpmsg_put(struct seq_file *seq) + seq_printf(seq, "\nIcmpMsg:"); + for (j = 0; j < count; ++j) + seq_printf(seq, " %lu", snmp_fold_field((void **) +- icmpmsg_statistics, out[j])); ++ ve_icmpmsg_statistics, out[j])); + } + + #undef PERLINE +@@ -313,18 +316,18 @@ static void icmp_put(struct seq_file *seq) + for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, " Out%s", icmpmibmap[i].name); + seq_printf(seq, "\nIcmp: %lu %lu", +- snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INMSGS), +- snmp_fold_field((void **) icmp_statistics, ICMP_MIB_INERRORS)); ++ snmp_fold_field((void **) ve_icmp_statistics, ICMP_MIB_INMSGS), ++ snmp_fold_field((void **) ve_icmp_statistics, ICMP_MIB_INERRORS)); + for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, " %lu", +- snmp_fold_field((void **) icmpmsg_statistics, ++ snmp_fold_field((void **) ve_icmpmsg_statistics, + icmpmibmap[i].index)); + seq_printf(seq, " %lu %lu", +- snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTMSGS), +- snmp_fold_field((void **) icmp_statistics, ICMP_MIB_OUTERRORS)); ++ snmp_fold_field((void **) ve_icmp_statistics, ICMP_MIB_OUTMSGS), ++ snmp_fold_field((void **) ve_icmp_statistics, ICMP_MIB_OUTERRORS)); + for (i=0; icmpmibmap[i].name != NULL; i++) + seq_printf(seq, " %lu", +- snmp_fold_field((void **) icmpmsg_statistics, ++ snmp_fold_field((void **) ve_icmpmsg_statistics, + icmpmibmap[i].index | 0x100)); + } + +@@ -346,7 +349,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) + + for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) + seq_printf(seq, " %lu", +- snmp_fold_field((void **)ip_statistics, ++ snmp_fold_field((void **)ve_ip_statistics, + snmp4_ipstats_list[i].entry)); + + icmp_put(seq); /* RFC 2011 compatibility */ +@@ -361,11 +364,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v) + /* MaxConn field is signed, RFC 2012 */ + if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) + seq_printf(seq, " %ld", +- snmp_fold_field((void **)tcp_statistics, ++ snmp_fold_field((void **)ve_tcp_statistics, + snmp4_tcp_list[i].entry)); + else + seq_printf(seq, " %lu", +- snmp_fold_field((void **)tcp_statistics, ++ snmp_fold_field((void **)ve_tcp_statistics, + snmp4_tcp_list[i].entry)); + } + +@@ -376,7 +379,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) + seq_puts(seq, "\nUdp:"); + for (i = 0; snmp4_udp_list[i].name != NULL; i++) + seq_printf(seq, " %lu", +- snmp_fold_field((void **)udp_statistics, ++ snmp_fold_field((void **)ve_udp_statistics, + 
snmp4_udp_list[i].entry)); + + /* the UDP and UDP-Lite MIBs are the same */ +@@ -387,7 +390,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) + seq_puts(seq, "\nUdpLite:"); + for (i = 0; snmp4_udp_list[i].name != NULL; i++) + seq_printf(seq, " %lu", +- snmp_fold_field((void **)udplite_statistics, ++ snmp_fold_field((void **)ve_udplite_statistics, + snmp4_udp_list[i].entry)); + + seq_putc(seq, '\n'); +@@ -423,7 +426,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) + seq_puts(seq, "\nTcpExt:"); + for (i = 0; snmp4_net_list[i].name != NULL; i++) + seq_printf(seq, " %lu", +- snmp_fold_field((void **)net_statistics, ++ snmp_fold_field((void **)ve_net_statistics, + snmp4_net_list[i].entry)); + + seq_puts(seq, "\nIpExt:"); +@@ -433,7 +436,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) + seq_puts(seq, "\nIpExt:"); + for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) + seq_printf(seq, " %lu", +- snmp_fold_field((void **)ip_statistics, ++ snmp_fold_field((void **)ve_ip_statistics, + snmp4_ipextstats_list[i].entry)); + + seq_putc(seq, '\n'); +@@ -456,13 +459,26 @@ static const struct file_operations netstat_seq_fops = { + static __net_init int ip_proc_init_net(struct net *net) + { + if (!proc_net_fops_create(net, "sockstat", S_IRUGO, &sockstat_seq_fops)) +- return -ENOMEM; ++ goto out; ++ if (!proc_net_fops_create(net, "netstat", S_IRUGO, &netstat_seq_fops)) ++ goto out_netstat; ++ if (!proc_net_fops_create(net, "snmp", S_IRUGO, &snmp_seq_fops)) ++ goto out_snmp; + return 0; ++ ++out_snmp: ++ proc_net_remove(net, "netstat"); ++out_netstat: ++ proc_net_remove(net, "sockstat"); ++out: ++ return -ENOMEM; + } + + static __net_exit void ip_proc_exit_net(struct net *net) + { + proc_net_remove(net, "sockstat"); ++ proc_net_remove(net, "netstat"); ++ proc_net_remove(net, "snmp"); + } + + static __net_initdata struct pernet_operations ip_proc_ops = { +@@ -472,24 +488,6 @@ static __net_initdata struct pernet_operations ip_proc_ops = { + + int __init ip_misc_proc_init(void) + { +- int rc = 0; +- +- if (register_pernet_subsys(&ip_proc_ops)) +- goto out_pernet; +- +- if (!proc_net_fops_create(&init_net, "netstat", S_IRUGO, &netstat_seq_fops)) +- goto out_netstat; +- +- if (!proc_net_fops_create(&init_net, "snmp", S_IRUGO, &snmp_seq_fops)) +- goto out_snmp; +-out: +- return rc; +-out_snmp: +- proc_net_remove(&init_net, "netstat"); +-out_netstat: +- unregister_pernet_subsys(&ip_proc_ops); +-out_pernet: +- rc = -ENOMEM; +- goto out; ++ return register_pernet_subsys(&ip_proc_ops); + } + +diff --git a/net/ipv4/route.c b/net/ipv4/route.c +index 96be336..d032f59 100644 +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@ -71,6 +71,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -117,6 +118,7 @@ + + #define RT_GC_TIMEOUT (300*HZ) + ++int ip_rt_src_check = 1; + static int ip_rt_max_size; + static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; + static int ip_rt_gc_interval __read_mostly = 60 * HZ; +@@ -134,7 +136,6 @@ static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; + + static void rt_worker_func(struct work_struct *work); + static DECLARE_DELAYED_WORK(expires_work, rt_worker_func); +-static struct timer_list rt_secret_timer; + + /* + * Interface to generic destination cache. 
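
/*
 * From here on, proc.c reads every MIB through ve_* wrappers
 * (ve_ip_statistics, ve_icmp_statistics, ve_tcp_statistics, ...) so
 * each container's /proc/net/snmp and /proc/net/netstat report that
 * container's counters, and the files themselves move into
 * ip_proc_ops so every namespace gets its own entries.  The wrapper
 * definitions live elsewhere in this patch; the sketch below is an
 * assumed shape, and the _ip_statistics field name is a guess, not
 * taken from these hunks.
 */
#ifdef CONFIG_VE
#define ve_ip_statistics	(get_exec_env()->_ip_statistics)
#else
/* without CONFIG_VE the readers above compile against the global */
#define ve_ip_statistics	ip_statistics
#endif
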
+@@ -253,20 +254,41 @@ static inline void rt_hash_lock_init(void) + static struct rt_hash_bucket *rt_hash_table __read_mostly; + static unsigned rt_hash_mask __read_mostly; + static unsigned int rt_hash_log __read_mostly; +-static atomic_t rt_genid __read_mostly; + + static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); + #define RT_CACHE_STAT_INC(field) \ + (__raw_get_cpu_var(rt_cache_stat).field++) + +-static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx) ++static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, ++ int genid) + { + return jhash_3words((__force u32)(__be32)(daddr), + (__force u32)(__be32)(saddr), +- idx, atomic_read(&rt_genid)) ++ idx, genid) + & rt_hash_mask; + } + ++void prepare_rt_cache(void) ++{ ++#ifdef CONFIG_VE ++ struct rtable *r; ++ int i; ++ ++ for (i = rt_hash_mask; i >= 0; i--) { ++ spin_lock_bh(rt_hash_lock_addr(i)); ++ for (r = rt_hash_table[i].chain; r; r = r->u.dst.rt_next) { ++ r->fl.owner_env = get_ve0(); ++ } ++ spin_unlock_bh(rt_hash_lock_addr(i)); ++ } ++#endif ++} ++ ++static inline int rt_genid(struct net *net) ++{ ++ return atomic_read(&net->ipv4.rt_genid); ++} ++ + #ifdef CONFIG_PROC_FS + struct rt_cache_iter_state { + struct seq_net_private p; +@@ -336,7 +358,7 @@ static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) + struct rt_cache_iter_state *st = seq->private; + if (*pos) + return rt_cache_get_idx(seq, *pos - 1); +- st->genid = atomic_read(&rt_genid); ++ st->genid = rt_genid(seq_file_net(seq)); + return SEQ_START_TOKEN; + } + +@@ -683,6 +705,11 @@ static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) + return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev); + } + ++static inline int rt_is_expired(struct rtable *rth) ++{ ++ return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev)); ++} ++ + /* + * Perform a full scan of hash table and free all entries. + * Can be called by a softirq or a process. +@@ -692,6 +719,7 @@ static void rt_do_flush(int process_context) + { + unsigned int i; + struct rtable *rth, *next; ++ struct rtable *tail; + + for (i = 0; i <= rt_hash_mask; i++) { + if (process_context && need_resched()) +@@ -701,11 +729,40 @@ static void rt_do_flush(int process_context) + continue; + + spin_lock_bh(rt_hash_lock_addr(i)); ++#ifdef CONFIG_NET_NS ++ { ++ struct rtable ** prev, * p; ++ ++ rth = rt_hash_table[i].chain; ++ ++ /* defer releasing the head of the list after spin_unlock */ ++ for (tail = rth; tail; tail = tail->u.dst.rt_next) ++ if (!rt_is_expired(tail)) ++ break; ++ if (rth != tail) ++ rt_hash_table[i].chain = tail; ++ ++ /* call rt_free on entries after the tail requiring flush */ ++ prev = &rt_hash_table[i].chain; ++ for (p = *prev; p; p = next) { ++ next = p->u.dst.rt_next; ++ if (!rt_is_expired(p)) { ++ prev = &p->u.dst.rt_next; ++ } else { ++ *prev = next; ++ rt_free(p); ++ } ++ } ++ } ++#else + rth = rt_hash_table[i].chain; + rt_hash_table[i].chain = NULL; ++ tail = NULL; ++ ++#endif + spin_unlock_bh(rt_hash_lock_addr(i)); + +- for (; rth; rth = next) { ++ for (; rth != tail; rth = next) { + next = rth->u.dst.rt_next; + rt_free(rth); + } +@@ -738,7 +795,7 @@ static void rt_check_expire(void) + continue; + spin_lock_bh(rt_hash_lock_addr(i)); + while ((rth = *rthp) != NULL) { +- if (rth->rt_genid != atomic_read(&rt_genid)) { ++ if (rt_is_expired(rth)) { + *rthp = rth->u.dst.rt_next; + rt_free(rth); + continue; +@@ -781,21 +838,21 @@ static void rt_worker_func(struct work_struct *work) + * many times (2^24) without giving recent rt_genid. 
+ * Jenkins hash is strong enough that litle changes of rt_genid are OK. + */ +-static void rt_cache_invalidate(void) ++static void rt_cache_invalidate(struct net *net) + { + unsigned char shuffle; + + get_random_bytes(&shuffle, sizeof(shuffle)); +- atomic_add(shuffle + 1U, &rt_genid); ++ atomic_add(shuffle + 1U, &net->ipv4.rt_genid); + } + + /* + * delay < 0 : invalidate cache (fast : entries will be deleted later) + * delay >= 0 : invalidate & flush cache (can be long) + */ +-void rt_cache_flush(int delay) ++void rt_cache_flush(struct net *net, int delay) + { +- rt_cache_invalidate(); ++ rt_cache_invalidate(net); + if (delay >= 0) + rt_do_flush(!in_softirq()); + } +@@ -803,10 +860,12 @@ void rt_cache_flush(int delay) + /* + * We change rt_genid and let gc do the cleanup + */ +-static void rt_secret_rebuild(unsigned long dummy) ++static void rt_secret_rebuild(unsigned long __net) + { +- rt_cache_invalidate(); +- mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval); ++ struct net *net = (struct net *)__net; ++ ++ rt_cache_invalidate(net); ++ mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); + } + + /* +@@ -882,7 +941,7 @@ static int rt_garbage_collect(struct dst_ops *ops) + rthp = &rt_hash_table[k].chain; + spin_lock_bh(rt_hash_lock_addr(k)); + while ((rth = *rthp) != NULL) { +- if (rth->rt_genid == atomic_read(&rt_genid) && ++ if (!rt_is_expired(rth) && + !rt_may_expire(rth, tmo, expire)) { + tmo >>= 1; + rthp = &rth->u.dst.rt_next; +@@ -964,7 +1023,7 @@ restart: + + spin_lock_bh(rt_hash_lock_addr(hash)); + while ((rth = *rthp) != NULL) { +- if (rth->rt_genid != atomic_read(&rt_genid)) { ++ if (rt_is_expired(rth)) { + *rthp = rth->u.dst.rt_next; + rt_free(rth); + continue; +@@ -1140,7 +1199,7 @@ static void rt_del(unsigned hash, struct rtable *rt) + spin_lock_bh(rt_hash_lock_addr(hash)); + ip_rt_put(rt); + while ((aux = *rthp) != NULL) { +- if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) { ++ if (aux == rt || rt_is_expired(aux)) { + *rthp = aux->u.dst.rt_next; + rt_free(aux); + continue; +@@ -1182,7 +1241,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, + + for (i = 0; i < 2; i++) { + for (k = 0; k < 2; k++) { +- unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]); ++ unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], ++ rt_genid(net)); + + rthp=&rt_hash_table[hash].chain; + +@@ -1194,7 +1254,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, + rth->fl.fl4_src != skeys[i] || + rth->fl.oif != ikeys[k] || + rth->fl.iif != 0 || +- rth->rt_genid != atomic_read(&rt_genid) || ++ rt_is_expired(rth) || + !net_eq(dev_net(rth->u.dst.dev), net)) { + rthp = &rth->u.dst.rt_next; + continue; +@@ -1233,7 +1293,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, + rt->u.dst.neighbour = NULL; + rt->u.dst.hh = NULL; + rt->u.dst.xfrm = NULL; +- rt->rt_genid = atomic_read(&rt_genid); ++ rt->rt_genid = rt_genid(net); ++#ifdef CONFIG_VE ++ rt->fl.owner_env = get_exec_env(); ++#endif + rt->rt_flags |= RTCF_REDIRECTED; + + /* Gateway is different ... 
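
/*
 * The generation-counter scheme above replaces the global rt_genid
 * with a per-namespace net->ipv4.rt_genid: a flush just bumps the
 * counter by a random 1..256 step (the random stride keeps periodic
 * rebuilds from wrapping back onto a recent value), and entries from
 * older generations are lazily reaped wherever a chain is walked.
 * A sketch of that reaping walk, the same shape rt_check_expire()
 * uses above; it assumes the caller holds the chain's hash lock,
 * with rt_free() and the rtable layout as in this file.
 */
static void prune_expired(struct rtable **rthp)
{
	struct rtable *rth;

	while ((rth = *rthp) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->u.dst.rt_next;	/* unlink */
			rt_free(rth);		/* RCU-deferred free */
			continue;
		}
		rthp = &rth->u.dst.rt_next;
	}
}
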
*/ +@@ -1297,7 +1360,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) + } else if ((rt->rt_flags & RTCF_REDIRECTED) || + rt->u.dst.expires) { + unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, +- rt->fl.oif); ++ rt->fl.oif, ++ rt_genid(dev_net(dst->dev))); + #if RT_CACHE_DEBUG >= 1 + printk(KERN_DEBUG "ipv4_negative_advice: redirect to " + NIPQUAD_FMT "/%02x dropped\n", +@@ -1446,7 +1510,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, + + for (k = 0; k < 2; k++) { + for (i = 0; i < 2; i++) { +- unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]); ++ unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], ++ rt_genid(net)); + + rcu_read_lock(); + for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; +@@ -1461,7 +1526,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, + rth->fl.iif != 0 || + dst_metric_locked(&rth->u.dst, RTAX_MTU) || + !net_eq(dev_net(rth->u.dst.dev), net) || +- rth->rt_genid != atomic_read(&rt_genid)) ++ !rt_is_expired(rth)) + continue; + + if (new_mtu < 68 || new_mtu >= old_mtu) { +@@ -1688,15 +1753,18 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + #ifdef CONFIG_NET_CLS_ROUTE + rth->u.dst.tclassid = itag; + #endif ++#ifdef CONFIG_VE ++ rth->fl.owner_env = get_exec_env(); ++#endif + rth->rt_iif = + rth->fl.iif = dev->ifindex; +- rth->u.dst.dev = init_net.loopback_dev; ++ rth->u.dst.dev = get_exec_env()->ve_netns->loopback_dev; + dev_hold(rth->u.dst.dev); + rth->idev = in_dev_get(rth->u.dst.dev); + rth->fl.oif = 0; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; +- rth->rt_genid = atomic_read(&rt_genid); ++ rth->rt_genid = rt_genid(dev_net(dev)); + rth->rt_flags = RTCF_MULTICAST; + rth->rt_type = RTN_MULTICAST; + if (our) { +@@ -1711,7 +1779,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + RT_CACHE_STAT_INC(in_slow_mc); + + in_dev_put(in_dev); +- hash = rt_hash(daddr, saddr, dev->ifindex); ++ hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); + return rt_intern_hash(hash, rth, &skb->rtable); + + e_nobufs: +@@ -1827,6 +1895,9 @@ static int __mkroute_input(struct sk_buff *skb, + rth->fl.fl4_src = saddr; + rth->rt_src = saddr; + rth->rt_gateway = daddr; ++#ifdef CONFIG_VE ++ rth->fl.owner_env = get_exec_env(); ++#endif + rth->rt_iif = + rth->fl.iif = in_dev->dev->ifindex; + rth->u.dst.dev = (out_dev)->dev; +@@ -1837,7 +1908,7 @@ static int __mkroute_input(struct sk_buff *skb, + + rth->u.dst.input = ip_forward; + rth->u.dst.output = ip_output; +- rth->rt_genid = atomic_read(&rt_genid); ++ rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev)); + + rt_set_nexthop(rth, res, itag); + +@@ -1872,7 +1943,8 @@ static int ip_mkroute_input(struct sk_buff *skb, + return err; + + /* put it into the cache */ +- hash = rt_hash(daddr, saddr, fl->iif); ++ hash = rt_hash(daddr, saddr, fl->iif, ++ rt_genid(dev_net(rth->u.dst.dev))); + return rt_intern_hash(hash, rth, &skb->rtable); + } + +@@ -1998,7 +2070,7 @@ local_input: + goto e_nobufs; + + rth->u.dst.output= ip_rt_bug; +- rth->rt_genid = atomic_read(&rt_genid); ++ rth->rt_genid = rt_genid(net); + + atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; +@@ -2020,6 +2092,9 @@ local_input: + rth->idev = in_dev_get(rth->u.dst.dev); + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; ++#ifdef CONFIG_VE ++ rth->fl.owner_env = get_exec_env(); ++#endif + rth->u.dst.input= ip_local_deliver; + rth->rt_flags = flags|RTCF_LOCAL; + if (res.type == RTN_UNREACHABLE) 
{ +@@ -2028,7 +2103,7 @@ local_input: + rth->rt_flags &= ~RTCF_LOCAL; + } + rth->rt_type = res.type; +- hash = rt_hash(daddr, saddr, fl.iif); ++ hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); + err = rt_intern_hash(hash, rth, &skb->rtable); + goto done; + +@@ -2079,7 +2154,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, + + net = dev_net(dev); + tos &= IPTOS_RT_MASK; +- hash = rt_hash(daddr, saddr, iif); ++ hash = rt_hash(daddr, saddr, iif, rt_genid(net)); + + rcu_read_lock(); + for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; +@@ -2091,7 +2166,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, + (rth->fl.fl4_tos ^ tos)) == 0 && + rth->fl.mark == skb->mark && + net_eq(dev_net(rth->u.dst.dev), net) && +- rth->rt_genid == atomic_read(&rt_genid)) { ++ !rt_is_expired(rth)) { + dst_use(&rth->u.dst, jiffies); + RT_CACHE_STAT_INC(in_hit); + rcu_read_unlock(); +@@ -2209,6 +2284,9 @@ static int __mkroute_output(struct rtable **result, + rth->fl.mark = oldflp->mark; + rth->rt_dst = fl->fl4_dst; + rth->rt_src = fl->fl4_src; ++#ifdef CONFIG_VE ++ rth->fl.owner_env = get_exec_env(); ++#endif + rth->rt_iif = oldflp->oif ? : dev_out->ifindex; + /* get references to the devices that are to be hold by the routing + cache entry */ +@@ -2219,7 +2297,7 @@ static int __mkroute_output(struct rtable **result, + rth->rt_spec_dst= fl->fl4_src; + + rth->u.dst.output=ip_output; +- rth->rt_genid = atomic_read(&rt_genid); ++ rth->rt_genid = rt_genid(dev_net(dev_out)); + + RT_CACHE_STAT_INC(out_slow_tot); + +@@ -2268,7 +2346,8 @@ static int ip_mkroute_output(struct rtable **rp, + int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); + unsigned hash; + if (err == 0) { +- hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif); ++ hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, ++ rt_genid(dev_net(dev_out))); + err = rt_intern_hash(hash, rth, rp); + } + +@@ -2313,10 +2392,13 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, + ipv4_is_zeronet(oldflp->fl4_src)) + goto out; + +- /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ +- dev_out = ip_dev_find(net, oldflp->fl4_src); +- if (dev_out == NULL) +- goto out; ++ if (ip_rt_src_check) { ++ /* It is equivalent to ++ inet_addr_type(saddr) == RTN_LOCAL */ ++ dev_out = ip_dev_find(net, oldflp->fl4_src); ++ if (dev_out == NULL) ++ goto out; ++ } + + /* I removed check for oif == dev_out->oif here. + It was wrong for two reasons: +@@ -2344,6 +2426,12 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, + Luckily, this hack is good workaround. 
+ */ + ++ if (dev_out == NULL) { ++ dev_out = ip_dev_find(net, oldflp->fl4_src); ++ if (dev_out == NULL) ++ goto out; ++ } ++ + fl.oif = dev_out->ifindex; + goto make_route; + } +@@ -2480,7 +2568,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, + unsigned hash; + struct rtable *rth; + +- hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif); ++ hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); + + rcu_read_lock_bh(); + for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; +@@ -2493,7 +2581,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, + !((rth->fl.fl4_tos ^ flp->fl4_tos) & + (IPTOS_RT_MASK | RTO_ONLINK)) && + net_eq(dev_net(rth->u.dst.dev), net) && +- rth->rt_genid == atomic_read(&rt_genid)) { ++ !rt_is_expired(rth)) { + dst_use(&rth->u.dst, jiffies); + RT_CACHE_STAT_INC(out_hit); + rcu_read_unlock_bh(); +@@ -2524,7 +2612,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { + }; + + +-static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp) ++static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp) + { + struct rtable *ort = *rp; + struct rtable *rt = (struct rtable *) +@@ -2548,7 +2636,7 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp) + rt->idev = ort->idev; + if (rt->idev) + in_dev_hold(rt->idev); +- rt->rt_genid = atomic_read(&rt_genid); ++ rt->rt_genid = rt_genid(net); + rt->rt_flags = ort->rt_flags; + rt->rt_type = ort->rt_type; + rt->rt_dst = ort->rt_dst; +@@ -2584,7 +2672,7 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, + err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, + flags ? XFRM_LOOKUP_WAIT : 0); + if (err == -EREMOTE) +- err = ipv4_dst_blackhole(rp, flp); ++ err = ipv4_dst_blackhole(net, rp, flp); + + return err; + } +@@ -2803,7 +2891,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) + rt = rcu_dereference(rt->u.dst.rt_next), idx++) { + if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx) + continue; +- if (rt->rt_genid != atomic_read(&rt_genid)) ++ if (rt_is_expired(rt)) + continue; + skb->dst = dst_clone(&rt->u.dst); + if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, +@@ -2827,19 +2915,29 @@ done: + + void ip_rt_multicast_event(struct in_device *in_dev) + { +- rt_cache_flush(0); ++ rt_cache_flush(dev_net(in_dev->dev), 0); + } + + #ifdef CONFIG_SYSCTL +-static int flush_delay; ++#warning "Rework this shit via ro net sysctls" + + static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) + { + if (write) { ++ int flush_delay; ++ static DEFINE_MUTEX(flush_mutex); ++ struct net *net; ++ ++ mutex_lock(&flush_mutex); ++ ctl->data = &flush_delay; + proc_dointvec(ctl, write, filp, buffer, lenp, ppos); +- rt_cache_flush(flush_delay); ++ ctl->data = NULL; ++ mutex_unlock(&flush_mutex); ++ ++ net = (struct net *)ctl->extra1; ++ rt_cache_flush(net, flush_delay); + return 0; + } + +@@ -2855,25 +2953,18 @@ static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, + size_t newlen) + { + int delay; ++ struct net *net; + if (newlen != sizeof(int)) + return -EINVAL; + if (get_user(delay, (int __user *)newval)) + return -EFAULT; +- rt_cache_flush(delay); ++ net = (struct net *)table->extra1; ++ rt_cache_flush(net, delay); + return 0; + } + + ctl_table ipv4_route_table[] = { + { +- .ctl_name = NET_IPV4_ROUTE_FLUSH, +- .procname = "flush", +- .data = &flush_delay, +- .maxlen = sizeof(int), +- .mode = 0200, +- 
.proc_handler = &ipv4_sysctl_rtcache_flush, +- .strategy = &ipv4_sysctl_rtcache_flush_strategy, +- }, +- { + .ctl_name = NET_IPV4_ROUTE_GC_THRESH, + .procname = "gc_thresh", + .data = &ipv4_dst_ops.gc_thresh, +@@ -3011,8 +3102,97 @@ ctl_table ipv4_route_table[] = { + }, + { .ctl_name = 0 } + }; ++ ++static __net_initdata struct ctl_path ipv4_route_path[] = { ++ { .procname = "net", .ctl_name = CTL_NET, }, ++ { .procname = "ipv4", .ctl_name = NET_IPV4, }, ++ { .procname = "route", .ctl_name = NET_IPV4_ROUTE, }, ++ { }, ++}; ++ ++ ++static struct ctl_table ipv4_route_flush_table[] = { ++ { ++ .ctl_name = NET_IPV4_ROUTE_FLUSH, ++ .procname = "flush", ++ .maxlen = sizeof(int), ++ .mode = 0200, ++ .proc_handler = &ipv4_sysctl_rtcache_flush, ++ .strategy = &ipv4_sysctl_rtcache_flush_strategy, ++ }, ++ { .ctl_name = 0 }, ++}; ++ ++static __net_init int sysctl_route_net_init(struct net *net) ++{ ++ struct ctl_table *tbl; ++ ++ tbl = ipv4_route_flush_table; ++ if (net != &init_net) { ++ tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); ++ if (tbl == NULL) ++ goto err_dup; ++ } ++ tbl[0].extra1 = net; ++ ++ net->ipv4.route_hdr = ++ register_net_sysctl_table(net, ipv4_route_path, tbl); ++ if (net->ipv4.route_hdr == NULL) ++ goto err_reg; ++ return 0; ++ ++err_reg: ++ if (tbl != ipv4_route_flush_table) ++ kfree(tbl); ++err_dup: ++ return -ENOMEM; ++} ++ ++static __net_exit void sysctl_route_net_exit(struct net *net) ++{ ++ struct ctl_table *tbl; ++ ++ tbl = net->ipv4.route_hdr->ctl_table_arg; ++ unregister_net_sysctl_table(net->ipv4.route_hdr); ++ BUG_ON(tbl == ipv4_route_flush_table); ++ kfree(tbl); ++} ++ ++static __net_initdata struct pernet_operations sysctl_route_ops = { ++ .init = sysctl_route_net_init, ++ .exit = sysctl_route_net_exit, ++}; + #endif + ++ ++static __net_init int rt_secret_timer_init(struct net *net) ++{ ++ atomic_set(&net->ipv4.rt_genid, ++ (int) ((num_physpages ^ (num_physpages>>8)) ^ ++ (jiffies ^ (jiffies >> 7)))); ++ ++ net->ipv4.rt_secret_timer.function = rt_secret_rebuild; ++ net->ipv4.rt_secret_timer.data = (unsigned long)net; ++ init_timer_deferrable(&net->ipv4.rt_secret_timer); ++ ++ net->ipv4.rt_secret_timer.expires = ++ jiffies + net_random() % ip_rt_secret_interval + ++ ip_rt_secret_interval; ++ add_timer(&net->ipv4.rt_secret_timer); ++ return 0; ++} ++ ++static __net_exit void rt_secret_timer_exit(struct net *net) ++{ ++ del_timer_sync(&net->ipv4.rt_secret_timer); ++} ++ ++static __net_initdata struct pernet_operations rt_secret_timer_ops = { ++ .init = rt_secret_timer_init, ++ .exit = rt_secret_timer_exit, ++}; ++ ++ + #ifdef CONFIG_NET_CLS_ROUTE + struct ip_rt_acct *ip_rt_acct __read_mostly; + #endif /* CONFIG_NET_CLS_ROUTE */ +@@ -3031,9 +3211,6 @@ int __init ip_rt_init(void) + { + int rc = 0; + +- atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^ +- (jiffies ^ (jiffies >> 7)))); +- + #ifdef CONFIG_NET_CLS_ROUTE + ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct)); + if (!ip_rt_acct) +@@ -3065,19 +3242,14 @@ int __init ip_rt_init(void) + devinet_init(); + ip_fib_init(); + +- rt_secret_timer.function = rt_secret_rebuild; +- rt_secret_timer.data = 0; +- init_timer_deferrable(&rt_secret_timer); +- + /* All the timers, started at system startup tend + to synchronize. Perturb it a bit. 
+ */ + schedule_delayed_work(&expires_work, + net_random() % ip_rt_gc_interval + ip_rt_gc_interval); + +- rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval + +- ip_rt_secret_interval; +- add_timer(&rt_secret_timer); ++ if (register_pernet_subsys(&rt_secret_timer_ops)) ++ printk(KERN_ERR "Unable to setup rt_secret_timer\n"); + + if (ip_rt_proc_init()) + printk(KERN_ERR "Unable to create route proc files\n"); +@@ -3087,6 +3259,9 @@ int __init ip_rt_init(void) + #endif + rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); + ++#ifdef CONFIG_SYSCTL ++ register_pernet_subsys(&sysctl_route_ops); ++#endif + return rc; + } + +diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c +index c437f80..e69d5ee 100644 +--- a/net/ipv4/sysctl_net_ipv4.c ++++ b/net/ipv4/sysctl_net_ipv4.c +@@ -28,6 +28,9 @@ static int tcp_retr1_max = 255; + static int ip_local_port_range_min[] = { 1, 1 }; + static int ip_local_port_range_max[] = { 65535, 65535 }; + ++int sysctl_tcp_use_sg = 1; ++EXPORT_SYMBOL(sysctl_tcp_use_sg); ++ + extern seqlock_t sysctl_port_range_lock; + extern int sysctl_local_port_range[2]; + +@@ -419,6 +422,13 @@ static struct ctl_table ipv4_table[] = { + .mode = 0644, + .proc_handler = &proc_dointvec + }, ++ { ++ .procname = "tcp_use_sg", ++ .data = &sysctl_tcp_use_sg, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, + + #endif + { +@@ -586,6 +596,20 @@ static struct ctl_table ipv4_table[] = { + .proc_handler = &proc_dointvec + }, + { ++ .procname = "tcp_max_tw_kmem_fraction", ++ .data = &sysctl_tcp_max_tw_kmem_fraction, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++ { ++ .procname = "tcp_max_tw_buckets_ub", ++ .data = &sysctl_tcp_max_tw_buckets_ub, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++ { + .ctl_name = NET_TCP_NO_METRICS_SAVE, + .procname = "tcp_no_metrics_save", + .data = &sysctl_tcp_nometrics_save, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 1d723de..56f3de7 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -274,6 +274,10 @@ + #include + #include + ++#include ++#include ++#include ++ + #include + #include + +@@ -340,6 +344,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) + unsigned int mask; + struct sock *sk = sock->sk; + struct tcp_sock *tp = tcp_sk(sk); ++ int check_send_space; + + poll_wait(file, sk->sk_sleep, wait); + if (sk->sk_state == TCP_LISTEN) +@@ -354,6 +359,21 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) + if (sk->sk_err) + mask = POLLERR; + ++ check_send_space = 1; ++#ifdef CONFIG_BEANCOUNTERS ++ if (!(sk->sk_shutdown & SEND_SHUTDOWN) && sock_has_ubc(sk)) { ++ unsigned long size; ++ size = MAX_TCP_HEADER + tp->mss_cache; ++ if (size > SOCK_MIN_UBCSPACE) ++ size = SOCK_MIN_UBCSPACE; ++ size = skb_charge_size(size); ++ if (ub_sock_makewres_tcp(sk, size)) { ++ check_send_space = 0; ++ ub_sock_sndqueueadd_tcp(sk, size); ++ } ++ } ++#endif ++ + /* + * POLLHUP is certainly not done right. 
But poll() doesn't + * have a notion of HUP in just one direction, and for a +@@ -397,7 +417,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) + sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data)) + mask |= POLLIN | POLLRDNORM; + +- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { ++ if (check_send_space && !(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + mask |= POLLOUT | POLLWRNORM; + } else { /* send SIGIO later */ +@@ -641,7 +661,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) + + skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); + if (skb) { +- if (sk_wmem_schedule(sk, skb->truesize)) { ++ if (sk_wmem_schedule(sk, skb->truesize, skb)) { + /* + * Make sure that we have exactly size bytes + * available to the caller, no more, no less. +@@ -687,15 +707,22 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse + int copy, i, can_coalesce; + int offset = poffset % PAGE_SIZE; + int size = min_t(size_t, psize, PAGE_SIZE - offset); ++ unsigned long chargesize = 0; + + if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { + new_segment: ++ chargesize = 0; + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + ++ chargesize = skb_charge_size(MAX_TCP_HEADER + ++ tp->mss_cache); ++ if (ub_sock_getwres_tcp(sk, chargesize) < 0) ++ goto wait_for_ubspace; + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); + if (!skb) + goto wait_for_memory; ++ ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); + + skb_entail(sk, skb); + copy = size_goal; +@@ -710,7 +737,7 @@ new_segment: + tcp_mark_push(tp, skb); + goto new_segment; + } +- if (!sk_wmem_schedule(sk, copy)) ++ if (!sk_wmem_schedule(sk, copy, skb)) + goto wait_for_memory; + + if (can_coalesce) { +@@ -751,10 +778,15 @@ new_segment: + wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + wait_for_memory: ++ ub_sock_retwres_tcp(sk, chargesize, ++ skb_charge_size(MAX_TCP_HEADER + tp->mss_cache)); ++ chargesize = 0; ++wait_for_ubspace: + if (copied) + tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + +- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) ++ err = __sk_stream_wait_memory(sk, &timeo, chargesize); ++ if (err != 0) + goto do_error; + + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); +@@ -791,12 +823,8 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, + return res; + } + +-#define TCP_PAGE(sk) (sk->sk_sndmsg_page) +-#define TCP_OFF(sk) (sk->sk_sndmsg_off) +- +-static inline int select_size(struct sock *sk) ++static inline int select_size(struct sock *sk, struct tcp_sock *tp) + { +- struct tcp_sock *tp = tcp_sk(sk); + int tmp = tp->mss_cache; + + if (sk->sk_route_caps & NETIF_F_SG) { +@@ -855,6 +883,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + while (--iovlen >= 0) { + int seglen = iov->iov_len; + unsigned char __user *from = iov->iov_base; ++ unsigned long chargesize = 0; + + iov++; + +@@ -865,18 +894,27 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + + if (!tcp_send_head(sk) || + (copy = size_goal - skb->len) <= 0) { ++ unsigned long size; + + new_segment: + /* Allocate new segment. If the interface is SG, + * allocate skb fitting to single page. 
+ */ ++ chargesize = 0; + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + +- skb = sk_stream_alloc_skb(sk, select_size(sk), ++ size = select_size(sk, tp); ++ chargesize = skb_charge_size(MAX_TCP_HEADER + ++ size); ++ if (ub_sock_getwres_tcp(sk, chargesize) < 0) ++ goto wait_for_ubspace; ++ skb = sk_stream_alloc_skb(sk, size, + sk->sk_allocation); + if (!skb) + goto wait_for_memory; ++ ub_skb_set_charge(skb, sk, chargesize, ++ UB_TCPSNDBUF); + + /* + * Check whether we can use HW checksum. +@@ -922,6 +960,7 @@ new_segment: + } else if (page) { + if (off == PAGE_SIZE) { + put_page(page); ++ ub_sock_tcp_detachpage(sk); + TCP_PAGE(sk) = page = NULL; + off = 0; + } +@@ -931,10 +970,13 @@ new_segment: + if (copy > PAGE_SIZE - off) + copy = PAGE_SIZE - off; + +- if (!sk_wmem_schedule(sk, copy)) ++ if (!sk_wmem_schedule(sk, copy, skb)) + goto wait_for_memory; + + if (!page) { ++ chargesize = PAGE_SIZE; ++ if (ub_sock_tcp_chargepage(sk) < 0) ++ goto wait_for_ubspace; + /* Allocate new cache page. */ + if (!(page = sk_stream_alloc_page(sk))) + goto wait_for_memory; +@@ -966,7 +1008,8 @@ new_segment: + } else if (off + copy < PAGE_SIZE) { + get_page(page); + TCP_PAGE(sk) = page; +- } ++ } else ++ ub_sock_tcp_detachpage(sk); + } + + TCP_OFF(sk) = off + copy; +@@ -997,10 +1040,15 @@ new_segment: + wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + wait_for_memory: ++ ub_sock_retwres_tcp(sk, chargesize, ++ skb_charge_size(MAX_TCP_HEADER+tp->mss_cache)); ++ chargesize = 0; ++wait_for_ubspace: + if (copied) + tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + +- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) ++ err = __sk_stream_wait_memory(sk, &timeo, chargesize); ++ if (err != 0) + goto do_error; + + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); +@@ -1100,7 +1148,18 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) + #if TCP_DEBUG + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + +- BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); ++ if (!(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq))) { ++ printk("KERNEL: assertion: skb==NULL || " ++ "before(tp->copied_seq, skb->end_seq)\n"); ++ printk("VE%u pid %d comm %.16s\n", ++ (get_exec_env() ? VEID(get_exec_env()) : 0), ++ current->pid, current->comm); ++ printk("copied=%d, copied_seq=%d, rcv_nxt=%d\n", copied, ++ tp->copied_seq, tp->rcv_nxt); ++ printk("skb->len=%d, skb->seq=%d, skb->end_seq=%d\n", ++ skb->len, TCP_SKB_CB(skb)->seq, ++ TCP_SKB_CB(skb)->end_seq); ++ } + #endif + + if (inet_csk_ack_scheduled(sk)) { +@@ -1362,7 +1421,23 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + goto found_ok_skb; + if (tcp_hdr(skb)->fin) + goto found_fin_ok; +- BUG_TRAP(flags & MSG_PEEK); ++ if (!(flags & MSG_PEEK)) { ++ printk("KERNEL: assertion: flags&MSG_PEEK\n"); ++ printk("VE%u pid %d comm %.16s\n", ++ (get_exec_env() ? 
++ VEID(get_exec_env()) : 0), ++ current->pid, current->comm); ++ printk("flags=0x%x, len=%d, copied_seq=%d, " ++ "rcv_nxt=%d\n", flags, ++ (int)len, tp->copied_seq, ++ tp->rcv_nxt); ++ printk("skb->len=%d, *seq=%d, skb->seq=%d, " ++ "skb->end_seq=%d, offset=%d\n", ++ skb->len, *seq, ++ TCP_SKB_CB(skb)->seq, ++ TCP_SKB_CB(skb)->end_seq, ++ offset); ++ } + skb = skb->next; + } while (skb != (struct sk_buff *)&sk->sk_receive_queue); + +@@ -1425,8 +1500,19 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + + tp->ucopy.len = len; + +- BUG_TRAP(tp->copied_seq == tp->rcv_nxt || +- (flags & (MSG_PEEK | MSG_TRUNC))); ++ if (!(tp->copied_seq == tp->rcv_nxt || ++ (flags&(MSG_PEEK|MSG_TRUNC)))) { ++ printk("KERNEL: assertion: tp->copied_seq == " ++ "tp->rcv_nxt || ...\n"); ++ printk("VE%u pid %d comm %.16s\n", ++ (get_exec_env() ? ++ VEID(get_exec_env()) : 0), ++ current->pid, current->comm); ++ printk("flags=0x%x, len=%d, copied_seq=%d, " ++ "rcv_nxt=%d\n", flags, ++ (int)len, tp->copied_seq, ++ tp->rcv_nxt); ++ } + + /* Ugly... If prequeue is not empty, we have to + * process it before releasing socket, otherwise +@@ -1837,7 +1923,7 @@ adjudge_to_death: + state = sk->sk_state; + sock_hold(sk); + sock_orphan(sk); +- atomic_inc(sk->sk_prot->orphan_count); ++ ub_inc_orphan_count(sk); + + /* It is the last release_sock in its life. It will remove backlog. */ + release_sock(sk); +@@ -1887,12 +1973,19 @@ adjudge_to_death: + } + } + if (sk->sk_state != TCP_CLOSE) { ++ int orphans = ub_get_orphan_count(sk); ++ + sk_mem_reclaim(sk); +- if (tcp_too_many_orphans(sk, +- atomic_read(sk->sk_prot->orphan_count))) { +- if (net_ratelimit()) ++ if (ub_too_many_orphans(sk, orphans)) { ++ if (net_ratelimit()) { ++ int ubid = 0; ++#ifdef CONFIG_USER_RESOURCE ++ ubid = sock_has_ubc(sk) ? ++ top_beancounter(sock_bc(sk)->ub)->ub_uid : 0; ++#endif + printk(KERN_INFO "TCP: too many of orphaned " +- "sockets\n"); ++ "sockets (%d in CT%d)\n", orphans, ubid); ++ } + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); +@@ -1968,6 +2061,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->snd_ssthresh = 0x7fffffff; + tp->snd_cwnd_cnt = 0; + tp->bytes_acked = 0; ++ tp->advmss = 65535; + tcp_set_ca_state(sk, TCP_CA_Open); + tcp_clear_retrans(tp); + inet_csk_delack_init(sk); +@@ -2632,7 +2726,7 @@ void __init tcp_init(void) + tcp_hashinfo.bind_bucket_cachep = + kmem_cache_create("tcp_bind_bucket", + sizeof(struct inet_bind_bucket), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL); + + /* Size and allocate the main established and bind bucket + * hash tables. 
+@@ -2701,6 +2795,11 @@ void __init tcp_init(void) + sysctl_tcp_mem[1] = limit; + sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; + ++ if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 4096) ++ sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 4096; ++ if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 4096) ++ sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 4096; ++ + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); + max_share = min(4UL*1024*1024, limit); +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index cad73b7..bdb0162 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -73,6 +73,8 @@ + #include + #include + ++#include ++ + int sysctl_tcp_timestamps __read_mostly = 1; + int sysctl_tcp_window_scaling __read_mostly = 1; + int sysctl_tcp_sack __read_mostly = 1; +@@ -308,7 +310,7 @@ static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) + /* Check #1 */ + if (tp->rcv_ssthresh < tp->window_clamp && + (int)tp->rcv_ssthresh < tcp_space(sk) && +- !tcp_memory_pressure) { ++ ub_tcp_rmem_allows_expand(sk)) { + int incr; + + /* Check #2. Increase window, if skb with such overhead +@@ -378,6 +380,8 @@ static void tcp_init_buffer_space(struct sock *sk) + + tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); + tp->snd_cwnd_stamp = tcp_time_stamp; ++ ++ ub_tcp_update_maxadvmss(sk); + } + + /* 5. Recalculate window clamp after socket hit its memory bounds. */ +@@ -390,7 +394,7 @@ static void tcp_clamp_window(struct sock *sk) + + if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && +- !tcp_memory_pressure && ++ !ub_tcp_memory_pressure(sk) && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), + sysctl_tcp_rmem[2]); +@@ -3877,19 +3881,19 @@ static void tcp_ofo_queue(struct sock *sk) + static int tcp_prune_ofo_queue(struct sock *sk); + static int tcp_prune_queue(struct sock *sk); + +-static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) ++static inline int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb) + { + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || +- !sk_rmem_schedule(sk, size)) { ++ !sk_rmem_schedule(sk, skb)) { + + if (tcp_prune_queue(sk) < 0) + return -1; + +- if (!sk_rmem_schedule(sk, size)) { ++ if (!sk_rmem_schedule(sk, skb)) { + if (!tcp_prune_ofo_queue(sk)) + return -1; + +- if (!sk_rmem_schedule(sk, size)) ++ if (!sk_rmem_schedule(sk, skb)) + return -1; + } + } +@@ -3945,8 +3949,8 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) + if (eaten <= 0) { + queue_and_out: + if (eaten < 0 && +- tcp_try_rmem_schedule(sk, skb->truesize)) +- goto drop; ++ tcp_try_rmem_schedule(sk, skb)) ++ goto drop_part; + + skb_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); +@@ -3990,6 +3994,12 @@ out_of_window: + drop: + __kfree_skb(skb); + return; ++ ++drop_part: ++ if (after(tp->copied_seq, tp->rcv_nxt)) ++ tp->rcv_nxt = tp->copied_seq; ++ __kfree_skb(skb); ++ return; + } + + /* Out of window. F.e. zero window probe. */ +@@ -4016,7 +4026,7 @@ drop: + + TCP_ECN_check_ce(tp, skb); + +- if (tcp_try_rmem_schedule(sk, skb->truesize)) ++ if (tcp_try_rmem_schedule(sk, skb)) + goto drop; + + /* Disable header prediction. 
*/ +@@ -4160,6 +4170,10 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, + nskb = alloc_skb(copy + header, GFP_ATOMIC); + if (!nskb) + return; ++ if (ub_tcprcvbuf_charge_forced(skb->sk, nskb) < 0) { ++ kfree_skb(nskb); ++ return; ++ } + + skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); + skb_set_network_header(nskb, (skb_network_header(skb) - +@@ -4287,7 +4301,7 @@ static int tcp_prune_queue(struct sock *sk) + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + tcp_clamp_window(sk); +- else if (tcp_memory_pressure) ++ else if (ub_tcp_memory_pressure(sk)) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + + tcp_collapse_ofo_queue(sk); +@@ -4352,7 +4366,7 @@ static int tcp_should_expand_sndbuf(struct sock *sk) + return 0; + + /* If we are under global TCP memory pressure, do not expand. */ +- if (tcp_memory_pressure) ++ if (ub_tcp_memory_pressure(sk)) + return 0; + + /* If we are under soft global TCP memory pressure, do not expand. */ +@@ -4801,6 +4815,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + + if ((int)skb->truesize > sk->sk_forward_alloc) + goto step5; ++ /* This is OK not to try to free memory here. ++ * Do this below on slow path. Den */ ++ if (ub_tcprcvbuf_charge(sk, skb) < 0) ++ goto step5; + + NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS); + +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index ffe869a..ca6b5d3 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -73,6 +73,8 @@ + #include + #include + ++#include ++ + #include + #include + #include +@@ -699,7 +701,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) + struct tcp_timewait_sock *tcptw = tcp_twsk(sk); + + tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, +- tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, ++ tcptw->tw_rcv_wnd >> ++ (tw->tw_rcv_wscale & TW_WSCALE_MASK), + tcptw->tw_ts_recent); + + inet_twsk_put(tw); +@@ -1228,6 +1231,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { + .destructor = tcp_v4_reqsk_destructor, + .send_reset = tcp_v4_send_reset, + }; ++EXPORT_SYMBOL_GPL(tcp_request_sock_ops); + + #ifdef CONFIG_TCP_MD5SIG + static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { +@@ -1532,6 +1536,10 @@ static __sum16 tcp_v4_checksum_init(struct sk_buff *skb) + int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) + { + struct sock *rsk; ++ struct user_beancounter *ub; ++ ++ ub = set_exec_ub(sock_bc(sk)->ub); ++ + #ifdef CONFIG_TCP_MD5SIG + /* + * We really want to reject the packet as early as possible +@@ -1550,7 +1558,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) + goto reset; + } + TCP_CHECK_TIMER(sk); +- return 0; ++ goto restore_context; + } + + if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) +@@ -1566,7 +1574,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) + rsk = nsk; + goto reset; + } +- return 0; ++ goto restore_context; + } + } + +@@ -1576,6 +1584,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) + goto reset; + } + TCP_CHECK_TIMER(sk); ++ ++restore_context: ++ (void)set_exec_ub(ub); + return 0; + + reset: +@@ -1587,7 +1598,7 @@ discard: + * might be destroyed here. This current version compiles correctly, + * but you have been warned. 
+ */
+- return 0;
++ goto restore_context;
+
+ csum_err:
+ TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ goto discard;
+ }
+@@ -1849,6 +1860,8 @@ static int tcp_v4_init_sock(struct sock *sk)
+ tp->snd_cwnd_clamp = ~0;
+ tp->mss_cache = 536;
+
++ tp->advmss = 65535; /* max value */
++
+ tp->reordering = sysctl_tcp_reordering;
+ icsk->icsk_ca_ops = &tcp_init_congestion_ops;
+
+@@ -1910,6 +1923,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
+ * If sendmsg cached page exists, toss it.
+ */
+ if (sk->sk_sndmsg_page) {
++ /* queue is empty, uncharge */
++ ub_sock_tcp_detachpage(sk);
+ __free_page(sk->sk_sndmsg_page);
+ sk->sk_sndmsg_page = NULL;
+ }
+@@ -2463,6 +2478,87 @@ void __init tcp_v4_init(void)
+ panic("Failed to create the TCP control socket.\n");
+ }
+
++#ifdef CONFIG_VE
++static void tcp_kill_ve_onesk(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ /* Check the assumed state of the socket. */
++ if (!sock_flag(sk, SOCK_DEAD)) {
++ static int printed;
++invalid:
++ if (!printed)
++ printk(KERN_DEBUG "Killing sk: dead %d, state %d, "
++ "wrseq %u unseq %u, wrqu %d.\n",
++ sock_flag(sk, SOCK_DEAD), sk->sk_state,
++ tp->write_seq, tp->snd_una,
++ !skb_queue_empty(&sk->sk_write_queue));
++ printed = 1;
++ return;
++ }
++
++ tcp_send_active_reset(sk, GFP_ATOMIC);
++ switch (sk->sk_state) {
++ case TCP_FIN_WAIT1:
++ case TCP_CLOSING:
++ /* In these 2 states the peer may want us to retransmit
++ * some data and/or FIN. Entering "resetting mode"
++ * instead.
++ */
++ tcp_time_wait(sk, TCP_CLOSE, 0);
++ break;
++ case TCP_FIN_WAIT2:
++ /* For some reason the socket may stay in this state
++ * without turning into a TW bucket. Fix it.
++ */
++ tcp_time_wait(sk, TCP_FIN_WAIT2, 0);
++ break;
++ case TCP_LAST_ACK:
++ /* Just jump into CLOSED state. */
++ tcp_done(sk);
++ break;
++ default:
++ /* The socket must already be close()d. 
*/ ++ goto invalid; ++ } ++} ++ ++void tcp_v4_kill_ve_sockets(struct ve_struct *envid) ++{ ++ struct inet_ehash_bucket *head; ++ int i; ++ ++ /* alive */ ++ local_bh_disable(); ++ head = tcp_hashinfo.ehash; ++ for (i = 0; i < tcp_hashinfo.ehash_size; i++) { ++ struct sock *sk; ++ struct hlist_node *node; ++ rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, i); ++more_work: ++ write_lock(lock); ++ sk_for_each(sk, node, &head[i].chain) { ++ if (ve_accessible_strict(sk->owner_env, envid)) { ++ sock_hold(sk); ++ write_unlock(lock); ++ ++ bh_lock_sock(sk); ++ /* sk might have disappeared from the hash before ++ * we got the lock */ ++ if (sk->sk_state != TCP_CLOSE) ++ tcp_kill_ve_onesk(sk); ++ bh_unlock_sock(sk); ++ sock_put(sk); ++ goto more_work; ++ } ++ } ++ write_unlock(lock); ++ } ++ local_bh_enable(); ++} ++EXPORT_SYMBOL(tcp_v4_kill_ve_sockets); ++#endif ++ + EXPORT_SYMBOL(ipv4_specific); + EXPORT_SYMBOL(tcp_hashinfo); + EXPORT_SYMBOL(tcp_prot); +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 8245247..8bbda56 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -28,6 +28,9 @@ + #include + #include + ++#include ++#include ++ + #ifdef CONFIG_SYSCTL + #define SYNC_INIT 0 /* let the user enable it */ + #else +@@ -38,6 +41,11 @@ int sysctl_tcp_syncookies __read_mostly = SYNC_INIT; + EXPORT_SYMBOL(sysctl_tcp_syncookies); + + int sysctl_tcp_abort_on_overflow __read_mostly; ++int sysctl_tcp_max_tw_kmem_fraction __read_mostly = 384; ++int sysctl_tcp_max_tw_buckets_ub __read_mostly = 16536; ++ ++EXPORT_SYMBOL(sysctl_tcp_max_tw_kmem_fraction); ++EXPORT_SYMBOL(sysctl_tcp_max_tw_buckets_ub); + + struct inet_timewait_death_row tcp_death_row = { + .sysctl_max_tw_buckets = NR_FILE * 2, +@@ -53,6 +61,7 @@ struct inet_timewait_death_row tcp_death_row = { + .twcal_hand = -1, + .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, + (unsigned long)&tcp_death_row), ++ .ub_managed = 1, + }; + + EXPORT_SYMBOL_GPL(tcp_death_row); +@@ -281,7 +290,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) + if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) + recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); + +- if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) ++ if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets && ++ ub_timewait_check(sk, &tcp_death_row)) + tw = inet_twsk_alloc(sk, state); + + if (tw != NULL) { +@@ -294,6 +304,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) + tcptw->tw_rcv_wnd = tcp_receive_window(tp); + tcptw->tw_ts_recent = tp->rx_opt.ts_recent; + tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; ++ if (sk->sk_user_data != NULL) ++ tw->tw_rcv_wscale |= TW_WSCALE_SPEC; + + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (tw->tw_family == PF_INET6) { +@@ -328,6 +340,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) + } + } while (0); + #endif ++ tw->tw_owner_env = VEID(sk->owner_env); + + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); +@@ -348,11 +361,16 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) + TCP_TIMEWAIT_LEN); + inet_twsk_put(tw); + } else { ++ int ubid = 0; + /* Sorry, if we're out of memory, just CLOSE this + * socket up. We've got bigger problems than + * non-graceful socket closings. 
+ */ +- LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow\n"); ++#ifdef CONFIG_BEANCOUNTERS ++ if (sock_has_ubc(sk)) ++ ubid = top_beancounter(sock_bc(sk)->ub)->ub_uid; ++#endif ++ LIMIT_NETDEBUG(KERN_INFO "TCP: time wait bucket table overflow (CT%d)\n", ubid); + } + + tcp_update_metrics(sk); +@@ -393,6 +411,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, + struct tcp_sock *newtp; + + /* Now setup tcp_sock */ ++ newsk->owner_env = sk->owner_env; ++ + newtp = tcp_sk(newsk); + newtp->pred_flags = 0; + newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index ad993ec..4459fd3 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -41,6 +41,9 @@ + #include + #include + ++#include ++#include ++ + /* People can turn this off for buggy TCP's found in printers etc. */ + int sysctl_tcp_retrans_collapse __read_mostly = 1; + +@@ -455,6 +458,13 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, + #endif + } + ++static int skb_header_size(struct sock *sk, int tcp_hlen) ++{ ++ struct ip_options *opt = inet_sk(sk)->opt; ++ return tcp_hlen + sizeof(struct iphdr) + ++ (opt ? opt->optlen : 0) + ETH_HLEN /* For hard header */; ++} ++ + /* This routine actually transmits TCP packets queued in by + * tcp_do_sendmsg(). This is used by both the initial + * transmission and possible later retransmissions. +@@ -474,6 +484,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + struct tcp_sock *tp; + struct tcp_skb_cb *tcb; + int tcp_header_size; ++ int header_size; + #ifdef CONFIG_TCP_MD5SIG + struct tcp_md5sig_key *md5; + __u8 *md5_hash_location; +@@ -533,6 +544,20 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + TCPOLEN_SACK_PERBLOCK)); + } + ++ /* Unfortunately, we can have skb from outside world here ++ * with size insufficient for header. It is impossible to make ++ * guess when we queue skb, so the decision should be made ++ * here. Den ++ */ ++ header_size = skb_header_size(sk, tcp_header_size); ++ if (skb->data - header_size < skb->head) { ++ int delta = header_size - skb_headroom(skb); ++ err = pskb_expand_head(skb, SKB_DATA_ALIGN(delta), ++ 0, GFP_ATOMIC); ++ if (err) ++ return err; ++ } ++ + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(sk, CA_EVENT_TX_START); + +@@ -706,15 +731,23 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, + if (nsize < 0) + nsize = 0; + +- if (skb_cloned(skb) && +- skb_is_nonlinear(skb) && +- pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) +- return -ENOMEM; ++ if (skb_cloned(skb) && skb_is_nonlinear(skb)) { ++ unsigned long chargesize; ++ chargesize = skb_bc(skb)->charged; ++ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) ++ return -ENOMEM; ++ ub_sock_tcp_unchargesend(sk, chargesize); ++ ub_tcpsndbuf_charge_forced(sk, skb); ++ } + + /* Get a new skb... force flag on. */ + buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); + if (buff == NULL) + return -ENOMEM; /* We'll just try again later. 
*/ ++ if (ub_tcpsndbuf_charge(sk, buff) < 0) { ++ kfree_skb(buff); ++ return -ENOMEM; ++ } + + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); +@@ -1216,6 +1249,11 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, + if (unlikely(buff == NULL)) + return -ENOMEM; + ++ if (ub_tcpsndbuf_charge(sk, buff) < 0) { ++ kfree_skb(buff); ++ return -ENOMEM; ++ } ++ + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); + buff->truesize += nlen; +@@ -1651,7 +1689,7 @@ u32 __tcp_select_window(struct sock *sk) + if (free_space < (full_space >> 1)) { + icsk->icsk_ack.quick = 0; + +- if (tcp_memory_pressure) ++ if (ub_tcp_shrink_rcvbuf(sk)) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, + 4U * tp->advmss); + +@@ -2096,6 +2134,7 @@ void tcp_send_fin(struct sock *sk) + break; + yield(); + } ++ ub_tcpsndbuf_charge_forced(sk, skb); + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); +@@ -2154,6 +2193,10 @@ int tcp_send_synack(struct sock *sk) + struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + if (nskb == NULL) + return -ENOMEM; ++ if (ub_tcpsndbuf_charge(sk, skb) < 0) { ++ kfree_skb(nskb); ++ return -ENOMEM; ++ } + tcp_unlink_write_queue(skb, sk); + skb_header_release(nskb); + __tcp_add_write_queue_head(sk, nskb); +@@ -2282,6 +2325,7 @@ static void tcp_connect_init(struct sock *sk) + struct dst_entry *dst = __sk_dst_get(sk); + struct tcp_sock *tp = tcp_sk(sk); + __u8 rcv_wscale; ++ static int once = 0; + + /* We'll fix this up when we get a response from the other end. + * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. +@@ -2301,9 +2345,23 @@ static void tcp_connect_init(struct sock *sk) + tcp_mtup_init(sk); + tcp_sync_mss(sk, dst_mtu(dst)); + ++ if (!once && dst_metric(dst, RTAX_ADVMSS) == 0) { ++ once = 1; ++ ++ printk("Oops in connect_init! dst->advmss=%d\n", ++ dst_metric(dst, RTAX_ADVMSS)); ++ printk("dst: pmtu=%u\n", dst_metric(dst, RTAX_MTU)); ++ printk("sk->state=%d, tp: ack.rcv_mss=%d, mss_cache=%d, " ++ "advmss=%d, user_mss=%d\n", ++ sk->sk_state, inet_csk(sk)->icsk_ack.rcv_mss, ++ tp->mss_cache, tp->advmss, tp->rx_opt.user_mss); ++ } ++ + if (!tp->window_clamp) + tp->window_clamp = dst_metric(dst, RTAX_WINDOW); + tp->advmss = dst_metric(dst, RTAX_ADVMSS); ++ if (tp->advmss == 0) ++ tp->advmss = 1460; + tcp_initialize_rcv_mss(sk); + + tcp_select_initial_window(tcp_full_space(sk), +@@ -2344,6 +2402,10 @@ int tcp_connect(struct sock *sk) + buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); + if (unlikely(buff == NULL)) + return -ENOBUFS; ++ if (ub_tcpsndbuf_charge(sk, buff) < 0) { ++ kfree_skb(buff); ++ return -ENOBUFS; ++ } + + /* Reserve space for headers. */ + skb_reserve(buff, MAX_TCP_HEADER); +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 63ed9d6..2432a49 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -22,6 +22,8 @@ + + #include + #include ++#include ++#include + + int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; + int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES; +@@ -67,7 +69,8 @@ static void tcp_write_err(struct sock *sk) + static int tcp_out_of_resources(struct sock *sk, int do_reset) + { + struct tcp_sock *tp = tcp_sk(sk); +- int orphans = atomic_read(&tcp_orphan_count); ++ int orphans = ub_get_orphan_count(sk); ++ int orph = orphans; + + /* If peer does not open window for long time, or did not transmit + * anything for long time, penalize it. 
*/ +@@ -78,10 +81,16 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) + if (sk->sk_err_soft) + orphans <<= 1; + +- if (tcp_too_many_orphans(sk, orphans)) { +- if (net_ratelimit()) +- printk(KERN_INFO "Out of socket memory\n"); +- ++ if (ub_too_many_orphans(sk, orphans)) { ++ if (net_ratelimit()) { ++ int ubid = 0; ++#ifdef CONFIG_USER_RESOURCE ++ ubid = sock_has_ubc(sk) ? ++ top_beancounter(sock_bc(sk)->ub)->ub_uid : 0; ++#endif ++ printk(KERN_INFO "Orphaned socket dropped " ++ "(%d,%d in CT%d)\n", orph, orphans, ubid); ++ } + /* Catch exceptional cases, when connection requires reset. + * 1. Last segment was sent recently. */ + if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || +@@ -174,9 +183,12 @@ static int tcp_write_timeout(struct sock *sk) + static void tcp_delack_timer(unsigned long data) + { + struct sock *sk = (struct sock*)data; ++ struct ve_struct *env; + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + ++ env = set_exec_env(sk->owner_env); ++ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ +@@ -225,11 +237,12 @@ static void tcp_delack_timer(unsigned long data) + TCP_CHECK_TIMER(sk); + + out: +- if (tcp_memory_pressure) ++ if (ub_tcp_memory_pressure(sk)) + sk_mem_reclaim(sk); + out_unlock: + bh_unlock_sock(sk); + sock_put(sk); ++ (void)set_exec_env(env); + } + + static void tcp_probe_timer(struct sock *sk) +@@ -284,8 +297,11 @@ static void tcp_probe_timer(struct sock *sk) + static void tcp_retransmit_timer(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); ++ struct ve_struct *env; + struct inet_connection_sock *icsk = inet_csk(sk); + ++ env = set_exec_env(sk->owner_env); ++ + if (!tp->packets_out) + goto out; + +@@ -390,15 +406,19 @@ out_reset_timer: + if (icsk->icsk_retransmits > sysctl_tcp_retries1) + __sk_dst_reset(sk); + +-out:; ++out: ++ (void)set_exec_env(env); + } + + static void tcp_write_timer(unsigned long data) + { + struct sock *sk = (struct sock*)data; ++ struct ve_struct *env; + struct inet_connection_sock *icsk = inet_csk(sk); + int event; + ++ env = set_exec_env(sk->owner_env); ++ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later */ +@@ -432,6 +452,7 @@ out: + out_unlock: + bh_unlock_sock(sk); + sock_put(sk); ++ (void)set_exec_env(env); + } + + /* +@@ -459,10 +480,13 @@ void tcp_set_keepalive(struct sock *sk, int val) + static void tcp_keepalive_timer (unsigned long data) + { + struct sock *sk = (struct sock *) data; ++ struct ve_struct *env; + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + __u32 elapsed; + ++ env = set_exec_env(sk->owner_env); ++ + /* Only process if socket is not in use. 
*/ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { +@@ -534,4 +558,5 @@ death: + out: + bh_unlock_sock(sk); + sock_put(sk); ++ (void)set_exec_env(env); + } +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index 56fcda3..2a1087b 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -159,7 +159,9 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, + struct sock *sk2; + int error = 1; + struct net *net = sock_net(sk); ++ struct ve_struct *ve; + ++ ve = get_exec_env(); + write_lock_bh(&udp_hash_lock); + + if (!snum) { +@@ -176,7 +178,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, + for (i = 0; i < UDP_HTABLE_SIZE; i++) { + int size = 0; + +- head = &udptable[rover & (UDP_HTABLE_SIZE - 1)]; ++ head = &udptable[udp_hashfn(rover, VEID(ve))]; + if (hlist_empty(head)) + goto gotit; + +@@ -213,7 +215,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, + gotit: + snum = rover; + } else { +- head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; ++ head = &udptable[udp_hashfn(snum, VEID(ve))]; + + sk_for_each(sk2, node, head) + if (sk2->sk_hash == snum && +@@ -229,7 +231,7 @@ gotit: + inet_sk(sk)->num = snum; + sk->sk_hash = snum; + if (sk_unhashed(sk)) { +- head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; ++ head = &udptable[udp_hashfn(snum, VEID(ve))]; + sk_add_node(sk, head); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + } +@@ -264,9 +266,11 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, + struct hlist_node *node; + unsigned short hnum = ntohs(dport); + int badness = -1; ++ struct ve_struct *ve; + ++ ve = get_exec_env(); + read_lock(&udp_hash_lock); +- sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { ++ sk_for_each(sk, node, &udptable[udp_hashfn(hnum, VEID(ve))]) { + struct inet_sock *inet = inet_sk(sk); + + if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && +@@ -1070,7 +1074,8 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, + int dif; + + read_lock(&udp_hash_lock); +- sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); ++ sk = sk_head(&udptable[udp_hashfn(ntohs(uh->dest), ++ VEID(skb->owner_env))]); + dif = skb->dev->ifindex; + sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); + if (sk) { +diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c +index ff61a5c..5935c08 100644 +--- a/net/ipv6/addrconf.c ++++ b/net/ipv6/addrconf.c +@@ -388,9 +388,8 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) + dev->type == ARPHRD_TUNNEL6 || + dev->type == ARPHRD_SIT || + dev->type == ARPHRD_NONE) { +- printk(KERN_INFO +- "%s: Disabled Privacy Extensions\n", +- dev->name); ++ ADBG((KERN_INFO "%s: Disabled Privacy Extensions\n", ++ dev->name)); + ndev->cnf.use_tempaddr = -1; + } else { + in6_dev_hold(ndev); +@@ -584,7 +583,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, + goto out; + } + +- ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); ++ ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC_UBC); + + if (ifa == NULL) { + ADBG(("ipv6_add_addr: malloc failed\n")); +@@ -2025,7 +2024,7 @@ err_exit: + /* + * Manual configuration of address on an interface + */ +-static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, ++int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, + unsigned int plen, __u8 ifa_flags, __u32 prefered_lft, + __u32 valid_lft) + { +@@ -2097,6 +2096,7 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, + + return PTR_ERR(ifp); + } 
++EXPORT_SYMBOL_GPL(inet6_addr_add); + + static int inet6_addr_del(struct net *net, int ifindex, struct in6_addr *pfx, + unsigned int plen) +@@ -2142,7 +2142,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) + struct in6_ifreq ireq; + int err; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) +@@ -2161,7 +2161,7 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) + struct in6_ifreq ireq; + int err; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) +@@ -2664,6 +2664,9 @@ static int addrconf_ifdown(struct net_device *dev, int how) + static void addrconf_rs_timer(unsigned long data) + { + struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; ++ struct ve_struct *old_env; ++ ++ old_env = set_exec_env(ifp->idev->dev->owner_env); + + if (ifp->idev->cnf.forwarding) + goto out; +@@ -2698,6 +2701,7 @@ static void addrconf_rs_timer(unsigned long data) + + out: + in6_ifa_put(ifp); ++ (void)set_exec_env(old_env); + } + + /* +@@ -2773,7 +2777,9 @@ static void addrconf_dad_timer(unsigned long data) + struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + struct inet6_dev *idev = ifp->idev; + struct in6_addr mcaddr; ++ struct ve_struct *old_env; + ++ old_env = set_exec_env(ifp->idev->dev->owner_env); + read_lock_bh(&idev->lock); + if (idev->dead) { + read_unlock_bh(&idev->lock); +@@ -2804,6 +2810,7 @@ static void addrconf_dad_timer(unsigned long data) + ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any); + out: + in6_ifa_put(ifp); ++ (void)set_exec_env(old_env); + } + + static void addrconf_dad_completed(struct inet6_ifaddr *ifp) +@@ -3026,6 +3033,7 @@ static void addrconf_verify(unsigned long foo) + struct inet6_ifaddr *ifp; + unsigned long now, next; + int i; ++ struct ve_struct *old_env; + + spin_lock_bh(&addrconf_verify_lock); + now = jiffies; +@@ -3046,6 +3054,8 @@ restart: + if (ifp->flags & IFA_F_PERMANENT) + continue; + ++ old_env = set_exec_env(ifp->idev->dev->owner_env); ++ + spin_lock(&ifp->lock); + age = (now - ifp->tstamp) / HZ; + +@@ -3061,9 +3071,11 @@ restart: + in6_ifa_hold(ifp); + read_unlock(&addrconf_hash_lock); + ipv6_del_addr(ifp); ++ (void)set_exec_env(old_env); + goto restart; + } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) { + spin_unlock(&ifp->lock); ++ set_exec_env(old_env); + continue; + } else if (age >= ifp->prefered_lft) { + /* jiffies - ifp->tsamp > age >= ifp->prefered_lft */ +@@ -3085,6 +3097,7 @@ restart: + + ipv6_ifa_notify(0, ifp); + in6_ifa_put(ifp); ++ (void)set_exec_env(old_env); + goto restart; + } + #ifdef CONFIG_IPV6_PRIVACY +@@ -3106,6 +3119,7 @@ restart: + ipv6_create_tempaddr(ifpub, ifp); + in6_ifa_put(ifpub); + in6_ifa_put(ifp); ++ (void)set_exec_env(old_env); + goto restart; + } + } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) +@@ -3118,6 +3132,7 @@ restart: + next = ifp->tstamp + ifp->prefered_lft * HZ; + spin_unlock(&ifp->lock); + } ++ (void)set_exec_env(old_env); + } + read_unlock(&addrconf_hash_lock); + } +diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c +index e84b3fd..10c74ae 100644 +--- a/net/ipv6/af_inet6.c ++++ b/net/ipv6/af_inet6.c +@@ -58,6 +58,10 @@ + #ifdef CONFIG_IPV6_TUNNEL + #include + #endif ++#ifdef CONFIG_IPV6_MIP6 ++#include ++#endif ++#include + + #include + #include +@@ -147,6 +151,10 @@ lookup_protocol: + goto out_rcu_unlock; + } + ++ err 
= vz_security_protocol_check(answer->protocol); ++ if (err < 0) ++ goto out_rcu_unlock; ++ + err = -EPERM; + if (answer->capability > 0 && !capable(answer->capability)) + goto out_rcu_unlock; +@@ -164,6 +172,13 @@ lookup_protocol: + if (sk == NULL) + goto out; + ++ err = -ENOBUFS; ++ if (ub_sock_charge(sk, PF_INET6, sock->type)) ++ goto out_sk_free; ++ /* if charge was successful, sock_init_data() MUST be called to ++ * set sk->sk_type. otherwise sk will be uncharged to wrong resource ++ */ ++ + sock_init_data(sock, sk); + + err = 0; +@@ -238,6 +253,9 @@ out: + out_rcu_unlock: + rcu_read_unlock(); + goto out; ++out_sk_free: ++ sk_free(sk); ++ return err; + } + + +@@ -803,45 +821,48 @@ static void ipv6_packet_cleanup(void) + dev_remove_pack(&ipv6_packet_type); + } + +-static int __init init_ipv6_mibs(void) ++int init_ipv6_mibs(void) + { +- if (snmp_mib_init((void **)ipv6_statistics, ++ if (snmp_mib_init((void **)ve_ipv6_statistics, + sizeof(struct ipstats_mib)) < 0) + goto err_ip_mib; +- if (snmp_mib_init((void **)icmpv6_statistics, ++ if (snmp_mib_init((void **)ve_icmpv6_statistics, + sizeof(struct icmpv6_mib)) < 0) + goto err_icmp_mib; +- if (snmp_mib_init((void **)icmpv6msg_statistics, ++ if (snmp_mib_init((void **)ve_icmpv6msg_statistics, + sizeof(struct icmpv6msg_mib)) < 0) + goto err_icmpmsg_mib; +- if (snmp_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib)) < 0) ++ if (snmp_mib_init((void **)ve_udp_stats_in6, ++ sizeof (struct udp_mib)) < 0) + goto err_udp_mib; +- if (snmp_mib_init((void **)udplite_stats_in6, ++ if (snmp_mib_init((void **)ve_udplite_stats_in6, + sizeof (struct udp_mib)) < 0) + goto err_udplite_mib; + return 0; + + err_udplite_mib: +- snmp_mib_free((void **)udp_stats_in6); ++ snmp_mib_free((void **)ve_udp_stats_in6); + err_udp_mib: +- snmp_mib_free((void **)icmpv6msg_statistics); ++ snmp_mib_free((void **)ve_icmpv6msg_statistics); + err_icmpmsg_mib: +- snmp_mib_free((void **)icmpv6_statistics); ++ snmp_mib_free((void **)ve_icmpv6_statistics); + err_icmp_mib: +- snmp_mib_free((void **)ipv6_statistics); ++ snmp_mib_free((void **)ve_ipv6_statistics); + err_ip_mib: + return -ENOMEM; + + } ++EXPORT_SYMBOL(init_ipv6_mibs); + +-static void cleanup_ipv6_mibs(void) ++void cleanup_ipv6_mibs(void) + { +- snmp_mib_free((void **)ipv6_statistics); +- snmp_mib_free((void **)icmpv6_statistics); +- snmp_mib_free((void **)icmpv6msg_statistics); +- snmp_mib_free((void **)udp_stats_in6); +- snmp_mib_free((void **)udplite_stats_in6); ++ snmp_mib_free((void **)ve_ipv6_statistics); ++ snmp_mib_free((void **)ve_icmpv6_statistics); ++ snmp_mib_free((void **)ve_icmpv6msg_statistics); ++ snmp_mib_free((void **)ve_udp_stats_in6); ++ snmp_mib_free((void **)ve_udplite_stats_in6); + } ++EXPORT_SYMBOL(cleanup_ipv6_mibs); + + static int inet6_net_init(struct net *net) + { +diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c +index 580014a..f099a61 100644 +--- a/net/ipv6/inet6_hashtables.c ++++ b/net/ipv6/inet6_hashtables.c +@@ -68,7 +68,8 @@ struct sock *__inet6_lookup_established(struct net *net, + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. 
+ */ +- unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport); ++ struct ve_struct *env = get_exec_env(); ++ unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport, VEID(env)); + struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); + +@@ -102,9 +103,10 @@ struct sock *inet6_lookup_listener(struct net *net, + const struct hlist_node *node; + struct sock *result = NULL; + int score, hiscore = 0; ++ struct ve_struct *ve = get_exec_env(); + + read_lock(&hashinfo->lhash_lock); +- sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) { ++ sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum, VEID(ve))]) { + if (net_eq(sock_net(sk), net) && inet_sk(sk)->num == hnum && + sk->sk_family == PF_INET6) { + const struct ipv6_pinfo *np = inet6_sk(sk); +@@ -156,7 +158,8 @@ EXPORT_SYMBOL_GPL(inet6_lookup); + + static int __inet6_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, const __u16 lport, +- struct inet_timewait_sock **twp) ++ struct inet_timewait_sock **twp, ++ struct ve_struct *ve) + { + struct inet_hashinfo *hinfo = death_row->hashinfo; + struct inet_sock *inet = inet_sk(sk); +@@ -166,7 +169,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, + const int dif = sk->sk_bound_dev_if; + const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); + const unsigned int hash = inet6_ehashfn(daddr, lport, saddr, +- inet->dport); ++ inet->dport, VEID(ve)); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + rwlock_t *lock = inet_ehash_lockp(hinfo, hash); + struct sock *sk2; +diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c +index 1ee4fa1..45acc3e 100644 +--- a/net/ipv6/ip6_fib.c ++++ b/net/ipv6/ip6_fib.c +@@ -184,11 +184,9 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb) + + h = tb->tb6_id & (FIB_TABLE_HASHSZ - 1); + +- /* +- * No protection necessary, this is the only list mutatation +- * operation, tables never disappear once they exist. 
+- */ ++ write_lock_bh(&tb->tb6_lock); + hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); ++ write_unlock_bh(&tb->tb6_lock); + } + + #ifdef CONFIG_IPV6_MULTIPLE_TABLES +@@ -1370,10 +1368,14 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(table, node, head, tb6_hlist) { ++ struct ve_struct *old_env; ++ ++ old_env = set_exec_env(table->owner_env); + write_lock_bh(&table->tb6_lock); + fib6_clean_tree(net, &table->tb6_root, + func, prune, arg); + write_unlock_bh(&table->tb6_lock); ++ (void)set_exec_env(old_env); + } + } + rcu_read_unlock(); +@@ -1506,6 +1508,9 @@ static int fib6_net_init(struct net *net) + if (!net->ipv6.fib6_main_tbl) + goto out_fib_table_hash; + ++#ifdef CONFIG_VE ++ net->ipv6.fib6_main_tbl->owner_env = get_exec_env(); ++#endif + net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; + net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + net->ipv6.fib6_main_tbl->tb6_root.fn_flags = +@@ -1516,6 +1521,10 @@ static int fib6_net_init(struct net *net) + GFP_KERNEL); + if (!net->ipv6.fib6_local_tbl) + goto out_fib6_main_tbl; ++ ++#ifdef CONFIG_VE ++ net->ipv6.fib6_local_tbl->owner_env = get_exec_env(); ++#endif + net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; + net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + net->ipv6.fib6_local_tbl->tb6_root.fn_flags = +@@ -1564,7 +1573,7 @@ int __init fib6_init(void) + + fib6_node_kmem = kmem_cache_create("fib6_nodes", + sizeof(struct fib6_node), +- 0, SLAB_HWCACHE_ALIGN, ++ 0, SLAB_HWCACHE_ALIGN|SLAB_UBC, + NULL); + if (!fib6_node_kmem) + goto out; +diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c +index 48cdce9..0976ff5 100644 +--- a/net/ipv6/ip6_output.c ++++ b/net/ipv6/ip6_output.c +@@ -516,6 +516,20 @@ int ip6_forward(struct sk_buff *skb) + return -EMSGSIZE; + } + ++ /* ++ * We try to optimize forwarding of VE packets: ++ * do not decrement TTL (and so save skb_cow) ++ * during forwarding of outgoing pkts from VE. ++ * For incoming pkts we still do ttl decr, ++ * since such skb is not cloned and does not require ++ * actual cow. So, there is at least one place ++ * in pkts path with mandatory ttl decr, that is ++ * sufficient to prevent routing loops. 
++ */ ++ hdr = ipv6_hdr(skb); ++ if (skb->dev->features & NETIF_F_VENET) /* src is VENET device */ ++ goto no_ttl_decr; ++ + if (skb_cow(skb, dst->dev->hard_header_len)) { + IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); + goto drop; +@@ -527,6 +541,7 @@ int ip6_forward(struct sk_buff *skb) + + hdr->hop_limit--; + ++no_ttl_decr: + IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); + return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev, + ip6_forward_finish); +diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c +index fd632dd..a78e9cb 100644 +--- a/net/ipv6/mcast.c ++++ b/net/ipv6/mcast.c +@@ -246,6 +246,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) + + return 0; + } ++EXPORT_SYMBOL_GPL(ipv6_sock_mc_join); + + /* + * socket leave on multicast group +@@ -2202,15 +2203,18 @@ static void igmp6_leave_group(struct ifmcaddr6 *ma) + static void mld_gq_timer_expire(unsigned long data) + { + struct inet6_dev *idev = (struct inet6_dev *)data; ++ struct ve_struct *old_env = set_exec_env(idev->dev->owner_env); + + idev->mc_gq_running = 0; + mld_send_report(idev, NULL); + __in6_dev_put(idev); ++ set_exec_env(old_env); + } + + static void mld_ifc_timer_expire(unsigned long data) + { + struct inet6_dev *idev = (struct inet6_dev *)data; ++ struct ve_struct *old_env = set_exec_env(idev->dev->owner_env); + + mld_send_cr(idev); + if (idev->mc_ifc_count) { +@@ -2219,6 +2223,7 @@ static void mld_ifc_timer_expire(unsigned long data) + mld_ifc_start_timer(idev, idev->mc_maxdelay); + } + __in6_dev_put(idev); ++ set_exec_env(old_env); + } + + static void mld_ifc_event(struct inet6_dev *idev) +@@ -2233,6 +2238,7 @@ static void mld_ifc_event(struct inet6_dev *idev) + static void igmp6_timer_handler(unsigned long data) + { + struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data; ++ struct ve_struct *old_env = set_exec_env(ma->idev->dev->owner_env); + + if (MLD_V1_SEEN(ma->idev)) + igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); +@@ -2244,6 +2250,7 @@ static void igmp6_timer_handler(unsigned long data) + ma->mca_flags &= ~MAF_TIMER_RUNNING; + spin_unlock(&ma->mca_lock); + ma_put(ma); ++ set_exec_env(old_env); + } + + /* Device going down */ +diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c +index 2eff3ae..8753e85 100644 +--- a/net/ipv6/netfilter/ip6_queue.c ++++ b/net/ipv6/netfilter/ip6_queue.c +@@ -442,7 +442,7 @@ __ipq_rcv_skb(struct sk_buff *skb) + if (type <= IPQM_BASE) + return; + +- if (security_netlink_recv(skb, CAP_NET_ADMIN)) ++ if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) + RCV_SKB_FAIL(-EPERM); + + write_lock_bh(&queue_lock); +@@ -472,8 +472,12 @@ __ipq_rcv_skb(struct sk_buff *skb) + static void + ipq_rcv_skb(struct sk_buff *skb) + { ++ struct ve_struct *old_ve; ++ + mutex_lock(&ipqnl_mutex); ++ old_ve = set_exec_env(skb->owner_env); + __ipq_rcv_skb(skb); ++ (void)set_exec_env(old_ve); + mutex_unlock(&ipqnl_mutex); + } + +@@ -483,9 +487,6 @@ ipq_rcv_dev_event(struct notifier_block *this, + { + struct net_device *dev = ptr; + +- if (dev_net(dev) != &init_net) +- return NOTIFY_DONE; +- + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + ipq_dev_drop(dev->ifindex); +@@ -505,7 +506,7 @@ ipq_rcv_nl_event(struct notifier_block *this, + if (event == NETLINK_URELEASE && + n->protocol == NETLINK_IP6_FW && n->pid) { + write_lock_bh(&queue_lock); +- if ((n->net == &init_net) && (n->pid == peer_pid)) ++ if (n->pid == peer_pid) + __ipq_reset(); + 
write_unlock_bh(&queue_lock); + } +diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c +index 0b4557e..0b6f441 100644 +--- a/net/ipv6/netfilter/ip6_tables.c ++++ b/net/ipv6/netfilter/ip6_tables.c +@@ -1874,7 +1874,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, + { + int ret; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + + switch (cmd) { +@@ -1985,7 +1985,7 @@ compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) + { + int ret; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + + switch (cmd) { +@@ -2084,7 +2084,7 @@ struct xt_table *ip6t_register_table(struct net *net, struct xt_table *table, + int ret; + struct xt_table_info *newinfo; + struct xt_table_info bootstrap +- = { 0, 0, 0, { 0 }, { 0 }, { } }; ++ = { 0, 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; + struct xt_table *new_table; + +@@ -2241,11 +2241,22 @@ static struct xt_match icmp6_matchstruct __read_mostly = { + + static int __net_init ip6_tables_net_init(struct net *net) + { +- return xt_proto_init(net, AF_INET6); ++ int res; ++ ++ if (!net_ipt_module_permitted(net, VE_IP_IPTABLES6)) ++ return 0; ++ ++ res = xt_proto_init(net, AF_INET6); ++ if (!res) ++ net_ipt_module_set(net, VE_IP_IPTABLES6); ++ return res; + } + + static void __net_exit ip6_tables_net_exit(struct net *net) + { ++ if (!net_is_ipt_module_set(net, VE_IP_IPTABLES6)) ++ return; ++ + xt_proto_fini(net, AF_INET6); + } + +diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c +index f979e48..d03046a 100644 +--- a/net/ipv6/netfilter/ip6table_filter.c ++++ b/net/ipv6/netfilter/ip6table_filter.c +@@ -120,16 +120,24 @@ module_param(forward, bool, 0000); + + static int __net_init ip6table_filter_net_init(struct net *net) + { ++ if (!net_ipt_module_permitted(net, VE_IP_FILTER6)) ++ return 0; ++ + /* Register table */ + net->ipv6.ip6table_filter = + ip6t_register_table(net, &packet_filter, &initial_table.repl); + if (IS_ERR(net->ipv6.ip6table_filter)) + return PTR_ERR(net->ipv6.ip6table_filter); ++ ++ net_ipt_module_set(net, VE_IP_FILTER6); + return 0; + } + + static void __net_exit ip6table_filter_net_exit(struct net *net) + { ++ if (!net_is_ipt_module_set(net, VE_IP_FILTER6)) ++ return; ++ + ip6t_unregister_table(net->ipv6.ip6table_filter); + } + +diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c +index f405cea..a4727b3 100644 +--- a/net/ipv6/netfilter/ip6table_mangle.c ++++ b/net/ipv6/netfilter/ip6table_mangle.c +@@ -160,16 +160,24 @@ static struct nf_hook_ops ip6t_ops[] __read_mostly = { + + static int __net_init ip6table_mangle_net_init(struct net *net) + { ++ if (!net_ipt_module_permitted(net, VE_IP_MANGLE6)) ++ return 0; ++ + /* Register table */ + net->ipv6.ip6table_mangle = + ip6t_register_table(net, &packet_mangler, &initial_table.repl); + if (IS_ERR(net->ipv6.ip6table_mangle)) + return PTR_ERR(net->ipv6.ip6table_mangle); ++ ++ net_ipt_module_set(net, VE_IP_MANGLE6); + return 0; + } + + static void __net_exit ip6table_mangle_net_exit(struct net *net) + { ++ if (!net_is_ipt_module_set(net, VE_IP_MANGLE6)) ++ return; ++ + ip6t_unregister_table(net->ipv6.ip6table_mangle); + } + +diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +index 85050c0..e6f8f7d 100644 +--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c ++++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +@@ -14,6 
+14,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -359,39 +360,52 @@ MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6)); + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI "); + +-static int __init nf_conntrack_l3proto_ipv6_init(void) ++int init_nf_ct_l3proto_ipv6(void) + { +- int ret = 0; +- +- need_conntrack(); +- ++ int ret = -ENOMEM; ++ ++#ifdef CONFIG_VE_IPTABLES ++ if (!ve_is_super(get_exec_env())) ++ __module_get(THIS_MODULE); ++ ++ ret = nf_ct_proto_tcp_sysctl_init(); ++ if (ret < 0) ++ goto no_mem_tcp; ++ ret = nf_ct_proto_udp_sysctl_init(); ++ if (ret < 0) ++ goto no_mem_udp; ++ ret = nf_ct_proto_icmpv6_sysctl_init(); ++ if (ret < 0) ++ goto no_mem_icmp; ++#endif /* CONFIG_VE_IPTABLES */ + ret = nf_ct_frag6_init(); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't initialize frag6.\n"); +- return ret; ++ goto cleanup_sys; + } +- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6); ++ ++ ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_tcp6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register tcp.\n"); + goto cleanup_frag6; + } + +- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6); ++ ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_udp6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register udp.\n"); +- goto cleanup_tcp; ++ goto unreg_tcp; + } + +- ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_icmpv6); ++ ret = nf_conntrack_l4proto_register(ve_nf_conntrack_l4proto_icmpv6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register icmpv6.\n"); +- goto cleanup_udp; ++ goto unreg_udp; + } + +- ret = nf_conntrack_l3proto_register(&nf_conntrack_l3proto_ipv6); ++ ret = nf_conntrack_l3proto_register(ve_nf_conntrack_l3proto_ipv6); + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register ipv6\n"); +- goto cleanup_icmpv6; ++ goto unreg_icmpv6; + } + + ret = nf_register_hooks(ipv6_conntrack_ops, +@@ -399,32 +413,77 @@ static int __init nf_conntrack_l3proto_ipv6_init(void) + if (ret < 0) { + printk("nf_conntrack_ipv6: can't register pre-routing defrag " + "hook.\n"); +- goto cleanup_ipv6; ++ goto unreg_ipv6; + } +- return ret; ++ return 0; + +- cleanup_ipv6: +- nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); +- cleanup_icmpv6: +- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); +- cleanup_udp: +- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); +- cleanup_tcp: +- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); +- cleanup_frag6: ++unreg_ipv6: ++ nf_conntrack_l3proto_unregister(ve_nf_conntrack_l3proto_ipv6); ++unreg_icmpv6: ++ nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmpv6); ++unreg_udp: ++ nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp6); ++unreg_tcp: ++ nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp6); ++cleanup_frag6: + nf_ct_frag6_cleanup(); ++cleanup_sys: ++#ifdef CONFIG_VE_IPTABLES ++no_mem_icmp: ++ nf_ct_proto_udp_sysctl_cleanup(); ++no_mem_udp: ++ nf_ct_proto_tcp_sysctl_cleanup(); ++no_mem_tcp: ++ if (!ve_is_super(get_exec_env())) ++ module_put(THIS_MODULE); ++#endif /* CONFIG_VE_IPTABLES */ + return ret; + } ++EXPORT_SYMBOL(init_nf_ct_l3proto_ipv6); + +-static void __exit nf_conntrack_l3proto_ipv6_fini(void) ++void fini_nf_ct_l3proto_ipv6(void) + { +- synchronize_net(); + nf_unregister_hooks(ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); +- nf_conntrack_l3proto_unregister(&nf_conntrack_l3proto_ipv6); +- 
nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); +- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); +- nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); ++ nf_conntrack_l3proto_unregister(ve_nf_conntrack_l3proto_ipv6); ++ nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_icmpv6); ++ nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_udp6); ++ nf_conntrack_l4proto_unregister(ve_nf_conntrack_l4proto_tcp6); + nf_ct_frag6_cleanup(); ++ ++#ifdef CONFIG_VE_IPTABLES ++ nf_ct_proto_icmpv6_sysctl_cleanup(); ++ nf_ct_proto_udp_sysctl_cleanup(); ++ nf_ct_proto_tcp_sysctl_cleanup(); ++ if (!ve_is_super(get_exec_env())) ++ module_put(THIS_MODULE); ++#endif /* CONFIG_VE_IPTABLES */ ++} ++EXPORT_SYMBOL(fini_nf_ct_l3proto_ipv6); ++ ++static int __init nf_conntrack_l3proto_ipv6_init(void) ++{ ++ int ret = 0; ++ ++ need_conntrack(); ++ ++ ret = init_nf_ct_l3proto_ipv6(); ++ if (ret < 0) { ++ printk(KERN_ERR "Unable to initialize netfilter protocols\n"); ++ return ret; ++ } ++ KSYMRESOLVE(init_nf_ct_l3proto_ipv6); ++ KSYMRESOLVE(fini_nf_ct_l3proto_ipv6); ++ KSYMMODRESOLVE(nf_conntrack_ipv6); ++ return 0; ++} ++ ++static void __exit nf_conntrack_l3proto_ipv6_fini(void) ++{ ++ synchronize_net(); ++ KSYMMODUNRESOLVE(nf_conntrack_ipv6); ++ KSYMUNRESOLVE(init_nf_ct_l3proto_ipv6); ++ KSYMUNRESOLVE(fini_nf_ct_l3proto_ipv6); ++ fini_nf_ct_l3proto_ipv6(); + } + + module_init(nf_conntrack_l3proto_ipv6_init); +diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +index ee713b0..cae064f 100644 +--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c ++++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +@@ -10,6 +10,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -95,7 +96,7 @@ static int icmpv6_packet(struct nf_conn *ct, + } else { + atomic_inc(&ct->proto.icmp.count); + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); +- nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_icmpv6_timeout); ++ nf_ct_refresh_acct(ct, ctinfo, skb, ve_nf_ct_icmpv6_timeout); + } + + return NF_ACCEPT; +@@ -150,7 +151,7 @@ icmpv6_error_message(struct sk_buff *skb, + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. 
*/ + if (!nf_ct_invert_tuple(&intuple, &origtuple, +- &nf_conntrack_l3proto_ipv6, inproto)) { ++ ve_nf_conntrack_l3proto_ipv6, inproto)) { + pr_debug("icmpv6_error: Can't invert tuple\n"); + return -NF_ACCEPT; + } +@@ -282,3 +283,48 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 __read_mostly = + .ctl_table = icmpv6_sysctl_table, + #endif + }; ++ ++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) ++int nf_ct_proto_icmpv6_sysctl_init(void) ++{ ++ struct nf_conntrack_l4proto *icmp6; ++ ++ if (ve_is_super(get_exec_env())) { ++ icmp6 = &nf_conntrack_l4proto_icmpv6; ++ goto out; ++ } ++ ++ icmp6 = kmemdup(&nf_conntrack_l4proto_icmpv6, ++ sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); ++ if (!icmp6) ++ goto no_mem_ct; ++ ++ icmp6->ctl_table_header = &ve_icmpv6_sysctl_header; ++ icmp6->ctl_table = kmemdup(icmpv6_sysctl_table, ++ sizeof(icmpv6_sysctl_table), GFP_KERNEL); ++ if (!icmp6->ctl_table) ++ goto no_mem_sys; ++ ++ icmp6->ctl_table[0].data = &ve_nf_ct_icmpv6_timeout; ++out: ++ ve_nf_ct_icmpv6_timeout = nf_ct_icmpv6_timeout; ++ ++ ve_nf_conntrack_l4proto_icmpv6 = icmp6; ++ return 0; ++ ++no_mem_sys: ++ kfree(icmp6); ++no_mem_ct: ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(nf_ct_proto_icmpv6_sysctl_init); ++ ++void nf_ct_proto_icmpv6_sysctl_cleanup(void) ++{ ++ if (!ve_is_super(get_exec_env())) { ++ kfree(ve_nf_conntrack_l4proto_icmpv6->ctl_table); ++ kfree(ve_nf_conntrack_l4proto_icmpv6); ++ } ++} ++EXPORT_SYMBOL(nf_ct_proto_icmpv6_sysctl_cleanup); ++#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ +diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c +index cf20bc4..9faaa59 100644 +--- a/net/ipv6/netfilter/nf_conntrack_reasm.c ++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c +@@ -145,11 +145,12 @@ static void nf_skb_free(struct sk_buff *skb) + } + + /* Memory Tracking Functions. */ +-static inline void frag_kfree_skb(struct sk_buff *skb, unsigned int *work) ++static inline void frag_kfree_skb(struct netns_frags *nf, ++ struct sk_buff *skb, unsigned int *work) + { + if (work) + *work -= skb->truesize; +- atomic_sub(skb->truesize, &nf_init_frags.mem); ++ atomic_sub(skb->truesize, &nf->mem); + nf_skb_free(skb); + kfree_skb(skb); + } +@@ -169,10 +170,10 @@ static __inline__ void fq_kill(struct nf_ct_frag6_queue *fq) + inet_frag_kill(&fq->q, &nf_frags); + } + +-static void nf_ct_frag6_evictor(void) ++static void nf_ct_frag6_evictor(struct netns_frags *nf) + { + local_bh_disable(); +- inet_frag_evictor(&nf_init_frags, &nf_frags); ++ inet_frag_evictor(nf, &nf_frags); + local_bh_enable(); + } + +@@ -198,7 +199,7 @@ out: + /* Creation primitives. 
*/ + + static __inline__ struct nf_ct_frag6_queue * +-fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst) ++fq_find(struct net *net, __be32 id, struct in6_addr *src, struct in6_addr *dst) + { + struct inet_frag_queue *q; + struct ip6_create_arg arg; +@@ -211,7 +212,7 @@ fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst) + read_lock_bh(&nf_frags.lock); + hash = ip6qhashfn(id, src, dst); + +- q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash); ++ q = inet_frag_find(&net->ipv6.ct_frags, &nf_frags, &arg, hash); + local_bh_enable(); + if (q == NULL) + goto oom; +@@ -224,7 +225,8 @@ oom: + } + + +-static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, ++static int nf_ct_frag6_queue(struct net *net, struct nf_ct_frag6_queue *fq, ++ struct sk_buff *skb, + const struct frag_hdr *fhdr, int nhoff) + { + struct sk_buff *prev, *next; +@@ -365,7 +367,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, + fq->q.fragments = next; + + fq->q.meat -= free_it->len; +- frag_kfree_skb(free_it, NULL); ++ frag_kfree_skb(fq->q.net, free_it, NULL); + } + } + +@@ -381,7 +383,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, + skb->dev = NULL; + fq->q.stamp = skb->tstamp; + fq->q.meat += skb->len; +- atomic_add(skb->truesize, &nf_init_frags.mem); ++ atomic_add(skb->truesize, &net->ipv6.ct_frags.mem); + + /* The first fragment. + * nhoffset is obtained from the first fragment, of course. +@@ -391,7 +393,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, + fq->q.last_in |= INET_FRAG_FIRST_IN; + } + write_lock(&nf_frags.lock); +- list_move_tail(&fq->q.lru_list, &nf_init_frags.lru_list); ++ list_move_tail(&fq->q.lru_list, &net->ipv6.ct_frags.lru_list); + write_unlock(&nf_frags.lock); + return 0; + +@@ -409,7 +411,8 @@ err: + * the last and the first frames arrived and all the bits are here. + */ + static struct sk_buff * +-nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) ++nf_ct_frag6_reasm(struct net *net, struct nf_ct_frag6_queue *fq, ++ struct net_device *dev) + { + struct sk_buff *fp, *op, *head = fq->q.fragments; + int payload_len; +@@ -458,7 +461,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) + clone->ip_summed = head->ip_summed; + + NFCT_FRAG6_CB(clone)->orig = NULL; +- atomic_add(clone->truesize, &nf_init_frags.mem); ++ atomic_add(clone->truesize, &net->ipv6.ct_frags.mem); + } + + /* We have to remove fragment header from datagram and to relocate +@@ -472,7 +475,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) + skb_shinfo(head)->frag_list = head->next; + skb_reset_transport_header(head); + skb_push(head, head->data - skb_network_header(head)); +- atomic_sub(head->truesize, &nf_init_frags.mem); ++ atomic_sub(head->truesize, &net->ipv6.ct_frags.mem); + + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; +@@ -482,7 +485,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) + else if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; +- atomic_sub(fp->truesize, &nf_init_frags.mem); ++ atomic_sub(fp->truesize, &net->ipv6.ct_frags.mem); + } + + head->next = NULL; +@@ -599,6 +602,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) + int fhoff, nhoff; + u8 prevhdr; + struct sk_buff *ret_skb = NULL; ++ struct net *net = dev_net(dev); + + /* Jumbo payload inhibits frag. 
header */ + if (ipv6_hdr(skb)->payload_len == 0) { +@@ -632,10 +636,11 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) + goto ret_orig; + } + +- if (atomic_read(&nf_init_frags.mem) > nf_init_frags.high_thresh) +- nf_ct_frag6_evictor(); ++ if (atomic_read(&net->ipv6.ct_frags.mem) > ++ net->ipv6.ct_frags.high_thresh) ++ nf_ct_frag6_evictor(&net->ipv6.ct_frags); + +- fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr); ++ fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr); + if (fq == NULL) { + pr_debug("Can't find and can't create new queue\n"); + goto ret_orig; +@@ -643,7 +648,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) + + spin_lock_bh(&fq->q.lock); + +- if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) { ++ if (nf_ct_frag6_queue(net, fq, clone, fhdr, nhoff) < 0) { + spin_unlock_bh(&fq->q.lock); + pr_debug("Can't insert skb to queue\n"); + fq_put(fq); +@@ -652,7 +657,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) + + if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + fq->q.meat == fq->q.len) { +- ret_skb = nf_ct_frag6_reasm(fq, dev); ++ ret_skb = nf_ct_frag6_reasm(net, fq, dev); + if (ret_skb == NULL) + pr_debug("Can't reassemble fragmented packets\n"); + } +@@ -687,8 +692,32 @@ void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb, + nf_conntrack_put_reasm(skb); + } + ++static int nf_ct_frag6_init_net(struct net *net) ++{ ++ struct netns_frags *frags = &net->ipv6.ct_frags; ++ ++ frags->timeout = IPV6_FRAG_TIMEOUT; ++ frags->high_thresh = 256 * 1024; ++ frags->low_thresh = 192 * 1024; ++ inet_frags_init_net(frags); ++ ++ return 0; /* FIXME : sysctls */ ++} ++ ++static void nf_ct_frag6_exit_net(struct net *net) ++{ ++ inet_frags_exit_net(&net->ipv6.ct_frags, &nf_frags); ++} ++ ++static struct pernet_operations nf_ct_frag6_ops = { ++ .init = nf_ct_frag6_init_net, ++ .exit = nf_ct_frag6_exit_net, ++}; ++ + int nf_ct_frag6_init(void) + { ++ register_pernet_subsys(&nf_ct_frag6_ops); ++ + nf_frags.hashfn = nf_hashfn; + nf_frags.constructor = ip6_frag_init; + nf_frags.destructor = NULL; +@@ -697,10 +726,6 @@ int nf_ct_frag6_init(void) + nf_frags.match = ip6_frag_match; + nf_frags.frag_expire = nf_ct_frag6_expire; + nf_frags.secret_interval = 10 * 60 * HZ; +- nf_init_frags.timeout = IPV6_FRAG_TIMEOUT; +- nf_init_frags.high_thresh = 256 * 1024; +- nf_init_frags.low_thresh = 192 * 1024; +- inet_frags_init_net(&nf_init_frags); + inet_frags_init(&nf_frags); + + return 0; +@@ -709,7 +734,5 @@ int nf_ct_frag6_init(void) + void nf_ct_frag6_cleanup(void) + { + inet_frags_fini(&nf_frags); +- +- nf_init_frags.low_thresh = 0; +- nf_ct_frag6_evictor(); ++ unregister_pernet_subsys(&nf_ct_frag6_ops); + } +diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c +index df0736a..ea78916 100644 +--- a/net/ipv6/proc.c ++++ b/net/ipv6/proc.c +@@ -31,8 +31,6 @@ + #include + #include + +-static struct proc_dir_entry *proc_net_devsnmp6; +- + static int sockstat6_seq_show(struct seq_file *seq, void *v) + { + struct net *net = seq->private; +@@ -174,11 +172,11 @@ static int snmp6_seq_show(struct seq_file *seq, void *v) + snmp6_seq_show_item(seq, (void **)idev->stats.icmpv6, snmp6_icmp6_list); + snmp6_seq_show_icmpv6msg(seq, (void **)idev->stats.icmpv6msg); + } else { +- snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list); +- snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list); +- snmp6_seq_show_icmpv6msg(seq, (void **)icmpv6msg_statistics); +- snmp6_seq_show_item(seq, (void **)udp_stats_in6, 
snmp6_udp6_list); +- snmp6_seq_show_item(seq, (void **)udplite_stats_in6, snmp6_udplite6_list); ++ snmp6_seq_show_item(seq, (void **)ve_ipv6_statistics, snmp6_ipstats_list); ++ snmp6_seq_show_item(seq, (void **)ve_icmpv6_statistics, snmp6_icmp6_list); ++ snmp6_seq_show_icmpv6msg(seq, (void **)ve_icmpv6msg_statistics); ++ snmp6_seq_show_item(seq, (void **)ve_udp_stats_in6, snmp6_udp6_list); ++ snmp6_seq_show_item(seq, (void **)ve_udplite_stats_in6, snmp6_udplite6_list); + } + return 0; + } +@@ -237,18 +235,17 @@ static const struct file_operations snmp6_seq_fops = { + int snmp6_register_dev(struct inet6_dev *idev) + { + struct proc_dir_entry *p; ++ struct net *net; + + if (!idev || !idev->dev) + return -EINVAL; + +- if (dev_net(idev->dev) != &init_net) +- return 0; +- +- if (!proc_net_devsnmp6) ++ net = dev_net(idev->dev); ++ if (!net->ipv6.proc_dev_snmp) + return -ENOENT; + + p = proc_create_data(idev->dev->name, S_IRUGO, +- proc_net_devsnmp6, &snmp6_seq_fops, idev); ++ net->ipv6.proc_dev_snmp, &snmp6_seq_fops, idev); + if (!p) + return -ENOMEM; + +@@ -258,12 +255,14 @@ int snmp6_register_dev(struct inet6_dev *idev) + + int snmp6_unregister_dev(struct inet6_dev *idev) + { +- if (!proc_net_devsnmp6) ++ struct net *net = dev_net(idev->dev); ++ ++ if (!net->ipv6.proc_dev_snmp) + return -ENOENT; + if (!idev || !idev->stats.proc_dir_entry) + return -EINVAL; + remove_proc_entry(idev->stats.proc_dir_entry->name, +- proc_net_devsnmp6); ++ net->ipv6.proc_dev_snmp); + idev->stats.proc_dir_entry = NULL; + return 0; + } +@@ -272,12 +271,24 @@ static int ipv6_proc_init_net(struct net *net) + { + if (!proc_net_fops_create(net, "sockstat6", S_IRUGO, + &sockstat6_seq_fops)) +- return -ENOMEM; ++ goto err_sockstat; ++ ++ net->ipv6.proc_dev_snmp = proc_net_mkdir(net, ++ "dev_snmp6", net->proc_net); ++ if (!net->ipv6.proc_dev_snmp) ++ goto err_dev_snmp; ++ + return 0; ++ ++err_dev_snmp: ++ proc_net_remove(net, "sockstat6"); ++err_sockstat: ++ return -ENOMEM; + } + + static void ipv6_proc_exit_net(struct net *net) + { ++ proc_net_remove(net, "dev_snmp6"); + proc_net_remove(net, "sockstat6"); + } + +@@ -296,14 +307,9 @@ int __init ipv6_misc_proc_init(void) + if (!proc_net_fops_create(&init_net, "snmp6", S_IRUGO, &snmp6_seq_fops)) + goto proc_snmp6_fail; + +- proc_net_devsnmp6 = proc_mkdir("dev_snmp6", init_net.proc_net); +- if (!proc_net_devsnmp6) +- goto proc_dev_snmp6_fail; + out: + return rc; + +-proc_dev_snmp6_fail: +- proc_net_remove(&init_net, "snmp6"); + proc_snmp6_fail: + unregister_pernet_subsys(&ipv6_proc_ops); + proc_net_fail: +@@ -314,7 +320,6 @@ proc_net_fail: + void ipv6_misc_proc_exit(void) + { + proc_net_remove(&init_net, "sockstat6"); +- proc_net_remove(&init_net, "dev_snmp6"); + proc_net_remove(&init_net, "snmp6"); + unregister_pernet_subsys(&ipv6_proc_ops); + } +diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c +index a60d7d1..408859e 100644 +--- a/net/ipv6/reassembly.c ++++ b/net/ipv6/reassembly.c +@@ -198,8 +198,10 @@ static void ip6_frag_expire(unsigned long data) + struct frag_queue *fq; + struct net_device *dev = NULL; + struct net *net; ++ struct ve_struct *old_ve; + + fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); ++ old_ve = set_exec_env(fq->q.owner_ve); + + spin_lock(&fq->q.lock); + +@@ -234,6 +236,8 @@ out: + dev_put(dev); + spin_unlock(&fq->q.lock); + fq_put(fq); ++ ++ (void)set_exec_env(old_ve); + } + + static __inline__ struct frag_queue * +@@ -510,6 +514,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, + clone->csum 
= 0; + clone->ip_summed = head->ip_summed; + atomic_add(clone->truesize, &fq->q.net->mem); ++ clone->owner_env = head->owner_env; + } + + /* We have to remove fragment header from datagram and to relocate +diff --git a/net/ipv6/route.c b/net/ipv6/route.c +index 7ff6870..4d83e48 100644 +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -1881,10 +1881,12 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, + rt->rt6i_flags |= RTF_ANYCAST; + else + rt->rt6i_flags |= RTF_LOCAL; +- rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); +- if (rt->rt6i_nexthop == NULL) { +- dst_free(&rt->u.dst); +- return ERR_PTR(-ENOMEM); ++ rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, rt->rt6i_dev); ++ if (IS_ERR(rt->rt6i_nexthop)) { ++ void *err = rt->rt6i_nexthop; ++ rt->rt6i_nexthop = NULL; ++ dst_free((struct dst_entry *) rt); ++ return err; + } + + ipv6_addr_copy(&rt->rt6i_dst.addr, addr); +diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c +index 40ea9c3..cdc8697 100644 +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -62,6 +62,8 @@ + #include + #include + ++#include ++ + #include + + #include +@@ -77,7 +79,7 @@ static void tcp_v6_send_check(struct sock *sk, int len, + + static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); + +-static struct inet_connection_sock_af_ops ipv6_mapped; ++struct inet_connection_sock_af_ops ipv6_mapped; + static struct inet_connection_sock_af_ops ipv6_specific; + #ifdef CONFIG_TCP_MD5SIG + static struct tcp_sock_af_ops tcp_sock_ipv6_specific; +@@ -1580,6 +1582,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp; + struct sk_buff *opt_skb = NULL; ++ struct user_beancounter *ub; + + /* Imagine: socket is IPv6. IPv4 packet arrives, + goes to IPv4 receive handler and backlogged. 
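[Annotation, not part of the patch] The tcp_v6_do_rcv() hunks that follow add a matching pair of set_exec_ub() calls so that memory charged while processing a backlogged segment is billed to the socket owner's beancounter rather than to whichever task happens to be running. Shape only, hedged: example_rcv() and process() are hypothetical, while set_exec_ub(), sock_bc() and struct user_beancounter come from the patch:

static int example_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct user_beancounter *old_ub;
	int ret;

	/* Charge allocations made below to the socket's owner. */
	old_ub = set_exec_ub(sock_bc(sk)->ub);

	ret = process(sk, skb);

	/* One restore point; the real hunks reach it by funnelling
	 * every exit through a restore_context: label. */
	(void)set_exec_ub(old_ub);
	return ret;
}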
+@@ -1592,6 +1595,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) + if (skb->protocol == htons(ETH_P_IP)) + return tcp_v4_do_rcv(sk, skb); + ++ ub = set_exec_ub(sock_bc(sk)->ub); ++ + #ifdef CONFIG_TCP_MD5SIG + if (tcp_v6_inbound_md5_hash (sk, skb)) + goto discard; +@@ -1628,7 +1633,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) + TCP_CHECK_TIMER(sk); + if (opt_skb) + goto ipv6_pktoptions; +- return 0; ++ goto restore_context; + } + + if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb)) +@@ -1649,7 +1654,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) + goto reset; + if (opt_skb) + __kfree_skb(opt_skb); +- return 0; ++ goto restore_context; + } + } + +@@ -1659,6 +1664,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) + TCP_CHECK_TIMER(sk); + if (opt_skb) + goto ipv6_pktoptions; ++ ++restore_context: ++ (void)set_exec_ub(ub); + return 0; + + reset: +@@ -1667,7 +1675,7 @@ discard: + if (opt_skb) + __kfree_skb(opt_skb); + kfree_skb(skb); +- return 0; ++ goto restore_context; + csum_err: + TCP_INC_STATS_BH(TCP_MIB_INERRS); + goto discard; +@@ -1699,7 +1707,7 @@ ipv6_pktoptions: + + if (opt_skb) + kfree_skb(opt_skb); +- return 0; ++ goto restore_context; + } + + static int tcp_v6_rcv(struct sk_buff *skb) +@@ -1881,7 +1889,7 @@ static struct tcp_sock_af_ops tcp_sock_ipv6_specific = { + * TCP over IPv4 via INET6 API + */ + +-static struct inet_connection_sock_af_ops ipv6_mapped = { ++struct inet_connection_sock_af_ops ipv6_mapped = { + .queue_xmit = ip_queue_xmit, + .send_check = tcp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, +@@ -1900,6 +1908,8 @@ static struct inet_connection_sock_af_ops ipv6_mapped = { + #endif + }; + ++EXPORT_SYMBOL_GPL(ipv6_mapped); ++ + #ifdef CONFIG_TCP_MD5SIG + static struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { + .md5_lookup = tcp_v4_md5_lookup, +diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c +index dd30962..d7b151d 100644 +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -65,9 +65,11 @@ static struct sock *__udp6_lib_lookup(struct net *net, + struct hlist_node *node; + unsigned short hnum = ntohs(dport); + int badness = -1; ++ struct ve_struct *ve; + ++ ve = get_exec_env(); + read_lock(&udp_hash_lock); +- sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { ++ sk_for_each(sk, node, &udptable[udp_hashfn(hnum, VEID(ve))]) { + struct inet_sock *inet = inet_sk(sk); + + if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && +@@ -363,7 +365,7 @@ static int __udp6_lib_mcast_deliver(struct sk_buff *skb, struct in6_addr *saddr, + int dif; + + read_lock(&udp_hash_lock); +- sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); ++ sk = sk_head(&udptable[udp_hashfn(ntohs(uh->dest), VEID(skb->owner_env))]); + dif = inet6_iif(skb); + sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); + if (!sk) { +diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c +index 8f1e054..e32613a 100644 +--- a/net/ipv6/xfrm6_policy.c ++++ b/net/ipv6/xfrm6_policy.c +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -38,7 +39,7 @@ static struct dst_entry *xfrm6_dst_lookup(int tos, xfrm_address_t *saddr, + if (saddr) + memcpy(&fl.fl6_src, saddr, sizeof(fl.fl6_src)); + +- dst = ip6_route_output(&init_net, NULL, &fl); ++ dst = ip6_route_output(get_exec_env()->ve_netns, NULL, &fl); + + err = dst->error; + if (dst->error) { +diff --git a/net/netfilter/core.c b/net/netfilter/core.c +index 292fa28..6bf46b5 
100644
+--- a/net/netfilter/core.c
++++ b/net/netfilter/core.c
+@@ -60,6 +60,8 @@ int nf_register_hook(struct nf_hook_ops *reg)
+ 	struct nf_hook_ops *elem;
+ 	int err;
+ 
++	BUG_ON(!ve_is_super(get_exec_env()));
++
+ 	err = mutex_lock_interruptible(&nf_hook_mutex);
+ 	if (err < 0)
+ 		return err;
+@@ -75,6 +77,8 @@ EXPORT_SYMBOL(nf_register_hook);
+ 
+ void nf_unregister_hook(struct nf_hook_ops *reg)
+ {
++	BUG_ON(!ve_is_super(get_exec_env()));
++
+ 	mutex_lock(&nf_hook_mutex);
+ 	list_del_rcu(&reg->list);
+ 	mutex_unlock(&nf_hook_mutex);
+@@ -169,8 +173,6 @@ int nf_hook_slow(int pf, unsigned int hook, struct sk_buff *skb,
+ 	struct net *net;
+ 
+ 	net = indev == NULL ? dev_net(outdev) : dev_net(indev);
+-	if (net != &init_net)
+-		return 1;
+ #endif
+ 
+ 	/* We may already have this, but read-locks nest anyway */
+diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
+index 662c1cc..e811c0b 100644
+--- a/net/netfilter/nf_conntrack_core.c
++++ b/net/netfilter/nf_conntrack_core.c
+@@ -30,6 +30,8 @@
+ #include 
+ #include 
+ 
++#include 
++
+ #include 
+ #include 
+ #include 
+@@ -53,8 +55,10 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
+ int nf_conntrack_max __read_mostly;
+ EXPORT_SYMBOL_GPL(nf_conntrack_max);
+ 
++#ifndef CONFIG_VE_IPTABLES
+ struct hlist_head *nf_conntrack_hash __read_mostly;
+ EXPORT_SYMBOL_GPL(nf_conntrack_hash);
++#endif
+ 
+ struct nf_conn nf_conntrack_untracked __read_mostly;
+ EXPORT_SYMBOL_GPL(nf_conntrack_untracked);
+@@ -179,7 +183,14 @@ static void
+ destroy_conntrack(struct nf_conntrack *nfct)
+ {
+ 	struct nf_conn *ct = (struct nf_conn *)nfct;
++	struct nf_conn_help *help = nfct_help(ct);
++	struct nf_conntrack_l3proto *l3proto;
+ 	struct nf_conntrack_l4proto *l4proto;
++#ifdef CONFIG_VE_IPTABLES
++	struct ve_struct *old_ve;
++
++	old_ve = set_exec_env(ct->ct_owner_env);
++#endif
+ 
+ 	pr_debug("destroy_conntrack(%p)\n", ct);
+ 	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
+@@ -188,10 +199,17 @@ destroy_conntrack(struct nf_conntrack *nfct)
+ 	nf_conntrack_event(IPCT_DESTROY, ct);
+ 	set_bit(IPS_DYING_BIT, &ct->status);
+ 
++	if (help && help->helper && help->helper->destroy)
++		help->helper->destroy(ct);
++
+ 	/* To make sure we don't get any weird locking issues here:
+ 	 * destroy_conntrack() MUST NOT be called with a write lock
+ 	 * to nf_conntrack_lock!!! -HW */
+ 	rcu_read_lock();
++	l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
++	if (l3proto && l3proto->destroy)
++		l3proto->destroy(ct);
++
+ 	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+ 	if (l4proto && l4proto->destroy)
+ 		l4proto->destroy(ct);
+@@ -219,6 +237,9 @@ destroy_conntrack(struct nf_conntrack *nfct)
+ 
+ 	pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
+ 	nf_conntrack_free(ct);
++#ifdef CONFIG_VE_IPTABLES
++	(void)set_exec_env(old_ve);
++#endif
+ }
+ 
+ static void death_by_timeout(unsigned long ul_conntrack)
+@@ -255,7 +276,7 @@ __nf_conntrack_find(const struct nf_conntrack_tuple *tuple)
+ 	 * at least once for the stats anyway.
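[Annotation, not part of the patch] A note on the ve_nf_conntrack_hash accessor used throughout the conntrack hunks below: judging by the analogous #define blocks this patch adds for ve_nf_ct_expect_count and ve_nf_ct_helper_hash further down, it compiles to a per-VE field when CONFIG_VE_IPTABLES is set and aliases the plain global otherwise, so the lookup code itself is written once. A sketch under that assumption; ve->_nf_conntrack is real (see nf_conntrack_init() below) but the _nf_conntrack_hash member name is illustrative:

#ifdef CONFIG_VE_IPTABLES
#define ve_nf_conntrack_hash \
	(get_exec_env()->_nf_conntrack->_nf_conntrack_hash)
#else
#define ve_nf_conntrack_hash	nf_conntrack_hash
#endif

This indirection is what keeps hunks like the __nf_conntrack_find() change below down to a one-line substitution per call site.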
+ */ + local_bh_disable(); +- hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnode) { ++ hlist_for_each_entry_rcu(h, n, &ve_nf_conntrack_hash[hash], hnode) { + if (nf_ct_tuple_equal(tuple, &h->tuple)) { + NF_CT_STAT_INC(found); + local_bh_enable(); +@@ -294,9 +315,9 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, + unsigned int repl_hash) + { + hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, +- &nf_conntrack_hash[hash]); ++ &ve_nf_conntrack_hash[hash]); + hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnode, +- &nf_conntrack_hash[repl_hash]); ++ &ve_nf_conntrack_hash[repl_hash]); + } + + void nf_conntrack_hash_insert(struct nf_conn *ct) +@@ -350,11 +371,11 @@ __nf_conntrack_confirm(struct sk_buff *skb) + /* See if there's one in the list already, including reverse: + NAT could have grabbed it without realizing, since we're + not in the hash. If there is, we lost race. */ +- hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode) ++ hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[hash], hnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &h->tuple)) + goto out; +- hlist_for_each_entry(h, n, &nf_conntrack_hash[repl_hash], hnode) ++ hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[repl_hash], hnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, + &h->tuple)) + goto out; +@@ -405,7 +426,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, + * least once for the stats anyway. + */ + rcu_read_lock_bh(); +- hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnode) { ++ hlist_for_each_entry_rcu(h, n, &ve_nf_conntrack_hash[hash], hnode) { + if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack && + nf_ct_tuple_equal(tuple, &h->tuple)) { + NF_CT_STAT_INC(found); +@@ -435,7 +456,7 @@ static noinline int early_drop(unsigned int hash) + + rcu_read_lock(); + for (i = 0; i < nf_conntrack_htable_size; i++) { +- hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], ++ hlist_for_each_entry_rcu(h, n, &ve_nf_conntrack_hash[hash], + hnode) { + tmp = nf_ct_tuplehash_to_ctrack(h); + if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) +@@ -464,9 +485,11 @@ static noinline int early_drop(unsigned int hash) + } + + struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, +- const struct nf_conntrack_tuple *repl) ++ const struct nf_conntrack_tuple *repl, ++ struct user_beancounter *ub) + { + struct nf_conn *ct = NULL; ++ struct user_beancounter *old_ub; + + if (unlikely(!nf_conntrack_hash_rnd_initted)) { + get_random_bytes(&nf_conntrack_hash_rnd, 4); +@@ -474,25 +497,28 @@ struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, + } + + /* We don't want any race condition at early drop stage */ +- atomic_inc(&nf_conntrack_count); ++ atomic_inc(&ve_nf_conntrack_count); + +- if (nf_conntrack_max && +- unlikely(atomic_read(&nf_conntrack_count) > nf_conntrack_max)) { ++ if (ve_nf_conntrack_max && ++ unlikely(atomic_read(&ve_nf_conntrack_count) > ++ ve_nf_conntrack_max)) { + unsigned int hash = hash_conntrack(orig); + if (!early_drop(hash)) { +- atomic_dec(&nf_conntrack_count); ++ atomic_dec(&ve_nf_conntrack_count); + if (net_ratelimit()) +- printk(KERN_WARNING +- "nf_conntrack: table full, dropping" +- " packet.\n"); ++ ve_printk(VE_LOG_BOTH, KERN_WARNING ++ "nf_conntrack: CT %d: table full, dropping" ++ " packet.\n", VEID(get_exec_env())); + return ERR_PTR(-ENOMEM); + } + } + ++ old_ub = set_exec_ub(ub); + ct = kmem_cache_zalloc(nf_conntrack_cachep, GFP_ATOMIC); ++ 
(void)set_exec_ub(old_ub); + if (ct == NULL) { + pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n"); +- atomic_dec(&nf_conntrack_count); ++ atomic_dec(&ve_nf_conntrack_count); + return ERR_PTR(-ENOMEM); + } + +@@ -502,6 +528,9 @@ struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, + /* Don't set timer yet: wait for confirmation */ + setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); + INIT_RCU_HEAD(&ct->rcu); ++#ifdef CONFIG_VE_IPTABLES ++ ct->ct_owner_env = get_exec_env(); ++#endif + + return ct; + } +@@ -513,7 +542,7 @@ static void nf_conntrack_free_rcu(struct rcu_head *head) + + nf_ct_ext_free(ct); + kmem_cache_free(nf_conntrack_cachep, ct); +- atomic_dec(&nf_conntrack_count); ++ atomic_dec(&ve_nf_conntrack_count); + } + + void nf_conntrack_free(struct nf_conn *ct) +@@ -536,13 +565,20 @@ init_conntrack(const struct nf_conntrack_tuple *tuple, + struct nf_conn_help *help; + struct nf_conntrack_tuple repl_tuple; + struct nf_conntrack_expect *exp; ++ struct user_beancounter *ub = NULL; + + if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { + pr_debug("Can't invert tuple.\n"); + return NULL; + } + +- ct = nf_conntrack_alloc(tuple, &repl_tuple); ++#ifdef CONFIG_BEANCOUNTERS ++ if (skb->dev != NULL) /* received skb */ ++ ub = netdev_bc(skb->dev)->exec_ub; ++ else if (skb->sk != NULL) /* sent skb */ ++ ub = sock_bc(skb->sk)->ub; ++#endif ++ ct = nf_conntrack_alloc(tuple, &repl_tuple, ub); + if (ct == NULL || IS_ERR(ct)) { + pr_debug("Can't allocate conntrack.\n"); + return (struct nf_conntrack_tuple_hash *)ct; +@@ -589,7 +625,8 @@ init_conntrack(const struct nf_conntrack_tuple *tuple, + } + + /* Overload tuple linked list to put us in unconfirmed list. */ +- hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, &unconfirmed); ++ hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, ++ &ve_unconfirmed); + + spin_unlock_bh(&nf_conntrack_lock); + +@@ -918,13 +955,13 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), + + spin_lock_bh(&nf_conntrack_lock); + for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { +- hlist_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnode) { ++ hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[*bucket], hnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (iter(ct, data)) + goto found; + } + } +- hlist_for_each_entry(h, n, &unconfirmed, hnode) { ++ hlist_for_each_entry(h, n, &ve_unconfirmed, hnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + if (iter(ct, data)) + set_bit(IPS_DYING_BIT, &ct->status); +@@ -979,7 +1016,10 @@ EXPORT_SYMBOL_GPL(nf_conntrack_flush); + supposed to kill the mall. */ + void nf_conntrack_cleanup(void) + { +- rcu_assign_pointer(ip_ct_attach, NULL); ++ struct ve_struct *ve = get_exec_env(); ++ ++ if (ve_is_super(ve)) ++ rcu_assign_pointer(ip_ct_attach, NULL); + + /* This makes sure all current packets have passed through + netfilter framework. 
Roll on, two-stage module +@@ -989,10 +1029,12 @@ void nf_conntrack_cleanup(void) + nf_ct_event_cache_flush(); + i_see_dead_people: + nf_conntrack_flush(); +- if (atomic_read(&nf_conntrack_count) != 0) { ++ if (atomic_read(&ve_nf_conntrack_count) != 0) { + schedule(); + goto i_see_dead_people; + } ++ if (!ve_is_super(ve)) ++ goto skip_ct_cache; + /* wait until all references to nf_conntrack_untracked are dropped */ + while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1) + schedule(); +@@ -1000,12 +1042,17 @@ void nf_conntrack_cleanup(void) + rcu_assign_pointer(nf_ct_destroy, NULL); + + kmem_cache_destroy(nf_conntrack_cachep); +- nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc, +- nf_conntrack_htable_size); +- +- nf_conntrack_proto_fini(); ++skip_ct_cache: + nf_conntrack_helper_fini(); + nf_conntrack_expect_fini(); ++ ++ nf_conntrack_proto_fini(); ++ nf_ct_proto_generic_sysctl_cleanup(); ++ nf_ct_free_hashtable(ve_nf_conntrack_hash, ve_nf_conntrack_vmalloc, ++ nf_conntrack_htable_size); ++#ifdef CONFIG_VE_IPTABLES ++ kfree(ve->_nf_conntrack); ++#endif + } + + struct hlist_head *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced) +@@ -1016,13 +1063,13 @@ struct hlist_head *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced) + *vmalloced = 0; + + size = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_head)); +- hash = (void*)__get_free_pages(GFP_KERNEL|__GFP_NOWARN, ++ hash = (void*)__get_free_pages(GFP_KERNEL_UBC|__GFP_NOWARN, + get_order(sizeof(struct hlist_head) + * size)); + if (!hash) { + *vmalloced = 1; + printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); +- hash = vmalloc(sizeof(struct hlist_head) * size); ++ hash = ub_vmalloc(sizeof(struct hlist_head) * size); + } + + if (hash) +@@ -1064,8 +1111,8 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) + */ + spin_lock_bh(&nf_conntrack_lock); + for (i = 0; i < nf_conntrack_htable_size; i++) { +- while (!hlist_empty(&nf_conntrack_hash[i])) { +- h = hlist_entry(nf_conntrack_hash[i].first, ++ while (!hlist_empty(&ve_nf_conntrack_hash[i])) { ++ h = hlist_entry(ve_nf_conntrack_hash[i].first, + struct nf_conntrack_tuple_hash, hnode); + hlist_del_rcu(&h->hnode); + bucket = __hash_conntrack(&h->tuple, hashsize, rnd); +@@ -1073,12 +1120,12 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) + } + } + old_size = nf_conntrack_htable_size; +- old_vmalloced = nf_conntrack_vmalloc; +- old_hash = nf_conntrack_hash; ++ old_vmalloced = ve_nf_conntrack_vmalloc; ++ old_hash = ve_nf_conntrack_hash; + + nf_conntrack_htable_size = hashsize; +- nf_conntrack_vmalloc = vmalloced; +- nf_conntrack_hash = hash; ++ ve_nf_conntrack_vmalloc = vmalloced; ++ ve_nf_conntrack_hash = hash; + nf_conntrack_hash_rnd = rnd; + spin_unlock_bh(&nf_conntrack_lock); + +@@ -1090,53 +1137,82 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); + module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, + &nf_conntrack_htable_size, 0600); + +-int __init nf_conntrack_init(void) ++int nf_conntrack_init(void) + { ++ struct ve_struct *ve = get_exec_env(); + int max_factor = 8; +- int ret; ++ int ret = 0, i; ++ ++ if (ve_is_super(ve)) { ++ ++ /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB ++ * machine has 512 buckets. >= 1GB machines have 16384 buckets. 
*/ ++ if (!nf_conntrack_htable_size) { ++ nf_conntrack_htable_size ++ = (((num_physpages << PAGE_SHIFT) / 16384) ++ / sizeof(struct hlist_head)); ++ if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) ++ nf_conntrack_htable_size = 16384; ++ if (nf_conntrack_htable_size < 32) ++ nf_conntrack_htable_size = 32; ++ ++ /* Use a max. factor of four by default to get the same ++ * max as with the old struct list_heads. When a table ++ * size is given we use the old value of 8 to avoid ++ * reducing the max. entries. */ ++ max_factor = 4; ++ } ++ nf_conntrack_max = max_factor * nf_conntrack_htable_size; ++ ++ printk("nf_conntrack version %s (%u buckets, %d max)\n", ++ NF_CONNTRACK_VERSION, nf_conntrack_htable_size, ++ nf_conntrack_max); ++ } + +- /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB +- * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ +- if (!nf_conntrack_htable_size) { +- nf_conntrack_htable_size +- = (((num_physpages << PAGE_SHIFT) / 16384) +- / sizeof(struct hlist_head)); +- if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) +- nf_conntrack_htable_size = 16384; +- if (nf_conntrack_htable_size < 32) +- nf_conntrack_htable_size = 32; +- +- /* Use a max. factor of four by default to get the same max as +- * with the old struct list_heads. When a table size is given +- * we use the old value of 8 to avoid reducing the max. +- * entries. */ +- max_factor = 4; ++#ifdef CONFIG_VE_IPTABLES ++ ve->_nf_conntrack = kzalloc(sizeof(struct ve_nf_conntrack), GFP_KERNEL); ++ if (!ve->_nf_conntrack) { ++ ret = -ENOMEM; ++ goto out; + } +- nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, +- &nf_conntrack_vmalloc); +- if (!nf_conntrack_hash) { ++ ++ ve_nf_conntrack_max = nf_conntrack_max; ++ ve_nf_conntrack_checksum = nf_conntrack_checksum; ++ ve_nf_ct_expect_max = nf_ct_expect_max; ++ atomic_set(&ve_nf_conntrack_count, 0); ++ INIT_HLIST_HEAD(&ve_unconfirmed); ++#endif ++ ve_nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, ++ &ve_nf_conntrack_vmalloc); ++ if (!ve_nf_conntrack_hash) { + printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); + goto err_out; + } + +- nf_conntrack_max = max_factor * nf_conntrack_htable_size; +- +- printk("nf_conntrack version %s (%u buckets, %d max)\n", +- NF_CONNTRACK_VERSION, nf_conntrack_htable_size, +- nf_conntrack_max); +- +- nf_conntrack_cachep = kmem_cache_create("nf_conntrack", ++ if (ve_is_super(ve)) { ++ nf_conntrack_cachep = kmem_cache_create("nf_conntrack", + sizeof(struct nf_conn), +- 0, 0, NULL); +- if (!nf_conntrack_cachep) { +- printk(KERN_ERR "Unable to create nf_conn slab cache\n"); +- goto err_free_hash; ++ 0, SLAB_UBC, NULL); ++ if (!nf_conntrack_cachep) { ++ printk(KERN_ERR "Unable to create nf_conn slab cache\n"); ++ goto err_free_hash; ++ } + } + +- ret = nf_conntrack_proto_init(); ++ ret = nf_ct_proto_generic_sysctl_init(); + if (ret < 0) + goto err_free_conntrack_slab; + ++ ret = nf_conntrack_proto_init(); ++ if (ret < 0) ++ goto err_generic_proto; ++ ++ /* Don't NEED lock here, but good form anyway. 
*/ ++ spin_lock_bh(&nf_conntrack_lock); ++ for (i = 0; i < AF_MAX; i++) ++ ve_nf_ct_l3protos[i] = &nf_conntrack_l3proto_generic; ++ spin_unlock_bh(&nf_conntrack_lock); ++ + ret = nf_conntrack_expect_init(); + if (ret < 0) + goto out_fini_proto; +@@ -1145,27 +1221,36 @@ int __init nf_conntrack_init(void) + if (ret < 0) + goto out_fini_expect; + +- /* For use by REJECT target */ +- rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach); +- rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); ++ if (ve_is_super(ve)) { ++ /* For use by REJECT target */ ++ rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach); ++ rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); + +- /* Set up fake conntrack: +- - to never be deleted, not in any hashes */ +- atomic_set(&nf_conntrack_untracked.ct_general.use, 1); +- /* - and look it like as a confirmed connection */ +- set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); ++ /* Set up fake conntrack: ++ - to never be deleted, not in any hashes */ ++ atomic_set(&nf_conntrack_untracked.ct_general.use, 1); ++ /* - and look it like as a confirmed connection */ ++ set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); ++ } + +- return ret; ++ return 0; + + out_fini_expect: + nf_conntrack_expect_fini(); + out_fini_proto: + nf_conntrack_proto_fini(); ++err_generic_proto: ++ nf_ct_proto_generic_sysctl_cleanup(); + err_free_conntrack_slab: +- kmem_cache_destroy(nf_conntrack_cachep); ++ if (ve_is_super(ve)) ++ kmem_cache_destroy(nf_conntrack_cachep); + err_free_hash: +- nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc, ++ nf_ct_free_hashtable(ve_nf_conntrack_hash, nf_conntrack_vmalloc, + nf_conntrack_htable_size); + err_out: +- return -ENOMEM; ++#ifdef CONFIG_VE_IPTABLES ++ kfree(ve->_nf_conntrack); ++out: ++#endif ++ return ret; + } +diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c +index 83c41ac..d0ddfb6 100644 +--- a/net/netfilter/nf_conntrack_ecache.c ++++ b/net/netfilter/nf_conntrack_ecache.c +@@ -53,6 +53,9 @@ void nf_ct_deliver_cached_events(const struct nf_conn *ct) + { + struct nf_conntrack_ecache *ecache; + ++ if (!ve_is_super(get_exec_env())) ++ return; ++ + local_bh_disable(); + ecache = &__get_cpu_var(nf_conntrack_ecache); + if (ecache->ct == ct) +@@ -66,6 +69,9 @@ void __nf_ct_event_cache_init(struct nf_conn *ct) + { + struct nf_conntrack_ecache *ecache; + ++ if (!ve_is_super(get_exec_env())) ++ return; ++ + /* take care of delivering potentially old events */ + ecache = &__get_cpu_var(nf_conntrack_ecache); + BUG_ON(ecache->ct == ct); +@@ -84,6 +90,9 @@ void nf_ct_event_cache_flush(void) + struct nf_conntrack_ecache *ecache; + int cpu; + ++ if (!ve_is_super(get_exec_env())) ++ return; ++ + for_each_possible_cpu(cpu) { + ecache = &per_cpu(nf_conntrack_ecache, cpu); + if (ecache->ct) +diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c +index e8f0dea..88f3fa8 100644 +--- a/net/netfilter/nf_conntrack_expect.c ++++ b/net/netfilter/nf_conntrack_expect.c +@@ -28,17 +28,26 @@ + #include + #include + ++#ifndef CONFIG_VE_IPTABLES + struct hlist_head *nf_ct_expect_hash __read_mostly; + EXPORT_SYMBOL_GPL(nf_ct_expect_hash); ++#endif + + unsigned int nf_ct_expect_hsize __read_mostly; + EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); + + static unsigned int nf_ct_expect_hash_rnd __read_mostly; +-static unsigned int nf_ct_expect_count; + unsigned int nf_ct_expect_max __read_mostly; + static int nf_ct_expect_hash_rnd_initted __read_mostly; ++#ifdef CONFIG_VE_IPTABLES ++#define 
ve_nf_ct_expect_count (get_exec_env()->_nf_conntrack->_nf_ct_expect_count) ++#define ve_nf_ct_expect_vmalloc (get_exec_env()->_nf_conntrack->_nf_ct_expect_vmalloc) ++#else ++static unsigned int nf_ct_expect_count; + static int nf_ct_expect_vmalloc; ++#define ve_nf_ct_expect_count nf_ct_expect_count ++#define ve_nf_ct_expect_vmalloc nf_ct_expect_vmalloc ++#endif + + static struct kmem_cache *nf_ct_expect_cachep __read_mostly; + +@@ -51,7 +60,7 @@ void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) + NF_CT_ASSERT(!timer_pending(&exp->timeout)); + + hlist_del_rcu(&exp->hnode); +- nf_ct_expect_count--; ++ ve_nf_ct_expect_count--; + + hlist_del(&exp->lnode); + master_help->expecting[exp->class]--; +@@ -93,11 +102,11 @@ __nf_ct_expect_find(const struct nf_conntrack_tuple *tuple) + struct hlist_node *n; + unsigned int h; + +- if (!nf_ct_expect_count) ++ if (!ve_nf_ct_expect_count) + return NULL; + + h = nf_ct_expect_dst_hash(tuple); +- hlist_for_each_entry_rcu(i, n, &nf_ct_expect_hash[h], hnode) { ++ hlist_for_each_entry_rcu(i, n, &ve_nf_ct_expect_hash[h], hnode) { + if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) + return i; + } +@@ -130,11 +139,11 @@ nf_ct_find_expectation(const struct nf_conntrack_tuple *tuple) + struct hlist_node *n; + unsigned int h; + +- if (!nf_ct_expect_count) ++ if (!ve_nf_ct_expect_count) + return NULL; + + h = nf_ct_expect_dst_hash(tuple); +- hlist_for_each_entry(i, n, &nf_ct_expect_hash[h], hnode) { ++ hlist_for_each_entry(i, n, &ve_nf_ct_expect_hash[h], hnode) { + if (!(i->flags & NF_CT_EXPECT_INACTIVE) && + nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { + exp = i; +@@ -308,7 +317,7 @@ void nf_ct_expect_put(struct nf_conntrack_expect *exp) + } + EXPORT_SYMBOL_GPL(nf_ct_expect_put); + +-static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) ++void nf_ct_expect_insert(struct nf_conntrack_expect *exp) + { + struct nf_conn_help *master_help = nfct_help(exp->master); + const struct nf_conntrack_expect_policy *p; +@@ -319,8 +328,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) + hlist_add_head(&exp->lnode, &master_help->expectations); + master_help->expecting[exp->class]++; + +- hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); +- nf_ct_expect_count++; ++ hlist_add_head_rcu(&exp->hnode, &ve_nf_ct_expect_hash[h]); ++ ve_nf_ct_expect_count++; + + setup_timer(&exp->timeout, nf_ct_expectation_timed_out, + (unsigned long)exp); +@@ -331,6 +340,7 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) + atomic_inc(&exp->use); + NF_CT_STAT_INC(expect_create); + } ++EXPORT_SYMBOL_GPL(nf_ct_expect_insert); + + /* Race with expectations being used means we could have none to find; OK. */ + static void evict_oldest_expect(struct nf_conn *master, +@@ -383,7 +393,7 @@ int nf_ct_expect_related(struct nf_conntrack_expect *expect) + goto out; + } + h = nf_ct_expect_dst_hash(&expect->tuple); +- hlist_for_each_entry(i, n, &nf_ct_expect_hash[h], hnode) { ++ hlist_for_each_entry(i, n, &ve_nf_ct_expect_hash[h], hnode) { + if (expect_matches(i, expect)) { + /* Refresh timer: if it's dying, ignore.. 
*/ + if (refresh_timer(i)) { +@@ -406,7 +416,7 @@ int nf_ct_expect_related(struct nf_conntrack_expect *expect) + } + } + +- if (nf_ct_expect_count >= nf_ct_expect_max) { ++ if (ve_nf_ct_expect_count >= ve_nf_ct_expect_max) { + if (net_ratelimit()) + printk(KERN_WARNING + "nf_conntrack: expectation table full\n"); +@@ -434,7 +444,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) + struct hlist_node *n; + + for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { +- n = rcu_dereference(nf_ct_expect_hash[st->bucket].first); ++ n = rcu_dereference(ve_nf_ct_expect_hash[st->bucket].first); + if (n) + return n; + } +@@ -450,7 +460,7 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, + while (head == NULL) { + if (++st->bucket >= nf_ct_expect_hsize) + return NULL; +- head = rcu_dereference(nf_ct_expect_hash[st->bucket].first); ++ head = rcu_dereference(ve_nf_ct_expect_hash[st->bucket].first); + } + return head; + } +@@ -537,12 +547,13 @@ static const struct file_operations exp_file_ops = { + }; + #endif /* CONFIG_PROC_FS */ + +-static int __init exp_proc_init(void) ++static int exp_proc_init(void) + { + #ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc; + +- proc = proc_net_fops_create(&init_net, "nf_conntrack_expect", 0440, &exp_file_ops); ++ proc = proc_net_fops_create(get_exec_env()->ve_netns, ++ "nf_conntrack_expect", 0440, &exp_file_ops); + if (!proc) + return -ENOMEM; + #endif /* CONFIG_PROC_FS */ +@@ -552,13 +563,13 @@ static int __init exp_proc_init(void) + static void exp_proc_remove(void) + { + #ifdef CONFIG_PROC_FS +- proc_net_remove(&init_net, "nf_conntrack_expect"); ++ proc_net_remove(get_exec_env()->ve_netns, "nf_conntrack_expect"); + #endif /* CONFIG_PROC_FS */ + } + + module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0600); + +-int __init nf_conntrack_expect_init(void) ++int nf_conntrack_expect_init(void) + { + int err = -ENOMEM; + +@@ -569,16 +580,20 @@ int __init nf_conntrack_expect_init(void) + } + nf_ct_expect_max = nf_ct_expect_hsize * 4; + +- nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, +- &nf_ct_expect_vmalloc); +- if (nf_ct_expect_hash == NULL) ++ ve_nf_ct_expect_count = 0; ++ ve_nf_ct_expect_max = nf_ct_expect_max; ++ ve_nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, ++ &ve_nf_ct_expect_vmalloc); ++ if (ve_nf_ct_expect_hash == NULL) + goto err1; + +- nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", ++ if (ve_is_super(get_exec_env())) { ++ nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", + sizeof(struct nf_conntrack_expect), +- 0, 0, NULL); +- if (!nf_ct_expect_cachep) +- goto err2; ++ 0, SLAB_UBC, NULL); ++ if (!nf_ct_expect_cachep) ++ goto err2; ++ } + + err = exp_proc_init(); + if (err < 0) +@@ -587,9 +602,10 @@ int __init nf_conntrack_expect_init(void) + return 0; + + err3: +- kmem_cache_destroy(nf_ct_expect_cachep); ++ if (ve_is_super(get_exec_env())) ++ kmem_cache_destroy(nf_ct_expect_cachep); + err2: +- nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc, ++ nf_ct_free_hashtable(ve_nf_ct_expect_hash, ve_nf_ct_expect_vmalloc, + nf_ct_expect_hsize); + err1: + return err; +@@ -598,7 +614,8 @@ err1: + void nf_conntrack_expect_fini(void) + { + exp_proc_remove(); +- kmem_cache_destroy(nf_ct_expect_cachep); +- nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc, ++ if (ve_is_super(get_exec_env())) ++ kmem_cache_destroy(nf_ct_expect_cachep); ++ nf_ct_free_hashtable(ve_nf_ct_expect_hash, ve_nf_ct_expect_vmalloc, + 
nf_ct_expect_hsize); + } +diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c +index 7d1b117..b06f23f 100644 +--- a/net/netfilter/nf_conntrack_helper.c ++++ b/net/netfilter/nf_conntrack_helper.c +@@ -33,6 +33,13 @@ static struct hlist_head *nf_ct_helper_hash __read_mostly; + static unsigned int nf_ct_helper_hsize __read_mostly; + static unsigned int nf_ct_helper_count __read_mostly; + static int nf_ct_helper_vmalloc; ++#ifdef CONFIG_VE_IPTABLES ++#define ve_nf_ct_helper_hash (get_exec_env()->_nf_conntrack->_nf_ct_helper_hash) ++#define ve_nf_ct_helper_vmalloc (get_exec_env()->_nf_conntrack->_nf_ct_helper_vmalloc) ++#else ++#define ve_nf_ct_helper_hash nf_ct_helper_hash ++#define ve_nf_ct_helper_vmalloc nf_ct_helper_vmalloc ++#endif + + + /* Stupid hash, but collision free for the default registrations of the +@@ -55,7 +62,7 @@ __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple) + return NULL; + + h = helper_hash(tuple); +- hlist_for_each_entry_rcu(helper, n, &nf_ct_helper_hash[h], hnode) { ++ hlist_for_each_entry_rcu(helper, n, &ve_nf_ct_helper_hash[h], hnode) { + if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask)) + return helper; + } +@@ -71,7 +78,7 @@ __nf_conntrack_helper_find_byname(const char *name) + unsigned int i; + + for (i = 0; i < nf_ct_helper_hsize; i++) { +- hlist_for_each_entry_rcu(h, n, &nf_ct_helper_hash[i], hnode) { ++ hlist_for_each_entry_rcu(h, n, &ve_nf_ct_helper_hash[i], hnode) { + if (!strcmp(h->name, name)) + return h; + } +@@ -114,7 +121,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) + BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); + + mutex_lock(&nf_ct_helper_mutex); +- hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); ++ hlist_add_head_rcu(&me->hnode, &ve_nf_ct_helper_hash[h]); + nf_ct_helper_count++; + mutex_unlock(&nf_ct_helper_mutex); + +@@ -144,7 +151,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) + /* Get rid of expectations */ + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, n, next, +- &nf_ct_expect_hash[i], hnode) { ++ &ve_nf_ct_expect_hash[i], hnode) { + struct nf_conn_help *help = nfct_help(exp->master); + if ((help->helper == me || exp->helper == me) && + del_timer(&exp->timeout)) { +@@ -155,10 +162,10 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) + } + + /* Get rid of expecteds, set helpers to NULL. 
*/ +- hlist_for_each_entry(h, n, &unconfirmed, hnode) ++ hlist_for_each_entry(h, n, &ve_unconfirmed, hnode) + unhelp(h, me); + for (i = 0; i < nf_conntrack_htable_size; i++) { +- hlist_for_each_entry(h, n, &nf_conntrack_hash[i], hnode) ++ hlist_for_each_entry(h, n, &ve_nf_conntrack_hash[i], hnode) + unhelp(h, me); + } + spin_unlock_bh(&nf_conntrack_lock); +@@ -176,26 +183,29 @@ int nf_conntrack_helper_init(void) + int err; + + nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ +- nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, +- &nf_ct_helper_vmalloc); +- if (!nf_ct_helper_hash) ++ ve_nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, ++ &ve_nf_ct_helper_vmalloc); ++ if (!ve_nf_ct_helper_hash) + return -ENOMEM; + +- err = nf_ct_extend_register(&helper_extend); +- if (err < 0) +- goto err1; ++ if (ve_is_super(get_exec_env())) { ++ err = nf_ct_extend_register(&helper_extend); ++ if (err < 0) ++ goto err1; ++ } + + return 0; + + err1: +- nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc, ++ nf_ct_free_hashtable(ve_nf_ct_helper_hash, ve_nf_ct_helper_vmalloc, + nf_ct_helper_hsize); + return err; + } + + void nf_conntrack_helper_fini(void) + { +- nf_ct_extend_unregister(&helper_extend); +- nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc, ++ if (ve_is_super(get_exec_env())) ++ nf_ct_extend_unregister(&helper_extend); ++ nf_ct_free_hashtable(ve_nf_ct_helper_hash, ve_nf_ct_helper_vmalloc, + nf_ct_helper_hsize); + } +diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c +index 0edefcf..e9bee13 100644 +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -43,6 +44,8 @@ + + #include + #include ++#include ++#include + + MODULE_LICENSE("GPL"); + +@@ -551,7 +554,8 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + last = (struct nf_conn *)cb->args[1]; + for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { + restart: +- hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[cb->args[0]], ++ hlist_for_each_entry_rcu(h, n, ++ &ve_nf_conntrack_hash[cb->args[0]], + hnode) { + if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; +@@ -1123,14 +1127,15 @@ static int + ctnetlink_create_conntrack(struct nlattr *cda[], + struct nf_conntrack_tuple *otuple, + struct nf_conntrack_tuple *rtuple, +- struct nf_conn *master_ct) ++ struct nf_conn *master_ct, ++ struct user_beancounter *ub) + { + struct nf_conn *ct; + int err = -EINVAL; + struct nf_conn_help *help; + struct nf_conntrack_helper *helper; + +- ct = nf_conntrack_alloc(otuple, rtuple); ++ ct = nf_conntrack_alloc(otuple, rtuple, ub); + if (ct == NULL || IS_ERR(ct)) + return -ENOMEM; + +@@ -1240,11 +1245,19 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, + + spin_unlock_bh(&nf_conntrack_lock); + err = -ENOENT; +- if (nlh->nlmsg_flags & NLM_F_CREATE) ++ if (nlh->nlmsg_flags & NLM_F_CREATE) { ++ struct user_beancounter *ub = NULL; ++ ++#ifdef CONFIG_BEANCOUNTERS ++ if (skb->sk) ++ ub = sock_bc(skb->sk)->ub; ++#endif + err = ctnetlink_create_conntrack(cda, + &otuple, + &rtuple, +- master_ct); ++ master_ct, ++ ub); ++ } + if (err < 0 && master_ct) + nf_ct_put(master_ct); + +@@ -1466,7 +1479,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) + last = (struct nf_conntrack_expect *)cb->args[1]; + for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { + restart: +- 
hlist_for_each_entry(exp, n, &nf_ct_expect_hash[cb->args[0]], ++ hlist_for_each_entry(exp, n, &ve_nf_ct_expect_hash[cb->args[0]], + hnode) { + if (l3proto && exp->tuple.src.l3num != l3proto) + continue; +@@ -1612,7 +1625,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, + } + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, n, next, +- &nf_ct_expect_hash[i], ++ &ve_nf_ct_expect_hash[i], + hnode) { + m_help = nfct_help(exp->master); + if (m_help->helper == h +@@ -1628,7 +1641,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, + spin_lock_bh(&nf_conntrack_lock); + for (i = 0; i < nf_ct_expect_hsize; i++) { + hlist_for_each_entry_safe(exp, n, next, +- &nf_ct_expect_hash[i], ++ &ve_nf_ct_expect_hash[i], + hnode) { + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); +diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c +index a49fc93..49fc01f 100644 +--- a/net/netfilter/nf_conntrack_proto.c ++++ b/net/netfilter/nf_conntrack_proto.c +@@ -28,7 +28,7 @@ + #include + #include + +-static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly; ++struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly; + struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly; + EXPORT_SYMBOL_GPL(nf_ct_l3protos); + +@@ -40,7 +40,8 @@ nf_ct_register_sysctl(struct ctl_table_header **header, struct ctl_path *path, + struct ctl_table *table, unsigned int *users) + { + if (*header == NULL) { +- *header = register_sysctl_paths(path, table); ++ *header = register_net_sysctl_table(get_exec_env()->ve_netns, ++ path, table); + if (*header == NULL) + return -ENOMEM; + } +@@ -56,7 +57,7 @@ nf_ct_unregister_sysctl(struct ctl_table_header **header, + if (users != NULL && --*users > 0) + return; + +- unregister_sysctl_table(*header); ++ unregister_net_sysctl_table(*header); + *header = NULL; + } + #endif +@@ -64,10 +65,10 @@ nf_ct_unregister_sysctl(struct ctl_table_header **header, + struct nf_conntrack_l4proto * + __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto) + { +- if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL)) +- return &nf_conntrack_l4proto_generic; ++ if (unlikely(l3proto >= AF_MAX || ve_nf_ct_protos[l3proto] == NULL)) ++ return ve_nf_conntrack_l4proto_generic; + +- return rcu_dereference(nf_ct_protos[l3proto][l4proto]); ++ return rcu_dereference(ve_nf_ct_protos[l3proto][l4proto]); + } + EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find); + +@@ -81,7 +82,7 @@ nf_ct_l4proto_find_get(u_int16_t l3proto, u_int8_t l4proto) + rcu_read_lock(); + p = __nf_ct_l4proto_find(l3proto, l4proto); + if (!try_module_get(p->me)) +- p = &nf_conntrack_l4proto_generic; ++ p = ve_nf_conntrack_l4proto_generic; + rcu_read_unlock(); + + return p; +@@ -188,7 +189,8 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) + return -EBUSY; + + mutex_lock(&nf_ct_proto_mutex); +- if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_l3proto_generic) { ++ if (ve_nf_ct_l3protos[proto->l3proto] != ++ &nf_conntrack_l3proto_generic) { + ret = -EBUSY; + goto out_unlock; + } +@@ -197,7 +199,7 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) + if (ret < 0) + goto out_unlock; + +- rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto); ++ rcu_assign_pointer(ve_nf_ct_l3protos[proto->l3proto], proto); + + out_unlock: + mutex_unlock(&nf_ct_proto_mutex); +@@ -210,8 +212,8 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) + BUG_ON(proto->l3proto >= AF_MAX); + 
+ mutex_lock(&nf_ct_proto_mutex); +- BUG_ON(nf_ct_l3protos[proto->l3proto] != proto); +- rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], ++ BUG_ON(ve_nf_ct_l3protos[proto->l3proto] != proto); ++ rcu_assign_pointer(ve_nf_ct_l3protos[proto->l3proto], + &nf_conntrack_l3proto_generic); + nf_ct_l3proto_unregister_sysctl(proto); + mutex_unlock(&nf_ct_proto_mutex); +@@ -279,7 +281,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) + return -EBUSY; + + mutex_lock(&nf_ct_proto_mutex); +- if (!nf_ct_protos[l4proto->l3proto]) { ++ if (!ve_nf_ct_protos[l4proto->l3proto]) { + /* l3proto may be loaded latter. */ + struct nf_conntrack_l4proto **proto_array; + int i; +@@ -293,10 +295,10 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) + } + + for (i = 0; i < MAX_NF_CT_PROTO; i++) +- proto_array[i] = &nf_conntrack_l4proto_generic; +- nf_ct_protos[l4proto->l3proto] = proto_array; +- } else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != +- &nf_conntrack_l4proto_generic) { ++ proto_array[i] = ve_nf_conntrack_l4proto_generic; ++ ve_nf_ct_protos[l4proto->l3proto] = proto_array; ++ } else if (ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != ++ ve_nf_conntrack_l4proto_generic) { + ret = -EBUSY; + goto out_unlock; + } +@@ -305,7 +307,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) + if (ret < 0) + goto out_unlock; + +- rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], ++ rcu_assign_pointer(ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + l4proto); + + out_unlock: +@@ -319,9 +321,9 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) + BUG_ON(l4proto->l3proto >= PF_MAX); + + mutex_lock(&nf_ct_proto_mutex); +- BUG_ON(nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto); +- rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], +- &nf_conntrack_l4proto_generic); ++ BUG_ON(ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto); ++ rcu_assign_pointer(ve_nf_ct_protos[l4proto->l3proto][l4proto->l4proto], ++ ve_nf_conntrack_l4proto_generic); + nf_ct_l4proto_unregister_sysctl(l4proto); + mutex_unlock(&nf_ct_proto_mutex); + +@@ -337,12 +339,12 @@ int nf_conntrack_proto_init(void) + unsigned int i; + int err; + +- err = nf_ct_l4proto_register_sysctl(&nf_conntrack_l4proto_generic); ++ err = nf_ct_l4proto_register_sysctl(ve_nf_conntrack_l4proto_generic); + if (err < 0) + return err; + + for (i = 0; i < AF_MAX; i++) +- rcu_assign_pointer(nf_ct_l3protos[i], ++ rcu_assign_pointer(ve_nf_ct_l3protos[i], + &nf_conntrack_l3proto_generic); + return 0; + } +@@ -351,9 +353,13 @@ void nf_conntrack_proto_fini(void) + { + unsigned int i; + +- nf_ct_l4proto_unregister_sysctl(&nf_conntrack_l4proto_generic); ++ nf_ct_l4proto_unregister_sysctl(ve_nf_conntrack_l4proto_generic); + + /* free l3proto protocol tables */ + for (i = 0; i < PF_MAX; i++) +- kfree(nf_ct_protos[i]); ++ kfree(ve_nf_ct_protos[i]); ++#ifdef CONFIG_VE_IPTABLES ++ if (!ve_is_super(get_exec_env())) ++ kfree(ve_nf_conntrack_l4proto_generic); ++#endif + } +diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c +index e31b0e7..e65f9a7 100644 +--- a/net/netfilter/nf_conntrack_proto_generic.c ++++ b/net/netfilter/nf_conntrack_proto_generic.c +@@ -8,6 +8,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -48,7 +49,7 @@ static int packet(struct nf_conn *ct, + int pf, + unsigned int hooknum) + { +- nf_ct_refresh_acct(ct, ctinfo, skb, 
nf_ct_generic_timeout); ++ nf_ct_refresh_acct(ct, ctinfo, skb, ve_nf_ct_generic_timeout); + return NF_ACCEPT; + } + +@@ -107,3 +108,62 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_generic __read_mostly = + #endif + #endif + }; ++ ++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) ++int nf_ct_proto_generic_sysctl_init(void) ++{ ++ struct nf_conntrack_l4proto *generic; ++ ++ if (ve_is_super(get_exec_env())) { ++ generic = &nf_conntrack_l4proto_generic; ++ goto out; ++ } ++ ++ generic = kmemdup(&nf_conntrack_l4proto_generic, ++ sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); ++ if (generic == NULL) ++ goto no_mem_ct; ++ ++ generic->ctl_table_header = &ve_generic_sysctl_header; ++ generic->ctl_table = kmemdup(generic_sysctl_table, ++ sizeof(generic_sysctl_table), GFP_KERNEL); ++ if (generic->ctl_table == NULL) ++ goto no_mem_sys; ++ ++ generic->ctl_table[0].data = &ve_nf_ct_generic_timeout; ++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT ++ generic->ctl_compat_table_header = ve_generic_compat_sysctl_header; ++ generic->ctl_compat_table = kmemdup(generic_compat_sysctl_table, ++ sizeof(generic_compat_sysctl_table), GFP_KERNEL); ++ if (generic->ctl_compat_table == NULL) ++ goto no_mem_compat; ++ generic->ctl_compat_table[0].data = &ve_nf_ct_generic_timeout; ++#endif ++out: ++ ve_nf_ct_generic_timeout = nf_ct_generic_timeout; ++ ++ ve_nf_conntrack_l4proto_generic = generic; ++ return 0; ++ ++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT ++no_mem_compat: ++ kfree(generic->ctl_table); ++#endif ++no_mem_sys: ++ kfree(generic); ++no_mem_ct: ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(nf_ct_proto_generic_sysctl_init); ++ ++void nf_ct_proto_generic_sysctl_cleanup(void) ++{ ++ if (!ve_is_super(get_exec_env())) { ++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT ++ kfree(ve_nf_conntrack_l4proto_generic->ctl_compat_table); ++#endif ++ kfree(ve_nf_conntrack_l4proto_generic->ctl_table); ++ } ++} ++EXPORT_SYMBOL(nf_ct_proto_generic_sysctl_cleanup); ++#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ +diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c +index dd28fb2..4c6989c 100644 +--- a/net/netfilter/nf_conntrack_proto_tcp.c ++++ b/net/netfilter/nf_conntrack_proto_tcp.c +@@ -7,6 +7,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -661,7 +662,7 @@ static bool tcp_in_window(const struct nf_conn *ct, + } else { + res = false; + if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || +- nf_ct_tcp_be_liberal) ++ ve_nf_ct_tcp_be_liberal) + res = true; + if (!res && LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, +@@ -953,9 +954,9 @@ static int tcp_packet(struct nf_conn *ct, + if (old_state != new_state + && new_state == TCP_CONNTRACK_FIN_WAIT) + ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; +- timeout = ct->proto.tcp.retrans >= nf_ct_tcp_max_retrans +- && tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans +- ? nf_ct_tcp_timeout_max_retrans : tcp_timeouts[new_state]; ++ timeout = ct->proto.tcp.retrans >= ve_nf_ct_tcp_max_retrans ++ && ve_nf_ct_tcp_timeouts[new_state] > ve_nf_ct_tcp_timeout_max_retrans ++ ? 
ve_nf_ct_tcp_timeout_max_retrans : ve_nf_ct_tcp_timeouts[new_state]; + write_unlock_bh(&tcp_lock); + + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); +@@ -1024,7 +1025,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, + + tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); + ct->proto.tcp.seen[1].flags = 0; +- } else if (nf_ct_tcp_loose == 0) { ++ } else if (ve_nf_ct_tcp_loose == 0) { + /* Don't try to pick up connections. */ + return false; + } else { +@@ -1419,3 +1420,115 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = + #endif + }; + EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6); ++ ++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) ++int nf_ct_proto_tcp_sysctl_init(void) ++{ ++ struct nf_conntrack_l4proto *tcp4, *tcp6; ++ ++ if (ve_is_super(get_exec_env())) { ++ tcp4 = &nf_conntrack_l4proto_tcp4; ++ tcp6 = &nf_conntrack_l4proto_tcp6; ++ goto out; ++ } ++ ++ tcp4 = kmemdup(&nf_conntrack_l4proto_tcp4, ++ sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); ++ if (tcp4 == NULL) ++ goto no_mem_ct4; ++ ++ tcp4->ctl_table_users = &ve_tcp_sysctl_table_users; ++ tcp4->ctl_table_header = &ve_tcp_sysctl_header; ++ tcp4->ctl_table = kmemdup(tcp_sysctl_table, ++ sizeof(tcp_sysctl_table), GFP_KERNEL); ++ if (tcp4->ctl_table == NULL) ++ goto no_mem_sys; ++ ++ tcp4->ctl_table[0].data = &ve_nf_ct_tcp_timeouts[1]; ++ tcp4->ctl_table[1].data = &ve_nf_ct_tcp_timeouts[2]; ++ tcp4->ctl_table[2].data = &ve_nf_ct_tcp_timeouts[3]; ++ tcp4->ctl_table[3].data = &ve_nf_ct_tcp_timeouts[4]; ++ tcp4->ctl_table[4].data = &ve_nf_ct_tcp_timeouts[5]; ++ tcp4->ctl_table[5].data = &ve_nf_ct_tcp_timeouts[6]; ++ tcp4->ctl_table[6].data = &ve_nf_ct_tcp_timeouts[7]; ++ tcp4->ctl_table[7].data = &ve_nf_ct_tcp_timeouts[8]; ++ tcp4->ctl_table[8].data = &ve_nf_ct_tcp_timeout_max_retrans; ++ tcp4->ctl_table[9].data = &ve_nf_ct_tcp_loose; ++ tcp4->ctl_table[10].data = &ve_nf_ct_tcp_be_liberal; ++ tcp4->ctl_table[11].data = &ve_nf_ct_tcp_max_retrans; ++ ++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT ++ tcp4->ctl_compat_table_header = ve_tcp_compat_sysctl_header; ++ tcp4->ctl_compat_table = kmemdup(tcp_compat_sysctl_table, ++ sizeof(tcp_compat_sysctl_table), GFP_KERNEL); ++ if (tcp4->ctl_compat_table == NULL) ++ goto no_mem_compat; ++ ++ tcp4->ctl_compat_table[0].data = &ve_nf_ct_tcp_timeouts[1]; ++ tcp4->ctl_compat_table[1].data = &ve_nf_ct_tcp_timeouts[2]; ++ tcp4->ctl_compat_table[2].data = &ve_nf_ct_tcp_timeouts[3]; ++ tcp4->ctl_compat_table[3].data = &ve_nf_ct_tcp_timeouts[4]; ++ tcp4->ctl_compat_table[4].data = &ve_nf_ct_tcp_timeouts[5]; ++ tcp4->ctl_compat_table[5].data = &ve_nf_ct_tcp_timeouts[6]; ++ tcp4->ctl_compat_table[6].data = &ve_nf_ct_tcp_timeouts[7]; ++ tcp4->ctl_compat_table[7].data = &ve_nf_ct_tcp_timeouts[8]; ++ tcp4->ctl_compat_table[8].data = &ve_nf_ct_tcp_timeout_max_retrans; ++ tcp4->ctl_compat_table[9].data = &ve_nf_ct_tcp_loose; ++ tcp4->ctl_compat_table[10].data = &ve_nf_ct_tcp_be_liberal; ++ tcp4->ctl_compat_table[11].data = &ve_nf_ct_tcp_max_retrans; ++#endif ++ ++ tcp6 = kmemdup(&nf_conntrack_l4proto_tcp6, ++ sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); ++ if (!tcp6) ++ goto no_mem_ct6; ++ ++ tcp6->ctl_table_users = &ve_tcp_sysctl_table_users; ++ tcp6->ctl_table_header = &ve_tcp_sysctl_header; ++ tcp6->ctl_table = tcp4->ctl_table; ++out: ++ ve_nf_ct_tcp_timeouts[1] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT]; ++ ve_nf_ct_tcp_timeouts[2] = tcp_timeouts[TCP_CONNTRACK_SYN_RECV]; ++ ve_nf_ct_tcp_timeouts[3] = tcp_timeouts[TCP_CONNTRACK_ESTABLISHED]; 
++ ve_nf_ct_tcp_timeouts[4] = tcp_timeouts[TCP_CONNTRACK_FIN_WAIT]; ++ ve_nf_ct_tcp_timeouts[5] = tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT]; ++ ve_nf_ct_tcp_timeouts[6] = tcp_timeouts[TCP_CONNTRACK_LAST_ACK]; ++ ve_nf_ct_tcp_timeouts[7] = tcp_timeouts[TCP_CONNTRACK_TIME_WAIT]; ++ ve_nf_ct_tcp_timeouts[8] = tcp_timeouts[TCP_CONNTRACK_CLOSE]; ++ ve_nf_ct_tcp_timeout_max_retrans = nf_ct_tcp_timeout_max_retrans; ++ ve_nf_ct_tcp_loose = nf_ct_tcp_loose; ++ ve_nf_ct_tcp_be_liberal = nf_ct_tcp_be_liberal; ++ ve_nf_ct_tcp_max_retrans = nf_ct_tcp_max_retrans; ++ ++ ve_nf_conntrack_l4proto_tcp4 = tcp4; ++ ve_nf_conntrack_l4proto_tcp6 = tcp6; ++ return 0; ++ ++no_mem_ct6: ++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT ++ kfree(tcp4->ctl_compat_table); ++no_mem_compat: ++#endif ++ kfree(tcp4->ctl_table); ++no_mem_sys: ++ kfree(tcp4); ++no_mem_ct4: ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(nf_ct_proto_tcp_sysctl_init); ++ ++void nf_ct_proto_tcp_sysctl_cleanup(void) ++{ ++ if (!ve_is_super(get_exec_env())) { ++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT ++ kfree(ve_nf_conntrack_l4proto_tcp4->ctl_compat_table); ++#endif ++ kfree(ve_nf_conntrack_l4proto_tcp4->ctl_table); ++ kfree(ve_nf_conntrack_l4proto_tcp4); ++ ++ kfree(ve_nf_conntrack_l4proto_tcp6); ++ } ++} ++EXPORT_SYMBOL(nf_ct_proto_tcp_sysctl_cleanup); ++#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ ++ +diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c +index 8b21762..b01823e 100644 +--- a/net/netfilter/nf_conntrack_proto_udp.c ++++ b/net/netfilter/nf_conntrack_proto_udp.c +@@ -7,6 +7,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -72,12 +73,13 @@ static int udp_packet(struct nf_conn *ct, + /* If we've seen traffic both ways, this is some kind of UDP + stream. Extend timeout. 
*/ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { +- nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout_stream); ++ nf_ct_refresh_acct(ct, ctinfo, skb, ++ ve_nf_ct_udp_timeout_stream); + /* Also, more likely to be important, and not a probe */ + if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + nf_conntrack_event_cache(IPCT_STATUS, skb); + } else +- nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout); ++ nf_ct_refresh_acct(ct, ctinfo, skb, ve_nf_ct_udp_timeout); + + return NF_ACCEPT; + } +@@ -229,3 +231,85 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = + #endif + }; + EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udp6); ++ ++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) ++int nf_ct_proto_udp_sysctl_init(void) ++{ ++ struct nf_conntrack_l4proto *udp4, *udp6; ++ ++ if (ve_is_super(get_exec_env())) { ++ udp4 = &nf_conntrack_l4proto_udp4; ++ udp6 = &nf_conntrack_l4proto_udp6; ++ goto out; ++ } ++ ++ udp4 = kmemdup(&nf_conntrack_l4proto_udp4, ++ sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); ++ if (udp4 == NULL) ++ goto no_mem_ct4; ++ ++ udp4->ctl_table_users = &ve_udp_sysctl_table_users; ++ udp4->ctl_table_header = &ve_udp_sysctl_header; ++ udp4->ctl_table = kmemdup(udp_sysctl_table, ++ sizeof(udp_sysctl_table), GFP_KERNEL); ++ if (udp4->ctl_table == NULL) ++ goto no_mem_sys; ++ udp4->ctl_table[0].data = &ve_nf_ct_udp_timeout; ++ udp4->ctl_table[1].data = &ve_nf_ct_udp_timeout_stream; ++ ++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT ++ udp4->ctl_compat_table_header = ve_udp_compat_sysctl_header; ++ udp4->ctl_compat_table = kmemdup(udp_compat_sysctl_table, ++ sizeof(udp_compat_sysctl_table), GFP_KERNEL); ++ if (udp4->ctl_compat_table == NULL) ++ goto no_mem_compat; ++ udp4->ctl_compat_table[0].data = &ve_nf_ct_udp_timeout; ++ udp4->ctl_compat_table[1].data = &ve_nf_ct_udp_timeout_stream; ++#endif ++ ++ udp6 = kmemdup(&nf_conntrack_l4proto_udp6, ++ sizeof(struct nf_conntrack_l4proto), GFP_KERNEL); ++ if (!udp6) ++ goto no_mem_ct6; ++ ++ udp6->ctl_table_users = &ve_udp_sysctl_table_users; ++ udp6->ctl_table_header = &ve_udp_sysctl_header; ++ udp6->ctl_table = udp4->ctl_table; ++ ++ udp6->ctl_table[0].data = &ve_nf_ct_udp_timeout; ++ udp6->ctl_table[1].data = &ve_nf_ct_udp_timeout_stream; ++out: ++ ve_nf_ct_udp_timeout = nf_ct_udp_timeout; ++ ve_nf_ct_udp_timeout_stream = nf_ct_udp_timeout_stream; ++ ++ ve_nf_conntrack_l4proto_udp4 = udp4; ++ ve_nf_conntrack_l4proto_udp6 = udp6; ++ return 0; ++ ++no_mem_ct6: ++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT ++ kfree(udp4->ctl_compat_table); ++no_mem_compat: ++#endif ++ kfree(udp4->ctl_table); ++no_mem_sys: ++ kfree(udp4); ++no_mem_ct4: ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(nf_ct_proto_udp_sysctl_init); ++ ++void nf_ct_proto_udp_sysctl_cleanup(void) ++{ ++ if (!ve_is_super(get_exec_env())) { ++#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT ++ kfree(ve_nf_conntrack_l4proto_udp4->ctl_compat_table); ++#endif ++ kfree(ve_nf_conntrack_l4proto_udp4->ctl_table); ++ kfree(ve_nf_conntrack_l4proto_udp4); ++ ++ kfree(ve_nf_conntrack_l4proto_udp6); ++ } ++} ++EXPORT_SYMBOL(nf_ct_proto_udp_sysctl_cleanup); ++#endif /* CONFIG_VE_IPTABLES && CONFIG_SYSCTL */ +diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c +index 46ea542..c4d8ef2 100644 +--- a/net/netfilter/nf_conntrack_standalone.c ++++ b/net/netfilter/nf_conntrack_standalone.c +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -18,6 +19,7 @@ + #ifdef CONFIG_SYSCTL + #include + #endif 
++#include + + #include + #include +@@ -28,6 +30,10 @@ + + MODULE_LICENSE("GPL"); + ++int ip_conntrack_disable_ve0 = 0; ++module_param(ip_conntrack_disable_ve0, int, 0440); ++EXPORT_SYMBOL(ip_conntrack_disable_ve0); ++ + #ifdef CONFIG_PROC_FS + int + print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, +@@ -63,7 +69,7 @@ static struct hlist_node *ct_get_first(struct seq_file *seq) + for (st->bucket = 0; + st->bucket < nf_conntrack_htable_size; + st->bucket++) { +- n = rcu_dereference(nf_conntrack_hash[st->bucket].first); ++ n = rcu_dereference(ve_nf_conntrack_hash[st->bucket].first); + if (n) + return n; + } +@@ -79,7 +85,7 @@ static struct hlist_node *ct_get_next(struct seq_file *seq, + while (head == NULL) { + if (++st->bucket >= nf_conntrack_htable_size) + return NULL; +- head = rcu_dereference(nf_conntrack_hash[st->bucket].first); ++ head = rcu_dereference(ve_nf_conntrack_hash[st->bucket].first); + } + return head; + } +@@ -238,7 +244,7 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) + + static int ct_cpu_seq_show(struct seq_file *seq, void *v) + { +- unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); ++ unsigned int nr_conntracks = atomic_read(&ve_nf_conntrack_count); + const struct ip_conntrack_stat *st = v; + + if (v == SEQ_START_TOKEN) { +@@ -292,27 +298,30 @@ static const struct file_operations ct_cpu_seq_fops = { + static int nf_conntrack_standalone_init_proc(void) + { + struct proc_dir_entry *pde; ++ struct net *net = get_exec_env()->ve_netns; + +- pde = proc_net_fops_create(&init_net, "nf_conntrack", 0440, &ct_file_ops); ++ pde = proc_net_fops_create(net, "nf_conntrack", 0440, &ct_file_ops); + if (!pde) + goto out_nf_conntrack; + +- pde = proc_create("nf_conntrack", S_IRUGO, init_net.proc_net_stat, ++ pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat, + &ct_cpu_seq_fops); + if (!pde) + goto out_stat_nf_conntrack; + return 0; + + out_stat_nf_conntrack: +- proc_net_remove(&init_net, "nf_conntrack"); ++ proc_net_remove(net, "nf_conntrack"); + out_nf_conntrack: + return -ENOMEM; + } + + static void nf_conntrack_standalone_fini_proc(void) + { +- remove_proc_entry("nf_conntrack", init_net.proc_net_stat); +- proc_net_remove(&init_net, "nf_conntrack"); ++ struct net *net = get_exec_env()->ve_netns; ++ ++ remove_proc_entry("nf_conntrack", net->proc_net_stat); ++ proc_net_remove(net, "nf_conntrack"); + } + #else + static int nf_conntrack_standalone_init_proc(void) +@@ -421,19 +430,61 @@ EXPORT_SYMBOL_GPL(nf_ct_log_invalid); + + static int nf_conntrack_standalone_init_sysctl(void) + { +- nf_ct_sysctl_header = +- register_sysctl_paths(nf_ct_path, nf_ct_netfilter_table); +- if (nf_ct_sysctl_header == NULL) { +- printk("nf_conntrack: can't register to sysctl.\n"); +- return -ENOMEM; ++ struct ctl_table *nf_table, *ct_table; ++ ++ nf_table = nf_ct_netfilter_table; ++ ct_table = nf_ct_sysctl_table; ++ ++ if (!ve_is_super(get_exec_env())) { ++ nf_table = kmemdup(nf_table, sizeof(nf_ct_netfilter_table), ++ GFP_KERNEL); ++ if (nf_table == NULL) ++ goto err_nft; ++ ++ ct_table = kmemdup(ct_table, sizeof(nf_ct_sysctl_table), ++ GFP_KERNEL); ++ if (ct_table == NULL) ++ goto err_ctt; ++ ++ nf_table[0].child = ct_table; + } ++ ++ nf_table[1].data = &ve_nf_conntrack_max; ++ ct_table[0].data = &ve_nf_conntrack_max; ++ ct_table[1].data = &ve_nf_conntrack_count; ++ /* nf_conntrack_htable_size is shared and readonly */ ++ ct_table[3].data = &ve_nf_conntrack_checksum; ++ ct_table[4].data = &ve_nf_ct_log_invalid; ++ ct_table[5].data = 
&ve_nf_ct_expect_max; ++ ++ ve_nf_ct_sysctl_header = register_net_sysctl_table(get_exec_env()->ve_netns, ++ nf_ct_path, nf_table); ++ if (ve_nf_ct_sysctl_header == NULL) ++ goto err_reg; ++ + return 0; + ++err_reg: ++ if (ct_table != nf_ct_sysctl_table) ++ kfree(ct_table); ++err_ctt: ++ if (nf_table != nf_ct_netfilter_table) ++ kfree(nf_table); ++err_nft: ++ return -ENOMEM; + } + + static void nf_conntrack_standalone_fini_sysctl(void) + { +- unregister_sysctl_table(nf_ct_sysctl_header); ++ struct ctl_table *table = ve_nf_ct_sysctl_header->ctl_table_arg; ++ ++ unregister_net_sysctl_table(ve_nf_ct_sysctl_header); ++ ++ if (!ve_is_super(get_exec_env())) { ++ kfree(table[0].child); ++ kfree(table); ++ } ++ + } + #else + static int nf_conntrack_standalone_init_sysctl(void) +@@ -446,7 +497,7 @@ static void nf_conntrack_standalone_fini_sysctl(void) + } + #endif /* CONFIG_SYSCTL */ + +-static int __init nf_conntrack_standalone_init(void) ++static int nf_conntrack_init_ve(void) + { + int ret; + +@@ -469,13 +520,34 @@ out: + return ret; + } + +-static void __exit nf_conntrack_standalone_fini(void) ++static void nf_conntrack_cleanup_ve(void) + { + nf_conntrack_standalone_fini_sysctl(); + nf_conntrack_standalone_fini_proc(); + nf_conntrack_cleanup(); + } + ++static int __init nf_conntrack_standalone_init(void) ++{ ++#ifdef CONFIG_VE_IPTABLES ++ KSYMRESOLVE(nf_conntrack_init_ve); ++ KSYMRESOLVE(nf_conntrack_cleanup_ve); ++ KSYMMODRESOLVE(nf_conntrack); ++#endif ++ ++ return nf_conntrack_init_ve(); ++} ++ ++static void __exit nf_conntrack_standalone_fini(void) ++{ ++#ifdef CONFIG_VE_IPTABLES ++ KSYMMODUNRESOLVE(nf_conntrack); ++ KSYMUNRESOLVE(nf_conntrack_init_ve); ++ KSYMUNRESOLVE(nf_conntrack_cleanup_ve); ++#endif ++ nf_conntrack_cleanup_ve(); ++} ++ + module_init(nf_conntrack_standalone_init); + module_exit(nf_conntrack_standalone_fini); + +diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c +index 69d699f..aa01c54 100644 +--- a/net/netfilter/nf_sockopt.c ++++ b/net/netfilter/nf_sockopt.c +@@ -65,9 +65,6 @@ static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, int pf, + { + struct nf_sockopt_ops *ops; + +- if (sock_net(sk) != &init_net) +- return ERR_PTR(-ENOPROTOOPT); +- + if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) + return ERR_PTR(-EINTR); + +diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c +index b75c9c4..04491ab 100644 +--- a/net/netfilter/nfnetlink.c ++++ b/net/netfilter/nfnetlink.c +@@ -124,7 +124,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) + const struct nfnetlink_subsystem *ss; + int type, err; + +- if (security_netlink_recv(skb, CAP_NET_ADMIN)) ++ if (security_netlink_recv(skb, CAP_VE_NET_ADMIN)) + return -EPERM; + + /* All the messages must at least contain nfgenmsg */ +diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c +index 3447025..9655cff 100644 +--- a/net/netfilter/nfnetlink_queue.c ++++ b/net/netfilter/nfnetlink_queue.c +@@ -556,9 +556,6 @@ nfqnl_rcv_dev_event(struct notifier_block *this, + { + struct net_device *dev = ptr; + +- if (dev_net(dev) != &init_net) +- return NOTIFY_DONE; +- + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + nfqnl_dev_drop(dev->ifindex); +@@ -587,8 +584,7 @@ nfqnl_rcv_nl_event(struct notifier_block *this, + struct hlist_head *head = &instance_table[i]; + + hlist_for_each_entry_safe(inst, tmp, t2, head, hlist) { +- if ((n->net == &init_net) && +- (n->pid == inst->peer_pid)) ++ if (n->pid == inst->peer_pid) 
+ __instance_destroy(inst); + } + } +diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c +index 5d75cd8..7ffe66a 100644 +--- a/net/netfilter/x_tables.c ++++ b/net/netfilter/x_tables.c +@@ -24,6 +24,8 @@ + #include + #include + ++#include ++ + #include + #include + +@@ -64,6 +66,46 @@ static const char *const xt_prefix[NPROTO] = { + [NF_ARP] = "arp", + }; + ++#ifdef CONFIG_BEANCOUNTERS ++static inline struct user_beancounter *xt_table_ub(struct xt_table_info *info) ++{ ++ struct user_beancounter *ub; ++ ++ for (ub = mem_ub(info); ub->parent != NULL; ub = ub->parent); ++ return ub; ++} ++ ++static void uncharge_xtables(struct xt_table_info *info, unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ ub = xt_table_ub(info); ++ uncharge_beancounter(ub, UB_NUMXTENT, size); ++} ++ ++static int recharge_xtables(int check_ub, ++ struct xt_table_info *new, struct xt_table_info *old) ++{ ++ struct user_beancounter *ub; ++ long change; ++ ++ ub = xt_table_ub(new); ++ BUG_ON(check_ub && ub != xt_table_ub(old)); ++ ++ change = (long)new->number - (long)old->number; ++ if (change > 0) { ++ if (charge_beancounter(ub, UB_NUMXTENT, change, UB_SOFT)) ++ return -ENOMEM; ++ } else if (change < 0) ++ uncharge_beancounter(ub, UB_NUMXTENT, -change); ++ ++ return 0; ++} ++#else ++#define recharge_xtables(c, new, old) (0) ++#define uncharge_xtables(info, s) do { } while (0) ++#endif /* CONFIG_BEANCOUNTERS */ ++ + /* Registration hooks for targets. */ + int + xt_register_target(struct xt_target *target) +@@ -312,23 +354,23 @@ int xt_check_match(const struct xt_match *match, unsigned short family, + unsigned short proto, int inv_proto) + { + if (XT_ALIGN(match->matchsize) != size) { +- printk("%s_tables: %s match: invalid size %Zu != %u\n", ++ ve_printk(VE_LOG, "%s_tables: %s match: invalid size %Zu != %u\n", + xt_prefix[family], match->name, + XT_ALIGN(match->matchsize), size); + return -EINVAL; + } + if (match->table && strcmp(match->table, table)) { +- printk("%s_tables: %s match: only valid in %s table, not %s\n", ++ ve_printk(VE_LOG, "%s_tables: %s match: only valid in %s table, not %s\n", + xt_prefix[family], match->name, match->table, table); + return -EINVAL; + } + if (match->hooks && (hook_mask & ~match->hooks) != 0) { +- printk("%s_tables: %s match: bad hook_mask %u/%u\n", ++ ve_printk(VE_LOG, "%s_tables: %s match: bad hook_mask %u/%u\n", + xt_prefix[family], match->name, hook_mask, match->hooks); + return -EINVAL; + } + if (match->proto && (match->proto != proto || inv_proto)) { +- printk("%s_tables: %s match: only valid for protocol %u\n", ++ ve_printk(VE_LOG, "%s_tables: %s match: only valid for protocol %u\n", + xt_prefix[family], match->name, match->proto); + return -EINVAL; + } +@@ -453,24 +495,24 @@ int xt_check_target(const struct xt_target *target, unsigned short family, + unsigned short proto, int inv_proto) + { + if (XT_ALIGN(target->targetsize) != size) { +- printk("%s_tables: %s target: invalid size %Zu != %u\n", ++ ve_printk(VE_LOG, "%s_tables: %s target: invalid size %Zu != %u\n", + xt_prefix[family], target->name, + XT_ALIGN(target->targetsize), size); + return -EINVAL; + } + if (target->table && strcmp(target->table, table)) { +- printk("%s_tables: %s target: only valid in %s table, not %s\n", ++ ve_printk(VE_LOG, "%s_tables: %s target: only valid in %s table, not %s\n", + xt_prefix[family], target->name, target->table, table); + return -EINVAL; + } + if (target->hooks && (hook_mask & ~target->hooks) != 0) { +- printk("%s_tables: %s target: bad hook_mask %u/%u\n", ++ 
ve_printk(VE_LOG, "%s_tables: %s target: bad hook_mask %u/%u\n", + xt_prefix[family], target->name, hook_mask, + target->hooks); + return -EINVAL; + } + if (target->proto && (target->proto != proto || inv_proto)) { +- printk("%s_tables: %s target: only valid for protocol %u\n", ++ ve_printk(VE_LOG, "%s_tables: %s target: only valid for protocol %u\n", + xt_prefix[family], target->name, target->proto); + return -EINVAL; + } +@@ -550,19 +592,19 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) + if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages) + return NULL; + +- newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL); ++ newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL_UBC); + if (!newinfo) + return NULL; + +- newinfo->size = size; ++ newinfo->alloc_size = newinfo->size = size; + + for_each_possible_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, +- GFP_KERNEL, ++ GFP_KERNEL_UBC, + cpu_to_node(cpu)); + else +- newinfo->entries[cpu] = vmalloc_node(size, ++ newinfo->entries[cpu] = ub_vmalloc_node(size, + cpu_to_node(cpu)); + + if (newinfo->entries[cpu] == NULL) { +@@ -580,7 +622,7 @@ void xt_free_table_info(struct xt_table_info *info) + int cpu; + + for_each_possible_cpu(cpu) { +- if (info->size <= PAGE_SIZE) ++ if (info->alloc_size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); +@@ -645,6 +687,13 @@ xt_replace_table(struct xt_table *table, + return NULL; + } + oldinfo = private; ++ ++ if (recharge_xtables(num_counters != 0, newinfo, oldinfo)) { ++ write_unlock_bh(&table->lock); ++ *error = -ENOMEM; ++ return NULL; ++ } ++ + table->private = newinfo; + newinfo->initial_entries = oldinfo->initial_entries; + write_unlock_bh(&table->lock); +@@ -714,6 +763,7 @@ void *xt_unregister_table(struct xt_table *table) + list_del(&table->list); + mutex_unlock(&xt[table->af].mutex); + kfree(table); ++ uncharge_xtables(private, private->number); + + return private; + } +diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c +index 97efd74..d0453de 100644 +--- a/net/netfilter/xt_DSCP.c ++++ b/net/netfilter/xt_DSCP.c +@@ -73,7 +73,7 @@ dscp_tg_check(const char *tablename, const void *e_void, + const u_int8_t dscp = ((struct xt_DSCP_info *)targinfo)->dscp; + + if (dscp > XT_DSCP_MAX) { +- printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp); ++ ve_printk(VE_LOG, KERN_WARNING "DSCP: dscp %x out of range\n", dscp); + return false; + } + return true; +diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c +index f9ce20b..030ba07 100644 +--- a/net/netfilter/xt_MARK.c ++++ b/net/netfilter/xt_MARK.c +@@ -80,7 +80,7 @@ mark_tg_check_v0(const char *tablename, const void *entry, + const struct xt_mark_target_info *markinfo = targinfo; + + if (markinfo->mark > 0xffffffff) { +- printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); ++ ve_printk(VE_LOG, KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + return false; + } + return true; +@@ -96,12 +96,12 @@ mark_tg_check_v1(const char *tablename, const void *entry, + if (markinfo->mode != XT_MARK_SET + && markinfo->mode != XT_MARK_AND + && markinfo->mode != XT_MARK_OR) { +- printk(KERN_WARNING "MARK: unknown mode %u\n", ++ ve_printk(VE_LOG, KERN_WARNING "MARK: unknown mode %u\n", + markinfo->mode); + return false; + } + if (markinfo->mark > 0xffffffff) { +- printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); ++ ve_printk(VE_LOG, KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + return false; + } + return true; +diff --git 
a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c +index 217e2b6..1edbe18 100644 +--- a/net/netfilter/xt_TCPMSS.c ++++ b/net/netfilter/xt_TCPMSS.c +@@ -67,7 +67,7 @@ tcpmss_mangle_packet(struct sk_buff *skb, + badly. --RR */ + if (tcplen != tcph->doff*4) { + if (net_ratelimit()) +- printk(KERN_ERR "xt_TCPMSS: bad length (%u bytes)\n", ++ ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: bad length (%u bytes)\n", + skb->len); + return -1; + } +@@ -75,7 +75,7 @@ tcpmss_mangle_packet(struct sk_buff *skb, + if (info->mss == XT_TCPMSS_CLAMP_PMTU) { + if (dst_mtu(skb->dst) <= minlen) { + if (net_ratelimit()) +- printk(KERN_ERR "xt_TCPMSS: " ++ ve_printk(VE_LOG, KERN_ERR "xt_TCPMSS: " + "unknown or invalid path-MTU (%u)\n", + dst_mtu(skb->dst)); + return -1; +@@ -269,13 +269,13 @@ tcpmss_tg4_check(const char *tablename, const void *entry, + (hook_mask & ~((1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING))) != 0) { +- printk("xt_TCPMSS: path-MTU clamping only supported in " ++ ve_printk(VE_LOG, "xt_TCPMSS: path-MTU clamping only supported in " + "FORWARD, OUTPUT and POSTROUTING hooks\n"); + return false; + } + if (IPT_MATCH_ITERATE(e, find_syn_match)) + return true; +- printk("xt_TCPMSS: Only works on TCP SYN packets\n"); ++ ve_printk(VE_LOG, "xt_TCPMSS: Only works on TCP SYN packets\n"); + return false; + } + +@@ -292,13 +292,13 @@ tcpmss_tg6_check(const char *tablename, const void *entry, + (hook_mask & ~((1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING))) != 0) { +- printk("xt_TCPMSS: path-MTU clamping only supported in " ++ ve_printk(VE_LOG, "xt_TCPMSS: path-MTU clamping only supported in " + "FORWARD, OUTPUT and POSTROUTING hooks\n"); + return false; + } + if (IP6T_MATCH_ITERATE(e, find_syn_match)) + return true; +- printk("xt_TCPMSS: Only works on TCP SYN packets\n"); ++ ve_printk(VE_LOG, "xt_TCPMSS: Only works on TCP SYN packets\n"); + return false; + } + #endif +diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c +index 6809af5..da1f086 100644 +--- a/net/netfilter/xt_hashlimit.c ++++ b/net/netfilter/xt_hashlimit.c +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -41,8 +42,13 @@ MODULE_ALIAS("ipt_hashlimit"); + MODULE_ALIAS("ip6t_hashlimit"); + + /* need to declare this at the top */ ++#ifdef CONFIG_VE_IPTABLES ++#define hashlimit_procdir4 (get_exec_env()->_xt_hashlimit->hashlimit_procdir4) ++#define hashlimit_procdir6 (get_exec_env()->_xt_hashlimit->hashlimit_procdir6) ++#else + static struct proc_dir_entry *hashlimit_procdir4; + static struct proc_dir_entry *hashlimit_procdir6; ++#endif + static const struct file_operations dl_file_ops; + + /* hash table crap */ +@@ -99,9 +105,16 @@ struct xt_hashlimit_htable { + + static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ + static DEFINE_MUTEX(hlimit_mutex); /* additional checkentry protection */ ++#ifdef CONFIG_VE_IPTABLES ++#define hashlimit_htables (get_exec_env()->_xt_hashlimit->hashlimit_htables) ++#else + static HLIST_HEAD(hashlimit_htables); ++#endif + static struct kmem_cache *hashlimit_cachep __read_mostly; + ++static int init_xt_hashlimit(void); ++static void fini_xt_hashlimit(void); ++ + static inline bool dst_cmp(const struct dsthash_ent *ent, + const struct dsthash_dst *b) + { +@@ -702,6 +715,9 @@ hashlimit_mt_check_v0(const char *tablename, const void *inf, + if (r->name[sizeof(r->name) - 1] != '\0') + return false; + ++ if (init_xt_hashlimit()) ++ return 0; ++ + /* This is the best 
we've got: We cannot release and re-grab lock, + * since checkentry() is called before x_tables.c grabs xt_mutex. + * We also cannot grab the hashtable spinlock, since htable_create will +@@ -748,6 +764,9 @@ hashlimit_mt_check(const char *tablename, const void *inf, + return false; + } + ++ if (init_xt_hashlimit()) ++ return 0; ++ + /* This is the best we've got: We cannot release and re-grab lock, + * since checkentry() is called before x_tables.c grabs xt_mutex. + * We also cannot grab the hashtable spinlock, since htable_create will +@@ -770,6 +789,8 @@ hashlimit_mt_destroy_v0(const struct xt_match *match, void *matchinfo) + const struct xt_hashlimit_info *r = matchinfo; + + htable_put(r->hinfo); ++ if (!ve_is_super(get_exec_env()) && hlist_empty(&hashlimit_htables)) ++ fini_xt_hashlimit(); + } + + static void +@@ -778,6 +799,8 @@ hashlimit_mt_destroy(const struct xt_match *match, void *matchinfo) + const struct xt_hashlimit_mtinfo1 *info = matchinfo; + + htable_put(info->hinfo); ++ if (!ve_is_super(get_exec_env()) && hlist_empty(&hashlimit_htables)) ++ fini_xt_hashlimit(); + } + + #ifdef CONFIG_COMPAT +@@ -980,6 +1003,76 @@ static const struct file_operations dl_file_ops = { + .release = seq_release + }; + ++static inline struct proc_dir_entry *proc_from_netns(void) ++{ ++#if defined(CONFIG_VE) ++ return get_exec_env()->ve_netns->proc_net; ++#else ++ return init_net.proc_net; ++#endif ++} ++ ++static int init_xt_hashlimit(void) ++{ ++ struct proc_dir_entry *proc_net = proc_from_netns(); ++ ++#if defined(CONFIG_VE_IPTABLES) ++ struct ve_struct *ve = get_exec_env(); ++ ++ if (ve->_xt_hashlimit) ++ return 0; ++ ++ ve->_xt_hashlimit = kzalloc(sizeof(struct ve_xt_hashlimit), GFP_KERNEL); ++ if (!ve->_xt_hashlimit) ++ goto err1; ++#endif ++ INIT_HLIST_HEAD(&hashlimit_htables); ++ ++ hashlimit_procdir4 = proc_mkdir("ipt_hashlimit", proc_net); ++ if (!hashlimit_procdir4) { ++ printk(KERN_ERR "xt_hashlimit: unable to create proc dir " ++ "entry\n"); ++ goto err2; ++ } ++#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) ++ hashlimit_procdir6 = proc_mkdir("ip6t_hashlimit", proc_net); ++ if (!hashlimit_procdir6) { ++ printk(KERN_ERR "xt_hashlimit: unable to create proc dir " ++ "entry\n"); ++ goto err3; ++ } ++#endif ++ ++ return 0; ++ ++err3: ++ remove_proc_entry("ipt_hashlimit", proc_net); ++err2: ++#if defined(CONFIG_VE_IPTABLES) ++ kfree(ve->_xt_hashlimit); ++ ve->_xt_hashlimit = NULL; ++err1: ++#endif ++ return -ENOMEM; ++} ++ ++static void fini_xt_hashlimit(void) ++{ ++ struct proc_dir_entry *proc_net = proc_from_netns(); ++#ifdef CONFIG_VE_IPTABLES ++ struct ve_struct *ve = get_exec_env(); ++#endif ++#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) ++ remove_proc_entry("ip6t_hashlimit", proc_net); ++#endif ++ remove_proc_entry("ipt_hashlimit", proc_net); ++ ++#if defined(CONFIG_VE_IPTABLES) ++ kfree(ve->_xt_hashlimit); ++ ve->_xt_hashlimit = NULL; ++#endif ++} ++ + static int __init hashlimit_mt_init(void) + { + int err; +@@ -997,24 +1090,11 @@ static int __init hashlimit_mt_init(void) + printk(KERN_ERR "xt_hashlimit: unable to create slab cache\n"); + goto err2; + } +- hashlimit_procdir4 = proc_mkdir("ipt_hashlimit", init_net.proc_net); +- if (!hashlimit_procdir4) { +- printk(KERN_ERR "xt_hashlimit: unable to create proc dir " +- "entry\n"); ++ err = init_xt_hashlimit(); ++ if (err) + goto err3; +- } +- err = 0; +-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +- hashlimit_procdir6 = 
proc_mkdir("ip6t_hashlimit", init_net.proc_net); +- if (!hashlimit_procdir6) { +- printk(KERN_ERR "xt_hashlimit: unable to create proc dir " +- "entry\n"); +- err = -ENOMEM; +- } +-#endif + if (!err) + return 0; +- remove_proc_entry("ipt_hashlimit", init_net.proc_net); + err3: + kmem_cache_destroy(hashlimit_cachep); + err2: +@@ -1026,10 +1106,7 @@ err1: + + static void __exit hashlimit_mt_exit(void) + { +- remove_proc_entry("ipt_hashlimit", init_net.proc_net); +-#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) +- remove_proc_entry("ip6t_hashlimit", init_net.proc_net); +-#endif ++ fini_xt_hashlimit(); + kmem_cache_destroy(hashlimit_cachep); + xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg)); + } +diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c +index aad9ab8..91570c7 100644 +--- a/net/netfilter/xt_limit.c ++++ b/net/netfilter/xt_limit.c +@@ -105,7 +105,7 @@ limit_mt_check(const char *tablename, const void *inf, + /* Check for overflow. */ + if (r->burst == 0 + || user2credits(r->avg * r->burst) < user2credits(r->avg)) { +- printk("Overflow in xt_limit, try lower: %u/%u\n", ++ ve_printk(VE_LOG, "Overflow in xt_limit, try lower: %u/%u\n", + r->avg, r->burst); + return false; + } +diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c +index 349aba1..d30766c 100644 +--- a/net/netlink/af_netlink.c ++++ b/net/netlink/af_netlink.c +@@ -60,29 +60,14 @@ + #include + #include + #include ++#include ++ ++#include ++#include + + #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) + #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) + +-struct netlink_sock { +- /* struct sock has to be the first member of netlink_sock */ +- struct sock sk; +- u32 pid; +- u32 dst_pid; +- u32 dst_group; +- u32 flags; +- u32 subscriptions; +- u32 ngroups; +- unsigned long *groups; +- unsigned long state; +- wait_queue_head_t wait; +- struct netlink_callback *cb; +- struct mutex *cb_mutex; +- struct mutex cb_def_mutex; +- void (*netlink_rcv)(struct sk_buff *skb); +- struct module *module; +-}; +- + #define NETLINK_KERNEL_SOCKET 0x1 + #define NETLINK_RECV_PKTINFO 0x2 + +@@ -401,6 +386,8 @@ static int __netlink_create(struct net *net, struct socket *sock, + sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto); + if (!sk) + return -ENOMEM; ++ if (ub_other_sock_charge(sk)) ++ goto out_free; + + sock_init_data(sock, sk); + +@@ -416,6 +403,10 @@ static int __netlink_create(struct net *net, struct socket *sock, + sk->sk_destruct = netlink_sock_destruct; + sk->sk_protocol = protocol; + return 0; ++ ++out_free: ++ sk_free(sk); ++ return -ENOMEM; + } + + static int netlink_create(struct net *net, struct socket *sock, int protocol) +@@ -522,7 +513,7 @@ static int netlink_autobind(struct socket *sock) + struct hlist_head *head; + struct sock *osk; + struct hlist_node *node; +- s32 pid = current->tgid; ++ s32 pid = task_tgid_vnr(current); + int err; + static s32 rover = -4097; + +@@ -558,7 +549,7 @@ retry: + static inline int netlink_capable(struct socket *sock, unsigned int flag) + { + return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) || +- capable(CAP_NET_ADMIN); ++ capable(CAP_VE_NET_ADMIN); + } + + static void +@@ -763,12 +754,20 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, + long *timeo, struct sock *ssk) + { + struct netlink_sock *nlk; ++ unsigned long chargesize; ++ int no_ubc; + + nlk = nlk_sk(sk); + +- if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || ++ chargesize = 
skb_charge_fullsize(skb); ++ no_ubc = ub_sock_getwres_other(sk, chargesize); ++ if (no_ubc || atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + test_bit(0, &nlk->state)) { + DECLARE_WAITQUEUE(wait, current); ++ ++ if (!no_ubc) ++ ub_sock_retwres_other(sk, chargesize, ++ SOCK_MIN_UBCSPACE_CH); + if (!*timeo) { + if (!ssk || netlink_is_kernel(ssk)) + netlink_overrun(sk); +@@ -780,13 +779,20 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, + __set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&nlk->wait, &wait); + ++ /* this if can't be moved upper because ub_sock_snd_queue_add() ++ * may change task state to TASK_RUNNING */ ++ if (no_ubc) ++ ub_sock_sndqueueadd_other(sk, chargesize); ++ + if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || +- test_bit(0, &nlk->state)) && ++ test_bit(0, &nlk->state) || no_ubc) && + !sock_flag(sk, SOCK_DEAD)) + *timeo = schedule_timeout(*timeo); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&nlk->wait, &wait); ++ if (no_ubc) ++ ub_sock_sndqueuedel(sk); + sock_put(sk); + + if (signal_pending(current)) { +@@ -796,6 +802,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, + return 1; + } + skb_set_owner_r(skb, sk); ++ ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF); + return 0; + } + +@@ -961,6 +968,9 @@ static inline int do_one_broadcast(struct sock *sk, + !test_bit(p->group - 1, nlk->groups)) + goto out; + ++ if (!ve_accessible_strict(get_exec_env(), sk->owner_env)) ++ goto out; ++ + if (!net_eq(sock_net(sk), p->net)) + goto out; + +@@ -1530,6 +1540,10 @@ static int netlink_dump(struct sock *sk) + skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL); + if (!skb) + goto errout; ++ if (ub_nlrcvbuf_charge(skb, sk) < 0) { ++ kfree_skb(skb); ++ return -EACCES; ++ } + + mutex_lock(nlk->cb_mutex); + +diff --git a/net/netlink/attr.c b/net/netlink/attr.c +index 2d106cf..d9846a4 100644 +--- a/net/netlink/attr.c ++++ b/net/netlink/attr.c +@@ -164,7 +164,7 @@ int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, + } + + if (unlikely(rem > 0)) +- printk(KERN_WARNING "netlink: %d bytes leftover after parsing " ++ ve_printk(VE_LOG, KERN_WARNING "netlink: %d bytes leftover after parsing " + "attributes.\n", rem); + + err = 0; +diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c +index 3e1191c..f5c0578 100644 +--- a/net/netlink/genetlink.c ++++ b/net/netlink/genetlink.c +@@ -437,7 +437,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) + return -EOPNOTSUPP; + + if ((ops->flags & GENL_ADMIN_PERM) && +- security_netlink_recv(skb, CAP_NET_ADMIN)) ++ security_netlink_recv(skb, CAP_VE_NET_ADMIN)) + return -EPERM; + + if (nlh->nlmsg_flags & NLM_F_DUMP) { +diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c +index 2cee87d..1b7fbf4 100644 +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -80,6 +80,8 @@ + #include + #include + ++#include ++ + #ifdef CONFIG_INET + #include + #endif +@@ -454,6 +456,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet + if (dev_net(dev) != sock_net(sk)) + goto drop; + ++ skb_orphan(skb); ++ + skb->dev = dev; + + if (dev->header_ops) { +@@ -517,6 +521,9 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet + if (pskb_trim(skb, snaplen)) + goto drop_n_acct; + ++ if (ub_sockrcvbuf_charge(sk, skb)) ++ goto drop_n_acct; ++ + skb_set_owner_r(skb, sk); + skb->dev = NULL; + dst_release(skb->dst); +@@ -571,6 +578,8 @@ static int 
tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe + if (dev_net(dev) != sock_net(sk)) + goto drop; + ++ skb_orphan(skb); ++ + if (dev->header_ops) { + if (sk->sk_type != SOCK_DGRAM) + skb_push(skb, skb->data - skb_mac_header(skb)); +@@ -617,6 +626,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe + snaplen = 0; + } + ++ if (copy_skb && ++ ub_sockrcvbuf_charge(sk, copy_skb)) { ++ spin_lock(&sk->sk_receive_queue.lock); ++ goto ring_is_full; ++ } ++ + spin_lock(&sk->sk_receive_queue.lock); + h = packet_lookup_frame(po, po->head); + +@@ -982,6 +997,8 @@ static int packet_create(struct net *net, struct socket *sock, int protocol) + sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); + if (sk == NULL) + goto out; ++ if (ub_other_sock_charge(sk)) ++ goto out_free; + + sock->ops = &packet_ops; + if (sock->type == SOCK_PACKET) +@@ -1019,6 +1036,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol) + sk_add_node(sk, &net->packet.sklist); + write_unlock_bh(&net->packet.sklist_lock); + return(0); ++ ++out_free: ++ sk_free(sk); + out: + return err; + } +diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c +index 2a3c97f..9450c72 100644 +--- a/net/sched/sch_cbq.c ++++ b/net/sched/sch_cbq.c +@@ -905,8 +905,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio) + + if (cl->deficit <= 0) { + q->active[prio] = cl; +- cl = cl->next_alive; + cl->deficit += cl->quantum; ++ cl = cl->next_alive; + } + return skb; + +@@ -1078,17 +1078,19 @@ static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) + + for (h=0; h<16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { ++ long mtu; + /* BUGGGG... Beware! This expression suffer of + arithmetic overflows! + */ + if (cl->priority == prio) { +- cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ +- q->quanta[prio]; +- } +- if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) { +- printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum); +- cl->quantum = cl->qdisc->dev->mtu/2 + 1; ++ cl->quantum = (cl->weight * cl->allot) / ++ (q->quanta[prio] / q->nclasses[prio]); + } ++ mtu = cl->qdisc->dev->mtu; ++ if (cl->quantum <= mtu/2) ++ cl->quantum = mtu/2 + 1; ++ else if (cl->quantum > 32*mtu) ++ cl->quantum = 32*mtu; + } + } + } +diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c +index 13afa72..1365604 100644 +--- a/net/sched/sch_generic.c ++++ b/net/sched/sch_generic.c +@@ -141,11 +141,13 @@ static inline int qdisc_restart(struct net_device *dev) + struct Qdisc *q = dev->qdisc; + struct sk_buff *skb; + int ret = NETDEV_TX_BUSY; ++ struct ve_struct *old_ve; + + /* Dequeue packet */ + if (unlikely((skb = dev_dequeue_skb(dev, q)) == NULL)) + return 0; + ++ old_ve = set_exec_env(skb->owner_env); + + /* And release queue */ + spin_unlock(&dev->queue_lock); +@@ -179,6 +181,8 @@ static inline int qdisc_restart(struct net_device *dev) + break; + } + ++ (void)set_exec_env(old_ve); ++ + return ret; + } + +diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c +index 0444fd0..57e0989 100644 +--- a/net/sched/sch_teql.c ++++ b/net/sched/sch_teql.c +@@ -174,6 +174,9 @@ static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt) + struct teql_master *m = (struct teql_master*)sch->ops; + struct teql_sched_data *q = qdisc_priv(sch); + ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ + if (dev->hard_header_len > m->dev->hard_header_len) + return -EINVAL; + +diff --git a/net/socket.c b/net/socket.c +index 
+--- a/net/socket.c
++++ b/net/socket.c
+@@ -84,6 +84,7 @@
+ #include
+ #include
+ #include
++#include
+ #include
+
+ #include
+@@ -159,15 +160,6 @@ static DEFINE_PER_CPU(int, sockets_in_use) = 0;
+  * divide and look after the messy bits.
+  */
+
+-#define MAX_SOCK_ADDR	128	/* 108 for Unix domain -
+-				   16 for IP, 16 for IPX,
+-				   24 for IPv6,
+-				   about 80 for AX.25
+-				   must be at least one bigger than
+-				   the AF_UNIX size (see net/unix/af_unix.c
+-				   :unix_mkname()).
+-				 */
+-
+ /**
+  * move_addr_to_kernel - copy a socket address into kernel space
+  * @uaddr: Address in user space
+@@ -189,6 +181,7 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr)
+		return -EFAULT;
+	return audit_sockaddr(ulen, kaddr);
+ }
++EXPORT_SYMBOL(move_addr_to_kernel);
+
+ /**
+  * move_addr_to_user - copy an address to user space
+@@ -496,6 +489,8 @@ static struct socket *sock_alloc(void)
+	return sock;
+ }
+
++EXPORT_SYMBOL(sock_alloc);
++
+ /*
+  * In theory you can't get an open on this inode, but /proc provides
+  * a back door. Remember to keep it shut otherwise you'll let the
+@@ -1090,6 +1085,49 @@ call_kill:
+	return 0;
+ }
+
++int vz_security_family_check(int family)
++{
++#ifdef CONFIG_VE
++	if (ve_is_super(get_exec_env()))
++		return 0;
++
++	switch (family) {
++	case PF_UNSPEC:
++	case PF_PACKET:
++	case PF_NETLINK:
++	case PF_UNIX:
++	case PF_INET:
++	case PF_INET6:
++		break;
++	default:
++		return -EAFNOSUPPORT;
++	}
++#endif
++	return 0;
++}
++EXPORT_SYMBOL_GPL(vz_security_family_check);
++
++int vz_security_protocol_check(int protocol)
++{
++#ifdef CONFIG_VE
++	if (ve_is_super(get_exec_env()))
++		return 0;
++
++	switch (protocol) {
++	case IPPROTO_IP:
++	case IPPROTO_TCP:
++	case IPPROTO_UDP:
++	case IPPROTO_RAW:
++	case IPPROTO_DCCP:
++		break;
++	default:
++		return -EAFNOSUPPORT;
++	}
++#endif
++	return 0;
++}
++EXPORT_SYMBOL_GPL(vz_security_protocol_check);
++
+ static int __sock_create(struct net *net, int family, int type, int protocol,
+			 struct socket **res, int kern)
+ {
+@@ -1120,6 +1158,11 @@ static int __sock_create(struct net *net, int family, int type, int protocol,
+		family = PF_PACKET;
+	}
+
++	/* VZ compatibility layer */
++	err = vz_security_family_check(family);
++	if (err < 0)
++		return err;
++
+	err = security_socket_create(family, type, protocol, kern);
+	if (err)
+		return err;
+@@ -2314,9 +2357,12 @@ int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg)
+ {
+	mm_segment_t oldfs = get_fs();
+	int err;
++	struct ve_struct *old_env;
+
+	set_fs(KERNEL_DS);
++	old_env = set_exec_env(get_ve0());
+	err = sock->ops->ioctl(sock, cmd, arg);
++	(void)set_exec_env(old_env);
+	set_fs(oldfs);
+
+	return err;
+ }
+diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
+index 6eab9bf..4fba93a 100644
+--- a/net/sunrpc/sched.c
++++ b/net/sunrpc/sched.c
+@@ -615,7 +615,9 @@ void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata)
+ static void __rpc_execute(struct rpc_task *task)
+ {
+	int status = 0;
++	struct ve_struct *env;
+
++	env = set_exec_env(get_ve0());
+	dprintk("RPC: %5u __rpc_execute flags=0x%x\n",
+			task->tk_pid, task->tk_flags);
+
+@@ -693,6 +695,7 @@ static void __rpc_execute(struct rpc_task *task)
+			task->tk_status);
+	/* Release all resources associated with the task */
+	rpc_release_task(task);
++	(void)set_exec_env(env);
+ }
+
+ /*
+diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
+index 3e65719..029c673 100644
+--- a/net/sunrpc/svcsock.c
++++ b/net/sunrpc/svcsock.c
+@@ -178,6 +178,9 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
+	unsigned int pglen = xdr->page_len;
+	unsigned int flags = MSG_MORE;
+	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
++	struct ve_struct *old_env;
++
++	old_env = set_exec_env(get_ve0());
+
+	slen = xdr->len;
+
+@@ -238,6 +241,8 @@ out:
+		svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
+		xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
+
++	(void)set_exec_env(old_env);
++
+	return len;
+ }
+
+@@ -316,11 +321,14 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
+		.msg_flags	= MSG_DONTWAIT,
+	};
+	int len;
++	struct ve_struct *old_env;
+
+	rqstp->rq_xprt_hlen = 0;
+
++	old_env = set_exec_env(get_ve0());
+	len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen,
+				msg.msg_flags);
++	(void)set_exec_env(old_env);
+
+	dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
+		svsk, iov[0].iov_base, iov[0].iov_len, len);
+@@ -719,11 +727,13 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
+	struct svc_sock	*newsvsk;
+	int		err, slen;
+	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
++	struct ve_struct *old_env;
+
+	dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
+	if (!sock)
+		return NULL;
+
++	old_env = set_exec_env(get_ve0());
+	clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
+	err = kernel_accept(sock, &newsock, O_NONBLOCK);
+	if (err < 0) {
+@@ -733,7 +743,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
+		else if (err != -EAGAIN && net_ratelimit())
+			printk(KERN_WARNING "%s: accept failed (err %d)!\n",
+				   serv->sv_name, -err);
+-		return NULL;
++		goto restore;
+	}
+	set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
+
+@@ -774,6 +784,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
+	}
+	svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
+
++	(void)set_exec_env(old_env);
++
+	if (serv->sv_stats)
+		serv->sv_stats->nettcpconn++;
+
+@@ -781,6 +793,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
+
+ failed:
+	sock_release(newsock);
++restore:
++	(void)set_exec_env(old_env);
+	return NULL;
+ }
+
+@@ -1211,6 +1225,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
+	struct sockaddr *newsin = (struct sockaddr *)&addr;
+	int		newlen;
+	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
++	struct ve_struct *old_env;
+
+	dprintk("svc: svc_create_socket(%s, %d, %s)\n",
+			serv->sv_program->pg_name, protocol,
+@@ -1223,9 +1238,10 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
+	}
+	type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
+
++	old_env = set_exec_env(get_ve0());
+	error = sock_create_kern(sin->sa_family, type, protocol, &sock);
+	if (error < 0)
+-		return ERR_PTR(error);
++		goto restore;
+
+	svc_reclassify_socket(sock);
+
+@@ -1247,12 +1263,15 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
+
+	if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) {
+		svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen);
++		(void)set_exec_env(old_env);
+		return (struct svc_xprt *)svsk;
+	}
+
+ bummer:
+	dprintk("svc: svc_create_socket error = %d\n", -error);
+	sock_release(sock);
++restore:
++	(void)set_exec_env(old_env);
+	return ERR_PTR(error);
+ }
+
+@@ -1267,6 +1286,8 @@ static void svc_sock_detach(struct svc_xprt *xprt)
+
+	dprintk("svc: svc_sock_detach(%p)\n", svsk);
+
++	/* XXX: serialization? */
++	sk->sk_user_data = NULL;
+	/* put back the old socket callbacks */
+	sk->sk_state_change = svsk->sk_ostate;
+	sk->sk_data_ready = svsk->sk_odata;
+diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
+index 783317d..39d2173 100644
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -117,6 +117,9 @@
+ #include
+ #include
+
++#include
++#include
++
+ static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
+ static DEFINE_SPINLOCK(unix_table_lock);
+ static atomic_t unix_nr_socks = ATOMIC_INIT(0);
+@@ -593,6 +596,8 @@ static struct sock * unix_create1(struct net *net, struct socket *sock)
+	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
+	if (!sk)
+		goto out;
++	if (ub_other_sock_charge(sk))
++		goto out_sk_free;
+
+	sock_init_data(sock,sk);
+	lockdep_set_class(&sk->sk_receive_queue.lock,
+@@ -614,6 +619,9 @@ out:
+	if (sk == NULL)
+		atomic_dec(&unix_nr_socks);
+	return sk;
++out_sk_free:
++	sk_free(sk);
++	return NULL;
+ }
+
+ static int unix_create(struct net *net, struct socket *sock, int protocol)
+@@ -1015,6 +1023,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+	int st;
+	int err;
+	long timeo;
++	unsigned long chargesize;
+
+	err = unix_mkname(sunaddr, addr_len, &hash);
+	if (err < 0)
+@@ -1043,6 +1052,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
+	if (skb == NULL)
+		goto out;
++	chargesize = skb_charge_fullsize(skb);
++	if (ub_sock_getwres_other(newsk, chargesize) < 0)
++		goto out;
++	ub_skb_set_charge(skb, newsk, chargesize, UB_OTHERSOCKBUF);
+
+ restart:
+	/*  Find listening sock. */
+@@ -1290,7 +1303,7 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+		unix_notinflight(scm->fp->fp[i]);
+ }
+
+-static void unix_destruct_fds(struct sk_buff *skb)
++void unix_destruct_fds(struct sk_buff *skb)
+ {
+	struct scm_cookie scm;
+	memset(&scm, 0, sizeof(scm));
+@@ -1301,6 +1314,7 @@ static void unix_destruct_fds(struct sk_buff *skb)
+	scm_destroy(&scm);
+	sock_wfree(skb);
+ }
++EXPORT_SYMBOL_GPL(unix_destruct_fds);
+
+ static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+ {
+@@ -1512,6 +1526,16 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
+
+		size = len-sent;
+
++		if (msg->msg_flags & MSG_DONTWAIT)
++			ub_sock_makewres_other(sk, skb_charge_size(size));
++		if (sock_bc(sk) != NULL &&
++				sock_bc(sk)->poll_reserv >=
++					SOCK_MIN_UBCSPACE &&
++				skb_charge_size(size) >
++					sock_bc(sk)->poll_reserv)
++			size = skb_charge_datalen(sock_bc(sk)->poll_reserv);
++
++
+		/* Keep two messages in the pipe so it schedules better */
+		if (size > ((sk->sk_sndbuf >> 1) - 64))
+			size = (sk->sk_sndbuf >> 1) - 64;
+@@ -1523,7 +1547,9 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
+		 *	Grab a buffer
+		 */
+
+-		skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
++
++		skb = sock_alloc_send_skb2(sk, size, SOCK_MIN_UBCSPACE,
++				msg->msg_flags&MSG_DONTWAIT, &err);
+
+		if (skb==NULL)
+			goto out_err;
+@@ -1963,6 +1989,7 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl
+ {
+	struct sock *sk = sock->sk;
+	unsigned int mask;
++	int no_ub_res;
+
+	poll_wait(file, sk->sk_sleep, wait);
+	mask = 0;
+@@ -1975,6 +2002,10 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		mask |= POLLRDHUP;
+
++	no_ub_res = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH);
++	if (no_ub_res)
++		ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH);
++
+	/* readable? */
+	if (!skb_queue_empty(&sk->sk_receive_queue) ||
+	    (sk->sk_shutdown & RCV_SHUTDOWN))
+@@ -1988,7 +2019,7 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl
+	 * we set writable also when the other side has shut down the
+	 * connection. This prevents stuck sockets.
+	 */
+-	if (unix_writable(sk))
++	if (!no_ub_res && unix_writable(sk))
+		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+	return mask;
+diff --git a/net/unix/garbage.c b/net/unix/garbage.c
+index ebdff3d..1ed511f 100644
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -80,6 +80,7 @@
+ #include
+ #include
+ #include
++#include
+
+ #include
+ #include
+@@ -151,6 +152,7 @@ void unix_notinflight(struct file *fp)
+		spin_unlock(&unix_gc_lock);
+	}
+ }
++EXPORT_SYMBOL_GPL(unix_notinflight);
+
+ static inline struct sk_buff *sock_queue_head(struct sock *sk)
+ {
+diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
+index cae9fd8..420b756 100644
+--- a/net/xfrm/xfrm_policy.c
++++ b/net/xfrm/xfrm_policy.c
+@@ -2360,9 +2360,6 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void
+ {
+	struct net_device *dev = ptr;
+
+-	if (dev_net(dev) != &init_net)
+-		return NOTIFY_DONE;
+-
+	switch (event) {
+	case NETDEV_DOWN:
+		xfrm_flush_bundles();
+diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
+index 04c4150..aa0bad6 100644
+--- a/net/xfrm/xfrm_user.c
++++ b/net/xfrm/xfrm_user.c
+@@ -1947,7 +1947,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+	link = &xfrm_dispatch[type];
+
+	/* All operations require privileges, even GET */
+-	if (security_netlink_recv(skb, CAP_NET_ADMIN))
++	if (security_netlink_recv(skb, CAP_VE_NET_ADMIN))
+		return -EPERM;
+
+	if ((type == (XFRM_MSG_GETSA - XFRM_MSG_BASE) ||
+diff --git a/security/Kconfig b/security/Kconfig
+index 49b51f9..f2a0ec8 100644
+--- a/security/Kconfig
++++ b/security/Kconfig
+@@ -41,7 +41,7 @@ config KEYS_DEBUG_PROC_KEYS
+
+ config SECURITY
+	bool "Enable different security models"
+-	depends on SYSFS
++	depends on SYSFS && !VE
+	help
+	  This allows you to choose different security modules to be
+	  configured into your kernel.
+diff --git a/security/commoncap.c b/security/commoncap.c
+index 33d3433..b42b99a 100644
+--- a/security/commoncap.c
++++ b/security/commoncap.c
+@@ -35,6 +35,10 @@ int cap_netlink_send(struct sock *sk, struct sk_buff *skb)
+
+ int cap_netlink_recv(struct sk_buff *skb, int cap)
+ {
++	if (likely(cap == CAP_VE_NET_ADMIN) &&
++			cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN))
++		return 0;
++
+	if (!cap_raised(NETLINK_CB(skb).eff_cap, cap))
+		return -EPERM;
+	return 0;
+@@ -399,7 +403,7 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
+		return 0;
+	} else if (!strncmp(name, XATTR_SECURITY_PREFIX,
+		     sizeof(XATTR_SECURITY_PREFIX) - 1) &&
+-	    !capable(CAP_SYS_ADMIN))
++	    !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN))
+		return -EPERM;
+	return 0;
+ }
+@@ -412,7 +416,7 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name)
+		return 0;
+	} else if (!strncmp(name, XATTR_SECURITY_PREFIX,
+		     sizeof(XATTR_SECURITY_PREFIX) - 1) &&
+-	    !capable(CAP_SYS_ADMIN))
++	    !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN))
+		return -EPERM;
+	return 0;
+ }
+@@ -675,7 +679,7 @@ void cap_task_reparent_to_init (struct task_struct *p)
+
+ int cap_syslog (int type)
+ {
+-	if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN))
++	if ((type != 3 && type != 10) && !capable(CAP_VE_SYS_ADMIN))
+		return -EPERM;
+	return 0;
+ }
+diff --git a/security/device_cgroup.c b/security/device_cgroup.c
+index ddd92ce..d1da90a 100644
+--- a/security/device_cgroup.c
++++ b/security/device_cgroup.c
+@@ -10,11 +10,23 @@
+ #include
+ #include
+ #include
++#include
++#include
++#include
+
+ #define ACC_MKNOD 1
+ #define ACC_READ  2
+ #define ACC_WRITE 4
+-#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE)
++#define ACC_QUOTA 8
++#define ACC_HIDDEN 16
++#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE | ACC_QUOTA)
++
++static inline int convert_bits(int acc)
++{
++	/* ...10x <-> ...01x trial: guess why */
++	return ((((acc & 06) == 00) || ((acc & 06) == 06)) ? acc : acc ^06) &
++		(ACC_READ | ACC_WRITE | ACC_QUOTA);
++}
+
+ #define DEV_BLOCK 1
+ #define DEV_CHAR  2
+@@ -73,6 +85,38 @@ static int devcgroup_can_attach(struct cgroup_subsys *ss,
+ /*
+  * called under cgroup_lock()
+  */
++#ifdef CONFIG_VE
++static struct dev_whitelist_item default_whitelist_items[] = {
++	{ ~0, ~0, DEV_ALL, ACC_MKNOD },
++	{ UNIX98_PTY_MASTER_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ UNIX98_PTY_SLAVE_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ PTY_MASTER_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ PTY_SLAVE_MAJOR, ~0, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ MEM_MAJOR, /* null */ 3, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ MEM_MAJOR, /* zero */ 5, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ MEM_MAJOR, /* full */ 7, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ TTYAUX_MAJOR, /* tty */ 0, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ TTYAUX_MAJOR, /* ptmx */ 2, DEV_CHAR, ACC_READ | ACC_WRITE },
++	{ MEM_MAJOR, /* random */ 8, DEV_CHAR, ACC_READ },
++	{ MEM_MAJOR, /* urandom */ 9, DEV_CHAR, ACC_READ },
++};
++
++static LIST_HEAD(default_perms);
++#define parent_whitelist(p)	(&default_perms)
++static void prepare_def_perms(void)
++{
++	int i;
++
++	for (i = 0; i < ARRAY_SIZE(default_whitelist_items); i++) {
++		default_whitelist_items[i].access |= ACC_HIDDEN;
++		list_add(&default_whitelist_items[i].list, &default_perms);
++	}
++}
++#else
++#define prepare_def_perms()	do { } while(0)
++#define parent_whitelist(p)	(&parent_dev_cgroup->whitelist)
++#endif
++
+ static int dev_whitelist_copy(struct list_head *dest, struct list_head *orig)
+ {
+	struct dev_whitelist_item *wh, *tmp, *new;
+@@ -188,12 +232,14 @@ static struct cgroup_subsys_state *devcgroup_create(struct cgroup_subsys *ss,
+		}
+		wh->minor = wh->major = ~0;
+		wh->type = DEV_ALL;
+-		wh->access = ACC_MKNOD | ACC_READ | ACC_WRITE;
++		wh->access = ACC_MKNOD | ACC_READ | ACC_WRITE | ACC_QUOTA;
+		list_add(&wh->list, &dev_cgroup->whitelist);
++
++		prepare_def_perms();
+	} else {
+		parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup);
+		ret = dev_whitelist_copy(&dev_cgroup->whitelist,
+-				&parent_dev_cgroup->whitelist);
++				parent_whitelist(parent_dev_cgroup));
+		if (ret) {
+			kfree(dev_cgroup);
+			return ERR_PTR(ret);
+@@ -269,8 +315,14 @@ static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
+		set_access(acc, wh->access);
+		set_majmin(maj, wh->major);
+		set_majmin(min, wh->minor);
+-		seq_printf(m, "%c %s:%s %s\n", type_to_char(wh->type),
+-			   maj, min, acc);
++
++		if (cft != NULL)
++			seq_printf(m, "%c %s:%s %s\n", type_to_char(wh->type),
++					maj, min, acc);
++		else if (!(wh->access & ACC_HIDDEN))
++			seq_printf(m, "%10u %c %03o %s:%s\n", (int)m->private,
++					type_to_char(wh->type),
++					convert_bits(wh->access), maj, min);
+	}
+	spin_unlock(&devcgroup->lock);
+
+@@ -520,16 +572,13 @@ struct cgroup_subsys devices_subsys = {
+	.subsys_id = devices_subsys_id,
+ };
+
+-int devcgroup_inode_permission(struct inode *inode, int mask)
++static int __devcgroup_inode_permission(int blk, dev_t device, int mask)
+ {
+	struct dev_cgroup *dev_cgroup;
+	struct dev_whitelist_item *wh;
+
+-	dev_t device = inode->i_rdev;
+	if (!device)
+		return 0;
+-	if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))
+-		return 0;
+	dev_cgroup = css_to_devcgroup(task_subsys_state(current,
+				devices_subsys_id));
+	if (!dev_cgroup)
+@@ -539,19 +588,21 @@ int devcgroup_inode_permission(struct inode *inode, int mask)
+	list_for_each_entry(wh, &dev_cgroup->whitelist, list) {
+		if (wh->type & DEV_ALL)
+			goto acc_check;
+-		if ((wh->type & DEV_BLOCK) && !S_ISBLK(inode->i_mode))
++		if ((wh->type & DEV_BLOCK) && !blk)
+			continue;
+-		if ((wh->type & DEV_CHAR) && !S_ISCHR(inode->i_mode))
++		if ((wh->type & DEV_CHAR) && blk)
+			continue;
+-		if (wh->major != ~0 && wh->major != imajor(inode))
++		if (wh->major != ~0 && wh->major != MAJOR(device))
+			continue;
+-		if (wh->minor != ~0 && wh->minor != iminor(inode))
++		if (wh->minor != ~0 && wh->minor != MINOR(device))
+			continue;
+ acc_check:
+		if ((mask & MAY_WRITE) && !(wh->access & ACC_WRITE))
+			continue;
+		if ((mask & MAY_READ) && !(wh->access & ACC_READ))
+			continue;
++		if ((mask & MAY_QUOTACTL) && !(wh->access & ACC_QUOTA))
++			continue;
+		spin_unlock(&dev_cgroup->lock);
+		return 0;
+	}
+@@ -560,6 +611,15 @@ acc_check:
+	spin_unlock(&dev_cgroup->lock);
+	return -EPERM;
+ }
+
++int devcgroup_inode_permission(struct inode *inode, int mask)
++{
++	if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))
++		return 0;
++
++	return __devcgroup_inode_permission(S_ISBLK(inode->i_mode),
++			inode->i_rdev, mask);
++}
++
+ int devcgroup_inode_mknod(int mode, dev_t dev)
+ {
+	struct dev_cgroup *dev_cgroup;
+@@ -591,3 +651,75 @@ acc_check:
+	spin_unlock(&dev_cgroup->lock);
+	return -EPERM;
+ }
++
++#ifdef CONFIG_VE
++int get_device_perms_ve(int dev_type, dev_t dev, int access_mode)
++{
++	int mask = 0;
++
++	mask |= (access_mode & FMODE_READ ? MAY_READ : 0);
++	mask |= (access_mode & FMODE_WRITE ? MAY_WRITE : 0);
++	mask |= (access_mode & FMODE_QUOTACTL ? MAY_QUOTACTL : 0);
++
++	return __devcgroup_inode_permission(dev_type == S_IFBLK, dev, mask);
++}
++EXPORT_SYMBOL(get_device_perms_ve);
++
++int set_device_perms_ve(struct ve_struct *ve,
++		unsigned type, dev_t dev, unsigned mask)
++{
++	int err = -EINVAL;
++	struct dev_whitelist_item *new;
++
++	new = kzalloc(sizeof(*new), GFP_KERNEL);
++	if (new == NULL)
++		return -ENOMEM;
++
++	if ((type & S_IFMT) == S_IFBLK)
++		new->type = DEV_BLOCK;
++	else if ((type & S_IFMT) == S_IFCHR)
++		new->type = DEV_CHAR;
++	else
++		goto out;
++
++	new->access = convert_bits(mask);
++	new->major = new->minor = ~0;
++
++	switch (type & VE_USE_MASK) {
++	default:
++		new->minor = MINOR(dev);
++	case VE_USE_MAJOR:
++		new->major = MAJOR(dev);
++	case 0:
++		;
++	}
++
++	err = dev_whitelist_add(cgroup_to_devcgroup(ve->ve_cgroup), new);
++out:
++	if (err < 0)
++		kfree(new);
++	return err;
++}
++EXPORT_SYMBOL(set_device_perms_ve);
++
++#ifdef CONFIG_PROC_FS
++int devperms_seq_show(struct seq_file *m, void *v)
++{
++	struct ve_struct *ve = list_entry(v, struct ve_struct, ve_list);
++
++	if (m->private == (void *)0) {
++		seq_printf(m, "Version: 2.7\n");
++		m->private = (void *)-1;
++	}
++
++	if (ve_is_super(ve)) {
++		seq_printf(m, "%10u b 016 *:*\n%10u c 006 *:*\n", 0, 0);
++		return 0;
++	}
++
++	m->private = (void *)ve->veid;
++	return devcgroup_seq_read(ve->ve_cgroup, NULL, m);
++}
++EXPORT_SYMBOL(devperms_seq_show);
++#endif
++#endif
+diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig
+index a436d1c..130a8be 100644
+--- a/security/selinux/Kconfig
++++ b/security/selinux/Kconfig
+@@ -1,6 +1,6 @@
+ config SECURITY_SELINUX
+	bool "NSA SELinux Support"
+-	depends on SECURITY_NETWORK && AUDIT && NET && INET
++	depends on SECURITY_NETWORK && AUDIT && NET && INET && !VE
+	select NETWORK_SECMARK
+	default n
+	help
+diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
+index 1c864c0..263b6db 100644
+--- a/security/selinux/hooks.c
++++ b/security/selinux/hooks.c
+@@ -5186,12 +5186,12 @@ static int selinux_setprocattr(struct task_struct *p,
+		struct task_struct *g, *t;
+		struct mm_struct *mm = p->mm;
+		read_lock(&tasklist_lock);
+-		do_each_thread(g, t)
++		do_each_thread_ve(g, t)
+			if (t->mm == mm && t != p) {
+				read_unlock(&tasklist_lock);
+				return -EPERM;
+			}
+-		while_each_thread(g, t);
++		while_each_thread_ve(g, t);
+		read_unlock(&tasklist_lock);
+	}
+
diff --git a/debian/patches/series/1~experimental.1-extra b/debian/patches/series/1~experimental.1-extra
index 95a9774a4..ba786467f 100644
--- a/debian/patches/series/1~experimental.1-extra
+++ b/debian/patches/series/1~experimental.1-extra
@@ -8,6 +8,7 @@
 + features/all/xen/xenctrl-xenbus.patch featureset=xen
 + features/all/xen/xenctrl-sys-hypervisor.patch featureset=xen
++ features/all/openvz/openvz.patch featureset=openvz
 
 # m68k patches