From 19bd6fc76e08fed73895b6e39da8504741cc6b09 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Mon, 2 Feb 2015 20:36:16 +0000 Subject: [PATCH] Add the llvm patch corresponding to r278112. --- .../patch-07-llvm-r227752-boot2-shrink.diff | 1271 +++++++++++++++++ 1 file changed, 1271 insertions(+) create mode 100644 contrib/llvm/patches/patch-07-llvm-r227752-boot2-shrink.diff diff --git a/contrib/llvm/patches/patch-07-llvm-r227752-boot2-shrink.diff b/contrib/llvm/patches/patch-07-llvm-r227752-boot2-shrink.diff new file mode 100644 index 000000000000..d5650f7689ae --- /dev/null +++ b/contrib/llvm/patches/patch-07-llvm-r227752-boot2-shrink.diff @@ -0,0 +1,1271 @@ +Pull in r227752 from upstream llvm trunk (by Michael Kuperstein): + + [X86] Convert esp-relative movs of function arguments to pushes, step 2 + + This moves the transformation introduced in r223757 into a separate MI pass. + This allows it to cover many more cases (not only cases where there must be a + reserved call frame), and perform rudimentary call folding. It still doesn't + have a heuristic, so it is enabled only for optsize/minsize, with stack + alignment <= 8, where it ought to be a fairly clear win. + + (Re-commit of r227728) + + Differential Revision: http://reviews.llvm.org/D6789 + +This helps to get sys/boot/i386/boot2 below the required size again, +when optimizing with -Oz. + +Introduced here: http://svnweb.freebsd.org/changeset/base/278112 + +Index: include/llvm/Target/TargetFrameLowering.h +=================================================================== +--- include/llvm/Target/TargetFrameLowering.h ++++ include/llvm/Target/TargetFrameLowering.h +@@ -193,6 +193,11 @@ class TargetFrameLowering { + return hasReservedCallFrame(MF) || hasFP(MF); + } + ++ // needsFrameIndexResolution - Do we need to perform FI resolution for ++ // this function. Normally, this is required only when the function ++ // has any stack objects. However, targets may want to override this. ++ virtual bool needsFrameIndexResolution(const MachineFunction &MF) const; ++ + /// getFrameIndexOffset - Returns the displacement from the frame register to + /// the stack frame of the specified index. + virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const; +Index: lib/CodeGen/PrologEpilogInserter.cpp +=================================================================== +--- lib/CodeGen/PrologEpilogInserter.cpp ++++ lib/CodeGen/PrologEpilogInserter.cpp +@@ -703,7 +703,8 @@ void PEI::insertPrologEpilogCode(MachineFunction & + /// register references and actual offsets. + /// + void PEI::replaceFrameIndices(MachineFunction &Fn) { +- if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do? ++ const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); ++ if (!TFI.needsFrameIndexResolution(Fn)) return; + + // Store SPAdj at exit of a basic block. + SmallVector SPState; +@@ -769,13 +770,6 @@ void PEI::replaceFrameIndices(MachineBasicBlock *B + continue; + } + +- // If we are looking at a call sequence, we need to keep track of +- // the SP adjustment made by each instruction in the sequence. +- // This includes both the frame setup/destroy pseudos (handled above), +- // as well as other instructions that have side effects w.r.t the SP. 
+- if (InsideCallSequence) +- SPAdj += TII.getSPAdjust(I); +- + MachineInstr *MI = I; + bool DoIncr = true; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { +@@ -854,6 +848,16 @@ void PEI::replaceFrameIndices(MachineBasicBlock *B + break; + } + ++ // If we are looking at a call sequence, we need to keep track of ++ // the SP adjustment made by each instruction in the sequence. ++ // This includes both the frame setup/destroy pseudos (handled above), ++ // as well as other instructions that have side effects w.r.t the SP. ++ // Note that this must come after eliminateFrameIndex, because ++ // if I itself referred to a frame index, we shouldn't count its own ++ // adjustment. ++ if (MI && InsideCallSequence) ++ SPAdj += TII.getSPAdjust(MI); ++ + if (DoIncr && I != BB->end()) ++I; + + // Update register states. +Index: lib/CodeGen/TargetFrameLoweringImpl.cpp +=================================================================== +--- lib/CodeGen/TargetFrameLoweringImpl.cpp ++++ lib/CodeGen/TargetFrameLoweringImpl.cpp +@@ -42,3 +42,8 @@ int TargetFrameLowering::getFrameIndexReference(co + FrameReg = RI->getFrameRegister(MF); + return getFrameIndexOffset(MF, FI); + } ++ ++bool TargetFrameLowering::needsFrameIndexResolution( ++ const MachineFunction &MF) const { ++ return MF.getFrameInfo()->hasStackObjects(); ++} +Index: lib/Target/X86/CMakeLists.txt +=================================================================== +--- lib/Target/X86/CMakeLists.txt ++++ lib/Target/X86/CMakeLists.txt +@@ -14,6 +14,7 @@ add_public_tablegen_target(X86CommonTableGen) + + set(sources + X86AsmPrinter.cpp ++ X86CallFrameOptimization.cpp + X86FastISel.cpp + X86FloatingPoint.cpp + X86FrameLowering.cpp +Index: lib/Target/X86/X86.h +=================================================================== +--- lib/Target/X86/X86.h ++++ lib/Target/X86/X86.h +@@ -67,6 +67,11 @@ FunctionPass *createX86PadShortFunctions(); + /// to eliminate execution delays in some Atom processors. + FunctionPass *createX86FixupLEAs(); + ++/// createX86CallFrameOptimization - Return a pass that optimizes ++/// the code-size of x86 call sequences. This is done by replacing ++/// esp-relative movs with pushes. ++FunctionPass *createX86CallFrameOptimization(); ++ + } // End llvm namespace + + #endif +Index: lib/Target/X86/X86CallFrameOptimization.cpp +=================================================================== +--- lib/Target/X86/X86CallFrameOptimization.cpp ++++ lib/Target/X86/X86CallFrameOptimization.cpp +@@ -0,0 +1,400 @@ ++//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===// ++// ++// The LLVM Compiler Infrastructure ++// ++// This file is distributed under the University of Illinois Open Source ++// License. See LICENSE.TXT for details. ++// ++//===----------------------------------------------------------------------===// ++// ++// This file defines a pass that optimizes call sequences on x86. ++// Currently, it converts movs of function parameters onto the stack into ++// pushes. This is beneficial for two main reasons: ++// 1) The push instruction encoding is much smaller than an esp-relative mov ++// 2) It is possible to push memory arguments directly. So, if the ++// the transformation is preformed pre-reg-alloc, it can help relieve ++// register pressure. 
++// ++//===----------------------------------------------------------------------===// ++ ++#include ++ ++#include "X86.h" ++#include "X86InstrInfo.h" ++#include "X86Subtarget.h" ++#include "X86MachineFunctionInfo.h" ++#include "llvm/ADT/Statistic.h" ++#include "llvm/CodeGen/MachineFunctionPass.h" ++#include "llvm/CodeGen/MachineInstrBuilder.h" ++#include "llvm/CodeGen/MachineRegisterInfo.h" ++#include "llvm/CodeGen/Passes.h" ++#include "llvm/IR/Function.h" ++#include "llvm/Support/Debug.h" ++#include "llvm/Support/raw_ostream.h" ++#include "llvm/Target/TargetInstrInfo.h" ++ ++using namespace llvm; ++ ++#define DEBUG_TYPE "x86-cf-opt" ++ ++cl::opt NoX86CFOpt("no-x86-call-frame-opt", ++ cl::desc("Avoid optimizing x86 call frames for size"), ++ cl::init(false), cl::Hidden); ++ ++namespace { ++class X86CallFrameOptimization : public MachineFunctionPass { ++public: ++ X86CallFrameOptimization() : MachineFunctionPass(ID) {} ++ ++ bool runOnMachineFunction(MachineFunction &MF) override; ++ ++private: ++ bool shouldPerformTransformation(MachineFunction &MF); ++ ++ bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator I); ++ ++ MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup, ++ unsigned Reg); ++ ++ const char *getPassName() const override { ++ return "X86 Optimize Call Frame"; ++ } ++ ++ const TargetInstrInfo *TII; ++ const TargetFrameLowering *TFL; ++ const MachineRegisterInfo *MRI; ++ static char ID; ++}; ++ ++char X86CallFrameOptimization::ID = 0; ++} ++ ++FunctionPass *llvm::createX86CallFrameOptimization() { ++ return new X86CallFrameOptimization(); ++} ++ ++// This checks whether the transformation is legal and profitable ++bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) { ++ if (NoX86CFOpt.getValue()) ++ return false; ++ ++ // We currently only support call sequences where *all* parameters. ++ // are passed on the stack. ++ // No point in running this in 64-bit mode, since some arguments are ++ // passed in-register in all common calling conventions, so the pattern ++ // we're looking for will never match. ++ const X86Subtarget &STI = MF.getTarget().getSubtarget(); ++ if (STI.is64Bit()) ++ return false; ++ ++ // You would expect straight-line code between call-frame setup and ++ // call-frame destroy. You would be wrong. There are circumstances (e.g. ++ // CMOV_GR8 expansion of a select that feeds a function call!) where we can ++ // end up with the setup and the destroy in different basic blocks. ++ // This is bad, and breaks SP adjustment. ++ // So, check that all of the frames in the function are closed inside ++ // the same block, and, for good measure, that there are no nested frames. ++ int FrameSetupOpcode = TII->getCallFrameSetupOpcode(); ++ int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); ++ for (MachineBasicBlock &BB : MF) { ++ bool InsideFrameSequence = false; ++ for (MachineInstr &MI : BB) { ++ if (MI.getOpcode() == FrameSetupOpcode) { ++ if (InsideFrameSequence) ++ return false; ++ InsideFrameSequence = true; ++ } ++ else if (MI.getOpcode() == FrameDestroyOpcode) { ++ if (!InsideFrameSequence) ++ return false; ++ InsideFrameSequence = false; ++ } ++ } ++ ++ if (InsideFrameSequence) ++ return false; ++ } ++ ++ // Now that we know the transformation is legal, check if it is ++ // profitable. ++ // TODO: Add a heuristic that actually looks at the function, ++ // and enable this for more cases. 
++ ++ // This transformation is always a win when we expected to have ++ // a reserved call frame. Under other circumstances, it may be either ++ // a win or a loss, and requires a heuristic. ++ // For now, enable it only for the relatively clear win cases. ++ bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects(); ++ if (CannotReserveFrame) ++ return true; ++ ++ // For now, don't even try to evaluate the profitability when ++ // not optimizing for size. ++ AttributeSet FnAttrs = MF.getFunction()->getAttributes(); ++ bool OptForSize = ++ FnAttrs.hasAttribute(AttributeSet::FunctionIndex, ++ Attribute::OptimizeForSize) || ++ FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); ++ ++ if (!OptForSize) ++ return false; ++ ++ // Stack re-alignment can make this unprofitable even in terms of size. ++ // As mentioned above, a better heuristic is needed. For now, don't do this ++ // when the required alignment is above 8. (4 would be the safe choice, but ++ // some experimentation showed 8 is generally good). ++ if (TFL->getStackAlignment() > 8) ++ return false; ++ ++ return true; ++} ++ ++bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { ++ TII = MF.getSubtarget().getInstrInfo(); ++ TFL = MF.getSubtarget().getFrameLowering(); ++ MRI = &MF.getRegInfo(); ++ ++ if (!shouldPerformTransformation(MF)) ++ return false; ++ ++ int FrameSetupOpcode = TII->getCallFrameSetupOpcode(); ++ ++ bool Changed = false; ++ ++ for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) ++ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) ++ if (I->getOpcode() == FrameSetupOpcode) ++ Changed |= adjustCallSequence(MF, *BB, I); ++ ++ return Changed; ++} ++ ++bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, ++ MachineBasicBlock &MBB, ++ MachineBasicBlock::iterator I) { ++ ++ // Check that this particular call sequence is amenable to the ++ // transformation. ++ const X86RegisterInfo &RegInfo = *static_cast( ++ MF.getSubtarget().getRegisterInfo()); ++ unsigned StackPtr = RegInfo.getStackRegister(); ++ int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); ++ ++ // We expect to enter this at the beginning of a call sequence ++ assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); ++ MachineBasicBlock::iterator FrameSetup = I++; ++ ++ ++ // For globals in PIC mode, we can have some LEAs here. ++ // Ignore them, they don't bother us. ++ // TODO: Extend this to something that covers more cases. ++ while (I->getOpcode() == X86::LEA32r) ++ ++I; ++ ++ // We expect a copy instruction here. ++ // TODO: The copy instruction is a lowering artifact. ++ // We should also support a copy-less version, where the stack ++ // pointer is used directly. ++ if (!I->isCopy() || !I->getOperand(0).isReg()) ++ return false; ++ MachineBasicBlock::iterator SPCopy = I++; ++ StackPtr = SPCopy->getOperand(0).getReg(); ++ ++ // Scan the call setup sequence for the pattern we're looking for. ++ // We only handle a simple case - a sequence of MOV32mi or MOV32mr ++ // instructions, that push a sequence of 32-bit values onto the stack, with ++ // no gaps between them. 
++ SmallVector MovVector(4, nullptr); ++ unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4; ++ if (MaxAdjust > 4) ++ MovVector.resize(MaxAdjust, nullptr); ++ ++ do { ++ int Opcode = I->getOpcode(); ++ if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr) ++ break; ++ ++ // We only want movs of the form: ++ // movl imm/r32, k(%esp) ++ // If we run into something else, bail. ++ // Note that AddrBaseReg may, counter to its name, not be a register, ++ // but rather a frame index. ++ // TODO: Support the fi case. This should probably work now that we ++ // have the infrastructure to track the stack pointer within a call ++ // sequence. ++ if (!I->getOperand(X86::AddrBaseReg).isReg() || ++ (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) || ++ !I->getOperand(X86::AddrScaleAmt).isImm() || ++ (I->getOperand(X86::AddrScaleAmt).getImm() != 1) || ++ (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) || ++ (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) || ++ !I->getOperand(X86::AddrDisp).isImm()) ++ return false; ++ ++ int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm(); ++ assert(StackDisp >= 0 && "Negative stack displacement when passing parameters"); ++ ++ // We really don't want to consider the unaligned case. ++ if (StackDisp % 4) ++ return false; ++ StackDisp /= 4; ++ ++ assert((size_t)StackDisp < MovVector.size() && ++ "Function call has more parameters than the stack is adjusted for."); ++ ++ // If the same stack slot is being filled twice, something's fishy. ++ if (MovVector[StackDisp] != nullptr) ++ return false; ++ MovVector[StackDisp] = I; ++ ++ ++I; ++ } while (I != MBB.end()); ++ ++ // We now expect the end of the sequence - a call and a stack adjust. ++ if (I == MBB.end()) ++ return false; ++ ++ // For PCrel calls, we expect an additional COPY of the basereg. ++ // If we find one, skip it. ++ if (I->isCopy()) { ++ if (I->getOperand(1).getReg() == ++ MF.getInfo()->getGlobalBaseReg()) ++ ++I; ++ else ++ return false; ++ } ++ ++ if (!I->isCall()) ++ return false; ++ MachineBasicBlock::iterator Call = I; ++ if ((++I)->getOpcode() != FrameDestroyOpcode) ++ return false; ++ ++ // Now, go through the vector, and see that we don't have any gaps, ++ // but only a series of 32-bit MOVs. ++ ++ int64_t ExpectedDist = 0; ++ auto MMI = MovVector.begin(), MME = MovVector.end(); ++ for (; MMI != MME; ++MMI, ExpectedDist += 4) ++ if (*MMI == nullptr) ++ break; ++ ++ // If the call had no parameters, do nothing ++ if (!ExpectedDist) ++ return false; ++ ++ // We are either at the last parameter, or a gap. ++ // Make sure it's not a gap ++ for (; MMI != MME; ++MMI) ++ if (*MMI != nullptr) ++ return false; ++ ++ // Ok, we can in fact do the transformation for this call. ++ // Do not remove the FrameSetup instruction, but adjust the parameters. ++ // PEI will end up finalizing the handling of this. ++ FrameSetup->getOperand(1).setImm(ExpectedDist); ++ ++ DebugLoc DL = I->getDebugLoc(); ++ // Now, iterate through the vector in reverse order, and replace the movs ++ // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to ++ // replace uses. ++ for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) { ++ MachineBasicBlock::iterator MOV = *MovVector[Idx]; ++ MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); ++ if (MOV->getOpcode() == X86::MOV32mi) { ++ unsigned PushOpcode = X86::PUSHi32; ++ // If the operand is a small (8-bit) immediate, we can use a ++ // PUSH instruction with a shorter encoding. 
++ // Note that isImm() may fail even though this is a MOVmi, because ++ // the operand can also be a symbol. ++ if (PushOp.isImm()) { ++ int64_t Val = PushOp.getImm(); ++ if (isInt<8>(Val)) ++ PushOpcode = X86::PUSH32i8; ++ } ++ BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp); ++ } else { ++ unsigned int Reg = PushOp.getReg(); ++ ++ // If PUSHrmm is not slow on this target, try to fold the source of the ++ // push into the instruction. ++ const X86Subtarget &ST = MF.getTarget().getSubtarget(); ++ bool SlowPUSHrmm = ST.isAtom() || ST.isSLM(); ++ ++ // Check that this is legal to fold. Right now, we're extremely ++ // conservative about that. ++ MachineInstr *DefMov = nullptr; ++ if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) { ++ MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm)); ++ ++ unsigned NumOps = DefMov->getDesc().getNumOperands(); ++ for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) ++ Push->addOperand(DefMov->getOperand(i)); ++ ++ DefMov->eraseFromParent(); ++ } else { ++ BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr(); ++ } ++ } ++ ++ MBB.erase(MOV); ++ } ++ ++ // The stack-pointer copy is no longer used in the call sequences. ++ // There should not be any other users, but we can't commit to that, so: ++ if (MRI->use_empty(SPCopy->getOperand(0).getReg())) ++ SPCopy->eraseFromParent(); ++ ++ // Once we've done this, we need to make sure PEI doesn't assume a reserved ++ // frame. ++ X86MachineFunctionInfo *FuncInfo = MF.getInfo(); ++ FuncInfo->setHasPushSequences(true); ++ ++ return true; ++} ++ ++MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( ++ MachineBasicBlock::iterator FrameSetup, unsigned Reg) { ++ // Do an extremely restricted form of load folding. ++ // ISel will often create patterns like: ++ // movl 4(%edi), %eax ++ // movl 8(%edi), %ecx ++ // movl 12(%edi), %edx ++ // movl %edx, 8(%esp) ++ // movl %ecx, 4(%esp) ++ // movl %eax, (%esp) ++ // call ++ // Get rid of those with prejudice. ++ if (!TargetRegisterInfo::isVirtualRegister(Reg)) ++ return nullptr; ++ ++ // Make sure this is the only use of Reg. ++ if (!MRI->hasOneNonDBGUse(Reg)) ++ return nullptr; ++ ++ MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg); ++ ++ // Make sure the def is a MOV from memory. ++ // If the def is an another block, give up. ++ if (DefMI->getOpcode() != X86::MOV32rm || ++ DefMI->getParent() != FrameSetup->getParent()) ++ return nullptr; ++ ++ // Be careful with movs that load from a stack slot, since it may get ++ // resolved incorrectly. ++ // TODO: Again, we already have the infrastructure, so this should work. ++ if (!DefMI->getOperand(1).isReg()) ++ return nullptr; ++ ++ // Now, make sure everything else up until the ADJCALLSTACK is a sequence ++ // of MOVs. To be less conservative would require duplicating a lot of the ++ // logic from PeepholeOptimizer. ++ // FIXME: A possibly better approach would be to teach the PeepholeOptimizer ++ // to be smarter about folding into pushes. 
++ for (auto I = DefMI; I != FrameSetup; ++I) ++ if (I->getOpcode() != X86::MOV32rm) ++ return nullptr; ++ ++ return DefMI; ++} +Index: lib/Target/X86/X86FastISel.cpp +=================================================================== +--- lib/Target/X86/X86FastISel.cpp ++++ lib/Target/X86/X86FastISel.cpp +@@ -2735,7 +2735,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo & + // Issue CALLSEQ_START + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) +- .addImm(NumBytes); ++ .addImm(NumBytes).addImm(0); + + // Walk the register/memloc assignments, inserting copies/loads. + const X86RegisterInfo *RegInfo = static_cast( +Index: lib/Target/X86/X86FrameLowering.cpp +=================================================================== +--- lib/Target/X86/X86FrameLowering.cpp ++++ lib/Target/X86/X86FrameLowering.cpp +@@ -38,9 +38,36 @@ using namespace llvm; + extern cl::opt ForceStackAlign; + + bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { +- return !MF.getFrameInfo()->hasVarSizedObjects(); ++ return !MF.getFrameInfo()->hasVarSizedObjects() && ++ !MF.getInfo()->getHasPushSequences(); + } + ++/// canSimplifyCallFramePseudos - If there is a reserved call frame, the ++/// call frame pseudos can be simplified. Having a FP, as in the default ++/// implementation, is not sufficient here since we can't always use it. ++/// Use a more nuanced condition. ++bool ++X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { ++ const X86RegisterInfo *TRI = static_cast ++ (MF.getSubtarget().getRegisterInfo()); ++ return hasReservedCallFrame(MF) || ++ (hasFP(MF) && !TRI->needsStackRealignment(MF)) ++ || TRI->hasBasePointer(MF); ++} ++ ++// needsFrameIndexResolution - Do we need to perform FI resolution for ++// this function. Normally, this is required only when the function ++// has any stack objects. However, FI resolution actually has another job, ++// not apparent from the title - it resolves callframesetup/destroy ++// that were not simplified earlier. ++// So, this is required for x86 functions that have push sequences even ++// when there are no stack objects. ++bool ++X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { ++ return MF.getFrameInfo()->hasStackObjects() || ++ MF.getInfo()->getHasPushSequences(); ++} ++ + /// hasFP - Return true if the specified function should have a dedicated frame + /// pointer register. This is true if the function has variable sized allocas + /// or if frame pointer elimination is disabled. +@@ -93,16 +120,6 @@ static unsigned getANDriOpcode(bool IsLP64, int64_ + return X86::AND32ri; + } + +-static unsigned getPUSHiOpcode(bool IsLP64, MachineOperand MO) { +- // We don't support LP64 for now. +- assert(!IsLP64); +- +- if (MO.isImm() && isInt<8>(MO.getImm())) +- return X86::PUSH32i8; +- +- return X86::PUSHi32;; +-} +- + static unsigned getLEArOpcode(unsigned IsLP64) { + return IsLP64 ? 
X86::LEA64r : X86::LEA32r; + } +@@ -1848,100 +1865,6 @@ void X86FrameLowering::adjustForHiPEPrologue(Machi + #endif + } + +-bool X86FrameLowering:: +-convertArgMovsToPushes(MachineFunction &MF, MachineBasicBlock &MBB, +- MachineBasicBlock::iterator I, uint64_t Amount) const { +- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); +- const X86RegisterInfo &RegInfo = *static_cast( +- MF.getSubtarget().getRegisterInfo()); +- unsigned StackPtr = RegInfo.getStackRegister(); +- +- // Scan the call setup sequence for the pattern we're looking for. +- // We only handle a simple case now - a sequence of MOV32mi or MOV32mr +- // instructions, that push a sequence of 32-bit values onto the stack, with +- // no gaps. +- std::map MovMap; +- do { +- int Opcode = I->getOpcode(); +- if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr) +- break; +- +- // We only want movs of the form: +- // movl imm/r32, k(%ecx) +- // If we run into something else, bail +- // Note that AddrBaseReg may, counterintuitively, not be a register... +- if (!I->getOperand(X86::AddrBaseReg).isReg() || +- (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) || +- !I->getOperand(X86::AddrScaleAmt).isImm() || +- (I->getOperand(X86::AddrScaleAmt).getImm() != 1) || +- (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) || +- (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) || +- !I->getOperand(X86::AddrDisp).isImm()) +- return false; +- +- int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm(); +- +- // We don't want to consider the unaligned case. +- if (StackDisp % 4) +- return false; +- +- // If the same stack slot is being filled twice, something's fishy. +- if (!MovMap.insert(std::pair(StackDisp, I)).second) +- return false; +- +- ++I; +- } while (I != MBB.end()); +- +- // We now expect the end of the sequence - a call and a stack adjust. +- if (I == MBB.end()) +- return false; +- if (!I->isCall()) +- return false; +- MachineBasicBlock::iterator Call = I; +- if ((++I)->getOpcode() != TII.getCallFrameDestroyOpcode()) +- return false; +- +- // Now, go through the map, and see that we don't have any gaps, +- // but only a series of 32-bit MOVs. +- // Since std::map provides ordered iteration, the original order +- // of the MOVs doesn't matter. +- int64_t ExpectedDist = 0; +- for (auto MMI = MovMap.begin(), MME = MovMap.end(); MMI != MME; +- ++MMI, ExpectedDist += 4) +- if (MMI->first != ExpectedDist) +- return false; +- +- // Ok, everything looks fine. Do the transformation. +- DebugLoc DL = I->getDebugLoc(); +- +- // It's possible the original stack adjustment amount was larger than +- // that done by the pushes. If so, we still need a SUB. +- Amount -= ExpectedDist; +- if (Amount) { +- MachineInstr* Sub = BuildMI(MBB, Call, DL, +- TII.get(getSUBriOpcode(false, Amount)), StackPtr) +- .addReg(StackPtr).addImm(Amount); +- Sub->getOperand(3).setIsDead(); +- } +- +- // Now, iterate through the map in reverse order, and replace the movs +- // with pushes. MOVmi/MOVmr doesn't have any defs, so need to replace uses. 
+- for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI) { +- MachineBasicBlock::iterator MOV = MMI->second; +- MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); +- +- // Replace MOVmr with PUSH32r, and MOVmi with PUSHi of appropriate size +- int PushOpcode = X86::PUSH32r; +- if (MOV->getOpcode() == X86::MOV32mi) +- PushOpcode = getPUSHiOpcode(false, PushOp); +- +- BuildMI(MBB, Call, DL, TII.get(PushOpcode)).addOperand(PushOp); +- MBB.erase(MOV); +- } +- +- return true; +-} +- + void X86FrameLowering:: + eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { +@@ -1956,7 +1879,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, + bool IsLP64 = STI.isTarget64BitLP64(); + DebugLoc DL = I->getDebugLoc(); + uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0; +- uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0; ++ uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0; + I = MBB.erase(I); + + if (!reserveCallFrame) { +@@ -1976,24 +1899,18 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, + Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; + + MachineInstr *New = nullptr; +- if (Opcode == TII.getCallFrameSetupOpcode()) { +- // Try to convert movs to the stack into pushes. +- // We currently only look for a pattern that appears in 32-bit +- // calling conventions. +- if (!IsLP64 && convertArgMovsToPushes(MF, MBB, I, Amount)) +- return; + +- New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), +- StackPtr) +- .addReg(StackPtr) +- .addImm(Amount); +- } else { +- assert(Opcode == TII.getCallFrameDestroyOpcode()); ++ // Factor out the amount that gets handled inside the sequence ++ // (Pushes of argument for frame setup, callee pops for frame destroy) ++ Amount -= InternalAmt; + +- // Factor out the amount the callee already popped. +- Amount -= CalleeAmt; ++ if (Amount) { ++ if (Opcode == TII.getCallFrameSetupOpcode()) { ++ New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr) ++ .addReg(StackPtr).addImm(Amount); ++ } else { ++ assert(Opcode == TII.getCallFrameDestroyOpcode()); + +- if (Amount) { + unsigned Opc = getADDriOpcode(IsLP64, Amount); + New = BuildMI(MF, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr).addImm(Amount); +@@ -2011,13 +1928,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, + return; + } + +- if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) { ++ if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) { + // If we are performing frame pointer elimination and if the callee pops + // something off the stack pointer, add it back. We do this until we have + // more advanced stack pointer tracking ability. +- unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt); ++ unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt); + MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr) +- .addReg(StackPtr).addImm(CalleeAmt); ++ .addReg(StackPtr).addImm(InternalAmt); + + // The EFLAGS implicit def is dead. 
+ New->getOperand(3).setIsDead(); +Index: lib/Target/X86/X86FrameLowering.h +=================================================================== +--- lib/Target/X86/X86FrameLowering.h ++++ lib/Target/X86/X86FrameLowering.h +@@ -64,6 +64,8 @@ class X86FrameLowering : public TargetFrameLowerin + + bool hasFP(const MachineFunction &MF) const override; + bool hasReservedCallFrame(const MachineFunction &MF) const override; ++ bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; ++ bool needsFrameIndexResolution(const MachineFunction &MF) const override; + + int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, +Index: lib/Target/X86/X86InstrCompiler.td +=================================================================== +--- lib/Target/X86/X86InstrCompiler.td ++++ lib/Target/X86/X86InstrCompiler.td +@@ -43,9 +43,9 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses + // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become + // sub / add which can clobber EFLAGS. + let Defs = [ESP, EFLAGS], Uses = [ESP] in { +-def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt), ++def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKDOWN", +- [(X86callseq_start timm:$amt)]>, ++ []>, + Requires<[NotLP64]>; + def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKUP", +@@ -52,7 +52,10 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins + [(X86callseq_end timm:$amt1, timm:$amt2)]>, + Requires<[NotLP64]>; + } ++def : Pat<(X86callseq_start timm:$amt1), ++ (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>; + ++ + // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into + // a stack adjustment and the codegen must know that they may modify the stack + // pointer before prolog-epilog rewriting occurs. +@@ -59,9 +62,9 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins + // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become + // sub / add which can clobber EFLAGS. + let Defs = [RSP, EFLAGS], Uses = [RSP] in { +-def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt), ++def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKDOWN", +- [(X86callseq_start timm:$amt)]>, ++ []>, + Requires<[IsLP64]>; + def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKUP", +@@ -68,9 +71,10 @@ def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins + [(X86callseq_end timm:$amt1, timm:$amt2)]>, + Requires<[IsLP64]>; + } ++def : Pat<(X86callseq_start timm:$amt1), ++ (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>; + + +- + // x86-64 va_start lowering magic. 
+ let usesCustomInserter = 1, Defs = [EFLAGS] in { + def VASTART_SAVE_XMM_REGS : I<0, Pseudo, +Index: lib/Target/X86/X86InstrInfo.cpp +=================================================================== +--- lib/Target/X86/X86InstrInfo.cpp ++++ lib/Target/X86/X86InstrInfo.cpp +@@ -1692,6 +1692,58 @@ X86InstrInfo::isCoalescableExtInstr(const MachineI + return false; + } + ++int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { ++ const MachineFunction *MF = MI->getParent()->getParent(); ++ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); ++ ++ if (MI->getOpcode() == getCallFrameSetupOpcode() || ++ MI->getOpcode() == getCallFrameDestroyOpcode()) { ++ unsigned StackAlign = TFI->getStackAlignment(); ++ int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign * ++ StackAlign; ++ ++ SPAdj -= MI->getOperand(1).getImm(); ++ ++ if (MI->getOpcode() == getCallFrameSetupOpcode()) ++ return SPAdj; ++ else ++ return -SPAdj; ++ } ++ ++ // To know whether a call adjusts the stack, we need information ++ // that is bound to the following ADJCALLSTACKUP pseudo. ++ // Look for the next ADJCALLSTACKUP that follows the call. ++ if (MI->isCall()) { ++ const MachineBasicBlock* MBB = MI->getParent(); ++ auto I = ++MachineBasicBlock::const_iterator(MI); ++ for (auto E = MBB->end(); I != E; ++I) { ++ if (I->getOpcode() == getCallFrameDestroyOpcode() || ++ I->isCall()) ++ break; ++ } ++ ++ // If we could not find a frame destroy opcode, then it has already ++ // been simplified, so we don't care. ++ if (I->getOpcode() != getCallFrameDestroyOpcode()) ++ return 0; ++ ++ return -(I->getOperand(1).getImm()); ++ } ++ ++ // Currently handle only PUSHes we can reasonably expect to see ++ // in call sequences ++ switch (MI->getOpcode()) { ++ default: ++ return 0; ++ case X86::PUSH32i8: ++ case X86::PUSH32r: ++ case X86::PUSH32rmm: ++ case X86::PUSH32rmr: ++ case X86::PUSHi32: ++ return 4; ++ } ++} ++ + /// isFrameOperand - Return true and the FrameIndex if the specified + /// operand and follow operands form a reference to the stack frame. + bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op, +Index: lib/Target/X86/X86InstrInfo.h +=================================================================== +--- lib/Target/X86/X86InstrInfo.h ++++ lib/Target/X86/X86InstrInfo.h +@@ -175,6 +175,11 @@ class X86InstrInfo final : public X86GenInstrInfo + /// + const X86RegisterInfo &getRegisterInfo() const { return RI; } + ++ /// getSPAdjust - This returns the stack pointer adjustment made by ++ /// this instruction. For x86, we need to handle more complex call ++ /// sequences involving PUSHes. ++ int getSPAdjust(const MachineInstr *MI) const override; ++ + /// isCoalescableExtInstr - Return true if the instruction is a "coalescable" + /// extension instruction. That is, it's like a copy where it's legal for the + /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns +Index: lib/Target/X86/X86MachineFunctionInfo.h +=================================================================== +--- lib/Target/X86/X86MachineFunctionInfo.h ++++ lib/Target/X86/X86MachineFunctionInfo.h +@@ -77,6 +77,9 @@ class X86MachineFunctionInfo : public MachineFunct + unsigned ArgumentStackSize; + /// NumLocalDynamics - Number of local-dynamic TLS accesses. + unsigned NumLocalDynamics; ++ /// HasPushSequences - Keeps track of whether this function uses sequences ++ /// of pushes to pass function parameters. 
++ bool HasPushSequences; + + private: + /// ForwardedMustTailRegParms - A list of virtual and physical registers +@@ -97,7 +100,8 @@ class X86MachineFunctionInfo : public MachineFunct + VarArgsGPOffset(0), + VarArgsFPOffset(0), + ArgumentStackSize(0), +- NumLocalDynamics(0) {} ++ NumLocalDynamics(0), ++ HasPushSequences(false) {} + + explicit X86MachineFunctionInfo(MachineFunction &MF) + : ForceFramePointer(false), +@@ -113,11 +117,15 @@ class X86MachineFunctionInfo : public MachineFunct + VarArgsGPOffset(0), + VarArgsFPOffset(0), + ArgumentStackSize(0), +- NumLocalDynamics(0) {} ++ NumLocalDynamics(0), ++ HasPushSequences(false) {} + + bool getForceFramePointer() const { return ForceFramePointer;} + void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } + ++ bool getHasPushSequences() const { return HasPushSequences; } ++ void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; } ++ + bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; } + void setRestoreBasePointer(const MachineFunction *MF); + int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; } +Index: lib/Target/X86/X86RegisterInfo.cpp +=================================================================== +--- lib/Target/X86/X86RegisterInfo.cpp ++++ lib/Target/X86/X86RegisterInfo.cpp +@@ -468,8 +468,6 @@ void + X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { +- assert(SPAdj == 0 && "Unexpected"); +- + MachineInstr &MI = *II; + MachineFunction &MF = *MI.getParent()->getParent(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); +@@ -506,6 +504,9 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicB + } else + FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex); + ++ if (BasePtr == StackPtr) ++ FIOffset += SPAdj; ++ + // The frame index format for stackmaps and patchpoints is different from the + // X86 format. It only has a FI and an offset. + if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) { +Index: lib/Target/X86/X86TargetMachine.cpp +=================================================================== +--- lib/Target/X86/X86TargetMachine.cpp ++++ lib/Target/X86/X86TargetMachine.cpp +@@ -154,6 +154,7 @@ class X86PassConfig : public TargetPassConfig { + void addIRPasses() override; + bool addInstSelector() override; + bool addILPOpts() override; ++ void addPreRegAlloc() override; + void addPostRegAlloc() override; + void addPreEmitPass() override; + }; +@@ -187,6 +188,10 @@ bool X86PassConfig::addILPOpts() { + return true; + } + ++void X86PassConfig::addPreRegAlloc() { ++ addPass(createX86CallFrameOptimization()); ++} ++ + void X86PassConfig::addPostRegAlloc() { + addPass(createX86FloatingPointStackifierPass()); + } +Index: test/CodeGen/X86/inalloca-invoke.ll +=================================================================== +--- test/CodeGen/X86/inalloca-invoke.ll ++++ test/CodeGen/X86/inalloca-invoke.ll +@@ -31,7 +31,7 @@ blah: + to label %invoke.cont unwind label %lpad + + ; Uses end as sret param. 
+-; CHECK: movl %[[end]], (%esp) ++; CHECK: pushl %[[end]] + ; CHECK: calll _plus + + invoke.cont: +Index: test/CodeGen/X86/movtopush.ll +=================================================================== +--- test/CodeGen/X86/movtopush.ll ++++ test/CodeGen/X86/movtopush.ll +@@ -1,10 +1,12 @@ + ; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL ++; RUN: llc < %s -mtriple=x86_64-windows | FileCheck %s -check-prefix=X64 + ; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED ++ + declare void @good(i32 %a, i32 %b, i32 %c, i32 %d) + declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d) + + ; Here, we should have a reserved frame, so we don't expect pushes +-; NORMAL-LABEL: test1 ++; NORMAL-LABEL: test1: + ; NORMAL: subl $16, %esp + ; NORMAL-NEXT: movl $4, 12(%esp) + ; NORMAL-NEXT: movl $3, 8(%esp) +@@ -11,6 +13,7 @@ declare void @inreg(i32 %a, i32 inreg %b, i32 %c, + ; NORMAL-NEXT: movl $2, 4(%esp) + ; NORMAL-NEXT: movl $1, (%esp) + ; NORMAL-NEXT: call ++; NORMAL-NEXT: addl $16, %esp + define void @test1() { + entry: + call void @good(i32 1, i32 2, i32 3, i32 4) +@@ -17,8 +20,10 @@ entry: + ret void + } + +-; Here, we expect a sequence of 4 immediate pushes +-; NORMAL-LABEL: test2 ++; We're optimizing for code size, so we should get pushes for x86, ++; even though there is a reserved call frame. ++; Make sure we don't touch x86-64 ++; NORMAL-LABEL: test1b: + ; NORMAL-NOT: subl {{.*}} %esp + ; NORMAL: pushl $4 + ; NORMAL-NEXT: pushl $3 +@@ -25,6 +30,42 @@ entry: + ; NORMAL-NEXT: pushl $2 + ; NORMAL-NEXT: pushl $1 + ; NORMAL-NEXT: call ++; NORMAL-NEXT: addl $16, %esp ++; X64-LABEL: test1b: ++; X64: movl $1, %ecx ++; X64-NEXT: movl $2, %edx ++; X64-NEXT: movl $3, %r8d ++; X64-NEXT: movl $4, %r9d ++; X64-NEXT: callq good ++define void @test1b() optsize { ++entry: ++ call void @good(i32 1, i32 2, i32 3, i32 4) ++ ret void ++} ++ ++; Same as above, but for minsize ++; NORMAL-LABEL: test1c: ++; NORMAL-NOT: subl {{.*}} %esp ++; NORMAL: pushl $4 ++; NORMAL-NEXT: pushl $3 ++; NORMAL-NEXT: pushl $2 ++; NORMAL-NEXT: pushl $1 ++; NORMAL-NEXT: call ++; NORMAL-NEXT: addl $16, %esp ++define void @test1c() minsize { ++entry: ++ call void @good(i32 1, i32 2, i32 3, i32 4) ++ ret void ++} ++ ++; If we have a reserved frame, we should have pushes ++; NORMAL-LABEL: test2: ++; NORMAL-NOT: subl {{.*}} %esp ++; NORMAL: pushl $4 ++; NORMAL-NEXT: pushl $3 ++; NORMAL-NEXT: pushl $2 ++; NORMAL-NEXT: pushl $1 ++; NORMAL-NEXT: call + define void @test2(i32 %k) { + entry: + %a = alloca i32, i32 %k +@@ -34,7 +75,7 @@ entry: + + ; Again, we expect a sequence of 4 immediate pushes + ; Checks that we generate the right pushes for >8bit immediates +-; NORMAL-LABEL: test2b ++; NORMAL-LABEL: test2b: + ; NORMAL-NOT: subl {{.*}} %esp + ; NORMAL: pushl $4096 + ; NORMAL-NEXT: pushl $3072 +@@ -41,15 +82,15 @@ entry: + ; NORMAL-NEXT: pushl $2048 + ; NORMAL-NEXT: pushl $1024 + ; NORMAL-NEXT: call +-define void @test2b(i32 %k) { ++; NORMAL-NEXT: addl $16, %esp ++define void @test2b() optsize { + entry: +- %a = alloca i32, i32 %k + call void @good(i32 1024, i32 2048, i32 3072, i32 4096) + ret void + } + + ; The first push should push a register +-; NORMAL-LABEL: test3 ++; NORMAL-LABEL: test3: + ; NORMAL-NOT: subl {{.*}} %esp + ; NORMAL: pushl $4 + ; NORMAL-NEXT: pushl $3 +@@ -56,15 +97,15 @@ entry: + ; NORMAL-NEXT: pushl $2 + ; NORMAL-NEXT: pushl %e{{..}} + ; NORMAL-NEXT: call +-define void @test3(i32 %k) { ++; NORMAL-NEXT: addl $16, %esp ++define void 
@test3(i32 %k) optsize { + entry: +- %a = alloca i32, i32 %k + call void @good(i32 %k, i32 2, i32 3, i32 4) + ret void + } + + ; We don't support weird calling conventions +-; NORMAL-LABEL: test4 ++; NORMAL-LABEL: test4: + ; NORMAL: subl $12, %esp + ; NORMAL-NEXT: movl $4, 8(%esp) + ; NORMAL-NEXT: movl $3, 4(%esp) +@@ -71,16 +112,16 @@ entry: + ; NORMAL-NEXT: movl $1, (%esp) + ; NORMAL-NEXT: movl $2, %eax + ; NORMAL-NEXT: call +-define void @test4(i32 %k) { ++; NORMAL-NEXT: addl $12, %esp ++define void @test4() optsize { + entry: +- %a = alloca i32, i32 %k + call void @inreg(i32 1, i32 2, i32 3, i32 4) + ret void + } + +-; Check that additional alignment is added when the pushes +-; don't add up to the required alignment. +-; ALIGNED-LABEL: test5 ++; When there is no reserved call frame, check that additional alignment ++; is added when the pushes don't add up to the required alignment. ++; ALIGNED-LABEL: test5: + ; ALIGNED: subl $16, %esp + ; ALIGNED-NEXT: pushl $4 + ; ALIGNED-NEXT: pushl $3 +@@ -97,7 +138,7 @@ entry: + ; Check that pushing the addresses of globals (Or generally, things that + ; aren't exactly immediates) isn't broken. + ; Fixes PR21878. +-; NORMAL-LABEL: test6 ++; NORMAL-LABEL: test6: + ; NORMAL: pushl $_ext + ; NORMAL-NEXT: call + declare void @f(i8*) +@@ -110,3 +151,108 @@ bb: + alloca i32 + ret void + } ++ ++; Check that we fold simple cases into the push ++; NORMAL-LABEL: test7: ++; NORMAL-NOT: subl {{.*}} %esp ++; NORMAL: movl 4(%esp), [[EAX:%e..]] ++; NORMAL-NEXT: pushl $4 ++; NORMAL-NEXT: pushl ([[EAX]]) ++; NORMAL-NEXT: pushl $2 ++; NORMAL-NEXT: pushl $1 ++; NORMAL-NEXT: call ++; NORMAL-NEXT: addl $16, %esp ++define void @test7(i32* %ptr) optsize { ++entry: ++ %val = load i32* %ptr ++ call void @good(i32 1, i32 2, i32 %val, i32 4) ++ ret void ++} ++ ++; But we don't want to fold stack-relative loads into the push, ++; because the offset will be wrong ++; NORMAL-LABEL: test8: ++; NORMAL-NOT: subl {{.*}} %esp ++; NORMAL: movl 4(%esp), [[EAX:%e..]] ++; NORMAL-NEXT: pushl $4 ++; NORMAL-NEXT: pushl [[EAX]] ++; NORMAL-NEXT: pushl $2 ++; NORMAL-NEXT: pushl $1 ++; NORMAL-NEXT: call ++; NORMAL-NEXT: addl $16, %esp ++define void @test8(i32* %ptr) optsize { ++entry: ++ %val = ptrtoint i32* %ptr to i32 ++ call void @good(i32 1, i32 2, i32 %val, i32 4) ++ ret void ++} ++ ++; If one function is using push instructions, and the other isn't ++; (because it has frame-index references), then we must resolve ++; these references correctly. ++; NORMAL-LABEL: test9: ++; NORMAL-NOT: leal (%esp), ++; NORMAL: pushl $4 ++; NORMAL-NEXT: pushl $3 ++; NORMAL-NEXT: pushl $2 ++; NORMAL-NEXT: pushl $1 ++; NORMAL-NEXT: call ++; NORMAL-NEXT: addl $16, %esp ++; NORMAL-NEXT: subl $16, %esp ++; NORMAL-NEXT: leal 16(%esp), [[EAX:%e..]] ++; NORMAL-NEXT: movl [[EAX]], 12(%esp) ++; NORMAL-NEXT: movl $7, 8(%esp) ++; NORMAL-NEXT: movl $6, 4(%esp) ++; NORMAL-NEXT: movl $5, (%esp) ++; NORMAL-NEXT: call ++; NORMAL-NEXT: addl $16, %esp ++define void @test9() optsize { ++entry: ++ %p = alloca i32, align 4 ++ call void @good(i32 1, i32 2, i32 3, i32 4) ++ %0 = ptrtoint i32* %p to i32 ++ call void @good(i32 5, i32 6, i32 7, i32 %0) ++ ret void ++} ++ ++; We can end up with an indirect call which gets reloaded on the spot. ++; Make sure we reference the correct stack slot - we spill into (%esp) ++; and reload from 16(%esp) due to the pushes. 
++; NORMAL-LABEL: test10: ++; NORMAL: movl $_good, [[ALLOC:.*]] ++; NORMAL-NEXT: movl [[ALLOC]], [[EAX:%e..]] ++; NORMAL-NEXT: movl [[EAX]], (%esp) # 4-byte Spill ++; NORMAL: nop ++; NORMAL: pushl $4 ++; NORMAL-NEXT: pushl $3 ++; NORMAL-NEXT: pushl $2 ++; NORMAL-NEXT: pushl $1 ++; NORMAL-NEXT: calll *16(%esp) ++; NORMAL-NEXT: addl $16, %esp ++define void @test10() optsize { ++ %stack_fptr = alloca void (i32, i32, i32, i32)* ++ store void (i32, i32, i32, i32)* @good, void (i32, i32, i32, i32)** %stack_fptr ++ %good_ptr = load volatile void (i32, i32, i32, i32)** %stack_fptr ++ call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di}"() ++ call void (i32, i32, i32, i32)* %good_ptr(i32 1, i32 2, i32 3, i32 4) ++ ret void ++} ++ ++; We can't fold the load from the global into the push because of ++; interference from the store ++; NORMAL-LABEL: test11: ++; NORMAL: movl _the_global, [[EAX:%e..]] ++; NORMAL-NEXT: movl $42, _the_global ++; NORMAL-NEXT: pushl $4 ++; NORMAL-NEXT: pushl $3 ++; NORMAL-NEXT: pushl $2 ++; NORMAL-NEXT: pushl [[EAX]] ++; NORMAL-NEXT: call ++; NORMAL-NEXT: addl $16, %esp ++@the_global = external global i32 ++define void @test11() optsize { ++ %myload = load i32* @the_global ++ store i32 42, i32* @the_global ++ call void @good(i32 %myload, i32 2, i32 3, i32 4) ++ ret void ++}
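
Note (illustration only, not part of the upstream change): the user-visible effect of the
new X86CallFrameOptimization pass can be reproduced with a minimal C test case. The file
name, function names, and the exact assembly shown below are assumptions for demonstration;
the real output depends on the compiler version and target defaults.

	/* push-example.c - hypothetical example, not part of this patch */
	extern void callee(int a, int b, int c, int d);

	void caller(void)
	{
		/*
		 * Four stack-passed arguments; on 32-bit x86 at -Oz the pass
		 * should emit pushl instructions instead of esp-relative movl
		 * stores, saving several bytes per call site.
		 */
		callee(1, 2, 3, 4);
	}

	$ clang -m32 -Oz -S push-example.c -o -
	...
		pushl	$4
		pushl	$3
		pushl	$2
		pushl	$1
		calll	callee
		addl	$16, %esp
	...

For comparison, the pass can be disabled with -mllvm -no-x86-call-frame-opt (the cl::opt
introduced in X86CallFrameOptimization.cpp above); the call sequence then falls back to a
subl of %esp followed by movl stores, as in the NORMAL test1 check lines in
test/CodeGen/X86/movtopush.ll.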