From c7dac04c3480f3c20487f912f77343139fce2d99 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Sun, 24 Dec 2017 01:00:08 +0000 Subject: [PATCH] Vendor import of llvm trunk r321414: https://llvm.org/svn/llvm-project/llvm/trunk@321414 --- docs/Extensions.rst | 28 + docs/MIRLangRef.rst | 45 +- docs/tutorial/LangImpl09.rst | 5 +- examples/Kaleidoscope/CMakeLists.txt | 1 + examples/Kaleidoscope/Chapter9/toy.cpp | 5 +- include/llvm-c/lto.h | 14 +- include/llvm/Analysis/AliasAnalysis.h | 77 +- .../llvm/Analysis/AliasAnalysisEvaluator.h | 8 +- include/llvm/Analysis/LoopAccessAnalysis.h | 15 - .../llvm/Analysis/MemoryDependenceAnalysis.h | 6 + include/llvm/Analysis/ProfileSummaryInfo.h | 8 +- .../llvm/Analysis/ScalarEvolutionExpander.h | 2 +- include/llvm/Analysis/TargetTransformInfo.h | 7 + .../llvm/Analysis/TargetTransformInfoImpl.h | 2 + include/llvm/BinaryFormat/Wasm.h | 2 +- .../WebAssembly.def => WasmRelocs.def} | 0 include/llvm/CodeGen/BasicTTIImpl.h | 14 +- .../CodeGen/GlobalISel/InstructionSelector.h | 17 +- .../GlobalISel/InstructionSelectorImpl.h | 6 +- .../{LiveStackAnalysis.h => LiveStacks.h} | 8 +- include/llvm/CodeGen/MachineOperand.h | 7 + include/llvm/CodeGen/RuntimeLibcalls.def | 3 + include/llvm/CodeGen/SDNodeProperties.td | 34 + include/llvm/CodeGen/SelectionDAGNodes.h | 12 +- include/llvm/CodeGen/TargetLowering.h | 11 +- include/llvm/DebugInfo/DWARF/DWARFUnit.h | 100 +- include/llvm/FuzzMutate/IRMutator.h | 4 +- include/llvm/IR/Function.h | 6 + include/llvm/IR/Intrinsics.td | 10 +- .../llvm/LTO/legacy/ThinLTOCodeGenerator.h | 10 +- include/llvm/MC/MCAsmInfo.h | 3 +- include/llvm/MC/MCStreamer.h | 11 + include/llvm/Object/Wasm.h | 21 +- include/llvm/Support/CachePruning.h | 5 +- include/llvm/Support/MemoryBuffer.h | 66 +- include/llvm/Support/YAMLTraits.h | 8 +- include/llvm/Target/TargetMachine.h | 18 +- include/llvm/Target/TargetSelectionDAG.td | 26 - include/llvm/Transforms/Instrumentation.h | 2 +- .../Transforms/Utils/CallPromotionUtils.h | 18 +- include/llvm/module.modulemap | 2 +- lib/Analysis/AliasAnalysis.cpp | 95 +- lib/Analysis/AliasAnalysisEvaluator.cpp | 61 +- lib/Analysis/BasicAliasAnalysis.cpp | 13 +- lib/Analysis/CFGPrinter.cpp | 8 +- lib/Analysis/GlobalsModRef.cpp | 21 +- lib/Analysis/InlineCost.cpp | 95 +- lib/Analysis/LoopAccessAnalysis.cpp | 71 - lib/Analysis/MemoryDependenceAnalysis.cpp | 23 +- lib/Analysis/MemorySSA.cpp | 39 +- lib/Analysis/ModuleSummaryAnalysis.cpp | 2 +- lib/Analysis/ProfileSummaryInfo.cpp | 60 +- lib/Analysis/ScalarEvolution.cpp | 1 + lib/Analysis/TargetTransformInfo.cpp | 4 + lib/Analysis/TypeBasedAliasAnalysis.cpp | 23 +- lib/Bitcode/Writer/BitcodeWriter.cpp | 2 +- lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 1 + lib/CodeGen/CMakeLists.txt | 2 +- lib/CodeGen/CodeGenPrepare.cpp | 17 +- lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 3 + lib/CodeGen/InlineSpiller.cpp | 2 +- lib/CodeGen/LLVMTargetMachine.cpp | 7 +- .../{LiveStackAnalysis.cpp => LiveStacks.cpp} | 4 +- lib/CodeGen/MIRPrinter.cpp | 197 +- lib/CodeGen/MachineBlockPlacement.cpp | 6 +- lib/CodeGen/MachineOperand.cpp | 232 +- lib/CodeGen/MachineVerifier.cpp | 2 +- lib/CodeGen/README.txt | 2 +- lib/CodeGen/RegAllocBasic.cpp | 2 +- lib/CodeGen/RegAllocGreedy.cpp | 2 +- lib/CodeGen/RegAllocPBQP.cpp | 2 +- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 173 +- .../SelectionDAG/LegalizeFloatTypes.cpp | 2 +- lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 8 +- .../SelectionDAG/LegalizeVectorTypes.cpp | 6 +- .../SelectionDAG/ResourcePriorityQueue.cpp | 1 + lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 
3 + .../SelectionDAGAddressAnalysis.cpp | 17 + lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 32 +- lib/CodeGen/SelectionDAG/TargetLowering.cpp | 2 +- lib/CodeGen/StackSlotColoring.cpp | 2 +- lib/CodeGen/TargetLoweringBase.cpp | 68 +- lib/CodeGen/VirtRegMap.cpp | 2 +- lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp | 1 + lib/DebugInfo/DWARF/DWARFContext.cpp | 155 +- lib/DebugInfo/DWARF/DWARFUnit.cpp | 507 ++- lib/Demangle/ItaniumDemangle.cpp | 3 +- lib/FuzzMutate/IRMutator.cpp | 19 +- lib/IR/ConstantFold.cpp | 1 + lib/IR/Function.cpp | 4 +- lib/IR/Value.cpp | 11 +- lib/MC/MCAsmStreamer.cpp | 47 +- lib/MC/MCStreamer.cpp | 22 + lib/MC/WasmObjectWriter.cpp | 74 +- lib/Object/ELF.cpp | 1 + lib/Object/WasmObjectFile.cpp | 29 +- lib/Object/WindowsResource.cpp | 6 +- lib/ObjectYAML/WasmYAML.cpp | 2 +- lib/Passes/LLVMBuild.txt | 2 +- lib/Support/APFloat.cpp | 4 +- lib/Support/CachePruning.cpp | 4 +- lib/Support/MemoryBuffer.cpp | 162 +- lib/Support/StringRef.cpp | 2 +- lib/Support/TargetParser.cpp | 5 +- lib/Support/YAMLTraits.cpp | 7 +- lib/Target/AArch64/AArch64AsmPrinter.cpp | 14 + lib/Target/AArch64/AArch64FastISel.cpp | 5 +- lib/Target/AArch64/AArch64FrameLowering.cpp | 86 +- lib/Target/AArch64/AArch64ISelLowering.cpp | 12 +- lib/Target/AArch64/AArch64InstrInfo.cpp | 13 +- .../AArch64/AArch64SelectionDAGInfo.cpp | 8 +- lib/Target/AArch64/AArch64Subtarget.cpp | 13 - lib/Target/AArch64/AArch64Subtarget.h | 7 - lib/Target/AArch64/AArch64SystemOperands.td | 3 + lib/Target/AArch64/AArch64TargetMachine.cpp | 7 +- lib/Target/AArch64/AArch64TargetMachine.h | 3 +- .../AArch64/AsmParser/AArch64AsmParser.cpp | 36 +- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 6 + lib/Target/AMDGPU/AMDGPUISelLowering.h | 10 + lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8 +- lib/Target/AMDGPU/AMDGPUTargetMachine.h | 2 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 34 +- .../Disassembler/AMDGPUDisassembler.cpp | 35 +- .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 3 + .../AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp | 4 +- lib/Target/AMDGPU/SIRegisterInfo.td | 180 +- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 + lib/Target/ARC/ARCTargetMachine.cpp | 7 +- lib/Target/ARC/ARCTargetMachine.h | 2 +- lib/Target/ARM/ARM.h | 1 + lib/Target/ARM/ARM.td | 4 + lib/Target/ARM/ARMFastISel.cpp | 2 +- lib/Target/ARM/ARMISelLowering.cpp | 134 +- lib/Target/ARM/ARMISelLowering.h | 2 + lib/Target/ARM/ARMInstrInfo.td | 9 + lib/Target/ARM/ARMInstrThumb2.td | 4 + lib/Target/ARM/ARMInstructionSelector.cpp | 58 +- lib/Target/ARM/ARMLegalizerInfo.cpp | 7 + lib/Target/ARM/ARMRegisterBankInfo.cpp | 20 +- lib/Target/ARM/ARMSubtarget.cpp | 5 - lib/Target/ARM/ARMSubtarget.h | 9 +- lib/Target/ARM/ARMTargetMachine.cpp | 8 +- lib/Target/ARM/ARMTargetMachine.h | 3 +- lib/Target/ARM/ARMTargetTransformInfo.cpp | 19 - lib/Target/ARM/ARMTargetTransformInfo.h | 2 - lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 10 +- .../ARM/Disassembler/ARMDisassembler.cpp | 4 + lib/Target/ARM/Thumb2SizeReduction.cpp | 8 +- lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp | 10 +- lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp | 70 +- lib/Target/Hexagon/HexagonISelLowering.cpp | 349 +- lib/Target/Hexagon/HexagonISelLowering.h | 12 + lib/Target/Hexagon/HexagonISelLoweringHVX.cpp | 138 +- lib/Target/Hexagon/HexagonPatterns.td | 9 + lib/Target/Hexagon/HexagonRegisterInfo.td | 5 +- lib/Target/Hexagon/HexagonSubtarget.h | 32 +- lib/Target/Hexagon/HexagonTargetMachine.cpp | 7 +- lib/Target/Hexagon/HexagonTargetMachine.h | 2 +- lib/Target/Lanai/LanaiTargetMachine.cpp | 7 +- 
lib/Target/Lanai/LanaiTargetMachine.h | 2 +- .../Mips/MCTargetDesc/MipsMCCodeEmitter.cpp | 2 +- lib/Target/Mips/MipsRegisterInfo.td | 2 +- lib/Target/Mips/MipsTargetMachine.cpp | 19 +- lib/Target/Mips/MipsTargetMachine.h | 2 +- lib/Target/NVPTX/NVPTXTargetMachine.cpp | 7 +- lib/Target/NVPTX/NVPTXTargetMachine.h | 2 +- lib/Target/PowerPC/PPCFrameLowering.cpp | 6 +- lib/Target/PowerPC/PPCISelLowering.cpp | 11 +- lib/Target/PowerPC/PPCMIPeephole.cpp | 18 +- lib/Target/PowerPC/PPCTargetMachine.cpp | 7 +- lib/Target/PowerPC/PPCTargetMachine.h | 2 +- lib/Target/SystemZ/SystemZTargetMachine.cpp | 7 +- lib/Target/SystemZ/SystemZTargetMachine.h | 2 +- lib/Target/TargetMachine.cpp | 13 +- .../WebAssembly/WebAssemblyRegStackify.cpp | 10 +- .../WebAssemblyRuntimeLibcallSignatures.cpp | 8 +- .../WebAssembly/WebAssemblyTargetMachine.cpp | 7 +- .../WebAssembly/WebAssemblyTargetMachine.h | 3 +- lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 2 +- lib/Target/X86/X86.td | 17 +- lib/Target/X86/X86DomainReassignment.cpp | 167 +- lib/Target/X86/X86ISelDAGToDAG.cpp | 9 +- lib/Target/X86/X86ISelLowering.cpp | 465 +-- lib/Target/X86/X86Instr3DNow.td | 20 +- lib/Target/X86/X86InstrFormats.td | 8 +- lib/Target/X86/X86InstrInfo.td | 3 + lib/Target/X86/X86InstrSSE.td | 2 +- lib/Target/X86/X86SelectionDAGInfo.cpp | 7 +- lib/Target/X86/X86Subtarget.cpp | 24 +- lib/Target/X86/X86Subtarget.h | 29 +- lib/Target/X86/X86TargetMachine.cpp | 7 +- lib/Target/X86/X86TargetMachine.h | 2 +- lib/Target/XCore/XCoreTargetMachine.cpp | 7 +- lib/Target/XCore/XCoreTargetMachine.h | 2 +- lib/Transforms/IPO/PartialInlining.cpp | 9 +- lib/Transforms/IPO/SampleProfile.cpp | 5 +- lib/Transforms/IPO/WholeProgramDevirt.cpp | 1 + .../InstCombine/InstCombineCalls.cpp | 1 + .../Instrumentation/HWAddressSanitizer.cpp | 25 +- lib/Transforms/Scalar/CallSiteSplitting.cpp | 46 +- lib/Transforms/Scalar/JumpThreading.cpp | 19 +- lib/Transforms/Scalar/LoopSink.cpp | 2 +- lib/Transforms/Scalar/LoopUnrollPass.cpp | 2 +- lib/Transforms/Scalar/MemCpyOptimizer.cpp | 56 +- lib/Transforms/Scalar/SCCP.cpp | 29 +- .../Scalar/SeparateConstOffsetFromGEP.cpp | 2 +- lib/Transforms/Utils/CallPromotionUtils.cpp | 255 +- lib/Transforms/Utils/LoopUnrollPeel.cpp | 2 +- lib/Transforms/Utils/SimplifyCFG.cpp | 203 +- lib/Transforms/Vectorize/LoopVectorize.cpp | 4 +- lib/Transforms/Vectorize/SLPVectorizer.cpp | 283 +- .../BasicAA/args-rets-allocas-loads.ll | 7 +- test/Analysis/BasicAA/call-attrs.ll | 6 +- test/Analysis/BasicAA/cs-cs-arm.ll | 6 +- test/Analysis/BasicAA/cs-cs.ll | 62 +- test/Analysis/MemorySSA/volatile-clobber.ll | 13 +- .../ValueTracking/memory-dereferenceable.ll | 20 +- .../AArch64/GlobalISel/arm64-fallback.ll | 2 +- .../fp128-legalize-crash-pr35690.mir | 44 + .../AArch64/GlobalISel/translate-gep.ll | 109 +- test/CodeGen/AArch64/arm64-jumptable.ll | 16 +- test/CodeGen/AArch64/arm64-memset-to-bzero.ll | 49 +- test/CodeGen/AArch64/arm64-neon-2velem.ll | 685 +--- .../AArch64/arm64-zero-cycle-zeroing.ll | 9 + test/CodeGen/AArch64/chkstk.ll | 25 + test/CodeGen/AArch64/ldst-paired-aliasing.ll | 5 +- test/CodeGen/AMDGPU/amdgpu.private-memory.ll | 3 +- .../memory-legalizer-store-infinite-loop.ll | 32 + .../ARM/GlobalISel/arm-instruction-select.mir | 100 + test/CodeGen/ARM/GlobalISel/arm-legalizer.mir | 52 + .../ARM/GlobalISel/arm-regbankselect.mir | 66 + test/CodeGen/ARM/avoid-cpsr-rmw.ll | 4 +- test/CodeGen/ARM/su-addsub-overflow.ll | 135 + test/CodeGen/ARM/usat.ll | 214 ++ test/CodeGen/BPF/objdump_imm_hex.ll | 65 + .../Hexagon/autohvx/build-vector-i32-type.ll | 19 
+ .../Hexagon/autohvx/isel-bool-vector.ll | 18 + .../Hexagon/autohvx/isel-select-const.ll | 32 + test/CodeGen/Hexagon/expand-vstorerw-undef.ll | 2 +- test/CodeGen/Hexagon/v60-cur.ll | 5 +- .../Hexagon/vect/vect-extract-i1-debug.ll | 14 + test/CodeGen/Hexagon/vect/vect-infloop.ll | 10 +- test/CodeGen/Mips/llvm-ir/extractelement.ll | 2 +- test/CodeGen/Mips/long-call-mcount.ll | 19 + .../Mips/sll-micromips-r6-encoding.mir | 46 + test/CodeGen/PowerPC/cmp_elimination.ll | 32 +- .../CodeGen/PowerPC/uint-to-ppcfp128-crash.ll | 15 + .../PowerPC/variable_elem_vec_extracts.ll | 6 +- test/CodeGen/Thumb2/t2sizereduction.mir | 83 + test/CodeGen/X86/avg-mask.ll | 24 +- test/CodeGen/X86/avg.ll | 2975 ++++------------- test/CodeGen/X86/avx512-calling-conv.ll | 88 +- test/CodeGen/X86/avx512-ext.ll | 21 +- .../avx512-extract-subvector-load-store.ll | 136 +- test/CodeGen/X86/avx512-insert-extract.ll | 163 +- test/CodeGen/X86/avx512-insert-extract_i1.ll | 3 +- test/CodeGen/X86/avx512-mask-op.ll | 22 +- test/CodeGen/X86/avx512-schedule.ll | 12 +- .../X86/avx512-shuffles/partial_permute.ll | 299 +- test/CodeGen/X86/avx512-skx-insert-subvec.ll | 7 +- test/CodeGen/X86/avx512-vec-cmp.ll | 5 +- test/CodeGen/X86/avx512vl-vec-masked-cmp.ll | 186 +- test/CodeGen/X86/bitcast-and-setcc-128.ll | 24 +- test/CodeGen/X86/bitcast-setcc-128.ll | 12 +- test/CodeGen/X86/combine-and.ll | 10 + test/CodeGen/X86/combine-or.ll | 20 +- test/CodeGen/X86/darwin-bzero.ll | 5 +- test/CodeGen/X86/extractelement-index.ll | 12 +- test/CodeGen/X86/fma-fneg-combine.ll | 166 +- test/CodeGen/X86/fmsubadd-combine.ll | 96 +- test/CodeGen/X86/fold-vector-sext-crash.ll | 6 +- test/CodeGen/X86/horizontal-reduce-smax.ll | 284 +- test/CodeGen/X86/horizontal-reduce-smin.ll | 284 +- test/CodeGen/X86/horizontal-reduce-umax.ll | 284 +- test/CodeGen/X86/horizontal-reduce-umin.ll | 170 +- test/CodeGen/X86/known-bits-vector.ll | 14 +- .../X86/machinesink-merge-debuginfo.ll | 60 +- .../CodeGen/X86/machinesink-null-debuginfo.ll | 10 +- test/CodeGen/X86/masked_gather_scatter.ll | 121 +- test/CodeGen/X86/popcnt.ll | 2 +- test/CodeGen/X86/prefetch.ll | 102 +- .../X86/shuffle-strided-with-offset-128.ll | 81 +- .../X86/shuffle-strided-with-offset-256.ll | 149 +- .../X86/shuffle-strided-with-offset-512.ll | 48 +- test/CodeGen/X86/shuffle-vs-trunc-128.ll | 55 +- test/CodeGen/X86/shuffle-vs-trunc-256.ll | 184 +- test/CodeGen/X86/shuffle-vs-trunc-512.ll | 5 +- test/CodeGen/X86/var-permute-256.ll | 270 +- test/CodeGen/X86/var-permute-512.ll | 208 +- test/CodeGen/X86/vector-compare-results.ll | 56 +- test/CodeGen/X86/vector-half-conversions.ll | 511 +-- test/CodeGen/X86/vector-rotate-128.ll | 76 +- test/CodeGen/X86/vector-shift-ashr-128.ll | 154 +- test/CodeGen/X86/vector-shift-lshr-128.ll | 154 +- test/CodeGen/X86/vector-shift-shl-128.ll | 154 +- test/CodeGen/X86/vector-shuffle-128-v16.ll | 113 +- test/CodeGen/X86/vector-shuffle-128-v8.ll | 397 ++- test/CodeGen/X86/vector-shuffle-256-v16.ll | 849 +++-- test/CodeGen/X86/vector-shuffle-256-v32.ll | 944 ++++-- test/CodeGen/X86/vector-shuffle-256-v4.ll | 229 +- test/CodeGen/X86/vector-shuffle-256-v8.ll | 264 +- test/CodeGen/X86/vector-shuffle-512-v32.ll | 18 +- test/CodeGen/X86/vector-shuffle-v1.ll | 185 +- .../X86/vector-shuffle-variable-128.ll | 600 ++-- test/CodeGen/X86/vector-trunc.ll | 62 +- test/CodeGen/X86/vector-zext.ll | 30 +- test/CodeGen/X86/vselect.ll | 6 +- .../DebugInfo/X86/dwarfdump-str-offsets-dwp.s | 173 +- .../X86/dwarfdump-str-offsets-invalid-3.s | 4 +- .../X86/dwarfdump-str-offsets-invalid-4.s | 5 +- 
.../X86/dwarfdump-str-offsets-invalid-6.s | 94 + .../X86/dwarfdump-str-offsets-macho.s | 34 +- test/DebugInfo/X86/dwarfdump-str-offsets.s | 66 +- .../HWAddressSanitizer/basic.ll | 72 +- .../HWAddressSanitizer/with-calls.ll | 39 +- test/MC/AArch64/arm64-system-encoding.s | 3 + test/MC/AArch64/basic-a64-diagnostics.s | 4 + test/MC/AArch64/dot-req.s | 10 + test/MC/AMDGPU/ds.s | 4 + test/MC/AMDGPU/expressions.s | 8 + .../AMDGPU/invalid-instructions-spellcheck.s | 48 + test/MC/AMDGPU/trap.s | 42 + test/MC/AMDGPU/vop1-gfx9-err.s | 12 +- test/MC/AMDGPU/vop3p-err.s | 6 +- test/MC/ARM/dfb-neg.s | 10 + test/MC/ARM/dfb.s | 6 + test/MC/COFF/align-nops.s | 2 +- .../AArch64/basic-a64-instructions.txt | 3 + test/MC/Disassembler/AMDGPU/ds_vi.txt | 3 + test/MC/Disassembler/AMDGPU/trap_gfx9.txt | 32 + test/MC/Disassembler/AMDGPU/trap_vi.txt | 16 + test/MC/Disassembler/ARM/dfb-arm.txt | 6 + test/MC/Disassembler/ARM/dfb-thumb.txt | 6 + test/MC/Disassembler/X86/x86-32.txt | 3 + test/MC/ELF/align-nops.s | 2 +- test/MC/MachO/x86_32-optimal_nop.s | 2 +- test/MC/Mips/eva/invalid.s | 47 +- test/MC/WebAssembly/weak-alias.ll | 173 +- test/MC/X86/3DNow.s | 2 + .../X86/AlignedBundling/different-sections.s | 4 +- test/MC/X86/AlignedBundling/long-nop-pad.s | 4 +- .../AlignedBundling/misaligned-bundle-group.s | 4 +- .../X86/AlignedBundling/misaligned-bundle.s | 4 +- .../AlignedBundling/pad-align-to-bundle-end.s | 4 +- .../X86/AlignedBundling/pad-bundle-groups.s | 4 +- .../AlignedBundling/relax-in-bundle-group.s | 4 +- .../AlignedBundling/single-inst-bundling.s | 4 +- test/MC/X86/CLFLUSHOPT-32.s | 26 + test/MC/X86/CLFLUSHOPT-64.s | 26 + test/MC/X86/CLFSH-32.s | 26 + test/MC/X86/CLFSH-64.s | 26 + test/MC/X86/x86_long_nop.s | 8 +- test/TableGen/GlobalISelEmitter.td | 53 +- test/TableGen/intrinsic-long-name.td | 2 + test/TableGen/intrinsic-struct.td | 2 + test/TableGen/intrinsic-varargs.td | 2 + test/ThinLTO/X86/cache.ll | 22 +- .../callsite-no-or-structure.ll | 139 + .../callsite-no-splitting.ll | 18 + test/Transforms/CodeGenPrepare/section.ll | 47 +- test/Transforms/GVN/tbaa.ll | 61 +- test/Transforms/Inline/AArch64/binop.ll | 291 ++ test/Transforms/Inline/ARM/inline-fp.ll | 113 + test/Transforms/Inline/inline-fp.ll | 137 - test/Transforms/Inline/redundant-loads.ll | 18 + .../InstCombine/2011-09-03-Trampoline.ll | 23 +- test/Transforms/JumpThreading/guards.ll | 103 + .../LoopVectorize/legal_preheader_check.ll | 27 + .../MemCpyOpt/memcpy-invoke-memcpy.ll | 48 + .../Transforms/MemCpyOpt/merge-into-memset.ll | 45 + test/Transforms/MemCpyOpt/mixed-sizes.ll | 36 + .../MemCpyOpt/nonlocal-memcpy-memcpy.ll | 114 + test/Transforms/NewGVN/tbaa.ll | 61 +- .../PGOProfile/icp_covariant_call_return.ll | 3 +- .../PGOProfile/icp_covariant_invoke_return.ll | 9 +- test/Transforms/PGOProfile/icp_invoke.ll | 17 +- .../Transforms/PGOProfile/icp_invoke_nouse.ll | 3 +- test/Transforms/PGOProfile/icp_vararg.ll | 3 +- .../PGOProfile/indirect_call_promotion.ll | 3 +- .../X86/jumbled-load-multiuse.ll | 24 +- .../X86/jumbled-load-shuffle-placement.ll | 125 - .../X86/jumbled-load-used-in-phi.ll | 225 -- .../SLPVectorizer/X86/jumbled-load.ll | 37 +- .../SLPVectorizer/X86/store-jumbled.ll | 25 +- test/Transforms/SampleProfile/entry_counts.ll | 4 +- .../SimplifyCFG/X86/if-conversion.ll | 231 ++ test/tools/llvm-cvtres/machine.test | 56 +- test/tools/llvm-cvtres/symbols.test | 12 +- test/tools/llvm-dwarfdump/X86/lookup.s | 7 +- .../llvm-objcopy/add-section-remove.test | 36 + test/tools/llvm-objcopy/add-section.test | 37 + 
test/tools/llvm-readobj/mips-got.test | 136 +- test/tools/llvm-readobj/mips-plt.test | 32 + tools/llvm-dwarfdump/llvm-dwarfdump.cpp | 3 +- tools/llvm-lto/llvm-lto.cpp | 4 + tools/llvm-objcopy/Object.cpp | 12 + tools/llvm-objcopy/Object.h | 15 + tools/llvm-objcopy/llvm-objcopy.cpp | 22 +- tools/llvm-readobj/ELFDumper.cpp | 718 ++-- tools/llvm-readobj/WasmDumper.cpp | 10 +- tools/opt-viewer/optrecord.py | 2 +- unittests/ADT/APFloatTest.cpp | 17 + unittests/ADT/StringRefTest.cpp | 7 +- unittests/CodeGen/MachineOperandTest.cpp | 63 + unittests/ExecutionEngine/Orc/CMakeLists.txt | 7 +- unittests/Support/CachePruningTest.cpp | 4 +- unittests/Support/MemoryBufferTest.cpp | 40 +- unittests/Support/TargetParserTest.cpp | 6 + unittests/Support/YAMLIOTest.cpp | 103 +- utils/TableGen/CMakeLists.txt | 1 + utils/TableGen/CodeGenDAGPatterns.cpp | 94 +- utils/TableGen/CodeGenDAGPatterns.h | 13 +- utils/TableGen/CodeGenIntrinsics.h | 8 + utils/TableGen/CodeGenTarget.cpp | 7 + utils/TableGen/CodeGenTarget.h | 20 +- utils/TableGen/GlobalISelEmitter.cpp | 115 +- utils/TableGen/IntrinsicEmitter.cpp | 1 + utils/TableGen/SDNodeProperties.cpp | 49 + utils/TableGen/SDNodeProperties.h | 40 + utils/docker/build_docker_image.sh | 6 +- utils/docker/scripts/build_install_llvm.sh | 39 + utils/git-svn/git-llvm | 11 +- utils/update_mir_test_checks.py | 63 +- 423 files changed, 14393 insertions(+), 10359 deletions(-) rename include/llvm/BinaryFormat/{WasmRelocs/WebAssembly.def => WasmRelocs.def} (100%) rename include/llvm/CodeGen/{LiveStackAnalysis.h => LiveStacks.h} (94%) create mode 100644 include/llvm/CodeGen/SDNodeProperties.td rename lib/CodeGen/{LiveStackAnalysis.cpp => LiveStacks.cpp} (96%) create mode 100644 test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir create mode 100644 test/CodeGen/AArch64/chkstk.ll create mode 100644 test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll create mode 100644 test/CodeGen/ARM/su-addsub-overflow.ll create mode 100644 test/CodeGen/ARM/usat.ll create mode 100644 test/CodeGen/BPF/objdump_imm_hex.ll create mode 100644 test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll create mode 100644 test/CodeGen/Hexagon/autohvx/isel-bool-vector.ll create mode 100644 test/CodeGen/Hexagon/autohvx/isel-select-const.ll create mode 100644 test/CodeGen/Hexagon/vect/vect-extract-i1-debug.ll create mode 100644 test/CodeGen/Mips/long-call-mcount.ll create mode 100644 test/CodeGen/Mips/sll-micromips-r6-encoding.mir create mode 100644 test/CodeGen/PowerPC/uint-to-ppcfp128-crash.ll create mode 100644 test/CodeGen/Thumb2/t2sizereduction.mir create mode 100644 test/DebugInfo/X86/dwarfdump-str-offsets-invalid-6.s create mode 100644 test/MC/AMDGPU/invalid-instructions-spellcheck.s create mode 100644 test/MC/ARM/dfb-neg.s create mode 100644 test/MC/ARM/dfb.s create mode 100644 test/MC/Disassembler/ARM/dfb-arm.txt create mode 100644 test/MC/Disassembler/ARM/dfb-thumb.txt create mode 100644 test/MC/X86/CLFLUSHOPT-32.s create mode 100644 test/MC/X86/CLFLUSHOPT-64.s create mode 100644 test/MC/X86/CLFSH-32.s create mode 100644 test/MC/X86/CLFSH-64.s create mode 100644 test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll create mode 100644 test/Transforms/CallSiteSplitting/callsite-no-splitting.ll create mode 100644 test/Transforms/Inline/AArch64/binop.ll create mode 100644 test/Transforms/Inline/ARM/inline-fp.ll delete mode 100644 test/Transforms/Inline/inline-fp.ll create mode 100644 test/Transforms/LoopVectorize/legal_preheader_check.ll create mode 100644 
test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll
 create mode 100644 test/Transforms/MemCpyOpt/merge-into-memset.ll
 create mode 100644 test/Transforms/MemCpyOpt/mixed-sizes.ll
 create mode 100644 test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll
 delete mode 100644 test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
 delete mode 100644 test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
 create mode 100644 test/Transforms/SimplifyCFG/X86/if-conversion.ll
 create mode 100644 test/tools/llvm-objcopy/add-section-remove.test
 create mode 100644 test/tools/llvm-objcopy/add-section.test
 create mode 100644 utils/TableGen/SDNodeProperties.cpp
 create mode 100644 utils/TableGen/SDNodeProperties.h

diff --git a/docs/Extensions.rst b/docs/Extensions.rst
index 14fea30204b4..32eeadd78ba6 100644
--- a/docs/Extensions.rst
+++ b/docs/Extensions.rst
@@ -288,3 +288,31 @@ standard stack probe emission.
 
 The MSVC environment does not emit code for VLAs currently.
 
+Windows on ARM64
+----------------
+
+Stack Probe Emission
+^^^^^^^^^^^^^^^^^^^^
+
+The reference implementation (Microsoft Visual Studio 2017) emits stack probes
+in the following fashion:
+
+.. code-block:: gas
+
+   mov x15, #constant
+   bl __chkstk
+   sub sp, sp, x15, lsl #4
+
+However, the ``bl`` in this sequence limits the reach to 256 MiB (±128 MiB);
+the allocation size itself is passed in ``x15`` as a count of 16-byte units,
+hence the ``lsl #4``. In order to accommodate larger binaries, LLVM supports
+the use of ``-mcode-model=large`` to allow an 8 GiB (±4 GiB) range via a
+slight deviation. It will generate an indirect jump as follows:
+
+.. code-block:: gas
+
+   mov x15, #constant
+   adrp x16, __chkstk
+   add x16, x16, :lo12:__chkstk
+   blr x16
+   sub sp, sp, x15, lsl #4
+
diff --git a/docs/MIRLangRef.rst b/docs/MIRLangRef.rst
index f170c7210879..1176435c8761 100644
--- a/docs/MIRLangRef.rst
+++ b/docs/MIRLangRef.rst
@@ -692,6 +692,50 @@ The syntax is:
 
     EH_LABEL
 
+CFIIndex Operands
+^^^^^^^^^^^^^^^^^
+
+A CFI Index operand holds an index into a per-function side-table,
+``MachineFunction::getFrameInstructions()``, which references all the frame
+instructions in a ``MachineFunction``. A ``CFI_INSTRUCTION`` may look like it
+contains multiple operands, but the only operand it contains is the CFI Index.
+The other operands are tracked by the ``MCCFIInstruction`` object.
+
+The syntax is:
+
+.. code-block:: text
+
+    CFI_INSTRUCTION offset %w30, -16
+
+which may be emitted later in the MC layer as:
+
+.. code-block:: text
+
+    .cfi_offset w30, -16
+
+IntrinsicID Operands
+^^^^^^^^^^^^^^^^^^^^
+
+An Intrinsic ID operand contains a generic intrinsic ID or a target-specific
+ID.
+
+The syntax for the ``returnaddress`` intrinsic is:
+
+.. code-block:: text
+
+   %x0 = COPY intrinsic(@llvm.returnaddress)
+
+Predicate Operands
+^^^^^^^^^^^^^^^^^^
+
+A Predicate operand contains an IR predicate from ``CmpInst::Predicate``,
+such as ``ICMP_EQ``.
+
+For the integer equality predicate ``ICMP_EQ``, the syntax is:
+
+.. code-block:: text
+
+   %2:gpr(s32) = G_ICMP intpred(eq), %0, %1
+
 .. TODO: Describe the parsers default behaviour when optional YAML attributes are missing.
 .. TODO: Describe the syntax for the bundled instructions.
@@ -702,7 +746,6 @@ The syntax is:
 .. TODO: Describe the syntax of the stack object machine operands and their YAML definitions.
 .. TODO: Describe the syntax of the block address machine operands.
-.. TODO: Describe the syntax of the CFI index machine operands.
 .. TODO: Describe the syntax of the metadata machine operands, and the instructions debug location attribute.
 .. TODO: Describe the syntax of the register live out machine operands.
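The CFI Index side-table is easiest to see from the emitting side. A minimal
C++ sketch, assuming the standard ``MachineFunction``/``MCCFIInstruction``/
``BuildMI`` APIs; the helper name and register argument are illustrative, not
taken from this patch:

.. code-block:: c++

    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/MachineInstrBuilder.h"
    #include "llvm/CodeGen/TargetInstrInfo.h"
    #include "llvm/MC/MCDwarf.h"

    using namespace llvm;

    // Illustrative helper (not from the patch): record ".cfi_offset <reg>, -16".
    // The MCCFIInstruction itself is stored in the per-function side-table
    // MachineFunction::getFrameInstructions(); the CFI_INSTRUCTION machine
    // instruction carries only the returned index as its operand.
    static void emitCFIOffset(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I, const DebugLoc &DL,
                              const TargetInstrInfo &TII, unsigned DwarfReg) {
      unsigned CFIIndex = MF.addFrameInst(
          MCCFIInstruction::createOffset(/*Label=*/nullptr, DwarfReg, -16));
      BuildMI(MBB, I, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
          .addCFIIndex(CFIIndex);
    }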
diff --git a/docs/tutorial/LangImpl09.rst b/docs/tutorial/LangImpl09.rst index fe5a95a5769e..d81f9fa0001c 100644 --- a/docs/tutorial/LangImpl09.rst +++ b/docs/tutorial/LangImpl09.rst @@ -197,7 +197,7 @@ expressions: if (DblTy) return DblTy; - DblTy = DBuilder->createBasicType("double", 64, 64, dwarf::DW_ATE_float); + DblTy = DBuilder->createBasicType("double", 64, dwarf::DW_ATE_float); return DblTy; } @@ -208,7 +208,8 @@ And then later on in ``main`` when we're constructing our module: DBuilder = new DIBuilder(*TheModule); KSDbgInfo.TheCU = DBuilder->createCompileUnit( - dwarf::DW_LANG_C, "fib.ks", ".", "Kaleidoscope Compiler", 0, "", 0); + dwarf::DW_LANG_C, DBuilder->createFile("fib.ks", "."), + "Kaleidoscope Compiler", 0, "", 0); There are a couple of things to note here. First, while we're producing a compile unit for a language called Kaleidoscope we used the language diff --git a/examples/Kaleidoscope/CMakeLists.txt b/examples/Kaleidoscope/CMakeLists.txt index 543b9f73b4fe..3822cdd9e1c4 100644 --- a/examples/Kaleidoscope/CMakeLists.txt +++ b/examples/Kaleidoscope/CMakeLists.txt @@ -14,3 +14,4 @@ add_subdirectory(Chapter5) add_subdirectory(Chapter6) add_subdirectory(Chapter7) add_subdirectory(Chapter8) +add_subdirectory(Chapter9) diff --git a/examples/Kaleidoscope/Chapter9/toy.cpp b/examples/Kaleidoscope/Chapter9/toy.cpp index 1b13e45ec460..821cf4d25a65 100644 --- a/examples/Kaleidoscope/Chapter9/toy.cpp +++ b/examples/Kaleidoscope/Chapter9/toy.cpp @@ -823,7 +823,7 @@ DIType *DebugInfo::getDoubleTy() { if (DblTy) return DblTy; - DblTy = DBuilder->createBasicType("double", 64, 64, dwarf::DW_ATE_float); + DblTy = DBuilder->createBasicType("double", 64, dwarf::DW_ATE_float); return DblTy; } @@ -1436,7 +1436,8 @@ int main() { // Currently down as "fib.ks" as a filename since we're redirecting stdin // but we'd like actual source locations. KSDbgInfo.TheCU = DBuilder->createCompileUnit( - dwarf::DW_LANG_C, "fib.ks", ".", "Kaleidoscope Compiler", 0, "", 0); + dwarf::DW_LANG_C, DBuilder->createFile("fib.ks", "."), + "Kaleidoscope Compiler", 0, "", 0); // Run the main "interpreter loop" now. MainLoop(); diff --git a/include/llvm-c/lto.h b/include/llvm-c/lto.h index 8d45b7832041..55f3e46c45ed 100644 --- a/include/llvm-c/lto.h +++ b/include/llvm-c/lto.h @@ -757,17 +757,17 @@ extern void thinlto_codegen_add_cross_referenced_symbol(thinlto_code_gen_t cg, * @ingroup LLVMCTLTO * * These entry points control the ThinLTO cache. The cache is intended to - * support incremental build, and thus needs to be persistent accross build. - * The client enabled the cache by supplying a path to an existing directory. + * support incremental builds, and thus needs to be persistent across builds. + * The client enables the cache by supplying a path to an existing directory. * The code generator will use this to store objects files that may be reused * during a subsequent build. * To avoid filling the disk space, a few knobs are provided: - * - The pruning interval limit the frequency at which the garbage collector - * will try to scan the cache directory to prune it from expired entries. - * Setting to -1 disable the pruning (default). + * - The pruning interval limits the frequency at which the garbage collector + * will try to scan the cache directory to prune expired entries. + * Setting to a negative number disables the pruning. * - The pruning expiration time indicates to the garbage collector how old an * entry needs to be to be removed. 
- * - Finally, the garbage collector can be instructed to prune the cache until
+ * - Finally, the garbage collector can be instructed to prune the cache until
  * the occupied space goes below a threshold.
  * @{
  */
@@ -782,7 +782,7 @@ extern void thinlto_codegen_set_cache_dir(thinlto_code_gen_t cg,
                                           const char *cache_dir);
 
 /**
- * Sets the cache pruning interval (in seconds). A negative value disable the
+ * Sets the cache pruning interval (in seconds). A negative value disables the
  * pruning. An unspecified default value will be applied, and a value of 0 will
  * be ignored.
  *
diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index 9de075dfd681..362096b08e13 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -95,46 +95,81 @@ enum AliasResult {
 ///
 /// This is no access at all, a modification, a reference, or both
 /// a modification and a reference. These are specifically structured such that
-/// they form a two bit matrix and bit-tests for 'mod' or 'ref'
+/// they form a three bit matrix and bit-tests for 'mod' or 'ref' or 'must'
 /// work with any of the possible values.
-
 enum class ModRefInfo {
+  /// Must is provided for completeness, but no routines will return only
+  /// Must today. See the definition of Must below.
+  Must = 0,
+  /// The access may reference the value stored in memory,
+  /// a mustAlias relation was found, and no mayAlias or partialAlias found.
+  MustRef = 1,
+  /// The access may modify the value stored in memory,
+  /// a mustAlias relation was found, and no mayAlias or partialAlias found.
+  MustMod = 2,
+  /// The access may reference, modify, or both reference and modify the value
+  /// stored in memory; a mustAlias relation was found, and no mayAlias or
+  /// partialAlias found.
+  MustModRef = MustRef | MustMod,
   /// The access neither references nor modifies the value stored in memory.
-  NoModRef = 0,
+  NoModRef = 4,
   /// The access may reference the value stored in memory.
-  Ref = 1,
+  Ref = NoModRef | MustRef,
   /// The access may modify the value stored in memory.
-  Mod = 2,
+  Mod = NoModRef | MustMod,
   /// The access may reference and may modify the value stored in memory.
   ModRef = Ref | Mod,
+
+  /// About Must:
+  /// Must is set in a best-effort manner.
+  /// We usually do not try our best to infer Must; instead it is merely
+  /// another piece of "free" information that is presented when available.
+  /// Must being set means a MustAlias was certainly found. For calls,
+  /// where multiple arguments are checked (argmemonly), this means only
+  /// MustAlias or NoAlias results were found.
+  /// Must is not set for RAR accesses, even if the two locations must
+  /// alias. The reason is that two read accesses translate to an early return
+  /// of NoModRef. An additional alias check to set Must may be
+  /// expensive. Other cases may also not set Must (e.g. callCapturesBefore).
+  /// We refer to Must being *set* when the most significant bit is *cleared*.
+  /// Conversely, we *clear* Must information by *setting* the Must bit to 1.
 };
 
 LLVM_NODISCARD inline bool isNoModRef(const ModRefInfo MRI) {
-  return MRI == ModRefInfo::NoModRef;
+  return (static_cast<int>(MRI) & static_cast<int>(ModRefInfo::MustModRef)) ==
+         static_cast<int>(ModRefInfo::Must);
 }
 LLVM_NODISCARD inline bool isModOrRefSet(const ModRefInfo MRI) {
-  return static_cast<int>(MRI) & static_cast<int>(ModRefInfo::ModRef);
+  return static_cast<int>(MRI) & static_cast<int>(ModRefInfo::MustModRef);
 }
 LLVM_NODISCARD inline bool isModAndRefSet(const ModRefInfo MRI) {
-  return (static_cast<int>(MRI) & static_cast<int>(ModRefInfo::ModRef)) ==
-         static_cast<int>(ModRefInfo::ModRef);
+  return (static_cast<int>(MRI) & static_cast<int>(ModRefInfo::MustModRef)) ==
+         static_cast<int>(ModRefInfo::MustModRef);
 }
 LLVM_NODISCARD inline bool isModSet(const ModRefInfo MRI) {
-  return static_cast<int>(MRI) & static_cast<int>(ModRefInfo::Mod);
+  return static_cast<int>(MRI) & static_cast<int>(ModRefInfo::MustMod);
 }
 LLVM_NODISCARD inline bool isRefSet(const ModRefInfo MRI) {
-  return static_cast<int>(MRI) & static_cast<int>(ModRefInfo::Ref);
+  return static_cast<int>(MRI) & static_cast<int>(ModRefInfo::MustRef);
+}
+LLVM_NODISCARD inline bool isMustSet(const ModRefInfo MRI) {
+  return !(static_cast<int>(MRI) & static_cast<int>(ModRefInfo::NoModRef));
 }
 LLVM_NODISCARD inline ModRefInfo setMod(const ModRefInfo MRI) {
-  return ModRefInfo(static_cast<int>(MRI) | static_cast<int>(ModRefInfo::Mod));
+  return ModRefInfo(static_cast<int>(MRI) |
+                    static_cast<int>(ModRefInfo::MustMod));
 }
 LLVM_NODISCARD inline ModRefInfo setRef(const ModRefInfo MRI) {
-  return ModRefInfo(static_cast<int>(MRI) | static_cast<int>(ModRefInfo::Ref));
+  return ModRefInfo(static_cast<int>(MRI) |
+                    static_cast<int>(ModRefInfo::MustRef));
+}
+LLVM_NODISCARD inline ModRefInfo setMust(const ModRefInfo MRI) {
+  return ModRefInfo(static_cast<int>(MRI) &
+                    static_cast<int>(ModRefInfo::MustModRef));
 }
 LLVM_NODISCARD inline ModRefInfo setModAndRef(const ModRefInfo MRI) {
   return ModRefInfo(static_cast<int>(MRI) |
-                    static_cast<int>(ModRefInfo::ModRef));
+                    static_cast<int>(ModRefInfo::MustModRef));
 }
 LLVM_NODISCARD inline ModRefInfo clearMod(const ModRefInfo MRI) {
   return ModRefInfo(static_cast<int>(MRI) & static_cast<int>(ModRefInfo::Ref));
@@ -142,6 +177,10 @@ LLVM_NODISCARD inline ModRefInfo clearMod(const ModRefInfo MRI) {
 LLVM_NODISCARD inline ModRefInfo clearRef(const ModRefInfo MRI) {
   return ModRefInfo(static_cast<int>(MRI) & static_cast<int>(ModRefInfo::Mod));
 }
+LLVM_NODISCARD inline ModRefInfo clearMust(const ModRefInfo MRI) {
+  return ModRefInfo(static_cast<int>(MRI) |
+                    static_cast<int>(ModRefInfo::NoModRef));
+}
 LLVM_NODISCARD inline ModRefInfo unionModRef(const ModRefInfo MRI1,
                                              const ModRefInfo MRI2) {
   return ModRefInfo(static_cast<int>(MRI1) | static_cast<int>(MRI2));
@@ -160,11 +199,11 @@ enum FunctionModRefLocation {
   /// Base case is no access to memory.
   FMRL_Nowhere = 0,
   /// Access to memory via argument pointers.
-  FMRL_ArgumentPointees = 4,
+  FMRL_ArgumentPointees = 8,
   /// Memory that is inaccessible via LLVM IR.
-  FMRL_InaccessibleMem = 8,
+  FMRL_InaccessibleMem = 16,
   /// Access to any memory.
-  FMRL_Anywhere = 16 | FMRL_InaccessibleMem | FMRL_ArgumentPointees
+  FMRL_Anywhere = 32 | FMRL_InaccessibleMem | FMRL_ArgumentPointees
 };
 
 /// Summary of how a function affects memory in the program.
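The inverted encoding takes a moment to internalize: Must information is *set*
by *clearing* the ``NoModRef`` bit. A minimal sketch of how the helpers above
compose, assuming only the definitions shown in this hunk:

.. code-block:: c++

    #include "llvm/Analysis/AliasAnalysis.h"
    #include <cassert>

    using namespace llvm;

    // Illustrative only: walks through the new three-bit encoding.
    void modRefEncodingExamples() {
      // Plain Mod == NoModRef | MustMod: mod info is known, Must is not claimed.
      assert(isModSet(ModRefInfo::Mod) && !isMustSet(ModRefInfo::Mod));
      // MustMod leaves the NoModRef bit clear, so Must is claimed as well.
      assert(isModSet(ModRefInfo::MustMod) && isMustSet(ModRefInfo::MustMod));
      // setMust() clears the NoModRef bit; clearMust() sets it again.
      assert(isMustSet(setMust(ModRefInfo::Mod)));
      assert(!isMustSet(clearMust(ModRefInfo::MustMod)));
    }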
@@ -344,7 +383,7 @@ class AAResults {
   /// result's bits are set to indicate the allowed aliasing ModRef kinds. Note
   /// that these bits do not necessarily account for the overall behavior of
   /// the function, but rather only provide additional per-argument
-  /// information.
+  /// information. This never sets ModRefInfo::Must.
   ModRefInfo getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx);
 
   /// Return the behavior of the given call site.
@@ -624,6 +663,8 @@ class AAResults {
   /// or reads the specified memory location \p MemLoc before instruction \p I
   /// in a BasicBlock. An ordered basic block \p OBB can be used to speed up
   /// instruction ordering queries inside the BasicBlock containing \p I.
+  /// Early exits in callCapturesBefore may lead to ModRefInfo::Must not being
+  /// set.
   ModRefInfo callCapturesBefore(const Instruction *I,
                                 const MemoryLocation &MemLoc, DominatorTree *DT,
                                 OrderedBasicBlock *OBB = nullptr);
diff --git a/include/llvm/Analysis/AliasAnalysisEvaluator.h b/include/llvm/Analysis/AliasAnalysisEvaluator.h
index 214574852655..cd2f631a01f4 100644
--- a/include/llvm/Analysis/AliasAnalysisEvaluator.h
+++ b/include/llvm/Analysis/AliasAnalysisEvaluator.h
@@ -35,19 +35,23 @@ class AAEvaluator : public PassInfoMixin<AAEvaluator> {
   int64_t FunctionCount;
   int64_t NoAliasCount, MayAliasCount, PartialAliasCount, MustAliasCount;
   int64_t NoModRefCount, ModCount, RefCount, ModRefCount;
+  int64_t MustCount, MustRefCount, MustModCount, MustModRefCount;
 
 public:
   AAEvaluator()
       : FunctionCount(), NoAliasCount(), MayAliasCount(), PartialAliasCount(),
         MustAliasCount(), NoModRefCount(), ModCount(), RefCount(),
-        ModRefCount() {}
+        ModRefCount(), MustCount(), MustRefCount(), MustModCount(),
+        MustModRefCount() {}
   AAEvaluator(AAEvaluator &&Arg)
       : FunctionCount(Arg.FunctionCount), NoAliasCount(Arg.NoAliasCount),
         MayAliasCount(Arg.MayAliasCount),
         PartialAliasCount(Arg.PartialAliasCount),
         MustAliasCount(Arg.MustAliasCount), NoModRefCount(Arg.NoModRefCount),
         ModCount(Arg.ModCount), RefCount(Arg.RefCount),
-        ModRefCount(Arg.ModRefCount) {
+        ModRefCount(Arg.ModRefCount), MustCount(Arg.MustCount),
+        MustRefCount(Arg.MustRefCount), MustModCount(Arg.MustModCount),
+        MustModRefCount(Arg.MustModRefCount) {
     Arg.FunctionCount = 0;
   }
   ~AAEvaluator();
diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h
index 54f151ef82e2..28154c873b70 100644
--- a/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -667,21 +667,6 @@ int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp,
                      const ValueToValueMap &StridesMap = ValueToValueMap(),
                      bool Assume = false, bool ShouldCheckWrap = true);
 
-/// \brief Attempt to sort the 'loads' in \p VL and return the sorted values in
-/// \p Sorted.
-///
-/// Returns 'false' if sorting is not legal or feasible, otherwise returns
-/// 'true'. If \p Mask is not null, it also returns the \p Mask which is the
-/// shuffle mask for actual memory access order.
-///
-/// For example, for a given VL of memory accesses in program order, a[i+2],
-/// a[i+0], a[i+1] and a[i+3], this function will sort the VL and save the
-/// sorted value in 'Sorted' as a[i+0], a[i+1], a[i+2], a[i+3] and saves the
-/// mask for actual memory accesses in program order in 'Mask' as <2,0,1,3>
-bool sortLoadAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
-                      ScalarEvolution &SE, SmallVectorImpl<Value *> &Sorted,
-                      SmallVectorImpl<unsigned> *Mask = nullptr);
-
 /// \brief Returns true if the memory operations \p A and \p B are consecutive.
 /// This is a simple API that does not depend on the analysis pass.
 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
diff --git a/include/llvm/Analysis/MemoryDependenceAnalysis.h b/include/llvm/Analysis/MemoryDependenceAnalysis.h
index c2974525a6ff..391a333594e9 100644
--- a/include/llvm/Analysis/MemoryDependenceAnalysis.h
+++ b/include/llvm/Analysis/MemoryDependenceAnalysis.h
@@ -407,6 +407,12 @@ class MemoryDependenceResults {
   void getNonLocalPointerDependency(Instruction *QueryInst,
                                     SmallVectorImpl<NonLocalDepResult> &Result);
 
+  /// Perform a dependency query specifically for QueryInst's access to Loc.
+  /// The other comments for getNonLocalPointerDependency apply here as well.
+  void getNonLocalPointerDependencyFrom(Instruction *QueryInst,
+                                        const MemoryLocation &Loc, bool isLoad,
+                                        SmallVectorImpl<NonLocalDepResult> &Result);
+
   /// Removes an instruction from the dependence analysis, updating the
   /// dependence of instructions that previously depended on it.
   void removeInstruction(Instruction *InstToRemove);
diff --git a/include/llvm/Analysis/ProfileSummaryInfo.h b/include/llvm/Analysis/ProfileSummaryInfo.h
index bd7b00374821..293033458429 100644
--- a/include/llvm/Analysis/ProfileSummaryInfo.h
+++ b/include/llvm/Analysis/ProfileSummaryInfo.h
@@ -92,12 +92,12 @@ class ProfileSummaryInfo {
   bool hasHugeWorkingSetSize();
   /// \brief Returns true if \p F has hot function entry.
   bool isFunctionEntryHot(const Function *F);
-  /// Returns true if \p F has hot function entry or hot call edge.
-  bool isFunctionHotInCallGraph(const Function *F);
+  /// Returns true if \p F contains hot code.
+  bool isFunctionHotInCallGraph(const Function *F, BlockFrequencyInfo &BFI);
   /// \brief Returns true if \p F has cold function entry.
   bool isFunctionEntryCold(const Function *F);
-  /// Returns true if \p F has cold function entry or cold call edge.
-  bool isFunctionColdInCallGraph(const Function *F);
+  /// Returns true if \p F contains only cold code.
+  bool isFunctionColdInCallGraph(const Function *F, BlockFrequencyInfo &BFI);
   /// \brief Returns true if \p F is a hot function.
   bool isHotCount(uint64_t C);
   /// \brief Returns true if count \p C is considered cold.
diff --git a/include/llvm/Analysis/ScalarEvolutionExpander.h b/include/llvm/Analysis/ScalarEvolutionExpander.h
index 4578e0da8ab2..3df04e98bd24 100644
--- a/include/llvm/Analysis/ScalarEvolutionExpander.h
+++ b/include/llvm/Analysis/ScalarEvolutionExpander.h
@@ -47,7 +47,7 @@ namespace llvm {
     ScalarEvolution &SE;
     const DataLayout &DL;
 
-    // New instructions receive a name to identifies them with the current pass.
+    // New instructions receive a name to identify them with the current pass.
    const char* IVName;
 
     // InsertedExpressions caches Values for reuse, so must track RAUW.
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index c20f20cfbe4d..cecd8958e9d9 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -646,6 +646,9 @@ class TargetTransformInfo {
   /// \brief Additional properties of an operand's values.
   enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 };
 
+  /// \return True if the target can execute instructions out of order.
+  bool isOutOfOrder() const;
+
   /// \return The number of scalar or vector registers that the target has.
   /// If 'Vectors' is true, it returns the number of vector registers. If it is
   /// set to false, it returns the number of scalar registers.
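A sketch of how a client might consult the new ``isOutOfOrder()`` hook; the
unroll-factor heuristic is an illustrative assumption, not something this
patch adds:

.. code-block:: c++

    #include "llvm/Analysis/TargetTransformInfo.h"

    using namespace llvm;

    // Hypothetical heuristic: in-order cores rely on the compiler to hide
    // latency, so expose more ILP; out-of-order cores do that in hardware.
    static unsigned pickUnrollFactor(const TargetTransformInfo &TTI) {
      return TTI.isOutOfOrder() ? 2 : 4;
    }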
@@ -1018,6 +1021,7 @@ class TargetTransformInfo::Concept { Type *Ty) = 0; virtual int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty) = 0; + virtual bool isOutOfOrder() const = 0; virtual unsigned getNumberOfRegisters(bool Vector) = 0; virtual unsigned getRegisterBitWidth(bool Vector) const = 0; virtual unsigned getMinVectorRegisterBitWidth() = 0; @@ -1295,6 +1299,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { Type *Ty) override { return Impl.getIntImmCost(IID, Idx, Imm, Ty); } + bool isOutOfOrder() const override { + return Impl.isOutOfOrder(); + } unsigned getNumberOfRegisters(bool Vector) override { return Impl.getNumberOfRegisters(Vector); } diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 4c37402278ef..3625675d53de 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -337,6 +337,8 @@ class TargetTransformInfoImplBase { return TTI::TCC_Free; } + bool isOutOfOrder() const { return false; } + unsigned getNumberOfRegisters(bool Vector) { return 8; } unsigned getRegisterBitWidth(bool Vector) const { return 32; } diff --git a/include/llvm/BinaryFormat/Wasm.h b/include/llvm/BinaryFormat/Wasm.h index 506cd0393e9a..57a0b441821b 100644 --- a/include/llvm/BinaryFormat/Wasm.h +++ b/include/llvm/BinaryFormat/Wasm.h @@ -208,7 +208,7 @@ const unsigned WASM_SYMBOL_VISIBILITY_HIDDEN = 0x4; #define WASM_RELOC(name, value) name = value, enum : unsigned { -#include "WasmRelocs/WebAssembly.def" +#include "WasmRelocs.def" }; #undef WASM_RELOC diff --git a/include/llvm/BinaryFormat/WasmRelocs/WebAssembly.def b/include/llvm/BinaryFormat/WasmRelocs.def similarity index 100% rename from include/llvm/BinaryFormat/WasmRelocs/WebAssembly.def rename to include/llvm/BinaryFormat/WasmRelocs.def diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index bb5e7f9e8e30..f1f9275b0786 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -302,9 +302,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { } unsigned getFPOpCost(Type *Ty) { - // By default, FP instructions are no more expensive since they are - // implemented in HW. Target specific TTI can override this. - return TargetTransformInfo::TCC_Basic; + // Check whether FADD is available, as a proxy for floating-point in + // general. + const TargetLoweringBase *TLI = getTLI(); + EVT VT = TLI->getValueType(DL, Ty); + if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT)) + return TargetTransformInfo::TCC_Basic; + return TargetTransformInfo::TCC_Expensive; } unsigned getOperationCost(unsigned Opcode, Type *Ty, Type *OpTy) { @@ -398,6 +402,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return BaseT::getInstructionLatency(I); } + bool isOutOfOrder() const { + return getST()->getSchedModel().isOutOfOrder(); + } + /// @} /// \name Vector TTI Implementations diff --git a/include/llvm/CodeGen/GlobalISel/InstructionSelector.h b/include/llvm/CodeGen/GlobalISel/InstructionSelector.h index e599a1b179ec..4264a866b6c0 100644 --- a/include/llvm/CodeGen/GlobalISel/InstructionSelector.h +++ b/include/llvm/CodeGen/GlobalISel/InstructionSelector.h @@ -282,10 +282,6 @@ enum { /// Provides the logic to select generic machine instructions. 
class InstructionSelector { public: - using I64ImmediatePredicateFn = bool (*)(int64_t); - using APIntImmediatePredicateFn = bool (*)(const APInt &); - using APFloatImmediatePredicateFn = bool (*)(const APFloat &); - virtual ~InstructionSelector() = default; /// Select the (possibly generic) instruction \p I to only use target-specific @@ -319,9 +315,6 @@ class InstructionSelector { struct MatcherInfoTy { const LLT *TypeObjects; const PredicateBitset *FeatureBitsets; - const I64ImmediatePredicateFn *I64ImmPredicateFns; - const APIntImmediatePredicateFn *APIntImmPredicateFns; - const APFloatImmediatePredicateFn *APFloatImmPredicateFns; const ComplexMatcherMemFn *ComplexPredicates; }; @@ -340,6 +333,16 @@ class InstructionSelector { const RegisterBankInfo &RBI, const PredicateBitset &AvailableFeatures, CodeGenCoverage &CoverageInfo) const; + virtual bool testImmPredicate_I64(unsigned, int64_t) const { + llvm_unreachable("Subclasses must override this to use tablegen"); + } + virtual bool testImmPredicate_APInt(unsigned, const APInt &) const { + llvm_unreachable("Subclasses must override this to use tablegen"); + } + virtual bool testImmPredicate_APFloat(unsigned, const APFloat &) const { + llvm_unreachable("Subclasses must override this to use tablegen"); + } + /// Constrain a register operand of an instruction \p I to a specified /// register class. This could involve inserting COPYs before (for uses) or /// after (for defs) and may replace the operand of \p I. diff --git a/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h b/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h index ac2c055ab145..bf834cf8f5e3 100644 --- a/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h +++ b/include/llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h @@ -181,7 +181,7 @@ bool InstructionSelector::executeMatchTable( else llvm_unreachable("Expected Imm or CImm operand"); - if (!MatcherInfo.I64ImmPredicateFns[Predicate](Value)) + if (!testImmPredicate_I64(Predicate, Value)) if (handleReject() == RejectAndGiveUp) return false; break; @@ -202,7 +202,7 @@ bool InstructionSelector::executeMatchTable( else llvm_unreachable("Expected Imm or CImm operand"); - if (!MatcherInfo.APIntImmPredicateFns[Predicate](Value)) + if (!testImmPredicate_APInt(Predicate, Value)) if (handleReject() == RejectAndGiveUp) return false; break; @@ -221,7 +221,7 @@ bool InstructionSelector::executeMatchTable( assert(Predicate > GIPFP_APFloat_Invalid && "Expected a valid predicate"); APFloat Value = State.MIs[InsnID]->getOperand(1).getFPImm()->getValueAPF(); - if (!MatcherInfo.APFloatImmPredicateFns[Predicate](Value)) + if (!testImmPredicate_APFloat(Predicate, Value)) if (handleReject() == RejectAndGiveUp) return false; break; diff --git a/include/llvm/CodeGen/LiveStackAnalysis.h b/include/llvm/CodeGen/LiveStacks.h similarity index 94% rename from include/llvm/CodeGen/LiveStackAnalysis.h rename to include/llvm/CodeGen/LiveStacks.h index c90ae7b184f4..44ed785f7b53 100644 --- a/include/llvm/CodeGen/LiveStackAnalysis.h +++ b/include/llvm/CodeGen/LiveStacks.h @@ -1,4 +1,4 @@ -//===- LiveStackAnalysis.h - Live Stack Slot Analysis -----------*- C++ -*-===// +//===- LiveStacks.h - Live Stack Slot Analysis ------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_LIVESTACKANALYSIS_H -#define LLVM_CODEGEN_LIVESTACKANALYSIS_H +#ifndef LLVM_CODEGEN_LIVESTACKS_H +#define 
LLVM_CODEGEN_LIVESTACKS_H #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -100,4 +100,4 @@ class LiveStacks : public MachineFunctionPass { } // end namespace llvm -#endif // LLVM_CODEGEN_LIVESTACK_ANALYSIS_H +#endif diff --git a/include/llvm/CodeGen/MachineOperand.h b/include/llvm/CodeGen/MachineOperand.h index ccf0917ed085..4be7942c2c64 100644 --- a/include/llvm/CodeGen/MachineOperand.h +++ b/include/llvm/CodeGen/MachineOperand.h @@ -29,6 +29,7 @@ class GlobalValue; class MachineBasicBlock; class MachineInstr; class MachineRegisterInfo; +class MCCFIInstruction; class MDNode; class ModuleSlotTracker; class TargetMachine; @@ -250,6 +251,12 @@ class MachineOperand { static void printStackObjectReference(raw_ostream &OS, unsigned FrameIndex, bool IsFixed, StringRef Name); + /// Print the offset with explicit +/- signs. + static void printOperandOffset(raw_ostream &OS, int64_t Offset); + + /// Print an IRSlotNumber. + static void printIRSlotNumber(raw_ostream &OS, int Slot); + /// Print the MachineOperand to \p os. /// Providing a valid \p TRI and \p IntrinsicInfo results in a more /// target-specific printing. If \p TRI and \p IntrinsicInfo are null, the diff --git a/include/llvm/CodeGen/RuntimeLibcalls.def b/include/llvm/CodeGen/RuntimeLibcalls.def index e042ae982e86..7695e9d782ef 100644 --- a/include/llvm/CodeGen/RuntimeLibcalls.def +++ b/include/llvm/CodeGen/RuntimeLibcalls.def @@ -165,6 +165,8 @@ HANDLE_LIBCALL(SINCOS_F64, nullptr) HANDLE_LIBCALL(SINCOS_F80, nullptr) HANDLE_LIBCALL(SINCOS_F128, nullptr) HANDLE_LIBCALL(SINCOS_PPCF128, nullptr) +HANDLE_LIBCALL(SINCOS_STRET_F32, nullptr) +HANDLE_LIBCALL(SINCOS_STRET_F64, nullptr) HANDLE_LIBCALL(POW_F32, "powf") HANDLE_LIBCALL(POW_F64, "pow") HANDLE_LIBCALL(POW_F80, "powl") @@ -334,6 +336,7 @@ HANDLE_LIBCALL(O_PPCF128, "__gcc_qunord") HANDLE_LIBCALL(MEMCPY, "memcpy") HANDLE_LIBCALL(MEMMOVE, "memmove") HANDLE_LIBCALL(MEMSET, "memset") +HANDLE_LIBCALL(BZERO, nullptr) // Element-wise unordered-atomic memory of different sizes HANDLE_LIBCALL(MEMCPY_ELEMENT_UNORDERED_ATOMIC_1, "__llvm_memcpy_element_unordered_atomic_1") diff --git a/include/llvm/CodeGen/SDNodeProperties.td b/include/llvm/CodeGen/SDNodeProperties.td new file mode 100644 index 000000000000..83bbab2fdc8d --- /dev/null +++ b/include/llvm/CodeGen/SDNodeProperties.td @@ -0,0 +1,34 @@ +//===- SDNodeProperties.td - Common code for DAG isels ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +class SDNodeProperty; + +// Selection DAG Pattern Operations +class SDPatternOperator { + list Properties = []; +} + +//===----------------------------------------------------------------------===// +// Selection DAG Node Properties. +// +// Note: These are hard coded into tblgen. +// +def SDNPCommutative : SDNodeProperty; // X op Y == Y op X +def SDNPAssociative : SDNodeProperty; // (X op Y) op Z == X op (Y op Z) +def SDNPHasChain : SDNodeProperty; // R/W chain operand and result +def SDNPOutGlue : SDNodeProperty; // Write a flag result +def SDNPInGlue : SDNodeProperty; // Read a flag operand +def SDNPOptInGlue : SDNodeProperty; // Optionally read a flag operand +def SDNPMayStore : SDNodeProperty; // May write to memory, sets 'mayStore'. +def SDNPMayLoad : SDNodeProperty; // May read memory, sets 'mayLoad'. 
+def SDNPSideEffect : SDNodeProperty; // Sets 'HasUnmodelledSideEffects'. +def SDNPMemOperand : SDNodeProperty; // Touches memory, has assoc MemOperand +def SDNPVariadic : SDNodeProperty; // Node has variable arguments. +def SDNPWantRoot : SDNodeProperty; // ComplexPattern gets the root of match +def SDNPWantParent : SDNodeProperty; // ComplexPattern gets the parent diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index 7de2e766d521..522c2f1b2cb2 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -189,8 +189,8 @@ class SDValue { inline bool isUndef() const; inline unsigned getMachineOpcode() const; inline const DebugLoc &getDebugLoc() const; - inline void dump() const; - inline void dumpr() const; + inline void dump(const SelectionDAG *G = nullptr) const; + inline void dumpr(const SelectionDAG *G = nullptr) const; /// Return true if this operand (which must be a chain) reaches the /// specified operand without crossing any side-effecting instructions. @@ -1089,12 +1089,12 @@ inline const DebugLoc &SDValue::getDebugLoc() const { return Node->getDebugLoc(); } -inline void SDValue::dump() const { - return Node->dump(); +inline void SDValue::dump(const SelectionDAG *G) const { + return Node->dump(G); } -inline void SDValue::dumpr() const { - return Node->dumpr(); +inline void SDValue::dumpr(const SelectionDAG *G) const { + return Node->dumpr(G); } // Define inline functions from the SDUse class. diff --git a/include/llvm/CodeGen/TargetLowering.h b/include/llvm/CodeGen/TargetLowering.h index 0fa19d09e776..380e3b19dc80 100644 --- a/include/llvm/CodeGen/TargetLowering.h +++ b/include/llvm/CodeGen/TargetLowering.h @@ -824,8 +824,8 @@ class TargetLoweringBase { /// also combined within this function. Currently, the minimum size check is /// performed in findJumpTable() in SelectionDAGBuiler and /// getEstimatedNumberOfCaseClusters() in BasicTTIImpl. - bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, - uint64_t Range) const { + virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, + uint64_t Range) const { const bool OptForSize = SI->getParent()->getParent()->optForSize(); const unsigned MinDensity = getMinimumJumpTableDensity(OptForSize); const unsigned MaxJumpTableSize = @@ -1276,7 +1276,7 @@ class TargetLoweringBase { } /// Return lower limit for number of blocks in a jump table. - unsigned getMinimumJumpTableEntries() const; + virtual unsigned getMinimumJumpTableEntries() const; /// Return lower limit of the density in a jump table. unsigned getMinimumJumpTableDensity(bool OptForSize) const; @@ -2429,7 +2429,7 @@ class TargetLoweringBase { PromoteToType; /// Stores the name each libcall. - const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL]; + const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1]; /// The ISD::CondCode that should be used to test the result of each of the /// comparison libcall against zero. @@ -2438,6 +2438,9 @@ class TargetLoweringBase { /// Stores the CallingConv that should be used for each libcall. CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL]; + /// Set default libcall names and calling conventions. + void InitLibcalls(const Triple &TT); + protected: /// Return true if the extension represented by \p I is free. 
  /// \pre \p I is a sign, zero, or fp extension and
diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index e9178e03fa8a..3cec58383f87 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -165,6 +165,29 @@ struct BaseAddress {
   uint64_t SectionIndex;
 };
 
+/// Represents a unit's contribution to the string offsets table.
+struct StrOffsetsContributionDescriptor {
+  uint64_t Base = 0;
+  uint64_t Size = 0;
+  /// Format and version.
+  DWARFFormParams FormParams = {0, 0, dwarf::DwarfFormat::DWARF32};
+
+  StrOffsetsContributionDescriptor(uint64_t Base, uint64_t Size,
+                                   uint8_t Version, dwarf::DwarfFormat Format)
+      : Base(Base), Size(Size), FormParams({Version, 0, Format}) {}
+
+  uint8_t getVersion() const { return FormParams.Version; }
+  dwarf::DwarfFormat getFormat() const { return FormParams.Format; }
+  uint8_t getDwarfOffsetByteSize() const {
+    return FormParams.getDwarfOffsetByteSize();
+  }
+  /// Determine whether a contribution to the string offsets table is
+  /// consistent with the relevant section size and that its length is
+  /// a multiple of the size of one of its entries.
+  Optional<StrOffsetsContributionDescriptor>
+  validateContributionSize(DWARFDataExtractor &DA);
+};
+
 class DWARFUnit {
   DWARFContext &Context;
   /// Section containing this DWARFUnit.
@@ -176,7 +199,6 @@ class DWARFUnit {
   const DWARFSection &LineSection;
   StringRef StringSection;
   const DWARFSection &StringOffsetSection;
-  uint64_t StringOffsetSectionBase = 0;
   const DWARFSection *AddrOffsetSection;
   uint32_t AddrOffsetSectionBase = 0;
   bool isLittleEndian;
@@ -185,6 +207,9 @@ class DWARFUnit {
   // Version, address size, and DWARF format.
   DWARFFormParams FormParams;
+  /// Start, length, and DWARF format of the unit's contribution to the string
+  /// offsets table (DWARF v5).
+  Optional<StrOffsetsContributionDescriptor> StringOffsetsTableContribution;
 
   uint32_t Offset;
   uint32_t Length;
@@ -195,10 +220,40 @@ class DWARFUnit {
   /// The compile unit debug information entry items.
   std::vector<DWARFDebugInfoEntry> DieArray;
 
-  /// Map from range's start address to end address and corresponding DIE.
-  /// IntervalMap does not support range removal, as a result, we use the
-  /// std::map::upper_bound for address range lookup.
-  std::map<uint64_t, std::pair<uint64_t, DWARFDie>> AddrDieMap;
+  /// The vector of inlined subroutine DIEs that we can map directly to from
+  /// their subprogram below.
+  std::vector<DWARFDie> InlinedSubroutineDIEs;
+
+  /// A type representing a subprogram DIE and a map (built using a sorted
+  /// vector) into that subprogram's inlined subroutine DIEs.
+  struct SubprogramDIEAddrInfo {
+    DWARFDie SubprogramDIE;
+
+    uint64_t SubprogramBasePC;
+
+    /// A vector sorted to allow mapping from a relative PC to the inlined
+    /// subroutine DIE with the most specific address range covering that PC.
+    ///
+    /// The PCs are relative to the `SubprogramBasePC`.
+    ///
+    /// The vector is sorted in ascending order of the first int which
+    /// represents the relative PC for an interval in the map. The second int
+    /// represents the index into the `InlinedSubroutineDIEs` vector of the DIE
+    /// that interval maps to. An index of '-1' indicates an empty mapping. The
+    /// interval covered is from the `.first` relative PC to the next entry's
+    /// `.first` relative PC.
+    std::vector<std::pair<uint32_t, int32_t>> InlinedSubroutineDIEAddrMap;
+  };
+
+  /// Vector of the subprogram DIEs and their subroutine address maps.
+  std::vector<SubprogramDIEAddrInfo> SubprogramDIEAddrInfos;
+
+  /// A vector sorted to allow mapping from a PC to the subprogram DIE (and
+  /// associated addr map) index.
Subprograms with overlapping PC ranges aren't + /// supported here. Nothing will crash, but the mapping may be inaccurate. + /// This vector may also contain "empty" ranges marked by an address with + /// a DIE index of '-1'. + std::vector> SubprogramDIEAddrMap; using die_iterator_range = iterator_range::iterator>; @@ -219,6 +274,21 @@ class DWARFUnit { /// Size in bytes of the unit header. virtual uint32_t getHeaderSize() const { return getVersion() <= 4 ? 11 : 12; } + /// Find the unit's contribution to the string offsets table and determine its + /// length and form. The given offset is expected to be derived from the unit + /// DIE's DW_AT_str_offsets_base attribute. + Optional + determineStringOffsetsTableContribution(DWARFDataExtractor &DA, + uint64_t Offset); + + /// Find the unit's contribution to the string offsets table and determine its + /// length and form. The given offset is expected to be 0 in a dwo file or, + /// in a dwp file, the start of the unit's contribution to the string offsets + /// table section (as determined by the index table). + Optional + determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA, + uint64_t Offset); + public: DWARFUnit(DWARFContext &Context, const DWARFSection &Section, const DWARFDebugAbbrev *DA, const DWARFSection *RS, StringRef SS, @@ -242,9 +312,6 @@ class DWARFUnit { AddrOffsetSectionBase = Base; } - /// Recursively update address to Die map. - void updateAddressDieMap(DWARFDie Die); - void setRangesSection(const DWARFSection *RS, uint32_t Base) { RangeSection = RS; RangeSectionBase = Base; @@ -272,6 +339,10 @@ class DWARFUnit { uint32_t getNextUnitOffset() const { return Offset + Length + 4; } uint32_t getLength() const { return Length; } + const Optional & + getStringOffsetsTableContribution() const { + return StringOffsetsTableContribution; + } const DWARFFormParams &getFormParams() const { return FormParams; } uint16_t getVersion() const { return FormParams.Version; } dwarf::DwarfFormat getFormat() const { return FormParams.Format; } @@ -281,6 +352,16 @@ class DWARFUnit { return FormParams.getDwarfOffsetByteSize(); } + uint8_t getDwarfStringOffsetsByteSize() const { + assert(StringOffsetsTableContribution); + return StringOffsetsTableContribution->getDwarfOffsetByteSize(); + } + + uint64_t getStringOffsetsBase() const { + assert(StringOffsetsTableContribution); + return StringOffsetsTableContribution->Base; + } + const DWARFAbbreviationDeclarationSet *getAbbreviations() const; uint8_t getUnitType() const { return UnitType; } @@ -426,6 +507,9 @@ class DWARFUnit { /// parseDWO - Parses .dwo file for current compile unit. Returns true if /// it was actually constructed. 
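To make the sorted-vector address maps declared above concrete: a lookup is a binary search for the last entry at or below the query PC, with an index of -1 standing for a gap. A self-contained sketch of that scheme (the names and types here are ours, not the patch's):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Entries are (start PC, DIE index) pairs sorted by PC; each entry covers
// the half-open interval up to the next entry's PC, and -1 marks a gap.
static int32_t lookupDIEIndex(
    const std::vector<std::pair<uint32_t, int32_t>> &AddrMap, uint32_t PC) {
  // Find the first entry strictly above PC; the one before it covers PC.
  auto It = std::upper_bound(
      AddrMap.begin(), AddrMap.end(), PC,
      [](uint32_t LHS, const std::pair<uint32_t, int32_t> &RHS) {
        return LHS < RHS.first;
      });
  if (It == AddrMap.begin())
    return -1; // PC is below every mapped range.
  return std::prev(It)->second;
}

int main() {
  // Ranges: [0x10,0x20) -> DIE 0, [0x20,0x30) -> gap, [0x30,...) -> DIE 1.
  std::vector<std::pair<uint32_t, int32_t>> AddrMap = {
      {0x10, 0}, {0x20, -1}, {0x30, 1}};
  assert(lookupDIEIndex(AddrMap, 0x15) == 0);
  assert(lookupDIEIndex(AddrMap, 0x25) == -1);
  assert(lookupDIEIndex(AddrMap, 0x42) == 1);
  assert(lookupDIEIndex(AddrMap, 0x05) == -1);
  return 0;
}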
bool parseDWO(); + + void buildSubprogramDIEAddrMap(); + void buildInlinedSubroutineDIEAddrMap(SubprogramDIEAddrInfo &SPInfo); }; } // end namespace llvm diff --git a/include/llvm/FuzzMutate/IRMutator.h b/include/llvm/FuzzMutate/IRMutator.h index 65ab871db0ef..9aa9d6d6a4bc 100644 --- a/include/llvm/FuzzMutate/IRMutator.h +++ b/include/llvm/FuzzMutate/IRMutator.h @@ -16,6 +16,7 @@ #ifndef LLVM_FUZZMUTATE_IRMUTATOR_H #define LLVM_FUZZMUTATE_IRMUTATOR_H +#include "llvm/ADT/Optional.h" #include "llvm/FuzzMutate/OpDescriptor.h" #include "llvm/Support/ErrorHandling.h" @@ -74,7 +75,8 @@ class IRMutator { class InjectorIRStrategy : public IRMutationStrategy { std::vector Operations; - fuzzerop::OpDescriptor chooseOperation(Value *Src, RandomIRBuilder &IB); + Optional chooseOperation(Value *Src, + RandomIRBuilder &IB); public: InjectorIRStrategy(std::vector &&Operations) diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h index e811ae5e215a..79c56abe1c37 100644 --- a/include/llvm/IR/Function.h +++ b/include/llvm/IR/Function.h @@ -248,6 +248,12 @@ class Function : public GlobalObject, public ilist_node { /// pgo data. Optional getEntryCount() const; + /// Return true if the function is annotated with profile data. + /// + /// Presence of entry counts from a profile run implies the function has + /// profile annotations. + bool hasProfileData() const { return getEntryCount().hasValue(); } + /// Returns the set of GUIDs that needs to be imported to the function for /// sample PGO, to enable the same inlines as the profiled optimized binary. DenseSet getImportGUIDs() const; diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index 07de0568cab0..a2a1f26292ce 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// include "llvm/CodeGen/ValueTypes.td" +include "llvm/CodeGen/SDNodeProperties.td" //===----------------------------------------------------------------------===// // Properties we keep track of for intrinsics. @@ -264,16 +265,17 @@ def llvm_vararg_ty : LLVMType; // this means vararg here // intrinsic. // * Properties can be set to describe the behavior of the intrinsic. // -class SDPatternOperator; class Intrinsic ret_types, list param_types = [], - list properties = [], - string name = ""> : SDPatternOperator { + list intr_properties = [], + string name = "", + list sd_properties = []> : SDPatternOperator { string LLVMName = name; string TargetPrefix = ""; // Set to a prefix for target-specific intrinsics. list RetTypes = ret_types; list ParamTypes = param_types; - list IntrProperties = properties; + list IntrProperties = intr_properties; + let Properties = sd_properties; bit isTarget = 0; } diff --git a/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h b/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h index 14f0c48266f0..d794535700e5 100644 --- a/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h +++ b/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h @@ -148,10 +148,14 @@ class ThinLTOCodeGenerator { /// incremental build. void setCacheDir(std::string Path) { CacheOptions.Path = std::move(Path); } - /// Cache policy: interval (seconds) between two prune of the cache. Set to a - /// negative value (default) to disable pruning. A value of 0 will be ignored. + /// Cache policy: interval (seconds) between two prunes of the cache. Set to a + /// negative value to disable pruning. A value of 0 will be ignored. 
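The setter that follows now distinguishes three cases. A usage sketch against the legacy ThinLTO API (the cache path is a made-up example):

#include "llvm/LTO/legacy/ThinLTOCodeGenerator.h"

int main() {
  llvm::ThinLTOCodeGenerator Gen;
  Gen.setCacheDir("/tmp/thinlto.cache"); // hypothetical cache location
  Gen.setCachePruningInterval(1200);     // prune at most every 20 minutes
  Gen.setCachePruningInterval(0);        // zero is ignored; 1200 is kept
  Gen.setCachePruningInterval(-1);       // negative: pruning disabled
  return 0;
}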
void setCachePruningInterval(int Interval) { - if (Interval) + if (Interval == 0) + return; + if (Interval < 0) + CacheOptions.Policy.Interval.reset(); + else CacheOptions.Policy.Interval = std::chrono::seconds(Interval); } diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h index 234762f36dd4..c538c46fc072 100644 --- a/include/llvm/MC/MCAsmInfo.h +++ b/include/llvm/MC/MCAsmInfo.h @@ -165,7 +165,8 @@ class MCAsmInfo { const char *ZeroDirective; /// This directive allows emission of an ascii string with the standard C - /// escape characters embedded into it. Defaults to "\t.ascii\t" + /// escape characters embedded into it. If a target doesn't support this, it + /// can be set to null. Defaults to "\t.ascii\t" const char *AsciiDirective; /// If not null, this allows for special handling of zero terminated strings diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h index 481d96724d40..a82051700708 100644 --- a/include/llvm/MC/MCStreamer.h +++ b/include/llvm/MC/MCStreamer.h @@ -95,6 +95,17 @@ class MCTargetStreamer { virtual void prettyPrintAsm(MCInstPrinter &InstPrinter, raw_ostream &OS, const MCInst &Inst, const MCSubtargetInfo &STI); + virtual void emitDwarfFileDirective(StringRef Directive); + + /// Update streamer for a new active section. + /// + /// This is called by PopSection and SwitchSection, if the current + /// section changes. + virtual void changeSection(const MCSection *CurSection, MCSection *Section, + const MCExpr *SubSection, raw_ostream &OS); + + virtual void emitValue(const MCExpr *Value); + virtual void finish(); }; diff --git a/include/llvm/Object/Wasm.h b/include/llvm/Object/Wasm.h index 5bb1a3fca3d1..71951d83f3cc 100644 --- a/include/llvm/Object/Wasm.h +++ b/include/llvm/Object/Wasm.h @@ -43,9 +43,9 @@ class WasmSymbol { }; WasmSymbol(StringRef Name, SymbolType Type, uint32_t Section, - uint32_t ElementIndex, uint32_t ImportIndex = 0) + uint32_t ElementIndex, uint32_t FunctionType = 0) : Name(Name), Type(Type), Section(Section), ElementIndex(ElementIndex), - ImportIndex(ImportIndex) {} + FunctionType(FunctionType) {} StringRef Name; SymbolType Type; @@ -55,8 +55,18 @@ class WasmSymbol { // Index into either the function or global index space. uint32_t ElementIndex; - // For imports, the index into the import table - uint32_t ImportIndex; + // For functions, the type index + uint32_t FunctionType; + + // Symbols can be both exported and imported (in the case of the weakly + // defined symbol). In this case the import index is stored as AltIndex. + uint32_t AltIndex = 0; + bool HasAltIndex = false; + + void setAltIndex(uint32_t Index) { + HasAltIndex = true; + AltIndex = Index; + } bool isFunction() const { return Type == WasmSymbol::SymbolType::FUNCTION_IMPORT || @@ -91,8 +101,7 @@ class WasmSymbol { void print(raw_ostream &Out) const { Out << "Name=" << Name << ", Type=" << static_cast(Type) - << ", Flags=" << Flags << " ElemIndex=" << ElementIndex - << ", ImportIndex=" << ImportIndex; + << ", Flags=" << Flags << " ElemIndex=" << ElementIndex; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/include/llvm/Support/CachePruning.h b/include/llvm/Support/CachePruning.h index c577e9b8b631..327c7df4570f 100644 --- a/include/llvm/Support/CachePruning.h +++ b/include/llvm/Support/CachePruning.h @@ -27,8 +27,9 @@ template <typename T> class Expected; struct CachePruningPolicy { /// The pruning interval. This is intended to be used to avoid scanning the /// directory too often. It does not impact the decision of which file to - /// prune.
A value of 0 forces the scan to occur. - std::chrono::seconds Interval = std::chrono::seconds(1200); + /// prune. A value of 0 forces the scan to occur. A value of None disables + /// pruning. + llvm::Optional<std::chrono::seconds> Interval = std::chrono::seconds(1200); /// The expiration for a file. When a file hasn't been accessed for Expiration /// seconds, it is removed from the cache. A value of 0 disables the diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h index 59c93f15d7b8..7b849fdb8670 100644 --- a/include/llvm/Support/MemoryBuffer.h +++ b/include/llvm/Support/MemoryBuffer.h @@ -15,6 +15,7 @@ #define LLVM_SUPPORT_MEMORYBUFFER_H #include "llvm-c/Types.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/CBindingWrapping.h" @@ -47,6 +48,9 @@ class MemoryBuffer { void init(const char *BufStart, const char *BufEnd, bool RequiresNullTerminator); + + static constexpr bool Writable = false; + public: MemoryBuffer(const MemoryBuffer &) = delete; MemoryBuffer &operator=(const MemoryBuffer &) = delete; @@ -119,12 +123,6 @@ class MemoryBuffer { static std::unique_ptr<MemoryBuffer> getNewMemBuffer(size_t Size, StringRef BufferName = ""); - /// Allocate a new MemoryBuffer of the specified size that is not initialized. - /// Note that the caller should initialize the memory allocated by this - /// method. The memory is owned by the MemoryBuffer object. - static std::unique_ptr<MemoryBuffer> - getNewUninitMemBuffer(size_t Size, const Twine &BufferName = ""); - /// Read all of stdin into a file buffer, and return it. static ErrorOr<std::unique_ptr<MemoryBuffer>> getSTDIN(); @@ -156,6 +154,62 @@ class MemoryBuffer { MemoryBufferRef getMemBufferRef() const; }; +/// This class is an extension of MemoryBuffer, which allows writing to the +/// underlying contents. It only supports creation methods that are guaranteed +/// to produce a writable buffer. For example, mapping a file read-only is not +/// supported. +class WritableMemoryBuffer : public MemoryBuffer { +protected: + WritableMemoryBuffer() = default; + + static constexpr bool Writable = true; + +public: + using MemoryBuffer::getBuffer; + using MemoryBuffer::getBufferEnd; + using MemoryBuffer::getBufferStart; + + // const_cast is well-defined here, because the underlying buffer is + // guaranteed to have been initialized with a mutable buffer. + char *getBufferStart() { + return const_cast<char *>(MemoryBuffer::getBufferStart()); + } + char *getBufferEnd() { + return const_cast<char *>(MemoryBuffer::getBufferEnd()); + } + MutableArrayRef<char> getBuffer() { + return {getBufferStart(), getBufferEnd()}; + } + + static ErrorOr<std::unique_ptr<WritableMemoryBuffer>> + getFile(const Twine &Filename, int64_t FileSize = -1, + bool IsVolatile = false); + + /// Map a subrange of the specified file as a WritableMemoryBuffer. + static ErrorOr<std::unique_ptr<WritableMemoryBuffer>> + getFileSlice(const Twine &Filename, uint64_t MapSize, uint64_t Offset, + bool IsVolatile = false); + + /// Allocate a new MemoryBuffer of the specified size that is not initialized. + /// Note that the caller should initialize the memory allocated by this + /// method. The memory is owned by the MemoryBuffer object. + static std::unique_ptr<WritableMemoryBuffer> + getNewUninitMemBuffer(size_t Size, const Twine &BufferName = ""); + +private: + // Hide these base class factory functions so one can't write + // WritableMemoryBuffer::getXXX() + // and be surprised that they got a read-only buffer.
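A short usage sketch (ours, not from the patch): the writable factories hand back a WritableMemoryBuffer whose non-const accessors make the initialize-before-use contract explicit, and the result still converts to a plain MemoryBuffer for read-only consumers:

#include "llvm/Support/MemoryBuffer.h"

#include <cstring>
#include <memory>

// Fill a fresh, uninitialized buffer, then hand it out read-only.
std::unique_ptr<llvm::MemoryBuffer> makeGreeting() {
  auto Buf = llvm::WritableMemoryBuffer::getNewUninitMemBuffer(6, "greeting");
  std::memcpy(Buf->getBufferStart(), "hello\n", 6); // caller must initialize
  return std::move(Buf); // upcast: a WritableMemoryBuffer is-a MemoryBuffer
}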
+ using MemoryBuffer::getFileAsStream; + using MemoryBuffer::getFileOrSTDIN; + using MemoryBuffer::getMemBuffer; + using MemoryBuffer::getMemBufferCopy; + using MemoryBuffer::getNewMemBuffer; + using MemoryBuffer::getOpenFile; + using MemoryBuffer::getOpenFileSlice; + using MemoryBuffer::getSTDIN; +}; + class MemoryBufferRef { StringRef Buffer; StringRef Identifier; diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h index 83b097a199d6..674c78a11695 100644 --- a/include/llvm/Support/YAMLTraits.h +++ b/include/llvm/Support/YAMLTraits.h @@ -549,9 +549,9 @@ inline QuotingType needsQuotes(StringRef S) { // range. if (C <= 0x1F) return QuotingType::Double; - // C1 control block (0x80 - 0x9F) is excluded from the allowed character - // range. - if (C >= 0x80 && C <= 0x9F) + + // Always double quote UTF-8. + if ((C & 0x80) != 0) return QuotingType::Double; // The character is not safe, at least simple quoting needed. @@ -1725,7 +1725,7 @@ template struct StdMapStringCustomMappingTraitsImpl { template <> struct ScalarTraits { \ static void output(const Type &Value, void *ctx, raw_ostream &Out); \ static StringRef input(StringRef Scalar, void *ctxt, Type &Value); \ - static QuotingType mustQuote(StringRef) { return MustQuote; } \ + static QuotingType mustQuote(StringRef) { return MustQuote; } \ }; \ } \ } diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index 5421b22462ae..97442f9a7849 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -24,6 +24,7 @@ namespace llvm { +class Function; class GlobalValue; class MachineModuleInfo; class Mangler; @@ -38,6 +39,7 @@ class PassManagerBuilder; class Target; class TargetIntrinsicInfo; class TargetIRAnalysis; +class TargetTransformInfo; class TargetLoweringObjectFile; class TargetPassConfig; class TargetSubtargetInfo; @@ -204,7 +206,13 @@ class TargetMachine { /// This is used to construct the new pass manager's target IR analysis pass, /// set up appropriately for this target machine. Even the old pass manager /// uses this to answer queries about the IR. - virtual TargetIRAnalysis getTargetIRAnalysis(); + TargetIRAnalysis getTargetIRAnalysis(); + + /// \brief Return a TargetTransformInfo for a given function. + /// + /// The returned TargetTransformInfo is specialized to the subtarget + /// corresponding to \p F. + virtual TargetTransformInfo getTargetTransformInfo(const Function &F); /// Allow the target to modify the pass manager, e.g. by calling /// PassManagerBuilder::addExtension. @@ -280,11 +288,11 @@ class LLVMTargetMachine : public TargetMachine { void initAsmInfo(); public: - /// \brief Get a TargetIRAnalysis implementation for the target. + /// \brief Get a TargetTransformInfo implementation for the target. /// - /// This analysis will produce a TTI result which uses the common code - /// generator to answer queries about the IR. - TargetIRAnalysis getTargetIRAnalysis() override; + /// The TTI returned uses the common code generator to answer queries about + /// the IR. + TargetTransformInfo getTargetTransformInfo(const Function &F) override; /// Create a pass configuration object to be used by addPassToEmitX methods /// for generating a pipeline of CodeGen passes. 
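With getTargetTransformInfo(F) virtual on TargetMachine, a client holding a configured TargetMachine can ask for per-function TTI directly rather than going through the analysis pass. A minimal sketch; the register query is an arbitrary example:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Target/TargetMachine.h"

// Number of scalar registers for F's subtarget; any TTI query works the
// same way.
unsigned scalarRegCount(llvm::TargetMachine &TM, const llvm::Function &F) {
  llvm::TargetTransformInfo TTI = TM.getTargetTransformInfo(F);
  return TTI.getNumberOfRegisters(/*Vector=*/false);
}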
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td index 06caa21d288c..f6162377b8b7 100644 --- a/include/llvm/Target/TargetSelectionDAG.td +++ b/include/llvm/Target/TargetSelectionDAG.td @@ -285,32 +285,6 @@ class SDCallSeqStart constraints> : class SDCallSeqEnd constraints> : SDTypeProfile<0, 2, constraints>; -//===----------------------------------------------------------------------===// -// Selection DAG Node Properties. -// -// Note: These are hard coded into tblgen. -// -class SDNodeProperty; -def SDNPCommutative : SDNodeProperty; // X op Y == Y op X -def SDNPAssociative : SDNodeProperty; // (X op Y) op Z == X op (Y op Z) -def SDNPHasChain : SDNodeProperty; // R/W chain operand and result -def SDNPOutGlue : SDNodeProperty; // Write a flag result -def SDNPInGlue : SDNodeProperty; // Read a flag operand -def SDNPOptInGlue : SDNodeProperty; // Optionally read a flag operand -def SDNPMayStore : SDNodeProperty; // May write to memory, sets 'mayStore'. -def SDNPMayLoad : SDNodeProperty; // May read memory, sets 'mayLoad'. -def SDNPSideEffect : SDNodeProperty; // Sets 'HasUnmodelledSideEffects'. -def SDNPMemOperand : SDNodeProperty; // Touches memory, has assoc MemOperand -def SDNPVariadic : SDNodeProperty; // Node has variable arguments. -def SDNPWantRoot : SDNodeProperty; // ComplexPattern gets the root of match -def SDNPWantParent : SDNodeProperty; // ComplexPattern gets the parent - -//===----------------------------------------------------------------------===// -// Selection DAG Pattern Operations -class SDPatternOperator { - list Properties = []; -} - //===----------------------------------------------------------------------===// // Selection DAG Node definitions. // diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h index cd6b770f76ac..b1e13f17aef1 100644 --- a/include/llvm/Transforms/Instrumentation.h +++ b/include/llvm/Transforms/Instrumentation.h @@ -133,7 +133,7 @@ ModulePass *createAddressSanitizerModulePass(bool CompileKernel = false, FunctionPass *createMemorySanitizerPass(int TrackOrigins = 0, bool Recover = false); -FunctionPass *createHWAddressSanitizerPass(); +FunctionPass *createHWAddressSanitizerPass(bool Recover = false); // Insert ThreadSanitizer (race detection) instrumentation FunctionPass *createThreadSanitizerPass(); diff --git a/include/llvm/Transforms/Utils/CallPromotionUtils.h b/include/llvm/Transforms/Utils/CallPromotionUtils.h index e0bf85781d81..6e8ece723638 100644 --- a/include/llvm/Transforms/Utils/CallPromotionUtils.h +++ b/include/llvm/Transforms/Utils/CallPromotionUtils.h @@ -29,13 +29,23 @@ namespace llvm { bool isLegalToPromote(CallSite CS, Function *Callee, const char **FailureReason = nullptr); +/// Promote the given indirect call site to unconditionally call \p Callee. +/// +/// This function promotes the given call site, returning the direct call or +/// invoke instruction. If the function type of the call site doesn't match that +/// of the callee, bitcast instructions are inserted where appropriate. If \p +/// RetBitCast is non-null, it will be used to store the return value bitcast, +/// if created. +Instruction *promoteCall(CallSite CS, Function *Callee, + CastInst **RetBitCast = nullptr); + /// Promote the given indirect call site to conditionally call \p Callee. /// /// This function creates an if-then-else structure at the location of the call -/// site. The original call site is promoted and moved into the "then" block. 
A -/// clone of the indirect call site is placed in the "else" block and returned. -/// If \p BranchWeights is non-null, it will be used to set !prof metadata on -/// the new conditional branch. +/// site. The original call site is moved into the "else" block. A clone of the +/// indirect call site is promoted, placed in the "then" block, and returned. If +/// \p BranchWeights is non-null, it will be used to set !prof metadata on the +/// new conditional branch. Instruction *promoteCallWithIfThenElse(CallSite CS, Function *Callee, MDNode *BranchWeights = nullptr); diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap index 382942be64a1..d8b07c4f54da 100644 --- a/include/llvm/module.modulemap +++ b/include/llvm/module.modulemap @@ -61,7 +61,7 @@ module LLVM_BinaryFormat { textual header "BinaryFormat/ELFRelocs/SystemZ.def" textual header "BinaryFormat/ELFRelocs/x86_64.def" textual header "BinaryFormat/ELFRelocs/WebAssembly.def" - textual header "BinaryFormat/WasmRelocs/WebAssembly.def" + textual header "BinaryFormat/WasmRelocs.def" } module LLVM_Config { requires cplusplus umbrella "Config" module * { export * } } diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp index dd2db1e5b27b..55df66714178 100644 --- a/lib/Analysis/AliasAnalysis.cpp +++ b/lib/Analysis/AliasAnalysis.cpp @@ -133,9 +133,9 @@ ModRefInfo AAResults::getArgModRefInfo(ImmutableCallSite CS, unsigned ArgIdx) { } ModRefInfo AAResults::getModRefInfo(Instruction *I, ImmutableCallSite Call) { - // We may have two calls + // We may have two calls. if (auto CS = ImmutableCallSite(I)) { - // Check if the two calls modify the same memory + // Check if the two calls modify the same memory. return getModRefInfo(CS, Call); } else if (I->isFenceLike()) { // If this is a fence, just return ModRef. @@ -179,6 +179,7 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS, if (onlyAccessesArgPointees(MRB) || onlyAccessesInaccessibleOrArgMem(MRB)) { bool DoesAlias = false; + bool IsMustAlias = true; ModRefInfo AllArgsMask = ModRefInfo::NoModRef; if (doesAccessArgPointees(MRB)) { for (auto AI = CS.arg_begin(), AE = CS.arg_end(); AI != AE; ++AI) { @@ -193,6 +194,8 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS, DoesAlias = true; AllArgsMask = unionModRef(AllArgsMask, ArgMask); } + // Conservatively clear IsMustAlias unless only MustAlias is found. + IsMustAlias &= (ArgAlias == MustAlias); } } // Return NoModRef if no alias found with any argument. @@ -200,6 +203,8 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS, return ModRefInfo::NoModRef; // Logical & between other AA analyses and argument analysis. Result = intersectModRef(Result, AllArgsMask); + // If only MustAlias found above, set Must bit. + Result = IsMustAlias ? setMust(Result) : clearMust(Result); } // If Loc is a constant memory location, the call definitely could not @@ -251,6 +256,7 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1, if (onlyAccessesArgPointees(CS2B)) { ModRefInfo R = ModRefInfo::NoModRef; if (doesAccessArgPointees(CS2B)) { + bool IsMustAlias = true; for (auto I = CS2.arg_begin(), E = CS2.arg_end(); I != E; ++I) { const Value *Arg = *I; if (!Arg->getType()->isPointerTy()) @@ -274,10 +280,19 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1, ModRefInfo ModRefCS1 = getModRefInfo(CS1, CS2ArgLoc); ArgMask = intersectModRef(ArgMask, ModRefCS1); + // Conservatively clear IsMustAlias unless only MustAlias is found. 
+ IsMustAlias &= isMustSet(ModRefCS1); + R = intersectModRef(unionModRef(R, ArgMask), Result); - if (R == Result) + if (R == Result) { + // On early exit, not all args were checked, cannot set Must. + if (I + 1 != E) + IsMustAlias = false; break; + } } + // If Alias found and only MustAlias found above, set Must bit. + R = IsMustAlias ? setMust(R) : clearMust(R); } return R; } @@ -287,6 +302,7 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1, if (onlyAccessesArgPointees(CS1B)) { ModRefInfo R = ModRefInfo::NoModRef; if (doesAccessArgPointees(CS1B)) { + bool IsMustAlias = true; for (auto I = CS1.arg_begin(), E = CS1.arg_end(); I != E; ++I) { const Value *Arg = *I; if (!Arg->getType()->isPointerTy()) @@ -303,9 +319,18 @@ ModRefInfo AAResults::getModRefInfo(ImmutableCallSite CS1, (isRefSet(ArgModRefCS1) && isModSet(ModRefCS2))) R = intersectModRef(unionModRef(R, ArgModRefCS1), Result); - if (R == Result) + // Conservatively clear IsMustAlias unless only MustAlias is found. + IsMustAlias &= isMustSet(ModRefCS2); + + if (R == Result) { + // On early exit, not all args were checked, cannot set Must. + if (I + 1 != E) + IsMustAlias = false; break; + } } + // If Alias found and only MustAlias found above, set Must bit. + R = IsMustAlias ? setMust(R) : clearMust(R); } return R; } @@ -353,9 +378,13 @@ ModRefInfo AAResults::getModRefInfo(const LoadInst *L, // If the load address doesn't alias the given address, it doesn't read // or write the specified memory. - if (Loc.Ptr && !alias(MemoryLocation::get(L), Loc)) - return ModRefInfo::NoModRef; - + if (Loc.Ptr) { + AliasResult AR = alias(MemoryLocation::get(L), Loc); + if (AR == NoAlias) + return ModRefInfo::NoModRef; + if (AR == MustAlias) + return ModRefInfo::MustRef; + } // Otherwise, a load just reads. return ModRefInfo::Ref; } @@ -367,15 +396,20 @@ ModRefInfo AAResults::getModRefInfo(const StoreInst *S, return ModRefInfo::ModRef; if (Loc.Ptr) { + AliasResult AR = alias(MemoryLocation::get(S), Loc); // If the store address cannot alias the pointer in question, then the // specified memory cannot be modified by the store. - if (!alias(MemoryLocation::get(S), Loc)) + if (AR == NoAlias) return ModRefInfo::NoModRef; // If the pointer is a pointer to constant memory, then it could not have // been modified by this store. if (pointsToConstantMemory(Loc)) return ModRefInfo::NoModRef; + + // If the store address aliases the pointer as must alias, set Must. + if (AR == MustAlias) + return ModRefInfo::MustMod; } // Otherwise, a store just writes. @@ -393,15 +427,20 @@ ModRefInfo AAResults::getModRefInfo(const FenceInst *S, const MemoryLocation &Lo ModRefInfo AAResults::getModRefInfo(const VAArgInst *V, const MemoryLocation &Loc) { if (Loc.Ptr) { + AliasResult AR = alias(MemoryLocation::get(V), Loc); // If the va_arg address cannot alias the pointer in question, then the // specified memory cannot be accessed by the va_arg. - if (!alias(MemoryLocation::get(V), Loc)) + if (AR == NoAlias) return ModRefInfo::NoModRef; // If the pointer is a pointer to constant memory, then it could not have // been modified by this va_arg. if (pointsToConstantMemory(Loc)) return ModRefInfo::NoModRef; + + // If the va_arg aliases the pointer as must alias, set Must. + if (AR == MustAlias) + return ModRefInfo::MustModRef; } // Otherwise, a va_arg reads and writes. 
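The Must bit rides along with the Mod/Ref bits rather than replacing them, so existing isModSet()/isRefSet() callers are unaffected and interested callers opt in. A sketch of consuming it (our helper, using the same isMustSet() this patch uses):

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/IR/Instructions.h"

// True only when the store is known to write Loc (must-alias), not merely
// allowed to (may-alias); plain isModSet() accepts both.
bool definitelyWrites(llvm::AAResults &AA, const llvm::StoreInst &SI,
                      const llvm::MemoryLocation &Loc) {
  llvm::ModRefInfo MRI = AA.getModRefInfo(&SI, Loc);
  return llvm::isModSet(MRI) && llvm::isMustSet(MRI);
}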
@@ -440,9 +479,17 @@ ModRefInfo AAResults::getModRefInfo(const AtomicCmpXchgInst *CX, if (isStrongerThanMonotonic(CX->getSuccessOrdering())) return ModRefInfo::ModRef; - // If the cmpxchg address does not alias the location, it does not access it. - if (Loc.Ptr && !alias(MemoryLocation::get(CX), Loc)) - return ModRefInfo::NoModRef; + if (Loc.Ptr) { + AliasResult AR = alias(MemoryLocation::get(CX), Loc); + // If the cmpxchg address does not alias the location, it does not access + // it. + if (AR == NoAlias) + return ModRefInfo::NoModRef; + + // If the cmpxchg address aliases the pointer as must alias, set Must. + if (AR == MustAlias) + return ModRefInfo::MustModRef; + } return ModRefInfo::ModRef; } @@ -453,9 +500,17 @@ ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW, if (isStrongerThanMonotonic(RMW->getOrdering())) return ModRefInfo::ModRef; - // If the atomicrmw address does not alias the location, it does not access it. - if (Loc.Ptr && !alias(MemoryLocation::get(RMW), Loc)) - return ModRefInfo::NoModRef; + if (Loc.Ptr) { + AliasResult AR = alias(MemoryLocation::get(RMW), Loc); + // If the atomicrmw address does not alias the location, it does not access + // it. + if (AR == NoAlias) + return ModRefInfo::NoModRef; + + // If the atomicrmw address aliases the pointer as must alias, set Must. + if (AR == MustAlias) + return ModRefInfo::MustModRef; + } return ModRefInfo::ModRef; } @@ -493,6 +548,8 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I, unsigned ArgNo = 0; ModRefInfo R = ModRefInfo::NoModRef; + bool MustAlias = true; + // Set flag only if no May found and all operands processed. for (auto CI = CS.data_operands_begin(), CE = CS.data_operands_end(); CI != CE; ++CI, ++ArgNo) { // Only look at the no-capture or byval pointer arguments. If this @@ -503,11 +560,14 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I, ArgNo < CS.getNumArgOperands() && !CS.isByValArgument(ArgNo))) continue; + AliasResult AR = alias(MemoryLocation(*CI), MemoryLocation(Object)); // If this is a no-capture pointer argument, see if we can tell that it // is impossible to alias the pointer we're checking. If not, we have to // assume that the call could touch the pointer, even though it doesn't // escape. - if (isNoAlias(MemoryLocation(*CI), MemoryLocation(Object))) + if (AR != MustAlias) + MustAlias = false; + if (AR == NoAlias) continue; if (CS.doesNotAccessMemory(ArgNo)) continue; @@ -515,9 +575,10 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I, R = ModRefInfo::Ref; continue; } + // Not returning MustModRef since we have not seen all the arguments. return ModRefInfo::ModRef; } - return R; + return MustAlias ? 
setMust(R) : clearMust(R); } /// canBasicBlockModify - Return true if it is possible for execution of the diff --git a/lib/Analysis/AliasAnalysisEvaluator.cpp b/lib/Analysis/AliasAnalysisEvaluator.cpp index 423acf739f58..f737cecc43d1 100644 --- a/lib/Analysis/AliasAnalysisEvaluator.cpp +++ b/lib/Analysis/AliasAnalysisEvaluator.cpp @@ -31,9 +31,13 @@ static cl::opt PrintPartialAlias("print-partial-aliases", cl::ReallyHidden static cl::opt PrintMustAlias("print-must-aliases", cl::ReallyHidden); static cl::opt PrintNoModRef("print-no-modref", cl::ReallyHidden); -static cl::opt PrintMod("print-mod", cl::ReallyHidden); static cl::opt PrintRef("print-ref", cl::ReallyHidden); +static cl::opt PrintMod("print-mod", cl::ReallyHidden); static cl::opt PrintModRef("print-modref", cl::ReallyHidden); +static cl::opt PrintMust("print-must", cl::ReallyHidden); +static cl::opt PrintMustRef("print-mustref", cl::ReallyHidden); +static cl::opt PrintMustMod("print-mustmod", cl::ReallyHidden); +static cl::opt PrintMustModRef("print-mustmodref", cl::ReallyHidden); static cl::opt EvalAAMD("evaluate-aa-metadata", cl::ReallyHidden); @@ -262,6 +266,25 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) { F.getParent()); ++ModRefCount; break; + case ModRefInfo::Must: + PrintModRefResults("Must", PrintMust, I, Pointer, F.getParent()); + ++MustCount; + break; + case ModRefInfo::MustMod: + PrintModRefResults("Just Mod (MustAlias)", PrintMustMod, I, Pointer, + F.getParent()); + ++MustModCount; + break; + case ModRefInfo::MustRef: + PrintModRefResults("Just Ref (MustAlias)", PrintMustRef, I, Pointer, + F.getParent()); + ++MustRefCount; + break; + case ModRefInfo::MustModRef: + PrintModRefResults("Both ModRef (MustAlias)", PrintMustModRef, I, + Pointer, F.getParent()); + ++MustModRefCount; + break; } } } @@ -288,6 +311,25 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) { PrintModRefResults("Both ModRef", PrintModRef, *C, *D, F.getParent()); ++ModRefCount; break; + case ModRefInfo::Must: + PrintModRefResults("Must", PrintMust, *C, *D, F.getParent()); + ++MustCount; + break; + case ModRefInfo::MustMod: + PrintModRefResults("Just Mod (MustAlias)", PrintMustMod, *C, *D, + F.getParent()); + ++MustModCount; + break; + case ModRefInfo::MustRef: + PrintModRefResults("Just Ref (MustAlias)", PrintMustRef, *C, *D, + F.getParent()); + ++MustRefCount; + break; + case ModRefInfo::MustModRef: + PrintModRefResults("Both ModRef (MustAlias)", PrintMustModRef, *C, *D, + F.getParent()); + ++MustModRefCount; + break; } } } @@ -325,7 +367,8 @@ AAEvaluator::~AAEvaluator() { } // Display the summary for mod/ref analysis - int64_t ModRefSum = NoModRefCount + ModCount + RefCount + ModRefCount; + int64_t ModRefSum = NoModRefCount + RefCount + ModCount + ModRefCount + + MustCount + MustRefCount + MustModCount + MustModRefCount; if (ModRefSum == 0) { errs() << " Alias Analysis Mod/Ref Evaluator Summary: no " "mod/ref!\n"; @@ -339,10 +382,22 @@ AAEvaluator::~AAEvaluator() { PrintPercent(RefCount, ModRefSum); errs() << " " << ModRefCount << " mod & ref responses "; PrintPercent(ModRefCount, ModRefSum); + errs() << " " << MustCount << " must responses "; + PrintPercent(MustCount, ModRefSum); + errs() << " " << MustModCount << " must mod responses "; + PrintPercent(MustModCount, ModRefSum); + errs() << " " << MustRefCount << " must ref responses "; + PrintPercent(MustRefCount, ModRefSum); + errs() << " " << MustModRefCount << " must mod & ref responses "; + PrintPercent(MustModRefCount, ModRefSum); errs() << " Alias Analysis Evaluator 
Mod/Ref Summary: " << NoModRefCount * 100 / ModRefSum << "%/" << ModCount * 100 / ModRefSum << "%/" << RefCount * 100 / ModRefSum - << "%/" << ModRefCount * 100 / ModRefSum << "%\n"; + << "%/" << ModRefCount * 100 / ModRefSum << "%/" + << MustCount * 100 / ModRefSum << "%/" + << MustRefCount * 100 / ModRefSum << "%/" + << MustModCount * 100 / ModRefSum << "%/" + << MustModRefCount * 100 / ModRefSum << "%\n"; } } diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index 81b9f842249e..537813b6b752 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -781,6 +781,7 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS, // Optimistically assume that call doesn't touch Object and check this // assumption in the following loop. ModRefInfo Result = ModRefInfo::NoModRef; + bool IsMustAlias = true; unsigned OperandNo = 0; for (auto CI = CS.data_operands_begin(), CE = CS.data_operands_end(); @@ -802,7 +803,8 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS, // is impossible to alias the pointer we're checking. AliasResult AR = getBestAAResults().alias(MemoryLocation(*CI), MemoryLocation(Object)); - + if (AR != MustAlias) + IsMustAlias = false; // Operand doesn't alias 'Object', continue looking for other aliases if (AR == NoAlias) continue; @@ -818,13 +820,20 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS, continue; } // This operand aliases 'Object' and call reads and writes into it. + // Setting ModRef will not yield an early return below, MustAlias is not + // used further. Result = ModRefInfo::ModRef; break; } + // No operand aliases, reset Must bit. Add below if at least one aliases + // and all aliases found are MustAlias. + if (isNoModRef(Result)) + IsMustAlias = false; + // Early return if we improved mod ref information if (!isModAndRefSet(Result)) - return Result; + return IsMustAlias ? setMust(Result) : clearMust(Result); } // If the CallSite is to malloc or calloc, we can assume that it doesn't diff --git a/lib/Analysis/CFGPrinter.cpp b/lib/Analysis/CFGPrinter.cpp index a85af6c9c93f..fb261755e5d1 100644 --- a/lib/Analysis/CFGPrinter.cpp +++ b/lib/Analysis/CFGPrinter.cpp @@ -82,7 +82,7 @@ PreservedAnalyses CFGOnlyViewerPass::run(Function &F, return PreservedAnalyses::all(); } -static void writeCFGToDotFile(Function &F) { +static void writeCFGToDotFile(Function &F, bool CFGOnly = false) { std::string Filename = ("cfg.
+ F.getName() + ".dot").str(); errs() << "Writing '" << Filename << "'..."; @@ -90,7 +90,7 @@ static void writeCFGToDotFile(Function &F) { raw_fd_ostream File(Filename, EC, sys::fs::F_Text); if (!EC) - WriteGraph(File, (const Function*)&F); + WriteGraph(File, (const Function*)&F, CFGOnly); else errs() << " error opening file for writing!"; errs() << "\n"; @@ -134,7 +134,7 @@ namespace { } bool runOnFunction(Function &F) override { - writeCFGToDotFile(F); + writeCFGToDotFile(F, /*CFGOnly=*/true); return false; } void print(raw_ostream &OS, const Module* = nullptr) const override {} @@ -152,7 +152,7 @@ INITIALIZE_PASS(CFGOnlyPrinterLegacyPass, "dot-cfg-only", PreservedAnalyses CFGOnlyPrinterPass::run(Function &F, FunctionAnalysisManager &AM) { - writeCFGToDotFile(F); + writeCFGToDotFile(F, /*CFGOnly=*/true); return PreservedAnalyses::all(); } diff --git a/lib/Analysis/GlobalsModRef.cpp b/lib/Analysis/GlobalsModRef.cpp index 23109c67e5c3..daee93267f56 100644 --- a/lib/Analysis/GlobalsModRef.cpp +++ b/lib/Analysis/GlobalsModRef.cpp @@ -85,12 +85,17 @@ class GlobalsAAResult::FunctionInfo { /// The bit that flags that this function may read any global. This is /// chosen to mix together with ModRefInfo bits. /// FIXME: This assumes ModRefInfo lattice will remain 4 bits! + /// It overlaps with ModRefInfo::Must bit! + /// FunctionInfo.getModRefInfo() masks out everything except ModRef so + /// this remains correct, but the Must info is lost. enum { MayReadAnyGlobal = 4 }; /// Checks to document the invariants of the bit packing here. - static_assert((MayReadAnyGlobal & static_cast(ModRefInfo::ModRef)) == 0, + static_assert((MayReadAnyGlobal & static_cast(ModRefInfo::MustModRef)) == + 0, "ModRef and the MayReadAnyGlobal flag bits overlap."); - static_assert(((MayReadAnyGlobal | static_cast(ModRefInfo::ModRef)) >> + static_assert(((MayReadAnyGlobal | + static_cast(ModRefInfo::MustModRef)) >> AlignedMapPointerTraits::NumLowBitsAvailable) == 0, "Insufficient low bits to store our flag and ModRef info."); @@ -125,14 +130,22 @@ class GlobalsAAResult::FunctionInfo { return *this; } + /// This method clears MayReadAnyGlobal bit added by GlobalsAAResult to return + /// the corresponding ModRefInfo. It must align in functionality with + /// clearMust(). + ModRefInfo globalClearMayReadAnyGlobal(int I) const { + return ModRefInfo((I & static_cast(ModRefInfo::ModRef)) | + static_cast(ModRefInfo::NoModRef)); + } + /// Returns the \c ModRefInfo info for this function. ModRefInfo getModRefInfo() const { - return ModRefInfo(Info.getInt() & static_cast(ModRefInfo::ModRef)); + return globalClearMayReadAnyGlobal(Info.getInt()); } /// Adds new \c ModRefInfo for this function to its state. 
void addModRefInfo(ModRefInfo NewMRI) { - Info.setInt(Info.getInt() | static_cast(NewMRI)); + Info.setInt(Info.getInt() | static_cast(setMust(NewMRI))); } /// Returns whether this function may read any global variable, and we don't diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index fba96c8976a6..b0cb29203a5a 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -249,8 +249,6 @@ class CallAnalyzer : public InstVisitor { bool visitCastInst(CastInst &I); bool visitUnaryInstruction(UnaryInstruction &I); bool visitCmpInst(CmpInst &I); - bool visitAnd(BinaryOperator &I); - bool visitOr(BinaryOperator &I); bool visitSub(BinaryOperator &I); bool visitBinaryOperator(BinaryOperator &I); bool visitLoad(LoadInst &I); @@ -363,6 +361,7 @@ void CallAnalyzer::accumulateSROACost(DenseMap::iterator CostIt, void CallAnalyzer::disableLoadElimination() { if (EnableLoadElimination) { Cost += LoadEliminationCost; + LoadEliminationCost = 0; EnableLoadElimination = false; } } @@ -700,6 +699,22 @@ bool CallAnalyzer::visitCastInst(CastInst &I) { // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere. disableSROA(I.getOperand(0)); + // If this is a floating-point cast, and the target says this operation + // is expensive, this may eventually become a library call. Treat the cost + // as such. + switch (I.getOpcode()) { + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPToUI: + case Instruction::FPToSI: + if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive) + Cost += InlineConstants::CallPenalty; + default: + break; + } + return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); } @@ -1004,34 +1019,6 @@ bool CallAnalyzer::visitCmpInst(CmpInst &I) { return false; } -bool CallAnalyzer::visitOr(BinaryOperator &I) { - // This is necessary because the generic simplify instruction only works if - // both operands are constants. - for (unsigned i = 0; i < 2; ++i) { - if (ConstantInt *C = dyn_cast_or_null( - SimplifiedValues.lookup(I.getOperand(i)))) - if (C->isAllOnesValue()) { - SimplifiedValues[&I] = C; - return true; - } - } - return Base::visitOr(I); -} - -bool CallAnalyzer::visitAnd(BinaryOperator &I) { - // This is necessary because the generic simplify instruction only works if - // both operands are constants. - for (unsigned i = 0; i < 2; ++i) { - if (ConstantInt *C = dyn_cast_or_null( - SimplifiedValues.lookup(I.getOperand(i)))) - if (C->isZero()) { - SimplifiedValues[&I] = C; - return true; - } - } - return Base::visitAnd(I); -} - bool CallAnalyzer::visitSub(BinaryOperator &I) { // Try to handle a special case: we can fold computing the difference of two // constant-related pointers. 
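The cast visitor above and the binary-operator visitor below apply the same costing idea. Extracted as a standalone sketch (the helper name is ours, not the patch's):

#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"

// Charge the inliner's call penalty when an FP operation is likely to
// lower to a library call on the target.
int fpOpPenalty(const llvm::TargetTransformInfo &TTI,
                const llvm::Instruction &I) {
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) ==
          llvm::TargetTransformInfo::TCC_Expensive)
    return llvm::InlineConstants::CallPenalty;
  return 0;
}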
@@ -1061,23 +1048,38 @@ bool CallAnalyzer::visitSub(BinaryOperator &I) { bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); - auto Evaluate = [&](SmallVectorImpl &COps) { - Value *SimpleV = nullptr; - if (auto FI = dyn_cast(&I)) - SimpleV = SimplifyFPBinOp(I.getOpcode(), COps[0], COps[1], - FI->getFastMathFlags(), DL); - else - SimpleV = SimplifyBinOp(I.getOpcode(), COps[0], COps[1], DL); - return dyn_cast_or_null(SimpleV); - }; + Constant *CLHS = dyn_cast(LHS); + if (!CLHS) + CLHS = SimplifiedValues.lookup(LHS); + Constant *CRHS = dyn_cast(RHS); + if (!CRHS) + CRHS = SimplifiedValues.lookup(RHS); - if (simplifyInstruction(I, Evaluate)) + Value *SimpleV = nullptr; + if (auto FI = dyn_cast(&I)) + SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS, + CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL); + else + SimpleV = + SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL); + + if (Constant *C = dyn_cast_or_null(SimpleV)) + SimplifiedValues[&I] = C; + + if (SimpleV) return true; // Disable any SROA on arguments to arbitrary, unsimplified binary operators. disableSROA(LHS); disableSROA(RHS); + // If the instruction is floating point, and the target says this operation + // is expensive, this may eventually become a library call. Treat the cost + // as such. + if (I.getType()->isFloatingPointTy() && + TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive) + Cost += InlineConstants::CallPenalty; + return false; } @@ -1097,7 +1099,7 @@ bool CallAnalyzer::visitLoad(LoadInst &I) { // by any stores or calls, this load is likely to be redundant and can be // eliminated. if (EnableLoadElimination && - !LoadAddrSet.insert(I.getPointerOperand()).second) { + !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) { LoadEliminationCost += InlineConstants::InstrCost; return true; } @@ -1547,17 +1549,6 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB, if (isa(I) || I->getType()->isVectorTy()) ++NumVectorInstructions; - // If the instruction is floating point, and the target says this operation - // is expensive or the function has the "use-soft-float" attribute, this may - // eventually become a library call. Treat the cost as such. - if (I->getType()->isFloatingPointTy()) { - // If the function has the "use-soft-float" attribute, mark it as - // expensive. - if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive || - (F.getFnAttribute("use-soft-float").getValueAsString() == "true")) - Cost += InlineConstants::CallPenalty; - } - // If the instruction simplified to a constant, there is no cost to this // instruction. Visit the instructions using our InstVisitor to account for // all of the per-instruction logic. The visit tree returns true if we diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp index ed8e5e8cc489..e141d6c58b65 100644 --- a/lib/Analysis/LoopAccessAnalysis.cpp +++ b/lib/Analysis/LoopAccessAnalysis.cpp @@ -1107,77 +1107,6 @@ static unsigned getAddressSpaceOperand(Value *I) { return -1; } -// TODO:This API can be improved by using the permutation of given width as the -// accesses are entered into the map. 
-bool llvm::sortLoadAccesses(ArrayRef VL, const DataLayout &DL, - ScalarEvolution &SE, - SmallVectorImpl &Sorted, - SmallVectorImpl *Mask) { - SmallVector, 4> OffValPairs; - OffValPairs.reserve(VL.size()); - Sorted.reserve(VL.size()); - - // Walk over the pointers, and map each of them to an offset relative to - // first pointer in the array. - Value *Ptr0 = getPointerOperand(VL[0]); - const SCEV *Scev0 = SE.getSCEV(Ptr0); - Value *Obj0 = GetUnderlyingObject(Ptr0, DL); - PointerType *PtrTy = dyn_cast(Ptr0->getType()); - uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType()); - - for (auto *Val : VL) { - // The only kind of access we care about here is load. - if (!isa(Val)) - return false; - - Value *Ptr = getPointerOperand(Val); - assert(Ptr && "Expected value to have a pointer operand."); - // If a pointer refers to a different underlying object, bail - the - // pointers are by definition incomparable. - Value *CurrObj = GetUnderlyingObject(Ptr, DL); - if (CurrObj != Obj0) - return false; - - const SCEVConstant *Diff = - dyn_cast(SE.getMinusSCEV(SE.getSCEV(Ptr), Scev0)); - // The pointers may not have a constant offset from each other, or SCEV - // may just not be smart enough to figure out they do. Regardless, - // there's nothing we can do. - if (!Diff || static_cast(Diff->getAPInt().abs().getSExtValue()) > - (VL.size() - 1) * Size) - return false; - - OffValPairs.emplace_back(Diff->getAPInt().getSExtValue(), Val); - } - SmallVector UseOrder(VL.size()); - for (unsigned i = 0; i < VL.size(); i++) { - UseOrder[i] = i; - } - - // Sort the memory accesses and keep the order of their uses in UseOrder. - std::sort(UseOrder.begin(), UseOrder.end(), - [&OffValPairs](unsigned Left, unsigned Right) { - return OffValPairs[Left].first < OffValPairs[Right].first; - }); - - for (unsigned i = 0; i < VL.size(); i++) - Sorted.emplace_back(OffValPairs[UseOrder[i]].second); - - // Sort UseOrder to compute the Mask. - if (Mask) { - Mask->reserve(VL.size()); - for (unsigned i = 0; i < VL.size(); i++) - Mask->emplace_back(i); - std::sort(Mask->begin(), Mask->end(), - [&UseOrder](unsigned Left, unsigned Right) { - return UseOrder[Left] < UseOrder[Right]; - }); - } - - return true; -} - - /// Returns true if the memory operations \p A and \p B are consecutive. bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, ScalarEvolution &SE, bool CheckType) { diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index a6c590126c2f..bb7bf967994c 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -647,6 +647,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // Ok, this store might clobber the query pointer. Check to see if it is // a must alias: in this case, we want to return this as a def. + // FIXME: Use ModRefInfo::Must bit from getModRefInfo call above. MemoryLocation StoreLoc = MemoryLocation::get(SI); // If we found a pointer, check if it could be the same as our pointer. @@ -690,7 +691,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom( // If necessary, perform additional analysis. if (isModAndRefSet(MR)) MR = AA.callCapturesBefore(Inst, MemLoc, &DT, &OBB); - switch (MR) { + switch (clearMust(MR)) { case ModRefInfo::NoModRef: // If the call has no effect on the queried pointer, just ignore it. 
continue; @@ -919,6 +920,14 @@ void MemoryDependenceResults::getNonLocalPointerDependency( Instruction *QueryInst, SmallVectorImpl &Result) { const MemoryLocation Loc = MemoryLocation::get(QueryInst); bool isLoad = isa(QueryInst); + return getNonLocalPointerDependencyFrom(QueryInst, Loc, isLoad, Result); +} + +void MemoryDependenceResults::getNonLocalPointerDependencyFrom( + Instruction *QueryInst, + const MemoryLocation &Loc, + bool isLoad, + SmallVectorImpl &Result) { BasicBlock *FromBB = QueryInst->getParent(); assert(FromBB); @@ -1118,21 +1127,15 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( // If we already have a cache entry for this CacheKey, we may need to do some // work to reconcile the cache entry and the current query. if (!Pair.second) { - if (CacheInfo->Size < Loc.Size) { - // The query's Size is greater than the cached one. Throw out the - // cached data and proceed with the query at the greater size. + if (CacheInfo->Size != Loc.Size) { + // The query's Size differs from the cached one. Throw out the + // cached data and proceed with the query at the new size. CacheInfo->Pair = BBSkipFirstBlockPair(); CacheInfo->Size = Loc.Size; for (auto &Entry : CacheInfo->NonLocalDeps) if (Instruction *Inst = Entry.getResult().getInst()) RemoveFromReverseMap(ReverseNonLocalPtrDeps, Inst, CacheKey); CacheInfo->NonLocalDeps.clear(); - } else if (CacheInfo->Size > Loc.Size) { - // This query's Size is less than the cached one. Conservatively restart - // the query using the greater size. - return getNonLocalPointerDepFromBB( - QueryInst, Pointer, Loc.getWithNewSize(CacheInfo->Size), isLoad, - StartBB, Result, Visited, SkipFirstBlock); } // If the query's AATags are inconsistent with the cached one, diff --git a/lib/Analysis/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp index 8fe190e8bcf8..6e9368c49d65 100644 --- a/lib/Analysis/MemorySSA.cpp +++ b/lib/Analysis/MemorySSA.cpp @@ -192,8 +192,6 @@ template <> struct DenseMapInfo { } }; -enum class Reorderability { Always, IfNoAlias, Never }; - } // end namespace llvm /// This does one-way checks to see if Use could theoretically be hoisted above @@ -202,22 +200,16 @@ enum class Reorderability { Always, IfNoAlias, Never }; /// This assumes that, for the purposes of MemorySSA, Use comes directly after /// MayClobber, with no potentially clobbering operations in between them. /// (Where potentially clobbering ops are memory barriers, aliased stores, etc.) -static Reorderability getLoadReorderability(const LoadInst *Use, - const LoadInst *MayClobber) { +static bool areLoadsReorderable(const LoadInst *Use, + const LoadInst *MayClobber) { bool VolatileUse = Use->isVolatile(); bool VolatileClobber = MayClobber->isVolatile(); // Volatile operations may never be reordered with other volatile operations. if (VolatileUse && VolatileClobber) - return Reorderability::Never; - - // The lang ref allows reordering of volatile and non-volatile operations. - // Whether an aliasing nonvolatile load and volatile load can be reordered, - // though, is ambiguous. Because it may not be best to exploit this ambiguity, - // we only allow volatile/non-volatile reordering if the volatile and - // non-volatile operations don't alias. - Reorderability Result = VolatileUse || VolatileClobber - ? Reorderability::IfNoAlias - : Reorderability::Always; + return false; + // Otherwise, volatile doesn't matter here. 
From the language reference: + // 'optimizers may change the order of volatile operations relative to + // non-volatile operations.'" // If a load is seq_cst, it cannot be moved above other loads. If its ordering // is weaker, it can be moved above other loads. We just need to be sure that @@ -229,9 +221,7 @@ static Reorderability getLoadReorderability(const LoadInst *Use, bool SeqCstUse = Use->getOrdering() == AtomicOrdering::SequentiallyConsistent; bool MayClobberIsAcquire = isAtLeastOrStrongerThan(MayClobber->getOrdering(), AtomicOrdering::Acquire); - if (SeqCstUse || MayClobberIsAcquire) - return Reorderability::Never; - return Result; + return !(SeqCstUse || MayClobberIsAcquire); } static bool instructionClobbersQuery(MemoryDef *MD, @@ -265,18 +255,9 @@ static bool instructionClobbersQuery(MemoryDef *MD, return isModOrRefSet(I); } - if (auto *DefLoad = dyn_cast(DefInst)) { - if (auto *UseLoad = dyn_cast(UseInst)) { - switch (getLoadReorderability(UseLoad, DefLoad)) { - case Reorderability::Always: - return false; - case Reorderability::Never: - return true; - case Reorderability::IfNoAlias: - return !AA.isNoAlias(UseLoc, MemoryLocation::get(DefLoad)); - } - } - } + if (auto *DefLoad = dyn_cast(DefInst)) + if (auto *UseLoad = dyn_cast(UseInst)) + return !areLoadsReorderable(UseLoad, DefLoad); return isModSet(AA.getModRefInfo(DefInst, UseLoc)); } diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp index d54fb700200d..10badd89a4a8 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -454,7 +454,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( std::unique_ptr BFIPtr; if (GetBFICallback) BFI = GetBFICallback(F); - else if (F.getEntryCount().hasValue()) { + else if (F.hasProfileData()) { LoopInfo LI{DominatorTree(const_cast(F))}; BranchProbabilityInfo BPI{F, LI}; BFIPtr = llvm::make_unique(F, BPI, LI); diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp index 671744f93fb8..347d093b0f61 100644 --- a/lib/Analysis/ProfileSummaryInfo.cpp +++ b/lib/Analysis/ProfileSummaryInfo.cpp @@ -115,42 +115,62 @@ bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) { return FunctionCount && isHotCount(FunctionCount.getValue()); } -/// Returns true if the function's entry or total call edge count is hot. +/// Returns true if the function contains hot code. This can include a hot +/// function entry count, hot basic block, or (in the case of Sample PGO) +/// hot total call edge count. /// If it returns false, it either means it is not hot or it is unknown -/// whether it is hot or not (for example, no profile data is available). -bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F) { +/// (for example, no profile data is available). 
+bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F, + BlockFrequencyInfo &BFI) { if (!F || !computeSummary()) return false; if (auto FunctionCount = F->getEntryCount()) if (isHotCount(FunctionCount.getValue())) return true; - uint64_t TotalCallCount = 0; + if (hasSampleProfile()) { + uint64_t TotalCallCount = 0; + for (const auto &BB : *F) + for (const auto &I : BB) + if (isa<CallInst>(I) || isa<InvokeInst>(I)) + if (auto CallCount = getProfileCount(&I, nullptr)) + TotalCallCount += CallCount.getValue(); + if (isHotCount(TotalCallCount)) + return true; + } for (const auto &BB : *F) - for (const auto &I : BB) - if (isa<CallInst>(I) || isa<InvokeInst>(I)) - if (auto CallCount = getProfileCount(&I, nullptr)) - TotalCallCount += CallCount.getValue(); - return isHotCount(TotalCallCount); + if (isHotBB(&BB, &BFI)) + return true; + return false; } -/// Returns true if the function's entry and total call edge count is cold. +/// Returns true if the function only contains cold code. This means that +/// the function entry and blocks are all cold, and (in the case of Sample PGO) +/// the total call edge count is cold. /// If it returns false, it either means it is not cold or it is unknown -/// whether it is cold or not (for example, no profile data is available). -bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F) { +/// (for example, no profile data is available). +bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F, + BlockFrequencyInfo &BFI) { if (!F || !computeSummary()) return false; if (auto FunctionCount = F->getEntryCount()) if (!isColdCount(FunctionCount.getValue())) return false; - - uint64_t TotalCallCount = 0; + + if (hasSampleProfile()) { + uint64_t TotalCallCount = 0; + for (const auto &BB : *F) + for (const auto &I : BB) + if (isa<CallInst>(I) || isa<InvokeInst>(I)) + if (auto CallCount = getProfileCount(&I, nullptr)) + TotalCallCount += CallCount.getValue(); + if (!isColdCount(TotalCallCount)) + return false; + } for (const auto &BB : *F) - for (const auto &I : BB) - if (isa<CallInst>(I) || isa<InvokeInst>(I)) - if (auto CallCount = getProfileCount(&I, nullptr)) - TotalCallCount += CallCount.getValue(); - return isColdCount(TotalCallCount); + if (!isColdBB(&BB, &BFI)) + return false; + return true; } /// Returns true if the function's entry is cold. If it returns false, it @@ -231,7 +251,7 @@ bool ProfileSummaryInfo::isColdCallSite(const CallSite &CS, // If there is no profile for the caller, and we know the profile is // accurate, we consider the callsite as cold.
return (hasSampleProfile() && - (CS.getCaller()->getEntryCount() || ProfileSampleAccurate || + (CS.getCaller()->hasProfileData() || ProfileSampleAccurate || CS.getCaller()->hasFnAttribute("profile-sample-accurate"))); } diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 0b8604187121..2a8088dc4452 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -4368,6 +4368,7 @@ static Optional MatchBinaryOp(Value *V, DominatorTree &DT) { default: break; } + break; } default: diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index b744cae51ed7..c9e9c6d1a419 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -314,6 +314,10 @@ int TargetTransformInfo::getIntImmCost(Intrinsic::ID IID, unsigned Idx, return Cost; } +bool TargetTransformInfo::isOutOfOrder() const { + return TTIImpl->isOutOfOrder(); +} + unsigned TargetTransformInfo::getNumberOfRegisters(bool Vector) const { return TTIImpl->getNumberOfRegisters(Vector); } diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp index c9ed026a1e33..173db399b9d6 100644 --- a/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -544,21 +544,32 @@ static bool matchAccessTags(const MDNode *A, const MDNode *B, TBAAStructTagNode TagA(A), TagB(B); const MDNode *CommonType = getLeastCommonType(TagA.getAccessType(), TagB.getAccessType()); - if (GenericTag) - *GenericTag = createAccessTag(CommonType); // TODO: We need to check if AccessType of TagA encloses AccessType of // TagB to support aggregate AccessType. If yes, return true. // Climb the type DAG from base type of A to see if we reach base type of B. uint64_t OffsetA; - if (findAccessType(TagA, TagB.getBaseType(), OffsetA)) - return OffsetA == TagB.getOffset(); + if (findAccessType(TagA, TagB.getBaseType(), OffsetA)) { + bool SameMemberAccess = OffsetA == TagB.getOffset(); + if (GenericTag) + *GenericTag = SameMemberAccess ? TagB.getNode() : + createAccessTag(CommonType); + return SameMemberAccess; + } // Climb the type DAG from base type of B to see if we reach base type of A. uint64_t OffsetB; - if (findAccessType(TagB, TagA.getBaseType(), OffsetB)) - return OffsetB == TagA.getOffset(); + if (findAccessType(TagB, TagA.getBaseType(), OffsetB)) { + bool SameMemberAccess = OffsetB == TagA.getOffset(); + if (GenericTag) + *GenericTag = SameMemberAccess ? TagA.getNode() : + createAccessTag(CommonType); + return SameMemberAccess; + } + + if (GenericTag) + *GenericTag = createAccessTag(CommonType); // If the final access types have different roots, they're part of different // potentially unrelated type systems, so we must be conservative. 
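The BFI-aware hotness queries above get a caller in CodeGenPrepare further down, where they pick a section prefix for the function. The shape of that use, as a sketch:

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/IR/Function.h"

// Classify F for section placement; nullptr means unknown, e.g. no
// profile data or a mix of hot and cold blocks.
const char *sectionPrefixFor(llvm::ProfileSummaryInfo &PSI, llvm::Function &F,
                             llvm::BlockFrequencyInfo &BFI) {
  if (PSI.isFunctionHotInCallGraph(&F, BFI))
    return ".hot";
  if (PSI.isFunctionColdInCallGraph(&F, BFI))
    return ".unlikely";
  return nullptr;
}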
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index fd13dbc1f1e2..a7201ed97350 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -3371,7 +3371,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord( for (auto &RI : FS->refs()) NameVals.push_back(VE.getValueID(RI.getValue())); - bool HasProfileData = F.getEntryCount().hasValue(); + bool HasProfileData = F.hasProfileData(); for (auto &ECI : FS->calls()) { NameVals.push_back(getValueId(ECI.first)); if (HasProfileData) diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 31037095aa2b..d7995447592c 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2033,6 +2033,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { } } // else fallthrough + LLVM_FALLTHROUGH; // The MC library also has a right-shift operator, but it isn't consistently // signed or unsigned between different targets. diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 07ba5d36cc96..3aeb4910ab10 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -51,7 +51,7 @@ add_llvm_library(LLVMCodeGen LiveRangeShrink.cpp LiveRegMatrix.cpp LiveRegUnits.cpp - LiveStackAnalysis.cpp + LiveStacks.cpp LiveVariables.cpp LLVMTargetMachine.cpp LocalStackSlotAllocation.cpp diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index c4794380f791..d6f55bba716f 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -352,8 +352,6 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // Clear per function information. InsertedInsts.clear(); PromotedInsts.clear(); - BFI.reset(); - BPI.reset(); ModifiedDT = false; if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { @@ -365,14 +363,16 @@ bool CodeGenPrepare::runOnFunction(Function &F) { TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + BPI.reset(new BranchProbabilityInfo(F, *LI)); + BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI)); OptSize = F.optForSize(); ProfileSummaryInfo *PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); if (ProfileGuidedSectionPrefix) { - if (PSI->isFunctionHotInCallGraph(&F)) + if (PSI->isFunctionHotInCallGraph(&F, *BFI)) F.setSectionPrefix(".hot"); - else if (PSI->isFunctionColdInCallGraph(&F)) + else if (PSI->isFunctionColdInCallGraph(&F, *BFI)) F.setSectionPrefix(".unlikely"); } @@ -652,13 +652,6 @@ bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB, if (SameIncomingValueBBs.count(Pred)) return true; - if (!BFI) { - Function &F = *BB->getParent(); - LoopInfo LI{DominatorTree(F)}; - BPI.reset(new BranchProbabilityInfo(F, LI)); - BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); - } - BlockFrequency PredFreq = BFI->getBlockFreq(Pred); BlockFrequency BBFreq = BFI->getBlockFreq(BB); @@ -3704,7 +3697,7 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, } else { uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType()); if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { - ConstantOffset += CI->getSExtValue()*TypeSize; + ConstantOffset += CI->getSExtValue() * TypeSize; } else if (TypeSize) { // Scales of zero don't do anything. // We only allow one variable index at the moment.
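The loop this hunk touches folds every constant GEP index into one running byte offset (index value times the allocation size of the indexed type) while tolerating at most one variable index. A standalone sketch of that accumulation in plain C++; the struct is a hypothetical stand-in for the GEP operand walk, not the pass's own data structures:

#include <cstdint>
#include <optional>
#include <vector>

struct GEPIndex {
  std::optional<int64_t> ConstValue; // set when the index is a constant
  uint64_t TypeAllocSize;            // byte size of the indexed element type
};

// Returns the accumulated constant offset, or nullopt once a second
// variable index is seen (the single-variable-index restriction above).
std::optional<int64_t>
accumulateConstantOffset(const std::vector<GEPIndex> &Indices) {
  int64_t ConstantOffset = 0;
  bool SawVariableIndex = false;
  for (const GEPIndex &Idx : Indices) {
    if (Idx.ConstValue) {
      ConstantOffset += *Idx.ConstValue * (int64_t)Idx.TypeAllocSize;
    } else if (Idx.TypeAllocSize) { // scales of zero contribute nothing
      if (SawVariableIndex)
        return std::nullopt;
      SawVariableIndex = true;
    }
  }
  return ConstantOffset;
}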
if (VariableOperand != -1) diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 87a658be4c29..a3b43c92a7fc 100644 --- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -835,6 +835,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { case 64: ZeroTy = Type::getDoubleTy(Ctx); break; + case 128: + ZeroTy = Type::getFP128Ty(Ctx); + break; default: llvm_unreachable("unexpected floating-point type"); } diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp index 1aaf7a0ceef8..86ce4b7a9464 100644 --- a/lib/CodeGen/InlineSpiller.cpp +++ b/lib/CodeGen/InlineSpiller.cpp @@ -28,7 +28,7 @@ #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeEdit.h" -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 92edfb059ad6..77a7aaa95732 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -81,10 +81,9 @@ LLVMTargetMachine::LLVMTargetMachine(const Target &T, this->OptLevel = OL; } -TargetIRAnalysis LLVMTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo(BasicTTIImpl(this, F)); - }); +TargetTransformInfo +LLVMTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(BasicTTIImpl(this, F)); } /// addPassesToX helper drives creation and initialization of TargetPassConfig. diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStacks.cpp similarity index 96% rename from lib/CodeGen/LiveStackAnalysis.cpp rename to lib/CodeGen/LiveStacks.cpp index b0e58b0e3e5f..80ecfdb7a507 100644 --- a/lib/CodeGen/LiveStackAnalysis.cpp +++ b/lib/CodeGen/LiveStacks.cpp @@ -1,4 +1,4 @@ -//===-- LiveStackAnalysis.cpp - Live Stack Slot Analysis ------------------===// +//===-- LiveStacks.cpp - Live Stack Slot Analysis -------------------------===// // // The LLVM Compiler Infrastructure // @@ -13,7 +13,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp index 3568f96d2b9a..f91cca6e4e50 100644 --- a/lib/CodeGen/MIRPrinter.cpp +++ b/lib/CodeGen/MIRPrinter.cpp @@ -157,18 +157,14 @@ class MIPrinter { void print(const MachineBasicBlock &MBB); void print(const MachineInstr &MI); - void printIRBlockReference(const BasicBlock &BB); void printIRValueReference(const Value &V); void printStackObjectReference(int FrameIndex); - void printOffset(int64_t Offset); void print(const MachineInstr &MI, unsigned OpIdx, const TargetRegisterInfo *TRI, bool ShouldPrintRegisterTies, LLT TypeToPrint, bool PrintDef = true); void print(const LLVMContext &Context, const TargetInstrInfo &TII, const MachineMemOperand &Op); void printSyncScope(const LLVMContext &Context, SyncScope::ID SSID); - - void print(const MCCFIInstruction &CFI, const TargetRegisterInfo *TRI); }; } // end namespace llvm @@ -707,32 +703,6 @@ void MIPrinter::print(const MachineInstr &MI) { } } -static 
void printIRSlotNumber(raw_ostream &OS, int Slot) { - if (Slot == -1) - OS << "<badref>"; - else - OS << Slot; -} - -void MIPrinter::printIRBlockReference(const BasicBlock &BB) { - OS << "%ir-block."; - if (BB.hasName()) { - printLLVMNameWithoutPrefix(OS, BB.getName()); - return; - } - const Function *F = BB.getParent(); - int Slot; - if (F == MST.getCurrentFunction()) { - Slot = MST.getLocalSlot(&BB); - } else { - ModuleSlotTracker CustomMST(F->getParent(), - /*ShouldInitializeAllMetadata=*/false); - CustomMST.incorporateFunction(*F); - Slot = CustomMST.getLocalSlot(&BB); - } - printIRSlotNumber(OS, Slot); -} - void MIPrinter::printIRValueReference(const Value &V) { if (isa<GlobalValue>(V)) { V.printAsOperand(OS, /*PrintType=*/false, MST); @@ -750,7 +720,7 @@ void MIPrinter::printIRValueReference(const Value &V) { printLLVMNameWithoutPrefix(OS, V.getName()); return; } - printIRSlotNumber(OS, MST.getLocalSlot(&V)); + MachineOperand::printIRSlotNumber(OS, MST.getLocalSlot(&V)); } void MIPrinter::printStackObjectReference(int FrameIndex) { @@ -762,16 +732,6 @@ void MIPrinter::printStackObjectReference(int FrameIndex) { Operand.Name); } -void MIPrinter::printOffset(int64_t Offset) { - if (Offset == 0) - return; - if (Offset < 0) { - OS << " - " << -Offset; - return; - } - OS << " + " << Offset; -} - void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, const TargetRegisterInfo *TRI, bool ShouldPrintRegisterTies, LLT TypeToPrint, @@ -787,6 +747,7 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, LLVM_FALLTHROUGH; case MachineOperand::MO_Register: case MachineOperand::MO_CImmediate: + case MachineOperand::MO_FPImmediate: case MachineOperand::MO_MachineBasicBlock: case MachineOperand::MO_ConstantPoolIndex: case MachineOperand::MO_TargetIndex: @@ -795,7 +756,11 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, case MachineOperand::MO_GlobalAddress: case MachineOperand::MO_RegisterLiveOut: case MachineOperand::MO_Metadata: - case MachineOperand::MO_MCSymbol: { + case MachineOperand::MO_MCSymbol: + case MachineOperand::MO_CFIIndex: + case MachineOperand::MO_IntrinsicID: + case MachineOperand::MO_Predicate: + case MachineOperand::MO_BlockAddress: { unsigned TiedOperandIdx = 0; if (ShouldPrintRegisterTies && Op.isReg() && Op.isTied() && !Op.isDef()) TiedOperandIdx = Op.getParent()->findTiedOperandIdx(OpIdx); @@ -804,21 +769,9 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, TiedOperandIdx, TRI, TII); break; } - case MachineOperand::MO_FPImmediate: - Op.getFPImm()->printAsOperand(OS, /*PrintType=*/true, MST); - break; case MachineOperand::MO_FrameIndex: printStackObjectReference(Op.getIndex()); break; - case MachineOperand::MO_BlockAddress: - OS << "blockaddress("; - Op.getBlockAddress()->getFunction()->printAsOperand(OS, /*PrintType=*/false, - MST); - OS << ", "; - printIRBlockReference(*Op.getBlockAddress()->getBasicBlock()); - OS << ')'; - printOffset(Op.getOffset()); - break; case MachineOperand::MO_RegisterMask: { auto RegMaskInfo = RegisterMaskIds.find(Op.getRegMask()); if (RegMaskInfo != RegisterMaskIds.end()) @@ -827,28 +780,6 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, printCustomRegMask(Op.getRegMask(), OS, TRI); break; } - case MachineOperand::MO_CFIIndex: { - const MachineFunction &MF = *Op.getParent()->getMF(); - print(MF.getFrameInstructions()[Op.getCFIIndex()], TRI); - break; - } - case MachineOperand::MO_IntrinsicID: { - Intrinsic::ID ID = Op.getIntrinsicID(); - if (ID < Intrinsic::num_intrinsics) - OS << "intrinsic(@" <<
Intrinsic::getName(ID, None) << ')'; - else { - const MachineFunction &MF = *Op.getParent()->getMF(); - const TargetIntrinsicInfo *TII = MF.getTarget().getIntrinsicInfo(); - OS << "intrinsic(@" << TII->getName(ID) << ')'; - } - break; - } - case MachineOperand::MO_Predicate: { - auto Pred = static_cast<CmpInst::Predicate>(Op.getPredicate()); - OS << (CmpInst::isIntPredicate(Pred) ? "int" : "float") << "pred(" - << CmpInst::getPredicateName(Pred) << ')'; - break; - } } } @@ -938,7 +869,7 @@ void MIPrinter::print(const LLVMContext &Context, const TargetInstrInfo &TII, break; } } - printOffset(Op.getOffset()); + MachineOperand::printOperandOffset(OS, Op.getOffset()); if (Op.getBaseAlignment() != Op.getSize()) OS << ", align " << Op.getBaseAlignment(); auto AAInfo = Op.getAAInfo(); @@ -978,118 +909,6 @@ void MIPrinter::printSyncScope(const LLVMContext &Context, SyncScope::ID SSID) { } } -static void printCFIRegister(unsigned DwarfReg, raw_ostream &OS, - const TargetRegisterInfo *TRI) { - int Reg = TRI->getLLVMRegNum(DwarfReg, true); - if (Reg == -1) { - OS << "<badreg>"; - return; - } - OS << printReg(Reg, TRI); -} - -void MIPrinter::print(const MCCFIInstruction &CFI, - const TargetRegisterInfo *TRI) { - switch (CFI.getOperation()) { - case MCCFIInstruction::OpSameValue: - OS << "same_value "; - if (MCSymbol *Label = CFI.getLabel()) - MachineOperand::printSymbol(OS, *Label); - printCFIRegister(CFI.getRegister(), OS, TRI); - break; - case MCCFIInstruction::OpRememberState: - OS << "remember_state "; - if (MCSymbol *Label = CFI.getLabel()) - MachineOperand::printSymbol(OS, *Label); - break; - case MCCFIInstruction::OpRestoreState: - OS << "restore_state "; - if (MCSymbol *Label = CFI.getLabel()) - MachineOperand::printSymbol(OS, *Label); - break; - case MCCFIInstruction::OpOffset: - OS << "offset "; - if (MCSymbol *Label = CFI.getLabel()) - MachineOperand::printSymbol(OS, *Label); - printCFIRegister(CFI.getRegister(), OS, TRI); - OS << ", " << CFI.getOffset(); - break; - case MCCFIInstruction::OpDefCfaRegister: - OS << "def_cfa_register "; - if (MCSymbol *Label = CFI.getLabel()) - MachineOperand::printSymbol(OS, *Label); - printCFIRegister(CFI.getRegister(), OS, TRI); - break; - case MCCFIInstruction::OpDefCfaOffset: - OS << "def_cfa_offset "; - if (MCSymbol *Label = CFI.getLabel()) - MachineOperand::printSymbol(OS, *Label); - OS << CFI.getOffset(); - break; - case MCCFIInstruction::OpDefCfa: - OS << "def_cfa "; - if (MCSymbol *Label = CFI.getLabel()) - MachineOperand::printSymbol(OS, *Label); - printCFIRegister(CFI.getRegister(), OS, TRI); - OS << ", " << CFI.getOffset(); - break; - case MCCFIInstruction::OpRelOffset: - OS << "rel_offset "; - if (MCSymbol *Label = CFI.getLabel()) - MachineOperand::printSymbol(OS, *Label); - printCFIRegister(CFI.getRegister(), OS, TRI); - OS << ", " << CFI.getOffset(); - break; - case MCCFIInstruction::OpAdjustCfaOffset: - OS << "adjust_cfa_offset "; - if (MCSymbol *Label = CFI.getLabel()) - MachineOperand::printSymbol(OS, *Label); - OS << CFI.getOffset(); - break; - case MCCFIInstruction::OpRestore: - OS << "restore "; - if (MCSymbol *Label = CFI.getLabel()) - MachineOperand::printSymbol(OS, *Label); - printCFIRegister(CFI.getRegister(), OS, TRI); - break; - case MCCFIInstruction::OpEscape: { - OS << "escape "; - if (MCSymbol *Label = CFI.getLabel()) - MachineOperand::printSymbol(OS, *Label); - if (!CFI.getValues().empty()) { - size_t e = CFI.getValues().size() - 1; - for (size_t i = 0; i < e; ++i) - OS << format("0x%02x", uint8_t(CFI.getValues()[i])) << ", "; - OS << format("0x%02x",
uint8_t(CFI.getValues()[e])) << ", "; - } - break; - } - case MCCFIInstruction::OpUndefined: - OS << "undefined "; - if (MCSymbol *Label = CFI.getLabel()) - MachineOperand::printSymbol(OS, *Label); - printCFIRegister(CFI.getRegister(), OS, TRI); - break; - case MCCFIInstruction::OpRegister: - OS << "register "; - if (MCSymbol *Label = CFI.getLabel()) - MachineOperand::printSymbol(OS, *Label); - printCFIRegister(CFI.getRegister(), OS, TRI); - OS << ", "; - printCFIRegister(CFI.getRegister2(), OS, TRI); - break; - case MCCFIInstruction::OpWindowSave: - OS << "window_save "; - if (MCSymbol *Label = CFI.getLabel()) - MachineOperand::printSymbol(OS, *Label); - break; - default: - // TODO: Print the other CFI Operations. - OS << ""; - break; - } -} - void llvm::printMIR(raw_ostream &OS, const Module &M) { yaml::Output Out(OS); Out << const_cast<Module &>(M); diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 4ce689607730..84c808ee7938 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -1235,7 +1235,7 @@ void MachineBlockPlacement::precomputeTriangleChains() { // When profile is available, we need to handle the triangle-shape CFG. static BranchProbability getLayoutSuccessorProbThreshold( const MachineBasicBlock *BB) { - if (!BB->getParent()->getFunction().getEntryCount()) + if (!BB->getParent()->getFunction().hasProfileData()) return BranchProbability(StaticLikelyProb, 100); if (BB->succ_size() == 2) { const MachineBasicBlock *Succ1 = *BB->succ_begin(); @@ -2178,7 +2178,7 @@ MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) { // will be merged into the first outer loop chain for which this block is not // cold anymore. This needs precise profile data and we only do this when // profile data is available. - if (F->getFunction().getEntryCount() || ForceLoopColdBlock) { + if (F->getFunction().hasProfileData() || ForceLoopColdBlock) { BlockFrequency LoopFreq(0); for (auto LoopPred : L.getHeader()->predecessors()) if (!L.contains(LoopPred)) @@ -2220,7 +2220,7 @@ void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) { // for better layout. bool RotateLoopWithProfile = ForcePreciseRotationCost || - (PreciseRotationCost && F->getFunction().getEntryCount()); + (PreciseRotationCost && F->getFunction().hasProfileData()); // First check to see if there is an obviously preferable top block for the // loop. This will default to the header, but may end up as one of the diff --git a/lib/CodeGen/MachineOperand.cpp b/lib/CodeGen/MachineOperand.cpp index d17c481862a1..ec81c6391171 100644 --- a/lib/CodeGen/MachineOperand.cpp +++ b/lib/CodeGen/MachineOperand.cpp @@ -380,16 +380,6 @@ static void tryToGetTargetInfo(const MachineOperand &MO, } } -static void printOffset(raw_ostream &OS, int64_t Offset) { - if (Offset == 0) - return; - if (Offset < 0) { - OS << " - " << -Offset; - return; - } - OS << " + " << Offset; -} - static const char *getTargetIndexName(const MachineFunction &MF, int Index) { const auto *TII = MF.getSubtarget().getInstrInfo(); assert(TII && "expected instruction info"); @@ -412,6 +402,44 @@ static const char *getTargetFlagName(const TargetInstrInfo *TII, unsigned TF) { return nullptr; } +static void printCFIRegister(unsigned DwarfReg, raw_ostream &OS, + const TargetRegisterInfo *TRI) { + if (!TRI) { + OS << "%dwarfreg."
<< DwarfReg; + return; + } + + int Reg = TRI->getLLVMRegNum(DwarfReg, true); + if (Reg == -1) { + OS << "<badreg>"; + return; + } + OS << printReg(Reg, TRI); +} + +static void printIRBlockReference(raw_ostream &OS, const BasicBlock &BB, + ModuleSlotTracker &MST) { + OS << "%ir-block."; + if (BB.hasName()) { + printLLVMNameWithoutPrefix(OS, BB.getName()); + return; + } + Optional<int> Slot; + if (const Function *F = BB.getParent()) { + if (F == MST.getCurrentFunction()) { + Slot = MST.getLocalSlot(&BB); + } else if (const Module *M = F->getParent()) { + ModuleSlotTracker CustomMST(M, /*ShouldInitializeAllMetadata=*/false); + CustomMST.incorporateFunction(*F); + Slot = CustomMST.getLocalSlot(&BB); + } + } + if (Slot) + MachineOperand::printIRSlotNumber(OS, *Slot); + else + OS << "<unknown>"; +} + void MachineOperand::printSubregIdx(raw_ostream &OS, uint64_t Index, const TargetRegisterInfo *TRI) { OS << "%subreg."; @@ -490,6 +518,125 @@ void MachineOperand::printStackObjectReference(raw_ostream &OS, OS << '.' << Name; } +void MachineOperand::printOperandOffset(raw_ostream &OS, int64_t Offset) { + if (Offset == 0) + return; + if (Offset < 0) { + OS << " - " << -Offset; + return; + } + OS << " + " << Offset; +} + +void MachineOperand::printIRSlotNumber(raw_ostream &OS, int Slot) { + if (Slot == -1) + OS << "<badref>"; + else + OS << Slot; +} + +static void printCFI(raw_ostream &OS, const MCCFIInstruction &CFI, + const TargetRegisterInfo *TRI) { + switch (CFI.getOperation()) { + case MCCFIInstruction::OpSameValue: + OS << "same_value "; + if (MCSymbol *Label = CFI.getLabel()) + MachineOperand::printSymbol(OS, *Label); + printCFIRegister(CFI.getRegister(), OS, TRI); + break; + case MCCFIInstruction::OpRememberState: + OS << "remember_state "; + if (MCSymbol *Label = CFI.getLabel()) + MachineOperand::printSymbol(OS, *Label); + break; + case MCCFIInstruction::OpRestoreState: + OS << "restore_state "; + if (MCSymbol *Label = CFI.getLabel()) + MachineOperand::printSymbol(OS, *Label); + break; + case MCCFIInstruction::OpOffset: + OS << "offset "; + if (MCSymbol *Label = CFI.getLabel()) + MachineOperand::printSymbol(OS, *Label); + printCFIRegister(CFI.getRegister(), OS, TRI); + OS << ", " << CFI.getOffset(); + break; + case MCCFIInstruction::OpDefCfaRegister: + OS << "def_cfa_register "; + if (MCSymbol *Label = CFI.getLabel()) + MachineOperand::printSymbol(OS, *Label); + printCFIRegister(CFI.getRegister(), OS, TRI); + break; + case MCCFIInstruction::OpDefCfaOffset: + OS << "def_cfa_offset "; + if (MCSymbol *Label = CFI.getLabel()) + MachineOperand::printSymbol(OS, *Label); + OS << CFI.getOffset(); + break; + case MCCFIInstruction::OpDefCfa: + OS << "def_cfa "; + if (MCSymbol *Label = CFI.getLabel()) + MachineOperand::printSymbol(OS, *Label); + printCFIRegister(CFI.getRegister(), OS, TRI); + OS << ", " << CFI.getOffset(); + break; + case MCCFIInstruction::OpRelOffset: + OS << "rel_offset "; + if (MCSymbol *Label = CFI.getLabel()) + MachineOperand::printSymbol(OS, *Label); + printCFIRegister(CFI.getRegister(), OS, TRI); + OS << ", " << CFI.getOffset(); + break; + case MCCFIInstruction::OpAdjustCfaOffset: + OS << "adjust_cfa_offset "; + if (MCSymbol *Label = CFI.getLabel()) + MachineOperand::printSymbol(OS, *Label); + OS << CFI.getOffset(); + break; + case MCCFIInstruction::OpRestore: + OS << "restore "; + if (MCSymbol *Label = CFI.getLabel()) + MachineOperand::printSymbol(OS, *Label); + printCFIRegister(CFI.getRegister(), OS, TRI); + break; + case MCCFIInstruction::OpEscape: { + OS << "escape "; + if (MCSymbol *Label =
CFI.getLabel()) + MachineOperand::printSymbol(OS, *Label); + if (!CFI.getValues().empty()) { + size_t e = CFI.getValues().size() - 1; + for (size_t i = 0; i < e; ++i) + OS << format("0x%02x", uint8_t(CFI.getValues()[i])) << ", "; + OS << format("0x%02x", uint8_t(CFI.getValues()[e])) << ", "; + } + break; + } + case MCCFIInstruction::OpUndefined: + OS << "undefined "; + if (MCSymbol *Label = CFI.getLabel()) + MachineOperand::printSymbol(OS, *Label); + printCFIRegister(CFI.getRegister(), OS, TRI); + break; + case MCCFIInstruction::OpRegister: + OS << "register "; + if (MCSymbol *Label = CFI.getLabel()) + MachineOperand::printSymbol(OS, *Label); + printCFIRegister(CFI.getRegister(), OS, TRI); + OS << ", "; + printCFIRegister(CFI.getRegister2(), OS, TRI); + break; + case MCCFIInstruction::OpWindowSave: + OS << "window_save "; + if (MCSymbol *Label = CFI.getLabel()) + MachineOperand::printSymbol(OS, *Label); + break; + default: + // TODO: Print the other CFI Operations. + OS << ""; + break; + } +} + void MachineOperand::print(raw_ostream &OS, const TargetRegisterInfo *TRI, const TargetIntrinsicInfo *IntrinsicInfo) const { tryToGetTargetInfo(*this, TRI, IntrinsicInfo); @@ -561,29 +708,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, getCImm()->printAsOperand(OS, /*PrintType=*/true, MST); break; case MachineOperand::MO_FPImmediate: - if (getFPImm()->getType()->isFloatTy()) { - OS << getFPImm()->getValueAPF().convertToFloat(); - } else if (getFPImm()->getType()->isHalfTy()) { - APFloat APF = getFPImm()->getValueAPF(); - bool Unused; - APF.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &Unused); - OS << "half " << APF.convertToFloat(); - } else if (getFPImm()->getType()->isFP128Ty()) { - APFloat APF = getFPImm()->getValueAPF(); - SmallString<16> Str; - getFPImm()->getValueAPF().toString(Str); - OS << "quad " << Str; - } else if (getFPImm()->getType()->isX86_FP80Ty()) { - APFloat APF = getFPImm()->getValueAPF(); - OS << "x86_fp80 0xK"; - APInt API = APF.bitcastToAPInt(); - OS << format_hex_no_prefix(API.getHiBits(16).getZExtValue(), 4, - /*Upper=*/true); - OS << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16, - /*Upper=*/true); - } else { - OS << getFPImm()->getValueAPF().convertToDouble(); - } + getFPImm()->printAsOperand(OS, /*PrintType=*/true, MST); break; case MachineOperand::MO_MachineBasicBlock: OS << printMBBReference(*getMBB()); @@ -606,7 +731,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, } case MachineOperand::MO_ConstantPoolIndex: OS << "%const." 
<< getIndex(); - printOffset(OS, getOffset()); + printOperandOffset(OS, getOffset()); break; case MachineOperand::MO_TargetIndex: { OS << "target-index("; @@ -615,7 +740,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, if (const auto *TargetIndexName = getTargetIndexName(*MF, getIndex())) Name = TargetIndexName; OS << Name << ')'; - printOffset(OS, getOffset()); + printOperandOffset(OS, getOffset()); break; } case MachineOperand::MO_JumpTableIndex: @@ -623,7 +748,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, break; case MachineOperand::MO_GlobalAddress: getGlobal()->printAsOperand(OS, /*PrintType=*/false, MST); - printOffset(OS, getOffset()); + printOperandOffset(OS, getOffset()); break; case MachineOperand::MO_ExternalSymbol: { StringRef Name = getSymbolName(); @@ -633,16 +758,19 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, } else { printLLVMNameWithoutPrefix(OS, Name); } - printOffset(OS, getOffset()); + printOperandOffset(OS, getOffset()); break; } - case MachineOperand::MO_BlockAddress: - OS << '<'; - getBlockAddress()->printAsOperand(OS, /*PrintType=*/false, MST); - if (getOffset()) - OS << "+" << getOffset(); - OS << '>'; + case MachineOperand::MO_BlockAddress: { + OS << "blockaddress("; + getBlockAddress()->getFunction()->printAsOperand(OS, /*PrintType=*/false, + MST); + OS << ", "; + printIRBlockReference(OS, *getBlockAddress()->getBasicBlock(), MST); + OS << ')'; + MachineOperand::printOperandOffset(OS, getOffset()); break; + } case MachineOperand::MO_RegisterMask: { OS << ""; + case MachineOperand::MO_CFIIndex: { + if (const MachineFunction *MF = getMFIfAvailable(*this)) + printCFI(OS, MF->getFrameInstructions()[getCFIIndex()], TRI); + else + OS << "<cfi directive>"; break; + } case MachineOperand::MO_IntrinsicID: { Intrinsic::ID ID = getIntrinsicID(); if (ID < Intrinsic::num_intrinsics) - OS << "<intrinsic:@" << Intrinsic::getName(ID, None) << '>'; + OS << "intrinsic(@" << Intrinsic::getName(ID, None) << ')'; else if (IntrinsicInfo) - OS << "<intrinsic:@" << IntrinsicInfo->getName(ID) << '>'; + OS << "intrinsic(@" << IntrinsicInfo->getName(ID) << ')'; else - OS << "<intrinsic:" << ID << '>'; + OS << "intrinsic(" << ID << ')'; break; } case MachineOperand::MO_Predicate: { auto Pred = static_cast<CmpInst::Predicate>(getPredicate()); - OS << '<' << (CmpInst::isIntPredicate(Pred) ? "intpred" : "floatpred") - << CmpInst::getPredicateName(Pred) << '>'; + OS << (CmpInst::isIntPredicate(Pred) ? "int" : "float") << "pred(" + << CmpInst::getPredicateName(Pred) << ')'; break; } } diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index c9fe7681e280..e0cc2ca9a2a2 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -37,7 +37,7 @@ #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" diff --git a/lib/CodeGen/README.txt b/lib/CodeGen/README.txt index 2fcbd1280da4..3318e109155b 100644 --- a/lib/CodeGen/README.txt +++ b/lib/CodeGen/README.txt @@ -164,7 +164,7 @@ synthesize the various copy insertion/inspection methods in TargetInstrInfo. Stack coloring improvements: -1. Do proper LiveStackAnalysis on all stack objects including those which are +1. Do proper LiveStacks analysis on all stack objects including those which are not spill slots. 2. Reorder objects to fill in gaps between objects. e.g.
4, 1, <gap>, 4, 1, 1, 1, <gap>, 4 => 4, 1, 1, 1, 1, 4, 4 diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp index 6e273277804b..1125d2c62bef 100644 --- a/lib/CodeGen/RegAllocBasic.cpp +++ b/lib/CodeGen/RegAllocBasic.cpp @@ -21,7 +21,7 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/LiveRegMatrix.h" -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index 186ef577e31d..e492c481a540 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -39,7 +39,7 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeEdit.h" #include "llvm/CodeGen/LiveRegMatrix.h" -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp index 351e91c932eb..69a879701fae 100644 --- a/lib/CodeGen/RegAllocPBQP.cpp +++ b/lib/CodeGen/RegAllocPBQP.cpp @@ -45,7 +45,7 @@ #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeEdit.h" -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f97732c1c49d..17f907eb07e8 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -3988,10 +3988,12 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1)) return RAND; // fold (and (or x, C), D) -> D if (C & D) == D - if (N1C && N0.getOpcode() == ISD::OR) - if (ConstantSDNode *ORI = isConstOrConstSplat(N0.getOperand(1))) - if (N1C->getAPIntValue().isSubsetOf(ORI->getAPIntValue())) - return N1; + auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) { + return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue()); + }; + if (N0.getOpcode() == ISD::OR && + matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset)) + return N1; // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits. if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) { SDValue N0Op0 = N0.getOperand(0); @@ -4675,16 +4677,16 @@ SDValue DAGCombiner::visitOR(SDNode *N) { // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2) // iff (c1 & c2) != 0.
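The canonicalization named in the comment above rests on a distributive identity, c2 | (x & c1) == (x | c2) & (c1 | c2), which in fact holds for all values; the combiner applies it when (c1 & c2) != 0 so the constants can be merged. A tiny self-checking sketch on plain 64-bit masks (illustration only, not DAG code):

#include <cassert>
#include <cstdint>

bool orAndIdentityHolds(uint64_t X, uint64_t C1, uint64_t C2) {
  uint64_t Before = (X & C1) | C2;
  uint64_t After = (X | C2) & (C1 | C2); // canonical form
  return Before == After;
}

int main() {
  for (uint64_t X : {0ull, 1ull, 0xdeadbeefull, ~0ull})
    assert(orAndIdentityHolds(X, 0xff00, 0x0ff0)); // c1 & c2 == 0x0f00 != 0
  return 0;
}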
- if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse()) { - if (ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { - if (C1->getAPIntValue().intersects(N1C->getAPIntValue())) { - if (SDValue COR = - DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT, N1C, C1)) - return DAG.getNode( - ISD::AND, SDLoc(N), VT, - DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR); - return SDValue(); - } + auto MatchIntersect = [](ConstantSDNode *LHS, ConstantSDNode *RHS) { + return LHS->getAPIntValue().intersects(RHS->getAPIntValue()); + }; + if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && + matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect)) { + if (SDValue COR = DAG.FoldConstantArithmetic( + ISD::OR, SDLoc(N1), VT, N1.getNode(), N0.getOperand(1).getNode())) { + SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1); + AddToWorklist(IOR.getNode()); + return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR); } } @@ -5380,21 +5382,6 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { AddToWorklist(NotX.getNode()); return DAG.getNode(ISD::AND, SDLoc(N), VT, NotX, N1); } - // fold (xor (xor x, c1), c2) -> (xor x, (xor c1, c2)) - if (N1C && N0.getOpcode() == ISD::XOR) { - if (const ConstantSDNode *N00C = getAsNonOpaqueConstant(N0.getOperand(0))) { - SDLoc DL(N); - return DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), - DAG.getConstant(N1C->getAPIntValue() ^ - N00C->getAPIntValue(), DL, VT)); - } - if (const ConstantSDNode *N01C = getAsNonOpaqueConstant(N0.getOperand(1))) { - SDLoc DL(N); - return DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0), - DAG.getConstant(N1C->getAPIntValue() ^ - N01C->getAPIntValue(), DL, VT)); - } - } // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X) unsigned OpSizeInBits = VT.getScalarSizeInBits(); @@ -10201,7 +10188,7 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { case ISD::SETLT: case ISD::SETLE: std::swap(TrueOpnd, FalseOpnd); - // Fall through + LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETUGT: case ISD::SETOGE: @@ -10555,7 +10542,7 @@ static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) { // value in one SSE register, but instruction selection cannot handle // FCOPYSIGN on SSE registers yet. EVT N1VT = N1->getValueType(0); - EVT N1Op0VT = N1->getOperand(0)->getValueType(0); + EVT N1Op0VT = N1->getOperand(0).getValueType(); return (N1VT == N1Op0VT || N1Op0VT != MVT::f128); } return false; @@ -13784,30 +13771,30 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { } } - if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) { - if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() && - !ST1->isVolatile() && ST1->getBasePtr() == Ptr && - ST->getMemoryVT() == ST1->getMemoryVT()) { - // If this is a store followed by a store with the same value to the same - // location, then the store is dead/noop. - if (ST1->getValue() == Value) { - // The store is dead, remove it. - return Chain; - } + // Deal with elidable overlapping chained stores.
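The elision logic that follows reduces to byte-interval containment: the earlier chained store can be dropped when the later store covers its entire range (or stores the same value to the same address). A standalone sketch of the containment check, with hypothetical field names mirroring the hunk's PtrDiff/STBytes/ST1Bytes:

#include <cstdint>

struct StoreDesc {
  int64_t Offset;  // byte offset from a common base
  uint64_t Bytes;  // store size in bytes
};

// True when Earlier's range [Offset, Offset + Bytes) lies entirely inside
// Later's range, i.e. the earlier store is fully overwritten.
bool fullyOverwrites(const StoreDesc &Later, const StoreDesc &Earlier) {
  int64_t PtrDiff = Earlier.Offset - Later.Offset;
  return PtrDiff >= 0 && (uint64_t)PtrDiff + Earlier.Bytes <= Later.Bytes;
}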
+ if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) + if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && + ST1->isUnindexed() && !ST1->isVolatile() && ST1->hasOneUse() && + !ST1->getBasePtr().isUndef() && !ST->isVolatile()) { + BaseIndexOffset STBasePtr = BaseIndexOffset::match(ST->getBasePtr(), DAG); + BaseIndexOffset ST1BasePtr = + BaseIndexOffset::match(ST1->getBasePtr(), DAG); + unsigned STBytes = ST->getMemoryVT().getStoreSize(); + unsigned ST1Bytes = ST1->getMemoryVT().getStoreSize(); + int64_t PtrDiff; + // If this is a store who's preceeding store to a subset of the same + // memory and no one other node is chained to that store we can + // effectively drop the store. Do not remove stores to undef as they may + // be used as data sinks. - // If this is a store who's preceeding store to the same location - // and no one other node is chained to that store we can effectively - // drop the store. Do not remove stores to undef as they may be used as - // data sinks. - if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() && - !ST1->getBasePtr().isUndef()) { - // ST1 is fully overwritten and can be elided. Combine with it's chain - // value. + if (((ST->getBasePtr() == ST1->getBasePtr()) && + (ST->getValue() == ST1->getValue())) || + (STBasePtr.equalBaseIndex(ST1BasePtr, DAG, PtrDiff) && + (0 <= PtrDiff) && (PtrDiff + ST1Bytes <= STBytes))) { CombineTo(ST1, ST1->getChain()); - return SDValue(); + return SDValue(N, 0); } } - } // If this is an FP_ROUND or TRUNC followed by a store, fold this into a // truncating store. We can do this even if this is already a truncstore. @@ -15110,7 +15097,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { // Transform: concat_vectors(scalar, undef) -> scalar_to_vector(sclr). if (In->getOpcode() == ISD::BITCAST && - !In->getOperand(0)->getValueType(0).isVector()) { + !In->getOperand(0).getValueType().isVector()) { SDValue Scalar = In->getOperand(0); // If the bitcast type isn't legal, it might be a trunc of a legal type; @@ -15157,7 +15144,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { bool FoundMinVT = false; for (const SDValue &Op : N->ops()) if (ISD::BUILD_VECTOR == Op.getOpcode()) { - EVT OpSVT = Op.getOperand(0)->getValueType(0); + EVT OpSVT = Op.getOperand(0).getValueType(); MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT; FoundMinVT = true; } @@ -17418,43 +17405,6 @@ SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) { return buildSqrtEstimateImpl(Op, Flags, false); } -/// Return true if base is a frame index, which is known not to alias with -/// anything but itself. Provides base object and offset as results. -static bool findBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset, - const GlobalValue *&GV, const void *&CV) { - // Assume it is a primitive operation. - Base = Ptr; Offset = 0; GV = nullptr; CV = nullptr; - - // If it's an adding a simple constant then integrate the offset. - if (Base.getOpcode() == ISD::ADD) { - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Base.getOperand(1))) { - Base = Base.getOperand(0); - Offset += C->getSExtValue(); - } - } - - // Return the underlying GlobalValue, and update the Offset. Return false - // for GlobalAddressSDNode since the same GlobalAddress may be represented - // by multiple nodes with different offsets. - if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Base)) { - GV = G->getGlobal(); - Offset += G->getOffset(); - return false; - } - - // Return the underlying Constant value, and update the Offset.
Return false - // for ConstantSDNodes since the same constant pool entry may be represented - // by multiple nodes with different offsets. - if (ConstantPoolSDNode *C = dyn_cast<ConstantPoolSDNode>(Base)) { - CV = C->isMachineConstantPoolEntry() ? (const void *)C->getMachineCPVal() - : (const void *)C->getConstVal(); - Offset += C->getOffset(); - return false; - } - // If it's any of the following then it can't alias with anything but itself. - return isa<FrameIndexSDNode>(Base); -} - /// Return true if there is any possibility that the two addresses overlap. bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { // If they are the same then they must be aliases. @@ -17496,39 +17446,18 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { return false; } - // FIXME: findBaseOffset and ConstantValue/GlobalValue/FrameIndex analysis - // modified to use BaseIndexOffset. + bool IsFI0 = isa<FrameIndexSDNode>(BasePtr0.getBase()); + bool IsFI1 = isa<FrameIndexSDNode>(BasePtr1.getBase()); + bool IsGV0 = isa<GlobalAddressSDNode>(BasePtr0.getBase()); + bool IsGV1 = isa<GlobalAddressSDNode>(BasePtr1.getBase()); + bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase()); + bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase()); - // Gather base node and offset information. - SDValue Base0, Base1; - int64_t Offset0, Offset1; - const GlobalValue *GV0, *GV1; - const void *CV0, *CV1; - bool IsFrameIndex0 = findBaseOffset(Op0->getBasePtr(), - Base0, Offset0, GV0, CV0); - bool IsFrameIndex1 = findBaseOffset(Op1->getBasePtr(), - Base1, Offset1, GV1, CV1); - - // If they have the same base address, then check to see if they overlap. - if (Base0 == Base1 || (GV0 && (GV0 == GV1)) || (CV0 && (CV0 == CV1))) - return !((Offset0 + NumBytes0) <= Offset1 || - (Offset1 + NumBytes1) <= Offset0); - - // It is possible for different frame indices to alias each other, mostly - // when tail call optimization reuses return address slots for arguments. - // To catch this case, look up the actual index of frame indices to compute - // the real alias relationship. - if (IsFrameIndex0 && IsFrameIndex1) { - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - Offset0 += MFI.getObjectOffset(cast<FrameIndexSDNode>(Base0)->getIndex()); - Offset1 += MFI.getObjectOffset(cast<FrameIndexSDNode>(Base1)->getIndex()); - return !((Offset0 + NumBytes0) <= Offset1 || - (Offset1 + NumBytes1) <= Offset0); - } - - // Otherwise, if we know what the bases are, and they aren't identical, then - // we know they cannot alias. - if ((IsFrameIndex0 || CV0 || GV0) && (IsFrameIndex1 || CV1 || GV1)) + // If of mismatched base types or checkable indices we can check + // they do not alias.
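The replacement test just below classifies each base as frame index, global, or constant pool, and declares no-alias only when both bases are of a checkable kind and either the kinds differ or the already-unequal bases were comparable. A compact approximation of that predicate (plain C++, simplified from the exact condition in the hunk):

enum class BaseKind { FrameIndex, Global, ConstantPool, Unknown };

// Both bases must be of a checkable kind; distinct kinds can never alias,
// and comparable indices over known-unequal bases cannot alias either.
bool knownNoAlias(BaseKind A, BaseKind B, bool IndicesComparable) {
  bool AKnown = A != BaseKind::Unknown;
  bool BKnown = B != BaseKind::Unknown;
  return AKnown && BKnown && (A != B || IndicesComparable);
}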
+ if ((BasePtr0.getIndex() == BasePtr1.getIndex() || (IsFI0 != IsFI1) || + (IsGV0 != IsGV1) || (IsCV0 != IsCV1)) && + (IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1)) return false; // If we know required SrcValue1 and SrcValue2 have relatively large alignment diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index eaf177d0661b..e28a3aa47ca3 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -1887,7 +1887,7 @@ SDValue DAGTypeLegalizer::PromoteFloatOp_STORE(SDNode *N, unsigned OpNo) { SDLoc DL(N); SDValue Promoted = GetPromotedFloat(Val); - EVT VT = ST->getOperand(1)->getValueType(0); + EVT VT = ST->getOperand(1).getValueType(); EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); SDValue NewVal; diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index b60d7bca498a..4438ee7878b8 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -224,7 +224,7 @@ bool DAGTypeLegalizer::run() { assert(N->getNodeId() == ReadyToProcess && "Node should be ready if on worklist!"); - DEBUG(dbgs() << "Legalizing node: "; N->dump()); + DEBUG(dbgs() << "Legalizing node: "; N->dump(&DAG)); if (IgnoreNodeResults(N)) { DEBUG(dbgs() << "Ignoring node results\n"); goto ScanOperands; @@ -296,7 +296,7 @@ bool DAGTypeLegalizer::run() { continue; const auto Op = N->getOperand(i); - DEBUG(dbgs() << "Analyzing operand: "; Op.dump()); + DEBUG(dbgs() << "Analyzing operand: "; Op.dump(&DAG)); EVT OpVT = Op.getValueType(); switch (getTypeAction(OpVT)) { case TargetLowering::TypeLegal: @@ -445,7 +445,7 @@ bool DAGTypeLegalizer::run() { if (!isTypeLegal(Node.getValueType(i)) && !TLI.isTypeLegal(Node.getValueType(i))) { dbgs() << "Result type " << i << " illegal: "; - Node.dump(); + Node.dump(&DAG); Failed = true; } @@ -455,7 +455,7 @@ bool DAGTypeLegalizer::run() { !isTypeLegal(Node.getOperand(i).getValueType()) && !TLI.isTypeLegal(Node.getOperand(i).getValueType())) { dbgs() << "Operand type " << i << " illegal: "; - Node.getOperand(i).dump(); + Node.getOperand(i).dump(&DAG); Failed = true; } diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 8f2320f52a0f..ce1c01b621f0 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -331,7 +331,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) { // At least try the common case where the boolean is generated by a // comparison. 
if (Cond->getOpcode() == ISD::SETCC) { - EVT OpVT = Cond->getOperand(0)->getValueType(0); + EVT OpVT = Cond->getOperand(0).getValueType(); ScalarBool = TLI.getBooleanContents(OpVT.getScalarType()); VecBool = TLI.getBooleanContents(OpVT); } else @@ -1548,14 +1548,14 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { break; case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: - if (N->getValueType(0).bitsLT(N->getOperand(0)->getValueType(0))) + if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType())) Res = SplitVecOp_TruncateHelper(N); else Res = SplitVecOp_UnaryOp(N); break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: - if (N->getValueType(0).bitsLT(N->getOperand(0)->getValueType(0))) + if (N->getValueType(0).bitsLT(N->getOperand(0).getValueType())) Res = SplitVecOp_TruncateHelper(N); else Res = SplitVecOp_UnaryOp(N); diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index 379f0dcef513..7f369c746d24 100644 --- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -252,6 +252,7 @@ bool ResourcePriorityQueue::isResourceAvailable(SUnit *SU) { if (!ResourcesModel->canReserveResources(&TII->get( SU->getNode()->getMachineOpcode()))) return false; + break; case TargetOpcode::EXTRACT_SUBREG: case TargetOpcode::INSERT_SUBREG: case TargetOpcode::SUBREG_TO_REG: diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 12a21e74079e..a04c770c51c4 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3750,6 +3750,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::TRUNCATE: + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: case ISD::ABS: diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index 544da362be69..d5980919d03c 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -37,6 +37,23 @@ bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other, return true; } + // Match Constants + if (auto *A = dyn_cast<ConstantPoolSDNode>(Base)) + if (auto *B = dyn_cast<ConstantPoolSDNode>(Other.Base)) { + bool IsMatch = + A->isMachineConstantPoolEntry() == B->isMachineConstantPoolEntry(); + if (IsMatch) { + if (A->isMachineConstantPoolEntry()) + IsMatch = A->getMachineCPVal() == B->getMachineCPVal(); + else + IsMatch = A->getConstVal() == B->getConstVal(); + } + if (IsMatch) { + Off += B->getOffset() - A->getOffset(); + return true; + } + } + const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); // Match non-equal FrameIndexes - If both frame indices are fixed diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 18f6997ef83c..d13ccc263718 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -3117,7 +3117,16 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, continue; } case OPC_RecordMemRef: - MatchedMemRefs.push_back(cast<MemSDNode>(N)->getMemOperand()); + if (auto *MN = dyn_cast<MemSDNode>(N)) + MatchedMemRefs.push_back(MN->getMemOperand()); + else { + DEBUG( + dbgs() << "Expected MemSDNode "; + N->dump(CurDAG); + dbgs() << '\n' + ); + } + continue; case OPC_CaptureGlueInput: @@
-3563,7 +3572,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, Ops.push_back(InputGlue); // Create the node. - SDNode *Res = nullptr; + MachineSDNode *Res = nullptr; bool IsMorphNodeTo = Opcode == OPC_MorphNodeTo || (Opcode >= OPC_MorphNodeTo0 && Opcode <= OPC_MorphNodeTo2); if (!IsMorphNodeTo) { @@ -3589,7 +3598,8 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, "Chain node replaced during MorphNode"); Chain.erase(std::remove(Chain.begin(), Chain.end(), N), Chain.end()); }); - Res = MorphNode(NodeToMatch, TargetOpc, VTList, Ops, EmitNodeInfo); + Res = cast<MachineSDNode>(MorphNode(NodeToMatch, TargetOpc, VTList, + Ops, EmitNodeInfo)); } // If the node had chain/glue results, update our notion of the current @@ -3645,13 +3655,19 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, } } - cast<MachineSDNode>(Res) - ->setMemRefs(MemRefs, MemRefs + NumMemRefs); + Res->setMemRefs(MemRefs, MemRefs + NumMemRefs); } - DEBUG(dbgs() << " " - << (IsMorphNodeTo ? "Morphed" : "Created") - << " node: "; Res->dump(CurDAG); dbgs() << "\n"); + DEBUG( + if (!MatchedMemRefs.empty() && Res->memoperands_empty()) + dbgs() << " Dropping mem operands\n"; + dbgs() << " " + << (IsMorphNodeTo ? "Morphed" : "Created") + << " node: "; + Res->dump(CurDAG); + + dbgs() << '\n'; + ); // If this was a MorphNodeTo then we're completely done! if (IsMorphNodeTo) { diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 58276052c10b..d76e52d78870 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3812,7 +3812,7 @@ SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG, Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index, DAG.getConstant(EltSize, dl, IdxVT)); - return DAG.getNode(ISD::ADD, dl, IdxVT, Index, VecPtr); + return DAG.getNode(ISD::ADD, dl, IdxVT, VecPtr, Index); } //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp index 62f662d1ade4..8fc7a4a32842 100644 --- a/lib/CodeGen/StackSlotColoring.cpp +++ b/lib/CodeGen/StackSlotColoring.cpp @@ -16,7 +16,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp index 543c12eebb45..224ae1a3236a 100644 --- a/lib/CodeGen/TargetLoweringBase.cpp +++ b/lib/CodeGen/TargetLoweringBase.cpp @@ -89,6 +89,21 @@ static cl::opt<unsigned> OptsizeJumpTableDensity( cl::desc("Minimum density for building a jump table in " "an optsize function")); +static bool darwinHasSinCos(const Triple &TT) { + assert(TT.isOSDarwin() && "should be called with darwin triple"); + // Don't bother with 32 bit x86. + if (TT.getArch() == Triple::x86) + return false; + // Macos < 10.9 has no sincos_stret. + if (TT.isMacOSX()) + return !TT.isMacOSXVersionLT(10, 9) && TT.isArch64Bit(); + // iOS < 7.0 has no sincos_stret. + if (TT.isiOS()) + return !TT.isOSVersionLT(7, 0); + // Any other darwin such as WatchOS/TvOS is new enough. + return true; +} + // Although this default value is arbitrary, it is not random.
It is assumed // that a condition that evaluates the same way by a higher percentage than this // is best represented as control flow. Therefore, the default value N should be @@ -100,44 +115,56 @@ static cl::opt<int> MinPercentageForPredictableBranch( "or false to assume that the condition is predictable"), cl::Hidden); -/// InitLibcallNames - Set default libcall names. -static void InitLibcallNames(const char **Names, const Triple &TT) { +void TargetLoweringBase::InitLibcalls(const Triple &TT) { #define HANDLE_LIBCALL(code, name) \ - Names[RTLIB::code] = name; + setLibcallName(RTLIB::code, name); #include "llvm/CodeGen/RuntimeLibcalls.def" #undef HANDLE_LIBCALL + // Initialize calling conventions to their default. + for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC) + setLibcallCallingConv((RTLIB::Libcall)LC, CallingConv::C); // A few names are different on particular architectures or environments. if (TT.isOSDarwin()) { // For f16/f32 conversions, Darwin uses the standard naming scheme, instead // of the gnueabi-style __gnu_*_ieee. // FIXME: What about other targets? - Names[RTLIB::FPEXT_F16_F32] = "__extendhfsf2"; - Names[RTLIB::FPROUND_F32_F16] = "__truncsfhf2"; + setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); + setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); + + // Darwin 10 and higher has an optimized __bzero. + if (!TT.isMacOSX() || !TT.isMacOSXVersionLT(10, 6) || TT.isArch64Bit()) { + setLibcallName(RTLIB::BZERO, TT.isAArch64() ? "bzero" : "__bzero"); + } + + if (darwinHasSinCos(TT)) { + setLibcallName(RTLIB::SINCOS_STRET_F32, "__sincosf_stret"); + setLibcallName(RTLIB::SINCOS_STRET_F64, "__sincos_stret"); + if (TT.isWatchABI()) { + setLibcallCallingConv(RTLIB::SINCOS_STRET_F32, + CallingConv::ARM_AAPCS_VFP); + setLibcallCallingConv(RTLIB::SINCOS_STRET_F64, + CallingConv::ARM_AAPCS_VFP); + } + } } else { - Names[RTLIB::FPEXT_F16_F32] = "__gnu_h2f_ieee"; - Names[RTLIB::FPROUND_F32_F16] = "__gnu_f2h_ieee"; + setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee"); + setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee"); } if (TT.isGNUEnvironment() || TT.isOSFuchsia()) { - Names[RTLIB::SINCOS_F32] = "sincosf"; - Names[RTLIB::SINCOS_F64] = "sincos"; - Names[RTLIB::SINCOS_F80] = "sincosl"; - Names[RTLIB::SINCOS_F128] = "sincosl"; - Names[RTLIB::SINCOS_PPCF128] = "sincosl"; + setLibcallName(RTLIB::SINCOS_F32, "sincosf"); + setLibcallName(RTLIB::SINCOS_F64, "sincos"); + setLibcallName(RTLIB::SINCOS_F80, "sincosl"); + setLibcallName(RTLIB::SINCOS_F128, "sincosl"); + setLibcallName(RTLIB::SINCOS_PPCF128, "sincosl"); } if (TT.isOSOpenBSD()) { - Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = nullptr; + setLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL, nullptr); } } -/// Set default libcall CallingConvs. -static void InitLibcallCallingConvs(CallingConv::ID *CCs) { - for (int LC = 0; LC < RTLIB::UNKNOWN_LIBCALL; ++LC) - CCs[LC] = CallingConv::C; -} - /// getFPEXT - Return the FPEXT_*_* value for the given types, or /// UNKNOWN_LIBCALL if there is none.
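The libcall rework earlier in this hunk routes every name and calling-convention override through setLibcallName/setLibcallCallingConv, defaults first and per-environment overrides after. A minimal sketch of that table-driven pattern outside LLVM, with a hypothetical enum and names standing in for the RTLIB table:

enum Libcall { SINCOS_F32, SINCOS_F64, NUM_LIBCALLS };
enum CallConv { CC_C, CC_Special };

struct LibcallTable {
  const char *Names[NUM_LIBCALLS] = {};
  CallConv Convs[NUM_LIBCALLS] = {};

  void setLibcallName(Libcall LC, const char *Name) { Names[LC] = Name; }
  void setLibcallCallingConv(Libcall LC, CallConv CC) { Convs[LC] = CC; }

  void init(bool IsGNUEnvironment) {
    // Defaults first, overrides second, mirroring InitLibcalls above.
    for (int LC = 0; LC < NUM_LIBCALLS; ++LC)
      setLibcallCallingConv((Libcall)LC, CC_C);
    if (IsGNUEnvironment) {
      setLibcallName(SINCOS_F32, "sincosf");
      setLibcallName(SINCOS_F64, "sincos");
    }
  }
};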
RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) { @@ -524,9 +551,8 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) { std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames), nullptr); - InitLibcallNames(LibcallRoutineNames, TM.getTargetTriple()); + InitLibcalls(TM.getTargetTriple()); InitCmpLibcallCCs(CmpLibcallCCs); - InitLibcallCallingConvs(LibcallCallingConvs); } void TargetLoweringBase::initActions() { diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp index 64bb37a280a6..13f7e83f3dd0 100644 --- a/lib/CodeGen/VirtRegMap.cpp +++ b/lib/CodeGen/VirtRegMap.cpp @@ -22,7 +22,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/LiveStackAnalysis.h" +#include "llvm/CodeGen/LiveStacks.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" diff --git a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp index 17f29737bf93..6a6b7fc6fc20 100644 --- a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp +++ b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp @@ -83,6 +83,7 @@ bool DWARFAcceleratorTable::validateForms() { !FormValue.isFormClass(DWARFFormValue::FC_Flag)) || FormValue.getForm() == dwarf::DW_FORM_sdata) return false; + break; default: break; } diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index a5defa90eb35..eb23ca8229a3 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -88,70 +88,101 @@ static void dumpUUID(raw_ostream &OS, const ObjectFile &Obj) { } } -static void -dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName, - const DWARFObject &Obj, - const DWARFSection &StringOffsetsSection, - StringRef StringSection, bool LittleEndian) { +using ContributionCollection = + std::vector<Optional<StrOffsetsContributionDescriptor>>; + +// Collect all the contributions to the string offsets table from all units, +// sort them by their starting offsets and remove duplicates. +static ContributionCollection +collectContributionData(DWARFContext::cu_iterator_range CUs, + DWARFContext::tu_section_iterator_range TUSs) { + ContributionCollection Contributions; + for (const auto &CU : CUs) + Contributions.push_back(CU->getStringOffsetsTableContribution()); + for (const auto &TUS : TUSs) + for (const auto &TU : TUS) + Contributions.push_back(TU->getStringOffsetsTableContribution()); + + // Sort the contributions so that any invalid ones are placed at + // the start of the contributions vector. This way they are reported + // first. + std::sort(Contributions.begin(), Contributions.end(), + [](const Optional<StrOffsetsContributionDescriptor> &L, + const Optional<StrOffsetsContributionDescriptor> &R) { + if (L && R) return L->Base < R->Base; + return R.hasValue(); + }); + + // Uniquify contributions, as it is possible that units (specifically + // type units in dwo or dwp files) share contributions. We don't want + // to report them more than once.
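The sort-then-unique sequence completed just below is the standard dedup idiom; the comparator deliberately sorts empty (invalid) contributions to the front so they are diagnosed first. A self-contained sketch of the same ordering and dedup over optionals (std::optional here stands in for llvm::Optional):

#include <algorithm>
#include <cstdint>
#include <optional>
#include <vector>

struct Contribution { uint64_t Base; uint64_t Size; };

void sortAndDedup(std::vector<std::optional<Contribution>> &Cs) {
  // Invalid entries first, then ascending base offset.
  std::sort(Cs.begin(), Cs.end(),
            [](const std::optional<Contribution> &L,
               const std::optional<Contribution> &R) {
              if (L && R)
                return L->Base < R->Base;
              return R.has_value(); // empty sorts before engaged
            });
  // Drop exact duplicates (same base and size), keeping the first.
  Cs.erase(std::unique(Cs.begin(), Cs.end(),
                       [](const std::optional<Contribution> &L,
                          const std::optional<Contribution> &R) {
                         return L && R && L->Base == R->Base &&
                                L->Size == R->Size;
                       }),
           Cs.end());
}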
+ Contributions.erase( + std::unique(Contributions.begin(), Contributions.end(), + [](const Optional<StrOffsetsContributionDescriptor> &L, + const Optional<StrOffsetsContributionDescriptor> &R) { + if (L && R) + return L->Base == R->Base && L->Size == R->Size; + return false; + }), + Contributions.end()); + return Contributions; +} + +static void dumpDWARFv5StringOffsetsSection( + raw_ostream &OS, StringRef SectionName, const DWARFObject &Obj, + const DWARFSection &StringOffsetsSection, StringRef StringSection, + DWARFContext::cu_iterator_range CUs, + DWARFContext::tu_section_iterator_range TUSs, bool LittleEndian) { + auto Contributions = collectContributionData(CUs, TUSs); DWARFDataExtractor StrOffsetExt(Obj, StringOffsetsSection, LittleEndian, 0); - uint32_t Offset = 0; + DataExtractor StrData(StringSection, LittleEndian, 0); uint64_t SectionSize = StringOffsetsSection.Data.size(); - - while (Offset < SectionSize) { - unsigned Version = 0; - DwarfFormat Format = DWARF32; - unsigned EntrySize = 4; - // Perform validation and extract the segment size from the header. - if (!StrOffsetExt.isValidOffsetForDataOfSize(Offset, 4)) { - OS << "error: invalid contribution to string offsets table in section ." - << SectionName << ".\n"; - return; - } - uint32_t ContributionStart = Offset; - uint64_t ContributionSize = StrOffsetExt.getU32(&Offset); - // A contribution size of 0xffffffff indicates DWARF64, with the actual size - // in the following 8 bytes. Otherwise, the DWARF standard mandates that - // the contribution size must be at most 0xfffffff0. - if (ContributionSize == 0xffffffff) { - if (!StrOffsetExt.isValidOffsetForDataOfSize(Offset, 8)) { - OS << "error: invalid contribution to string offsets table in section ." - << SectionName << ".\n"; - return; - } - Format = DWARF64; - EntrySize = 8; - ContributionSize = StrOffsetExt.getU64(&Offset); - } else if (ContributionSize > 0xfffffff0) { + uint32_t Offset = 0; + for (auto &Contribution : Contributions) { + // Report an ill-formed contribution. + if (!Contribution) { OS << "error: invalid contribution to string offsets table in section ." << SectionName << ".\n"; return; } - // We must ensure that we don't read a partial record at the end, so we - // validate for a multiple of EntrySize. Also, we're expecting a version - // number and padding, which adds an additional 4 bytes. - uint64_t ValidationSize = - 4 + ((ContributionSize + EntrySize - 1) & (-(uint64_t)EntrySize)); - if (!StrOffsetExt.isValidOffsetForDataOfSize(Offset, ValidationSize)) { - OS << "error: contribution to string offsets table in section ." - << SectionName << " has invalid length.\n"; + dwarf::DwarfFormat Format = Contribution->getFormat(); + uint16_t Version = Contribution->getVersion(); + uint64_t ContributionHeader = Contribution->Base; + // In DWARF v5 there is a contribution header that immediately precedes + // the string offsets base (the location we have previously retrieved from + // the CU DIE's DW_AT_str_offsets attribute). The header is located either + // 8 or 16 bytes before the base, depending on the contribution's format. + if (Version >= 5) + ContributionHeader -= Format == DWARF32 ? 8 : 16; + + // Detect overlapping contributions. + if (Offset > ContributionHeader) { + OS << "error: overlapping contributions to string offsets table in " + "section ." + << SectionName << ".\n"; return; } - - Version = StrOffsetExt.getU16(&Offset); - Offset += 2; - OS << format("0x%8.8x: ", ContributionStart); - OS << "Contribution size = " << ContributionSize + // Report a gap in the table.
+ if (Offset < ContributionHeader) { + OS << format("0x%8.8x: Gap, length = ", Offset); + OS << (ContributionHeader - Offset) << "\n"; + } + OS << format("0x%8.8x: ", (uint32_t)ContributionHeader); + OS << "Contribution size = " << Contribution->Size + << ", Format = " << (Format == DWARF32 ? "DWARF32" : "DWARF64") + << ", Version = " << Version << "\n"; - uint32_t ContributionBase = Offset; - DataExtractor StrData(StringSection, LittleEndian, 0); - while (Offset - ContributionBase < ContributionSize) { + Offset = Contribution->Base; + unsigned EntrySize = Contribution->getDwarfOffsetByteSize(); + while (Offset - Contribution->Base < Contribution->Size) { OS << format("0x%8.8x: ", Offset); - // FIXME: We can only extract strings in DWARF32 format at the moment. + // FIXME: We can only extract strings if the offset fits in 32 bits. uint64_t StringOffset = StrOffsetExt.getRelocatedValue(EntrySize, &Offset); - if (Format == DWARF32) { + // Extract the string if we can and display it. Otherwise just report + // the offset. + if (StringOffset <= std::numeric_limits<uint32_t>::max()) { uint32_t StringOffset32 = (uint32_t)StringOffset; OS << format("%8.8x ", StringOffset32); const char *S = StrData.getCStr(&StringOffset32); @@ -162,6 +193,11 @@ dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName, OS << "\n"; } } + // Report a gap at the end of the table. + if (Offset < SectionSize) { + OS << format("0x%8.8x: Gap, length = ", Offset); + OS << (SectionSize - Offset) << "\n"; + } } // Dump a DWARF string offsets section. This may be a DWARF v5 formatted @@ -170,17 +206,18 @@ dumpDWARFv5StringOffsetsSection(raw_ostream &OS, StringRef SectionName, // a header containing size and version number. Alternatively, it may be a // monolithic series of string offsets, as generated by the pre-DWARF v5 // implementation of split DWARF. -static void dumpStringOffsetsSection(raw_ostream &OS, StringRef SectionName, - const DWARFObject &Obj, - const DWARFSection &StringOffsetsSection, - StringRef StringSection, bool LittleEndian, - unsigned MaxVersion) { +static void dumpStringOffsetsSection( + raw_ostream &OS, StringRef SectionName, const DWARFObject &Obj, + const DWARFSection &StringOffsetsSection, StringRef StringSection, + DWARFContext::cu_iterator_range CUs, + DWARFContext::tu_section_iterator_range TUSs, bool LittleEndian, + unsigned MaxVersion) { // If we have at least one (compile or type) unit with DWARF v5 or greater, // we assume that the section is formatted like a DWARF v5 string offsets // section.
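// A small sketch of the layout the dumper assumes (illustrative, not LLVM
// API): in DWARF v5 the contribution header -- unit length, version, and
// padding -- immediately precedes the offsets array that
// DW_AT_str_offsets_base points at, so the header starts 8 bytes before the
// base in DWARF32 and 16 bytes before it in DWARF64. Pre-v5 split DWARF has
// no header at all.
#include <cstdint>

enum class Format { DWARF32, DWARF64 };

uint64_t contributionHeaderStart(uint64_t Base, uint16_t Version, Format F) {
  if (Version < 5)
    return Base; // No header: the contribution starts at the base itself.
  return Base - (F == Format::DWARF32 ? 8 : 16);
}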
if (MaxVersion >= 5) dumpDWARFv5StringOffsetsSection(OS, SectionName, Obj, StringOffsetsSection, - StringSection, LittleEndian); + StringSection, CUs, TUSs, LittleEndian); else { DataExtractor strOffsetExt(StringOffsetsSection.Data, LittleEndian, 0); uint32_t offset = 0; @@ -468,12 +505,14 @@ void DWARFContext::dump( DObj->getStringOffsetSection().Data)) dumpStringOffsetsSection( OS, "debug_str_offsets", *DObj, DObj->getStringOffsetSection(), - DObj->getStringSection(), isLittleEndian(), getMaxVersion()); + DObj->getStringSection(), compile_units(), type_unit_sections(), + isLittleEndian(), getMaxVersion()); if (shouldDump(ExplicitDWO, ".debug_str_offsets.dwo", DIDT_ID_DebugStrOffsets, DObj->getStringOffsetDWOSection().Data)) dumpStringOffsetsSection( OS, "debug_str_offsets.dwo", *DObj, DObj->getStringOffsetDWOSection(), - DObj->getStringDWOSection(), isLittleEndian(), getMaxVersion()); + DObj->getStringDWOSection(), dwo_compile_units(), + dwo_type_unit_sections(), isLittleEndian(), getMaxVersion()); if (shouldDump(Explicit, ".gdb_index", DIDT_ID_GdbIndex, DObj->getGdbIndexSection())) { diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp index c3d8ff2cbc29..df55d7debf92 100644 --- a/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/DWARF/DWARFUnit.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h" @@ -79,8 +80,10 @@ bool DWARFUnit::getAddrOffsetSectionItem(uint32_t Index, bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index, uint64_t &Result) const { - unsigned ItemSize = getDwarfOffsetByteSize(); - uint32_t Offset = StringOffsetSectionBase + Index * ItemSize; + if (!StringOffsetsTableContribution) + return false; + unsigned ItemSize = getDwarfStringOffsetsByteSize(); + uint32_t Offset = getStringOffsetsBase() + Index * ItemSize; if (StringOffsetSection.Data.size() < Offset + ItemSize) return false; DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection, @@ -251,15 +254,28 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) { RangeSectionBase = toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0); } - // In general, we derive the offset of the unit's contibution to the - // debug_str_offsets{.dwo} section from the unit DIE's - // DW_AT_str_offsets_base attribute. In dwp files we add to it the offset - // we get from the index table. - StringOffsetSectionBase = - toSectionOffset(UnitDie.find(DW_AT_str_offsets_base), 0); + // In general, in DWARF v5 and beyond we derive the start of the unit's + // contribution to the string offsets table from the unit DIE's + // DW_AT_str_offsets_base attribute. Split DWARF units do not use this + // attribute, so we assume that there is a contribution to the string + // offsets table starting at offset 0 of the debug_str_offsets.dwo section. + // In both cases we need to determine the format of the contribution, + // which may differ from the unit's format. + uint64_t StringOffsetsContributionBase = + isDWO ?
0 : toSectionOffset(UnitDie.find(DW_AT_str_offsets_base), 0); if (IndexEntry) if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS)) - StringOffsetSectionBase += C->Offset; + StringOffsetsContributionBase += C->Offset; + + DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection, + isLittleEndian, 0); + if (isDWO) + StringOffsetsTableContribution = + determineStringOffsetsTableContributionDWO( + DA, StringOffsetsContributionBase); + else if (getVersion() >= 5) + StringOffsetsTableContribution = determineStringOffsetsTableContribution( + DA, StringOffsetsContributionBase); // Don't fall back to DW_AT_GNU_ranges_base: it should be ignored for // skeleton CU DIE, so that DWARF users not aware of it are not broken. @@ -344,45 +360,378 @@ void DWARFUnit::collectAddressRanges(DWARFAddressRangesVector &CURanges) { clearDIEs(true); } -void DWARFUnit::updateAddressDieMap(DWARFDie Die) { - if (Die.isSubroutineDIE()) { +// Populates a map from PC addresses to subprogram DIEs. +// +// This routine tries to look at the smallest amount of the debug info it can +// to locate the DIEs. This is because many subprograms will never end up being +// read or needed at all. We want to be as lazy as possible. +void DWARFUnit::buildSubprogramDIEAddrMap() { + assert(SubprogramDIEAddrMap.empty() && "Must only build this map once!"); + SmallVector<DWARFDie, 16> Worklist; + Worklist.push_back(getUnitDIE()); + do { + DWARFDie Die = Worklist.pop_back_val(); + + // Queue up child DIEs to recurse through. + // FIXME: This causes us to read a lot more debug info than we really need. + // We should look at pruning out DIEs which cannot transitively hold + // separate subprograms. + for (DWARFDie Child : Die.children()) + Worklist.push_back(Child); + + // If handling a non-subprogram DIE, nothing else to do. + if (!Die.isSubprogramDIE()) + continue; + + // For subprogram DIEs, store them, and insert relevant markers into the + // address map. We don't care about overlap at all here as DWARF doesn't + // meaningfully support that, so we simply will insert a range with no DIE + // starting from the high PC. In the event there are overlaps, sorting + // these may truncate things in surprising ways but still will allow + // lookups to proceed. + int DIEIndex = SubprogramDIEAddrInfos.size(); + SubprogramDIEAddrInfos.push_back({Die, (uint64_t)-1, {}}); for (const auto &R : Die.getAddressRanges()) { // Ignore 0-sized ranges. if (R.LowPC == R.HighPC) continue; - auto B = AddrDieMap.upper_bound(R.LowPC); - if (B != AddrDieMap.begin() && R.LowPC < (--B)->second.first) { - // The range is a sub-range of existing ranges, we need to split the - // existing range. - if (R.HighPC < B->second.first) - AddrDieMap[R.HighPC] = B->second; - if (R.LowPC > B->first) - AddrDieMap[B->first].first = R.LowPC; - } - AddrDieMap[R.LowPC] = std::make_pair(R.HighPC, Die); + + SubprogramDIEAddrMap.push_back({R.LowPC, DIEIndex}); + SubprogramDIEAddrMap.push_back({R.HighPC, -1}); + + if (R.LowPC < SubprogramDIEAddrInfos.back().SubprogramBasePC) + SubprogramDIEAddrInfos.back().SubprogramBasePC = R.LowPC; } + } while (!Worklist.empty()); + + if (SubprogramDIEAddrMap.empty()) { + // If we found no ranges, create a no-op map so that lookups remain simple + // but never find anything. + SubprogramDIEAddrMap.push_back({0, -1}); + return; } - // Parent DIEs are added to the AddrDieMap prior to the Children DIEs to - // simplify the logic to update AddrDieMap. The child's range will always - // be equal or smaller than the parent's range.
// With this assumption, when - // adding one range into the map, it will at most split a range into 3 - // sub-ranges. - for (DWARFDie Child = Die.getFirstChild(); Child; Child = Child.getSibling()) - updateAddressDieMap(Child); + + // Next, sort the ranges and remove both exact duplicates and runs with the + // same DIE index. We order the ranges so that non-empty ranges are + // preferred. Because there may be ties, we also need to use stable sort. + std::stable_sort(SubprogramDIEAddrMap.begin(), SubprogramDIEAddrMap.end(), + [](const std::pair<uint64_t, int64_t> &LHS, + const std::pair<uint64_t, int64_t> &RHS) { + if (LHS.first < RHS.first) + return true; + if (LHS.first > RHS.first) + return false; + + // For ranges that start at the same address, keep the one + // with a DIE. + if (LHS.second != -1 && RHS.second == -1) + return true; + + return false; + }); + SubprogramDIEAddrMap.erase( + std::unique(SubprogramDIEAddrMap.begin(), SubprogramDIEAddrMap.end(), + [](const std::pair<uint64_t, int64_t> &LHS, + const std::pair<uint64_t, int64_t> &RHS) { + // If the start addresses are exactly the same, we can + // remove all but the first one as it is the only one that + // will be found and used. + // + // If the DIE indices are the same, we can "merge" the + // ranges by eliminating the second. + return LHS.first == RHS.first || LHS.second == RHS.second; + }), + SubprogramDIEAddrMap.end()); + + assert(SubprogramDIEAddrMap.back().second == -1 && + "The last interval must not have a DIE as each DIE's address range is " + "bounded."); +}
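// A self-contained sketch of the boundary-point encoding built above: each
// [LowPC, HighPC) range contributes a start point carrying its DIE index and
// an end point carrying -1, and a query is an upper_bound followed by one
// step back. The types are simplified stand-ins for the DWARFUnit members.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using BoundaryMap = std::vector<std::pair<uint64_t, int64_t>>;

void addRange(BoundaryMap &Map, uint64_t LowPC, uint64_t HighPC, int64_t Idx) {
  Map.push_back({LowPC, Idx});
  Map.push_back({HighPC, -1}); // Ranges are half-open, so HighPC ends one.
}

// Returns the index covering Address, or -1 if it falls in a gap. Assumes the
// map has been sorted and uniqued exactly as in buildSubprogramDIEAddrMap.
int64_t lookup(const BoundaryMap &Map, uint64_t Address) {
  auto I = std::upper_bound(
      Map.begin(), Map.end(), Address,
      [](uint64_t LHS, const std::pair<uint64_t, int64_t> &RHS) {
        return LHS < RHS.first;
      });
  if (I == Map.begin())
    return -1; // Address precedes the first recorded range.
  return std::prev(I)->second;
}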
+ +// Build the second level of mapping from PC to DIE, specifically one that maps +// a PC *within* a particular DWARF subprogram into a precise, maximally nested +// inlined subroutine DIE (if any exists). We build a separate map for each +// subprogram because many subprograms will never get queried for an address +// and this allows us to be significantly lazier in reading the DWARF itself. +void DWARFUnit::buildInlinedSubroutineDIEAddrMap( + SubprogramDIEAddrInfo &SPInfo) { + auto &AddrMap = SPInfo.InlinedSubroutineDIEAddrMap; + uint64_t BasePC = SPInfo.SubprogramBasePC; + + auto SubroutineAddrMapSorter = [](const std::pair<uint32_t, int32_t> &LHS, + const std::pair<uint32_t, int32_t> &RHS) { + if (LHS.first < RHS.first) + return true; + if (LHS.first > RHS.first) + return false; + + // For ranges that start at the same address, keep the + // non-empty one. + if (LHS.second != -1 && RHS.second == -1) + return true; + + return false; + }; + auto SubroutineAddrMapUniquer = [](const std::pair<uint32_t, int32_t> &LHS, + const std::pair<uint32_t, int32_t> &RHS) { + // If the start addresses are exactly the same, we can + // remove all but the first one as it is the only one that + // will be found and used. + // + // If the DIE indices are the same, we can "merge" the + // ranges by eliminating the second. + return LHS.first == RHS.first || LHS.second == RHS.second; + }; + + struct DieAndParentIntervalRange { + DWARFDie Die; + int ParentIntervalsBeginIdx, ParentIntervalsEndIdx; + }; + + SmallVector<DieAndParentIntervalRange, 16> Worklist; + auto EnqueueChildDIEs = [&](const DWARFDie &Die, int ParentIntervalsBeginIdx, + int ParentIntervalsEndIdx) { + for (DWARFDie Child : Die.children()) + Worklist.push_back( + {Child, ParentIntervalsBeginIdx, ParentIntervalsEndIdx}); + }; + EnqueueChildDIEs(SPInfo.SubprogramDIE, 0, 0); + while (!Worklist.empty()) { + DWARFDie Die = Worklist.back().Die; + int ParentIntervalsBeginIdx = Worklist.back().ParentIntervalsBeginIdx; + int ParentIntervalsEndIdx = Worklist.back().ParentIntervalsEndIdx; + Worklist.pop_back(); + + // If we encounter a nested subprogram, simply ignore it. We map to + // (disjoint) subprograms before arriving here and we don't want to examine + // any inlined subroutines of an unrelated subprogram. + if (Die.getTag() == DW_TAG_subprogram) + continue; + + // For non-subroutines, just recurse to keep searching for inlined + // subroutines. + if (Die.getTag() != DW_TAG_inlined_subroutine) { + EnqueueChildDIEs(Die, ParentIntervalsBeginIdx, ParentIntervalsEndIdx); + continue; + } + + // Capture the inlined subroutine DIE that we will reference from the map. + int DIEIndex = InlinedSubroutineDIEs.size(); + InlinedSubroutineDIEs.push_back(Die); + + int DieIntervalsBeginIdx = AddrMap.size(); + // First collect the PC ranges for this DIE into our subroutine interval + // map. + for (auto R : Die.getAddressRanges()) { + // Clamp the PCs to be above the base. + R.LowPC = std::max(R.LowPC, BasePC); + R.HighPC = std::max(R.HighPC, BasePC); + // Compute relative PCs from the subprogram base and drop down to an + // unsigned 32-bit int to represent them within the data structure. This + // lets us cover a single subprogram of up to 4 GiB. Because subprograms + // may be partitioned into distant parts of a binary (think hot/cold + // partitioning) we want to preserve as much as we can here without + // burning extra memory. Past that, we will simply truncate and lose the + // ability to map those PCs to a DIE more precise than the subprogram. + const uint32_t MaxRelativePC = std::numeric_limits<uint32_t>::max(); + uint32_t RelativeLowPC = (R.LowPC - BasePC) > (uint64_t)MaxRelativePC + ? MaxRelativePC + : (uint32_t)(R.LowPC - BasePC); + uint32_t RelativeHighPC = (R.HighPC - BasePC) > (uint64_t)MaxRelativePC + ? MaxRelativePC + : (uint32_t)(R.HighPC - BasePC); + // Ignore empty or bogus ranges. + if (RelativeLowPC >= RelativeHighPC) + continue; + AddrMap.push_back({RelativeLowPC, DIEIndex}); + AddrMap.push_back({RelativeHighPC, -1}); + }
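// Worked example of the clamping above (editorial, with made-up numbers):
// with BasePC = 0x100000 and a range [0x100010, 0x100040), the stored
// interval is {0x10, DIEIndex}, {0x40, -1}. A range reaching past
// BasePC + 0xffffffff saturates to MaxRelativePC, so such PCs can still be
// attributed to the subprogram, just not to a more precise inlined frame.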
+ + // If there are no address ranges, there is nothing to do to map into them + // and there cannot be any child subroutine DIEs with address ranges of + // interest as those would all be required to nest within this DIE's + // non-existent ranges, so we can immediately continue to the next DIE in + // the worklist. + if (DieIntervalsBeginIdx == (int)AddrMap.size()) + continue; + + // The PCs from this DIE should never overlap, so we can easily sort them + // here. + std::sort(AddrMap.begin() + DieIntervalsBeginIdx, AddrMap.end(), + SubroutineAddrMapSorter); + // Remove any dead ranges. These should only come from "empty" ranges that + // were clobbered by some other range. + AddrMap.erase(std::unique(AddrMap.begin() + DieIntervalsBeginIdx, + AddrMap.end(), SubroutineAddrMapUniquer), + AddrMap.end()); + + // Compute the end index of this DIE's addr map intervals. + int DieIntervalsEndIdx = AddrMap.size(); + + assert(DieIntervalsBeginIdx != DieIntervalsEndIdx && + "Must not have an empty map for this layer!"); + assert(AddrMap.back().second == -1 && "Must end with an empty range!"); + assert(std::is_sorted(AddrMap.begin() + DieIntervalsBeginIdx, AddrMap.end(), + less_first()) && + "Failed to sort this DIE's intervals!"); + + // If we have any parent intervals, walk the newly added ranges and find + // the parent ranges they were inserted into. Both of these are sorted and + // neither has any overlaps. We need to append new ranges to split up any + // parent ranges these new ranges would overlap when we merge them. + if (ParentIntervalsBeginIdx != ParentIntervalsEndIdx) { + int ParentIntervalIdx = ParentIntervalsBeginIdx; + for (int i = DieIntervalsBeginIdx, e = DieIntervalsEndIdx - 1; i < e; + ++i) { + const uint32_t IntervalStart = AddrMap[i].first; + const uint32_t IntervalEnd = AddrMap[i + 1].first; + const int IntervalDieIdx = AddrMap[i].second; + if (IntervalDieIdx == -1) { + // For empty intervals, nothing is required. This is a bit surprising + // however. If the prior interval overlaps a parent interval and this + // would be necessary to mark the end, we will synthesize a new end + // that switches back to the parent DIE below. And this interval will + // get dropped in favor of one with a DIE attached. However, we'll + // still include this and so worst-case, it will still end the prior + // interval. + continue; + } + + // We are walking the new ranges in order, so search forward from the + // last point for a parent range that might overlap. + auto ParentIntervalsRange = + make_range(AddrMap.begin() + ParentIntervalIdx, + AddrMap.begin() + ParentIntervalsEndIdx); + assert(std::is_sorted(ParentIntervalsRange.begin(), + ParentIntervalsRange.end(), less_first()) && + "Unsorted parent intervals can't be searched!"); + auto PI = std::upper_bound( + ParentIntervalsRange.begin(), ParentIntervalsRange.end(), + IntervalStart, + [](uint32_t LHS, const std::pair<uint32_t, int32_t> &RHS) { + return LHS < RHS.first; + }); + if (PI == ParentIntervalsRange.begin() || + PI == ParentIntervalsRange.end()) + continue; + + ParentIntervalIdx = PI - AddrMap.begin(); + int32_t &ParentIntervalDieIdx = std::prev(PI)->second; + uint32_t &ParentIntervalStart = std::prev(PI)->first; + const uint32_t ParentIntervalEnd = PI->first; + + // If the new range starts exactly at the position of the parent range, + // we need to adjust the parent range. Note that these collisions can + // only happen with the original parent range because we will merge any + // adjacent ranges in the child. + if (IntervalStart == ParentIntervalStart) { + // If there will be a tail, just shift the start of the parent + // forward. Note that this cannot change the parent ordering. + if (IntervalEnd < ParentIntervalEnd) { + ParentIntervalStart = IntervalEnd; + continue; + } + // Otherwise, mark this as becoming empty so we'll remove it and + // prefer the child range. + ParentIntervalDieIdx = -1; + continue; + } + + // Finally, if the parent interval will need to remain as a prefix to + // this one, insert a new interval to cover any tail. + if (IntervalEnd < ParentIntervalEnd) + AddrMap.push_back({IntervalEnd, ParentIntervalDieIdx}); + } + }
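// Worked example of the splitting above (editorial, made-up values): given a
// parent interval {16, P} ending at 64 and a child range [32, 48) with index
// C, the child contributes {32, C} and {48, -1}, and the loop appends a tail
// {48, P} so PCs in [48, 64) still map to the parent DIE. Had the child
// started at 16 and covered the whole parent, the parent would instead be
// marked dead (-1) and dropped by the later unique pass.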
+ + // Note that we don't need to re-sort even this DIE's address map intervals + // after this. All of the newly added intervals actually fill in *gaps* in + // this DIE's address map, and we know that children won't need to look up + // into those gaps. + + // Recurse through its children, giving them the interval map range of this + // DIE to use as their parent intervals. + EnqueueChildDIEs(Die, DieIntervalsBeginIdx, DieIntervalsEndIdx); + } + + if (AddrMap.empty()) { + AddrMap.push_back({0, -1}); + return; + } + + // Now that we've added all of the intervals needed, we need to resort and + // unique them. Most notably, this will remove all the empty ranges that had + // a parent range covering, etc. We only expect a single non-empty interval + // at any given start point, so we just use std::sort. This could potentially + // produce non-deterministic maps for invalid DWARF. + std::sort(AddrMap.begin(), AddrMap.end(), SubroutineAddrMapSorter); + AddrMap.erase( + std::unique(AddrMap.begin(), AddrMap.end(), SubroutineAddrMapUniquer), + AddrMap.end()); } DWARFDie DWARFUnit::getSubroutineForAddress(uint64_t Address) { extractDIEsIfNeeded(false); - if (AddrDieMap.empty()) - updateAddressDieMap(getUnitDIE()); - auto R = AddrDieMap.upper_bound(Address); - if (R == AddrDieMap.begin()) + + // We use a two-level mapping structure to locate subroutines for a given PC + // address. + // + // First, we map the address to a subprogram. This can be done more cheaply + // because subprograms cannot nest within each other. It also allows us to + // avoid detailed examination of many subprograms, instead only focusing on + // the ones which we end up actively querying. + if (SubprogramDIEAddrMap.empty()) + buildSubprogramDIEAddrMap(); + + assert(!SubprogramDIEAddrMap.empty() && + "We must always end up with a non-empty map!"); + + auto I = std::upper_bound( + SubprogramDIEAddrMap.begin(), SubprogramDIEAddrMap.end(), Address, + [](uint64_t LHS, const std::pair<uint64_t, int64_t> &RHS) { + return LHS < RHS.first; + }); + // If we find the beginning, then the address is before the first subprogram. + if (I == SubprogramDIEAddrMap.begin()) return DWARFDie(); - // upper_bound's previous item contains Address. - --R; - if (Address >= R->second.first) + // Back up to the interval containing the address and see if it + // has a DIE associated with it. + --I; + if (I->second == -1) return DWARFDie(); - return R->second.second; + + auto &SPInfo = SubprogramDIEAddrInfos[I->second]; + + // Now that we have the subprogram for this address, we do the second level + // mapping by building a map within a subprogram's PC range to any specific + // inlined subroutine. + if (SPInfo.InlinedSubroutineDIEAddrMap.empty()) + buildInlinedSubroutineDIEAddrMap(SPInfo); + + // We look up within the inlined subroutine using a subprogram-relative + // address. + assert(Address >= SPInfo.SubprogramBasePC && + "Address isn't above the start of the subprogram!"); + uint32_t RelativeAddr = ((Address - SPInfo.SubprogramBasePC) > + (uint64_t)std::numeric_limits<uint32_t>::max()) + ? std::numeric_limits<uint32_t>::max() + : (uint32_t)(Address - SPInfo.SubprogramBasePC); + + auto J = + std::upper_bound(SPInfo.InlinedSubroutineDIEAddrMap.begin(), + SPInfo.InlinedSubroutineDIEAddrMap.end(), RelativeAddr, + [](uint32_t LHS, const std::pair<uint32_t, int32_t> &RHS) { + return LHS < RHS.first; + }); + // If we find the beginning, the address is before any inlined subroutine so + // return the subprogram DIE. + if (J == SPInfo.InlinedSubroutineDIEAddrMap.begin()) + return SPInfo.SubprogramDIE; + // Back up `J` and return the inlined subroutine if we have one or the + // subprogram if we don't. + --J; + return J->second == -1 ? SPInfo.SubprogramDIE + : InlinedSubroutineDIEs[J->second]; } void @@ -466,3 +815,89 @@ const DWARFAbbreviationDeclarationSet *DWARFUnit::getAbbreviations() const { Abbrevs = Abbrev->getAbbreviationDeclarationSet(AbbrOffset); return Abbrevs; } + +Optional<StrOffsetsContributionDescriptor> +StrOffsetsContributionDescriptor::validateContributionSize( + DWARFDataExtractor &DA) { + uint8_t EntrySize = getDwarfOffsetByteSize(); + // In order to ensure that we don't read a partial record at the end of + // the section we validate for a multiple of the entry size. + uint64_t ValidationSize = alignTo(Size, EntrySize); + // Guard against overflow.
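// Editorial note on the guard below: alignTo rounds Size up to a multiple of
// EntrySize using wrapping arithmetic, so a Size close to UINT64_MAX can come
// back *smaller* than Size. For example, with EntrySize == 8:
//   alignTo(0xfffffffffffffff9ULL, 8) == 0
// The `ValidationSize >= Size` test therefore rejects exactly the wrapped
// case before the offset range is validated.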
+ if (ValidationSize >= Size) + if (DA.isValidOffsetForDataOfSize((uint32_t)Base, ValidationSize)) + return *this; + return Optional<StrOffsetsContributionDescriptor>(); +} + +// Look for a DWARF64-formatted contribution to the string offsets table +// starting at a given offset and record it in a descriptor. +static Optional<StrOffsetsContributionDescriptor> +parseDWARF64StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) { + if (!DA.isValidOffsetForDataOfSize(Offset, 16)) + return Optional<StrOffsetsContributionDescriptor>(); + + if (DA.getU32(&Offset) != 0xffffffff) + return Optional<StrOffsetsContributionDescriptor>(); + + uint64_t Size = DA.getU64(&Offset); + uint8_t Version = DA.getU16(&Offset); + (void)DA.getU16(&Offset); // padding + return StrOffsetsContributionDescriptor(Offset, Size, Version, DWARF64); +} + +// Look for a DWARF32-formatted contribution to the string offsets table +// starting at a given offset and record it in a descriptor. +static Optional<StrOffsetsContributionDescriptor> +parseDWARF32StringOffsetsTableHeader(DWARFDataExtractor &DA, uint32_t Offset) { + if (!DA.isValidOffsetForDataOfSize(Offset, 8)) + return Optional<StrOffsetsContributionDescriptor>(); + uint32_t ContributionSize = DA.getU32(&Offset); + if (ContributionSize >= 0xfffffff0) + return Optional<StrOffsetsContributionDescriptor>(); + uint8_t Version = DA.getU16(&Offset); + (void)DA.getU16(&Offset); // padding + return StrOffsetsContributionDescriptor(Offset, ContributionSize, Version, DWARF32); +} + +Optional<StrOffsetsContributionDescriptor> +DWARFUnit::determineStringOffsetsTableContribution(DWARFDataExtractor &DA, + uint64_t Offset) { + Optional<StrOffsetsContributionDescriptor> Descriptor; + // Attempt to find a DWARF64 contribution 16 bytes before the base. + if (Offset >= 16) + Descriptor = + parseDWARF64StringOffsetsTableHeader(DA, (uint32_t)Offset - 16); + // Try to find a DWARF32 contribution 8 bytes before the base. + if (!Descriptor && Offset >= 8) + Descriptor = parseDWARF32StringOffsetsTableHeader(DA, (uint32_t)Offset - 8); + return Descriptor ? Descriptor->validateContributionSize(DA) : Descriptor; +} + +Optional<StrOffsetsContributionDescriptor> +DWARFUnit::determineStringOffsetsTableContributionDWO(DWARFDataExtractor &DA, + uint64_t Offset) { + if (getVersion() >= 5) { + // Look for a valid contribution at the given offset. + auto Descriptor = + parseDWARF64StringOffsetsTableHeader(DA, (uint32_t)Offset); + if (!Descriptor) + Descriptor = parseDWARF32StringOffsetsTableHeader(DA, (uint32_t)Offset); + return Descriptor ? Descriptor->validateContributionSize(DA) : Descriptor; + } + // Prior to DWARF v5, we derive the contribution size from the + // index table (in a package file). In a .dwo file it is simply + // the length of the string offsets section. + uint64_t Size = 0; + if (!IndexEntry) + Size = StringOffsetSection.Data.size(); + else if (const auto *C = IndexEntry->getOffset(DW_SECT_STR_OFFSETS)) + Size = C->Length; + // Return a descriptor with the given offset as base, version 4 and + // DWARF32 format. + return StrOffsetsContributionDescriptor(Offset, Size, 4, DWARF32); +} diff --git a/lib/Demangle/ItaniumDemangle.cpp b/lib/Demangle/ItaniumDemangle.cpp index 34f4017d9828..9c2258f5b933 100644 --- a/lib/Demangle/ItaniumDemangle.cpp +++ b/lib/Demangle/ItaniumDemangle.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Demangle/Demangle.h" +#include "llvm/Support/Compiler.h" // This file exports a single function: llvm::itanium_demangle. // It also has no dependencies on the rest of llvm.
It is implemented this way @@ -1947,7 +1948,7 @@ static const char *parse_type(const char *first, const char *last, C &db) { break; } } - // falls through + LLVM_FALLTHROUGH; default: // must check for builtin-types before class-enum-types to avoid // ambiguities with operator-names diff --git a/lib/FuzzMutate/IRMutator.cpp b/lib/FuzzMutate/IRMutator.cpp index 15e7f86d1cdf..00b558ac4dcb 100644 --- a/lib/FuzzMutate/IRMutator.cpp +++ b/lib/FuzzMutate/IRMutator.cpp @@ -8,15 +8,17 @@ //===----------------------------------------------------------------------===// #include "llvm/FuzzMutate/IRMutator.h" +#include "llvm/ADT/Optional.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/FuzzMutate/Operations.h" #include "llvm/FuzzMutate/Random.h" #include "llvm/FuzzMutate/RandomIRBuilder.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar/DCE.h" using namespace llvm; @@ -90,14 +92,14 @@ std::vector<fuzzerop::OpDescriptor> InjectorIRStrategy::getDefaultOps() { return Ops; } -fuzzerop::OpDescriptor +Optional<fuzzerop::OpDescriptor> InjectorIRStrategy::chooseOperation(Value *Src, RandomIRBuilder &IB) { auto OpMatchesPred = [&Src](fuzzerop::OpDescriptor &Op) { return Op.SourcePreds[0].matches({}, Src); }; auto RS = makeSampler(IB.Rand, make_filter_range(Operations, OpMatchesPred)); if (RS.isEmpty()) - report_fatal_error("No available operations for src type"); + return None; return *RS; } @@ -120,10 +122,15 @@ void InjectorIRStrategy::mutate(BasicBlock &BB, RandomIRBuilder &IB) { // Choose an operation that's constrained to be valid for the type of the // source, collect any other sources it needs, and then build it. - fuzzerop::OpDescriptor OpDesc = chooseOperation(Srcs[0], IB); - for (const auto &Pred : makeArrayRef(OpDesc.SourcePreds).slice(1)) + auto OpDesc = chooseOperation(Srcs[0], IB); + // Bail if no operation was found + if (!OpDesc) + return; + + for (const auto &Pred : makeArrayRef(OpDesc->SourcePreds).slice(1)) Srcs.push_back(IB.findOrCreateSource(BB, InstsBefore, Srcs, Pred)); - if (Value *Op = OpDesc.BuilderFunc(Srcs, Insts[IP])) { + + if (Value *Op = OpDesc->BuilderFunc(Srcs, Insts[IP])) { // Find a sink and wire up the results of the operation. IB.connectToSink(BB, InstsAfter, Op); } diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp index 90b10309b58b..59818a1425f1 100644 --- a/lib/IR/ConstantFold.cpp +++ b/lib/IR/ConstantFold.cpp @@ -1674,6 +1674,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2, } } } + break; } default: break; diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp index 1fff912ecf2f..7063f6f40a30 100644 --- a/lib/IR/Function.cpp +++ b/lib/IR/Function.cpp @@ -1333,7 +1333,9 @@ Optional<uint64_t> Function::getEntryCount() const { if (MDS->getString().equals("function_entry_count")) { ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(1)); uint64_t Count = CI->getValue().getZExtValue(); - if (Count == 0) + // A value of -1 is used for SamplePGO when there were no samples. + // Treat this the same as unknown.
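// Editorial sketch of the sentinel handling below, using std::optional as a
// stand-in for llvm::Optional; the comparison relies on -1 wrapping to
// UINT64_MAX:
//   std::optional<uint64_t> decodeEntryCount(uint64_t Count) {
//     if (Count == std::numeric_limits<uint64_t>::max()) // serialized as -1
//       return std::nullopt; // SamplePGO saw no samples: treat as unknown.
//     return Count;
//   }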
+ if (Count == (uint64_t)-1) return None; return Count; } diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp index eae697b2e4b9..163c785f5d76 100644 --- a/lib/IR/Value.cpp +++ b/lib/IR/Value.cpp @@ -627,9 +627,10 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL, CanBeNull = false; if (const Argument *A = dyn_cast<Argument>(this)) { DerefBytes = A->getDereferenceableBytes(); - if (DerefBytes == 0 && A->hasByValAttr()) { + if (DerefBytes == 0 && (A->hasByValAttr() || A->hasStructRetAttr())) { Type *PT = cast<PointerType>(A->getType())->getElementType(); - DerefBytes = DL.getTypeStoreSize(PT); + if (PT->isSized()) + DerefBytes = DL.getTypeStoreSize(PT); } if (DerefBytes == 0) { DerefBytes = A->getDereferenceableOrNullBytes(); @@ -655,10 +656,8 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL, CanBeNull = true; } } else if (auto *AI = dyn_cast<AllocaInst>(this)) { - const ConstantInt *ArraySize = dyn_cast<ConstantInt>(AI->getArraySize()); - if (ArraySize && AI->getAllocatedType()->isSized()) { - DerefBytes = DL.getTypeStoreSize(AI->getAllocatedType()) * - ArraySize->getZExtValue(); + if (!AI->isArrayAllocation()) { + DerefBytes = DL.getTypeStoreSize(AI->getAllocatedType()); CanBeNull = false; } } else if (auto *GV = dyn_cast<GlobalVariable>(this)) { diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index 3357553cf19f..e521b6e7c704 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -405,9 +405,13 @@ void MCAsmStreamer::emitExplicitComments() { void MCAsmStreamer::ChangeSection(MCSection *Section, const MCExpr *Subsection) { assert(Section && "Cannot switch to a null section!"); - Section->PrintSwitchToSection( - *MAI, getContext().getObjectFileInfo()->getTargetTriple(), OS, - Subsection); + if (MCTargetStreamer *TS = getTargetStreamer()) { + TS->changeSection(getCurrentSectionOnly(), Section, Subsection, OS); + } else { + Section->PrintSwitchToSection( + *MAI, getContext().getObjectFileInfo()->getTargetTriple(), OS, + Subsection); + } } void MCAsmStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) { @@ -796,10 +800,15 @@ void MCAsmStreamer::EmitBytes(StringRef Data) { "Cannot emit contents before setting section!"); if (Data.empty()) return; - if (Data.size() == 1) { - OS << MAI->getData8bitsDirective(); - OS << (unsigned)(unsigned char)Data[0]; - EmitEOL(); + // If only a single byte is provided, or neither the ascii nor the asciz + // directive is supported, emit the data as a sequence of 8-bit directives.
+ if (Data.size() == 1 || + !(MAI->getAscizDirective() || MAI->getAsciiDirective())) { + const char *Directive = MAI->getData8bitsDirective(); + for (const unsigned char C : Data.bytes()) { + OS << Directive << (unsigned)C; + EmitEOL(); + } return; } @@ -884,8 +893,12 @@ void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, assert(Directive && "Invalid size for machine code value!"); OS << Directive; - Value->print(OS, MAI); - EmitEOL(); + if (MCTargetStreamer *TS = getTargetStreamer()) { + TS->emitValue(Value); + } else { + Value->print(OS, MAI); + EmitEOL(); + } } void MCAsmStreamer::EmitULEB128Value(const MCExpr *Value) { @@ -1097,13 +1110,19 @@ unsigned MCAsmStreamer::EmitDwarfFileDirective(unsigned FileNo, } } - OS << "\t.file\t" << FileNo << ' '; + SmallString<128> Str; + raw_svector_ostream OS1(Str); + OS1 << "\t.file\t" << FileNo << ' '; if (!Directory.empty()) { - PrintQuotedString(Directory, OS); - OS << ' '; + PrintQuotedString(Directory, OS1); + OS1 << ' '; + } + PrintQuotedString(Filename, OS1); + if (MCTargetStreamer *TS = getTargetStreamer()) { + TS->emitDwarfFileDirective(OS1.str()); + } else { + EmitRawText(OS1.str()); } - PrintQuotedString(Filename, OS); - EmitEOL(); return FileNo; } diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp index 6f3647d61932..6e801ed8777c 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -49,6 +49,28 @@ void MCTargetStreamer::emitLabel(MCSymbol *Symbol) {} void MCTargetStreamer::finish() {} +void MCTargetStreamer::changeSection(const MCSection *CurSection, + MCSection *Section, + const MCExpr *Subsection, + raw_ostream &OS) { + Section->PrintSwitchToSection( + *Streamer.getContext().getAsmInfo(), + Streamer.getContext().getObjectFileInfo()->getTargetTriple(), OS, + Subsection); +} + +void MCTargetStreamer::emitDwarfFileDirective(StringRef Directive) { + Streamer.EmitRawText(Directive); +} + +void MCTargetStreamer::emitValue(const MCExpr *Value) { + SmallString<128> Str; + raw_svector_ostream OS(Str); + + Value->print(OS, Streamer.getContext().getAsmInfo()); + Streamer.EmitRawText(OS.str()); +} + void MCTargetStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {} MCStreamer::MCStreamer(MCContext &Ctx) diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp index 6e76c5fac35f..0f0b645492ee 100644 --- a/lib/MC/WasmObjectWriter.cpp +++ b/lib/MC/WasmObjectWriter.cpp @@ -553,7 +553,7 @@ uint32_t WasmObjectWriter::getRelocationIndexValue( case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB: case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: if (!IndirectSymbolIndices.count(RelEntry.Symbol)) - report_fatal_error("symbol not found table index space: " + + report_fatal_error("symbol not found in table index space: " + RelEntry.Symbol->getName()); return IndirectSymbolIndices[RelEntry.Symbol]; case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB: case wasm::R_WEBASSEMBLY_MEMORY_ADDR_LEB: case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB: case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32: if (!SymbolIndices.count(RelEntry.Symbol)) - report_fatal_error("symbol not found function/global index space: " + + report_fatal_error("symbol not found in function/global index space: " + RelEntry.Symbol->getName()); return SymbolIndices[RelEntry.Symbol]; case wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB: @@ -994,33 +994,10 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, SmallVector<WasmExport, 4> Exports; SmallVector<std::pair<StringRef, uint32_t>, 4> SymbolFlags; SmallVector<std::pair<uint32_t, uint32_t>, 2> InitFuncs; - SmallPtrSet<const MCSymbolWasm *, 4> IsAddressTaken; unsigned NumFuncImports = 0; SmallVector<WasmDataSegment, 4>
DataSegments; uint32_t DataSize = 0; - // Populate the IsAddressTaken set. - for (const WasmRelocationEntry &RelEntry : CodeRelocations) { - switch (RelEntry.Type) { - case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB: - case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB: - IsAddressTaken.insert(RelEntry.Symbol); - break; - default: - break; - } - } - for (const WasmRelocationEntry &RelEntry : DataRelocations) { - switch (RelEntry.Type) { - case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: - case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32: - IsAddressTaken.insert(RelEntry.Symbol); - break; - default: - break; - } - } - // In the special .global_variables section, we've encoded global // variables used by the function. Translate them into the Globals // list. @@ -1116,7 +1093,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, continue; // If the symbol is not defined in this translation unit, import it. - if (!WS.isDefined(/*SetUsed=*/false)) { + if (!WS.isDefined(/*SetUsed=*/false) || WS.isVariable()) { WasmImport Import; Import.ModuleName = WS.getModuleName(); Import.FieldName = WS.getName(); @@ -1132,8 +1109,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, Import.IsMutable = false; SymbolIndices[&WS] = NumGlobalImports; - // If this global is the stack pointer, make it mutable and remember it - // so that we can emit metadata for it. + // If this global is the stack pointer, make it mutable. if (WS.getName() == "__stack_pointer") Import.IsMutable = true; @@ -1218,14 +1194,7 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, } DEBUG(dbgs() << " -> function index: " << Index << "\n"); - - // If needed, prepare the function to be called indirectly. - if (IsAddressTaken.count(&WS) != 0) { - IndirectSymbolIndices[&WS] = TableElems.size(); - DEBUG(dbgs() << " -> adding to table: " << TableElems.size() << "\n"); - TableElems.push_back(Index); - } - } else { + } else { if (WS.isTemporary() && !WS.getSize()) continue; @@ -1289,7 +1258,6 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, uint32_t Index = SymbolIndices.find(ResolvedSym)->second; DEBUG(dbgs() << " -> index:" << Index << "\n"); - SymbolIndices[&WS] = Index; WasmExport Export; Export.FieldName = WS.getName(); Export.Index = Index; @@ -1304,12 +1272,34 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, SymbolFlags.emplace_back(WS.getName(), wasm::WASM_SYMBOL_BINDING_LOCAL); } - // Add types for indirect function calls. - for (const WasmRelocationEntry &Fixup : CodeRelocations) { - if (Fixup.Type != wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB) - continue; + { + auto HandleReloc = [&](const WasmRelocationEntry &Rel) { + // Functions referenced by a relocation need to be prepared to be called + // indirectly.
+ const MCSymbolWasm& WS = *Rel.Symbol; + if (WS.isFunction() && IndirectSymbolIndices.count(&WS) == 0) { + switch (Rel.Type) { + case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: + case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB: + case wasm::R_WEBASSEMBLY_MEMORY_ADDR_I32: + case wasm::R_WEBASSEMBLY_MEMORY_ADDR_SLEB: { + uint32_t Index = SymbolIndices.find(&WS)->second; + IndirectSymbolIndices[&WS] = TableElems.size(); + DEBUG(dbgs() << " -> adding to table: " << TableElems.size() << "\n"); + TableElems.push_back(Index); + registerFunctionType(WS); + break; + } + default: + break; + } + } + }; - registerFunctionType(*Fixup.Symbol); + for (const WasmRelocationEntry &RelEntry : CodeRelocations) + HandleReloc(RelEntry); + for (const WasmRelocationEntry &RelEntry : DataRelocations) + HandleReloc(RelEntry); } // Translate .init_array section contents into start functions. diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp index c72a1258c1ee..5906dc5f5307 100644 --- a/lib/Object/ELF.cpp +++ b/lib/Object/ELF.cpp @@ -138,6 +138,7 @@ StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine, default: break; } + break; case ELF::EM_BPF: switch (Type) { #include "llvm/BinaryFormat/ELFRelocs/BPF.def" diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp index 7a0c05ed8a15..48f98df6f34d 100644 --- a/lib/Object/WasmObjectFile.cpp +++ b/lib/Object/WasmObjectFile.cpp @@ -303,7 +303,6 @@ Error WasmObjectFile::parseNameSection(const uint8_t *Ptr, const uint8_t *End) { void WasmObjectFile::populateSymbolTable() { // Add imports to symbol table - size_t ImportIndex = 0; size_t GlobalIndex = 0; size_t FunctionIndex = 0; for (const wasm::WasmImport& Import : Imports) { @@ -312,7 +311,7 @@ void WasmObjectFile::populateSymbolTable() { assert(Import.Global.Type == wasm::WASM_TYPE_I32); SymbolMap.try_emplace(Import.Field, Symbols.size()); Symbols.emplace_back(Import.Field, WasmSymbol::SymbolType::GLOBAL_IMPORT, - ImportSection, GlobalIndex++, ImportIndex); + ImportSection, GlobalIndex++); DEBUG(dbgs() << "Adding import: " << Symbols.back() << " sym index:" << Symbols.size() << "\n"); break; @@ -320,14 +319,13 @@ void WasmObjectFile::populateSymbolTable() { SymbolMap.try_emplace(Import.Field, Symbols.size()); Symbols.emplace_back(Import.Field, WasmSymbol::SymbolType::FUNCTION_IMPORT, - ImportSection, FunctionIndex++, ImportIndex); + ImportSection, FunctionIndex++, Import.SigIndex); DEBUG(dbgs() << "Adding import: " << Symbols.back() << " sym index:" << Symbols.size() << "\n"); break; default: break; } - ImportIndex++; } // Add exports to symbol table @@ -338,11 +336,22 @@ void WasmObjectFile::populateSymbolTable() { Export.Kind == wasm::WASM_EXTERNAL_FUNCTION ? 
WasmSymbol::SymbolType::FUNCTION_EXPORT : WasmSymbol::SymbolType::GLOBAL_EXPORT; - SymbolMap.try_emplace(Export.Name, Symbols.size()); - Symbols.emplace_back(Export.Name, ExportType, - ExportSection, Export.Index); - DEBUG(dbgs() << "Adding export: " << Symbols.back() - << " sym index:" << Symbols.size() << "\n"); + auto Pair = SymbolMap.try_emplace(Export.Name, Symbols.size()); + if (Pair.second) { + Symbols.emplace_back(Export.Name, ExportType, + ExportSection, Export.Index); + DEBUG(dbgs() << "Adding export: " << Symbols.back() + << " sym index:" << Symbols.size() << "\n"); + } else { + uint32_t SymIndex = Pair.first->second; + const WasmSymbol &OldSym = Symbols[SymIndex]; + WasmSymbol NewSym(Export.Name, ExportType, ExportSection, Export.Index); + NewSym.setAltIndex(OldSym.ElementIndex); + Symbols[SymIndex] = NewSym; + + DEBUG(dbgs() << "Replacing existing symbol: " << NewSym + << " sym index:" << SymIndex << "\n"); + } } } } @@ -1017,7 +1026,7 @@ void WasmObjectFile::getRelocationTypeName( break; switch (Rel.Type) { -#include "llvm/BinaryFormat/WasmRelocs/WebAssembly.def" +#include "llvm/BinaryFormat/WasmRelocs.def" } #undef WASM_RELOC diff --git a/lib/Object/WindowsResource.cpp b/lib/Object/WindowsResource.cpp index 9ca584a4a1ae..271224ec6312 100644 --- a/lib/Object/WindowsResource.cpp +++ b/lib/Object/WindowsResource.cpp @@ -14,6 +14,7 @@ #include "llvm/Object/WindowsResource.h" #include "llvm/Object/COFF.h" #include "llvm/Support/FileOutputBuffer.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" #include <ctime> #include <queue> @@ -560,10 +561,9 @@ void WindowsResourceCOFFWriter::writeSymbolTable() { // Now write a symbol for each relocation. for (unsigned i = 0; i < Data.size(); i++) { - char RelocationName[9]; - sprintf(RelocationName, "$R%06X", DataOffsets[i]); + auto RelocationName = formatv("$R{0:X-6}", i & 0xffffff).sstr<COFF::NameSize>(); Symbol = reinterpret_cast<coff_symbol16 *>(BufferStart + CurrentOffset); - strncpy(Symbol->Name.ShortName, RelocationName, (size_t)COFF::NameSize); + memcpy(Symbol->Name.ShortName, RelocationName.data(), (size_t) COFF::NameSize); Symbol->Value = DataOffsets[i]; Symbol->SectionNumber = 2; Symbol->Type = COFF::IMAGE_SYM_DTYPE_NULL; diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp index 8687f22949a2..b2411395dc0f 100644 --- a/lib/ObjectYAML/WasmYAML.cpp +++ b/lib/ObjectYAML/WasmYAML.cpp @@ -439,7 +439,7 @@ void ScalarEnumerationTraits::enumeration( void ScalarEnumerationTraits<WasmYAML::RelocType>::enumeration( IO &IO, WasmYAML::RelocType &Type) { #define WASM_RELOC(name, value) IO.enumCase(Type, #name, wasm::name); -#include "llvm/BinaryFormat/WasmRelocs/WebAssembly.def" +#include "llvm/BinaryFormat/WasmRelocs.def" #undef WASM_RELOC } diff --git a/lib/Passes/LLVMBuild.txt b/lib/Passes/LLVMBuild.txt index 4d8c7f85d3aa..e2378a84328e 100644 --- a/lib/Passes/LLVMBuild.txt +++ b/lib/Passes/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = Passes parent = Libraries -required_libraries = Analysis CodeGen Core IPO InstCombine Scalar Support TransformUtils Vectorize Instrumentation +required_libraries = Analysis CodeGen Core IPO InstCombine Scalar Support Target TransformUtils Vectorize Instrumentation diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp index f7fb0cef16bf..3489feb93a02 100644 --- a/lib/Support/APFloat.cpp +++ b/lib/Support/APFloat.cpp @@ -2546,12 +2546,12 @@ IEEEFloat::convertFromDecimalString(StringRef str, roundingMode rounding_mode) { } bool IEEEFloat::convertFromStringSpecials(StringRef str) { - if (str.equals("inf") ||
str.equals("INFINITY")) { + if (str.equals("inf") || str.equals("INFINITY") || str.equals("+Inf")) { makeInf(false); return true; } - if (str.equals("-inf") || str.equals("-INFINITY")) { + if (str.equals("-inf") || str.equals("-INFINITY") || str.equals("-Inf")) { makeInf(true); return true; } diff --git a/lib/Support/CachePruning.cpp b/lib/Support/CachePruning.cpp index 3e97c991f504..141573c2a1c7 100644 --- a/lib/Support/CachePruning.cpp +++ b/lib/Support/CachePruning.cpp @@ -165,12 +165,14 @@ bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy) { return false; } } else { + if (!Policy.Interval) + return false; if (Policy.Interval != seconds(0)) { // Check whether the time stamp is older than our pruning interval. // If not, do nothing. const auto TimeStampModTime = FileStatus.getLastModificationTime(); auto TimeStampAge = CurrentTime - TimeStampModTime; - if (TimeStampAge <= Policy.Interval) { + if (TimeStampAge <= *Policy.Interval) { DEBUG(dbgs() << "Timestamp file too recent (" << duration_cast(TimeStampAge).count() << "s old), do not prune.\n"); diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index 85e782b2c048..c709fc416df6 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -80,10 +80,12 @@ void *operator new(size_t N, const NamedBufferAlloc &Alloc) { namespace { /// MemoryBufferMem - Named MemoryBuffer pointing to a block of memory. -class MemoryBufferMem : public MemoryBuffer { +template +class MemoryBufferMem : public MB { public: MemoryBufferMem(StringRef InputData, bool RequiresNullTerminator) { - init(InputData.begin(), InputData.end(), RequiresNullTerminator); + MemoryBuffer::init(InputData.begin(), InputData.end(), + RequiresNullTerminator); } /// Disable sized deallocation for MemoryBufferMem, because it has @@ -95,21 +97,22 @@ class MemoryBufferMem : public MemoryBuffer { return StringRef(reinterpret_cast(this + 1)); } - BufferKind getBufferKind() const override { - return MemoryBuffer_Malloc; + MemoryBuffer::BufferKind getBufferKind() const override { + return MemoryBuffer::MemoryBuffer_Malloc; } }; } -static ErrorOr> -getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize, +template +static ErrorOr> +getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize, uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile); std::unique_ptr MemoryBuffer::getMemBuffer(StringRef InputData, StringRef BufferName, bool RequiresNullTerminator) { auto *Ret = new (NamedBufferAlloc(BufferName)) - MemoryBufferMem(InputData, RequiresNullTerminator); + MemoryBufferMem(InputData, RequiresNullTerminator); return std::unique_ptr(Ret); } @@ -119,50 +122,30 @@ MemoryBuffer::getMemBuffer(MemoryBufferRef Ref, bool RequiresNullTerminator) { Ref.getBuffer(), Ref.getBufferIdentifier(), RequiresNullTerminator)); } -std::unique_ptr -MemoryBuffer::getMemBufferCopy(StringRef InputData, const Twine &BufferName) { - std::unique_ptr Buf = - getNewUninitMemBuffer(InputData.size(), BufferName); +static ErrorOr> +getMemBufferCopyImpl(StringRef InputData, const Twine &BufferName) { + auto Buf = WritableMemoryBuffer::getNewUninitMemBuffer(InputData.size(), BufferName); if (!Buf) - return nullptr; - memcpy(const_cast(Buf->getBufferStart()), InputData.data(), - InputData.size()); - return Buf; + return make_error_code(errc::not_enough_memory); + memcpy(Buf->getBufferStart(), InputData.data(), InputData.size()); + return std::move(Buf); } std::unique_ptr -MemoryBuffer::getNewUninitMemBuffer(size_t Size, const Twine 
&BufferName) { - // Allocate space for the MemoryBuffer, the data and the name. It is important - // that MemoryBuffer and data are aligned so PointerIntPair works with them. - // TODO: Is 16-byte alignment enough? We copy small object files with large - // alignment expectations into this buffer. - SmallString<256> NameBuf; - StringRef NameRef = BufferName.toStringRef(NameBuf); - size_t AlignedStringLen = - alignTo(sizeof(MemoryBufferMem) + NameRef.size() + 1, 16); - size_t RealLen = AlignedStringLen + Size + 1; - char *Mem = static_cast<char *>(operator new(RealLen, std::nothrow)); - if (!Mem) - return nullptr; - - // The name is stored after the class itself. - CopyStringRef(Mem + sizeof(MemoryBufferMem), NameRef); - - // The buffer begins after the name and must be aligned. - char *Buf = Mem + AlignedStringLen; - Buf[Size] = 0; // Null terminate buffer. - - auto *Ret = new (Mem) MemoryBufferMem(StringRef(Buf, Size), true); - return std::unique_ptr<MemoryBuffer>(Ret); +MemoryBuffer::getMemBufferCopy(StringRef InputData, const Twine &BufferName) { + auto Buf = getMemBufferCopyImpl(InputData, BufferName); + if (Buf) + return std::move(*Buf); + return nullptr; } std::unique_ptr<MemoryBuffer> MemoryBuffer::getNewMemBuffer(size_t Size, StringRef BufferName) { - std::unique_ptr<MemoryBuffer> SB = getNewUninitMemBuffer(Size, BufferName); + auto SB = WritableMemoryBuffer::getNewUninitMemBuffer(Size, BufferName); if (!SB) return nullptr; - memset(const_cast<char *>(SB->getBufferStart()), 0, Size); - return SB; + memset(SB->getBufferStart(), 0, Size); + return std::move(SB); } ErrorOr<std::unique_ptr<MemoryBuffer>> @@ -179,10 +162,10 @@ MemoryBuffer::getFileOrSTDIN(const Twine &Filename, int64_t FileSize, ErrorOr<std::unique_ptr<MemoryBuffer>> MemoryBuffer::getFileSlice(const Twine &FilePath, uint64_t MapSize, uint64_t Offset, bool IsVolatile) { - return getFileAux(FilePath, -1, MapSize, Offset, false, IsVolatile); + return getFileAux<MemoryBuffer>(FilePath, -1, MapSize, Offset, false, + IsVolatile); } - //===----------------------------------------------------------------------===// // MemoryBuffer::getFile implementation. //===----------------------------------------------------------------------===// @@ -191,7 +174,8 @@ namespace { /// \brief Memory maps a file descriptor using sys::fs::mapped_file_region. /// /// This handles converting the offset into a legal offset on the platform. -class MemoryBufferMMapFile : public MemoryBuffer { +template <typename MB> +class MemoryBufferMMapFile : public MB { sys::fs::mapped_file_region MFR; static uint64_t getLegalMapOffset(uint64_t Offset) { @@ -209,11 +193,13 @@ class MemoryBufferMMapFile : public MemoryBuffer { public: MemoryBufferMMapFile(bool RequiresNullTerminator, int FD, uint64_t Len, uint64_t Offset, std::error_code &EC) - : MFR(FD, sys::fs::mapped_file_region::readonly, + : MFR(FD, + MB::Writable ?
sys::fs::mapped_file_region::priv + : sys::fs::mapped_file_region::readonly, getLegalMapSize(Len, Offset), getLegalMapOffset(Offset), EC) { if (!EC) { const char *Start = getStart(Len, Offset); - init(Start, Start + Len, RequiresNullTerminator); + MemoryBuffer::init(Start, Start + Len, RequiresNullTerminator); } } @@ -226,13 +212,13 @@ class MemoryBufferMMapFile : public MemoryBuffer { return StringRef(reinterpret_cast<const char *>(this + 1)); } - BufferKind getBufferKind() const override { - return MemoryBuffer_MMap; + MemoryBuffer::BufferKind getBufferKind() const override { + return MemoryBuffer::MemoryBuffer_MMap; } }; } -static ErrorOr<std::unique_ptr<MemoryBuffer>> +static ErrorOr<std::unique_ptr<WritableMemoryBuffer>> getMemoryBufferForStream(int FD, const Twine &BufferName) { const ssize_t ChunkSize = 4096*4; SmallString<ChunkSize> Buffer; @@ -246,37 +232,80 @@ getMemoryBufferForStream(int FD, const Twine &BufferName) { Buffer.set_size(Buffer.size() + ReadBytes); } while (ReadBytes != 0); - return MemoryBuffer::getMemBufferCopy(Buffer, BufferName); + return getMemBufferCopyImpl(Buffer, BufferName); } ErrorOr<std::unique_ptr<MemoryBuffer>> MemoryBuffer::getFile(const Twine &Filename, int64_t FileSize, bool RequiresNullTerminator, bool IsVolatile) { - return getFileAux(Filename, FileSize, FileSize, 0, - RequiresNullTerminator, IsVolatile); + return getFileAux<MemoryBuffer>(Filename, FileSize, FileSize, 0, + RequiresNullTerminator, IsVolatile); } -static ErrorOr<std::unique_ptr<MemoryBuffer>> +template <typename MB> +static ErrorOr<std::unique_ptr<MB>> getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize, uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator, bool IsVolatile); -static ErrorOr<std::unique_ptr<MemoryBuffer>> +template <typename MB> +static ErrorOr<std::unique_ptr<MB>> getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize, uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile) { int FD; std::error_code EC = sys::fs::openFileForRead(Filename, FD); + if (EC) return EC; - ErrorOr<std::unique_ptr<MemoryBuffer>> Ret = - getOpenFileImpl(FD, Filename, FileSize, MapSize, Offset, - RequiresNullTerminator, IsVolatile); + auto Ret = getOpenFileImpl<MB>(FD, Filename, FileSize, MapSize, Offset, + RequiresNullTerminator, IsVolatile); close(FD); return Ret; } +ErrorOr<std::unique_ptr<WritableMemoryBuffer>> +WritableMemoryBuffer::getFile(const Twine &Filename, int64_t FileSize, + bool IsVolatile) { + return getFileAux<WritableMemoryBuffer>(Filename, FileSize, FileSize, 0, + /*RequiresNullTerminator*/ false, + IsVolatile); +} + +ErrorOr<std::unique_ptr<WritableMemoryBuffer>> +WritableMemoryBuffer::getFileSlice(const Twine &Filename, uint64_t MapSize, + uint64_t Offset, bool IsVolatile) { + return getFileAux<WritableMemoryBuffer>(Filename, -1, MapSize, Offset, false, + IsVolatile); +} + +std::unique_ptr<WritableMemoryBuffer> +WritableMemoryBuffer::getNewUninitMemBuffer(size_t Size, const Twine &BufferName) { + using MemBuffer = MemoryBufferMem<WritableMemoryBuffer>; + // Allocate space for the MemoryBuffer, the data and the name. It is important + // that MemoryBuffer and data are aligned so PointerIntPair works with them. + // TODO: Is 16-byte alignment enough? We copy small object files with large + // alignment expectations into this buffer. + SmallString<256> NameBuf; + StringRef NameRef = BufferName.toStringRef(NameBuf); + size_t AlignedStringLen = alignTo(sizeof(MemBuffer) + NameRef.size() + 1, 16); + size_t RealLen = AlignedStringLen + Size + 1; + char *Mem = static_cast<char *>(operator new(RealLen, std::nothrow)); + if (!Mem) + return nullptr; + + // The name is stored after the class itself. + CopyStringRef(Mem + sizeof(MemBuffer), NameRef); + + // The buffer begins after the name and must be aligned. + char *Buf = Mem + AlignedStringLen; + Buf[Size] = 0; // Null terminate buffer.
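// Editorial sketch of the single allocation assembled above, assuming the
// 16-byte alignment quantum used in the code:
//
//   Mem ->  +--------------------------------+
//           | MemBuffer object               |
//           | name bytes + '\0'              |
//           | padding up to AlignedStringLen |
//   Buf ->  | Size data bytes                |
//           | '\0' terminator                |
//           +--------------------------------+
//
// getBufferIdentifier() recovers the name by treating the bytes directly
// after the object (`this + 1`) as a C string, which is why the name is
// copied immediately behind the class and the data buffer is placed on the
// next 16-byte boundary after it.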
+ + auto *Ret = new (Mem) MemBuffer(StringRef(Buf, Size), true); + return std::unique_ptr<WritableMemoryBuffer>(Ret); +} + static bool shouldUseMmap(int FD, size_t FileSize, size_t MapSize, @@ -332,7 +361,8 @@ static bool shouldUseMmap(int FD, return true; } -static ErrorOr<std::unique_ptr<MemoryBuffer>> +template <typename MB> +static ErrorOr<std::unique_ptr<MB>> getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize, uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator, bool IsVolatile) { @@ -364,22 +394,21 @@ getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize, if (shouldUseMmap(FD, FileSize, MapSize, Offset, RequiresNullTerminator, PageSize, IsVolatile)) { std::error_code EC; - std::unique_ptr<MemoryBuffer> Result( - new (NamedBufferAlloc(Filename)) - MemoryBufferMMapFile(RequiresNullTerminator, FD, MapSize, Offset, EC)); + std::unique_ptr<MB> Result( + new (NamedBufferAlloc(Filename)) MemoryBufferMMapFile<MB>( + RequiresNullTerminator, FD, MapSize, Offset, EC)); if (!EC) return std::move(Result); } - std::unique_ptr<MemoryBuffer> Buf = - MemoryBuffer::getNewUninitMemBuffer(MapSize, Filename); + auto Buf = WritableMemoryBuffer::getNewUninitMemBuffer(MapSize, Filename); if (!Buf) { // Failed to create a buffer. The only way it can fail is if // new(std::nothrow) returns 0. return make_error_code(errc::not_enough_memory); } - char *BufPtr = const_cast<char *>(Buf->getBufferStart()); + char *BufPtr = Buf.get()->getBufferStart(); size_t BytesLeft = MapSize; #ifndef HAVE_PREAD @@ -412,7 +441,7 @@ getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize, ErrorOr<std::unique_ptr<MemoryBuffer>> MemoryBuffer::getOpenFile(int FD, const Twine &Filename, uint64_t FileSize, bool RequiresNullTerminator, bool IsVolatile) { - return getOpenFileImpl(FD, Filename, FileSize, FileSize, 0, + return getOpenFileImpl<MemoryBuffer>(FD, Filename, FileSize, FileSize, 0, RequiresNullTerminator, IsVolatile); } @@ -420,7 +449,8 @@ ErrorOr<std::unique_ptr<MemoryBuffer>> MemoryBuffer::getOpenFileSlice(int FD, const Twine &Filename, uint64_t MapSize, int64_t Offset, bool IsVolatile) { assert(MapSize != uint64_t(-1)); - return getOpenFileImpl(FD, Filename, -1, MapSize, Offset, false, IsVolatile); + return getOpenFileImpl<MemoryBuffer>(FD, Filename, -1, MapSize, Offset, false, + IsVolatile); } ErrorOr<std::unique_ptr<MemoryBuffer>> MemoryBuffer::getSTDIN() { diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp index 90992fce0bcc..9ba7a09f9962 100644 --- a/lib/Support/StringRef.cpp +++ b/lib/Support/StringRef.cpp @@ -586,7 +586,7 @@ bool StringRef::getAsDouble(double &Result, bool AllowInexact) const { APFloat::opStatus Status = F.convertFromString(*this, APFloat::rmNearestTiesToEven); if (Status != APFloat::opOK) { - if (!AllowInexact || Status != APFloat::opInexact) + if (!AllowInexact || !(Status & APFloat::opInexact)) return true; } diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp index c59068cb3550..b96ca084e9bf 100644 --- a/lib/Support/TargetParser.cpp +++ b/lib/Support/TargetParser.cpp @@ -537,7 +537,7 @@ StringRef llvm::AArch64::getDefaultCPU(StringRef Arch) { } unsigned llvm::AArch64::checkArchVersion(StringRef Arch) { - if (Arch[0] == 'v' && std::isdigit(Arch[1])) + if (Arch.size() >= 2 && Arch[0] == 'v' && std::isdigit(Arch[1])) return (Arch[1] - 48); return 0; } @@ -633,7 +633,7 @@ StringRef llvm::ARM::getCanonicalArchName(StringRef Arch) { // Only match non-marketing names if (offset != StringRef::npos) { // Must start with 'vN'. - if (A[0] != 'v' || !std::isdigit(A[1])) + if (A.size() >= 2 && (A[0] != 'v' || !std::isdigit(A[1]))) return Error; // Can't have an extra 'eb'.
   if (A.find("eb") != StringRef::npos)
@@ -739,7 +739,6 @@ ARM::ProfileKind ARM::parseArchProfile(StringRef Arch) {
   case ARM::ArchKind::ARMV8_2A:
   case ARM::ArchKind::ARMV8_3A:
     return ARM::ProfileKind::A;
-    LLVM_FALLTHROUGH;
   case ARM::ArchKind::ARMV2:
   case ARM::ArchKind::ARMV2A:
   case ARM::ArchKind::ARMV3:
diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp
index 05ca40f03018..f8a80ba87873 100644
--- a/lib/Support/YAMLTraits.cpp
+++ b/lib/Support/YAMLTraits.cpp
@@ -657,7 +657,12 @@ void Output::scalarString(StringRef &S, QuotingType MustQuote) {
       }
       i = j + 1;
     } else if (MustQuote == QuotingType::Double &&
-               !sys::unicode::isPrintable(S[j])) {
+               !sys::unicode::isPrintable(S[j]) && (S[j] & 0x80) == 0) {
+      // If we're double quoting non-printable characters, we prefer printing
+      // them as "\x" + their hex representation. Note that special casing is
+      // needed for UTF-8, where a byte may be part of a UTF-8 sequence and
+      // appear as non-printable, in which case we want to print the correct
+      // unicode character and not its hex representation.
       output(StringRef(&Base[i], j - i)); // "flush"
       output(StringLiteral("\\x"));
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 67138f41dda8..2ff2ee347f56 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -583,6 +583,20 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   switch (MI->getOpcode()) {
   default:
     break;
+  case AArch64::MOVIv2d_ns:
+    // If the target has the zero-cycle zeroing FP workaround, lower this
+    // instruction to movi.16b instead.
+    if (STI->hasZeroCycleZeroingFPWorkaround() &&
+        MI->getOperand(1).getImm() == 0) {
+      MCInst TmpInst;
+      TmpInst.setOpcode(AArch64::MOVIv16b_ns);
+      TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
+      TmpInst.addOperand(MCOperand::createImm(MI->getOperand(1).getImm()));
+      EmitToStreamer(*OutStreamer, TmpInst);
+      return;
+    }
+    break;
+
   case AArch64::DBG_VALUE: {
     if (isVerbose() && OutStreamer->hasRawTextSupport()) {
       SmallString<128> TmpStr;
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index fd1699fd363d..022200986d2b 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -5135,11 +5135,12 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {
     return selectAtomicCmpXchg(cast<AtomicCmpXchgInst>(I));
   }

-  // fall-back to target-independent instruction selection.
-  return selectOperator(I, I->getOpcode());
   // Silence warnings.
   (void)&CC_AArch64_DarwinPCS_VarArg;
   (void)&CC_AArch64_Win64_VarArg;
+
+  // fall-back to target-independent instruction selection.
+ return selectOperator(I, I->getOpcode()); } namespace llvm { diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 73944359223a..d66f7b59a4b5 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -97,6 +97,7 @@ #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LivePhysRegs.h" @@ -335,6 +336,22 @@ bool AArch64FrameLowering::canUseAsPrologue( return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister; } +static bool windowsRequiresStackProbe(MachineFunction &MF, + unsigned StackSizeInBytes) { + const AArch64Subtarget &Subtarget = MF.getSubtarget(); + if (!Subtarget.isTargetWindows()) + return false; + const Function &F = MF.getFunction(); + // TODO: When implementing stack protectors, take that into account + // for the probe threshold. + unsigned StackProbeSize = 4096; + if (F.hasFnAttribute("stack-probe-size")) + F.getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + return StackSizeInBytes >= StackProbeSize; +} + bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( MachineFunction &MF, unsigned StackBumpBytes) const { AArch64FunctionInfo *AFI = MF.getInfo(); @@ -347,7 +364,7 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( // 512 is the maximum immediate for stp/ldp that will be used for // callee-save save/restores - if (StackBumpBytes >= 512) + if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes)) return false; if (MFI.hasVarSizedObjects()) @@ -478,7 +495,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, return; int NumBytes = (int)MFI.getStackSize(); - if (!AFI->hasStackFrame()) { + if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) { assert(!HasFP && "unexpected function without stack frame but with FP"); // All of the stack allocation is for locals. @@ -550,6 +567,44 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineInstr::FrameSetup); } + if (windowsRequiresStackProbe(MF, NumBytes)) { + uint32_t NumWords = NumBytes >> 4; + + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15) + .addImm(NumWords) + .setMIFlags(MachineInstr::FrameSetup); + + switch (MF.getTarget().getCodeModel()) { + case CodeModel::Small: + case CodeModel::Medium: + case CodeModel::Kernel: + BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)) + .addExternalSymbol("__chkstk") + .addReg(AArch64::X15, RegState::Implicit) + .setMIFlags(MachineInstr::FrameSetup); + break; + case CodeModel::Large: + BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT)) + .addReg(AArch64::X16, RegState::Define) + .addExternalSymbol("__chkstk") + .addExternalSymbol("__chkstk") + .setMIFlags(MachineInstr::FrameSetup); + + BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR)) + .addReg(AArch64::X16, RegState::Kill) + .addReg(AArch64::X15, RegState::Implicit | RegState::Define) + .setMIFlags(MachineInstr::FrameSetup); + break; + } + + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP) + .addReg(AArch64::SP, RegState::Kill) + .addReg(AArch64::X15, RegState::Kill) + .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4)) + .setMIFlags(MachineInstr::FrameSetup); + NumBytes = 0; + } + // Allocate space for the rest of the frame. 
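Context for the __chkstk sequence above: on Windows on AArch64, the helper expects the allocation size in 16-byte units in x15, which is why the prologue materializes NumBytes >> 4 and later rebuilds the byte count with the extended subtract (UXTX with a left shift of 4). The threshold test itself reduces to a small predicate; a sketch with a stubbed attribute lookup standing in for LLVM's Function API:

    #include <cstdint>
    #include <string>

    // Stub standing in for Function::getFnAttribute(); always "absent" here.
    struct FuncAttrs {
      bool has(const std::string &) const { return false; }
      std::string get(const std::string &) const { return "4096"; }
    };

    bool windowsRequiresStackProbe(bool IsTargetWindows, const FuncAttrs &F,
                                   uint64_t StackSizeInBytes) {
      if (!IsTargetWindows)
        return false;
      uint64_t StackProbeSize = 4096;  // one guard page, the Windows default
      if (F.has("stack-probe-size"))   // functions may override the threshold
        StackProbeSize = std::stoull(F.get("stack-probe-size"));
      return StackSizeInBytes >= StackProbeSize;
    }

Frames below the threshold can be allocated with a plain SP adjustment; anything at or above it must touch the guard pages in order, hence the call.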
if (NumBytes) { const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); @@ -1164,18 +1219,32 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned UnspilledCSGPR = AArch64::NoRegister; unsigned UnspilledCSGPRPaired = AArch64::NoRegister; + MachineFrameInfo &MFI = MF.getFrameInfo(); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + + unsigned BasePointerReg = RegInfo->hasBasePointer(MF) + ? RegInfo->getBaseRegister() + : (unsigned)AArch64::NoRegister; + + unsigned SpillEstimate = SavedRegs.count(); + for (unsigned i = 0; CSRegs[i]; ++i) { + unsigned Reg = CSRegs[i]; + unsigned PairedReg = CSRegs[i ^ 1]; + if (Reg == BasePointerReg) + SpillEstimate++; + if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) + SpillEstimate++; + } + SpillEstimate += 2; // Conservatively include FP+LR in the estimate + unsigned StackEstimate = MFI.estimateStackSize(MF) + 8 * SpillEstimate; + // The frame record needs to be created by saving the appropriate registers - if (hasFP(MF)) { + if (hasFP(MF) || windowsRequiresStackProbe(MF, StackEstimate)) { SavedRegs.set(AArch64::FP); SavedRegs.set(AArch64::LR); } - unsigned BasePointerReg = AArch64::NoRegister; - if (RegInfo->hasBasePointer(MF)) - BasePointerReg = RegInfo->getBaseRegister(); - unsigned ExtraCSSpill = 0; - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); // Figure out which callee-saved registers to save/restore. for (unsigned i = 0; CSRegs[i]; ++i) { const unsigned Reg = CSRegs[i]; @@ -1217,7 +1286,6 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. - MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled; DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF); diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 1242cf5be188..6f7b2b6fd5b5 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -470,10 +470,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (Subtarget->hasPerfMon()) setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - if (Subtarget->isTargetMachO()) { - // For iOS, we don't want to the normal expansion of a libcall to - // sincos. We want to issue a libcall to __sincos_stret to avoid memory - // traffic. + if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && + getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { + // Issue __sincos_stret if available. setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } else { @@ -2328,8 +2327,9 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, Entry.IsZExt = false; Args.push_back(Entry); - const char *LibcallName = - (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; + RTLIB::Libcall LC = ArgVT == MVT::f64 ? 
RTLIB::SINCOS_STRET_F64 + : RTLIB::SINCOS_STRET_F32; + const char *LibcallName = getLibcallName(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index c7c560a81328..abbba7d1d5a9 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -4963,16 +4963,9 @@ void AArch64InstrInfo::insertOutlinerEpilogue( MachineBasicBlock &MBB, MachineFunction &MF, const MachineOutlinerInfo &MInfo) const { - bool ContainsCalls = false; - - for (MachineInstr &MI : MBB) { - if (MI.isCall()) { - ContainsCalls = true; - break; - } - } - - if (ContainsCalls) { + // Is there a call in the outlined range? + if (std::any_of(MBB.instr_begin(), MBB.instr_end(), + [](MachineInstr &MI) { return MI.isCall(); })) { // Fix up the instructions in the range, since we're going to modify the // stack. fixupPostOutline(MBB); diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 7f5507371fa0..a719d47618e5 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -25,11 +25,11 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( ConstantSDNode *SizeValue = dyn_cast(Size); const AArch64Subtarget &STI = DAG.getMachineFunction().getSubtarget(); - const char *bzeroEntry = - (V && V->isNullValue()) ? STI.getBZeroEntry() : nullptr; + const char *bzeroName = (V && V->isNullValue()) + ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO) : nullptr; // For small size (< 256), it is not beneficial to use bzero // instead of memset. - if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { + if (bzeroName && (!SizeValue || SizeValue->getZExtValue() > 256)) { const AArch64TargetLowering &TLI = *STI.getTargetLowering(); EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); @@ -45,7 +45,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( CLI.setDebugLoc(dl) .setChain(Chain) .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroEntry, IntPtr), + DAG.getExternalSymbol(bzeroName, IntPtr), std::move(Args)) .setDiscardResult(); std::pair CallResult = TLI.LowerCallTo(CLI); diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index e397d585ae77..688bb936d0ca 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -217,19 +217,6 @@ unsigned char AArch64Subtarget::classifyGlobalFunctionReference( return AArch64II::MO_NO_FLAG; } -/// This function returns the name of a function which has an interface -/// like the non-standard bzero function, if such a function exists on -/// the current subtarget and it is considered prefereable over -/// memset with zero passed as the second argument. Otherwise it -/// returns null. -const char *AArch64Subtarget::getBZeroEntry() const { - // Prefer bzero on Darwin only. 
- if(isTargetDarwin()) - return "bzero"; - - return nullptr; -} - void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const { // LNT run (at least on Cyclone) showed reasonably significant gains for diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 5d9759d363dd..9245b2f396b7 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -309,13 +309,6 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const; - /// This function returns the name of a function which has an interface - /// like the non-standard bzero function, if such a function exists on - /// the current subtarget and it is considered prefereable over - /// memset with zero passed as the second argument. Otherwise it - /// returns null. - const char *getBZeroEntry() const; - void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td index df939add70fa..66b7e02ceb99 100644 --- a/lib/Target/AArch64/AArch64SystemOperands.td +++ b/lib/Target/AArch64/AArch64SystemOperands.td @@ -322,6 +322,9 @@ def : ROSysReg<"PMCEID0_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b110>; def : ROSysReg<"PMCEID1_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b111>; def : ROSysReg<"MIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b000>; def : ROSysReg<"CCSIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b000>; +def : ROSysReg<"CCSIDR2_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b010> { + let Requires = [{ {AArch64::HasV8_3aOps} }]; +} def : ROSysReg<"CLIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b001>; def : ROSysReg<"CTR_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b001>; def : ROSysReg<"MPIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b101>; diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 64583ead73f2..0e6ad944c141 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -346,10 +346,9 @@ class AArch64PassConfig : public TargetPassConfig { } // end anonymous namespace -TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo(AArch64TTIImpl(this, F)); - }); +TargetTransformInfo +AArch64TargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(AArch64TTIImpl(this, F)); } TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index 2bbfb2da3db6..8d28a5e30ebf 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -44,8 +44,7 @@ class AArch64TargetMachine : public LLVMTargetMachine { // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - /// \brief Get the TargetIRAnalysis for this target. 
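The TargetMachine change above, repeated for every backend in this patch, inverts the old interface: rather than each target building its own TargetIRAnalysis from a lambda, the target overrides one per-function hook and the wrapping happens once in the base class. Roughly, with stand-in types:

    #include <functional>

    struct Function {};
    struct TargetTransformInfo {
      explicit TargetTransformInfo(int /*Impl*/) {}
    };

    struct TargetMachine {
      virtual ~TargetMachine() = default;

      // New hook: each backend overrides just this.
      virtual TargetTransformInfo getTargetTransformInfo(const Function &) {
        return TargetTransformInfo(0);  // base default: no target-specific info
      }

      // The lambda boilerplate the backends used to duplicate lives here now.
      std::function<TargetTransformInfo(const Function &)> getTargetIRAnalysis() {
        return [this](const Function &F) { return getTargetTransformInfo(F); };
      }
    };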
- TargetIRAnalysis getTargetIRAnalysis() override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; TargetLoweringObjectFile* getObjFileLowering() const override { return TLOF.get(); diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index aeffbd70fc81..6e63783e5646 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -1975,10 +1975,6 @@ static bool isValidSVEKind(StringRef Name) { .Default(false); } -static bool isSVERegister(StringRef Name) { - return Name[0] == 'z' || Name[0] == 'p'; -} - static void parseValidVectorKind(StringRef Name, unsigned &NumElements, char &ElementKind) { assert(isValidVectorKind(Name)); @@ -2008,21 +2004,19 @@ bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, // Matches a register name or register alias previously defined by '.req' unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name, RegKind Kind) { - unsigned RegNum; - switch (Kind) { - case RegKind::Scalar: - RegNum = MatchRegisterName(Name); - break; - case RegKind::NeonVector: - RegNum = MatchNeonVectorRegName(Name); - break; - case RegKind::SVEDataVector: - RegNum = matchSVEDataVectorRegName(Name); - break; - case RegKind::SVEPredicateVector: - RegNum = matchSVEPredicateVectorRegName(Name); - break; - } + unsigned RegNum = 0; + if ((RegNum = matchSVEDataVectorRegName(Name))) + return Kind == RegKind::SVEDataVector ? RegNum : 0; + + if ((RegNum = matchSVEPredicateVectorRegName(Name))) + return Kind == RegKind::SVEPredicateVector ? RegNum : 0; + + if ((RegNum = MatchNeonVectorRegName(Name))) + return Kind == RegKind::NeonVector ? RegNum : 0; + + // The parsed register must be of RegKind Scalar + if ((RegNum = MatchRegisterName(Name))) + return Kind == RegKind::Scalar ? RegNum : 0; if (!RegNum) { // Check for aliases registered via .req. Canonicalize to lower case. @@ -2049,10 +2043,8 @@ int AArch64AsmParser::tryParseRegister() { return -1; std::string lowerCase = Tok.getString().lower(); - if (isSVERegister(lowerCase)) - return -1; - unsigned RegNum = matchRegisterNameAlias(lowerCase, RegKind::Scalar); + // Also handle a few aliases of registers. 
if (RegNum == 0) RegNum = StringSwitch(lowerCase) diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index bb628b8c558f..fda6252f46e3 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -695,18 +695,24 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( IsSGPR = false; Width = 3; } else if (AMDGPU::SReg_128RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_128RegClass.contains(Reg) && + "trap handler registers should not be used"); IsSGPR = true; Width = 4; } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { IsSGPR = false; Width = 4; } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_256RegClass.contains(Reg) && + "trap handler registers should not be used"); IsSGPR = true; Width = 8; } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { IsSGPR = false; Width = 8; } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { + assert(!AMDGPU::TTMP_512RegClass.contains(Reg) && + "trap handler registers should not be used"); IsSGPR = true; Width = 16; } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 3f8a9b1964ca..5c31bddd9b1a 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -202,6 +202,16 @@ class AMDGPUTargetLowering : public TargetLowering { const char* getTargetNodeName(unsigned Opcode) const override; + // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection + // for AMDGPU. + // A commit ( git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319036 + // 91177308-0d34-0410-b5e6-96231b3b80d8 ) turned on + // MergeConsecutiveStores() before Instruction Selection for all targets. + // Enough AMDGPU compiles go into an infinite loop ( MergeConsecutiveStores() + // merges two stores; LegalizeStoreOps() un-merges; MergeConsecutiveStores() + // re-merges, etc. ) to warrant turning it off for now. 
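A deliberately toy model of the loop the FIXME above describes, using made-up widths; the iteration cap stands in for "compiles forever". Disabling the merge half, which is what the mergeStoresAfterLegalization() override immediately below does for AMDGPU, is what breaks the cycle:

    #include <cstdio>
    #include <vector>

    struct Store { int Width; };

    // Returns how many rounds ran; it hits the cap when the two phases keep
    // undoing each other, which is the infinite loop seen on AMDGPU.
    int combineThenLegalize(std::vector<Store> Stores, int MaxLegalWidth) {
      int Rounds = 0;
      bool Changed = true;
      while (Changed && Rounds < 8) {        // cap so the sketch terminates
        Changed = false;
        if (Stores.size() == 2) {            // "MergeConsecutiveStores"
          Stores = {Store{Stores[0].Width * 2}};
          Changed = true;
        }
        if (Stores.size() == 1 && Stores[0].Width > MaxLegalWidth) {
          Stores = {Store{Stores[0].Width / 2},   // "LegalizeStoreOps"
                    Store{Stores[0].Width / 2}};  // splits it right back
          Changed = true;
        }
        ++Rounds;
      }
      return Rounds;
    }

    int main() {
      std::printf("%d rounds\n", combineThenLegalize({{32}, {32}}, 32));
    }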
+ bool mergeStoresAfterLegalization() const override { return false; } + bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override { return true; } diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 6984f4e71613..2042dbf6d5e2 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -571,10 +571,9 @@ class GCNPassConfig final : public AMDGPUPassConfig { } // end anonymous namespace -TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo(AMDGPUTTIImpl(this, F)); - }); +TargetTransformInfo +AMDGPUTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(AMDGPUTTIImpl(this, F)); } void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { @@ -898,4 +897,3 @@ void GCNPassConfig::addPreEmitPass() { TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { return new GCNPassConfig(*this, PM); } - diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 5043e31f6f5b..5f9b2a7fca20 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -55,7 +55,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { return &IntrinsicInfo; } - TargetIRAnalysis getTargetIRAnalysis() override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 2acd7f78faea..ebf656c549ec 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -536,6 +536,10 @@ class AMDGPUOperand : public MCParsedAsmOperand { return EndLoc; } + SMRange getLocRange() const { + return SMRange(StartLoc, EndLoc); + } + Modifiers getModifiers() const { assert(isRegKind() || isImmTy(ImmTyNone)); return isRegKind() ? Reg.Mods : Imm.Mods; @@ -1491,6 +1495,8 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { case 1: return AMDGPU::TTMP_32RegClassID; case 2: return AMDGPU::TTMP_64RegClassID; case 4: return AMDGPU::TTMP_128RegClassID; + case 8: return AMDGPU::TTMP_256RegClassID; + case 16: return AMDGPU::TTMP_512RegClassID; } } else if (Is == IS_SGPR) { switch (RegWidth) { @@ -1498,8 +1504,8 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) { case 1: return AMDGPU::SGPR_32RegClassID; case 2: return AMDGPU::SGPR_64RegClassID; case 4: return AMDGPU::SGPR_128RegClassID; - case 8: return AMDGPU::SReg_256RegClassID; - case 16: return AMDGPU::SReg_512RegClassID; + case 8: return AMDGPU::SGPR_256RegClassID; + case 16: return AMDGPU::SGPR_512RegClassID; } } return -1; @@ -1754,6 +1760,11 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) { // TODO: add syntactic sugar for 1/(2*PI) bool Minus = false; if (getLexer().getKind() == AsmToken::Minus) { + const AsmToken NextToken = getLexer().peekTok(); + if (!NextToken.is(AsmToken::Integer) && + !NextToken.is(AsmToken::Real)) { + return MatchOperand_NoMatch; + } Minus = true; Parser.Lex(); } @@ -1783,7 +1794,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) { return MatchOperand_Success; } default: - return Minus ? 
MatchOperand_ParseFail : MatchOperand_NoMatch; + return MatchOperand_NoMatch; } } @@ -2244,6 +2255,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, return true; } +static std::string AMDGPUMnemonicSpellCheck(StringRef S, uint64_t FBS, + unsigned VariantID = 0); + bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -2286,8 +2300,13 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_MissingFeature: return Error(IDLoc, "instruction not supported on this GPU"); - case Match_MnemonicFail: - return Error(IDLoc, "unrecognized instruction mnemonic"); + case Match_MnemonicFail: { + uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); + std::string Suggestion = AMDGPUMnemonicSpellCheck( + ((AMDGPUOperand &)*Operands[0]).getToken(), FBS); + return Error(IDLoc, "invalid instruction" + Suggestion, + ((AMDGPUOperand &)*Operands[0]).getLocRange()); + } case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; @@ -3838,7 +3857,9 @@ AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) { return Ok? MatchOperand_Success : MatchOperand_ParseFail; } else { - return MatchOperand_NoMatch; + // Swizzle "offset" operand is optional. + // If it is omitted, try parsing other optional operands. + return parseOptionalOperand(Operands); } } @@ -4786,6 +4807,7 @@ extern "C" void LLVMInitializeAMDGPUAsmParser() { #define GET_REGISTER_MATCHER #define GET_MATCHER_IMPLEMENTATION +#define GET_MNEMONIC_SPELL_CHECKER #include "AMDGPUGenAsmMatcher.inc" // This fuction should be defined after auto-generated include so that we have diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 4a3f2c975179..47a2d3f2fdc5 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -348,10 +348,12 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID, case AMDGPU::TTMP_128RegClassID: // ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in // this bundle? - case AMDGPU::SReg_256RegClassID: - // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in + case AMDGPU::SGPR_256RegClassID: + case AMDGPU::TTMP_256RegClassID: + // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in // this bundle? - case AMDGPU::SReg_512RegClassID: + case AMDGPU::SGPR_512RegClassID: + case AMDGPU::TTMP_512RegClassID: shift = 2; break; // ToDo: unclear if s[88:104] is available on VI. 
Can we use VCC as SGPR in @@ -441,11 +443,11 @@ MCOperand AMDGPUDisassembler::decodeOperand_SReg_128(unsigned Val) const { } MCOperand AMDGPUDisassembler::decodeOperand_SReg_256(unsigned Val) const { - return createSRegOperand(AMDGPU::SReg_256RegClassID, Val); + return decodeDstOp(OPW256, Val); } MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const { - return createSRegOperand(AMDGPU::SReg_512RegClassID, Val); + return decodeDstOp(OPW512, Val); } MCOperand AMDGPUDisassembler::decodeLiteralConstant() const { @@ -593,6 +595,8 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const { return SGPR_32RegClassID; case OPW64: return SGPR_64RegClassID; case OPW128: return SGPR_128RegClassID; + case OPW256: return SGPR_256RegClassID; + case OPW512: return SGPR_512RegClassID; } } @@ -608,6 +612,8 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const { return TTMP_32RegClassID; case OPW64: return TTMP_64RegClassID; case OPW128: return TTMP_128RegClassID; + case OPW256: return TTMP_256RegClassID; + case OPW512: return TTMP_512RegClassID; } } @@ -659,6 +665,25 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c } } +MCOperand AMDGPUDisassembler::decodeDstOp(const OpWidthTy Width, unsigned Val) const { + using namespace AMDGPU::EncValues; + + assert(Val < 128); + assert(Width == OPW256 || Width == OPW512); + + if (Val <= SGPR_MAX) { + assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning. + return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN); + } + + int TTmpIdx = getTTmpIdx(Val); + if (TTmpIdx >= 0) { + return createSRegOperand(getTtmpClassId(Width), TTmpIdx); + } + + llvm_unreachable("unknown dst register"); +} + MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { using namespace AMDGPU; diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index ce396eb68c4c..75cfc5e11282 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -95,6 +95,8 @@ class AMDGPUDisassembler : public MCDisassembler { OPW32, OPW64, OPW128, + OPW256, + OPW512, OPW16, OPWV216, OPW_LAST_, @@ -110,6 +112,7 @@ class AMDGPUDisassembler : public MCDisassembler { MCOperand decodeLiteralConstant() const; MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const; + MCOperand decodeDstOp(const OpWidthTy Width, unsigned Val) const; MCOperand decodeSpecialReg32(unsigned Val) const; MCOperand decodeSpecialReg64(unsigned Val) const; diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index 67663d39967c..bf57f88bef91 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -335,13 +335,13 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo)) { O << 'v'; NumRegs = 8; - } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(RegNo)) { + } else if (MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo)) { O << 's'; NumRegs = 8; } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo)) { O << 'v'; NumRegs = 16; - } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(RegNo)) { + } else if (MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo)) { O << 's'; NumRegs = 16; } else { diff 
--git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index 6b7c3ffb7bb8..dd0efef7f91b 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -7,6 +7,26 @@ // //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// Helpers +//===----------------------------------------------------------------------===// + +class getSubRegs { + list ret2 = [sub0, sub1]; + list ret3 = [sub0, sub1, sub2]; + list ret4 = [sub0, sub1, sub2, sub3]; + list ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7]; + list ret16 = [sub0, sub1, sub2, sub3, + sub4, sub5, sub6, sub7, + sub8, sub9, sub10, sub11, + sub12, sub13, sub14, sub15]; + + list ret = !if(!eq(size, 2), ret2, + !if(!eq(size, 3), ret3, + !if(!eq(size, 4), ret4, + !if(!eq(size, 8), ret8, ret16)))); +} + //===----------------------------------------------------------------------===// // Declarations that describe the SI registers //===----------------------------------------------------------------------===// @@ -141,19 +161,19 @@ def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, } // SGPR 64-bit registers -def SGPR_64Regs : RegisterTuples<[sub0, sub1], +def SGPR_64Regs : RegisterTuples.ret, [(add (decimate SGPR_32, 2)), (add (decimate (shl SGPR_32, 1), 2))]>; // SGPR 128-bit registers -def SGPR_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], +def SGPR_128Regs : RegisterTuples.ret, [(add (decimate SGPR_32, 4)), (add (decimate (shl SGPR_32, 1), 4)), (add (decimate (shl SGPR_32, 2), 4)), (add (decimate (shl SGPR_32, 3), 4))]>; // SGPR 256-bit registers -def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], +def SGPR_256Regs : RegisterTuples.ret, [(add (decimate SGPR_32, 4)), (add (decimate (shl SGPR_32, 1), 4)), (add (decimate (shl SGPR_32, 2), 4)), @@ -164,8 +184,7 @@ def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], (add (decimate (shl SGPR_32, 7), 4))]>; // SGPR 512-bit registers -def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, - sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], +def SGPR_512Regs : RegisterTuples.ret, [(add (decimate SGPR_32, 4)), (add (decimate (shl SGPR_32, 1), 4)), (add (decimate (shl SGPR_32, 2), 4)), @@ -190,47 +209,125 @@ def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32, } // Trap handler TMP 64-bit registers -def TTMP_64Regs : RegisterTuples<[sub0, sub1], +def TTMP_64Regs : RegisterTuples.ret, [(add (decimate TTMP_32, 2)), (add (decimate (shl TTMP_32, 1), 2))]>; // Trap handler TMP 128-bit registers -def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], +def TTMP_128Regs : RegisterTuples.ret, [(add (decimate TTMP_32, 4)), (add (decimate (shl TTMP_32, 1), 4)), (add (decimate (shl TTMP_32, 2), 4)), (add (decimate (shl TTMP_32, 3), 4))]>; -class TmpRegTuples ("TTMP"#Index0#tgt), - Register r1 = !cast("TTMP"#Index1#tgt), - Register r2 = !cast("TTMP"#Index2#tgt), - Register r3 = !cast("TTMP"#Index3#tgt)> : - RegisterWithSubRegs { - let SubRegIndices = !if(Is64Bit, [sub0, sub1], [sub0, sub1, sub2, sub3]); - let HWEncoding = r0.HWEncoding; +def TTMP_256Regs : RegisterTuples.ret, + [(add (decimate TTMP_32, 4)), + (add (decimate (shl TTMP_32, 1), 4)), + (add (decimate (shl TTMP_32, 2), 4)), + (add (decimate (shl TTMP_32, 3), 4)), + (add (decimate (shl TTMP_32, 4), 4)), + (add (decimate (shl TTMP_32, 5), 4)), + (add 
(decimate (shl TTMP_32, 6), 4)), + (add (decimate (shl TTMP_32, 7), 4))]>; + +def TTMP_512Regs : RegisterTuples.ret, + [(add (decimate TTMP_32, 4)), + (add (decimate (shl TTMP_32, 1), 4)), + (add (decimate (shl TTMP_32, 2), 4)), + (add (decimate (shl TTMP_32, 3), 4)), + (add (decimate (shl TTMP_32, 4), 4)), + (add (decimate (shl TTMP_32, 5), 4)), + (add (decimate (shl TTMP_32, 6), 4)), + (add (decimate (shl TTMP_32, 7), 4)), + (add (decimate (shl TTMP_32, 8), 4)), + (add (decimate (shl TTMP_32, 9), 4)), + (add (decimate (shl TTMP_32, 10), 4)), + (add (decimate (shl TTMP_32, 11), 4)), + (add (decimate (shl TTMP_32, 12), 4)), + (add (decimate (shl TTMP_32, 13), 4)), + (add (decimate (shl TTMP_32, 14), 4)), + (add (decimate (shl TTMP_32, 15), 4))]>; + +class TmpRegTuplesBase subRegs, + list indices = getSubRegs.ret, + int index1 = !add(index, !add(size, -1)), + string name = "ttmp["#index#":"#index1#"]"> : + RegisterWithSubRegs { + let HWEncoding = subRegs[0].HWEncoding; + let SubRegIndices = indices; } +class TmpRegTuples("TTMP"#index0#tgt), + Register r1 = !cast("TTMP"#index1#tgt), + Register r2 = !cast("TTMP"#index2#tgt), + Register r3 = !cast("TTMP"#index3#tgt), + Register r4 = !cast("TTMP"#index4#tgt), + Register r5 = !cast("TTMP"#index5#tgt), + Register r6 = !cast("TTMP"#index6#tgt), + Register r7 = !cast("TTMP"#index7#tgt)> : + TmpRegTuplesBase.ret>; + foreach Index = {0, 2, 4, 6, 8, 10, 12, 14} in { - def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 1, Index>; - def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 1, Index>; + def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>; + def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 2, Index>; } foreach Index = {0, 4, 8, 12} in { def TTMP#Index#_TTMP#!add(Index,1)# _TTMP#!add(Index,2)# - _TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi", 0, Index>; + _TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi", 4, Index>; def TTMP#Index#_TTMP#!add(Index,1)# _TTMP#!add(Index,2)# - _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 0, Index>; + _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 4, Index>; } +foreach Index = {0, 4, 8} in { + def TTMP#Index#_TTMP#!add(Index,1)# + _TTMP#!add(Index,2)# + _TTMP#!add(Index,3)# + _TTMP#!add(Index,4)# + _TTMP#!add(Index,5)# + _TTMP#!add(Index,6)# + _TTMP#!add(Index,7)#_vi : TmpRegTuples<"_vi", 8, Index>; + def TTMP#Index#_TTMP#!add(Index,1)# + _TTMP#!add(Index,2)# + _TTMP#!add(Index,3)# + _TTMP#!add(Index,4)# + _TTMP#!add(Index,5)# + _TTMP#!add(Index,6)# + _TTMP#!add(Index,7)#_gfx9 : TmpRegTuples<"_gfx9", 8, Index>; +} + +def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_vi : + TmpRegTuplesBase<0, 16, + [TTMP0_vi, TTMP1_vi, TTMP2_vi, TTMP3_vi, + TTMP4_vi, TTMP5_vi, TTMP6_vi, TTMP7_vi, + TTMP8_vi, TTMP9_vi, TTMP10_vi, TTMP11_vi, + TTMP12_vi, TTMP13_vi, TTMP14_vi, TTMP15_vi]>; + +def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9 : + TmpRegTuplesBase<0, 16, + [TTMP0_gfx9, TTMP1_gfx9, TTMP2_gfx9, TTMP3_gfx9, + TTMP4_gfx9, TTMP5_gfx9, TTMP6_gfx9, TTMP7_gfx9, + TTMP8_gfx9, TTMP9_gfx9, TTMP10_gfx9, TTMP11_gfx9, + TTMP12_gfx9, TTMP13_gfx9, TTMP14_gfx9, TTMP15_gfx9]>; + + // VGPR 32-bit registers // i16/f16 only on VI+ def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, @@ -240,25 +337,25 @@ def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, } // VGPR 64-bit registers -def VGPR_64 : RegisterTuples<[sub0, 
sub1], +def VGPR_64 : RegisterTuples.ret, [(add (trunc VGPR_32, 255)), (add (shl VGPR_32, 1))]>; // VGPR 96-bit registers -def VGPR_96 : RegisterTuples<[sub0, sub1, sub2], +def VGPR_96 : RegisterTuples.ret, [(add (trunc VGPR_32, 254)), (add (shl VGPR_32, 1)), (add (shl VGPR_32, 2))]>; // VGPR 128-bit registers -def VGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], +def VGPR_128 : RegisterTuples.ret, [(add (trunc VGPR_32, 253)), (add (shl VGPR_32, 1)), (add (shl VGPR_32, 2)), (add (shl VGPR_32, 3))]>; // VGPR 256-bit registers -def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], +def VGPR_256 : RegisterTuples.ret, [(add (trunc VGPR_32, 249)), (add (shl VGPR_32, 1)), (add (shl VGPR_32, 2)), @@ -269,8 +366,7 @@ def VGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], (add (shl VGPR_32, 7))]>; // VGPR 512-bit registers -def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, - sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], +def VGPR_512 : RegisterTuples.ret, [(add (trunc VGPR_32, 241)), (add (shl VGPR_32, 1)), (add (shl VGPR_32, 2)), @@ -368,13 +464,31 @@ def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, } // End CopyCost = 2 -def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> { +def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> { + let AllocationPriority = 11; +} + +def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> { + let isAllocatable = 0; +} + +def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, + (add SGPR_256, TTMP_256)> { // Requires 4 s_mov_b64 to copy let CopyCost = 4; let AllocationPriority = 11; } -def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512)> { +def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512Regs)> { + let AllocationPriority = 12; +} + +def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add TTMP_512Regs)> { + let isAllocatable = 0; +} + +def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, + (add SGPR_512, TTMP_512)> { // Requires 8 s_mov_b64 to copy let CopyCost = 8; let AllocationPriority = 12; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 819a7add0be4..125a3b22d0cf 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -667,6 +667,10 @@ bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \ CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \ CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \ + CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \ + CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ } #define CASE_CI_VI(node) \ diff --git a/lib/Target/ARC/ARCTargetMachine.cpp b/lib/Target/ARC/ARCTargetMachine.cpp index d2512c281a61..1acae3a88870 100644 --- a/lib/Target/ARC/ARCTargetMachine.cpp +++ b/lib/Target/ARC/ARCTargetMachine.cpp @@ -88,8 +88,7 @@ extern "C" void LLVMInitializeARCTarget() { RegisterTargetMachine X(getTheARCTarget()); } -TargetIRAnalysis ARCTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo(ARCTTIImpl(this, F)); - }); +TargetTransformInfo 
+ARCTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(ARCTTIImpl(this, F)); } diff --git a/lib/Target/ARC/ARCTargetMachine.h b/lib/Target/ARC/ARCTargetMachine.h index 98021b3dc1d5..18117e3409af 100644 --- a/lib/Target/ARC/ARCTargetMachine.h +++ b/lib/Target/ARC/ARCTargetMachine.h @@ -40,7 +40,7 @@ class ARCTargetMachine : public LLVMTargetMachine { // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - TargetIRAnalysis getTargetIRAnalysis() override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index 3aac689c6310..9ffb4c2055f9 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -61,6 +61,7 @@ void initializeARMLoadStoreOptPass(PassRegistry &); void initializeARMPreAllocLoadStoreOptPass(PassRegistry &); void initializeARMConstantIslandsPass(PassRegistry &); void initializeARMExpandPseudoPass(PassRegistry &); +void initializeThumb2SizeReducePass(PassRegistry &); } // end namespace llvm diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index c1a3f639461d..c9766aa2161a 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -83,6 +83,9 @@ def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true", def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true", "Has v7 clrex instruction">; +def FeatureDFB : SubtargetFeature<"dfb", "HasFullDataBarrier", "true", + "Has full data barrier (dfb) instruction">; + def FeatureAcquireRelease : SubtargetFeature<"acquire-release", "HasAcquireRelease", "true", "Has v8 acquire/release (lda/ldaex " @@ -617,6 +620,7 @@ def ARMv83a : Architecture<"armv8.3-a", "ARMv83a", [HasV8_3aOps, def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops, FeatureRClass, FeatureDB, + FeatureDFB, FeatureDSP, FeatureCRC, FeatureMP, diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 0ea435062ec0..60048d4453d8 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -1416,7 +1416,7 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, case MVT::i8: case MVT::i16: needsExt = true; - // Intentional fall-through. + LLVM_FALLTHROUGH; case MVT::i32: if (isThumb2) { if (!UseImm) diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 1b4d7ff50848..aeda7c06a27a 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -1041,7 +1041,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (!Subtarget->isThumb1Only()) setOperationAction(ISD::SETCCE, MVT::i32, Custom); - setOperationAction(ISD::BRCOND, MVT::Other, Expand); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::f32, Custom); setOperationAction(ISD::BR_CC, MVT::f64, Custom); @@ -1084,20 +1084,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } } - // Combine sin / cos into one node or libcall if possible. 
-  if (Subtarget->hasSinCos()) {
-    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
-    setLibcallName(RTLIB::SINCOS_F64, "sincos");
-    if (Subtarget->isTargetWatchABI()) {
-      setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP);
-      setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP);
-    }
-    if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) {
-      // For iOS, we don't want to the normal expansion of a libcall to
-      // sincos. We want to issue a libcall to __sincos_stret.
-      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
-      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
-    }
+  // Use __sincos_stret if available.
+  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
+      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
+    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
   }

   // FP-ARMv8 implements a lot of rounding-like FP operations.
@@ -1255,6 +1246,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {

   case ARMISD::CMOV:          return "ARMISD::CMOV";
   case ARMISD::SSAT:          return "ARMISD::SSAT";
+  case ARMISD::USAT:          return "ARMISD::USAT";

   case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
   case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
@@ -3902,6 +3894,10 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
   return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
 }

+// This function returns three things: the arithmetic computation itself
+// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
+// comparison and the condition code define the case in which the arithmetic
+// computation *does not* overflow.
 std::pair<SDValue, SDValue>
 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
                                  SDValue &ARMcc) const {
@@ -3927,7 +3923,11 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
     break;
   case ISD::UADDO:
     ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
-    Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
+    // We use ADDC here to correspond to its use in LowerUnsignedALUO.
+    // We do not use it in the USUBO case as Value may not be used.
+    Value = DAG.getNode(ARMISD::ADDC, dl,
+                        DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
+                .getValue(0);
     OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
     break;
   case ISD::SSUBO:
@@ -4205,7 +4205,7 @@ static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
           ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal)));
 }

-// Check if two chained conditionals could be converted into SSAT.
+// Check if two chained conditionals could be converted into SSAT or USAT.
 //
 // SSAT can replace a set of two conditional selectors that bound a number to an
 // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
 //
 //     x < k ? (x < -k ? -k : x) : k
 //     etc.
 //
+// USAT works similarly to SSAT but bounds to the interval [0, k] where k + 1 is
+// a power of 2.
+//
 // It returns true if the conversion can be done, false otherwise.
-// Additionally, the variable is returned in parameter V and the constant in K.
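For reference while reading isSaturatingConditional: the two interval shapes correspond to the two instructions, SSAT clamping to [~k, k] (note ~k == -(k + 1)) and USAT clamping to [0, k], with k + 1 a power of two so k is an all-ones mask. In plain C++ terms, a sketch only:

    #include <algorithm>
    #include <cstdint>

    // K must be of the form 2^n - 1 (all ones), as the combine requires.
    int32_t ssatLike(int32_t X, int32_t K) {
      return std::clamp(X, ~K, K);  // ~K == -(K + 1), e.g. K=255 -> [-256, 255]
    }

    int32_t usatLike(int32_t X, int32_t K) {
      return std::clamp(X, 0, K);   // e.g. K=255 -> [0, 255]
    }

countTrailingOnes(K) then recovers the bit count that becomes the instruction's saturation immediate, as in LowerSELECT_CC below.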
+// Additionally, the variable is returned in parameter V, the constant in K and +// usat is set to true if the conditional represents an unsigned saturation static bool isSaturatingConditional(const SDValue &Op, SDValue &V, - uint64_t &K) { + uint64_t &K, bool &usat) { SDValue LHS1 = Op.getOperand(0); SDValue RHS1 = Op.getOperand(1); SDValue TrueVal1 = Op.getOperand(2); @@ -4286,13 +4290,23 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V, int64_t Val1 = cast(*K1)->getSExtValue(); int64_t Val2 = cast(*K2)->getSExtValue(); int64_t PosVal = std::max(Val1, Val2); + int64_t NegVal = std::min(Val1, Val2); if (((Val1 > Val2 && UpperCheckOp == &Op) || (Val1 < Val2 && UpperCheckOp == &Op2)) && - Val1 == ~Val2 && isPowerOf2_64(PosVal + 1)) { + isPowerOf2_64(PosVal + 1)) { + + // Handle the difference between USAT (unsigned) and SSAT (signed) saturation + if (Val1 == ~Val2) + usat = false; + else if (NegVal == 0) + usat = true; + else + return false; V = V2; K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive + return true; } @@ -4306,10 +4320,16 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // Try to convert two saturating conditional selects into a single SSAT SDValue SatValue; uint64_t SatConstant; + bool SatUSat; if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) && - isSaturatingConditional(Op, SatValue, SatConstant)) - return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, - DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); + isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) { + if (SatUSat) + return DAG.getNode(ARMISD::USAT, dl, VT, SatValue, + DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); + else + return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, + DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); + } SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -4506,6 +4526,39 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } +SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Dest = Op.getOperand(2); + SDLoc dl(Op); + + // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction. + unsigned Opc = Cond.getOpcode(); + if (Cond.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO || + Opc == ISD::SSUBO || Opc == ISD::USUBO)) { + // Only lower legal XALUO ops. + if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) + return SDValue(); + + // The actual operation with overflow check. + SDValue Value, OverflowCmp; + SDValue ARMcc; + std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); + + // Reverse the condition code. + ARMCC::CondCodes CondCode = + (ARMCC::CondCodes)cast(ARMcc)->getZExtValue(); + CondCode = ARMCC::getOppositeCondition(CondCode); + ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + + return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, + OverflowCmp); + } + + return SDValue(); +} + SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); @@ -4526,6 +4579,33 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { } } + // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction. 
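The BRCOND/BR_CC lowering introduced in this region lets the flag produced by the overflow-checked arithmetic drive the branch directly, reversing the returned condition code where needed since getARMXALUOOp hands back the no-overflow condition. The scalar shape of what gets selected, a sketch only:

    #include <cstdint>
    #include <cstdio>

    // One addition yields both the value and the predicate; the branch then
    // consumes the flag with no separate comparison instruction.
    bool uaddOverflow(uint32_t A, uint32_t B, uint32_t &Sum) {
      Sum = A + B;
      return Sum < A;  // unsigned wrap == carry out == overflow
    }

    int main() {
      uint32_t S;
      if (uaddOverflow(0xFFFFFFFFu, 2u, S))
        std::puts("overflow path taken");
      else
        std::printf("sum = %u\n", S);
    }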
+ unsigned Opc = LHS.getOpcode(); + if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) && + (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || + Opc == ISD::USUBO) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { + // Only lower legal XALUO ops. + if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) + return SDValue(); + + // The actual operation with overflow check. + SDValue Value, OverflowCmp; + SDValue ARMcc; + std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc); + + if ((CC == ISD::SETNE) != isOneConstant(RHS)) { + // Reverse the condition code. + ARMCC::CondCodes CondCode = + (ARMCC::CondCodes)cast(ARMcc)->getZExtValue(); + CondCode = ARMCC::getOppositeCondition(CondCode); + ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); + } + SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); + + return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, + OverflowCmp); + } + if (LHS.getValueType() == MVT::i32) { SDValue ARMcc; SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); @@ -7523,10 +7603,9 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { Entry.IsZExt = false; Args.push_back(Entry); - const char *LibcallName = - (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; RTLIB::Libcall LC = - (ArgVT == MVT::f64) ? RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32; + (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; + const char *LibcallName = getLibcallName(LC); CallingConv::ID CC = getLibcallCallingConv(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); @@ -7782,6 +7861,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); @@ -13751,7 +13831,7 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, case AtomicOrdering::SequentiallyConsistent: if (!Inst->hasAtomicStore()) return nullptr; // Nothing to do - /*FALLTHROUGH*/ + LLVM_FALLTHROUGH; case AtomicOrdering::Release: case AtomicOrdering::AcquireRelease: if (Subtarget->preferISHSTBarriers()) diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 0a1af8d89f9b..bf63dfae4407 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -87,6 +87,7 @@ class VectorType; CMOV, // ARM conditional move instructions. 
SSAT, // Signed saturation + USAT, // Unsigned saturation BCC_i64, @@ -643,6 +644,7 @@ class VectorType; SDValue LowerUnsignedALUO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 4e13af596300..eb8526bfeadf 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -139,6 +139,8 @@ def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov, def ARMssatnoshift : SDNode<"ARMISD::SSAT", SDTIntSatNoShOp, []>; +def ARMusatnoshift : SDNode<"ARMISD::USAT", SDTIntSatNoShOp, []>; + def ARMbrcond : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond, [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>; @@ -278,6 +280,9 @@ def HasDSP : Predicate<"Subtarget->hasDSP()">, def HasDB : Predicate<"Subtarget->hasDataBarrier()">, AssemblerPredicate<"FeatureDB", "data-barriers">; +def HasDFB : Predicate<"Subtarget->hasFullDataBarrier()">, + AssemblerPredicate<"FeatureDFB", + "full-data-barrier">; def HasV7Clrex : Predicate<"Subtarget->hasV7Clrex()">, AssemblerPredicate<"FeatureV7Clrex", "v7 clrex">; @@ -3832,6 +3837,8 @@ def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos), (USAT imm0_31:$pos, GPRnopc:$a, 0)>; def : ARMPat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm), (SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>; +def : ARMPat<(ARMusatnoshift GPRnopc:$Rn, imm0_31:$imm), + (USAT imm0_31:$imm, GPRnopc:$Rn, 0)>; def : ARMV6Pat<(int_arm_ssat16 GPRnopc:$a, imm1_16:$pos), (SSAT16 imm1_16:$pos, GPRnopc:$a)>; def : ARMV6Pat<(int_arm_usat16 GPRnopc:$a, imm0_15:$pos), @@ -5846,6 +5853,8 @@ include "ARMInstrNEON.td" def : InstAlias<"dmb", (DMB 0xf), 0>, Requires<[IsARM, HasDB]>; def : InstAlias<"dsb", (DSB 0xf), 0>, Requires<[IsARM, HasDB]>; def : InstAlias<"isb", (ISB 0xf), 0>, Requires<[IsARM, HasDB]>; +// Armv8-R 'Data Full Barrier' +def : InstAlias<"dfb", (DSB 0xc), 1>, Requires<[IsARM, HasDFB]>; // System instructions def : MnemonicAlias<"swi", "svc">; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 670ed127da7e..4592249f5795 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -2336,6 +2336,8 @@ def t2USAT16: T2SatI<(ins imm0_15:$sat_imm, rGPR:$Rn), def : T2Pat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm), (t2SSAT imm0_31:$imm, GPRnopc:$Rn, 0)>; +def : T2Pat<(ARMusatnoshift GPRnopc:$Rn, imm0_31:$imm), + (t2USAT imm0_31:$imm, GPRnopc:$Rn, 0)>; def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos), (t2SSAT imm1_32:$pos, GPR:$a, 0)>; def : T2Pat<(int_arm_usat GPR:$a, imm0_31:$pos), @@ -4506,6 +4508,8 @@ def : t2InstAlias<"tst${p} $Rn, $Rm", def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>; def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>; def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>; +// Armv8-R 'Data Full Barrier' +def : InstAlias<"dfb${p}", (t2DSB 0xc, pred:$p), 1>, Requires<[HasDFB]>; // Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional // width specifier. 
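The two dfb aliases above (ARM and Thumb2) assemble the Armv8-R full data barrier as a DSB whose 4-bit option field is 0xc; there is no separate DFB encoding. For the A32 form the relationship is just a constant, sketched here with the standard A1 DSB encoding:

    #include <cstdint>
    #include <cstdio>

    // A32 DSB, encoding A1: 0xF57FF04x with the barrier option in bits [3:0].
    uint32_t encodeDSB(uint32_t Option) {
      return 0xF57FF040u | (Option & 0xFu);
    }

    int main() {
      std::printf("dsb sy -> 0x%08X\n", encodeDSB(0xF)); // 0xF57FF04F
      std::printf("dfb    -> 0x%08X\n", encodeDSB(0xC)); // alias, option 0xc
    }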
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 6bbeae2e1151..b0fd0b476920 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -669,13 +669,22 @@ bool ARMInstructionSelector::select(MachineInstr &I, return true; } + using namespace TargetOpcode; + if (I.getOpcode() == G_CONSTANT) { + // Pointer constants should be treated the same as 32-bit integer constants. + // Change the type and let TableGen handle it. + unsigned ResultReg = I.getOperand(0).getReg(); + LLT Ty = MRI.getType(ResultReg); + if (Ty.isPointer()) + MRI.setType(ResultReg, LLT::scalar(32)); + } + if (selectImpl(I, CoverageInfo)) return true; MachineInstrBuilder MIB{MF, I}; bool isSExt = false; - using namespace TargetOpcode; switch (I.getOpcode()) { case G_SEXT: isSExt = true; @@ -741,6 +750,31 @@ bool ARMInstructionSelector::select(MachineInstr &I, const auto &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); + if (SrcRegBank.getID() == ARM::FPRRegBankID) { + // This should only happen in the obscure case where we have put a 64-bit + // integer into a D register. Get it out of there and keep only the + // interesting part. + assert(I.getOpcode() == G_TRUNC && "Unsupported operand for G_ANYEXT"); + assert(DstRegBank.getID() == ARM::GPRRegBankID && + "Unsupported combination of register banks"); + assert(MRI.getType(SrcReg).getSizeInBits() == 64 && "Unsupported size"); + assert(MRI.getType(DstReg).getSizeInBits() <= 32 && "Unsupported size"); + + unsigned IgnoredBits = MRI.createVirtualRegister(&ARM::GPRRegClass); + auto InsertBefore = std::next(I.getIterator()); + auto MovI = + BuildMI(MBB, InsertBefore, I.getDebugLoc(), TII.get(ARM::VMOVRRD)) + .addDef(DstReg) + .addDef(IgnoredBits) + .addUse(SrcReg) + .add(predOps(ARMCC::AL)); + if (!constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI)) + return false; + + MIB->eraseFromParent(); + return true; + } + if (SrcRegBank.getID() != DstRegBank.getID()) { DEBUG(dbgs() << "G_TRUNC/G_ANYEXT operands on different register banks\n"); return false; @@ -754,6 +788,28 @@ bool ARMInstructionSelector::select(MachineInstr &I, I.setDesc(TII.get(COPY)); return selectCopy(I, TII, MRI, TRI, RBI); } + case G_INTTOPTR: + case G_PTRTOINT: { + auto SrcReg = I.getOperand(1).getReg(); + auto DstReg = I.getOperand(0).getReg(); + + const auto &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); + const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); + + if (SrcRegBank.getID() != DstRegBank.getID()) { + DEBUG(dbgs() + << "G_INTTOPTR/G_PTRTOINT operands on different register banks\n"); + return false; + } + + if (SrcRegBank.getID() != ARM::GPRRegBankID) { + DEBUG(dbgs() << "G_INTTOPTR/G_PTRTOINT on non-GPR not supported yet\n"); + return false; + } + + I.setDesc(TII.get(COPY)); + return selectCopy(I, TII, MRI, TRI, RBI); + } case G_SELECT: return selectSelect(MIB, MRI); case G_ICMP: { diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index 2dd1dff64e87..8cff1f0869d0 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -126,6 +126,12 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({Op, s32}, Legal); } + setAction({G_INTTOPTR, p0}, Legal); + setAction({G_INTTOPTR, 1, s32}, Legal); + + setAction({G_PTRTOINT, s32}, Legal); + setAction({G_PTRTOINT, 1, p0}, Legal); + for (unsigned Op : {G_ASHR, G_LSHR, G_SHL}) setAction({Op, s32}, Legal); 
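With the legality rules just added and the COPY-based selection in the previous file, G_INTTOPTR and G_PTRTOINT cost nothing on 32-bit ARM: pointers and s32 share the GPR bank, so each conversion collapses to a register copy. As a source-level analogy (not MIR):

    #include <cassert>
    #include <cstdint>

    int main() {
      int X = 42;
      std::uintptr_t Bits = reinterpret_cast<std::uintptr_t>(&X); // G_PTRTOINT
      int *P = reinterpret_cast<int *>(Bits);                     // G_INTTOPTR
      assert(P == &X && *P == 42);  // round trip: same 32-bit register contents
      return 0;
    }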
@@ -139,6 +145,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
   setAction({G_BRCOND, s1}, Legal);
 
   setAction({G_CONSTANT, s32}, Legal);
+  setAction({G_CONSTANT, p0}, Legal);
   setLegalizeScalarToDifferentSizeStrategy(G_CONSTANT, 0, widen_1_8_16);
 
   setAction({G_ICMP, s1}, Legal);
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp
index b32bfd449544..fad0e98285e6 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -226,12 +226,30 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case G_SEXT:
   case G_ZEXT:
   case G_ANYEXT:
-  case G_TRUNC:
   case G_GEP:
+  case G_INTTOPTR:
+  case G_PTRTOINT:
     // FIXME: We're abusing the fact that everything lives in a GPR for now; in
     // the real world we would use different mappings.
     OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx];
     break;
+  case G_TRUNC: {
+    // In some cases we may end up with a G_TRUNC from a 64-bit value to a
+    // 32-bit value. This isn't a real floating point trunc (that would be a
+    // G_FPTRUNC). Instead it is an integer trunc in disguise, which can appear
+    // because the legalizer doesn't distinguish between integer and floating
+    // point values so it may leave some 64-bit integers un-narrowed. Until we
+    // have a more principled solution that doesn't let such things sneak all
+    // the way to this point, just map the source to a DPR and the destination
+    // to a GPR.
+    LLT LargeTy = MRI.getType(MI.getOperand(1).getReg());
+    OperandsMapping =
+        LargeTy.getSizeInBits() <= 32
+            ? &ARM::ValueMappings[ARM::GPR3OpsIdx]
+            : getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx],
+                                  &ARM::ValueMappings[ARM::DPR3OpsIdx]});
+    break;
+  }
   case G_LOAD:
   case G_STORE: {
     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 4d4a88126ce6..23027e92481f 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -348,11 +348,6 @@ unsigned ARMSubtarget::getMispredictionPenalty() const {
   return SchedModel.MispredictPenalty;
 }
 
-bool ARMSubtarget::hasSinCos() const {
-  return isTargetWatchOS() ||
-         (isTargetIOS() && !getTargetTriple().isOSVersionLT(7, 0));
-}
-
 bool ARMSubtarget::enableMachineScheduler() const {
   // Enable the MachineScheduler before register allocation for subtargets
   // with the use-misched feature.
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 9301197e1387..eedb675a3304 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -236,6 +236,10 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
   /// instructions.
   bool HasDataBarrier = false;
 
+  /// HasFullDataBarrier - True if the subtarget supports the DFB data
+  /// barrier instruction.
+  bool HasFullDataBarrier = false;
+
   /// HasV7Clrex - True if the subtarget supports CLREX instructions
   bool HasV7Clrex = false;
@@ -544,6 +548,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
   bool hasDivideInThumbMode() const { return HasHardwareDivideInThumb; }
   bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
   bool hasDataBarrier() const { return HasDataBarrier; }
+  bool hasFullDataBarrier() const { return HasFullDataBarrier; }
   bool hasV7Clrex() const { return HasV7Clrex; }
   bool hasAcquireRelease() const { return HasAcquireRelease; }
@@ -712,10 +717,6 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
 
   unsigned getMispredictionPenalty() const;
 
-  /// This function returns true if the target has sincos() routine in its
-  /// compiler runtime or math libraries.
-  bool hasSinCos() const;
-
   /// Returns true if machine scheduler should be enabled.
   bool enableMachineScheduler() const override;
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 51982b2dab14..0f6d1eddc985 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -92,6 +92,7 @@ extern "C" void LLVMInitializeARMTarget() {
   initializeARMConstantIslandsPass(Registry);
   initializeARMExecutionDepsFixPass(Registry);
   initializeARMExpandPseudoPass(Registry);
+  initializeThumb2SizeReducePass(Registry);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -282,10 +283,9 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
   return I.get();
 }
 
-TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() {
-  return TargetIRAnalysis([this](const Function &F) {
-    return TargetTransformInfo(ARMTTIImpl(this, F));
-  });
+TargetTransformInfo
+ARMBaseTargetMachine::getTargetTransformInfo(const Function &F) {
+  return TargetTransformInfo(ARMTTIImpl(this, F));
 }
 
 ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT,
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index 655ec3202bfb..2072bb731f0a 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -53,8 +53,7 @@ class ARMBaseTargetMachine : public LLVMTargetMachine {
   const ARMSubtarget *getSubtargetImpl() const = delete;
   bool isLittleEndian() const { return isLittle; }
 
-  /// \brief Get the TargetIRAnalysis for this target.
-  TargetIRAnalysis getTargetIRAnalysis() override;
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
 
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index cae01e415eff..43d7888075b5 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -394,25 +394,6 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
   return 1;
 }
 
-int ARMTTIImpl::getFPOpCost(Type *Ty) {
-  // Use similar logic that's in ARMISelLowering:
-  // Any ARM CPU with VFP2 has floating point, but Thumb1 didn't have access
-  // to VFP.
-
-  if (ST->hasVFP2() && !ST->isThumb1Only()) {
-    if (Ty->isFloatTy()) {
-      return TargetTransformInfo::TCC_Basic;
-    }
-
-    if (Ty->isDoubleTy()) {
-      return ST->isFPOnlySP() ? TargetTransformInfo::TCC_Expensive :
-        TargetTransformInfo::TCC_Basic;
-    }
-  }
-
-  return TargetTransformInfo::TCC_Expensive;
-}
-
 int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) {
   // We only handle costs of reverse and alternate shuffles for now.
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 99353a3219a0..cd9fa0709020 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -156,8 +156,6 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
   int getAddressComputationCost(Type *Val, ScalarEvolution *SE,
                                 const SCEV *Ptr);
 
-  int getFPOpCost(Type *Ty);
-
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 26fda5f22b4f..97b642c99f80 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -5581,11 +5581,11 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
     CanAcceptPredicationCode =
         Mnemonic != "cdp2" && Mnemonic != "clrex" && Mnemonic != "mcr2" &&
         Mnemonic != "mcrr2" && Mnemonic != "mrc2" && Mnemonic != "mrrc2" &&
-        Mnemonic != "dmb" && Mnemonic != "dsb" && Mnemonic != "isb" &&
-        Mnemonic != "pld" && Mnemonic != "pli" && Mnemonic != "pldw" &&
-        Mnemonic != "ldc2" && Mnemonic != "ldc2l" && Mnemonic != "stc2" &&
-        Mnemonic != "stc2l" && !Mnemonic.startswith("rfe") &&
-        !Mnemonic.startswith("srs");
+        Mnemonic != "dmb" && Mnemonic != "dfb" && Mnemonic != "dsb" &&
+        Mnemonic != "isb" && Mnemonic != "pld" && Mnemonic != "pli" &&
+        Mnemonic != "pldw" && Mnemonic != "ldc2" && Mnemonic != "ldc2l" &&
+        Mnemonic != "stc2" && Mnemonic != "stc2l" &&
+        !Mnemonic.startswith("rfe") && !Mnemonic.startswith("srs");
   } else if (isThumbOne()) {
     if (hasV6MOps())
       CanAcceptPredicationCode = Mnemonic != "movs";
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index a29a2eeccfe8..53c635877675 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -2386,6 +2386,7 @@ static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn,
   case ARM::VLD4q32_UPD:
     if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
       return MCDisassembler::Fail;
+    break;
   default:
     break;
   }
@@ -3326,6 +3327,7 @@ static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
   case ARM::t2STRs:
     if (Rn == 15)
       return MCDisassembler::Fail;
+    break;
   default:
     break;
   }
@@ -3391,6 +3393,7 @@ static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn,
       break;
     case ARM::t2LDRSBs:
       Inst.setOpcode(ARM::t2PLIs);
+      break;
     default:
       break;
   }
@@ -3854,6 +3857,7 @@ static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
   case ARM::t2STRHi12:
     if (Rn == 15)
       return MCDisassembler::Fail;
+    break;
   default:
     break;
   }
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index 3920c73fba6a..5357e26856ea 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -45,6 +45,7 @@ using namespace llvm;
 
 #define DEBUG_TYPE "t2-reduce-size"
+#define THUMB2_SIZE_REDUCE_NAME "Thumb2 instruction size reduce pass"
 
 STATISTIC(NumNarrows,  "Number of 32-bit instrs reduced to 16-bit ones");
 STATISTIC(Num2Addrs,   "Number of 32-bit instrs reduced to 2addr 16-bit ones");
@@ -162,7 +163,7 @@ namespace {
     const Thumb2InstrInfo *TII;
     const ARMSubtarget *STI;
 
-    Thumb2SizeReduce(std::function<bool(const Function &)> Ftor);
+    Thumb2SizeReduce(std::function<bool(const Function &)> Ftor = nullptr);
 
     bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -172,7 +173,7 @@ namespace {
     }
 
     StringRef getPassName() const override {
-      return "Thumb2 instruction size reduction pass";
+      return THUMB2_SIZE_REDUCE_NAME;
     }
 
   private:
@@ -237,6 +238,9 @@ namespace {
 
 } // end anonymous namespace
 
+INITIALIZE_PASS(Thumb2SizeReduce, DEBUG_TYPE, THUMB2_SIZE_REDUCE_NAME, false,
+                false)
+
 Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor)
     : MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {
   OptimizeSize = MinimizeSize = false;
diff --git a/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp b/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
index 6f81e020b996..1f4ef098403d 100644
--- a/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
+++ b/lib/Target/BPF/InstPrinter/BPFInstPrinter.cpp
@@ -56,7 +56,7 @@ void BPFInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   if (Op.isReg()) {
     O << getRegisterName(Op.getReg());
   } else if (Op.isImm()) {
-    O << (int32_t)Op.getImm();
+    O << formatImm((int32_t)Op.getImm());
   } else {
     assert(Op.isExpr() && "Expected an expression");
     printExpr(Op.getExpr(), O);
@@ -76,9 +76,9 @@ void BPFInstPrinter::printMemOperand(const MCInst *MI, int OpNo, raw_ostream &O,
   if (OffsetOp.isImm()) {
     auto Imm = OffsetOp.getImm();
     if (Imm >= 0)
-      O << " + " << formatDec(Imm);
+      O << " + " << formatImm(Imm);
     else
-      O << " - " << formatDec(-Imm);
+      O << " - " << formatImm(-Imm);
   } else {
     assert(0 && "Expected an immediate");
   }
@@ -88,7 +88,7 @@ void BPFInstPrinter::printImm64Operand(const MCInst *MI, unsigned OpNo,
                                        raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isImm())
-    O << (uint64_t)Op.getImm();
+    O << formatImm(Op.getImm());
   else if (Op.isExpr())
     printExpr(Op.getExpr(), O);
   else
@@ -100,7 +100,7 @@ void BPFInstPrinter::printBrTargetOperand(const MCInst *MI, unsigned OpNo,
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isImm()) {
     int16_t Imm = Op.getImm();
-    O << ((Imm >= 0) ? "+" : "") << Imm;
+    O << ((Imm >= 0) ? "+" : "") << formatImm(Imm);
   } else if (Op.isExpr()) {
     printExpr(Op.getExpr(), O);
   } else {
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index 537f97c9a987..8b6c571dee02 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -756,11 +756,11 @@ struct ShuffleMask {
 
   ShuffleMask lo() const {
     size_t H = Mask.size()/2;
-    return ShuffleMask({Mask.data(), H});
+    return ShuffleMask(Mask.take_front(H));
   }
   ShuffleMask hi() const {
     size_t H = Mask.size()/2;
-    return ShuffleMask({Mask.data()+H, H});
+    return ShuffleMask(Mask.take_back(H));
  }
 };
 
@@ -836,15 +836,6 @@ namespace llvm {
   };
 }
 
-// Return a submask of A that is shorter than A by |C| elements:
-// - if C > 0, return a submask of A that starts at position C,
-// - if C <= 0, return a submask of A that starts at 0 (reduce A by |C|).
-static ArrayRef<int> subm(ArrayRef<int> A, int C) {
-  if (C > 0)
-    return { A.data()+C, A.size()-C };
-  return { A.data(), A.size()+C };
-}
-
 static void splitMask(ArrayRef<int> Mask, MutableArrayRef<int> MaskL,
                       MutableArrayRef<int> MaskR) {
   unsigned VecLen = Mask.size();
@@ -910,21 +901,38 @@ bool HvxSelector::selectVectorConstants(SDNode *N) {
   // Since they are generated during the selection process, the main
   // selection algorithm is not aware of them. Select them directly
   // here.
-  if (!N->isMachineOpcode() && N->getOpcode() == ISD::LOAD) {
-    SDValue Addr = cast<LoadSDNode>(N)->getBasePtr();
-    unsigned AddrOpc = Addr.getOpcode();
-    if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP) {
-      if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool) {
-        ISel.Select(N);
-        return true;
-      }
+  SmallVector<SDNode*,4> Loads;
+  SmallVector<SDNode*,16> WorkQ;
+
+  // The DAG can change (due to CSE) during selection, so cache all the
+  // unselected nodes first to avoid traversing a mutating DAG.
+
+  auto IsLoadToSelect = [] (SDNode *N) {
+    if (!N->isMachineOpcode() && N->getOpcode() == ISD::LOAD) {
+      SDValue Addr = cast<LoadSDNode>(N)->getBasePtr();
+      unsigned AddrOpc = Addr.getOpcode();
+      if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP)
+        if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool)
+          return true;
     }
+    return false;
+  };
+
+  WorkQ.push_back(N);
+  for (unsigned i = 0; i != WorkQ.size(); ++i) {
+    SDNode *W = WorkQ[i];
+    if (IsLoadToSelect(W)) {
+      Loads.push_back(W);
+      continue;
+    }
+    for (unsigned j = 0, f = W->getNumOperands(); j != f; ++j)
+      WorkQ.push_back(W->getOperand(j).getNode());
   }
 
-  bool Selected = false;
-  for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I)
-    Selected = selectVectorConstants(N->getOperand(I).getNode()) || Selected;
-  return Selected;
+  for (SDNode *L : Loads)
+    ISel.Select(L);
+
+  return !Loads.empty();
 }
 
 void HvxSelector::materialize(const ResultStack &Results) {
@@ -1159,8 +1167,8 @@ OpRef HvxSelector::vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
                          ResultStack &Results) {
   DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
   size_t S = Bytes.size() / 2;
-  OpRef L = vmuxs({Bytes.data(), S}, OpRef::lo(Va), OpRef::lo(Vb), Results);
-  OpRef H = vmuxs({Bytes.data()+S, S}, OpRef::hi(Va), OpRef::hi(Vb), Results);
+  OpRef L = vmuxs(Bytes.take_front(S), OpRef::lo(Va), OpRef::lo(Vb), Results);
+  OpRef H = vmuxs(Bytes.drop_front(S), OpRef::hi(Va), OpRef::hi(Vb), Results);
   return concat(L, H, Results);
 }
 
@@ -1435,7 +1443,7 @@ OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb,
     return OpRef::fail();
 
   // Examine the rest of the mask.
   for (int I = L; I < N; I += L) {
-    auto S = findStrip(subm(SM.Mask,I), 1, N-I);
+    auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);
     // Check whether the mask element at the beginning of each strip
     // increases by 2L each time.
     if (S.first - Strip.first != 2*I)
@@ -1465,7 +1473,7 @@ OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb,
   std::pair<int,unsigned> PrevS = Strip;
   bool Flip = false;
   for (int I = L; I < N; I += L) {
-    auto S = findStrip(subm(SM.Mask,I), 1, N-I);
+    auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);
     if (S.second != PrevS.second)
       return OpRef::fail();
     int Diff = Flip ? PrevS.first - S.first + 2*L
@@ -1524,7 +1532,7 @@ OpRef HvxSelector::expanding(ShuffleMask SM, OpRef Va, ResultStack &Results) {
 
   // First, check the non-ignored strips.
   for (int I = 2*L; I < 2*N; I += 2*L) {
-    auto S = findStrip(subm(SM.Mask,I), 1, N-I);
+    auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);
     if (S.second != unsigned(L))
       return OpRef::fail();
     if (2*S.first != I)
@@ -1532,7 +1540,7 @@ OpRef HvxSelector::expanding(ShuffleMask SM, OpRef Va, ResultStack &Results) {
   }
   // Check the -1s.
   for (int I = L; I < 2*N; I += 2*L) {
-    auto S = findStrip(subm(SM.Mask,I), 0, N-I);
+    auto S = findStrip(SM.Mask.drop_front(I), 0, N-I);
     if (S.first != -1 || S.second != unsigned(L))
       return OpRef::fail();
   }
@@ -1666,8 +1674,8 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) {
     if (!isPowerOf2_32(X))
       return OpRef::fail();
     // Check the other segments of Mask.
-    for (int J = 0; J < VecLen; J += I) {
-      if (XorPow2(subm(SM.Mask, -J), I) != X)
+    for (int J = I; J < VecLen; J += I) {
+      if (XorPow2(SM.Mask.slice(J, I), I) != X)
         return OpRef::fail();
     }
     Perm[Log2_32(X)] = Log2_32(I)-1;
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 586363335df1..0e0da2ddc400 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -761,11 +761,13 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // Promote the value if needed.
     switch (VA.getLocInfo()) {
       default:
-        // Loc info must be one of Full, SExt, ZExt, or AExt.
+        // Loc info must be one of Full, BCvt, SExt, ZExt, or AExt.
         llvm_unreachable("Unknown loc info!");
-      case CCValAssign::BCvt:
       case CCValAssign::Full:
         break;
+      case CCValAssign::BCvt:
+        Arg = DAG.getBitcast(VA.getLocVT(), Arg);
+        break;
       case CCValAssign::SExt:
         Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
         break;
@@ -1135,6 +1137,8 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
         unsigned VReg =
           RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass);
         RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        if (VA.getLocInfo() == CCValAssign::BCvt)
+          RegVT = VA.getValVT();
         SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
         // Treat values of type MVT::i1 specially: they are passed in
         // registers of type i32, but they need to remain as values of
@@ -1155,6 +1159,8 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
         unsigned VReg =
          RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
         RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        if (VA.getLocInfo() == CCValAssign::BCvt)
+          RegVT = VA.getValVT();
         InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
 
         // Single Vector
@@ -1715,8 +1721,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::v4i1, &Hexagon::PredRegsRegClass);  // ddccbbaa
   addRegisterClass(MVT::v8i1, &Hexagon::PredRegsRegClass);  // hgfedcba
   addRegisterClass(MVT::i32, &Hexagon::IntRegsRegClass);
-  addRegisterClass(MVT::v4i8, &Hexagon::IntRegsRegClass);
   addRegisterClass(MVT::v2i16, &Hexagon::IntRegsRegClass);
+  addRegisterClass(MVT::v4i8, &Hexagon::IntRegsRegClass);
   addRegisterClass(MVT::i64, &Hexagon::DoubleRegsRegClass);
   addRegisterClass(MVT::v8i8, &Hexagon::DoubleRegsRegClass);
   addRegisterClass(MVT::v4i16, &Hexagon::DoubleRegsRegClass);
@@ -1735,6 +1741,14 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
       addRegisterClass(MVT::v128i8, &Hexagon::HvxWRRegClass);
       addRegisterClass(MVT::v64i16, &Hexagon::HvxWRRegClass);
       addRegisterClass(MVT::v32i32, &Hexagon::HvxWRRegClass);
+      // These "short" boolean vector types should be legal because
+      // they will appear as results of vector compares. If they were
+      // not legal, type legalization would try to make them legal
+      // and that would require using operations that do not use or
+      // produce such types. That, in turn, would imply using custom
+      // nodes, which would be unoptimizable by the DAG combiner.
+      // The idea is to rely on target-independent operations as much
+      // as possible.
      addRegisterClass(MVT::v16i1, &Hexagon::HvxQRRegClass);
      addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
      addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
@@ -1964,9 +1978,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
   setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Legal);
 
   // Types natively supported:
-  for (MVT NativeVT : {MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v32i1, MVT::v64i1,
-                       MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v1i32,
-                       MVT::v2i32, MVT::v1i64}) {
+  for (MVT NativeVT : {MVT::v32i1, MVT::v64i1, MVT::v4i8, MVT::v8i8, MVT::v2i16,
+                       MVT::v4i16, MVT::v1i32, MVT::v2i32, MVT::v1i64}) {
     setOperationAction(ISD::BUILD_VECTOR,       NativeVT, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, NativeVT, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT,  NativeVT, Custom);
@@ -1992,63 +2005,6 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
     AddPromotedToType(Opc, FromTy, ToTy);
   };
 
-  if (Subtarget.useHVXOps()) {
-    bool Use64b = Subtarget.useHVX64BOps();
-    ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128;
-    ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128;
-    MVT ByteV = Use64b ?  MVT::v64i8 : MVT::v128i8;
-    MVT ByteW = Use64b ? MVT::v128i8 : MVT::v256i8;
-
-    setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal);
-    setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
-    setOperationAction(ISD::CONCAT_VECTORS, ByteW, Legal);
-    setOperationAction(ISD::AND, ByteV, Legal);
-    setOperationAction(ISD::OR,  ByteV, Legal);
-    setOperationAction(ISD::XOR, ByteV, Legal);
-
-    for (MVT T : LegalV) {
-      setIndexedLoadAction(ISD::POST_INC, T, Legal);
-      setIndexedStoreAction(ISD::POST_INC, T, Legal);
-
-      setOperationAction(ISD::ADD, T, Legal);
-      setOperationAction(ISD::SUB, T, Legal);
-      setOperationAction(ISD::VSELECT, T, Legal);
-      if (T != ByteV) {
-        setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal);
-        setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal);
-      }
-
-      setOperationAction(ISD::MUL, T, Custom);
-      setOperationAction(ISD::SETCC, T, Custom);
-      setOperationAction(ISD::BUILD_VECTOR, T, Custom);
-      setOperationAction(ISD::INSERT_SUBVECTOR, T, Custom);
-      setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom);
-      setOperationAction(ISD::EXTRACT_SUBVECTOR, T, Custom);
-      setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
-      if (T != ByteV)
-        setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom);
-    }
-
-    for (MVT T : LegalV) {
-      if (T == ByteV)
-        continue;
-      // Promote all shuffles and concats to operate on vectors of bytes.
-      setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV);
-      setPromoteTo(ISD::CONCAT_VECTORS, T, ByteV);
-      setPromoteTo(ISD::AND, T, ByteV);
-      setPromoteTo(ISD::OR,  T, ByteV);
-      setPromoteTo(ISD::XOR, T, ByteV);
-    }
-
-    for (MVT T : LegalW) {
-      if (T == ByteW)
-        continue;
-      // Promote all shuffles and concats to operate on vectors of bytes.
-      setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW);
-      setPromoteTo(ISD::CONCAT_VECTORS, T, ByteW);
-    }
-  }
-
   // Subtarget-specific operation actions.
   //
   if (Subtarget.hasV5TOps()) {
@@ -2110,6 +2066,67 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
       setIndexedStoreAction(ISD::POST_INC, VT, Legal);
   }
 
+  if (Subtarget.useHVXOps()) {
+    bool Use64b = Subtarget.useHVX64BOps();
+    ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128;
+    ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128;
+    MVT ByteV = Use64b ?  MVT::v64i8 : MVT::v128i8;
+    MVT ByteW = Use64b ? MVT::v128i8 : MVT::v256i8;
+
+    setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal);
+    setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, ByteW, Legal);
+    setOperationAction(ISD::AND, ByteV, Legal);
+    setOperationAction(ISD::OR,  ByteV, Legal);
+    setOperationAction(ISD::XOR, ByteV, Legal);
+
+    for (MVT T : LegalV) {
+      setIndexedLoadAction(ISD::POST_INC, T, Legal);
+      setIndexedStoreAction(ISD::POST_INC, T, Legal);
+
+      setOperationAction(ISD::ADD, T, Legal);
+      setOperationAction(ISD::SUB, T, Legal);
+      if (T != ByteV) {
+        setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Legal);
+        setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Legal);
+      }
+
+      setOperationAction(ISD::MUL, T, Custom);
+      setOperationAction(ISD::SETCC, T, Custom);
+      setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR, T, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom);
+      setOperationAction(ISD::EXTRACT_SUBVECTOR, T, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
+      if (T != ByteV)
+        setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, T, Custom);
+    }
+
+    for (MVT T : LegalV) {
+      if (T == ByteV)
+        continue;
+      // Promote all shuffles and concats to operate on vectors of bytes.
+      setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV);
+      setPromoteTo(ISD::CONCAT_VECTORS, T, ByteV);
+      setPromoteTo(ISD::AND, T, ByteV);
+      setPromoteTo(ISD::OR,  T, ByteV);
+      setPromoteTo(ISD::XOR, T, ByteV);
+    }
+
+    for (MVT T : LegalW) {
+      // Custom-lower BUILD_VECTOR for vector pairs. The standard (target-
+      // independent) handling of it would convert it to a load, which is
+      // not always the optimal choice.
+      setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+
+      if (T == ByteW)
+        continue;
+      // Promote all shuffles and concats to operate on vectors of bytes.
+      setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW);
+      setPromoteTo(ISD::CONCAT_VECTORS, T, ByteW);
+    }
+  }
+
   computeRegisterProperties(&HRI);
 
   //
@@ -2256,6 +2273,7 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case HexagonISD::VINSERTW0:     return "HexagonISD::VINSERTW0";
   case HexagonISD::VROR:          return "HexagonISD::VROR";
   case HexagonISD::READCYCLE:     return "HexagonISD::READCYCLE";
+  case HexagonISD::VZERO:         return "HexagonISD::VZERO";
   case HexagonISD::OP_END:        break;
   }
   return nullptr;
@@ -2331,14 +2349,27 @@ bool HexagonTargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask,
 
 TargetLoweringBase::LegalizeTypeAction
 HexagonTargetLowering::getPreferredVectorAction(EVT VT) const {
+  if (VT.getVectorNumElements() == 1)
+    return TargetLoweringBase::TypeScalarizeVector;
+
+  // Always widen vectors of i1.
+  MVT ElemTy = VT.getSimpleVT().getVectorElementType();
+  if (ElemTy == MVT::i1)
+    return TargetLoweringBase::TypeWidenVector;
+
   if (Subtarget.useHVXOps()) {
     // If the size of VT is at least half of the vector length,
     // widen the vector. Note: the threshold was not selected in
     // any scientific way.
-    if (VT.getSizeInBits() >= Subtarget.getVectorLength()*8/2)
-      return TargetLoweringBase::TypeWidenVector;
+    ArrayRef<MVT> Tys = Subtarget.getHVXElementTypes();
+    if (llvm::find(Tys, ElemTy) != Tys.end()) {
+      unsigned HwWidth = 8*Subtarget.getVectorLength();
+      unsigned VecWidth = VT.getSizeInBits();
+      if (VecWidth >= HwWidth/2 && VecWidth < HwWidth)
+        return TargetLoweringBase::TypeWidenVector;
+    }
   }
-  return TargetLowering::getPreferredVectorAction(VT);
+  return TargetLoweringBase::TypeSplitVector;
 }
 
 // Lower a vector shuffle (V1, V2, V3). V1 and V2 are the two vectors
@@ -2463,21 +2494,43 @@ HexagonTargetLowering::LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::BITCAST, dl, VT, Result);
 }
 
+bool
+HexagonTargetLowering::getBuildVectorConstInts(ArrayRef<SDValue> Values,
+                                               MVT VecTy, SelectionDAG &DAG,
+                                               MutableArrayRef<ConstantInt*> Consts) const {
+  MVT ElemTy = VecTy.getVectorElementType();
+  unsigned ElemWidth = ElemTy.getSizeInBits();
+  IntegerType *IntTy = IntegerType::get(*DAG.getContext(), ElemWidth);
+  bool AllConst = true;
+
+  for (unsigned i = 0, e = Values.size(); i != e; ++i) {
+    SDValue V = Values[i];
+    if (V.isUndef()) {
+      Consts[i] = ConstantInt::get(IntTy, 0);
+      continue;
+    }
+    if (auto *CN = dyn_cast<ConstantSDNode>(V.getNode())) {
+      const ConstantInt *CI = CN->getConstantIntValue();
+      Consts[i] = const_cast<ConstantInt*>(CI);
+    } else if (auto *CN = dyn_cast<ConstantFPSDNode>(V.getNode())) {
+      const ConstantFP *CF = CN->getConstantFPValue();
+      APInt A = CF->getValueAPF().bitcastToAPInt();
+      Consts[i] = ConstantInt::get(IntTy, A.getZExtValue());
+    } else {
+      AllConst = false;
+    }
+  }
+  return AllConst;
+}
+
 SDValue
 HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,
                                      MVT VecTy, SelectionDAG &DAG) const {
   MVT ElemTy = VecTy.getVectorElementType();
   assert(VecTy.getVectorNumElements() == Elem.size());
 
-  SmallVector<ConstantSDNode*,4> Consts;
-  bool AllConst = true;
-  for (SDValue V : Elem) {
-    if (isUndef(V))
-      V = DAG.getConstant(0, dl, ElemTy);
-    auto *C = dyn_cast<ConstantSDNode>(V.getNode());
-    Consts.push_back(C);
-    AllConst = AllConst && C != nullptr;
-  }
+  SmallVector<ConstantInt*,4> Consts(Elem.size());
+  bool AllConst = getBuildVectorConstInts(Elem, VecTy, DAG, Consts);
 
   unsigned First, Num = Elem.size();
   for (First = 0; First != Num; ++First)
@@ -2486,6 +2539,10 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,
   if (First == Num)
     return DAG.getUNDEF(VecTy);
 
+  if (AllConst &&
+      llvm::all_of(Consts, [](ConstantInt *CI) { return CI->isZero(); }))
+    return getZero(dl, VecTy, DAG);
+
   if (ElemTy == MVT::i16) {
     assert(Elem.size() == 2);
     if (AllConst) {
@@ -2498,45 +2555,55 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,
     return DAG.getBitcast(MVT::v2i16, N);
   }
 
-  // First try generating a constant.
-  assert(ElemTy == MVT::i8 && Num == 4);
-  if (AllConst) {
-    int32_t V = (Consts[0]->getZExtValue() & 0xFF) |
-                (Consts[1]->getZExtValue() & 0xFF) << 8 |
-                (Consts[2]->getZExtValue() & 0xFF) << 16 |
-                Consts[3]->getZExtValue() << 24;
-    return DAG.getBitcast(MVT::v4i8, DAG.getConstant(V, dl, MVT::i32));
+  if (ElemTy == MVT::i8) {
+    // First try generating a constant.
+    if (AllConst) {
+      int32_t V = (Consts[0]->getZExtValue() & 0xFF) |
+                  (Consts[1]->getZExtValue() & 0xFF) << 8 |
+                  (Consts[2]->getZExtValue() & 0xFF) << 16 |
+                  Consts[3]->getZExtValue() << 24;
+      return DAG.getBitcast(MVT::v4i8, DAG.getConstant(V, dl, MVT::i32));
+    }
+
+    // Then try splat.
+    bool IsSplat = true;
+    for (unsigned i = 0; i != Num; ++i) {
+      if (i == First)
+        continue;
+      if (Elem[i] == Elem[First] || isUndef(Elem[i]))
+        continue;
+      IsSplat = false;
+      break;
+    }
+    if (IsSplat) {
+      // Legalize the operand to VSPLAT.
+      SDValue Ext = DAG.getZExtOrTrunc(Elem[First], dl, MVT::i32);
+      return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Ext);
+    }
+
+    // Generate
+    //   (zxtb(Elem[0]) | (zxtb(Elem[1]) << 8)) |
+    //   (zxtb(Elem[2]) | (zxtb(Elem[3]) << 8)) << 16
+    assert(Elem.size() == 4);
+    SDValue Vs[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      Vs[i] = DAG.getZExtOrTrunc(Elem[i], dl, MVT::i32);
+      Vs[i] = DAG.getZeroExtendInReg(Vs[i], dl, MVT::i8);
+    }
+    SDValue S8 = DAG.getConstant(8, dl, MVT::i32);
+    SDValue T0 = DAG.getNode(ISD::SHL, dl, MVT::i32, {Vs[1], S8});
+    SDValue T1 = DAG.getNode(ISD::SHL, dl, MVT::i32, {Vs[3], S8});
+    SDValue B0 = DAG.getNode(ISD::OR, dl, MVT::i32, {Vs[0], T0});
+    SDValue B1 = DAG.getNode(ISD::OR, dl, MVT::i32, {Vs[2], T1});
+
+    SDValue R = getNode(Hexagon::A2_combine_ll, dl, MVT::i32, {B1, B0}, DAG);
+    return DAG.getBitcast(MVT::v4i8, R);
   }
 
-  // Then try splat.
-  bool IsSplat = true;
-  for (unsigned i = 0; i != Num; ++i) {
-    if (i == First)
-      continue;
-    if (Elem[i] == Elem[First] || isUndef(Elem[i]))
-      continue;
-    IsSplat = false;
-    break;
-  }
-  if (IsSplat)
-    return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Elem[First]);
-
-  // Generate
-  //   (zxtb(Elem[0]) | (zxtb(Elem[1]) << 8)) |
-  //   (zxtb(Elem[2]) | (zxtb(Elem[3]) << 8)) << 16
-  SDValue S8 = DAG.getConstant(8, dl, MVT::i32);
-  SDValue V0 = DAG.getZeroExtendInReg(Elem[0], dl, MVT::i8);
-  SDValue V1 = DAG.getZeroExtendInReg(Elem[1], dl, MVT::i8);
-  SDValue V2 = DAG.getZeroExtendInReg(Elem[2], dl, MVT::i8);
-  SDValue V3 = DAG.getZeroExtendInReg(Elem[3], dl, MVT::i8);
-
-  SDValue V4 = DAG.getNode(ISD::SHL, dl, MVT::i32, {V1, S8});
-  SDValue V5 = DAG.getNode(ISD::SHL, dl, MVT::i32, {V3, S8});
-  SDValue V6 = DAG.getNode(ISD::OR, dl, MVT::i32, {V0, V4});
-  SDValue V7 = DAG.getNode(ISD::OR, dl, MVT::i32, {V2, V5});
-
-  SDValue T0 = getNode(Hexagon::A2_combine_ll, dl, MVT::i32, {V7, V6}, DAG);
-  return DAG.getBitcast(MVT::v4i8, T0);
+#ifndef NDEBUG
+  dbgs() << "VecTy: " << EVT(VecTy).getEVTString() << '\n';
+#endif
+  llvm_unreachable("Unexpected vector element type");
 }
 
 SDValue
@@ -2545,15 +2612,8 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,
   MVT ElemTy = VecTy.getVectorElementType();
   assert(VecTy.getVectorNumElements() == Elem.size());
 
-  SmallVector<ConstantSDNode*,8> Consts;
-  bool AllConst = true;
-  for (SDValue V : Elem) {
-    if (isUndef(V))
-      V = DAG.getConstant(0, dl, ElemTy);
-    auto *C = dyn_cast<ConstantSDNode>(V.getNode());
-    Consts.push_back(C);
-    AllConst = AllConst && C != nullptr;
-  }
+  SmallVector<ConstantInt*,8> Consts(Elem.size());
+  bool AllConst = getBuildVectorConstInts(Elem, VecTy, DAG, Consts);
 
   unsigned First, Num = Elem.size();
   for (First = 0; First != Num; ++First)
@@ -2562,6 +2622,10 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,
   if (First == Num)
     return DAG.getUNDEF(VecTy);
 
+  if (AllConst &&
+      llvm::all_of(Consts, [](ConstantInt *CI) { return CI->isZero(); }))
+    return getZero(dl, VecTy, DAG);
+
   // First try splat if possible.
   if (ElemTy == MVT::i16) {
     bool IsSplat = true;
@@ -2573,8 +2637,11 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,
       IsSplat = false;
       break;
     }
-    if (IsSplat)
-      return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Elem[First]);
+    if (IsSplat) {
+      // Legalize the operand to VSPLAT.
+      SDValue Ext = DAG.getZExtOrTrunc(Elem[First], dl, MVT::i32);
+      return DAG.getNode(HexagonISD::VSPLAT, dl, VecTy, Ext);
+    }
   }
 
   // Then try constant.
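For a non-constant v4i8, the rewritten buildVector32 path above zero-extends each lane, ORs lanes 0/1 and 2/3 into two half-words, and merges them with A2_combine_ll. The same packing written as host-side C++ for reference (not part of the patch; pack_v4i8 is an illustrative name):

#include <cstdint>
#include <cstdio>

// Mirrors the emitted DAG: B0 = zxtb(e0) | zxtb(e1) << 8,
// B1 = zxtb(e2) | zxtb(e3) << 8, then A2_combine_ll places B1's low
// half-word above B0's low half-word.
uint32_t pack_v4i8(uint8_t e0, uint8_t e1, uint8_t e2, uint8_t e3) {
  uint32_t b0 = uint32_t(e0) | (uint32_t(e1) << 8);
  uint32_t b1 = uint32_t(e2) | (uint32_t(e3) << 8);
  return (b1 << 16) | b0;
}

int main() {
  // Lanes 1,2,3,4 pack little-endian into 0x04030201.
  std::printf("0x%08x\n", pack_v4i8(1, 2, 3, 4));
}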
@@ -2593,10 +2660,10 @@ HexagonTargetLowering::buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl,
 
   MVT HalfTy = MVT::getVectorVT(ElemTy, Num/2);
   SDValue L = (ElemTy == MVT::i32) ? Elem[0]
-                : buildVector32({Elem.data(), Num/2}, dl, HalfTy, DAG);
+                : buildVector32(Elem.take_front(Num/2), dl, HalfTy, DAG);
   SDValue H = (ElemTy == MVT::i32) ? Elem[1]
-                : buildVector32({Elem.data()+Num/2, Num/2}, dl, HalfTy, DAG);
+                : buildVector32(Elem.drop_front(Num/2), dl, HalfTy, DAG);
   return DAG.getNode(HexagonISD::COMBINE, dl, VecTy, {H, L});
 }
 
@@ -2695,22 +2762,42 @@ HexagonTargetLowering::insertVector(SDValue VecV, SDValue ValV, SDValue IdxV,
   return DAG.getNode(ISD::BITCAST, dl, VecTy, InsV);
 }
 
+SDValue
+HexagonTargetLowering::getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG)
+      const {
+  if (Ty.isVector()) {
+    assert(Ty.isInteger() && "Only integer vectors are supported here");
+    unsigned W = Ty.getSizeInBits();
+    if (W <= 64)
+      return DAG.getBitcast(Ty, DAG.getConstant(0, dl, MVT::getIntegerVT(W)));
+    return DAG.getNode(HexagonISD::VZERO, dl, Ty);
+  }
+
+  if (Ty.isInteger())
+    return DAG.getConstant(0, dl, Ty);
+  if (Ty.isFloatingPoint())
+    return DAG.getConstantFP(0.0, dl, Ty);
+  llvm_unreachable("Invalid type for zero");
+}
+
 SDValue
 HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   MVT VecTy = ty(Op);
   unsigned BW = VecTy.getSizeInBits();
+
+  if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy, true))
+    return LowerHvxBuildVector(Op, DAG);
+
   if (BW == 32 || BW == 64) {
+    const SDLoc &dl(Op);
     SmallVector<SDValue,8> Ops;
     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i)
       Ops.push_back(Op.getOperand(i));
     if (BW == 32)
-      return buildVector32(Ops, SDLoc(Op), VecTy, DAG);
-    return buildVector64(Ops, SDLoc(Op), VecTy, DAG);
+      return buildVector32(Ops, dl, VecTy, DAG);
+    return buildVector64(Ops, dl, VecTy, DAG);
   }
 
-  if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy))
-    return LowerHvxBuildVector(Op, DAG);
-
   return SDValue();
 }
 
@@ -2822,7 +2909,7 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 #ifndef NDEBUG
     Op.getNode()->dumpr(&DAG);
     if (Opc > HexagonISD::OP_BEGIN && Opc < HexagonISD::OP_END)
-      errs() << "Check for a non-legal type in this operation\n";
+      errs() << "Error: check for a non-legal type in this operation\n";
 #endif
     llvm_unreachable("Should not custom lower this!");
   case ISD::CONCAT_VECTORS:       return LowerCONCAT_VECTORS(Op, DAG);
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 0619e2e4e7f9..732834b464b4 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -70,6 +70,7 @@ namespace HexagonISD {
       EH_RETURN,
       DCFETCH,
       READCYCLE,
+      VZERO,
 
       OP_END
     };
@@ -283,6 +284,9 @@ namespace HexagonISD {
     }
 
   private:
+    bool getBuildVectorConstInts(ArrayRef<SDValue> Values, MVT VecTy,
+                                 SelectionDAG &DAG,
+                                 MutableArrayRef<ConstantInt*> Consts) const;
     SDValue buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy,
                           SelectionDAG &DAG) const;
     SDValue buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy,
@@ -301,6 +305,7 @@ namespace HexagonISD {
       SDNode *N = DAG.getMachineNode(MachineOpc, dl, Ty, Ops);
      return SDValue(N, 0);
     }
+    SDValue getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG) const;
 
     using VectorPair = std::pair<SDValue, SDValue>;
     using TypePair = std::pair<MVT, MVT>;
@@ -344,6 +349,13 @@ namespace HexagonISD {
     SDValue getByteShuffle(const SDLoc &dl, SDValue Op0, SDValue Op1,
                            ArrayRef<int> Mask, SelectionDAG &DAG) const;
 
+    MVT getVecBoolVT() const;
+
+    SDValue buildHvxVectorSingle(ArrayRef<SDValue> Values, const SDLoc &dl,
+                                 MVT VecTy, SelectionDAG &DAG) const;
+    SDValue buildHvxVectorPred(ArrayRef<SDValue> Values, const SDLoc &dl,
+                               MVT VecTy, SelectionDAG &DAG) const;
+
     SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index c1d44cb0e7de..51480d09d734 100644
--- a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -141,49 +141,50 @@ HexagonTargetLowering::getByteShuffle(const SDLoc &dl, SDValue Op0,
                         opCastElem(Op1, MVT::i8, DAG), ByteMask);
 }
 
-SDValue
-HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
-      const {
-  const SDLoc &dl(Op);
-  BuildVectorSDNode *BN = cast<BuildVectorSDNode>(Op.getNode());
-  bool IsConst = BN->isConstant();
-  MachineFunction &MF = DAG.getMachineFunction();
-  MVT VecTy = ty(Op);
+MVT
+HexagonTargetLowering::getVecBoolVT() const {
+  return MVT::getVectorVT(MVT::i1, 8*Subtarget.getVectorLength());
+}
 
-  if (IsConst) {
-    SmallVector<Constant*,16> Elems;
-    for (SDValue V : BN->op_values()) {
-      if (auto *C = dyn_cast<ConstantSDNode>(V.getNode()))
-        Elems.push_back(const_cast<ConstantInt*>(C->getConstantIntValue()));
-    }
-    Constant *CV = ConstantVector::get(Elems);
-    unsigned Align = VecTy.getSizeInBits() / 8;
+SDValue
+HexagonTargetLowering::buildHvxVectorSingle(ArrayRef<SDValue> Values,
+                                            const SDLoc &dl, MVT VecTy,
+                                            SelectionDAG &DAG) const {
+  unsigned VecLen = Values.size();
+  MachineFunction &MF = DAG.getMachineFunction();
+  MVT ElemTy = VecTy.getVectorElementType();
+  unsigned ElemWidth = ElemTy.getSizeInBits();
+  unsigned HwLen = Subtarget.getVectorLength();
+
+  SmallVector<ConstantInt*,128> Consts(VecLen);
+  bool AllConst = getBuildVectorConstInts(Values, VecTy, DAG, Consts);
+  if (AllConst) {
+    if (llvm::all_of(Consts, [](ConstantInt *CI) { return CI->isZero(); }))
+      return getZero(dl, VecTy, DAG);
+
+    ArrayRef<Constant*> Tmp((Constant**)Consts.begin(),
+                            (Constant**)Consts.end());
+    Constant *CV = ConstantVector::get(Tmp);
+    unsigned Align = HwLen;
     SDValue CP = LowerConstantPool(DAG.getConstantPool(CV, VecTy, Align), DAG);
     return DAG.getLoad(VecTy, dl, DAG.getEntryNode(), CP,
                        MachinePointerInfo::getConstantPool(MF), Align);
   }
 
-  unsigned NumOps = Op.getNumOperands();
-  unsigned HwLen = Subtarget.getVectorLength();
-  unsigned ElemSize = VecTy.getVectorElementType().getSizeInBits() / 8;
-  assert(ElemSize*NumOps == HwLen);
-
+  unsigned ElemSize = ElemWidth / 8;
+  assert(ElemSize*VecLen == HwLen);
   SmallVector<SDValue,32> Words;
-  SmallVector<SDValue,32> Ops;
-  for (unsigned i = 0; i != NumOps; ++i)
-    Ops.push_back(Op.getOperand(i));
 
   if (VecTy.getVectorElementType() != MVT::i32) {
-    assert(ElemSize < 4 && "vNi64 should have been promoted to vNi32");
     assert((ElemSize == 1 || ElemSize == 2) && "Invalid element size");
     unsigned OpsPerWord = (ElemSize == 1) ? 4 : 2;
     MVT PartVT = MVT::getVectorVT(VecTy.getVectorElementType(), OpsPerWord);
-    for (unsigned i = 0; i != NumOps; i += OpsPerWord) {
-      SDValue W = buildVector32({&Ops[i], OpsPerWord}, dl, PartVT, DAG);
+    for (unsigned i = 0; i != VecLen; i += OpsPerWord) {
+      SDValue W = buildVector32(Values.slice(i, OpsPerWord), dl, PartVT, DAG);
       Words.push_back(DAG.getBitcast(MVT::i32, W));
     }
   } else {
-    Words.assign(Ops.begin(), Ops.end());
+    Words.assign(Values.begin(), Values.end());
   }
 
   // Construct two halves in parallel, then or them together.
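buildHvxVectorSingle above first groups sub-word elements into 32-bit words (four i8 lanes or two i16 lanes per word) and only then assembles the HVX register. A host-side sketch of that grouping step (not part of the patch; group_into_words is an illustrative name, and it assumes the lane count is a multiple of the group size, as the asserts in the real code guarantee):

#include <cstdint>
#include <cstdio>
#include <vector>

// Pack little-endian lanes of laneBits (8 or 16) into 32-bit words,
// mirroring the per-group buildVector32 calls made by the real code.
std::vector<uint32_t> group_into_words(const std::vector<uint32_t> &lanes,
                                       unsigned laneBits) {
  unsigned lanesPerWord = 32 / laneBits; // 4 for i8, 2 for i16
  uint32_t mask = (laneBits == 32) ? ~0u : ((1u << laneBits) - 1);
  std::vector<uint32_t> words;
  for (size_t i = 0; i < lanes.size(); i += lanesPerWord) {
    uint32_t w = 0;
    for (unsigned j = 0; j < lanesPerWord; ++j)
      w |= (lanes[i + j] & mask) << (j * laneBits);
    words.push_back(w);
  }
  return words;
}

int main() {
  std::vector<uint32_t> lanes{1, 2, 3, 4}; // four i8 lanes
  for (uint32_t w : group_into_words(lanes, 8))
    std::printf("0x%08x\n", w); // 0x04030201
}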
@@ -207,6 +208,83 @@ HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
   return DstV;
 }
 
+SDValue
+HexagonTargetLowering::buildHvxVectorPred(ArrayRef<SDValue> Values,
+                                          const SDLoc &dl, MVT VecTy,
+                                          SelectionDAG &DAG) const {
+  // Construct a vector V of bytes, such that a comparison V >u 0 would
+  // produce the required vector predicate.
+  unsigned VecLen = Values.size();
+  unsigned HwLen = Subtarget.getVectorLength();
+  assert(VecLen <= HwLen || VecLen == 8*HwLen);
+  SmallVector<SDValue,128> Bytes;
+
+  if (VecLen <= HwLen) {
+    // In the hardware, each bit of a vector predicate corresponds to a byte
+    // of a vector register. Calculate how many bytes a bit of VecTy
+    // corresponds to.
+    assert(HwLen % VecLen == 0);
+    unsigned BitBytes = HwLen / VecLen;
+    for (SDValue V : Values) {
+      SDValue Ext = !V.isUndef() ? DAG.getZExtOrTrunc(V, dl, MVT::i8)
+                                 : DAG.getConstant(0, dl, MVT::i8);
+      for (unsigned B = 0; B != BitBytes; ++B)
+        Bytes.push_back(Ext);
+    }
+  } else {
+    // There are as many i1 values as there are bits in a vector register.
+    // Divide the values into groups of 8 and check that each group consists
+    // of the same value (ignoring undefs).
+    for (unsigned I = 0; I != VecLen; I += 8) {
+      unsigned B = 0;
+      // Find the first non-undef value in this group.
+      for (; B != 8; ++B) {
+        if (!Values[I+B].isUndef())
+          break;
+      }
+      SDValue F = Values[I+B];
+      SDValue Ext = (B < 8) ? DAG.getZExtOrTrunc(F, dl, MVT::i8)
+                            : DAG.getConstant(0, dl, MVT::i8);
+      Bytes.push_back(Ext);
+      // Verify that the rest of values in the group are the same as the
+      // first.
+      for (; B != 8; ++B)
+        assert(Values[I+B].isUndef() || Values[I+B] == F);
+    }
+  }
+
+  MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+  SDValue ByteVec = buildHvxVectorSingle(Bytes, dl, ByteTy, DAG);
+  SDValue Cmp = DAG.getSetCC(dl, VecTy, ByteVec, getZero(dl, ByteTy, DAG),
+                             ISD::SETUGT);
+  return Cmp;
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
+      const {
+  const SDLoc &dl(Op);
+  MVT VecTy = ty(Op);
+
+  unsigned Size = Op.getNumOperands();
+  SmallVector<SDValue,128> Ops;
+  for (unsigned i = 0; i != Size; ++i)
+    Ops.push_back(Op.getOperand(i));
+
+  if (VecTy.getVectorElementType() == MVT::i1)
+    return buildHvxVectorPred(Ops, dl, VecTy, DAG);
+
+  if (VecTy.getSizeInBits() == 16*Subtarget.getVectorLength()) {
+    ArrayRef<SDValue> A(Ops);
+    MVT SingleTy = typeSplit(VecTy).first;
+    SDValue V0 = buildHvxVectorSingle(A.take_front(Size/2), dl, SingleTy, DAG);
+    SDValue V1 = buildHvxVectorSingle(A.drop_front(Size/2), dl, SingleTy, DAG);
+    return DAG.getNode(ISD::CONCAT_VECTORS, dl, VecTy, V0, V1);
+  }
+
+  return buildHvxVectorSingle(Ops, dl, VecTy, DAG);
+}
+
 SDValue
 HexagonTargetLowering::LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG)
       const {
@@ -399,6 +477,10 @@ HexagonTargetLowering::LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const {
   //   (negate (swap-op NewCmp)),
   // the condition code for the NewCmp should be calculated from the original
   // CC by applying these operations in the reverse order.
+  //
+  // This could also be done through setCondCodeAction, but for negation it
+  // uses a xor with a vector of -1s, which it obtains from BUILD_VECTOR.
+  // That is far too expensive for what can be done with a single instruction.
   switch (CC) {
     case ISD::SETNE:    // !eq
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index e2120d3de2ef..cdc2085986a5 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -2899,6 +2899,8 @@ def HexagonREADCYCLE: SDNode<"HexagonISD::READCYCLE", SDTInt64Leaf,
 
 def: Pat<(HexagonREADCYCLE), (A4_tfrcpp UPCYCLE)>;
 
+def SDTVecLeaf: SDTypeProfile<1, 0, [SDTCisVec<0>]>;
+
 def SDTHexagonVEXTRACTW: SDTypeProfile<1, 2,
   [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisVT<2, i32>]>;
 def HexagonVEXTRACTW : SDNode<"HexagonISD::VEXTRACTW", SDTHexagonVEXTRACTW>;
@@ -2920,7 +2922,14 @@ let Predicates = [UseHVX] in {
   def: OpR_RR_pat, VecI16, HVI16>;
 }
 
+def HexagonVZERO: SDNode<"HexagonISD::VZERO", SDTVecLeaf>;
+def vzero: PatFrag<(ops), (HexagonVZERO)>;
+
 let Predicates = [UseHVX] in {
+  def: Pat<(VecI8  vzero), (V6_vd0)>;
+  def: Pat<(VecI16 vzero), (V6_vd0)>;
+  def: Pat<(VecI32 vzero), (V6_vd0)>;
+
   def: Pat<(VecPI8 (concat_vectors HVI8:$Vs, HVI8:$Vt)),
            (Combinev HvxVR:$Vt, HvxVR:$Vs)>;
   def: Pat<(VecPI16 (concat_vectors HVI16:$Vs, HVI16:$Vt)),
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td
index 2ceed70c2497..1d1e85e7ac7e 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -242,7 +242,7 @@ def VecQ32
 // FIXME: the register order should be defined in terms of the preferred
 // allocation order...
 //
-def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32,
+def IntRegs : RegisterClass<"Hexagon", [i32, f32, v32i1, v4i8, v2i16], 32,
   (add (sequence "R%u", 0, 9), (sequence "R%u", 12, 28),
        R10, R11, R29, R30, R31)>;
 
@@ -254,7 +254,8 @@ def GeneralSubRegs : RegisterClass<"Hexagon", [i32], 32,
 def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32,
   (add R7, R6, R5, R4, R3, R2, R1, R0)> ;
 
-def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64,
+def DoubleRegs : RegisterClass<"Hexagon",
+  [i64, f64, v64i1, v8i8, v4i16, v2i32], 64,
   (add (sequence "D%u", 0, 4), (sequence "D%u", 6, 13), D5, D14, D15)>;
 
 def GeneralDoubleLow8Regs : RegisterClass<"Hexagon", [i64], 64,
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index 678ef210d0ae..af93f20d97fc 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -204,14 +204,38 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
     llvm_unreachable("Invalid HVX vector length settings");
   }
 
-  bool isHVXVectorType(MVT VecTy) const {
+  ArrayRef<MVT> getHVXElementTypes() const {
+    static MVT Types[] = { MVT::i8, MVT::i16, MVT::i32 };
+    return makeArrayRef(Types);
+  }
+
+  bool isHVXVectorType(MVT VecTy, bool IncludeBool = false) const {
     if (!VecTy.isVector() || !useHVXOps())
       return false;
-    unsigned ElemWidth = VecTy.getVectorElementType().getSizeInBits();
-    if (ElemWidth < 8 || ElemWidth > 64)
+    MVT ElemTy = VecTy.getVectorElementType();
+    if (!IncludeBool && ElemTy == MVT::i1)
       return false;
+
+    unsigned HwLen = getVectorLength();
+    unsigned NumElems = VecTy.getVectorNumElements();
+    ArrayRef<MVT> ElemTypes = getHVXElementTypes();
+
+    if (IncludeBool && ElemTy == MVT::i1) {
+      // Special case for the v512i1, etc.
+      if (8*HwLen == NumElems)
+        return true;
+      // Boolean HVX vector types are formed from regular HVX vector types
+      // by replacing the element type with i1.
+      for (MVT T : ElemTypes)
+        if (NumElems * T.getSizeInBits() == 8*HwLen)
+          return true;
+      return false;
+    }
+
     unsigned VecWidth = VecTy.getSizeInBits();
-    return VecWidth == 8*getVectorLength() || VecWidth == 16*getVectorLength();
+    if (VecWidth != 8*HwLen && VecWidth != 16*HwLen)
+      return false;
+    return llvm::any_of(ElemTypes, [ElemTy] (MVT T) { return ElemTy == T; });
   }
 
   unsigned getL1CacheLineSize() const;
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 0c40a7b8f382..363b703fef28 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -258,10 +258,9 @@ void HexagonTargetMachine::adjustPassManager(PassManagerBuilder &PMB) {
   });
 }
 
-TargetIRAnalysis HexagonTargetMachine::getTargetIRAnalysis() {
-  return TargetIRAnalysis([this](const Function &F) {
-    return TargetTransformInfo(HexagonTTIImpl(this, F));
-  });
+TargetTransformInfo
+HexagonTargetMachine::getTargetTransformInfo(const Function &F) {
+  return TargetTransformInfo(HexagonTTIImpl(this, F));
 }
 
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h
index acd41f920b53..a7c6a3437fbc 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -39,7 +39,7 @@ class HexagonTargetMachine : public LLVMTargetMachine {
   void adjustPassManager(PassManagerBuilder &PMB) override;
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
-  TargetIRAnalysis getTargetIRAnalysis() override;
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
 
   HexagonTargetObjectFile *getObjFileLowering() const override {
     return static_cast<HexagonTargetObjectFile*>(TLOF.get());
diff --git a/lib/Target/Lanai/LanaiTargetMachine.cpp b/lib/Target/Lanai/LanaiTargetMachine.cpp
index 9a73c95d6516..2c21a53b13bb 100644
--- a/lib/Target/Lanai/LanaiTargetMachine.cpp
+++ b/lib/Target/Lanai/LanaiTargetMachine.cpp
@@ -74,10 +74,9 @@ LanaiTargetMachine::LanaiTargetMachine(const Target &T, const Triple &TT,
   initAsmInfo();
 }
 
-TargetIRAnalysis LanaiTargetMachine::getTargetIRAnalysis() {
-  return TargetIRAnalysis([this](const Function &F) {
-    return TargetTransformInfo(LanaiTTIImpl(this, F));
-  });
+TargetTransformInfo
+LanaiTargetMachine::getTargetTransformInfo(const Function &F) {
+  return TargetTransformInfo(LanaiTTIImpl(this, F));
 }
 
 namespace {
diff --git a/lib/Target/Lanai/LanaiTargetMachine.h b/lib/Target/Lanai/LanaiTargetMachine.h
index 2fb1a0536104..0db286ec13e7 100644
--- a/lib/Target/Lanai/LanaiTargetMachine.h
+++ b/lib/Target/Lanai/LanaiTargetMachine.h
@@ -42,7 +42,7 @@ class LanaiTargetMachine : public LLVMTargetMachine {
     return &Subtarget;
   }
 
-  TargetIRAnalysis getTargetIRAnalysis() override;
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
 
   // Pass Pipeline Configuration
   TargetPassConfig *createPassConfig(PassManagerBase &pass_manager) override;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index ac81e6207456..2f6dd0035de3 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -188,7 +188,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
   // so we have to specially check for them.
   unsigned Opcode = TmpInst.getOpcode();
   if ((Opcode != Mips::NOP) && (Opcode != Mips::SLL) &&
-      (Opcode != Mips::SLL_MM) && !Binary)
+      (Opcode != Mips::SLL_MM) && (Opcode != Mips::SLL_MMR6) && !Binary)
     llvm_unreachable("unimplemented opcode in encodeInstruction()");
 
   int NewOpcode = -1;
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 50537bed8ff0..c85ee20273c0 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -38,7 +38,7 @@ class MipsRegWithSubRegs<bits<16> Enc, string n, list<Register> subregs>
   let Namespace = "Mips";
 }
 
-// Mips CPU Registers
+// Mips CPU Registers.
 class MipsGPRReg<bits<16> Enc, string n> : MipsReg<Enc, n>;
 
 // Mips 64-bit CPU Registers
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 85193bffef56..fb79a4bf40c5 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -259,17 +259,16 @@ void MipsPassConfig::addPreRegAlloc() {
   addPass(createMipsOptimizePICCallPass());
 }
 
-TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() {
-  return TargetIRAnalysis([this](const Function &F) {
-    if (Subtarget->allowMixed16_32()) {
-      DEBUG(errs() << "No Target Transform Info Pass Added\n");
-      // FIXME: This is no longer necessary as the TTI returned is per-function.
-      return TargetTransformInfo(F.getParent()->getDataLayout());
-    }
+TargetTransformInfo
+MipsTargetMachine::getTargetTransformInfo(const Function &F) {
+  if (Subtarget->allowMixed16_32()) {
+    DEBUG(errs() << "No Target Transform Info Pass Added\n");
+    // FIXME: This is no longer necessary as the TTI returned is per-function.
+    return TargetTransformInfo(F.getParent()->getDataLayout());
+  }
 
-    DEBUG(errs() << "Target Transform Info Pass Added\n");
-    return TargetTransformInfo(BasicTTIImpl(this, F));
-  });
+  DEBUG(errs() << "Target Transform Info Pass Added\n");
+  return TargetTransformInfo(BasicTTIImpl(this, F));
 }
 
 // Implemented by targets that want to run passes immediately before
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index ccfc9a938d9c..56e6e5d8daa2 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -44,7 +44,7 @@ class MipsTargetMachine : public LLVMTargetMachine {
                     CodeGenOpt::Level OL, bool JIT, bool isLittle);
   ~MipsTargetMachine() override;
 
-  TargetIRAnalysis getTargetIRAnalysis() override;
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
 
   const MipsSubtarget *getSubtargetImpl() const {
     if (Subtarget)
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 85f757878f94..d31e1cb5047b 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -180,10 +180,9 @@ void NVPTXTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
   });
 }
 
-TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
-  return TargetIRAnalysis([this](const Function &F) {
-    return TargetTransformInfo(NVPTXTTIImpl(this, F));
-  });
+TargetTransformInfo
+NVPTXTargetMachine::getTargetTransformInfo(const Function &F) {
+  return TargetTransformInfo(NVPTXTTIImpl(this, F));
 }
 
 void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index 54a72a688ee3..eeebf64d39c3 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -63,7 +63,7 @@ class NVPTXTargetMachine : public LLVMTargetMachine {
   void adjustPassManager(PassManagerBuilder &) override;
-  TargetIRAnalysis getTargetIRAnalysis() override;
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
 
   bool isMachineVerifierClean() const override {
     return false;
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index c870a2256691..7902da20a010 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -1531,11 +1531,11 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
 void PPCFrameLowering::createTailCallBranchInstr(MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
 
-  DebugLoc dl;
-
-  if (MBBI != MBB.end())
-    dl = MBBI->getDebugLoc();
+  // If we got this far a first terminator should exist.
+  assert(MBBI != MBB.end() && "Failed to find the first terminator.");
 
+  DebugLoc dl = MBBI->getDebugLoc();
   const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
 
   // Create branch instruction for pseudo tail call return instruction
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 18e567fa589c..cea59de3e8a9 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -11882,6 +11882,12 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
   SDLoc dl(N);
   SDValue Op(N, 0);
 
+  // Don't handle ppc_fp128 here or i1 conversions.
+  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+    return SDValue();
+  if (Op.getOperand(0).getValueType() == MVT::i1)
+    return SDValue();
+
   SDValue FirstOperand(Op.getOperand(0));
   bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&
     (FirstOperand.getValueType() == MVT::i8 ||
@@ -11910,11 +11916,6 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
     return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld);
   }
 
-  // Don't handle ppc_fp128 here or i1 conversions.
-  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
-    return SDValue();
-  if (Op.getOperand(0).getValueType() == MVT::i1)
-    return SDValue();
 
   // For i32 intermediate values, unfortunately, the conversion functions
   // leave the upper 32 bits of the value undefined. Within the set of
diff --git a/lib/Target/PowerPC/PPCMIPeephole.cpp b/lib/Target/PowerPC/PPCMIPeephole.cpp
index a2640727f813..474661aaaee8 100644
--- a/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -1025,9 +1025,6 @@ bool PPCMIPeephole::eliminateRedundantTOCSaves(
 //   bge 0, .LBB0_4
 
 bool PPCMIPeephole::eliminateRedundantCompare(void) {
-  // FIXME: this transformation is causing miscompiles. Disabling it for now
-  // until we can resolve the issue.
-  return false;
   bool Simplified = false;
 
   for (MachineBasicBlock &MBB2 : *MF) {
@@ -1087,10 +1084,21 @@ bool PPCMIPeephole::eliminateRedundantCompare(void) {
     // we replace it with a signed comparison if the comparison
     // to be merged is a signed comparison.
     // In other cases of opcode mismatch, we cannot optimize this.
-    if (isEqOrNe(BI2) &&
+
+    // We cannot change opcode when comparing against an immediate
+    // if the most significant bit of the immediate is one
+    // due to the difference in sign extension.
+ auto CmpAgainstImmWithSignBit = [](MachineInstr *I) { + if (!I->getOperand(2).isImm()) + return false; + int16_t Imm = (int16_t)I->getOperand(2).getImm(); + return Imm < 0; + }; + + if (isEqOrNe(BI2) && !CmpAgainstImmWithSignBit(CMPI2) && CMPI1->getOpcode() == getSignedCmpOpCode(CMPI2->getOpcode())) NewOpCode = CMPI1->getOpcode(); - else if (isEqOrNe(BI1) && + else if (isEqOrNe(BI1) && !CmpAgainstImmWithSignBit(CMPI1) && getSignedCmpOpCode(CMPI1->getOpcode()) == CMPI2->getOpcode()) NewOpCode = CMPI2->getOpcode(); else continue; diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index 491f25ca2c64..20a83c973026 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -451,8 +451,7 @@ void PPCPassConfig::addPreEmitPass() { addPass(createPPCBranchSelectionPass(), false); } -TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo(PPCTTIImpl(this, F)); - }); +TargetTransformInfo +PPCTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(PPCTTIImpl(this, F)); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 102bf7ca59c2..75b98a815ab4 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -49,7 +49,7 @@ class PPCTargetMachine final : public LLVMTargetMachine { // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - TargetIRAnalysis getTargetIRAnalysis() override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index e74d68182949..3a167a6d452a 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -257,8 +257,7 @@ TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) { return new SystemZPassConfig(*this, PM); } -TargetIRAnalysis SystemZTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo(SystemZTTIImpl(this, F)); - }); +TargetTransformInfo +SystemZTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(SystemZTTIImpl(this, F)); } diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h index 95ad5e339e0b..52bf8bba55de 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/lib/Target/SystemZ/SystemZTargetMachine.h @@ -44,7 +44,7 @@ class SystemZTargetMachine : public LLVMTargetMachine { // Override LLVMTargetMachine TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - TargetIRAnalysis getTargetIRAnalysis() override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index ad63c7a9cb30..c4c0dd22ee0c 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -219,10 +219,8 @@ CodeGenOpt::Level TargetMachine::getOptLevel() const { return OptLevel; } void TargetMachine::setOptLevel(CodeGenOpt::Level Level) { OptLevel = Level; } -TargetIRAnalysis TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([](const 
Function &F) { - return TargetTransformInfo(F.getParent()->getDataLayout()); - }); +TargetTransformInfo TargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(F.getParent()->getDataLayout()); } void TargetMachine::getNameWithPrefix(SmallVectorImpl &Name, @@ -244,3 +242,10 @@ MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV) const { getNameWithPrefix(NameStr, GV, TLOF->getMangler()); return TLOF->getContext().getOrCreateSymbol(NameStr); } + +TargetIRAnalysis TargetMachine::getTargetIRAnalysis() { + // Since Analysis can't depend on Target, use a std::function to invert the + // dependency. + return TargetIRAnalysis( + [this](const Function &F) { return this->getTargetTransformInfo(F); }); +} diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index 2bdba96ab674..a4bb967f36f6 100644 --- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -746,6 +746,14 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { MachineDominatorTree &MDT = getAnalysis(); LiveIntervals &LIS = getAnalysis(); + // Disable the TEE optimization if we aren't doing direct wasm object + // emission, because lowering TEE to TEE_LOCAL is done in the ExplicitLocals + // pass, which is also disabled. + bool UseTee = true; + if (MF.getSubtarget() + .getTargetTriple().isOSBinFormatELF()) + UseTee = false; + // Walk the instructions from the bottom up. Currently we don't look past // block boundaries, and the blocks aren't ordered so the block visitation // order isn't significant, but we may want to change this in the future. @@ -811,7 +819,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { Insert = RematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(), LIS, MFI, MRI, TII, TRI); - } else if (CanMove && + } else if (UseTee && CanMove && OneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) { Insert = MoveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI, MRI, TII); diff --git a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 2599064334ee..f808c063d7e4 100644 --- a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -223,6 +223,8 @@ RuntimeLibcallSignatures[RTLIB::UNKNOWN_LIBCALL] = { /* SINCOS_F80 */ unsupported, /* SINCOS_F128 */ func_i64_i64_iPTR_iPTR, /* SINCOS_PPCF128 */ unsupported, +/* SINCOS_STRET_F32 */ unsupported, +/* SINCOS_STRET_F64 */ unsupported, /* POW_F32 */ f32_func_f32_f32, /* POW_F64 */ f64_func_f64_f64, /* POW_F80 */ unsupported, @@ -390,8 +392,9 @@ RuntimeLibcallSignatures[RTLIB::UNKNOWN_LIBCALL] = { // MEMORY /* MEMCPY */ iPTR_func_iPTR_iPTR_iPTR, -/* MEMSET */ iPTR_func_iPTR_i32_iPTR, /* MEMMOVE */ iPTR_func_iPTR_iPTR_iPTR, +/* MEMSET */ iPTR_func_iPTR_i32_iPTR, +/* BZERO */ unsupported, // ELEMENT-WISE ATOMIC MEMORY /* MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported, @@ -687,6 +690,8 @@ RuntimeLibcallNames[RTLIB::UNKNOWN_LIBCALL] = { /* SINCOS_F80 */ nullptr, /* SINCOS_F128 */ "sincosl", /* SINCOS_PPCF128 */ nullptr, +/* SINCOS_STRET_F32 */ nullptr, +/* SINCOS_STRET_F64 */ nullptr, /* POW_F32 */ "powf", /* POW_F64 */ "pow", /* POW_F80 */ nullptr, @@ -850,6 +855,7 @@ RuntimeLibcallNames[RTLIB::UNKNOWN_LIBCALL] = { /* MEMCPY */ "memcpy", /* MEMMOVE */ "memset", /* MEMSET */ "memmove", +/* BZERO */ nullptr, /* 
MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr, /* MEMCPY_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr, /* MEMCPY_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr, diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 3cc19ef5fbab..d38cde74d2ec 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -146,10 +146,9 @@ class WebAssemblyPassConfig final : public TargetPassConfig { }; } // end anonymous namespace -TargetIRAnalysis WebAssemblyTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo(WebAssemblyTTIImpl(this, F)); - }); +TargetTransformInfo +WebAssemblyTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(WebAssemblyTTIImpl(this, F)); } TargetPassConfig * diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h index 224849526514..dd826befd117 100644 --- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.h +++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.h @@ -43,8 +43,7 @@ class WebAssemblyTargetMachine final : public LLVMTargetMachine { return TLOF.get(); } - /// \brief Get the TargetIRAnalysis for this target. - TargetIRAnalysis getTargetIRAnalysis() override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; bool usesPhysRegsForPEI() const override { return false; } }; diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 78385ae1877b..239db2a74b24 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -78,7 +78,7 @@ class X86AsmBackend : public MCAsmBackend { CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" && - CPU != "c3" && CPU != "c3-2" && CPU != "lakemont"; + CPU != "c3" && CPU != "c3-2" && CPU != "lakemont" && CPU != ""; } unsigned getNumFixupKinds() const override { diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 08731cd0204c..7e7c35569093 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -137,7 +137,7 @@ def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ", def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true", "Enable AVX-512 PreFetch Instructions", [FeatureAVX512]>; -def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPFPREFETCHWT1", +def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPREFETCHWT1", "true", "Prefetch with Intent to Write and T1 Hint">; def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true", @@ -263,6 +263,12 @@ def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", "Use software floating point features.">; +// On recent X86 (port bound) processors, its preferable to combine to a single shuffle +// using a variable mask over multiple fixed shuffles. +def FeatureFastVariableShuffle + : SubtargetFeature<"fast-variable-shuffle", + "HasFastVariableShuffle", + "true", "Shuffles with variable masks are fast">; // On some X86 processors, there is no performance hazard to writing only the // lower parts of a YMM or ZMM register without clearing the upper part. 
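The FeatureFastVariableShuffle flag defined above is consumed later in this patch, where combineX86ShuffleChain lowers its depth threshold for forming variable-mask shuffles from 3 to 2. A hedged sketch of that consumer logic (simplified free-function signature, not the patch's code):

// Whether the shuffle combiner may form a variable-mask shuffle: targets
// reporting fast variable shuffles accept shallower combine chains.
bool allowVariableMask(bool HasFastVariableShuffle, int Depth,
                       bool HasVariableMask) {
  int VariableShuffleDepth = HasFastVariableShuffle ? 2 : 3;
  return Depth >= VariableShuffleDepth || HasVariableMask;
}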
def FeatureFastPartialYMMorZMMWrite @@ -620,7 +626,8 @@ def HSWFeatures : ProcessorFeatures; class HaswellProc : ProcModel; // Legacy alias. def BDWFeatures : ProcessorFeatures; class BroadwellProc : ProcModel; // FIXME: define KNL model diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp index 0a87fb4533c2..ba7280c29cc9 100644 --- a/lib/Target/X86/X86DomainReassignment.cpp +++ b/lib/Target/X86/X86DomainReassignment.cpp @@ -301,60 +301,21 @@ typedef DenseMap /// different closure that manipulates the loaded or stored value. class Closure { private: - const TargetInstrInfo *TII; - MachineRegisterInfo *MRI; - /// Virtual registers in the closure. DenseSet Edges; /// Instructions in the closure. SmallVector Instrs; - /// A map of available Instruction Converters. - const InstrConverterBaseMap &Converters; - - /// The register domain of this closure. - RegDomain Domain; - /// Domains which this closure can legally be reassigned to. std::bitset LegalDstDomains; - /// Enqueue \p Reg to be considered for addition to the closure. - void visitRegister(unsigned Reg, SmallVectorImpl &Worklist); - - /// Add \p MI to this closure. - void encloseInstr(MachineInstr *MI); - - /// Calculate the total cost of reassigning the closure to \p Domain. - double calculateCost(RegDomain Domain) const; - - /// All edges that are included in some closure. - DenseSet &EnclosedEdges; - - /// All instructions that are included in some closure. - DenseMap &EnclosedInstrs; - public: - Closure(const TargetInstrInfo *TII, MachineRegisterInfo *MRI, - const InstrConverterBaseMap &Converters, - std::initializer_list LegalDstDomainList, - DenseSet &EnclosedEdges, - DenseMap &EnclosedInstrs) - : TII(TII), MRI(MRI), Converters(Converters), Domain(NoDomain), - EnclosedEdges(EnclosedEdges), EnclosedInstrs(EnclosedInstrs) { + Closure(std::initializer_list LegalDstDomainList) { for (RegDomain D : LegalDstDomainList) LegalDstDomains.set(D); } - /// Starting from \Reg, expand the closure as much as possible. - void buildClosure(unsigned E); - - /// /returns true if it is profitable to reassign the closure to \p Domain. - bool isReassignmentProfitable(RegDomain Domain) const; - - /// Reassign the closure to \p Domain. - void Reassign(RegDomain Domain) const; - /// Mark this closure as illegal for reassignment to all domains. void setAllIllegal() { LegalDstDomains.reset(); } @@ -364,10 +325,41 @@ class Closure { /// \returns true if is legal to reassign this closure to domain \p RD. bool isLegal(RegDomain RD) const { return LegalDstDomains[RD]; } + /// Mark this closure as illegal for reassignment to domain \p RD. + void setIllegal(RegDomain RD) { LegalDstDomains[RD] = false; } + bool empty() const { return Edges.empty(); } + + bool insertEdge(unsigned Reg) { + return Edges.insert(Reg).second; + } + + using const_edge_iterator = DenseSet::const_iterator; + iterator_range edges() const { + return iterator_range(Edges.begin(), Edges.end()); + } + + void addInstruction(MachineInstr *I) { + Instrs.push_back(I); + } + + ArrayRef instructions() const { + return Instrs; + } + }; class X86DomainReassignment : public MachineFunctionPass { + const X86Subtarget *STI; + MachineRegisterInfo *MRI; + const X86InstrInfo *TII; + + /// All edges that are included in some closure + DenseSet EnclosedEdges; + + /// All instructions that are included in some closure. 
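This comment and the member declared just after it complete the restructuring: Closure becomes plain data, and the pass owns the bookkeeping shared across closures. A condensed sketch of the resulting shape, with std containers standing in for LLVM's DenseSet/DenseMap and a stub for MachineInstr:

#include <unordered_map>
#include <unordered_set>
#include <vector>

struct MachineInstrStub {};

struct ClosureSketch {                      // data only: no TII/MRI/converters
  std::unordered_set<unsigned> Edges;       // virtual registers in the closure
  std::vector<MachineInstrStub *> Instrs;   // instructions in the closure
};

struct PassSketch {                         // shared state lives in the pass
  std::unordered_set<unsigned> EnclosedEdges;
  std::unordered_map<MachineInstrStub *, ClosureSketch *> EnclosedInstrs;
  void buildClosure(ClosureSketch &, unsigned) { /* worklist walk */ }
};

One payoff, visible in runOnMachineFunction below, is that a closure is now constructed from its legal-domain list alone and stored directly in a std::vector.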
+ DenseMap EnclosedInstrs; + public: static char ID; @@ -387,22 +379,39 @@ class X86DomainReassignment : public MachineFunctionPass { } private: - const X86Subtarget *STI; - MachineRegisterInfo *MRI; - const X86InstrInfo *TII; - /// A map of available Instruction Converters. InstrConverterBaseMap Converters; /// Initialize Converters map. void initConverters(); + + /// Starting from \Reg, expand the closure as much as possible. + void buildClosure(Closure &, unsigned Reg); + + /// Enqueue \p Reg to be considered for addition to the closure. + void visitRegister(Closure &, unsigned Reg, RegDomain &Domain, + SmallVectorImpl &Worklist); + + /// Reassign the closure to \p Domain. + void reassign(const Closure &C, RegDomain Domain) const; + + /// Add \p MI to the closure. + void encloseInstr(Closure &C, MachineInstr *MI); + + /// /returns true if it is profitable to reassign the closure to \p Domain. + bool isReassignmentProfitable(const Closure &C, RegDomain Domain) const; + + /// Calculate the total cost of reassigning the closure to \p Domain. + double calculateCost(const Closure &C, RegDomain Domain) const; }; char X86DomainReassignment::ID = 0; } // End anonymous namespace. -void Closure::visitRegister(unsigned Reg, SmallVectorImpl &Worklist) { +void X86DomainReassignment::visitRegister(Closure &C, unsigned Reg, + RegDomain &Domain, + SmallVectorImpl &Worklist) { if (EnclosedEdges.count(Reg)) return; @@ -423,59 +432,61 @@ void Closure::visitRegister(unsigned Reg, SmallVectorImpl &Worklist) { Worklist.push_back(Reg); } -void Closure::encloseInstr(MachineInstr *MI) { +void X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) { auto I = EnclosedInstrs.find(MI); if (I != EnclosedInstrs.end()) { - if (I->second != this) + if (I->second != &C) // Instruction already belongs to another closure, avoid conflicts between // closure and mark this closure as illegal. - setAllIllegal(); + C.setAllIllegal(); return; } - EnclosedInstrs[MI] = this; - Instrs.push_back(MI); + EnclosedInstrs[MI] = &C; + C.addInstruction(MI); // Mark closure as illegal for reassignment to domains, if there is no // converter for the instruction or if the converter cannot convert the // instruction. 
- for (unsigned i = 0; i != LegalDstDomains.size(); ++i) { - if (LegalDstDomains[i]) { + for (int i = 0; i != NumDomains; ++i) { + if (C.isLegal((RegDomain)i)) { InstrConverterBase *IC = Converters.lookup({i, MI->getOpcode()}); if (!IC || !IC->isLegal(MI, TII)) - LegalDstDomains[i] = false; + C.setIllegal((RegDomain)i); } } } -double Closure::calculateCost(RegDomain DstDomain) const { - assert(isLegal(DstDomain) && "Cannot calculate cost for illegal closure"); +double X86DomainReassignment::calculateCost(const Closure &C, + RegDomain DstDomain) const { + assert(C.isLegal(DstDomain) && "Cannot calculate cost for illegal closure"); double Cost = 0.0; - for (auto MI : Instrs) + for (auto *MI : C.instructions()) Cost += Converters.lookup({DstDomain, MI->getOpcode()})->getExtraCost(MI, MRI); return Cost; } -bool Closure::isReassignmentProfitable(RegDomain Domain) const { - return calculateCost(Domain) < 0.0; +bool X86DomainReassignment::isReassignmentProfitable(const Closure &C, + RegDomain Domain) const { + return calculateCost(C, Domain) < 0.0; } -void Closure::Reassign(RegDomain Domain) const { - assert(isLegal(Domain) && "Cannot convert illegal closure"); +void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const { + assert(C.isLegal(Domain) && "Cannot convert illegal closure"); // Iterate all instructions in the closure, convert each one using the // appropriate converter. SmallVector ToErase; - for (auto MI : Instrs) + for (auto *MI : C.instructions()) if (Converters.lookup({Domain, MI->getOpcode()}) ->convertInstr(MI, TII, MRI)) ToErase.push_back(MI); // Iterate all registers in the closure, replace them with registers in the // destination domain. - for (unsigned Reg : Edges) { + for (unsigned Reg : C.edges()) { MRI->setRegClass(Reg, getDstRC(MRI->getRegClass(Reg), Domain)); for (auto &MO : MRI->use_operands(Reg)) { if (MO.isReg()) @@ -512,18 +523,19 @@ static bool usedAsAddr(const MachineInstr &MI, unsigned Reg, return false; } -void Closure::buildClosure(unsigned Reg) { +void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) { SmallVector Worklist; - visitRegister(Reg, Worklist); + RegDomain Domain = NoDomain; + visitRegister(C, Reg, Domain, Worklist); while (!Worklist.empty()) { unsigned CurReg = Worklist.pop_back_val(); // Register already in this closure. - if (!Edges.insert(CurReg).second) + if (!C.insertEdge(CurReg)) continue; MachineInstr *DefMI = MRI->getVRegDef(CurReg); - encloseInstr(DefMI); + encloseInstr(C, DefMI); // Add register used by the defining MI to the worklist. // Do not add registers which are used in address calculation, they will be @@ -542,7 +554,7 @@ void Closure::buildClosure(unsigned Reg) { auto &Op = DefMI->getOperand(OpIdx); if (!Op.isReg() || !Op.isUse()) continue; - visitRegister(Op.getReg(), Worklist); + visitRegister(C, Op.getReg(), Domain, Worklist); } // Expand closure through register uses. @@ -550,10 +562,10 @@ void Closure::buildClosure(unsigned Reg) { // We would like to avoid converting closures which calculare addresses, // as this should remain in GPRs. 
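The rule behind this: x86 address generation reads general-purpose registers only, so a register feeding a base or index operand pins its closure to the GPR domain. A rough sketch of the test usedAsAddr performs below, assuming the conventional five-slot x86 memory operand layout (base, scale, index, displacement, segment; that layout is this sketch's assumption, not spelled out in the hunk):

// OpIdx: operand under inspection; MemOpStart: first operand of the memory
// reference. Only the base and index slots hold address registers.
bool isBaseOrIndexOperand(unsigned MemOpStart, unsigned OpIdx) {
  return OpIdx == MemOpStart + 0    // base register slot
      || OpIdx == MemOpStart + 2;   // index register slot
}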
if (usedAsAddr(UseMI, CurReg, TII)) { - setAllIllegal(); + C.setAllIllegal(); continue; } - encloseInstr(&UseMI); + encloseInstr(C, &UseMI); for (auto &DefOp : UseMI.defs()) { if (!DefOp.isReg()) @@ -561,10 +573,10 @@ void Closure::buildClosure(unsigned Reg) { unsigned DefReg = DefOp.getReg(); if (!TargetRegisterInfo::isVirtualRegister(DefReg)) { - setAllIllegal(); + C.setAllIllegal(); continue; } - visitRegister(DefReg, Worklist); + visitRegister(C, DefReg, Domain, Worklist); } } } @@ -701,8 +713,8 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { initConverters(); bool Changed = false; - DenseSet EnclosedEdges; - DenseMap EnclosedInstrs; + EnclosedEdges.clear(); + EnclosedInstrs.clear(); std::vector Closures; @@ -719,9 +731,8 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { continue; // Calculate closure starting with Reg. - Closure C(TII, MRI, Converters, {MaskDomain}, EnclosedEdges, - EnclosedInstrs); - C.buildClosure(Reg); + Closure C({MaskDomain}); + buildClosure(C, Reg); // Collect all closures that can potentially be converted. if (!C.empty() && C.isLegal(MaskDomain)) @@ -729,8 +740,8 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { } for (Closure &C : Closures) - if (C.isReassignmentProfitable(MaskDomain)) { - C.Reassign(MaskDomain); + if (isReassignmentProfitable(C, MaskDomain)) { + reassign(C, MaskDomain); ++NumClosuresConverted; Changed = true; } diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index a6c7c5f22a3a..660c1eff3c4b 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -106,14 +106,15 @@ namespace { if (Base_Reg.getNode()) Base_Reg.getNode()->dump(); else - dbgs() << "nul"; - dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n' - << " Scale" << Scale << '\n' + dbgs() << "nul\n"; + if (BaseType == FrameIndexBase) + dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'; + dbgs() << " Scale " << Scale << '\n' << "IndexReg "; if (IndexReg.getNode()) IndexReg.getNode()->dump(); else - dbgs() << "nul"; + dbgs() << "nul\n"; dbgs() << " Disp " << Disp << '\n' << "GV "; if (GV) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index a72f4daa5e11..5ac5d0348f8a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -461,7 +461,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRL_PARTS, VT, Custom); } - if (Subtarget.hasSSE1()) + if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); @@ -1622,16 +1622,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLibcallName(RTLIB::MUL_I128, nullptr); } - // Combine sin / cos into one node or libcall if possible. - if (Subtarget.hasSinCos()) { - setLibcallName(RTLIB::SINCOS_F32, "sincosf"); - setLibcallName(RTLIB::SINCOS_F64, "sincos"); - if (Subtarget.isTargetDarwin()) { - // For MacOSX, we don't want the normal expansion of a libcall to sincos. - // We want to issue a libcall to __sincos_stret to avoid memory traffic. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } + // Combine sin / cos into _sincos_stret if it is available. 
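The availability check that follows leans on the libcall name table convention: a null entry means the target has no such routine. A self-contained model (enum and table are illustrative, not LLVM's):

enum LibcallSketch { SINCOS_STRET_F32, SINCOS_STRET_F64, NUM_LIBCALLS };

static const char *LibcallNames[NUM_LIBCALLS] = {
  /* SINCOS_STRET_F32 */ nullptr,  // most targets leave these null; Darwin
  /* SINCOS_STRET_F64 */ nullptr,  // registers __sincosf_stret/__sincos_stret
};

bool hasSincosStret() {
  return LibcallNames[SINCOS_STRET_F32] != nullptr &&
         LibcallNames[SINCOS_STRET_F64] != nullptr;
}

Keying on the table instead of isTargetDarwin() lets any platform opt in simply by registering the names.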
+ if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && + getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { + setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } if (Subtarget.isTargetWin64()) { @@ -7480,9 +7475,9 @@ static bool isAddSub(const BuildVectorSDNode *BV, } /// Returns true if is possible to fold MUL and an idiom that has already been -/// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1). -/// If (and only if) true is returned, the operands of FMADDSUB are written to -/// parameters \p Opnd0, \p Opnd1, \p Opnd2. +/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into +/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the +/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2. /// /// Prior to calling this function it should be known that there is some /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation @@ -7505,12 +7500,12 @@ static bool isAddSub(const BuildVectorSDNode *BV, /// recognized ADDSUB idiom with ADDSUB operation is that such replacement /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit /// FMADDSUB is. -static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG, - SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, - unsigned ExpectedUses) { +static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, + SelectionDAG &DAG, + SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, + unsigned ExpectedUses) { if (Opnd0.getOpcode() != ISD::FMUL || - !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || - !Subtarget.hasAnyFMA()) + !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA()) return false; // FIXME: These checks must match the similar ones in @@ -7547,7 +7542,7 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, SDValue Opnd2; // TODO: According to coverage reports, the FMADDSUB transform is not // triggered by any tests. - if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) + if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2); // Do not generate X86ISD::ADDSUB node for 512-bit types even though @@ -11958,6 +11953,19 @@ static int canLowerByDroppingEvenElements(ArrayRef Mask, return 0; } +static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, + ArrayRef Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); + + SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); + if (V2.isUndef()) + return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); + + return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); +} + /// \brief Generic lowering of v16i8 shuffles. /// /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to @@ -12148,6 +12156,10 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( DL, MVT::v16i8, V1, V2, Mask, DAG)) return Unpack; + + // If we have VBMI we can use one VPERM instead of multiple PSHUFBs. 
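What the single variable-mask instruction buys is easiest to state as a scalar model of the VPERMV/VPERMV3 nodes built by lowerVectorShuffleWithPERMV (16-byte case, illustration only):

#include <cstdint>

// Each output byte may select any byte of V1 (mask 0..15) or V2 (16..31).
void permv3_bytes_model(const uint8_t V1[16], const uint8_t V2[16],
                        const int Mask[16], uint8_t Out[16]) {
  for (int i = 0; i < 16; ++i)
    Out[i] = (Mask[i] < 16) ? V1[Mask[i]] : V2[Mask[i] - 16];
}

A PSHUFB lane can only pick bytes from its own source register, so an arbitrary two-input byte shuffle otherwise costs several fixed shuffles plus a blend.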
+ if (Subtarget.hasVBMI() && Subtarget.hasVLX()) + return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); } return PSHUFB; @@ -13048,19 +13060,6 @@ static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, DAG.getConstant(Immediate, DL, MVT::i8)); } -static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, - ArrayRef Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { - MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); - MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); - - SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); - if (V2.isUndef()) - return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); - - return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); -} - /// \brief Handle lowering of 4-lane 64-bit floating point shuffles. /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 @@ -13615,6 +13614,10 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; + // AVX512VBMIVL can lower to VPERMB. + if (Subtarget.hasVBMI() && Subtarget.hasVLX()) + return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG); + // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( @@ -14077,6 +14080,10 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, Zeroable, Subtarget, DAG)) return Blend; + if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( + DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) + return PSHUFB; + return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } @@ -14212,7 +14219,9 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, ExtVT = MVT::v4i32; break; case MVT::v8i1: - ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL + // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit + // shuffle. + ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64; break; case MVT::v16i1: ExtVT = MVT::v16i32; @@ -14569,11 +14578,10 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, unsigned NumElts = VecVT.getVectorNumElements(); // Extending v8i1/v16i1 to 512-bit get better performance on KNL // than extending to 128/256bit. - unsigned VecSize = (NumElts <= 4 ? 128 : 512); - MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize / NumElts), NumElts); - SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec); - SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - ExtVT.getVectorElementType(), Ext, Idx); + MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; + MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); + SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx); return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); } @@ -14768,12 +14776,11 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, // Non constant index. Extend source and destination, // insert element and then truncate the result. unsigned NumElts = VecVT.getVectorNumElements(); - unsigned VecSize = (NumElts <= 4 ? 128 : 512); - MVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts); - MVT ExtEltVT = ExtVecVT.getVectorElementType(); + MVT ExtEltVT = (NumElts <= 8) ? 
MVT::getIntegerVT(128 / NumElts) : MVT::i8; + MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, - DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), - DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); + DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec), + DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx); return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); } @@ -16287,21 +16294,6 @@ static SDValue LowerZERO_EXTEND_Mask(SDValue Op, return SelectedVal; } -static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - SDValue In = Op->getOperand(0); - MVT InVT = In.getSimpleValueType(); - - if (InVT.getVectorElementType() == MVT::i1) - return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG); - - if (Subtarget.hasFp256()) - if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) - return Res; - - return SDValue(); -} - static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op.getOperand(0); @@ -16440,7 +16432,8 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, assert((InVT.is256BitVector() || InVT.is128BitVector()) && "Unexpected vector type."); unsigned NumElts = InVT.getVectorNumElements(); - MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts); + MVT ExtVT = MVT::getVectorVT(EltVT, NumElts); In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); InVT = ExtVT; ShiftInx = InVT.getScalarSizeInBits() - 1; @@ -18446,6 +18439,21 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, return V; } +static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SDValue In = Op->getOperand(0); + MVT InVT = In.getSimpleValueType(); + + if (InVT.getVectorElementType() == MVT::i1) + return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); + + if (Subtarget.hasFp256()) + if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) + return Res; + + return SDValue(); +} + // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG. // For sign extend this needs to handle all vector sizes and SSE4.1 and // non-SSE4.1 targets. For zero extend this should only handle inputs of @@ -21128,7 +21136,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, // ADC/ADCX/SBB case ADX: { SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32); - SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32); + SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32); SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), DAG.getConstant(-1, dl, MVT::i8)); SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), @@ -22231,6 +22239,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask)); } + assert(VT == MVT::v16i8 && "Unexpected VT"); + SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A); SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B); SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB); @@ -22989,12 +22999,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, (Subtarget.hasAVX512() && VT == MVT::v16i16) || (Subtarget.hasAVX512() && VT == MVT::v16i8) || (Subtarget.hasBWI() && VT == MVT::v32i8)) { - MVT EvtSVT = (VT == MVT::v32i8 ? 
MVT::i16 : MVT::i32); + assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) && + "Unexpected vector type"); + MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32; MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements()); unsigned ExtOpc = Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; R = DAG.getNode(ExtOpc, dl, ExtVT, R); - Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt); + Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt); return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); } @@ -24101,8 +24113,9 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, // Only optimize x86_64 for now. i386 is a bit messy. For f32, // the small struct {f32, f32} is returned in (eax, edx). For f64, // the results are returned via SRet in memory. - const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret"; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; + const char *LibcallName = TLI.getLibcallName(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); @@ -24928,7 +24941,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::BITCAST: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT DstVT = N->getValueType(0); - EVT SrcVT = N->getOperand(0)->getValueType(0); + EVT SrcVT = N->getOperand(0).getValueType(); if (SrcVT != MVT::f64 || (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) @@ -28407,8 +28420,6 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, // TODO - attempt to narrow Mask back to writemask size. bool IsEVEXShuffle = RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); - if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits)) - return SDValue(); // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. @@ -28491,11 +28502,10 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, - ShuffleVT)) { + ShuffleVT) && + (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! - if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) - return SDValue(); // AVX512 Writemask clash. Res = DAG.getBitcast(ShuffleSrcVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); @@ -28505,11 +28515,10 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, Subtarget, Shuffle, - ShuffleVT, PermuteImm)) { + ShuffleVT, PermuteImm) && + (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! - if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) - return SDValue(); // AVX512 Writemask clash. 
Res = DAG.getBitcast(ShuffleVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, @@ -28520,12 +28529,11 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, } if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, - V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, - ShuffleVT, UnaryShuffle)) { + V1, V2, DL, DAG, Subtarget, Shuffle, + ShuffleSrcVT, ShuffleVT, UnaryShuffle) && + (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! - if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) - return SDValue(); // AVX512 Writemask clash. V1 = DAG.getBitcast(ShuffleSrcVT, V1); DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(ShuffleSrcVT, V2); @@ -28538,11 +28546,10 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT, - PermuteImm)) { + PermuteImm) && + (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! - if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) - return SDValue(); // AVX512 Writemask clash. V1 = DAG.getBitcast(ShuffleVT, V1); DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(ShuffleVT, V2); @@ -28594,8 +28601,8 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, return SDValue(); // Depth threshold above which we can efficiently use variable mask shuffles. - // TODO This should probably be target specific. - bool AllowVariableMask = (Depth >= 3) || HasVariableMask; + int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3; + bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask; bool MaskContainsZeros = any_of(Mask, [](int M) { return M == SM_SentinelZero; }); @@ -29698,17 +29705,18 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, return SDValue(); } -/// Returns true iff the shuffle node \p N can be replaced with ADDSUB -/// operation. If true is returned then the operands of ADDSUB operation +/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD) +/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation /// are written to the parameters \p Opnd0 and \p Opnd1. /// -/// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes +/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes /// so it is easier to generically match. We also insert dummy vector shuffle /// nodes for the operands which explicitly discard the lanes which are unused /// by this operation to try to flow through the rest of the combiner /// the fact that they're unused. -static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget, - SDValue &Opnd0, SDValue &Opnd1) { +static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, + SDValue &Opnd0, SDValue &Opnd1, + bool matchSubAdd = false) { EVT VT = N->getValueType(0); if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && @@ -29728,12 +29736,15 @@ static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget, SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); - // We require the first shuffle operand to be the FSUB node, and the second to - // be the FADD node. 
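In scalar terms, the two lane patterns the generalized matcher accepts look like this (4-wide example, matching the X86ISD::ADDSUB and FMSUBADD lane order):

// ADDSUB: even lanes subtract, odd lanes add; SUBADD is the mirror image.
void addsub_model(const float A[4], const float B[4], float R[4],
                  bool SubAdd) {
  for (int i = 0; i < 4; ++i) {
    bool OddLane = (i & 1) != 0;
    R[i] = (OddLane != SubAdd) ? A[i] + B[i] : A[i] - B[i];
  }
}

Accepting the mirrored form is what lets the new combineShuffleToFMSubAdd produce X86ISD::FMSUBADD.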
- if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) { + unsigned ExpectedOpcode = matchSubAdd ? ISD::FADD : ISD::FSUB; + unsigned NextExpectedOpcode = matchSubAdd ? ISD::FSUB : ISD::FADD; + + // We require the first shuffle operand to be the ExpectedOpcode node, + // and the second to be the NextExpectedOpcode node. + if (V1.getOpcode() == NextExpectedOpcode && V2.getOpcode() == ExpectedOpcode) { ShuffleVectorSDNode::commuteMask(Mask); std::swap(V1, V2); - } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD) + } else if (V1.getOpcode() != ExpectedOpcode || V2.getOpcode() != NextExpectedOpcode) return false; // If there are other uses of these operations we can't fold them. @@ -29767,7 +29778,7 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Opnd0, Opnd1; - if (!isAddSub(N, Subtarget, Opnd0, Opnd1)) + if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1)) return SDValue(); EVT VT = N->getValueType(0); @@ -29775,7 +29786,7 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; - if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) + if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2); // Do not generate X86ISD::ADDSUB node for 512-bit types even though @@ -29787,6 +29798,26 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } +/// \brief Try to combine a shuffle into a target-specific +/// mul-sub-add node. +static SDValue combineShuffleToFMSubAdd(SDNode *N, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + SDValue Opnd0, Opnd1; + if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1, true)) + return SDValue(); + + EVT VT = N->getValueType(0); + SDLoc DL(N); + + // Try to generate X86ISD::FMSUBADD node here. + SDValue Opnd2; + if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) + return DAG.getNode(X86ISD::FMSUBADD, DL, VT, Opnd0, Opnd1, Opnd2); + + return SDValue(); +} + // We are looking for a shuffle where both sources are concatenated with undef // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so // if we can express this as a single-source shuffle, that's preferable. @@ -29873,11 +29904,14 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we have legalized the vector types, look for blends of FADD and FSUB - // nodes that we can fuse into an ADDSUB node. + // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node. if (TLI.isTypeLegal(VT)) { if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) return AddSub; + if (SDValue FMSubAdd = combineShuffleToFMSubAdd(N, Subtarget, DAG)) + return FMSubAdd; + if (SDValue HAddSub = foldShuffleOfHorizOp(N)) return HAddSub; } @@ -30181,7 +30215,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) // sign-extend to a 256-bit operation to avoid truncation. 
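Background for this combine: a vector compare sign-extended into wide lanes carries its truth value in each lane's sign bit, which an instruction in the MOVMSK family can then pack into the scalar integer the bitcast wants. A scalar model:

#include <cstdint>
#include <cstring>

// Model of MOVMSKPS: gather one sign bit per 32-bit lane into an integer.
uint32_t movmsk_ps_model(const float *Lanes, int NumLanes) {
  uint32_t Bits = 0;
  for (int i = 0; i < NumLanes; ++i) {
    uint32_t W;
    std::memcpy(&W, &Lanes[i], sizeof(W));
    Bits |= (W >> 31) << i;
  }
  return Bits;
}

The SExtVT/FPCastVT pairs chosen here (v4i64/v4f64, v8i32/v8f32) line the lane count up with an available sign-bit-gathering instruction.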
if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && - N0->getOperand(0)->getValueType(0).is256BitVector()) { + N0->getOperand(0).getValueType().is256BitVector()) { SExtVT = MVT::v4i64; FPCastVT = MVT::v4f64; } @@ -30194,8 +30228,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, // 256-bit because the shuffle is cheaper than sign extending the result of // the compare. if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && - (N0->getOperand(0)->getValueType(0).is256BitVector() || - N0->getOperand(0)->getValueType(0).is512BitVector())) { + (N0->getOperand(0).getValueType().is256BitVector() || + N0->getOperand(0).getValueType().is512BitVector())) { SExtVT = MVT::v8i32; FPCastVT = MVT::v8f32; } @@ -30484,7 +30518,8 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1); } -// Attempt to replace an min/max v8i16 horizontal reduction with PHMINPOSUW. +// Attempt to replace an min/max v8i16/v16i8 horizontal reduction with +// PHMINPOSUW. static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Bail without SSE41. @@ -30492,7 +30527,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, return SDValue(); EVT ExtractVT = Extract->getValueType(0); - if (ExtractVT != MVT::i16) + if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8) return SDValue(); // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns. @@ -30504,7 +30539,7 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, EVT SrcVT = Src.getValueType(); EVT SrcSVT = SrcVT.getScalarType(); - if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0) + if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0) return SDValue(); SDLoc DL(Extract); @@ -30520,22 +30555,39 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits); MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi); } - assert(SrcVT == MVT::v8i16 && "Unexpected value type"); + assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || + (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && + "Unexpected value type"); // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask // to flip the value accordingly. SDValue Mask; + unsigned MaskEltsBits = ExtractVT.getSizeInBits(); if (BinOp == ISD::SMAX) - Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT); + Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT); else if (BinOp == ISD::SMIN) - Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT); + Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT); else if (BinOp == ISD::UMAX) - Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT); + Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT); if (Mask) MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); - MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos); + // For v16i8 cases we need to perform UMIN on pairs of byte elements, + // shuffling each upper element down and insert zeros. This means that the + // v16i8 UMIN will leave the upper element as zero, performing zero-extension + // ready for the PHMINPOS. 
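A scalar model of the byte-reduction trick the comment describes (illustration only):

#include <cstdint>

uint8_t umin_v16i8_model(const uint8_t V[16]) {
  // Step 1: min each even byte with its odd neighbour. The odd byte of each
  // 16-bit lane becomes min(V[odd], 0) == 0, so every lane holds a
  // zero-extended i8 value.
  uint16_t Lane[8];
  for (int i = 0; i < 8; ++i)
    Lane[i] = V[2 * i] < V[2 * i + 1] ? V[2 * i] : V[2 * i + 1];
  // Step 2: PHMINPOSUW-style unsigned minimum across the eight i16 lanes.
  uint16_t Min = Lane[0];
  for (int i = 1; i < 8; ++i)
    if (Lane[i] < Min)
      Min = Lane[i];
  return (uint8_t)Min;
}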
+ if (ExtractVT == MVT::i8) { + SDValue Upper = DAG.getVectorShuffle( + SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL), + {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16}); + MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper); + } + + // Perform the PHMINPOS on a v8i16 vector, + MinPos = DAG.getBitcast(MVT::v8i16, MinPos); + MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos); + MinPos = DAG.getBitcast(SrcVT, MinPos); if (Mask) MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); @@ -30851,7 +30903,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget)) return Cmp; - // Attempt to replace min/max v8i16 reductions with PHMINPOSUW. + // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW. if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget)) return MinMax; @@ -32555,7 +32607,7 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) { // 1. MOVs can write to a register that differs from source // 2. MOVs accept memory operands - if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant || + if (VT.isVector() || N1.getOpcode() != ISD::Constant || N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || N0.getOperand(1).getOpcode() != ISD::Constant) return SDValue(); @@ -32569,11 +32621,11 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) { if (SarConst.isNegative()) return SDValue(); - for (MVT SVT : MVT::integer_valuetypes()) { + for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) { unsigned ShiftSize = SVT.getSizeInBits(); // skipping types without corresponding sext/zext and // ShlConst that is not one of [56,48,32,24,16] - if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize) + if (ShiftSize >= Size || ShlConst != Size - ShiftSize) continue; SDLoc DL(N); SDValue NN = @@ -32626,37 +32678,6 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -/// \brief Returns a vector of 0s if the node in input is a vector logical -/// shift by a constant amount which is known to be bigger than or equal -/// to the vector element size in bits. -static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - EVT VT = N->getValueType(0); - - if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 && - (!Subtarget.hasInt256() || - (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16))) - return SDValue(); - - SDValue Amt = N->getOperand(1); - SDLoc DL(N); - if (auto *AmtBV = dyn_cast(Amt)) - if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { - const APInt &ShiftAmt = AmtSplat->getAPIntValue(); - unsigned MaxAmount = - VT.getSimpleVT().getScalarSizeInBits(); - - // SSE2/AVX2 logical shifts always return a vector of 0s - // if the shift amount is bigger than or equal to - // the element size. The constant shift amount will be - // encoded as a 8-bit immediate. - if (ShiftAmt.trunc(8).uge(MaxAmount)) - return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL); - } - - return SDValue(); -} - static SDValue combineShift(SDNode* N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -32672,11 +32693,6 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG, if (SDValue V = combineShiftRightLogical(N, DAG)) return V; - // Try to fold this logical shift into a zero vector. 
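The helper deleted here encoded the rule its comment states: SSE2/AVX2 vector logical shifts by an amount at or above the element width produce zero, unlike C++ scalar shifts, where such amounts are undefined. Per-lane model:

#include <cstdint>

// PSLLW-style lane semantics with an out-of-range shift amount.
uint16_t psllw_lane_model(uint16_t X, unsigned Amt) {
  return Amt >= 16 ? 0 : (uint16_t)(X << Amt);
}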
- if (N->getOpcode() != ISD::SRA) - if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) - return V; - return SDValue(); } @@ -32996,21 +33012,20 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { // register. In most cases we actually compare or select YMM-sized registers // and mixing the two types creates horrible code. This method optimizes // some of the transition sequences. +// Even with AVX-512 this is still useful for removing casts around logical +// operations on vXi1 mask types. static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); - if (!VT.is256BitVector()) - return SDValue(); + assert(VT.isVector() && "Expected vector type"); assert((N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); SDValue Narrow = N->getOperand(0); - EVT NarrowVT = Narrow->getValueType(0); - if (!NarrowVT.is128BitVector()) - return SDValue(); + EVT NarrowVT = Narrow.getValueType(); if (Narrow->getOpcode() != ISD::XOR && Narrow->getOpcode() != ISD::AND && @@ -33026,12 +33041,12 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, return SDValue(); // The type of the truncated inputs. - EVT WideVT = N0->getOperand(0)->getValueType(0); - if (WideVT != VT) + if (N0->getOperand(0).getValueType() != VT) return SDValue(); // The right side has to be a 'trunc' or a constant vector. - bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE; + bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE && + N1.getOperand(0).getValueType() == VT; ConstantSDNode *RHSConstSplat = nullptr; if (auto *RHSBV = dyn_cast(N1)) RHSConstSplat = RHSBV->getConstantSplatNode(); @@ -33040,37 +33055,31 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT)) + if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT)) return SDValue(); // Set N0 and N1 to hold the inputs to the new wide operation. N0 = N0->getOperand(0); if (RHSConstSplat) { - N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(), + N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT.getVectorElementType(), SDValue(RHSConstSplat, 0)); - N1 = DAG.getSplatBuildVector(WideVT, DL, N1); + N1 = DAG.getSplatBuildVector(VT, DL, N1); } else if (RHSTrunc) { N1 = N1->getOperand(0); } // Generate the wide operation. 
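Just below, the hand-rolled AND mask for the zero-extend case is replaced with DAG.getZeroExtendInReg, which computes the same thing. As a scalar model:

#include <cstdint>

// Keep the low NarrowBits of X and clear everything above them.
uint64_t zext_in_reg_model(uint64_t X, unsigned NarrowBits) {
  uint64_t Mask = NarrowBits >= 64 ? ~0ULL : ((1ULL << NarrowBits) - 1);
  return X & Mask;
}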
- SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1); + SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1); unsigned Opcode = N->getOpcode(); switch (Opcode) { + default: llvm_unreachable("Unexpected opcode"); case ISD::ANY_EXTEND: return Op; - case ISD::ZERO_EXTEND: { - unsigned InBits = NarrowVT.getScalarSizeInBits(); - APInt Mask = APInt::getAllOnesValue(InBits); - Mask = Mask.zext(VT.getScalarSizeInBits()); - return DAG.getNode(ISD::AND, DL, VT, - Op, DAG.getConstant(Mask, DL, VT)); - } + case ISD::ZERO_EXTEND: + return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType()); case ISD::SIGN_EXTEND: return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, DAG.getValueType(NarrowVT)); - default: - llvm_unreachable("Unexpected opcode"); } } @@ -33882,16 +33891,6 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, if (!Subtarget.hasSSE2()) return SDValue(); - if (Subtarget.hasBWI()) { - if (VT.getSizeInBits() > 512) - return SDValue(); - } else if (Subtarget.hasAVX2()) { - if (VT.getSizeInBits() > 256) - return SDValue(); - } else { - if (VT.getSizeInBits() > 128) - return SDValue(); - } // Detect the following pattern: // @@ -33903,7 +33902,6 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, // %6 = trunc %5 to // // In AVX512, the last instruction can also be a trunc store. - if (In.getOpcode() != ISD::SRL) return SDValue(); @@ -33924,6 +33922,35 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, return true; }; + // Split vectors to legal target size and apply AVG. + auto LowerToAVG = [&](SDValue Op0, SDValue Op1) { + unsigned NumSubs = 1; + if (Subtarget.hasBWI()) { + if (VT.getSizeInBits() > 512) + NumSubs = VT.getSizeInBits() / 512; + } else if (Subtarget.hasAVX2()) { + if (VT.getSizeInBits() > 256) + NumSubs = VT.getSizeInBits() / 256; + } else { + if (VT.getSizeInBits() > 128) + NumSubs = VT.getSizeInBits() / 128; + } + + if (NumSubs == 1) + return DAG.getNode(X86ISD::AVG, DL, VT, Op0, Op1); + + SmallVector Subs; + EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), + VT.getVectorNumElements() / NumSubs); + for (unsigned i = 0; i != NumSubs; ++i) { + unsigned Idx = i * SubVT.getVectorNumElements(); + SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits()); + SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits()); + Subs.push_back(DAG.getNode(X86ISD::AVG, DL, SubVT, LHS, RHS)); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); + }; + // Check if each element of the vector is left-shifted by one. auto LHS = In.getOperand(0); auto RHS = In.getOperand(1); @@ -33947,8 +33974,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, SDValue VecOnes = DAG.getConstant(1, DL, InVT); Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes); Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); - return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), - Operands[1]); + return LowerToAVG(Operands[0].getOperand(0), Operands[1]); } if (Operands[0].getOpcode() == ISD::ADD) @@ -33972,8 +33998,7 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, return SDValue(); // The pattern is detected, emit X86ISD::AVG instruction. 
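Both returns in this hunk now route through the LowerToAVG lambda above, which is safe because averaging is independent per lane: an oversized vector can be split at the legal width, averaged piecewise, and concatenated. The per-lane operation, for reference:

#include <cstdint>

// X86ISD::AVG byte semantics: round-up average, computed without overflow.
uint8_t pavgb_lane_model(uint8_t A, uint8_t B) {
  return (uint8_t)(((unsigned)A + (unsigned)B + 1) >> 1);
}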
- return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), - Operands[1].getOperand(0)); + return LowerToAVG(Operands[0].getOperand(0), Operands[1].getOperand(0)); } return SDValue(); @@ -35872,14 +35897,8 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) return NewCMov; - if (!DCI.isBeforeLegalizeOps()) { - if (InVT == MVT::i1) { - SDValue Zero = DAG.getConstant(0, DL, VT); - SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); - return DAG.getSelect(DL, VT, N0, AllOnes, Zero); - } + if (!DCI.isBeforeLegalizeOps()) return SDValue(); - } if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR && isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) { @@ -35897,7 +35916,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; - if (Subtarget.hasAVX() && VT.is256BitVector()) + if (VT.isVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; @@ -36089,7 +36108,7 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; - if (VT.is256BitVector()) + if (VT.isVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; @@ -36244,39 +36263,54 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(N); - // Pre-shrink oversized index elements to avoid triggering scalarization. - if (DCI.isBeforeLegalize()) { + if (DCI.isBeforeLegalizeOps()) { SDValue Index = N->getOperand(4); - if (Index.getScalarValueSizeInBits() > 64) { - EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, + // Remove any sign extends from 32 or smaller to larger than 32. + // Only do this before LegalizeOps in case we need the sign extend for + // legalization. + if (Index.getOpcode() == ISD::SIGN_EXTEND) { + if (Index.getScalarValueSizeInBits() > 32 && + Index.getOperand(0).getScalarValueSizeInBits() <= 32) { + SmallVector NewOps(N->op_begin(), N->op_end()); + NewOps[4] = Index.getOperand(0); + DAG.UpdateNodeOperands(N, NewOps); + // The original sign extend has less users, add back to worklist in case + // it needs to be removed + DCI.AddToWorklist(Index.getNode()); + DCI.AddToWorklist(N); + return SDValue(N, 0); + } + } + + // Make sure the index is either i32 or i64 + unsigned ScalarSize = Index.getScalarValueSizeInBits(); + if (ScalarSize != 32 && ScalarSize != 64) { + MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32; + EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT, Index.getValueType().getVectorNumElements()); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index); + Index = DAG.getSExtOrTrunc(Index, DL, IndexVT); SmallVector NewOps(N->op_begin(), N->op_end()); - NewOps[4] = Trunc; + NewOps[4] = Index; DAG.UpdateNodeOperands(N, NewOps); DCI.AddToWorklist(N); return SDValue(N, 0); } - } - // Try to remove sign extends from i32 to i64 on the index. - // Only do this before legalize in case we are relying on it for - // legalization. - // TODO: We should maybe remove any sign extend once we learn how to sign - // extend narrow index during lowering. - if (DCI.isBeforeLegalizeOps()) { - SDValue Index = N->getOperand(4); - if (Index.getScalarValueSizeInBits() == 64 && - Index.getOpcode() == ISD::SIGN_EXTEND && + // Try to remove zero extends from 32->64 if we know the sign bit of + // the input is zero. 
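The justification for the transform that follows: when a 32-bit index's sign bit is known zero, sign extension and zero extension to 64 bits agree, so the zero extend can be peeled off just like the sign extend above it. Compactly:

#include <cassert>
#include <cstdint>

void extends_agree(int32_t V) {
  assert(V >= 0 && "sign bit must be known zero");
  assert((int64_t)V == (int64_t)(uint32_t)V); // sext == zext when MSB is 0
}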
+ if (Index.getOpcode() == ISD::ZERO_EXTEND &&
+ Index.getScalarValueSizeInBits() == 64 &&
 Index.getOperand(0).getScalarValueSizeInBits() == 32) {
- SmallVector NewOps(N->op_begin(), N->op_end());
- NewOps[4] = Index.getOperand(0);
- DAG.UpdateNodeOperands(N, NewOps);
- // The original sign extend has less users, add back to worklist in case
- // it needs to be removed.
- DCI.AddToWorklist(Index.getNode());
- DCI.AddToWorklist(N);
- return SDValue(N, 0);
+ if (DAG.SignBitIsZero(Index.getOperand(0))) {
+ SmallVector NewOps(N->op_begin(), N->op_end());
+ NewOps[4] = Index.getOperand(0);
+ DAG.UpdateNodeOperands(N, NewOps);
+ // The original zero extend has fewer users; add it back to the worklist
+ // in case it needs to be removed.
+ DCI.AddToWorklist(Index.getNode());
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
 }
 }
@@ -36288,6 +36322,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
 SmallVector NewOps(N->op_begin(), N->op_end());
 NewOps[2] = Mask.getOperand(0);
 DAG.UpdateNodeOperands(N, NewOps);
+ return SDValue(N, 0);
 }
 // With AVX2 we only demand the upper bit of the mask.
@@ -36356,7 +36391,7 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
 EVT VT = N->getValueType(0);
 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
- VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
+ VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
 return SDValue();
 // Now check that the other operand of the AND is a constant. We could
diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td
index 2acd8d17beb2..0d30b7d47f3e 100644
--- a/lib/Target/X86/X86Instr3DNow.td
+++ b/lib/Target/X86/X86Instr3DNow.td
@@ -116,14 +116,30 @@ defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", I3DNOW_MISC_FUNC_ITINS, 1>;
 def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
 [(int_x86_mmx_femms)], IIC_MMX_EMMS>;
+// If PREFETCHWT1 is supported, we want to use it for everything but T0.
+def PrefetchWLevel : PatFrag<(ops), (i32 imm), [{
+ return N->getSExtValue() == 3 || !Subtarget->hasPREFETCHWT1();
+}]>;
+
+// Use PREFETCHWT1 for NTA, T2, T1.
+def PrefetchWT1Level : ImmLeaf; + let SchedRW = [WriteLoad] in { +let Predicates = [Has3DNow, NoSSEPrefetch] in def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr), "prefetch\t$addr", - [(prefetch addr:$addr, (i32 0), imm, (i32 1))], + [(prefetch addr:$addr, imm, imm, (i32 1))], IIC_SSE_PREFETCH>; + def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr", - [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))], + [(prefetch addr:$addr, (i32 1), (i32 PrefetchWLevel), (i32 1))], IIC_SSE_PREFETCH>, TB, Requires<[HasPrefetchW]>; + +def PREFETCHWT1 : I<0x0D, MRM2m, (outs), (ins i8mem:$addr), "prefetchwt1\t$addr", + [(prefetch addr:$addr, (i32 1), (i32 PrefetchWT1Level), (i32 1))], + IIC_SSE_PREFETCH>, TB, Requires<[HasPREFETCHWT1]>; } // "3DNowA" instructions diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 2a6ed02fadab..0b266e5591b4 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -349,8 +349,9 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{54} = hasEVEX_RC; } -class PseudoI pattern> - : X86Inst<0, Pseudo, NoImm, oops, iops, "", NoItinerary> { +class PseudoI pattern, + InstrItinClass itin = NoItinerary> + : X86Inst<0, Pseudo, NoImm, oops, iops, "", itin> { let Pattern = pattern; } @@ -423,9 +424,8 @@ class FPI o, Format F, dag outs, dag ins, string asm, // FpI_ - Floating Point Pseudo Instruction template. Not Predicated. class FpI_ pattern, InstrItinClass itin = NoItinerary> - : X86Inst<0, Pseudo, NoImm, outs, ins, "", itin> { + : PseudoI { let FPForm = fp; - let Pattern = pattern; } // Templates for instructions that use a 16- or 32-bit segmented address as diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 42e89cb4831d..fdf3e73e4fcd 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -874,7 +874,10 @@ def HasADX : Predicate<"Subtarget->hasADX()">; def HasSHA : Predicate<"Subtarget->hasSHA()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; +def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">; +def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">; def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; +def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">; def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">; def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index a86a0bfc168d..b48fa1841979 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -3487,7 +3487,7 @@ let Predicates = [UseSSE2] in { //===----------------------------------------------------------------------===// // Prefetch intrinsic. -let Predicates = [HasSSE1], SchedRW = [WriteLoad] in { +let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in { def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src), "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))], IIC_SSE_PREFETCH>, TB; diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 1e04997ad294..e131f1a1e4bd 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -89,8 +89,9 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( // Check to see if there is a specialized entry-point for memory zeroing. 
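Taken together, PrefetchWLevel and PrefetchWT1Level route the four locality levels of the generic prefetch intrinsic onto the two write-prefetch instructions: when PREFETCHWT1 is available it serves NTA/T2/T1 (locality 0 through 2) while PREFETCHW keeps T0 (locality 3); without PREFETCHWT1, PREFETCHW serves every level. A hypothetical helper, not part of the patch, restating that selection in C++:

// Which write prefetch is emitted for @llvm.prefetch(addr, /*rw=*/1,
// locality, /*cache=*/1)? Mirrors the PatFrag/ImmLeaf predicates above.
const char *selectWritePrefetch(int Locality, bool HasPREFETCHWT1) {
  if (Locality == 3 || !HasPREFETCHWT1)
    return "prefetchw";     // PrefetchWLevel matches
  return "prefetchwt1";     // PrefetchWT1Level: NTA, T2, T1
}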
ConstantSDNode *ValC = dyn_cast(Val); - if (const char *bzeroEntry = ValC && - ValC->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) { + if (const char *bzeroName = (ValC && ValC->isNullValue()) + ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO) + : nullptr) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); @@ -106,7 +107,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( CLI.setDebugLoc(dl) .setChain(Chain) .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroEntry, IntPtr), + DAG.getExternalSymbol(bzeroName, IntPtr), std::move(Args)) .setDiscardResult(); diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 8b08766b6171..ad023623142f 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -174,28 +174,6 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV, return X86II::MO_NO_FLAG; } -/// This function returns the name of a function which has an interface like -/// the non-standard bzero function, if such a function exists on the -/// current subtarget and it is considered preferable over memset with zero -/// passed as the second argument. Otherwise it returns null. -const char *X86Subtarget::getBZeroEntry() const { - // Darwin 10 has a __bzero entry point for this purpose. - if (getTargetTriple().isMacOSX() && - !getTargetTriple().isMacOSXVersionLT(10, 6)) - return "__bzero"; - - return nullptr; -} - -bool X86Subtarget::hasSinCos() const { - if (getTargetTriple().isMacOSX()) { - return !getTargetTriple().isMacOSXVersionLT(10, 9) && is64Bit(); - } else if (getTargetTriple().isOSFuchsia()) { - return true; - } - return false; -} - /// Return true if the subtarget allows calls to immediate address. bool X86Subtarget::isLegalToCallImmediateAddr() const { // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32 @@ -324,6 +302,7 @@ void X86Subtarget::initializeEnvironment() { HasVNNI = false; HasBITALG = false; HasSHA = false; + HasPREFETCHWT1 = false; HasPRFCHW = false; HasRDSEED = false; HasLAHFSAHF = false; @@ -342,6 +321,7 @@ void X86Subtarget::initializeEnvironment() { HasSSEUnalignedMem = false; HasCmpxchg16b = false; UseLeaForSP = false; + HasFastVariableShuffle = false; HasFastPartialYMMorZMMWrite = false; HasFastGather = false; HasFastScalarFSQRT = false; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index be4d46c470de..c9435890fc1f 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -201,7 +201,7 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool HasCLZERO; /// Processor has Prefetch with intent to Write instruction - bool HasPFPREFETCHWT1; + bool HasPREFETCHWT1; /// True if SHLD instructions are slow. bool IsSHLDSlow; @@ -228,6 +228,10 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// the stack pointer. This is an optimization for Intel Atom processors. bool UseLeaForSP; + /// True if its preferable to combine to a single shuffle using a variable + /// mask over multiple fixed shuffles. + bool HasFastVariableShuffle; + /// True if there is no performance penalty to writing only the lower parts /// of a YMM or ZMM register without clearing the upper part. 
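The memset hunk above switches from the ad-hoc X86Subtarget::getBZeroEntry() hook to the generic RTLIB::BZERO libcall (the new entry in RuntimeLibcalls.def). Under the new scheme a target advertises the routine once; a sketch of how the deleted Darwin check would be expressed, assuming it lands in the target's TargetLowering constructor:

// Sketch: register __bzero via the RTLIB table instead of a subtarget hook,
// preserving the Darwin 10.6+ condition from the removed getBZeroEntry().
if (Subtarget.getTargetTriple().isMacOSX() &&
    !Subtarget.getTargetTriple().isMacOSXVersionLT(10, 6))
  setLibcallName(RTLIB::BZERO, "__bzero");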
bool HasFastPartialYMMorZMMWrite; @@ -513,7 +517,14 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool hasRTM() const { return HasRTM; } bool hasADX() const { return HasADX; } bool hasSHA() const { return HasSHA; } - bool hasPRFCHW() const { return HasPRFCHW; } + bool hasPRFCHW() const { return HasPRFCHW || HasPREFETCHWT1; } + bool hasPREFETCHWT1() const { return HasPREFETCHWT1; } + bool hasSSEPrefetch() const { + // We implicitly enable these when we have a write prefix supporting cache + // level OR if we have prfchw, but don't already have a read prefetch from + // 3dnow. + return hasSSE1() || (hasPRFCHW() && !has3DNow()) || hasPREFETCHWT1(); + } bool hasRDSEED() const { return HasRDSEED; } bool hasLAHFSAHF() const { return HasLAHFSAHF; } bool hasMWAITX() const { return HasMWAITX; } @@ -527,6 +538,9 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } bool useLeaForSP() const { return UseLeaForSP; } + bool hasFastVariableShuffle() const { + return HasFastVariableShuffle; + } bool hasFastPartialYMMorZMMWrite() const { return HasFastPartialYMMorZMMWrite; } @@ -682,17 +696,6 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// Return true if the subtarget allows calls to immediate address. bool isLegalToCallImmediateAddr() const; - /// This function returns the name of a function which has an interface - /// like the non-standard bzero function, if such a function exists on - /// the current subtarget and it is considered prefereable over - /// memset with zero passed as the second argument. Otherwise it - /// returns null. - const char *getBZeroEntry() const; - - /// This function returns true if the target has sincos() routine in its - /// compiler runtime or math libraries. - bool hasSinCos() const; - /// Enable the MachineScheduler pass for all X86 subtargets. bool enableMachineScheduler() const override { return true; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index ea8c9862230e..e95e6ecae091 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -281,10 +281,9 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden, // X86 TTI query. //===----------------------------------------------------------------------===// -TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo(X86TTIImpl(this, F)); - }); +TargetTransformInfo +X86TargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(X86TTIImpl(this, F)); } //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 952bd1321ff9..5b21cd82b5b1 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -45,7 +45,7 @@ class X86TargetMachine final : public LLVMTargetMachine { // attributes of each function. const X86Subtarget *getSubtargetImpl() const = delete; - TargetIRAnalysis getTargetIRAnalysis() override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; // Set up the pass pipeline. 
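The X86 hunk above and the XCore hunk below apply the same mechanical rewrite: getTargetIRAnalysis(), which wrapped TTI construction in a callback, gives way to a direct getTargetTransformInfo(const Function &) override. For any target the new hook reduces to this shape (hypothetical MyTarget names):

// New-style hook: return the target's TTI result for a function directly;
// the generic TargetMachine now builds the TargetIRAnalysis wrapper itself.
TargetTransformInfo
MyTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(MyTTIImpl(this, F));
}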
TargetPassConfig *createPassConfig(PassManagerBase &PM) override; diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp index 3aa7187e0cd1..38925bfd51b0 100644 --- a/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/lib/Target/XCore/XCoreTargetMachine.cpp @@ -108,8 +108,7 @@ extern "C" void LLVMInitializeXCoreTarget() { RegisterTargetMachine X(getTheXCoreTarget()); } -TargetIRAnalysis XCoreTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo(XCoreTTIImpl(this, F)); - }); +TargetTransformInfo +XCoreTargetMachine::getTargetTransformInfo(const Function &F) { + return TargetTransformInfo(XCoreTTIImpl(this, F)); } diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h index 5baa3524d2a6..965b9b2c4d65 100644 --- a/lib/Target/XCore/XCoreTargetMachine.h +++ b/lib/Target/XCore/XCoreTargetMachine.h @@ -43,7 +43,7 @@ class XCoreTargetMachine : public LLVMTargetMachine { // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - TargetIRAnalysis getTargetIRAnalysis() override; + TargetTransformInfo getTargetTransformInfo(const Function &F) override; TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 683655f1f68b..a9cfd8ded6fb 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -710,7 +710,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) { // Check if there is PGO data or user annoated branch data: static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { - if (F->getEntryCount()) + if (F->hasProfileData()) return true; // Now check if any of the entry block has MD_prof data: for (auto *E : OI->Entries) { @@ -863,6 +863,7 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) { case Instruction::GetElementPtr: if (cast(I)->hasAllZeroIndices()) continue; + break; default: break; } @@ -1273,7 +1274,7 @@ std::pair PartialInlinerImpl::unswitchFunction(Function *F) { // Only try to outline cold regions if we have a profile summary, which // implies we have profiling information. - if (PSI->hasProfileSummary() && F->getEntryCount().hasValue() && + if (PSI->hasProfileSummary() && F->hasProfileData() && !DisableMultiRegionPartialInline) { std::unique_ptr OMRI = computeOutliningColdRegionsInfo(F); @@ -1379,10 +1380,10 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { Cloner.ClonedFunc->user_end()); DenseMap CallSiteToProfCountMap; - if (Cloner.OrigFunc->getEntryCount()) + auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount(); + if (CalleeEntryCount) computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); - auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount(); uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0); bool AnyInline = false; diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index f0e781b9d923..7086c2eb52c4 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -1583,7 +1583,10 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { } bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { - F.setEntryCount(0); + // Initialize the entry count to -1, which will be treated conservatively + // by getEntryCount as the same as unknown (None). 
If we have samples this + // will be overwritten in emitAnnotations. + F.setEntryCount(-1); std::unique_ptr OwnedORE; if (AM) { auto &FAM = diff --git a/lib/Transforms/IPO/WholeProgramDevirt.cpp b/lib/Transforms/IPO/WholeProgramDevirt.cpp index ec56f0cde25d..5fbb001216a3 100644 --- a/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -1346,6 +1346,7 @@ void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) { Constant *Bit = importConstant(Slot, CSByConstantArg.first, "bit", Int8Ty, ResByArg.Bit); applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit); + break; } default: break; diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index aa055121e710..a088d447337f 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4394,6 +4394,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, cast(Caller)->getCallingConv()); cast(NewCaller)->setAttributes(NewPAL); } + NewCaller->setDebugLoc(Caller->getDebugLoc()); return NewCaller; } diff --git a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 2a25423e04bd..8e2833d22032 100644 --- a/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -80,6 +80,11 @@ static cl::opt ClInstrumentAtomics( cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden, cl::init(true)); +static cl::opt ClRecover( + "hwasan-recover", + cl::desc("Enable recovery mode (continue-after-error)."), + cl::Hidden, cl::init(false)); + namespace { /// \brief An instrumentation pass implementing detection of addressability bugs @@ -89,7 +94,8 @@ class HWAddressSanitizer : public FunctionPass { // Pass identification, replacement for typeid. static char ID; - HWAddressSanitizer() : FunctionPass(ID) {} + HWAddressSanitizer(bool Recover = false) + : FunctionPass(ID), Recover(Recover || ClRecover) {} StringRef getPassName() const override { return "HWAddressSanitizer"; } @@ -109,6 +115,8 @@ class HWAddressSanitizer : public FunctionPass { LLVMContext *C; Type *IntptrTy; + bool Recover; + Function *HwasanCtorFunction; Function *HwasanMemoryAccessCallback[2][kNumberOfAccessSizes]; @@ -126,8 +134,8 @@ INITIALIZE_PASS_END( HWAddressSanitizer, "hwasan", "HWAddressSanitizer: detect memory bugs using tagged addressing.", false, false) -FunctionPass *llvm::createHWAddressSanitizerPass() { - return new HWAddressSanitizer(); +FunctionPass *llvm::createHWAddressSanitizerPass(bool Recover) { + return new HWAddressSanitizer(Recover); } /// \brief Module-level initialization. @@ -156,10 +164,11 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) { IRBuilder<> IRB(*C); for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) { const std::string TypeStr = AccessIsWrite ? "store" : "load"; + const std::string EndingStr = Recover ? 
"_noabort" : ""; HwasanMemoryAccessCallbackSized[AccessIsWrite] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( - ClMemoryAccessCallbackPrefix + TypeStr, + ClMemoryAccessCallbackPrefix + TypeStr + EndingStr, FunctionType::get(IRB.getVoidTy(), {IntptrTy, IntptrTy}, false))); for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; @@ -167,7 +176,7 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) { HwasanMemoryAccessCallback[AccessIsWrite][AccessSizeIndex] = checkSanitizerInterfaceFunction(M.getOrInsertFunction( ClMemoryAccessCallbackPrefix + TypeStr + - itostr(1ULL << AccessSizeIndex), + itostr(1ULL << AccessSizeIndex) + EndingStr, FunctionType::get(IRB.getVoidTy(), {IntptrTy}, false))); } } @@ -246,14 +255,16 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *PtrLong, bool IsWrite, Value *TagMismatch = IRB.CreateICmpNE(PtrTag, MemTag); TerminatorInst *CheckTerm = - SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, false, + SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, !Recover, MDBuilder(*C).createBranchWeights(1, 100000)); IRB.SetInsertPoint(CheckTerm); // The signal handler will find the data address in x0. InlineAsm *Asm = InlineAsm::get( FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false), - "hlt #" + itostr(0x100 + IsWrite * 0x10 + AccessSizeIndex), "{x0}", + "hlt #" + + itostr(0x100 + Recover * 0x20 + IsWrite * 0x10 + AccessSizeIndex), + "{x0}", /*hasSideEffects=*/true); IRB.CreateCall(Asm, PtrLong); } diff --git a/lib/Transforms/Scalar/CallSiteSplitting.cpp b/lib/Transforms/Scalar/CallSiteSplitting.cpp index d8c408035038..207243231aad 100644 --- a/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -13,10 +13,11 @@ // threading, or IPA-CP based function cloning, etc.). // As of now we support two cases : // -// 1) If a call site is dominated by an OR condition and if any of its arguments -// are predicated on this OR condition, try to split the condition with more -// constrained arguments. For example, in the code below, we try to split the -// call site since we can predicate the argument(ptr) based on the OR condition. +// 1) Try to a split call-site with constrained arguments, if any constraints +// on any argument can be found by following the single predecessors of the +// all site's predecessors. Currently this pass only handles call-sites with 2 +// predecessors. For example, in the code below, we try to split the call-site +// since we can predicate the argument(ptr) based on the OR condition. // // Split from : // if (!ptr || c) @@ -200,16 +201,15 @@ static bool canSplitCallSite(CallSite CS) { } /// Return true if the CS is split into its new predecessors which are directly -/// hooked to each of its orignial predecessors pointed by PredBB1 and PredBB2. -/// In OR predicated case, PredBB1 will point the header, and PredBB2 will point -/// to the second compare block. CallInst1 and CallInst2 will be the new -/// call-sites placed in the new predecessors split for PredBB1 and PredBB2, -/// repectively. Therefore, CallInst1 will be the call-site placed -/// between Header and Tail, and CallInst2 will be the call-site between TBB and -/// Tail. For example, in the IR below with an OR condition, the call-site can -/// be split +/// hooked to each of its original predecessors pointed by PredBB1 and PredBB2. +/// CallInst1 and CallInst2 will be the new call-sites placed in the new +/// predecessors split for PredBB1 and PredBB2, respectively. 
+/// For example, in the IR below with an OR condition, the call-site can +/// be split. Assuming PredBB1=Header and PredBB2=TBB, CallInst1 will be the +/// call-site placed between Header and Tail, and CallInst2 will be the +/// call-site between TBB and Tail. /// -/// from : +/// From : /// /// Header: /// %c = icmp eq i32* %a, null @@ -237,9 +237,9 @@ static bool canSplitCallSite(CallSite CS) { /// Tail: /// %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2] /// -/// Note that for an OR predicated case, CallInst1 and CallInst2 should be -/// created with more constrained arguments in -/// createCallSitesOnOrPredicatedArgument(). +/// Note that in case any arguments at the call-site are constrained by its +/// predecessors, new call-sites with more constrained arguments will be +/// created in createCallSitesOnPredicatedArgument(). static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2, Instruction *CallInst1, Instruction *CallInst2) { Instruction *Instr = CS.getInstruction(); @@ -332,18 +332,10 @@ static bool tryToSplitOnPHIPredicatedArgument(CallSite CS) { splitCallSite(CS, Preds[0], Preds[1], nullptr, nullptr); return true; } -// Check if one of the predecessors is a single predecessors of the other. -// This is a requirement for control flow modeling an OR. HeaderBB points to -// the single predecessor and OrBB points to other node. HeaderBB potentially -// contains the first compare of the OR and OrBB the second. -static bool isOrHeader(BasicBlock *HeaderBB, BasicBlock *OrBB) { - return OrBB->getSinglePredecessor() == HeaderBB && - HeaderBB->getTerminator()->getNumSuccessors() == 2; -} -static bool tryToSplitOnOrPredicatedArgument(CallSite CS) { +static bool tryToSplitOnPredicatedArgument(CallSite CS) { auto Preds = getTwoPredecessors(CS.getInstruction()->getParent()); - if (!isOrHeader(Preds[0], Preds[1]) && !isOrHeader(Preds[1], Preds[0])) + if (Preds[0] == Preds[1]) return false; SmallVector, 2> C1, C2; @@ -362,7 +354,7 @@ static bool tryToSplitOnOrPredicatedArgument(CallSite CS) { static bool tryToSplitCallSite(CallSite CS) { if (!CS.arg_size() || !canSplitCallSite(CS)) return false; - return tryToSplitOnOrPredicatedArgument(CS) || + return tryToSplitOnPredicatedArgument(CS) || tryToSplitOnPHIPredicatedArgument(CS); } diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 6b0377e0ecb3..1476f7850cf0 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -282,7 +282,7 @@ bool JumpThreading::runOnFunction(Function &F) { auto AA = &getAnalysis().getAAResults(); std::unique_ptr BFI; std::unique_ptr BPI; - bool HasProfileData = F.getEntryCount().hasValue(); + bool HasProfileData = F.hasProfileData(); if (HasProfileData) { LoopInfo LI{DominatorTree(F)}; BPI.reset(new BranchProbabilityInfo(F, LI, TLI)); @@ -307,8 +307,7 @@ PreservedAnalyses JumpThreadingPass::run(Function &F, std::unique_ptr BFI; std::unique_ptr BPI; - bool HasProfileData = F.getEntryCount().hasValue(); - if (HasProfileData) { + if (F.hasProfileData()) { LoopInfo LI{DominatorTree(F)}; BPI.reset(new BranchProbabilityInfo(F, LI, &TLI)); BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); @@ -1333,6 +1332,20 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // code size. BasicBlock *UnavailablePred = nullptr; + // If the value is unavailable in one of predecessors, we will end up + // inserting a new instruction into them. 
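The JumpThreading changes above are among several hunks (PartialInlining, LoopSink, LoopUnrollPass, LoopUnrollPeel) that replace raw getEntryCount() checks with the new Function::hasProfileData(). Its intended semantics, sketched from how these call sites use it rather than quoted from Function.h:

// Sketch (assumption): "was this function compiled with profile data?",
// where getEntryCount() already reports None for the -1 "unknown" marker
// that SampleProfileLoader now writes (see the hunk further up).
bool hasProfileData(const Function &F) {
  return F.getEntryCount().hasValue();
}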
It is only valid if all the + // instructions before LI are guaranteed to pass execution to its successor, + // or if LI is safe to speculate. + // TODO: If this logic becomes more complex, and we will perform PRE insertion + // farther than to a predecessor, we need to reuse the code from GVN's PRE. + // It requires domination tree analysis, so for this simple case it is an + // overkill. + if (PredsScanned.size() != AvailablePreds.size() && + !isSafeToSpeculativelyExecute(LI)) + for (auto I = LoadBB->begin(); &*I != LI; ++I) + if (!isGuaranteedToTransferExecutionToSuccessor(&*I)) + return false; + // If there is exactly one predecessor where the value is unavailable, the // already computed 'OneUnavailablePred' block is it. If it ends in an // unconditional branch, we know that it isn't a critical edge. diff --git a/lib/Transforms/Scalar/LoopSink.cpp b/lib/Transforms/Scalar/LoopSink.cpp index c9d55b4594fe..430a7085d93f 100644 --- a/lib/Transforms/Scalar/LoopSink.cpp +++ b/lib/Transforms/Scalar/LoopSink.cpp @@ -247,7 +247,7 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, // Enable LoopSink only when runtime profile is available. // With static profile, the sinking decision may be sub-optimal. - if (!Preheader->getParent()->getEntryCount()) + if (!Preheader->getParent()->hasProfileData()) return false; const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader); diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index 7b1d6446a24a..15e7da5e1a7a 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -882,7 +882,7 @@ static bool computeUnrollCount( } // Check if the runtime trip count is too small when profile is available. - if (L->getHeader()->getParent()->getEntryCount()) { + if (L->getHeader()->getParent()->hasProfileData()) { if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) { if (*ProfileTripCount < FlatLoopTripCountThreshold) return false; diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 9c870b42a747..6af3fef963dc 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -476,22 +476,33 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, Alignment = DL.getABITypeAlignment(EltType); } - AMemSet = - Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); + // Remember the debug location. + DebugLoc Loc; + if (!Range.TheStores.empty()) + Loc = Range.TheStores[0]->getDebugLoc(); DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI : Range.TheStores) - dbgs() << *SI << '\n'; - dbgs() << "With: " << *AMemSet << '\n'); - - if (!Range.TheStores.empty()) - AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc()); + dbgs() << *SI << '\n'); // Zap all the stores. for (Instruction *SI : Range.TheStores) { MD->removeInstruction(SI); SI->eraseFromParent(); } + + // Create the memset after removing the stores, so that if there any cached + // non-local dependencies on the removed instructions in + // MemoryDependenceAnalysis, the cache entries are updated to "dirty" + // entries pointing below the memset, so subsequent queries include the + // memset. 
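Two hunks below extend MemCpyOpt's dependency walks across block boundaries with the new getNonLocalPointerDependencyFrom. The recurring pattern, sketched (the SmallVector element type and inline size are assumptions; the diff's template arguments were stripped in extraction):

// When the block-local query answers "non-local", ask for the non-local
// dependencies and accept them only if there is exactly one result.
if (SourceDep.isNonLocal()) {
  SmallVector<NonLocalDepResult, 2> Results;
  MD->getNonLocalPointerDependencyFrom(M, SourceLoc, /*isLoad=*/false,
                                       Results);
  if (Results.size() == 1)
    SourceDep = Results[0].getResult();  // must dominate M, per the assert
}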
+ AMemSet = + Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); + if (!Range.TheStores.empty()) + AMemSet->setDebugLoc(Loc); + + DEBUG(dbgs() << "With: " << *AMemSet << '\n'); + ++NumMemSetInfer; } @@ -1031,9 +1042,22 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // // NOTE: This is conservative, it will stop on any read from the source loc, // not just the defining memcpy. - MemDepResult SourceDep = - MD->getPointerDependencyFrom(MemoryLocation::getForSource(MDep), false, - M->getIterator(), M->getParent()); + MemoryLocation SourceLoc = MemoryLocation::getForSource(MDep); + MemDepResult SourceDep = MD->getPointerDependencyFrom(SourceLoc, false, + M->getIterator(), M->getParent()); + + if (SourceDep.isNonLocal()) { + SmallVector NonLocalDepResults; + MD->getNonLocalPointerDependencyFrom(M, SourceLoc, /*isLoad=*/false, + NonLocalDepResults); + if (NonLocalDepResults.size() == 1) { + SourceDep = NonLocalDepResults[0].getResult(); + assert((!SourceDep.getInst() || + LookupDomTree().dominates(SourceDep.getInst(), M)) && + "when memdep returns exactly one result, it should dominate"); + } + } + if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; @@ -1235,6 +1259,18 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) { MemDepResult SrcDepInfo = MD->getPointerDependencyFrom( SrcLoc, true, M->getIterator(), M->getParent()); + if (SrcDepInfo.isNonLocal()) { + SmallVector NonLocalDepResults; + MD->getNonLocalPointerDependencyFrom(M, SrcLoc, /*isLoad=*/true, + NonLocalDepResults); + if (NonLocalDepResults.size() == 1) { + SrcDepInfo = NonLocalDepResults[0].getResult(); + assert((!SrcDepInfo.getInst() || + LookupDomTree().dominates(SrcDepInfo.getInst(), M)) && + "when memdep returns exactly one result, it should dominate"); + } + } + if (SrcDepInfo.isClobber()) { if (MemCpyInst *MDep = dyn_cast(SrcDepInfo.getInst())) return processMemCpyMemCpyDependence(M, MDep); diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index e5866b4718da..66608ec631f6 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -1929,9 +1929,32 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, if (!I) continue; bool Folded = ConstantFoldTerminator(I->getParent()); - assert(Folded && - "Expect TermInst on constantint or blockaddress to be folded"); - (void) Folded; + if (!Folded) { + // The constant folder may not have been able to fold the terminator + // if this is a branch or switch on undef. Fold it manually as a + // branch to the first successor. +#ifndef NDEBUG + if (auto *BI = dyn_cast(I)) { + assert(BI->isConditional() && isa(BI->getCondition()) && + "Branch should be foldable!"); + } else if (auto *SI = dyn_cast(I)) { + assert(isa(SI->getCondition()) && "Switch should fold"); + } else { + llvm_unreachable("Didn't fold away reference to block!"); + } +#endif + + // Make this an uncond branch to the first successor. + TerminatorInst *TI = I->getParent()->getTerminator(); + BranchInst::Create(TI->getSuccessor(0), TI); + + // Remove entries in successor phi nodes to remove edges. + for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i) + TI->getSuccessor(i)->removePredecessor(TI->getParent()); + + // Remove the old terminator. + TI->eraseFromParent(); + } } // Finally, delete the basic block. 
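Folding to successor 0 in the SCCP hunk above is sound precisely because the branch or switch condition is undef: the program may take any successor, so the transform is free to commit to the first one, as long as phi nodes on the abandoned edges are cleaned up. Condensed from the hunk:

// Commit a branch/switch on undef to successor 0 and unhook the other
// edges from their phi nodes before erasing the old terminator.
TerminatorInst *TI = I->getParent()->getTerminator();
BranchInst::Create(TI->getSuccessor(0), TI);            // insert before TI
for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
  TI->getSuccessor(i)->removePredecessor(TI->getParent());
TI->eraseFromParent();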
diff --git a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 209821ff21d7..8fa9ffb6d014 100644 --- a/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -97,7 +97,7 @@ // load %p2 // ... // -// We can not do CSE for to the common part related to index "i64 %i". Lowering +// We can not do CSE to the common part related to index "i64 %i". Lowering // GEPs can achieve such goals. // If the target does not use alias analysis in codegen, this pass will // lower a GEP with multiple indices into arithmetic operations: diff --git a/lib/Transforms/Utils/CallPromotionUtils.cpp b/lib/Transforms/Utils/CallPromotionUtils.cpp index eb3139ce4293..8825f77555e7 100644 --- a/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -23,10 +23,30 @@ using namespace llvm; /// Fix-up phi nodes in an invoke instruction's normal destination. /// /// After versioning an invoke instruction, values coming from the original -/// block will now either be coming from the original block or the "else" block. +/// block will now be coming from the "merge" block. For example, in the code +/// below: +/// +/// then_bb: +/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +/// else_bb: +/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +/// merge_bb: +/// %t2 = phi i32 [ %t0, %then_bb ], [ %t1, %else_bb ] +/// br %normal_dst +/// +/// normal_dst: +/// %t3 = phi i32 [ %x, %orig_bb ], ... +/// +/// "orig_bb" is no longer a predecessor of "normal_dst", so the phi nodes in +/// "normal_dst" must be fixed to refer to "merge_bb": +/// +/// normal_dst: +/// %t3 = phi i32 [ %x, %merge_bb ], ... +/// static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock, - BasicBlock *ElseBlock, - Instruction *NewInst) { + BasicBlock *MergeBlock) { for (auto &I : *Invoke->getNormalDest()) { auto *Phi = dyn_cast(&I); if (!Phi) @@ -34,13 +54,7 @@ static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock, int Idx = Phi->getBasicBlockIndex(OrigBlock); if (Idx == -1) continue; - Value *V = Phi->getIncomingValue(Idx); - if (dyn_cast(V) == Invoke) { - Phi->setIncomingBlock(Idx, ElseBlock); - Phi->addIncoming(NewInst, OrigBlock); - continue; - } - Phi->addIncoming(V, ElseBlock); + Phi->setIncomingBlock(Idx, MergeBlock); } } @@ -48,6 +62,23 @@ static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock, /// /// After versioning an invoke instruction, values coming from the original /// block will now be coming from either the "then" block or the "else" block. +/// For example, in the code below: +/// +/// then_bb: +/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +/// else_bb: +/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +/// unwind_dst: +/// %t3 = phi i32 [ %x, %orig_bb ], ... +/// +/// "orig_bb" is no longer a predecessor of "unwind_dst", so the phi nodes in +/// "unwind_dst" must be fixed to refer to "then_bb" and "else_bb": +/// +/// unwind_dst: +/// %t3 = phi i32 [ %x, %then_bb ], [ %x, %else_bb ], ... 
+/// static void fixupPHINodeForUnwindDest(InvokeInst *Invoke, BasicBlock *OrigBlock, BasicBlock *ThenBlock, BasicBlock *ElseBlock) { @@ -64,44 +95,26 @@ static void fixupPHINodeForUnwindDest(InvokeInst *Invoke, BasicBlock *OrigBlock, } } -/// Get the phi node having the returned value of a call or invoke instruction -/// as it's operand. -static bool getRetPhiNode(Instruction *Inst, BasicBlock *Block) { - BasicBlock *FromBlock = Inst->getParent(); - for (auto &I : *Block) { - PHINode *PHI = dyn_cast(&I); - if (!PHI) - break; - int Idx = PHI->getBasicBlockIndex(FromBlock); - if (Idx == -1) - continue; - auto *V = PHI->getIncomingValue(Idx); - if (V == Inst) - return true; - } - return false; -} - /// Create a phi node for the returned value of a call or invoke instruction. /// /// After versioning a call or invoke instruction that returns a value, we have /// to merge the value of the original and new instructions. We do this by /// creating a phi node and replacing uses of the original instruction with this /// phi node. -static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst) { +/// +/// For example, if \p OrigInst is defined in "else_bb" and \p NewInst is +/// defined in "then_bb", we create the following phi node: +/// +/// ; Uses of the original instruction are replaced by uses of the phi node. +/// %t0 = phi i32 [ %orig_inst, %else_bb ], [ %new_inst, %then_bb ], +/// +static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst, + BasicBlock *MergeBlock, IRBuilder<> &Builder) { if (OrigInst->getType()->isVoidTy() || OrigInst->use_empty()) return; - BasicBlock *RetValBB = NewInst->getParent(); - if (auto *Invoke = dyn_cast(NewInst)) - RetValBB = Invoke->getNormalDest(); - BasicBlock *PhiBB = RetValBB->getSingleSuccessor(); - - if (getRetPhiNode(OrigInst, PhiBB)) - return; - - IRBuilder<> Builder(&PhiBB->front()); + Builder.SetInsertPoint(&MergeBlock->front()); PHINode *Phi = Builder.CreatePHI(OrigInst->getType(), 0); SmallVector UsersToUpdate; for (User *U : OrigInst->users()) @@ -109,7 +122,7 @@ static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst) { for (User *U : UsersToUpdate) U->replaceUsesOfWith(OrigInst, Phi); Phi->addIncoming(OrigInst, OrigInst->getParent()); - Phi->addIncoming(NewInst, RetValBB); + Phi->addIncoming(NewInst, NewInst->getParent()); } /// Cast a call or invoke instruction to the given type. @@ -118,7 +131,41 @@ static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst) { /// that of the callee. If this is the case, we have to cast the returned value /// to the correct type. The location of the cast depends on if we have a call /// or invoke instruction. -Instruction *createRetBitCast(CallSite CS, Type *RetTy) { +/// +/// For example, if the call instruction below requires a bitcast after +/// promotion: +/// +/// orig_bb: +/// %t0 = call i32 @func() +/// ... +/// +/// The bitcast is placed after the call instruction: +/// +/// orig_bb: +/// ; Uses of the original return value are replaced by uses of the bitcast. +/// %t0 = call i32 @func() +/// %t1 = bitcast i32 %t0 to ... +/// ... +/// +/// A similar transformation is performed for invoke instructions. However, +/// since invokes are terminating, a new block is created for the bitcast. 
For
+/// example, if the invoke instruction below requires a bitcast after promotion:
+///
+/// orig_bb:
+/// %t0 = invoke i32 @func() to label %normal_dst unwind label %unwind_dst
+///
+/// The edge between the original block and the invoke's normal destination is
+/// split, and the bitcast is placed there:
+///
+/// orig_bb:
+/// %t0 = invoke i32 @func() to label %split_bb unwind label %unwind_dst
+///
+/// split_bb:
+/// ; Uses of the original return value are replaced by uses of the bitcast.
+/// %t1 = bitcast i32 %t0 to ...
+/// br label %normal_dst
+///
+static void createRetBitCast(CallSite CS, Type *RetTy, CastInst **RetBitCast) {
 // Save the users of the calling instruction. These uses will be changed to
 // use the bitcast after we create it.
@@ -130,19 +177,20 @@ Instruction *createRetBitCast(CallSite CS, Type *RetTy) {
 // value. The location depends on if we have a call or invoke instruction.
 Instruction *InsertBefore = nullptr;
 if (auto *Invoke = dyn_cast(CS.getInstruction()))
- InsertBefore = &*Invoke->getNormalDest()->getFirstInsertionPt();
+ InsertBefore =
+ &SplitEdge(Invoke->getParent(), Invoke->getNormalDest())->front();
 else
 InsertBefore = &*std::next(CS.getInstruction()->getIterator());
 // Bitcast the return value to the correct type.
 auto *Cast = CastInst::Create(Instruction::BitCast, CS.getInstruction(),
 RetTy, "", InsertBefore);
+ if (RetBitCast)
+ *RetBitCast = Cast;
 // Replace all the original uses of the calling instruction with the bitcast.
 for (User *U : UsersToUpdate)
 U->replaceUsesOfWith(CS.getInstruction(), Cast);
-
- return Cast;
 }
 /// Predicate and clone the given call site.
@@ -152,21 +200,78 @@ Instruction *createRetBitCast(CallSite CS, Type *RetTy) {
 /// callee. The original call site is moved into the "else" block, and a clone
 /// of the call site is placed in the "then" block. The cloned instruction is
 /// returned.
+///
+/// For example, the call instruction below:
+///
+/// orig_bb:
+/// %t0 = call i32 %ptr()
+/// ...
+///
+/// Is replaced by the following:
+///
+/// orig_bb:
+/// %cond = icmp eq i32 ()* %ptr, @func
+/// br i1 %cond, %then_bb, %else_bb
+///
+/// then_bb:
+/// ; The clone of the original call instruction is placed in the "then"
+/// ; block. It is not yet promoted.
+/// %t1 = call i32 %ptr()
+/// br merge_bb
+///
+/// else_bb:
+/// ; The original call instruction is moved to the "else" block.
+/// %t0 = call i32 %ptr()
+/// br merge_bb
+///
+/// merge_bb:
+/// ; Uses of the original call instruction are replaced by uses of the phi
+/// ; node.
+/// %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ]
+/// ...
+///
+/// A similar transformation is performed for invoke instructions. However,
+/// since invokes are terminating, more work is required. For example, the
+/// invoke instruction below:
+///
+/// orig_bb:
+/// %t0 = invoke %ptr() to label %normal_dst unwind label %unwind_dst
+///
+/// Is replaced by the following:
+///
+/// orig_bb:
+/// %cond = icmp eq i32 ()* %ptr, @func
+/// br i1 %cond, %then_bb, %else_bb
+///
+/// then_bb:
+/// ; The clone of the original invoke instruction is placed in the "then"
+/// ; block, and its normal destination is set to the "merge" block. It is
+/// ; not yet promoted.
+/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// else_bb:
+/// ; The original invoke instruction is moved into the "else" block, and
+/// ; its normal destination is set to the "merge" block.
+/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst +/// +/// merge_bb: +/// ; Uses of the original invoke instruction are replaced by uses of the +/// ; phi node, and the merge block branches to the normal destination. +/// %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ] +/// br %normal_dst +/// static Instruction *versionCallSite(CallSite CS, Value *Callee, - MDNode *BranchWeights, - BasicBlock *&ThenBlock, - BasicBlock *&ElseBlock, - BasicBlock *&MergeBlock) { + MDNode *BranchWeights) { IRBuilder<> Builder(CS.getInstruction()); Instruction *OrigInst = CS.getInstruction(); + BasicBlock *OrigBlock = OrigInst->getParent(); // Create the compare. The called value and callee must have the same type to // be compared. - auto *LHS = - Builder.CreateBitCast(CS.getCalledValue(), Builder.getInt8PtrTy()); - auto *RHS = Builder.CreateBitCast(Callee, Builder.getInt8PtrTy()); - auto *Cond = Builder.CreateICmpEQ(LHS, RHS); + if (CS.getCalledValue()->getType() != Callee->getType()) + Callee = Builder.CreateBitCast(Callee, CS.getCalledValue()->getType()); + auto *Cond = Builder.CreateICmpEQ(CS.getCalledValue(), Callee); // Create an if-then-else structure. The original instruction is moved into // the "else" block, and a clone of the original instruction is placed in the @@ -175,9 +280,9 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee, TerminatorInst *ElseTerm = nullptr; SplitBlockAndInsertIfThenElse(Cond, CS.getInstruction(), &ThenTerm, &ElseTerm, BranchWeights); - ThenBlock = ThenTerm->getParent(); - ElseBlock = ElseTerm->getParent(); - MergeBlock = OrigInst->getParent(); + BasicBlock *ThenBlock = ThenTerm->getParent(); + BasicBlock *ElseBlock = ElseTerm->getParent(); + BasicBlock *MergeBlock = OrigInst->getParent(); ThenBlock->setName("if.true.direct_targ"); ElseBlock->setName("if.false.orig_indirect"); @@ -188,7 +293,8 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee, NewInst->insertBefore(ThenTerm); // If the original call site is an invoke instruction, we have extra work to - // do since invoke instructions are terminating. + // do since invoke instructions are terminating. We have to fix-up phi nodes + // in the invoke's normal and unwind destinations. if (auto *OrigInvoke = dyn_cast(OrigInst)) { auto *NewInvoke = cast(NewInst); @@ -201,11 +307,19 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee, Builder.SetInsertPoint(MergeBlock); Builder.CreateBr(OrigInvoke->getNormalDest()); - // Now set the normal destination of new the invoke instruction to be the + // Fix-up phi nodes in the original invoke's normal and unwind destinations. + fixupPHINodeForNormalDest(OrigInvoke, OrigBlock, MergeBlock); + fixupPHINodeForUnwindDest(OrigInvoke, MergeBlock, ThenBlock, ElseBlock); + + // Now set the normal destinations of the invoke instructions to be the // "merge" block. + OrigInvoke->setNormalDest(MergeBlock); NewInvoke->setNormalDest(MergeBlock); } + // Create a phi node for the returned value of the call site. + createRetPHINode(OrigInst, NewInst, MergeBlock, Builder); + return NewInst; } @@ -253,7 +367,8 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee, return true; } -static void promoteCall(CallSite CS, Function *Callee, Instruction *&Cast) { +Instruction *llvm::promoteCall(CallSite CS, Function *Callee, + CastInst **RetBitCast) { assert(!CS.getCalledFunction() && "Only indirect call sites can be promoted"); // Set the called function of the call site to be the given callee. 
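With versionCallSite now responsible for the phi fix-ups and the return-value merge, the exported entry points compose simply. A usage sketch, assuming CS, Callee and BranchWeights are in scope and that isLegalToPromote's failure-reason out-parameter defaults to null:

// Speculative devirtualization with the reworked API: version the indirect
// call site, then promote the clone in the "then" block to a direct call.
if (isLegalToPromote(CS, Callee)) {
  Instruction *DirectCall =
      promoteCallWithIfThenElse(CS, Callee, BranchWeights);
  // DirectCall's uses are already routed through the merge-block phi.
}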
@@ -268,7 +383,7 @@ static void promoteCall(CallSite CS, Function *Callee, Instruction *&Cast) { // If the function type of the call site matches that of the callee, no // additional work is required. if (CS.getFunctionType() == Callee->getFunctionType()) - return; + return CS.getInstruction(); // Save the return types of the call site and callee. Type *CallSiteRetTy = CS.getInstruction()->getType(); @@ -294,7 +409,9 @@ static void promoteCall(CallSite CS, Function *Callee, Instruction *&Cast) { // If the return type of the call site doesn't match that of the callee, cast // the returned value to the appropriate type. if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) - Cast = createRetBitCast(CS, CallSiteRetTy); + createRetBitCast(CS, CallSiteRetTy, RetBitCast); + + return CS.getInstruction(); } Instruction *llvm::promoteCallWithIfThenElse(CallSite CS, Function *Callee, @@ -303,26 +420,10 @@ Instruction *llvm::promoteCallWithIfThenElse(CallSite CS, Function *Callee, // Version the indirect call site. If the called value is equal to the given // callee, 'NewInst' will be executed, otherwise the original call site will // be executed. - BasicBlock *ThenBlock, *ElseBlock, *MergeBlock; - Instruction *NewInst = versionCallSite(CS, Callee, BranchWeights, ThenBlock, - ElseBlock, MergeBlock); + Instruction *NewInst = versionCallSite(CS, Callee, BranchWeights); // Promote 'NewInst' so that it directly calls the desired function. - Instruction *Cast = NewInst; - promoteCall(CallSite(NewInst), Callee, Cast); - - // If the original call site is an invoke instruction, we have to fix-up phi - // nodes in the invoke's normal and unwind destinations. - if (auto *OrigInvoke = dyn_cast(CS.getInstruction())) { - fixupPHINodeForNormalDest(OrigInvoke, MergeBlock, ElseBlock, Cast); - fixupPHINodeForUnwindDest(OrigInvoke, MergeBlock, ThenBlock, ElseBlock); - } - - // Create a phi node for the returned value of the call site. - createRetPHINode(CS.getInstruction(), Cast ? Cast : NewInst); - - // Return the new direct call. - return NewInst; + return promoteCall(CallSite(NewInst), Callee); } #undef DEBUG_TYPE diff --git a/lib/Transforms/Utils/LoopUnrollPeel.cpp b/lib/Transforms/Utils/LoopUnrollPeel.cpp index 4273ce0b6200..c84ae7d693d7 100644 --- a/lib/Transforms/Utils/LoopUnrollPeel.cpp +++ b/lib/Transforms/Utils/LoopUnrollPeel.cpp @@ -203,7 +203,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, // hit the peeled section. // We only do this in the presence of profile information, since otherwise // our estimates of the trip count are not reliable enough. 
- if (UP.AllowPeeling && L->getHeader()->getParent()->getEntryCount()) { + if (UP.AllowPeeling && L->getHeader()->getParent()->hasProfileData()) { Optional PeelCount = getLoopEstimatedTripCount(L); if (!PeelCount) return; diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index f02f80cc1b78..b3c80424c8b9 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -127,6 +127,16 @@ static cl::opt MaxSpeculationDepth( cl::desc("Limit maximum recursion depth when calculating costs of " "speculatively executed instructions")); +static cl::opt DependenceChainLatency( + "dependence-chain-latency", cl::Hidden, cl::init(8), + cl::desc("Limit the maximum latency of dependence chain containing cmp " + "for if conversion")); + +static cl::opt SmallBBSize( + "small-bb-size", cl::Hidden, cl::init(40), + cl::desc("Check dependence chain latency only in basic block smaller than " + "this number")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); @@ -395,6 +405,166 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, return true; } +/// Estimate the code size of the specified BB. +static unsigned CountBBCodeSize(BasicBlock *BB, + const TargetTransformInfo &TTI) { + unsigned Size = 0; + for (auto II = BB->begin(); !isa(II); ++II) + Size += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_CodeSize); + return Size; +} + +/// Find out the latency of the longest dependence chain in the BB if +/// LongestChain is true, or the dependence chain containing the compare +/// instruction feeding the block's conditional branch. +static unsigned FindDependenceChainLatency(BasicBlock *BB, + DenseMap &Instructions, + const TargetTransformInfo &TTI, + bool LongestChain) { + unsigned MaxLatency = 0; + + BasicBlock::iterator II; + for (II = BB->begin(); !isa(II); ++II) { + unsigned Latency = 0; + for (unsigned O = 0, E = II->getNumOperands(); O != E; ++O) { + Instruction *Op = dyn_cast(II->getOperand(O)); + if (Op && Instructions.count(Op)) { + auto OpLatency = Instructions[Op]; + if (OpLatency > Latency) + Latency = OpLatency; + } + } + Latency += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_Latency); + Instructions[&(*II)] = Latency; + + if (Latency > MaxLatency) + MaxLatency = Latency; + } + + if (LongestChain) + return MaxLatency; + + // The length of the dependence chain containing the compare instruction is + // wanted, so the terminator must be a BranchInst. + assert(isa(II)); + BranchInst* Br = cast(II); + Instruction *Cmp = dyn_cast(Br->getCondition()); + if (Cmp && Instructions.count(Cmp)) + return Instructions[Cmp]; + else + return 0; +} + +/// Instructions in BB2 may depend on instructions in BB1, and instructions +/// in BB1 may have users in BB2. If the last (in terms of latency) such kind +/// of instruction in BB1 is I, then the instructions after I can be executed +/// in parallel with instructions in BB2. +/// This function returns the latency of I. +static unsigned LatencyAdjustment(BasicBlock *BB1, BasicBlock *BB2, + BasicBlock *IfBlock1, BasicBlock *IfBlock2, + DenseMap &BB1Instructions) { + unsigned LastLatency = 0; + SmallVector Worklist; + BasicBlock::iterator II; + for (II = BB2->begin(); !isa(II); ++II) { + if (PHINode *PN = dyn_cast(II)) { + // Look for users in BB2. 
+ bool InBBUser = false; + for (User *U : PN->users()) { + if (cast(U)->getParent() == BB2) { + InBBUser = true; + break; + } + } + // No such user, we don't care about this instruction and its operands. + if (!InBBUser) + break; + } + Worklist.push_back(&(*II)); + } + + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + for (unsigned O = 0, E = I->getNumOperands(); O != E; ++O) { + if (Instruction *Op = dyn_cast(I->getOperand(O))) { + if (Op->getParent() == IfBlock1 || Op->getParent() == IfBlock2) + Worklist.push_back(Op); + else if (Op->getParent() == BB1 && BB1Instructions.count(Op)) { + if (BB1Instructions[Op] > LastLatency) + LastLatency = BB1Instructions[Op]; + } + } + } + } + + return LastLatency; +} + +/// If after if conversion, most of the instructions in this new BB construct a +/// long and slow dependence chain, it may be slower than cmp/branch, even +/// if the branch has a high miss rate, because the control dependence is +/// transformed into data dependence, and control dependence can be speculated, +/// and thus, the second part can execute in parallel with the first part on +/// modern OOO processor. +/// +/// To check this condition, this function finds the length of the dependence +/// chain in BB1 (only the part that can be executed in parallel with code after +/// branch in BB2) containing cmp, and if the length is longer than a threshold, +/// don't perform if conversion. +/// +/// BB1, BB2, IfBlock1 and IfBlock2 are candidate BBs for if conversion. +/// SpeculationSize contains the code size of IfBlock1 and IfBlock2. +static bool FindLongDependenceChain(BasicBlock *BB1, BasicBlock *BB2, + BasicBlock *IfBlock1, BasicBlock *IfBlock2, + unsigned SpeculationSize, + const TargetTransformInfo &TTI) { + // Accumulated latency of each instruction in their BBs. + DenseMap BB1Instructions; + DenseMap BB2Instructions; + + if (!TTI.isOutOfOrder()) + return false; + + unsigned NewBBSize = CountBBCodeSize(BB1, TTI) + CountBBCodeSize(BB2, TTI) + + SpeculationSize; + + // We check small BB only since it is more difficult to find unrelated + // instructions to fill functional units in a small BB. + if (NewBBSize > SmallBBSize) + return false; + + auto BB1Chain = + FindDependenceChainLatency(BB1, BB1Instructions, TTI, false); + auto BB2Chain = + FindDependenceChainLatency(BB2, BB2Instructions, TTI, true); + + // If there are many unrelated instructions in the new BB, there will be + // other instructions for the processor to issue regardless of the length + // of this new dependence chain. + // Modern processors can issue 3 or more instructions in each cycle. But in + // real world applications, an IPC of 2 is already very good for non-loop + // code with small basic blocks. Higher IPC is usually found in programs with + // small kernel. So IPC of 2 is more reasonable for most applications. + if ((BB1Chain + BB2Chain) * 2 <= NewBBSize) + return false; + + // We only care about part of the dependence chain in BB1 that can be + // executed in parallel with BB2, so adjust the latency. + BB1Chain -= + LatencyAdjustment(BB1, BB2, IfBlock1, IfBlock2, BB1Instructions); + + // Correctly predicted branch instruction can skip the dependence chain in + // BB1, but misprediction has a penalty, so only when the dependence chain is + // longer than DependenceChainLatency, then branch is better than select. + // Besides misprediction penalty, the threshold value DependenceChainLatency + // also depends on branch misprediction rate, taken branch latency and cmov + // latency. 
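Concretely, with the default settings of the new cl::opts (DependenceChainLatency = 8, SmallBBSize = 40): a candidate merged block of NewBBSize = 36 instructions with BB1Chain = 12 and BB2Chain = 5 passes the IPC filter, since (12 + 5) * 2 = 34 <= 36 means there is enough unrelated work to hide the chain, and if-conversion proceeds. Shrink the block to NewBBSize = 30 and the filter no longer saves it (34 > 30); if the adjusted BB1Chain still reaches the latency threshold (say 10 >= 8), the check that follows returns true and the branch is kept.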
+ if (BB1Chain >= DependenceChainLatency) + return true; + + return false; +} + /// Extract ConstantInt from value, looking through IntToPtr /// and PointerNullValue. Return NULL if value is not a constant int. static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) { @@ -1654,14 +1824,11 @@ namespace { } // end anonymous namespace -/// Given an unconditional branch that goes to BBEnd, -/// check whether BBEnd has only two predecessors and the other predecessor -/// ends with an unconditional branch. If it is true, sink any common code -/// in the two predecessors to BBEnd. -static bool SinkThenElseCodeToEnd(BranchInst *BI1) { - assert(BI1->isUnconditional()); - BasicBlock *BBEnd = BI1->getSuccessor(0); - +/// Check whether BB's predecessors end with unconditional branches. If it is +/// true, sink any common code from the predecessors to BB. +/// We also allow one predecessor to end with conditional branch (but no more +/// than one). +static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) { // We support two situations: // (1) all incoming arcs are unconditional // (2) one incoming arc is conditional @@ -1705,7 +1872,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { // SmallVector UnconditionalPreds; Instruction *Cond = nullptr; - for (auto *B : predecessors(BBEnd)) { + for (auto *B : predecessors(BB)) { auto *T = B->getTerminator(); if (isa(T) && cast(T)->isUnconditional()) UnconditionalPreds.push_back(B); @@ -1773,8 +1940,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { DEBUG(dbgs() << "SINK: Splitting edge\n"); // We have a conditional edge and we're going to sink some instructions. // Insert a new block postdominating all blocks we're going to sink from. - if (!SplitBlockPredecessors(BI1->getSuccessor(0), UnconditionalPreds, - ".sink.split")) + if (!SplitBlockPredecessors(BB, UnconditionalPreds, ".sink.split")) // Edges couldn't be split. return false; Changed = true; @@ -2048,6 +2214,11 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, if (!HaveRewritablePHIs && !(HoistCondStores && SpeculatedStoreValue)) return false; + // Don't do if conversion for long dependence chain. + if (FindLongDependenceChain(BB, EndBB, ThenBB, nullptr, + CountBBCodeSize(ThenBB, TTI), TTI)) + return false; + // If we get here, we can hoist the instruction and if-convert. DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";); @@ -2355,6 +2526,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, } } + if (FindLongDependenceChain(DomBlock, BB, IfBlock1, IfBlock2, + AggressiveInsts.size(), TTI)) + return false; + DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond << " T: " << IfTrue->getName() << " F: " << IfFalse->getName() << "\n"); @@ -5728,9 +5903,6 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, BasicBlock *BB = BI->getParent(); BasicBlock *Succ = BI->getSuccessor(0); - if (SinkCommon && Options.SinkCommonInsts && SinkThenElseCodeToEnd(BI)) - return true; - // If the Terminator is the only non-phi instruction, simplify the block. // If LoopHeader is provided, check if the block or its successor is a loop // header. 
@@ -6008,6 +6180,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
   if (MergeBlockIntoPredecessor(BB))
     return true;
 
+  if (SinkCommon && Options.SinkCommonInsts)
+    Changed |= SinkCommonCodeFromPredecessors(BB);
+
   IRBuilder<> Builder(BB);
 
   // If there is a trivial two-entry PHI node in this basic block, and we can
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index fbcdc0df0f1c..52f32cda2609 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5049,13 +5049,13 @@ bool LoopVectorizationLegality::canVectorize() {
   bool Result = true;
 
   bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
-  if (DoExtraAnalysis)
   // We must have a loop in canonical form. Loops with indirectbr in them cannot
   // be canonicalized.
   if (!TheLoop->getLoopPreheader()) {
+    DEBUG(dbgs() << "LV: Loop doesn't have a legal pre-header.\n");
     ORE->emit(createMissedAnalysis("CFGNotUnderstood")
               << "loop control flow is not understood by vectorizer");
-    if (DoExtraAnalysis)
+    if (DoExtraAnalysis)
       Result = false;
     else
       return false;
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 76ba62f5d596..a7ccd3faec44 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -646,23 +646,17 @@ class BoUpSLP {
   int getEntryCost(TreeEntry *E);
 
   /// This is the recursive part of buildTree.
-  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int UserIndx = -1,
-                     int OpdNum = 0);
+  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
 
   /// \returns True if the ExtractElement/ExtractValue instructions in VL can
   /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
   bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
 
-  /// Vectorize a single entry in the tree. \p OpdNum indicate the ordinality of
-  /// operand corrsponding to this tree entry \p E for the user tree entry
-  /// indicated by \p UserIndx.
-  // In other words, "E == TreeEntry[UserIndx].getOperand(OpdNum)".
-  Value *vectorizeTree(TreeEntry *E, int OpdNum = 0, int UserIndx = -1);
+  /// Vectorize a single entry in the tree.
+  Value *vectorizeTree(TreeEntry *E);
 
-  /// Vectorize a single entry in the tree, starting in \p VL. \p OpdNum indicate
-  /// the ordinality of operand corrsponding to the \p VL of scalar values for the
-  /// user indicated by \p UserIndx this \p VL feeds into.
-  Value *vectorizeTree(ArrayRef<Value *> VL, int OpdNum = 0, int UserIndx = -1);
+  /// Vectorize a single entry in the tree, starting in \p VL.
+  Value *vectorizeTree(ArrayRef<Value *> VL);
 
   /// \returns the pointer to the vectorized value if \p VL is already
   /// vectorized, or NULL. They may happen in cycles.
@@ -708,16 +702,6 @@ class BoUpSLP {
       return std::equal(VL.begin(), VL.end(), Scalars.begin());
     }
 
-    /// \returns true if the scalars in VL are found in this tree entry.
-    bool isFoundJumbled(ArrayRef<Value *> VL, const DataLayout &DL,
-                        ScalarEvolution &SE) const {
-      assert(VL.size() == Scalars.size() && "Invalid size");
-      SmallVector<Value *, 4> List;
-      if (!sortLoadAccesses(VL, DL, SE, List))
-        return false;
-      return std::equal(List.begin(), List.end(), Scalars.begin());
-    }
-
     /// A vector of scalars.
     ValueList Scalars;
 
@@ -727,14 +711,6 @@ class BoUpSLP {
     /// Do we need to gather this sequence ?
     bool NeedToGather = false;
 
-    /// Records optional shuffle mask for the uses of jumbled memory accesses.
- /// For example, a non-empty ShuffleMask[1] represents the permutation of - /// lanes that operand #1 of this vectorized instruction should undergo - /// before feeding this vectorized instruction, whereas an empty - /// ShuffleMask[0] indicates that the lanes of operand #0 of this vectorized - /// instruction need not be permuted at all. - SmallVector, 2> ShuffleMask; - /// Points back to the VectorizableTree. /// /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has @@ -750,31 +726,12 @@ class BoUpSLP { /// Create a new VectorizableTree entry. TreeEntry *newTreeEntry(ArrayRef VL, bool Vectorized, - int &UserTreeIdx, const InstructionsState &S, - ArrayRef ShuffleMask = None, - int OpdNum = 0) { - assert((!Vectorized || S.Opcode != 0) && - "Vectorized TreeEntry without opcode"); + int &UserTreeIdx) { VectorizableTree.emplace_back(VectorizableTree); - int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; - - TreeEntry *UserTreeEntry = nullptr; - if (UserTreeIdx != -1) - UserTreeEntry = &VectorizableTree[UserTreeIdx]; - - if (UserTreeEntry && !ShuffleMask.empty()) { - if ((unsigned)OpdNum >= UserTreeEntry->ShuffleMask.size()) - UserTreeEntry->ShuffleMask.resize(OpdNum + 1); - assert(UserTreeEntry->ShuffleMask[OpdNum].empty() && - "Mask already present"); - using mask = SmallVector; - mask tempMask(ShuffleMask.begin(), ShuffleMask.end()); - UserTreeEntry->ShuffleMask[OpdNum] = tempMask; - } if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); @@ -1427,34 +1384,34 @@ void BoUpSLP::buildTree(ArrayRef Roots, } void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, - int UserTreeIdx, int OpdNum) { + int UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. 
\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } @@ -1466,7 +1423,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (EphValues.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1477,7 +1434,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); if (E->Scalars[i] != VL[i]) { DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1496,7 +1453,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (getTreeEntry(I)) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1506,7 +1463,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1520,7 +1477,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } @@ -1529,7 +1486,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (unsigned j = i + 1; j < e; ++j) if (VL[i] == VL[j]) { DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } @@ -1544,7 +1501,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1563,12 +1520,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1578,7 +1535,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Operands.push_back(cast(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1590,7 +1547,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } else { BS.cancelScheduling(VL, VL0); } - newTreeEntry(VL, Reuse, UserTreeIdx, S); + newTreeEntry(VL, Reuse, UserTreeIdx); return; } case Instruction::Load: { @@ -1605,7 +1562,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, 
UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1616,13 +1573,15 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, LoadInst *L = cast(VL[i]); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } } // Check if the loads are consecutive, reversed, or neither. + // TODO: What we really want is to sort the loads, but for now, check + // the two likely directions. bool Consecutive = true; bool ReverseConsecutive = true; for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { @@ -1636,7 +1595,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Consecutive) { ++NumLoadsWantToKeepOrder; - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; } @@ -1650,41 +1609,15 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, break; } - if (ReverseConsecutive) { - DEBUG(dbgs() << "SLP: Gathering reversed loads.\n"); - ++NumLoadsWantToChangeOrder; - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); - return; - } - - if (VL.size() > 2) { - bool ShuffledLoads = true; - SmallVector Sorted; - SmallVector Mask; - if (sortLoadAccesses(VL, *DL, *SE, Sorted, &Mask)) { - auto NewVL = makeArrayRef(Sorted.begin(), Sorted.end()); - for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) { - if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) { - ShuffledLoads = false; - break; - } - } - // TODO: Tracking how many load wants to have arbitrary shuffled order - // would be usefull. - if (ShuffledLoads) { - DEBUG(dbgs() << "SLP: added a vector of loads which needs " - "permutation of loaded lanes.\n"); - newTreeEntry(NewVL, true, UserTreeIdx, S, - makeArrayRef(Mask.begin(), Mask.end()), OpdNum); - return; - } - } - } - - DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); + + if (ReverseConsecutive) { + ++NumLoadsWantToChangeOrder; + DEBUG(dbgs() << "SLP: Gathering reversed loads.\n"); + } else { + DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); + } return; } case Instruction::ZExt: @@ -1704,12 +1637,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1718,7 +1651,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1732,13 +1665,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering cmp with different 
predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1747,7 +1680,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1770,7 +1703,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1779,7 +1712,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, ValueList Left, Right; reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right); buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx, 1); + buildTree_rec(Right, Depth + 1, UserTreeIdx); return; } @@ -1789,7 +1722,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; @@ -1799,7 +1732,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (cast(VL[j])->getNumOperands() != 2) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1812,7 +1745,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, if (Ty0 != CurTy) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } @@ -1824,12 +1757,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, DEBUG( dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); return; } } - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1837,7 +1770,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1846,12 +1779,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -1869,7 +1802,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); 
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1883,7 +1816,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1894,7 +1827,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument "<< A1I<<"!=" << A1J << "\n"); @@ -1907,14 +1840,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1922,7 +1855,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, CallInst *CI2 = dyn_cast(j); Operands.push_back(CI2->getArgOperand(i)); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1931,11 +1864,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // then do not vectorize this instruction. if (!S.IsAltShuffle) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, S); + newTreeEntry(VL, true, UserTreeIdx); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. 
@@ -1943,7 +1876,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, ValueList Left, Right; reorderAltShuffleOperands(S.Opcode, VL, Left, Right); buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx, 1); + buildTree_rec(Right, Depth + 1, UserTreeIdx); return; } @@ -1953,13 +1886,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx, i); + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, S); + newTreeEntry(VL, false, UserTreeIdx); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -2797,20 +2730,12 @@ Value *BoUpSLP::alreadyVectorized(ArrayRef VL, Value *OpValue) const { return nullptr; } -Value *BoUpSLP::vectorizeTree(ArrayRef VL, int OpdNum, int UserIndx) { +Value *BoUpSLP::vectorizeTree(ArrayRef VL) { InstructionsState S = getSameOpcode(VL); if (S.Opcode) { if (TreeEntry *E = getTreeEntry(S.OpValue)) { - TreeEntry *UserTreeEntry = nullptr; - if (UserIndx != -1) - UserTreeEntry = &VectorizableTree[UserIndx]; - - if (E->isSame(VL) || - (UserTreeEntry && - (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() && - !UserTreeEntry->ShuffleMask[OpdNum].empty() && - E->isFoundJumbled(VL, *DL, *SE))) - return vectorizeTree(E, OpdNum, UserIndx); + if (E->isSame(VL)) + return vectorizeTree(E); } } @@ -2822,10 +2747,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL, int OpdNum, int UserIndx) { return Gather(VL, VecTy); } -Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { +Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IRBuilder<>::InsertPointGuard Guard(Builder); - TreeEntry *UserTreeEntry = nullptr; if (E->VectorizedValue) { DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); return E->VectorizedValue; @@ -2845,10 +2769,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { return V; } - assert(ScalarToTreeEntry.count(E->Scalars[0]) && - "Expected user tree entry, missing!"); - int CurrIndx = ScalarToTreeEntry[E->Scalars[0]]; - unsigned ShuffleOrOp = S.IsAltShuffle ? 
(unsigned) Instruction::ShuffleVector : S.Opcode; switch (ShuffleOrOp) { @@ -2878,7 +2798,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeTree(Operands, i, CurrIndx); + Value *Vec = vectorizeTree(Operands); NewPhi->addIncoming(Vec, IBB); } @@ -2931,7 +2851,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { setInsertPointAfterBundle(E->Scalars, VL0); - Value *InVec = vectorizeTree(INVL, 0, CurrIndx); + Value *InVec = vectorizeTree(INVL); if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; @@ -2952,8 +2872,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { setInsertPointAfterBundle(E->Scalars, VL0); - Value *L = vectorizeTree(LHSV, 0, CurrIndx); - Value *R = vectorizeTree(RHSV, 1, CurrIndx); + Value *L = vectorizeTree(LHSV); + Value *R = vectorizeTree(RHSV); if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; @@ -2980,9 +2900,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { setInsertPointAfterBundle(E->Scalars, VL0); - Value *Cond = vectorizeTree(CondVec, 0, CurrIndx); - Value *True = vectorizeTree(TrueVec, 1, CurrIndx); - Value *False = vectorizeTree(FalseVec, 2, CurrIndx); + Value *Cond = vectorizeTree(CondVec); + Value *True = vectorizeTree(TrueVec); + Value *False = vectorizeTree(FalseVec); if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; @@ -3023,8 +2943,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { setInsertPointAfterBundle(E->Scalars, VL0); - Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx); - Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx); + Value *LHS = vectorizeTree(LHSVL); + Value *RHS = vectorizeTree(RHSVL); if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; @@ -3045,20 +2965,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { // sink them all the way down past store instructions. 
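// [Editorial aside, not part of the patch] With the jumbled-load handling
// reverted, the Load case below is the simple one again: take the first
// scalar load (LI = cast<LoadInst>(VL0)), bitcast its pointer operand to a
// vector pointer, emit a single vector load, and finish with
// return propagateMetadata(LI, E->Scalars); the shuffle that used to permute
// jumbled lanes, and the UserTreeEntry/ShuffleMask bookkeeping feeding it,
// is deleted in the hunks that follow.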
setInsertPointAfterBundle(E->Scalars, VL0); - if (UserIndx != -1) - UserTreeEntry = &VectorizableTree[UserIndx]; - - bool isJumbled = false; - LoadInst *LI = NULL; - if (UserTreeEntry && - (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() && - !UserTreeEntry->ShuffleMask[OpdNum].empty()) { - isJumbled = true; - LI = cast(E->Scalars[0]); - } else { - LI = cast(VL0); - } - + LoadInst *LI = cast(VL0); Type *ScalarLoadTy = LI->getType(); unsigned AS = LI->getPointerAddressSpace(); @@ -3080,21 +2987,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { LI->setAlignment(Alignment); E->VectorizedValue = LI; ++NumVectorInstructions; - propagateMetadata(LI, E->Scalars); - - if (isJumbled) { - SmallVector Mask; - for (unsigned LaneEntry : UserTreeEntry->ShuffleMask[OpdNum]) - Mask.push_back(Builder.getInt32(LaneEntry)); - // Generate shuffle for jumbled memory access - Value *Undef = UndefValue::get(VecTy); - Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef, - ConstantVector::get(Mask)); - E->VectorizedValue = Shuf; - ++NumVectorInstructions; - return Shuf; - } - return LI; + return propagateMetadata(LI, E->Scalars); } case Instruction::Store: { StoreInst *SI = cast(VL0); @@ -3107,7 +3000,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { setInsertPointAfterBundle(E->Scalars, VL0); - Value *VecValue = vectorizeTree(ScalarStoreValues, 0, CurrIndx); + Value *VecValue = vectorizeTree(ScalarStoreValues); Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); StoreInst *S = Builder.CreateStore(VecValue, VecPtr); @@ -3133,7 +3026,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { for (Value *V : E->Scalars) Op0VL.push_back(cast(V)->getOperand(0)); - Value *Op0 = vectorizeTree(Op0VL, 0, CurrIndx); + Value *Op0 = vectorizeTree(Op0VL); std::vector OpVecs; for (int j = 1, e = cast(VL0)->getNumOperands(); j < e; @@ -3142,7 +3035,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { for (Value *V : E->Scalars) OpVL.push_back(cast(V)->getOperand(j)); - Value *OpVec = vectorizeTree(OpVL, j, CurrIndx); + Value *OpVec = vectorizeTree(OpVL); OpVecs.push_back(OpVec); } @@ -3181,7 +3074,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { OpVL.push_back(CEI->getArgOperand(j)); } - Value *OpVec = vectorizeTree(OpVL, j, CurrIndx); + Value *OpVec = vectorizeTree(OpVL); DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } @@ -3212,8 +3105,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) { reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL); setInsertPointAfterBundle(E->Scalars, VL0); - Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx); - Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx); + Value *LHS = vectorizeTree(LHSVL); + Value *RHS = vectorizeTree(RHSVL); if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; @@ -3313,14 +3206,9 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); - assert((!E->NeedToGather) && "Extracting from a gather list"); + assert(!E->NeedToGather && "Extracting from a gather list"); - Value *Vec = dyn_cast(E->VectorizedValue); - if (Vec && dyn_cast(cast(Vec)->getOperand(0))) { - Vec = cast(E->VectorizedValue)->getOperand(0); - } else { - Vec = E->VectorizedValue; - } + Value *Vec = E->VectorizedValue; assert(Vec && "Can't 
find vectorizable value"); Value *Lane = Builder.getInt32(ExternalUse.Lane); @@ -4017,6 +3905,7 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl &Expr, // seed additional demotion, we save the truncated value. case Instruction::Trunc: Roots.push_back(I->getOperand(0)); + break; case Instruction::ZExt: case Instruction::SExt: break; diff --git a/test/Analysis/BasicAA/args-rets-allocas-loads.ll b/test/Analysis/BasicAA/args-rets-allocas-loads.ll index 05b56a07e44b..b31fb26f1c9b 100644 --- a/test/Analysis/BasicAA/args-rets-allocas-loads.ll +++ b/test/Analysis/BasicAA/args-rets-allocas-loads.ll @@ -308,4 +308,9 @@ define void @caller_a(double* %arg_a0, ; CHECK-NEXT: 0 mod responses (0.0%) ; CHECK-NEXT: 0 ref responses (0.0%) ; CHECK-NEXT: 140 mod & ref responses (76.0%) -; CHECK-NEXT: Alias Analysis Evaluator Mod/Ref Summary: 23%/0%/0%/76% +; CHECK-NEXT: 0 must responses (0.0%) +; CHECK-NEXT: 0 must mod responses (0.0%) +; CHECK-NEXT: 0 must ref responses (0.0%) +; CHECK-NEXT: 0 must mod & ref responses (0.0%) +; CHECK-NEXT: Alias Analysis Evaluator Mod/Ref Summary: 23%/0%/0%/76%/0%/0%/0%/0% + diff --git a/test/Analysis/BasicAA/call-attrs.ll b/test/Analysis/BasicAA/call-attrs.ll index 9cd17e486799..8538e8b4771d 100644 --- a/test/Analysis/BasicAA/call-attrs.ll +++ b/test/Analysis/BasicAA/call-attrs.ll @@ -31,12 +31,12 @@ entry: ret void } -; CHECK: Just Ref: Ptr: i8* %p <-> call void @readonly_attr(i8* %p) +; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> call void @readonly_attr(i8* %p) ; CHECK: Just Ref: Ptr: i8* %p <-> call void @readonly_func(i8* %p) -; CHECK: Just Mod: Ptr: i8* %p <-> call void @writeonly_attr(i8* %p) +; CHECK: Just Mod (MustAlias): Ptr: i8* %p <-> call void @writeonly_attr(i8* %p) ; CHECK: Just Mod: Ptr: i8* %p <-> call void @writeonly_func(i8* %p) ; CHECK: NoModRef: Ptr: i8* %p <-> call void @readnone_attr(i8* %p) ; CHECK: NoModRef: Ptr: i8* %p <-> call void @readnone_func(i8* %p) ; CHECK: Both ModRef: Ptr: i8* %p <-> call void @read_write(i8* %p, i8* %p, i8* %p) -; CHECK: Just Ref: Ptr: i8* %p <-> call void @func() [ "deopt"(i8* %p) ] +; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> call void @func() [ "deopt"(i8* %p) ] ; CHECK: Both ModRef: Ptr: i8* %p <-> call void @writeonly_attr(i8* %p) [ "deopt"(i8* %p) ] diff --git a/test/Analysis/BasicAA/cs-cs-arm.ll b/test/Analysis/BasicAA/cs-cs-arm.ll index 1580af9ea826..e4367bb6d61b 100644 --- a/test/Analysis/BasicAA/cs-cs-arm.ll +++ b/test/Analysis/BasicAA/cs-cs-arm.ll @@ -19,11 +19,11 @@ entry: ; CHECK-LABEL: Function: test1: ; CHECK: NoAlias: i8* %p, i8* %q -; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) ; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) ; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) -; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) -; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) ; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 
16) ; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) ; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) diff --git a/test/Analysis/BasicAA/cs-cs.ll b/test/Analysis/BasicAA/cs-cs.ll index 3695275649b2..30b7e0720f0b 100644 --- a/test/Analysis/BasicAA/cs-cs.ll +++ b/test/Analysis/BasicAA/cs-cs.ll @@ -164,7 +164,7 @@ define void @test4(i8* %P, i8* noalias %Q) #3 { ; CHECK-LABEL: Function: test4: ; CHECK: NoAlias: i8* %P, i8* %Q -; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false) +; CHECK: Just Mod (MustAlias): Ptr: i8* %P <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false) ; CHECK: NoModRef: Ptr: i8* %Q <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false) ; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) ; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) @@ -192,6 +192,26 @@ define void @test5(i8* %P, i8* %Q, i8* %R) #3 { ; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) } +define void @test5a(i8* noalias %P, i8* noalias %Q, i8* noalias %R) nounwind ssp { + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) + ret void + +; CHECK-LABEL: Function: test5a: + +; CHECK: NoAlias: i8* %P, i8* %Q +; CHECK: NoAlias: i8* %P, i8* %R +; CHECK: NoAlias: i8* %Q, i8* %R +; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: NoModRef: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) +; CHECK: NoModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) +; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) +; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +} + define void @test6(i8* %P) #3 { call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) call void @a_readonly_func(i8* %P) @@ -199,7 +219,7 @@ define void @test6(i8* %P) #3 { ; CHECK-LABEL: Function: test6: -; CHECK: Just Mod: Ptr: i8* %P <-> call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) +; CHECK: Just Mod (MustAlias): Ptr: i8* %P <-> call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) ; CHECK: Just Ref: Ptr: i8* %P <-> call void @a_readonly_func(i8* %P) ; CHECK: Just 
Mod: call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) <-> call void @a_readonly_func(i8* %P) ; CHECK: Just Ref: call void @a_readonly_func(i8* %P) <-> call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) @@ -237,9 +257,9 @@ entry: ; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessiblememonly_func() ; CHECK: NoModRef: Ptr: i8* %q <-> call void @an_inaccessiblememonly_func() ; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessibleorargmemonly_func(i8* %q) -; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q) +; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q) ; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_argmemonly_func(i8* %q) -; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q) +; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q) ; CHECK: Just Ref: call void @a_readonly_func(i8* %p) <-> call void @an_inaccessiblememonly_func() ; CHECK: Just Ref: call void @a_readonly_func(i8* %p) <-> call void @an_inaccessibleorargmemonly_func(i8* %q) ; CHECK: Just Ref: call void @a_readonly_func(i8* %p) <-> call void @an_argmemonly_func(i8* %q) @@ -254,13 +274,35 @@ entry: ; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @a_readonly_func(i8* %p) ; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @a_writeonly_func(i8* %q) ; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @an_inaccessiblememonly_func() -; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @an_argmemonly_func(i8* %q) +; CHECK: Both ModRef (MustAlias): call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @an_argmemonly_func(i8* %q) ; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) <-> call void @a_readonly_func(i8* %p) ; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) <-> call void @a_writeonly_func(i8* %q) ; CHECK: NoModRef: call void @an_argmemonly_func(i8* %q) <-> call void @an_inaccessiblememonly_func() -; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) <-> call void @an_inaccessibleorargmemonly_func(i8* %q) +; CHECK: Both ModRef (MustAlias): call void @an_argmemonly_func(i8* %q) <-> call void @an_inaccessibleorargmemonly_func(i8* %q) } +;; test that MustAlias is set for calls when no MayAlias is found. 
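Editorial aside before the test8a/test8b additions that the comment above introduces: the new "(MustAlias)" suffix in this file's CHECK lines is emitted when the evaluator proves a must-alias relation between the queried pointer and the call's argument memory; with two distinct pointer arguments, as in the tests below, no such proof is possible and the tag stays off. A client-side sketch of the underlying query, using the stable AAResults API (the 8-byte location sizes are placeholders):

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
using namespace llvm;

// True when AA proves P and Q are exactly the same location, the relation
// the "(MustAlias)" annotations mark on mod/ref results.
static bool isProvablySameLocation(AAResults &AA, const Value *P,
                                   const Value *Q) {
  return AA.alias(MemoryLocation(P, 8), MemoryLocation(Q, 8)) == MustAlias;
}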
+declare void @another_argmemonly_func(i8*, i8*) #0 +define void @test8a(i8* noalias %p, i8* noalias %q) { +entry: + call void @another_argmemonly_func(i8* %p, i8* %q) + ret void + +; CHECK-LABEL: Function: test8a +; CHECK: Both ModRef: Ptr: i8* %p <-> call void @another_argmemonly_func(i8* %p, i8* %q) +; CHECK: Both ModRef: Ptr: i8* %q <-> call void @another_argmemonly_func(i8* %p, i8* %q) +} +define void @test8b(i8* %p, i8* %q) { +entry: + call void @another_argmemonly_func(i8* %p, i8* %q) + ret void + +; CHECK-LABEL: Function: test8b +; CHECK: Both ModRef: Ptr: i8* %p <-> call void @another_argmemonly_func(i8* %p, i8* %q) +; CHECK: Both ModRef: Ptr: i8* %q <-> call void @another_argmemonly_func(i8* %p, i8* %q) +} + + ;; test that unknown operand bundle has unknown effect to the heap define void @test9(i8* %p) { ; CHECK-LABEL: Function: test9 @@ -310,9 +352,9 @@ entry: ; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ] ; CHECK: NoModRef: Ptr: i8* %q <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ] ; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] -; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] +; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] ; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] -; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] +; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] ; CHECK: Just Ref: call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ] ; CHECK: Just Ref: call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] ; CHECK: Just Ref: call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] @@ -321,10 +363,10 @@ entry: ; CHECK: NoModRef: call void @an_inaccessiblememonly_func() #7 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] ; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] ; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ] -; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] +; CHECK: Both ModRef (MustAlias): call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] ; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] ; CHECK: NoModRef: call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ] -; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] +; CHECK: Both ModRef (MustAlias): call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] } attributes #0 = { 
argmemonly nounwind } diff --git a/test/Analysis/MemorySSA/volatile-clobber.ll b/test/Analysis/MemorySSA/volatile-clobber.ll index d6f960f3e382..53df7de499bd 100644 --- a/test/Analysis/MemorySSA/volatile-clobber.ll +++ b/test/Analysis/MemorySSA/volatile-clobber.ll @@ -22,8 +22,7 @@ define i32 @foo() { ret i32 %4 } -; Ensuring that we don't automatically hoist nonvolatile loads around volatile -; loads +; Ensuring we allow hoisting nonvolatile loads around volatile loads. ; CHECK-LABEL define void @volatile_only define void @volatile_only(i32* %arg1, i32* %arg2) { ; Trivially NoAlias/MustAlias @@ -36,7 +35,7 @@ define void @volatile_only(i32* %arg1, i32* %arg2) { ; CHECK: MemoryUse(liveOnEntry) ; CHECK-NEXT: load i32, i32* %b load i32, i32* %b -; CHECK: MemoryUse(1) +; CHECK: MemoryUse(liveOnEntry) ; CHECK-NEXT: load i32, i32* %a load i32, i32* %a @@ -44,7 +43,7 @@ define void @volatile_only(i32* %arg1, i32* %arg2) { ; CHECK: 2 = MemoryDef(1) ; CHECK-NEXT: load volatile i32, i32* %arg1 load volatile i32, i32* %arg1 -; CHECK: MemoryUse(2) +; CHECK: MemoryUse(liveOnEntry) ; CHECK-NEXT: load i32, i32* %arg2 load i32, i32* %arg2 @@ -75,10 +74,10 @@ define void @volatile_atomics(i32* %arg1, i32* %arg2) { ; CHECK: MemoryUse(1) ; CHECK-NEXT: load atomic i32, i32* %b unordered, align 4 load atomic i32, i32* %b unordered, align 4 -; CHECK: MemoryUse(2) +; CHECK: MemoryUse(1) ; CHECK-NEXT: load atomic i32, i32* %a unordered, align 4 load atomic i32, i32* %a unordered, align 4 -; CHECK: MemoryUse(2) +; CHECK: MemoryUse(1) ; CHECK-NEXT: load i32, i32* %a load i32, i32* %a @@ -86,7 +85,7 @@ define void @volatile_atomics(i32* %arg1, i32* %arg2) { ; CHECK: 3 = MemoryDef(2) ; CHECK-NEXT: load atomic volatile i32, i32* %arg1 monotonic, align 4 load atomic volatile i32, i32* %arg1 monotonic, align 4 -; CHECK: MemoryUse(3) +; CHECK: MemoryUse(1) ; CHECK-NEXT: load i32, i32* %arg2 load i32, i32* %arg2 diff --git a/test/Analysis/ValueTracking/memory-dereferenceable.ll b/test/Analysis/ValueTracking/memory-dereferenceable.ll index ca16a266123c..2e9453f670ce 100644 --- a/test/Analysis/ValueTracking/memory-dereferenceable.ll +++ b/test/Analysis/ValueTracking/memory-dereferenceable.ll @@ -20,7 +20,8 @@ declare i32* @foo() @globalptr.align16 = external global i8, align 16 ; CHECK-LABEL: 'test' -define void @test(i32 addrspace(1)* dereferenceable(8) %dparam, +define void @test(%struct.A* sret %result, + i32 addrspace(1)* dereferenceable(8) %dparam, i8 addrspace(1)* dereferenceable(32) align 1 %dparam.align1, i8 addrspace(1)* dereferenceable(32) align 16 %dparam.align16, i8* byval %i8_byval, @@ -41,17 +42,14 @@ entry: %empty_alloca = alloca i8, i64 0 %empty_load = load i8, i8* %empty_alloca - ; Load from too small array alloca -; CHECK-NOT: %small_array_alloca - %small_array_alloca = alloca i8, i64 2 - %saa_cast = bitcast i8* %small_array_alloca to i32* - %saa_load = load i32, i32* %saa_cast + ; Loads from sret arguments +; CHECK: %sret_gep{{.*}}(aligned) + %sret_gep = getelementptr inbounds %struct.A, %struct.A* %result, i64 0, i32 1, i64 2 + load i8, i8* %sret_gep - ; Load from array alloca -; CHECK: %big_array_alloca{{.*}}(unaligned) - %big_array_alloca = alloca i8, i64 4 - %baa_cast = bitcast i8* %big_array_alloca to i32* - %baa_load = load i32, i32* %baa_cast +; CHECK-NOT: %sret_gep_outside + %sret_gep_outside = getelementptr %struct.A, %struct.A* %result, i64 0, i32 1, i64 7 + load i8, i8* %sret_gep_outside ; CHECK: %dparam{{.*}}(aligned) %load3 = load i32, i32 addrspace(1)* %dparam diff --git 
a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll index 86ac5507a407..111aaf88b160 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll @@ -135,7 +135,7 @@ continue: } ; Check that we fallback on invoke translation failures. -; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: %0:_(s128) = G_FCONSTANT quad 2 +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: %0:_(s128) = G_FCONSTANT fp128 0xL00000000000000004000000000000000 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for test_quad_dump ; FALLBACK-WITH-REPORT-OUT-LABEL: test_quad_dump: define fp128 @test_quad_dump() { diff --git a/test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir b/test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir new file mode 100644 index 000000000000..47fda8f998d7 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir @@ -0,0 +1,44 @@ +# RUN: llc -O0 -run-pass=legalizer -global-isel -global-isel-abort=0 %s -o - | FileCheck %s +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64" + + define fp128 @x(fp128 %a) { + entry: + %a.addr = alloca fp128, align 16 + store fp128 %a, fp128* %a.addr, align 16 + %0 = load fp128, fp128* %a.addr, align 16 + %sub = fsub fp128 0xL00000000000000008000000000000000, %0 + ret fp128 %sub + } + +... +--- +name: x +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +fixedStack: +stack: + - { id: 0, name: a.addr, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } +body: | + bb.1.entry: + liveins: %q0 + + ; This test just checks we don't crash on G_FNEG of FP128 types. Expect to fall + ; back until support is added for fp128. + ; CHECK: ret + %0:_(s128) = COPY %q0 + %1:_(p0) = G_FRAME_INDEX %stack.0.a.addr + G_STORE %0(s128), %1(p0) :: (store 16 into %ir.a.addr) + %2:_(s128) = G_LOAD %1(p0) :: (load 16 from %ir.a.addr) + %3:_(s128) = G_FNEG %2 + %q0 = COPY %3(s128) + RET_ReallyLR implicit %q0 + +... 
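Editorial aside, not part of the patch: the translate-gep.ll update that follows switches the test to checks autogenerated by update_mir_test_checks.py, and the constants in those checks are plain byte arithmetic over the test's aggregate types. A standalone C++ restatement, with sizeof(int) standing in for the target's 4-byte i32:

// Byte offsets behind the G_CONSTANT values in the checks below.
constexpr long StructSize = 8;            // {i8, i32} including padding
constexpr long TypeSize = 4 * StructSize; // %type = [4 x {i8, i32}]
constexpr long RowSize = 4 * sizeof(int); // [4 x i32] inside %type1
constexpr long Type1Size = 4 * RowSize;   // %type1 = [4 x [4 x i32]]

constexpr long FirstOffsetConst = 1 * TypeSize;                  // 32
constexpr long ConstThenVar = 4 * Type1Size + 1 * RowSize;       // 272
constexpr long VarThenConstTail = 2 * RowSize + 2 * sizeof(int); // 40

static_assert(FirstOffsetConst == 32 && ConstThenVar == 272 &&
                  VarThenConstTail == 40,
              "matches the autogenerated CHECK lines");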
diff --git a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll index 865315bbe0a3..4b69575079a3 100644 --- a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll +++ b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll @@ -1,85 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -mtriple=aarch64-linux-gnu -O0 -global-isel -stop-after=irtranslator -o - %s | FileCheck %s %type = type [4 x {i8, i32}] define %type* @first_offset_const(%type* %addr) { -; CHECK-LABEL: name: first_offset_const -; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0 -; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 -; CHECK: [[RES:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET]](s64) -; CHECK: %x0 = COPY [[RES]](p0) + ; CHECK-LABEL: name: first_offset_const + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: %x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) + ; CHECK: %x0 = COPY [[GEP]](p0) + ; CHECK: RET_ReallyLR implicit %x0 %res = getelementptr %type, %type* %addr, i32 1 ret %type* %res } define %type* @first_offset_trivial(%type* %addr) { -; CHECK-LABEL: name: first_offset_trivial -; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0 -; CHECK: [[TRIVIAL:%[0-9]+]]:_(p0) = COPY [[BASE]](p0) -; CHECK: %x0 = COPY [[TRIVIAL]](p0) + ; CHECK-LABEL: name: first_offset_trivial + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: %x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY [[COPY]](p0) + ; CHECK: %x0 = COPY [[COPY1]](p0) + ; CHECK: RET_ReallyLR implicit %x0 %res = getelementptr %type, %type* %addr, i32 0 ret %type* %res } define %type* @first_offset_variable(%type* %addr, i64 %idx) { -; CHECK-LABEL: name: first_offset_variable -; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0 -; CHECK: [[IDX:%[0-9]+]]:_(s64) = COPY %x1 -; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 -; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX]] -; CHECK: [[STEP0:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET]](s64) -; CHECK: [[RES:%[0-9]+]]:_(p0) = COPY [[STEP0]](p0) -; CHECK: %x0 = COPY [[RES]](p0) + ; CHECK-LABEL: name: first_offset_variable + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: %x0, %x1 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %x1 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C]], [[COPY1]] + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[MUL]](s64) + ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[GEP]](p0) + ; CHECK: %x0 = COPY [[COPY2]](p0) + ; CHECK: RET_ReallyLR implicit %x0 %res = getelementptr %type, %type* %addr, i64 %idx ret %type* %res } define %type* @first_offset_ext(%type* %addr, i32 %idx) { -; CHECK-LABEL: name: first_offset_ext -; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0 -; CHECK: [[IDX32:%[0-9]+]]:_(s32) = COPY %w1 -; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 -; CHECK: [[IDX64:%[0-9]+]]:_(s64) = G_SEXT [[IDX32]](s32) -; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX64]] -; CHECK: [[STEP0:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET]](s64) -; CHECK: [[RES:%[0-9]+]]:_(p0) = COPY [[STEP0]](p0) -; CHECK: %x0 = COPY [[RES]](p0) + ; CHECK-LABEL: name: first_offset_ext + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: %w1, %x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %w1 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; CHECK: 
[[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32) + ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C]], [[SEXT]] + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[MUL]](s64) + ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[GEP]](p0) + ; CHECK: %x0 = COPY [[COPY2]](p0) + ; CHECK: RET_ReallyLR implicit %x0 %res = getelementptr %type, %type* %addr, i32 %idx ret %type* %res } %type1 = type [4 x [4 x i32]] define i32* @const_then_var(%type1* %addr, i64 %idx) { -; CHECK-LABEL: name: const_then_var -; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0 -; CHECK: [[IDX:%[0-9]+]]:_(s64) = COPY %x1 -; CHECK: [[OFFSET1:%[0-9]+]]:_(s64) = G_CONSTANT i64 272 -; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 -; CHECK: [[BASE1:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET1]](s64) -; CHECK: [[OFFSET2:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX]] -; CHECK: [[BASE2:%[0-9]+]]:_(p0) = G_GEP [[BASE1]], [[OFFSET2]](s64) -; CHECK: [[RES:%[0-9]+]]:_(p0) = COPY [[BASE2]](p0) -; CHECK: %x0 = COPY [[RES]](p0) + ; CHECK-LABEL: name: const_then_var + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: %x0, %x1 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %x1 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 272 + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) + ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C1]], [[COPY1]] + ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[GEP]], [[MUL]](s64) + ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[GEP1]](p0) + ; CHECK: %x0 = COPY [[COPY2]](p0) + ; CHECK: RET_ReallyLR implicit %x0 %res = getelementptr %type1, %type1* %addr, i32 4, i32 1, i64 %idx ret i32* %res } define i32* @var_then_const(%type1* %addr, i64 %idx) { -; CHECK-LABEL: name: var_then_const -; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0 -; CHECK: [[IDX:%[0-9]+]]:_(s64) = COPY %x1 -; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 -; CHECK: [[OFFSET2:%[0-9]+]]:_(s64) = G_CONSTANT i64 40 -; CHECK: [[OFFSET1:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX]] -; CHECK: [[BASE1:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET1]](s64) -; CHECK: [[BASE2:%[0-9]+]]:_(p0) = G_GEP [[BASE1]], [[OFFSET2]](s64) -; CHECK: %x0 = COPY [[BASE2]](p0) + ; CHECK-LABEL: name: var_then_const + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: %x0, %x1 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %x1 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 40 + ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C]], [[COPY1]] + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[MUL]](s64) + ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[GEP]], [[C1]](s64) + ; CHECK: %x0 = COPY [[GEP1]](p0) + ; CHECK: RET_ReallyLR implicit %x0 %res = getelementptr %type1, %type1* %addr, i64 %idx, i32 2, i32 2 ret i32* %res } diff --git a/test/CodeGen/AArch64/arm64-jumptable.ll b/test/CodeGen/AArch64/arm64-jumptable.ll index f5c2ee6da0bf..fac3e5704d15 100644 --- a/test/CodeGen/AArch64/arm64-jumptable.ll +++ b/test/CodeGen/AArch64/arm64-jumptable.ll @@ -6,22 +6,20 @@ define void @sum(i32 %a, i32* %to, i32 %c) { entry: switch i32 %a, label %exit [ i32 1, label %bb1 - i32 2, label %bb2 + i32 2, label %exit.sink.split i32 3, label %bb3 i32 4, label %bb4 ] bb1: %b = add i32 %c, 1 - store i32 %b, i32* %to - br label %exit -bb2: - store i32 2, i32* %to - br label %exit + br label %exit.sink.split bb3: - store i32 3, i32* %to - br label %exit + br label %exit.sink.split bb4: - store i32 5, i32* %to + br label %exit.sink.split +exit.sink.split: 
+ %.sink = phi i32 [ 5, %bb4 ], [ %b, %bb1 ], [ 3, %bb3 ], [ %a, %entry ] + store i32 %.sink, i32* %to br label %exit exit: ret void diff --git a/test/CodeGen/AArch64/arm64-memset-to-bzero.ll b/test/CodeGen/AArch64/arm64-memset-to-bzero.ll index 29036caabf3a..3466e1bace56 100644 --- a/test/CodeGen/AArch64/arm64-memset-to-bzero.ll +++ b/test/CodeGen/AArch64/arm64-memset-to-bzero.ll @@ -4,9 +4,10 @@ ; RUN: FileCheck --check-prefix=CHECK-LINUX --check-prefix=CHECK %s ; ARM64: Calls to bzero() replaced with calls to memset() -; CHECK: @fct1 +; CHECK-LABEL: fct1: ; For small size (<= 256), we do not change memset to bzero. -; CHECK: memset +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset define void @fct1(i8* nocapture %ptr) { entry: tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i32 1, i1 false) @@ -15,20 +16,20 @@ entry: declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) -; CHECK: @fct2 +; CHECK-LABEL: fct2: ; When the size is bigger than 256, change into bzero. -; CHECK-DARWIN: bzero -; CHECK-LINUX: memset +; CHECK-DARWIN: {{b|bl}} _bzero +; CHECK-LINUX: {{b|bl}} memset define void @fct2(i8* nocapture %ptr) { entry: tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i32 1, i1 false) ret void } -; CHECK: @fct3 +; CHECK-LABEL: fct3: ; For unknown size, change to bzero. -; CHECK-DARWIN: bzero -; CHECK-LINUX: memset +; CHECK-DARWIN: {{b|bl}} _bzero +; CHECK-LINUX: {{b|bl}} memset define void @fct3(i8* nocapture %ptr, i32 %unknown) { entry: %conv = sext i32 %unknown to i64 @@ -36,9 +37,10 @@ entry: ret void } -; CHECK: @fct4 +; CHECK-LABEL: fct4: ; Size <= 256, no change. -; CHECK: memset +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset define void @fct4(i8* %ptr) { entry: %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) @@ -50,10 +52,10 @@ declare i8* @__memset_chk(i8*, i32, i64, i64) declare i64 @llvm.objectsize.i64(i8*, i1) -; CHECK: @fct5 +; CHECK-LABEL: fct5: ; Size > 256, change. -; CHECK-DARWIN: bzero -; CHECK-LINUX: memset +; CHECK-DARWIN: {{b|bl}} _bzero +; CHECK-LINUX: {{b|bl}} memset define void @fct5(i8* %ptr) { entry: %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) @@ -61,10 +63,10 @@ entry: ret void } -; CHECK: @fct6 +; CHECK-LABEL: fct6: ; Size = unknown, change. -; CHECK-DARWIN: bzero -; CHECK-LINUX: memset +; CHECK-DARWIN: {{b|bl}} _bzero +; CHECK-LINUX: {{b|bl}} memset define void @fct6(i8* %ptr, i32 %unknown) { entry: %conv = sext i32 %unknown to i64 @@ -76,9 +78,10 @@ entry: ; Next functions check that memset is not turned into bzero ; when the set constant is non-zero, whatever the given size. -; CHECK: @fct7 +; CHECK-LABEL: fct7: ; memset with something that is not a zero, no change. -; CHECK: memset +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset define void @fct7(i8* %ptr) { entry: %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) @@ -86,9 +89,10 @@ entry: ret void } -; CHECK: @fct8 +; CHECK-LABEL: fct8: ; memset with something that is not a zero, no change. -; CHECK: memset +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset define void @fct8(i8* %ptr) { entry: %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) @@ -96,9 +100,10 @@ entry: ret void } -; CHECK: @fct9 +; CHECK-LABEL: fct9: ; memset with something that is not a zero, no change. 
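Editorial aside, not part of the patch (the fct7 hunk continues just below): a source-level restatement of the property this test file pins down, with the 256-byte threshold and the Darwin-only _bzero lowering taken from the comments above.

#include <cstring>

void zero_and_fill(char *p) {
  std::memset(p, 0, 256);    // size <= 256: stays memset on every target
  std::memset(p, 0, 257);    // zero fill > 256 bytes: Darwin may emit _bzero
  std::memset(p, 0xca, 257); // non-zero fill byte: always stays memset
}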
-; CHECK: memset +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset define void @fct9(i8* %ptr, i32 %unknown) { entry: %conv = sext i32 %unknown to i64 diff --git a/test/CodeGen/AArch64/arm64-neon-2velem.ll b/test/CodeGen/AArch64/arm64-neon-2velem.ll index d22bfc76d1d5..b3a2bcd5d669 100644 --- a/test/CodeGen/AArch64/arm64-neon-2velem.ll +++ b/test/CodeGen/AArch64/arm64-neon-2velem.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck --check-prefix=EXYNOS %s +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,GENERIC ; The instruction latencies of Exynos-M1 trigger the transform we see under the Exynos check. +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck %s --check-prefixes=CHECK,EXYNOSM1 declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) @@ -47,7 +47,6 @@ declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmla_lane_s16: ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %b @@ -58,7 +57,6 @@ entry: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlaq_lane_s16: ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %b @@ -69,7 +67,6 @@ entry: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmla_lane_s32: ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %b @@ -80,7 +77,6 @@ entry: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlaq_lane_s32: ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %b @@ -91,7 +87,6 @@ entry: define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmla_laneq_s16: ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %b @@ -102,7 +97,6 @@ entry: define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlaq_laneq_s16: ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %b @@ -113,7 +107,6 @@ entry: define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmla_laneq_s32: ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %b @@ 
-124,7 +117,6 @@ entry: define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlaq_laneq_s32: ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %b @@ -135,7 +127,6 @@ entry: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmls_lane_s16: ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %b @@ -146,7 +137,6 @@ entry: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsq_lane_s16: ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %b @@ -157,7 +147,6 @@ entry: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmls_lane_s32: ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %b @@ -168,7 +157,6 @@ entry: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsq_lane_s32: ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %b @@ -179,7 +167,6 @@ entry: define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmls_laneq_s16: ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %b @@ -190,7 +177,6 @@ entry: define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsq_laneq_s16: ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %b @@ -201,7 +187,6 @@ entry: define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmls_laneq_s32: ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %b @@ -212,7 +197,6 @@ entry: define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsq_laneq_s32: ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %b @@ -223,7 +207,6 @@ entry: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmul_lane_s16: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %a @@ -233,7 +216,6 @@ entry: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmulq_lane_s16: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = 
shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %a @@ -243,7 +225,6 @@ entry: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmul_lane_s32: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %a @@ -253,7 +234,6 @@ entry: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmulq_lane_s32: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %a @@ -263,7 +243,6 @@ entry: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmul_lane_u16: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %a @@ -273,7 +252,6 @@ entry: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmulq_lane_u16: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %a @@ -283,7 +261,6 @@ entry: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmul_lane_u32: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %a @@ -293,7 +270,6 @@ entry: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmulq_lane_u32: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %a @@ -303,7 +279,6 @@ entry: define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmul_laneq_s16: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %a @@ -313,7 +288,6 @@ entry: define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmulq_laneq_s16: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %a @@ -323,7 +297,6 @@ entry: define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmul_laneq_s32: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %a @@ -333,7 +306,6 @@ entry: define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmulq_laneq_s32: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %a @@ -343,7 +315,6 @@ entry: define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmul_laneq_u16: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 
x i32> %mul = mul <4 x i16> %shuffle, %a @@ -353,7 +324,6 @@ entry: define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmulq_laneq_u16: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> %mul = mul <8 x i16> %shuffle, %a @@ -363,7 +333,6 @@ entry: define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmul_laneq_u32: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %a @@ -373,7 +342,6 @@ entry: define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmulq_laneq_u32: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle, %a @@ -382,12 +350,9 @@ entry: define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfma_lane_f32: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfma_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1] -; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -398,12 +363,9 @@ declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfmaq_lane_f32: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmaq_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1] -; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -414,12 +376,9 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; CHECK-LABEL: test_vfma_laneq_f32: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfma_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3] -; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -428,12 +387,9 @@ entry: define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; CHECK-LABEL: test_vfmaq_laneq_f32: -; CHECK: fmla 
{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmaq_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] -; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -442,12 +398,9 @@ entry: define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfms_lane_f32: -; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfms_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1] -; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %sub = fsub <2 x float> , %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> @@ -457,12 +410,9 @@ entry: define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfmsq_lane_f32: -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsq_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1] -; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %sub = fsub <2 x float> , %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> @@ -472,12 +422,9 @@ entry: define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; CHECK-LABEL: test_vfms_laneq_f32: -; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfms_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3] -; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %sub = fsub <4 x float> , %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> @@ -487,12 +434,9 @@ entry: define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; CHECK-LABEL: test_vfmsq_laneq_f32: -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsq_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] -; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %sub = fsub <4 x float> , %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> @@ -502,12 +446,9 @@ entry: define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { ; CHECK-LABEL: test_vfmaq_lane_f64: -; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: 
ret -; EXYNOS-LABEL: test_vfmaq_lane_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) @@ -518,12 +459,9 @@ declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; CHECK-LABEL: test_vfmaq_laneq_f64: -; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmaq_laneq_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1] -; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) @@ -532,12 +470,9 @@ entry: define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { ; CHECK-LABEL: test_vfmsq_lane_f64: -; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsq_lane_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %sub = fsub <1 x double> , %v %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer @@ -547,12 +482,9 @@ entry: define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; CHECK-LABEL: test_vfmsq_laneq_f64: -; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsq_laneq_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1] -; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %sub = fsub <2 x double> , %v %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> @@ -563,10 +495,6 @@ entry: define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) { ; CHECK-LABEL: test_vfmas_laneq_f32 ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXNOS-LABEL: test_vfmas_laneq_f32 -; EXNOS: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] -; EXNOS-NEXT: ret entry: %extract = extractelement <4 x float> %v, i32 3 %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a) @@ -578,7 +506,6 @@ declare float @llvm.fma.f32(float, float, float) define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) { ; CHECK-LABEL: test_vfmsd_lane_f64 ; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} -; CHECK-NEXT: ret entry: %extract.rhs = extractelement <1 x double> %v, i32 0 
%extract = fsub double -0.000000e+00, %extract.rhs @@ -591,10 +518,6 @@ declare double @llvm.fma.f64(double, double, double) define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %v) { ; CHECK-LABEL: test_vfmss_lane_f32 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmss_lane_f32 -; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] -; EXYNOS-NEXT: ret entry: %extract.rhs = extractelement <2 x float> %v, i32 1 %extract = fsub float -0.000000e+00, %extract.rhs @@ -605,7 +528,6 @@ entry: define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) { ; CHECK-LABEL: test_vfmss_laneq_f32 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %extract.rhs = extractelement <4 x float> %v, i32 3 %extract = fsub float -0.000000e+00, %extract.rhs @@ -616,10 +538,6 @@ entry: define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) { ; CHECK-LABEL: test_vfmsd_laneq_f64 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsd_laneq_f64 -; EXYNOS: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] -; EXYNOS-NEXT: ret entry: %extract.rhs = extractelement <2 x double> %v, i32 1 %extract = fsub double -0.000000e+00, %extract.rhs @@ -641,10 +559,6 @@ entry: define float @test_vfmss_lane_f32_0(float %a, float %b, <2 x float> %v) { ; CHECK-LABEL: test_vfmss_lane_f32_0 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmss_lane_f32_0 -; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] -; EXYNOS-NEXT: ret entry: %tmp0 = fsub <2 x float> , %v %tmp1 = extractelement <2 x float> %tmp0, i32 1 @@ -655,7 +569,6 @@ entry: define float @test_vfmss_laneq_f32_0(float %a, float %b, <4 x float> %v) { ; CHECK-LABEL: test_vfmss_laneq_f32_0 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %tmp0 = fsub <4 x float>, %v %tmp1 = extractelement <4 x float> %tmp0, i32 3 @@ -666,7 +579,6 @@ entry: define double @test_vfmsd_laneq_f64_0(double %a, double %b, <2 x double> %v) { ; CHECK-LABEL: test_vfmsd_laneq_f64_0 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret entry: %tmp0 = fsub <2 x double>, %v %tmp1 = extractelement <2 x double> %tmp0, i32 1 @@ -677,7 +589,6 @@ entry: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_lane_s16: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -688,7 +599,6 @@ entry: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlal_lane_s32: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -699,7 +609,6 @@ entry: define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlal_laneq_s16: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -710,7 +619,6 @@ entry: define <2 x 
i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_laneq_s32: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -721,7 +629,6 @@ entry: define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_high_lane_s16: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -733,7 +640,6 @@ entry: define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlal_high_lane_s32: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -745,7 +651,6 @@ entry: define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlal_high_laneq_s16: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> @@ -757,7 +662,6 @@ entry: define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_high_laneq_s32: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> @@ -769,7 +673,6 @@ entry: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_lane_s16: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -780,7 +683,6 @@ entry: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_lane_s32: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -791,7 +693,6 @@ entry: define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_laneq_s16: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -802,7 +703,6 @@ entry: define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_laneq_s32: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -813,7 +713,6 @@ entry: define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> 
%a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_s16: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -825,7 +724,6 @@ entry: define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_s32: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -837,7 +735,6 @@ entry: define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_s16: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> @@ -849,7 +746,6 @@ entry: define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_s32: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> @@ -861,7 +757,6 @@ entry: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_lane_u16: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -872,7 +767,6 @@ entry: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlal_lane_u32: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -883,7 +777,6 @@ entry: define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlal_laneq_u16: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -894,7 +787,6 @@ entry: define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_laneq_u32: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -905,7 +797,6 @@ entry: define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_high_lane_u16: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -917,7 +808,6 @@ entry: define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: 
test_vmlal_high_lane_u32: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -929,7 +819,6 @@ entry: define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlal_high_laneq_u16: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> @@ -941,7 +830,6 @@ entry: define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_high_laneq_u32: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> @@ -953,7 +841,6 @@ entry: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_lane_u16: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -964,7 +851,6 @@ entry: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_lane_u32: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -975,7 +861,6 @@ entry: define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_laneq_u16: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -986,7 +871,6 @@ entry: define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_laneq_u32: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -997,7 +881,6 @@ entry: define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_u16: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -1009,7 +892,6 @@ entry: define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_u32: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -1021,7 +903,6 @@ entry: define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_u16: ; CHECK: mlsl2 {{v[0-9]+}}.4s, 
{{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> @@ -1033,7 +914,6 @@ entry: define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_u32: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> @@ -1045,7 +925,6 @@ entry: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_lane_s16: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1055,7 +934,6 @@ entry: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_lane_s32: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1065,7 +943,6 @@ entry: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_lane_u16: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1075,7 +952,6 @@ entry: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_lane_u32: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1085,7 +961,6 @@ entry: define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_high_lane_s16: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -1096,7 +971,6 @@ entry: define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_high_lane_s32: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -1107,7 +981,6 @@ entry: define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_high_lane_u16: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -1118,7 +991,6 @@ entry: define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_high_lane_u32: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x 
i32> undef, <2 x i32> @@ -1129,7 +1001,6 @@ entry: define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_laneq_s16: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1139,7 +1010,6 @@ entry: define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_laneq_s32: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1149,7 +1019,6 @@ entry: define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_laneq_u16: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1159,7 +1028,6 @@ entry: define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_laneq_u32: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1169,7 +1037,6 @@ entry: define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_high_laneq_s16: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> @@ -1180,7 +1047,6 @@ entry: define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_high_laneq_s32: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> @@ -1191,7 +1057,6 @@ entry: define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_high_laneq_u16: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> @@ -1202,7 +1067,6 @@ entry: define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_high_laneq_u32: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> @@ -1213,7 +1077,6 @@ entry: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlal_lane_s16: ; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -1224,7 +1087,6 @@ entry: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 
x i32> %v) { ; CHECK-LABEL: test_vqdmlal_lane_s32: ; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -1235,7 +1097,6 @@ entry: define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlal_high_lane_s16: ; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -1247,7 +1108,6 @@ entry: define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmlal_high_lane_s32: ; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -1259,7 +1119,6 @@ entry: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlsl_lane_s16: ; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -1270,7 +1129,6 @@ entry: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmlsl_lane_s32: ; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -1281,7 +1139,6 @@ entry: define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlsl_high_lane_s16: ; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -1293,7 +1150,6 @@ entry: define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmlsl_high_lane_s32: ; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -1305,7 +1161,6 @@ entry: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmull_lane_s16: ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1315,7 +1170,6 @@ entry: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmull_lane_s32: ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1325,7 +1179,6 @@ entry: define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x 
i16> %v) { ; CHECK-LABEL: test_vqdmull_laneq_s16: ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1335,7 +1188,6 @@ entry: define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vqdmull_laneq_s32: ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1345,7 +1197,6 @@ entry: define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmull_high_lane_s16: ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> @@ -1356,7 +1207,6 @@ entry: define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmull_high_lane_s32: ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> @@ -1367,7 +1217,6 @@ entry: define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vqdmull_high_laneq_s16: ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> @@ -1378,7 +1227,6 @@ entry: define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vqdmull_high_laneq_s32: ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> @@ -1389,7 +1237,6 @@ entry: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmulh_lane_s16: ; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) @@ -1399,7 +1246,6 @@ entry: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmulhq_lane_s16: ; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) @@ -1409,7 +1255,6 @@ entry: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmulh_lane_s32: ; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) @@ -1419,7 +1264,6 @@ entry: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmulhq_lane_s32: ; CHECK: qdmulh 
{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) @@ -1429,7 +1273,6 @@ entry: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqrdmulh_lane_s16: ; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) @@ -1439,7 +1282,6 @@ entry: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqrdmulhq_lane_s16: ; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) @@ -1449,7 +1291,6 @@ entry: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqrdmulh_lane_s32: ; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) @@ -1459,7 +1300,6 @@ entry: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqrdmulhq_lane_s32: ; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) @@ -1468,12 +1308,9 @@ entry: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmul_lane_f32: -; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1] -; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> %mul = fmul <2 x float> %shuffle, %a @@ -1483,10 +1320,6 @@ entry: define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) { ; CHECK-LABEL: test_vmul_lane_f64: ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_lane_f64: -; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} -; EXYNOS-NEXT: ret entry: %0 = bitcast <1 x double> %a to <8 x i8> %1 = bitcast <8 x i8> %0 to double @@ -1498,12 +1331,9 @@ entry: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmulq_lane_f32: -; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1] -; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> 
undef, <4 x i32> %mul = fmul <4 x float> %shuffle, %a @@ -1512,12 +1342,9 @@ entry: define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) { ; CHECK-LABEL: test_vmulq_lane_f64: -; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_lane_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %mul = fmul <2 x double> %shuffle, %a @@ -1526,12 +1353,9 @@ entry: define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmul_laneq_f32: -; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3] -; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> %mul = fmul <2 x float> %shuffle, %a @@ -1541,10 +1365,6 @@ entry: define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) { ; CHECK-LABEL: test_vmul_laneq_f64: ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_laneq_f64: -; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] -; EXYNOS-NEXT: ret entry: %0 = bitcast <1 x double> %a to <8 x i8> %1 = bitcast <8 x i8> %0 to double @@ -1556,12 +1376,9 @@ entry: define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmulq_laneq_f32: -; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] -; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %mul = fmul <4 x float> %shuffle, %a @@ -1570,12 +1387,9 @@ entry: define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) { ; CHECK-LABEL: test_vmulq_laneq_f64: -; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_laneq_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1] -; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> %mul = fmul <2 x double> %shuffle, %a @@ -1584,12 +1398,9 @@ entry: define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmulx_lane_f32: -; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulx_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1] -; 
EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -1598,12 +1409,9 @@ entry: define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmulxq_lane_f32: -; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1] -; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; Exynos-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -1612,12 +1420,9 @@ entry: define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) { ; CHECK-LABEL: test_vmulxq_lane_f64: -; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_lane_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -1626,12 +1431,9 @@ entry: define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmulx_laneq_f32: -; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulx_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3] -; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -1640,12 +1442,9 @@ entry: define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmulxq_laneq_f32: -; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] -; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -1654,12 +1453,9 @@ entry: define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> 
%v) { ; CHECK-LABEL: test_vmulxq_laneq_f64: -; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_laneq_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1] -; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -1669,7 +1465,6 @@ entry: define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmla_lane_s16_0: ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %b @@ -1680,7 +1475,6 @@ entry: define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlaq_lane_s16_0: ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %b @@ -1691,7 +1485,6 @@ entry: define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmla_lane_s32_0: ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %b @@ -1702,7 +1495,6 @@ entry: define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlaq_lane_s32_0: ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %b @@ -1713,7 +1505,6 @@ entry: define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmla_laneq_s16_0: ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %b @@ -1724,7 +1515,6 @@ entry: define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlaq_laneq_s16_0: ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %b @@ -1735,7 +1525,6 @@ entry: define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmla_laneq_s32_0: ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %b @@ -1746,7 +1535,6 @@ entry: define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlaq_laneq_s32_0: ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %b @@ -1757,7 +1545,6 @@ entry: define <4 x i16> 
@test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmls_lane_s16_0: ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %b @@ -1768,7 +1555,6 @@ entry: define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsq_lane_s16_0: ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %b @@ -1779,7 +1565,6 @@ entry: define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmls_lane_s32_0: ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %b @@ -1790,7 +1575,6 @@ entry: define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsq_lane_s32_0: ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %b @@ -1801,7 +1585,6 @@ entry: define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmls_laneq_s16_0: ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %b @@ -1812,7 +1595,6 @@ entry: define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsq_laneq_s16_0: ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %b @@ -1823,7 +1605,6 @@ entry: define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmls_laneq_s32_0: ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %b @@ -1834,7 +1615,6 @@ entry: define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsq_laneq_s32_0: ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %b @@ -1845,7 +1625,6 @@ entry: define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmul_lane_s16_0: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %a @@ -1855,7 +1634,6 @@ entry: define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmulq_lane_s16_0: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %a @@ -1865,7 +1643,6 @@ entry: define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> 
%a, <2 x i32> %v) { ; CHECK-LABEL: test_vmul_lane_s32_0: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %a @@ -1875,7 +1652,6 @@ entry: define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmulq_lane_s32_0: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %a @@ -1885,7 +1661,6 @@ entry: define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmul_lane_u16_0: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %a @@ -1895,7 +1670,6 @@ entry: define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmulq_lane_u16_0: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %a @@ -1905,7 +1679,6 @@ entry: define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmul_lane_u32_0: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %a @@ -1915,7 +1688,6 @@ entry: define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmulq_lane_u32_0: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %a @@ -1925,7 +1697,6 @@ entry: define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmul_laneq_s16_0: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %a @@ -1935,7 +1706,6 @@ entry: define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmulq_laneq_s16_0: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %a @@ -1945,7 +1715,6 @@ entry: define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmul_laneq_s32_0: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %a @@ -1955,7 +1724,6 @@ entry: define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmulq_laneq_s32_0: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %a @@ -1965,7 +1733,6 @@ entry: define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmul_laneq_u16_0: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: 
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %a @@ -1975,7 +1742,6 @@ entry: define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmulq_laneq_u16_0: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %a @@ -1985,7 +1751,6 @@ entry: define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmul_laneq_u32_0: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %a @@ -1995,7 +1760,6 @@ entry: define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmulq_laneq_u32_0: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %a @@ -2004,12 +1768,9 @@ entry: define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfma_lane_f32_0: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfma_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -2018,12 +1779,9 @@ entry: define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfmaq_lane_f32_0: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmaq_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -2032,12 +1790,9 @@ entry: define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; CHECK-LABEL: test_vfma_laneq_f32_0: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfma_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -2046,12 +1801,9 @@ entry: define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> 
%b, <4 x float> %v) { ; CHECK-LABEL: test_vfmaq_laneq_f32_0: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmaq_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -2060,12 +1812,9 @@ entry: define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfms_lane_f32_0: -; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfms_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %sub = fsub <2 x float> , %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer @@ -2075,12 +1824,9 @@ entry: define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfmsq_lane_f32_0: -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsq_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %sub = fsub <2 x float> , %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer @@ -2090,12 +1836,9 @@ entry: define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; CHECK-LABEL: test_vfms_laneq_f32_0: -; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfms_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %sub = fsub <4 x float> , %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer @@ -2105,12 +1848,9 @@ entry: define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; CHECK-LABEL: test_vfmsq_laneq_f32_0: -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsq_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %sub = fsub <4 x float> , %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer @@ -2120,12 +1860,9 @@ entry: define <2 x double> 
@test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; CHECK-LABEL: test_vfmaq_laneq_f64_0: -; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmaq_laneq_f64_0: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) @@ -2134,12 +1871,9 @@ entry: define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; CHECK-LABEL: test_vfmsq_laneq_f64_0: -; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsq_laneq_f64_0: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %sub = fsub <2 x double> , %v %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer @@ -2150,7 +1884,6 @@ entry: define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_lane_s16_0: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2161,7 +1894,6 @@ entry: define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlal_lane_s32_0: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2172,7 +1904,6 @@ entry: define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlal_laneq_s16_0: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2183,7 +1914,6 @@ entry: define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_laneq_s32_0: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2194,7 +1924,6 @@ entry: define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_high_lane_s16_0: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2206,7 +1935,6 @@ 
entry: define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlal_high_lane_s32_0: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2218,7 +1946,6 @@ entry: define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlal_high_laneq_s16_0: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2230,7 +1957,6 @@ entry: define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_high_laneq_s32_0: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2242,7 +1968,6 @@ entry: define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_lane_s16_0: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2253,7 +1978,6 @@ entry: define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_lane_s32_0: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2264,7 +1988,6 @@ entry: define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_laneq_s16_0: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2275,7 +1998,6 @@ entry: define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_laneq_s32_0: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2286,7 +2008,6 @@ entry: define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_s16_0: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2298,7 +2019,6 @@ entry: define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_s32_0: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> 
%b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2310,7 +2030,6 @@ entry: define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_s16_0: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2322,7 +2041,6 @@ entry: define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_s32_0: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2334,7 +2052,6 @@ entry: define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_lane_u16_0: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2345,7 +2062,6 @@ entry: define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlal_lane_u32_0: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2356,7 +2072,6 @@ entry: define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlal_laneq_u16_0: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2367,7 +2082,6 @@ entry: define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_laneq_u32_0: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2378,7 +2092,6 @@ entry: define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_high_lane_u16_0: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2390,7 +2103,6 @@ entry: define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlal_high_lane_u32_0: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2402,7 +2114,6 @@ entry: define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: 
test_vmlal_high_laneq_u16_0: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2414,7 +2125,6 @@ entry: define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_high_laneq_u32_0: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2426,7 +2136,6 @@ entry: define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_lane_u16_0: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2437,7 +2146,6 @@ entry: define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_lane_u32_0: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2448,7 +2156,6 @@ entry: define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_laneq_u16_0: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2459,7 +2166,6 @@ entry: define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_laneq_u32_0: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2470,7 +2176,6 @@ entry: define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_u16_0: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2482,7 +2187,6 @@ entry: define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_u32_0: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2494,7 +2198,6 @@ entry: define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_u16_0: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> 
zeroinitializer @@ -2506,7 +2209,6 @@ entry: define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_u32_0: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2518,7 +2220,6 @@ entry: define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_lane_s16_0: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2528,7 +2229,6 @@ entry: define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_lane_s32_0: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2538,7 +2238,6 @@ entry: define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_lane_u16_0: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2548,7 +2247,6 @@ entry: define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_lane_u32_0: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2558,7 +2256,6 @@ entry: define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_high_lane_s16_0: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2569,7 +2266,6 @@ entry: define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_high_lane_s32_0: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2580,7 +2276,6 @@ entry: define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_high_lane_u16_0: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2591,7 +2286,6 @@ entry: define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_high_lane_u32_0: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> 
undef, <2 x i32> zeroinitializer @@ -2602,7 +2296,6 @@ entry: define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_laneq_s16_0: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2612,7 +2305,6 @@ entry: define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_laneq_s32_0: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2622,7 +2314,6 @@ entry: define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_laneq_u16_0: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2632,7 +2323,6 @@ entry: define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_laneq_u32_0: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2642,7 +2332,6 @@ entry: define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_high_laneq_s16_0: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2653,7 +2342,6 @@ entry: define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_high_laneq_s32_0: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2664,7 +2352,6 @@ entry: define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_high_laneq_u16_0: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2675,7 +2362,6 @@ entry: define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_high_laneq_u32_0: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2686,7 +2372,6 @@ entry: define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlal_lane_s16_0: ; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer 
%vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2697,7 +2382,6 @@ entry: define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmlal_lane_s32_0: ; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2708,7 +2392,6 @@ entry: define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlal_high_lane_s16_0: ; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2720,7 +2403,6 @@ entry: define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmlal_high_lane_s32_0: ; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2732,7 +2414,6 @@ entry: define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlsl_lane_s16_0: ; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2743,7 +2424,6 @@ entry: define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmlsl_lane_s32_0: ; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2754,7 +2434,6 @@ entry: define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlsl_high_lane_s16_0: ; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2766,7 +2445,6 @@ entry: define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmlsl_high_lane_s32_0: ; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2778,7 +2456,6 @@ entry: define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmull_lane_s16_0: ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2788,7 +2465,6 @@ entry: define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: 
test_vqdmull_lane_s32_0: ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2798,7 +2474,6 @@ entry: define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vqdmull_laneq_s16_0: ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2808,7 +2483,6 @@ entry: define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vqdmull_laneq_s32_0: ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2818,7 +2492,6 @@ entry: define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmull_high_lane_s16_0: ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2829,7 +2502,6 @@ entry: define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmull_high_lane_s32_0: ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2840,7 +2512,6 @@ entry: define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vqdmull_high_laneq_s16_0: ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2851,7 +2522,6 @@ entry: define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vqdmull_high_laneq_s32_0: ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2862,7 +2532,6 @@ entry: define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmulh_lane_s16_0: ; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) @@ -2872,7 +2541,6 @@ entry: define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmulhq_lane_s16_0: ; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) @@ -2882,7 
+2550,6 @@ entry: define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmulh_lane_s32_0: ; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) @@ -2892,7 +2559,6 @@ entry: define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmulhq_lane_s32_0: ; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) @@ -2902,7 +2568,6 @@ entry: define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqrdmulh_lane_s16_0: ; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) @@ -2912,7 +2577,6 @@ entry: define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqrdmulhq_lane_s16_0: ; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) @@ -2922,7 +2586,6 @@ entry: define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqrdmulh_lane_s32_0: ; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) @@ -2932,7 +2595,6 @@ entry: define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqrdmulhq_lane_s32_0: ; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) @@ -2941,12 +2603,9 @@ entry: define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmul_lane_f32_0: -; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer %mul = fmul <2 x float> %shuffle, %a @@ -2955,12 +2614,9 @@ entry: define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmulq_lane_f32_0: -; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: fmul {{v[0-9]+}}.4s, 
{{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer %mul = fmul <4 x float> %shuffle, %a @@ -2969,12 +2625,9 @@ entry: define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmul_laneq_f32_0: -; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %mul = fmul <2 x float> %shuffle, %a @@ -2984,10 +2637,6 @@ entry: define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) { ; CHECK-LABEL: test_vmul_laneq_f64_0: ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_laneq_f64_0: -; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] -; EXYNOS-NEXT: ret entry: %0 = bitcast <1 x double> %a to <8 x i8> %1 = bitcast <8 x i8> %0 to double @@ -2999,12 +2648,9 @@ entry: define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmulq_laneq_f32_0: -; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer %mul = fmul <4 x float> %shuffle, %a @@ -3013,12 +2659,9 @@ entry: define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) { ; CHECK-LABEL: test_vmulq_laneq_f64_0: -; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_laneq_f64_0: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer %mul = fmul <2 x double> %shuffle, %a @@ -3027,12 +2670,9 @@ entry: define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmulx_lane_f32_0: -; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulx_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> 
zeroinitializer %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -3041,12 +2681,9 @@ entry: define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmulxq_lane_f32_0: -; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -3055,12 +2692,9 @@ entry: define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) { ; CHECK-LABEL: test_vmulxq_lane_f64_0: -; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_lane_f64_0: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -3069,12 +2703,9 @@ entry: define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmulx_laneq_f32_0: -; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulx_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -3083,12 +2714,9 @@ entry: define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmulxq_laneq_f32_0: -; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -3097,12 +2725,9 @@ entry: define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) { ; CHECK-LABEL: test_vmulxq_laneq_f64_0: -; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_laneq_f64_0: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: mulx 
{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -3111,14 +2736,11 @@ entry: define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) { ; CHECK-LABEL: optimize_dup: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: optimize_dup: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] -; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a) @@ -3130,15 +2752,12 @@ entry: define <4 x float> @no_optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) { ; CHECK-LABEL: no_optimize_dup: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: no_optimize_dup: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] -; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS: dup [[y:v[0-9]+]].4s, {{v[0-9]+}}.s[1] -; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[y]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM1: dup [[W:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[W]].4s entry: %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a) @@ -3150,8 +2769,7 @@ entry: define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_a57(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="cortex-a57" { ; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_a57: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -3160,9 +2778,8 @@ entry: define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_m1(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="exynos-m1" { ; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m1: -; CHECK: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1] -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; CHECK-NEXT: ret +; GENERIC: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1] +; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s 
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
diff --git a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
index 453334dce601..2fb9d3b2d030 100644
--- a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
+++ b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
@@ -87,4 +87,13 @@ for.end:
   ret double %v0
 }
 
+define <2 x i64> @t6() {
+; ALL-LABEL: t6:
+; CYCLONE: movi.16b v0, #0
+; KRYO: movi v0.2d, #0000000000000000
+; FALKOR: movi v0.2d, #0000000000000000
+  ret <2 x i64> zeroinitializer
+}
+
+
 declare double @sin(double)
diff --git a/test/CodeGen/AArch64/chkstk.ll b/test/CodeGen/AArch64/chkstk.ll
new file mode 100644
index 000000000000..1c2e5528f10c
--- /dev/null
+++ b/test/CodeGen/AArch64/chkstk.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=aarch64-windows -verify-machineinstrs %s -o - \
+; RUN:   | FileCheck -check-prefix CHECK-DEFAULT-CODE-MODEL %s
+
+; RUN: llc -mtriple=aarch64-windows -verify-machineinstrs -code-model=large %s -o - \
+; RUN:   | FileCheck -check-prefix CHECK-LARGE-CODE-MODEL %s
+
+define void @check_watermark() {
+entry:
+  %buffer = alloca [4096 x i8], align 1
+  ret void
+}
+
+; CHECK-DEFAULT-CODE-MODEL: check_watermark:
+; CHECK-DEFAULT-CODE-MODEL-DAG: stp x29, x30, [sp
+; CHECK-DEFAULT-CODE-MODEL-DAG: orr x15, xzr, #0x100
+; CHECK-DEFAULT-CODE-MODEL: bl __chkstk
+; CHECK-DEFAULT-CODE-MODEL: sub sp, sp, x15, lsl #4
+
+; CHECK-LARGE-CODE-MODEL: check_watermark:
+; CHECK-LARGE-CODE-MODEL-DAG: stp x29, x30, [sp
+; CHECK-LARGE-CODE-MODEL-DAG: orr x15, xzr, #0x100
+; CHECK-LARGE-CODE-MODEL-DAG: adrp x16, __chkstk
+; CHECK-LARGE-CODE-MODEL-DAG: add x16, x16, __chkstk
+; CHECK-LARGE-CODE-MODEL: blr x16
+; CHECK-LARGE-CODE-MODEL: sub sp, sp, x15, lsl #4
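In the checks above, x15 carries the allocation size to __chkstk in 16-byte units: the 4096-byte buffer yields orr x15, xzr, #0x100 (4096 / 16 = 0x100), and sub sp, sp, x15, lsl #4 scales it back (0x100 << 4 = 4096). A minimal sketch of a second case under that arithmetic; the function name is made up and this is not part of the patch:

; Hypothetical companion test: by the same reasoning, an 8192-byte frame
; should reach __chkstk with x15 = 8192 / 16 = 0x200.
define void @check_watermark_8k() {
entry:
  %buffer = alloca [8192 x i8], align 1
  ret void
}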
 ; CHECK-LABEL: main:
-; CHECK: stp xzr, xzr, [sp, #72]
+; CHECK: str xzr, [sp, #80]
 ; CHECK: str w9, [sp, #80]
-; CHECK: str q0, [sp, #48]
+; CHECK: stp q0, q0, [sp, #48]
 ; CHECK: ldr w8, [sp, #48]
-; CHECK: str q0, [sp, #64]
 
 for.body.lr.ph.i.i.i.i.i.i63:
   %b1 = alloca [10 x i32], align 16
diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 228d3c7d4306..71c4c83c28f9 100644
--- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -251,7 +251,8 @@ entry:
 
 ; R600: MOVA_INT
 
-; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding:
+; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding:
+; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding:
 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0
 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0
 
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll b/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll
new file mode 100644
index 000000000000..f97785beab6f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mtriple=amdgcn--amdhsa-amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Effectively, check that the compilation finishes; in the case
+; of an infinite loop, llc toggles between merging 2 ST4s
+; (MergeConsecutiveStores()) and breaking the resulting ST8
+; apart (LegalizeStoreOps()).
+
+target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"
+
+; GCN-LABEL: {{^}}_Z6brokenPd:
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}
+; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}
+define amdgpu_kernel void @_Z6brokenPd(double* %arg) {
+bb:
+  %tmp = alloca double, align 8, addrspace(5)
+  %tmp1 = alloca double, align 8, addrspace(5)
+  %tmp2 = load double, double* %arg, align 8
+  br i1 1, label %bb6, label %bb4
+
+bb3:                                              ; No predecessors!
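+; bb3 is dead, but it still feeds the phi for %tmp5 below; keeping the
+; unreachable predecessor around appears to be part of reproducing the
+; merge/re-split cycle described above.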
+  br label %bb4
+
+bb4:                                              ; preds = %bb3, %bb
+  %tmp5 = phi double addrspace(5)* [ %tmp1, %bb3 ], [ %tmp, %bb ]
+  store double %tmp2, double addrspace(5)* %tmp5, align 8
+  br label %bb6
+
+bb6:                                              ; preds = %bb4, %bb
+  %tmp7 = phi double [ 0x7FF8123000000000, %bb4 ], [ 0x7FF8000000000000, %bb ]
+  store double %tmp7, double* %arg, align 8
+  ret void
+}
diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
index 7c2666e3680f..8b9b83f6d0e7 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
@@ -6,6 +6,7 @@
   define void @test_trunc_and_zext_s16() { ret void }
   define void @test_trunc_and_anyext_s8() { ret void }
   define void @test_trunc_and_anyext_s16() { ret void }
+  define void @test_trunc_s64() #0 { ret void }
 
   define void @test_add_s32() { ret void }
   define void @test_add_fold_imm_s32() { ret void }
@@ -46,6 +47,10 @@
   define void @test_gep() { ret void }
   define void @test_constant_imm() { ret void }
   define void @test_constant_cimm() { ret void }
+  define void @test_pointer_constant() { ret void }
+
+  define void @test_inttoptr_s32() { ret void }
+  define void @test_ptrtoint_s32() { ret void }
 
   define void @test_select_s32() { ret void }
   define void @test_select_ptr() { ret void }
@@ -241,6 +246,36 @@ body: |
   ; CHECK: BX_RET 14, %noreg, implicit %r0
 ...
 ---
+name: test_trunc_s64
+# CHECK-LABEL: name: test_trunc_s64
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: fprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+body: |
+  bb.0:
+    liveins: %r0, %d0
+
+    %0(s64) = COPY %d0
+    ; CHECK: [[VREG:%[0-9]+]]:dpr = COPY %d0
+
+    %2(p0) = COPY %r0
+    ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY %r0
+
+    %1(s32) = G_TRUNC %0(s64)
+    ; CHECK: [[VREGTRUNC:%[0-9]+]]:gpr, [[UNINTERESTING:%[0-9]+]]:gpr = VMOVRRD [[VREG]]
+
+    G_STORE %1(s32), %2 :: (store 4)
+    ; CHECK: STRi12 [[VREGTRUNC]], [[PTR]], 0, 14, %noreg
+
+    BX_RET 14, %noreg
+    ; CHECK: BX_RET 14, %noreg
+...
+---
 name: test_add_s32
 # CHECK-LABEL: name: test_add_s32
 legalized: true
@@ -1075,6 +1110,71 @@ body: |
   BX_RET 14, %noreg, implicit %r0
 ...
 ---
+name: test_pointer_constant
+# CHECK-LABEL: name: test_pointer_constant
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+body: |
+  bb.0:
+    %0(p0) = G_CONSTANT i32 0
+    ; CHECK: %[[C:[0-9]+]]:gpr = MOVi 0, 14, %noreg, %noreg
+
+    %r0 = COPY %0(p0)
+    BX_RET 14, %noreg, implicit %r0
+...
+---
+name: test_inttoptr_s32
+# CHECK-LABEL: name: test_inttoptr_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+body: |
+  bb.0:
+    liveins: %r0
+
+    %0(s32) = COPY %r0
+    %1(p0) = G_INTTOPTR %0(s32)
+    ; CHECK: [[INT:%[0-9]+]]:gpr = COPY %r0
+    ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY [[INT]]
+
+    %r0 = COPY %1(p0)
+    ; CHECK: %r0 = COPY [[PTR]]
+
+    BX_RET 14, %noreg, implicit %r0
+...
+---
+name: test_ptrtoint_s32
+# CHECK-LABEL: name: test_ptrtoint_s32
+legalized: true
+regBankSelected: true
+selected: false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+body: |
+  bb.0:
+    liveins: %r0
+
+    %0(p0) = COPY %r0
+    %1(s32) = G_PTRTOINT %0(p0)
+    ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY %r0
+    ; CHECK: [[INT:%[0-9]+]]:gpr = COPY [[PTR]]
+
+    %r0 = COPY %1(s32)
+    ; CHECK: %r0 = COPY [[INT]]
+
+    BX_RET 14, %noreg, implicit %r0
+...
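+# On ARM, pointer and 32-bit scalar values live in the same (GPR) registers,
+# so the selector is expected to lower the G_INTTOPTR/G_PTRTOINT above to
+# plain COPYs; the CHECK lines pin down that no real instruction is emitted.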
+---
 name: test_select_s32
 # CHECK-LABEL: name: test_select_s32
 legalized: true
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
index e3e206cf76e9..204434e981b4 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
@@ -3,6 +3,9 @@
   define void @test_sext_s8() { ret void }
   define void @test_zext_s16() { ret void }
 
+  define void @test_inttoptr_s32() { ret void }
+  define void @test_ptrtoint_s32() { ret void }
+
   define void @test_add_s8() { ret void }
   define void @test_add_s16() { ret void }
   define void @test_add_s32() { ret void }
@@ -101,6 +104,50 @@ body: |
   BX_RET 14, %noreg, implicit %r0
 ...
 ---
+name: test_inttoptr_s32
+# CHECK-LABEL: name: test_inttoptr_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body: |
+  bb.0:
+    liveins: %r0
+
+    %0(s32) = COPY %r0
+    %1(p0) = G_INTTOPTR %0(s32)
+    ; G_INTTOPTR with s32 is legal, so we should find it unchanged in the output
+    ; CHECK: {{%[0-9]+}}:_(p0) = G_INTTOPTR {{%[0-9]+}}
+    %r0 = COPY %1(p0)
+    BX_RET 14, %noreg, implicit %r0
+...
+---
+name: test_ptrtoint_s32
+# CHECK-LABEL: name: test_ptrtoint_s32
+legalized: false
+# CHECK: legalized: true
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body: |
+  bb.0:
+    liveins: %r0
+
+    %0(p0) = COPY %r0
+    %1(s32) = G_PTRTOINT %0(p0)
+    ; G_PTRTOINT with s32 is legal, so we should find it unchanged in the output
+    ; CHECK: {{%[0-9]+}}:_(s32) = G_PTRTOINT {{%[0-9]+}}
+    %r0 = COPY %1(s32)
+    BX_RET 14, %noreg, implicit %r0
+...
+---
 name: test_add_s8
 # CHECK-LABEL: name: test_add_s8
 legalized: false
@@ -826,6 +873,7 @@ registers:
   - { id: 2, class: _ }
   - { id: 3, class: _ }
   - { id: 4, class: _ }
+  - { id: 5, class: _ }
 body: |
   bb.0:
     liveins: %r0
@@ -856,6 +904,10 @@ body: |
     ; CHECK: {{%[0-9]+}}:_(s1) = G_TRUNC [[EXT]](s32)
     ; CHECK-NOT: G_CONSTANT i1
 
+    %5(p0) = G_CONSTANT 0
+    G_STORE %5(p0), %4(p0) :: (store 4)
+    ; CHECK: {{%[0-9]+}}:_(p0) = G_CONSTANT 0
+
     %r0 = COPY %0(s32)
     BX_RET 14, %noreg, implicit %r0
 ...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
index 044740e33a2d..175333626f97 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
@@ -24,6 +24,9 @@
 
   define void @test_constants() { ret void }
 
+  define void @test_inttoptr_s32() { ret void }
+  define void @test_ptrtoint_s32() { ret void }
+
   @a_global = global float 1.0
   define void @test_globals() { ret void }
 
@@ -31,6 +34,7 @@
 
   define void @test_anyext_s16_32() { ret void }
   define void @test_trunc_s32_16() { ret void }
+  define void @test_trunc_s64_32() #0 { ret void }
 
   define void @test_icmp_eq_s32() { ret void }
   define void @test_fcmp_one_s32() #0 { ret void }
@@ -496,6 +500,44 @@ body: |
   BX_RET 14, %noreg, implicit %r0
 ...
 ---
+name: test_inttoptr_s32
+# CHECK-LABEL: name: test_inttoptr_s32
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb, preferred-register: '' }
+# CHECK: - { id: 1, class: gprb, preferred-register: '' }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body: |
+  bb.0:
+    %0(s32) = COPY %r0
+    %1(p0) = G_INTTOPTR %0(s32)
+    %r0 = COPY %1(p0)
+    BX_RET 14, %noreg, implicit %r0
+...
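+# For this pair of tests the interesting output is the register list: both
+# the pointer and the integer vregs should land in the gprb bank, which is
+# what the CHECK lines on the registers block verify.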
+---
+name: test_ptrtoint_s32
+# CHECK-LABEL: name: test_ptrtoint_s32
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb, preferred-register: '' }
+# CHECK: - { id: 1, class: gprb, preferred-register: '' }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body: |
+  bb.0:
+    %0(p0) = COPY %r0
+    %1(s32) = G_PTRTOINT %0(p0)
+    %r0 = COPY %1(s32)
+    BX_RET 14, %noreg, implicit %r0
+...
+---
 name: test_globals
 # CHECK-LABEL: name: test_globals
 legalized: true
@@ -584,6 +626,30 @@ body: |
   BX_RET 14, %noreg
 ...
 ---
+name: test_trunc_s64_32
+# CHECK-LABEL: name: test_trunc_s64_32
+legalized: true
+regBankSelected: false
+selected: false
+# CHECK: registers:
+# CHECK: - { id: 0, class: fprb, preferred-register: '' }
+# CHECK: - { id: 1, class: gprb, preferred-register: '' }
+# CHECK: - { id: 2, class: gprb, preferred-register: '' }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body: |
+  bb.0:
+    liveins: %r0, %d0
+
+    %0(s64) = COPY %d0
+    %2(p0) = COPY %r0
+    %1(s32) = G_TRUNC %0(s64)
+    G_STORE %1(s32), %2 :: (store 4)
+    BX_RET 14, %noreg
+...
+---
 name: test_icmp_eq_s32
 # CHECK-LABEL: name: test_icmp_eq_s32
 legalized: true
diff --git a/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
index 78d3ebf371a4..9373c5d44210 100644
--- a/test/CodeGen/ARM/avoid-cpsr-rmw.ll
+++ b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 -simplifycfg-sink-common=false | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift -simplifycfg-sink-common=false | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
 
 ; Avoid some 's' 16-bit instructions which partially update CPSR (and add a false
 ; dependency) when they aren't dependent on the last CPSR-defining instruction.
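 ; (For instance, a 16-bit 'adds' sets the flags as a side effect, while the
 ; 32-bit 'add.w' encoding can leave CPSR untouched; the newly added
 ; -simplifycfg-sink-common=false flag on the RUN lines presumably just
 ; preserves the IR shape this test was originally written against.)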
 ; rdar://8928208
diff --git a/test/CodeGen/ARM/su-addsub-overflow.ll b/test/CodeGen/ARM/su-addsub-overflow.ll
new file mode 100644
index 000000000000..eef531282033
--- /dev/null
+++ b/test/CodeGen/ARM/su-addsub-overflow.ll
@@ -0,0 +1,135 @@
+; RUN: llc < %s -mtriple=arm-eabi -mcpu=generic | FileCheck %s
+
+define i32 @sadd(i32 %a, i32 %b) local_unnamed_addr #0 {
+; CHECK-LABEL: sadd:
+; CHECK: mov r[[R0:[0-9]+]], r0
+; CHECK-NEXT: add r[[R1:[0-9]+]], r[[R0]], r1
+; CHECK-NEXT: cmp r[[R1]], r[[R0]]
+; CHECK-NEXT: movvc pc, lr
+entry:
+  %0 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+  %1 = extractvalue { i32, i1 } %0, 1
+  br i1 %1, label %trap, label %cont
+
+trap:
+  tail call void @llvm.trap() #2
+  unreachable
+
+cont:
+  %2 = extractvalue { i32, i1 } %0, 0
+  ret i32 %2
+
+}
+
+define i32 @uadd(i32 %a, i32 %b) local_unnamed_addr #0 {
+; CHECK-LABEL: uadd:
+; CHECK: mov r[[R0:[0-9]+]], r0
+; CHECK-NEXT: adds r[[R1:[0-9]+]], r[[R0]], r1
+; CHECK-NEXT: cmp r[[R1]], r[[R0]]
+; CHECK-NEXT: movhs pc, lr
+entry:
+  %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+  %1 = extractvalue { i32, i1 } %0, 1
+  br i1 %1, label %trap, label %cont
+
+trap:
+  tail call void @llvm.trap() #2
+  unreachable
+
+cont:
+  %2 = extractvalue { i32, i1 } %0, 0
+  ret i32 %2
+
+}
+
+define i32 @ssub(i32 %a, i32 %b) local_unnamed_addr #0 {
+; CHECK-LABEL: ssub:
+; CHECK: cmp r0, r1
+; CHECK-NEXT: subvc r0, r0, r1
+; CHECK-NEXT: movvc pc, lr
+entry:
+  %0 = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b)
+  %1 = extractvalue { i32, i1 } %0, 1
+  br i1 %1, label %trap, label %cont
+
+trap:
+  tail call void @llvm.trap() #2
+  unreachable
+
+cont:
+  %2 = extractvalue { i32, i1 } %0, 0
+  ret i32 %2
+
+}
+
+define i32 @usub(i32 %a, i32 %b) local_unnamed_addr #0 {
+; CHECK-LABEL: usub:
+; CHECK: mov r[[R0:[0-9]+]], r0
+; CHECK-NEXT: subs r[[R1:[0-9]+]], r[[R0]], r1
+; CHECK-NEXT: cmp r[[R0]], r1
+; CHECK-NEXT: movhs pc, lr
+entry:
+  %0 = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
+  %1 = extractvalue { i32, i1 } %0, 1
+  br i1 %1, label %trap, label %cont
+
+trap:
+  tail call void @llvm.trap() #2
+  unreachable
+
+cont:
+  %2 = extractvalue { i32, i1 } %0, 0
+  ret i32 %2
+
+}
+
+define void @sum(i32* %a, i32* %b, i32 %n) local_unnamed_addr #0 {
+; CHECK-LABEL: sum:
+; CHECK: ldr [[R0:r[0-9]+]],
+; CHECK-NEXT: ldr [[R1:r[0-9]+|lr]],
+; CHECK-NEXT: add [[R2:r[0-9]+]], [[R1]], [[R0]]
+; CHECK-NEXT: cmp [[R2]], [[R1]]
+; CHECK-NEXT: strvc [[R2]],
+; CHECK-NEXT: addvc
+; CHECK-NEXT: cmpvc
+; CHECK-NEXT: bvs
+entry:
+  %cmp7 = icmp eq i32 %n, 0
+  br i1 %cmp7, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.08 = phi i32 [ %7, %cont2 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.08
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.08
+  %1 = load i32, i32* %arrayidx1, align 4
+  %2 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %1, i32 %0)
+  %3 = extractvalue { i32, i1 } %2, 1
+  br i1 %3, label %trap, label %cont
+
+trap:
+  tail call void @llvm.trap() #2
+  unreachable
+
+cont:
+  %4 = extractvalue { i32, i1 } %2, 0
+  store i32 %4, i32* %arrayidx1, align 4
+  %5 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %i.08, i32 1)
+  %6 = extractvalue { i32, i1 } %5, 1
+  br i1 %6, label %trap, label %cont2
+
+cont2:
+  %7 = extractvalue { i32, i1 } %5, 0
+  %cmp = icmp eq i32 %7, %n
+  br i1 %cmp, label %for.cond.cleanup, label %for.body
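+; Roughly the C source this corresponds to (an illustrative sketch only;
+; 'sum' here is hypothetical, but clang lowers __builtin_sadd_overflow to the
+; same @llvm.sadd.with.overflow.i32 intrinsic used above):
+;
+;   void sum(int *a, int *b, int n) {
+;     for (int i = 0; i != n;) {
+;       int s;
+;       if (__builtin_sadd_overflow(a[i], b[i], &s)) __builtin_trap();
+;       a[i] = s;                                  /* store the checked sum */
+;       if (__builtin_sadd_overflow(i, 1, &i)) __builtin_trap();
+;     }
+;   }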
+
+}
+
+declare void @llvm.trap() #2
+declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) #1
+declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1
+declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) #1
+declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
diff --git a/test/CodeGen/ARM/usat.ll b/test/CodeGen/ARM/usat.ll
new file mode 100644
index 000000000000..8f19d11ef7bb
--- /dev/null
+++ b/test/CodeGen/ARM/usat.ll
@@ -0,0 +1,214 @@
+; RUN: llc -mtriple=armv4t-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V4T
+; RUN: llc -mtriple=armv6-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V6
+; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V6T2
+
+; Check for several conditions that should result in USAT.
+; For example, the base test is equivalent to
+; x < 0 ? 0 : (x > k ? k : x) in C. All patterns that bound x
+; to the interval [0, k] where k + 1 is a power of 2 can be
+; transformed into USAT. At the end there are some tests
+; checking that conditionals are not transformed if they don't
+; match the right pattern.
+
+;
+; Base tests with different bit widths
+;
+
+; x < 0 ? 0 : (x > k ? k : x)
+; 32-bit base test
+define i32 @unsigned_sat_base_32bit(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_base_32bit:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+  %cmpLow = icmp slt i32 %x, 0
+  %cmpUp = icmp sgt i32 %x, 8388607
+  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x
+  %saturateLow = select i1 %cmpLow, i32 0, i32 %saturateUp
+  ret i32 %saturateLow
+}
+
+; x < 0 ? 0 : (x > k ? k : x)
+; 16-bit base test
+define i16 @unsigned_sat_base_16bit(i16 %x) #0 {
+; CHECK-LABEL: unsigned_sat_base_16bit:
+; V6: usat r0, #11, r0
+; V6T2: usat r0, #11, r0
+; V4T-NOT: usat
+entry:
+  %cmpLow = icmp slt i16 %x, 0
+  %cmpUp = icmp sgt i16 %x, 2047
+  %saturateUp = select i1 %cmpUp, i16 2047, i16 %x
+  %saturateLow = select i1 %cmpLow, i16 0, i16 %saturateUp
+  ret i16 %saturateLow
+}
+
+; x < 0 ? 0 : (x > k ? k : x)
+; 8-bit base test
+define i8 @unsigned_sat_base_8bit(i8 %x) #0 {
+; CHECK-LABEL: unsigned_sat_base_8bit:
+; V6: usat r0, #5, r0
+; V6T2: usat r0, #5, r0
+; V4T-NOT: usat
+entry:
+  %cmpLow = icmp slt i8 %x, 0
+  %cmpUp = icmp sgt i8 %x, 31
+  %saturateUp = select i1 %cmpUp, i8 31, i8 %x
+  %saturateLow = select i1 %cmpLow, i8 0, i8 %saturateUp
+  ret i8 %saturateLow
+}
+
+;
+; Tests where the conditionals that check for upper and lower bounds,
+; or the < and > operators, are arranged in different ways. Only some
+; of the possible combinations that lead to USAT are tested.
+;
+; x < 0 ? 0 : (x < k ? x : k)
+define i32 @unsigned_sat_lower_upper_1(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_lower_upper_1:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+  %cmpLow = icmp slt i32 %x, 0
+  %cmpUp = icmp slt i32 %x, 8388607
+  %saturateUp = select i1 %cmpUp, i32 %x, i32 8388607
+  %saturateLow = select i1 %cmpLow, i32 0, i32 %saturateUp
+  ret i32 %saturateLow
+}
+
+; x > 0 ? (x > k ? k : x) : 0
+define i32 @unsigned_sat_lower_upper_2(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_lower_upper_2:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+  %cmpLow = icmp sgt i32 %x, 0
+  %cmpUp = icmp sgt i32 %x, 8388607
+  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x
+  %saturateLow = select i1 %cmpLow, i32 %saturateUp, i32 0
+  ret i32 %saturateLow
+}
+
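+; For reference, the 32-bit base pattern in plain C (an illustrative sketch;
+; 'usat23' is a made-up name, with k = 0x7fffff so that k + 1 is a power of 2):
+;
+;   int usat23(int x) {
+;     const int k = 0x7fffff;
+;     int upper = x > k ? k : x;   /* clamp above at k                  */
+;     return x < 0 ? 0 : upper;    /* clamp below at 0: range is [0, k] */
+;   }
+;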
+; x < k ? (x < 0 ? 0 : x) : k
+define i32 @unsigned_sat_upper_lower_1(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_upper_lower_1:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+  %cmpUp = icmp slt i32 %x, 8388607
+  %cmpLow = icmp slt i32 %x, 0
+  %saturateLow = select i1 %cmpLow, i32 0, i32 %x
+  %saturateUp = select i1 %cmpUp, i32 %saturateLow, i32 8388607
+  ret i32 %saturateUp
+}
+
+; x > k ? k : (x < 0 ? 0 : x)
+define i32 @unsigned_sat_upper_lower_2(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_upper_lower_2:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+  %cmpUp = icmp sgt i32 %x, 8388607
+  %cmpLow = icmp slt i32 %x, 0
+  %saturateLow = select i1 %cmpLow, i32 0, i32 %x
+  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+  ret i32 %saturateUp
+}
+
+; k < x ? k : (x > 0 ? x : 0)
+define i32 @unsigned_sat_upper_lower_3(i32 %x) #0 {
+; CHECK-LABEL: unsigned_sat_upper_lower_3:
+; V6: usat r0, #23, r0
+; V6T2: usat r0, #23, r0
+; V4T-NOT: usat
+entry:
+  %cmpUp = icmp slt i32 8388607, %x
+  %cmpLow = icmp sgt i32 %x, 0
+  %saturateLow = select i1 %cmpLow, i32 %x, i32 0
+  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+  ret i32 %saturateUp
+}
+
+;
+; The following tests check for patterns that should not transform
+; into USAT but are similar enough that they could confuse the selector.
+;
+; x > k ? k : (x > 0 ? 0 : x)
+; First condition upper-saturates, second doesn't lower-saturate.
+define i32 @no_unsigned_sat_missing_lower(i32 %x) #0 {
+; CHECK-LABEL: no_unsigned_sat_missing_lower:
+; CHECK-NOT: usat
+entry:
+  %cmpUp = icmp sgt i32 %x, 8388607
+  %cmpLow = icmp sgt i32 %x, 0
+  %saturateLow = select i1 %cmpLow, i32 0, i32 %x
+  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+  ret i32 %saturateUp
+}
+
+; x < k ? k : (x < 0 ? 0 : x)
+; Second condition lower-saturates, first doesn't upper-saturate.
+define i32 @no_unsigned_sat_missing_upper(i32 %x) #0 {
+; CHECK-LABEL: no_unsigned_sat_missing_upper:
+; CHECK-NOT: usat
+entry:
+  %cmpUp = icmp slt i32 %x, 8388607
+  %cmpLow = icmp slt i32 %x, 0
+  %saturateLow = select i1 %cmpLow, i32 0, i32 %x
+  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+  ret i32 %saturateUp
+}
+
+; Lower constant is different in the select and in the compare
+define i32 @no_unsigned_sat_incorrect_constant(i32 %x) #0 {
+; CHECK-LABEL: no_unsigned_sat_incorrect_constant:
+; CHECK-NOT: usat
+entry:
+  %cmpUp = icmp sgt i32 %x, 8388607
+  %cmpLow = icmp slt i32 %x, 0
+  %saturateLow = select i1 %cmpLow, i32 -1, i32 %x
+  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+  ret i32 %saturateUp
+}
+
+; The interval is not [0, k]
+define i32 @no_unsigned_sat_incorrect_interval(i32 %x) #0 {
+; CHECK-LABEL: no_unsigned_sat_incorrect_interval:
+; CHECK-NOT: usat
+entry:
+  %cmpUp = icmp sgt i32 %x, 8388607
+  %cmpLow = icmp slt i32 %x, -4
+  %saturateLow = select i1 %cmpLow, i32 -4, i32 %x
+  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+  ret i32 %saturateUp
+}
+
+; The returned value (y) is not the same as the tested value (x).
+define i32 @no_unsigned_sat_incorrect_return(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: no_unsigned_sat_incorrect_return:
+; CHECK-NOT: usat
+entry:
+  %cmpUp = icmp sgt i32 %x, 8388607
+  %cmpLow = icmp slt i32 %x, 0
+  %saturateLow = select i1 %cmpLow, i32 0, i32 %y
+  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+  ret i32 %saturateUp
+}
+
+; One of the values in a compare (y) is not the same as the rest
+; of the compare and select values (x).
+define i32 @no_unsigned_sat_incorrect_compare(i32 %x, i32 %y) #0 {
+; CHECK-LABEL: no_unsigned_sat_incorrect_compare:
+; CHECK-NOT: usat
+entry:
+  %cmpUp = icmp sgt i32 %x, 8388607
+  %cmpLow = icmp slt i32 %y, 0
+  %saturateLow = select i1 %cmpLow, i32 0, i32 %x
+  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow
+  ret i32 %saturateUp
+}
diff --git a/test/CodeGen/BPF/objdump_imm_hex.ll b/test/CodeGen/BPF/objdump_imm_hex.ll
new file mode 100644
index 000000000000..a245a6c791f2
--- /dev/null
+++ b/test/CodeGen/BPF/objdump_imm_hex.ll
@@ -0,0 +1,65 @@
+; RUN: llc -march=bpfel -filetype=obj -o - %s | llvm-objdump -d - | FileCheck --check-prefix=CHECK-DEC %s
+; RUN: llc -march=bpfel -filetype=obj -o - %s | llvm-objdump -d -print-imm-hex - | FileCheck --check-prefix=CHECK-HEX %s
+
+; Source Code:
+;   int gbl;
+;   int test(unsigned long long a, unsigned long long b) {
+;     int ret = 0;
+;     if (a == 0xABCDABCDabcdabcdULL) {
+;       gbl = gbl * gbl * 2;
+;       ret = 1;
+;       goto out;
+;     }
+;     if (b == 0xABCDabcdabcdULL) {
+;       gbl = gbl * 4;
+;       ret = 2;
+;     }
+;   out:
+;     return ret;
+;   }
+
+@gbl = common local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: norecurse nounwind
+define i32 @test(i64, i64) local_unnamed_addr #0 {
+; CHECK-LABEL: test
+  %3 = icmp eq i64 %0, -6067004223159161907
+  br i1 %3, label %4, label %8
+; CHECK-DEC: 18 03 00 00 cd ab cd ab 00 00 00 00 cd ab cd ab r3 = -6067004223159161907 ll
+; CHECK-DEC: 5d 31 07 00 00 00 00 00 if r1 != r3 goto +7
+; CHECK-HEX: 18 03 00 00 cd ab cd ab 00 00 00 00 cd ab cd ab r3 = -0x5432543254325433 ll
+; CHECK-HEX: 5d 31 07 00 00 00 00 00 if r1 != r3 goto +0x7
+
+;