Import LLVM r73954.

Ed Schouten 2009-06-23 14:50:01 +00:00
parent b2f21fb044
commit 0408e1d309
149 changed files with 12272 additions and 1243 deletions


@ -34,7 +34,7 @@ class IVUsersOfOneStride;
class IVStrideUse : public CallbackVH, public ilist_node<IVStrideUse> {
public:
IVStrideUse(IVUsersOfOneStride *parent,
const SCEVHandle &offset,
const SCEV* offset,
Instruction* U, Value *O)
: CallbackVH(U), Parent(parent), Offset(offset),
OperandValToReplace(O),
@ -58,10 +58,10 @@ public:
/// getOffset - Return the offset to add to a theoretical induction
/// variable that starts at zero and counts up by the stride to compute
/// the value for the use. This always has the same type as the stride.
SCEVHandle getOffset() const { return Offset; }
const SCEV* getOffset() const { return Offset; }
/// setOffset - Assign a new offset to this use.
void setOffset(SCEVHandle Val) {
void setOffset(const SCEV* Val) {
Offset = Val;
}
@ -96,7 +96,7 @@ private:
IVUsersOfOneStride *Parent;
/// Offset - The offset to add to the base induction expression.
SCEVHandle Offset;
const SCEV* Offset;
/// OperandValToReplace - The Value of the operand in the user instruction
/// that this IVStrideUse is representing.
@ -158,7 +158,7 @@ public:
/// initial value and the operand that uses the IV.
ilist<IVStrideUse> Users;
void addUser(const SCEVHandle &Offset, Instruction *User, Value *Operand) {
void addUser(const SCEV* Offset, Instruction *User, Value *Operand) {
Users.push_back(new IVStrideUse(this, Offset, User, Operand));
}
};
@ -178,12 +178,12 @@ public:
/// IVUsesByStride - A mapping from the strides in StrideOrder to the
/// uses in IVUses.
std::map<SCEVHandle, IVUsersOfOneStride*> IVUsesByStride;
std::map<const SCEV*, IVUsersOfOneStride*> IVUsesByStride;
/// StrideOrder - An ordering of the keys in IVUsesByStride that is stable:
/// We use this to iterate over the IVUsesByStride collection without being
/// dependent on random ordering of pointers in the process.
SmallVector<SCEVHandle, 16> StrideOrder;
SmallVector<const SCEV*, 16> StrideOrder;
private:
virtual void getAnalysisUsage(AnalysisUsage &AU) const;
@ -203,7 +203,7 @@ public:
/// getReplacementExpr - Return a SCEV expression which computes the
/// value of the OperandValToReplace of the given IVStrideUse.
SCEVHandle getReplacementExpr(const IVStrideUse &U) const;
const SCEV* getReplacementExpr(const IVStrideUse &U) const;
void print(raw_ostream &OS, const Module* = 0) const;
virtual void print(std::ostream &OS, const Module* = 0) const;


@ -78,9 +78,9 @@ public:
private:
ConstantRange compute(Value *V);
ConstantRange getRange(SCEVHandle S, Loop *L, ScalarEvolution &SE);
ConstantRange getRange(const SCEV* S, Loop *L, ScalarEvolution &SE);
ConstantRange getRange(SCEVHandle S, SCEVHandle T, ScalarEvolution &SE);
ConstantRange getRange(const SCEV* S, const SCEV* T, ScalarEvolution &SE);
std::map<Value *, ConstantRange *> Map;
};


@ -32,10 +32,16 @@ namespace llvm {
class APInt;
class ConstantInt;
class Type;
class SCEVHandle;
class ScalarEvolution;
class TargetData;
template<> struct DenseMapInfo<SCEVHandle>;
class SCEVConstant;
class SCEVTruncateExpr;
class SCEVZeroExtendExpr;
class SCEVCommutativeExpr;
class SCEVUDivExpr;
class SCEVSignExtendExpr;
class SCEVAddRecExpr;
class SCEVUnknown;
/// SCEV - This class represents an analyzed expression in the program. These
/// are reference-counted opaque objects that the client is not allowed to
@ -43,25 +49,14 @@ namespace llvm {
///
class SCEV {
const unsigned SCEVType; // The SCEV baseclass this node corresponds to
mutable unsigned RefCount;
friend class SCEVHandle;
friend class DenseMapInfo<SCEVHandle>;
void addRef() const { ++RefCount; }
void dropRef() const {
if (--RefCount == 0)
delete this;
}
const ScalarEvolution* parent;
SCEV(const SCEV &); // DO NOT IMPLEMENT
void operator=(const SCEV &); // DO NOT IMPLEMENT
protected:
virtual ~SCEV();
public:
explicit SCEV(unsigned SCEVTy, const ScalarEvolution* p) :
SCEVType(SCEVTy), RefCount(0), parent(p) {}
explicit SCEV(unsigned SCEVTy) :
SCEVType(SCEVTy) {}
unsigned getSCEVType() const { return SCEVType; }
@ -92,9 +87,9 @@ namespace llvm {
/// the same value, but which uses the concrete value Conc instead of the
/// symbolic value. If this SCEV does not use the symbolic value, it
/// returns itself.
virtual SCEVHandle
replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
const SCEVHandle &Conc,
virtual const SCEV*
replaceSymbolicValuesWithConcrete(const SCEV* Sym,
const SCEV* Conc,
ScalarEvolution &SE) const = 0;
/// dominates - Return true if the elements that make up this SCEV dominate
@ -129,17 +124,16 @@ namespace llvm {
/// None of the standard SCEV operations are valid on this class; it is just a
/// marker.
struct SCEVCouldNotCompute : public SCEV {
SCEVCouldNotCompute(const ScalarEvolution* p);
~SCEVCouldNotCompute();
SCEVCouldNotCompute();
// None of these methods are valid for this object.
virtual bool isLoopInvariant(const Loop *L) const;
virtual const Type *getType() const;
virtual bool hasComputableLoopEvolution(const Loop *L) const;
virtual void print(raw_ostream &OS) const;
virtual SCEVHandle
replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
const SCEVHandle &Conc,
virtual const SCEV*
replaceSymbolicValuesWithConcrete(const SCEV* Sym,
const SCEV* Conc,
ScalarEvolution &SE) const;
virtual bool dominates(BasicBlock *BB, DominatorTree *DT) const {
@ -151,83 +145,6 @@ namespace llvm {
static bool classof(const SCEV *S);
};
/// SCEVHandle - This class is used to maintain the SCEV object's refcounts,
/// freeing the objects when the last reference is dropped.
class SCEVHandle {
const SCEV *S;
SCEVHandle(); // DO NOT IMPLEMENT
public:
SCEVHandle(const SCEV *s) : S(s) {
assert(S && "Cannot create a handle to a null SCEV!");
S->addRef();
}
SCEVHandle(const SCEVHandle &RHS) : S(RHS.S) {
S->addRef();
}
~SCEVHandle() { S->dropRef(); }
operator const SCEV*() const { return S; }
const SCEV &operator*() const { return *S; }
const SCEV *operator->() const { return S; }
bool operator==(const SCEV *RHS) const { return S == RHS; }
bool operator!=(const SCEV *RHS) const { return S != RHS; }
const SCEVHandle &operator=(SCEV *RHS) {
if (S != RHS) {
S->dropRef();
S = RHS;
S->addRef();
}
return *this;
}
const SCEVHandle &operator=(const SCEVHandle &RHS) {
if (S != RHS.S) {
S->dropRef();
S = RHS.S;
S->addRef();
}
return *this;
}
};
template<typename From> struct simplify_type;
template<> struct simplify_type<const SCEVHandle> {
typedef const SCEV* SimpleType;
static SimpleType getSimplifiedValue(const SCEVHandle &Node) {
return Node;
}
};
template<> struct simplify_type<SCEVHandle>
: public simplify_type<const SCEVHandle> {};
// Specialize DenseMapInfo for SCEVHandle so that SCEVHandle may be used
// as a key in DenseMaps.
template<>
struct DenseMapInfo<SCEVHandle> {
static inline SCEVHandle getEmptyKey() {
static SCEVCouldNotCompute Empty(0);
if (Empty.RefCount == 0)
Empty.addRef();
return &Empty;
}
static inline SCEVHandle getTombstoneKey() {
static SCEVCouldNotCompute Tombstone(0);
if (Tombstone.RefCount == 0)
Tombstone.addRef();
return &Tombstone;
}
static unsigned getHashValue(const SCEVHandle &Val) {
return DenseMapInfo<const SCEV *>::getHashValue(Val);
}
static bool isEqual(const SCEVHandle &LHS, const SCEVHandle &RHS) {
return LHS == RHS;
}
static bool isPod() { return false; }
};
/// ScalarEvolution - This class is the main scalar evolution driver. Because
/// client code (intentionally) can't do much with the SCEV objects directly,
/// they must ask this class for services.
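To make the handle-to-pointer migration concrete, here is a minimal sketch of post-change client code, as it might appear inside a pass; L and V are hypothetical stand-ins for a loop and a value inside it, and the rest is the API declared in this header:

    // Ask the ScalarEvolution pass for services; SCEVs are now plain,
    // uniqued pointers with no reference counting to manage.
    ScalarEvolution &SE = getAnalysis<ScalarEvolution>();
    const SCEV *S = SE.getSCEV(V);                  // expression for V
    const SCEV *BTC = SE.getBackedgeTakenCount(L);  // trip-count expression
    if (!isa<SCEVCouldNotCompute>(BTC) && S->isLoopInvariant(L)) {
      // ... use S and BTC freely; ScalarEvolution owns the objects.
    }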
@ -260,11 +177,11 @@ namespace llvm {
/// CouldNotCompute - This SCEV is used to represent unknown trip
/// counts and things.
SCEVHandle CouldNotCompute;
const SCEV* CouldNotCompute;
/// Scalars - This is a cache of the scalars we have analyzed so far.
///
std::map<SCEVCallbackVH, SCEVHandle> Scalars;
std::map<SCEVCallbackVH, const SCEV*> Scalars;
/// BackedgeTakenInfo - Information about the backedge-taken count
/// of a loop. This currently includes an exact count and a maximum count.
@ -272,19 +189,16 @@ namespace llvm {
struct BackedgeTakenInfo {
/// Exact - An expression indicating the exact backedge-taken count of
/// the loop if it is known, or a SCEVCouldNotCompute otherwise.
SCEVHandle Exact;
const SCEV* Exact;
/// Max - An expression indicating the least maximum backedge-taken
/// count of the loop that is known, or a SCEVCouldNotCompute.
SCEVHandle Max;
const SCEV* Max;
/*implicit*/ BackedgeTakenInfo(SCEVHandle exact) :
/*implicit*/ BackedgeTakenInfo(const SCEV* exact) :
Exact(exact), Max(exact) {}
/*implicit*/ BackedgeTakenInfo(const SCEV *exact) :
Exact(exact), Max(exact) {}
BackedgeTakenInfo(SCEVHandle exact, SCEVHandle max) :
BackedgeTakenInfo(const SCEV* exact, const SCEV* max) :
Exact(exact), Max(max) {}
/// hasAnyInfo - Test whether this BackedgeTakenInfo contains any
@ -314,30 +228,30 @@ namespace llvm {
/// createSCEV - We know that there is no SCEV for the specified value.
/// Analyze the expression.
SCEVHandle createSCEV(Value *V);
const SCEV* createSCEV(Value *V);
/// createNodeForPHI - Provide the special handling we need to analyze PHI
/// SCEVs.
SCEVHandle createNodeForPHI(PHINode *PN);
const SCEV* createNodeForPHI(PHINode *PN);
/// createNodeForGEP - Provide the special handling we need to analyze GEP
/// SCEVs.
SCEVHandle createNodeForGEP(User *GEP);
const SCEV* createNodeForGEP(User *GEP);
/// ReplaceSymbolicValueWithConcrete - This looks up the computed SCEV value
/// for the specified instruction and replaces any references to the
/// symbolic value SymName with the specified value. This is used during
/// PHI resolution.
void ReplaceSymbolicValueWithConcrete(Instruction *I,
const SCEVHandle &SymName,
const SCEVHandle &NewVal);
const SCEV* SymName,
const SCEV* NewVal);
/// getBECount - Subtract the end and start values and divide by the step,
/// rounding up, to get the number of times the backedge is executed. Return
/// CouldNotCompute if an intermediate computation overflows.
SCEVHandle getBECount(const SCEVHandle &Start,
const SCEVHandle &End,
const SCEVHandle &Step);
const SCEV* getBECount(const SCEV* Start,
const SCEV* End,
const SCEV* Step);
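The rounding-up division described above is the usual formula BECount = (End - Start + (Step - 1)) /u Step. A sketch of that computation in terms of the public API below (the real getBECount also returns CouldNotCompute when an intermediate step overflows, which this omits):

    const SCEV *MinusOne = SE.getIntegerSCEV(-1, Step->getType());
    const SCEV *Diff = SE.getMinusSCEV(End, Start);          // End - Start
    const SCEV *Rounded = SE.getAddExpr(Diff,
                                        SE.getAddExpr(Step, MinusOne));
    const SCEV *BECount = SE.getUDivExpr(Rounded, Step);     // rounds up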
/// getBackedgeTakenInfo - Return the BackedgeTakenInfo for the given
/// loop, lazily computing new values if the loop hasn't been analyzed
@ -375,7 +289,7 @@ namespace llvm {
/// ComputeLoadConstantCompareBackedgeTakenCount - Given an exit condition
/// of 'icmp op load X, cst', try to see if we can compute the trip count.
SCEVHandle
const SCEV*
ComputeLoadConstantCompareBackedgeTakenCount(LoadInst *LI,
Constant *RHS,
const Loop *L,
@ -386,18 +300,18 @@ namespace llvm {
/// try to evaluate a few iterations of the loop until the exit condition
/// gets a value of ExitWhen (true or false). If we cannot
/// evaluate the trip count of the loop, return CouldNotCompute.
SCEVHandle ComputeBackedgeTakenCountExhaustively(const Loop *L, Value *Cond,
const SCEV* ComputeBackedgeTakenCountExhaustively(const Loop *L, Value *Cond,
bool ExitWhen);
/// HowFarToZero - Return the number of times a backedge comparing the
/// specified value to zero will execute. If not computable, return
/// CouldNotCompute.
SCEVHandle HowFarToZero(const SCEV *V, const Loop *L);
const SCEV* HowFarToZero(const SCEV *V, const Loop *L);
/// HowFarToNonZero - Return the number of times a backedge checking the
/// specified value for nonzero will execute. If not computable, return
/// CouldNotCompute.
SCEVHandle HowFarToNonZero(const SCEV *V, const Loop *L);
const SCEV* HowFarToNonZero(const SCEV *V, const Loop *L);
/// HowManyLessThans - Return the number of times a backedge containing the
/// specified less-than comparison will execute. If not computable, return
@ -449,109 +363,115 @@ namespace llvm {
/// getSCEV - Return a SCEV expression handle for the full generality of the
/// specified expression.
SCEVHandle getSCEV(Value *V);
const SCEV* getSCEV(Value *V);
SCEVHandle getConstant(ConstantInt *V);
SCEVHandle getConstant(const APInt& Val);
SCEVHandle getConstant(const Type *Ty, uint64_t V, bool isSigned = false);
SCEVHandle getTruncateExpr(const SCEVHandle &Op, const Type *Ty);
SCEVHandle getZeroExtendExpr(const SCEVHandle &Op, const Type *Ty);
SCEVHandle getSignExtendExpr(const SCEVHandle &Op, const Type *Ty);
SCEVHandle getAnyExtendExpr(const SCEVHandle &Op, const Type *Ty);
SCEVHandle getAddExpr(SmallVectorImpl<SCEVHandle> &Ops);
SCEVHandle getAddExpr(const SCEVHandle &LHS, const SCEVHandle &RHS) {
SmallVector<SCEVHandle, 2> Ops;
const SCEV* getConstant(ConstantInt *V);
const SCEV* getConstant(const APInt& Val);
const SCEV* getConstant(const Type *Ty, uint64_t V, bool isSigned = false);
const SCEV* getTruncateExpr(const SCEV* Op, const Type *Ty);
const SCEV* getZeroExtendExpr(const SCEV* Op, const Type *Ty);
const SCEV* getSignExtendExpr(const SCEV* Op, const Type *Ty);
const SCEV* getAnyExtendExpr(const SCEV* Op, const Type *Ty);
const SCEV* getAddExpr(SmallVectorImpl<const SCEV*> &Ops);
const SCEV* getAddExpr(const SCEV* LHS, const SCEV* RHS) {
SmallVector<const SCEV*, 2> Ops;
Ops.push_back(LHS);
Ops.push_back(RHS);
return getAddExpr(Ops);
}
SCEVHandle getAddExpr(const SCEVHandle &Op0, const SCEVHandle &Op1,
const SCEVHandle &Op2) {
SmallVector<SCEVHandle, 3> Ops;
const SCEV* getAddExpr(const SCEV* Op0, const SCEV* Op1,
const SCEV* Op2) {
SmallVector<const SCEV*, 3> Ops;
Ops.push_back(Op0);
Ops.push_back(Op1);
Ops.push_back(Op2);
return getAddExpr(Ops);
}
SCEVHandle getMulExpr(SmallVectorImpl<SCEVHandle> &Ops);
SCEVHandle getMulExpr(const SCEVHandle &LHS, const SCEVHandle &RHS) {
SmallVector<SCEVHandle, 2> Ops;
const SCEV* getMulExpr(SmallVectorImpl<const SCEV*> &Ops);
const SCEV* getMulExpr(const SCEV* LHS, const SCEV* RHS) {
SmallVector<const SCEV*, 2> Ops;
Ops.push_back(LHS);
Ops.push_back(RHS);
return getMulExpr(Ops);
}
SCEVHandle getUDivExpr(const SCEVHandle &LHS, const SCEVHandle &RHS);
SCEVHandle getAddRecExpr(const SCEVHandle &Start, const SCEVHandle &Step,
const SCEV* getUDivExpr(const SCEV* LHS, const SCEV* RHS);
const SCEV* getAddRecExpr(const SCEV* Start, const SCEV* Step,
const Loop *L);
SCEVHandle getAddRecExpr(SmallVectorImpl<SCEVHandle> &Operands,
const SCEV* getAddRecExpr(SmallVectorImpl<const SCEV*> &Operands,
const Loop *L);
SCEVHandle getAddRecExpr(const SmallVectorImpl<SCEVHandle> &Operands,
const SCEV* getAddRecExpr(const SmallVectorImpl<const SCEV*> &Operands,
const Loop *L) {
SmallVector<SCEVHandle, 4> NewOp(Operands.begin(), Operands.end());
SmallVector<const SCEV*, 4> NewOp(Operands.begin(), Operands.end());
return getAddRecExpr(NewOp, L);
}
SCEVHandle getSMaxExpr(const SCEVHandle &LHS, const SCEVHandle &RHS);
SCEVHandle getSMaxExpr(SmallVectorImpl<SCEVHandle> &Operands);
SCEVHandle getUMaxExpr(const SCEVHandle &LHS, const SCEVHandle &RHS);
SCEVHandle getUMaxExpr(SmallVectorImpl<SCEVHandle> &Operands);
SCEVHandle getSMinExpr(const SCEVHandle &LHS, const SCEVHandle &RHS);
SCEVHandle getUMinExpr(const SCEVHandle &LHS, const SCEVHandle &RHS);
SCEVHandle getUnknown(Value *V);
SCEVHandle getCouldNotCompute();
const SCEV* getSMaxExpr(const SCEV* LHS, const SCEV* RHS);
const SCEV* getSMaxExpr(SmallVectorImpl<const SCEV*> &Operands);
const SCEV* getUMaxExpr(const SCEV* LHS, const SCEV* RHS);
const SCEV* getUMaxExpr(SmallVectorImpl<const SCEV*> &Operands);
const SCEV* getSMinExpr(const SCEV* LHS, const SCEV* RHS);
const SCEV* getUMinExpr(const SCEV* LHS, const SCEV* RHS);
const SCEV* getUnknown(Value *V);
const SCEV* getCouldNotCompute();
/// getNegativeSCEV - Return the SCEV object corresponding to -V.
///
SCEVHandle getNegativeSCEV(const SCEVHandle &V);
const SCEV* getNegativeSCEV(const SCEV* V);
/// getNotSCEV - Return the SCEV object corresponding to ~V. In two's
/// complement, ~V == -1 - V, so this can be built from getMinusSCEV and an
/// all-ones constant.
///
SCEVHandle getNotSCEV(const SCEVHandle &V);
const SCEV* getNotSCEV(const SCEV* V);
/// getMinusSCEV - Return LHS-RHS.
///
SCEVHandle getMinusSCEV(const SCEVHandle &LHS,
const SCEVHandle &RHS);
const SCEV* getMinusSCEV(const SCEV* LHS,
const SCEV* RHS);
/// getTruncateOrZeroExtend - Return a SCEV corresponding to a conversion
/// of the input value to the specified type. If the type must be
/// extended, it is zero extended.
SCEVHandle getTruncateOrZeroExtend(const SCEVHandle &V, const Type *Ty);
const SCEV* getTruncateOrZeroExtend(const SCEV* V, const Type *Ty);
/// getTruncateOrSignExtend - Return a SCEV corresponding to a conversion
/// of the input value to the specified type. If the type must be
/// extended, it is sign extended.
SCEVHandle getTruncateOrSignExtend(const SCEVHandle &V, const Type *Ty);
const SCEV* getTruncateOrSignExtend(const SCEV* V, const Type *Ty);
/// getNoopOrZeroExtend - Return a SCEV corresponding to a conversion of
/// the input value to the specified type. If the type must be extended,
/// it is zero extended. The conversion must not be narrowing.
SCEVHandle getNoopOrZeroExtend(const SCEVHandle &V, const Type *Ty);
const SCEV* getNoopOrZeroExtend(const SCEV* V, const Type *Ty);
/// getNoopOrSignExtend - Return a SCEV corresponding to a conversion of
/// the input value to the specified type. If the type must be extended,
/// it is sign extended. The conversion must not be narrowing.
SCEVHandle getNoopOrSignExtend(const SCEVHandle &V, const Type *Ty);
const SCEV* getNoopOrSignExtend(const SCEV* V, const Type *Ty);
/// getNoopOrAnyExtend - Return a SCEV corresponding to a conversion of
/// the input value to the specified type. If the type must be extended,
/// it is extended with unspecified bits. The conversion must not be
/// narrowing.
SCEVHandle getNoopOrAnyExtend(const SCEVHandle &V, const Type *Ty);
const SCEV* getNoopOrAnyExtend(const SCEV* V, const Type *Ty);
/// getTruncateOrNoop - Return a SCEV corresponding to a conversion of the
/// input value to the specified type. The conversion must not be
/// widening.
SCEVHandle getTruncateOrNoop(const SCEVHandle &V, const Type *Ty);
const SCEV* getTruncateOrNoop(const SCEV* V, const Type *Ty);
/// getIntegerSCEV - Given an integer or FP type, create a constant for the
/// specified signed integer value and return a SCEV for the constant.
SCEVHandle getIntegerSCEV(int Val, const Type *Ty);
const SCEV* getIntegerSCEV(int Val, const Type *Ty);
/// getUMaxFromMismatchedTypes - Promote the operands to the wider of
/// the types using zero-extension, and then perform a umax operation
/// with them.
SCEVHandle getUMaxFromMismatchedTypes(const SCEVHandle &LHS,
const SCEVHandle &RHS);
const SCEV* getUMaxFromMismatchedTypes(const SCEV* LHS,
const SCEV* RHS);
/// getUMinFromMismatchedTypes - Promote the operands to the wider of
/// the types using zero-extension, and then perform a umin operation
/// with them.
const SCEV* getUMinFromMismatchedTypes(const SCEV* LHS,
const SCEV* RHS);
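A short illustration of the promotion rule, with A (an i32 expression) and B (an i64 expression) as hypothetical operands:

    // The narrower operand is zero-extended first, so the result is i64.
    const SCEV *M = SE.getUMaxFromMismatchedTypes(A, B);
    // Conceptually the same as:
    const SCEV *M2 = SE.getUMaxExpr(SE.getZeroExtendExpr(A, B->getType()), B);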
/// hasSCEV - Return true if the SCEV for this value has already been
/// computed.
@ -559,7 +479,7 @@ namespace llvm {
/// setSCEV - Insert the specified SCEV into the map of current SCEVs for
/// the specified value.
void setSCEV(Value *V, const SCEVHandle &H);
void setSCEV(Value *V, const SCEV* H);
/// getSCEVAtScope - Return a SCEV expression handle for the specified value
/// at the specified scope in the program. The L value specifies a loop
@ -571,11 +491,11 @@ namespace llvm {
///
/// In the case that a relevant loop exit value cannot be computed, the
/// original value V is returned.
SCEVHandle getSCEVAtScope(const SCEV *S, const Loop *L);
const SCEV* getSCEVAtScope(const SCEV *S, const Loop *L);
/// getSCEVAtScope - This is a convenience function which does
/// getSCEVAtScope(getSCEV(V), L).
SCEVHandle getSCEVAtScope(Value *V, const Loop *L);
const SCEV* getSCEVAtScope(Value *V, const Loop *L);
/// isLoopGuardedByCond - Test whether entry to the loop is protected by
/// a conditional between LHS and RHS. This is used to help avoid max
@ -594,12 +514,12 @@ namespace llvm {
/// loop-invariant backedge-taken count (see
/// hasLoopInvariantBackedgeTakenCount).
///
SCEVHandle getBackedgeTakenCount(const Loop *L);
const SCEV* getBackedgeTakenCount(const Loop *L);
/// getMaxBackedgeTakenCount - Similar to getBackedgeTakenCount, except
/// return the least SCEV value that is known never to be less than the
/// actual backedge taken count.
SCEVHandle getMaxBackedgeTakenCount(const Loop *L);
const SCEV* getMaxBackedgeTakenCount(const Loop *L);
/// hasLoopInvariantBackedgeTakenCount - Return true if the specified loop
/// has an analyzable loop-invariant backedge-taken count.
@ -615,15 +535,15 @@ namespace llvm {
/// guaranteed to end in (at every loop iteration). It is, at the same time,
/// the minimum number of times S is divisible by 2. For example, given {4,+,8}
/// it returns 2. If S is guaranteed to be 0, it returns the bitwidth of S.
uint32_t GetMinTrailingZeros(const SCEVHandle &S);
uint32_t GetMinTrailingZeros(const SCEV* S);
/// GetMinLeadingZeros - Determine the minimum number of zero bits that S is
/// guaranteed to begin with (at every loop iteration).
uint32_t GetMinLeadingZeros(const SCEVHandle &S);
uint32_t GetMinLeadingZeros(const SCEV* S);
/// GetMinSignBits - Determine the minimum number of sign bits that S is
/// guaranteed to begin with.
uint32_t GetMinSignBits(const SCEVHandle &S);
uint32_t GetMinSignBits(const SCEV* S);
virtual bool runOnFunction(Function &F);
virtual void releaseMemory();
@ -633,6 +553,23 @@ namespace llvm {
void print(std::ostream *OS, const Module* M = 0) const {
if (OS) print(*OS, M);
}
private:
// Uniquing tables.
std::map<ConstantInt*, SCEVConstant*> SCEVConstants;
std::map<std::pair<const SCEV*, const Type*>,
SCEVTruncateExpr*> SCEVTruncates;
std::map<std::pair<const SCEV*, const Type*>,
SCEVZeroExtendExpr*> SCEVZeroExtends;
std::map<std::pair<unsigned, std::vector<const SCEV*> >,
SCEVCommutativeExpr*> SCEVCommExprs;
std::map<std::pair<const SCEV*, const SCEV*>,
SCEVUDivExpr*> SCEVUDivs;
std::map<std::pair<const SCEV*, const Type*>,
SCEVSignExtendExpr*> SCEVSignExtends;
std::map<std::pair<const Loop *, std::vector<const SCEV*> >,
SCEVAddRecExpr*> SCEVAddRecExprs;
std::map<Value*, SCEVUnknown*> SCEVUnknowns;
};
}
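Because these tables unique every node, structurally identical expressions now share a single object, which is what lets the rest of this patch compare SCEVs with plain pointer equality (the "if (H == Op) return this;" pattern in the cast classes below). A sketch of the consequence, with A and B as hypothetical operands:

    const SCEV *S1 = SE.getAddExpr(A, B);
    const SCEV *S2 = SE.getAddExpr(A, B);
    assert(S1 == S2 && "uniqued: equal operands yield the same object");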


@ -28,7 +28,7 @@ namespace llvm {
/// memory.
struct SCEVExpander : public SCEVVisitor<SCEVExpander, Value*> {
ScalarEvolution &SE;
std::map<SCEVHandle, AssertingVH<Value> > InsertedExpressions;
std::map<const SCEV*, AssertingVH<Value> > InsertedExpressions;
std::set<Value*> InsertedValues;
BasicBlock::iterator InsertPt;
@ -77,12 +77,12 @@ namespace llvm {
/// expression into the program. The code is inserted at the SCEVExpander's
/// current insertion point. If a type is specified, the
/// result will be expanded to have that type, with a cast if necessary.
Value *expandCodeFor(SCEVHandle SH, const Type *Ty = 0);
Value *expandCodeFor(const SCEV* SH, const Type *Ty = 0);
/// expandCodeFor - Insert code to directly compute the specified SCEV
/// expression into the program. The code is inserted into the
/// specified block.
Value *expandCodeFor(SCEVHandle SH, const Type *Ty,
Value *expandCodeFor(const SCEV* SH, const Type *Ty,
BasicBlock::iterator IP) {
setInsertionPoint(IP);
return expandCodeFor(SH, Ty);
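A minimal usage sketch; the constructor shape here is an assumption (built from the ScalarEvolution instance it expands against), and S, Ty, and InsertPt are hypothetical:

    SCEVExpander Expander(SE);  // assumed: constructed from a ScalarEvolution&
    // Emit IR that computes S as a value of type Ty, inserted at InsertPt.
    Value *V = Expander.expandCodeFor(S, Ty, InsertPt);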
@ -105,7 +105,8 @@ namespace llvm {
private:
/// expandAddToGEP - Expand a SCEVAddExpr with a pointer type into a GEP
/// instead of using ptrtoint+arithmetic+inttoptr.
Value *expandAddToGEP(const SCEVHandle *op_begin, const SCEVHandle *op_end,
Value *expandAddToGEP(const SCEV* const *op_begin,
const SCEV* const *op_end,
const PointerType *PTy, const Type *Ty, Value *V);
Value *expand(const SCEV *S);


@ -36,10 +36,8 @@ namespace llvm {
friend class ScalarEvolution;
ConstantInt *V;
explicit SCEVConstant(ConstantInt *v, const ScalarEvolution* p) :
SCEV(scConstant, p), V(v) {}
virtual ~SCEVConstant();
explicit SCEVConstant(ConstantInt *v) :
SCEV(scConstant), V(v) {}
public:
ConstantInt *getValue() const { return V; }
@ -53,8 +51,8 @@ namespace llvm {
virtual const Type *getType() const;
SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
const SCEVHandle &Conc,
const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
const SCEV* Conc,
ScalarEvolution &SE) const {
return this;
}
@ -77,15 +75,13 @@ namespace llvm {
///
class SCEVCastExpr : public SCEV {
protected:
SCEVHandle Op;
const SCEV* Op;
const Type *Ty;
SCEVCastExpr(unsigned SCEVTy, const SCEVHandle &op, const Type *ty,
const ScalarEvolution* p);
virtual ~SCEVCastExpr();
SCEVCastExpr(unsigned SCEVTy, const SCEV* op, const Type *ty);
public:
const SCEVHandle &getOperand() const { return Op; }
const SCEV* getOperand() const { return Op; }
virtual const Type *getType() const { return Ty; }
virtual bool isLoopInvariant(const Loop *L) const {
@ -114,15 +110,13 @@ namespace llvm {
class SCEVTruncateExpr : public SCEVCastExpr {
friend class ScalarEvolution;
SCEVTruncateExpr(const SCEVHandle &op, const Type *ty,
const ScalarEvolution* p);
virtual ~SCEVTruncateExpr();
SCEVTruncateExpr(const SCEV* op, const Type *ty);
public:
SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
const SCEVHandle &Conc,
const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
const SCEV* Conc,
ScalarEvolution &SE) const {
SCEVHandle H = Op->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
const SCEV* H = Op->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
if (H == Op)
return this;
return SE.getTruncateExpr(H, Ty);
@ -144,15 +138,13 @@ namespace llvm {
class SCEVZeroExtendExpr : public SCEVCastExpr {
friend class ScalarEvolution;
SCEVZeroExtendExpr(const SCEVHandle &op, const Type *ty,
const ScalarEvolution* p);
virtual ~SCEVZeroExtendExpr();
SCEVZeroExtendExpr(const SCEV* op, const Type *ty);
public:
SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
const SCEVHandle &Conc,
const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
const SCEV* Conc,
ScalarEvolution &SE) const {
SCEVHandle H = Op->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
const SCEV* H = Op->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
if (H == Op)
return this;
return SE.getZeroExtendExpr(H, Ty);
@ -174,15 +166,13 @@ namespace llvm {
class SCEVSignExtendExpr : public SCEVCastExpr {
friend class ScalarEvolution;
SCEVSignExtendExpr(const SCEVHandle &op, const Type *ty,
const ScalarEvolution* p);
virtual ~SCEVSignExtendExpr();
SCEVSignExtendExpr(const SCEV* op, const Type *ty);
public:
SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
const SCEVHandle &Conc,
const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
const SCEV* Conc,
ScalarEvolution &SE) const {
SCEVHandle H = Op->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
const SCEV* H = Op->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
if (H == Op)
return this;
return SE.getSignExtendExpr(H, Ty);
@ -204,22 +194,20 @@ namespace llvm {
///
class SCEVNAryExpr : public SCEV {
protected:
SmallVector<SCEVHandle, 8> Operands;
SmallVector<const SCEV*, 8> Operands;
SCEVNAryExpr(enum SCEVTypes T, const SmallVectorImpl<SCEVHandle> &ops,
const ScalarEvolution* p)
: SCEV(T, p), Operands(ops.begin(), ops.end()) {}
virtual ~SCEVNAryExpr() {}
SCEVNAryExpr(enum SCEVTypes T, const SmallVectorImpl<const SCEV*> &ops)
: SCEV(T), Operands(ops.begin(), ops.end()) {}
public:
unsigned getNumOperands() const { return (unsigned)Operands.size(); }
const SCEVHandle &getOperand(unsigned i) const {
const SCEV* getOperand(unsigned i) const {
assert(i < Operands.size() && "Operand index out of range!");
return Operands[i];
}
const SmallVectorImpl<SCEVHandle> &getOperands() const { return Operands; }
typedef SmallVectorImpl<SCEVHandle>::const_iterator op_iterator;
const SmallVectorImpl<const SCEV*> &getOperands() const { return Operands; }
typedef SmallVectorImpl<const SCEV*>::const_iterator op_iterator;
op_iterator op_begin() const { return Operands.begin(); }
op_iterator op_end() const { return Operands.end(); }
@ -266,14 +254,12 @@ namespace llvm {
class SCEVCommutativeExpr : public SCEVNAryExpr {
protected:
SCEVCommutativeExpr(enum SCEVTypes T,
const SmallVectorImpl<SCEVHandle> &ops,
const ScalarEvolution* p)
: SCEVNAryExpr(T, ops, p) {}
~SCEVCommutativeExpr();
const SmallVectorImpl<const SCEV*> &ops)
: SCEVNAryExpr(T, ops) {}
public:
SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
const SCEVHandle &Conc,
const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
const SCEV* Conc,
ScalarEvolution &SE) const;
virtual const char *getOperationStr() const = 0;
@ -297,9 +283,8 @@ namespace llvm {
class SCEVAddExpr : public SCEVCommutativeExpr {
friend class ScalarEvolution;
explicit SCEVAddExpr(const SmallVectorImpl<SCEVHandle> &ops,
const ScalarEvolution* p)
: SCEVCommutativeExpr(scAddExpr, ops, p) {
explicit SCEVAddExpr(const SmallVectorImpl<const SCEV*> &ops)
: SCEVCommutativeExpr(scAddExpr, ops) {
}
public:
@ -318,9 +303,8 @@ namespace llvm {
class SCEVMulExpr : public SCEVCommutativeExpr {
friend class ScalarEvolution;
explicit SCEVMulExpr(const SmallVectorImpl<SCEVHandle> &ops,
const ScalarEvolution* p)
: SCEVCommutativeExpr(scMulExpr, ops, p) {
explicit SCEVMulExpr(const SmallVectorImpl<const SCEV*> &ops)
: SCEVCommutativeExpr(scMulExpr, ops) {
}
public:
@ -340,15 +324,14 @@ namespace llvm {
class SCEVUDivExpr : public SCEV {
friend class ScalarEvolution;
SCEVHandle LHS, RHS;
SCEVUDivExpr(const SCEVHandle &lhs, const SCEVHandle &rhs,
const ScalarEvolution* p)
: SCEV(scUDivExpr, p), LHS(lhs), RHS(rhs) {}
const SCEV* LHS;
const SCEV* RHS;
SCEVUDivExpr(const SCEV* lhs, const SCEV* rhs)
: SCEV(scUDivExpr), LHS(lhs), RHS(rhs) {}
virtual ~SCEVUDivExpr();
public:
const SCEVHandle &getLHS() const { return LHS; }
const SCEVHandle &getRHS() const { return RHS; }
const SCEV* getLHS() const { return LHS; }
const SCEV* getRHS() const { return RHS; }
virtual bool isLoopInvariant(const Loop *L) const {
return LHS->isLoopInvariant(L) && RHS->isLoopInvariant(L);
@ -359,11 +342,11 @@ namespace llvm {
RHS->hasComputableLoopEvolution(L);
}
SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
const SCEVHandle &Conc,
const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
const SCEV* Conc,
ScalarEvolution &SE) const {
SCEVHandle L = LHS->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
SCEVHandle R = RHS->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
const SCEV* L = LHS->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
const SCEV* R = RHS->replaceSymbolicValuesWithConcrete(Sym, Conc, SE);
if (L == LHS && R == RHS)
return this;
else
@ -398,25 +381,23 @@ namespace llvm {
const Loop *L;
SCEVAddRecExpr(const SmallVectorImpl<SCEVHandle> &ops, const Loop *l,
const ScalarEvolution* p)
: SCEVNAryExpr(scAddRecExpr, ops, p), L(l) {
SCEVAddRecExpr(const SmallVectorImpl<const SCEV*> &ops, const Loop *l)
: SCEVNAryExpr(scAddRecExpr, ops), L(l) {
for (size_t i = 0, e = Operands.size(); i != e; ++i)
assert(Operands[i]->isLoopInvariant(l) &&
"Operands of AddRec must be loop-invariant!");
}
~SCEVAddRecExpr();
public:
const SCEVHandle &getStart() const { return Operands[0]; }
const SCEV* getStart() const { return Operands[0]; }
const Loop *getLoop() const { return L; }
/// getStepRecurrence - This method constructs and returns the recurrence
/// indicating how much this expression steps by. If this is a polynomial
/// of degree N, it returns a chrec of degree N-1.
SCEVHandle getStepRecurrence(ScalarEvolution &SE) const {
const SCEV* getStepRecurrence(ScalarEvolution &SE) const {
if (isAffine()) return getOperand(1);
return SE.getAddRecExpr(SmallVector<SCEVHandle, 3>(op_begin()+1,op_end()),
return SE.getAddRecExpr(SmallVector<const SCEV*, 3>(op_begin()+1,op_end()),
getLoop());
}
@ -444,7 +425,7 @@ namespace llvm {
/// evaluateAtIteration - Return the value of this chain of recurrences at
/// the specified iteration number.
SCEVHandle evaluateAtIteration(SCEVHandle It, ScalarEvolution &SE) const;
const SCEV* evaluateAtIteration(const SCEV* It, ScalarEvolution &SE) const;
/// getNumIterationsInRange - Return the number of iterations of this loop
/// that produce values in the specified constant range. Another way of
@ -452,11 +433,11 @@ namespace llvm {
/// value is not in the condition, thus computing the exit count. If the
/// iteration count can't be computed, an instance of SCEVCouldNotCompute is
/// returned.
SCEVHandle getNumIterationsInRange(ConstantRange Range,
const SCEV* getNumIterationsInRange(ConstantRange Range,
ScalarEvolution &SE) const;
SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
const SCEVHandle &Conc,
const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
const SCEV* Conc,
ScalarEvolution &SE) const;
virtual void print(raw_ostream &OS) const;
@ -475,9 +456,8 @@ namespace llvm {
class SCEVSMaxExpr : public SCEVCommutativeExpr {
friend class ScalarEvolution;
explicit SCEVSMaxExpr(const SmallVectorImpl<SCEVHandle> &ops,
const ScalarEvolution* p)
: SCEVCommutativeExpr(scSMaxExpr, ops, p) {
explicit SCEVSMaxExpr(const SmallVectorImpl<const SCEV*> &ops)
: SCEVCommutativeExpr(scSMaxExpr, ops) {
}
public:
@ -497,9 +477,8 @@ namespace llvm {
class SCEVUMaxExpr : public SCEVCommutativeExpr {
friend class ScalarEvolution;
explicit SCEVUMaxExpr(const SmallVectorImpl<SCEVHandle> &ops,
const ScalarEvolution* p)
: SCEVCommutativeExpr(scUMaxExpr, ops, p) {
explicit SCEVUMaxExpr(const SmallVectorImpl<const SCEV*> &ops)
: SCEVCommutativeExpr(scUMaxExpr, ops) {
}
public:
@ -522,11 +501,9 @@ namespace llvm {
friend class ScalarEvolution;
Value *V;
explicit SCEVUnknown(Value *v, const ScalarEvolution* p) :
SCEV(scUnknown, p), V(v) {}
protected:
~SCEVUnknown();
explicit SCEVUnknown(Value *v) :
SCEV(scUnknown), V(v) {}
public:
Value *getValue() const { return V; }
@ -535,8 +512,8 @@ namespace llvm {
return false; // not computable
}
SCEVHandle replaceSymbolicValuesWithConcrete(const SCEVHandle &Sym,
const SCEVHandle &Conc,
const SCEV* replaceSymbolicValuesWithConcrete(const SCEV* Sym,
const SCEV* Conc,
ScalarEvolution &SE) const {
if (&*Sym == this) return Conc;
return this;


@ -61,6 +61,11 @@ public:
return Relocations;
}
/// hasRelocations - Return true if 'Relocations' is not empty
bool hasRelocations() const {
return !Relocations.empty();
}
/// emitByte - This callback is invoked when a byte needs to be
/// written to the data stream.
inline void emitByte(uint8_t B) {
@ -317,6 +322,7 @@ public:
void addRelocation(const MachineRelocation& relocation) {
Relocations.push_back(relocation);
}
};
} // end namespace llvm


@ -116,6 +116,7 @@ def llvm_v2i64_ty : LLVMType<v2i64>; // 2 x i64
def llvm_v2i32_ty : LLVMType<v2i32>; // 2 x i32
def llvm_v1i64_ty : LLVMType<v1i64>; // 1 x i64
def llvm_v4i32_ty : LLVMType<v4i32>; // 4 x i32
def llvm_v2f32_ty : LLVMType<v2f32>; // 2 x float
def llvm_v4f32_ty : LLVMType<v4f32>; // 4 x float
def llvm_v2f64_ty : LLVMType<v2f64>; // 2 x double


@ -19,3 +19,298 @@ let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
def int_arm_thread_pointer : GCCBuiltin<"__builtin_thread_pointer">,
Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
// Advanced SIMD (NEON)
let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
// The following classes do not correspond directly to GCC builtins.
class Neon_1Arg_Intrinsic
: Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
class Neon_1Arg_Float_Intrinsic
: Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
class Neon_1Arg_Narrow_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMExtendedElementVectorType<0>], [IntrNoMem]>;
class Neon_1Arg_Long_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMTruncatedElementVectorType<0>], [IntrNoMem]>;
class Neon_2Arg_Intrinsic
: Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
class Neon_2Arg_Float_Intrinsic
: Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
class Neon_2Arg_Narrow_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMExtendedElementVectorType<0>,
LLVMExtendedElementVectorType<0>],
[IntrNoMem]>;
class Neon_2Arg_Long_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMTruncatedElementVectorType<0>,
LLVMTruncatedElementVectorType<0>],
[IntrNoMem]>;
class Neon_2Arg_Wide_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, LLVMTruncatedElementVectorType<0>],
[IntrNoMem]>;
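// For example (see the definitions below): vaddhn is a
// Neon_2Arg_Narrow_Intrinsic, so its operands have elements twice as wide
// as its result (v4i32 + v4i32 -> v4i16); vaddls is a
// Neon_2Arg_Long_Intrinsic, so its result elements are twice as wide as its
// operands' (v4i16 + v4i16 -> v4i32); and vaddws is a
// Neon_2Arg_Wide_Intrinsic, which mixes the two (v4i32 + v4i16 -> v4i32).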
class Neon_3Arg_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
class Neon_3Arg_Long_Intrinsic
: Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>,
LLVMTruncatedElementVectorType<0>,
LLVMTruncatedElementVectorType<0>],
[IntrNoMem]>;
class Neon_CvtFxToFP_Intrinsic
: Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
class Neon_CvtFPToFx_Intrinsic
: Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>;
}
// Arithmetic ops
let Properties = [IntrNoMem, Commutative] in {
// Vector Add.
def int_arm_neon_vhadds : Neon_2Arg_Intrinsic;
def int_arm_neon_vhaddu : Neon_2Arg_Intrinsic;
def int_arm_neon_vrhadds : Neon_2Arg_Intrinsic;
def int_arm_neon_vrhaddu : Neon_2Arg_Intrinsic;
def int_arm_neon_vqadds : Neon_2Arg_Intrinsic;
def int_arm_neon_vqaddu : Neon_2Arg_Intrinsic;
def int_arm_neon_vaddhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vraddhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vaddls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vaddlu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vaddws : Neon_2Arg_Wide_Intrinsic;
def int_arm_neon_vaddwu : Neon_2Arg_Wide_Intrinsic;
// Vector Multiply.
def int_arm_neon_vmulp : Neon_2Arg_Intrinsic;
def int_arm_neon_vqdmulh : Neon_2Arg_Intrinsic;
def int_arm_neon_vqrdmulh : Neon_2Arg_Intrinsic;
def int_arm_neon_vmulls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vmullu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vmullp : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vqdmull : Neon_2Arg_Long_Intrinsic;
// Vector Multiply and Accumulate/Subtract.
def int_arm_neon_vmlals : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vmlalu : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vmlsls : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vmlslu : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vqdmlal : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic;
// Vector Maximum.
def int_arm_neon_vmaxs : Neon_2Arg_Intrinsic;
def int_arm_neon_vmaxu : Neon_2Arg_Intrinsic;
def int_arm_neon_vmaxf : Neon_2Arg_Float_Intrinsic;
// Vector Minimum.
def int_arm_neon_vmins : Neon_2Arg_Intrinsic;
def int_arm_neon_vminu : Neon_2Arg_Intrinsic;
def int_arm_neon_vminf : Neon_2Arg_Float_Intrinsic;
// Vector Reciprocal Step.
def int_arm_neon_vrecps : Neon_2Arg_Float_Intrinsic;
// Vector Reciprocal Square Root Step.
def int_arm_neon_vrsqrts : Neon_2Arg_Float_Intrinsic;
}
// Vector Subtract.
def int_arm_neon_vhsubs : Neon_2Arg_Intrinsic;
def int_arm_neon_vhsubu : Neon_2Arg_Intrinsic;
def int_arm_neon_vqsubs : Neon_2Arg_Intrinsic;
def int_arm_neon_vqsubu : Neon_2Arg_Intrinsic;
def int_arm_neon_vsubhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vsubls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vsublu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vsubws : Neon_2Arg_Wide_Intrinsic;
def int_arm_neon_vsubwu : Neon_2Arg_Wide_Intrinsic;
// Vector Absolute Compare.
let TargetPrefix = "arm" in {
def int_arm_neon_vacged : Intrinsic<[llvm_v2i32_ty],
[llvm_v2f32_ty, llvm_v2f32_ty],
[IntrNoMem]>;
def int_arm_neon_vacgeq : Intrinsic<[llvm_v4i32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
def int_arm_neon_vacgtd : Intrinsic<[llvm_v2i32_ty],
[llvm_v2f32_ty, llvm_v2f32_ty],
[IntrNoMem]>;
def int_arm_neon_vacgtq : Intrinsic<[llvm_v4i32_ty],
[llvm_v4f32_ty, llvm_v4f32_ty],
[IntrNoMem]>;
}
// Vector Absolute Differences.
def int_arm_neon_vabds : Neon_2Arg_Intrinsic;
def int_arm_neon_vabdu : Neon_2Arg_Intrinsic;
def int_arm_neon_vabdf : Neon_2Arg_Float_Intrinsic;
def int_arm_neon_vabdls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vabdlu : Neon_2Arg_Long_Intrinsic;
// Vector Absolute Difference and Accumulate.
def int_arm_neon_vabas : Neon_3Arg_Intrinsic;
def int_arm_neon_vabau : Neon_3Arg_Intrinsic;
def int_arm_neon_vabals : Neon_3Arg_Long_Intrinsic;
def int_arm_neon_vabalu : Neon_3Arg_Long_Intrinsic;
// Vector Pairwise Add.
def int_arm_neon_vpaddi : Neon_2Arg_Intrinsic;
def int_arm_neon_vpaddf : Neon_2Arg_Float_Intrinsic;
// Vector Pairwise Add Long.
// Note: This is different from the other "long" NEON intrinsics because
// the result vector has half as many elements as the source vector.
// The source and destination vector types must be specified separately.
let TargetPrefix = "arm" in {
def int_arm_neon_vpaddls : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty],
[IntrNoMem]>;
def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty],
[IntrNoMem]>;
}
// Vector Pairwise Add and Accumulate Long.
// Note: This is similar to vpaddl but the destination vector also appears
// as the first argument.
let TargetPrefix = "arm" in {
def int_arm_neon_vpadals : Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, llvm_anyint_ty],
[IntrNoMem]>;
def int_arm_neon_vpadalu : Intrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, llvm_anyint_ty],
[IntrNoMem]>;
}
// Vector Pairwise Maximum and Minimum.
def int_arm_neon_vpmaxs : Neon_2Arg_Intrinsic;
def int_arm_neon_vpmaxu : Neon_2Arg_Intrinsic;
def int_arm_neon_vpmaxf : Neon_2Arg_Float_Intrinsic;
def int_arm_neon_vpmins : Neon_2Arg_Intrinsic;
def int_arm_neon_vpminu : Neon_2Arg_Intrinsic;
def int_arm_neon_vpminf : Neon_2Arg_Float_Intrinsic;
// Vector Shifts:
//
// The various saturating and rounding vector shift operations need to be
// represented by intrinsics in LLVM, and even the basic VSHL variable shift
// operation cannot be safely translated to LLVM's shift operators. VSHL can
// be used for both left and right shifts, or even combinations of the two,
// depending on the signs of the shift amounts. It also has well-defined
// behavior for shift amounts that LLVM leaves undefined. Only basic shifts
// by constants can be represented with LLVM's shift operators.
//
// The shift counts for these intrinsics are always vectors, even for constant
// shifts, where the constant is replicated. For consistency with VSHL (and
// other variable shift instructions), left shifts have positive shift counts
// and right shifts have negative shift counts. This convention is also used
// for constant right shift intrinsics, and to help preserve sanity, the
// intrinsic names use "shift" instead of either "shl" or "shr". Where
// applicable, signed and unsigned versions of the intrinsics are
// distinguished with "s" and "u" suffixes. A few NEON shift instructions,
// such as VQSHLU, take signed operands but produce unsigned results; these
// use a "su" suffix.
// Vector Shift.
def int_arm_neon_vshifts : Neon_2Arg_Intrinsic;
def int_arm_neon_vshiftu : Neon_2Arg_Intrinsic;
def int_arm_neon_vshiftls : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vshiftlu : Neon_2Arg_Long_Intrinsic;
def int_arm_neon_vshiftn : Neon_2Arg_Narrow_Intrinsic;
// Vector Rounding Shift.
def int_arm_neon_vrshifts : Neon_2Arg_Intrinsic;
def int_arm_neon_vrshiftu : Neon_2Arg_Intrinsic;
def int_arm_neon_vrshiftn : Neon_2Arg_Narrow_Intrinsic;
// Vector Saturating Shift.
def int_arm_neon_vqshifts : Neon_2Arg_Intrinsic;
def int_arm_neon_vqshiftu : Neon_2Arg_Intrinsic;
def int_arm_neon_vqshiftsu : Neon_2Arg_Intrinsic;
def int_arm_neon_vqshiftns : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vqshiftnu : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vqshiftnsu : Neon_2Arg_Narrow_Intrinsic;
// Vector Saturating Rounding Shift.
def int_arm_neon_vqrshifts : Neon_2Arg_Intrinsic;
def int_arm_neon_vqrshiftu : Neon_2Arg_Intrinsic;
def int_arm_neon_vqrshiftns : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vqrshiftnu : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vqrshiftnsu : Neon_2Arg_Narrow_Intrinsic;
// Vector Shift and Insert.
def int_arm_neon_vshiftins : Neon_3Arg_Intrinsic;
// Vector Absolute Value and Saturating Absolute Value.
def int_arm_neon_vabs : Neon_1Arg_Intrinsic;
def int_arm_neon_vabsf : Neon_1Arg_Float_Intrinsic;
def int_arm_neon_vqabs : Neon_1Arg_Intrinsic;
// Vector Saturating Negate.
def int_arm_neon_vqneg : Neon_1Arg_Intrinsic;
// Vector Count Leading Sign/Zero Bits.
def int_arm_neon_vcls : Neon_1Arg_Intrinsic;
def int_arm_neon_vclz : Neon_1Arg_Intrinsic;
// Vector Count One Bits.
def int_arm_neon_vcnt : Neon_1Arg_Intrinsic;
// Vector Reciprocal Estimate.
def int_arm_neon_vrecpe : Neon_1Arg_Intrinsic;
def int_arm_neon_vrecpef : Neon_1Arg_Float_Intrinsic;
// Vector Reciprocal Square Root Estimate.
def int_arm_neon_vrsqrte : Neon_1Arg_Intrinsic;
def int_arm_neon_vrsqrtef : Neon_1Arg_Float_Intrinsic;
// Vector Conversions Between Floating-point and Fixed-point.
def int_arm_neon_vcvtfp2fxs : Neon_CvtFPToFx_Intrinsic;
def int_arm_neon_vcvtfp2fxu : Neon_CvtFPToFx_Intrinsic;
def int_arm_neon_vcvtfxs2fp : Neon_CvtFxToFP_Intrinsic;
def int_arm_neon_vcvtfxu2fp : Neon_CvtFxToFP_Intrinsic;
// Narrowing and Lengthening Vector Moves.
def int_arm_neon_vmovn : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vqmovns : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vqmovnu : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vqmovnsu : Neon_1Arg_Narrow_Intrinsic;
def int_arm_neon_vmovls : Neon_1Arg_Long_Intrinsic;
def int_arm_neon_vmovlu : Neon_1Arg_Long_Intrinsic;
let TargetPrefix = "arm" in {
// De-interleaving vector loads from N-element structures.
def int_arm_neon_vld3i : Intrinsic<[llvm_anyint_ty],
[llvm_ptr_ty], [IntrReadArgMem]>;
def int_arm_neon_vld3f : Intrinsic<[llvm_anyfloat_ty],
[llvm_ptr_ty], [IntrReadArgMem]>;
def int_arm_neon_vld4i : Intrinsic<[llvm_anyint_ty],
[llvm_ptr_ty], [IntrReadArgMem]>;
def int_arm_neon_vld4f : Intrinsic<[llvm_anyfloat_ty],
[llvm_ptr_ty], [IntrReadArgMem]>;
// Interleaving vector stores from N-element structures.
def int_arm_neon_vst3i : Intrinsic<[llvm_void_ty],
[llvm_anyint_ty, llvm_ptr_ty],
[IntrWriteArgMem]>;
def int_arm_neon_vst3f : Intrinsic<[llvm_void_ty],
[llvm_anyfloat_ty, llvm_ptr_ty],
[IntrWriteArgMem]>;
def int_arm_neon_vst4i : Intrinsic<[llvm_void_ty],
[llvm_anyint_ty, llvm_ptr_ty],
[IntrWriteArgMem]>;
def int_arm_neon_vst4f : Intrinsic<[llvm_void_ty],
[llvm_anyfloat_ty, llvm_ptr_ty],
[IntrWriteArgMem]>;
}


@ -152,6 +152,7 @@ class TimerGroup {
unsigned NumTimers;
std::vector<Timer> TimersToPrint;
public:
TimerGroup() : Name("Miscellaneous Ungrouped Timers"), NumTimers(0) {}
explicit TimerGroup(const std::string &name) : Name(name), NumTimers(0) {}
~TimerGroup() {
assert(NumTimers == 0 &&


@ -78,11 +78,32 @@ namespace llvm {
/// Symbol Table Info
unsigned getSymTabEntrySize() const { return is64Bit ? 24 : 16; }
unsigned getSymTabAlignment() const { return is64Bit ? 8 : 4; }
/// getPrefELFAlignment - Returns the preferred alignment for ELF. This
/// is used to align some sections.
unsigned getPrefELFAlignment() const { return is64Bit ? 8 : 4; }
/// getRelocationEntrySize - Entry size used in the relocation section
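/// These sizes match the ELF relocation records: sizeof(Elf64_Rela) == 24,
/// sizeof(Elf64_Rel) == 16, sizeof(Elf32_Rela) == 12, sizeof(Elf32_Rel) == 8.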
unsigned getRelocationEntrySize() const {
return is64Bit ? (hasRelocationAddend() ? 24 : 16)
: (hasRelocationAddend() ? 12 : 8);
}
/// getFunctionAlignment - Returns the alignment for function 'F'; targets
/// with different alignment constraints should override this method.
virtual unsigned getFunctionAlignment(const Function *F) const;
/// getRelocationType - Returns the target specific ELF Relocation type.
/// 'MachineRelTy' contains the object code independent relocation type
virtual unsigned getRelocationType(unsigned MachineRelTy) const = 0;
/// hasRelocationAddend - True if the target uses an addend in the
/// ELF relocation entry.
virtual bool hasRelocationAddend() const = 0;
/// getAddendForRelTy - Gets the addend value for an ELF relocation entry
/// based on the target relocation type. If the addend is not used, returns 0.
virtual long int getAddendForRelTy(unsigned RelTy) const = 0;
};
} // end llvm namespace


@ -28,7 +28,6 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/ManagedStatic.h"
#include <algorithm>
using namespace llvm;


@ -39,7 +39,7 @@ Pass *llvm::createIVUsersPass() {
/// containsAddRecFromDifferentLoop - Determine whether expression S involves a
/// subexpression that is an AddRec from a loop other than L. An outer loop
/// of L is OK, but not an inner loop nor a disjoint loop.
static bool containsAddRecFromDifferentLoop(SCEVHandle S, Loop *L) {
static bool containsAddRecFromDifferentLoop(const SCEV* S, Loop *L) {
// This is very common, put it first.
if (isa<SCEVConstant>(S))
return false;
@ -80,10 +80,10 @@ static bool containsAddRecFromDifferentLoop(SCEVHandle S, Loop *L) {
/// a mix of loop invariant and loop variant expressions. The start cannot,
/// however, contain an AddRec from a different loop, unless that loop is an
/// outer loop of the current loop.
static bool getSCEVStartAndStride(const SCEVHandle &SH, Loop *L, Loop *UseLoop,
SCEVHandle &Start, SCEVHandle &Stride,
static bool getSCEVStartAndStride(const SCEV* &SH, Loop *L, Loop *UseLoop,
const SCEV* &Start, const SCEV* &Stride,
ScalarEvolution *SE, DominatorTree *DT) {
SCEVHandle TheAddRec = Start; // Initialize to zero.
const SCEV* TheAddRec = Start; // Initialize to zero.
// If the outer level is an AddExpr, the operands are all start values except
// for a nested AddRecExpr.
@ -109,9 +109,9 @@ static bool getSCEVStartAndStride(const SCEVHandle &SH, Loop *L, Loop *UseLoop,
// Use getSCEVAtScope to attempt to simplify other loops out of
// the picture.
SCEVHandle AddRecStart = AddRec->getStart();
const SCEV* AddRecStart = AddRec->getStart();
AddRecStart = SE->getSCEVAtScope(AddRecStart, UseLoop);
SCEVHandle AddRecStride = AddRec->getStepRecurrence(*SE);
const SCEV* AddRecStride = AddRec->getStepRecurrence(*SE);
// FIXME: If Start contains an SCEVAddRecExpr from a different loop, other
// than an outer loop of the current loop, reject it. LSR has no concept of
@ -196,13 +196,13 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
return true; // Instruction already handled.
// Get the symbolic expression for this instruction.
SCEVHandle ISE = SE->getSCEV(I);
const SCEV* ISE = SE->getSCEV(I);
if (isa<SCEVCouldNotCompute>(ISE)) return false;
// Get the start and stride for this expression.
Loop *UseLoop = LI->getLoopFor(I->getParent());
SCEVHandle Start = SE->getIntegerSCEV(0, ISE->getType());
SCEVHandle Stride = Start;
const SCEV* Start = SE->getIntegerSCEV(0, ISE->getType());
const SCEV* Stride = Start;
if (!getSCEVStartAndStride(ISE, L, UseLoop, Start, Stride, SE, DT))
return false; // Non-reducible symbolic expression, bail out.
@ -254,7 +254,7 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
if (IVUseShouldUsePostIncValue(User, I, L, LI, DT, this)) {
// The value used will be incremented by the stride more than we are
// expecting, so subtract this off.
SCEVHandle NewStart = SE->getMinusSCEV(Start, Stride);
const SCEV* NewStart = SE->getMinusSCEV(Start, Stride);
StrideUses->addUser(NewStart, User, I);
StrideUses->Users.back().setIsUseOfPostIncrementedValue(true);
DOUT << " USING POSTINC SCEV, START=" << *NewStart<< "\n";
@ -295,9 +295,9 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
/// getReplacementExpr - Return a SCEV expression which computes the
/// value of the OperandValToReplace of the given IVStrideUse.
SCEVHandle IVUsers::getReplacementExpr(const IVStrideUse &U) const {
const SCEV* IVUsers::getReplacementExpr(const IVStrideUse &U) const {
// Start with zero.
SCEVHandle RetVal = SE->getIntegerSCEV(0, U.getParent()->Stride->getType());
const SCEV* RetVal = SE->getIntegerSCEV(0, U.getParent()->Stride->getType());
// Create the basic add recurrence.
RetVal = SE->getAddRecExpr(RetVal, U.getParent()->Stride, L);
// Add the offset in a separate step, because it may be loop-variant.
@ -308,7 +308,7 @@ SCEVHandle IVUsers::getReplacementExpr(const IVStrideUse &U) const {
RetVal = SE->getAddExpr(RetVal, U.getParent()->Stride);
// Evaluate the expression out of the loop, if possible.
if (!L->contains(U.getUser()->getParent())) {
SCEVHandle ExitVal = SE->getSCEVAtScope(RetVal, L->getParentLoop());
const SCEV* ExitVal = SE->getSCEVAtScope(RetVal, L->getParentLoop());
if (ExitVal->isLoopInvariant(L))
RetVal = ExitVal;
}
@ -325,7 +325,7 @@ void IVUsers::print(raw_ostream &OS, const Module *M) const {
OS << ":\n";
for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e; ++Stride) {
std::map<SCEVHandle, IVUsersOfOneStride*>::const_iterator SI =
std::map<const SCEV*, IVUsersOfOneStride*>::const_iterator SI =
IVUsesByStride.find(StrideOrder[Stride]);
assert(SI != IVUsesByStride.end() && "Stride doesn't exist!");
OS << " Stride " << *SI->first->getType() << " " << *SI->first << ":\n";


@ -26,8 +26,8 @@ char LoopVR::ID = 0;
static RegisterPass<LoopVR> X("loopvr", "Loop Value Ranges", false, true);
/// getRange - determine the range for a particular SCEV within a given Loop
ConstantRange LoopVR::getRange(SCEVHandle S, Loop *L, ScalarEvolution &SE) {
SCEVHandle T = SE.getBackedgeTakenCount(L);
ConstantRange LoopVR::getRange(const SCEV* S, Loop *L, ScalarEvolution &SE) {
const SCEV* T = SE.getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(T))
return ConstantRange(cast<IntegerType>(S->getType())->getBitWidth(), true);
@ -36,7 +36,7 @@ ConstantRange LoopVR::getRange(SCEVHandle S, Loop *L, ScalarEvolution &SE) {
}
/// getRange - determine the range for a particular SCEV with a given trip count
ConstantRange LoopVR::getRange(SCEVHandle S, SCEVHandle T, ScalarEvolution &SE){
ConstantRange LoopVR::getRange(const SCEV* S, const SCEV* T, ScalarEvolution &SE){
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S))
return ConstantRange(C->getValue()->getValue());
@ -182,8 +182,8 @@ ConstantRange LoopVR::getRange(SCEVHandle S, SCEVHandle T, ScalarEvolution &SE){
if (!Trip) return FullSet;
if (AddRec->isAffine()) {
SCEVHandle StartHandle = AddRec->getStart();
SCEVHandle StepHandle = AddRec->getOperand(1);
const SCEV* StartHandle = AddRec->getStart();
const SCEV* StepHandle = AddRec->getOperand(1);
const SCEVConstant *Step = dyn_cast<SCEVConstant>(StepHandle);
if (!Step) return FullSet;
@ -194,7 +194,7 @@ ConstantRange LoopVR::getRange(SCEVHandle S, SCEVHandle T, ScalarEvolution &SE){
if ((TripExt * StepExt).ugt(APInt::getLowBitsSet(ExWidth, ExWidth >> 1)))
return FullSet;
SCEVHandle EndHandle = SE.getAddExpr(StartHandle,
const SCEV* EndHandle = SE.getAddExpr(StartHandle,
SE.getMulExpr(T, StepHandle));
const SCEVConstant *Start = dyn_cast<SCEVConstant>(StartHandle);
const SCEVConstant *End = dyn_cast<SCEVConstant>(EndHandle);
@ -254,7 +254,7 @@ ConstantRange LoopVR::compute(Value *V) {
ScalarEvolution &SE = getAnalysis<ScalarEvolution>();
SCEVHandle S = SE.getSCEV(I);
const SCEV* S = SE.getSCEV(I);
if (isa<SCEVUnknown>(S) || isa<SCEVCouldNotCompute>(S))
return ConstantRange(cast<IntegerType>(V->getType())->getBitWidth(), false);

File diff suppressed because it is too large.


@ -152,8 +152,8 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode, Value *LHS,
/// TODO: When ScalarEvolution gets a SCEVSDivExpr, this can be made
/// unnecessary; in its place, just signed-divide Ops[i] by the scale and
/// check to see if the divide was folded.
static bool FactorOutConstant(SCEVHandle &S,
SCEVHandle &Remainder,
static bool FactorOutConstant(const SCEV* &S,
const SCEV* &Remainder,
const APInt &Factor,
ScalarEvolution &SE) {
// Everything is divisible by one.
@ -168,7 +168,7 @@ static bool FactorOutConstant(SCEVHandle &S,
// the value at this scale. It will be considered for subsequent
// smaller scales.
if (C->isZero() || !CI->isZero()) {
SCEVHandle Div = SE.getConstant(CI);
const SCEV* Div = SE.getConstant(CI);
S = Div;
Remainder =
SE.getAddExpr(Remainder,
@ -182,8 +182,8 @@ static bool FactorOutConstant(SCEVHandle &S,
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S))
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
if (!C->getValue()->getValue().srem(Factor)) {
const SmallVectorImpl<SCEVHandle> &MOperands = M->getOperands();
SmallVector<SCEVHandle, 4> NewMulOps(MOperands.begin(), MOperands.end());
const SmallVectorImpl<const SCEV*> &MOperands = M->getOperands();
SmallVector<const SCEV*, 4> NewMulOps(MOperands.begin(), MOperands.end());
NewMulOps[0] =
SE.getConstant(C->getValue()->getValue().sdiv(Factor));
S = SE.getMulExpr(NewMulOps);
@ -192,13 +192,13 @@ static bool FactorOutConstant(SCEVHandle &S,
// In an AddRec, check if both start and step are divisible.
if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(S)) {
SCEVHandle Step = A->getStepRecurrence(SE);
SCEVHandle StepRem = SE.getIntegerSCEV(0, Step->getType());
const SCEV* Step = A->getStepRecurrence(SE);
const SCEV* StepRem = SE.getIntegerSCEV(0, Step->getType());
if (!FactorOutConstant(Step, StepRem, Factor, SE))
return false;
if (!StepRem->isZero())
return false;
SCEVHandle Start = A->getStart();
const SCEV* Start = A->getStart();
if (!FactorOutConstant(Start, Remainder, Factor, SE))
return false;
S = SE.getAddRecExpr(Start, Step, A->getLoop());
@ -233,14 +233,14 @@ static bool FactorOutConstant(SCEVHandle &S,
/// loop-invariant portions of expressions, after considering what
/// can be folded using target addressing modes.
///
Value *SCEVExpander::expandAddToGEP(const SCEVHandle *op_begin,
const SCEVHandle *op_end,
Value *SCEVExpander::expandAddToGEP(const SCEV* const *op_begin,
const SCEV* const *op_end,
const PointerType *PTy,
const Type *Ty,
Value *V) {
const Type *ElTy = PTy->getElementType();
SmallVector<Value *, 4> GepIndices;
SmallVector<SCEVHandle, 8> Ops(op_begin, op_end);
SmallVector<const SCEV*, 8> Ops(op_begin, op_end);
bool AnyNonZeroIndices = false;
// Descend down the pointer's type and attempt to convert the other
@ -251,14 +251,14 @@ Value *SCEVExpander::expandAddToGEP(const SCEVHandle *op_begin,
for (;;) {
APInt ElSize = APInt(SE.getTypeSizeInBits(Ty),
ElTy->isSized() ? SE.TD->getTypeAllocSize(ElTy) : 0);
SmallVector<SCEVHandle, 8> NewOps;
SmallVector<SCEVHandle, 8> ScaledOps;
SmallVector<const SCEV*, 8> NewOps;
SmallVector<const SCEV*, 8> ScaledOps;
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
// Split AddRecs up into parts as either of the parts may be usable
// without the other.
if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Ops[i]))
if (!A->getStart()->isZero()) {
SCEVHandle Start = A->getStart();
const SCEV* Start = A->getStart();
Ops.push_back(SE.getAddRecExpr(SE.getIntegerSCEV(0, A->getType()),
A->getStepRecurrence(SE),
A->getLoop()));
@ -267,8 +267,8 @@ Value *SCEVExpander::expandAddToGEP(const SCEVHandle *op_begin,
}
// If the scale size is not 0, attempt to factor out a scale.
if (ElSize != 0) {
SCEVHandle Op = Ops[i];
SCEVHandle Remainder = SE.getIntegerSCEV(0, Op->getType());
const SCEV* Op = Ops[i];
const SCEV* Remainder = SE.getIntegerSCEV(0, Op->getType());
if (FactorOutConstant(Op, Remainder, ElSize, SE)) {
ScaledOps.push_back(Op); // Op now has ElSize factored out.
NewOps.push_back(Remainder);
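
// The ScaledOps/NewOps split above is the arithmetic identity
// Base + (Q*ElSize + R) == &((ElTy*)Base)[Q] + R bytes: offsets proven
// divisible by the element size become GEP indices, and the remainder stays
// as raw byte arithmetic. A plain-integer sketch (hypothetical helper, not
// part of this patch):

#include <cstdint>

static char *expandAddLikeGEP(char *Base, int64_t Offset, int64_t ElSize) {
  int64_t Index = Offset / ElSize;     // the ScaledOps part (a GEP index)
  int64_t Rem   = Offset % ElSize;     // the NewOps remainder (byte offset)
  return Base + Index * ElSize + Rem;  // same address, GEP-friendly form
}
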
@ -364,7 +364,7 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
// comments on expandAddToGEP for details.
if (SE.TD)
if (const PointerType *PTy = dyn_cast<PointerType>(V->getType())) {
const SmallVectorImpl<SCEVHandle> &Ops = S->getOperands();
const SmallVectorImpl<const SCEV*> &Ops = S->getOperands();
return expandAddToGEP(&Ops[0], &Ops[Ops.size() - 1],
PTy, Ty, V);
}
@ -420,7 +420,7 @@ Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) {
/// Move parts of Base into Rest to leave Base with the minimal
/// expression that provides a pointer operand suitable for a
/// GEP expansion.
static void ExposePointerBase(SCEVHandle &Base, SCEVHandle &Rest,
static void ExposePointerBase(const SCEV* &Base, const SCEV* &Rest,
ScalarEvolution &SE) {
while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Base)) {
Base = A->getStart();
@ -431,7 +431,7 @@ static void ExposePointerBase(SCEVHandle &Base, SCEVHandle &Rest,
}
if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(Base)) {
Base = A->getOperand(A->getNumOperands()-1);
SmallVector<SCEVHandle, 8> NewAddOps(A->op_begin(), A->op_end());
SmallVector<const SCEV*, 8> NewAddOps(A->op_begin(), A->op_end());
NewAddOps.back() = Rest;
Rest = SE.getAddExpr(NewAddOps);
ExposePointerBase(Base, Rest, SE);
@ -455,9 +455,9 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
if (CanonicalIV &&
SE.getTypeSizeInBits(CanonicalIV->getType()) >
SE.getTypeSizeInBits(Ty)) {
SCEVHandle Start = SE.getAnyExtendExpr(S->getStart(),
const SCEV* Start = SE.getAnyExtendExpr(S->getStart(),
CanonicalIV->getType());
SCEVHandle Step = SE.getAnyExtendExpr(S->getStepRecurrence(SE),
const SCEV* Step = SE.getAnyExtendExpr(S->getStepRecurrence(SE),
CanonicalIV->getType());
Value *V = expand(SE.getAddRecExpr(Start, Step, S->getLoop()));
BasicBlock::iterator SaveInsertPt = getInsertionPoint();
@ -472,16 +472,16 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
// {X,+,F} --> X + {0,+,F}
if (!S->getStart()->isZero()) {
const SmallVectorImpl<SCEVHandle> &SOperands = S->getOperands();
SmallVector<SCEVHandle, 4> NewOps(SOperands.begin(), SOperands.end());
const SmallVectorImpl<const SCEV*> &SOperands = S->getOperands();
SmallVector<const SCEV*, 4> NewOps(SOperands.begin(), SOperands.end());
NewOps[0] = SE.getIntegerSCEV(0, Ty);
SCEVHandle Rest = SE.getAddRecExpr(NewOps, L);
const SCEV* Rest = SE.getAddRecExpr(NewOps, L);
// Turn things like ptrtoint+arithmetic+inttoptr into GEP. See the
// comments on expandAddToGEP for details.
if (SE.TD) {
SCEVHandle Base = S->getStart();
SCEVHandle RestArray[1] = { Rest };
const SCEV* Base = S->getStart();
const SCEV* RestArray[1] = { Rest };
// Dig into the expression to find the pointer base for a GEP.
ExposePointerBase(Base, RestArray[0], SE);
// If we found a pointer, expand the AddRec with a GEP.
@ -581,20 +581,20 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
// folders, then expandCodeFor the closed form. This allows the folders to
// simplify the expression without having to build a bunch of special code
// into this folder.
SCEVHandle IH = SE.getUnknown(I); // Get I as a "symbolic" SCEV.
const SCEV* IH = SE.getUnknown(I); // Get I as a "symbolic" SCEV.
// Promote S up to the canonical IV type, if the cast is foldable.
SCEVHandle NewS = S;
SCEVHandle Ext = SE.getNoopOrAnyExtend(S, I->getType());
const SCEV* NewS = S;
const SCEV* Ext = SE.getNoopOrAnyExtend(S, I->getType());
if (isa<SCEVAddRecExpr>(Ext))
NewS = Ext;
SCEVHandle V = cast<SCEVAddRecExpr>(NewS)->evaluateAtIteration(IH, SE);
const SCEV* V = cast<SCEVAddRecExpr>(NewS)->evaluateAtIteration(IH, SE);
//cerr << "Evaluated: " << *this << "\n to: " << *V << "\n";
// Truncate the result down to the original type, if needed.
SCEVHandle T = SE.getTruncateOrNoop(V, Ty);
return expand(V);
const SCEV* T = SE.getTruncateOrNoop(V, Ty);
return expand(T);
}
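
// The closed form used above: an affine AddRec {Start,+,Step} evaluated at
// iteration It is Start + It*Step; evaluateAtIteration generalizes this to
// higher-order recurrences using binomial coefficients. A scalar sketch
// (illustrative only):

static long evalAffineAt(long Start, long Step, long It) {
  return Start + It * Step;  // {Start,+,Step} at iteration It
}
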
Value *SCEVExpander::visitTruncateExpr(const SCEVTruncateExpr *S) {
@ -654,7 +654,7 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
return LHS;
}
Value *SCEVExpander::expandCodeFor(SCEVHandle SH, const Type *Ty) {
Value *SCEVExpander::expandCodeFor(const SCEV* SH, const Type *Ty) {
// Expand the code for this SCEV.
Value *V = expand(SH);
if (Ty) {
@ -667,7 +667,7 @@ Value *SCEVExpander::expandCodeFor(SCEVHandle SH, const Type *Ty) {
Value *SCEVExpander::expand(const SCEV *S) {
// Check to see if we already expanded this.
std::map<SCEVHandle, AssertingVH<Value> >::iterator I =
std::map<const SCEV*, AssertingVH<Value> >::iterator I =
InsertedExpressions.find(S);
if (I != InsertedExpressions.end())
return I->second;
@ -685,7 +685,7 @@ Value *
SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L,
const Type *Ty) {
assert(Ty->isInteger() && "Can only insert integer induction variables!");
SCEVHandle H = SE.getAddRecExpr(SE.getIntegerSCEV(0, Ty),
const SCEV* H = SE.getAddRecExpr(SE.getIntegerSCEV(0, Ty),
SE.getIntegerSCEV(1, Ty), L);
return expand(H);
}


@ -624,8 +624,12 @@ bool llvm::MaskedValueIsZero(Value *V, const APInt &Mask,
/// 'Op' must have a scalar integer type.
///
unsigned llvm::ComputeNumSignBits(Value *V, TargetData *TD, unsigned Depth) {
assert((TD || V->getType()->isIntOrIntVector()) &&
"ComputeNumSignBits requires a TargetData object to operate "
"on non-integer values!");
const Type *Ty = V->getType();
unsigned TyBits = Ty->getScalarSizeInBits();
unsigned TyBits = TD ? TD->getTypeSizeInBits(V->getType()->getScalarType()) :
Ty->getScalarSizeInBits();
unsigned Tmp, Tmp2;
unsigned FirstAnswer = 1;
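
// For reference, the quantity being computed: the number of leading bits
// that are copies of the sign bit, counting the sign bit itself. A scalar
// sketch for a single known i16 value (illustrative, not the algorithm the
// function itself uses):

#include <cstdint>

// numSignBits16(-1) == 16, numSignBits16(0) == 16, numSignBits16(0x4000) == 1.
static unsigned numSignBits16(int16_t V) {
  uint16_t U = (uint16_t)V;
  unsigned Sign = (U >> 15) & 1;
  unsigned N = 1;                      // the sign bit itself
  while (N < 16 && ((U >> (15 - N)) & 1) == Sign)
    ++N;
  return N;
}
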


@ -128,7 +128,13 @@ namespace llvm {
/// added to logical symbol table for the module. This is eventually
/// turned into a real symbol table in the file.
struct ELFSym {
const GlobalValue *GV; // The global value this corresponds to.
// The global value this corresponds to. Global symbols can be one of
// three kinds: if the symbol has a zero initializer, it is either common
// or belongs in the .bss section; otherwise it is a constant.
const GlobalValue *GV;
bool IsCommon;
bool IsBss;
bool IsConstant;
// ELF specific fields
unsigned NameIdx; // Index in .strtab of name, once emitted.
@ -159,8 +165,9 @@ namespace llvm {
STV_PROTECTED = 3 // Visible in other components but not preemptable
};
ELFSym(const GlobalValue *gv) : GV(gv), NameIdx(0), Value(0),
Size(0), Info(0), Other(0),
ELFSym(const GlobalValue *gv) : GV(gv), IsCommon(false), IsBss(false),
IsConstant(false), NameIdx(0), Value(0),
Size(0), Info(0), Other(STV_DEFAULT),
SectionIdx(ELFSection::SHN_UNDEF) {
if (!GV)
return;
@ -180,16 +187,47 @@ namespace llvm {
}
}
void SetBind(unsigned X) {
unsigned getBind() {
return (Info >> 4) & 0xf;
}
void setBind(unsigned X) {
assert(X == (X & 0xF) && "Bind value out of range!");
Info = (Info & 0x0F) | (X << 4);
}
void SetType(unsigned X) {
void setType(unsigned X) {
assert(X == (X & 0xF) && "Type value out of range!");
Info = (Info & 0xF0) | X;
}
};
/// ELFRelocation - This class contains all the information necessary to
/// generate any 32-bit or 64-bit ELF relocation entry.
class ELFRelocation {
uint64_t r_offset; // offset in the section of the object this applies to
uint32_t r_symidx; // symbol table index of the symbol to use
uint32_t r_type; // machine specific relocation type
int64_t r_add; // explicit relocation addend
bool r_rela; // if true then the addend is part of the entry
// otherwise the addend is at the location specified
// by r_offset
public:
uint64_t getInfo(bool is64Bit) const {
if (is64Bit)
return ((uint64_t)r_symidx << 32) + ((uint64_t)r_type & 0xFFFFFFFFL);
else
return (r_symidx << 8) + (r_type & 0xFFL);
}
uint64_t getOffset() const { return r_offset; }
int64_t getAddend() const { return r_add; }
ELFRelocation(uint64_t off, uint32_t sym, uint32_t type,
bool rela = true, int64_t addend = 0) :
r_offset(off), r_symidx(sym), r_type(type),
r_add(addend), r_rela(rela) {}
};
} // end namespace llvm
#endif
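
// The getInfo() packing above matches the ELF specification's ELF64_R_INFO
// and ELF32_R_INFO macros. As standalone helpers (illustrative, not part of
// this patch):

#include <cstdint>

static uint64_t relInfo64(uint32_t SymIdx, uint32_t Type) {
  return ((uint64_t)SymIdx << 32) | ((uint64_t)Type & 0xFFFFFFFFu); // ELF64_R_INFO
}
static uint32_t relInfo32(uint32_t SymIdx, uint8_t Type) {
  return (SymIdx << 8) | Type;                                      // ELF32_R_INFO
}
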


@ -71,38 +71,37 @@ bool ELFCodeEmitter::finishFunction(MachineFunction &MF) {
// Update Section Size
ES->Size = CurBufferPtr - BufferBegin;
// Figure out the binding (linkage) of the symbol.
switch (MF.getFunction()->getLinkage()) {
default:
// appending linkage is illegal for functions.
assert(0 && "Unknown linkage type!");
case GlobalValue::ExternalLinkage:
FnSym.SetBind(ELFSym::STB_GLOBAL);
break;
case GlobalValue::LinkOnceAnyLinkage:
case GlobalValue::LinkOnceODRLinkage:
case GlobalValue::WeakAnyLinkage:
case GlobalValue::WeakODRLinkage:
FnSym.SetBind(ELFSym::STB_WEAK);
break;
case GlobalValue::PrivateLinkage:
assert (0 && "PrivateLinkage should not be in the symbol table.");
case GlobalValue::InternalLinkage:
FnSym.SetBind(ELFSym::STB_LOCAL);
break;
}
// Set the symbol type as a function
FnSym.SetType(ELFSym::STT_FUNC);
FnSym.setType(ELFSym::STT_FUNC);
FnSym.SectionIdx = ES->SectionIdx;
FnSym.Size = CurBufferPtr-FnStartPtr;
// Offset from start of Section
FnSym.Value = FnStartPtr-BufferBegin;
// Finally, add it to the symtab.
EW.SymbolList.push_back(FnSym);
// Figure out the binding (linkage) of the symbol.
switch (MF.getFunction()->getLinkage()) {
default:
// appending linkage is illegal for functions.
assert(0 && "Unknown linkage type!");
case GlobalValue::ExternalLinkage:
FnSym.setBind(ELFSym::STB_GLOBAL);
EW.SymbolList.push_back(FnSym);
break;
case GlobalValue::LinkOnceAnyLinkage:
case GlobalValue::LinkOnceODRLinkage:
case GlobalValue::WeakAnyLinkage:
case GlobalValue::WeakODRLinkage:
FnSym.setBind(ELFSym::STB_WEAK);
EW.SymbolList.push_back(FnSym);
break;
case GlobalValue::PrivateLinkage:
assert (0 && "PrivateLinkage should not be in the symbol table.");
case GlobalValue::InternalLinkage:
FnSym.setBind(ELFSym::STB_LOCAL);
EW.SymbolList.push_front(FnSym);
break;
}
// Relocations
// -----------
@ -113,7 +112,6 @@ bool ELFCodeEmitter::finishFunction(MachineFunction &MF) {
for (unsigned i = 0, e = Relocations.size(); i != e; ++i) {
MachineRelocation &MR = Relocations[i];
intptr_t Addr;
if (MR.isBasicBlock()) {
Addr = getMachineBasicBlockAddress(MR.getBasicBlock());
MR.setConstantVal(ES->SectionIdx);


@ -136,104 +136,40 @@ bool ELFWriter::doInitialization(Module &M) {
ElfHdr.emitWord16(0); // Placeholder
// Add the null section, which is required to be first in the file.
getSection("", ELFSection::SHT_NULL, 0);
// Start up the symbol table. The first entry in the symtab is the null
// entry.
SymbolList.push_back(ELFSym(0));
getNullSection();
return false;
}
void ELFWriter::EmitGlobal(GlobalVariable *GV) {
unsigned ELFWriter::getGlobalELFLinkage(const GlobalVariable *GV) {
if (GV->hasInternalLinkage())
return ELFSym::STB_LOCAL;
// XXX: put local symbols *before* global ones!
if (GV->hasWeakLinkage())
return ELFSym::STB_WEAK;
return ELFSym::STB_GLOBAL;
}
// For global symbols without a section, return the Null section as a
// placeholder
ELFSection &ELFWriter::getGlobalSymELFSection(const GlobalVariable *GV,
ELFSym &Sym) {
const Section *S = TAI->SectionForGlobal(GV);
unsigned Flags = S->getFlags();
unsigned SectionType = ELFSection::SHT_PROGBITS;
unsigned SHdrFlags = ELFSection::SHF_ALLOC;
DOUT << "Section " << S->getName() << " for global " << GV->getName() << "\n";
// If this is an external global, emit it now. TODO: Note that it would be
// better to ignore the symbol here and only add it to the symbol table if
// referenced.
// If this is an external global, the symbol does not have a section.
if (!GV->hasInitializer()) {
ELFSym ExternalSym(GV);
ExternalSym.SetBind(ELFSym::STB_GLOBAL);
ExternalSym.SetType(ELFSym::STT_NOTYPE);
ExternalSym.SectionIdx = ELFSection::SHN_UNDEF;
SymbolList.push_back(ExternalSym);
return;
Sym.SectionIdx = ELFSection::SHN_UNDEF;
return getNullSection();
}
const TargetData *TD = TM.getTargetData();
unsigned Align = TD->getPreferredAlignment(GV);
Constant *CV = GV->getInitializer();
unsigned Size = TD->getTypeAllocSize(CV->getType());
// If this global has a zero initializer, go to .bss or common section.
if (CV->isNullValue() || isa<UndefValue>(CV)) {
// If this global is part of the common block, add it now. Variables are
// part of the common block if they are zero initialized and allowed to be
// merged with other symbols.
if (GV->hasLinkOnceLinkage() || GV->hasWeakLinkage() ||
GV->hasCommonLinkage()) {
ELFSym CommonSym(GV);
// Value for common symbols is the alignment required.
CommonSym.Value = Align;
CommonSym.Size = Size;
CommonSym.SetBind(ELFSym::STB_GLOBAL);
CommonSym.SetType(ELFSym::STT_OBJECT);
CommonSym.SectionIdx = ELFSection::SHN_COMMON;
SymbolList.push_back(CommonSym);
getSection(S->getName(), ELFSection::SHT_NOBITS,
ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC, 1);
return;
}
// Otherwise, this symbol is part of the .bss section. Emit it now.
// Handle alignment. Ensure section is aligned at least as much as required
// by this symbol.
ELFSection &BSSSection = getBSSSection();
BSSSection.Align = std::max(BSSSection.Align, Align);
// Within the section, emit enough virtual padding to get us to an alignment
// boundary.
if (Align)
BSSSection.Size = (BSSSection.Size + Align - 1) & ~(Align-1);
ELFSym BSSSym(GV);
BSSSym.Value = BSSSection.Size;
BSSSym.Size = Size;
BSSSym.SetType(ELFSym::STT_OBJECT);
switch (GV->getLinkage()) {
default: // weak/linkonce/common handled above
assert(0 && "Unexpected linkage type!");
case GlobalValue::AppendingLinkage: // FIXME: This should be improved!
case GlobalValue::ExternalLinkage:
BSSSym.SetBind(ELFSym::STB_GLOBAL);
break;
case GlobalValue::InternalLinkage:
BSSSym.SetBind(ELFSym::STB_LOCAL);
break;
}
// Set the idx of the .bss section
BSSSym.SectionIdx = BSSSection.SectionIdx;
if (!GV->hasPrivateLinkage())
SymbolList.push_back(BSSSym);
// Reserve space in the .bss section for this symbol.
BSSSection.Size += Size;
return;
}
/// Emit the Global symbol to the right ELF section
ELFSym GblSym(GV);
GblSym.Size = Size;
GblSym.SetType(ELFSym::STT_OBJECT);
GblSym.SetBind(ELFSym::STB_GLOBAL);
unsigned Flags = S->getFlags();
unsigned SectType = ELFSection::SHT_PROGBITS;
unsigned SHdrFlags = ELFSection::SHF_ALLOC;
if (Flags & SectionFlags::Code)
SHdrFlags |= ELFSection::SHF_EXECINSTR;
@ -246,29 +182,81 @@ void ELFWriter::EmitGlobal(GlobalVariable *GV) {
if (Flags & SectionFlags::Strings)
SHdrFlags |= ELFSection::SHF_STRINGS;
// Remove tab from section name prefix
std::string SectionName(S->getName());
size_t Pos = SectionName.find("\t");
if (Pos != std::string::npos)
SectionName.erase(Pos, 1);
// If this global has a zero initializer, go to .bss or common section.
// Variables are part of the common block if they are zero initialized
// and allowed to be merged with other symbols.
if (CV->isNullValue() || isa<UndefValue>(CV)) {
SectionType = ELFSection::SHT_NOBITS;
ELFSection &ElfS = getSection(S->getName(), SectionType, SHdrFlags);
if (GV->hasLinkOnceLinkage() || GV->hasWeakLinkage() ||
GV->hasCommonLinkage()) {
Sym.SectionIdx = ELFSection::SHN_COMMON;
Sym.IsCommon = true;
ElfS.Align = 1;
return ElfS;
}
Sym.IsBss = true;
Sym.SectionIdx = ElfS.SectionIdx;
if (Align) ElfS.Size = (ElfS.Size + Align-1) & ~(Align-1);
ElfS.Align = std::max(ElfS.Align, Align);
return ElfS;
}
// The section alignment should be bound to the element with
// the largest alignment
ELFSection &ElfS = getSection(SectionName, SectType, SHdrFlags);
GblSym.SectionIdx = ElfS.SectionIdx;
if (Align > ElfS.Align)
ElfS.Align = Align;
Sym.IsConstant = true;
ELFSection &ElfS = getSection(S->getName(), SectionType, SHdrFlags);
Sym.SectionIdx = ElfS.SectionIdx;
ElfS.Align = std::max(ElfS.Align, Align);
return ElfS;
}
// S.Value should contain the symbol index inside the section,
// and all symbols should start on their required alignment boundary
GblSym.Value = (ElfS.size() + (Align-1)) & (-Align);
ElfS.emitAlignment(Align);
// Emit the constant symbol to its section
EmitGlobalConstant(CV, ElfS);
void ELFWriter::EmitFunctionDeclaration(const Function *F) {
ELFSym GblSym(F);
GblSym.setBind(ELFSym::STB_GLOBAL);
GblSym.setType(ELFSym::STT_NOTYPE);
GblSym.SectionIdx = ELFSection::SHN_UNDEF;
SymbolList.push_back(GblSym);
}
void ELFWriter::EmitGlobalVar(const GlobalVariable *GV) {
unsigned SymBind = getGlobalELFLinkage(GV);
unsigned Align=0, Size=0;
ELFSym GblSym(GV);
GblSym.setBind(SymBind);
if (GV->hasInitializer()) {
GblSym.setType(ELFSym::STT_OBJECT);
const TargetData *TD = TM.getTargetData();
Align = TD->getPreferredAlignment(GV);
Size = TD->getTypeAllocSize(GV->getInitializer()->getType());
GblSym.Size = Size;
} else {
GblSym.setType(ELFSym::STT_NOTYPE);
}
ELFSection &GblSection = getGlobalSymELFSection(GV, GblSym);
if (GblSym.IsCommon) {
GblSym.Value = Align;
} else if (GblSym.IsBss) {
GblSym.Value = GblSection.Size;
GblSection.Size += Size;
} else if (GblSym.IsConstant){
// GblSym.Value should contain the symbol index inside the section,
// and all symbols should start on their required alignment boundary
GblSym.Value = (GblSection.size() + (Align-1)) & (-Align);
GblSection.emitAlignment(Align);
EmitGlobalConstant(GV->getInitializer(), GblSection);
}
// Local symbols should come first on the symbol table.
if (!GV->hasPrivateLinkage()) {
if (SymBind == ELFSym::STB_LOCAL)
SymbolList.push_front(GblSym);
else
SymbolList.push_back(GblSym);
}
}
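
// Both rounding forms used above, (Size + Align-1) & ~(Align-1) in the .bss
// path and (size() + (Align-1)) & (-Align) in the constant path, are the
// same power-of-two align-up, since -Align == ~(Align-1) in two's
// complement. As a standalone helper (illustrative):

#include <cstdint>

// Round X up to the next multiple of Align; Align must be a power of two.
static uint64_t alignUp(uint64_t X, uint64_t Align) {
  return (X + Align - 1) & ~(Align - 1);
}
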
void ELFWriter::EmitGlobalConstantStruct(const ConstantStruct *CVS,
ELFSection &GblS) {
@ -306,6 +294,7 @@ void ELFWriter::EmitGlobalConstant(const Constant *CV, ELFSection &GblS) {
if (const ConstantArray *CVA = dyn_cast<ConstantArray>(CV)) {
if (CVA->isString()) {
std::string GblStr = CVA->getAsString();
GblStr.resize(GblStr.size()-1);
GblS.emitString(GblStr);
} else { // Not a string. Print the values in successive locations
for (unsigned i = 0, e = CVA->getNumOperands(); i != e; ++i)
@ -370,13 +359,39 @@ bool ELFWriter::doFinalization(Module &M) {
// Build and emit data, bss and "common" sections.
for (Module::global_iterator I = M.global_begin(), E = M.global_end();
I != E; ++I)
EmitGlobal(I);
I != E; ++I) {
EmitGlobalVar(I);
GblSymLookup[I] = 0;
}
// Emit all pending globals
// TODO: this should be done only for referenced symbols
for (SetVector<GlobalValue*>::const_iterator I = PendingGlobals.begin(),
E = PendingGlobals.end(); I != E; ++I) {
// No need to emit the symbol again
if (GblSymLookup.find(*I) != GblSymLookup.end())
continue;
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(*I)) {
EmitGlobalVar(GV);
} else if (Function *F = dyn_cast<Function>(*I)) {
// If function is not in GblSymLookup, it doesn't have a body,
// so emit the symbol as a function declaration (no section associated)
EmitFunctionDeclaration(F);
} else {
assert("unknown howto handle pending global");
}
GblSymLookup[*I] = 0;
}
// Emit non-executable stack note
if (TAI->getNonexecutableStackDirective())
getNonExecStackSection();
// Emit string table
EmitStringTable();
// Emit the symbol table now, if non-empty.
EmitSymbolTable();
@ -400,6 +415,67 @@ bool ELFWriter::doFinalization(Module &M) {
/// EmitRelocations - Emit relocations
void ELFWriter::EmitRelocations() {
// Create Relocation sections for each section which needs it.
for (std::list<ELFSection>::iterator I = SectionList.begin(),
E = SectionList.end(); I != E; ++I) {
// This section does not have relocations
if (!I->hasRelocations()) continue;
// Get the relocation section for section 'I'
bool HasRelA = TEW->hasRelocationAddend();
ELFSection &RelSec = getRelocSection(I->getName(), HasRelA);
// 'Link' - Section hdr idx of the associated symbol table
// 'Info' - Section hdr idx of the section to which the relocation applies
ELFSection &SymTab = getSymbolTableSection();
RelSec.Link = SymTab.SectionIdx;
RelSec.Info = I->SectionIdx;
RelSec.EntSize = TEW->getRelocationEntrySize();
// Get the relocations from Section
std::vector<MachineRelocation> Relos = I->getRelocations();
for (std::vector<MachineRelocation>::iterator MRI = Relos.begin(),
MRE = Relos.end(); MRI != MRE; ++MRI) {
MachineRelocation &MR = *MRI;
// Offset from the start of the section containing the symbol
unsigned Offset = MR.getMachineCodeOffset();
// Symbol index in the symbol table
unsigned SymIdx = 0;
// Target specific ELF relocation type
unsigned RelType = TEW->getRelocationType(MR.getRelocationType());
// Constant addend used to compute the value to be stored
// into the relocatable field
int64_t Addend = TEW->getAddendForRelTy(RelType);
// There are several machine relocation types, and each one of
// them needs a different approach to retrieve the symbol table index.
if (MR.isGlobalValue()) {
const GlobalValue *G = MR.getGlobalValue();
SymIdx = GblSymLookup[G];
} else {
assert(0 && "dunno how to handle other relocation types");
}
// Get the relocation entry and emit to the relocation section
ELFRelocation Rel(Offset, SymIdx, RelType, HasRelA, Addend);
EmitRelocation(RelSec, Rel, HasRelA);
}
}
}
/// EmitRelocation - Write relocation 'Rel' to the relocation section 'RelSec'
void ELFWriter::EmitRelocation(BinaryObject &RelSec, ELFRelocation &Rel,
bool HasRelA) {
RelSec.emitWord(Rel.getOffset());
RelSec.emitWord(Rel.getInfo(is64Bit));
if (HasRelA)
RelSec.emitWord(Rel.getAddend());
}
/// EmitSymbol - Write symbol 'Sym' to the symbol table 'SymbolTable'
@ -448,28 +524,28 @@ void ELFWriter::EmitSectionHeader(BinaryObject &SHdrTab,
}
}
/// EmitSymbolTable - If the current symbol table is non-empty, emit the string
/// table for it and then the symbol table itself.
void ELFWriter::EmitSymbolTable() {
if (SymbolList.size() == 1) return; // Only the null entry.
// FIXME: compact all local symbols to the start of the symtab.
unsigned FirstNonLocalSymbol = 1;
/// EmitStringTable - If the current symbol table is non-empty, emit the string
/// table for it
void ELFWriter::EmitStringTable() {
if (!SymbolList.size()) return; // Empty symbol table.
ELFSection &StrTab = getStringTableSection();
// Set the zero'th symbol to a null byte, as required.
StrTab.emitByte(0);
// Walk the symbol list and write symbol names into the
// string table.
unsigned Index = 1;
for (unsigned i = 1, e = SymbolList.size(); i != e; ++i) {
for (std::list<ELFSym>::iterator I = SymbolList.begin(),
E = SymbolList.end(); I != E; ++I) {
// Use the name mangler to uniquify the LLVM symbol.
std::string Name = Mang->getValueName(SymbolList[i].GV);
std::string Name = Mang->getValueName(I->GV);
if (Name.empty()) {
SymbolList[i].NameIdx = 0;
I->NameIdx = 0;
} else {
SymbolList[i].NameIdx = Index;
I->NameIdx = Index;
StrTab.emitString(Name);
// Keep track of the number of bytes emitted to this section.
@ -478,20 +554,45 @@ void ELFWriter::EmitSymbolTable() {
}
assert(Index == StrTab.size());
StrTab.Size = Index;
}
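
// EmitStringTable builds a standard ELF .strtab: a byte blob whose first
// byte is NUL, where each symbol's NameIdx is the offset of its
// NUL-terminated name. A minimal sketch (names illustrative):

#include <string>

struct StrTabSketch {
  std::string Blob = std::string(1, '\0');   // index 0 is the empty name
  unsigned add(const std::string &Name) {
    if (Name.empty()) return 0;
    unsigned Off = (unsigned)Blob.size();
    Blob += Name;
    Blob += '\0';                            // names are NUL-terminated
    return Off;                              // becomes the symbol's NameIdx
  }
};
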
/// EmitSymbolTable - Emit the symbol table itself.
void ELFWriter::EmitSymbolTable() {
if (!SymbolList.size()) return; // Empty symbol table.
unsigned FirstNonLocalSymbol = 1;
// Now that we have emitted the string table and know the offset into the
// string table of each symbol, emit the symbol table itself.
ELFSection &SymTab = getSymbolTableSection();
SymTab.Align = TEW->getSymTabAlignment();
SymTab.Link = StrTab.SectionIdx; // Section Index of .strtab.
SymTab.Info = FirstNonLocalSymbol; // First non-STB_LOCAL symbol.
SymTab.Align = TEW->getPrefELFAlignment();
// Section Index of .strtab.
SymTab.Link = getStringTableSection().SectionIdx;
// Size of each symtab entry.
SymTab.EntSize = TEW->getSymTabEntrySize();
for (unsigned i = 0, e = SymbolList.size(); i != e; ++i)
EmitSymbol(SymTab, SymbolList[i]);
// The first entry in the symtab is the null symbol
ELFSym NullSym = ELFSym(0);
EmitSymbol(SymTab, NullSym);
// Emit all the symbols to the symbol table, skipping the null
// symbol since it was emitted above.
unsigned Index = 1;
for (std::list<ELFSym>::iterator I = SymbolList.begin(),
E = SymbolList.end(); I != E; ++I, ++Index) {
// Keep track of the first non-local symbol
if (I->getBind() == ELFSym::STB_LOCAL)
FirstNonLocalSymbol++;
// Emit symbol to the symbol table
EmitSymbol(SymTab, *I);
// Record the symbol table index for each global value
GblSymLookup[I->GV] = Index;
}
SymTab.Info = FirstNonLocalSymbol;
SymTab.Size = SymTab.size();
}
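
// The FirstNonLocalSymbol bookkeeping implements the ELF rule that a
// symtab's sh_info holds the index one past the last STB_LOCAL symbol;
// SymbolList keeps locals at the front (push_front), so one pass suffices.
// A sketch with illustrative types:

#include <list>

struct SymSketch { bool IsLocal; };

static unsigned firstNonLocalIndex(const std::list<SymSketch> &Syms) {
  unsigned Idx = 1;                          // index 0 is the null symbol
  for (std::list<SymSketch>::const_iterator I = Syms.begin(),
       E = Syms.end(); I != E && I->IsLocal; ++I)
    ++Idx;
  return Idx;                                // value for the symtab's sh_info
}
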
@ -500,7 +601,7 @@ void ELFWriter::EmitSymbolTable() {
/// section names.
void ELFWriter::EmitSectionTableStringTable() {
// First step: add the section for the string table to the list of sections:
ELFSection &SHStrTab = getSection(".shstrtab", ELFSection::SHT_STRTAB, 0);
ELFSection &SHStrTab = getSectionHeaderStringTableSection();
// Now that we know which section number is the .shstrtab section, update the
// e_shstrndx entry in the ELF header.
@ -559,7 +660,7 @@ void ELFWriter::OutputSectionsAndSectionTable() {
}
// Align Section Header.
unsigned TableAlign = is64Bit ? 8 : 4;
unsigned TableAlign = TEW->getPrefELFAlignment();
FileOff = (FileOff+TableAlign-1) & ~(TableAlign-1);
// Now that we know where all of the sections will be emitted, set the e_shnum
@ -586,13 +687,12 @@ void ELFWriter::OutputSectionsAndSectionTable() {
<< ", SectionData Size: " << S.size() << "\n";
// Align FileOff to whatever the alignment restrictions of the section are.
if (S.Align) {
for (size_t NewFileOff = (FileOff+S.Align-1) & ~(S.Align-1);
FileOff != NewFileOff; ++FileOff)
O << (char)0xAB;
}
if (S.size()) {
if (S.Align) {
for (size_t NewFileOff = (FileOff+S.Align-1) & ~(S.Align-1);
FileOff != NewFileOff; ++FileOff)
O << (char)0xAB;
}
O.write((char *)&S.getData()[0], S.Size);
FileOff += S.Size;
}


@ -16,7 +16,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Support/OutputBuffer.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetAsmInfo.h"
#include "llvm/Target/TargetELFWriterInfo.h"
#include "ELF.h"
@ -89,7 +89,7 @@ namespace llvm {
bool doFinalization(Module &M);
private:
// Blob containing the Elf header
/// Blob containing the Elf header
BinaryObject ElfHdr;
/// SectionList - This is the list of sections that we have emitted to the
@ -102,14 +102,35 @@ namespace llvm {
/// the SectionList.
std::map<std::string, ELFSection*> SectionLookup;
/// GblSymLookup - This is a mapping from global value to a symbol index
/// in the symbol table. This is useful because symbol references in
/// relocations must be quickly mapped to a symbol table index.
std::map<const GlobalValue*, uint32_t> GblSymLookup;
/// SymbolList - This is the list of symbols emitted to the symbol table
/// Local symbols go to the front and Globals to the back.
std::list<ELFSym> SymbolList;
/// PendingGlobals - List of externally defined symbols that we have been
/// asked to emit, but have not seen a reference to. When a reference
/// is seen, the symbol will move from this list to the SymbolList.
SetVector<GlobalValue*> PendingGlobals;
/// getSection - Return the section with the specified name, creating a new
/// section if one does not already exist.
ELFSection &getSection(const std::string &Name, unsigned Type,
ELFSection &getSection(const std::string &Name, unsigned Type,
unsigned Flags = 0, unsigned Align = 0) {
ELFSection *&SN = SectionLookup[Name];
if (SN) return *SN;
SectionList.push_back(ELFSection(Name, isLittleEndian, is64Bit));
// Remove tab from section name prefix. This is necessary because TAI
// sometimes returns a section name prefixed with a "\t" char.
std::string SectionName(Name);
size_t Pos = SectionName.find("\t");
if (Pos != std::string::npos)
SectionName.erase(Pos, 1);
SectionList.push_back(ELFSection(SectionName, isLittleEndian, is64Bit));
SN = &SectionList.back();
SN->SectionIdx = NumSections++;
SN->Type = Type;
@ -119,11 +140,25 @@ namespace llvm {
return *SN;
}
/// TODO: support mangled names here to emit the right .text section
/// for C++ object files.
ELFSection &getTextSection() {
return getSection(".text", ELFSection::SHT_PROGBITS,
ELFSection::SHF_EXECINSTR | ELFSection::SHF_ALLOC);
}
/// Return the relocation section of section 'S'. 'RelA' is true
/// if the relocation section contains entries with addends.
ELFSection &getRelocSection(std::string SName, bool RelA) {
std::string RelSName(".rel");
unsigned SHdrTy = RelA ? ELFSection::SHT_RELA : ELFSection::SHT_REL;
if (RelA) RelSName.append("a");
RelSName.append(SName);
return getSection(RelSName, SHdrTy, 0, TEW->getPrefELFAlignment());
}
ELFSection &getNonExecStackSection() {
return getSection(".note.GNU-stack", ELFSection::SHT_PROGBITS, 0, 1);
}
@ -136,25 +171,23 @@ namespace llvm {
return getSection(".strtab", ELFSection::SHT_STRTAB, 0, 1);
}
ELFSection &getSectionHeaderStringTableSection() {
return getSection(".shstrtab", ELFSection::SHT_STRTAB, 0, 1);
}
ELFSection &getDataSection() {
return getSection(".data", ELFSection::SHT_PROGBITS,
ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC);
ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC, 4);
}
ELFSection &getBSSSection() {
return getSection(".bss", ELFSection::SHT_NOBITS,
ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC);
ELFSection::SHF_WRITE | ELFSection::SHF_ALLOC, 4);
}
/// SymbolList - This is the list of symbols we have emitted to the file.
/// This actually gets rearranged before emission to the file (to put the
/// local symbols first in the list).
std::vector<ELFSym> SymbolList;
/// PendingGlobals - List of externally defined symbols that we have been
/// asked to emit, but have not seen a reference to. When a reference
/// is seen, the symbol will move from this list to the SymbolList.
SetVector<GlobalValue*> PendingGlobals;
ELFSection &getNullSection() {
return getSection("", ELFSection::SHT_NULL, 0);
}
// As we complete the ELF file, we need to update fields in the ELF header
// (e.g. the location of the section table). These members keep track of
@ -165,15 +198,20 @@ namespace llvm {
unsigned ELFHdr_e_shnum_Offset; // e_shnum in ELF header.
private:
void EmitGlobal(GlobalVariable *GV);
void EmitFunctionDeclaration(const Function *F);
void EmitGlobalVar(const GlobalVariable *GV);
void EmitGlobalConstant(const Constant *C, ELFSection &GblS);
void EmitGlobalConstantStruct(const ConstantStruct *CVS,
ELFSection &GblS);
unsigned getGlobalELFLinkage(const GlobalVariable *GV);
ELFSection &getGlobalSymELFSection(const GlobalVariable *GV, ELFSym &Sym);
void EmitRelocations();
void EmitRelocation(BinaryObject &RelSec, ELFRelocation &Rel, bool HasRelA);
void EmitSectionHeader(BinaryObject &SHdrTab, const ELFSection &SHdr);
void EmitSectionTableStringTable();
void EmitSymbol(BinaryObject &SymbolTable, ELFSym &Sym);
void EmitSymbolTable();
void EmitStringTable();
void OutputSectionsAndSectionTable();
};
}


@ -3121,6 +3121,8 @@ bool MeetsMaxMemopRequirement(std::vector<MVT> &MemOps,
VT = (MVT::SimpleValueType)(VT.getSimpleVT() - 1);
VTSize = VT.getSizeInBits() / 8;
} else {
// This can result in a type that is not legal on the target, e.g.
// 1 or 2 bytes on PPC.
VT = (MVT::SimpleValueType)(VT.getSimpleVT() - 1);
VTSize >>= 1;
}
@ -3177,12 +3179,29 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
getMemBasePlusOffset(Dst, DstOff, DAG),
DstSV, DstSVOff + DstOff, false, DstAlign);
} else {
Value = DAG.getLoad(VT, dl, Chain,
getMemBasePlusOffset(Src, SrcOff, DAG),
SrcSV, SrcSVOff + SrcOff, false, Align);
Store = DAG.getStore(Chain, dl, Value,
getMemBasePlusOffset(Dst, DstOff, DAG),
DstSV, DstSVOff + DstOff, false, DstAlign);
// The type might not be legal for the target. This should only happen
// if the type is smaller than a legal type, as on PPC, so the right
// thing to do is generate a LoadExt/StoreTrunc pair.
// FIXME: does the case above also need this?
if (TLI.isTypeLegal(VT)) {
Value = DAG.getLoad(VT, dl, Chain,
getMemBasePlusOffset(Src, SrcOff, DAG),
SrcSV, SrcSVOff + SrcOff, false, Align);
Store = DAG.getStore(Chain, dl, Value,
getMemBasePlusOffset(Dst, DstOff, DAG),
DstSV, DstSVOff + DstOff, false, DstAlign);
} else {
MVT NVT = VT;
while (!TLI.isTypeLegal(NVT)) {
NVT = (MVT::SimpleValueType(NVT.getSimpleVT() + 1));
}
Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
getMemBasePlusOffset(Src, SrcOff, DAG),
SrcSV, SrcSVOff + SrcOff, VT, false, Align);
Store = DAG.getTruncStore(Chain, dl, Value,
getMemBasePlusOffset(Dst, DstOff, DAG),
DstSV, DstSVOff + DstOff, VT, false, DstAlign);
}
}
OutChains.push_back(Store);
SrcOff += VTSize;

View File

@ -739,6 +739,9 @@ SimpleRegisterCoalescing::UpdateRegDefsUses(unsigned SrcReg, unsigned DstReg,
// After updating the operand, check if the machine instruction has
// become a copy. If so, update its val# information.
if (JoinedCopies.count(UseMI))
continue;
const TargetInstrDesc &TID = UseMI->getDesc();
unsigned CopySrcReg, CopyDstReg, CopySrcSubIdx, CopyDstSubIdx;
if (TID.getNumDefs() == 1 && TID.getNumOperands() > 2 &&
@ -749,9 +752,10 @@ SimpleRegisterCoalescing::UpdateRegDefsUses(unsigned SrcReg, unsigned DstReg,
allocatableRegs_[CopyDstReg])) {
LiveInterval &LI = li_->getInterval(CopyDstReg);
unsigned DefIdx = li_->getDefIndex(li_->getInstructionIndex(UseMI));
const LiveRange *DLR = LI.getLiveRangeContaining(DefIdx);
if (DLR->valno->def == DefIdx)
DLR->valno->copy = UseMI;
if (const LiveRange *DLR = LI.getLiveRangeContaining(DefIdx)) {
if (DLR->valno->def == DefIdx)
DLR->valno->copy = UseMI;
}
}
}
}


@ -27,6 +27,7 @@
#include "llvm/System/DynamicLibrary.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/System/Mutex.h"
#include <csignal>
#include <cstdio>
#include <map>
@ -45,6 +46,8 @@
using namespace llvm;
static ManagedStatic<sys::Mutex> FunctionsLock;
typedef GenericValue (*ExFunc)(const FunctionType *,
const std::vector<GenericValue> &);
static ManagedStatic<std::map<const Function *, ExFunc> > ExportedFunctions;
@ -94,6 +97,7 @@ static ExFunc lookupFunction(const Function *F) {
ExtName += getTypeID(FT->getContainedType(i));
ExtName += "_" + F->getName();
sys::ScopedLock Writer(&*FunctionsLock);
ExFunc FnPtr = FuncNames[ExtName];
if (FnPtr == 0)
FnPtr = FuncNames["lle_X_"+F->getName()];
@ -246,12 +250,16 @@ GenericValue Interpreter::callExternalFunction(Function *F,
const std::vector<GenericValue> &ArgVals) {
TheInterpreter = this;
FunctionsLock->acquire();
// Do a lookup to see if the function is in our cache... this should just be a
// deferred annotation!
std::map<const Function *, ExFunc>::iterator FI = ExportedFunctions->find(F);
if (ExFunc Fn = (FI == ExportedFunctions->end()) ? lookupFunction(F)
: FI->second)
: FI->second) {
FunctionsLock->release();
return Fn(F->getFunctionType(), ArgVals);
}
#ifdef USE_LIBFFI
std::map<const Function *, RawFunc>::iterator RF = RawFunctions->find(F);
@ -264,6 +272,8 @@ GenericValue Interpreter::callExternalFunction(Function *F,
} else {
RawFn = RF->second;
}
FunctionsLock->release();
GenericValue Result;
if (RawFn != 0 && ffiInvoke(RawFn, F, ArgVals, getTargetData(), Result))
@ -529,6 +539,7 @@ GenericValue lle_X_fprintf(const FunctionType *FT,
void Interpreter::initializeExternalFunctions() {
sys::ScopedLock Writer(&*FunctionsLock);
FuncNames["lle_X_atexit"] = lle_X_atexit;
FuncNames["lle_X_exit"] = lle_X_exit;
FuncNames["lle_X_abort"] = lle_X_abort;


@ -13,6 +13,7 @@
#include "llvm/Support/Annotation.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/System/RWMutex.h"
#include <map>
#include <cstring>
using namespace llvm;
@ -42,31 +43,33 @@ static unsigned IDCounter = 0; // Unique ID counter
// Static member to ensure initialization on demand.
static ManagedStatic<IDMapType> IDMap;
static ManagedStatic<sys::SmartRWMutex<true> > AnnotationsLock;
// On demand annotation creation support...
typedef Annotation *(*AnnFactory)(AnnotationID, const Annotable *, void *);
typedef std::map<unsigned, std::pair<AnnFactory,void*> > FactMapType;
static FactMapType *TheFactMap = 0;
static ManagedStatic<FactMapType> TheFactMap;
static FactMapType &getFactMap() {
if (TheFactMap == 0)
TheFactMap = new FactMapType();
return *TheFactMap;
}
static void eraseFromFactMap(unsigned ID) {
assert(TheFactMap && "No entries found!");
sys::SmartScopedWriter<true> Writer(&*AnnotationsLock);
TheFactMap->erase(ID);
if (TheFactMap->empty()) { // Delete when empty
delete TheFactMap;
TheFactMap = 0;
}
}
AnnotationID AnnotationManager::getID(const char *Name) { // Name -> ID
AnnotationsLock->reader_acquire();
IDMapType::iterator I = IDMap->find(Name);
if (I == IDMap->end()) {
(*IDMap)[Name] = IDCounter++; // Add a new element
IDMapType::iterator E = IDMap->end();
AnnotationsLock->reader_release();
if (I == E) {
sys::SmartScopedWriter<true> Writer(&*AnnotationsLock);
I = IDMap->find(Name);
if (I == IDMap->end())
(*IDMap)[Name] = IDCounter++; // Add a new element
return AnnotationID(IDCounter-1);
}
return AnnotationID(I->second);
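
// getID now follows the classic read-then-write pattern: take the cheap
// reader lock for the common hit, and on a miss reacquire as a writer and
// re-check, since another thread may have inserted the name in between.
// The same shape with standard shared locks (illustrative, C++17):

#include <map>
#include <shared_mutex>
#include <string>

static std::shared_mutex MapLock;
static std::map<std::string, unsigned> IDs;
static unsigned Counter = 0;

static unsigned getIDSketch(const std::string &Name) {
  {
    std::shared_lock<std::shared_mutex> Reader(MapLock);   // fast hit path
    std::map<std::string, unsigned>::const_iterator I = IDs.find(Name);
    if (I != IDs.end()) return I->second;
  }
  std::unique_lock<std::shared_mutex> Writer(MapLock);     // slow miss path
  std::map<std::string, unsigned>::iterator I = IDs.find(Name); // re-check
  if (I == IDs.end())
    I = IDs.insert(std::make_pair(Name, Counter++)).first;
  return I->second;
}
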
@ -85,6 +88,7 @@ AnnotationID AnnotationManager::getID(const char *Name, Factory Fact,
// only be used for debugging.
//
const char *AnnotationManager::getName(AnnotationID ID) { // ID -> Name
sys::SmartScopedReader<true> Reader(&*AnnotationsLock);
IDMapType &TheMap = *IDMap;
for (IDMapType::iterator I = TheMap.begin(); ; ++I) {
assert(I != TheMap.end() && "Annotation ID is unknown!");
@ -98,10 +102,12 @@ const char *AnnotationManager::getName(AnnotationID ID) { // ID -> Name
//
void AnnotationManager::registerAnnotationFactory(AnnotationID ID, AnnFactory F,
void *ExtraData) {
if (F)
if (F) {
sys::SmartScopedWriter<true> Writer(&*AnnotationsLock);
getFactMap()[ID.ID] = std::make_pair(F, ExtraData);
else
} else {
eraseFromFactMap(ID.ID);
}
}
// createAnnotation - Create an annotation of the specified ID for the
@ -109,7 +115,13 @@ void AnnotationManager::registerAnnotationFactory(AnnotationID ID, AnnFactory F,
//
Annotation *AnnotationManager::createAnnotation(AnnotationID ID,
const Annotable *Obj) {
AnnotationsLock->reader_acquire();
FactMapType::iterator I = getFactMap().find(ID.ID);
if (I == getFactMap().end()) return 0;
if (I == getFactMap().end()) {
AnnotationsLock->reader_release();
return 0;
}
AnnotationsLock->reader_release();
return I->second.first(ID, Obj, I->second.second);
}


@ -16,13 +16,16 @@
#include "llvm/Support/PluginLoader.h"
#include "llvm/Support/Streams.h"
#include "llvm/System/DynamicLibrary.h"
#include "llvm/System/Mutex.h"
#include <ostream>
#include <vector>
using namespace llvm;
static ManagedStatic<std::vector<std::string> > Plugins;
static ManagedStatic<sys::SmartMutex<true> > PluginsLock;
void PluginLoader::operator=(const std::string &Filename) {
sys::SmartScopedLock<true> Lock(&*PluginsLock);
std::string Error;
if (sys::DynamicLibrary::LoadLibraryPermanently(Filename.c_str(), &Error)) {
cerr << "Error opening '" << Filename << "': " << Error
@ -33,10 +36,12 @@ void PluginLoader::operator=(const std::string &Filename) {
}
unsigned PluginLoader::getNumPlugins() {
sys::SmartScopedLock<true> Lock(&*PluginsLock);
return Plugins.isConstructed() ? Plugins->size() : 0;
}
std::string &PluginLoader::getPlugin(unsigned num) {
sys::SmartScopedLock<true> Lock(&*PluginsLock);
assert(Plugins.isConstructed() && num < Plugins->size() &&
"Asking for an out of bounds plugin");
return (*Plugins)[num];


@ -25,6 +25,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Streams.h"
#include "llvm/System/Mutex.h"
#include "llvm/ADT/StringExtras.h"
#include <algorithm>
#include <ostream>
@ -57,13 +58,14 @@ public:
}
static ManagedStatic<StatisticInfo> StatInfo;
static ManagedStatic<sys::Mutex> StatLock;
/// RegisterStatistic - The first time a statistic is bumped, this method is
/// called.
void Statistic::RegisterStatistic() {
// If stats are enabled, inform StatInfo that this statistic should be
// printed.
sys::ScopedLock Writer(&*StatLock);
if (Enabled)
StatInfo->addStatistic(this);
// Remember we have been registered.


@ -15,6 +15,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Streams.h"
#include "llvm/System/Mutex.h"
#include "llvm/System/Process.h"
#include <algorithm>
#include <fstream>
@ -50,25 +51,28 @@ namespace {
cl::Hidden, cl::location(getLibSupportInfoOutputFilename()));
}
static TimerGroup *DefaultTimerGroup = 0;
static ManagedStatic<sys::SmartMutex<true> > TimerLock;
static ManagedStatic<TimerGroup> DefaultTimerGroup;
static TimerGroup *getDefaultTimerGroup() {
if (DefaultTimerGroup) return DefaultTimerGroup;
return DefaultTimerGroup = new TimerGroup("Miscellaneous Ungrouped Timers");
return &*DefaultTimerGroup;
}
Timer::Timer(const std::string &N)
: Elapsed(0), UserTime(0), SystemTime(0), MemUsed(0), PeakMem(0), Name(N),
Started(false), TG(getDefaultTimerGroup()) {
sys::SmartScopedLock<true> Lock(&*TimerLock);
TG->addTimer();
}
Timer::Timer(const std::string &N, TimerGroup &tg)
: Elapsed(0), UserTime(0), SystemTime(0), MemUsed(0), PeakMem(0), Name(N),
Started(false), TG(&tg) {
sys::SmartScopedLock<true> Lock(&*TimerLock);
TG->addTimer();
}
Timer::Timer(const Timer &T) {
sys::SmartScopedLock<true> Lock(&*TimerLock);
TG = T.TG;
if (TG) TG->addTimer();
operator=(T);
@ -77,6 +81,7 @@ Timer::Timer(const Timer &T) {
// Copy ctor, initialize with no TG member.
Timer::Timer(bool, const Timer &T) {
sys::SmartScopedLock<true> Lock(&*TimerLock);
TG = T.TG; // Avoid assertion in operator=
operator=(T); // Copy contents
TG = 0;
@ -84,6 +89,7 @@ Timer::Timer(bool, const Timer &T) {
Timer::~Timer() {
sys::SmartScopedLock<true> Lock(&*TimerLock);
if (TG) {
if (Started) {
Started = false;
@ -129,8 +135,10 @@ static TimeRecord getTimeRecord(bool Start) {
}
static ManagedStatic<std::vector<Timer*> > ActiveTimers;
static ManagedStatic<sys::SmartMutex<true> > ActiveTimerLock;
void Timer::startTimer() {
sys::SmartScopedLock<true> Lock(&*ActiveTimerLock);
Started = true;
ActiveTimers->push_back(this);
TimeRecord TR = getTimeRecord(true);
@ -142,6 +150,7 @@ void Timer::startTimer() {
}
void Timer::stopTimer() {
sys::SmartScopedLock<true> Lock(&*ActiveTimerLock);
TimeRecord TR = getTimeRecord(false);
Elapsed += TR.Elapsed;
UserTime += TR.UserTime;
@ -171,6 +180,7 @@ void Timer::sum(const Timer &T) {
/// currently active timers, which will be printed when the timer group prints
///
void Timer::addPeakMemoryMeasurement() {
sys::SmartScopedLock<true> Lock(&*ActiveTimerLock);
size_t MemUsed = getMemUsage();
for (std::vector<Timer*>::iterator I = ActiveTimers->begin(),
@ -193,7 +203,10 @@ static ManagedStatic<Name2Timer> NamedTimers;
static ManagedStatic<Name2Pair> NamedGroupedTimers;
static ManagedStatic<sys::SmartMutex<true> > NamedTimerLock;
static Timer &getNamedRegionTimer(const std::string &Name) {
sys::SmartScopedLock<true> Lock(&*NamedTimerLock);
Name2Timer::iterator I = NamedTimers->find(Name);
if (I != NamedTimers->end())
return I->second;
@ -203,6 +216,7 @@ static Timer &getNamedRegionTimer(const std::string &Name) {
static Timer &getNamedRegionTimer(const std::string &Name,
const std::string &GroupName) {
sys::SmartScopedLock<true> Lock(&*NamedTimerLock);
Name2Pair::iterator I = NamedGroupedTimers->find(GroupName);
if (I == NamedGroupedTimers->end()) {
@ -340,7 +354,7 @@ void TimerGroup::removeTimer() {
// If this is not a collection of ungrouped times, print the total time.
// Ungrouped timers don't really make sense to add up. We still print the
// TOTAL line to make the percentages make sense.
if (this != DefaultTimerGroup) {
if (this != &*DefaultTimerGroup) {
*OutStream << " Total Execution Time: ";
printAlignedFP(Total.getProcessTime(), 4, 5, *OutStream);
@ -377,11 +391,5 @@ void TimerGroup::removeTimer() {
if (OutStream != cerr.stream() && OutStream != cout.stream())
delete OutStream; // Close the file...
}
// Delete default timer group!
if (NumTimers == 0 && this == DefaultTimerGroup) {
delete DefaultTimerGroup;
DefaultTimerGroup = 0;
}
}


@ -24,19 +24,29 @@ def CC_ARM_APCS : CallingConv<[
CCIfType<[i8, i16], CCPromoteToType<i32>>,
// f64 is passed in pairs of GPRs, possibly split onto the stack
CCIfType<[f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
CCIfType<[i32], CCAssignToStack<4, 4>>,
CCIfType<[f64], CCAssignToStack<8, 4>>
CCIfType<[f64], CCAssignToStack<8, 4>>,
CCIfType<[v2f64], CCAssignToStack<16, 4>>
]>;
def RetCC_ARM_APCS : CallingConv<[
CCIfType<[f32], CCBitConvertToType<i32>>,
CCIfType<[f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
@ -59,7 +69,8 @@ def CC_ARM_AAPCS_Common : CallingConv<[
CCAssignToReg<[R0, R1, R2, R3]>>>,
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
CCIfType<[f64], CCAssignToStack<8, 8>>
CCIfType<[f64], CCAssignToStack<8, 8>>,
CCIfType<[v2f64], CCAssignToStack<16, 8>>
]>;
def RetCC_ARM_AAPCS_Common : CallingConv<[
@ -72,13 +83,21 @@ def RetCC_ARM_AAPCS_Common : CallingConv<[
//===----------------------------------------------------------------------===//
def CC_ARM_AAPCS : CallingConv<[
CCIfType<[f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCDelegateTo<CC_ARM_AAPCS_Common>
]>;
def RetCC_ARM_AAPCS : CallingConv<[
CCIfType<[f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCDelegateTo<RetCC_ARM_AAPCS_Common>
]>;
@ -88,6 +107,10 @@ def RetCC_ARM_AAPCS : CallingConv<[
//===----------------------------------------------------------------------===//
def CC_ARM_AAPCS_VFP : CallingConv<[
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,
@ -95,6 +118,10 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
]>;
def RetCC_ARM_AAPCS_VFP : CallingConv<[
// Handle all vector types as either f64 or v2f64.
CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,


@ -32,6 +32,9 @@
#include "llvm/Support/Debug.h"
using namespace llvm;
static const unsigned arm_dsubreg_0 = 5;
static const unsigned arm_dsubreg_1 = 6;
//===--------------------------------------------------------------------===//
/// ARMDAGToDAGISel - ARM specific code to select ARM machine
/// instructions for SelectionDAG operations.
@ -579,17 +582,18 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
switch (N->getOpcode()) {
default: break;
case ISD::Constant: {
// ARMv6T2 and later should materialize imms via MOV / MOVT pair.
if (Subtarget->hasV6T2Ops() || Subtarget->hasThumb2())
break;
unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
bool UseCP = true;
if (Subtarget->isThumb())
UseCP = (Val > 255 && // MOV
~Val > 255 && // MOV + MVN
!ARM_AM::isThumbImmShiftedVal(Val)); // MOV + LSL
else
if (Subtarget->isThumb()) {
if (Subtarget->hasThumb2())
// Thumb2 has the MOVT instruction, so all immediates can
// be done with MOV + MOVT, at worst.
UseCP = 0;
else
UseCP = (Val > 255 && // MOV
~Val > 255 && // MOV + MVN
!ARM_AM::isThumbImmShiftedVal(Val)); // MOV + LSL
} else
UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV
ARM_AM::getSOImmVal(~Val) == -1 && // MVN
!ARM_AM::isSOImmTwoPartVal(Val)); // two instrs.
@ -917,6 +921,65 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) {
return CurDAG->getTargetNode(TargetInstrInfo::DECLARE, dl,
MVT::Other, Ops, 3);
}
case ISD::CONCAT_VECTORS: {
MVT VT = Op.getValueType();
assert(VT.is128BitVector() && Op.getNumOperands() == 2 &&
"unexpected CONCAT_VECTORS");
SDValue N0 = Op.getOperand(0);
SDValue N1 = Op.getOperand(1);
SDNode *Result =
CurDAG->getTargetNode(TargetInstrInfo::IMPLICIT_DEF, dl, VT);
if (N0.getOpcode() != ISD::UNDEF)
Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
SDValue(Result, 0), N0,
CurDAG->getTargetConstant(arm_dsubreg_0,
MVT::i32));
if (N1.getOpcode() != ISD::UNDEF)
Result = CurDAG->getTargetNode(TargetInstrInfo::INSERT_SUBREG, dl, VT,
SDValue(Result, 0), N1,
CurDAG->getTargetConstant(arm_dsubreg_1,
MVT::i32));
return Result;
}
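
// The CONCAT_VECTORS lowering relies on a 128-bit Q register being exactly
// two 64-bit D subregisters, with arm_dsubreg_0 the low half. As plain data
// the result is simply (the struct stand-in is illustrative):

#include <cstdint>

struct QRegSketch { uint64_t D[2]; };

static QRegSketch concatD(uint64_t Lo, uint64_t Hi) {
  QRegSketch Q = {};   // IMPLICIT_DEF: contents start undefined (zeroed here)
  Q.D[0] = Lo;         // INSERT_SUBREG with arm_dsubreg_0
  Q.D[1] = Hi;         // INSERT_SUBREG with arm_dsubreg_1
  return Q;
}
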
case ISD::VECTOR_SHUFFLE: {
MVT VT = Op.getValueType();
// Match 128-bit splat to VDUPLANEQ. (This could be done with a Pat in
// ARMInstrNEON.td but it is awkward because the shuffle mask needs to be
// transformed first into a lane number and then to both a subregister
// index and an adjusted lane number.) If the source operand is a
// SCALAR_TO_VECTOR, leave it so it will be matched later as a VDUP.
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
if (VT.is128BitVector() && SVOp->isSplat() &&
Op.getOperand(0).getOpcode() != ISD::SCALAR_TO_VECTOR &&
Op.getOperand(1).getOpcode() == ISD::UNDEF) {
unsigned LaneVal = SVOp->getSplatIndex();
MVT HalfVT;
unsigned Opc = 0;
switch (VT.getVectorElementType().getSimpleVT()) {
default: assert(false && "unhandled VDUP splat type");
case MVT::i8: Opc = ARM::VDUPLN8q; HalfVT = MVT::v8i8; break;
case MVT::i16: Opc = ARM::VDUPLN16q; HalfVT = MVT::v4i16; break;
case MVT::i32: Opc = ARM::VDUPLN32q; HalfVT = MVT::v2i32; break;
case MVT::f32: Opc = ARM::VDUPLNfq; HalfVT = MVT::v2f32; break;
}
// The source operand needs to be changed to a subreg of the original
// 128-bit operand, and the lane number needs to be adjusted accordingly.
unsigned NumElts = VT.getVectorNumElements() / 2;
unsigned SRVal = (LaneVal < NumElts ? arm_dsubreg_0 : arm_dsubreg_1);
SDValue SR = CurDAG->getTargetConstant(SRVal, MVT::i32);
SDValue NewLane = CurDAG->getTargetConstant(LaneVal % NumElts, MVT::i32);
SDNode *SubReg = CurDAG->getTargetNode(TargetInstrInfo::EXTRACT_SUBREG,
dl, HalfVT, N->getOperand(0), SR);
return CurDAG->SelectNodeTo(N, Opc, VT, SDValue(SubReg, 0), NewLane);
}
break;
}
}
return SelectCode(Op);
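
// The splat case renumbers a 128-bit lane into a (subregister, lane) pair:
// lane L of an N-element Q vector lives in D subregister L / (N/2) at lane
// L % (N/2), which is what SRVal and NewLane compute above. Standalone
// (illustrative helpers):

static unsigned dupSubreg(unsigned Lane, unsigned NumElts) {
  return Lane < NumElts / 2 ? 5u /*arm_dsubreg_0*/ : 6u /*arm_dsubreg_1*/;
}
static unsigned dupLane(unsigned Lane, unsigned NumElts) {
  return Lane % (NumElts / 2);
}
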

File diff suppressed because it is too large.


@ -67,10 +67,65 @@ namespace llvm {
EH_SJLJ_SETJMP, // SjLj exception handling setjmp
EH_SJLJ_LONGJMP, // SjLj exception handling longjmp
THREAD_POINTER
THREAD_POINTER,
VCEQ, // Vector compare equal.
VCGE, // Vector compare greater than or equal.
VCGEU, // Vector compare unsigned greater than or equal.
VCGT, // Vector compare greater than.
VCGTU, // Vector compare unsigned greater than.
VTST, // Vector test bits.
// Vector shift by immediate:
VSHL, // ...left
VSHRs, // ...right (signed)
VSHRu, // ...right (unsigned)
VSHLLs, // ...left long (signed)
VSHLLu, // ...left long (unsigned)
VSHLLi, // ...left long (with maximum shift count)
VSHRN, // ...right narrow
// Vector rounding shift by immediate:
VRSHRs, // ...right (signed)
VRSHRu, // ...right (unsigned)
VRSHRN, // ...right narrow
// Vector saturating shift by immediate:
VQSHLs, // ...left (signed)
VQSHLu, // ...left (unsigned)
VQSHLsu, // ...left (signed to unsigned)
VQSHRNs, // ...right narrow (signed)
VQSHRNu, // ...right narrow (unsigned)
VQSHRNsu, // ...right narrow (signed to unsigned)
// Vector saturating rounding shift by immediate:
VQRSHRNs, // ...right narrow (signed)
VQRSHRNu, // ...right narrow (unsigned)
VQRSHRNsu, // ...right narrow (signed to unsigned)
// Vector shift and insert:
VSLI, // ...left
VSRI, // ...right
// Vector get lane (VMOV scalar to ARM core register)
// (These are used for 8- and 16-bit element types only.)
VGETLANEu, // zero-extend vector extract element
VGETLANEs, // sign-extend vector extract element
// Vector duplicate lane (128-bit result only; 64-bit is a shuffle)
VDUPLANEQ // splat a lane from a 64-bit vector to a 128-bit vector
};
}
/// Define some predicates that are used for node matching.
namespace ARM {
/// getVMOVImm - If this is a build_vector of constants which can be
/// formed by using a VMOV instruction of the specified element size,
/// return the constant being splatted. The ByteSize field indicates the
/// number of bytes of each element [1248].
SDValue getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
}
//===--------------------------------------------------------------------===//
// ARMTargetLowering - ARM Implementation of the TargetLowering interface
@ -151,6 +206,21 @@ namespace llvm {
///
unsigned ARMPCLabelIndex;
void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT);
typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
void PassF64ArgInRegs(CallSDNode *TheCall, SelectionDAG &DAG,
SDValue Chain, SDValue &Arg,
RegsToPassVector &RegsToPass,
CCValAssign &VA, CCValAssign &NextVA,
SDValue &StackPtr,
SmallVector<SDValue, 8> &MemOpChains,
ISD::ArgFlagsTy Flags);
SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
SDValue &Root, SelectionDAG &DAG, DebugLoc dl);
CCAssignFn *CCAssignFnForNode(unsigned CC, bool Return) const;
SDValue LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
const SDValue &StackPtr, const CCValAssign &VA,


@ -49,6 +49,11 @@ def VFPMiscFrm : Format<22>;
def ThumbFrm : Format<23>;
def NEONFrm : Format<24>;
def NEONGetLnFrm : Format<25>;
def NEONSetLnFrm : Format<26>;
def NEONDupFrm : Format<27>;
// Misc flag for data processing instructions that indicates whether
// the instruction has an Rn register operand.
class UnaryDP { bit isUnaryDataProc = 1; }
@ -737,6 +742,14 @@ class TIx2<dag outs, dag ins, string asm, list<dag> pattern>
class TJTI<dag outs, dag ins, string asm, list<dag> pattern>
: ThumbI<outs, ins, AddrModeNone, SizeSpecial, asm, "", pattern>;
// ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode.
class ThumbPat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb];
}
class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb, HasV5T];
}
//===----------------------------------------------------------------------===//
@ -857,12 +870,102 @@ class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, string opc,
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// ARM NEON Instruction templates.
//
// ThumbPat - Same as Pat<>, but requires that the compiler be in Thumb mode.
class ThumbPat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb];
class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, string asm,
string cstr, list<dag> pattern>
: InstARM<am, Size4Bytes, im, NEONFrm, cstr> {
let OutOperandList = oops;
let InOperandList = iops;
let AsmString = asm;
let Pattern = pattern;
list<Predicate> Predicates = [HasNEON];
}
class ThumbV5Pat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb, HasV5T];
class NI<dag oops, dag iops, string asm, list<dag> pattern>
: NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, "", pattern> {
}
class NDataI<dag oops, dag iops, string asm, string cstr, list<dag> pattern>
: NeonI<oops, iops, AddrModeNone, IndexModeNone, asm, cstr, pattern> {
let Inst{31-25} = 0b1111001;
}
// NEON "one register and a modified immediate" format.
class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
bit op5, bit op4,
dag oops, dag iops, string asm, string cstr, list<dag> pattern>
: NDataI<oops, iops, asm, cstr, pattern> {
let Inst{23} = op23;
let Inst{21-19} = op21_19;
let Inst{11-8} = op11_8;
let Inst{7} = op7;
let Inst{6} = op6;
let Inst{5} = op5;
let Inst{4} = op4;
}
// NEON 2 vector register format.
class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
bits<5> op11_7, bit op6, bit op4,
dag oops, dag iops, string asm, string cstr, list<dag> pattern>
: NDataI<oops, iops, asm, cstr, pattern> {
let Inst{24-23} = op24_23;
let Inst{21-20} = op21_20;
let Inst{19-18} = op19_18;
let Inst{17-16} = op17_16;
let Inst{11-7} = op11_7;
let Inst{6} = op6;
let Inst{4} = op4;
}
// NEON 2 vector register with immediate.
class N2VImm<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
bit op6, bit op4,
dag oops, dag iops, string asm, string cstr, list<dag> pattern>
: NDataI<oops, iops, asm, cstr, pattern> {
let Inst{24} = op24;
let Inst{23} = op23;
let Inst{21-16} = op21_16;
let Inst{11-8} = op11_8;
let Inst{7} = op7;
let Inst{6} = op6;
let Inst{4} = op4;
}
// NEON 3 vector register format.
class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
dag oops, dag iops, string asm, string cstr, list<dag> pattern>
: NDataI<oops, iops, asm, cstr, pattern> {
let Inst{24} = op24;
let Inst{23} = op23;
let Inst{21-20} = op21_20;
let Inst{11-8} = op11_8;
let Inst{6} = op6;
let Inst{4} = op4;
}
// NEON VMOVs between scalar and core registers.
class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
dag oops, dag iops, Format f, string opc, string asm,
list<dag> pattern>
: AI<oops, iops, f, opc, asm, pattern> {
let Inst{27-20} = opcod1;
let Inst{11-8} = opcod2;
let Inst{6-5} = opcod3;
let Inst{4} = 1;
list<Predicate> Predicates = [HasNEON];
}
class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
dag oops, dag iops, string opc, string asm, list<dag> pattern>
: NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONGetLnFrm, opc, asm,
pattern>;
class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
dag oops, dag iops, string opc, string asm, list<dag> pattern>
: NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONSetLnFrm, opc, asm,
pattern>;
class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
dag oops, dag iops, string opc, string asm, list<dag> pattern>
: NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, opc, asm, pattern>;

View File

@ -59,6 +59,8 @@ bool ARMInstrInfo::isMoveInstr(const MachineInstr &MI,
return false;
case ARM::FCPYS:
case ARM::FCPYD:
case ARM::VMOVD:
case ARM::VMOVQ:
SrcReg = MI.getOperand(1).getReg();
DstReg = MI.getOperand(0).getReg();
return true;
@ -528,6 +530,8 @@ bool ARMInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
else if (DestRC == ARM::DPRRegisterClass)
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::FCPYD), DestReg)
.addReg(SrcReg));
else if (DestRC == ARM::QPRRegisterClass)
BuildMI(MBB, I, DL, get(ARM::VMOVQ), DestReg).addReg(SrcReg);
else
return false;
@ -844,6 +848,10 @@ canFoldMemoryOperand(const MachineInstr *MI,
case ARM::FCPYS:
case ARM::FCPYD:
return true;
case ARM::VMOVD:
case ARM::VMOVQ:
return false; // FIXME
}
return false;

View File

@ -114,6 +114,12 @@ namespace ARMII {
// Thumb format
ThumbFrm = 23 << FormShift,
// NEON format
NEONFrm = 24 << FormShift,
NEONGetLnFrm = 25 << FormShift,
NEONSetLnFrm = 26 << FormShift,
NEONDupFrm = 27 << FormShift,
//===------------------------------------------------------------------===//
// Field shifts - such shifts are used to set field while generating
// machine instructions.

View File

@ -93,9 +93,15 @@ def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", SDT_ARMEH_SJLJ_Setjmp>;
def HasV5T : Predicate<"Subtarget->hasV5TOps()">;
def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">;
def HasV6 : Predicate<"Subtarget->hasV6Ops()">;
def HasV7 : Predicate<"Subtarget->hasV7Ops()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2()">;
def HasVFP3 : Predicate<"Subtarget->hasVFP3()">;
def HasNEON : Predicate<"Subtarget->hasNEON()">;
def IsThumb : Predicate<"Subtarget->isThumb()">;
def HasThumb2 : Predicate<"Subtarget->hasThumb2()">;
def IsARM : Predicate<"!Subtarget->isThumb()">;
def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">;
def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">;
//===----------------------------------------------------------------------===//
// ARM Flag Definitions.
@ -518,6 +524,24 @@ def PICSTRB : AXI2stb<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
}
} // isNotDuplicable = 1
// LEApcrel - Load a pc-relative address into a register without offending the
// assembler.
def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p), Pseudo,
!strconcat(!strconcat(".set PCRELV${:uid}, ($label-(",
"${:private}PCRELL${:uid}+8))\n"),
!strconcat("${:private}PCRELL${:uid}:\n\t",
"add$p $dst, pc, #PCRELV${:uid}")),
[]>;
def LEApcrelJT : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, i32imm:$id, pred:$p),
Pseudo,
!strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(",
"${:private}PCRELL${:uid}+8))\n"),
!strconcat("${:private}PCRELL${:uid}:\n\t",
"add$p $dst, pc, #PCRELV${:uid}")),
[]>;
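(In both expansions, the +8 in the .set arithmetic accounts for the ARM convention that reading pc yields the address of the current instruction plus 8.)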
//===----------------------------------------------------------------------===//
// Control Flow Instructions.
//
@ -539,21 +563,22 @@ let isReturn = 1, isTerminator = 1 in
LdStMulFrm, "ldm${p}${addr:submode} $addr, $dst1",
[]>;
// On non-Darwin platforms R9 is callee-saved.
let isCall = 1, Itinerary = IIC_Br,
Defs = [R0, R1, R2, R3, R12, LR,
D0, D1, D2, D3, D4, D5, D6, D7, CPSR] in {
def BL : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops),
"bl ${func:call}",
[(ARMcall tglobaladdr:$func)]>;
[(ARMcall tglobaladdr:$func)]>, Requires<[IsNotDarwin]>;
def BL_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops),
"bl", " ${func:call}",
[(ARMcall_pred tglobaladdr:$func)]>;
[(ARMcall_pred tglobaladdr:$func)]>, Requires<[IsNotDarwin]>;
// ARMv5T and above
def BLX : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm,
"blx $func",
[(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T]> {
[(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsNotDarwin]> {
let Inst{7-4} = 0b0011;
let Inst{19-8} = 0b111111111111;
let Inst{27-20} = 0b00010010;
@ -563,7 +588,36 @@ let isCall = 1, Itinerary = IIC_Br,
// ARMv4T
def BX : ABXIx2<(outs), (ins GPR:$func, variable_ops),
"mov lr, pc\n\tbx $func",
[(ARMcall_nolink GPR:$func)]>;
[(ARMcall_nolink GPR:$func)]>, Requires<[IsNotDarwin]>;
}
}
// On Darwin R9 is call-clobbered.
let isCall = 1, Itinerary = IIC_Br,
Defs = [R0, R1, R2, R3, R9, R12, LR,
D0, D1, D2, D3, D4, D5, D6, D7, CPSR] in {
def BLr9 : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops),
"bl ${func:call}",
[(ARMcall tglobaladdr:$func)]>, Requires<[IsDarwin]>;
def BLr9_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops),
"bl", " ${func:call}",
[(ARMcall_pred tglobaladdr:$func)]>, Requires<[IsDarwin]>;
// ARMv5T and above
def BLXr9 : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm,
"blx $func",
[(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsDarwin]> {
let Inst{7-4} = 0b0011;
let Inst{19-8} = 0b111111111111;
let Inst{27-20} = 0b00010010;
}
let Uses = [LR] in {
// ARMv4T
def BXr9 : ABXIx2<(outs), (ins GPR:$func, variable_ops),
"mov lr, pc\n\tbx $func",
[(ARMcall_nolink GPR:$func)]>, Requires<[IsDarwin]>;
}
}
@ -823,9 +877,9 @@ defm UXTH : AI_unary_rrot<0b01101111,
defm UXTB16 : AI_unary_rrot<0b01101100,
"uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
def : ARMV6Pat<(and (shl GPR:$Src, 8), 0xFF00FF),
def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF),
(UXTB16r_rot GPR:$Src, 24)>;
def : ARMV6Pat<(and (srl GPR:$Src, 8), 0xFF00FF),
def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF),
(UXTB16r_rot GPR:$Src, 8)>;
defm UXTAB : AI_bin_rrot<0b01101110, "uxtab",
@ -1006,7 +1060,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
def BT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
!strconcat(opc, "bt"), " $dst, $a, $b",
[(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
(sra GPR:$b, 16)))]>,
(sra GPR:$b, (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 0;
let Inst{6} = 1;
@ -1014,7 +1068,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
def TB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
!strconcat(opc, "tb"), " $dst, $a, $b",
[(set GPR:$dst, (opnode (sra GPR:$a, 16),
[(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
(sext_inreg GPR:$b, i16)))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 1;
@ -1023,8 +1077,8 @@ multiclass AI_smul<string opc, PatFrag opnode> {
def TT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
!strconcat(opc, "tt"), " $dst, $a, $b",
[(set GPR:$dst, (opnode (sra GPR:$a, 16),
(sra GPR:$b, 16)))]>,
[(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
(sra GPR:$b, (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 1;
let Inst{6} = 1;
@ -1033,7 +1087,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
!strconcat(opc, "wb"), " $dst, $a, $b",
[(set GPR:$dst, (sra (opnode GPR:$a,
(sext_inreg GPR:$b, i16)), 16))]>,
(sext_inreg GPR:$b, i16)), (i32 16)))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 1;
let Inst{6} = 0;
@ -1042,7 +1096,7 @@ multiclass AI_smul<string opc, PatFrag opnode> {
def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
!strconcat(opc, "wt"), " $dst, $a, $b",
[(set GPR:$dst, (sra (opnode GPR:$a,
(sra GPR:$b, 16)), 16))]>,
(sra GPR:$b, (i32 16))), (i32 16)))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 1;
let Inst{6} = 1;
@ -1064,7 +1118,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
def BT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
!strconcat(opc, "bt"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16),
(sra GPR:$b, 16))))]>,
(sra GPR:$b, (i32 16)))))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 0;
let Inst{6} = 1;
@ -1072,7 +1126,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
def TB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
!strconcat(opc, "tb"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, 16),
[(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
(sext_inreg GPR:$b, i16))))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 1;
@ -1081,8 +1135,8 @@ multiclass AI_smla<string opc, PatFrag opnode> {
def TT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
!strconcat(opc, "tt"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, 16),
(sra GPR:$b, 16))))]>,
[(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
(sra GPR:$b, (i32 16)))))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 1;
let Inst{6} = 1;
@ -1091,7 +1145,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
!strconcat(opc, "wb"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
(sext_inreg GPR:$b, i16)), 16)))]>,
(sext_inreg GPR:$b, i16)), (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 0;
let Inst{6} = 0;
@ -1100,7 +1154,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
!strconcat(opc, "wt"), " $dst, $a, $b, $acc",
[(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
(sra GPR:$b, 16)), 16)))]>,
(sra GPR:$b, (i32 16))), (i32 16))))]>,
Requires<[IsARM, HasV5TE]> {
let Inst{5} = 0;
let Inst{6} = 1;
@ -1136,10 +1190,10 @@ def REV : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src),
def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src),
"rev16", " $dst, $src",
[(set GPR:$dst,
(or (and (srl GPR:$src, 8), 0xFF),
(or (and (shl GPR:$src, 8), 0xFF00),
(or (and (srl GPR:$src, 8), 0xFF0000),
(and (shl GPR:$src, 8), 0xFF000000)))))]>,
(or (and (srl GPR:$src, (i32 8)), 0xFF),
(or (and (shl GPR:$src, (i32 8)), 0xFF00),
(or (and (srl GPR:$src, (i32 8)), 0xFF0000),
(and (shl GPR:$src, (i32 8)), 0xFF000000)))))]>,
Requires<[IsARM, HasV6]> {
let Inst{7-4} = 0b1011;
let Inst{11-8} = 0b1111;
@ -1150,8 +1204,8 @@ def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src),
"revsh", " $dst, $src",
[(set GPR:$dst,
(sext_inreg
(or (srl (and GPR:$src, 0xFF00), 8),
(shl GPR:$src, 8)), i16))]>,
(or (srl (and GPR:$src, 0xFF00), (i32 8)),
(shl GPR:$src, (i32 8))), i16))]>,
Requires<[IsARM, HasV6]> {
let Inst{7-4} = 0b1011;
let Inst{11-8} = 0b1111;
@ -1186,7 +1240,7 @@ def PKHTB : AMiscA1I<0b01101000, (outs GPR:$dst),
// Alternate cases for PKHTB where identities eliminate some nodes. Note that
// a shift amount of 0 is *not legal* here; use PKHBT instead.
def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, 16)),
def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, (i32 16))),
(PKHTB GPR:$src1, GPR:$src2, 16)>;
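For reference, the scalar computation matched by the pattern above, written out as a hand-coded C++ equivalent (illustrative only, not code from this commit):

#include <cstdint>

// Keep the top halfword of a; pack the top halfword of b into the bottom.
uint32_t pkhtb_asr16(uint32_t a, uint32_t b) {
  return (a & 0xFFFF0000u) | (b >> 16);
}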
def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000),
(and (srl GPR:$src2, imm1_15:$shamt), 0xFFFF)),
@ -1240,23 +1294,6 @@ def MOVCCi : AI1<0b1101, (outs GPR:$dst),
RegConstraint<"$false = $dst">, UnaryDP;
// LEApcrel - Load a pc-relative address into a register without offending the
// assembler.
def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p), Pseudo,
!strconcat(!strconcat(".set PCRELV${:uid}, ($label-(",
"${:private}PCRELL${:uid}+8))\n"),
!strconcat("${:private}PCRELL${:uid}:\n\t",
"add$p $dst, pc, #PCRELV${:uid}")),
[]>;
def LEApcrelJT : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, i32imm:$id, pred:$p),
Pseudo,
!strconcat(!strconcat(".set PCRELV${:uid}, (${label}_${id:no_hash}-(",
"${:private}PCRELL${:uid}+8))\n"),
!strconcat("${:private}PCRELL${:uid}:\n\t",
"add$p $dst, pc, #PCRELV${:uid}")),
[]>;
//===----------------------------------------------------------------------===//
// TLS Instructions
//
@ -1321,7 +1358,10 @@ def : ARMPat<(xor GPR:$LHS, so_imm2part:$RHS),
// Direct calls
def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>;
def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>,
Requires<[IsNotDarwin]>;
def : ARMPat<(ARMcall texternalsym:$func), (BLr9 texternalsym:$func)>,
Requires<[IsDarwin]>;
// zextload i1 -> zextload i8
def : ARMPat<(zextloadi1 addrmode2:$addr), (LDRB addrmode2:$addr)>;
@ -1335,47 +1375,54 @@ def : ARMPat<(extloadi8 addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>;
def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>;
// smul* and smla*
def : ARMV5TEPat<(mul (sra (shl GPR:$a, 16), 16), (sra (shl GPR:$b, 16), 16)),
def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
(sra (shl GPR:$b, (i32 16)), (i32 16))),
(SMULBB GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b),
(SMULBB GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(mul (sra (shl GPR:$a, 16), 16), (sra GPR:$b, 16)),
def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
(sra GPR:$b, (i32 16))),
(SMULBT GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, 16)),
def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))),
(SMULBT GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(mul (sra GPR:$a, 16), (sra (shl GPR:$b, 16), 16)),
def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)),
(sra (shl GPR:$b, (i32 16)), (i32 16))),
(SMULTB GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(mul (sra GPR:$a, 16), sext_16_node:$b),
def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b),
(SMULTB GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, 16), 16)), 16),
def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
(i32 16)),
(SMULWB GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), 16),
def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), (i32 16)),
(SMULWB GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(add GPR:$acc,
(mul (sra (shl GPR:$a, 16), 16),
(sra (shl GPR:$b, 16), 16))),
(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
(sra (shl GPR:$b, (i32 16)), (i32 16)))),
(SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(add GPR:$acc,
(mul sext_16_node:$a, sext_16_node:$b)),
(SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(add GPR:$acc,
(mul (sra (shl GPR:$a, 16), 16), (sra GPR:$b, 16))),
(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
(sra GPR:$b, (i32 16)))),
(SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(add GPR:$acc,
(mul sext_16_node:$a, (sra GPR:$b, 16))),
(mul sext_16_node:$a, (sra GPR:$b, (i32 16)))),
(SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(add GPR:$acc,
(mul (sra GPR:$a, 16), (sra (shl GPR:$b, 16), 16))),
(mul (sra GPR:$a, (i32 16)),
(sra (shl GPR:$b, (i32 16)), (i32 16)))),
(SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(add GPR:$acc,
(mul (sra GPR:$a, 16), sext_16_node:$b)),
(mul (sra GPR:$a, (i32 16)), sext_16_node:$b)),
(SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(add GPR:$acc,
(sra (mul GPR:$a, (sra (shl GPR:$b, 16), 16)), 16)),
(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
(i32 16))),
(SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
def : ARMV5TEPat<(add GPR:$acc,
(sra (mul GPR:$a, sext_16_node:$b), 16)),
(sra (mul GPR:$a, sext_16_node:$b), (i32 16))),
(SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
//===----------------------------------------------------------------------===//
@ -1395,3 +1442,9 @@ include "ARMInstrThumb2.td"
//
include "ARMInstrVFP.td"
//===----------------------------------------------------------------------===//
// Advanced SIMD (NEON) Support
//
include "ARMInstrNEON.td"

File diff suppressed because it is too large.

View File

@ -319,7 +319,7 @@ def tAND : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
def tASRri : TI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
"asr $dst, $lhs, $rhs",
[(set tGPR:$dst, (sra tGPR:$lhs, imm:$rhs))]>;
[(set tGPR:$dst, (sra tGPR:$lhs, (i32 imm:$rhs)))]>;
def tASRrr : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
"asr $dst, $rhs",
@ -367,7 +367,7 @@ def tEOR : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
def tLSLri : TI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
"lsl $dst, $lhs, $rhs",
[(set tGPR:$dst, (shl tGPR:$lhs, imm:$rhs))]>;
[(set tGPR:$dst, (shl tGPR:$lhs, (i32 imm:$rhs)))]>;
def tLSLrr : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
"lsl $dst, $rhs",
@ -375,7 +375,7 @@ def tLSLrr : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
def tLSRri : TI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs),
"lsr $dst, $lhs, $rhs",
[(set tGPR:$dst, (srl tGPR:$lhs, imm:$rhs))]>;
[(set tGPR:$dst, (srl tGPR:$lhs, (i32 imm:$rhs)))]>;
def tLSRrr : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
"lsr $dst, $rhs",
@ -429,18 +429,18 @@ def tREV : TI<(outs tGPR:$dst), (ins tGPR:$src),
def tREV16 : TI<(outs tGPR:$dst), (ins tGPR:$src),
"rev16 $dst, $src",
[(set tGPR:$dst,
(or (and (srl tGPR:$src, 8), 0xFF),
(or (and (shl tGPR:$src, 8), 0xFF00),
(or (and (srl tGPR:$src, 8), 0xFF0000),
(and (shl tGPR:$src, 8), 0xFF000000)))))]>,
(or (and (srl tGPR:$src, (i32 8)), 0xFF),
(or (and (shl tGPR:$src, (i32 8)), 0xFF00),
(or (and (srl tGPR:$src, (i32 8)), 0xFF0000),
(and (shl tGPR:$src, (i32 8)), 0xFF000000)))))]>,
Requires<[IsThumb, HasV6]>;
def tREVSH : TI<(outs tGPR:$dst), (ins tGPR:$src),
"revsh $dst, $src",
[(set tGPR:$dst,
(sext_inreg
(or (srl (and tGPR:$src, 0xFFFF), 8),
(shl tGPR:$src, 8)), i16))]>,
(or (srl (and tGPR:$src, 0xFFFF), (i32 8)),
(shl tGPR:$src, (i32 8))), i16))]>,
Requires<[IsThumb, HasV6]>;
def tROR : TIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),

View File

@ -160,7 +160,7 @@ def tMOVi16 : PseudoInst<(outs GPR:$dst), (ins i32imm:$src),
[(set GPR:$dst, imm0_65535:$src)]>,
Requires<[HasThumb2]>;
let isTwoAddress = 1 in
let Constraints = "$src = $dst" in
def tMOVTi16 : PseudoInst<(outs GPR:$dst), (ins GPR:$src, i32imm:$imm),
"movt $dst, $imm",
[(set GPR:$dst, (or (and GPR:$src, 0xffff),

View File

@ -235,8 +235,10 @@ ARMRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
};
static const unsigned DarwinCalleeSavedRegs[] = {
// The Darwin ABI deviates from the standard ARM ABI: R9 is not a
// callee-saved register.
ARM::LR, ARM::R7, ARM::R6, ARM::R5, ARM::R4,
ARM::R11, ARM::R10, ARM::R9, ARM::R8,
ARM::R11, ARM::R10, ARM::R8,
ARM::D15, ARM::D14, ARM::D13, ARM::D12,
ARM::D11, ARM::D10, ARM::D9, ARM::D8,
@ -256,6 +258,7 @@ ARMRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
&ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
0
};
static const TargetRegisterClass * const ThumbCalleeSavedRegClasses[] = {
&ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
&ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::tGPRRegClass,
@ -265,7 +268,33 @@ ARMRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
&ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
0
};
return STI.isThumb() ? ThumbCalleeSavedRegClasses : CalleeSavedRegClasses;
static const TargetRegisterClass * const DarwinCalleeSavedRegClasses[] = {
&ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
&ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
&ARM::GPRRegClass, &ARM::GPRRegClass,
&ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
&ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
0
};
static const TargetRegisterClass * const DarwinThumbCalleeSavedRegClasses[] ={
&ARM::GPRRegClass, &ARM::tGPRRegClass, &ARM::tGPRRegClass,
&ARM::tGPRRegClass, &ARM::tGPRRegClass, &ARM::GPRRegClass,
&ARM::GPRRegClass, &ARM::GPRRegClass,
&ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
&ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
0
};
if (STI.isThumb()) {
return STI.isTargetDarwin()
? DarwinThumbCalleeSavedRegClasses : ThumbCalleeSavedRegClasses;
}
return STI.isTargetDarwin()
? DarwinCalleeSavedRegClasses : CalleeSavedRegClasses;
}
BitVector ARMRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@ -497,7 +526,9 @@ ARMRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
///
bool ARMRegisterInfo::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
return NoFramePointerElim || MFI->hasVarSizedObjects();
return (NoFramePointerElim ||
MFI->hasVarSizedObjects() ||
MFI->isFrameAddressTaken());
}
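Either condition is easy to trigger from ordinary source code; a minimal sketch, with illustrative function names (alloca.h assumed available):

#include <alloca.h>

// Taking the frame address sets MachineFrameInfo::isFrameAddressTaken().
void *frame_addr() { return __builtin_frame_address(0); }

// A dynamically sized stack allocation sets hasVarSizedObjects().
int touch_dynamic_stack(unsigned n) {
  char *buf = static_cast<char *>(alloca(n));
  buf[0] = 1;
  return buf[0];
}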
// hasReservedCallFrame - Under normal circumstances, when a frame pointer is

View File

@ -77,6 +77,34 @@ def D13 : ARMReg<13, "d13", [S26, S27]>;
def D14 : ARMReg<14, "d14", [S28, S29]>;
def D15 : ARMReg<15, "d15", [S30, S31]>;
// VFP3 defines 16 additional double registers
def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d17">;
def D18 : ARMFReg<18, "d18">; def D19 : ARMFReg<19, "d19">;
def D20 : ARMFReg<20, "d20">; def D21 : ARMFReg<21, "d21">;
def D22 : ARMFReg<22, "d22">; def D23 : ARMFReg<23, "d23">;
def D24 : ARMFReg<24, "d24">; def D25 : ARMFReg<25, "d25">;
def D26 : ARMFReg<26, "d26">; def D27 : ARMFReg<27, "d27">;
def D28 : ARMFReg<28, "d28">; def D29 : ARMFReg<29, "d29">;
def D30 : ARMFReg<30, "d30">; def D31 : ARMFReg<31, "d31">;
// Advanced SIMD (NEON) defines 16 quad-word aliases
def Q0 : ARMReg< 0, "q0", [D0, D1]>;
def Q1 : ARMReg< 1, "q1", [D2, D3]>;
def Q2 : ARMReg< 2, "q2", [D4, D5]>;
def Q3 : ARMReg< 3, "q3", [D6, D7]>;
def Q4 : ARMReg< 4, "q4", [D8, D9]>;
def Q5 : ARMReg< 5, "q5", [D10, D11]>;
def Q6 : ARMReg< 6, "q6", [D12, D13]>;
def Q7 : ARMReg< 7, "q7", [D14, D15]>;
def Q8 : ARMReg< 8, "q8", [D16, D17]>;
def Q9 : ARMReg< 9, "q9", [D18, D19]>;
def Q10 : ARMReg<10, "q10", [D20, D21]>;
def Q11 : ARMReg<11, "q11", [D22, D23]>;
def Q12 : ARMReg<12, "q12", [D24, D25]>;
def Q13 : ARMReg<13, "q13", [D26, D27]>;
def Q14 : ARMReg<14, "q14", [D28, D29]>;
def Q15 : ARMReg<15, "q15", [D30, D31]>;
// Current Program Status Register.
def CPSR : ARMReg<0, "cpsr">;
@ -87,6 +115,7 @@ def CPSR : ARMReg<0, "cpsr">;
// sp == Stack Pointer
// r12 == ip (scratch)
// r7 == Frame Pointer (thumb-style backtraces)
// r9 == May be reserved as Thread Register
// r11 == Frame Pointer (arm-style backtraces)
// r10 == Stack Limit
//
@ -115,13 +144,13 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
ARM::R4, ARM::R5, ARM::R6, ARM::R7,
ARM::R8, ARM::R10,
ARM::R11 };
// FP is R7, R9 is available.
// FP is R7, R9 is available as a non-callee-saved register.
// This is used by Darwin.
static const unsigned ARM_GPR_AO_3[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3,
ARM::R12,ARM::LR,
ARM::R9, ARM::R12,ARM::LR,
ARM::R4, ARM::R5, ARM::R6,
ARM::R8, ARM::R9, ARM::R10,ARM::R11,
ARM::R7 };
ARM::R8, ARM::R10,ARM::R11,ARM::R7 };
// FP is R7, R9 is not available.
static const unsigned ARM_GPR_AO_4[] = {
ARM::R0, ARM::R1, ARM::R2, ARM::R3,
@ -155,17 +184,15 @@ def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
GPRClass::iterator I;
if (Subtarget.isTargetDarwin()) {
if (Subtarget.isR9Reserved()) {
if (Subtarget.isR9Reserved())
I = ARM_GPR_AO_4 + (sizeof(ARM_GPR_AO_4)/sizeof(unsigned));
} else {
else
I = ARM_GPR_AO_3 + (sizeof(ARM_GPR_AO_3)/sizeof(unsigned));
}
} else {
if (Subtarget.isR9Reserved()) {
if (Subtarget.isR9Reserved())
I = ARM_GPR_AO_2 + (sizeof(ARM_GPR_AO_2)/sizeof(unsigned));
} else {
else
I = ARM_GPR_AO_1 + (sizeof(ARM_GPR_AO_1)/sizeof(unsigned));
}
}
// Mac OS X requires FP not to be clobbered for backtracing purpose.
@ -208,14 +235,67 @@ def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {
}];
}
// Scalar single precision floating point register class.
def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22,
S23, S24, S25, S26, S27, S28, S29, S30, S31]>;
// Scalar double precision floating point / generic 64-bit vector register
// class.
// ARM requires only word alignment for doubles, though double-word
// alignment is more performant.
def DPR : RegisterClass<"ARM", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7, D8,
D9, D10, D11, D12, D13, D14, D15]>;
def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
[D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15]> {
let SubRegClassList = [SPR, SPR];
let MethodProtos = [{
iterator allocation_order_begin(const MachineFunction &MF) const;
iterator allocation_order_end(const MachineFunction &MF) const;
}];
let MethodBodies = [{
// VFP2
static const unsigned ARM_DPR_VFP2[] = {
ARM::D0, ARM::D1, ARM::D2, ARM::D3,
ARM::D4, ARM::D5, ARM::D6, ARM::D7,
ARM::D8, ARM::D9, ARM::D10, ARM::D11,
ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
// VFP3
static const unsigned ARM_DPR_VFP3[] = {
ARM::D0, ARM::D1, ARM::D2, ARM::D3,
ARM::D4, ARM::D5, ARM::D6, ARM::D7,
ARM::D8, ARM::D9, ARM::D10, ARM::D11,
ARM::D12, ARM::D13, ARM::D14, ARM::D15,
ARM::D16, ARM::D17, ARM::D18, ARM::D19,
ARM::D20, ARM::D21, ARM::D22, ARM::D23,
ARM::D24, ARM::D25, ARM::D26, ARM::D27,
ARM::D28, ARM::D29, ARM::D30, ARM::D31 };
DPRClass::iterator
DPRClass::allocation_order_begin(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
if (Subtarget.hasVFP3())
return ARM_DPR_VFP3;
return ARM_DPR_VFP2;
}
DPRClass::iterator
DPRClass::allocation_order_end(const MachineFunction &MF) const {
const TargetMachine &TM = MF.getTarget();
const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
if (Subtarget.hasVFP3())
return ARM_DPR_VFP3 + (sizeof(ARM_DPR_VFP3)/sizeof(unsigned));
else
return ARM_DPR_VFP2 + (sizeof(ARM_DPR_VFP2)/sizeof(unsigned));
}
}];
}
// Generic 128-bit vector register class.
def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15]> {
let SubRegClassList = [SPR, SPR, SPR, SPR, DPR, DPR];
}
// Condition code registers.
def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>;
@ -225,12 +305,40 @@ def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>;
// sub registers for each register.
//
def : SubRegSet<1, [D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15],
[S0, S2, S4, S6, S8, S10, S12, S14,
S16, S18, S20, S22, S24, S26, S28, S30]>;
def arm_ssubreg_0 : PatLeaf<(i32 1)>;
def arm_ssubreg_1 : PatLeaf<(i32 2)>;
def arm_ssubreg_2 : PatLeaf<(i32 3)>;
def arm_ssubreg_3 : PatLeaf<(i32 4)>;
def arm_dsubreg_0 : PatLeaf<(i32 5)>;
def arm_dsubreg_1 : PatLeaf<(i32 6)>;
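(These PatLeaf indices match the SubRegSet indices defined below, and the hard-coded getSubReg(Reg, 5) / getSubReg(Reg, 6) calls in the asm printer's dregpair handling elsewhere in this commit.)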
def : SubRegSet<2, [D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15],
[S1, S3, S5, S7, S9, S11, S13, S15,
// S sub-registers of D registers.
def : SubRegSet<1, [D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15],
[S0, S2, S4, S6, S8, S10, S12, S14,
S16, S18, S20, S22, S24, S26, S28, S30]>;
def : SubRegSet<2, [D0, D1, D2, D3, D4, D5, D6, D7,
D8, D9, D10, D11, D12, D13, D14, D15],
[S1, S3, S5, S7, S9, S11, S13, S15,
S17, S19, S21, S23, S25, S27, S29, S31]>;
// S sub-registers of Q registers.
def : SubRegSet<1, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
[S0, S4, S8, S12, S16, S20, S24, S28]>;
def : SubRegSet<2, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
[S1, S5, S9, S13, S17, S21, S25, S29]>;
def : SubRegSet<3, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
[S2, S6, S10, S14, S18, S22, S26, S30]>;
def : SubRegSet<4, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7],
[S3, S7, S11, S15, S19, S23, S27, S31]>;
// D sub-registers of Q registers.
def : SubRegSet<5, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
[D0, D2, D4, D6, D8, D10, D12, D14,
D16, D18, D20, D22, D24, D26, D28, D30]>;
def : SubRegSet<6, [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7,
Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15],
[D1, D3, D5, D7, D9, D11, D13, D15,
D17, D19, D21, D23, D25, D27, D29, D31]>;

View File

@ -16,15 +16,20 @@
#include "llvm/Module.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
static cl::opt<bool>
ReserveR9("arm-reserve-r9", cl::Hidden,
cl::desc("Reserve R9, making it unavailable as GPR"));
ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
bool isThumb)
: ARMArchVersion(V4T)
, ARMFPUType(None)
, IsThumb(isThumb)
, ThumbMode(Thumb1)
, IsR9Reserved(false)
, IsR9Reserved(ReserveR9)
, stackAlignment(4)
, CPUString("generic")
, TargetType(isELF) // Default to ELF unless otherwise specified.
@ -46,7 +51,7 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
if (Len >= 5 && TT.substr(0, 4) == "armv")
Idx = 4;
else if (Len >= 6 && TT.substr(0, 6) == "thumb") {
else if (Len >= 6 && TT.substr(0, 5) == "thumb") {
IsThumb = true;
if (Len >= 7 && TT[5] == 'v')
Idx = 6;
@ -54,15 +59,19 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
if (Idx) {
unsigned SubVer = TT[Idx];
if (SubVer > '4' && SubVer <= '9') {
if (SubVer >= '7')
if (SubVer >= '7') {
ARMArchVersion = V7A;
else if (SubVer == '6')
} else if (SubVer == '6') {
ARMArchVersion = V6;
else if (SubVer == '5') {
if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2')
ARMArchVersion = V6T2;
} else if (SubVer == '5') {
ARMArchVersion = V5T;
if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e')
ARMArchVersion = V5TE;
}
if (ARMArchVersion >= V6T2)
ThumbMode = Thumb2;
}
}
@ -83,5 +92,5 @@ ARMSubtarget::ARMSubtarget(const Module &M, const std::string &FS,
stackAlignment = 8;
if (isTargetDarwin())
IsR9Reserved = true;
IsR9Reserved = ReserveR9 | (ARMArchVersion < V6);
}
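With this change, R9 is reserved on Darwin only when the new -arm-reserve-r9 flag is given or when targeting a pre-V6 architecture, rather than unconditionally; the flag can presumably be exercised with an invocation along the lines of "llc -march=arm -arm-reserve-r9 input.ll".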

View File

@ -285,12 +285,22 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
const char *Modifier) {
const MachineOperand &MO = MI->getOperand(opNum);
switch (MO.getType()) {
case MachineOperand::MO_Register:
if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
O << TM.getRegisterInfo()->get(MO.getReg()).AsmName;
else
case MachineOperand::MO_Register: {
unsigned Reg = MO.getReg();
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
if (Modifier && strcmp(Modifier, "dregpair") == 0) {
unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0
unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1
O << '{'
<< TRI->getAsmName(DRegLo) << "-" << TRI->getAsmName(DRegHi)
<< '}';
} else {
O << TRI->getAsmName(Reg);
}
} else
assert(0 && "not implemented");
break;
}
case MachineOperand::MO_Immediate: {
if (!Modifier || strcmp(Modifier, "no_hash") != 0)
O << "#";

View File

@ -552,3 +552,23 @@ __Z11no_overflowjj:
//===---------------------------------------------------------------------===//
Some of the NEON intrinsics may be appropriate for more general use, either
as target-independent intrinsics or perhaps elsewhere in the ARM backend.
Some of them may also be lowered to target-independent SDNodes, and perhaps
some new SDNodes could be added.
For example, maximum, minimum, and absolute value operations are well-defined
and standard operations, both for vector and scalar types.
The current NEON-specific intrinsics for count leading zeros and count one
bits could perhaps be replaced by the target-independent ctlz and ctpop
intrinsics. It may also make sense to add a target-independent "ctls"
intrinsic for "count leading sign bits". Likewise, the backend could use
the target-independent SDNodes for these operations.
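For the scalar forms, the target-independent intrinsics are already reachable from C++ through compiler builtins; a small illustration (function names are ours):

#include <cstdint>

// Clang lowers these builtins to the target-independent llvm.ctlz.i32
// and llvm.ctpop.i32 intrinsics discussed above.
uint32_t leading_zeros(uint32_t x) { return x ? __builtin_clz(x) : 32; }
uint32_t one_bits(uint32_t x)      { return __builtin_popcount(x); }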
ARMv6 has scalar saturating and halving adds and subtracts. The same
intrinsics could possibly be used for both NEON's vector implementations of
those operations and the ARMv6 scalar versions.
//===---------------------------------------------------------------------===//

View File

@ -702,10 +702,12 @@ void PIC16TargetLowering::LegalizeAddress(SDValue Ptr, SelectionDAG &DAG,
if (Ptr.getOpcode() == ISD::ADD) {
SDValue OperLeft = Ptr.getOperand(0);
SDValue OperRight = Ptr.getOperand(1);
if (OperLeft.getOpcode() == ISD::Constant) {
if ((OperLeft.getOpcode() == ISD::Constant) &&
(dyn_cast<ConstantSDNode>(OperLeft)->getZExtValue() < 32)) {
Offset = dyn_cast<ConstantSDNode>(OperLeft)->getZExtValue();
Ptr = OperRight;
} else if (OperRight.getOpcode() == ISD::Constant) {
} else if ((OperRight.getOpcode() == ISD::Constant) &&
(dyn_cast<ConstantSDNode>(OperRight)->getZExtValue() < 32)) {
Offset = dyn_cast<ConstantSDNode>(OperRight)->getZExtValue();
Ptr = OperLeft;
}

View File

@ -23,6 +23,7 @@
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/System/Mutex.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringExtras.h"
#include <algorithm>
@ -345,11 +346,13 @@ typedef DenseMap<LayoutKey, StructLayout*, DenseMapLayoutKeyInfo> LayoutInfoTy;
}
static ManagedStatic<LayoutInfoTy> LayoutInfo;
static ManagedStatic<sys::SmartMutex<true> > LayoutLock;
TargetData::~TargetData() {
if (!LayoutInfo.isConstructed())
return;
sys::SmartScopedLock<true> Lock(&*LayoutLock);
// Remove any layouts for this TD.
LayoutInfoTy &TheMap = *LayoutInfo;
for (LayoutInfoTy::iterator I = TheMap.begin(), E = TheMap.end(); I != E; ) {
@ -366,6 +369,7 @@ TargetData::~TargetData() {
const StructLayout *TargetData::getStructLayout(const StructType *Ty) const {
LayoutInfoTy &TheMap = *LayoutInfo;
sys::SmartScopedLock<true> Lock(&*LayoutLock);
StructLayout *&SL = TheMap[LayoutKey(this, Ty)];
if (SL) return SL;
@ -390,6 +394,7 @@ const StructLayout *TargetData::getStructLayout(const StructType *Ty) const {
void TargetData::InvalidateStructLayoutInfo(const StructType *Ty) const {
if (!LayoutInfo.isConstructed()) return; // No cache.
sys::SmartScopedLock<true> Lock(&*LayoutLock);
LayoutInfoTy::iterator I = LayoutInfo->find(LayoutKey(this, Ty));
if (I == LayoutInfo->end()) return;

View File

@ -12,11 +12,17 @@
//===----------------------------------------------------------------------===//
#include "X86ELFWriterInfo.h"
#include "X86Relocations.h"
#include "llvm/Function.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
// Implementation of the X86ELFWriterInfo class
//===----------------------------------------------------------------------===//
X86ELFWriterInfo::X86ELFWriterInfo(TargetMachine &TM)
: TargetELFWriterInfo(TM) {
bool is64Bit = TM.getTargetData()->getPointerSizeInBits() == 64;
@ -25,6 +31,34 @@ X86ELFWriterInfo::X86ELFWriterInfo(TargetMachine &TM)
X86ELFWriterInfo::~X86ELFWriterInfo() {}
unsigned X86ELFWriterInfo::getRelocationType(unsigned MachineRelTy) const {
if (is64Bit) {
switch(MachineRelTy) {
case X86::reloc_pcrel_word:
return R_X86_64_PC32;
case X86::reloc_absolute_word:
return R_X86_64_32;
case X86::reloc_absolute_dword:
return R_X86_64_64;
case X86::reloc_picrel_word:
default:
assert(0 && "unknown relocation type");
}
} else {
switch(MachineRelTy) {
case X86::reloc_pcrel_word:
return R_386_PC32;
case X86::reloc_absolute_word:
return R_386_32;
case X86::reloc_absolute_dword:
case X86::reloc_picrel_word:
default:
assert(0 && "unknown relocation type");
}
}
return 0;
}
unsigned X86ELFWriterInfo::getFunctionAlignment(const Function *F) const {
unsigned FnAlign = 4;
@ -36,3 +70,15 @@ unsigned X86ELFWriterInfo::getFunctionAlignment(const Function *F) const {
return (1 << FnAlign);
}
long int X86ELFWriterInfo::getAddendForRelTy(unsigned RelTy) const {
if (is64Bit) {
switch(RelTy) {
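// A 32-bit PC-relative field is resolved from the end of the field itself,
// hence the -4 bias.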
case R_X86_64_PC32: return -4;
default:
assert(0 && "unknown x86 relocation type");
}
}
return 0;
}

View File

@ -19,11 +19,43 @@
namespace llvm {
class X86ELFWriterInfo : public TargetELFWriterInfo {
// ELF Relocation types for X86
enum X86RelocationType {
R_386_NONE = 0,
R_386_32 = 1,
R_386_PC32 = 2
};
// ELF Relocation types for X86_64
enum X86_64RelocationType {
R_X86_64_NONE = 0,
R_X86_64_64 = 1,
R_X86_64_PC32 = 2,
R_X86_64_32 = 10,
R_X86_64_32S = 11,
R_X86_64_PC64 = 24
};
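// Values follow the relocation tables of the i386 and x86-64 ELF psABIs.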
public:
X86ELFWriterInfo(TargetMachine &TM);
virtual ~X86ELFWriterInfo();
/// getFunctionAlignment - Returns the alignment for function 'F'; targets
/// with different alignment constraints should override this method
virtual unsigned getFunctionAlignment(const Function *F) const;
/// getRelocationType - Returns the target-specific ELF relocation type.
/// 'MachineRelTy' contains the object-code-independent relocation type
virtual unsigned getRelocationType(unsigned MachineRelTy) const;
/// hasRelocationAddend - True if the target uses an addend in the
/// ELF relocation entry.
virtual bool hasRelocationAddend() const { return is64Bit; }
/// getAddendForRelTy - Gets the addend value for an ELF relocation entry
/// based on the target relocation type
virtual long int getAddendForRelTy(unsigned RelTy) const;
};
} // end llvm namespace

View File

@ -96,7 +96,7 @@ namespace {
void RewriteNonIntegerIVs(Loop *L);
ICmpInst *LinearFunctionTestReplace(Loop *L, SCEVHandle BackedgeTakenCount,
ICmpInst *LinearFunctionTestReplace(Loop *L, const SCEV* BackedgeTakenCount,
Value *IndVar,
BasicBlock *ExitingBlock,
BranchInst *BI,
@ -128,7 +128,7 @@ Pass *llvm::createIndVarSimplifyPass() {
/// SCEV analysis can determine a loop-invariant trip count of the loop, which
/// is actually a much broader range than just linear tests.
ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
SCEVHandle BackedgeTakenCount,
const SCEV* BackedgeTakenCount,
Value *IndVar,
BasicBlock *ExitingBlock,
BranchInst *BI,
@ -137,13 +137,13 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
// against the preincremented value, otherwise we prefer to compare against
// the post-incremented value.
Value *CmpIndVar;
SCEVHandle RHS = BackedgeTakenCount;
const SCEV* RHS = BackedgeTakenCount;
if (ExitingBlock == L->getLoopLatch()) {
// Add one to the "backedge-taken" count to get the trip count.
// If this addition may overflow, we have to be more pessimistic and
// cast the induction variable before doing the add.
SCEVHandle Zero = SE->getIntegerSCEV(0, BackedgeTakenCount->getType());
SCEVHandle N =
const SCEV* Zero = SE->getIntegerSCEV(0, BackedgeTakenCount->getType());
const SCEV* N =
SE->getAddExpr(BackedgeTakenCount,
SE->getIntegerSCEV(1, BackedgeTakenCount->getType()));
if ((isa<SCEVConstant>(N) && !N->isZero()) ||
@ -278,7 +278,7 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L,
// Okay, this instruction has a user outside of the current loop
// and varies predictably *inside* the loop. Evaluate the value it
// contains when the loop exits, if possible.
SCEVHandle ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
const SCEV* ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
if (!ExitValue->isLoopInvariant(L))
continue;
@ -348,7 +348,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
BasicBlock *Header = L->getHeader();
BasicBlock *ExitingBlock = L->getExitingBlock(); // may be null
SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
const SCEV* BackedgeTakenCount = SE->getBackedgeTakenCount(L);
// Check to see if this loop has a computable loop-invariant execution count.
// If so, this means that we can compute the final value of any expressions
@ -373,14 +373,14 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
NeedCannIV = true;
}
for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
SCEVHandle Stride = IU->StrideOrder[i];
const SCEV* Stride = IU->StrideOrder[i];
const Type *Ty = SE->getEffectiveSCEVType(Stride->getType());
if (!LargestType ||
SE->getTypeSizeInBits(Ty) >
SE->getTypeSizeInBits(LargestType))
LargestType = Ty;
std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
std::map<const SCEV*, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[i]);
assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
@ -473,21 +473,20 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, const Type *LargestType,
// the need for the code evaluation methods to insert induction variables
// of different sizes.
for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
SCEVHandle Stride = IU->StrideOrder[i];
const SCEV* Stride = IU->StrideOrder[i];
std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
std::map<const SCEV*, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[i]);
assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
ilist<IVStrideUse> &List = SI->second->Users;
for (ilist<IVStrideUse>::iterator UI = List.begin(),
E = List.end(); UI != E; ++UI) {
SCEVHandle Offset = UI->getOffset();
Value *Op = UI->getOperandValToReplace();
const Type *UseTy = Op->getType();
Instruction *User = UI->getUser();
// Compute the final addrec to expand into code.
SCEVHandle AR = IU->getReplacementExpr(*UI);
const SCEV* AR = IU->getReplacementExpr(*UI);
Value *NewVal = 0;
if (AR->isLoopInvariant(L)) {

View File

@ -187,7 +187,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) {
// Don't remove loops for which we can't solve the trip count.
// They could be infinite, in which case we'd be changing program behavior.
ScalarEvolution& SE = getAnalysis<ScalarEvolution>();
SCEVHandle S = SE.getBackedgeTakenCount(L);
const SCEV* S = SE.getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(S))
return false;

View File

@ -64,11 +64,11 @@ namespace {
/// StrengthReduceStridedIVUsers. It contains the stride, the common base, as
/// well as the PHI node and increment value created for rewrite.
struct VISIBILITY_HIDDEN IVExpr {
SCEVHandle Stride;
SCEVHandle Base;
const SCEV* Stride;
const SCEV* Base;
PHINode *PHI;
IVExpr(const SCEVHandle &stride, const SCEVHandle &base, PHINode *phi)
IVExpr(const SCEV* const stride, const SCEV* const base, PHINode *phi)
: Stride(stride), Base(base), PHI(phi) {}
};
@ -77,7 +77,7 @@ namespace {
struct VISIBILITY_HIDDEN IVsOfOneStride {
std::vector<IVExpr> IVs;
void addIV(const SCEVHandle &Stride, const SCEVHandle &Base, PHINode *PHI) {
void addIV(const SCEV* const Stride, const SCEV* const Base, PHINode *PHI) {
IVs.push_back(IVExpr(Stride, Base, PHI));
}
};
@ -91,11 +91,11 @@ namespace {
/// IVsByStride - Keep track of all IVs that have been inserted for a
/// particular stride.
std::map<SCEVHandle, IVsOfOneStride> IVsByStride;
std::map<const SCEV*, IVsOfOneStride> IVsByStride;
/// StrideNoReuse - Keep track of all the strides whose ivs cannot be
/// reused (nor should they be rewritten to reuse other strides).
SmallSet<SCEVHandle, 4> StrideNoReuse;
SmallSet<const SCEV*, 4> StrideNoReuse;
/// DeadInsts - Keep track of instructions we may have made dead, so that
/// we can remove them after we are done working.
@ -133,7 +133,7 @@ namespace {
private:
ICmpInst *ChangeCompareStride(Loop *L, ICmpInst *Cond,
IVStrideUse* &CondUse,
const SCEVHandle* &CondStride);
const SCEV* const * &CondStride);
void OptimizeIndvars(Loop *L);
void OptimizeLoopCountIV(Loop *L);
@ -149,16 +149,16 @@ namespace {
IVStrideUse* &CondUse);
bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
const SCEVHandle *&CondStride);
const SCEV* const * &CondStride);
bool RequiresTypeConversion(const Type *Ty, const Type *NewTy);
SCEVHandle CheckForIVReuse(bool, bool, bool, const SCEVHandle&,
const SCEV* CheckForIVReuse(bool, bool, bool, const SCEV* const&,
IVExpr&, const Type*,
const std::vector<BasedUser>& UsersToProcess);
bool ValidScale(bool, int64_t,
const std::vector<BasedUser>& UsersToProcess);
bool ValidOffset(bool, int64_t, int64_t,
const std::vector<BasedUser>& UsersToProcess);
SCEVHandle CollectIVUsers(const SCEVHandle &Stride,
const SCEV* CollectIVUsers(const SCEV* const &Stride,
IVUsersOfOneStride &Uses,
Loop *L,
bool &AllUsesAreAddresses,
@ -168,11 +168,11 @@ namespace {
const std::vector<BasedUser> &UsersToProcess,
const Loop *L,
bool AllUsesAreAddresses,
SCEVHandle Stride);
const SCEV* Stride);
void PrepareToStrengthReduceFully(
std::vector<BasedUser> &UsersToProcess,
SCEVHandle Stride,
SCEVHandle CommonExprs,
const SCEV* Stride,
const SCEV* CommonExprs,
const Loop *L,
SCEVExpander &PreheaderRewriter);
void PrepareToStrengthReduceFromSmallerStride(
@ -182,13 +182,13 @@ namespace {
Instruction *PreInsertPt);
void PrepareToStrengthReduceWithNewPhi(
std::vector<BasedUser> &UsersToProcess,
SCEVHandle Stride,
SCEVHandle CommonExprs,
const SCEV* Stride,
const SCEV* CommonExprs,
Value *CommonBaseV,
Instruction *IVIncInsertPt,
const Loop *L,
SCEVExpander &PreheaderRewriter);
void StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
void StrengthReduceStridedIVUsers(const SCEV* const &Stride,
IVUsersOfOneStride &Uses,
Loop *L);
void DeleteTriviallyDeadInstructions();
@ -232,7 +232,7 @@ void LoopStrengthReduce::DeleteTriviallyDeadInstructions() {
/// containsAddRecFromDifferentLoop - Determine whether expression S involves a
/// subexpression that is an AddRec from a loop other than L. An outer loop
/// of L is OK, but not an inner loop nor a disjoint loop.
static bool containsAddRecFromDifferentLoop(SCEVHandle S, Loop *L) {
static bool containsAddRecFromDifferentLoop(const SCEV* S, Loop *L) {
// This is very common, put it first.
if (isa<SCEVConstant>(S))
return false;
@ -327,7 +327,7 @@ namespace {
/// this use. As the use is processed, information gets moved from this
/// field to the Imm field (below). BasedUser values are sorted by this
/// field.
SCEVHandle Base;
const SCEV* Base;
/// Inst - The instruction using the induction variable.
Instruction *Inst;
@ -340,7 +340,7 @@ namespace {
/// before Inst, because it will be folded into the imm field of the
/// instruction. This is also sometimes used for loop-variant values that
/// must be added inside the loop.
SCEVHandle Imm;
const SCEV* Imm;
/// Phi - The induction variable that performs the striding that
/// should be used for this user.
@ -362,13 +362,13 @@ namespace {
// Once we rewrite the code to insert the new IVs we want, update the
// operands of Inst to use the new expression 'NewBase', with 'Imm' added
// to it.
void RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
void RewriteInstructionToUseNewBase(const SCEV* const &NewBase,
Instruction *InsertPt,
SCEVExpander &Rewriter, Loop *L, Pass *P,
LoopInfo &LI,
SmallVectorImpl<WeakVH> &DeadInsts);
Value *InsertCodeForBaseAtPosition(const SCEVHandle &NewBase,
Value *InsertCodeForBaseAtPosition(const SCEV* const &NewBase,
const Type *Ty,
SCEVExpander &Rewriter,
Instruction *IP, Loop *L,
@ -383,7 +383,7 @@ void BasedUser::dump() const {
cerr << " Inst: " << *Inst;
}
Value *BasedUser::InsertCodeForBaseAtPosition(const SCEVHandle &NewBase,
Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV* const &NewBase,
const Type *Ty,
SCEVExpander &Rewriter,
Instruction *IP, Loop *L,
@ -407,7 +407,7 @@ Value *BasedUser::InsertCodeForBaseAtPosition(const SCEVHandle &NewBase,
Value *Base = Rewriter.expandCodeFor(NewBase, 0, BaseInsertPt);
SCEVHandle NewValSCEV = SE->getUnknown(Base);
const SCEV* NewValSCEV = SE->getUnknown(Base);
// If there is no immediate value, skip the next part.
if (!Imm->isZero()) {
@ -430,7 +430,7 @@ Value *BasedUser::InsertCodeForBaseAtPosition(const SCEVHandle &NewBase,
// value of NewBase in the case that it's a different instruction from
// the PHI that NewBase is computed from, or null otherwise.
//
void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
void BasedUser::RewriteInstructionToUseNewBase(const SCEV* const &NewBase,
Instruction *NewBasePt,
SCEVExpander &Rewriter, Loop *L, Pass *P,
LoopInfo &LI,
@ -542,7 +542,7 @@ void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
/// fitsInAddressMode - Return true if V can be subsumed within an addressing
/// mode, and does not need to be put in a register first.
static bool fitsInAddressMode(const SCEVHandle &V, const Type *AccessTy,
static bool fitsInAddressMode(const SCEV* const &V, const Type *AccessTy,
const TargetLowering *TLI, bool HasBaseReg) {
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(V)) {
int64_t VC = SC->getValue()->getSExtValue();
@ -574,12 +574,12 @@ static bool fitsInAddressMode(const SCEVHandle &V, const Type *AccessTy,
/// MoveLoopVariantsToImmediateField - Move any subexpressions from Val that are
/// loop varying to the Imm operand.
static void MoveLoopVariantsToImmediateField(SCEVHandle &Val, SCEVHandle &Imm,
static void MoveLoopVariantsToImmediateField(const SCEV* &Val, const SCEV* &Imm,
Loop *L, ScalarEvolution *SE) {
if (Val->isLoopInvariant(L)) return; // Nothing to do.
if (const SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
SmallVector<SCEVHandle, 4> NewOps;
SmallVector<const SCEV*, 4> NewOps;
NewOps.reserve(SAE->getNumOperands());
for (unsigned i = 0; i != SAE->getNumOperands(); ++i)
@ -597,10 +597,10 @@ static void MoveLoopVariantsToImmediateField(SCEVHandle &Val, SCEVHandle &Imm,
Val = SE->getAddExpr(NewOps);
} else if (const SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Val)) {
// Try to pull immediates out of the start value of nested addrec's.
SCEVHandle Start = SARE->getStart();
const SCEV* Start = SARE->getStart();
MoveLoopVariantsToImmediateField(Start, Imm, L, SE);
SmallVector<SCEVHandle, 4> Ops(SARE->op_begin(), SARE->op_end());
SmallVector<const SCEV*, 4> Ops(SARE->op_begin(), SARE->op_end());
Ops[0] = Start;
Val = SE->getAddRecExpr(Ops, SARE->getLoop());
} else {
@ -616,15 +616,15 @@ static void MoveLoopVariantsToImmediateField(SCEVHandle &Val, SCEVHandle &Imm,
/// Accumulate these immediate values into the Imm value.
static void MoveImmediateValues(const TargetLowering *TLI,
const Type *AccessTy,
SCEVHandle &Val, SCEVHandle &Imm,
const SCEV* &Val, const SCEV* &Imm,
bool isAddress, Loop *L,
ScalarEvolution *SE) {
if (const SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
SmallVector<SCEVHandle, 4> NewOps;
SmallVector<const SCEV*, 4> NewOps;
NewOps.reserve(SAE->getNumOperands());
for (unsigned i = 0; i != SAE->getNumOperands(); ++i) {
SCEVHandle NewOp = SAE->getOperand(i);
const SCEV* NewOp = SAE->getOperand(i);
MoveImmediateValues(TLI, AccessTy, NewOp, Imm, isAddress, L, SE);
if (!NewOp->isLoopInvariant(L)) {
@ -643,11 +643,11 @@ static void MoveImmediateValues(const TargetLowering *TLI,
return;
} else if (const SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Val)) {
// Try to pull immediates out of the start value of nested addrec's.
SCEVHandle Start = SARE->getStart();
const SCEV* Start = SARE->getStart();
MoveImmediateValues(TLI, AccessTy, Start, Imm, isAddress, L, SE);
if (Start != SARE->getStart()) {
SmallVector<SCEVHandle, 4> Ops(SARE->op_begin(), SARE->op_end());
SmallVector<const SCEV*, 4> Ops(SARE->op_begin(), SARE->op_end());
Ops[0] = Start;
Val = SE->getAddRecExpr(Ops, SARE->getLoop());
}
@ -658,8 +658,8 @@ static void MoveImmediateValues(const TargetLowering *TLI,
fitsInAddressMode(SME->getOperand(0), AccessTy, TLI, false) &&
SME->getNumOperands() == 2 && SME->isLoopInvariant(L)) {
SCEVHandle SubImm = SE->getIntegerSCEV(0, Val->getType());
SCEVHandle NewOp = SME->getOperand(1);
const SCEV* SubImm = SE->getIntegerSCEV(0, Val->getType());
const SCEV* NewOp = SME->getOperand(1);
MoveImmediateValues(TLI, AccessTy, NewOp, SubImm, isAddress, L, SE);
// If we extracted something out of the subexpressions, see if we can
@ -694,7 +694,7 @@ static void MoveImmediateValues(const TargetLowering *TLI,
static void MoveImmediateValues(const TargetLowering *TLI,
Instruction *User,
SCEVHandle &Val, SCEVHandle &Imm,
const SCEV* &Val, const SCEV* &Imm,
bool isAddress, Loop *L,
ScalarEvolution *SE) {
const Type *AccessTy = getAccessType(User);
@ -704,19 +704,19 @@ static void MoveImmediateValues(const TargetLowering *TLI,
/// SeparateSubExprs - Decompose Expr into all of the subexpressions that are
/// added together. This is used to reassociate common addition subexprs
/// together for maximal sharing when rewriting bases.
static void SeparateSubExprs(SmallVector<SCEVHandle, 16> &SubExprs,
SCEVHandle Expr,
static void SeparateSubExprs(SmallVector<const SCEV*, 16> &SubExprs,
const SCEV* Expr,
ScalarEvolution *SE) {
if (const SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(Expr)) {
for (unsigned j = 0, e = AE->getNumOperands(); j != e; ++j)
SeparateSubExprs(SubExprs, AE->getOperand(j), SE);
} else if (const SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Expr)) {
SCEVHandle Zero = SE->getIntegerSCEV(0, Expr->getType());
const SCEV* Zero = SE->getIntegerSCEV(0, Expr->getType());
if (SARE->getOperand(0) == Zero) {
SubExprs.push_back(Expr);
} else {
// Compute the addrec with zero as its base.
SmallVector<SCEVHandle, 4> Ops(SARE->op_begin(), SARE->op_end());
SmallVector<const SCEV*, 4> Ops(SARE->op_begin(), SARE->op_end());
Ops[0] = Zero; // Start with zero base.
SubExprs.push_back(SE->getAddRecExpr(Ops, SARE->getLoop()));
@ -740,7 +740,7 @@ struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; };
/// not remove anything. This looks for things like (a+b+c) and
/// (a+c+d) and computes the common (a+c) subexpression. The common expression
/// is *removed* from the Bases and returned.
static SCEVHandle
static const SCEV*
RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
ScalarEvolution *SE, Loop *L,
const TargetLowering *TLI) {
@ -748,9 +748,9 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
// Only one use? This is a very common case, so we handle it specially and
// cheaply.
SCEVHandle Zero = SE->getIntegerSCEV(0, Uses[0].Base->getType());
SCEVHandle Result = Zero;
SCEVHandle FreeResult = Zero;
const SCEV* Zero = SE->getIntegerSCEV(0, Uses[0].Base->getType());
const SCEV* Result = Zero;
const SCEV* FreeResult = Zero;
if (NumUses == 1) {
// If the use is inside the loop, use its base, regardless of what it is:
// it is clearly shared across all the IV's. If the use is outside the loop
@ -766,13 +766,13 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
// Also track whether all uses of each expression can be moved into
// an addressing mode "for free"; such expressions are left within the loop.
// struct SubExprUseData { unsigned Count; bool notAllUsesAreFree; };
std::map<SCEVHandle, SubExprUseData> SubExpressionUseData;
std::map<const SCEV*, SubExprUseData> SubExpressionUseData;
// UniqueSubExprs - Keep track of all of the subexpressions we see in the
// order we see them.
SmallVector<SCEVHandle, 16> UniqueSubExprs;
SmallVector<const SCEV*, 16> UniqueSubExprs;
SmallVector<SCEVHandle, 16> SubExprs;
SmallVector<const SCEV*, 16> SubExprs;
unsigned NumUsesInsideLoop = 0;
for (unsigned i = 0; i != NumUses; ++i) {
// If the user is outside the loop, just ignore it for base computation.
@@ -816,7 +816,7 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
// Now that we know how many times each is used, build Result. Iterate over
// UniqueSubexprs so that we have a stable ordering.
for (unsigned i = 0, e = UniqueSubExprs.size(); i != e; ++i) {
std::map<SCEVHandle, SubExprUseData>::iterator I =
std::map<const SCEV*, SubExprUseData>::iterator I =
SubExpressionUseData.find(UniqueSubExprs[i]);
assert(I != SubExpressionUseData.end() && "Entry not found?");
if (I->second.Count == NumUsesInsideLoop) { // Found CSE!
@@ -860,7 +860,7 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
if (FreeResult != Zero) {
SeparateSubExprs(SubExprs, FreeResult, SE);
for (unsigned j = 0, e = SubExprs.size(); j != e; ++j) {
std::map<SCEVHandle, SubExprUseData>::iterator I =
std::map<const SCEV*, SubExprUseData>::iterator I =
SubExpressionUseData.find(SubExprs[j]);
SubExpressionUseData.erase(I);
}
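
The counting scheme can be exercised in isolation. A self-contained sketch of the idea, under the simplifying assumption that subexpressions are plain strings and every use is inside the loop:

#include <cassert>
#include <map>
#include <string>
#include <vector>

int main() {
  // Two use bases, (a+b+c) and (a+c+d), as in the function comment.
  std::vector<std::string> B1, B2;
  B1.push_back("a"); B1.push_back("b"); B1.push_back("c");
  B2.push_back("a"); B2.push_back("c"); B2.push_back("d");

  // Count how many uses each subexpression occurs in.
  std::map<std::string, unsigned> SubExpressionUseData;
  for (unsigned i = 0; i != B1.size(); ++i) ++SubExpressionUseData[B1[i]];
  for (unsigned i = 0; i != B2.size(); ++i) ++SubExpressionUseData[B2[i]];

  // Subexpressions seen in every use form the common part -- here (a+c),
  // which would be removed from both bases and returned.
  assert(SubExpressionUseData["a"] == 2 && SubExpressionUseData["c"] == 2);
  assert(SubExpressionUseData["b"] == 1 && SubExpressionUseData["d"] == 1);
  return 0;
}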
@@ -989,10 +989,10 @@ bool LoopStrengthReduce::RequiresTypeConversion(const Type *Ty1,
/// be folded into the addressing mode, nor even that the factor be constant;
/// a multiply (executed once) outside the loop is better than another IV
/// within. Well, usually.
SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
const SCEV* LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
bool AllUsesAreAddresses,
bool AllUsesAreOutsideLoop,
const SCEVHandle &Stride,
const SCEV* const &Stride,
IVExpr &IV, const Type *Ty,
const std::vector<BasedUser>& UsersToProcess) {
if (StrideNoReuse.count(Stride))
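
One way to see the reuse trade-off from the comment above: a use whose stride is a constant multiple of an existing IV's stride can be rewritten as a factor times that IV instead of getting an IV of its own. A toy illustration (assumed simplification; the real code also weighs address-mode folding and target costs):

#include <cassert>

int main() {
  const int RewriteFactor = 4;   // reuse a stride-1 IV for stride-4 uses
  int DedicatedIV = 0;           // what a separate stride-4 IV would hold
  for (int IV = 0; IV != 8; ++IV) {
    assert(RewriteFactor * IV == DedicatedIV);  // same sequence of values
    DedicatedIV += 4;
  }
  return 0;
}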
@@ -1002,7 +1002,7 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
int64_t SInt = SC->getValue()->getSExtValue();
for (unsigned NewStride = 0, e = IU->StrideOrder.size();
NewStride != e; ++NewStride) {
std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
std::map<const SCEV*, IVsOfOneStride>::iterator SI =
IVsByStride.find(IU->StrideOrder[NewStride]);
if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first) ||
StrideNoReuse.count(SI->first))
@@ -1055,7 +1055,7 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
// an existing IV if we can.
for (unsigned NewStride = 0, e = IU->StrideOrder.size();
NewStride != e; ++NewStride) {
std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
std::map<const SCEV*, IVsOfOneStride>::iterator SI =
IVsByStride.find(IU->StrideOrder[NewStride]);
if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first))
continue;
@@ -1075,7 +1075,7 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
// -1*old.
for (unsigned NewStride = 0, e = IU->StrideOrder.size();
NewStride != e; ++NewStride) {
std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
std::map<const SCEV*, IVsOfOneStride>::iterator SI =
IVsByStride.find(IU->StrideOrder[NewStride]);
if (SI == IVsByStride.end())
continue;
@@ -1104,7 +1104,7 @@ static bool PartitionByIsUseOfPostIncrementedValue(const BasedUser &Val) {
/// isNonConstantNegative - Return true if the specified scev is negated, but
/// not a constant.
static bool isNonConstantNegative(const SCEVHandle &Expr) {
static bool isNonConstantNegative(const SCEV* const &Expr) {
const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Expr);
if (!Mul) return false;
@@ -1121,7 +1121,7 @@ static bool isNonConstantNegative(const SCEVHandle &Expr) {
/// of the strided accesses, as well as the old information from Uses. We
/// progressively move information from the Base field to the Imm field, until
/// we eventually have the full access expression to rewrite the use.
SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
const SCEV* LoopStrengthReduce::CollectIVUsers(const SCEV* const &Stride,
IVUsersOfOneStride &Uses,
Loop *L,
bool &AllUsesAreAddresses,
@@ -1152,7 +1152,7 @@ SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
// for the strides (e.g. if we have "A+C+B" and "A+B+D" as our bases, find
// "A+B"), emit it to the preheader, then remove the expression from the
// UsersToProcess base values.
SCEVHandle CommonExprs =
const SCEV* CommonExprs =
RemoveCommonExpressionsFromUseBases(UsersToProcess, SE, L, TLI);
// Next, figure out what we can represent in the immediate fields of
@@ -1218,7 +1218,7 @@ bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode(
const std::vector<BasedUser> &UsersToProcess,
const Loop *L,
bool AllUsesAreAddresses,
SCEVHandle Stride) {
const SCEV* Stride) {
if (!EnableFullLSRMode)
return false;
@@ -1255,7 +1255,7 @@ bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode(
if (!Imm) Imm = SE->getIntegerSCEV(0, Stride->getType());
const Instruction *Inst = UsersToProcess[i].Inst;
const Type *AccessTy = getAccessType(Inst);
SCEVHandle Diff = SE->getMinusSCEV(UsersToProcess[i].Imm, Imm);
const SCEV* Diff = SE->getMinusSCEV(UsersToProcess[i].Imm, Imm);
if (!Diff->isZero() &&
(!AllUsesAreAddresses ||
!fitsInAddressMode(Diff, AccessTy, TLI, /*HasBaseReg=*/true)))
@@ -1289,7 +1289,7 @@ bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode(
///
/// Return the created phi node.
///
static PHINode *InsertAffinePhi(SCEVHandle Start, SCEVHandle Step,
static PHINode *InsertAffinePhi(const SCEV* Start, const SCEV* Step,
Instruction *IVIncInsertPt,
const Loop *L,
SCEVExpander &Rewriter) {
@@ -1309,7 +1309,7 @@ static PHINode *InsertAffinePhi(SCEVHandle Start, SCEVHandle Step,
// If the stride is negative, insert a sub instead of an add for the
// increment.
bool isNegative = isNonConstantNegative(Step);
SCEVHandle IncAmount = Step;
const SCEV* IncAmount = Step;
if (isNegative)
IncAmount = Rewriter.SE.getNegativeSCEV(Step);
@@ -1348,13 +1348,13 @@ static void SortUsersToProcess(std::vector<BasedUser> &UsersToProcess) {
// loop before users outside of the loop with a particular base.
//
// We would like to use stable_sort here, but we can't. The problem is that
// SCEVHandle's don't have a deterministic ordering w.r.t. each other, so
// const SCEV*'s don't have a deterministic ordering w.r.t. each other, so
// we don't have anything to do a '<' comparison on. Because we think the
// number of uses is small, do a horrible bubble sort which just relies on
// ==.
for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
// Get a base value.
SCEVHandle Base = UsersToProcess[i].Base;
const SCEV* Base = UsersToProcess[i].Base;
// Compact everything with this base to be consecutive with this one.
for (unsigned j = i+1; j != e; ++j) {
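
The equality-only compaction can be sketched on plain ints. A minimal version under the same constraint (== is available, '<' is not), with illustrative names:

#include <algorithm>
#include <cstddef>
#include <vector>

static void compactByEquality(std::vector<int> &V) {
  for (std::size_t i = 0, e = V.size(); i != e; ++i) {
    int Base = V[i];
    // Compact everything equal to Base to be consecutive with it.
    for (std::size_t j = i + 1; j != e; ++j)
      if (V[j] == Base)
        std::swap(V[j], V[++i]);  // pull the match up next to its group
  }
}

Like the code above, this is quadratic, which is tolerable only because the number of uses is expected to be small.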
@@ -1373,8 +1373,8 @@ static void SortUsersToProcess(std::vector<BasedUser> &UsersToProcess) {
void
LoopStrengthReduce::PrepareToStrengthReduceFully(
std::vector<BasedUser> &UsersToProcess,
SCEVHandle Stride,
SCEVHandle CommonExprs,
const SCEV* Stride,
const SCEV* CommonExprs,
const Loop *L,
SCEVExpander &PreheaderRewriter) {
DOUT << " Fully reducing all users\n";
@@ -1386,9 +1386,9 @@ LoopStrengthReduce::PrepareToStrengthReduceFully(
// TODO: The uses are grouped by base, but not sorted. We arbitrarily
// pick the first Imm value here to start with, and adjust it for the
// other uses.
SCEVHandle Imm = UsersToProcess[i].Imm;
SCEVHandle Base = UsersToProcess[i].Base;
SCEVHandle Start = SE->getAddExpr(CommonExprs, Base, Imm);
const SCEV* Imm = UsersToProcess[i].Imm;
const SCEV* Base = UsersToProcess[i].Base;
const SCEV* Start = SE->getAddExpr(CommonExprs, Base, Imm);
PHINode *Phi = InsertAffinePhi(Start, Stride, IVIncInsertPt, L,
PreheaderRewriter);
// Loop over all the users with the same base.
@@ -1420,8 +1420,8 @@ static Instruction *FindIVIncInsertPt(std::vector<BasedUser> &UsersToProcess,
void
LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi(
std::vector<BasedUser> &UsersToProcess,
SCEVHandle Stride,
SCEVHandle CommonExprs,
const SCEV* Stride,
const SCEV* CommonExprs,
Value *CommonBaseV,
Instruction *IVIncInsertPt,
const Loop *L,
@@ -1497,7 +1497,7 @@ static bool IsImmFoldedIntoAddrMode(GlobalValue *GV, int64_t Offset,
/// StrengthReduceStridedIVUsers - Strength reduce all of the users of a single
/// stride of IV. All of the users may have different starting values, and this
/// may not be the only stride.
void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEV* const &Stride,
IVUsersOfOneStride &Uses,
Loop *L) {
// If all the users are moved to another stride, then there is nothing to do.
@@ -1520,7 +1520,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
// move information from the Base field to the Imm field, until we eventually
// have the full access expression to rewrite the use.
std::vector<BasedUser> UsersToProcess;
SCEVHandle CommonExprs = CollectIVUsers(Stride, Uses, L, AllUsesAreAddresses,
const SCEV* CommonExprs = CollectIVUsers(Stride, Uses, L, AllUsesAreAddresses,
AllUsesAreOutsideLoop,
UsersToProcess);
@@ -1538,8 +1538,8 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
// If all uses are addresses, consider sinking the immediate part of the
// common expression back into uses if they can fit in the immediate fields.
if (TLI && HaveCommonExprs && AllUsesAreAddresses) {
SCEVHandle NewCommon = CommonExprs;
SCEVHandle Imm = SE->getIntegerSCEV(0, ReplacedTy);
const SCEV* NewCommon = CommonExprs;
const SCEV* Imm = SE->getIntegerSCEV(0, ReplacedTy);
MoveImmediateValues(TLI, Type::VoidTy, NewCommon, Imm, true, L, SE);
if (!Imm->isZero()) {
bool DoSink = true;
@@ -1585,7 +1585,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
Value *CommonBaseV = Constant::getNullValue(ReplacedTy);
SCEVHandle RewriteFactor = SE->getIntegerSCEV(0, ReplacedTy);
const SCEV* RewriteFactor = SE->getIntegerSCEV(0, ReplacedTy);
IVExpr ReuseIV(SE->getIntegerSCEV(0, Type::Int32Ty),
SE->getIntegerSCEV(0, Type::Int32Ty),
0);
@@ -1625,7 +1625,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
// strength-reduced forms. This outer loop handles all bases, the inner
// loop handles all users of a particular base.
while (!UsersToProcess.empty()) {
SCEVHandle Base = UsersToProcess.back().Base;
const SCEV* Base = UsersToProcess.back().Base;
Instruction *Inst = UsersToProcess.back().Inst;
// Emit the code for Base into the preheader.
@@ -1679,7 +1679,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
User.Inst->moveBefore(IVIncInsertPt);
}
SCEVHandle RewriteExpr = SE->getUnknown(RewriteOp);
const SCEV* RewriteExpr = SE->getUnknown(RewriteOp);
if (SE->getEffectiveSCEVType(RewriteOp->getType()) !=
SE->getEffectiveSCEVType(ReplacedTy)) {
@@ -1711,7 +1711,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
// The base has been used to initialize the PHI node but we don't want
// it here.
if (!ReuseIV.Base->isZero()) {
SCEVHandle typedBase = ReuseIV.Base;
const SCEV* typedBase = ReuseIV.Base;
if (SE->getEffectiveSCEVType(RewriteExpr->getType()) !=
SE->getEffectiveSCEVType(ReuseIV.Base->getType())) {
// It's possible the original IV is a larger type than the new IV,
@@ -1776,10 +1776,10 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
/// set the IV user and stride information and return true, otherwise return
/// false.
bool LoopStrengthReduce::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
const SCEVHandle *&CondStride) {
const SCEV* const * &CondStride) {
for (unsigned Stride = 0, e = IU->StrideOrder.size();
Stride != e && !CondUse; ++Stride) {
std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
std::map<const SCEV*, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
@@ -1806,7 +1806,7 @@ namespace {
const ScalarEvolution *SE;
explicit StrideCompare(const ScalarEvolution *se) : SE(se) {}
bool operator()(const SCEVHandle &LHS, const SCEVHandle &RHS) {
bool operator()(const SCEV* const &LHS, const SCEV* const &RHS) {
const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS);
const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
if (LHSC && RHSC) {
@@ -1849,14 +1849,14 @@ namespace {
/// if (v1 < 30) goto loop
ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
IVStrideUse* &CondUse,
const SCEVHandle* &CondStride) {
const SCEV* const* &CondStride) {
// If there's only one stride in the loop, there's nothing to do here.
if (IU->StrideOrder.size() < 2)
return Cond;
// If there are other users of the condition's stride, don't bother
// trying to change the condition because the stride will still
// remain.
std::map<SCEVHandle, IVUsersOfOneStride *>::iterator I =
std::map<const SCEV*, IVUsersOfOneStride *>::iterator I =
IU->IVUsesByStride.find(*CondStride);
if (I == IU->IVUsesByStride.end() ||
I->second->Users.size() != 1)
@@ -1873,11 +1873,11 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
const Type *NewCmpTy = NULL;
unsigned TyBits = SE->getTypeSizeInBits(CmpTy);
unsigned NewTyBits = 0;
SCEVHandle *NewStride = NULL;
const SCEV* *NewStride = NULL;
Value *NewCmpLHS = NULL;
Value *NewCmpRHS = NULL;
int64_t Scale = 1;
SCEVHandle NewOffset = SE->getIntegerSCEV(0, CmpTy);
const SCEV* NewOffset = SE->getIntegerSCEV(0, CmpTy);
if (ConstantInt *C = dyn_cast<ConstantInt>(Cond->getOperand(1))) {
int64_t CmpVal = C->getValue().getSExtValue();
@@ -1889,7 +1889,7 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
// Look for a suitable stride / iv as replacement.
for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
std::map<const SCEV*, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[i]);
if (!isa<SCEVConstant>(SI->first))
continue;
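
The constant rescaling at the heart of this transformation is simple arithmetic. A hedged sketch (the helper name is hypothetical; the numbers assume a stride-1 compare being moved onto a stride-3 IV, which would match the "v1 < 30" bound visible in the function comment if the original bound were 10):

#include <cassert>
#include <stdint.h>

// Moving a compare onto an IV whose stride is Scale times larger
// multiplies the constant bound by Scale (hypothetical helper, not
// part of the patch).
static int64_t rescaleCmpVal(int64_t CmpVal, int64_t OldStride,
                             int64_t NewStride) {
  assert(NewStride % OldStride == 0 && "only whole multiples are usable");
  return CmpVal * (NewStride / OldStride);
}

int main() {
  assert(rescaleCmpVal(10, 1, 3) == 30);
  return 0;
}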
@@ -1969,7 +1969,7 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
bool AllUsesAreAddresses = true;
bool AllUsesAreOutsideLoop = true;
std::vector<BasedUser> UsersToProcess;
SCEVHandle CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
const SCEV* CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
AllUsesAreAddresses,
AllUsesAreOutsideLoop,
UsersToProcess);
@@ -2104,13 +2104,13 @@ ICmpInst *LoopStrengthReduce::OptimizeMax(Loop *L, ICmpInst *Cond,
SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
if (!Sel || !Sel->hasOneUse()) return Cond;
SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
const SCEV* BackedgeTakenCount = SE->getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
return Cond;
SCEVHandle One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType());
const SCEV* One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType());
// Add one to the backedge-taken count to get the trip count.
SCEVHandle IterationCount = SE->getAddExpr(BackedgeTakenCount, One);
const SCEV* IterationCount = SE->getAddExpr(BackedgeTakenCount, One);
// Check for a max calculation that matches the pattern.
if (!isa<SCEVSMaxExpr>(IterationCount) && !isa<SCEVUMaxExpr>(IterationCount))
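
The identity relied on here -- trip count equals backedge-taken count plus one, for a loop that executes at least once -- can be checked directly (illustrative only):

#include <cassert>

int main() {
  const int N = 5;
  int Trips = 0, Backedges = 0;
  for (int i = 0; i != N; ++i) {
    ++Trips;
    if (i + 1 != N)
      ++Backedges;  // every iteration but the last takes the backedge
  }
  assert(Trips == Backedges + 1);
  return 0;
}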
@@ -2123,13 +2123,13 @@ ICmpInst *LoopStrengthReduce::OptimizeMax(Loop *L, ICmpInst *Cond,
if (Max->getNumOperands() != 2)
return Cond;
SCEVHandle MaxLHS = Max->getOperand(0);
SCEVHandle MaxRHS = Max->getOperand(1);
const SCEV* MaxLHS = Max->getOperand(0);
const SCEV* MaxRHS = Max->getOperand(1);
if (!MaxLHS || MaxLHS != One) return Cond;
// Check the relevant induction variable for conformance to
// the pattern.
SCEVHandle IV = SE->getSCEV(Cond->getOperand(0));
const SCEV* IV = SE->getSCEV(Cond->getOperand(0));
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
if (!AR || !AR->isAffine() ||
AR->getStart() != One ||
@@ -2175,13 +2175,13 @@ ICmpInst *LoopStrengthReduce::OptimizeMax(Loop *L, ICmpInst *Cond,
/// inside the loop then try to eliminate the cast operation.
void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
const SCEV* BackedgeTakenCount = SE->getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
return;
for (unsigned Stride = 0, e = IU->StrideOrder.size(); Stride != e;
++Stride) {
std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
std::map<const SCEV*, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
if (!isa<SCEVConstant>(SI->first))
@@ -2311,7 +2311,7 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
// Search IVUsesByStride to find Cond's IVUse if there is one.
IVStrideUse *CondUse = 0;
const SCEVHandle *CondStride = 0;
const SCEV* const *CondStride = 0;
ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
if (!FindIVUserForCond(Cond, CondUse, CondStride))
return; // setcc doesn't use the IV.
@@ -2341,7 +2341,7 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
int64_t SInt = SC->getValue()->getSExtValue();
for (unsigned NewStride = 0, ee = IU->StrideOrder.size(); NewStride != ee;
++NewStride) {
std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
std::map<const SCEV*, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[NewStride]);
if (!isa<SCEVConstant>(SI->first) || SI->first == *CondStride)
continue;
@@ -2355,7 +2355,7 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
bool AllUsesAreAddresses = true;
bool AllUsesAreOutsideLoop = true;
std::vector<BasedUser> UsersToProcess;
SCEVHandle CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
const SCEV* CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
AllUsesAreAddresses,
AllUsesAreOutsideLoop,
UsersToProcess);
@@ -2416,7 +2416,7 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
void LoopStrengthReduce::OptimizeLoopCountIV(Loop *L) {
// If the number of times the loop is executed isn't computable, give up.
SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
const SCEV* BackedgeTakenCount = SE->getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
return;
@@ -2445,9 +2445,9 @@ void LoopStrengthReduce::OptimizeLoopCountIV(Loop *L) {
// Handle only tests for equality for the moment, and only stride 1.
if (Cond->getPredicate() != CmpInst::ICMP_EQ)
return;
SCEVHandle IV = SE->getSCEV(Cond->getOperand(0));
const SCEV* IV = SE->getSCEV(Cond->getOperand(0));
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
SCEVHandle One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType());
const SCEV* One = SE->getIntegerSCEV(1, BackedgeTakenCount->getType());
if (!AR || !AR->isAffine() || AR->getStepRecurrence(*SE) != One)
return;
// If the RHS of the comparison is defined inside the loop, the rewrite
@@ -2563,7 +2563,7 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
// strides deterministic - not dependent on map order.
for (unsigned Stride = 0, e = IU->StrideOrder.size();
Stride != e; ++Stride) {
std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
std::map<const SCEV*, IVUsersOfOneStride *>::iterator SI =
IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
// FIXME: Generalize to non-affine IV's.


@@ -0,0 +1,220 @@
; RUN: llvm-as < %s | opt -analyze -scalar-evolution -disable-output
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
%JavaObject = type { [0 x i32 (...)*]*, i8* }
define void @JnJVM_antlr_CSharpCodeGenerator_genBitSet__Lantlr_collections_impl_BitSet_2I(%JavaObject*, %JavaObject*, i32) {
start:
br i1 undef, label %"stack overflow", label %"no stack overflow"
"GOTO or IF*2": ; preds = %"true verifyAndComputePtr89", %verifyNullCont84
unreachable
"GOTO or IF*5": ; preds = %"true verifyAndComputePtr127", %"GOTO or IF*6"
unreachable
"GOTO or IF*6": ; preds = %"true verifyAndComputePtr131.GOTO or IF*6_crit_edge", %"true verifyAndComputePtr89"
%indvar = phi i32 [ %indvar.next, %"true verifyAndComputePtr131.GOTO or IF*6_crit_edge" ], [ 0, %"true verifyAndComputePtr89" ] ; <i32> [#uses=2]
%.0.in = add i32 %indvar, 0 ; <i32> [#uses=1]
%.0 = add i32 %.0.in, 1 ; <i32> [#uses=1]
%3 = icmp slt i32 %.0, %4 ; <i1> [#uses=1]
br i1 %3, label %verifyNullCont126, label %"GOTO or IF*5"
end: ; preds = %"no exception block35"
ret void
"stack overflow": ; preds = %start
ret void
"no stack overflow": ; preds = %start
br i1 undef, label %verifyNullCont, label %"no stack overflow.end_crit_edge"
"no stack overflow.end_crit_edge": ; preds = %"no stack overflow"
ret void
verifyNullCont: ; preds = %"no stack overflow"
br i1 undef, label %verifyNullCont9, label %verifyNullCont.end_crit_edge
verifyNullCont.end_crit_edge: ; preds = %verifyNullCont
ret void
verifyNullCont9: ; preds = %verifyNullCont
br i1 undef, label %verifyNullCont12, label %verifyNullCont9.end_crit_edge
verifyNullCont9.end_crit_edge: ; preds = %verifyNullCont9
ret void
verifyNullCont12: ; preds = %verifyNullCont9
br i1 undef, label %"no exception block13", label %verifyNullCont12.end_crit_edge
verifyNullCont12.end_crit_edge: ; preds = %verifyNullCont12
ret void
"no exception block13": ; preds = %verifyNullCont12
br i1 undef, label %verifyNullExit14, label %verifyNullCont15
verifyNullExit14: ; preds = %"no exception block13"
ret void
verifyNullCont15: ; preds = %"no exception block13"
br i1 undef, label %"no exception block16", label %verifyNullCont15.end_crit_edge
verifyNullCont15.end_crit_edge: ; preds = %verifyNullCont15
ret void
"no exception block16": ; preds = %verifyNullCont15
br i1 undef, label %verifyNullExit17, label %verifyNullCont18
verifyNullExit17: ; preds = %"no exception block16"
ret void
verifyNullCont18: ; preds = %"no exception block16"
br i1 undef, label %"no exception block19", label %verifyNullCont18.end_crit_edge
verifyNullCont18.end_crit_edge: ; preds = %verifyNullCont18
ret void
"no exception block19": ; preds = %verifyNullCont18
br i1 undef, label %verifyNullExit20, label %verifyNullCont21
verifyNullExit20: ; preds = %"no exception block19"
ret void
verifyNullCont21: ; preds = %"no exception block19"
br i1 undef, label %verifyNullCont24, label %verifyNullCont21.end_crit_edge
verifyNullCont21.end_crit_edge: ; preds = %verifyNullCont21
ret void
verifyNullCont24: ; preds = %verifyNullCont21
br i1 undef, label %verifyNullCont27, label %verifyNullCont24.end_crit_edge
verifyNullCont24.end_crit_edge: ; preds = %verifyNullCont24
ret void
verifyNullCont27: ; preds = %verifyNullCont24
br i1 undef, label %verifyNullCont32, label %verifyNullCont27.end_crit_edge
verifyNullCont27.end_crit_edge: ; preds = %verifyNullCont27
ret void
verifyNullCont32: ; preds = %verifyNullCont27
br i1 undef, label %verifyNullExit33, label %verifyNullCont34
verifyNullExit33: ; preds = %verifyNullCont32
ret void
verifyNullCont34: ; preds = %verifyNullCont32
br i1 undef, label %"no exception block35", label %verifyNullCont34.end_crit_edge
verifyNullCont34.end_crit_edge: ; preds = %verifyNullCont34
ret void
"no exception block35": ; preds = %verifyNullCont34
br i1 undef, label %end, label %verifyNullCont60
verifyNullCont60: ; preds = %"no exception block35"
br i1 undef, label %verifyNullCont63, label %verifyNullCont60.end_crit_edge
verifyNullCont60.end_crit_edge: ; preds = %verifyNullCont60
ret void
verifyNullCont63: ; preds = %verifyNullCont60
br i1 undef, label %"no exception block64", label %verifyNullCont63.end_crit_edge
verifyNullCont63.end_crit_edge: ; preds = %verifyNullCont63
ret void
"no exception block64": ; preds = %verifyNullCont63
br i1 undef, label %verifyNullExit65, label %verifyNullCont66
verifyNullExit65: ; preds = %"no exception block64"
ret void
verifyNullCont66: ; preds = %"no exception block64"
br i1 undef, label %"no exception block67", label %verifyNullCont66.end_crit_edge
verifyNullCont66.end_crit_edge: ; preds = %verifyNullCont66
ret void
"no exception block67": ; preds = %verifyNullCont66
br i1 undef, label %verifyNullExit68, label %verifyNullCont69
verifyNullExit68: ; preds = %"no exception block67"
ret void
verifyNullCont69: ; preds = %"no exception block67"
br i1 undef, label %"no exception block70", label %verifyNullCont69.end_crit_edge
verifyNullCont69.end_crit_edge: ; preds = %verifyNullCont69
ret void
"no exception block70": ; preds = %verifyNullCont69
br i1 undef, label %verifyNullExit71, label %verifyNullCont72
verifyNullExit71: ; preds = %"no exception block70"
ret void
verifyNullCont72: ; preds = %"no exception block70"
br i1 undef, label %verifyNullCont75, label %verifyNullCont72.end_crit_edge
verifyNullCont72.end_crit_edge: ; preds = %verifyNullCont72
ret void
verifyNullCont75: ; preds = %verifyNullCont72
br i1 undef, label %verifyNullCont78, label %verifyNullCont75.end_crit_edge
verifyNullCont75.end_crit_edge: ; preds = %verifyNullCont75
ret void
verifyNullCont78: ; preds = %verifyNullCont75
br i1 undef, label %"verifyNullCont78.GOTO or IF*4_crit_edge", label %verifyNullCont78.end_crit_edge
"verifyNullCont78.GOTO or IF*4_crit_edge": ; preds = %verifyNullCont78
br i1 undef, label %verifyNullExit80, label %verifyNullCont81
verifyNullCont78.end_crit_edge: ; preds = %verifyNullCont78
ret void
verifyNullExit80: ; preds = %"verifyNullCont78.GOTO or IF*4_crit_edge"
ret void
verifyNullCont81: ; preds = %"verifyNullCont78.GOTO or IF*4_crit_edge"
%4 = ptrtoint i8* undef to i32 ; <i32> [#uses=2]
%5 = icmp slt i32 0, %4 ; <i1> [#uses=1]
br i1 %5, label %verifyNullCont84, label %verifyNullCont172
verifyNullCont84: ; preds = %verifyNullCont81
br i1 undef, label %"GOTO or IF*2", label %verifyNullCont86
verifyNullCont86: ; preds = %verifyNullCont84
br i1 undef, label %"true verifyAndComputePtr", label %"false verifyAndComputePtr"
"true verifyAndComputePtr": ; preds = %verifyNullCont86
br i1 undef, label %"true verifyAndComputePtr89", label %"false verifyAndComputePtr90"
"false verifyAndComputePtr": ; preds = %verifyNullCont86
ret void
"true verifyAndComputePtr89": ; preds = %"true verifyAndComputePtr"
br i1 undef, label %"GOTO or IF*6", label %"GOTO or IF*2"
"false verifyAndComputePtr90": ; preds = %"true verifyAndComputePtr"
ret void
verifyNullCont126: ; preds = %"GOTO or IF*6"
br i1 undef, label %"true verifyAndComputePtr127", label %"false verifyAndComputePtr128"
"true verifyAndComputePtr127": ; preds = %verifyNullCont126
br i1 undef, label %"true verifyAndComputePtr131.GOTO or IF*6_crit_edge", label %"GOTO or IF*5"
"false verifyAndComputePtr128": ; preds = %verifyNullCont126
ret void
"true verifyAndComputePtr131.GOTO or IF*6_crit_edge": ; preds = %"true verifyAndComputePtr127"
%indvar.next = add i32 %indvar, 1 ; <i32> [#uses=1]
br label %"GOTO or IF*6"
verifyNullCont172: ; preds = %verifyNullCont81
unreachable
}


@@ -0,0 +1,150 @@
; RUN: llvm-as < %s | opt -analyze -scalar-evolution -disable-output \
; RUN: | grep {Loop bb7.i: Unpredictable backedge-taken count\\.}
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
%struct.complex = type { float, float }
%struct.element = type { i32, i32 }
%struct.node = type { %struct.node*, %struct.node*, i32 }
@seed = external global i64 ; <i64*> [#uses=0]
@_2E_str = external constant [18 x i8], align 1 ; <[18 x i8]*> [#uses=0]
@_2E_str1 = external constant [4 x i8], align 1 ; <[4 x i8]*> [#uses=0]
@value = external global float ; <float*> [#uses=0]
@fixed = external global float ; <float*> [#uses=0]
@floated = external global float ; <float*> [#uses=0]
@permarray = external global [11 x i32], align 32 ; <[11 x i32]*> [#uses=0]
@pctr = external global i32 ; <i32*> [#uses=0]
@tree = external global %struct.node* ; <%struct.node**> [#uses=0]
@stack = external global [4 x i32], align 16 ; <[4 x i32]*> [#uses=0]
@cellspace = external global [19 x %struct.element], align 32 ; <[19 x %struct.element]*> [#uses=0]
@freelist = external global i32 ; <i32*> [#uses=0]
@movesdone = external global i32 ; <i32*> [#uses=0]
@ima = external global [41 x [41 x i32]], align 32 ; <[41 x [41 x i32]]*> [#uses=0]
@imb = external global [41 x [41 x i32]], align 32 ; <[41 x [41 x i32]]*> [#uses=0]
@imr = external global [41 x [41 x i32]], align 32 ; <[41 x [41 x i32]]*> [#uses=0]
@rma = external global [41 x [41 x float]], align 32 ; <[41 x [41 x float]]*> [#uses=0]
@rmb = external global [41 x [41 x float]], align 32 ; <[41 x [41 x float]]*> [#uses=0]
@rmr = external global [41 x [41 x float]], align 32 ; <[41 x [41 x float]]*> [#uses=0]
@piececount = external global [4 x i32], align 16 ; <[4 x i32]*> [#uses=0]
@class = external global [13 x i32], align 32 ; <[13 x i32]*> [#uses=0]
@piecemax = external global [13 x i32], align 32 ; <[13 x i32]*> [#uses=0]
@puzzl = external global [512 x i32], align 32 ; <[512 x i32]*> [#uses=0]
@p = external global [13 x [512 x i32]], align 32 ; <[13 x [512 x i32]]*> [#uses=0]
@n = external global i32 ; <i32*> [#uses=0]
@kount = external global i32 ; <i32*> [#uses=0]
@sortlist = external global [5001 x i32], align 32 ; <[5001 x i32]*> [#uses=0]
@biggest = external global i32 ; <i32*> [#uses=0]
@littlest = external global i32 ; <i32*> [#uses=0]
@top = external global i32 ; <i32*> [#uses=0]
@z = external global [257 x %struct.complex], align 32 ; <[257 x %struct.complex]*> [#uses=0]
@w = external global [257 x %struct.complex], align 32 ; <[257 x %struct.complex]*> [#uses=0]
@e = external global [130 x %struct.complex], align 32 ; <[130 x %struct.complex]*> [#uses=0]
@zr = external global float ; <float*> [#uses=0]
@zi = external global float ; <float*> [#uses=0]
declare void @Initrand() nounwind
declare i32 @Rand() nounwind
declare void @Try(i32, i32*, i32*, i32*, i32*, i32*) nounwind
declare i32 @puts(i8* nocapture) nounwind
declare void @Queens(i32) nounwind
declare i32 @printf(i8* nocapture, ...) nounwind
declare i32 @main() nounwind
declare void @Doit() nounwind
declare void @Doit_bb7([15 x i32]*, [17 x i32]*, [9 x i32]*) nounwind
define void @Doit_bb7_2E_i([9 x i32]* %x1, [15 x i32]* %c, [17 x i32]* %b, [9 x i32]* %a, i32* %q, i32* %x1.sub, i32* %b9, i32* %a10, i32* %c11) nounwind {
newFuncRoot:
br label %bb7.i
Try.exit.exitStub: ; preds = %bb7.i
ret void
bb.i: ; preds = %bb7.i
%tmp = add i32 %j.0.i, 1 ; <i32> [#uses=5]
store i32 0, i32* %q, align 4
%tmp1 = sext i32 %tmp to i64 ; <i64> [#uses=1]
%tmp2 = getelementptr [9 x i32]* %a, i64 0, i64 %tmp1 ; <i32*> [#uses=1]
%tmp3 = load i32* %tmp2, align 4 ; <i32> [#uses=1]
%tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
br i1 %tmp4, label %bb.i.bb7.i.backedge_crit_edge, label %bb1.i
bb1.i: ; preds = %bb.i
%tmp5 = add i32 %j.0.i, 2 ; <i32> [#uses=1]
%tmp6 = sext i32 %tmp5 to i64 ; <i64> [#uses=1]
%tmp7 = getelementptr [17 x i32]* %b, i64 0, i64 %tmp6 ; <i32*> [#uses=1]
%tmp8 = load i32* %tmp7, align 4 ; <i32> [#uses=1]
%tmp9 = icmp eq i32 %tmp8, 0 ; <i1> [#uses=1]
br i1 %tmp9, label %bb1.i.bb7.i.backedge_crit_edge, label %bb2.i
bb2.i: ; preds = %bb1.i
%tmp10 = sub i32 7, %j.0.i ; <i32> [#uses=1]
%tmp11 = sext i32 %tmp10 to i64 ; <i64> [#uses=1]
%tmp12 = getelementptr [15 x i32]* %c, i64 0, i64 %tmp11 ; <i32*> [#uses=1]
%tmp13 = load i32* %tmp12, align 4 ; <i32> [#uses=1]
%tmp14 = icmp eq i32 %tmp13, 0 ; <i1> [#uses=1]
br i1 %tmp14, label %bb2.i.bb7.i.backedge_crit_edge, label %bb3.i
bb3.i: ; preds = %bb2.i
%tmp15 = getelementptr [9 x i32]* %x1, i64 0, i64 1 ; <i32*> [#uses=1]
store i32 %tmp, i32* %tmp15, align 4
%tmp16 = sext i32 %tmp to i64 ; <i64> [#uses=1]
%tmp17 = getelementptr [9 x i32]* %a, i64 0, i64 %tmp16 ; <i32*> [#uses=1]
store i32 0, i32* %tmp17, align 4
%tmp18 = add i32 %j.0.i, 2 ; <i32> [#uses=1]
%tmp19 = sext i32 %tmp18 to i64 ; <i64> [#uses=1]
%tmp20 = getelementptr [17 x i32]* %b, i64 0, i64 %tmp19 ; <i32*> [#uses=1]
store i32 0, i32* %tmp20, align 4
%tmp21 = sub i32 7, %j.0.i ; <i32> [#uses=1]
%tmp22 = sext i32 %tmp21 to i64 ; <i64> [#uses=1]
%tmp23 = getelementptr [15 x i32]* %c, i64 0, i64 %tmp22 ; <i32*> [#uses=1]
store i32 0, i32* %tmp23, align 4
call void @Try(i32 2, i32* %q, i32* %b9, i32* %a10, i32* %c11, i32* %x1.sub) nounwind
%tmp24 = load i32* %q, align 4 ; <i32> [#uses=1]
%tmp25 = icmp eq i32 %tmp24, 0 ; <i1> [#uses=1]
br i1 %tmp25, label %bb5.i, label %bb3.i.bb7.i.backedge_crit_edge
bb5.i: ; preds = %bb3.i
%tmp26 = sext i32 %tmp to i64 ; <i64> [#uses=1]
%tmp27 = getelementptr [9 x i32]* %a, i64 0, i64 %tmp26 ; <i32*> [#uses=1]
store i32 1, i32* %tmp27, align 4
%tmp28 = add i32 %j.0.i, 2 ; <i32> [#uses=1]
%tmp29 = sext i32 %tmp28 to i64 ; <i64> [#uses=1]
%tmp30 = getelementptr [17 x i32]* %b, i64 0, i64 %tmp29 ; <i32*> [#uses=1]
store i32 1, i32* %tmp30, align 4
%tmp31 = sub i32 7, %j.0.i ; <i32> [#uses=1]
%tmp32 = sext i32 %tmp31 to i64 ; <i64> [#uses=1]
%tmp33 = getelementptr [15 x i32]* %c, i64 0, i64 %tmp32 ; <i32*> [#uses=1]
store i32 1, i32* %tmp33, align 4
br label %bb7.i.backedge
bb7.i.backedge: ; preds = %bb3.i.bb7.i.backedge_crit_edge, %bb2.i.bb7.i.backedge_crit_edge, %bb1.i.bb7.i.backedge_crit_edge, %bb.i.bb7.i.backedge_crit_edge, %bb5.i
br label %bb7.i
bb7.i: ; preds = %bb7.i.backedge, %newFuncRoot
%j.0.i = phi i32 [ 0, %newFuncRoot ], [ %tmp, %bb7.i.backedge ] ; <i32> [#uses=8]
%tmp34 = load i32* %q, align 4 ; <i32> [#uses=1]
%tmp35 = icmp eq i32 %tmp34, 0 ; <i1> [#uses=1]
%tmp36 = icmp ne i32 %j.0.i, 8 ; <i1> [#uses=1]
%tmp37 = and i1 %tmp35, %tmp36 ; <i1> [#uses=1]
br i1 %tmp37, label %bb.i, label %Try.exit.exitStub
bb.i.bb7.i.backedge_crit_edge: ; preds = %bb.i
br label %bb7.i.backedge
bb1.i.bb7.i.backedge_crit_edge: ; preds = %bb1.i
br label %bb7.i.backedge
bb2.i.bb7.i.backedge_crit_edge: ; preds = %bb2.i
br label %bb7.i.backedge
bb3.i.bb7.i.backedge_crit_edge: ; preds = %bb3.i
br label %bb7.i.backedge
}


@@ -1,5 +1,8 @@
; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -relocation-model=pic \
; RUN: -mattr=+v6 -ifcvt-limit=0 -stats |& grep asm-printer | grep 35
; RUN: -mattr=+v6 | grep r9
; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -relocation-model=pic \
; RUN: -mattr=+v6 -arm-reserve-r9 -ifcvt-limit=0 -stats |& grep asm-printer \
; RUN: | grep 35
define void @test(i32 %tmp56222, i32 %tmp36224, i32 %tmp46223, i32 %i.0196.0.ph, i32 %tmp8, i32* %tmp1011, i32** %tmp1, i32* %d2.1.out, i32* %d3.1.out, i32* %d0.1.out, i32* %d1.1.out) {
newFuncRoot:


@@ -0,0 +1,62 @@
; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin -relocation-model=pic -mattr=+v6,+vfp2
@"\01LC" = external constant [15 x i8] ; <[15 x i8]*> [#uses=1]
declare i32 @printf(i8* nocapture, ...) nounwind
define i32 @main() nounwind {
entry:
br label %bb.i1.i
bb.i1.i: ; preds = %Cos.exit.i.i, %entry
br label %bb.i.i.i
bb.i.i.i: ; preds = %bb.i.i.i, %bb.i1.i
br i1 undef, label %Cos.exit.i.i, label %bb.i.i.i
Cos.exit.i.i: ; preds = %bb.i.i.i
br i1 undef, label %bb2.i.i, label %bb.i1.i
bb2.i.i: ; preds = %Cos.exit.i.i
br label %bb3.i.i
bb3.i.i: ; preds = %bb5.i.i, %bb2.i.i
br label %bb4.i.i
bb4.i.i: ; preds = %bb4.i.i, %bb3.i.i
br i1 undef, label %bb5.i.i, label %bb4.i.i
bb5.i.i: ; preds = %bb4.i.i
br i1 undef, label %bb.i, label %bb3.i.i
bb.i: ; preds = %bb.i, %bb5.i.i
br i1 undef, label %bb1.outer2.i.i.outer, label %bb.i
bb1.outer2.i.i.outer: ; preds = %Fft.exit.i, %bb5.i12.i, %bb.i
br label %bb1.outer2.i.i
bb1.outer2.i.i: ; preds = %bb2.i9.i, %bb1.outer2.i.i.outer
br label %bb1.i.i
bb1.i.i: ; preds = %bb1.i.i, %bb1.outer2.i.i
br i1 undef, label %bb2.i9.i, label %bb1.i.i
bb2.i9.i: ; preds = %bb1.i.i
br i1 undef, label %bb4.i11.i, label %bb1.outer2.i.i
bb4.i11.i: ; preds = %bb4.i11.i, %bb2.i9.i
br i1 undef, label %bb5.i12.i, label %bb4.i11.i
bb5.i12.i: ; preds = %bb4.i11.i
br i1 undef, label %bb7.i.i, label %bb1.outer2.i.i.outer
bb7.i.i: ; preds = %bb7.i.i, %bb5.i12.i
br i1 undef, label %Fft.exit.i, label %bb7.i.i
Fft.exit.i: ; preds = %bb7.i.i
br i1 undef, label %bb5.i, label %bb1.outer2.i.i.outer
bb5.i: ; preds = %Fft.exit.i
%0 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([15 x i8]* @"\01LC", i32 0, i32 0), double undef, double undef) nounwind ; <i32> [#uses=0]
unreachable
}


@@ -0,0 +1,43 @@
; RUN: llvm-as < %s | llc -mtriple=armv6-apple-darwin
%struct.rtunion = type { i64 }
%struct.rtx_def = type { i16, i8, i8, [1 x %struct.rtunion] }
define arm_apcscc void @simplify_unary_real(i8* nocapture %p) nounwind {
entry:
%tmp121 = load i64* null, align 4 ; <i64> [#uses=1]
%0 = getelementptr %struct.rtx_def* null, i32 0, i32 3, i32 3, i32 0 ; <i64*> [#uses=1]
%tmp122 = load i64* %0, align 4 ; <i64> [#uses=1]
%1 = zext i64 undef to i192 ; <i192> [#uses=2]
%2 = zext i64 %tmp121 to i192 ; <i192> [#uses=1]
%3 = shl i192 %2, 64 ; <i192> [#uses=2]
%4 = zext i64 %tmp122 to i192 ; <i192> [#uses=1]
%5 = shl i192 %4, 128 ; <i192> [#uses=1]
%6 = or i192 %3, %1 ; <i192> [#uses=1]
%7 = or i192 %6, %5 ; <i192> [#uses=2]
switch i32 undef, label %bb82 [
i32 77, label %bb38
i32 129, label %bb21
i32 130, label %bb20
]
bb20: ; preds = %entry
ret void
bb21: ; preds = %entry
br i1 undef, label %bb82, label %bb29
bb29: ; preds = %bb21
%tmp18.i = and i192 %3, 1208907372870555465154560 ; <i192> [#uses=1]
%mask.i = or i192 %tmp18.i, %1 ; <i192> [#uses=1]
%mask41.i = or i192 %mask.i, 0 ; <i192> [#uses=1]
br label %bb82
bb38: ; preds = %entry
br label %bb82
bb82: ; preds = %bb38, %bb29, %bb21, %entry
%d.0 = phi i192 [ %mask41.i, %bb29 ], [ undef, %bb38 ], [ %7, %entry ], [ %7, %bb21 ] ; <i192> [#uses=1]
%tmp51 = trunc i192 %d.0 to i64 ; <i64> [#uses=0]
ret void
}


@@ -0,0 +1,12 @@
; RUN: llvm-as < %s | llc -mtriple=arm-apple-darwin | grep mov | grep r7
; RUN: llvm-as < %s | llc -mtriple=arm-linux-gnueabi | grep mov | grep r11
; PR4344
; PR4416
define arm_aapcscc i8* @t() nounwind {
entry:
%0 = call i8* @llvm.frameaddress(i32 0)
ret i8* %0
}
declare i8* @llvm.frameaddress(i32) nounwind readnone


@@ -0,0 +1,7 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vadd
define <8 x i8> @t_i8x8(<8 x i8> %a, <8 x i8> %b) nounwind {
entry:
%0 = add <8 x i8> %a, %b
ret <8 x i8> %0
}


@@ -0,0 +1,22 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fldd | count 4
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fstd
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fmrrd
define void @t1(<2 x i32>* %r, <4 x i16>* %a, <4 x i16>* %b) nounwind {
entry:
%0 = load <4 x i16>* %a, align 8 ; <<4 x i16>> [#uses=1]
%1 = load <4 x i16>* %b, align 8 ; <<4 x i16>> [#uses=1]
%2 = add <4 x i16> %0, %1 ; <<4 x i16>> [#uses=1]
%3 = bitcast <4 x i16> %2 to <2 x i32> ; <<2 x i32>> [#uses=1]
store <2 x i32> %3, <2 x i32>* %r, align 8
ret void
}
define <2 x i32> @t2(<4 x i16>* %a, <4 x i16>* %b) nounwind readonly {
entry:
%0 = load <4 x i16>* %a, align 8 ; <<4 x i16>> [#uses=1]
%1 = load <4 x i16>* %b, align 8 ; <<4 x i16>> [#uses=1]
%2 = sub <4 x i16> %0, %1 ; <<4 x i16>> [#uses=1]
%3 = bitcast <4 x i16> %2 to <2 x i32> ; <<2 x i32>> [#uses=1]
ret <2 x i32> %3
}


@@ -0,0 +1,23 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vldmia | count 4
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep vstmia | count 1
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | grep fmrrd | count 2
define void @t1(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind {
entry:
%0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1]
%1 = load <2 x i64>* %b, align 16 ; <<2 x i64>> [#uses=1]
%2 = add <2 x i64> %0, %1 ; <<2 x i64>> [#uses=1]
%3 = bitcast <2 x i64> %2 to <4 x i32> ; <<4 x i32>> [#uses=1]
store <4 x i32> %3, <4 x i32>* %r, align 16
ret void
}
define <4 x i32> @t2(<2 x i64>* %a, <2 x i64>* %b) nounwind readonly {
entry:
%0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1]
%1 = load <2 x i64>* %b, align 16 ; <<2 x i64>> [#uses=1]
%2 = sub <2 x i64> %0, %1 ; <<2 x i64>> [#uses=1]
%3 = bitcast <2 x i64> %2 to <4 x i32> ; <<4 x i32>> [#uses=1]
ret <4 x i32> %3
}

test/CodeGen/ARM/vaba.ll (new file, 119 lines)

@@ -0,0 +1,119 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vaba\\.s8} %t | count 2
; RUN: grep {vaba\\.s16} %t | count 2
; RUN: grep {vaba\\.s32} %t | count 2
; RUN: grep {vaba\\.u8} %t | count 2
; RUN: grep {vaba\\.u16} %t | count 2
; RUN: grep {vaba\\.u32} %t | count 2
define <8 x i8> @vabas8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i8> %tmp4
}
define <4 x i16> @vabas16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i16> %tmp4
}
define <2 x i32> @vabas32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i32> %tmp4
}
define <8 x i8> @vabau8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i8> %tmp4
}
define <4 x i16> @vabau16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i16> %tmp4
}
define <2 x i32> @vabau32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i32> %tmp4
}
define <16 x i8> @vabaQs8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = call <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3)
ret <16 x i8> %tmp4
}
define <8 x i16> @vabaQs16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabaQs32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3)
ret <4 x i32> %tmp4
}
define <16 x i8> @vabaQu8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = call <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i8> %tmp3)
ret <16 x i8> %tmp4
}
define <8 x i16> @vabaQu16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i16> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabaQu32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> %tmp3)
ret <4 x i32> %tmp4
}
declare <8 x i8> @llvm.arm.neon.vabas.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabas.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabas.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vabau.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabau.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabau.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabas.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabas.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone

test/CodeGen/ARM/vabal.ll (new file, 63 lines)

@@ -0,0 +1,63 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vabal\\.s8} %t | count 1
; RUN: grep {vabal\\.s16} %t | count 1
; RUN: grep {vabal\\.s32} %t | count 1
; RUN: grep {vabal\\.u8} %t | count 1
; RUN: grep {vabal\\.u16} %t | count 1
; RUN: grep {vabal\\.u32} %t | count 1
define <8 x i16> @vabals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vabals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <8 x i16> @vabalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vabalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vabalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vabd.ll (new file, 126 lines)

@@ -0,0 +1,126 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vabd\\.s8} %t | count 2
; RUN: grep {vabd\\.s16} %t | count 2
; RUN: grep {vabd\\.s32} %t | count 2
; RUN: grep {vabd\\.u8} %t | count 2
; RUN: grep {vabd\\.u16} %t | count 2
; RUN: grep {vabd\\.u32} %t | count 2
; RUN: grep {vabd\\.f32} %t | count 2
define <8 x i8> @vabds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vabds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vabds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vabdu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vabdu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vabdu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vabdf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <16 x i8> @vabdQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vabdQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vabdQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vabdQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <4 x float> @vabdQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float>, <4 x float>) nounwind readnone

test/CodeGen/ARM/vabdl.ll (new file, 57 lines)

@@ -0,0 +1,57 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vabdl\\.s8} %t | count 1
; RUN: grep {vabdl\\.s16} %t | count 1
; RUN: grep {vabdl\\.s32} %t | count 1
; RUN: grep {vabdl\\.u8} %t | count 1
; RUN: grep {vabdl\\.u16} %t | count 1
; RUN: grep {vabdl\\.u32} %t | count 1
define <8 x i16> @vabdls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vabdls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vabdlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vabdlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vabdlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

test/CodeGen/ARM/vabs.ll (new file, 64 lines)

@@ -0,0 +1,64 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vabs\\.s8} %t | count 2
; RUN: grep {vabs\\.s16} %t | count 2
; RUN: grep {vabs\\.s32} %t | count 2
; RUN: grep {vabs\\.f32} %t | count 2
define <8 x i8> @vabss8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vabss16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vabss32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <2 x float> @vabsf32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float> %tmp1)
ret <2 x float> %tmp2
}
define <16 x i8> @vabsQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vabsQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vabsQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
define <4 x float> @vabsQf32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float> %tmp1)
ret <4 x float> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float>) nounwind readnone

19
test/CodeGen/ARM/vacge.ll Normal file
View File

@ -0,0 +1,19 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vacge\\.f32} %t | count 2
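; Annotation: vacge compares the absolute values of its operands (|A| >= |B|)
; per lane and returns an all-ones/all-zeros mask, which is why the result is
; an integer vector even though the inputs are floating point.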
define <2 x i32> @vacgef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x i32> %tmp3
}
define <4 x i32> @vacgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) nounwind readnone

19
test/CodeGen/ARM/vacgt.ll Normal file
View File

@ -0,0 +1,19 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vacgt\\.f32} %t | count 2
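; Annotation: vacgt is the strict counterpart of vacge: it tests |A| > |B|
; per lane and likewise produces an integer mask from floating-point inputs.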
define <2 x i32> @vacgtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x i32> %tmp3
}
define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x i32> %tmp3
}
declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) nounwind readnone

76
test/CodeGen/ARM/vadd.ll Normal file
View File

@ -0,0 +1,76 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vadd\\.i8} %t | count 2
; RUN: grep {vadd\\.i16} %t | count 2
; RUN: grep {vadd\\.i32} %t | count 2
; RUN: grep {vadd\\.i64} %t | count 2
; RUN: grep {vadd\\.f32} %t | count 2
define <8 x i8> @vaddi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = add <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vaddi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = add <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vaddi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = add <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @vaddi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = add <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <2 x float> @vaddf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = add <2 x float> %tmp1, %tmp2
ret <2 x float> %tmp3
}
define <16 x i8> @vaddQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = add <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vaddQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = add <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = add <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = add <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}
define <4 x float> @vaddQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = add <4 x float> %tmp1, %tmp2
ret <4 x float> %tmp3
}

View File

@ -0,0 +1,29 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vaddhn\\.i16} %t | count 1
; RUN: grep {vaddhn\\.i32} %t | count 1
; RUN: grep {vaddhn\\.i64} %t | count 1
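; Annotation: vaddhn adds two wide vectors and keeps only the high half of
; each result lane, narrowing the element type (e.g. <8 x i16> + <8 x i16>
; yields <8 x i8>), which matches the intrinsic signatures below.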
define <8 x i8> @vaddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vaddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vaddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
ret <2 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

57
test/CodeGen/ARM/vaddl.ll Normal file
View File

@ -0,0 +1,57 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vaddl\\.s8} %t | count 1
; RUN: grep {vaddl\\.s16} %t | count 1
; RUN: grep {vaddl\\.s32} %t | count 1
; RUN: grep {vaddl\\.u8} %t | count 1
; RUN: grep {vaddl\\.u16} %t | count 1
; RUN: grep {vaddl\\.u32} %t | count 1
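; Annotation: vaddl widens both operands to double-width lanes before adding;
; the .s and .u suffixes select sign- or zero-extension of the inputs.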
define <8 x i16> @vaddls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vaddlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone

57
test/CodeGen/ARM/vaddw.ll Normal file
View File

@ -0,0 +1,57 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vaddw\\.s8} %t | count 1
; RUN: grep {vaddw\\.s16} %t | count 1
; RUN: grep {vaddw\\.s32} %t | count 1
; RUN: grep {vaddw\\.u8} %t | count 1
; RUN: grep {vaddw\\.u16} %t | count 1
; RUN: grep {vaddw\\.u32} %t | count 1
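; Annotation: vaddw widens only the second (narrow) operand to match the
; first, so the result has the same type as the first operand.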
define <8 x i16> @vaddws8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddws16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddws32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
define <8 x i16> @vaddwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vaddwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2)
ret <4 x i32> %tmp3
}
define <2 x i64> @vaddwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2)
ret <2 x i64> %tmp3
}
declare <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone

59
test/CodeGen/ARM/vand.ll Normal file
View File

@ -0,0 +1,59 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vand %t | count 8
; Note: function names do not include "vand" to allow simple grep for opcodes
define <8 x i8> @v_andi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = and <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @v_andi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = and <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @v_andi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = and <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @v_andi64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = and <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <16 x i8> @v_andQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = and <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @v_andQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = and <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @v_andQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = and <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @v_andQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = and <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}

67
test/CodeGen/ARM/vbic.ll Normal file
View File

@ -0,0 +1,67 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vbic %t | count 8
; Note: function names do not include "vbic" to allow simple grep for opcodes
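; Annotation: VBIC computes A & ~B. LLVM IR has no and-not instruction, so
; each test spells ~B as an xor with all-ones; instruction selection is
; expected to fold every xor+and pair below into a single vbic.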
define <8 x i8> @v_bici8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = and <8 x i8> %tmp1, %tmp3
ret <8 x i8> %tmp4
}
define <4 x i16> @v_bici16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = and <4 x i16> %tmp1, %tmp3
ret <4 x i16> %tmp4
}
define <2 x i32> @v_bici32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 >
%tmp4 = and <2 x i32> %tmp1, %tmp3
ret <2 x i32> %tmp4
}
define <1 x i64> @v_bici64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = xor <1 x i64> %tmp2, < i64 -1 >
%tmp4 = and <1 x i64> %tmp1, %tmp3
ret <1 x i64> %tmp4
}
define <16 x i8> @v_bicQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp4 = and <16 x i8> %tmp1, %tmp3
ret <16 x i8> %tmp4
}
define <8 x i16> @v_bicQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp4 = and <8 x i16> %tmp1, %tmp3
ret <8 x i16> %tmp4
}
define <4 x i32> @v_bicQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 >
%tmp4 = and <4 x i32> %tmp1, %tmp3
ret <4 x i32> %tmp4
}
define <2 x i64> @v_bicQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 >
%tmp4 = and <2 x i64> %tmp1, %tmp3
ret <2 x i64> %tmp4
}

91
test/CodeGen/ARM/vbsl.ll Normal file
View File

@ -0,0 +1,91 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vbsl %t | count 8
; Note: function names do not include "vbsl" to allow simple grep for opcodes
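; Annotation: VBSL is a bitwise select: with mask A, the result is
; (A & B) | (~A & C). Each function spells out that identity with and/xor/or
; so the backend can match the three operations to one vbsl.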
define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = and <8 x i8> %tmp1, %tmp2
%tmp5 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp6 = and <8 x i8> %tmp5, %tmp3
%tmp7 = or <8 x i8> %tmp4, %tmp6
ret <8 x i8> %tmp7
}
define <4 x i16> @v_bsli16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = and <4 x i16> %tmp1, %tmp2
%tmp5 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp6 = and <4 x i16> %tmp5, %tmp3
%tmp7 = or <4 x i16> %tmp4, %tmp6
ret <4 x i16> %tmp7
}
define <2 x i32> @v_bsli32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = and <2 x i32> %tmp1, %tmp2
%tmp5 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 >
%tmp6 = and <2 x i32> %tmp5, %tmp3
%tmp7 = or <2 x i32> %tmp4, %tmp6
ret <2 x i32> %tmp7
}
define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = load <1 x i64>* %C
%tmp4 = and <1 x i64> %tmp1, %tmp2
%tmp5 = xor <1 x i64> %tmp1, < i64 -1 >
%tmp6 = and <1 x i64> %tmp5, %tmp3
%tmp7 = or <1 x i64> %tmp4, %tmp6
ret <1 x i64> %tmp7
}
define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = and <16 x i8> %tmp1, %tmp2
%tmp5 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
%tmp6 = and <16 x i8> %tmp5, %tmp3
%tmp7 = or <16 x i8> %tmp4, %tmp6
ret <16 x i8> %tmp7
}
define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = and <8 x i16> %tmp1, %tmp2
%tmp5 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
%tmp6 = and <8 x i16> %tmp5, %tmp3
%tmp7 = or <8 x i16> %tmp4, %tmp6
ret <8 x i16> %tmp7
}
define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = and <4 x i32> %tmp1, %tmp2
%tmp5 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 >
%tmp6 = and <4 x i32> %tmp5, %tmp3
%tmp7 = or <4 x i32> %tmp4, %tmp6
ret <4 x i32> %tmp7
}
define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = load <2 x i64>* %C
%tmp4 = and <2 x i64> %tmp1, %tmp2
%tmp5 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 >
%tmp6 = and <2 x i64> %tmp5, %tmp3
%tmp7 = or <2 x i64> %tmp4, %tmp6
ret <2 x i64> %tmp7
}

61
test/CodeGen/ARM/vceq.ll Normal file
View File

@ -0,0 +1,61 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vceq\\.i8} %t | count 2
; RUN: grep {vceq\\.i16} %t | count 2
; RUN: grep {vceq\\.i32} %t | count 2
; RUN: grep {vceq\\.f32} %t | count 2
define <8 x i8> @vceqi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp eq <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vceqi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp eq <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vceqi32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp eq <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <2 x i32> @vceqf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp oeq <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <16 x i8> @vceqQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp eq <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vceqQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp eq <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vceqQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp eq <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <4 x i32> @vceqQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = vfcmp oeq <4 x float> %tmp1, %tmp2
ret <4 x i32> %tmp3
}

106
test/CodeGen/ARM/vcge.ll Normal file
View File

@ -0,0 +1,106 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcge\\.s8} %t | count 2
; RUN: grep {vcge\\.s16} %t | count 2
; RUN: grep {vcge\\.s32} %t | count 2
; RUN: grep {vcge\\.u8} %t | count 2
; RUN: grep {vcge\\.u16} %t | count 2
; RUN: grep {vcge\\.u32} %t | count 2
; RUN: grep {vcge\\.f32} %t | count 2
define <8 x i8> @vcges8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp sge <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vcges16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp sge <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vcges32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp sge <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <8 x i8> @vcgeu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp uge <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vcgeu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp uge <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vcgeu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp uge <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <2 x i32> @vcgef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp oge <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <16 x i8> @vcgeQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp sge <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vcgeQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp sge <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vcgeQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp sge <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <16 x i8> @vcgeQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp uge <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vcgeQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp uge <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vcgeQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp uge <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <4 x i32> @vcgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = vfcmp oge <4 x float> %tmp1, %tmp2
ret <4 x i32> %tmp3
}

106
test/CodeGen/ARM/vcgt.ll Normal file
View File

@ -0,0 +1,106 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcgt\\.s8} %t | count 2
; RUN: grep {vcgt\\.s16} %t | count 2
; RUN: grep {vcgt\\.s32} %t | count 2
; RUN: grep {vcgt\\.u8} %t | count 2
; RUN: grep {vcgt\\.u16} %t | count 2
; RUN: grep {vcgt\\.u32} %t | count 2
; RUN: grep {vcgt\\.f32} %t | count 2
define <8 x i8> @vcgts8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp sgt <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vcgts16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp sgt <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vcgts32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp sgt <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <8 x i8> @vcgtu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp ugt <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vcgtu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp ugt <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vcgtu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp ugt <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <2 x i32> @vcgtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ogt <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <16 x i8> @vcgtQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp sgt <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vcgtQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp sgt <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vcgtQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp sgt <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <16 x i8> @vcgtQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp ugt <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vcgtQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp ugt <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vcgtQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp ugt <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <4 x i32> @vcgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = vfcmp ogt <4 x float> %tmp1, %tmp2
ret <4 x i32> %tmp3
}

48
test/CodeGen/ARM/vcls.ll Normal file
View File

@ -0,0 +1,48 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcls\\.s8} %t | count 2
; RUN: grep {vcls\\.s16} %t | count 2
; RUN: grep {vcls\\.s32} %t | count 2
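; Annotation: vcls counts, for each lane, how many consecutive bits below the
; sign bit match it (the number of redundant sign bits), which is why only
; signed variants exist.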
define <8 x i8> @vclss8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vclss16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vclss32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone

48
test/CodeGen/ARM/vclz.ll Normal file
View File

@ -0,0 +1,48 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vclz\\.i8} %t | count 2
; RUN: grep {vclz\\.i16} %t | count 2
; RUN: grep {vclz\\.i32} %t | count 2
define <8 x i8> @vclz8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vclz16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vclz32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32> %tmp1)
ret <2 x i32> %tmp2
}
define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32> %tmp1)
ret <4 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32>) nounwind readnone

17
test/CodeGen/ARM/vcnt.ll Normal file
View File

@ -0,0 +1,17 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcnt\\.8} %t | count 2
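; Annotation: vcnt is a per-byte population count, which is why only the .8
; variants exist; wider popcounts can be assembled from it with pairwise
; additions.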
define <8 x i8> @vcnt8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8> %tmp1)
ret <8 x i8> %tmp2
}
define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = call <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8> %tmp1)
ret <16 x i8> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8>) nounwind readnone

53
test/CodeGen/ARM/vcvt.ll Normal file
View File

@ -0,0 +1,53 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcvt\\.s32\\.f32} %t | count 2
; RUN: grep {vcvt\\.u32\\.f32} %t | count 2
; RUN: grep {vcvt\\.f32\\.s32} %t | count 2
; RUN: grep {vcvt\\.f32\\.u32} %t | count 2
define <2 x i32> @vcvt_f32tos32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = fptosi <2 x float> %tmp1 to <2 x i32>
ret <2 x i32> %tmp2
}
define <2 x i32> @vcvt_f32tou32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = fptoui <2 x float> %tmp1 to <2 x i32>
ret <2 x i32> %tmp2
}
define <2 x float> @vcvt_s32tof32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = sitofp <2 x i32> %tmp1 to <2 x float>
ret <2 x float> %tmp2
}
define <2 x float> @vcvt_u32tof32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = uitofp <2 x i32> %tmp1 to <2 x float>
ret <2 x float> %tmp2
}
define <4 x i32> @vcvtQ_f32tos32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = fptosi <4 x float> %tmp1 to <4 x i32>
ret <4 x i32> %tmp2
}
define <4 x i32> @vcvtQ_f32tou32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = fptoui <4 x float> %tmp1 to <4 x i32>
ret <4 x i32> %tmp2
}
define <4 x float> @vcvtQ_s32tof32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = sitofp <4 x i32> %tmp1 to <4 x float>
ret <4 x float> %tmp2
}
define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = uitofp <4 x i32> %tmp1 to <4 x float>
ret <4 x float> %tmp2
}

View File

@ -0,0 +1,64 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vcvt\\.s32\\.f32} %t | count 2
; RUN: grep {vcvt\\.u32\\.f32} %t | count 2
; RUN: grep {vcvt\\.f32\\.s32} %t | count 2
; RUN: grep {vcvt\\.f32\\.u32} %t | count 2
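; Annotation: these intrinsics convert between floating point and fixed
; point; the trailing i32 operand is the number of fraction bits (1 in these
; tests).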
define <2 x i32> @vcvt_f32tos32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %tmp1, i32 1)
ret <2 x i32> %tmp2
}
define <2 x i32> @vcvt_f32tou32(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %tmp1, i32 1)
ret <2 x i32> %tmp2
}
define <2 x float> @vcvt_s32tof32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
ret <2 x float> %tmp2
}
define <2 x float> @vcvt_u32tof32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1)
ret <2 x float> %tmp2
}
declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone
declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
define <4 x i32> @vcvtQ_f32tos32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %tmp1, i32 1)
ret <4 x i32> %tmp2
}
define <4 x i32> @vcvtQ_f32tou32(<4 x float>* %A) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %tmp1, i32 1)
ret <4 x i32> %tmp2
}
define <4 x float> @vcvtQ_s32tof32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
ret <4 x float> %tmp2
}
define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1)
ret <4 x float> %tmp2
}
declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone

134
test/CodeGen/ARM/vdup.ll Normal file
View File

@ -0,0 +1,134 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vdup.8 %t | count 4
; RUN: grep vdup.16 %t | count 4
; RUN: grep vdup.32 %t | count 8
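; Annotation: each splat below is built by inserting the same scalar into
; every lane; the backend should collapse the whole insertelement chain into
; one vdup. The v_shuffledup* functions at the end exercise the
; shufflevector form of the same splat.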
define <8 x i8> @v_dup8(i8 %A) nounwind {
%tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0
%tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1
%tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2
%tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3
%tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4
%tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5
%tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6
%tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7
ret <8 x i8> %tmp8
}
define <4 x i16> @v_dup16(i16 %A) nounwind {
%tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0
%tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1
%tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2
%tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3
ret <4 x i16> %tmp4
}
define <2 x i32> @v_dup32(i32 %A) nounwind {
%tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0
%tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1
ret <2 x i32> %tmp2
}
define <2 x float> @v_dupfloat(float %A) nounwind {
%tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0
%tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1
ret <2 x float> %tmp2
}
define <16 x i8> @v_dupQ8(i8 %A) nounwind {
%tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0
%tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1
%tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2
%tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3
%tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4
%tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5
%tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6
%tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7
%tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8
%tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9
%tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10
%tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11
%tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12
%tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13
%tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14
%tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15
ret <16 x i8> %tmp16
}
define <8 x i16> @v_dupQ16(i16 %A) nounwind {
%tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0
%tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1
%tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2
%tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3
%tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4
%tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5
%tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6
%tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7
ret <8 x i16> %tmp8
}
define <4 x i32> @v_dupQ32(i32 %A) nounwind {
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
%tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
%tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2
%tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3
ret <4 x i32> %tmp4
}
define <4 x float> @v_dupQfloat(float %A) nounwind {
%tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0
%tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1
%tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2
%tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3
ret <4 x float> %tmp4
}
; Check to make sure it works with shuffles, too.
define <8 x i8> @v_shuffledup8(i8 %A) nounwind {
%tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
ret <8 x i8> %tmp2
}
define <4 x i16> @v_shuffledup16(i16 %A) nounwind {
%tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
ret <4 x i16> %tmp2
}
define <2 x i32> @v_shuffledup32(i32 %A) nounwind {
%tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer
ret <2 x i32> %tmp2
}
define <2 x float> @v_shuffledupfloat(float %A) nounwind {
%tmp1 = insertelement <2 x float> undef, float %A, i32 0
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer
ret <2 x float> %tmp2
}
define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind {
%tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0
%tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %tmp2
}
define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind {
%tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0
%tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer
ret <8 x i16> %tmp2
}
define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind {
%tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0
%tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
ret <4 x i32> %tmp2
}
define <4 x float> @v_shuffledupQfloat(float %A) nounwind {
%tmp1 = insertelement <4 x float> undef, float %A, i32 0
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %tmp2
}

View File

@ -0,0 +1,52 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vdup.8 %t | count 2
; RUN: grep vdup.16 %t | count 2
; RUN: grep vdup.32 %t | count 4
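; Annotation: duplicating lane 1 of a vector via a constant shufflevector
; mask should select a vdup that takes a lane index, rather than rebuilding
; the vector lane by lane.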
define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i8> %tmp2
}
define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i16> %tmp2
}
define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x i32> %tmp2
}
define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 >
ret <2 x float> %tmp2
}
define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <16 x i8> %tmp2
}
define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
ret <8 x i16> %tmp2
}
define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x i32> %tmp2
}
define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 >
ret <4 x float> %tmp2
}

59
test/CodeGen/ARM/veor.ll Normal file
View File

@ -0,0 +1,59 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep veor %t | count 8
; Note: function names do not include "veor" to allow simple grep for opcodes
define <8 x i8> @v_eori8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = xor <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @v_eori16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = xor <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @v_eori32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = xor <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <1 x i64> @v_eori64(<1 x i64>* %A, <1 x i64>* %B) nounwind {
%tmp1 = load <1 x i64>* %A
%tmp2 = load <1 x i64>* %B
%tmp3 = xor <1 x i64> %tmp1, %tmp2
ret <1 x i64> %tmp3
}
define <16 x i8> @v_eorQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = xor <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @v_eorQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = xor <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @v_eorQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = xor <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <2 x i64> @v_eorQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
%tmp3 = xor <2 x i64> %tmp1, %tmp2
ret <2 x i64> %tmp3
}

96
test/CodeGen/ARM/vfcmp.ll Normal file
View File

@ -0,0 +1,96 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vceq\\.f32} %t | count 1
; RUN: grep {vcgt\\.f32} %t | count 9
; RUN: grep {vcge\\.f32} %t | count 5
; RUN: grep vorr %t | count 4
; RUN: grep vmvn %t | count 7
; This tests vfcmp operations that do not map directly to NEON instructions.
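; Annotation: NEON only has ordered eq/ge/gt floating-point compares, so the
; remaining predicates are synthesized: each unordered predicate is the
; VMVN-complement of the opposite ordered compare (e.g. une = ~oeq and
; ult(a,b) = ~oge(a,b)), while ueq/one/uno/ord are built from two compares
; combined with VORR, as the per-function comments note.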
; une is implemented with VCEQ/VMVN
define <2 x i32> @vcunef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp une <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; olt is implemented with VCGT
define <2 x i32> @vcoltf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp olt <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ole is implemented with VCGE
define <2 x i32> @vcolef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ole <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; uge is implemented with VCGT/VMVN
define <2 x i32> @vcugef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp uge <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ule is implemented with VCGT/VMVN
define <2 x i32> @vculef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ule <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ugt is implemented with VCGE/VMVN
define <2 x i32> @vcugtf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ugt <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ult is implemented with VCGE/VMVN
define <2 x i32> @vcultf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ult <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ueq is implemented with VCGT/VCGT/VORR/VMVN
define <2 x i32> @vcueqf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ueq <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; one is implemented with VCGT/VCGT/VORR
define <2 x i32> @vconef32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp one <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; uno is implemented with VCGT/VCGE/VORR/VMVN
define <2 x i32> @vcunof32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp uno <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
; ord is implemented with VCGT/VCGE/VORR
define <2 x i32> @vcordf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = vfcmp ord <2 x float> %tmp1, %tmp2
ret <2 x i32> %tmp3
}

View File

@ -0,0 +1,78 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmov\\.s8} %t | count 2
; RUN: grep {vmov\\.s16} %t | count 2
; RUN: grep {vmov\\.u8} %t | count 2
; RUN: grep {vmov\\.u16} %t | count 2
; RUN: grep {vmov\\.32} %t | count 2
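; Annotation: extracting a lane that is then sign- or zero-extended should
; use the signed (vmov.s8/.s16) or unsigned (vmov.u8/.u16) lane moves;
; 32-bit lanes need no extension and use vmov.32.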
define i32 @vget_lanes8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = extractelement <8 x i8> %tmp1, i32 1
%tmp3 = sext i8 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vget_lanes16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = extractelement <4 x i16> %tmp1, i32 1
%tmp3 = sext i16 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vget_laneu8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = extractelement <8 x i8> %tmp1, i32 1
%tmp3 = zext i8 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vget_laneu16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = extractelement <4 x i16> %tmp1, i32 1
%tmp3 = zext i16 %tmp2 to i32
ret i32 %tmp3
}
; Do a vector add to keep the extraction from being done directly from memory.
define i32 @vget_lanei32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = add <2 x i32> %tmp1, %tmp1
%tmp3 = extractelement <2 x i32> %tmp2, i32 1
ret i32 %tmp3
}
define i32 @vgetQ_lanes8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = extractelement <16 x i8> %tmp1, i32 1
%tmp3 = sext i8 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vgetQ_lanes16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = extractelement <8 x i16> %tmp1, i32 1
%tmp3 = sext i16 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vgetQ_laneu8(<16 x i8>* %A) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = extractelement <16 x i8> %tmp1, i32 1
%tmp3 = zext i8 %tmp2 to i32
ret i32 %tmp3
}
define i32 @vgetQ_laneu16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = extractelement <8 x i16> %tmp1, i32 1
%tmp3 = zext i16 %tmp2 to i32
ret i32 %tmp3
}
; Do a vector add to keep the extraction from being done directly from memory.
define i32 @vgetQ_lanei32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = add <4 x i32> %tmp1, %tmp1
%tmp3 = extractelement <4 x i32> %tmp2, i32 1
ret i32 %tmp3
}

107
test/CodeGen/ARM/vhadd.ll Normal file
View File

@ -0,0 +1,107 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vhadd\\.s8} %t | count 2
; RUN: grep {vhadd\\.s16} %t | count 2
; RUN: grep {vhadd\\.s32} %t | count 2
; RUN: grep {vhadd\\.u8} %t | count 2
; RUN: grep {vhadd\\.u16} %t | count 2
; RUN: grep {vhadd\\.u32} %t | count 2
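; Annotation: vhadd is a halving add: each lane computes (a + b) >> 1 without
; losing the carry bit; the .s/.u suffixes select a signed or unsigned shift.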
define <8 x i8> @vhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <16 x i8> @vhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vhaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

107
test/CodeGen/ARM/vhsub.ll Normal file
View File

@ -0,0 +1,107 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vhsub\\.s8} %t | count 2
; RUN: grep {vhsub\\.s16} %t | count 2
; RUN: grep {vhsub\\.s32} %t | count 2
; RUN: grep {vhsub\\.u8} %t | count 2
; RUN: grep {vhsub\\.u16} %t | count 2
; RUN: grep {vhsub\\.u32} %t | count 2
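; Annotation: vhsub is the subtracting counterpart of vhadd: each lane
; computes (a - b) >> 1.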
define <8 x i8> @vhsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vhsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vhsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vhsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vhsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vhsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <16 x i8> @vhsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vhsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vhsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vhsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vhsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vhsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone

85
test/CodeGen/ARM/vicmp.ll Normal file
View File

@ -0,0 +1,85 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vceq\\.i8} %t | count 2
; RUN: grep {vceq\\.i16} %t | count 2
; RUN: grep {vceq\\.i32} %t | count 2
; RUN: grep vmvn %t | count 6
; RUN: grep {vcgt\\.s8} %t | count 1
; RUN: grep {vcge\\.s16} %t | count 1
; RUN: grep {vcgt\\.u16} %t | count 1
; RUN: grep {vcge\\.u32} %t | count 1
; This tests vicmp operations that do not map directly to NEON instructions.
; Not-equal (ne) operations are implemented by VCEQ/VMVN. Less-than (slt/ult)
; and less-than-or-equal (sle/ule) are implemented by swapping the arguments
; to VCGT and VCGE. Test all the operand types for not-equal but only sample
; the other operations.
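; Annotation: for example, the 'vicmp ult' below is expected to be matched as
; a vcgt.u16 with its operands swapped.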
define <8 x i8> @vcnei8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = vicmp ne <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vcnei16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp ne <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vcnei32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = vicmp ne <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <16 x i8> @vcneQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp ne <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vcneQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = vicmp ne <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vcneQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp ne <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <16 x i8> @vcltQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = vicmp slt <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <4 x i16> @vcles16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp sle <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <4 x i16> @vcltu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = vicmp ult <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <4 x i32> @vcleQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = vicmp ule <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}

126
test/CodeGen/ARM/vmax.ll Normal file

@ -0,0 +1,126 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmax\\.s8} %t | count 2
; RUN: grep {vmax\\.s16} %t | count 2
; RUN: grep {vmax\\.s32} %t | count 2
; RUN: grep {vmax\\.u8} %t | count 2
; RUN: grep {vmax\\.u16} %t | count 2
; RUN: grep {vmax\\.u32} %t | count 2
; RUN: grep {vmax\\.f32} %t | count 2
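; Functions with a "Q" in the name operate on 128-bit quad registers; the
; others use 64-bit d registers, which is why each opcode is counted twice.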
define <8 x i8> @vmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vmaxs16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vmaxs32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vmaxu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vmaxu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vmaxf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <16 x i8> @vmaxQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vmaxQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmaxQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vmaxQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vmaxQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vmaxQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <4 x float> @vmaxQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vmaxf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vmaxf.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxf.v4f32(<4 x float>, <4 x float>) nounwind readnone

126
test/CodeGen/ARM/vmin.ll Normal file

@ -0,0 +1,126 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmin\\.s8} %t | count 2
; RUN: grep {vmin\\.s16} %t | count 2
; RUN: grep {vmin\\.s32} %t | count 2
; RUN: grep {vmin\\.u8} %t | count 2
; RUN: grep {vmin\\.u16} %t | count 2
; RUN: grep {vmin\\.u32} %t | count 2
; RUN: grep {vmin\\.f32} %t | count 2
define <8 x i8> @vmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vmins16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vmins32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <8 x i8> @vminu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <4 x i16> @vminu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
ret <4 x i16> %tmp3
}
define <2 x i32> @vminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
ret <2 x i32> %tmp3
}
define <2 x float> @vminf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = call <2 x float> @llvm.arm.neon.vminf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
ret <2 x float> %tmp3
}
define <16 x i8> @vminQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vminQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vminQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <16 x i8> @vminQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
define <8 x i16> @vminQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
ret <8 x i16> %tmp3
}
define <4 x i32> @vminQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
ret <4 x i32> %tmp3
}
define <4 x float> @vminQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = call <4 x float> @llvm.arm.neon.vminf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
ret <4 x float> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vminf.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vminf.v4f32(<4 x float>, <4 x float>) nounwind readnone

77
test/CodeGen/ARM/vmla.ll Normal file

@ -0,0 +1,77 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmla\\.i8} %t | count 2
; RUN: grep {vmla\\.i16} %t | count 2
; RUN: grep {vmla\\.i32} %t | count 2
; RUN: grep {vmla\\.f32} %t | count 2
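; Each function below performs a mul followed by an add of the result; the
; backend is expected to fold that pair into a single vmla
; (multiply-accumulate), for both d and q registers and for integer and
; floating-point elements.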
define <8 x i8> @vmlai8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8> * %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = mul <8 x i8> %tmp2, %tmp3
%tmp5 = add <8 x i8> %tmp1, %tmp4
ret <8 x i8> %tmp5
}
define <4 x i16> @vmlai16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = mul <4 x i16> %tmp2, %tmp3
%tmp5 = add <4 x i16> %tmp1, %tmp4
ret <4 x i16> %tmp5
}
define <2 x i32> @vmlai32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = mul <2 x i32> %tmp2, %tmp3
%tmp5 = add <2 x i32> %tmp1, %tmp4
ret <2 x i32> %tmp5
}
define <2 x float> @vmlaf32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = load <2 x float>* %C
%tmp4 = mul <2 x float> %tmp2, %tmp3
%tmp5 = add <2 x float> %tmp1, %tmp4
ret <2 x float> %tmp5
}
define <16 x i8> @vmlaQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8> * %C) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = mul <16 x i8> %tmp2, %tmp3
%tmp5 = add <16 x i8> %tmp1, %tmp4
ret <16 x i8> %tmp5
}
define <8 x i16> @vmlaQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = mul <8 x i16> %tmp2, %tmp3
%tmp5 = add <8 x i16> %tmp1, %tmp4
ret <8 x i16> %tmp5
}
define <4 x i32> @vmlaQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = mul <4 x i32> %tmp2, %tmp3
%tmp5 = add <4 x i32> %tmp1, %tmp4
ret <4 x i32> %tmp5
}
define <4 x float> @vmlaQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = load <4 x float>* %C
%tmp4 = mul <4 x float> %tmp2, %tmp3
%tmp5 = add <4 x float> %tmp1, %tmp4
ret <4 x float> %tmp5
}

63
test/CodeGen/ARM/vmlal.ll Normal file

@ -0,0 +1,63 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmlal\\.s8} %t | count 1
; RUN: grep {vmlal\\.s16} %t | count 1
; RUN: grep {vmlal\\.s32} %t | count 1
; RUN: grep {vmlal\\.u8} %t | count 1
; RUN: grep {vmlal\\.u16} %t | count 1
; RUN: grep {vmlal\\.u32} %t | count 1
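; vmlal is a widening multiply-accumulate: the products of the two narrow
; source vectors are widened to twice the element size and added into the
; wide accumulator, e.g. <8 x i8> * <8 x i8> accumulated into <8 x i16>.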
define <8 x i16> @vmlals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <8 x i16> @vmlalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

77
test/CodeGen/ARM/vmls.ll Normal file

@ -0,0 +1,77 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmls\\.i8} %t | count 2
; RUN: grep {vmls\\.i16} %t | count 2
; RUN: grep {vmls\\.i32} %t | count 2
; RUN: grep {vmls\\.f32} %t | count 2
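; Each function below performs a mul followed by a sub of the result; the
; backend is expected to fold that pair into a single vmls
; (multiply-subtract), for both d and q registers.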
define <8 x i8> @vmlsi8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8> * %C) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = mul <8 x i8> %tmp2, %tmp3
%tmp5 = sub <8 x i8> %tmp1, %tmp4
ret <8 x i8> %tmp5
}
define <4 x i16> @vmlsi16(<4 x i16>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = mul <4 x i16> %tmp2, %tmp3
%tmp5 = sub <4 x i16> %tmp1, %tmp4
ret <4 x i16> %tmp5
}
define <2 x i32> @vmlsi32(<2 x i32>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = mul <2 x i32> %tmp2, %tmp3
%tmp5 = sub <2 x i32> %tmp1, %tmp4
ret <2 x i32> %tmp5
}
define <2 x float> @vmlsf32(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = load <2 x float>* %C
%tmp4 = mul <2 x float> %tmp2, %tmp3
%tmp5 = sub <2 x float> %tmp1, %tmp4
ret <2 x float> %tmp5
}
define <16 x i8> @vmlsQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8> * %C) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = load <16 x i8>* %C
%tmp4 = mul <16 x i8> %tmp2, %tmp3
%tmp5 = sub <16 x i8> %tmp1, %tmp4
ret <16 x i8> %tmp5
}
define <8 x i16> @vmlsQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = load <8 x i16>* %C
%tmp4 = mul <8 x i16> %tmp2, %tmp3
%tmp5 = sub <8 x i16> %tmp1, %tmp4
ret <8 x i16> %tmp5
}
define <4 x i32> @vmlsQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = load <4 x i32>* %C
%tmp4 = mul <4 x i32> %tmp2, %tmp3
%tmp5 = sub <4 x i32> %tmp1, %tmp4
ret <4 x i32> %tmp5
}
define <4 x float> @vmlsQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = load <4 x float>* %C
%tmp4 = mul <4 x float> %tmp2, %tmp3
%tmp5 = sub <4 x float> %tmp1, %tmp4
ret <4 x float> %tmp5
}

63
test/CodeGen/ARM/vmlsl.ll Normal file

@ -0,0 +1,63 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmlsl\\.s8} %t | count 1
; RUN: grep {vmlsl\\.s16} %t | count 1
; RUN: grep {vmlsl\\.s32} %t | count 1
; RUN: grep {vmlsl\\.u8} %t | count 1
; RUN: grep {vmlsl\\.u16} %t | count 1
; RUN: grep {vmlsl\\.u32} %t | count 1
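; vmlsl is the widening multiply-subtract counterpart of vmlal: the widened
; products of the two narrow operands are subtracted from the wide
; accumulator.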
define <8 x i16> @vmlsls8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
define <8 x i16> @vmlslu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = load <8 x i8>* %C
%tmp4 = call <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3)
ret <8 x i16> %tmp4
}
define <4 x i32> @vmlslu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = load <4 x i16>* %C
%tmp4 = call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
ret <4 x i32> %tmp4
}
define <2 x i64> @vmlslu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = load <2 x i32>* %C
%tmp4 = call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
ret <2 x i64> %tmp4
}
declare <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone

101
test/CodeGen/ARM/vmov.ll Normal file

@ -0,0 +1,101 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep vmov.i8 %t | count 2
; RUN: grep vmov.i16 %t | count 4
; RUN: grep vmov.i32 %t | count 12
; RUN: grep vmov.i64 %t | count 2
; Note: function names do not include "vmov" to allow simple grep for opcodes
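; The six i32 cases (a-f) appear to correspond to the six 32-bit VMOV
; immediate formats: a single nonzero byte in each of the four byte positions
; (0x000000XX, 0x0000XX00, 0x00XX0000, 0xXX000000) plus the "shifted ones"
; forms 0x0000XXFF and 0x00XXFFFF.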
define <8 x i8> @v_movi8() nounwind {
ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <4 x i16> @v_movi16a() nounwind {
ret <4 x i16> < i16 16, i16 16, i16 16, i16 16 >
}
; 0x1000 = 4096
define <4 x i16> @v_movi16b() nounwind {
ret <4 x i16> < i16 4096, i16 4096, i16 4096, i16 4096 >
}
define <2 x i32> @v_movi32a() nounwind {
ret <2 x i32> < i32 32, i32 32 >
}
; 0x2000 = 8192
define <2 x i32> @v_movi32b() nounwind {
ret <2 x i32> < i32 8192, i32 8192 >
}
; 0x200000 = 2097152
define <2 x i32> @v_movi32c() nounwind {
ret <2 x i32> < i32 2097152, i32 2097152 >
}
; 0x20000000 = 536870912
define <2 x i32> @v_movi32d() nounwind {
ret <2 x i32> < i32 536870912, i32 536870912 >
}
; 0x20ff = 8447
define <2 x i32> @v_movi32e() nounwind {
ret <2 x i32> < i32 8447, i32 8447 >
}
; 0x20ffff = 2162687
define <2 x i32> @v_movi32f() nounwind {
ret <2 x i32> < i32 2162687, i32 2162687 >
}
; 0xff0000ff0000ffff = 18374687574888349695
define <1 x i64> @v_movi64() nounwind {
ret <1 x i64> < i64 18374687574888349695 >
}
define <16 x i8> @v_movQi8() nounwind {
ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
}
define <8 x i16> @v_movQi16a() nounwind {
ret <8 x i16> < i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16 >
}
; 0x1000 = 4096
define <8 x i16> @v_movQi16b() nounwind {
ret <8 x i16> < i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096, i16 4096 >
}
define <4 x i32> @v_movQi32a() nounwind {
ret <4 x i32> < i32 32, i32 32, i32 32, i32 32 >
}
; 0x2000 = 8192
define <4 x i32> @v_movQi32b() nounwind {
ret <4 x i32> < i32 8192, i32 8192, i32 8192, i32 8192 >
}
; 0x200000 = 2097152
define <4 x i32> @v_movQi32c() nounwind {
ret <4 x i32> < i32 2097152, i32 2097152, i32 2097152, i32 2097152 >
}
; 0x20000000 = 536870912
define <4 x i32> @v_movQi32d() nounwind {
ret <4 x i32> < i32 536870912, i32 536870912, i32 536870912, i32 536870912 >
}
; 0x20ff = 8447
define <4 x i32> @v_movQi32e() nounwind {
ret <4 x i32> < i32 8447, i32 8447, i32 8447, i32 8447 >
}
; 0x20ffff = 2162687
define <4 x i32> @v_movQi32f() nounwind {
ret <4 x i32> < i32 2162687, i32 2162687, i32 2162687, i32 2162687 >
}
; 0xff0000ff0000ffff = 18374687574888349695
define <2 x i64> @v_movQi64() nounwind {
ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 >
}

51
test/CodeGen/ARM/vmovl.ll Normal file

@ -0,0 +1,51 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmovl\\.s8} %t | count 1
; RUN: grep {vmovl\\.s16} %t | count 1
; RUN: grep {vmovl\\.s32} %t | count 1
; RUN: grep {vmovl\\.u8} %t | count 1
; RUN: grep {vmovl\\.u16} %t | count 1
; RUN: grep {vmovl\\.u32} %t | count 1
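; vmovl lengthens each vector element to twice its width, sign-extending for
; the vmovls intrinsics and zero-extending for the vmovlu intrinsics.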
define <8 x i16> @vmovls8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vmovls16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vmovls32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32> %tmp1)
ret <2 x i64> %tmp2
}
define <8 x i16> @vmovlu8(<8 x i8>* %A) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = call <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8> %tmp1)
ret <8 x i16> %tmp2
}
define <4 x i32> @vmovlu16(<4 x i16>* %A) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = call <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16> %tmp1)
ret <4 x i32> %tmp2
}
define <2 x i64> @vmovlu32(<2 x i32>* %A) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = call <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32> %tmp1)
ret <2 x i64> %tmp2
}
declare <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8>) nounwind readnone
declare <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32>) nounwind readnone

26
test/CodeGen/ARM/vmovn.ll Normal file

@ -0,0 +1,26 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmovn\\.i16} %t | count 1
; RUN: grep {vmovn\\.i32} %t | count 1
; RUN: grep {vmovn\\.i64} %t | count 1
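; vmovn narrows each vector element to half its width by keeping the low
; half of each element, e.g. <8 x i16> -> <8 x i8>.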
define <8 x i8> @vmovni16(<8 x i16>* %A) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = call <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16> %tmp1)
ret <8 x i8> %tmp2
}
define <4 x i16> @vmovni32(<4 x i32>* %A) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = call <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32> %tmp1)
ret <4 x i16> %tmp2
}
define <2 x i32> @vmovni64(<2 x i64>* %A) nounwind {
%tmp1 = load <2 x i64>* %A
%tmp2 = call <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64> %tmp1)
ret <2 x i32> %tmp2
}
declare <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16>) nounwind readnone
declare <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64>) nounwind readnone

79
test/CodeGen/ARM/vmul.ll Normal file

@ -0,0 +1,79 @@
; RUN: llvm-as < %s | llc -march=arm -mattr=+neon > %t
; RUN: grep {vmul\\.i8} %t | count 2
; RUN: grep {vmul\\.i16} %t | count 2
; RUN: grep {vmul\\.i32} %t | count 2
; RUN: grep {vmul\\.f32} %t | count 2
; RUN: grep {vmul\\.p8} %t | count 2
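; Plain integer and floating-point multiplies use the IR mul instruction;
; the polynomial (carry-less) multiply vmul.p8 has no IR equivalent and is
; reached through the llvm.arm.neon.vmulp intrinsics instead.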
define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = mul <8 x i8> %tmp1, %tmp2
ret <8 x i8> %tmp3
}
define <4 x i16> @vmuli16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
%tmp1 = load <4 x i16>* %A
%tmp2 = load <4 x i16>* %B
%tmp3 = mul <4 x i16> %tmp1, %tmp2
ret <4 x i16> %tmp3
}
define <2 x i32> @vmuli32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
%tmp1 = load <2 x i32>* %A
%tmp2 = load <2 x i32>* %B
%tmp3 = mul <2 x i32> %tmp1, %tmp2
ret <2 x i32> %tmp3
}
define <2 x float> @vmulf32(<2 x float>* %A, <2 x float>* %B) nounwind {
%tmp1 = load <2 x float>* %A
%tmp2 = load <2 x float>* %B
%tmp3 = mul <2 x float> %tmp1, %tmp2
ret <2 x float> %tmp3
}
define <8 x i8> @vmulp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
%tmp1 = load <8 x i8>* %A
%tmp2 = load <8 x i8>* %B
%tmp3 = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
ret <8 x i8> %tmp3
}
define <16 x i8> @vmulQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = mul <16 x i8> %tmp1, %tmp2
ret <16 x i8> %tmp3
}
define <8 x i16> @vmulQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
%tmp3 = mul <8 x i16> %tmp1, %tmp2
ret <8 x i16> %tmp3
}
define <4 x i32> @vmulQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
%tmp3 = mul <4 x i32> %tmp1, %tmp2
ret <4 x i32> %tmp3
}
define <4 x float> @vmulQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
%tmp1 = load <4 x float>* %A
%tmp2 = load <4 x float>* %B
%tmp3 = mul <4 x float> %tmp1, %tmp2
ret <4 x float> %tmp3
}
define <16 x i8> @vmulQp8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
%tmp3 = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
ret <16 x i8> %tmp3
}
declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone

Some files were not shown because too many files have changed in this diff.