From 687a64222b4c87c825258d4dfeb1f0794e8cb300 Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Tue, 11 Jun 2019 18:16:27 +0000
Subject: [PATCH 1/7] Vendor import of llvm release_80 branch r363030:
 https://llvm.org/svn/llvm-project/llvm/branches/release_80@363030

---
 CMakeLists.txt                                |   2 +-
 cmake/modules/AddLLVM.cmake                   |   1 +
 cmake/modules/LLVMProcessSources.cmake        |  10 +-
 docs/ReleaseNotes.rst                         |  20 +
 lib/CodeGen/TargetRegisterInfo.cpp            |   6 +
 lib/DebugInfo/DWARF/DWARFDebugFrame.cpp       |   4 +-
 lib/MC/ELFObjectWriter.cpp                    |   1 +
 lib/MC/MCWin64EH.cpp                          |   2 +-
 lib/MC/WasmObjectWriter.cpp                   |   8 +-
 lib/Object/COFFImportFile.cpp                 |   2 +-
 lib/Target/AArch64/AArch64SchedExynosM4.td    |  36 +-
 lib/Target/AArch64/AArch64SchedPredExynos.td  |  11 -
 lib/Target/AArch64/AArch64SchedPredicates.td  |  53 --
 lib/Target/AMDGPU/SIFoldOperands.cpp          |  73 +-
 lib/Target/AMDGPU/VOP2Instructions.td         |  12 +-
 lib/Target/ARM/ARMISelLowering.cpp            |  30 +-
 lib/Target/AVR/AVRISelLowering.cpp            |  28 +-
 lib/Target/AVR/AVRISelLowering.h              |   8 +-
 lib/Target/AVR/AVRSubtarget.cpp               |  12 +-
 lib/Target/AVR/AVRSubtarget.h                 |   5 +-
 .../Mips/MCTargetDesc/MipsTargetStreamer.cpp  |   7 +-
 lib/Target/Mips/MicroMips32r6InstrInfo.td     |   4 +-
 lib/Target/Mips/MicroMipsInstrFPU.td          |   5 +
 lib/Target/Mips/MipsAsmPrinter.cpp            |   3 +-
 lib/Target/Mips/MipsDSPInstrInfo.td           |   4 +-
 lib/Target/Mips/MipsDelaySlotFiller.cpp       |   1 +
 lib/Target/Mips/MipsFastISel.cpp              |  35 +-
 lib/Target/Mips/MipsSEInstrInfo.cpp           |   3 +
 .../PowerPC/Disassembler/PPCDisassembler.cpp  |   8 +
 .../PowerPC/InstPrinter/PPCInstPrinter.cpp    |   7 +-
 .../PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp  |  30 +-
 lib/Target/PowerPC/PPCISelDAGToDAG.cpp        |   4 +-
 lib/Target/PowerPC/PPCInstrInfo.td            |   2 +
 lib/Target/PowerPC/PPCSubtarget.cpp           |   3 +
 lib/Target/Sparc/SparcRegisterInfo.cpp        |   4 +-
 .../WebAssembly/WebAssemblyISelLowering.cpp   |   9 +-
 lib/Target/X86/X86FastISel.cpp                |   2 +-
 lib/Target/X86/X86TargetMachine.cpp           |   6 +-
 test/CodeGen/AMDGPU/add.ll                    |  83 +-
 ...ds-negative-offset-addressing-mode-loop.ll |   6 +-
 test/CodeGen/AMDGPU/fence-barrier.ll          |   3 +-
 .../CodeGen/AMDGPU/fold-fi-operand-shrink.mir | 230 +++++
 .../AMDGPU/fold-immediate-operand-shrink.mir  |  72 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll |   3 +-
 test/CodeGen/AMDGPU/r600.add.ll               | 167 ++++
 test/CodeGen/AMDGPU/r600.sub.ll               | 152 ++++
 test/CodeGen/AMDGPU/salu-to-valu.ll           |   2 +-
 test/CodeGen/AMDGPU/sub.ll                    |  90 +-
 test/CodeGen/ARM/tail-call-scheduling.ll      |  35 +
 test/CodeGen/AVR/{mul.ll => hardware-mul.ll}  |   2 +
 test/CodeGen/AVR/smul-with-overflow.ll        |   2 +-
 test/CodeGen/AVR/software-mul.ll              |  28 +
 test/CodeGen/AVR/umul-with-overflow.ll        |   2 +-
 test/CodeGen/Mips/Fast-ISel/icmpbr1.ll        |   3 +-
 test/CodeGen/Mips/Fast-ISel/pr40325.ll        |  23 +
 test/CodeGen/Mips/abiflags32.ll               |   8 +
 test/CodeGen/Mips/llvm-ir/fptosi.ll           | 418 +++++++++
 .../Mips/micromips-pseudo-mtlohi-expand.ll    |  63 ++
 test/CodeGen/Mips/pseudo-jump-fill.ll         |  68 ++
 test/CodeGen/PowerPC/ppc32-pic-large.ll       |   4 +
 test/CodeGen/SPARC/fp128.ll                   |  23 +
 test/CodeGen/WebAssembly/varargs.ll           |  26 +
 test/CodeGen/X86/PR40322.ll                   | 164 ++++
 test/CodeGen/X86/fast-isel-nontemporal.ll     |  72 +-
 test/CodeGen/X86/regalloc-copy-hints.mir      | 805 ++++++++++++++++++
 test/MC/PowerPC/ppc64-localentry-symbols.s    |  34 +
 test/MC/WebAssembly/null-output.s             |  10 +
 test/tools/llvm-dlltool/coff-weak-exports.def |   3 +
 .../tools/llvm-objdump/AMDGPU/source-lines.ll |   4 +-
 .../llvm-objdump/PowerPC/branch-offset.s      |  43 +
 test/tools/llvm-objdump/PowerPC/lit.local.cfg |   2 +
 test/tools/llvm-objdump/eh_frame-coff.test    |   4 +-
 .../llvm-objdump/elf-symbol-visibility.test   |  37 +
 tools/llvm-objdump/llvm-objdump.cpp           |  30 +-
 utils/git-svn/git-llvm                        |   6 +-
 utils/lit/lit/__init__.py                     |   2 +-
 utils/release/merge-request.sh                |   3 +
 77 files changed, 2834 insertions(+), 359 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir
 create mode 100644 test/CodeGen/AMDGPU/r600.add.ll
 create mode 100644 test/CodeGen/AMDGPU/r600.sub.ll
 create mode 100644 test/CodeGen/ARM/tail-call-scheduling.ll
 rename test/CodeGen/AVR/{mul.ll => hardware-mul.ll} (90%)
 create mode 100644 test/CodeGen/AVR/software-mul.ll
 create mode 100644 test/CodeGen/Mips/Fast-ISel/pr40325.ll
 create mode 100644 test/CodeGen/Mips/llvm-ir/fptosi.ll
 create mode 100644 test/CodeGen/Mips/micromips-pseudo-mtlohi-expand.ll
 create mode 100644 test/CodeGen/Mips/pseudo-jump-fill.ll
 create mode 100644 test/CodeGen/X86/PR40322.ll
 create mode 100644 test/CodeGen/X86/regalloc-copy-hints.mir
 create mode 100644 test/MC/PowerPC/ppc64-localentry-symbols.s
 create mode 100644 test/MC/WebAssembly/null-output.s
 create mode 100644 test/tools/llvm-objdump/PowerPC/branch-offset.s
 create mode 100644 test/tools/llvm-objdump/PowerPC/lit.local.cfg
 create mode 100644 test/tools/llvm-objdump/elf-symbol-visibility.test

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 27754f33949..81c2bab39ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,7 +18,7 @@ if(NOT DEFINED LLVM_VERSION_MINOR)
   set(LLVM_VERSION_MINOR 0)
 endif()
 if(NOT DEFINED LLVM_VERSION_PATCH)
-  set(LLVM_VERSION_PATCH 0)
+  set(LLVM_VERSION_PATCH 1)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
   set(LLVM_VERSION_SUFFIX "")
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index 0df6845aaa7..1a417447278 100755
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -855,6 +855,7 @@ if(NOT LLVM_TOOLCHAIN_TOOLS)
     llvm-lib
     llvm-objdump
     llvm-rc
+    llvm-profdata
     )
 endif()
 
diff --git a/cmake/modules/LLVMProcessSources.cmake b/cmake/modules/LLVMProcessSources.cmake
index 7cbd2863500..d0be0e8b3ba 100644
--- a/cmake/modules/LLVMProcessSources.cmake
+++ b/cmake/modules/LLVMProcessSources.cmake
@@ -30,7 +30,15 @@ endmacro(add_td_sources)
 
 function(add_header_files_for_glob hdrs_out glob)
   file(GLOB hds ${glob})
-  set(${hdrs_out} ${hds} PARENT_SCOPE)
+  set(filtered)
+  foreach(file ${hds})
+    # Explicit existence check is necessary to filter dangling symlinks
+    # out.  See https://bugs.gentoo.org/674662.
+    if(EXISTS ${file})
+      list(APPEND filtered ${file})
+    endif()
+  endforeach()
+  set(${hdrs_out} ${filtered} PARENT_SCOPE)
 endfunction(add_header_files_for_glob)
 
 function(find_all_header_files hdrs_out additional_headerdirs)
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index 8209c08889e..16121585d24 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -110,6 +110,26 @@ updated to use LLJIT.
 MCJIT and ExecutionEngine continue to be supported, though ORC should be
 preferred for new projects.
 
+Changes to the C++ APIs
+-----------------------
+
+Three of the IR library methods related to debugging information for
+functions and methods have changed their prototypes:
+
+  DIBuilder::createMethod
+  DIBuilder::createFunction
+  DIBuilder::createTempFunctionFwdDecl
+
+In all cases, several individual parameters were removed, and replaced
+by a single 'SPFlags' (subprogram flags) parameter. The individual
+parameters are: 'isLocalToUnit'; 'isDefinition'; 'isOptimized'; and
+for 'createMethod', 'Virtuality'.  The new 'SPFlags' parameter has a
+default value equivalent to passing 'false' for the three 'bool'
+parameters, and zero (non-virtual) to the 'Virtuality' parameter.  For
+any old-style API call that passed 'true' or a non-zero virtuality to
+these methods, you will need to substitute the correct 'SPFlags' value.
+The helper method 'DISubprogram::toSPFlags()' might be useful in making
+this conversion.
 
 Changes to the AArch64 Target
 -----------------------------
diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp
index 661dc18f7a8..d3059280a60 100644
--- a/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/lib/CodeGen/TargetRegisterInfo.cpp
@@ -14,6 +14,7 @@
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -398,6 +399,7 @@ TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg,
   const std::pair<unsigned, SmallVector<unsigned, 4>> &Hints_MRI =
     MRI.getRegAllocationHints(VirtReg);
 
+  SmallSet<unsigned, 32> HintedRegs;
   // First hint may be a target hint.
   bool Skip = (Hints_MRI.first != 0);
   for (auto Reg : Hints_MRI.second) {
@@ -411,6 +413,10 @@ TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg,
     if (VRM && isVirtualRegister(Phys))
       Phys = VRM->getPhys(Phys);
 
+    // Don't add the same reg twice (Hints_MRI may contain multiple virtual
+    // registers allocated to the same physreg).
+    if (!HintedRegs.insert(Phys).second)
+      continue;
     // Check that Phys is a valid hint in VirtReg's register class.
     if (!isPhysicalRegister(Phys))
       continue;
diff --git a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
index ba55ffc2817..8a88a2fa3a0 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -301,7 +301,7 @@ void CIE::dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH) const {
   OS << format("  Data alignment factor: %d\n", (int32_t)DataAlignmentFactor);
   OS << format("  Return address column: %d\n", (int32_t)ReturnAddressRegister);
   if (Personality)
-    OS << format("  Personality Address: %08x\n", *Personality);
+    OS << format("  Personality Address: %016" PRIx64 "\n", *Personality);
   if (!AugmentationData.empty()) {
     OS << "  Augmentation data:    ";
     for (uint8_t Byte : AugmentationData)
@@ -320,7 +320,7 @@ void FDE::dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH) const {
                (uint32_t)InitialLocation,
                (uint32_t)InitialLocation + (uint32_t)AddressRange);
   if (LSDAAddress)
-    OS << format("  LSDA Address: %08x\n", *LSDAAddress);
+    OS << format("  LSDA Address: %016" PRIx64 "\n", *LSDAAddress);
   CFIs.dump(OS, MRI, IsEH);
   OS << "\n";
 }
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index ade858113a3..1b505776ca1 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -1271,6 +1271,7 @@ void ELFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
     // This is the first place we are able to copy this information.
     Alias->setExternal(Symbol.isExternal());
     Alias->setBinding(Symbol.getBinding());
+    Alias->setOther(Symbol.getOther());
 
     if (!Symbol.isUndefined() && !Rest.startswith("@@@"))
       continue;
diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp
index 8bc1f08c887..3ef1514455a 100644
--- a/lib/MC/MCWin64EH.cpp
+++ b/lib/MC/MCWin64EH.cpp
@@ -522,7 +522,7 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
     if (MatchingEpilog) {
       assert(EpilogInfo.find(MatchingEpilog) != EpilogInfo.end() &&
              "Duplicate epilog not found");
-      EpilogInfo[EpilogStart] = EpilogInfo[MatchingEpilog];
+      EpilogInfo[EpilogStart] = EpilogInfo.lookup(MatchingEpilog);
       // Clear the unwind codes in the EpilogMap, so that they don't get output
       // in the logic below.
       EpilogInstrs.clear();
diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp
index 333748db919..b07fe05cad5 100644
--- a/lib/MC/WasmObjectWriter.cpp
+++ b/lib/MC/WasmObjectWriter.cpp
@@ -368,7 +368,13 @@ void WasmObjectWriter::startCustomSection(SectionBookkeeping &Section,
 // Now that the section is complete and we know how big it is, patch up the
 // section size field at the start of the section.
 void WasmObjectWriter::endSection(SectionBookkeeping &Section) {
-  uint64_t Size = W.OS.tell() - Section.PayloadOffset;
+  uint64_t Size = W.OS.tell();
+  // /dev/null doesn't support seek/tell and can report offset of 0.
+  // Simply skip this patching in that case.
+  if (!Size)
+    return;
+
+  Size -= Section.PayloadOffset;
   if (uint32_t(Size) != Size)
     report_fatal_error("section size does not fit in a uint32_t");
 
diff --git a/lib/Object/COFFImportFile.cpp b/lib/Object/COFFImportFile.cpp
index dc11cc4bcff..e7c7efe4367 100644
--- a/lib/Object/COFFImportFile.cpp
+++ b/lib/Object/COFFImportFile.cpp
@@ -496,7 +496,7 @@ NewArchiveMember ObjectFactory::createWeakExternal(StringRef Sym,
 
   // COFF Header
   coff_file_header Header{
-      u16(0),
+      u16(Machine),
       u16(NumberOfSections),
       u32(0),
       u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section))),
diff --git a/lib/Target/AArch64/AArch64SchedExynosM4.td b/lib/Target/AArch64/AArch64SchedExynosM4.td
index 4d892465b3f..61652b1d8e3 100644
--- a/lib/Target/AArch64/AArch64SchedExynosM4.td
+++ b/lib/Target/AArch64/AArch64SchedExynosM4.td
@@ -239,7 +239,6 @@ def M4WriteNEONK   : SchedWriteRes<[M4UnitNSHF,
                                     M4UnitS0]>    { let Latency = 5;
                                                     let NumMicroOps = 2; }
 def M4WriteNEONL   : SchedWriteRes<[M4UnitNMUL]>  { let Latency = 3; }
-def M4WriteNEONM   : SchedWriteRes<[M4UnitNMUL]>  { let Latency = 3; }
 def M4WriteNEONN   : SchedWriteRes<[M4UnitNMSC,
                                     M4UnitNMSC]>  { let Latency = 5;
                                                     let NumMicroOps = 2; }
@@ -480,8 +479,6 @@ def M4WriteCOPY    : SchedWriteVariant<[SchedVar<ExynosFPPred, [M4WriteNALU1]>,
                                         SchedVar<NoSchedPred,  [M4WriteZ0]>]>;
 def M4WriteMOVI    : SchedWriteVariant<[SchedVar<IsZeroFPIdiomPred, [M4WriteZ0]>,
                                         SchedVar<NoSchedPred,       [M4WriteNALU1]>]>;
-def M4WriteMULL    : SchedWriteVariant<[SchedVar<ExynosLongVectorUpperPred, [M4WriteNEONM]>,
-                                        SchedVar<NoSchedPred,               [M4WriteNMUL3]>]>;
 
 // Fast forwarding.
 def M4ReadAESM1    : SchedReadAdvance<+1, [M4WriteNCRY1]>;
@@ -489,7 +486,8 @@ def M4ReadFMACM1   : SchedReadAdvance<+1, [M4WriteFMAC4,
                                            M4WriteFMAC4H,
                                            M4WriteFMAC5]>;
 def M4ReadNMULM1   : SchedReadAdvance<+1, [M4WriteNMUL3]>;
-def M4ReadMULLP2   : SchedReadAdvance<-2, [M4WriteNEONM]>;
+def M4ReadNMULP2   : SchedReadAdvance<-2, [M4WriteNMUL3]>;
+
 
 //===----------------------------------------------------------------------===//
 // Coarse scheduling model.
@@ -662,10 +660,8 @@ def : InstRW<[M4WriteNEONK],  (instregex "^FMOVDXHighr")>;
 def : InstRW<[M4WriteFCVT3H], (instregex "^F(RECP|RSQRT)Ev1f16")>;
 def : InstRW<[M4WriteFCVT3],  (instregex "^F(RECP|RSQRT)Ev1i(32|64)")>;
 def : InstRW<[M4WriteNMSC1],  (instregex "^FRECPXv1")>;
-def : InstRW<[M4WriteFMAC4H,
-              M4ReadFMACM1],  (instregex "^F(RECP|RSQRT)S16")>;
-def : InstRW<[M4WriteFMAC4,
-              M4ReadFMACM1],  (instregex "^F(RECP|RSQRT)S(32|64)")>;
+def : InstRW<[M4WriteFMAC4H], (instregex "^F(RECP|RSQRT)S16")>;
+def : InstRW<[M4WriteFMAC4],  (instregex "^F(RECP|RSQRT)S(32|64)")>;
 
 // FP load instructions.
 def : InstRW<[WriteVLD],    (instregex "^LDR[SDQ]l")>;
@@ -736,14 +732,20 @@ def : InstRW<[M4WriteNALU1],  (instregex "^(AND|BIC|EOR|NOT|ORN|ORR)v")>;
 def : InstRW<[M4WriteNMSC1],  (instregex "^[SU](MIN|MAX)v")>;
 def : InstRW<[M4WriteNMSC2],  (instregex "^[SU](MIN|MAX)Pv")>;
 def : InstRW<[M4WriteNHAD3],  (instregex "^[SU](MIN|MAX)Vv")>;
-def : InstRW<[M4WriteNMUL3],  (instregex "^(SQR?D)?MULH?v")>;
 def : InstRW<[M4WriteNMUL3,
               M4ReadNMULM1],  (instregex "^ML[AS]v")>;
-def : InstRW<[M4WriteNMUL3],  (instregex "^SQRDML[AS]H")>;
-def : InstRW<[M4WriteMULL,
-              M4ReadMULLP2],  (instregex "^(S|U|SQD)ML[AS]Lv")>;
-def : InstRW<[M4WriteMULL,
-              M4ReadMULLP2],  (instregex "^(S|U|SQD)MULLv")>;
+def : InstRW<[M4WriteNMUL3,
+              M4ReadNMULM1],  (instregex "^(SQR?D)?MULH?v")>;
+def : InstRW<[M4WriteNMUL3,
+              M4ReadNMULM1],  (instregex "^SQRDML[AS]H")>;
+def : InstRW<[M4WriteNMUL3,
+              M4ReadNMULM1],  (instregex "^(S|U|SQD)ML[AS]L(v1(i32|i64)|v2i32|v4i16|v8i8)")>;
+def : InstRW<[M4WriteNMUL3,
+              M4ReadNMULP2],  (instregex "^(S|U|SQD)ML[AS]L(v4i32|v8i16|v16i8)")>;
+def : InstRW<[M4WriteNMUL3,
+              M4ReadNMULM1],  (instregex "^(S|U|SQD)MULL(v1(i32|i64)|v2i32|v4i16|v8i8)")>;
+def : InstRW<[M4WriteNMUL3,
+              M4ReadNMULP2],  (instregex "^(S|U|SQD)MULL(v4i32|v8i16|v16i8)")>;
 def : InstRW<[M4WriteNMUL3],  (instregex "^[SU]DOT(lane)?v")>;
 def : InstRW<[M4WriteNHAD3],  (instregex "^[SU]ADALPv")>;
 def : InstRW<[M4WriteNSHT4A], (instregex "^[SU]R?SRA[dv]")>;
@@ -808,10 +810,8 @@ def : InstRW<[M4WriteNALU1],  (instregex "^FMOVv.f(32|64)")>;
 def : InstRW<[M4WriteFCVT3H], (instregex "^F(RECP|RSQRT)Ev[248]f16")>;
 def : InstRW<[M4WriteFCVT3],  (instregex "^F(RECP|RSQRT)Ev[248]f(32|64)")>;
 def : InstRW<[M4WriteFCVT3],  (instregex "^U(RECP|RSQRT)Ev[24]i32")>;
-def : InstRW<[M4WriteFMAC4H,
-              M4ReadFMACM1],  (instregex "^F(RECP|RSQRT)Sv.f16")>;
-def : InstRW<[M4WriteFMAC4,
-              M4ReadFMACM1],  (instregex "^F(RECP|RSQRT)Sv.f(32|64)")>;
+def : InstRW<[M4WriteFMAC4H], (instregex "^F(RECP|RSQRT)Sv.f16")>;
+def : InstRW<[M4WriteFMAC4],  (instregex "^F(RECP|RSQRT)Sv.f(32|64)")>;
 def : InstRW<[M4WriteNSHF1],  (instregex "^REV(16|32|64)v")>;
 def : InstRW<[M4WriteNSHFA],  (instregex "^TB[LX]v(8|16)i8One")>;
 def : InstRW<[M4WriteNSHFB],  (instregex "^TB[LX]v(8|16)i8Two")>;
diff --git a/lib/Target/AArch64/AArch64SchedPredExynos.td b/lib/Target/AArch64/AArch64SchedPredExynos.td
index 48c54230e9d..316036d8940 100644
--- a/lib/Target/AArch64/AArch64SchedPredExynos.td
+++ b/lib/Target/AArch64/AArch64SchedPredExynos.td
@@ -103,17 +103,6 @@ def ExynosScaledIdxPred : MCSchedPredicate<ExynosScaledIdxFn>;
 // Identify FP instructions.
 def ExynosFPPred : MCSchedPredicate<CheckAny<[CheckDForm, CheckQForm]>>;
 
-// Identify whether an instruction whose result is a long vector
-// operates on the upper half of the input registers.
-def ExynosLongVectorUpperFn   : TIIPredicate<
-                                  "isExynosLongVectorUpper",
-                                  MCOpcodeSwitchStatement<
-                                  [MCOpcodeSwitchCase<
-                                    IsLongVectorUpperOp.ValidOpcodes,
-                                    MCReturnStatement<TruePred>>],
-                                  MCReturnStatement<FalsePred>>>;
-def ExynosLongVectorUpperPred : MCSchedPredicate<ExynosLongVectorUpperFn>;
-
 // Identify 128-bit NEON instructions.
 def ExynosQFormPred : MCSchedPredicate<CheckQForm>;
 
diff --git a/lib/Target/AArch64/AArch64SchedPredicates.td b/lib/Target/AArch64/AArch64SchedPredicates.td
index dbaf11fc95d..b23572b41b9 100644
--- a/lib/Target/AArch64/AArch64SchedPredicates.td
+++ b/lib/Target/AArch64/AArch64SchedPredicates.td
@@ -268,59 +268,6 @@ def IsStoreRegOffsetOp     : CheckOpcode<[STRBBroW, STRBBroX,
 def IsLoadStoreRegOffsetOp : CheckOpcode<!listconcat(IsLoadRegOffsetOp.ValidOpcodes,
                                                      IsStoreRegOffsetOp.ValidOpcodes)>;
 
-// Identify whether an instruction whose result is a long vector
-// operates on the upper half of the input registers.
-def IsLongVectorUpperOp    : CheckOpcode<[FCVTLv8i16, FCVTLv4i32,
-                                          FCVTNv8i16, FCVTNv4i32,
-                                          FCVTXNv4f32,
-                                          PMULLv16i8, PMULLv2i64,
-                                          RADDHNv8i16_v16i8, RADDHNv4i32_v8i16, RADDHNv2i64_v4i32,
-                                          RSHRNv16i8_shift, RSHRNv8i16_shift, RSHRNv4i32_shift,
-                                          RSUBHNv8i16_v16i8, RSUBHNv4i32_v8i16, RSUBHNv2i64_v4i32,
-                                          SABALv16i8_v8i16, SABALv8i16_v4i32, SABALv4i32_v2i64,
-                                          SABDLv16i8_v8i16, SABDLv8i16_v4i32, SABDLv4i32_v2i64,
-                                          SADDLv16i8_v8i16, SADDLv8i16_v4i32, SADDLv4i32_v2i64,
-                                          SADDWv16i8_v8i16, SADDWv8i16_v4i32, SADDWv4i32_v2i64,
-                                          SHLLv16i8, SHLLv8i16, SHLLv4i32,
-                                          SHRNv16i8_shift, SHRNv8i16_shift, SHRNv4i32_shift,
-                                          SMLALv16i8_v8i16, SMLALv8i16_v4i32, SMLALv4i32_v2i64,
-                                          SMLALv8i16_indexed, SMLALv4i32_indexed,
-                                          SMLSLv16i8_v8i16, SMLSLv8i16_v4i32, SMLSLv4i32_v2i64,
-                                          SMLSLv8i16_indexed, SMLSLv4i32_indexed,
-                                          SMULLv16i8_v8i16, SMULLv8i16_v4i32, SMULLv4i32_v2i64,
-                                          SMULLv8i16_indexed, SMULLv4i32_indexed,
-                                          SQDMLALv8i16_v4i32, SQDMLALv4i32_v2i64,
-                                          SQDMLALv8i16_indexed, SQDMLALv4i32_indexed,
-                                          SQDMLSLv8i16_v4i32, SQDMLSLv4i32_v2i64,
-                                          SQDMLSLv8i16_indexed, SQDMLSLv4i32_indexed,
-                                          SQDMULLv8i16_v4i32, SQDMULLv4i32_v2i64,
-                                          SQDMULLv8i16_indexed, SQDMULLv4i32_indexed,
-                                          SQRSHRNv16i8_shift, SQRSHRNv8i16_shift, SQRSHRNv4i32_shift,
-                                          SQRSHRUNv16i8_shift, SQRSHRUNv8i16_shift, SQRSHRUNv4i32_shift,
-                                          SQSHRNv16i8_shift, SQSHRNv8i16_shift, SQSHRNv4i32_shift,
-                                          SQSHRUNv16i8_shift, SQSHRUNv8i16_shift, SQSHRUNv4i32_shift,
-                                          SQXTNv16i8, SQXTNv8i16, SQXTNv4i32,
-                                          SQXTUNv16i8, SQXTUNv8i16, SQXTUNv4i32,
-                                          SSHLLv16i8_shift, SSHLLv8i16_shift, SSHLLv4i32_shift,
-                                          SSUBLv16i8_v8i16, SSUBLv8i16_v4i32, SSUBLv4i32_v2i64,
-                                          SSUBWv16i8_v8i16, SSUBWv8i16_v4i32, SSUBWv4i32_v2i64,
-                                          UABALv16i8_v8i16, UABALv8i16_v4i32, UABALv4i32_v2i64,
-                                          UABDLv16i8_v8i16, UABDLv8i16_v4i32, UABDLv4i32_v2i64,
-                                          UADDLv16i8_v8i16, UADDLv8i16_v4i32, UADDLv4i32_v2i64,
-                                          UADDWv16i8_v8i16, UADDWv8i16_v4i32, UADDWv4i32_v2i64,
-                                          UMLALv16i8_v8i16, UMLALv8i16_v4i32, UMLALv4i32_v2i64,
-                                          UMLALv8i16_indexed, UMLALv4i32_indexed,
-                                          UMLSLv16i8_v8i16, UMLSLv8i16_v4i32, UMLSLv4i32_v2i64,
-                                          UMLSLv8i16_indexed, UMLSLv4i32_indexed,
-                                          UMULLv16i8_v8i16, UMULLv8i16_v4i32, UMULLv4i32_v2i64,
-                                          UMULLv8i16_indexed, UMULLv4i32_indexed,
-                                          UQSHRNv16i8_shift, UQSHRNv8i16_shift, UQSHRNv4i32_shift,
-                                          UQXTNv16i8, UQXTNv8i16, UQXTNv4i32,
-                                          USHLLv16i8_shift, USHLLv8i16_shift, USHLLv4i32_shift,
-                                          USUBLv16i8_v8i16, USUBLv8i16_v4i32, USUBLv4i32_v2i64,
-                                          USUBWv16i8_v8i16, USUBWv8i16_v4i32, USUBWv4i32_v2i64,
-                                          XTNv16i8, XTNv8i16, XTNv4i32]>;
-
 // Target predicates.
 
 // Identify an instruction that effectively transfers a register to another.
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index f4e86695836..d679abd107d 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -201,49 +201,55 @@ static bool updateOperand(FoldCandidate &Fold,
         Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
       }
     }
+  }
 
-    if (Fold.needsShrink()) {
-      MachineBasicBlock *MBB = MI->getParent();
-      auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
-      if (Liveness != MachineBasicBlock::LQR_Dead)
-        return false;
+  if ((Fold.isImm() || Fold.isFI()) && Fold.needsShrink()) {
+    MachineBasicBlock *MBB = MI->getParent();
+    auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
+    if (Liveness != MachineBasicBlock::LQR_Dead)
+      return false;
 
-      MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-      int Op32 = Fold.getShrinkOpcode();
-      MachineOperand &Dst0 = MI->getOperand(0);
-      MachineOperand &Dst1 = MI->getOperand(1);
-      assert(Dst0.isDef() && Dst1.isDef());
+    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+    int Op32 = Fold.getShrinkOpcode();
+    MachineOperand &Dst0 = MI->getOperand(0);
+    MachineOperand &Dst1 = MI->getOperand(1);
+    assert(Dst0.isDef() && Dst1.isDef());
 
-      bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
+    bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
 
-      const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
-      unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
-      const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg());
-      unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC);
+    const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
+    unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
 
-      MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
+    MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
 
-      if (HaveNonDbgCarryUse) {
-        BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
-          .addReg(AMDGPU::VCC, RegState::Kill);
-      }
-
-      // Keep the old instruction around to avoid breaking iterators, but
-      // replace the outputs with dummy registers.
-      Dst0.setReg(NewReg0);
-      Dst1.setReg(NewReg1);
-
-      if (Fold.isCommuted())
-        TII.commuteInstruction(*Inst32, false);
-      return true;
+    if (HaveNonDbgCarryUse) {
+      BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
+        .addReg(AMDGPU::VCC, RegState::Kill);
     }
 
-    Old.ChangeToImmediate(Fold.ImmToFold);
+    // Keep the old instruction around to avoid breaking iterators, but
+    // replace it with a dummy instruction to remove uses.
+    //
+    // FIXME: We should not invert how this pass looks at operands to avoid
+    // this. Should track set of foldable movs instead of looking for uses
+    // when looking at a use.
+    Dst0.setReg(NewReg0);
+    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
+      MI->RemoveOperand(I);
+    MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));
+
+    if (Fold.isCommuted())
+      TII.commuteInstruction(*Inst32, false);
     return true;
   }
 
   assert(!Fold.needsShrink() && "not handled");
 
+  if (Fold.isImm()) {
+    Old.ChangeToImmediate(Fold.ImmToFold);
+    return true;
+  }
+
   if (Fold.isFI()) {
     Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
     return true;
@@ -344,7 +350,7 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
       if ((Opc == AMDGPU::V_ADD_I32_e64 ||
            Opc == AMDGPU::V_SUB_I32_e64 ||
            Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
-          OpToFold->isImm()) {
+          (OpToFold->isImm() || OpToFold->isFI())) {
         MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
 
         // Verify the other operand is a VGPR, otherwise we would violate the
@@ -357,7 +363,10 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
 
         assert(MI->getOperand(1).isDef());
 
-        int Op32 =  AMDGPU::getVOPe32(Opc);
+        // Make sure to get the 32-bit version of the commuted opcode.
+        unsigned MaybeCommutedOpc = MI->getOpcode();
+        int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
+
         FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
                                          Op32));
         return true;
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index e3fd7b5f9fa..8cf524a5128 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -515,18 +515,12 @@ let AddedComplexity = 1 in {
 }
 
 let SubtargetPredicate = HasAddNoCarryInsts in {
-  def : DivergentBinOp<add, V_ADD_U32_e32>;
-  def : DivergentBinOp<sub, V_SUB_U32_e32>;
-  def : DivergentBinOp<sub, V_SUBREV_U32_e32>;
+  def : DivergentBinOp<add, V_ADD_U32_e64>;
+  def : DivergentBinOp<sub, V_SUB_U32_e64>;
 }
 
-
-def : DivergentBinOp<add, V_ADD_I32_e32>;
-
 def : DivergentBinOp<add, V_ADD_I32_e64>;
-def : DivergentBinOp<sub, V_SUB_I32_e32>;
-
-def : DivergentBinOp<sub, V_SUBREV_I32_e32>;
+def : DivergentBinOp<sub, V_SUB_I32_e64>;
 
 def : DivergentBinOp<srl, V_LSHRREV_B32_e32>;
 def : DivergentBinOp<sra, V_ASHRREV_I32_e32>;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 21de0f6a763..7e90edbbdab 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -1984,32 +1984,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // Build a sequence of copy-to-reg nodes chained together with token chain
   // and flag operands which copy the outgoing args into the appropriate regs.
   SDValue InFlag;
-  // Tail call byval lowering might overwrite argument registers so in case of
-  // tail call optimization the copies to registers are lowered later.
-  if (!isTailCall)
-    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
-      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
-                               RegsToPass[i].second, InFlag);
-      InFlag = Chain.getValue(1);
-    }
-
-  // For tail calls lower the arguments to the 'real' stack slot.
-  if (isTailCall) {
-    // Force all the incoming stack arguments to be loaded from the stack
-    // before any new outgoing arguments are stored to the stack, because the
-    // outgoing stack slots may alias the incoming argument stack slots, and
-    // the alias isn't otherwise explicit. This is slightly more conservative
-    // than necessary, because it means that each store effectively depends
-    // on every argument instead of just those arguments it would clobber.
-
-    // Do not flag preceding copytoreg stuff together with the following stuff.
-    InFlag = SDValue();
-    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
-      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
-                               RegsToPass[i].second, InFlag);
-      InFlag = Chain.getValue(1);
-    }
-    InFlag = SDValue();
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
   }
 
   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp
index 57fc978b54b..5db75778232 100644
--- a/lib/Target/AVR/AVRISelLowering.cpp
+++ b/lib/Target/AVR/AVRISelLowering.cpp
@@ -26,19 +26,21 @@
 
 #include "AVR.h"
 #include "AVRMachineFunctionInfo.h"
+#include "AVRSubtarget.h"
 #include "AVRTargetMachine.h"
 #include "MCTargetDesc/AVRMCTargetDesc.h"
 
 namespace llvm {
 
-AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm)
-    : TargetLowering(tm) {
+AVRTargetLowering::AVRTargetLowering(const AVRTargetMachine &TM,
+                                     const AVRSubtarget &STI)
+    : TargetLowering(TM), Subtarget(STI) {
   // Set up the register classes.
   addRegisterClass(MVT::i8, &AVR::GPR8RegClass);
   addRegisterClass(MVT::i16, &AVR::DREGSRegClass);
 
   // Compute derived properties from the register classes.
-  computeRegisterProperties(tm.getSubtargetImpl()->getRegisterInfo());
+  computeRegisterProperties(Subtarget.getRegisterInfo());
 
   setBooleanContents(ZeroOrOneBooleanContent);
   setBooleanVectorContents(ZeroOrOneBooleanContent);
@@ -163,6 +165,13 @@ AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm)
   setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
   setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
 
+  // Expand multiplications to libcalls when there is
+  // no hardware MUL.
+  if (!Subtarget.supportsMultiplication()) {
+    setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand);
+    setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand);
+  }
+
   for (MVT VT : MVT::integer_valuetypes()) {
     setOperationAction(ISD::MULHS, VT, Expand);
     setOperationAction(ISD::MULHU, VT, Expand);
@@ -1271,7 +1280,7 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Add a register mask operand representing the call-preserved registers.
   const AVRTargetMachine &TM = (const AVRTargetMachine &)getTargetMachine();
-  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const uint32_t *Mask =
       TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
@@ -1434,7 +1443,7 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
   MachineFunction *F = BB->getParent();
   MachineRegisterInfo &RI = F->getRegInfo();
   const AVRTargetMachine &TM = (const AVRTargetMachine &)getTargetMachine();
-  const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   switch (MI.getOpcode()) {
@@ -1575,7 +1584,7 @@ static bool isCopyMulResult(MachineBasicBlock::iterator const &I) {
 MachineBasicBlock *AVRTargetLowering::insertMul(MachineInstr &MI,
                                                 MachineBasicBlock *BB) const {
   const AVRTargetMachine &TM = (const AVRTargetMachine &)getTargetMachine();
-  const TargetInstrInfo &TII = *TM.getSubtargetImpl()->getInstrInfo();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
   MachineBasicBlock::iterator I(MI);
   ++I; // in any case insert *after* the mul instruction
   if (isCopyMulResult(I))
@@ -1838,9 +1847,6 @@ std::pair<unsigned, const TargetRegisterClass *>
 AVRTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                 StringRef Constraint,
                                                 MVT VT) const {
-  auto STI = static_cast<const AVRTargetMachine &>(this->getTargetMachine())
-                 .getSubtargetImpl();
-
   // We only support i8 and i16.
   //
   //:FIXME: remove this assert for now since it gets sometimes executed
@@ -1884,8 +1890,8 @@ AVRTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
     }
   }
 
-  return TargetLowering::getRegForInlineAsmConstraint(STI->getRegisterInfo(),
-                                                      Constraint, VT);
+  return TargetLowering::getRegForInlineAsmConstraint(
+      Subtarget.getRegisterInfo(), Constraint, VT);
 }
 
 void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
diff --git a/lib/Target/AVR/AVRISelLowering.h b/lib/Target/AVR/AVRISelLowering.h
index c90c65c81f7..7d77dd8fb01 100644
--- a/lib/Target/AVR/AVRISelLowering.h
+++ b/lib/Target/AVR/AVRISelLowering.h
@@ -64,12 +64,14 @@ enum NodeType {
 
 } // end of namespace AVRISD
 
+class AVRSubtarget;
 class AVRTargetMachine;
 
 /// Performs target lowering for the AVR.
 class AVRTargetLowering : public TargetLowering {
 public:
-  explicit AVRTargetLowering(AVRTargetMachine &TM);
+  explicit AVRTargetLowering(const AVRTargetMachine &TM,
+                             const AVRSubtarget &STI);
 
 public:
   MVT getScalarShiftAmountTy(const DataLayout &, EVT LHSTy) const override {
@@ -164,6 +166,10 @@ class AVRTargetLowering : public TargetLowering {
                           const SDLoc &dl, SelectionDAG &DAG,
                           SmallVectorImpl<SDValue> &InVals) const;
 
+protected:
+
+  const AVRSubtarget &Subtarget;
+
 private:
   MachineBasicBlock *insertShift(MachineInstr &MI, MachineBasicBlock *BB) const;
   MachineBasicBlock *insertMul(MachineInstr &MI, MachineBasicBlock *BB) const;
diff --git a/lib/Target/AVR/AVRSubtarget.cpp b/lib/Target/AVR/AVRSubtarget.cpp
index 556d69ec523..c7c566270f4 100644
--- a/lib/Target/AVR/AVRSubtarget.cpp
+++ b/lib/Target/AVR/AVRSubtarget.cpp
@@ -29,9 +29,9 @@
 namespace llvm {
 
 AVRSubtarget::AVRSubtarget(const Triple &TT, const std::string &CPU,
-                           const std::string &FS, AVRTargetMachine &TM)
+                           const std::string &FS, const AVRTargetMachine &TM)
     : AVRGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(),
-      TLInfo(TM), TSInfo(),
+      TLInfo(TM, initializeSubtargetDependencies(CPU, FS, TM)), TSInfo(),
 
       // Subtarget features
       m_hasSRAM(false), m_hasJMPCALL(false), m_hasIJMPCALL(false),
@@ -44,4 +44,12 @@ AVRSubtarget::AVRSubtarget(const Triple &TT, const std::string &CPU,
   ParseSubtargetFeatures(CPU, FS);
 }
 
+AVRSubtarget &
+AVRSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS,
+                                              const TargetMachine &TM) {
+  // Parse the features string; TM is accepted but not used in this body.
+  ParseSubtargetFeatures(CPU, FS);
+  return *this; // Returned by reference so the call can be used as an argument.
+}
+
 } // end of namespace llvm
diff --git a/lib/Target/AVR/AVRSubtarget.h b/lib/Target/AVR/AVRSubtarget.h
index fa26738da19..ba036d5e406 100644
--- a/lib/Target/AVR/AVRSubtarget.h
+++ b/lib/Target/AVR/AVRSubtarget.h
@@ -37,7 +37,7 @@ class AVRSubtarget : public AVRGenSubtargetInfo {
   //! \param FS  The feature string.
   //! \param TM  The target machine.
   AVRSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS,
-               AVRTargetMachine &TM);
+               const AVRTargetMachine &TM);
 
   const AVRInstrInfo *getInstrInfo() const override { return &InstrInfo; }
   const TargetFrameLowering *getFrameLowering() const override { return &FrameLowering; }
@@ -49,6 +49,9 @@ class AVRSubtarget : public AVRGenSubtargetInfo {
   /// \note Definition of function is auto generated by `tblgen`.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
+  AVRSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS,
+                                                const TargetMachine &TM);
+
   // Subtarget feature getters.
   // See AVR.td for details.
   bool hasSRAM() const { return m_hasSRAM; }
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 58f9717e1cc..a46f84bd1c9 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -700,8 +700,11 @@ void MipsTargetAsmStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
 }
 
 void MipsTargetAsmStreamer::emitDirectiveModuleFP() {
-  OS << "\t.module\tfp=";
-  OS << ABIFlagsSection.getFpABIString(ABIFlagsSection.getFpABI()) << "\n";
+  MipsABIFlagsSection::FpABIKind FpABI = ABIFlagsSection.getFpABI();
+  if (FpABI == MipsABIFlagsSection::FpABIKind::SOFT)
+    OS << "\t.module\tsoftfloat\n";
+  else
+    OS << "\t.module\tfp=" << ABIFlagsSection.getFpABIString(FpABI) << "\n";
 }
 
 void MipsTargetAsmStreamer::emitDirectiveSetFp(
diff --git a/lib/Target/Mips/MicroMips32r6InstrInfo.td b/lib/Target/Mips/MicroMips32r6InstrInfo.td
index c441aa76ad4..994a8882f94 100644
--- a/lib/Target/Mips/MicroMips32r6InstrInfo.td
+++ b/lib/Target/Mips/MicroMips32r6InstrInfo.td
@@ -1040,7 +1040,7 @@ class TRUNC_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.l.d", FGR64Opnd,
 class TRUNC_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.s", FGR32Opnd,
                                                     FGR32Opnd, II_TRUNC>;
 class TRUNC_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.d", FGR32Opnd,
-                                                    AFGR64Opnd, II_TRUNC>;
+                                                    FGR64Opnd, II_TRUNC>;
 class SQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.s", FGR32Opnd, FGR32Opnd,
                                                  II_SQRT_S, fsqrt>;
 class SQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.d", AFGR64Opnd, AFGR64Opnd,
@@ -1750,6 +1750,8 @@ def : MipsPat<(f32 fpimm0), (MTC1_MMR6 ZERO)>, ISA_MICROMIPS32R6;
 def : MipsPat<(f32 fpimm0neg), (FNEG_S_MMR6 (MTC1_MMR6 ZERO))>, ISA_MICROMIPS32R6;
 def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
               (TRUNC_W_D_MMR6 FGR64Opnd:$src)>, ISA_MICROMIPS32R6;
+def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+              (TRUNC_W_S_MMR6 FGR32Opnd:$src)>, ISA_MICROMIPS32R6;
 
 def : MipsPat<(and GPRMM16:$src, immZExtAndi16:$imm),
               (ANDI16_MMR6 GPRMM16:$src, immZExtAndi16:$imm)>,
diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td
index 1731afc1961..9e76165e7ad 100644
--- a/lib/Target/Mips/MicroMipsInstrFPU.td
+++ b/lib/Target/Mips/MicroMipsInstrFPU.td
@@ -425,6 +425,11 @@ def : MipsPat<(f64 (fpextend FGR32Opnd:$src)),
 def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
               (TRUNC_W_MM AFGR64Opnd:$src)>, ISA_MICROMIPS32_NOT_MIPS32R6,
               FGR_32;
+def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+              (CVT_W_D64_MM FGR64Opnd:$src)>, ISA_MICROMIPS32_NOT_MIPS32R6,
+              FGR_64;
+def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+              (TRUNC_W_S_MM FGR32Opnd:$src)>, ISA_MICROMIPS32_NOT_MIPS32R6;
 
 // Selects
 defm : MovzPats0<GPR32, FGR32, MOVZ_I_S_MM, SLT_MM, SLTu_MM, SLTi_MM, SLTiu_MM>,
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index a7a748b0840..c35f5beb688 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -813,7 +813,8 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
   // We should always emit a '.module fp=...' but binutils 2.24 does not accept
   // it. We therefore emit it when it contradicts the ABI defaults (-mfpxx or
   // -mfp64) and omit it otherwise.
-  if (ABI.IsO32() && (STI.isABI_FPXX() || STI.isFP64bit()))
+  if ((ABI.IsO32() && (STI.isABI_FPXX() || STI.isFP64bit())) ||
+      STI.useSoftFloat())
     TS.emitDirectiveModuleFP();
 
   // We should always emit a '.module [no]oddspreg' but binutils 2.24 does not
diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td
index b9824220b55..a4078026e4f 100644
--- a/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -1314,7 +1314,9 @@ def PseudoCMPU_LE_QB : PseudoCMP<CMPU_LE_QB>;
 def PseudoPICK_PH : PseudoPICK<PICK_PH>;
 def PseudoPICK_QB : PseudoPICK<PICK_QB>;
 
-def PseudoMTLOHI_DSP : PseudoMTLOHI<ACC64DSP, GPR32>;
+let AdditionalPredicates = [HasDSP] in {
+  def PseudoMTLOHI_DSP : PseudoMTLOHI<ACC64DSP, GPR32>;
+}
 
 // Patterns.
 class DSPPat<dag pattern, dag result, Predicate pred = HasDSP> :
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index e3823e0dfdb..61e77fbeea6 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -726,6 +726,7 @@ bool MipsDelaySlotFiller::searchRange(MachineBasicBlock &MBB, IterTy Begin,
     // but we don't have enough information to make that decision.
      if (InMicroMipsMode && TII->getInstSizeInBytes(*CurrI) == 2 &&
         (Opcode == Mips::JR || Opcode == Mips::PseudoIndirectBranch ||
+         Opcode == Mips::PseudoIndirectBranch_MM ||
          Opcode == Mips::PseudoReturn || Opcode == Mips::TAILCALL))
       continue;
      // Instructions LWP/SWP and MOVEP should not be in a delay slot as that
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
index a18416b9e86..168750b2cba 100644
--- a/lib/Target/Mips/MipsFastISel.cpp
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -954,21 +954,34 @@ bool MipsFastISel::selectBranch(const Instruction *I) {
   //
   MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
   MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
-  // For now, just try the simplest case where it's fed by a compare.
+
+  // Fold the common case of a conditional branch with a comparison
+  // in the same block.
+  unsigned ZExtCondReg = 0;
   if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
-    MVT CIMVT =
-        TLI.getValueType(DL, CI->getOperand(0)->getType(), true).getSimpleVT();
-    if (CIMVT == MVT::i1)
+    if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
+      ZExtCondReg = createResultReg(&Mips::GPR32RegClass);
+      if (!emitCmp(ZExtCondReg, CI))
+        return false;
+    }
+  }
+
+  // For the general case, we need to mask with 1.
+  if (ZExtCondReg == 0) {
+    unsigned CondReg = getRegForValue(BI->getCondition());
+    if (CondReg == 0)
       return false;
 
-    unsigned CondReg = getRegForValue(CI);
-    BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ))
-        .addReg(CondReg)
-        .addMBB(TBB);
-    finishCondBranch(BI->getParent(), TBB, FBB);
-    return true;
+    ZExtCondReg = emitIntExt(MVT::i1, CondReg, MVT::i32, true);
+    if (ZExtCondReg == 0)
+      return false;
   }
-  return false;
+
+  BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ))
+      .addReg(ZExtCondReg)
+      .addMBB(TBB);
+  finishCondBranch(BI->getParent(), TBB, FBB);
+  return true;
 }
 
 bool MipsFastISel::selectCmp(const Instruction *I) {
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index c7ab90ed2a3..2b26caaa9f4 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -447,6 +447,9 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case Mips::PseudoMTLOHI_DSP:
     expandPseudoMTLoHi(MBB, MI, Mips::MTLO_DSP, Mips::MTHI_DSP, true);
     break;
+  case Mips::PseudoMTLOHI_MM:
+    expandPseudoMTLoHi(MBB, MI, Mips::MTLO_MM, Mips::MTHI_MM, false);
+    break;
   case Mips::PseudoCVT_S_W:
     expandCvtFPInt(MBB, MI, Mips::CVT_S_W, Mips::MTC1, false);
     break;
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index 26869f25082..cce239cac97 100644
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -61,6 +61,14 @@ extern "C" void LLVMInitializePowerPCDisassembler() {
                                          createPPCLEDisassembler);
 }
 
+static DecodeStatus DecodePCRel24BranchTarget(MCInst &Inst, unsigned Imm,
+                                              uint64_t Addr, // not used here
+                                              const void *Decoder) { // not used here
+  int32_t Offset = SignExtend32<24>(Imm); // treat Imm as a signed 24-bit value
+  Inst.addOperand(MCOperand::createImm(Offset)); // append as an immediate operand
+  return MCDisassembler::Success; // this decoder has no failure path
+}
+
 // FIXME: These can be generated by TableGen from the existing register
 // encoding values!
 
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index fc29e4effbb..6824168b890 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -382,8 +382,11 @@ void PPCInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
 
   // Branches can take an immediate operand.  This is used by the branch
   // selection pass to print .+8, an eight byte displacement from the PC.
-  O << ".+";
-  printAbsBranchOperand(MI, OpNo, O);
+  O << ".";
+  int32_t Imm = SignExtend32<32>((unsigned)MI->getOperand(OpNo).getImm() << 2);
+  if (Imm >= 0)
+    O << "+";
+  O << Imm;
 }
 
 void PPCInstPrinter::printAbsBranchOperand(const MCInst *MI, unsigned OpNo,
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index a1e4e07b25a..78609ef3d4e 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -15,6 +15,7 @@
 #include "InstPrinter/PPCInstPrinter.h"
 #include "MCTargetDesc/PPCMCAsmInfo.h"
 #include "PPCTargetStreamer.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/BinaryFormat/ELF.h"
@@ -182,16 +183,33 @@ class PPCTargetELFStreamer : public PPCTargetStreamer {
 
   void emitAssignment(MCSymbol *S, const MCExpr *Value) override {
     auto *Symbol = cast<MCSymbolELF>(S);
+
     // When encoding an assignment to set symbol A to symbol B, also copy
     // the st_other bits encoding the local entry point offset.
-    if (Value->getKind() != MCExpr::SymbolRef)
-      return;
-    const auto &RhsSym = cast<MCSymbolELF>(
-        static_cast<const MCSymbolRefExpr *>(Value)->getSymbol());
-    unsigned Other = Symbol->getOther();
+    if (copyLocalEntry(Symbol, Value))
+      UpdateOther.insert(Symbol);
+    else
+      UpdateOther.erase(Symbol);
+  }
+
+  void finish() override {
+    for (auto *Sym : UpdateOther)
+      copyLocalEntry(Sym, Sym->getVariableValue());
+  }
+
+private:
+  SmallPtrSet<MCSymbolELF *, 32> UpdateOther;
+
+  bool copyLocalEntry(MCSymbolELF *D, const MCExpr *S) {
+    auto *Ref = dyn_cast<const MCSymbolRefExpr>(S);
+    if (!Ref)
+      return false;
+    const auto &RhsSym = cast<MCSymbolELF>(Ref->getSymbol());
+    unsigned Other = D->getOther();
     Other &= ~ELF::STO_PPC64_LOCAL_MASK;
     Other |= RhsSym.getOther() & ELF::STO_PPC64_LOCAL_MASK;
-    Symbol->setOther(Other);
+    D->setOther(Other);
+    return true;
   }
 };
 
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 31acd0ff870..70e9049a2ab 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -4359,8 +4359,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
     const Module *M = MF->getFunction().getParent();
 
     if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) != MVT::i32 ||
-        !PPCSubTarget->isSecurePlt() || !PPCSubTarget->isTargetELF() ||
-        M->getPICLevel() == PICLevel::SmallPIC)
+        (!TM.isPositionIndependent() || !PPCSubTarget->isSecurePlt()) ||
+        !PPCSubTarget->isTargetELF() || M->getPICLevel() == PICLevel::SmallPIC)
       break;
 
     SDValue Op = N->getOperand(1);
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index dd3f1ac7908..77aa4fe3d41 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -737,7 +737,9 @@ def abscondbrtarget : Operand<OtherVT> {
 def calltarget : Operand<iPTR> {
   let PrintMethod = "printBranchOperand";
   let EncoderMethod = "getDirectBrEncoding";
+  let DecoderMethod = "DecodePCRel24BranchTarget";
   let ParserMatchClass = PPCDirectBrAsmOperand;
+  let OperandType = "OPERAND_PCREL";
 }
 def abscalltarget : Operand<iPTR> {
   let PrintMethod = "printAbsBranchOperand";
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index c0cbfd779cb..1fdf74549de 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -138,6 +138,9 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   if (isDarwin())
     HasLazyResolverStubs = true;
 
+  if (TargetTriple.isOSNetBSD() || TargetTriple.isOSOpenBSD())
+    SecurePlt = true;
+
   if (HasSPE && IsPPC64)
     report_fatal_error( "SPE is only supported for 32-bit targets.\n", false);
   if (HasSPE && (HasAltivec || HasQPX || HasVSX || HasFPU))
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index 33caa66154f..ad6ea3760fe 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -189,7 +189,7 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
       MachineInstr *StMI =
         BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri))
         .addReg(FrameReg).addImm(0).addReg(SrcEvenReg);
-      replaceFI(MF, II, *StMI, dl, 0, Offset, FrameReg);
+      replaceFI(MF, *StMI, *StMI, dl, 0, Offset, FrameReg);
       MI.setDesc(TII.get(SP::STDFri));
       MI.getOperand(2).setReg(SrcOddReg);
       Offset += 8;
@@ -201,7 +201,7 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
       MachineInstr *StMI =
         BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg)
         .addReg(FrameReg).addImm(0);
-      replaceFI(MF, II, *StMI, dl, 1, Offset, FrameReg);
+      replaceFI(MF, *StMI, *StMI, dl, 1, Offset, FrameReg);
 
       MI.setDesc(TII.get(SP::LDDFri));
       MI.getOperand(0).setReg(DestOddReg);
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 003848e3422..f7f29d85cbb 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -669,13 +669,16 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
   if (IsVarArg) {
     // Outgoing non-fixed arguments are placed in a buffer. First
     // compute their offsets and the total amount of buffer space needed.
-    for (SDValue Arg :
-         make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) {
+    for (unsigned I = NumFixedArgs; I < Outs.size(); ++I) {
+      const ISD::OutputArg &Out = Outs[I];
+      SDValue &Arg = OutVals[I];
       EVT VT = Arg.getValueType();
       assert(VT != MVT::iPTR && "Legalized args should be concrete");
       Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+      unsigned Align = std::max(Out.Flags.getOrigAlign(),
+                                Layout.getABITypeAlignment(Ty));
       unsigned Offset = CCInfo.AllocateStack(Layout.getTypeAllocSize(Ty),
-                                             Layout.getABITypeAlignment(Ty));
+                                             Align);
       CCInfo.addLoc(CCValAssign::getMem(ArgLocs.size(), VT.getSimpleVT(),
                                         Offset, VT.getSimpleVT(),
                                         CCValAssign::Full));
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 9dd3f265254..12cd613c34c 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -399,7 +399,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
   case MVT::v2i64:
   case MVT::v8i16:
   case MVT::v16i8:
-    if (IsNonTemporal && Alignment >= 16)
+    if (IsNonTemporal && Alignment >= 16 && HasSSE41)
       Opc = HasVLX ? X86::VMOVNTDQAZ128rm :
             HasAVX ? X86::VMOVNTDQArm : X86::MOVNTDQArm;
     else if (Alignment >= 16)
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index afcb49dc226..217a12ddf89 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -38,6 +38,7 @@
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
@@ -512,6 +513,9 @@ void X86PassConfig::addPreEmitPass2() {
   // correct CFA calculation rule where needed by inserting appropriate CFI
   // instructions.
   const Triple &TT = TM->getTargetTriple();
-  if (!TT.isOSDarwin() && !TT.isOSWindows())
+  const MCAsmInfo *MAI = TM->getMCAsmInfo();
+  if (!TT.isOSDarwin() &&
+      (!TT.isOSWindows() ||
+       MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI))
     addPass(createCFIInstrInserter());
 }
diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll
index 6a108db879c..bce924ec4a0 100644
--- a/test/CodeGen/AMDGPU/add.ll
+++ b/test/CodeGen/AMDGPU/add.ll
@@ -1,11 +1,8 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}s_add_i32:
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
 ; GCN: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}}
 ; GCN: v_mov_b32_e32 v[[V_REG:[0-9]+]], s[[REG]]
 ; GCN: buffer_store_dword v[[V_REG]],
@@ -19,9 +16,6 @@ define amdgpu_kernel void @s_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
 }
 
 ; FUNC-LABEL: {{^}}s_add_v2i32:
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 define amdgpu_kernel void @s_add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
@@ -34,11 +28,6 @@ define amdgpu_kernel void @s_add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> a
 }
 
 ; FUNC-LABEL: {{^}}s_add_v4i32:
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
@@ -53,15 +42,6 @@ define amdgpu_kernel void @s_add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> a
 }
 
 ; FUNC-LABEL: {{^}}s_add_v8i32:
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-
 ; GCN: s_add_i32
 ; GCN: s_add_i32
 ; GCN: s_add_i32
@@ -78,23 +58,6 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}s_add_v16i32:
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-; EG: ADD_INT
-
 ; GCN: s_add_i32
 ; GCN: s_add_i32
 ; GCN: s_add_i32
@@ -124,7 +87,7 @@ entry:
 ; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, [[A]], [[B]]
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
 define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
   %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
   %a = load volatile i32, i32 addrspace(1)* %gep
@@ -139,7 +102,7 @@ define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
 ; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc, 0x7b, [[A]]
 ; GFX9: v_add_u32_e32 v{{[0-9]+}}, 0x7b, [[A]]
 define amdgpu_kernel void @v_add_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
   %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
   %a = load volatile i32, i32 addrspace(1)* %gep
@@ -151,13 +114,6 @@ define amdgpu_kernel void @v_add_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1
 ; FUNC-LABEL: {{^}}add64:
 ; GCN: s_add_u32
 ; GCN: s_addc_u32
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-DAG: ADDC_UINT
-; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-NOT: SUB
 define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %add = add i64 %a, %b
@@ -172,13 +128,6 @@ entry:
 
 ; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
 ; GCN-NOT: v_addc_u32_e32 s
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-DAG: ADDC_UINT
-; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-NOT: SUB
 define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
 entry:
   %0 = load i64, i64 addrspace(1)* %in
@@ -191,13 +140,6 @@ entry:
 ; FUNC-LABEL: {{^}}add64_in_branch:
 ; GCN: s_add_u32
 ; GCN: s_addc_u32
-
-; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-DAG: ADDC_UINT
-; EG-DAG: ADD_INT
-; EG-DAG: ADD_INT {{[* ]*}}
-; EG-NOT: SUB
 define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
@@ -217,7 +159,26 @@ endif:
   ret void
 }
 
-declare i32 @llvm.r600.read.tidig.x() #1
+; Make sure the VOP3 form of add is initially selected. Otherwise a pair
+; of copies from/to VCC would be necessary.
+
+; GCN-LABEL: {{^}}add_select_vop3:
+; SI: v_add_i32_e64 v0, s[0:1], s0, v0
+; VI: v_add_u32_e64 v0, s[0:1], s0, v0
+; GFX9: v_add_u32_e32 v0, s0, v0
+
+; GCN: ; def vcc
+; GCN: ds_write_b32
+; GCN: ; use vcc
+define amdgpu_ps void @add_select_vop3(i32 inreg %s, i32 %v) {
+  %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"()
+  %sub = add i32 %v, %s
+  store i32 %sub, i32 addrspace(3)* undef
+  call void asm sideeffect "; use vcc", "{vcc}"(i64 %vcc)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
index 5997e27fd81..e2c7f1c47cf 100644
--- a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
+++ b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
@@ -7,6 +7,8 @@ declare void @llvm.amdgcn.s.barrier() #1
 
 ; Function Attrs: nounwind
 ; CHECK-LABEL: {{^}}signed_ds_offset_addressing_loop:
+; SI: s_movk_i32 [[K_0X88:s[0-9]+]], 0x
+; SI: s_movk_i32 [[K_0X100:s[0-9]+]], 0x100
 ; CHECK: BB0_1:
 ; CHECK: v_add_i32_e32 [[VADDR:v[0-9]+]],
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]]
@@ -14,9 +16,9 @@ declare void @llvm.amdgcn.s.barrier() #1
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR8]]
 ; SI-DAG: v_add_i32_e32 [[VADDR0x80:v[0-9]+]], vcc, 0x80, [[VADDR]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x80]]
-; SI-DAG: v_add_i32_e32 [[VADDR0x88:v[0-9]+]], vcc, 0x88, [[VADDR]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x88:v[0-9]+]], vcc, [[K_0X88]], [[VADDR]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x88]]
-; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], vcc, 0x100, [[VADDR]]
+; SI-DAG: v_add_i32_e32 [[VADDR0x100:v[0-9]+]], vcc, [[K_0X100]], [[VADDR]]
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR0x100]]
 
 ; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset1:2
diff --git a/test/CodeGen/AMDGPU/fence-barrier.ll b/test/CodeGen/AMDGPU/fence-barrier.ll
index 8f5a06d01fa..7de4f1796b0 100644
--- a/test/CodeGen/AMDGPU/fence-barrier.ll
+++ b/test/CodeGen/AMDGPU/fence-barrier.ll
@@ -54,7 +54,8 @@ define amdgpu_kernel void @test_local(i32 addrspace(1)*) {
 }
 
 ; GCN-LABEL: {{^}}test_global
-; GCN: v_add_u32_e32 v{{[0-9]+}}, vcc, 0x888, v{{[0-9]+}}
+; GCN: s_movk_i32 [[K:s[0-9]+]], 0x888
+; GCN: v_add_u32_e32 v{{[0-9]+}}, vcc, [[K]], v{{[0-9]+}}
 ; GCN: flat_store_dword
 ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
 ; GCN-NEXT: s_barrier
diff --git a/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir b/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir
new file mode 100644
index 00000000000..ab544665efb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fold-fi-operand-shrink.mir
@@ -0,0 +1,230 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination  %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+
+# First operand is an FI in a VGPR, other operand is a VGPR
+name: shrink_vgpr_fi_vgpr_v_add_i32_e64_no_carry_out_use
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 64, alignment: 16 }
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: shrink_vgpr_fi_vgpr_v_add_i32_e64_no_carry_out_use
+    ; GCN: liveins: $vgpr0
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN: [[V_ADD_I32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_I32_e32 [[V_MOV_B32_e32_]], [[COPY]], implicit-def $vcc, implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_ADD_I32_e32_]]
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:vgpr_32 = COPY $vgpr0
+    %2:vgpr_32, %3:sreg_64 = V_ADD_I32_e64 %0, %1, implicit $exec
+    S_ENDPGM implicit %2
+
+...
+
+---
+
+# First operand is a VGPR, other operand FI is in a VGPR
+name: shrink_vgpr_vgpr_fi_v_add_i32_e64_no_carry_out_use
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 64, alignment: 16 }
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: shrink_vgpr_vgpr_fi_v_add_i32_e64_no_carry_out_use
+    ; GCN: liveins: $vgpr0
+    ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[V_ADD_I32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_I32_e32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_ADD_I32_e32_]]
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %2:vgpr_32, %3:sreg_64 = V_ADD_I32_e64 %0, %1, implicit $exec
+    S_ENDPGM implicit %2
+
+...
+
+---
+
+# First operand is an FI in a VGPR, other operand is an SGPR
+name: shrink_vgpr_fi_sgpr_v_add_i32_e64_no_carry_out_use
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 64, alignment: 16 }
+body:             |
+  bb.0:
+    liveins: $sgpr0
+
+    ; GCN-LABEL: name: shrink_vgpr_fi_sgpr_v_add_i32_e64_no_carry_out_use
+    ; GCN: liveins: $sgpr0
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0
+    ; GCN: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_ADD_I32_e64_]]
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:sreg_32_xm0 = COPY $sgpr0
+    %2:vgpr_32, %3:sreg_64 = V_ADD_I32_e64 %0, %1, implicit $exec
+    S_ENDPGM implicit %2
+
+...
+
+---
+
+# First operand is an SGPR, other operand FI is in a VGPR
+name: shrink_sgpr_vgpr_fi_v_add_i32_e64_no_carry_out_use
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 64, alignment: 16 }
+body:             |
+  bb.0:
+    liveins: $sgpr0
+
+    ; GCN-LABEL: name: shrink_sgpr_vgpr_fi_v_add_i32_e64_no_carry_out_use
+    ; GCN: liveins: $sgpr0
+    ; GCN: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[COPY]], implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_ADD_I32_e64_]]
+    %0:sreg_32_xm0 = COPY $sgpr0
+    %1:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %2:vgpr_32, %3:sreg_64 = V_ADD_I32_e64 %0, %1, implicit $exec
+    S_ENDPGM implicit %2
+
+...
+
+---
+
+# First operand is an FI in an SGPR, other operand is a VGPR
+name: shrink_sgpr_fi_vgpr_v_add_i32_e64_no_carry_out_use
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 64, alignment: 16 }
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: shrink_sgpr_fi_vgpr_v_add_i32_e64_no_carry_out_use
+    ; GCN: liveins: $vgpr0
+    ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0
+    ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN: [[V_ADD_I32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_I32_e32 [[S_MOV_B32_]], [[COPY]], implicit-def $vcc, implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_ADD_I32_e32_]]
+    %0:sreg_32_xm0 = S_MOV_B32 %stack.0
+    %1:vgpr_32 = COPY $vgpr0
+    %2:vgpr_32, %3:sreg_64 = V_ADD_I32_e64 %0, %1, implicit $exec
+    S_ENDPGM implicit %2
+
+...
+
+---
+
+# First operand is a VGPR, other operand FI is in an SGPR
+name: shrink_vgpr_sgpr_fi_v_add_i32_e64_no_carry_out_use
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 64, alignment: 16}
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; GCN-LABEL: name: shrink_vgpr_sgpr_fi_v_add_i32_e64_no_carry_out_use
+    ; GCN: liveins: $vgpr0
+    ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0
+    ; GCN: [[V_ADD_I32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_I32_e32 [[S_MOV_B32_]], [[COPY]], implicit-def $vcc, implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_ADD_I32_e32_]]
+    %0:vgpr_32 = COPY $vgpr0
+    %1:sreg_32_xm0 = S_MOV_B32 %stack.0
+    %2:vgpr_32, %3:sreg_64 = V_ADD_I32_e64 %0, %1, implicit $exec
+    S_ENDPGM implicit %2
+
+...
+
+---
+
+# First operand is an FI in a VGPR, other operand is an inline imm in a VGPR
+name: shrink_vgpr_imm_fi_vgpr_v_add_i32_e64_no_carry_out_use
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 64, alignment: 16 }
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: shrink_vgpr_imm_fi_vgpr_v_add_i32_e64_no_carry_out_use
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[V_ADD_I32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_I32_e32 16, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_ADD_I32_e32_]]
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+    %2:vgpr_32, %3:sreg_64 = V_ADD_I32_e64 %0, %1, implicit $exec
+    S_ENDPGM implicit %2
+
+...
+
+---
+
+# First operand is an inline imm in a VGPR, other operand FI is in a VGPR
+name: shrink_vgpr_imm_vgpr_fi_v_add_i32_e64_no_carry_out_use
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 64, alignment: 16 }
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: shrink_vgpr_imm_vgpr_fi_v_add_i32_e64_no_carry_out_use
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[V_ADD_I32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_I32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_I32_e64 16, [[V_MOV_B32_e32_]], implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_ADD_I32_e64_]]
+    %0:vgpr_32 = V_MOV_B32_e32 16, implicit $exec
+    %1:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %2:vgpr_32, %3:sreg_64 = V_ADD_I32_e64 %0, %1, implicit $exec
+    S_ENDPGM implicit %2
+
+...
+
+---
+
+# First operand is an FI in a VGPR, other operand is a literal constant in a VGPR
+name: shrink_vgpr_k_fi_vgpr_v_add_i32_e64_no_carry_out_use
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 64, alignment: 16 }
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: shrink_vgpr_k_fi_vgpr_v_add_i32_e64_no_carry_out_use
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GCN: [[V_ADD_I32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_I32_e32 1234, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_ADD_I32_e32_]]
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:vgpr_32 = V_MOV_B32_e32 1234, implicit $exec
+    %2:vgpr_32, %3:sreg_64 = V_ADD_I32_e64 %0, %1, implicit $exec
+    S_ENDPGM implicit %2
+
+...
+
+---
+
+# First operand is a literal constant in a VGPR, other operand FI is in a VGPR
+name: shrink_vgpr_k_vgpr_fi_v_add_i32_e64_no_carry_out_use
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: default, offset: 0, size: 64, alignment: 16 }
+body:             |
+  bb.0:
+
+    ; GCN-LABEL: name: shrink_vgpr_k_vgpr_fi_v_add_i32_e64_no_carry_out_use
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1234, implicit $exec
+    ; GCN: [[V_ADD_I32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_I32_e32 %stack.0, [[V_MOV_B32_e32_]], implicit-def $vcc, implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_ADD_I32_e32_]]
+    %0:vgpr_32 = V_MOV_B32_e32 1234, implicit $exec
+    %1:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %2:vgpr_32, %3:sreg_64 = V_ADD_I32_e64 %0, %1, implicit $exec
+    S_ENDPGM implicit %2
+
+...
diff --git a/test/CodeGen/AMDGPU/fold-immediate-operand-shrink.mir b/test/CodeGen/AMDGPU/fold-immediate-operand-shrink.mir
index 847c2b720cd..15c453f36f6 100644
--- a/test/CodeGen/AMDGPU/fold-immediate-operand-shrink.mir
+++ b/test/CodeGen/AMDGPU/fold-immediate-operand-shrink.mir
@@ -250,8 +250,8 @@ body:             |
     ; GCN-LABEL: name: shrink_scalar_imm_vgpr_v_sub_i32_e64_no_carry_out_use
     ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345
     ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN: [[V_SUBREV_I32_e32_:%[0-9]+]]:vgpr_32 = V_SUBREV_I32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec
-    ; GCN: S_ENDPGM implicit [[V_SUBREV_I32_e32_]]
+    ; GCN: [[V_SUB_I32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_I32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_SUB_I32_e32_]]
     %0:sreg_32_xm0 = S_MOV_B32 12345
     %1:vgpr_32 = IMPLICIT_DEF
     %2:vgpr_32, %3:sreg_64 = V_SUB_I32_e64 %0, %1, implicit $exec
@@ -269,8 +269,8 @@ body:             |
     ; GCN-LABEL: name: shrink_vgpr_scalar_imm_v_sub_i32_e64_no_carry_out_use
     ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345
-    ; GCN: [[V_SUB_I32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_I32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec
-    ; GCN: S_ENDPGM implicit [[V_SUB_I32_e32_]]
+    ; GCN: [[V_SUBREV_I32_e32_:%[0-9]+]]:vgpr_32 = V_SUBREV_I32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_SUBREV_I32_e32_]]
     %0:vgpr_32 = IMPLICIT_DEF
     %1:sreg_32_xm0 = S_MOV_B32 12345
     %2:vgpr_32, %3:sreg_64 = V_SUB_I32_e64 %0, %1, implicit $exec
@@ -288,8 +288,8 @@ body:             |
     ; GCN-LABEL: name: shrink_scalar_imm_vgpr_v_subrev_i32_e64_no_carry_out_use
     ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345
     ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN: [[V_SUB_I32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_I32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec
-    ; GCN: S_ENDPGM implicit [[V_SUB_I32_e32_]]
+    ; GCN: [[V_SUBREV_I32_e32_:%[0-9]+]]:vgpr_32 = V_SUBREV_I32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_SUBREV_I32_e32_]]
     %0:sreg_32_xm0 = S_MOV_B32 12345
     %1:vgpr_32 = IMPLICIT_DEF
     %2:vgpr_32, %3:sreg_64 = V_SUBREV_I32_e64 %0, %1, implicit $exec
@@ -307,8 +307,8 @@ body:             |
     ; GCN-LABEL: name: shrink_vgpr_scalar_imm_v_subrev_i32_e64_no_carry_out_use
     ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345
-    ; GCN: [[V_SUBREV_I32_e32_:%[0-9]+]]:vgpr_32 = V_SUBREV_I32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec
-    ; GCN: S_ENDPGM implicit [[V_SUBREV_I32_e32_]]
+    ; GCN: [[V_SUB_I32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_I32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_SUB_I32_e32_]]
     %0:vgpr_32 = IMPLICIT_DEF
     %1:sreg_32_xm0 = S_MOV_B32 12345
     %2:vgpr_32, %3:sreg_64 = V_SUBREV_I32_e64 %0, %1, implicit $exec
@@ -590,3 +590,59 @@ body:             |
     S_ENDPGM implicit %2
 
 ...
+
+---
+name: shrink_add_kill_flags_src0
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+    ; GCN-LABEL: name: shrink_add_kill_flags_src0
+    ; GCN: liveins: $vgpr0
+    ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 518144, implicit $exec
+    ; GCN: [[V_ADD_I32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_I32_e32 killed [[V_MOV_B32_e32_]], [[COPY]], implicit-def $vcc, implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_ADD_I32_e32_]]
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = V_MOV_B32_e32 518144, implicit $exec
+    %2:vgpr_32, %3:sreg_64_xexec = V_ADD_I32_e64 killed %1, %0, implicit $exec
+   S_ENDPGM implicit %2
+...
+
+---
+name: shrink_add_kill_flags_src1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+    ; GCN-LABEL: name: shrink_add_kill_flags_src1
+    ; GCN: liveins: $vgpr0
+    ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 518144, implicit $exec
+    ; GCN: [[V_ADD_I32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_I32_e32 [[V_MOV_B32_e32_]], killed [[COPY]], implicit-def $vcc, implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_ADD_I32_e32_]]
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = V_MOV_B32_e32 518144, implicit $exec
+    %2:vgpr_32, %3:sreg_64_xexec = V_ADD_I32_e64 %1, killed %0, implicit $exec
+   S_ENDPGM implicit %2
+...
+
+---
+name: shrink_addc_kill_flags_src2
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vcc
+    ; GCN-LABEL: name: shrink_addc_kill_flags_src2
+    ; GCN: liveins: $vgpr0, $vcc
+    ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 518144, implicit $exec
+    ; GCN: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $vcc
+    ; GCN: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[V_MOV_B32_e32_]], [[COPY]], [[COPY1]], implicit $exec
+    ; GCN: S_ENDPGM implicit [[V_ADDC_U32_e64_]]
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = V_MOV_B32_e32 518144, implicit $exec
+    %2:sreg_64_xexec = COPY $vcc
+    %3:vgpr_32, %4:sreg_64_xexec = V_ADDC_U32_e64 %1, %0, %2, implicit $exec
+   S_ENDPGM implicit %3
+...
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index 18abf607aea..77d518f503a 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -15,7 +15,8 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2)
 }
 
 ; VI-LABEL: {{^}}dpp_test1:
-; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; VI-NEXT: s_nop 0
 ; VI-NEXT: s_nop 0
diff --git a/test/CodeGen/AMDGPU/r600.add.ll b/test/CodeGen/AMDGPU/r600.add.ll
new file mode 100644
index 00000000000..73eea3ef217
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600.add.ll
@@ -0,0 +1,167 @@
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}s_add_i32:
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define amdgpu_kernel void @s_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
+  %result = add i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v2i32:
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define amdgpu_kernel void @s_add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
+  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
+  %result = add <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v4i32:
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define amdgpu_kernel void @s_add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
+  %result = add <4 x i32> %a, %b
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v8i32:
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+define amdgpu_kernel void @s_add_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
+entry:
+  %0 = add <8 x i32> %a, %b
+  store <8 x i32> %0, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_add_v16i32:
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+; EG: ADD_INT
+define amdgpu_kernel void @s_add_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
+entry:
+  %0 = add <16 x i32> %a, %b
+  store <16 x i32> %0, <16 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_add_i32:
+define amdgpu_kernel void @v_add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
+  %a = load volatile i32, i32 addrspace(1)* %gep
+  %b = load volatile i32, i32 addrspace(1)* %b_ptr
+  %result = add i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_add_imm_i32:
+define amdgpu_kernel void @v_add_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %gep, i32 1
+  %a = load volatile i32, i32 addrspace(1)* %gep
+  %result = add i32 %a, 123
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}add64:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-NOT: SUB
+define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+  %add = add i64 %a, %b
+  store i64 %add, i64 addrspace(1)* %out
+  ret void
+}
+
+; The v_addc_u32 and v_add_i32 instructions can't read SGPRs, because they
+; use VCC.  The test is designed so that %a will be stored in an SGPR and
+; %0 will be stored in a VGPR, so the compiler will be forced to copy %a
+; to a VGPR before doing the add.
+
+; FUNC-LABEL: {{^}}add64_sgpr_vgpr:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-NOT: SUB
+define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
+entry:
+  %0 = load i64, i64 addrspace(1)* %in
+  %1 = add i64 %a, %0
+  store i64 %1, i64 addrspace(1)* %out
+  ret void
+}
+
+; Test i64 add inside a branch.
+; FUNC-LABEL: {{^}}add64_in_branch:
+; EG: MEM_RAT_CACHELESS STORE_RAW [[LO:T[0-9]+\.XY]]
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-DAG: ADDC_UINT
+; EG-DAG: ADD_INT
+; EG-DAG: ADD_INT {{[* ]*}}
+; EG-NOT: SUB
+define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+entry:
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = load i64, i64 addrspace(1)* %in
+  br label %endif
+
+else:
+  %2 = add i64 %a, %b
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, i64 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/r600.sub.ll b/test/CodeGen/AMDGPU/r600.sub.ll
new file mode 100644
index 00000000000..2ded4f64328
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600.sub.ll
@@ -0,0 +1,152 @@
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s
+
+declare i32 @llvm.r600.read.tidig.x() readnone
+
+; FUNC-LABEL: {{^}}s_sub_i32:
+define amdgpu_kernel void @s_sub_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+  %result = sub i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_sub_imm_i32:
+define amdgpu_kernel void @s_sub_imm_i32(i32 addrspace(1)* %out, i32 %a) {
+  %result = sub i32 1234, %a
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_sub_i32:
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
+  %result = sub i32 %a, %b
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_sub_imm_i32:
+; EG: SUB_INT
+define amdgpu_kernel void @test_sub_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %a = load i32, i32 addrspace(1)* %in
+  %result = sub i32 123, %a
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_sub_v2i32:
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define amdgpu_kernel void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
+  %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
+  %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
+  %result = sub <2 x i32> %a, %b
+  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_sub_v4i32:
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
+  %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
+  %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
+  %result = sub <4 x i32> %a, %b
+  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_sub_i16:
+define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
+  %b_ptr = getelementptr i16, i16 addrspace(1)* %gep, i32 1
+  %a = load volatile i16, i16 addrspace(1)* %gep
+  %b = load volatile i16, i16 addrspace(1)* %b_ptr
+  %result = sub i16 %a, %b
+  store i16 %result, i16 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_sub_v2i16:
+define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
+  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %gep, i16 1
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
+  %result = sub <2 x i16> %a, %b
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_sub_v4i16:
+define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %gep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
+  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %gep, i16 1
+  %a = load <4 x i16>, <4 x i16> addrspace(1) * %gep
+  %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
+  %result = sub <4 x i16> %a, %b
+  store <4 x i16> %result, <4 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}s_sub_i64:
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: SUB_INT {{[* ]*}}
+; EG-DAG: SUBB_UINT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT {{[* ]*}}
+define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
+  %result = sub i64 %a, %b
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_sub_i64:
+; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; EG-DAG: SUB_INT {{[* ]*}}
+; EG-DAG: SUBB_UINT
+; EG-DAG: SUB_INT
+; EG-DAG: SUB_INT {{[* ]*}}
+define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
+  %tid = call i32 @llvm.r600.read.tidig.x() readnone
+  %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
+  %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
+  %a = load i64, i64 addrspace(1)* %a_ptr
+  %b = load i64, i64 addrspace(1)* %b_ptr
+  %result = sub i64 %a, %b
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_sub_v2i64:
+define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
+  %tid = call i32 @llvm.r600.read.tidig.x() readnone
+  %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
+  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
+  %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr
+  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
+  %result = sub <2 x i64> %a, %b
+  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v_test_sub_v4i64:
+define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
+  %tid = call i32 @llvm.r600.read.tidig.x() readnone
+  %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid
+  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid
+  %a = load <4 x i64>, <4 x i64> addrspace(1)* %a_ptr
+  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
+  %result = sub <4 x i64> %a, %b
+  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll
index 9b46962108c..cf42ee9b39b 100644
--- a/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -458,7 +458,7 @@ bb7:                                              ; preds = %bb3
 }
 
 ; GCN-LABEL: {{^}}phi_visit_order:
-; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 1, v{{[0-9]+}}
+; GCN: v_add_i32_e64 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 1, v{{[0-9]+}}
 define amdgpu_kernel void @phi_visit_order() {
 bb:
   br label %bb1
diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll
index 4bd346dc586..485b374454d 100644
--- a/test/CodeGen/AMDGPU/sub.ll
+++ b/test/CodeGen/AMDGPU/sub.ll
@@ -1,11 +1,10 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89,FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
 
-declare i32 @llvm.r600.read.tidig.x() readnone
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable
 
-; FUNC-LABEL: {{^}}s_sub_i32:
+; GCN-LABEL: {{^}}s_sub_i32:
 ; GCN: s_load_dwordx2
 ; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}
 ; GCN: s_sub_i32 s{{[0-9]+}}, s[[A]], s[[B]]
@@ -15,7 +14,7 @@ define amdgpu_kernel void @s_sub_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   ret void
 }
 
-; FUNC-LABEL: {{^}}s_sub_imm_i32:
+; GCN-LABEL: {{^}}s_sub_imm_i32:
 ; GCN: s_load_dword [[A:s[0-9]+]]
 ; GCN: s_sub_i32 s{{[0-9]+}}, 0x4d2, [[A]]
 define amdgpu_kernel void @s_sub_imm_i32(i32 addrspace(1)* %out, i32 %a) {
@@ -24,9 +23,7 @@ define amdgpu_kernel void @s_sub_imm_i32(i32 addrspace(1)* %out, i32 %a) {
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_sub_i32:
-; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
+; GCN-LABEL: {{^}}test_sub_i32:
 ; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -38,9 +35,7 @@ define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_sub_imm_i32:
-; EG: SUB_INT
-
+; GCN-LABEL: {{^}}test_sub_imm_i32:
 ; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x7b, v{{[0-9]+}}
 ; GFX9: v_sub_u32_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
 define amdgpu_kernel void @test_sub_imm_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -50,10 +45,7 @@ define amdgpu_kernel void @test_sub_imm_i32(i32 addrspace(1)* %out, i32 addrspac
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_sub_v2i32:
-; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
+; GCN-LABEL: {{^}}test_sub_v2i32:
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 
@@ -68,12 +60,7 @@ define amdgpu_kernel void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_sub_v4i32:
-; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
+; GCN-LABEL: {{^}}test_sub_v4i32:
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
@@ -92,11 +79,11 @@ define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_sub_i16:
+; GCN-LABEL: {{^}}test_sub_i16:
 ; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
 ; GFX89: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
   %b_ptr = getelementptr i16, i16 addrspace(1)* %gep, i32 1
   %a = load volatile i16, i16 addrspace(1)* %gep
@@ -106,13 +93,13 @@ define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_sub_v2i16:
+; GCN-LABEL: {{^}}test_sub_v2i16:
 ; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
 ; GFX9: v_pk_sub_i16
 define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
   %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %gep, i16 1
   %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep
@@ -122,7 +109,7 @@ define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_sub_v4i16:
+; GCN-LABEL: {{^}}test_sub_v4i16:
 ; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; VI: v_sub_u16_sdwa v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; VI: v_sub_u16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
@@ -131,7 +118,7 @@ define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
 ; GFX9: v_pk_sub_i16
 ; GFX9: v_pk_sub_i16
 define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
   %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %gep, i16 1
   %a = load <4 x i16>, <4 x i16> addrspace(1) * %gep
@@ -141,22 +128,16 @@ define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16
   ret void
 }
 
-; FUNC-LABEL: {{^}}s_sub_i64:
+; GCN-LABEL: {{^}}s_sub_i64:
 ; GCN: s_sub_u32
 ; GCN: s_subb_u32
-
-; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
-; EG-DAG: SUB_INT {{[* ]*}}
-; EG-DAG: SUBB_UINT
-; EG-DAG: SUB_INT
-; EG-DAG: SUB_INT {{[* ]*}}
 define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
   %result = sub i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: {{^}}v_sub_i64:
+; GCN-LABEL: {{^}}v_sub_i64:
 ; SI: v_sub_i32_e32
 ; SI: v_subb_u32_e32
 
@@ -165,14 +146,8 @@ define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64
 
 ; GFX9: v_sub_co_u32_e32
 ; GFX9: v_subb_co_u32_e32
-
-; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
-; EG-DAG: SUB_INT {{[* ]*}}
-; EG-DAG: SUBB_UINT
-; EG-DAG: SUB_INT
-; EG-DAG: SUB_INT {{[* ]*}}
 define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
-  %tid = call i32 @llvm.r600.read.tidig.x() readnone
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
   %a = load i64, i64 addrspace(1)* %a_ptr
@@ -182,7 +157,7 @@ define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspa
   ret void
 }
 
-; FUNC-LABEL: {{^}}v_test_sub_v2i64:
+; GCN-LABEL: {{^}}v_test_sub_v2i64:
 ; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
 ; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
 ; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
@@ -198,7 +173,7 @@ define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspa
 ; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
 ; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
 define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
-  %tid = call i32 @llvm.r600.read.tidig.x() readnone
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
   %a = load <2 x i64>, <2 x i64> addrspace(1)* %a_ptr
@@ -208,7 +183,7 @@ define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
   ret void
 }
 
-; FUNC-LABEL: {{^}}v_test_sub_v4i64:
+; GCN-LABEL: {{^}}v_test_sub_v4i64:
 ; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
 ; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc,
 ; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc,
@@ -236,7 +211,7 @@ define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
 ; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc,
 ; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc,
 define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
-  %tid = call i32 @llvm.r600.read.tidig.x() readnone
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid
   %a = load <4 x i64>, <4 x i64> addrspace(1)* %a_ptr
@@ -245,3 +220,22 @@ define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
   store <4 x i64> %result, <4 x i64> addrspace(1)* %out
   ret void
 }
+
+; Make sure the VOP3 form of sub is initially selected. Otherwise a pair
+; of copies from/to VCC would be necessary.
+
+; GCN-LABEL: {{^}}sub_select_vop3:
+; SI: v_subrev_i32_e64 v0, s[0:1], s0, v0
+; VI: v_subrev_u32_e64 v0, s[0:1], s0, v0
+; GFX9: v_subrev_u32_e32 v0, s0, v0
+
+; GCN: ; def vcc
+; GCN: ds_write_b32
+; GCN: ; use vcc
+define amdgpu_ps void @sub_select_vop3(i32 inreg %s, i32 %v) {
+  %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"()
+  %sub = sub i32 %v, %s
+  store i32 %sub, i32 addrspace(3)* undef
+  call void asm sideeffect "; use vcc", "{vcc}"(i64 %vcc)
+  ret void
+}
diff --git a/test/CodeGen/ARM/tail-call-scheduling.ll b/test/CodeGen/ARM/tail-call-scheduling.ll
new file mode 100644
index 00000000000..591da10256b
--- /dev/null
+++ b/test/CodeGen/ARM/tail-call-scheduling.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s | FileCheck %s
+target triple = "armv6kz-unknown-unknown-gnueabihf"
+
+; Make sure this doesn't crash, and we actually emit a tail call.
+; Unfortunately, this test is sort of fragile: the original issue only
+; shows up if scheduling happens in a very specific order. It is included
+; anyway to demonstrate the issue.
+; CHECK: pop {r4, lr}
+
+@e = external local_unnamed_addr constant [0 x i32 (i32, i32)*], align 4
+
+; Function Attrs: nounwind sspstrong
+define i32 @AVI_ChunkRead_p_chk(i32 %g) nounwind sspstrong "target-cpu"="arm1176jzf-s" {
+entry:
+  %b = alloca i8, align 1
+  %tobool = icmp eq i32 %g, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %add = add nsw i32 %g, 1
+  %arrayidx = getelementptr inbounds [0 x i32 (i32, i32)*], [0 x i32 (i32, i32)*]* @e, i32 0, i32 %add
+  %0 = load i32 (i32, i32)*, i32 (i32, i32)** %arrayidx, align 4
+  %call = tail call i32 %0(i32 0, i32 0) #3
+  br label %return
+
+if.end:                                           ; preds = %entry
+  call void @c(i8* nonnull %b)
+  br label %return
+
+return:                                           ; preds = %if.end, %if.then
+  %retval.0 = phi i32 [ %call, %if.then ], [ 0, %if.end ]
+  ret i32 %retval.0
+}
+
+declare void @c(i8*)
diff --git a/test/CodeGen/AVR/mul.ll b/test/CodeGen/AVR/hardware-mul.ll
similarity index 90%
rename from test/CodeGen/AVR/mul.ll
rename to test/CodeGen/AVR/hardware-mul.ll
index 2f169347c46..650697857b7 100644
--- a/test/CodeGen/AVR/mul.ll
+++ b/test/CodeGen/AVR/hardware-mul.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -mattr=mul,movw < %s -march=avr | FileCheck %s
 
+; Tests lowering of multiplication to hardware instructions.
+
 define i8 @mult8(i8 %a, i8 %b) {
 ; CHECK-LABEL: mult8:
 ; CHECK: muls r22, r24
diff --git a/test/CodeGen/AVR/smul-with-overflow.ll b/test/CodeGen/AVR/smul-with-overflow.ll
index 745e93005cc..9eb2c7411de 100644
--- a/test/CodeGen/AVR/smul-with-overflow.ll
+++ b/test/CodeGen/AVR/smul-with-overflow.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=avr | FileCheck %s
+; RUN: llc -mattr=avr6 < %s -march=avr | FileCheck %s
 
 define i1 @signed_multiplication_did_overflow(i8, i8) unnamed_addr {
 ; CHECK-LABEL: signed_multiplication_did_overflow:
diff --git a/test/CodeGen/AVR/software-mul.ll b/test/CodeGen/AVR/software-mul.ll
new file mode 100644
index 00000000000..9a4d28127eb
--- /dev/null
+++ b/test/CodeGen/AVR/software-mul.ll
@@ -0,0 +1,28 @@
+; RUN: llc -mattr=avr6,-mul < %s -march=avr | FileCheck %s
+; RUN: llc -mcpu=attiny85 < %s -march=avr | FileCheck %s
+; RUN: llc -mcpu=ata5272 < %s -march=avr | FileCheck %s
+; RUN: llc -mcpu=attiny861a < %s -march=avr | FileCheck %s
+; RUN: llc -mcpu=at90usb82 < %s -march=avr | FileCheck %s
+
+; Tests lowering of multiplication to compiler support routines.
+
+; CHECK-LABEL: mul8:
+define i8 @mul8(i8 %a, i8 %b) {
+; CHECK: mov  r25, r24
+; CHECK: mov  r24, r22
+; CHECK: mov  r22, r25
+; CHECK: call __mulqi3
+  %mul = mul i8 %b, %a
+  ret i8 %mul
+}
+
+; CHECK-LABEL: mul16:
+define i16 @mul16(i16 %a, i16 %b) {
+; CHECK: movw  r18, r24
+; CHECK: movw  r24, r22
+; CHECK: movw  r22, r18
+; CHECK: call  __mulhi3
+  %mul = mul nsw i16 %b, %a
+  ret i16 %mul
+}
+
diff --git a/test/CodeGen/AVR/umul-with-overflow.ll b/test/CodeGen/AVR/umul-with-overflow.ll
index aa8b10a313d..c6457552dea 100644
--- a/test/CodeGen/AVR/umul-with-overflow.ll
+++ b/test/CodeGen/AVR/umul-with-overflow.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=avr | FileCheck %s
+; RUN: llc -mattr=avr6 < %s -march=avr | FileCheck %s
 
 define i1 @unsigned_multiplication_did_overflow(i8, i8) unnamed_addr {
 ; CHECK-LABEL: unsigned_multiplication_did_overflow:
diff --git a/test/CodeGen/Mips/Fast-ISel/icmpbr1.ll b/test/CodeGen/Mips/Fast-ISel/icmpbr1.ll
index ef8e1c2b014..e44ab36532c 100644
--- a/test/CodeGen/Mips/Fast-ISel/icmpbr1.ll
+++ b/test/CodeGen/Mips/Fast-ISel/icmpbr1.ll
@@ -17,7 +17,8 @@ bb0:
 bb1:
 ; CHECK:       # %bb.1: # %bb1
 ; CHECK-NEXT:    lw $[[REG2:[0-9]+]], [[SPILL]]($sp) # 4-byte Folded Reload
-; CHECK-NEXT:    bgtz $[[REG2]], $BB0_3
+; CHECK-NEXT:    andi $[[REG3:[0-9]+]], $[[REG2]], 1
+; CHECK-NEXT:    bgtz $[[REG3]], $BB0_3
   br i1 %2, label %bb2, label %bb3
 bb2:
 ; CHECK:         $BB0_3: # %bb2
diff --git a/test/CodeGen/Mips/Fast-ISel/pr40325.ll b/test/CodeGen/Mips/Fast-ISel/pr40325.ll
new file mode 100644
index 00000000000..a9ce70fe8af
--- /dev/null
+++ b/test/CodeGen/Mips/Fast-ISel/pr40325.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=mipsel -relocation-model=pic -O0 -mcpu=mips32 < %s | FileCheck %s
+
+define void @test(i32 %x, i1* %p) nounwind {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    move $1, $4
+; CHECK-NEXT:    andi $4, $4, 1
+; CHECK-NEXT:    sb $4, 0($5)
+; CHECK-NEXT:    andi $1, $1, 1
+; CHECK-NEXT:    bgtz $1, $BB0_1
+; CHECK-NEXT:    nop
+; CHECK-NEXT:  # %bb.1: # %foo
+; CHECK-NEXT:    jr $ra
+; CHECK-NEXT:    nop
+  %y = and i32 %x, 1
+  %c = icmp eq i32 %y, 1
+  store i1 %c, i1* %p
+  br i1 %c, label %foo, label %foo
+
+foo:
+  ret void
+}
diff --git a/test/CodeGen/Mips/abiflags32.ll b/test/CodeGen/Mips/abiflags32.ll
index 39e2a90151e..65201ec0381 100644
--- a/test/CodeGen/Mips/abiflags32.ll
+++ b/test/CodeGen/Mips/abiflags32.ll
@@ -1,6 +1,12 @@
 ; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | FileCheck %s
 ; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 -mattr=fp64 %s -o - | FileCheck  -check-prefix=CHECK-64 %s
 ; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips64 -target-abi n32 %s -o - | FileCheck  -check-prefix=CHECK-64n %s
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 \
+; RUN:     -mattr=soft-float %s -o - | FileCheck  -check-prefix=SOFT %s
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32r6 \
+; RUN:     -mattr=soft-float %s -o - | FileCheck  -check-prefix=SOFT %s
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips64 \
+; RUN:     -mattr=soft-float -target-abi n64 %s -o - | FileCheck  -check-prefix=SOFT %s
 
 ; CHECK: .nan    legacy
 ; We don't emit '.module fp=32' for compatibility with binutils 2.24 which
@@ -15,3 +21,5 @@
 ; We don't emit '.module fp=64' for compatibility with binutils 2.24 which
 ; doesn't accept .module.
 ; CHECK-64n-NOT: .module fp=64
+
+; SOFT: .module softfloat
diff --git a/test/CodeGen/Mips/llvm-ir/fptosi.ll b/test/CodeGen/Mips/llvm-ir/fptosi.ll
new file mode 100644
index 00000000000..03a0de74664
--- /dev/null
+++ b/test/CodeGen/Mips/llvm-ir/fptosi.ll
@@ -0,0 +1,418 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=mips-linux-gnu -mcpu=mips32 -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=M32
+; RUN: llc < %s -mtriple=mips-linux-gnu -mcpu=mips32r2 -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=M32
+; RUN: llc < %s -mtriple=mips-linux-gnu -mcpu=mips32r2 -mattr=+fp64 -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=M32R2-FP64
+; RUN: llc < %s -mtriple=mips-linux-gnu -mcpu=mips32r2 -mattr=+soft-float -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=M32R2-SF
+; RUN: llc < %s -mtriple=mips-linux-gnu -mcpu=mips32r3 -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=M32R3R5
+; RUN: llc < %s -mtriple=mips-linux-gnu -mcpu=mips32r5 -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=M32R3R5
+; RUN: llc < %s -mtriple=mips-linux-gnu -mcpu=mips32r6 -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=M32R6
+; RUN: llc < %s -mtriple=mips64-linux-gnu -mcpu=mips3 -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=M64
+; RUN: llc < %s -mtriple=mips64-linux-gnu -mcpu=mips64 -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=M64
+; RUN: llc < %s -mtriple=mips64-linux-gnu -mcpu=mips64r2 -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=M64
+; RUN: llc < %s -mtriple=mips64-linux-gnu -mcpu=mips64r6 -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=M64R6
+; RUN: llc < %s -mtriple=mips-linux-gnu -mcpu=mips32r2 -mattr=+micromips -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=MMR2-FP32
+; RUN: llc < %s -mtriple=mips-linux-gnu -mcpu=mips32r2 -mattr=+micromips,fp64 -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=MMR2-FP64
+; RUN: llc < %s -mtriple=mips-linux-gnu -mcpu=mips32r2 -mattr=+micromips,soft-float -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=MMR2-SF
+; RUN: llc < %s -mtriple=mips-linux-gnu -mcpu=mips32r6 -mattr=+micromips -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=MMR6
+; RUN: llc < %s -mtriple=mips-linux-gnu -mcpu=mips32r6 -mattr=+micromips,soft-float -asm-show-inst |\
+; RUN:   FileCheck %s -check-prefixes=MMR6-SF
+
+; Test that fptosi can be matched for MIPS targets for various FPU
+; configurations.
+
+define i32 @test1(float %t) {
+; M32-LABEL: test1:
+; M32:       # %bb.0: # %entry
+; M32-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
+; M32-NEXT:    # <MCOperand Reg:147>
+; M32-NEXT:    # <MCOperand Reg:159>>
+; M32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
+; M32-NEXT:    # <MCOperand Reg:19>>
+; M32-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
+; M32-NEXT:    # <MCOperand Reg:321>
+; M32-NEXT:    # <MCOperand Reg:147>>
+;
+; M32R2-FP64-LABEL: test1:
+; M32R2-FP64:       # %bb.0: # %entry
+; M32R2-FP64-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
+; M32R2-FP64-NEXT:    # <MCOperand Reg:147>
+; M32R2-FP64-NEXT:    # <MCOperand Reg:159>>
+; M32R2-FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
+; M32R2-FP64-NEXT:    # <MCOperand Reg:19>>
+; M32R2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
+; M32R2-FP64-NEXT:    # <MCOperand Reg:321>
+; M32R2-FP64-NEXT:    # <MCOperand Reg:147>>
+;
+; M32R2-SF-LABEL: test1:
+; M32R2-SF:       # %bb.0: # %entry
+; M32R2-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #{{[0-9]+}} ADDiu
+; M32R2-SF-NEXT:    # <MCOperand Reg:20>
+; M32R2-SF-NEXT:    # <MCOperand Reg:20>
+; M32R2-SF-NEXT:    # <MCOperand Imm:-24>>
+; M32R2-SF-NEXT:    .cfi_def_cfa_offset 24
+; M32R2-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; M32R2-SF-NEXT:    # <MCInst #{{[0-9]+}} SW
+; M32R2-SF-NEXT:    # <MCOperand Reg:19>
+; M32R2-SF-NEXT:    # <MCOperand Reg:20>
+; M32R2-SF-NEXT:    # <MCOperand Imm:20>>
+; M32R2-SF-NEXT:    .cfi_offset 31, -4
+; M32R2-SF-NEXT:    jal __fixsfsi # <MCInst #{{[0-9]+}} JAL
+; M32R2-SF-NEXT:    # <MCOperand Expr:(__fixsfsi)>>
+; M32R2-SF-NEXT:    nop # <MCInst #{{[0-9]+}} SLL
+; M32R2-SF-NEXT:    # <MCOperand Reg:21>
+; M32R2-SF-NEXT:    # <MCOperand Reg:21>
+; M32R2-SF-NEXT:    # <MCOperand Imm:0>>
+; M32R2-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; M32R2-SF-NEXT:    # <MCInst #{{[0-9]+}} LW
+; M32R2-SF-NEXT:    # <MCOperand Reg:19>
+; M32R2-SF-NEXT:    # <MCOperand Reg:20>
+; M32R2-SF-NEXT:    # <MCOperand Imm:20>>
+; M32R2-SF-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
+; M32R2-SF-NEXT:    # <MCOperand Reg:19>>
+; M32R2-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #{{[0-9]+}} ADDiu
+; M32R2-SF-NEXT:    # <MCOperand Reg:20>
+; M32R2-SF-NEXT:    # <MCOperand Reg:20>
+; M32R2-SF-NEXT:    # <MCOperand Imm:24>>
+;
+; M32R3R5-LABEL: test1:
+; M32R3R5:       # %bb.0: # %entry
+; M32R3R5-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
+; M32R3R5-NEXT:    # <MCOperand Reg:147>
+; M32R3R5-NEXT:    # <MCOperand Reg:159>>
+; M32R3R5-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
+; M32R3R5-NEXT:    # <MCOperand Reg:19>>
+; M32R3R5-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
+; M32R3R5-NEXT:    # <MCOperand Reg:321>
+; M32R3R5-NEXT:    # <MCOperand Reg:147>>
+;
+; M32R6-LABEL: test1:
+; M32R6:       # %bb.0: # %entry
+; M32R6-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
+; M32R6-NEXT:    # <MCOperand Reg:147>
+; M32R6-NEXT:    # <MCOperand Reg:159>>
+; M32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
+; M32R6-NEXT:    # <MCOperand Reg:21>
+; M32R6-NEXT:    # <MCOperand Reg:19>>
+; M32R6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
+; M32R6-NEXT:    # <MCOperand Reg:321>
+; M32R6-NEXT:    # <MCOperand Reg:147>>
+;
+; M64-LABEL: test1:
+; M64:       # %bb.0: # %entry
+; M64-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
+; M64-NEXT:    # <MCOperand Reg:147>
+; M64-NEXT:    # <MCOperand Reg:159>>
+; M64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
+; M64-NEXT:    # <MCOperand Reg:301>>
+; M64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
+; M64-NEXT:    # <MCOperand Reg:321>
+; M64-NEXT:    # <MCOperand Reg:147>>
+;
+; M64R6-LABEL: test1:
+; M64R6:       # %bb.0: # %entry
+; M64R6-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S
+; M64R6-NEXT:    # <MCOperand Reg:147>
+; M64R6-NEXT:    # <MCOperand Reg:159>>
+; M64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
+; M64R6-NEXT:    # <MCOperand Reg:355>
+; M64R6-NEXT:    # <MCOperand Reg:301>>
+; M64R6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
+; M64R6-NEXT:    # <MCOperand Reg:321>
+; M64R6-NEXT:    # <MCOperand Reg:147>>
+;
+; MMR2-FP32-LABEL: test1:
+; MMR2-FP32:       # %bb.0: # %entry
+; MMR2-FP32-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:147>
+; MMR2-FP32-NEXT:    # <MCOperand Reg:159>>
+; MMR2-FP32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:19>>
+; MMR2-FP32-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:321>
+; MMR2-FP32-NEXT:    # <MCOperand Reg:147>>
+;
+; MMR2-FP64-LABEL: test1:
+; MMR2-FP64:       # %bb.0: # %entry
+; MMR2-FP64-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:147>
+; MMR2-FP64-NEXT:    # <MCOperand Reg:159>>
+; MMR2-FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:19>>
+; MMR2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:321>
+; MMR2-FP64-NEXT:    # <MCOperand Reg:147>>
+;
+; MMR2-SF-LABEL: test1:
+; MMR2-SF:       # %bb.0: # %entry
+; MMR2-SF-NEXT:    addiusp -24 # <MCInst #{{[0-9]+}} ADDIUSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Imm:-24>>
+; MMR2-SF-NEXT:    .cfi_def_cfa_offset 24
+; MMR2-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MMR2-SF-NEXT:    # <MCInst #{{[0-9]+}} SWSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:19>
+; MMR2-SF-NEXT:    # <MCOperand Reg:20>
+; MMR2-SF-NEXT:    # <MCOperand Imm:20>>
+; MMR2-SF-NEXT:    .cfi_offset 31, -4
+; MMR2-SF-NEXT:    jal __fixsfsi # <MCInst #{{[0-9]+}} JAL_MM
+; MMR2-SF-NEXT:    # <MCOperand Expr:(__fixsfsi)>>
+; MMR2-SF-NEXT:    nop # <MCInst #{{[0-9]+}} SLL
+; MMR2-SF-NEXT:    # <MCOperand Reg:21>
+; MMR2-SF-NEXT:    # <MCOperand Reg:21>
+; MMR2-SF-NEXT:    # <MCOperand Imm:0>>
+; MMR2-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MMR2-SF-NEXT:    # <MCInst #{{[0-9]+}} LWSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:19>
+; MMR2-SF-NEXT:    # <MCOperand Reg:20>
+; MMR2-SF-NEXT:    # <MCOperand Imm:20>>
+; MMR2-SF-NEXT:    addiusp 24 # <MCInst #{{[0-9]+}} ADDIUSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Imm:24>>
+; MMR2-SF-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:19>>
+;
+; MMR6-LABEL: test1:
+; MMR6:       # %bb.0: # %entry
+; MMR6-NEXT:    trunc.w.s $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_S_MMR6
+; MMR6-NEXT:    # <MCOperand Reg:147>
+; MMR6-NEXT:    # <MCOperand Reg:159>>
+; MMR6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
+; MMR6-NEXT:    # <MCOperand Reg:321>
+; MMR6-NEXT:    # <MCOperand Reg:147>>
+; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:19>>
+;
+; MMR6-SF-LABEL: test1:
+; MMR6-SF:       # %bb.0: # %entry
+; MMR6-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #{{[0-9]+}} ADDiu
+; MMR6-SF-NEXT:    # <MCOperand Reg:20>
+; MMR6-SF-NEXT:    # <MCOperand Reg:20>
+; MMR6-SF-NEXT:    # <MCOperand Imm:-24>>
+; MMR6-SF-NEXT:    .cfi_def_cfa_offset 24
+; MMR6-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MMR6-SF-NEXT:    # <MCInst #{{[0-9]+}} SW
+; MMR6-SF-NEXT:    # <MCOperand Reg:19>
+; MMR6-SF-NEXT:    # <MCOperand Reg:20>
+; MMR6-SF-NEXT:    # <MCOperand Imm:20>>
+; MMR6-SF-NEXT:    .cfi_offset 31, -4
+; MMR6-SF-NEXT:    jalr __fixsfsi # <MCInst #{{[0-9]+}} JALRC16_MMR6
+; MMR6-SF-NEXT:    # <MCOperand Expr:(__fixsfsi)>>
+; MMR6-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MMR6-SF-NEXT:    # <MCInst #{{[0-9]+}} LW
+; MMR6-SF-NEXT:    # <MCOperand Reg:19>
+; MMR6-SF-NEXT:    # <MCOperand Reg:20>
+; MMR6-SF-NEXT:    # <MCOperand Imm:20>>
+; MMR6-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #{{[0-9]+}} ADDiu
+; MMR6-SF-NEXT:    # <MCOperand Reg:20>
+; MMR6-SF-NEXT:    # <MCOperand Reg:20>
+; MMR6-SF-NEXT:    # <MCOperand Imm:24>>
+; MMR6-SF-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
+; MMR6-SF-NEXT:    # <MCOperand Reg:19>>
+entry:
+  %conv = fptosi float %t to i32
+  ret i32 %conv
+}
+
+define i32 @test2(double %t) {
+; M32-LABEL: test2:
+; M32:       # %bb.0: # %entry
+; M32-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D32
+; M32-NEXT:    # <MCOperand Reg:147>
+; M32-NEXT:    # <MCOperand Reg:133>>
+; M32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
+; M32-NEXT:    # <MCOperand Reg:19>>
+; M32-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
+; M32-NEXT:    # <MCOperand Reg:321>
+; M32-NEXT:    # <MCOperand Reg:147>>
+;
+; M32R2-FP64-LABEL: test2:
+; M32R2-FP64:       # %bb.0: # %entry
+; M32R2-FP64-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D64
+; M32R2-FP64-NEXT:    # <MCOperand Reg:147>
+; M32R2-FP64-NEXT:    # <MCOperand Reg:373>>
+; M32R2-FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
+; M32R2-FP64-NEXT:    # <MCOperand Reg:19>>
+; M32R2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
+; M32R2-FP64-NEXT:    # <MCOperand Reg:321>
+; M32R2-FP64-NEXT:    # <MCOperand Reg:147>>
+;
+; M32R2-SF-LABEL: test2:
+; M32R2-SF:       # %bb.0: # %entry
+; M32R2-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #{{[0-9]+}} ADDiu
+; M32R2-SF-NEXT:    # <MCOperand Reg:20>
+; M32R2-SF-NEXT:    # <MCOperand Reg:20>
+; M32R2-SF-NEXT:    # <MCOperand Imm:-24>>
+; M32R2-SF-NEXT:    .cfi_def_cfa_offset 24
+; M32R2-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; M32R2-SF-NEXT:    # <MCInst #{{[0-9]+}} SW
+; M32R2-SF-NEXT:    # <MCOperand Reg:19>
+; M32R2-SF-NEXT:    # <MCOperand Reg:20>
+; M32R2-SF-NEXT:    # <MCOperand Imm:20>>
+; M32R2-SF-NEXT:    .cfi_offset 31, -4
+; M32R2-SF-NEXT:    jal __fixdfsi # <MCInst #{{[0-9]+}} JAL
+; M32R2-SF-NEXT:    # <MCOperand Expr:(__fixdfsi)>>
+; M32R2-SF-NEXT:    nop # <MCInst #{{[0-9]+}} SLL
+; M32R2-SF-NEXT:    # <MCOperand Reg:21>
+; M32R2-SF-NEXT:    # <MCOperand Reg:21>
+; M32R2-SF-NEXT:    # <MCOperand Imm:0>>
+; M32R2-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; M32R2-SF-NEXT:    # <MCInst #{{[0-9]+}} LW
+; M32R2-SF-NEXT:    # <MCOperand Reg:19>
+; M32R2-SF-NEXT:    # <MCOperand Reg:20>
+; M32R2-SF-NEXT:    # <MCOperand Imm:20>>
+; M32R2-SF-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
+; M32R2-SF-NEXT:    # <MCOperand Reg:19>>
+; M32R2-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #{{[0-9]+}} ADDiu
+; M32R2-SF-NEXT:    # <MCOperand Reg:20>
+; M32R2-SF-NEXT:    # <MCOperand Reg:20>
+; M32R2-SF-NEXT:    # <MCOperand Imm:24>>
+;
+; M32R3R5-LABEL: test2:
+; M32R3R5:       # %bb.0: # %entry
+; M32R3R5-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D32
+; M32R3R5-NEXT:    # <MCOperand Reg:147>
+; M32R3R5-NEXT:    # <MCOperand Reg:133>>
+; M32R3R5-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
+; M32R3R5-NEXT:    # <MCOperand Reg:19>>
+; M32R3R5-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
+; M32R3R5-NEXT:    # <MCOperand Reg:321>
+; M32R3R5-NEXT:    # <MCOperand Reg:147>>
+;
+; M32R6-LABEL: test2:
+; M32R6:       # %bb.0: # %entry
+; M32R6-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D64
+; M32R6-NEXT:    # <MCOperand Reg:147>
+; M32R6-NEXT:    # <MCOperand Reg:373>>
+; M32R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR
+; M32R6-NEXT:    # <MCOperand Reg:21>
+; M32R6-NEXT:    # <MCOperand Reg:19>>
+; M32R6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
+; M32R6-NEXT:    # <MCOperand Reg:321>
+; M32R6-NEXT:    # <MCOperand Reg:147>>
+;
+; M64-LABEL: test2:
+; M64:       # %bb.0: # %entry
+; M64-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D64
+; M64-NEXT:    # <MCOperand Reg:147>
+; M64-NEXT:    # <MCOperand Reg:373>>
+; M64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR
+; M64-NEXT:    # <MCOperand Reg:301>>
+; M64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
+; M64-NEXT:    # <MCOperand Reg:321>
+; M64-NEXT:    # <MCOperand Reg:147>>
+;
+; M64R6-LABEL: test2:
+; M64R6:       # %bb.0: # %entry
+; M64R6-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D64
+; M64R6-NEXT:    # <MCOperand Reg:147>
+; M64R6-NEXT:    # <MCOperand Reg:373>>
+; M64R6-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JALR64
+; M64R6-NEXT:    # <MCOperand Reg:355>
+; M64R6-NEXT:    # <MCOperand Reg:301>>
+; M64R6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1
+; M64R6-NEXT:    # <MCOperand Reg:321>
+; M64R6-NEXT:    # <MCOperand Reg:147>>
+;
+; MMR2-FP32-LABEL: test2:
+; MMR2-FP32:       # %bb.0: # %entry
+; MMR2-FP32-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:147>
+; MMR2-FP32-NEXT:    # <MCOperand Reg:133>>
+; MMR2-FP32-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:19>>
+; MMR2-FP32-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
+; MMR2-FP32-NEXT:    # <MCOperand Reg:321>
+; MMR2-FP32-NEXT:    # <MCOperand Reg:147>>
+;
+; MMR2-FP64-LABEL: test2:
+; MMR2-FP64:       # %bb.0: # %entry
+; MMR2-FP64-NEXT:    cvt.w.d $f0, $f12 # <MCInst #{{[0-9]+}} CVT_W_D64_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:147>
+; MMR2-FP64-NEXT:    # <MCOperand Reg:373>>
+; MMR2-FP64-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:19>>
+; MMR2-FP64-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
+; MMR2-FP64-NEXT:    # <MCOperand Reg:321>
+; MMR2-FP64-NEXT:    # <MCOperand Reg:147>>
+;
+; MMR2-SF-LABEL: test2:
+; MMR2-SF:       # %bb.0: # %entry
+; MMR2-SF-NEXT:    addiusp -24 # <MCInst #{{[0-9]+}} ADDIUSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Imm:-24>>
+; MMR2-SF-NEXT:    .cfi_def_cfa_offset 24
+; MMR2-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MMR2-SF-NEXT:    # <MCInst #{{[0-9]+}} SWSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:19>
+; MMR2-SF-NEXT:    # <MCOperand Reg:20>
+; MMR2-SF-NEXT:    # <MCOperand Imm:20>>
+; MMR2-SF-NEXT:    .cfi_offset 31, -4
+; MMR2-SF-NEXT:    jal __fixdfsi # <MCInst #{{[0-9]+}} JAL_MM
+; MMR2-SF-NEXT:    # <MCOperand Expr:(__fixdfsi)>>
+; MMR2-SF-NEXT:    nop # <MCInst #{{[0-9]+}} SLL
+; MMR2-SF-NEXT:    # <MCOperand Reg:21>
+; MMR2-SF-NEXT:    # <MCOperand Reg:21>
+; MMR2-SF-NEXT:    # <MCOperand Imm:0>>
+; MMR2-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MMR2-SF-NEXT:    # <MCInst #{{[0-9]+}} LWSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:19>
+; MMR2-SF-NEXT:    # <MCOperand Reg:20>
+; MMR2-SF-NEXT:    # <MCOperand Imm:20>>
+; MMR2-SF-NEXT:    addiusp 24 # <MCInst #{{[0-9]+}} ADDIUSP_MM
+; MMR2-SF-NEXT:    # <MCOperand Imm:24>>
+; MMR2-SF-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
+; MMR2-SF-NEXT:    # <MCOperand Reg:19>>
+;
+; MMR6-LABEL: test2:
+; MMR6:       # %bb.0: # %entry
+; MMR6-NEXT:    trunc.w.d $f0, $f12 # <MCInst #{{[0-9]+}} TRUNC_W_D_MMR6
+; MMR6-NEXT:    # <MCOperand Reg:147>
+; MMR6-NEXT:    # <MCOperand Reg:373>>
+; MMR6-NEXT:    mfc1 $2, $f0 # <MCInst #{{[0-9]+}} MFC1_MM
+; MMR6-NEXT:    # <MCOperand Reg:321>
+; MMR6-NEXT:    # <MCOperand Reg:147>>
+; MMR6-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
+; MMR6-NEXT:    # <MCOperand Reg:19>>
+;
+; MMR6-SF-LABEL: test2:
+; MMR6-SF:       # %bb.0: # %entry
+; MMR6-SF-NEXT:    addiu $sp, $sp, -24 # <MCInst #{{[0-9]+}} ADDiu
+; MMR6-SF-NEXT:    # <MCOperand Reg:20>
+; MMR6-SF-NEXT:    # <MCOperand Reg:20>
+; MMR6-SF-NEXT:    # <MCOperand Imm:-24>>
+; MMR6-SF-NEXT:    .cfi_def_cfa_offset 24
+; MMR6-SF-NEXT:    sw $ra, 20($sp) # 4-byte Folded Spill
+; MMR6-SF-NEXT:    # <MCInst #{{[0-9]+}} SW
+; MMR6-SF-NEXT:    # <MCOperand Reg:19>
+; MMR6-SF-NEXT:    # <MCOperand Reg:20>
+; MMR6-SF-NEXT:    # <MCOperand Imm:20>>
+; MMR6-SF-NEXT:    .cfi_offset 31, -4
+; MMR6-SF-NEXT:    jalr __fixdfsi # <MCInst #{{[0-9]+}} JALRC16_MMR6
+; MMR6-SF-NEXT:    # <MCOperand Expr:(__fixdfsi)>>
+; MMR6-SF-NEXT:    lw $ra, 20($sp) # 4-byte Folded Reload
+; MMR6-SF-NEXT:    # <MCInst #{{[0-9]+}} LW
+; MMR6-SF-NEXT:    # <MCOperand Reg:19>
+; MMR6-SF-NEXT:    # <MCOperand Reg:20>
+; MMR6-SF-NEXT:    # <MCOperand Imm:20>>
+; MMR6-SF-NEXT:    addiu $sp, $sp, 24 # <MCInst #{{[0-9]+}} ADDiu
+; MMR6-SF-NEXT:    # <MCOperand Reg:20>
+; MMR6-SF-NEXT:    # <MCOperand Reg:20>
+; MMR6-SF-NEXT:    # <MCOperand Imm:24>>
+; MMR6-SF-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
+; MMR6-SF-NEXT:    # <MCOperand Reg:19>>
+entry:
+  %conv = fptosi double %t to i32
+  ret i32 %conv
+}
diff --git a/test/CodeGen/Mips/micromips-pseudo-mtlohi-expand.ll b/test/CodeGen/Mips/micromips-pseudo-mtlohi-expand.ll
new file mode 100644
index 00000000000..3f86bd24f34
--- /dev/null
+++ b/test/CodeGen/Mips/micromips-pseudo-mtlohi-expand.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=mipsel-linux-gnu -mcpu=mips32r2 -mattr=+micromips -asm-show-inst < %s |\
+; RUN:   FileCheck %s -check-prefixes=MMR2
+; RUN: llc -mtriple=mipsel-linux-gnu -mcpu=mips32r2 -mattr=+dsp,+micromips -asm-show-inst < %s |\
+; RUN:   FileCheck %s -check-prefixes=MMR2-DSP
+
+define i64 @test(i32 signext %a, i32 signext %b) {
+; MMR2-LABEL: test:
+; MMR2:       # %bb.0: # %entry
+; MMR2-NEXT:    li16 $2, 0 # <MCInst #{{[0-9]+}} LI16_MM
+; MMR2-NEXT:    # <MCOperand Reg:321>
+; MMR2-NEXT:    # <MCOperand Imm:0>>
+; MMR2-NEXT:    li16 $3, 1 # <MCInst #{{[0-9]+}} LI16_MM
+; MMR2-NEXT:    # <MCOperand Reg:322>
+; MMR2-NEXT:    # <MCOperand Imm:1>>
+; MMR2-NEXT:    mtlo $3 # <MCInst #{{[0-9]+}} MTLO_MM
+; MMR2-NEXT:    # <MCOperand Reg:322>>
+; MMR2-NEXT:    mthi $2 # <MCInst #{{[0-9]+}} MTHI_MM
+; MMR2-NEXT:    # <MCOperand Reg:321>>
+; MMR2-NEXT:    madd $4, $5 # <MCInst #{{[0-9]+}} MADD
+; MMR2-NEXT:    # <MCOperand Reg:22>
+; MMR2-NEXT:    # <MCOperand Reg:23>>
+; MMR2-NEXT:    mflo16 $2 # <MCInst #{{[0-9]+}} MFLO16_MM
+; MMR2-NEXT:    # <MCOperand Reg:321>>
+; MMR2-NEXT:    mfhi16 $3 # <MCInst #{{[0-9]+}} MFHI16_MM
+; MMR2-NEXT:    # <MCOperand Reg:322>>
+; MMR2-NEXT:    jrc $ra # <MCInst #{{[0-9]+}} JRC16_MM
+; MMR2-NEXT:    # <MCOperand Reg:19>>
+;
+; MMR2-DSP-LABEL: test:
+; MMR2-DSP:       # %bb.0: # %entry
+; MMR2-DSP-NEXT:    li16 $2, 0 # <MCInst #{{[0-9]+}} LI16_MM
+; MMR2-DSP-NEXT:    # <MCOperand Reg:321>
+; MMR2-DSP-NEXT:    # <MCOperand Imm:0>>
+; MMR2-DSP-NEXT:    li16 $3, 1 # <MCInst #{{[0-9]+}} LI16_MM
+; MMR2-DSP-NEXT:    # <MCOperand Reg:322>
+; MMR2-DSP-NEXT:    # <MCOperand Imm:1>>
+; MMR2-DSP-NEXT:    mtlo $3, $ac0 # <MCInst #{{[0-9]+}} MTLO_DSP
+; MMR2-DSP-NEXT:    # <MCOperand Reg:291>
+; MMR2-DSP-NEXT:    # <MCOperand Reg:322>>
+; MMR2-DSP-NEXT:    mthi $2, $ac0 # <MCInst #{{[0-9]+}} MTHI_DSP
+; MMR2-DSP-NEXT:    # <MCOperand Reg:253>
+; MMR2-DSP-NEXT:    # <MCOperand Reg:321>>
+; MMR2-DSP-NEXT:    madd $ac0, $4, $5 # <MCInst #{{[0-9]+}} MADD_DSP
+; MMR2-DSP-NEXT:    # <MCOperand Reg:26>
+; MMR2-DSP-NEXT:    # <MCOperand Reg:22>
+; MMR2-DSP-NEXT:    # <MCOperand Reg:23>
+; MMR2-DSP-NEXT:    # <MCOperand Reg:26>>
+; MMR2-DSP-NEXT:    mflo $2, $ac0 # <MCInst #{{[0-9]+}} MFLO_DSP
+; MMR2-DSP-NEXT:    # <MCOperand Reg:321>
+; MMR2-DSP-NEXT:    # <MCOperand Reg:26>>
+; MMR2-DSP-NEXT:    jr $ra # <MCInst #{{[0-9]+}} JR_MM
+; MMR2-DSP-NEXT:    # <MCOperand Reg:19>>
+; MMR2-DSP-NEXT:    mfhi $3, $ac0 # <MCInst #{{[0-9]+}} MFHI_DSP
+; MMR2-DSP-NEXT:    # <MCOperand Reg:322>
+; MMR2-DSP-NEXT:    # <MCOperand Reg:26>>
+entry: ; computes sext(%a) * sext(%b) + 1 as an i64
+  %conv = sext i32 %a to i64
+  %conv1 = sext i32 %b to i64
+  %mul = mul nsw i64 %conv, %conv1 ; 64-bit product of the two sign-extended arguments
+  %add = add nsw i64 %mul, 1 ; the assertions above expect the constant 1 to be moved into lo/hi (mtlo/mthi) and the multiply-add to be a single madd
+  ret i64 %add
+}
diff --git a/test/CodeGen/Mips/pseudo-jump-fill.ll b/test/CodeGen/Mips/pseudo-jump-fill.ll
new file mode 100644
index 00000000000..31f077d57a9
--- /dev/null
+++ b/test/CodeGen/Mips/pseudo-jump-fill.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=mipsel-linux-gnu -mattr=+micromips -relocation-model=pic < %s | FileCheck %s
+
+; Test that the delay slot filler handles indirect branches correctly for
+; microMIPS: it must not place a 16-bit instruction in the delay slot of a
+; 32-bit instruction.
+
+define i32 @test(i32 signext %x, i32 signext %c) {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lui $2, %hi(_gp_disp)
+; CHECK-NEXT:    addiu $2, $2, %lo(_gp_disp)
+; CHECK-NEXT:    addiur2 $5, $5, -1
+; CHECK-NEXT:    sltiu $1, $5, 4
+; CHECK-NEXT:    beqz $1, $BB0_3
+; CHECK-NEXT:    addu $3, $2, $25
+; CHECK-NEXT:  $BB0_1: # %entry
+; CHECK-NEXT:    li16 $2, 0
+; CHECK-NEXT:    sll16 $5, $5, 2
+; CHECK-NEXT:    lw $6, %got($JTI0_0)($3)
+; CHECK-NEXT:    addu16 $5, $5, $6
+; CHECK-NEXT:    lw $5, %lo($JTI0_0)($5)
+; CHECK-NEXT:    addu16 $3, $5, $3
+; CHECK-NEXT:    jr $3
+; CHECK-NEXT:    nop
+; CHECK-NEXT:  $BB0_2: # %sw.bb2
+; CHECK-NEXT:    addiur2 $2, $4, 1
+; CHECK-NEXT:    jrc $ra
+; CHECK-NEXT:  $BB0_3:
+; CHECK-NEXT:    move $2, $4
+; CHECK-NEXT:    jrc $ra
+; CHECK-NEXT:  $BB0_4: # %sw.bb3
+; CHECK-NEXT:    addius5 $4, 2
+; CHECK-NEXT:    move $2, $4
+; CHECK-NEXT:    jrc $ra
+; CHECK-NEXT:  $BB0_5: # %sw.bb5
+; CHECK-NEXT:    addius5 $4, 3
+; CHECK-NEXT:    move $2, $4
+; CHECK-NEXT:  $BB0_6: # %for.cond.cleanup
+; CHECK-NEXT:    jrc $ra
+entry: ; the assertions above expect this switch to be dispatched through a jump table and an indirect 'jr $3' whose delay slot holds a nop
+  switch i32 %c, label %sw.epilog [
+    i32 4, label %sw.bb5
+    i32 1, label %for.cond.cleanup ; case 1 bypasses %sw.epilog and yields 0 (see the phi in %for.cond.cleanup)
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb2:
+  %add = add nsw i32 %x, 1
+  br label %sw.epilog
+
+sw.bb3:
+  %add4 = add nsw i32 %x, 2
+  br label %sw.epilog
+
+sw.bb5:
+  %add6 = add nsw i32 %x, 3
+  br label %sw.epilog
+
+sw.epilog:
+  %a.0 = phi i32 [ %add6, %sw.bb5 ], [ %add4, %sw.bb3 ], [ %add, %sw.bb2 ], [ %x, %entry ] ; default case (from %entry) passes %x through unchanged
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %a.028 = phi i32 [ %a.0, %sw.epilog ], [ 0, %entry ] ; 0 when reached directly from the switch, otherwise the value selected in %sw.epilog
+  ret i32 %a.028
+}
diff --git a/test/CodeGen/PowerPC/ppc32-pic-large.ll b/test/CodeGen/PowerPC/ppc32-pic-large.ll
index d6e491ea273..272138e5121 100644
--- a/test/CodeGen/PowerPC/ppc32-pic-large.ll
+++ b/test/CodeGen/PowerPC/ppc32-pic-large.ll
@@ -1,5 +1,9 @@
 ; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -relocation-model=pic | FileCheck -check-prefix=LARGE-BSS %s
 ; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -mattr=+secure-plt -relocation-model=pic | FileCheck -check-prefix=LARGE-SECUREPLT %s
+; RUN: llc < %s -mtriple=powerpc-unknown-netbsd -mattr=+secure-plt -relocation-model=pic | FileCheck -check-prefix=LARGE-SECUREPLT %s
+; RUN: llc < %s -mtriple=powerpc-unknown-netbsd -relocation-model=pic | FileCheck -check-prefix=LARGE-SECUREPLT %s
+; RUN: llc < %s -mtriple=powerpc-unknown-openbsd -mattr=+secure-plt -relocation-model=pic | FileCheck -check-prefix=LARGE-SECUREPLT %s
+; RUN: llc < %s -mtriple=powerpc-unknown-openbsd -relocation-model=pic | FileCheck -check-prefix=LARGE-SECUREPLT %s
 @bar = common global i32 0, align 4
 
 declare i32 @call_foo(i32, ...)
diff --git a/test/CodeGen/SPARC/fp128.ll b/test/CodeGen/SPARC/fp128.ll
index 535f0ef60c4..21a9cdf77e0 100644
--- a/test/CodeGen/SPARC/fp128.ll
+++ b/test/CodeGen/SPARC/fp128.ll
@@ -53,6 +53,29 @@ entry:
   ret void
 }
 
+; CHECK-LABEL: f128_spill_large:
+; CHECK:       sethi 4, %g1
+; CHECK:       sethi 4, %g1
+; CHECK-NEXT:  add %g1, %sp, %g1
+; CHECK-NEXT:  std %f{{.+}}, [%g1]
+; CHECK:       sethi 4, %g1
+; CHECK-NEXT:  add %g1, %sp, %g1
+; CHECK-NEXT:  std %f{{.+}}, [%g1+8]
+; CHECK:       sethi 4, %g1
+; CHECK-NEXT:  add %g1, %sp, %g1
+; CHECK-NEXT:  ldd [%g1], %f{{.+}}
+; CHECK:       sethi 4, %g1
+; CHECK-NEXT:  add %g1, %sp, %g1
+; CHECK-NEXT:  ldd [%g1+8], %f{{.+}}
+
+define void @f128_spill_large(<251 x fp128>* noalias sret %scalar.result, <251 x fp128>* byval %a) {
+entry: ; the assertions above expect spill/reload addresses to be formed with sethi/add into %g1 before each std/ldd
+  %0 = load <251 x fp128>, <251 x fp128>* %a, align 8 ; load all 251 fp128 elements; they must stay live across the asm below
+  call void asm sideeffect "", "~{f0},~{f1},~{f2},~{f3},~{f4},~{f5},~{f6},~{f7},~{f8},~{f9},~{f10},~{f11},~{f12},~{f13},~{f14},~{f15},~{f16},~{f17},~{f18},~{f19},~{f20},~{f21},~{f22},~{f23},~{f24},~{f25},~{f26},~{f27},~{f28},~{f29},~{f30},~{f31}"() ; empty asm listing f0-f31 as clobbers; presumably what forces the fp128 spills the test name refers to (NOTE(review): confirm)
+  store <251 x fp128> %0, <251 x fp128>* %scalar.result, align 8
+  ret void
+}
+
 ; CHECK-LABEL: f128_compare:
 ; HARD:       fcmpq
 ; HARD-NEXT:  nop
diff --git a/test/CodeGen/WebAssembly/varargs.ll b/test/CodeGen/WebAssembly/varargs.ll
index 1a73716c2a6..5a8df4cd2fe 100644
--- a/test/CodeGen/WebAssembly/varargs.ll
+++ b/test/CodeGen/WebAssembly/varargs.ll
@@ -163,6 +163,32 @@ define void @nonlegal_fixed(fp128 %x, ...) nounwind {
   ret void
 }
 
+; Test that an fp128 argument is properly aligned and allocated
+; within a vararg buffer.
+
+; CHECK-LABEL: call_fp128_alignment:
+; CHECK:      global.get      $push7=, __stack_pointer
+; CHECK-NEXT: i32.const       $push8=, 32
+; CHECK-NEXT: i32.sub         $push12=, $pop7, $pop8
+; CHECK-NEXT: local.tee       $push11=, $1=, $pop12
+; CHECK-NEXT: global.set      __stack_pointer@GLOBAL, $pop11
+; CHECK-NEXT: i32.const       $push0=, 24
+; CHECK-NEXT: i32.add         $push1=, $1, $pop0
+; CHECK-NEXT: i64.const       $push2=, -9223372036854775808
+; CHECK-NEXT: i64.store       0($pop1), $pop2
+; CHECK-NEXT: i32.const       $push3=, 16
+; CHECK-NEXT: i32.add         $push4=, $1, $pop3
+; CHECK-NEXT: i64.const       $push5=, 1
+; CHECK-NEXT: i64.store       0($pop4), $pop5
+; CHECK-NEXT: i32.const       $push6=, 7
+; CHECK-NEXT: i32.store       0($1), $pop6
+; CHECK-NEXT: call            callee@FUNCTION, $1
+define void @call_fp128_alignment(i8* %p) {
+entry: ; passes an i8 followed by an fp128 to the variadic @callee
+  call void (...) @callee(i8 7, fp128 0xL00000000000000018000000000000000) ; expected layout per the assertions above: i32 store of 7 at offset 0, the two i64 halves of the fp128 at offsets 16 and 24 of a 32-byte buffer
+  ret void
+}
+
 declare void @llvm.va_start(i8*)
 declare void @llvm.va_end(i8*)
 declare void @llvm.va_copy(i8*, i8*)
diff --git a/test/CodeGen/X86/PR40322.ll b/test/CodeGen/X86/PR40322.ll
new file mode 100644
index 00000000000..22bf1822c65
--- /dev/null
+++ b/test/CodeGen/X86/PR40322.ll
@@ -0,0 +1,164 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-windows-gnu | FileCheck %s --check-prefix=CHECK-MINGW-X86
+
+%struct.as = type { i32* }
+
+@_ZZ2amiE2au = internal unnamed_addr global %struct.as zeroinitializer, align 4
+@_ZGVZ2amiE2au = internal global i64 0, align 8
+@_ZTIi = external constant i8*
+
+define void @_Z2ami(i32) #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+; CHECK-MINGW-X86-LABEL: _Z2ami:
+; CHECK-MINGW-X86:       # %bb.0: # %entry
+; CHECK-MINGW-X86-NEXT:    pushl %edi
+; CHECK-MINGW-X86-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-MINGW-X86-NEXT:    pushl %esi
+; CHECK-MINGW-X86-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-MINGW-X86-NEXT:    .cfi_offset %esi, -12
+; CHECK-MINGW-X86-NEXT:    .cfi_offset %edi, -8
+; CHECK-MINGW-X86-NEXT:    movb __ZGVZ2amiE2au, %al
+; CHECK-MINGW-X86-NEXT:    testb %al, %al
+; CHECK-MINGW-X86-NEXT:    jne LBB0_4
+; CHECK-MINGW-X86-NEXT:  # %bb.1: # %init.check
+; CHECK-MINGW-X86-NEXT:    .cfi_escape 0x2e, 0x04
+; CHECK-MINGW-X86-NEXT:    pushl $__ZGVZ2amiE2au
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-MINGW-X86-NEXT:    calll ___cxa_guard_acquire
+; CHECK-MINGW-X86-NEXT:    addl $4, %esp
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK-MINGW-X86-NEXT:    testl %eax, %eax
+; CHECK-MINGW-X86-NEXT:    je LBB0_4
+; CHECK-MINGW-X86-NEXT:  # %bb.2: # %init
+; CHECK-MINGW-X86-NEXT:  Ltmp0:
+; CHECK-MINGW-X86-NEXT:    .cfi_escape 0x2e, 0x04
+; CHECK-MINGW-X86-NEXT:    pushl $4
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-MINGW-X86-NEXT:    calll __Znwj
+; CHECK-MINGW-X86-NEXT:    addl $4, %esp
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK-MINGW-X86-NEXT:  Ltmp1:
+; CHECK-MINGW-X86-NEXT:  # %bb.3: # %invoke.cont
+; CHECK-MINGW-X86-NEXT:    movl %eax, __ZZ2amiE2au
+; CHECK-MINGW-X86-NEXT:    .cfi_escape 0x2e, 0x04
+; CHECK-MINGW-X86-NEXT:    pushl $__ZGVZ2amiE2au
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-MINGW-X86-NEXT:    calll ___cxa_guard_release
+; CHECK-MINGW-X86-NEXT:    addl $4, %esp
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK-MINGW-X86-NEXT:  LBB0_4: # %init.end
+; CHECK-MINGW-X86-NEXT:    .cfi_escape 0x2e, 0x04
+; CHECK-MINGW-X86-NEXT:    pushl $4
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-MINGW-X86-NEXT:    calll __Znwj
+; CHECK-MINGW-X86-NEXT:    addl $4, %esp
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK-MINGW-X86-NEXT:    movl %eax, %esi
+; CHECK-MINGW-X86-NEXT:    .cfi_escape 0x2e, 0x04
+; CHECK-MINGW-X86-NEXT:    pushl $4
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-MINGW-X86-NEXT:    calll ___cxa_allocate_exception
+; CHECK-MINGW-X86-NEXT:    addl $4, %esp
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK-MINGW-X86-NEXT:    movl $0, (%eax)
+; CHECK-MINGW-X86-NEXT:  Ltmp3:
+; CHECK-MINGW-X86-NEXT:    .cfi_escape 0x2e, 0x0c
+; CHECK-MINGW-X86-NEXT:    movl .refptr.__ZTIi, %ecx
+; CHECK-MINGW-X86-NEXT:    pushl $0
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-MINGW-X86-NEXT:    pushl %ecx
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-MINGW-X86-NEXT:    pushl %eax
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-MINGW-X86-NEXT:    calll ___cxa_throw
+; CHECK-MINGW-X86-NEXT:    addl $12, %esp
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset -12
+; CHECK-MINGW-X86-NEXT:  Ltmp4:
+; CHECK-MINGW-X86-NEXT:  # %bb.8: # %unreachable
+; CHECK-MINGW-X86-NEXT:  LBB0_5: # %lpad
+; CHECK-MINGW-X86-NEXT:  Ltmp2:
+; CHECK-MINGW-X86-NEXT:    movl %eax, %edi
+; CHECK-MINGW-X86-NEXT:    .cfi_escape 0x2e, 0x04
+; CHECK-MINGW-X86-NEXT:    pushl $__ZGVZ2amiE2au
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-MINGW-X86-NEXT:    calll ___cxa_guard_abort
+; CHECK-MINGW-X86-NEXT:    jmp LBB0_7
+; CHECK-MINGW-X86-NEXT:  LBB0_6: # %lpad1
+; CHECK-MINGW-X86-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-MINGW-X86-NEXT:  Ltmp5:
+; CHECK-MINGW-X86-NEXT:    movl %eax, %edi
+; CHECK-MINGW-X86-NEXT:    .cfi_escape 0x2e, 0x04
+; CHECK-MINGW-X86-NEXT:    pushl %esi
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-MINGW-X86-NEXT:    calll __ZdlPv
+; CHECK-MINGW-X86-NEXT:  LBB0_7: # %eh.resume
+; CHECK-MINGW-X86-NEXT:    addl $4, %esp
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK-MINGW-X86-NEXT:    .cfi_escape 0x2e, 0x04
+; CHECK-MINGW-X86-NEXT:    pushl %edi
+; CHECK-MINGW-X86-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-MINGW-X86-NEXT:    calll __Unwind_Resume
+; CHECK-MINGW-X86-NEXT:  Lfunc_end0:
+entry:
+  %1 = load atomic i8, i8* bitcast (i64* @_ZGVZ2amiE2au to i8*) acquire, align 8 ; acquire-load of the first byte of @_ZGVZ2amiE2au
+  %guard.uninitialized = icmp eq i8 %1, 0 ; a zero byte selects the %init.check path
+  br i1 %guard.uninitialized, label %init.check, label %init.end
+
+init.check:                                       ; preds = %entry
+  %2 = tail call i32 @__cxa_guard_acquire(i64* nonnull @_ZGVZ2amiE2au)
+  %tobool = icmp eq i32 %2, 0 ; a zero result skips %init and goes straight to %init.end
+  br i1 %tobool, label %init.end, label %init
+
+init:                                             ; preds = %init.check
+  %call.i3 = invoke i8* @_Znwj(i32 4)
+          to label %invoke.cont unwind label %lpad
+
+invoke.cont:                                      ; preds = %init
+  store i8* %call.i3, i8** bitcast (%struct.as* @_ZZ2amiE2au to i8**), align 4
+  tail call void @__cxa_guard_release(i64* nonnull @_ZGVZ2amiE2au)
+  br label %init.end
+
+init.end:                                         ; preds = %init.check, %invoke.cont, %entry
+  %call.i = tail call i8* @_Znwj(i32 4) ; passed to @_ZdlPv in %lpad1 if the invoke below unwinds
+  %exception = tail call i8* @__cxa_allocate_exception(i32 4)
+  %3 = bitcast i8* %exception to i32*
+  store i32 0, i32* %3, align 16
+  invoke void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null)
+          to label %unreachable unwind label %lpad1
+
+lpad:                                             ; preds = %init
+  %4 = landingpad { i8*, i32 }
+          cleanup
+  %5 = extractvalue { i8*, i32 } %4, 0
+  %6 = extractvalue { i8*, i32 } %4, 1
+  tail call void @__cxa_guard_abort(i64* nonnull @_ZGVZ2amiE2au) #1
+  br label %eh.resume
+
+lpad1:                                            ; preds = %init.end
+  %7 = landingpad { i8*, i32 }
+          cleanup
+  %8 = extractvalue { i8*, i32 } %7, 0
+  %9 = extractvalue { i8*, i32 } %7, 1
+  tail call void @_ZdlPv(i8* nonnull %call.i)
+  br label %eh.resume
+
+eh.resume:                                        ; preds = %lpad1, %lpad
+  %exn.slot.0 = phi i8* [ %8, %lpad1 ], [ %5, %lpad ]
+  %ehselector.slot.0 = phi i32 [ %9, %lpad1 ], [ %6, %lpad ]
+  %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn.slot.0, 0 ; rebuild the { exception pointer, selector } pair from whichever landing pad ran
+  %lpad.val2 = insertvalue { i8*, i32 } %lpad.val, i32 %ehselector.slot.0, 1
+  resume { i8*, i32 } %lpad.val2
+
+unreachable:                                      ; preds = %init.end
+  unreachable
+}
+
+declare i32 @__cxa_guard_acquire(i64*)
+declare i32 @__gxx_personality_v0(...)
+declare void @__cxa_guard_abort(i64*)
+declare void @__cxa_guard_release(i64*)
+declare i8* @__cxa_allocate_exception(i32)
+declare void @__cxa_throw(i8*, i8*, i8*)
+declare noalias nonnull i8* @_Znwj(i32)
+declare i8* @__cxa_begin_catch(i8*)
+declare void @__cxa_end_catch()
+declare void @_ZdlPv(i8*)
diff --git a/test/CodeGen/X86/fast-isel-nontemporal.ll b/test/CodeGen/X86/fast-isel-nontemporal.ll
index db1ebfe6060..37e380b2b48 100644
--- a/test/CodeGen/X86/fast-isel-nontemporal.ll
+++ b/test/CodeGen/X86/fast-isel-nontemporal.ll
@@ -300,10 +300,20 @@ entry:
 }
 
 define <16 x i8> @test_load_nt16xi8(<16 x i8>* nocapture %ptr) {
-; SSE-LABEL: test_load_nt16xi8:
-; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movntdqa (%rdi), %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: test_load_nt16xi8:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4A-LABEL: test_load_nt16xi8:
+; SSE4A:       # %bb.0: # %entry
+; SSE4A-NEXT:    movdqa (%rdi), %xmm0
+; SSE4A-NEXT:    retq
+;
+; SSE41-LABEL: test_load_nt16xi8:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movntdqa (%rdi), %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_load_nt16xi8:
 ; AVX:       # %bb.0: # %entry
@@ -320,10 +330,20 @@ entry:
 }
 
 define <8 x i16> @test_load_nt8xi16(<8 x i16>* nocapture %ptr) {
-; SSE-LABEL: test_load_nt8xi16:
-; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movntdqa (%rdi), %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: test_load_nt8xi16:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4A-LABEL: test_load_nt8xi16:
+; SSE4A:       # %bb.0: # %entry
+; SSE4A-NEXT:    movdqa (%rdi), %xmm0
+; SSE4A-NEXT:    retq
+;
+; SSE41-LABEL: test_load_nt8xi16:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movntdqa (%rdi), %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_load_nt8xi16:
 ; AVX:       # %bb.0: # %entry
@@ -340,10 +360,20 @@ entry:
 }
 
 define <4 x i32> @test_load_nt4xi32(<4 x i32>* nocapture %ptr) {
-; SSE-LABEL: test_load_nt4xi32:
-; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movntdqa (%rdi), %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: test_load_nt4xi32:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4A-LABEL: test_load_nt4xi32:
+; SSE4A:       # %bb.0: # %entry
+; SSE4A-NEXT:    movdqa (%rdi), %xmm0
+; SSE4A-NEXT:    retq
+;
+; SSE41-LABEL: test_load_nt4xi32:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movntdqa (%rdi), %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_load_nt4xi32:
 ; AVX:       # %bb.0: # %entry
@@ -360,10 +390,20 @@ entry:
 }
 
 define <2 x i64> @test_load_nt2xi64(<2 x i64>* nocapture %ptr) {
-; SSE-LABEL: test_load_nt2xi64:
-; SSE:       # %bb.0: # %entry
-; SSE-NEXT:    movntdqa (%rdi), %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: test_load_nt2xi64:
+; SSE2:       # %bb.0: # %entry
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE4A-LABEL: test_load_nt2xi64:
+; SSE4A:       # %bb.0: # %entry
+; SSE4A-NEXT:    movdqa (%rdi), %xmm0
+; SSE4A-NEXT:    retq
+;
+; SSE41-LABEL: test_load_nt2xi64:
+; SSE41:       # %bb.0: # %entry
+; SSE41-NEXT:    movntdqa (%rdi), %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: test_load_nt2xi64:
 ; AVX:       # %bb.0: # %entry
diff --git a/test/CodeGen/X86/regalloc-copy-hints.mir b/test/CodeGen/X86/regalloc-copy-hints.mir
new file mode 100644
index 00000000000..6287066e64f
--- /dev/null
+++ b/test/CodeGen/X86/regalloc-copy-hints.mir
@@ -0,0 +1,805 @@
+# RUN: llc -mtriple=i386-unknown-unknown -mcpu=i486 %s -o - -run-pass greedy \
+# RUN:   -debug-only=regalloc 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+--- |
+  %0 = type { %1 }
+  %1 = type { %2, %23, %23*, %27*, %28*, %29, %33*, %34, %42, i8, i32, i32, i32 }
+  %2 = type { %3, %6, %14, %14, i8, i8*, i8*, %16 }
+  %3 = type { i32 (...)**, %4*, %5* }
+  %4 = type { i32 (...)**, %3* }
+  %5 = type { i32 (...)** }
+  %6 = type { %7 }
+  %7 = type { %8, i32, %12 }
+  %8 = type { %9**, %9**, %9**, %10 }
+  %9 = type { i32, i32, i32, i8* }
+  %10 = type { %11 }
+  %11 = type { %9** }
+  %12 = type { %13 }
+  %13 = type { i32 }
+  %14 = type { i32, %15* }
+  %15 = type { i32, i32, i8* }
+  %16 = type { %17 }
+  %17 = type { %18*, %20, %22 }
+  %18 = type { %19* }
+  %19 = type <{ %18, %19*, %18*, i8, [3 x i8] }>
+  %20 = type { %21 }
+  %21 = type { %18 }
+  %22 = type { %13 }
+  %23 = type { %24 }
+  %24 = type { %18*, %25, %26 }
+  %25 = type { %21 }
+  %26 = type { %13 }
+  %27 = type { i32 (...)** }
+  %28 = type { i32 (...)** }
+  %29 = type { %30 }
+  %30 = type { %18*, %31, %32 }
+  %31 = type { %21 }
+  %32 = type { %13 }
+  %33 = type { i32 (...)** }
+  %34 = type { %35 }
+  %35 = type { %36 }
+  %36 = type { %37, i32, %41 }
+  %37 = type { %38**, %38**, %38**, %39 }
+  %38 = type { %42, i32 }
+  %39 = type { %40 }
+  %40 = type { %38** }
+  %41 = type { %13 }
+  %42 = type { %43 }
+  %43 = type { %18*, %44, %45 }
+  %44 = type { %21 }
+  %45 = type { %13 }
+  %46 = type { %47, %48 }
+  %47 = type <{ %18, %19*, %18*, i8 }>
+  %48 = type { %49 }
+  %49 = type { i32, %50 }
+  %50 = type { { i32, i32 }, { i32, i32 }, { i32, i32 }, { i32, i32 }, { i32, i32 }, { i32, i32 } }
+  
+  define void @fun(%0* %arg) local_unnamed_addr #0 align 2 personality i32 (...)* @__gxx_personality_v0 {
+  bb:
+    %tmp = getelementptr inbounds %0, %0* %arg, i32 0, i32 0, i32 1
+    %tmp1 = getelementptr inbounds %0, %0* %arg, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0
+    br i1 undef, label %bb5, label %bb6
+  
+  bb5:                                              ; preds = %bb
+    unreachable
+  
+  bb6:                                              ; preds = %bb
+    %tmp8 = getelementptr inbounds %0, %0* %arg, i32 0, i32 0, i32 8, i32 0, i32 1, i32 0, i32 0
+    br i1 undef, label %bb10, label %bb9
+  
+  bb9:                                              ; preds = %bb6
+    unreachable
+  
+  bb10:                                             ; preds = %bb6
+    store %18* %tmp8, %18** undef
+    br i1 undef, label %bb14, label %bb13
+  
+  bb13:                                             ; preds = %bb10
+    unreachable
+  
+  bb14:                                             ; preds = %bb10
+    br i1 undef, label %bb17, label %bb18
+  
+  bb17:                                             ; preds = %bb14
+    unreachable
+  
+  bb18:                                             ; preds = %bb14
+    br i1 undef, label %bb20, label %bb19
+  
+  bb19:                                             ; preds = %bb18
+    unreachable
+  
+  bb20:                                             ; preds = %bb18
+    br i1 undef, label %bb25, label %bb24
+  
+  bb24:                                             ; preds = %bb20
+    unreachable
+  
+  bb25:                                             ; preds = %bb20
+    br i1 undef, label %bb29, label %bb30
+  
+  bb29:                                             ; preds = %bb25
+    unreachable
+  
+  bb30:                                             ; preds = %bb25
+    br i1 undef, label %bb38, label %bb31
+  
+  bb31:                                             ; preds = %bb30
+    %tmp32 = getelementptr inbounds %0, %0* %arg, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0
+    br i1 undef, label %bb34, label %bb35
+  
+  bb34:                                             ; preds = %bb31
+    unreachable
+  
+  bb35:                                             ; preds = %bb31
+    br i1 undef, label %bb40, label %bb36
+  
+  bb36:                                             ; preds = %bb35
+    unreachable
+  
+  bb38:                                             ; preds = %bb30
+    %tmp391 = bitcast %18* %tmp1 to %19**
+    br label %bb40
+  
+  bb40:                                             ; preds = %bb35, %bb38
+    %tmp41 = phi %18* [ %tmp1, %bb38 ], [ null, %bb35 ]
+    %tmp42 = phi %19** [ %tmp391, %bb38 ], [ %tmp32, %bb35 ]
+    br i1 undef, label %bb43, label %bb48
+  
+  bb43:                                             ; preds = %bb40
+    %tmp44 = tail call i8* @_Znwj()
+    store %18* %tmp41, %18** undef
+    %tmp46 = bitcast %19** %tmp42 to i8**
+    store i8* %tmp44, i8** %tmp46
+    %0 = bitcast i8* %tmp44 to %46*
+    tail call void @_ZNSt3__127__tree_balance_after_insertIPNS_16__tree_node_baseIPvEEEEvT_S5_()
+    br label %bb48
+  
+  bb48:                                             ; preds = %bb43, %bb40
+    %tmp49 = phi %46* [ %0, %bb43 ], [ undef, %bb40 ]
+    %tmp50 = getelementptr inbounds %46, %46* %tmp49, i32 0, i32 1, i32 0, i32 1, i32 4, i32 0
+    store i32 ptrtoint (i1 (%0*)* @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private15_preEnd__authorEv to i32), i32* %tmp50
+    br i1 undef, label %bb52, label %bb53
+  
+  bb52:                                             ; preds = %bb48
+    unreachable
+  
+  bb53:                                             ; preds = %bb48
+    br i1 undef, label %bb55, label %bb54
+  
+  bb54:                                             ; preds = %bb53
+    unreachable
+  
+  bb55:                                             ; preds = %bb53
+    br i1 undef, label %bb59, label %bb58
+  
+  bb58:                                             ; preds = %bb55
+    unreachable
+  
+  bb59:                                             ; preds = %bb55
+    br i1 undef, label %bb62, label %bb61
+  
+  bb61:                                             ; preds = %bb59
+    unreachable
+  
+  bb62:                                             ; preds = %bb59
+    br i1 undef, label %bb64, label %bb65
+  
+  bb64:                                             ; preds = %bb62
+    unreachable
+  
+  bb65:                                             ; preds = %bb62
+    %tmp66 = icmp eq %46* null, null
+    br i1 %tmp66, label %bb72, label %bb67
+  
+  bb67:                                             ; preds = %bb65
+    %tmp68 = getelementptr inbounds %0, %0* %arg, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0
+    br i1 undef, label %bb70, label %bb74
+  
+  bb70:                                             ; preds = %bb67
+    unreachable
+  
+  bb72:                                             ; preds = %bb65
+    %tmp732 = bitcast %18* %tmp1 to %19**
+    br label %bb74
+  
+  bb74:                                             ; preds = %bb67, %bb72
+    %tmp75 = phi %18* [ %tmp1, %bb72 ], [ null, %bb67 ]
+    %tmp76 = phi %19** [ %tmp732, %bb72 ], [ %tmp68, %bb67 ]
+    %tmp77 = tail call i8* @_Znwj()
+    store %18* %tmp75, %18** undef
+    %tmp79 = bitcast %19** %tmp76 to i8**
+    store i8* %tmp77, i8** %tmp79
+    %1 = bitcast i8* %tmp77 to %46*
+    tail call void @_ZNSt3__127__tree_balance_after_insertIPNS_16__tree_node_baseIPvEEEEvT_S5_()
+    %tmp81 = getelementptr inbounds %46, %46* %1, i32 0, i32 1, i32 0, i32 1, i32 2, i32 0
+    store i32 ptrtoint (i1 (%0*)* @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private14_end__commentsEv to i32), i32* %tmp81
+    store %18* %tmp8, %18** undef
+    %2 = bitcast %0* %arg to i8*
+    %sunkaddr = getelementptr i8, i8* %2, i32 140
+    %3 = bitcast i8* %sunkaddr to %18**
+    %tmp85 = load %18*, %18** %3
+    %tmp864 = bitcast %18* %tmp85 to %19**
+    %tmp87 = load %19*, %19** %tmp864
+    %tmp88 = icmp eq %19* %tmp87, null
+    br i1 %tmp88, label %bb90, label %bb89
+  
+  bb89:                                             ; preds = %bb74
+    unreachable
+  
+  bb90:                                             ; preds = %bb74
+    br i1 undef, label %bb94, label %bb92
+  
+  bb92:                                             ; preds = %bb90
+    br i1 undef, label %bb96, label %bb97
+  
+  bb94:                                             ; preds = %bb90
+    unreachable
+  
+  bb96:                                             ; preds = %bb92
+    unreachable
+  
+  bb97:                                             ; preds = %bb92
+    br i1 undef, label %bb101, label %bb102
+  
+  bb101:                                            ; preds = %bb97
+    unreachable
+  
+  bb102:                                            ; preds = %bb97
+    br i1 undef, label %bb104, label %bb103
+  
+  bb103:                                            ; preds = %bb102
+    unreachable
+  
+  bb104:                                            ; preds = %bb102
+    br i1 undef, label %bb109, label %bb108
+  
+  bb108:                                            ; preds = %bb104
+    unreachable
+  
+  bb109:                                            ; preds = %bb104
+    br i1 undef, label %bb111, label %bb112
+  
+  bb111:                                            ; preds = %bb109
+    unreachable
+  
+  bb112:                                            ; preds = %bb109
+    br i1 undef, label %bb118, label %bb117
+  
+  bb117:                                            ; preds = %bb112
+    unreachable
+  
+  bb118:                                            ; preds = %bb112
+    br i1 undef, label %bb120, label %bb121
+  
+  bb120:                                            ; preds = %bb118
+    unreachable
+  
+  bb121:                                            ; preds = %bb118
+    br i1 undef, label %bb124, label %bb125
+  
+  bb124:                                            ; preds = %bb121
+    unreachable
+  
+  bb125:                                            ; preds = %bb121
+    %4 = bitcast %18* %tmp1 to %46**
+    %tmp126 = load %46*, %46** %4
+    %tmp127 = icmp eq %46* %tmp126, null
+    br i1 %tmp127, label %bb135, label %bb128
+  
+  bb128:                                            ; preds = %bb125
+    br label %bb129
+  
+  bb129:                                            ; preds = %bb131, %bb128
+    %tmp130 = icmp ugt i32 undef, 95406324
+    br i1 %tmp130, label %bb131, label %bb133
+  
+  bb131:                                            ; preds = %bb129
+    br label %bb129
+  
+  bb133:                                            ; preds = %bb129
+    unreachable
+  
+  bb135:                                            ; preds = %bb125
+    br i1 undef, label %bb137, label %bb138
+  
+  bb137:                                            ; preds = %bb135
+    unreachable
+  
+  bb138:                                            ; preds = %bb135
+    unreachable
+  }
+  
+  declare zeroext i1 @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private15_preEnd__authorEv(%0*) #0
+  
+  declare zeroext i1 @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private14_end__commentsEv(%0*) #0 align 2
+  
+  declare i32 @__gxx_personality_v0(...) #0
+  
+  declare noalias nonnull i8* @_Znwj() local_unnamed_addr #0
+  
+  declare void @_ZNSt3__127__tree_balance_after_insertIPNS_16__tree_node_baseIPvEEEEvT_S5_() local_unnamed_addr #0
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #1
+  
+  attributes #0 = { "target-cpu"="i486" }
+  attributes #1 = { nounwind }
+
+...
+---
+# A physreg should be hinted at most once per getRegAllocationHints() query.
+# CHECK: hints: $ebx $edi
+# CHECK-NOT: hints: $ebx $edi $ebx $edi
+name:            fun
+alignment:       4
+tracksRegLiveness: true
+registers:       
+  - { id: 0, class: gr32 }
+  - { id: 1, class: gr32 }
+  - { id: 2, class: gr32 }
+  - { id: 3, class: gr32 }
+  - { id: 4, class: gr32 }
+  - { id: 5, class: gr32 }
+  - { id: 6, class: gr32 }
+  - { id: 7, class: gr32 }
+  - { id: 8, class: gr32 }
+  - { id: 9, class: gr32 }
+  - { id: 10, class: gr32 }
+  - { id: 11, class: gr32 }
+  - { id: 12, class: gr32 }
+  - { id: 13, class: gr32_abcd }
+  - { id: 14, class: gr8 }
+  - { id: 15, class: gr32_abcd }
+  - { id: 16, class: gr8 }
+  - { id: 17, class: gr32 }
+  - { id: 18, class: gr32_abcd }
+  - { id: 19, class: gr8 }
+  - { id: 20, class: gr32_abcd }
+  - { id: 21, class: gr8 }
+  - { id: 22, class: gr32_abcd }
+  - { id: 23, class: gr8 }
+  - { id: 24, class: gr32_abcd }
+  - { id: 25, class: gr8 }
+  - { id: 26, class: gr32_abcd }
+  - { id: 27, class: gr8 }
+  - { id: 28, class: gr32_abcd }
+  - { id: 29, class: gr8 }
+  - { id: 30, class: gr32_abcd }
+  - { id: 31, class: gr8 }
+  - { id: 32, class: gr32_abcd }
+  - { id: 33, class: gr8 }
+  - { id: 34, class: gr32 }
+  - { id: 35, class: gr32_abcd }
+  - { id: 36, class: gr8 }
+  - { id: 37, class: gr32 }
+  - { id: 38, class: gr32 }
+  - { id: 39, class: gr32_abcd }
+  - { id: 40, class: gr8 }
+  - { id: 41, class: gr32_abcd }
+  - { id: 42, class: gr8 }
+  - { id: 43, class: gr32_abcd }
+  - { id: 44, class: gr8 }
+  - { id: 45, class: gr32_abcd }
+  - { id: 46, class: gr8 }
+  - { id: 47, class: gr32_abcd }
+  - { id: 48, class: gr8 }
+  - { id: 49, class: gr8 }
+  - { id: 50, class: gr32_abcd }
+  - { id: 51, class: gr8 }
+  - { id: 52, class: gr32 }
+  - { id: 53, class: gr32 }
+  - { id: 54, class: gr32 }
+  - { id: 55, class: gr32 }
+  - { id: 56, class: gr32_abcd }
+  - { id: 57, class: gr8 }
+  - { id: 58, class: gr32_abcd }
+  - { id: 59, class: gr8 }
+  - { id: 60, class: gr32_abcd }
+  - { id: 61, class: gr8 }
+  - { id: 62, class: gr32_abcd }
+  - { id: 63, class: gr8 }
+  - { id: 64, class: gr32_abcd }
+  - { id: 65, class: gr8 }
+  - { id: 66, class: gr32_abcd }
+  - { id: 67, class: gr8 }
+  - { id: 68, class: gr32_abcd }
+  - { id: 69, class: gr8 }
+  - { id: 70, class: gr32_abcd }
+  - { id: 71, class: gr8 }
+  - { id: 72, class: gr32_abcd }
+  - { id: 73, class: gr8 }
+  - { id: 74, class: gr32 }
+  - { id: 75, class: gr32 }
+  - { id: 76, class: gr32_abcd }
+  - { id: 77, class: gr8 }
+  - { id: 78, class: gr32_abcd }
+  - { id: 79, class: gr32 }
+  - { id: 80, class: gr32 }
+  - { id: 81, class: gr32_abcd }
+  - { id: 82, class: gr32 }
+frameInfo:       
+  maxAlignment:    4
+  hasCalls:        true
+fixedStack:      
+  - { id: 0, size: 4, alignment: 4, stack-id: 0, isImmutable: true }
+body:             |
+  bb.0.bb:
+    successors: %bb.1(0x00000001), %bb.2(0x7fffffff)
+  
+    %13:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %13.sub_8bit, %13.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.2, implicit killed $eflags
+    JMP_1 %bb.1
+  
+  bb.1.bb5:
+    successors: 
+  
+  
+  bb.2.bb6:
+    successors: %bb.4(0x7fffffff), %bb.3(0x00000001)
+  
+    %15:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %15.sub_8bit, %15.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.4, implicit killed $eflags
+    JMP_1 %bb.3
+  
+  bb.3.bb9:
+    successors: 
+  
+  
+  bb.4.bb10:
+    successors: %bb.6(0x7fffffff), %bb.5(0x00000001)
+  
+    %12:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.0)
+    %1:gr32 = LEA32r %12, 1, $noreg, 144, $noreg
+    MOV32mr undef %17:gr32, 1, $noreg, 0, $noreg, %1 :: (store 4 into `%18** undef`)
+    %18:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %18.sub_8bit, %18.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.6, implicit killed $eflags
+    JMP_1 %bb.5
+  
+  bb.5.bb13:
+    successors: 
+  
+  
+  bb.6.bb14:
+    successors: %bb.7(0x00000001), %bb.8(0x7fffffff)
+  
+    %20:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %20.sub_8bit, %20.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.8, implicit killed $eflags
+    JMP_1 %bb.7
+  
+  bb.7.bb17:
+    successors: 
+  
+  
+  bb.8.bb18:
+    successors: %bb.10(0x7fffffff), %bb.9(0x00000001)
+  
+    %22:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %22.sub_8bit, %22.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.10, implicit killed $eflags
+    JMP_1 %bb.9
+  
+  bb.9.bb19:
+    successors: 
+  
+  
+  bb.10.bb20:
+    successors: %bb.12(0x7fffffff), %bb.11(0x00000001)
+  
+    %24:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %24.sub_8bit, %24.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.12, implicit killed $eflags
+    JMP_1 %bb.11
+  
+  bb.11.bb24:
+    successors: 
+  
+  
+  bb.12.bb25:
+    successors: %bb.13(0x00000001), %bb.14(0x7fffffff)
+  
+    %26:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %26.sub_8bit, %26.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.14, implicit killed $eflags
+    JMP_1 %bb.13
+  
+  bb.13.bb29:
+    successors: 
+  
+  
+  bb.14.bb30:
+    %0:gr32 = LEA32r %12, 1, $noreg, 80, $noreg
+    %28:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %28.sub_8bit, %28.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.20, implicit killed $eflags
+    JMP_1 %bb.15
+  
+  bb.15.bb31:
+    successors: %bb.16(0x00000001), %bb.17(0x7fffffff)
+  
+    %78:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %78.sub_8bit, %78.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.17, implicit killed $eflags
+    JMP_1 %bb.16
+  
+  bb.16.bb34:
+    successors: 
+  
+  
+  bb.17.bb35:
+    successors: %bb.18(0x7fffffff), %bb.19(0x00000001)
+  
+    TEST8rr %78.sub_8bit, %78.sub_8bit, implicit-def $eflags
+    JE_1 %bb.19, implicit killed $eflags
+  
+  bb.18:
+    %79:gr32 = LEA32r %12, 1, $noreg, 80, $noreg
+    JMP_1 %bb.21
+  
+  bb.19.bb36:
+    successors: 
+  
+  
+  bb.20.bb38:
+    %78:gr32_abcd = COPY %0
+    %79:gr32 = COPY %0
+  
+  bb.21.bb40:
+    successors: %bb.22, %bb.23
+  
+    %35:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %35.sub_8bit, %35.sub_8bit, implicit-def $eflags
+    %80:gr32 = IMPLICIT_DEF
+    JNE_1 %bb.23, implicit killed $eflags
+    JMP_1 %bb.22
+  
+  bb.22.bb43:
+    ADJCALLSTACKDOWN32 0, 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
+    CALLpcrel32 @_Znwj, csr_32, implicit $esp, implicit $ssp, implicit-def $esp, implicit-def $ssp, implicit-def $eax
+    ADJCALLSTACKUP32 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
+    %80:gr32 = COPY killed $eax
+    MOV32mr undef %38:gr32, 1, $noreg, 0, $noreg, %78 :: (store 4 into `%18** undef`)
+    MOV32mr %79, 1, $noreg, 0, $noreg, %80 :: (store 4 into %ir.tmp46)
+    ADJCALLSTACKDOWN32 0, 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
+    CALLpcrel32 @_ZNSt3__127__tree_balance_after_insertIPNS_16__tree_node_baseIPvEEEEvT_S5_, csr_32, implicit $esp, implicit $ssp, implicit-def $esp, implicit-def $ssp
+    ADJCALLSTACKUP32 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
+  
+  bb.23.bb48:
+    successors: %bb.24(0x00000001), %bb.25(0x7fffffff)
+  
+    MOV32mi %80, 1, $noreg, 52, $noreg, @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private15_preEnd__authorEv :: (store 4 into %ir.tmp50)
+    %39:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %39.sub_8bit, %39.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.25, implicit killed $eflags
+    JMP_1 %bb.24
+  
+  bb.24.bb52:
+    successors: 
+  
+  
+  bb.25.bb53:
+    successors: %bb.27(0x7fffffff), %bb.26(0x00000001)
+  
+    %41:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %41.sub_8bit, %41.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.27, implicit killed $eflags
+    JMP_1 %bb.26
+  
+  bb.26.bb54:
+    successors: 
+  
+  
+  bb.27.bb55:
+    successors: %bb.29(0x7fffffff), %bb.28(0x00000001)
+  
+    %43:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %43.sub_8bit, %43.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.29, implicit killed $eflags
+    JMP_1 %bb.28
+  
+  bb.28.bb58:
+    successors: 
+  
+  
+  bb.29.bb59:
+    successors: %bb.31(0x7fffffff), %bb.30(0x00000001)
+  
+    %45:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %45.sub_8bit, %45.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.31, implicit killed $eflags
+    JMP_1 %bb.30
+  
+  bb.30.bb61:
+    successors: 
+  
+  
+  bb.31.bb62:
+    successors: %bb.32(0x00000001), %bb.33(0x7fffffff)
+  
+    %47:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %47.sub_8bit, %47.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.33, implicit killed $eflags
+    JMP_1 %bb.32
+  
+  bb.32.bb64:
+    successors: 
+  
+  
+  bb.33.bb65:
+    successors: %bb.37(0x30000000), %bb.34(0x50000000)
+  
+    %49:gr8 = MOV8ri 1
+    TEST8rr %49, %49, implicit-def $eflags
+    JNE_1 %bb.37, implicit killed $eflags
+    JMP_1 %bb.34
+  
+  bb.34.bb67:
+    successors: %bb.36(0x00000001), %bb.35(0x7fffffff)
+  
+    %81:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %81.sub_8bit, %81.sub_8bit, implicit-def $eflags
+    JE_1 %bb.36, implicit killed $eflags
+  
+  bb.35:
+    %82:gr32 = LEA32r %12, 1, $noreg, 80, $noreg
+    JMP_1 %bb.38
+  
+  bb.36.bb70:
+    successors: 
+  
+  
+  bb.37.bb72:
+    %81:gr32_abcd = COPY %0
+    %82:gr32 = COPY %0
+  
+  bb.38.bb74:
+    successors: %bb.40(0x7fffffff), %bb.39(0x00000001)
+  
+    ADJCALLSTACKDOWN32 0, 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
+    CALLpcrel32 @_Znwj, csr_32, implicit $esp, implicit $ssp, implicit-def $esp, implicit-def $ssp, implicit-def $eax
+    ADJCALLSTACKUP32 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
+    %52:gr32 = COPY killed $eax
+    MOV32mr undef %53:gr32, 1, $noreg, 0, $noreg, %81 :: (store 4 into `%18** undef`)
+    MOV32mr %82, 1, $noreg, 0, $noreg, %52 :: (store 4 into %ir.tmp79)
+    ADJCALLSTACKDOWN32 0, 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
+    CALLpcrel32 @_ZNSt3__127__tree_balance_after_insertIPNS_16__tree_node_baseIPvEEEEvT_S5_, csr_32, implicit $esp, implicit $ssp, implicit-def $esp, implicit-def $ssp
+    ADJCALLSTACKUP32 0, 0, implicit-def dead $esp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $esp, implicit $ssp
+    MOV32mi %52, 1, $noreg, 36, $noreg, @_ZN15COLLADASaxFWL1429ColladaParserAutoGen14Private14_end__commentsEv :: (store 4 into %ir.tmp81)
+    MOV32mr undef %54:gr32, 1, $noreg, 0, $noreg, %1 :: (store 4 into `%18** undef`)
+    %55:gr32 = MOV32rm %12, 1, $noreg, 140, $noreg :: (load 4 from %ir.3)
+    CMP32mi8 %55, 1, $noreg, 0, $noreg, 0, implicit-def $eflags :: (load 4 from %ir.tmp864)
+    JE_1 %bb.40, implicit killed $eflags
+    JMP_1 %bb.39
+  
+  bb.39.bb89:
+    successors: 
+  
+  
+  bb.40.bb90:
+    successors: %bb.42(0x00000001), %bb.41(0x7fffffff)
+  
+    %56:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %56.sub_8bit, %56.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.42, implicit killed $eflags
+    JMP_1 %bb.41
+  
+  bb.41.bb92:
+    successors: %bb.43(0x00000001), %bb.44(0x7fffffff)
+  
+    %58:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %58.sub_8bit, %58.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.43, implicit killed $eflags
+    JMP_1 %bb.44
+  
+  bb.42.bb94:
+    successors: 
+  
+  
+  bb.43.bb96:
+    successors: 
+  
+  
+  bb.44.bb97:
+    successors: %bb.45(0x00000001), %bb.46(0x7fffffff)
+  
+    %60:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %60.sub_8bit, %60.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.46, implicit killed $eflags
+    JMP_1 %bb.45
+  
+  bb.45.bb101:
+    successors: 
+  
+  
+  bb.46.bb102:
+    successors: %bb.48(0x7fffffff), %bb.47(0x00000001)
+  
+    %62:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %62.sub_8bit, %62.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.48, implicit killed $eflags
+    JMP_1 %bb.47
+  
+  bb.47.bb103:
+    successors: 
+  
+  
+  bb.48.bb104:
+    successors: %bb.50(0x7fffffff), %bb.49(0x00000001)
+  
+    %64:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %64.sub_8bit, %64.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.50, implicit killed $eflags
+    JMP_1 %bb.49
+  
+  bb.49.bb108:
+    successors: 
+  
+  
+  bb.50.bb109:
+    successors: %bb.51(0x00000001), %bb.52(0x7fffffff)
+  
+    %66:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %66.sub_8bit, %66.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.52, implicit killed $eflags
+    JMP_1 %bb.51
+  
+  bb.51.bb111:
+    successors: 
+  
+  
+  bb.52.bb112:
+    successors: %bb.54(0x7fffffff), %bb.53(0x00000001)
+  
+    %68:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %68.sub_8bit, %68.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.54, implicit killed $eflags
+    JMP_1 %bb.53
+  
+  bb.53.bb117:
+    successors: 
+  
+  
+  bb.54.bb118:
+    successors: %bb.55(0x00000001), %bb.56(0x7fffffff)
+  
+    %70:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %70.sub_8bit, %70.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.56, implicit killed $eflags
+    JMP_1 %bb.55
+  
+  bb.55.bb120:
+    successors: 
+  
+  
+  bb.56.bb121:
+    successors: %bb.57(0x00000001), %bb.58(0x7fffffff)
+  
+    %72:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %72.sub_8bit, %72.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.58, implicit killed $eflags
+    JMP_1 %bb.57
+  
+  bb.57.bb124:
+    successors: 
+  
+  
+  bb.58.bb125:
+    successors: %bb.62(0x00000001), %bb.59(0x7fffffff)
+  
+    CMP32mi8 %0, 1, $noreg, 0, $noreg, 0, implicit-def $eflags :: (load 4 from %ir.4)
+    JE_1 %bb.62, implicit killed $eflags
+    JMP_1 %bb.59
+  
+  bb.59.bb128:
+  
+  bb.60.bb129:
+    successors: %bb.60(0x7fffffff), %bb.61(0x00000001)
+  
+    CMP32ri undef %75:gr32, 95406325, implicit-def $eflags
+    JB_1 %bb.61, implicit killed $eflags
+    JMP_1 %bb.60
+  
+  bb.61.bb133:
+    successors: 
+  
+  
+  bb.62.bb135:
+    successors: %bb.63, %bb.64
+  
+    %76:gr32_abcd = MOV32r0 implicit-def dead $eflags
+    TEST8rr %76.sub_8bit, %76.sub_8bit, implicit-def $eflags
+    JNE_1 %bb.64, implicit killed $eflags
+    JMP_1 %bb.63
+  
+  bb.63.bb137:
+    successors: 
+  
+  
+  bb.64.bb138:
+
+...
diff --git a/test/MC/PowerPC/ppc64-localentry-symbols.s b/test/MC/PowerPC/ppc64-localentry-symbols.s
new file mode 100644
index 00000000000..f1d5c5d0ab1
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-localentry-symbols.s
@@ -0,0 +1,34 @@
+# RUN: llvm-mc -filetype=obj -triple=powerpc64-unknown-freebsd13.0 %s -o %t
+# RUN: llvm-objdump -t %t | FileCheck %s
+
+# CHECK: 0000000000000000 gw    F .text  00000000 0x60 __impl_foo
+# CHECK: 0000000000000000 g     F .text  00000000 0x60 foo
+# CHECK: 0000000000000000 gw    F .text  00000000 0x60 foo@FBSD_1.1
+# CHECK: 0000000000000008 g     F .text  00000000 0x60 func
+# CHECK: 0000000000000008 gw    F .text  00000000 0x60 weak_func
+
+.text
+.abiversion 2
+
+.globl foo
+.type foo,@function
+foo:
+  nop
+  nop
+  .localentry foo, 8
+
+.symver __impl_foo, foo@FBSD_1.1
+.weak   __impl_foo
+.set    __impl_foo, foo
+
+.globl  func
+# Mimic FreeBSD weak function/reference
+.weak   weak_func
+.equ    weak_func, func
+
+.p2align 2
+.type    func,@function
+func:
+  nop
+  nop
+  .localentry func, 8
diff --git a/test/MC/WebAssembly/null-output.s b/test/MC/WebAssembly/null-output.s
new file mode 100644
index 00000000000..a25d095e0cb
--- /dev/null
+++ b/test/MC/WebAssembly/null-output.s
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj -o /dev/null < %s
+
+    .text
+    .section .text.main,"",@
+    .type    main,@function
+main:
+    .functype   main (i32, i32) -> (i32)
+    end_function
+.Lfunc_end0:
+    .size main, .Lfunc_end0-main
diff --git a/test/tools/llvm-dlltool/coff-weak-exports.def b/test/tools/llvm-dlltool/coff-weak-exports.def
index dbc59be8ae1..60f835233a5 100644
--- a/test/tools/llvm-dlltool/coff-weak-exports.def
+++ b/test/tools/llvm-dlltool/coff-weak-exports.def
@@ -1,5 +1,6 @@
 ; RUN: llvm-dlltool -m i386:x86-64 --input-def %s --output-lib %t.a
 ; RUN: llvm-nm %t.a | FileCheck %s
+; RUN: llvm-readobj %t.a | FileCheck -check-prefix=ARCH %s
 
 LIBRARY test.dll
 EXPORTS
@@ -26,3 +27,5 @@ ImpLibName3 = kernel32.Sleep
 ; CHECK-NEXT: W __imp_ImpLibName2
 ; CHECK:      T ImpLibName3
 ; CHECK-NEXT: T __imp_ImpLibName3
+
+; ARCH-NOT: unknown arch
diff --git a/test/tools/llvm-objdump/AMDGPU/source-lines.ll b/test/tools/llvm-objdump/AMDGPU/source-lines.ll
index 748f04754e4..4a4203d2a52 100644
--- a/test/tools/llvm-objdump/AMDGPU/source-lines.ll
+++ b/test/tools/llvm-objdump/AMDGPU/source-lines.ll
@@ -12,7 +12,7 @@
 ; LINE: v_mov_b32_e32 v{{[0-9]+}}, 0x888
 ; LINE: ; {{.*}}source-lines.cl:3
 ; LINE: ; {{.*}}source-lines.cl:4
-; LINE: v_add_u32_e32
+; LINE: v_add_u32_e64
 ; LINE: ; {{.*}}source-lines.cl:5
 ; LINE: flat_store_dword
 ; Epilogue.
@@ -28,7 +28,7 @@
 ; SOURCE: v_mov_b32_e32 v{{[0-9]+}}, 0x888
 ; SOURCE: ; int var1 = 0x888;
 ; SOURCE: ; int var2 = var0 + var1;
-; SOURCE: v_add_u32_e32
+; SOURCE: v_add_u32_e64
 ; SOURCE: ; *Out = var2;
 ; SOURCE: flat_store_dword
 ; Epilogue.
diff --git a/test/tools/llvm-objdump/PowerPC/branch-offset.s b/test/tools/llvm-objdump/PowerPC/branch-offset.s
new file mode 100644
index 00000000000..b0b3f05f9cd
--- /dev/null
+++ b/test/tools/llvm-objdump/PowerPC/branch-offset.s
@@ -0,0 +1,43 @@
+# RUN: llvm-mc -triple=powerpc64le-unknown-linux -filetype=obj %s -o %t.o
+# RUN: llvm-objdump -d %t.o | FileCheck %s
+
+# RUN: llvm-mc -triple=powerpc64-unknown-linux -filetype=obj %s -o %t.o
+# RUN: llvm-objdump -d %t.o | FileCheck %s
+
+# RUN: llvm-mc -triple=powerpc-unknown-linux -filetype=obj %s -o %t.o
+# RUN: llvm-objdump -d %t.o | FileCheck %s
+
+# CHECK: 0000000000000000 callee_back:
+# CHECK: 18: {{.*}} bl .-24
+# CHECK: 20: {{.*}} bl .+16
+# CHECK: 0000000000000030 callee_forward:
+
+        .text
+        .global caller
+        .type caller,@function
+        .type callee_forward,@function
+        .type callee_back,@function
+
+        .p2align 4
+callee_back:
+        li 3, 55
+        blr
+
+        .p2align 4
+caller:
+.Lgep:
+        addis 2, 12, .TOC.-.Lgep@ha
+        addi 2, 2, .TOC.-.Lgep@l
+.Llep:
+        .localentry caller, .Llep-.Lgep
+        bl callee_back
+        mr 31, 3
+        bl callee_forward
+        add 3, 3, 31
+        blr
+
+        .p2align 4
+callee_forward:
+        li 3, 66
+        blr
+
diff --git a/test/tools/llvm-objdump/PowerPC/lit.local.cfg b/test/tools/llvm-objdump/PowerPC/lit.local.cfg
new file mode 100644
index 00000000000..b77510721e1
--- /dev/null
+++ b/test/tools/llvm-objdump/PowerPC/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'PowerPC' in config.root.targets:
+      config.unsupported = True
diff --git a/test/tools/llvm-objdump/eh_frame-coff.test b/test/tools/llvm-objdump/eh_frame-coff.test
index 74eceeec4f3..cb481939dcc 100644
--- a/test/tools/llvm-objdump/eh_frame-coff.test
+++ b/test/tools/llvm-objdump/eh_frame-coff.test
@@ -8,7 +8,7 @@
 # CHECK:   Code alignment factor: 1
 # CHECK:   Data alignment factor: -4
 # CHECK:   Return address column: 8
-# CHECK:   Personality Address: 004025d7
+# CHECK:   Personality Address: 00000000004025d7
 # CHECK:   Augmentation data:     00 D7 25 40 00 00 00
 
 # CHECK:   DW_CFA_def_cfa: reg4 +4
@@ -17,7 +17,7 @@
 # CHECK:   DW_CFA_nop:
 
 # CHECK: 00000020 0000001c 00000024 FDE cie=00000024 pc=00401410...00401488
-# CHECK:   LSDA Address: 00406000
+# CHECK:   LSDA Address: 0000000000406000
 # CHECK:   DW_CFA_advance_loc: 1
 # CHECK:   DW_CFA_def_cfa_offset: +8
 # CHECK:   DW_CFA_offset: reg5 -8
diff --git a/test/tools/llvm-objdump/elf-symbol-visibility.test b/test/tools/llvm-objdump/elf-symbol-visibility.test
new file mode 100644
index 00000000000..da7f6d28516
--- /dev/null
+++ b/test/tools/llvm-objdump/elf-symbol-visibility.test
@@ -0,0 +1,37 @@
+# RUN: yaml2obj %s -o %t
+# RUN: llvm-objdump --syms %t | FileCheck %s
+
+# CHECK:      SYMBOL TABLE:
+# CHECK-NEXT: .text  00000000 default
+# CHECK-NEXT: .text  00000000 .internal internal
+# CHECK-NEXT: .text  00000000 .hidden hidden
+# CHECK-NEXT: .text  00000000 .protected protected
+# CHECK-NEXT: .text  00000000 0x20 mips_pic
+
+!ELF
+FileHeader:
+  Class:   ELFCLASS32
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_MIPS
+  Flags:   [ EF_MIPS_ABI_O32, EF_MIPS_ARCH_32 ]
+Sections:
+  - Name: .text
+    Type: SHT_PROGBITS
+    Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+Symbols:
+  Local:
+    - Name:    default
+      Section: .text
+    - Name:    internal
+      Visibility: STV_INTERNAL
+      Section: .text
+    - Name:    hidden
+      Visibility: STV_HIDDEN
+      Section: .text
+    - Name:    protected
+      Visibility: STV_PROTECTED
+      Section: .text
+    - Name:    mips_pic
+      Other:   [ STO_MIPS_PIC ]
+      Section: .text
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index ba8d3c5b8d5..9bd4528ef7f 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -2087,20 +2087,38 @@ void llvm::printSymbolTable(const ObjectFile *O, StringRef ArchiveName,
       outs() << SectionName;
     }
 
-    outs() << '\t';
     if (Common || isa<ELFObjectFileBase>(O)) {
       uint64_t Val =
           Common ? Symbol.getAlignment() : ELFSymbolRef(Symbol).getSize();
-      outs() << format("\t %08" PRIx64 " ", Val);
+      outs() << format("\t%08" PRIx64, Val);
     }
 
-    if (Hidden)
-      outs() << ".hidden ";
+    if (isa<ELFObjectFileBase>(O)) {
+      uint8_t Other = ELFSymbolRef(Symbol).getOther();
+      switch (Other) {
+      case ELF::STV_DEFAULT:
+        break;
+      case ELF::STV_INTERNAL:
+        outs() << " .internal";
+        break;
+      case ELF::STV_HIDDEN:
+        outs() << " .hidden";
+        break;
+      case ELF::STV_PROTECTED:
+        outs() << " .protected";
+        break;
+      default:
+        outs() << format(" 0x%02x", Other);
+        break;
+      }
+    } else if (Hidden) {
+      outs() << " .hidden";
+    }
 
     if (Demangle)
-      outs() << demangle(Name) << '\n';
+      outs() << ' ' << demangle(Name) << '\n';
     else
-      outs() << Name << '\n';
+      outs() << ' ' << Name << '\n';
   }
 }
 
diff --git a/utils/git-svn/git-llvm b/utils/git-svn/git-llvm
index 53c0b24ae2c..bf234301f13 100755
--- a/utils/git-svn/git-llvm
+++ b/utils/git-svn/git-llvm
@@ -42,7 +42,7 @@ else:
 
 # It's *almost* a straightforward mapping from the monorepo to svn...
 GIT_TO_SVN_DIR = {
-    d: (d + '/trunk')
+    d: (d + '/branches/release_80')
     for d in [
         'clang-tools-extra',
         'compiler-rt',
@@ -63,8 +63,8 @@ GIT_TO_SVN_DIR = {
         'pstl',
     ]
 }
-GIT_TO_SVN_DIR.update({'clang': 'cfe/trunk'})
-GIT_TO_SVN_DIR.update({'': 'monorepo-root/trunk'})
+GIT_TO_SVN_DIR.update({'clang': 'cfe/branches/release_80'})
+GIT_TO_SVN_DIR.update({'': 'monorepo-root/branches/release_80'})
 
 VERBOSE = False
 QUIET = False
diff --git a/utils/lit/lit/__init__.py b/utils/lit/lit/__init__.py
index 0d849d16f07..023bed81707 100644
--- a/utils/lit/lit/__init__.py
+++ b/utils/lit/lit/__init__.py
@@ -2,7 +2,7 @@
 
 __author__ = 'Daniel Dunbar'
 __email__ = 'daniel@minormatter.com'
-__versioninfo__ = (0, 8, 0)
+__versioninfo__ = (0, 8, 1)
 __version__ = '.'.join(str(v) for v in __versioninfo__) + 'dev'
 
 __all__ = []
diff --git a/utils/release/merge-request.sh b/utils/release/merge-request.sh
index 333b9043af3..c9763ee3cec 100755
--- a/utils/release/merge-request.sh
+++ b/utils/release/merge-request.sh
@@ -101,6 +101,9 @@ case $stable_version in
   7.0)
     release_metabug="39106"
     ;;
+  8.0)
+    release_metabug="41221"
+    ;;
   *)
     echo "error: invalid stable version"
     exit 1

From 70198750b9b77147a123f948252a51d04d5c6b28 Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Tue, 11 Jun 2019 18:16:50 +0000
Subject: [PATCH 2/7] Vendor import of clang release_80 branch r363030:
 https://llvm.org/svn/llvm-project/cfe/branches/release_80@363030

---
 lib/Basic/Version.cpp                    |  2 +-
 lib/CodeGen/CGStmtOpenMP.cpp             |  5 ++--
 lib/Driver/ToolChains/Clang.cpp          | 12 ++++++---
 lib/Driver/ToolChains/Linux.cpp          | 33 ++++++++++++++----------
 lib/Sema/SemaOpenMP.cpp                  |  3 +--
 test/Driver/cl-options.c                 |  2 +-
 test/Driver/instrprof-ld.c               | 14 ++++++++++
 test/Driver/linux-ld.c                   | 10 +++++++
 test/OpenMP/ordered_doacross_codegen.cpp | 13 +++++++++-
 9 files changed, 70 insertions(+), 24 deletions(-)

diff --git a/lib/Basic/Version.cpp b/lib/Basic/Version.cpp
index a15c60e0f55..1d594b97418 100644
--- a/lib/Basic/Version.cpp
+++ b/lib/Basic/Version.cpp
@@ -36,7 +36,7 @@ std::string getClangRepositoryPath() {
 
   // If the SVN_REPOSITORY is empty, try to use the SVN keyword. This helps us
   // pick up a tag in an SVN export, for example.
-  StringRef SVNRepository("$URL: https://llvm.org/svn/llvm-project/cfe/tags/RELEASE_800/final/lib/Basic/Version.cpp $");
+  StringRef SVNRepository("$URL: https://llvm.org/svn/llvm-project/cfe/branches/release_80/lib/Basic/Version.cpp $");
   if (URL.empty()) {
     URL = SVNRepository.slice(SVNRepository.find(':'),
                               SVNRepository.find("/lib/Basic"));
diff --git a/lib/CodeGen/CGStmtOpenMP.cpp b/lib/CodeGen/CGStmtOpenMP.cpp
index eb1304d8934..44dc1cdee0b 100644
--- a/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1518,8 +1518,9 @@ void CodeGenFunction::EmitOMPPrivateLoopCounters(
          I < E; ++I) {
       const auto *DRE = cast<DeclRefExpr>(C->getLoopCounter(I));
       const auto *VD = cast<VarDecl>(DRE->getDecl());
-      // Override only those variables that are really emitted already.
-      if (LocalDeclMap.count(VD)) {
+      // Override only those variables that can be captured to avoid re-emission
+      // of the variables declared within the loops.
+      if (DRE->refersToEnclosingVariableOrCapture()) {
         (void)LoopScope.addPrivate(VD, [this, DRE, VD]() {
           return CreateMemTemp(DRE->getType(), VD->getName());
         });
diff --git a/lib/Driver/ToolChains/Clang.cpp b/lib/Driver/ToolChains/Clang.cpp
index 589f53b1192..78ee7a78176 100644
--- a/lib/Driver/ToolChains/Clang.cpp
+++ b/lib/Driver/ToolChains/Clang.cpp
@@ -718,8 +718,9 @@ static void appendUserToPath(SmallVectorImpl<char> &Result) {
   Result.append(UID.begin(), UID.end());
 }
 
-static void addPGOAndCoverageFlags(Compilation &C, const Driver &D,
-                                   const InputInfo &Output, const ArgList &Args,
+static void addPGOAndCoverageFlags(const ToolChain &TC, Compilation &C,
+                                   const Driver &D, const InputInfo &Output,
+                                   const ArgList &Args,
                                    ArgStringList &CmdArgs) {
 
   auto *PGOGenerateArg = Args.getLastArg(options::OPT_fprofile_generate,
@@ -759,6 +760,11 @@ static void addPGOAndCoverageFlags(Compilation &C, const Driver &D,
                                            ProfileGenerateArg->getValue()));
     // The default is to use Clang Instrumentation.
     CmdArgs.push_back("-fprofile-instrument=clang");
+    if (TC.getTriple().isWindowsMSVCEnvironment()) {
+      // Add dependent lib for clang_rt.profile
+      CmdArgs.push_back(Args.MakeArgString("--dependent-lib=" +
+                                           TC.getCompilerRT(Args, "profile")));
+    }
   }
 
   if (PGOGenerateArg) {
@@ -4118,7 +4124,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // sampling, overhead of call arc collection is way too high and there's no
   // way to collect the output.
   if (!Triple.isNVPTX())
-    addPGOAndCoverageFlags(C, D, Output, Args, CmdArgs);
+    addPGOAndCoverageFlags(TC, C, D, Output, Args, CmdArgs);
 
   if (auto *ABICompatArg = Args.getLastArg(options::OPT_fclang_abi_compat_EQ))
     ABICompatArg->render(Args, CmdArgs);
diff --git a/lib/Driver/ToolChains/Linux.cpp b/lib/Driver/ToolChains/Linux.cpp
index 65ab9b2daf5..dfdfb18319a 100644
--- a/lib/Driver/ToolChains/Linux.cpp
+++ b/lib/Driver/ToolChains/Linux.cpp
@@ -45,6 +45,7 @@ static std::string getMultiarchTriple(const Driver &D,
       TargetTriple.getEnvironment();
   bool IsAndroid = TargetTriple.isAndroid();
   bool IsMipsR6 = TargetTriple.getSubArch() == llvm::Triple::MipsSubArch_r6;
+  bool IsMipsN32Abi = TargetTriple.getEnvironment() == llvm::Triple::GNUABIN32;
 
   // For most architectures, just use whatever we have rather than trying to be
   // clever.
@@ -103,33 +104,37 @@ static std::string getMultiarchTriple(const Driver &D,
       return "aarch64_be-linux-gnu";
     break;
   case llvm::Triple::mips: {
-    std::string Arch = IsMipsR6 ? "mipsisa32r6" : "mips";
-    if (D.getVFS().exists(SysRoot + "/lib/" + Arch + "-linux-gnu"))
-      return Arch + "-linux-gnu";
+    std::string MT = IsMipsR6 ? "mipsisa32r6-linux-gnu" : "mips-linux-gnu";
+    if (D.getVFS().exists(SysRoot + "/lib/" + MT))
+      return MT;
     break;
   }
   case llvm::Triple::mipsel: {
     if (IsAndroid)
       return "mipsel-linux-android";
-    std::string Arch = IsMipsR6 ? "mipsisa32r6el" : "mipsel";
-    if (D.getVFS().exists(SysRoot + "/lib/" + Arch + "-linux-gnu"))
-      return Arch + "-linux-gnu";
+    std::string MT = IsMipsR6 ? "mipsisa32r6el-linux-gnu" : "mipsel-linux-gnu";
+    if (D.getVFS().exists(SysRoot + "/lib/" + MT))
+      return MT;
     break;
   }
   case llvm::Triple::mips64: {
-    std::string Arch = IsMipsR6 ? "mipsisa64r6" : "mips64";
-    std::string ABI = llvm::Triple::getEnvironmentTypeName(TargetEnvironment);
-    if (D.getVFS().exists(SysRoot + "/lib/" + Arch + "-linux-" + ABI))
-      return Arch + "-linux-" + ABI;
+    std::string MT = std::string(IsMipsR6 ? "mipsisa64r6" : "mips64") +
+                     "-linux-" + (IsMipsN32Abi ? "gnuabin32" : "gnuabi64");
+    if (D.getVFS().exists(SysRoot + "/lib/" + MT))
+      return MT;
+    if (D.getVFS().exists(SysRoot + "/lib/mips64-linux-gnu"))
+      return "mips64-linux-gnu";
     break;
   }
   case llvm::Triple::mips64el: {
     if (IsAndroid)
       return "mips64el-linux-android";
-    std::string Arch = IsMipsR6 ? "mipsisa64r6el" : "mips64el";
-    std::string ABI = llvm::Triple::getEnvironmentTypeName(TargetEnvironment);
-    if (D.getVFS().exists(SysRoot + "/lib/" + Arch + "-linux-" + ABI))
-      return Arch + "-linux-" + ABI;
+    std::string MT = std::string(IsMipsR6 ? "mipsisa64r6el" : "mips64el") +
+                     "-linux-" + (IsMipsN32Abi ? "gnuabin32" : "gnuabi64");
+    if (D.getVFS().exists(SysRoot + "/lib/" + MT))
+      return MT;
+    if (D.getVFS().exists(SysRoot + "/lib/mips64el-linux-gnu"))
+      return "mips64el-linux-gnu";
     break;
   }
   case llvm::Triple::ppc:
diff --git a/lib/Sema/SemaOpenMP.cpp b/lib/Sema/SemaOpenMP.cpp
index aedec746af9..8a0be0c472d 100644
--- a/lib/Sema/SemaOpenMP.cpp
+++ b/lib/Sema/SemaOpenMP.cpp
@@ -4602,8 +4602,7 @@ DeclRefExpr *OpenMPIterationSpaceChecker::buildCounterVar(
       Captures.insert(std::make_pair(LCRef, Ref));
     return Ref;
   }
-  return buildDeclRefExpr(SemaRef, VD, VD->getType().getNonReferenceType(),
-                          DefaultLoc);
+  return cast<DeclRefExpr>(LCRef);
 }
 
 Expr *OpenMPIterationSpaceChecker::buildPrivateCounterVar() const {
diff --git a/test/Driver/cl-options.c b/test/Driver/cl-options.c
index 909e391cec6..5048fd25c4a 100644
--- a/test/Driver/cl-options.c
+++ b/test/Driver/cl-options.c
@@ -66,7 +66,7 @@
 // RUN: %clang_cl -### /FA -fprofile-instr-generate=/tmp/somefile.profraw -- %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE-FILE %s
 // RUN: %clang_cl -### /FA -fprofile-instr-generate -fprofile-instr-use -- %s 2>&1 | FileCheck -check-prefix=CHECK-NO-MIX-GEN-USE %s
 // RUN: %clang_cl -### /FA -fprofile-instr-generate -fprofile-instr-use=file -- %s 2>&1 | FileCheck -check-prefix=CHECK-NO-MIX-GEN-USE %s
-// CHECK-PROFILE-GENERATE: "-fprofile-instrument=clang"
+// CHECK-PROFILE-GENERATE: "-fprofile-instrument=clang" "--dependent-lib={{[^"]*}}clang_rt.profile-{{[^"]*}}.lib"
 // CHECK-PROFILE-GENERATE-FILE: "-fprofile-instrument-path=/tmp/somefile.profraw"
 // CHECK-NO-MIX-GEN-USE: '{{[a-z=-]*}}' not allowed with '{{[a-z=-]*}}'
 
diff --git a/test/Driver/instrprof-ld.c b/test/Driver/instrprof-ld.c
index ea201056997..1ac3f9650ff 100644
--- a/test/Driver/instrprof-ld.c
+++ b/test/Driver/instrprof-ld.c
@@ -121,3 +121,17 @@
 //
 // CHECK-WINDOWS-X86-64: "{{.*}}link{{(.exe)?}}"
 // CHECK-WINDOWS-X86-64: "{{.*}}clang_rt.profile-x86_64.lib"
+
+// Test instrumented profiling dependent-lib flags
+//
+// RUN: %clang %s -### -o %t.o -target x86_64-pc-win32 \
+// RUN:     -fprofile-instr-generate 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-WINDOWS-X86-64-DEPENDENT-LIB %s
+//
+// CHECK-WINDOWS-X86-64-DEPENDENT-LIB: "--dependent-lib={{[^"]*}}clang_rt.profile-{{[^"]*}}.lib"
+//
+// RUN: %clang %s -### -o %t.o -target x86_64-mingw32 \
+// RUN:     -fprofile-instr-generate 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-MINGW-X86-64-DEPENDENT-LIB %s
+//
+// CHECK-MINGW-X86-64-DEPENDENT-LIB-NOT: "--dependent-lib={{[^"]*}}clang_rt.profile-{{[^"]*}}.a"
diff --git a/test/Driver/linux-ld.c b/test/Driver/linux-ld.c
index 3ab81be4906..a8173478029 100644
--- a/test/Driver/linux-ld.c
+++ b/test/Driver/linux-ld.c
@@ -1632,6 +1632,11 @@
 // CHECK-DEBIAN-ML-MIPS64EL-N32: "-L[[SYSROOT]]/usr/lib"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     --target=mips64-unknown-linux-gnu \
+// RUN:     --gcc-toolchain="" \
+// RUN:     --sysroot=%S/Inputs/debian_6_mips64_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-ML-MIPS64-GNUABI %s
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64-linux-gnuabi64 -mabi=n64 \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/debian_6_mips64_tree \
@@ -1652,6 +1657,11 @@
 // CHECK-DEBIAN-ML-MIPS64-GNUABI: "{{.*}}/usr/lib/gcc/mips64-linux-gnuabi64/4.9/../../../mips64-linux-gnuabi64{{/|\\\\}}crtn.o"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     --target=mips64el-unknown-linux-gnu \
+// RUN:     --gcc-toolchain="" \
+// RUN:     --sysroot=%S/Inputs/debian_6_mips64_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-ML-MIPS64EL-GNUABI %s
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64el-linux-gnuabi64 -mabi=n64 \
 // RUN:     --gcc-toolchain="" \
 // RUN:     --sysroot=%S/Inputs/debian_6_mips64_tree \
diff --git a/test/OpenMP/ordered_doacross_codegen.cpp b/test/OpenMP/ordered_doacross_codegen.cpp
index 2f19e9c2d5e..a3abf248d76 100644
--- a/test/OpenMP/ordered_doacross_codegen.cpp
+++ b/test/OpenMP/ordered_doacross_codegen.cpp
@@ -16,6 +16,17 @@ extern int n;
 int a[10], b[10], c[10], d[10];
 void foo();
 
+// CHECK-LABEL:bar
+void bar() {
+  int i,j;
+// CHECK: call void @__kmpc_doacross_init(
+// CHECK: call void @__kmpc_doacross_fini(
+#pragma omp parallel for ordered(2)
+  for (i = 0; i < n; ++i)
+  for (j = 0; j < n; ++j)
+    a[i] = b[i] + 1;
+}
+
 // CHECK-LABEL: @main()
 int main() {
   int i;
@@ -35,7 +46,7 @@ int main() {
 // CHECK: call void @__kmpc_doacross_init([[IDENT]], i32 [[GTID]], i32 1, i8* [[CAST]])
 // CHECK: call void @__kmpc_for_static_init_4(
 #pragma omp for ordered(1)
-  for (i = 0; i < n; ++i) {
+  for (int i = 0; i < n; ++i) {
     a[i] = b[i] + 1;
     foo();
 // CHECK: invoke void [[FOO:.+]](

From 90579156da340660a42e22e794a3c03ac1b50112 Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Tue, 11 Jun 2019 18:16:59 +0000
Subject: [PATCH 3/7] Vendor import of compiler-rt release_80 branch r363030:
 https://llvm.org/svn/llvm-project/compiler-rt/branches/release_80@363030

---
 cmake/base-config-ix.cmake                               | 4 ++--
 lib/sanitizer_common/sanitizer_platform_limits_netbsd.cc | 2 ++
 lib/xray/tests/CMakeLists.txt                            | 5 +++--
 test/builtins/Unit/compiler_rt_logb_test.c               | 7 +++++++
 4 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/cmake/base-config-ix.cmake b/cmake/base-config-ix.cmake
index 6684d7371d6..aeabf17653f 100644
--- a/cmake/base-config-ix.cmake
+++ b/cmake/base-config-ix.cmake
@@ -195,10 +195,10 @@ macro(test_targets)
       # clang's default CPU's. In the 64-bit case, we must also specify the ABI
       # since the default ABI differs between gcc and clang.
       # FIXME: Ideally, we would build the N32 library too.
-      test_target_arch(mipsel "" "-mips32r2" "-mabi=32")
+      test_target_arch(mipsel "" "-mips32r2" "-mabi=32" "-D_LARGEFILE_SOURCE" "-D_FILE_OFFSET_BITS=64")
       test_target_arch(mips64el "" "-mips64r2" "-mabi=64")
     elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "mips")
-      test_target_arch(mips "" "-mips32r2" "-mabi=32")
+      test_target_arch(mips "" "-mips32r2" "-mabi=32" "-D_LARGEFILE_SOURCE" "-D_FILE_OFFSET_BITS=64")
       test_target_arch(mips64 "" "-mips64r2" "-mabi=64")
     elseif("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "arm")
       if(WIN32)
diff --git a/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cc b/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cc
index c112e044b1d..c32c80b3e48 100644
--- a/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cc
+++ b/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cc
@@ -124,7 +124,9 @@
 #include <dev/isa/isvio.h>
 #include <dev/isa/wtreg.h>
 #include <dev/iscsi/iscsi_ioctl.h>
+#if 0
 #include <dev/nvmm/nvmm_ioctl.h>
+#endif
 #include <dev/ofw/openfirmio.h>
 #include <dev/pci/amrio.h>
 #include <dev/pci/mlyreg.h>
diff --git a/lib/xray/tests/CMakeLists.txt b/lib/xray/tests/CMakeLists.txt
index deddc5101e7..14585290454 100644
--- a/lib/xray/tests/CMakeLists.txt
+++ b/lib/xray/tests/CMakeLists.txt
@@ -48,8 +48,7 @@ endfunction()
 
 set(XRAY_TEST_ARCH ${XRAY_SUPPORTED_ARCH})
 set(XRAY_UNITTEST_LINK_FLAGS
-  ${CMAKE_THREAD_LIBS_INIT}
-  -l${SANITIZER_CXX_ABI_LIBRARY})
+  ${CMAKE_THREAD_LIBS_INIT})
 
 if (NOT APPLE)
   # Needed by LLVMSupport.
@@ -81,6 +80,8 @@ if (NOT APPLE)
   append_list_if(COMPILER_RT_HAS_LIBEXECINFO -lexecinfo XRAY_UNITTEST_LINK_FLAGS)
 endif()
 
+list(APPEND XRAY_UNITTEST_LINK_FLAGS -l${SANITIZER_CXX_ABI_LIBRARY})
+
 macro(add_xray_unittest testname)
   cmake_parse_arguments(TEST "" "" "SOURCES;HEADERS" ${ARGN})
   if(UNIX AND NOT APPLE)
diff --git a/test/builtins/Unit/compiler_rt_logb_test.c b/test/builtins/Unit/compiler_rt_logb_test.c
index 79676598089..9625881316b 100644
--- a/test/builtins/Unit/compiler_rt_logb_test.c
+++ b/test/builtins/Unit/compiler_rt_logb_test.c
@@ -37,6 +37,10 @@ double cases[] = {
 };
 
 int main() {
+  // Do not run the compiler-rt logb test case if using GLIBC version
+  // < 2.23. Older versions might not compute to the same value as the
+  // compiler-rt value.
+#if !defined(__GLIBC__) || (defined(__GLIBC_PREREQ) && __GLIBC_PREREQ(2, 23))
   const unsigned N = sizeof(cases) / sizeof(cases[0]);
   unsigned i;
   for (i = 0; i < N; ++i) {
@@ -58,6 +62,9 @@ int main() {
     if (test__compiler_rt_logb(fromRep(signBit ^ x))) return 1;
     x >>= 1;
   }
+#else
+  printf("skipped\n");
+#endif
 
   return 0;
 }

From a556a0efb19f863ac71333b3e1bc22d07f4ae89f Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Tue, 11 Jun 2019 18:17:11 +0000
Subject: [PATCH 4/7] Vendor import of libc++ release_80 branch r363030:
 https://llvm.org/svn/llvm-project/libcxx/branches/release_80@363030

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3a6c3c9df4d..6b83bce1ae7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,7 +27,7 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
   project(libcxx CXX C)
 
   set(PACKAGE_NAME libcxx)
-  set(PACKAGE_VERSION 8.0.0)
+  set(PACKAGE_VERSION 8.0.1)
   set(PACKAGE_STRING "${PACKAGE_NAME} ${PACKAGE_VERSION}")
   set(PACKAGE_BUGREPORT "llvm-bugs@lists.llvm.org")
 

From 2bcfbb511a63150d7b74d910c8dbda707d78a5e9 Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Tue, 11 Jun 2019 18:17:16 +0000
Subject: [PATCH 5/7] Vendor import of LLVM libunwind release_80 branch
 r363030:
 https://llvm.org/svn/llvm-project/libunwind/branches/release_80@363030

---
 src/UnwindRegistersRestore.S | 232 +++++++++++++++----------------
 src/UnwindRegistersSave.S    | 256 +++++++++++++++++------------------
 src/assembly.h               |   2 -
 3 files changed, 244 insertions(+), 246 deletions(-)

diff --git a/src/UnwindRegistersRestore.S b/src/UnwindRegistersRestore.S
index 389db67579c..a155fbe2ddf 100644
--- a/src/UnwindRegistersRestore.S
+++ b/src/UnwindRegistersRestore.S
@@ -396,119 +396,119 @@ Lnovec:
 #elif defined(__ppc__)
 
 DEFINE_LIBUNWIND_PRIVATE_FUNCTION(_ZN9libunwind13Registers_ppc6jumptoEv)
-;
-; void libunwind::Registers_ppc::jumpto()
-;
-; On entry:
-;  thread_state pointer is in r3
-;
+//
+// void libunwind::Registers_ppc::jumpto()
+//
+// On entry:
+//  thread_state pointer is in r3
+//
 
-  ; restore integral registerrs
-  ; skip r0 for now
-  ; skip r1 for now
-  lwz     r2, 16(r3)
-  ; skip r3 for now
-  ; skip r4 for now
-  ; skip r5 for now
-  lwz     r6, 32(r3)
-  lwz     r7, 36(r3)
-  lwz     r8, 40(r3)
-  lwz     r9, 44(r3)
-  lwz    r10, 48(r3)
-  lwz    r11, 52(r3)
-  lwz    r12, 56(r3)
-  lwz    r13, 60(r3)
-  lwz    r14, 64(r3)
-  lwz    r15, 68(r3)
-  lwz    r16, 72(r3)
-  lwz    r17, 76(r3)
-  lwz    r18, 80(r3)
-  lwz    r19, 84(r3)
-  lwz    r20, 88(r3)
-  lwz    r21, 92(r3)
-  lwz    r22, 96(r3)
-  lwz    r23,100(r3)
-  lwz    r24,104(r3)
-  lwz    r25,108(r3)
-  lwz    r26,112(r3)
-  lwz    r27,116(r3)
-  lwz    r28,120(r3)
-  lwz    r29,124(r3)
-  lwz    r30,128(r3)
-  lwz    r31,132(r3)
+  // restore integral registers
+  // skip r0 for now
+  // skip r1 for now
+  lwz     %r2,  16(%r3)
+  // skip r3 for now
+  // skip r4 for now
+  // skip r5 for now
+  lwz     %r6,  32(%r3)
+  lwz     %r7,  36(%r3)
+  lwz     %r8,  40(%r3)
+  lwz     %r9,  44(%r3)
+  lwz     %r10, 48(%r3)
+  lwz     %r11, 52(%r3)
+  lwz     %r12, 56(%r3)
+  lwz     %r13, 60(%r3)
+  lwz     %r14, 64(%r3)
+  lwz     %r15, 68(%r3)
+  lwz     %r16, 72(%r3)
+  lwz     %r17, 76(%r3)
+  lwz     %r18, 80(%r3)
+  lwz     %r19, 84(%r3)
+  lwz     %r20, 88(%r3)
+  lwz     %r21, 92(%r3)
+  lwz     %r22, 96(%r3)
+  lwz     %r23,100(%r3)
+  lwz     %r24,104(%r3)
+  lwz     %r25,108(%r3)
+  lwz     %r26,112(%r3)
+  lwz     %r27,116(%r3)
+  lwz     %r28,120(%r3)
+  lwz     %r29,124(%r3)
+  lwz     %r30,128(%r3)
+  lwz     %r31,132(%r3)
 
-  ; restore float registers
-  lfd    f0, 160(r3)
-  lfd    f1, 168(r3)
-  lfd    f2, 176(r3)
-  lfd    f3, 184(r3)
-  lfd    f4, 192(r3)
-  lfd    f5, 200(r3)
-  lfd    f6, 208(r3)
-  lfd    f7, 216(r3)
-  lfd    f8, 224(r3)
-  lfd    f9, 232(r3)
-  lfd    f10,240(r3)
-  lfd    f11,248(r3)
-  lfd    f12,256(r3)
-  lfd    f13,264(r3)
-  lfd    f14,272(r3)
-  lfd    f15,280(r3)
-  lfd    f16,288(r3)
-  lfd    f17,296(r3)
-  lfd    f18,304(r3)
-  lfd    f19,312(r3)
-  lfd    f20,320(r3)
-  lfd    f21,328(r3)
-  lfd    f22,336(r3)
-  lfd    f23,344(r3)
-  lfd    f24,352(r3)
-  lfd    f25,360(r3)
-  lfd    f26,368(r3)
-  lfd    f27,376(r3)
-  lfd    f28,384(r3)
-  lfd    f29,392(r3)
-  lfd    f30,400(r3)
-  lfd    f31,408(r3)
+  // restore float registers
+  lfd     %f0, 160(%r3)
+  lfd     %f1, 168(%r3)
+  lfd     %f2, 176(%r3)
+  lfd     %f3, 184(%r3)
+  lfd     %f4, 192(%r3)
+  lfd     %f5, 200(%r3)
+  lfd     %f6, 208(%r3)
+  lfd     %f7, 216(%r3)
+  lfd     %f8, 224(%r3)
+  lfd     %f9, 232(%r3)
+  lfd     %f10,240(%r3)
+  lfd     %f11,248(%r3)
+  lfd     %f12,256(%r3)
+  lfd     %f13,264(%r3)
+  lfd     %f14,272(%r3)
+  lfd     %f15,280(%r3)
+  lfd     %f16,288(%r3)
+  lfd     %f17,296(%r3)
+  lfd     %f18,304(%r3)
+  lfd     %f19,312(%r3)
+  lfd     %f20,320(%r3)
+  lfd     %f21,328(%r3)
+  lfd     %f22,336(%r3)
+  lfd     %f23,344(%r3)
+  lfd     %f24,352(%r3)
+  lfd     %f25,360(%r3)
+  lfd     %f26,368(%r3)
+  lfd     %f27,376(%r3)
+  lfd     %f28,384(%r3)
+  lfd     %f29,392(%r3)
+  lfd     %f30,400(%r3)
+  lfd     %f31,408(%r3)
 
-  ; restore vector registers if any are in use
-  lwz    r5,156(r3)  ; test VRsave
-  cmpwi  r5,0
-  beq    Lnovec
-
-  subi  r4,r1,16
-  rlwinm  r4,r4,0,0,27  ; mask low 4-bits
-  ; r4 is now a 16-byte aligned pointer into the red zone
-  ; the _vectorRegisters may not be 16-byte aligned so copy via red zone temp buffer
+  // restore vector registers if any are in use
+  lwz     %r5, 156(%r3)       // test VRsave
+  cmpwi   %r5, 0
+  beq     Lnovec
 
+  subi    %r4, %r1, 16
+  rlwinm  %r4, %r4, 0, 0, 27  // mask low 4-bits
+  // r4 is now a 16-byte aligned pointer into the red zone
+  // the _vectorRegisters may not be 16-byte aligned so copy via red zone temp buffer
+ 
 
 #define LOAD_VECTOR_UNALIGNEDl(_index) \
-  andis.  r0,r5,(1<<(15-_index))  @\
-  beq    Ldone  ## _index     @\
-  lwz    r0, 424+_index*16(r3)  @\
-  stw    r0, 0(r4)        @\
-  lwz    r0, 424+_index*16+4(r3)  @\
-  stw    r0, 4(r4)        @\
-  lwz    r0, 424+_index*16+8(r3)  @\
-  stw    r0, 8(r4)        @\
-  lwz    r0, 424+_index*16+12(r3)@\
-  stw    r0, 12(r4)        @\
-  lvx    v ## _index,0,r4    @\
-Ldone  ## _index:
+  andis.  %r0, %r5, (1<<(15-_index))  SEPARATOR \
+  beq     Ldone ## _index             SEPARATOR \
+  lwz     %r0, 424+_index*16(%r3)     SEPARATOR \
+  stw     %r0, 0(%r4)                 SEPARATOR \
+  lwz     %r0, 424+_index*16+4(%r3)   SEPARATOR \
+  stw     %r0, 4(%r4)                 SEPARATOR \
+  lwz     %r0, 424+_index*16+8(%r3)   SEPARATOR \
+  stw     %r0, 8(%r4)                 SEPARATOR \
+  lwz     %r0, 424+_index*16+12(%r3)  SEPARATOR \
+  stw     %r0, 12(%r4)                SEPARATOR \
+  lvx     %v ## _index, 0, %r4        SEPARATOR \
+  Ldone ## _index:
 
 #define LOAD_VECTOR_UNALIGNEDh(_index) \
-  andi.  r0,r5,(1<<(31-_index))  @\
-  beq    Ldone  ## _index    @\
-  lwz    r0, 424+_index*16(r3)  @\
-  stw    r0, 0(r4)        @\
-  lwz    r0, 424+_index*16+4(r3)  @\
-  stw    r0, 4(r4)        @\
-  lwz    r0, 424+_index*16+8(r3)  @\
-  stw    r0, 8(r4)        @\
-  lwz    r0, 424+_index*16+12(r3)@\
-  stw    r0, 12(r4)        @\
-  lvx    v ## _index,0,r4    @\
-  Ldone  ## _index:
+  andi.   %r0, %r5, (1<<(31-_index))  SEPARATOR \
+  beq     Ldone ## _index             SEPARATOR \
+  lwz     %r0, 424+_index*16(%r3)     SEPARATOR \
+  stw     %r0, 0(%r4)                 SEPARATOR \
+  lwz     %r0, 424+_index*16+4(%r3)   SEPARATOR \
+  stw     %r0, 4(%r4)                 SEPARATOR \
+  lwz     %r0, 424+_index*16+8(%r3)   SEPARATOR \
+  stw     %r0, 8(%r4)                 SEPARATOR \
+  lwz     %r0, 424+_index*16+12(%r3)  SEPARATOR \
+  stw     %r0, 12(%r4)                SEPARATOR \
+  lvx     %v ## _index, 0, %r4        SEPARATOR \
+  Ldone ## _index:
 
 
   LOAD_VECTOR_UNALIGNEDl(0)
@@ -545,17 +545,17 @@ Ldone  ## _index:
   LOAD_VECTOR_UNALIGNEDh(31)
 
 Lnovec:
-  lwz    r0, 136(r3) ; __cr
-  mtocrf  255,r0
-  lwz    r0, 148(r3) ; __ctr
-  mtctr  r0
-  lwz    r0, 0(r3)  ; __ssr0
-  mtctr  r0
-  lwz    r0, 8(r3)  ; do r0 now
-  lwz    r5,28(r3)  ; do r5 now
-  lwz    r4,24(r3)  ; do r4 now
-  lwz    r1,12(r3)  ; do sp now
-  lwz    r3,20(r3)  ; do r3 last
+  lwz     %r0, 136(%r3)   // __cr
+  mtcr    %r0
+  lwz     %r0, 148(%r3)   // __ctr
+  mtctr   %r0
+  lwz     %r0,   0(%r3)   // __ssr0
+  mtctr   %r0
+  lwz     %r0,   8(%r3)   // do r0 now
+  lwz     %r5,  28(%r3)   // do r5 now
+  lwz     %r4,  24(%r3)   // do r4 now
+  lwz     %r1,  12(%r3)   // do sp now
+  lwz     %r3,  20(%r3)   // do r3 last
   bctr
 
 #elif defined(__arm64__) || defined(__aarch64__)
diff --git a/src/UnwindRegistersSave.S b/src/UnwindRegistersSave.S
index 48ecb0aec70..4b674afc6bd 100644
--- a/src/UnwindRegistersSave.S
+++ b/src/UnwindRegistersSave.S
@@ -557,144 +557,144 @@ DEFINE_LIBUNWIND_FUNCTION(unw_getcontext)
 
 #elif defined(__ppc__)
 
-;
-; extern int unw_getcontext(unw_context_t* thread_state)
-;
-; On entry:
-;  thread_state pointer is in r3
-;
+//
+// extern int unw_getcontext(unw_context_t* thread_state)
+//
+// On entry:
+//  thread_state pointer is in r3
+//
 DEFINE_LIBUNWIND_FUNCTION(unw_getcontext)
-  stw    r0,  8(r3)
-  mflr  r0
-  stw    r0,  0(r3)  ; store lr as ssr0
-  stw    r1, 12(r3)
-  stw    r2, 16(r3)
-  stw    r3, 20(r3)
-  stw    r4, 24(r3)
-  stw    r5, 28(r3)
-  stw    r6, 32(r3)
-  stw    r7, 36(r3)
-  stw    r8, 40(r3)
-  stw    r9, 44(r3)
-  stw     r10, 48(r3)
-  stw     r11, 52(r3)
-  stw     r12, 56(r3)
-  stw     r13, 60(r3)
-  stw     r14, 64(r3)
-  stw     r15, 68(r3)
-  stw     r16, 72(r3)
-  stw     r17, 76(r3)
-  stw     r18, 80(r3)
-  stw     r19, 84(r3)
-  stw     r20, 88(r3)
-  stw     r21, 92(r3)
-  stw     r22, 96(r3)
-  stw     r23,100(r3)
-  stw     r24,104(r3)
-  stw     r25,108(r3)
-  stw     r26,112(r3)
-  stw     r27,116(r3)
-  stw     r28,120(r3)
-  stw     r29,124(r3)
-  stw     r30,128(r3)
-  stw     r31,132(r3)
+  stw     %r0,   8(%r3)
+  mflr    %r0
+  stw     %r0,   0(%r3) // store lr as ssr0
+  stw     %r1,  12(%r3)
+  stw     %r2,  16(%r3)
+  stw     %r3,  20(%r3)
+  stw     %r4,  24(%r3)
+  stw     %r5,  28(%r3)
+  stw     %r6,  32(%r3)
+  stw     %r7,  36(%r3)
+  stw     %r8,  40(%r3)
+  stw     %r9,  44(%r3)
+  stw     %r10, 48(%r3)
+  stw     %r11, 52(%r3)
+  stw     %r12, 56(%r3)
+  stw     %r13, 60(%r3)
+  stw     %r14, 64(%r3)
+  stw     %r15, 68(%r3)
+  stw     %r16, 72(%r3)
+  stw     %r17, 76(%r3)
+  stw     %r18, 80(%r3)
+  stw     %r19, 84(%r3)
+  stw     %r20, 88(%r3)
+  stw     %r21, 92(%r3)
+  stw     %r22, 96(%r3)
+  stw     %r23,100(%r3)
+  stw     %r24,104(%r3)
+  stw     %r25,108(%r3)
+  stw     %r26,112(%r3)
+  stw     %r27,116(%r3)
+  stw     %r28,120(%r3)
+  stw     %r29,124(%r3)
+  stw     %r30,128(%r3)
+  stw     %r31,132(%r3)
 
-  ; save VRSave register
-  mfspr  r0,256
-  stw    r0,156(r3)
-  ; save CR registers
-  mfcr  r0
-  stw    r0,136(r3)
-  ; save CTR register
-  mfctr  r0
-  stw    r0,148(r3)
+  // save VRSave register
+  mfspr   %r0, 256
+  stw     %r0, 156(%r3)
+  // save CR registers
+  mfcr    %r0
+  stw     %r0, 136(%r3)
+  // save CTR register
+  mfctr   %r0
+  stw     %r0, 148(%r3)
 
-  ; save float registers
-  stfd    f0, 160(r3)
-  stfd    f1, 168(r3)
-  stfd    f2, 176(r3)
-  stfd    f3, 184(r3)
-  stfd    f4, 192(r3)
-  stfd    f5, 200(r3)
-  stfd    f6, 208(r3)
-  stfd    f7, 216(r3)
-  stfd    f8, 224(r3)
-  stfd    f9, 232(r3)
-  stfd    f10,240(r3)
-  stfd    f11,248(r3)
-  stfd    f12,256(r3)
-  stfd    f13,264(r3)
-  stfd    f14,272(r3)
-  stfd    f15,280(r3)
-  stfd    f16,288(r3)
-  stfd    f17,296(r3)
-  stfd    f18,304(r3)
-  stfd    f19,312(r3)
-  stfd    f20,320(r3)
-  stfd    f21,328(r3)
-  stfd    f22,336(r3)
-  stfd    f23,344(r3)
-  stfd    f24,352(r3)
-  stfd    f25,360(r3)
-  stfd    f26,368(r3)
-  stfd    f27,376(r3)
-  stfd    f28,384(r3)
-  stfd    f29,392(r3)
-  stfd    f30,400(r3)
-  stfd    f31,408(r3)
+  // save float registers
+  stfd    %f0, 160(%r3)
+  stfd    %f1, 168(%r3)
+  stfd    %f2, 176(%r3)
+  stfd    %f3, 184(%r3)
+  stfd    %f4, 192(%r3)
+  stfd    %f5, 200(%r3)
+  stfd    %f6, 208(%r3)
+  stfd    %f7, 216(%r3)
+  stfd    %f8, 224(%r3)
+  stfd    %f9, 232(%r3)
+  stfd    %f10,240(%r3)
+  stfd    %f11,248(%r3)
+  stfd    %f12,256(%r3)
+  stfd    %f13,264(%r3)
+  stfd    %f14,272(%r3)
+  stfd    %f15,280(%r3)
+  stfd    %f16,288(%r3)
+  stfd    %f17,296(%r3)
+  stfd    %f18,304(%r3)
+  stfd    %f19,312(%r3)
+  stfd    %f20,320(%r3)
+  stfd    %f21,328(%r3)
+  stfd    %f22,336(%r3)
+  stfd    %f23,344(%r3)
+  stfd    %f24,352(%r3)
+  stfd    %f25,360(%r3)
+  stfd    %f26,368(%r3)
+  stfd    %f27,376(%r3)
+  stfd    %f28,384(%r3)
+  stfd    %f29,392(%r3)
+  stfd    %f30,400(%r3)
+  stfd    %f31,408(%r3)
 
 
-  ; save vector registers
+  // save vector registers
 
-  subi  r4,r1,16
-  rlwinm  r4,r4,0,0,27  ; mask low 4-bits
-  ; r4 is now a 16-byte aligned pointer into the red zone
+  subi    %r4, %r1, 16
+  rlwinm  %r4, %r4, 0, 0, 27  // mask low 4-bits
+  // r4 is now a 16-byte aligned pointer into the red zone
 
 #define SAVE_VECTOR_UNALIGNED(_vec, _offset) \
-  stvx  _vec,0,r4           @\
-  lwz    r5, 0(r4)          @\
-  stw    r5, _offset(r3)    @\
-  lwz    r5, 4(r4)          @\
-  stw    r5, _offset+4(r3)  @\
-  lwz    r5, 8(r4)          @\
-  stw    r5, _offset+8(r3)  @\
-  lwz    r5, 12(r4)         @\
-  stw    r5, _offset+12(r3)
+  stvx    _vec, 0, %r4          SEPARATOR \
+  lwz     %r5, 0(%r4)           SEPARATOR \
+  stw     %r5, _offset(%r3)     SEPARATOR \
+  lwz     %r5, 4(%r4)           SEPARATOR \
+  stw     %r5, _offset+4(%r3)   SEPARATOR \
+  lwz     %r5, 8(%r4)           SEPARATOR \
+  stw     %r5, _offset+8(%r3)   SEPARATOR \
+  lwz     %r5, 12(%r4)          SEPARATOR \
+  stw     %r5, _offset+12(%r3)
 
-  SAVE_VECTOR_UNALIGNED( v0, 424+0x000)
-  SAVE_VECTOR_UNALIGNED( v1, 424+0x010)
-  SAVE_VECTOR_UNALIGNED( v2, 424+0x020)
-  SAVE_VECTOR_UNALIGNED( v3, 424+0x030)
-  SAVE_VECTOR_UNALIGNED( v4, 424+0x040)
-  SAVE_VECTOR_UNALIGNED( v5, 424+0x050)
-  SAVE_VECTOR_UNALIGNED( v6, 424+0x060)
-  SAVE_VECTOR_UNALIGNED( v7, 424+0x070)
-  SAVE_VECTOR_UNALIGNED( v8, 424+0x080)
-  SAVE_VECTOR_UNALIGNED( v9, 424+0x090)
-  SAVE_VECTOR_UNALIGNED(v10, 424+0x0A0)
-  SAVE_VECTOR_UNALIGNED(v11, 424+0x0B0)
-  SAVE_VECTOR_UNALIGNED(v12, 424+0x0C0)
-  SAVE_VECTOR_UNALIGNED(v13, 424+0x0D0)
-  SAVE_VECTOR_UNALIGNED(v14, 424+0x0E0)
-  SAVE_VECTOR_UNALIGNED(v15, 424+0x0F0)
-  SAVE_VECTOR_UNALIGNED(v16, 424+0x100)
-  SAVE_VECTOR_UNALIGNED(v17, 424+0x110)
-  SAVE_VECTOR_UNALIGNED(v18, 424+0x120)
-  SAVE_VECTOR_UNALIGNED(v19, 424+0x130)
-  SAVE_VECTOR_UNALIGNED(v20, 424+0x140)
-  SAVE_VECTOR_UNALIGNED(v21, 424+0x150)
-  SAVE_VECTOR_UNALIGNED(v22, 424+0x160)
-  SAVE_VECTOR_UNALIGNED(v23, 424+0x170)
-  SAVE_VECTOR_UNALIGNED(v24, 424+0x180)
-  SAVE_VECTOR_UNALIGNED(v25, 424+0x190)
-  SAVE_VECTOR_UNALIGNED(v26, 424+0x1A0)
-  SAVE_VECTOR_UNALIGNED(v27, 424+0x1B0)
-  SAVE_VECTOR_UNALIGNED(v28, 424+0x1C0)
-  SAVE_VECTOR_UNALIGNED(v29, 424+0x1D0)
-  SAVE_VECTOR_UNALIGNED(v30, 424+0x1E0)
-  SAVE_VECTOR_UNALIGNED(v31, 424+0x1F0)
+  SAVE_VECTOR_UNALIGNED( %v0, 424+0x000)
+  SAVE_VECTOR_UNALIGNED( %v1, 424+0x010)
+  SAVE_VECTOR_UNALIGNED( %v2, 424+0x020)
+  SAVE_VECTOR_UNALIGNED( %v3, 424+0x030)
+  SAVE_VECTOR_UNALIGNED( %v4, 424+0x040)
+  SAVE_VECTOR_UNALIGNED( %v5, 424+0x050)
+  SAVE_VECTOR_UNALIGNED( %v6, 424+0x060)
+  SAVE_VECTOR_UNALIGNED( %v7, 424+0x070)
+  SAVE_VECTOR_UNALIGNED( %v8, 424+0x080)
+  SAVE_VECTOR_UNALIGNED( %v9, 424+0x090)
+  SAVE_VECTOR_UNALIGNED(%v10, 424+0x0A0)
+  SAVE_VECTOR_UNALIGNED(%v11, 424+0x0B0)
+  SAVE_VECTOR_UNALIGNED(%v12, 424+0x0C0)
+  SAVE_VECTOR_UNALIGNED(%v13, 424+0x0D0)
+  SAVE_VECTOR_UNALIGNED(%v14, 424+0x0E0)
+  SAVE_VECTOR_UNALIGNED(%v15, 424+0x0F0)
+  SAVE_VECTOR_UNALIGNED(%v16, 424+0x100)
+  SAVE_VECTOR_UNALIGNED(%v17, 424+0x110)
+  SAVE_VECTOR_UNALIGNED(%v18, 424+0x120)
+  SAVE_VECTOR_UNALIGNED(%v19, 424+0x130)
+  SAVE_VECTOR_UNALIGNED(%v20, 424+0x140)
+  SAVE_VECTOR_UNALIGNED(%v21, 424+0x150)
+  SAVE_VECTOR_UNALIGNED(%v22, 424+0x160)
+  SAVE_VECTOR_UNALIGNED(%v23, 424+0x170)
+  SAVE_VECTOR_UNALIGNED(%v24, 424+0x180)
+  SAVE_VECTOR_UNALIGNED(%v25, 424+0x190)
+  SAVE_VECTOR_UNALIGNED(%v26, 424+0x1A0)
+  SAVE_VECTOR_UNALIGNED(%v27, 424+0x1B0)
+  SAVE_VECTOR_UNALIGNED(%v28, 424+0x1C0)
+  SAVE_VECTOR_UNALIGNED(%v29, 424+0x1D0)
+  SAVE_VECTOR_UNALIGNED(%v30, 424+0x1E0)
+  SAVE_VECTOR_UNALIGNED(%v31, 424+0x1F0)
 
-  li  r3, 0    ; return UNW_ESUCCESS
+  li      %r3, 0  // return UNW_ESUCCESS
   blr
 
 
diff --git a/src/assembly.h b/src/assembly.h
index 2df930214fa..7806892e9dc 100644
--- a/src/assembly.h
+++ b/src/assembly.h
@@ -29,8 +29,6 @@
 #ifdef _ARCH_PWR8
 #define PPC64_HAS_VMX
 #endif
-#elif defined(__POWERPC__) || defined(__powerpc__) || defined(__ppc__)
-#define SEPARATOR @
 #elif defined(__arm64__)
 #define SEPARATOR %%
 #else

From 444e4712399dfed9a74a0a1bd4880ea138a86616 Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Tue, 11 Jun 2019 18:17:23 +0000
Subject: [PATCH 6/7] Vendor import of lld release_80 branch r363030:
 https://llvm.org/svn/llvm-project/lld/branches/release_80@363030

---
 COFF/Writer.cpp                               |  50 +++++++--
 ELF/Arch/PPC64.cpp                            |   9 ++
 ELF/InputSection.cpp                          |   6 +-
 ELF/SyntheticSections.cpp                     |   5 +
 ELF/Writer.cpp                                |  16 +--
 test/COFF/guardcf-thunk.s                     |  43 +++++++
 test/ELF/aarch64-cortex-a53-843419-tlsrelax.s |   4 +-
 test/ELF/aarch64-tls-gdle.s                   |   4 +-
 test/ELF/aarch64-tls-iele.s                   |   6 +-
 test/ELF/aarch64-tls-le.s                     |   8 +-
 test/ELF/aarch64-tlsld-ldst.s                 |  50 ++++-----
 test/ELF/arm-tls-le32.s                       |  12 +-
 test/ELF/arm-tls-norelax-ie-le.s              |   4 +-
 test/ELF/compressed-input-alignment.test      |  67 +++++++++++
 test/ELF/eh-frame-hdr-augmentation.s          |   4 +-
 test/ELF/emit-relocs-mergeable2.s             |  14 +++
 test/ELF/gc-sections-metadata-startstop.s     |   2 +-
 test/ELF/mips-micro-relocs.s                  |   6 +-
 test/ELF/mips-micror6-relocs.s                |   4 +-
 test/ELF/ppc64-bsymbolic-toc-restore.s        |   4 +-
 test/ELF/ppc64-call-reach.s                   |  40 +++----
 test/ELF/ppc64-ifunc.s                        | 106 ++++++++----------
 test/ELF/ppc64-local-dynamic.s                |   2 +-
 test/ELF/ppc64-local-entry.s                  |  47 ++++++++
 test/ELF/ppc64-long-branch-init.s             |  43 +++++++
 test/ELF/ppc64-plt-stub.s                     |  11 +-
 test/ELF/ppc64-rel-calls.s                    |   5 +-
 test/ELF/ppc64-toc-restore-recursive-call.s   |  17 ++-
 test/ELF/ppc64-toc-restore.s                  |  41 +++----
 wasm/OutputSections.cpp                       |  12 +-
 30 files changed, 433 insertions(+), 209 deletions(-)
 create mode 100644 test/COFF/guardcf-thunk.s
 create mode 100644 test/ELF/compressed-input-alignment.test
 create mode 100644 test/ELF/emit-relocs-mergeable2.s
 create mode 100644 test/ELF/ppc64-local-entry.s
 create mode 100644 test/ELF/ppc64-long-branch-init.s

diff --git a/COFF/Writer.cpp b/COFF/Writer.cpp
index 6acfaf9a445..56b797451cf 100644
--- a/COFF/Writer.cpp
+++ b/COFF/Writer.cpp
@@ -1351,19 +1351,47 @@ static void addSymbolToRVASet(SymbolRVASet &RVASet, Defined *S) {
 // symbol in an executable section.
 static void maybeAddAddressTakenFunction(SymbolRVASet &AddressTakenSyms,
                                          Symbol *S) {
-  auto *D = dyn_cast_or_null<DefinedCOFF>(S);
-
-  // Ignore undefined symbols and references to non-functions (e.g. globals and
-  // labels).
-  if (!D ||
-      D->getCOFFSymbol().getComplexType() != COFF::IMAGE_SYM_DTYPE_FUNCTION)
+  if (!S)
     return;
 
-  // Mark the symbol as address taken if it's in an executable section.
-  Chunk *RefChunk = D->getChunk();
-  OutputSection *OS = RefChunk ? RefChunk->getOutputSection() : nullptr;
-  if (OS && OS->Header.Characteristics & IMAGE_SCN_MEM_EXECUTE)
-    addSymbolToRVASet(AddressTakenSyms, D);
+  switch (S->kind()) {
+  case Symbol::DefinedLocalImportKind:
+  case Symbol::DefinedImportDataKind:
+    // Defines an __imp_ pointer, so it is data, so it is ignored.
+    break;
+  case Symbol::DefinedCommonKind:
+    // Common is always data, so it is ignored.
+    break;
+  case Symbol::DefinedAbsoluteKind:
+  case Symbol::DefinedSyntheticKind:
+    // Absolute is never code, synthetic generally isn't and usually isn't
+    // determinable.
+    break;
+  case Symbol::LazyKind:
+  case Symbol::UndefinedKind:
+    // Undefined symbols resolve to zero, so they don't have an RVA. Lazy
+    // symbols shouldn't have relocations.
+    break;
+
+  case Symbol::DefinedImportThunkKind:
+    // Thunks are always code, include them.
+    addSymbolToRVASet(AddressTakenSyms, cast<Defined>(S));
+    break;
+
+  case Symbol::DefinedRegularKind: {
+    // This is a regular, defined, symbol from a COFF file. Mark the symbol as
+    // address taken if the symbol type is function and it's in an executable
+    // section.
+    auto *D = cast<DefinedRegular>(S);
+    if (D->getCOFFSymbol().getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION) {
+      Chunk *RefChunk = D->getChunk();
+      OutputSection *OS = RefChunk ? RefChunk->getOutputSection() : nullptr;
+      if (OS && OS->Header.Characteristics & IMAGE_SCN_MEM_EXECUTE)
+        addSymbolToRVASet(AddressTakenSyms, D);
+    }
+    break;
+  }
+  }
 }
 
 // Visit all relocations from all section contributions of this object file and
diff --git a/ELF/Arch/PPC64.cpp b/ELF/Arch/PPC64.cpp
index 8a320c9a4e9..cbfa8073d33 100644
--- a/ELF/Arch/PPC64.cpp
+++ b/ELF/Arch/PPC64.cpp
@@ -113,6 +113,7 @@ class PPC64 final : public TargetInfo {
   void writeGotHeader(uint8_t *Buf) const override;
   bool needsThunk(RelExpr Expr, RelType Type, const InputFile *File,
                   uint64_t BranchAddr, const Symbol &S) const override;
+  uint32_t getThunkSectionSpacing() const override;
   bool inBranchRange(RelType Type, uint64_t Src, uint64_t Dst) const override;
   RelExpr adjustRelaxExpr(RelType Type, const uint8_t *Data,
                           RelExpr Expr) const override;
@@ -759,6 +760,14 @@ bool PPC64::needsThunk(RelExpr Expr, RelType Type, const InputFile *File,
   return !inBranchRange(Type, BranchAddr, S.getVA());
 }
 
+uint32_t PPC64::getThunkSectionSpacing() const {
+  // See comment in Arch/ARM.cpp for a more detailed explanation of
+  // getThunkSectionSpacing(). For PPC64 we pick the constant here based on
+  // R_PPC64_REL24, which is used by unconditional branch instructions.
+  // 0x2000000 = (1 << (24 - 1)) * 4
+  return 0x2000000;
+}
+
 bool PPC64::inBranchRange(RelType Type, uint64_t Src, uint64_t Dst) const {
   int64_t Offset = Dst - Src;
   if (Type == R_PPC64_REL14)
diff --git a/ELF/InputSection.cpp b/ELF/InputSection.cpp
index 839bff7011e..ca2f49c07bb 100644
--- a/ELF/InputSection.cpp
+++ b/ELF/InputSection.cpp
@@ -248,6 +248,7 @@ void InputSectionBase::parseCompressedHeader() {
     }
 
     UncompressedSize = Hdr->ch_size;
+    Alignment = std::max<uint64_t>(Hdr->ch_addralign, 1);
     RawData = RawData.slice(sizeof(*Hdr));
     return;
   }
@@ -265,6 +266,7 @@ void InputSectionBase::parseCompressedHeader() {
   }
 
   UncompressedSize = Hdr->ch_size;
+  Alignment = std::max<uint64_t>(Hdr->ch_addralign, 1);
   RawData = RawData.slice(sizeof(*Hdr));
 }
 
@@ -578,10 +580,6 @@ static int64_t getTlsTpOffset() {
     // Variant 1. The thread pointer points to a TCB with a fixed 2-word size,
     // followed by a variable amount of alignment padding, followed by the TLS
     // segment.
-    //
-    // NB: While the ARM/AArch64 ABI formally has a 2-word TCB size, lld
-    // effectively increases the TCB size to 8 words for Android compatibility.
-    // It accomplishes this by increasing the segment's alignment.
     return alignTo(Config->Wordsize * 2, Out::TlsPhdr->p_align);
   case EM_386:
   case EM_X86_64:
diff --git a/ELF/SyntheticSections.cpp b/ELF/SyntheticSections.cpp
index b1a3f8bc70a..10675588ebe 100644
--- a/ELF/SyntheticSections.cpp
+++ b/ELF/SyntheticSections.cpp
@@ -2001,6 +2001,11 @@ template <class ELFT> void SymbolTableSection<ELFT>::writeTo(uint8_t *Buf) {
       ESym->setVisibility(Sym->Visibility);
     }
 
+    // The 3 most significant bits of st_other are used by OpenPOWER ABI.
+    // See getPPC64GlobalEntryToLocalEntryOffset() for more details.
+    if (Config->EMachine == EM_PPC64)
+      ESym->st_other |= Sym->StOther & 0xe0;
+
     ESym->st_name = Ent.StrTabOffset;
     ESym->st_shndx = getSymSectionIndex(Ent.Sym);
 
diff --git a/ELF/Writer.cpp b/ELF/Writer.cpp
index 17f4c7961d3..36ba0253263 100644
--- a/ELF/Writer.cpp
+++ b/ELF/Writer.cpp
@@ -547,6 +547,11 @@ static bool shouldKeepInSymtab(SectionBase *Sec, StringRef SymName,
   if (Config->Discard == DiscardPolicy::None)
     return true;
 
+  // If --emit-relocs is given, all symbols including local ones need to be
+  // copied because they may be referenced by relocations.
+  if (Config->EmitRelocs)
+    return true;
+
   // In ELF assembly .L symbols are normally discarded by the assembler.
   // If the assembler fails to do so, the linker discards them if
   // * --discard-locals is used.
@@ -2192,17 +2197,6 @@ template <class ELFT> void Writer<ELFT>::setPhdrs() {
     }
 
     if (P->p_type == PT_TLS && P->p_memsz) {
-      if (!Config->Shared &&
-          (Config->EMachine == EM_ARM || Config->EMachine == EM_AARCH64)) {
-        // On ARM/AArch64, reserve extra space (8 words) between the thread
-        // pointer and an executable's TLS segment by overaligning the segment.
-        // This reservation is needed for backwards compatibility with Android's
-        // TCB, which allocates several slots after the thread pointer (e.g.
-        // TLS_SLOT_STACK_GUARD==5). For simplicity, this overalignment is also
-        // done on other operating systems.
-        P->p_align = std::max<uint64_t>(P->p_align, Config->Wordsize * 8);
-      }
-
       // The TLS pointer goes after PT_TLS for variant 2 targets. At least glibc
       // will align it, so round up the size to make sure the offsets are
       // correct.
diff --git a/test/COFF/guardcf-thunk.s b/test/COFF/guardcf-thunk.s
new file mode 100644
index 00000000000..0969c580ac9
--- /dev/null
+++ b/test/COFF/guardcf-thunk.s
@@ -0,0 +1,43 @@
+# REQUIRES: x86
+
+# Make a DLL that exports exportfn1.
+# RUN: yaml2obj < %p/Inputs/export.yaml > %t.obj
+# RUN: lld-link /out:%t.dll /dll %t.obj /export:exportfn1 /implib:%t.lib
+
+# Make an obj that takes the address of that exported function.
+# RUN: llvm-mc -filetype=obj -triple=x86_64-windows-msvc %s -o %t2.obj
+# RUN: lld-link -entry:main -guard:cf %t2.obj %t.lib -nodefaultlib -out:%t.exe
+# RUN: llvm-readobj -coff-load-config %t.exe | FileCheck %s
+
+# Check that the gfids table contains *exactly* two entries, one for exportfn1
+# and one for main.
+# CHECK: GuardFidTable [
+# CHECK-NEXT: 0x{{[0-9A-Fa-f]+0$}}
+# CHECK-NEXT: 0x{{[0-9A-Fa-f]+0$}}
+# CHECK-NEXT: ]
+
+
+        .def     @feat.00;
+        .scl    3;
+        .type   0;
+        .endef
+        .globl  @feat.00
+@feat.00 = 0x001
+
+        .section .text,"rx"
+        .def     main; .scl    2; .type   32; .endef
+        .global main
+main:
+        leaq exportfn1(%rip), %rax
+        retq
+
+        .section .rdata,"dr"
+.globl _load_config_used
+_load_config_used:
+        .long 256
+        .fill 124, 1, 0
+        .quad __guard_fids_table
+        .quad __guard_fids_count
+        .long __guard_flags
+        .fill 128, 1, 0
+
diff --git a/test/ELF/aarch64-cortex-a53-843419-tlsrelax.s b/test/ELF/aarch64-cortex-a53-843419-tlsrelax.s
index 2db5c7e36bb..bff72d3725f 100644
--- a/test/ELF/aarch64-cortex-a53-843419-tlsrelax.s
+++ b/test/ELF/aarch64-cortex-a53-843419-tlsrelax.s
@@ -26,9 +26,9 @@ _start:
 // CHECK: _start:
 // CHECK-NEXT:   210ff8:        41 d0 3b d5     mrs     x1, TPIDR_EL0
 // CHECK-NEXT:   210ffc:        00 00 a0 d2     movz    x0, #0, lsl #16
-// CHECK-NEXT:   211000:        01 08 80 f2     movk    x1, #64
+// CHECK-NEXT:   211000:        01 02 80 f2     movk    x1, #16
 // CHECK-NEXT:   211004:        00 00 a0 d2     movz    x0, #0, lsl #16
-// CHECK-NEXT:   211008:        01 08 80 f2     movk    x1, #64
+// CHECK-NEXT:   211008:        01 02 80 f2     movk    x1, #16
 // CHECK-NEXT:   21100c:        c0 03 5f d6     ret
 
  .type  v,@object
diff --git a/test/ELF/aarch64-tls-gdle.s b/test/ELF/aarch64-tls-gdle.s
index 882ec8c1ae1..19fdc1d35c5 100644
--- a/test/ELF/aarch64-tls-gdle.s
+++ b/test/ELF/aarch64-tls-gdle.s
@@ -9,11 +9,11 @@
 #RELOC:      Relocations [
 #RELOC-NEXT: ]
 
-# TCB size = 64 and foo is first element from TLS register.
+# TCB size = 16 and foo is first element from TLS register.
 # CHECK: Disassembly of section .text:
 # CHECK: _start:
 # CHECK:  210000:	00 00 a0 d2	movz	x0, #0, lsl #16
-# CHECK:  210004:	00 08 80 f2 	movk	x0, #64
+# CHECK:  210004:	00 02 80 f2 	movk	x0, #16
 # CHECK:  210008:	1f 20 03 d5 	nop
 # CHECK:  21000c:	1f 20 03 d5 	nop
 
diff --git a/test/ELF/aarch64-tls-iele.s b/test/ELF/aarch64-tls-iele.s
index 0229d6676cd..9fec4ee7dc9 100644
--- a/test/ELF/aarch64-tls-iele.s
+++ b/test/ELF/aarch64-tls-iele.s
@@ -9,13 +9,13 @@
 # RELOC:      Relocations [
 # RELOC-NEXT: ]
 
-# TCB size = 64 and foo is first element from TLS register.
+# TCB size = 16 and foo is first element from TLS register.
 # CHECK: Disassembly of section .text:
 # CHECK: _start:
 # CHECK-NEXT: 210000:  00 00 a0 d2   movz   x0, #0, lsl #16
-# CHECK-NEXT: 210004:  80 08 80 f2   movk   x0, #68
+# CHECK-NEXT: 210004:  80 02 80 f2   movk   x0, #20
 # CHECK-NEXT: 210008:  00 00 a0 d2   movz   x0, #0, lsl #16
-# CHECK-NEXT: 21000c:  00 08 80 f2   movk   x0, #64
+# CHECK-NEXT: 21000c:  00 02 80 f2   movk   x0, #16
 
 .section .tdata
 .align 2
diff --git a/test/ELF/aarch64-tls-le.s b/test/ELF/aarch64-tls-le.s
index 49c322facb1..eda13758656 100644
--- a/test/ELF/aarch64-tls-le.s
+++ b/test/ELF/aarch64-tls-le.s
@@ -17,12 +17,12 @@ _start:
  add x0, x0, :tprel_hi12:v2
  add x0, x0, :tprel_lo12_nc:v2
 
-# TCB size = 64 and foo is first element from TLS register.
+# TCB size = 16 and foo is first element from TLS register.
 #CHECK: Disassembly of section .text:
 #CHECK: _start:
 #CHECK:  210000: 40 d0 3b d5     mrs     x0, TPIDR_EL0
 #CHECK:  210004: 00 00 40 91     add     x0, x0, #0, lsl #12
-#CHECK:  210008: 00 00 01 91     add     x0, x0, #64
+#CHECK:  210008: 00 40 00 91     add     x0, x0, #16
 #CHECK:  21000c: 40 d0 3b d5     mrs     x0, TPIDR_EL0
 #CHECK:  210010: 00 fc 7f 91     add     x0, x0, #4095, lsl #12
 #CHECK:  210014: 00 e0 3f 91     add     x0, x0, #4088
@@ -36,9 +36,9 @@ v1:
 .word  0
 .size  v1, 4
 
-# The current offset from the thread pointer is 68. Raise it to just below the
+# The current offset from the thread pointer is 20. Raise it to just below the
 # 24-bit limit.
-.space (0xfffff8 - 68)
+.space (0xfffff8 - 20)
 
 .type   v2,@object
 .globl  v2
diff --git a/test/ELF/aarch64-tlsld-ldst.s b/test/ELF/aarch64-tlsld-ldst.s
index 8ebdc2f152a..3144ca5d99a 100644
--- a/test/ELF/aarch64-tlsld-ldst.s
+++ b/test/ELF/aarch64-tlsld-ldst.s
@@ -26,27 +26,27 @@ _start:  mrs x8, TPIDR_EL0
 
 // CHECK: _start:
 // CHECK-NEXT:    210000:       48 d0 3b d5     mrs     x8, TPIDR_EL0
-// 0x0 + c40 = 0xc40       = tcb (64-bytes) + var0
-// CHECK-NEXT:    210004:       08 01 40 91     add x8, x8, #0, lsl #12
-// CHECK-NEXT:    210008:       14 11 c3 3d     ldr q20, [x8, #3136]
-// 0x1000 + 0x850 = 0x1850 = tcb + var1
-// CHECK-NEXT:    21000c:       08 05 40 91     add x8, x8, #1, lsl #12
-// CHECK-NEXT:    210010:       00 29 44 f9     ldr x0, [x8, #2128]
-// 0x2000 + 0x458 = 0x2458 = tcb + var2
-// CHECK-NEXT:    210014:       08 09 40 91     add x8, x8, #2, lsl #12
-// CHECK-NEXT:    210018:       00 59 44 b9     ldr w0, [x8, #1112]
-// 0x3000 + 0x5c  = 0x305c = tcb + var3
-// CHECK-NEXT:    21001c:       08 0d 40 91     add x8, x8, #3, lsl #12
-// CHECK-NEXT:    210020:       00 b9 40 79     ldrh  w0, [x8, #92]
-// 0x3000 + 0xc5e = 0x3c5e = tcb + var4
-// CHECK-NEXT:    210024:       08 0d 40 91     add x8, x8, #3, lsl #12
-// CHECK-NEXT:    210028:       00 79 71 39     ldrb  w0, [x8, #3166]
+// 0x0 + 0xc10 = 0xc10     = tcb (16 bytes) + var0
+// CHECK-NEXT:    210004:       08 01 40 91     add     x8, x8, #0, lsl #12
+// CHECK-NEXT:    210008:       14 05 c3 3d     ldr     q20, [x8, #3088]
+// 0x1000 + 0x820 = 0x1820 = tcb + var1
+// CHECK-NEXT:    21000c:       08 05 40 91     add     x8, x8, #1, lsl #12
+// CHECK-NEXT:    210010:       00 11 44 f9     ldr     x0, [x8, #2080]
+// 0x2000 + 0x428 = 0x2428 = tcb + var2
+// CHECK-NEXT:    210014:       08 09 40 91     add     x8, x8, #2, lsl #12
+// CHECK-NEXT:    210018:       00 29 44 b9     ldr     w0, [x8, #1064]
+// 0x3000 + 0x2c  = 0x302c = tcb + var3
+// CHECK-NEXT:    21001c:       08 0d 40 91     add     x8, x8, #3, lsl #12
+// CHECK-NEXT:    210020:       00 59 40 79     ldrh    w0, [x8, #44]
+// 0x3000 + 0xc2e = 0x3c2e = tcb + var4
+// CHECK-NEXT:    210024:       08 0d 40 91     add     x8, x8, #3, lsl #12
+// CHECK-NEXT:    210028:       00 b9 70 39     ldrb    w0, [x8, #3118]
 
-// CHECK-SYMS:      0000000000000c00    16 TLS     GLOBAL DEFAULT    2 var0
-// CHECK-SYMS-NEXT: 0000000000001810     8 TLS     GLOBAL DEFAULT    2 var1
-// CHECK-SYMS-NEXT: 0000000000002418     4 TLS     GLOBAL DEFAULT    2 var2
-// CHECK-SYMS-NEXT: 000000000000301c     2 TLS     GLOBAL DEFAULT    2 var3
-// CHECK-SYMS-NEXT: 0000000000003c1e     1 TLS     GLOBAL DEFAULT    2 var4
+// CHECK-SYMS:      0000000000000c00     0 TLS     GLOBAL DEFAULT    2 var0
+// CHECK-SYMS-NEXT: 0000000000001810     4 TLS     GLOBAL DEFAULT    2 var1
+// CHECK-SYMS-NEXT: 0000000000002418     2 TLS     GLOBAL DEFAULT    2 var2
+// CHECK-SYMS-NEXT: 000000000000301c     1 TLS     GLOBAL DEFAULT    2 var3
+// CHECK-SYMS-NEXT: 0000000000003c1e     0 TLS     GLOBAL DEFAULT    2 var4
 
         .globl var0
         .globl var1
@@ -59,12 +59,12 @@ _start:  mrs x8, TPIDR_EL0
         .type var3,@object
 
 .section .tbss,"awT",@nobits
-        .balign 64
+        .balign 16
         .space 1024 * 3
 var0:
         .quad 0
         .quad 0
-        .size var0, 16
+        .size var1, 16
         .space 1024 * 3
 var1:
         .quad 0
@@ -72,14 +72,14 @@ var1:
         .space 1024 * 3
 var2:
         .word 0
-        .size var2, 4
+        .size var1, 4
 
         .space 1024 * 3
 var3:
         .hword 0
-        .size var3, 2
+        .size var2, 2
         .space 1024 * 3
 var4:
         .byte 0
-        .size var4, 1
+        .size var3, 1
         .space 1024 * 3
diff --git a/test/ELF/arm-tls-le32.s b/test/ELF/arm-tls-le32.s
index f9a5fa9b2fc..7834dedf1be 100644
--- a/test/ELF/arm-tls-le32.s
+++ b/test/ELF/arm-tls-le32.s
@@ -69,9 +69,9 @@ x:
 
 // CHECK: Disassembly of section .text:
 // CHECK-NEXT: _start:
-// offset of x from Thread pointer = (TcbSize + 0x0 = 0x20)
-// CHECK-NEXT:   11000:         20 00 00 00
-// offset of z from Thread pointer = (TcbSize + 0x8 = 0x28)
-// CHECK-NEXT:   11004:         28 00 00 00
-// offset of y from Thread pointer = (TcbSize + 0x4 = 0x24)
-// CHECK-NEXT:   11008:         24 00 00 00
+// offset of x from Thread pointer = (TcbSize + 0x0 = 0x8)
+// CHECK-NEXT:   11000:         08 00 00 00
+// offset of z from Thread pointer = (TcbSize + 0x8 = 0x10)
+// CHECK-NEXT:   11004:         10 00 00 00
+// offset of y from Thread pointer = (TcbSize + 0x4 = 0xc)
+// CHECK-NEXT:   11008:         0c 00 00 00
diff --git a/test/ELF/arm-tls-norelax-ie-le.s b/test/ELF/arm-tls-norelax-ie-le.s
index 11c3e4f5dc1..4a52f547f0a 100644
--- a/test/ELF/arm-tls-norelax-ie-le.s
+++ b/test/ELF/arm-tls-norelax-ie-le.s
@@ -37,5 +37,5 @@ x2:
  .type x2, %object
 
 // CHECK: Contents of section .got:
-// x1 at offset 0x20 from TP, x2 at offset 0x24 from TP. Offsets include TCB size of 0x20
-// CHECK-NEXT: 13064 20000000 24000000
+// x1 at offset 8 from TP, x2 at offset 0xc from TP. Offsets include TCB size of 8
+// CHECK-NEXT: 13064 08000000 0c000000
diff --git a/test/ELF/compressed-input-alignment.test b/test/ELF/compressed-input-alignment.test
new file mode 100644
index 00000000000..a1f679034b7
--- /dev/null
+++ b/test/ELF/compressed-input-alignment.test
@@ -0,0 +1,67 @@
+# REQUIRES: zlib, x86
+
+# RUN: yaml2obj -docnum=1 %s -o %t.o
+# RUN: ld.lld %t.o %t.o -o %t2
+# RUN: llvm-readobj -sections -section-data %t2 | FileCheck %s
+
+# RUN: yaml2obj -docnum=2 %s -o %t.o
+# RUN: ld.lld %t.o %t.o -o %t2
+# RUN: llvm-readobj -sections -section-data %t2 | FileCheck %s
+
+# CHECK:        Name: .debug_info
+# CHECK-NEXT:   Type: SHT_PROGBITS
+# CHECK-NEXT:   Flags [
+# CHECK-NEXT:   ]
+# CHECK-NEXT:   Address: 0x0
+# CHECK-NEXT:   Offset: 0xE8
+# CHECK-NEXT:   Size: 108
+# CHECK-NEXT:   Link: 0
+# CHECK-NEXT:   Info: 0
+# CHECK-NEXT:   AddressAlignment: 1
+# CHECK-NEXT:   EntrySize: 0
+# CHECK-NEXT:   SectionData (
+# CHECK-NEXT:     0000: {{.*}} |ABCDEFGHIJKLMNOP|
+# CHECK-NEXT:     0010: {{.*}} |QRSTUVWXYZ.ABCDE|
+# CHECK-NEXT:     0020: {{.*}} |FGHIJKLMNOPQRSTU|
+# CHECK-NEXT:     0030: {{.*}} |VWXYZ.ABCDEFGHIJ|
+# CHECK-NEXT:     0040: {{.*}} |KLMNOPQRSTUVWXYZ|
+# CHECK-NEXT:     0050: {{.*}} |.ABCDEFGHIJKLMNO|
+# CHECK-NEXT:     0060: {{.*}} |PQRSTUVWXYZ.|
+# CHECK-NEXT:   )
+# CHECK-NEXT: }
+
+## YAML below is produced from the following code. AddressAlign of .debug_info is 8,
+## while compressed header has ch_addralign = 1. LLD had a bug and did not use the
+## value of ch_addralign at all. We produced broken section content.
+##
+## .section .debug_info,"",@progbits
+##  .string "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+##  .string "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+
+--- !ELF
+FileHeader:      
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .debug_info
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_COMPRESSED ]
+    AddressAlign:    0x0000000000000008
+    Content:         010000000000000036000000000000000100000000000000789C73747276717573F7F0F4F2F6F1F5F30F080C0A0E090D0B8F888C6270C42D0500ADA00FBF
+
+## YAML below is the same as above, with a single change: ch_addralign field of the compressed
+## header was set to 0. This is allowed by the standard, so we have to support it.
+--- !ELF
+FileHeader:      
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_X86_64
+Sections:
+  - Name:            .debug_info
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_COMPRESSED ]
+    AddressAlign:    0x0000000000000008
+    Content:         010000000000000036000000000000000000000000000000789C73747276717573F7F0F4F2F6F1F5F30F080C0A0E090D0B8F888C6270C42D0500ADA00FBF
diff --git a/test/ELF/eh-frame-hdr-augmentation.s b/test/ELF/eh-frame-hdr-augmentation.s
index 934f9200a27..c51ea1c4976 100644
--- a/test/ELF/eh-frame-hdr-augmentation.s
+++ b/test/ELF/eh-frame-hdr-augmentation.s
@@ -11,7 +11,7 @@
 // CHECK-NEXT:   Code alignment factor: 1
 // CHECK-NEXT:   Data alignment factor: -8
 // CHECK-NEXT:   Return address column: 16
-// CHECK-NEXT:   Personality Address: 00000dad
+// CHECK-NEXT:   Personality Address: 0000000000000dad
 // CHECK-NEXT:   Augmentation data:
 
 // CHECK:      DW_CFA_def_cfa:  reg7 +8
@@ -20,7 +20,7 @@
 // CHECK-NEXT: DW_CFA_nop:
 
 // CHECK:      00000020 00000014 00000024 FDE cie=00000024 pc=00000d98...00000d98
-// CHECK-NEXT:   LSDA Address: 00000d8f
+// CHECK-NEXT:   LSDA Address: 0000000000000d8f
 // CHECK-NEXT:   DW_CFA_nop:
 // CHECK-NEXT:   DW_CFA_nop:
 // CHECK-NEXT:   DW_CFA_nop:
diff --git a/test/ELF/emit-relocs-mergeable2.s b/test/ELF/emit-relocs-mergeable2.s
new file mode 100644
index 00000000000..22d4cf4238a
--- /dev/null
+++ b/test/ELF/emit-relocs-mergeable2.s
@@ -0,0 +1,14 @@
+# REQUIRES: x86
+# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o
+# RUN: ld.lld --emit-relocs %t.o -o %t.exe
+# RUN: llvm-readelf --relocations %t.exe | FileCheck %s
+
+# CHECK: 0000000000201004  000000010000000b R_X86_64_32S 0000000000200120 .Lfoo + 8
+
+.globl  _start
+_start:
+  movq .Lfoo+8, %rax
+.section .rodata.cst16,"aM",@progbits,16
+.Lfoo:
+  .quad 0
+  .quad 0
diff --git a/test/ELF/gc-sections-metadata-startstop.s b/test/ELF/gc-sections-metadata-startstop.s
index ede1899698c..0a1efb7e607 100644
--- a/test/ELF/gc-sections-metadata-startstop.s
+++ b/test/ELF/gc-sections-metadata-startstop.s
@@ -11,7 +11,7 @@
 # CHECK-NOT: yy
 
 # CHECK: SYMBOL TABLE:
-# CHECK: xx 00000000 __start_xx
+# CHECK:   xx    00000000 .protected __start_xx
 # CHECK: w *UND* 00000000 __start_yy
 
 .weak __start_xx
diff --git a/test/ELF/mips-micro-relocs.s b/test/ELF/mips-micro-relocs.s
index b539aa94676..566f6810f6e 100644
--- a/test/ELF/mips-micro-relocs.s
+++ b/test/ELF/mips-micro-relocs.s
@@ -39,9 +39,9 @@
 # EL-NEXT:      20028:       00 00 00 00     nop
 # EL-NEXT:      2002c:       00 94 e8 ff     b       -44
 
-# SYM: 00037ff0         .got            00000000 .hidden _gp
-# SYM: 00020000 g F     .text           00000000 foo
-# SYM: 00020010         .text           00000000 __start
+# SYM: 00037ff0         .got   00000000 .hidden _gp
+# SYM: 00020000 g F     .text  00000000 0x80 foo
+# SYM: 00020010         .text  00000000 0x80 __start
 
   .text
   .set micromips
diff --git a/test/ELF/mips-micror6-relocs.s b/test/ELF/mips-micror6-relocs.s
index ca2c1c064f5..fb450ba5cf3 100644
--- a/test/ELF/mips-micror6-relocs.s
+++ b/test/ELF/mips-micror6-relocs.s
@@ -26,8 +26,8 @@
 # EL-NEXT:    20014:  7f 80 f6 ff  beqzc  $3, -36
 # EL-NEXT:    20018:  ff b7 f4 ff  balc   -24 <foo>
 
-# SYM: 00020000 g F     .text           00000000 foo
-# SYM: 00020010         .text           00000000 __start
+# SYM: 00020000 g F     .text  00000000 0x80 foo
+# SYM: 00020010         .text  00000000 0x80 __start
 
   .text
   .set micromips
diff --git a/test/ELF/ppc64-bsymbolic-toc-restore.s b/test/ELF/ppc64-bsymbolic-toc-restore.s
index 49d347c4899..d467d22ff7b 100644
--- a/test/ELF/ppc64-bsymbolic-toc-restore.s
+++ b/test/ELF/ppc64-bsymbolic-toc-restore.s
@@ -53,7 +53,7 @@ caller:
 # CHECK-LABEL: caller
 # CHECK:         bl .+44
 # CHECK-NEXT:    mr 31, 3
-# CHECK-NEXT:    bl .+67108816
+# CHECK-NEXT:    bl .+44
 # CHECK-NEXT:    ld 2, 24(1)
 # CHECK-NEXT:    add 3, 3, 31
 # CHECK-NEXT:    addi 1, 1, 32
@@ -63,6 +63,6 @@ caller:
 # CHECK-EMPTY:
 # CHECK-NEXT:  def:
 # CHECK-NEXT:    addis 2, 12, 2
-# CHECK-NEXT:    addi 2, 2, -32636
+# CHECK-NEXT:    addi 2, 2, -32616
 # CHECK-NEXT:    li 3, 55
 # CHECK-NEXT:    blr
diff --git a/test/ELF/ppc64-call-reach.s b/test/ELF/ppc64-call-reach.s
index a02bfa82993..b843e7e531c 100644
--- a/test/ELF/ppc64-call-reach.s
+++ b/test/ELF/ppc64-call-reach.s
@@ -3,16 +3,16 @@
 # RUN: llvm-mc -filetype=obj -triple=powerpc64le-unknown-linux %s -o %t.o
 # RUN: ld.lld --defsym callee=0x12010010 --defsym tail_callee=0x12010020 \
 # RUN: %t.o -o %t
-# RUN: llvm-objdump -d %t | FileCheck %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
 # RUN: ld.lld --defsym callee=0x12010010 --defsym tail_callee=0x12010020 \
 # RUN: %t.o -o %t
-# RUN: llvm-objdump -d %t | FileCheck %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
 # RUN: ld.lld --defsym callee=0xE010014 --defsym tail_callee=0xE010024 \
 # RUN: %t.o -o %t
-# RUN: llvm-objdump -d %t | FileCheck --check-prefix=NEGOFFSET  %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=NEGOFFSET  %s
 # RUN: ld.lld --defsym callee=0x12010018 --defsym tail_callee=0x12010028 \
 # RUN: %t.o -o %t
-# RUN: llvm-objdump -d %t | FileCheck --check-prefix=THUNK %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=THUNK %s
 # RUN: llvm-readelf --sections %t | FileCheck --check-prefix=BRANCHLT %s
 # RUN: not ld.lld --defsym callee=0x1001002D --defsym tail_callee=0x1001002F \
 # RUN: %t.o -o %t 2>&1 | FileCheck --check-prefix=MISSALIGNED %s
@@ -20,16 +20,16 @@
 # RUN: llvm-mc -filetype=obj -triple=powerpc64-unknown-linux %s -o %t.o
 # RUN: ld.lld --defsym callee=0x12010010 --defsym tail_callee=0x12010020 \
 # RUN: %t.o -o %t
-# RUN: llvm-objdump -d %t | FileCheck %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
 # RUN: ld.lld --defsym callee=0x12010010 --defsym tail_callee=0x12010020 \
 # RUN: %t.o -o %t
-# RUN: llvm-objdump -d %t | FileCheck %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
 # RUN: ld.lld --defsym callee=0xE010014 --defsym tail_callee=0xE010024 \
 # RUN: %t.o -o %t
-# RUN: llvm-objdump -d %t | FileCheck --check-prefix=NEGOFFSET  %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=NEGOFFSET  %s
 # RUN: ld.lld --defsym callee=0x12010018 --defsym tail_callee=0x12010028 \
 # RUN: %t.o -o %t
-# RUN: llvm-objdump -d %t | FileCheck --check-prefix=THUNK %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=THUNK %s
 # RUN: llvm-readelf --sections %t | FileCheck --check-prefix=BRANCHLT %s
 # RUN: not ld.lld --defsym callee=0x1001002D --defsym tail_callee=0x1001002F \
 # RUN: %t.o -o %t 2>&1 | FileCheck --check-prefix=MISSALIGNED %s
@@ -58,37 +58,33 @@ test:
 # Check that we are branching to the definitions, and not range-extending
 # thunks.
 # CHECK-LABEL: test
-# CHECK:  10010014: {{.*}}  bl .+33554428
-# CHECK:  10010024: {{.*}}  b  .+33554428
+# CHECK:  10010014:       bl .+33554428
+# CHECK:  10010024:       b  .+33554428
 
 # NEGOFFSET-LABEL: test
-# NEGOFFSET:  10010014: {{.*}}  bl .+33554432
-# NEGOFFSET:  10010024: {{.*}}  b  .+33554432
+# NEGOFFSET:  10010014:       bl .-33554432
+# NEGOFFSET:  10010024:       b  .+33554432
+
+# THUNK-LABEL: test:
+# THUNK: 10010014:       bl .+20
+# THUNK: 10010024:       b .+20
 
 # .branch_lt[0]
 # THUNK-LABEL: __long_branch_callee:
-# THUNK-NEXT: 10010000: {{.*}} addis 12, 2, -1
+# THUNK-NEXT: 10010028:        addis 12, 2, -1
 # THUNK-NEXT:                  ld 12, -32768(12)
 # THUNK-NEXT:                  mtctr 12
 # THUNK-NEXT:                  bctr
 
 # .branch_lt[1]
 # THUNK-LABEL: __long_branch_tail_callee:
-# THUNK-NEXT: 10010010: {{.*}} addis 12, 2, -1
+# THUNK-NEXT: 10010038:        addis 12, 2, -1
 # THUNK-NEXT:                  ld 12, -32760(12)
 # THUNK-NEXT:                  mtctr 12
 # THUNK-NEXT:                  bctr
 
-# Each call now branches to a thunk, and although it is printed as positive
-# the offset is interpreted as a signed 26 bit value so 67108812 is actually
-# -52.
-# THUNK-LABEL: test:
-# THUNK: 10010034: {{.*}}  bl .+67108812
-# THUNK: 10010044: {{.*}}  b .+67108812
-
 # The offset from the TOC to the .branch_lt section  is (-1 << 16) - 32768.
 #                Name             Type            Address          Off    Size
 # BRANCHLT:     .branch_lt        PROGBITS        0000000010020000 020000 000010
 # BRANCHLT:     .got              PROGBITS        0000000010030000 030000 000008
 # BRANCHLT-NOT: .plt
-
diff --git a/test/ELF/ppc64-ifunc.s b/test/ELF/ppc64-ifunc.s
index 6f2d3318b9c..32e317f3c05 100644
--- a/test/ELF/ppc64-ifunc.s
+++ b/test/ELF/ppc64-ifunc.s
@@ -1,79 +1,67 @@
 # REQUIRES: ppc
 
 # RUN: llvm-mc -filetype=obj -triple=powerpc64le-unknown-linux %s -o %t.o
-# RUN: llvm-mc -filetype=obj -triple=powerpc64le-unknown-linux %p/Inputs/shared-ppc64.s -o %t2.o
-# RUN: ld.lld -shared %t2.o -o %t2.so
-# RUN: ld.lld %t.o %t2.so -o %t
-# RUN: llvm-objdump -D %t | FileCheck %s
-# RUN: llvm-readelf -dynamic-table %t | FileCheck --check-prefix=DT %s
-# RUN: llvm-readelf -dyn-relocations %t | FileCheck --check-prefix=DYNREL %s
+# RUN: ld.lld %t.o -o %t
+# RUN: llvm-nm %t | FileCheck --check-prefix=NM %s
+# RUN: llvm-readelf -S %t | FileCheck --check-prefix=SECTIONS %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
+# RUN: llvm-readelf -r %t | FileCheck --check-prefix=DYNREL %s
 
 # RUN: llvm-mc -filetype=obj -triple=powerpc64-unknown-linux %s -o %t.o
-# RUN: llvm-mc -filetype=obj -triple=powerpc64-unknown-linux %p/Inputs/shared-ppc64.s -o %t2.o
-# RUN: ld.lld -shared %t2.o -o %t2.so
-# RUN: ld.lld %t.o %t2.so -o %t
-# RUN: llvm-objdump -D %t | FileCheck %s
-# RUN: llvm-readelf -dynamic-table %t | FileCheck --check-prefix=DT %s
-# RUN: llvm-readelf -dyn-relocations %t | FileCheck --check-prefix=DYNREL %s
+# RUN: ld.lld %t.o -o %t
+# RUN: llvm-nm %t | FileCheck --check-prefix=NM %s
+# RUN: llvm-readelf -S %t | FileCheck --check-prefix=SECTIONS %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
+# RUN: llvm-readelf -r %t | FileCheck --check-prefix=DYNREL %s
 
-# CHECK: Disassembly of section .text:
+# NM-DAG: 0000000010028000 d .TOC.
+# NM-DAG: 0000000010010000 T ifunc
+# NM-DAG: 0000000010010004 T ifunc2
 
-# Tocbase    + (0 << 16) + 32560
-# 0x100280e0 +  0        + 32560 = 0x10030010 (.plt[2])
-# CHECK: __plt_foo:
-# CHECK-NEXT:     std 2, 24(1)
-# CHECK-NEXT:     addis 12, 2, 0
-# CHECK-NEXT:     ld 12, 32560(12)
-# CHECK-NEXT:     mtctr 12
-# CHECK-NEXT:     bctr
+# SECTIONS: .plt NOBITS 0000000010030000
 
-# Tocbase    + (0 << 16)  +  32568
-# 0x100280e0 +  0          + 32568 = 0x1003018 (.plt[3])
+# __plt_ifunc - . = 0x10010020 - 0x10010010 = 16
+# __plt_ifunc2 - . = 0x10010034 - 0x10010018 = 28
+# CHECK: _start:
+# CHECK-NEXT:                 addis 2, 12, 1
+# CHECK-NEXT:                 addi 2, 2, 32760
+# CHECK-NEXT: 10010010:       bl .+16
+# CHECK-NEXT:                 ld 2, 24(1)
+# CHECK-NEXT: 10010018:       bl .+28
+# CHECK-NEXT:                 ld 2, 24(1)
+
+# .plt[0] - .TOC. = 0x10030000 - 0x10028000 = (1<<16) - 32768
 # CHECK: __plt_ifunc:
 # CHECK-NEXT:     std 2, 24(1)
-# CHECK-NEXT:     addis 12, 2, 0
-# CHECK-NEXT:     ld 12, 32568(12)
+# CHECK-NEXT:     addis 12, 2, 1
+# CHECK-NEXT:     ld 12, -32768(12)
 # CHECK-NEXT:     mtctr 12
 # CHECK-NEXT:     bctr
 
-# CHECK: ifunc:
-# CHECK-NEXT: 10010028:  {{.*}} nop
+# .plt[1] - .TOC. = 0x10030000+8 - 0x10028000 = (1<<16) - 32760
+# CHECK: __plt_ifunc2:
+# CHECK-NEXT:     std 2, 24(1)
+# CHECK-NEXT:     addis 12, 2, 1
+# CHECK-NEXT:     ld 12, -32760(12)
+# CHECK-NEXT:     mtctr 12
+# CHECK-NEXT:     bctr
 
-# CHECK: _start:
-# CHECK-NEXT:     addis 2, 12, 2
-# CHECK-NEXT:     addi 2, 2, -32588
-# CHECK-NEXT:     bl .+67108812
-# CHECK-NEXT:     ld 2, 24(1)
-# CHECK-NEXT:     bl .+67108824
-# CHECK-NEXT:     ld 2, 24(1)
-
-# Check tocbase
-# CHECK:       Disassembly of section .got:
-# CHECK-NEXT:    .got:
-# CHECK-NEXT:    100200e0
-
-# Check .plt address
-# DT_PLTGOT should point to the start of the .plt section.
-# DT: 0x0000000000000003 PLTGOT 0x10030000
-
-# Check that we emit the correct dynamic relocation type for an ifunc
-# DYNREL: 'PLT' relocation section at offset 0x{{[0-9a-f]+}} contains 48 bytes:
-# 48 bytes --> 2 Elf64_Rela relocations
-# DYNREL-NEXT: Offset        Info           Type               Symbol's Value  Symbol's Name + Addend
-# DYNREL-NEXT: {{[0-9a-f]+}} {{[0-9a-f]+}}  R_PPC64_JMP_SLOT      {{0+}}            foo + 0
-# DYNREL-NEXT: {{[0-9a-f]+}} {{[0-9a-f]+}}  R_PPC64_IRELATIVE     10010028
-
-
-    .text
-    .abiversion 2
+# Check that we emit 2 R_PPC64_IRELATIVE.
+# DYNREL: R_PPC64_IRELATIVE       10010000
+# DYNREL: R_PPC64_IRELATIVE       10010004
 
 .type ifunc STT_GNU_IFUNC
 .globl ifunc
 ifunc:
- nop
+  nop
 
-    .global _start
-    .type   _start,@function
+.type ifunc2 STT_GNU_IFUNC
+.globl ifunc2
+ifunc2:
+  nop
+
+.global _start
+.type   _start,@function
 
 _start:
 .Lfunc_gep0:
@@ -81,7 +69,7 @@ _start:
   addi 2, 2, .TOC.-.Lfunc_gep0@l
 .Lfunc_lep0:
   .localentry     _start, .Lfunc_lep0-.Lfunc_gep0
-  bl foo
-  nop
   bl ifunc
   nop
+  bl ifunc2
+  nop
diff --git a/test/ELF/ppc64-local-dynamic.s b/test/ELF/ppc64-local-dynamic.s
index 6ed3b0fd8f0..87e33b784b8 100644
--- a/test/ELF/ppc64-local-dynamic.s
+++ b/test/ELF/ppc64-local-dynamic.s
@@ -113,7 +113,7 @@ k:
 // Dis:     test:
 // Dis:        addis 3, 2, 0
 // Dis-NEXT:   addi 3, 3, -32760
-// Dis-NEXT:   bl .+67108804
+// Dis-NEXT:   bl .+60
 // Dis-NEXT:   ld 2, 24(1)
 // Dis-NEXT:   addis 3, 3, 0
 // Dis-NEXT:   lwa 3, -32768(3)
diff --git a/test/ELF/ppc64-local-entry.s b/test/ELF/ppc64-local-entry.s
new file mode 100644
index 00000000000..2a2295169b9
--- /dev/null
+++ b/test/ELF/ppc64-local-entry.s
@@ -0,0 +1,47 @@
+# REQUIRES: ppc
+
+# RUN: llvm-mc -filetype=obj -triple=powerpc64 %s -o %t
+# RUN: ld.lld -r %t -o %t2
+# RUN: llvm-objdump -s -section=.symtab %t2 | FileCheck %s
+
+.text
+.abiversion 2
+.globl  _start
+.p2align	2
+.type   _start,@function
+
+_start:
+.Lfunc_begin0:
+.Lfunc_gep0:
+	addis 2, 12, .TOC.-.Lfunc_gep0@ha
+	addi 2, 2, .TOC.-.Lfunc_gep0@l
+.Lfunc_lep0:
+	.localentry	_start, .Lfunc_lep0-.Lfunc_gep0
+	# The code below is not important, it just needs to access some
+	# global data or function, in order to use the TOC.
+	# In this case, it performs the following:
+	# g += 10;
+	# Also note that this code is not intended to be run, but only
+	# to check if the linker will preserve the localentry info.
+	addis 3, 2, g@toc@ha
+	addi 3, 3, g@toc@l
+	lwz 4, 0(3)
+	addi 4, 4, 10
+	stw 4, 0(3)
+	blr
+	.long	0
+	.quad	0
+.Lfunc_end0:
+	.size   _start, .Lfunc_end0-.Lfunc_begin0
+
+	.type	g,@object               # @g
+	.lcomm	g,4,4
+
+// We expect the st_other byte to be 0x60:
+// localentry = 011 (gep + 2 instructions), reserved = 000,
+// visibility = 00 (STV_DEFAULT)
+// Currently, llvm-objdump does not support displaying
+// st_other's PPC64 specific flags, thus we check the
+// result of the hexdump of .symtab section.
+
+// CHECK: 0070 00000000 00000000 00000009 12600001
diff --git a/test/ELF/ppc64-long-branch-init.s b/test/ELF/ppc64-long-branch-init.s
new file mode 100644
index 00000000000..80b3919cc45
--- /dev/null
+++ b/test/ELF/ppc64-long-branch-init.s
@@ -0,0 +1,43 @@
+# REQUIRES: ppc
+
+# RUN: llvm-mc -filetype=obj -triple=powerpc64-pc-freebsd13.0 %s -o %t.o
+# RUN: ld.lld %t.o -o %t
+# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
+
+## .init consists of sections from several object files. Sections other than the
+## last one do not have a terminator. Check we do not create a long branch stub
+## in the middle.
+## We currently use thunk section spacing to ensure the stub is at the end. This
+## is not foolproof but good enough to not break in practice.
+
+# CHECK: Disassembly of section .init:
+# CHECK-LABEL: _init:
+# CHECK:         blr
+# CHECK-EMPTY:
+# CHECK-LABEL: __long_branch_foo:
+
+.globl foo
+foo:
+  .space 0x2000000
+  blr
+
+.section .init,"ax",@progbits,unique,0
+.globl _init
+_init:
+  stdu 1, -48(1)
+  mflr 0
+  std 0, 64(1)
+
+.section .init,"ax",@progbits,unique,1
+  bl foo
+  nop
+
+.section .init,"ax",@progbits,unique,2
+  bl foo
+  nop
+
+.section .init,"ax",@progbits,unique,3
+  ld 1, 0(1)
+  ld 0, 16(1)
+  mtlr 0
+  blr
diff --git a/test/ELF/ppc64-plt-stub.s b/test/ELF/ppc64-plt-stub.s
index a644f487b8b..bf3ac09fd51 100644
--- a/test/ELF/ppc64-plt-stub.s
+++ b/test/ELF/ppc64-plt-stub.s
@@ -4,16 +4,19 @@
 // RUN: llvm-mc -filetype=obj -triple=powerpc64le-unknown-linux %p/Inputs/shared-ppc64.s -o %t2.o
 // RUN: ld.lld -shared %t2.o -o %t2.so
 // RUN: ld.lld %t.o %t2.so -o %t
-// RUN: llvm-objdump -d %t | FileCheck %s
+// RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
 
 // RUN: llvm-mc -filetype=obj -triple=powerpc64-unknown-linux %s -o %t.o
 // RUN: llvm-mc -filetype=obj -triple=powerpc64-unknown-linux %p/Inputs/shared-ppc64.s -o %t2.o
 // RUN: ld.lld -shared %t2.o -o %t2.so
 // RUN: ld.lld %t.o %t2.so -o %t
-// RUN: llvm-objdump -d %t | FileCheck %s
+// RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
 
 // CHECK:      Disassembly of section .text:
-// CHECK-NEXT: __plt_foo:
+// CHECK-NEXT: _start:
+// CHECK:      10010008: bl .+16
+
+// CHECK-LABEL: 0000000010010018 __plt_foo:
 // CHECK-NEXT:      std 2, 24(1)
 // CHECK-NEXT:      addis 12, 2, 0
 // CHECK-NEXT:      ld 12, 32560(12)
@@ -21,8 +24,6 @@
 // CHECK-NEXT:      bctr
 
 
-// CHECK:            _start:
-// CHECK:            bl .+67108824
         .text
         .abiversion 2
         .globl  _start
diff --git a/test/ELF/ppc64-rel-calls.s b/test/ELF/ppc64-rel-calls.s
index 4c79498dc56..8423eb43f21 100644
--- a/test/ELF/ppc64-rel-calls.s
+++ b/test/ELF/ppc64-rel-calls.s
@@ -30,9 +30,8 @@ bar:
   nop
   blr
 
-# FIXME: The printing here is misleading, the branch offset here is negative.
-# CHECK: 1001000c:       {{.*}}     bl .+67108852
+# CHECK: 1001000c:       {{.*}}     bl .-12
 # CHECK: 10010010:       {{.*}}     nop
-# CHECK: 10010014:       {{.*}}     bl .+67108844
+# CHECK: 10010014:       {{.*}}     bl .-20
 # CHECK: 10010018:       {{.*}}     nop
 # CHECK: 1001001c:       {{.*}}     blr
diff --git a/test/ELF/ppc64-toc-restore-recursive-call.s b/test/ELF/ppc64-toc-restore-recursive-call.s
index 4bedcfecf38..756a058cc56 100644
--- a/test/ELF/ppc64-toc-restore-recursive-call.s
+++ b/test/ELF/ppc64-toc-restore-recursive-call.s
@@ -1,8 +1,8 @@
 # REQUIRES: ppc
 
-# RUN: llvm-mc -filetype=obj -triple=powerpc64le-unknown-linux %s -o %t1.o
-# RUN: ld.lld -shared %t1.o -o %t
-# RUN: llvm-objdump -d -r %t | FileCheck %s
+# RUN: llvm-mc -filetype=obj -triple=powerpc64le-unknown-linux %s -o %t.o
+# RUN: ld.lld -shared %t.o -o %t.so
+# RUN: llvm-objdump -d --no-show-raw-insn -r %t.so | FileCheck %s
 
 # For a recursive call that is interposable the linker calls the plt-stub rather
 # then calling the function directly. Since the call is through a plt stub and
@@ -14,12 +14,11 @@
 # for recursive calls as well as keeps the logic for recursive calls consistent
 # with non-recursive calls.
 
-# CHECK-LABEL: __plt_recursive_func:
-# CHECK-NEXT: 10000:
-# CHECK-LABEL: recursive_func
-# CHECK-NEXT:  10014:
-# CHECK:       1003c: {{[0-9a-fA-F ]+}} bl .+67108804
-# CHECK-NEXT:  ld 2, 24(1)
+# CHECK-LABEL: 0000000000010000 recursive_func:
+# CHECK:       10028:       bl .+32
+# CHECK-NEXT:               ld 2, 24(1)
+
+# CHECK-LABEL: 0000000000010048 __plt_recursive_func:
 
         .abiversion 2
         .section ".text"
diff --git a/test/ELF/ppc64-toc-restore.s b/test/ELF/ppc64-toc-restore.s
index d9e06ca6e59..d65bef847a7 100644
--- a/test/ELF/ppc64-toc-restore.s
+++ b/test/ELF/ppc64-toc-restore.s
@@ -5,14 +5,14 @@
 // RUN: llvm-mc -filetype=obj -triple=powerpc64le-unknown-linux %p/Inputs/ppc64-func.s -o %t3.o
 // RUN: ld.lld -shared %t2.o -o %t2.so
 // RUN: ld.lld %t.o %t2.so %t3.o -o %t
-// RUN: llvm-objdump -d %t | FileCheck %s
+// RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
 
 // RUN: llvm-mc -filetype=obj -triple=powerpc64-unknown-linux %s -o %t.o
 // RUN: llvm-mc -filetype=obj -triple=powerpc64-unknown-linux %p/Inputs/shared-ppc64.s -o %t2.o
 // RUN: llvm-mc -filetype=obj -triple=powerpc64-unknown-linux %p/Inputs/ppc64-func.s -o %t3.o
 // RUN: ld.lld -shared %t2.o -o %t2.so
 // RUN: ld.lld %t.o %t2.so %t3.o -o %t
-// RUN: llvm-objdump -d %t | FileCheck %s
+// RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck %s
 
     .text
     .abiversion 2
@@ -28,16 +28,11 @@ _start:
   bl foo
   nop
   bl bar_local
-
-
-// CHECK: Disassembly of section .text:
-// CHECK: _start:
-// CHECK:     1001001c: {{.*}}  bl .+67108836
-// CHECK-NOT: 10010020: {{.*}}  nop
-// CHECK:     10010020: {{.*}}  ld 2, 24(1)
-// CHECK:     10010024: {{.*}}  bl .+67108848
-// CHECK-NOT: 10010028: {{.*}}  nop
-// CHECK-NOT: 10010028: {{.*}}  ld 2, 24(1)
+// CHECK-LABEL: _start:
+// CHECK-NEXT:  10010008:       bl .+64
+// CHECK-NEXT:  1001000c:       ld 2, 24(1)
+// CHECK-NEXT:  10010010:       bl .-16
+// CHECK-EMPTY:
 
 # Calling a function in another object file which will have same
 # TOC base does not need a nop. If nop present, do not rewrite to
@@ -47,26 +42,24 @@ _diff_object:
   bl foo_not_shared
   bl foo_not_shared
   nop
-
-// CHECK: _diff_object:
-// CHECK-NEXT: 10010028: {{.*}}  bl .+24
-// CHECK-NEXT: 1001002c: {{.*}}  bl .+20
-// CHECK-NEXT: 10010030: {{.*}}  nop
+// CHECK-LABEL: _diff_object:
+// CHECK-NEXT:  10010014:       bl .+28
+// CHECK-NEXT:  10010018:       bl .+24
+// CHECK-NEXT:  1001001c:       nop
 
 # Branching to a local function does not need a nop
 .global noretbranch
 noretbranch:
   b bar_local
-// CHECK: noretbranch:
-// CHECK:     10010034:  {{.*}}  b .+67108832
-// CHECK-NOT: 10010038:  {{.*}}  nop
-// CHECK-NOT: 1001003c:  {{.*}}  ld 2, 24(1)
+// CHECK-LABEL: noretbranch:
+// CHECK:       10010020:        b .+67108832
+// CHECK-EMPTY:
 
 // This should come last to check the end-of-buffer condition.
 .global last
 last:
   bl foo
   nop
-// CHECK: last:
-// CHECK:      10010038: {{.*}}   bl .+67108808
-// CHECK-NEXT: 1001003c: {{.*}}   ld 2, 24(1)
+// CHECK-LABEL: last:
+// CHECK-NEXT:  10010024:       bl .+36
+// CHECK-NEXT:  10010028:       ld 2, 24(1)
diff --git a/wasm/OutputSections.cpp b/wasm/OutputSections.cpp
index 4123d63b746..6b7b18d4ca7 100644
--- a/wasm/OutputSections.cpp
+++ b/wasm/OutputSections.cpp
@@ -111,8 +111,8 @@ void CodeSection::writeTo(uint8_t *Buf) {
   memcpy(Buf, CodeSectionHeader.data(), CodeSectionHeader.size());
 
   // Write code section bodies
-  parallelForEach(Functions,
-                  [&](const InputChunk *Chunk) { Chunk->writeTo(Buf); });
+  for (const InputChunk *Chunk : Functions)
+    Chunk->writeTo(Buf);
 }
 
 uint32_t CodeSection::numRelocations() const {
@@ -176,7 +176,7 @@ void DataSection::writeTo(uint8_t *Buf) {
   // Write data section headers
   memcpy(Buf, DataSectionHeader.data(), DataSectionHeader.size());
 
-  parallelForEach(Segments, [&](const OutputSegment *Segment) {
+  for (const OutputSegment *Segment : Segments) {
     // Write data segment header
     uint8_t *SegStart = Buf + Segment->SectionOffset;
     memcpy(SegStart, Segment->Header.data(), Segment->Header.size());
@@ -184,7 +184,7 @@ void DataSection::writeTo(uint8_t *Buf) {
     // Write segment data payload
     for (const InputChunk *Chunk : Segment->InputSegments)
       Chunk->writeTo(Buf);
-  });
+  }
 }
 
 uint32_t DataSection::numRelocations() const {
@@ -232,8 +232,8 @@ void CustomSection::writeTo(uint8_t *Buf) {
   Buf += NameData.size();
 
   // Write custom sections payload
-  parallelForEach(InputSections,
-                  [&](const InputSection *Section) { Section->writeTo(Buf); });
+  for (const InputSection *Section : InputSections)
+    Section->writeTo(Buf);
 }
 
 uint32_t CustomSection::numRelocations() const {

From 89da04f7e8a7bb6826b79d539b009bb657c84482 Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Tue, 11 Jun 2019 18:17:38 +0000
Subject: [PATCH 7/7] Vendor import of lldb release_80 branch r363030:
 https://llvm.org/svn/llvm-project/lldb/branches/release_80@363030

---
 .../NativePDB/globals-fundamental.cpp         |   2 +-
 lit/helper/build.py                           |  12 +-
 lit/helper/toolchain.py                       |   3 +-
 lit/lit.site.cfg.py.in                        |   1 +
 lit/tools/lldb-mi/lit.local.cfg               |   2 +
 .../NativeRegisterContextLinux_x86_64.cpp     | 136 +++++++++++-------
 .../Linux/NativeRegisterContextLinux_x86_64.h |   3 +-
 7 files changed, 99 insertions(+), 60 deletions(-)
 create mode 100644 lit/tools/lldb-mi/lit.local.cfg

diff --git a/lit/SymbolFile/NativePDB/globals-fundamental.cpp b/lit/SymbolFile/NativePDB/globals-fundamental.cpp
index 8891eddf668..b3d3e37fbeb 100644
--- a/lit/SymbolFile/NativePDB/globals-fundamental.cpp
+++ b/lit/SymbolFile/NativePDB/globals-fundamental.cpp
@@ -1,5 +1,5 @@
 // clang-format off
-// REQUIRES: lld
+// REQUIRES: lld, python
 
 // Test that we can display tag types.
 // RUN: %build --compiler=clang-cl --nodefaultlib -o %t.exe -- %s 
diff --git a/lit/helper/build.py b/lit/helper/build.py
index 26f321d709f..fd52b7db4b8 100755
--- a/lit/helper/build.py
+++ b/lit/helper/build.py
@@ -283,19 +283,17 @@ def __init__(self, toolchain_type, args):
                     print('Using alternate compiler "{0}" to match selected target.'.format(self.compiler))
 
         if self.mode == 'link' or self.mode == 'compile-and-link':
-            self.linker = self._find_linker('link') if toolchain_type == 'msvc' else self._find_linker('lld-link')
+            self.linker = self._find_linker('link') if toolchain_type == 'msvc' else self._find_linker('lld-link', args.tools_dir)
             if not self.linker:
                 raise ValueError('Unable to find an appropriate linker.')
 
         self.compile_env, self.link_env = self._get_visual_studio_environment()
 
-    def _find_linker(self, name):
-        if sys.platform == 'win32':
-            name = name + '.exe'
+    def _find_linker(self, name, search_paths=[]):
         compiler_dir = os.path.dirname(self.compiler)
-        linker_path = os.path.join(compiler_dir, name)
-        if not os.path.exists(linker_path):
-            raise ValueError('Could not find \'{}\''.format(linker_path))
+        linker_path = find_executable(name, [compiler_dir] + search_paths)
+        if linker_path is None:
+            raise ValueError('Could not find \'{}\''.format(name))
         return linker_path
 
     def _get_vc_install_dir(self):
diff --git a/lit/helper/toolchain.py b/lit/helper/toolchain.py
index 938f343badc..11aa0bcf4e7 100644
--- a/lit/helper/toolchain.py
+++ b/lit/helper/toolchain.py
@@ -51,7 +51,8 @@ def use_lldb_substitutions(config):
 
     llvm_config.add_tool_substitutions(primary_tools,
                                        [config.lldb_tools_dir])
-    if lldbmi.was_resolved:
+    # lldb-mi always fails without Python support
+    if lldbmi.was_resolved and not config.lldb_disable_python:
         config.available_features.add('lldb-mi')
 
 def _use_msvc_substitutions(config):
diff --git a/lit/lit.site.cfg.py.in b/lit/lit.site.cfg.py.in
index fbf88efcc2f..738b25d0931 100644
--- a/lit/lit.site.cfg.py.in
+++ b/lit/lit.site.cfg.py.in
@@ -17,6 +17,7 @@ config.python_executable = "@PYTHON_EXECUTABLE@"
 config.have_zlib = @LLVM_ENABLE_ZLIB@
 config.host_triple = "@LLVM_HOST_TRIPLE@"
 config.lldb_bitness = 64 if @LLDB_IS_64_BITS@ else 32
+config.lldb_disable_python = @LLDB_DISABLE_PYTHON@
 
 # Support substitution of the tools and libs dirs with user parameters. This is
 # used when we can't determine the tool dir at configuration time.
diff --git a/lit/tools/lldb-mi/lit.local.cfg b/lit/tools/lldb-mi/lit.local.cfg
new file mode 100644
index 00000000000..ff28e265b3f
--- /dev/null
+++ b/lit/tools/lldb-mi/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "lldb-mi" in config.available_features:
+    config.unsupported = True
diff --git a/source/Plugins/Process/Linux/NativeRegisterContextLinux_x86_64.cpp b/source/Plugins/Process/Linux/NativeRegisterContextLinux_x86_64.cpp
index 50bf29b094d..c7313e6f987 100755
--- a/source/Plugins/Process/Linux/NativeRegisterContextLinux_x86_64.cpp
+++ b/source/Plugins/Process/Linux/NativeRegisterContextLinux_x86_64.cpp
@@ -19,9 +19,26 @@
 
 #include "Plugins/Process/Utility/RegisterContextLinux_i386.h"
 #include "Plugins/Process/Utility/RegisterContextLinux_x86_64.h"
-
+#include <cpuid.h>
 #include <linux/elf.h>
 
+// Newer toolchains define __get_cpuid_count in cpuid.h, but some
+// older-but-still-supported ones (e.g. gcc 5.4.0) don't, so we define an
+// equivalent get_cpuid_count here, following the one in clang/lib/Headers.
+static inline int get_cpuid_count(unsigned int __leaf,
+                                  unsigned int __subleaf,
+                                  unsigned int *__eax, unsigned int *__ebx,
+                                  unsigned int *__ecx, unsigned int *__edx)
+{
+    unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0);
+
+    if (__max_leaf == 0 || __max_leaf < __leaf)
+        return 0;
+
+    __cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}
+
 using namespace lldb_private;
 using namespace lldb_private::process_linux;
 
@@ -268,12 +285,29 @@ CreateRegisterInfoInterface(const ArchSpec &target_arch) {
   }
 }
 
+// Return the size of the XSTATE area supported on this cpu. It is necessary to
+// allocate the full size of the area even if we do not use/recognise all of it
+// because ptrace(PTRACE_SETREGSET, NT_X86_XSTATE) will refuse to write to it if
+// we do not pass it a buffer of sufficient size. The size is always at least
+// sizeof(FPR) so that the allocated buffer can be safely cast to FPR*.
+static std::size_t GetXSTATESize() {
+  unsigned int eax, ebx, ecx, edx;
+  // First check whether the XSTATE area is supported at all.
+  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx) || !(ecx & bit_XSAVE))
+    return sizeof(FPR);
+
+  // Then fetch the maximum size of the area.
+  if (!get_cpuid_count(0x0d, 0, &eax, &ebx, &ecx, &edx))
+    return sizeof(FPR);
+  return std::max<std::size_t>(ecx, sizeof(FPR));
+}
+
 NativeRegisterContextLinux_x86_64::NativeRegisterContextLinux_x86_64(
     const ArchSpec &target_arch, NativeThreadProtocol &native_thread)
     : NativeRegisterContextLinux(native_thread,
                                  CreateRegisterInfoInterface(target_arch)),
-      m_xstate_type(XStateType::Invalid), m_fpr(), m_iovec(), m_ymm_set(),
-      m_mpx_set(), m_reg_info(), m_gpr_x86_64() {
+      m_xstate_type(XStateType::Invalid), m_ymm_set(), m_mpx_set(),
+      m_reg_info(), m_gpr_x86_64() {
   // Set up data about ranges of valid registers.
   switch (target_arch.GetMachine()) {
   case llvm::Triple::x86:
@@ -329,14 +363,13 @@ NativeRegisterContextLinux_x86_64::NativeRegisterContextLinux_x86_64(
     break;
   }
 
-  // Initialize m_iovec to point to the buffer and buffer size using the
-  // conventions of Berkeley style UIO structures, as required by PTRACE
-  // extensions.
-  m_iovec.iov_base = &m_fpr;
-  m_iovec.iov_len = sizeof(m_fpr);
+  std::size_t xstate_size = GetXSTATESize();
+  m_xstate.reset(static_cast<FPR *>(std::malloc(xstate_size)));
+  m_iovec.iov_base = m_xstate.get();
+  m_iovec.iov_len = xstate_size;
 
   // Clear out the FPR state.
-  ::memset(&m_fpr, 0, sizeof(m_fpr));
+  ::memset(m_xstate.get(), 0, xstate_size);
 
   // Store byte offset of fctrl (i.e. first register of FPR)
   const RegisterInfo *reg_info_fctrl = GetRegisterInfoByName("fctrl");
@@ -439,14 +472,17 @@ NativeRegisterContextLinux_x86_64::ReadRegister(const RegisterInfo *reg_info,
 
     if (byte_order != lldb::eByteOrderInvalid) {
       if (reg >= m_reg_info.first_st && reg <= m_reg_info.last_st)
-        reg_value.SetBytes(m_fpr.fxsave.stmm[reg - m_reg_info.first_st].bytes,
-                           reg_info->byte_size, byte_order);
+        reg_value.SetBytes(
+            m_xstate->fxsave.stmm[reg - m_reg_info.first_st].bytes,
+            reg_info->byte_size, byte_order);
       if (reg >= m_reg_info.first_mm && reg <= m_reg_info.last_mm)
-        reg_value.SetBytes(m_fpr.fxsave.stmm[reg - m_reg_info.first_mm].bytes,
-                           reg_info->byte_size, byte_order);
+        reg_value.SetBytes(
+            m_xstate->fxsave.stmm[reg - m_reg_info.first_mm].bytes,
+            reg_info->byte_size, byte_order);
       if (reg >= m_reg_info.first_xmm && reg <= m_reg_info.last_xmm)
-        reg_value.SetBytes(m_fpr.fxsave.xmm[reg - m_reg_info.first_xmm].bytes,
-                           reg_info->byte_size, byte_order);
+        reg_value.SetBytes(
+            m_xstate->fxsave.xmm[reg - m_reg_info.first_xmm].bytes,
+            reg_info->byte_size, byte_order);
       if (reg >= m_reg_info.first_ymm && reg <= m_reg_info.last_ymm) {
         // Concatenate ymm using the register halves in xmm.bytes and
         // ymmh.bytes
@@ -488,7 +524,7 @@ NativeRegisterContextLinux_x86_64::ReadRegister(const RegisterInfo *reg_info,
     return error;
   }
 
-  // Get pointer to m_fpr.fxsave variable and set the data from it.
+  // Get pointer to m_xstate->fxsave variable and set the data from it.
 
   // Byte offsets of all registers are calculated wrt 'UserArea' structure.
   // However, ReadFPR() reads fpu registers {using ptrace(PTRACE_GETFPREGS,..)}
@@ -499,9 +535,9 @@ NativeRegisterContextLinux_x86_64::ReadRegister(const RegisterInfo *reg_info,
   // Since, FPR structure is also one of the member of UserArea structure.
   // byte_offset(fpu wrt FPR) = byte_offset(fpu wrt UserArea) -
   // byte_offset(fctrl wrt UserArea)
-  assert((reg_info->byte_offset - m_fctrl_offset_in_userarea) < sizeof(m_fpr));
-  uint8_t *src =
-      (uint8_t *)&m_fpr + reg_info->byte_offset - m_fctrl_offset_in_userarea;
+  assert((reg_info->byte_offset - m_fctrl_offset_in_userarea) < sizeof(FPR));
+  uint8_t *src = (uint8_t *)m_xstate.get() + reg_info->byte_offset -
+                 m_fctrl_offset_in_userarea;
   switch (reg_info->byte_size) {
   case 1:
     reg_value.SetUInt8(*(uint8_t *)src);
@@ -527,7 +563,7 @@ NativeRegisterContextLinux_x86_64::ReadRegister(const RegisterInfo *reg_info,
 
 void NativeRegisterContextLinux_x86_64::UpdateXSTATEforWrite(
     uint32_t reg_index) {
-  XSAVE_HDR::XFeature &xstate_bv = m_fpr.xsave.header.xstate_bv;
+  XSAVE_HDR::XFeature &xstate_bv = m_xstate->xsave.header.xstate_bv;
   if (IsFPR(reg_index)) {
     // IsFPR considers both %st and %xmm registers as floating point, but these
     // map to two features. Set both flags, just in case.
@@ -559,15 +595,15 @@ Status NativeRegisterContextLinux_x86_64::WriteRegister(
   if (IsFPR(reg_index) || IsAVX(reg_index) || IsMPX(reg_index)) {
     if (reg_info->encoding == lldb::eEncodingVector) {
       if (reg_index >= m_reg_info.first_st && reg_index <= m_reg_info.last_st)
-        ::memcpy(m_fpr.fxsave.stmm[reg_index - m_reg_info.first_st].bytes,
+        ::memcpy(m_xstate->fxsave.stmm[reg_index - m_reg_info.first_st].bytes,
                  reg_value.GetBytes(), reg_value.GetByteSize());
 
       if (reg_index >= m_reg_info.first_mm && reg_index <= m_reg_info.last_mm)
-        ::memcpy(m_fpr.fxsave.stmm[reg_index - m_reg_info.first_mm].bytes,
+        ::memcpy(m_xstate->fxsave.stmm[reg_index - m_reg_info.first_mm].bytes,
                  reg_value.GetBytes(), reg_value.GetByteSize());
 
       if (reg_index >= m_reg_info.first_xmm && reg_index <= m_reg_info.last_xmm)
-        ::memcpy(m_fpr.fxsave.xmm[reg_index - m_reg_info.first_xmm].bytes,
+        ::memcpy(m_xstate->fxsave.xmm[reg_index - m_reg_info.first_xmm].bytes,
                  reg_value.GetBytes(), reg_value.GetByteSize());
 
       if (reg_index >= m_reg_info.first_ymm &&
@@ -596,7 +632,7 @@ Status NativeRegisterContextLinux_x86_64::WriteRegister(
           return Status("CopyMPXtoXSTATE() failed");
       }
     } else {
-      // Get pointer to m_fpr.fxsave variable and set the data to it.
+      // Get pointer to m_xstate->fxsave variable and set the data to it.
 
       // Byte offsets of all registers are calculated wrt 'UserArea' structure.
       // However, WriteFPR() takes m_fpr (of type FPR structure) and writes
@@ -608,8 +644,8 @@ Status NativeRegisterContextLinux_x86_64::WriteRegister(
       // byte_offset(fpu wrt FPR) = byte_offset(fpu wrt UserArea) -
       // byte_offset(fctrl wrt UserArea)
       assert((reg_info->byte_offset - m_fctrl_offset_in_userarea) <
-             sizeof(m_fpr));
-      uint8_t *dst = (uint8_t *)&m_fpr + reg_info->byte_offset -
+             sizeof(FPR));
+      uint8_t *dst = (uint8_t *)m_xstate.get() + reg_info->byte_offset -
                      m_fctrl_offset_in_userarea;
       switch (reg_info->byte_size) {
       case 1:
@@ -667,7 +703,7 @@ Status NativeRegisterContextLinux_x86_64::ReadAllRegisterValues(
   ::memcpy(dst, &m_gpr_x86_64, GetRegisterInfoInterface().GetGPRSize());
   dst += GetRegisterInfoInterface().GetGPRSize();
   if (m_xstate_type == XStateType::FXSAVE)
-    ::memcpy(dst, &m_fpr.fxsave, sizeof(m_fpr.fxsave));
+    ::memcpy(dst, &m_xstate->fxsave, sizeof(m_xstate->fxsave));
   else if (m_xstate_type == XStateType::XSAVE) {
     lldb::ByteOrder byte_order = GetByteOrder();
 
@@ -700,7 +736,7 @@ Status NativeRegisterContextLinux_x86_64::ReadAllRegisterValues(
       }
     }
     // Copy the extended register state including the assembled ymm registers.
-    ::memcpy(dst, &m_fpr, sizeof(m_fpr));
+    ::memcpy(dst, m_xstate.get(), sizeof(FPR));
   } else {
     assert(false && "how do we save the floating point registers?");
     error.SetErrorString("unsure how to save the floating point registers");
@@ -758,9 +794,9 @@ Status NativeRegisterContextLinux_x86_64::WriteAllRegisterValues(
 
   src += GetRegisterInfoInterface().GetGPRSize();
   if (m_xstate_type == XStateType::FXSAVE)
-    ::memcpy(&m_fpr.fxsave, src, sizeof(m_fpr.fxsave));
+    ::memcpy(&m_xstate->fxsave, src, sizeof(m_xstate->fxsave));
   else if (m_xstate_type == XStateType::XSAVE)
-    ::memcpy(&m_fpr.xsave, src, sizeof(m_fpr.xsave));
+    ::memcpy(&m_xstate->xsave, src, sizeof(m_xstate->xsave));
 
   error = WriteFPR();
   if (error.Fail())
@@ -814,12 +850,12 @@ bool NativeRegisterContextLinux_x86_64::IsCPUFeatureAvailable(
     return true;
   case RegSet::avx: // Check if CPU has AVX and if there is kernel support, by
                     // reading in the XCR0 area of XSAVE.
-    if ((m_fpr.xsave.i387.xcr0 & mask_XSTATE_AVX) == mask_XSTATE_AVX)
+    if ((m_xstate->xsave.i387.xcr0 & mask_XSTATE_AVX) == mask_XSTATE_AVX)
       return true;
      break;
   case RegSet::mpx: // Check if CPU has MPX and if there is kernel support, by
                     // reading in the XCR0 area of XSAVE.
-    if ((m_fpr.xsave.i387.xcr0 & mask_XSTATE_MPX) == mask_XSTATE_MPX)
+    if ((m_xstate->xsave.i387.xcr0 & mask_XSTATE_MPX) == mask_XSTATE_MPX)
       return true;
     break;
   }
@@ -856,10 +892,10 @@ Status NativeRegisterContextLinux_x86_64::WriteFPR() {
   switch (m_xstate_type) {
   case XStateType::FXSAVE:
     return WriteRegisterSet(
-        &m_iovec, sizeof(m_fpr.fxsave),
+        &m_iovec, sizeof(m_xstate->fxsave),
         fxsr_regset(GetRegisterInfoInterface().GetTargetArchitecture()));
   case XStateType::XSAVE:
-    return WriteRegisterSet(&m_iovec, sizeof(m_fpr.xsave), NT_X86_XSTATE);
+    return WriteRegisterSet(&m_iovec, sizeof(m_xstate->xsave), NT_X86_XSTATE);
   default:
     return Status("Unrecognized FPR type.");
   }
@@ -879,11 +915,11 @@ bool NativeRegisterContextLinux_x86_64::CopyXSTATEtoYMM(
 
   if (byte_order == lldb::eByteOrderLittle) {
     ::memcpy(m_ymm_set.ymm[reg_index - m_reg_info.first_ymm].bytes,
-             m_fpr.fxsave.xmm[reg_index - m_reg_info.first_ymm].bytes,
+             m_xstate->fxsave.xmm[reg_index - m_reg_info.first_ymm].bytes,
              sizeof(XMMReg));
     ::memcpy(m_ymm_set.ymm[reg_index - m_reg_info.first_ymm].bytes +
                  sizeof(XMMReg),
-             m_fpr.xsave.ymmh[reg_index - m_reg_info.first_ymm].bytes,
+             m_xstate->xsave.ymmh[reg_index - m_reg_info.first_ymm].bytes,
              sizeof(YMMHReg));
     return true;
   }
@@ -891,10 +927,10 @@ bool NativeRegisterContextLinux_x86_64::CopyXSTATEtoYMM(
   if (byte_order == lldb::eByteOrderBig) {
     ::memcpy(m_ymm_set.ymm[reg_index - m_reg_info.first_ymm].bytes +
                  sizeof(XMMReg),
-             m_fpr.fxsave.xmm[reg_index - m_reg_info.first_ymm].bytes,
+             m_xstate->fxsave.xmm[reg_index - m_reg_info.first_ymm].bytes,
              sizeof(XMMReg));
     ::memcpy(m_ymm_set.ymm[reg_index - m_reg_info.first_ymm].bytes,
-             m_fpr.xsave.ymmh[reg_index - m_reg_info.first_ymm].bytes,
+             m_xstate->xsave.ymmh[reg_index - m_reg_info.first_ymm].bytes,
              sizeof(YMMHReg));
     return true;
   }
@@ -907,19 +943,19 @@ bool NativeRegisterContextLinux_x86_64::CopyYMMtoXSTATE(
     return false;
 
   if (byte_order == lldb::eByteOrderLittle) {
-    ::memcpy(m_fpr.fxsave.xmm[reg - m_reg_info.first_ymm].bytes,
+    ::memcpy(m_xstate->fxsave.xmm[reg - m_reg_info.first_ymm].bytes,
              m_ymm_set.ymm[reg - m_reg_info.first_ymm].bytes, sizeof(XMMReg));
-    ::memcpy(m_fpr.xsave.ymmh[reg - m_reg_info.first_ymm].bytes,
+    ::memcpy(m_xstate->xsave.ymmh[reg - m_reg_info.first_ymm].bytes,
              m_ymm_set.ymm[reg - m_reg_info.first_ymm].bytes + sizeof(XMMReg),
              sizeof(YMMHReg));
     return true;
   }
 
   if (byte_order == lldb::eByteOrderBig) {
-    ::memcpy(m_fpr.fxsave.xmm[reg - m_reg_info.first_ymm].bytes,
+    ::memcpy(m_xstate->fxsave.xmm[reg - m_reg_info.first_ymm].bytes,
              m_ymm_set.ymm[reg - m_reg_info.first_ymm].bytes + sizeof(XMMReg),
              sizeof(XMMReg));
-    ::memcpy(m_fpr.xsave.ymmh[reg - m_reg_info.first_ymm].bytes,
+    ::memcpy(m_xstate->xsave.ymmh[reg - m_reg_info.first_ymm].bytes,
              m_ymm_set.ymm[reg - m_reg_info.first_ymm].bytes, sizeof(YMMHReg));
     return true;
   }
@@ -929,7 +965,7 @@ bool NativeRegisterContextLinux_x86_64::CopyYMMtoXSTATE(
 void *NativeRegisterContextLinux_x86_64::GetFPRBuffer() {
   switch (m_xstate_type) {
   case XStateType::FXSAVE:
-    return &m_fpr.fxsave;
+    return &m_xstate->fxsave;
   case XStateType::XSAVE:
     return &m_iovec;
   default:
@@ -940,7 +976,7 @@ void *NativeRegisterContextLinux_x86_64::GetFPRBuffer() {
 size_t NativeRegisterContextLinux_x86_64::GetFPRSize() {
   switch (m_xstate_type) {
   case XStateType::FXSAVE:
-    return sizeof(m_fpr.fxsave);
+    return sizeof(m_xstate->fxsave);
   case XStateType::XSAVE:
     return sizeof(m_iovec);
   default:
@@ -953,14 +989,14 @@ Status NativeRegisterContextLinux_x86_64::ReadFPR() {
 
   // Probe XSAVE and if it is not supported fall back to FXSAVE.
   if (m_xstate_type != XStateType::FXSAVE) {
-    error = ReadRegisterSet(&m_iovec, sizeof(m_fpr.xsave), NT_X86_XSTATE);
+    error = ReadRegisterSet(&m_iovec, sizeof(m_xstate->xsave), NT_X86_XSTATE);
     if (!error.Fail()) {
       m_xstate_type = XStateType::XSAVE;
       return error;
     }
   }
   error = ReadRegisterSet(
-      &m_iovec, sizeof(m_fpr.xsave),
+      &m_iovec, sizeof(m_xstate->xsave),
       fxsr_regset(GetRegisterInfoInterface().GetTargetArchitecture()));
   if (!error.Fail()) {
     m_xstate_type = XStateType::FXSAVE;
@@ -982,11 +1018,11 @@ bool NativeRegisterContextLinux_x86_64::CopyXSTATEtoMPX(uint32_t reg) {
 
   if (reg >= m_reg_info.first_mpxr && reg <= m_reg_info.last_mpxr) {
     ::memcpy(m_mpx_set.mpxr[reg - m_reg_info.first_mpxr].bytes,
-             m_fpr.xsave.mpxr[reg - m_reg_info.first_mpxr].bytes,
+             m_xstate->xsave.mpxr[reg - m_reg_info.first_mpxr].bytes,
              sizeof(MPXReg));
   } else {
     ::memcpy(m_mpx_set.mpxc[reg - m_reg_info.first_mpxc].bytes,
-             m_fpr.xsave.mpxc[reg - m_reg_info.first_mpxc].bytes,
+             m_xstate->xsave.mpxc[reg - m_reg_info.first_mpxc].bytes,
              sizeof(MPXCsr));
   }
   return true;
@@ -997,10 +1033,10 @@ bool NativeRegisterContextLinux_x86_64::CopyMPXtoXSTATE(uint32_t reg) {
     return false;
 
   if (reg >= m_reg_info.first_mpxr && reg <= m_reg_info.last_mpxr) {
-    ::memcpy(m_fpr.xsave.mpxr[reg - m_reg_info.first_mpxr].bytes,
+    ::memcpy(m_xstate->xsave.mpxr[reg - m_reg_info.first_mpxr].bytes,
              m_mpx_set.mpxr[reg - m_reg_info.first_mpxr].bytes, sizeof(MPXReg));
   } else {
-    ::memcpy(m_fpr.xsave.mpxc[reg - m_reg_info.first_mpxc].bytes,
+    ::memcpy(m_xstate->xsave.mpxc[reg - m_reg_info.first_mpxc].bytes,
              m_mpx_set.mpxc[reg - m_reg_info.first_mpxc].bytes, sizeof(MPXCsr));
   }
   return true;
diff --git a/source/Plugins/Process/Linux/NativeRegisterContextLinux_x86_64.h b/source/Plugins/Process/Linux/NativeRegisterContextLinux_x86_64.h
index 9dcf82f50a4..2970326306e 100644
--- a/source/Plugins/Process/Linux/NativeRegisterContextLinux_x86_64.h
+++ b/source/Plugins/Process/Linux/NativeRegisterContextLinux_x86_64.h
@@ -109,7 +109,8 @@ class NativeRegisterContextLinux_x86_64 : public NativeRegisterContextLinux {
 
   // Private member variables.
   mutable XStateType m_xstate_type;
-  FPR m_fpr; // Extended States Area, named FPR for historical reasons.
+  std::unique_ptr<FPR, llvm::FreeDeleter>
+      m_xstate; // Extended States Area, named FPR for historical reasons.
   struct iovec m_iovec;
   YMM m_ymm_set;
   MPX m_mpx_set;