
Includes patch suggested by @shuni64 which fixes half precision ABI issues Includes hipblaslt compression patch Includes configurable hipblaslt support in rocblas rocmPackages_6.hipblaslt: respect NIX_BUILD_CORES in tensilelite rocmPackages_6.hipblas: propagate hipblas-common rocmPackages_6.clr: avoid confusion with hipClangPath Co-authored-by: Gavin Zhao <git@gzgz.dev>
346 lines
14 KiB
Diff
346 lines
14 KiB
Diff
diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py
|
|
index b8cea84558..1bc24bd1dd 100644
|
|
--- a/Tensile/TensileCreateLibrary.py
|
|
+++ b/Tensile/TensileCreateLibrary.py
|
|
@@ -41,6 +41,7 @@
|
|
from .SolutionLibrary import MasterSolutionLibrary
|
|
from .SolutionStructs import Solution
|
|
from .CustomYamlLoader import load_logic_gfx_arch
|
|
+from .Utilities.Profile import profile
|
|
|
|
import argparse
|
|
import collections
|
|
@@ -1233,7 +1234,7 @@ def validateLibrary(masterLibraries: MasterSolutionLibrary,
|
|
################################################################################
|
|
# Tensile Create Library
|
|
################################################################################
|
|
-@timing
|
|
+@profile
|
|
def TensileCreateLibrary():
|
|
print1("")
|
|
print1(HR)
|
|
@@ -1558,7 +1559,6 @@ def param(key, value):
|
|
|
|
print1("# Check if generated files exists.")
|
|
|
|
- @timing
|
|
def checkFileExistence(files):
|
|
for filePath in files:
|
|
if not os.path.exists(filePath):
|
|
diff --git a/Tensile/Utilities/Profile.py b/Tensile/Utilities/Profile.py
|
|
new file mode 100644
|
|
index 0000000000..cc3c7eb44c
|
|
--- /dev/null
|
|
+++ b/Tensile/Utilities/Profile.py
|
|
@@ -0,0 +1,77 @@
|
|
+################################################################################
|
|
+#
|
|
+# Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved.
|
|
+#
|
|
+# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
+# of this software and associated documentation files (the "Software"), to deal
|
|
+# in the Software without restriction, including without limitation the rights
|
|
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
+# copies of the Software, and to permit persons to whom the Software is
|
|
+# furnished to do so, subject to the following conditions:
|
|
+#
|
|
+# The above copyright notice and this permission notice shall be included in
|
|
+# all copies or substantial portions of the Software.
|
|
+#
|
|
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
+# SOFTWARE.
|
|
+#
|
|
+################################################################################
|
|
+
|
|
+import cProfile
|
|
+import pstats
|
|
+import os
|
|
+
|
|
+from pathlib import Path
|
|
+from datetime import datetime, timezone
|
|
+from typing import Callable, Tuple
|
|
+
|
|
+PROFILE_ENV_VAR = "TENSILE_PROFILE"
|
|
+
|
|
+def profile(func: Callable) -> Callable:
|
|
+ """Profiling decorator.
|
|
+
|
|
+ Add ``@profile`` to mark a function for profiling; set the environment variable
|
|
+ TENSILE_PROFILE=ON to enable profiling decorated functions.
|
|
+ """
|
|
+ if not envVariableIsSet(PROFILE_ENV_VAR):
|
|
+ return func
|
|
+ def wrapper(*args, **kwargs):
|
|
+ path, filename = initProfileArtifacts(func.__name__)
|
|
+
|
|
+ prof = cProfile.Profile()
|
|
+ output = prof.runcall(func, *args, **kwargs)
|
|
+ result = pstats.Stats(prof)
|
|
+ result.sort_stats(pstats.SortKey.TIME)
|
|
+ result.dump_stats(path/filename)
|
|
+
|
|
+ return output
|
|
+ return wrapper
|
|
+
|
|
+def envVariableIsSet(varName: str) -> bool:
|
|
+ """Checks if the provided environment variable is set to "ON", "TRUE", or "1"
|
|
+ Args:
|
|
+ varName: Environment variable name.
|
|
+ Returns:
|
|
+ True if the environment variable is set, otherwise False.
|
|
+ """
|
|
+ value = os.environ.get(varName, "").upper()
|
|
+ return True if value in ["ON", "TRUE", "1"] else False
|
|
+
|
|
+def initProfileArtifacts(funcName: str) -> Tuple[Path, str]:
|
|
+ """Initializes filenames and paths for profiling artifacts based on the current datetime
|
|
+ Args:
|
|
+ funcName: The name of the function being profiled, nominally passed via func.__name__
|
|
+ Returns:
|
|
+ A tuple (path, filename) where the path is the artifact directory and filename is
|
|
+ a .prof file with the profiling results.
|
|
+ """
|
|
+ dt = datetime.now(timezone.utc)
|
|
+ filename = f"{funcName}-{dt.strftime('%Y-%m-%dT%H-%M-%SZ')}.prof"
|
|
+ path = Path().cwd()/f"profiling-results-{dt.strftime('%Y-%m-%d')}"
|
|
+ path.mkdir(exist_ok=True)
|
|
+ return path, filename
|
|
|
|
diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py
|
|
index e62b0072df..2c843ba936 100644
|
|
--- a/Tensile/TensileCreateLibrary.py
|
|
+++ b/Tensile/TensileCreateLibrary.py
|
|
@@ -56,7 +56,7 @@
|
|
import sys
|
|
from timeit import default_timer as timer
|
|
from pathlib import Path
|
|
-from typing import Sequence, List
|
|
+from typing import Sequence, List, Union
|
|
|
|
def timing(func):
|
|
def wrapper(*args, **kwargs):
|
|
@@ -90,87 +90,142 @@ def processKernelSource(kernel, kernelWriterAssembly, ti):
|
|
|
|
return (err, src, header, kernelName, filename)
|
|
|
|
-def getAssemblyCodeObjectFiles(kernels, kernelWriterAssembly, outputPath):
|
|
- destDir = ensurePath(os.path.join(outputPath, 'library'))
|
|
- asmDir = kernelWriterAssembly.getAssemblyDirectory()
|
|
- archs = collections.defaultdict(list)
|
|
+def linkIntoCodeObject(
|
|
+ objFiles: List[str], coPathDest: Union[Path, str], kernelWriterAssembly: KernelWriterAssembly
|
|
+):
|
|
+ """Links object files into a code object file.
|
|
|
|
- for k in filter(lambda k: k['KernelLanguage'] == 'Assembly', kernels):
|
|
- archs[tuple(k['ISA'])].append(k)
|
|
+ Args:
|
|
+ objectFiles: A list of object files to be linked.
|
|
+ coPathDest: The destination path for the code object file.
|
|
+ kernelWriterAssembly: An instance of KernelWriterAssembly to get link arguments.
|
|
|
|
- coFiles = []
|
|
+ Raises:
|
|
+ RuntimeError: If linker invocation fails.
|
|
+ """
|
|
+ if os.name == "nt":
|
|
+ # On Windows, the objectFiles list command line (including spaces)
|
|
+ # exceeds the limit of 8191 characters, so using response file
|
|
+
|
|
+ responseFile = os.path.join('/tmp', 'clangArgs.txt')
|
|
+ with open(responseFile, 'wt') as file:
|
|
+ file.write(" ".join(objFiles))
|
|
+ file.flush()
|
|
+
|
|
+ args = [globalParameters['AssemblerPath'], '-target', 'amdgcn-amd-amdhsa', '-o', coFileRaw, '@clangArgs.txt']
|
|
+ subprocess.check_call(args, cwd=asmDir)
|
|
+ else:
|
|
+ numObjFiles = len(objFiles)
|
|
+ maxObjFiles = 10000
|
|
+
|
|
+ if numObjFiles > maxObjFiles:
|
|
+ batchedObjFiles = [objFiles[i:i+maxObjFiles] for i in range(0, numObjFiles, maxObjFiles)]
|
|
+ batchSize = int(math.ceil(numObjFiles / maxObjFiles))
|
|
+
|
|
+ newObjFiles = [str(coPathDest) + "." + str(i) for i in range(0, batchSize)]
|
|
+ newObjFilesOutput = []
|
|
+
|
|
+ for batch, filename in zip(batchedObjFiles, newObjFiles):
|
|
+ if len(batch) > 1:
|
|
+ args = [globalParameters["ROCmLdPath"], "-r"] + batch + [ "-o", filename]
|
|
+ print2(f"Linking object files into fewer object files: {' '.join(args)}")
|
|
+ subprocess.check_call(args)
|
|
+ newObjFilesOutput.append(filename)
|
|
+ else:
|
|
+ newObjFilesOutput.append(batchedObjFiles[0])
|
|
+
|
|
+ args = kernelWriterAssembly.getLinkCodeObjectArgs(newObjFilesOutput, str(coPathDest))
|
|
+ print2(f"Linking object files into code object: {' '.join(args)}")
|
|
+ subprocess.check_call(args)
|
|
+ else:
|
|
+ args = kernelWriterAssembly.getLinkCodeObjectArgs(objFiles, str(coPathDest))
|
|
+ print2(f"Linking object files into code object: {' '.join(args)}")
|
|
+ subprocess.check_call(args)
|
|
+
|
|
+
|
|
+def compressCodeObject(
|
|
+ coPathSrc: Union[Path, str], coPathDest: Union[Path, str], gfx: str, bundler: str
|
|
+):
|
|
+ """Compresses a code object file using the provided bundler.
|
|
+
|
|
+ Args:
|
|
+ coPathSrc: The source path of the code object file to be compressed.
|
|
+ coPathDest: The destination path for the compressed code object file.
|
|
+ gfx: The target GPU architecture.
|
|
+ bundler: The path to the Clang Offload Bundler executable.
|
|
+
|
|
+ Raises:
|
|
+ RuntimeError: If compressing the code object file fails.
|
|
+ """
|
|
+ args = [
|
|
+ bundler,
|
|
+ "--compress",
|
|
+ "--type=o",
|
|
+ "--bundle-align=4096",
|
|
+ f"--targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--{gfx}",
|
|
+ "--input=/dev/null",
|
|
+ f"--input={str(coPathSrc)}",
|
|
+ f"--output={str(coPathDest)}",
|
|
+ ]
|
|
+
|
|
+ print2(f"Bundling/compressing code objects: {' '.join(args)}")
|
|
+ try:
|
|
+ out = subprocess.check_output(args, stderr=subprocess.STDOUT)
|
|
+ print2(f"Output: {out}")
|
|
+ except subprocess.CalledProcessError as err:
|
|
+ raise RuntimeError(
|
|
+ f"Error compressing code object via bundling: {err.output}\nFailed command: {' '.join(args)}"
|
|
+ )
|
|
+
|
|
+def buildAssemblyCodeObjectFiles(kernels, kernelWriterAssembly, outputPath):
|
|
+
|
|
+ isAsm = lambda k: k["KernelLanguage"] == "Assembly"
|
|
+
|
|
+ extObj = ".o"
|
|
+ extCo = ".co"
|
|
+ extCoRaw = ".co.raw"
|
|
|
|
- for arch, archKernels in archs.items():
|
|
+ destDir = Path(ensurePath(os.path.join(outputPath, 'library')))
|
|
+ asmDir = Path(kernelWriterAssembly.getAssemblyDirectory())
|
|
+
|
|
+ archKernelMap = collections.defaultdict(list)
|
|
+ for k in filter(isAsm, kernels):
|
|
+ archKernelMap[tuple(k['ISA'])].append(k)
|
|
+
|
|
+ coFiles = []
|
|
+ for arch, archKernels in archKernelMap.items():
|
|
if len(archKernels) == 0:
|
|
continue
|
|
|
|
- archName = getGfxName(arch)
|
|
+ gfx = getGfxName(arch)
|
|
|
|
if globalParameters["MergeFiles"] or globalParameters["NumMergedFiles"] > 1 or globalParameters["LazyLibraryLoading"]:
|
|
- objectFiles = [kernelWriterAssembly.getKernelFileBase(k) + '.o' for k in archKernels if 'codeObjectFile' not in k]
|
|
+ objectFiles = [str(asmDir / (kernelWriterAssembly.getKernelFileBase(k) + extObj)) for k in archKernels if 'codeObjectFile' not in k]
|
|
|
|
- #Group kernels from placeholder libraries
|
|
coFileMap = collections.defaultdict(list)
|
|
+
|
|
if len(objectFiles):
|
|
- coFileMap[os.path.join(destDir, "TensileLibrary_"+archName+".co")] = objectFiles
|
|
+ coFileMap[asmDir / ("TensileLibrary_"+ gfx + extCoRaw)] = objectFiles
|
|
|
|
for kernel in archKernels:
|
|
coName = kernel.get("codeObjectFile", None)
|
|
if coName:
|
|
- coFileMap[os.path.join(destDir, coName+".co")] += [kernelWriterAssembly.getKernelFileBase(kernel) + '.o']
|
|
+ coFileMap[asmDir / (coName + extCoRaw)].append(str(asmDir / (kernelWriterAssembly.getKernelFileBase(kernel) + extObj)))
|
|
|
|
- for coFile, objectFiles in coFileMap.items():
|
|
- if os.name == "nt":
|
|
- # On Windows, the objectFiles list command line (including spaces)
|
|
- # exceeds the limit of 8191 characters, so using response file
|
|
+ for coFileRaw, objFiles in coFileMap.items():
|
|
|
|
- responseArgs = objectFiles
|
|
- responseFile = os.path.join(asmDir, 'clangArgs.txt')
|
|
- with open(responseFile, 'wt') as file:
|
|
- file.write( " ".join(responseArgs) )
|
|
- file.flush()
|
|
-
|
|
- args = [globalParameters['AssemblerPath'], '-target', 'amdgcn-amd-amdhsa', '-o', coFile, '@clangArgs.txt']
|
|
- subprocess.check_call(args, cwd=asmDir)
|
|
- else:
|
|
- numOfObjectFiles = len(objectFiles)
|
|
- splitFiles = 10000
|
|
- if numOfObjectFiles > splitFiles:
|
|
- slicedObjectFilesList = [objectFiles[x:x+splitFiles] for x in range(0, numOfObjectFiles, splitFiles)]
|
|
- objectFileBasename = os.path.split(coFile)[-1].split('.')[0]
|
|
- numOfOneSliceOfObjectFiles = int(math.ceil(numOfObjectFiles / splitFiles))
|
|
- newObjectFiles = [ objectFileBasename + "_" + str(i) + ".o" for i in range(0, numOfOneSliceOfObjectFiles)]
|
|
- newObjectFilesOutput = []
|
|
- for slicedObjectFiles, objectFile in zip(slicedObjectFilesList, newObjectFiles):
|
|
- if len(slicedObjectFiles) > 1:
|
|
- args = [globalParameters["ROCmLdPath"], "-r"] + slicedObjectFiles + [ "-o", objectFile ]
|
|
- if globalParameters["PrintCodeCommands"]:
|
|
- print(asmDir)
|
|
- print(' '.join(args))
|
|
- subprocess.check_call(args, cwd=asmDir)
|
|
- newObjectFilesOutput.append(objectFile)
|
|
- else:
|
|
- newObjectFilesOutput.append(slicedObjectFiles[0])
|
|
- args = kernelWriterAssembly.getLinkCodeObjectArgs(newObjectFilesOutput, coFile)
|
|
- if globalParameters["PrintCodeCommands"]:
|
|
- print(asmDir)
|
|
- print(' '.join(args))
|
|
- subprocess.check_call(args, cwd=asmDir)
|
|
- else:
|
|
- args = kernelWriterAssembly.getLinkCodeObjectArgs(objectFiles, coFile)
|
|
- if globalParameters["PrintCodeCommands"]:
|
|
- print(asmDir)
|
|
- print(' '.join(args))
|
|
- subprocess.check_call(args, cwd=asmDir)
|
|
+ linkIntoCodeObject(objFiles, coFileRaw, kernelWriterAssembly)
|
|
+ coFile = destDir / coFileRaw.name.replace(extCoRaw, extCo)
|
|
+ compressCodeObject(coFileRaw, coFile, gfx, globalParameters["ClangOffloadBundlerPath"])
|
|
|
|
coFiles.append(coFile)
|
|
else:
|
|
# no mergefiles
|
|
def newCoFileName(kName):
|
|
if globalParameters["PackageLibrary"]:
|
|
- return os.path.join(destDir, archName, kName + '.co')
|
|
+ return os.path.join(destDir, gfx, kName + '.co')
|
|
else:
|
|
- return os.path.join(destDir, kName + '_' + archName + '.co')
|
|
+ return os.path.join(destDir, kName + '_' + gfx + '.co')
|
|
|
|
def orgCoFileName(kName):
|
|
return os.path.join(asmDir, kName + '.co')
|
|
@@ -179,6 +234,8 @@ def orgCoFileName(kName):
|
|
map(lambda k: kernelWriterAssembly.getKernelFileBase(k), archKernels)), "Copying code objects"):
|
|
shutil.copyfile(src, dst)
|
|
coFiles.append(dst)
|
|
+ printWarning("Code object files are not compressed in `--no-merge-files` build mode.")
|
|
+
|
|
return coFiles
|
|
|
|
def which(p):
|
|
@@ -645,7 +702,7 @@ def success(kernel):
|
|
|
|
if not globalParameters["GenerateSourcesAndExit"]:
|
|
codeObjectFiles += buildSourceCodeObjectFiles(CxxCompiler, kernelFiles, outputPath)
|
|
- codeObjectFiles += getAssemblyCodeObjectFiles(kernelsToBuild, kernelWriterAssembly, outputPath)
|
|
+ codeObjectFiles += buildAssemblyCodeObjectFiles(kernelsToBuild, kernelWriterAssembly, outputPath)
|
|
|
|
Common.popWorkingPath() # build_tmp
|
|
Common.popWorkingPath() # workingDir
|
|
|