nixpkgs/pkgs/development/rocm-modules/6/tensile/tensilelite-compression.diff
Luna Nova 4d2c7ad003
rocmPackages: 6.0.2 -> 6.3.1
Includes patch suggested by @shuni64 which fixes half precision ABI
issues
Includes hipblaslt compression patch
Includes configurable hipblaslt support in rocblas
rocmPackages_6.hipblaslt: respect NIX_BUILD_CORES in tensilelite
rocmPackages_6.hipblas: propagate hipblas-common
rocmPackages_6.clr: avoid confusion with hipClangPath

Co-authored-by: Gavin Zhao <git@gzgz.dev>
2025-03-24 09:25:56 -07:00

346 lines
14 KiB
Diff

diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py
index b8cea84558..1bc24bd1dd 100644
--- a/Tensile/TensileCreateLibrary.py
+++ b/Tensile/TensileCreateLibrary.py
@@ -41,6 +41,7 @@
from .SolutionLibrary import MasterSolutionLibrary
from .SolutionStructs import Solution
from .CustomYamlLoader import load_logic_gfx_arch
+from .Utilities.Profile import profile
import argparse
import collections
@@ -1233,7 +1234,7 @@ def validateLibrary(masterLibraries: MasterSolutionLibrary,
################################################################################
# Tensile Create Library
################################################################################
-@timing
+@profile
def TensileCreateLibrary():
print1("")
print1(HR)
@@ -1558,7 +1559,6 @@ def param(key, value):
print1("# Check if generated files exists.")
- @timing
def checkFileExistence(files):
for filePath in files:
if not os.path.exists(filePath):
diff --git a/Tensile/Utilities/Profile.py b/Tensile/Utilities/Profile.py
new file mode 100644
index 0000000000..cc3c7eb44c
--- /dev/null
+++ b/Tensile/Utilities/Profile.py
@@ -0,0 +1,77 @@
+################################################################################
+#
+# Copyright (C) 2016-2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+################################################################################
+
+import cProfile
+import pstats
+import os
+
+from pathlib import Path
+from datetime import datetime, timezone
+from typing import Callable, Tuple
+
+PROFILE_ENV_VAR = "TENSILE_PROFILE"
+
+def profile(func: Callable) -> Callable:
+ """Profiling decorator.
+
+ Add ``@profile`` to mark a function for profiling; set the environment variable
+ TENSILE_PROFILE=ON to enable profiling decorated functions.
+ """
+ if not envVariableIsSet(PROFILE_ENV_VAR):
+ return func
+ def wrapper(*args, **kwargs):
+ path, filename = initProfileArtifacts(func.__name__)
+
+ prof = cProfile.Profile()
+ output = prof.runcall(func, *args, **kwargs)
+ result = pstats.Stats(prof)
+ result.sort_stats(pstats.SortKey.TIME)
+ result.dump_stats(path/filename)
+
+ return output
+ return wrapper
+
+def envVariableIsSet(varName: str) -> bool:
+ """Checks if the provided environment variable is set to "ON", "TRUE", or "1"
+ Args:
+ varName: Environment variable name.
+ Returns:
+ True if the environment variable is set, otherwise False.
+ """
+ value = os.environ.get(varName, "").upper()
+ return True if value in ["ON", "TRUE", "1"] else False
+
+def initProfileArtifacts(funcName: str) -> Tuple[Path, str]:
+ """Initializes filenames and paths for profiling artifacts based on the current datetime
+ Args:
+ funcName: The name of the function being profiled, nominally passed via func.__name__
+ Returns:
+ A tuple (path, filename) where the path is the artifact directory and filename is
+ a .prof file with the profiling results.
+ """
+ dt = datetime.now(timezone.utc)
+ filename = f"{funcName}-{dt.strftime('%Y-%m-%dT%H-%M-%SZ')}.prof"
+ path = Path().cwd()/f"profiling-results-{dt.strftime('%Y-%m-%d')}"
+ path.mkdir(exist_ok=True)
+ return path, filename
diff --git a/Tensile/TensileCreateLibrary.py b/Tensile/TensileCreateLibrary.py
index e62b0072df..2c843ba936 100644
--- a/Tensile/TensileCreateLibrary.py
+++ b/Tensile/TensileCreateLibrary.py
@@ -56,7 +56,7 @@
import sys
from timeit import default_timer as timer
from pathlib import Path
-from typing import Sequence, List
+from typing import Sequence, List, Union
def timing(func):
def wrapper(*args, **kwargs):
@@ -90,87 +90,142 @@ def processKernelSource(kernel, kernelWriterAssembly, ti):
return (err, src, header, kernelName, filename)
-def getAssemblyCodeObjectFiles(kernels, kernelWriterAssembly, outputPath):
- destDir = ensurePath(os.path.join(outputPath, 'library'))
- asmDir = kernelWriterAssembly.getAssemblyDirectory()
- archs = collections.defaultdict(list)
+def linkIntoCodeObject(
+ objFiles: List[str], coPathDest: Union[Path, str], kernelWriterAssembly: KernelWriterAssembly
+):
+ """Links object files into a code object file.
- for k in filter(lambda k: k['KernelLanguage'] == 'Assembly', kernels):
- archs[tuple(k['ISA'])].append(k)
+ Args:
+ objectFiles: A list of object files to be linked.
+ coPathDest: The destination path for the code object file.
+ kernelWriterAssembly: An instance of KernelWriterAssembly to get link arguments.
- coFiles = []
+ Raises:
+ RuntimeError: If linker invocation fails.
+ """
+ if os.name == "nt":
+ # On Windows, the objectFiles list command line (including spaces)
+ # exceeds the limit of 8191 characters, so using response file
+
+ responseFile = os.path.join('/tmp', 'clangArgs.txt')
+ with open(responseFile, 'wt') as file:
+ file.write(" ".join(objFiles))
+ file.flush()
+
+ args = [globalParameters['AssemblerPath'], '-target', 'amdgcn-amd-amdhsa', '-o', coFileRaw, '@clangArgs.txt']
+ subprocess.check_call(args, cwd=asmDir)
+ else:
+ numObjFiles = len(objFiles)
+ maxObjFiles = 10000
+
+ if numObjFiles > maxObjFiles:
+ batchedObjFiles = [objFiles[i:i+maxObjFiles] for i in range(0, numObjFiles, maxObjFiles)]
+ batchSize = int(math.ceil(numObjFiles / maxObjFiles))
+
+ newObjFiles = [str(coPathDest) + "." + str(i) for i in range(0, batchSize)]
+ newObjFilesOutput = []
+
+ for batch, filename in zip(batchedObjFiles, newObjFiles):
+ if len(batch) > 1:
+ args = [globalParameters["ROCmLdPath"], "-r"] + batch + [ "-o", filename]
+ print2(f"Linking object files into fewer object files: {' '.join(args)}")
+ subprocess.check_call(args)
+ newObjFilesOutput.append(filename)
+ else:
+ newObjFilesOutput.append(batchedObjFiles[0])
+
+ args = kernelWriterAssembly.getLinkCodeObjectArgs(newObjFilesOutput, str(coPathDest))
+ print2(f"Linking object files into code object: {' '.join(args)}")
+ subprocess.check_call(args)
+ else:
+ args = kernelWriterAssembly.getLinkCodeObjectArgs(objFiles, str(coPathDest))
+ print2(f"Linking object files into code object: {' '.join(args)}")
+ subprocess.check_call(args)
+
+
+def compressCodeObject(
+ coPathSrc: Union[Path, str], coPathDest: Union[Path, str], gfx: str, bundler: str
+):
+ """Compresses a code object file using the provided bundler.
+
+ Args:
+ coPathSrc: The source path of the code object file to be compressed.
+ coPathDest: The destination path for the compressed code object file.
+ gfx: The target GPU architecture.
+ bundler: The path to the Clang Offload Bundler executable.
+
+ Raises:
+ RuntimeError: If compressing the code object file fails.
+ """
+ args = [
+ bundler,
+ "--compress",
+ "--type=o",
+ "--bundle-align=4096",
+ f"--targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--{gfx}",
+ "--input=/dev/null",
+ f"--input={str(coPathSrc)}",
+ f"--output={str(coPathDest)}",
+ ]
+
+ print2(f"Bundling/compressing code objects: {' '.join(args)}")
+ try:
+ out = subprocess.check_output(args, stderr=subprocess.STDOUT)
+ print2(f"Output: {out}")
+ except subprocess.CalledProcessError as err:
+ raise RuntimeError(
+ f"Error compressing code object via bundling: {err.output}\nFailed command: {' '.join(args)}"
+ )
+
+def buildAssemblyCodeObjectFiles(kernels, kernelWriterAssembly, outputPath):
+
+ isAsm = lambda k: k["KernelLanguage"] == "Assembly"
+
+ extObj = ".o"
+ extCo = ".co"
+ extCoRaw = ".co.raw"
- for arch, archKernels in archs.items():
+ destDir = Path(ensurePath(os.path.join(outputPath, 'library')))
+ asmDir = Path(kernelWriterAssembly.getAssemblyDirectory())
+
+ archKernelMap = collections.defaultdict(list)
+ for k in filter(isAsm, kernels):
+ archKernelMap[tuple(k['ISA'])].append(k)
+
+ coFiles = []
+ for arch, archKernels in archKernelMap.items():
if len(archKernels) == 0:
continue
- archName = getGfxName(arch)
+ gfx = getGfxName(arch)
if globalParameters["MergeFiles"] or globalParameters["NumMergedFiles"] > 1 or globalParameters["LazyLibraryLoading"]:
- objectFiles = [kernelWriterAssembly.getKernelFileBase(k) + '.o' for k in archKernels if 'codeObjectFile' not in k]
+ objectFiles = [str(asmDir / (kernelWriterAssembly.getKernelFileBase(k) + extObj)) for k in archKernels if 'codeObjectFile' not in k]
- #Group kernels from placeholder libraries
coFileMap = collections.defaultdict(list)
+
if len(objectFiles):
- coFileMap[os.path.join(destDir, "TensileLibrary_"+archName+".co")] = objectFiles
+ coFileMap[asmDir / ("TensileLibrary_"+ gfx + extCoRaw)] = objectFiles
for kernel in archKernels:
coName = kernel.get("codeObjectFile", None)
if coName:
- coFileMap[os.path.join(destDir, coName+".co")] += [kernelWriterAssembly.getKernelFileBase(kernel) + '.o']
+ coFileMap[asmDir / (coName + extCoRaw)].append(str(asmDir / (kernelWriterAssembly.getKernelFileBase(kernel) + extObj)))
- for coFile, objectFiles in coFileMap.items():
- if os.name == "nt":
- # On Windows, the objectFiles list command line (including spaces)
- # exceeds the limit of 8191 characters, so using response file
+ for coFileRaw, objFiles in coFileMap.items():
- responseArgs = objectFiles
- responseFile = os.path.join(asmDir, 'clangArgs.txt')
- with open(responseFile, 'wt') as file:
- file.write( " ".join(responseArgs) )
- file.flush()
-
- args = [globalParameters['AssemblerPath'], '-target', 'amdgcn-amd-amdhsa', '-o', coFile, '@clangArgs.txt']
- subprocess.check_call(args, cwd=asmDir)
- else:
- numOfObjectFiles = len(objectFiles)
- splitFiles = 10000
- if numOfObjectFiles > splitFiles:
- slicedObjectFilesList = [objectFiles[x:x+splitFiles] for x in range(0, numOfObjectFiles, splitFiles)]
- objectFileBasename = os.path.split(coFile)[-1].split('.')[0]
- numOfOneSliceOfObjectFiles = int(math.ceil(numOfObjectFiles / splitFiles))
- newObjectFiles = [ objectFileBasename + "_" + str(i) + ".o" for i in range(0, numOfOneSliceOfObjectFiles)]
- newObjectFilesOutput = []
- for slicedObjectFiles, objectFile in zip(slicedObjectFilesList, newObjectFiles):
- if len(slicedObjectFiles) > 1:
- args = [globalParameters["ROCmLdPath"], "-r"] + slicedObjectFiles + [ "-o", objectFile ]
- if globalParameters["PrintCodeCommands"]:
- print(asmDir)
- print(' '.join(args))
- subprocess.check_call(args, cwd=asmDir)
- newObjectFilesOutput.append(objectFile)
- else:
- newObjectFilesOutput.append(slicedObjectFiles[0])
- args = kernelWriterAssembly.getLinkCodeObjectArgs(newObjectFilesOutput, coFile)
- if globalParameters["PrintCodeCommands"]:
- print(asmDir)
- print(' '.join(args))
- subprocess.check_call(args, cwd=asmDir)
- else:
- args = kernelWriterAssembly.getLinkCodeObjectArgs(objectFiles, coFile)
- if globalParameters["PrintCodeCommands"]:
- print(asmDir)
- print(' '.join(args))
- subprocess.check_call(args, cwd=asmDir)
+ linkIntoCodeObject(objFiles, coFileRaw, kernelWriterAssembly)
+ coFile = destDir / coFileRaw.name.replace(extCoRaw, extCo)
+ compressCodeObject(coFileRaw, coFile, gfx, globalParameters["ClangOffloadBundlerPath"])
coFiles.append(coFile)
else:
# no mergefiles
def newCoFileName(kName):
if globalParameters["PackageLibrary"]:
- return os.path.join(destDir, archName, kName + '.co')
+ return os.path.join(destDir, gfx, kName + '.co')
else:
- return os.path.join(destDir, kName + '_' + archName + '.co')
+ return os.path.join(destDir, kName + '_' + gfx + '.co')
def orgCoFileName(kName):
return os.path.join(asmDir, kName + '.co')
@@ -179,6 +234,8 @@ def orgCoFileName(kName):
map(lambda k: kernelWriterAssembly.getKernelFileBase(k), archKernels)), "Copying code objects"):
shutil.copyfile(src, dst)
coFiles.append(dst)
+ printWarning("Code object files are not compressed in `--no-merge-files` build mode.")
+
return coFiles
def which(p):
@@ -645,7 +702,7 @@ def success(kernel):
if not globalParameters["GenerateSourcesAndExit"]:
codeObjectFiles += buildSourceCodeObjectFiles(CxxCompiler, kernelFiles, outputPath)
- codeObjectFiles += getAssemblyCodeObjectFiles(kernelsToBuild, kernelWriterAssembly, outputPath)
+ codeObjectFiles += buildAssemblyCodeObjectFiles(kernelsToBuild, kernelWriterAssembly, outputPath)
Common.popWorkingPath() # build_tmp
Common.popWorkingPath() # workingDir