triton-lang · ptillet · Nov 3, 2023 · Nov 2, 2023 · Nov 3, 2023 · Nov 3, 2023
@@ -3,8 +3,7 @@
 from collections import namedtuple
 
 # Create a named tuple for the output of the benchmark
-BenchmarkOutput = namedtuple(
-    'BenchmarkOutput', ['dev', 'name', 'batch_size', 'speedup', 'latency'])
+BenchmarkOutput = namedtuple('BenchmarkOutput', ['dev', 'name', 'batch_size', 'speedup', 'latency'])
 
 
 def parse_output(file_path: str) -> dict:
@@ -19,13 +18,11 @@ def parse_output(file_path: str) -> dict:
             batch_size = row[2]
             speedup = float(row[3])
             latency = float(row[4])
-            entries[name] = BenchmarkOutput(
-                dev, name, batch_size, speedup, latency)
+            entries[name] = BenchmarkOutput(dev, name, batch_size, speedup, latency)
     return entries
 
 
-def compare(baseline: dict, new: dict, threshold: float,
-            geomean_threshold: float) -> bool:
+def compare(baseline: dict, new: dict, threshold: float, geomean_threshold: float) -> bool:
     baseline_geomean = 1.0
     new_geomean = 1.0
     for key in new:
@@ -41,19 +38,16 @@ def compare(baseline: dict, new: dict, threshold: float,
             continue
 
         if new_latency < baseline_latency * (1 - threshold):
-            print(
-                f"New benchmark {key} is faster than baseline: {new_latency} vs {baseline_latency}")
+            print(f"New benchmark {key} is faster than baseline: {new_latency} vs {baseline_latency}")
         elif new_latency > baseline_latency * (1 + threshold):
-            print(
-                f"New benchmark {key} is slower than baseline: {new_latency} vs {baseline_latency}")
+            print(f"New benchmark {key} is slower than baseline: {new_latency} vs {baseline_latency}")
         else:
-            print(
-                f"New benchmark {key} is within threshold: {new_latency} vs {baseline_latency}")
+            print(f"New benchmark {key} is within threshold: {new_latency} vs {baseline_latency}")
         baseline_geomean *= baseline[key].speedup
         new_geomean *= new[key].speedup
 
-    baseline_geomean = baseline_geomean ** (1 / len(baseline))
-    new_geomean = new_geomean ** (1 / len(new))
+    baseline_geomean = baseline_geomean**(1 / len(baseline))
+    new_geomean = new_geomean**(1 / len(new))
     print(f"Baseline geomean: {baseline_geomean}")
     print(f"New geomean: {new_geomean}")
     assert new_geomean >= baseline_geomean * (1 - geomean_threshold), \

@@ -30,20 +30,14 @@ repos:
             ^docs/conf.py$
           )
 
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.12.0
+  - repo: https://github.com/jlebar/yapf
+    rev: bf301f5ef7777e137b97219842629ca78eb5ef2a
     hooks:
-      - id: isort
-        exclude: '^python/triton/runtime/.*'
+      - id: yapf
+        args: ["-p", "-i"]
         stages: [commit, push, manual]
+        exclude: "python/test/unit/language/test_line_info.py"
 
-  - repo: https://github.com/pre-commit/mirrors-autopep8
-    rev: v1.6.0
-    hooks:
-      - id: autopep8
-        exclude: '^python/triton/runtime/.*'
-        args: ["-i"]
-        stages: [commit, push, manual]
   - repo: https://github.com/pre-commit/mirrors-clang-format
     rev: v16.0.6
     hooks:

@@ -22,7 +22,6 @@
 
 # -- General configuration ------------------------------------------------
 
-
 import os
 import shutil
 import sys
@@ -121,12 +120,9 @@ def documenter(app, obj, parent):
         return old_documenter(app, obj, parent)
 
     sphinx.ext.autosummary.get_documenter = documenter
-    sphinx.util.inspect.unwrap_all = forward_jit_fn(
-        sphinx.util.inspect.unwrap_all)
-    sphinx.util.inspect.signature = forward_jit_fn(
-        sphinx.util.inspect.signature)
-    sphinx.util.inspect.object_description = forward_jit_fn(
-        sphinx.util.inspect.object_description)
+    sphinx.util.inspect.unwrap_all = forward_jit_fn(sphinx.util.inspect.unwrap_all)
+    sphinx.util.inspect.signature = forward_jit_fn(sphinx.util.inspect.signature)
+    sphinx.util.inspect.object_description = forward_jit_fn(sphinx.util.inspect.object_description)
 
 
 # Auto Doc
@@ -139,7 +135,8 @@ def documenter(app, obj, parent):
     'sphinx.ext.coverage',
     'sphinx.ext.napoleon',
     'sphinx_multiversion',
-    'myst_parser']
+    'myst_parser',
+]
 autosummary_generate = True
 
 # versioning config
@@ -294,6 +291,6 @@ def documenter(app, obj, parent):
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'Triton', 'Triton Documentation', author,
-     'Triton', 'One line description of project.', 'Miscellaneous'),
+    (master_doc, 'Triton', 'Triton Documentation', author, 'Triton', 'One line description of project.',
+     'Miscellaneous'),
 ]
@@ -0,0 +1,26 @@
+[build-system]
+requires = ["setuptools>=40.8.0", "wheel", "cmake>=3.18", "ninja>=1.11.1"]
+
+[tool.yapf]
+based_on_style = "pep8"
+column_limit = 120
+disable_split_list_with_comment = true
+each_dict_entry_on_separate_line=false
+split_before_named_assigns = false
+split_complex_comprehension = true
+
+[tool.yapfignore]
+ignore_patterns = [
+    # This exclusion is also specified in .pre-commit-config.yaml.
+    # - We put it here because if you run yapf directly, we want it to skip the
+    #   file.
+    # - We also put it in .pre-commit-config because yapf raises an error if
+    #   pre-commit runs it but all of the files it might touch are ignored!
+    "python/test/unit/language/test_line_info.py"
+]
+
+[tool.ruff]
+line-length = 120
+
+[tool.ruff.lint]
+ignore = ["E501", "E701", "E731", "E741"]
@@ -4,8 +4,8 @@
 
 # triton kernel
 @triton.jit
-def kernel(X, stride_xm,
-           Z, stride_zn,
+def kernel(X, stride_xm,  #
+           Z, stride_zn,  #
            BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
     off_m = tl.arange(0, BLOCK_M)
     off_n = tl.arange(0, BLOCK_N)

@@ -10,4 +10,4 @@ def kernel(X, stride_xm, stride_xn, BLOCK: tl.constexpr):
 
 
 X = torch.randn(1, device="cuda")
-pgm = kernel[(1,)](X, 1, 1, BLOCK=1024)
+pgm = kernel[(1, )](X, 1, 1, BLOCK=1024)
@@ -55,6 +55,7 @@ class Package(NamedTuple):
     lib_flag: str
     syspath_var_name: str
 
+
 # pybind11
 
 
@@ -63,6 +64,7 @@ def get_pybind11_package_info():
     url = "https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz"
     return Package("pybind11", name, url, "PYBIND11_INCLUDE_DIR", "", "PYBIND11_SYSPATH")
 
+
 # llvm
 
 
@@ -121,6 +123,7 @@ def get_thirdparty_packages(triton_cache_path):
             thirdparty_cmake_args.append(f"-D{p.lib_flag}={package_dir}/lib")
     return thirdparty_cmake_args
 
+
 # ---- package data ---
 
 
@@ -153,6 +156,7 @@ def download_and_copy(src_path, variable, version, url_func):
             os.makedirs(os.path.split(dst_path)[0], exist_ok=True)
             shutil.copy(src_path, dst_path)
 
+
 # ---- cmake extension ----
 
 
@@ -170,18 +174,21 @@ def get_cmake_dir():
 
 
 class CMakeClean(clean):
+
     def initialize_options(self):
         clean.initialize_options(self)
         self.build_temp = get_cmake_dir()
 
 
 class CMakeBuildPy(build_py):
+
     def run(self) -> None:
         self.run_command('build_ext')
         return super().run()
 
 
 class CMakeExtension(Extension):
+
     def __init__(self, name, path, sourcedir=""):
         Extension.__init__(self, name, sources=[])
         self.sourcedir = os.path.abspath(sourcedir)
@@ -204,7 +211,8 @@ def run(self):
         try:
             out = subprocess.check_output(["cmake", "--version"])
         except OSError:
-            raise RuntimeError("CMake must be installed to build the following extensions: " + ", ".join(e.name for e in self.extensions))
+            raise RuntimeError("CMake must be installed to build the following extensions: " +
+                               ", ".join(e.name for e in self.extensions))
 
         match = re.search(r"version\s*(?P<major>\d+)\.(?P<minor>\d+)([\d.]+)?", out.decode())
         cmake_major, cmake_minor = int(match.group("major")), int(match.group("minor"))
@@ -231,8 +239,10 @@ def build_extension(self, ext):
         # python directories
         python_include_dir = sysconfig.get_path("platinclude")
         cmake_args = [
-            "-G", "Ninja",  # Ninja is much faster than make
-            "-DCMAKE_MAKE_PROGRAM=" + ninja_dir,  # Pass explicit path to ninja otherwise cmake may cache a temporary path
+            "-G",
+            "Ninja",  # Ninja is much faster than make
+            "-DCMAKE_MAKE_PROGRAM=" +
+            ninja_dir,  # Pass explicit path to ninja otherwise cmake may cache a temporary path
             "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON",
             "-DLLVM_ENABLE_WERROR=ON",
             "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir,
@@ -266,12 +276,14 @@ def build_extension(self, ext):
             build_args += ['-j' + max_jobs]
 
         if check_env_flag("TRITON_BUILD_WITH_CLANG_LLD"):
-            cmake_args += ["-DCMAKE_C_COMPILER=clang",
-                           "-DCMAKE_CXX_COMPILER=clang++",
-                           "-DCMAKE_LINKER=lld",
-                           "-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld",
-                           "-DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld",
-                           "-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld"]
+            cmake_args += [
+                "-DCMAKE_C_COMPILER=clang",
+                "-DCMAKE_CXX_COMPILER=clang++",
+                "-DCMAKE_LINKER=lld",
+                "-DCMAKE_EXE_LINKER_FLAGS=-fuse-ld=lld",
+                "-DCMAKE_MODULE_LINKER_FLAGS=-fuse-ld=lld",
+                "-DCMAKE_SHARED_LINKER_FLAGS=-fuse-ld=lld",
+            ]
 
         # Note that asan doesn't work with binaries that use the GPU, so this is
         # only useful for tools like triton-opt that don't run code on the GPU.
@@ -303,19 +315,22 @@ def build_extension(self, ext):
     src_path="bin/ptxas",
     variable="TRITON_PTXAS_PATH",
     version="12.1.105",
-    url_func=lambda arch, version: f"https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-{arch}/cuda-nvcc-{version}-0.tar.bz2",
+    url_func=lambda arch, version:
+    f"https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-{arch}/cuda-nvcc-{version}-0.tar.bz2",
 )
 download_and_copy(
     src_path="bin/cuobjdump",
     variable="TRITON_CUOBJDUMP_PATH",
     version="12.1.111",
-    url_func=lambda arch, version: f"https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-{arch}/cuda-cuobjdump-{version}-0.tar.bz2",
+    url_func=lambda arch, version:
+    f"https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-{arch}/cuda-cuobjdump-{version}-0.tar.bz2",
 )
 download_and_copy(
     src_path="bin/nvdisasm",
     variable="TRITON_NVDISASM_PATH",
     version="12.1.105",
-    url_func=lambda arch, version: f"https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-{arch}/cuda-nvdisasm-{version}-0.tar.bz2",
+    url_func=lambda arch, version:
+    f"https://conda.anaconda.org/nvidia/label/cuda-12.1.1/linux-{arch}/cuda-nvdisasm-{version}-0.tar.bz2",
 )
 
 setup(
@@ -339,9 +354,7 @@ def build_extension(self, ext):
         "triton/third_party",
         "triton/tools",
     ],
-    install_requires=[
-        "filelock"
-    ],
+    install_requires=["filelock"],
     include_package_data=True,
     ext_modules=[CMakeExtension("triton", "triton/_C/")],
     cmdclass={"build_ext": CMakeBuild, "build_py": CMakeBuildPy, "clean": CMakeClean},

@@ -13,8 +13,7 @@
 
 import triton
 import triton.language as tl
-from triton.common.backend import (BaseBackend, compute_core_version_key,
-                                   register_backend)
+from triton.common.backend import (BaseBackend, compute_core_version_key, register_backend)
 from triton.common.build import quiet
 from triton.compiler.make_launcher import make_so_cache_key
 from triton.runtime.cache import get_cache_manager
@@ -81,6 +80,7 @@ def build_for_backend(name, src, srcdir):
 
 
 class ExtensionUtils:
+
     def __new__(cls):
         if not hasattr(cls, 'instance'):
             cls.instance = super(ExtensionUtils, cls).__new__(cls)
@@ -110,6 +110,7 @@ def __init__(self):
 
 
 class ExtensionDriver(DriverBase):
+
     def __new__(cls):
         if not hasattr(cls, 'instance'):
             cls.instance = super(ExtensionDriver, cls).__new__(cls)
@@ -256,13 +257,13 @@ def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK: tl.constexpr):
 
     inp = torch.randn(10)
     out = torch.randn(10)
-    kernel[(10,)](inp, out, 10, XBLOCK=16)
+    kernel[(10, )](inp, out, 10, XBLOCK=16)
     spec = importlib.util.spec_from_file_location("__triton_launcher", ExtensionBackend.stub_so_path)
     mod = importlib.util.module_from_spec(spec)
     spec.loader.exec_module(mod)
     launch_counter = getattr(mod, "launch_counter")
 
     for _ in range(100):
-        kernel[(10,)](inp, out, 10, XBLOCK=16)
+        kernel[(10, )](inp, out, 10, XBLOCK=16)
 
     assert launch_counter() > 0
@@ -4,9 +4,7 @@
 
 
 def pytest_addoption(parser):
-    parser.addoption(
-        "--backend", action="store", default="", help="Codegen backend"
-    )
+    parser.addoption("--backend", action="store", default="", help="Codegen backend")
 
 
 @pytest.fixture

@@ -24,10 +24,10 @@ def kernel(x_ptr, y_ptr, out_ptr):
 
         if has_ipex:
             for _ in range(1000):
-                x = torch.randn((65536,), device="xpu", dtype=torch.float32)
-                y = torch.randn((65536,), device="xpu", dtype=torch.float32)
-                z = torch.zeros((65536,), device="xpu", dtype=torch.float32)
-                kernel[(65536,)](x, y, z, num_warps=32)
+                x = torch.randn((65536, ), device="xpu", dtype=torch.float32)
+                y = torch.randn((65536, ), device="xpu", dtype=torch.float32)
+                z = torch.zeros((65536, ), device="xpu", dtype=torch.float32)
+                kernel[(65536, )](x, y, z, num_warps=32)
                 assert torch.all(x + y == z)
     else:
         return
Original file line number	Diff line number	Diff line change
Expand Up		@@ -10,4 +10,4 @@ def kernel(X, stride_xm, stride_xn, BLOCK: tl.constexpr):


 X = torch.randn(1, device="cuda")
-pgm = kernel[(1,)](X, 1, 1, BLOCK=1024)
+pgm = kernel[(1, )](X, 1, 1, BLOCK=1024)