Add instruction count benchmark to run on pull requests (pytorch#131475)
This PR only adds execution of the benchmarks on the current PR and prints the results; follow-up diffs will add checking out head~1, running the benchmarks there, and comparing the two runs.

To access the results, go to the pr_time_benchmarks test job and inspect its logs; you should see:
```
+ echo 'benchmark results on current PR: '
benchmark results on current PR:
+ cat /var/lib/jenkins/workspace/test/test-reports/pr_time_benchmarks_before.txt
update_hint_regression,instruction_count,27971461254
```
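
Each result row is a CSV triple of benchmark name, metric, and value. The comparison against head~1 is explicitly left to follow-up diffs; as a rough sketch of what that step could look like (the file names and the 1% threshold below are hypothetical, not part of this PR):

```python
# Hypothetical sketch of the planned comparison step; not part of this PR.
# Assumes two CSV files whose rows look like: update_hint_regression,instruction_count,27971461254
import csv
import sys


def load(path):
    with open(path, newline="") as f:
        return {(name, metric): int(value) for name, metric, value in csv.reader(f)}


def main():
    base = load(sys.argv[1])  # results from head~1
    head = load(sys.argv[2])  # results from the current PR
    threshold = 0.01          # hypothetical 1% tolerance
    for key, head_value in head.items():
        if key in base and head_value > base[key] * (1 + threshold):
            print(f"possible regression in {key[0]} ({key[1]}): {base[key]} -> {head_value}")


if __name__ == "__main__":
    main()
```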

Pull Request resolved: pytorch#131475
Approved by: https://github.com/ezyang
laithsakka authored and pytorchmergebot committed Aug 12, 2024
1 parent 27c44c8 commit f5e704a
Showing 34 changed files with 287 additions and 26 deletions.
15 changes: 14 additions & 1 deletion .ci/pytorch/test.sh
@@ -392,7 +392,20 @@ test_inductor_cpp_wrapper_abi_compatible() {
# .github/workflows/inductor-perf-test-nightly.yml
DYNAMO_BENCHMARK_FLAGS=()

if [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then
pr_time_benchmarks() {

  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"
  source benchmarks/dynamo/pr_time_benchmarks/benchmark_runner.sh "$TEST_REPORTS_DIR/pr_time_benchmarks_after.txt" "benchmarks/dynamo/pr_time_benchmarks/benchmarks"
  echo "benchmark results on current PR: "
  cat "$TEST_REPORTS_DIR/pr_time_benchmarks_after.txt"

}

if [[ "${TEST_CONFIG}" == *pr_time_benchmarks* ]]; then
  pr_time_benchmarks
  exit 0
elif [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--backend eager)
elif [[ "${TEST_CONFIG}" == *aot_eager* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--backend aot_eager)
22 changes: 21 additions & 1 deletion .github/workflows/pull.yml
@@ -110,7 +110,6 @@ jobs:
          { config: "default", shard: 1, num_shards: 1 },
        ]}
  linux-jammy-py3_10-clang15-asan-build:
    name: linux-jammy-py3.10-clang15-asan
    uses: ./.github/workflows/_linux-build.yml
@@ -571,3 +570,24 @@ jobs:
      docker-image: ${{ needs.linux-focal-py3_12-clang10-experimental-split-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-focal-py3_12-clang10-experimental-split-build.outputs.test-matrix }}
      timeout-minutes: 600

  linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
    name: cuda12.1-py3.10-gcc9-sm75
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm75
      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '7.5'
      test-matrix: |
        { include: [
          { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
        ]}

  linux-focal-cuda12_1-py3_10-gcc9-inductor-test:
    name: cuda12.1-py3.10-gcc9-sm75
    uses: ./.github/workflows/_linux-test.yml
    needs: linux-focal-cuda12_1-py3_10-gcc9-inductor-build
    with:
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm75
      docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.test-matrix }}
Empty file added benchmarks/__init__.py
64 changes: 64 additions & 0 deletions benchmarks/dynamo/pr_time_benchmarks/benchmark_base.py
@@ -0,0 +1,64 @@
import csv
from abc import ABC, abstractmethod

import torch._C._instruction_counter as i_counter


class BenchmarkBase(ABC):
    _instruction_count = False

    def enable_instruction_count(self):
        self._instruction_count = True
        return self

    def name(self):
        return ""

    def description(self):
        return ""

    @abstractmethod
    def prepare(self):
        pass

    @abstractmethod
    def work(self):
        pass

    def prepare_once(self):  # noqa: B027
        pass

    def count_instructions(self):
        print(f"collecting instruction count for {self.name()}")
        self.prepare_once()

        results = []
        for i in range(10):
            self.prepare()
            id = i_counter.start()
            self.work()
            count = i_counter.end(id)
            print(f"instruction count for iteration {i} is {count}")
            if i != 0:
                results.append(count)
        return min(results)

    def append_results(self, path):
        with open(path, "a", newline="") as csvfile:
            # Create a writer object
            writer = csv.writer(csvfile)
            # Write the data to the CSV file
            for entry in self.results:
                writer.writerow(entry)

    def print(self):
        for entry in self.results:
            print(f"{entry[0]},{entry[1]},{entry[2]}")

    def collect_all(self):
        self.results = []
        if self._instruction_count:
            self.results.append(
                (self.name(), "instruction_count", self.count_instructions())
            )
        return self
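
For orientation, a benchmark is expected to subclass BenchmarkBase, implement the abstract prepare/work hooks, and chain the fluent helpers; the update_hint_regression benchmark further below follows exactly this pattern. A minimal hypothetical example (the class name and workload are illustrative only):

```python
# Hypothetical minimal benchmark built on BenchmarkBase; the workload is illustrative only.
import sys

from benchmarks.dynamo.pr_time_benchmarks.benchmark_base import BenchmarkBase

import torch


class AddBenchmark(BenchmarkBase):
    def name(self):
        return "add_two_tensors"

    def prepare_once(self):
        # One-time setup, run before any measured iteration.
        self.a = torch.randn(1024)
        self.b = torch.randn(1024)

    def prepare(self):
        # Per-iteration setup; nothing to reset for this toy workload.
        pass

    def work(self):
        # The region whose instruction count is measured.
        self.a + self.b


if __name__ == "__main__":
    # argv[1] is the CSV file the runner script passes in.
    AddBenchmark().enable_instruction_count().collect_all().append_results(sys.argv[1])
```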
25 changes: 25 additions & 0 deletions benchmarks/dynamo/pr_time_benchmarks/benchmark_runner.sh
@@ -0,0 +1,25 @@
#!/bin/bash
# Check if the output file argument was provided
if [ $# -eq 0 ]
then
  echo "Please provide the output file as an argument"
  return
fi

# Check if the directory of Python programs argument was provided
if [ $# -eq 1 ]
then
  echo "Please provide the directory of Python programs as an argument"
  return
fi

# Set the output file
output_file=$1
# Set the directory of Python programs
python_programs_dir=$2
# Loop through all files in the directory of Python programs
for file in $python_programs_dir/*.py
do
  # Execute the Python program and append the output to the output file
  sudo env PATH="$PATH" python $file $output_file
done
@@ -0,0 +1,46 @@
import random
import sys

from benchmarks.dynamo.pr_time_benchmarks.benchmark_base import BenchmarkBase

import torch


class Benchmark(BenchmarkBase):
    N = 20

    def name(self):
        return "update_hint_regression"

    def description(self):
        return "information at https://github.com/pytorch/pytorch/pull/129893"

    def prepare_once(self):
        torch._dynamo.config.capture_scalar_outputs = True
        random.seed(42)
        self.splits = torch.randint(10, (self.N,))
        sz = self.splits.sum().item()
        self.input = torch.randn(sz)

    def prepare(self):
        torch._dynamo.reset()

    def work(self):
        @torch.compile(fullgraph=True)
        def f(a, b):
            xs = b.tolist()
            for x in xs:
                torch._check_is_size(x)
                torch._check(x <= self.N)
            return a.split(xs)

        f(self.input, self.splits)


def main():
    result_path = sys.argv[1]
    Benchmark().enable_instruction_count().collect_all().append_results(result_path)


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions benchmarks/instruction_counts/applications/ci.py
@@ -1,4 +1,5 @@
"""Collect instruction counts for continuous integration."""
# mypy: ignore-errors
import argparse
import hashlib
import json
1 change: 1 addition & 0 deletions benchmarks/instruction_counts/core/api.py
@@ -1,4 +1,5 @@
"""Key enums and structs used to handle data flow within the benchmark."""
# mypy: ignore-errors
import dataclasses
import enum
import itertools as it
1 change: 1 addition & 0 deletions benchmarks/instruction_counts/core/expand.py
@@ -2,6 +2,7 @@
This is mostly string manipulation, with just a bit of importlib magic.
"""
# mypy: ignore-errors
import importlib.abc
import importlib.util
import itertools as it
1 change: 1 addition & 0 deletions benchmarks/instruction_counts/core/types.py
@@ -1,4 +1,5 @@
"""Type annotations for various benchmark objects."""
# mypy: ignore-errors
from typing import Any, Dict, Optional, Tuple, Union

from core.api import AutoLabels, GroupedBenchmark, TimerArgs
1 change: 1 addition & 0 deletions benchmarks/instruction_counts/core/utils.py
@@ -1,3 +1,4 @@
# mypy: ignore-errors
import atexit
import re
import shutil
2 changes: 1 addition & 1 deletion benchmarks/instruction_counts/definitions/setup.py
@@ -1,5 +1,5 @@
"""Define some common setup blocks which benchmarks can reuse."""

# mypy: ignore-errors
import enum

from core.api import GroupedSetup
1 change: 1 addition & 0 deletions benchmarks/instruction_counts/definitions/standard.py
@@ -11,6 +11,7 @@
- To set a label for the succeeding block, add `# @YOUR_LABEL` (Python)
or `// @YOUR_LABEL` (C++).
"""
# mypy: ignore-errors

from core.api import GroupedModules, GroupedStmts, GroupedVariants
from core.types import FlatIntermediateDefinition
1 change: 1 addition & 0 deletions benchmarks/instruction_counts/execution/runner.py
@@ -1,4 +1,5 @@
"""Run benchmarks while handling parallelism, isolation, and fault tolerance."""
# mypy: ignore-errors
import math
import multiprocessing
import subprocess
1 change: 1 addition & 0 deletions benchmarks/instruction_counts/execution/work.py
@@ -1,4 +1,5 @@
"""Handle the details of subprocess calls and retries for a given benchmark run."""
# mypy: ignore-errors
import dataclasses
import json
import os
1 change: 1 addition & 0 deletions benchmarks/instruction_counts/main.py
@@ -5,6 +5,7 @@
components) in future iterations. However this allows us to exercise the
underlying benchmark generation infrastructure in the meantime.
"""
# mypy: ignore-errors
import argparse
import sys
from typing import List
3 changes: 1 addition & 2 deletions benchmarks/operator_benchmark/benchmark_all_other_test.py
@@ -1,3 +1,4 @@
import operator_benchmark as op_bench
from pt import ( # noqa: F401
    add_test,
    ao_sparsifier_test,
@@ -29,8 +30,6 @@
    tensor_to_test,
)

import operator_benchmark as op_bench


if __name__ == "__main__":
op_bench.benchmark_runner.main()
@@ -1,3 +1,4 @@
import operator_benchmark as op_bench
from pt import ( # noqa: F401
    qactivation_test,
    qarithmetic_test,
@@ -21,8 +22,6 @@
    qunary_test,
)

import operator_benchmark as op_bench


if __name__ == "__main__":
op_bench.benchmark_runner.main()
3 changes: 1 addition & 2 deletions benchmarks/operator_benchmark/benchmark_all_test.py
@@ -1,8 +1,7 @@
import benchmark_all_other_test # noqa: F401
import benchmark_all_quantized_test # noqa: F401
from pt import unary_test # noqa: F401

import operator_benchmark as op_bench
from pt import unary_test # noqa: F401


if __name__ == "__main__":
3 changes: 1 addition & 2 deletions benchmarks/operator_benchmark/pt/conv_test.py
@@ -1,6 +1,5 @@
from pt import configs

import operator_benchmark as op_bench
from pt import configs

import torch
import torch.nn as nn
3 changes: 1 addition & 2 deletions benchmarks/operator_benchmark/pt/embeddingbag_test.py
@@ -1,7 +1,6 @@
import numpy
from pt import configs

import operator_benchmark as op_bench
from pt import configs

import torch

1 change: 0 additions & 1 deletion benchmarks/operator_benchmark/pt/gather_test.py
@@ -1,5 +1,4 @@
import numpy

import operator_benchmark as op_bench

import torch
1 change: 0 additions & 1 deletion benchmarks/operator_benchmark/pt/index_select_test.py
@@ -1,5 +1,4 @@
import numpy

import operator_benchmark as op_bench

import torch
3 changes: 1 addition & 2 deletions benchmarks/operator_benchmark/pt/linear_test.py
@@ -1,6 +1,5 @@
from pt import configs

import operator_benchmark as op_bench
from pt import configs

import torch
import torch.nn as nn
3 changes: 1 addition & 2 deletions benchmarks/operator_benchmark/pt/qatembedding_ops_test.py
@@ -1,7 +1,6 @@
import numpy
from pt import configs

import operator_benchmark as op_bench
from pt import configs

import torch
import torch.ao.nn.qat as nnqat
3 changes: 1 addition & 2 deletions benchmarks/operator_benchmark/pt/qconv_test.py
@@ -1,6 +1,5 @@
from pt import configs

import operator_benchmark as op_bench
from pt import configs

import torch
import torch.ao.nn.quantized as nnq
@@ -1,7 +1,6 @@
from typing import Optional

import numpy as np

import operator_benchmark as op_bench

import torch
3 changes: 1 addition & 2 deletions benchmarks/operator_benchmark/pt/qembeddingbag_test.py
@@ -1,7 +1,6 @@
import numpy
from pt import configs

import operator_benchmark as op_bench
from pt import configs

import torch
import torch.ao.nn.quantized as nnq
3 changes: 1 addition & 2 deletions benchmarks/operator_benchmark/pt/qlinear_test.py
@@ -1,6 +1,5 @@
from pt import configs

import operator_benchmark as op_bench
from pt import configs

import torch
import torch.ao.nn.quantized as nnq
1 change: 1 addition & 0 deletions build_variables.bzl
@@ -913,6 +913,7 @@ libtorch_python_core_sources = [
    "torch/csrc/utils/disable_torch_function.cpp",
    "torch/csrc/utils/verbose.cpp",
    "torch/csrc/cpu/Module.cpp",
    "torch/csrc/instruction_counter/Module.cpp",
] + lazy_tensor_core_python_sources

libtorch_python_distributed_core_sources = [
2 changes: 2 additions & 0 deletions torch/csrc/Module.cpp
@@ -70,6 +70,7 @@
#include <torch/csrc/functorch/init.h>
#include <torch/csrc/fx/node.h>
#include <torch/csrc/inductor/aoti_runner/pybind.h>
#include <torch/csrc/instruction_counter/Module.h>
#include <torch/csrc/jit/python/init.h>
#include <torch/csrc/jit/python/python_ir.h>
#include <torch/csrc/jit/python/python_tracer.h>
@@ -1698,6 +1699,7 @@ PyObject* initModule() {
#endif
  torch::mtia::initModule(module);
  torch::cpu::initModule(module);
  torch::instruction_counter::initModule(module);
  torch::initVerboseBindings(module);
  ASSERT_TRUE(THPStorage_init(module));
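
The counter implementation itself (torch/csrc/instruction_counter/Module.cpp and its header) is registered above but not rendered in this excerpt. On the Python side, benchmark_base.py accesses it through torch._C._instruction_counter; a minimal sketch of that usage, mirroring count_instructions above (the measured workload here is arbitrary):

```python
# Minimal sketch mirroring how benchmark_base.py uses the counter binding:
# start() returns a handle and end(handle) returns the instruction count for the region.
import torch._C._instruction_counter as i_counter

import torch

handle = i_counter.start()
torch.randn(128).sum()  # arbitrary workload chosen for illustration
count = i_counter.end(handle)
print(f"instruction_count,{count}")
```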
