From d6c3a8d9e0c1dfb570733b957e643b5cebd2340e Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 7 Jan 2025 23:22:16 +0000 Subject: [PATCH] Don't do a loadgen release from dev branch, add python3.12,13 to loadgen test, exclude power-checker from auto format (#1994) * Update generate_final_report.py * Fix sdxl (#1911) * Fix typo in fid_score.py, fail_safe for SDXL short runs * [Automated Commit] Format Codebase * Fix typo in fid_score.py, fail_safe for SDXL short runs * Fix dlrmv2 reference implementation | Update run_local.sh * Fixes for filtering invalid results * [Automated Commit] Format Codebase * Update preprocess_submission.py * Added an option to pass in sample_ids.txt for SDXL accuracy check * [Automated Commit] Format Codebase * Update accuracy_coco.py * [Automated Commit] Format Codebase * Fix typo * Not use default for sample_ids.txt * Update requirements.txt (#1907) Updating the pip packages * Fix preprocess_sudbmission for a bug * Update submission_checker.py | Removed TEST05 * Fix to SDXL accuracy output * Added exists checks for rmtree in preprocess_submission script * [Automated Commit] Format Codebase * Delete .github/workflows/format.yml * Delete .github/scripts directory * Update build_wheels.yml | Added src distribution * Update VERSION.txt * Update build_wheels.yml * Update VERSION.txt * Update pyproject.toml * Increment version to 4.1.26 * Update MANIFEST.in * Increment version to 4.1.27 * Update pyproject.toml * Increment version to 4.1.28 * Update build_wheels.yml * Update VERSION.txt * Update accuracy_coco.py * Making sdxl run thread safe * Create format.yml | Run format on push instead of PR * Update backend_pytorch.py | Fix lock usage * Upgrade loadgen version to 5.0 (#1962) * Fix loadgen build for version numbers having "0" (#1967) * Fix loadgen build for version numbers having "0" * Update test-resnet50.yml * Update test-retinanet.yml * Update test-bert.yml * Increment version to 5.0.1 * Fix Dockerfile for 405B (#1960) Co-authored-by: Miro * Add llama3 metrics + remove llama3-99.9 (#1973) * Fix submission checker for v5.0 rgat (#1974) * Fix submission checker for v5.0 rgat * Update submission_checker.py | Updates for v5.0 * [Automated Commit] Format Codebase * Update submission_checker.py | Fixes latency_constraints for v5.0 * [Automated Commit] Format Codebase --------- Co-authored-by: mlcommons-bot * Fix test05 seeds missing error for v5.0 submission checker (#1976) * Fix llama3-405B docker workflow and performance sample count (#1978) * Fix llama3-405B docker workflow * Fix the performance sample count from 8312 to 8313 * More fixes * Increment version to 5.0.2 * Fix submission generation for v5.0 (#1981) * Fix submission checker for v5.0 rgat * Fix accuracy pattern for rgat, report-generator for v5.0 * More minor fixes for llama3.1-405b (#1983) * More minor fixes * Fix indentation for stats report * Remove unused rgat files (#1961) Co-authored-by: Miro * Update docker GPU, avoid long build time (#1966) Co-authored-by: Miro * Require equal issue mode for R-GAT (#1968) * Require equal issue mode for R-GAT * Add equal issue note in readme --------- Co-authored-by: Miro * Increment version to 5.0.3 * Docs update for r-gat (#1969) * Fixes #1648, restrict loadgen uncommitted error message to within the loadgen directory * Update test-rnnt.yml (#1688) Stopping the github action for rnnt * Added docs init Added github action for website publish Update benchmark documentation Update publish.yaml Update publish.yaml Update benchmark documentation Improved the 
submission documentation Fix taskname Removed unused images * Fix benchmark URLs * Fix links * Add _full variation to run commands * Added script flow diagram * Added docker setup command for CM, extra run options * Added support for docker options in the docs * Added --quiet to the CM run_cmds in docs * Fix the test query count for cm commands * Support ctuning-cpp implementation * Added commands for mobilenet models * Docs cleanup * Docs cleanup * Added separate files for dataset and models in the docs * Remove redundant tab in the docs * Fixes some WIP models in the docs * Use the official docs page for CM installation * Fix the deadlink in docs * Fix indendation issue in docs * Added dockerinfo for nvidia implementation * Added run options for gptj * Added execution environment tabs * Cleanup of the docs * Cleanup of the docs * Reordered the sections of the docs page * Removed an unnecessary heading in the docs * Fixes the commands for datacenter * Fix the build --sdist for loadgen * Fixes #1761, llama2 and mixtral runtime error on CPU systems * Added mixtral to the benchmark list, improved benchmark docs * Update docs for MLPerf inference v4.1 * Update docs for MLPerf inference v4.1 * Fix typo * Gave direct link to implementation readmes * Added tables detailing implementations * Update vision README.md, split the frameworks into separate rows * Update README.md * pointed links to specific frameworks * pointed links to specific frameworks * Update Submission_Guidelines.md * Update Submission_Guidelines.md * Update Submission_Guidelines.md * api support llama2 * Added request module and reduced max token len * Fix for llama2 api server * Update SUT_API offline to work for OpenAI * Update SUT_API.py * Minor fixes * Fix json import in SUT_API.py * Fix llama2 token length * Added model name verification with server * clean temp files * support num_workers in LLAMA2 SUTs * Remove batching from Offline SUT_API.py * Update SUT_API.py * Minor fixes for llama2 API * Fix for llama2 API * removed table of contents * enabled llama2-nvidia + vllm-NM : WIP * enabled dlrm for intel * lower cased implementation * added raw data input * corrected data download commands * renamed filename * changes for bert and vllm * documentation to work on custom repo and branch * benchmark index page update * enabled sdxl for nvidia and intel * updated vllm server run cmd * benchmark page information addition * fix indendation issue * Added submission categories * update submission page - generate submission with or w/o using CM for benchmarking * Updated kits dataset documentation * Updated model parameters * updation of information * updated non cm based benchmark * added info about hf password * added links to model and access tokens * Updated reference results structuree tree * submission docs cleanup * Some cleanups for benchmark info * Some cleanups for benchmark info * Some cleanups for benchmark info * added generic stubs deepsparse * Some cleanups for benchmark info * Some cleanups for benchmark info * Some cleanups for benchmark info * Some cleanups for benchmark info (FID and CLIP data added) * typo fix for bert deepsparse framework * added min system requirements for models * fixed code version * changes for displaying reference and intel implementation tip * added reference to installation page * updated neural magic documentation * Added links to the install page, redirect benchmarks page * added tips about batch size and dataset for nvidia llama2 * fix conditions logic * modified tips and additional 
run cmds * sentence corrections * Minor fix for the documentation * fixed bug in deepsparse generic model stubs + styling * added more information to stubs * Added SCC24 readme, support reproducibility in the docs * Made clear the custom CM repo URL format * Support conditional implementation, setup and run tips * Support rocm for sdxl * Fix _short tag support * Fix install URL * Expose bfloat16 and float16 options for sdxl * Expose download model to host option for sdxl * IndySCC24 documentation added * Improve the SCC24 docs * Improve the support of short variation * Improved the indyscc24 documentation * Updated scc run commands * removed test_query_count option for scc * Remove scc24 in the main docs * Remove scc24 in the main docs * Fix docs: indendation issue on the submission page * generalised code for skipping test query count * Fixes for SCC24 docs * Fix scenario text in main.py * Fix links for scc24 * Fix links for scc24 * Improve the general docs * Fix links for scc24 * Use float16 in scc24 doc * Improve scc24 docs * Improve scc24 docs * Use float16 in scc24 doc * fixed command bug * Fix typo in docs * Fix typo in docs * Remove unnecessary indendation in docs * initial commit for tip - native run CUDA * Updated tip * added docker_cm_repo_branch to more run option - docker * Update docs for IndySCC24 * Support custom repo branch and owner for final report generation * enabled amd implementation for llama2 * updations for amd - docs * Fix scenarios in docs page * formatted the files to pass the gh action * scenarios -> fixed_scenarios in docs * [Automated Commit] Format Codebase * Update indyscc24-bert.md * Update scc24.md * updated tip for reference implementation (#1912) * [Automated Commit] Format Codebase * fix for run suffix (#1913) * [Automated Commit] Format Codebase * Updation for adding submission flow diagram * Added submission flow diagram * Update scc24.md * changes in submission documentation (#1946) * update results category (#1947) * changes for adding rgat to docs (#1965) * Update index.md | Added R-GAT details (WIP) * Update index.md * Create system_requirements.yml * Update system_requirements.yml * Update system_requirements.yml * Update system_requirements.yml --------- Co-authored-by: anandhu-eng Co-authored-by: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Co-authored-by: Michael Goin Co-authored-by: arjunsuresh Co-authored-by: Pablo Gonzalez Co-authored-by: Mitchelle Rasquinha <80070689+mrasquinha-g@users.noreply.github.com> Co-authored-by: Miro * [Automated Commit] Format Codebase * Update automated run command section - R-GAT (#1970) * Update automated run command section * add cm commands for model and dataset downloads * Update README.md * Update cm run cmds --------- Co-authored-by: Miro * Unify llama3 names to llama3.1-405b (#1982) * Unify llama3 names to llama3.1-405b * Set mlperf.conf name to llama3_1-405b * Increment version to 5.0.4 * Create test-rgat.yml (#1984) * Create test-rgat.yml * Update test-rgat.yml * Update test-rgat.yml --------- Co-authored-by: Miro * Update compliance test table (#1987) Co-authored-by: Miro * Create benchmark-checklist.md for r-gat (#1985) * Create benchmark-checklist.md for r-gat * Update benchmark-checklist.md * Update benchmark-checklist.md * Update benchmark-checklist.md * Update benchmark-checklist.md * Update benchmark-checklist.md * Update benchmark-checklist.md * Update benchmark-checklist.md * Update benchmark-checklist.md * Update benchmark-checklist.md * Update benchmark-checklist.md * Update 
benchmark-checklist.md --------- Co-authored-by: Miro * Increment version to 5.0.5 * Added python3.12, 3.13 to loadgen test * Update format.yml | Don't format power_checker being synced from power-dev repo * Update index.md | Update accuracy for r-gat * Update benchmark-checklist.md for r-gat * Update CM commands in R-GAT README.md * Update README.md * Create reset-branch.yml * Create auto-update-dev.yml * Tested and fixed SDXL README (#1997) * Update SDXL README.md, improved CM commands * Update README.md | Fix SDXL model download path * Update README.md | Added cm command for downloading coco2014 size.50 * Update README.md | Fix SDXL calibration download command * Update SDXL README.md * Update README.md * Update preprocess_submission.py * Update README.md * Update README.md | added the outdirname in the CM command * Update README.md | added the outdirname in the CM Command * include cm commands - accuracy and calibration * Update README.md * Update README.md | added the outdirname in the CM command * Update README.md| added outdirname in the CM command * Support audit.conf with static mlperf.conf * Support audit.conf with static mlperf.conf * [Automated Commit] Format Codebase * Update test_settings_internal.cc | Fix conf_type usage * Update test_settings_internal.cc * Fixes to submission checker * [Automated Commit] Format Codebase * [Automated Commit] Format Codebase * Update submission_checker.py | Fix rgat performance_sample_count * Update evaluate-accuracy.py | Fixes #2008 * Update index.md * Update index.md * Update index.md * Update submission generation steps (WIP) * add submission generation graphs for local sync and through github repo (#2016) * add graphs for local sync and through github repo * Update index.md * Update index.md * Update index.md * Update index.md * Update index.md * Update index.md * Fixes to submission generation docs * Fixes to submission generation docs * Added link to the expected results folder structure * add docs for llama3 + inference version upgrade (#2020) * add docs for llama3 + inference version upgrade * add output path and hf token * Update CM run commands for llama3_1-405b (#2019) * Update CM run commands for llama3_1-405b * Update cm commands for llama3 * add information about hf tokens * Fixes the submission README * Update README.md * Create test-submission-generation.yml * Update test-submission-generation.yml * Clean invalid model results in preprocess_submission script * [Automated Commit] Format Codebase * Fixes the submission README * Update README.md * Update README.md * Update test-submission-generation.yml --------- Co-authored-by: arjunsuresh Co-authored-by: Zhihan Jiang <68881590+nvzhihanj@users.noreply.github.com> Co-authored-by: pgmpablo157321 Co-authored-by: Miro Co-authored-by: Pablo Gonzalez Co-authored-by: mlcommons-bot Co-authored-by: mrmhodak Co-authored-by: anandhu-eng Co-authored-by: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Co-authored-by: Michael Goin Co-authored-by: Mitchelle Rasquinha <80070689+mrasquinha-g@users.noreply.github.com> Co-authored-by: sahilavaran <139779393+sahilavaran@users.noreply.github.com> --- .github/workflows/auto-update-dev.yml | 34 +++ .github/workflows/build_wheels.yml | 1 - .github/workflows/format.yml | 2 +- .github/workflows/reset-branch.yml | 42 +++ .github/workflows/test-loadgen.yml | 2 +- .../workflows/test-submission-generation.yml | 52 ++++ .../language/get-llama3_1-405b-data.md | 41 +++ docs/benchmarks/language/llama3_1-405b.md | 13 + docs/index.md | 2 +- 
docs/submission/index.md | 277 ++++++++++-------- graph/R-GAT/README.md | 78 ++--- graph/R-GAT/benchmark-checklist.md | 7 +- language/gpt-j/README.md | 7 +- language/llama3.1-405b/README.md | 33 ++- language/mixtral-8x7b/evaluate-accuracy.py | 1 + loadgen/VERSION.txt | 2 +- loadgen/bindings/python_api.cc | 7 +- loadgen/loadgen.cc | 2 +- loadgen/test_settings.h | 2 +- loadgen/test_settings_internal.cc | 32 +- main.py | 10 +- mkdocs.yml | 1 + text_to_image/README.md | 60 ++-- tools/submission/README.md | 2 + tools/submission/preprocess_submission.py | 14 +- tools/submission/submission_checker.py | 56 ++-- tools/submission/truncate_accuracy_log.py | 2 +- 27 files changed, 535 insertions(+), 247 deletions(-) create mode 100644 .github/workflows/auto-update-dev.yml create mode 100644 .github/workflows/reset-branch.yml create mode 100644 .github/workflows/test-submission-generation.yml create mode 100644 docs/benchmarks/language/get-llama3_1-405b-data.md create mode 100644 docs/benchmarks/language/llama3_1-405b.md diff --git a/.github/workflows/auto-update-dev.yml b/.github/workflows/auto-update-dev.yml new file mode 100644 index 000000000..69cfdb281 --- /dev/null +++ b/.github/workflows/auto-update-dev.yml @@ -0,0 +1,34 @@ +name: Auto-Update Dev Branch from Master + +on: + push: + branches: + - master # Trigger workflow on commits to 'dev' branch + +jobs: + update-main: + runs-on: ubuntu-latest + permissions: + contents: write # Required to push to protected branches + + steps: + - name: Checkout Main Branch + uses: actions/checkout@v4 + with: + ref: dev + fetch-depth: 0 + ssh-key: ${{ secrets.DEPLOY_KEY }} + + - name: Configure Git User + run: | + git config user.name "github-actions" + git config user.email "github-actions@github.com" + + - name: Merge auto-update into dev + run: | + git fetch origin master:master + git merge --no-ff master -m "Auto-merge updates from master branch" + + - name: Push Changes to Main + run: | + git push origin dev diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 6f67f56de..91282ec9c 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -7,7 +7,6 @@ on: branches: - master - loadgen-release - - dev paths: - loadgen/** diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index dbf9a78bb..2ed014d04 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -38,7 +38,7 @@ jobs: for FILE in $(git diff --name-only $filter | grep -E '.*\.py$') do # Check if the file still exists in the working tree - if [ -f "$FILE" ]; then + if [ -f "$FILE" ] && [ "$FILE" != "tools/submission/power/power_checker.py" ]; then autopep8 --in-place -a "$FILE" git add "$FILE" fi diff --git a/.github/workflows/reset-branch.yml b/.github/workflows/reset-branch.yml new file mode 100644 index 000000000..76cf0b97e --- /dev/null +++ b/.github/workflows/reset-branch.yml @@ -0,0 +1,42 @@ +name: Reset Current Branch to Upstream After Squash Merge + +on: + workflow_dispatch: + inputs: + branch: + description: 'Branch to reset (leave blank for current branch)' + required: false + default: 'dev' + +jobs: + reset-branch: + runs-on: ubuntu-latest + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Detect Current Branch + if: ${{ inputs.branch == '' }} + run: echo "branch=$(git rev-parse --abbrev-ref HEAD)" >> $GITHUB_ENV + + - name: Use Input Branch + if: ${{ inputs.branch != '' }} + run: echo "branch=${{ inputs.branch }}" >> 
$GITHUB_ENV + + - name: Add Upstream Remote + run: | + git remote add upstream https://github.com/mlcommons/inference.git + git fetch upstream + - name: Reset Branch to Upstream + run: | + git checkout ${{ env.branch }} + git reset --hard upstream/${{ env.branch }} + if: success() + + - name: Force Push to Origin + run: | + git push origin ${{ env.branch }} --force-with-lease + if: success() diff --git a/.github/workflows/test-loadgen.yml b/.github/workflows/test-loadgen.yml index 8e42f625e..d73d5913b 100755 --- a/.github/workflows/test-loadgen.yml +++ b/.github/workflows/test-loadgen.yml @@ -21,7 +21,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/test-submission-generation.yml b/.github/workflows/test-submission-generation.yml new file mode 100644 index 000000000..97afc58cd --- /dev/null +++ b/.github/workflows/test-submission-generation.yml @@ -0,0 +1,52 @@ +# This workflow will test the submission generation using MLPerf Automation + +name: CM based Submission Generation + +on: + pull_request: + branches: [ "master", "dev" ] + paths: + - '.github/workflows/test-submission-generation.yml' + - '**' + - '!**.md' +jobs: + submission_generation: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: [ "3.12" ] + division: ["closed", "open", "closed-open"] + category: ["datacenter", "edge"] + case: ["closed"] + action: ["run", "docker"] + exclude: + - os: macos-latest + - os: windows-latest + - category: "edge" + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install cm4mlops + - name: Pull repo where test cases are uploaded + run: | + git clone -b submission-generation-examples https://github.com/mlcommons/inference.git submission_generation_examples + - name: Run Submission Generation - ${{ matrix.case }} ${{ matrix.action }} ${{ matrix.category }} ${{ matrix.division }} + continue-on-error: true + run: | + if [ "${{ matrix.case }}" == "closed" ]; then + description="Test submission - contains closed edge and datacenter" + elif [ "${{ matrix.case }}" == "closed-power" ]; then + description="Test submission - contains closed-power edge and datacenter results" + fi + # Dynamically set the log group to simulate a dynamic step name + echo "::group::$description" + cm ${{ matrix.action }} script --tags=generate,inference,submission --adr.compiler.tags=gcc --version=v5.0 --clean --preprocess_submission=yes --submission_base_dir=mysubmissions --results_dir=$PWD/submission_generation_tests/${{ matrix.case }}/ --run-checker --submitter=MLCommons --tar=yes --division=${{ matrix.division }} --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --quiet + cm ${{ matrix.action }} script --tags=run,submission,checker --submitter_id_off=mysubmitter_id --tar=yes --submission_dir=mysubmissions/submissions --submission_tar_file=mysubmission.tar.gz diff --git a/docs/benchmarks/language/get-llama3_1-405b-data.md b/docs/benchmarks/language/get-llama3_1-405b-data.md new file mode 100644 index 000000000..7333be64d --- /dev/null +++ b/docs/benchmarks/language/get-llama3_1-405b-data.md @@ -0,0 +1,41 @@ +--- +hide: + - toc +--- + +# Text Summarization using LLAMA3.1-405b + +## Dataset + 
+The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. + +=== "Validation" + + ### Get Validation Dataset + ``` + cm run script --tags=get,dataset,mlperf,inference,llama3,_validation --outdirname= -j + ``` + +=== "Calibration" + + ### Get Calibration Dataset + ``` + cm run script --tags=get,dataset,mlperf,inference,llama3,_calibration --outdirname= -j + ``` + +## Model +The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. + +Get the Official MLPerf LLAMA3.1-405b Model + +=== "Pytorch" + + ### Pytorch + ``` + cm run script --tags=get,ml-model,llama3 --outdirname= --hf_token= -j + ``` + +!!! tip + + Downloading llama3.1-405B model from Hugging Face will require an [**access token**](https://huggingface.co/settings/tokens) which could be generated for your account. Additionally, ensure that your account has access to the [llama3.1-405B](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct) model. + diff --git a/docs/benchmarks/language/llama3_1-405b.md b/docs/benchmarks/language/llama3_1-405b.md new file mode 100644 index 000000000..8163bb1e8 --- /dev/null +++ b/docs/benchmarks/language/llama3_1-405b.md @@ -0,0 +1,13 @@ +--- +hide: + - toc +--- + +# Text Summarization using LLAMA3_1-405b + +=== "MLCommons-Python" + ## MLPerf Reference Implementation in Python + +{{ mlperf_inference_implementation_readme (4, "llama3_1-405b-99", "reference", devices=["CPU","CUDA"]) }} + +{{ mlperf_inference_implementation_readme (4, "llama3_1-405b-99.9", "reference", devices=["CPU","CUDA"]) }} \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index b46d4c274..db9e3e440 100644 --- a/docs/index.md +++ b/docs/index.md @@ -163,7 +163,7 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe - **Dataset Size**: 788,379 - **QSL Size**: 788,379 - **Number of Parameters**: -- **Reference Model Accuracy**: ACC = ? +- **Reference Model Accuracy**: ACC = 72.86% - **Server Scenario Latency Constraint**: N/A - **Equal Issue mode**: True - **High accuracy variant**: No diff --git a/docs/submission/index.md b/docs/submission/index.md index 1050f5fb0..64ef5afa7 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -3,33 +3,17 @@ hide: - toc --- -

-  [figure: Submission Generation Flow]
+Click [here](https://docs.google.com/presentation/d/1cmbpZUpVr78EIrhzyMBnnWnjJrD-mZ2vmSb-yETkTA8/edit?usp=sharing) to view the proposal slide for Common Automation for MLPerf Inference Submission Generation through CM.
-  Figure: MLPerf Inference Submission Generation Flow
- - - -Click [here](https://youtu.be/eI1Hoecc3ho) to view the recording of the workshop: Streamlining your MLPerf Inference results using CM. - -Click [here](https://docs.google.com/presentation/d/1cmbpZUpVr78EIrhzyMBnnWnjJrD-mZ2vmSb-yETkTA8/edit?usp=sharing) to view the prposal slide for Common Automation for MLPerf Inference Submission Generation through CM. - -=== "CM based results" - If you have followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, all the valid results will get aggregated to the `cm cache` folder. The following command could be used to browse the structure of inference results folder generated by CM. - ### Get results folder structure - ```bash - cm find cache --tags=get,mlperf,inference,results,dir | xargs tree - ``` -=== "Non CM based results" - If you have not followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, please make sure that the result directory is structured in the following way. +=== "Custom automation based MLPerf results" + If you have not followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, please make sure that the result directory is structured in the following way. You can see the real examples for the expected folder structure [here](https://github.com/mlcommons/inference/tree/submission-generation-examples). ``` └── System description ID(SUT Name) ├── system_meta.json └── Benchmark └── Scenario ├── Performance - | └── run_x/#1 run for all scenarios + | └── run_1 run for all scenarios | ├── mlperf_log_summary.txt | └── mlperf_log_detail.txt ├── Accuracy @@ -42,13 +26,13 @@ Click [here](https://docs.google.com/presentation/d/1cmbpZUpVr78EIrhzyMBnnWnjJrD | | └── run_x/#1 run for all scenarios | | ├── mlperf_log_summary.txt | | └── mlperf_log_detail.txt - | ├── Accuracy - | | ├── baseline_accuracy.txt - | | ├── compliance_accuracy.txt + | ├── Accuracy # for TEST01 only + | | ├── baseline_accuracy.txt (if test fails in deterministic mode) + | | ├── compliance_accuracy.txt (if test fails in deterministic mode) | | ├── mlperf_log_accuracy.json | | └── accuracy.txt | ├── verify_performance.txt - | └── verify_accuracy.txt #for TEST01 only + | └── verify_accuracy.txt # for TEST01 only |── user.conf └── measurements.json ``` @@ -67,99 +51,160 @@ Click [here](https://docs.google.com/presentation/d/1cmbpZUpVr78EIrhzyMBnnWnjJrD ``` -Once all the results across all the models are ready you can use the following command to generate a valid submission tree compliant with the [MLPerf requirements](https://github.com/mlcommons/policies/blob/master/submission_rules.adoc#inference-1). 
- -## Generate actual submission tree - -=== "Docker run" - ### Docker run - === "Closed" - ### Closed Submission - ```bash - cm docker script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=closed \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` - - === "Open" - ### Open Submission - ```bash - cm docker script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=open \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` - -=== "Native run" - ### Native run - === "Closed" - ### Closed Submission - ```bash - cm run script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=closed \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` - - === "Open" - ### Open Submission - ```bash - cm run script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=open \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` - -* Use `--hw_name="My system name"` to give a meaningful system name. Examples can be seen [here](https://github.com/mlcommons/inference_results_v3.0/tree/main/open/cTuning/systems) - -* Use `--submitter=` if your organization is an official MLCommons member and would like to submit under your organization - -* Use `--hw_notes_extra` option to add additional notes like `--hw_notes_extra="Result taken by NAME" ` - -* Use `--results_dir` option to specify the results folder for Non CM based benchmarks - -* Use `--category` option to specify the category for which submission is generated(datacenter/edge). By default, the category is taken from `system_meta.json` file located in the SUT root directory. - -* Use `--submission_base_dir` to specify the directory to which outputs from preprocess submission script and final submission is to be dumped. No need to provide `--submission_dir` along with this. For `docker run`, use `--submission_base_dir` instead of `--submission_dir`. - -The above command should generate "submission.tar.gz" if there are no submission checker issues and you can upload it to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission). - -## Aggregate Results in GitHub - -If you are collecting results across multiple systems you can generate different submissions and aggregate all of them to a GitHub repository (can be private) and use it to generate a single tar ball which can be uploaded to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission). - -Run the following command after **replacing `--repo_url` with your GitHub repository URL**. +=== "CM automation based results" + If you have followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, all the valid results will get aggregated to the `cm cache` folder. The following command could be used to browse the structure of inference results folder generated by CM. 
+ ### Get results folder structure + ```bash + cm find cache --tags=get,mlperf,inference,results,dir | xargs tree + ``` + + +Once all the results across all the models are ready you can use the following the below section to generate a valid submission tree compliant with the [MLPerf requirements](https://github.com/mlcommons/policies/blob/master/submission_rules.adoc#inference-1). + +## Generate submission folder + +The submission generation flow is explained in the below diagram + +```mermaid +flowchart LR + subgraph Generation [Submission Generation SUT1] + direction TB + A[populate system details] --> B[generate submission structure] + B --> C[truncate-accuracy-logs] + C --> D{Infer low talency results
and/or
filter out invalid results} + D --> yes --> E[preprocess-mlperf-inference-submission] + D --> no --> F[run-mlperf-inference-submission-checker] + E --> F + end + Input((Results SUT1)) --> Generation + Generation --> Output((Submission Folder
SUT1)) +``` + +### Command to generate submission folder ```bash -cm run script --tags=push,github,mlperf,inference,submission \ - --repo_url=https://github.com/GATEOverflow/mlperf_inference_submissions_v4.1 \ - --commit_message="Results on added by " \ - --quiet +cm run script --tags=generate,inference,submission \ + --clean \ + --preprocess_submission=yes \ + --run-checker=yes \ + --submitter=MLCommons \ + --division=closed \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet ``` +!!! tip + * Use `--hw_name="My system name"` to give a meaningful system name. Examples can be seen [here](https://github.com/mlcommons/inference_results_v3.0/tree/main/open/cTuning/systems) + + * Use `--submitter=` if your organization is an official MLCommons member and would like to submit under your organization + + * Use `--hw_notes_extra` option to add additional notes like `--hw_notes_extra="Result taken by NAME" ` + + * Use `--results_dir` option to specify the results folder. It is automatically taken from CM cache for MLPerf automation based runs + + * Use `--submission_dir` option to specify the submission folder. (You can avoid this if you're pushing to github or only running a single SUT and CM will use its cache folder) + + * Use `--division=open` for open division submission + + * Use `--category` option to specify the category for which submission is generated(datacenter/edge). By default, the category is taken from `system_meta.json` file located in the SUT root directory. + + * Use `--submission_base_dir` to specify the directory to which the outputs from preprocess submission script and final submission is added. No need to provide `--submission_dir` along with this. For `docker run`, use `--submission_base_dir` instead of `--submission_dir`. + + +If there are multiple systems where MLPerf results are collected, the same process needs to be repeated on each of them. One we have submission folders on all the SUTs, we need to sync them to make a single submission folder + +=== "Sync Locally" + If you are having results in multiple systems, you need to merge them to one system. You can use `rsync` for this. For example, the below command will sync the submission folder from SUT2 to the one in SUT1. + ``` + rsync -avz username@host1:/ / + ``` + Same needs to be repeated for all other SUTs so that we have the full submissions in SUT1. + + ```mermaid + flowchart LR + subgraph SUT1 [Submission Generation SUT1] + A[Submission Folder SUT1] + end + subgraph SUT2 [Submission Generation SUT2] + B[Submission Folder SUT2] + end + subgraph SUT3 [Submission Generation SUT3] + C[Submission Folder SUT3] + end + subgraph SUTN [Submission Generation SUTN] + D[Submission Folder SUTN] + end + SUT2 --> SUT1 + SUT3 --> SUT1 + SUTN --> SUT1 + + ``` + +=== "Sync via a Github repo" + If you are collecting results across multiple systems you can generate different submissions and aggregate all of them to a GitHub repository (can be private) and use it to generate a single tar ball which can be uploaded to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission). + + Run the following command after **replacing `--repo_url` with your GitHub repository URL**. 
+ + ```bash + cm run script --tags=push,github,mlperf,inference,submission \ + --repo_url=https://github.com/mlcommons/mlperf_inference_submissions_v5.0 \ + --commit_message="Results on added by " \ + --quiet + ``` + + ```mermaid + flowchart LR + subgraph SUT1 [Submission Generation SUT1] + A[Submission Folder SUT1] + end + subgraph SUT2 [Submission Generation SUT2] + B[Submission Folder SUT2] + end + subgraph SUT3 [Submission Generation SUT3] + C[Submission Folder SUT3] + end + subgraph SUTN [Submission Generation SUTN] + D[Submission Folder SUTN] + end + SUT2 -- git sync and push --> G[Github Repo] + SUT3 -- git sync and push --> G[Github Repo] + SUTN -- git sync and push --> G[Github Repo] + SUT1 -- git sync and push --> G[Github Repo] + + ``` + +## Upload the final submission + +!!! warning + If you are using GitHub for consolidating your results, make sure that you have run the [`push-to-github` command](#__tabbed_2_2) on the same system to ensure results are synced as is on the GitHub repository. + +Once you have all the results on the system, you can upload them to the MLCommons submission server as follows: + +=== "via CLI" + You can do the following command which will run the submission checker and upload the results to the MLCommons submission server + ``` + cm run script --tags=run,submission,checker \ + --submitter_id=<> \ + --submission_dir= + ``` +=== "via Browser" + You can do the following command to generate the final submission tar file and then upload to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission). + ``` + cm run script --tags=run,submission,checker \ + --submission_dir= \ + --tar=yes \ + --submission_tar_file=mysubmission.tar.gz + ``` + +```mermaid + flowchart LR + subgraph SUT [Combined Submissions] + A[Combined Submission Folder in SUT1] + end + SUT --> B[Run submission checker] + B --> C[Upload to MLC Submission server] + C --> D[Receive validation email] +``` + + -At the end, you can download the github repo and upload to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission). + diff --git a/graph/R-GAT/README.md b/graph/R-GAT/README.md index 4c31b42aa..561a65e6f 100644 --- a/graph/R-GAT/README.md +++ b/graph/R-GAT/README.md @@ -19,7 +19,9 @@ This is the reference implementation for MLPerf Inference Graph Neural Network. ## Automated command to run the benchmark via MLCommons CM -Please check the official inference documentation [here](https://docs.mlcommons.org/inference/benchmarks/graph/rgat/) +Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/graph/rgat/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. + +You can also do `pip install cm4mlops` and then use `cm` commands for downloading the model and datasets using the commands given in the later sections. 
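As a rough sketch of that install-then-run pattern (the output path is a placeholder; the exact tags for each artifact are listed in the sections below):

```bash
# One-time setup of the MLCommons CM automation tooling
pip install cm4mlops

# Example: fetch the R-GAT model checkpoint via CM (same tags as the
# "Download model through CM" section below; the path is a placeholder)
cm run script --tags=get,ml-model,rgat --outdirname=<path to download the model>
```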
## Setup Set the following helper variables @@ -33,10 +35,7 @@ export MODEL_PATH=$PWD/inference/graph/R-GAT/model/ ```bash git clone --recurse-submodules https://github.com/mlcommons/inference.git --depth 1 ``` -Finally copy the `mlperf.conf` file to the stable diffusion folder -```bash -cp $ROOT_INFERENCE/mlperf.conf $GRAPH_FOLDER -``` + ### Install pytorch **For NVIDIA GPU based runs:** @@ -77,6 +76,13 @@ pip install dgl -f https://data.dgl.ai/wheels/torch-2.1/cu121/repo.html pip install dgl -f https://data.dgl.ai/wheels/torch-2.1/repo.html ``` + +### Download model through CM (Collective Minds) + +``` +cm run script --tags=get,ml-model,rgat --outdirname= +``` + ### Download model using Rclone To run Rclone on Windows, you can download the executable [here](https://rclone.org/install/#windows). @@ -95,15 +101,16 @@ You can then navigate in the terminal to your desired download directory and run rclone copy mlc-inference:mlcommons-inference-wg-public/R-GAT/RGAT.pt $MODEL_PATH -P ``` -### Download model through CM (Collective Minds) -``` -cm run script --tags=get,ml-model,rgat -j -``` ### Download and setup dataset #### Debug Dataset +**CM Command** +``` +cm run script --tags=get,dataset,igbh,_debug --outdirname= +``` + **Download Dataset** ```bash cd $GRAPH_FOLDER @@ -116,13 +123,16 @@ cd $GRAPH_FOLDER python3 tools/split_seeds.py --path igbh --dataset_size tiny ``` + + +#### Full Dataset +**Warning:** This script will download 2.2TB of data + **CM Command** ``` -cm run script --tags=get,dataset,igbh,_debug -j +cm run script --tags=get,dataset,igbh,_full --outdirname= ``` -#### Full Dataset -**Warning:** This script will download 2.2TB of data ```bash cd $GRAPH_FOLDER ./tools/download_igbh_full.sh igbh/ @@ -134,16 +144,15 @@ cd $GRAPH_FOLDER python3 tools/split_seeds.py --path igbh --dataset_size full ``` -**CM Command** -``` -cm run script --tags=get,dataset,igbh,_full -j -``` - #### Calibration dataset The calibration dataset contains 5000 nodes from the training paper nodes of the IGBH dataset. We provide the [Node ids](../../calibration/IGBH/calibration.txt) and the [script](tools/split_seeds.py) to generate them (using the `--calibration` flag). 
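A minimal sketch of generating those calibration seeds locally, assuming `split_seeds.py` takes the same `--path` argument as the seed-generation commands above and that `--calibration` is the only additional flag required:

```bash
cd $GRAPH_FOLDER
# Hypothetical invocation: --path mirrors the debug/full seed commands above,
# --calibration is the flag named for calibration seed generation
python3 tools/split_seeds.py --path igbh --calibration
```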
+**CM Command** +``` +cm run script --tags=get,dataset,igbh,_full,_calibration --outdirname= +``` ### Run the benchmark #### Debug Run @@ -155,20 +164,6 @@ cd $GRAPH_FOLDER python3 main.py --dataset igbh-dgl-tiny --dataset-path igbh/ --profile debug-dgl [--model-path ] [--in-memory] [--device ] [--dtype ] [--scenario ] ``` -##### Debug Run using CM -``` -cm run script --tags=run-mlperf,inference,_submission,_short,_r5.0-dev \ - --model=rgat \ - --implementation=reference \ - --framework=pytorch \ - --category=edge \ - --scenario=Offline \ - --execution_mode=test \ - --device= \ - --quiet \ - --test_query_count=10 \ - --docker -``` #### Local run ```bash @@ -179,25 +174,12 @@ cd $GRAPH_FOLDER python3 main.py --dataset igbh-dgl --dataset-path igbh/ --profile rgat-dgl-full [--model-path ] [--in-memory] [--device ] [--dtype ] [--scenario ] ``` -##### Local Run using CM -``` -cm run script --tags=run-mlperf,inference,_submission,_full,_r5.0-dev \ - --model=rgat \ - --implementation=reference \ - --framework=pytorch \ - --category=edge \ - --scenario=Offline \ - --execution_mode=test \ - --device=<>cpu or cuda> \ - --quiet \ - --test_query_count=10 \ - --docker +### Evaluate the accuracy +```bash +cm run script --tags=process,mlperf,accuracy,_igbh --result_dir= ``` -- Number of threads could be adjusted using `--threads=#`, where # is the desired number of threads. This option works only if the implementation in use supports threading. -- Batch size could be adjusted using `--batch_size=#`, where # is the desired batch size. This option works only if the implementation in use is supporting the given batch size. -- Add `--env.CM_DATASET_IGBH_PATH=` if you have already downloaded the dataset. The path will be automatically mounted when using docker run. -- Add `--env.CM_ML_MODEL_RGAT_CHECKPOINT_PATH=` if you have already downloaded the model. The path will be automatically mounted when using docker run. +Please click [here](https://github.com/mlcommons/inference/blob/dev/graph/R-GAT/tools/accuracy_igbh.py) to view the Python script for evaluating accuracy for the IGBH dataset. #### Run using docker diff --git a/graph/R-GAT/benchmark-checklist.md b/graph/R-GAT/benchmark-checklist.md index 2e76acb99..f83c816cb 100644 --- a/graph/R-GAT/benchmark-checklist.md +++ b/graph/R-GAT/benchmark-checklist.md @@ -21,7 +21,7 @@ #### **5. Validation Dataset: Unique Samples** Number of **unique samples** in the validation dataset and the QSL size specified in -- [ ] [inference policies benchmark section](https://github.com/mlcommons/inference_policies/blob/master/inference_rules.adoc#41-benchmarks) +- [X] [inference policies benchmark section](https://github.com/mlcommons/inference_policies/blob/master/inference_rules.adoc#41-benchmarks) - [X] [mlperf.conf](https://github.com/mlcommons/inference/blob/master/loadgen/mlperf.conf) - [X] [Inference benchmark docs](https://github.com/mlcommons/inference/blob/docs/docs/index.md) *(Ensure QSL size overflows the system cache if possible.)* @@ -37,12 +37,13 @@ Documented whether **Equal Issue Mode** is applicable in --- #### **7. Expected Accuracy and `accuracy.txt` Contents** -- [ ] Detailed expected accuracy and the required contents of the `accuracy.txt` file. +- [X] Expected accuracy updated in the [inference policies](https://github.com/mlcommons/inference_policies/blob/master/inference_rules.adoc#41-benchmarks) +- [X] `accuracy.txt` file generated by the reference accuracy script from the MLPerf accuracy log and is validated by the submission checker. --- #### **8. 
Reference Model Details** -- [ ] Reference model details updated in [Inference benchmark docs](https://github.com/mlcommons/inference/blob/docs/docs/index.md) +- [X] Reference model details updated in [Inference benchmark docs](https://github.com/mlcommons/inference/blob/docs/docs/index.md) --- diff --git a/language/gpt-j/README.md b/language/gpt-j/README.md index 9dc024a8e..765317635 100644 --- a/language/gpt-j/README.md +++ b/language/gpt-j/README.md @@ -2,6 +2,10 @@ Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/gpt-j) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. +Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/gpt-j/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. + + + ### Setup Instructions ```bash @@ -75,8 +79,7 @@ Please download the fine-tuned GPT-J checkpoint using the instructions below. Th The following MLCommons CM commands can be used to programmatically download the model checkpoint. ``` -pip install cm4mlops -cm run script --tags=get,ml-model,gptj,_pytorch,_rclone -j +cm run script --tags=get,ml-model,gptj,_pytorch,_rclone ---outdirname =./model -P ``` #### Manual method diff --git a/language/llama3.1-405b/README.md b/language/llama3.1-405b/README.md index d1dd5ad4f..ea358bb98 100644 --- a/language/llama3.1-405b/README.md +++ b/language/llama3.1-405b/README.md @@ -9,6 +9,11 @@ Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3.1-405b) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. +## Automated command to run the benchmark via MLCommons CM + +Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3_1-405b/) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. + +You can also do pip install cm4mlops and then use cm commands for downloading the model and datasets using the commands given in the later sections. ## Prepare environment @@ -109,6 +114,15 @@ git clone https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct ${CHECKPOINT cd ${CHECKPOINT_PATH} && git checkout be673f326cab4cd22ccfef76109faf68e41aa5f1 ``` +### Download model through CM (Collective Mind) + +``` +cm run script --tags=get,ml-model,llama3 --outdirname=${CHECKPOINT_PATH} --hf_token= -j +``` + +**Note:** +Downloading llama3.1-405B model from Hugging Face will require an [**access token**](https://huggingface.co/settings/tokens) which could be generated for your account. Additionally, ensure that your account has access to the [llama3.1-405B](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct) model. 
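If you would rather not paste the token inline, one option (assuming a POSIX shell; the variable name is only illustrative) is to export it once and reference it:

```bash
# Reuse the same Hugging Face access token across commands without retyping it
export HF_ACCESS_TOKEN=<huggingface access token>
cm run script --tags=get,ml-model,llama3 --outdirname=${CHECKPOINT_PATH} --hf_token=${HF_ACCESS_TOKEN} -j
```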
+ ## Get Dataset ### Preprocessed @@ -129,6 +143,11 @@ You can then navigate in the terminal to your desired download directory and run ``` rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_405b/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl ./ -P ``` +**CM Command** + +``` +cm run script --tags=get,dataset,mlperf,inference,llama3,_validation --outdirname= -j +``` You can also download the calibration dataset from the Cloudflare R2 bucket by running the following command: @@ -136,6 +155,12 @@ You can also download the calibration dataset from the Cloudflare R2 bucket by r rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_405b/mlperf_llama3.1_405b_calibration_dataset_512_processed_fp16_eval.pkl ./ -P ``` +**CM Command** +``` +cm run script --tags=get,dataset,mlperf,inference,llama3,_calibration --outdirname= -j +``` + + ## Run Performance Benchmarks ### Offline @@ -169,7 +194,6 @@ python -u main.py --scenario Server \ The ServerSUT was not tested for GPU runs. - ## Run Accuracy Benchmarks ### Offline @@ -201,7 +225,6 @@ fi For the GPU run - The above steps have been automated in `run_accuracy.sh`. You can also modify this script to use `--device cpu` to adapt it to a CPU-only run. - ### Server ``` OUTPUT_LOG_DIR=server-accuracy-logs @@ -218,7 +241,6 @@ python -u main.py --scenario Server \ --tensor-parallel-size ${GPU_COUNT} \ --vllm - ACCURACY_LOG_FILE=${OUTPUT_LOG_DIR}/mlperf_log_accuracy.json if [ -e ${ACCURACY_LOG_FILE} ]; then python evaluate-accuracy.py --checkpoint-path ${CHECKPOINT_PATH} \ @@ -228,6 +250,11 @@ fi The ServerSUT was not tested for GPU runs. +### Evaluate the accuracy using CM +You can also evaulate the accuracy from the generated accuracy log by using the following CM command +``` +cm run script --tags=process,mlperf,accuracy,_dataset_llama3 --result_dir= +``` ## Accuracy Target Running the GPU implementation in FP16 precision resulted in the following FP16 accuracy targets: diff --git a/language/mixtral-8x7b/evaluate-accuracy.py b/language/mixtral-8x7b/evaluate-accuracy.py index 3ea79cea8..74485d569 100644 --- a/language/mixtral-8x7b/evaluate-accuracy.py +++ b/language/mixtral-8x7b/evaluate-accuracy.py @@ -121,6 +121,7 @@ def main(): checkpoint_path = args.checkpoint_path metric = evaluate.load("rouge") nltk.download("punkt") + nltk.download("punkt_tab") tokenizer = AutoTokenizer.from_pretrained( checkpoint_path, diff --git a/loadgen/VERSION.txt b/loadgen/VERSION.txt index 2d6c0bcf1..ab0fa336d 100644 --- a/loadgen/VERSION.txt +++ b/loadgen/VERSION.txt @@ -1 +1 @@ -5.0.4 +5.0.5 diff --git a/loadgen/bindings/python_api.cc b/loadgen/bindings/python_api.cc index 7f50f5f56..96396dab9 100644 --- a/loadgen/bindings/python_api.cc +++ b/loadgen/bindings/python_api.cc @@ -348,10 +348,11 @@ PYBIND11_MODULE(mlperf_loadgen, m) { &TestSettings::token_latency_scaling_factor) .def("FromConfig", &TestSettings::FromConfig, pybind11::arg("path"), pybind11::arg("model"), pybind11::arg("scenario"), - pybind11::arg("is_mlperf_conf") = false, + pybind11::arg("conf_type") = 1, "This function configures settings from the given user " - "configuration file, model, and scenario. The is_mlperf_conf flag " - "should be set to false or else only the default mlperf_conf file " + "configuration file, model, and scenario. 
The conf_type flag " + "should be set to 1 for loading user.conf or else only the default " + "mlperf_conf file " "will be loaded by the loadgen."); pybind11::enum_(m, "LoggingMode") diff --git a/loadgen/loadgen.cc b/loadgen/loadgen.cc index beda3a6c4..c731f1a8d 100644 --- a/loadgen/loadgen.cc +++ b/loadgen/loadgen.cc @@ -1228,7 +1228,7 @@ void StartTest(SystemUnderTest* sut, QuerySampleLibrary* qsl, RemoveValue(&audit_scenario, ' '); const std::string generic_model = "*"; test_settings.FromConfig(audit_config_filename, generic_model, - audit_scenario); + audit_scenario, 2); } if (test_settings.test05) { // If the configuration indicates we are running test05, diff --git a/loadgen/test_settings.h b/loadgen/test_settings.h index 739b2947f..584d073bb 100644 --- a/loadgen/test_settings.h +++ b/loadgen/test_settings.h @@ -237,7 +237,7 @@ struct TestSettings { /// \brief Load mlperf parameter config from file. int FromConfig(const std::string &path, const std::string &model, - const std::string &scenario, bool is_mlperf_conf = false); + const std::string &scenario, int conf_type = 1); /**@}*/ // ================================== diff --git a/loadgen/test_settings_internal.cc b/loadgen/test_settings_internal.cc index 1a7387f59..f654948f3 100644 --- a/loadgen/test_settings_internal.cc +++ b/loadgen/test_settings_internal.cc @@ -520,15 +520,15 @@ void TestSettingsInternal::LogSummary(AsyncSummary &summary) const { } // namespace loadgen int TestSettings::FromConfig(const std::string &path, const std::string &model, - const std::string &scenario, bool is_mlperf_conf) { + const std::string &scenario, int conf_type) { std::map kv; static int configCount = 0; - if (!is_mlperf_conf) { + if (conf_type == 1) { if (configCount == 0) { // Only allow userConf as the single configFile and loadgen loads the - // mlperfConf automatically - FromConfig("", model, scenario, true); + // mlperfConf automatically for perf and accuracy runs + FromConfig("", model, scenario, 0); } else { @@ -586,7 +586,7 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, std::unique_ptr fss; std::string line; - if (!is_mlperf_conf) { + if (conf_type != 0) { // dirt simple config parser fss = std::make_unique(path); if (!static_cast(fss.get())->is_open()) { @@ -691,20 +691,17 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, break; } } - if (is_mlperf_conf) { + + if (conf_type == 0) { lookupkv(model, scenario, "qsl_rng_seed", &qsl_rng_seed, nullptr); lookupkv(model, scenario, "sample_index_rng_seed", &sample_index_rng_seed, nullptr); lookupkv(model, scenario, "schedule_rng_seed", &schedule_rng_seed, nullptr); - lookupkv(model, scenario, "accuracy_log_rng_seed", &accuracy_log_rng_seed, - nullptr); - lookupkv(model, scenario, "accuracy_log_probability", nullptr, - &accuracy_log_probability, 0.01); - lookupkv(model, scenario, "accuracy_log_sampling_target", - &accuracy_log_sampling_target, nullptr); if (lookupkv(model, scenario, "sample_concatenate_permutation", &val, nullptr)) sample_concatenate_permutation = (val == 1) ? true : false; + lookupkv(model, scenario, "accuracy_log_probability", nullptr, + &accuracy_log_probability, 0.01); if (lookupkv(model, scenario, "test05", &val, nullptr)) test05 = (val == 1) ? 
true : false; lookupkv(model, scenario, "test05_qsl_rng_seed", &test05_qsl_rng_seed, @@ -715,8 +712,10 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, &test05_schedule_rng_seed, nullptr); } - // keys that can be overriden in user.conf but will make the results eligibale - // only for open submission keys to measure token metrics + // keys that can be overriden in user.conf but will make the results eligible + // only for open submissions + + // keys to measure token metrics if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr)) { use_token_latencies = (val == 1) ? true : false; } @@ -781,6 +780,11 @@ int TestSettings::FromConfig(const std::string &path, const std::string &model, if (lookupkv(model, scenario, "print_timestamps", &val, nullptr)) print_timestamps = (val == 0) ? false : true; + // keys that are used in audit.conf + lookupkv(model, scenario, "accuracy_log_rng_seed", &accuracy_log_rng_seed, + nullptr); + lookupkv(model, scenario, "accuracy_log_sampling_target", + &accuracy_log_sampling_target, nullptr); return 0; } diff --git a/main.py b/main.py index 6a34587dd..1e561175f 100755 --- a/main.py +++ b/main.py @@ -28,7 +28,7 @@ def mlperf_inference_implementation_readme( content = "" execution_envs = ["Docker", "Native"] - code_version = "r4.1-dev" + code_version = "r5.0-dev" implementation_run_options = [] if model == "rnnt": @@ -50,6 +50,8 @@ def mlperf_inference_implementation_readme( frameworks = ["Onnxruntime", "Pytorch"] elif "bert" in model.lower(): frameworks = ["Pytorch", "Deepsparse"] + elif "llama3" in model.lower(): + frameworks = ["Pytorch"] else: frameworks = ["Pytorch"] @@ -127,6 +129,7 @@ def mlperf_inference_implementation_readme( "dlrm" in model.lower() or "llama2" in model.lower() or "mixtral" in model.lower() + or "llama3" in model.lower() ): categories = ["Datacenter"] else: @@ -499,6 +502,7 @@ def get_common_info(spaces, implementation, model): info += f"\n{pre_space}!!! tip\n\n" info += f"{pre_space} - Number of threads could be adjusted using `--threads=#`, where `#` is the desired number of threads. This option works only if the implementation in use supports threading.\n\n" info += f"{pre_space} - Batch size could be adjusted using `--batch_size=#`, where `#` is the desired batch size. This option works only if the implementation in use is supporting the given batch size.\n\n" + info += f"{pre_space} - `_r4.1-dev` could also be given instead of `_r5.0-dev` if you want to run the benchmark with the MLPerf version being 4.1.\n\n" if model == "rgat": info += f"{pre_space} - Add `--env.CM_DATASET_IGBH_PATH=` if you have already downloaded the dataset. The path will be automatically mounted when using docker run.\n\n" info += f"{pre_space} - Add `--env.CM_ML_MODEL_RGAT_CHECKPOINT_PATH=` if you have already downloaded the model. The path will be automatically mounted when using docker run.\n\n" @@ -522,7 +526,9 @@ def get_docker_info(spaces, model, implementation, if model == "sdxl": info += f"{pre_space} - `--env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. \n\n" - + elif "llama3" in model.lower(): + info += f"{pre_space} - `--env.CM_MLPERF_MODEL_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the model on the host so that it can be reused across different container lanuches. 
\n\n" + info += f"{pre_space} - `--env.CM_MLPERF_DATASET_LLAMA3_DOWNLOAD_TO_HOST=yes` option can be used to download the dataset on the host so that it can be reused across different container lanuches. \n\n" if implementation.lower() == "nvidia": info += f"{pre_space} - Default batch size is assigned based on [GPU memory](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1129) or the [specified GPU](https://github.com/mlcommons/cm4mlops/blob/dd0c35856969c68945524d5c80414c615f5fe42c/script/app-mlperf-inference-nvidia/_cm.yaml#L1370). Please click more option for *docker launch* or *run command* to see how to specify the GPU name.\n\n" info += f"{pre_space} - When run with `--all_models=yes`, all the benchmark models of NVIDIA implementation can be executed within the same container.\n\n" diff --git a/mkdocs.yml b/mkdocs.yml index 96bcfb758..9178191a3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -39,6 +39,7 @@ nav: - IndySCC24: benchmarks/language/reproducibility/indyscc24-bert.md - GPT-J: benchmarks/language/gpt-j.md - LLAMA2-70B: benchmarks/language/llama2-70b.md + - LLAMA3-405B: benchmarks/language/llama3_1-405b.md - MIXTRAL-8x7B: benchmarks/language/mixtral-8x7b.md - Recommendation: - DLRM-v2: benchmarks/recommendation/dlrm-v2.md diff --git a/text_to_image/README.md b/text_to_image/README.md index 57c4343b1..84c8c7245 100644 --- a/text_to_image/README.md +++ b/text_to_image/README.md @@ -1,9 +1,11 @@ # MLPerf™ Inference Benchmarks for Text to Image -This is the reference implementation for MLPerf Inference text to image. +## Automated command to run the benchmark via MLCommons CM Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/text_to_image/sdxl) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. +You can also do `pip install cm4mlops` and then use `cm` commands for downloading the model and datasets using the commands given in the later sections. + ## Supported Models | model | accuracy | dataset | model source | precision | notes | @@ -53,10 +55,10 @@ We host two checkpoints (fp32 and fp16) that are a snapshot of the [Hugging Face The following MLCommons CM commands can be used to programmatically download the model checkpoints. 
``` -pip install cmind -cm pull repo mlcommons@ck -cm run script --tags=get,ml-model,sdxl,_fp16,_rclone -j -cm run script --tags=get,ml-model,sdxl,_fp32,_rclone -j +cm run script --tags=get,ml-model,sdxl,_fp16,_rclone --outdirname=$MODEL_PATH +``` +``` +cm run script --tags=get,ml-model,sdxl,_fp32,_rclone --outdirname=$MODEL_PATH ``` #### Manual method @@ -72,30 +74,35 @@ Once Rclone is installed, run the following command to authenticate with the buc rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com ``` You can then navigate in the terminal to your desired download directory and run the following commands to download the checkpoints: +``` +cd $MODEL_PATH +``` **`fp32`** ``` -rclone copy mlc-inference:mlcommons-inference-wg-public/stable_diffusion_fp32 ./stable_diffusion_fp32 -P +rclone copy mlc-inference:mlcommons-inference-wg-public/stable_diffusion_fp32 $MODEL_PATH -P ``` **`fp16`** ``` -rclone copy mlc-inference:mlcommons-inference-wg-public/stable_diffusion_fp16 ./stable_diffusion_fp16 -P +rclone copy mlc-inference:mlcommons-inference-wg-public/stable_diffusion_fp16 $MODEL_PATH -P ``` -#### Move to model path +### Download validation dataset -```bash -mkdir $MODEL_PATH -cd $MODEL_PATH -# For fp32 -mv /stable_diffusion_fp32.zip . -unzip stable_diffusion_fp32.zip -# For fp16 -mv /stable_diffusion_fp16.zip . -unzip stable_diffusion_fp16.zip +#### CM method +The following MLCommons CM commands can be used to programmatically download the validation dataset. + +``` +cm run script --tags=get,dataset,coco2014,_validation,_full --outdirname=coco2014 +``` + +For debugging, you can download only a subset of the images in the dataset: +``` +cm run script --tags=get,dataset,coco2014,_validation,_size.50 --outdirname=coco2014 ``` -### Download dataset + +#### Manual method ```bash cd $SD_FOLDER/tools ./download-coco-2014.sh -n @@ -107,14 +114,25 @@ cd $SD_FOLDER/tools ``` If the file [captions.tsv](coco2014/captions/captions.tsv) can be found in the script, it will be used to download the target dataset subset, otherwise it will be generated. We recommend you to have this file for consistency. -#### Calibration dataset +### Download calibration dataset (only if you are doing quantization) + +#### CM method +The following MLCommons CM commands can be used to programmatically download the calibration dataset. + +``` +cm run script --tags=get,dataset,coco2014,_calibration --outdirname=coco2014 ``` + + +#### Manual method We provide a script to download the calibration captions and images. To download only the captions: ```bash cd $SD_FOLDER/tools -./download-coco-2014-calibration.sh +./download-coco-2014-calibration.sh -n ``` - -To download only the captions and images: + +To download both the captions and images: ```bash cd $SD_FOLDER/tools ./download-coco-2014-calibration.sh -i -n diff --git a/tools/submission/README.md b/tools/submission/README.md index 2459ab363..5a31a304a 100644 --- a/tools/submission/README.md +++ b/tools/submission/README.md @@ -1,5 +1,7 @@ # Tools to check Submissions +Please follow the [official submission automation page](https://docs.mlcommons.org/inference/submission/) when preparing a submission. It wraps all the submission-related files listed below. 
+ ## `truncate_accuracy_log.py` (Mandatory) ### Inputs diff --git a/tools/submission/preprocess_submission.py b/tools/submission/preprocess_submission.py index 977af4d47..7eaa7f8f7 100644 --- a/tools/submission/preprocess_submission.py +++ b/tools/submission/preprocess_submission.py @@ -51,7 +51,7 @@ def get_args(): parser.add_argument( "--version", - default="v4.1", + default="v5.0", choices=list(checker.MODEL_CONFIG.keys()), help="mlperf version", ) @@ -405,12 +405,12 @@ def infer_scenario_results(args, config): continue if mlperf_model not in config.required: - log.error("Division %s, submitter %s, system %s has invalid " - "MLPerf model (%s) corresponding to given model (%s). " - "Valid ones for MLPerf inference version (%s) in (%s) " - "category are [%s]", division, submitter, system_id_json, - mlperf_model, model, config.version, system_type, - config.required.keys()) + log.warning(f"""Division {division}, submitter {submitter}, system {system_id_json} has invalid """ + f"""MLPerf model ({mlperf_model}) corresponding to given model ({model}). """ + f"""Valid ones for MLPerf inference version ({config.version}) in ({system_type}) """ + f"""category are [{config.required.keys()}]. Removing...""") + clean_model_dir(os.path.join( + log_path, system_desc, model)) continue required_scenarios = config.get_required(mlperf_model) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 26d5212f9..fb1f1bd49 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1133,21 +1133,18 @@ def find_error_in_detail_log(config, fname): return is_valid -def check_accuracy_dir(config, model, path, verbose): - is_valid = False - all_accuracy_valid = True - acc = None - result_acc = {} - hash_val = None - target = config.get_accuracy_target(model) - acc_upper_limit = config.get_accuracy_upper_limit(model) +def get_accuracy_values(config, model): + patterns = [] acc_targets = [] acc_types = [] + acc_limits = [] + up_patterns = [] + acc_limit_check = False + + target = config.get_accuracy_target(model) + acc_upper_limit = config.get_accuracy_upper_limit(model) if acc_upper_limit is not None: - acc_limits = [] - up_patterns = [] - acc_limit_check = True for i in range(0, len(acc_upper_limit), 2): acc_type, acc_target = acc_upper_limit[i: i + 2] acc_limits.append(acc_target) @@ -1158,6 +1155,22 @@ def check_accuracy_dir(config, model, path, verbose): patterns.append(ACC_PATTERN[acc_type]) acc_targets.append(acc_target) acc_types.append(acc_type) + + return patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit + + +def check_accuracy_dir(config, model, path, verbose): + is_valid = False + all_accuracy_valid = True + acc = None + result_acc = {} + hash_val = None + target = config.get_accuracy_target(model) + # acc_upper_limit = config.get_accuracy_upper_limit(model) + patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit = get_accuracy_values( + config, model) + acc_limit_check = True + acc_seen = [False for _ in acc_targets] with open(os.path.join(path, "accuracy.txt"), "r", encoding="utf-8") as f: @@ -1185,6 +1198,7 @@ def check_accuracy_dir(config, model, path, verbose): if acc: result_acc[acc_type] = acc acc = None + if acc_upper_limit is not None: for i, (pattern, acc_limit) in enumerate( zip(up_patterns, acc_limits)): @@ -1341,7 +1355,7 @@ def check_performance_dir( samples_per_query = mlperf_log["effective_samples_per_query"] min_duration = 
mlperf_log["effective_min_duration_ms"] equal_issue_used_check = ( - mlperf_log["effective_sample_concatenate_permutation"] == "true" + mlperf_log["effective_sample_concatenate_permutation"] == True ) if not config.requires_equal_issue(model, division): equal_issue_used_check = True @@ -1625,7 +1639,7 @@ def get_power_metric(config, scenario_fixed, log_path, is_valid, res): samples_per_query = 8 if (scenario_fixed in ["MultiStream"] - ) and scenario in ["SingleStream"]: + ) and scenario in ["SingleStream"]: power_metric = ( avg_power * power_duration * samples_per_query * 1000 / num_queries ) @@ -2849,13 +2863,11 @@ def check_compliance_acc_dir(test_dir, model, config): is_valid = False elif not acc_passed: target = config.get_accuracy_target(model) - patterns = [] - acc_types = [] - for i in range(0, len(target), 2): - acc_type = target[i: i + 2] - acc_types.append(acc_type) - patterns.append(ACC_PATTERN[acc_type[0]]) - acc_seen = [False for _ in acc_type] + patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit = get_accuracy_values( + config, model) + acc_limit_check = True + + acc_seen = [False for _ in acc_targets] acc_baseline = {acc_type: 0 for acc_type in acc_types} acc_compliance = {acc_type: 0 for acc_type in acc_types} with open( @@ -2898,6 +2910,10 @@ def check_compliance_acc_dir(test_dir, model, config): if delta_perc <= required_delta_perc: is_valid = True else: + log.error( + "Compliance test accuracy check (non-deterministic mode) in %s failed", + test_dir, + ) is_valid = False break elif "TEST06" in test_dir: diff --git a/tools/submission/truncate_accuracy_log.py b/tools/submission/truncate_accuracy_log.py index b7a9509ae..e0e1973ec 100755 --- a/tools/submission/truncate_accuracy_log.py +++ b/tools/submission/truncate_accuracy_log.py @@ -233,7 +233,7 @@ def truncate_results_dir(filter_submitter, backup, scenarios_to_skip): # get to work hash_val = get_hash(acc_log) with open(acc_txt, "a", encoding="utf-8") as f: - f.write("hash={0}\n".format(hash_val)) + f.write("\nhash={0}\n".format(hash_val)) truncate_file(acc_log) log.info("%s truncated", acc_log)
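The new `TestSettings::FromConfig` keys in the loadgen hunk above, `use_token_latencies` for token metrics plus the audit.conf keys `accuracy_log_rng_seed` and `accuracy_log_sampling_target`, are read with the same `model.scenario.key = value` conf syntax as the existing settings. Below is a minimal sketch of exercising them through the loadgen Python bindings; the file name, model/scenario pair, and all values are illustrative assumptions, not part of this patch.

```python
# Minimal sketch, not part of this patch: feeds the conf keys referenced above
# through the loadgen Python bindings. File name, model, scenario and all
# values are placeholder assumptions.
import mlperf_loadgen as lg

conf_text = (
    # token-latency metrics (results become eligible for open submissions only)
    "llama2-70b.*.use_token_latencies = 1\n"
    # keys normally supplied via audit.conf for compliance runs
    "*.Server.accuracy_log_rng_seed = 12345\n"
    "*.Server.accuracy_log_sampling_target = 100\n"
)
with open("user.conf", "w") as f:
    f.write(conf_text)

settings = lg.TestSettings()
# FromConfig(path, model, scenario) returns 0 on success, per the C++ hunk above.
settings.FromConfig("user.conf", "llama2-70b", "Server")
```

On the C++ side these values land in the corresponding `TestSettings` fields via `lookupkv`, exactly as added in the hunk.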
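The `submission_checker.py` refactor moves the accuracy pattern/target extraction into `get_accuracy_values()`, so `check_accuracy_dir()` and `check_compliance_acc_dir()` now share one code path. The sketch below illustrates the shape of that contract with a hypothetical stand-in config and a made-up accuracy line; it mirrors the tuple order from the patch but simplifies the body and is not the checker's actual code.

```python
import re

# Illustrative subset of the checker's ACC_PATTERN table (made-up pattern).
ACC_PATTERN = {"acc": r"acc=([\d.]+)"}


class DummyConfig:
    """Hypothetical stand-in for submission_checker's Config object."""

    def get_accuracy_target(self, model):
        # Flattened (acc_type, target) pairs, as in the real checker config.
        return ["acc", 75.0]

    def get_accuracy_upper_limit(self, model):
        return None  # most models define no upper limit


def get_accuracy_values(config, model):
    # Simplified mirror of the helper introduced in this patch (same tuple order).
    patterns, acc_targets, acc_types, acc_limits, up_patterns = [], [], [], [], []
    target = config.get_accuracy_target(model)
    acc_upper_limit = config.get_accuracy_upper_limit(model)
    if acc_upper_limit is not None:
        for i in range(0, len(acc_upper_limit), 2):
            acc_type, acc_target = acc_upper_limit[i:i + 2]
            acc_limits.append(acc_target)
            up_patterns.append(ACC_PATTERN[acc_type])
    for i in range(0, len(target), 2):
        acc_type, acc_target = target[i:i + 2]
        patterns.append(ACC_PATTERN[acc_type])
        acc_targets.append(acc_target)
        acc_types.append(acc_type)
    return patterns, acc_targets, acc_types, acc_limits, up_patterns, acc_upper_limit


# Both the accuracy check and the compliance check can now reuse one extraction.
patterns, acc_targets, acc_types, *_ = get_accuracy_values(DummyConfig(), "resnet")
for pattern, tgt, typ in zip(patterns, acc_targets, acc_types):
    m = re.search(pattern, "accuracy: acc=76.456")
    if m:
        print(typ, float(m.group(1)) >= tgt)  # prints: acc True
```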
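The `truncate_accuracy_log.py` change writes a newline before the `hash=` marker, presumably so the hash always starts on its own line even when the truncated accuracy output does not end with a newline. A small illustration of the effect follows; `get_hash` here is a SHA-1 stand-in and may not match the script's actual helper, and the file contents are fabricated.

```python
# Small illustration only: get_hash is a SHA-1 stand-in and the file contents
# are fabricated; the real script hashes the truncated mlperf_log_accuracy.json.
import hashlib


def get_hash(path):
    with open(path, "rb") as f:
        return hashlib.sha1(f.read()).hexdigest()


with open("accuracy.txt", "w", encoding="utf-8") as f:
    f.write("AUC=80.31%")  # last line has no trailing newline

hash_val = get_hash("accuracy.txt")
with open("accuracy.txt", "a", encoding="utf-8") as f:
    # The leading "\n" (added by this patch) keeps "hash=" from being glued to
    # the previous line; the file now ends with a line of the form hash=<hex>.
    f.write("\nhash={0}\n".format(hash_val))

print(open("accuracy.txt", encoding="utf-8").read())
```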