diff --git a/.gitignore b/.gitignore
index 5b0be9676..4a4c233bb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,8 @@ config-debug.txt
# KDevelop files
.kdev4/
xmr-stak.kdev4
+
+# Idea/Clion project files
+cmake-build-release/
+cmake-build-debug/
+\.idea/
diff --git a/.travis.yml b/.travis.yml
index 0d2d51a0b..de5b45fe7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -77,9 +77,8 @@ matrix:
before_install:
- . CI/checkPRBranch
- - if [ $TRAVIS_OS_NAME = osx ]; then
+ - if [ $TRAVIS_OS_NAME = osx ] ; then
brew update;
- brew tap homebrew/science;
fi
- export PATH=$CUDA_ROOT/bin:$PATH
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 10f33bda4..15a2684ca 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
project(xmr-stak)
-cmake_minimum_required(VERSION 3.1.0)
+cmake_minimum_required(VERSION 3.4.0)
# enforce C++11
set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -36,10 +36,6 @@ if(NOT CMAKE_BUILD_TYPE)
endif()
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "${BUILD_TYPE}")
-set(XMR-STAK_CURRENCY "all" CACHE STRING "select miner currency")
-set_property(CACHE XMR-STAK_CURRENCY PROPERTY STRINGS "all;monero;aeon")
-
-
set(XMR-STAK_COMPILE "native" CACHE STRING "select CPU compute architecture")
set_property(CACHE XMR-STAK_COMPILE PROPERTY STRINGS "native;generic")
if(XMR-STAK_COMPILE STREQUAL "native")
@@ -53,16 +49,6 @@ else()
message(FATAL_ERROR "XMR-STAK_COMPILE is set to an unknown value '${XMR-STAK_COMPILE}'")
endif()
-if(XMR-STAK_CURRENCY STREQUAL "all")
- message(STATUS "Set miner currency to 'monero' and 'aeon'")
-elseif(XMR-STAK_CURRENCY STREQUAL "aeon")
- message(STATUS "Set miner currency to 'aeon'")
- add_definitions("-DCONF_NO_MONERO=1")
-elseif(XMR-STAK_CURRENCY STREQUAL "monero")
- message(STATUS "Set miner currency to 'monero'")
- add_definitions("-DCONF_NO_AEON=1")
-endif()
-
# option to add static libgcc and libstdc++
option(CMAKE_LINK_STATIC "link as much as possible libraries static" OFF)
@@ -438,6 +424,14 @@ else()
endif()
+# add -Wall for debug builds with gcc
+if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+ if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
+ endif()
+endif()
+
# activate static libgcc and libstdc++ linking
if(CMAKE_LINK_STATIC)
set(BUILD_SHARED_LIBRARIES OFF)
@@ -522,9 +516,11 @@ endif()
file(GLOB SRCFILES_CPP "xmrstak/cli/*.cpp")
set_source_files_properties(${SRCFILES_CPP} PROPERTIES LANGUAGE CXX)
-add_executable(xmr-stak
- ${SRCFILES_CPP}
-)
+if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+ add_executable(xmr-stak ${SRCFILES_CPP} xmrstak/cli/xmr-stak.manifest)
+else()
+ add_executable(xmr-stak ${SRCFILES_CPP})
+endif()
set(EXECUTABLE_OUTPUT_PATH "bin")
set(LIBRARY_OUTPUT_PATH "bin")
diff --git a/Dockerfile b/Dockerfile
index 3e996efca..345838758 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,7 +8,7 @@ ENV XMRSTAK_CMAKE_FLAGS -DXMR-STAK_COMPILE=generic -DCUDA_ENABLE=ON -DOpenCL_ENA
# Innstall packages
RUN apt-get update \
&& set -x \
- && apt-get install -qq --no-install-recommends -y ca-certificates cmake cuda-core-9-0 git cuda-cudart-dev-9-0 libhwloc-dev libmicrohttpd-dev libssl-dev \
+ && apt-get install -qq --no-install-recommends -y build-essential ca-certificates cmake cuda-core-9-0 git cuda-cudart-dev-9-0 libhwloc-dev libmicrohttpd-dev libssl-dev \
&& git clone $GIT_REPOSITORY \
&& cd /xmr-stak \
&& cmake ${XMRSTAK_CMAKE_FLAGS} . \
@@ -16,7 +16,7 @@ RUN apt-get update \
&& cd - \
&& mv /xmr-stak/bin/* /usr/local/bin/ \
&& rm -rf /xmr-stak \
- && apt-get purge -y -qq cmake cuda-core-9-0 git cuda-cudart-dev-9-0 libhwloc-dev libmicrohttpd-dev libssl-dev \
+ && apt-get purge -y -qq build-essential cmake cuda-core-9-0 git cuda-cudart-dev-9-0 libhwloc-dev libmicrohttpd-dev libssl-dev \
&& apt-get clean -qq
VOLUME /mnt
diff --git a/README.md b/README.md
index 86570cc67..29e1e1253 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,11 @@ XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NV
## HTML reports
+## Video setup guide on Windows
+
+[](https://www.youtube.com/watch?v=-8paGFwxyMU)
+###### Video by Crypto Sewer
+
## Overview
* [Features](#features)
* [Supported altcoins](#supported-altcoins)
@@ -15,32 +20,37 @@ XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NV
* [HowTo Compile](doc/compile.md)
* [FAQ](doc/FAQ.md)
* [Developer Donation](#default-developer-donation)
-* [Release Cheksums](#release-checksums)
* [Developer PGP Key's](doc/pgp_keys.md)
## Features
- support all common backends (CPU/x86, AMD-GPU and NVIDIA-GPU)
-- support all common OS (Linux, Windows and MacOS)
+- support all common OS (Linux, Windows and macOS)
- supports algorithm cryptonight for Monero (XMR) and cryptonight-light (AEON)
- easy to use
- guided start (no need to edit a config file for the first start)
- auto configuration for each backend
- open source software (GPLv3)
- TLS support
-- HTML statistics
-- JSON API for monitoring
+- [HTML statistics](doc/usage.md#html-and-json-api-report-configuraton)
+- [JSON API for monitoring](doc/usage.md#html-and-json-api-report-configuraton)
## Supported altcoins
-Besides Monero, following coins can be mined using this miner:
+Besides [Monero](https://getmonero.org), following coins can be mined using this miner:
-- [Aeon](http://www.aeon.cash/)
+- [Aeon](http://www.aeon.cash)
+- [Edollar](https://edollar.cash)
- [Electroneum](https://electroneum.com)
+- [Graft](https://www.graft.network)
- [Intense](https://intensecoin.com)
+- [Karbo](https://karbo.io)
- [Sumokoin](https://www.sumokoin.org)
-For all coins, except Aeon, you can use Monero settings.
+If your prefered coin is not listed, you can chose one of the following algorithms:
+
+- Cryptonight - 2 MiB scratchpad memory
+- Cryptonight-light - 1 MiB scratchpad memory
Please note, this list is not complete, and is not an endorsement.
@@ -62,47 +72,5 @@ fireice-uk:
psychocrypt:
```
-43NoJVEXo21hGZ6tDG6Z3g4qimiGdJPE6GRxAmiWwm26gwr62Lqo7zRiCJFSBmbkwTGNuuES9ES5TgaVHceuYc4Y75txCTU
-```
-
-## Release Checksums
-
-Please use the [Developer PGP Key's](doc/pgp_keys.md) to verify the integrity of the precompiled binaries.
-
-```
------BEGIN PGP SIGNED MESSAGE-----
-Hash: SHA256
-
-XMR-Stak 2.2.0 Windows Build Checksums
-
-compiled by: psychocrypt
-
-$ sha1sum *
-3f1634244ccd336f7df581e3c82e1c6ca38ce714 libeay32.dll
-538f3bd9dfcafc379e912562bcf343333f5375c7 ssleay32.dll
-302e5be7c97fcd4922bf99b3533c0523ead5d109 xmrstak_cuda_backend.dll
-ad6b9e62a7ea132e1bec0efd8d9e5f8a2ae531ca xmr-stak.exe
-393bc5deb7e59e61cc7f4ccc0f4438402422f3b0 xmrstak_opencl_backend.dll
-
-$ sha3sum *
-5aeefca7278be1b2706d99bf89fa23646931f881aff8bbca33654eb1 libeay32.dll
-6b696caa620b0c6372881b11e503313152b5191c2d5497b26f81ab79 ssleay32.dll
-038de57a707664c7c3ab3a74c8bdb3ed4e22000a74d8b7c359c7c4b5 xmrstak_cuda_backend.dll
-19ab61049051178a362dc0d1c17af06f5ca1eb0a75182c0388e5aa22 xmr-stak.exe
-cc7ba0fbde50d72df2a530ce52a831578cfa19999841eb954554a022 xmrstak_opencl_backend.dll
-
-date
-Fri Dec 22 22:09:59 CET 2017
-
------BEGIN PGP SIGNATURE-----
-Version: GnuPG v2
-
-iQEcBAEBCAAGBQJaPXYSAAoJEAUWOMCIZelDQpAH/As2BD6qDZvbKH5NPHjjDv6T
-KBJ6/0h+x2k4Iy3GelrtaogB4LvUDzci4MRfaTXr23Xr+rhwsx3J2xvVdWKZgPXh
-bQm5pTJFhiao6Dh+Orway6TLmuaEBLNtknatSkjPUPKmkVd/A7kxxkdelDB//yb+
-7k5HGb84T+HU8HBlB00pDITyXv/414egpZGMqWeBXsYDeEYa8KHZlEIO3YI4JrEz
-pNW44Q1YcWZ+zxqTDrvMgjW8KJZcXg6ijJ3fEhGBo+hcnF+WuUB3Yd3Frf0ps5J5
-MjnWXl/uOobML6K70g2UQcHcEDbPk8f9LUxX1++/I0aHsRMGMYhRj0ad5KYE1IY=
-=VCEv
------END PGP SIGNATURE-----
+45tcqnJMgd3VqeTznNotiNj4G9PQoK67TGRiHyj6EYSZ31NUbAfs9XdiU5squmZb717iHJLxZv3KfEw8jCYGL5wa19yrVCn
```
diff --git a/THIRD-PARTY-LICENSES b/THIRD-PARTY-LICENSES
index 3e62013a5..d3202f41e 100644
--- a/THIRD-PARTY-LICENSES
+++ b/THIRD-PARTY-LICENSES
@@ -22,3 +22,8 @@ License: MIT License and BSD License
-------------------------------------------------------------------------
+Package: PicoSHA2
+Authors: okdshin
+License: MIT License
+
+-------------------------------------------------------------------------
diff --git a/doc/FAQ.md b/doc/FAQ.md
index 23507f264..8739fc40f 100644
--- a/doc/FAQ.md
+++ b/doc/FAQ.md
@@ -1,31 +1,35 @@
# FAQ
## Content Overview
-* [SeLockMemoryPrivilege failed](#selockmemoryprivilege-failed)
+* ["Obtaining SeLockMemoryPrivilege failed."](#obtaining-selockmemoryprivilege-failed)
* [VirtualAlloc failed](#virtualalloc-failed)
* [Error msvcp140.dll and vcruntime140.dll not available](#error-msvcp140dll-and-vcruntime140dll-not-available)
* [Error: MEMORY ALLOC FAILED: mmap failed](#error-memory-alloc-failed-mmap-failed)
* [Illegal instruction (core dumped)](#illegal-instruction)
* [Virus Protection Alert](#virus-protection-alert)
* [Change Currency to Mine](#change-currency-to-mine)
+* [How can I mine Monero](#how-can-i-mine-monero)
+* [Why is Monero named monero7](why-is-monero-named-monero7)
+* [Which currency must be chosen if my fork coin is not listed](#which-currency-must-be-chosen-if-my-fork-coin-is-not-listed)
-## SeLockMemoryPrivilege failed
+## "Obtaining SeLockMemoryPrivilege failed."
-Please see [config.txt](config.txt) under section **LARGE PAGE SUPPORT**
+For professional versions of Windows see [this article](https://msdn.microsoft.com/en-gb/library/ms190730.aspx).
+Make sure to reboot afterwards!
-For Windows 7 pro, or Windows 8 and above see [this article](https://msdn.microsoft.com/en-gb/library/ms190730.aspx) (make sure to reboot afterwards!).
+For Windows 7/10 Home:
-For Windows 7 Home :
+1) Download and install [Windows Server 2003 Resource Kit Tools](https://www.microsoft.com/en-us/download/details.aspx?id=17657). Ignore any incompatibility warning during installation.
-1) Download and install [Windows Server 2003 Resource Kit Tools](https://www.microsoft.com/en-us/download/details.aspx?id=17657). Ignore incompatiablity warning during installation.
+2) Open cmd or PowerShell as an administrator.
-2) In cmd or power shell: `ntrights -u %USERNAME% +r SeLockMemoryPrivilege` (where %USERNAME% is the user that will be running the program. This command needs to be run as admin)
+3) Use `ntrights -u %USERNAME% +r SeLockMemoryPrivilege` where %USERNAME% is the user that will be running the program.
-3) Reboot.
+4) Reboot.
Reference: http://rybkaforum.net/cgi-bin/rybkaforum/topic_show.pl?pid=259791#pid259791
-*Warning: do not download ntrights.exe from any other site other then the offical Microsoft download page.*
+*Warning: Do not download ntrights.exe from any other site other than the offical Microsoft download page.*
## VirtualAlloc failed
@@ -40,13 +44,18 @@ Download and install this [runtime package](https://go.microsoft.com/fwlink/?Lin
## Error: MEMORY ALLOC FAILED: mmap failed
-On Linux you will need to configure large page support `sudo sysctl -w vm.nr_hugepages=128` and increase your
-ulimit -l. To do this you need to add following lines to /etc/security/limits.conf:
+On Linux you will need to configure large page support and increase your ulimit -l.
+
+To set large page support, add the following lines to /etc/sysctl.conf:
+
+ vm.nr_hugepages=128
+
+To increase the ulimit, add following lines to /etc/security/limits.conf:
* soft memlock 262144
* hard memlock 262144
-Save file. You WILL need to log out and log back in for these settings to take affect on your user (no need to reboot, just relogin in your session).
+You WILL need to log out and log back in for these settings to take affect on your user (no need to reboot, just relogin in your session).
You can also do it Windows-style and simply run-as-root, but this is NOT recommended for security reasons.
@@ -63,4 +72,18 @@ If your antivirus software flags **xmr-stak**, it will likely move it to its qua
If the miner is compiled for Monero and Aeon than you can change
- the value `currency` in the config *or*
- - start the miner with the [command line option](usage.md) `--currency monero` or `--currency aeon`
+ - start the miner with the [command line option](usage.md) `--currency monero7` or `--currency aeon`
+ - run `xmr-stak --help` to see all supported currencies and algorithms
+
+## How can I mine Monero
+
+Set the value `currency` in `pools.txt` to `monero7`.
+
+## Why is Monero named monero7
+
+To avoid configuration conflicts after the hard fork of Monero to the new POW with our old naming schema where all cryptonight currencies was selected by choosing `monero` as currency we decided to switch to the name `monero7`.
+
+## Which currency must be chosen if my fork coin is not listed
+
+If your coin you want to mine is not listed please check the documentation of the coin and try to find out if `cryptonight` or `cryptonight-lite` is the used algorithm.
+Select one of these generic coin algorithms.
diff --git a/doc/Linux_deployment.md b/doc/Linux_deployment.md
index 323a97f5b..3219e8a99 100644
--- a/doc/Linux_deployment.md
+++ b/doc/Linux_deployment.md
@@ -15,6 +15,7 @@ For automatic deployments, please use the steps above to obtain config.txt and u
```
#!/bin/bash
+sudo apt install curl
curl -O `curl -s https://api.github.com/repos/fireice-uk/xmr-stak/releases/latest | grep -o 'browser_download_url.*xmr-stak-portbin-linux.tar.gz' | sed 's/.*"//'`
curl -O http://path.to/your/config.txt
tar xzf xmr-stak-portbin-linux.tar.gz
diff --git a/doc/compile.md b/doc/compile.md
index e97affa7d..984c013ac 100644
--- a/doc/compile.md
+++ b/doc/compile.md
@@ -9,7 +9,7 @@
* [Compile on Windows](compile_Windows.md)
* [Compile on Linux](compile_Linux.md)
* [Compile on FreeBSD](compile_FreeBSD.md)
-* [Compile on MacOS](compile_MacOS.md)
+* [Compile on macOS](compile_macOS.md)
## Build System
@@ -31,7 +31,7 @@ After the configuration you need to compile the miner, follow the guide for your
* [Compile in Windows](compile_Windows.md)
* [Compile in Linux](compile_Linux.md)
* [Compile in FreeBSD](compile_FreeBSD.md)
-* [Compile in MacOS](compile_MacOS.md)
+* [Compile in macOS](compile_macOS.md)
## Generic Build Options
- `CMAKE_INSTALL_PREFIX` install miner to the home folder
@@ -47,7 +47,6 @@ After the configuration you need to compile the miner, follow the guide for your
- there is no *http* interface available if option is disabled: `cmake .. -DMICROHTTPD_ENABLE=OFF`
- `OpenSSL_ENABLE` allow to disable/enable the dependency *OpenSSL*
- it is not possible to connect to a *https* secured pool if option is disabled: `cmake .. -DOpenSSL_ENABLE=OFF`
-- `XMR-STAK_CURRENCY` - compile for Monero(XMR) or Aeon(AEON) usage only e.g. `cmake .. -DXMR-STAK_CURRENCY=monero`
- `XMR-STAK_COMPILE` select the CPU compute architecture (default: native)
- native means the miner binary can be used only on the system where it is compiled but will archive the highest hash rate
- use `cmake .. -DXMR-STAK_COMPILE=generic` to run the miner on all CPU's with sse2
diff --git a/doc/compile_Linux.md b/doc/compile_Linux.md
index b7104acd2..729f4d222 100644
--- a/doc/compile_Linux.md
+++ b/doc/compile_Linux.md
@@ -8,7 +8,7 @@
### Cuda 8.0+ (only needed to use NVIDIA GPUs)
-- donwload and install [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads)
+- download and install [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads)
- for minimal install choose `Custom installation options` during the install and select
- CUDA/Develpment
- CUDA/Runtime
@@ -25,7 +25,7 @@
make install
# Arch
- sudo pacman -S base-devel hwloc openssl cmake libmicrohttpd
+ sudo pacman -S --needed base-devel hwloc openssl cmake libmicrohttpd
git clone https://github.com/fireice-uk/xmr-stak.git
mkdir xmr-stak/build
cd xmr-stak/build
@@ -64,6 +64,34 @@
cd xmr-stak/build
cmake ..
make install
+
+ # TinyCore Linux 8.x
+ # TinyCore is 32-bit only, but there is an x86-64 port, known as "Pure 64,"
+ # hosted on the TinyCore home page, and it works well.
+ # Beware that huge page support is not enabled in the kernel distributed
+ # with Pure 64. Consider http://wiki.tinycorelinux.net/wiki:custom_kernel
+ # Note that as of yet there are no distro packages for microhttpd or hwloc.
+ # hwloc is easy enough to install manually though, shown below.
+ # Also note that only CPU mining has been tested on this platform, thus the
+ # disabling of CUDA and OpenCL shown below.
+ tce-load -iw openssl-dev.tcz cmake.tcz make.tcz gcc.tcz git.tcz \
+ glibc_base-dev.tcz linux-4.8.1_api_headers.tcz \
+ glibc_add_lib.tcz
+ wget https://www.open-mpi.org/software/hwloc/v1.11/downloads/hwloc-1.11.8.tar.gz
+ tar xzvf hwloc-1.11.8.tar.gz
+ cd hwloc-1.11.8
+ ./configure --prefix=/usr/local
+ make
+ sudo make install
+ cd ..
+ git clone http://github.com/fireice-uk/xmr-stak
+ cd xmr-stak
+ mkdir build
+ cd build
+ CC=gcc cmake .. -DCUDA_ENABLE=OFF \
+ -DOpenCL_ENABLE=OFF \
+ -DMICROHTTPD_ENABLE=OFF
+ make install
```
- g++ version 5.1 or higher is required for full C++11 support.
diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md
index c9a8ff78e..129596c76 100644
--- a/doc/compile_Windows.md
+++ b/doc/compile_Windows.md
@@ -4,53 +4,54 @@
### Preparation
-- open a command line `cmd`
-- run `mkdir C:\xmr-stak-dep`
+- Open a command line (Windows key + r) and enter `cmd`
+- Execute `mkdir C:\xmr-stak-dep`
-### Visual Studio 2017 Community
+### Visual Studio Community 2017
-- download VS2017 Community and install from [https://www.visualstudio.com/downloads/](https://www.visualstudio.com/downloads/)
-- during the install chose the components
+- Download and install [Visual Studio Community 2017](https://www.visualstudio.com/downloads/)
+- During install choose following components:
- `Desktop development with C++` (left side)
- - `VC++ 2015.3 v140 toolset for desktop` (right side)
+ - `VC++ 2015.3 v140 toolset for desktop` (right side - **NOT** needed for CUDA 9 or AMD GPU)
+ - Since release of VS2017 15.5 (12/04/17), require `VC++ 2017 version 15.4 v14.11 toolset` (under tab `Individual Components`, section `Compilers, build tools, and runtimes`), as CUDA 9.1 is not compatible with compiler 14.12.X
### CMake for Win64
-- download and install the latest version from [https://cmake.org/download/](https://cmake.org/download/)
-- tested version: [cmake 3.9](https://cmake.org/files/v3.9/cmake-3.9.0-rc3-win64-x64.msi)
-- during the install choose the option `Add CMake to the system PATH for all users`
+- Download and install latest version from https://cmake.org/download/
+- Tested version: [cmake 3.9](https://cmake.org/files/v3.9/cmake-3.9.0-rc3-win64-x64.msi)
+- During install choose option: `Add CMake to the system PATH for all users`
-### Cuda 8.0+ (only needed to use NVIDIA GPUs)
+### Cuda 8.0+ (only needed for NVIDIA GPUs)
-- donwload and install [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads)
-- for minimal install choose `Custom installation options` during the install and select
- - CUDA/Develpment
+- Download and install https://developer.nvidia.com/cuda-downloads
+- For minimal install choose `Custom installation options` during the install and select
+ - CUDA/Development
- CUDA/Visual Studio Integration (ignore the warning during the install that VS2017 is not supported)
- CUDA/Runtime
- Driver components
-### AMD APP SDK 3.0 (only needed to use AMD GPUs)
+### AMD APP SDK 3.0 (only needed for AMD GPUs)
-- download and install the latest version from [http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/](http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/)
+- Download and install the latest version from http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/
### Dependencies OpenSSL/Hwloc and Microhttpd
-- for CUDA 8*:
- - download the version 1 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip)
- - version 1 of the pre-compiled dependencies is not compatible with Visual Studio Toolset v141
-- for CUDA 9 **and/or** AMD GPUs, CPU:
- - download the version 2 of the precompiled binary from [https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip](https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip)
- - version 2 of the pre-compiled dependencies is not compatible with Visual Studio Toolset v140
-- unzip all to `C:\xmr-stak-dep`
+- For CUDA 8*:
+ - Download version 1 of the precompiled binary from https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip
+ - Version 1 of the pre-compiled dependencies is not compatible with Visual Studio Toolset v141
+- For CUDA 9* **and/or** AMD GPUs, CPU:
+ - Download version 2 of the precompiled binary from https://github.com/fireice-uk/xmr-stak-dep/releases/download/v2/xmr-stak-dep.zip
+ - Version 2 of the pre-compiled dependencies is not compatible with Visual Studio Toolset v140
+- Extract archive to `C:\xmr-stak-dep`
### Validate the Dependency Folder
-- open a command line `cmd`
-- run
+- Open a command line (Windows key + r) and enter `cmd`
+- Execute
```
cd c:\xmr-stak-dep
tree .
```
-- the result should have the same structure
+- You should see something like this:
```
C:\xmr-stak-dep>tree .
Folder PATH listing for volume Windows
@@ -75,29 +76,37 @@
## Compile
-- download and unzip `xmr-stak`
-- open the command line terminal `cmd`
-- `cd` to your unzipped source code directory
-- execute the following commands (NOTE: path to VS2017 can be different)
+- Download xmr-stak [Source Code.zip](https://github.com/fireice-uk/xmr-stak/releases) and save to a location in your home folder (C:\Users\USERNAME\)
+- Extract `Source Code.zip` (e.g. to `C:\Users\USERNAME\xmr-stak-`)
+- Open a command line (Windows key + r) and enter `cmd`
+- Go to extracted source code directory (e.g. `cd C:\Users\USERNAME\xmr-stak-`)
+- Execute the following commands (NOTE: path to Visual Studio Community 2017 can be different)
```
+ # Execute next line only if compiling for Cuda 9.1 and using Visual Studio 2017 >= 15.5 (released 12/04/17)
+ "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 -vcvars_ver=14.11
+
"C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\Tools\VsMSBuildCmd.bat"
- set CMAKE_PREFIX_PATH=C:\xmr-stak-dep\hwloc;C:\xmr-stak-dep\libmicrohttpd;C:\xmr-stak-dep\openssl
+ ```
+- Sometimes Windows will change the directory to `C:\Users\USERNAME\source\` instead of `C:\Users\USERNAME\xmr-stak-\`. If that's the case execute `cd C:\Users\USERNAME\xmr-stak-` followed by:
+ ```
mkdir build
+
cd build
+
+ set CMAKE_PREFIX_PATH=C:\xmr-stak-dep\hwloc;C:\xmr-stak-dep\libmicrohttpd;C:\xmr-stak-dep\openssl
```
- - for CUDA 8*
- ```
- cmake -G "Visual Studio 15 2017 Win64" -T v140,host=x64 ..
- ```
- - for CUDA 9 **and/or** AMD GPUs, CPU
- ```
- cmake -G "Visual Studio 15 2017 Win64" -T v141,host=x64 ..
- ```
+
+### CMake
+
+- See [build options](https://github.com/fireice-uk/xmr-stak/blob/master/doc/compile.md#build-system) to enable or disable dependencies.
+- For CUDA 8* execute: `cmake -G "Visual Studio 15 2017 Win64" -T v140,host=x64 ..`
+- For CUDA 9* **and/or** AMD GPUs, CPU execute: `cmake -G "Visual Studio 15 2017 Win64" -T v141,host=x64 ..`
+- Then execute
```
cmake --build . --config Release --target install
+
cd bin\Release
+
copy C:\xmr-stak-dep\openssl\bin\* .
```
-
-\* Miner is also compiled for AMD GPUs (if the AMD APP SDK is installed) and CPUs.
-CUDA 8 requires a downgrade to the old v140 tool chain.
+- Miner is by default compiled for NVIDIA GPUs (if CUDA is installed), AMD GPUs (if the AMD APP SDK is installed) and CPUs.
diff --git a/doc/compile_MacOS.md b/doc/compile_macOS.md
similarity index 69%
rename from doc/compile_MacOS.md
rename to doc/compile_macOS.md
index 1b0af91dc..46f1d5b32 100644
--- a/doc/compile_MacOS.md
+++ b/doc/compile_macOS.md
@@ -1,4 +1,4 @@
-# Compile **xmr-stak** for MacOS
+# Compile **xmr-stak** for macOS
## Dependencies
@@ -18,7 +18,13 @@ make install
### For AMD GPUs
-> 🖐 We need help with AMD GPU compilation instructions. Please submit a PR if you managed to install [AMD APP SDK](http://developer.amd.com/amd-accelerated-parallel-processing-app-sdk/) and to compile `xmr-stak` on MacOS.
+OpenCL is bundled with Xcode, so no other depedency then the basic ones needed. Just enable OpenCL via the `-DOpenCL_ENABLE=ON` CMake option.
+
+```shell
+brew install hwloc libmicrohttpd gcc openssl cmake
+cmake . -DOPENSSL_ROOT_DIR=/usr/local/opt/openssl -DCUDA_ENABLE=OFF -DOpenCL_ENABLE=ON
+make install
+```
### For CPU-only mining
diff --git a/doc/tuning.md b/doc/tuning.md
index 5125387d0..47ad0bb2c 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -1,6 +1,7 @@
# Tuning Guide
## Content Overview
+* [Benchmark](#benchmark)
* [Windows](#windows)
* [NVIDIA Backend](#nvidia-backend)
* [Choose Value for `threads` and `blocks`](#choose-value-for-threads-and-blocks)
@@ -8,11 +9,18 @@
* [AMD Backend](#amd-backend)
* [Choose `intensity` and `worksize`](#choose-intensity-and-worksize)
* [Add more GPUs](#add-more-gpus)
+ * [disable comp_mode](#disable-comp_mode)
+ * [change the scratchpad memory pattern](change-the-scratchpad-memory-pattern)
* [Increase Memory Pool](#increase-memory-pool)
* [Scratchpad Indexing](#scratchpad-indexing)
* [CPU Backend](#cpu-backend)
* [Choose Value for `low_power_mode`](#choose-value-for-low_power_mode)
+## Benchmark
+To benchmark the miner speed there are two ways.
+ - Mine against a pool end press the key `h` after 30 sec to see the hash report.
+ - Start the miner with the cli option `--benchmark BLOCKVERSION`. The miner will not connect to any pool and performs a 60sec performance benchmark with all enabled back-ends.
+
## Windows
"Run As Administrator" prompt (UAC) confirmation is needed to use large pages on Windows 7.
On Windows 10 it is only needed once to set up the account to use them.
@@ -46,8 +54,12 @@ To add a new GPU you need to add a new config set to `gpu_threads_conf`.
```
"gpu_threads_conf" :
[
- { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, "affine_to_cpu" : false},
- { "index" : 1, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0, "affine_to_cpu" : false},
+ { "index" : 0, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0,
+ "affine_to_cpu" : false, "sync_mode" : 3,
+ },
+ { "index" : 1, "threads" : 17, "blocks" : 60, "bfactor" : 0, "bsleep" : 0,
+ "affine_to_cpu" : false, "sync_mode" : 3,
+ },
],
```
@@ -70,13 +82,26 @@ If you are unsure of either GPU or platform index value, you can use `clinfo` to
```
"gpu_threads_conf" :
[
- { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false },
- { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false },
+ { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+ "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true
+ },
+ { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+ "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true
+ },
],
"platform_index" : 0,
```
+### disable comp_mode
+
+`comp_mode` means compatibility mode and removes some checks in compute kernel those takes care that the miner can be used on a wide range of AMD/OpenCL GPU devices.
+To avoid miner crashes the `intensity` should be a multiple of `worksize` if `comp_mode` is `false`.
+
+### change the scratchpad memory pattern
+
+By changing `strided_index` to `2` the number of contiguous elements (a 16 byte) for one miner thread can be fine tuned with the option `mem_chunk`.
+
### Increase Memory Pool
By setting the following environment variables before the miner is started OpenCl allows the miner to more threads.
@@ -84,9 +109,9 @@ This variables must be set each time before the miner is started else it could b
```
export GPU_FORCE_64BIT_PTR=1
-export GPU_MAX_HEAP_SIZE=99
-export GPU_MAX_ALLOC_PERCENT=99
-export GPU_SINGLE_ALLOC_PERCENT=99
+export GPU_MAX_HEAP_SIZE=100
+export GPU_MAX_ALLOC_PERCENT=100
+export GPU_SINGLE_ALLOC_PERCENT=100
```
*Note:* Windows user must use `set` instead of `export` to define an environment variable.
diff --git a/doc/usage.md b/doc/usage.md
index 60cf69b40..1f1fb0961 100644
--- a/doc/usage.md
+++ b/doc/usage.md
@@ -5,7 +5,7 @@
* [Usage on Windows](#usage-on-windows)
* [Usage on Linux](#usage-on-linux)
* [Command Line Options](#command-line-options)
-* [HTML and JSON API report configuraton](#xx)
+* [HTML and JSON API report configuraton](#html-and-json-api-report-configuraton)
## Configurations
@@ -13,17 +13,18 @@ Before you started the miner the first time there are no config files available.
Config files will be created at the first start.
The number of files depends on the available backends.
`config.txt` contains the common miner settings.
+`pools.txt` contains the selected mining pools and currency to mine.
`amd.txt`, `cpu.txt` and `nvidia.txt` contains miner backend specific settings and can be used for further tuning ([Tuning Guide](tuning.md)).
## Usage on Windows
1) Double click the `xmr-stak.exe` file
-2) Fill in the pool url, username and password
+2) Fill in the pool url settings, currency, username and password
`set XMRSTAK_NOWAIT=1` disable the dialog `Press any key to exit.` for non UAC execution.
-## Usage on Linux & MacOS
+## Usage on Linux & macOS
1) Open a terminal within the folder with the binary
2) Start the miner with `./xmr-stak`
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index d9bc96235..8d0fd3289 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -15,6 +15,7 @@
#include "xmrstak/backend/cryptonight.hpp"
#include "xmrstak/jconf.hpp"
+#include "xmrstak/picosha2/picosha2.hpp"
#include
#include
@@ -25,8 +26,41 @@
#include
#include
+#include
+#include
+#include
+#include
+#include
+
+#if defined _MSC_VER
+#include
+#elif defined __GNUC__
+#include
+#include
+#endif
+
+
+
#ifdef _WIN32
#include
+#include
+
+static inline void create_directory(std::string dirname)
+{
+ _mkdir(dirname.data());
+}
+
+static inline std::string get_home()
+{
+ char path[MAX_PATH + 1];
+ // get folder "appdata\local"
+ if (SHGetSpecialFolderPathA(HWND_DESKTOP, path, CSIDL_LOCAL_APPDATA, FALSE))
+ {
+ return path;
+ }
+ else
+ return ".";
+}
static inline void port_sleep(size_t sec)
{
@@ -34,6 +68,22 @@ static inline void port_sleep(size_t sec)
}
#else
#include
+#include
+
+static inline void create_directory(std::string dirname)
+{
+ mkdir(dirname.data(), 0744);
+}
+
+static inline std::string get_home()
+{
+ const char *home = ".";
+
+ if ((home = getenv("HOME")) == nullptr)
+ home = getpwuid(getuid())->pw_dir;
+
+ return home;
+}
static inline void port_sleep(size_t sec)
{
@@ -84,6 +134,7 @@ const char* err_to_str(cl_int ret)
return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
+#ifdef CL_VERSION_1_2
case CL_COMPILE_PROGRAM_FAILURE:
return "CL_COMPILE_PROGRAM_FAILURE";
case CL_LINKER_NOT_AVAILABLE:
@@ -94,6 +145,7 @@ const char* err_to_str(cl_int ret)
return "CL_DEVICE_PARTITION_FAILED";
case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
+#endif
case CL_INVALID_VALUE:
return "CL_INVALID_VALUE";
case CL_INVALID_DEVICE_TYPE:
@@ -164,6 +216,7 @@ const char* err_to_str(cl_int ret)
return "CL_INVALID_GLOBAL_WORK_SIZE";
case CL_INVALID_PROPERTY:
return "CL_INVALID_PROPERTY";
+#ifdef CL_VERSION_1_2
case CL_INVALID_IMAGE_DESCRIPTOR:
return "CL_INVALID_IMAGE_DESCRIPTOR";
case CL_INVALID_COMPILER_OPTIONS:
@@ -172,6 +225,7 @@ const char* err_to_str(cl_int ret)
return "CL_INVALID_LINKER_OPTIONS";
case CL_INVALID_DEVICE_PARTITION_COUNT:
return "CL_INVALID_DEVICE_PARTITION_COUNT";
+#endif
#if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2)
case CL_INVALID_PIPE_SIZE:
return "CL_INVALID_PIPE_SIZE";
@@ -252,21 +306,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
return ERR_OCL_API;
}
- size_t hashMemSize;
- int threadMemMask;
- int hasIterations;
- if(::jconf::inst()->IsCurrencyMonero())
- {
- hashMemSize = MONERO_MEMORY;
- threadMemMask = MONERO_MASK;
- hasIterations = MONERO_ITER;
- }
- else
- {
- hashMemSize = AEON_MEMORY;
- threadMemMask = AEON_MASK;
- hasIterations = AEON_ITER;
- }
+ size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo());
+ int threadMemMask = cn_select_mask(::jconf::inst()->GetMiningAlgo());
+ int hashIterations = cn_select_iter(::jconf::inst()->GetMiningAlgo());
size_t g_thd = ctx->rawIntensity;
ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * g_thd, NULL, &ret);
@@ -323,60 +365,162 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
return ERR_OCL_API;
}
- ctx->Program = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret);
- if(ret != CL_SUCCESS)
+ std::vector devNameVec(1024);
+ if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS)
{
- printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the contents of cryptonight.cl", err_to_str(ret));
+ printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(ret),ctx->deviceIdx );
return ERR_OCL_API;
}
- char options[256];
- snprintf(options, sizeof(options),
- "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d",
- hasIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex ? 1 : 0);
- ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL);
- if(ret != CL_SUCCESS)
+ auto miner_algo = ::jconf::inst()->GetMiningAlgo();
+
+ char options[512];
+ snprintf(options, sizeof(options),
+ "-DITERATIONS=%d -DMASK=%d -DWORKSIZE=%llu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%llu -DALGO=%d",
+ hashIterations, threadMemMask, int_port(ctx->workSize), ctx->stridedIndex, int(1u<memChunk), ctx->compMode ? 1 : 0,
+ int_port(hashMemSize), int(miner_algo));
+ /* create a hash for the compile time cache
+ * used data:
+ * - source code
+ * - device name
+ * - compile paramater
+ */
+ std::string src_str(source_code);
+ src_str += options;
+ src_str += devNameVec.data();
+ std::string hash_hex_str;
+ picosha2::hash256_hex_string(src_str, hash_hex_str);
+
+ std::string cache_file = get_home() + "/.openclcache/" + hash_hex_str + ".openclbin";
+ std::ifstream clBinFile(cache_file, std::ofstream::in | std::ofstream::binary);
+ if(!clBinFile.good())
{
- size_t len;
- printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret));
+ printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str());
+ ctx->Program = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret);
+ if(ret != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret));
+ return ERR_OCL_API;
+ }
- if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS)
+ ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, options, NULL, NULL);
+ if(ret != CL_SUCCESS)
{
- printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
+ size_t len;
+ printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret));
+
+ if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret));
+ return ERR_OCL_API;
+ }
+
+ char* BuildLog = (char*)malloc(len + 1);
+ BuildLog[0] = '\0';
+
+ if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS)
+ {
+ free(BuildLog);
+ printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
+ return ERR_OCL_API;
+ }
+
+ printer::inst()->print_str("Build log:\n");
+ std::cerr<Program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL);
+
- if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS)
+ std::vector devices_ids(num_devices);
+ clGetProgramInfo(ctx->Program, CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL);
+ int dev_id = 0;
+ /* Search for the gpu within the program context.
+ * The id can be different to ctx->DeviceID.
+ */
+ for(auto & ocl_device : devices_ids)
{
- free(BuildLog);
- printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret));
+ if(ocl_device == ctx->DeviceID)
+ break;
+ dev_id++;
+ }
+
+ cl_build_status status;
+ do
+ {
+ if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
+ return ERR_OCL_API;
+ }
+ port_sleep(1);
+ }
+ while(status == CL_BUILD_IN_PROGRESS);
+
+ std::vector binary_sizes(num_devices);
+ clGetProgramInfo (ctx->Program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL);
+
+ std::vector all_programs(num_devices);
+ std::vector> program_storage;
+
+ int p_id = 0;
+ size_t mem_size = 0;
+ // create memory structure to query all OpenCL program binaries
+ for(auto & p : all_programs)
+ {
+ program_storage.emplace_back(std::vector(binary_sizes[p_id]));
+ all_programs[p_id] = program_storage[p_id].data();
+ mem_size += binary_sizes[p_id];
+ p_id++;
+ }
+
+ if( ret = clGetProgramInfo(ctx->Program, CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret));
return ERR_OCL_API;
}
-
- printer::inst()->print_str("Build log:\n");
- std::cerr<print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s",ctx->deviceIdx, cache_file.c_str());
}
-
- cl_build_status status;
- do
+ else
{
- if((ret = clGetProgramBuildInfo(ctx->Program, ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS)
+ printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled cod from file %s",ctx->deviceIdx, cache_file.c_str());
+ std::ostringstream ss;
+ ss << clBinFile.rdbuf();
+ std::string s = ss.str();
+
+ size_t bin_size = s.size();
+ auto data_ptr = s.data();
+
+ cl_int clStatus;
+ ctx->Program = clCreateProgramWithBinary(
+ opencl_ctx, 1, &ctx->DeviceID, &bin_size,
+ (const unsigned char **)&data_ptr, &clStatus, &ret
+ );
+ if(ret != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str());
+ return ERR_OCL_API;
+ }
+ ret = clBuildProgram(ctx->Program, 1, &ctx->DeviceID, NULL, NULL, NULL);
+ if(ret != CL_SUCCESS)
{
- printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret));
+ printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str());
return ERR_OCL_API;
}
- port_sleep(1);
}
- while(status == CL_BUILD_IN_PROGRESS);
- const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" };
- for(int i = 0; i < 7; ++i)
+ const char *KernelNames[] = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein", "cn1_monero" };
+ for(int i = 0; i < 8; ++i)
{
ctx->Kernels[i] = clCreateKernel(ctx->Program, KernelNames[i], &ret);
if(ret != CL_SUCCESS)
@@ -487,7 +631,7 @@ std::vector getAMDDevices(int index)
printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k);
continue;
}
-
+
std::string devVendor(devVendorVec.data());
if( devVendor.find("Advanced Micro Devices") != std::string::npos || devVendor.find("AMD") != std::string::npos)
{
@@ -518,13 +662,13 @@ std::vector getAMDDevices(int index)
printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k);
continue;
}
- printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
// if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory
ctx.deviceIdx = k;
ctx.freeMem = std::min(ctx.freeMem, maxMem);
ctx.name = std::string(devNameVec.data());
ctx.DeviceID = device_list[k];
+ printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str());
ctxVec.push_back(ctx);
}
}
@@ -549,6 +693,8 @@ int getAMDPlatformIdx()
clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL);
int platformIndex = -1;
+ // Mesa OpenCL is the fallback if no AMD or Apple OpenCL is found
+ int mesaPlatform = -1;
if(clStatus == CL_SUCCESS)
{
@@ -559,13 +705,29 @@ int getAMDPlatformIdx()
clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, infoSize, platformNameVec.data(), NULL);
std::string platformName(platformNameVec.data());
- if( platformName.find("Advanced Micro Devices") != std::string::npos || platformName.find("Apple") != std::string::npos)
+ if( platformName.find("Advanced Micro Devices") != std::string::npos ||
+ platformName.find("Apple") != std::string::npos ||
+ platformName.find("Mesa") != std::string::npos
+ )
{
- platformIndex = i;
+
printer::inst()->print_msg(L0,"Found AMD platform index id = %i, name = %s",i , platformName.c_str());
- break;
+ if(platformName.find("Mesa") != std::string::npos)
+ mesaPlatform = i;
+ else
+ {
+ // exit if AMD or Apple platform is found
+ platformIndex = i;
+ break;
+ }
}
}
+ // fall back to Mesa OpenCL
+ if(platformIndex == -1 && mesaPlatform != -1)
+ {
+ printer::inst()->print_msg(L0,"No AMD platform found select Mesa as OpenCL platform");
+ platformIndex = mesaPlatform;
+ }
}
else
printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus));
@@ -694,8 +856,18 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_BLAKE256"), blake256CL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_GROESTL256"), groestl256CL);
+ // create a directory for the OpenCL compile cache
+ create_directory(get_home() + "/.openclcache");
+
for(int i = 0; i < num_gpus; ++i)
{
+ if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0)
+ {
+ size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize;
+ ctx[i].rawIntensity = reduced_intensity;
+ printer::inst()->print_msg(L0, "WARNING AMD: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", ctx[i].deviceIdx, int(reduced_intensity));
+ }
+
if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS)
{
return ret;
@@ -705,7 +877,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
return ERR_SUCCESS;
}
-size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target)
+size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo, uint32_t version)
{
cl_int ret;
@@ -750,29 +922,65 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
return(ERR_OCL_API);
}
- // CN2 Kernel
+ if(miner_algo == cryptonight_heavy)
+ {
+ // version
+ if ((ret = clSetKernelArg(ctx->Kernels[0], 4, sizeof(cl_uint), &version)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 4.", err_to_str(ret));
+ return ERR_OCL_API;
+ }
+ }
+
+ // CN1 Kernel
+
+ /// @todo only activate if currency is monero
+ int cn_kernel_offset = 0;
+ if(miner_algo == cryptonight_monero && version >= 7)
+ {
+ cn_kernel_offset = 6;
+ }
// Scratchpads
- if((ret = clSetKernelArg(ctx->Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
return ERR_OCL_API;
}
// States
- if((ret = clSetKernelArg(ctx->Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret));
return ERR_OCL_API;
}
// Threads
- if((ret = clSetKernelArg(ctx->Kernels[1], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+ if((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
return(ERR_OCL_API);
}
+ if(miner_algo == cryptonight_monero && version >= 7)
+ {
+ // Input
+ if ((ret = clSetKernelArg(ctx->Kernels[1 + cn_kernel_offset], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, arugment 4(input buffer).", err_to_str(ret));
+ return ERR_OCL_API;
+ }
+ }
+ else if(miner_algo == cryptonight_heavy)
+ {
+ // version
+ if ((ret = clSetKernelArg(ctx->Kernels[1], 3, sizeof(cl_uint), &version)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 3 (version).", err_to_str(ret));
+ return ERR_OCL_API;
+ }
+ }
+
// CN3 Kernel
// Scratchpads
if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
@@ -823,6 +1031,16 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
return(ERR_OCL_API);
}
+ if(miner_algo == cryptonight_heavy)
+ {
+ // version
+ if ((ret = clSetKernelArg(ctx->Kernels[2], 7, sizeof(cl_uint), &version)) != CL_SUCCESS)
+ {
+ printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 7.", err_to_str(ret));
+ return ERR_OCL_API;
+ }
+ }
+
for(int i = 0; i < 4; ++i)
{
// States
@@ -857,7 +1075,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
return ERR_SUCCESS;
}
-size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
+size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, uint32_t version)
{
cl_int ret;
cl_uint zero = 0;
@@ -866,10 +1084,15 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
size_t g_intensity = ctx->rawIntensity;
size_t w_size = ctx->workSize;
- // round up to next multiple of w_size
- size_t g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size;
- // number of global threads must be a multiple of the work group size (w_size)
- assert(g_thd%w_size == 0);
+ size_t g_thd = g_intensity;
+
+ if(ctx->compMode)
+ {
+ // round up to next multiple of w_size
+ g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size;
+ // number of global threads must be a multiple of the work group size (w_size)
+ assert(g_thd%w_size == 0);
+ }
for(int i = 2; i < 6; ++i)
{
@@ -905,7 +1128,13 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput)
}*/
size_t tmpNonce = ctx->Nonce;
- if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
+ /// @todo only activate if currency is monero
+ int cn_kernel_offset = 0;
+ if(miner_algo == cryptonight_monero && version >= 7)
+ {
+ cn_kernel_offset = 6;
+ }
+ if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[1 + cn_kernel_offset], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1);
return ERR_OCL_API;
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
index c17bac11b..a387b1535 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.hpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -1,6 +1,7 @@
#pragma once
#include "xmrstak/misc/console.hpp"
+#include "xmrstak/jconf.hpp"
#if defined(__APPLE__)
#include
@@ -25,6 +26,8 @@ struct GpuContext
size_t rawIntensity;
size_t workSize;
int stridedIndex;
+ int memChunk;
+ int compMode;
/*Output vars*/
cl_device_id DeviceID;
@@ -33,7 +36,7 @@ struct GpuContext
cl_mem OutputBuffer;
cl_mem ExtraBuffers[6];
cl_program Program;
- cl_kernel Kernels[7];
+ cl_kernel Kernels[8];
size_t freeMem;
int computeUnits;
std::string name;
@@ -47,7 +50,7 @@ int getAMDPlatformIdx();
std::vector getAMDDevices(int index);
size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx);
-size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target);
-size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput);
+size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo, uint32_t version);
+size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo, uint32_t version);
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 255fcbbff..7a363570b 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -14,6 +14,11 @@ R"===(
* along with this program. If not, see .
*/
+/* For Mesa clover support */
+#ifdef cl_clang_storage_class_specifiers
+# pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
+#endif
+
#ifdef cl_amd_media_ops
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#else
@@ -399,7 +404,7 @@ static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x
void AESExpandKey256(uint *keybuf)
{
//#pragma unroll 4
- for(uint c = 8, i = 1; c < 60; ++c)
+ for(uint c = 8, i = 1; c < 40; ++c)
{
// For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th
uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1];
@@ -411,21 +416,42 @@ void AESExpandKey256(uint *keybuf)
}
}
+#define MEM_CHUNK (1<> 2);
-#else
+ Scratchpad += gIdx * (MEMORY >> 4);
+#elif(STRIDED_INDEX==1)
Scratchpad += gIdx;
+#elif(STRIDED_INDEX==2)
+ Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
#endif
((ulong8 *)State)[0] = vload8(0, input);
@@ -470,9 +500,10 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
-
+#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
+#endif
{
#pragma unroll
for(int i = 0; i < 25; ++i) states[i] = State[i];
@@ -486,12 +517,41 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
}
mem_fence(CLK_LOCAL_MEM_FENCE);
+
+// cryptonight_heavy
+#if (ALGO == 4)
+ if(version >= 3)
+ {
+ __local uint4 xin[8][WORKSIZE];
+
+ /* Also left over threads performe this loop.
+ * The left over thread results will be ignored
+ */
+ for(size_t i=0; i < 16; i++)
+ {
+ #pragma unroll
+ for(int j = 0; j < 10; ++j)
+ text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey1)[j]);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ xin[get_local_id(1)][get_local_id(0)] = text;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ text = mix_and_propagate(xin);
+ }
+ }
+#endif
+#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
+#endif
{
+ int iterations = MEMORY >> 7;
+#if (ALGO == 4)
+ if(version < 3)
+ iterations >>= 1;
+#endif
#pragma unroll 2
- for(int i = 0; i < (ITERATIONS >> 5); ++i)
+ for(int i = 0; i < iterations; ++i)
{
#pragma unroll
for(int j = 0; j < 10; ++j)
@@ -503,13 +563,27 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
+#define VARIANT1_1(p) \
+ uint table = 0x75310U; \
+ uint index = (((p).s2 >> 26) & 12) | (((p).s2 >> 23) & 2); \
+ (p).s2 ^= ((table >> index) & 0x30U) << 24
+
+#define VARIANT1_2(p) ((uint2 *)&(p))[0] ^= tweak1_2
+
+#define VARIANT1_INIT() \
+ tweak1_2 = as_uint2(input[4]); \
+ tweak1_2.s0 >>= 24; \
+ tweak1_2.s0 |= tweak1_2.s1 << 8; \
+ tweak1_2.s1 = get_global_id(0); \
+ tweak1_2 ^= as_uint2(states[24])
+
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Threads)
+__kernel void cn1_monero(__global uint4 *Scratchpad, __global ulong *states, ulong Threads, __global ulong *input)
{
ulong a[2], b[2];
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
- const ulong gIdx = get_global_id(0) - get_global_offset(0);
+ const ulong gIdx = getIdx();
for(int i = get_local_id(0); i < 256; i += WORKSIZE)
{
@@ -522,16 +596,20 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
barrier(CLK_LOCAL_MEM_FENCE);
+ uint2 tweak1_2;
uint4 b_x;
-
+#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
+#endif
{
states += 25 * gIdx;
#if(STRIDED_INDEX==0)
- Scratchpad += gIdx * (ITERATIONS >> 2);
-#else
+ Scratchpad += gIdx * (MEMORY >> 4);
+#elif(STRIDED_INDEX==1)
Scratchpad += gIdx;
+#elif(STRIDED_INDEX==2)
+ Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
#endif
a[0] = states[0] ^ states[4];
@@ -540,12 +618,15 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
b[1] = states[3] ^ states[7];
b_x = ((uint4 *)b)[0];
+ VARIANT1_INIT();
}
mem_fence(CLK_LOCAL_MEM_FENCE);
+#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
+#endif
{
#pragma unroll 8
for(int i = 0; i < ITERATIONS; ++i)
@@ -554,9 +635,10 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
((uint4 *)c)[0] = Scratchpad[IDX((a[0] & MASK) >> 4)];
((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
- //b_x ^= ((uint4 *)c)[0];
- Scratchpad[IDX((a[0] & MASK) >> 4)] = b_x ^ ((uint4 *)c)[0];
+ b_x ^= ((uint4 *)c)[0];
+ VARIANT1_1(b_x);
+ Scratchpad[IDX((a[0] & MASK) >> 4)] = b_x;
uint4 tmp;
tmp = Scratchpad[IDX((c[0] & MASK) >> 4)];
@@ -564,25 +646,136 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
a[1] += c[0] * as_ulong2(tmp).s0;
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
+ VARIANT1_2(a[1]);
Scratchpad[IDX((c[0] & MASK) >> 4)] = ((uint4 *)a)[0];
+ VARIANT1_2(a[1]);
+
+ ((uint4 *)a)[0] ^= tmp;
+
+ b_x = ((uint4 *)c)[0];
+ }
+ }
+ mem_fence(CLK_GLOBAL_MEM_FENCE);
+}
+
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Threads
+// cryptonight_heavy
+#if (ALGO == 4)
+ , uint version
+#endif
+)
+{
+ ulong a[2], b[2];
+ __local uint AES0[256], AES1[256], AES2[256], AES3[256];
+
+ const ulong gIdx = getIdx();
+
+ for(int i = get_local_id(0); i < 256; i += WORKSIZE)
+ {
+ const uint tmp = AES0_C[i];
+ AES0[i] = tmp;
+ AES1[i] = rotate(tmp, 8U);
+ AES2[i] = rotate(tmp, 16U);
+ AES3[i] = rotate(tmp, 24U);
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ uint4 b_x;
+#if(COMP_MODE==1)
+ // do not use early return here
+ if(gIdx < Threads)
+#endif
+ {
+ states += 25 * gIdx;
+#if(STRIDED_INDEX==0)
+ Scratchpad += gIdx * (MEMORY >> 4);
+#elif(STRIDED_INDEX==1)
+ Scratchpad += gIdx;
+#elif(STRIDED_INDEX==2)
+ Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
+#endif
+
+ a[0] = states[0] ^ states[4];
+ b[0] = states[2] ^ states[6];
+ a[1] = states[1] ^ states[5];
+ b[1] = states[3] ^ states[7];
+
+ b_x = ((uint4 *)b)[0];
+ }
+
+ mem_fence(CLK_LOCAL_MEM_FENCE);
+
+#if(COMP_MODE==1)
+ // do not use early return here
+ if(gIdx < Threads)
+#endif
+ {
+ ulong idx0 = a[0];
+ ulong mask = MASK;
+
+ int iterations = ITERATIONS;
+#if (ALGO == 4)
+ if(version < 3)
+ {
+ iterations <<= 1;
+ mask -= 0x200000;
+ }
+#endif
+ #pragma unroll 8
+ for(int i = 0; i < iterations; ++i)
+ {
+ ulong c[2];
+
+ ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & mask) >> 4)];
+ ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]);
+ //b_x ^= ((uint4 *)c)[0];
+
+ Scratchpad[IDX((idx0 & mask) >> 4)] = b_x ^ ((uint4 *)c)[0];
+
+ uint4 tmp;
+ tmp = Scratchpad[IDX((c[0] & mask) >> 4)];
+
+ a[1] += c[0] * as_ulong2(tmp).s0;
+ a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
+
+ Scratchpad[IDX((c[0] & mask) >> 4)] = ((uint4 *)a)[0];
((uint4 *)a)[0] ^= tmp;
+ idx0 = a[0];
b_x = ((uint4 *)c)[0];
+// cryptonight_heavy
+#if (ALGO == 4)
+ if(version >= 3)
+ {
+ long n = *((__global long*)(Scratchpad + (IDX((idx0 & mask) >> 4))));
+ int d = ((__global int*)(Scratchpad + (IDX((idx0 & mask) >> 4))))[2];
+ long q = n / (d | 0x5);
+ *((__global long*)(Scratchpad + (IDX((idx0 & mask) >> 4)))) = n ^ q;
+ idx0 = d ^ q;
+ }
+#endif
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}
__attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
-__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
+__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads
+// cryptonight_heavy
+#if (ALGO == 4)
+ , uint version
+#endif
+ )
{
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
- uint ExpandedKey2[256];
+ uint ExpandedKey2[40];
ulong State[25];
uint4 text;
- const ulong gIdx = get_global_id(0) - get_global_offset(0);
+ const ulong gIdx = getIdx();
for(int i = get_local_id(1) * WORKSIZE + get_local_id(0);
i < 256;
@@ -597,14 +790,18 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
barrier(CLK_LOCAL_MEM_FENCE);
+#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
+#endif
{
states += 25 * gIdx;
#if(STRIDED_INDEX==0)
- Scratchpad += gIdx * (ITERATIONS >> 2);
-#else
+ Scratchpad += gIdx * (MEMORY >> 4);
+#elif(STRIDED_INDEX==1)
Scratchpad += gIdx;
+#elif(STRIDED_INDEX==2)
+ Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
#endif
#if defined(__Tahiti__) || defined(__Pitcairn__)
@@ -624,26 +821,111 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
barrier(CLK_LOCAL_MEM_FENCE);
+#if (ALGO == 4)
+ __local uint4 xin[8][WORKSIZE];
+#endif
+
+#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
+#endif
{
+ int iterations = MEMORY >> 7;
+#if (ALGO == 4)
+ if(version < 3)
+ {
+ iterations >>= 1;
+ #pragma unroll 2
+ for(int i = 0; i < iterations; ++i)
+ {
+ text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+
+ #pragma unroll 10
+ for(int j = 0; j < 10; ++j)
+ text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+ }
+ }
+ else
+ {
+ #pragma unroll 2
+ for(int i = 0; i < iterations; ++i)
+ {
+ text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+
+ #pragma unroll 10
+ for(int j = 0; j < 10; ++j)
+ text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ xin[get_local_id(1)][get_local_id(0)] = text;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ text = mix_and_propagate(xin);
+ }
+
+ #pragma unroll 2
+ for(int i = 0; i < iterations; ++i)
+ {
+ text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+
+ #pragma unroll 10
+ for(int j = 0; j < 10; ++j)
+ text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+ xin[get_local_id(1)][get_local_id(0)] = text;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ text = mix_and_propagate(xin);
+ }
+ }
+#else
#pragma unroll 2
- for(int i = 0; i < (ITERATIONS >> 5); ++i)
+ for(int i = 0; i < iterations; ++i)
{
text ^= Scratchpad[IDX((i << 3) + get_local_id(1))];
+ #pragma unroll 10
+ for(int j = 0; j < 10; ++j)
+ text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+ }
+#endif
+ }
+
+// cryptonight_heavy
+#if (ALGO == 4)
+ if(version >= 3)
+ {
+ /* Also left over threads performe this loop.
+ * The left over thread results will be ignored
+ */
+ for(size_t i=0; i < 16; i++)
+ {
#pragma unroll
for(int j = 0; j < 10; ++j)
text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ xin[get_local_id(1)][get_local_id(0)] = text;
+ barrier(CLK_LOCAL_MEM_FENCE);
+ text = mix_and_propagate(xin);
}
+ }
+#endif
+#if(COMP_MODE==1)
+ // do not use early return here
+ if(gIdx < Threads)
+#endif
+ {
vstore2(as_ulong2(text), get_local_id(1) + 4, states);
}
barrier(CLK_GLOBAL_MEM_FENCE);
+#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
+#endif
{
if(!get_local_id(1))
{
@@ -653,21 +935,11 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
for(int i = 0; i < 25; ++i) states[i] = State[i];
- switch(State[0] & 3)
- {
- case 0:
- Branch0[atomic_inc(Branch0 + Threads)] = get_global_id(0) - get_global_offset(0);
- break;
- case 1:
- Branch1[atomic_inc(Branch1 + Threads)] = get_global_id(0) - get_global_offset(0);
- break;
- case 2:
- Branch2[atomic_inc(Branch2 + Threads)] = get_global_id(0) - get_global_offset(0);
- break;
- case 3:
- Branch3[atomic_inc(Branch3 + Threads)] = get_global_id(0) - get_global_offset(0);
- break;
- }
+ ulong StateSwitch = State[0] & 3;
+ __global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1;
+ __global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3;
+ __global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2;
+ destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx;
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
@@ -704,8 +976,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
for(uint i = 0; i < 4; ++i)
{
- if(i < 3) t[0] += 0x40UL;
- else t[0] += 0x08UL;
+ t[0] += i < 3 ? 0x40UL : 0x08UL;
t[2] = t[0] ^ t[1];
@@ -715,8 +986,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
h = m ^ p;
- if(i < 2) t[1] = 0x3000000000000000UL;
- else t[1] = 0xB000000000000000UL;
+ t[1] = i < 2 ? 0x3000000000000000UL : 0xB000000000000000UL;
}
t[0] = 0x08UL;
@@ -744,6 +1014,27 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
+#define JHXOR \
+ h0h ^= input[0]; \
+ h0l ^= input[1]; \
+ h1h ^= input[2]; \
+ h1l ^= input[3]; \
+ h2h ^= input[4]; \
+ h2l ^= input[5]; \
+ h3h ^= input[6]; \
+ h3l ^= input[7]; \
+\
+ E8; \
+\
+ h4h ^= input[0]; \
+ h4l ^= input[1]; \
+ h5h ^= input[2]; \
+ h5l ^= input[3]; \
+ h6h ^= input[4]; \
+ h6l ^= input[5]; \
+ h7h ^= input[6]; \
+ h7l ^= input[7]
+
__kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, ulong Threads)
{
const uint idx = get_global_id(0) - get_global_offset(0);
@@ -757,46 +1048,27 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL;
sph_u64 tmp;
- for(int i = 0; i < 5; ++i)
+ for(int i = 0; i < 3; ++i)
{
ulong input[8];
- if(i < 3)
- {
- for(int x = 0; x < 8; ++x) input[x] = (states[(i << 3) + x]);
- }
- else if(i == 3)
- {
- input[0] = (states[24]);
- input[1] = 0x80UL;
- for(int x = 2; x < 8; ++x) input[x] = 0x00UL;
- }
- else
- {
- input[7] = 0x4006000000000000UL;
-
- for(int x = 0; x < 7; ++x) input[x] = 0x00UL;
- }
-
- h0h ^= input[0];
- h0l ^= input[1];
- h1h ^= input[2];
- h1l ^= input[3];
- h2h ^= input[4];
- h2l ^= input[5];
- h3h ^= input[6];
- h3l ^= input[7];
-
- E8;
-
- h4h ^= input[0];
- h4l ^= input[1];
- h5h ^= input[2];
- h5l ^= input[3];
- h6h ^= input[4];
- h6l ^= input[5];
- h7h ^= input[6];
- h7l ^= input[7];
+ const int shifted = i << 3;
+ for(int x = 0; x < 8; ++x) input[x] = (states[shifted + x]);
+ JHXOR;
+ }
+ {
+ ulong input[8];
+ input[0] = (states[24]);
+ input[1] = 0x80UL;
+ #pragma unroll 6
+ for(int x = 2; x < 8; ++x) input[x] = 0x00UL;
+ JHXOR;
+ }
+ {
+ ulong input[8];
+ for(int x = 0; x < 7; ++x) input[x] = 0x00UL;
+ input[7] = 0x4006000000000000UL;
+ JHXOR;
}
//output[0] = h6h;
@@ -832,6 +1104,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
((uint8 *)h)[0] = vload8(0U, c_IV256);
+ #pragma unroll 4
for(uint i = 0, bitlen = 0; i < 4; ++i)
{
if(i < 3)
@@ -907,6 +1180,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
State[7] = 0x0001000000000000UL;
+ #pragma unroll 4
for(uint i = 0; i < 4; ++i)
{
ulong H[8], M[8];
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl
index 868757b7b..279b65231 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-skein.cl
@@ -3,6 +3,7 @@ R"===(
#define WOLF_SKEIN_CL
// Vectorized Skein implementation macros and functions by Wolf
+// Updated by taisel
#define SKEIN_KS_PARITY 0x1BD11BDAA9FC1A22
@@ -22,11 +23,11 @@ static const __constant ulong SKEIN512_256_IV[8] =
0xC36FBAF9393AD185UL, 0x3EEDBA1833EDFC13UL
};
-#define SKEIN_INJECT_KEY(p, s) do { \
+#define SKEIN_INJECT_KEY(p, s, q) do { \
p += h; \
- p.s5 += t[s % 3]; \
- p.s6 += t[(s + 1) % 3]; \
- p.s7 += s; \
+ p.s5 += t[s]; \
+ p.s6 += t[select(s + 1U, 0U, s == 2U)]; \
+ p.s7 += q; \
} while(0)
ulong SKEIN_ROT(const uint2 x, const uint y)
@@ -35,55 +36,55 @@ ulong SKEIN_ROT(const uint2 x, const uint y)
else return(as_ulong(amd_bitalign(x.s10, x, 32 - (y - 32))));
}
-void SkeinMix8(ulong4 *pv0, ulong4 *pv1, const uint rc0, const uint rc1, const uint rc2, const uint rc3)
+void SkeinMix8(ulong4 *pv0, ulong4 *pv1, const ulong4 rc)
{
*pv0 += *pv1;
- (*pv1).s0 = SKEIN_ROT(as_uint2((*pv1).s0), rc0);
- (*pv1).s1 = SKEIN_ROT(as_uint2((*pv1).s1), rc1);
- (*pv1).s2 = SKEIN_ROT(as_uint2((*pv1).s2), rc2);
- (*pv1).s3 = SKEIN_ROT(as_uint2((*pv1).s3), rc3);
+ (*pv1).s0 = SKEIN_ROT(as_uint2((*pv1).s0), rc.s0);
+ (*pv1).s1 = SKEIN_ROT(as_uint2((*pv1).s1), rc.s1);
+ (*pv1).s2 = SKEIN_ROT(as_uint2((*pv1).s2), rc.s2);
+ (*pv1).s3 = SKEIN_ROT(as_uint2((*pv1).s3), rc.s3);
*pv1 ^= *pv0;
}
-ulong8 SkeinEvenRound(ulong8 p, const ulong8 h, const ulong *t, const uint s)
+ulong8 SkeinEvenRound(ulong8 p, const ulong8 h, const ulong *t, const uint s, const uint q)
{
- SKEIN_INJECT_KEY(p, s);
+ SKEIN_INJECT_KEY(p, s, q);
ulong4 pv0 = p.even, pv1 = p.odd;
- SkeinMix8(&pv0, &pv1, 46, 36, 19, 37);
+ SkeinMix8(&pv0, &pv1, (ulong4)(46, 36, 19, 37));
pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
- SkeinMix8(&pv0, &pv1, 33, 27, 14, 42);
+ SkeinMix8(&pv0, &pv1, (ulong4)(33, 27, 14, 42));
pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
- SkeinMix8(&pv0, &pv1, 17, 49, 36, 39);
+ SkeinMix8(&pv0, &pv1, (ulong4)(17, 49, 36, 39));
pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
- SkeinMix8(&pv0, &pv1, 44, 9, 54, 56);
+ SkeinMix8(&pv0, &pv1, (ulong4)(44, 9, 54, 56));
return(shuffle2(pv0, pv1, (ulong8)(1, 4, 2, 7, 3, 6, 0, 5)));
}
-ulong8 SkeinOddRound(ulong8 p, const ulong8 h, const ulong *t, const uint s)
+ulong8 SkeinOddRound(ulong8 p, const ulong8 h, const ulong *t, const uint s, const uint q)
{
- SKEIN_INJECT_KEY(p, s);
+ SKEIN_INJECT_KEY(p, s, q);
ulong4 pv0 = p.even, pv1 = p.odd;
- SkeinMix8(&pv0, &pv1, 39, 30, 34, 24);
+ SkeinMix8(&pv0, &pv1, (ulong4)(39, 30, 34, 24));
pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
- SkeinMix8(&pv0, &pv1, 13, 50, 10, 17);
+ SkeinMix8(&pv0, &pv1, (ulong4)(13, 50, 10, 17));
pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
- SkeinMix8(&pv0, &pv1, 25, 29, 39, 43);
+ SkeinMix8(&pv0, &pv1, (ulong4)(25, 29, 39, 43));
pv0 = shuffle(pv0, (ulong4)(1, 2, 3, 0));
pv1 = shuffle(pv1, (ulong4)(0, 3, 2, 1));
- SkeinMix8(&pv0, &pv1, 8, 35, 56, 22);
+ SkeinMix8(&pv0, &pv1, (ulong4)(8, 35, 56, 22));
return(shuffle2(pv0, pv1, (ulong8)(1, 4, 2, 7, 3, 6, 0, 5)));
}
@@ -92,20 +93,47 @@ ulong8 Skein512Block(ulong8 p, ulong8 h, ulong h8, const ulong *t)
#pragma unroll
for(int i = 0; i < 18; ++i)
{
- p = SkeinEvenRound(p, h, t, i);
+ p = SkeinEvenRound(p, h, t, 0U, i);
++i;
ulong tmp = h.s0;
h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
h.s7 = h8;
h8 = tmp;
- p = SkeinOddRound(p, h, t, i);
+ p = SkeinOddRound(p, h, t, 1U, i);
+ ++i;
+ tmp = h.s0;
+ h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+ h.s7 = h8;
+ h8 = tmp;
+ p = SkeinEvenRound(p, h, t, 2U, i);
+ ++i;
+ tmp = h.s0;
+ h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+ h.s7 = h8;
+ h8 = tmp;
+ p = SkeinOddRound(p, h, t, 0U, i);
+ ++i;
+ tmp = h.s0;
+ h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+ h.s7 = h8;
+ h8 = tmp;
+ p = SkeinEvenRound(p, h, t, 1U, i);
+ ++i;
+ tmp = h.s0;
+ h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
+ h.s7 = h8;
+ h8 = tmp;
+ p = SkeinOddRound(p, h, t, 2U, i);
tmp = h.s0;
h = shuffle(h, (ulong8)(1, 2, 3, 4, 5, 6, 7, 0));
h.s7 = h8;
h8 = tmp;
}
- SKEIN_INJECT_KEY(p, 18);
+ p += h;
+ p.s5 += t[0];
+ p.s6 += t[1];
+ p.s7 += 18;
return(p);
}
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 0bc52395a..ea057a0f6 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -83,25 +83,25 @@ class autoAdjust
constexpr size_t byteToMiB = 1024u * 1024u;
- size_t hashMemSize;
- if(::jconf::inst()->IsCurrencyMonero())
- {
- hashMemSize = MONERO_MEMORY;
- }
- else
- {
- hashMemSize = AEON_MEMORY;
- }
+ size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo());
std::string conf;
- int i = 0;
for(auto& ctx : devVec)
{
/* 1000 is a magic selected limit, the reason is that more than 2GiB memory
* sowing down the memory performance because of TLB cache misses
*/
size_t maxThreads = 1000u;
- if(ctx.name.compare("gfx901") == 0)
+ if(
+ ctx.name.compare("gfx901") == 0 ||
+ ctx.name.compare("gfx904") == 0 ||
+ // APU
+ ctx.name.compare("gfx902") == 0 ||
+ // UNKNOWN
+ ctx.name.compare("gfx900") == 0 ||
+ ctx.name.compare("gfx903") == 0 ||
+ ctx.name.compare("gfx905") == 0
+ )
{
/* Increase the number of threads for AMD VEGA gpus.
* Limit the number of threads based on the issue: https://github.com/fireice-uk/xmr-stak/issues/5#issuecomment-339425089
@@ -109,6 +109,9 @@ class autoAdjust
*/
maxThreads = 2024u;
}
+ // increase all intensity limits by two for aeon
+ if(::jconf::inst()->GetMiningAlgo() == cryptonight_lite)
+ maxThreads *= 2u;
// keep 128MiB memory free (value is randomly chosen)
size_t availableMem = ctx.freeMem - (128u * byteToMiB);
@@ -118,14 +121,28 @@ class autoAdjust
size_t possibleIntensity = std::min( maxThreads , maxIntensity );
// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group
size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8;
- conf += std::string(" // gpu: ") + ctx.name + " memory:" + std::to_string(availableMem / byteToMiB) + "\n";
- conf += std::string(" // compute units: ") + std::to_string(ctx.computeUnits) + "\n";
- // set 8 threads per block (this is a good value for the most gpus)
- conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
- " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
- " \"affine_to_cpu\" : false, \"strided_index\" : true\n"
- " },\n";
- ++i;
+ //If the intensity is 0, then it's because the multiple of the unit count is greater than intensity
+ if (intensity == 0)
+ {
+ printer::inst()->print_msg(L0, "WARNING: Auto detected intensity unexpectedly low. Try to set the environment variable GPU_SINGLE_ALLOC_PERCENT.");
+ intensity = possibleIntensity;
+
+ }
+ if (intensity != 0)
+ {
+ conf += std::string(" // gpu: ") + ctx.name + " memory:" + std::to_string(availableMem / byteToMiB) + "\n";
+ conf += std::string(" // compute units: ") + std::to_string(ctx.computeUnits) + "\n";
+ // set 8 threads per block (this is a good value for the most gpus)
+ conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
+ " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
+ " \"affine_to_cpu\" : false, \"strided_index\" : 1, \"mem_chunk\" : 2,\n"
+ " \"comp_mode\" : true\n" +
+ " },\n";
+ }
+ else
+ {
+ printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(availableMem / byteToMiB).c_str());
+ }
}
configTpl.replace("PLATFORMINDEX",std::to_string(platformIndex));
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index af662f852..28855f070 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -1,17 +1,29 @@
R"===(
/*
* GPU configuration. You should play around with intensity and worksize as the fastest settings will vary.
- * index - GPU index number usually starts from 0
- * intensity - Number of parallel GPU threads (nothing to do with CPU threads)
- * worksize - Number of local GPU threads (nothing to do with CPU threads)
+ * index - GPU index number usually starts from 0
+ * intensity - Number of parallel GPU threads (nothing to do with CPU threads)
+ * worksize - Number of local GPU threads (nothing to do with CPU threads)
* affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
* strided_index - switch memory pattern used for the scratch pad memory
- * true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
- * false = use a contiguous block of memory per thread
+ * 2 = chunked memory, chunk size is controlled by 'mem_chunk'
+ * required: intensity must be a multiple of worksize
+ * 1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
+ * 0 or false = use a contiguous block of memory per thread
+ * mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk
+ * this value is only used if 'strided_index' == 2
+ * element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte)
+ * comp_mode - Compatibility enable/disable the automatic guard around compute kernel which allows
+ * to use a intensity which is not the multiple of the worksize.
+ * If you set false and the intensity is not multiple of the worksize the miner can crash:
+ * in this case set the intensity to a multiple of the worksize or activate comp_mode.
* "gpu_threads_conf" :
* [
- * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true },
+ * { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : true, "mem_chunk" : 2, "comp_mode" : true },
* ],
+ * If you do not wish to mine with your AMD GPU(s) then use:
+ * "gpu_threads_conf" :
+ * null,
*/
"gpu_threads_conf" : [
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 07afb1964..93ba70909 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -56,9 +56,10 @@ struct configVal {
Type iType;
};
-//Same order as in configEnum, as per comment above
+// Same order as in configEnum, as per comment above
+// kNullType means any type
configVal oConfigValues[] = {
- { aGpuThreadsConf, "gpu_threads_conf", kArrayType },
+ { aGpuThreadsConf, "gpu_threads_conf", kNullType },
{ iPlatformIdx, "platform_index", kNumberType }
};
@@ -68,6 +69,8 @@ inline bool checkType(Type have, Type want)
{
if(want == have)
return true;
+ else if(want == kNullType)
+ return true;
else if(want == kTrueType && have == kFalseType)
return true;
else if(want == kFalseType && have == kTrueType)
@@ -103,14 +106,17 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(!oThdConf.IsObject())
return false;
- const Value *idx, *intensity, *w_size, *aff, *stridedIndex;
+ const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *compMode;
idx = GetObjectMember(oThdConf, "index");
intensity = GetObjectMember(oThdConf, "intensity");
w_size = GetObjectMember(oThdConf, "worksize");
aff = GetObjectMember(oThdConf, "affine_to_cpu");
stridedIndex = GetObjectMember(oThdConf, "strided_index");
+ memChunk = GetObjectMember(oThdConf, "mem_chunk");
+ compMode = GetObjectMember(oThdConf, "comp_mode");
- if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || stridedIndex == nullptr)
+ if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr ||
+ stridedIndex == nullptr || compMode == nullptr)
return false;
if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
@@ -119,13 +125,38 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
if(!aff->IsUint64() && !aff->IsBool())
return false;
- if(!stridedIndex->IsBool())
+ if(!stridedIndex->IsBool() && !stridedIndex->IsNumber())
+ {
+ printer::inst()->print_msg(L0, "ERROR: strided_index must be a bool or a number");
+ return false;
+ }
+
+ if(stridedIndex->IsBool())
+ cfg.stridedIndex = stridedIndex->GetBool() ? 1 : 0;
+ else
+ cfg.stridedIndex = (int)stridedIndex->GetInt64();
+
+ if(cfg.stridedIndex > 2)
+ {
+ printer::inst()->print_msg(L0, "ERROR: strided_index must be smaller than 2");
+ return false;
+ }
+
+ cfg.memChunk = (int)memChunk->GetInt64();
+
+ if(!idx->IsUint64() || cfg.memChunk > 18 )
+ {
+ printer::inst()->print_msg(L0, "ERROR: mem_chunk must be smaller than 18");
+ return false;
+ }
+
+ if(!compMode->IsBool())
return false;
cfg.index = idx->GetUint64();
- cfg.intensity = intensity->GetUint64();
cfg.w_size = w_size->GetUint64();
- cfg.stridedIndex = stridedIndex->GetBool();
+ cfg.intensity = intensity->GetUint64();
+ cfg.compMode = compMode->GetBool();
if(aff->IsNumber())
cfg.cpu_aff = aff->GetInt64();
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index ee1882aad..580b69fe7 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -26,7 +26,9 @@ class jconf
size_t intensity;
size_t w_size;
long long cpu_aff;
- bool stridedIndex;
+ int stridedIndex;
+ int memChunk;
+ bool compMode;
};
size_t GetThreadCount();
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index e83527c55..46a04d505 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -97,6 +97,8 @@ bool minethd::init_gpus()
vGpuData[i].rawIntensity = cfg.intensity;
vGpuData[i].workSize = cfg.w_size;
vGpuData[i].stridedIndex = cfg.stridedIndex;
+ vGpuData[i].memChunk = cfg.memChunk;
+ vGpuData[i].compMode = cfg.compMode;
}
return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
@@ -139,7 +141,7 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor
if(cfg.cpu_aff >= 0)
{
#if defined(__APPLE__)
- printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory.");
+ printer::inst()->print_msg(L1, "WARNING on macOS thread affinity is only advisory.");
#endif
printer::inst()->print_msg(L1, "Starting AMD GPU thread %d, affinity: %d.", i, (int)cfg.cpu_aff);
@@ -189,9 +191,20 @@ void minethd::work_main()
uint64_t iCount = 0;
cryptonight_ctx* cpu_ctx;
cpu_ctx = cpu::minethd::minethd_alloc_ctx();
- cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, ::jconf::inst()->IsCurrencyMonero());
+ auto miner_algo = ::jconf::inst()->GetMiningAlgo();
+ cn_hash_fun hash_fun;
+ if(miner_algo == cryptonight_monero || miner_algo == cryptonight_heavy)
+ {
+ // start with cryptonight and switch later if fork version is reached
+ hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight);
+ }
+ else
+ hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
+
globalStates::inst().iConsumeCnt++;
+ uint8_t version = 0;
+
while (bQuit == 0)
{
if (oWork.bStall)
@@ -205,6 +218,16 @@ void minethd::work_main()
std::this_thread::sleep_for(std::chrono::milliseconds(100));
consume_work();
+ uint8_t new_version = oWork.getVersion();
+ if(miner_algo == cryptonight_monero && version < 7 && new_version >= 7)
+ {
+ hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_monero);
+ }
+ else if(miner_algo == cryptonight_heavy && version < 3 && new_version >= 3)
+ {
+ hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_heavy);
+ }
+ version = new_version;
continue;
}
@@ -213,7 +236,8 @@ void minethd::work_main()
assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
uint64_t target = oWork.iTarget;
- XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target);
+ /// \todo add monero hard for version
+ XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo, version);
if(oWork.bNiceHash)
pGpuCtx->Nonce = *(uint32_t*)(oWork.bWorkBlob + 39);
@@ -229,7 +253,7 @@ void minethd::work_main()
cl_uint results[0x100];
memset(results,0,sizeof(cl_uint)*(0x100));
- XMRRunJob(pGpuCtx, results);
+ XMRRunJob(pGpuCtx, results, miner_algo, version);
for(size_t i = 0; i < results[0xFF]; i++)
{
@@ -245,7 +269,7 @@ void minethd::work_main()
if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo), oWork.iPoolId));
else
- executor::inst()->push_event(ex_event("AMD Invalid Result", oWork.iPoolId));
+ executor::inst()->push_event(ex_event("AMD Invalid Result", pGpuCtx->deviceIdx, oWork.iPoolId));
}
iCount += pGpuCtx->rawIntensity;
@@ -256,6 +280,16 @@ void minethd::work_main()
}
consume_work();
+ uint8_t new_version = oWork.getVersion();
+ if(miner_algo == cryptonight_monero && version < 7 && new_version >= 7)
+ {
+ hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_monero);
+ }
+ else if(miner_algo == cryptonight_heavy && version < 3 && new_version >= 3)
+ {
+ hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_heavy);
+ }
+ version = new_version;
}
}
diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index 7bdb14e0a..db805ec56 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -33,25 +33,21 @@ class autoAdjust
{
public:
- size_t hashMemSize;
- size_t halfHashMemSize;
-
- autoAdjust()
+ bool printConfig()
{
+ size_t hashMemSizeKB;
+ size_t halfHashMemSizeKB;
+
if(::jconf::inst()->IsCurrencyMonero())
{
- hashMemSize = MONERO_MEMORY;
- halfHashMemSize = hashMemSize / 2u;
+ hashMemSizeKB = MONERO_MEMORY / 1024u;
+ halfHashMemSizeKB = hashMemSizeKB / 2u;
}
else
{
- hashMemSize = AEON_MEMORY;
- halfHashMemSize = hashMemSize / 2u;
+ hashMemSizeKB = AEON_MEMORY / 1024u;
+ halfHashMemSizeKB = hashMemSizeKB / 2u;
}
- }
-
- bool printConfig()
- {
configEditor configTpl{};
@@ -63,9 +59,10 @@ class autoAdjust
std::string conf;
- if(!detectL3Size() || L3KB_size < halfHashMemSize || L3KB_size > (halfHashMemSize * 100u))
+
+ if(!detectL3Size() || L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048u))
{
- if(L3KB_size < halfHashMemSize || L3KB_size > (halfHashMemSize * 100))
+ if(L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048))
printer::inst()->print_msg(L0, "Autoconf failed: L3 size sanity check failed - %u KB.", L3KB_size);
conf += std::string(" { \"low_power_mode\" : false, \"no_prefetch\" : true, \"affine_to_cpu\" : false },\n");
@@ -88,7 +85,7 @@ class autoAdjust
if(L3KB_size <= 0)
break;
- double_mode = L3KB_size / hashMemSize > (int32_t)(corecnt-i);
+ double_mode = L3KB_size / hashMemSizeKB > (int32_t)(corecnt-i);
conf += std::string(" { \"low_power_mode\" : ");
conf += std::string(double_mode ? "true" : "false");
@@ -107,9 +104,9 @@ class autoAdjust
aff_id++;
if(double_mode)
- L3KB_size -= hashMemSize * 2u;
+ L3KB_size -= hashMemSizeKB * 2u;
else
- L3KB_size -= hashMemSize;
+ L3KB_size -= hashMemSizeKB;
}
}
@@ -142,7 +139,7 @@ class autoAdjust
}
L3KB_size = ((get_masked(cpu_info[1], 31, 22) + 1) * (get_masked(cpu_info[1], 21, 12) + 1) *
- (get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) / halfHashMemSize;
+ (get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) / 1024;
return true;
}
diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index ddeb89b31..568abb5cd 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -28,16 +28,8 @@ class autoAdjust
autoAdjust()
{
- if(::jconf::inst()->IsCurrencyMonero())
- {
- hashMemSize = MONERO_MEMORY;
- halfHashMemSize = hashMemSize / 2u;
- }
- else
- {
- hashMemSize = AEON_MEMORY;
- halfHashMemSize = hashMemSize / 2u;
- }
+ hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo());
+ halfHashMemSize = hashMemSize / 2u;
}
bool printConfig()
diff --git a/xmrstak/backend/cpu/config.tpl b/xmrstak/backend/cpu/config.tpl
index b21a22d24..cb4b950db 100644
--- a/xmrstak/backend/cpu/config.tpl
+++ b/xmrstak/backend/cpu/config.tpl
@@ -2,7 +2,7 @@ R"===(
/*
* Thread configuration for each thread. Make sure it matches the number above.
* low_power_mode - This can either be a boolean (true or false), or a number between 1 to 5. When set to true,
- this mode will double the cache usage, and double the single thread performance. It will
+ * this mode will double the cache usage, and double the single thread performance. It will
* consume much less power (as less cores are working), but will max out at around 80-85% of
* the maximum performance. When set to a number N greater than 1, this mode will increase the
* cache usage and single thread performance by N times.
@@ -24,6 +24,9 @@ R"===(
* { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 0 },
* { "low_power_mode" : false, "no_prefetch" : true, "affine_to_cpu" : 1 },
* ],
+ * If you do not wish to mine with your CPU(s) then use:
+ * "cpu_threads_conf" :
+ * null,
*/
"cpu_threads_conf" :
diff --git a/xmrstak/backend/cpu/crypto/cryptonight.h b/xmrstak/backend/cpu/crypto/cryptonight.h
index 631c39a4a..5c9a73332 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight.h
@@ -7,8 +7,6 @@ extern "C" {
#include
#include
-#include "xmrstak/backend/cryptonight.hpp"
-
typedef struct {
uint8_t hash_state[224]; // Need only 200, explicit align
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 9b6e1dc28..85373e828 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -16,6 +16,7 @@
#pragma once
#include "cryptonight.h"
+#include "xmrstak/backend/cryptonight.hpp"
#include
#include
@@ -148,7 +149,20 @@ static inline void soft_aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i
*x7 = soft_aesenc(*x7, key);
}
-template
+inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7)
+{
+ __m128i tmp0 = x0;
+ x0 = _mm_xor_si128(x0, x1);
+ x1 = _mm_xor_si128(x1, x2);
+ x2 = _mm_xor_si128(x2, x3);
+ x3 = _mm_xor_si128(x3, x4);
+ x4 = _mm_xor_si128(x4, x5);
+ x5 = _mm_xor_si128(x5, x6);
+ x6 = _mm_xor_si128(x6, x7);
+ x7 = _mm_xor_si128(x7, tmp0);
+}
+
+template
void cn_explode_scratchpad(const __m128i* input, __m128i* output)
{
// This is more than we have registers, compiler will assign 2 keys on the stack
@@ -166,6 +180,40 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
xin6 = _mm_load_si128(input + 10);
xin7 = _mm_load_si128(input + 11);
+ if(ALGO == cryptonight_heavy)
+ {
+ for(size_t i=0; i < 16; i++)
+ {
+ if(SOFT_AES)
+ {
+ soft_aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ soft_aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ }
+ else
+ {
+ aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
+ }
+ mix_and_propagate(xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7);
+ }
+ }
+
for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
{
if(SOFT_AES)
@@ -213,7 +261,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
}
}
-template
+template
void cn_implode_scratchpad(const __m128i* input, __m128i* output)
{
// This is more than we have registers, compiler will assign 2 keys on the stack
@@ -275,6 +323,93 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
}
+
+ if(ALGO == cryptonight_heavy)
+ mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+ }
+
+ if(ALGO == cryptonight_heavy)
+ {
+ for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
+ {
+ if(PREFETCH)
+ _mm_prefetch((const char*)input + i + 0, _MM_HINT_NTA);
+
+ xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
+ xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
+ xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
+ xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
+
+ if(PREFETCH)
+ _mm_prefetch((const char*)input + i + 4, _MM_HINT_NTA);
+
+ xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
+ xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
+ xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
+ xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+
+ if(SOFT_AES)
+ {
+ soft_aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ }
+ else
+ {
+ aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ }
+
+ if(ALGO == cryptonight_heavy)
+ mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+ }
+
+ for(size_t i=0; i < 16; i++)
+ {
+ if(SOFT_AES)
+ {
+ soft_aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ soft_aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ }
+ else
+ {
+ aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
+ }
+
+ mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
+ }
}
_mm_store_si128(output + 4, xout0);
@@ -287,13 +422,45 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
_mm_store_si128(output + 11, xout7);
}
-template
+inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
+{
+ mem_out[0] = _mm_cvtsi128_si64(tmp);
+
+ tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
+ uint64_t vh = _mm_cvtsi128_si64(tmp);
+
+ uint8_t x = vh >> 24;
+ static const uint16_t table = 0x7531;
+ const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1;
+ vh ^= ((table >> index) & 0x3) << 28;
+
+ mem_out[1] = vh;
+}
+
+template
void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_ctx* ctx0)
{
+ constexpr size_t MASK = cn_select_mask();
+ constexpr size_t ITERATIONS = cn_select_iter();
+ constexpr size_t MEM = cn_select_memory();
+
+ if(ALGO == cryptonight_monero && len < 43)
+ {
+ memset(output, 0, 32);
+ return;
+ }
+
keccak((const uint8_t *)input, len, ctx0->hash_state, 200);
+ uint64_t monero_const;
+ if(ALGO == cryptonight_monero)
+ {
+ monero_const = *reinterpret_cast(reinterpret_cast(input) + 35);
+ monero_const ^= *(reinterpret_cast(ctx0->hash_state) + 24);
+ }
+
// Optim - 99% time boundary
- cn_explode_scratchpad((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
+ cn_explode_scratchpad((__m128i*)ctx0->hash_state, (__m128i*)ctx0->long_state);
uint8_t* l0 = ctx0->long_state;
uint64_t* h0 = (uint64_t*)ctx0->hash_state;
@@ -315,12 +482,16 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
else
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
- _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+ if(ALGO == cryptonight_monero)
+ cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+ else
+ _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+
idx0 = _mm_cvtsi128_si64(cx);
- bx0 = cx;
if(PREFETCH)
_mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
+ bx0 = cx;
uint64_t hi, lo, cl, ch;
cl = ((uint64_t*)&l0[idx0 & MASK])[0];
@@ -329,19 +500,33 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
lo = _umul128(idx0, cl, &hi);
al0 += hi;
- ah0 += lo;
((uint64_t*)&l0[idx0 & MASK])[0] = al0;
- ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
- ah0 ^= ch;
al0 ^= cl;
+ if(PREFETCH)
+ _mm_prefetch((const char*)&l0[al0 & MASK], _MM_HINT_T0);
+ ah0 += lo;
+
+ if(ALGO == cryptonight_monero)
+ ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ monero_const;
+ else
+ ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
+ ah0 ^= ch;
+
idx0 = al0;
- if(PREFETCH)
- _mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
+ if(ALGO == cryptonight_heavy)
+ {
+ int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
+ int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+ int64_t q = n / (d | 0x5);
+
+ ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
+ idx0 = d ^ q;
+ }
}
// Optim - 90% time boundary
- cn_implode_scratchpad((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
+ cn_implode_scratchpad((__m128i*)ctx0->long_state, (__m128i*)ctx0->hash_state);
// Optim - 99% time boundary
@@ -352,15 +537,34 @@ void cryptonight_hash(const void* input, size_t len, void* output, cryptonight_c
// This lovely creation will do 2 cn hashes at a time. We have plenty of space on silicon
// to fit temporary vars for two contexts. Function will read len*2 from input and write 64 bytes to output
// We are still limited by L3 cache, so doubling will only work with CPUs where we have more than 2MB to core (Xeons)
-template
+template
void cryptonight_double_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
{
+ constexpr size_t MASK = cn_select_mask();
+ constexpr size_t ITERATIONS = cn_select_iter();
+ constexpr size_t MEM = cn_select_memory();
+
+ if(ALGO == cryptonight_monero && len < 43)
+ {
+ memset(output, 0, 64);
+ return;
+ }
+
keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200);
keccak((const uint8_t *)input+len, len, ctx[1]->hash_state, 200);
+ uint64_t monero_const_0, monero_const_1;
+ if(ALGO == cryptonight_monero)
+ {
+ monero_const_0 = *reinterpret_cast(reinterpret_cast(input) + 35);
+ monero_const_0 ^= *(reinterpret_cast(ctx[0]->hash_state) + 24);
+ monero_const_1 = *reinterpret_cast(reinterpret_cast(input) + len + 35);
+ monero_const_1 ^= *(reinterpret_cast(ctx[1]->hash_state) + 24);
+ }
+
// Optim - 99% time boundary
- cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
- cn_explode_scratchpad((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state);
+ cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state);
+ cn_explode_scratchpad((__m128i*)ctx[1]->hash_state, (__m128i*)ctx[1]->long_state);
uint8_t* l0 = ctx[0]->long_state;
uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
@@ -388,7 +592,11 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
else
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh0, axl0));
- _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+ if(ALGO == cryptonight_monero)
+ cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+ else
+ _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+
idx0 = _mm_cvtsi128_si64(cx);
bx0 = cx;
@@ -402,7 +610,11 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
else
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(axh1, axl1));
- _mm_store_si128((__m128i *)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx));
+ if(ALGO == cryptonight_monero)
+ cryptonight_monero_tweak((uint64_t*)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx));
+ else
+ _mm_store_si128((__m128i *)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx));
+
idx1 = _mm_cvtsi128_si64(cx);
bx1 = cx;
@@ -418,11 +630,26 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
axl0 += hi;
axh0 += lo;
((uint64_t*)&l0[idx0 & MASK])[0] = axl0;
- ((uint64_t*)&l0[idx0 & MASK])[1] = axh0;
+
+ if(ALGO == cryptonight_monero)
+ ((uint64_t*)&l0[idx0 & MASK])[1] = axh0 ^ monero_const_0;
+ else
+ ((uint64_t*)&l0[idx0 & MASK])[1] = axh0;
+
axh0 ^= ch;
axl0 ^= cl;
idx0 = axl0;
+ if(ALGO == cryptonight_heavy)
+ {
+ int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
+ int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+ int64_t q = n / (d | 0x5);
+
+ ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
+ idx0 = d ^ q;
+ }
+
if(PREFETCH)
_mm_prefetch((const char*)&l0[idx0 & MASK], _MM_HINT_T0);
@@ -434,18 +661,33 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
axl1 += hi;
axh1 += lo;
((uint64_t*)&l1[idx1 & MASK])[0] = axl1;
- ((uint64_t*)&l1[idx1 & MASK])[1] = axh1;
+
+ if(ALGO == cryptonight_monero)
+ ((uint64_t*)&l1[idx1 & MASK])[1] = axh1 ^ monero_const_1;
+ else
+ ((uint64_t*)&l1[idx1 & MASK])[1] = axh1;
+
axh1 ^= ch;
axl1 ^= cl;
idx1 = axl1;
+ if(ALGO == cryptonight_heavy)
+ {
+ int64_t n = ((int64_t*)&l1[idx1 & MASK])[0];
+ int32_t d = ((int32_t*)&l1[idx1 & MASK])[2];
+ int64_t q = n / (d | 0x5);
+
+ ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
+ idx1 = d ^ q;
+ }
+
if(PREFETCH)
_mm_prefetch((const char*)&l1[idx1 & MASK], _MM_HINT_T0);
}
// Optim - 90% time boundary
- cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
- cn_implode_scratchpad((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state);
+ cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state);
+ cn_implode_scratchpad((__m128i*)ctx[1]->long_state, (__m128i*)ctx[1]->hash_state);
// Optim - 99% time boundary
@@ -456,12 +698,10 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
}
#define CN_STEP1(a, b, c, l, ptr, idx) \
- a = _mm_xor_si128(a, c); \
- idx = _mm_cvtsi128_si64(a); \
ptr = (__m128i *)&l[idx & MASK]; \
if(PREFETCH) \
_mm_prefetch((const char*)ptr, _MM_HINT_T0); \
- c = _mm_load_si128(ptr)
+ c = _mm_load_si128(ptr);
#define CN_STEP2(a, b, c, l, ptr, idx) \
if(SOFT_AES) \
@@ -469,30 +709,64 @@ void cryptonight_double_hash(const void* input, size_t len, void* output, crypto
else \
c = _mm_aesenc_si128(c, a); \
b = _mm_xor_si128(b, c); \
- _mm_store_si128(ptr, b)
+ if(ALGO == cryptonight_monero) \
+ cryptonight_monero_tweak((uint64_t*)ptr, b); \
+ else \
+ _mm_store_si128(ptr, b);\
#define CN_STEP3(a, b, c, l, ptr, idx) \
idx = _mm_cvtsi128_si64(c); \
ptr = (__m128i *)&l[idx & MASK]; \
if(PREFETCH) \
_mm_prefetch((const char*)ptr, _MM_HINT_T0); \
- b = _mm_load_si128(ptr)
+ b = _mm_load_si128(ptr);
-#define CN_STEP4(a, b, c, l, ptr, idx) \
+#define CN_STEP4(a, b, c, l, mc, ptr, idx) \
lo = _umul128(idx, _mm_cvtsi128_si64(b), &hi); \
a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi)); \
- _mm_store_si128(ptr, a)
+ if(ALGO == cryptonight_monero) \
+ _mm_store_si128(ptr, _mm_xor_si128(a, mc)); \
+ else \
+ _mm_store_si128(ptr, a);\
+ a = _mm_xor_si128(a, b); \
+ idx = _mm_cvtsi128_si64(a); \
+ if(ALGO == cryptonight_heavy) \
+ { \
+ int64_t n = ((int64_t*)&l[idx & MASK])[0]; \
+ int32_t d = ((int32_t*)&l[idx & MASK])[2]; \
+ int64_t q = n / (d | 0x5); \
+ ((int64_t*)&l[idx & MASK])[0] = n ^ q; \
+ idx = d ^ q; \
+ }
+
+#define CONST_INIT(ctx, n) \
+ __m128i mc##n = _mm_set_epi64x(*reinterpret_cast(reinterpret_cast(input) + n * len + 35) ^ \
+ *(reinterpret_cast((ctx)->hash_state) + 24), 0);
// This lovelier creation will do 3 cn hashes at a time.
-template
+template
void cryptonight_triple_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
{
+ constexpr size_t MASK = cn_select_mask();
+ constexpr size_t ITERATIONS = cn_select_iter();
+ constexpr size_t MEM = cn_select_memory();
+
+ if(ALGO == cryptonight_monero && len < 43)
+ {
+ memset(output, 0, 32 * 3);
+ return;
+ }
+
for (size_t i = 0; i < 3; i++)
{
keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
- cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
}
+ CONST_INIT(ctx[0], 0);
+ CONST_INIT(ctx[1], 1);
+ CONST_INIT(ctx[2], 2);
+
uint8_t* l0 = ctx[0]->long_state;
uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
uint8_t* l1 = ctx[1]->long_state;
@@ -510,9 +784,14 @@ void cryptonight_triple_hash(const void* input, size_t len, void* output, crypto
__m128i cx1 = _mm_set_epi64x(0, 0);
__m128i cx2 = _mm_set_epi64x(0, 0);
+ uint64_t idx0, idx1, idx2;
+ idx0 = _mm_cvtsi128_si64(ax0);
+ idx1 = _mm_cvtsi128_si64(ax1);
+ idx2 = _mm_cvtsi128_si64(ax2);
+
for (size_t i = 0; i < ITERATIONS/2; i++)
{
- uint64_t idx0, idx1, idx2, hi, lo;
+ uint64_t hi, lo;
__m128i *ptr0, *ptr1, *ptr2;
// EVEN ROUND
@@ -528,9 +807,9 @@ void cryptonight_triple_hash(const void* input, size_t len, void* output, crypto
CN_STEP3(ax1, bx1, cx1, l1, ptr1, idx1);
CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
- CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
- CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
- CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
+ CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0);
+ CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1);
+ CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2);
// ODD ROUND
CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
@@ -545,29 +824,44 @@ void cryptonight_triple_hash(const void* input, size_t len, void* output, crypto
CN_STEP3(ax1, cx1, bx1, l1, ptr1, idx1);
CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
- CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
- CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
- CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
+ CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0);
+ CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1);
+ CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2);
}
for (size_t i = 0; i < 3; i++)
{
- cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
keccakf((uint64_t*)ctx[i]->hash_state, 24);
extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
}
}
// This even lovelier creation will do 4 cn hashes at a time.
-template
+template
void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
{
+ constexpr size_t MASK = cn_select_mask();
+ constexpr size_t ITERATIONS = cn_select_iter();
+ constexpr size_t MEM = cn_select_memory();
+
+ if(ALGO == cryptonight_monero && len < 43)
+ {
+ memset(output, 0, 32 * 4);
+ return;
+ }
+
for (size_t i = 0; i < 4; i++)
{
keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
- cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
}
+ CONST_INIT(ctx[0], 0);
+ CONST_INIT(ctx[1], 1);
+ CONST_INIT(ctx[2], 2);
+ CONST_INIT(ctx[3], 3);
+
uint8_t* l0 = ctx[0]->long_state;
uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
uint8_t* l1 = ctx[1]->long_state;
@@ -589,10 +883,16 @@ void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptoni
__m128i cx1 = _mm_set_epi64x(0, 0);
__m128i cx2 = _mm_set_epi64x(0, 0);
__m128i cx3 = _mm_set_epi64x(0, 0);
-
+
+ uint64_t idx0, idx1, idx2, idx3;
+ idx0 = _mm_cvtsi128_si64(ax0);
+ idx1 = _mm_cvtsi128_si64(ax1);
+ idx2 = _mm_cvtsi128_si64(ax2);
+ idx3 = _mm_cvtsi128_si64(ax3);
+
for (size_t i = 0; i < ITERATIONS/2; i++)
{
- uint64_t idx0, idx1, idx2, idx3, hi, lo;
+ uint64_t hi, lo;
__m128i *ptr0, *ptr1, *ptr2, *ptr3;
// EVEN ROUND
@@ -611,10 +911,10 @@ void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptoni
CN_STEP3(ax2, bx2, cx2, l2, ptr2, idx2);
CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
- CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
- CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
- CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
- CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3);
+ CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0);
+ CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1);
+ CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2);
+ CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3);
// ODD ROUND
CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
@@ -632,30 +932,46 @@ void cryptonight_quad_hash(const void* input, size_t len, void* output, cryptoni
CN_STEP3(ax2, cx2, bx2, l2, ptr2, idx2);
CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
- CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
- CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
- CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
- CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3);
+ CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0);
+ CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1);
+ CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2);
+ CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3);
}
for (size_t i = 0; i < 4; i++)
{
- cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
keccakf((uint64_t*)ctx[i]->hash_state, 24);
extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
}
}
// This most lovely creation will do 5 cn hashes at a time.
-template
+template
void cryptonight_penta_hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx)
{
+ constexpr size_t MASK = cn_select_mask();
+ constexpr size_t ITERATIONS = cn_select_iter();
+ constexpr size_t MEM = cn_select_memory();
+
+ if(ALGO == cryptonight_monero && len < 43)
+ {
+ memset(output, 0, 32 * 5);
+ return;
+ }
+
for (size_t i = 0; i < 5; i++)
{
keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200);
- cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
+ cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state);
}
+ CONST_INIT(ctx[0], 0);
+ CONST_INIT(ctx[1], 1);
+ CONST_INIT(ctx[2], 2);
+ CONST_INIT(ctx[3], 3);
+ CONST_INIT(ctx[4], 4);
+
uint8_t* l0 = ctx[0]->long_state;
uint64_t* h0 = (uint64_t*)ctx[0]->hash_state;
uint8_t* l1 = ctx[1]->long_state;
@@ -683,9 +999,16 @@ void cryptonight_penta_hash(const void* input, size_t len, void* output, crypton
__m128i cx3 = _mm_set_epi64x(0, 0);
__m128i cx4 = _mm_set_epi64x(0, 0);
+ uint64_t idx0, idx1, idx2, idx3, idx4;
+ idx0 = _mm_cvtsi128_si64(ax0);
+ idx1 = _mm_cvtsi128_si64(ax1);
+ idx2 = _mm_cvtsi128_si64(ax2);
+ idx3 = _mm_cvtsi128_si64(ax3);
+ idx4 = _mm_cvtsi128_si64(ax4);
+
for (size_t i = 0; i < ITERATIONS/2; i++)
{
- uint64_t idx0, idx1, idx2, idx3, idx4, hi, lo;
+ uint64_t hi, lo;
__m128i *ptr0, *ptr1, *ptr2, *ptr3, *ptr4;
// EVEN ROUND
@@ -707,11 +1030,11 @@ void cryptonight_penta_hash(const void* input, size_t len, void* output, crypton
CN_STEP3(ax3, bx3, cx3, l3, ptr3, idx3);
CN_STEP3(ax4, bx4, cx4, l4, ptr4, idx4);
- CN_STEP4(ax0, bx0, cx0, l0, ptr0, idx0);
- CN_STEP4(ax1, bx1, cx1, l1, ptr1, idx1);
- CN_STEP4(ax2, bx2, cx2, l2, ptr2, idx2);
- CN_STEP4(ax3, bx3, cx3, l3, ptr3, idx3);
- CN_STEP4(ax4, bx4, cx4, l4, ptr4, idx4);
+ CN_STEP4(ax0, bx0, cx0, l0, mc0, ptr0, idx0);
+ CN_STEP4(ax1, bx1, cx1, l1, mc1, ptr1, idx1);
+ CN_STEP4(ax2, bx2, cx2, l2, mc2, ptr2, idx2);
+ CN_STEP4(ax3, bx3, cx3, l3, mc3, ptr3, idx3);
+ CN_STEP4(ax4, bx4, cx4, l4, mc4, ptr4, idx4);
// ODD ROUND
CN_STEP1(ax0, cx0, bx0, l0, ptr0, idx0);
@@ -732,16 +1055,16 @@ void cryptonight_penta_hash(const void* input, size_t len, void* output, crypton
CN_STEP3(ax3, cx3, bx3, l3, ptr3, idx3);
CN_STEP3(ax4, cx4, bx4, l4, ptr4, idx4);
- CN_STEP4(ax0, cx0, bx0, l0, ptr0, idx0);
- CN_STEP4(ax1, cx1, bx1, l1, ptr1, idx1);
- CN_STEP4(ax2, cx2, bx2, l2, ptr2, idx2);
- CN_STEP4(ax3, cx3, bx3, l3, ptr3, idx3);
- CN_STEP4(ax4, cx4, bx4, l4, ptr4, idx4);
+ CN_STEP4(ax0, cx0, bx0, l0, mc0, ptr0, idx0);
+ CN_STEP4(ax1, cx1, bx1, l1, mc1, ptr1, idx1);
+ CN_STEP4(ax2, cx2, bx2, l2, mc2, ptr2, idx2);
+ CN_STEP4(ax3, cx3, bx3, l3, mc3, ptr3, idx3);
+ CN_STEP4(ax4, cx4, bx4, l4, mc4, ptr4, idx4);
}
for (size_t i = 0; i < 5; i++)
{
- cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
+ cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state);
keccakf((uint64_t*)ctx[i]->hash_state, 24);
extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i);
}
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
index 8b2207ddb..17fa24b06 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
+++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
@@ -28,9 +28,10 @@ extern "C"
#include "c_jh.h"
#include "c_skein.h"
}
+#include "xmrstak/backend/cryptonight.hpp"
#include "cryptonight.h"
#include "cryptonight_aesni.h"
-#include "xmrstak/backend/cryptonight.hpp"
+#include "xmrstak/misc/console.hpp"
#include "xmrstak/jconf.hpp"
#include
#include
@@ -73,6 +74,8 @@ void do_skein_hash(const void* input, size_t len, char* output) {
void (* const extra_hashes[4])(const void *, size_t, char *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
#ifdef _WIN32
+#include "xmrstak/misc/uac.hpp"
+
BOOL bRebootDesirable = FALSE; //If VirtualAlloc fails, suggest a reboot
BOOL AddPrivilege(TCHAR* pszPrivilege)
@@ -176,13 +179,16 @@ size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg)
if(AddPrivilege(TEXT("SeLockMemoryPrivilege")) == 0)
{
+ printer::inst()->print_msg(L0, "Elevating because we need to set up fast memory privileges.");
+ RequestElevation();
+
if(AddLargePageRights())
{
msg->warning = "Added SeLockMemoryPrivilege to the current account. You need to reboot for it to work";
bRebootDesirable = TRUE;
}
else
- msg->warning = "Obtaning SeLockMemoryPrivilege failed.";
+ msg->warning = "Obtaining SeLockMemoryPrivilege failed.";
return 0;
}
@@ -196,15 +202,8 @@ size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg)
cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg)
{
- size_t hashMemSize;
- if(::jconf::inst()->IsCurrencyMonero())
- {
- hashMemSize = MONERO_MEMORY;
- }
- else
- {
- hashMemSize = AEON_MEMORY;
- }
+ size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo());
+
cryptonight_ctx* ptr = (cryptonight_ctx*)_mm_malloc(sizeof(cryptonight_ctx), 4096);
if(use_fast_mem == 0)
@@ -247,6 +246,9 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
#elif defined(__FreeBSD__)
ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0);
+#elif defined(__OpenBSD__)
+ ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON, -1, 0);
#else
ptr->long_state = (uint8_t*)mmap(0, hashMemSize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 0, 0);
@@ -276,15 +278,8 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
void cryptonight_free_ctx(cryptonight_ctx* ctx)
{
- size_t hashMemSize;
- if(::jconf::inst()->IsCurrencyMonero())
- {
- hashMemSize = MONERO_MEMORY;
- }
- else
- {
- hashMemSize = AEON_MEMORY;
- }
+ size_t hashMemSize = cn_select_memory(::jconf::inst()->GetMiningAlgo());
+
if(ctx->ctx_info[0] != 0)
{
#ifdef _WIN32
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 143b66f75..e263aca41 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -73,7 +73,16 @@ namespace cpu
bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id)
{
#if defined(_WIN32)
- return SetThreadAffinityMask(h, 1ULL << cpu_id) != 0;
+ // we can only pin up to 64 threads
+ if(cpu_id < 64)
+ {
+ return SetThreadAffinityMask(h, 1ULL << cpu_id) != 0;
+ }
+ else
+ {
+ printer::inst()->print_msg(L0, "WARNING: Windows supports only affinity up to 63.");
+ return false;
+ }
#elif defined(__APPLE__)
thread_port_t mach_thread;
thread_affinity_policy_data_t policy = { static_cast(cpu_id) };
@@ -84,6 +93,8 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id
CPU_ZERO(&mn);
CPU_SET(cpu_id, &mn);
return pthread_setaffinity_np(h, sizeof(cpuset_t), &mn) == 0;
+#elif defined(__OpenBSD__)
+ printer::inst()->print_msg(L0,"WARNING: thread pinning is not supported under OPENBSD.");
#else
cpu_set_t mn;
CPU_ZERO(&mn);
@@ -220,45 +231,44 @@ bool minethd::self_test()
bool bResult = true;
- bool mineMonero = ::jconf::inst()->IsCurrencyMonero();
- if(mineMonero)
+ if(::jconf::inst()->GetMiningAlgo() == cryptonight)
{
unsigned char out[32 * MAX_N];
cn_hash_fun hashf;
cn_hash_fun_multi hashf_multi;
- hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, mineMonero);
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
hashf("This is a test", 14, out, ctx[0]);
bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
- hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, mineMonero);
+ hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
hashf("This is a test", 14, out, ctx[0]);
bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0;
- hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
+ hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
- hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, mineMonero);
+ hashf_multi = func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight);
hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx);
bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59"
"\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0;
- hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
+ hashf_multi = func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx);
bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0;
- hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
+ hashf_multi = func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx);
bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0;
- hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, mineMonero);
+ hashf_multi = func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight);
hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx);
bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
@@ -266,6 +276,12 @@ bool minethd::self_test()
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05"
"\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0;
}
+ else if(::jconf::inst()->GetMiningAlgo() == cryptonight_lite)
+ {
+ }
+ else if(::jconf::inst()->GetMiningAlgo() == cryptonight_monero)
+ {
+ }
for (int i = 0; i < MAX_N; i++)
cryptonight_free_ctx(ctx[i]);
@@ -307,7 +323,7 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work
if(cfg.iCpuAff >= 0)
{
#if defined(__APPLE__)
- printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory.");
+ printer::inst()->print_msg(L1, "WARNING on macOS thread affinity is only advisory.");
#endif
printer::inst()->print_msg(L1, "Starting %dx thread, affinity: %d.", cfg.iMultiway, (int)cfg.iCpuAff);
@@ -329,48 +345,56 @@ void minethd::consume_work()
globalStates::inst().inst().iConsumeCnt++;
}
-minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, bool mineMonero)
+minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
{
// We have two independent flag bits in the functions
// therefore we will build a binary digit and select the
// function as a two digit binary
- // Digit order SOFT_AES, NO_PREFETCH, MINER_ALGO
+
+ uint8_t algv;
+ switch(algo)
+ {
+ case cryptonight:
+ algv = 2;
+ break;
+ case cryptonight_lite:
+ algv = 1;
+ break;
+ case cryptonight_monero:
+ algv = 0;
+ break;
+ case cryptonight_heavy:
+ algv = 3;
+ break;
+ default:
+ algv = 2;
+ break;
+ }
static const cn_hash_fun func_table[] = {
- /* there will be 8 function entries if `CONF_NO_MONERO` and `CONF_NO_AEON`
- * is not defined. If one is defined there will be 4 entries.
- */
-#ifndef CONF_NO_MONERO
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash
-#endif
-#if (!defined(CONF_NO_AEON)) && (!defined(CONF_NO_MONERO))
- // comma will be added only if Monero and Aeon is build
- ,
-#endif
-#ifndef CONF_NO_AEON
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash,
- cryptonight_hash
-#endif
+ cryptonight_hash,
+ cryptonight_hash,
+ cryptonight_hash,
+ cryptonight_hash,
+ cryptonight_hash,
+ cryptonight_hash,
+ cryptonight_hash,
+ cryptonight_hash,
+ cryptonight_hash,
+ cryptonight_hash,
+ cryptonight_hash,
+ cryptonight_hash,
+ cryptonight_hash,
+ cryptonight_hash,
+ cryptonight_hash,
+ cryptonight_hash
};
- std::bitset<3> digit;
- digit.set(0, !bNoPrefetch);
- digit.set(1, !bHaveAes);
-
- // define aeon settings
-#if defined(CONF_NO_AEON) || defined(CONF_NO_MONERO)
- // ignore 3rd bit if only one currency is active
- digit.set(2, 0);
-#else
- digit.set(2, !mineMonero);
-#endif
+ std::bitset<2> digit;
+ digit.set(0, !bHaveAes);
+ digit.set(1, !bNoPrefetch);
- return func_table[digit.to_ulong()];
+ return func_table[ algv << 2 | digit.to_ulong() ];
}
void minethd::work_main()
@@ -390,7 +414,7 @@ void minethd::work_main()
uint32_t* piNonce;
job_result result;
- hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero());
+ hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->GetMiningAlgo());
ctx = minethd_alloc_ctx();
piHashVal = (uint64_t*)(result.bResult + 24);
@@ -423,6 +447,22 @@ void minethd::work_main()
if(oWork.bNiceHash)
result.iNonce = *piNonce;
+ if(::jconf::inst()->GetMiningAlgo() == cryptonight_monero)
+ {
+ if(oWork.bWorkBlob[0] >= 7)
+ hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, cryptonight_monero);
+ else
+ hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, cryptonight);
+ }
+
+ if(::jconf::inst()->GetMiningAlgo() == cryptonight_heavy)
+ {
+ if(oWork.bWorkBlob[0] >= 3)
+ hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, cryptonight_heavy);
+ else
+ hash_fun = func_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, cryptonight);
+ }
+
while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
{
if ((iCount++ & 0xF) == 0) //Store stats every 16 hashes
@@ -437,12 +477,13 @@ void minethd::work_main()
globalStates::inst().calc_start_nonce(result.iNonce, oWork.bNiceHash, nonce_chunk);
}
- *piNonce = ++result.iNonce;
+ *piNonce = result.iNonce;
hash_fun(oWork.bWorkBlob, oWork.iWorkSize, result.bResult, ctx);
if (*piHashVal < oWork.iTarget)
executor::inst()->push_event(ex_event(result, oWork.iPoolId));
+ result.iNonce++;
std::this_thread::yield();
}
@@ -453,93 +494,105 @@ void minethd::work_main()
cryptonight_free_ctx(ctx);
}
-minethd::cn_hash_fun_multi minethd::func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, bool mineMonero)
+minethd::cn_hash_fun_multi minethd::func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo)
{
// We have two independent flag bits in the functions
// therefore we will build a binary digit and select the
// function as a two digit binary
- // Digit order SOFT_AES, NO_PREFETCH
+
+ uint8_t algv;
+ switch(algo)
+ {
+ case cryptonight:
+ algv = 2;
+ break;
+ case cryptonight_lite:
+ algv = 1;
+ break;
+ case cryptonight_monero:
+ algv = 0;
+ break;
+ default:
+ algv = 2;
+ break;
+ }
static const cn_hash_fun_multi func_table[] = {
- /* there will be 8*(MAX_N-1) function entries if `CONF_NO_MONERO` and `CONF_NO_AEON`
- * is not defined. If one is defined there will be 4*(MAX_N-1) entries.
- */
-#ifndef CONF_NO_MONERO
- cryptonight_double_hash,
- cryptonight_double_hash,
- cryptonight_double_hash,
- cryptonight_double_hash,
- cryptonight_triple_hash,
- cryptonight_triple_hash,
- cryptonight_triple_hash,
- cryptonight_triple_hash,
- cryptonight_quad_hash,
- cryptonight_quad_hash,
- cryptonight_quad_hash,
- cryptonight_quad_hash,
- cryptonight_penta_hash,
- cryptonight_penta_hash,
- cryptonight_penta_hash,
- cryptonight_penta_hash
-#endif
-#if (!defined(CONF_NO_AEON)) && (!defined(CONF_NO_MONERO))
- // comma will be added only if Monero and Aeon is build
- ,
-#endif
-#ifndef CONF_NO_AEON
- cryptonight_double_hash,
- cryptonight_double_hash,
- cryptonight_double_hash,
- cryptonight_double_hash,
- cryptonight_triple_hash,
- cryptonight_triple_hash,
- cryptonight_triple_hash,
- cryptonight_triple_hash,
- cryptonight_quad_hash,
- cryptonight_quad_hash,
- cryptonight_quad_hash,
- cryptonight_quad_hash,
- cryptonight_penta_hash,
- cryptonight_penta_hash,
- cryptonight_penta_hash,
- cryptonight_penta_hash
-#endif
+ cryptonight_double_hash,
+ cryptonight_double_hash,
+ cryptonight_double_hash,
+ cryptonight_double_hash,
+ cryptonight_triple_hash,
+ cryptonight_triple_hash,
+ cryptonight_triple_hash,
+ cryptonight_triple_hash,
+ cryptonight_quad_hash,
+ cryptonight_quad_hash,
+ cryptonight_quad_hash,
+ cryptonight_quad_hash,
+ cryptonight_penta_hash,
+ cryptonight_penta_hash,
+ cryptonight_penta_hash,
+ cryptonight_penta_hash,
+ cryptonight_double_hash,
+ cryptonight_double_hash,
+ cryptonight_double_hash,
+ cryptonight_double_hash,
+ cryptonight_triple_hash,
+ cryptonight_triple_hash,
+ cryptonight_triple_hash,
+ cryptonight_triple_hash,
+ cryptonight_quad_hash,
+ cryptonight_quad_hash,
+ cryptonight_quad_hash,
+ cryptonight_quad_hash,
+ cryptonight_penta_hash,
+ cryptonight_penta_hash,
+ cryptonight_penta_hash,
+ cryptonight_penta_hash,
+ cryptonight_double_hash,
+ cryptonight_double_hash,
+ cryptonight_double_hash,
+ cryptonight_double_hash,
+ cryptonight_triple_hash,
+ cryptonight_triple_hash,
+ cryptonight_triple_hash,
+ cryptonight_triple_hash,
+ cryptonight_quad_hash,
+ cryptonight_quad_hash,
+ cryptonight_quad_hash,
+ cryptonight_quad_hash,
+ cryptonight_penta_hash,
+ cryptonight_penta_hash,
+ cryptonight_penta_hash,
+ cryptonight_penta_hash
};
std::bitset<2> digit;
- digit.set(0, !bNoPrefetch);
- digit.set(1, !bHaveAes);
-
- // define aeon settings
-#if defined(CONF_NO_AEON) || defined(CONF_NO_MONERO)
- // ignore miner algo if only one currency is active
- size_t miner_algo_base = 0;
-#else
- size_t miner_algo_base = mineMonero ? 0 : 4*(MAX_N-1);
-#endif
-
- N = (N<2) ? 2 : (N>MAX_N) ? MAX_N : N;
- return func_table[miner_algo_base + 4*(N-2) + digit.to_ulong()];
+ digit.set(0, !bHaveAes);
+ digit.set(1, !bNoPrefetch);
+
+ return func_table[algv << 4 | (N-2) << 2 | digit.to_ulong()];
}
void minethd::double_work_main()
{
- multiway_work_main<2>(func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero()));
+ multiway_work_main<2>(func_multi_selector(2, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->GetMiningAlgo()));
}
void minethd::triple_work_main()
{
- multiway_work_main<3>(func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero()));
+ multiway_work_main<3>(func_multi_selector(3, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->GetMiningAlgo()));
}
void minethd::quad_work_main()
{
- multiway_work_main<4>(func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero()));
+ multiway_work_main<4>(func_multi_selector(4, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->GetMiningAlgo()));
}
void minethd::penta_work_main()
{
- multiway_work_main<5>(func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->IsCurrencyMonero()));
+ multiway_work_main<5>(func_multi_selector(5, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, ::jconf::inst()->GetMiningAlgo()));
}
template
@@ -609,6 +662,22 @@ void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi)
if(oWork.bNiceHash)
iNonce = *piNonce[0];
+ if(::jconf::inst()->GetMiningAlgo() == cryptonight_monero)
+ {
+ if(oWork.bWorkBlob[0] >= 7)
+ hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, cryptonight_monero);
+ else
+ hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, cryptonight);
+ }
+
+ if(::jconf::inst()->GetMiningAlgo() == cryptonight_heavy)
+ {
+ if(oWork.bWorkBlob[0] >= 3)
+ hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, cryptonight_heavy);
+ else
+ hash_fun_multi = func_multi_selector(N, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, cryptonight);
+ }
+
while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo)
{
if ((iCount++ & 0x7) == 0) //Store stats every 8*N hashes
@@ -626,7 +695,7 @@ void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi)
}
for (size_t i = 0; i < N; i++)
- *piNonce[i] = ++iNonce;
+ *piNonce[i] = iNonce++;
hash_fun_multi(bWorkBlob, oWork.iWorkSize, bHashOut, ctx);
@@ -634,7 +703,7 @@ void minethd::multiway_work_main(cn_hash_fun_multi hash_fun_multi)
{
if (*piHashVal[i] < oWork.iTarget)
{
- executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce - N + 1 + i, bHashOut + 32 * i, iThreadNo), oWork.iPoolId));
+ executor::inst()->push_event(ex_event(job_result(oWork.sJobID, iNonce - N + i, bHashOut + 32 * i, iThreadNo), oWork.iPoolId));
}
}
diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp
index 0433d0d36..ef1bbd2d1 100644
--- a/xmrstak/backend/cpu/minethd.hpp
+++ b/xmrstak/backend/cpu/minethd.hpp
@@ -1,5 +1,6 @@
#pragma once
+#include "xmrstak/jconf.hpp"
#include "crypto/cryptonight.h"
#include "xmrstak/backend/miner_work.hpp"
#include "xmrstak/backend/iBackend.hpp"
@@ -23,14 +24,14 @@ class minethd : public iBackend
typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*);
- static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch, bool mineMonero);
+ static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo);
static bool thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id);
static cryptonight_ctx* minethd_alloc_ctx();
private:
typedef void (*cn_hash_fun_multi)(const void*, size_t, void*, cryptonight_ctx**);
- static cn_hash_fun_multi func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, bool mineMonero);
+ static cn_hash_fun_multi func_multi_selector(size_t N, bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo);
minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity);
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index 0ef5ae70d..fe10a9f92 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -1,12 +1,123 @@
#pragma once
+#include
+#include
+#include
+
+enum xmrstak_algo
+{
+ invalid_algo = 0,
+ cryptonight = 1,
+ cryptonight_lite = 2,
+ cryptonight_monero = 3,
+ cryptonight_heavy = 4
+};
// define aeon settings
-#define AEON_MEMORY 1048576llu
-#define AEON_MASK 0xFFFF0
-#define AEON_ITER 0x40000
+constexpr size_t CRYPTONIGHT_LITE_MEMORY = 1 * 1024 * 1024;
+constexpr uint32_t CRYPTONIGHT_LITE_MASK = 0xFFFF0;
+constexpr uint32_t CRYPTONIGHT_LITE_ITER = 0x40000;
+
+constexpr size_t CRYPTONIGHT_MEMORY = 2 * 1024 * 1024;
+constexpr uint32_t CRYPTONIGHT_MASK = 0x1FFFF0;
+constexpr uint32_t CRYPTONIGHT_ITER = 0x80000;
+
+constexpr size_t CRYPTONIGHT_HEAVY_MEMORY = 4 * 1024 * 1024;
+constexpr uint32_t CRYPTONIGHT_HEAVY_MASK = 0x3FFFF0;
+constexpr uint32_t CRYPTONIGHT_HEAVY_ITER = 0x40000;
+
+template
+inline constexpr size_t cn_select_memory() { return 0; }
+
+template<>
+inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; }
+
+template<>
+inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_LITE_MEMORY; }
+
+template<>
+inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; }
+
+template<>
+inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_HEAVY_MEMORY; }
+
+
+inline size_t cn_select_memory(xmrstak_algo algo)
+{
+ switch(algo)
+ {
+ case cryptonight:
+ return CRYPTONIGHT_MEMORY;
+ case cryptonight_lite:
+ return CRYPTONIGHT_LITE_MEMORY;
+ case cryptonight_monero:
+ return CRYPTONIGHT_MEMORY;
+ case cryptonight_heavy:
+ return CRYPTONIGHT_HEAVY_MEMORY;
+ default:
+ return 0;
+ }
+}
+
+template
+inline constexpr uint32_t cn_select_mask() { return 0; }
+
+template<>
+inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; }
+
+template<>
+inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_LITE_MASK; }
+
+template<>
+inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; }
+
+template<>
+inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_HEAVY_MASK; }
+
+inline size_t cn_select_mask(xmrstak_algo algo)
+{
+ switch(algo)
+ {
+ case cryptonight:
+ return CRYPTONIGHT_MASK;
+ case cryptonight_lite:
+ return CRYPTONIGHT_LITE_MASK;
+ case cryptonight_monero:
+ return CRYPTONIGHT_MASK;
+ case cryptonight_heavy:
+ return CRYPTONIGHT_HEAVY_MASK;
+ default:
+ return 0;
+ }
+}
+
+template
+inline constexpr uint32_t cn_select_iter() { return 0; }
+
+template<>
+inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; }
+
+template<>
+inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_LITE_ITER; }
+
+template<>
+inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; }
-// define xmr settings
-#define MONERO_MEMORY 2097152llu
-#define MONERO_MASK 0x1FFFF0
-#define MONERO_ITER 0x80000
+template<>
+inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HEAVY_ITER; }
+inline size_t cn_select_iter(xmrstak_algo algo)
+{
+ switch(algo)
+ {
+ case cryptonight:
+ return CRYPTONIGHT_ITER;
+ case cryptonight_lite:
+ return CRYPTONIGHT_LITE_ITER;
+ case cryptonight_monero:
+ return CRYPTONIGHT_ITER;
+ case cryptonight_heavy:
+ return CRYPTONIGHT_HEAVY_ITER;
+ default:
+ return 0;
+ }
+}
diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp
index 4bfe429f9..9e5a4e464 100644
--- a/xmrstak/backend/miner_work.hpp
+++ b/xmrstak/backend/miner_work.hpp
@@ -74,5 +74,11 @@ namespace xmrstak
return *this;
}
+
+ uint8_t getVersion() const
+ {
+ return bWorkBlob[0];
+ }
+
};
} // namepsace xmrstak
diff --git a/xmrstak/backend/nvidia/config.tpl b/xmrstak/backend/nvidia/config.tpl
index 54791729e..f489956af 100644
--- a/xmrstak/backend/nvidia/config.tpl
+++ b/xmrstak/backend/nvidia/config.tpl
@@ -26,6 +26,9 @@ R"===(
* "affine_to_cpu" : false, "sync_mode" : 3,
* },
* ],
+ * If you do not wish to mine with your nVidia GPU(s) then use:
+ * "gpu_threads_conf" :
+ * null,
*/
"gpu_threads_conf" :
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index 556459639..153e4e33e 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -80,14 +80,22 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
ctx.syncMode = cfg.syncMode;
this->affinity = cfg.cpu_aff;
- std::unique_lock lck(thd_aff_set);
- std::future order_guard = order_fix.get_future();
+ std::future numa_guard = numa_promise.get_future();
+ thread_work_guard = thread_work_promise.get_future();
oWorkThd = std::thread(&minethd::work_main, this);
- order_guard.wait();
+ /* Wait until the gpu memory is initialized and numa cpu memory is pinned.
+ * The startup time is reduced if the memory is initialized in sequential order
+ * without concurrent threads (CUDA driver is less occupied).
+ */
+ numa_guard.wait();
+}
- if(affinity >= 0) //-1 means no affinity
+void minethd::start_mining()
+{
+ thread_work_promise.set_value();
+ if(this->affinity >= 0) //-1 means no affinity
if(!cpu::minethd::thd_setaffinity(oWorkThd.native_handle(), affinity))
printer::inst()->print_msg(L1, "WARNING setting affinity failed.");
}
@@ -166,7 +174,7 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor
if(cfg.cpu_aff >= 0)
{
#if defined(__APPLE__)
- printer::inst()->print_msg(L1, "WARNING on MacOS thread affinity is only advisory.");
+ printer::inst()->print_msg(L1, "WARNING on macOS thread affinity is only advisory.");
#endif
printer::inst()->print_msg(L1, "Starting NVIDIA GPU thread %d, affinity: %d.", i, (int)cfg.cpu_aff);
@@ -179,6 +187,11 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor
}
+ for (i = 0; i < n; i++)
+ {
+ static_cast((*pvThreads)[i])->start_mining();
+ }
+
return pvThreads;
}
@@ -208,26 +221,36 @@ void minethd::work_main()
if(affinity >= 0) //-1 means no affinity
bindMemoryToNUMANode(affinity);
- order_fix.set_value();
- std::unique_lock lck(thd_aff_set);
- lck.release();
+ if(cuda_get_deviceinfo(&ctx) != 0 || cryptonight_extra_cpu_init(&ctx) != 1)
+ {
+ printer::inst()->print_msg(L0, "Setup failed for GPU %d. Exitting.\n", (int)iThreadNo);
+ std::exit(0);
+ }
+
+ // numa memory bind and gpu memory is initialized
+ numa_promise.set_value();
+
std::this_thread::yield();
+ // wait until all NVIDIA devices are initialized
+ thread_work_guard.wait();
uint64_t iCount = 0;
cryptonight_ctx* cpu_ctx;
cpu_ctx = cpu::minethd::minethd_alloc_ctx();
- cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, ::jconf::inst()->IsCurrencyMonero());
+ auto miner_algo = ::jconf::inst()->GetMiningAlgo();
+ cn_hash_fun hash_fun;
+ if(miner_algo == cryptonight_monero || miner_algo == cryptonight_heavy)
+ {
+ // start with cryptonight and switch later if fork version is reached
+ hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight);
+ }
+ else
+ hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
uint32_t iNonce;
globalStates::inst().iConsumeCnt++;
- if(cuda_get_deviceinfo(&ctx) != 0 || cryptonight_extra_cpu_init(&ctx) != 1)
- {
- printer::inst()->print_msg(L0, "Setup failed for GPU %d. Exitting.\n", (int)iThreadNo);
- std::exit(0);
- }
-
- bool mineMonero = strcmp_i(::jconf::inst()->GetCurrency(), "monero");
+ uint8_t version = 0;
while (bQuit == 0)
{
@@ -242,6 +265,16 @@ void minethd::work_main()
std::this_thread::sleep_for(std::chrono::milliseconds(100));
consume_work();
+ uint8_t new_version = oWork.getVersion();
+ if(miner_algo == cryptonight_monero && version < 7 && new_version >= 7)
+ {
+ hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_monero);
+ }
+ else if(miner_algo == cryptonight_heavy && version < 3 && new_version >= 3)
+ {
+ hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_heavy);
+ }
+ version = new_version;
continue;
}
@@ -266,11 +299,11 @@ void minethd::work_main()
uint32_t foundNonce[10];
uint32_t foundCount;
- cryptonight_extra_cpu_prepare(&ctx, iNonce);
+ cryptonight_extra_cpu_prepare(&ctx, iNonce, miner_algo, version);
- cryptonight_core_cpu_hash(&ctx, mineMonero);
+ cryptonight_core_cpu_hash(&ctx, miner_algo, iNonce, version);
- cryptonight_extra_cpu_final(&ctx, iNonce, oWork.iTarget, &foundCount, foundNonce);
+ cryptonight_extra_cpu_final(&ctx, iNonce, oWork.iTarget, &foundCount, foundNonce, miner_algo, version);
for(size_t i = 0; i < foundCount; i++)
{
@@ -287,7 +320,7 @@ void minethd::work_main()
if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo), oWork.iPoolId));
else
- executor::inst()->push_event(ex_event("NVIDIA Invalid Result", oWork.iPoolId));
+ executor::inst()->push_event(ex_event("NVIDIA Invalid Result", ctx.device_id, oWork.iPoolId));
}
iCount += h_per_round;
@@ -301,6 +334,16 @@ void minethd::work_main()
}
consume_work();
+ uint8_t new_version = oWork.getVersion();
+ if(miner_algo == cryptonight_monero && version < 7 && new_version >= 7)
+ {
+ hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_monero);
+ }
+ else if(miner_algo == cryptonight_heavy && version < 3 && new_version >= 3)
+ {
+ hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, cryptonight_heavy);
+ }
+ version = new_version;
}
}
diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp
index d13c8689c..fcd24fa89 100644
--- a/xmrstak/backend/nvidia/minethd.hpp
+++ b/xmrstak/backend/nvidia/minethd.hpp
@@ -32,7 +32,8 @@ class minethd : public iBackend
typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx*);
minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg);
-
+ void start_mining();
+
void work_main();
void consume_work();
@@ -44,8 +45,11 @@ class minethd : public iBackend
static miner_work oGlobalWork;
miner_work oWork;
- std::promise order_fix;
- std::mutex thd_aff_set;
+ std::promise numa_promise;
+ std::promise thread_work_promise;
+
+ // block thread until all NVIDIA GPUs are initialized
+ std::future thread_work_guard;
std::thread oWorkThd;
int64_t affinity;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
index afbdbaf88..29a35236e 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp
@@ -3,6 +3,9 @@
#include
#include
+#include "xmrstak/jconf.hpp"
+#include "xmrstak/backend/cryptonight.hpp"
+
typedef struct {
int device_id;
const char *device_name;
@@ -20,6 +23,7 @@ typedef struct {
uint32_t *d_result_nonce;
uint32_t *d_long_state;
uint32_t *d_ctx_state;
+ uint32_t *d_ctx_state2;
uint32_t *d_ctx_a;
uint32_t *d_ctx_b;
uint32_t *d_ctx_key1;
@@ -41,8 +45,8 @@ int cuda_get_devicecount( int* deviceCount);
int cuda_get_deviceinfo(nvid_ctx *ctx);
int cryptonight_extra_cpu_init(nvid_ctx *ctx);
void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len);
-void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce);
-void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce);
+void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, xmrstak_algo miner_algo, uint8_t version);
+void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce,xmrstak_algo miner_algo, uint8_t version);
}
-void cryptonight_core_cpu_hash(nvid_ctx* ctx, bool mineMonero);
+void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce, uint8_t version);
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 15a6f36a7..ede578f04 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -6,6 +6,8 @@
#include
#include
+#include "xmrstak/jconf.hpp"
+
#ifdef _WIN32
#include
extern "C" void compat_usleep(uint64_t waitTime)
@@ -74,28 +76,50 @@ __device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t m
template< typename T >
__device__ __forceinline__ T loadGlobal64( T * const addr )
{
+#if (__CUDA_ARCH__ < 700)
T x;
asm volatile( "ld.global.cg.u64 %0, [%1];" : "=l"( x ) : "l"( addr ) );
return x;
+#else
+ return *addr;
+#endif
}
template< typename T >
__device__ __forceinline__ T loadGlobal32( T * const addr )
{
+#if (__CUDA_ARCH__ < 700)
T x;
asm volatile( "ld.global.cg.u32 %0, [%1];" : "=r"( x ) : "l"( addr ) );
return x;
+#else
+ return *addr;
+#endif
}
template< typename T >
__device__ __forceinline__ void storeGlobal32( T* addr, T const & val )
{
+#if (__CUDA_ARCH__ < 700)
asm volatile( "st.global.cg.u32 [%0], %1;" : : "l"( addr ), "r"( val ) );
+#else
+ *addr = val;
+#endif
+}
+
+template< typename T >
+__device__ __forceinline__ void storeGlobal64( T* addr, T const & val )
+{
+#if (__CUDA_ARCH__ < 700)
+ asm volatile( "st.global.cg.u64 [%0], %1;" : : "l"( addr ), "l"( val ) );
+#else
+ *addr = val;
+#endif
}
-template
-__global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1 )
+template
+__global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 )
{
__shared__ uint32_t sharedMemory[1024];
@@ -105,7 +129,7 @@ __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int parti
const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3;
const int sub = ( threadIdx.x & 7 ) << 2;
- const int batchsize = ITERATIONS >> bfactor;
+ const int batchsize = MEMORY >> bfactor;
const int start = partidx * batchsize;
const int end = start + batchsize;
@@ -119,18 +143,18 @@ __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int parti
if( partidx == 0 )
{
// first round
- MEMCPY8( text, ctx_state + thread * 50 + sub + 16, 2 );
+ MEMCPY8( text, ctx_state2 + thread * 50 + sub + 16, 2 );
}
else
{
// load previous text data
- MEMCPY8( text, &long_state[( (uint64_t) thread << THREAD_SHIFT ) + sub + start - 32], 2 );
+ MEMCPY8( text, &long_state[( (uint64_t) thread * MEMORY ) + sub + start - 32], 2 );
}
__syncthreads( );
for ( int i = start; i < end; i += 32 )
{
cn_aes_pseudo_round_mut( sharedMemory, text, key );
- MEMCPY8(&long_state[((uint64_t) thread << THREAD_SHIFT) + (sub + i)], text, 2);
+ MEMCPY8(&long_state[((uint64_t) thread * MEMORY) + (sub + i)], text, 2);
}
}
@@ -145,33 +169,37 @@ __forceinline__ __device__ void unusedVar( const T& )
* - this method can be used with all compute architectures
* - for =sm_30
- * @param sub thread number within the group, range [0;4)
+ * @param sub thread number within the group, range [0:group_n]
* @param value value to share with other threads within the group
- * @param src thread number within the group from where the data is read, range [0;4)
+ * @param src thread number within the group from where the data is read, range [0:group_n]
*/
+template
__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src)
{
#if( __CUDA_ARCH__ < 300 )
ptr[sub] = val;
- return ptr[src&3];
+ return ptr[src & (group_n-1)];
#else
unusedVar( ptr );
unusedVar( sub );
# if(__CUDACC_VER_MAJOR__ >= 9)
- return __shfl_sync(0xFFFFFFFF, val, src, 4 );
+ return __shfl_sync(0xFFFFFFFF, val, src, group_n );
# else
- return __shfl( val, src, 4 );
+ return __shfl( val, src, group_n );
# endif
#endif
}
-template
+template
#ifdef XMR_STAK_THREADS
__launch_bounds__( XMR_STAK_THREADS * 4 )
#endif
-__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b )
+__global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state,
+ uint32_t startNonce, uint32_t * __restrict__ d_input )
{
__shared__ uint32_t sharedMemory[1024];
@@ -180,6 +208,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
__syncthreads( );
const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2;
+ const uint32_t nonce = startNonce + thread;
const int sub = threadIdx.x & 3;
const int sub2 = sub & 2;
@@ -193,30 +222,48 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
return;
int i, k;
- uint32_t j;
+ uint32_t j;
const int batchsize = (ITERATIONS * 2) >> ( 2 + bfactor );
const int start = partidx * batchsize;
const int end = start + batchsize;
- uint32_t * long_state = &d_long_state[(IndexType) thread << THREAD_SHIFT];
- uint32_t * ctx_a = d_ctx_a + thread * 4;
- uint32_t * ctx_b = d_ctx_b + thread * 4;
- uint32_t a, d[2];
+ uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY];
+ uint32_t a, d[2], idx0;
uint32_t t1[2], t2[2], res;
- a = ctx_a[sub];
- d[1] = ctx_b[sub];
+ uint32_t tweak1_2[2];
+ if (ALGO == cryptonight_monero)
+ {
+ uint32_t * state = d_ctx_state + thread * 50;
+ tweak1_2[0] = (d_input[8] >> 24) | (d_input[9] << 8);
+ tweak1_2[0] ^= state[48];
+ tweak1_2[1] = nonce;
+ tweak1_2[1] ^= state[49];
+ }
+
+ a = (d_ctx_a + thread * 4)[sub];
+ idx0 = shuffle<4>(sPtr,sub, a, 0);
+ if(ALGO == cryptonight_heavy)
+ {
+ if(partidx != 0)
+ {
+ // state is stored after all ctx_b states
+ idx0 = *(d_ctx_b + threads * 4 + thread);
+ }
+ }
+ d[1] = (d_ctx_b + thread * 4)[sub];
+
#pragma unroll 2
for ( i = start; i < end; ++i )
{
#pragma unroll 2
for ( int x = 0; x < 2; ++x )
{
- j = ( ( shuffle(sPtr,sub, a, 0) & MASK ) >> 2 ) + sub;
+ j = ( ( idx0 & MASK ) >> 2 ) + sub;
const uint32_t x_0 = loadGlobal32( long_state + j );
- const uint32_t x_1 = shuffle(sPtr,sub, x_0, sub + 1);
- const uint32_t x_2 = shuffle(sPtr,sub, x_0, sub + 2);
- const uint32_t x_3 = shuffle(sPtr,sub, x_0, sub + 3);
+ const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1);
+ const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2);
+ const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3);
d[x] = a ^
t_fn0( x_0 & 0xff ) ^
t_fn1( (x_1 >> 8) & 0xff ) ^
@@ -225,41 +272,74 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti
//XOR_BLOCKS_DST(c, b, &long_state[j]);
- t1[0] = shuffle(sPtr,sub, d[x], 0);
- //long_state[j] = d[0] ^ d[1];
- storeGlobal32( long_state + j, d[0] ^ d[1] );
-
+ t1[0] = shuffle<4>(sPtr,sub, d[x], 0);
+
+ const uint32_t z = d[0] ^ d[1];
+ if(ALGO == cryptonight_monero)
+ {
+ const uint32_t table = 0x75310U;
+ const uint32_t index = ((z >> 26) & 12) | ((z >> 23) & 2);
+ const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24;
+ storeGlobal32( long_state + j, sub == 2 ? fork_7 : z );
+ }
+ else
+ storeGlobal32( long_state + j, z );
+
//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & MASK]);
j = ( ( *t1 & MASK ) >> 2 ) + sub;
uint32_t yy[2];
*( (uint64_t*) yy ) = loadGlobal64