From e859d8427420f5602b2a65d93d721fef0f75606b Mon Sep 17 00:00:00 2001 From: Robin Salen <30937548+Nashtare@users.noreply.github.com> Date: Thu, 10 Oct 2024 07:58:44 -0400 Subject: [PATCH] perf: reduce `MemBefore` initial size (#684) * Reduce KERNEL size * More * Reduce address overhead in syscalls * Remove gas charge todo * Tweak range * Apply suggestions * Ranges * More ranges * Review --- .../src/cpu/kernel/asm/cdk_pre_execution.asm | 17 +++-- .../src/cpu/kernel/asm/core/gas.asm | 22 +----- .../kernel/asm/core/precompiles/blake2_f.asm | 60 +++++++--------- .../kernel/asm/core/precompiles/bn_add.asm | 51 +++++++------- .../kernel/asm/core/precompiles/bn_mul.asm | 34 +++++----- .../cpu/kernel/asm/core/precompiles/ecrec.asm | 45 ++++++------ .../kernel/asm/core/precompiles/expmod.asm | 6 +- .../src/cpu/kernel/asm/core/terminate.asm | 2 +- .../asm/curve/bn254/curve_arithmetic/glv.asm | 2 +- .../kernel/asm/curve/secp256k1/curve_add.asm | 10 +-- .../cpu/kernel/asm/hash/blake2/blake2_f.asm | 28 ++++---- .../src/cpu/kernel/asm/main.asm | 4 +- .../src/cpu/kernel/asm/memory/syscalls.asm | 68 +++++++++---------- .../src/cpu/kernel/asm/mpt/hex_prefix.asm | 2 +- .../src/cpu/kernel/asm/signed.asm | 9 +-- .../src/cpu/kernel/asm/util/basic_macros.asm | 2 +- .../src/fixed_recursive_verifier.rs | 2 +- scripts/prove_stdio.sh | 11 ++- 18 files changed, 174 insertions(+), 201 deletions(-) diff --git a/evm_arithmetization/src/cpu/kernel/asm/cdk_pre_execution.asm b/evm_arithmetization/src/cpu/kernel/asm/cdk_pre_execution.asm index fa8828097..bc1145d63 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/cdk_pre_execution.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/cdk_pre_execution.asm @@ -58,19 +58,18 @@ global update_scalable_l1blockhash: PROVER_INPUT(ger) // stack: l1blockhash?, retdest DUP1 %eq_const(@U256_MAX) %jumpi(skip_and_exit) - // stack: l1blockhash, retdest + PUSH @SEGMENT_KERNEL_GENERAL + // stack: addr, l1blockhash, retdest PUSH @GLOBAL_EXIT_ROOT_STORAGE_POS 
PROVER_INPUT(ger) - // stack: root, GLOBAL_EXIT_ROOT_STORAGE_POS, l1blockhash, retdest - PUSH @SEGMENT_KERNEL_GENERAL - // stack: addr, root, GLOBAL_EXIT_ROOT_STORAGE_POS, l1blockhash, retdest + // stack: root, GLOBAL_EXIT_ROOT_STORAGE_POS, addr, l1blockhash, retdest + DUP3 + // stack: addr, root, GLOBAL_EXIT_ROOT_STORAGE_POS, addr, l1blockhash, retdest MSTORE_32BYTES_32 - // stack: addr, GLOBAL_EXIT_ROOT_STORAGE_POS, l1blockhash, retdest + // stack: addr', GLOBAL_EXIT_ROOT_STORAGE_POS, addr, l1blockhash, retdest MSTORE_32BYTES_32 - // stack: addr, l1blockhash, retdest - POP - // stack: l1blockhash, retdest - PUSH 64 PUSH @SEGMENT_KERNEL_GENERAL + // stack: addr'', addr, l1blockhash, retdest + %stack (addr_2, addr) -> (addr, 64) // stack: addr, len, l1blockhash, retdest KECCAK_GENERAL // stack: slot, l1blockhash, retdest diff --git a/evm_arithmetization/src/cpu/kernel/asm/core/gas.asm b/evm_arithmetization/src/cpu/kernel/asm/core/gas.asm index 2e16c373e..7b16cbed3 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/core/gas.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/core/gas.asm @@ -16,17 +16,7 @@ global sys_gas: %endmacro -// TODO: `%refund_gas` and `refund_gas_hook` are hooks used for debugging. They should be removed at some point and `refund_gas_original` renamed to `refund_gas`. %macro refund_gas - PUSH %%after %jump(refund_gas_hook) -%%after: - %refund_gas_original -%endmacro - -global refund_gas_hook: - JUMP - -%macro refund_gas_original // stack: amount DUP1 %journal_refund %mload_global_metadata(@GLOBAL_METADATA_REFUND_COUNTER) @@ -34,18 +24,8 @@ global refund_gas_hook: %mstore_global_metadata(@GLOBAL_METADATA_REFUND_COUNTER) %endmacro -// TODO: `%charge_gas` and `charge_gas_hook` are hooks used for debugging. They should be removed at some point and `charge_gas_original` renamed to `charge_gas`. -%macro charge_gas - PUSH %%after %jump(charge_gas_hook) -%%after: - %charge_gas_original -%endmacro - -global charge_gas_hook: - JUMP - // Charge gas. 
Faults if we exceed the limit for the current context. -%macro charge_gas_original +%macro charge_gas // stack: gas, kexit_info %shl_const(192) ADD diff --git a/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/blake2_f.asm b/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/blake2_f.asm index 8ad0e5c44..f8b4c8314 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/blake2_f.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/blake2_f.asm @@ -75,47 +75,39 @@ global precompile_blake2_f: SWAP1 // stack: t0_addr = m0_addr + 8 * 16, t_0, t_1, flag, blake2_f_contd, kexit_info + %sub_const(8) + // stack: m0_addr + 8 * (16 - 1), t_0, t_1, flag, blake2_f_contd, kexit_info + + PUSH @SEGMENT_CALLDATA + GET_CONTEXT + %build_address_no_offset + %rep 16 - // stack: m0_addr + 8 * (16 - i), m_(i+1), ..., m_15, t_0, t_1, flag, blake2_f_contd, kexit_info - %sub_const(8) - // stack: m0_addr + 8 * (16 - i - 1), m_(i+1), ..., m_15, t_0, t_1, flag, blake2_f_contd, kexit_info - DUP1 - // stack: m0_addr + 8 * (16 - i - 1), m0_addr + 8 * (16 - i - 1), m_(i+1), ..., m_15, t_0, t_1, flag, blake2_f_contd, kexit_info - PUSH @SEGMENT_CALLDATA - // stack: @SEGMENT_CALLDATA, m0_addr + 8 * (16 - i - 1), m0_addr + 8 * (16 - i - 1), m_(i+1), ..., m_15, t_0, t_1, flag, blake2_f_contd, kexit_info - GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, m0_addr + 8 * (16 - i - 1), m0_addr + 8 * (16 - i - 1), m_(i+1), ..., m_15, t_0, t_1, flag, blake2_f_contd, kexit_info - %build_address + // stack: base_addr, m0_addr + 8 * (16 - i - 1), m_(i+1), ..., m_15, t_0, t_1, flag, blake2_f_contd, kexit_info + DUP2 DUP2 + // stack: base_addr, m0_addr + 8 * (16 - i - 1), base_addr, m0_addr + 8 * (16 - i - 1), m_(i+1), ..., m_15, t_0, t_1, flag, blake2_f_contd, kexit_info + ADD // base_addr + offset %mload_packing_u64_LE - // stack: m_i, m0_addr + 8 * (16 - i - 1), m_(i+1), ..., m_15, t_0, t_1, flag, blake2_f_contd, kexit_info - SWAP1 - // stack: m0_addr + 8 * (16 - i - 1), m_i, 
m_(i+1), ..., m_15, t_0, t_1, flag, blake2_f_contd, kexit_info + // stack: m_i, base_addr, m0_addr + 8 * (16 - i - 1), m_(i+1), ..., m_15, t_0, t_1, flag, blake2_f_contd, kexit_info + SWAP2 %sub_const(8) SWAP1 + // stack: base_addr, m0_addr + 8 * (16 - i - 2), m_i, m_(i+1), ..., m_15, t_0, t_1, flag, blake2_f_contd, kexit_info %endrep - // stack: m0_addr = h0_addr + 8 * 8, m_0, ..., m_15, t_0, t_1, flag, blake2_f_contd, kexit_info + // stack: base_addr, m0_addr = h0_addr + 8 * 8, m_0, ..., m_15, t_0, t_1, flag, blake2_f_contd, kexit_info %rep 8 - // stack: h0_addr + 8 * (8 - i), h_(i+1), ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info - %sub_const(8) - // stack: h0_addr + 8 * (8 - i - 1), h_(i+1), ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info - DUP1 - // stack: h0_addr + 8 * (8 - i), h0_addr + 8 * (8 - i), h_(i+1), ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info - PUSH @SEGMENT_CALLDATA - // stack: @SEGMENT_CALLDATA, h0_addr + 8 * (8 - i), h0_addr + 8 * (8 - i), h_(i+1), ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info - GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, h0_addr + 8 * (8 - i), h0_addr + 8 * (8 - i), h_(i+1), ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info - %build_address + // stack: base_addr, h0_addr + 8 * (8 - i - 1), h_(i+1), ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info + DUP2 DUP2 + // stack: base_addr, h0_addr + 8 * (8 - i - 1), base_addr, h0_addr + 8 * (8 - i - 1), h_(i+1), ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info + ADD // base_addr + offset %mload_packing_u64_LE - // stack: h_i, h0_addr + 8 * (8 - i), h_(i+1), ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info - SWAP1 - // stack: h0_addr + 8 * (8 - i), h_i, h_(i+1), ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info + // stack: h_i, base_addr, h0_addr + 8 * (8 - i - 1), h_(i+1), ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info + SWAP2
%sub_const(8) SWAP1 + // stack: base_addr, h0_addr + 8 * (8 - i - 2), h_i, h_(i+1), ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info %endrep - // stack: h0_addr + 8 * 8 = 68, h_0, ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info - POP - - %stack () -> (@SEGMENT_CALLDATA, 4) - GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 4, h_0..h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info - %build_address_no_offset + // stack: base_addr, garbage, h_0, ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info + + PUSH 4 SWAP2 POP + // stack: base_addr, 4, h_0, ..., h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info MLOAD_32BYTES // stack: rounds, h_0..h_7, m_0..m_15, t_0, t_1, flag, blake2_f_contd, kexit_info diff --git a/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/bn_add.asm b/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/bn_add.asm index 43414e859..038e63a90 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/bn_add.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/bn_add.asm @@ -14,50 +14,47 @@ global precompile_bn_add: %charge_gas_const(@BN_ADD_GAS) + GET_CONTEXT + PUSH @SEGMENT_CALLDATA + %build_address_no_offset + // stack: base_addr, kexit_info + // Load x0, y0, x1, y1 from the call data using `MLOAD_32BYTES`.
PUSH bn_add_return - // stack: bn_add_return, kexit_info - %stack () -> (@SEGMENT_CALLDATA, 96, 32) - GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 96, 32, bn_add_return, kexit_info - %build_address + // stack: bn_add_return, base_addr, kexit_info + %stack (bn_add_return, base_addr) -> (base_addr, 96, 32, bn_add_return, base_addr) + ADD // base_addr + offset MLOAD_32BYTES - // stack: y1, bn_add_return, kexit_info - %stack () -> (@SEGMENT_CALLDATA, 64, 32) - GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 64, 32, y1, bn_add_return, kexit_info - %build_address + // stack: y1, bn_add_return, base_addr, kexit_info + %stack (y1, bn_add_return, base_addr) -> (base_addr, 64, 32, y1, bn_add_return, base_addr) + ADD // base_addr + offset MLOAD_32BYTES - // stack: x1, y1, bn_add_return, kexit_info - %stack () -> (@SEGMENT_CALLDATA, 32, 32) - GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 32, 32, x1, y1, bn_add_return, kexit_info - %build_address + // stack: x1, y1, bn_add_return, base_addr, kexit_info + %stack (x1, y1, bn_add_return, base_addr) -> (base_addr, 32, 32, x1, y1, bn_add_return, base_addr) + ADD // base_addr + offset MLOAD_32BYTES - // stack: y0, x1, y1, bn_add_return, kexit_info - %stack () -> (@SEGMENT_CALLDATA, 32) - GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 32, y0, x1, y1, bn_add_return, kexit_info - %build_address_no_offset + // stack: y0, x1, y1, bn_add_return, base_addr, kexit_info + %stack (y0, x1, y1, bn_add_return, base_addr) -> (base_addr, 32, y0, x1, y1, bn_add_return, base_addr) MLOAD_32BYTES - // stack: x0, y0, x1, y1, bn_add_return, kexit_info + // stack: x0, y0, x1, y1, bn_add_return, base_addr, kexit_info %jump(bn_add) bn_add_return: - // stack: x, y, kexit_info + // stack: x, y, base_addr, kexit_info DUP2 %eq_const(@U256_MAX) // bn_add returns (U256_MAX, U256_MAX) on bad input. DUP2 %eq_const(@U256_MAX) // bn_add returns (U256_MAX, U256_MAX) on bad input. 
MUL // Cheaper than AND %jumpi(fault_exception) - // stack: x, y, kexit_info + // stack: x, y, base_addr, kexit_info // Store the result (x, y) to the parent's return data using `mstore_unpacking`. %mstore_parent_context_metadata(@CTX_METADATA_RETURNDATA_SIZE, 64) %mload_context_metadata(@CTX_METADATA_PARENT_CONTEXT) - %stack (parent_ctx, x, y) -> (parent_ctx, @SEGMENT_RETURNDATA, x, parent_ctx, y) + %stack (parent_ctx, x, y) -> (parent_ctx, @SEGMENT_RETURNDATA, x, y) %build_address_no_offset + // stack: addr_x, x, y, base_addr, kexit_info MSTORE_32BYTES_32 - POP - %stack (parent_ctx, y) -> (parent_ctx, @SEGMENT_RETURNDATA, 32, y) - %build_address + // stack: addr_y = addr_x + 32, y, base_addr, kexit_info MSTORE_32BYTES_32 + // stack: addr, base_addr, kexit_info + POP %jump(pop_and_return_success) diff --git a/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/bn_mul.asm b/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/bn_mul.asm index c29080166..62a25d153 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/bn_mul.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/bn_mul.asm @@ -14,30 +14,28 @@ global precompile_bn_mul: %charge_gas_const(@BN_MUL_GAS) + GET_CONTEXT + PUSH @SEGMENT_CALLDATA + %build_address_no_offset + // stack: base_addr, kexit_info + // Load x, y, n from the call data using `MLOAD_32BYTES`. 
PUSH bn_mul_return - // stack: bn_mul_return, kexit_info - %stack () -> (@SEGMENT_CALLDATA, 64, 32) - GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 64, 32, bn_mul_return, kexit_info - %build_address + // stack: bn_mul_return, base_addr, kexit_info + %stack (bn_mul_return, base_addr) -> (base_addr, 64, 32, bn_mul_return, base_addr) + ADD // base_addr + offset MLOAD_32BYTES - // stack: n, bn_mul_return, kexit_info - %stack () -> (@SEGMENT_CALLDATA, 32, 32) - GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 32, 32, n, bn_mul_return, kexit_info - %build_address + // stack: n, bn_mul_return, base_addr, kexit_info + %stack (n, bn_mul_return, base_addr) -> (base_addr, 32, 32, n, bn_mul_return, base_addr) + ADD // base_addr + offset MLOAD_32BYTES - // stack: y, n, bn_mul_return, kexit_info - %stack () -> (@SEGMENT_CALLDATA, 32) - GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 32, y, n, bn_mul_return, kexit_info - %build_address_no_offset + // stack: y, n, bn_mul_return, base_addr, kexit_info + %stack (y, n, bn_mul_return, base_addr) -> (base_addr, 32, y, n, bn_mul_return, base_addr) MLOAD_32BYTES - // stack: x, y, n, bn_mul_return, kexit_info + // stack: x, y, n, bn_mul_return, base_addr, kexit_info %jump(bn_mul) bn_mul_return: - // stack: Px, Py, kexit_info + // stack: Px, Py, base_addr, kexit_info DUP2 %eq_const(@U256_MAX) // bn_mul returns (U256_MAX, U256_MAX) on bad input. DUP2 %eq_const(@U256_MAX) // bn_mul returns (U256_MAX, U256_MAX) on bad input. 
MUL // Cheaper than AND @@ -55,4 +53,6 @@ bn_mul_contd6: %stack (parent_ctx, Py) -> (parent_ctx, @SEGMENT_RETURNDATA, 32, Py) %build_address MSTORE_32BYTES_32 + // stack: addr, base_addr, kexit_info + POP %jump(pop_and_return_success) diff --git a/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/ecrec.asm b/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/ecrec.asm index 4a27ca75b..8f3c30f7d 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/ecrec.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/ecrec.asm @@ -14,36 +14,33 @@ global precompile_ecrec: %charge_gas_const(@ECREC_GAS) + GET_CONTEXT + PUSH @SEGMENT_CALLDATA + %build_address_no_offset + // stack: base_addr, kexit_info + // Load hash, v, r, s from the call data using `MLOAD_32BYTES`. PUSH ecrec_return - // stack: ecrec_return, kexit_info - %stack () -> (@SEGMENT_CALLDATA, 96, 32) - GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 96, 32, ecrec_return, kexit_info - %build_address + // stack: ecrec_return, base_addr, kexit_info + + %stack (ecrec_return, base_addr) -> (base_addr, 96, 32, ecrec_return, base_addr) + ADD // base_addr + offset MLOAD_32BYTES - // stack: s, ecrec_return, kexit_info - %stack () -> (@SEGMENT_CALLDATA, 64, 32) - GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 64, 32, s, ecrec_return, kexit_info - %build_address + // stack: s, ecrec_return, base_addr, kexit_info + %stack (s, ecrec_return, base_addr) -> (base_addr, 64, 32, s, ecrec_return, base_addr) + ADD // base_addr + offset MLOAD_32BYTES - // stack: r, s, ecrec_return, kexit_info - %stack () -> (@SEGMENT_CALLDATA, 32, 32) - GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 32, 32, r, s, ecrec_return, kexit_info - %build_address + // stack: r, s, ecrec_return, base_addr, kexit_info + %stack (r, s, ecrec_return, base_addr) -> (base_addr, 32, 32, r, s, ecrec_return, base_addr) + ADD // base_addr + offset MLOAD_32BYTES - // stack: v, r, s, ecrec_return, kexit_info - %stack () -> 
(@SEGMENT_CALLDATA, 32) - GET_CONTEXT - // stack: ctx, @SEGMENT_CALLDATA, 32, v, r, s, ecrec_return, kexit_info - %build_address_no_offset + // stack: v, r, s, ecrec_return, base_addr, kexit_info + %stack (v, r, s, ecrec_return, base_addr) -> (base_addr, 32, v, r, s, ecrec_return, base_addr) MLOAD_32BYTES - // stack: hash, v, r, s, ecrec_return, kexit_info + // stack: hash, v, r, s, ecrec_return, base_addr, kexit_info %jump(ecrecover) ecrec_return: - // stack: address, kexit_info + // stack: address, base_addr, kexit_info DUP1 %eq_const(@U256_MAX) %jumpi(ecrec_bad_input) // ecrecover returns U256_MAX on bad input. // Store the result address to the parent's return data using `mstore_unpacking`. @@ -52,9 +49,13 @@ ecrec_return: %stack (parent_ctx, address) -> (parent_ctx, @SEGMENT_RETURNDATA, address) %build_address_no_offset MSTORE_32BYTES_32 + // stack: addr, base_addr, kexit_info + POP %jump(pop_and_return_success) // On bad input, return empty return data but still return success. ecrec_bad_input: %mstore_parent_context_metadata(@CTX_METADATA_RETURNDATA_SIZE, 0) + // stack: addr, base_addr, kexit_info + POP %jump(pop_and_return_success) diff --git a/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/expmod.asm b/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/expmod.asm index 1dc7841b5..684c80810 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/expmod.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/expmod.asm @@ -85,7 +85,7 @@ store_limbs_return: %macro expmod_gas_f // stack: x // Overflow check - DUP1 %ge_const(0x800000000000000000000000000000007) %jumpi(fault_exception) + DUP1 %gt_const(0x800000000000000000000000000000006) %jumpi(fault_exception) // stack: x %ceil_div_const(8) // stack: ceil(x/8) @@ -100,7 +100,7 @@ calculate_l_E_prime: DUP1 %gt_const(0x100000000000000000000000000000000) %jumpi(fault_exception) DUP1 ISZERO %jumpi(case_le_zero) // stack: l_E, l_B, retdest - DUP1 %le_const(32) + DUP1 
%lt_const(33) // stack: l_E <= 32, l_E, l_B, retdest %jumpi(case_le_32) // stack: l_E, l_B, retdest @@ -121,7 +121,7 @@ calculate_l_E_prime: // stack: l_E, log2(i[96 + l_B..128 + l_B]), l_B, retdest %sub_const(32) // Overflow check - DUP1 %ge_const(0x2000000000000000000000000000000000000000000000000000000000000000) %jumpi(fault_exception) + DUP1 %gt_const(0x1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff) %jumpi(fault_exception) %mul_const(8) // stack: 8 * (l_E - 32), log2(i[96 + l_B..128 + l_B]), l_B, retdest ADD diff --git a/evm_arithmetization/src/cpu/kernel/asm/core/terminate.asm b/evm_arithmetization/src/cpu/kernel/asm/core/terminate.asm index 1d406097c..2e3482ea3 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/core/terminate.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/core/terminate.asm @@ -193,7 +193,7 @@ revert_after_gas: %stack (addr, size, parent_ctx, kexit_info) -> ( parent_ctx, @SEGMENT_RETURNDATA, // DST - addr, // SRC + addr, // SRC size, sys_revert_finish, kexit_info // count, retdest, ... ) %build_address_no_offset diff --git a/evm_arithmetization/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/glv.asm b/evm_arithmetization/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/glv.asm index 32eb5b6c1..fea966f8b 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/glv.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/curve/bn254/curve_arithmetic/glv.asm @@ -76,7 +76,7 @@ global bn_glv_decompose: // along with a flag `underflow` set to 1 if there is an underflow, 0 otherwise. 
ADD %bn_sub_check_underflow // stack: k2, underflow, N, k, retdest - DUP1 %ge_const(0x80000000000000000000000000000000) %jumpi(negate) + DUP1 %gt_const(0x7fffffffffffffffffffffffffffffff) %jumpi(negate) %jump(contd) negate: // stack: k2, underflow, N, k, retdest diff --git a/evm_arithmetization/src/cpu/kernel/asm/curve/secp256k1/curve_add.asm b/evm_arithmetization/src/cpu/kernel/asm/curve/secp256k1/curve_add.asm index f1385de56..132beac0b 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/curve/secp256k1/curve_add.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/curve/secp256k1/curve_add.asm @@ -99,11 +99,11 @@ secp_add_valid_points_with_lambda: // stack: N, lambda, x0, y0, x1, y1, retdest DUP3 // stack: x0, N, lambda, x0, y0, x1, y1, retdest - %secp_base + DUP2 // stack: N, x0, N, lambda, x0, y0, x1, y1, retdest DUP7 // stack: x1, N, x0, N, lambda, x0, y0, x1, y1, retdest - %secp_base + DUP2 // stack: N, x1, N, x0, N, lambda, x0, y0, x1, y1, retdest DUP6 // stack: lambda, N, x1, N, x0, N, lambda, x0, y0, x1, y1, retdest @@ -117,7 +117,7 @@ secp_add_valid_points_with_lambda: // stack: x2, lambda, x0, y0, x1, y1, retdest // Compute y2 = lambda*(x1 - x2) - y1 - %secp_base %secp_base %secp_base // Pre-load moduli for incoming SUBMODs + %secp_base DUP1 DUP1 // Pre-load moduli for incoming SUBMODs // stack: N, N, N, x2, lambda, x0, y0, x1, y1, retdest DUP4 // stack: x2, N, N, N, x2, lambda, x0, y0, x1, y1, retdest @@ -244,11 +244,11 @@ global secp_double: // stack: x, y, (y < N) & (x < N) %secp_base // stack: N, x, y, b - %secp_base + DUP1 // stack: N, N, x, y, b DUP3 // stack: x, N, N, x, y, b - %secp_base + DUP2 // stack: N, x, N, N, x, y, b DUP2 // stack: x, N, x, N, N, x, y, b diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/blake2_f.asm b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/blake2_f.asm index d1a4a2ab6..aa9951997 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/blake2_f.asm +++ 
b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/blake2_f.asm @@ -35,13 +35,15 @@ global blake2_f: %add_const(7) %rep 8 // stack: addr, ... - DUP1 - // stack: addr, addr, ... + PUSH 1 + // stack: 1, addr, ... + DUP2 + // stack: addr, 1, addr, ... MLOAD_GENERAL - // stack: val, addr, ... - SWAP1 - // stack: addr, val, ... - %decrement + // stack: val, 1, addr, ... + SWAP2 + // stack: addr, 1, val, ... + SUB %endrep // stack: addr, h_0, ..., h_7, rounds, t0, t1, flag, retdest POP @@ -66,16 +68,18 @@ global blake2_f: %rep 4 // stack: i, addr, ... DUP2 + // stack: addr, i, addr, ... + %increment + // stack: addr + 1, i, addr, ... + SWAP2 + // stack: addr, i, addr + 1, ... DUP2 - // stack: i, addr, i, addr, ... + // stack: i, addr, i, addr + 1, ... %blake2_iv - // stack: IV_i, addr, i, addr, ... + // stack: IV_i, addr, i, addr + 1, ... MSTORE_GENERAL - // stack: i, addr, ... + // stack: i, addr + 1, ... %increment - SWAP1 - %increment - SWAP1 // stack: i + 1, addr + 1,... %endrep // stack: 4, start + 12, rounds, t0, t1, flag, retdest diff --git a/evm_arithmetization/src/cpu/kernel/asm/main.asm b/evm_arithmetization/src/cpu/kernel/asm/main.asm index e72c20ee6..51d464a2d 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/main.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/main.asm @@ -130,14 +130,14 @@ global start_txns: #[cfg(feature = eth_mainnet)] { // If txn_idx == 0, update the beacon_root for Ethereum mainnet. - %mload_global_metadata(@GLOBAL_METADATA_TXN_NUMBER_BEFORE) + DUP4 ISZERO %jumpi(set_beacon_root) } #[cfg(feature = cdk_erigon)] { // If txn_idx == 0, perform pre-state execution for CDK erigon. 
- %mload_global_metadata(@GLOBAL_METADATA_TXN_NUMBER_BEFORE) + DUP4 ISZERO %jumpi(pre_block_execution) } diff --git a/evm_arithmetization/src/cpu/kernel/asm/memory/syscalls.asm b/evm_arithmetization/src/cpu/kernel/asm/memory/syscalls.asm index 3e3d43f1f..d2148de91 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/memory/syscalls.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/memory/syscalls.asm @@ -102,7 +102,8 @@ calldataload_large_offset: // stack: kexit_info, dest_offset, offset, size GET_CONTEXT PUSH $segment - // stack: segment, context, kexit_info, dest_offset, offset, size + %build_address_no_offset + // stack: base_addr, kexit_info, dest_offset, offset, size %jump(wcopy_within_bounds) %endmacro @@ -130,12 +131,11 @@ codecopy_within_bounds: %jump(memcpy_bytes) wcopy_within_bounds: - // TODO: rework address creation to have less stack manipulation overhead - // stack: segment, src_ctx, kexit_info, dest_offset, offset, size + // stack: base_addr, kexit_info, dest_offset, offset, size GET_CONTEXT - %stack (context, segment, src_ctx, kexit_info, dest_offset, offset, size) -> - (src_ctx, segment, offset, @SEGMENT_MAIN_MEMORY, dest_offset, context, size, wcopy_after, kexit_info) - %build_address + %stack (context, base_addr, kexit_info, dest_offset, offset, size) -> + (base_addr, offset, @SEGMENT_MAIN_MEMORY, dest_offset, context, size, wcopy_after, kexit_info) + ADD // SRC SWAP3 %build_address // stack: DST, SRC, size, wcopy_after, kexit_info %jump(memcpy_bytes) @@ -287,24 +287,26 @@ global sys_mcopy: // stack: kexit_info, dest_offset, offset, size GET_CONTEXT PUSH @SEGMENT_MAIN_MEMORY + %build_address_no_offset - DUP5 DUP5 LT - // stack: dest_offset < offset, kexit_info, dest_offset, offset, size + DUP4 DUP4 LT + // stack: dest_offset < offset, base_addr, kexit_info, dest_offset, offset, size %jumpi(wcopy_within_bounds) - // stack: segment, context, kexit_info, dest_offset, offset, size - DUP6 PUSH 32 %min - // stack: shift=min(size, 32), segment, context, 
kexit_info, dest_offset, offset, size - DUP6 DUP8 ADD - // stack: offset + size, shift, segment, context, kexit_info, dest_offset, offset, size - DUP6 LT - // stack: dest_offset < offset + size, shift, segment, context, kexit_info, dest_offset, offset, size + // stack: base_addr, kexit_info, dest_offset, offset, size + + DUP5 PUSH 32 %min + // stack: shift=min(size, 32), base_addr, kexit_info, dest_offset, offset, size + DUP5 DUP7 ADD + // stack: offset + size, shift, base_addr, kexit_info, dest_offset, offset, size + DUP5 LT + // stack: dest_offset < offset + size, shift, base_addr, kexit_info, dest_offset, offset, size DUP2 - // stack: shift, dest_offset < offset + size, shift, segment, context, kexit_info, dest_offset, offset, size - DUP9 GT - // stack: size > shift, dest_offset < offset + size, shift, segment, context, kexit_info, dest_offset, offset, size + // stack: shift, dest_offset < offset + size, shift, base_addr, kexit_info, dest_offset, offset, size + DUP8 GT + // stack: size > shift, dest_offset < offset + size, shift, base_addr, kexit_info, dest_offset, offset, size MUL // AND - // stack: (size > shift) && (dest_offset < offset + size), shift, segment, context, kexit_info, dest_offset, offset, size + // stack: (size > shift) && (dest_offset < offset + size), shift, base_addr, kexit_info, dest_offset, offset, size // If the conditions `size > shift` and `dest_offset < offset + size` are satisfied, that means // we will get an overlap that will overwrite some SRC data. In that case, we will proceed to the @@ -313,7 +315,7 @@ global sys_mcopy: // Otherwise, we either have `SRC` < `DST`, or a small enough `size` that a single loop of // `memcpy_bytes` suffices and does not risk to overwrite `SRC` data before being read. 
- // stack: shift, segment, context, kexit_info, dest_offset, offset, size + // stack: shift, base_addr, kexit_info, dest_offset, offset, size POP %jump(wcopy_within_bounds) @@ -323,24 +325,22 @@ mcopy_with_overlap: // For this, we need to update `offset` and `dest_offset` to their final position, corresponding // to `x + size - min(32, size)`. - // stack: shift=min(size, 32), segment, context, kexit_info, dest_offset, offset, size + // stack: shift=min(size, 32), base_addr, kexit_info, dest_offset, offset, size DUP1 - // stack: shift, shift, segment, context, kexit_info, dest_offset, offset, size - DUP8 DUP8 ADD - // stack: offset+size, shift, shift, segment, context, kexit_info, dest_offset, offset, size + // stack: shift, shift, base_addr, kexit_info, dest_offset, offset, size + DUP7 DUP7 ADD + // stack: offset+size, shift, shift, base_addr, kexit_info, dest_offset, offset, size SUB - // stack: offset'=offset+size-shift, shift, segment, context, kexit_info, dest_offset, offset, size - SWAP5 DUP8 ADD - // stack: dest_offset+size, shift, segment, context, kexit_info, offset', offset, size + // stack: offset'=offset+size-shift, shift, base_addr, kexit_info, dest_offset, offset, size + SWAP4 DUP7 ADD + // stack: dest_offset+size, shift, base_addr, kexit_info, offset', offset, size SUB - // stack: dest_offset'=dest_offset+size-shift, segment, context, kexit_info, offset', offset, size + // stack: dest_offset'=dest_offset+size-shift, base_addr, kexit_info, offset', offset, size - %stack (next_dst_offset, segment, context, kexit_info, new_offset, offset, size) -> - (context, segment, new_offset, segment, next_dst_offset, context, size, wcopy_after, kexit_info) - %build_address // SRC - SWAP3 - %build_address // DST - // stack: DST, SRC, size, wcopy_after, kexit_info + DUP2 ADD // DST + // stack: DST, base_addr, kexit_info, new_offset, offset, size + SWAP3 ADD // SRC + %stack (SRC, kexit_info, DST, offset, size) -> (DST, SRC, size, wcopy_after, kexit_info) 
%jump(memcpy_bytes_backwards) mcopy_empty: diff --git a/evm_arithmetization/src/cpu/kernel/asm/mpt/hex_prefix.asm b/evm_arithmetization/src/cpu/kernel/asm/mpt/hex_prefix.asm index 0ca2458f0..3e8a783ab 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/mpt/hex_prefix.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/mpt/hex_prefix.asm @@ -6,7 +6,7 @@ // Pre stack: rlp_start_addr, num_nibbles, packed_nibbles, terminated, retdest // Post stack: rlp_end_addr global hex_prefix_rlp: - DUP2 %assert_lt_const(65) + DUP2 %assert_le_const(64) PUSH 2 DUP3 DIV // Compute the length of the hex-prefix string, in bytes: diff --git a/evm_arithmetization/src/cpu/kernel/asm/signed.asm b/evm_arithmetization/src/cpu/kernel/asm/signed.asm index 566d7d5ae..a9e9e3648 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/signed.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/signed.asm @@ -131,11 +131,12 @@ global _sys_sar: // Now assume shift < 256. // Stack: shift, value, return_info PUSH 0x8000000000000000000000000000000000000000000000000000000000000000 - DUP2 + DUP1 + DUP3 SHR - // Stack: 2^255 >> shift, shift, value, return_info - SWAP2 - %add_const(0x8000000000000000000000000000000000000000000000000000000000000000) + // Stack: 2^255 >> shift, 0x8000000000000000000000000000000000000000000000000000000000000000, shift, value, return_info + SWAP3 + ADD // Stack: 2^255 + value, shift, 2^255 >> shift, return_info SWAP1 SHR diff --git a/evm_arithmetization/src/cpu/kernel/asm/util/basic_macros.asm b/evm_arithmetization/src/cpu/kernel/asm/util/basic_macros.asm index 395852810..657ee9760 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/util/basic_macros.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/util/basic_macros.asm @@ -174,7 +174,7 @@ PUSH $c // stack: c, input, ... LT // Check it backwards: (input > c) == (c < input) - // stack: input >= c, ... + // stack: input > c, ... 
%endmacro %macro ge_const(c) diff --git a/evm_arithmetization/src/fixed_recursive_verifier.rs b/evm_arithmetization/src/fixed_recursive_verifier.rs index 746f63f44..0d7bb82dd 100644 --- a/evm_arithmetization/src/fixed_recursive_verifier.rs +++ b/evm_arithmetization/src/fixed_recursive_verifier.rs @@ -3174,7 +3174,7 @@ mod tests { "Create all recursive circuits", AllRecursiveCircuits::<F, C, D>::new( &all_stark, - &[16..17, 8..9, 7..8, 4..9, 8..9, 4..7, 17..18, 17..18, 17..18], + &[16..17, 8..9, 7..8, 4..9, 8..9, 4..7, 16..17, 16..17, 16..17], &config, ) ); diff --git a/scripts/prove_stdio.sh b/scripts/prove_stdio.sh index 886f58883..c3c794086 100755 --- a/scripts/prove_stdio.sh +++ b/scripts/prove_stdio.sh @@ -54,10 +54,10 @@ if ! [[ $TEST_ONLY == "test_only" ]]; then echo "Using specific circuit sizes for witness_b19807080.json" export ARITHMETIC_CIRCUIT_SIZE="16..18" export BYTE_PACKING_CIRCUIT_SIZE="8..15" - export CPU_CIRCUIT_SIZE="14..20" - export KECCAK_CIRCUIT_SIZE="10..18" + export CPU_CIRCUIT_SIZE="9..20" + export KECCAK_CIRCUIT_SIZE="7..18" export KECCAK_SPONGE_CIRCUIT_SIZE="8..14" - export LOGIC_CIRCUIT_SIZE="8..17" + export LOGIC_CIRCUIT_SIZE="5..17" export MEMORY_CIRCUIT_SIZE="17..22" export MEMORY_BEFORE_CIRCUIT_SIZE="16..20" export MEMORY_AFTER_CIRCUIT_SIZE="7..20" @@ -73,7 +73,7 @@ if !
[[ $TEST_ONLY == "test_only" ]]; then export KECCAK_SPONGE_CIRCUIT_SIZE="8..9" export LOGIC_CIRCUIT_SIZE="4..14" export MEMORY_CIRCUIT_SIZE="17..22" - export MEMORY_BEFORE_CIRCUIT_SIZE="17..18" + export MEMORY_BEFORE_CIRCUIT_SIZE="16..18" export MEMORY_AFTER_CIRCUIT_SIZE="7..8" export POSEIDON_CIRCUIT_SIZE="4..8" else @@ -112,11 +112,10 @@ cargo build --release --jobs "$num_procs" start_time=$(date +%s%N) -"${REPO_ROOT}/target/release/leader" --runtime in-memory --load-strategy on-demand --block-batch-size $BLOCK_BATCH_SIZE \ +"${REPO_ROOT}/target/release/leader" --runtime in-memory --load-strategy on-demand -n 1 --block-batch-size $BLOCK_BATCH_SIZE \ --proof-output-dir $PROOF_OUTPUT_DIR stdio < $INPUT_FILE &> $OUTPUT_LOG end_time=$(date +%s%N) -set +o pipefail cat $OUTPUT_LOG | grep "Successfully wrote to disk proof file " | awk '{print $NF}' | tee $PROOFS_FILE_LIST if [ ! -s "$PROOFS_FILE_LIST" ]; then # Some error occurred, display the logs and exit.