From abef1d48259b73248250379af4c4afaf8cff7f87 Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Mon, 12 Aug 2024 09:00:55 +0200 Subject: [PATCH 1/3] Remove profiling nvtx from develop branch. This make trouble for continuous integration and is apparently not supported on all platforms. Since it is a debug function, it's just as well to remove it from the mainstream tree. --- CMakeLists.txt | 13 ------------- appveyor.yml | 2 +- cmake/sift_config.h.in | 1 - cudaInstallAppveyor.cmd | 3 --- src/CMakeLists.txt | 6 ------ src/popsift/popsift.cu | 6 ------ src/popsift/popsift.h | 10 ---------- src/popsift/s_filtergrid.cu | 9 --------- src/popsift/s_image.cu | 23 ----------------------- src/popsift/s_orientation.cu | 7 ------- src/popsift/sift_desc.cu | 10 ---------- src/popsift/sift_pyramid.cu | 14 -------------- 12 files changed, 1 insertion(+), 103 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e9138a8..02d027f1 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,6 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-$ option(PopSift_BUILD_EXAMPLES "Build PopSift applications." ON) option(PopSift_BUILD_DOCS "Build PopSift documentation." OFF) -option(PopSift_USE_NVTX_PROFILING "Use CUDA NVTX for profiling." OFF) option(PopSift_ERRCHK_AFTER_KERNEL "Synchronize and check CUDA error after every kernel." OFF) option(PopSift_USE_POSITION_INDEPENDENT_CODE "Generate position independent code." ON) option(PopSift_USE_GRID_FILTER "Switch off grid filtering to massively reduce compile time while debugging other things." ON) @@ -99,10 +98,6 @@ find_package(CUDAToolkit) message(STATUS "CUDA Version is ${CUDAToolkit_VERSION}") set(CUDA_VERSION ${CUDAToolkit_VERSION}) -if(PopSift_USE_NVTX_PROFILING) - message(STATUS "PROFILING CPU CODE: NVTX is in use") -endif() - if(PopSift_ERRCHK_AFTER_KERNEL) message(STATUS "Synchronizing and checking errors after every kernel call") list(APPEND CUDA_NVCC_FLAGS "-DERRCHK_AFTER_KERNEL") @@ -153,13 +148,6 @@ else() set(DISABLE_GRID_FILTER 0) endif() -if(PopSift_USE_NVTX_PROFILING) - # library required for NVTX profiling of the CPU - set(PopSift_USE_NVTX 1) -else() - set(PopSift_USE_NVTX 0) -endif() - add_subdirectory(src) if(PopSift_BUILD_DOCS) @@ -197,7 +185,6 @@ message(STATUS "Build Shared libs: " ${BUILD_SHARED_LIBS}) message(STATUS "Build examples: " ${PopSift_BUILD_EXAMPLES}) message(STATUS "Build documentation: " ${PopSift_BUILD_DOCS}) message(STATUS "Generate position independent code: " ${CMAKE_POSITION_INDEPENDENT_CODE}) -message(STATUS "Use CUDA NVTX for profiling: " ${PopSift_USE_NVTX_PROFILING}) message(STATUS "Synchronize and check CUDA error after every kernel: " ${PopSift_ERRCHK_AFTER_KERNEL}) message(STATUS "Grid filtering: " ${PopSift_USE_GRID_FILTER}) message(STATUS "Additional warning for CUDA nvcc: " ${PopSift_NVCC_WARNINGS}) diff --git a/appveyor.yml b/appveyor.yml index 679ac5b5..2f01c648 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -44,7 +44,7 @@ install: before_build: - md build - cd build - - cmake -G "Visual Studio 17 2022" -A x64 -T v143,host=x64,cuda="%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5" -DBUILD_SHARED_LIBS:BOOL=ON -DPopSift_USE_NVTX_PROFILING:BOOL=OFF -DPopSift_USE_GRID_FILTER:BOOL=OFF -DPopSift_BUILD_DOCS:BOOL=OFF -DPopSift_USE_POSITION_INDEPENDENT_CODE:BOOL=ON -DPopSift_BUILD_EXAMPLES:BOOL=ON -DCMAKE_BUILD_TYPE=%configuration% -DCMAKE_TOOLCHAIN_FILE=c:/tools/vcpkg/scripts/buildsystems/vcpkg.cmake .. + - cmake -G "Visual Studio 17 2022" -A x64 -T v143,host=x64,cuda="%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5" -DBUILD_SHARED_LIBS:BOOL=ON -DPopSift_USE_GRID_FILTER:BOOL=OFF -DPopSift_BUILD_DOCS:BOOL=OFF -DPopSift_USE_POSITION_INDEPENDENT_CODE:BOOL=ON -DPopSift_BUILD_EXAMPLES:BOOL=ON -DCMAKE_BUILD_TYPE=%configuration% -DCMAKE_TOOLCHAIN_FILE=c:/tools/vcpkg/scripts/buildsystems/vcpkg.cmake .. - ls -l build: diff --git a/cmake/sift_config.h.in b/cmake/sift_config.h.in index b6807983..518285b8 100644 --- a/cmake/sift_config.h.in +++ b/cmake/sift_config.h.in @@ -13,5 +13,4 @@ #define POPSIFT_HAVE_SHFL_DOWN_SYNC() @PopSift_HAVE_SHFL_DOWN_SYNC@ #define POPSIFT_DISABLE_GRID_FILTER() @DISABLE_GRID_FILTER@ -#define POPSIFT_USE_NVTX() @PopSift_USE_NVTX@ diff --git a/cudaInstallAppveyor.cmd b/cudaInstallAppveyor.cmd index 9d43f7fa..c59938a4 100644 --- a/cudaInstallAppveyor.cmd +++ b/cudaInstallAppveyor.cmd @@ -4,14 +4,12 @@ echo Downloading CUDA toolkit 12 for Windows 10 appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.5.82-archive.zip -Filename cuda_nvcc.zip appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.5.82-archive.zip -Filename cuda_cudart.zip -appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.5.82-archive.zip -Filename cuda_nvtx.zip appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.5.82-archive.zip -Filename vs_integration.zip dir echo Unzipping CUDA toolkit 12 tar -xf cuda_nvcc.zip tar -xf cuda_cudart.zip -tar -xf cuda_nvtx.zip tar -xf vs_integration.zip dir @@ -22,7 +20,6 @@ mkdir "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5\extras" echo Copying toolkit files to install dir(s) xcopy cuda_cudart-windows-x86_64-12.5.82-archive "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5" /s /e /i /y xcopy cuda_nvcc-windows-x86_64-12.5.82-archive "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5" /s /e /i /y -xcopy cuda_nvtx-windows-x86_64-12.5.82-archive "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5" /s /e /i /y xcopy visual_studio_integration-windows-x86_64-12.5.82-archive "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v12.5\extras" /s /e /i /y diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ff3b3681..bd4b5faa 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -45,12 +45,6 @@ target_link_libraries(popsift CUDA::cudart Threads::Threads) -if(PopSift_USE_NVTX_PROFILING) -target_link_libraries(popsift - PUBLIC - CUDA::nvtx3) -endif() - set_target_properties(popsift PROPERTIES VERSION ${PROJECT_VERSION}) set_target_properties(popsift PROPERTIES DEBUG_POSTFIX "d") set_target_properties(popsift PROPERTIES CUDA_SEPARABLE_COMPILATION ON) diff --git a/src/popsift/popsift.cu b/src/popsift/popsift.cu index 09575772..46ac8467 100755 --- a/src/popsift/popsift.cu +++ b/src/popsift/popsift.cu @@ -438,18 +438,12 @@ void SiftJob::setImg( popsift::ImageBase* img ) popsift::ImageBase* SiftJob::getImg() { -#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX) - _nvtx_id = nvtxRangeStartA( "inserting image" ); -#endif return _img; } void SiftJob::setFeatures( popsift::FeaturesBase* f ) { _p.set_value( f ); -#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX) - nvtxRangeEnd( _nvtx_id ); -#endif } popsift::FeaturesHost* SiftJob::get() diff --git a/src/popsift/popsift.h b/src/popsift/popsift.h index 5654cc76..4c555400 100755 --- a/src/popsift/popsift.h +++ b/src/popsift/popsift.h @@ -23,13 +23,6 @@ #include #include -#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX) -#include -#else -#define nvtxRangeStartA(a) -#define nvtxRangeEnd(a) -#endif - /* user parameters */ namespace popsift { @@ -50,9 +43,6 @@ class SiftJob unsigned char* _imageData; popsift::ImageBase* _img; std::exception_ptr _err; -#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX) - nvtxRangeId_t _nvtx_id; -#endif public: diff --git a/src/popsift/s_filtergrid.cu b/src/popsift/s_filtergrid.cu index bfe2e64e..ba973c97 100644 --- a/src/popsift/s_filtergrid.cu +++ b/src/popsift/s_filtergrid.cu @@ -9,13 +9,6 @@ #include "sift_extremum.h" #include "sift_pyramid.h" -#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX) -#include -#else -#define nvtxRangePushA(a) -#define nvtxRangePop() -#endif - #if ! POPSIFT_IS_DEFINED(POPSIFT_DISABLE_GRID_FILTER) #include @@ -317,9 +310,7 @@ int Pyramid::extrema_filter_grid( const Config& conf, int ext_total ) } } - nvtxRangePushA( "writing back count" ); writeDescCountersToDevice( ); - nvtxRangePop( ); return ret_ext_total; } diff --git a/src/popsift/s_image.cu b/src/popsift/s_image.cu index a966dd39..6e0d217e 100755 --- a/src/popsift/s_image.cu +++ b/src/popsift/s_image.cu @@ -15,13 +15,6 @@ #include #include -#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX) -#include -#else -#define nvtxRangePushA(a) -#define nvtxRangePop() -#endif - using namespace std; namespace popsift { @@ -98,8 +91,6 @@ void Image::resetDimensions( int w, int h ) destroyTexture( ); createTexture( ); } else { - nvtxRangePushA( "reallocating host-side image memory" ); - _max_w = max( w, _max_w ); _max_h = max( h, _max_h ); _input_image_h.freeHost( popsift::CudaAllocated ); @@ -111,21 +102,15 @@ void Image::resetDimensions( int w, int h ) destroyTexture( ); createTexture( ); - - nvtxRangePop(); // "reallocating host-side image memory" } } void Image::allocate( int w, int h ) { - nvtxRangePushA( "allocating host-side image memory" ); - _input_image_h.allocHost( w, h, popsift::CudaAllocated ); _input_image_d.allocDev( w, h ); createTexture( ); - - nvtxRangePop(); // "allocating host-side image memory" } void Image::destroyTexture( ) @@ -222,8 +207,6 @@ void ImageFloat::resetDimensions( int w, int h ) destroyTexture( ); createTexture( ); } else { - nvtxRangePushA( "reallocating host-side image memory" ); - _max_w = max( w, _max_w ); _max_h = max( h, _max_h ); _input_image_h.freeHost( popsift::CudaAllocated ); @@ -235,21 +218,15 @@ void ImageFloat::resetDimensions( int w, int h ) destroyTexture( ); createTexture( ); - - nvtxRangePop(); // "reallocating host-side image memory" } } void ImageFloat::allocate( int w, int h ) { - nvtxRangePushA( "allocating host-side image memory" ); - _input_image_h.allocHost( w, h, popsift::CudaAllocated ); _input_image_d.allocDev( w, h ); createTexture( ); - - nvtxRangePop(); // "allocating host-side image memory" } void ImageFloat::destroyTexture( ) diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu index b34aaaa1..1f75229f 100644 --- a/src/popsift/s_orientation.cu +++ b/src/popsift/s_orientation.cu @@ -18,13 +18,6 @@ #include #include -#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX) -#include -#else -#define nvtxRangePushA(a) -#define nvtxRangePop() -#endif - using namespace popsift; using namespace std; diff --git a/src/popsift/sift_desc.cu b/src/popsift/sift_desc.cu index f533df35..4632298a 100644 --- a/src/popsift/sift_desc.cu +++ b/src/popsift/sift_desc.cu @@ -21,13 +21,6 @@ #include #include -#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX) -#include -#else -#define nvtxRangePushA(a) -#define nvtxRangePop() -#endif - using namespace popsift; using namespace std; @@ -55,11 +48,8 @@ using namespace std; __host__ void Pyramid::descriptors( const Config& conf ) { - nvtxRangePushA("Reading orientation count"); - readDescCountersFromDevice( _octaves[0].getStream() ); cudaStreamSynchronize( _octaves[0].getStream() ); - nvtxRangePop( ); for( int octave=_num_octaves-1; octave>=0; octave-- ) // for( int octave=0; octave<_num_octaves; octave++ ) diff --git a/src/popsift/sift_pyramid.cu b/src/popsift/sift_pyramid.cu index c03b0d61..340cffff 100644 --- a/src/popsift/sift_pyramid.cu +++ b/src/popsift/sift_pyramid.cu @@ -25,13 +25,6 @@ #define mkdir(path, perm) _mkdir(path) #endif -#if POPSIFT_IS_DEFINED(POPSIFT_USE_NVTX) -#include -#else -#define nvtxRangePushA(a) -#define nvtxRangePop() -#endif - #define PYRAMID_PRINT_DEBUG 0 using namespace std; @@ -285,12 +278,10 @@ FeaturesHost* Pyramid::get_descriptors( const Config& conf ) readDescCountersFromDevice(); - nvtxRangePushA( "download descriptors" ); FeaturesHost* features = new FeaturesHost( hct.ext_total, hct.ori_total ); if( hct.ext_total == 0 || hct.ori_total == 0 ) { - nvtxRangePop(); return features; } @@ -298,9 +289,7 @@ FeaturesHost* Pyramid::get_descriptors( const Config& conf ) prep_features<<>>( features->getDescriptors(), up_fac ); POP_SYNC_CHK; - nvtxRangePushA( "register host memory" ); features->pin( ); - nvtxRangePop(); popcuda_memcpy_async( features->getFeatures(), dobuf_shadow.features, hct.ext_total * sizeof(Feature), @@ -313,10 +302,7 @@ FeaturesHost* Pyramid::get_descriptors( const Config& conf ) cudaMemcpyDeviceToHost, _download_stream ); cudaStreamSynchronize( _download_stream ); - nvtxRangePushA( "unregister host memory" ); features->unpin( ); - nvtxRangePop(); - nvtxRangePop(); return features; } From 5e85a1de11a90e7f044f87333225a8992da4e15a Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Mon, 13 Jan 2025 10:39:36 +0100 Subject: [PATCH 2/3] Add new CMake policy for Boost integration --- src/application/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/application/CMakeLists.txt b/src/application/CMakeLists.txt index 2379c57d..d477f89d 100755 --- a/src/application/CMakeLists.txt +++ b/src/application/CMakeLists.txt @@ -39,6 +39,10 @@ endif() find_package(DevIL COMPONENTS IL ILU) # yields IL_FOUND, IL_LIBRARIES, IL_INCLUDE_DIR +# for newer CMake versions and Boost 1.70 pr newer must use Boost's make file +if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.30) + cmake_policy(SET CMP0167 NEW) +endif() if(PopSift_BOOST_USE_STATIC_LIBS) set(Boost_USE_STATIC_LIBS ON) endif() From 08939e05128ec8d4fe4a223211083c1fcb1369c1 Mon Sep 17 00:00:00 2001 From: Carsten Griwodz Date: Tue, 13 Aug 2024 15:50:37 +0200 Subject: [PATCH 3/3] [cuda] Adding a VLFeat-compliant feature descriptor --- src/CMakeLists.txt | 1 + src/application/main.cpp | 5 +- src/application/match.cpp | 5 +- src/popsift/s_desc_vlfeat.cu | 213 +++++++++++++++++++++++++++++++++++ src/popsift/s_desc_vlfeat.h | 19 ++++ src/popsift/s_gradiant.h | 31 +++++ src/popsift/sift_conf.cu | 21 ++++ src/popsift/sift_conf.h | 21 +++- src/popsift/sift_desc.cu | 3 + 9 files changed, 305 insertions(+), 14 deletions(-) create mode 100644 src/popsift/s_desc_vlfeat.cu create mode 100644 src/popsift/s_desc_vlfeat.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bd4b5faa..86fc3c1b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,6 +24,7 @@ add_library(popsift popsift/s_desc_grid.cu popsift/s_desc_grid.h popsift/s_desc_igrid.cu popsift/s_desc_igrid.h popsift/s_desc_notile.cu popsift/s_desc_notile.h + popsift/s_desc_vlfeat.cu popsift/s_desc_vlfeat.h popsift/s_desc_norm_rs.h popsift/s_desc_norm_l2.h popsift/s_desc_normalize.h diff --git a/src/application/main.cpp b/src/application/main.cpp index bf1128ff..4bd6a91a 100755 --- a/src/application/main.cpp +++ b/src/application/main.cpp @@ -73,10 +73,7 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in // "Choice of span (1-sided) for Gauss filters. Default is VLFeat-like computation depending on sigma. " // "Options are: vlfeat, relative, relative-all, opencv, fixed9, fixed15" ("desc-mode", value()->notifier([&](const std::string& s) { config.setDescMode(s); }), - "Choice of descriptor extraction modes:\n" - "loop, iloop, grid, igrid, notile\n" - "Default is loop\n" - "loop is OpenCV-like horizontal scanning, computing only valid points, grid extracts only useful points but rounds them, iloop uses linear texture and rotated gradiant fetching. igrid is grid with linear interpolation. notile is like igrid but avoids redundant gradiant fetching.") + popsift::Config::getDescModeUsage()) ("popsift-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::PopSift); }), "During the initial upscale, shift pixels by 1. In extrema refinement, steps up to 0.6, do not reject points when reaching max iterations, " "first contrast threshold is .8 * peak thresh. Shift feature coords octave 0 back to original pos.") diff --git a/src/application/match.cpp b/src/application/match.cpp index 3460975d..3a1af9fa 100755 --- a/src/application/match.cpp +++ b/src/application/match.cpp @@ -71,10 +71,7 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& lF ( "gauss-mode", value()->notifier([&](const std::string& s) { config.setGaussMode(s); }), popsift::Config::getGaussModeUsage() ) ("desc-mode", value()->notifier([&](const std::string& s) { config.setDescMode(s); }), - "Choice of descriptor extraction modes:\n" - "loop, iloop, grid, igrid, notile\n" - "Default is loop\n" - "loop is OpenCV-like horizontal scanning, computing only valid points, grid extracts only useful points but rounds them, iloop uses linear texture and rotated gradiant fetching. igrid is grid with linear interpolation. notile is like igrid but avoids redundant gradiant fetching.") + popsift::Config::getDescModeUsage() ) ("popsift-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::PopSift); }), "During the initial upscale, shift pixels by 1. In extrema refinement, steps up to 0.6, do not reject points when reaching max iterations, " "first contrast threshold is .8 * peak thresh. Shift feature coords octave 0 back to original pos.") diff --git a/src/popsift/s_desc_vlfeat.cu b/src/popsift/s_desc_vlfeat.cu new file mode 100644 index 00000000..5fc4df13 --- /dev/null +++ b/src/popsift/s_desc_vlfeat.cu @@ -0,0 +1,213 @@ +/* + * Copyright 2016-2017, Simula Research Laboratory + * 2018-2020, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#include "popsift/sift_config.h" + +#include "common/assist.h" +#include "common/debug_macros.h" +#include "common/vec_macros.h" +#include "s_desc_vlfeat.h" +#include "s_gradiant.h" +#include "sift_constants.h" +#include "sift_pyramid.h" + +#include + +using namespace popsift; + +__device__ static inline +void ext_desc_vlfeat_sub( const float ang, + const Extremum* ext, + float* __restrict__ features, + cudaTextureObject_t layer_tex, + const int width, + const int height ) +{ + const float x = ext->xpos; + const float y = ext->ypos; + const int level = ext->lpos; // old_level; + const float sig = ext->sigma; + const float SBP = fabsf(DESC_MAGNIFY * sig); + + if( SBP == 0 ) { + return; + } + + float cos_t; + float sin_t; + __sincosf( ang, &sin_t, &cos_t ); + + const float csbp = cos_t * SBP; + const float ssbp = sin_t * SBP; + const float crsbp = cos_t / SBP; + const float srsbp = sin_t / SBP; + + // We have 4x4*16 bins. + // There centers have the offsets -1.5, -0.5, 0.5, 1.5 from the + // keypoint. The points that support them stretch from -2 to 2 + const float2 maxdist = make_float2( -2.0f, -2.0f ); + + // We rotate the corner of the maximum range by the keypoint orientation. + // const float ptx = csbp * maxdist - ssbp * maxdist; + // const float pty = csbp * maxdist + ssbp * maxdist; + const float ptx = fabsf( ::fmaf( csbp, maxdist.x, -ssbp * maxdist.y ) ); + const float pty = fabsf( ::fmaf( csbp, maxdist.y, ssbp * maxdist.x ) ); + + const float bsz = 2.0f * ( fabsf(csbp) + fabsf(ssbp) ); + + const int xmin = max(1, (int)floorf(x - ptx - bsz)); + const int ymin = max(1, (int)floorf(y - pty - bsz)); + const int xmax = min(width - 2, (int)floorf(x + ptx + bsz)); + const int ymax = min(height - 2, (int)floorf(y + pty + bsz)); + + __shared__ float dpt[128]; + + for( int i=threadIdx.x; i<128; i+=blockDim.x ) + { + dpt[i] = 0.0f; + } + + __syncthreads(); + + for( int pix_y = ymin; pix_y <= ymax; pix_y += 1 ) + { + for( int base_x = xmin; base_x <= xmax; base_x += 32 ) + { + float mod; + float th; + + get_gradiant32( mod, th, base_x, pix_y, layer_tex, level ); + + mod /= 2.0f; // Our mod is double that of vlfeat. Huh. + + th -= ang; + while( th > M_PI2 ) th -= M_PI2; + while( th < 0.0f ) th += M_PI2; + __syncthreads(); + + const int pix_x = base_x + threadIdx.x; + + if( ( pix_y <= ymax ) && ( pix_x <= xmax ) ) + { + // d : distance from keypoint + const float2 d = make_float2( pix_x - x, pix_y - y ); + + // n : normalized distance from keypoint + const float2 n = make_float2( ::fmaf( crsbp, d.x, srsbp * d.y ), + ::fmaf( crsbp, d.y, -srsbp * d.x ) ); + + const float ww = __expf( -scalbnf(n.x*n.x + n.y*n.y, -3)); + + const float nt = 8.0f * th / M_PI2; + + // neighbouring tile on the lower side: -2, -1, 0 or 1 + // (must use floorf because casting rounds towards zero + const int3 t0 = make_int3( (int)floorf(n.x - 0.5f), + (int)floorf(n.y - 0.5f), + (int)nt ); + const float wgt_x = - ( n.x - ( t0.x + 0.5f ) ); + const float wgt_y = - ( n.y - ( t0.y + 0.5f ) ); + const float wgt_t = - ( nt - t0.z ); + + for( int tx=0; tx<2; tx++ ) + { + for( int ty=0; ty<2; ty++ ) + { + for( int tt=0; tt<2; tt++ ) + { + if( ( t0.y + ty >= -2 ) && + ( t0.y + ty < 2 ) && + ( t0.x + tx >= -2 ) && + ( t0.x + tx < 2 ) ) + { + float i_wgt_x = ( tx == 0 ) ? 1.0f + wgt_x : wgt_x; + float i_wgt_y = ( ty == 0 ) ? 1.0f + wgt_y : wgt_y; + float i_wgt_t = ( tt == 0 ) ? 1.0f + wgt_t : wgt_t; + + i_wgt_x = fabsf( i_wgt_x ); + i_wgt_y = fabsf( i_wgt_y ); + i_wgt_t = fabsf( i_wgt_t ); + + const float val = ww + * mod + * i_wgt_x + * i_wgt_y + * i_wgt_t; + + const int offset = 80 + + ( t0.y + ty ) * 32 + + ( t0.x + tx ) * 8 + + ( t0.z + tt ) % 8; + + atomicAdd( &dpt[offset], val ); + } + } + } + } + } + __syncthreads(); + } + } + + for( int i=threadIdx.x; i<128; i+=blockDim.x ) + { + features[i] = dpt[i]; + } +} + +__global__ void ext_desc_vlfeat( int octave, cudaTextureObject_t layer_tex, int w, int h) +{ + const int o_offset = dct.ori_ps[octave] + blockIdx.x; + Descriptor* desc = &dbuf.desc [o_offset]; + const int ext_idx = dobuf.feat_to_ext_map[o_offset]; + Extremum* ext = dobuf.extrema + ext_idx; + + const int ext_base = ext->idx_ori; + const int ori_num = o_offset - ext_base; + const float ang = ext->orientation[ori_num]; + + ext_desc_vlfeat_sub( ang, + ext, + desc->features, + layer_tex, + w, + h ); +} + +namespace popsift +{ + +bool start_ext_desc_vlfeat( const int octave, Octave& oct_obj ) +{ + dim3 block; + dim3 grid; + grid.x = hct.ori_ct[octave]; + grid.y = 1; + grid.z = 1; + + if( grid.x == 0 ) return false; + + block.x = 32; + block.y = 1; + block.z = 1; + + size_t shared_size = 4 * 128 * sizeof(float); + + ext_desc_vlfeat + <<>> + ( octave, + oct_obj.getDataTexPoint( ), + oct_obj.getWidth(), + oct_obj.getHeight() ); + + POP_SYNC_CHK; + + return true; +} + +}; // namespace popsift diff --git a/src/popsift/s_desc_vlfeat.h b/src/popsift/s_desc_vlfeat.h new file mode 100644 index 00000000..713f19d8 --- /dev/null +++ b/src/popsift/s_desc_vlfeat.h @@ -0,0 +1,19 @@ +/* + * Copyright 2016-2017, Simula Research Laboratory + * 2018-2020, University of Oslo + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ +#pragma once +#include "sift_octave.h" +#include "sift_pyramid.h" + +namespace popsift +{ + +bool start_ext_desc_vlfeat( const int octave, Octave& oct_obj ); + +}; // namespace popsift + diff --git a/src/popsift/s_gradiant.h b/src/popsift/s_gradiant.h index aaec9e2d..bfeadbd3 100644 --- a/src/popsift/s_gradiant.h +++ b/src/popsift/s_gradiant.h @@ -68,6 +68,37 @@ void get_gradiant( float& grad, theta = atan2f(dy, dx); } +/* A version of get_gradiant that works for a (32,1,1) threadblock + * and pulls data to shared memory before computing. Data is pulled + * less frequently, meaning that we do not rely on the L1 cache. + */ +__device__ static inline +void get_gradiant32( float& grad, + float& theta, + const int x, + const int y, + cudaTextureObject_t layer, + const int level ) +{ + const int idx = threadIdx.x; + + __shared__ float x_array[34]; + + for( int i=idx; i<34; i += blockDim.x ) + { + x_array[i] = readTex( layer, x+i-1.0f, y, level ); + } + __syncthreads(); + + const float dx = x_array[idx+2] - x_array[idx]; + + const float dy = readTex( layer, x+idx, y+1.0f, level ) + - readTex( layer, x+idx, y-1.0f, level ); + + grad = hypotf( dx, dy ); // __fsqrt_rz(dx*dx + dy*dy); + theta = atan2f(dy, dx); +} + __device__ static inline void get_gradiant( float& grad, float& theta, diff --git a/src/popsift/sift_conf.cu b/src/popsift/sift_conf.cu index 251f58ff..b58470e6 100644 --- a/src/popsift/sift_conf.cu +++ b/src/popsift/sift_conf.cu @@ -72,6 +72,8 @@ void Config::setDescMode( const std::string& text ) setDescMode( Config::IGrid ); else if( text == "notile" ) setDescMode( Config::NoTile ); + else if( text == "vlfeat" ) + setDescMode( Config::VLFeat_Desc ); else POP_FATAL( "specified descriptor extraction mode must be one of loop, grid or igrid" ); } @@ -81,6 +83,25 @@ void Config::setDescMode( Config::DescMode m ) _desc_mode = m; } +const char* Config::getDescModeUsage( ) +{ + return "Choice of descriptor extraction modes:\n" + "loop, iloop, grid, igrid, notile, vlfeat\n" + "Default is loop\n" + "loop is OpenCV-like horizontal scanning, sampling every pixel in a radius around the " + "centers or the 16 tiles arond the keypoint. Each sampled point contributes to two " + "histogram bins." + "iloop is like loop but samples all constant 1-pixel distances from the keypoint, " + "using the CUDA texture engine for interpolation. " + "grid is like loop but works on rotated, normalized tiles, relying on CUDA 2D cache " + "to replace the manual data aligment idea of loop. " + "igrid iloop and grid. " + "notile is like igrid but handles all 16 tiles at once.\n" + "vlfeat is VLFeat-like horizontal scanning, sampling every pixel in a radius around " + "keypoint itself, using the 16 tile centers only for weighting. Every sampled point " + "contributes to up to eight historgram bins."; +} + void Config::setGaussMode( const std::string& m ) { if( m == "vlfeat" ) diff --git a/src/popsift/sift_conf.h b/src/popsift/sift_conf.h index 583a958c..7b3f2ccd 100644 --- a/src/popsift/sift_conf.h +++ b/src/popsift/sift_conf.h @@ -84,16 +84,20 @@ struct Config */ enum DescMode { - /// scan horizontal, extract valid points + /// scan horizontal, extract valid points - weight goes into 2 histogram bins Loop, - /// scan horizontal, extract valid points, interpolate with tex engine + /// loop-compatible; scan horizontal, extract valid points, interpolate with tex engine ILoop, - /// scan in rotated mode, round pixel address + /// loop-compatible; scan in rotated mode, round pixel address Grid, - /// scan in rotated mode, interpolate with tex engine + /// loop-compatible; scan in rotated mode, interpolate with tex engine IGrid, - /// variant of IGrid, no duplicate gradient fetching - NoTile + /// loop-compatible; variant of IGrid, no duplicate gradient fetching + NoTile, + /** extraction code according to VLFeat, similar to loop, weight goes into + * up to 8 histogram bins + */ + VLFeat_Desc }; /** @@ -182,6 +186,11 @@ struct Config */ void setDescMode( DescMode mode = Loop ); + /** + * @brief Helper functions for the main program's usage string. + */ + static const char* getDescModeUsage( ); + // void setGaussGroup( int groupsize ); // int getGaussGroup( ) const; diff --git a/src/popsift/sift_desc.cu b/src/popsift/sift_desc.cu index 4632298a..c03aee5f 100644 --- a/src/popsift/sift_desc.cu +++ b/src/popsift/sift_desc.cu @@ -13,6 +13,7 @@ #include "s_desc_loop.h" #include "s_desc_normalize.h" #include "s_desc_notile.h" +#include "s_desc_vlfeat.h" #include "s_gradiant.h" #include "sift_config.h" #include "sift_constants.h" @@ -67,6 +68,8 @@ void Pyramid::descriptors( const Config& conf ) start_ext_desc_igrid( octave, oct_obj ); } else if( conf.getDescMode() == Config::NoTile ) { start_ext_desc_notile( octave, oct_obj ); + } else if( conf.getDescMode() == Config::VLFeat_Desc ) { + start_ext_desc_vlfeat( octave, oct_obj ); } else { POP_FATAL( "not yet" ); }