From 4c0b474f5c8e40f5b72d8aa623bbc95a4ae4bd2e Mon Sep 17 00:00:00 2001
From: unknown
Date: Tue, 16 Apr 2024 19:44:20 +0200
Subject: [PATCH 1/2] dirty but working support for VK_NV_cuda_kernel_launch!

---
 src/main.cpp | 277 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 271 insertions(+), 6 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index fdff138..b1ea8af 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,3 +1,6 @@
+//https://github.com/NVIDIA/cuda-samples/blob/master/Common/helper_math.h
+// lower the requirements to Vulkan 1.1 or Vulkan 1.0
+// pointers check
 #include <vulkan/vulkan.h>
 
@@ -67,6 +70,7 @@ class ComputeApplication {
     VkPipeline pipeline;
     VkPipelineLayout pipelineLayout;
     VkShaderModule computeShaderModule;
+    VkCudaModuleNV computeShaderModuleCUDA;
 
     /*
     The command buffer is used to record commands, that will be submitted to a queue.
@@ -133,6 +137,7 @@ class ComputeApplication {
         createDescriptorSetLayout();
         createDescriptorSet();
         createComputePipeline();
+        createComputePipelineCUDA();
         createCommandBuffer();
 
         // Finally, run the recorded command buffer.
@@ -209,7 +214,7 @@ class ComputeApplication {
         bool foundLayer = false;
         for (VkLayerProperties prop : layerProperties) {
 
-            if (strcmp("VK_LAYER_LUNARG_standard_validation", prop.layerName) == 0) {
+            if (strcmp("VK_LAYER_KHRONOS_validation", prop.layerName) == 0) {
                 foundLayer = true;
                 break;
             }
@@ -217,9 +222,9 @@ class ComputeApplication {
         }
 
         if (!foundLayer) {
-            throw std::runtime_error("Layer VK_LAYER_LUNARG_standard_validation not supported\n");
+            throw std::runtime_error("Layer VK_LAYER_KHRONOS_validation not supported\n");
         }
-        enabledLayers.push_back("VK_LAYER_LUNARG_standard_validation"); // Alright, we can use this layer.
+        enabledLayers.push_back("VK_LAYER_KHRONOS_validation"); // Alright, we can use this layer.
 
         /*
         We need to enable an extension named VK_EXT_DEBUG_REPORT_EXTENSION_NAME,
@@ -249,6 +254,8 @@ class ComputeApplication {
             enabledExtensions.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME);
         }
 
+        //enabledExtensions.push_back("VK_KHR_get_physical_device_properties2");
+        //enabledExtensions.push_back("VK_KHR_device_group_creation");
         /*
         Next, we actually create the instance.
@@ -264,8 +271,13 @@ class ComputeApplication {
         applicationInfo.applicationVersion = 0;
         applicationInfo.pEngineName = "awesomeengine";
         applicationInfo.engineVersion = 0;
-        applicationInfo.apiVersion = VK_API_VERSION_1_0;;
-
+#if 1
+        //applicationInfo.apiVersion = VK_API_VERSION_1_0;
+        //applicationInfo.apiVersion = VK_API_VERSION_1_1;
+        applicationInfo.apiVersion = VK_API_VERSION_1_2;
+#else
+        applicationInfo.apiVersion = VK_API_VERSION_1_3;
+#endif
         VkInstanceCreateInfo createInfo = {};
         createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
         createInfo.flags = 0;
@@ -274,6 +286,7 @@ class ComputeApplication {
         // Give our desired layers and extensions to vulkan.
         createInfo.enabledLayerCount = enabledLayers.size();
         createInfo.ppEnabledLayerNames = enabledLayers.data();
+        //enabledExtensions.push_back(VK_NV_CUDA_KERNEL_LAUNCH_EXTENSION_NAME);
         createInfo.enabledExtensionCount = enabledExtensions.size();
         createInfo.ppEnabledExtensionNames = enabledExtensions.data();
 
@@ -383,6 +396,7 @@ class ComputeApplication {
         return i;
     }
 
+    int launch_cuda = 0;
    void createDevice() {
        /*
        We create the logical device in this function.
@@ -408,17 +422,59 @@ class ComputeApplication {
 
        // Specify any desired device features here. We do not need any for this application, though.
        VkPhysicalDeviceFeatures deviceFeatures = {};
+        VkPhysicalDeviceVulkan11Features features11 = {};
+        VkPhysicalDeviceBufferDeviceAddressFeatures buffea = {};
+        VkPhysicalDeviceFeatures2 physical_features2 = {};
+        //physical_features2.pNext = &features11;
+        physical_features2.pNext = &buffea;
+        physical_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+        buffea.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR;//VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_ADDRESS_FEATURES_EXT;
+
+        vkGetPhysicalDeviceFeatures2(physicalDevice, &physical_features2);
+
         deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
         deviceCreateInfo.enabledLayerCount = enabledLayers.size();  // need to specify validation layers here as well.
         deviceCreateInfo.ppEnabledLayerNames = enabledLayers.data();
         deviceCreateInfo.pQueueCreateInfos = &queueCreateInfo; // when creating the logical device, we also specify what queues it has.
         deviceCreateInfo.queueCreateInfoCount = 1;
+        //https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/enabling_buffer_device_address.html
+#if 0
         deviceCreateInfo.pEnabledFeatures = &deviceFeatures;
+#else
+        deviceCreateInfo.pEnabledFeatures = {};
+        deviceCreateInfo.pNext = &physical_features2;
+#endif
+        std::vector<const char*> enabled_devExtensions = {};
+        uint32_t count = 0;
+        vkEnumerateDeviceExtensionProperties(physicalDevice, nullptr, &count, nullptr);
+        std::vector<VkExtensionProperties> extensions(count);
+        vkEnumerateDeviceExtensionProperties(physicalDevice, nullptr, &count, extensions.data());
+
+        // Check whether VK_NV_cuda_kernel_launch is supported
+        for (uint32_t i = 0; i < count; i++) {
+            if (strcmp(VK_NV_CUDA_KERNEL_LAUNCH_EXTENSION_NAME, extensions[i].extensionName) == 0) {
+                launch_cuda = 1;
+                enabled_devExtensions.push_back("VK_NV_cuda_kernel_launch");
+                //enabled_devExtensions.push_back("VK_KHR_buffer_device_address");
+                //enabled_devExtensions.push_back("VK_KHR_device_group");
+
+                // these last 2 are not device extensions
+                // enabled_devExtensions.push_back("VK_KHR_get_physical_device_properties2");
+                //enabled_devExtensions.push_back("VK_KHR_device_group_creation");
+                break; // VK_NV_cuda_kernel_launch is supported
+            }
+        }
+
+        deviceCreateInfo.ppEnabledExtensionNames = enabled_devExtensions.data();
+        deviceCreateInfo.enabledExtensionCount = enabled_devExtensions.size();
+
         VK_CHECK_RESULT(vkCreateDevice(physicalDevice, &deviceCreateInfo, NULL, &device)); // create logical device.
 
         // Get a handle to the only member of the queue family.
         vkGetDeviceQueue(device, queueFamilyIndex, 0, &queue);
+
+
     }
 
     // find memory type with desired properties.
@@ -448,7 +504,11 @@ class ComputeApplication {
         VkBufferCreateInfo bufferCreateInfo = {};
         bufferCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
         bufferCreateInfo.size = bufferSize; // buffer size in bytes.
+#if 1//cuda
+        bufferCreateInfo.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT; // buffer is used as a storage buffer.
+#else
         bufferCreateInfo.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; // buffer is used as a storage buffer.
+#endif
         bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; // buffer is exclusive to a single queue family at a time.
 
         VK_CHECK_RESULT(vkCreateBuffer(device, &bufferCreateInfo, NULL, &buffer)); // create buffer.
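+
+        // With VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT set above, the raw
+        // VkDeviceAddress of this buffer can later be queried with
+        // vkGetBufferDeviceAddress() and passed to the CUDA kernel as its
+        // float4* parameter (see createComputePipelineCUDA()). This relies on
+        // the bufferDeviceAddress feature enabled in createDevice() and on
+        // VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT being added when the memory
+        // is allocated below.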
@@ -482,6 +542,17 @@ class ComputeApplication {
         allocateInfo.memoryTypeIndex = findMemoryType(
             memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
+#if 1
+        VkMemoryAllocateFlagsInfo flagsinfo;
+        flagsinfo.deviceMask = 0;
+        flagsinfo.pNext = NULL;
+        flagsinfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO_KHR;
+        flagsinfo.flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT;
+        allocateInfo.pNext = &flagsinfo;
+
+
+#endif
+
         VK_CHECK_RESULT(vkAllocateMemory(device, &allocateInfo, NULL, &bufferMemory)); // allocate memory on device.
 
         // Now associate that allocated memory with the buffer. With that, the buffer is backed by actual memory.
@@ -582,6 +653,7 @@ class ComputeApplication {
         FILE* fp = fopen(filename, "rb");
         if (fp == NULL) {
             printf("Could not find or open file: %s\n", filename);
+            return(NULL);
         }
 
         // get file size.
@@ -605,6 +677,187 @@ class ComputeApplication {
         return (uint32_t *)str;
     }
 
+    int writeFile(void* buf, uint32_t length, const char* filename) {
+
+        FILE* fp = fopen(filename, "wb");
+        if (fp == NULL) {
+            printf("Could not find or open file: %s\n", filename);
+            return 0;
+        }
+
+        // fwrite(ptr, size, count, stream): write 'length' one-byte items so the
+        // return value can be compared against 'length'.
+        int written = (int)fwrite(buf, sizeof(char), length, fp);
+        fclose(fp);
+
+        return (written == (int)length);
+    }
+
+    VkCudaLaunchInfoNV cudalaunch_inf;
+    //int launch_cuda = 1;
+
+    PFN_vkCreateCudaModuleNV vkCreateCudaModuleNV;
+    PFN_vkGetCudaModuleCacheNV vkGetCudaModuleCacheNV;
+    PFN_vkCreateCudaFunctionNV vkCreateCudaFunctionNV;
+
+    PFN_vkCmdCudaLaunchKernelNV vkCmdCudaLaunchKernelNV;
+    PFN_vkDestroyCudaModuleNV vkDestroyCudaModuleNV;
+    PFN_vkDestroyCudaFunctionNV vkDestroyCudaFunctionNV;
+    VkCudaFunctionNV cuda_fun;
+
+    void createComputePipelineCUDA() {
+        /*
+        We create a compute pipeline here.
+        */
+
+#define GET_VKFUNC(a) a=(PFN_##a)vkGetDeviceProcAddr(device, #a); if (a==NULL){printf("Error getting function pointer %s\n",#a); exit(1);/*return;*/}
+
+#if 1
+        GET_VKFUNC(vkCreateCudaModuleNV);
+        GET_VKFUNC(vkGetCudaModuleCacheNV);
+        GET_VKFUNC(vkCreateCudaFunctionNV);
+        GET_VKFUNC(vkCmdCudaLaunchKernelNV);
+        GET_VKFUNC(vkDestroyCudaModuleNV);
+        GET_VKFUNC(vkDestroyCudaFunctionNV);
+
+#else
+        vkCreateCudaModuleNV = (PFN_vkCreateCudaModuleNV)vkGetDeviceProcAddr(device, "vkCreateCudaModuleNV");
+        vkGetCudaModuleCacheNV = (PFN_vkGetCudaModuleCacheNV)vkGetDeviceProcAddr(device, "vkGetCudaModuleCacheNV");
+        vkCreateCudaFunctionNV = (PFN_vkCreateCudaFunctionNV)vkGetDeviceProcAddr(device, "vkCreateCudaFunctionNV");
+        vkCmdCudaLaunchKernelNV = (PFN_vkCmdCudaLaunchKernelNV)vkGetDeviceProcAddr(device, "vkCmdCudaLaunchKernelNV");
+
+        vkDestroyCudaModuleNV = (PFN_vkDestroyCudaModuleNV)vkGetDeviceProcAddr(device, "vkDestroyCudaModuleNV");
+        vkDestroyCudaFunctionNV = (PFN_vkDestroyCudaFunctionNV)vkGetDeviceProcAddr(device, "vkDestroyCudaFunctionNV");
+#endif
+
+        /*
+        Create a shader module. A shader module basically just encapsulates some shader code.
+        */
+        uint32_t filelength;
+        // shader.ptx is built from shaders/shader.cu (shaders/hazptx.bat);
+        // cache.ptx.bin is the driver-compiled module cache written out below.
+        uint32_t* code;
+        int cachedread = 0;
+#if 1
+        const char* shaderfile = "shaders/shader.ptx";
+        const char* cachedfile = "shaders/cache.ptx.bin";
+#else
+        const char* shaderfile = "shaders/NVIDIA GeForce RTX 4070.12040_64.ptx";
+#endif
+        code = readFile(filelength, cachedfile);
+        if (!code)
+        {
+            cachedread = 0;
+            code = readFile(filelength, shaderfile);//"shaders/NVIDIA GeForce RTX 4070.12040_64.ptx");
+        }
+        else
+            cachedread = 1;
+        VkCudaModuleCreateInfoNV createInfo = {};
+        createInfo.sType = VK_STRUCTURE_TYPE_CUDA_MODULE_CREATE_INFO_NV;
+        createInfo.pData = code;
+        createInfo.dataSize = filelength;
+        createInfo.pNext = NULL;
+
+        VK_CHECK_RESULT(vkCreateCudaModuleNV(device, &createInfo, NULL, &computeShaderModuleCUDA));
+        delete[] code;
+
+        void* cuda_CacheData = NULL;
+        size_t cuda_CacheSize = 0;
+        if (cachedread == 0)
+        {
+
+            VK_CHECK_RESULT(vkGetCudaModuleCacheNV(device, computeShaderModuleCUDA, &cuda_CacheSize, NULL));
+            cuda_CacheData = malloc(cuda_CacheSize + 1);
+            VK_CHECK_RESULT(vkGetCudaModuleCacheNV(device, computeShaderModuleCUDA, &cuda_CacheSize, cuda_CacheData));
+            writeFile(cuda_CacheData, cuda_CacheSize, cachedfile /*"shaders/cache.ptx.bin"*/);
+            free(cuda_CacheData);
+        }
+
+        VkCudaFunctionCreateInfoNV createinf_fn;
+        createinf_fn.sType = VK_STRUCTURE_TYPE_CUDA_FUNCTION_CREATE_INFO_NV;
+        createinf_fn.pNext = NULL;
+        createinf_fn.module = computeShaderModuleCUDA;
+#if 1
+        //createinf_fn.pName = "main2";
+        createinf_fn.pName = "_Z5main2P6float4";
+#else
+        createinf_fn.pName = "_Z5main2v";
+#endif
+        // _Z5main2v
+        //"wmma_matmul";// "main";
+
+        VK_CHECK_RESULT(vkCreateCudaFunctionNV(device, &createinf_fn, NULL, &cuda_fun));
+
+        // VkCudaLaunchInfoNV cudalaunch_inf;
+        cudalaunch_inf.blockDimX = WORKGROUP_SIZE;
+        cudalaunch_inf.blockDimY = WORKGROUP_SIZE;
+        cudalaunch_inf.blockDimZ = 1;
+        cudalaunch_inf.gridDimX = 1;
+        cudalaunch_inf.gridDimY = 1;
+        cudalaunch_inf.gridDimZ = 1;
+        cudalaunch_inf.sType = VK_STRUCTURE_TYPE_CUDA_LAUNCH_INFO_NV;
+        cudalaunch_inf.sharedMemBytes = 0;
+        cudalaunch_inf.pNext = NULL;
+        cudalaunch_inf.function = cuda_fun;
+        cudalaunch_inf.pExtras = NULL;
+        cudalaunch_inf.extraCount = 0;
+
+#if 0
+        cudalaunch_inf.paramCount = 0;
+        cudalaunch_inf.pParams = NULL;
+#else
+        cudalaunch_inf.paramCount = 1;
+        VkBufferDeviceAddressInfo infobuf;
+        infobuf.buffer = buffer;
+        infobuf.pNext = NULL;
+        infobuf.sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO;
+        // static: pParams must still point at valid storage when the launch is
+        // recorded later in createCommandBuffer().
+        static VkDeviceAddress address = vkGetBufferDeviceAddress(device, &infobuf);
+        //void* params[] = { (void*)address,NULL };
+#if 0
+        static void* params[] = { (void*)&buffer,NULL};
+#else
+        static void* params[] = { (void*)/*buffer*/&address,NULL };
+#endif
+        //cudalaunch_inf.pParams = (void**)&address;// (void**)&buffer;
+        cudalaunch_inf.pParams = params;// (void**)&buffer;
+#endif
+
+        //vkCmdCudaLaunchKernelNV( commandBuffer, &cudalaunch_inf);
+#if 0
+        /*
+        Now let us actually create the compute pipeline.
+        A compute pipeline is very simple compared to a graphics pipeline.
+        It only consists of a single stage with a compute shader.
+
+        So first we specify the compute shader stage, and its entry point (main).
+ */ + VkPipelineShaderStageCreateInfo shaderStageCreateInfo = {}; + shaderStageCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + shaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT; + shaderStageCreateInfo.module = computeShaderModule; + shaderStageCreateInfo.pName = "main"; + + /* + The pipeline layout allows the pipeline to access descriptor sets. + So we just specify the descriptor set layout we created earlier. + */ + VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo = {}; + pipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + pipelineLayoutCreateInfo.setLayoutCount = 1; + pipelineLayoutCreateInfo.pSetLayouts = &descriptorSetLayout; + VK_CHECK_RESULT(vkCreatePipelineLayout(device, &pipelineLayoutCreateInfo, NULL, &pipelineLayout)); + + VkComputePipelineCreateInfo pipelineCreateInfo = {}; + pipelineCreateInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + pipelineCreateInfo.stage = shaderStageCreateInfo; + pipelineCreateInfo.layout = pipelineLayout; + + /* + Now, we finally create the compute pipeline. + */ + VK_CHECK_RESULT(vkCreateComputePipelines( + device, VK_NULL_HANDLE, + 1, &pipelineCreateInfo, + NULL, &pipeline)); +#endif + } + void createComputePipeline() { /* We create a compute pipeline here. @@ -710,7 +963,15 @@ class ComputeApplication { The number of workgroups is specified in the arguments. If you are already familiar with compute shaders from OpenGL, this should be nothing new to you. */ - vkCmdDispatch(commandBuffer, (uint32_t)ceil(WIDTH / float(WORKGROUP_SIZE)), (uint32_t)ceil(HEIGHT / float(WORKGROUP_SIZE)), 1); + if (launch_cuda) + { + cudalaunch_inf.gridDimX = (uint32_t)ceil(WIDTH / float(WORKGROUP_SIZE)); + cudalaunch_inf.gridDimY = (uint32_t)ceil(HEIGHT / float(WORKGROUP_SIZE)); + cudalaunch_inf.gridDimZ = 1; + vkCmdCudaLaunchKernelNV(commandBuffer, &cudalaunch_inf); + } + else + vkCmdDispatch(commandBuffer, (uint32_t)ceil(WIDTH / float(WORKGROUP_SIZE)), (uint32_t)ceil(HEIGHT / float(WORKGROUP_SIZE)), 1); VK_CHECK_RESULT(vkEndCommandBuffer(commandBuffer)); // end recording commands. 
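+
+        // Note on the CUDA path above: VkCudaLaunchInfoNV expresses the same
+        // geometry as vkCmdDispatch, split the CUDA way. blockDim[XYZ]
+        // (WORKGROUP_SIZE x WORKGROUP_SIZE x 1, set in createComputePipelineCUDA())
+        // plays the role of the compute shader's local workgroup size, and
+        // gridDim[XYZ] is the number of workgroups, i.e. the same
+        // ceil(WIDTH / WORKGROUP_SIZE) x ceil(HEIGHT / WORKGROUP_SIZE) x 1
+        // that the vkCmdDispatch fallback receives.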
} @@ -764,6 +1025,10 @@ class ComputeApplication { func(instance, debugReportCallback, NULL); } + vkDestroyCudaModuleNV(device, computeShaderModuleCUDA, NULL); + vkDestroyCudaFunctionNV(device, cuda_fun, NULL); + + vkFreeMemory(device, bufferMemory, NULL); vkDestroyBuffer(device, buffer, NULL); vkDestroyShaderModule(device, computeShaderModule, NULL); From 8674359b522e04219dfc0afb656fdb3908da28c0 Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 16 Apr 2024 19:54:20 +0200 Subject: [PATCH 2/2] add CUDA shaders --- shaders/cache.ptx.bin | Bin 0 -> 11768 bytes shaders/hazptx.bat | 1 + shaders/shader.cu | 183 ++++++++++++++ shaders/shader.ptx | 570 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 754 insertions(+) create mode 100644 shaders/cache.ptx.bin create mode 100644 shaders/hazptx.bat create mode 100644 shaders/shader.cu create mode 100644 shaders/shader.ptx diff --git a/shaders/cache.ptx.bin b/shaders/cache.ptx.bin new file mode 100644 index 0000000000000000000000000000000000000000..0d71739199ff518d59bad61cf3ffda0f40172d88 GIT binary patch literal 11768 zcmeHNZ;V`36~FWT&D)*n%6Vn7XS5C|ZU+7Dtd#)|zwOiT($0t!BVzjN<>^XBcg z1Y%-8_yYUhx#!+<&pG#;-#zzEm#05-ol_`WUR3&V)T8Q<`Q~fmC7RfHk$&Ie-=f;; zqsqhInkp!@ZE^o%FY3+A;>*65Zr8NDJB$10_rz-3{2kj4%=VWxLQ%3qtaxPK7m+$B-&MkbcJJW-Eavol5(axa-TNUFvh1UGL1Ff#Atl}+8 zdv#U&5wsN4pnaq1aX;R!Dj90<`l+;+2My-ydAv7c-qQ`;P%}WV1*z?-b*uXCly+}b z`!}TBqR4>#fV78$2J`hZwAX7HdnYs_!P7E*_|3;v-XXrL<-7X*ttw3J>TeC@Qz^Dz zu}Zpbur_&A*P{+1py|y8=?IR$+ED!F!}t<)C$zn`>FN->{jE2?#Hbd+U+>p=dqINfTslKcJR#nrHQqr`(sbqZ5O-5h*!N){@$eYIe z$3+G7r~ICS7{7kn@*UCd`f2HMVMR^$92nG}atIHwf3oK~lqXh_jvR@1NfgqthmgPi zNg1#CE;EXG!1;E4OQ|-$Mc;Ad#FT{j2-T;mpgnaI{;>2;=(AhEdqwPPMUEt8k?V14 zTwQDZnXzA1D7D{9D!rG^V!sR3H{|T!kg=cY_ov)}{VwGRN{D@4&VC=P4H^4g*M~XM z|BUuyzO4OpmZ8P|4}w0geIN+sPNH24#dFYzP&!H2w(-o4wxZ${ogKGtI*3cPfe#r&mMwve`J zBcG5jyO115MP=BFe3`)f=-;>UF`to#7V6t`;J^Nq@DZPoKcr7yU+5F)J*zF_n~^`z zSM)1-seM{Msyn0~G{rN}5B>XW(f;#|d_(^a^tkr{g6ZoMKt*!UFC=fi5oIV4dpeG)kueCS~31w0bVXSb{$%{>0v(N`9l`pKu^Y@+P5!2 zf8l$Ry|Q}#x#zAC40^fhO~ypiw&nxhn$Z7v^_jMvuNjxi!*r>X(oe9z6CQqRGw=~Z zAP;yO_IJIoOU=SQ*y}F_O2YlJrc$j6>JwMj^xU6O@1S0AU#dRy7=0%71?c?~5BywA z;Qksb_sJ_B|Dn*ZhxNls27fpoe7~$t4E%|ez@yir5vzpTXShM{Pc7)nddu!_eyq`2 znf>eTL!Q!L54o7Hj{S_jOIA>CfPUnlXj-uTuE$lH*H7MNJyJzISOH<`LG*7$IQpPN zf%OXf(THmaeP5wI=F=^X{M`y&kLyvSzR(ZY4hi%p?1BD`*mouV1N0KQn9x5mALw$Vii{7tUM#9wODb~#UsMS<48b_Dv2tE~s6e!&cQHg8jP4Nv{# zYK%VteKB72t6}`+3(|NEM!|~ z7OVf59lz@s_NNSu4gttdC{6_2u-j-9mb<0`2b)u@)&72=vQ80PZ>bt&MYh z27kjL{H@hxe2Tw1qXmxzf2$(j;_sMKtBJiejR%L99Il+!N8_(fzybbNhw<0T;cuCt zoWb8f@GOJB&H!(TzZ&lZZ%+|_Gk7fc>j?h(#9yvo=t}+z9>bnST$FfsQv4kg{4H|* zHU5q<-ekQnz7Yq~U-16-#M>eq4S%Ni8wg8i!Cz0;GsNFL72qw)$38~jncb$wqR^4` zCLW&{2i|&${#Syx#AD+R;xXdcp1(ulFYE*UmQ(%6uj_nqOg#}sj-Buf`0FHs9Vz~1 z@@;MW4YTpxjQvs;e{udM-s5Qte(?T=cxvwhSbx?8>@+^Eg~!4+cszu^&?k(&0p31A zTDF%d_|#J`!2b1c2k=$$2mYU2?-q%#79Xdv{|SVp`05s!KjL%v5At*P>IcMExl;fi z_*Tl{>(F@ISLpbp3($Wu_5`l)U#spgzUup^0lw<`wrcRzIb_xYu9x_JS9}eJ;%^On4c{BS zdLCnAfUo5|zUuQJ@k{X4)%DjI@inZDi~k1oZFvY^gIs;<4eDFO>sVA3yh@0t#M`hI z(BXo&h!@naIeZOr`0C~G)eoE0hj@;BsmF5Nl)=}c?012z2kYBFt}+Mpt*$RUyVIE%x^&|2d_$vMXi}OuY*EVZ3gDhm6Pd zIF1bAs}~Soxd$?Tow%sJckI|F1n&j!Jc(byH{u=cYZ*H9DdPutDSSCYg5T-|WKD^F z!gu99RD8hkvF||sqh`Ju>Q~%Xe8$o;zvyr3$H`Fa(Rj-JpchcoRKQID^0wUJjHoZ7 zTru(bjU^@ZO>lm#O#Thnuk>T@3j)DcsW-r%UMi?24dQ(QXq@MSuMED`J!A#(BLg$7A9N_&DES1I9!834LCF2hzK|pN9T8zXw9^?8@&ut-Jl9`k410+50){ zSJZ7;J^rdo`JlPJpkEhxn}m-sAJOupUX;bs_9A|e5XrK78hFP0amYU{o2BNb@=v@$ z`MlrJ@=rj%o=?wz0`sF@;(0;eKPjL0>yVENkYqVoxkt{cR-b74HL-73L#f@F`K&&2 zK8HWj{#yq6t4x2?N8VmG`C^T~L` 
zd$?HC^&*pW7C!?HNn1NZ>A`Yxxz#3+~GxU+P<_cd2hLM5^?s z1cRQw6!}(>pHd`$m=94={?&xoCv;d69%M4!CZlx~rplmySW5p@gB~w@X%oEH_i46% z9=abx|5{WM`|SM(?srNw-_nS0^>56Vz5mn~5bz)31^klm`TM8&z2+f0)ZFKhPg2&~ z@Q>&CuV6YG54ew_Y1j|3UW`w~_x+jvI1g+4R?zF^>L&QR)WoOg%lpyD)%>>bHzK~w ze(?{{>Fr7ElwKOc@-oRf>gf(A^(+0G?|YHvbkh#z zqbqwe{zZhLt&K)(+Wph|tJ_Xn^vA+cca8CD`Ty5v7;k@$+E3x;S!?_b@YuynXS)sz zozP9jf9LfFJ>-iu1;^X= WIDTH || idy >= HEIGHT) + return; + + float x = float(idx) / float(WIDTH); + float y = float(idy) / float(HEIGHT); + + +#if 0 + /* + What follows is code for rendering the mandelbrot set. + */ + vec2 uv = vec2{ x,y }; + float n = 0.0; + vec2 temp1 = vec2{ uv.x - 0.5f, uv.y - 0.5f }; + temp1.x = (2.0 + 1.7 * 0.2); + temp1.y = (2.0 + 1.7 * 0.2); + vec2 c = vec2{ -.445, 0.0 }; + c.x = c.x + temp1.x; + c.y = c.y + temp1.y; + vec2 z = vec2{0,0}; + const int M =128; + for (int i = 0; i 2) break; + n++; + } + + // we use a simple cosine palette to determine color: + // http://iquilezles.org/www/articles/palettes/palettes.htm + float t = float(n) / float(M); + vec3 d = vec3{ 0.3f, 0.3f ,0.5f }; + vec3 e = vec3{ -0.2f, -0.3f ,-0.5f }; + vec3 f = vec3{ 2.1f, 2.0f, 3.0f }; + vec3 g = vec3{ 0.0f, 0.1f, 0.0f }; + vec3 hh = d + e * mycos(6.28318 * (f * t + g)); + vec4 color = vec4{ hh.x,hh.y,hh.z,1.0 }; +#else + + /* + What follows is code for rendering the mandelbrot set. + */ + vec2 uv = vec2a(x, y); + float n = 0.0; + vec2 c = vec2a(-.445, 0.0) + (uv - 0.5) * (2.0 + 1.7 * 0.2), + z = vec2a(0.0); + const int M = 128; + for (int i = 0; i < M; i++) + { + z = vec2a(z.x * z.x - z.y * z.y, 2.f * z.x * z.y) + c; + if (dot(z, z) > 2) break; + n++; + } + + // we use a simple cosine palette to determine color: + // http://iquilezles.org/www/articles/palettes/palettes.htm + float t = float(n) / float(M); + vec3 d = vec3d(0.3, 0.3, 0.5); + vec3 e = vec3d(-0.2, -0.3, -0.5); + vec3 f = vec3d(2.1, 2.0, 3.0); + vec3 g = vec3d(0.0, 0.1, 0.0); + vec4 color = vec4a(d + e * mycos(6.28318 * (f * t + g)), 1.0); +#endif + // store the rendered mandelbrot set into a storage buffer: +#ifdef USE_ARG + imageData[WIDTH * idy + idx] = float4{ color.x,color.y,color.z,1.0 }; + //imageData[WIDTH * idy + idx] = float4{ color.x,color.y,color.z,color.w }; + //imageData[WIDTH * idy + idx] = float4{ 1.0,0.0,0.0,1.0 }; +#endif +} \ No newline at end of file diff --git a/shaders/shader.ptx b/shaders/shader.ptx new file mode 100644 index 0000000..67aaee5 --- /dev/null +++ b/shaders/shader.ptx @@ -0,0 +1,570 @@ +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-33961263 +// Cuda compilation tools, release 12.4, V12.4.99 +// Based on NVVM 7.0.1 +// + +.version 8.4 +.target sm_89 +.address_size 64 + + // .globl _Z5main2P6float4 +.global .align 4 .b8 __cudart_i2opi_f[24] = {65, 144, 67, 60, 153, 149, 98, 219, 192, 221, 52, 245, 209, 87, 39, 252, 41, 21, 68, 78, 110, 131, 249, 162}; + +.visible .entry _Z5main2P6float4( + .param .u64 _Z5main2P6float4_param_0 +) +{ + .local .align 4 .b8 __local_depot0[28]; + .reg .b64 %SP; + .reg .b64 %SPL; + .reg .pred %p<43>; + .reg .f32 %f<208>; + .reg .b32 %r<176>; + .reg .f64 %fd<7>; + .reg .b64 %rd<65>; + + + mov.u64 %SPL, __local_depot0; + ld.param.u64 %rd21, [_Z5main2P6float4_param_0]; + add.u64 %rd1, %SPL, 0; + mov.u32 %r62, %ctaid.x; + mov.u32 %r63, %ntid.x; + mov.u32 %r64, %tid.x; + mad.lo.s32 %r1, %r63, %r62, %r64; + mov.u32 %r65, %ctaid.y; + mov.u32 %r66, %ntid.y; + mov.u32 %r67, %tid.y; + mad.lo.s32 %r2, %r66, %r65, %r67; + setp.gt.s32 %p1, %r1, 3199; + setp.gt.s32 %p2, %r2, 2399; + or.pred %p3, %p1, 
%p2; + @%p3 bra $L__BB0_48; + + cvt.rn.f32.s32 %f87, %r1; + div.rn.f32 %f88, %f87, 0f45480000; + cvt.rn.f32.s32 %f89, %r2; + div.rn.f32 %f90, %f89, 0f45160000; + add.f32 %f91, %f88, 0fBF000000; + add.f32 %f92, %f90, 0fBF000000; + fma.rn.f32 %f1, %f91, 0f4015C28F, 0fBEE3D70A; + fma.rn.f32 %f2, %f92, 0f4015C28F, 0f00000000; + mov.f32 %f198, 0f00000000; + mov.u32 %r163, 0; + mov.f32 %f196, %f198; + mov.f32 %f197, %f198; + +$L__BB0_2: + mul.f32 %f93, %f196, %f196; + mul.f32 %f94, %f197, %f197; + sub.f32 %f95, %f94, %f93; + add.f32 %f96, %f197, %f197; + add.f32 %f6, %f1, %f95; + fma.rn.f32 %f7, %f96, %f196, %f2; + mul.f32 %f8, %f6, %f6; + mul.f32 %f9, %f7, %f7; + add.f32 %f97, %f9, %f8; + setp.gt.f32 %p4, %f97, 0f40000000; + @%p4 bra $L__BB0_11; + + add.f32 %f198, %f198, 0f3F800000; + add.f32 %f98, %f6, %f6; + sub.f32 %f99, %f8, %f9; + add.f32 %f11, %f1, %f99; + fma.rn.f32 %f12, %f98, %f7, %f2; + mul.f32 %f13, %f11, %f11; + mul.f32 %f14, %f12, %f12; + add.f32 %f100, %f14, %f13; + setp.gt.f32 %p5, %f100, 0f40000000; + @%p5 bra $L__BB0_11; + + add.f32 %f198, %f198, 0f3F800000; + add.f32 %f101, %f11, %f11; + sub.f32 %f102, %f13, %f14; + add.f32 %f16, %f1, %f102; + fma.rn.f32 %f17, %f101, %f12, %f2; + mul.f32 %f18, %f16, %f16; + mul.f32 %f19, %f17, %f17; + add.f32 %f103, %f19, %f18; + setp.gt.f32 %p6, %f103, 0f40000000; + @%p6 bra $L__BB0_11; + + add.f32 %f198, %f198, 0f3F800000; + add.f32 %f104, %f16, %f16; + sub.f32 %f105, %f18, %f19; + add.f32 %f21, %f1, %f105; + fma.rn.f32 %f22, %f104, %f17, %f2; + mul.f32 %f23, %f21, %f21; + mul.f32 %f24, %f22, %f22; + add.f32 %f106, %f24, %f23; + setp.gt.f32 %p7, %f106, 0f40000000; + @%p7 bra $L__BB0_11; + + add.f32 %f198, %f198, 0f3F800000; + add.f32 %f107, %f21, %f21; + sub.f32 %f108, %f23, %f24; + add.f32 %f26, %f1, %f108; + fma.rn.f32 %f27, %f107, %f22, %f2; + mul.f32 %f28, %f26, %f26; + mul.f32 %f29, %f27, %f27; + add.f32 %f109, %f29, %f28; + setp.gt.f32 %p8, %f109, 0f40000000; + @%p8 bra $L__BB0_11; + + add.f32 %f198, %f198, 0f3F800000; + add.f32 %f110, %f26, %f26; + sub.f32 %f111, %f28, %f29; + add.f32 %f31, %f1, %f111; + fma.rn.f32 %f32, %f110, %f27, %f2; + mul.f32 %f33, %f31, %f31; + mul.f32 %f34, %f32, %f32; + add.f32 %f112, %f34, %f33; + setp.gt.f32 %p9, %f112, 0f40000000; + @%p9 bra $L__BB0_11; + + add.f32 %f198, %f198, 0f3F800000; + add.f32 %f113, %f31, %f31; + sub.f32 %f114, %f33, %f34; + add.f32 %f36, %f1, %f114; + fma.rn.f32 %f37, %f113, %f32, %f2; + mul.f32 %f38, %f36, %f36; + mul.f32 %f39, %f37, %f37; + add.f32 %f115, %f39, %f38; + setp.gt.f32 %p10, %f115, 0f40000000; + @%p10 bra $L__BB0_11; + + add.f32 %f198, %f198, 0f3F800000; + add.f32 %f116, %f36, %f36; + sub.f32 %f117, %f38, %f39; + add.f32 %f197, %f1, %f117; + fma.rn.f32 %f196, %f116, %f37, %f2; + mul.f32 %f118, %f197, %f197; + fma.rn.f32 %f119, %f196, %f196, %f118; + setp.gt.f32 %p11, %f119, 0f40000000; + @%p11 bra $L__BB0_11; + + add.f32 %f198, %f198, 0f3F800000; + add.s32 %r163, %r163, 8; + setp.lt.u32 %p12, %r163, 128; + @%p12 bra $L__BB0_2; + +$L__BB0_11: + mul.f32 %f120, %f198, 0f3C000000; + fma.rn.f32 %f121, %f120, 0f40066666, 0f00000000; + fma.rn.f32 %f122, %f120, 0f40000000, 0f3DCCCCCD; + fma.rn.f32 %f123, %f120, 0f40400000, 0f00000000; + mul.f32 %f45, %f121, 0f40C90FD0; + mul.f32 %f46, %f122, 0f40C90FD0; + mul.f32 %f47, %f123, 0f40C90FD0; + mul.f32 %f124, %f45, 0f3F22F983; + cvt.rni.s32.f32 %r167, %f124; + cvt.rn.f32.s32 %f125, %r167; + mov.f32 %f126, 0fBFC90FDA; + fma.rn.f32 %f127, %f125, %f126, %f45; + mov.f32 %f128, 0fB3A22168; + fma.rn.f32 %f129, %f125, %f128, %f127; + 
mov.f32 %f130, 0fA7C234C5; + fma.rn.f32 %f199, %f125, %f130, %f129; + abs.f32 %f49, %f45; + setp.ltu.f32 %p13, %f49, 0f47CE4780; + add.s64 %rd2, %rd1, 24; + @%p13 bra $L__BB0_19; + + setp.eq.f32 %p14, %f49, 0f7F800000; + @%p14 bra $L__BB0_18; + bra.uni $L__BB0_13; + +$L__BB0_18: + mov.f32 %f133, 0f00000000; + mul.rn.f32 %f199, %f45, %f133; + mov.u32 %r167, 0; + bra.uni $L__BB0_19; + +$L__BB0_13: + mov.b32 %r6, %f45; + shr.u32 %r70, %r6, 23; + and.b32 %r71, %r70, 255; + add.s32 %r7, %r71, -128; + shl.b32 %r72, %r6, 8; + or.b32 %r8, %r72, -2147483648; + shr.u32 %r9, %r7, 5; + mov.u64 %rd58, 0; + mov.u32 %r164, 0; + mov.u64 %rd57, __cudart_i2opi_f; + mov.u64 %rd56, %rd1; + +$L__BB0_14: + .pragma "nounroll"; + ld.global.nc.u32 %r73, [%rd57]; + mad.wide.u32 %rd25, %r73, %r8, %rd58; + shr.u64 %rd58, %rd25, 32; + st.local.u32 [%rd56], %rd25; + add.s64 %rd57, %rd57, 4; + add.s64 %rd56, %rd56, 4; + add.s32 %r164, %r164, 1; + setp.ne.s32 %p15, %r164, 6; + @%p15 bra $L__BB0_14; + + st.local.u32 [%rd2], %rd58; + mov.u32 %r74, 4; + sub.s32 %r12, %r74, %r9; + mov.u32 %r75, 6; + sub.s32 %r76, %r75, %r9; + mul.wide.s32 %rd26, %r76, 4; + add.s64 %rd27, %rd1, %rd26; + ld.local.u32 %r165, [%rd27]; + ld.local.u32 %r166, [%rd27+-4]; + and.b32 %r15, %r7, 31; + setp.eq.s32 %p16, %r15, 0; + @%p16 bra $L__BB0_17; + + mov.u32 %r77, 32; + sub.s32 %r78, %r77, %r15; + shr.u32 %r79, %r166, %r78; + shl.b32 %r80, %r165, %r15; + add.s32 %r165, %r79, %r80; + mul.wide.s32 %rd28, %r12, 4; + add.s64 %rd29, %rd1, %rd28; + ld.local.u32 %r81, [%rd29]; + shr.u32 %r82, %r81, %r78; + shl.b32 %r83, %r166, %r15; + add.s32 %r166, %r82, %r83; + +$L__BB0_17: + and.b32 %r84, %r6, -2147483648; + shr.u32 %r85, %r166, 30; + shl.b32 %r86, %r165, 2; + or.b32 %r87, %r85, %r86; + shr.u32 %r88, %r87, 31; + shr.u32 %r89, %r165, 30; + add.s32 %r90, %r88, %r89; + neg.s32 %r91, %r90; + setp.eq.s32 %p17, %r84, 0; + selp.b32 %r167, %r90, %r91, %p17; + setp.ne.s32 %p18, %r88, 0; + xor.b32 %r92, %r84, -2147483648; + selp.b32 %r93, %r92, %r84, %p18; + selp.b32 %r94, -1, 0, %p18; + xor.b32 %r95, %r87, %r94; + shl.b32 %r96, %r166, 2; + xor.b32 %r97, %r96, %r94; + cvt.u64.u32 %rd30, %r95; + cvt.u64.u32 %rd31, %r97; + bfi.b64 %rd32, %rd30, %rd31, 32, 32; + cvt.rn.f64.s64 %fd1, %rd32; + mul.f64 %fd2, %fd1, 0d3BF921FB54442D19; + cvt.rn.f32.f64 %f131, %fd2; + setp.eq.s32 %p19, %r93, 0; + neg.f32 %f132, %f131; + selp.f32 %f199, %f131, %f132, %p19; + +$L__BB0_19: + add.s32 %r22, %r167, 1; + and.b32 %r23, %r22, 1; + setp.eq.s32 %p20, %r23, 0; + selp.f32 %f53, %f199, 0f3F800000, %p20; + mul.rn.f32 %f54, %f199, %f199; + mov.f32 %f200, 0fB94D4153; + @%p20 bra $L__BB0_21; + + mov.f32 %f135, 0fBAB607ED; + mov.f32 %f136, 0f37CBAC00; + fma.rn.f32 %f200, %f136, %f54, %f135; + +$L__BB0_21: + selp.f32 %f137, 0f3C0885E4, 0f3D2AAABB, %p20; + fma.rn.f32 %f138, %f200, %f54, %f137; + selp.f32 %f139, 0fBE2AAAA8, 0fBEFFFFFF, %p20; + fma.rn.f32 %f140, %f138, %f54, %f139; + mov.f32 %f141, 0f00000000; + fma.rn.f32 %f142, %f54, %f53, %f141; + fma.rn.f32 %f201, %f140, %f142, %f53; + and.b32 %r99, %r22, 2; + setp.eq.s32 %p22, %r99, 0; + @%p22 bra $L__BB0_23; + + mov.f32 %f144, 0fBF800000; + fma.rn.f32 %f201, %f201, %f144, %f141; + +$L__BB0_23: + mul.f32 %f145, %f46, 0f3F22F983; + cvt.rni.s32.f32 %r171, %f145; + cvt.rn.f32.s32 %f146, %r171; + mov.f32 %f147, 0fBFC90FDA; + fma.rn.f32 %f148, %f146, %f147, %f46; + mov.f32 %f149, 0fB3A22168; + fma.rn.f32 %f150, %f146, %f149, %f148; + mov.f32 %f151, 0fA7C234C5; + fma.rn.f32 %f202, %f146, %f151, %f150; + abs.f32 %f61, %f46; + setp.ltu.f32 %p23, 
%f61, 0f47CE4780; + @%p23 bra $L__BB0_31; + + setp.eq.f32 %p24, %f61, 0f7F800000; + @%p24 bra $L__BB0_30; + bra.uni $L__BB0_25; + +$L__BB0_30: + mov.f32 %f154, 0f00000000; + mul.rn.f32 %f202, %f46, %f154; + mov.u32 %r171, 0; + bra.uni $L__BB0_31; + +$L__BB0_25: + mov.b32 %r25, %f46; + shr.u32 %r101, %r25, 23; + and.b32 %r102, %r101, 255; + add.s32 %r26, %r102, -128; + shl.b32 %r103, %r25, 8; + or.b32 %r27, %r103, -2147483648; + shr.u32 %r28, %r26, 5; + mov.u64 %rd61, 0; + mov.u32 %r168, 0; + mov.u64 %rd60, __cudart_i2opi_f; + mov.u64 %rd59, %rd1; + +$L__BB0_26: + .pragma "nounroll"; + ld.global.nc.u32 %r104, [%rd60]; + mad.wide.u32 %rd35, %r104, %r27, %rd61; + shr.u64 %rd61, %rd35, 32; + st.local.u32 [%rd59], %rd35; + add.s64 %rd60, %rd60, 4; + add.s64 %rd59, %rd59, 4; + add.s32 %r168, %r168, 1; + setp.ne.s32 %p25, %r168, 6; + @%p25 bra $L__BB0_26; + + st.local.u32 [%rd2], %rd61; + mov.u32 %r105, 4; + sub.s32 %r31, %r105, %r28; + mov.u32 %r106, 6; + sub.s32 %r107, %r106, %r28; + mul.wide.s32 %rd36, %r107, 4; + add.s64 %rd37, %rd1, %rd36; + ld.local.u32 %r169, [%rd37]; + ld.local.u32 %r170, [%rd37+-4]; + and.b32 %r34, %r26, 31; + setp.eq.s32 %p26, %r34, 0; + @%p26 bra $L__BB0_29; + + mov.u32 %r108, 32; + sub.s32 %r109, %r108, %r34; + shr.u32 %r110, %r170, %r109; + shl.b32 %r111, %r169, %r34; + add.s32 %r169, %r110, %r111; + mul.wide.s32 %rd38, %r31, 4; + add.s64 %rd39, %rd1, %rd38; + ld.local.u32 %r112, [%rd39]; + shr.u32 %r113, %r112, %r109; + shl.b32 %r114, %r170, %r34; + add.s32 %r170, %r113, %r114; + +$L__BB0_29: + and.b32 %r115, %r25, -2147483648; + shr.u32 %r116, %r170, 30; + shl.b32 %r117, %r169, 2; + or.b32 %r118, %r116, %r117; + shr.u32 %r119, %r118, 31; + shr.u32 %r120, %r169, 30; + add.s32 %r121, %r119, %r120; + neg.s32 %r122, %r121; + setp.eq.s32 %p27, %r115, 0; + selp.b32 %r171, %r121, %r122, %p27; + setp.ne.s32 %p28, %r119, 0; + xor.b32 %r123, %r115, -2147483648; + selp.b32 %r124, %r123, %r115, %p28; + selp.b32 %r125, -1, 0, %p28; + xor.b32 %r126, %r118, %r125; + shl.b32 %r127, %r170, 2; + xor.b32 %r128, %r127, %r125; + cvt.u64.u32 %rd40, %r126; + cvt.u64.u32 %rd41, %r128; + bfi.b64 %rd42, %rd40, %rd41, 32, 32; + cvt.rn.f64.s64 %fd3, %rd42; + mul.f64 %fd4, %fd3, 0d3BF921FB54442D19; + cvt.rn.f32.f64 %f152, %fd4; + setp.eq.s32 %p29, %r124, 0; + neg.f32 %f153, %f152; + selp.f32 %f202, %f152, %f153, %p29; + +$L__BB0_31: + add.s32 %r41, %r171, 1; + and.b32 %r42, %r41, 1; + setp.eq.s32 %p30, %r42, 0; + selp.f32 %f65, %f202, 0f3F800000, %p30; + mul.rn.f32 %f66, %f202, %f202; + mov.f32 %f203, 0fB94D4153; + @%p30 bra $L__BB0_33; + + mov.f32 %f156, 0fBAB607ED; + mov.f32 %f157, 0f37CBAC00; + fma.rn.f32 %f203, %f157, %f66, %f156; + +$L__BB0_33: + selp.f32 %f158, 0f3C0885E4, 0f3D2AAABB, %p30; + fma.rn.f32 %f159, %f203, %f66, %f158; + selp.f32 %f160, 0fBE2AAAA8, 0fBEFFFFFF, %p30; + fma.rn.f32 %f161, %f159, %f66, %f160; + mov.f32 %f162, 0f00000000; + fma.rn.f32 %f163, %f66, %f65, %f162; + fma.rn.f32 %f204, %f161, %f163, %f65; + and.b32 %r130, %r41, 2; + setp.eq.s32 %p32, %r130, 0; + @%p32 bra $L__BB0_35; + + mov.f32 %f165, 0fBF800000; + fma.rn.f32 %f204, %f204, %f165, %f162; + +$L__BB0_35: + mul.f32 %f166, %f47, 0f3F22F983; + cvt.rni.s32.f32 %r175, %f166; + cvt.rn.f32.s32 %f167, %r175; + mov.f32 %f168, 0fBFC90FDA; + fma.rn.f32 %f169, %f167, %f168, %f47; + mov.f32 %f170, 0fB3A22168; + fma.rn.f32 %f171, %f167, %f170, %f169; + mov.f32 %f172, 0fA7C234C5; + fma.rn.f32 %f205, %f167, %f172, %f171; + abs.f32 %f73, %f47; + setp.ltu.f32 %p33, %f73, 0f47CE4780; + @%p33 bra $L__BB0_43; + + setp.eq.f32 
%p34, %f73, 0f7F800000; + @%p34 bra $L__BB0_42; + bra.uni $L__BB0_37; + +$L__BB0_42: + mov.f32 %f175, 0f00000000; + mul.rn.f32 %f205, %f47, %f175; + mov.u32 %r175, 0; + bra.uni $L__BB0_43; + +$L__BB0_37: + mov.b32 %r44, %f47; + shr.u32 %r132, %r44, 23; + and.b32 %r133, %r132, 255; + add.s32 %r45, %r133, -128; + shl.b32 %r134, %r44, 8; + or.b32 %r46, %r134, -2147483648; + shr.u32 %r47, %r45, 5; + mov.u64 %rd64, 0; + mov.u32 %r172, 0; + mov.u64 %rd63, __cudart_i2opi_f; + mov.u64 %rd62, %rd1; + +$L__BB0_38: + .pragma "nounroll"; + ld.global.nc.u32 %r135, [%rd63]; + mad.wide.u32 %rd45, %r135, %r46, %rd64; + shr.u64 %rd64, %rd45, 32; + st.local.u32 [%rd62], %rd45; + add.s64 %rd63, %rd63, 4; + add.s64 %rd62, %rd62, 4; + add.s32 %r172, %r172, 1; + setp.ne.s32 %p35, %r172, 6; + @%p35 bra $L__BB0_38; + + st.local.u32 [%rd2], %rd64; + mov.u32 %r136, 4; + sub.s32 %r50, %r136, %r47; + mov.u32 %r137, 6; + sub.s32 %r138, %r137, %r47; + mul.wide.s32 %rd46, %r138, 4; + add.s64 %rd47, %rd1, %rd46; + ld.local.u32 %r173, [%rd47]; + ld.local.u32 %r174, [%rd47+-4]; + and.b32 %r53, %r45, 31; + setp.eq.s32 %p36, %r53, 0; + @%p36 bra $L__BB0_41; + + mov.u32 %r139, 32; + sub.s32 %r140, %r139, %r53; + shr.u32 %r141, %r174, %r140; + shl.b32 %r142, %r173, %r53; + add.s32 %r173, %r141, %r142; + mul.wide.s32 %rd48, %r50, 4; + add.s64 %rd49, %rd1, %rd48; + ld.local.u32 %r143, [%rd49]; + shr.u32 %r144, %r143, %r140; + shl.b32 %r145, %r174, %r53; + add.s32 %r174, %r144, %r145; + +$L__BB0_41: + and.b32 %r146, %r44, -2147483648; + shr.u32 %r147, %r174, 30; + shl.b32 %r148, %r173, 2; + or.b32 %r149, %r147, %r148; + shr.u32 %r150, %r149, 31; + shr.u32 %r151, %r173, 30; + add.s32 %r152, %r150, %r151; + neg.s32 %r153, %r152; + setp.eq.s32 %p37, %r146, 0; + selp.b32 %r175, %r152, %r153, %p37; + setp.ne.s32 %p38, %r150, 0; + xor.b32 %r154, %r146, -2147483648; + selp.b32 %r155, %r154, %r146, %p38; + selp.b32 %r156, -1, 0, %p38; + xor.b32 %r157, %r149, %r156; + shl.b32 %r158, %r174, 2; + xor.b32 %r159, %r158, %r156; + cvt.u64.u32 %rd50, %r157; + cvt.u64.u32 %rd51, %r159; + bfi.b64 %rd52, %rd50, %rd51, 32, 32; + cvt.rn.f64.s64 %fd5, %rd52; + mul.f64 %fd6, %fd5, 0d3BF921FB54442D19; + cvt.rn.f32.f64 %f173, %fd6; + setp.eq.s32 %p39, %r155, 0; + neg.f32 %f174, %f173; + selp.f32 %f205, %f173, %f174, %p39; + +$L__BB0_43: + add.s32 %r60, %r175, 1; + and.b32 %r61, %r60, 1; + setp.eq.s32 %p40, %r61, 0; + selp.f32 %f77, %f205, 0f3F800000, %p40; + mul.rn.f32 %f78, %f205, %f205; + mov.f32 %f206, 0fB94D4153; + @%p40 bra $L__BB0_45; + + mov.f32 %f177, 0fBAB607ED; + mov.f32 %f178, 0f37CBAC00; + fma.rn.f32 %f206, %f178, %f78, %f177; + +$L__BB0_45: + selp.f32 %f179, 0f3C0885E4, 0f3D2AAABB, %p40; + fma.rn.f32 %f180, %f206, %f78, %f179; + selp.f32 %f181, 0fBE2AAAA8, 0fBEFFFFFF, %p40; + fma.rn.f32 %f182, %f180, %f78, %f181; + mov.f32 %f183, 0f00000000; + fma.rn.f32 %f184, %f78, %f77, %f183; + fma.rn.f32 %f207, %f182, %f184, %f77; + and.b32 %r161, %r60, 2; + setp.eq.s32 %p42, %r161, 0; + @%p42 bra $L__BB0_47; + + mov.f32 %f186, 0fBF800000; + fma.rn.f32 %f207, %f207, %f186, %f183; + +$L__BB0_47: + mov.f32 %f187, 0f3E99999A; + mul.f32 %f188, %f204, 0f3E99999A; + mul.f32 %f189, %f207, 0f3F000000; + mov.f32 %f190, 0f3F000000; + mad.lo.s32 %r162, %r2, 3200, %r1; + cvta.to.global.u64 %rd53, %rd21; + mul.wide.s32 %rd54, %r162, 16; + add.s64 %rd55, %rd53, %rd54; + sub.f32 %f191, %f190, %f189; + sub.f32 %f192, %f187, %f188; + fma.rn.f32 %f193, %f201, 0fBE4CCCCD, 0f3E99999A; + mov.f32 %f194, 0f3F800000; + st.global.v4.f32 [%rd55], {%f193, %f192, %f191, %f194}; + 
+$L__BB0_48: + ret; + +} +
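
Note on the added shader files: shaders/hazptx.bat (its single line is not reproduced
above) is the build step for shaders/shader.ptx. Judging by the PTX header (CUDA 12.4
toolchain, .target sm_89), an equivalent command would be something like:

    nvcc -ptx -arch=sm_89 shaders/shader.cu -o shaders/shader.ptx

shaders/cache.ptx.bin is not hand-written: it is the driver-compiled module cache that
main.cpp dumps via vkGetCudaModuleCacheNV() on the first run (writeFile in
createComputePipelineCUDA()) and reloads on subsequent runs instead of the PTX.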