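The key to passing function pointers to a CUDA kernel is to store the address of each device function in a static `__device__` pointer variable, and then copy that pointer value to the host side (with `cudaMemcpyFromSymbol`) so that it can be passed as a kernel argument. Otherwise, you will get all kinds of weird errors.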

核心就是:使用 static 的 `__device__` 指针变量保存 device 端函数的地址,然后将该指针的值拷贝到 host 端,再作为参数传给 kernel。

模板函数指针

#include <iostream>
// Alias template naming a pointer to a binary function `T f(T, T)`.
// Alias templates exist since C++11, so the compiler must support C++11;
// otherwise this declaration is rejected with an error.
template<typename T>
using func_t = T (*) (T, T);
// Device-side binary operation: returns x + y.
// Its address is stored in p_add_func<T> so that it can be handed to a
// kernel as a func_t<T> argument.
template <typename T>
__device__ T add_func (T x, T y)
{
    return x + y;
}
// Device-side binary operation: returns x * y.
// Its address is stored in p_mul_func<T> so that it can be handed to a
// kernel as a func_t<T> argument.
template <typename T>
__device__ T mul_func (T x, T y)
{
    return x * y;
}
// Required for passing a function pointer as a kernel argument.
// Static pointers to device functions: each variable is declared __device__
// and is initialized with the address of the matching device function, so the
// host can read that address back with cudaMemcpyFromSymbol.
// NOTE: these are variable templates, which need C++14 or later.
template <typename T> 
__device__ func_t<T> p_add_func = add_func<T>;
template <typename T> 
__device__ func_t<T> p_mul_func = mul_func<T>;
// Applies the binary device function `op` to the two scalars at d_x and d_y
// and stores the value in *result.
//
//   op      function pointer invoked on the device; in this example it is a
//           value read back from p_add_func<T> / p_mul_func<T>
//   d_x     pointer to the first operand
//   d_y     pointer to the second operand
//   result  pointer to the single output value
//
// There is no thread indexing: every launched thread performs the same
// read/compute/write, so the example launches it as <<<1,1>>>.
template <typename T>
__global__ void kernel(func_t<T> op, T * d_x, T * d_y, T * result)
{
    *result = (*op)(*d_x, *d_y);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char * what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << "CUDA error (" << what << "): "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Launches kernel<T> with a single thread to evaluate op(*d_x, *d_y) and
// copies the one result value back into *h_result.
// Returns false (after reporting the error) as soon as any step fails.
template <typename T>
static bool run_op(func_t<T> op, T * d_x, T * d_y, T * d_result, T * h_result)
{
    kernel<T><<<1,1>>>(op, d_x, d_y, d_result);
    // A launch has no return value: configuration errors are fetched with
    // cudaGetLastError(), execution errors show up at the synchronize.
    return cuda_ok(cudaGetLastError(), "kernel launch")
        && cuda_ok(cudaDeviceSynchronize(), "kernel execution")
        && cuda_ok(cudaMemcpy(h_result, d_result, sizeof(T), cudaMemcpyDeviceToHost),
                   "copy result to host");
}

// Host-side driver for one element type T.
//
// Uploads x and y, reads the function pointers held in p_add_func<T> and
// p_mul_func<T> back to the host, passes each one to kernel<T>, and prints
// "Sum: ..." and "Product: ..." with the values computed on the device.
// Any CUDA failure is reported on stderr and the affected output line is
// skipped; the buffers allocated here are always released before returning.
template <typename T>
void test(T x, T y)
{
    func_t<T> h_add_func = nullptr;
    func_t<T> h_mul_func = nullptr;
    T * d_x = nullptr, * d_y = nullptr, * d_result = nullptr;
    T result;

    // The && chain stops at the first failing call.
    bool ok =
           cuda_ok(cudaMalloc(&d_x, sizeof(T)), "cudaMalloc d_x")
        && cuda_ok(cudaMalloc(&d_y, sizeof(T)), "cudaMalloc d_y")
        && cuda_ok(cudaMalloc(&d_result, sizeof(T)), "cudaMalloc d_result")
        && cuda_ok(cudaMemcpy(d_x, &x, sizeof(T), cudaMemcpyHostToDevice), "copy x to device")
        && cuda_ok(cudaMemcpy(d_y, &y, sizeof(T), cudaMemcpyHostToDevice), "copy y to device")
        // Copy device function pointers to host side
        && cuda_ok(cudaMemcpyFromSymbol(&h_add_func, p_add_func<T>, sizeof(func_t<T>)),
                   "read p_add_func")
        && cuda_ok(cudaMemcpyFromSymbol(&h_mul_func, p_mul_func<T>, sizeof(func_t<T>)),
                   "read p_mul_func");

    if (ok && run_op<T>(h_add_func, d_x, d_y, d_result, &result))
        std::cout << "Sum: " << result << std::endl;

    if (ok && run_op<T>(h_mul_func, d_x, d_y, d_result, &result))
        std::cout << "Product: " << result << std::endl;

    // Pointers that were never allocated are still null here, and
    // cudaFree on a null pointer performs no operation.
    cudaFree(d_result);
    cudaFree(d_y);
    cudaFree(d_x);
}
// Runs the function-pointer demo once for each of int, float and double.
// The literal arguments are doubles and are converted to T at the call, so
// the int run operates on 2 and 10.
int main()
{
    std::cout << "Test int for type int ..." << std::endl;
    test<int>(2.05, 10.00);
    std::cout << "Test float for type float ..." << std::endl;
    test<float>(2.05, 10.00);
    std::cout << "Test double for type double ..." << std::endl;
    test<double>(2.05, 10.00);
    return 0;
}

vs2019 + cuda 10.1 测试:
在这里插入图片描述
vs2013+cuda6.5 报错:
在这里插入图片描述

先修改为不用模板的形式:

#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Pointer to a binary float function `float f(float, float)`.
// A plain typedef is used instead of an alias template so that the code does
// not depend on C++11 support.
typedef float(*func_t) (float, float);
// Device-side binary operation: returns x + y.
// Its address is stored in p_add_func so that it can be handed to a kernel
// as a func_t argument.
__device__ float add_func(float x, float y)
{
	return x + y;
}
// Device-side binary operation: returns x * y.
// Its address is stored in p_mul_func so that it can be handed to a kernel
// as a func_t argument.
__device__ float mul_func(float x, float y)
{
	return x * y;
}
// Required for passing a function pointer as a kernel argument.
// Static pointers to device functions: each variable is declared __device__
// and is initialized with the address of the matching device function, so the
// host can read that address back with cudaMemcpyFromSymbol.
__device__ func_t p_add_func = add_func;
__device__ func_t p_mul_func = mul_func;
// Applies the binary device function `op` to the two floats at d_x and d_y
// and stores the value in *result.
//
//   op      function pointer invoked on the device; in this example it is a
//           value read back from p_add_func / p_mul_func
//   d_x     pointer to the first operand
//   d_y     pointer to the second operand
//   result  pointer to the single output value
//
// There is no thread indexing: every launched thread performs the same
// read/compute/write, so the example launches it as <<<1, 1>>>.
__global__ void kernel(func_t op, float * d_x, float * d_y, float * result)
{
	*result = (*op)(*d_x, *d_y);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char * what)
{
	if (status == cudaSuccess)
		return true;
	std::cerr << "CUDA error (" << what << "): "
	          << cudaGetErrorString(status) << std::endl;
	return false;
}

// Launches kernel with a single thread to evaluate op(*d_x, *d_y) and copies
// the one result value back into *h_result.
// Returns false (after reporting the error) as soon as any step fails.
static bool run_op(func_t op, float * d_x, float * d_y, float * d_result, float * h_result)
{
	kernel<<<1, 1>>>(op, d_x, d_y, d_result);
	// A launch has no return value: configuration errors are fetched with
	// cudaGetLastError(), execution errors show up at the synchronize.
	return cuda_ok(cudaGetLastError(), "kernel launch")
		&& cuda_ok(cudaDeviceSynchronize(), "kernel execution")
		&& cuda_ok(cudaMemcpy(h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost),
		           "copy result to host");
}

// Host-side driver for the float-only version.
//
// Uploads x and y, reads the function pointers held in p_add_func and
// p_mul_func back to the host, passes each one to kernel, and prints
// "Sum: ..." and "Product: ..." with the values computed on the device.
// Any CUDA failure is reported on stderr and the affected output line is
// skipped; the buffers allocated here are always released before returning.
void test(float x, float y)
{
	// Plain 0 rather than nullptr: this variant avoids C++11-only syntax.
	func_t h_add_func = 0;
	func_t h_mul_func = 0;
	float * d_x = 0, * d_y = 0, * d_result = 0;
	float result;

	// The && chain stops at the first failing call.
	bool ok =
		   cuda_ok(cudaMalloc(&d_x, sizeof(float)), "cudaMalloc d_x")
		&& cuda_ok(cudaMalloc(&d_y, sizeof(float)), "cudaMalloc d_y")
		&& cuda_ok(cudaMalloc(&d_result, sizeof(float)), "cudaMalloc d_result")
		&& cuda_ok(cudaMemcpy(d_x, &x, sizeof(float), cudaMemcpyHostToDevice), "copy x to device")
		&& cuda_ok(cudaMemcpy(d_y, &y, sizeof(float), cudaMemcpyHostToDevice), "copy y to device")
		// Copy device function pointer to host side
		&& cuda_ok(cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(func_t)), "read p_add_func")
		&& cuda_ok(cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(func_t)), "read p_mul_func");

	if (ok && run_op(h_add_func, d_x, d_y, d_result, &result))
		std::cout << "Sum: " << result << std::endl;

	if (ok && run_op(h_mul_func, d_x, d_y, d_result, &result))
		std::cout << "Product: " << result << std::endl;

	// Pointers that were never allocated are still null here, and
	// cudaFree on a null pointer performs no operation.
	cudaFree(d_result);
	cudaFree(d_y);
	cudaFree(d_x);
}
// Runs the float-only demo three times.
// NOTE(review): the labels still read int / float / double, but test() takes
// floats only, so all three runs perform the same float computation. The
// printed text is left unchanged.
int main()
{
	std::cout << "Test int for type int ..." << std::endl;
	test(2.05f, 10.00f);
	std::cout << "Test float for type float ..." << std::endl;
	test(2.05f, 10.00f);
	std::cout << "Test double for type double ..." << std::endl;
	test(2.05f, 10.00f);
	return 0;
}

vs2013+cuda6.5 结果:
在这里插入图片描述

目录核心方法模板函数指针不用模板核心方法The key to passing function pointers to CUDA kernel is to use static pointers to device pointers followed by copying the pointers to the host side. Otherwise, I am sure you will... 深度学习CUDA编程干货-kernel的编写和调用 本文由林大佬原创,转载请注明出处,来自腾讯、阿里等一线AI算法工程师组成的QQ交流群欢迎你的加入: 1037662480 上一篇给大家分享了一些CUDA编程的干货,这一篇来夯实一下,我们主要看一些基础的cuda概念。 cuda编程主.
CUDA的Drive API中launch kernel 函数原型如下:CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY,
cuda的global函数里面可以调用__device__函数,在有特殊需要的时候,还可以把__device__函数作为参数传入到一个__global__函数中 在cuda里面不能像c++那样简单地传入函数的指针,需要在传入前对函数的指针做一些包装。 typedef double(*funcFormat)(int,char); 这里面double表示函数的返回值,int,char是函数的参数列表,所有满足这种格式的函数都可以用这种函数类型指代。 上面的funcFormat是一种函数类型,在这里可以把
透视变换是图像处理中的一种常用技术,用于模拟相机对图像进行透视投影。在 CUDA 中实现透视变换需要编写一个 GPU 函数,该函数可以并行地处理图像中的每个像素。 以下是一个简单的 CUDA 透视变换的例子: __global__ void perspective_transform(float *out, float *in, int width, int height, float fx, float fy, float cx, float cy) int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; if (i >= width || j >= height) return; int idx = j * width + i; float x = (i - cx) / fx; float y = (j - cy) / fy; out[idx] = in[idx] / (1 + x*x + y*y); int main() dim3 blockSize(16, 16); dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y); perspective_transform<<<gridSize, blockSize>>>(d_out, d_in, width, height, fx, fy, cx, cy); 这个例子中的 `perspective_transform` 函数是一个 CUDA 内核,它接收图像数据、图像的宽度和高度、以及透视变换的参数。它通过计算每个像素的新坐标并对其进行透视变换,实现了透视变换。 请注意,这只是一个简单的透视变换的例子,具体的实现方式可能因应用场景不同而有所差异。
没有理由把hist3得到的最后一行和一列去掉吧?目前算出来的值统计了99600多个数,不到100000. 把你定义的intervals改一下,int_x = [0:(1 / K1):1+1/K1]; int_y = [0:(1 / K2):1+1/K2];K1=length(int_x)-1; K2=length(int_y)-1; hist3方法无需删除行,可以得到一样的结果。
mamba-image python 包的安装 萝卜8959: 谢谢分享,这个方法有些复杂,我决定用python3.4试试 图像配准 - 三张灰度图合成彩色图 ECC算法 OpenCV C++/Python实现 weixin_47699542: python代码中的xrange是什么? 没定义啊?