模板函数指针

#include <iostream>
// Function-pointer type for a binary operation on T: takes two T values and
// returns a T.
// Alias templates are a C++11 feature, so this file must be compiled with
// C++11 support or later; otherwise this declaration is rejected.
template<typename T>
using func_t = T (*) (T, T);
// Device-side addition: returns x + y.
// Declared __device__, so it can only be called from device code.
template <typename T>
__device__ T add_func (T x, T y)
{
    return x + y;
}
// Device-side multiplication: returns x * y.
// Declared __device__, so it can only be called from device code.
template <typename T>
__device__ T mul_func (T x, T y)
{
    return x * y;
}
// Static device-side pointers to the device functions above.
// They are needed to pass a function pointer as a kernel argument: the
// address of a __device__ function cannot be taken in host code, so it is
// stored in a __device__ variable and later copied to the host with
// cudaMemcpyFromSymbol before being handed to the kernel.
// NOTE: these are variable templates, which require C++14 or later.
template <typename T> 
__device__ func_t<T> p_add_func = add_func<T>;
template <typename T> 
__device__ func_t<T> p_mul_func = mul_func<T>;
// Applies the binary device function `op` to the values at d_x and d_y and
// stores the outcome in *result.
//   op       - function pointer called on the device; it must hold a
//              device-side function address
//   d_x, d_y - pointers to the two operands, dereferenced on the device
//   result   - pointer that receives op(*d_x, *d_y)
// There is no thread indexing: every launched thread performs the same
// single write, so a <<<1,1>>> launch is sufficient.
template <typename T>
__global__ void kernel(func_t<T> op, T * d_x, T * d_y, T * result)
{
    *result = (*op)(*d_x, *d_y);
}
// Runs add_func<T> and mul_func<T> on (x, y) by passing them to the kernel as
// function-pointer arguments, and prints both results to std::cout.
//
// Steps: allocate device buffers, upload x and y, read the device-side
// function addresses out of the p_add_func<T> / p_mul_func<T> symbols, launch
// the kernel once per operation and copy each result back for printing.
//
// Error handling: `err` keeps the first CUDA failure. Once it is set every
// remaining step is skipped, the error is reported on std::cerr, and the
// device buffers are still released before returning.
template <typename T>
void test(T x, T y)
{
    func_t<T> h_add_func = nullptr;
    func_t<T> h_mul_func = nullptr;
    T * d_x = nullptr;
    T * d_y = nullptr;
    T * d_result = nullptr;
    T result = T();

    cudaError_t err = cudaMalloc(&d_x, sizeof(T));
    if (err == cudaSuccess) err = cudaMalloc(&d_y, sizeof(T));
    if (err == cudaSuccess) err = cudaMalloc(&d_result, sizeof(T));
    if (err == cudaSuccess) err = cudaMemcpy(d_x, &x, sizeof(T), cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_y, &y, sizeof(T), cudaMemcpyHostToDevice);

    // Copy device function pointers to the host side so they can be passed
    // back to the device as ordinary kernel arguments.
    if (err == cudaSuccess) err = cudaMemcpyFromSymbol(&h_add_func, p_add_func<T>, sizeof(func_t<T>));
    if (err == cudaSuccess) err = cudaMemcpyFromSymbol(&h_mul_func, p_mul_func<T>, sizeof(func_t<T>));

    // Sum
    if (err == cudaSuccess) {
        kernel<T><<<1,1>>>(h_add_func, d_x, d_y, d_result);
        err = cudaGetLastError();            // launch-configuration errors
    }
    if (err == cudaSuccess) err = cudaDeviceSynchronize();   // execution errors
    if (err == cudaSuccess) err = cudaMemcpy(&result, d_result, sizeof(T), cudaMemcpyDeviceToHost);
    if (err == cudaSuccess) std::cout << "Sum: " << result << std::endl;

    // Product
    if (err == cudaSuccess) {
        kernel<T><<<1,1>>>(h_mul_func, d_x, d_y, d_result);
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err == cudaSuccess) err = cudaMemcpy(&result, d_result, sizeof(T), cudaMemcpyDeviceToHost);
    if (err == cudaSuccess) std::cout << "Product: " << result << std::endl;

    if (err != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
    }

    // cudaFree accepts a null pointer, so this is safe even when an
    // allocation above failed.
    cudaFree(d_result);
    cudaFree(d_y);
    cudaFree(d_x);
}
// Entry point: runs the function-pointer test for int, float and double.
// Each literal is written in the type of its instantiation so no implicit
// narrowing conversion happens at the call site (the int case previously
// relied on 2.05 being truncated to 2).
int main()
{
    std::cout << "Test int for type int ..." << std::endl;
    test<int>(2, 10);
    std::cout << "Test float for type float ..." << std::endl;
    test<float>(2.05f, 10.00f);
    std::cout << "Test double for type double ..." << std::endl;
    test<double>(2.05, 10.00);
    return 0;
}
vs2019 + cuda 10.1 测试:
 
 vs2013+cuda6.5 报错（模板版本中的 p_add_func / p_mul_func 是变量模板，属于 C++14 特性，vs2013 与 cuda 6.5 均不支持）:
  
先修改为不用模板的形式: 
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Function-pointer type for a binary float operation: takes two floats and
// returns a float.
typedef float(*func_t) (float, float);
// Device-side addition: returns x + y.
// Declared __device__, so it can only be called from device code.
__device__ float add_func(float x, float y)
{
	return x + y;
}
// Device-side multiplication: returns x * y.
// Declared __device__, so it can only be called from device code.
__device__ float mul_func(float x, float y)
{
	return x * y;
}
// Static device-side pointers to the device functions above.
// They are needed to pass a function pointer as a kernel argument: the
// address of a __device__ function cannot be taken in host code, so it is
// stored in a __device__ variable and later copied to the host with
// cudaMemcpyFromSymbol before being handed to the kernel.
__device__ func_t p_add_func = add_func;
__device__ func_t p_mul_func = mul_func;
// Applies the binary device function `op` to the values at d_x and d_y and
// stores the outcome in *result.
//   op       - function pointer called on the device; it must hold a
//              device-side function address
//   d_x, d_y - pointers to the two operands, dereferenced on the device
//   result   - pointer that receives op(*d_x, *d_y)
// There is no thread indexing: every launched thread performs the same
// single write, so a <<<1,1>>> launch is sufficient.
__global__ void kernel(func_t op, float * d_x, float * d_y, float * result)
{
	*result = (*op)(*d_x, *d_y);
}
// Runs add_func and mul_func on (x, y) by passing them to the kernel as
// function-pointer arguments, and prints both results to std::cout.
//
// Steps: allocate device buffers, upload x and y, read the device-side
// function addresses out of the p_add_func / p_mul_func symbols, launch the
// kernel once per operation and copy each result back for printing.
//
// Error handling: `err` keeps the first CUDA failure. Once it is set every
// remaining step is skipped, the error is reported on std::cerr, and the
// device buffers are still released before returning.
// Written without C++11 features so it stays usable with older toolchains.
void test(float x, float y)
{
	func_t h_add_func = 0;
	func_t h_mul_func = 0;
	float * d_x = 0;
	float * d_y = 0;
	float * d_result = 0;
	float result = 0.0f;

	cudaError_t err = cudaMalloc(&d_x, sizeof(float));
	if (err == cudaSuccess) err = cudaMalloc(&d_y, sizeof(float));
	if (err == cudaSuccess) err = cudaMalloc(&d_result, sizeof(float));
	if (err == cudaSuccess) err = cudaMemcpy(d_x, &x, sizeof(float), cudaMemcpyHostToDevice);
	if (err == cudaSuccess) err = cudaMemcpy(d_y, &y, sizeof(float), cudaMemcpyHostToDevice);

	// Copy device function pointers to the host side so they can be passed
	// back to the device as ordinary kernel arguments.
	if (err == cudaSuccess) err = cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(func_t));
	if (err == cudaSuccess) err = cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(func_t));

	// Sum
	if (err == cudaSuccess) {
		kernel<<<1, 1>>>(h_add_func, d_x, d_y, d_result);
		err = cudaGetLastError();            // launch-configuration errors
	}
	if (err == cudaSuccess) err = cudaDeviceSynchronize();   // execution errors
	if (err == cudaSuccess) err = cudaMemcpy(&result, d_result, sizeof(float), cudaMemcpyDeviceToHost);
	if (err == cudaSuccess) std::cout << "Sum: " << result << std::endl;

	// Product
	if (err == cudaSuccess) {
		kernel<<<1, 1>>>(h_mul_func, d_x, d_y, d_result);
		err = cudaGetLastError();
	}
	if (err == cudaSuccess) err = cudaDeviceSynchronize();
	if (err == cudaSuccess) err = cudaMemcpy(&result, d_result, sizeof(float), cudaMemcpyDeviceToHost);
	if (err == cudaSuccess) std::cout << "Product: " << result << std::endl;

	if (err != cudaSuccess) {
		std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
	}

	// cudaFree accepts a null pointer, so this is safe even when an
	// allocation above failed.
	cudaFree(d_result);
	cudaFree(d_y);
	cudaFree(d_x);
}
// Entry point for the non-template version.
// test() only handles float here, so it is run once with float literals.
// The earlier "int" and "double" runs called the very same float test under
// misleading labels and produced identical output.
int main()
{
	std::cout << "Test float for type float ..." << std::endl;
	test(2.05f, 10.00f);
	return 0;
}
vs2013+cuda6.5 结果:
 
目录：核心方法 / 模板函数指针 / 不用模板
核心方法：The key to passing function pointers to a CUDA kernel is to use static pointers to device functions, followed by copying the pointers to the host side. Otherwise, I am sure you will...
深度学习CUDA编程干货-kernel的编写和调用
本文由林大佬原创，转载请注明出处，来自腾讯、阿里等一线AI算法工程师组成的QQ交流群欢迎你的加入: 1037662480
上一篇给大家分享了一些CUDA编程的干货，这一篇来夯实一下，我们主要看一些基础的cuda概念。
cuda编程主.
				在CUDA的Driver API中，launch kernel 的函数原型如下:
CUresult CUDAAPI cuLaunchKernel(CUfunction f,
                                unsigned int gridDimX,
                                unsigned int gridDimY,
				cuda的global函数里面可以调用__device__函数，在有特殊需要的时候，还可以把__device__函数作为参数传入到一个__global__函数中
在cuda里面不能像c++那样简单地传入函数的指针，需要在传入前对函数的指针做一些包装。
typedef double(*funcFormat)(int,char);
这里面double表示函数的返回值，int,char是函数的参数列表，所有满足这种格式的函数都可以用这种函数类型指代。
上面的funcFormat是一种函数类型，在这里可以把
				透视变换是图像处理中的一种常用技术，用于模拟相机对图像进行透视投影。在 CUDA 中实现透视变换需要编写一个 GPU 函数，该函数可以并行地处理图像中的每个像素。
以下是一个简单的 CUDA 透视变换的例子：
// Per-pixel kernel: each thread handles the pixel at column i, row j.
//   out, in       - image buffers indexed as j * width + i (row-major,
//                   width * height floats each); presumably device memory,
//                   to be confirmed against the caller
//   width, height - image size in pixels
//   fx, fy        - divisors applied to the centred pixel coordinates
//   cx, cy        - offsets subtracted from the pixel coordinates
// Expects a 2D launch whose grid covers at least width x height threads;
// threads that fall outside the image return without touching memory.
__global__ void perspective_transform(float *out, float *in, int width, int height, float fx, float fy, float cx, float cy)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i >= width || j >= height) return;

    int idx = j * width + i;
    // Centred, scaled coordinates of this pixel.
    float x = (i - cx) / fx;
    float y = (j - cy) / fy;
    // Attenuate the input value by 1 + x^2 + y^2.
    out[idx] = in[idx] / (1 + x*x + y*y);
}
// Launch fragment for perspective_transform.
// NOTE(review): width, height, fx, fy, cx, cy, d_in and d_out are not
// declared anywhere in this snippet; they are assumed to be defined and
// allocated before this point. Confirm before using this as-is.
int main()
{
    // 16 x 16 threads per block; the grid is rounded up (ceil-div) so that
    // it covers the whole width x height image.
    dim3 blockSize(16, 16);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y);
    perspective_transform<<<gridSize, blockSize>>>(d_out, d_in, width, height, fx, fy, cx, cy);
    return 0;
}
这个例子中的 `perspective_transform` 函数是一个 CUDA 内核，它接收输入/输出图像数据、图像的宽度和高度，以及参数 fx、fy、cx、cy。每个线程处理一个像素：先用 (i - cx) / fx 和 (j - cy) / fy 计算该像素的归一化坐标 (x, y)，再把输入像素值除以 1 + x² + y² 后写入输出图像的同一位置，以此作为透视变换的一个简化示例。
请注意，这只是一个简单的透视变换的例子，具体的实现方式可能因应用场景不同而有所差异。
                    没有理由把hist3得到的最后一行和一列去掉吧？目前算出来的值统计了99600多个数，不到100000.
把你定义的intervals改一下，int_x = [0:(1 / K1):1+1/K1]; int_y = [0:(1 / K2):1+1/K2];K1=length(int_x)-1; K2=length(int_y)-1;  
hist3方法无需删除行，可以得到一样的结果。
                mamba-image python 包的安装
                    萝卜8959: 
                    谢谢分享，这个方法有些复杂，我决定用python3.4试试
                图像配准 - 三张灰度图合成彩色图 ECC算法 OpenCV C++/Python实现
                    weixin_47699542: 
                    python代码中的xrange是什么？ 没定义啊？