pycuda._driver.LogicError: cuModuleLoadDataEx failed: device kernel image is invali,
我出现错误的原因是运行pycuda的核函数中的arch
mod = SourceModule('''
#define INTER_RESIZE_COEF_BITS 11
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
typedef unsigned char uint8_t;
static __inline__ __device__ int limit(int value, int low, int high){
return value < low ? low : (value > high ? high : value);
}
static __inline__ __device__ int resize_cast(int value){
return (value + (1 << (CAST_BITS - 1))) >> CAST_BITS;
}
__global__ void resize_bilinear_and_normalize_kernel(
uint8_t* src, int src_line_size, int src_width, int src_height,
uint8_t* dst, int dst_line_size, int dst_width, int dst_height,
float sx, float sy, int edge
){
int position = blockDim.x * blockIdx.x + threadIdx.x;
if (position >= edge) return;
int dx = position % dst_width;
int dy = position / dst_width;
float src_x = (dx + 0.5f) * sx - 0.5f;
float src_y = (dy + 0.5f) * sy - 0.5f;
int y_low = floorf(src_y);
int x_low = floorf(src_x);
int y_high = limit(y_low + 1, 0, src_height - 1);
int x_high = limit(x_low + 1, 0, src_width - 1);
y_low = limit(y_low, 0, src_height - 1);
x_low = limit(x_low, 0, src_width - 1);
int ly = rint((src_y - y_low) * INTER_RESIZE_COEF_SCALE);
int lx = rint((src_x - x_low) * INTER_RESIZE_COEF_SCALE);
int hy = INTER_RESIZE_COEF_SCALE - ly;
int hx = INTER_RESIZE_COEF_SCALE - lx;
int w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
uint8_t* v1 = src + y_low * src_line_size + x_low * 3;
uint8_t* v2 = src + y_low * src_line_size + x_high * 3;
uint8_t* v3 = src + y_high * src_line_size + x_low * 3;
uint8_t* v4 = src + y_high * src_line_size + x_high * 3;
uint8_t* output_ptr = dst + dy * dst_line_size + dx * 3;
output_ptr[0] = resize_cast(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0]);
output_ptr[1] = resize_cast(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1]);
output_ptr[2] = resize_cast(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2]);
}
''', arch="sm_80")
对arch改变为你现在的显卡算力