纹理内存是只读内存,与常量内存相同的是,纹理内存也缓存在芯片中,因此某些情况下,它能减少对内存的请求并提供更高效的内存宽带。纹理内存专门为那些内存访问模式中存在大量空间局部性的图形应用程序而设计的。在某个计算应用程序中,这意味着一个线程读取的位置可能与邻近线程读取的位置“非常接近”。纹理缓存为了加速访问不连续的地址而设计的。
首先是一个不使用纹理内存的热传导模型。先用到了下面的辅助函数:
#ifndef __ANIM_H__ #define __ANIM_H__ #include <iostream> #include <windows.h> #include <GL/glut.h> struct AnimBitmap { unsigned char *pixels; //pixels是像素数组,大小为w*h*4 int width, height; void *dataBlock; //需要处理的数据块 void (*fAnim)(void*,int); //函数指针fAnim,在glutIdleFunc注册后,空闲时一直执行它(动画) void (*animExit)(void*); //函数指针animExit,在响应退出的时候可以调用这个函数 void (*clickDrag)(void*,int,int,int,int); //函数指针clickDrag,响应拖动的处理 int dragStartX, dragStartY; AnimBitmap( int w, int h, void *d = NULL ) { //构造函数,几个函数指针暂时为NULL width = w; height = h; pixels = new unsigned char[width * height * 4]; dataBlock = d; clickDrag = NULL; } ~AnimBitmap() { delete [] pixels; } unsigned char* get_ptr( void ) const { return pixels; } long image_size( void ) const { return width * height * 4; } void click_drag( void (*f)(void*,int,int,int,int)) { //设置拖动指针的值 clickDrag = f; } void anim_and_exit( void (*f)(void*,int), void(*e)(void*) ) { AnimBitmap** bitmap = get_bitmap_ptr(); *bitmap = this; //让静态变量所指的内容为当前定义的Bitmap fAnim = f; //设置动画函数 animExit = e; //设置退出需要处理的函数 int c=1; char* dummy = ""; glutInit( &c, &dummy ); glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA ); glutInitWindowSize( width, height ); glutCreateWindow( "bitmap" ); glutKeyboardFunc(Key); //注册按键响应 glutDisplayFunc(Draw); //注册画面内容 if (clickDrag != NULL) glutMouseFunc( mouse_func ); //注册拖动处理 glutIdleFunc( idle_func );//当没有窗口事件到达时,glutIdleFunc可以执行后台处理任务或连续动画。它被不断调用,直到有窗口事件发生。 glutMainLoop(); } static AnimBitmap** get_bitmap_ptr( void ) { static AnimBitmap* gBitmap; //静态变量,只第一次执行 return &gBitmap; } static void mouse_func( int button, int state, int mx, int my ) { if (button == GLUT_LEFT_BUTTON) { AnimBitmap* bitmap = *(get_bitmap_ptr()); if (state == GLUT_DOWN) { bitmap->dragStartX = mx; bitmap->dragStartY = my; } else if (state == GLUT_UP) { bitmap->clickDrag( bitmap->dataBlock,bitmap->dragStartX,bitmap->dragStartY,mx, my ); } } } static void idle_func( void ) { static int ticks = 1; AnimBitmap* bitmap = *(get_bitmap_ptr()); bitmap->fAnim( bitmap->dataBlock, ticks++ ); glutPostRedisplay(); } static void Key(unsigned char key, int x, int y) { switch (key) { case 27: AnimBitmap* bitmap = *(get_bitmap_ptr()); bitmap->animExit( bitmap->dataBlock ); //delete bitmap; exit(0); } } static void Draw( void ) { AnimBitmap* bitmap = *(get_bitmap_ptr()); glClearColor( 0.0, 0.0, 0.0, 1.0 ); glClear( GL_COLOR_BUFFER_BIT ); glDrawPixels( bitmap->width, bitmap->height, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels ); glutSwapBuffers(); } }; #endif
这个AnimBitmap.h文件用位图的指针来生成图,并产生动画。
例子中还是用了一个辅助函数,这个函数是为了将浮点数转化成RGBA值:
__device__ unsigned char value( float n1, float n2, int hue ) { if (hue > 360) hue -= 360; else if (hue < 0) hue += 360; if (hue < 60) return (unsigned char)(255 * (n1 + (n2-n1)*hue/60)); if (hue < 180) return (unsigned char)(255 * n2); if (hue < 240) return (unsigned char)(255 * (n1 + (n2-n1)*(240-hue)/60)); return (unsigned char)(255 * n1); } __global__ void float_to_color( unsigned char *optr,const float *outSrc ) { int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int offset = x + y * blockDim.x * gridDim.x; float l = outSrc[offset]; float s = 1; int h = (180 + (int)(360.0f * outSrc[offset])) % 360; float m1, m2; if (l <= 0.5f) m2 = l * (1 + s); else m2 = l + s - l * s; m1 = 2 * l - m2; optr[offset*4 + 0] = value( m1, m2, h+120 ); optr[offset*4 + 1] = value( m1, m2, h ); optr[offset*4 + 2] = value( m1, m2, h -120 ); optr[offset*4 + 3] = 255; }
需要处理的数据块是BlockData,这个数据块是需要处理的,通过这些数据得到最后位图,在退出时候,进行回收。
#include "AnimBitmap.h" #define DIM 1024 #define SPEED 0.25 #define MAX_TEMP 1.0f #define MIN_TEMP 0.0001f __global__ void copy_const_kernel(float *iptr,const float *cptr){ int x=threadIdx.x+blockIdx.x*blockDim.x; int y=threadIdx.y+blockIdx.y*blockDim.y; int offset=x+y*blockDim.x*gridDim.x; //输入缓冲区的线性偏移 if(cptr[offset]!=0) iptr[offset]=cptr[offset]; //当温度非零时,将温度复制到iptr中 } __global__ void blend_kernel(float *outSrc,const float *inSrc){ int x=threadIdx.x+blockIdx.x*blockDim.x; int y=threadIdx.y+blockIdx.y*blockDim.y; int offset=x+y*blockDim.x*gridDim.x; //每个线程与左右上下四点对应 int left=offset-1; int right=offset+1; if(x==0) left++; if(x==DIM-1) right--; int top=offset-DIM; int bottom=offset+DIM; if(y==0) top+=DIM; if(y==DIM-1) bottom-=DIM; //温度传播 outSrc[offset]=inSrc[offset]+SPEED*(inSrc[top]+inSrc[bottom]+inSrc[left]+inSrc[right]-inSrc[offset]*4); } struct DataBlock{ unsigned char* output_bitmap; float *dev_inSrc; float *dev_outSrc; float *dev_constSrc; AnimBitmap *bitmap; cudaEvent_t start,stop; float totalTime; float frames; }; void anim(DataBlock *d,int ticks){ cudaEventRecord(d->start,0); dim3 blocks(DIM/16,DIM/16); dim3 threads(16,16); AnimBitmap *bitmap=d->bitmap; for(int i=0;i<90;i++){ copy_const_kernel<<<blocks,threads>>>(d->dev_inSrc,d->dev_constSrc); blend_kernel<<<blocks,threads>>>(d->dev_outSrc,d->dev_inSrc); float * current=d->dev_inSrc; d->dev_inSrc=d->dev_outSrc; d->dev_outSrc=current; } float_to_color<<<blocks,threads>>>(d->output_bitmap,d->dev_inSrc); cudaMemcpy(bitmap->get_ptr(),d->output_bitmap,bitmap->image_size(),cudaMemcpyDeviceToHost); cudaEventRecord(d->stop,0); cudaEventSynchronize(d->stop); float elapsedTime; cudaEventElapsedTime(&elapsedTime,d->start,d->stop); d->totalTime+=elapsedTime; ++d->frames; printf("Average Time per frame:%3.1f ms\n",d->totalTime/d->frames); } void anim_exit(DataBlock *d){ cudaFree(d->dev_inSrc); cudaFree(d->dev_outSrc); cudaFree(d->dev_constSrc); cudaEventDestroy(d->start); cudaEventDestroy(d->stop); } int main(void){ DataBlock data; AnimBitmap bitmap(DIM,DIM,&data); data.bitmap=&bitmap; data.totalTime=0; data.frames=0; cudaEventCreate(&data.start); cudaEventCreate(&data.stop); cudaMalloc((void**)&data.output_bitmap,bitmap.image_size()); cudaMalloc((void**)&data.dev_inSrc,bitmap.image_size()); cudaMalloc((void**)&data.dev_outSrc,bitmap.image_size()); cudaMalloc((void**)&data.dev_constSrc,bitmap.image_size()); float *temp=(float*)malloc(bitmap.image_size()); for(int i=0;i<DIM*DIM;i++){ temp[i]=0; int x=i%DIM; int y=i/DIM; if((x>300)&&(x<600)&&(y>310)&&(y<601)) temp[i]=MAX_TEMP; } temp[DIM*100+100]=(MAX_TEMP+MIN_TEMP)/2; temp[DIM*700+100]=MIN_TEMP; temp[DIM*300+300]=MIN_TEMP; temp[DIM*200+700]=MIN_TEMP; for(int y=800;y<900;y++) for(int x=400;x<500;x++) temp[x+y*DIM]=MIN_TEMP; cudaMemcpy(data.dev_constSrc,temp,bitmap.image_size(),cudaMemcpyHostToDevice); //dev_constSrc是热源 for(int y=800;y<DIM;y++) for(int x=0;x<200;x++) temp[x+y*DIM]=MAX_TEMP; cudaMemcpy(data.dev_inSrc,temp,bitmap.image_size(),cudaMemcpyHostToDevice);//设置初始温度 free(temp); bitmap.anim_and_exit( (void (*)(void*,int))anim,(void (*)(void *))anim_exit ); return 0; }
接下来使用纹理内存:
温度计算的内存访问模式中存在着巨大的内存空间局部性,这种访问模式可以用GPU纹理内存加速。首先声明纹理内存 texture<float> tex;这个缓存区域分配内存后需要绑定到内存缓冲区。然后,启动核函数时,要用特殊的函数告诉GPU将读取请求转发到纹理内存而不是标准全局内存。当读取内存时不再使用方括号冲缓冲区中读取,而是将blend_kernel()改为tex1Dfetch().
blend_kernel()中又一个参数dstOut告诉那个缓冲区作为输入,哪个作为输出。
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> #include "AnimBitmap.h" #define DIM 1024 #define SPEED 0.25f #define MAX_TEMP 1.0f #define MIN_TEMP 0.0001f texture<float> texConst; texture<float> texIn; texture<float> texOut; __global__ void copy_const_kernel(float *iptr){ int x=threadIdx.x+blockIdx.x*blockDim.x; int y=threadIdx.y+blockIdx.y*blockDim.y; int offset=x+y*blockDim.x*gridDim.x; float c=tex1Dfetch(texConst,offset); if(c>0) iptr[offset]=c; } __global__ void blend_kernel(float *dst,bool dstOut){ int x=threadIdx.x+blockIdx.x*blockDim.x; int y=threadIdx.y+blockIdx.y*blockDim.y; int offset=x+y*blockDim.x*gridDim.x; int left=offset-1; int right=offset+1; if(x==0) left++; if(x==DIM-1) right--; int top=offset-DIM; int bottom=offset+DIM; if(y==0) top+=DIM; if(y==DIM-1) bottom-=DIM; float t,l,c,r,b; if(dstOut){ //true时候,texIn的区域作为输入 t=tex1Dfetch(texIn,top); l=tex1Dfetch(texIn,left); c=tex1Dfetch(texIn,offset); r=tex1Dfetch(texIn,right); b=tex1Dfetch(texIn,bottom); }else{ //false时候,texOut的区域作为输入 t=tex1Dfetch(texOut,top); l=tex1Dfetch(texOut,left); c=tex1Dfetch(texOut,offset); r=tex1Dfetch(texOut,right); b=tex1Dfetch(texOut,bottom); } dst[offset]=c+SPEED*(t+b+r+l-4*c); } struct DataBlock{ unsigned char* output_bitmap; float *dev_inSrc; float *dev_outSrc; float *dev_constSrc; AnimBitmap *bitmap; cudaEvent_t start,stop; float totalTime; float frames; }; __device__ unsigned char value( float n1, float n2, int hue ) { if (hue > 360) hue -= 360; else if (hue < 0) hue += 360; if (hue < 60) return (unsigned char)(255 * (n1 + (n2-n1)*hue/60)); if (hue < 180) return (unsigned char)(255 * n2); if (hue < 240) return (unsigned char)(255 * (n1 + (n2-n1)*(240-hue)/60)); return (unsigned char)(255 * n1); } __global__ void float_to_color( unsigned char *optr,const float *outSrc ) { int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int offset = x + y * blockDim.x * gridDim.x; float l = outSrc[offset]; float s = 1; int h = (180 + (int)(360.0f * outSrc[offset])) % 360; float m1, m2; if (l <= 0.5f) m2 = l * (1 + s); else m2 = l + s - l * s; m1 = 2 * l - m2; optr[offset*4 + 0] = value( m1, m2, h+120 ); optr[offset*4 + 1] = value( m1, m2, h ); optr[offset*4 + 2] = value( m1, m2, h -120 ); optr[offset*4 + 3] = 255; } void anim(DataBlock *d,int ticks){ cudaEventRecord(d->start,0); dim3 blocks(DIM/16,DIM/16); dim3 threads(16,16); AnimBitmap *bitmap=d->bitmap; volatile bool dstOut=true; float *in,*out; for(int i=0;i<90;i++){ if(dstOut){ in=d->dev_inSrc; out=d->dev_outSrc; }else{ out=d->dev_inSrc; in=d->dev_outSrc; } copy_const_kernel<<<blocks,threads>>>(in); blend_kernel<<<blocks,threads>>>(out,dstOut); dstOut=!dstOut; } float_to_color<<<blocks,threads>>>(d->output_bitmap,d->dev_inSrc); cudaMemcpy(bitmap->get_ptr(),d->output_bitmap,bitmap->image_size(),cudaMemcpyDeviceToHost); cudaEventRecord(d->stop,0); cudaEventSynchronize(d->stop); float elapsedTime; cudaEventElapsedTime(&elapsedTime,d->start,d->stop); d->totalTime+=elapsedTime; ++d->frames; printf("Average Time per frame:%3.1f ms\n",d->totalTime/d->frames); } void anim_exit(DataBlock *d){ //清楚绑定 cudaUnbindTexture(texIn); cudaUnbindTexture(texOut); cudaUnbindTexture(texConst); cudaFree(d->dev_inSrc); cudaFree(d->dev_outSrc); cudaFree(d->dev_constSrc); cudaEventDestroy(d->start); cudaEventDestroy(d->stop); } int main(void){ DataBlock data; AnimBitmap bitmap(DIM,DIM,&data); data.bitmap=&bitmap; data.totalTime=0; data.frames=0; cudaEventCreate(&data.start); cudaEventCreate(&data.stop); cudaMalloc((void**)&data.output_bitmap,bitmap.image_size()); cudaMalloc((void**)&data.dev_inSrc,bitmap.image_size()); cudaMalloc((void**)&data.dev_outSrc,bitmap.image_size()); cudaMalloc((void**)&data.dev_constSrc,bitmap.image_size()); //绑定纹理变量到内存的缓冲区 cudaBindTexture(NULL,texConst,data.dev_constSrc,bitmap.image_size()); cudaBindTexture(NULL,texIn,data.dev_inSrc,bitmap.image_size()); cudaBindTexture(NULL,texOut,data.dev_outSrc,bitmap.image_size()); float *temp=(float*)malloc(bitmap.image_size()); for(int i=0;i<DIM*DIM;i++){ temp[i]=0; int x=i%DIM; int y=i/DIM; if((x>300)&&(x<600)&&(y>310)&&(y<601)) temp[i]=MAX_TEMP; } cudaMemcpy(data.dev_constSrc,temp,bitmap.image_size(),cudaMemcpyHostToDevice); cudaMemcpy(data.dev_inSrc,temp,bitmap.image_size(),cudaMemcpyHostToDevice); free(temp); bitmap.anim_and_exit( (void (*)(void*,int))anim,(void (*)(void *))anim_exit ); return 0; }
除了一维纹理,还有二维纹理。使用是类似的。
定义的时候用texture<float,2> tex;同样进行绑定cudaBindTexture2D(NULL,tex,dev_inSrc,desc,DIM,DIM,size)
最后解除绑定。在写核函数时候,这里有一个好处是不用处理边界。
#define DIM 1024 #define PI 3.1415926535897932f #define MAX_TEMP 1.0f #define MIN_TEMP 0.0001f #define SPEED 0.25f // these exist on the GPU side texture<float,2> texConstSrc; texture<float,2> texIn; texture<float,2> texOut; __global__ void blend_kernel( float *dst, bool dstOut ) { // map from threadIdx/BlockIdx to pixel position int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int offset = x + y * blockDim.x * gridDim.x; float t, l, c, r, b; if (dstOut) { t = tex2D(texIn,x,y-1); l = tex2D(texIn,x-1,y); c = tex2D(texIn,x,y); r = tex2D(texIn,x+1,y); b = tex2D(texIn,x,y+1); } else { t = tex2D(texOut,x,y-1); l = tex2D(texOut,x-1,y); c = tex2D(texOut,x,y); r = tex2D(texOut,x+1,y); b = tex2D(texOut,x,y+1); } dst[offset] = c + SPEED * (t + b + r + l - 4 * c); } __global__ void copy_const_kernel( float *iptr ) { // map from threadIdx/BlockIdx to pixel position int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int offset = x + y * blockDim.x * gridDim.x; float c = tex2D(texConstSrc,x,y); if (c != 0) iptr[offset] = c; } // globals needed by the update routine struct DataBlock { unsigned char *output_bitmap; float *dev_inSrc; float *dev_outSrc; float *dev_constSrc; CPUAnimBitmap *bitmap; cudaEvent_t start, stop; float totalTime; float frames; }; void anim_gpu( DataBlock *d, int ticks ) { HANDLE_ERROR( cudaEventRecord( d->start, 0 ) ); dim3 blocks(DIM/16,DIM/16); dim3 threads(16,16); CPUAnimBitmap *bitmap = d->bitmap; // since tex is global and bound, we have to use a flag to // select which is in/out per iteration volatile bool dstOut = true; for (int i=0; i<90; i++) { float *in, *out; if (dstOut) { in = d->dev_inSrc; out = d->dev_outSrc; } else { out = d->dev_inSrc; in = d->dev_outSrc; } copy_const_kernel<<<blocks,threads>>>( in ); blend_kernel<<<blocks,threads>>>( out, dstOut ); dstOut = !dstOut; } float_to_color<<<blocks,threads>>>( d->output_bitmap, d->dev_inSrc ); HANDLE_ERROR( cudaMemcpy( bitmap->get_ptr(), d->output_bitmap, bitmap->image_size(), cudaMemcpyDeviceToHost ) ); HANDLE_ERROR( cudaEventRecord( d->stop, 0 ) ); HANDLE_ERROR( cudaEventSynchronize( d->stop ) ); float elapsedTime; HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, d->start, d->stop ) ); d->totalTime += elapsedTime; ++d->frames; printf( "Average Time per frame: %3.1f ms\n", d->totalTime/d->frames ); } // clean up memory allocated on the GPU void anim_exit( DataBlock *d ) { cudaUnbindTexture( texIn ); cudaUnbindTexture( texOut ); cudaUnbindTexture( texConstSrc ); HANDLE_ERROR( cudaFree( d->dev_inSrc ) ); HANDLE_ERROR( cudaFree( d->dev_outSrc ) ); HANDLE_ERROR( cudaFree( d->dev_constSrc ) ); HANDLE_ERROR( cudaEventDestroy( d->start ) ); HANDLE_ERROR( cudaEventDestroy( d->stop ) ); } int main( void ) { DataBlock data; CPUAnimBitmap bitmap( DIM, DIM, &data ); data.bitmap = &bitmap; data.totalTime = 0; data.frames = 0; HANDLE_ERROR( cudaEventCreate( &data.start ) ); HANDLE_ERROR( cudaEventCreate( &data.stop ) ); int imageSize = bitmap.image_size(); HANDLE_ERROR( cudaMalloc( (void**)&data.output_bitmap, imageSize ) ); // assume float == 4 chars in size (ie rgba) HANDLE_ERROR( cudaMalloc( (void**)&data.dev_inSrc, imageSize ) ); HANDLE_ERROR( cudaMalloc( (void**)&data.dev_outSrc, imageSize ) ); HANDLE_ERROR( cudaMalloc( (void**)&data.dev_constSrc, imageSize ) ); cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>(); HANDLE_ERROR( cudaBindTexture2D( NULL, texConstSrc, data.dev_constSrc, desc, DIM, DIM, sizeof(float) * DIM ) ); HANDLE_ERROR( cudaBindTexture2D( NULL, texIn, data.dev_inSrc, desc, DIM, DIM, sizeof(float) * DIM ) ); HANDLE_ERROR( cudaBindTexture2D( NULL, texOut, data.dev_outSrc, desc, DIM, DIM, sizeof(float) * DIM ) ); // initialize the constant data float *temp = (float*)malloc( imageSize ); for (int i=0; i<DIM*DIM; i++) { temp[i] = 0; int x = i % DIM; int y = i / DIM; if ((x>300) && (x<600) && (y>310) && (y<601)) temp[i] = MAX_TEMP; } temp[DIM*100+100] = (MAX_TEMP + MIN_TEMP)/2; temp[DIM*700+100] = MIN_TEMP; temp[DIM*300+300] = MIN_TEMP; temp[DIM*200+700] = MIN_TEMP; for (int y=800; y<900; y++) { for (int x=400; x<500; x++) { temp[x+y*DIM] = MIN_TEMP; } } HANDLE_ERROR( cudaMemcpy( data.dev_constSrc, temp, imageSize, cudaMemcpyHostToDevice ) ); // initialize the input data for (int y=800; y<DIM; y++) { for (int x=0; x<200; x++) { temp[x+y*DIM] = MAX_TEMP; } } HANDLE_ERROR( cudaMemcpy( data.dev_inSrc, temp, imageSize, cudaMemcpyHostToDevice ) ); free( temp ); bitmap.anim_and_exit( (void (*)(void*,int))anim_gpu, (void (*)(void*))anim_exit ); }
相关推荐
《CUDA by Example》是一本介绍通用目的图形处理单元(GPU)编程的教程书籍,作者是Jason Sanders和Edward Kandrot。本书不仅为读者提供了通过实例学习CUDA编程的机会,也是NVIDIA官方推荐的GPU学习教程。它深入浅出...
由于给定文件的【标题】和【描述】均重复了"CUDA by Example (2010).pdf",而【标签】是"CUDA by Example",这表明文件的内容应该与这本书——《CUDA by Example》——紧密相关。该书由Jason Sanders和Edward ...
《CUDA By Example》中文译名《GPU高性能编程CUDA实战》是研究GPGPU异构并行计算非常不错的工具书。书中给出的代码,非常个别的地方有失误,但是都有人为标注了,而且对不同的编程工具可能需要自己配置链接库。...
《CUDA by Example》是一本深度探讨GPU高性能编程的权威书籍,中文版名为“GPU高性能编程CUDA实战”。本书通过丰富的实例,详细介绍了CUDA编程模型及其在并行计算中的应用。CUDA,全称为Compute Unified Device ...
《CUDA by Example》是一本备受推崇的CUDA编程指南,它为初学者提供了深入理解并掌握CUDA编程技术的宝贵资源。CUDA(Compute Unified Device Architecture)是由NVIDIA推出的一种并行计算平台和编程模型,旨在利用...
《CUDA by Example》是一本专为想要深入了解GPU编程的IT专业人士准备的书籍,特别是对于那些希望通过CUDA技术提高计算性能的开发者。CUDA(Compute Unified Device Architecture)是NVIDIA公司推出的一种并行计算...
cuda by example 中英文版及代码,中文版名称是 GPU高性能编程CUDA实战,含书签
《CUDA By Example》中文译名《GPU高性能编程CUDA实战》 源码 包括 book.h cpu_anim.h cpu_bitmap.h gl_helper.h gpu_anim.h glext.h glut.h
《CUDA by Example》是一本深度解析CUDA编程的权威著作,由NVIDIA公司的专家Jason Katzan和Erik Nijkamp合著。CUDA(Compute Unified Device Architecture)是NVIDIA推出的用于GPU编程的一种并行计算平台和编程模型...
### CUDA by Example: An Introduction to General-Purpose GPU Programming #### 一、概述 《CUDA by Example: An Introduction to General-Purpose GPU Programming》是一本由Jason Sanders和Edward Kandrot共同...
### CUDA by Example: An Introduction to General-Purpose GPU Programming #### 一、概述 《CUDA by Example: An Introduction to General-Purpose GPU Programming》是一本由Jason Sanders和Edward Kandrot共同...
### CUDA by Example: Key Insights and Detailed Analysis #### Introduction to CUDA CUDA (Compute Unified Device Architecture) is a parallel computing platform and application programming interface ...
《CUDA by Example: An Introduction to General-Purpose GPU Programming》是一本深入浅出介绍CUDA编程的书籍,旨在帮助读者理解并掌握GPU(图形处理器)的通用计算能力。CUDA是NVIDIA公司推出的一种编程模型,它...
《CUDA By Example》是一本深度探讨CUDA编程的权威书籍,由NVIDIA官方提供,旨在帮助开发者熟练掌握CUDA编程技术,从而充分利用GPU的并行计算能力。CUDA(Compute Unified Device Architecture)是NVIDIA推出的一种...
《CUDA BY EXAMPLE》是GPU高性能编程领域的一本经典著作,主要介绍了如何利用CUDA技术进行并行计算。CUDA,全称Compute Unified Device Architecture,是NVIDIA公司推出的一种编程模型,允许开发者直接利用图形...
书名《GPU高性能编程 CUDA实战》和《CUDA By Example》都是深入探讨CUDA编程技术的经典著作。CUDA(Compute Unified Device Architecture)是由NVIDIA推出的并行计算平台和编程模型,旨在利用图形处理单元(GPU)的...
《GPU高性能编程CUDA实战(CUDA By Example)》头文件_全书使用的,在163页通过多线程,使用多GPU的地方已完美调试,可运行通过。
介绍GPU编程,给出了很多CUDA编程的示例