bate's blog

調べたこと実装したことなどを取りとめもなく書きます。

引数とコンスタントメモリ

カーネル引数で渡す場合と、コンスタントメモリを使う場合の実行時間を比較した。
コンスタントメモリを使わない場合: 242.732391 ms
コンスタントメモリを使う場合: 242.835258 ms
差はほとんどなく、実行するたびに数値も変動するため、この差は測定誤差の範囲と考えられる。

#include "cDeviceMemory.h"

#include <windows.h>

#include <cstdio>

#define USE_CONSTANT	0

#if USE_CONSTANT
// Host-side staging copies of the render parameters. CalculateMandelbrot()
// stores its arguments here and then uploads them to the matching
// __constant__ symbols below with cudaMemcpyToSymbol.
float H_THRESHOLD;
int H_LIMIT;
float H_START_X;
float H_END_X;
float H_START_Y;
float H_END_Y;

// Device-side render parameters read by the kernel.
__constant__ float D_THRESHOLD;	// escape threshold compared against each component of z
__constant__ int D_LIMIT;		// maximum number of iterations per pixel
__constant__ float D_START_X;	// coordinate mapped to pixel column 0
__constant__ float D_END_X;		// coordinate mapped to pixel column `width`
__constant__ float D_START_Y;	// coordinate mapped to pixel row 0
__constant__ float D_END_Y;		// coordinate mapped to pixel row `height`

// Renders the Mandelbrot iteration count of every pixel into pBuf, taking the
// render parameters from the D_* constant-memory symbols.
//
// Parameters:
//   pBuf   - device pixel buffer; pixel (ix, iy) is written to
//            pBuf[ix + iy * pitch].
//   pitch  - row stride, used here in RGBQUAD elements.
//            NOTE(review): confirm the caller passes elements, not bytes.
//   width  - image width in pixels.
//   height - image height in pixels.
//
// Launch layout: any 2D grid of 2D blocks. Each thread starts at its global
// (x, y) coordinate and advances by the total number of launched threads per
// axis (grid-stride loop), so pixels beyond the launched grid are still
// rendered and no thread ever writes outside width x height.
//
// NOTE(review): the escape test compares the real and the imaginary part of z
// individually against D_THRESHOLD instead of testing |z|^2. It is kept as-is
// so the rendered image does not change.
__global__ void CalculateMandelbrotAsync(
	RGBQUAD* pBuf, int pitch, int width, int height)
{
	const float dx = (D_END_X - D_START_X) / width;
	const float dy = (D_END_Y - D_START_Y) / height;
	const int strideX = blockDim.x * gridDim.x;
	const int strideY = blockDim.y * gridDim.y;

	for(int iy = threadIdx.y + blockIdx.y * blockDim.y; iy < height; iy += strideY) {
		const float y = D_START_Y + dy * iy;

		for(int ix = threadIdx.x + blockIdx.x * blockDim.x; ix < width; ix += strideX) {
			const float x = D_START_X + dx * ix;

			// Iterate z = z^2 + c, starting from z = 0 with c = (x, y).
			float zr = 0.0f;
			float zi = 0.0f;
			int i = 0;
			for(; i < D_LIMIT; ++i) {
				const float zr0 = zr * zr - zi * zi + x;
				if(D_THRESHOLD < zr0) {
					break;
				}
				const float zi0 = 2.0f * zr * zi + y;
				if(D_THRESHOLD < zi0) {
					break;
				}
				zr = zr0;
				zi = zi0;
			}

			// Scale the iteration count to the 24-bit range 0..0xFFFFFF.
			// The product is formed in 64 bits: 0xFFFFFF * i no longer fits
			// in a 32-bit int once i reaches 128. A non-positive limit means
			// no iteration ran, so the depth is 0 (also avoids dividing by 0).
			const DWORD depth = (0 < D_LIMIT)
				? (DWORD)((0xFFFFFFull * i) / D_LIMIT)
				: 0;

			// Split the depth into three bytes, lowest byte first
			// (the first RGBQUAD member is rgbBlue in the Win32 definition).
			RGBQUAD rgb = {
				(BYTE)(depth & 0xFF),
				(BYTE)((depth >> 8) & 0xFF),
				(BYTE)((depth >> 16) & 0xFF),
				0};
			pBuf[ix + iy * pitch] = rgb;
		}
	}
}
#else
// Renders the Mandelbrot iteration count of every pixel into pBuf, taking all
// render parameters as kernel arguments.
//
// Parameters:
//   pBuf      - device pixel buffer; pixel (ix, iy) is written to
//               pBuf[ix + iy * pitch].
//   pitch     - row stride, used here in RGBQUAD elements.
//               NOTE(review): confirm the caller passes elements, not bytes.
//   width     - image width in pixels.
//   height    - image height in pixels.
//   threshold - escape threshold compared against each component of z.
//   limit     - maximum number of iterations per pixel.
//   startX    - coordinate mapped to pixel column 0.
//   endX      - coordinate mapped to pixel column `width`.
//   startY    - coordinate mapped to pixel row 0.
//   endY      - coordinate mapped to pixel row `height`.
//
// Launch layout: any 2D grid of 2D blocks. Each thread starts at its global
// (x, y) coordinate and advances by the total number of launched threads per
// axis (grid-stride loop), so pixels beyond the launched grid are still
// rendered and no thread ever writes outside width x height.
//
// NOTE(review): the escape test compares the real and the imaginary part of z
// individually against `threshold` instead of testing |z|^2. It is kept as-is
// so the rendered image does not change.
__global__ void CalculateMandelbrotAsync(
	RGBQUAD* pBuf, int pitch, int width, int height, float threshold, int limit,
	float startX, float endX, float startY, float endY)
{
	const float dx = (endX - startX) / width;
	const float dy = (endY - startY) / height;
	const int strideX = blockDim.x * gridDim.x;
	const int strideY = blockDim.y * gridDim.y;

	for(int iy = threadIdx.y + blockIdx.y * blockDim.y; iy < height; iy += strideY) {
		const float y = startY + dy * iy;

		for(int ix = threadIdx.x + blockIdx.x * blockDim.x; ix < width; ix += strideX) {
			const float x = startX + dx * ix;

			// Iterate z = z^2 + c, starting from z = 0 with c = (x, y).
			float zr = 0.0f;
			float zi = 0.0f;
			int i = 0;
			for(; i < limit; ++i) {
				const float zr0 = zr * zr - zi * zi + x;
				if(threshold < zr0) {
					break;
				}
				const float zi0 = 2.0f * zr * zi + y;
				if(threshold < zi0) {
					break;
				}
				zr = zr0;
				zi = zi0;
			}

			// Scale the iteration count to the 24-bit range 0..0xFFFFFF.
			// The product is formed in 64 bits: 0xFFFFFF * i no longer fits
			// in a 32-bit int once i reaches 128. A non-positive limit means
			// no iteration ran, so the depth is 0 (also avoids dividing by 0).
			const DWORD depth = (0 < limit)
				? (DWORD)((0xFFFFFFull * i) / limit)
				: 0;

			// Split the depth into three bytes, lowest byte first
			// (the first RGBQUAD member is rgbBlue in the Win32 definition).
			RGBQUAD rgb = {
				(BYTE)(depth & 0xFF),
				(BYTE)((depth >> 8) & 0xFF),
				(BYTE)((depth >> 16) & 0xFF),
				0};
			pBuf[ix + iy * pitch] = rgb;
		}
	}
}
#endif

// Host entry point: launches CalculateMandelbrotAsync on the buffer owned by
// deviceMem.
//
// Parameters:
//   deviceMem - wrapper whose getBuffer() result is used as the RGBQUAD
//               output buffer handed to the kernel.
//   pitch     - row stride forwarded to the kernel unchanged.
//   width     - image width in pixels.
//   height    - image height in pixels.
//   threshold - escape threshold.
//   limit     - maximum iteration count.
//   startX, endX, startY, endY - coordinate range forwarded to the kernel.
//
// When USE_CONSTANT is non-zero the render parameters are uploaded to the D_*
// constant-memory symbols first; otherwise they are passed as kernel
// arguments.
//
// The function does not synchronize with the device: it returns once the
// kernel has been queued. CUDA errors are reported on stderr because the
// void signature offers no other channel to the caller.
void CalculateMandelbrot(
	cDeviceMemory& deviceMem,
	int pitch, int width, int height, float threshold, int limit,
	float startX, float endX, float startY, float endY)
{
	// Nothing to draw; also keeps the grid dimensions below from being
	// derived from a non-positive size.
	if(width <= 0 || height <= 0) {
		return;
	}

	RGBQUAD* pBuf = (RGBQUAD*)deviceMem.getBuffer();
	if(!pBuf) {
		std::fprintf(stderr, "CalculateMandelbrot: device buffer is null\n");
		return;
	}

	// 32 x 16 = 512 threads per block.
	const dim3 block(32, 16);
	// NOTE(review): the grid is sized with floor division, exactly as before.
	// Rounding up is only safe if the kernel bounds-checks its pixel
	// coordinates - confirm before changing this. An image smaller than one
	// block yields a zero-sized grid; the launch is then rejected and
	// reported by the cudaGetLastError() check below.
	const dim3 grid(width / block.x, height / block.y);

#if USE_CONSTANT
	H_THRESHOLD = threshold;
	H_LIMIT = limit;
	H_START_X = startX;
	H_END_X = endX;
	H_START_Y = startY;
	H_END_Y = endY;

	// Upload every parameter; stop at the first failing copy.
	cudaError_t copyErr = cudaMemcpyToSymbol(D_THRESHOLD, &H_THRESHOLD, sizeof(H_THRESHOLD));
	if(copyErr == cudaSuccess) {
		copyErr = cudaMemcpyToSymbol(D_LIMIT, &H_LIMIT, sizeof(H_LIMIT));
	}
	if(copyErr == cudaSuccess) {
		copyErr = cudaMemcpyToSymbol(D_START_X, &H_START_X, sizeof(H_START_X));
	}
	if(copyErr == cudaSuccess) {
		copyErr = cudaMemcpyToSymbol(D_END_X, &H_END_X, sizeof(H_END_X));
	}
	if(copyErr == cudaSuccess) {
		copyErr = cudaMemcpyToSymbol(D_START_Y, &H_START_Y, sizeof(H_START_Y));
	}
	if(copyErr == cudaSuccess) {
		copyErr = cudaMemcpyToSymbol(D_END_Y, &H_END_Y, sizeof(H_END_Y));
	}
	if(copyErr != cudaSuccess) {
		// Do not launch with stale or partially updated parameters.
		std::fprintf(stderr, "CalculateMandelbrot: cudaMemcpyToSymbol failed: %s\n",
			cudaGetErrorString(copyErr));
		return;
	}

	CalculateMandelbrotAsync<<< grid, block >>>(pBuf, pitch, width, height);
#else
	CalculateMandelbrotAsync<<< grid, block >>>(pBuf, pitch, width, height, threshold, limit, startX, endX, startY, endY);
#endif

	// A kernel launch returns no status itself; configuration errors are
	// only visible through cudaGetLastError().
	const cudaError_t launchErr = cudaGetLastError();
	if(launchErr != cudaSuccess) {
		std::fprintf(stderr, "CalculateMandelbrot: kernel launch failed: %s\n",
			cudaGetErrorString(launchErr));
	}
}