Oct 1 2009

Asian Call Option Pricing, Part Deux

Well, it seems that while my code ‘worked,’ it didn’t really ‘work.’ Such are the subtleties of using libraries you don’t wholly understand. Check the comments on my Asian Call Option Pricing post. It is a quick fix to get rid of the threadIndex issue (just use a counting_iterator and set the thread index value to the iterator value modulus MT_RNG_COUNT — not perfect, but it works), but it isn’t quite as easy to refactor away the __shared__ state in the functor, which makes the Mersenne Twister hard to use. But not all is lost, if you check the experimental branch of Thrust, you will find a linear congruential generator that you can use.

You can check some example code on a monte-carlo implementation with Thrust here. Much better than my code!

  • Share/Bookmark

Sep 25 2009

Asian Call Option Pricing with Trust

Asian Call Options are valued based on their average price over a time-period. This program estimates an Asian Call Option that expires 250 periods from now, based on a stock currently at $50, with a $50 strike, and a volatility of 0.4. It runs 500,000 simulations, and averages their average price to get the price of the option.

Not too realistic, but fun to play with.

#include <vector_functions.h>
 
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
 
#include <thrust/transform_reduce.h>
 
// Mersenne Twister code reproduced & modified from: 
//										http://svn.jcornwall.me.uk/applications/MersenneTwisterGPU/
//										http://www.jcornwall.me.uk/2009/04/mersenne-twisters-in-cuda/
#include <cassert>
#include <cstdio>
#include <vector>
 
#define CUDA_SAFE_CALL(call)                                        \
  do {                                                              \
    cudaError err = call;                                           \
    if( cudaSuccess != err) {                                       \
      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
        __FILE__, __LINE__, cudaGetErrorString( err) );             \
      exit(EXIT_FAILURE);                                           \
    }                                                               \
  } while(0)
 
#define MT_MM     9
#define MT_NN     19
#define MT_WMASK  0xFFFFFFFFU
#define MT_UMASK  0xFFFFFFFEU
#define MT_LMASK  0x1U
#define MT_RNG_COUNT 32768
#define MT_SHIFT0 12
#define MT_SHIFTB 7
#define MT_SHIFTC 15
#define MT_SHIFT1 18
 
// Record format for MersenneTwister.dat, created by spawnTwisters.c
struct mt_struct_stripped {
	unsigned int matrix_a;
	unsigned int mask_b;
	unsigned int mask_c;
	unsigned int seed;
};
 
// Per-thread state object for a single twister.
struct MersenneTwisterState {
	unsigned int mt[MT_NN];
	int iState;
	unsigned int mti1;
};
 
// Preloaded, offline-generated seed data structure.
__device__ static mt_struct_stripped MT[MT_RNG_COUNT];
 
__device__ unsigned int MersenneTwisterGenerate(MersenneTwisterState &state, unsigned int threadID) {
	int iState1 = state.iState + 1;
	int iStateM = state.iState + MT_MM;
 
	if(iState1 >= MT_NN) iState1 -= MT_NN;
	if(iStateM >= MT_NN) iStateM -= MT_NN;
 
	unsigned int mti = state.mti1;
	state.mti1 = state.mt[iState1];
	unsigned int mtiM = state.mt[iStateM];
 
	unsigned int x = (mti & MT_UMASK) | (state.mti1 & MT_LMASK);
	x = mtiM ^ (x >> 1) ^ ((x & 1) ? MT[threadID].matrix_a : 0);
	state.mt[state.iState] = x;
	state.iState = iState1;
 
	// Tempering transformation.
	x ^= (x >> MT_SHIFT0);
	x ^= (x << MT_SHIFTB) & MT[threadID].mask_b;
	x ^= (x << MT_SHIFTC) & MT[threadID].mask_c;
	x ^= (x >> MT_SHIFT1);
 
	return x;
}
 
#define TWISTER_WARM_UP 0
 
__device__ void MersenneTwisterInitialize(MersenneTwisterState &state, unsigned int threadID) {
	state.mt[0] = MT[threadID].seed;
	for(int i = 1; i < MT_NN; ++ i) {
		state.mt[i] = (1812433253U * (state.mt[i - 1] ^ (state.mt[i - 1] >> 30)) + i) & MT_WMASK;
	}
 
	state.iState = 0;
	state.mti1 = state.mt[0];
 
	// warm up the twister
        #if TWISTER_WARM_UP
	for(int i = 0; i < 10000; ++ i) {
		MersenneTwisterGenerate(state, threadID);
	}
        #endif
 
}
 
// 
__global__ void TestMersenneTwister(float *outArr, int nNumbers) {
	unsigned int tid = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
 
	MersenneTwisterState mtState;
	MersenneTwisterInitialize(mtState, tid);
 
	for(int i = tid; i < nNumbers; i += __mul24(blockDim.x, gridDim.x)) {
		// Make a floating-point number between 0...1 from integer 0...UINT_MAX.
		outArr[i] = float(MersenneTwisterGenerate(mtState, tid)) / 4294967295.0f;
	}
}
 
// Initialize seeds for the Mersenne Twister
void InitializeSeeds() {
	mt_struct_stripped *mtStripped = new mt_struct_stripped[MT_RNG_COUNT];
 
	FILE *datFile = fopen("MersenneTwister.dat", "rb");
	assert(datFile);
	assert(fread(mtStripped, sizeof(mt_struct_stripped) * MT_RNG_COUNT, 1, datFile));
	fclose(datFile);
 
	// Seed the structure with low-quality random numbers. Twisters will need "warming up"
	// before the RNG quality improves.
	srand(time(0));
	for(int i = 0; i < MT_RNG_COUNT; ++ i) {
		mtStripped[i].seed = rand();
	}
 
	// Upload the initial configurations to the GPU.
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(MT, mtStripped, sizeof(mt_struct_stripped) * MT_RNG_COUNT, 0, cudaMemcpyHostToDevice));
	delete[] mtStripped;
}
 
// Generates random numbers on the GPU
void GenerateRandomNumbers(float *randomNumbers, int nRandomNumbers) {
	// Read offline-generated initial configuration file.
	float *randomNumbersDev;
	CUDA_SAFE_CALL(cudaMalloc((void **)&randomNumbersDev, sizeof(float) * nRandomNumbers));
 
	dim3 threads(512, 1);
	dim3 grid(MT_RNG_COUNT / 512, 1, 1);
 
	TestMersenneTwister<<<grid, threads>>>(randomNumbersDev, nRandomNumbers);
 
	CUDA_SAFE_CALL(cudaMemcpy(randomNumbers, randomNumbersDev, sizeof(float) * nRandomNumbers, cudaMemcpyDeviceToHost));
	CUDA_SAFE_CALL(cudaFree(randomNumbersDev));
}
// End Mersenne Twister Code
 
//Box Muller transform
#define PI 3.14159265358979f
__device__ 
void BoxMuller(float &u1, float &u2){
    float   r = sqrtf(-2.0f * logf(u1));
    float phi = 2 * PI * u2;
 
    u1 = (r * __cosf(phi));
	u2 = (r * __sinf(phi));
}
 
struct AsianOptionPricer {
	int term;
	float price0;
	float strike;
	float volatility;
	int numPeriods;
	float deltaT;
	float invariant;
 
	AsianOptionPricer(int t, float s, float v, float rfr, int p) : 
		term(t),
		strike(s), 
		volatility(v),
		numPeriods(p) {
			deltaT = (float)term / numPeriods;
			invariant = (rfr - volatility * volatility / 2) * deltaT;
	}
 
	__device__
	float operator()(float price0) const {
		__shared__ MersenneTwisterState mtState;
		unsigned int thrOffset = blockIdx.x * blockDim.x + threadIdx.x;
 
		MersenneTwisterInitialize(mtState, thrOffset);
 
		float sum = 0;
		float price = price0;
 
		// since we move the price two steps a loop, we only go from 0 to numPeriods/2
		for(int i = 0; i < (numPeriods) / 2; i++) {
			float u1 = float(MersenneTwisterGenerate(mtState, thrOffset)) / 4294967295.0f;
			float u2 = float(MersenneTwisterGenerate(mtState, thrOffset)) / 4294967295.0f;
 
			BoxMuller(u1, u2); //transform uniform into two independent standard normals
 
			// move the price a step
			price = price * __expf(invariant + volatility * u1 * sqrt(deltaT));
			sum = sum + price;
 
			// move the price a second step
			price = price * __expf(invariant + volatility * u2 * sqrt(deltaT));
			sum = sum + price;
		}
 
		float save = (sum / numPeriods) - strike;
		if(save < 0) {
			return 0.0;
		} else {
			return save;
		}
	}
};
 
int main() {
	float S0 = 50;
	int M = 500000;
 
	InitializeSeeds();
 
	thrust::device_vector<float> payoff(M); 
	thrust::fill(payoff.begin(), payoff.end(), S0);
 
	float r = 0.1;
	float T = 1;
	float sigma = 0.4;
 
	AsianOptionPricer pricer(T, 50, sigma, r, 250);
 
	float sum = thrust::transform_reduce(payoff.begin(), payoff.end(), pricer, 0.0f, thrust::plus<float>());
	printf("The price is: %f\n", (sum / M) * exp(-r * T));
 
	return 0;
}

Results:

macbook:asian option thrust choffstein$ ./price_asian_option_c++
Simulation with 500000 trials and 250 increments per trial took 14689312 milliseconds.
The price is: 5.59305
macbook:asian option thrust choffstein$ ./price_asian_option
Simulation with 500000 trials and 250 increments per trial took 991164 milliseconds.
The price is: 5.59295

Impressive speedup, no? About 7% of the time of my comparable C++ program (note: not exact replica). One should also note that the speedup is drastically reduced as the number of simulations decreases — most likely due to data copying speeds from to and from the GPU.

  • Share/Bookmark