Asian Call Option Pricing with Trust
Asian Call Options are valued based on their average price over a time-period. This program estimates an Asian Call Option that expires 250 periods from now, based on a stock currently at $50, with a $50 strike, and a volatility of 0.4. It runs 500,000 simulations, and averages their average price to get the price of the option.
Not too realistic, but fun to play with.
#include <vector_functions.h> #include <thrust/host_vector.h> #include <thrust/device_vector.h> #include <thrust/transform_reduce.h> // Mersenne Twister code reproduced & modified from: // http://svn.jcornwall.me.uk/applications/MersenneTwisterGPU/ // http://www.jcornwall.me.uk/2009/04/mersenne-twisters-in-cuda/ #include <cassert> #include <cstdio> #include <vector> #define CUDA_SAFE_CALL(call) \ do { \ cudaError err = call; \ if( cudaSuccess != err) { \ fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ __FILE__, __LINE__, cudaGetErrorString( err) ); \ exit(EXIT_FAILURE); \ } \ } while(0) #define MT_MM 9 #define MT_NN 19 #define MT_WMASK 0xFFFFFFFFU #define MT_UMASK 0xFFFFFFFEU #define MT_LMASK 0x1U #define MT_RNG_COUNT 32768 #define MT_SHIFT0 12 #define MT_SHIFTB 7 #define MT_SHIFTC 15 #define MT_SHIFT1 18 // Record format for MersenneTwister.dat, created by spawnTwisters.c struct mt_struct_stripped { unsigned int matrix_a; unsigned int mask_b; unsigned int mask_c; unsigned int seed; }; // Per-thread state object for a single twister. struct MersenneTwisterState { unsigned int mt[MT_NN]; int iState; unsigned int mti1; }; // Preloaded, offline-generated seed data structure. __device__ static mt_struct_stripped MT[MT_RNG_COUNT]; __device__ unsigned int MersenneTwisterGenerate(MersenneTwisterState &state, unsigned int threadID) { int iState1 = state.iState + 1; int iStateM = state.iState + MT_MM; if(iState1 >= MT_NN) iState1 -= MT_NN; if(iStateM >= MT_NN) iStateM -= MT_NN; unsigned int mti = state.mti1; state.mti1 = state.mt[iState1]; unsigned int mtiM = state.mt[iStateM]; unsigned int x = (mti & MT_UMASK) | (state.mti1 & MT_LMASK); x = mtiM ^ (x >> 1) ^ ((x & 1) ? MT[threadID].matrix_a : 0); state.mt[state.iState] = x; state.iState = iState1; // Tempering transformation. x ^= (x >> MT_SHIFT0); x ^= (x << MT_SHIFTB) & MT[threadID].mask_b; x ^= (x << MT_SHIFTC) & MT[threadID].mask_c; x ^= (x >> MT_SHIFT1); return x; } #define TWISTER_WARM_UP 0 __device__ void MersenneTwisterInitialize(MersenneTwisterState &state, unsigned int threadID) { state.mt[0] = MT[threadID].seed; for(int i = 1; i < MT_NN; ++ i) { state.mt[i] = (1812433253U * (state.mt[i - 1] ^ (state.mt[i - 1] >> 30)) + i) & MT_WMASK; } state.iState = 0; state.mti1 = state.mt[0]; // warm up the twister #if TWISTER_WARM_UP for(int i = 0; i < 10000; ++ i) { MersenneTwisterGenerate(state, threadID); } #endif } // __global__ void TestMersenneTwister(float *outArr, int nNumbers) { unsigned int tid = __mul24(blockIdx.x, blockDim.x) + threadIdx.x; MersenneTwisterState mtState; MersenneTwisterInitialize(mtState, tid); for(int i = tid; i < nNumbers; i += __mul24(blockDim.x, gridDim.x)) { // Make a floating-point number between 0...1 from integer 0...UINT_MAX. outArr[i] = float(MersenneTwisterGenerate(mtState, tid)) / 4294967295.0f; } } // Initialize seeds for the Mersenne Twister void InitializeSeeds() { mt_struct_stripped *mtStripped = new mt_struct_stripped[MT_RNG_COUNT]; FILE *datFile = fopen("MersenneTwister.dat", "rb"); assert(datFile); assert(fread(mtStripped, sizeof(mt_struct_stripped) * MT_RNG_COUNT, 1, datFile)); fclose(datFile); // Seed the structure with low-quality random numbers. Twisters will need "warming up" // before the RNG quality improves. srand(time(0)); for(int i = 0; i < MT_RNG_COUNT; ++ i) { mtStripped[i].seed = rand(); } // Upload the initial configurations to the GPU. CUDA_SAFE_CALL(cudaMemcpyToSymbol(MT, mtStripped, sizeof(mt_struct_stripped) * MT_RNG_COUNT, 0, cudaMemcpyHostToDevice)); delete[] mtStripped; } // Generates random numbers on the GPU void GenerateRandomNumbers(float *randomNumbers, int nRandomNumbers) { // Read offline-generated initial configuration file. float *randomNumbersDev; CUDA_SAFE_CALL(cudaMalloc((void **)&randomNumbersDev, sizeof(float) * nRandomNumbers)); dim3 threads(512, 1); dim3 grid(MT_RNG_COUNT / 512, 1, 1); TestMersenneTwister<<<grid, threads>>>(randomNumbersDev, nRandomNumbers); CUDA_SAFE_CALL(cudaMemcpy(randomNumbers, randomNumbersDev, sizeof(float) * nRandomNumbers, cudaMemcpyDeviceToHost)); CUDA_SAFE_CALL(cudaFree(randomNumbersDev)); } // End Mersenne Twister Code //Box Muller transform #define PI 3.14159265358979f __device__ void BoxMuller(float &u1, float &u2){ float r = sqrtf(-2.0f * logf(u1)); float phi = 2 * PI * u2; u1 = (r * __cosf(phi)); u2 = (r * __sinf(phi)); } struct AsianOptionPricer { int term; float price0; float strike; float volatility; int numPeriods; float deltaT; float invariant; AsianOptionPricer(int t, float s, float v, float rfr, int p) : term(t), strike(s), volatility(v), numPeriods(p) { deltaT = (float)term / numPeriods; invariant = (rfr - volatility * volatility / 2) * deltaT; } __device__ float operator()(float price0) const { __shared__ MersenneTwisterState mtState; unsigned int thrOffset = blockIdx.x * blockDim.x + threadIdx.x; MersenneTwisterInitialize(mtState, thrOffset); float sum = 0; float price = price0; // since we move the price two steps a loop, we only go from 0 to numPeriods/2 for(int i = 0; i < (numPeriods) / 2; i++) { float u1 = float(MersenneTwisterGenerate(mtState, thrOffset)) / 4294967295.0f; float u2 = float(MersenneTwisterGenerate(mtState, thrOffset)) / 4294967295.0f; BoxMuller(u1, u2); //transform uniform into two independent standard normals // move the price a step price = price * __expf(invariant + volatility * u1 * sqrt(deltaT)); sum = sum + price; // move the price a second step price = price * __expf(invariant + volatility * u2 * sqrt(deltaT)); sum = sum + price; } float save = (sum / numPeriods) - strike; if(save < 0) { return 0.0; } else { return save; } } }; int main() { float S0 = 50; int M = 500000; InitializeSeeds(); thrust::device_vector<float> payoff(M); thrust::fill(payoff.begin(), payoff.end(), S0); float r = 0.1; float T = 1; float sigma = 0.4; AsianOptionPricer pricer(T, 50, sigma, r, 250); float sum = thrust::transform_reduce(payoff.begin(), payoff.end(), pricer, 0.0f, thrust::plus<float>()); printf("The price is: %f\n", (sum / M) * exp(-r * T)); return 0; }
Results:
macbook:asian option thrust choffstein$ ./price_asian_option_c++
Simulation with 500000 trials and 250 increments per trial took 14689312 milliseconds.
The price is: 5.59305
macbook:asian option thrust choffstein$ ./price_asian_option
Simulation with 500000 trials and 250 increments per trial took 991164 milliseconds.
The price is: 5.59295
Impressive speedup, no? About 7% of the time of my comparable C++ program (note: not exact replica). One should also note that the speedup is drastically reduced as the number of simulations decreases — most likely due to data copying speeds from to and from the GPU.