Oct
1
2009
Well, it seems that while my code ‘worked,’ it didn’t really ‘work.’ Such are the subtleties of using libraries you don’t wholly understand. Check the comments on my Asian Call Option Pricing post. It is a quick fix to get rid of the threadIndex issue (just use a counting_iterator and set the thread index value to the iterator value modulus MT_RNG_COUNT — not perfect, but it works), but it isn’t quite as easy to refactor away the __shared__ state in the functor, which makes the Mersenne Twister hard to use. But not all is lost, if you check the experimental branch of Thrust, you will find a linear congruential generator that you can use.
You can check some example code on a monte-carlo implementation with Thrust here. Much better than my code!
no comments | tags: gpu, thrust | posted in gpgpu, programming
Sep
25
2009
Asian Call Options are valued based on their average price over a time-period. This program estimates an Asian Call Option that expires 250 periods from now, based on a stock currently at $50, with a $50 strike, and a volatility of 0.4. It runs 500,000 simulations, and averages their average price to get the price of the option.
Not too realistic, but fun to play with.
#include <vector_functions.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
// Mersenne Twister code reproduced & modified from:
// http://svn.jcornwall.me.uk/applications/MersenneTwisterGPU/
// http://www.jcornwall.me.uk/2009/04/mersenne-twisters-in-cuda/
#include <cassert>
#include <cstdio>
#include <vector>
#define CUDA_SAFE_CALL(call) \
do { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} \
} while(0)
#define MT_MM 9
#define MT_NN 19
#define MT_WMASK 0xFFFFFFFFU
#define MT_UMASK 0xFFFFFFFEU
#define MT_LMASK 0x1U
#define MT_RNG_COUNT 32768
#define MT_SHIFT0 12
#define MT_SHIFTB 7
#define MT_SHIFTC 15
#define MT_SHIFT1 18
// Record format for MersenneTwister.dat, created by spawnTwisters.c
struct mt_struct_stripped {
unsigned int matrix_a;
unsigned int mask_b;
unsigned int mask_c;
unsigned int seed;
};
// Per-thread state object for a single twister.
struct MersenneTwisterState {
unsigned int mt[MT_NN];
int iState;
unsigned int mti1;
};
// Preloaded, offline-generated seed data structure.
__device__ static mt_struct_stripped MT[MT_RNG_COUNT];
__device__ unsigned int MersenneTwisterGenerate(MersenneTwisterState &state, unsigned int threadID) {
int iState1 = state.iState + 1;
int iStateM = state.iState + MT_MM;
if(iState1 >= MT_NN) iState1 -= MT_NN;
if(iStateM >= MT_NN) iStateM -= MT_NN;
unsigned int mti = state.mti1;
state.mti1 = state.mt[iState1];
unsigned int mtiM = state.mt[iStateM];
unsigned int x = (mti & MT_UMASK) | (state.mti1 & MT_LMASK);
x = mtiM ^ (x >> 1) ^ ((x & 1) ? MT[threadID].matrix_a : 0);
state.mt[state.iState] = x;
state.iState = iState1;
// Tempering transformation.
x ^= (x >> MT_SHIFT0);
x ^= (x << MT_SHIFTB) & MT[threadID].mask_b;
x ^= (x << MT_SHIFTC) & MT[threadID].mask_c;
x ^= (x >> MT_SHIFT1);
return x;
}
#define TWISTER_WARM_UP 0
__device__ void MersenneTwisterInitialize(MersenneTwisterState &state, unsigned int threadID) {
state.mt[0] = MT[threadID].seed;
for(int i = 1; i < MT_NN; ++ i) {
state.mt[i] = (1812433253U * (state.mt[i - 1] ^ (state.mt[i - 1] >> 30)) + i) & MT_WMASK;
}
state.iState = 0;
state.mti1 = state.mt[0];
// warm up the twister
#if TWISTER_WARM_UP
for(int i = 0; i < 10000; ++ i) {
MersenneTwisterGenerate(state, threadID);
}
#endif
}
//
__global__ void TestMersenneTwister(float *outArr, int nNumbers) {
unsigned int tid = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
MersenneTwisterState mtState;
MersenneTwisterInitialize(mtState, tid);
for(int i = tid; i < nNumbers; i += __mul24(blockDim.x, gridDim.x)) {
// Make a floating-point number between 0...1 from integer 0...UINT_MAX.
outArr[i] = float(MersenneTwisterGenerate(mtState, tid)) / 4294967295.0f;
}
}
// Initialize seeds for the Mersenne Twister
void InitializeSeeds() {
mt_struct_stripped *mtStripped = new mt_struct_stripped[MT_RNG_COUNT];
FILE *datFile = fopen("MersenneTwister.dat", "rb");
assert(datFile);
assert(fread(mtStripped, sizeof(mt_struct_stripped) * MT_RNG_COUNT, 1, datFile));
fclose(datFile);
// Seed the structure with low-quality random numbers. Twisters will need "warming up"
// before the RNG quality improves.
srand(time(0));
for(int i = 0; i < MT_RNG_COUNT; ++ i) {
mtStripped[i].seed = rand();
}
// Upload the initial configurations to the GPU.
CUDA_SAFE_CALL(cudaMemcpyToSymbol(MT, mtStripped, sizeof(mt_struct_stripped) * MT_RNG_COUNT, 0, cudaMemcpyHostToDevice));
delete[] mtStripped;
}
// Generates random numbers on the GPU
void GenerateRandomNumbers(float *randomNumbers, int nRandomNumbers) {
// Read offline-generated initial configuration file.
float *randomNumbersDev;
CUDA_SAFE_CALL(cudaMalloc((void **)&randomNumbersDev, sizeof(float) * nRandomNumbers));
dim3 threads(512, 1);
dim3 grid(MT_RNG_COUNT / 512, 1, 1);
TestMersenneTwister<<<grid, threads>>>(randomNumbersDev, nRandomNumbers);
CUDA_SAFE_CALL(cudaMemcpy(randomNumbers, randomNumbersDev, sizeof(float) * nRandomNumbers, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(randomNumbersDev));
}
// End Mersenne Twister Code
//Box Muller transform
#define PI 3.14159265358979f
__device__
void BoxMuller(float &u1, float &u2){
float r = sqrtf(-2.0f * logf(u1));
float phi = 2 * PI * u2;
u1 = (r * __cosf(phi));
u2 = (r * __sinf(phi));
}
struct AsianOptionPricer {
int term;
float price0;
float strike;
float volatility;
int numPeriods;
float deltaT;
float invariant;
AsianOptionPricer(int t, float s, float v, float rfr, int p) :
term(t),
strike(s),
volatility(v),
numPeriods(p) {
deltaT = (float)term / numPeriods;
invariant = (rfr - volatility * volatility / 2) * deltaT;
}
__device__
float operator()(float price0) const {
__shared__ MersenneTwisterState mtState;
unsigned int thrOffset = blockIdx.x * blockDim.x + threadIdx.x;
MersenneTwisterInitialize(mtState, thrOffset);
float sum = 0;
float price = price0;
// since we move the price two steps a loop, we only go from 0 to numPeriods/2
for(int i = 0; i < (numPeriods) / 2; i++) {
float u1 = float(MersenneTwisterGenerate(mtState, thrOffset)) / 4294967295.0f;
float u2 = float(MersenneTwisterGenerate(mtState, thrOffset)) / 4294967295.0f;
BoxMuller(u1, u2); //transform uniform into two independent standard normals
// move the price a step
price = price * __expf(invariant + volatility * u1 * sqrt(deltaT));
sum = sum + price;
// move the price a second step
price = price * __expf(invariant + volatility * u2 * sqrt(deltaT));
sum = sum + price;
}
float save = (sum / numPeriods) - strike;
if(save < 0) {
return 0.0;
} else {
return save;
}
}
};
int main() {
float S0 = 50;
int M = 500000;
InitializeSeeds();
thrust::device_vector<float> payoff(M);
thrust::fill(payoff.begin(), payoff.end(), S0);
float r = 0.1;
float T = 1;
float sigma = 0.4;
AsianOptionPricer pricer(T, 50, sigma, r, 250);
float sum = thrust::transform_reduce(payoff.begin(), payoff.end(), pricer, 0.0f, thrust::plus<float>());
printf("The price is: %f\n", (sum / M) * exp(-r * T));
return 0;
}
Results:
macbook:asian option thrust choffstein$ ./price_asian_option_c++
Simulation with 500000 trials and 250 increments per trial took 14689312 milliseconds.
The price is: 5.59305
macbook:asian option thrust choffstein$ ./price_asian_option
Simulation with 500000 trials and 250 increments per trial took 991164 milliseconds.
The price is: 5.59295
Impressive speedup, no? About 7% of the time of my comparable C++ program (note: not exact replica). One should also note that the speedup is drastically reduced as the number of simulations decreases — most likely due to data copying speeds from to and from the GPU.
1 comment | tags: CUDA, gpu, options, pricing, thrust | posted in gpgpu, programming
Sep
18
2009
Bored. Wrote a numerical integrator using Thrust for fun.
Solves, numerically, the integral from 0 to 1 for (x^2)*cos(x).

Analytically, we can solve and get: 2*cos(1) – sin(1) = 0.239133627
So, the code is as follows:
#include <vector_functions.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/transform.h>
#include <thrust/count.h>
#include <cstdlib>
#include <math.h>
float2 bottomLeft;
float2 topRight;
__device__
float f(float x) {
return (x*x*__cosf(x));
}
struct above_below_function {
__device__
float operator()(float2 p) const {
return (p.y < f(p.x)) ? 1 : 0;
}
};
float2 make_random_float2(){
return make_float2( (rand() / (float)RAND_MAX) * (float)(topRight.x - bottomLeft.x) + (float)bottomLeft.x,
(rand() / (float)RAND_MAX) * (float)(topRight.y - bottomLeft.y) + (float)bottomLeft.y);
}
int main() {
srand(time(0));
bottomLeft.x = 0;
bottomLeft.y = 0;
topRight.x = 1;
topRight.y = 1;
int numPoints = 10000000;
// find the area of our bounding square
float area = (topRight.x - bottomLeft.x) * (topRight.y - bottomLeft.y);
// generate our points
thrust::host_vector<float2> h_locations(numPoints);
thrust::generate(h_locations.begin(), h_locations.end(), make_random_float2);
// create two device vectors -- one for our locations, and another to create a count
thrust::device_vector<float2> d_locations = h_locations;
thrust::device_vector<unsigned int> d_boolean(numPoints);
// for each element, if the y is below f(x), then put a 1 in the array. otherwise, put a 0
thrust::transform(d_locations.begin(), d_locations.end(), d_boolean.begin(), above_below_function());
int total = thrust::count(d_boolean.begin(), d_boolean.end(), 1);
std::cout << "Area under curve: " << area * (total / (float)numPoints) << std::endl;
return 0;
}
The solutions? Here are a few sample runs…
Area under curve: 0.239177
Area under curve: 0.238916
Area under curve: 0.239181
Area under curve: 0.239129
Pretty accurate, and pretty damn fast too.
5 comments | tags: C++, gpgpu, monte-carlo, numerical methods | posted in gpgpu, programming
Jun
15
2009
Thought I would have a bit of fun to see if I could get CUDA and Thrust working with Ruby. Since no CUDA bindings exist, and I didn’t want to give up the power of Thrust, I decided to emply RubyInline to create wrappers. Got it a nice little demo working in under 30 minutes.
vectors.cu
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <iostream>
#include "vector_test.h"
extern "C" int vector_test() {
thrust::host_vector H(4);
H[0] = 14;
H[1] = 20;
H[2] = 38;
H[3] = 46;
std::cout << "H has size "<< H.size() << std::endl;
for(int i = 0; i < H.size(); i++)
std::cout << "H[" << i << "] = " << H[i] << std::endl;
H.resize(2);
std::cout << "H now has size " << H.size() << std::endl;
thrust::device_vector D = H;
D[0] = 99;
D[1] = 88;
for(int i = 0; i < D.size(); i++)
std::cout << "D[" << i << "] = " << D[i] << std::endl;
return 0;
}
vector_test.h
extern "C" int vector_test();
Compile a shared library: nvcc –shared -o libvectors.so vectors.cu -I/usr/local/cuda/include -L/usr/local/cuda/lib -lcudart
cuda_inline.rb
require 'rubygems'
require 'inline'
class MyWrapper
inline(:C) do |builder|
builder.include '"vector_test.h"'
builder.add_compile_flags '-x c++', '-lstdc++', '-L/usr/local/cuda/lib', '-L.', '-lvectors', '-I.'
builder.c '
void wrapper() {
vector_test();
}'
end
end
t = MyWrapper.new()
t.wrapper
Note here that I have to specify for RubyInline to look in the current directory for headers and libraries!
So, as long as libvectors.so and vector_test.h is in the directory you are running cuda_inline.rb from, everything is gravy! Maybe not the most efficient tool-chain, but it works!
$ ruby cuda_inline.rb
ld warning: in ./libvectors.so, file is not of required architecture
H has size 4
H[0] = 14
H[1] = 20
H[2] = 38
H[3] = 46
H now has size 2
D[0] = 99
D[1] = 88
no comments | tags: CUDA, gpgpu, ruby, RubyInline, thrust | posted in gpgpu, programming