#include "cuda_runtime.h" #include "device_launch_parameters.h" #include #include #include #include // the number of home pits per player const int HOME_PITS = 6; // the length of the sequence of pit to sow to for each player const int SEQUENCE_LENGTH = 2 * HOME_PITS + 1; // the total number of pits on a board, including stores const int BOARD_SIZE = 2 * (HOME_PITS + 1); /************************************************************************ * Lookup tables to help minimize the conditionals in the playout code. * Each one has a copy initialized on the host and a copy on the GPU. * These don't change, so we load them into constant memory (but where * they are doesn't matter much b/c they all fit in cache). * Step 1 in program is to copy from the host copy to the device copy. ************************************************************************ */ // the sequence of pits to sow in for each player, // starting the from pit for each player's 1st pit const int sequence[2][SEQUENCE_LENGTH] = { { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0 }, { 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 7 } }; __constant__ int dSequence[2][SEQUENCE_LENGTH]; // the pit opposite each one const int opposite[BOARD_SIZE] = { 12, 11, 10, 9, 8, 7, 13, 5, 4, 3, 2, 1, 0, 6 }; __constant__ int dOpposite[BOARD_SIZE]; // which pits are stores const int store[BOARD_SIZE] = { 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1 }; __constant__ int dStore[BOARD_SIZE]; // which pits are owned by which players const int owner[BOARD_SIZE] = { 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 }; __constant__ int dOwner[BOARD_SIZE]; // number of 1 bits in the binary representation of 0,...,63 const int popCount[1 << HOME_PITS] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6 }; __constant__ int dPopCount[1 << HOME_PITS]; int legalMoves[1 << HOME_PITS][HOME_PITS]; __constant__ int dLegalMoves[1 << HOME_PITS][HOME_PITS]; // the starting game confuration, given as number of seeds in pits counterclockwise around the board, // the index of the current player (0/1), and the number of seeds in the home pits for the players, // and a random number seed (to be overwritten when copied) const int start[] = { 4, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 4, 4, 0, 0, 24, 24, 0 }; const int stateSize = sizeof(start) / sizeof(const int); const int TURN = 14; const int P1_SEEDS = 15; const int P2_SEEDS = 16; const int RANDOM_SEED = 17; const int iterations = 1000; const int BLOCKS = 16; const int THREADS = 512; const int N = BLOCKS * THREADS; /* Space to hold current position for each thread; made this separate * to enable experimentation with where it is allocated * (global/constant/shared). */ __device__ int scratch[N * stateSize]; cudaError_t cudaInit(int **dStates, int n); cudaError_t cudaPlayout(int *states, int *dStates, int n); __global__ void playoutKernel(int *results); void cpuPlayout(int *, int id); void printPosition(int *); int main(int argc, char **argv) { // Initialize legalMoves programatically rather than hard-coding // just because it is large and I didn't want to type that much. // 1st index in array is a bitmap of legal moves, // 2nd index is the 1 bit to select from that // (0 = least significant bit set); // corresponding entry is the index of the move. // So legalMoves[49] = {0, 4, 5} and so legalMoves[49][1] = 4 // b/c 49 = 110001 and the 1 bits are in the 2^0, 2^4, and 2^5 places // so to randomly select a move, we can // 1) make binary number with 1s in same places as non-empty pits // 2) count the number of legal moves using popCount // 3) get a random number in that range // 4) look up the pit to select from legalMoves for (int bits = 0; bits < 64; bits++) { int curr = 0; for (int p = 0, bit = 1; p < HOME_PITS; p++, bit <<= 1) { if ((bits & bit) != 0) { legalMoves[bits][curr] = p; curr++; } } } int *dStates = 0; cudaInit(&dStates, N); // copy lookup tables to device // allocate space for end result of random playouts int *results = (int *)malloc(stateSize * N * sizeof(int)); time_t startTime, endTime; time(&startTime); for (int i = 0; i < iterations; i++) { for (int j = 0; j < stateSize; j++) { results[j] = start[j]; } results[stateSize - 1] = rand(); if (argc > 1 && strcmp(argv[1], "-cpu") == 0) { for (int c = N - 1; c >= 0; c--) { cpuPlayout(results, c); if (argc > 2 && strcmp(argv[2], "-output") == 0) { printPosition(results + stateSize * c); } } } if (argc > 1 && strcmp(argv[1], "-gpu") == 0) { // do N playouts simultaneously on GPU cudaPlayout(results, dStates, N); if (argc > 2 && strcmp(argv[2], "-output") == 0) { for (int c = 0; c < N; c++) { printPosition(results + stateSize * c); } } } } time(&endTime); free(results); double elapsed = difftime(endTime, startTime); printf("%f\n", elapsed / (N * iterations)); // cudaDeviceReset must be called before exiting in order for profiling and // tracing tools such as Nsight and Visual Profiler to show complete traces. cudaFree(dStates); cudaError_t cudaStatus = cudaDeviceReset(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaDeviceReset failed!"); return 1; } return 0; } cudaError_t cudaInit(int **dStates, int n) { // choose which GPU to run on; change this on a multi-GPU system cudaError_t cudaStatus = cudaSetDevice(0); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); goto Error; } // allocate GPU buffers for initial states cudaStatus = cudaMalloc((void**)dStates, n * stateSize * sizeof(int)); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc failed!"); goto Error; } cudaMemcpyToSymbol(dSequence, sequence, sizeof(sequence)); cudaMemcpyToSymbol(dOpposite, opposite, sizeof(opposite)); cudaMemcpyToSymbol(dOwner, owner, sizeof(owner)); cudaMemcpyToSymbol(dStore, store, sizeof(store)); cudaMemcpyToSymbol(dPopCount, popCount, sizeof(popCount)); cudaMemcpyToSymbol(dLegalMoves, legalMoves, sizeof(legalMoves)); return cudaStatus; Error: cudaFree(dStates); return cudaStatus; } cudaError_t cudaPlayout(int *results, int *dStates, int N) { cudaError_t cudaStatus; // copy initial state to GPU memory cudaStatus = cudaMemcpy(dStates, results, stateSize * sizeof(int), cudaMemcpyHostToDevice); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; } // run kernel with lots of threads (more than cores on GPU to hide latency) playoutKernel<<>>(dStates); // Check for any errors launching the kernel cudaStatus = cudaGetLastError(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); goto Error; } // cudaDeviceSynchronize waits for the kernel to finish, and returns // any errors encountered during the launch. cudaStatus = cudaDeviceSynchronize(); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus); goto Error; } // copy results from GPU memory back to host memory cudaStatus = cudaMemcpy(results, dStates, N * stateSize * sizeof(int), cudaMemcpyDeviceToHost); if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy failed!"); goto Error; } return cudaStatus; Error: cudaFree(dStates); return cudaStatus; } /****************************************************** * This is the code that runs on the GPU. ****************************************************** */ __global__ void playoutKernel(int *results) { int id = threadIdx.x + blockIdx.x * THREADS; // copy from global to scratch (wherever we decided to put that) int *output = results + (id * stateSize); int *state = scratch + (id * stateSize); for (int i = 0; i < stateSize; i++) { state[i] = results[i]; } // start pseudorandom number generator in diff place for each thread int rand = state[RANDOM_SEED] + id; while (state[P1_SEEDS] > 0 && state[P2_SEEDS] > 0) { // get whose turn it is and their 1st pit int turn = state[TURN]; int firstPit = turn * (HOME_PITS + 1); // get legal moves int nonEmptyPits = 0; // a bitmap of the current player's non-empty pits int bit = 1; for (int p = 0; p < HOME_PITS; p++) { if (state[firstPit + p] > 0) { nonEmptyPits |= bit; } bit <<= 1; } int moveCount = dPopCount[nonEmptyPits]; // the number of legal moves // pick a move randomly // from Numerical Recipes via Wikipedia Linear congruential generator rand = rand * 1664525 + 1013904223; int selected = ((rand % moveCount) + moveCount) % moveCount; int move = dLegalMoves[nonEmptyPits][selected]; // update board for that move int seeds = state[firstPit + move]; // number of seeds to sow int fullRounds = seeds / SEQUENCE_LENGTH; // number of full turns around board int leftOver = seeds % SEQUENCE_LENGTH; // number of seeds on last turn around board int lastPit = dSequence[turn][(move + leftOver + SEQUENCE_LENGTH - 1) % SEQUENCE_LENGTH]; int changedPits = seeds < SEQUENCE_LENGTH ? seeds : SEQUENCE_LENGTH; // remove seeds from starting pit state[P1_SEEDS + turn] -= seeds; state[firstPit + move] = 0; // add seeds to subsequent pits for (int p = 0; p < changedPits; p++) { int toPit = dSequence[turn][(move + p) % SEQUENCE_LENGTH]; int sown = fullRounds; if (p < leftOver) { sown++; } state[toPit] += sown; if (dStore[toPit] == 0) { state[P1_SEEDS + dOwner[toPit]] += sown; } } // check for extra turn and capture int opp = dOpposite[lastPit]; int steal = state[opp]; int other = 1 - turn; if (dStore[lastPit] == 0) { // switch turn unless ended in store state[TURN] = other; if (dOwner[lastPit] == turn && state[lastPit] == 1 && steal > 0) { // steal seeds if ended in own empty home pit opposite non-empty pit state[P1_SEEDS + turn] -= 1; state[P1_SEEDS + other] -= steal; state[turn * (HOME_PITS + 1) + HOME_PITS] += (1 + steal); state[lastPit] = 0; state[opp] = 0; } } } // player with seeds left moves them to their store if (state[P1_SEEDS] == 0) { state[HOME_PITS] += state[P2_SEEDS]; } else { state[BOARD_SIZE - 1] += state[P1_SEEDS]; } for (int i = 0; i < 6; i++) { state[i] = 0; state[dOpposite[i]] = 0; } // copy from scratch space back to global space for (int i = 0; i < stateSize; i++) { output[i] = state[i]; } } void cpuPlayout(int *results, int id) { // copy from global to thread-local int *output = results + (id * stateSize); int state[stateSize]; for (int i = 0; i < stateSize; i++) { state[i] = results[i]; } int rand = state[RANDOM_SEED] + id; while (state[P1_SEEDS] > 0 && state[P2_SEEDS] > 0) { // get whose turn it is and their 1st pit int turn = state[TURN]; int firstPit = turn * (HOME_PITS + 1); // get legal moves int nonEmptyPits = 0; // a bitmap of the current player's non-empty pits int bit = 1; for (int p = 0; p < HOME_PITS; p++) { if (state[firstPit + p] > 0) { nonEmptyPits |= bit; } bit <<= 1; } int moveCount = popCount[nonEmptyPits]; // the number of legal moves // pick a move randomly rand = rand * 1664525 + 1013904223; // from Numerical Recipes via Wikipedia Linear congruential generator int selected = ((rand % moveCount) + moveCount) % moveCount; int move = legalMoves[nonEmptyPits][selected]; // update board for that move int seeds = state[firstPit + move]; // number of seeds to sow int fullRounds = seeds / SEQUENCE_LENGTH; // number of full turns around board int leftOver = seeds % SEQUENCE_LENGTH; // number of seeds on last turn around board int changedPits = seeds < SEQUENCE_LENGTH ? seeds : SEQUENCE_LENGTH; int lastPit = sequence[turn][(move + leftOver + SEQUENCE_LENGTH- 1) % SEQUENCE_LENGTH]; state[P1_SEEDS + turn] -= seeds; state[firstPit + move] = 0; for (int p = 0; p < changedPits; p++) { int toPit = sequence[turn][(move + p) % SEQUENCE_LENGTH]; int sown = fullRounds; if (p < leftOver) { sown++; } state[toPit] += sown; if (store[toPit] == 0) { state[P1_SEEDS + owner[toPit]] += sown; } } int opp = opposite[lastPit]; int steal = state[opp]; int other = 1 - turn; if (store[lastPit] == 0) { // switch turn unless ended in store state[TURN] = other; if (owner[lastPit] == turn && state[lastPit] == 1 && steal > 0) { // steal seeds if ended in own empty home pit opposite non-empty pit state[P1_SEEDS + turn] -= 1; state[P1_SEEDS + other] -= steal; state[turn * (HOME_PITS + 1) + HOME_PITS] += (1 + steal); state[lastPit] = 0; state[opp] = 0; } } } if (state[P1_SEEDS] == 0) { state[HOME_PITS] += state[P2_SEEDS]; } else { state[BOARD_SIZE - 1] += state[P1_SEEDS]; } for (int i = 0; i < 6; i++) { state[i] = 0; state[opposite[i]] = 0; } // local back to results array for (int i = 0; i < stateSize; i++) { output[i] = state[i]; } } void printPosition(int *state) { if (state[TURN] == 1) { printf("> "); } else { printf(" "); } for (int i = 13; i >= 7; i--) { printf("%3d", state[i]); } printf("\n"); if (state[TURN] == 0) { printf("> "); } else { printf(" "); } printf(" "); for (int i = 0; i < 7; i++) { printf("%3d", state[i]); } printf("\n"); printf("%d %d\n", state[P1_SEEDS], state[P2_SEEDS]); }