Aquila  2.0 prealpha
Cognitive Robotics Architecture
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Functions
kernels.cu File Reference
#include <cuda.h>
#include <cuda_runtime.h>

Functions

__global__ void forwardPassV2Kernel (int step, int sequenceOffset, float *activity, float *input, float *weight, int numNeurons, int numIONeurons, float *buffer)
 Forward pass.
 
__global__ void forwardPassV21Kernel (int step, int sequenceOffset, float *activity, float *input, float *buffer, float *potential, float *weight, float *previousPotential, float *error, int *deltaT, int numNeurons, int numIONeurons)
 Forward pass.
 
__global__ void forwardPassV1Kernel (int step, int sequenceOffset, float *activity, float *input, float *weight, float *previousPotential, float *error, float *potential, int *deltaT, int numNeurons, int numIONeurons)
 Forward pass.
 
__global__ void backwardPassV1Kernel (int step, int sequenceOffset, int numNeurons, int numIONeurons, float *input, float *activity, float *delta, float *deltaWeight, float *previousDelta, float *error, float *individualError, int *deltaT, float *weight)
 Calculates deltas, deltas on weights and errors parts.
 
__global__ void backwardPassV11Kernel (int step, int numNeurons, int numIONeurons, float *activity, float *delta, float *previousDelta, int *deltaT, float *weight)
 Calculates deltas on hidden neurons.
 
__global__ void backwardPassV2Kernel (int step, int sequenceOffset, int numNeurons, int numIONeurons, float *input, float *activity, float *delta, float *deltaWeight, float *previousDelta, float *error, float *individualError, int *deltaT, float *weight, float *buffer)
 Loads the buffer with fractions of deltas on weights; calculates deltas, deltas on weights and errors parts.
 
__global__ void backwardPassV21Kernel (float *input, float *output, int numNeurons, int numIONeurons)
 Calculates deltas on hidden neurons.
 
__global__ void backwardPassV3Kernel (int step, int numNeurons, int numIONeurons, float *activity, float *delta, float *previousDelta, float *deltaWeight, int *deltaT, float *weight)
 Calculates deltas on weights on hidden neurons and biases.
 
__global__ void updateWeightsKernel (float learningRate, float momentum, float *weight, float *deltaWeight, float *previousDeltaWeight, int numWeights)
 Updates weights.
 
__global__ void sumDeltaWeightsP2PKernel (int numWeights, float *masterDeltaWeight, float *peerDeltaWeight)
 Sums delta weights on the master device.
 
__global__ void updateWeightsP2PKernel (int numWeights, float learningRate, float momentum, float *masterWeight, float *peerWeight, float *deltaWeight, float *previousDeltaWeight)
 Modifies weights on the master device and copies to the peer device.
 
__global__ void sumErrorP2PKernel (float *masterError, float *peerError)
 Sums errors on the master device.
 
__global__ void setInitStatesKernel (float initState, float *activity, int numNeurons, int numIONeurons, int numFastNeurons)
 Sets the initial states for all the units on device.
 
__global__ void resetParametersKernel (int numNeurons, int maxSequenceSteps, float *delta, float *previousDelta, float *potential, float *previousPotential, float *error)
 Resets delta and error parameters.
 
__global__ void resetDeltaWeightsKernel (int numWeights, int numIONeurons, float *deltaWeight, float *individualError)
 Resets delta weights and errors.
 
template<unsigned int blockSize>
__global__ void reduceKernel (float *input, float *output, unsigned int n, bool nIsPow2)
 Parallel reduction sum modified from NVIDIA SDK.
 
void resetDeltaWeightsOnDevice (dim3 grid, dim3 block, cudaStream_t stream, int numWeights, int numIONeurons, float *deltaWeight, float *individualError)
 Wrapper for resetDeltaWeightsKernel.
 
void setInitStatesOnDevice (dim3 grid, dim3 block, cudaStream_t stream, float initState, float *activity, int numNeurons, int numIONeurons, int numFastNeurons)
 Wrapper for setInitStatesKernel.
 
void resetParametersOnDevice (dim3 grid, dim3 block, cudaStream_t stream, int numNeurons, int maxSequenceSteps, float *delta, float *previousDelta, float *potential, float *previousPotential, float *error)
 Wrapper for resetParametersKernel.
 
void updateWeightsOnDevice (dim3 grid, dim3 block, float learningRate, float momentum, float *weight, float *deltaWeight, float *previousDeltaWeight, int numWeights)
 Wrapper for updateWeightsKernel.
 
void forwardPassV1onDevice (dim3 grid, dim3 block, cudaStream_t stream, int step, int sequenceOffset, float *activity, float *input, float *weight, float *previousPotential, float *error, float *potential, int *deltaT, int numNeurons, int numIONeurons)
 Wrapper for forwardPassV1Kernel.
 
void forwardPassV2onDevice (dim3 grid, dim3 block, cudaStream_t stream, int step, int sequenceOffset, float *activity, float *input, float *weight, int numNeurons, int numIONeurons, float *buffer)
 Wrapper for forwardPassV2Kernel.
 
void forwardPassV21onDevice (dim3 grid, dim3 block, int smemSize, cudaStream_t stream, int step, int sequenceOffset, float *activity, float *input, float *buffer, float *potential, float *weight, float *previousPotential, float *error, int *deltaT, int numNeurons, int numIONeurons)
 Wrapper for forwardPassV21Kernel.
 
void backwardPassV1onDevice (dim3 grid, dim3 block, cudaStream_t stream, int step, int sequenceOffset, int numNeurons, int numIONeurons, float *input, float *activity, float *delta, float *deltaWeight, float *previousDelta, float *error, float *individualError, int *deltaT, float *weight)
 Wrapper for backwardPassV1Kernel.
 
void backwardPassV11onDevice (dim3 grid, dim3 block, cudaStream_t stream, int step, int numNeurons, int numIONeurons, float *activity, float *delta, float *previousDelta, int *deltaT, float *weight)
 Wrapper for backwardPassV11Kernel.
 
void backwardPassV2onDevice (dim3 grid, dim3 block, cudaStream_t stream, int step, int sequenceOffset, int numNeurons, int numIONeurons, float *input, float *activity, float *delta, float *deltaWeight, float *previousDelta, float *error, float *individualError, int *deltaT, float *weight, float *buffer)
 Wrapper for backwardPassV2Kernel.
 
void backwardPassV21onDevice (dim3 grid, dim3 block, int smemSize, cudaStream_t stream, float *input, float *output, int numNeurons, int numIONeurons)
 Wrapper for backwardPassV21Kernel.
 
void backwardPassV3onDevice (dim3 grid, dim3 block, cudaStream_t stream, int step, int numNeurons, int numIONeurons, float *activity, float *delta, float *previousDelta, float *deltaWeight, int *deltaT, float *weight)
 Wrapper for backwardPassV3Kernel.
 
void reduceOnDevice (int size, dim3 grid, dim3 block, int smemSize, cudaStream_t stream, float *input, float *output, unsigned int n, bool nIsPow2)
 Wrapper for reduceKernel.
 
void sumDeltaWeightsP2PonDevice (dim3 grid, dim3 block, int numWeights, float *masterDeltaWeight, float *peerDeltaWeight)
 Wrapper for sumDeltaWeightsP2PKernel.
 
void updateWeightsP2PonDevice (dim3 grid, dim3 block, int numWeights, float learningRate, float momentum, float *masterWeight, float *peerWeight, float *deltaWeight, float *previousDeltaWeight)
 Wrapper for updateWeightsP2PKernel.
 
void sumErrorP2PonDevice (dim3 grid, dim3 block, float *masterError, float *peerError)
 Wrapper for sumErrorP2PKernel.
 

Function Documentation

__global__ void backwardPassV11Kernel ( int  step,
int  numNeurons,
int  numIONeurons,
float *  activity,
float *  delta,
float *  previousDelta,
int *  deltaT,
float *  weight 
)

Calculates deltas on hidden neurons.

Note
Slower version for larger networks over 1024 neurons.
Parameters
[in]step- current step
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[in]activity- activities
[in]deltaT- delta-t values
[in]weight- weights
[in]previousDelta- previous deltas
[out]delta- deltas
void backwardPassV11onDevice ( dim3  grid,
dim3  block,
cudaStream_t  stream,
int  step,
int  numNeurons,
int  numIONeurons,
float *  activity,
float *  delta,
float *  previousDelta,
int *  deltaT,
float *  weight 
)

Wrapper for backwardPassV11Kernel.

Parameters
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]stream- CUDA stream
[in]step- current step
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[in]activity- activities
[in]deltaT- delta-t values
[in]weight- weights
[in]previousDelta- previous deltas
[out]delta- deltas
__global__ void backwardPassV1Kernel ( int  step,
int  sequenceOffset,
int  numNeurons,
int  numIONeurons,
float *  input,
float *  activity,
float *  delta,
float *  deltaWeight,
float *  previousDelta,
float *  error,
float *  individualError,
int *  deltaT,
float *  weight 
)

Calculates deltas, deltas on weights and errors parts.

Note
Slower version for larger networks over 1024 neurons.
Parameters
[in]step- current step
[in]sequenceOffset- sequence offsets
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[in]input- input
[in]activity- activities
[in]error- errors
[in]individualError- error buffer
[in]deltaT- delta-t values
[in]weight- weights
[in]previousDelta- previous deltas
[out]delta- deltas
[out]deltaWeight- delta weights
void backwardPassV1onDevice ( dim3  grid,
dim3  block,
cudaStream_t  stream,
int  step,
int  sequenceOffset,
int  numNeurons,
int  numIONeurons,
float *  input,
float *  activity,
float *  delta,
float *  deltaWeight,
float *  previousDelta,
float *  error,
float *  individualError,
int *  deltaT,
float *  weight 
)

Wrapper for backwardPassV1Kernel.

Parameters
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]stream- CUDA stream
[in]step- current step
[in]sequenceOffset- sequence offsets
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[in]input- input
[in]activity- activities
[in]error- errors
[in]individualError- error buffer
[in]deltaT- delta-t values
[in]weight- weights
[in]previousDelta- previous deltas
[out]delta- deltas
[out]deltaWeight- delta weights
__global__ void backwardPassV21Kernel ( float *  input,
float *  output,
int  numNeurons,
int  numIONeurons 
)

Calculates deltas on hidden neurons.

Note
Used together with backwardPassV2Kernel. Faster version for networks of up to 1024 neurons.
Parameters
[in]input- input
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[out]output- output
void backwardPassV21onDevice ( dim3  grid,
dim3  block,
int  smemSize,
cudaStream_t  stream,
float *  input,
float *  output,
int  numNeurons,
int  numIONeurons 
)

Wrapper for backwardPassV21Kernel.

Parameters
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]smemSize- CUDA shared memory size
[in]stream- CUDA stream
[in]input- input
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[out]output- output
__global__ void backwardPassV2Kernel ( int  step,
int  sequenceOffset,
int  numNeurons,
int  numIONeurons,
float *  input,
float *  activity,
float *  delta,
float *  deltaWeight,
float *  previousDelta,
float *  error,
float *  individualError,
int *  deltaT,
float *  weight,
float *  buffer 
)

Loads the buffer with fractions of deltas on weights; calculates deltas, deltas on weights and errors parts.

Note
Deltas on weights and errors are later summed by parallel reduction.
Parameters
[in]step- current step
[in]sequenceOffset- sequence offsets
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[in]input- input
[in]activity- activities
[in]previousDelta- previous deltas
[in]error- errors
[in]individualError- error buffer
[in]deltaT- delta-t values
[in]weight- weights
[out]delta- deltas
[out]deltaWeight- delta weights
[out]buffer- buffer used for storing delta weights
void backwardPassV2onDevice ( dim3  grid,
dim3  block,
cudaStream_t  stream,
int  step,
int  sequenceOffset,
int  numNeurons,
int  numIONeurons,
float *  input,
float *  activity,
float *  delta,
float *  deltaWeight,
float *  previousDelta,
float *  error,
float *  individualError,
int *  deltaT,
float *  weight,
float *  buffer 
)

Wrapper for backwardPassV2Kernel.

Parameters
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]stream- CUDA stream
[in]step- current step
[in]sequenceOffset- sequence offsets
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[in]input- input
[in]activity- activities
[in]previousDelta- previous deltas
[in]error- errors
[in]individualError- error buffer
[in]deltaT- delta-t values
[in]weight- weights
[out]delta- deltas
[out]deltaWeight- delta weights
[out]buffer- buffer used for storing delta weights
__global__ void backwardPassV3Kernel ( int  step,
int  numNeurons,
int  numIONeurons,
float *  activity,
float *  delta,
float *  previousDelta,
float *  deltaWeight,
int *  deltaT,
float *  weight 
)

Calculates deltas on weights on hidden neurons and biases.

Parameters
[in]step- current step
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[in]activity- activities
[in]deltaT- delta-t values
[in]weight- weights
[in]delta- deltas
[out]previousDelta- previous deltas
[out]deltaWeight- delta weights
void backwardPassV3onDevice ( dim3  grid,
dim3  block,
cudaStream_t  stream,
int  step,
int  numNeurons,
int  numIONeurons,
float *  activity,
float *  delta,
float *  previousDelta,
float *  deltaWeight,
int *  deltaT,
float *  weight 
)

Wrapper for backwardPassV3Kernel.

Parameters
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]stream- CUDA stream
[in]step- current step
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[in]activity- activities
[in]deltaT- delta-t values
[in]weight- weights
[in]delta- deltas
[out]previousDelta- previous deltas
[out]deltaWeight- delta weights
__global__ void forwardPassV1Kernel ( int  step,
int  sequenceOffset,
float *  activity,
float *  input,
float *  weight,
float *  previousPotential,
float *  error,
float *  potential,
int *  deltaT,
int  numNeurons,
int  numIONeurons 
)

Forward pass.

Note
Slower version for larger networks.
Parameters
[in]step- current step
[in]sequenceOffset- sequence offsets
[in]input- input
[in]weight- weights
[in]deltaT- delta-t values
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[out]potential- potentials
[out]previousPotential- previous potentials
[out]activity- activities
[out]error- errors
void forwardPassV1onDevice ( dim3  grid,
dim3  block,
cudaStream_t  stream,
int  step,
int  sequenceOffset,
float *  activity,
float *  input,
float *  weight,
float *  previousPotential,
float *  error,
float *  potential,
int *  deltaT,
int  numNeurons,
int  numIONeurons 
)

Wrapper for forwardPassV1Kernel.

Parameters
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]stream- CUDA stream
[in]step- current step
[in]sequenceOffset- sequence offsets
[in]input- input
[in]weight- weights
[in]deltaT- delta-t values
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[out]potential- potentials
[out]previousPotential- previous potentials
[out]activity- activities
[out]error- errors
__global__ void forwardPassV21Kernel ( int  step,
int  sequenceOffset,
float *  activity,
float *  input,
float *  buffer,
float *  potential,
float *  weight,
float *  previousPotential,
float *  error,
int *  deltaT,
int  numNeurons,
int  numIONeurons 
)

Forward pass.

Note
Used with forwardPassV2Kernel. Faster version for networks of up to 1024 neurons.
Parameters
[in]step- current step
[in]sequenceOffset- sequence offsets
[in]input- input
[in]weight- weights
[in]deltaT- delta-t values
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[out]potential- potentials
[out]previousPotential- previous potentials
[out]activity- activities
[out]error- errors
void forwardPassV21onDevice ( dim3  grid,
dim3  block,
int  smemSize,
cudaStream_t  stream,
int  step,
int  sequenceOffset,
float *  activity,
float *  input,
float *  buffer,
float *  potential,
float *  weight,
float *  previousPotential,
float *  error,
int *  deltaT,
int  numNeurons,
int  numIONeurons 
)

Wrapper for forwardPassV21Kernel.

Parameters
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]smemSize- CUDA shared memory size
[in]stream- CUDA stream
[in]step- current step
[in]sequenceOffset- sequence offsets
[in]activity- activations
[in]input- input
[in]weight- weights
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[out]buffer- buffer used for storing new activations
__global__ void forwardPassV2Kernel ( int  step,
int  sequenceOffset,
float *  activity,
float *  input,
float *  weight,
int  numNeurons,
int  numIONeurons,
float *  buffer 
)

Forward pass.

Note
Used with forwardPassV21Kernel - faster version for networks of up to 1024 neurons.
Parameters
[in]step- current step
[in]sequenceOffset- sequence offsets
[in]activity- activations
[in]input- input
[in]weight- weights
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[out]buffer- buffer used for storing new activations
void forwardPassV2onDevice ( dim3  grid,
dim3  block,
cudaStream_t  stream,
int  step,
int  sequenceOffset,
float *  activity,
float *  input,
float *  weight,
int  numNeurons,
int  numIONeurons,
float *  buffer 
)

Wrapper for forwardPassV2Kernel.

Parameters
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]stream- CUDA stream
[in]step- current step
[in]sequenceOffset- sequence offsets
[in]activity- activations
[in]input- input
[in]weight- weights
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[out]buffer- buffer used for storing new activations
template<unsigned int blockSize>
__global__ void reduceKernel ( float *  input,
float *  output,
unsigned int  n,
bool  nIsPow2 
)

Parallel reduction sum modified from NVIDIA SDK.

Note
The number of threads is not known at compile time; however, we always stick to power-of-2 sizes,
so here we use templates to allow compilation for all known sizes, which results in higher throughput.
Parameters
[in]input- input
[in]n- number of elements to sum
[in]nIsPow2- indicates whether n is a power of two
[out]output- output
void reduceOnDevice ( int  size,
dim3  grid,
dim3  block,
int  smemSize,
cudaStream_t  stream,
float *  input,
float *  output,
unsigned int  n,
bool  nIsPow2 
)

Wrapper for reduceKernel.

Parameters
[in]size- number of elements to sum
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]smemSize- CUDA shared memory size
[in]stream- CUDA stream
[in]input- input
[in]n- number of elements to sum
[in]nIsPow2- indicates whether n is a power of two
[out]output- output
__global__ void resetDeltaWeightsKernel ( int  numWeights,
int  numIONeurons,
float *  deltaWeight,
float *  individualError 
)

Resets delta weights and errors.

Parameters
[in]numWeights- number of weights
[in]numIONeurons- number of input-output neurons
[out]deltaWeight- delta weights
[out]individualError- error buffer
void resetDeltaWeightsOnDevice ( dim3  grid,
dim3  block,
cudaStream_t  stream,
int  numWeights,
int  numIONeurons,
float *  deltaWeight,
float *  individualError 
)

Wrapper for resetDeltaWeightsKernel.

Parameters
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]stream- CUDA stream
[in]numWeights- number of weights
[in]numIONeurons- number of input-output neurons
[out]deltaWeight- delta weights
[out]individualError- error buffer
__global__ void resetParametersKernel ( int  numNeurons,
int  maxSequenceSteps,
float *  delta,
float *  previousDelta,
float *  potential,
float *  previousPotential,
float *  error 
)

Resets delta and error parameters.

Parameters
[in]numNeurons- number of neurons
[in]maxSequenceSteps- maximum number of sequence steps
[out]delta- deltas
[out]previousDelta- previous deltas
[out]potential- potentials
[out]previousPotential- previous potentials
[out]error- errors
void resetParametersOnDevice ( dim3  grid,
dim3  block,
cudaStream_t  stream,
int  numNeurons,
int  maxSequenceSteps,
float *  delta,
float *  previousDelta,
float *  potential,
float *  previousPotential,
float *  error 
)

Wrapper for resetParametersKernel.

Parameters
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]stream- CUDA stream
[in]numNeurons- number of neurons
[in]maxSequenceSteps- maximum number of sequence steps
[out]delta- deltas
[out]previousDelta- previous deltas
[out]potential- potentials
[out]previousPotential- previous potentials
[out]error- errors
__global__ void setInitStatesKernel ( float  initState,
float *  activity,
int  numNeurons,
int  numIONeurons,
int  numFastNeurons 
)

Sets the initial states for all the units on device.

Note
Slow context units will be initialised with a specific value.
Parameters
[in]initState- initial state
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[in]numFastNeurons- number of fast neurons
[out]activity- activities
void setInitStatesOnDevice ( dim3  grid,
dim3  block,
cudaStream_t  stream,
float  initState,
float *  activity,
int  numNeurons,
int  numIONeurons,
int  numFastNeurons 
)

Wrapper for setInitStatesKernel.

Parameters
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]stream- CUDA stream
[in]initState- initial state
[in]numNeurons- number of neurons
[in]numIONeurons- number of input-output neurons
[in]numFastNeurons- number of fast neurons
[out]activity- activities
__global__ void sumDeltaWeightsP2PKernel ( int  numWeights,
float *  masterDeltaWeight,
float *  peerDeltaWeight 
)

Sums delta weights on the master device.

Parameters
[in]numWeights- number of weights
[in]peerDeltaWeight- delta weights from peer device
[out]masterDeltaWeight- delta weights from master device
void sumDeltaWeightsP2PonDevice ( dim3  grid,
dim3  block,
int  numWeights,
float *  masterDeltaWeight,
float *  peerDeltaWeight 
)

Wrapper for sumDeltaWeightsP2PKernel.

Parameters
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]numWeights- number of weights
[in]peerDeltaWeight- delta weights from peer device
[out]masterDeltaWeight- delta weights from master device
__global__ void sumErrorP2PKernel ( float *  masterError,
float *  peerError 
)

Sums errors on the master device.

Parameters
[in]peerError- error from peer device
[out]masterError- error from master device
void sumErrorP2PonDevice ( dim3  grid,
dim3  block,
float *  masterError,
float *  peerError 
)

Wrapper for sumErrorP2PKernel.

Parameters
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]peerError- error from peer device
[out]masterError- error from master device
__global__ void updateWeightsKernel ( float  learningRate,
float  momentum,
float *  weight,
float *  deltaWeight,
float *  previousDeltaWeight,
int  numWeights 
)

Updates weights.

Parameters
[in]learningRate- learning rate
[in]momentum- momentum
[in]numWeights- number of weights
[in]deltaWeight- delta weights
[out]previousDeltaWeight- previous delta weights
[out]weight- weights
void updateWeightsOnDevice ( dim3  grid,
dim3  block,
float  learningRate,
float  momentum,
float *  weight,
float *  deltaWeight,
float *  previousDeltaWeight,
int  numWeights 
)

Wrapper for updateWeightsKernel.

Parameters
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]learningRate- learning rate
[in]momentum- momentum
[in]numWeights- number of weights
[in]deltaWeight- delta weights
[out]previousDeltaWeight- previous delta weights
[out]weight- weights
__global__ void updateWeightsP2PKernel ( int  numWeights,
float  learningRate,
float  momentum,
float *  masterWeight,
float *  peerWeight,
float *  deltaWeight,
float *  previousDeltaWeight 
)

Modifies weights on the master device and copies to the peer device.

Parameters
[in]numWeights- number of weights
[in]learningRate- learning rate
[in]momentum- momentum
[in]deltaWeight- delta weights
[out]previousDeltaWeight- previous delta weights
[out]masterWeight- weights from master device
[out]peerWeight- weights from peer device
void updateWeightsP2PonDevice ( dim3  grid,
dim3  block,
int  numWeights,
float  learningRate,
float  momentum,
float *  masterWeight,
float *  peerWeight,
float *  deltaWeight,
float *  previousDeltaWeight 
)

Wrapper for updateWeightsP2PKernel.

Parameters
[in]grid- CUDA grid size
[in]block- CUDA block size
[in]numWeights- number of weights
[in]learningRate- learning rate
[in]momentum- momentum
[in]deltaWeight- delta weights
[out]previousDeltaWeight- previous delta weights
[out]masterWeight- weights from master device
[out]peerWeight- weights from peer device