#include <cuda.h>
#include <cuda_runtime.h>

Functions
__global__ void	forwardPassV2Kernel (int step, int sequenceOffset, float *activity, float *input, float *weight, int numNeurons, int numIONeurons, float *buffer)
	Forward pass.

__global__ void	forwardPassV21Kernel (int step, int sequenceOffset, float *activity, float *input, float *buffer, float *potential, float *weight, float *previousPotential, float *error, int *deltaT, int numNeurons, int numIONeurons)
	Forward pass.

__global__ void	forwardPassV1Kernel (int step, int sequenceOffset, float *activity, float *input, float *weight, float *previousPotential, float *error, float *potential, int *deltaT, int numNeurons, int numIONeurons)
	Forward pass.

__global__ void	backwardPassV1Kernel (int step, int sequenceOffset, int numNeurons, int numIONeurons, float *input, float *activity, float *delta, float *deltaWeight, float *previousDelta, float *error, float *individualError, int *deltaT, float *weight)
	Calculates deltas, deltas on weights and errors parts.

__global__ void	backwardPassV11Kernel (int step, int numNeurons, int numIONeurons, float *activity, float *delta, float *previousDelta, int *deltaT, float *weight)
	Calculates deltas on hidden neurons.

__global__ void	backwardPassV2Kernel (int step, int sequenceOffset, int numNeurons, int numIONeurons, float *input, float *activity, float *delta, float *deltaWeight, float *previousDelta, float *error, float *individualError, int *deltaT, float *weight, float *buffer)
	Loads the buffer with fractions of deltas on weights and calculates deltas, deltas on weights and errors parts.

__global__ void	backwardPassV21Kernel (float *input, float *output, int numNeurons, int numIONeurons)
	Calculates deltas on hidden neurons.

__global__ void	backwardPassV3Kernel (int step, int numNeurons, int numIONeurons, float *activity, float *delta, float *previousDelta, float *deltaWeight, int *deltaT, float *weight)
	Calculates deltas on weights on hidden neurons and biases.

__global__ void	updateWeightsKernel (float learningRate, float momentum, float *weight, float *deltaWeight, float *previousDeltaWeight, int numWeights)
	Updates weights.

__global__ void	sumDeltaWeightsP2PKernel (int numWeights, float *masterDeltaWeight, float *peerDeltaWeight)
	Sums delta weights on the master device.

__global__ void	updateWeightsP2PKernel (int numWeights, float learningRate, float momentum, float *masterWeight, float *peerWeight, float *deltaWeight, float *previousDeltaWeight)
	Modifies weights on the master device and copies to the peer device.

__global__ void	sumErrorP2PKernel (float *masterError, float *peerError)
	Sums errors on the master device.

__global__ void	setInitStatesKernel (float initState, float *activity, int numNeurons, int numIONeurons, int numFastNeurons)
	Sets the initial states for all the units on device.

__global__ void	resetParametersKernel (int numNeurons, int maxSequenceSteps, float *delta, float *previousDelta, float *potential, float *previousPotential, float *error)
	Resets delta and error parameters.

__global__ void	resetDeltaWeightsKernel (int numWeights, int numIONeurons, float *deltaWeight, float *individualError)
	Resets delta weights and errors.

template<unsigned int blockSize>
__global__ void	reduceKernel (float *input, float *output, unsigned int n, bool nIsPow2)
	Parallel reduction sum modified from NVIDIA SDK.

void	resetDeltaWeightsOnDevice (dim3 grid, dim3 block, cudaStream_t stream, int numWeights, int numIONeurons, float *deltaWeight, float *individualError)
	Wrapper for resetDeltaWeightsKernel.

void	setInitStatesOnDevice (dim3 grid, dim3 block, cudaStream_t stream, float initState, float *activity, int numNeurons, int numIONeurons, int numFastNeurons)
	Wrapper for setInitStatesKernel.

void	resetParametersOnDevice (dim3 grid, dim3 block, cudaStream_t stream, int numNeurons, int maxSequenceSteps, float *delta, float *previousDelta, float *potential, float *previousPotential, float *error)
	Wrapper for resetParametersKernel.

void	updateWeightsOnDevice (dim3 grid, dim3 block, float learningRate, float momentum, float *weight, float *deltaWeight, float *previousDeltaWeight, int numWeights)
	Wrapper for updateWeightsKernel.

void	forwardPassV1onDevice (dim3 grid, dim3 block, cudaStream_t stream, int step, int sequenceOffset, float *activity, float *input, float *weight, float *previousPotential, float *error, float *potential, int *deltaT, int numNeurons, int numIONeurons)
	Wrapper for forwardPassV1Kernel.

void	forwardPassV2onDevice (dim3 grid, dim3 block, cudaStream_t stream, int step, int sequenceOffset, float *activity, float *input, float *weight, int numNeurons, int numIONeurons, float *buffer)
	Wrapper for forwardPassV2Kernel.

void	forwardPassV21onDevice (dim3 grid, dim3 block, int smemSize, cudaStream_t stream, int step, int sequenceOffset, float *activity, float *input, float *buffer, float *potential, float *weight, float *previousPotential, float *error, int *deltaT, int numNeurons, int numIONeurons)
	Wrapper for forwardPassV21Kernel.

void	backwardPassV1onDevice (dim3 grid, dim3 block, cudaStream_t stream, int step, int sequenceOffset, int numNeurons, int numIONeurons, float *input, float *activity, float *delta, float *deltaWeight, float *previousDelta, float *error, float *individualError, int *deltaT, float *weight)
	Wrapper for backwardPassV1Kernel.

void	backwardPassV11onDevice (dim3 grid, dim3 block, cudaStream_t stream, int step, int numNeurons, int numIONeurons, float *activity, float *delta, float *previousDelta, int *deltaT, float *weight)
	Wrapper for backwardPassV11Kernel.

void	backwardPassV2onDevice (dim3 grid, dim3 block, cudaStream_t stream, int step, int sequenceOffset, int numNeurons, int numIONeurons, float *input, float *activity, float *delta, float *deltaWeight, float *previousDelta, float *error, float *individualError, int *deltaT, float *weight, float *buffer)
	Wrapper for backwardPassV2Kernel.

void	backwardPassV21onDevice (dim3 grid, dim3 block, int smemSize, cudaStream_t stream, float *input, float *output, int numNeurons, int numIONeurons)
	Wrapper for backwardPassV21Kernel.

void	backwardPassV3onDevice (dim3 grid, dim3 block, cudaStream_t stream, int step, int numNeurons, int numIONeurons, float *activity, float *delta, float *previousDelta, float *deltaWeight, int *deltaT, float *weight)
	Wrapper for backwardPassV3Kernel.

void	reduceOnDevice (int size, dim3 grid, dim3 block, int smemSize, cudaStream_t stream, float *input, float *output, unsigned int n, bool nIsPow2)
	Wrapper for reduceKernel.

void	sumDeltaWeightsP2PonDevice (dim3 grid, dim3 block, int numWeights, float *masterDeltaWeight, float *peerDeltaWeight)
	Wrapper for sumDeltaWeightsP2PKernel.

void	updateWeightsP2PonDevice (dim3 grid, dim3 block, int numWeights, float learningRate, float momentum, float *masterWeight, float *peerWeight, float *deltaWeight, float *previousDeltaWeight)
	Wrapper for updateWeightsP2PKernel.

void	sumErrorP2PonDevice (dim3 grid, dim3 block, float *masterError, float *peerError)
	Wrapper for sumErrorP2PKernel.

Function Documentation

__global__ void backwardPassV11Kernel	(	int	step,
		int	numNeurons,
		int	numIONeurons,
		float *	activity,
		float *	delta,
		float *	previousDelta,
		int *	deltaT,
		float *	weight
	)

Calculates deltas on hidden neurons.

Note: Slower version for larger networks over 1024 neurons.

Parameters

[in]	step	- current step
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[in]	activity	- activities
[in]	deltaT	- delta-t values
[in]	weight	- weights
[in]	previousDelta	- previous deltas
[out]	delta	- deltas

void backwardPassV11onDevice	(	dim3	grid,
		dim3	block,
		cudaStream_t	stream,
		int	step,
		int	numNeurons,
		int	numIONeurons,
		float *	activity,
		float *	delta,
		float *	previousDelta,
		int *	deltaT,
		float *	weight
	)

Wrapper for backwardPassV11Kernel.

Parameters

[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	stream	- CUDA stream
[in]	step	- current step
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[in]	activity	- activities
[in]	deltaT	- delta-t values
[in]	weight	- weights
[in]	previousDelta	- previous deltas
[out]	delta	- deltas

__global__ void backwardPassV1Kernel	(	int	step,
		int	sequenceOffset,
		int	numNeurons,
		int	numIONeurons,
		float *	input,
		float *	activity,
		float *	delta,
		float *	deltaWeight,
		float *	previousDelta,
		float *	error,
		float *	individualError,
		int *	deltaT,
		float *	weight
	)

Calculates deltas, deltas on weights and errors parts.

Note: Slower version for larger networks over 1024 neurons.

Parameters

[in]	step	- current step
[in]	sequenceOffset	- sequence offsets
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[in]	input	- input
[in]	activity	- activities
[in]	error	- errors
[in]	individualError	- error buffer
[in]	deltaT	- delta-t values
[in]	weight	- weights
[in]	previousDelta	- previous deltas
[out]	delta	- deltas
[out]	deltaWeight	- delta weights

void backwardPassV1onDevice	(	dim3	grid,
		dim3	block,
		cudaStream_t	stream,
		int	step,
		int	sequenceOffset,
		int	numNeurons,
		int	numIONeurons,
		float *	input,
		float *	activity,
		float *	delta,
		float *	deltaWeight,
		float *	previousDelta,
		float *	error,
		float *	individualError,
		int *	deltaT,
		float *	weight
	)

Wrapper for backwardPassV1Kernel.

Parameters

[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	stream	- CUDA stream
[in]	step	- current step
[in]	sequenceOffset	- sequence offsets
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[in]	input	- input
[in]	activity	- activities
[in]	error	- errors
[in]	individualError	- error buffer
[in]	deltaT	- delta-t values
[in]	weight	- weights
[in]	previousDelta	- previous deltas
[out]	delta	- deltas
[out]	deltaWeight	- delta weights

__global__ void backwardPassV21Kernel	(	float *	input,
		float *	output,
		int	numNeurons,
		int	numIONeurons
	)

Calculates deltas on hidden neurons.

Note: Used together with backwardPassV2Kernel. Faster version for networks of up to 1024 neurons.

Parameters

[in]	input	- input
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[out]	output	- output

void backwardPassV21onDevice	(	dim3	grid,
		dim3	block,
		int	smemSize,
		cudaStream_t	stream,
		float *	input,
		float *	output,
		int	numNeurons,
		int	numIONeurons
	)

Wrapper for backwardPassV21Kernel.

Parameters

[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	smemSize	- CUDA shared memory size
[in]	stream	- CUDA stream
[in]	input	- input
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[out]	output	- output

__global__ void backwardPassV2Kernel	(	int	step,
		int	sequenceOffset,
		int	numNeurons,
		int	numIONeurons,
		float *	input,
		float *	activity,
		float *	delta,
		float *	deltaWeight,
		float *	previousDelta,
		float *	error,
		float *	individualError,
		int *	deltaT,
		float *	weight,
		float *	buffer
	)

Loads the buffer with fractions of deltas on weights and calculates deltas, deltas on weights and errors parts.

Note: Deltas on weights and errors are later summed by parallel reduction.

Parameters

[in]	step	- current step
[in]	sequenceOffset	- sequence offsets
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[in]	input	- input
[in]	activity	- activities
[in]	previousDelta	- previous deltas
[in]	error	- errors
[in]	individualError	- error buffer
[in]	deltaT	- delta-t values
[in]	weight	- weights
[out]	delta	- deltas
[out]	deltaWeight	- delta weights
[out]	buffer	- buffer used for storing delta weights

void backwardPassV2onDevice	(	dim3	grid,
		dim3	block,
		cudaStream_t	stream,
		int	step,
		int	sequenceOffset,
		int	numNeurons,
		int	numIONeurons,
		float *	input,
		float *	activity,
		float *	delta,
		float *	deltaWeight,
		float *	previousDelta,
		float *	error,
		float *	individualError,
		int *	deltaT,
		float *	weight,
		float *	buffer
	)

Wrapper for backwardPassV2Kernel.

Parameters

[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	stream	- CUDA stream
[in]	step	- current step
[in]	sequenceOffset	- sequence offsets
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[in]	input	- input
[in]	activity	- activities
[in]	previousDelta	- previous deltas
[in]	error	- errors
[in]	individualError	- error buffer
[in]	deltaT	- delta-t values
[in]	weight	- weights
[out]	delta	- deltas
[out]	deltaWeight	- delta weights
[out]	buffer	- buffer used for storing delta weights

__global__ void backwardPassV3Kernel	(	int	step,
		int	numNeurons,
		int	numIONeurons,
		float *	activity,
		float *	delta,
		float *	previousDelta,
		float *	deltaWeight,
		int *	deltaT,
		float *	weight
	)

Calculates deltas on weights on hidden neurons and biases.

Parameters

[in]	step	- current step
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[in]	activity	- activities
[in]	deltaT	- delta-t values
[in]	weight	- weights
[in]	delta	- deltas
[out]	previousDelta	- previous deltas
[out]	deltaWeight	- delta weights

void backwardPassV3onDevice	(	dim3	grid,
		dim3	block,
		cudaStream_t	stream,
		int	step,
		int	numNeurons,
		int	numIONeurons,
		float *	activity,
		float *	delta,
		float *	previousDelta,
		float *	deltaWeight,
		int *	deltaT,
		float *	weight
	)

Wrapper for backwardPassV3Kernel.

Parameters

[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	stream	- CUDA stream
[in]	step	- current step
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[in]	activity	- activities
[in]	deltaT	- delta-t values
[in]	weight	- weights
[in]	delta	- deltas
[out]	previousDelta	- previous deltas
[out]	deltaWeight	- delta weights

__global__ void forwardPassV1Kernel	(	int	step,
		int	sequenceOffset,
		float *	activity,
		float *	input,
		float *	weight,
		float *	previousPotential,
		float *	error,
		float *	potential,
		int *	deltaT,
		int	numNeurons,
		int	numIONeurons
	)

Forward pass.

Note: Slower version for larger networks.

Parameters

[in]	step	- current step
[in]	sequenceOffset	- sequence offsets
[in]	input	- input
[in]	weight	- weights
[in]	deltaT	- delta-t values
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[out]	potential	- potentials
[out]	previousPotential	- previous potentials
[out]	activity	- activities
[out]	error	- errors

void forwardPassV1onDevice	(	dim3	grid,
		dim3	block,
		cudaStream_t	stream,
		int	step,
		int	sequenceOffset,
		float *	activity,
		float *	input,
		float *	weight,
		float *	previousPotential,
		float *	error,
		float *	potential,
		int *	deltaT,
		int	numNeurons,
		int	numIONeurons
	)

Wrapper for forwardPassV1Kernel.

Parameters

[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	stream	- CUDA stream
[in]	step	- current step
[in]	sequenceOffset	- sequence offsets
[in]	input	- input
[in]	weight	- weights
[in]	deltaT	- delta-t values
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[out]	potential	- potentials
[out]	previousPotential	- previous potentials
[out]	activity	- activities
[out]	error	- errors

__global__ void forwardPassV21Kernel	(	int	step,
		int	sequenceOffset,
		float *	activity,
		float *	input,
		float *	buffer,
		float *	potential,
		float *	weight,
		float *	previousPotential,
		float *	error,
		int *	deltaT,
		int	numNeurons,
		int	numIONeurons
	)

Forward pass.

Note: Used with forwardPassV2Kernel. Faster version for networks of up to 1024 neurons.

Parameters

[in]	step	- current step
[in]	sequenceOffset	- sequence offsets
[in]	input	- input
[in]	weight	- weights
[in]	deltaT	- delta-t values
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[out]	potential	- potentials
[out]	previousPotential	- previous potentials
[out]	activity	- activities
[out]	error	- errors

void forwardPassV21onDevice	(	dim3	grid,
		dim3	block,
		int	smemSize,
		cudaStream_t	stream,
		int	step,
		int	sequenceOffset,
		float *	activity,
		float *	input,
		float *	buffer,
		float *	potential,
		float *	weight,
		float *	previousPotential,
		float *	error,
		int *	deltaT,
		int	numNeurons,
		int	numIONeurons
	)

Wrapper for forwardPassV21Kernel.

Parameters

[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	smemSize	- CUDA shared memory size
[in]	stream	- CUDA stream
[in]	step	- current step
[in]	sequenceOffset	- sequence offsets
[in]	activity	- activations
[in]	input	- input
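[in]	weight	- weights
[in]	deltaT	- delta-t values
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[out]	buffer	- buffer used for storing new activations
[out]	potential	- potentials
[out]	previousPotential	- previous potentials
[out]	error	- errors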

__global__ void forwardPassV2Kernel	(	int	step,
		int	sequenceOffset,
		float *	activity,
		float *	input,
		float *	weight,
		int	numNeurons,
		int	numIONeurons,
		float *	buffer
	)

Forward pass.

Note: Used with forwardPassV21Kernel - faster version for networks of up to 1024 neurons.

Parameters

[in]	step	- current step
[in]	sequenceOffset	- sequence offsets
[in]	activity	- activations
[in]	input	- input
[in]	weight	- weights
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[out]	buffer	- buffer used for storing new activations

void forwardPassV2onDevice	(	dim3	grid,
		dim3	block,
		cudaStream_t	stream,
		int	step,
		int	sequenceOffset,
		float *	activity,
		float *	input,
		float *	weight,
		int	numNeurons,
		int	numIONeurons,
		float *	buffer
	)

Wrapper for forwardPassV2Kernel.

Parameters

[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	stream	- CUDA stream
[in]	step	- current step
[in]	sequenceOffset	- sequence offsets
[in]	activity	- activations
[in]	input	- input
[in]	weight	- weights
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[out]	buffer	- buffer used for storing new activations

template<unsigned int blockSize>

__global__ void reduceKernel	(	float *	input,
		float *	output,
		unsigned int	n,
		bool	nIsPow2
	)

Parallel reduction sum modified from NVIDIA SDK.

Note: The number of threads is not known at compile time; however, we always stick to power-of-2 sizes, so templates are used here to allow compilation for all known sizes, which results in higher throughput.

Parameters

[in]	input	- input
[in]	n	- number of elements to sum
[in]	nIsPow2	- determines if the number is of power of two
[out]	output	- output

void reduceOnDevice	(	int	size,
		dim3	grid,
		dim3	block,
		int	smemSize,
		cudaStream_t	stream,
		float *	input,
		float *	output,
		unsigned int	n,
		bool	nIsPow2
	)

Wrapper for reduceKernel.

Parameters

[in]	size	- number of elements to sum
[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	smemSize	- CUDA shared memory size
[in]	stream	- CUDA stream
[in]	input	- input
[in]	n	- number of elements to sum
[in]	nIsPow2	- determines if the number is of power of two
[out]	output	- output

__global__ void resetDeltaWeightsKernel	(	int	numWeights,
		int	numIONeurons,
		float *	deltaWeight,
		float *	individualError
	)

Resets delta weights and errors.

Parameters

[in]	numWeights	- number of weights
[in]	numIONeurons	- number of input-output neurons
[out]	deltaWeight	- delta weights
[out]	individualError	- error buffer

void resetDeltaWeightsOnDevice	(	dim3	grid,
		dim3	block,
		cudaStream_t	stream,
		int	numWeights,
		int	numIONeurons,
		float *	deltaWeight,
		float *	individualError
	)

Wrapper for resetDeltaWeightsKernel.

Parameters

[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	stream	- CUDA stream
[in]	numWeights	- number of weights
[in]	numIONeurons	- number of input-output neurons
[out]	deltaWeight	- delta weights
[out]	individualError	- error buffer

__global__ void resetParametersKernel	(	int	numNeurons,
		int	maxSequenceSteps,
		float *	delta,
		float *	previousDelta,
		float *	potential,
		float *	previousPotential,
		float *	error
	)

Resets delta and error parameters.

Parameters

[in]	numNeurons	- number of neurons
[in]	maxSequenceSteps	- maximum number of sequence steps
[out]	delta	- deltas
[out]	previousDelta	- previous deltas
[out]	potential	- potentials
[out]	previousPotential	- previous potentials
[out]	error	- errors

void resetParametersOnDevice	(	dim3	grid,
		dim3	block,
		cudaStream_t	stream,
		int	numNeurons,
		int	maxSequenceSteps,
		float *	delta,
		float *	previousDelta,
		float *	potential,
		float *	previousPotential,
		float *	error
	)

Wrapper for resetParametersKernel.

Parameters

[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	stream	- CUDA stream
[in]	numNeurons	- number of neurons
[in]	maxSequenceSteps	- maximum number of sequence steps
[out]	delta	- deltas
[out]	previousDelta	- previous deltas
[out]	potential	- potentials
[out]	previousPotential	- previous potentials
[out]	error	- errors

__global__ void setInitStatesKernel	(	float	initState,
		float *	activity,
		int	numNeurons,
		int	numIONeurons,
		int	numFastNeurons
	)

Sets the initial states for all the units on device.

Note: Slow context units will be initialised with a specific value.

Parameters

[in]	initState	- initial state
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[in]	numFastNeurons	- number of fast neurons
[out]	activity	- activities

void setInitStatesOnDevice	(	dim3	grid,
		dim3	block,
		cudaStream_t	stream,
		float	initState,
		float *	activity,
		int	numNeurons,
		int	numIONeurons,
		int	numFastNeurons
	)

Wrapper for setInitStatesKernel.

Parameters

[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	stream	- CUDA stream
[in]	initState	- initial state
[in]	numNeurons	- number of neurons
[in]	numIONeurons	- number of input-output neurons
[in]	numFastNeurons	- number of fast neurons
[out]	activity	- activities

__global__ void sumDeltaWeightsP2PKernel	(	int	numWeights,
		float *	masterDeltaWeight,
		float *	peerDeltaWeight
	)

Sums delta weights on the master device.

Parameters

[in]	numWeights	- number of weights
[in]	peerDeltaWeight	- delta weights from peer device
[out]	masterDeltaWeight	- delta weights from master device

void sumDeltaWeightsP2PonDevice	(	dim3	grid,
		dim3	block,
		int	numWeights,
		float *	masterDeltaWeight,
		float *	peerDeltaWeight
	)

Wrapper for sumDeltaWeightsP2PKernel.

Parameters

[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	numWeights	- number of weights
[in]	peerDeltaWeight	- delta weights from peer device
[out]	masterDeltaWeight	- delta weights from master device

__global__ void sumErrorP2PKernel	(	float *	masterError,
		float *	peerError
	)

Sums errors on the master device.

Parameters

[in]	peerError	- error from peer device
[out]	masterError	- error from master device

void sumErrorP2PonDevice	(	dim3	grid,
		dim3	block,
		float *	masterError,
		float *	peerError
	)

Wrapper for sumErrorP2PKernel.

Parameters

[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	peerError	- error from peer device
[out]	masterError	- error from master device

__global__ void updateWeightsKernel	(	float	learningRate,
		float	momentum,
		float *	weight,
		float *	deltaWeight,
		float *	previousDeltaWeight,
		int	numWeights
	)

Updates weights.

Parameters

[in]	learningRate	- learning rate
[in]	momentum	- momentum
[in]	numWeights	- number of weights
[in]	deltaWeight	- delta weights
[out]	previousDeltaWeight	- previous delta weights
[out]	weight	- weights

void updateWeightsOnDevice	(	dim3	grid,
		dim3	block,
		float	learningRate,
		float	momentum,
		float *	weight,
		float *	deltaWeight,
		float *	previousDeltaWeight,
		int	numWeights
	)

Wrapper for updateWeightsKernel.

Parameters

[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	learningRate	- learning rate
[in]	momentum	- momentum
[in]	numWeights	- number of weights
[in]	deltaWeight	- delta weights
[out]	previousDeltaWeight	- previous delta weights
[out]	weight	- weights

__global__ void updateWeightsP2PKernel	(	int	numWeights,
		float	learningRate,
		float	momentum,
		float *	masterWeight,
		float *	peerWeight,
		float *	deltaWeight,
		float *	previousDeltaWeight
	)

Modifies weights on the master device and copies to the peer device.

Parameters

[in]	numWeights	- number of weights
[in]	learningRate	- learning rate
[in]	momentum	- momentum
[in]	deltaWeight	- delta weights
[out]	previousDeltaWeight	- previous delta weights
[out]	masterWeight	- weights from master device
[out]	peerWeight	- weights from peer device

void updateWeightsP2PonDevice	(	dim3	grid,
		dim3	block,
		int	numWeights,
		float	learningRate,
		float	momentum,
		float *	masterWeight,
		float *	peerWeight,
		float *	deltaWeight,
		float *	previousDeltaWeight
	)

Wrapper for updateWeightsP2PKernel.

Parameters

[in]	grid	- CUDA grid size
[in]	block	- CUDA block size
[in]	numWeights	- number of weights
[in]	learningRate	- learning rate
[in]	momentum	- momentum
[in]	deltaWeight	- delta weights
[out]	previousDeltaWeight	- previous delta weights
[out]	masterWeight	- weights from master device
[out]	peerWeight	- weights from peer device

Functions

Function Documentation