MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
CudaDnn.cs
1using System;
2using System.Collections.Generic;
3using System.Linq;
4using System.Text;
5using System.Collections;
6using System.Diagnostics;
7using System.Threading;
8using System.IO;
9using MyCaffe.basecode;
11using System.Runtime.Remoting.Channels;
12using System.Xml.Linq;
13using System.Security.Cryptography.X509Certificates;
15
16namespace MyCaffe.common
17{
/// <summary>
/// Direction selector with two values, FWD (0) and BWD (1); it is accepted by several
/// ICudaMath channel_* methods (e.g. channel_sum defaults to DIR.FWD).
/// NOTE(review): presumably forward vs. backward pass - confirm against the callers.
/// </summary>
21 public enum DIR
22 {
26 FWD = 0,
30 BWD = 1
31 }
32
/// <summary>
/// Selects a mean-error type. Numbering starts at 1; no zero member is defined.
/// NOTE(review): MSE/MAE presumably mean-squared / mean-absolute error - confirm.
/// </summary>
36 public enum MEAN_ERROR
37 {
42 MSE = 1,
45 MAE = 2
46 }
47
/// <summary>
/// Identifies a math function. Values are grouped in decades:
/// 0 = no-op, 1-4 cosine family, 10-13 sine family, 20-23 tangent family,
/// 30-34 ceil/floor/neg/sign/sqrt.
/// </summary>
51 public enum MATH_FUNCTION
52 {
56 NOP = 0,
57
    // Cosine family.
61 ACOS = 1,
65 ACOSH = 2,
69 COS = 3,
73 COSH = 4,
74
    // Sine family.
78 ASIN = 10,
82 ASINH = 11,
86 SIN = 12,
90 SINH = 13,
91
    // Tangent family.
95 ATAN = 20,
99 ATANH = 21,
103 TAN = 22,
107 TANH = 23,
108
    // Rounding, negation, sign and square root.
112 CEIL = 30,
116 FLOOR = 31,
120 NEG = 32,
124 SIGN = 33,
128 SQRT = 34
129 }
130
/// <summary>
/// Arithmetic operation selector (MUL, DIV, ADD, SUB) taken by
/// ICudaMath.channel_op_fwd and ICudaMath.channel_op_bwd. Numbering starts at 1.
/// </summary>
134 public enum OP
135 {
139 MUL = 1,
143 DIV = 2,
147 ADD = 3,
151 SUB = 4
152 }
153
154
/// <summary>
/// Selects a distance method: HAMMING (0) or EUCLIDEAN (1).
/// </summary>
158 public enum DistanceMethod
159 {
163 HAMMING = 0,
167 EUCLIDEAN = 1
168 }
169
/// <summary>
/// Pooling method (MAX or AVE) taken by ICudaCuDnn.SetPoolingDesc.
/// NOTE(review): a second enum, POOLING_METHOD, declares the same first two values plus
/// STO_TRAIN/STO_TEST - confirm which one each caller is expected to use.
/// </summary>
176 public enum PoolingMethod
177 {
181 MAX = 0,
185 AVE = 1
186 }
187
/// <summary>
/// Base numeric type selector. Values are implicit: DOUBLE = 0, FLOAT = 1.
/// CudaDnn&lt;T&gt; derives it from typeof(T) in its constructor.
/// </summary>
191 public enum DataType
192 {
196 DOUBLE,
200 FLOAT
201 }
202
/// <summary>
/// Device initialization flags. Each non-zero member is a distinct bit, and the CudaDnn
/// constructor combines them with bitwise OR (its default is CUBLAS | CURAND).
/// NOTE(review): the enum is used as a bit field but carries no [Flags] attribute.
/// </summary>
206 public enum DEVINIT
207 {
211 NONE = 0x0000,
212
216 CUBLAS = 0x0001,
217
221 CURAND = 0x0002,
222
227 SETSEED = 0x0004
228 }
229
/// <summary>
/// Batch normalization mode (PER_ACTIVATION or SPATIAL) taken by
/// ICudaCuDnn.DeriveBatchNormDesc.
/// </summary>
236 public enum BATCHNORM_MODE
237 {
241 PER_ACTIVATION = 0,
245 SPATIAL = 1,
250 }
251
/// <summary>
/// Forward convolution algorithm selector. NONE is -1; the remaining members use
/// 0 and 2-6.
/// NOTE(review): no member with value 1 is declared here - confirm whether one was
/// intentionally omitted.
/// </summary>
258 public enum CONV_FWD_ALGO
259 {
263 NONE = -1,
267 IMPLICIT_GEMM = 0,
275 ALGO_GEMM = 2,
279 ALGO_DIRECT = 3,
283 ALGO_FFT = 4,
287 ALGO_FFT_TILING = 5,
291 ALGO_WINOGRAD = 6,
296 }
297
305 {
309 ALGO_0 = 0,
313 ALGO_1 = 1,
317 ALGO_FFT = 2,
321 ALGO_3 = 3
322 }
323
331 {
335 ALGO_0 = 0,
339 ALGO_1 = 1,
343 ALGO_FFT = 2
344 }
345
/// <summary>
/// Pooling method selector with four values: MAX, AVE, STO_TRAIN and STO_TEST.
/// NOTE(review): STO_* presumably denote stochastic pooling for training/testing - confirm.
/// </summary>
352 public enum POOLING_METHOD
353 {
357 MAX = 0,
361 AVE = 1,
365 STO_TRAIN = 2,
369 STO_TEST = 3
370 }
371
/// <summary>
/// Recurrent network mode (RNN_RELU, RNN_TANH, LSTM, GRU) taken by ICudaCuDnn.SetRnnDesc.
/// </summary>
375 public enum RNN_MODE
376 {
380 RNN_RELU = 0,
384 RNN_TANH = 1,
388 LSTM = 2,
394 GRU = 3
395 }
396
/// <summary>
/// Recurrent network bias mode. Only values 0 (RNN_NO_BIAS) and 2 (RNN_DOUBLE_BIAS)
/// are declared.
/// NOTE(review): no member with value 1 is declared here - confirm whether that is intended.
/// </summary>
400 public enum RNN_BIAS_MODE
401 {
405 RNN_NO_BIAS = 0,
413 RNN_DOUBLE_BIAS = 2,
418 }
419
/// <summary>
/// Recurrent network data layout taken by ICudaCuDnn.SetRnnDataDesc.
/// NOTE(review): the enum is declared with no members - they appear to be missing and
/// should be verified against the original source.
/// </summary>
423 public enum RNN_DATALAYOUT
424 {
437 }
438
/// <summary>
/// Recurrent network direction taken by ICudaCuDnn.SetRnnDesc.
/// NOTE(review): the enum is declared with no members, yet SetRnnDesc defaults its
/// 'direction' parameter to RNN_DIRECTION.RNN_UNIDIRECTIONAL - the members appear to be
/// missing and must be restored for that default to resolve.
/// </summary>
442 public enum RNN_DIRECTION
443 {
452 }
453
/// <summary>
/// Recurrent network filler type.
/// NOTE(review): the enum is declared with no members - they appear to be missing and
/// should be verified against the original source.
/// </summary>
457 public enum RNN_FILLER_TYPE
458 {
471 }
472
/// <summary>
/// Device property selector: DEVICECOUNT (1) or NAME (2). Numbering starts at 1.
/// </summary>
476 public enum DEVPROP
477 {
481 DEVICECOUNT = 1,
485 NAME = 2,
490 }
491
/// <summary>
/// Memory test type selector; MOV_INV_8 (1) is the only declared value.
/// </summary>
498 public enum MEMTEST_TYPE
499 {
503 MOV_INV_8 = 1
504 }
505
513 {
517 SUM = 0,
521 PROD = 1,
525 MAX = 2,
529 MIN = 3
530 }
531
/// <summary>
/// SSD mining type selector: NONE, MAX_NEGATIVE or HARD_EXAMPLE.
/// </summary>
539 public enum SSD_MINING_TYPE
540 {
544 NONE = 0,
548 MAX_NEGATIVE = 1,
555 HARD_EXAMPLE = 2
556 }
557
/// <summary>
/// SSD match type selector. BIPARTITE (implicit value 0) is the only member declared here.
/// </summary>
565 public enum SSD_MATCH_TYPE
566 {
570 BIPARTITE,
575 }
576
/// <summary>
/// SSD code type selector: CORNER, CENTER_SIZE or CORNER_SIZE. Numbering starts at 1.
/// </summary>
584 public enum SSD_CODE_TYPE
585 {
589 CORNER = 1,
593 CENTER_SIZE = 2,
597 CORNER_SIZE = 3
598 }
599
608 {
612 SOFTMAX,
617 }
618
627 {
631 L2,
636 }
637
/// <summary>
/// Orientation selector: COL (0) or ROW (1).
/// </summary>
644 public enum ORIENTATION
645 {
649 COL = 0,
653 ROW = 1
654 }
655
663 {
667 ADD = 0,
671 MUL = 1,
675 DIV = 2
676 }
677
/// <summary>
/// Aggregation selector: SUM (0), MAX (1) or MIN (2).
/// </summary>
681 public enum AGGREGATIONS
682 {
686 SUM = 0,
690 MAX = 1,
694 MIN = 2
695 }
696
701 {
705 DEFAULT = 1,
709 FAST = 0,
713 ACCURATE = 1,
717 LOG = 2
718 }
719
/// <summary>
/// Softmax mode selector. Values are implicit: INSTANCE = 0, CHANNEL = 1.
/// </summary>
723 public enum SOFTMAX_MODE
724 {
728 INSTANCE,
732 CHANNEL
733 }
734
735#pragma warning disable 1591
736
/// <summary>
/// Device-level contract: selecting a device, seeding, resetting and synchronizing it,
/// and querying device count, name, peer-to-peer information and required compute level.
/// Part of the composite ICudaDnn interface.
/// </summary>
743 public interface ICudaDevice
744 {
    // Device selection and seeding.
745 void SetDeviceID(int nDeviceID, DEVINIT flags = DEVINIT.NONE, long? lSeed = null);
746 void SetRandomSeed(long lSeed);
    // Device queries and control.
747 int GetDeviceCount();
748 int GetDeviceID();
749 void ResetDevice();
750 void SynchronizeDevice();
    // String-valued queries; the required-compute query also returns the minimum
    // major/minor values through its out parameters.
751 string GetDeviceName(int nDeviceID);
752 string GetDeviceP2PInfo(int nDeviceID);
753 string GetRequiredCompute(out int nMinMajor, out int nMinMinor);
754
755 }
756
/// <summary>
/// Memory contract: allocating, freeing, reading and writing memory blocks that are
/// referenced by long-valued handles, plus host buffers and memory pointers.
/// Each data-carrying method has a double and a float overload.
/// Part of the composite ICudaDnn interface.
/// </summary>
763 public interface ICudaMemory
764 {
    // Allocation: by element count, or initialized from a list/array. Returns the handle.
765 long AllocMemory(long lCount, bool bHalf = false);
766 long AllocMemory(List<double> rg);
767 long AllocMemory(List<float> rg);
768 long AllocMemory(double[] rgSrc, long hStream = 0);
769 long AllocMemory(float[] rgSrc, long hStream = 0);
770 void FreeMemory(long hMem);
    // Reads; lCount defaults to -1.
    // NOTE(review): -1 presumably means "all items" - confirm in the implementation.
771 double[] GetMemoryDouble(long hMem, long lCount = -1);
772 float[] GetMemoryFloat(long hMem, long lCount = -1);
    // Writes, optionally at an offset.
773 void SetMemory(long hMem, List<double> rg);
774 void SetMemory(long hMem, List<float> rg);
775 void SetMemory(long hMem, double[] rgSrc, long hStream = 0);
776 void SetMemory(long hMem, float[] rgSrc, long hStream = 0);
777 void SetMemoryAt(long hMem, double[] rgSrc, int nOffset);
778 void SetMemoryAt(long hMem, float[] rgSrc, int nOffset);
    // Host buffers.
779 long AllocHostBuffer(long lCount);
780 void FreeHostBuffer(long hMem);
781 double[] GetHostMemoryDouble(long hMem);
782 float[] GetHostMemoryFloat(long hMem);
    // Memory pointers created from an existing handle, an offset and a count.
783 long CreateMemoryPointer(long hData, long lOffset, long lCount);
784 void FreeMemoryPointer(long hMem);
785 }
786
/// <summary>
/// cuDNN-related contract: streams, cuDNN handles and the create/free/set life cycle of
/// tensor, filter, convolution, pooling, LRN and RNN descriptors, plus the RNN
/// forward/backward calls. All handles are long values.
/// Part of the composite ICudaDnn interface.
/// </summary>
793 public interface ICudaCuDnn
794 {
    // Streams and thread synchronization.
795 long CreateStream(bool bNonBlocking = false, int nIndex = -1);
796 void FreeStream(long h);
797 void SynchronizeStream(long h = 0);
798 void SynchronizeThread();
799
    // cuDNN handle life cycle.
800 long CreateCuDNN(long hStream = 0);
801 void FreeCuDNN(long h);
802
    // Tensor descriptors.
803 long CreateTensorDesc();
804 void FreeTensorDesc(long h);
805 void SetTensorNdDesc(long hHandle, int[] rgDim, int[] rgStride, bool bHalf = false);
806 void SetTensorDesc(long hHandle, int n, int c, int h, int w, bool bHalf = false);
807 void SetTensorDesc(long hHandle, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride, bool bHalf = false);
808 void AddTensor(long hHandle, long hSrcDesc, long hSrc, int nSrcOffset, long hDstDesc, long hDst, int nDstOffset);
809
    // Batch normalization descriptors.
810 void DeriveBatchNormDesc(long hFwdScaleBiasMeanVarDesc, long hFwdBottomDesc, long hBwdScaleBiasMeanVarDesc, long hBwdBottomDesc, BATCHNORM_MODE mode);
811
    // Filter descriptors.
812 long CreateFilterDesc();
813 void FreeFilterDesc(long h);
814 void SetFilterNdDesc(long hHandle, int[] rgDim, bool bHalf = false);
815 void SetFilterDesc(long hHandle, int n, int c, int h, int w, bool bHalf = false);
816
    // Convolution descriptors.
817 long CreateConvolutionDesc();
818 void FreeConvolutionDesc(long h);
819 void SetConvolutionDesc(long hHandle, int hPad, int wPad, int hStride, int wStride, int hDilation, int wDilation, bool bUseTensorCores, bool bHalf = false);
820
    // Pooling descriptors.
821 long CreatePoolingDesc();
822 void FreePoolingDesc(long h);
823 void SetPoolingDesc(long hHandle, PoolingMethod method, int h, int w, int hPad, int wPad, int hStride, int wStride);
824
    // LRN descriptors.
825 long CreateLRNDesc();
826 void FreeLRNDesc(long h);
827 void SetLRNDesc(long hHandle, uint nSize, double fAlpha, double fBeta, double fK);
828
    // RNN data descriptors.
829 long CreateRnnDataDesc();
830 void FreeRnnDataDesc(long h);
831 void SetRnnDataDesc(long hRnnDataDesc, RNN_DATALAYOUT layout, int nMaxSeqLen, int nBatchSize, int nVectorSize, bool bBidirectional = false, int[] rgSeqLen = null);
832
    // RNN descriptors, sizing queries and forward/backward passes.
833 long CreateRnnDesc();
834 void FreeRnnDesc(long h);
835 void SetRnnDesc(long hHandle, long hRnnDesc, int nHiddenSize, int nNumLayers, long hDropoutDesc, RNN_MODE mode, bool bUseTensorCores, RNN_DIRECTION direction = RNN_DIRECTION.RNN_UNIDIRECTIONAL);
836 int GetRnnParamCount(long hHandle, long hRnnDesc, long hXDesc);
837 ulong GetRnnWorkspaceCount(long hHandle, long hRnnDesc, long hXDesc, out ulong nReservedCount);
838 void GetRnnLinLayerParams(long hHandle, long hRnnDesc, int nLayer, long hXDesc, long hWtDesc, long hWtData, int nLinLayer, out int nWtCount, out long hWt, out int nBiasCount, out long hBias);
    // NOTE(review): RnnForward names its reserved-count parameter 'hResCount' while the
    // backward methods name it 'nResCount'; renaming would break named-argument callers.
839 void RnnForward(long hHandle, long hRnnDesc, long hXDesc, long hXData, long hHxDesc, long hHxData, long hCxDesc, long hCxData, long hWtDesc, long hWtData, long hYDesc, long hYData, long hHyDesc, long hHyData, long hCyDesc, long hCyData, long hWorkspace, ulong nWsCount, long hReserved, ulong hResCount, bool bTraining);
840 void RnnBackwardData(long hHandle, long hRnnDesc, long hYDesc, long hYData, long hYDiff, long hHyDesc, long hHyDiff, long hCyDesc, long hCyDiff, long hWtDesc, long hWtData, long hHxDesc, long hHxData, long hCxDesc, long hCxData, long hXDesc, long hXDiff, long hdHxDesc, long hHxDiff, long hdCxDesc, long hCxDiff, long hWorkspace, ulong nWsCount, long hReserved, ulong nResCount);
841 void RnnBackwardWeights(long hHandle, long hRnnDesc, long hXDesc, long hXData, long hHxDesc, long hHxData, long hYDesc, long hYData, long hWorkspace, ulong nWsCount, long hWtDesc, long hWtDiff, long hReserved, ulong nResCount);
842 }
843
/// <summary>
/// Math contract operating on memory referenced by long-valued handles: set/get/copy,
/// per-channel operations, BLAS-style routines (gemm, gemv, geam, ger, axpy, ...),
/// element-wise math, masking and im2col/col2im. Scalar-taking methods come in double
/// and float overloads.
/// Part of the composite ICudaDnn interface.
/// </summary>
850 public interface ICudaMath
851 {
    // Set, get, copy, fill and sort.
852 void set(int nCount, long hHandle, double fVal, int nIdx = -1);
853 void set(int nCount, long hHandle, float fVal, int nIdx = -1);
854 double[] get_double(int nCount, long hHandle, int nIdx = -1);
855 float[] get_float(int nCount, long hHandle, int nIdx = -1);
856 void copy(int nCount, long hSrc, long hDst, int nSrcOffset = 0, int nDstOffset = 0, long hAsyncStream = -1, bool? bSrcHalfOverride = null, bool? bDstHalfOverride = null);
857 void copy(int nCount, int nNum, int nDim, long hSrc1, long hSrc2, long hDst, long hSimilar, bool bInvert = false);
    // NOTE(review): the last parameter is spelled 'hDs' (likely meant 'hDst'); renaming
    // would break named-argument callers.
858 void copy_expand(int n, int nNum, int nDim, long hSrc, long hDs);
859 void fill(int n, int nDim, long hSrc, int nSrcOff, int nCount, long hDst);
860 void sort(int nCount, long hY);
861
    // Per-channel operations; most take the (nCount, nOuterNum, nChannels, nInnerNum) layout.
862 void channel_compare(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY);
863 void channel_fill(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, int nLabelDim, long hLabels, long hY);
864 void channel_fillfrom(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, DIR dir);
865 void channel_scale(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hA, long hY);
866 void channel_mulv(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hA, long hX, long hC);
    // NOTE(review): 'nChanalesY' is a misspelling kept for caller compatibility.
867 void channel_sum(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, bool bSumAcrossChannels = true, DIR dir = DIR.FWD, int nChanalesY = -1);
868 void channel_mean(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY);
869 void channel_copy(int nCount, int nOuterNum, int nChannels, int nBlocks, int nInnerNum, int nOffset, long hX, long hY, DIR dir);
870 void channel_copyall(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY);
871 void channel_duplicate(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY);
872 void channel_percentile(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, double dfPercentile);
873 void channel_op_fwd(OP op, int nCount, int nC, int nN1, int nSD1, int nN2, int nSD2, long hA, long hB, long hY);
874 void channel_op_bwd(OP op, int nCount, int nC, int nN1, int nSD1, int nN2, int nSD2, int nCy, int nSDy, long hA, long hB, long hY, long hAd, long hBd, long hYd, long hWork);
875
    // Matrix routines.
876 void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC);
877 void gemm(bool bTransA, bool bTransB, int m, int n, int k, float fAlpha, long hA, long hB, float fBeta, long hC);
878 void gemv(bool bTransA, int m, int n, double fAlpha, long hA, long hX, double fBeta, long hY);
879 void gemv(bool bTransA, int m, int n, float fAlpha, long hA, long hX, float fBeta, long hY);
880 void geam(bool bTransA, bool bTransB, int m, int n, double fAlpha, long hA, long hB, double fBeta, long hC);
881 void geam(bool bTransA, bool bTransB, int m, int n, float fAlpha, long hA, long hB, float fBeta, long hC);
882
    // Vector routines, reductions and element-wise math.
883 void ger(int m, int n, double fAlpha, long hX, long hY, long hA);
884 void ger(int m, int n, float fAlpha, long hX, long hY, long hA);
885 void axpy(int n, double fAlpha, long hX, long hY);
886 void axpy(int n, float fAlpha, long hX, long hY);
887 void axpby(int n, double fAlpha, long hX, double fBeta, long hY);
888 void axpby(int n, float fAlpha, long hX, float fBeta, long hY);
889 void scal(int n, double fAlpha, long hX, int nXOff = 0);
890 void scal(int n, float fAlpha, long hX, int nXOff = 0);
891 double dot_double(int n, long hX, long hY);
892 float dot_float(int n, long hX, long hY);
893 double asum_double(int n, long hX, int nXOff = 0);
894 float asum_float(int n, long hX, int nXOff = 0);
895 void scale(int n, double fAlpha, long hX, long hY);
896 void scale(int n, float fAlpha, long hX, long hY);
897 void add_scalar(int n, double fAlpha, long hY);
898 void add_scalar(int n, float fAlpha, long hY);
899 void add(int n, long hA, long hB, long hY);
900 void add(int n, long hA, long hB, long hY, double dfAlpha);
901 void add(int n, long hA, long hB, long hY, float fAlpha);
902 void sub(int n, long hA, long hB, long hY, int nAOff = 0, int nBOff = 0, int nYOff = 0, int nB = 0);
903 void mul(int n, long hA, long hB, long hY, int nAOff = 0, int nBOff = 0, int nYOff = 0);
904 void mul_scalar(int n, double fAlpha, long hY);
905 void mul_scalar(int n, float fAlpha, long hY);
906 void div(int n, long hA, long hB, long hY);
907 void abs(int n, long hA, long hY);
908 void exp(int n, long hA, long hY);
909 void log(int n, long hA, long hY);
910 void powx(int n, long hA, double fAlpha, long hY, int nAOff = 0, int nYOff = 0);
911 void powx(int n, long hA, float fAlpha, long hY, int nAOff = 0, int nYOff = 0);
912 void sign(int n, long hX, long hY, int nXOff = 0, int nYOff = 0);
    // min/max return a double and report a position through 'lPos'; these reductions
    // have no float overloads.
913 double min(int n, long hA, out long lPos, int nAOff = 0, long hWork = 0);
914 double max(int n, long hA, out long lPos, int nAOff = 0, long hWork = 0);
915 double sumsq(int n, long hW, long hA, int nAOff = 0);
916 double sumsqdiff(int n, long hW, long hA, long hB, int nAOff = 0, int nBOff = 0);
917 void sqrt(int n, long hA, long hY);
918 void sqrt_scale(int n, long hA, long hY);
919
    // Masking.
920 void mask(int n, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY);
921 void mask(int n, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY);
922 void mask_batch(int n, int nBatch, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY);
923 void mask_batch(int n, int nBatch, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY);
924
    // Image-to-column and column-to-image conversions (2D and N-dimensional variants).
925 void im2col(long hDataIm, int nDataImOffset, int nChannels, int nHeight, int nWidth, int nKernelH, int nKernelW, int nPadH, int nPadW, int nStrideH, int nStrideW, int nDilationH, int nDilationW, long hDataCol, int nDataColOffset);
926 void im2col_nd(long hDataIm, int nDataImOffset, int nNumSpatialAxes, int nColCount, int nChannelAxis, long hImShape, long hColShape, long hKernelShape, long hPad, long hStride, long hDilation, long hDataCol, int nDataColOffset);
927 void col2im(long hDataCol, int nDataColOffset, int nChannels, int nHeight, int nWidth, int nKernelH, int nKernelW, int nPadH, int nPadW, int nStrideH, int nStrideW, int nDilationH, int nDilationW, long hDataIm, int nDataImOffset);
928 void col2im_nd(long hDataCol, int nDataColOffset, int nNumSpatialAxes, int nColCount, int nChannelAxis, long hImShape, long hColShape, long hKernelShape, long hPad, long hStride, long hDilation, long hDataIm, int nDataImOffset);
929 }
930
/// <summary>
/// Random number contract: seeding plus uniform, gaussian and bernoulli fills of the
/// memory referenced by hY, each in double and float overloads.
/// Part of the composite ICudaDnn interface.
/// NOTE(review): the CUDAFN enum has its CUDA_RNG_BERNOULLI entry commented out as
/// "Not implemented yet" - confirm how rng_bernoulli is implemented.
/// </summary>
937 public interface ICudaRandom
938 {
939 void rng_setseed(long lSeed);
940 void rng_uniform(int n, double fMin, double fMax, long hY);
941 void rng_uniform(int n, float fMin, float fMax, long hY);
942 void rng_gaussian(int n, double fMu, double fSigma, long hY);
943 void rng_gaussian(int n, float fMu, float fSigma, long hY);
944 void rng_bernoulli(int n, double fNonZeroProb, long hY);
945 void rng_bernoulli(int n, float fNonZeroProb, long hY);
946 }
947
/// <summary>
/// Composite interface that combines ICudaDevice, ICudaMemory, ICudaCuDnn, ICudaMath
/// and ICudaRandom; it adds no members of its own. CudaDnn&lt;T&gt; implements it.
/// </summary>
954 public interface ICudaDnn : ICudaDevice, ICudaMemory, ICudaCuDnn, ICudaMath, ICudaRandom
955 {
956 }
957
958#pragma warning restore 1591
959
960
968 public class CudaDnn<T> : ICudaDnn, IDisposable
969 {
    // Builds the double[]/float[] argument arrays (AsDouble/AsFloat) passed to the kernel.
970 Params m_param = new Params();
    // Created by the primary constructor with the bEnableMemoryTrace setting.
    // NOTE(review): the copy constructor does not assign this field.
971 CudaDnnMemoryTracker<T> m_memTracker;
    // Device ID given to the primary constructor (or copied from the source instance).
972 int m_nDeviceId;
    // Path of the loaded CudaDnn DLL.
973 string m_strPath = "";
    // NOTE(review): presumably the seed behind get_index() - confirm.
974 static int s_nIdxSeed = 0;
    // Default DLL path used by the primary constructor when no path is supplied.
975 static string s_strCudaPath = "";
    // Kernel object through which all RunDouble/RunFloat calls are made.
976 CudaControlLib.ICudaKernel m_cuda;
    // Kernel handle returned by CUDAFN.INITIALIZE; 0 when no kernel is held.
977 long m_hKernel = 0;
    // DOUBLE when T is double, otherwise FLOAT.
978 DataType m_dt;
979 CryptoRandom m_random = new CryptoRandom();
    // The values 1.0 and 0.0 converted to T.
980 T m_tOne;
981 T m_tZero;
    // Value returned by get_index() at construction.
982 int m_nIdx;
    // Ghost memory state; the dictionary is only created by the copy constructor when
    // ghost memory is enabled.
983 long m_nGhostMemoryIndex = 1000;
984 Dictionary<long, T[]> m_rgGhostMemory = null;
985 bool m_bGhostMemoryEnabled = false;
    // True when this instance owns the kernel handle; Dispose only cleans up for owners.
986 bool m_bOwner = true;
987 object m_memSync = new object();
988 bool m_bEnableRnnExtendedVersion = false;
    // Static lock held around the CUDAFN.INITIALIZE calls in the primary constructor.
    // NOTE(review): these static members use the instance 'm_' prefix rather than 's_'.
989 static object m_createSync = new object();
990 static object m_getconvSync = new object();
    // Size in bytes of T: sizeof(float) when T is float, otherwise sizeof(double).
991 static ulong m_lBaseSize = (ulong)((typeof(T) == typeof(float)) ? sizeof(float) : sizeof(double));
992
/// <summary>
/// Query identifiers for device information: DEVICE_NAME, DEVICE_P2P_INFO and
/// DEVICE_INFO, numbered from 1000.
/// </summary>
996 public enum CUDAQRY
997 {
1001 DEVICE_NAME = 1000,
1006 DEVICE_P2P_INFO = 1001,
1010 DEVICE_INFO = 1002
1011 }
1012
1013#pragma warning disable 1591
1014
/// <summary>
/// Function identifiers passed (cast to int) as the function argument of the kernel's
/// RunDouble/RunFloat calls, e.g. CUDAFN.INITIALIZE in the constructor and
/// CUDAFN.CLEANUP in Dispose. Negative values are kernel-level functions; positive
/// values are grouped by feature area with gaps between groups.
/// NOTE(review): the values are presumably mirrored by the native DLL, so they (and the
/// misspelled names SYNCRHONIZE_STREAM, CUDA_RECIPROCOL, CUDA_GUASSIAN_BLUR) should not
/// be changed without checking both sides.
/// </summary>
1020 public enum CUDAFN
1021 {
1022 INITIALIZE = -2,
1023 CLEANUP = -3,
1024 KERNEL_MEMCOPY = -4,
1025 KERNEL_ADD = -5,
1026 KERNEL_COPY_NCCL = -10,
1027
1028 SETDEVICE = 1,
1029 SETRANDOMSEED = 2,
1030 GETDEVICE = 3,
1031 RESETDEVICE = 4,
1032 SYNCHRONIZEDEVICE = 5,
1033 GETDEVICEPROP = 6,
1034 CHECKMEMORYATTRIB = 7,
1035 GETDEVICEMEMORY = 8,
1036 GETREQUIREDCOMPUTE = 9,
1037
1038 DEVICE_CANACCESSPEER = 10,
1039 DEVICE_ENABLEPEERACCESS = 11,
1040 DEVICE_DISABLEPEERACCESS = 12,
1041
1042 COPY_DEVICE_TO_HOST = 14,
1043 COPY_HOST_TO_DEVICE = 15,
1044
1045 CREATE_MEMORYPOINTER = 16,
1046 FREE_MEMORYPOINTER = 17,
1047
1048 ALLOCMEM_HALF = 19,
1049 ALLOCMEM = 20,
1050 FREEMEM = 21,
1051 GETMEM = 22,
1052 SETMEM = 23,
1053 SETMEMAT = 24,
1054
1055 ALLOCHOSTBUFFER = 25,
1056 FREEHOSTBUFFER = 26,
1057 GETHOSTMEM = 27,
1058 SETHOSTMEM = 28,
1059 GETHOSTBUFFERCAPACITY = 29,
1060
1061 CREATE_STREAM = 30,
1062 FREE_STREAM = 31,
1063 SYNCRHONIZE_STREAM = 32,
1064 SYNCHRONIZE_THREAD = 33,
1065
1066 CREATE_MEMTEST = 34,
1067 FREE_MEMTEST = 35,
1068 RUN_MEMTEST = 36,
1069
1070 CREATE_IMAGEOP = 37,
1071 FREE_IMAGEOP = 38,
1072 DISTORTIMAGE_IMAGEOP = 39,
1073
1074 CREATE_NCCL = 40,
1075 FREE_NCCL = 41,
1076 NCCL_INIT_SINGLEPROCESS = 42,
1077 NCCL_INIT_MULTIPROCESS = 43,
1078 NCCL_BROADCAST = 44,
1079 NCCL_ALLREDUCE = 45,
1080
1081 SETPIXEL = 46,
1082
1083 CREATE_CUDNN = 47,
1084 FREE_CUDNN = 48,
1085
1086 CREATE_TENSORDESC = 50,
1087 FREE_TENSORDESC = 51,
1088 SET_TENSORDESC = 52,
1089 ADD_TENSOR = 53,
1090 SET_TENSORNDDESC = 54,
1091
1092 CREATE_FILTERDESC = 60,
1093 FREE_FILTERDESC = 61,
1094 SET_FILTERDESC = 62,
1095 SET_FILTERNDDESC = 63,
1096
1097 CREATE_EXTENSION = 67,
1098 FREE_EXTENSION = 68,
1099 EXTENSION_RUN = 69,
1100
1101 CREATE_CONVDESC = 70,
1102 FREE_CONVDESC = 71,
1103 SET_CONVDESC = 72,
1104 GET_CONVINFO = 73,
1105 FWD_CONV = 74,
1106 BWD_CONV_BIAS = 75,
1107 BWD_CONV_FILTER = 76,
1108 BWD_CONV_DATA = 77,
1109
1110 CREATE_POOLDESC = 80,
1111 FREE_POOLDESC = 81,
1112 SET_POOLDESC = 82,
1113 FWD_POOL = 83,
1114 BWD_POOL = 84,
1115
1116 DERIVE_BNDESC = 86,
1117 FWD_BN = 87,
1118 BWD_BN = 88,
1119
1120 CREATE_LRNDESC = 90,
1121 FREE_LRNDESC = 91,
1122 SET_LRNDESC = 92,
1123
1124 GET_DROPOUT_INFO = 94,
1125 CREATE_DROPOUTDESC = 95,
1126 FREE_DROPOUTDESC = 96,
1127 SET_DROPOUTDESC = 97,
1128 FWD_DROPOUT = 98,
1129 BWD_DROPOUT = 99,
1130
1131 TANH_FWD = 100,
1132 TANH_BWD = 101,
1133
1134 ELU_FWD = 102,
1135 ELU_BWD = 103,
1136
1137 SIGMOID_FWD = 104,
1138 SIGMOID_BWD = 105,
1139
1140 RELU_FWD = 108,
1141 RELU_BWD = 109,
1142
1143 SOFTMAX_FWD = 111,
1144 SOFTMAX_BWD = 112,
1145
1146 LRN_CC_FWD = 120,
1147 LRN_CC_BWD = 121,
1148 LCN_CC_FWD = 122,
1149 LCN_CC_BWD = 123,
1150
1151 // DEPRECATED, use RNN8 instead
1152 CREATE_RNN_DATA_DESC = 130,
1153 FREE_RNN_DATA_DESC = 131,
1154 SET_RNN_DATA_DESC = 132,
1155
1156 // DEPRECATED, use RNN8 instead
1157 CREATE_RNN_DATA_DESCEX = 135,
1158 FREE_RNN_DATA_DESCEX = 136,
1159 SET_RNN_DATA_DESCEX = 137,
1160
1161 // DEPRECATED, use RNN8 instead
1162 CREATE_RNN_DESC = 140,
1163 FREE_RNN_DESC = 141,
1164 SET_RNN_DESC = 142,
1165 GET_RNN_PARAMCOUNT = 143,
1166 GET_RNN_WORKSPACECOUNT = 144,
1167 GET_RNN_LINLAYERPARAMS = 145,
1168 FWD_RNN = 146,
1169 BWD_RNN_DATA = 147,
1170 BWD_RNN_WTS = 148,
1171
1172 RNN8_IS_SUPPORTED = 150,
1173 RNN8_CREATE = 151,
1174 RNN8_FREE = 152,
1175 RNN8_SET = 153,
1176 RNN8_GET_MEMORY_SIZES = 154,
1177 RNN8_INIT_WEIGHTS = 155,
1178 RNN8_FWD = 156,
1179 RNN8_BWD = 157,
1180
1181 CUDA_SET = 200,
1182 CUDA_GET = 201,
1183 CUDA_COPY = 202,
1184 CUDA_COPY_SIM = 203,
1185 CUDA_COPY_FILL = 204,
1186 CUDA_SORT = 205,
1187 CUDA_COPY_BATCH = 206,
1188 CUDA_COPY_SEQUENCE = 207,
1189 CUDA_COPY_EXPAND = 208,
1190 CUDA_COPY_SEQUENCE2 = 209,
1191
1192 CUDA_ADD3 = 217,
1193 CUDA_GEAM = 218,
1194 CUDA_GEMM2 = 219,
1195 CUDA_GEMM = 220,
1196 CUDA_GEMV = 221,
1197 CUDA_AXPY = 222,
1198 CUDA_AXPBY = 223,
1199 CUDA_SCAL = 224,
1200 CUDA_DOT = 225,
1201 CUDA_ASUM = 226,
1202 CUDA_SCALE = 227,
1203 CUDA_ADD_SCALAR = 228,
1204 CUDA_ADD = 229,
1205 CUDA_SUB = 230,
1206 CUDA_MUL = 231,
1207 CUDA_MUL_SCALAR = 232,
1208 CUDA_DIV = 233,
1209 CUDA_ABS = 234,
1210 CUDA_EXP = 235,
1211 CUDA_LOG = 236,
1212 CUDA_POWX = 237,
1213 CUDA_SIGN = 238,
1214 CUDA_SQRT = 239,
1215 CUDA_RECIPROCOL = 240,
1216 CUDA_STUDENT = 241,
1217 CUDA_LOGISTIC1 = 242,
1218 CUDA_LOGISTIC2 = 243,
1219 CUDA_ADD2 = 244,
1220 CUDA_COMPARE_SIGNS = 245,
1221 CUDA_MAXVAL = 246,
1222 CUDA_MINVAL = 247,
1223 CUDA_SUMSQ = 248,
1224 CUDA_SUMSQDIFF = 249,
1225 CUDA_WIDTH = 250,
1226 CUDA_CONTAINS_POINT = 251,
1227 CUDA_DENAN = 252,
1228 CUDA_SUB_AND_DOT = 253,
1229 CUDA_MINMAXVAL = 254,
1230 CUDA_SUM = 255,
1231 CUDA_SQRT_SCALE = 256,
1232 CUDA_GER = 257,
1233 CUDA_SET_BOUNDS = 259,
1234 CUDA_MINMAXVEC = 260,
1235 CUDA_TRANSPOSE = 261,
1236 CUDA_SCALE_TO_RANGE = 262,
1237 CUDA_ERF = 263,
1238 CUDA_MASK = 264,
1239
1240 CUDA_INTERP2 = 265,
1241 CUDA_MASK_BATCH = 266,
1242 CUDA_TRANSPOSE_HW = 267,
1243
1244 CUDA_MAX = 268,
1245 CUDA_MIN = 269,
1246
1247 CUDA_MULBSX = 270,
1248 CUDA_DIVBSX = 271,
1249
1250 CUDA_MAX_BWD2 = 272,
1251
1252 CUDA_IM2COL = 280,
1253 CUDA_IM2COL_ND = 281,
1254 CUDA_COL2IM = 282,
1255 CUDA_COL2IM_ND = 283,
1256
1257 CUDA_ACCURACY_FWD = 286,
1258
1259 CUDA_CHANNEL_MEAN = 287,
1260 CUDA_CHANNEL_MIN = 289,
1261 CUDA_CHANNEL_MAX = 290,
1262 CUDA_CHANNEL_SUB = 291,
1263 CUDA_CHANNEL_SUM = 292,
1264 CUDA_CHANNEL_DIV = 293,
1265 CUDA_CHANNEL_DOT = 294,
1266 CUDA_CHANNEL_MUL = 295,
1267 CUDA_CHANNEL_COMPARE = 296,
1268 CUDA_CHANNEL_FILL = 297,
1269 CUDA_CHANNEL_SCALE = 298,
1270 CUDA_CHANNEL_MULV = 299,
1271 CUDA_CHANNEL_COPY = 300,
1272 CUDA_CHANNEL_FILLFROM = 301,
1273 CUDA_CHANNEL_COPYALL = 302,
1274 CUDA_CHANNEL_DUP = 303,
1275 CUDA_CHANNEL_ADD = 304,
1276 CUDA_CHANNEL_PERCENTILE = 305,
1277 CUDA_CHANNEL_OP_FWD = 306,
1278 CUDA_CHANNEL_OP_BWD = 307,
1279
1280 CUDA_RNG_SETSEED = 349,
1281 CUDA_RNG_UNIFORM = 350,
1282 CUDA_RNG_GAUSSIAN = 351,
1283 // CUDA_RNG_BERNOULLI = 352, // Not implemented yet.
1284
1285 CUDA_BATCHREIDX_FWD = 386,
1286 CUDA_BATCHREIDX_BWD = 387,
1287
1288 CUDA_EMBED_FWD = 390,
1289 CUDA_EMBED_BWD = 391,
1290
1291 CUDA_CLIP_FWD = 394,
1292 CUDA_CLIP_BWD = 395,
1293
1294 CUDA_POOL_FWD = 400,
1295 CUDA_POOL_BWD = 401,
1296
1297 CUDA_UNPOOL_FWD = 410,
1298 CUDA_UNPOOL_BWD = 411,
1299
1300 CUDA_TANH_FWD = 420,
1301 CUDA_TANH_BWD = 421,
1302
1303 CUDA_MISH_FWD = 422,
1304 CUDA_MISH_BWD = 423,
1305
1306 CUDA_SIGMOID_FWD = 424,
1307 CUDA_SIGMOID_BWD = 425,
1308
1309 CUDA_SWISH_BWD = 427,
1310
1311 CUDA_RELU_FWD = 428,
1312 CUDA_RELU_BWD = 429,
1313
1314 CUDA_ELU_FWD = 430,
1315 CUDA_ELU_BWD = 431,
1316
1317 CUDA_DROPOUT_FWD = 432,
1318 CUDA_DROPOUT_BWD = 433,
1319
1320 CUDA_BNLL_FWD = 435,
1321 CUDA_BNLL_BWD = 436,
1322
1323 CUDA_PRELU_FWD = 438,
1324 CUDA_PRELU_BWD = 439,
1325 CUDA_PRELU_BWD_PARAM = 440,
1326
1327 CUDA_NLLLOSS_FWD = 442,
1328 CUDA_NLLLOSS_BWD = 443,
1329
1330 CUDA_SOFTMAXLOSS_FWD = 444,
1331 CUDA_SOFTMAXLOSS_BWD = 445,
1332
1333 CUDA_MIN_FWD = 446,
1334 CUDA_MIN_BWD = 447,
1335
1336 CUDA_MAX_FWD = 448,
1337 CUDA_MAX_BWD = 449,
1338
1339 CUDA_CROP_FWD = 450,
1340 CUDA_CROP_BWD = 451,
1341
1342 CUDA_CONCAT_FWD = 452,
1343 CUDA_CONCAT_BWD = 453,
1344
1345 CUDA_SLICE_FWD = 455,
1346 CUDA_SLICE_BWD = 456,
1347
1348 CUDA_TILE_FWD = 457,
1349 CUDA_TILE_BWD = 458,
1350
1351 CUDA_BIAS_FWD = 460,
1352
1353 CUDA_SCALE_FWD = 461,
1354
1355 CUDA_THRESHOLD_FWD = 462,
1356
1357 CUDA_CLL_BWD = 463,
1358
1359 CUDA_LRN_FILLSCALE = 465,
1360 CUDA_LRN_COMPUTEOUTPUT = 466,
1361 CUDA_LRN_COMPUTEDIFF = 467,
1362
1363 CUDA_SMOOTHL1_FWD = 470,
1364 CUDA_SMOOTHL1_BWD = 471,
1365
1366 CUDA_SERF_FWD = 472,
1367 CUDA_SERF_BWD = 473,
1368
1369 CUDA_PERMUTE = 474,
1370
1371 CUDA_GATHER_FWD = 476,
1372 CUDA_GATHER_BWD = 477,
1373
1374 CUDA_LSTM_FWD = 480,
1375 CUDA_LSTM_BWD = 481,
1376
1377 CUDA_LSTM_UNIT_FWD = 482,
1378 CUDA_LSTM_UNIT_BWD = 483,
1379
1380 CUDA_MATH_FWD = 487,
1381 CUDA_MATH_BWD = 488,
1382
1383 CUDA_COEFF_SUM_FWD = 490,
1384 CUDA_COEFF_SUM_BWD = 491,
1385
1386 CUDA_COEFF_SUB_FWD = 492,
1387 CUDA_COEFF_SUB_BWD = 493,
1388
1389 CUDA_MEAN_ERROR_LOSS_BWD = 495,
1390
1391 CUDA_SIGMOID_CROSS_ENTROPY_FWD = 496,
1392 CUDA_SIGMOID_CROSS_ENTROPY_BWD = 497,
1393 CUDA_SOFTMAX_CROSS_ENTROPY_FWD = 498,
1394 CUDA_SOFTMAX_CROSS_ENTROPY_BWD = 499,
1395
1396 CUDA_SGD_UPDATE = 500,
1397 CUDA_NESTEROV_UPDATE = 501,
1398 CUDA_ADAGRAD_UPDATE = 502,
1399 CUDA_ADADELTA_UPDATE = 503,
1400 CUDA_ADAM_UPDATE = 504,
1401 CUDA_RMSPROP_UPDATE = 505,
1402 CUDA_ADAMW_UPDATE = 506,
1403
1404 CUDA_COMBINE_DATA = 550,
1405
1406 CUDA_GELU_FWD = 600,
1407 CUDA_GELU_BWD = 601,
1408
1409 CUDA_SILU_FWD = 605,
1410 CUDA_SILU_BWD = 606,
1411
1412 CUDA_SOFTPLUS_FWD = 610,
1413 CUDA_SOFTPLUS_BWD = 611,
1414
1415 CUDA_LECUN_FWD = 615,
1416 CUDA_LECUN_BWD = 616,
1417
1418 CUDA_MTX_SET_DIAGONAL = 700,
1419 CUDA_MTX_SET_DIAGONAL2 = 701,
1420 CUDA_MTX_ADD_VECTOR = 702,
1421 CUDA_MTX_TRANSPOSE_OPERATION = 703,
1422 CUDA_MTX_AGGREGATE_COLS = 704,
1423 CUDA_MTX_AGGREGATE_ROWS = 705,
1424 CUDA_MTX_TRANSPOSE = 706,
1425 CUDA_MTX_MEANCENTER_BY_COL = 707,
1426 CUDA_MTX_MEANCENTER_BY_ROW = 708,
1427 CUDA_MTX_EUCLIDEAN_DIST = 709,
1428 CUDA_MTX_DOT = 710,
1429 CUDA_MTX_MEAN = 711,
1430 CUDA_MTX_STDEV = 712,
1431 CUDA_MTX_CORRELATIONS = 714,
1432
1433 CUDA_CREATE_PCA = 800,
1434 CUDA_RUN_PCA = 801,
1435 CUDA_FREE_PCA = 802,
1436
1437 CUDA_TSNE_UPDATE = 850,
1438 CUDA_TSNE_UPDATE_GRAD = 851,
1439 CUDA_TSNE_COMPUTE_EXACT_ERROR = 852,
1440 CUDA_TSNE_COMPUTE_SQUARED_EUCLIDEAN_DISTANCE = 854,
1441 CUDA_TSNE_COMPUTE_Q_MATRIX = 855,
1442 CUDA_TSNE_COMPUTE_EXACT_GRADIENT = 856,
1443 CUDA_TSNE_SYMMETRIZE_MATRIX = 858,
1444 CUDA_TSNE_COMPUTE_KNN_BOUNDS = 859,
1445
1446 CUDA_TSNE_CREATE_GAUSSIAN_PERPLEXITY = 870,
1447 CUDA_TSNE_FREE_GAUSSIAN_PERPLEXITY = 871,
1448 CUDA_TSNE_FIND_GAUSSIAN_PERPLEXITY = 872,
1449
1450 CUDA_TSNE_CREATE = 875,
1451 CUDA_TSNE_FREE = 876,
1452 CUDA_TSNE_COMPUTE_GRADIENT1 = 877,
1453 CUDA_TSNE_COMPUTE_ERROR1 = 878,
1454
1455 CUDA_GUASSIAN_BLUR = 900,
1456 CUDA_HAMMING_DIFF = 901,
1457 CUDA_CALC_BATCH_DIST = 902,
1458 CUDA_CALC_DFT = 903,
1459
1460 CUDA_CREATE_SSD = 950,
1461 CUDA_FREE_SSD = 951,
1462 CUDA_SETUP_SSD = 952,
1463 CUDA_SSD_FWD_MULTIBOXLOSS = 955,
1464 CUDA_SSD_ENCODE_LOCPRED = 958,
1465 CUDA_SSD_ENCODE_CONFPRED = 959,
1466
1467 CUDA_CREATE_LAYERNORM = 970,
1468 CUDA_FREE_LAYERNORM = 971,
1469 CUDA_LAYERNORM_FWD = 975,
1470 CUDA_LAYERNORM_BWD = 976,
1471
1472 CUDA_DEBUG = 1000
1473 }
1474
1475#pragma warning restore 1591
1476
1477
/// <summary>
/// Creates a CudaDnn instance that owns its kernel: creates the CudaControl kernel
/// object, loads the CudaDnn DLL, initializes a kernel for the requested device and
/// optionally resets the device and sets the random seed.
/// </summary>
/// <param name="nDeviceID">Device ID passed to the kernel INITIALIZE call.</param>
/// <param name="flags">Initialization flags passed to the kernel INITIALIZE call.</param>
/// <param name="lSeed">When it has a value, SetRandomSeed is called with it after initialization.</param>
/// <param name="strPath">Path of the CudaDnn DLL to load. When null or empty, the static default path is
/// used, and when that is also empty the path returned by GetCudaDnnDllPath().</param>
/// <param name="bResetFirst">When true, ResetDevice() is called after the first initialization and the
/// kernel is then initialized a second time.</param>
/// <param name="bEnableMemoryTrace">Forwarded to the CudaDnnMemoryTracker constructor.</param>
/// <exception cref="Exception">Thrown when the CudaControl object cannot be created, or when loading the
/// DLL or initializing the kernel fails without an error message. Failures that carry a message are
/// rethrown unchanged.</exception>
public CudaDnn(int nDeviceID, DEVINIT flags = (DEVINIT.CUBLAS | DEVINIT.CURAND), long? lSeed = null, string strPath = "", bool bResetFirst = false, bool bEnableMemoryTrace = false)
{
    m_memTracker = new CudaDnnMemoryTracker<T>(bEnableMemoryTrace);
    m_nDeviceId = nDeviceID;
    m_nIdx = get_index();

    if (string.IsNullOrEmpty(strPath))
        strPath = s_strCudaPath;

    m_strPath = strPath;
    m_dt = (typeof(T) == typeof(double)) ? DataType.DOUBLE : DataType.FLOAT;

    try
    {
        m_cuda = new CudaControlLib.CudaKernel();
    }
    catch (Exception excpt)
    {
        throw new Exception("The CudaControl is not registered! Make sure that you are using the 'x64' build and if so, run 'regsvr32 CudaControl.dll' from a CMD window with Administrative privileges to register.", excpt);
    }

    try
    {
        if (string.IsNullOrEmpty(strPath))
            strPath = GetCudaDnnDllPath();

        m_strPath = strPath;

        string strDir = System.IO.Path.GetDirectoryName(strPath);
        string strCurDir = Directory.GetCurrentDirectory();

        // GetDirectoryName returns null or "" for root paths and bare file names; in that
        // case there is no directory to switch to, so the current directory is left alone.
        bool bChangeDir = !string.IsNullOrEmpty(strDir);

        if (bChangeDir)
            Directory.SetCurrentDirectory(strDir);

        try
        {
            // The DLL is loaded with its own directory as the current directory.
            m_cuda.Load(strPath);
        }
        finally
        {
            // The current directory is process-wide state; restore it even when Load fails.
            if (bChangeDir)
                Directory.SetCurrentDirectory(strCurDir);
        }
    }
    catch (Exception excpt)
    {
        // 'throw;' keeps the original stack trace ('throw excpt;' would reset it).
        if (!string.IsNullOrEmpty(excpt.Message))
            throw;

        throw new Exception("The CudaDnnDll.x.dll at '" + strPath + "' failed to load. The error code = 0x" + excpt.HResult.ToString("X"), excpt);
    }

    try
    {
        initializeKernelHandle(nDeviceID, flags);
    }
    catch (Exception excpt)
    {
        if (!string.IsNullOrEmpty(excpt.Message))
            throw;

        throw new Exception("CudaDnn failed to initialize. You may need to reboot or reset the Cuda GPU #" + nDeviceID.ToString() + ". The error code = 0x" + excpt.HResult.ToString("X"), excpt);
    }

    if (bResetFirst)
    {
        ResetDevice();

        // As in the original flow, the kernel is initialized again after the reset; this
        // second initialization is not wrapped in the error translation above.
        initializeKernelHandle(nDeviceID, flags);
    }

    if (lSeed.HasValue)
        SetRandomSeed(lSeed.Value);

    m_tOne = (T)Convert.ChangeType(1.0, typeof(T));
    m_tZero = (T)Convert.ChangeType(0.0, typeof(T));
}

/// <summary>
/// Runs the kernel INITIALIZE function under the static creation lock and stores the
/// returned kernel handle in m_hKernel.
/// </summary>
/// <param name="nDeviceID">Device ID passed to INITIALIZE.</param>
/// <param name="flags">Initialization flags passed to INITIALIZE.</param>
private void initializeKernelHandle(int nDeviceID, DEVINIT flags)
{
    lock (m_createSync)
    {
        if (m_dt == DataType.DOUBLE)
        {
            double[] rg = m_cuda.RunDouble(0, (int)CUDAFN.INITIALIZE, m_param.AsDouble(nDeviceID, (int)flags));
            m_hKernel = (long)rg[0];
        }
        else
        {
            float[] rg = m_cuda.RunFloat(0, (int)CUDAFN.INITIALIZE, m_param.AsFloat(nDeviceID, (int)flags));
            m_hKernel = (long)rg[0];
        }
    }
}
1581
/// <summary>
/// Creates a non-owning CudaDnn instance that shares the kernel object, kernel handle,
/// DLL path, data type and one/zero constants of an existing instance.
/// NOTE(review): m_memTracker is not assigned here, so it keeps its default (null) value
/// on instances built with this constructor - confirm that no code path dereferences it.
/// </summary>
/// <param name="cuda">Existing instance whose kernel is shared.</param>
/// <param name="bEnableGhostMemory">When true, the ghost memory dictionary is created and ghost memory is enabled.</param>
public CudaDnn(CudaDnn<T> cuda, bool bEnableGhostMemory)
{
    // The source instance is read before get_index() is called, exactly as before.
    m_nDeviceId = cuda.m_nDeviceId;
    m_nIdx = get_index();

    // Share the kernel and its settings with the source instance.
    m_hKernel = cuda.m_hKernel;
    m_cuda = cuda.m_cuda;
    m_dt = cuda.m_dt;
    m_strPath = cuda.m_strPath;
    m_tZero = cuda.m_tZero;
    m_tOne = cuda.m_tOne;

    // This instance does not own the kernel, so Dispose will not clean it up.
    m_bOwner = false;

    if (!bEnableGhostMemory)
        return;

    m_bGhostMemoryEnabled = true;
    m_rgGhostMemory = new Dictionary<long, T[]>();
}
1607
/// <summary>
/// Releases the kernel held by this instance. Nothing is done unless this instance owns
/// the kernel and the kernel handle is non-zero.
/// </summary>
/// <param name="bDisposing">Not used by this implementation.</param>
protected virtual void Dispose(bool bDisposing)
{
    if (!m_bOwner || m_hKernel == 0)
        return;

    int nKernel = (int)m_hKernel;
    int nCleanupFn = (int)CUDAFN.CLEANUP;

    // Run the cleanup function through the entry point that matches the base type.
    if (m_dt != DataType.DOUBLE)
        m_cuda.RunFloat(nKernel, nCleanupFn, null);
    else
        m_cuda.RunDouble(nKernel, nCleanupFn, null);

    // Clearing the handle makes any further Dispose call a no-op.
    m_hKernel = 0;
    m_cuda = null;
}
1625
/// <summary>
/// Releases the resources held by this instance by calling Dispose(true), then tells the
/// garbage collector that finalization is no longer needed for it.
/// </summary>
public void Dispose()
{
    Dispose(true);

    // Standard dispose pattern: this is a no-op when the type has no finalizer, and it
    // prevents a redundant finalizer run for this class or any derived class that has one.
    GC.SuppressFinalize(this);
}
1633
/// <summary>
/// Returns the path of the low-level CudaDnn DLL, searched for in the directory of the current process's main module.
/// </summary>
/// <remarks>
/// The candidates are probed in the order listed below and the first one that exists on disk is returned.
/// When none of them exists, the path to 'CudaDnnDll.8.dll' is returned WITHOUT verifying that it exists.
/// </remarks>
/// <returns>The full path of the first existing candidate, or the unverified 'CudaDnnDll.8.dll' fallback path.</returns>
public static string GetCudaDnnDllPath()
{
    string strDir;

    // Process is IDisposable; release it as soon as the module path has been read.
    using (Process process = Process.GetCurrentProcess())
    {
        FileInfo fi = new FileInfo(process.MainModule.FileName);
        strDir = fi.DirectoryName;
    }

    // Relative candidates in probe order (the order is significant).
    string[] rgstrCandidates = new string[]
    {
        "\\cuda_12.1\\CudaDnnDll.12.1.dll",
        "\\CudaDnnDll.12.1.dll",
        "\\cuda_12.0\\CudaDnnDll.12.0.dll",
        "\\CudaDnnDll.12.0.dll",
        "\\cuda_11.8\\CudaDnnDll.11.8.dll",
        "\\CudaDnnDll.11.8.dll",
        "\\cuda_11.7\\CudaDnnDll.11.7.dll",
        "\\CudaDnnDll.11.7.dll",
        "\\cuda_11.6\\CudaDnnDll.11.6.dll",
        "\\CudaDnnDll.11.6.dll",
        "\\cuda_11.5\\CudaDnnDll.11.5.dll",
        "\\CudaDnnDll.11.5.dll",
        "\\cuda_11.4\\CudaDnnDll.11.4.dll",
        "\\CudaDnnDll.11.4.dll",
        "\\cuda_11.3\\CudaDnnDll.11.3.dll",
        "\\CudaDnnDll.11.3.dll",
        "\\cuda_11.2\\CudaDnnDll.11.2.dll",
        "\\CudaDnnDll.11.2.dll",
        "\\cuda_11.1\\CudaDnnDll.11.1.dll",
        "\\CudaDnnDll.11.1.dll",
        "\\cuda_11.0\\CudaDnnDll.11.0.dll",
        "\\CudaDnnDll.11.0.dll",
        "\\cuda_10.2\\CudaDnnDll.10.2.dll",
        "\\CudaDnnDll.10.2.dll",
        "\\cuda_10.2.3_5\\CudaDnnDll.10.2.dll",
        "\\CudaDnnDll.10.2.3_5.dll",
        "\\CudaDnnDll.10.1.dll",
        "\\CudaDnnDll.10.0.dll",
        "\\CudaDnnDll.9.2.dll",
        "\\CudaDnnDll.9.1.dll"
    };

    foreach (string strCandidate in rgstrCandidates)
    {
        string strPath = strDir + strCandidate;

        if (File.Exists(strPath))
            return strPath;
    }

    // Last resort: returned unverified, exactly as the original nested search did.
    return strDir + "\\CudaDnnDll.8.dll";
}
1771
1776 {
1777 m_bGhostMemoryEnabled = false;
1778 }
1779
/// <summary>
/// Resets the ghost-memory flag: enabled when a ghost memory table exists, disabled otherwise.
/// </summary>
public void ResetGhostMemory()
{
    m_bGhostMemoryEnabled = (m_rgGhostMemory != null);
}
1790
/// <summary>
/// Returns the total memory used as reported by the memory tracker.
/// </summary>
public ulong TotalMemoryUsed
{
    get
    {
        return m_memTracker.TotalMemoryUsed;
    }
}
1798
/// <summary>
/// Returns the memory tracker's textual description of the total memory used.
/// </summary>
public string TotalMemoryUsedAsText
{
    get
    {
        return m_memTracker.TotalMemoryUsedText;
    }
}
1806
/// <summary>
/// Returns the handle of the low-level kernel used by this instance.
/// </summary>
public long KernelHandle
{
    get
    {
        return m_hKernel;
    }
}
1815
/// <summary>
/// Dispatches CUDAFN.KERNEL_MEMCOPY on the source kernel, forwarding all handles, offsets and the count unchanged.
/// </summary>
/// <param name="nCount">Count forwarded to the kernel.</param>
/// <param name="hSrc">Source memory handle.</param>
/// <param name="nSrcOffset">Offset into the source.</param>
/// <param name="hDstKernel">Destination kernel handle.</param>
/// <param name="hDst">Destination memory handle.</param>
/// <param name="nDstOffset">Offset into the destination.</param>
/// <param name="hHostBuffer">Host buffer handle forwarded to the kernel.</param>
/// <param name="hHostKernel">Host kernel handle forwarded to the kernel (default -1).</param>
/// <param name="hStream">Stream handle forwarded to the kernel (default -1).</param>
/// <param name="hSrcKernel">Kernel on which the call is run; -1 selects this instance's own kernel.</param>
public void KernelCopy(int nCount, long hSrc, int nSrcOffset, long hDstKernel, long hDst, int nDstOffset, long hHostBuffer, long hHostKernel = -1, long hStream = -1, long hSrcKernel = -1)
{
    // -1 means "use the kernel owned by this instance".
    long hRunKernel = (hSrcKernel == -1) ? m_hKernel : hSrcKernel;

    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDouble((int)hRunKernel, (int)CUDAFN.KERNEL_MEMCOPY, m_param.AsDouble(nCount, hSrc, nSrcOffset, hDstKernel, hDst, nDstOffset, hHostBuffer, hHostKernel, hStream));
            break;

        default:
            m_cuda.RunFloat((int)hRunKernel, (int)CUDAFN.KERNEL_MEMCOPY, m_param.AsFloat(nCount, hSrc, nSrcOffset, hDstKernel, hDst, nDstOffset, hHostBuffer, hHostKernel, hStream));
            break;
    }
}
1839
/// <summary>
/// Dispatches CUDAFN.KERNEL_ADD on this instance's kernel with the given count and handles.
/// </summary>
/// <param name="nCount">Count forwarded to the kernel.</param>
/// <param name="hA">Handle of operand A.</param>
/// <param name="hDstKernel">Handle of the destination kernel.</param>
/// <param name="hB">Handle of operand B.</param>
/// <param name="hC">Handle of operand C.</param>
public void KernelAdd(int nCount, long hA, long hDstKernel, long hB, long hC)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.KERNEL_ADD, m_param.AsDouble(nCount, hA, hDstKernel, hB, hC));
            break;

        default:
            m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.KERNEL_ADD, m_param.AsFloat(nCount, hA, hDstKernel, hB, hC));
            break;
    }
}
1855
/// <summary>
/// Dispatches CUDAFN.KERNEL_COPY_NCCL on this instance's kernel and returns the first result value as a handle.
/// </summary>
/// <param name="hSrcKernel">Handle of the source kernel.</param>
/// <param name="hSrcNccl">Handle of the NCCL instance within the source kernel.</param>
/// <returns>The first element of the kernel's result, cast to long.</returns>
public long KernelCopyNccl(long hSrcKernel, long hSrcNccl)
{
    if (m_dt == DataType.DOUBLE)
    {
        double[] rgResultD = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.KERNEL_COPY_NCCL, m_param.AsDouble(hSrcKernel, hSrcNccl));
        return (long)rgResultD[0];
    }

    float[] rgResultF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.KERNEL_COPY_NCCL, m_param.AsFloat(hSrcKernel, hSrcNccl));
    return (long)rgResultF[0];
}
1879
/// <summary>
/// Advances the shared index seed by one and returns the new value.
/// </summary>
private static int get_index()
{
    return ++s_nIdxSeed;
}
1885
/// <summary>
/// Stores the default path of the low-level CudaDnn DLL (later returned by DefaultPath).
/// </summary>
/// <param name="strPath">Path to store.</param>
public static void SetDefaultCudaPath(string strPath)
{
    string strNewDefault = strPath;
    s_strCudaPath = strNewDefault;
}
1894
/// <summary>
/// Returns the size in bytes of one item of the base type.
/// </summary>
/// <param name="bUseHalfSize">When true, 2 is returned regardless of the base type.</param>
/// <returns>2 for half size, 4 when T is float, and 8 otherwise.</returns>
public static ulong basetype_size(bool bUseHalfSize)
{
    if (bUseHalfSize)
        return 2;

    bool bIsFloat = (typeof(T) == typeof(float));
    return bIsFloat ? (ulong)4 : (ulong)8;
}
1909
/// <summary>
/// Converts a single base-type value to a double using Convert.ChangeType.
/// </summary>
/// <param name="fVal">Value to convert.</param>
/// <returns>The value as a double.</returns>
private double convertD(T fVal)
{
    object objConverted = Convert.ChangeType(fVal, typeof(double));
    return (double)objConverted;
}
1914
/// <summary>
/// Converts a single base-type value to a float using Convert.ChangeType.
/// </summary>
/// <param name="fVal">Value to convert.</param>
/// <returns>The value as a float.</returns>
private float convertF(T fVal)
{
    object objConverted = Convert.ChangeType(fVal, typeof(float));
    return (float)objConverted;
}
1919
/// <summary>
/// Returns the path stored on this instance (the same value returned by GetRequiredCompute).
/// </summary>
public string Path
{
    get
    {
        return m_strPath;
    }
}
1927
/// <summary>
/// Returns the default CudaDnn DLL path previously stored with SetDefaultCudaPath.
/// </summary>
public static string DefaultPath
{
    get
    {
        return s_strCudaPath;
    }
}
1935
1936#pragma warning disable 1591
1937
/// <summary>
/// Dispatches CUDAFN.CUDA_COMBINE_DATA on this instance's kernel.
/// </summary>
/// <remarks>
/// The two percentages are passed as floating-point arguments; the count and handles are passed as long arguments
/// with a literal 0 after hUpdated and after hServer (presumably offsets - TODO confirm against the native side).
/// </remarks>
/// <param name="nCount">Count forwarded to the kernel.</param>
/// <param name="hOriginal">Handle of the original data.</param>
/// <param name="hUpdated">Handle of the updated data.</param>
/// <param name="dfUpdatedPct">Percentage associated with the updated data.</param>
/// <param name="hServer">Handle of the server data.</param>
/// <param name="dfServerPct">Percentage associated with the server data.</param>
/// <param name="hNewData">Handle of the new data.</param>
public void CombineData(int nCount, long hOriginal, long hUpdated, double dfUpdatedPct, long hServer, double dfServerPct, long hNewData)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COMBINE_DATA, m_param.AsDouble(dfUpdatedPct, dfServerPct), m_param.AsLong(nCount, hOriginal, hUpdated, 0, hServer, 0, hNewData));
            break;

        default:
            m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COMBINE_DATA, m_param.AsFloat((float)dfUpdatedPct, (float)dfServerPct), m_param.AsLong(nCount, hOriginal, hUpdated, 0, hServer, 0, hNewData));
            break;
    }
}
1945
1946#pragma warning restore 1591
1947
1948
1949 //---------------------------------------------------------------------
1950 // ICudaDevice Methods
1951 //---------------------------------------------------------------------
1952 #region ICudaDevice Methods
1953
/// <summary>
/// Selects the device used by this instance's kernel by dispatching CUDAFN.SETDEVICE.
/// </summary>
/// <param name="nDeviceID">Device ID to select. When -1, the device ID already stored on this instance is used; otherwise the stored ID is replaced with this value.</param>
/// <param name="flags">Initialization flags forwarded to the kernel.</param>
/// <param name="lSeed">Optional seed; it is only forwarded when a value is supplied.</param>
/// <exception cref="Exception">Thrown when the low-level connection is null or the kernel handle is not positive.</exception>
public void SetDeviceID(int nDeviceID = -1, DEVINIT flags = DEVINIT.NONE, long? lSeed = null)
{
    if (m_cuda == null || m_hKernel <= 0)
        throw new Exception("CudaDnn has already been disposed!");

    if (nDeviceID == -1)
        nDeviceID = m_nDeviceId;
    else
        m_nDeviceId = nDeviceID;

    if (m_dt == DataType.DOUBLE)
    {
        if (lSeed.HasValue)
            m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.SETDEVICE, m_param.AsDouble(nDeviceID, (int)flags, lSeed.Value));
        else
            m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.SETDEVICE, m_param.AsDouble(nDeviceID, (int)flags));
    }
    else
    {
        if (lSeed.HasValue)
            m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.SETDEVICE, m_param.AsFloat(nDeviceID, (int)flags, lSeed.Value));
        else
            m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.SETDEVICE, m_param.AsFloat(nDeviceID, (int)flags));
    }
}
1985
/// <summary>
/// Sets the random seed by dispatching CUDAFN.SETRANDOMSEED on this instance's kernel.
/// </summary>
/// <param name="lSeed">Seed value forwarded to the kernel.</param>
/// <exception cref="Exception">Thrown when the low-level connection is null or the kernel handle is not positive.</exception>
public void SetRandomSeed(long lSeed)
{
    if (m_cuda == null || m_hKernel <= 0)
        throw new Exception("CudaDnn has already been disposed!");

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.SETRANDOMSEED, m_param.AsDouble(lSeed));
    else
        m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.SETRANDOMSEED, m_param.AsFloat(lSeed));
}
2000
/// <summary>
/// Returns the device ID stored on this instance (as last set through SetDeviceID).
/// </summary>
public int OriginalDeviceID
{
    get
    {
        return m_nDeviceId;
    }
}
2008
/// <summary>
/// Queries the current device ID by dispatching CUDAFN.GETDEVICE on this instance's kernel.
/// </summary>
/// <returns>The first element of the kernel's result, cast to int.</returns>
/// <exception cref="Exception">Thrown when the low-level connection is null or the kernel handle is not positive.</exception>
public int GetDeviceID()
{
    if (m_cuda == null || m_hKernel <= 0)
        throw new Exception("CudaDnn has already been disposed!");

    if (m_dt == DataType.DOUBLE)
    {
        double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.GETDEVICE, null);
        return (int)rg[0];
    }
    else
    {
        float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.GETDEVICE, null);
        return (int)rg[0];
    }
}
2029
/// <summary>
/// Queries the name of a device using CUDAQRY.DEVICE_NAME.
/// </summary>
/// <param name="nDeviceID">ID of the device to query.</param>
/// <returns>The first string returned by the query.</returns>
/// <exception cref="Exception">Thrown when the low-level connection is null or the kernel handle is not positive.</exception>
public string GetDeviceName(int nDeviceID)
{
    if (m_cuda == null || m_hKernel <= 0)
        throw new Exception("CudaDnn has already been disposed!");

    string[] rgstr = m_cuda.QueryString((int)m_hKernel, (int)CUDAQRY.DEVICE_NAME, new int[] { nDeviceID });
    return rgstr[0];
}
2043
/// <summary>
/// Queries the peer-to-peer information of a device using CUDAQRY.DEVICE_P2P_INFO.
/// </summary>
/// <param name="nDeviceID">ID of the device to query.</param>
/// <returns>The first string returned by the query.</returns>
/// <exception cref="Exception">Thrown when the low-level connection is null or the kernel handle is not positive.</exception>
public string GetDeviceP2PInfo(int nDeviceID)
{
    if (m_cuda == null || m_hKernel <= 0)
        throw new Exception("CudaDnn has already been disposed!");

    string[] rgstr = m_cuda.QueryString((int)m_hKernel, (int)CUDAQRY.DEVICE_P2P_INFO, new int[] { nDeviceID });
    return rgstr[0];
}
2057
/// <summary>
/// Queries descriptive information about a device using CUDAQRY.DEVICE_INFO.
/// </summary>
/// <param name="nDeviceID">ID of the device to query.</param>
/// <param name="bVerbose">Forwarded to the query as 1 (true) or 0 (false).</param>
/// <returns>The first string returned by the query.</returns>
/// <exception cref="Exception">Thrown when the low-level connection is null or the kernel handle is not positive.</exception>
public string GetDeviceInfo(int nDeviceID, bool bVerbose = false)
{
    if (m_cuda == null || m_hKernel <= 0)
        throw new Exception("CudaDnn has already been disposed!");

    string[] rgstr = m_cuda.QueryString((int)m_hKernel, (int)CUDAQRY.DEVICE_INFO, new int[] { nDeviceID, (bVerbose) ? 1 : 0 });
    return rgstr[0];
}
2072
/// <summary>
/// Resets the device by dispatching CUDAFN.RESETDEVICE on this instance's kernel.
/// </summary>
/// <exception cref="Exception">Thrown when the low-level connection is null or the kernel handle is not positive.</exception>
public void ResetDevice()
{
    if (m_cuda == null || m_hKernel <= 0)
        throw new Exception("CudaDnn has already been disposed!");

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.RESETDEVICE, null);
    else
        m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.RESETDEVICE, null);
}
2089
/// <summary>
/// Synchronizes the device by dispatching CUDAFN.SYNCHRONIZEDEVICE on this instance's kernel.
/// </summary>
/// <exception cref="Exception">Thrown when the low-level connection is null or the kernel handle is not positive.</exception>
public void SynchronizeDevice()
{
    if (m_cuda == null || m_hKernel <= 0)
        throw new Exception("CudaDnn has already been disposed!");

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.SYNCHRONIZEDEVICE, null);
    else
        m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.SYNCHRONIZEDEVICE, null);
}
2103
/// <summary>
/// Queries the DEVPROP.MULTIGPUBOARDGROUPID property of a device via CUDAFN.GETDEVICEPROP.
/// </summary>
/// <param name="nDeviceID">ID of the device to query.</param>
/// <returns>The first element of the kernel's result, cast to int.</returns>
public int GetMultiGpuBoardGroupID(int nDeviceID)
{
    if (m_dt == DataType.DOUBLE)
    {
        double[] rgPropD = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.GETDEVICEPROP, m_param.AsDouble(nDeviceID, (int)DEVPROP.MULTIGPUBOARDGROUPID));
        return (int)rgPropD[0];
    }

    float[] rgPropF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.GETDEVICEPROP, m_param.AsFloat(nDeviceID, (int)DEVPROP.MULTIGPUBOARDGROUPID));
    return (int)rgPropF[0];
}
2122
/// <summary>
/// Queries the DEVPROP.DEVICECOUNT property via CUDAFN.GETDEVICEPROP.
/// </summary>
/// <returns>
/// The first element of the kernel's result cast to int, or 0 when the low-level connection is null,
/// the kernel handle is not positive, or the query throws.
/// </returns>
public int GetDeviceCount()
{
    if (m_cuda == null || m_hKernel <= 0)
        return 0;

    int nDevices;

    try
    {
        if (m_dt == DataType.DOUBLE)
        {
            double[] rgPropD = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.GETDEVICEPROP, m_param.AsDouble(0, (int)DEVPROP.DEVICECOUNT));
            nDevices = (int)rgPropD[0];
        }
        else
        {
            float[] rgPropF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.GETDEVICEPROP, m_param.AsFloat(0, (int)DEVPROP.DEVICECOUNT));
            nDevices = (int)rgPropF[0];
        }
    }
    catch (Exception)
    {
        // Best effort: any failure is reported as "no devices".
        nDevices = 0;
    }

    return nDevices;
}
2150
/// <summary>
/// Dispatches CUDAFN.CHECKMEMORYATTRIB for a source/destination memory pair and reports the result as a boolean.
/// </summary>
/// <param name="hSrc">Handle of the source memory.</param>
/// <param name="nSrcDeviceID">Device ID associated with the source.</param>
/// <param name="hDst">Handle of the destination memory.</param>
/// <param name="nDstDeviceID">Device ID associated with the destination.</param>
/// <returns>false when the first result element equals 0, true otherwise.</returns>
public bool CheckMemoryAttributes(long hSrc, int nSrcDeviceID, long hDst, int nDstDeviceID)
{
    if (m_dt == DataType.DOUBLE)
    {
        double[] rgCheckD = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CHECKMEMORYATTRIB, m_param.AsDouble(hSrc, nSrcDeviceID, hDst, nDstDeviceID));
        return rgCheckD[0] != 0;
    }

    float[] rgCheckF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CHECKMEMORYATTRIB, m_param.AsFloat(hSrc, nSrcDeviceID, hDst, nDstDeviceID));
    return rgCheckF[0] != 0;
}
2173
/// <summary>
/// Queries device memory statistics via CUDAFN.GETDEVICEMEMORY.
/// </summary>
/// <param name="dfFree">Receives the second result element (free memory).</param>
/// <param name="dfUsed">Receives the third result element (used memory).</param>
/// <param name="bCudaCallUsed">Receives false when the fourth result element equals 0, true otherwise.</param>
/// <param name="nDeviceID">Device to query; -1 selects the device ID stored on this instance.</param>
/// <returns>The first result element (units are defined by the native side - TODO confirm).</returns>
public double GetDeviceMemory(out double dfFree, out double dfUsed, out bool bCudaCallUsed, int nDeviceID = -1)
{
    int nQueryDevice = (nDeviceID == -1) ? m_nDeviceId : nDeviceID;

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgMemD = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.GETDEVICEMEMORY, m_param.AsDouble(nQueryDevice));
        dfFree = rgMemD[1];
        dfUsed = rgMemD[2];
        bCudaCallUsed = rgMemD[3] != 0;
        return rgMemD[0];
    }

    float[] rgMemF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.GETDEVICEMEMORY, m_param.AsFloat(nQueryDevice));
    dfFree = (double)rgMemF[1];
    dfUsed = (double)rgMemF[2];
    bCudaCallUsed = rgMemF[3] != 0;
    return (double)rgMemF[0];
}
2204
/// <summary>
/// Queries the minimum required compute version via CUDAFN.GETREQUIREDCOMPUTE.
/// </summary>
/// <param name="nMinMajor">Receives the first result element, cast to int.</param>
/// <param name="nMinMinor">Receives the second result element, cast to int.</param>
/// <returns>The path stored on this instance.</returns>
public string GetRequiredCompute(out int nMinMajor, out int nMinMinor)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            double[] rgComputeD = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.GETREQUIREDCOMPUTE, null);
            nMinMajor = (int)rgComputeD[0];
            nMinMinor = (int)rgComputeD[1];
            break;

        default:
            float[] rgComputeF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.GETREQUIREDCOMPUTE, null);
            nMinMajor = (int)rgComputeF[0];
            nMinMinor = (int)rgComputeF[1];
            break;
    }

    return m_strPath;
}
2233
/// <summary>
/// Dispatches CUDAFN.DEVICE_CANACCESSPEER for a source/peer device pair and reports the result as a boolean.
/// </summary>
/// <param name="nSrcDeviceID">ID of the source device.</param>
/// <param name="nPeerDeviceID">ID of the peer device.</param>
/// <returns>false when the first result element equals 0, true otherwise.</returns>
public bool DeviceCanAccessPeer(int nSrcDeviceID, int nPeerDeviceID)
{
    if (m_dt == DataType.DOUBLE)
    {
        double[] rgPeerD = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.DEVICE_CANACCESSPEER, m_param.AsDouble(nSrcDeviceID, nPeerDeviceID));
        return rgPeerD[0] != 0;
    }

    float[] rgPeerF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.DEVICE_CANACCESSPEER, m_param.AsFloat(nSrcDeviceID, nPeerDeviceID));
    return rgPeerF[0] != 0;
}
2253
/// <summary>
/// Dispatches CUDAFN.DEVICE_ENABLEPEERACCESS for the given peer device.
/// </summary>
/// <param name="nPeerDeviceID">ID of the peer device.</param>
public void DeviceEnablePeerAccess(int nPeerDeviceID)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.DEVICE_ENABLEPEERACCESS, m_param.AsDouble(nPeerDeviceID));
            break;

        default:
            m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.DEVICE_ENABLEPEERACCESS, m_param.AsFloat(nPeerDeviceID));
            break;
    }
}
2265
/// <summary>
/// Dispatches CUDAFN.DEVICE_DISABLEPEERACCESS for the given peer device.
/// </summary>
/// <param name="nPeerDeviceID">ID of the peer device.</param>
public void DeviceDisablePeerAccess(int nPeerDeviceID)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.DEVICE_DISABLEPEERACCESS, m_param.AsDouble(nPeerDeviceID));
            break;

        default:
            m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.DEVICE_DISABLEPEERACCESS, m_param.AsFloat(nPeerDeviceID));
            break;
    }
}
2277
2278 #endregion
2279
2280 //---------------------------------------------------------------------
2281 // ICudaMemory Methods
2282 //---------------------------------------------------------------------
2283 #region ICudaMemory Methods
2284
/// <summary>
/// Allocates memory initialized from a list of doubles by forwarding to the double-array overload.
/// </summary>
/// <param name="rg">Values to copy into the new allocation.</param>
/// <returns>The handle returned by the array overload.</returns>
public long AllocMemory(List<double> rg)
{
    double[] rgValues = rg.ToArray();
    return AllocMemory(rgValues);
}
2295
/// <summary>
/// Allocates memory initialized from a list of floats by forwarding to the float-array overload.
/// </summary>
/// <param name="rg">Values to copy into the new allocation.</param>
/// <returns>The handle returned by the array overload.</returns>
public long AllocMemory(List<float> rg)
{
    float[] rgValues = rg.ToArray();
    return AllocMemory(rgValues);
}
2306
/// <summary>
/// Allocates memory initialized from an array of doubles, converting it to the base type first.
/// </summary>
/// <param name="rgSrc">Values to copy into the new allocation.</param>
/// <param name="hStream">Stream handle forwarded to the base-type overload (default 0).</param>
/// <returns>The handle returned by the base-type overload.</returns>
public long AllocMemory(double[] rgSrc, long hStream = 0)
{
    T[] rgConverted = convert(rgSrc);
    return AllocMemory(rgConverted, hStream);
}
2318
/// <summary>
/// Allocates memory initialized from an array of floats, converting it to the base type first.
/// </summary>
/// <param name="rgSrc">Values to copy into the new allocation.</param>
/// <param name="hStream">Stream handle forwarded to the base-type overload (default 0).</param>
/// <returns>The handle returned by the base-type overload.</returns>
public long AllocMemory(float[] rgSrc, long hStream = 0)
{
    T[] rgConverted = convert(rgSrc);
    return AllocMemory(rgConverted, hStream);
}
2330
/// <summary>
/// Allocates memory initialized with the given base-type values and registers it with the memory tracker.
/// </summary>
/// <remarks>
/// When ghost memory is configured and enabled, no kernel call is made; the data is stored in the ghost
/// memory table under a newly incremented index instead.
/// NOTE(review): the array sent to the kernel, the array stored as ghost memory and the item count given to
/// the tracker all include the leading length (and optional stream) entries - confirm this is intended.
/// </remarks>
/// <param name="rgSrc">Values to copy into the new allocation; must be non-null and non-empty.</param>
/// <param name="hStream">Stream handle; only forwarded when greater than 0.</param>
/// <param name="bHalfSize">Requests a half-sized allocation (CUDAFN.ALLOCMEM_HALF); only valid with the 'float' base type.</param>
/// <returns>The handle returned by the memory tracker.</returns>
/// <exception cref="ArgumentNullException">Thrown when rgSrc is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown when rgSrc is empty.</exception>
/// <exception cref="Exception">Thrown when half size is requested with the 'double' base type, or (as an 'Out of memory!' error wrapping the original exception) when the allocation itself fails.</exception>
public long AllocMemory(T[] rgSrc, long hStream = 0, bool bHalfSize = false)
{
    if (rgSrc == null)
        throw new ArgumentNullException();

    if (rgSrc.Length == 0)
        throw new ArgumentOutOfRangeException();

    // Validate BEFORE the try block: inside it, the catch-all below would re-report this
    // usage error as a misleading 'Out of memory!' failure.
    if (bHalfSize && m_dt == DataType.DOUBLE)
        throw new Exception("Half sizes are only supported with the 'float' base type.");

    try
    {
        if (m_dt == DataType.DOUBLE)
        {
            List<double> rgInput = new List<double>() { rgSrc.Length };
            List<long> rgInput2 = new List<long>() { rgSrc.Length };

            if (hStream > 0)
            {
                rgInput.Add(hStream);
                rgInput2.Add(hStream);
            }

            rgInput.AddRange(convertD(rgSrc));

            double[] rg;

            lock (m_memSync)
            {
                if (m_rgGhostMemory == null || !m_bGhostMemoryEnabled)
                {
                    rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.ALLOCMEM, rgInput.ToArray(), rgInput2.ToArray());
                }
                else
                {
                    // Ghost mode: keep a host-side copy keyed by a synthetic handle.
                    m_nGhostMemoryIndex++;
                    m_rgGhostMemory.Add(m_nGhostMemoryIndex, convert(Utility.Clone<double>(rgInput).ToArray()));
                    rg = new double[] { m_nGhostMemoryIndex };
                }

                return m_memTracker.AllocMemory(m_hKernel, m_nDeviceId, (long)rg[0], (ulong)rgInput.Count, bHalfSize);
            }
        }
        else
        {
            List<float> rgInput = new List<float>() { rgSrc.Length };
            List<long> rgInput2 = new List<long>() { rgSrc.Length };

            if (hStream > 0)
            {
                rgInput.Add(hStream);
                rgInput2.Add(hStream);
            }

            rgInput.AddRange(convertF(rgSrc));

            float[] rg;

            lock (m_memSync)
            {
                if (m_rgGhostMemory == null || !m_bGhostMemoryEnabled)
                {
                    if (bHalfSize)
                        rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.ALLOCMEM_HALF, rgInput.ToArray(), rgInput2.ToArray());
                    else
                        rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.ALLOCMEM, rgInput.ToArray(), rgInput2.ToArray());
                }
                else
                {
                    // Ghost mode: keep a host-side copy keyed by a synthetic handle.
                    m_nGhostMemoryIndex++;
                    m_rgGhostMemory.Add(m_nGhostMemoryIndex, convert(Utility.Clone<float>(rgInput).ToArray()));
                    rg = new float[] { m_nGhostMemoryIndex };
                }

                return m_memTracker.AllocMemory(m_hKernel, m_nDeviceId, (long)rg[0], (ulong)rgInput.Count, bHalfSize);
            }
        }
    }
    catch (Exception excpt)
    {
        string strMemory = m_memTracker.TotalMemoryUsedText;
        string strDevice = GetDeviceName(m_nDeviceId);
        throw new Exception("Out of memory! You are currently using " + strMemory + " of memory on " + strDevice + ". You may need to use a different GPU that has more memory.", excpt);
    }
}
2424
/// <summary>
/// Returns the stored base-type size.
/// </summary>
public static ulong BaseSize
{
    get
    {
        return m_lBaseSize;
    }
}
2432
/// <summary>
/// Converts a size in bytes into a number of base-type items using integer division by the base size.
/// </summary>
/// <param name="ulSizeInBytes">Size in bytes.</param>
/// <returns>ulSizeInBytes divided by the stored base size (truncated).</returns>
public static ulong ConvertByteSizeToCount(ulong ulSizeInBytes)
{
    ulong ulItemCount = ulSizeInBytes / m_lBaseSize;
    return ulItemCount;
}
2442
/// <summary>
/// Allocates memory with room for lCapacity items and registers it with the memory tracker.
/// </summary>
/// <remarks>
/// When ghost memory is configured and enabled, no kernel call is made; a zero-filled host array of
/// lCapacity items is stored in the ghost memory table under a newly incremented index instead.
/// </remarks>
/// <param name="lCapacity">Number of items to allocate; must be greater than 0.</param>
/// <param name="bHalfSize">Requests a half-sized allocation (CUDAFN.ALLOCMEM_HALF); only valid with the 'float' base type.</param>
/// <returns>The handle returned by the memory tracker.</returns>
/// <exception cref="ArgumentOutOfRangeException">Thrown when lCapacity is less than or equal to 0.</exception>
/// <exception cref="Exception">Thrown when half size is requested with the 'double' base type, or (as an 'Out of memory!' error wrapping the original exception) when the allocation itself fails.</exception>
public long AllocMemory(long lCapacity, bool bHalfSize = false)
{
    if (lCapacity <= 0)
        throw new ArgumentOutOfRangeException();

    // Validate BEFORE the try block: inside it, the catch-all below would re-report this
    // usage error as a misleading 'Out of memory!' failure.
    if (bHalfSize && m_dt == DataType.DOUBLE)
        throw new Exception("Half sizes are only supported with the 'float' base type.");

    long[] rgIn = new long[] { lCapacity };

    try
    {
        if (m_dt == DataType.DOUBLE)
        {
            double[] rgOut;
            lock (m_memSync)
            {
                if (m_rgGhostMemory == null || !m_bGhostMemoryEnabled)
                {
                    rgOut = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.ALLOCMEM, null, rgIn);
                }
                else
                {
                    // Ghost mode: keep a zero-filled host array keyed by a synthetic handle.
                    m_nGhostMemoryIndex++;
                    m_rgGhostMemory.Add(m_nGhostMemoryIndex, convert(Utility.Create<double>((int)lCapacity, 0).ToArray()));
                    rgOut = new double[] { m_nGhostMemoryIndex };
                }

                return m_memTracker.AllocMemory(m_hKernel, m_nDeviceId, (long)rgOut[0], (ulong)lCapacity, bHalfSize);
            }
        }
        else
        {
            float[] rgOut;
            lock (m_memSync)
            {
                if (m_rgGhostMemory == null || !m_bGhostMemoryEnabled)
                {
                    if (bHalfSize)
                        rgOut = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.ALLOCMEM_HALF, null, rgIn);
                    else
                        rgOut = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.ALLOCMEM, null, rgIn);
                }
                else
                {
                    // Ghost mode: keep a zero-filled host array keyed by a synthetic handle.
                    m_nGhostMemoryIndex++;
                    m_rgGhostMemory.Add(m_nGhostMemoryIndex, convert(Utility.Create<float>((int)lCapacity, 0).ToArray()));
                    rgOut = new float[] { m_nGhostMemoryIndex };
                }

                return m_memTracker.AllocMemory(m_hKernel, m_nDeviceId, (long)rgOut[0], (ulong)lCapacity, bHalfSize);
            }
        }
    }
    catch (Exception excpt)
    {
        string strMemory = m_memTracker.TotalMemoryUsedText;
        string strDevice = GetDeviceName(m_nDeviceId);
        long lMb = (lCapacity * (int)basetype_size(false)) / 1000000;

        throw new Exception("Out of memory! There is not enough memory to allocate the requested " + lMb.ToString("N0") + " MB of memory. You are currently using " + strMemory + " of memory on " + strDevice + ". You may need to use a different GPU that has more memory.", excpt);
    }
}
2512
/// <summary>
/// Frees memory previously allocated with AllocMemory and removes it from the memory tracker.
/// </summary>
/// <remarks>
/// When the low-level connection is null or the kernel handle is not positive, a warning is traced and
/// nothing is freed. When ghost memory is configured and enabled, the entry is removed from the ghost
/// memory table instead of calling the kernel.
/// </remarks>
/// <param name="hMem">Handle of the memory to free.</param>
public void FreeMemory(long hMem)
{
    if (m_cuda == null || m_hKernel <= 0)
    {
        Trace.WriteLine("WARNING: CudaDnn has already been disposed, cannot free memory.");
        return;
    }

    lock (m_memSync)
    {
        // The tracker is updated first, for both base types.
        m_memTracker.FreeMemory(m_hKernel, m_nDeviceId, hMem);

        bool bGhostActive = (m_rgGhostMemory != null && m_bGhostMemoryEnabled);

        if (bGhostActive)
            m_rgGhostMemory.Remove(hMem);
        else if (m_dt == DataType.DOUBLE)
            m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FREEMEM, null, m_param.AsLong(hMem));
        else
            m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FREEMEM, null, m_param.AsLong(hMem));
    }
}
2547
/// <summary>
/// Dispatches CUDAFN.COPY_DEVICE_TO_HOST with the given count and handles.
/// </summary>
/// <param name="lCount">Count forwarded to the kernel.</param>
/// <param name="hGpuSrc">Handle of the GPU source memory.</param>
/// <param name="hHostDst">Handle of the host destination buffer.</param>
public void CopyDeviceToHost(long lCount, long hGpuSrc, long hHostDst)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.COPY_DEVICE_TO_HOST, null, m_param.AsLong(lCount, hGpuSrc, hHostDst));
            break;

        default:
            m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.COPY_DEVICE_TO_HOST, null, m_param.AsLong(lCount, hGpuSrc, hHostDst));
            break;
    }
}
2561
/// <summary>
/// Dispatches CUDAFN.COPY_HOST_TO_DEVICE with the given count and handles.
/// </summary>
/// <param name="lCount">Count forwarded to the kernel.</param>
/// <param name="hHostSrc">Handle of the host source buffer.</param>
/// <param name="hGpuDst">Handle of the GPU destination memory.</param>
public void CopyHostToDevice(long lCount, long hHostSrc, long hGpuDst)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.COPY_HOST_TO_DEVICE, null, m_param.AsLong(lCount, hHostSrc, hGpuDst));
            break;

        default:
            m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.COPY_HOST_TO_DEVICE, null, m_param.AsLong(lCount, hHostSrc, hGpuDst));
            break;
    }
}
2575
/// <summary>
/// Allocates a host buffer via CUDAFN.ALLOCHOSTBUFFER.
/// </summary>
/// <param name="lCapacity">Number of items to allocate; must be greater than 0.</param>
/// <returns>The first element of the kernel's result, cast to long (the buffer handle).</returns>
/// <exception cref="ArgumentOutOfRangeException">Thrown when lCapacity is less than or equal to 0.</exception>
public long AllocHostBuffer(long lCapacity)
{
    // Reject negative capacities as well as zero, matching AllocMemory(long).
    if (lCapacity <= 0)
        throw new ArgumentOutOfRangeException();

    if (m_dt == DataType.DOUBLE)
    {
        double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.ALLOCHOSTBUFFER, null, m_param.AsLong(lCapacity));
        return (long)rg[0];
    }
    else
    {
        float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.ALLOCHOSTBUFFER, null, m_param.AsLong(lCapacity));
        return (long)rg[0];
    }
}
2597
/// <summary>
/// Frees a host buffer via CUDAFN.FREEHOSTBUFFER.
/// </summary>
/// <remarks>
/// When the low-level connection is null or the kernel handle is not positive, a warning is traced and nothing is freed.
/// </remarks>
/// <param name="hMem">Handle of the host buffer to free.</param>
public void FreeHostBuffer(long hMem)
{
    bool bReleased = (m_cuda == null || m_hKernel <= 0);

    if (bReleased)
    {
        Trace.WriteLine("WARNING: CudaDnn has already been disposed, cannot free memory.");
        return;
    }

    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FREEHOSTBUFFER, null, m_param.AsLong(hMem));
            break;

        default:
            m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FREEHOSTBUFFER, null, m_param.AsLong(hMem));
            break;
    }
}
2615
/// <summary>
/// Queries the capacity of a host buffer via CUDAFN.GETHOSTBUFFERCAPACITY.
/// </summary>
/// <param name="hMem">Handle of the host buffer.</param>
/// <returns>The first element of the kernel's result, cast to long.</returns>
public long GetHostBufferCapacity(long hMem)
{
    if (m_dt == DataType.DOUBLE)
    {
        double[] rgCapacityD = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.GETHOSTBUFFERCAPACITY, null, m_param.AsLong(hMem));
        return (long)rgCapacityD[0];
    }

    float[] rgCapacityF = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.GETHOSTBUFFERCAPACITY, null, m_param.AsLong(hMem));
    return (long)rgCapacityF[0];
}
2634
/// <summary>
/// Retrieves the contents of a host buffer and converts them to doubles.
/// </summary>
/// <param name="hMem">Handle of the host buffer.</param>
/// <returns>The buffer contents as an array of doubles.</returns>
public double[] GetHostMemoryDouble(long hMem)
{
    T[] rgHost = GetHostMemory(hMem);
    return convertD(rgHost);
}
2645
/// <summary>
/// Retrieves the contents of a host buffer and converts them to floats.
/// </summary>
/// <param name="hMem">Handle of the host buffer.</param>
/// <returns>The buffer contents as an array of floats.</returns>
public float[] GetHostMemoryFloat(long hMem)
{
    T[] rgHost = GetHostMemory(hMem);
    return convertF(rgHost);
}
2656
/// <summary>
/// Retrieves the contents of a host buffer via CUDAFN.GETHOSTMEM, converted to the base type.
/// </summary>
/// <param name="hMem">Handle of the host buffer.</param>
/// <returns>The buffer contents as an array of the base type.</returns>
public T[] GetHostMemory(long hMem)
{
    if (m_dt == DataType.DOUBLE)
    {
        double[] rgHostD = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.GETHOSTMEM, null, m_param.AsLong(hMem));
        return convert(rgHostD);
    }

    float[] rgHostF = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.GETHOSTMEM, null, m_param.AsLong(hMem));
    return convert(rgHostF);
}
2669
/// <summary>
/// Retrieves memory contents and converts them to doubles.
/// </summary>
/// <param name="hMem">Handle of the memory.</param>
/// <param name="lCount">Count forwarded to GetMemory (default -1).</param>
/// <returns>The memory contents as an array of doubles.</returns>
public double[] GetMemoryDouble(long hMem, long lCount = -1)
{
    T[] rgData = GetMemory(hMem, lCount);
    return convertD(rgData);
}
2681
/// <summary>
/// Retrieves memory contents and converts them to floats.
/// </summary>
/// <param name="hMem">Handle of the memory.</param>
/// <param name="lCount">Count forwarded to GetMemory (default -1).</param>
/// <returns>The memory contents as an array of floats.</returns>
public float[] GetMemoryFloat(long hMem, long lCount = -1)
{
    T[] rgData = GetMemory(hMem, lCount);
    return convertF(rgData);
}
2693
/// <summary>
/// Retrieves memory contents via CUDAFN.GETMEM, or from the ghost memory table when one exists.
/// </summary>
/// <remarks>
/// NOTE(review): unlike AllocMemory and FreeMemory, only the existence of the ghost memory table is
/// checked here - the ghost-memory enabled flag is not consulted. Confirm this is intended.
/// </remarks>
/// <param name="hMem">Handle of the memory.</param>
/// <param name="lCount">Count forwarded to the kernel (default -1); not used on the ghost memory path.</param>
/// <returns>The memory contents as an array of the base type.</returns>
public T[] GetMemory(long hMem, long lCount = -1)
{
    // A ghost memory table, when present, is the only source consulted.
    if (m_rgGhostMemory != null)
        return m_rgGhostMemory[hMem];

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgDataD = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.GETMEM, null, m_param.AsLong(hMem, lCount));
        return convert(rgDataD);
    }

    float[] rgDataF = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.GETMEM, null, m_param.AsLong(hMem, lCount));
    return convert(rgDataF);
}
2727
/// <summary>
/// Copies a list of doubles into existing memory by forwarding to the double-array overload.
/// </summary>
/// <param name="hMem">Handle of the destination memory.</param>
/// <param name="rg">Values to copy.</param>
public void SetMemory(long hMem, List<double> rg)
{
    double[] rgValues = rg.ToArray();
    SetMemory(hMem, rgValues);
}
2738
/// <summary>
/// Copies a list of floats into existing memory by forwarding to the float-array overload.
/// </summary>
/// <param name="hMem">Handle of the destination memory.</param>
/// <param name="rg">Values to copy.</param>
public void SetMemory(long hMem, List<float> rg)
{
    float[] rgValues = rg.ToArray();
    SetMemory(hMem, rgValues);
}
2749
/// <summary>
/// Copies an array of doubles into existing memory, converting it to the base type first.
/// </summary>
/// <param name="hMem">Handle of the destination memory.</param>
/// <param name="rgSrc">Values to copy.</param>
/// <param name="hStream">Stream handle forwarded to the base-type overload (default 0).</param>
public void SetMemory(long hMem, double[] rgSrc, long hStream = 0)
{
    T[] rgConverted = convert(rgSrc);
    SetMemory(hMem, rgConverted, hStream);
}
2761
/// <summary>
/// Copies an array of floats into existing memory, converting it to the base type first.
/// </summary>
/// <param name="hMem">Handle of the destination memory.</param>
/// <param name="rgSrc">Values to copy.</param>
/// <param name="hStream">Stream handle forwarded to the base-type overload (default 0).</param>
public void SetMemory(long hMem, float[] rgSrc, long hStream = 0)
{
    T[] rgConverted = convert(rgSrc);
    SetMemory(hMem, rgConverted, hStream);
}
2773
/// <summary>
/// Copies base-type values into existing memory via CUDAFN.SETMEM, or into the ghost memory table when one exists.
/// </summary>
/// <remarks>
/// Nothing is copied when the kernel handle is not positive. On the ghost memory path a clone of the whole
/// source array is stored and nCount is not applied.
/// NOTE(review): hStream is placed in the floating-point argument array only, not in the long argument
/// array - confirm against the native side.
/// </remarks>
/// <param name="hMem">Handle of the destination memory.</param>
/// <param name="rgSrc">Values to copy; must be non-null.</param>
/// <param name="hStream">Stream handle; only forwarded when greater than 0.</param>
/// <param name="nCount">Number of items to copy; -1 selects the full length of rgSrc.</param>
/// <exception cref="ArgumentOutOfRangeException">Thrown when rgSrc is null or the resulting count is 0.</exception>
public void SetMemory(long hMem, T[] rgSrc, long hStream = 0, int nCount = -1)
{
    // The null check must come first: resolving the default count reads rgSrc.Length.
    if (rgSrc == null)
        throw new ArgumentOutOfRangeException("There are no data items to set!");

    if (nCount == -1)
        nCount = rgSrc.Length;

    if (nCount == 0)
        throw new ArgumentOutOfRangeException("There are no data items to set!");

    if (m_hKernel > 0)
    {
        if (m_rgGhostMemory != null)
        {
            m_rgGhostMemory[hMem] = Utility.Clone<T>(rgSrc);
        }
        else
        {
            if (m_dt == DataType.DOUBLE)
            {
                // Layout: [hMem, nCount, (hStream)?, data...]
                int nDataCount = 2;

                if (hStream > 0)
                    nDataCount++;

                nDataCount += nCount;

                double[] rg = new double[nDataCount];

                rg[0] = hMem;
                rg[1] = nCount;
                int nIdx = 2;

                if (hStream > 0)
                {
                    rg[nIdx] = hStream;
                    nIdx++;
                }

                long[] rgIn = new long[] { hMem, nCount };

                convertD(rgSrc, rg, nIdx, nCount);
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SETMEM, rg, rgIn);
            }
            else
            {
                // Layout: [hMem, nCount, (hStream)?, data...]
                int nDataCount = 2;

                if (hStream > 0)
                    nDataCount++;

                nDataCount += nCount;

                float[] rg = new float[nDataCount];

                rg[0] = hMem;
                rg[1] = nCount;
                int nIdx = 2;

                if (hStream > 0)
                {
                    rg[nIdx] = hStream;
                    nIdx++;
                }

                long[] rgIn = new long[] { hMem, nCount };

                convertF(rgSrc, rg, nIdx, nCount);
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SETMEM, rg, rgIn);
            }
        }
    }
}
2852
/// <summary>
/// Copies an array of doubles into existing memory at an offset, converting it to the base type first.
/// </summary>
/// <param name="hMem">Handle of the destination memory.</param>
/// <param name="rgSrc">Values to copy.</param>
/// <param name="nOffset">Offset forwarded to the base-type overload.</param>
public void SetMemoryAt(long hMem, double[] rgSrc, int nOffset)
{
    T[] rgConverted = convert(rgSrc);
    SetMemoryAt(hMem, rgConverted, nOffset);
}
2864
/// <summary>
/// Copies an array of floats into existing memory at an offset, converting it to the base type first.
/// </summary>
/// <param name="hMem">Handle of the destination memory.</param>
/// <param name="rgSrc">Values to copy.</param>
/// <param name="nOffset">Offset forwarded to the base-type overload.</param>
public void SetMemoryAt(long hMem, float[] rgSrc, int nOffset)
{
    T[] rgConverted = convert(rgSrc);
    SetMemoryAt(hMem, rgConverted, nOffset);
}
2876
/// <summary>
/// Copies base-type values into existing memory at an offset via CUDAFN.SETMEMAT.
/// </summary>
/// <remarks>
/// Nothing is copied when the kernel handle is not positive. Ghost memory is not supported by this call.
/// </remarks>
/// <param name="hMem">Handle of the destination memory.</param>
/// <param name="rgSrc">Values to copy; must be non-null and non-empty.</param>
/// <param name="nOffset">Offset forwarded to the kernel.</param>
/// <exception cref="ArgumentOutOfRangeException">Thrown when rgSrc is null or empty.</exception>
/// <exception cref="Exception">Thrown when a ghost memory table exists.</exception>
public void SetMemoryAt(long hMem, T[] rgSrc, int nOffset)
{
    if (rgSrc == null || rgSrc.Length == 0)
        throw new ArgumentOutOfRangeException("There are no data items to set!");

    if (m_hKernel <= 0)
        return;

    if (m_rgGhostMemory != null)
        throw new Exception("Ghost memory does not support SetMemoryAt.");

    // Both argument arrays start with the same three-entry header: [hMem, length, offset].
    const int nHeader = 3;
    int nItems = rgSrc.Length;
    long[] rgIn = new long[] { hMem, nItems, nOffset };

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgArgD = new double[nHeader + nItems];
        rgArgD[0] = hMem;
        rgArgD[1] = nItems;
        rgArgD[2] = nOffset;

        convertD(rgSrc, rgArgD, 3);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SETMEMAT, rgArgD, rgIn);
    }
    else
    {
        float[] rgArgF = new float[nHeader + nItems];
        rgArgF[0] = hMem;
        rgArgF[1] = nItems;
        rgArgF[2] = nOffset;

        convertF(rgSrc, rgArgF, 3);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SETMEMAT, rgArgF, rgIn);
    }
}
2923
/// <summary>
/// Sets one or more (index, value) pixels in memory via CUDAFN.SETPIXEL.
/// </summary>
/// <remarks>
/// The argument array is laid out as [hMem, nCount, bReturnOriginal (1/0), nOffset, pixel count,
/// index0, value0, index1, value1, ...].
/// </remarks>
/// <param name="hMem">Handle of the memory.</param>
/// <param name="nCount">Count forwarded to the kernel.</param>
/// <param name="bReturnOriginal">Forwarded to the kernel as 1 (true) or 0 (false).</param>
/// <param name="nOffset">Offset forwarded to the kernel.</param>
/// <param name="rgPixel">The (index, value) pairs to set; at least one is required.</param>
/// <returns>The kernel's result converted to the base type, or null when the kernel returns null.</returns>
/// <exception cref="Exception">Thrown when no pixels are specified.</exception>
public T[] SetPixel(long hMem, int nCount, bool bReturnOriginal, int nOffset, params Tuple<int, T>[] rgPixel)
{
    if (rgPixel.Length == 0)
        throw new Exception("You must specify at least one pixel!");

    const int nHeader = 5;
    int nPixels = rgPixel.Length;

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgArgD = new double[nHeader + nPixels * 2];
        rgArgD[0] = hMem;
        rgArgD[1] = nCount;
        rgArgD[2] = (bReturnOriginal) ? 1 : 0;
        rgArgD[3] = nOffset;
        rgArgD[4] = nPixels;

        for (int i = 0; i < nPixels; i++)
        {
            int nAt = nHeader + i * 2;
            rgArgD[nAt] = rgPixel[i].Item1;
            rgArgD[nAt + 1] = convertD1(rgPixel[i].Item2);
        }

        double[] rgResultD = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.SETPIXEL, rgArgD);
        return (rgResultD == null) ? null : convert(rgResultD);
    }

    float[] rgArgF = new float[nHeader + nPixels * 2];
    rgArgF[0] = hMem;
    rgArgF[1] = nCount;
    rgArgF[2] = (bReturnOriginal) ? 1 : 0;
    rgArgF[3] = nOffset;
    rgArgF[4] = nPixels;

    for (int i = 0; i < nPixels; i++)
    {
        int nAt = nHeader + i * 2;
        rgArgF[nAt] = rgPixel[i].Item1;
        rgArgF[nAt + 1] = convertF1(rgPixel[i].Item2);
    }

    float[] rgResultF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.SETPIXEL, rgArgF);
    return (rgResultF == null) ? null : convert(rgResultF);
}
2989
/// <summary>
/// Copies base-type values into a host buffer via CUDAFN.SETHOSTMEM.
/// </summary>
/// <remarks>
/// The floating-point argument array is laid out as [hMem, length, data...]; the long argument array carries [hMem, length].
/// </remarks>
/// <param name="hMem">Handle of the host buffer.</param>
/// <param name="rgSrc">Values to copy.</param>
public void SetHostMemory(long hMem, T[] rgSrc)
{
    const int nHeader = 2;

    switch (m_dt)
    {
        case DataType.DOUBLE:
            double[] rgArgD = new double[nHeader + rgSrc.Length];
            rgArgD[0] = hMem;
            rgArgD[1] = rgSrc.Length;

            convertD(rgSrc, rgArgD, 2);
            m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SETHOSTMEM, rgArgD, m_param.AsLong(hMem, rgSrc.Length));
            break;

        default:
            float[] rgArgF = new float[nHeader + rgSrc.Length];
            rgArgF[0] = hMem;
            rgArgF[1] = rgSrc.Length;

            convertF(rgSrc, rgArgF, 2);
            m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SETHOSTMEM, rgArgF, m_param.AsLong(hMem, rgSrc.Length));
            break;
    }
}
3020
/// <summary>
/// Creates a memory pointer into existing memory via CUDAFN.CREATE_MEMORYPOINTER.
/// </summary>
/// <param name="hData">Handle of the existing memory.</param>
/// <param name="lOffset">Offset forwarded to the kernel.</param>
/// <param name="lCount">Count forwarded to the kernel.</param>
/// <returns>The first element of the kernel's result, cast to long (the pointer handle).</returns>
public long CreateMemoryPointer(long hData, long lOffset, long lCount)
{
    if (m_dt == DataType.DOUBLE)
    {
        double[] rgPtrD = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CREATE_MEMORYPOINTER, null, m_param.AsLong(hData, lOffset, lCount));
        return (long)rgPtrD[0];
    }

    float[] rgPtrF = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CREATE_MEMORYPOINTER, null, m_param.AsLong(hData, lOffset, lCount));
    return (long)rgPtrF[0];
}
3041
/// <summary>
/// Frees a memory pointer via CUDAFN.FREE_MEMORYPOINTER.
/// </summary>
/// <remarks>
/// When the low-level connection is null or the kernel handle is not positive, a warning is traced and nothing is freed.
/// </remarks>
/// <param name="hData">Handle of the memory pointer to free.</param>
public void FreeMemoryPointer(long hData)
{
    bool bReleased = (m_cuda == null || m_hKernel <= 0);

    if (bReleased)
    {
        Trace.WriteLine("WARNING: CudaDnn has already been disposed, cannot free memory pointer.");
        return;
    }

    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FREE_MEMORYPOINTER, null, m_param.AsLong(hData));
            break;

        default:
            m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FREE_MEMORYPOINTER, null, m_param.AsLong(hData));
            break;
    }
}
3059
/// <summary>
/// Creates a memory test via CUDAFN.CREATE_MEMTEST and unpacks the five result values.
/// </summary>
/// <param name="ulTotalNumBlocks">Receives the second result element, cast to ulong.</param>
/// <param name="dfMemAllocatedInGB">Receives the third result element.</param>
/// <param name="ulMemStartAddr">Receives the fourth result element, cast to ulong.</param>
/// <param name="ulBlockSize">Receives the fifth result element, cast to ulong.</param>
/// <param name="dfPctToAllocate">Percentage forwarded to the kernel (default 1.0).</param>
/// <returns>The first result element, cast to long (the memory test handle).</returns>
public long CreateMemoryTest(out ulong ulTotalNumBlocks, out double dfMemAllocatedInGB, out ulong ulMemStartAddr, out ulong ulBlockSize, double dfPctToAllocate = 1.0)
{
    long hTest;

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgTestD = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_MEMTEST, m_param.AsDouble(dfPctToAllocate));
        hTest = (long)rgTestD[0];
        ulTotalNumBlocks = (ulong)rgTestD[1];
        dfMemAllocatedInGB = (double)rgTestD[2];
        ulMemStartAddr = (ulong)rgTestD[3];
        ulBlockSize = (ulong)rgTestD[4];
    }
    else
    {
        float[] rgTestF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_MEMTEST, m_param.AsFloat((float)dfPctToAllocate));
        hTest = (long)rgTestF[0];
        ulTotalNumBlocks = (ulong)rgTestF[1];
        dfMemAllocatedInGB = (double)rgTestF[2];
        ulMemStartAddr = (ulong)rgTestF[3];
        ulBlockSize = (ulong)rgTestF[4];
    }

    return hTest;
}
3090
/// <summary>
/// Frees a memory test via CUDAFN.FREE_MEMTEST.
/// </summary>
/// <param name="h">Handle of the memory test.</param>
public void FreeMemoryTest(long h)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_MEMTEST, m_param.AsDouble(h));
            break;

        default:
            m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_MEMTEST, m_param.AsFloat(h));
            break;
    }
}
3102
/// <summary>
/// Runs a memory test via CUDAFN.RUN_MEMTEST and returns the kernel's raw result.
/// </summary>
/// <param name="h">Handle of the memory test.</param>
/// <param name="type">Type of memory test, forwarded as a long.</param>
/// <param name="ulBlockStartOffset">Block start offset, forwarded as a long.</param>
/// <param name="ulBlockCount">Block count, forwarded as a long.</param>
/// <param name="bVerbose">Forwarded as 1 (true) or 0 (false).</param>
/// <param name="bWrite">Forwarded as 1 (true) or 0 (false).</param>
/// <param name="bReadWrite">Forwarded as 1 (true) or 0 (false).</param>
/// <param name="bRead">Forwarded as 1 (true) or 0 (false).</param>
/// <returns>The kernel's result array, re-typed to the base type with Convert.ChangeType.</returns>
public T[] RunMemoryTest(long h, MEMTEST_TYPE type, ulong ulBlockStartOffset, ulong ulBlockCount, bool bVerbose, bool bWrite, bool bReadWrite, bool bRead)
{
    if (m_dt == DataType.DOUBLE)
    {
        double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.RUN_MEMTEST, null, m_param.AsLong(h, (long)type, (long)ulBlockStartOffset, (long)ulBlockCount, (bVerbose) ? 1 : 0, (bWrite) ? 1 : 0, (bReadWrite) ? 1 : 0, (bRead) ? 1 : 0));
        return (T[])Convert.ChangeType(rg, typeof(T[]));
    }
    else
    {
        float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.RUN_MEMTEST, null, m_param.AsLong(h, (long)type, (long)ulBlockStartOffset, (long)ulBlockCount, (bVerbose) ? 1 : 0, (bWrite) ? 1 : 0, (bReadWrite) ? 1 : 0, (bRead) ? 1 : 0));
        return (T[])Convert.ChangeType(rg, typeof(T[]));
    }
}
3138
/// <summary>
/// Creates an image operation via CUDAFN.CREATE_IMAGEOP, forwarding all settings unchanged.
/// </summary>
/// <param name="nNum">Number value forwarded to the kernel.</param>
/// <param name="dfBrightnessProb">Brightness probability.</param>
/// <param name="dfBrightnessDelta">Brightness delta.</param>
/// <param name="dfContrastProb">Contrast probability.</param>
/// <param name="dfContrastLower">Contrast lower bound.</param>
/// <param name="dfContrastUpper">Contrast upper bound.</param>
/// <param name="dfSaturationProb">Saturation probability.</param>
/// <param name="dfSaturationLower">Saturation lower bound.</param>
/// <param name="dfSaturationUpper">Saturation upper bound.</param>
/// <param name="lRandomSeed">Random seed forwarded to the kernel (default 0).</param>
/// <returns>The first element of the kernel's result, cast to long (the image operation handle).</returns>
public long CreateImageOp(int nNum, double dfBrightnessProb, double dfBrightnessDelta, double dfContrastProb, double dfContrastLower, double dfContrastUpper, double dfSaturationProb, double dfSaturationLower, double dfSaturationUpper, long lRandomSeed = 0)
{
    if (m_dt == DataType.DOUBLE)
    {
        double[] rgOpD = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_IMAGEOP, m_param.AsDouble(nNum, dfBrightnessProb, dfBrightnessDelta, dfContrastProb, dfContrastLower, dfContrastUpper, dfSaturationProb, dfSaturationLower, dfSaturationUpper, lRandomSeed));
        return (long)rgOpD[0];
    }

    float[] rgOpF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_IMAGEOP, m_param.AsFloat(nNum, (float)dfBrightnessProb, (float)dfBrightnessDelta, (float)dfContrastProb, (float)dfContrastLower, (float)dfContrastUpper, (float)dfSaturationProb, (float)dfSaturationLower, (float)dfSaturationUpper, lRandomSeed));
    return (long)rgOpF[0];
}
3166
/// <summary>
/// Issues the FREE_IMAGEOP command for the given image-op handle.
/// </summary>
/// <param name="h">Handle of the image op; passed as the only argument of the command.</param>
public void FreeImageOp(long h)
{
    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_IMAGEOP, m_param.AsFloat(h));
        return;
    }

    m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_IMAGEOP, m_param.AsDouble(h));
}
3178
/// <summary>
/// Issues the DISTORTIMAGE_IMAGEOP command using the given image-op handle.
/// </summary>
/// <param name="h">Handle of the image op.</param>
/// <param name="nCount">Forwarded unchanged as the count argument.</param>
/// <param name="nNum">Forwarded unchanged as the num argument.</param>
/// <param name="nDim">Forwarded unchanged as the dim argument.</param>
/// <param name="hX">Handle forwarded as the X argument.</param>
/// <param name="hY">Handle forwarded as the Y argument.</param>
public void DistortImage(long h, int nCount, int nNum, int nDim, long hX, long hY)
{
    // Both precisions send exactly the same long-argument array.
    long[] rgArg = m_param.AsLong(h, nCount, nNum, nDim, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.DISTORTIMAGE_IMAGEOP, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.DISTORTIMAGE_IMAGEOP, null, rgArg);
    }
}
3195
3196 #endregion
3197
3198 //---------------------------------------------------------------------
3199 // ICudaDnn Methods
3200 //---------------------------------------------------------------------
3201 #region ICudaDnn Methods
3202
/// <summary>
/// Issues the CREATE_STREAM command and returns the first result element, cast to long, as the handle.
/// </summary>
/// <param name="bNonBlocking">Sent as 1 when true, 0 otherwise (defaults to false).</param>
/// <param name="nIndex">Index forwarded as the second argument (defaults to -1).</param>
/// <returns>The first result element cast to long.</returns>
public long CreateStream(bool bNonBlocking = false, int nIndex = -1)
{
    int nNonBlocking = (bNonBlocking) ? 1 : 0;

    return (m_dt == DataType.DOUBLE)
        ? (long)m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_STREAM, m_param.AsDouble(nNonBlocking, nIndex))[0]
        : (long)m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_STREAM, m_param.AsFloat(nNonBlocking, nIndex))[0];
}
3222
/// <summary>
/// Issues the FREE_STREAM command for the given stream handle.
/// </summary>
/// <param name="h">Handle of the stream; passed as the only argument of the command.</param>
public void FreeStream(long h)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_STREAM, m_param.AsDouble(h));
            break;

        default:
            m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_STREAM, m_param.AsFloat(h));
            break;
    }
}
3234
/// <summary>
/// Issues the SYNCRHONIZE_STREAM command for the given stream handle.
/// </summary>
/// <param name="h">Handle of the stream (defaults to 0).</param>
public void SynchronizeStream(long h = 0)
{
    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.SYNCRHONIZE_STREAM, m_param.AsFloat(h));
        return;
    }

    m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.SYNCRHONIZE_STREAM, m_param.AsDouble(h));
}
3246
/// <summary>
/// Issues the SYNCHRONIZE_THREAD command; no arguments are sent.
/// </summary>
public void SynchronizeThread()
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.SYNCHRONIZE_THREAD, null);
            break;

        default:
            m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.SYNCHRONIZE_THREAD, null);
            break;
    }
}
3257
/// <summary>
/// Issues the CREATE_CUDNN command and returns the first result element, cast to long, as the handle.
/// </summary>
/// <param name="hStream">Stream handle forwarded as the only argument (defaults to 0).</param>
/// <returns>The first result element cast to long.</returns>
public long CreateCuDNN(long hStream = 0)
{
    if (m_dt != DataType.DOUBLE)
    {
        float[] rgResultF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_CUDNN, m_param.AsFloat(hStream));
        return (long)rgResultF[0];
    }

    double[] rgResultD = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_CUDNN, m_param.AsDouble(hStream));
    return (long)rgResultD[0];
}
3276
/// <summary>
/// Issues the FREE_CUDNN command for the given cuDNN handle.
/// </summary>
/// <param name="h">Handle of the cuDNN instance; passed as the only argument of the command.</param>
public void FreeCuDNN(long h)
{
    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_CUDNN, m_param.AsFloat(h));
        return;
    }

    m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_CUDNN, m_param.AsDouble(h));
}
3288
/// <summary>
/// Issues the CREATE_NCCL command and returns the first result element, cast to long, as the handle.
/// </summary>
/// <remarks>
/// The argument array is laid out as: device id, count, rank, number of GUID segments, followed by the
/// GUID segments themselves.
/// </remarks>
/// <param name="nDeviceId">Device id sent as the first argument.</param>
/// <param name="nCount">Count sent as the second argument.</param>
/// <param name="nRank">Rank sent as the third argument.</param>
/// <param name="guid">GUID whose dash-separated segments are appended numerically.</param>
/// <returns>The first result element cast to long.</returns>
public long CreateNCCL(int nDeviceId, int nCount, int nRank, Guid guid)
{
    if (m_dt != DataType.DOUBLE)
    {
        List<float> rgGuidF = guidToArrayFloat(guid);
        List<float> rgArgF = new List<float>(4 + rgGuidF.Count);

        rgArgF.Add(nDeviceId);
        rgArgF.Add(nCount);
        rgArgF.Add(nRank);
        rgArgF.Add(rgGuidF.Count);
        rgArgF.AddRange(rgGuidF);

        return (long)m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_NCCL, rgArgF.ToArray())[0];
    }

    List<double> rgGuidD = guidToArrayDouble(guid);
    List<double> rgArgD = new List<double>(4 + rgGuidD.Count);

    rgArgD.Add(nDeviceId);
    rgArgD.Add(nCount);
    rgArgD.Add(nRank);
    rgArgD.Add(rgGuidD.Count);
    rgArgD.AddRange(rgGuidD);

    return (long)m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_NCCL, rgArgD.ToArray())[0];
}
3322
/// <summary>
/// Splits the default string form of a GUID on '-' and parses each segment as a hexadecimal
/// 64-bit integer, returning the values as doubles in segment order.
/// </summary>
/// <param name="guid">GUID to convert.</param>
/// <returns>One double per dash-separated segment of the GUID string.</returns>
private List<double> guidToArrayDouble(Guid guid)
{
    string[] rgstrSegment = guid.ToString().Split('-');
    List<double> rgdfSegment = new List<double>(rgstrSegment.Length);

    for (int i = 0; i < rgstrSegment.Length; i++)
    {
        // Base 16: each segment is a run of hex digits.
        rgdfSegment.Add(Convert.ToInt64(rgstrSegment[i], 16));
    }

    return rgdfSegment;
}
3337
/// <summary>
/// Produces the same per-segment values as <c>guidToArrayDouble</c>, each narrowed to float.
/// </summary>
/// <remarks>
/// NOTE(review): the narrowing cast to float can round large segment values, so distinct GUIDs may map
/// to the same float sequence -- confirm this is acceptable for the consumer.
/// </remarks>
/// <param name="guid">GUID to convert.</param>
/// <returns>One float per dash-separated segment of the GUID string.</returns>
private List<float> guidToArrayFloat(Guid guid)
{
    List<double> rgdfSegment = guidToArrayDouble(guid);
    List<float> rgfSegment = new List<float>(rgdfSegment.Count);

    for (int i = 0; i < rgdfSegment.Count; i++)
    {
        rgfSegment.Add((float)rgdfSegment[i]);
    }

    return rgfSegment;
}
3350
/// <summary>
/// Issues the FREE_NCCL command for the given NCCL handle.
/// </summary>
/// <param name="hNccl">Handle of the NCCL instance; passed as the only argument of the command.</param>
public void FreeNCCL(long hNccl)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_NCCL, m_param.AsDouble(hNccl));
            break;

        default:
            m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_NCCL, m_param.AsFloat(hNccl));
            break;
    }
}
3362
/// <summary>
/// Issues the NCCL_INIT_SINGLEPROCESS command for a set of NCCL handles.
/// </summary>
/// <remarks>
/// The argument array is laid out as: a leading 0, the number of handles, then each handle in order.
/// </remarks>
/// <param name="rghNccl">NCCL handles to initialize together.</param>
public void NcclInitializeSingleProcess(params long[] rghNccl)
{
    if (m_dt != DataType.DOUBLE)
    {
        List<float> rgArgF = new List<float>() { 0, rghNccl.Length };

        foreach (long hNccl in rghNccl)
        {
            rgArgF.Add(hNccl);
        }

        m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.NCCL_INIT_SINGLEPROCESS, rgArgF.ToArray());
        return;
    }

    List<double> rgArgD = new List<double>() { 0, rghNccl.Length };

    foreach (long hNccl in rghNccl)
    {
        rgArgD.Add(hNccl);
    }

    m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.NCCL_INIT_SINGLEPROCESS, rgArgD.ToArray());
}
3395
/// <summary>
/// Issues the NCCL_INIT_MULTIPROCESS command for the given NCCL handle.
/// </summary>
/// <param name="hNccl">Handle of the NCCL instance; passed as the only argument of the command.</param>
public void NcclInitializeMultiProcess(long hNccl)
{
    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.NCCL_INIT_MULTIPROCESS, m_param.AsFloat(hNccl));
        return;
    }

    m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.NCCL_INIT_MULTIPROCESS, m_param.AsDouble(hNccl));
}
3410
/// <summary>
/// Writes a trace line with the current device id, then issues the NCCL_BROADCAST command.
/// </summary>
/// <param name="hNccl">Handle of the NCCL instance.</param>
/// <param name="hStream">Handle of the stream.</param>
/// <param name="hX">Handle of the data forwarded as the X argument.</param>
/// <param name="nCount">Count forwarded as the last argument.</param>
public void NcclBroadcast(long hNccl, long hStream, long hX, int nCount)
{
    Trace.WriteLine("Broadcasting from device ID " + GetDeviceID().ToString());

    // Both precisions send exactly the same long-argument array.
    long[] rgArg = m_param.AsLong(hNccl, hStream, hX, nCount);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.NCCL_BROADCAST, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.NCCL_BROADCAST, null, rgArg);
    }
}
3429
/// <summary>
/// Issues the NCCL_ALLREDUCE command with the given reduction operation and scale.
/// </summary>
/// <param name="hNccl">Handle of the NCCL instance.</param>
/// <param name="hStream">Handle of the stream.</param>
/// <param name="hX">Handle of the data forwarded as the X argument.</param>
/// <param name="nCount">Count forwarded in the long-argument array.</param>
/// <param name="op">Reduction operation; sent as its integral value.</param>
/// <param name="dfScale">Scale sent as the single floating point argument (defaults to 1.0).</param>
public void NcclAllReduce(long hNccl, long hStream, long hX, int nCount, NCCL_REDUCTION_OP op, double dfScale = 1.0)
{
    if (m_dt != DataType.DOUBLE)
    {
        // The trailing 0 in the long array is a literal placeholder in the original call.
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.NCCL_ALLREDUCE, m_param.AsFloat((float)dfScale), m_param.AsLong(hNccl, hStream, hX, nCount, (int)op, 0));
        return;
    }

    m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.NCCL_ALLREDUCE, m_param.AsDouble(dfScale), m_param.AsLong(hNccl, hStream, hX, nCount, (int)op, 0));
}
3449
3450
/// <summary>
/// Issues the CREATE_EXTENSION command with the given DLL path and returns the first result element,
/// cast to long, as the handle.
/// </summary>
/// <param name="strExtensionDllPath">Path string forwarded to the command.</param>
/// <returns>The first result element cast to long.</returns>
public long CreateExtension(string strExtensionDllPath)
{
    return (m_dt == DataType.DOUBLE)
        ? (long)m_cuda.RunDoubleEx((int)m_hKernel, (int)CUDAFN.CREATE_EXTENSION, null, strExtensionDllPath)[0]
        : (long)m_cuda.RunFloatEx((int)m_hKernel, (int)CUDAFN.CREATE_EXTENSION, null, strExtensionDllPath)[0];
}
3469
/// <summary>
/// Issues the FREE_EXTENSION command for the given extension handle.
/// </summary>
/// <param name="hExtension">Handle of the extension; passed as the only argument of the command.</param>
public void FreeExtension(long hExtension)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_EXTENSION, m_param.AsDouble(hExtension));
            break;

        default:
            m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_EXTENSION, m_param.AsFloat(hExtension));
            break;
    }
}
3481
/// <summary>
/// Issues the EXTENSION_RUN command for a function of the given extension and returns its result.
/// </summary>
/// <remarks>
/// The argument array is laid out as: extension handle, function index, then the optional parameters.
/// </remarks>
/// <param name="hExtension">Handle of the extension.</param>
/// <param name="lfnIdx">Index of the extension function to run.</param>
/// <param name="rgParam">Optional parameters appended to the argument array; may be null.</param>
/// <returns>The result array of the call converted to T[].</returns>
public T[] RunExtension(long hExtension, long lfnIdx, T[] rgParam)
{
    bool bHasParam = (rgParam != null);

    if (m_dt != DataType.DOUBLE)
    {
        List<float> rgArgF = new List<float>();
        rgArgF.Add(hExtension);
        rgArgF.Add(lfnIdx);

        if (bHasParam)
        {
            rgArgF.AddRange(Utility.ConvertVecF<T>(rgParam));
        }

        float[] rgResultF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.EXTENSION_RUN, rgArgF.ToArray());
        return Utility.ConvertVec<T>(rgResultF);
    }

    List<double> rgArgD = new List<double>();
    rgArgD.Add(hExtension);
    rgArgD.Add(lfnIdx);

    if (bHasParam)
    {
        rgArgD.AddRange(Utility.ConvertVec<T>(rgParam));
    }

    double[] rgResultD = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.EXTENSION_RUN, rgArgD.ToArray());
    return Utility.ConvertVec<T>(rgResultD);
}
3512
3513
/// <summary>
/// Issues the CREATE_TENSORDESC command (no arguments) and returns the first result element, cast to
/// long, as the handle.
/// </summary>
/// <returns>The first result element cast to long.</returns>
public long CreateTensorDesc()
{
    if (m_dt != DataType.DOUBLE)
    {
        float[] rgResultF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_TENSORDESC, null);
        return (long)rgResultF[0];
    }

    double[] rgResultD = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_TENSORDESC, null);
    return (long)rgResultD[0];
}
3531
/// <summary>
/// Issues the FREE_TENSORDESC command for the given tensor descriptor handle.
/// </summary>
/// <param name="h">Handle of the tensor descriptor; passed as the only argument of the command.</param>
public void FreeTensorDesc(long h)
{
    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_TENSORDESC, m_param.AsFloat(h));
        return;
    }

    m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_TENSORDESC, m_param.AsDouble(h));
}
3543
/// <summary>
/// Issues the SET_TENSORNDDESC command for an N-dimensional tensor descriptor.
/// </summary>
/// <remarks>
/// The long-argument array is laid out as: descriptor handle, half flag (1 or 0), number of
/// dimensions, all dimension values, then all stride values.
/// </remarks>
/// <param name="hHandle">Handle of the tensor descriptor.</param>
/// <param name="rgDim">Dimension values.</param>
/// <param name="rgStride">Stride values; must have the same length as <paramref name="rgDim"/>.</param>
/// <param name="bHalf">Sent as 1 when true, 0 otherwise (defaults to false).</param>
/// <exception cref="Exception">Thrown when the dim and stride arrays differ in length.</exception>
public void SetTensorNdDesc(long hHandle, int[] rgDim, int[] rgStride, bool bHalf = false)
{
    if (rgDim.Length != rgStride.Length)
        throw new Exception("The stride and dim arrays must have the same length.");

    // The argument list does not depend on the base type, so it is assembled once.
    List<long> rgArg = new List<long>(3 + rgDim.Length + rgStride.Length);
    rgArg.Add(hHandle);
    rgArg.Add((bHalf) ? 1 : 0);
    rgArg.Add(rgDim.Length);

    foreach (int nDim in rgDim)
    {
        rgArg.Add(nDim);
    }

    foreach (int nStride in rgStride)
    {
        rgArg.Add(nStride);
    }

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_TENSORNDDESC, null, rgArg.ToArray());
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_TENSORNDDESC, null, rgArg.ToArray());
    }
}
3589
/// <summary>
/// Issues the SET_TENSORDESC command with the four extents n, c, h and w.
/// </summary>
/// <param name="hHandle">Handle of the tensor descriptor.</param>
/// <param name="n">First extent.</param>
/// <param name="c">Second extent.</param>
/// <param name="h">Third extent.</param>
/// <param name="w">Fourth extent.</param>
/// <param name="bHalf">Sent as 1 when true, 0 otherwise (defaults to false).</param>
public void SetTensorDesc(long hHandle, int n, int c, int h, int w, bool bHalf = false)
{
    // Both precisions send exactly the same long-argument array.
    long[] rgArg = m_param.AsLong(hHandle, (bHalf) ? 1 : 0, n, c, h, w);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_TENSORDESC, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_TENSORDESC, null, rgArg);
    }
}
3606
/// <summary>
/// Issues the SET_TENSORDESC command with the four extents n, c, h, w followed by their four strides.
/// </summary>
/// <param name="hHandle">Handle of the tensor descriptor.</param>
/// <param name="n">First extent.</param>
/// <param name="c">Second extent.</param>
/// <param name="h">Third extent.</param>
/// <param name="w">Fourth extent.</param>
/// <param name="nStride">Stride sent after the extents, first position.</param>
/// <param name="cStride">Stride sent after the extents, second position.</param>
/// <param name="hStride">Stride sent after the extents, third position.</param>
/// <param name="wStride">Stride sent after the extents, fourth position.</param>
/// <param name="bHalf">Sent as 1 when true, 0 otherwise (defaults to false).</param>
public void SetTensorDesc(long hHandle, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride, bool bHalf = false)
{
    // Both precisions send exactly the same long-argument array.
    long[] rgArg = m_param.AsLong(hHandle, (bHalf) ? 1 : 0, n, c, h, w, nStride, cStride, hStride, wStride);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_TENSORDESC, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_TENSORDESC, null, rgArg);
    }
}
3627
/// <summary>
/// Convenience overload of AddTensor that uses <c>m_tOne</c> for both the alpha and beta scalars.
/// </summary>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="hSrcDesc">Handle of the source tensor descriptor.</param>
/// <param name="hSrc">Handle of the source data.</param>
/// <param name="nSrcOffset">Offset forwarded for the source data.</param>
/// <param name="hDstDesc">Handle of the destination tensor descriptor.</param>
/// <param name="hDst">Handle of the destination data.</param>
/// <param name="nDstOffset">Offset forwarded for the destination data.</param>
public void AddTensor(long hCuDnn, long hSrcDesc, long hSrc, int nSrcOffset, long hDstDesc, long hDst, int nDstOffset)
{
    T fAlpha = m_tOne;
    T fBeta = m_tOne;

    AddTensor(hCuDnn, fAlpha, hSrcDesc, hSrc, nSrcOffset, fBeta, hDstDesc, hDst, nDstOffset);
}
3642
/// <summary>
/// Issues the ADD_TENSOR command with explicit alpha and beta scalars.
/// </summary>
/// <remarks>
/// The scalars travel in the floating point array; the long array carries literal 0 entries at the
/// positions where alpha and beta appear in this method's signature.
/// </remarks>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="fAlpha">Alpha scalar.</param>
/// <param name="hSrcDesc">Handle of the source tensor descriptor.</param>
/// <param name="hSrc">Handle of the source data.</param>
/// <param name="nSrcOffset">Offset forwarded for the source data.</param>
/// <param name="fBeta">Beta scalar.</param>
/// <param name="hDstDesc">Handle of the destination tensor descriptor.</param>
/// <param name="hDst">Handle of the destination data.</param>
/// <param name="nDstOffset">Offset forwarded for the destination data.</param>
public void AddTensor(long hCuDnn, T fAlpha, long hSrcDesc, long hSrc, int nSrcOffset, T fBeta, long hDstDesc, long hDst, int nDstOffset)
{
    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.ADD_TENSOR, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hSrcDesc, hSrc, nSrcOffset, 0, hDstDesc, hDst, nDstOffset));
        return;
    }

    m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.ADD_TENSOR, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hSrcDesc, hSrc, nSrcOffset, 0, hDstDesc, hDst, nDstOffset));
}
3662
3663
/// <summary>
/// Issues the CREATE_FILTERDESC command (no arguments) and returns the first result element, cast to
/// long, as the handle.
/// </summary>
/// <returns>The first result element cast to long.</returns>
public long CreateFilterDesc()
{
    return (m_dt == DataType.DOUBLE)
        ? (long)m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_FILTERDESC, null)[0]
        : (long)m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_FILTERDESC, null)[0];
}
3681
/// <summary>
/// Issues the FREE_FILTERDESC command for the given filter descriptor handle.
/// </summary>
/// <param name="h">Handle of the filter descriptor; passed as the only argument of the command.</param>
public void FreeFilterDesc(long h)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_FILTERDESC, m_param.AsDouble(h));
            break;

        default:
            m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_FILTERDESC, m_param.AsFloat(h));
            break;
    }
}
3693
/// <summary>
/// Issues the SET_FILTERNDDESC command for an N-dimensional filter descriptor.
/// </summary>
/// <remarks>
/// The long-argument array is laid out as: descriptor handle, half flag (1 or 0), number of
/// dimensions, then all dimension values.
/// </remarks>
/// <param name="hHandle">Handle of the filter descriptor.</param>
/// <param name="rgDim">Dimension values.</param>
/// <param name="bHalf">Sent as 1 when true, 0 otherwise (defaults to false).</param>
public void SetFilterNdDesc(long hHandle, int[] rgDim, bool bHalf = false)
{
    // The argument list does not depend on the base type, so it is assembled once.
    List<long> rgArg = new List<long>(3 + rgDim.Length);
    rgArg.Add(hHandle);
    rgArg.Add((bHalf) ? 1 : 0);
    rgArg.Add(rgDim.Length);

    foreach (int nDim in rgDim)
    {
        rgArg.Add(nDim);
    }

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_FILTERNDDESC, null, rgArg.ToArray());
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_FILTERNDDESC, null, rgArg.ToArray());
    }
}
3725
/// <summary>
/// Issues the SET_FILTERDESC command with the four extents n, c, h and w.
/// </summary>
/// <param name="hHandle">Handle of the filter descriptor.</param>
/// <param name="n">First extent.</param>
/// <param name="c">Second extent.</param>
/// <param name="h">Third extent.</param>
/// <param name="w">Fourth extent.</param>
/// <param name="bHalf">Sent as 1 when true, 0 otherwise (defaults to false).</param>
public void SetFilterDesc(long hHandle, int n, int c, int h, int w, bool bHalf = false)
{
    // Both precisions send exactly the same long-argument array.
    long[] rgArg = m_param.AsLong(hHandle, (bHalf) ? 1 : 0, n, c, h, w);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_FILTERDESC, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_FILTERDESC, null, rgArg);
    }
}
3742
// NOTE(review): the method signature line is missing from this listing. The body below issues the
// CREATE_CONVDESC command with no arguments and returns the first result element cast to long --
// presumably this is the convolution descriptor creation method; confirm against the original file.
3748 {
3749 if (m_dt == DataType.DOUBLE)
3750 {
3751 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_CONVDESC, null);
3752 return (long)rg[0];
3753 }
3754 else
3755 {
3756 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_CONVDESC, null);
3757 return (long)rg[0];
3758 }
3759 }
3760
/// <summary>
/// Issues the FREE_CONVDESC command for the given convolution descriptor handle.
/// </summary>
/// <param name="h">Handle of the convolution descriptor; passed as the only argument of the command.</param>
public void FreeConvolutionDesc(long h)
{
    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_CONVDESC, m_param.AsFloat(h));
        return;
    }

    m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_CONVDESC, m_param.AsDouble(h));
}
3772
/// <summary>
/// Issues the SET_CONVDESC command with padding, stride and dilation values.
/// </summary>
/// <param name="hHandle">Handle of the convolution descriptor.</param>
/// <param name="hPad">Padding value sent for h.</param>
/// <param name="wPad">Padding value sent for w.</param>
/// <param name="hStride">Stride value sent for h.</param>
/// <param name="wStride">Stride value sent for w.</param>
/// <param name="hDilation">Dilation value sent for h.</param>
/// <param name="wDilation">Dilation value sent for w.</param>
/// <param name="bUseTensorCores">Sent as 1 when true, 0 otherwise (last argument).</param>
/// <param name="bHalf">Sent as 1 when true, 0 otherwise (second argument; defaults to false).</param>
public void SetConvolutionDesc(long hHandle, int hPad, int wPad, int hStride, int wStride, int hDilation, int wDilation, bool bUseTensorCores, bool bHalf = false)
{
    // Both precisions send exactly the same long-argument array.
    long[] rgArg = m_param.AsLong(hHandle, (bHalf) ? 1 : 0, hPad, wPad, hStride, wStride, hDilation, wDilation, (bUseTensorCores) ? 1 : 0);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_CONVDESC, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_CONVDESC, null, rgArg);
    }
}
3792
/// <summary>
/// Issues the GET_CONVINFO command and unpacks the six result values into the out parameters.
/// </summary>
/// <remarks>
/// The call runs while holding <c>m_getconvSync</c>. Results are read in the order: forward
/// algorithm, forward workspace size, backward-filter algorithm, backward-filter workspace size,
/// backward-data algorithm, backward-data workspace size.
/// </remarks>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="hBottomDesc">Handle of the bottom tensor descriptor.</param>
/// <param name="hFilterDesc">Handle of the filter descriptor.</param>
/// <param name="hConvDesc">Handle of the convolution descriptor.</param>
/// <param name="hTopDesc">Handle of the top tensor descriptor.</param>
/// <param name="lWorkspaceSizeLimitInBytes">Workspace size limit; sent as a long.</param>
/// <param name="bUseTensorCores">Sent as 1 when true, 0 otherwise.</param>
/// <param name="algoFwd">Receives result element 0.</param>
/// <param name="lWsSizeFwd">Receives result element 1.</param>
/// <param name="algoBwdFilter">Receives result element 2.</param>
/// <param name="lWsSizeBwdFilter">Receives result element 3.</param>
/// <param name="algoBwdData">Receives result element 4.</param>
/// <param name="lWsSizeBwdData">Receives result element 5.</param>
/// <param name="preferredFwdAlgo">Preferred forward algorithm; sent as its integral value (defaults to NONE).</param>
public void GetConvolutionInfo(long hCuDnn, long hBottomDesc, long hFilterDesc, long hConvDesc, long hTopDesc, ulong lWorkspaceSizeLimitInBytes, bool bUseTensorCores, out CONV_FWD_ALGO algoFwd, out ulong lWsSizeFwd, out CONV_BWD_FILTER_ALGO algoBwdFilter, out ulong lWsSizeBwdFilter, out CONV_BWD_DATA_ALGO algoBwdData, out ulong lWsSizeBwdData, CONV_FWD_ALGO preferredFwdAlgo = CONV_FWD_ALGO.NONE)
{
    lock (m_getconvSync)
    {
        // Both precisions send exactly the same long-argument array.
        long[] rgArg = m_param.AsLong(hCuDnn, hBottomDesc, hFilterDesc, hConvDesc, hTopDesc, (long)lWorkspaceSizeLimitInBytes, (bUseTensorCores) ? 1 : 0, (int)preferredFwdAlgo);

        if (m_dt != DataType.DOUBLE)
        {
            float[] rgInfoF = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.GET_CONVINFO, null, rgArg);

            algoFwd = (CONV_FWD_ALGO)rgInfoF[0];
            lWsSizeFwd = (ulong)rgInfoF[1];
            algoBwdFilter = (CONV_BWD_FILTER_ALGO)rgInfoF[2];
            lWsSizeBwdFilter = (ulong)rgInfoF[3];
            algoBwdData = (CONV_BWD_DATA_ALGO)rgInfoF[4];
            lWsSizeBwdData = (ulong)rgInfoF[5];
            return;
        }

        double[] rgInfoD = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.GET_CONVINFO, null, rgArg);

        algoFwd = (CONV_FWD_ALGO)rgInfoD[0];
        lWsSizeFwd = (ulong)rgInfoD[1];
        algoBwdFilter = (CONV_BWD_FILTER_ALGO)rgInfoD[2];
        lWsSizeBwdFilter = (ulong)rgInfoD[3];
        algoBwdData = (CONV_BWD_DATA_ALGO)rgInfoD[4];
        lWsSizeBwdData = (ulong)rgInfoD[5];
    }
}
3836
/// <summary>
/// Convenience overload of ConvolutionForward that uses <c>m_tOne</c> for alpha and <c>m_tZero</c>
/// for beta and forwards every other argument unchanged.
/// </summary>
/// <remarks>
/// The workspace handle and offset are forwarded in the workspace positions of the full overload.
/// Previously the weight handle and weight offset were passed there instead, so the caller's
/// workspace was ignored and the weight buffer was handed to the kernel as workspace.
/// </remarks>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="hBottomDesc">Handle of the bottom tensor descriptor.</param>
/// <param name="hBottomData">Handle of the bottom data.</param>
/// <param name="nBottomOffset">Offset forwarded for the bottom data.</param>
/// <param name="hFilterDesc">Handle of the filter descriptor.</param>
/// <param name="hWeight">Handle of the weight data.</param>
/// <param name="nWeightOffset">Offset forwarded for the weight data.</param>
/// <param name="hConvDesc">Handle of the convolution descriptor.</param>
/// <param name="algoFwd">Forward algorithm to use.</param>
/// <param name="hWorkspace">Handle of the workspace.</param>
/// <param name="nWorkspaceOffset">Offset forwarded for the workspace.</param>
/// <param name="lWorkspaceSize">Size of the workspace.</param>
/// <param name="hTopDesc">Handle of the top tensor descriptor.</param>
/// <param name="hTopData">Handle of the top data.</param>
/// <param name="nTopOffset">Offset forwarded for the top data.</param>
/// <param name="bSyncStream">Forwarded to the full overload (defaults to true).</param>
public void ConvolutionForward(long hCuDnn, long hBottomDesc, long hBottomData, int nBottomOffset, long hFilterDesc, long hWeight, int nWeightOffset, long hConvDesc, CONV_FWD_ALGO algoFwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, long hTopDesc, long hTopData, int nTopOffset, bool bSyncStream = true)
{
    ConvolutionForward(hCuDnn, m_tOne, hBottomDesc, hBottomData, nBottomOffset, hFilterDesc, hWeight, nWeightOffset, hConvDesc, algoFwd, hWorkspace, nWorkspaceOffset, lWorkspaceSize, m_tZero, hTopDesc, hTopData, nTopOffset, bSyncStream);
}
3860
/// <summary>
/// Issues the FWD_CONV command with explicit alpha and beta scalars.
/// </summary>
/// <remarks>
/// The scalars travel in the floating point array; the long array carries literal 0 entries at the
/// positions where alpha and beta appear in this method's signature.
/// </remarks>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="fAlpha">Alpha scalar.</param>
/// <param name="hBottomDesc">Handle of the bottom tensor descriptor.</param>
/// <param name="hBottomData">Handle of the bottom data.</param>
/// <param name="nBottomOffset">Offset forwarded for the bottom data.</param>
/// <param name="hFilterDesc">Handle of the filter descriptor.</param>
/// <param name="hWeight">Handle of the weight data.</param>
/// <param name="nWeightOffset">Offset forwarded for the weight data.</param>
/// <param name="hConvDesc">Handle of the convolution descriptor.</param>
/// <param name="algoFwd">Forward algorithm; sent as its integral value.</param>
/// <param name="hWorkspace">Handle of the workspace.</param>
/// <param name="nWorkspaceOffset">Offset forwarded for the workspace.</param>
/// <param name="lWorkspaceSize">Size of the workspace; sent as a long.</param>
/// <param name="fBeta">Beta scalar.</param>
/// <param name="hTopDesc">Handle of the top tensor descriptor.</param>
/// <param name="hTopData">Handle of the top data.</param>
/// <param name="nTopOffset">Offset forwarded for the top data.</param>
/// <param name="bSyncStream">Sent as 1 when true, 0 otherwise (defaults to true).</param>
public void ConvolutionForward(long hCuDnn, T fAlpha, long hBottomDesc, long hBottomData, int nBottomOffset, long hFilterDesc, long hWeight, int nWeightOffset, long hConvDesc, CONV_FWD_ALGO algoFwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, T fBeta, long hTopDesc, long hTopData, int nTopOffset, bool bSyncStream = true)
{
    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FWD_CONV, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDesc, hBottomData, nBottomOffset, hFilterDesc, hWeight, nWeightOffset, hConvDesc, (long)algoFwd, hWorkspace, nWorkspaceOffset, (long)lWorkspaceSize, 0, hTopDesc, hTopData, nTopOffset, (bSyncStream) ? 1 : 0));
        return;
    }

    m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FWD_CONV, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDesc, hBottomData, nBottomOffset, hFilterDesc, hWeight, nWeightOffset, hConvDesc, (long)algoFwd, hWorkspace, nWorkspaceOffset, (long)lWorkspaceSize, 0, hTopDesc, hTopData, nTopOffset, (bSyncStream) ? 1 : 0));
}
3889
/// <summary>
/// Convenience overload of ConvolutionBackwardBias that uses <c>m_tOne</c> for both the alpha and
/// beta scalars.
/// </summary>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="hTopDesc">Handle of the top tensor descriptor.</param>
/// <param name="hTopDiff">Handle of the top diff.</param>
/// <param name="nTopOffset">Offset forwarded for the top diff.</param>
/// <param name="hBiasDesc">Handle of the bias tensor descriptor.</param>
/// <param name="hBiasDiff">Handle of the bias diff.</param>
/// <param name="nBiasOffset">Offset forwarded for the bias diff.</param>
/// <param name="bSyncStream">Forwarded to the full overload (defaults to true).</param>
public void ConvolutionBackwardBias(long hCuDnn, long hTopDesc, long hTopDiff, int nTopOffset, long hBiasDesc, long hBiasDiff, int nBiasOffset, bool bSyncStream = true)
{
    T fAlpha = m_tOne;
    T fBeta = m_tOne;

    ConvolutionBackwardBias(hCuDnn, fAlpha, hTopDesc, hTopDiff, nTopOffset, fBeta, hBiasDesc, hBiasDiff, nBiasOffset, bSyncStream);
}
3905
/// <summary>
/// Issues the BWD_CONV_BIAS command with explicit alpha and beta scalars.
/// </summary>
/// <remarks>
/// The scalars travel in the floating point array; the long array carries literal 0 entries at the
/// positions where alpha and beta appear in this method's signature.
/// </remarks>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="fAlpha">Alpha scalar.</param>
/// <param name="hTopDesc">Handle of the top tensor descriptor.</param>
/// <param name="hTopDiff">Handle of the top diff.</param>
/// <param name="nTopOffset">Offset forwarded for the top diff.</param>
/// <param name="fBeta">Beta scalar.</param>
/// <param name="hBiasDesc">Handle of the bias tensor descriptor.</param>
/// <param name="hBiasDiff">Handle of the bias diff.</param>
/// <param name="nBiasOffset">Offset forwarded for the bias diff.</param>
/// <param name="bSyncStream">Sent as 1 when true, 0 otherwise (defaults to true).</param>
public void ConvolutionBackwardBias(long hCuDnn, T fAlpha, long hTopDesc, long hTopDiff, int nTopOffset, T fBeta, long hBiasDesc, long hBiasDiff, int nBiasOffset, bool bSyncStream = true)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_CONV_BIAS, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDesc, hTopDiff, nTopOffset, 0, hBiasDesc, hBiasDiff, nBiasOffset, (bSyncStream) ? 1 : 0));
            break;

        default:
            m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_CONV_BIAS, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDesc, hTopDiff, nTopOffset, 0, hBiasDesc, hBiasDiff, nBiasOffset, (bSyncStream) ? 1 : 0));
            break;
    }
}
3926
/// <summary>
/// Convenience overload of ConvolutionBackwardFilter that uses <c>m_tOne</c> for both the alpha and
/// beta scalars.
/// </summary>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="hBottomDesc">Handle of the bottom tensor descriptor.</param>
/// <param name="hBottomData">Handle of the bottom data.</param>
/// <param name="nBottomOffset">Offset forwarded for the bottom data.</param>
/// <param name="hTopDesc">Handle of the top tensor descriptor.</param>
/// <param name="hTopDiff">Handle of the top diff.</param>
/// <param name="nTopOffset">Offset forwarded for the top diff.</param>
/// <param name="hConvDesc">Handle of the convolution descriptor.</param>
/// <param name="algoBwd">Backward-filter algorithm to use.</param>
/// <param name="hWorkspace">Handle of the workspace.</param>
/// <param name="nWorkspaceOffset">Offset forwarded for the workspace.</param>
/// <param name="lWorkspaceSize">Size of the workspace.</param>
/// <param name="hFilterDesc">Handle of the filter descriptor.</param>
/// <param name="hWeightDiff">Handle of the weight diff.</param>
/// <param name="nWeightOffset">Offset forwarded for the weight diff.</param>
/// <param name="bSyncStream">Forwarded to the full overload.</param>
public void ConvolutionBackwardFilter(long hCuDnn, long hBottomDesc, long hBottomData, int nBottomOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_FILTER_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, long hFilterDesc, long hWeightDiff, int nWeightOffset, bool bSyncStream)
{
    T fAlpha = m_tOne;
    T fBeta = m_tOne;

    ConvolutionBackwardFilter(hCuDnn, fAlpha, hBottomDesc, hBottomData, nBottomOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, algoBwd, hWorkspace, nWorkspaceOffset, lWorkspaceSize, fBeta, hFilterDesc, hWeightDiff, nWeightOffset, bSyncStream);
}
3950
/// <summary>
/// Issues the BWD_CONV_FILTER command with explicit alpha and beta scalars.
/// </summary>
/// <remarks>
/// The scalars travel in the floating point array; the long array carries literal 0 entries at the
/// positions where alpha and beta appear in this method's signature.
/// </remarks>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="fAlpha">Alpha scalar.</param>
/// <param name="hBottomDesc">Handle of the bottom tensor descriptor.</param>
/// <param name="hBottomData">Handle of the bottom data.</param>
/// <param name="nBottomOffset">Offset forwarded for the bottom data.</param>
/// <param name="hTopDesc">Handle of the top tensor descriptor.</param>
/// <param name="hTopDiff">Handle of the top diff.</param>
/// <param name="nTopOffset">Offset forwarded for the top diff.</param>
/// <param name="hConvDesc">Handle of the convolution descriptor.</param>
/// <param name="algoBwd">Backward-filter algorithm; sent as its integral value.</param>
/// <param name="hWorkspace">Handle of the workspace.</param>
/// <param name="nWorkspaceOffset">Offset forwarded for the workspace.</param>
/// <param name="lWorkspaceSize">Size of the workspace; sent as a long.</param>
/// <param name="fBeta">Beta scalar.</param>
/// <param name="hFilterDesc">Handle of the filter descriptor.</param>
/// <param name="hWeightDiff">Handle of the weight diff.</param>
/// <param name="nWeightOffset">Offset forwarded for the weight diff.</param>
/// <param name="bSyncStream">Sent as 1 when true, 0 otherwise (defaults to true).</param>
public void ConvolutionBackwardFilter(long hCuDnn, T fAlpha, long hBottomDesc, long hBottomData, int nBottomOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_FILTER_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, T fBeta, long hFilterDesc, long hWeightDiff, int nWeightOffset, bool bSyncStream = true)
{
    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_CONV_FILTER, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDesc, hBottomData, nBottomOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, (long)algoBwd, hWorkspace, nWorkspaceOffset, (long)lWorkspaceSize, 0, hFilterDesc, hWeightDiff, nWeightOffset, (bSyncStream) ? 1 : 0));
        return;
    }

    m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_CONV_FILTER, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDesc, hBottomData, nBottomOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, (long)algoBwd, hWorkspace, nWorkspaceOffset, (long)lWorkspaceSize, 0, hFilterDesc, hWeightDiff, nWeightOffset, (bSyncStream) ? 1 : 0));
}
3979
/// <summary>
/// Convenience overload of ConvolutionBackwardData that uses <c>m_tOne</c> for alpha and
/// <c>m_tZero</c> for beta.
/// </summary>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="hFilterDesc">Handle of the filter descriptor.</param>
/// <param name="hWeight">Handle of the weight data.</param>
/// <param name="nWeightOffset">Offset forwarded for the weight data.</param>
/// <param name="hTopDesc">Handle of the top tensor descriptor.</param>
/// <param name="hTopDiff">Handle of the top diff.</param>
/// <param name="nTopOffset">Offset forwarded for the top diff.</param>
/// <param name="hConvDesc">Handle of the convolution descriptor.</param>
/// <param name="algoBwd">Backward-data algorithm to use.</param>
/// <param name="hWorkspace">Handle of the workspace.</param>
/// <param name="nWorkspaceOffset">Offset forwarded for the workspace.</param>
/// <param name="lWorkspaceSize">Size of the workspace.</param>
/// <param name="hBottomDesc">Handle of the bottom tensor descriptor.</param>
/// <param name="hBottomDiff">Handle of the bottom diff.</param>
/// <param name="nBottomOffset">Offset forwarded for the bottom diff.</param>
/// <param name="bSyncStream">Forwarded to the full overload (defaults to true).</param>
public void ConvolutionBackwardData(long hCuDnn, long hFilterDesc, long hWeight, int nWeightOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_DATA_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, long hBottomDesc, long hBottomDiff, int nBottomOffset, bool bSyncStream = true)
{
    T fAlpha = m_tOne;
    T fBeta = m_tZero;

    ConvolutionBackwardData(hCuDnn, fAlpha, hFilterDesc, hWeight, nWeightOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, algoBwd, hWorkspace, nWorkspaceOffset, lWorkspaceSize, fBeta, hBottomDesc, hBottomDiff, nBottomOffset, bSyncStream);
}
4003
/// <summary>
/// Issues the BWD_CONV_DATA command with explicit alpha and beta scalars.
/// </summary>
/// <remarks>
/// The scalars travel in the floating point array; the long array carries literal 0 entries at the
/// positions where alpha and beta appear in this method's signature.
/// </remarks>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="fAlpha">Alpha scalar.</param>
/// <param name="hFilterDesc">Handle of the filter descriptor.</param>
/// <param name="hWeight">Handle of the weight data.</param>
/// <param name="nWeightOffset">Offset forwarded for the weight data.</param>
/// <param name="hTopDesc">Handle of the top tensor descriptor.</param>
/// <param name="hTopDiff">Handle of the top diff.</param>
/// <param name="nTopOffset">Offset forwarded for the top diff.</param>
/// <param name="hConvDesc">Handle of the convolution descriptor.</param>
/// <param name="algoBwd">Backward-data algorithm; sent as its integral value.</param>
/// <param name="hWorkspace">Handle of the workspace.</param>
/// <param name="nWorkspaceOffset">Offset forwarded for the workspace.</param>
/// <param name="lWorkspaceSize">Size of the workspace; sent as a long.</param>
/// <param name="fBeta">Beta scalar.</param>
/// <param name="hBottomDesc">Handle of the bottom tensor descriptor.</param>
/// <param name="hBottomDiff">Handle of the bottom diff.</param>
/// <param name="nBottomOffset">Offset forwarded for the bottom diff.</param>
/// <param name="bSyncStream">Sent as 1 when true, 0 otherwise (defaults to true).</param>
public void ConvolutionBackwardData(long hCuDnn, T fAlpha, long hFilterDesc, long hWeight, int nWeightOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_DATA_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, T fBeta, long hBottomDesc, long hBottomDiff, int nBottomOffset, bool bSyncStream = true)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_CONV_DATA, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hFilterDesc, hWeight, nWeightOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, (long)algoBwd, hWorkspace, nWorkspaceOffset, (long)lWorkspaceSize, 0, hBottomDesc, hBottomDiff, nBottomOffset, (bSyncStream) ? 1 : 0));
            break;

        default:
            m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_CONV_DATA, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hFilterDesc, hWeight, nWeightOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, (long)algoBwd, hWorkspace, nWorkspaceOffset, (long)lWorkspaceSize, 0, hBottomDesc, hBottomDiff, nBottomOffset, (bSyncStream) ? 1 : 0));
            break;
    }
}
4032
/// <summary>
/// Issues the CREATE_POOLDESC command (no arguments) and returns the first result element, cast to
/// long, as the handle.
/// </summary>
/// <returns>The first result element cast to long.</returns>
public long CreatePoolingDesc()
{
    if (m_dt != DataType.DOUBLE)
    {
        float[] rgResultF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_POOLDESC, null);
        return (long)rgResultF[0];
    }

    double[] rgResultD = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_POOLDESC, null);
    return (long)rgResultD[0];
}
4050
/// <summary>
/// Issues the FREE_POOLDESC command for the given pooling descriptor handle.
/// </summary>
/// <param name="h">Handle of the pooling descriptor; passed as the only argument of the command.</param>
public void FreePoolingDesc(long h)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_POOLDESC, m_param.AsDouble(h));
            break;

        default:
            m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_POOLDESC, m_param.AsFloat(h));
            break;
    }
}
4062
/// <summary>
/// Issues the SET_POOLDESC command with the pooling method, window, padding and stride values.
/// </summary>
/// <param name="hHandle">Handle of the pooling descriptor.</param>
/// <param name="method">Pooling method; sent as its integral value.</param>
/// <param name="h">Window value sent for h.</param>
/// <param name="w">Window value sent for w.</param>
/// <param name="hPad">Padding value sent for h.</param>
/// <param name="wPad">Padding value sent for w.</param>
/// <param name="hStride">Stride value sent for h.</param>
/// <param name="wStride">Stride value sent for w.</param>
public void SetPoolingDesc(long hHandle, PoolingMethod method, int h, int w, int hPad, int wPad, int hStride, int wStride)
{
    // Both precisions send exactly the same long-argument array.
    long[] rgArg = m_param.AsLong(hHandle, (int)method, h, w, hPad, wPad, hStride, wStride);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_POOLDESC, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_POOLDESC, null, rgArg);
    }
}
4081
/// <summary>
/// Issues the FWD_POOL command with explicit alpha and beta scalars.
/// </summary>
/// <remarks>
/// The scalars travel in the floating point array; the long array carries literal 0 entries at the
/// positions where alpha and beta appear in this method's signature.
/// </remarks>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="hPoolingDesc">Handle of the pooling descriptor.</param>
/// <param name="fAlpha">Alpha scalar.</param>
/// <param name="hBottomDesc">Handle of the bottom tensor descriptor.</param>
/// <param name="hBottomData">Handle of the bottom data.</param>
/// <param name="fBeta">Beta scalar.</param>
/// <param name="hTopDesc">Handle of the top tensor descriptor.</param>
/// <param name="hTopData">Handle of the top data.</param>
public void PoolingForward(long hCuDnn, long hPoolingDesc, T fAlpha, long hBottomDesc, long hBottomData, T fBeta, long hTopDesc, long hTopData)
{
    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FWD_POOL, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, hPoolingDesc, 0, hBottomDesc, hBottomData, 0, hTopDesc, hTopData));
        return;
    }

    m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FWD_POOL, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, hPoolingDesc, 0, hBottomDesc, hBottomData, 0, hTopDesc, hTopData));
}
4100
/// <summary>
/// Issues the BWD_POOL command with explicit alpha and beta scalars.
/// </summary>
/// <remarks>
/// The scalars travel in the floating point array; the long array carries literal 0 entries at the
/// positions where alpha and beta appear in this method's signature.
/// </remarks>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="hPoolingDesc">Handle of the pooling descriptor.</param>
/// <param name="fAlpha">Alpha scalar.</param>
/// <param name="hTopDataDesc">Handle of the top data tensor descriptor.</param>
/// <param name="hTopData">Handle of the top data.</param>
/// <param name="hTopDiffDesc">Handle of the top diff tensor descriptor.</param>
/// <param name="hTopDiff">Handle of the top diff.</param>
/// <param name="hBottomDataDesc">Handle of the bottom data tensor descriptor.</param>
/// <param name="hBottomData">Handle of the bottom data.</param>
/// <param name="fBeta">Beta scalar.</param>
/// <param name="hBottomDiffDesc">Handle of the bottom diff tensor descriptor.</param>
/// <param name="hBottomDiff">Handle of the bottom diff.</param>
public void PoolingBackward(long hCuDnn, long hPoolingDesc, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_POOL, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, hPoolingDesc, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
            break;

        default:
            m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_POOL, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, hPoolingDesc, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
            break;
    }
}
4123
/// <summary>
/// Issues the DERIVE_BNDESC command with the forward and backward descriptor handles and the mode.
/// </summary>
/// <param name="hFwdScaleBiasMeanVarDesc">Handle of the forward scale/bias/mean/var descriptor.</param>
/// <param name="hFwdBottomDesc">Handle of the forward bottom descriptor.</param>
/// <param name="hBwdScaleBiasMeanVarDesc">Handle of the backward scale/bias/mean/var descriptor.</param>
/// <param name="hBwdBottomDesc">Handle of the backward bottom descriptor.</param>
/// <param name="mode">Batch normalization mode; sent as its integral value.</param>
public void DeriveBatchNormDesc(long hFwdScaleBiasMeanVarDesc, long hFwdBottomDesc, long hBwdScaleBiasMeanVarDesc, long hBwdBottomDesc, BATCHNORM_MODE mode)
{
    // Both precisions send exactly the same long-argument array.
    long[] rgArg = m_param.AsLong(hFwdScaleBiasMeanVarDesc, hFwdBottomDesc, hBwdScaleBiasMeanVarDesc, hBwdBottomDesc, (int)mode);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.DERIVE_BNDESC, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.DERIVE_BNDESC, null, rgArg);
    }
}
4139
/// <summary>
/// Issues the FWD_BN command.
/// </summary>
/// <remarks>
/// Alpha, beta, the factor and epsilon travel in the floating point array (in that order); the long
/// array carries literal 0 entries at the positions where those four values appear in this method's
/// signature.
/// </remarks>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="mode">Batch normalization mode; sent as its integral value.</param>
/// <param name="fAlpha">Alpha scalar.</param>
/// <param name="fBeta">Beta scalar.</param>
/// <param name="hFwdBottomDesc">Handle of the forward bottom descriptor.</param>
/// <param name="hBottomData">Handle of the bottom data.</param>
/// <param name="hFwdTopDesc">Handle of the forward top descriptor.</param>
/// <param name="hTopData">Handle of the top data.</param>
/// <param name="hFwdScaleBiasMeanVarDesc">Handle of the forward scale/bias/mean/var descriptor.</param>
/// <param name="hScaleData">Handle of the scale data.</param>
/// <param name="hBiasData">Handle of the bias data.</param>
/// <param name="dfFactor">Factor value (narrowed to float on the float path).</param>
/// <param name="hGlobalMean">Handle of the global mean.</param>
/// <param name="hGlobalVar">Handle of the global variance.</param>
/// <param name="dfEps">Epsilon value (narrowed to float on the float path).</param>
/// <param name="hSaveMean">Handle of the saved mean.</param>
/// <param name="hSaveInvVar">Handle of the saved inverse variance.</param>
/// <param name="bTraining">Sent as 1 when true, 0 otherwise.</param>
public void BatchNormForward(long hCuDnn, BATCHNORM_MODE mode, T fAlpha, T fBeta, long hFwdBottomDesc, long hBottomData, long hFwdTopDesc, long hTopData, long hFwdScaleBiasMeanVarDesc, long hScaleData, long hBiasData, double dfFactor, long hGlobalMean, long hGlobalVar, double dfEps, long hSaveMean, long hSaveInvVar, bool bTraining)
{
    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FWD_BN, m_param.AsFloat(convertF(fAlpha), convertF(fBeta), (float)dfFactor, (float)dfEps), m_param.AsLong(hCuDnn, (int)mode, 0, 0, hFwdBottomDesc, hBottomData, hFwdTopDesc, hTopData, hFwdScaleBiasMeanVarDesc, hScaleData, hBiasData, 0, hGlobalMean, hGlobalVar, 0, hSaveMean, hSaveInvVar, (bTraining) ? 1 : 0));
        return;
    }

    m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FWD_BN, m_param.AsDouble(convertD(fAlpha), convertD(fBeta), dfFactor, dfEps), m_param.AsLong(hCuDnn, (int)mode, 0, 0, hFwdBottomDesc, hBottomData, hFwdTopDesc, hTopData, hFwdScaleBiasMeanVarDesc, hScaleData, hBiasData, 0, hGlobalMean, hGlobalVar, 0, hSaveMean, hSaveInvVar, (bTraining) ? 1 : 0));
}
4168
/// <summary>
/// Issues the BWD_BN command.
/// </summary>
/// <remarks>
/// The four alpha/beta scalars and epsilon travel in the floating point array (in that order); the
/// long array carries literal 0 entries at the positions where those five values appear in this
/// method's signature.
/// </remarks>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="mode">Batch normalization mode; sent as its integral value.</param>
/// <param name="fAlphaDiff">Alpha scalar for the diff.</param>
/// <param name="fBetaDiff">Beta scalar for the diff.</param>
/// <param name="fAlphaParamDiff">Alpha scalar for the parameter diff.</param>
/// <param name="fBetaParamDiff">Beta scalar for the parameter diff.</param>
/// <param name="hBwdBottomDesc">Handle of the backward bottom descriptor.</param>
/// <param name="hBottomData">Handle of the bottom data.</param>
/// <param name="hTopDiffDesc">Handle of the top diff descriptor.</param>
/// <param name="hTopDiff">Handle of the top diff.</param>
/// <param name="hBottomDiffDesc">Handle of the bottom diff descriptor.</param>
/// <param name="hBottomDiff">Handle of the bottom diff.</param>
/// <param name="hBwdScaleBiasMeanVarDesc">Handle of the backward scale/bias/mean/var descriptor.</param>
/// <param name="hScaleData">Handle of the scale data.</param>
/// <param name="hScaleDiff">Handle of the scale diff.</param>
/// <param name="hBiasDiff">Handle of the bias diff.</param>
/// <param name="dfEps">Epsilon value (narrowed to float on the float path).</param>
/// <param name="hSaveMean">Handle of the saved mean.</param>
/// <param name="hSaveInvVar">Handle of the saved inverse variance.</param>
public void BatchNormBackward(long hCuDnn, BATCHNORM_MODE mode, T fAlphaDiff, T fBetaDiff, T fAlphaParamDiff, T fBetaParamDiff, long hBwdBottomDesc, long hBottomData, long hTopDiffDesc, long hTopDiff, long hBottomDiffDesc, long hBottomDiff, long hBwdScaleBiasMeanVarDesc, long hScaleData, long hScaleDiff, long hBiasDiff, double dfEps, long hSaveMean, long hSaveInvVar)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_BN, m_param.AsDouble(convertD(fAlphaDiff), convertD(fBetaDiff), convertD(fAlphaParamDiff), convertD(fBetaParamDiff), dfEps), m_param.AsLong(hCuDnn, (int)mode, 0, 0, 0, 0, hBwdBottomDesc, hBottomData, hTopDiffDesc, hTopDiff, hBottomDiffDesc, hBottomDiff, hBwdScaleBiasMeanVarDesc, hScaleData, hScaleDiff, hBiasDiff, 0, hSaveMean, hSaveInvVar));
            break;

        default:
            m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_BN, m_param.AsFloat(convertF(fAlphaDiff), convertF(fBetaDiff), convertF(fAlphaParamDiff), convertF(fBetaParamDiff), (float)dfEps), m_param.AsLong(hCuDnn, (int)mode, 0, 0, 0, 0, hBwdBottomDesc, hBottomData, hTopDiffDesc, hTopDiff, hBottomDiffDesc, hBottomDiff, hBwdScaleBiasMeanVarDesc, hScaleData, hScaleDiff, hBiasDiff, 0, hSaveMean, hSaveInvVar));
            break;
    }
}
4198
/// <summary>
/// Issues the CREATE_DROPOUTDESC command (no arguments) and returns the first result element, cast
/// to long, as the handle.
/// </summary>
/// <returns>The first result element cast to long.</returns>
public long CreateDropoutDesc()
{
    return (m_dt == DataType.DOUBLE)
        ? (long)m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_DROPOUTDESC, null)[0]
        : (long)m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_DROPOUTDESC, null)[0];
}
4216
/// <summary>
/// Issues the FREE_DROPOUTDESC command for the given dropout descriptor handle.
/// </summary>
/// <param name="h">Handle of the dropout descriptor; passed as the only argument of the command.</param>
public void FreeDropoutDesc(long h)
{
    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_DROPOUTDESC, m_param.AsFloat(h));
        return;
    }

    m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_DROPOUTDESC, m_param.AsDouble(h));
}
4228
/// <summary>
/// Issues the SET_DROPOUTDESC command.
/// </summary>
/// <remarks>
/// The dropout ratio travels in the floating point array; the long array carries a literal 0 at the
/// position where the ratio appears in this method's signature.
/// </remarks>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="hDropoutDesc">Handle of the dropout descriptor.</param>
/// <param name="dfDropout">Dropout value (narrowed to float on the float path).</param>
/// <param name="hStates">Handle of the states data.</param>
/// <param name="lSeed">Seed forwarded as the last long argument.</param>
public void SetDropoutDesc(long hCuDnn, long hDropoutDesc, double dfDropout, long hStates, long lSeed)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_DROPOUTDESC, m_param.AsDouble(dfDropout), m_param.AsLong(hCuDnn, hDropoutDesc, 0, hStates, lSeed));
            break;

        default:
            m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_DROPOUTDESC, m_param.AsFloat((float)dfDropout), m_param.AsLong(hCuDnn, hDropoutDesc, 0, hStates, lSeed));
            break;
    }
}
4244
/// <summary>
/// Issues the GET_DROPOUT_INFO command and converts the two returned values into element counts.
/// </summary>
/// <remarks>
/// Each returned value is divided by the size of the active base type (sizeof(double) or
/// sizeof(float)) and rounded to the nearest integer, with midpoints rounded away from zero.
/// NOTE(review): the division suggests the kernel reports sizes in bytes -- confirm.
/// </remarks>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="hBottomDesc">Handle of the bottom tensor descriptor.</param>
/// <param name="ulStateCount">Receives the count derived from result element 0.</param>
/// <param name="ulReservedCount">Receives the count derived from result element 1.</param>
public void GetDropoutInfo(long hCuDnn, long hBottomDesc, out ulong ulStateCount, out ulong ulReservedCount)
{
    // Both precisions send exactly the same long-argument array.
    long[] rgArg = m_param.AsLong(hCuDnn, hBottomDesc);

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgInfoF = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.GET_DROPOUT_INFO, null, rgArg);
        double dfStatesF = Math.Round(rgInfoF[0] / sizeof(float), 0, MidpointRounding.AwayFromZero);
        double dfReservedF = Math.Round(rgInfoF[1] / sizeof(float), 0, MidpointRounding.AwayFromZero);

        ulStateCount = (ulong)dfStatesF;
        ulReservedCount = (ulong)dfReservedF;
        return;
    }

    double[] rgInfoD = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.GET_DROPOUT_INFO, null, rgArg);
    double dfStatesD = Math.Round(rgInfoD[0] / sizeof(double), 0, MidpointRounding.AwayFromZero);
    double dfReservedD = Math.Round(rgInfoD[1] / sizeof(double), 0, MidpointRounding.AwayFromZero);

    ulStateCount = (ulong)dfStatesD;
    ulReservedCount = (ulong)dfReservedD;
}
4267
/// <summary>
/// Issues the FWD_DROPOUT command; all arguments are forwarded unchanged in the long array.
/// </summary>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="hDropoutDesc">Handle of the dropout descriptor.</param>
/// <param name="hBottomDesc">Handle of the bottom tensor descriptor.</param>
/// <param name="hBottomData">Handle of the bottom data.</param>
/// <param name="hTopDesc">Handle of the top tensor descriptor.</param>
/// <param name="hTopData">Handle of the top data.</param>
/// <param name="hReserved">Handle of the reserved data.</param>
public void DropoutForward(long hCuDnn, long hDropoutDesc, long hBottomDesc, long hBottomData, long hTopDesc, long hTopData, long hReserved)
{
    // Both precisions send exactly the same long-argument array.
    long[] rgArg = m_param.AsLong(hCuDnn, hDropoutDesc, hBottomDesc, hBottomData, hTopDesc, hTopData, hReserved);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FWD_DROPOUT, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FWD_DROPOUT, null, rgArg);
    }
}
4285
/// <summary>
/// Issues the BWD_DROPOUT command; all arguments are forwarded unchanged in the long array.
/// </summary>
/// <param name="hCuDnn">Handle of the cuDNN instance.</param>
/// <param name="hDropoutDesc">Handle of the dropout descriptor.</param>
/// <param name="hTopDesc">Handle of the top tensor descriptor.</param>
/// <param name="hTop">Handle forwarded as the top argument.</param>
/// <param name="hBottomDesc">Handle of the bottom tensor descriptor.</param>
/// <param name="hBottom">Handle forwarded as the bottom argument.</param>
/// <param name="hReserved">Handle of the reserved data.</param>
public void DropoutBackward(long hCuDnn, long hDropoutDesc, long hTopDesc, long hTop, long hBottomDesc, long hBottom, long hReserved)
{
    // Both precisions send exactly the same long-argument array.
    long[] rgArg = m_param.AsLong(hCuDnn, hDropoutDesc, hTopDesc, hTop, hBottomDesc, hBottom, hReserved);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_DROPOUT, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_DROPOUT, null, rgArg);
    }
}
4303
/// <summary>
/// Issues the CREATE_LRNDESC command (no arguments) and returns the first result element, cast to
/// long, as the handle.
/// </summary>
/// <returns>The first result element cast to long.</returns>
public long CreateLRNDesc()
{
    if (m_dt != DataType.DOUBLE)
    {
        float[] rgResultF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_LRNDESC, null);
        return (long)rgResultF[0];
    }

    double[] rgResultD = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_LRNDESC, null);
    return (long)rgResultD[0];
}
4321
/// <summary>
/// Issues the FREE_LRNDESC command for the given LRN descriptor handle.
/// </summary>
/// <param name="h">Handle of the LRN descriptor; passed as the only argument of the command.</param>
public void FreeLRNDesc(long h)
{
    switch (m_dt)
    {
        case DataType.DOUBLE:
            m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_LRNDESC, m_param.AsDouble(h));
            break;

        default:
            m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_LRNDESC, m_param.AsFloat(h));
            break;
    }
}
4333
/// <summary>
/// Runs the SET_LRNDESC kernel function, sending (fAlpha, fBeta, fK) as floating-point arguments and
/// (hHandle, nSize, 0, 0, 0) as long arguments.
/// </summary>
/// <remarks>
/// In float mode the three coefficients are narrowed to float before being sent.
/// The three trailing zeros in the long list line up with fAlpha, fBeta and fK in the signature
/// (presumably index placeholders for the native side - confirm).
/// </remarks>
/// <param name="hHandle">Presumed LRN descriptor handle.</param>
/// <param name="nSize">Size value forwarded as a long (presumably the normalization window size - confirm).</param>
/// <param name="fAlpha">Alpha coefficient.</param>
/// <param name="fBeta">Beta coefficient.</param>
/// <param name="fK">K coefficient.</param>
public void SetLRNDesc(long hHandle, uint nSize, double fAlpha, double fBeta, double fK)
{
    int nFn = (int)CUDAFN.SET_LRNDESC;
    long[] rgArg = m_param.AsLong(hHandle, nSize, 0, 0, 0);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(fAlpha, fBeta, fK), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat((float)fAlpha, (float)fBeta, (float)fK), rgArg);
}
4349
/// <summary>
/// Runs the LRN_CC_FWD kernel function. fAlpha and fBeta travel in the floating-point argument array;
/// every handle travels in the long argument array.
/// </summary>
/// <remarks>
/// The long array contains 0 in the slots where fAlpha and fBeta appear in the signature
/// (presumably placeholders so native indices match - confirm).
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hNormDesc">Presumed LRN descriptor handle.</param>
/// <param name="fAlpha">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hBottomDesc">Presumed bottom tensor descriptor handle.</param>
/// <param name="hBottomData">Presumed bottom data memory handle.</param>
/// <param name="fBeta">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hTopDesc">Presumed top tensor descriptor handle.</param>
/// <param name="hTopData">Presumed top data memory handle.</param>
public void LRNCrossChannelForward(long hCuDnn, long hNormDesc, T fAlpha, long hBottomDesc, long hBottomData, T fBeta, long hTopDesc, long hTopData)
{
    int nFn = (int)CUDAFN.LRN_CC_FWD;
    long[] rgArg = m_param.AsLong(hCuDnn, hNormDesc, 0, hBottomDesc, hBottomData, 0, hTopDesc, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4368
/// <summary>
/// Runs the LRN_CC_BWD kernel function. fAlpha and fBeta travel in the floating-point argument array;
/// every handle travels in the long argument array.
/// </summary>
/// <remarks>
/// The long array contains 0 in the slots where fAlpha and fBeta appear in the signature
/// (presumably placeholders so native indices match - confirm).
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hNormDesc">Presumed LRN descriptor handle.</param>
/// <param name="fAlpha">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hTopDataDesc">Presumed top data descriptor handle.</param>
/// <param name="hTopData">Presumed top data memory handle.</param>
/// <param name="hTopDiffDesc">Presumed top diff descriptor handle.</param>
/// <param name="hTopDiff">Presumed top diff memory handle.</param>
/// <param name="hBottomDataDesc">Presumed bottom data descriptor handle.</param>
/// <param name="hBottomData">Presumed bottom data memory handle.</param>
/// <param name="fBeta">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hBottomDiffDesc">Presumed bottom diff descriptor handle.</param>
/// <param name="hBottomDiff">Presumed bottom diff memory handle.</param>
public void LRNCrossChannelBackward(long hCuDnn, long hNormDesc, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
{
    int nFn = (int)CUDAFN.LRN_CC_BWD;
    long[] rgArg = m_param.AsLong(hCuDnn, hNormDesc, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4391
/// <summary>
/// Runs the LCN_CC_FWD kernel function. fAlpha and fBeta travel in the floating-point argument array;
/// every handle travels in the long argument array.
/// </summary>
/// <remarks>
/// The long array contains 0 in the slots where fAlpha and fBeta appear in the signature
/// (presumably placeholders so native indices match - confirm).
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hNormDesc">Presumed normalization descriptor handle.</param>
/// <param name="fAlpha">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hBottomDataDesc">Presumed bottom data descriptor handle.</param>
/// <param name="hBottomData">Presumed bottom data memory handle.</param>
/// <param name="hTemp1">Presumed temporary memory handle #1.</param>
/// <param name="hTemp2">Presumed temporary memory handle #2.</param>
/// <param name="fBeta">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hTopDataDesc">Presumed top data descriptor handle.</param>
/// <param name="hTopData">Presumed top data memory handle.</param>
public void DivisiveNormalizationForward(long hCuDnn, long hNormDesc, T fAlpha, long hBottomDataDesc, long hBottomData, long hTemp1, long hTemp2, T fBeta, long hTopDataDesc, long hTopData)
{
    int nFn = (int)CUDAFN.LCN_CC_FWD;
    long[] rgArg = m_param.AsLong(hCuDnn, hNormDesc, 0, hBottomDataDesc, hBottomData, hTemp1, hTemp2, 0, hTopDataDesc, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4415
/// <summary>
/// Runs the LCN_CC_BWD kernel function. fAlpha and fBeta travel in the floating-point argument array;
/// every handle travels in the long argument array.
/// </summary>
/// <remarks>
/// The long array contains 0 in the slots where fAlpha and fBeta appear in the signature
/// (presumably placeholders so native indices match - confirm).
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hNormDesc">Presumed normalization descriptor handle.</param>
/// <param name="fAlpha">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hBottomDataDesc">Presumed bottom data descriptor handle.</param>
/// <param name="hBottomData">Presumed bottom data memory handle.</param>
/// <param name="hTopDiff">Presumed top diff memory handle.</param>
/// <param name="hTemp1">Presumed temporary memory handle #1.</param>
/// <param name="hTemp2">Presumed temporary memory handle #2.</param>
/// <param name="fBeta">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hBottomDiffDesc">Presumed bottom diff descriptor handle.</param>
/// <param name="hBottomDiff">Presumed bottom diff memory handle.</param>
public void DivisiveNormalizationBackward(long hCuDnn, long hNormDesc, T fAlpha, long hBottomDataDesc, long hBottomData, long hTopDiff, long hTemp1, long hTemp2, T fBeta, long hBottomDiffDesc, long hBottomDiff)
{
    int nFn = (int)CUDAFN.LCN_CC_BWD;
    long[] rgArg = m_param.AsLong(hCuDnn, hNormDesc, 0, hBottomDataDesc, hBottomData, hTopDiff, hTemp1, hTemp2, 0, hBottomDiffDesc, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4440
/// <summary>
/// Runs the TANH_FWD kernel function. fAlpha and fBeta travel in the floating-point argument array;
/// every handle travels in the long argument array.
/// </summary>
/// <remarks>
/// The long array contains 0 in the slots where fAlpha and fBeta appear in the signature
/// (presumably placeholders so native indices match - confirm).
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="fAlpha">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hBottomDataDesc">Presumed bottom data descriptor handle.</param>
/// <param name="hBottomData">Presumed bottom data memory handle.</param>
/// <param name="fBeta">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hTopDataDesc">Presumed top data descriptor handle.</param>
/// <param name="hTopData">Presumed top data memory handle.</param>
public void TanhForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
{
    int nFn = (int)CUDAFN.TANH_FWD;
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4458
/// <summary>
/// Runs the TANH_BWD kernel function. fAlpha and fBeta travel in the floating-point argument array;
/// every handle travels in the long argument array.
/// </summary>
/// <remarks>
/// The long array contains 0 in the slots where fAlpha and fBeta appear in the signature
/// (presumably placeholders so native indices match - confirm).
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="fAlpha">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hTopDataDesc">Presumed top data descriptor handle.</param>
/// <param name="hTopData">Presumed top data memory handle.</param>
/// <param name="hTopDiffDesc">Presumed top diff descriptor handle.</param>
/// <param name="hTopDiff">Presumed top diff memory handle.</param>
/// <param name="hBottomDataDesc">Presumed bottom data descriptor handle.</param>
/// <param name="hBottomData">Presumed bottom data memory handle.</param>
/// <param name="fBeta">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hBottomDiffDesc">Presumed bottom diff descriptor handle.</param>
/// <param name="hBottomDiff">Presumed bottom diff memory handle.</param>
public void TanhBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
{
    int nFn = (int)CUDAFN.TANH_BWD;
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4480
/// <summary>
/// Runs the ELU_FWD kernel function. fAlpha and fBeta travel in the floating-point argument array;
/// every handle travels in the long argument array.
/// </summary>
/// <remarks>
/// The long array contains 0 in the slots where fAlpha and fBeta appear in the signature
/// (presumably placeholders so native indices match - confirm).
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="fAlpha">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hBottomDataDesc">Presumed bottom data descriptor handle.</param>
/// <param name="hBottomData">Presumed bottom data memory handle.</param>
/// <param name="fBeta">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hTopDataDesc">Presumed top data descriptor handle.</param>
/// <param name="hTopData">Presumed top data memory handle.</param>
public void EluForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
{
    int nFn = (int)CUDAFN.ELU_FWD;
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4498
/// <summary>
/// Runs the ELU_BWD kernel function. fAlpha and fBeta travel in the floating-point argument array;
/// every handle travels in the long argument array.
/// </summary>
/// <remarks>
/// The long array contains 0 in the slots where fAlpha and fBeta appear in the signature
/// (presumably placeholders so native indices match - confirm).
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="fAlpha">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hTopDataDesc">Presumed top data descriptor handle.</param>
/// <param name="hTopData">Presumed top data memory handle.</param>
/// <param name="hTopDiffDesc">Presumed top diff descriptor handle.</param>
/// <param name="hTopDiff">Presumed top diff memory handle.</param>
/// <param name="hBottomDataDesc">Presumed bottom data descriptor handle.</param>
/// <param name="hBottomData">Presumed bottom data memory handle.</param>
/// <param name="fBeta">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hBottomDiffDesc">Presumed bottom diff descriptor handle.</param>
/// <param name="hBottomDiff">Presumed bottom diff memory handle.</param>
public void EluBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
{
    int nFn = (int)CUDAFN.ELU_BWD;
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4520
/// <summary>
/// Runs the SIGMOID_FWD kernel function. fAlpha and fBeta travel in the floating-point argument array;
/// every handle travels in the long argument array.
/// </summary>
/// <remarks>
/// The long array contains 0 in the slots where fAlpha and fBeta appear in the signature
/// (presumably placeholders so native indices match - confirm).
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="fAlpha">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hBottomDataDesc">Presumed bottom data descriptor handle.</param>
/// <param name="hBottomData">Presumed bottom data memory handle.</param>
/// <param name="fBeta">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hTopDataDesc">Presumed top data descriptor handle.</param>
/// <param name="hTopData">Presumed top data memory handle.</param>
public void SigmoidForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
{
    int nFn = (int)CUDAFN.SIGMOID_FWD;
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4538
/// <summary>
/// Runs the SIGMOID_BWD kernel function. fAlpha and fBeta travel in the floating-point argument array;
/// every handle travels in the long argument array.
/// </summary>
/// <remarks>
/// The long array contains 0 in the slots where fAlpha and fBeta appear in the signature
/// (presumably placeholders so native indices match - confirm).
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="fAlpha">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hTopDataDesc">Presumed top data descriptor handle.</param>
/// <param name="hTopData">Presumed top data memory handle.</param>
/// <param name="hTopDiffDesc">Presumed top diff descriptor handle.</param>
/// <param name="hTopDiff">Presumed top diff memory handle.</param>
/// <param name="hBottomDataDesc">Presumed bottom data descriptor handle.</param>
/// <param name="hBottomData">Presumed bottom data memory handle.</param>
/// <param name="fBeta">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hBottomDiffDesc">Presumed bottom diff descriptor handle.</param>
/// <param name="hBottomDiff">Presumed bottom diff memory handle.</param>
public void SigmoidBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
{
    int nFn = (int)CUDAFN.SIGMOID_BWD;
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4560
/// <summary>
/// Runs the RELU_FWD kernel function. fAlpha and fBeta travel in the floating-point argument array;
/// every handle travels in the long argument array.
/// </summary>
/// <remarks>
/// The long array contains 0 in the slots where fAlpha and fBeta appear in the signature
/// (presumably placeholders so native indices match - confirm).
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="fAlpha">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hBottomDataDesc">Presumed bottom data descriptor handle.</param>
/// <param name="hBottomData">Presumed bottom data memory handle.</param>
/// <param name="fBeta">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hTopDataDesc">Presumed top data descriptor handle.</param>
/// <param name="hTopData">Presumed top data memory handle.</param>
public void ReLUForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
{
    int nFn = (int)CUDAFN.RELU_FWD;
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4583
/// <summary>
/// Runs the RELU_BWD kernel function. fAlpha and fBeta travel in the floating-point argument array;
/// every handle travels in the long argument array.
/// </summary>
/// <remarks>
/// The long array contains 0 in the slots where fAlpha and fBeta appear in the signature
/// (presumably placeholders so native indices match - confirm).
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="fAlpha">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hTopDataDesc">Presumed top data descriptor handle.</param>
/// <param name="hTopData">Presumed top data memory handle.</param>
/// <param name="hTopDiffDesc">Presumed top diff descriptor handle.</param>
/// <param name="hTopDiff">Presumed top diff memory handle.</param>
/// <param name="hBottomDataDesc">Presumed bottom data descriptor handle.</param>
/// <param name="hBottomData">Presumed bottom data memory handle.</param>
/// <param name="fBeta">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hBottomDiffDesc">Presumed bottom diff descriptor handle.</param>
/// <param name="hBottomDiff">Presumed bottom diff memory handle.</param>
public void ReLUBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
{
    int nFn = (int)CUDAFN.RELU_BWD;
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4605
/// <summary>
/// Runs the SOFTMAX_FWD kernel function. fAlpha and fBeta travel in the floating-point argument array;
/// the handles, followed by the algorithm and mode as integers, travel in the long argument array.
/// </summary>
/// <remarks>
/// The long array contains 0 in the slots where fAlpha and fBeta would sit between the handles
/// (presumably placeholders so native indices match - confirm); alg and mode are appended last.
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="alg">Softmax algorithm, sent as its integer value.</param>
/// <param name="mode">Softmax mode, sent as its integer value.</param>
/// <param name="fAlpha">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hBottomDataDesc">Presumed bottom data descriptor handle.</param>
/// <param name="hBottomData">Presumed bottom data memory handle.</param>
/// <param name="fBeta">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hTopDataDesc">Presumed top data descriptor handle.</param>
/// <param name="hTopData">Presumed top data memory handle.</param>
public void SoftmaxForward(long hCuDnn, SOFTMAX_ALGORITHM alg, SOFTMAX_MODE mode, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
{
    int nFn = (int)CUDAFN.SOFTMAX_FWD;
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData, (int)alg, (int)mode);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4625
/// <summary>
/// Runs the SOFTMAX_BWD kernel function. fAlpha and fBeta travel in the floating-point argument array;
/// the handles, followed by the algorithm and mode as integers, travel in the long argument array.
/// </summary>
/// <remarks>
/// The long array contains 0 in the slots where fAlpha and fBeta would sit between the handles
/// (presumably placeholders so native indices match - confirm); alg and mode are appended last.
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="alg">Softmax algorithm, sent as its integer value.</param>
/// <param name="mode">Softmax mode, sent as its integer value.</param>
/// <param name="fAlpha">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hTopDataDesc">Presumed top data descriptor handle.</param>
/// <param name="hTopData">Presumed top data memory handle.</param>
/// <param name="hTopDiffDesc">Presumed top diff descriptor handle.</param>
/// <param name="hTopDiff">Presumed top diff memory handle.</param>
/// <param name="fBeta">Scaling factor, converted with convertD/convertF.</param>
/// <param name="hBottomDiffDesc">Presumed bottom diff descriptor handle.</param>
/// <param name="hBottomDiff">Presumed bottom diff memory handle.</param>
public void SoftmaxBackward(long hCuDnn, SOFTMAX_ALGORITHM alg, SOFTMAX_MODE mode, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, T fBeta, long hBottomDiffDesc, long hBottomDiff)
{
    int nFn = (int)CUDAFN.SOFTMAX_BWD;
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, 0, hBottomDiffDesc, hBottomDiff, (int)alg, (int)mode);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4647
/// <summary>
/// Runs CREATE_RNN_DATA_DESCEX when the extended RNN version is enabled, otherwise CREATE_RNN_DATA_DESC,
/// with no arguments, and returns the first result value as a long.
/// </summary>
/// <returns>Element 0 of the kernel result, truncated to long (presumably the new RNN data descriptor handle - confirm).</returns>
public long CreateRnnDataDesc()
{
    CUDAFN fn = CUDAFN.CREATE_RNN_DATA_DESC;
    if (m_bEnableRnnExtendedVersion)
    {
        fn = CUDAFN.CREATE_RNN_DATA_DESCEX;
    }

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgResultF = m_cuda.RunFloat((int)m_hKernel, (int)fn, null);
        return (long)rgResultF[0];
    }

    double[] rgResultD = m_cuda.RunDouble((int)m_hKernel, (int)fn, null);
    return (long)rgResultD[0];
}
4667
/// <summary>
/// Runs FREE_RNN_DATA_DESCEX when the extended RNN version is enabled, otherwise FREE_RNN_DATA_DESC,
/// passing the handle as the single floating-point argument.
/// </summary>
/// <param name="h">Handle value to pass (presumably an RNN data descriptor handle - confirm). It is marshaled as a double or float depending on the active data type.</param>
public void FreeRnnDataDesc(long h)
{
    CUDAFN fn = CUDAFN.FREE_RNN_DATA_DESC;
    if (m_bEnableRnnExtendedVersion)
    {
        fn = CUDAFN.FREE_RNN_DATA_DESCEX;
    }

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDouble((int)m_hKernel, (int)fn, m_param.AsDouble(h));
        return;
    }

    m_cuda.RunFloat((int)m_hKernel, (int)fn, m_param.AsFloat(h));
}
4681
/// <summary>
/// Runs SET_RNN_DATA_DESCEX when the extended RNN version is enabled, otherwise SET_RNN_DATA_DESC,
/// sending the descriptor settings as long arguments.
/// </summary>
/// <remarks>
/// The long arguments are (hRnnDataDesc, layout, nMaxSeqLen, nBatchSize, nVectorSize, bidirectional flag),
/// followed by every entry of <paramref name="rgSeqLen"/> when it is not null.
/// The list is built once and shared by the double and float paths, which send identical arguments.
/// </remarks>
/// <param name="hRnnDataDesc">Presumed RNN data descriptor handle.</param>
/// <param name="layout">Data layout; must be RNN_SEQ_MAJOR_UNPACKED unless the extended RNN version is enabled.</param>
/// <param name="nMaxSeqLen">Maximum sequence length value.</param>
/// <param name="nBatchSize">Batch size value.</param>
/// <param name="nVectorSize">Vector size value.</param>
/// <param name="bBidirectional">Sent as 1 when true, 0 when false.</param>
/// <param name="rgSeqLen">Optional per-sequence lengths appended after the fixed arguments; may be null.</param>
/// <exception cref="Exception">Thrown when the extended RNN version is disabled and the layout is not RNN_SEQ_MAJOR_UNPACKED.</exception>
public void SetRnnDataDesc(long hRnnDataDesc, RNN_DATALAYOUT layout, int nMaxSeqLen, int nBatchSize, int nVectorSize, bool bBidirectional = false, int[] rgSeqLen = null)
{
    if (!m_bEnableRnnExtendedVersion && layout != RNN_DATALAYOUT.RNN_SEQ_MAJOR_UNPACKED)
        throw new Exception("The non-extended functions only support RNN_SEQ_MAJOR ordering.");

    int nFn = (m_bEnableRnnExtendedVersion) ? (int)CUDAFN.SET_RNN_DATA_DESCEX : (int)CUDAFN.SET_RNN_DATA_DESC;

    List<long> rgArg = new List<long>() { hRnnDataDesc, (long)layout, nMaxSeqLen, nBatchSize, nVectorSize, (bBidirectional) ? 1 : 0 };

    if (rgSeqLen != null)
    {
        foreach (int nSeqLen in rgSeqLen)
        {
            rgArg.Add(nSeqLen);
        }
    }

    long[] rgArgs = rgArg.ToArray();

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArgs);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArgs);
}
4728
/// <summary>
/// Runs the CREATE_RNN_DESC kernel function with no arguments and returns the first result value as a long.
/// </summary>
/// <returns>Element 0 of the kernel result, truncated to long (presumably the new RNN descriptor handle - confirm).</returns>
public long CreateRnnDesc()
{
    int nFn = (int)CUDAFN.CREATE_RNN_DESC;

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgResultF = m_cuda.RunFloat((int)m_hKernel, nFn, null);
        return (long)rgResultF[0];
    }

    double[] rgResultD = m_cuda.RunDouble((int)m_hKernel, nFn, null);
    return (long)rgResultD[0];
}
4746
/// <summary>
/// Runs the FREE_RNN_DESC kernel function, passing the handle as the single floating-point argument.
/// </summary>
/// <param name="h">Handle value to pass (presumably an RNN descriptor handle - confirm). It is marshaled as a double or float depending on the active data type.</param>
public void FreeRnnDesc(long h)
{
    int nFn = (int)CUDAFN.FREE_RNN_DESC;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDouble((int)m_hKernel, nFn, m_param.AsDouble(h));
        return;
    }

    m_cuda.RunFloat((int)m_hKernel, nFn, m_param.AsFloat(h));
}
4758
/// <summary>
/// Runs the SET_RNN_DESC kernel function, sending all settings as long arguments in signature order.
/// </summary>
/// <remarks>
/// Parameter roles below are inferred from their names - this wrapper only forwards them (confirm against the native layer).
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hRnnDesc">Presumed RNN descriptor handle.</param>
/// <param name="nHiddenCount">Hidden count value.</param>
/// <param name="nNumLayers">Layer count value.</param>
/// <param name="hDropoutDesc">Presumed dropout descriptor handle.</param>
/// <param name="mode">RNN mode, sent as its integer value.</param>
/// <param name="bUseTensorCores">Sent as 1 when true, 0 when false.</param>
/// <param name="direction">RNN direction, sent as its integer value; defaults to RNN_UNIDIRECTIONAL.</param>
public void SetRnnDesc(long hCuDnn, long hRnnDesc, int nHiddenCount, int nNumLayers, long hDropoutDesc, RNN_MODE mode, bool bUseTensorCores, RNN_DIRECTION direction = RNN_DIRECTION.RNN_UNIDIRECTIONAL)
{
    int nFn = (int)CUDAFN.SET_RNN_DESC;
    long[] rgArg = m_param.AsLong(hCuDnn, hRnnDesc, nHiddenCount, nNumLayers, hDropoutDesc, (int)mode, (bUseTensorCores) ? 1 : 0, (long)direction);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
}
4777
/// <summary>
/// Runs the GET_RNN_PARAMCOUNT kernel function and returns the first result value as an int.
/// </summary>
/// <remarks>
/// The long arguments are (hCuDnn, hRnnDesc, hXDesc, extended flag), where the flag is 1 when the
/// extended RNN version is enabled and 0 otherwise.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hRnnDesc">Presumed RNN descriptor handle.</param>
/// <param name="hXDesc">Presumed input (X) descriptor handle.</param>
/// <returns>Element 0 of the kernel result, truncated to int.</returns>
public int GetRnnParamCount(long hCuDnn, long hRnnDesc, long hXDesc)
{
    int nFn = (int)CUDAFN.GET_RNN_PARAMCOUNT;
    long[] rgArg = m_param.AsLong(hCuDnn, hRnnDesc, hXDesc, (m_bEnableRnnExtendedVersion) ? 1 : 0);

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgResultF = m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
        return (int)rgResultF[0];
    }

    double[] rgResultD = m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    return (int)rgResultD[0];
}
4798
/// <summary>
/// Runs the GET_RNN_WORKSPACECOUNT kernel function and returns the first two result values.
/// </summary>
/// <remarks>
/// The long arguments are (hCuDnn, hRnnDesc, extended flag, hXDesc), where the flag is 1 when the
/// extended RNN version is enabled and 0 otherwise.
/// NOTE(review): the flag precedes hXDesc here, whereas GetRnnParamCount sends it after hXDesc;
/// the order is kept as-is - confirm it matches the native argument layout.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hRnnDesc">Presumed RNN descriptor handle.</param>
/// <param name="hXDesc">Presumed input (X) descriptor handle.</param>
/// <param name="nReservedCount">Receives element 1 of the kernel result, converted to ulong.</param>
/// <returns>Element 0 of the kernel result, converted to ulong.</returns>
public ulong GetRnnWorkspaceCount(long hCuDnn, long hRnnDesc, long hXDesc, out ulong nReservedCount)
{
    int nFn = (int)CUDAFN.GET_RNN_WORKSPACECOUNT;
    long[] rgArg = m_param.AsLong(hCuDnn, hRnnDesc, (m_bEnableRnnExtendedVersion) ? 1 : 0, hXDesc);

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgResultF = m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
        nReservedCount = (ulong)rgResultF[1];
        return (ulong)rgResultF[0];
    }

    double[] rgResultD = m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    nReservedCount = (ulong)rgResultD[1];
    return (ulong)rgResultD[0];
}
4822
/// <summary>
/// Runs the GET_RNN_LINLAYERPARAMS kernel function and unpacks the first four result values into the out parameters.
/// </summary>
/// <remarks>
/// The long arguments are (hCuDnn, hRnnDesc, nLayer, hXDesc, hWtDesc, hWtData, nLinLayer, extended flag),
/// where the flag is 1 when the extended RNN version is enabled and 0 otherwise.
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hRnnDesc">Presumed RNN descriptor handle.</param>
/// <param name="nLayer">Layer index value.</param>
/// <param name="hXDesc">Presumed input (X) descriptor handle.</param>
/// <param name="hWtDesc">Presumed weight descriptor handle.</param>
/// <param name="hWtData">Presumed weight data memory handle.</param>
/// <param name="nLinLayer">Linear layer index value.</param>
/// <param name="nWtCount">Receives result element 0 as an int.</param>
/// <param name="hWt">Receives result element 1 as a long.</param>
/// <param name="nBiasCount">Receives result element 2 as an int.</param>
/// <param name="hBias">Receives result element 3 as a long.</param>
public void GetRnnLinLayerParams(long hCuDnn, long hRnnDesc, int nLayer, long hXDesc, long hWtDesc, long hWtData, int nLinLayer, out int nWtCount, out long hWt, out int nBiasCount, out long hBias)
{
    int nFn = (int)CUDAFN.GET_RNN_LINLAYERPARAMS;
    long[] rgArg = m_param.AsLong(hCuDnn, hRnnDesc, nLayer, hXDesc, hWtDesc, hWtData, nLinLayer, (m_bEnableRnnExtendedVersion) ? 1 : 0);

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgResultF = m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
        nWtCount = (int)rgResultF[0];
        hWt = (long)rgResultF[1];
        nBiasCount = (int)rgResultF[2];
        hBias = (long)rgResultF[3];
        return;
    }

    double[] rgResultD = m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    nWtCount = (int)rgResultD[0];
    hWt = (long)rgResultD[1];
    nBiasCount = (int)rgResultD[2];
    hBias = (long)rgResultD[3];
}
4856
/// <summary>
/// Runs the FWD_RNN kernel function, sending every handle, count and flag as a long argument.
/// </summary>
/// <remarks>
/// Arguments are sent in signature order: (hCuDnn, hRnnDesc, X desc/data, Hx desc/data, Cx desc/data,
/// weight desc/data, Y desc/data, Hy desc/data, Cy desc/data, workspace handle/count,
/// reserved handle/count, training flag). A trailing 1 is appended when the extended RNN version is enabled.
/// The list is built once and shared by the double and float paths, which send identical arguments.
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hRnnDesc">Presumed RNN descriptor handle.</param>
/// <param name="hXDesc">Presumed input descriptor handle.</param>
/// <param name="hXData">Presumed input data memory handle.</param>
/// <param name="hHxDesc">Presumed hidden-input descriptor handle.</param>
/// <param name="hHxData">Presumed hidden-input data memory handle.</param>
/// <param name="hCxDesc">Presumed cell-input descriptor handle.</param>
/// <param name="hCxData">Presumed cell-input data memory handle.</param>
/// <param name="hWtDesc">Presumed weight descriptor handle.</param>
/// <param name="hWtData">Presumed weight data memory handle.</param>
/// <param name="hYDesc">Presumed output descriptor handle.</param>
/// <param name="hYData">Presumed output data memory handle.</param>
/// <param name="hHyDesc">Presumed hidden-output descriptor handle.</param>
/// <param name="hHyData">Presumed hidden-output data memory handle.</param>
/// <param name="hCyDesc">Presumed cell-output descriptor handle.</param>
/// <param name="hCyData">Presumed cell-output data memory handle.</param>
/// <param name="hWorkspace">Presumed workspace memory handle.</param>
/// <param name="nWsCount">Workspace count, cast to long.</param>
/// <param name="hReserved">Presumed reserved memory handle.</param>
/// <param name="nResCount">Reserved count, cast to long.</param>
/// <param name="bTraining">Sent as 1 when true, 0 when false.</param>
public void RnnForward(long hCuDnn, long hRnnDesc, long hXDesc, long hXData, long hHxDesc, long hHxData, long hCxDesc, long hCxData, long hWtDesc, long hWtData, long hYDesc, long hYData, long hHyDesc, long hHyData, long hCyDesc, long hCyData, long hWorkspace, ulong nWsCount, long hReserved, ulong nResCount, bool bTraining)
{
    List<long> rgArg = new List<long>()
    {
        hCuDnn, hRnnDesc,
        hXDesc, hXData,
        hHxDesc, hHxData, hCxDesc, hCxData,
        hWtDesc, hWtData,
        hYDesc, hYData,
        hHyDesc, hHyData, hCyDesc, hCyData,
        hWorkspace, (long)nWsCount,
        hReserved, (long)nResCount,
        (bTraining) ? 1 : 0
    };

    // The extended RNN path is signaled by one extra trailing argument.
    if (m_bEnableRnnExtendedVersion)
        rgArg.Add(1);

    long[] rgArgs = rgArg.ToArray();

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FWD_RNN, null, rgArgs);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FWD_RNN, null, rgArgs);
}
4952
/// <summary>
/// Runs the BWD_RNN_DATA kernel function, sending every handle and count as a long argument.
/// </summary>
/// <remarks>
/// Arguments are sent in signature order: (hCuDnn, hRnnDesc, Y desc/data/diff, Hy desc/diff, Cy desc/diff,
/// weight desc/data, Hx desc/data, Cx desc/data, X desc/diff, dHx desc/diff, dCx desc/diff,
/// workspace handle/count, reserved handle/count). A trailing 1 is appended when the extended RNN version is enabled.
/// The list is built once and shared by the double and float paths, which send identical arguments.
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hRnnDesc">Presumed RNN descriptor handle.</param>
/// <param name="hYDesc">Presumed output descriptor handle.</param>
/// <param name="hYData">Presumed output data memory handle.</param>
/// <param name="hYDiff">Presumed output diff memory handle.</param>
/// <param name="hHyDesc">Presumed hidden-output descriptor handle.</param>
/// <param name="hHyDiff">Presumed hidden-output diff memory handle.</param>
/// <param name="hCyDesc">Presumed cell-output descriptor handle.</param>
/// <param name="hCyDiff">Presumed cell-output diff memory handle.</param>
/// <param name="hWtDesc">Presumed weight descriptor handle.</param>
/// <param name="hWtData">Presumed weight data memory handle.</param>
/// <param name="hHxDesc">Presumed hidden-input descriptor handle.</param>
/// <param name="hHxData">Presumed hidden-input data memory handle.</param>
/// <param name="hCxDesc">Presumed cell-input descriptor handle.</param>
/// <param name="hCxData">Presumed cell-input data memory handle.</param>
/// <param name="hXDesc">Presumed input descriptor handle.</param>
/// <param name="hXDiff">Presumed input diff memory handle.</param>
/// <param name="hdHxDesc">Presumed hidden-input diff descriptor handle.</param>
/// <param name="hHxDiff">Presumed hidden-input diff memory handle.</param>
/// <param name="hdCxDesc">Presumed cell-input diff descriptor handle.</param>
/// <param name="hCxDiff">Presumed cell-input diff memory handle.</param>
/// <param name="hWorkspace">Presumed workspace memory handle.</param>
/// <param name="nWsCount">Workspace count, cast to long.</param>
/// <param name="hReserved">Presumed reserved memory handle.</param>
/// <param name="nResCount">Reserved count, cast to long.</param>
public void RnnBackwardData(long hCuDnn, long hRnnDesc, long hYDesc, long hYData, long hYDiff, long hHyDesc, long hHyDiff, long hCyDesc, long hCyDiff, long hWtDesc, long hWtData, long hHxDesc, long hHxData, long hCxDesc, long hCxData, long hXDesc, long hXDiff, long hdHxDesc, long hHxDiff, long hdCxDesc, long hCxDiff, long hWorkspace, ulong nWsCount, long hReserved, ulong nResCount)
{
    List<long> rgArg = new List<long>()
    {
        hCuDnn, hRnnDesc,
        hYDesc, hYData, hYDiff,
        hHyDesc, hHyDiff, hCyDesc, hCyDiff,
        hWtDesc, hWtData,
        hHxDesc, hHxData, hCxDesc, hCxData,
        hXDesc, hXDiff,
        hdHxDesc, hHxDiff, hdCxDesc, hCxDiff,
        hWorkspace, (long)nWsCount,
        hReserved, (long)nResCount
    };

    // The extended RNN path is signaled by one extra trailing argument.
    if (m_bEnableRnnExtendedVersion)
        rgArg.Add(1);

    long[] rgArgs = rgArg.ToArray();

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_RNN_DATA, null, rgArgs);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_RNN_DATA, null, rgArgs);
}
5062
/// <summary>
/// Runs the BWD_RNN_WTS kernel function, sending every handle and count as a long argument.
/// </summary>
/// <remarks>
/// Arguments are sent in signature order: (hCuDnn, hRnnDesc, X desc/data, Hx desc/data, Y desc/data,
/// workspace handle/count, weight desc/diff, reserved handle/count). A trailing 1 is appended when the
/// extended RNN version is enabled.
/// The list is built once and shared by the double and float paths, which send identical arguments.
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hRnnDesc">Presumed RNN descriptor handle.</param>
/// <param name="hXDesc">Presumed input descriptor handle.</param>
/// <param name="hXData">Presumed input data memory handle.</param>
/// <param name="hHxDesc">Presumed hidden-input descriptor handle.</param>
/// <param name="hHxData">Presumed hidden-input data memory handle.</param>
/// <param name="hYDesc">Presumed output descriptor handle.</param>
/// <param name="hYData">Presumed output data memory handle.</param>
/// <param name="hWorkspace">Presumed workspace memory handle.</param>
/// <param name="nWsCount">Workspace count, cast to long.</param>
/// <param name="hWtDesc">Presumed weight descriptor handle.</param>
/// <param name="hWtDiff">Presumed weight diff memory handle.</param>
/// <param name="hReserved">Presumed reserved memory handle.</param>
/// <param name="nResCount">Reserved count, cast to long.</param>
public void RnnBackwardWeights(long hCuDnn, long hRnnDesc, long hXDesc, long hXData, long hHxDesc, long hHxData, long hYDesc, long hYData, long hWorkspace, ulong nWsCount, long hWtDesc, long hWtDiff, long hReserved, ulong nResCount)
{
    List<long> rgArg = new List<long>()
    {
        hCuDnn, hRnnDesc,
        hXDesc, hXData,
        hHxDesc, hHxData,
        hYDesc, hYData,
        hWorkspace, (long)nWsCount,
        hWtDesc, hWtDiff,
        hReserved, (long)nResCount
    };

    // The extended RNN path is signaled by one extra trailing argument.
    if (m_bEnableRnnExtendedVersion)
        rgArg.Add(1);

    long[] rgArgs = rgArg.ToArray();

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_RNN_WTS, null, rgArgs);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_RNN_WTS, null, rgArgs);
}
5137
5138
/// <summary>
/// Runs the RNN8_IS_SUPPORTED kernel function with no arguments and reports whether the first result value equals 1.
/// </summary>
/// <returns>true when element 0 of the kernel result equals 1; otherwise false.</returns>
public bool IsRnn8Supported()
{
    int nFn = (int)CUDAFN.RNN8_IS_SUPPORTED;

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgResultF = m_cuda.RunFloat((int)m_hKernel, nFn, null);
        return rgResultF[0] == 1;
    }

    double[] rgResultD = m_cuda.RunDouble((int)m_hKernel, nFn, null);
    return rgResultD[0] == 1;
}
5155
/// <summary>
/// Runs the RNN8_CREATE kernel function with no arguments and returns the first result value as a long.
/// </summary>
/// <returns>Element 0 of the kernel result, truncated to long (presumably the new RNN8 handle - confirm).</returns>
public long CreateRnn8()
{
    int nFn = (int)CUDAFN.RNN8_CREATE;

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgResultF = m_cuda.RunFloat((int)m_hKernel, nFn, null);
        return (long)rgResultF[0];
    }

    double[] rgResultD = m_cuda.RunDouble((int)m_hKernel, nFn, null);
    return (long)rgResultD[0];
}
5173
/// <summary>
/// Runs the RNN8_FREE kernel function, passing the handle as the single floating-point argument.
/// </summary>
/// <param name="h">Handle value to pass (presumably an RNN8 handle - confirm). It is marshaled as a double or float depending on the active data type.</param>
public void FreeRnn8(long h)
{
    int nFn = (int)CUDAFN.RNN8_FREE;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDouble((int)m_hKernel, nFn, m_param.AsDouble(h));
        return;
    }

    m_cuda.RunFloat((int)m_hKernel, nFn, m_param.AsFloat(h));
}
5185
/// <summary>
/// Runs the RNN8_SET kernel function. The dropout ratio travels as the single floating-point argument;
/// every other setting travels in the long argument array.
/// </summary>
/// <remarks>
/// The long arguments are (hCuDnn, hRnn, training flag, layout, cellMode, biasMode, nSequenceLen, nBatchSize,
/// nInputs, nHidden, nOutputs, nProjection, nNumLayers, seed, bidirectional flag). Enum values are sent as
/// integers, booleans as 1/0, and the seed is cast from ulong to long.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hRnn">Presumed RNN8 handle.</param>
/// <param name="bTraining">Sent as 1 when true, 0 when false.</param>
/// <param name="layout">Data layout, sent as its integer value.</param>
/// <param name="cellMode">Cell mode, sent as its integer value.</param>
/// <param name="biasMode">Bias mode, sent as its integer value.</param>
/// <param name="nSequenceLen">Sequence length value.</param>
/// <param name="nBatchSize">Batch size value.</param>
/// <param name="nInputs">Input count value.</param>
/// <param name="nHidden">Hidden count value.</param>
/// <param name="nOutputs">Output count value.</param>
/// <param name="nProjection">Projection count value.</param>
/// <param name="nNumLayers">Layer count value.</param>
/// <param name="fDropout">Dropout value (widened to double in double mode).</param>
/// <param name="lSeed">Seed value, cast to long.</param>
/// <param name="bBidirectional">Sent as 1 when true, 0 when false.</param>
public void SetRnn8(long hCuDnn, long hRnn, bool bTraining, RNN_DATALAYOUT layout, RNN_MODE cellMode, RNN_BIAS_MODE biasMode, int nSequenceLen, int nBatchSize, int nInputs, int nHidden, int nOutputs, int nProjection, int nNumLayers, float fDropout, ulong lSeed, bool bBidirectional = false)
{
    int nFn = (int)CUDAFN.RNN8_SET;
    long[] rgArg = m_param.AsLong(hCuDnn, hRnn, (bTraining) ? 1 : 0, (int)layout, (int)cellMode, (int)biasMode, nSequenceLen, nBatchSize, nInputs, nHidden, nOutputs, nProjection, nNumLayers, (long)lSeed, (bBidirectional) ? 1 : 0);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble((double)fDropout), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat(fDropout), rgArg);
}
5212
/// <summary>
/// Runs the RNN8_GET_MEMORY_SIZES kernel function with (hCuDnn, hRnn) and unpacks the first three result values.
/// </summary>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hRnn">Presumed RNN8 handle.</param>
/// <param name="szWtCount">Receives result element 0, converted to ulong.</param>
/// <param name="szWorkSize">Receives result element 1, converted to ulong.</param>
/// <param name="szReservedSize">Receives result element 2, converted to ulong.</param>
public void GetRnn8MemorySizes(long hCuDnn, long hRnn, out ulong szWtCount, out ulong szWorkSize, out ulong szReservedSize)
{
    int nFn = (int)CUDAFN.RNN8_GET_MEMORY_SIZES;
    long[] rgArg = m_param.AsLong(hCuDnn, hRnn);

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgResultF = m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
        szWtCount = (ulong)rgResultF[0];
        szWorkSize = (ulong)rgResultF[1];
        szReservedSize = (ulong)rgResultF[2];
        return;
    }

    double[] rgResultD = m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    szWtCount = (ulong)rgResultD[0];
    szWorkSize = (ulong)rgResultD[1];
    szReservedSize = (ulong)rgResultD[2];
}
5238
/// <summary>
/// Runs the RNN8_INIT_WEIGHTS kernel function. The four filler values travel in the floating-point
/// argument array; the handles and filler types travel in the long argument array.
/// </summary>
/// <remarks>
/// Floating-point arguments: (fWtVal, fWtVal2, fBiasVal, fBiasVal2), narrowed to float in float mode.
/// Long arguments: (hCuDnn, hRnn, hWt, wtFt, biasFt) with the filler types sent as integers.
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hRnn">Presumed RNN8 handle.</param>
/// <param name="hWt">Presumed weight memory handle.</param>
/// <param name="wtFt">Weight filler type.</param>
/// <param name="fWtVal">First weight filler value.</param>
/// <param name="fWtVal2">Second weight filler value.</param>
/// <param name="biasFt">Bias filler type.</param>
/// <param name="fBiasVal">First bias filler value.</param>
/// <param name="fBiasVal2">Second bias filler value.</param>
public void InitializeRnn8Weights(long hCuDnn, long hRnn, long hWt, RNN_FILLER_TYPE wtFt, double fWtVal, double fWtVal2, RNN_FILLER_TYPE biasFt, double fBiasVal, double fBiasVal2)
{
    int nFn = (int)CUDAFN.RNN8_INIT_WEIGHTS;
    long[] rgArg = m_param.AsLong(hCuDnn, hRnn, hWt, (int)wtFt, (int)biasFt);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, m_param.AsDouble(fWtVal, fWtVal2, fBiasVal, fBiasVal2), rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, m_param.AsFloat((float)fWtVal, (float)fWtVal2, (float)fBiasVal, (float)fBiasVal2), rgArg);
}
5258
/// <summary>
/// Runs the RNN8_FWD kernel function, sending all handles as long arguments in signature order.
/// </summary>
/// <remarks>
/// Parameter roles below are inferred from their names - this wrapper only forwards them (confirm against the native layer).
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hRnn">Presumed RNN8 handle.</param>
/// <param name="hX">Presumed input memory handle.</param>
/// <param name="hY">Presumed output memory handle.</param>
/// <param name="hhX">Presumed hidden-input memory handle.</param>
/// <param name="hhY">Presumed hidden-output memory handle.</param>
/// <param name="hcX">Presumed cell-input memory handle.</param>
/// <param name="hcY">Presumed cell-output memory handle.</param>
/// <param name="hWts">Presumed weight memory handle.</param>
/// <param name="hWork">Presumed workspace memory handle.</param>
/// <param name="hReserved">Presumed reserved memory handle.</param>
public void Rnn8Forward(long hCuDnn, long hRnn, long hX, long hY, long hhX, long hhY, long hcX, long hcY, long hWts, long hWork, long hReserved)
{
    int nFn = (int)CUDAFN.RNN8_FWD;
    long[] rgArg = m_param.AsLong(hCuDnn, hRnn, hX, hY, hhX, hhY, hcX, hcY, hWts, hWork, hReserved);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
}
5280
/// <summary>
/// Runs the RNN8_BWD kernel function, sending all handles as long arguments in signature order.
/// </summary>
/// <remarks>
/// Parameter roles below are inferred from their names - this wrapper only forwards them (confirm against the native layer).
/// </remarks>
/// <param name="hCuDnn">Presumed cuDNN instance handle.</param>
/// <param name="hRnn">Presumed RNN8 handle.</param>
/// <param name="hY">Presumed output memory handle.</param>
/// <param name="hdY">Presumed output gradient memory handle.</param>
/// <param name="hX">Presumed input memory handle.</param>
/// <param name="hdX">Presumed input gradient memory handle.</param>
/// <param name="hhX">Presumed hidden-input memory handle.</param>
/// <param name="hdhY">Presumed hidden-output gradient memory handle.</param>
/// <param name="hdhX">Presumed hidden-input gradient memory handle.</param>
/// <param name="hcX">Presumed cell-input memory handle.</param>
/// <param name="hdcY">Presumed cell-output gradient memory handle.</param>
/// <param name="hdcX">Presumed cell-input gradient memory handle.</param>
/// <param name="hWt">Presumed weight memory handle.</param>
/// <param name="hdWt">Presumed weight gradient memory handle.</param>
/// <param name="hWork">Presumed workspace memory handle.</param>
/// <param name="hReserved">Presumed reserved memory handle.</param>
public void Rnn8Backward(long hCuDnn, long hRnn, long hY, long hdY, long hX, long hdX, long hhX, long hdhY, long hdhX, long hcX, long hdcY, long hdcX, long hWt, long hdWt, long hWork, long hReserved)
{
    int nFn = (int)CUDAFN.RNN8_BWD;
    long[] rgArg = m_param.AsLong(hCuDnn, hRnn, hY, hdY, hX, hdX, hhX, hdhY, hdhX, hcX, hdcY, hdcX, hWt, hdWt, hWork, hReserved);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
        return;
    }

    m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
}
5307
/// <summary>
/// Allocates memory sized nM * nN via AllocMemory and reports the element count used.
/// </summary>
/// <remarks>
/// The product is computed in a checked context so that an int overflow raises an
/// <see cref="OverflowException"/> instead of silently wrapping to a wrong allocation size.
/// </remarks>
/// <param name="nM">First dimension used for sizing.</param>
/// <param name="nN">Second dimension used for sizing.</param>
/// <param name="nK">Not used by this method.</param>
/// <param name="nCount">Receives nM * nN.</param>
/// <returns>The value returned by AllocMemory(nCount).</returns>
/// <exception cref="OverflowException">Thrown when nM * nN does not fit in an int.</exception>
public long AllocPCAData(int nM, int nN, int nK, out int nCount)
{
    nCount = checked(nM * nN);
    return AllocMemory(nCount);
}
5324
/// <summary>
/// Allocates memory sized nM * nK via AllocMemory and reports the element count used.
/// </summary>
/// <remarks>
/// The product is computed in a checked context so that an int overflow raises an
/// <see cref="OverflowException"/> instead of silently wrapping to a wrong allocation size.
/// </remarks>
/// <param name="nM">First dimension used for sizing.</param>
/// <param name="nN">Not used by this method.</param>
/// <param name="nK">Second dimension used for sizing.</param>
/// <param name="nCount">Receives nM * nK.</param>
/// <returns>The value returned by AllocMemory(nCount).</returns>
/// <exception cref="OverflowException">Thrown when nM * nK does not fit in an int.</exception>
public long AllocPCAScores(int nM, int nN, int nK, out int nCount)
{
    nCount = checked(nM * nK);
    return AllocMemory(nCount);
}
5341
/// <summary>
/// Allocates memory sized nN * nK via AllocMemory and reports the element count used.
/// </summary>
/// <remarks>
/// The product is computed in a checked context so that an int overflow raises an
/// <see cref="OverflowException"/> instead of silently wrapping to a wrong allocation size.
/// </remarks>
/// <param name="nM">Not used by this method.</param>
/// <param name="nN">First dimension used for sizing.</param>
/// <param name="nK">Second dimension used for sizing.</param>
/// <param name="nCount">Receives nN * nK.</param>
/// <returns>The value returned by AllocMemory(nCount).</returns>
/// <exception cref="OverflowException">Thrown when nN * nK does not fit in an int.</exception>
public long AllocPCALoads(int nM, int nN, int nK, out int nCount)
{
    nCount = checked(nN * nK);
    return AllocMemory(nCount);
}
5358
/// <summary>
/// Allocates a host buffer with nK elements via AllocHostBuffer and reports the element count used.
/// </summary>
/// <param name="nM">Not used by this method.</param>
/// <param name="nN">Not used by this method.</param>
/// <param name="nK">Number of elements to allocate.</param>
/// <param name="nCount">Receives nK.</param>
/// <returns>The value returned by AllocHostBuffer(nCount).</returns>
public long AllocPCAEigenvalues(int nM, int nN, int nK, out int nCount)
{
    // One value per component: nK x 1.
    int nElements = nK;
    nCount = nElements;

    return AllocHostBuffer(nElements);
}
5375
/// <summary>
/// Runs the CUDA_CREATE_PCA kernel function and returns the first result value as a long.
/// </summary>
/// <remarks>
/// All arguments are sent as long values in signature order.
/// Parameter roles below are inferred from their names - this wrapper only forwards them.
/// </remarks>
/// <param name="nMaxIterations">Maximum iteration count value.</param>
/// <param name="nM">M dimension value.</param>
/// <param name="nN">N dimension value.</param>
/// <param name="nK">K dimension value.</param>
/// <param name="hData">Presumed data memory handle.</param>
/// <param name="hScoresResult">Presumed scores result memory handle.</param>
/// <param name="hLoadsResult">Presumed loads result memory handle.</param>
/// <param name="hResiduals">Presumed residuals memory handle; defaults to 0.</param>
/// <param name="hEigenvalues">Presumed eigenvalues memory handle; defaults to 0.</param>
/// <returns>Element 0 of the kernel result, truncated to long (presumably the new PCA handle - confirm).</returns>
public long CreatePCA(int nMaxIterations, int nM, int nN, int nK, long hData, long hScoresResult, long hLoadsResult, long hResiduals = 0, long hEigenvalues = 0)
{
    int nFn = (int)CUDAFN.CUDA_CREATE_PCA;
    long[] rgArg = m_param.AsLong(nMaxIterations, nM, nN, nK, hData, hScoresResult, hLoadsResult, hResiduals, hEigenvalues);

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgResultF = m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
        return (long)rgResultF[0];
    }

    double[] rgResultD = m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    return (long)rgResultD[0];
}
5405
/// <summary>
/// Runs the CUDA_RUN_PCA kernel function with (hPCA, nSteps) and unpacks the first three result values.
/// </summary>
/// <param name="hPCA">Presumed PCA handle.</param>
/// <param name="nSteps">Step count value forwarded to the kernel.</param>
/// <param name="nCurrentK">Receives result element 2 as an int.</param>
/// <param name="nCurrentIteration">Receives result element 1 as an int.</param>
/// <returns>true when result element 0 equals 1; otherwise false.</returns>
public bool RunPCA(long hPCA, int nSteps, out int nCurrentK, out int nCurrentIteration)
{
    int nFn = (int)CUDAFN.CUDA_RUN_PCA;
    long[] rgArg = m_param.AsLong(hPCA, nSteps);

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgResultF = m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
        bool bDoneF = rgResultF[0] == 1.0f;
        nCurrentIteration = (int)rgResultF[1];
        nCurrentK = (int)rgResultF[2];
        return bDoneF;
    }

    double[] rgResultD = m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    bool bDoneD = rgResultD[0] == 1.0;
    nCurrentIteration = (int)rgResultD[1];
    nCurrentK = (int)rgResultD[2];
    return bDoneD;
}
5438
/// <summary>
/// Runs the CUDA_FREE_PCA kernel function, passing the handle as the single floating-point argument.
/// </summary>
/// <param name="hPCA">Handle value to pass (presumably a PCA handle - confirm). It is marshaled as a double or float depending on the active data type.</param>
public void FreePCA(long hPCA)
{
    int nFn = (int)CUDAFN.CUDA_FREE_PCA;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDouble((int)m_hKernel, nFn, m_param.AsDouble(hPCA));
        return;
    }

    m_cuda.RunFloat((int)m_hKernel, nFn, m_param.AsFloat(hPCA));
}
5453
/// <summary>
/// Runs the CUDA_CREATE_SSD kernel function with the SSD settings packed into a floating-point
/// argument array, and returns the first result value as a long.
/// </summary>
/// <remarks>
/// Arguments 0-21 are always sent (index 0 is the id returned by GetDeviceID). Arguments 22-24
/// (NMS threshold, NMS top-K, NMS eta) are appended only when <paramref name="bNmsParam"/> is true.
/// Booleans are sent as 1/0 and enum values as integers.
/// The list is built once as doubles and narrowed element-by-element for the float path. Every value
/// originates as an int, bool, enum or float, so the narrowed values equal those of a direct float conversion.
/// </remarks>
/// <param name="nNumClasses">Argument 1.</param>
/// <param name="bShareLocation">Argument 2.</param>
/// <param name="nLocClasses">Argument 3.</param>
/// <param name="nBackgroundLabelId">Argument 4.</param>
/// <param name="bUseDiffcultGt">Argument 5.</param>
/// <param name="miningType">Argument 6.</param>
/// <param name="matchType">Argument 7.</param>
/// <param name="fOverlapThreshold">Argument 8.</param>
/// <param name="bUsePriorForMatching">Argument 9.</param>
/// <param name="codeType">Argument 10.</param>
/// <param name="bEncodeVariantInTgt">Argument 11.</param>
/// <param name="bBpInside">Argument 12.</param>
/// <param name="bIgnoreCrossBoundaryBbox">Argument 13.</param>
/// <param name="bUsePriorForNms">Argument 14.</param>
/// <param name="confLossType">Argument 15.</param>
/// <param name="locLossType">Argument 16.</param>
/// <param name="fNegPosRatio">Argument 17.</param>
/// <param name="fNegOverlap">Argument 18.</param>
/// <param name="nSampleSize">Argument 19.</param>
/// <param name="bMapObjectToAgnostic">Argument 20.</param>
/// <param name="bNmsParam">Argument 21; when true the three NMS arguments are appended.</param>
/// <param name="fNmsThreshold">Argument 22 when bNmsParam is true; required in that case.</param>
/// <param name="nNmsTopK">Argument 23 when bNmsParam is true; -1 is sent when null.</param>
/// <param name="fNmsEta">Argument 24 when bNmsParam is true; 1 is sent when null.</param>
/// <returns>Element 0 of the kernel result, truncated to long (presumably the new SSD handle - confirm).</returns>
/// <exception cref="Exception">Thrown when bNmsParam is true and fNmsThreshold has no value.</exception>
public long CreateSSD(int nNumClasses, bool bShareLocation, int nLocClasses, int nBackgroundLabelId, bool bUseDiffcultGt, SSD_MINING_TYPE miningType, SSD_MATCH_TYPE matchType, float fOverlapThreshold, bool bUsePriorForMatching, SSD_CODE_TYPE codeType, bool bEncodeVariantInTgt, bool bBpInside, bool bIgnoreCrossBoundaryBbox, bool bUsePriorForNms, SSD_CONF_LOSS_TYPE confLossType, SSD_LOC_LOSS_TYPE locLossType, float fNegPosRatio, float fNegOverlap, int nSampleSize, bool bMapObjectToAgnostic, bool bNmsParam, float? fNmsThreshold = null, int? nNmsTopK = null, float? fNmsEta = null)
{
    int nGpuID = GetDeviceID();

    List<double> rgArg = new List<double>(25)
    {
        /* 0 */  nGpuID,
        /* 1 */  nNumClasses,
        /* 2 */  (bShareLocation) ? 1 : 0,
        /* 3 */  nLocClasses,
        /* 4 */  nBackgroundLabelId,
        /* 5 */  (bUseDiffcultGt) ? 1 : 0,
        /* 6 */  (int)miningType,
        /* 7 */  (int)matchType,
        /* 8 */  fOverlapThreshold,
        /* 9 */  (bUsePriorForMatching) ? 1 : 0,
        /* 10 */ (int)codeType,
        /* 11 */ (bEncodeVariantInTgt) ? 1 : 0,
        /* 12 */ (bBpInside) ? 1 : 0,
        /* 13 */ (bIgnoreCrossBoundaryBbox) ? 1 : 0,
        /* 14 */ (bUsePriorForNms) ? 1 : 0,
        /* 15 */ (int)confLossType,
        /* 16 */ (int)locLossType,
        /* 17 */ fNegPosRatio,
        /* 18 */ fNegOverlap,
        /* 19 */ nSampleSize,
        /* 20 */ (bMapObjectToAgnostic) ? 1 : 0,
        /* 21 */ (bNmsParam) ? 1 : 0
    };

    if (bNmsParam)
    {
        if (!fNmsThreshold.HasValue)
            throw new Exception("An NMS threshold must be specified when the 'bNmsParam' is true.");

        /* 22 */
        rgArg.Add(fNmsThreshold.GetValueOrDefault(0));
        /* 23 */
        rgArg.Add(nNmsTopK.GetValueOrDefault(-1));
        /* 24 */
        rgArg.Add(fNmsEta.GetValueOrDefault(1));
    }

    if (m_dt == DataType.DOUBLE)
    {
        double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CUDA_CREATE_SSD, rgArg.ToArray());
        return (long)rg[0];
    }
    else
    {
        // Narrow the shared argument list for the float entry point.
        float[] rgArgF = new float[rgArg.Count];
        for (int i = 0; i < rgArgF.Length; i++)
        {
            rgArgF[i] = (float)rgArg[i];
        }

        float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CUDA_CREATE_SSD, rgArgF);
        return (long)rg[0];
    }
}
5617
/// <summary>
/// Runs the CUDA_SETUP_SSD kernel function for an SSD instance.
/// </summary>
/// <remarks>
/// All four arguments are forwarded, in declaration order, as floating point values of the active base type.
/// </remarks>
/// <param name="hSSD">Specifies the handle to the SSD instance.</param>
/// <param name="nNum">Forwarded to the kernel function (presumably the number of items - confirm).</param>
/// <param name="nNumPriors">Forwarded to the kernel function (presumably the number of prior boxes - confirm).</param>
/// <param name="nNumGt">Forwarded to the kernel function (presumably the number of ground truths - confirm).</param>
public void SetupSSD(long hSSD, int nNum, int nNumPriors, int nNumGt)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_SETUP_SSD;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDouble(nKernel, nFn, m_param.AsDouble(hSSD, nNum, nNumPriors, nNumGt));
        return;
    }

    m_cuda.RunFloat(nKernel, nFn, m_param.AsFloat(hSSD, nNum, nNumPriors, nNumGt));
}
5632
/// <summary>
/// Runs the CUDA_FREE_SSD kernel function for the given SSD handle.
/// </summary>
/// <param name="hSSD">Specifies the handle to the SSD instance; it is passed to the kernel function as a floating point value of the active base type.</param>
public void FreeSSD(long hSSD)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_FREE_SSD;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDouble(nKernel, nFn, m_param.AsDouble(hSSD));
        return;
    }

    m_cuda.RunFloat(nKernel, nFn, m_param.AsFloat(hSSD));
}
5644
/// <summary>
/// Runs the CUDA_SSD_FWD_MULTIBOXLOSS kernel function and unpacks its flat result array into the match indices and negative indices.
/// </summary>
/// <remarks>
/// The result array is read sequentially as:
///   [0] match count, [1] number of negatives,
///   [2] number of match maps, then for each map: item count followed by (label, index count, indices...) entries,
///   then the number of negative lists, then for each list: index count followed by the indices.
/// The float and double paths share a single parser; float results are widened to double first, which leaves the
/// integer values read from the array unchanged.
/// </remarks>
/// <param name="hSSD">Specifies the handle to the SSD instance.</param>
/// <param name="nLocDataCount">Specifies the count associated with <paramref name="hLocGpuData"/>.</param>
/// <param name="hLocGpuData">Specifies the handle to the location data (presumably GPU memory - confirm).</param>
/// <param name="nConfDataCount">Specifies the count associated with <paramref name="hConfGpuData"/>.</param>
/// <param name="hConfGpuData">Specifies the handle to the confidence data (presumably GPU memory - confirm).</param>
/// <param name="nPriorDataCount">Specifies the count associated with <paramref name="hPriorGpuData"/>.</param>
/// <param name="hPriorGpuData">Specifies the handle to the prior data (presumably GPU memory - confirm).</param>
/// <param name="nGtDataCount">Specifies the count associated with <paramref name="hGtGpuData"/>.</param>
/// <param name="hGtGpuData">Specifies the handle to the ground truth data (presumably GPU memory - confirm).</param>
/// <param name="rgAllMatchIndices">Returns one label-to-index-list map per match map reported by the kernel function.</param>
/// <param name="rgrgAllNegIndices">Returns one index list per negative list reported by the kernel function.</param>
/// <param name="nNumNegs">Returns the second value of the kernel result.</param>
/// <returns>The first value of the kernel result (the match count) is returned.</returns>
public int SsdMultiBoxLossForward(long hSSD, int nLocDataCount, long hLocGpuData, int nConfDataCount, long hConfGpuData, int nPriorDataCount, long hPriorGpuData, int nGtDataCount, long hGtGpuData, out List<DictionaryMap<List<int>>> rgAllMatchIndices, out List<List<int>> rgrgAllNegIndices, out int nNumNegs)
{
    rgAllMatchIndices = new List<DictionaryMap<List<int>>>();
    rgrgAllNegIndices = new List<List<int>>();

    long[] rgArg = m_param.AsLong(hSSD, nLocDataCount, hLocGpuData, nConfDataCount, hConfGpuData, nPriorDataCount, hPriorGpuData, nGtDataCount, hGtGpuData);
    double[] rg;

    if (m_dt == DataType.DOUBLE)
    {
        rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SSD_FWD_MULTIBOXLOSS, null, rgArg);
    }
    else
    {
        // Widen the float result so that a single parser serves both base types.
        float[] rgF = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SSD_FWD_MULTIBOXLOSS, null, rgArg);
        rg = Array.ConvertAll(rgF, f => (double)f);
    }

    int nIdx = 0;
    int nMatchCount = (int)rg[nIdx++];
    nNumNegs = (int)rg[nIdx++];

    // Get the match indices.
    int nMapListCount = (int)rg[nIdx++];
    for (int i = 0; i < nMapListCount; i++)
    {
        DictionaryMap<List<int>> map = new DictionaryMap<List<int>>(null);

        int nMapCount = (int)rg[nIdx++];
        for (int j = 0; j < nMapCount; j++)
        {
            int nLabel = (int)rg[nIdx++];
            map[nLabel] = readCountedIntList(rg, ref nIdx);
        }

        rgAllMatchIndices.Add(map);
    }

    // Get the neg indices.
    int nNegListCount = (int)rg[nIdx++];
    for (int i = 0; i < nNegListCount; i++)
    {
        rgrgAllNegIndices.Add(readCountedIntList(rg, ref nIdx));
    }

    return nMatchCount;
}

/// <summary>
/// Reads a length-prefixed run of integers from the flat result array, advancing the cursor past the run.
/// </summary>
/// <param name="rg">Specifies the flat result array.</param>
/// <param name="nIdx">Specifies the read cursor, which on return points just past the last item read.</param>
/// <returns>The list of integers read is returned.</returns>
private static List<int> readCountedIntList(double[] rg, ref int nIdx)
{
    int nItemCount = (int)rg[nIdx++];
    List<int> rgItems = new List<int>();

    for (int i = 0; i < nItemCount; i++)
    {
        rgItems.Add((int)rg[nIdx++]);
    }

    return rgItems;
}
5785
/// <summary>
/// Runs the CUDA_SSD_ENCODE_LOCPRED kernel function.
/// </summary>
/// <remarks>
/// All arguments are forwarded to the kernel function, in declaration order, as long values.
/// </remarks>
/// <param name="hSSD">Specifies the handle to the SSD instance.</param>
/// <param name="nLocPredCount">Specifies the count associated with <paramref name="hLocPred"/>.</param>
/// <param name="hLocPred">Specifies the handle to the location prediction data.</param>
/// <param name="nLocGtCount">Specifies the count associated with <paramref name="hLocGt"/>.</param>
/// <param name="hLocGt">Specifies the handle to the location ground truth data.</param>
public void SsdEncodeLocPrediction(long hSSD, int nLocPredCount, long hLocPred, int nLocGtCount, long hLocGt)
{
    long[] rgArg = m_param.AsLong(hSSD, nLocPredCount, hLocPred, nLocGtCount, hLocGt);
    int nFn = (int)CUDAFN.CUDA_SSD_ENCODE_LOCPRED;

    if (m_dt != DataType.DOUBLE)
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    else
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
}
5801
/// <summary>
/// Runs the CUDA_SSD_ENCODE_CONFPRED kernel function.
/// </summary>
/// <remarks>
/// All arguments are forwarded to the kernel function, in declaration order, as long values.
/// </remarks>
/// <param name="hSSD">Specifies the handle to the SSD instance.</param>
/// <param name="nConfPredCount">Specifies the count associated with <paramref name="hConfPred"/>.</param>
/// <param name="hConfPred">Specifies the handle to the confidence prediction data.</param>
/// <param name="nConfGtCount">Specifies the count associated with <paramref name="hConfGt"/>.</param>
/// <param name="hConfGt">Specifies the handle to the confidence ground truth data.</param>
public void SsdEncodeConfPrediction(long hSSD, int nConfPredCount, long hConfPred, int nConfGtCount, long hConfGt)
{
    long[] rgArg = m_param.AsLong(hSSD, nConfPredCount, hConfPred, nConfGtCount, hConfGt);
    int nFn = (int)CUDAFN.CUDA_SSD_ENCODE_CONFPRED;

    if (m_dt != DataType.DOUBLE)
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    else
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
}
5817
/// <summary>
/// Runs the CUDA_CREATE_LAYERNORM kernel function and returns the handle it reports.
/// </summary>
/// <remarks>
/// The epsilon is passed as the single floating point argument; the remaining values are passed as long arguments
/// followed by a literal 0 (the meaning of that trailing slot is not visible here).
/// </remarks>
/// <param name="nGpuID">Specifies the GPU ID forwarded to the kernel function.</param>
/// <param name="nCount">Forwarded to the kernel function.</param>
/// <param name="nOuterNum">Forwarded to the kernel function.</param>
/// <param name="nChannels">Forwarded to the kernel function.</param>
/// <param name="nInnerNum">Forwarded to the kernel function.</param>
/// <param name="fEps">Optionally, specifies the epsilon value (default = 1e-10).</param>
/// <returns>The first value of the kernel result, cast to a long, is returned as the handle.</returns>
public long CreateLayerNorm(int nGpuID, int nCount, int nOuterNum, int nChannels, int nInnerNum, float fEps = 1e-10f)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_CREATE_LAYERNORM;

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgF = m_cuda.RunFloatEx2(nKernel, nFn, m_param.AsFloat(fEps), m_param.AsLong(nGpuID, nCount, nOuterNum, nChannels, nInnerNum, 0));
        return (long)rgF[0];
    }

    double[] rgD = m_cuda.RunDoubleEx2(nKernel, nFn, m_param.AsDouble(fEps), m_param.AsLong(nGpuID, nCount, nOuterNum, nChannels, nInnerNum, 0));
    return (long)rgD[0];
}
5841
/// <summary>
/// Runs the CUDA_FREE_LAYERNORM kernel function for the given LayerNorm handle.
/// </summary>
/// <param name="hLayerNorm">Specifies the handle to the LayerNorm instance; it is passed to the kernel function as a floating point value of the active base type.</param>
public void FreeLayerNorm(long hLayerNorm)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_FREE_LAYERNORM;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDouble(nKernel, nFn, m_param.AsDouble(hLayerNorm));
        return;
    }

    m_cuda.RunFloat(nKernel, nFn, m_param.AsFloat(hLayerNorm));
}
5853
/// <summary>
/// Runs the CUDA_LAYERNORM_FWD kernel function.
/// </summary>
/// <param name="hLayerNorm">Specifies the handle to the LayerNorm instance.</param>
/// <param name="hXdata">Specifies the handle to the X data (presumably the input - confirm).</param>
/// <param name="hYdata">Specifies the handle to the Y data (presumably the output - confirm).</param>
public void LayerNormForward(long hLayerNorm, long hXdata, long hYdata)
{
    long[] rgArg = m_param.AsLong(hLayerNorm, hXdata, hYdata);
    int nFn = (int)CUDAFN.CUDA_LAYERNORM_FWD;

    if (m_dt != DataType.DOUBLE)
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    else
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
}
5867
/// <summary>
/// Runs the CUDA_LAYERNORM_BWD kernel function.
/// </summary>
/// <param name="hLayerNorm">Specifies the handle to the LayerNorm instance.</param>
/// <param name="hYdata">Specifies the handle to the Y data.</param>
/// <param name="hYdiff">Specifies the handle to the Y diff.</param>
/// <param name="hXdiff">Specifies the handle to the X diff (presumably where the result is written - confirm).</param>
public void LayerNormBackward(long hLayerNorm, long hYdata, long hYdiff, long hXdiff)
{
    long[] rgArg = m_param.AsLong(hLayerNorm, hYdata, hYdiff, hXdiff);
    int nFn = (int)CUDAFN.CUDA_LAYERNORM_BWD;

    if (m_dt != DataType.DOUBLE)
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    else
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
}
5882
5883 #endregion
5884
5885 //---------------------------------------------------------------------
5886 // ICudaMath Methods
5887 //---------------------------------------------------------------------
5888 #region ICudaMath Methods
5889
/// <summary>
/// Sets memory using a <i>double</i> value by converting it to the base type T and delegating to the T-typed overload.
/// </summary>
/// <param name="nCount">Specifies the number of items.</param>
/// <param name="hHandle">Specifies the handle to the memory.</param>
/// <param name="fVal">Specifies the value to set.</param>
/// <param name="nIdx">Optionally, specifies an index forwarded to the T-typed overload (default = -1).</param>
public void set(int nCount, long hHandle, double fVal, int nIdx = -1)
{
    T tVal = (T)Convert.ChangeType(fVal, typeof(T));
    set(nCount, hHandle, tVal, nIdx);
}
5901
/// <summary>
/// Sets memory using a <i>float</i> value by converting it to the base type T and delegating to the T-typed overload.
/// </summary>
/// <param name="nCount">Specifies the number of items.</param>
/// <param name="hHandle">Specifies the handle to the memory.</param>
/// <param name="fVal">Specifies the value to set.</param>
/// <param name="nIdx">Optionally, specifies an index forwarded to the T-typed overload (default = -1).</param>
public void set(int nCount, long hHandle, float fVal, int nIdx = -1)
{
    T tVal = (T)Convert.ChangeType(fVal, typeof(T));
    set(nCount, hHandle, tVal, nIdx);
}
5913
/// <summary>
/// Sets memory to a value, either through the CUDA_SET kernel function or directly in ghost memory when it is in use.
/// </summary>
/// <remarks>
/// When ghost memory is active the kernel is not called: a non-negative <paramref name="nIdx"/> sets just that element,
/// otherwise the whole ghost array for the handle is filled. <paramref name="nCount"/> and <paramref name="nXOff"/>
/// are only used on the kernel path.
/// </remarks>
/// <param name="nCount">Specifies the number of items.</param>
/// <param name="hHandle">Specifies the handle to the memory.</param>
/// <param name="fVal">Specifies the value to set.</param>
/// <param name="nIdx">Optionally, specifies the index of a single item to set (default = -1).</param>
/// <param name="nXOff">Optionally, specifies an offset forwarded to the kernel function (default = 0).</param>
public void set(int nCount, long hHandle, T fVal, int nIdx = -1, int nXOff = 0)
{
    // The ghost memory handling does not depend on the base type.
    if (m_rgGhostMemory != null)
    {
        if (nIdx < 0)
            Utility.Set<T>(m_rgGhostMemory[hHandle], fVal);
        else
            m_rgGhostMemory[hHandle][nIdx] = fVal;

        return;
    }

    if (m_dt == DataType.DOUBLE)
    {
        double dfVal = convertD(fVal);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SET, m_param.AsDouble(dfVal), m_param.AsLong(nCount, hHandle, 0, nIdx, nXOff));
    }
    else
    {
        float fValF = convertF(fVal);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SET, m_param.AsFloat(fValF), m_param.AsLong(nCount, hHandle, 0, nIdx, nXOff));
    }
}
5953
/// <summary>
/// Queries memory through <c>get</c> and returns the result converted to an array of <i>double</i>.
/// </summary>
/// <param name="nCount">Specifies the number of items.</param>
/// <param name="hHandle">Specifies the handle to the memory.</param>
/// <param name="nIdx">Optionally, specifies an index forwarded to <c>get</c> (default = -1).</param>
/// <returns>The queried values are returned as <i>double</i>.</returns>
public double[] get_double(int nCount, long hHandle, int nIdx = -1)
{
    T[] rgData = get(nCount, hHandle, nIdx);
    return convertD(rgData);
}
5965
/// <summary>
/// Queries memory through <c>get</c> and returns the result converted to an array of <i>float</i>.
/// </summary>
/// <param name="nCount">Specifies the number of items.</param>
/// <param name="hHandle">Specifies the handle to the memory.</param>
/// <param name="nIdx">Optionally, specifies an index forwarded to <c>get</c> (default = -1).</param>
/// <returns>The queried values are returned as <i>float</i>.</returns>
public float[] get_float(int nCount, long hHandle, int nIdx = -1)
{
    T[] rgData = get(nCount, hHandle, nIdx);
    return convertF(rgData);
}
5977
/// <summary>
/// Runs the CUDA_GET kernel function and returns its result converted to the base type T.
/// </summary>
/// <param name="nCount">Specifies the number of items.</param>
/// <param name="hHandle">Specifies the handle to the memory.</param>
/// <param name="nIdx">Optionally, specifies an index forwarded to the kernel function (default = -1).</param>
/// <returns>The kernel result is returned as an array of T.</returns>
public T[] get(int nCount, long hHandle, int nIdx = -1)
{
    long[] rgArg = m_param.AsLong(nCount, hHandle, nIdx);
    int nFn = (int)CUDAFN.CUDA_GET;

    if (m_dt != DataType.DOUBLE)
        return convert(m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg));

    return convert(m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg));
}
5992
/// <summary>
/// Runs the CUDA_COPY kernel function to copy from a source to a destination.
/// </summary>
/// <remarks>
/// Each nullable half-size override is encoded for the kernel as -1 (not specified), 0 (false) or 1 (true).
/// </remarks>
/// <param name="nCount">Specifies the number of items.</param>
/// <param name="hSrc">Specifies the handle to the source.</param>
/// <param name="hDst">Specifies the handle to the destination.</param>
/// <param name="nSrcOffset">Optionally, specifies the source offset (default = 0).</param>
/// <param name="nDstOffset">Optionally, specifies the destination offset (default = 0).</param>
/// <param name="hStream">Optionally, specifies a stream handle forwarded to the kernel function (default = -1).</param>
/// <param name="bSrcHalfSizeOverride">Optionally, specifies the source half-size override (default = null).</param>
/// <param name="bDstHalfSizeOverride">Optionally, specifies the destination half-size override (default = null).</param>
public void copy(int nCount, long hSrc, long hDst, int nSrcOffset = 0, int nDstOffset = 0, long hStream = -1, bool? bSrcHalfSizeOverride = null, bool? bDstHalfSizeOverride = null)
{
    int nSrcHalfSizeOverride = bSrcHalfSizeOverride.HasValue ? (bSrcHalfSizeOverride.Value ? 1 : 0) : -1;
    int nDstHalfSizeOverride = bDstHalfSizeOverride.HasValue ? (bDstHalfSizeOverride.Value ? 1 : 0) : -1;

    long[] rgArg = m_param.AsLong(nCount, hSrc, hDst, nSrcOffset, nDstOffset, hStream, nSrcHalfSizeOverride, nDstHalfSizeOverride);
    int nFn = (int)CUDAFN.CUDA_COPY;

    if (m_dt != DataType.DOUBLE)
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    else
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
}
6023
/// <summary>
/// Runs the CUDA_COPY_SIM kernel function.
/// </summary>
/// <remarks>
/// All arguments are forwarded in declaration order; <paramref name="bInvert"/> is encoded as 1 or 0.
/// </remarks>
/// <param name="nCount">Specifies the number of items.</param>
/// <param name="nNum">Forwarded to the kernel function.</param>
/// <param name="nDim">Forwarded to the kernel function.</param>
/// <param name="hSrc1">Specifies the handle to the first source.</param>
/// <param name="hSrc2">Specifies the handle to the second source.</param>
/// <param name="hDst">Specifies the handle to the destination.</param>
/// <param name="hSimilar">Specifies the handle to the similarity data (presumably selects between the sources - confirm).</param>
/// <param name="bInvert">Optionally, specifies the invert flag (default = false).</param>
public void copy(int nCount, int nNum, int nDim, long hSrc1, long hSrc2, long hDst, long hSimilar, bool bInvert = false)
{
    int nInvert = bInvert ? 1 : 0;
    long[] rgArg = m_param.AsLong(nCount, nNum, nDim, hSrc1, hSrc2, hDst, hSimilar, nInvert);
    int nFn = (int)CUDAFN.CUDA_COPY_SIM;

    if (m_dt != DataType.DOUBLE)
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    else
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
}
6042
/// <summary>
/// Runs the CUDA_COPY_BATCH kernel function.
/// </summary>
/// <remarks>
/// All thirteen arguments are forwarded to the kernel function, in declaration order, as long values.
/// </remarks>
public void copy_batch(int nCount, int nNum, int nDim, long hSrcData, long hSrcLbl, int nDstCount, long hDstCache, long hWorkDevData, int nLabelStart, int nLabelCount, int nCacheSize, long hCacheHostCursors, long hWorkDataHost)
{
    long[] rgArg = m_param.AsLong(nCount, nNum, nDim, hSrcData, hSrcLbl, nDstCount, hDstCache, hWorkDevData, nLabelStart, nLabelCount, nCacheSize, hCacheHostCursors, hWorkDataHost);
    int nFn = (int)CUDAFN.CUDA_COPY_BATCH;

    if (m_dt != DataType.DOUBLE)
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    else
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
}
6069
/// <summary>
/// Validates the top lists and runs the CUDA_COPY_SEQUENCE kernel function.
/// </summary>
/// <remarks>
/// The expected number of tops is 2 + nK, plus one more when labels are output. The kernel receives the fifteen
/// fixed arguments, followed by every top handle and then every top count.
/// </remarks>
/// <param name="nK">Specifies the 'k' value, which must be in the range [0,10] and 0 when combining positive and negative.</param>
/// <param name="rghTop">Specifies the top handles; its count must match the expected number of tops.</param>
/// <param name="rgnTopCount">Specifies the top counts; its count must match that of <paramref name="rghTop"/>.</param>
/// <param name="bOutputLabels">Specifies whether a label top is included.</param>
/// <param name="bCombinePositiveAndNegative">Optionally, specifies to combine positive and negative (default = false).</param>
/// <param name="nSeed">Optionally, specifies a seed forwarded to the kernel function (default = 0).</param>
/// <exception cref="ArgumentOutOfRangeException">Thrown when nK or either list fails validation.</exception>
public void copy_sequence(int nK, int nNum, int nDim, long hSrcData, long hSrcLbl, int nSrcCacheCount, long hSrcCache, int nLabelStart, int nLabelCount, int nCacheSize, long hCacheHostCursors, bool bOutputLabels, List<long> rghTop, List<int> rgnTopCount, long hWorkDataHost, bool bCombinePositiveAndNegative = false, int nSeed = 0)
{
    int nTopCount = (bOutputLabels) ? 3 + nK : 2 + nK;

    if (nK != 0 && bCombinePositiveAndNegative)
        throw new ArgumentOutOfRangeException("nK", "When using 'bCombinePositiveAndNegative', nK should be 0.");

    if (!(nK >= 0 && nK <= 10))
        throw new ArgumentOutOfRangeException("nK", "The 'nK' parameter must be within the range [0,10]!");

    if (rghTop.Count != nTopCount)
        throw new ArgumentOutOfRangeException("rghTop", "The 'rghTop' count must equal '" + nTopCount.ToString() + "' given nK = " + nK.ToString() + " and bOutputLabels = " + bOutputLabels.ToString() + "!");

    if (rgnTopCount.Count != rghTop.Count)
        throw new ArgumentOutOfRangeException("rgnTopCount", "The 'rgnTopCount' count must equal the 'rghTop' count!");

    // The argument layout is the same for both base types: fixed arguments, top handles, top counts.
    List<long> rgArg = new List<long>(new long[] { nK, nNum, nDim, hSrcData, hSrcLbl, nSrcCacheCount, hSrcCache, nLabelStart, nLabelCount, nCacheSize, hCacheHostCursors, (bOutputLabels) ? 1 : 0, hWorkDataHost, (bCombinePositiveAndNegative) ? 1 : 0, nSeed });

    foreach (long hTop in rghTop)
    {
        rgArg.Add(hTop);
    }

    foreach (int nCount in rgnTopCount)
    {
        rgArg.Add(nCount);
    }

    long[] rgParam = rgArg.ToArray();

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_SEQUENCE, null, rgParam);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_SEQUENCE, null, rgParam);
}
6147
/// <summary>
/// Runs the CUDA_COPY_SEQUENCE2 kernel function.
/// </summary>
/// <remarks>
/// All fourteen arguments are forwarded to the kernel function, in declaration order, as long values.
/// </remarks>
public void copy_sequence(int n, long hSrc, int nSrcStep, int nSrcStartIdx, int nCopyCount, int nCopyDim, long hDst, int nDstStep, int nDstStartIdx, int nSrcSpatialDim, int nDstSpatialDim, int nSrcSpatialDimStartIdx = 0, int nDstSpatialDimStartIdx = 0, int nSpatialDimCount = -1)
{
    long[] rgArg = m_param.AsLong(n, hSrc, nSrcStep, nSrcStartIdx, nCopyCount, nCopyDim, hDst, nDstStep, nDstStartIdx, nSrcSpatialDim, nDstSpatialDim, nSrcSpatialDimStartIdx, nDstSpatialDimStartIdx, nSpatialDimCount);
    int nFn = (int)CUDAFN.CUDA_COPY_SEQUENCE2;

    if (m_dt != DataType.DOUBLE)
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    else
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
}
6172
/// <summary>
/// Runs the CUDA_COPY_EXPAND kernel function.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="nNum">Forwarded to the kernel function.</param>
/// <param name="nDim">Forwarded to the kernel function.</param>
/// <param name="hX">Specifies the handle to X.</param>
/// <param name="hA">Specifies the handle to A.</param>
public void copy_expand(int n, int nNum, int nDim, long hX, long hA)
{
    long[] rgArg = m_param.AsLong(n, nNum, nDim, hX, hA);
    int nFn = (int)CUDAFN.CUDA_COPY_EXPAND;

    if (m_dt != DataType.DOUBLE)
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    else
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
}
6189
/// <summary>
/// Runs the CUDA_COPY_FILL kernel function.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="nDim">Forwarded to the kernel function.</param>
/// <param name="hSrc">Specifies the handle to the source.</param>
/// <param name="nSrcOff">Specifies the source offset.</param>
/// <param name="nCount">Forwarded to the kernel function.</param>
/// <param name="hDst">Specifies the handle to the destination.</param>
public void fill(int n, int nDim, long hSrc, int nSrcOff, int nCount, long hDst)
{
    long[] rgArg = m_param.AsLong(n, nDim, hSrc, nSrcOff, nCount, hDst);
    int nFn = (int)CUDAFN.CUDA_COPY_FILL;

    if (m_dt != DataType.DOUBLE)
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    else
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
}
6206
/// <summary>
/// Runs the CUDA_SORT kernel function on the data referenced by <paramref name="hY"/>.
/// </summary>
/// <param name="nCount">Specifies the number of items.</param>
/// <param name="hY">Specifies the handle to the data.</param>
public void sort(int nCount, long hY)
{
    long[] rgArg = m_param.AsLong(nCount, hY);
    int nFn = (int)CUDAFN.CUDA_SORT;

    if (m_dt != DataType.DOUBLE)
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    else
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
}
6219
/// <summary>
/// Converts the <i>double</i> scalars to the base type T and delegates to the T-typed gemm overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
public void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    T tBeta = (T)Convert.ChangeType(fBeta, typeof(T));

    gemm(bTransA, bTransB, m, n, k, tAlpha, hA, hB, tBeta, hC);
}
6240
/// <summary>
/// Converts the <i>float</i> scalars to the base type T and delegates to the T-typed gemm overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
public void gemm(bool bTransA, bool bTransB, int m, int n, int k, float fAlpha, long hA, long hB, float fBeta, long hC)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    T tBeta = (T)Convert.ChangeType(fBeta, typeof(T));

    gemm(bTransA, bTransB, m, n, k, tAlpha, hA, hB, tBeta, hC);
}
6261
/// <summary>
/// Runs the CUDA_GEMM kernel function.
/// </summary>
/// <remarks>
/// Alpha and beta are passed as floating point arguments. The long argument list carries the transpose flags
/// (as 1/0), the sizes, the handles, offsets and group settings; the two literal 0 entries are placeholders whose
/// meaning is not visible here (presumably the slots of alpha and beta - confirm).
/// </remarks>
public void gemm(bool bTransA, bool bTransB, int m, int n, int k, T fAlpha, long hA, long hB, T fBeta, long hC, int nAOffset = 0, int nBOffset = 0, int nCOffset = 0, int nGroups = 1, int nGroupOffsetA = 0, int nGroupOffsetB = 0, int nGroupOffsetC = 0)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_GEMM;
    int nTransA = bTransA ? 1 : 0;
    int nTransB = bTransB ? 1 : 0;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2(nKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(nTransA, nTransB, m, n, k, 0, hA, hB, 0, hC, nAOffset, nBOffset, nCOffset, nGroups, nGroupOffsetA, nGroupOffsetB, nGroupOffsetC));
        return;
    }

    m_cuda.RunFloatEx2(nKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(nTransA, nTransB, m, n, k, 0, hA, hB, 0, hC, nAOffset, nBOffset, nCOffset, nGroups, nGroupOffsetA, nGroupOffsetB, nGroupOffsetC));
}
6292
/// <summary>
/// Runs the CUDA_GEMM2 kernel function with explicit leading dimensions.
/// </summary>
/// <remarks>
/// Alpha and beta are passed as floating point arguments (narrowed to float on the float path); the two literal 0
/// entries in the long argument list are placeholders whose meaning is not visible here.
/// </remarks>
/// <param name="lda">Specifies the leading dimension of A.</param>
/// <param name="ldb">Specifies the leading dimension of B.</param>
/// <param name="ldc">Specifies the leading dimension of C.</param>
public void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC, uint lda, uint ldb, uint ldc)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_GEMM2;
    int nTransA = bTransA ? 1 : 0;
    int nTransB = bTransB ? 1 : 0;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2(nKernel, nFn, m_param.AsDouble(fAlpha, fBeta), m_param.AsLong(nTransA, nTransB, m, n, k, 0, hA, hB, 0, hC, lda, ldb, ldc));
        return;
    }

    m_cuda.RunFloatEx2(nKernel, nFn, m_param.AsFloat((float)fAlpha, (float)fBeta), m_param.AsLong(nTransA, nTransB, m, n, k, 0, hA, hB, 0, hC, lda, ldb, ldc));
}
6319
/// <summary>
/// Runs the CUDA_GEMM2 kernel function with explicit leading dimensions, strides and a batch count.
/// </summary>
/// <remarks>
/// Uses the same kernel function ID as the non-batched leading-dimension overload, with four additional trailing
/// long arguments (the three strides and the batch count).
/// </remarks>
/// <param name="stridea">Specifies the stride of A.</param>
/// <param name="strideb">Specifies the stride of B.</param>
/// <param name="stridec">Specifies the stride of C.</param>
/// <param name="batch_count">Specifies the batch count.</param>
public void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC, uint lda, uint ldb, uint ldc, uint stridea, uint strideb, uint stridec, uint batch_count)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_GEMM2;
    int nTransA = bTransA ? 1 : 0;
    int nTransB = bTransB ? 1 : 0;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2(nKernel, nFn, m_param.AsDouble(fAlpha, fBeta), m_param.AsLong(nTransA, nTransB, m, n, k, 0, hA, hB, 0, hC, lda, ldb, ldc, stridea, strideb, stridec, batch_count));
        return;
    }

    m_cuda.RunFloatEx2(nKernel, nFn, m_param.AsFloat((float)fAlpha, (float)fBeta), m_param.AsLong(nTransA, nTransB, m, n, k, 0, hA, hB, 0, hC, lda, ldb, ldc, stridea, strideb, stridec, batch_count));
}
6350
/// <summary>
/// Converts the <i>double</i> scalars to the base type T and delegates to the T-typed geam overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
public void geam(bool bTransA, bool bTransB, int m, int n, double fAlpha, long hA, long hB, double fBeta, long hC)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    T tBeta = (T)Convert.ChangeType(fBeta, typeof(T));

    geam(bTransA, bTransB, m, n, tAlpha, hA, hB, tBeta, hC);
}
6370
/// <summary>
/// Converts the <i>float</i> scalars to the base type T and delegates to the T-typed geam overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
public void geam(bool bTransA, bool bTransB, int m, int n, float fAlpha, long hA, long hB, float fBeta, long hC)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    T tBeta = (T)Convert.ChangeType(fBeta, typeof(T));

    geam(bTransA, bTransB, m, n, tAlpha, hA, hB, tBeta, hC);
}
6390
/// <summary>
/// Runs the CUDA_GEAM kernel function.
/// </summary>
/// <remarks>
/// Alpha and beta are passed as floating point arguments; the two literal 0 entries in the long argument list are
/// placeholders whose meaning is not visible here.
/// </remarks>
public void geam(bool bTransA, bool bTransB, int m, int n, T fAlpha, long hA, long hB, T fBeta, long hC, int nAOffset = 0, int nBOffset = 0, int nCOffset = 0)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_GEAM;
    int nTransA = bTransA ? 1 : 0;
    int nTransB = bTransB ? 1 : 0;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2(nKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(nTransA, nTransB, m, n, 0, hA, hB, 0, hC, nAOffset, nBOffset, nCOffset));
        return;
    }

    m_cuda.RunFloatEx2(nKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(nTransA, nTransB, m, n, 0, hA, hB, 0, hC, nAOffset, nBOffset, nCOffset));
}
6416
/// <summary>
/// Converts the <i>double</i> scalars to the base type T and delegates to the T-typed gemv overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
public void gemv(bool bTransA, int m, int n, double fAlpha, long hA, long hX, double fBeta, long hY)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    T tBeta = (T)Convert.ChangeType(fBeta, typeof(T));

    gemv(bTransA, m, n, tAlpha, hA, hX, tBeta, hY);
}
6435
/// <summary>
/// Converts the <i>float</i> scalars to the base type T and delegates to the T-typed gemv overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
public void gemv(bool bTransA, int m, int n, float fAlpha, long hA, long hX, float fBeta, long hY)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    T tBeta = (T)Convert.ChangeType(fBeta, typeof(T));

    gemv(bTransA, m, n, tAlpha, hA, hX, tBeta, hY);
}
6454
/// <summary>
/// Runs the CUDA_GEMV kernel function.
/// </summary>
/// <remarks>
/// Alpha and beta are passed as floating point arguments; the two literal 0 entries in the long argument list are
/// placeholders whose meaning is not visible here.
/// </remarks>
public void gemv(bool bTransA, int m, int n, T fAlpha, long hA, long hX, T fBeta, long hY, int nAOffset = 0, int nXOffset = 0, int nYOffset = 0)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_GEMV;
    int nTransA = bTransA ? 1 : 0;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2(nKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(nTransA, m, n, 0, hA, hX, 0, hY, nAOffset, nXOffset, nYOffset));
        return;
    }

    m_cuda.RunFloatEx2(nKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(nTransA, m, n, 0, hA, hX, 0, hY, nAOffset, nXOffset, nYOffset));
}
6479
/// <summary>
/// Converts the <i>double</i> alpha to the base type T and delegates to the T-typed ger overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
public void ger(int m, int n, double fAlpha, long hX, long hY, long hA)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    ger(m, n, tAlpha, hX, hY, hA);
}
6496
/// <summary>
/// Converts the <i>float</i> alpha to the base type T and delegates to the T-typed ger overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
public void ger(int m, int n, float fAlpha, long hX, long hY, long hA)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    ger(m, n, tAlpha, hX, hY, hA);
}
6513
/// <summary>
/// Runs the CUDA_GER kernel function.
/// </summary>
/// <remarks>
/// Alpha is passed as the floating point argument; the literal 0 in the long argument list is a placeholder whose
/// meaning is not visible here.
/// </remarks>
public void ger(int m, int n, T fAlpha, long hX, long hY, long hA)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_GER;

    if (m_dt == DataType.DOUBLE)
    {
        double dfAlpha = convertD(fAlpha);
        m_cuda.RunDoubleEx2(nKernel, nFn, m_param.AsDouble(dfAlpha), m_param.AsLong(m, n, 0, hX, hY, hA));
        return;
    }

    float fAlphaF = convertF(fAlpha);
    m_cuda.RunFloatEx2(nKernel, nFn, m_param.AsFloat(fAlphaF), m_param.AsLong(m, n, 0, hX, hY, hA));
}
6533
/// <summary>
/// Converts the <i>double</i> alpha to the base type T and delegates to the T-typed axpy overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
public void axpy(int n, double fAlpha, long hX, long hY)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    axpy(n, tAlpha, hX, hY);
}
6548
/// <summary>
/// Converts the <i>float</i> alpha to the base type T and delegates to the T-typed axpy overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
public void axpy(int n, float fAlpha, long hX, long hY)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    axpy(n, tAlpha, hX, hY);
}
6563
/// <summary>
/// Runs the CUDA_AXPY kernel function.
/// </summary>
/// <remarks>
/// Alpha is passed as the floating point argument; the literal 0 in the long argument list is a placeholder whose
/// meaning is not visible here.
/// </remarks>
/// <param name="nXOff">Optionally, specifies the X offset (default = 0).</param>
/// <param name="nYOff">Optionally, specifies the Y offset (default = 0).</param>
public void axpy(int n, T fAlpha, long hX, long hY, int nXOff = 0, int nYOff = 0)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_AXPY;

    if (m_dt == DataType.DOUBLE)
    {
        double dfAlpha = convertD(fAlpha);
        m_cuda.RunDoubleEx2(nKernel, nFn, m_param.AsDouble(dfAlpha), m_param.AsLong(n, 0, hX, hY, nXOff, nYOff));
        return;
    }

    float fAlphaF = convertF(fAlpha);
    m_cuda.RunFloatEx2(nKernel, nFn, m_param.AsFloat(fAlphaF), m_param.AsLong(n, 0, hX, hY, nXOff, nYOff));
}
6583
/// <summary>
/// Converts the <i>double</i> scalars to the base type T and delegates to the T-typed axpby overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
public void axpby(int n, double fAlpha, long hX, double fBeta, long hY)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    T tBeta = (T)Convert.ChangeType(fBeta, typeof(T));

    axpby(n, tAlpha, hX, tBeta, hY);
}
6599
/// <summary>
/// Converts the <i>float</i> scalars to the base type T and delegates to the T-typed axpby overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
public void axpby(int n, float fAlpha, long hX, float fBeta, long hY)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    T tBeta = (T)Convert.ChangeType(fBeta, typeof(T));

    axpby(n, tAlpha, hX, tBeta, hY);
}
6615
/// <summary>
/// Runs the CUDA_AXPBY kernel function.
/// </summary>
/// <remarks>
/// Alpha and beta are passed as floating point arguments; the two literal 0 entries in the long argument list are
/// placeholders whose meaning is not visible here.
/// </remarks>
public void axpby(int n, T fAlpha, long hX, T fBeta, long hY)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_AXPBY;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2(nKernel, nFn, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(n, 0, hX, 0, hY));
        return;
    }

    m_cuda.RunFloatEx2(nKernel, nFn, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(n, 0, hX, 0, hY));
}
6636
/// <summary>
/// Runs the CUDA_MULBSX kernel function.
/// </summary>
/// <remarks>
/// All arguments are forwarded in declaration order; <paramref name="bTranspose"/> is encoded as 1 or 0.
/// </remarks>
public void mulbsx(int n, long hA, int nAOff, long hX, int nXOff, int nC, int nSpatialDim, bool bTranspose, long hB, int nBOff)
{
    int nTranspose = bTranspose ? 1 : 0;
    long[] rgArg = m_param.AsLong(n, hA, nAOff, hX, nXOff, nC, nSpatialDim, nTranspose, hB, nBOff);
    int nFn = (int)CUDAFN.CUDA_MULBSX;

    if (m_dt != DataType.DOUBLE)
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    else
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
}
6657
/// <summary>
/// Runs the CUDA_DIVBSX kernel function.
/// </summary>
/// <remarks>
/// All arguments are forwarded in declaration order; <paramref name="bTranspose"/> is encoded as 1 or 0.
/// </remarks>
public void divbsx(int n, long hA, int nAOff, long hX, int nXOff, int nC, int nSpatialDim, bool bTranspose, long hB, int nBOff)
{
    int nTranspose = bTranspose ? 1 : 0;
    long[] rgArg = m_param.AsLong(n, hA, nAOff, hX, nXOff, nC, nSpatialDim, nTranspose, hB, nBOff);
    int nFn = (int)CUDAFN.CUDA_DIVBSX;

    if (m_dt != DataType.DOUBLE)
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    else
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
}
6678
/// <summary>
/// Performs a batched matrix multiply by delegating to the strided, batched gemm overload with a beta of 0.
/// </summary>
/// <remarks>
/// The A and B operands (and their transpose flags, leading dimensions and strides) are swapped in the gemm call,
/// and n and m are swapped as well (presumably to map row-major data onto a column-major gemm - confirm).
/// NOTE(review): the leading dimensions are always k for A and n for B regardless of the transpose flags - confirm
/// this is intended when bTransA or bTransB is true.
/// </remarks>
/// <param name="nOuterCount">Specifies the batch count passed to gemm.</param>
/// <param name="dfScale">Optionally, specifies the alpha scalar passed to gemm (default = 1.0).</param>
public void matmul(uint nOuterCount, int m, int n, int k, long hA, long hB, long hC, double dfScale = 1.0, bool bTransA = false, bool bTransB = false)
{
    // Per-matrix strides.
    uint nStrideA = (uint)(m * k);
    uint nStrideB = (uint)(k * n);
    uint nStrideC = (uint)(m * n);

    // Leading dimensions.
    uint nLdA = (uint)k;
    uint nLdB = (uint)n;
    uint nLdC = (uint)n;

    gemm(bTransB, bTransA, n, m, k, dfScale, hB, hA, 0.0, hC, nLdB, nLdA, nLdC, nStrideB, nStrideA, nStrideC, nOuterCount);
}
6706
/// <summary>
/// Runs the CUDA_TRANSPOSE_HW kernel function.
/// </summary>
/// <param name="n">Forwarded to the kernel function (presumably the num dimension - confirm).</param>
/// <param name="c">Forwarded to the kernel function (presumably the channel dimension - confirm).</param>
/// <param name="h">Forwarded to the kernel function (presumably the height - confirm).</param>
/// <param name="w">Forwarded to the kernel function (presumably the width - confirm).</param>
/// <param name="hSrc">Specifies the handle to the source.</param>
/// <param name="hDst">Specifies the handle to the destination.</param>
public void transposeHW(int n, int c, int h, int w, long hSrc, long hDst)
{
    long[] rgArg = m_param.AsLong(n, c, h, w, hSrc, hDst);
    int nFn = (int)CUDAFN.CUDA_TRANSPOSE_HW;

    if (m_dt != DataType.DOUBLE)
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    else
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
}
6723
6724
/// <summary>
/// Runs the CUDA_SET_BOUNDS kernel function with the given minimum and maximum.
/// </summary>
/// <remarks>
/// On the float path each bound is clamped to the range [-float.MaxValue, float.MaxValue] before it is narrowed,
/// so that an out-of-range double does not become an infinity. A NaN bound falls back to the widest value for
/// that side (-float.MaxValue for the minimum, float.MaxValue for the maximum).
/// The two literal 0 entries in the long argument list are placeholders whose meaning is not visible here.
/// </remarks>
/// <param name="n">Specifies the number of items.</param>
/// <param name="dfMin">Specifies the minimum value.</param>
/// <param name="dfMax">Specifies the maximum value.</param>
/// <param name="hX">Specifies the handle to the data.</param>
public void set_bounds(int n, double dfMin, double dfMax, long hX)
{
    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SET_BOUNDS, m_param.AsDouble(dfMin, dfMax), m_param.AsLong(n, 0, 0, hX));
        return;
    }

    // Clamp the minimum; values at or beyond either float limit saturate to that limit.
    float fMin = -float.MaxValue;
    if (dfMin >= float.MaxValue)
        fMin = float.MaxValue;
    else if (dfMin > -float.MaxValue)
        fMin = (float)dfMin;

    // Clamp the maximum using dfMax (the previous code mistakenly tested dfMin on the low side).
    float fMax = float.MaxValue;
    if (dfMax <= -float.MaxValue)
        fMax = -float.MaxValue;
    else if (dfMax < float.MaxValue)
        fMax = (float)dfMax;

    m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SET_BOUNDS, m_param.AsFloat(fMin, fMax), m_param.AsLong(n, 0, 0, hX));
}
6756
/// <summary>
/// Converts the <i>double</i> alpha to the base type T and delegates to the T-typed scal overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="nXOff">Optionally, specifies the X offset (default = 0).</param>
public void scal(int n, double fAlpha, long hX, int nXOff = 0)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    scal(n, tAlpha, hX, nXOff);
}
6771
/// <summary>
/// Converts the <i>float</i> alpha to the base type T and delegates to the T-typed scal overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="nXOff">Optionally, specifies the X offset (default = 0).</param>
public void scal(int n, float fAlpha, long hX, int nXOff = 0)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    scal(n, tAlpha, hX, nXOff);
}
6786
/// <summary>
/// Runs the CUDA_SCAL kernel function.
/// </summary>
/// <remarks>
/// Alpha is passed as the floating point argument; the literal 0 in the long argument list is a placeholder whose
/// meaning is not visible here.
/// </remarks>
/// <param name="nXOff">Optionally, specifies the X offset (default = 0).</param>
public void scal(int n, T fAlpha, long hX, int nXOff = 0)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_SCAL;

    if (m_dt == DataType.DOUBLE)
    {
        double dfAlpha = convertD(fAlpha);
        m_cuda.RunDoubleEx2(nKernel, nFn, m_param.AsDouble(dfAlpha), m_param.AsLong(n, 0, hX, nXOff));
        return;
    }

    float fAlphaF = convertF(fAlpha);
    m_cuda.RunFloatEx2(nKernel, nFn, m_param.AsFloat(fAlphaF), m_param.AsLong(n, 0, hX, nXOff));
}
6804
/// <summary>
/// Calls <c>dot</c> and returns its result converted to <i>double</i>.
/// </summary>
/// <returns>The result of <c>dot</c> is returned as a <i>double</i>.</returns>
public double dot_double(int n, long hX, long hY)
{
    T tDot = dot(n, hX, hY);
    return (double)Convert.ChangeType(tDot, typeof(double));
}
6819
/// <summary>
/// Calls <c>dot</c> and returns its result converted to <i>float</i>.
/// </summary>
/// <returns>The result of <c>dot</c> is returned as a <i>float</i>.</returns>
public float dot_float(int n, long hX, long hY)
{
    T tDot = dot(n, hX, hY);
    return (float)Convert.ChangeType(tDot, typeof(float));
}
6834
/// <summary>
/// Runs the CUDA_DOT kernel function and returns the first value of its result.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="hX">Specifies the handle to X.</param>
/// <param name="hY">Specifies the handle to Y.</param>
/// <param name="nXOff">Optionally, specifies the X offset (default = 0).</param>
/// <param name="nYOff">Optionally, specifies the Y offset (default = 0).</param>
/// <returns>The first value of the kernel result is returned, converted to T.</returns>
public T dot(int n, long hX, long hY, int nXOff = 0, int nYOff = 0)
{
    long[] rgArg = m_param.AsLong(n, hX, hY, nXOff, nYOff);
    int nFn = (int)CUDAFN.CUDA_DOT;

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgF = m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
        return (T)Convert.ChangeType(rgF[0], typeof(T));
    }

    double[] rgD = m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    return (T)Convert.ChangeType(rgD[0], typeof(T));
}
6860
/// <summary>
/// Calls <c>asum</c> and returns its result converted to <i>double</i>.
/// </summary>
/// <param name="nXOff">Optionally, specifies the X offset (default = 0).</param>
/// <returns>The result of <c>asum</c> is returned as a <i>double</i>.</returns>
public double asum_double(int n, long hX, int nXOff = 0)
{
    T tSum = asum(n, hX, nXOff);
    return (double)Convert.ChangeType(tSum, typeof(double));
}
6875
/// <summary>
/// Calls <c>asum</c> and returns its result converted to <i>float</i>.
/// </summary>
/// <param name="nXOff">Optionally, specifies the X offset (default = 0).</param>
/// <returns>The result of <c>asum</c> is returned as a <i>float</i>.</returns>
public float asum_float(int n, long hX, int nXOff = 0)
{
    T tSum = asum(n, hX, nXOff);
    return (float)Convert.ChangeType(tSum, typeof(float));
}
6890
/// <summary>
/// Runs the CUDA_ASUM kernel function and returns the first value of its result.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="hX">Specifies the handle to X.</param>
/// <param name="nXOff">Optionally, specifies the X offset (default = 0).</param>
/// <returns>The first value of the kernel result is returned, converted to T.</returns>
public T asum(int n, long hX, int nXOff = 0)
{
    long[] rgArg = m_param.AsLong(n, hX, nXOff);
    int nFn = (int)CUDAFN.CUDA_ASUM;

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgF = m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
        return (T)Convert.ChangeType(rgF[0], typeof(T));
    }

    double[] rgD = m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    return (T)Convert.ChangeType(rgD[0], typeof(T));
}
6914
/// <summary>
/// Converts the <i>double</i> alpha to the base type T and delegates to the T-typed scale overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
public void scale(int n, double fAlpha, long hX, long hY)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    scale(n, tAlpha, hX, hY);
}
6929
/// <summary>
/// Converts the <i>float</i> alpha to the base type T and delegates to the T-typed scale overload.
/// </summary>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
public void scale(int n, float fAlpha, long hX, long hY)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));
    scale(n, tAlpha, hX, hY);
}
6944
/// <summary>
/// Runs the CUDA_SCALE kernel function.
/// </summary>
/// <remarks>
/// Alpha is passed as the floating point argument; the literal 0 in the long argument list is a placeholder whose
/// meaning is not visible here.
/// </remarks>
/// <param name="nXOff">Optionally, specifies the X offset (default = 0).</param>
/// <param name="nYOff">Optionally, specifies the Y offset (default = 0).</param>
public void scale(int n, T fAlpha, long hX, long hY, int nXOff = 0, int nYOff = 0)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_SCALE;

    if (m_dt == DataType.DOUBLE)
    {
        double dfAlpha = convertD(fAlpha);
        m_cuda.RunDoubleEx2(nKernel, nFn, m_param.AsDouble(dfAlpha), m_param.AsLong(n, 0, hX, hY, nXOff, nYOff));
        return;
    }

    float fAlphaF = convertF(fAlpha);
    m_cuda.RunFloatEx2(nKernel, nFn, m_param.AsFloat(fAlphaF), m_param.AsLong(n, 0, hX, hY, nXOff, nYOff));
}
6964
/// <summary>
/// Runs the CUDA_SCALE_TO_RANGE kernel function.
/// </summary>
/// <remarks>
/// The range limits are passed as floating point arguments (narrowed to float on the float path); the two trailing
/// literal 0 entries in the long argument list are placeholders whose meaning is not visible here.
/// </remarks>
/// <param name="n">Specifies the number of items.</param>
/// <param name="hX">Specifies the handle to X.</param>
/// <param name="hY">Specifies the handle to Y.</param>
/// <param name="fMin">Specifies the range minimum.</param>
/// <param name="fMax">Specifies the range maximum.</param>
public void scale_to_range(int n, long hX, long hY, double fMin, double fMax)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_SCALE_TO_RANGE;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2(nKernel, nFn, m_param.AsDouble(fMin, fMax), m_param.AsLong(n, hX, hY, 0, 0));
        return;
    }

    m_cuda.RunFloatEx2(nKernel, nFn, m_param.AsFloat((float)fMin, (float)fMax), m_param.AsLong(n, hX, hY, 0, 0));
}
6980
/// <summary>
/// Calculates erf for a <i>double</i> by converting the value to T, calling the T-typed overload and converting the result back.
/// </summary>
/// <param name="dfVal">Specifies the input value.</param>
/// <returns>The result of the T-typed erf is returned as a <i>double</i>.</returns>
public double erf(double dfVal)
{
    T tIn = convertD1(dfVal);
    T tOut = erf(tIn);

    return convertD(tOut);
}
6990
/// <summary>
/// Calculates erf for a <i>float</i> by converting the value to T, calling the T-typed overload and converting the result back.
/// </summary>
/// <param name="fVal">Specifies the input value.</param>
/// <returns>The result of the T-typed erf is returned as a <i>float</i>.</returns>
public float erf(float fVal)
{
    T tIn = convertF1(fVal);
    T tOut = erf(tIn);

    return convertF(tOut);
}
7000
/// <summary>
/// Runs the CUDA_ERF kernel function on a single value and returns the first value of its result.
/// </summary>
/// <param name="fVal">Specifies the input value.</param>
/// <returns>The first value of the kernel result is returned, converted to T.</returns>
public T erf(T fVal)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_ERF;

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgF = m_cuda.RunFloat(nKernel, nFn, m_param.AsFloat(convertF(fVal)));
        return convert(rgF)[0];
    }

    double[] rgD = m_cuda.RunDouble(nKernel, nFn, m_param.AsDouble(convertD(fVal)));
    return convert(rgD)[0];
}
7019
/// <summary>
/// Runs the CUDA_MASK kernel function.
/// </summary>
/// <remarks>
/// The search and replace values are passed as floating point arguments; the two literal 0 entries in the long
/// argument list are placeholders whose meaning is not visible here.
/// </remarks>
/// <param name="n">Specifies the number of items.</param>
/// <param name="nMaskDim">Specifies the mask dimension.</param>
/// <param name="fSearch">Specifies the value to search for.</param>
/// <param name="fReplace">Specifies the replacement value.</param>
/// <param name="hX">Specifies the handle to X.</param>
/// <param name="hMask">Specifies the handle to the mask.</param>
/// <param name="hY">Specifies the handle to Y.</param>
public void mask(int n, int nMaskDim, T fSearch, T fReplace, long hX, long hMask, long hY)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_MASK;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2(nKernel, nFn, m_param.AsDouble(convertD(fSearch), convertD(fReplace)), m_param.AsLong(n, nMaskDim, 0, 0, hX, hMask, hY));
        return;
    }

    m_cuda.RunFloatEx2(nKernel, nFn, m_param.AsFloat(convertF(fSearch), convertF(fReplace)), m_param.AsLong(n, nMaskDim, 0, 0, hX, hMask, hY));
}
7037
/// <summary>
/// Converts the <i>double</i> search and replace values to the base type T and delegates to the T-typed mask overload.
/// </summary>
/// <param name="fSearch">Specifies the value to search for.</param>
/// <param name="fReplace">Specifies the replacement value.</param>
public void mask(int n, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY)
{
    T tSearch = (T)Convert.ChangeType(fSearch, typeof(T));
    T tReplace = (T)Convert.ChangeType(fReplace, typeof(T));

    mask(n, nMaskDim, tSearch, tReplace, hX, hMask, hY);
}
7052
/// <summary>
/// Converts the <i>float</i> search and replace values to the base type T and delegates to the T-typed mask overload.
/// </summary>
/// <param name="fSearch">Specifies the value to search for.</param>
/// <param name="fReplace">Specifies the replacement value.</param>
public void mask(int n, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY)
{
    T tSearch = (T)Convert.ChangeType(fSearch, typeof(T));
    T tReplace = (T)Convert.ChangeType(fReplace, typeof(T));

    mask(n, nMaskDim, tSearch, tReplace, hX, hMask, hY);
}
7067
/// <summary>
/// Runs the CUDA_MASK_BATCH kernel function.
/// </summary>
/// <remarks>
/// The search and replace values are passed as floating point arguments; the two literal 0 entries in the long
/// argument list are placeholders whose meaning is not visible here.
/// </remarks>
/// <param name="n">Specifies the number of items.</param>
/// <param name="nBatch">Specifies the batch size.</param>
/// <param name="nMaskDim">Specifies the mask dimension.</param>
/// <param name="fSearch">Specifies the value to search for.</param>
/// <param name="fReplace">Specifies the replacement value.</param>
/// <param name="hX">Specifies the handle to X.</param>
/// <param name="hMask">Specifies the handle to the mask.</param>
/// <param name="hY">Specifies the handle to Y.</param>
public void mask_batch(int n, int nBatch, int nMaskDim, T fSearch, T fReplace, long hX, long hMask, long hY)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_MASK_BATCH;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2(nKernel, nFn, m_param.AsDouble(convertD(fSearch), convertD(fReplace)), m_param.AsLong(n, nBatch, nMaskDim, 0, 0, hX, hMask, hY));
        return;
    }

    m_cuda.RunFloatEx2(nKernel, nFn, m_param.AsFloat(convertF(fSearch), convertF(fReplace)), m_param.AsLong(n, nBatch, nMaskDim, 0, 0, hX, hMask, hY));
}
7086
/// <summary>
/// Converts the <i>double</i> search and replace values to the base type T and delegates to the T-typed mask_batch overload.
/// </summary>
/// <param name="fSearch">Specifies the value to search for.</param>
/// <param name="fReplace">Specifies the replacement value.</param>
public void mask_batch(int n, int nBatch, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY)
{
    T tSearch = (T)Convert.ChangeType(fSearch, typeof(T));
    T tReplace = (T)Convert.ChangeType(fReplace, typeof(T));

    mask_batch(n, nBatch, nMaskDim, tSearch, tReplace, hX, hMask, hY);
}
7102
/// <summary>
/// Converts the <i>float</i> search and replace values to the base type T and delegates to the T-typed mask_batch overload.
/// </summary>
/// <param name="fSearch">Specifies the value to search for.</param>
/// <param name="fReplace">Specifies the replacement value.</param>
public void mask_batch(int n, int nBatch, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY)
{
    T tSearch = (T)Convert.ChangeType(fSearch, typeof(T));
    T tReplace = (T)Convert.ChangeType(fReplace, typeof(T));

    mask_batch(n, nBatch, nMaskDim, tSearch, tReplace, hX, hMask, hY);
}
7118
/// <summary>
/// Validates the region arguments and runs the CUDA_INTERP2 kernel function.
/// </summary>
/// <remarks>
/// Validation requires all offsets to be non-negative, all region heights/widths to be positive, and each 'A'
/// (presumably 'actual' - confirm) dimension to be at least as large as the corresponding offset plus region size.
/// All arguments are forwarded in declaration order; <paramref name="bBwd"/> is encoded as 1 or 0.
/// </remarks>
/// <param name="nChannels">Specifies the number of channels.</param>
/// <param name="hData1">Specifies the handle to the first data.</param>
/// <param name="hData2">Specifies the handle to the second data.</param>
/// <param name="bBwd">Optionally, specifies the backward flag (default = false).</param>
/// <exception cref="ArgumentOutOfRangeException">Thrown when the offsets, sizes or 'A' dimensions fail validation.</exception>
public void interp2(int nChannels, long hData1, int nX1, int nY1, int nHeight1, int nWidth1, int nHeight1A, int nWidth1A, long hData2, int nX2, int nY2, int nHeight2, int nWidth2, int nHeight2A, int nWidth2A, bool bBwd = false)
{
    // The single-string ArgumentOutOfRangeException constructor treats its argument as the parameter name,
    // so the (paramName, message) overload is used to make the text the actual exception message.
    bool bRegionsValid = (nX1 >= 0 && nY1 >= 0 && nHeight1 > 0 && nWidth1 > 0 && nX2 >= 0 && nY2 >= 0 && nHeight2 > 0 && nWidth2 > 0);
    if (!bRegionsValid)
        throw new ArgumentOutOfRangeException(null, "interp2: Invalid arguments found. The offsets must be >= 0 and the heights and widths must be > 0.");

    bool bRegionsFit = (nWidth1A >= nWidth1 + nX1 && nHeight1A >= nHeight1 + nY1 && nWidth2A >= nWidth2 + nX2 && nHeight2A >= nHeight2 + nY2);
    if (!bRegionsFit)
        throw new ArgumentOutOfRangeException(null, "interp2: Invalid arguments found. Each 'A' dimension must be >= the corresponding offset plus region size.");

    long[] rgArg = m_param.AsLong(nChannels, hData1, nX1, nY1, nHeight1, nWidth1, nHeight1A, nWidth1A, hData2, nX2, nY2, nHeight2, nWidth2, nHeight2A, nWidth2A, (bBwd) ? 1 : 0);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_INTERP2, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_INTERP2, null, rgArg);
}
7151
/// <summary>
/// Converts the double scalar to the base type T and forwards to the add_scalar overload that takes a T value.
/// </summary>
public void add_scalar(int n, double fAlpha, long hY)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));

    add_scalar(n, tAlpha, hY);
}
7165
/// <summary>
/// Converts the float scalar to the base type T and forwards to the add_scalar overload that takes a T value.
/// </summary>
public void add_scalar(int n, float fAlpha, long hY)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));

    add_scalar(n, tAlpha, hY);
}
7179
/// <summary>
/// Runs the CUDAFN.CUDA_ADD_SCALAR function, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <remarks>
/// The scalar travels in the floating-point argument list; the literal 0 in the long argument list
/// presumably reserves its slot (NOTE(review): confirm against the native kernel).
/// </remarks>
public void add_scalar(int n, T fAlpha, long hY, int nYOff = 0)
{
    int nFn = (int)CUDAFN.CUDA_ADD_SCALAR;

    if (m_dt == DataType.DOUBLE)
    {
        var rgdf = m_param.AsDouble(convertD(fAlpha));
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgdf, m_param.AsLong(n, 0, hY, nYOff));
    }
    else
    {
        var rgf = m_param.AsFloat(convertF(fAlpha));
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgf, m_param.AsLong(n, 0, hY, nYOff));
    }
}
7197
/// <summary>
/// Runs the CUDAFN.CUDA_ADD3 function with the three inputs hA, hB, hC and the output hY, using the
/// double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void add(int n, long hA, long hB, long hC, long hY)
{
    int nFn = (int)CUDAFN.CUDA_ADD3;
    var rgArg = m_param.AsLong(n, hA, hB, hC, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7216
/// <summary>
/// Runs the CUDAFN.CUDA_ADD function with hA, hB and hY and no floating-point arguments, using the
/// double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void add(int n, long hA, long hB, long hY)
{
    int nFn = (int)CUDAFN.CUDA_ADD;
    var rgArg = m_param.AsLong(n, hA, hB, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7234
/// <summary>
/// Runs the CUDAFN.CUDA_ADD function with a double alpha value, using the double path when m_dt is
/// DataType.DOUBLE and the float path (alpha narrowed to float) otherwise.
/// </summary>
/// <remarks>
/// The trailing literal 0 in the long argument list presumably reserves the slot of alpha
/// (NOTE(review): confirm against the native kernel).
/// </remarks>
public void add(int n, long hA, long hB, long hY, double dfAlpha)
{
    int nFn = (int)CUDAFN.CUDA_ADD;

    if (m_dt == DataType.DOUBLE)
    {
        var rgdf = m_param.AsDouble(dfAlpha);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgdf, m_param.AsLong(n, hA, hB, hY, 0));
    }
    else
    {
        var rgf = m_param.AsFloat((float)dfAlpha);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgf, m_param.AsLong(n, hA, hB, hY, 0));
    }
}
7253
/// <summary>
/// Runs the CUDAFN.CUDA_ADD function with a float alpha value, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <remarks>
/// The trailing literal 0 in the long argument list presumably reserves the slot of alpha
/// (NOTE(review): confirm against the native kernel).
/// </remarks>
public void add(int n, long hA, long hB, long hY, float fAlpha)
{
    int nFn = (int)CUDAFN.CUDA_ADD;

    if (m_dt == DataType.DOUBLE)
    {
        var rgdf = m_param.AsDouble(fAlpha);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgdf, m_param.AsLong(n, hA, hB, hY, 0));
    }
    else
    {
        var rgf = m_param.AsFloat(fAlpha);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgf, m_param.AsLong(n, hA, hB, hY, 0));
    }
}
7272
/// <summary>
/// Runs the CUDAFN.CUDA_ADD2 function with two alpha values and optional offsets for A, B and Y, using the
/// double path when m_dt is DataType.DOUBLE and the float path (alphas narrowed to float) otherwise.
/// </summary>
/// <remarks>
/// The two literal zeros in the long argument list presumably reserve the slots of the two alpha values
/// (NOTE(review): confirm against the native kernel).
/// </remarks>
public void add(int n, long hA, long hB, long hY, double dfAlphaA, double dfAlphaB, int nAOff = 0, int nBOff = 0, int nYOff = 0)
{
    int nFn = (int)CUDAFN.CUDA_ADD2;

    if (m_dt == DataType.DOUBLE)
    {
        var rgdf = m_param.AsDouble(dfAlphaA, dfAlphaB);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgdf, m_param.AsLong(n, hA, hB, hY, 0, 0, nAOff, nBOff, nYOff));
    }
    else
    {
        var rgf = m_param.AsFloat((float)dfAlphaA, (float)dfAlphaB);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgf, m_param.AsLong(n, hA, hB, hY, 0, 0, nAOff, nBOff, nYOff));
    }
}
7295
/// <summary>
/// Runs the CUDAFN.CUDA_SUB function with hA, hB, hY, their offsets and nB, using the double path when
/// m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void sub(int n, long hA, long hB, long hY, int nAOff = 0, int nBOff = 0, int nYOff = 0, int nB = 0)
{
    int nFn = (int)CUDAFN.CUDA_SUB;
    var rgArg = m_param.AsLong(n, hA, hB, hY, nAOff, nBOff, nYOff, nB);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7319
7320
/// <summary>
/// Runs the CUDAFN.CUDA_MUL function with hA, hB, hY and their offsets, using the double path when
/// m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void mul(int n, long hA, long hB, long hY, int nAOff = 0, int nBOff = 0, int nYOff = 0)
{
    int nFn = (int)CUDAFN.CUDA_MUL;
    var rgArg = m_param.AsLong(n, hA, hB, hY, nAOff, nBOff, nYOff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7341
/// <summary>
/// Runs the CUDAFN.CUDA_SUB_AND_DOT function with the given counts, handles and offsets, using the
/// double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void sub_and_dot(int n, int nN, int nInnerNum, long hA, long hB, long hY, int nAOff, int nBOff, int nYOff)
{
    int nFn = (int)CUDAFN.CUDA_SUB_AND_DOT;
    var rgArg = m_param.AsLong(n, nN, nInnerNum, hA, hB, hY, nAOff, nBOff, nYOff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7364
/// <summary>
/// Converts the double scalar to the base type T and forwards to the mul_scalar overload that takes a T value.
/// </summary>
public void mul_scalar(int n, double fAlpha, long hY)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));

    mul_scalar(n, tAlpha, hY);
}
7378
/// <summary>
/// Converts the float scalar to the base type T and forwards to the mul_scalar overload that takes a T value.
/// </summary>
public void mul_scalar(int n, float fAlpha, long hY)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));

    mul_scalar(n, tAlpha, hY);
}
7392
/// <summary>
/// Runs the CUDAFN.CUDA_MUL_SCALAR function, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <remarks>
/// The scalar travels in the floating-point argument list; the literal 0 in the long argument list
/// presumably reserves its slot (NOTE(review): confirm against the native kernel).
/// </remarks>
public void mul_scalar(int n, T fAlpha, long hY)
{
    int nFn = (int)CUDAFN.CUDA_MUL_SCALAR;

    if (m_dt == DataType.DOUBLE)
    {
        var rgdf = m_param.AsDouble(convertD(fAlpha));
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgdf, m_param.AsLong(n, 0, hY));
    }
    else
    {
        var rgf = m_param.AsFloat(convertF(fAlpha));
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgf, m_param.AsLong(n, 0, hY));
    }
}
7409
/// <summary>
/// Runs the CUDAFN.CUDA_DIV function with hA, hB and hY, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void div(int n, long hA, long hB, long hY)
{
    int nFn = (int)CUDAFN.CUDA_DIV;
    var rgArg = m_param.AsLong(n, hA, hB, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7427
/// <summary>
/// Runs the CUDAFN.CUDA_ABS function with hA and hY, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void abs(int n, long hA, long hY)
{
    int nFn = (int)CUDAFN.CUDA_ABS;
    var rgArg = m_param.AsLong(n, hA, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7444
/// <summary>
/// Convenience overload that forwards to the full exp overload with both offsets set to 0 and beta set to 1.0.
/// </summary>
public void exp(int n, long hA, long hY)
{
    const int nNoOffset = 0;
    const double dfUnitBeta = 1.0;

    exp(n, hA, hY, nNoOffset, nNoOffset, dfUnitBeta);
}
7458
/// <summary>
/// Runs the CUDAFN.CUDA_EXP function with the given offsets and beta value, using the double path when
/// m_dt is DataType.DOUBLE and the float path (beta narrowed to float) otherwise.
/// </summary>
/// <remarks>
/// The trailing literal 0 in the long argument list presumably reserves the slot of beta
/// (NOTE(review): confirm against the native kernel).
/// </remarks>
public void exp(int n, long hA, long hY, int nAOff, int nYOff, double dfBeta)
{
    int nFn = (int)CUDAFN.CUDA_EXP;

    if (m_dt == DataType.DOUBLE)
    {
        var rgdf = m_param.AsDouble(dfBeta);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgdf, m_param.AsLong(n, hA, hY, nAOff, nYOff, 0));
    }
    else
    {
        var rgf = m_param.AsFloat((float)dfBeta);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgf, m_param.AsLong(n, hA, hY, nAOff, nYOff, 0));
    }
}
7478
/// <summary>
/// Convenience overload that forwards to the full log overload with beta set to 1.0 and alpha set to 0.0.
/// </summary>
public void log(int n, long hA, long hY)
{
    const double dfUnitBeta = 1.0;
    const double dfZeroAlpha = 0.0;

    log(n, hA, hY, dfUnitBeta, dfZeroAlpha);
}
7492
/// <summary>
/// Runs the CUDAFN.CUDA_LOG function with the beta and alpha values, using the double path when m_dt is
/// DataType.DOUBLE and the float path (values narrowed to float) otherwise.
/// </summary>
/// <remarks>
/// The two trailing literal zeros in the long argument list presumably reserve the slots of beta and alpha
/// (NOTE(review): confirm against the native kernel).
/// </remarks>
public void log(int n, long hA, long hY, double dfBeta, double dfAlpha = 0)
{
    int nFn = (int)CUDAFN.CUDA_LOG;

    if (m_dt == DataType.DOUBLE)
    {
        var rgdf = m_param.AsDouble(dfBeta, dfAlpha);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgdf, m_param.AsLong(n, hA, hY, 0, 0));
    }
    else
    {
        var rgf = m_param.AsFloat((float)dfBeta, (float)dfAlpha);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgf, m_param.AsLong(n, hA, hY, 0, 0));
    }
}
7511
/// <summary>
/// Converts the double alpha value to the base type T and forwards to the powx overload that takes a T value.
/// </summary>
public void powx(int n, long hA, double fAlpha, long hY, int nAOff = 0, int nYOff = 0)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));

    powx(n, hA, tAlpha, hY, nAOff, nYOff);
}
7528
/// <summary>
/// Converts the float alpha value to the base type T and forwards to the powx overload that takes a T value.
/// </summary>
public void powx(int n, long hA, float fAlpha, long hY, int nAOff = 0, int nYOff = 0)
{
    T tAlpha = (T)Convert.ChangeType(fAlpha, typeof(T));

    powx(n, hA, tAlpha, hY, nAOff, nYOff);
}
7545
/// <summary>
/// Runs the CUDAFN.CUDA_POWX function, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <remarks>
/// The alpha value travels in the floating-point argument list; the literal 0 between hA and hY in the
/// long argument list presumably reserves its slot (NOTE(review): confirm against the native kernel).
/// </remarks>
public void powx(int n, long hA, T fAlpha, long hY, int nAOff = 0, int nYOff = 0)
{
    int nFn = (int)CUDAFN.CUDA_POWX;

    if (m_dt == DataType.DOUBLE)
    {
        var rgdf = m_param.AsDouble(convertD(fAlpha));
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgdf, m_param.AsLong(n, hA, 0, hY, nAOff, nYOff));
    }
    else
    {
        var rgf = m_param.AsFloat(convertF(fAlpha));
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgf, m_param.AsLong(n, hA, 0, hY, nAOff, nYOff));
    }
}
7565
/// <summary>
/// Runs the CUDAFN.CUDA_SIGN function with hX, hY and their offsets, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void sign(int n, long hX, long hY, int nXOff = 0, int nYOff = 0)
{
    int nFn = (int)CUDAFN.CUDA_SIGN;
    var rgArg = m_param.AsLong(n, hX, hY, nXOff, nYOff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7581
7582#pragma warning disable 1591
7583
/// <summary>
/// Runs the CUDAFN.CUDA_STUDENT function with hX and hY, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void student(int n, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_STUDENT;
    var rgArg = m_param.AsLong(n, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7591
/// <summary>
/// Runs the CUDAFN.CUDA_LOGISTIC1 function with hX and hY, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void logistic1(int n, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_LOGISTIC1;
    var rgArg = m_param.AsLong(n, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7599
/// <summary>
/// Runs the CUDAFN.CUDA_LOGISTIC2 function with hX and hY, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void logistic2(int n, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_LOGISTIC2;
    var rgArg = m_param.AsLong(n, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7607
/// <summary>
/// Runs the CUDAFN.CUDA_RECIPROCOL function with hX and hY, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void reciprocol(int n, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_RECIPROCOL;
    var rgArg = m_param.AsLong(n, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7615
7616#pragma warning restore 1591
7617
/// <summary>
/// Runs the CUDAFN.CUDA_SQRT function with hX and hY, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void sqrt(int n, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_SQRT;
    var rgArg = m_param.AsLong(n, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7631
/// <summary>
/// Runs the CUDAFN.CUDA_SQRT_SCALE function with hX and hY, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void sqrt_scale(int nCount, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_SQRT_SCALE;
    var rgArg = m_param.AsLong(nCount, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7645
/// <summary>
/// Runs the CUDAFN.CUDA_COMPARE_SIGNS function with hA, hB and hY, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void compare_signs(int n, long hA, long hB, long hY)
{
    int nFn = (int)CUDAFN.CUDA_COMPARE_SIGNS;
    var rgArg = m_param.AsLong(n, hA, hB, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7660
/// <summary>
/// Runs the CUDAFN.CUDA_MAX function with hA, hB and hY, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void max(int n, long hA, long hB, long hY)
{
    int nFn = (int)CUDAFN.CUDA_MAX;
    var rgArg = m_param.AsLong(n, hA, hB, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7676
/// <summary>
/// Runs the CUDAFN.CUDA_MAX_BWD2 function with the A/B data handles, the Y diff handle and the A/B diff
/// handles, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void max_bwd(int n, long hAdata, long hBdata, long hYdiff, long hAdiff, long hBdiff)
{
    int nFn = (int)CUDAFN.CUDA_MAX_BWD2;
    var rgArg = m_param.AsLong(n, hAdata, hBdata, hYdiff, hAdiff, hBdiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7693
/// <summary>
/// Runs the CUDAFN.CUDA_MIN function with hA, hB and hY, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void min(int n, long hA, long hB, long hY)
{
    int nFn = (int)CUDAFN.CUDA_MIN;
    var rgArg = m_param.AsLong(n, hA, hB, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7709
/// <summary>
/// Runs the CUDAFN.CUDA_MAXVAL function and returns the first element of its result, using the double
/// path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <remarks>
/// hWork is only appended to the argument list when it is non-zero.
/// </remarks>
/// <param name="lPos">Receives the second element of the result, cast to long.</param>
/// <returns>The first element of the result array, as a double.</returns>
public double max(int n, long hA, out long lPos, int nAOff = 0, long hWork = 0)
{
    int nFn = (int)CUDAFN.CUDA_MAXVAL;

    // The argument list differs only by the optional trailing work handle.
    var rgArg = (hWork != 0) ? m_param.AsLong(n, hA, nAOff, hWork) : m_param.AsLong(n, hA, nAOff);

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgdf = m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
        lPos = (long)rgdf[1];
        return rgdf[0];
    }

    float[] rgf = m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    lPos = (long)rgf[1];
    return rgf[0];
}
7757
/// <summary>
/// Runs the CUDAFN.CUDA_MINVAL function and returns the first element of its result, using the double
/// path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <remarks>
/// hWork is only appended to the argument list when it is non-zero.
/// </remarks>
/// <param name="lPos">Receives the second element of the result, cast to long.</param>
/// <returns>The first element of the result array, as a double.</returns>
public double min(int n, long hA, out long lPos, int nAOff = 0, long hWork = 0)
{
    int nFn = (int)CUDAFN.CUDA_MINVAL;

    // The argument list differs only by the optional trailing work handle.
    var rgArg = (hWork != 0) ? m_param.AsLong(n, hA, nAOff, hWork) : m_param.AsLong(n, hA, nAOff);

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgdf = m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
        lPos = (long)rgdf[1];
        return rgdf[0];
    }

    float[] rgf = m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    lPos = (long)rgf[1];
    return rgf[0];
}
7805
/// <summary>
/// Runs the CUDAFN.CUDA_MINMAXVAL function and packs the first four elements of its result into a tuple,
/// using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <remarks>
/// The bDetectNans flag is passed to the function as 1 (true) or 0 (false).
/// </remarks>
/// <returns>A tuple holding result elements 0 through 3, in that order, as doubles.</returns>
public Tuple<double, double, double, double> minmax(int n, long hA, long hWork1, long hWork2, bool bDetectNans = false, int nAOff = 0)
{
    int nFn = (int)CUDAFN.CUDA_MINMAXVAL;
    int nDetectNans = (bDetectNans) ? 1 : 0;
    var rgArg = m_param.AsLong(n, hA, hWork1, hWork2, nDetectNans, nAOff);

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgdf = m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
        return new Tuple<double, double, double, double>(rgdf[0], rgdf[1], rgdf[2], rgdf[3]);
    }

    float[] rgf = m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    return new Tuple<double, double, double, double>(rgf[0], rgf[1], rgf[2], rgf[3]);
}
7831
/// <summary>
/// Runs the CUDAFN.CUDA_MINMAXVEC function with the work handles, nK and the hMin/hMax handles, using the
/// double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <remarks>
/// The bNonZeroOnly flag is passed to the function as 1 (true) or 0 (false).
/// </remarks>
public void minmax(int n, long hA, long hWork1, long hWork2, int nK, long hMin, long hMax, bool bNonZeroOnly)
{
    int nFn = (int)CUDAFN.CUDA_MINMAXVEC;
    int nNonZeroOnly = (bNonZeroOnly) ? 1 : 0;
    var rgArg = m_param.AsLong(n, hA, hWork1, hWork2, nK, hMin, hMax, nNonZeroOnly);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7850
/// <summary>
/// Runs the CUDAFN.CUDA_TRANSPOSE function with the X/Y handles, the count and mapping handles, the number
/// of axes and the buffer handle, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void transpose(int n, long hX, long hY, long hXCounts, long hYCounts, long hMapping, int nNumAxes, long hBuffer)
{
    int nFn = (int)CUDAFN.CUDA_TRANSPOSE;
    var rgArg = m_param.AsLong(n, hX, hY, hXCounts, hYCounts, hMapping, nNumAxes, hBuffer);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7869
/// <summary>
/// Runs the CUDAFN.CUDA_SUMSQ function and returns the first element of its result, using the double
/// path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <returns>The first element of the result array, as a double.</returns>
public double sumsq(int n, long hW, long hA, int nAOff = 0)
{
    int nFn = (int)CUDAFN.CUDA_SUMSQ;
    var rgArg = m_param.AsLong(n, hW, hA, nAOff);

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgdf = m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
        return rgdf[0];
    }

    float[] rgf = m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    return rgf[0];
}
7891
/// <summary>
/// Runs the CUDAFN.CUDA_SUMSQDIFF function and returns the first element of its result, using the double
/// path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <returns>The first element of the result array, as a double.</returns>
public double sumsqdiff(int n, long hW, long hA, long hB, int nAOff = 0, int nBOff = 0)
{
    int nFn = (int)CUDAFN.CUDA_SUMSQDIFF;
    var rgArg = m_param.AsLong(n, hW, hA, hB, nAOff, nBOff);

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgdf = m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
        return rgdf[0];
    }

    float[] rgf = m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    return rgf[0];
}
7915
/// <summary>
/// Runs the CUDAFN.CUDA_WIDTH function with the mean/min/max handles, the alpha value and the width handle,
/// using the double path when m_dt is DataType.DOUBLE and the float path (alpha narrowed to float) otherwise.
/// </summary>
/// <remarks>
/// The literal 0 before hWidth in the long argument list presumably reserves the slot of alpha
/// (NOTE(review): confirm against the native kernel).
/// </remarks>
public void width(int n, long hMean, long hMin, long hMax, double dfAlpha, long hWidth)
{
    int nFn = (int)CUDAFN.CUDA_WIDTH;

    if (m_dt == DataType.DOUBLE)
    {
        var rgdf = m_param.AsDouble(dfAlpha);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgdf, m_param.AsLong(n, hMean, hMin, hMax, 0, hWidth));
    }
    else
    {
        var rgf = m_param.AsFloat((float)dfAlpha);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgf, m_param.AsLong(n, hMean, hMin, hMax, 0, hWidth));
    }
}
7932
/// <summary>
/// Runs the CUDAFN.CUDA_CONTAINS_POINT function and converts the first element of its result to a bool,
/// using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <returns>false when the first result element equals 0; true otherwise.</returns>
public bool contains_point(int n, long hMean, long hWidth, long hX, long hWork, int nXOff = 0)
{
    int nFn = (int)CUDAFN.CUDA_CONTAINS_POINT;
    var rgArg = m_param.AsLong(n, hMean, hWidth, hX, hWork, nXOff);

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgdf = m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
        return rgdf[0] != 0;
    }

    float[] rgf = m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    return rgf[0] != 0;
}
7956
/// <summary>
/// Runs the CUDAFN.CUDA_DENAN function on hX with the given replacement value, using the double path when
/// m_dt is DataType.DOUBLE and the float path (replacement narrowed to float) otherwise.
/// </summary>
/// <remarks>
/// The trailing literal 0 in the long argument list presumably reserves the slot of the replacement value
/// (NOTE(review): confirm against the native kernel).
/// </remarks>
public void denan(int n, long hX, double dfReplacement)
{
    int nFn = (int)CUDAFN.CUDA_DENAN;

    if (m_dt == DataType.DOUBLE)
    {
        var rgdf = m_param.AsDouble(dfReplacement);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgdf, m_param.AsLong(n, hX, 0));
    }
    else
    {
        var rgf = m_param.AsFloat((float)dfReplacement);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgf, m_param.AsLong(n, hX, 0));
    }
}
7970
/// <summary>
/// Runs the CUDAFN.CUDA_IM2COL function with the image handle/offset, the channel, size, kernel, pad,
/// stride and dilation settings and the column handle/offset, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void im2col(long hDataIm, int nDataImOffset, int nChannels, int nHeight, int nWidth, int nKernelH, int nKernelW, int nPadH, int nPadW, int nStrideH, int nStrideW, int nDilationH, int nDilationW, long hDataCol, int nDataColOffset)
{
    int nFn = (int)CUDAFN.CUDA_IM2COL;
    var rgArg = m_param.AsLong(hDataIm, nDataImOffset, nChannels, nHeight, nWidth, nKernelH, nKernelW, nPadH, nPadW, nStrideH, nStrideW, nDilationH, nDilationW, hDataCol, nDataColOffset);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
7996
/// <summary>
/// Runs the CUDAFN.CUDA_IM2COL_ND function with the image handle/offset, the axis settings, the shape,
/// pad, stride and dilation handles and the column handle/offset, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void im2col_nd(long hDataIm, int nDataImOffset, int nNumSpatialAxes, int nImCount, int nChannelAxis, long hImShape, long hColShape, long hKernelShape, long hPad, long hStride, long hDilation, long hDataCol, int nDataColOffset)
{
    int nFn = (int)CUDAFN.CUDA_IM2COL_ND;
    var rgArg = m_param.AsLong(hDataIm, nDataImOffset, nNumSpatialAxes, nImCount, nChannelAxis, hImShape, hColShape, hKernelShape, hPad, hStride, hDilation, hDataCol, nDataColOffset);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8020
/// <summary>
/// Runs the CUDAFN.CUDA_COL2IM function with the column handle/offset, the channel, size, kernel, pad,
/// stride and dilation settings and the image handle/offset, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void col2im(long hDataCol, int nDataColOffset, int nChannels, int nHeight, int nWidth, int nKernelH, int nKernelW, int nPadH, int nPadW, int nStrideH, int nStrideW, int nDilationH, int nDilationW, long hDataIm, int nDataImOffset)
{
    int nFn = (int)CUDAFN.CUDA_COL2IM;
    var rgArg = m_param.AsLong(hDataCol, nDataColOffset, nChannels, nHeight, nWidth, nKernelH, nKernelW, nPadH, nPadW, nStrideH, nStrideW, nDilationH, nDilationW, hDataIm, nDataImOffset);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8046
/// <summary>
/// Runs the CUDAFN.CUDA_COL2IM_ND function with the column handle/offset, the axis settings, the shape,
/// pad, stride and dilation handles and the image handle/offset, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
public void col2im_nd(long hDataCol, int nDataColOffset, int nNumSpatialAxes, int nColCount, int nChannelAxis, long hImShape, long hColShape, long hKernelShape, long hPad, long hStride, long hDilation, long hDataIm, int nDataImOffset)
{
    int nFn = (int)CUDAFN.CUDA_COL2IM_ND;
    var rgArg = m_param.AsLong(hDataCol, nDataColOffset, nNumSpatialAxes, nColCount, nChannelAxis, hImShape, hColShape, hKernelShape, hPad, hStride, hDilation, hDataIm, nDataImOffset);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8070
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_MIN function with the count/outer/channel/inner sizes and the X/Y handles,
/// using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <remarks>
/// The bReturnIdx flag is passed to the function as 1 (true) or 0 (false).
/// </remarks>
public void channel_min(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, bool bReturnIdx = false)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_MIN;
    int nReturnIdx = (bReturnIdx) ? 1 : 0;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, nReturnIdx);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8088
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_MAX function with the count/outer/channel/inner sizes and the X/Y handles,
/// using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <remarks>
/// The bReturnIdx flag is passed to the function as 1 (true) or 0 (false).
/// </remarks>
public void channel_max(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, bool bReturnIdx = false)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_MAX;
    int nReturnIdx = (bReturnIdx) ? 1 : 0;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, nReturnIdx);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8106
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_MEAN function with the count/outer/channel/inner sizes and the X/Y handles,
/// using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void channel_mean(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_MEAN;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8123
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_COMPARE function with the count/outer/channel/inner sizes and the X/Y
/// handles, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void channel_compare(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_COMPARE;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8140
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_FILLFROM function with the count/outer/channel/inner sizes, the X/Y handles
/// and the direction, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <remarks>
/// The direction is passed to the function as its integer value (DIR.FWD = 0, DIR.BWD = 1).
/// </remarks>
public void channel_fillfrom(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, DIR dir)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_FILLFROM;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (int)dir);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8159
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_FILL function with the count/outer/channel/inner sizes, the X handle, the
/// label dimension, the labels handle and the Y handle, using the double path when m_dt is DataType.DOUBLE
/// and the float path otherwise.
/// </summary>
public void channel_fill(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, int nLabelDim, long hLabels, long hY)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_FILL;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, nLabelDim, hLabels, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8186
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_SUB function with the count/outer/channel/inner sizes and the hX, hY and hA
/// handles, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <remarks>
/// Although hA is the first handle in this method's signature, it is passed to the function last,
/// after hX and hY (NOTE(review): ordering preserved as found - confirm against the native kernel).
/// </remarks>
public void channel_sub(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hA, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_SUB;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, hA);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8204
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_SUB function with the count/outer/channel/inner sizes and only the X/Y
/// handles, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void channel_sub(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_SUB;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8221
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_SUM function with the count/outer/channel/inner sizes, the X/Y handles,
/// the sum-across-channels flag, the direction and nChannelsY, using the double path when m_dt is
/// DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <remarks>
/// The bSumAcrossChannels flag is passed as 1 (true) or 0 (false) and the direction as its integer value.
/// </remarks>
public void channel_sum(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, bool bSumAcrossChannels = true, DIR dir = DIR.FWD, int nChannelsY = -1)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_SUM;
    int nSumAcrossChannels = (bSumAcrossChannels) ? 1 : 0;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, nSumAcrossChannels, (int)dir, nChannelsY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8243
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_DIV function with the count/outer/channel/inner sizes, the X/Y handles and
/// the method selector, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void channel_div(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, int nMethod = 1)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_DIV;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, nMethod);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8261
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_MUL function with the count/outer/channel/inner sizes, the X/Y handles and
/// the method selector, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void channel_mul(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, int nMethod = 1)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_MUL;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, nMethod);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8279
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_MULV function with the count/outer/channel/inner sizes and the hA, hX and
/// hC handles, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void channel_mulv(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hA, long hX, long hC)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_MULV;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hA, hX, hC);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8297
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_SCALE function with the count/outer/channel/inner sizes and the hX, hA and
/// hY handles, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void channel_scale(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hA, long hY)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_SCALE;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hA, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8315
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_DOT function with the count/outer/channel/inner sizes and the hX, hA and
/// hY handles, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void channel_dot(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hA, long hY)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_DOT;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hA, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8333
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_DUP function with the count/outer/channel/inner sizes and the X/Y handles,
/// using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void channel_duplicate(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_DUP;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8350
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_PERCENTILE function with the count/outer/channel/inner sizes, the X/Y
/// handles and the percentile value, using the double path when m_dt is DataType.DOUBLE and the float
/// path (percentile narrowed to float) otherwise.
/// </summary>
public void channel_percentile(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, double dfPercentile)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_PERCENTILE;

    if (m_dt == DataType.DOUBLE)
    {
        var rgdf = m_param.AsDouble(dfPercentile);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgdf, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
    }
    else
    {
        var rgf = m_param.AsFloat((float)dfPercentile);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgf, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
    }
}
8368
/// <summary>
/// Validates nCount and then runs the CUDAFN.CUDA_CHANNEL_OP_FWD function with the operation, the sizes and
/// the hA, hB and hY handles, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <remarks>
/// The operation is passed to the function as its integer value (OP.MUL = 1, OP.DIV = 2, OP.ADD = 3, OP.SUB = 4).
/// </remarks>
/// <exception cref="ArgumentException">
/// Thrown when nCount does not equal max(nN1, nN2) x nC x max(nSD1, nSD2).
/// </exception>
public void channel_op_fwd(OP op, int nCount, int nC, int nN1, int nSD1, int nN2, int nSD2, long hA, long hB, long hY)
{
    // The product is computed as long so that an overflowing int multiplication cannot wrap around and
    // accidentally match nCount.
    long lExpectedCount = (long)Math.Max(nN1, nN2) * nC * Math.Max(nSD1, nSD2);
    if (lExpectedCount != nCount)
        throw new ArgumentException("The nCount must equal max(nN1, nN2) x nC x max(nSD1, nSD2).");

    var rgArg = m_param.AsLong((int)op, nCount, nC, nN1, nSD1, nN2, nSD2, hA, hB, hY);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_OP_FWD, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_OP_FWD, null, rgArg);
}
8393
/// <summary>
/// Validates nCount and then runs the CUDAFN.CUDA_CHANNEL_OP_BWD function with the operation, the sizes,
/// the data handles (hA, hB, hY), the hAd/hBd/hYd handles and the work handle, using the double path when
/// m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
/// <remarks>
/// The operation is passed to the function as its integer value (OP.MUL = 1, OP.DIV = 2, OP.ADD = 3, OP.SUB = 4).
/// </remarks>
/// <exception cref="ArgumentException">
/// Thrown when nCount does not equal max(nN1, nN2) x nC x max(nSD1, nSD2).
/// </exception>
public void channel_op_bwd(OP op, int nCount, int nC, int nN1, int nSD1, int nN2, int nSD2, int nCy,int nSDy, long hA, long hB, long hY, long hAd, long hBd, long hYd, long hWork)
{
    // The product is computed as long so that an overflowing int multiplication cannot wrap around and
    // accidentally match nCount.
    long lExpectedCount = (long)Math.Max(nN1, nN2) * nC * Math.Max(nSD1, nSD2);
    if (lExpectedCount != nCount)
        throw new ArgumentException("The nCount must equal max(nN1, nN2) x nC x max(nSD1, nSD2).");

    var rgArg = m_param.AsLong((int)op, nCount, nC, nN1, nSD1, nN2, nSD2, nCy, nSDy, hA, hB, hY, hAd, hBd, hYd, hWork);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_OP_BWD, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_OP_BWD, null, rgArg);
}
8424
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_ADD function with the count/outer/channel/block/inner sizes, the offset,
/// the X/Y handles and the direction, using the double path when m_dt is DataType.DOUBLE and the float
/// path otherwise.
/// </summary>
/// <remarks>
/// The direction is passed to the function as its integer value (DIR.FWD = 0, DIR.BWD = 1).
/// </remarks>
public void channel_add(int nCount, int nOuterNum, int nChannels, int nBlocks, int nInnerNum, int nOffset, long hX, long hY, DIR dir)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_ADD;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nBlocks, nInnerNum, nOffset, hX, hY, (int)dir);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8444
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_COPY function with the count/outer/channel/block/inner sizes, the offset,
/// the X/Y handles and the direction, using the double path when m_dt is DataType.DOUBLE and the float
/// path otherwise.
/// </summary>
/// <remarks>
/// The direction is passed to the function as its integer value (DIR.FWD = 0, DIR.BWD = 1).
/// </remarks>
public void channel_copy(int nCount, int nOuterNum, int nChannels, int nBlocks, int nInnerNum, int nOffset, long hX, long hY, DIR dir)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_COPY;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nBlocks, nInnerNum, nOffset, hX, hY, (int)dir);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8464
/// <summary>
/// Runs the CUDAFN.CUDA_CHANNEL_COPYALL function with the count/outer/channel/inner sizes and the X/Y
/// handles, using the double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void channel_copyall(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_CHANNEL_COPYALL;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8481
8482
/// <summary>
/// Runs the CUDAFN.CUDA_SUM function with the count/outer/inner sizes and the X/Y handles, using the
/// double path when m_dt is DataType.DOUBLE and the float path otherwise.
/// </summary>
public void sum(int nCount, int nOuterNum, int nInnerNum, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_SUM;
    var rgArg = m_param.AsLong(nCount, nOuterNum, nInnerNum, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
8498
/// <summary>
/// Invokes the CUDA_RNG_SETSEED kernel function with the given seed, through the double or float entry point depending on m_dt.
/// </summary>
/// <remarks>
/// The seed is the only argument and is handed over via m_param.AsDouble / m_param.AsFloat.
/// NOTE(review): if these helpers convert the long seed to float/double, large seeds may lose precision - confirm.
/// </remarks>
/// <param name="lSeed">Seed value passed to the kernel.</param>
public void rng_setseed(long lSeed)
{
    if (m_dt == DataType.DOUBLE)
    {
        var rgSeed = m_param.AsDouble(lSeed);
        m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CUDA_RNG_SETSEED, rgSeed);
    }
    else
    {
        var rgSeed = m_param.AsFloat(lSeed);
        m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CUDA_RNG_SETSEED, rgSeed);
    }
}
8513
/// <summary>
/// Overload taking double bounds: converts fMin and fMax to the base type T and calls the T-typed rng_uniform.
/// </summary>
/// <param name="n">Passed on unchanged to the T-typed overload.</param>
/// <param name="fMin">Lower bound, converted to T with Convert.ChangeType.</param>
/// <param name="fMax">Upper bound, converted to T with Convert.ChangeType.</param>
/// <param name="hY">Passed on unchanged to the T-typed overload.</param>
public void rng_uniform(int n, double fMin, double fMax, long hY)
{
    T tMin = (T)Convert.ChangeType(fMin, typeof(T));
    T tMax = (T)Convert.ChangeType(fMax, typeof(T));

    rng_uniform(n, tMin, tMax, hY);
}
8528
/// <summary>
/// Overload taking float bounds: converts fMin and fMax to the base type T and calls the T-typed rng_uniform.
/// </summary>
/// <param name="n">Passed on unchanged to the T-typed overload.</param>
/// <param name="fMin">Lower bound, converted to T with Convert.ChangeType.</param>
/// <param name="fMax">Upper bound, converted to T with Convert.ChangeType.</param>
/// <param name="hY">Passed on unchanged to the T-typed overload.</param>
public void rng_uniform(int n, float fMin, float fMax, long hY)
{
    T tMin = (T)Convert.ChangeType(fMin, typeof(T));
    T tMax = (T)Convert.ChangeType(fMax, typeof(T));

    rng_uniform(n, tMin, tMax, hY);
}
8543
/// <summary>
/// Invokes the CUDA_RNG_UNIFORM kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// Nothing is run when m_rgGhostMemory is set and m_bGhostMemoryEnabled is true.
/// fMin and fMax are sent as the floating-point arguments; the long argument list is (n, 0, 0, hY).
/// The two 0 entries are presumably placeholders for the floating-point values - confirm against the kernel.
/// </remarks>
/// <param name="n">First entry of the long argument list.</param>
/// <param name="fMin">Lower bound, converted with convertD / convertF.</param>
/// <param name="fMax">Upper bound, converted with convertD / convertF.</param>
/// <param name="hY">Last entry of the long argument list.</param>
public void rng_uniform(int n, T fMin, T fMax, long hY)
{
    // Skip the kernel call entirely while ghost memory is active.
    if (m_rgGhostMemory != null && m_bGhostMemoryEnabled)
        return;

    if (m_dt == DataType.DOUBLE)
    {
        var rgRange = m_param.AsDouble(convertD(fMin), convertD(fMax));
        var rgArg = m_param.AsLong(n, 0, 0, hY);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_RNG_UNIFORM, rgRange, rgArg);
    }
    else
    {
        var rgRange = m_param.AsFloat(convertF(fMin), convertF(fMax));
        var rgArg = m_param.AsLong(n, 0, 0, hY);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_RNG_UNIFORM, rgRange, rgArg);
    }
}
8567
/// <summary>
/// Overload taking double parameters: converts fMu and fSigma to the base type T and calls the T-typed rng_gaussian.
/// </summary>
/// <param name="n">Passed on unchanged to the T-typed overload.</param>
/// <param name="fMu">Converted to T with Convert.ChangeType.</param>
/// <param name="fSigma">Converted to T with Convert.ChangeType.</param>
/// <param name="hY">Passed on unchanged to the T-typed overload.</param>
public void rng_gaussian(int n, double fMu, double fSigma, long hY)
{
    T tMu = (T)Convert.ChangeType(fMu, typeof(T));
    T tSigma = (T)Convert.ChangeType(fSigma, typeof(T));

    rng_gaussian(n, tMu, tSigma, hY);
}
8582
/// <summary>
/// Overload taking float parameters: converts fMu and fSigma to the base type T and calls the T-typed rng_gaussian.
/// </summary>
/// <param name="n">Passed on unchanged to the T-typed overload.</param>
/// <param name="fMu">Converted to T with Convert.ChangeType.</param>
/// <param name="fSigma">Converted to T with Convert.ChangeType.</param>
/// <param name="hY">Passed on unchanged to the T-typed overload.</param>
public void rng_gaussian(int n, float fMu, float fSigma, long hY)
{
    T tMu = (T)Convert.ChangeType(fMu, typeof(T));
    T tSigma = (T)Convert.ChangeType(fSigma, typeof(T));

    rng_gaussian(n, tMu, tSigma, hY);
}
8597
/// <summary>
/// Invokes the CUDA_RNG_GAUSSIAN kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// Nothing is run when m_rgGhostMemory is set and m_bGhostMemoryEnabled is true.
/// fMu and fSigma are sent as the floating-point arguments; the long argument list is (n, 0, 0, hY).
/// The two 0 entries are presumably placeholders for the floating-point values - confirm against the kernel.
/// </remarks>
/// <param name="n">First entry of the long argument list.</param>
/// <param name="fMu">Converted with convertD / convertF and sent as the first floating-point argument.</param>
/// <param name="fSigma">Converted with convertD / convertF and sent as the second floating-point argument.</param>
/// <param name="hY">Last entry of the long argument list.</param>
public void rng_gaussian(int n, T fMu, T fSigma, long hY)
{
    // Skip the kernel call entirely while ghost memory is active.
    if (m_rgGhostMemory != null && m_bGhostMemoryEnabled)
        return;

    if (m_dt == DataType.DOUBLE)
    {
        var rgDist = m_param.AsDouble(convertD(fMu), convertD(fSigma));
        var rgArg = m_param.AsLong(n, 0, 0, hY);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_RNG_GAUSSIAN, rgDist, rgArg);
    }
    else
    {
        var rgDist = m_param.AsFloat(convertF(fMu), convertF(fSigma));
        var rgArg = m_param.AsLong(n, 0, 0, hY);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_RNG_GAUSSIAN, rgDist, rgArg);
    }
}
8621
/// <summary>
/// Overload taking a double probability: converts fNonZeroProb to the base type T and calls the T-typed rng_bernoulli.
/// </summary>
/// <param name="n">Passed on unchanged to the T-typed overload.</param>
/// <param name="fNonZeroProb">Converted to T with Convert.ChangeType.</param>
/// <param name="hY">Passed on unchanged to the T-typed overload.</param>
public void rng_bernoulli(int n, double fNonZeroProb, long hY)
{
    T tNonZeroProb = (T)Convert.ChangeType(fNonZeroProb, typeof(T));

    rng_bernoulli(n, tNonZeroProb, hY);
}
8635
/// <summary>
/// Overload taking a float probability: converts fNonZeroProb to the base type T and calls the T-typed rng_bernoulli.
/// </summary>
/// <param name="n">Passed on unchanged to the T-typed overload.</param>
/// <param name="fNonZeroProb">Converted to T with Convert.ChangeType.</param>
/// <param name="hY">Passed on unchanged to the T-typed overload.</param>
public void rng_bernoulli(int n, float fNonZeroProb, long hY)
{
    T tNonZeroProb = (T)Convert.ChangeType(fNonZeroProb, typeof(T));

    rng_bernoulli(n, tNonZeroProb, hY);
}
8649
/// <summary>
/// Fills the first n items of the memory behind hY with one/zero values drawn by fill_random.
/// </summary>
/// <remarks>
/// The values are generated on the host: the memory is read with GetMemory, the leading n items are
/// replaced, and the array is written back with SetMemory. Items past n keep their previous content.
/// Previously n was ignored and the whole buffer was overwritten.
/// </remarks>
/// <param name="n">Number of items to fill; clamped to the range [0, length of the memory read from hY].</param>
/// <param name="fNonZeroProb">Probability threshold handed to fill_random.</param>
/// <param name="hY">Handle of the memory to read, fill and write back.</param>
public void rng_bernoulli(int n, T fNonZeroProb, long hY)
{
    T[] rgData = GetMemory(hY);

    // Clamp the requested item count to what the buffer actually holds.
    int nFill = n;
    if (nFill < 0)
        nFill = 0;
    if (nFill > rgData.Length)
        nFill = rgData.Length;

    if (nFill == rgData.Length)
    {
        fill_random(fNonZeroProb, rgData);
    }
    else
    {
        // fill_random always fills a whole array, so sample into a scratch array of the
        // requested size and copy it over the leading items only.
        T[] rgSample = new T[nFill];
        fill_random(fNonZeroProb, rgSample);
        Array.Copy(rgSample, rgData, nFill);
    }

    SetMemory(hY, rgData);
}
8670
8671#pragma warning disable 1591
8672
/// <summary>
/// Overwrites every element of rg with m_tOne or m_tZero, chosen by comparing a fresh m_random.NextDouble() sample with fNonZeroProb.
/// </summary>
/// <param name="fNonZeroProb">Threshold (converted to double); an element becomes m_tOne when its sample is less than or equal to it.</param>
/// <param name="rg">Array that is filled in place.</param>
public void fill_random(T fNonZeroProb, T[] rg)
{
    double dfThreshold = Utility.ConvertVal<T>(fNonZeroProb);
    int nIdx = 0;

    while (nIdx < rg.Length)
    {
        double dfSample = m_random.NextDouble();

        if (dfSample <= dfThreshold)
            rg[nIdx] = m_tOne;
        else
            rg[nIdx] = m_tZero;

        nIdx++;
    }
}
8683
8684#pragma warning restore 1591
8685
8686
/// <summary>
/// Invokes the CUDA_ACCURACY_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// The long argument list is (nCount, nOuterNum, nInnerNum, hBottomData, hBottomLabel, hAccData, hAccTotals,
/// bLastElementOnly as 1/0, nBatch), followed by the ignore label only when one is supplied.
/// </remarks>
/// <param name="nCount">First entry of the argument list.</param>
/// <param name="nOuterNum">Second entry of the argument list.</param>
/// <param name="nInnerNum">Third entry of the argument list.</param>
/// <param name="hBottomData">Fourth entry of the argument list.</param>
/// <param name="hBottomLabel">Fifth entry of the argument list.</param>
/// <param name="hAccData">Sixth entry of the argument list.</param>
/// <param name="hAccTotals">Seventh entry of the argument list.</param>
/// <param name="nIgnoreLabel">Optional label; appended as the last entry when it has a value, otherwise omitted.</param>
/// <param name="bLastElementOnly">Sent as 1 when true and 0 when false.</param>
/// <param name="nBatch">Ninth entry of the argument list.</param>
public void accuracy_fwd(int nCount, int nOuterNum, int nInnerNum, long hBottomData, long hBottomLabel, long hAccData, long hAccTotals, int? nIgnoreLabel, bool bLastElementOnly, int nBatch)
{
    long lLastElementOnly = bLastElementOnly ? 1 : 0;
    List<long> rgArg = new List<long>() { nCount, nOuterNum, nInnerNum, hBottomData, hBottomLabel, hAccData, hAccTotals, lLastElementOnly, nBatch };

    if (nIgnoreLabel.HasValue)
        rgArg.Add(nIgnoreLabel.Value);

    long[] rgParam = rgArg.ToArray();

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ACCURACY_FWD, null, rgParam);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ACCURACY_FWD, null, rgParam);
    }
}
8717
8718
/// <summary>
/// Invokes the CUDA_BATCHREIDX_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, nInnerDim, hBottomData, hPermutData and hTopData are passed on unchanged, in that order.
/// </remarks>
public void batchreidx_fwd(int nCount, int nInnerDim, long hBottomData, long hPermutData, long hTopData)
{
    var rgArg = m_param.AsLong(nCount, nInnerDim, hBottomData, hPermutData, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_BATCHREIDX_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_BATCHREIDX_FWD, null, rgArg);
    }
}
8734
/// <summary>
/// Invokes the CUDA_BATCHREIDX_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, nInnerDim, hTopDiff, hTopIdx, hBegins, hCounts and hBottomDiff are passed on unchanged, in that order.
/// </remarks>
public void batchreidx_bwd(int nCount, int nInnerDim, long hTopDiff, long hTopIdx, long hBegins, long hCounts, long hBottomDiff)
{
    var rgArg = m_param.AsLong(nCount, nInnerDim, hTopDiff, hTopIdx, hBegins, hCounts, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_BATCHREIDX_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_BATCHREIDX_BWD, null, rgArg);
    }
}
8752
/// <summary>
/// Invokes the CUDA_EMBED_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hBottomData, hWeight, nM, nN, nK and hTopData are passed on unchanged, in that order.
/// </remarks>
public void embed_fwd(int nCount, long hBottomData, long hWeight, int nM, int nN, int nK, long hTopData)
{
    var rgArg = m_param.AsLong(nCount, hBottomData, hWeight, nM, nN, nK, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_EMBED_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_EMBED_FWD, null, rgArg);
    }
}
8770
/// <summary>
/// Invokes the CUDA_EMBED_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hBottomData, hTopDiff, nM, nN, nK and hWeightDiff are passed on unchanged, in that order.
/// </remarks>
public void embed_bwd(int nCount, long hBottomData, long hTopDiff, int nM, int nN, int nK, long hWeightDiff)
{
    var rgArg = m_param.AsLong(nCount, hBottomData, hTopDiff, nM, nN, nK, hWeightDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_EMBED_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_EMBED_BWD, null, rgArg);
    }
}
8788
/// <summary>
/// Invokes the CUDA_POOL_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// The pooling method is sent as its integer value, followed by the remaining arguments unchanged in declaration order.
/// </remarks>
public void pooling_fwd(POOLING_METHOD method, int nCount, long hBottomData, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hTopData, long hMask, long hTopMask)
{
    int nMethod = (int)method;
    var rgArg = m_param.AsLong(nMethod, nCount, hBottomData, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hTopData, hMask, hTopMask);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_POOL_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_POOL_FWD, null, rgArg);
    }
}
8817
/// <summary>
/// Invokes the CUDA_POOL_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// The pooling method is sent as its integer value, followed by the remaining arguments unchanged in declaration order.
/// </remarks>
public void pooling_bwd(POOLING_METHOD method, int nCount, long hTopDiff, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hBottomDiff, long hMask, long hTopMask)
{
    int nMethod = (int)method;
    var rgArg = m_param.AsLong(nMethod, nCount, hTopDiff, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hBottomDiff, hMask, hTopMask);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_POOL_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_POOL_BWD, null, rgArg);
    }
}
8846
/// <summary>
/// Invokes the CUDA_UNPOOL_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// The pooling method is sent as its integer value, followed by the remaining arguments unchanged in declaration order.
/// </remarks>
public void unpooling_fwd(POOLING_METHOD method, int nCount, long hBottomData, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hTopData, long hMask)
{
    int nMethod = (int)method;
    var rgArg = m_param.AsLong(nMethod, nCount, hBottomData, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hTopData, hMask);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_UNPOOL_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_UNPOOL_FWD, null, rgArg);
    }
}
8874
/// <summary>
/// Invokes the CUDA_UNPOOL_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// The pooling method is sent as its integer value, followed by the remaining arguments unchanged in declaration order.
/// </remarks>
public void unpooling_bwd(POOLING_METHOD method, int nCount, long hTopDiff, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hBottomDiff, long hMask)
{
    int nMethod = (int)method;
    var rgArg = m_param.AsLong(nMethod, nCount, hTopDiff, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hBottomDiff, hMask);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_UNPOOL_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_UNPOOL_BWD, null, rgArg);
    }
}
8902
/// <summary>
/// Invokes the CUDA_CLIP_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// fMin and fMax are converted with convertD1 / convertF1 and sent as the floating-point arguments.
/// The long argument list is (nCount, hBottomData, hTopData, 0, 0); the trailing 0 entries are presumably
/// placeholders for the two floating-point values - confirm against the kernel.
/// </remarks>
public void clip_fwd(int nCount, long hBottomData, long hTopData, T fMin, T fMax)
{
    if (m_dt == DataType.DOUBLE)
    {
        var rgRange = m_param.AsDouble(convertD1(fMin), convertD1(fMax));
        var rgArg = m_param.AsLong(nCount, hBottomData, hTopData, 0, 0);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CLIP_FWD, rgRange, rgArg);
    }
    else
    {
        var rgRange = m_param.AsFloat(convertF1(fMin), convertF1(fMax));
        var rgArg = m_param.AsLong(nCount, hBottomData, hTopData, 0, 0);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CLIP_FWD, rgRange, rgArg);
    }
}
8921
/// <summary>
/// Invokes the CUDA_CLIP_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// fMin and fMax are converted with convertD1 / convertF1 and sent as the floating-point arguments.
/// The long argument list is (nCount, hTopDiff, hBottomData, hBottomDiff, 0, 0); the trailing 0 entries are
/// presumably placeholders for the two floating-point values - confirm against the kernel.
/// </remarks>
public void clip_bwd(int nCount, long hTopDiff, long hBottomData, long hBottomDiff, T fMin, T fMax)
{
    if (m_dt == DataType.DOUBLE)
    {
        var rgRange = m_param.AsDouble(convertD1(fMin), convertD1(fMax));
        var rgArg = m_param.AsLong(nCount, hTopDiff, hBottomData, hBottomDiff, 0, 0);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CLIP_BWD, rgRange, rgArg);
    }
    else
    {
        var rgRange = m_param.AsFloat(convertF1(fMin), convertF1(fMax));
        var rgArg = m_param.AsLong(nCount, hTopDiff, hBottomData, hBottomDiff, 0, 0);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CLIP_BWD, rgRange, rgArg);
    }
}
8938
/// <summary>
/// Invokes the CUDA_MATH_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hBottomData and hTopData are passed on unchanged, followed by the math function as its integer value.
/// </remarks>
public void math_fwd(int nCount, long hBottomData, long hTopData, MATH_FUNCTION function)
{
    int nFunction = (int)function;
    var rgArg = m_param.AsLong(nCount, hBottomData, hTopData, nFunction);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MATH_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MATH_FWD, null, rgArg);
    }
}
8956
/// <summary>
/// Invokes the CUDA_MATH_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hTopDiff, hTopData, hBottomDiff and hBottomData are passed on unchanged, followed by the math function as its integer value.
/// </remarks>
public void math_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData, MATH_FUNCTION function)
{
    int nFunction = (int)function;
    var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, nFunction);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MATH_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MATH_BWD, null, rgArg);
    }
}
8973
/// <summary>
/// Invokes the CUDA_MEAN_ERROR_LOSS_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hPredicted, hTarget and hBottomDiff are passed on unchanged, followed by merr as its integer value.
/// </remarks>
public void mean_error_loss_bwd(int nCount, long hPredicted, long hTarget, long hBottomDiff, MEAN_ERROR merr)
{
    int nMeanErr = (int)merr;
    var rgArg = m_param.AsLong(nCount, hPredicted, hTarget, hBottomDiff, nMeanErr);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MEAN_ERROR_LOSS_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MEAN_ERROR_LOSS_BWD, null, rgArg);
    }
}
8998
/// <summary>
/// Invokes the CUDA_MISH_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// dfThreshold is the single floating-point argument (narrowed to float on the float path).
/// The long argument list is (nCount, hBottomData, hTopData, 0); the trailing 0 is presumably a
/// placeholder for the threshold - confirm against the kernel.
/// </remarks>
public void mish_fwd(int nCount, long hBottomData, long hTopData, double dfThreshold)
{
    if (m_dt == DataType.DOUBLE)
    {
        var rgThreshold = m_param.AsDouble(dfThreshold);
        var rgArg = m_param.AsLong(nCount, hBottomData, hTopData, 0);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MISH_FWD, rgThreshold, rgArg);
    }
    else
    {
        var rgThreshold = m_param.AsFloat((float)dfThreshold);
        var rgArg = m_param.AsLong(nCount, hBottomData, hTopData, 0);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MISH_FWD, rgThreshold, rgArg);
    }
}
9018
/// <summary>
/// Invokes the CUDA_MISH_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// dfThreshold is the single floating-point argument (narrowed to float on the float path).
/// The long argument list is (nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, 0, nMethod); the 0 is
/// presumably a placeholder for the threshold - confirm against the kernel. nMethod defaults to 0.
/// </remarks>
public void mish_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData, double dfThreshold, int nMethod = 0)
{
    if (m_dt == DataType.DOUBLE)
    {
        var rgThreshold = m_param.AsDouble(dfThreshold);
        var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, 0, nMethod);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MISH_BWD, rgThreshold, rgArg);
    }
    else
    {
        var rgThreshold = m_param.AsFloat((float)dfThreshold);
        var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, 0, nMethod);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MISH_BWD, rgThreshold, rgArg);
    }
}
9042
/// <summary>
/// Invokes the CUDA_GELU_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hBottomData and hTopData are passed on unchanged, followed by bEnableBertVersion as 1 (true) or 0 (false).
/// </remarks>
public void gelu_fwd(int nCount, long hBottomData, long hTopData, bool bEnableBertVersion)
{
    int nBertVersion = bEnableBertVersion ? 1 : 0;
    var rgArg = m_param.AsLong(nCount, hBottomData, hTopData, nBertVersion);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GELU_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GELU_FWD, null, rgArg);
    }
}
9071
/// <summary>
/// Invokes the CUDA_GELU_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hTopDiff, hTopData, hBottomDiff and hBottomData are passed on unchanged, followed by
/// bEnableBertVersion as 1 (true) or 0 (false).
/// </remarks>
public void gelu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData, bool bEnableBertVersion)
{
    int nBertVersion = bEnableBertVersion ? 1 : 0;
    var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, nBertVersion);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GELU_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GELU_BWD, null, rgArg);
    }
}
9105
/// <summary>
/// Invokes the CUDA_SILU_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hBottomData and hTopData are passed on unchanged, in that order.
/// </remarks>
public void silu_fwd(int nCount, long hBottomData, long hTopData)
{
    var rgArg = m_param.AsLong(nCount, hBottomData, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SILU_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SILU_FWD, null, rgArg);
    }
}
9125
/// <summary>
/// Invokes the CUDA_SILU_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hTopDiff, hTopData, hBottomDiff and hBottomData are passed on unchanged, in that order.
/// </remarks>
public void silu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData)
{
    var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SILU_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SILU_BWD, null, rgArg);
    }
}
9147
9148
/// <summary>
/// Invokes the CUDA_SOFTPLUS_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hBottomData and hTopData are passed on unchanged, in that order.
/// </remarks>
public void softplus_fwd(int nCount, long hBottomData, long hTopData)
{
    var rgArg = m_param.AsLong(nCount, hBottomData, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SOFTPLUS_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SOFTPLUS_FWD, null, rgArg);
    }
}
9168
/// <summary>
/// Invokes the CUDA_SOFTPLUS_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hTopDiff, hTopData, hBottomDiff and hBottomData are passed on unchanged, in that order.
/// </remarks>
public void softplus_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData)
{
    var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SOFTPLUS_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SOFTPLUS_BWD, null, rgArg);
    }
}
9190
/// <summary>
/// Invokes the CUDA_LECUN_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hBottomData and hTopData are passed on unchanged, in that order.
/// </remarks>
public void lecun_fwd(int nCount, long hBottomData, long hTopData)
{
    var rgArg = m_param.AsLong(nCount, hBottomData, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LECUN_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LECUN_FWD, null, rgArg);
    }
}
9210
/// <summary>
/// Invokes the CUDA_LECUN_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hTopDiff, hTopData, hBottomDiff and hBottomData are passed on unchanged, in that order.
/// </remarks>
public void lecun_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData)
{
    var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LECUN_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LECUN_BWD, null, rgArg);
    }
}
9232
/// <summary>
/// Invokes the CUDA_SERF_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// dfThreshold is the single floating-point argument (narrowed to float on the float path).
/// The long argument list is (nCount, hBottomData, hTopData, 0); the trailing 0 is presumably a
/// placeholder for the threshold - confirm against the kernel.
/// </remarks>
public void serf_fwd(int nCount, long hBottomData, long hTopData, double dfThreshold)
{
    if (m_dt == DataType.DOUBLE)
    {
        var rgThreshold = m_param.AsDouble(dfThreshold);
        var rgArg = m_param.AsLong(nCount, hBottomData, hTopData, 0);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SERF_FWD, rgThreshold, rgArg);
    }
    else
    {
        var rgThreshold = m_param.AsFloat((float)dfThreshold);
        var rgArg = m_param.AsLong(nCount, hBottomData, hTopData, 0);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SERF_FWD, rgThreshold, rgArg);
    }
}
9252
/// <summary>
/// Invokes the CUDA_SERF_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// dfThreshold is the single floating-point argument (narrowed to float on the float path).
/// The long argument list is (nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, 0); the trailing 0 is
/// presumably a placeholder for the threshold - confirm against the kernel.
/// </remarks>
public void serf_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData, double dfThreshold)
{
    if (m_dt == DataType.DOUBLE)
    {
        var rgThreshold = m_param.AsDouble(dfThreshold);
        var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, 0);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SERF_BWD, rgThreshold, rgArg);
    }
    else
    {
        var rgThreshold = m_param.AsFloat((float)dfThreshold);
        var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, hBottomData, 0);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SERF_BWD, rgThreshold, rgArg);
    }
}
9274
/// <summary>
/// Invokes the CUDA_TANH_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hBottomData and hTopData are passed on unchanged, in that order.
/// </remarks>
public void tanh_fwd(int nCount, long hBottomData, long hTopData)
{
    var rgArg = m_param.AsLong(nCount, hBottomData, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TANH_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TANH_FWD, null, rgArg);
    }
}
9293
/// <summary>
/// Invokes the CUDA_TANH_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hTopDiff, hTopData and hBottomDiff are passed on unchanged, in that order.
/// </remarks>
public void tanh_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff)
{
    var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TANH_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TANH_BWD, null, rgArg);
    }
}
9311
/// <summary>
/// Invokes the CUDA_SIGMOID_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hBottomData and hTopData are passed on unchanged, in that order.
/// </remarks>
public void sigmoid_fwd(int nCount, long hBottomData, long hTopData)
{
    var rgArg = m_param.AsLong(nCount, hBottomData, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SIGMOID_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SIGMOID_FWD, null, rgArg);
    }
}
9330
/// <summary>
/// Invokes the CUDA_SIGMOID_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hTopDiff, hTopData and hBottomDiff are passed on unchanged, in that order.
/// </remarks>
public void sigmoid_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff)
{
    var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SIGMOID_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SIGMOID_BWD, null, rgArg);
    }
}
9348
/// <summary>
/// Invokes the CUDA_SWISH_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// dfBeta is the single floating-point argument (narrowed to float on the float path).
/// The long argument list is (nCount, hTopDiff, hTopData, hSigmoidOutputData, hBottomDiff, 0); the trailing 0
/// is presumably a placeholder for beta - confirm against the kernel.
/// </remarks>
public void swish_bwd(int nCount, long hTopDiff, long hTopData, long hSigmoidOutputData, long hBottomDiff, double dfBeta)
{
    if (m_dt == DataType.DOUBLE)
    {
        var rgBeta = m_param.AsDouble(dfBeta);
        var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hSigmoidOutputData, hBottomDiff, 0);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SWISH_BWD, rgBeta, rgArg);
    }
    else
    {
        var rgBeta = m_param.AsFloat((float)dfBeta);
        var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hSigmoidOutputData, hBottomDiff, 0);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SWISH_BWD, rgBeta, rgArg);
    }
}
9368
/// <summary>
/// Invokes the CUDA_RELU_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// fNegativeSlope is converted with convertD / convertF and sent as the single floating-point argument.
/// The long argument list is (nCount, hBottomData, hTopData, 0); the trailing 0 is presumably a
/// placeholder for the slope - confirm against the kernel.
/// </remarks>
public void relu_fwd(int nCount, long hBottomData, long hTopData, T fNegativeSlope)
{
    if (m_dt == DataType.DOUBLE)
    {
        var rgSlope = m_param.AsDouble(convertD(fNegativeSlope));
        var rgArg = m_param.AsLong(nCount, hBottomData, hTopData, 0);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_RELU_FWD, rgSlope, rgArg);
    }
    else
    {
        var rgSlope = m_param.AsFloat(convertF(fNegativeSlope));
        var rgArg = m_param.AsLong(nCount, hBottomData, hTopData, 0);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_RELU_FWD, rgSlope, rgArg);
    }
}
9390
/// <summary>
/// Invokes the CUDA_RELU_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// fNegativeSlope is converted with convertD / convertF and sent as the single floating-point argument.
/// The long argument list is (nCount, hTopDiff, hTopData, hBottomDiff, 0); the trailing 0 is presumably a
/// placeholder for the slope - confirm against the kernel.
/// </remarks>
public void relu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, T fNegativeSlope)
{
    if (m_dt == DataType.DOUBLE)
    {
        var rgSlope = m_param.AsDouble(convertD(fNegativeSlope));
        var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, 0);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_RELU_BWD, rgSlope, rgArg);
    }
    else
    {
        var rgSlope = m_param.AsFloat(convertF(fNegativeSlope));
        var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hBottomDiff, 0);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_RELU_BWD, rgSlope, rgArg);
    }
}
9411
/// <summary>
/// Invokes the CUDA_ELU_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// dfAlpha is the single floating-point argument (narrowed to float on the float path).
/// The long argument list is (nCount, hBottomData, hTopData, 0); the trailing 0 is presumably a
/// placeholder for alpha - confirm against the kernel.
/// </remarks>
public void elu_fwd(int nCount, long hBottomData, long hTopData, double dfAlpha)
{
    if (m_dt == DataType.DOUBLE)
    {
        var rgAlpha = m_param.AsDouble(dfAlpha);
        var rgArg = m_param.AsLong(nCount, hBottomData, hTopData, 0);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ELU_FWD, rgAlpha, rgArg);
    }
    else
    {
        var rgAlpha = m_param.AsFloat((float)dfAlpha);
        var rgArg = m_param.AsLong(nCount, hBottomData, hTopData, 0);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ELU_FWD, rgAlpha, rgArg);
    }
}
9431
/// <summary>
/// Invokes the CUDA_ELU_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// dfAlpha is the single floating-point argument (narrowed to float on the float path).
/// The long argument list is (nCount, hTopDiff, hTopData, hBottomData, hBottomDiff, 0); note that hBottomData
/// precedes hBottomDiff here. The trailing 0 is presumably a placeholder for alpha - confirm against the kernel.
/// </remarks>
public void elu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomData, long hBottomDiff, double dfAlpha)
{
    if (m_dt == DataType.DOUBLE)
    {
        var rgAlpha = m_param.AsDouble(dfAlpha);
        var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hBottomData, hBottomDiff, 0);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ELU_BWD, rgAlpha, rgArg);
    }
    else
    {
        var rgAlpha = m_param.AsFloat((float)dfAlpha);
        var rgArg = m_param.AsLong(nCount, hTopDiff, hTopData, hBottomData, hBottomDiff, 0);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ELU_BWD, rgAlpha, rgArg);
    }
}
9451
/// <summary>
/// Invokes the CUDA_DROPOUT_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// fScale is converted with convertD / convertF and sent as the single floating-point argument.
/// The long argument list is (nCount, hBottomData, hMask, uiThreshold, 0, hTopData); the 0 is presumably a
/// placeholder for the scale - confirm against the kernel.
/// </remarks>
public void dropout_fwd(int nCount, long hBottomData, long hMask, uint uiThreshold, T fScale, long hTopData)
{
    if (m_dt == DataType.DOUBLE)
    {
        var rgScale = m_param.AsDouble(convertD(fScale));
        var rgArg = m_param.AsLong(nCount, hBottomData, hMask, uiThreshold, 0, hTopData);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_DROPOUT_FWD, rgScale, rgArg);
    }
    else
    {
        var rgScale = m_param.AsFloat(convertF(fScale));
        var rgArg = m_param.AsLong(nCount, hBottomData, hMask, uiThreshold, 0, hTopData);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_DROPOUT_FWD, rgScale, rgArg);
    }
}
9471
/// <summary>
/// Invokes the CUDA_DROPOUT_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// fScale is converted with convertD / convertF and sent as the single floating-point argument.
/// The long argument list is (nCount, hTopDiff, hMask, uiThreshold, 0, hBottomDiff); the 0 is presumably a
/// placeholder for the scale - confirm against the kernel.
/// </remarks>
public void dropout_bwd(int nCount, long hTopDiff, long hMask, uint uiThreshold, T fScale, long hBottomDiff)
{
    if (m_dt == DataType.DOUBLE)
    {
        var rgScale = m_param.AsDouble(convertD(fScale));
        var rgArg = m_param.AsLong(nCount, hTopDiff, hMask, uiThreshold, 0, hBottomDiff);
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_DROPOUT_BWD, rgScale, rgArg);
    }
    else
    {
        var rgScale = m_param.AsFloat(convertF(fScale));
        var rgArg = m_param.AsLong(nCount, hTopDiff, hMask, uiThreshold, 0, hBottomDiff);
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_DROPOUT_BWD, rgScale, rgArg);
    }
}
9491
/// <summary>
/// Invokes the CUDA_BNLL_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hBottomData and hTopData are passed on unchanged, in that order.
/// </remarks>
public void bnll_fwd(int nCount, long hBottomData, long hTopData)
{
    var rgArg = m_param.AsLong(nCount, hBottomData, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_BNLL_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_BNLL_FWD, null, rgArg);
    }
}
9508
/// <summary>
/// Invokes the CUDA_BNLL_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hTopDiff, hBottomData and hBottomDiff are passed on unchanged, in that order.
/// </remarks>
public void bnll_bwd(int nCount, long hTopDiff, long hBottomData, long hBottomDiff)
{
    var rgArg = m_param.AsLong(nCount, hTopDiff, hBottomData, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_BNLL_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_BNLL_BWD, null, rgArg);
    }
}
9523
/// <summary>
/// Invokes the CUDA_PRELU_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, nChannels, nDim, hBottomData, hTopData, hSlopeData and nDivFactor are passed on unchanged, in that order.
/// </remarks>
public void prelu_fwd(int nCount, int nChannels, int nDim, long hBottomData, long hTopData, long hSlopeData, int nDivFactor)
{
    var rgArg = m_param.AsLong(nCount, nChannels, nDim, hBottomData, hTopData, hSlopeData, nDivFactor);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_PRELU_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_PRELU_FWD, null, rgArg);
    }
}
9547
9548
/// <summary>
/// Invokes the CUDA_PRELU_BWD_PARAM kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCDim, nNum, nTopOffset, hTopDiff, hBottomData and hBackBuffDiff are passed on unchanged, in that order.
/// </remarks>
public void prelu_bwd_param(int nCDim, int nNum, int nTopOffset, long hTopDiff, long hBottomData, long hBackBuffDiff)
{
    var rgArg = m_param.AsLong(nCDim, nNum, nTopOffset, hTopDiff, hBottomData, hBackBuffDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_PRELU_BWD_PARAM, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_PRELU_BWD_PARAM, null, rgArg);
    }
}
9569
/// <summary>
/// Invokes the CUDA_PRELU_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, nChannels, nDim, hTopDiff, hBottomData, hBottomDiff, hSlopeData and nDivFactor are passed on unchanged, in that order.
/// </remarks>
public void prelu_bwd(int nCount, int nChannels, int nDim, long hTopDiff, long hBottomData, long hBottomDiff, long hSlopeData, int nDivFactor)
{
    var rgArg = m_param.AsLong(nCount, nChannels, nDim, hTopDiff, hBottomData, hBottomDiff, hSlopeData, nDivFactor);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_PRELU_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_PRELU_BWD, null, rgArg);
    }
}
9592
/// <summary>
/// Invokes the CUDA_SOFTMAXLOSS_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// The long argument list is (nCount, hProbData, hLabel, hLossData, nOuterNum, nDim, nInnerNum, hCounts),
/// followed by the ignore label only when one is supplied.
/// </remarks>
/// <param name="nCount">First entry of the argument list.</param>
/// <param name="hProbData">Second entry of the argument list.</param>
/// <param name="hLabel">Third entry of the argument list.</param>
/// <param name="hLossData">Fourth entry of the argument list.</param>
/// <param name="nOuterNum">Fifth entry of the argument list.</param>
/// <param name="nDim">Sixth entry of the argument list.</param>
/// <param name="nInnerNum">Seventh entry of the argument list.</param>
/// <param name="hCounts">Eighth entry of the argument list.</param>
/// <param name="nIgnoreLabel">Optional label; appended as the last entry when it has a value, otherwise omitted.</param>
public void softmaxloss_fwd(int nCount, long hProbData, long hLabel, long hLossData, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
{
    List<long> rgArg = new List<long>() { nCount, hProbData, hLabel, hLossData, nOuterNum, nDim, nInnerNum, hCounts };

    if (nIgnoreLabel.HasValue)
        rgArg.Add(nIgnoreLabel.Value);

    long[] rgParam = rgArg.ToArray();

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SOFTMAXLOSS_FWD, null, rgParam);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SOFTMAXLOSS_FWD, null, rgParam);
    }
}
9626
/// <summary>
/// Invokes the CUDA_SOFTMAXLOSS_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// The long argument list is (nCount, hTopData, hLabel, hBottomDiff, nOuterNum, nDim, nInnerNum, hCounts),
/// followed by the ignore label only when one is supplied.
/// </remarks>
/// <param name="nCount">First entry of the argument list.</param>
/// <param name="hTopData">Second entry of the argument list.</param>
/// <param name="hLabel">Third entry of the argument list.</param>
/// <param name="hBottomDiff">Fourth entry of the argument list.</param>
/// <param name="nOuterNum">Fifth entry of the argument list.</param>
/// <param name="nDim">Sixth entry of the argument list.</param>
/// <param name="nInnerNum">Seventh entry of the argument list.</param>
/// <param name="hCounts">Eighth entry of the argument list.</param>
/// <param name="nIgnoreLabel">Optional label; appended as the last entry when it has a value, otherwise omitted.</param>
public void softmaxloss_bwd(int nCount, long hTopData, long hLabel, long hBottomDiff, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
{
    List<long> rgArg = new List<long>() { nCount, hTopData, hLabel, hBottomDiff, nOuterNum, nDim, nInnerNum, hCounts };

    if (nIgnoreLabel.HasValue)
        rgArg.Add(nIgnoreLabel.Value);

    long[] rgParam = rgArg.ToArray();

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SOFTMAXLOSS_BWD, null, rgParam);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SOFTMAXLOSS_BWD, null, rgParam);
    }
}
9660
/// <summary>
/// Invokes the CUDA_NLLLOSS_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// The long argument list is (nCount, hProbData, hLabel, hLossData, nOuterNum, nDim, nInnerNum, hCounts),
/// followed by the ignore label only when one is supplied.
/// </remarks>
/// <param name="nCount">First entry of the argument list.</param>
/// <param name="hProbData">Second entry of the argument list.</param>
/// <param name="hLabel">Third entry of the argument list.</param>
/// <param name="hLossData">Fourth entry of the argument list.</param>
/// <param name="nOuterNum">Fifth entry of the argument list.</param>
/// <param name="nDim">Sixth entry of the argument list.</param>
/// <param name="nInnerNum">Seventh entry of the argument list.</param>
/// <param name="hCounts">Eighth entry of the argument list.</param>
/// <param name="nIgnoreLabel">Optional label; appended as the last entry when it has a value, otherwise omitted.</param>
public void nllloss_fwd(int nCount, long hProbData, long hLabel, long hLossData, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
{
    List<long> rgArg = new List<long>() { nCount, hProbData, hLabel, hLossData, nOuterNum, nDim, nInnerNum, hCounts };

    if (nIgnoreLabel.HasValue)
        rgArg.Add(nIgnoreLabel.Value);

    long[] rgParam = rgArg.ToArray();

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_NLLLOSS_FWD, null, rgParam);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_NLLLOSS_FWD, null, rgParam);
    }
}
9694
/// <summary>
/// Invokes the CUDA_NLLLOSS_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// The long argument list is (nCount, hTopData, hLabel, hBottomDiff, nOuterNum, nDim, nInnerNum, hCounts),
/// followed by the ignore label only when one is supplied.
/// </remarks>
/// <param name="nCount">First entry of the argument list.</param>
/// <param name="hTopData">Second entry of the argument list.</param>
/// <param name="hLabel">Third entry of the argument list.</param>
/// <param name="hBottomDiff">Fourth entry of the argument list.</param>
/// <param name="nOuterNum">Fifth entry of the argument list.</param>
/// <param name="nDim">Sixth entry of the argument list.</param>
/// <param name="nInnerNum">Seventh entry of the argument list.</param>
/// <param name="hCounts">Eighth entry of the argument list.</param>
/// <param name="nIgnoreLabel">Optional label; appended as the last entry when it has a value, otherwise omitted.</param>
public void nllloss_bwd(int nCount, long hTopData, long hLabel, long hBottomDiff, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
{
    List<long> rgArg = new List<long>() { nCount, hTopData, hLabel, hBottomDiff, nOuterNum, nDim, nInnerNum, hCounts };

    if (nIgnoreLabel.HasValue)
        rgArg.Add(nIgnoreLabel.Value);

    long[] rgParam = rgArg.ToArray();

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_NLLLOSS_BWD, null, rgParam);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_NLLLOSS_BWD, null, rgParam);
    }
}
9728
9729
/// <summary>
/// Invokes the CUDA_MAX_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hBottomDataA, hBottomDataB, nIdx, hTopData and hMask are passed on unchanged, in that order.
/// </remarks>
public void max_fwd(int nCount, long hBottomDataA, long hBottomDataB, int nIdx, long hTopData, long hMask)
{
    var rgArg = m_param.AsLong(nCount, hBottomDataA, hBottomDataB, nIdx, hTopData, hMask);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAX_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAX_FWD, null, rgArg);
    }
}
9749
/// <summary>
/// Invokes the CUDA_MAX_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hTopDiff, nIdx, hMask and hBottomDiff are passed on unchanged, in that order.
/// </remarks>
public void max_bwd(int nCount, long hTopDiff, int nIdx, long hMask, long hBottomDiff)
{
    var rgArg = m_param.AsLong(nCount, hTopDiff, nIdx, hMask, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAX_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAX_BWD, null, rgArg);
    }
}
9765
/// <summary>
/// Invokes the CUDA_MIN_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hBottomDataA, hBottomDataB, nIdx, hTopData and hMask are passed on unchanged, in that order.
/// </remarks>
public void min_fwd(int nCount, long hBottomDataA, long hBottomDataB, int nIdx, long hTopData, long hMask)
{
    var rgArg = m_param.AsLong(nCount, hBottomDataA, hBottomDataB, nIdx, hTopData, hMask);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MIN_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MIN_FWD, null, rgArg);
    }
}
9785
/// <summary>
/// Invokes the CUDA_MIN_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hTopDiff, nIdx, hMask and hBottomDiff are passed on unchanged, in that order.
/// </remarks>
public void min_bwd(int nCount, long hTopDiff, int nIdx, long hMask, long hBottomDiff)
{
    var rgArg = m_param.AsLong(nCount, hTopDiff, nIdx, hMask, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MIN_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MIN_BWD, null, rgArg);
    }
}
9801
/// <summary>
/// Invokes the CUDA_CROP_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, nNumAxes, hSrcStrides, hDstStrides, hOffsets, hBottomData and hTopData are passed on unchanged, in that order.
/// </remarks>
public void crop_fwd(int nCount, int nNumAxes, long hSrcStrides, long hDstStrides, long hOffsets, long hBottomData, long hTopData)
{
    var rgArg = m_param.AsLong(nCount, nNumAxes, hSrcStrides, hDstStrides, hOffsets, hBottomData, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CROP_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CROP_FWD, null, rgArg);
    }
}
9819
/// <summary>
/// Invokes the CUDA_CROP_BWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, nNumAxes, hSrcStrides, hDstStrides, hOffsets, hBottomDiff and hTopDiff are passed on unchanged, in that order.
/// </remarks>
public void crop_bwd(int nCount, int nNumAxes, long hSrcStrides, long hDstStrides, long hOffsets, long hBottomDiff, long hTopDiff)
{
    var rgArg = m_param.AsLong(nCount, nNumAxes, hSrcStrides, hDstStrides, hOffsets, hBottomDiff, hTopDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CROP_BWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CROP_BWD, null, rgArg);
    }
}
9837
/// <summary>
/// Invokes the CUDA_CONCAT_FWD kernel function through the double or float entry point, depending on m_dt.
/// </summary>
/// <remarks>
/// nCount, hBottomData, nNumConcats, nConcatInputSize, nTopConcatAxis, nBottomConcatAxis, nOffsetConcatAxis
/// and hTopData are passed on unchanged, in that order.
/// </remarks>
public void concat_fwd(int nCount, long hBottomData, int nNumConcats, int nConcatInputSize, int nTopConcatAxis, int nBottomConcatAxis, int nOffsetConcatAxis, long hTopData)
{
    var rgArg = m_param.AsLong(nCount, hBottomData, nNumConcats, nConcatInputSize, nTopConcatAxis, nBottomConcatAxis, nOffsetConcatAxis, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CONCAT_FWD, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CONCAT_FWD, null, rgArg);
    }
}
9856
9857
/// <summary>
/// Runs the CUDA_CONCAT_BWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); all parameters are packed, in declaration order, into the long argument list.
/// </remarks>
public void concat_bwd(int nCount, long hTopDiff, int nNumConcats, int nConcatInputSize, int nTopConcatAxis, int nBottomConcatAxis, int nOffsetConcatAxis, long hBottomDiff)
{
    int nFn = (int)CUDAFN.CUDA_CONCAT_BWD;
    var rgArg = m_param.AsLong(nCount, hTopDiff, nNumConcats, nConcatInputSize, nTopConcatAxis, nBottomConcatAxis, nOffsetConcatAxis, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
9876
/// <summary>
/// Runs the CUDA_SLICE_FWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); all parameters are packed, in declaration order, into the long argument list.
/// </remarks>
public void slice_fwd(int nCount, long hBottomData, int nNumSlices, int nSliceSize, int nBottomSliceAxis, int nTopSliceAxis, int nOffsetSliceAxis, long hTopData)
{
    int nFn = (int)CUDAFN.CUDA_SLICE_FWD;
    var rgArg = m_param.AsLong(nCount, hBottomData, nNumSlices, nSliceSize, nBottomSliceAxis, nTopSliceAxis, nOffsetSliceAxis, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
9895
/// <summary>
/// Runs the CUDA_SLICE_BWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); all parameters are packed, in declaration order, into the long argument list.
/// </remarks>
public void slice_bwd(int nCount, long hTopDiff, int nNumSlices, int nSliceSize, int nBottomSliceAxis, int nTopSliceAxis, int nOffsetSliceAxis, long hBottomDiff)
{
    int nFn = (int)CUDAFN.CUDA_SLICE_BWD;
    var rgArg = m_param.AsLong(nCount, hTopDiff, nNumSlices, nSliceSize, nBottomSliceAxis, nTopSliceAxis, nOffsetSliceAxis, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
9914
/// <summary>
/// Runs the CUDA_TILE_FWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); all parameters are packed, in declaration order, into the long argument list.
/// </remarks>
public void tile_fwd(int nCount, long hBottomData, int nInnerDim, int nTiles, int nBottomTileAxis, long hTopData)
{
    int nFn = (int)CUDAFN.CUDA_TILE_FWD;
    var rgArg = m_param.AsLong(nCount, hBottomData, nInnerDim, nTiles, nBottomTileAxis, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
9931
/// <summary>
/// Runs the CUDA_TILE_BWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); all parameters are packed, in declaration order, into the long argument list.
/// </remarks>
public void tile_bwd(int nCount, long hTopDiff, int nTileSize, int nTiles, int nBottomTileAxis, long hBottomDiff)
{
    int nFn = (int)CUDAFN.CUDA_TILE_BWD;
    var rgArg = m_param.AsLong(nCount, hTopDiff, nTileSize, nTiles, nBottomTileAxis, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
9948
/// <summary>
/// Runs the CUDA_BIAS_FWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); all parameters are packed, in declaration order, into the long argument list.
/// </remarks>
public void bias_fwd(int nCount, long hBottomData, long hBiasData, int nBiasDim, int nInnerDim, long hTopData)
{
    int nFn = (int)CUDAFN.CUDA_BIAS_FWD;
    var rgArg = m_param.AsLong(nCount, hBottomData, hBiasData, nBiasDim, nInnerDim, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
9965
/// <summary>
/// Runs the CUDA_SCALE_FWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); all parameters are packed, in declaration order, into the long argument list.
/// 'hBiasData' defaults to 0 and is always sent as the last entry (0 presumably meaning 'no bias' - TODO confirm against the native function).
/// </remarks>
public void scale_fwd(int nCount, long hX, long hScaleData, int nScaleDim, int nInnerDim, long hY, long hBiasData = 0)
{
    int nFn = (int)CUDAFN.CUDA_SCALE_FWD;
    var rgArg = m_param.AsLong(nCount, hX, hScaleData, nScaleDim, nInnerDim, hY, hBiasData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
9990
/// <summary>
/// Runs the CUDA_THRESHOLD_FWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'dfThreshold' is sent in the floating-point argument array (narrowed to float on the float path).
/// The long list is (nCount, 0, hX, hY); the 0 is a placeholder, presumably reserving the slot of the floating-point argument - TODO confirm against the native function.
/// </remarks>
public void threshold_fwd(int nCount, double dfThreshold, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_THRESHOLD_FWD;
    var rgArg = m_param.AsLong(nCount, 0, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(dfThreshold);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat((float)dfThreshold);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10008
/// <summary>
/// Runs the CUDA_CLL_BWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'dfMargin' and 'dfAlpha' are sent in the floating-point argument array (narrowed to float on the float path).
/// 'bLegacyVersion' is encoded as 1 (true) or 0 (false) in the long list, which also carries 0 placeholders at positions 3 and 5
/// (presumably the slots of the two floating-point arguments - TODO confirm against the native function).
/// </remarks>
public void cll_bwd(int nCount, int nChannels, double dfMargin, bool bLegacyVersion, double dfAlpha, long hY, long hDiff, long hDistSq, long hBottomDiff)
{
    int nFn = (int)CUDAFN.CUDA_CLL_BWD;
    var rgArg = m_param.AsLong(nCount, nChannels, 0, bLegacyVersion ? 1 : 0, 0, hY, hDiff, hDistSq, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(dfMargin, dfAlpha);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat((float)dfMargin, (float)dfAlpha);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10032
/// <summary>
/// Runs the CUDA_SMOOTHL1_FWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); the long argument list is (nCount, hX, hY).
/// </remarks>
public void smoothl1_fwd(int nCount, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_SMOOTHL1_FWD;
    var rgArg = m_param.AsLong(nCount, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10051
/// <summary>
/// Runs the CUDA_SMOOTHL1_BWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); the long argument list is (nCount, hX, hY).
/// </remarks>
public void smoothl1_bwd(int nCount, long hX, long hY)
{
    int nFn = (int)CUDAFN.CUDA_SMOOTHL1_BWD;
    var rgArg = m_param.AsLong(nCount, hX, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10070
/// <summary>
/// Runs the CUDA_PERMUTE function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null). The parameters are packed in declaration order into the long list,
/// with 'bFwd' encoded as 1 (true) or 0 (false).
/// </remarks>
public void permute(int nCount, long hBottom, bool bFwd, long hPermuteOrder, long hOldSteps, long hNewSteps, int nNumAxes, long hTop)
{
    int nFn = (int)CUDAFN.CUDA_PERMUTE;
    var rgArg = m_param.AsLong(nCount, hBottom, bFwd ? 1 : 0, hPermuteOrder, hOldSteps, hNewSteps, nNumAxes, hTop);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10089
/// <summary>
/// Runs the CUDA_GATHER_FWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); all parameters are packed, in declaration order, into the long argument list.
/// </remarks>
public void gather_fwd(int nCount, long hBottom, long hTop, int nAxis, int nDim, int nDimAtAxis, int nM, int nN, long hIdx)
{
    int nFn = (int)CUDAFN.CUDA_GATHER_FWD;
    var rgArg = m_param.AsLong(nCount, hBottom, hTop, nAxis, nDim, nDimAtAxis, nM, nN, hIdx);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10109
/// <summary>
/// Runs the CUDA_GATHER_BWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); all parameters are packed, in declaration order, into the long argument list
/// (note that 'hTop' precedes 'hBottom' here, the reverse of gather_fwd).
/// </remarks>
public void gather_bwd(int nCount, long hTop, long hBottom, int nAxis, int nDim, int nDimAtAxis, int nM, int nN, long hIdx)
{
    int nFn = (int)CUDAFN.CUDA_GATHER_BWD;
    var rgArg = m_param.AsLong(nCount, hTop, hBottom, nAxis, nDim, nDimAtAxis, nM, nN, hIdx);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10129
/// <summary>
/// Runs the CUDA_LRN_FILLSCALE function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'fAlphaOverSize' and 'fK' are converted with convertD (double path) or convertF (float path) and sent in the floating-point argument array.
/// The long list carries two 0 placeholders just before 'hScaleData' (presumably the slots of the two floating-point arguments - TODO confirm against the native function).
/// </remarks>
public void lrn_fillscale(int nCount, long hBottomData, int nNum, int nChannels, int nHeight, int nWidth, int nSize, T fAlphaOverSize, T fK, long hScaleData)
{
    int nFn = (int)CUDAFN.CUDA_LRN_FILLSCALE;
    var rgArg = m_param.AsLong(nCount, hBottomData, nNum, nChannels, nHeight, nWidth, nSize, 0, 0, hScaleData);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(convertD(fAlphaOverSize), convertD(fK));
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat(convertF(fAlphaOverSize), convertF(fK));
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10150
/// <summary>
/// Runs the CUDA_LRN_COMPUTEOUTPUT function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'fNegativeBeta' is converted with convertD (double path) or convertF (float path) and sent in the floating-point argument array.
/// The long list is (nCount, hBottomData, hScaleData, 0, hTopData); the 0 is a placeholder, presumably the slot of the floating-point argument - TODO confirm.
/// </remarks>
public void lrn_computeoutput(int nCount, long hBottomData, long hScaleData, T fNegativeBeta, long hTopData)
{
    int nFn = (int)CUDAFN.CUDA_LRN_COMPUTEOUTPUT;
    var rgArg = m_param.AsLong(nCount, hBottomData, hScaleData, 0, hTopData);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(convertD(fNegativeBeta));
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat(convertF(fNegativeBeta));
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10166
10167
/// <summary>
/// Runs the CUDA_LRN_COMPUTEDIFF function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'fNegativeBeta' and 'fCacheRatio' are converted with convertD (double path) or convertF (float path) and sent in the floating-point argument array.
/// The long list carries two 0 placeholders just before 'hBottomDiff' (presumably the slots of the two floating-point arguments - TODO confirm against the native function).
/// </remarks>
public void lrn_computediff(int nCount, long hBottomData, long hTopData, long hScaleData, long hTopDiff, int nNum, int nChannels, int nHeight, int nWidth, int nSize, T fNegativeBeta, T fCacheRatio, long hBottomDiff)
{
    int nFn = (int)CUDAFN.CUDA_LRN_COMPUTEDIFF;
    var rgArg = m_param.AsLong(nCount, hBottomData, hTopData, hScaleData, hTopDiff, nNum, nChannels, nHeight, nWidth, nSize, 0, 0, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(convertD(fNegativeBeta), convertD(fCacheRatio));
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat(convertF(fNegativeBeta), convertF(fCacheRatio));
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10191
/// <summary>
/// Runs the CUDA_SGD_UPDATE function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'fMomentum' and 'fLocalRate' are converted with convertD (double path) or convertF (float path) and sent in the floating-point argument array.
/// The long list is (nCount, hNetParamsDiff, hHistoryData, 0, 0); the trailing zeros are placeholders, presumably the slots of the floating-point arguments - TODO confirm.
/// </remarks>
public void sgd_update(int nCount, long hNetParamsDiff, long hHistoryData, T fMomentum, T fLocalRate)
{
    int nFn = (int)CUDAFN.CUDA_SGD_UPDATE;
    var rgArg = m_param.AsLong(nCount, hNetParamsDiff, hHistoryData, 0, 0);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(convertD(fMomentum), convertD(fLocalRate));
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat(convertF(fMomentum), convertF(fLocalRate));
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10210
/// <summary>
/// Runs the CUDA_NESTEROV_UPDATE function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'fMomentum' and 'fLocalRate' are converted with convertD (double path) or convertF (float path) and sent in the floating-point argument array.
/// The long list is (nCount, hNetParamsDiff, hHistoryData, 0, 0); the trailing zeros are placeholders, presumably the slots of the floating-point arguments - TODO confirm.
/// </remarks>
public void nesterov_update(int nCount, long hNetParamsDiff, long hHistoryData, T fMomentum, T fLocalRate)
{
    int nFn = (int)CUDAFN.CUDA_NESTEROV_UPDATE;
    var rgArg = m_param.AsLong(nCount, hNetParamsDiff, hHistoryData, 0, 0);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(convertD(fMomentum), convertD(fLocalRate));
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat(convertF(fMomentum), convertF(fLocalRate));
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10230
10231
/// <summary>
/// Runs the CUDA_ADAGRAD_UPDATE function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'fDelta' and 'fLocalRate' are converted with convertD (double path) or convertF (float path) and sent in the floating-point argument array.
/// The long list is (nCount, hNetParamsDiff, hHistoryData, 0, 0); the trailing zeros are placeholders, presumably the slots of the floating-point arguments - TODO confirm.
/// </remarks>
public void adagrad_update(int nCount, long hNetParamsDiff, long hHistoryData, T fDelta, T fLocalRate)
{
    int nFn = (int)CUDAFN.CUDA_ADAGRAD_UPDATE;
    var rgArg = m_param.AsLong(nCount, hNetParamsDiff, hHistoryData, 0, 0);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(convertD(fDelta), convertD(fLocalRate));
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat(convertF(fDelta), convertF(fLocalRate));
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10250
/// <summary>
/// Runs the CUDA_ADADELTA_UPDATE function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'fMomentum', 'fDelta' and 'fLocalRate' are converted with convertD (double path) or convertF (float path) and sent in the floating-point argument array.
/// The long list is (nCount, hNetParamsDiff, hHistoryData1, hHistoryData2, 0, 0, 0); the trailing zeros are placeholders, presumably the slots of the floating-point arguments - TODO confirm.
/// </remarks>
public void adadelta_update(int nCount, long hNetParamsDiff, long hHistoryData1, long hHistoryData2, T fMomentum, T fDelta, T fLocalRate)
{
    int nFn = (int)CUDAFN.CUDA_ADADELTA_UPDATE;
    var rgArg = m_param.AsLong(nCount, hNetParamsDiff, hHistoryData1, hHistoryData2, 0, 0, 0);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(convertD(fMomentum), convertD(fDelta), convertD(fLocalRate));
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat(convertF(fMomentum), convertF(fDelta), convertF(fLocalRate));
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10271
/// <summary>
/// Runs the CUDA_ADAM_UPDATE function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'fBeta1', 'fBeta2', 'fEpsHat', 'fLearningRate' and 'fCorrection' are converted with convertD (double path) or convertF (float path) and sent,
/// in that order, in the floating-point argument array. The long list is (nCount, hNetParamsDiff, hValM, hValV) followed by five 0 placeholders,
/// presumably the slots of the five floating-point arguments - TODO confirm against the native function.
/// </remarks>
public void adam_update(int nCount, long hNetParamsDiff, long hValM, long hValV, T fBeta1, T fBeta2, T fEpsHat, T fLearningRate, T fCorrection)
{
    int nFn = (int)CUDAFN.CUDA_ADAM_UPDATE;
    var rgArg = m_param.AsLong(nCount, hNetParamsDiff, hValM, hValV, 0, 0, 0, 0, 0);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(convertD(fBeta1), convertD(fBeta2), convertD(fEpsHat), convertD(fLearningRate), convertD(fCorrection));
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat(convertF(fBeta1), convertF(fBeta2), convertF(fEpsHat), convertF(fLearningRate), convertF(fCorrection));
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10294
/// <summary>
/// Runs the CUDA_ADAMW_UPDATE function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'fBeta1', 'fBeta2', 'fEpsHat', 'fLearningRate' and 'fDecayRate' are converted with convertD (double path) or convertF (float path) and sent,
/// in that order, in the floating-point argument array. The long list is (nCount, hNetParamsDiff, hValM, hValV), five 0 placeholders
/// (presumably the slots of the five floating-point arguments - TODO confirm), then 'hNetParamsData' and 'nStep'.
/// </remarks>
public void adamw_update(int nCount, long hNetParamsDiff, long hValM, long hValV, T fBeta1, T fBeta2, T fEpsHat, T fLearningRate, T fDecayRate, long hNetParamsData, int nStep)
{
    int nFn = (int)CUDAFN.CUDA_ADAMW_UPDATE;
    var rgArg = m_param.AsLong(nCount, hNetParamsDiff, hValM, hValV, 0, 0, 0, 0, 0, hNetParamsData, nStep);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(convertD(fBeta1), convertD(fBeta2), convertD(fEpsHat), convertD(fLearningRate), convertD(fDecayRate));
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat(convertF(fBeta1), convertF(fBeta2), convertF(fEpsHat), convertF(fLearningRate), convertF(fDecayRate));
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10320
/// <summary>
/// Runs the CUDA_RMSPROP_UPDATE function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'fRmsDecay', 'fDelta' and 'fLocalRate' are converted with convertD (double path) or convertF (float path) and sent in the floating-point argument array.
/// The long list is (nCount, hNetParamsDiff, hHistoryData, 0, 0, 0); the trailing zeros are placeholders, presumably the slots of the floating-point arguments - TODO confirm.
/// </remarks>
public void rmsprop_update(int nCount, long hNetParamsDiff, long hHistoryData, T fRmsDecay, T fDelta, T fLocalRate)
{
    int nFn = (int)CUDAFN.CUDA_RMSPROP_UPDATE;
    var rgArg = m_param.AsLong(nCount, hNetParamsDiff, hHistoryData, 0, 0, 0);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(convertD(fRmsDecay), convertD(fDelta), convertD(fLocalRate));
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat(convertF(fRmsDecay), convertF(fDelta), convertF(fLocalRate));
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10341
/// <summary>
/// Runs the CUDA_LSTM_FWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); all 24 parameters are packed, in declaration order, into the long argument list.
/// The optional 'hContext', 'hWeight_c' and 'hCtoGetData' default to 0 and are always sent.
/// </remarks>
public void lstm_fwd(int t, int nN, int nH, int nI, long hWeight_h, long hWeight_i, long hClipData, int nClipOffset, long hTopData, int nTopOffset, long hCellData, int nCellOffset, long hPreGateData, int nPreGateOffset, long hGateData, int nGateOffset, long hHT1Data, int nHT1Offset, long hCT1Data, int nCT1Offset, long hHtoGateData, long hContext = 0, long hWeight_c = 0, long hCtoGetData = 0)
{
    int nFn = (int)CUDAFN.CUDA_LSTM_FWD;
    var rgArg = m_param.AsLong(t, nN, nH, nI, hWeight_h, hWeight_i, hClipData, nClipOffset, hTopData, nTopOffset, hCellData, nCellOffset, hPreGateData, nPreGateOffset, hGateData, nGateOffset, hHT1Data, nHT1Offset, hCT1Data, nCT1Offset, hHtoGateData, hContext, hWeight_c, hCtoGetData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10379
/// <summary>
/// Runs the CUDA_LSTM_BWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'dfClippingThreshold' is sent in the floating-point argument array (narrowed to float on the float path).
/// The long list holds (t, nN, nH, nI), a 0 placeholder (presumably the slot of the clipping threshold - TODO confirm), and then the
/// remaining parameters in declaration order. The optional 'hContextDiff' and 'hWeight_c' default to 0 and are always sent.
/// </remarks>
public void lstm_bwd(int t, int nN, int nH, int nI, double dfClippingThreshold, long hWeight_h, long hClipData, int nClipOffset, long hTopDiff, int nTopOffset, long hCellData, long hCellDiff, int nCellOffset, long hPreGateDiff, int nPreGateOffset, long hGateData, long hGateDiff, int nGateOffset, long hCT1Data, int nCT1Offset, long hDHT1Diff, int nDHT1Offset, long hDCT1Diff, int nDCT1Offset, long hHtoHData, long hContextDiff = 0, long hWeight_c = 0)
{
    int nFn = (int)CUDAFN.CUDA_LSTM_BWD;
    var rgArg = m_param.AsLong(t, nN, nH, nI, 0, hWeight_h, hClipData, nClipOffset, hTopDiff, nTopOffset, hCellData, hCellDiff, nCellOffset, hPreGateDiff, nPreGateOffset, hGateData, hGateDiff, nGateOffset, hCT1Data, nCT1Offset, hDHT1Diff, nDHT1Offset, hDCT1Diff, nDCT1Offset, hHtoHData, hContextDiff, hWeight_c);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(dfClippingThreshold);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat((float)dfClippingThreshold);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10420
/// <summary>
/// Runs the CUDA_LSTM_UNIT_FWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); all parameters are packed, in declaration order, into the long argument list.
/// </remarks>
public void lstm_unit_fwd(int nCount, int nHiddenDim, int nXCount, long hX, long hX_acts, long hC_prev, long hCont, long hC, long hH)
{
    int nFn = (int)CUDAFN.CUDA_LSTM_UNIT_FWD;
    var rgArg = m_param.AsLong(nCount, nHiddenDim, nXCount, hX, hX_acts, hC_prev, hCont, hC, hH);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10443
/// <summary>
/// Runs the CUDA_LSTM_UNIT_BWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); all parameters are packed, in declaration order, into the long argument list.
/// </remarks>
public void lstm_unit_bwd(int nCount, int nHiddenDim, int nXCount, long hC_prev, long hX_acts, long hC, long hH, long hCont, long hC_diff, long hH_diff, long hC_prev_diff, long hX_acts_diff, long hX_diff)
{
    int nFn = (int)CUDAFN.CUDA_LSTM_UNIT_BWD;
    var rgArg = m_param.AsLong(nCount, nHiddenDim, nXCount, hC_prev, hX_acts, hC, hH, hCont, hC_diff, hH_diff, hC_prev_diff, hX_acts_diff, hX_diff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10470
/// <summary>
/// Runs the CUDA_COEFF_SUM_FWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'dfCoeff' is sent in the floating-point argument array (narrowed to float on the float path).
/// The long list is (nCount, nDim, nNumOffset, 0, hCoeffData, hBottom, hTop); the 0 is a placeholder, presumably the slot of 'dfCoeff' - TODO confirm.
/// </remarks>
public void coeff_sum_fwd(int nCount, int nDim, int nNumOffset, double dfCoeff, long hCoeffData, long hBottom, long hTop)
{
    int nFn = (int)CUDAFN.CUDA_COEFF_SUM_FWD;
    var rgArg = m_param.AsLong(nCount, nDim, nNumOffset, 0, hCoeffData, hBottom, hTop);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(dfCoeff);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat((float)dfCoeff);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10488
10489
/// <summary>
/// Runs the CUDA_COEFF_SUM_BWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'dfCoeff' is sent in the floating-point argument array (narrowed to float on the float path).
/// The long list is (nCount, nDim, nNumOffset, 0, hCoeffData, hTopDiff, hBottomDiff); the 0 is a placeholder, presumably the slot of 'dfCoeff' - TODO confirm.
/// </remarks>
public void coeff_sum_bwd(int nCount, int nDim, int nNumOffset, double dfCoeff, long hCoeffData, long hTopDiff, long hBottomDiff)
{
    int nFn = (int)CUDAFN.CUDA_COEFF_SUM_BWD;
    var rgArg = m_param.AsLong(nCount, nDim, nNumOffset, 0, hCoeffData, hTopDiff, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(dfCoeff);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat((float)dfCoeff);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10507
/// <summary>
/// Runs the CUDA_COEFF_SUB_FWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'dfCoeff' is sent in the floating-point argument array (narrowed to float on the float path).
/// The long list is (nCount, nDim, nNumOffset, 0, hCoeffData, hBottom, hTop); the 0 is a placeholder, presumably the slot of 'dfCoeff' - TODO confirm.
/// </remarks>
public void coeff_sub_fwd(int nCount, int nDim, int nNumOffset, double dfCoeff, long hCoeffData, long hBottom, long hTop)
{
    int nFn = (int)CUDAFN.CUDA_COEFF_SUB_FWD;
    var rgArg = m_param.AsLong(nCount, nDim, nNumOffset, 0, hCoeffData, hBottom, hTop);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(dfCoeff);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat((float)dfCoeff);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10525
10526
/// <summary>
/// Runs the CUDA_COEFF_SUB_BWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'dfCoeff' is sent in the floating-point argument array (narrowed to float on the float path).
/// The long list is (nCount, nDim, nNumOffset, 0, hCoeffData, hTopDiff, hBottomDiff); the 0 is a placeholder, presumably the slot of 'dfCoeff' - TODO confirm.
/// </remarks>
public void coeff_sub_bwd(int nCount, int nDim, int nNumOffset, double dfCoeff, long hCoeffData, long hTopDiff, long hBottomDiff)
{
    int nFn = (int)CUDAFN.CUDA_COEFF_SUB_BWD;
    var rgArg = m_param.AsLong(nCount, nDim, nNumOffset, 0, hCoeffData, hTopDiff, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(dfCoeff);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat((float)dfCoeff);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10544
10545
/// <summary>
/// Runs the CUDA_SIGMOID_CROSS_ENTROPY_FWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null). The parameters are packed in declaration order into the long list,
/// with 'bHasIgnoreLabel' encoded as 1 (true) or 0 (false). 'nIgnoreLabel' is always sent, whatever the flag's value.
/// </remarks>
public void sigmoid_cross_entropy_fwd(int nCount, long hInput, long hTarget, long hLoss, bool bHasIgnoreLabel, int nIgnoreLabel, long hCountData)
{
    int nFn = (int)CUDAFN.CUDA_SIGMOID_CROSS_ENTROPY_FWD;
    var rgArg = m_param.AsLong(nCount, hInput, hTarget, hLoss, bHasIgnoreLabel ? 1 : 0, nIgnoreLabel, hCountData);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10563
/// <summary>
/// Runs the CUDA_SIGMOID_CROSS_ENTROPY_BWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); the long argument list is (nCount, nIgnoreLabel, hTarget, hBottomDiff).
/// </remarks>
public void sigmoid_cross_entropy_bwd(int nCount, int nIgnoreLabel, long hTarget, long hBottomDiff)
{
    int nFn = (int)CUDAFN.CUDA_SIGMOID_CROSS_ENTROPY_BWD;
    var rgArg = m_param.AsLong(nCount, nIgnoreLabel, hTarget, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10578
/// <summary>
/// Runs the CUDA_SOFTMAX_CROSS_ENTROPY_FWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null). The long argument list always holds the first nine parameters in declaration order;
/// the value of 'nIgnoreLabel' is appended as a tenth entry only when it is non-null, so the list length varies between 9 and 10.
/// </remarks>
public void softmax_cross_entropy_fwd(int nCount, long hProbData, long hLabel, long hLossDiff, long hLossData, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
{
    int nFn = (int)CUDAFN.CUDA_SOFTMAX_CROSS_ENTROPY_FWD;

    // The argument list is identical for both precisions, so it is assembled once.
    List<long> rgList = new List<long>() { nCount, hProbData, hLabel, hLossDiff, hLossData, nOuterNum, nDim, nInnerNum, hCounts };

    if (nIgnoreLabel.HasValue)
    {
        rgList.Add(nIgnoreLabel.Value);
    }

    long[] rgArg = rgList.ToArray();

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10616
/// <summary>
/// Runs the CUDA_SOFTMAX_CROSS_ENTROPY_BWD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); the long argument list is (nCount, nIgnoreLabel, hTarget, hBottomDiff).
/// </remarks>
public void softmax_cross_entropy_bwd(int nCount, int nIgnoreLabel, long hTarget, long hBottomDiff)
{
    int nFn = (int)CUDAFN.CUDA_SOFTMAX_CROSS_ENTROPY_BWD;
    var rgArg = m_param.AsLong(nCount, nIgnoreLabel, hTarget, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10631
10632#pragma warning disable 1591
10633
/// <summary>
/// Runs the CUDA_DEBUG function on the kernel 'm_hKernel' with a null argument array, through RunDouble when 'm_dt' is DataType.DOUBLE and through RunFloat otherwise.
/// </summary>
public void debug()
{
    int nFn = (int)CUDAFN.CUDA_DEBUG;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDouble((int)m_hKernel, nFn, null);
    }
    else
    {
        m_cuda.RunFloat((int)m_hKernel, nFn, null);
    }
}
10644
/// <summary>
/// Runs the CUDA_MTX_SET_DIAGONAL function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'dfVal' is sent in the floating-point argument array (narrowed to float on the float path).
/// The long list is (nCount, nRows, 0, hData); the 0 is a placeholder, presumably the slot of 'dfVal' - TODO confirm.
/// </remarks>
public void matrix_set_diagonal(int nCount, int nRows, double dfVal, long hData)
{
    int nFn = (int)CUDAFN.CUDA_MTX_SET_DIAGONAL;
    var rgArg = m_param.AsLong(nCount, nRows, 0, hData);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(dfVal);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat((float)dfVal);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10652
/// <summary>
/// Runs the CUDA_MTX_SET_DIAGONAL2 function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'dfScaleA' and 'dfScaleB' are sent in the floating-point argument array (narrowed to float on the float path).
/// The long list is (nCount, nRows, hDiagonal, 0, 0, hData); the zeros are placeholders, presumably the slots of the two scale factors - TODO confirm.
/// </remarks>
public void matrix_set_diagonal(int nCount, int nRows, long hDiagonal, double dfScaleA, double dfScaleB, long hData)
{
    int nFn = (int)CUDAFN.CUDA_MTX_SET_DIAGONAL2;
    var rgArg = m_param.AsLong(nCount, nRows, hDiagonal, 0, 0, hData);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(dfScaleA, dfScaleB);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat((float)dfScaleA, (float)dfScaleB);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10660
/// <summary>
/// Runs the CUDA_MTX_ADD_VECTOR function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'dfScale' is sent in the floating-point argument array (narrowed to float on the float path).
/// The long list is ((int)orientation, nWidth, nHeight, 0, hA, hB, hY); the 0 is a placeholder, presumably the slot of 'dfScale' - TODO confirm.
/// </remarks>
public void matrix_add_vector(ORIENTATION orientation, int nWidth, int nHeight, double dfScale, long hA, long hB, long hY)
{
    int nFn = (int)CUDAFN.CUDA_MTX_ADD_VECTOR;
    var rgArg = m_param.AsLong((int)orientation, nWidth, nHeight, 0, hA, hB, hY);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(dfScale);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat((float)dfScale);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10668
/// <summary>
/// Runs the CUDA_MTX_TRANSPOSE_OPERATION function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'dfScaleA' and 'dfScaleB' (both defaulting to 1.0) are sent in the floating-point argument array (narrowed to float on the float path).
/// The long list is ((int)op, nWidth, nHeight, hA, hB, hY, 0, 0); the trailing zeros are placeholders, presumably the slots of the two scale factors - TODO confirm.
/// </remarks>
public void matrix_transpose_operation(TRANSPOSE_OPERATION op, int nWidth, int nHeight, long hA, long hB, long hY, double dfScaleA = 1.0, double dfScaleB = 1.0)
{
    int nFn = (int)CUDAFN.CUDA_MTX_TRANSPOSE_OPERATION;
    var rgArg = m_param.AsLong((int)op, nWidth, nHeight, hA, hB, hY, 0, 0);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(dfScaleA, dfScaleB);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat((float)dfScaleA, (float)dfScaleB);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10676
/// <summary>
/// Convenience wrapper that calls matrix_transpose_operation with TRANSPOSE_OPERATION.ADD, forwarding both scale factors.
/// </summary>
/// <remarks>
/// Note the parameter order differs from matrix_transpose_operation: here the scale factors precede the handles.
/// </remarks>
public void matrix_transpose_add(int nWidth, int nHeight, double dfScaleA, double dfScaleB, long hA, long hB, long hY)
{
    TRANSPOSE_OPERATION op = TRANSPOSE_OPERATION.ADD;

    matrix_transpose_operation(op, nWidth, nHeight, hA, hB, hY, dfScaleA, dfScaleB);
}
10681
/// <summary>
/// Convenience wrapper that calls matrix_transpose_operation with TRANSPOSE_OPERATION.MUL.
/// </summary>
/// <remarks>
/// The scale factors are not passed, so the default values declared by matrix_transpose_operation apply.
/// </remarks>
public void matrix_transpose_mul(int nWidth, int nHeight, long hA, long hB, long hY)
{
    TRANSPOSE_OPERATION op = TRANSPOSE_OPERATION.MUL;

    matrix_transpose_operation(op, nWidth, nHeight, hA, hB, hY);
}
10686
/// <summary>
/// Convenience wrapper that calls matrix_transpose_operation with TRANSPOSE_OPERATION.DIV.
/// </summary>
/// <remarks>
/// The scale factors are not passed, so the default values declared by matrix_transpose_operation apply.
/// </remarks>
public void matrix_transpose_div(int nWidth, int nHeight, long hA, long hB, long hY)
{
    TRANSPOSE_OPERATION op = TRANSPOSE_OPERATION.DIV;

    matrix_transpose_operation(op, nWidth, nHeight, hA, hB, hY);
}
10691
/// <summary>
/// Runs the CUDA_MTX_AGGREGATE_COLS function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); the long argument list is ((int)op, nWidth, nHeight, hA, hY).
/// </remarks>
public void matrix_aggregate_cols(AGGREGATIONS op, int nWidth, int nHeight, long hA, long hY)
{
    int nFn = (int)CUDAFN.CUDA_MTX_AGGREGATE_COLS;
    var rgArg = m_param.AsLong((int)op, nWidth, nHeight, hA, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10699
/// <summary>
/// Runs the CUDA_MTX_AGGREGATE_ROWS function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); the long argument list is ((int)op, nWidth, nHeight, hA, hOnes, hY).
/// </remarks>
public void matrix_aggregate_rows(AGGREGATIONS op, int nWidth, int nHeight, long hA, long hOnes, long hY)
{
    int nFn = (int)CUDAFN.CUDA_MTX_AGGREGATE_ROWS;
    var rgArg = m_param.AsLong((int)op, nWidth, nHeight, hA, hOnes, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10707
/// <summary>
/// Runs the CUDA_MTX_TRANSPOSE function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); the long argument list is (nWidth, nHeight, hA, hY).
/// </remarks>
public void matrix_transpose(int nWidth, int nHeight, long hA, long hY)
{
    int nFn = (int)CUDAFN.CUDA_MTX_TRANSPOSE;
    var rgArg = m_param.AsLong(nWidth, nHeight, hA, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10715
/// <summary>
/// Runs the CUDA_MTX_MEANCENTER_BY_COL function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null). The parameters are packed in declaration order into the long list,
/// with 'bNormalize' (default false) encoded as 1 (true) or 0 (false) in the last position.
/// </remarks>
public void matrix_meancenter_by_column(int nWidth, int nHeight, long hA, long hB, long hY, bool bNormalize = false)
{
    int nFn = (int)CUDAFN.CUDA_MTX_MEANCENTER_BY_COL;
    var rgArg = m_param.AsLong(nWidth, nHeight, hA, hB, hY, bNormalize ? 1 : 0);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10732
/// <summary>
/// Runs the CUDA_MTX_EUCLIDEAN_DIST function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); the long argument list is (hX, hY, hOut, n, d, nStart, nEnd).
/// </remarks>
public void matrix_euclidean_distance(long hX, long hY, long hOut, int n, int d, int nStart, int nEnd)
{
    int nFn = (int)CUDAFN.CUDA_MTX_EUCLIDEAN_DIST;
    var rgArg = m_param.AsLong(hX, hY, hOut, n, d, nStart, nEnd);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10740
/// <summary>
/// Runs the CUDA_MTX_DOT function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); the long argument list is (m, n, k, hA, hB, hC).
/// </remarks>
public void matrix_dot(int m, int n, int k, long hA, long hB, long hC)
{
    int nFn = (int)CUDAFN.CUDA_MTX_DOT;
    var rgArg = m_param.AsLong(m, n, k, hA, hB, hC);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10748
/// <summary>
/// Runs the CUDA_MTX_MEAN function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'dfAlpha' is sent in the floating-point argument array (narrowed to float on the float path).
/// The long list is (nWidth, nHeight, hA, hOnes, 0, hY); the 0 is a placeholder, presumably the slot of 'dfAlpha' - TODO confirm.
/// </remarks>
public void matrix_mean_rows(int nWidth, int nHeight, long hA, long hOnes, double dfAlpha, long hY)
{
    int nFn = (int)CUDAFN.CUDA_MTX_MEAN;
    var rgArg = m_param.AsLong(nWidth, nHeight, hA, hOnes, 0, hY);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(dfAlpha);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat((float)dfAlpha);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10756
/// <summary>
/// Runs the CUDA_MTX_STDEV function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); the long argument list is (nWidth, nHeight, hA, hOnes, hMean, hWork, hY).
/// </remarks>
public void matrix_stdev_rows(int nWidth, int nHeight, long hA, long hOnes, long hMean, long hWork, long hY)
{
    int nFn = (int)CUDAFN.CUDA_MTX_STDEV;
    var rgArg = m_param.AsLong(nWidth, nHeight, hA, hOnes, hMean, hWork, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10764
/// <summary>
/// Runs the CUDA_MTX_CORRELATIONS function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); the long argument list is (nWidth, nHeight, hA, hOnes, hMean, hStdev, hWork, hY).
/// </remarks>
public void matrix_correlations(int nWidth, int nHeight, long hA, long hOnes, long hMean, long hStdev, long hWork, long hY)
{
    int nFn = (int)CUDAFN.CUDA_MTX_CORRELATIONS;
    var rgArg = m_param.AsLong(nWidth, nHeight, hA, hOnes, hMean, hStdev, hWork, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10772
10773#pragma warning restore 1591
10774
10775 #endregion
10776
10777 #region T-SNE Methods
10778
10779#pragma warning disable 1591
10780
/// <summary>
/// Runs the CUDA_TSNE_UPDATE function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'dfMomentum', 'dfLearningRate', 'fGainFactor1' (default 0.2) and 'fGainFactor2' (default 0.8) are sent, in that order, in the
/// floating-point argument array (narrowed to float on the float path). The long list is (n, 0, 0, hdY, huY, hGains, hY, 0, 0);
/// the zeros are placeholders, presumably the slots of the four floating-point arguments - TODO confirm against the native function.
/// </remarks>
public void tsne_update(int n, double dfMomentum, double dfLearningRate, long hdY, long huY, long hGains, long hY, double fGainFactor1 = 0.2, double fGainFactor2 = 0.8)
{
    int nFn = (int)CUDAFN.CUDA_TSNE_UPDATE;
    var rgArg = m_param.AsLong(n, 0, 0, hdY, huY, hGains, hY, 0, 0);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(dfMomentum, dfLearningRate, fGainFactor1, fGainFactor2);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat((float)dfMomentum, (float)dfLearningRate, (float)fGainFactor1, (float)fGainFactor2);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10788
/// <summary>
/// Runs the CUDA_TSNE_UPDATE_GRAD function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// 'dfSumQ' is sent in the floating-point argument array (narrowed to float on the float path).
/// The long list is (n, hPosF, hNegF, 0, hdC); the 0 is a placeholder, presumably the slot of 'dfSumQ' - TODO confirm.
/// </remarks>
public void tsne_update_grad(int n, long hPosF, long hNegF, double dfSumQ, long hdC)
{
    int nFn = (int)CUDAFN.CUDA_TSNE_UPDATE_GRAD;
    var rgArg = m_param.AsLong(n, hPosF, hNegF, 0, hdC);

    if (m_dt == DataType.DOUBLE)
    {
        var rgDf = m_param.AsDouble(dfSumQ);
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, rgDf, rgArg);
    }
    else
    {
        var rgF = m_param.AsFloat((float)dfSumQ);
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, rgF, rgArg);
    }
}
10796
/// <summary>
/// Runs the CUDA_TSNE_COMPUTE_EXACT_ERROR function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); the long argument list is (n, hP, hQ, hY). Any values returned by the run call are discarded.
/// </remarks>
public void tsne_compute_exact_error(int n, long hP, long hQ, long hY)
{
    int nFn = (int)CUDAFN.CUDA_TSNE_COMPUTE_EXACT_ERROR;
    var rgArg = m_param.AsLong(n, hP, hQ, hY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10804
/// <summary>
/// Runs the CUDA_TSNE_COMPUTE_SQUARED_EUCLIDEAN_DISTANCE function on the kernel 'm_hKernel', through RunDoubleEx2 when 'm_dt' is DataType.DOUBLE and through RunFloatEx2 otherwise.
/// </summary>
/// <remarks>
/// No floating-point argument array is supplied (null); the long argument list is (n, d, hWork, hX, hDD_on_host).
/// </remarks>
public void tsne_compute_squared_euclidean_distance(int n, int d, long hWork, long hX, long hDD_on_host)
{
    int nFn = (int)CUDAFN.CUDA_TSNE_COMPUTE_SQUARED_EUCLIDEAN_DISTANCE;
    var rgArg = m_param.AsLong(n, d, hWork, hX, hDD_on_host);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg);
    }
}
10812
/// <summary>
/// Runs the CUDA_TSNE_COMPUTE_Q_MATRIX kernel function using the precision selected by m_dt
/// and returns the first value of the kernel result.
/// </summary>
/// <param name="n">First long argument.</param>
/// <param name="hDD_on_host">Handle forwarded as the second long argument.</param>
/// <param name="hQ">Handle forwarded as the third long argument.</param>
/// <param name="bQisHostMem">Forwarded to the kernel as 1 when true and 0 when false.</param>
/// <returns>Element 0 of the array returned by the kernel call (widened to double in the float case).</returns>
public double tsne_compute_q_matrix(int n, long hDD_on_host, long hQ, bool bQisHostMem)
{
    long[] rgArg = m_param.AsLong(n, hDD_on_host, hQ, (bQisHostMem) ? 1 : 0);

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgf = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_COMPUTE_Q_MATRIX, null, rgArg);
        return rgf[0];
    }

    double[] rgdf = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_COMPUTE_Q_MATRIX, null, rgArg);
    return rgdf[0];
}
10826
/// <summary>
/// Runs the CUDA_TSNE_COMPUTE_EXACT_GRADIENT kernel function using the precision selected by m_dt.
/// </summary>
/// <param name="n">First long argument.</param>
/// <param name="d">Second long argument.</param>
/// <param name="hY">Handle forwarded as the third long argument.</param>
/// <param name="hP">Handle forwarded as the fourth long argument.</param>
/// <param name="hQ">Handle forwarded as the fifth long argument.</param>
/// <param name="bQonHost">Forwarded to the kernel as 1 when true and 0 when false.</param>
/// <param name="hdC">Handle forwarded as the seventh long argument (followed by a trailing 0).</param>
/// <param name="dfSumQ">Scalar forwarded as the only floating-point argument.</param>
public void tsne_compute_exact_gradient(int n, int d, long hY, long hP, long hQ, bool bQonHost, long hdC, double dfSumQ)
{
    long[] rgArg = m_param.AsLong(n, d, hY, hP, hQ, (bQonHost) ? 1 : 0, hdC, 0);

    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_COMPUTE_EXACT_GRADIENT, m_param.AsFloat((float)dfSumQ), rgArg);
        return;
    }

    m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_COMPUTE_EXACT_GRADIENT, m_param.AsDouble(dfSumQ), rgArg);
}
10834
/// <summary>
/// Runs the CUDA_TSNE_SYMMETRIZE_MATRIX kernel function using the precision selected by m_dt
/// and returns the first result value cast to a long.
/// </summary>
/// <param name="n">First long argument.</param>
/// <param name="hRowP">Handle forwarded as the second long argument.</param>
/// <param name="hColP">Handle forwarded as the third long argument.</param>
/// <param name="hValP">Handle forwarded as the fourth long argument.</param>
/// <returns>Element 0 of the kernel result, truncated to long.</returns>
public long tsne_symmetrize_matrix(int n, long hRowP, long hColP, long hValP)
{
    long[] rgArg = m_param.AsLong(n, hRowP, hColP, hValP);

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgf = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_SYMMETRIZE_MATRIX, null, rgArg);
        return (long)rgf[0];
    }

    double[] rgdf = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_SYMMETRIZE_MATRIX, null, rgArg);
    return (long)rgdf[0];
}
10848
/// <summary>
/// Runs the CUDA_TSNE_COMPUTE_KNN_BOUNDS kernel function using the precision selected by m_dt
/// and unpacks the first four result values into the output bounds.
/// </summary>
/// <param name="n">First long argument.</param>
/// <param name="hData">Handle forwarded as the second long argument (followed by a trailing 0).</param>
/// <param name="dfCirclePct">Scalar forwarded as the only floating-point argument.</param>
/// <param name="dfMinX">Receives element 0 of the kernel result.</param>
/// <param name="dfMinY">Receives element 1 of the kernel result.</param>
/// <param name="dfMaxX">Receives element 2 of the kernel result.</param>
/// <param name="dfMaxY">Receives element 3 of the kernel result.</param>
public void tsne_compute_knn_bounds(int n, long hData, double dfCirclePct, out double dfMinX, out double dfMinY, out double dfMaxX, out double dfMaxY)
{
    long[] rgArg = m_param.AsLong(n, hData, 0);

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgf = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_COMPUTE_KNN_BOUNDS, m_param.AsFloat((float)dfCirclePct), rgArg);
        dfMinX = rgf[0];
        dfMinY = rgf[1];
        dfMaxX = rgf[2];
        dfMaxY = rgf[3];
        return;
    }

    double[] rgdf = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_COMPUTE_KNN_BOUNDS, m_param.AsDouble(dfCirclePct), rgArg);
    dfMinX = rgdf[0];
    dfMinY = rgdf[1];
    dfMaxX = rgdf[2];
    dfMaxY = rgdf[3];
}
10868
/// <summary>
/// Runs the CUDA_TSNE_CREATE_GAUSSIAN_PERPLEXITY kernel function using the precision selected by m_dt
/// and returns the first result value cast to a long.
/// </summary>
/// <param name="n">First long argument.</param>
/// <param name="d">Second long argument.</param>
/// <param name="k">Third long argument.</param>
/// <param name="hX">Handle forwarded as the fourth long argument.</param>
/// <param name="hCurP">Handle forwarded as the fifth long argument.</param>
/// <param name="hValP">Handle forwarded as the sixth long argument.</param>
/// <param name="hRowPonHost">Handle forwarded as the seventh long argument.</param>
/// <param name="hColPonHost">Handle forwarded as the eighth long argument (followed by a trailing 0).</param>
/// <param name="fPerplexity">Scalar forwarded as the only floating-point argument.</param>
/// <returns>Element 0 of the kernel result cast to long (presumably the handle later given to
/// FindTsneGaussianPerplexity and FreeTsneGaussianPerplexity - confirm).</returns>
public long CreateTsneGaussianPerplexity(int n, int d, int k, long hX, long hCurP, long hValP, long hRowPonHost, long hColPonHost, double fPerplexity)
{
    long[] rgArg = m_param.AsLong(n, d, k, hX, hCurP, hValP, hRowPonHost, hColPonHost, 0);

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgf = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_CREATE_GAUSSIAN_PERPLEXITY, m_param.AsFloat((float)fPerplexity), rgArg);
        return (long)rgf[0];
    }

    double[] rgdf = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_CREATE_GAUSSIAN_PERPLEXITY, m_param.AsDouble(fPerplexity), rgArg);
    return (long)rgdf[0];
}
10882
/// <summary>
/// Runs the CUDA_TSNE_FIND_GAUSSIAN_PERPLEXITY kernel function using the precision selected by m_dt
/// and unpacks the three result values.
/// </summary>
/// <param name="hTsnePerplexity">Handle forwarded as the only long argument.</param>
/// <param name="nCurrentIteration">Receives element 1 of the kernel result cast to int.</param>
/// <param name="nMaxIteration">Receives element 2 of the kernel result cast to int.</param>
/// <returns>true when element 0 of the kernel result equals 1.0, otherwise false.</returns>
public bool FindTsneGaussianPerplexity(long hTsnePerplexity, out int nCurrentIteration, out int nMaxIteration)
{
    long[] rgArg = m_param.AsLong(hTsnePerplexity);

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgf = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_FIND_GAUSSIAN_PERPLEXITY, null, rgArg);
        bool bDoneF = (rgf[0] == 1.0);
        nCurrentIteration = (int)rgf[1];
        nMaxIteration = (int)rgf[2];
        return bDoneF;
    }

    double[] rgdf = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_FIND_GAUSSIAN_PERPLEXITY, null, rgArg);
    bool bDoneD = (rgdf[0] == 1.0);
    nCurrentIteration = (int)rgdf[1];
    nMaxIteration = (int)rgdf[2];
    return bDoneD;
}
10904
/// <summary>
/// Runs the CUDA_TSNE_FREE_GAUSSIAN_PERPLEXITY kernel function using the precision selected by m_dt.
/// </summary>
/// <param name="hTsnePerplexity">Handle forwarded as the only long argument.</param>
public void FreeTsneGaussianPerplexity(long hTsnePerplexity)
{
    long[] rgArg = m_param.AsLong(hTsnePerplexity);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_FREE_GAUSSIAN_PERPLEXITY, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_FREE_GAUSSIAN_PERPLEXITY, null, rgArg);
    }
}
10912
/// <summary>
/// Runs the CUDA_TSNE_CREATE kernel function using the precision selected by m_dt
/// and returns the first result value cast to a long.
/// </summary>
/// <param name="n">First long argument.</param>
/// <param name="d">Second long argument.</param>
/// <param name="hY">Handle forwarded as the third long argument.</param>
/// <param name="hValP">Handle forwarded as the fourth long argument.</param>
/// <param name="hRowP">Handle forwarded as the fifth long argument.</param>
/// <param name="hColP">Handle forwarded as the sixth long argument.</param>
/// <param name="hdC">Handle forwarded as the seventh long argument (followed by a trailing 0).</param>
/// <param name="fTheta">Scalar forwarded as the only floating-point argument.</param>
/// <returns>Element 0 of the kernel result cast to long (presumably the handle later given to
/// ComputeTsneGradient, EvaluateTsneError and FreeTsne - confirm).</returns>
public long CreateTsne(int n, int d, long hY, long hValP, long hRowP, long hColP, long hdC, double fTheta)
{
    long[] rgArg = m_param.AsLong(n, d, hY, hValP, hRowP, hColP, hdC, 0);

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgf = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_CREATE, m_param.AsFloat((float)fTheta), rgArg);
        return (long)rgf[0];
    }

    double[] rgdf = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_CREATE, m_param.AsDouble(fTheta), rgArg);
    return (long)rgdf[0];
}
10926
/// <summary>
/// Runs the CUDA_TSNE_COMPUTE_GRADIENT1 kernel function using the precision selected by m_dt.
/// </summary>
/// <param name="hTsne">Handle forwarded as the first long argument.</param>
/// <param name="bValPUpdated">Forwarded to the kernel as 1 when true and 0 when false.</param>
public void ComputeTsneGradient(long hTsne, bool bValPUpdated)
{
    long[] rgArg = m_param.AsLong(hTsne, (bValPUpdated) ? 1 : 0);

    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_COMPUTE_GRADIENT1, null, rgArg);
        return;
    }

    m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_COMPUTE_GRADIENT1, null, rgArg);
}
10934
/// <summary>
/// Runs the CUDA_TSNE_COMPUTE_ERROR1 kernel function using the precision selected by m_dt
/// and returns the first value of the kernel result.
/// </summary>
/// <param name="hTsne">Handle forwarded as the only long argument.</param>
/// <returns>Element 0 of the array returned by the kernel call (widened to double in the float case).</returns>
public double EvaluateTsneError(long hTsne)
{
    long[] rgArg = m_param.AsLong(hTsne);

    if (m_dt != DataType.DOUBLE)
    {
        float[] rgf = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_COMPUTE_ERROR1, null, rgArg);
        return rgf[0];
    }

    double[] rgdf = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_COMPUTE_ERROR1, null, rgArg);
    return rgdf[0];
}
10948
/// <summary>
/// Runs the CUDA_TSNE_FREE kernel function using the precision selected by m_dt.
/// </summary>
/// <param name="hTsne">Handle forwarded as the only long argument.</param>
public void FreeTsne(long hTsne)
{
    long[] rgArg = m_param.AsLong(hTsne);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_FREE, null, rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TSNE_FREE, null, rgArg);
    }
}
10956
10957#pragma warning restore 1591
10958
10959 #endregion
10960
10961 #region Image Processing And Misc
10962
/// <summary>
/// Runs the CUDA_GUASSIAN_BLUR kernel function using the precision selected by m_dt.
/// </summary>
/// <param name="n">First long argument.</param>
/// <param name="nChannels">Second long argument.</param>
/// <param name="nHeight">Third long argument.</param>
/// <param name="nWidth">Fourth long argument (followed by a 0 placeholder).</param>
/// <param name="dfSigma">Scalar forwarded as the only floating-point argument.</param>
/// <param name="hX">Handle forwarded as the sixth long argument (presumably the source data - confirm).</param>
/// <param name="hY">Handle forwarded as the seventh long argument (presumably the destination data - confirm).</param>
public void gaussian_blur(int n, int nChannels, int nHeight, int nWidth, double dfSigma, long hX, long hY)
{
    long[] rgArg = m_param.AsLong(n, nChannels, nHeight, nWidth, 0, hX, hY);

    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GUASSIAN_BLUR, m_param.AsFloat((float)dfSigma), rgArg);
        return;
    }

    m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GUASSIAN_BLUR, m_param.AsDouble(dfSigma), rgArg);
}
10987
/// <summary>
/// Runs the CUDA_HAMMING_DIFF kernel function using the precision selected by m_dt,
/// then returns the result of asum_double over n items of hY.
/// </summary>
/// <param name="n">First long argument; also the count passed to asum_double.</param>
/// <param name="dfThreshold">Scalar forwarded as the only floating-point argument.</param>
/// <param name="hA">Handle forwarded as the third long argument (the second slot is always 0).</param>
/// <param name="hB">Handle forwarded as the fourth long argument.</param>
/// <param name="hY">Handle forwarded as the fifth long argument; also passed to asum_double.</param>
/// <param name="nOffA">Offset forwarded to the kernel for A (default = 0).</param>
/// <param name="nOffB">Offset forwarded to the kernel for B (default = 0).</param>
/// <param name="nOffY">Offset forwarded to the kernel for Y (default = 0).</param>
/// <returns>The value returned by asum_double(n, hY).</returns>
public double hamming_distance(int n, double dfThreshold, long hA, long hB, long hY, int nOffA = 0, int nOffB = 0, int nOffY = 0)
{
    long[] rgArg = m_param.AsLong(n, 0, hA, hB, hY, nOffA, nOffB, nOffY);

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_HAMMING_DIFF, m_param.AsDouble(dfThreshold), rgArg);
    }
    else
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_HAMMING_DIFF, m_param.AsFloat((float)dfThreshold), rgArg);
    }

    // NOTE(review): nOffY is given to the kernel but not to asum_double; confirm whether
    // the sum is intended to start at nOffY when a non-zero offset is used.
    return asum_double(n, hY);
}
11014
/// <summary>
/// Runs the CUDA_CALC_DFT kernel function using the precision selected by m_dt.
/// </summary>
/// <param name="n">First long argument.</param>
/// <param name="hX">Handle forwarded as the second long argument.</param>
/// <param name="m">Third long argument.</param>
/// <param name="hY">Handle forwarded as the fourth long argument.</param>
public void calc_dft_coefficients(int n, long hX, int m, long hY)
{
    long[] rgArg = m_param.AsLong(n, hX, m, hY);

    if (m_dt != DataType.DOUBLE)
    {
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CALC_DFT, null, rgArg);
        return;
    }

    m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CALC_DFT, null, rgArg);
}
11034
/// <summary>
/// Runs the CUDA_CALC_BATCH_DIST kernel function using the precision selected by m_dt
/// and returns the kernel result as an array of doubles.
/// </summary>
/// <remarks>
/// The long argument list is laid out as: distance method, 0, nItemDim, hSrc, hTargets, hWork,
/// the two dimensions of rgOffsets, then every element of rgOffsets in row-major order.
/// </remarks>
/// <param name="distMethod">Distance method, forwarded as its integer value.</param>
/// <param name="dfThreshold">Scalar forwarded as the only floating-point argument.</param>
/// <param name="nItemDim">Third long argument.</param>
/// <param name="hSrc">Handle forwarded as the fourth long argument.</param>
/// <param name="hTargets">Handle forwarded as the fifth long argument.</param>
/// <param name="hWork">Handle forwarded as the sixth long argument.</param>
/// <param name="rgOffsets">Two dimensional offset table appended to the long argument list.</param>
/// <returns>The values returned by the kernel call, widened to double when running in float mode.</returns>
public double[] calculate_batch_distances(DistanceMethod distMethod, double dfThreshold, int nItemDim, long hSrc, long hTargets, long hWork, int[,] rgOffsets)
{
    // The argument list is the same for both precisions, so build it once.
    List<long> rgArg = new List<long> { (int)distMethod, 0, nItemDim, hSrc, hTargets, hWork };
    rgArg.Add(rgOffsets.GetLength(0));
    rgArg.Add(rgOffsets.GetLength(1));

    // Enumerating a rectangular array visits its elements in row-major order.
    foreach (int nOffset in rgOffsets)
    {
        rgArg.Add(nOffset);
    }

    long[] rgArgs = rgArg.ToArray();

    if (m_dt == DataType.DOUBLE)
        return m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CALC_BATCH_DIST, m_param.AsDouble(dfThreshold), rgArgs);

    float[] rgf = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CALC_BATCH_DIST, m_param.AsFloat((float)dfThreshold), rgArgs);
    double[] rgdf = new double[rgf.Length];

    // Array.Copy performs the float -> double widening conversion.
    Array.Copy(rgf, rgdf, rgf.Length);

    return rgdf;
}
11095
11096 #endregion
11097
11098 //---------------------------------------------------------------------
11099 // Conversion Methods
11100 //---------------------------------------------------------------------
11101 #region Convertion Methods
11102
/// <summary>
/// Converts an array of doubles into an array of the generic type T.
/// </summary>
/// <param name="rg">Array to convert; may be null.</param>
/// <returns>null when rg is null; rg itself (no copy) when T is double; otherwise a new T[]
/// filled with each value converted to single precision.</returns>
private T[] convert(double[] rg)
{
    if (rg == null)
        return null;

    // T is double: hand back the same array instance.
    if (typeof(T) == typeof(double))
        return (T[])(object)rg;

    float[] rgf = Array.ConvertAll(rg, df => Convert.ToSingle(df));
    T[] rgResult = new T[rg.Length];
    Array.Copy(rgf, rgResult, rg.Length);

    return rgResult;
}
11116
/// <summary>
/// Converts an array of floats into an array of the generic type T.
/// </summary>
/// <param name="rg">Array to convert; may be null.</param>
/// <returns>null when rg is null; rg itself (no copy) when T is float; otherwise a new T[]
/// filled by Array.Copy from rg.</returns>
private T[] convert(float[] rg)
{
    if (rg == null)
        return null;

    // T is float: hand back the same array instance.
    if (typeof(T) == typeof(float))
        return (T[])(object)rg;

    T[] rgResult = new T[rg.Length];
    Array.Copy(rg, rgResult, rg.Length);

    return rgResult;
}
11130
/// <summary>
/// Converts a single value of the generic type T into a float using Convert.ChangeType.
/// </summary>
/// <param name="f">Value to convert.</param>
/// <returns>The value as a float.</returns>
private float convertF1(T f)
{
    object objVal = Convert.ChangeType(f, typeof(float));
    return (float)objVal;
}
11135
/// <summary>
/// Converts a single float into the generic type T using Convert.ChangeType.
/// </summary>
/// <param name="f">Value to convert.</param>
/// <returns>The value as type T.</returns>
private T convertF1(float f)
{
    object objVal = Convert.ChangeType(f, typeof(T));
    return (T)objVal;
}
11140
/// <summary>
/// Converts an array of the generic type T into an array of floats.
/// </summary>
/// <param name="rg">Array to convert; may be null.</param>
/// <param name="nCount">Currently unused: the whole array is always converted. The parameter is kept
/// so that existing callers continue to compile.</param>
/// <returns>null when rg is null; rg itself (no copy) when T is float; otherwise a new float[]
/// of length rg.Length holding each value converted to single precision.</returns>
private float[] convertF(T[] rg, int nCount = -1)
{
    if (rg == null)
        return null;

    // T is float: hand back the same array instance.
    if (typeof(T) == typeof(float))
        return (float[])Convert.ChangeType(rg, typeof(float[]));

    // Array.ConvertAll already allocates and fills the result array, so no
    // second allocation and copy is needed.
    return Array.ConvertAll(rg, p => Convert.ToSingle(p));
}
11157
/// <summary>
/// Converts values of the generic type T into floats and writes them into an existing destination array.
/// </summary>
/// <param name="rg">Source array; may be null.</param>
/// <param name="rgDst">Destination array that receives the converted values.</param>
/// <param name="nOffset">Index in rgDst where writing starts (default = 0).</param>
/// <param name="nCount">Number of items to copy from the start of rg, or -1 to copy rg.Length items (default = -1).</param>
/// <returns>null when rg is null, otherwise rgDst.</returns>
private float[] convertF(T[] rg, float[] rgDst, int nOffset = 0, int nCount = -1)
{
    if (rg == null)
        return null;

    if (nCount == -1)
        nCount = rg.Length;

    if (typeof(T) == typeof(float))
    {
        float[] rgConv = (float[])Convert.ChangeType(rg, typeof(float[]));
        Array.Copy(rgConv, 0, rgDst, nOffset, nCount);
    }
    else
    {
        // Array.Copy only performs widening conversions between primitive element types, so
        // copying double[] directly into float[] throws ArrayTypeMismatchException. Convert
        // explicitly first (as the convertF overload without a destination does), then copy.
        float[] rgConv = Array.ConvertAll(rg, p => Convert.ToSingle(p));
        Array.Copy(rgConv, 0, rgDst, nOffset, nCount);
    }

    return rgDst;
}
11178
/// <summary>
/// Converts a single value of the generic type T into a double using Convert.ChangeType.
/// </summary>
/// <param name="df">Value to convert.</param>
/// <returns>The value as a double.</returns>
private double convertD1(T df)
{
    object objVal = Convert.ChangeType(df, typeof(double));
    return (double)objVal;
}
11183
/// <summary>
/// Converts a single double into the generic type T using Convert.ChangeType.
/// </summary>
/// <param name="df">Value to convert.</param>
/// <returns>The value as type T.</returns>
private T convertD1(double df)
{
    object objVal = Convert.ChangeType(df, typeof(T));
    return (T)objVal;
}
11188
/// <summary>
/// Converts an array of the generic type T into an array of doubles.
/// </summary>
/// <param name="rg">Array to convert; may be null.</param>
/// <param name="nCount">Not used by the conversion: the whole array is always converted.</param>
/// <returns>null when rg is null; rg itself (no copy) when T is double; otherwise a new double[]
/// of length rg.Length filled by Array.Copy from rg.</returns>
private double[] convertD(T[] rg, int nCount = -1)
{
    if (rg == null)
        return null;

    // T is double: hand back the same array instance.
    if (typeof(T) == typeof(double))
        return (double[])(object)rg;

    double[] rgResult = new double[rg.Length];
    Array.Copy(rg, rgResult, rg.Length);

    return rgResult;
}
11205
/// <summary>
/// Copies values of the generic type T into an existing destination array of doubles.
/// </summary>
/// <param name="rg">Source array; may be null.</param>
/// <param name="rgDst">Destination array that receives the values.</param>
/// <param name="nOffset">Index in rgDst where writing starts (default = 0).</param>
/// <param name="nCount">Number of items to copy from the start of rg, or -1 to copy rg.Length items (default = -1).</param>
/// <returns>null when rg is null, otherwise rgDst.</returns>
private double[] convertD(T[] rg, double[] rgDst, int nOffset = 0, int nCount = -1)
{
    if (rg == null)
        return null;

    int nItems = (nCount == -1) ? rg.Length : nCount;

    // When T is double the source is already a double[]; otherwise Array.Copy
    // is given the T[] directly. Either way the same array object is the source.
    Array.Copy(rg, 0, rgDst, nOffset, nItems);

    return rgDst;
}
11226
11227 #endregion
11228
11229 #region Debugging Methods
11230
/// <summary>
/// Writes a single line to the log describing the memory of the current GPU.
/// </summary>
/// <remarks>
/// The device ID comes from GetDeviceID and the total, free and used values from GetDeviceMemory;
/// each memory value is formatted with the "N2" format and labeled as GB.
/// </remarks>
/// <param name="log">Log that receives the output line (written with the override-enabled flag set to true).</param>
/// <param name="strLocation">Text placed at the start of the output line.</param>
public void ReportMemory(Log log, string strLocation)
{
    double dfFree;
    double dfUsed;
    bool bCudaCallUsed;
    int nGpuID = GetDeviceID();
    double dfMem = GetDeviceMemory(out dfFree, out dfUsed, out bCudaCallUsed);

    string strGpu = nGpuID.ToString();
    string strTotal = dfMem.ToString("N2");
    string strFree = dfFree.ToString("N2");
    string strUsed = dfUsed.ToString("N2");

    string strLine = string.Concat(new string[] { strLocation, " Memory (GPU ", strGpu, "): ", strTotal, " GB total; ", strFree, " GB free; ", strUsed, " GB used." });
    log.WriteLine(strLine, true);
}
11245
11246 #endregion
11247 }
11248
11249#pragma warning disable 1591
11250
/// <summary>
/// Helper used to pack a variable number of arguments into a typed array.
/// Each method returns the array created for its 'params' argument list unchanged.
/// </summary>
class Params
{
    /// <summary>
    /// The constructor.
    /// </summary>
    public Params()
    {
    }

    /// <summary>
    /// Packs the given values into a long array.
    /// </summary>
    /// <param name="rg">Values to pack.</param>
    /// <returns>The same array instance that holds the arguments.</returns>
    public long[] AsLong(params long[] rg)
    {
        long[] rgPacked = rg;
        return rgPacked;
    }

    /// <summary>
    /// Packs the given values into a double array.
    /// </summary>
    /// <param name="rg">Values to pack.</param>
    /// <returns>The same array instance that holds the arguments.</returns>
    public double[] AsDouble(params double[] rg)
    {
        double[] rgPacked = rg;
        return rgPacked;
    }

    /// <summary>
    /// Packs the given values into a float array.
    /// </summary>
    /// <param name="rg">Values to pack.</param>
    /// <returns>The same array instance that holds the arguments.</returns>
    public float[] AsFloat(params float[] rg)
    {
        float[] rgPacked = rg;
        return rgPacked;
    }
}
11272
11273#pragma warning restore 1591
11274}
The CryptoRandom is a random number generator that can use either the standard .NET Random object or t...
Definition: CryptoRandom.cs:14
double NextDouble()
Returns a random double within the range .
Definition: CryptoRandom.cs:83
The Log class provides general output in text form.
Definition: Log.cs:13
void WriteLine(string str, bool bOverrideEnabled=false, bool bHeader=false, bool bError=false, bool bDisable=false)
Write a line of output.
Definition: Log.cs:80
The Utility class provides general utility functions.
Definition: Utility.cs:35
static List< int > Create(int nCount, int nStart, int nInc)
Create a new List and fill it with values starting with start and incrementing by inc.
Definition: Utility.cs:721
static double[] ConvertVec(float[] rgf)
Convert an array of float to an array of generics.
Definition: Utility.cs:550
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
Definition: CudaDnn.cs:969
void channel_compare(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
Compares the values of the channels from X and places the result in Y where 1 is set if the values ar...
Definition: CudaDnn.cs:8133
void relu_fwd(int nCount, long hBottomData, long hTopData, T fNegativeSlope)
Performs a Rectifier Linear Unit (ReLU) forward pass in Cuda.
Definition: CudaDnn.cs:9383
long CreateTensorDesc()
Create a new instance of a tensor descriptor for use with NVIDIA's cuDnn.
Definition: CudaDnn.cs:3518
long CreateConvolutionDesc()
Create a new instance of a convolution descriptor for use with NVIDIA's cuDnn.
Definition: CudaDnn.cs:3747
void coeff_sub_bwd(int nCount, int nDim, int nNumOffset, double dfCoeff, long hCoeffData, long hTopDiff, long hBottomDiff)
Performs a coefficient sub backward pass in Cuda.
Definition: CudaDnn.cs:10537
CudaDnn(int nDeviceID, DEVINIT flags=(DEVINIT.CUBLAS|DEVINIT.CURAND), long? lSeed=null, string strPath="", bool bResetFirst=false, bool bEnableMemoryTrace=false)
The CudaDnn constructor.
Definition: CudaDnn.cs:1488
T[] GetMemory(long hMem, long lCount=-1)
Retrieves the GPU memory as an array of type 'T'
Definition: CudaDnn.cs:2700
void SetTensorDesc(long hHandle, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride, bool bHalf=false)
Sets the values of a tensor descriptor.
Definition: CudaDnn.cs:3620
void SynchronizeStream(long h=0)
Synchronize a stream on the current GPU, waiting for its operations to complete.
Definition: CudaDnn.cs:3239
void log(int n, long hA, long hY, double dfBeta, double dfAlpha=0)
Calculates the log value of (A * beta) + alpha, and places the result in Y.
Definition: CudaDnn.cs:7504
void channel_mul(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, int nMethod=1)
Multiplies the values of the channels from X and places the result in Y.
Definition: CudaDnn.cs:8272
int GetDeviceID()
Returns the current device id set within Cuda.
Definition: CudaDnn.cs:2013
void SetRnnDesc(long hCuDnn, long hRnnDesc, int nHiddenCount, int nNumLayers, long hDropoutDesc, RNN_MODE mode, bool bUseTensorCores, RNN_DIRECTION direction=RNN_DIRECTION.RNN_UNIDIRECTIONAL)
Sets the RNN Descriptor values.
Definition: CudaDnn.cs:4770
void SetHostMemory(long hMem, T[] rgSrc)
Copies an array of type 'T' into a block of already allocated host memory.
Definition: CudaDnn.cs:2995
void channel_op_fwd(OP op, int nCount, int nC, int nN1, int nSD1, int nN2, int nSD2, long hA, long hB, long hY)
Performs a channel operation forward on the data.
Definition: CudaDnn.cs:8382
void DeviceDisablePeerAccess(int nPeerDeviceID)
Disables peer-to-peer access between the current device used by the CudaDnn instance and a peer devic...
Definition: CudaDnn.cs:2270
void CopyHostToDevice(long lCount, long hHostSrc, long hGpuDst)
Copy from Host memory to GPU memory.
Definition: CudaDnn.cs:2568
void ResetDevice()
Reset the current device.
Definition: CudaDnn.cs:2079
void sort(int nCount, long hY)
Sort the data in the GPU memory specified.
Definition: CudaDnn.cs:6212
float erf(float fVal)
Calculates the erf() function.
Definition: CudaDnn.cs:6996
void math_fwd(int nCount, long hBottomData, long hTopData, MATH_FUNCTION function)
Performs a Math function forward pass in Cuda.
Definition: CudaDnn.cs:8949
long CreateRnnDesc()
Create the RNN Descriptor.
Definition: CudaDnn.cs:4733
void clip_bwd(int nCount, long hTopDiff, long hBottomData, long hBottomDiff, T fMin, T fMax)
Performs a Clip backward pass in Cuda.
Definition: CudaDnn.cs:8931
void adadelta_update(int nCount, long hNetParamsDiff, long hHistoryData1, long hHistoryData2, T fMomentum, T fDelta, T fLocalRate)
Perform the AdaDelta update
Definition: CudaDnn.cs:10264
void gemv(bool bTransA, int m, int n, double fAlpha, long hA, long hX, double fBeta, long hY)
Perform a matrix-vector multiplication operation: y = alpha transA (A) x + beta y (where x and y are ...
Definition: CudaDnn.cs:6431
void copy_batch(int nCount, int nNum, int nDim, long hSrcData, long hSrcLbl, int nDstCount, long hDstCache, long hWorkDevData, int nLabelStart, int nLabelCount, int nCacheSize, long hCacheHostCursors, long hWorkDataHost)
Copy a batch of labeled items into a cache organized by label where older data is removed and replace...
Definition: CudaDnn.cs:6062
void crop_bwd(int nCount, int nNumAxes, long hSrcStrides, long hDstStrides, long hOffsets, long hBottomDiff, long hTopDiff)
Performs the crop backward operation.
Definition: CudaDnn.cs:9830
void matmul(uint nOuterCount, int m, int n, int k, long hA, long hB, long hC, double dfScale=1.0, bool bTransA=false, bool bTransB=false)
Perform matmul operation hC = matmul(hA, hB), where hA, hB and hC are all in row-major format.
Definition: CudaDnn.cs:6695
bool IsRnn8Supported()
Returns whether or not RNN8 is supported.
Definition: CudaDnn.cs:5142
void dropout_bwd(int nCount, long hTopDiff, long hMask, uint uiThreshold, T fScale, long hBottomDiff)
Performs a dropout backward pass in Cuda.
Definition: CudaDnn.cs:9484
void col2im_nd(long hDataCol, int nDataColOffset, int nNumSpatialAxes, int nColCount, int nChannelAxis, long hImShape, long hColShape, long hKernelShape, long hPad, long hStride, long hDilation, long hDataIm, int nDataImOffset)
Rearranges the columns into image blocks.
Definition: CudaDnn.cs:8063
void rng_uniform(int n, double fMin, double fMax, long hY)
Fill Y with random numbers using a uniform random distribution.
Definition: CudaDnn.cs:8524
void ResetGhostMemory()
Resets the ghost memory by enabling it if this instance was configured to use ghost memory.
Definition: CudaDnn.cs:1783
void channel_op_bwd(OP op, int nCount, int nC, int nN1, int nSD1, int nN2, int nSD2, int nCy, int nSDy, long hA, long hB, long hY, long hAd, long hBd, long hYd, long hWork)
Performs a channel operation backward on the data.
Definition: CudaDnn.cs:8413
void mul_scalar(int n, float fAlpha, long hY)
Multiply each element of Y by a scalar.
Definition: CudaDnn.cs:7388
double sumsq(int n, long hW, long hA, int nAOff=0)
Calculates the sum of squares of A.
Definition: CudaDnn.cs:7878
void copy(int nCount, long hSrc, long hDst, int nSrcOffset=0, int nDstOffset=0, long hStream=-1, bool? bSrcHalfSizeOverride=null, bool? bDstHalfSizeOverride=null)
Copy data from one block of GPU memory to another.
Definition: CudaDnn.cs:6007
void add_scalar(int n, T fAlpha, long hY, int nYOff=0)
Adds a scalar value to each element of Y.
Definition: CudaDnn.cs:7190
void slice_fwd(int nCount, long hBottomData, int nNumSlices, int nSliceSize, int nBottomSliceAxis, int nTopSliceAxis, int nOffsetSliceAxis, long hTopData)
Performs a slice forward pass in Cuda.
Definition: CudaDnn.cs:9888
void channel_sub(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hA, long hX, long hY)
Subtracts the values across the channels of X from A and places the result in Y.
Definition: CudaDnn.cs:8197
void mish_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData, double dfThreshold, int nMethod=0)
Performs a Mish backward pass in Cuda.
Definition: CudaDnn.cs:9035
void cll_bwd(int nCount, int nChannels, double dfMargin, bool bLegacyVersion, double dfAlpha, long hY, long hDiff, long hDistSq, long hBottomDiff)
Performs a contrastive loss layer backward pass in Cuda.
Definition: CudaDnn.cs:10025
void powx(int n, long hA, double fAlpha, long hY, int nAOff=0, int nYOff=0)
Calculates the A raised to the power alpha and places the result in Y.
Definition: CudaDnn.cs:7524
void mul_scalar(int n, double fAlpha, long hY)
Multiply each element of Y by a scalar.
Definition: CudaDnn.cs:7374
void exp(int n, long hA, long hY, int nAOff, int nYOff, double dfBeta)
Calculates the exponent value of A * beta and places the result in Y.
Definition: CudaDnn.cs:7471
long CreateDropoutDesc()
Create a new instance of a dropout descriptor for use with NVIDIA's cuDnn.
Definition: CudaDnn.cs:4203
void nllloss_bwd(int nCount, long hTopData, long hLabel, long hBottomDiff, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
Performs NLL Loss backward pass in Cuda.
Definition: CudaDnn.cs:9707
static ulong ConvertByteSizeToCount(ulong ulSizeInBytes)
Converts the byte size into the number of items in the base data type of float or double.
Definition: CudaDnn.cs:2438
void FreeNCCL(long hNccl)
Free an instance of NCCL.
Definition: CudaDnn.cs:3355
void FreeLayerNorm(long hLayerNorm)
Free the instance of LayerNorm GPU support.
Definition: CudaDnn.cs:5846
void ger(int m, int n, float fAlpha, long hX, long hY, long hA)
Perform a vector-vector multiplication operation: A = x * (fAlpha * y) (where x and y are vectors and...
Definition: CudaDnn.cs:6509
void add(int n, long hA, long hB, long hY, float fAlpha)
Adds A to (B times scalar) and places the result in Y.
Definition: CudaDnn.cs:7265
void relu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, T fNegativeSlope)
Performs a Rectifier Linear Unit (ReLU) backward pass in Cuda.
Definition: CudaDnn.cs:9404
void adamw_update(int nCount, long hNetParamsDiff, long hValM, long hValV, T fBeta1, T fBeta2, T fEpsHat, T fLearningRate, T fDecayRate, long hNetParamsData, int nStep)
Perform the AdamW update
Definition: CudaDnn.cs:10313
void lstm_unit_bwd(int nCount, int nHiddenDim, int nXCount, long hC_prev, long hX_acts, long hC, long hH, long hCont, long hC_diff, long hH_diff, long hC_prev_diff, long hX_acts_diff, long hX_diff)
Performs the simple LSTM backward pass in Cuda for a given LSTM unit.
Definition: CudaDnn.cs:10463
void scale(int n, float fAlpha, long hX, long hY)
Scales the values in X and places them in Y.
Definition: CudaDnn.cs:6940
void LRNCrossChannelBackward(long hCuDnn, long hNormDesc, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Perform LRN cross channel backward pass.
Definition: CudaDnn.cs:4384
void Dispose()
Disposes this instance freeing up all of its host and GPU memory.
Definition: CudaDnn.cs:1629
string GetRequiredCompute(out int nMinMajor, out int nMinMinor)
The GetRequiredCompute function returns the Major and Minor compute values required by the current Cu...
Definition: CudaDnn.cs:2216
void AddTensor(long hCuDnn, T fAlpha, long hSrcDesc, long hSrc, int nSrcOffset, T fBeta, long hDstDesc, long hDst, int nDstOffset)
Add two tensors together.
Definition: CudaDnn.cs:3655
void width(int n, long hMean, long hMin, long hMax, double dfAlpha, long hWidth)
Calculates the width values.
Definition: CudaDnn.cs:7925
void scal(int n, double fAlpha, long hX, int nXOff=0)
Scales the data in X by a scaling factor.
Definition: CudaDnn.cs:6767
void max(int n, long hA, long hB, long hY)
Calculates the max of A and B and places the result in Y. This max is only computed on a per item bas...
Definition: CudaDnn.cs:7669
void ReLUBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Perform a ReLU backward pass.
Definition: CudaDnn.cs:4598
void channel_add(int nCount, int nOuterNum, int nChannels, int nBlocks, int nInnerNum, int nOffset, long hX, long hY, DIR dir)
Add data along channels similar to numpy split function but where the data is added instead of copied...
Definition: CudaDnn.cs:8437
long CreateRnnDataDesc()
Create the RNN Data Descriptor.
Definition: CudaDnn.cs:4652
void FreePCA(long hPCA)
Free the PCA instance associated with handle.
Definition: CudaDnn.cs:5446
void FreeMemory(long hMem)
Free previously allocated GPU memory.
Definition: CudaDnn.cs:2517
double[] calculate_batch_distances(DistanceMethod distMethod, double dfThreshold, int nItemDim, long hSrc, long hTargets, long hWork, int[,] rgOffsets)
The calculate_batch_distances method calculates a set of distances based on the DistanceMethod specif...
Definition: CudaDnn.cs:11046
void SetPoolingDesc(long hHandle, PoolingMethod method, int h, int w, int hPad, int wPad, int hStride, int wStride)
Set the values of a pooling descriptor.
Definition: CudaDnn.cs:4074
void PoolingForward(long hCuDnn, long hPoolingDesc, T fAlpha, long hBottomDesc, long hBottomData, T fBeta, long hTopDesc, long hTopData)
Perform a pooling forward pass.
Definition: CudaDnn.cs:4093
void gather_fwd(int nCount, long hBottom, long hTop, int nAxis, int nDim, int nDimAtAxis, int nM, int nN, long hIdx)
Performs a gather forward pass where data at specified indexes along a given axis are copied to the o...
Definition: CudaDnn.cs:10102
void rng_setseed(long lSeed)
Sets the random number generator seed used by random number operations.
Definition: CudaDnn.cs:8506
void tile_bwd(int nCount, long hTopDiff, int nTileSize, int nTiles, int nBottomTileAxis, long hBottomDiff)
Performs a tile backward pass in Cuda.
Definition: CudaDnn.cs:9941
double dot_double(int n, long hX, long hY)
Computes the dot product of X and Y.
Definition: CudaDnn.cs:6815
void rng_gaussian(int n, double fMu, double fSigma, long hY)
Fill Y with random numbers using a gaussian random distribution.
Definition: CudaDnn.cs:8578
void add_scalar(int n, double fAlpha, long hY)
Adds a scalar value to each element of Y.
Definition: CudaDnn.cs:7161
void unpooling_fwd(POOLING_METHOD method, int nCount, long hBottomData, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hTopData, long hMask)
Performs the forward pass for unpooling using Cuda
Definition: CudaDnn.cs:8867
void matrix_meancenter_by_column(int nWidth, int nHeight, long hA, long hB, long hY, bool bNormalize=false)
Mean center the data by columns, where each column is summed and then subtracted from each column val...
Definition: CudaDnn.cs:10725
void adagrad_update(int nCount, long hNetParamsDiff, long hHistoryData, T fDelta, T fLocalRate)
Perform the AdaGrad update
Definition: CudaDnn.cs:10243
void SynchronizeDevice()
Synchronize the operations on the current device.
Definition: CudaDnn.cs:2093
void sigmoid_cross_entropy_bwd(int nCount, int nIgnoreLabel, long hTarget, long hBottomDiff)
Performs a sigmoid cross entropy backward pass in Cuda when an ignore label is specified.
Definition: CudaDnn.cs:10571
void smoothl1_bwd(int nCount, long hX, long hY)
Performs the backward operation for the SmoothL1 loss.
Definition: CudaDnn.cs:10063
void scale_to_range(int n, long hX, long hY, double fMin, double fMax)
Scales the values in X and places the result in Y (can also run inline where X = Y).
Definition: CudaDnn.cs:6973
void ConvolutionBackwardFilter(long hCuDnn, T fAlpha, long hBottomDesc, long hBottomData, int nBottomOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_FILTER_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, T fBeta, long hFilterDesc, long hWeightDiff, int nWeightOffset, bool bSyncStream=true)
Perform a convolution backward pass on the filter.
Definition: CudaDnn.cs:3972
void SigmoidForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
Perform a Sigmoid forward pass.
Definition: CudaDnn.cs:4531
int SsdMultiBoxLossForward(long hSSD, int nLocDataCount, long hLocGpuData, int nConfDataCount, long hConfGpuData, int nPriorDataCount, long hPriorGpuData, int nGtDataCount, long hGtGpuData, out List< DictionaryMap< List< int > > > rgAllMatchIndices, out List< List< int > > rgrgAllNegIndices, out int nNumNegs)
Performs the SSD MultiBoxLoss forward operation.
Definition: CudaDnn.cs:5661
void TanhBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Perform a Tanh backward pass.
Definition: CudaDnn.cs:4473
void pooling_bwd(POOLING_METHOD method, int nCount, long hTopDiff, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hBottomDiff, long hMask, long hTopMask)
Performs the backward pass for pooling using Cuda
Definition: CudaDnn.cs:8839
void gemm(bool bTransA, bool bTransB, int m, int n, int k, T fAlpha, long hA, long hB, T fBeta, long hC, int nAOffset=0, int nBOffset=0, int nCOffset=0, int nGroups=1, int nGroupOffsetA=0, int nGroupOffsetB=0, int nGroupOffsetC=0)
Perform a matrix-matrix multiplication operation: C = alpha transB (B) transA (A) + beta C
Definition: CudaDnn.cs:6285
void permute(int nCount, long hBottom, bool bFwd, long hPermuteOrder, long hOldSteps, long hNewSteps, int nNumAxes, long hTop)
Performs data permutation on the input and reorders the data which is placed in the output.
Definition: CudaDnn.cs:10082
void bnll_fwd(int nCount, long hBottomData, long hTopData)
Performs a binomial normal log likelihood (BNLL) forward pass in Cuda.
Definition: CudaDnn.cs:9501
void silu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData)
Performs the Sigmoid-weighted Linear Unit (SiLU) activation backward pass in Cuda.
Definition: CudaDnn.cs:9140
void channel_min(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, bool bReturnIdx=false)
Calculates the minimum value within each channel of X and places the result in Y.
Definition: CudaDnn.cs:8081
void geam(bool bTransA, bool bTransB, int m, int n, float fAlpha, long hA, long hB, float fBeta, long hC)
Perform a matrix-matrix addition/transposition operation: C = alpha transA (A) + beta transB (B)
Definition: CudaDnn.cs:6386
void nllloss_fwd(int nCount, long hProbData, long hLabel, long hLossData, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
Performs NLL Loss forward pass in Cuda.
Definition: CudaDnn.cs:9673
void NcclBroadcast(long hNccl, long hStream, long hX, int nCount)
Broadcasts a block of GPU data to all NCCL instances.
Definition: CudaDnn.cs:3421
void FreeCuDNN(long h)
Free an instance of cuDnn.
Definition: CudaDnn.cs:3281
bool DeviceCanAccessPeer(int nSrcDeviceID, int nPeerDeviceID)
Query whether or not two devices can access each other via peer-to-peer memory copies.
Definition: CudaDnn.cs:2240
long CreateMemoryPointer(long hData, long lOffset, long lCount)
Creates a memory pointer into an already existing block of GPU memory.
Definition: CudaDnn.cs:3028
void lstm_bwd(int t, int nN, int nH, int nI, double dfClippingThreshold, long hWeight_h, long hClipData, int nClipOffset, long hTopDiff, int nTopOffset, long hCellData, long hCellDiff, int nCellOffset, long hPreGateDiff, int nPreGateOffset, long hGateData, long hGateDiff, int nGateOffset, long hCT1Data, int nCT1Offset, long hDHT1Diff, int nDHT1Offset, long hDCT1Diff, int nDCT1Offset, long hHtoHData, long hContextDiff=0, long hWeight_c=0)
Performs the simple LSTM backward pass in Cuda.
Definition: CudaDnn.cs:10413
void denan(int n, long hX, double dfReplacement)
Replaces all NAN values within X with a replacement value.
Definition: CudaDnn.cs:7963
void mask_batch(int n, int nBatch, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY)
Mask the batch of data in the source with the mask by replacing all values 'fSearch' found i...
Definition: CudaDnn.cs:7114
void rng_uniform(int n, float fMin, float fMax, long hY)
Fill Y with random numbers using a uniform random distribution.
Definition: CudaDnn.cs:8539
void nesterov_update(int nCount, long hNetParamsDiff, long hHistoryData, T fMomentum, T fLocalRate)
Perform the Nesterov update
Definition: CudaDnn.cs:10223
void ConvolutionBackwardData(long hCuDnn, T fAlpha, long hFilterDesc, long hWeight, int nWeightOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_DATA_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, T fBeta, long hBottomDesc, long hBottomDiff, int nBottomOffset, bool bSyncStream=true)
Perform a convolution backward pass on the data.
Definition: CudaDnn.cs:4025
void ger(int m, int n, T fAlpha, long hX, long hY, long hA)
Perform a vector-vector multiplication operation: A = x * (fAlpha * y) (where x and y are vectors and...
Definition: CudaDnn.cs:6526
void coeff_sum_bwd(int nCount, int nDim, int nNumOffset, double dfCoeff, long hCoeffData, long hTopDiff, long hBottomDiff)
Performs a coefficient sum backward pass in Cuda.
Definition: CudaDnn.cs:10500
void channel_fill(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, int nLabelDim, long hLabels, long hY)
Fills each channel with the channel item of Y with the data of X matching the label index specified b...
Definition: CudaDnn.cs:8179
void SetMemory(long hMem, List< double > rg)
Copies a list of doubles into a block of already allocated GPU memory.
Definition: CudaDnn.cs:2734
void FreePoolingDesc(long h)
Free a pooling descriptor instance.
Definition: CudaDnn.cs:4055
void embed_bwd(int nCount, long hBottomData, long hTopDiff, int nM, int nN, int nK, long hWeightDiff)
Performs the backward pass for embed
Definition: CudaDnn.cs:8781
void mask(int n, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY)
Mask the data in the source with the mask by replacing all values 'fSearch' found in the mas...
Definition: CudaDnn.cs:7048
void threshold_fwd(int nCount, double dfThreshold, long hX, long hY)
Performs a threshold pass in Cuda.
Definition: CudaDnn.cs:10001
void channel_mean(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
Calculates the mean value of each channel of X and places the result in Y.
Definition: CudaDnn.cs:8116
static string GetCudaDnnDllPath()
Returns the path to the CudaDnnDll module to use for low level CUDA processing.
Definition: CudaDnn.cs:1638
long AllocMemory(List< float > rg)
Allocate a block of GPU memory and copy a list of floats to it.
Definition: CudaDnn.cs:2302
long CreateSSD(int nNumClasses, bool bShareLocation, int nLocClasses, int nBackgroundLabelId, bool bUseDiffcultGt, SSD_MINING_TYPE miningType, SSD_MATCH_TYPE matchType, float fOverlapThreshold, bool bUsePriorForMatching, SSD_CODE_TYPE codeType, bool bEncodeVariantInTgt, bool bBpInside, bool bIgnoreCrossBoundaryBbox, bool bUsePriorForNms, SSD_CONF_LOSS_TYPE confLossType, SSD_LOC_LOSS_TYPE locLossType, float fNegPosRatio, float fNegOverlap, int nSampleSize, bool bMapObjectToAgnostic, bool bNmsParam, float? fNmsThreshold=null, int? nNmsTopK=null, float? fNmsEta=null)
Create an instance of the SSD GPU support.
Definition: CudaDnn.cs:5482
void embed_fwd(int nCount, long hBottomData, long hWeight, int nM, int nN, int nK, long hTopData)
Performs the forward pass for embed
Definition: CudaDnn.cs:8763
long AllocMemory(List< double > rg)
Allocate a block of GPU memory and copy a list of doubles to it.
Definition: CudaDnn.cs:2291
void copy_expand(int n, int nNum, int nDim, long hX, long hA)
Expand a vector of length 'nNum' into a matrix of size 'nNum' x 'nDim' by copying each value of the v...
Definition: CudaDnn.cs:6182
void FreeConvolutionDesc(long h)
Free a convolution descriptor instance.
Definition: CudaDnn.cs:3765
void SoftmaxBackward(long hCuDnn, SOFTMAX_ALGORITHM alg, SOFTMAX_MODE mode, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Perform a Softmax backward pass.
Definition: CudaDnn.cs:4640
void mask_batch(int n, int nBatch, int nMaskDim, T fSearch, T fReplace, long hX, long hMask, long hY)
Mask the batch of data in the source with the mask by replacing all values 'fSearch' found i...
Definition: CudaDnn.cs:7079
double[] get_double(int nCount, long hHandle, int nIdx=-1)
Queries the GPU memory by copying it into an array of doubles.
Definition: CudaDnn.cs:5961
void SetMemory(long hMem, List< float > rg)
Copies a list of float into a block of already allocated GPU memory.
Definition: CudaDnn.cs:2745
void mul_scalar(int n, T fAlpha, long hY)
Multiply each element of Y by a scalar.
Definition: CudaDnn.cs:7402
bool RunPCA(long hPCA, int nSteps, out int nCurrentK, out int nCurrentIteration)
Runs a number of steps of the iterative PCA algorithm.
Definition: CudaDnn.cs:5417
double hamming_distance(int n, double dfThreshold, long hA, long hB, long hY, int nOffA=0, int nOffB=0, int nOffY=0)
The hamming_distance calculates the Hamming Distance between X and Y both of length n.
Definition: CudaDnn.cs:11005
void min_bwd(int nCount, long hTopDiff, int nIdx, long hMask, long hBottomDiff)
Performs a min backward pass in Cuda.
Definition: CudaDnn.cs:9794
void sub(int n, long hA, long hB, long hY, int nAOff=0, int nBOff=0, int nYOff=0, int nB=0)
Subtracts B from A and places the result in Y.
Definition: CudaDnn.cs:7312
void elu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomData, long hBottomDiff, double dfAlpha)
Performs an Exponential Linear Unit (ELU) backward pass in Cuda.
Definition: CudaDnn.cs:9444
void scale(int n, T fAlpha, long hX, long hY, int nXOff=0, int nYOff=0)
Scales the values in X and places them in Y.
Definition: CudaDnn.cs:6957
void SetMemoryAt(long hMem, double[] rgSrc, int nOffset)
Copies an array of double into a block of already allocated GPU memory starting at a specific offset.
Definition: CudaDnn.cs:2860
void slice_bwd(int nCount, long hTopDiff, int nNumSlices, int nSliceSize, int nBottomSliceAxis, int nTopSliceAxis, int nOffsetSliceAxis, long hBottomDiff)
Performs a slice backward pass in Cuda.
Definition: CudaDnn.cs:9907
void Rnn8Backward(long hCuDnn, long hRnn, long hY, long hdY, long hX, long hdX, long hhX, long hdhY, long hdhX, long hcX, long hdcY, long hdcX, long hWt, long hdWt, long hWork, long hReserved)
Calculate the backward pass through the RNN8 for both data and weights.
Definition: CudaDnn.cs:5300
void SetupSSD(long hSSD, int nNum, int nNumPriors, int nNumGt)
Setup the SSD GPU support.
Definition: CudaDnn.cs:5625
void SetMemory(long hMem, float[] rgSrc, long hStream=0)
Copies an array of float into a block of already allocated GPU memory.
Definition: CudaDnn.cs:2769
void FreeRnnDataDesc(long h)
Free an existing RNN Data descriptor.
Definition: CudaDnn.cs:4672
void SetDeviceID(int nDeviceID=-1, DEVINIT flags=DEVINIT.NONE, long? lSeed=null)
Set the device ID used by the current instance of CudaDnn.
Definition: CudaDnn.cs:1960
void DeviceEnablePeerAccess(int nPeerDeviceID)
Enables peer-to-peer access between the current device used by the CudaDnn instance and a peer device...
Definition: CudaDnn.cs:2258
long AllocPCAScores(int nM, int nN, int nK, out int nCount)
Allocates the GPU memory for the PCA scores.
Definition: CudaDnn.cs:5336
void FreeStream(long h)
Free a stream.
Definition: CudaDnn.cs:3227
void add(int n, long hA, long hB, long hY)
Adds A to B and places the result in Y.
Definition: CudaDnn.cs:7227
void tanh_fwd(int nCount, long hBottomData, long hTopData)
Performs a TanH forward pass in Cuda.
Definition: CudaDnn.cs:9286
void channel_dot(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hA, long hY)
Calculates the dot product of the values within each channel of X and places the result in Y.
Definition: CudaDnn.cs:8326
void DivisiveNormalizationForward(long hCuDnn, long hNormDesc, T fAlpha, long hBottomDataDesc, long hBottomData, long hTemp1, long hTemp2, T fBeta, long hTopDataDesc, long hTopData)
Performs a Divisive Normalization forward pass.
Definition: CudaDnn.cs:4408
void elu_fwd(int nCount, long hBottomData, long hTopData, double dfAlpha)
Performs an Exponential Linear Unit (ELU) forward pass in Cuda.
Definition: CudaDnn.cs:9424
long AllocMemory(float[] rgSrc, long hStream=0)
Allocate a block of GPU memory and copy an array of float to it, optionally using a stream for the co...
Definition: CudaDnn.cs:2326
void bias_fwd(int nCount, long hBottomData, long hBiasData, int nBiasDim, int nInnerDim, long hTopData)
Performs a bias forward pass in Cuda.
Definition: CudaDnn.cs:9958
void smoothl1_fwd(int nCount, long hX, long hY)
Performs the forward operation for the SmoothL1 loss.
Definition: CudaDnn.cs:10044
void rng_gaussian(int n, T fMu, T fSigma, long hY)
Fill Y with random numbers using a gaussian random distribution.
Definition: CudaDnn.cs:8608
void fill(int n, int nDim, long hSrc, int nSrcOff, int nCount, long hDst)
Fill data from the source data 'n' times in the destination.
Definition: CudaDnn.cs:6199
void rng_bernoulli(int n, T fNonZeroProb, long hY)
Fill Y with random numbers using a bernoulli random distribution.
Definition: CudaDnn.cs:8659
void softmax_cross_entropy_bwd(int nCount, int nIgnoreLabel, long hTarget, long hBottomDiff)
Performs a softmax cross entropy backward pass in Cuda when an ignore label is specified.
Definition: CudaDnn.cs:10624
void Rnn8Forward(long hCuDnn, long hRnn, long hX, long hY, long hhX, long hhY, long hcX, long hcY, long hWts, long hWork, long hReserved)
Calculate the forward pass through the RNN8.
Definition: CudaDnn.cs:5273
void lstm_fwd(int t, int nN, int nH, int nI, long hWeight_h, long hWeight_i, long hClipData, int nClipOffset, long hTopData, int nTopOffset, long hCellData, int nCellOffset, long hPreGateData, int nPreGateOffset, long hGateData, int nGateOffset, long hHT1Data, int nHT1Offset, long hCT1Data, int nCT1Offset, long hHtoGateData, long hContext=0, long hWeight_c=0, long hCtoGetData=0)
Performs the simple LSTM forward pass in Cuda.
Definition: CudaDnn.cs:10372
T[] SetPixel(long hMem, int nCount, bool bReturnOriginal, int nOffset, params Tuple< int, T >[] rgPixel)
Set a pixel value where each pixel is defined by a set index, value tuple.
Definition: CudaDnn.cs:2933
long CreateCuDNN(long hStream=0)
Create a new instance of NVIDIA's cuDnn.
Definition: CudaDnn.cs:3263
void rng_gaussian(int n, float fMu, float fSigma, long hY)
Fill Y with random numbers using a gaussian random distribution.
Definition: CudaDnn.cs:8593
void lecun_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData)
Performs the LeCun's Tanh function backward
Definition: CudaDnn.cs:9225
void add(int n, long hA, long hB, long hC, long hY)
Adds A, B and C and places the result in Y.
Definition: CudaDnn.cs:7209
void col2im(long hDataCol, int nDataColOffset, int nChannels, int nHeight, int nWidth, int nKernelH, int nKernelW, int nPadH, int nPadW, int nStrideH, int nStrideW, int nDilationH, int nDilationW, long hDataIm, int nDataImOffset)
Rearranges the columns into image blocks.
Definition: CudaDnn.cs:8039
void axpby(int n, double fAlpha, long hX, double fBeta, long hY)
Scale the vector x by Alpha and scale vector y by Beta and then add both together.
Definition: CudaDnn.cs:6595
long CreateFilterDesc()
Create a new instance of a filter descriptor for use with NVIDIA's cuDnn.
Definition: CudaDnn.cs:3668
void KernelCopy(int nCount, long hSrc, int nSrcOffset, long hDstKernel, long hDst, int nDstOffset, long hHostBuffer, long hHostKernel=-1, long hStream=-1, long hSrcKernel=-1)
Copy memory from the look-up tables in one kernel to another.
Definition: CudaDnn.cs:1829
void sign(int n, long hX, long hY, int nXOff=0, int nYOff=0)
Computes the sign of each element of X and places the result in Y.
Definition: CudaDnn.cs:7574
void FreeMemoryTest(long h)
Free a memory test, freeing up all GPU memory used.
Definition: CudaDnn.cs:3095
void axpby(int n, T fAlpha, long hX, T fBeta, long hY)
Scale the vector x by Alpha and scale vector y by Beta and then add both together.
Definition: CudaDnn.cs:6629
float[] get_float(int nCount, long hHandle, int nIdx=-1)
Queries the GPU memory by copying it into an array of floats.
Definition: CudaDnn.cs:5973
void ConvolutionForward(long hCuDnn, T fAlpha, long hBottomDesc, long hBottomData, int nBottomOffset, long hFilterDesc, long hWeight, int nWeightOffset, long hConvDesc, CONV_FWD_ALGO algoFwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, T fBeta, long hTopDesc, long hTopData, int nTopOffset, bool bSyncStream=true)
Perform a convolution forward pass.
Definition: CudaDnn.cs:3882
void lecun_fwd(int nCount, long hBottomData, long hTopData)
Performs the LeCun's Tanh function forward
Definition: CudaDnn.cs:9203
void div(int n, long hA, long hB, long hY)
Divides each element of A by each element of B and places the result in Y.
Definition: CudaDnn.cs:7420
void RnnForward(long hCuDnn, long hRnnDesc, long hXDesc, long hXData, long hHxDesc, long hHxData, long hCxDesc, long hCxData, long hWtDesc, long hWtData, long hYDesc, long hYData, long hHyDesc, long hHyData, long hCyDesc, long hCyData, long hWorkspace, ulong nWsCount, long hReserved, ulong nResCount, bool bTraining)
Run the RNN through a forward pass.
Definition: CudaDnn.cs:4881
void channel_copyall(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
Copy all data from X (shape 1,c,sd) to each num in Y (shape n,c,sd).
Definition: CudaDnn.cs:8474
void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC, uint lda, uint ldb, uint ldc, uint stridea, uint strideb, uint stridec, uint batch_count)
Perform a matrix-matrix multiplication operation: C = alpha transB (B) transA (A) + beta C
Definition: CudaDnn.cs:6343
static ulong basetype_size(bool bUseHalfSize)
Returns the base type size in bytes.
Definition: CudaDnn.cs:1899
void log(int n, long hA, long hY)
Calculates the log value of A and places the result in Y.
Definition: CudaDnn.cs:7488
void channel_fillfrom(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, DIR dir)
Fills each channel with the values stored in Src data where the X data contains nOuterNum x nCh...
Definition: CudaDnn.cs:8152
void GetRnn8MemorySizes(long hCuDnn, long hRnn, out ulong szWtCount, out ulong szWorkSize, out ulong szReservedSize)
Returns the memory sizes required for the RNN8.
Definition: CudaDnn.cs:5221
long CreateNCCL(int nDeviceId, int nCount, int nRank, Guid guid)
Create an instance of NVIDIA's NCCL 'Nickel'
Definition: CudaDnn.cs:3297
void silu_fwd(int nCount, long hBottomData, long hTopData)
Performs the Sigmoid-weighted Linear Unit (SiLU) activation forward pass in Cuda.
Definition: CudaDnn.cs:9118
void axpy(int n, double fAlpha, long hX, long hY)
Multiply the vector X by a scalar and add the result to the vector Y.
Definition: CudaDnn.cs:6544
void adam_update(int nCount, long hNetParamsDiff, long hValM, long hValV, T fBeta1, T fBeta2, T fEpsHat, T fLearningRate, T fCorrection)
Perform the Adam update
Definition: CudaDnn.cs:10287
void sum(int nCount, int nOuterNum, int nInnerNum, long hX, long hY)
Calculates the sum of inner values of X and places the result in Y.
Definition: CudaDnn.cs:8491
void SsdEncodeConfPrediction(long hSSD, int nConfPredCount, long hConfPred, int nConfGtCount, long hConfGt)
Encodes the SSD data into the confidence prediction and confidence ground truths.
Definition: CudaDnn.cs:5810
void rng_bernoulli(int n, float fNonZeroProb, long hY)
Fill Y with random numbers using a bernoulli random distribution.
Definition: CudaDnn.cs:8645
void transposeHW(int n, int c, int h, int w, long hSrc, long hDst)
Transpose a n*c number of matrices along the height and width dimensions. All matrices are in row-maj...
Definition: CudaDnn.cs:6716
void prelu_fwd(int nCount, int nChannels, int nDim, long hBottomData, long hTopData, long hSlopeData, int nDivFactor)
Performs a Parameterized Rectifier Linear Unit (PReLU) forward pass in Cuda.
Definition: CudaDnn.cs:9540
void ConvolutionBackwardFilter(long hCuDnn, long hBottomDesc, long hBottomData, int nBottomOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_FILTER_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, long hFilterDesc, long hWeightDiff, int nWeightOffset, bool bSyncStream)
Perform a convolution backward pass on the filter.
Definition: CudaDnn.cs:3946
long CreateLRNDesc()
Create a new instance of a LRN descriptor for use with NVIDIA's cuDnn.
Definition: CudaDnn.cs:4308
void ReportMemory(Log log, string strLocation)
Report the memory use on the current GPU managed by the CudaDnn object.
Definition: CudaDnn.cs:11236
void mean_error_loss_bwd(int nCount, long hPredicted, long hTarget, long hBottomDiff, MEAN_ERROR merr)
Performs a Mean Error Loss backward pass in Cuda.
Definition: CudaDnn.cs:8991
void RnnBackwardWeights(long hCuDnn, long hRnnDesc, long hXDesc, long hXData, long hHxDesc, long hHxData, long hYDesc, long hYData, long hWorkspace, ulong nWsCount, long hWtDesc, long hWtDiff, long hReserved, ulong nResCount)
Run the RNN backward pass on the weights.
Definition: CudaDnn.cs:5080
void channel_max(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, bool bReturnIdx=false)
Calculates the maximum value within each channel of X and places the result in Y.
Definition: CudaDnn.cs:8099
void scal(int n, float fAlpha, long hX, int nXOff=0)
Scales the data in X by a scaling factor.
Definition: CudaDnn.cs:6782
void serf_fwd(int nCount, long hBottomData, long hTopData, double dfThreshold)
Performs a Serf forward pass in Cuda.
Definition: CudaDnn.cs:9245
void concat_bwd(int nCount, long hTopDiff, int nNumConcats, int nConcatInputSize, int nTopConcatAxis, int nBottomConcatAxis, int nOffsetConcatAxis, long hBottomDiff)
Performs a concat backward pass in Cuda.
Definition: CudaDnn.cs:9869
void sigmoid_fwd(int nCount, long hBottomData, long hTopData)
Performs a Sigmoid forward pass in Cuda.
Definition: CudaDnn.cs:9323
int GetMultiGpuBoardGroupID(int nDeviceID)
Query the multi-gpu board group id for a device.
Definition: CudaDnn.cs:2109
bool contains_point(int n, long hMean, long hWidth, long hX, long hWork, int nXOff=0)
Returns true if the point is contained within the bounds.
Definition: CudaDnn.cs:7943
long CreateRnn8()
Create the RNN8.
Definition: CudaDnn.cs:5160
void lrn_computediff(int nCount, long hBottomData, long hTopData, long hScaleData, long hTopDiff, int nNum, int nChannels, int nHeight, int nWidth, int nSize, T fNegativeBeta, T fCacheRatio, long hBottomDiff)
Computes the diff used to calculate the LRN cross channel backward pass in Cuda.
Definition: CudaDnn.cs:10184
void SigmoidBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Perform a Sigmoid backward pass.
Definition: CudaDnn.cs:4553
long CreatePCA(int nMaxIterations, int nM, int nN, int nK, long hData, long hScoresResult, long hLoadsResult, long hResiduals=0, long hEigenvalues=0)
Creates a new PCA instance and returns the handle to it.
Definition: CudaDnn.cs:5392
void rmsprop_update(int nCount, long hNetParamsDiff, long hHistoryData, T fRmsDecay, T fDelta, T fLocalRate)
Perform the RMSProp update
Definition: CudaDnn.cs:10334
void SetMemory(long hMem, T[] rgSrc, long hStream=0, int nCount=-1)
Copies an array of type 'T' into a block of already allocated GPU memory.
Definition: CudaDnn.cs:2781
void FreeFilterDesc(long h)
Free a filter descriptor instance.
Definition: CudaDnn.cs:3686
void EluBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Perform an Elu backward pass.
Definition: CudaDnn.cs:4513
void im2col(long hDataIm, int nDataImOffset, int nChannels, int nHeight, int nWidth, int nKernelH, int nKernelW, int nPadH, int nPadW, int nStrideH, int nStrideW, int nDilationH, int nDilationW, long hDataCol, int nDataColOffset)
Rearranges image blocks into columns.
Definition: CudaDnn.cs:7989
void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC)
Perform a matrix-matrix multiplication operation: C = alpha transB (B) transA (A) + beta C
Definition: CudaDnn.cs:6236
void SetConvolutionDesc(long hHandle, int hPad, int wPad, int hStride, int wStride, int hDilation, int wDilation, bool bUseTensorCores, bool bHalf=false)
Set the values of a convolution descriptor.
Definition: CudaDnn.cs:3785
void tile_fwd(int nCount, long hBottomData, int nInnerDim, int nTiles, int nBottomTileAxis, long hTopData)
Performs a tile forward pass in Cuda.
Definition: CudaDnn.cs:9924
void FreeSSD(long hSSD)
Free the instance of SSD GPU support.
Definition: CudaDnn.cs:5637
void SetFilterNdDesc(long hHandle, int[] rgDim, bool bHalf=false)
Sets the values of a filter descriptor.
Definition: CudaDnn.cs:3700
void ConvolutionBackwardBias(long hCuDnn, T fAlpha, long hTopDesc, long hTopDiff, int nTopOffset, T fBeta, long hBiasDesc, long hBiasDiff, int nBiasOffset, bool bSyncStream=true)
Perform a convolution backward pass on the bias.
Definition: CudaDnn.cs:3919
long CreatePoolingDesc()
Create a new instance of a pooling descriptor for use with NVIDIA's cuDnn.
Definition: CudaDnn.cs:4037
void scale(int n, double fAlpha, long hX, long hY)
Scales the values in X and places them in Y.
Definition: CudaDnn.cs:6925
void FreeRnn8(long h)
Free an existing RNN8.
Definition: CudaDnn.cs:5178
Tuple< double, double, double, double > minmax(int n, long hA, long hWork1, long hWork2, bool bDetectNans=false, int nAOff=0)
Finds the minimum and maximum values within A.
Definition: CudaDnn.cs:7818
void mask_batch(int n, int nBatch, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY)
Mask the batch of data in the source with the mask by replacing all values 'fSearch' found i...
Definition: CudaDnn.cs:7098
long AllocMemory(long lCapacity, bool bHalfSize=false)
Allocate a block of GPU memory with a specified capacity.
Definition: CudaDnn.cs:2449
long AllocMemory(T[] rgSrc, long hStream=0, bool bHalfSize=false)
Allocate a block of GPU memory and copy an array of type 'T' to it, optionally using a stream for the...
Definition: CudaDnn.cs:2338
void lrn_fillscale(int nCount, long hBottomData, int nNum, int nChannels, int nHeight, int nWidth, int nSize, T fAlphaOverSize, T fK, long hScaleData)
Performs the fill scale operation used to calculate the LRN cross channel forward pass in Cuda.
Definition: CudaDnn.cs:10143
void LRNCrossChannelForward(long hCuDnn, long hNormDesc, T fAlpha, long hBottomDesc, long hBottomData, T fBeta, long hTopDesc, long hTopData)
Perform LRN cross channel forward pass.
Definition: CudaDnn.cs:4361
float[] GetHostMemoryFloat(long hMem)
Retrieves the host memory as an array of floats.
Definition: CudaDnn.cs:2652
void transpose(int n, long hX, long hY, long hXCounts, long hYCounts, long hMapping, int nNumAxes, long hBuffer)
Perform a transpose on X producing Y, similar to the numpy.transpose operation.
Definition: CudaDnn.cs:7862
int GetRnnParamCount(long hCuDnn, long hRnnDesc, long hXDesc)
Returns the RNN parameter count.
Definition: CudaDnn.cs:4785
void gaussian_blur(int n, int nChannels, int nHeight, int nWidth, double dfSigma, long hX, long hY)
The gaussian_blur runs a Gaussian blurring operation over each channel of the data using the sigma.
Definition: CudaDnn.cs:10980
void gelu_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData, bool bEnableBertVersion)
Performs a GELU backward pass in Cuda.
Definition: CudaDnn.cs:9098
void max_bwd(int nCount, long hTopDiff, int nIdx, long hMask, long hBottomDiff)
Performs a max backward pass in Cuda.
Definition: CudaDnn.cs:9758
double max(int n, long hA, out long lPos, int nAOff=0, long hWork=0)
Finds the maximum value of A.
Definition: CudaDnn.cs:7724
double GetDeviceMemory(out double dfFree, out double dfUsed, out bool bCudaCallUsed, int nDeviceID=-1)
Queries the amount of total, free and used memory on a given GPU.
Definition: CudaDnn.cs:2182
void PoolingBackward(long hCuDnn, long hPoolingDesc, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Perform a pooling backward pass.
Definition: CudaDnn.cs:4116
void SsdEncodeLocPrediction(long hSSD, int nLocPredCount, long hLocPred, int nLocGtCount, long hLocGt)
Encodes the SSD data into the location prediction and location ground truths.
Definition: CudaDnn.cs:5794
void exp(int n, long hA, long hY)
Calculates the exponent value of A and places the result in Y.
Definition: CudaDnn.cs:7454
string GetDeviceInfo(int nDeviceID, bool bVerbose=false)
Query the device information of a device.
Definition: CudaDnn.cs:2064
void BatchNormBackward(long hCuDnn, BATCHNORM_MODE mode, T fAlphaDiff, T fBetaDiff, T fAlphaParamDiff, T fBetaParamDiff, long hBwdBottomDesc, long hBottomData, long hTopDiffDesc, long hTopDiff, long hBottomDiffDesc, long hBottomDiff, long hBwdScaleBiasMeanVarDesc, long hScaleData, long hScaleDiff, long hBiasDiff, double dfEps, long hSaveMean, long hSaveInvVar)
Run the batch norm backward pass.
Definition: CudaDnn.cs:4191
void sqrt_scale(int nCount, long hX, long hY)
Scale the data by the sqrt of the data. y = sqrt(abs(x)) * sign(x)
Definition: CudaDnn.cs:7638
void channel_mulv(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hA, long hX, long hC)
Multiplies the values in vector X by each channel in matrix A and places the result in matrix C.
Definition: CudaDnn.cs:8290
void softplus_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData)
Performs the Softplus function backward, a smooth approximation of the ReLU function
Definition: CudaDnn.cs:9183
void SetFilterDesc(long hHandle, int n, int c, int h, int w, bool bHalf=false)
Sets the values of a filter descriptor.
Definition: CudaDnn.cs:3735
void channel_sum(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, bool bSumAcrossChannels=true, DIR dir=DIR.FWD, int nChannelsY=-1)
Calculates the sum of the values either across or within each channel (depending on bSumAcrossChanne...
Definition: CudaDnn.cs:8236
void lrn_computeoutput(int nCount, long hBottomData, long hScaleData, T fNegativeBeta, long hTopData)
Computes the output used to calculate the LRN cross channel forward pass in Cuda.
Definition: CudaDnn.cs:10159
long AllocHostBuffer(long lCapacity)
Allocate a block of host memory with a specified capacity.
Definition: CudaDnn.cs:2581
void channel_copy(int nCount, int nOuterNum, int nChannels, int nBlocks, int nInnerNum, int nOffset, long hX, long hY, DIR dir)
Copy data along channels similar to numpy split function.
Definition: CudaDnn.cs:8457
float dot_float(int n, long hX, long hY)
Computes the dot product of X and Y.
Definition: CudaDnn.cs:6830
void add(int n, long hA, long hB, long hY, double dfAlphaA, double dfAlphaB, int nAOff=0, int nBOff=0, int nYOff=0)
Adds A to (B times scalar) and places the result in Y.
Definition: CudaDnn.cs:7288
void batchreidx_fwd(int nCount, int nInnerDim, long hBottomData, long hPermutData, long hTopData)
Performs the forward pass for batch re-index
Definition: CudaDnn.cs:8727
void mul(int n, long hA, long hB, long hY, int nAOff=0, int nBOff=0, int nYOff=0)
Multiplies each element of A with each element of B and places the result in Y.
Definition: CudaDnn.cs:7334
void channel_duplicate(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
Duplicates each channel 'nInnerNum' of times in the destination.
Definition: CudaDnn.cs:8343
long GetHostBufferCapacity(long hMem)
Returns the host memory capacity.
Definition: CudaDnn.cs:2621
void im2col_nd(long hDataIm, int nDataImOffset, int nNumSpatialAxes, int nImCount, int nChannelAxis, long hImShape, long hColShape, long hKernelShape, long hPad, long hStride, long hDilation, long hDataCol, int nDataColOffset)
Rearranges image blocks into columns.
Definition: CudaDnn.cs:8013
void GetRnnLinLayerParams(long hCuDnn, long hRnnDesc, int nLayer, long hXDesc, long hWtDesc, long hWtData, int nLinLayer, out int nWtCount, out long hWt, out int nBiasCount, out long hBias)
Returns the linear layer parameters (weights).
Definition: CudaDnn.cs:4837
void BatchNormForward(long hCuDnn, BATCHNORM_MODE mode, T fAlpha, T fBeta, long hFwdBottomDesc, long hBottomData, long hFwdTopDesc, long hTopData, long hFwdScaleBiasMeanVarDesc, long hScaleData, long hBiasData, double dfFactor, long hGlobalMean, long hGlobalVar, double dfEps, long hSaveMean, long hSaveInvVar, bool bTraining)
Run the batch norm forward pass.
Definition: CudaDnn.cs:4161
void unpooling_bwd(POOLING_METHOD method, int nCount, long hTopDiff, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hBottomDiff, long hMask)
Performs the backward pass for unpooling using Cuda
Definition: CudaDnn.cs:8895
void gelu_fwd(int nCount, long hBottomData, long hTopData, bool bEnableBertVersion)
Performs a GELU forward pass in Cuda.
Definition: CudaDnn.cs:9064
void FreeDropoutDesc(long h)
Free a dropout descriptor instance.
Definition: CudaDnn.cs:4221
void FreeExtension(long hExtension)
Free an instance of an Extension.
Definition: CudaDnn.cs:3474
void GetConvolutionInfo(long hCuDnn, long hBottomDesc, long hFilterDesc, long hConvDesc, long hTopDesc, ulong lWorkspaceSizeLimitInBytes, bool bUseTensorCores, out CONV_FWD_ALGO algoFwd, out ulong lWsSizeFwd, out CONV_BWD_FILTER_ALGO algoBwdFilter, out ulong lWsSizeBwdFilter, out CONV_BWD_DATA_ALGO algoBwdData, out ulong lWsSizeBwdData, CONV_FWD_ALGO preferredFwdAlgo=CONV_FWD_ALGO.NONE)
Queries the algorithms and workspace sizes used for a given convolution descriptor.
Definition: CudaDnn.cs:3810
long CreateLayerNorm(int nGpuID, int nCount, int nOuterNum, int nChannels, int nInnerNum, float fEps=1e-10f)
Create the Cuda version of LayerNorm
Definition: CudaDnn.cs:5828
void SoftmaxForward(long hCuDnn, SOFTMAX_ALGORITHM alg, SOFTMAX_MODE mode, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
Perform a Softmax forward pass.
Definition: CudaDnn.cs:4618
void debug()
The debug function is used only when debugging the debug version of the low-level DLL.
Definition: CudaDnn.cs:10637
void SetTensorNdDesc(long hHandle, int[] rgDim, int[] rgStride, bool bHalf=false)
Sets the values of a tensor descriptor.
Definition: CudaDnn.cs:3551
void gemm(bool bTransA, bool bTransB, int m, int n, int k, float fAlpha, long hA, long hB, float fBeta, long hC)
Perform a matrix-matrix multiplication operation: C = alpha transB (B) transA (A) + beta C
Definition: CudaDnn.cs:6257
void channel_div(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, int nMethod=1)
Divides the values of the channels from X and places the result in Y.
Definition: CudaDnn.cs:8254
long KernelCopyNccl(long hSrcKernel, long hSrcNccl)
Copies an Nccl handle from one kernel to the current kernel of the current CudaDnn instance.
Definition: CudaDnn.cs:1866
void calc_dft_coefficients(int n, long hX, int m, long hY)
Calculates the discrete Fourier Transform (DFT) coefficients across the frequencies 1....
Definition: CudaDnn.cs:11027
void softmax_cross_entropy_fwd(int nCount, long hProbData, long hLabel, long hLossDiff, long hLossData, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
Performs a softmax cross entropy forward pass in Cuda.
Definition: CudaDnn.cs:10595
void softmaxloss_bwd(int nCount, long hTopData, long hLabel, long hBottomDiff, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
Performs Softmax Loss backward pass in Cuda.
Definition: CudaDnn.cs:9639
void SetMemoryAt(long hMem, float[] rgSrc, int nOffset)
Copies an array of float into a block of already allocated GPU memory starting at a specific offset.
Definition: CudaDnn.cs:2872
void min_fwd(int nCount, long hBottomDataA, long hBottomDataB, int nIdx, long hTopData, long hMask)
Performs a min forward pass in Cuda.
Definition: CudaDnn.cs:9778
long AllocMemory(double[] rgSrc, long hStream=0)
Allocate a block of GPU memory and copy an array of doubles to it, optionally using a stream for the ...
Definition: CudaDnn.cs:2314
double[] GetMemoryDouble(long hMem, long lCount=-1)
Retrieves the GPU memory as an array of doubles.
Definition: CudaDnn.cs:2677
void pooling_fwd(POOLING_METHOD method, int nCount, long hBottomData, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hTopData, long hMask, long hTopMask)
Performs the forward pass for pooling using Cuda
Definition: CudaDnn.cs:8810
double sumsqdiff(int n, long hW, long hA, long hB, int nAOff=0, int nBOff=0)
Calculates the sum of squares of differences between A and B
Definition: CudaDnn.cs:7902
void SynchronizeThread()
Synchronize all kernel threads on the current GPU.
Definition: CudaDnn.cs:3250
void SetRnn8(long hCuDnn, long hRnn, bool bTraining, RNN_DATALAYOUT layout, RNN_MODE cellMode, RNN_BIAS_MODE biasMode, int nSequenceLen, int nBatchSize, int nInputs, int nHidden, int nOutputs, int nProjection, int nNumLayers, float fDropout, ulong lSeed, bool bBidirectional=false)
Set the RNN8 parameters.
Definition: CudaDnn.cs:5205
void add_scalar(int n, float fAlpha, long hY)
Adds a scalar value to each element of Y.
Definition: CudaDnn.cs:7175
void sigmoid_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff)
Performs a Sigmoid backward pass in Cuda.
Definition: CudaDnn.cs:9341
void prelu_bwd_param(int nCDim, int nNum, int nTopOffset, long hTopDiff, long hBottomData, long hBackBuffDiff)
Performs Parameterized Rectifier Linear Unit (ReLU) backward param pass in Cuda.
Definition: CudaDnn.cs:9562
void FreeRnnDesc(long h)
Free an existing RNN descriptor.
Definition: CudaDnn.cs:4751
void mish_fwd(int nCount, long hBottomData, long hTopData, double dfThreshold)
Performs a Mish forward pass in Cuda.
Definition: CudaDnn.cs:9011
void channel_percentile(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, double dfPercentile)
Calculates the percentile along axis = 0.
Definition: CudaDnn.cs:8361
void divbsx(int n, long hA, int nAOff, long hX, int nXOff, int nC, int nSpatialDim, bool bTranspose, long hB, int nBOff)
Divide a matrix by a vector.
Definition: CudaDnn.cs:6671
void FreeLRNDesc(long h)
Free a LRN descriptor instance.
Definition: CudaDnn.cs:4326
void FreeHostBuffer(long hMem)
Free previously allocated host memory.
Definition: CudaDnn.cs:2602
void sigmoid_cross_entropy_fwd(int nCount, long hInput, long hTarget, long hLoss, bool bHasIgnoreLabel, int nIgnoreLabel, long hCountData)
Performs a sigmoid cross entropy forward pass in Cuda.
Definition: CudaDnn.cs:10556
void softmaxloss_fwd(int nCount, long hProbData, long hLabel, long hLossData, int nOuterNum, int nDim, int nInnerNum, long hCounts, int? nIgnoreLabel)
Performs Softmax Loss forward pass in Cuda.
Definition: CudaDnn.cs:9605
void rng_uniform(int n, T fMin, T fMax, long hY)
Fill Y with random numbers using a uniform random distribution.
Definition: CudaDnn.cs:8554
CudaDnn(CudaDnn< T > cuda, bool bEnableGhostMemory)
Alternate CudaDnn constructor.
Definition: CudaDnn.cs:1587
void EluForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
Perform a Elu forward pass.
Definition: CudaDnn.cs:4491
void DropoutForward(long hCuDnn, long hDropoutDesc, long hBottomDesc, long hBottomData, long hTopDesc, long hTopData, long hReserved)
Performs a dropout forward pass.
Definition: CudaDnn.cs:4278
void CopyDeviceToHost(long lCount, long hGpuSrc, long hHostDst)
Copy from GPU memory to Host memory.
Definition: CudaDnn.cs:2554
void sqrt(int n, long hX, long hY)
Computes the square root of each element of X and places the result in Y.
Definition: CudaDnn.cs:7624
void clip_fwd(int nCount, long hBottomData, long hTopData, T fMin, T fMax)
Performs a Clip forward pass in Cuda.
Definition: CudaDnn.cs:8914
void DisableGhostMemory()
Disables the ghost memory, if enabled.
Definition: CudaDnn.cs:1775
long AllocPCAData(int nM, int nN, int nK, out int nCount)
Allocates the GPU memory for the PCA Data.
Definition: CudaDnn.cs:5319
double asum_double(int n, long hX, int nXOff=0)
Computes the sum of absolute values in X.
Definition: CudaDnn.cs:6871
T asum(int n, long hX, int nXOff=0)
Computes the sum of absolute values in X.
Definition: CudaDnn.cs:6901
T erf(T fVal)
Calculates the erf() function.
Definition: CudaDnn.cs:7006
void add(int n, long hA, long hB, long hY, double dfAlpha)
Adds A to (B times scalar) and places the result in Y.
Definition: CudaDnn.cs:7246
void lstm_unit_fwd(int nCount, int nHiddenDim, int nXCount, long hX, long hX_acts, long hC_prev, long hCont, long hC, long hH)
Performs the simple LSTM forward pass in Cuda for a given LSTM unit.
Definition: CudaDnn.cs:10436
void powx(int n, long hA, T fAlpha, long hY, int nAOff=0, int nYOff=0)
Calculates the A raised to the power alpha and places the result in Y.
Definition: CudaDnn.cs:7558
void powx(int n, long hA, float fAlpha, long hY, int nAOff=0, int nYOff=0)
Calculates the A raised to the power alpha and places the result in Y.
Definition: CudaDnn.cs:7541
void DivisiveNormalizationBackward(long hCuDnn, long hNormDesc, T fAlpha, long hBottomDataDesc, long hBottomData, long hTopDiff, long hTemp1, long hTemp2, T fBeta, long hBottomDiffDesc, long hBottomDiff)
Performs a Divisive Normalization backward pass.
Definition: CudaDnn.cs:4433
void geam(bool bTransA, bool bTransB, int m, int n, double fAlpha, long hA, long hB, double fBeta, long hC)
Perform a matrix-matrix addition/transposition operation: C = alpha transA (A) + beta transB (B)
Definition: CudaDnn.cs:6366
void gemv(bool bTransA, int m, int n, float fAlpha, long hA, long hX, float fBeta, long hY)
Perform a matrix-vector multiplication operation: y = alpha transA (A) x + beta y (where x and y are ...
Definition: CudaDnn.cs:6450
void GetDropoutInfo(long hCuDnn, long hBottomDesc, out ulong ulStateCount, out ulong ulReservedCount)
Query the dropout state and reserved counts.
Definition: CudaDnn.cs:4252
void SetMemory(long hMem, double[] rgSrc, long hStream=0)
Copies an array of double into a block of already allocated GPU memory.
Definition: CudaDnn.cs:2757
long CreateImageOp(int nNum, double dfBrightnessProb, double dfBrightnessDelta, double dfContrastProb, double dfContrastLower, double dfContrastUpper, double dfSaturationProb, double dfSaturationLower, double dfSaturationUpper, long lRandomSeed=0)
Create a new ImageOp used to perform image operations on the GPU.
Definition: CudaDnn.cs:3153
int GetDeviceCount()
Query the number of devices (gpu's) installed.
Definition: CudaDnn.cs:2127
void SetRandomSeed(long lSeed)
Set the random number generator seed.
Definition: CudaDnn.cs:1990
string GetDeviceP2PInfo(int nDeviceID)
Query the peer-to-peer information of a device.
Definition: CudaDnn.cs:2049
void channel_sub(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
Subtracts the values across the channels from X and places the result in Y.
Definition: CudaDnn.cs:8214
void ConvolutionForward(long hCuDnn, long hBottomDesc, long hBottomData, int nBottomOffset, long hFilterDesc, long hWeight, int nWeightOffset, long hConvDesc, CONV_FWD_ALGO algoFwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, long hTopDesc, long hTopData, int nTopOffset, bool bSyncStream=true)
Perform a convolution forward pass.
Definition: CudaDnn.cs:3856
void ger(int m, int n, double fAlpha, long hX, long hY, long hA)
Perform a vector-vector multiplication operation: A = x * (fAlpha * y) (where x and y are vectors and...
Definition: CudaDnn.cs:6492
void DistortImage(long h, int nCount, int nNum, int nDim, long hX, long hY)
Distort an image using the ImageOp handle provided.
Definition: CudaDnn.cs:3188
void geam(bool bTransA, bool bTransB, int m, int n, T fAlpha, long hA, long hB, T fBeta, long hC, int nAOffset=0, int nBOffset=0, int nCOffset=0)
Perform a matrix-matrix addition/transposition operation: C = alpha transA (A) + beta transB (B)
Definition: CudaDnn.cs:6409
void scal(int n, T fAlpha, long hX, int nXOff=0)
Scales the data in X by a scaling factor.
Definition: CudaDnn.cs:6797
void rng_bernoulli(int n, double fNonZeroProb, long hY)
Fill Y with random numbers using a bernoulli random distribution.
Definition: CudaDnn.cs:8631
long AllocPCAEigenvalues(int nM, int nN, int nK, out int nCount)
Allocates the GPU memory for the PCA eigenvalues.
Definition: CudaDnn.cs:5370
void max_bwd(int n, long hAdata, long hBdata, long hYdiff, long hAdiff, long hBdiff)
Propagates the Y diff back to the max of A or B and places the result in A if its data has the max,...
Definition: CudaDnn.cs:7686
void accuracy_fwd(int nCount, int nOuterNum, int nInnerNum, long hBottomData, long hBottomLabel, long hAccData, long hAccTotals, int? nIgnoreLabel, bool bLastElementOnly, int nBatch)
Performs the forward pass for the accuracy layer
Definition: CudaDnn.cs:8700
string GetDeviceName(int nDeviceID)
Query the name of a device.
Definition: CudaDnn.cs:2035
void interp2(int nChannels, long hData1, int nX1, int nY1, int nHeight1, int nWidth1, int nHeight1A, int nWidth1A, long hData2, int nX2, int nY2, int nHeight2, int nWidth2, int nHeight2A, int nWidth2A, bool bBwd=false)
Interpolates between two sizes within the spatial dimensions.
Definition: CudaDnn.cs:7138
void swish_bwd(int nCount, long hTopDiff, long hTopData, long hSigmoidOutputData, long hBottomDiff, double dfBeta)
Performs a Swish backward pass in Cuda.
Definition: CudaDnn.cs:9361
static void SetDefaultCudaPath(string strPath)
Used to optionally set the default path to the Low-Level Cuda Dnn DLL file.
Definition: CudaDnn.cs:1890
void max_fwd(int nCount, long hBottomDataA, long hBottomDataB, int nIdx, long hTopData, long hMask)
Performs a max forward pass in Cuda.
Definition: CudaDnn.cs:9742
void dropout_fwd(int nCount, long hBottomData, long hMask, uint uiThreshold, T fScale, long hTopData)
Performs a dropout forward pass in Cuda.
Definition: CudaDnn.cs:9464
double min(int n, long hA, out long lPos, int nAOff=0, long hWork=0)
Finds the minimum value of A.
Definition: CudaDnn.cs:7772
void bnll_bwd(int nCount, long hTopDiff, long hBottomData, long hBottomDiff)
Performs a binomial normal log likelihood (BNLL) backward pass in Cuda.
Definition: CudaDnn.cs:9516
T[] GetHostMemory(long hMem)
Retrieves the host memory as an array of type 'T'
Definition: CudaDnn.cs:2662
T[] RunMemoryTest(long h, MEMTEST_TYPE type, ulong ulBlockStartOffset, ulong ulBlockCount, bool bVerbose, bool bWrite, bool bReadWrite, bool bRead)
The RunMemoryTest method runs the memory test from the block start offset through the block count on ...
Definition: CudaDnn.cs:3123
void AddTensor(long hCuDnn, long hSrcDesc, long hSrc, int nSrcOffset, long hDstDesc, long hDst, int nDstOffset)
Add two tensors together.
Definition: CudaDnn.cs:3638
void coeff_sub_fwd(int nCount, int nDim, int nNumOffset, double dfCoeff, long hCoeffData, long hBottom, long hTop)
Performs a coefficient sub forward pass in Cuda.
Definition: CudaDnn.cs:10518
void sub_and_dot(int n, int nN, int nInnerNum, long hA, long hB, long hY, int nAOff, int nBOff, int nYOff)
Subtracts every nInnerNum element of B from A and performs a dot product on the result.
Definition: CudaDnn.cs:7357
void NcclInitializeMultiProcess(long hNccl)
Initializes a set of NCCL instances for use in different processes.
Definition: CudaDnn.cs:3403
T[] RunExtension(long hExtension, long lfnIdx, T[] rgParam)
Run a function on the extension specified.
Definition: CudaDnn.cs:3489
void gather_bwd(int nCount, long hTop, long hBottom, int nAxis, int nDim, int nDimAtAxis, int nM, int nN, long hIdx)
Performs a gather backward pass where data at specified indexes along a given axis are copied to the ...
Definition: CudaDnn.cs:10122
void gemv(bool bTransA, int m, int n, T fAlpha, long hA, long hX, T fBeta, long hY, int nAOffset=0, int nXOffset=0, int nYOffset=0)
Perform a matrix-vector multiplication operation: y = alpha transA (A) x + beta y (where x and y are ...
Definition: CudaDnn.cs:6472
void prelu_bwd(int nCount, int nChannels, int nDim, long hTopDiff, long hBottomData, long hBottomDiff, long hSlopeData, int nDivFactor)
Performs Parameterized Rectifier Linear Unit (ReLU) backward pass in Cuda.
Definition: CudaDnn.cs:9585
void KernelAdd(int nCount, long hA, long hDstKernel, long hB, long hC)
Add memory from one kernel to memory residing on another kernel.
Definition: CudaDnn.cs:1848
void axpby(int n, float fAlpha, long hX, float fBeta, long hY)
Scale the vector Y by fBeta, then multiply the vector X by fAlpha and add the result to the vector Y.
Definition: CudaDnn.cs:6611
void SetTensorDesc(long hHandle, int n, int c, int h, int w, bool bHalf=false)
Sets the values of a tensor descriptor.
Definition: CudaDnn.cs:3599
void SetMemoryAt(long hMem, T[] rgSrc, int nOffset)
Copies an array of type 'T' into a block of already allocated GPU memory starting at a specific offse...
Definition: CudaDnn.cs:2883
void InitializeRnn8Weights(long hCuDnn, long hRnn, long hWt, RNN_FILLER_TYPE wtFt, double fWtVal, double fWtVal2, RNN_FILLER_TYPE biasFt, double fBiasVal, double fBiasVal2)
Initialize the RNN8 weights
Definition: CudaDnn.cs:5251
void LayerNormBackward(long hLayerNorm, long hYdata, long hYdiff, long hXdiff)
Run the LayerNorm backward pass.
Definition: CudaDnn.cs:5875
void axpy(int n, float fAlpha, long hX, long hY)
Multiply the vector X by a scalar and add the result to the vector Y.
Definition: CudaDnn.cs:6559
void DeriveBatchNormDesc(long hFwdScaleBiasMeanVarDesc, long hFwdBottomDesc, long hBwdScaleBiasMeanVarDesc, long hBwdBottomDesc, BATCHNORM_MODE mode)
Derive the batch norm descriptors for both the forward and backward passes.
Definition: CudaDnn.cs:4132
void sgd_update(int nCount, long hNetParamsDiff, long hHistoryData, T fMomentum, T fLocalRate)
Perform the Stochastic Gradient Descent (SGD) update
Definition: CudaDnn.cs:10203
void minmax(int n, long hA, long hWork1, long hWork2, int nK, long hMin, long hMax, bool bNonZeroOnly)
Finds up to 'nK' minimum and maximum values within A.
Definition: CudaDnn.cs:7843
void LayerNormForward(long hLayerNorm, long hXdata, long hYdata)
Run the LayerNorm forward pass.
Definition: CudaDnn.cs:5860
double erf(double dfVal)
Calculates the erf() function.
Definition: CudaDnn.cs:6986
bool CheckMemoryAttributes(long hSrc, int nSrcDeviceID, long hDst, int nDstDeviceID)
Check the memory attributes of two memory blocks on different devices to see if they are compatible f...
Definition: CudaDnn.cs:2160
void FreeImageOp(long h)
Free an image op, freeing up all GPU memory used.
Definition: CudaDnn.cs:3171
void copy_sequence(int nK, int nNum, int nDim, long hSrcData, long hSrcLbl, int nSrcCacheCount, long hSrcCache, int nLabelStart, int nLabelCount, int nCacheSize, long hCacheHostCursors, bool bOutputLabels, List< long > rghTop, List< int > rgnTopCount, long hWorkDataHost, bool bCombinePositiveAndNegative=false, int nSeed=0)
Copy a sequence of cached items, organized by label, into an anchor, positive (if nK > 0),...
Definition: CudaDnn.cs:6095
void math_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData, MATH_FUNCTION function)
Performs a Math function backward pass in Cuda.
Definition: CudaDnn.cs:8966
void ConvolutionBackwardData(long hCuDnn, long hFilterDesc, long hWeight, int nWeightOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_DATA_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, long hBottomDesc, long hBottomDiff, int nBottomOffset, bool bSyncStream=true)
Perform a convolution backward pass on the data.
Definition: CudaDnn.cs:3999
void mask(int n, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY)
Mask the data in the source with the mask by replacing all values 'fSearch' found in the mas...
Definition: CudaDnn.cs:7063
void axpy(int n, T fAlpha, long hX, long hY, int nXOff=0, int nYOff=0)
Multiply the vector X by a scalar and add the result to the vector Y.
Definition: CudaDnn.cs:6576
void TanhForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
Perform a Tanh forward pass.
Definition: CudaDnn.cs:4451
void FreeTensorDesc(long h)
Free a tensor descriptor instance.
Definition: CudaDnn.cs:3536
void mulbsx(int n, long hA, int nAOff, long hX, int nXOff, int nC, int nSpatialDim, bool bTranspose, long hB, int nBOff)
Multiply a matrix with a vector.
Definition: CudaDnn.cs:6650
void scale_fwd(int nCount, long hX, long hScaleData, int nScaleDim, int nInnerDim, long hY, long hBiasData=0)
Performs a scale forward pass in Cuda.
Definition: CudaDnn.cs:9983
long CreateStream(bool bNonBlocking=false, int nIndex=-1)
Create a new stream on the current GPU.
Definition: CudaDnn.cs:3209
void tanh_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff)
Performs a TanH backward pass in Cuda.
Definition: CudaDnn.cs:9304
void ReLUForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
Perform a ReLU forward pass.
Definition: CudaDnn.cs:4576
ulong GetRnnWorkspaceCount(long hCuDnn, long hRnnDesc, long hXDesc, out ulong nReservedCount)
Returns the workspace and reserved counts.
Definition: CudaDnn.cs:4807
long CreateExtension(string strExtensionDllPath)
Create an instance of an Extension DLL.
Definition: CudaDnn.cs:3456
void serf_bwd(int nCount, long hTopDiff, long hTopData, long hBottomDiff, long hBottomData, double dfThreshold)
Performs a Serf backward pass in Cuda.
Definition: CudaDnn.cs:9267
void concat_fwd(int nCount, long hBottomData, int nNumConcats, int nConcatInputSize, int nTopConcatAxis, int nBottomConcatAxis, int nOffsetConcatAxis, long hTopData)
Performs a concat forward pass in Cuda.
Definition: CudaDnn.cs:9849
void crop_fwd(int nCount, int nNumAxes, long hSrcStrides, long hDstStrides, long hOffsets, long hBottomData, long hTopData)
Performs the crop forward operation.
Definition: CudaDnn.cs:9812
void softplus_fwd(int nCount, long hBottomData, long hTopData)
Performs the Softplus function forward, a smooth approximation of the ReLU function
Definition: CudaDnn.cs:9161
void RnnBackwardData(long hCuDnn, long hRnnDesc, long hYDesc, long hYData, long hYDiff, long hHyDesc, long hHyDiff, long hCyDesc, long hCyDiff, long hWtDesc, long hWtData, long hHxDesc, long hHxData, long hCxDesc, long hCxData, long hXDesc, long hXDiff, long hdHxDesc, long hHxDiff, long hdCxDesc, long hCxDiff, long hWorkspace, ulong nWsCount, long hReserved, ulong nResCount)
Run the RNN backward pass through the data.
Definition: CudaDnn.cs:4981
void copy(int nCount, int nNum, int nDim, long hSrc1, long hSrc2, long hDst, long hSimilar, bool bInvert=false)
Copy similar items of length 'nDim' from hSrc1 (where hSimilar(i) = 1) and dissimilar items of length...
Definition: CudaDnn.cs:6035
void DropoutBackward(long hCuDnn, long hDropoutDesc, long hTopDesc, long hTop, long hBottomDesc, long hBottom, long hReserved)
Performs a dropout backward pass.
Definition: CudaDnn.cs:4296
void NcclAllReduce(long hNccl, long hStream, long hX, int nCount, NCCL_REDUCTION_OP op, double dfScale=1.0)
Performs a reduction on all NCCL instances as specified by the reduction operation.
Definition: CudaDnn.cs:3442
void FreeMemoryPointer(long hData)
Frees a memory pointer.
Definition: CudaDnn.cs:3046
void SetRnnDataDesc(long hRnnDataDesc, RNN_DATALAYOUT layout, int nMaxSeqLen, int nBatchSize, int nVectorSize, bool bBidirectional=false, int[] rgSeqLen=null)
Sets the RNN Data Descriptor values.
Definition: CudaDnn.cs:4692
float asum_float(int n, long hX, int nXOff=0)
Computes the sum of absolute values in X.
Definition: CudaDnn.cs:6886
void min(int n, long hA, long hB, long hY)
Calculates the min of A and B and places the result in Y. This min is only computed on a per item bas...
Definition: CudaDnn.cs:7702
long CreateMemoryTest(out ulong ulTotalNumBlocks, out double dfMemAllocatedInGB, out ulong ulMemStartAddr, out ulong ulBlockSize, double dfPctToAllocate=1.0)
Creates a new memory test on the current GPU.
Definition: CudaDnn.cs:3069
void channel_scale(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hA, long hY)
Multiplies the values of the channels from X with the scalar values in A and places the result in Y.
Definition: CudaDnn.cs:8308
void mask(int n, int nMaskDim, T fSearch, T fReplace, long hX, long hMask, long hY)
Mask the data in the source with the mask by replacing all values 'fSearch' found in the mas...
Definition: CudaDnn.cs:7030
void SetLRNDesc(long hHandle, uint nSize, double fAlpha, double fBeta, double fK)
Set the LRN descriptor values.
Definition: CudaDnn.cs:4342
void batchreidx_bwd(int nCount, int nInnerDim, long hTopDiff, long hTopIdx, long hBegins, long hCounts, long hBottomDiff)
Performs the backward pass for batch re-index
Definition: CudaDnn.cs:8745
void set_bounds(int n, double dfMin, double dfMax, long hX)
Set the bounds of all items within the data to a set range of values.
Definition: CudaDnn.cs:6732
float[] GetMemoryFloat(long hMem, long lCount=-1)
Retrieves the GPU memory as an array of float.
Definition: CudaDnn.cs:2689
void SetDropoutDesc(long hCuDnn, long hDropoutDesc, double dfDropout, long hStates, long lSeed)
Set the dropout descriptor values.
Definition: CudaDnn.cs:4237
virtual void Dispose(bool bDisposing)
Disposes this instance freeing up all of its host and GPU memory.
Definition: CudaDnn.cs:1612
void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC, uint lda, uint ldb, uint ldc)
Perform a matrix-matrix multiplication operation: C = alpha transB (B) transA (A) + beta C
Definition: CudaDnn.cs:6312
void NcclInitializeSingleProcess(params long[] rghNccl)
Initializes a set of NCCL instances for use in a single process.
Definition: CudaDnn.cs:3370
void abs(int n, long hA, long hY)
Calculates the absolute value of A and places the result in Y.
Definition: CudaDnn.cs:7437
double[] GetHostMemoryDouble(long hMem)
Retrieves the host memory as an array of doubles.
Definition: CudaDnn.cs:2641
void compare_signs(int n, long hA, long hB, long hY)
Compares the signs of each value in A and B and places the result in Y.
Definition: CudaDnn.cs:7653
long AllocPCALoads(int nM, int nN, int nK, out int nCount)
Allocates the GPU memory for the PCA loads.
Definition: CudaDnn.cs:5353
T dot(int n, long hX, long hY, int nXOff=0, int nYOff=0)
Computes the dot product of X and Y.
Definition: CudaDnn.cs:6847
void coeff_sum_fwd(int nCount, int nDim, int nNumOffset, double dfCoeff, long hCoeffData, long hBottom, long hTop)
Performs a coefficient sum forward pass in Cuda.
Definition: CudaDnn.cs:10481
void ConvolutionBackwardBias(long hCuDnn, long hTopDesc, long hTopDiff, int nTopOffset, long hBiasDesc, long hBiasDiff, int nBiasOffset, bool bSyncStream=true)
Perform a convolution backward pass on the bias.
Definition: CudaDnn.cs:3901
void copy_sequence(int n, long hSrc, int nSrcStep, int nSrcStartIdx, int nCopyCount, int nCopyDim, long hDst, int nDstStep, int nDstStartIdx, int nSrcSpatialDim, int nDstSpatialDim, int nSrcSpatialDimStartIdx=0, int nDstSpatialDimStartIdx=0, int nSpatialDimCount=-1)
Copy a sequence from a source to a destination and allow for skip steps.
Definition: CudaDnn.cs:6165
The CudaDnnMemoryTracker is used for diagnostics in that it helps estimate the amount of memory that ...
void FreeMemory(long hKernel, int nDeviceID, long hMemory)
Simulate a memory free.
string TotalMemoryUsedText
Returns a text string describing the total amount of memory used (in bytes).
ulong TotalMemoryUsed
Returns the total amount of memory used (in bytes).
long AllocMemory(long hKernel, int nDeviceID, long hMemory, ulong lSize, bool bHalf)
Simulate a memory allocation.
The Params contains the base parameters used in multi-GPU training.
Definition: Parallel.cs:19
Specifies the parameters for the ReshapeTemporalLayer.
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
Definition: Annotation.cs:12
@ NONE
No training category specified.
@ DEFAULT
Specifies to use the default data type of the gym used.
The MyCaffe.common namespace contains common MyCaffe classes.
Definition: BatchInput.cs:8
OP
Defines the operations performed by the channel_op function.
Definition: CudaDnn.cs:135
@ SUB
Specifies to perform a subtraction operation.
@ DIV
Specifies to perform a division operation.
@ MUL
Specifies to perform a multiplication operation.
@ ADD
Specifies to perform an addition operation.
AGGREGATIONS
Specifies different aggregation operations.
Definition: CudaDnn.cs:682
MEMTEST_TYPE
Specifies the memory test to perform.
Definition: CudaDnn.cs:499
@ MOV_INV_8
Specifies the mov-inv-8 test.
DEVINIT
Specifies the initialization flags used when initializing CUDA.
Definition: CudaDnn.cs:207
@ CURAND
Initialize cuRand. This should be initialized because cuRand is used for most of the random operations.
@ SETSEED
Set the cuRand random number generator seed - typically only used when testing to ensure that random ...
@ CUBLAS
Initialize cuBlas. This should be initialized because cuBlas is used for many of the math operations.
SSD_CONF_LOSS_TYPE
Defines the confidence loss types used during SSD cuda training.
Definition: CudaDnn.cs:608
@ SOFTMAX
Specifies to use softmax.
@ LOGISTIC
Specifies to use logistic.
CONV_BWD_FILTER_ALGO
Specifies the cuDnn convolution backward filter algorithm to use.
Definition: CudaDnn.cs:305
@ ALGO_3
Specifies to use algorithm 0 with a workspace - which is non-deterministic.
@ ALGO_1
Specifies to use algorithm 1.
@ ALGO_0
Specifies to use algorithm 0 - which is non-deterministic.
RNN_DATALAYOUT
Specifies the RNN data layout of the data input.
Definition: CudaDnn.cs:424
@ RNN_BATCH_MAJOR_UNPACKED
Specifies ordering with batch major ordering, padded, outer stride from one batch to the next.
@ RNN_SEQ_MAJOR_PACKED
Specifies ordering with sequence major ordering, and sequence length sorted and packed.
@ RNN_SEQ_MAJOR_UNPACKED
Specifies ordering with sequence major ordering, and padded outer stride from one time-step to the ne...
DistanceMethod
Specifies the distance method used when calculating batch distances.
Definition: CudaDnn.cs:159
@ HAMMING
Specifies to calculate the hamming distance.
@ EUCLIDEAN
Specifies to calculate the euclidean distance.
MEAN_ERROR
Defines the type of Mean Error to use.
Definition: CudaDnn.cs:37
@ MSE
Mean Squared Error (MSE) $L(y, \hat{y}) = \frac{1}{N} \sum_{i=1}^{N} (y_i - \hat{y}_i)^2$, where $\hat{y}$ is the predicted value.
SSD_MATCH_TYPE
Defines the matching method used during SSD cuda training.
Definition: CudaDnn.cs:566
@ BIPARTITE
Specifies to use Bi-Partite.
@ PER_PREDICTION
Specifies to use per-prediction matching.
MATH_FUNCTION
Defines the mathematical function to run.
Definition: CudaDnn.cs:52
@ TANH
Specifies to run the tanh function.
@ ASINH
Specifies to run the asinh function.
@ NOP
Specifies to run a no operation.
@ ACOS
Specifies to run the acos function.
@ SQRT
Specifies to run the sqrt function.
@ ACOSH
Specifies to run the acosh function.
@ FLOOR
Specifies to run the floor function.
@ SIN
Specifies to run the sin function.
@ CEIL
Specifies to run the ceil function.
@ NEG
Specifies to flip the sign of the inputs.
@ SIGN
Specifies to run the sign function.
@ TAN
Specifies to run the tan function.
@ ATANH
Specifies to run the atanh function.
@ ASIN
Specifies to run the asin function.
@ SINH
Specifies to run the sinh function.
@ ATAN
Specifies to run the atan function.
@ COSH
Specifies to run the cosh function.
@ COS
Specifies to run the cos function.
PoolingMethod
Specifies the pooling method used by the cuDnn function SetPoolingDesc.
Definition: CudaDnn.cs:177
DataType
Specifies the base datatype corresponding to the template type 'T'. Currently, only
Definition: CudaDnn.cs:192
@ FLOAT
Specifies the single type.
@ DOUBLE
Specifies the double type.
DIR
Defines the direction of data flow.
Definition: CudaDnn.cs:22
@ FWD
Specifies data is moving forward.
@ BWD
Specifies data is moving backward.
NCCL_REDUCTION_OP
Specifies the reduction operation to use with 'Nickel' NCCL.
Definition: CudaDnn.cs:513
@ PROD
Multiply the values.
@ MIN
Return the minimum value.
SSD_CODE_TYPE
Defines the encode/decode type used during SSD cuda training.
Definition: CudaDnn.cs:585
@ CENTER_SIZE
Encode the center size.
@ CORNER
Encode the corner.
@ CORNER_SIZE
Encode the corner size.
CONV_FWD_ALGO
Specifies the cuDnn convolution forward algorithm to use.
Definition: CudaDnn.cs:259
@ ALGO_FFT_TILING
Specifies to use the fft tiling algorithm.
@ ALGO_FFT
Specifies to use the fft algorithm.
@ ALGO_GEMM
Specifies to use the gemm algorithm.
@ IMPLICIT_PRECOMP_GEMM
Specifies to use the implicit pre-computation gemm algorithm.
@ ALGO_DIRECT
Specifies to use the direct algorithm.
@ ALGO_WINOGRAD_NONFUSED
Specifies to use the non-fused winograd algorithm.
@ IMPLICIT_GEMM
Specifies to use the implicit gemm algorithm.
@ ALGO_WINOGRAD
Specifies to use the winograd algorithm.
RNN_MODE
Specifies the RNN mode to use with the Recurrent Layer when using the cuDNN engine.
Definition: CudaDnn.cs:376
@ RNN_TANH
Specifies to use a single TanH gate Recurrent Learning unit.
@ GRU
Specifies to use the GRU RNN where $h' = \tanh(r \cdot U h_{t-1} + W x)$ and $h = (1 - z) \cdot h' + z \cdot h_{t-1}$.
@ RNN_RELU
Specifies to use a single RelU gate Recurrent Learning unit.
@ LSTM
Specifies to use a 4 gate LSTM Recurrent Learning unit.
BATCHNORM_MODE
Specifies the cuDnn batch norm mode to use.
Definition: CudaDnn.cs:237
@ PER_ACTIVATION
Specifies to use the per-activation batch normalization mode.
@ SPATIAL
Specifies to use the spatial batch normalization mode.
@ SPATIAL_PERSISTENT
Specifies to use the spatial persistent batch normalization mode.
ORIENTATION
Specifies the orientation of a matrix.
Definition: CudaDnn.cs:645
@ ROW
Specifies to add the vector to each row.
@ COL
Specifies to add the vector to each column.
RNN_BIAS_MODE
Specifies the RNN bias mode to use with the Recurrent Layer when using the cuDNN engine.
Definition: CudaDnn.cs:401
@ RNN_DOUBLE_BIAS
Specifies to use two biases, one in the input Gemm and one in the recurrent Gemm of the rnn cell (default).
@ RNN_NO_BIAS
Specifies to use no bias in the RNN cells.
@ RNN_SINGLE_INP_BIAS
Specifies to use one bias in the input Gemm of the rnn cell.
@ RNN_SINGLE_REC_BIAS
Specifies to use one recurrent bias in the recurrent Gemm of the rnn cell.
SSD_LOC_LOSS_TYPE
Defines the location loss types used during SSD cuda training.
Definition: CudaDnn.cs:627
@ SMOOTH_L1
Specifies to use smooth L1 loss.
@ L2
Specifies to use L2 loss.
DEVPROP
Specifies certain device properties to query from Cuda.
Definition: CudaDnn.cs:477
@ MULTIGPUBOARDGROUPID
Query a GPU board group ID.
@ DEVICECOUNT
Query the number of devices (GPUs) installed.
@ NAME
Query the name of a given GPU.
RNN_DIRECTION
Specifies the RNN direction used.
Definition: CudaDnn.cs:443
@ RNN_UNIDIRECTIONAL
Specifies a single direction RNN (default)
@ RNN_BIDIRECTIONAL
Specifies a bi-directional RNN where the output is concatenated at each layer.
TRANSPOSE_OPERATION
Specifies the type of operation to perform along with a matrix transposition.
Definition: CudaDnn.cs:663
SOFTMAX_MODE
Specifies the SOFTMAX mode to use.
Definition: CudaDnn.cs:724
@ INSTANCE
Specifies to run the softmax separately for each N, across CHW dimensions.
@ CHANNEL
Specifies to run the softmax separately for each N*C, across HW dimensions.
SSD_MINING_TYPE
Defines the mining type used during SSD cuda training.
Definition: CudaDnn.cs:540
@ MAX_NEGATIVE
Select negatives based on the score.
@ HARD_EXAMPLE
Select hard examples based on Shrivastava et al.'s method.
SOFTMAX_ALGORITHM
Specifies the SOFTMAX algorithm to use.
Definition: CudaDnn.cs:701
@ ACCURATE
Specifies to use the accurate algorithm.
@ LOG
Specifies to use the log algorithm.
@ FAST
Specifies to use the fast algorithm.
POOLING_METHOD
Specifies the pooling method to use when using the Caffe pooling (instead of the pooling from NVIDIA'...
Definition: CudaDnn.cs:353
@ STO_TRAIN
Select the stochastic value in the kernel - used during a training pass.
@ STO_TEST
Select the stochastic value in the kernel - used during a testing pass.
RNN_FILLER_TYPE
Defines the filler types used to fill the RNN8 weights.
Definition: CudaDnn.cs:458
@ RNN_GAUSSIAN_FILLER
Specifies to fill with a gaussian distribution.
@ RNN_XAVIER_FILLER
Specifies to fill with a uniform distribution.
@ RNN_CONSTANT_FILLER
Specifies to fill with a constant value.
CONV_BWD_DATA_ALGO
Specifies the cuDnn convolution backward data algorithm to use.
Definition: CudaDnn.cs:331
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...
Definition: Annotation.cs:12