MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
CudaDnn.cs
1using System;
2using System.Collections.Generic;
3using System.Linq;
4using System.Text;
5using System.Collections;
6using System.Diagnostics;
7using System.Threading;
8using System.IO;
9using MyCaffe.basecode;
11using System.Runtime.Remoting.Channels;
12using System.Xml.Linq;
13using System.Security.Cryptography.X509Certificates;
15
16namespace MyCaffe.common
17{
    /// <summary>
    /// Defines the direction of data flow (forward or backward pass).
    /// </summary>
    public enum DIR
    {
        /// <summary>Run in the forward direction.</summary>
        FWD = 0,
        /// <summary>Run in the backward direction.</summary>
        BWD = 1
    }

    /// <summary>
    /// Defines the mean-error calculation method.
    /// </summary>
    public enum MEAN_ERROR
    {
        /// <summary>Mean squared error.</summary>
        MSE = 1,
        /// <summary>Mean absolute error.</summary>
        MAE = 2
    }

    /// <summary>
    /// Defines the simple element-wise math functions supported.
    /// </summary>
    public enum MATH_FUNCTION
    {
        /// <summary>No operation.</summary>
        NOP = 0,

        /// <summary>Arc cosine.</summary>
        ACOS = 1,
        /// <summary>Inverse hyperbolic cosine.</summary>
        ACOSH = 2,
        /// <summary>Cosine.</summary>
        COS = 3,
        /// <summary>Hyperbolic cosine.</summary>
        COSH = 4,

        /// <summary>Arc sine.</summary>
        ASIN = 10,
        /// <summary>Inverse hyperbolic sine.</summary>
        ASINH = 11,
        /// <summary>Sine.</summary>
        SIN = 12,
        /// <summary>Hyperbolic sine.</summary>
        SINH = 13,

        /// <summary>Arc tangent.</summary>
        ATAN = 20,
        /// <summary>Inverse hyperbolic tangent.</summary>
        ATANH = 21,
        /// <summary>Tangent.</summary>
        TAN = 22,
        /// <summary>Hyperbolic tangent.</summary>
        TANH = 23,

        /// <summary>Ceiling (round up).</summary>
        CEIL = 30,
        /// <summary>Floor (round down).</summary>
        FLOOR = 31,
        /// <summary>Negation.</summary>
        NEG = 32,
        /// <summary>Sign (-1, 0, +1).</summary>
        SIGN = 33,
        /// <summary>Square root.</summary>
        SQRT = 34
    }

    /// <summary>
    /// Defines the element-wise arithmetic operation to perform.
    /// </summary>
    public enum OP
    {
        /// <summary>Multiplication.</summary>
        MUL = 1,
        /// <summary>Division.</summary>
        DIV = 2,
        /// <summary>Addition.</summary>
        ADD = 3,
        /// <summary>Subtraction.</summary>
        SUB = 4
    }


    /// <summary>
    /// Defines the distance method used when comparing vectors.
    /// </summary>
    public enum DistanceMethod
    {
        /// <summary>Hamming distance.</summary>
        HAMMING = 0,
        /// <summary>Euclidean distance.</summary>
        EUCLIDEAN = 1
    }

    /// <summary>
    /// Defines the pooling method used by pooling descriptors (see SetPoolingDesc).
    /// </summary>
    public enum PoolingMethod
    {
        /// <summary>Max pooling.</summary>
        MAX = 0,
        /// <summary>Average pooling.</summary>
        AVE = 1
    }

    /// <summary>
    /// Defines the base numeric type used by a CudaDnn instance (selected from the
    /// generic type T in the CudaDnn&lt;T&gt; constructor).
    /// </summary>
    public enum DataType
    {
        /// <summary>64-bit double precision.</summary>
        DOUBLE,
        /// <summary>32-bit single precision.</summary>
        FLOAT
    }

    /// <summary>
    /// Defines device-initialization flags passed to SetDeviceID / the CudaDnn constructor.
    /// NOTE(review): used as a bit-flag set (values are powers of two) but the [Flags]
    /// attribute is not present — confirm before adding, as it changes ToString output.
    /// </summary>
    public enum DEVINIT
    {
        /// <summary>No initialization.</summary>
        NONE = 0x0000,

        /// <summary>Initialize cuBLAS.</summary>
        CUBLAS = 0x0001,

        /// <summary>Initialize cuRAND.</summary>
        CURAND = 0x0002,

        /// <summary>Set the random seed during initialization.</summary>
        SETSEED = 0x0004
    }

    /// <summary>
    /// Defines the cuDNN batch-normalization mode (see DeriveBatchNormDesc).
    /// </summary>
    public enum BATCHNORM_MODE
    {
        /// <summary>Normalization per activation.</summary>
        PER_ACTIVATION = 0,
        /// <summary>Spatial normalization (per channel).</summary>
        SPATIAL = 1,
    }
251
    /// <summary>
    /// Specifies the cuDNN convolution forward algorithm to use.
    /// NOTE(review): value 1 (implicit pre-computed GEMM in cuDNN) is absent from this
    /// extraction — confirm against the original source.
    /// </summary>
    public enum CONV_FWD_ALGO
    {
        /// <summary>No algorithm specified.</summary>
        NONE = -1,
        /// <summary>Implicit GEMM algorithm.</summary>
        IMPLICIT_GEMM = 0,
        /// <summary>Explicit GEMM algorithm.</summary>
        ALGO_GEMM = 2,
        /// <summary>Direct convolution algorithm.</summary>
        ALGO_DIRECT = 3,
        /// <summary>FFT based algorithm.</summary>
        ALGO_FFT = 4,
        /// <summary>FFT with tiling algorithm.</summary>
        ALGO_FFT_TILING = 5,
        /// <summary>Winograd algorithm.</summary>
        ALGO_WINOGRAD = 6,
    }
297
305 {
309 ALGO_0 = 0,
313 ALGO_1 = 1,
317 ALGO_FFT = 2,
321 ALGO_3 = 3
322 }
323
331 {
335 ALGO_0 = 0,
339 ALGO_1 = 1,
343 ALGO_FFT = 2
344 }
345
    /// <summary>
    /// Specifies the pooling method used by the low-level pooling kernels.
    /// </summary>
    public enum POOLING_METHOD
    {
        /// <summary>Max pooling.</summary>
        MAX = 0,
        /// <summary>Average pooling.</summary>
        AVE = 1,
        /// <summary>Stochastic pooling, training phase.</summary>
        STO_TRAIN = 2,
        /// <summary>Stochastic pooling, testing phase.</summary>
        STO_TEST = 3
    }

    /// <summary>
    /// Specifies the cuDNN RNN cell mode (see SetRnnDesc).
    /// </summary>
    public enum RNN_MODE
    {
        /// <summary>Plain RNN with ReLU activation.</summary>
        RNN_RELU = 0,
        /// <summary>Plain RNN with tanh activation.</summary>
        RNN_TANH = 1,
        /// <summary>Long short-term memory cell.</summary>
        LSTM = 2,
        /// <summary>Gated recurrent unit cell.</summary>
        GRU = 3
    }

    /// <summary>
    /// Specifies the cuDNN RNN bias mode.
    /// NOTE(review): values 1 and 3 (single input / single recurrent bias in cuDNN) are
    /// absent from this extraction — confirm against the original source.
    /// </summary>
    public enum RNN_BIAS_MODE
    {
        /// <summary>No bias.</summary>
        RNN_NO_BIAS = 0,
        /// <summary>Double bias (input and recurrent).</summary>
        RNN_DOUBLE_BIAS = 2,
    }

    /// <summary>
    /// Specifies the RNN data layout (used by SetRnnDataDesc).
    /// NOTE(review): the member list was lost in this extraction — restore from the
    /// original source before relying on this declaration.
    /// </summary>
    public enum RNN_DATALAYOUT
    {
    }

    /// <summary>
    /// Specifies the RNN direction (used by SetRnnDesc, which defaults to
    /// RNN_DIRECTION.RNN_UNIDIRECTIONAL).
    /// NOTE(review): the member list was lost in this extraction — restore from the
    /// original source before relying on this declaration.
    /// </summary>
    public enum RNN_DIRECTION
    {
    }

    /// <summary>
    /// Specifies the RNN weight filler type.
    /// NOTE(review): the member list was lost in this extraction — restore from the
    /// original source before relying on this declaration.
    /// </summary>
    public enum RNN_FILLER_TYPE
    {
    }

    /// <summary>
    /// Specifies a queryable device property.
    /// NOTE(review): the extraction gap after NAME suggests at least one additional
    /// member was lost — confirm against the original source.
    /// </summary>
    public enum DEVPROP
    {
        /// <summary>Query the number of devices.</summary>
        DEVICECOUNT = 1,
        /// <summary>Query the device name.</summary>
        NAME = 2,
    }

    /// <summary>
    /// Specifies the memory test to run (see CREATE_MEMTEST / RUN_MEMTEST).
    /// </summary>
    public enum MEMTEST_TYPE
    {
        /// <summary>Moving-inversions 8-bit test.</summary>
        MOV_INV_8 = 1
    }

    /// <summary>
    /// NOTE(review): the 'public enum' declaration line was lost in this extraction.
    /// The member set (SUM/PROD/MAX/MIN) matches an NCCL reduction operation —
    /// confirm the enum name against the original source.
    /// </summary>
    {
        /// <summary>Sum reduction.</summary>
        SUM = 0,
        /// <summary>Product reduction.</summary>
        PROD = 1,
        /// <summary>Max reduction.</summary>
        MAX = 2,
        /// <summary>Min reduction.</summary>
        MIN = 3
    }
531
    /// <summary>
    /// Specifies the SSD hard-example mining type.
    /// </summary>
    public enum SSD_MINING_TYPE
    {
        /// <summary>No mining.</summary>
        NONE = 0,
        /// <summary>Mine the maximum negatives.</summary>
        MAX_NEGATIVE = 1,
        /// <summary>Mine hard examples.</summary>
        HARD_EXAMPLE = 2
    }

    /// <summary>
    /// Specifies the SSD matching type.
    /// NOTE(review): the extraction gap after BIPARTITE suggests a second member
    /// (per-prediction matching) was lost — confirm against the original source.
    /// </summary>
    public enum SSD_MATCH_TYPE
    {
        /// <summary>Bipartite matching.</summary>
        BIPARTITE,
    }

    /// <summary>
    /// Specifies the SSD bounding-box encoding type.
    /// </summary>
    public enum SSD_CODE_TYPE
    {
        /// <summary>Encode as corners.</summary>
        CORNER = 1,
        /// <summary>Encode as center plus size.</summary>
        CENTER_SIZE = 2,
        /// <summary>Encode as corner plus size.</summary>
        CORNER_SIZE = 3
    }

    /// <summary>
    /// NOTE(review): the 'public enum' declaration line was lost in this extraction;
    /// given the single SOFTMAX member this appears to be the SSD confidence-loss type
    /// with its second member also lost — confirm against the original source.
    /// </summary>
    {
        /// <summary>Softmax confidence loss.</summary>
        SOFTMAX,
    }

    /// <summary>
    /// NOTE(review): the 'public enum' declaration line was lost in this extraction;
    /// given the single L2 member this appears to be the SSD localization-loss type
    /// with its second member also lost — confirm against the original source.
    /// </summary>
    {
        /// <summary>L2 localization loss.</summary>
        L2,
    }

    /// <summary>
    /// Specifies a matrix orientation (column-wise or row-wise).
    /// </summary>
    public enum ORIENTATION
    {
        /// <summary>Operate on columns.</summary>
        COL = 0,
        /// <summary>Operate on rows.</summary>
        ROW = 1
    }

    /// <summary>
    /// NOTE(review): the 'public enum' declaration line was lost in this extraction.
    /// The member set (ADD/MUL/DIV) suggests the matrix transpose-operation enum used
    /// with CUDA_MTX_TRANSPOSE_OPERATION — confirm the name against the original source.
    /// </summary>
    {
        /// <summary>Add the values.</summary>
        ADD = 0,
        /// <summary>Multiply the values.</summary>
        MUL = 1,
        /// <summary>Divide the values.</summary>
        DIV = 2
    }

    /// <summary>
    /// Specifies an aggregation operation.
    /// </summary>
    public enum AGGREGATIONS
    {
        /// <summary>Sum the values.</summary>
        SUM = 0,
        /// <summary>Take the maximum.</summary>
        MAX = 1,
        /// <summary>Take the minimum.</summary>
        MIN = 2
    }

    /// <summary>
    /// NOTE(review): the 'public enum' declaration line was lost in this extraction.
    /// The member set suggests the cuDNN softmax algorithm enum; note DEFAULT and
    /// ACCURATE intentionally share value 1 — confirm the name against the original source.
    /// </summary>
    {
        /// <summary>Default algorithm (same value as ACCURATE).</summary>
        DEFAULT = 1,
        /// <summary>Fast algorithm.</summary>
        FAST = 0,
        /// <summary>Accurate algorithm (scaled to avoid overflow).</summary>
        ACCURATE = 1,
        /// <summary>Log softmax.</summary>
        LOG = 2
    }

    /// <summary>
    /// Specifies the cuDNN softmax mode.
    /// </summary>
    public enum SOFTMAX_MODE
    {
        /// <summary>Softmax computed per instance (across C*H*W).</summary>
        INSTANCE,
        /// <summary>Softmax computed per channel (across H*W).</summary>
        CHANNEL
    }
734
735#pragma warning disable 1591
736
    /// <summary>
    /// Defines the device-management portion of the CudaDnn API: selecting, querying,
    /// seeding, resetting and synchronizing a CUDA device.
    /// </summary>
    public interface ICudaDevice
    {
        void SetDeviceID(int nDeviceID, DEVINIT flags = DEVINIT.NONE, long? lSeed = null);
        void SetRandomSeed(long lSeed);
        int GetDeviceCount();
        int GetDeviceID();
        void ResetDevice();
        void SynchronizeDevice();
        string GetDeviceName(int nDeviceID);
        string GetDeviceP2PInfo(int nDeviceID);
        string GetRequiredCompute(out int nMinMajor, out int nMinMinor);

    }
756
    /// <summary>
    /// Defines the memory-management portion of the CudaDnn API. All GPU memory is
    /// referenced by opaque long handles returned by the Alloc* methods and released
    /// with the matching Free* methods.
    /// </summary>
    public interface ICudaMemory
    {
        // Device memory.
        long AllocMemory(long lCount, bool bHalf = false);
        long AllocMemory(List<double> rg);
        long AllocMemory(List<float> rg);
        long AllocMemory(double[] rgSrc, long hStream = 0);
        long AllocMemory(float[] rgSrc, long hStream = 0);
        void FreeMemory(long hMem);
        double[] GetMemoryDouble(long hMem, long lCount = -1);
        float[] GetMemoryFloat(long hMem, long lCount = -1);
        void SetMemory(long hMem, List<double> rg);
        void SetMemory(long hMem, List<float> rg);
        void SetMemory(long hMem, double[] rgSrc, long hStream = 0);
        void SetMemory(long hMem, float[] rgSrc, long hStream = 0);
        void SetMemoryAt(long hMem, double[] rgSrc, int nOffset);
        void SetMemoryAt(long hMem, float[] rgSrc, int nOffset);
        // Host (CPU-side) buffers.
        long AllocHostBuffer(long lCount);
        void FreeHostBuffer(long hMem);
        double[] GetHostMemoryDouble(long hMem);
        float[] GetHostMemoryFloat(long hMem);
        // Pointers into an existing allocation (offset views; free with FreeMemoryPointer).
        long CreateMemoryPointer(long hData, long lOffset, long lCount);
        void FreeMemoryPointer(long hMem);
    }
786
    /// <summary>
    /// Defines the cuDNN portion of the CudaDnn API: streams, cuDNN contexts, and the
    /// tensor / filter / convolution / pooling / LRN / RNN descriptors used with them.
    /// All objects are referenced by opaque long handles.
    /// </summary>
    public interface ICudaCuDnn
    {
        // Streams.
        long CreateStream(bool bNonBlocking = false, int nIndex = -1);
        void FreeStream(long h);
        void SynchronizeStream(long h = 0);
        void SynchronizeThread();

        // cuDNN contexts.
        long CreateCuDNN(long hStream = 0);
        void FreeCuDNN(long h);

        // Tensor descriptors.
        long CreateTensorDesc();
        void FreeTensorDesc(long h);
        void SetTensorNdDesc(long hHandle, int[] rgDim, int[] rgStride, bool bHalf = false);
        void SetTensorDesc(long hHandle, int n, int c, int h, int w, bool bHalf = false);
        void SetTensorDesc(long hHandle, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride, bool bHalf = false);
        void AddTensor(long hHandle, long hSrcDesc, long hSrc, int nSrcOffset, long hDstDesc, long hDst, int nDstOffset);

        void DeriveBatchNormDesc(long hFwdScaleBiasMeanVarDesc, long hFwdBottomDesc, long hBwdScaleBiasMeanVarDesc, long hBwdBottomDesc, BATCHNORM_MODE mode);

        // Filter descriptors.
        long CreateFilterDesc();
        void FreeFilterDesc(long h);
        void SetFilterNdDesc(long hHandle, int[] rgDim, bool bHalf = false);
        void SetFilterDesc(long hHandle, int n, int c, int h, int w, bool bHalf = false);

        // Convolution descriptors.
        long CreateConvolutionDesc();
        void FreeConvolutionDesc(long h);
        void SetConvolutionDesc(long hHandle, int hPad, int wPad, int hStride, int wStride, int hDilation, int wDilation, bool bUseTensorCores, bool bHalf = false);

        // Pooling descriptors.
        long CreatePoolingDesc();
        void FreePoolingDesc(long h);
        void SetPoolingDesc(long hHandle, PoolingMethod method, int h, int w, int hPad, int wPad, int hStride, int wStride);

        // LRN descriptors.
        long CreateLRNDesc();
        void FreeLRNDesc(long h);
        void SetLRNDesc(long hHandle, uint nSize, double fAlpha, double fBeta, double fK);

        // RNN data descriptors.
        long CreateRnnDataDesc();
        void FreeRnnDataDesc(long h);
        void SetRnnDataDesc(long hRnnDataDesc, RNN_DATALAYOUT layout, int nMaxSeqLen, int nBatchSize, int nVectorSize, bool bBidirectional = false, int[] rgSeqLen = null);

        // RNN descriptors and execution.
        long CreateRnnDesc();
        void FreeRnnDesc(long h);
        void SetRnnDesc(long hHandle, long hRnnDesc, int nHiddenSize, int nNumLayers, long hDropoutDesc, RNN_MODE mode, bool bUseTensorCores, RNN_DIRECTION direction = RNN_DIRECTION.RNN_UNIDIRECTIONAL);
        int GetRnnParamCount(long hHandle, long hRnnDesc, long hXDesc);
        ulong GetRnnWorkspaceCount(long hHandle, long hRnnDesc, long hXDesc, out ulong nReservedCount);
        void GetRnnLinLayerParams(long hHandle, long hRnnDesc, int nLayer, long hXDesc, long hWtDesc, long hWtData, int nLinLayer, out int nWtCount, out long hWt, out int nBiasCount, out long hBias);
        void RnnForward(long hHandle, long hRnnDesc, long hXDesc, long hXData, long hHxDesc, long hHxData, long hCxDesc, long hCxData, long hWtDesc, long hWtData, long hYDesc, long hYData, long hHyDesc, long hHyData, long hCyDesc, long hCyData, long hWorkspace, ulong nWsCount, long hReserved, ulong hResCount, bool bTraining);
        void RnnBackwardData(long hHandle, long hRnnDesc, long hYDesc, long hYData, long hYDiff, long hHyDesc, long hHyDiff, long hCyDesc, long hCyDiff, long hWtDesc, long hWtData, long hHxDesc, long hHxData, long hCxDesc, long hCxData, long hXDesc, long hXDiff, long hdHxDesc, long hHxDiff, long hdCxDesc, long hCxDiff, long hWorkspace, ulong nWsCount, long hReserved, ulong nResCount);
        void RnnBackwardWeights(long hHandle, long hRnnDesc, long hXDesc, long hXData, long hHxDesc, long hHxData, long hYDesc, long hYData, long hWorkspace, ulong nWsCount, long hWtDesc, long hWtDiff, long hReserved, ulong nResCount);
    }
843
    /// <summary>
    /// Defines the math portion of the CudaDnn API: BLAS-style primitives, element-wise
    /// operations, channel operations and im2col/col2im transforms, all operating on
    /// long GPU-memory handles.
    /// </summary>
    public interface ICudaMath
    {
        // Set / get / copy.
        void set(int nCount, long hHandle, double fVal, int nIdx = -1);
        void set(int nCount, long hHandle, float fVal, int nIdx = -1);
        double[] get_double(int nCount, long hHandle, int nIdx = -1);
        float[] get_float(int nCount, long hHandle, int nIdx = -1);
        void copy(int nCount, long hSrc, long hDst, int nSrcOffset = 0, int nDstOffset = 0, long hAsyncStream = -1, bool? bSrcHalfOverride = null, bool? bDstHalfOverride = null);
        void copy(int nCount, int nNum, int nDim, long hSrc1, long hSrc2, long hDst, long hSimilar, bool bInvert = false);
        // NOTE(review): 'hDs' looks like a typo for 'hDst'; renaming would break
        // named-argument callers, so it is left as-is — confirm with the maintainers.
        void copy_expand(int n, int nNum, int nDim, long hSrc, long hDs);
        void fill(int n, int nDim, long hSrc, int nSrcOff, int nCount, long hDst);
        void sort(int nCount, long hY);

        // Channel operations (operate per channel over nOuterNum x nChannels x nInnerNum).
        void channel_compare(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY);
        void channel_fill(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, int nLabelDim, long hLabels, long hY);
        void channel_fillfrom(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, DIR dir);
        void channel_scale(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hA, long hY);
        void channel_mulv(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hA, long hX, long hC);
        void channel_sum(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, bool bSumAcrossChannels = true, DIR dir = DIR.FWD, int nChanalesY = -1);
        void channel_mean(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY);
        void channel_copy(int nCount, int nOuterNum, int nChannels, int nBlocks, int nInnerNum, int nOffset, long hX, long hY, DIR dir);
        void channel_copyall(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY);
        void channel_duplicate(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY);
        void channel_percentile(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, double dfPercentile);
        void channel_op_fwd(OP op, int nCount, int nC, int nN1, int nSD1, int nN2, int nSD2, long hA, long hB, long hY);
        void channel_op_bwd(OP op, int nCount, int nC, int nN1, int nSD1, int nN2, int nSD2, int nCy, int nSDy, long hA, long hB, long hY, long hAd, long hBd, long hYd, long hWork);

        // BLAS level-2/3 style primitives.
        void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC);
        void gemm(bool bTransA, bool bTransB, int m, int n, int k, float fAlpha, long hA, long hB, float fBeta, long hC);
        void gemv(bool bTransA, int m, int n, double fAlpha, long hA, long hX, double fBeta, long hY);
        void gemv(bool bTransA, int m, int n, float fAlpha, long hA, long hX, float fBeta, long hY);
        void geam(bool bTransA, bool bTransB, int m, int n, double fAlpha, long hA, long hB, double fBeta, long hC);
        void geam(bool bTransA, bool bTransB, int m, int n, float fAlpha, long hA, long hB, float fBeta, long hC);

        void ger(int m, int n, double fAlpha, long hX, long hY, long hA);
        void ger(int m, int n, float fAlpha, long hX, long hY, long hA);
        void axpy(int n, double fAlpha, long hX, long hY);
        void axpy(int n, float fAlpha, long hX, long hY);
        void axpby(int n, double fAlpha, long hX, double fBeta, long hY);
        void axpby(int n, float fAlpha, long hX, float fBeta, long hY);
        void scal(int n, double fAlpha, long hX, int nXOff = 0);
        void scal(int n, float fAlpha, long hX, int nXOff = 0);
        double dot_double(int n, long hX, long hY);
        float dot_float(int n, long hX, long hY);
        double asum_double(int n, long hX, int nXOff = 0);
        float asum_float(int n, long hX, int nXOff = 0);
        void scale(int n, double fAlpha, long hX, long hY);
        void scale(int n, float fAlpha, long hX, long hY);
        // Element-wise operations.
        void add_scalar(int n, double fAlpha, long hY);
        void add_scalar(int n, float fAlpha, long hY);
        void add(int n, long hA, long hB, long hY);
        void add(int n, long hA, long hB, long hY, double dfAlpha);
        void add(int n, long hA, long hB, long hY, float fAlpha);
        void sub(int n, long hA, long hB, long hY, int nAOff = 0, int nBOff = 0, int nYOff = 0, int nB = 0);
        void mul(int n, long hA, long hB, long hY, int nAOff = 0, int nBOff = 0, int nYOff = 0);
        void mul_scalar(int n, double fAlpha, long hY);
        void mul_scalar(int n, float fAlpha, long hY);
        void div(int n, long hA, long hB, long hY);
        void abs(int n, long hA, long hY);
        void exp(int n, long hA, long hY);
        void log(int n, long hA, long hY);
        void powx(int n, long hA, double fAlpha, long hY, int nAOff = 0, int nYOff = 0);
        void powx(int n, long hA, float fAlpha, long hY, int nAOff = 0, int nYOff = 0);
        void sign(int n, long hX, long hY, int nXOff = 0, int nYOff = 0);
        // Reductions.
        double min(int n, long hA, out long lPos, int nAOff = 0, long hWork = 0);
        double max(int n, long hA, out long lPos, int nAOff = 0, long hWork = 0);
        double sumsq(int n, long hW, long hA, int nAOff = 0);
        double sumsqdiff(int n, long hW, long hA, long hB, int nAOff = 0, int nBOff = 0);
        void sqrt(int n, long hA, long hY);
        void sqrt_scale(int n, long hA, long hY);

        // Masking.
        void mask(int n, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY);
        void mask(int n, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY);
        void mask_batch(int n, int nBatch, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY);
        void mask_batch(int n, int nBatch, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY);

        // im2col / col2im transforms used by convolution.
        void im2col(long hDataIm, int nDataImOffset, int nChannels, int nHeight, int nWidth, int nKernelH, int nKernelW, int nPadH, int nPadW, int nStrideH, int nStrideW, int nDilationH, int nDilationW, long hDataCol, int nDataColOffset);
        void im2col_nd(long hDataIm, int nDataImOffset, int nNumSpatialAxes, int nColCount, int nChannelAxis, long hImShape, long hColShape, long hKernelShape, long hPad, long hStride, long hDilation, long hDataCol, int nDataColOffset);
        void col2im(long hDataCol, int nDataColOffset, int nChannels, int nHeight, int nWidth, int nKernelH, int nKernelW, int nPadH, int nPadW, int nStrideH, int nStrideW, int nDilationH, int nDilationW, long hDataIm, int nDataImOffset);
        void col2im_nd(long hDataCol, int nDataColOffset, int nNumSpatialAxes, int nColCount, int nChannelAxis, long hImShape, long hColShape, long hKernelShape, long hPad, long hStride, long hDilation, long hDataIm, int nDataImOffset);
    }
930
    /// <summary>
    /// Defines the random-number portion of the CudaDnn API, filling GPU memory with
    /// values drawn from uniform, Gaussian or Bernoulli distributions.
    /// </summary>
    public interface ICudaRandom
    {
        void rng_setseed(long lSeed);
        void rng_uniform(int n, double fMin, double fMax, long hY);
        void rng_uniform(int n, float fMin, float fMax, long hY);
        void rng_gaussian(int n, double fMu, double fSigma, long hY);
        void rng_gaussian(int n, float fMu, float fSigma, long hY);
        void rng_bernoulli(int n, double fNonZeroProb, long hY);
        void rng_bernoulli(int n, float fNonZeroProb, long hY);
    }
947
    /// <summary>
    /// Aggregates the full CudaDnn API: device management, memory, cuDNN, math and
    /// random-number generation.
    /// </summary>
    public interface ICudaDnn : ICudaDevice, ICudaMemory, ICudaCuDnn, ICudaMath, ICudaRandom
    {
    }
957
958#pragma warning restore 1591
959
960
968 public class CudaDnn<T> : ICudaDnn, IDisposable
969 {
        // Marshaling parameter helper used to pack arguments for kernel calls.
        Params m_param = new Params();
        // Tracks GPU memory allocations when memory tracing is enabled.
        CudaDnnMemoryTracker<T> m_memTracker;
        // The CUDA device this instance is bound to.
        int m_nDeviceId;
        // Path to the CudaDnnDll used by this instance.
        string m_strPath = "";
        // Monotonically-increasing seed for per-instance indexes (see get_index()).
        static int s_nIdxSeed = 0;
        // Default CudaDnnDll path shared by all instances.
        static string s_strCudaPath = "";
        // COM interface into the low-level CUDA kernel DLL.
        CudaControlLib.ICudaKernel m_cuda;
        // Handle to the initialized kernel; 0 when not initialized (or not owned).
        long m_hKernel = 0;
        // Base numeric type (DOUBLE or FLOAT) derived from T.
        DataType m_dt;
        CryptoRandom m_random = new CryptoRandom();
        // Cached 1 and 0 constants of type T.
        T m_tOne;
        T m_tZero;
        // Per-instance index assigned at construction.
        int m_nIdx;
        // Ghost-memory support (used by the sharing constructor).
        long m_nGhostMemoryIndex = 1000;
        Dictionary<long, T[]> m_rgGhostMemory = null;
        bool m_bGhostMemoryEnabled = false;
        // True when this instance owns m_hKernel and must clean it up on Dispose.
        bool m_bOwner = true;
        object m_memSync = new object();
        bool m_bEnableRnnExtendedVersion = false;
        // Guards kernel creation/initialization across instances.
        static object m_createSync = new object();
        static object m_getconvSync = new object();
        // Size in bytes of the base type T (float or double).
        static ulong m_lBaseSize = (ulong)((typeof(T) == typeof(float)) ? sizeof(float) : sizeof(double));
992
        /// <summary>
        /// Specifies the device-query functions supported by the low-level kernel.
        /// </summary>
        public enum CUDAQRY
        {
            /// <summary>Query the device name.</summary>
            DEVICE_NAME = 1000,
            /// <summary>Query the device peer-to-peer information.</summary>
            DEVICE_P2P_INFO = 1001,
            /// <summary>Query general device information.</summary>
            DEVICE_INFO = 1002
        }
1012
1013#pragma warning disable 1591
1014
        /// <summary>
        /// Specifies the function indexes used when marshaling calls into the low-level
        /// CudaDnnDll kernel (via RunDouble/RunFloat). These values must match the
        /// function table on the native side — do not renumber.
        /// </summary>
        public enum CUDAFN
        {
            // Kernel-level functions (negative indexes).
            INITIALIZE = -2,
            CLEANUP = -3,
            KERNEL_MEMCOPY = -4,
            KERNEL_ADD = -5,
            KERNEL_COPY_NCCL = -10,

            // Device management.
            SETDEVICE = 1,
            SETRANDOMSEED = 2,
            GETDEVICE = 3,
            RESETDEVICE = 4,
            SYNCHRONIZEDEVICE = 5,
            GETDEVICEPROP = 6,
            CHECKMEMORYATTRIB = 7,
            GETDEVICEMEMORY = 8,
            GETREQUIREDCOMPUTE = 9,

            DEVICE_CANACCESSPEER = 10,
            DEVICE_ENABLEPEERACCESS = 11,
            DEVICE_DISABLEPEERACCESS = 12,

            COPY_DEVICE_TO_HOST = 14,
            COPY_HOST_TO_DEVICE = 15,

            CREATE_MEMORYPOINTER = 16,
            FREE_MEMORYPOINTER = 17,

            // Memory management.
            ALLOCMEM_HALF = 19,
            ALLOCMEM = 20,
            FREEMEM = 21,
            GETMEM = 22,
            SETMEM = 23,
            SETMEMAT = 24,

            ALLOCHOSTBUFFER = 25,
            FREEHOSTBUFFER = 26,
            GETHOSTMEM = 27,
            SETHOSTMEM = 28,
            GETHOSTBUFFERCAPACITY = 29,

            // Streams.  NOTE(review): 'SYNCRHONIZE' is a long-standing identifier typo;
            // renaming would only affect the C# side but is avoided to keep diffs clean.
            CREATE_STREAM = 30,
            FREE_STREAM = 31,
            SYNCRHONIZE_STREAM = 32,
            SYNCHRONIZE_THREAD = 33,

            CREATE_MEMTEST = 34,
            FREE_MEMTEST = 35,
            RUN_MEMTEST = 36,

            CREATE_IMAGEOP = 37,
            FREE_IMAGEOP = 38,
            DISTORTIMAGE_IMAGEOP = 39,

            // NCCL multi-GPU functions.
            CREATE_NCCL = 40,
            FREE_NCCL = 41,
            NCCL_INIT_SINGLEPROCESS = 42,
            NCCL_INIT_MULTIPROCESS = 43,
            NCCL_BROADCAST = 44,
            NCCL_ALLREDUCE = 45,

            SETPIXEL = 46,

            // cuDNN handles and descriptors.
            CREATE_CUDNN = 47,
            FREE_CUDNN = 48,

            CREATE_TENSORDESC = 50,
            FREE_TENSORDESC = 51,
            SET_TENSORDESC = 52,
            ADD_TENSOR = 53,
            SET_TENSORNDDESC = 54,

            CREATE_FILTERDESC = 60,
            FREE_FILTERDESC = 61,
            SET_FILTERDESC = 62,
            SET_FILTERNDDESC = 63,

            CREATE_EXTENSION = 67,
            FREE_EXTENSION = 68,
            EXTENSION_RUN = 69,

            CREATE_CONVDESC = 70,
            FREE_CONVDESC = 71,
            SET_CONVDESC = 72,
            GET_CONVINFO = 73,
            FWD_CONV = 74,
            BWD_CONV_BIAS = 75,
            BWD_CONV_FILTER = 76,
            BWD_CONV_DATA = 77,

            CREATE_POOLDESC = 80,
            FREE_POOLDESC = 81,
            SET_POOLDESC = 82,
            FWD_POOL = 83,
            BWD_POOL = 84,

            DERIVE_BNDESC = 86,
            FWD_BN = 87,
            BWD_BN = 88,

            CREATE_LRNDESC = 90,
            FREE_LRNDESC = 91,
            SET_LRNDESC = 92,

            GET_DROPOUT_INFO = 94,
            CREATE_DROPOUTDESC = 95,
            FREE_DROPOUTDESC = 96,
            SET_DROPOUTDESC = 97,
            FWD_DROPOUT = 98,
            BWD_DROPOUT = 99,

            // cuDNN activations.
            TANH_FWD = 100,
            TANH_BWD = 101,

            ELU_FWD = 102,
            ELU_BWD = 103,

            SIGMOID_FWD = 104,
            SIGMOID_BWD = 105,

            RELU_FWD = 108,
            RELU_BWD = 109,

            SOFTMAX_FWD = 111,
            SOFTMAX_BWD = 112,

            LRN_CC_FWD = 120,
            LRN_CC_BWD = 121,
            LCN_CC_FWD = 122,
            LCN_CC_BWD = 123,

            // DEPRECATED, use RNN8 instead.
            CREATE_RNN_DATA_DESC = 130,
            FREE_RNN_DATA_DESC = 131,
            SET_RNN_DATA_DESC = 132,

            // DEPRECATED, use RNN8 instead.
            CREATE_RNN_DATA_DESCEX = 135,
            FREE_RNN_DATA_DESCEX = 136,
            SET_RNN_DATA_DESCEX = 137,

            // DEPRECATED, use RNN8 instead.
            CREATE_RNN_DESC = 140,
            FREE_RNN_DESC = 141,
            SET_RNN_DESC = 142,
            GET_RNN_PARAMCOUNT = 143,
            GET_RNN_WORKSPACECOUNT = 144,
            GET_RNN_LINLAYERPARAMS = 145,
            FWD_RNN = 146,
            BWD_RNN_DATA = 147,
            BWD_RNN_WTS = 148,

            // RNN8 (current RNN API).
            RNN8_IS_SUPPORTED = 150,
            RNN8_CREATE = 151,
            RNN8_FREE = 152,
            RNN8_SET = 153,
            RNN8_GET_MEMORY_SIZES = 154,
            RNN8_INIT_WEIGHTS = 155,
            RNN8_FWD = 156,
            RNN8_BWD = 157,

            // Raw CUDA math functions.
            CUDA_SET = 200,
            CUDA_GET = 201,
            CUDA_COPY = 202,
            CUDA_COPY_SIM = 203,
            CUDA_COPY_FILL = 204,
            CUDA_SORT = 205,
            CUDA_COPY_BATCH = 206,
            CUDA_COPY_SEQUENCE = 207,
            CUDA_COPY_EXPAND = 208,
            CUDA_COPY_SEQUENCE2 = 209,

            CUDA_ADD3 = 217,
            CUDA_GEAM = 218,
            CUDA_GEMM2 = 219,
            CUDA_GEMM = 220,
            CUDA_GEMV = 221,
            CUDA_AXPY = 222,
            CUDA_AXPBY = 223,
            CUDA_SCAL = 224,
            CUDA_DOT = 225,
            CUDA_ASUM = 226,
            CUDA_SCALE = 227,
            CUDA_ADD_SCALAR = 228,
            CUDA_ADD = 229,
            CUDA_SUB = 230,
            CUDA_MUL = 231,
            CUDA_MUL_SCALAR = 232,
            CUDA_DIV = 233,
            CUDA_ABS = 234,
            CUDA_EXP = 235,
            CUDA_LOG = 236,
            CUDA_POWX = 237,
            CUDA_SIGN = 238,
            CUDA_SQRT = 239,
            CUDA_RECIPROCOL = 240,
            CUDA_STUDENT = 241,
            CUDA_LOGISTIC1 = 242,
            CUDA_LOGISTIC2 = 243,
            CUDA_ADD2 = 244,
            CUDA_COMPARE_SIGNS = 245,
            CUDA_MAXVAL = 246,
            CUDA_MINVAL = 247,
            CUDA_SUMSQ = 248,
            CUDA_SUMSQDIFF = 249,
            CUDA_WIDTH = 250,
            CUDA_CONTAINS_POINT = 251,
            CUDA_DENAN = 252,
            CUDA_SUB_AND_DOT = 253,
            CUDA_MINMAXVAL = 254,
            CUDA_SUM = 255,
            CUDA_SQRT_SCALE = 256,
            CUDA_GER = 257,
            CUDA_SET_BOUNDS = 259,
            CUDA_MINMAXVEC = 260,
            CUDA_TRANSPOSE = 261,
            CUDA_SCALE_TO_RANGE = 262,
            CUDA_ERF = 263,
            CUDA_MASK = 264,

            CUDA_INTERP2 = 265,
            CUDA_MASK_BATCH = 266,
            CUDA_TRANSPOSE_HW = 267,

            CUDA_MAX = 268,
            CUDA_MIN = 269,

            CUDA_MULBSX = 270,
            CUDA_DIVBSX = 271,

            CUDA_MAX_BWD2 = 272,

            CUDA_IM2COL = 280,
            CUDA_IM2COL_ND = 281,
            CUDA_COL2IM = 282,
            CUDA_COL2IM_ND = 283,

            CUDA_ACCURACY_FWD = 286,

            // Channel functions.
            CUDA_CHANNEL_MEAN = 287,
            CUDA_CHANNEL_MIN = 289,
            CUDA_CHANNEL_MAX = 290,
            CUDA_CHANNEL_SUB = 291,
            CUDA_CHANNEL_SUM = 292,
            CUDA_CHANNEL_DIV = 293,
            CUDA_CHANNEL_DOT = 294,
            CUDA_CHANNEL_MUL = 295,
            CUDA_CHANNEL_COMPARE = 296,
            CUDA_CHANNEL_FILL = 297,
            CUDA_CHANNEL_SCALE = 298,
            CUDA_CHANNEL_MULV = 299,
            CUDA_CHANNEL_COPY = 300,
            CUDA_CHANNEL_FILLFROM = 301,
            CUDA_CHANNEL_COPYALL = 302,
            CUDA_CHANNEL_DUP = 303,
            CUDA_CHANNEL_ADD = 304,
            CUDA_CHANNEL_PERCENTILE = 305,
            CUDA_CHANNEL_OP_FWD = 306,
            CUDA_CHANNEL_OP_BWD = 307,

            // Random number generation.
            CUDA_RNG_SETSEED = 349,
            CUDA_RNG_UNIFORM = 350,
            CUDA_RNG_GAUSSIAN = 351,
            // CUDA_RNG_BERNOULLI = 352, // Not implemented yet.

            // Layer-support functions.
            CUDA_BATCHREIDX_FWD = 386,
            CUDA_BATCHREIDX_BWD = 387,

            CUDA_EMBED_FWD = 390,
            CUDA_EMBED_BWD = 391,

            CUDA_CLIP_FWD = 394,
            CUDA_CLIP_BWD = 395,

            CUDA_POOL_FWD = 400,
            CUDA_POOL_BWD = 401,

            CUDA_UNPOOL_FWD = 410,
            CUDA_UNPOOL_BWD = 411,

            CUDA_TANH_FWD = 420,
            CUDA_TANH_BWD = 421,

            CUDA_MISH_FWD = 422,
            CUDA_MISH_BWD = 423,

            CUDA_SIGMOID_FWD = 424,
            CUDA_SIGMOID_BWD = 425,

            CUDA_SWISH_BWD = 427,

            CUDA_RELU_FWD = 428,
            CUDA_RELU_BWD = 429,

            CUDA_ELU_FWD = 430,
            CUDA_ELU_BWD = 431,

            CUDA_DROPOUT_FWD = 432,
            CUDA_DROPOUT_BWD = 433,

            CUDA_BNLL_FWD = 435,
            CUDA_BNLL_BWD = 436,

            CUDA_PRELU_FWD = 438,
            CUDA_PRELU_BWD = 439,
            CUDA_PRELU_BWD_PARAM = 440,

            CUDA_NLLLOSS_FWD = 442,
            CUDA_NLLLOSS_BWD = 443,

            CUDA_SOFTMAXLOSS_FWD = 444,
            CUDA_SOFTMAXLOSS_BWD = 445,

            CUDA_MIN_FWD = 446,
            CUDA_MIN_BWD = 447,

            CUDA_MAX_FWD = 448,
            CUDA_MAX_BWD = 449,

            CUDA_CROP_FWD = 450,
            CUDA_CROP_BWD = 451,

            CUDA_CONCAT_FWD = 452,
            CUDA_CONCAT_BWD = 453,

            CUDA_SLICE_FWD = 455,
            CUDA_SLICE_BWD = 456,

            CUDA_TILE_FWD = 457,
            CUDA_TILE_BWD = 458,

            CUDA_BIAS_FWD = 460,

            CUDA_SCALE_FWD = 461,

            CUDA_THRESHOLD_FWD = 462,

            CUDA_CLL_BWD = 463,

            CUDA_LRN_FILLSCALE = 465,
            CUDA_LRN_COMPUTEOUTPUT = 466,
            CUDA_LRN_COMPUTEDIFF = 467,

            CUDA_SMOOTHL1_FWD = 470,
            CUDA_SMOOTHL1_BWD = 471,

            CUDA_SERF_FWD = 472,
            CUDA_SERF_BWD = 473,

            CUDA_PERMUTE = 474,

            CUDA_GATHER_FWD = 476,
            CUDA_GATHER_BWD = 477,

            CUDA_LSTM_FWD = 480,
            CUDA_LSTM_BWD = 481,

            CUDA_LSTM_UNIT_FWD = 482,
            CUDA_LSTM_UNIT_BWD = 483,

            CUDA_MATH_FWD = 487,
            CUDA_MATH_BWD = 488,

            CUDA_COEFF_SUM_FWD = 490,
            CUDA_COEFF_SUM_BWD = 491,

            CUDA_COEFF_SUB_FWD = 492,
            CUDA_COEFF_SUB_BWD = 493,

            CUDA_MEAN_ERROR_LOSS_BWD = 495,

            CUDA_SIGMOID_CROSS_ENTROPY_FWD = 496,
            CUDA_SIGMOID_CROSS_ENTROPY_BWD = 497,
            CUDA_SOFTMAX_CROSS_ENTROPY_FWD = 498,
            CUDA_SOFTMAX_CROSS_ENTROPY_BWD = 499,

            // Solver update functions.
            CUDA_SGD_UPDATE = 500,
            CUDA_NESTEROV_UPDATE = 501,
            CUDA_ADAGRAD_UPDATE = 502,
            CUDA_ADADELTA_UPDATE = 503,
            CUDA_ADAM_UPDATE = 504,
            CUDA_RMSPROP_UPDATE = 505,
            CUDA_ADAMW_UPDATE = 506,

            CUDA_COMBINE_DATA = 550,

            CUDA_GELU_FWD = 600,
            CUDA_GELU_BWD = 601,

            CUDA_SILU_FWD = 605,
            CUDA_SILU_BWD = 606,

            CUDA_SOFTPLUS_FWD = 610,
            CUDA_SOFTPLUS_BWD = 611,

            CUDA_LECUN_FWD = 615,
            CUDA_LECUN_BWD = 616,

            // Matrix functions.
            CUDA_MTX_SET_DIAGONAL = 700,
            CUDA_MTX_SET_DIAGONAL2 = 701,
            CUDA_MTX_ADD_VECTOR = 702,
            CUDA_MTX_TRANSPOSE_OPERATION = 703,
            CUDA_MTX_AGGREGATE_COLS = 704,
            CUDA_MTX_AGGREGATE_ROWS = 705,
            CUDA_MTX_TRANSPOSE = 706,
            CUDA_MTX_MEANCENTER_BY_COL = 707,
            CUDA_MTX_MEANCENTER_BY_ROW = 708,
            CUDA_MTX_EUCLIDEAN_DIST = 709,
            CUDA_MTX_DOT = 710,
            CUDA_MTX_MEAN = 711,
            CUDA_MTX_STDEV = 712,
            CUDA_MTX_CORRELATIONS = 714,

            // PCA functions.
            CUDA_CREATE_PCA = 800,
            CUDA_RUN_PCA = 801,
            CUDA_FREE_PCA = 802,

            // t-SNE functions.
            CUDA_TSNE_UPDATE = 850,
            CUDA_TSNE_UPDATE_GRAD = 851,
            CUDA_TSNE_COMPUTE_EXACT_ERROR = 852,
            CUDA_TSNE_COMPUTE_SQUARED_EUCLIDEAN_DISTANCE = 854,
            CUDA_TSNE_COMPUTE_Q_MATRIX = 855,
            CUDA_TSNE_COMPUTE_EXACT_GRADIENT = 856,
            CUDA_TSNE_SYMMETRIZE_MATRIX = 858,
            CUDA_TSNE_COMPUTE_KNN_BOUNDS = 859,

            CUDA_TSNE_CREATE_GAUSSIAN_PERPLEXITY = 870,
            CUDA_TSNE_FREE_GAUSSIAN_PERPLEXITY = 871,
            CUDA_TSNE_FIND_GAUSSIAN_PERPLEXITY = 872,

            CUDA_TSNE_CREATE = 875,
            CUDA_TSNE_FREE = 876,
            CUDA_TSNE_COMPUTE_GRADIENT1 = 877,
            CUDA_TSNE_COMPUTE_ERROR1 = 878,

            // Image processing functions.
            CUDA_GUASSIAN_BLUR = 900,
            CUDA_HAMMING_DIFF = 901,
            CUDA_CALC_BATCH_DIST = 902,
            CUDA_CALC_DFT = 903,

            // SSD functions.
            CUDA_CREATE_SSD = 950,
            CUDA_FREE_SSD = 951,
            CUDA_SETUP_SSD = 952,
            CUDA_SSD_FWD_MULTIBOXLOSS = 955,
            CUDA_SSD_ENCODE_LOCPRED = 958,
            CUDA_SSD_ENCODE_CONFPRED = 959,

            // LayerNorm functions.
            CUDA_CREATE_LAYERNORM = 970,
            CUDA_FREE_LAYERNORM = 971,
            CUDA_LAYERNORM_FWD = 975,
            CUDA_LAYERNORM_BWD = 976,

            CUDA_DEBUG = 1000
        }
1474
1475#pragma warning restore 1591
1476
1477
1488 public CudaDnn(int nDeviceID, DEVINIT flags = (DEVINIT.CUBLAS | DEVINIT.CURAND), long? lSeed = null, string strPath = "", bool bResetFirst = false, bool bEnableMemoryTrace = false)
1489 {
1490 m_memTracker = new CudaDnnMemoryTracker<T>(bEnableMemoryTrace);
1491 m_nDeviceId = nDeviceID;
1492 m_nIdx = get_index();
1493
1494 if (strPath == null || strPath.Length == 0)
1495 strPath = s_strCudaPath;
1496
1497 m_strPath = strPath;
1498 m_dt = (typeof(T) == typeof(double)) ? DataType.DOUBLE : DataType.FLOAT;
1499
1500 try
1501 {
1502 m_cuda = new CudaControlLib.CudaKernel();
1503 }
1504 catch (Exception excpt)
1505 {
1506 throw new Exception("The CudaControl is not registered! Make sure that you are using the 'x64' build and if so, run 'regsvr32 CudaControl.dll' from a CMD window with Administrative privileges to register.", excpt);
1507 }
1508
1509 try
1510 {
1511 if (string.IsNullOrEmpty(strPath))
1512 strPath = GetCudaDnnDllPath();
1513
1514 m_strPath = strPath;
1515
1516 string strDir = System.IO.Path.GetDirectoryName(strPath);
1517 string strCurDir = Directory.GetCurrentDirectory();
1518 Directory.SetCurrentDirectory(strDir);
1519
1520 m_cuda.Load(strPath);
1521
1522 Directory.SetCurrentDirectory(strCurDir);
1523 }
1524 catch (Exception excpt)
1525 {
1526 if (excpt.Message != null && excpt.Message.Length > 0)
1527 throw excpt;
1528
1529 throw new Exception("The CudaDnnDll.x.dll at '" + strPath + "' failed to load. The error code = 0x" + excpt.HResult.ToString("X"));
1530 }
1531
1532 try
1533 {
1534 lock (m_createSync)
1535 {
1536 if (m_dt == DataType.DOUBLE)
1537 {
1538 double[] rg = m_cuda.RunDouble(0, (int)CUDAFN.INITIALIZE, m_param.AsDouble(nDeviceID, (int)flags));
1539 m_hKernel = (long)rg[0];
1540 }
1541 else
1542 {
1543 float[] rg = m_cuda.RunFloat(0, (int)CUDAFN.INITIALIZE, m_param.AsFloat(nDeviceID, (int)flags));
1544 m_hKernel = (long)rg[0];
1545 }
1546 }
1547 }
1548 catch (Exception excpt)
1549 {
1550 if (excpt.Message != null && excpt.Message.Length > 0)
1551 throw excpt;
1552
1553 throw new Exception("CudaDnn failed to initialize. You may need to reboot or reset the Cuda GPU #" + nDeviceID.ToString() + ". The error code = 0x" + excpt.HResult.ToString("X"));
1554 }
1555
1556 if (bResetFirst)
1557 {
1558 ResetDevice();
1559
1560 lock (m_createSync)
1561 {
1562 if (m_dt == DataType.DOUBLE)
1563 {
1564 double[] rg = m_cuda.RunDouble(0, (int)CUDAFN.INITIALIZE, m_param.AsDouble(nDeviceID, (int)flags));
1565 m_hKernel = (long)rg[0];
1566 }
1567 else
1568 {
1569 float[] rg = m_cuda.RunFloat(0, (int)CUDAFN.INITIALIZE, m_param.AsFloat(nDeviceID, (int)flags));
1570 m_hKernel = (long)rg[0];
1571 }
1572 }
1573 }
1574
1575 if (lSeed.HasValue)
1576 SetRandomSeed(lSeed.Value);
1577
1578 m_tOne = (T)Convert.ChangeType(1.0, typeof(T));
1579 m_tZero = (T)Convert.ChangeType(0.0, typeof(T));
1580 }
1581
1587 public CudaDnn(CudaDnn<T> cuda, bool bEnableGhostMemory)
1588 {
1589 m_nDeviceId = cuda.m_nDeviceId;
1590 m_nIdx = get_index();
1591
1592 m_strPath = cuda.m_strPath;
1593 m_dt = cuda.m_dt;
1594 m_cuda = cuda.m_cuda;
1595 m_hKernel = cuda.m_hKernel;
1596 m_tOne = cuda.m_tOne;
1597 m_tZero = cuda.m_tZero;
1598
1599 if (bEnableGhostMemory)
1600 {
1601 m_rgGhostMemory = new Dictionary<long, T[]>();
1602 m_bGhostMemoryEnabled = true;
1603 }
1604
1605 m_bOwner = false;
1606 }
1607
        /// <summary>
        /// Releases the low-level kernel by running the CLEANUP function, but only when
        /// this instance owns the kernel (instances created with the sharing constructor
        /// do not).
        /// </summary>
        /// <param name="bDisposing">Standard dispose-pattern flag; currently unused by this implementation.</param>
        protected virtual void Dispose(bool bDisposing)
        {
            if (m_bOwner && m_hKernel != 0)
            {
                if (m_dt == DataType.DOUBLE)
                    m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CLEANUP, null);
                else
                    m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CLEANUP, null);

                // Drop the handle and kernel reference so a second Dispose is a no-op.
                m_hKernel = 0;
                m_cuda = null;
            }
        }
1625
1629 public void Dispose()
1630 {
1631 Dispose(true);
1632 }
1633
1638 public static string GetCudaDnnDllPath()
1639 {
1640 FileInfo fi = new FileInfo(Process.GetCurrentProcess().MainModule.FileName);
1641
1642 string strPath = fi.DirectoryName + "\\cuda_12.1\\CudaDnnDll.12.1.dll";
1643
1644 if (!File.Exists(strPath))
1645 {
1646 strPath = fi.DirectoryName + "\\CudaDnnDll.12.1.dll";
1647 if (!File.Exists(strPath))
1648 {
1649 strPath = fi.DirectoryName + "\\cuda_12.0\\CudaDnnDll.12.0.dll";
1650 if (!File.Exists(strPath))
1651 {
1652 strPath = fi.DirectoryName + "\\CudaDnnDll.12.0.dll";
1653 if (!File.Exists(strPath))
1654 {
1655 if (!File.Exists(strPath))
1656 {
1657 strPath = fi.DirectoryName + "\\cuda_11.8\\CudaDnnDll.11.8.dll";
1658 if (!File.Exists(strPath))
1659 {
1660 strPath = fi.DirectoryName + "\\CudaDnnDll.11.8.dll";
1661 if (!File.Exists(strPath))
1662 {
1663 strPath = fi.DirectoryName + "\\cuda_11.7\\CudaDnnDll.11.7.dll";
1664 if (!File.Exists(strPath))
1665 {
1666 strPath = fi.DirectoryName + "\\CudaDnnDll.11.7.dll";
1667 if (!File.Exists(strPath))
1668 {
1669 strPath = fi.DirectoryName + "\\cuda_11.6\\CudaDnnDll.11.6.dll";
1670 if (!File.Exists(strPath))
1671 {
1672 strPath = fi.DirectoryName + "\\CudaDnnDll.11.6.dll";
1673 if (!File.Exists(strPath))
1674 {
1675 strPath = fi.DirectoryName + "\\cuda_11.5\\CudaDnnDll.11.5.dll";
1676 if (!File.Exists(strPath))
1677 {
1678 strPath = fi.DirectoryName + "\\CudaDnnDll.11.5.dll";
1679 if (!File.Exists(strPath))
1680 {
1681 strPath = fi.DirectoryName + "\\cuda_11.4\\CudaDnnDll.11.4.dll";
1682 if (!File.Exists(strPath))
1683 {
1684 strPath = fi.DirectoryName + "\\CudaDnnDll.11.4.dll";
1685 if (!File.Exists(strPath))
1686 {
1687 strPath = fi.DirectoryName + "\\cuda_11.3\\CudaDnnDll.11.3.dll";
1688 if (!File.Exists(strPath))
1689 {
1690 strPath = fi.DirectoryName + "\\CudaDnnDll.11.3.dll";
1691 if (!File.Exists(strPath))
1692 {
1693 strPath = fi.DirectoryName + "\\cuda_11.2\\CudaDnnDll.11.2.dll";
1694 if (!File.Exists(strPath))
1695 {
1696 strPath = fi.DirectoryName + "\\CudaDnnDll.11.2.dll";
1697 if (!File.Exists(strPath))
1698 {
1699 strPath = fi.DirectoryName + "\\cuda_11.1\\CudaDnnDll.11.1.dll";
1700 if (!File.Exists(strPath))
1701 {
1702 strPath = fi.DirectoryName + "\\CudaDnnDll.11.1.dll";
1703 if (!File.Exists(strPath))
1704 {
1705 strPath = fi.DirectoryName + "\\cuda_11.0\\CudaDnnDll.11.0.dll";
1706 if (!File.Exists(strPath))
1707 {
1708 strPath = fi.DirectoryName + "\\CudaDnnDll.11.0.dll";
1709 if (!File.Exists(strPath))
1710 {
1711 strPath = fi.DirectoryName + "\\cuda_10.2\\CudaDnnDll.10.2.dll";
1712 if (!File.Exists(strPath))
1713 {
1714 strPath = fi.DirectoryName + "\\CudaDnnDll.10.2.dll";
1715 if (!File.Exists(strPath))
1716 {
1717 strPath = fi.DirectoryName + "\\cuda_10.2.3_5\\CudaDnnDll.10.2.dll";
1718 if (!File.Exists(strPath))
1719 {
1720 strPath = fi.DirectoryName + "\\CudaDnnDll.10.2.3_5.dll";
1721 if (!File.Exists(strPath))
1722 {
1723 strPath = fi.DirectoryName + "\\CudaDnnDll.10.1.dll";
1724 if (!File.Exists(strPath))
1725 {
1726 strPath = fi.DirectoryName + "\\CudaDnnDll.10.0.dll";
1727 if (!File.Exists(strPath))
1728 {
1729 strPath = fi.DirectoryName + "\\CudaDnnDll.9.2.dll";
1730 if (!File.Exists(strPath))
1731 {
1732 strPath = fi.DirectoryName + "\\CudaDnnDll.9.1.dll";
1733 if (!File.Exists(strPath))
1734 {
1735 if (!File.Exists(strPath))
1736 strPath = fi.DirectoryName + "\\CudaDnnDll.8.dll";
1737 }
1738 }
1739 }
1740 }
1741 }
1742 }
1743 }
1744 }
1745 }
1746 }
1747 }
1748 }
1749 }
1750 }
1751 }
1752 }
1753 }
1754 }
1755 }
1756 }
1757 }
1758 }
1759 }
1760 }
1761 }
1762 }
1763 }
1764 }
1765 }
1766 }
1767 }
1768
1769 return strPath;
1770 }
1771
1776 {
1777 m_bGhostMemoryEnabled = false;
1778 }
1779
1783 public void ResetGhostMemory()
1784 {
1785 if (m_rgGhostMemory != null)
1786 m_bGhostMemoryEnabled = true;
1787 else
1788 m_bGhostMemoryEnabled = false;
1789 }
1790
1794 public ulong TotalMemoryUsed
1795 {
1796 get { return m_memTracker.TotalMemoryUsed; }
1797 }
1798
1802 public string TotalMemoryUsedAsText
1803 {
1804 get { return m_memTracker.TotalMemoryUsedText; }
1805 }
1806
1811 public long KernelHandle
1812 {
1813 get { return m_hKernel; }
1814 }
1815
1829 public void KernelCopy(int nCount, long hSrc, int nSrcOffset, long hDstKernel, long hDst, int nDstOffset, long hHostBuffer, long hHostKernel = -1, long hStream = -1, long hSrcKernel = -1)
1830 {
1831 if (hSrcKernel == -1)
1832 hSrcKernel = m_hKernel;
1833
1834 if (m_dt == DataType.DOUBLE)
1835 m_cuda.RunDouble((int)hSrcKernel, (int)CUDAFN.KERNEL_MEMCOPY, m_param.AsDouble(nCount, hSrc, nSrcOffset, hDstKernel, hDst, nDstOffset, hHostBuffer, hHostKernel, hStream));
1836 else
1837 m_cuda.RunFloat((int)hSrcKernel, (int)CUDAFN.KERNEL_MEMCOPY, m_param.AsFloat(nCount, hSrc, nSrcOffset, hDstKernel, hDst, nDstOffset, hHostBuffer, hHostKernel, hStream));
1838 }
1839
1848 public void KernelAdd(int nCount, long hA, long hDstKernel, long hB, long hC)
1849 {
1850 if (m_dt == DataType.DOUBLE)
1851 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.KERNEL_ADD, m_param.AsDouble(nCount, hA, hDstKernel, hB, hC));
1852 else
1853 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.KERNEL_ADD, m_param.AsFloat(nCount, hA, hDstKernel, hB, hC));
1854 }
1855
1866 public long KernelCopyNccl(long hSrcKernel, long hSrcNccl)
1867 {
1868 if (m_dt == DataType.DOUBLE)
1869 {
1870 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.KERNEL_COPY_NCCL, m_param.AsDouble(hSrcKernel, hSrcNccl));
1871 return (long)rg[0];
1872 }
1873 else
1874 {
1875 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.KERNEL_COPY_NCCL, m_param.AsFloat(hSrcKernel, hSrcNccl));
1876 return (long)rg[0];
1877 }
1878 }
1879
1880 private static int get_index()
1881 {
1882 s_nIdxSeed++;
1883 return s_nIdxSeed;
1884 }
1885
1890 public static void SetDefaultCudaPath(string strPath)
1891 {
1892 s_strCudaPath = strPath;
1893 }
1894
1899 public static ulong basetype_size(bool bUseHalfSize)
1900 {
1901 if (bUseHalfSize)
1902 return 2;
1903
1904 if (typeof(T) == typeof(float))
1905 return 4;
1906 else
1907 return 8;
1908 }
1909
1910 private double convertD(T fVal)
1911 {
1912 return (double)Convert.ChangeType(fVal, typeof(double));
1913 }
1914
1915 private float convertF(T fVal)
1916 {
1917 return (float)Convert.ChangeType(fVal, typeof(float));
1918 }
1919
1923 public string Path
1924 {
1925 get { return m_strPath; }
1926 }
1927
1931 public static string DefaultPath
1932 {
1933 get { return s_strCudaPath; }
1934 }
1935
1936#pragma warning disable 1591
1937
1938 public void CombineData(int nCount, long hOriginal, long hUpdated, double dfUpdatedPct, long hServer, double dfServerPct, long hNewData)
1939 {
1940 if (m_dt == DataType.DOUBLE)
1941 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COMBINE_DATA, m_param.AsDouble(dfUpdatedPct, dfServerPct), m_param.AsLong(nCount, hOriginal, hUpdated, 0, hServer, 0, hNewData));
1942 else
1943 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COMBINE_DATA, m_param.AsFloat((float)dfUpdatedPct, (float)dfServerPct), m_param.AsLong(nCount, hOriginal, hUpdated, 0, hServer, 0, hNewData));
1944 }
1945
1946#pragma warning restore 1591
1947
1948
1949 //---------------------------------------------------------------------
1950 // ICudaDevice Methods
1951 //---------------------------------------------------------------------
1952 #region ICudaDevice Methods
1953
1960 public void SetDeviceID(int nDeviceID = -1, DEVINIT flags = DEVINIT.NONE, long? lSeed = null)
1961 {
1962 if (m_cuda == null || m_hKernel <= 0)
1963 throw new Exception("CudaDnn has already nbeen disposed!");
1964
1965 if (nDeviceID == -1)
1966 nDeviceID = m_nDeviceId;
1967 else
1968 m_nDeviceId = nDeviceID;
1969
1970 if (m_dt == DataType.DOUBLE)
1971 {
1972 if (lSeed.HasValue)
1973 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.SETDEVICE, m_param.AsDouble(nDeviceID, (int)flags, lSeed.Value));
1974 else
1975 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.SETDEVICE, m_param.AsDouble(nDeviceID, (int)flags));
1976 }
1977 else
1978 {
1979 if (lSeed.HasValue)
1980 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.SETDEVICE, m_param.AsFloat(nDeviceID, (int)flags, lSeed.Value));
1981 else
1982 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.SETDEVICE, m_param.AsFloat(nDeviceID, (int)flags));
1983 }
1984 }
1985
1990 public void SetRandomSeed(long lSeed)
1991 {
1992 if (m_cuda == null || m_hKernel <= 0)
1993 throw new Exception("CudaDnn has already nbeen disposed!");
1994
1995 if (m_dt == DataType.DOUBLE)
1996 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.SETRANDOMSEED, m_param.AsDouble(lSeed));
1997 else
1998 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.SETRANDOMSEED, m_param.AsFloat(lSeed));
1999 }
2000
2004 public int OriginalDeviceID
2005 {
2006 get { return m_nDeviceId; }
2007 }
2008
2013 public int GetDeviceID()
2014 {
2015 if (m_cuda == null || m_hKernel <= 0)
2016 throw new Exception("CudaDnn has already nbeen disposed!");
2017
2018 if (m_dt == DataType.DOUBLE)
2019 {
2020 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.GETDEVICE, null);
2021 return (int)rg[0];
2022 }
2023 else
2024 {
2025 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.GETDEVICE, null);
2026 return (int)rg[0];
2027 }
2028 }
2029
2035 public string GetDeviceName(int nDeviceID)
2036 {
2037 if (m_cuda == null || m_hKernel <= 0)
2038 throw new Exception("CudaDnn has already nbeen disposed!");
2039
2040 string[] rgstr = m_cuda.QueryString((int)m_hKernel, (int)CUDAQRY.DEVICE_NAME, new int[] { nDeviceID });
2041 return rgstr[0];
2042 }
2043
2049 public string GetDeviceP2PInfo(int nDeviceID)
2050 {
2051 if (m_cuda == null || m_hKernel <= 0)
2052 throw new Exception("CudaDnn has already nbeen disposed!");
2053
2054 string[] rgstr = m_cuda.QueryString((int)m_hKernel, (int)CUDAQRY.DEVICE_P2P_INFO, new int[] { nDeviceID });
2055 return rgstr[0];
2056 }
2057
2064 public string GetDeviceInfo(int nDeviceID, bool bVerbose = false)
2065 {
2066 if (m_cuda == null || m_hKernel <= 0)
2067 throw new Exception("CudaDnn has already nbeen disposed!");
2068
2069 string[] rgstr = m_cuda.QueryString((int)m_hKernel, (int)CUDAQRY.DEVICE_INFO, new int[] { nDeviceID, (bVerbose) ? 1 : 0 });
2070 return rgstr[0];
2071 }
2072
2079 public void ResetDevice()
2080 {
2081 if (m_cuda == null || m_hKernel <= 0)
2082 throw new Exception("CudaDnn has already nbeen disposed!");
2083
2084 if (m_dt == DataType.DOUBLE)
2085 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.RESETDEVICE, null);
2086 else
2087 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.RESETDEVICE, null);
2088 }
2089
2093 public void SynchronizeDevice()
2094 {
2095 if (m_cuda == null || m_hKernel <= 0)
2096 throw new Exception("CudaDnn has already nbeen disposed!");
2097
2098 if (m_dt == DataType.DOUBLE)
2099 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.SYNCHRONIZEDEVICE, null);
2100 else
2101 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.SYNCHRONIZEDEVICE, null);
2102 }
2103
2109 public int GetMultiGpuBoardGroupID(int nDeviceID)
2110 {
2111 if (m_dt == DataType.DOUBLE)
2112 {
2113 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.GETDEVICEPROP, m_param.AsDouble(nDeviceID, (int)DEVPROP.MULTIGPUBOARDGROUPID));
2114 return (int)rg[0];
2115 }
2116 else
2117 {
2118 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.GETDEVICEPROP, m_param.AsFloat(nDeviceID, (int)DEVPROP.MULTIGPUBOARDGROUPID));
2119 return (int)rg[0];
2120 }
2121 }
2122
2127 public int GetDeviceCount()
2128 {
2129 if (m_cuda == null || m_hKernel <= 0)
2130 return 0;
2131
2132 try
2133 {
2134 if (m_dt == DataType.DOUBLE)
2135 {
2136 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.GETDEVICEPROP, m_param.AsDouble(0, (int)DEVPROP.DEVICECOUNT));
2137 return (int)rg[0];
2138 }
2139 else
2140 {
2141 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.GETDEVICEPROP, m_param.AsFloat(0, (int)DEVPROP.DEVICECOUNT));
2142 return (int)rg[0];
2143 }
2144 }
2145 catch (Exception)
2146 {
2147 return 0;
2148 }
2149 }
2150
2160 public bool CheckMemoryAttributes(long hSrc, int nSrcDeviceID, long hDst, int nDstDeviceID)
2161 {
2162 if (m_dt == DataType.DOUBLE)
2163 {
2164 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CHECKMEMORYATTRIB, m_param.AsDouble(hSrc, nSrcDeviceID, hDst, nDstDeviceID));
2165 return (rg[0] == 0) ? false : true;
2166 }
2167 else
2168 {
2169 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CHECKMEMORYATTRIB, m_param.AsFloat(hSrc, nSrcDeviceID, hDst, nDstDeviceID));
2170 return (rg[0] == 0) ? false : true;
2171 }
2172 }
2173
2182 public double GetDeviceMemory(out double dfFree, out double dfUsed, out bool bCudaCallUsed, int nDeviceID = -1)
2183 {
2184 if (nDeviceID == -1)
2185 nDeviceID = m_nDeviceId;
2186
2187 if (m_dt == DataType.DOUBLE)
2188 {
2189 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.GETDEVICEMEMORY, m_param.AsDouble(nDeviceID));
2190 dfFree = rg[1];
2191 dfUsed = rg[2];
2192 bCudaCallUsed = (rg[3] == 0) ? false : true;
2193 return rg[0];
2194 }
2195 else
2196 {
2197 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.GETDEVICEMEMORY, m_param.AsFloat(nDeviceID));
2198 dfFree = (double)rg[1];
2199 dfUsed = (double)rg[2];
2200 bCudaCallUsed = (rg[3] == 0) ? false : true;
2201 return (double)rg[0];
2202 }
2203 }
2204
2216 public string GetRequiredCompute(out int nMinMajor, out int nMinMinor)
2217 {
2218 if (m_dt == DataType.DOUBLE)
2219 {
2220 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.GETREQUIREDCOMPUTE, null);
2221 nMinMajor = (int)rg[0];
2222 nMinMinor = (int)rg[1];
2223 }
2224 else
2225 {
2226 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.GETREQUIREDCOMPUTE, null);
2227 nMinMajor = (int)rg[0];
2228 nMinMinor = (int)rg[1];
2229 }
2230
2231 return m_strPath;
2232 }
2233
2240 public bool DeviceCanAccessPeer(int nSrcDeviceID, int nPeerDeviceID)
2241 {
2242 if (m_dt == DataType.DOUBLE)
2243 {
2244 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.DEVICE_CANACCESSPEER, m_param.AsDouble(nSrcDeviceID, nPeerDeviceID));
2245 return (rg[0] == 0) ? false : true;
2246 }
2247 else
2248 {
2249 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.DEVICE_CANACCESSPEER, m_param.AsFloat(nSrcDeviceID, nPeerDeviceID));
2250 return (rg[0] == 0) ? false : true;
2251 }
2252 }
2253
2258 public void DeviceEnablePeerAccess(int nPeerDeviceID)
2259 {
2260 if (m_dt == DataType.DOUBLE)
2261 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.DEVICE_ENABLEPEERACCESS, m_param.AsDouble(nPeerDeviceID));
2262 else
2263 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.DEVICE_ENABLEPEERACCESS, m_param.AsFloat(nPeerDeviceID));
2264 }
2265
2270 public void DeviceDisablePeerAccess(int nPeerDeviceID)
2271 {
2272 if (m_dt == DataType.DOUBLE)
2273 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.DEVICE_DISABLEPEERACCESS, m_param.AsDouble(nPeerDeviceID));
2274 else
2275 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.DEVICE_DISABLEPEERACCESS, m_param.AsFloat(nPeerDeviceID));
2276 }
2277
2278 #endregion
2279
2280 //---------------------------------------------------------------------
2281 // ICudaMemory Methods
2282 //---------------------------------------------------------------------
2283 #region ICudaMemory Methods
2284
2291 public long AllocMemory(List<double> rg)
2292 {
2293 return AllocMemory(rg.ToArray());
2294 }
2295
2302 public long AllocMemory(List<float> rg)
2303 {
2304 return AllocMemory(rg.ToArray());
2305 }
2306
2314 public long AllocMemory(double[] rgSrc, long hStream = 0)
2315 {
2316 return AllocMemory(convert(rgSrc), hStream);
2317 }
2318
2326 public long AllocMemory(float[] rgSrc, long hStream = 0)
2327 {
2328 return AllocMemory(convert(rgSrc), hStream);
2329 }
2330
        /// <summary>
        /// Allocates GPU memory and copies the array of type 'T' into it, optionally using a
        /// stream for the copy and (float base type only) half-size storage.
        /// </summary>
        /// <param name="rgSrc">Specifies the data to copy to the GPU; must be non-null and non-empty.</param>
        /// <param name="hStream">Optionally specifies a stream handle used for the copy (0 = none).</param>
        /// <param name="bHalfSize">Optionally specifies half-size (FP16) storage; only valid with the 'float' base type.</param>
        /// <returns>The handle to the allocated GPU memory (tracked by the memory tracker) is returned.</returns>
        /// <exception cref="ArgumentNullException">Thrown when 'rgSrc' is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when 'rgSrc' is empty.</exception>
        /// <exception cref="Exception">Thrown on half-size with the double base type, or wrapped as an out-of-memory error.</exception>
        public long AllocMemory(T[] rgSrc, long hStream = 0, bool bHalfSize = false)
        {
            if (rgSrc == null)
                throw new ArgumentNullException();

            if (rgSrc.Length == 0)
                throw new ArgumentOutOfRangeException();

            try
            {
                if (m_dt == DataType.DOUBLE)
                {
                    // Half-size (FP16) storage is only implemented in the float code path.
                    if (bHalfSize)
                        throw new Exception("Half sizes are only supported with the 'float' base type.");

                    // Argument layout: [count, (stream), data...]; rgInput2 carries the
                    // same header values as longs for the Ex2 entry point.
                    List<double> rgInput = new List<double>() { rgSrc.Length };
                    List<long> rgInput2 = new List<long>() { rgSrc.Length };

                    if (hStream > 0)
                    {
                        rgInput.Add(hStream);
                        rgInput2.Add(hStream);
                    }

                    rgInput.AddRange(convertD(rgSrc));

                    double[] rg;

                    // Serialize allocation bookkeeping (ghost-memory table + tracker).
                    lock (m_memSync)
                    {
                        if (m_rgGhostMemory == null || !m_bGhostMemoryEnabled)
                        {
                            rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.ALLOCMEM, rgInput.ToArray(), rgInput2.ToArray());
                        }
                        else
                        {
                            // Ghost-memory mode: keep a host-side copy and hand out a synthetic handle.
                            m_nGhostMemoryIndex++;
                            m_rgGhostMemory.Add(m_nGhostMemoryIndex, convert(Utility.Clone<double>(rgInput).ToArray()));
                            rg = new double[] { m_nGhostMemoryIndex };
                        }

                        // NOTE(review): rgInput.Count includes the header entries (length and
                        // optional stream), so the tracked size slightly exceeds rgSrc.Length —
                        // confirm whether this is intended.
                        return m_memTracker.AllocMemory(m_hKernel, m_nDeviceId, (long)rg[0], (ulong)rgInput.Count, bHalfSize);
                    }
                }
                else
                {
                    // Float base type: same flow, with an FP16 entry point when requested.
                    List<float> rgInput = new List<float>() { rgSrc.Length };
                    List<long> rgInput2 = new List<long>() { rgSrc.Length };

                    if (hStream > 0)
                    {
                        rgInput.Add(hStream);
                        rgInput2.Add(hStream);
                    }

                    rgInput.AddRange(convertF(rgSrc));

                    float[] rg;

                    lock (m_memSync)
                    {
                        if (m_rgGhostMemory == null || !m_bGhostMemoryEnabled)
                        {
                            if (bHalfSize)
                                rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.ALLOCMEM_HALF, rgInput.ToArray(), rgInput2.ToArray());
                            else
                                rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.ALLOCMEM, rgInput.ToArray(), rgInput2.ToArray());
                        }
                        else
                        {
                            m_nGhostMemoryIndex++;
                            m_rgGhostMemory.Add(m_nGhostMemoryIndex, convert(Utility.Clone<float>(rgInput).ToArray()));
                            rg = new float[] { m_nGhostMemoryIndex };
                        }

                        return m_memTracker.AllocMemory(m_hKernel, m_nDeviceId, (long)rg[0], (ulong)rgInput.Count, bHalfSize);
                    }
                }
            }
            catch (Exception excpt)
            {
                // Re-throw with current usage details so OOM failures are actionable.
                string strMemory = m_memTracker.TotalMemoryUsedText;
                string strDevice = GetDeviceName(m_nDeviceId);
                throw new Exception("Out of memory! You are currently using " + strMemory + " of memory on " + strDevice + ". You may need to use a different GPU that has more memory.", excpt);
            }
        }
2424
2428 public static ulong BaseSize
2429 {
2430 get { return m_lBaseSize; }
2431 }
2432
2438 public static ulong ConvertByteSizeToCount(ulong ulSizeInBytes)
2439 {
2440 return ulSizeInBytes / m_lBaseSize;
2441 }
2442
        /// <summary>
        /// Allocates uninitialized GPU memory with the given capacity (in items of the base type),
        /// optionally (float base type only) using half-size storage.
        /// </summary>
        /// <param name="lCapacity">Specifies the number of items to allocate; must be positive.</param>
        /// <param name="bHalfSize">Optionally specifies half-size (FP16) storage; only valid with the 'float' base type.</param>
        /// <returns>The handle to the allocated GPU memory (tracked by the memory tracker) is returned.</returns>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when 'lCapacity' is not positive.</exception>
        /// <exception cref="Exception">Thrown on half-size with the double base type, or wrapped as an out-of-memory error.</exception>
        public long AllocMemory(long lCapacity, bool bHalfSize = false)
        {
            if (lCapacity <= 0)
                throw new ArgumentOutOfRangeException();

            long[] rgIn = new long[] { lCapacity };

            try
            {
                if (m_dt == DataType.DOUBLE)
                {
                    // Half-size (FP16) storage is only implemented in the float code path.
                    if (bHalfSize)
                        throw new Exception("Half sizes are only supported with the 'float' base type.");

                    double[] rgOut;
                    // Serialize allocation bookkeeping (ghost-memory table + tracker).
                    lock (m_memSync)
                    {
                        if (m_rgGhostMemory == null || !m_bGhostMemoryEnabled)
                        {
                            rgOut = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.ALLOCMEM, null, rgIn);
                        }
                        else
                        {
                            // Ghost-memory mode: keep a zero-filled host-side copy and hand
                            // out a synthetic handle instead of allocating on the GPU.
                            m_nGhostMemoryIndex++;
                            m_rgGhostMemory.Add(m_nGhostMemoryIndex, convert(Utility.Create<double>((int)lCapacity, 0).ToArray()));
                            rgOut = new double[] { m_nGhostMemoryIndex };
                        }

                        return m_memTracker.AllocMemory(m_hKernel, m_nDeviceId, (long)rgOut[0], (ulong)lCapacity, bHalfSize);
                    }
                }
                else
                {
                    float[] rgOut;
                    lock (m_memSync)
                    {
                        if (m_rgGhostMemory == null || !m_bGhostMemoryEnabled)
                        {
                            if (bHalfSize)
                                rgOut = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.ALLOCMEM_HALF, null, rgIn);
                            else
                                rgOut = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.ALLOCMEM, null, rgIn);
                        }
                        else
                        {
                            m_nGhostMemoryIndex++;
                            m_rgGhostMemory.Add(m_nGhostMemoryIndex, convert(Utility.Create<float>((int)lCapacity, 0).ToArray()));
                            rgOut = new float[] { m_nGhostMemoryIndex };
                        }

                        return m_memTracker.AllocMemory(m_hKernel, m_nDeviceId, (long)rgOut[0], (ulong)lCapacity, bHalfSize);
                    }
                }
            }
            catch (Exception excpt)
            {
                // Re-throw with the requested size and current usage so OOM failures are actionable.
                string strMemory = m_memTracker.TotalMemoryUsedText;
                string strDevice = GetDeviceName(m_nDeviceId);
                long lMb = (lCapacity * (int)basetype_size(false)) / 1000000;

                throw new Exception("Out of memory! There is not enough memory to allocate the requested " + lMb.ToString("N0") + " MB of memory. You are currently using " + strMemory + " of memory on " + strDevice + ". You may need to use a different GPU that has more memory.", excpt);
            }
        }
2512
2517 public void FreeMemory(long hMem)
2518 {
2519 if (m_cuda == null || m_hKernel <= 0)
2520 {
2521 Trace.WriteLine("WARNING: CudaDnn has already been disposed, cannot free memory.");
2522 return;
2523 }
2524
2525 lock (m_memSync)
2526 {
2527 if (m_dt == DataType.DOUBLE)
2528 {
2529 m_memTracker.FreeMemory(m_hKernel, m_nDeviceId, hMem);
2530
2531 if (m_rgGhostMemory == null || !m_bGhostMemoryEnabled)
2532 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FREEMEM, null, m_param.AsLong(hMem));
2533 else
2534 m_rgGhostMemory.Remove(hMem);
2535 }
2536 else
2537 {
2538 m_memTracker.FreeMemory(m_hKernel, m_nDeviceId, hMem);
2539
2540 if (m_rgGhostMemory == null || !m_bGhostMemoryEnabled)
2541 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FREEMEM, null, m_param.AsLong(hMem));
2542 else
2543 m_rgGhostMemory.Remove(hMem);
2544 }
2545 }
2546 }
2547
2554 public void CopyDeviceToHost(long lCount, long hGpuSrc, long hHostDst)
2555 {
2556 if (m_dt == DataType.DOUBLE)
2557 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.COPY_DEVICE_TO_HOST, null, m_param.AsLong(lCount, hGpuSrc, hHostDst));
2558 else
2559 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.COPY_DEVICE_TO_HOST, null, m_param.AsLong(lCount, hGpuSrc, hHostDst));
2560 }
2561
2568 public void CopyHostToDevice(long lCount, long hHostSrc, long hGpuDst)
2569 {
2570 if (m_dt == DataType.DOUBLE)
2571 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.COPY_HOST_TO_DEVICE, null, m_param.AsLong(lCount, hHostSrc, hGpuDst));
2572 else
2573 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.COPY_HOST_TO_DEVICE, null, m_param.AsLong(lCount, hHostSrc, hGpuDst));
2574 }
2575
2581 public long AllocHostBuffer(long lCapacity)
2582 {
2583 if (lCapacity == 0)
2584 throw new ArgumentOutOfRangeException();
2585
2586 if (m_dt == DataType.DOUBLE)
2587 {
2588 double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.ALLOCHOSTBUFFER, null, m_param.AsLong(lCapacity));
2589 return (long)rg[0];
2590 }
2591 else
2592 {
2593 float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.ALLOCHOSTBUFFER, null, m_param.AsLong(lCapacity));
2594 return (long)rg[0];
2595 }
2596 }
2597
2602 public void FreeHostBuffer(long hMem)
2603 {
2604 if (m_cuda == null || m_hKernel <= 0)
2605 {
2606 Trace.WriteLine("WARNING: CudaDnn has already been disposed, cannot free memory.");
2607 return;
2608 }
2609
2610 if (m_dt == DataType.DOUBLE)
2611 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FREEHOSTBUFFER, null, m_param.AsLong(hMem));
2612 else
2613 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FREEHOSTBUFFER, null, m_param.AsLong(hMem));
2614 }
2615
2621 public long GetHostBufferCapacity(long hMem)
2622 {
2623 if (m_dt == DataType.DOUBLE)
2624 {
2625 double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.GETHOSTBUFFERCAPACITY, null, m_param.AsLong(hMem));
2626 return (long)rg[0];
2627 }
2628 else
2629 {
2630 float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.GETHOSTBUFFERCAPACITY, null, m_param.AsLong(hMem));
2631 return (long)rg[0];
2632 }
2633 }
2634
2641 public double[] GetHostMemoryDouble(long hMem)
2642 {
2643 return convertD(GetHostMemory(hMem));
2644 }
2645
2652 public float[] GetHostMemoryFloat(long hMem)
2653 {
2654 return convertF(GetHostMemory(hMem));
2655 }
2656
2662 public T[] GetHostMemory(long hMem)
2663 {
2664 if (m_dt == DataType.DOUBLE)
2665 return convert(m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.GETHOSTMEM, null, m_param.AsLong(hMem)));
2666 else
2667 return convert(m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.GETHOSTMEM, null, m_param.AsLong(hMem)));
2668 }
2669
2677 public double[] GetMemoryDouble(long hMem, long lCount = -1)
2678 {
2679 return convertD(GetMemory(hMem, lCount));
2680 }
2681
2689 public float[] GetMemoryFloat(long hMem, long lCount = -1)
2690 {
2691 return convertF(GetMemory(hMem, lCount));
2692 }
2693
        /// <summary>
        /// Returns the contents of GPU memory (or of the ghost-memory entry when a ghost-memory
        /// table exists) as an array of type 'T'.
        /// </summary>
        /// <param name="hMem">Specifies the handle to the GPU memory (or ghost-memory key).</param>
        /// <param name="lCount">Optionally specifies the number of items to read (-1 reads all).</param>
        /// <returns>The memory contents are returned.</returns>
        public T[] GetMemory(long hMem, long lCount = -1)
        {
            // NOTE(review): unlike AllocMemory/FreeMemory, this method keys only on the table
            // being non-null and ignores m_bGhostMemoryEnabled — when ghost memory has been
            // disabled but not cleared, reads still come from the table. Confirm whether this
            // asymmetry is intended.
            if (m_dt == DataType.DOUBLE)
            {
                if (m_rgGhostMemory == null)
                {
                    double[] rgr = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.GETMEM, null, m_param.AsLong(hMem, lCount));
                    return convert(rgr);
                }
                else
                {
                    return m_rgGhostMemory[hMem];
                }
            }
            else
            {
                if (m_rgGhostMemory == null)
                {
                    float[] rgr = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.GETMEM, null, m_param.AsLong(hMem, lCount));
                    return convert(rgr);
                }
                else
                {
                    return m_rgGhostMemory[hMem];
                }
            }
        }
2727
2734 public void SetMemory(long hMem, List<double> rg)
2735 {
2736 SetMemory(hMem, rg.ToArray());
2737 }
2738
2745 public void SetMemory(long hMem, List<float> rg)
2746 {
2747 SetMemory(hMem, rg.ToArray());
2748 }
2749
2757 public void SetMemory(long hMem, double[] rgSrc, long hStream = 0)
2758 {
2759 SetMemory(hMem, convert(rgSrc), hStream);
2760 }
2761
2769 public void SetMemory(long hMem, float[] rgSrc, long hStream = 0)
2770 {
2771 SetMemory(hMem, convert(rgSrc), hStream);
2772 }
2773
2781 public void SetMemory(long hMem, T[] rgSrc, long hStream = 0, int nCount = -1)
2782 {
2783 if (nCount == -1)
2784 nCount = rgSrc.Length;
2785
2786 if (rgSrc == null || nCount == 0)
2787 throw new ArgumentOutOfRangeException("There are no data items to set!");
2788
2789 if (m_hKernel > 0)
2790 {
2791 if (m_rgGhostMemory != null)
2792 {
2793 m_rgGhostMemory[hMem] = Utility.Clone<T>(rgSrc);
2794 }
2795 else
2796 {
2797 if (m_dt == DataType.DOUBLE)
2798 {
2799 int nDataCount = 2;
2800
2801 if (hStream > 0)
2802 nDataCount++;
2803
2804 nDataCount += nCount;
2805
2806 double[] rg = new double[nDataCount];
2807
2808 rg[0] = hMem;
2809 rg[1] = nCount;
2810 int nIdx = 2;
2811
2812 if (hStream > 0)
2813 {
2814 rg[nIdx] = hStream;
2815 nIdx++;
2816 }
2817
2818 long[] rgIn = new long[] { hMem, nCount };
2819
2820 convertD(rgSrc, rg, nIdx, nCount);
2821 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SETMEM, rg, rgIn);
2822 }
2823 else
2824 {
2825 int nDataCount = 2;
2826
2827 if (hStream > 0)
2828 nDataCount++;
2829
2830 nDataCount += nCount;
2831
2832 float[] rg = new float[nDataCount];
2833
2834 rg[0] = hMem;
2835 rg[1] = nCount;
2836 int nIdx = 2;
2837
2838 if (hStream > 0)
2839 {
2840 rg[nIdx] = hStream;
2841 nIdx++;
2842 }
2843
2844 long[] rgIn = new long[] { hMem, nCount };
2845
2846 convertF(rgSrc, rg, nIdx, nCount);
2847 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SETMEM, rg, rgIn);
2848 }
2849 }
2850 }
2851 }
2852
2860 public void SetMemoryAt(long hMem, double[] rgSrc, int nOffset)
2861 {
2862 SetMemoryAt(hMem, convert(rgSrc), nOffset);
2863 }
2864
2872 public void SetMemoryAt(long hMem, float[] rgSrc, int nOffset)
2873 {
2874 SetMemoryAt(hMem, convert(rgSrc), nOffset);
2875 }
2876
2883 public void SetMemoryAt(long hMem, T[] rgSrc, int nOffset)
2884 {
2885 if (rgSrc == null || rgSrc.Length == 0)
2886 throw new ArgumentOutOfRangeException("There are no data items to set!");
2887
2888 if (m_hKernel > 0)
2889 {
2890 if (m_rgGhostMemory != null)
2891 throw new Exception("Ghost memory does not support SetMemoryAt.");
2892
2893 if (m_dt == DataType.DOUBLE)
2894 {
2895 int nDataCount = 3 + rgSrc.Length;
2896 double[] rg = new double[nDataCount];
2897
2898 rg[0] = hMem;
2899 rg[1] = rgSrc.Length;
2900 rg[2] = nOffset;
2901
2902 long[] rgIn = new long[] { hMem, rgSrc.Length, nOffset };
2903
2904 convertD(rgSrc, rg, 3);
2905 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SETMEMAT, rg, rgIn);
2906 }
2907 else
2908 {
2909 int nDataCount = 3 + rgSrc.Length;
2910 float[] rg = new float[nDataCount];
2911
2912 rg[0] = hMem;
2913 rg[1] = rgSrc.Length;
2914 rg[2] = nOffset;
2915
2916 long[] rgIn = new long[] { hMem, rgSrc.Length, nOffset };
2917
2918 convertF(rgSrc, rg, 3);
2919 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SETMEMAT, rg, rgIn);
2920 }
2921 }
2922 }
2923
2933 public T[] SetPixel(long hMem, int nCount, bool bReturnOriginal, int nOffset, params Tuple<int, T>[] rgPixel)
2934 {
2935 if (rgPixel.Length == 0)
2936 throw new Exception("You must specify at least one pixel!");
2937
2938 if (m_dt == DataType.DOUBLE)
2939 {
2940 double[] rg = new double[5 + rgPixel.Length * 2];
2941
2942 rg[0] = hMem;
2943 rg[1] = nCount;
2944 rg[2] = (bReturnOriginal) ? 1 : 0;
2945 rg[3] = nOffset;
2946 rg[4] = rgPixel.Length;
2947 int nIdx = 5;
2948
2949 for (int i = 0; i < rgPixel.Length; i++)
2950 {
2951 rg[nIdx] = rgPixel[i].Item1;
2952 nIdx++;
2953 rg[nIdx] = convertD1(rgPixel[i].Item2);
2954 nIdx++;
2955 }
2956
2957 rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.SETPIXEL, rg);
2958 if (rg == null)
2959 return null;
2960
2961 return convert(rg);
2962 }
2963 else
2964 {
2965 float[] rg = new float[5 + rgPixel.Length * 2];
2966
2967 rg[0] = hMem;
2968 rg[1] = nCount;
2969 rg[2] = (bReturnOriginal) ? 1 : 0;
2970 rg[3] = nOffset;
2971 rg[4] = rgPixel.Length;
2972 int nIdx = 5;
2973
2974 for (int i = 0; i < rgPixel.Length; i++)
2975 {
2976 rg[nIdx] = rgPixel[i].Item1;
2977 nIdx++;
2978 rg[nIdx] = convertF1(rgPixel[i].Item2);
2979 nIdx++;
2980 }
2981
2982 rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.SETPIXEL, rg);
2983 if (rg == null)
2984 return null;
2985
2986 return convert(rg);
2987 }
2988 }
2989
2995 public void SetHostMemory(long hMem, T[] rgSrc)
2996 {
2997 if (m_dt == DataType.DOUBLE)
2998 {
2999 int nDataCount = 2 + rgSrc.Length;
3000 double[] rg = new double[nDataCount];
3001
3002 rg[0] = hMem;
3003 rg[1] = rgSrc.Length;
3004
3005 convertD(rgSrc, rg, 2);
3006 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SETHOSTMEM, rg, m_param.AsLong(hMem, rgSrc.Length));
3007 }
3008 else
3009 {
3010 int nDataCount = 2 + rgSrc.Length;
3011 float[] rg = new float[nDataCount];
3012
3013 rg[0] = hMem;
3014 rg[1] = rgSrc.Length;
3015
3016 convertF(rgSrc, rg, 2);
3017 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SETHOSTMEM, rg, m_param.AsLong(hMem, rgSrc.Length));
3018 }
3019 }
3020
3028 public long CreateMemoryPointer(long hData, long lOffset, long lCount)
3029 {
3030 if (m_dt == DataType.DOUBLE)
3031 {
3032 double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CREATE_MEMORYPOINTER, null, m_param.AsLong(hData, lOffset, lCount));
3033 return (long)rg[0];
3034 }
3035 else
3036 {
3037 float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CREATE_MEMORYPOINTER, null, m_param.AsLong(hData, lOffset, lCount));
3038 return (long)rg[0];
3039 }
3040 }
3041
3046 public void FreeMemoryPointer(long hData)
3047 {
3048 if (m_cuda == null || m_hKernel <= 0)
3049 {
3050 Trace.WriteLine("WARNING: CudaDnn has already been disposed, cannot free memory pointer.");
3051 return;
3052 }
3053
3054 if (m_dt == DataType.DOUBLE)
3055 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FREE_MEMORYPOINTER, null, m_param.AsLong(hData));
3056 else
3057 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FREE_MEMORYPOINTER, null, m_param.AsLong(hData));
3058 }
3059
3069 public long CreateMemoryTest(out ulong ulTotalNumBlocks, out double dfMemAllocatedInGB, out ulong ulMemStartAddr, out ulong ulBlockSize, double dfPctToAllocate = 1.0)
3070 {
3071 if (m_dt == DataType.DOUBLE)
3072 {
3073 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_MEMTEST, m_param.AsDouble(dfPctToAllocate));
3074 ulTotalNumBlocks = (ulong)rg[1];
3075 dfMemAllocatedInGB = (double)rg[2];
3076 ulMemStartAddr = (ulong)rg[3];
3077 ulBlockSize = (ulong)rg[4];
3078 return (long)rg[0];
3079 }
3080 else
3081 {
3082 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_MEMTEST, m_param.AsFloat((float)dfPctToAllocate));
3083 ulTotalNumBlocks = (ulong)rg[1];
3084 dfMemAllocatedInGB = (double)rg[2];
3085 ulMemStartAddr = (ulong)rg[3];
3086 ulBlockSize = (ulong)rg[4];
3087 return (long)rg[0];
3088 }
3089 }
3090
3095 public void FreeMemoryTest(long h)
3096 {
3097 if (m_dt == DataType.DOUBLE)
3098 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_MEMTEST, m_param.AsDouble(h));
3099 else
3100 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_MEMTEST, m_param.AsFloat(h));
3101 }
3102
3123 public T[] RunMemoryTest(long h, MEMTEST_TYPE type, ulong ulBlockStartOffset, ulong ulBlockCount, bool bVerbose, bool bWrite, bool bReadWrite, bool bRead)
3124 {
3125 List<ulong> rgErrorAddresses = new List<ulong>();
3126
3127 if (m_dt == DataType.DOUBLE)
3128 {
3129 double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.RUN_MEMTEST, null, m_param.AsLong(h, (long)type, (long)ulBlockStartOffset, (long)ulBlockCount, (bVerbose) ? 1 : 0, (bWrite) ? 1 : 0, (bReadWrite) ? 1 : 0, (bRead) ? 1 : 0));
3130 return (T[])Convert.ChangeType(rg, typeof(T[]));
3131 }
3132 else
3133 {
3134 float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.RUN_MEMTEST, null, m_param.AsLong(h, (long)type, (long)ulBlockStartOffset, (long)ulBlockCount, (bVerbose) ? 1 : 0, (bWrite) ? 1 : 0, (bReadWrite) ? 1 : 0, (bRead) ? 1 : 0));
3135 return (T[])Convert.ChangeType(rg, typeof(T[]));
3136 }
3137 }
3138
3153 public long CreateImageOp(int nNum, double dfBrightnessProb, double dfBrightnessDelta, double dfContrastProb, double dfContrastLower, double dfContrastUpper, double dfSaturationProb, double dfSaturationLower, double dfSaturationUpper, long lRandomSeed = 0)
3154 {
3155 if (m_dt == DataType.DOUBLE)
3156 {
3157 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_IMAGEOP, m_param.AsDouble(nNum, dfBrightnessProb, dfBrightnessDelta, dfContrastProb, dfContrastLower, dfContrastUpper, dfSaturationProb, dfSaturationLower, dfSaturationUpper, lRandomSeed));
3158 return (long)rg[0];
3159 }
3160 else
3161 {
3162 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_IMAGEOP, m_param.AsFloat(nNum, (float)dfBrightnessProb, (float)dfBrightnessDelta, (float)dfContrastProb, (float)dfContrastLower, (float)dfContrastUpper, (float)dfSaturationProb, (float)dfSaturationLower, (float)dfSaturationUpper, lRandomSeed));
3163 return (long)rg[0];
3164 }
3165 }
3166
3171 public void FreeImageOp(long h)
3172 {
3173 if (m_dt == DataType.DOUBLE)
3174 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_IMAGEOP, m_param.AsDouble(h));
3175 else
3176 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_IMAGEOP, m_param.AsFloat(h));
3177 }
3178
3188 public void DistortImage(long h, int nCount, int nNum, int nDim, long hX, long hY)
3189 {
3190 if (m_dt == DataType.DOUBLE)
3191 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.DISTORTIMAGE_IMAGEOP, null, m_param.AsLong(h, nCount, nNum, nDim, hX, hY));
3192 else
3193 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.DISTORTIMAGE_IMAGEOP, null, m_param.AsLong(h, nCount, nNum, nDim, hX, hY));
3194 }
3195
3196 #endregion
3197
3198 //---------------------------------------------------------------------
3199 // ICudaDnn Methods
3200 //---------------------------------------------------------------------
3201 #region ICudaDnn Methods
3202
3209 public long CreateStream(bool bNonBlocking = false, int nIndex = -1)
3210 {
3211 if (m_dt == DataType.DOUBLE)
3212 {
3213 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_STREAM, m_param.AsDouble((bNonBlocking) ? 1 : 0, nIndex));
3214 return (long)rg[0];
3215 }
3216 else
3217 {
3218 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_STREAM, m_param.AsFloat((bNonBlocking) ? 1 : 0, nIndex));
3219 return (long)rg[0];
3220 }
3221 }
3222
3227 public void FreeStream(long h)
3228 {
3229 if (m_dt == DataType.DOUBLE)
3230 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_STREAM, m_param.AsDouble(h));
3231 else
3232 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_STREAM, m_param.AsFloat(h));
3233 }
3234
3239 public void SynchronizeStream(long h = 0)
3240 {
3241 if (m_dt == DataType.DOUBLE)
3242 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.SYNCRHONIZE_STREAM, m_param.AsDouble(h));
3243 else
3244 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.SYNCRHONIZE_STREAM, m_param.AsFloat(h));
3245 }
3246
3250 public void SynchronizeThread()
3251 {
3252 if (m_dt == DataType.DOUBLE)
3253 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.SYNCHRONIZE_THREAD, null);
3254 else
3255 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.SYNCHRONIZE_THREAD, null);
3256 }
3257
3263 public long CreateCuDNN(long hStream = 0)
3264 {
3265 if (m_dt == DataType.DOUBLE)
3266 {
3267 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_CUDNN, m_param.AsDouble(hStream));
3268 return (long)rg[0];
3269 }
3270 else
3271 {
3272 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_CUDNN, m_param.AsFloat(hStream));
3273 return (long)rg[0];
3274 }
3275 }
3276
3281 public void FreeCuDNN(long h)
3282 {
3283 if (m_dt == DataType.DOUBLE)
3284 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_CUDNN, m_param.AsDouble(h));
3285 else
3286 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_CUDNN, m_param.AsFloat(h));
3287 }
3288
3297 public long CreateNCCL(int nDeviceId, int nCount, int nRank, Guid guid)
3298 {
3299 if (m_dt == DataType.DOUBLE)
3300 {
3301 List<double> rgParam = new List<double>() { nDeviceId, nCount, nRank };
3302 List<double> rgGuid = guidToArrayDouble(guid);
3303
3304 rgParam.Add(rgGuid.Count);
3305 rgParam.AddRange(rgGuid);
3306
3307 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_NCCL, rgParam.ToArray());
3308 return (long)rg[0];
3309 }
3310 else
3311 {
3312 List<float> rgParam = new List<float>() { nDeviceId, nCount, nRank };
3313 List<float> rgGuid = guidToArrayFloat(guid);
3314
3315 rgParam.Add(rgGuid.Count);
3316 rgParam.AddRange(rgGuid);
3317
3318 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_NCCL, rgParam.ToArray());
3319 return (long)rg[0];
3320 }
3321 }
3322
3323 private List<double> guidToArrayDouble(Guid guid)
3324 {
3325 List<double> rgdf = new List<double>();
3326 string str = guid.ToString();
3327 string[] rgstr = str.Split('-');
3328
3329 foreach (string str1 in rgstr)
3330 {
3331 long val = Convert.ToInt64(str1, 16);
3332 rgdf.Add(val);
3333 }
3334
3335 return rgdf;
3336 }
3337
3338 private List<float> guidToArrayFloat(Guid guid)
3339 {
3340 List<double> rgDf = guidToArrayDouble(guid);
3341 List<float> rg = new List<float>();
3342
3343 foreach (double df in rgDf)
3344 {
3345 rg.Add((float)df);
3346 }
3347
3348 return rg;
3349 }
3350
3355 public void FreeNCCL(long hNccl)
3356 {
3357 if (m_dt == DataType.DOUBLE)
3358 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_NCCL, m_param.AsDouble(hNccl));
3359 else
3360 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_NCCL, m_param.AsFloat(hNccl));
3361 }
3362
3370 public void NcclInitializeSingleProcess(params long[] rghNccl)
3371 {
3372 if (m_dt == DataType.DOUBLE)
3373 {
3374 List<double> rg = new List<double>() { 0, rghNccl.Length };
3375
3376 for (int i = 0; i < rghNccl.Length; i++)
3377 {
3378 rg.Add(rghNccl[i]);
3379 }
3380
3381 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.NCCL_INIT_SINGLEPROCESS, rg.ToArray());
3382 }
3383 else
3384 {
3385 List<float> rg = new List<float>() { 0, rghNccl.Length };
3386
3387 for (int i = 0; i < rghNccl.Length; i++)
3388 {
3389 rg.Add(rghNccl[i]);
3390 }
3391
3392 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.NCCL_INIT_SINGLEPROCESS, rg.ToArray());
3393 }
3394 }
3395
3403 public void NcclInitializeMultiProcess(long hNccl)
3404 {
3405 if (m_dt == DataType.DOUBLE)
3406 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.NCCL_INIT_MULTIPROCESS, m_param.AsDouble(hNccl));
3407 else
3408 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.NCCL_INIT_MULTIPROCESS, m_param.AsFloat(hNccl));
3409 }
3410
3421 public void NcclBroadcast(long hNccl, long hStream, long hX, int nCount)
3422 {
3423 Trace.WriteLine("Broadcasting from device ID " + GetDeviceID().ToString());
3424 if (m_dt == DataType.DOUBLE)
3425 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.NCCL_BROADCAST, null, m_param.AsLong(hNccl, hStream, hX, nCount));
3426 else
3427 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.NCCL_BROADCAST, null, m_param.AsLong(hNccl, hStream, hX, nCount));
3428 }
3429
3442 public void NcclAllReduce(long hNccl, long hStream, long hX, int nCount, NCCL_REDUCTION_OP op, double dfScale = 1.0)
3443 {
3444 if (m_dt == DataType.DOUBLE)
3445 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.NCCL_ALLREDUCE, m_param.AsDouble(dfScale), m_param.AsLong(hNccl, hStream, hX, nCount, (int)op, 0));
3446 else
3447 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.NCCL_ALLREDUCE, m_param.AsFloat((float)dfScale), m_param.AsLong(hNccl, hStream, hX, nCount, (int)op, 0));
3448 }
3449
3450
3456 public long CreateExtension(string strExtensionDllPath)
3457 {
3458 if (m_dt == DataType.DOUBLE)
3459 {
3460 double[] rg = m_cuda.RunDoubleEx((int)m_hKernel, (int)CUDAFN.CREATE_EXTENSION, null, strExtensionDllPath);
3461 return (long)rg[0];
3462 }
3463 else
3464 {
3465 float[] rg = m_cuda.RunFloatEx((int)m_hKernel, (int)CUDAFN.CREATE_EXTENSION, null, strExtensionDllPath);
3466 return (long)rg[0];
3467 }
3468 }
3469
3474 public void FreeExtension(long hExtension)
3475 {
3476 if (m_dt == DataType.DOUBLE)
3477 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_EXTENSION, m_param.AsDouble(hExtension));
3478 else
3479 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_EXTENSION, m_param.AsFloat(hExtension));
3480 }
3481
3489 public T[] RunExtension(long hExtension, long lfnIdx, T[] rgParam)
3490 {
3491 if (m_dt == DataType.DOUBLE)
3492 {
3493 List<double> rgdf = new List<double>() { hExtension, lfnIdx };
3494
3495 if (rgParam != null)
3496 rgdf.AddRange(Utility.ConvertVec<T>(rgParam));
3497
3498 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.EXTENSION_RUN, rgdf.ToArray());
3499 return Utility.ConvertVec<T>(rg);
3500 }
3501 else
3502 {
3503 List<float> rgf = new List<float>() { hExtension, lfnIdx };
3504
3505 if (rgParam != null)
3506 rgf.AddRange(Utility.ConvertVecF<T>(rgParam));
3507
3508 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.EXTENSION_RUN, rgf.ToArray());
3509 return Utility.ConvertVec<T>(rg);
3510 }
3511 }
3512
3513
3518 public long CreateTensorDesc()
3519 {
3520 if (m_dt == DataType.DOUBLE)
3521 {
3522 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_TENSORDESC, null);
3523 return (long)rg[0];
3524 }
3525 else
3526 {
3527 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_TENSORDESC, null);
3528 return (long)rg[0];
3529 }
3530 }
3531
3536 public void FreeTensorDesc(long h)
3537 {
3538 if (m_dt == DataType.DOUBLE)
3539 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_TENSORDESC, m_param.AsDouble(h));
3540 else
3541 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_TENSORDESC, m_param.AsFloat(h));
3542 }
3543
3551 public void SetTensorNdDesc(long hHandle, int[] rgDim, int[] rgStride, bool bHalf = false)
3552 {
3553 if (rgDim.Length != rgStride.Length)
3554 throw new Exception("The stride and dim arrays must have the same length.");
3555
3556 if (m_dt == DataType.DOUBLE)
3557 {
3558 List<long> rgArg = new List<long>() { hHandle, (bHalf) ? 1 : 0, rgDim.Length };
3559
3560 for (int i = 0; i < rgDim.Length; i++)
3561 {
3562 rgArg.Add(rgDim[i]);
3563 }
3564
3565 for (int i = 0; i < rgStride.Length; i++)
3566 {
3567 rgArg.Add(rgStride[i]);
3568 }
3569
3570 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_TENSORNDDESC, null, rgArg.ToArray());
3571 }
3572 else
3573 {
3574 List<long> rgArg = new List<long>() { hHandle, (bHalf) ? 1 : 0, rgDim.Length };
3575
3576 for (int i = 0; i < rgDim.Length; i++)
3577 {
3578 rgArg.Add(rgDim[i]);
3579 }
3580
3581 for (int i = 0; i < rgStride.Length; i++)
3582 {
3583 rgArg.Add(rgStride[i]);
3584 }
3585
3586 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_TENSORNDDESC, null, rgArg.ToArray());
3587 }
3588 }
3589
3599 public void SetTensorDesc(long hHandle, int n, int c, int h, int w, bool bHalf = false)
3600 {
3601 if (m_dt == DataType.DOUBLE)
3602 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_TENSORDESC, null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, n, c, h, w));
3603 else
3604 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_TENSORDESC, null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, n, c, h, w));
3605 }
3606
3620 public void SetTensorDesc(long hHandle, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride, bool bHalf = false)
3621 {
3622 if (m_dt == DataType.DOUBLE)
3623 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_TENSORDESC, null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, n, c, h, w, nStride, cStride, hStride, wStride));
3624 else
3625 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_TENSORDESC, null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, n, c, h, w, nStride, cStride, hStride, wStride));
3626 }
3627
        /// <summary>
        /// Add the source tensor into the destination tensor using alpha = beta = 1
        /// (delegates to the full overload with m_tOne for both scaling factors).
        /// </summary>
        /// <param name="hCuDnn">Specifies the handle to the cuDNN instance.</param>
        /// <param name="hSrcDesc">Specifies the handle to the source tensor descriptor.</param>
        /// <param name="hSrc">Specifies the handle to the source data.</param>
        /// <param name="nSrcOffset">Specifies the offset into the source data.</param>
        /// <param name="hDstDesc">Specifies the handle to the destination tensor descriptor.</param>
        /// <param name="hDst">Specifies the handle to the destination data.</param>
        /// <param name="nDstOffset">Specifies the offset into the destination data.</param>
        public void AddTensor(long hCuDnn, long hSrcDesc, long hSrc, int nSrcOffset, long hDstDesc, long hDst, int nDstOffset)
        {
            AddTensor(hCuDnn, m_tOne, hSrcDesc, hSrc, nSrcOffset, m_tOne, hDstDesc, hDst, nDstOffset);
        }
3642
3655 public void AddTensor(long hCuDnn, T fAlpha, long hSrcDesc, long hSrc, int nSrcOffset, T fBeta, long hDstDesc, long hDst, int nDstOffset)
3656 {
3657 if (m_dt == DataType.DOUBLE)
3658 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.ADD_TENSOR, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hSrcDesc, hSrc, nSrcOffset, 0, hDstDesc, hDst, nDstOffset));
3659 else
3660 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.ADD_TENSOR, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hSrcDesc, hSrc, nSrcOffset, 0, hDstDesc, hDst, nDstOffset));
3661 }
3662
3663
3668 public long CreateFilterDesc()
3669 {
3670 if (m_dt == DataType.DOUBLE)
3671 {
3672 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_FILTERDESC, null);
3673 return (long)rg[0];
3674 }
3675 else
3676 {
3677 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_FILTERDESC, null);
3678 return (long)rg[0];
3679 }
3680 }
3681
3686 public void FreeFilterDesc(long h)
3687 {
3688 if (m_dt == DataType.DOUBLE)
3689 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_FILTERDESC, m_param.AsDouble(h));
3690 else
3691 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_FILTERDESC, m_param.AsFloat(h));
3692 }
3693
3700 public void SetFilterNdDesc(long hHandle, int[] rgDim, bool bHalf = false)
3701 {
3702 if (m_dt == DataType.DOUBLE)
3703 {
3704 List<long> rgArg = new List<long>() { hHandle, (bHalf) ? 1 : 0, rgDim.Length };
3705
3706 for (int i = 0; i < rgDim.Length; i++)
3707 {
3708 rgArg.Add(rgDim[i]);
3709 }
3710
3711 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_FILTERNDDESC, null, rgArg.ToArray());
3712 }
3713 else
3714 {
3715 List<long> rgArg = new List<long>() { hHandle, (bHalf) ? 1 : 0, rgDim.Length };
3716
3717 for (int i = 0; i < rgDim.Length; i++)
3718 {
3719 rgArg.Add(rgDim[i]);
3720 }
3721
3722 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_FILTERNDDESC, null, rgArg.ToArray());
3723 }
3724 }
3725
3735 public void SetFilterDesc(long hHandle, int n, int c, int h, int w, bool bHalf = false)
3736 {
3737 if (m_dt == DataType.DOUBLE)
3738 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_FILTERDESC, null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, n, c, h, w));
3739 else
3740 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_FILTERDESC, null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, n, c, h, w));
3741 }
3742
3748 {
3749 if (m_dt == DataType.DOUBLE)
3750 {
3751 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_CONVDESC, null);
3752 return (long)rg[0];
3753 }
3754 else
3755 {
3756 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_CONVDESC, null);
3757 return (long)rg[0];
3758 }
3759 }
3760
3765 public void FreeConvolutionDesc(long h)
3766 {
3767 if (m_dt == DataType.DOUBLE)
3768 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_CONVDESC, m_param.AsDouble(h));
3769 else
3770 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_CONVDESC, m_param.AsFloat(h));
3771 }
3772
3785 public void SetConvolutionDesc(long hHandle, int hPad, int wPad, int hStride, int wStride, int hDilation, int wDilation, bool bUseTensorCores, bool bHalf = false)
3786 {
3787 if (m_dt == DataType.DOUBLE)
3788 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_CONVDESC, null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, hPad, wPad, hStride, wStride, hDilation, wDilation, (bUseTensorCores) ? 1 : 0));
3789 else
3790 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_CONVDESC, null, m_param.AsLong(hHandle, (bHalf) ? 1 : 0, hPad, wPad, hStride, wStride, hDilation, wDilation, (bUseTensorCores) ? 1 : 0));
3791 }
3792
3810 public void GetConvolutionInfo(long hCuDnn, long hBottomDesc, long hFilterDesc, long hConvDesc, long hTopDesc, ulong lWorkspaceSizeLimitInBytes, bool bUseTensorCores, out CONV_FWD_ALGO algoFwd, out ulong lWsSizeFwd, out CONV_BWD_FILTER_ALGO algoBwdFilter, out ulong lWsSizeBwdFilter, out CONV_BWD_DATA_ALGO algoBwdData, out ulong lWsSizeBwdData, CONV_FWD_ALGO preferredFwdAlgo = CONV_FWD_ALGO.NONE)
3811 {
3812 lock (m_getconvSync)
3813 {
3814 if (m_dt == DataType.DOUBLE)
3815 {
3816 double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.GET_CONVINFO, null, m_param.AsLong(hCuDnn, hBottomDesc, hFilterDesc, hConvDesc, hTopDesc, (long)lWorkspaceSizeLimitInBytes, (bUseTensorCores) ? 1 : 0, (int)preferredFwdAlgo));
3817 algoFwd = (CONV_FWD_ALGO)rg[0];
3818 lWsSizeFwd = (ulong)rg[1];
3819 algoBwdFilter = (CONV_BWD_FILTER_ALGO)rg[2];
3820 lWsSizeBwdFilter = (ulong)rg[3];
3821 algoBwdData = (CONV_BWD_DATA_ALGO)rg[4];
3822 lWsSizeBwdData = (ulong)rg[5];
3823 }
3824 else
3825 {
3826 float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.GET_CONVINFO, null, m_param.AsLong(hCuDnn, hBottomDesc, hFilterDesc, hConvDesc, hTopDesc, (long)lWorkspaceSizeLimitInBytes, (bUseTensorCores) ? 1 : 0, (int)preferredFwdAlgo));
3827 algoFwd = (CONV_FWD_ALGO)rg[0];
3828 lWsSizeFwd = (ulong)rg[1];
3829 algoBwdFilter = (CONV_BWD_FILTER_ALGO)rg[2];
3830 lWsSizeBwdFilter = (ulong)rg[3];
3831 algoBwdData = (CONV_BWD_DATA_ALGO)rg[4];
3832 lWsSizeBwdData = (ulong)rg[5];
3833 }
3834 }
3835 }
3836
3856 public void ConvolutionForward(long hCuDnn, long hBottomDesc, long hBottomData, int nBottomOffset, long hFilterDesc, long hWeight, int nWeightOffset, long hConvDesc, CONV_FWD_ALGO algoFwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, long hTopDesc, long hTopData, int nTopOffset, bool bSyncStream = true)
3857 {
3858 ConvolutionForward(hCuDnn, m_tOne, hBottomDesc, hBottomData, nBottomOffset, hFilterDesc, hWeight, nWeightOffset, hConvDesc, algoFwd, hWeight, nWeightOffset, lWorkspaceSize, m_tZero, hTopDesc, hTopData, nTopOffset, bSyncStream);
3859 }
3860
3882 public void ConvolutionForward(long hCuDnn, T fAlpha, long hBottomDesc, long hBottomData, int nBottomOffset, long hFilterDesc, long hWeight, int nWeightOffset, long hConvDesc, CONV_FWD_ALGO algoFwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, T fBeta, long hTopDesc, long hTopData, int nTopOffset, bool bSyncStream = true)
3883 {
3884 if (m_dt == DataType.DOUBLE)
3885 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FWD_CONV, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDesc, hBottomData, nBottomOffset, hFilterDesc, hWeight, nWeightOffset, hConvDesc, (long)algoFwd, hWorkspace, nWorkspaceOffset, (long)lWorkspaceSize, 0, hTopDesc, hTopData, nTopOffset, (bSyncStream) ? 1 : 0));
3886 else
3887 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FWD_CONV, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDesc, hBottomData, nBottomOffset, hFilterDesc, hWeight, nWeightOffset, hConvDesc, (long)algoFwd, hWorkspace, nWorkspaceOffset, (long)lWorkspaceSize, 0, hTopDesc, hTopData, nTopOffset, (bSyncStream) ? 1 : 0));
3888 }
3889
        /// <summary>
        /// Run the convolution backward pass for the bias with alpha = beta = 1
        /// (delegates to the full overload with m_tOne for both scaling factors).
        /// </summary>
        /// <param name="hCuDnn">Specifies the handle to the cuDNN instance.</param>
        /// <param name="hTopDesc">Specifies the handle to the top tensor descriptor.</param>
        /// <param name="hTopDiff">Specifies the handle to the top diff.</param>
        /// <param name="nTopOffset">Specifies the offset into the top diff.</param>
        /// <param name="hBiasDesc">Specifies the handle to the bias tensor descriptor.</param>
        /// <param name="hBiasDiff">Specifies the handle to the bias diff.</param>
        /// <param name="nBiasOffset">Specifies the offset into the bias diff.</param>
        /// <param name="bSyncStream">Optionally, specifies whether to synchronize the stream (default = true).</param>
        public void ConvolutionBackwardBias(long hCuDnn, long hTopDesc, long hTopDiff, int nTopOffset, long hBiasDesc, long hBiasDiff, int nBiasOffset, bool bSyncStream = true)
        {
            ConvolutionBackwardBias(hCuDnn, m_tOne, hTopDesc, hTopDiff, nTopOffset, m_tOne, hBiasDesc, hBiasDiff, nBiasOffset, bSyncStream);
        }
3905
3919 public void ConvolutionBackwardBias(long hCuDnn, T fAlpha, long hTopDesc, long hTopDiff, int nTopOffset, T fBeta, long hBiasDesc, long hBiasDiff, int nBiasOffset, bool bSyncStream = true)
3920 {
3921 if (m_dt == DataType.DOUBLE)
3922 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_CONV_BIAS, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDesc, hTopDiff, nTopOffset, 0, hBiasDesc, hBiasDiff, nBiasOffset, (bSyncStream) ? 1 : 0));
3923 else
3924 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_CONV_BIAS, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hTopDesc, hTopDiff, nTopOffset, 0, hBiasDesc, hBiasDiff, nBiasOffset, (bSyncStream) ? 1 : 0));
3925 }
3926
        /// <summary>
        /// Run the convolution backward pass for the filter with alpha = beta = 1
        /// (delegates to the full overload with m_tOne for both scaling factors).
        /// </summary>
        /// <param name="hCuDnn">Specifies the handle to the cuDNN instance.</param>
        /// <param name="hBottomDesc">Specifies the handle to the bottom tensor descriptor.</param>
        /// <param name="hBottomData">Specifies the handle to the bottom data.</param>
        /// <param name="nBottomOffset">Specifies the offset into the bottom data.</param>
        /// <param name="hTopDesc">Specifies the handle to the top tensor descriptor.</param>
        /// <param name="hTopDiff">Specifies the handle to the top diff.</param>
        /// <param name="nTopOffset">Specifies the offset into the top diff.</param>
        /// <param name="hConvDesc">Specifies the handle to the convolution descriptor.</param>
        /// <param name="algoBwd">Specifies the backward-filter algorithm to use.</param>
        /// <param name="hWorkspace">Specifies the handle to the workspace memory.</param>
        /// <param name="nWorkspaceOffset">Specifies the offset into the workspace memory.</param>
        /// <param name="lWorkspaceSize">Specifies the workspace size, in bytes.</param>
        /// <param name="hFilterDesc">Specifies the handle to the filter descriptor.</param>
        /// <param name="hWeightDiff">Specifies the handle to the weight diff.</param>
        /// <param name="nWeightOffset">Specifies the offset into the weight diff.</param>
        /// <param name="bSyncStream">Specifies whether to synchronize the stream.</param>
        public void ConvolutionBackwardFilter(long hCuDnn, long hBottomDesc, long hBottomData, int nBottomOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_FILTER_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, long hFilterDesc, long hWeightDiff, int nWeightOffset, bool bSyncStream)
        {
            ConvolutionBackwardFilter(hCuDnn, m_tOne, hBottomDesc, hBottomData, nBottomOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, algoBwd, hWorkspace, nWorkspaceOffset, lWorkspaceSize, m_tOne, hFilterDesc, hWeightDiff, nWeightOffset, bSyncStream);
        }
3950
3972 public void ConvolutionBackwardFilter(long hCuDnn, T fAlpha, long hBottomDesc, long hBottomData, int nBottomOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_FILTER_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, T fBeta, long hFilterDesc, long hWeightDiff, int nWeightOffset, bool bSyncStream = true)
3973 {
3974 if (m_dt == DataType.DOUBLE)
3975 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_CONV_FILTER, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDesc, hBottomData, nBottomOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, (long)algoBwd, hWorkspace, nWorkspaceOffset, (long)lWorkspaceSize, 0, hFilterDesc, hWeightDiff, nWeightOffset, (bSyncStream) ? 1 : 0));
3976 else
3977 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_CONV_FILTER, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hBottomDesc, hBottomData, nBottomOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, (long)algoBwd, hWorkspace, nWorkspaceOffset, (long)lWorkspaceSize, 0, hFilterDesc, hWeightDiff, nWeightOffset, (bSyncStream) ? 1 : 0));
3978 }
3979
        /// <summary>
        /// Run the convolution backward pass for the data with alpha = 1 and beta = 0
        /// (delegates to the full overload with m_tOne/m_tZero).
        /// </summary>
        /// <param name="hCuDnn">Specifies the handle to the cuDNN instance.</param>
        /// <param name="hFilterDesc">Specifies the handle to the filter descriptor.</param>
        /// <param name="hWeight">Specifies the handle to the weight data.</param>
        /// <param name="nWeightOffset">Specifies the offset into the weight data.</param>
        /// <param name="hTopDesc">Specifies the handle to the top tensor descriptor.</param>
        /// <param name="hTopDiff">Specifies the handle to the top diff.</param>
        /// <param name="nTopOffset">Specifies the offset into the top diff.</param>
        /// <param name="hConvDesc">Specifies the handle to the convolution descriptor.</param>
        /// <param name="algoBwd">Specifies the backward-data algorithm to use.</param>
        /// <param name="hWorkspace">Specifies the handle to the workspace memory.</param>
        /// <param name="nWorkspaceOffset">Specifies the offset into the workspace memory.</param>
        /// <param name="lWorkspaceSize">Specifies the workspace size, in bytes.</param>
        /// <param name="hBottomDesc">Specifies the handle to the bottom tensor descriptor.</param>
        /// <param name="hBottomDiff">Specifies the handle to the bottom diff.</param>
        /// <param name="nBottomOffset">Specifies the offset into the bottom diff.</param>
        /// <param name="bSyncStream">Optionally, specifies whether to synchronize the stream (default = true).</param>
        public void ConvolutionBackwardData(long hCuDnn, long hFilterDesc, long hWeight, int nWeightOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_DATA_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, long hBottomDesc, long hBottomDiff, int nBottomOffset, bool bSyncStream = true)
        {
            ConvolutionBackwardData(hCuDnn, m_tOne, hFilterDesc, hWeight, nWeightOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, algoBwd, hWorkspace, nWorkspaceOffset, lWorkspaceSize, m_tZero, hBottomDesc, hBottomDiff, nBottomOffset, bSyncStream);
        }
4003
4025 public void ConvolutionBackwardData(long hCuDnn, T fAlpha, long hFilterDesc, long hWeight, int nWeightOffset, long hTopDesc, long hTopDiff, int nTopOffset, long hConvDesc, CONV_BWD_DATA_ALGO algoBwd, long hWorkspace, int nWorkspaceOffset, ulong lWorkspaceSize, T fBeta, long hBottomDesc, long hBottomDiff, int nBottomOffset, bool bSyncStream = true)
4026 {
4027 if (m_dt == DataType.DOUBLE)
4028 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_CONV_DATA, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, 0, hFilterDesc, hWeight, nWeightOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, (long)algoBwd, hWorkspace, nWorkspaceOffset, (long)lWorkspaceSize, 0, hBottomDesc, hBottomDiff, nBottomOffset, (bSyncStream) ? 1 : 0));
4029 else
4030 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_CONV_DATA, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, 0, hFilterDesc, hWeight, nWeightOffset, hTopDesc, hTopDiff, nTopOffset, hConvDesc, (long)algoBwd, hWorkspace, nWorkspaceOffset, (long)lWorkspaceSize, 0, hBottomDesc, hBottomDiff, nBottomOffset, (bSyncStream) ? 1 : 0));
4031 }
4032
4037 public long CreatePoolingDesc()
4038 {
4039 if (m_dt == DataType.DOUBLE)
4040 {
4041 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_POOLDESC, null);
4042 return (long)rg[0];
4043 }
4044 else
4045 {
4046 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_POOLDESC, null);
4047 return (long)rg[0];
4048 }
4049 }
4050
4055 public void FreePoolingDesc(long h)
4056 {
4057 if (m_dt == DataType.DOUBLE)
4058 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_POOLDESC, m_param.AsDouble(h));
4059 else
4060 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_POOLDESC, m_param.AsFloat(h));
4061 }
4062
4074 public void SetPoolingDesc(long hHandle, PoolingMethod method, int h, int w, int hPad, int wPad, int hStride, int wStride)
4075 {
4076 if (m_dt == DataType.DOUBLE)
4077 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_POOLDESC, null, m_param.AsLong(hHandle, (int)method, h, w, hPad, wPad, hStride, wStride));
4078 else
4079 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_POOLDESC, null, m_param.AsLong(hHandle, (int)method, h, w, hPad, wPad, hStride, wStride));
4080 }
4081
4093 public void PoolingForward(long hCuDnn, long hPoolingDesc, T fAlpha, long hBottomDesc, long hBottomData, T fBeta, long hTopDesc, long hTopData)
4094 {
4095 if (m_dt == DataType.DOUBLE)
4096 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FWD_POOL, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, hPoolingDesc, 0, hBottomDesc, hBottomData, 0, hTopDesc, hTopData));
4097 else
4098 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FWD_POOL, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, hPoolingDesc, 0, hBottomDesc, hBottomData, 0, hTopDesc, hTopData));
4099 }
4100
4116 public void PoolingBackward(long hCuDnn, long hPoolingDesc, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
4117 {
4118 if (m_dt == DataType.DOUBLE)
4119 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_POOL, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), m_param.AsLong(hCuDnn, hPoolingDesc, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
4120 else
4121 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_POOL, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), m_param.AsLong(hCuDnn, hPoolingDesc, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff));
4122 }
4123
4132 public void DeriveBatchNormDesc(long hFwdScaleBiasMeanVarDesc, long hFwdBottomDesc, long hBwdScaleBiasMeanVarDesc, long hBwdBottomDesc, BATCHNORM_MODE mode)
4133 {
4134 if (m_dt == DataType.DOUBLE)
4135 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.DERIVE_BNDESC, null, m_param.AsLong(hFwdScaleBiasMeanVarDesc, hFwdBottomDesc, hBwdScaleBiasMeanVarDesc, hBwdBottomDesc, (int)mode));
4136 else
4137 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.DERIVE_BNDESC, null, m_param.AsLong(hFwdScaleBiasMeanVarDesc, hFwdBottomDesc, hBwdScaleBiasMeanVarDesc, hBwdBottomDesc, (int)mode));
4138 }
4139
        /// <summary>
        /// Run the batch-norm forward pass (training or inference).
        /// </summary>
        /// <param name="hCuDnn">Specifies the handle to the cuDNN instance.</param>
        /// <param name="mode">Specifies the batch-norm mode (PER_ACTIVATION or SPATIAL).</param>
        /// <param name="fAlpha">Specifies the scaling factor applied to the result.</param>
        /// <param name="fBeta">Specifies the scaling factor applied to the prior top contents.</param>
        /// <param name="hFwdBottomDesc">Specifies the handle to the forward bottom tensor descriptor.</param>
        /// <param name="hBottomData">Specifies the handle to the bottom data.</param>
        /// <param name="hFwdTopDesc">Specifies the handle to the forward top tensor descriptor.</param>
        /// <param name="hTopData">Specifies the handle to the top data.</param>
        /// <param name="hFwdScaleBiasMeanVarDesc">Specifies the handle to the scale/bias/mean/var descriptor.</param>
        /// <param name="hScaleData">Specifies the handle to the scale data.</param>
        /// <param name="hBiasData">Specifies the handle to the bias data.</param>
        /// <param name="dfFactor">Specifies the running-average factor.</param>
        /// <param name="hGlobalMean">Specifies the handle to the global mean data.</param>
        /// <param name="hGlobalVar">Specifies the handle to the global variance data.</param>
        /// <param name="dfEps">Specifies the epsilon added to the variance.</param>
        /// <param name="hSaveMean">Specifies the handle to the saved mean (used by the backward pass).</param>
        /// <param name="hSaveInvVar">Specifies the handle to the saved inverse variance (used by the backward pass).</param>
        /// <param name="bTraining">Specifies whether running in training (true) or inference (false) mode.</param>
        public void BatchNormForward(long hCuDnn, BATCHNORM_MODE mode, T fAlpha, T fBeta, long hFwdBottomDesc, long hBottomData, long hFwdTopDesc, long hTopData, long hFwdScaleBiasMeanVarDesc, long hScaleData, long hBiasData, double dfFactor, long hGlobalMean, long hGlobalVar, double dfEps, long hSaveMean, long hSaveInvVar, bool bTraining)
        {
            // The zeros in the long list are placeholders for the real-valued args (alpha, beta, factor, eps),
            // which travel in the typed array; positions must match the kernel's expected layout exactly.
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FWD_BN, m_param.AsDouble(convertD(fAlpha), convertD(fBeta), dfFactor, dfEps), m_param.AsLong(hCuDnn, (int)mode, 0, 0, hFwdBottomDesc, hBottomData, hFwdTopDesc, hTopData, hFwdScaleBiasMeanVarDesc, hScaleData, hBiasData, 0, hGlobalMean, hGlobalVar, 0, hSaveMean, hSaveInvVar, (bTraining) ? 1 : 0));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FWD_BN, m_param.AsFloat(convertF(fAlpha), convertF(fBeta), (float)dfFactor, (float)dfEps), m_param.AsLong(hCuDnn, (int)mode, 0, 0, hFwdBottomDesc, hBottomData, hFwdTopDesc, hTopData, hFwdScaleBiasMeanVarDesc, hScaleData, hBiasData, 0, hGlobalMean, hGlobalVar, 0, hSaveMean, hSaveInvVar, (bTraining) ? 1 : 0));
        }
4168
        /// <summary>
        /// Run the batch-norm backward pass.
        /// </summary>
        /// <param name="hCuDnn">Specifies the handle to the cuDNN instance.</param>
        /// <param name="mode">Specifies the batch-norm mode (PER_ACTIVATION or SPATIAL).</param>
        /// <param name="fAlphaDiff">Specifies the scaling factor applied to the data gradient.</param>
        /// <param name="fBetaDiff">Specifies the scaling factor applied to the prior bottom diff.</param>
        /// <param name="fAlphaParamDiff">Specifies the scaling factor applied to the parameter gradients.</param>
        /// <param name="fBetaParamDiff">Specifies the scaling factor applied to the prior parameter diffs.</param>
        /// <param name="hBwdBottomDesc">Specifies the handle to the backward bottom tensor descriptor.</param>
        /// <param name="hBottomData">Specifies the handle to the bottom data.</param>
        /// <param name="hTopDiffDesc">Specifies the handle to the top diff tensor descriptor.</param>
        /// <param name="hTopDiff">Specifies the handle to the top diff.</param>
        /// <param name="hBottomDiffDesc">Specifies the handle to the bottom diff tensor descriptor.</param>
        /// <param name="hBottomDiff">Specifies the handle to the bottom diff.</param>
        /// <param name="hBwdScaleBiasMeanVarDesc">Specifies the handle to the scale/bias/mean/var descriptor.</param>
        /// <param name="hScaleData">Specifies the handle to the scale data.</param>
        /// <param name="hScaleDiff">Specifies the handle to the scale diff.</param>
        /// <param name="hBiasDiff">Specifies the handle to the bias diff.</param>
        /// <param name="dfEps">Specifies the epsilon added to the variance.</param>
        /// <param name="hSaveMean">Specifies the handle to the mean saved by the forward pass.</param>
        /// <param name="hSaveInvVar">Specifies the handle to the inverse variance saved by the forward pass.</param>
        public void BatchNormBackward(long hCuDnn, BATCHNORM_MODE mode, T fAlphaDiff, T fBetaDiff, T fAlphaParamDiff, T fBetaParamDiff, long hBwdBottomDesc, long hBottomData, long hTopDiffDesc, long hTopDiff, long hBottomDiffDesc, long hBottomDiff, long hBwdScaleBiasMeanVarDesc, long hScaleData, long hScaleDiff, long hBiasDiff, double dfEps, long hSaveMean, long hSaveInvVar)
        {
            // The zeros in the long list are placeholders for the real-valued args (the four alpha/beta factors
            // and eps), which travel in the typed array; positions must match the kernel's expected layout exactly.
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_BN, m_param.AsDouble(convertD(fAlphaDiff), convertD(fBetaDiff), convertD(fAlphaParamDiff), convertD(fBetaParamDiff), dfEps), m_param.AsLong(hCuDnn, (int)mode, 0, 0, 0, 0, hBwdBottomDesc, hBottomData, hTopDiffDesc, hTopDiff, hBottomDiffDesc, hBottomDiff, hBwdScaleBiasMeanVarDesc, hScaleData, hScaleDiff, hBiasDiff, 0, hSaveMean, hSaveInvVar));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_BN, m_param.AsFloat(convertF(fAlphaDiff), convertF(fBetaDiff), convertF(fAlphaParamDiff), convertF(fBetaParamDiff), (float)dfEps), m_param.AsLong(hCuDnn, (int)mode, 0, 0, 0, 0, hBwdBottomDesc, hBottomData, hTopDiffDesc, hTopDiff, hBottomDiffDesc, hBottomDiff, hBwdScaleBiasMeanVarDesc, hScaleData, hScaleDiff, hBiasDiff, 0, hSaveMean, hSaveInvVar));
        }
4198
4203 public long CreateDropoutDesc()
4204 {
4205 if (m_dt == DataType.DOUBLE)
4206 {
4207 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_DROPOUTDESC, null);
4208 return (long)rg[0];
4209 }
4210 else
4211 {
4212 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_DROPOUTDESC, null);
4213 return (long)rg[0];
4214 }
4215 }
4216
4221 public void FreeDropoutDesc(long h)
4222 {
4223 if (m_dt == DataType.DOUBLE)
4224 m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_DROPOUTDESC, m_param.AsDouble(h));
4225 else
4226 m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_DROPOUTDESC, m_param.AsFloat(h));
4227 }
4228
4237 public void SetDropoutDesc(long hCuDnn, long hDropoutDesc, double dfDropout, long hStates, long lSeed)
4238 {
4239 if (m_dt == DataType.DOUBLE)
4240 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_DROPOUTDESC, m_param.AsDouble(dfDropout), m_param.AsLong(hCuDnn, hDropoutDesc, 0, hStates, lSeed));
4241 else
4242 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_DROPOUTDESC, m_param.AsFloat((float)dfDropout), m_param.AsLong(hCuDnn, hDropoutDesc, 0, hStates, lSeed));
4243 }
4244
4252 public void GetDropoutInfo(long hCuDnn, long hBottomDesc, out ulong ulStateCount, out ulong ulReservedCount)
4253 {
4254 if (m_dt == DataType.DOUBLE)
4255 {
4256 double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.GET_DROPOUT_INFO, null, m_param.AsLong(hCuDnn, hBottomDesc));
4257 ulStateCount = (ulong)Math.Round(rg[0] / sizeof(double), 0, MidpointRounding.AwayFromZero);
4258 ulReservedCount = (ulong)Math.Round(rg[1] / sizeof(double), 0, MidpointRounding.AwayFromZero);
4259 }
4260 else
4261 {
4262 float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.GET_DROPOUT_INFO, null, m_param.AsLong(hCuDnn, hBottomDesc));
4263 ulStateCount = (ulong)Math.Round(rg[0] / sizeof(float), 0, MidpointRounding.AwayFromZero);
4264 ulReservedCount = (ulong)Math.Round(rg[1] / sizeof(float), 0, MidpointRounding.AwayFromZero);
4265 }
4266 }
4267
4278 public void DropoutForward(long hCuDnn, long hDropoutDesc, long hBottomDesc, long hBottomData, long hTopDesc, long hTopData, long hReserved)
4279 {
4280 if (m_dt == DataType.DOUBLE)
4281 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FWD_DROPOUT, null, m_param.AsLong(hCuDnn, hDropoutDesc, hBottomDesc, hBottomData, hTopDesc, hTopData, hReserved));
4282 else
4283 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FWD_DROPOUT, null, m_param.AsLong(hCuDnn, hDropoutDesc, hBottomDesc, hBottomData, hTopDesc, hTopData, hReserved));
4284 }
4285
4296 public void DropoutBackward(long hCuDnn, long hDropoutDesc, long hTopDesc, long hTop, long hBottomDesc, long hBottom, long hReserved)
4297 {
4298 if (m_dt == DataType.DOUBLE)
4299 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_DROPOUT, null, m_param.AsLong(hCuDnn, hDropoutDesc, hTopDesc, hTop, hBottomDesc, hBottom, hReserved));
4300 else
4301 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_DROPOUT, null, m_param.AsLong(hCuDnn, hDropoutDesc, hTopDesc, hTop, hBottomDesc, hBottom, hReserved));
4302 }
4303
4308 public long CreateLRNDesc()
4309 {
4310 if (m_dt == DataType.DOUBLE)
4311 {
4312 double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_LRNDESC, null);
4313 return (long)rg[0];
4314 }
4315 else
4316 {
4317 float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_LRNDESC, null);
4318 return (long)rg[0];
4319 }
4320 }
4321
/// <summary>
/// Free a previously created LRN descriptor.
/// </summary>
/// <param name="h">Specifies the handle to the LRN descriptor to free.</param>
public void FreeLRNDesc(long h)
{
    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_LRNDESC, m_param.AsDouble(h));
        return;
    }

    m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_LRNDESC, m_param.AsFloat(h));
}
4333
/// <summary>
/// Set the values of an LRN descriptor.
/// </summary>
/// <param name="hHandle">Specifies the handle to the LRN descriptor.</param>
/// <param name="nSize">Specifies the normalization window size.</param>
/// <param name="fAlpha">Specifies the alpha value.</param>
/// <param name="fBeta">Specifies the beta value.</param>
/// <param name="fK">Specifies the k value.</param>
public void SetLRNDesc(long hHandle, uint nSize, double fAlpha, double fBeta, double fK)
{
    // The trailing zeros pad the long-argument list expected by the kernel.
    long[] rgArg = m_param.AsLong(hHandle, nSize, 0, 0, 0);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_LRNDESC, m_param.AsDouble(fAlpha, fBeta, fK), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_LRNDESC, m_param.AsFloat((float)fAlpha, (float)fBeta, (float)fK), rgArg);
}
4349
/// <summary>
/// Run the cuDnn LRN cross-channel forward pass.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hNormDesc">Specifies a handle to the LRN descriptor.</param>
/// <param name="fAlpha">Specifies the scaling factor applied to the result.</param>
/// <param name="hBottomDesc">Specifies a handle to the bottom tensor descriptor.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="fBeta">Specifies the scaling factor applied to the prior destination value.</param>
/// <param name="hTopDesc">Specifies a handle to the top tensor descriptor.</param>
/// <param name="hTopData">Specifies a handle to the top data in GPU memory.</param>
public void LRNCrossChannelForward(long hCuDnn, long hNormDesc, T fAlpha, long hBottomDesc, long hBottomData, T fBeta, long hTopDesc, long hTopData)
{
    // Pack the handle arguments once; the embedded zeros are placeholders used by the kernel.
    long[] rgArg = m_param.AsLong(hCuDnn, hNormDesc, 0, hBottomDesc, hBottomData, 0, hTopDesc, hTopData);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.LRN_CC_FWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.LRN_CC_FWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4368
/// <summary>
/// Run the cuDnn LRN cross-channel backward pass.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hNormDesc">Specifies a handle to the LRN descriptor.</param>
/// <param name="fAlpha">Specifies the scaling factor applied to the result.</param>
/// <param name="hTopDataDesc">Specifies a handle to the top data tensor descriptor.</param>
/// <param name="hTopData">Specifies a handle to the top data in GPU memory.</param>
/// <param name="hTopDiffDesc">Specifies a handle to the top diff tensor descriptor.</param>
/// <param name="hTopDiff">Specifies a handle to the top diff in GPU memory.</param>
/// <param name="hBottomDataDesc">Specifies a handle to the bottom data tensor descriptor.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="fBeta">Specifies the scaling factor applied to the prior destination value.</param>
/// <param name="hBottomDiffDesc">Specifies a handle to the bottom diff tensor descriptor.</param>
/// <param name="hBottomDiff">Specifies a handle to the bottom diff in GPU memory.</param>
public void LRNCrossChannelBackward(long hCuDnn, long hNormDesc, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
{
    // Pack the handle arguments once; the embedded zeros are placeholders used by the kernel.
    long[] rgArg = m_param.AsLong(hCuDnn, hNormDesc, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.LRN_CC_BWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.LRN_CC_BWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4391
/// <summary>
/// Run the cuDnn divisive normalization (LCN) forward pass.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hNormDesc">Specifies a handle to the normalization descriptor.</param>
/// <param name="fAlpha">Specifies the scaling factor applied to the result.</param>
/// <param name="hBottomDataDesc">Specifies a handle to the bottom data tensor descriptor.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="hTemp1">Specifies a handle to the first temporary buffer in GPU memory.</param>
/// <param name="hTemp2">Specifies a handle to the second temporary buffer in GPU memory.</param>
/// <param name="fBeta">Specifies the scaling factor applied to the prior destination value.</param>
/// <param name="hTopDataDesc">Specifies a handle to the top data tensor descriptor.</param>
/// <param name="hTopData">Specifies a handle to the top data in GPU memory.</param>
public void DivisiveNormalizationForward(long hCuDnn, long hNormDesc, T fAlpha, long hBottomDataDesc, long hBottomData, long hTemp1, long hTemp2, T fBeta, long hTopDataDesc, long hTopData)
{
    // Pack the handle arguments once; the embedded zeros are placeholders used by the kernel.
    long[] rgArg = m_param.AsLong(hCuDnn, hNormDesc, 0, hBottomDataDesc, hBottomData, hTemp1, hTemp2, 0, hTopDataDesc, hTopData);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.LCN_CC_FWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.LCN_CC_FWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4415
/// <summary>
/// Run the cuDnn divisive normalization (LCN) backward pass.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hNormDesc">Specifies a handle to the normalization descriptor.</param>
/// <param name="fAlpha">Specifies the scaling factor applied to the result.</param>
/// <param name="hBottomDataDesc">Specifies a handle to the bottom data tensor descriptor.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="hTopDiff">Specifies a handle to the top diff in GPU memory.</param>
/// <param name="hTemp1">Specifies a handle to the first temporary buffer in GPU memory.</param>
/// <param name="hTemp2">Specifies a handle to the second temporary buffer in GPU memory.</param>
/// <param name="fBeta">Specifies the scaling factor applied to the prior destination value.</param>
/// <param name="hBottomDiffDesc">Specifies a handle to the bottom diff tensor descriptor.</param>
/// <param name="hBottomDiff">Specifies a handle to the bottom diff in GPU memory.</param>
public void DivisiveNormalizationBackward(long hCuDnn, long hNormDesc, T fAlpha, long hBottomDataDesc, long hBottomData, long hTopDiff, long hTemp1, long hTemp2, T fBeta, long hBottomDiffDesc, long hBottomDiff)
{
    // Pack the handle arguments once; the embedded zeros are placeholders used by the kernel.
    long[] rgArg = m_param.AsLong(hCuDnn, hNormDesc, 0, hBottomDataDesc, hBottomData, hTopDiff, hTemp1, hTemp2, 0, hBottomDiffDesc, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.LCN_CC_BWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.LCN_CC_BWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4440
/// <summary>
/// Run the cuDnn tanh activation forward pass.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="fAlpha">Specifies the scaling factor applied to the result.</param>
/// <param name="hBottomDataDesc">Specifies a handle to the bottom data tensor descriptor.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="fBeta">Specifies the scaling factor applied to the prior destination value.</param>
/// <param name="hTopDataDesc">Specifies a handle to the top data tensor descriptor.</param>
/// <param name="hTopData">Specifies a handle to the top data in GPU memory.</param>
public void TanhForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
{
    // Pack the handle arguments once; the embedded zeros are placeholders used by the kernel.
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.TANH_FWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.TANH_FWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4458
/// <summary>
/// Run the cuDnn tanh activation backward pass.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="fAlpha">Specifies the scaling factor applied to the result.</param>
/// <param name="hTopDataDesc">Specifies a handle to the top data tensor descriptor.</param>
/// <param name="hTopData">Specifies a handle to the top data in GPU memory.</param>
/// <param name="hTopDiffDesc">Specifies a handle to the top diff tensor descriptor.</param>
/// <param name="hTopDiff">Specifies a handle to the top diff in GPU memory.</param>
/// <param name="hBottomDataDesc">Specifies a handle to the bottom data tensor descriptor.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="fBeta">Specifies the scaling factor applied to the prior destination value.</param>
/// <param name="hBottomDiffDesc">Specifies a handle to the bottom diff tensor descriptor.</param>
/// <param name="hBottomDiff">Specifies a handle to the bottom diff in GPU memory.</param>
public void TanhBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
{
    // Pack the handle arguments once; the embedded zeros are placeholders used by the kernel.
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.TANH_BWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.TANH_BWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4480
/// <summary>
/// Run the cuDnn ELU activation forward pass.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="fAlpha">Specifies the scaling factor applied to the result.</param>
/// <param name="hBottomDataDesc">Specifies a handle to the bottom data tensor descriptor.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="fBeta">Specifies the scaling factor applied to the prior destination value.</param>
/// <param name="hTopDataDesc">Specifies a handle to the top data tensor descriptor.</param>
/// <param name="hTopData">Specifies a handle to the top data in GPU memory.</param>
public void EluForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
{
    // Pack the handle arguments once; the embedded zeros are placeholders used by the kernel.
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.ELU_FWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.ELU_FWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4498
/// <summary>
/// Run the cuDnn ELU activation backward pass.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="fAlpha">Specifies the scaling factor applied to the result.</param>
/// <param name="hTopDataDesc">Specifies a handle to the top data tensor descriptor.</param>
/// <param name="hTopData">Specifies a handle to the top data in GPU memory.</param>
/// <param name="hTopDiffDesc">Specifies a handle to the top diff tensor descriptor.</param>
/// <param name="hTopDiff">Specifies a handle to the top diff in GPU memory.</param>
/// <param name="hBottomDataDesc">Specifies a handle to the bottom data tensor descriptor.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="fBeta">Specifies the scaling factor applied to the prior destination value.</param>
/// <param name="hBottomDiffDesc">Specifies a handle to the bottom diff tensor descriptor.</param>
/// <param name="hBottomDiff">Specifies a handle to the bottom diff in GPU memory.</param>
public void EluBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
{
    // Pack the handle arguments once; the embedded zeros are placeholders used by the kernel.
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.ELU_BWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.ELU_BWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4520
/// <summary>
/// Run the cuDnn sigmoid activation forward pass.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="fAlpha">Specifies the scaling factor applied to the result.</param>
/// <param name="hBottomDataDesc">Specifies a handle to the bottom data tensor descriptor.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="fBeta">Specifies the scaling factor applied to the prior destination value.</param>
/// <param name="hTopDataDesc">Specifies a handle to the top data tensor descriptor.</param>
/// <param name="hTopData">Specifies a handle to the top data in GPU memory.</param>
public void SigmoidForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
{
    // Pack the handle arguments once; the embedded zeros are placeholders used by the kernel.
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SIGMOID_FWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SIGMOID_FWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4538
/// <summary>
/// Run the cuDnn sigmoid activation backward pass.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="fAlpha">Specifies the scaling factor applied to the result.</param>
/// <param name="hTopDataDesc">Specifies a handle to the top data tensor descriptor.</param>
/// <param name="hTopData">Specifies a handle to the top data in GPU memory.</param>
/// <param name="hTopDiffDesc">Specifies a handle to the top diff tensor descriptor.</param>
/// <param name="hTopDiff">Specifies a handle to the top diff in GPU memory.</param>
/// <param name="hBottomDataDesc">Specifies a handle to the bottom data tensor descriptor.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="fBeta">Specifies the scaling factor applied to the prior destination value.</param>
/// <param name="hBottomDiffDesc">Specifies a handle to the bottom diff tensor descriptor.</param>
/// <param name="hBottomDiff">Specifies a handle to the bottom diff in GPU memory.</param>
public void SigmoidBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
{
    // Pack the handle arguments once; the embedded zeros are placeholders used by the kernel.
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SIGMOID_BWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SIGMOID_BWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4560
/// <summary>
/// Run the cuDnn ReLU activation forward pass.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="fAlpha">Specifies the scaling factor applied to the result.</param>
/// <param name="hBottomDataDesc">Specifies a handle to the bottom data tensor descriptor.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="fBeta">Specifies the scaling factor applied to the prior destination value.</param>
/// <param name="hTopDataDesc">Specifies a handle to the top data tensor descriptor.</param>
/// <param name="hTopData">Specifies a handle to the top data in GPU memory.</param>
public void ReLUForward(long hCuDnn, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
{
    // Pack the handle arguments once; the embedded zeros are placeholders used by the kernel.
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.RELU_FWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.RELU_FWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4583
/// <summary>
/// Run the cuDnn ReLU activation backward pass.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="fAlpha">Specifies the scaling factor applied to the result.</param>
/// <param name="hTopDataDesc">Specifies a handle to the top data tensor descriptor.</param>
/// <param name="hTopData">Specifies a handle to the top data in GPU memory.</param>
/// <param name="hTopDiffDesc">Specifies a handle to the top diff tensor descriptor.</param>
/// <param name="hTopDiff">Specifies a handle to the top diff in GPU memory.</param>
/// <param name="hBottomDataDesc">Specifies a handle to the bottom data tensor descriptor.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="fBeta">Specifies the scaling factor applied to the prior destination value.</param>
/// <param name="hBottomDiffDesc">Specifies a handle to the bottom diff tensor descriptor.</param>
/// <param name="hBottomDiff">Specifies a handle to the bottom diff in GPU memory.</param>
public void ReLUBackward(long hCuDnn, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, long hBottomDataDesc, long hBottomData, T fBeta, long hBottomDiffDesc, long hBottomDiff)
{
    // Pack the handle arguments once; the embedded zeros are placeholders used by the kernel.
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, hBottomDataDesc, hBottomData, 0, hBottomDiffDesc, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.RELU_BWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.RELU_BWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4605
/// <summary>
/// Run the cuDnn softmax forward pass.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="alg">Specifies the softmax algorithm to use.</param>
/// <param name="mode">Specifies the softmax mode to use.</param>
/// <param name="fAlpha">Specifies the scaling factor applied to the result.</param>
/// <param name="hBottomDataDesc">Specifies a handle to the bottom data tensor descriptor.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="fBeta">Specifies the scaling factor applied to the prior destination value.</param>
/// <param name="hTopDataDesc">Specifies a handle to the top data tensor descriptor.</param>
/// <param name="hTopData">Specifies a handle to the top data in GPU memory.</param>
public void SoftmaxForward(long hCuDnn, SOFTMAX_ALGORITHM alg, SOFTMAX_MODE mode, T fAlpha, long hBottomDataDesc, long hBottomData, T fBeta, long hTopDataDesc, long hTopData)
{
    // Pack the handle arguments once; the algorithm and mode ride along as trailing values.
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hBottomDataDesc, hBottomData, 0, hTopDataDesc, hTopData, (int)alg, (int)mode);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SOFTMAX_FWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SOFTMAX_FWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4625
/// <summary>
/// Run the cuDnn softmax backward pass.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="alg">Specifies the softmax algorithm to use.</param>
/// <param name="mode">Specifies the softmax mode to use.</param>
/// <param name="fAlpha">Specifies the scaling factor applied to the result.</param>
/// <param name="hTopDataDesc">Specifies a handle to the top data tensor descriptor.</param>
/// <param name="hTopData">Specifies a handle to the top data in GPU memory.</param>
/// <param name="hTopDiffDesc">Specifies a handle to the top diff tensor descriptor.</param>
/// <param name="hTopDiff">Specifies a handle to the top diff in GPU memory.</param>
/// <param name="fBeta">Specifies the scaling factor applied to the prior destination value.</param>
/// <param name="hBottomDiffDesc">Specifies a handle to the bottom diff tensor descriptor.</param>
/// <param name="hBottomDiff">Specifies a handle to the bottom diff in GPU memory.</param>
public void SoftmaxBackward(long hCuDnn, SOFTMAX_ALGORITHM alg, SOFTMAX_MODE mode, T fAlpha, long hTopDataDesc, long hTopData, long hTopDiffDesc, long hTopDiff, T fBeta, long hBottomDiffDesc, long hBottomDiff)
{
    // Pack the handle arguments once; the algorithm and mode ride along as trailing values.
    long[] rgArg = m_param.AsLong(hCuDnn, 0, hTopDataDesc, hTopData, hTopDiffDesc, hTopDiff, 0, hBottomDiffDesc, hBottomDiff, (int)alg, (int)mode);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SOFTMAX_BWD, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SOFTMAX_BWD, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
4647
/// <summary>
/// Create a new RNN data descriptor, using the extended version when enabled.
/// </summary>
/// <returns>Returns the handle to the newly created RNN data descriptor.</returns>
public long CreateRnnDataDesc()
{
    // Select the extended or standard kernel function based on the configured mode.
    int nFn = (m_bEnableRnnExtendedVersion) ? (int)CUDAFN.CREATE_RNN_DATA_DESCEX : (int)CUDAFN.CREATE_RNN_DATA_DESC;

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgResult = m_cuda.RunDouble((int)m_hKernel, nFn, null);
        return (long)rgResult[0];
    }

    // FLOAT path.
    float[] rgResultF = m_cuda.RunFloat((int)m_hKernel, nFn, null);
    return (long)rgResultF[0];
}
4667
/// <summary>
/// Free a previously created RNN data descriptor, using the extended version when enabled.
/// </summary>
/// <param name="h">Specifies the handle to the RNN data descriptor to free.</param>
public void FreeRnnDataDesc(long h)
{
    // Select the extended or standard kernel function based on the configured mode.
    int nFn = (m_bEnableRnnExtendedVersion) ? (int)CUDAFN.FREE_RNN_DATA_DESCEX : (int)CUDAFN.FREE_RNN_DATA_DESC;

    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDouble((int)m_hKernel, nFn, m_param.AsDouble(h));
        return;
    }

    m_cuda.RunFloat((int)m_hKernel, nFn, m_param.AsFloat(h));
}
4681
/// <summary>
/// Set the values of an RNN data descriptor.
/// </summary>
/// <param name="hRnnDataDesc">Specifies the handle to the RNN data descriptor.</param>
/// <param name="layout">Specifies the data layout ordering.</param>
/// <param name="nMaxSeqLen">Specifies the maximum sequence length.</param>
/// <param name="nBatchSize">Specifies the batch size.</param>
/// <param name="nVectorSize">Specifies the vector size.</param>
/// <param name="bBidirectional">Optionally, specifies whether the RNN is bidirectional (default = false).</param>
/// <param name="rgSeqLen">Optionally, specifies the per-item sequence lengths appended to the argument list (default = null).</param>
/// <exception cref="Exception">Thrown when a non SEQ_MAJOR layout is used without the extended version enabled.</exception>
public void SetRnnDataDesc(long hRnnDataDesc, RNN_DATALAYOUT layout, int nMaxSeqLen, int nBatchSize, int nVectorSize, bool bBidirectional = false, int[] rgSeqLen = null)
{
    if (!m_bEnableRnnExtendedVersion && layout != RNN_DATALAYOUT.RNN_SEQ_MAJOR_UNPACKED)
        throw new Exception("The non-extended functions only support RNN_SEQ_MAJOR ordering.");

    // Select the extended or standard kernel function based on the configured mode.
    int nFn = (m_bEnableRnnExtendedVersion) ? (int)CUDAFN.SET_RNN_DATA_DESCEX : (int)CUDAFN.SET_RNN_DATA_DESC;

    // Build the argument list once (the original duplicated this per data type).
    List<long> rgArg = new List<long>() { hRnnDataDesc, (long)layout, nMaxSeqLen, nBatchSize, nVectorSize, (bBidirectional) ? 1 : 0 };

    if (rgSeqLen != null)
    {
        foreach (int nLen in rgSeqLen)
        {
            rgArg.Add(nLen);
        }
    }

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, nFn, null, rgArg.ToArray());
    else
        m_cuda.RunFloatEx2((int)m_hKernel, nFn, null, rgArg.ToArray());
}
4728
/// <summary>
/// Create a new RNN descriptor within the low-level kernel.
/// </summary>
/// <returns>Returns the handle to the newly created RNN descriptor.</returns>
public long CreateRnnDesc()
{
    if (m_dt == DataType.DOUBLE)
    {
        double[] rgResult = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CREATE_RNN_DESC, null);
        return (long)rgResult[0];
    }

    // FLOAT path.
    float[] rgResultF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CREATE_RNN_DESC, null);
    return (long)rgResultF[0];
}
4746
/// <summary>
/// Free a previously created RNN descriptor.
/// </summary>
/// <param name="h">Specifies the handle to the RNN descriptor to free.</param>
public void FreeRnnDesc(long h)
{
    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.FREE_RNN_DESC, m_param.AsDouble(h));
        return;
    }

    m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.FREE_RNN_DESC, m_param.AsFloat(h));
}
4758
/// <summary>
/// Set the values of an RNN descriptor.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hRnnDesc">Specifies the handle to the RNN descriptor.</param>
/// <param name="nHiddenCount">Specifies the number of hidden units.</param>
/// <param name="nNumLayers">Specifies the number of layers.</param>
/// <param name="hDropoutDesc">Specifies a handle to the dropout descriptor (or 0).</param>
/// <param name="mode">Specifies the RNN cell mode (e.g. LSTM, GRU).</param>
/// <param name="bUseTensorCores">Specifies whether to use tensor cores.</param>
/// <param name="direction">Optionally, specifies the RNN direction (default = RNN_UNIDIRECTIONAL).</param>
public void SetRnnDesc(long hCuDnn, long hRnnDesc, int nHiddenCount, int nNumLayers, long hDropoutDesc, RNN_MODE mode, bool bUseTensorCores, RNN_DIRECTION direction = RNN_DIRECTION.RNN_UNIDIRECTIONAL)
{
    // Pack the argument list once, then dispatch on the base data type.
    long[] rgArg = m_param.AsLong(hCuDnn, hRnnDesc, nHiddenCount, nNumLayers, hDropoutDesc, (int)mode, (bUseTensorCores) ? 1 : 0, (long)direction);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.SET_RNN_DESC, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.SET_RNN_DESC, null, rgArg);
}
4777
/// <summary>
/// Query the number of RNN parameters for the given descriptors.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hRnnDesc">Specifies the handle to the RNN descriptor.</param>
/// <param name="hXDesc">Specifies the handle to the input data descriptor.</param>
/// <returns>Returns the parameter count.</returns>
public int GetRnnParamCount(long hCuDnn, long hRnnDesc, long hXDesc)
{
    // The trailing flag tells the kernel whether the extended RNN version is in use.
    long[] rgArg = m_param.AsLong(hCuDnn, hRnnDesc, hXDesc, (m_bEnableRnnExtendedVersion) ? 1 : 0);

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgResult = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.GET_RNN_PARAMCOUNT, null, rgArg);
        return (int)rgResult[0];
    }

    // FLOAT path.
    float[] rgResultF = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.GET_RNN_PARAMCOUNT, null, rgArg);
    return (int)rgResultF[0];
}
4798
/// <summary>
/// Query the RNN workspace and reserved item counts for the given descriptors.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hRnnDesc">Specifies the handle to the RNN descriptor.</param>
/// <param name="hXDesc">Specifies the handle to the input data descriptor.</param>
/// <param name="nReservedCount">Returns the reserved item count.</param>
/// <returns>Returns the workspace item count.</returns>
public ulong GetRnnWorkspaceCount(long hCuDnn, long hRnnDesc, long hXDesc, out ulong nReservedCount)
{
    // Note the flag precedes hXDesc in this call's argument ordering.
    long[] rgArg = m_param.AsLong(hCuDnn, hRnnDesc, (m_bEnableRnnExtendedVersion) ? 1 : 0, hXDesc);

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgResult = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.GET_RNN_WORKSPACECOUNT, null, rgArg);
        nReservedCount = (ulong)rgResult[1];
        return (ulong)rgResult[0];
    }

    // FLOAT path.
    float[] rgResultF = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.GET_RNN_WORKSPACECOUNT, null, rgArg);
    nReservedCount = (ulong)rgResultF[1];
    return (ulong)rgResultF[0];
}
4822
/// <summary>
/// Query the weight and bias information for a given RNN linear layer.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hRnnDesc">Specifies the handle to the RNN descriptor.</param>
/// <param name="nLayer">Specifies the layer index.</param>
/// <param name="hXDesc">Specifies the handle to the input data descriptor.</param>
/// <param name="hWtDesc">Specifies the handle to the weight descriptor.</param>
/// <param name="hWtData">Specifies the handle to the weight data in GPU memory.</param>
/// <param name="nLinLayer">Specifies the linear layer index.</param>
/// <param name="nWtCount">Returns the weight count.</param>
/// <param name="hWt">Returns a handle to the weight data.</param>
/// <param name="nBiasCount">Returns the bias count.</param>
/// <param name="hBias">Returns a handle to the bias data.</param>
public void GetRnnLinLayerParams(long hCuDnn, long hRnnDesc, int nLayer, long hXDesc, long hWtDesc, long hWtData, int nLinLayer, out int nWtCount, out long hWt, out int nBiasCount, out long hBias)
{
    // Pack the argument list once, then dispatch on the base data type.
    long[] rgArg = m_param.AsLong(hCuDnn, hRnnDesc, nLayer, hXDesc, hWtDesc, hWtData, nLinLayer, (m_bEnableRnnExtendedVersion) ? 1 : 0);

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgResult = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.GET_RNN_LINLAYERPARAMS, null, rgArg);
        nWtCount = (int)rgResult[0];
        hWt = (long)rgResult[1];
        nBiasCount = (int)rgResult[2];
        hBias = (long)rgResult[3];
    }
    else
    {
        float[] rgResult = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.GET_RNN_LINLAYERPARAMS, null, rgArg);
        nWtCount = (int)rgResult[0];
        hWt = (long)rgResult[1];
        nBiasCount = (int)rgResult[2];
        hBias = (long)rgResult[3];
    }
}
4856
/// <summary>
/// Run the RNN forward pass.
/// </summary>
/// <remarks>
/// The handle arguments name the cuDnn instance, the RNN descriptor, the input (X), hidden (Hx/Hy),
/// cell (Cx/Cy), weight (Wt) and output (Y) descriptors/data, plus workspace and reserved memory.
/// </remarks>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hRnnDesc">Specifies the handle to the RNN descriptor.</param>
/// <param name="hXDesc">Specifies the handle to the input data descriptor.</param>
/// <param name="hXData">Specifies the handle to the input data in GPU memory.</param>
/// <param name="hHxDesc">Specifies the handle to the hidden input descriptor.</param>
/// <param name="hHxData">Specifies the handle to the hidden input in GPU memory.</param>
/// <param name="hCxDesc">Specifies the handle to the cell input descriptor.</param>
/// <param name="hCxData">Specifies the handle to the cell input in GPU memory.</param>
/// <param name="hWtDesc">Specifies the handle to the weight descriptor.</param>
/// <param name="hWtData">Specifies the handle to the weight data in GPU memory.</param>
/// <param name="hYDesc">Specifies the handle to the output data descriptor.</param>
/// <param name="hYData">Specifies the handle to the output data in GPU memory.</param>
/// <param name="hHyDesc">Specifies the handle to the hidden output descriptor.</param>
/// <param name="hHyData">Specifies the handle to the hidden output in GPU memory.</param>
/// <param name="hCyDesc">Specifies the handle to the cell output descriptor.</param>
/// <param name="hCyData">Specifies the handle to the cell output in GPU memory.</param>
/// <param name="hWorkspace">Specifies the handle to the workspace in GPU memory.</param>
/// <param name="nWsCount">Specifies the workspace item count.</param>
/// <param name="hReserved">Specifies the handle to the reserved memory in GPU memory.</param>
/// <param name="nResCount">Specifies the reserved item count.</param>
/// <param name="bTraining">Specifies whether the forward pass is run in training mode.</param>
public void RnnForward(long hCuDnn, long hRnnDesc, long hXDesc, long hXData, long hHxDesc, long hHxData, long hCxDesc, long hCxData, long hWtDesc, long hWtData, long hYDesc, long hYData, long hHyDesc, long hHyData, long hCyDesc, long hCyData, long hWorkspace, ulong nWsCount, long hReserved, ulong nResCount, bool bTraining)
{
    // Build the argument list once (the original duplicated this per data type).
    List<long> rgArg = new List<long>()
    {
        hCuDnn, hRnnDesc,
        hXDesc, hXData,
        hHxDesc, hHxData,
        hCxDesc, hCxData,
        hWtDesc, hWtData,
        hYDesc, hYData,
        hHyDesc, hHyData,
        hCyDesc, hCyData,
        hWorkspace, (long)nWsCount,
        hReserved, (long)nResCount,
        (bTraining) ? 1 : 0
    };

    // The extended version expects one extra trailing flag.
    if (m_bEnableRnnExtendedVersion)
        rgArg.Add(1);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.FWD_RNN, null, rgArg.ToArray());
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.FWD_RNN, null, rgArg.ToArray());
}
4952
/// <summary>
/// Run the RNN backward-data pass, computing the input (X), hidden (Hx) and cell (Cx) diffs.
/// </summary>
/// <remarks>
/// The handle arguments name the cuDnn instance, the RNN descriptor, the output (Y) data/diff,
/// hidden and cell output diffs (Hy/Cy), weights (Wt), hidden and cell inputs (Hx/Cx), the input
/// diff (X), the hidden/cell input diff descriptors (dHx/dCx), plus workspace and reserved memory.
/// </remarks>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hRnnDesc">Specifies the handle to the RNN descriptor.</param>
/// <param name="hYDesc">Specifies the handle to the output data descriptor.</param>
/// <param name="hYData">Specifies the handle to the output data in GPU memory.</param>
/// <param name="hYDiff">Specifies the handle to the output diff in GPU memory.</param>
/// <param name="hHyDesc">Specifies the handle to the hidden output descriptor.</param>
/// <param name="hHyDiff">Specifies the handle to the hidden output diff in GPU memory.</param>
/// <param name="hCyDesc">Specifies the handle to the cell output descriptor.</param>
/// <param name="hCyDiff">Specifies the handle to the cell output diff in GPU memory.</param>
/// <param name="hWtDesc">Specifies the handle to the weight descriptor.</param>
/// <param name="hWtData">Specifies the handle to the weight data in GPU memory.</param>
/// <param name="hHxDesc">Specifies the handle to the hidden input descriptor.</param>
/// <param name="hHxData">Specifies the handle to the hidden input in GPU memory.</param>
/// <param name="hCxDesc">Specifies the handle to the cell input descriptor.</param>
/// <param name="hCxData">Specifies the handle to the cell input in GPU memory.</param>
/// <param name="hXDesc">Specifies the handle to the input data descriptor.</param>
/// <param name="hXDiff">Specifies the handle to the input diff in GPU memory.</param>
/// <param name="hdHxDesc">Specifies the handle to the hidden input diff descriptor.</param>
/// <param name="hHxDiff">Specifies the handle to the hidden input diff in GPU memory.</param>
/// <param name="hdCxDesc">Specifies the handle to the cell input diff descriptor.</param>
/// <param name="hCxDiff">Specifies the handle to the cell input diff in GPU memory.</param>
/// <param name="hWorkspace">Specifies the handle to the workspace in GPU memory.</param>
/// <param name="nWsCount">Specifies the workspace item count.</param>
/// <param name="hReserved">Specifies the handle to the reserved memory in GPU memory.</param>
/// <param name="nResCount">Specifies the reserved item count.</param>
public void RnnBackwardData(long hCuDnn, long hRnnDesc, long hYDesc, long hYData, long hYDiff, long hHyDesc, long hHyDiff, long hCyDesc, long hCyDiff, long hWtDesc, long hWtData, long hHxDesc, long hHxData, long hCxDesc, long hCxData, long hXDesc, long hXDiff, long hdHxDesc, long hHxDiff, long hdCxDesc, long hCxDiff, long hWorkspace, ulong nWsCount, long hReserved, ulong nResCount)
{
    // Build the argument list once (the original duplicated this per data type).
    List<long> rgArg = new List<long>()
    {
        hCuDnn, hRnnDesc,
        hYDesc, hYData, hYDiff,
        hHyDesc, hHyDiff,
        hCyDesc, hCyDiff,
        hWtDesc, hWtData,
        hHxDesc, hHxData,
        hCxDesc, hCxData,
        hXDesc, hXDiff,
        hdHxDesc, hHxDiff,
        hdCxDesc, hCxDiff,
        hWorkspace, (long)nWsCount,
        hReserved, (long)nResCount
    };

    // The extended version expects one extra trailing flag.
    if (m_bEnableRnnExtendedVersion)
        rgArg.Add(1);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_RNN_DATA, null, rgArg.ToArray());
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_RNN_DATA, null, rgArg.ToArray());
}
5062
/// <summary>
/// Run the RNN backward-weights pass, computing the weight diffs.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hRnnDesc">Specifies the handle to the RNN descriptor.</param>
/// <param name="hXDesc">Specifies the handle to the input data descriptor.</param>
/// <param name="hXData">Specifies the handle to the input data in GPU memory.</param>
/// <param name="hHxDesc">Specifies the handle to the hidden input descriptor.</param>
/// <param name="hHxData">Specifies the handle to the hidden input in GPU memory.</param>
/// <param name="hYDesc">Specifies the handle to the output data descriptor.</param>
/// <param name="hYData">Specifies the handle to the output data in GPU memory.</param>
/// <param name="hWorkspace">Specifies the handle to the workspace in GPU memory.</param>
/// <param name="nWsCount">Specifies the workspace item count.</param>
/// <param name="hWtDesc">Specifies the handle to the weight descriptor.</param>
/// <param name="hWtDiff">Specifies the handle to the weight diff in GPU memory.</param>
/// <param name="hReserved">Specifies the handle to the reserved memory in GPU memory.</param>
/// <param name="nResCount">Specifies the reserved item count.</param>
public void RnnBackwardWeights(long hCuDnn, long hRnnDesc, long hXDesc, long hXData, long hHxDesc, long hHxData, long hYDesc, long hYData, long hWorkspace, ulong nWsCount, long hWtDesc, long hWtDiff, long hReserved, ulong nResCount)
{
    // Build the argument list once (the original duplicated this per data type).
    List<long> rgArg = new List<long>()
    {
        hCuDnn, hRnnDesc,
        hXDesc, hXData,
        hHxDesc, hHxData,
        hYDesc, hYData,
        hWorkspace, (long)nWsCount,
        hWtDesc, hWtDiff,
        hReserved, (long)nResCount
    };

    // The extended version expects one extra trailing flag.
    if (m_bEnableRnnExtendedVersion)
        rgArg.Add(1);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.BWD_RNN_WTS, null, rgArg.ToArray());
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.BWD_RNN_WTS, null, rgArg.ToArray());
}
5137
5138
/// <summary>
/// Query whether the Rnn8 functionality is supported by the low-level kernel.
/// </summary>
/// <returns>Returns <c>true</c> when the kernel reports Rnn8 support (result value 1), <c>false</c> otherwise.</returns>
public bool IsRnn8Supported()
{
    if (m_dt == DataType.DOUBLE)
    {
        double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.RNN8_IS_SUPPORTED, null);
        // Simplified from the redundant '(x == 1) ? true : false' form.
        return rg[0] == 1;
    }
    else
    {
        float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.RNN8_IS_SUPPORTED, null);
        return rg[0] == 1;
    }
}
5155
/// <summary>
/// Create a new Rnn8 instance within the low-level kernel.
/// </summary>
/// <returns>Returns the handle to the newly created Rnn8 instance.</returns>
public long CreateRnn8()
{
    if (m_dt == DataType.DOUBLE)
    {
        double[] rgResult = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.RNN8_CREATE, null);
        return (long)rgResult[0];
    }

    // FLOAT path.
    float[] rgResultF = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.RNN8_CREATE, null);
    return (long)rgResultF[0];
}
5173
/// <summary>
/// Free a previously created Rnn8 instance.
/// </summary>
/// <param name="h">Specifies the handle to the Rnn8 instance to free.</param>
public void FreeRnn8(long h)
{
    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.RNN8_FREE, m_param.AsDouble(h));
        return;
    }

    m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.RNN8_FREE, m_param.AsFloat(h));
}
5185
/// <summary>
/// Set the configuration values of an Rnn8 instance.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hRnn">Specifies the handle to the Rnn8 instance.</param>
/// <param name="bTraining">Specifies whether the Rnn8 is configured for training.</param>
/// <param name="layout">Specifies the data layout ordering.</param>
/// <param name="cellMode">Specifies the RNN cell mode (e.g. LSTM, GRU).</param>
/// <param name="biasMode">Specifies the RNN bias mode.</param>
/// <param name="nSequenceLen">Specifies the sequence length.</param>
/// <param name="nBatchSize">Specifies the batch size.</param>
/// <param name="nInputs">Specifies the input size.</param>
/// <param name="nHidden">Specifies the hidden size.</param>
/// <param name="nOutputs">Specifies the output size.</param>
/// <param name="nProjection">Specifies the projection size.</param>
/// <param name="nNumLayers">Specifies the number of layers.</param>
/// <param name="fDropout">Specifies the dropout ratio.</param>
/// <param name="lSeed">Specifies the random seed; note this is reinterpreted as a signed long when marshalled.</param>
/// <param name="bBidirectional">Optionally, specifies whether the RNN is bidirectional (default = false).</param>
public void SetRnn8(long hCuDnn, long hRnn, bool bTraining, RNN_DATALAYOUT layout, RNN_MODE cellMode, RNN_BIAS_MODE biasMode, int nSequenceLen, int nBatchSize, int nInputs, int nHidden, int nOutputs, int nProjection, int nNumLayers, float fDropout, ulong lSeed, bool bBidirectional = false)
{
    // Pack the argument list once, then dispatch on the base data type.
    long[] rgArg = m_param.AsLong(hCuDnn, hRnn, (bTraining) ? 1 : 0, (int)layout, (int)cellMode, (int)biasMode, nSequenceLen, nBatchSize, nInputs, nHidden, nOutputs, nProjection, nNumLayers, (long)lSeed, (bBidirectional) ? 1 : 0);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.RNN8_SET, m_param.AsDouble((double)fDropout), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.RNN8_SET, m_param.AsFloat(fDropout), rgArg);
}
5212
/// <summary>
/// Query the memory sizes required by an Rnn8 instance.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hRnn">Specifies the handle to the Rnn8 instance.</param>
/// <param name="szWtCount">Returns the weight count.</param>
/// <param name="szWorkSize">Returns the workspace size.</param>
/// <param name="szReservedSize">Returns the reserved memory size.</param>
public void GetRnn8MemorySizes(long hCuDnn, long hRnn, out ulong szWtCount, out ulong szWorkSize, out ulong szReservedSize)
{
    long[] rgArg = m_param.AsLong(hCuDnn, hRnn);

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgResult = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.RNN8_GET_MEMORY_SIZES, null, rgArg);
        szWtCount = (ulong)rgResult[0];
        szWorkSize = (ulong)rgResult[1];
        szReservedSize = (ulong)rgResult[2];
    }
    else
    {
        float[] rgResult = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.RNN8_GET_MEMORY_SIZES, null, rgArg);
        szWtCount = (ulong)rgResult[0];
        szWorkSize = (ulong)rgResult[1];
        szReservedSize = (ulong)rgResult[2];
    }
}
5238
/// <summary>
/// Initialize the weights and biases of an Rnn8 instance using the specified fillers.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hRnn">Specifies the handle to the Rnn8 instance.</param>
/// <param name="hWt">Specifies the handle to the weight memory in GPU memory.</param>
/// <param name="wtFt">Specifies the weight filler type.</param>
/// <param name="fWtVal">Specifies the first weight filler value.</param>
/// <param name="fWtVal2">Specifies the second weight filler value.</param>
/// <param name="biasFt">Specifies the bias filler type.</param>
/// <param name="fBiasVal">Specifies the first bias filler value.</param>
/// <param name="fBiasVal2">Specifies the second bias filler value.</param>
public void InitializeRnn8Weights(long hCuDnn, long hRnn, long hWt, RNN_FILLER_TYPE wtFt, double fWtVal, double fWtVal2, RNN_FILLER_TYPE biasFt, double fBiasVal, double fBiasVal2)
{
    // Pack the handle/enum arguments once, then dispatch on the base data type.
    long[] rgArg = m_param.AsLong(hCuDnn, hRnn, hWt, (int)wtFt, (int)biasFt);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.RNN8_INIT_WEIGHTS, m_param.AsDouble(fWtVal, fWtVal2, fBiasVal, fBiasVal2), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.RNN8_INIT_WEIGHTS, m_param.AsFloat((float)fWtVal, (float)fWtVal2, (float)fBiasVal, (float)fBiasVal2), rgArg);
}
5258
/// <summary>
/// Run the Rnn8 forward pass.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hRnn">Specifies the handle to the Rnn8 instance.</param>
/// <param name="hX">Specifies the handle to the input data in GPU memory.</param>
/// <param name="hY">Specifies the handle to the output data in GPU memory.</param>
/// <param name="hhX">Specifies the handle to the hidden input in GPU memory.</param>
/// <param name="hhY">Specifies the handle to the hidden output in GPU memory.</param>
/// <param name="hcX">Specifies the handle to the cell input in GPU memory.</param>
/// <param name="hcY">Specifies the handle to the cell output in GPU memory.</param>
/// <param name="hWts">Specifies the handle to the weights in GPU memory.</param>
/// <param name="hWork">Specifies the handle to the workspace in GPU memory.</param>
/// <param name="hReserved">Specifies the handle to the reserved memory in GPU memory.</param>
public void Rnn8Forward(long hCuDnn, long hRnn, long hX, long hY, long hhX, long hhY, long hcX, long hcY, long hWts, long hWork, long hReserved)
{
    // Pack the handle arguments once, then dispatch on the base data type.
    long[] rgArg = m_param.AsLong(hCuDnn, hRnn, hX, hY, hhX, hhY, hcX, hcY, hWts, hWork, hReserved);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.RNN8_FWD, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.RNN8_FWD, null, rgArg);
}
5280
/// <summary>
/// Run the Rnn8 backward pass, computing the data and weight diffs.
/// </summary>
/// <param name="hCuDnn">Specifies a handle to the cuDnn instance.</param>
/// <param name="hRnn">Specifies the handle to the Rnn8 instance.</param>
/// <param name="hY">Specifies the handle to the output data in GPU memory.</param>
/// <param name="hdY">Specifies the handle to the output diff in GPU memory.</param>
/// <param name="hX">Specifies the handle to the input data in GPU memory.</param>
/// <param name="hdX">Specifies the handle to the input diff in GPU memory.</param>
/// <param name="hhX">Specifies the handle to the hidden input in GPU memory.</param>
/// <param name="hdhY">Specifies the handle to the hidden output diff in GPU memory.</param>
/// <param name="hdhX">Specifies the handle to the hidden input diff in GPU memory.</param>
/// <param name="hcX">Specifies the handle to the cell input in GPU memory.</param>
/// <param name="hdcY">Specifies the handle to the cell output diff in GPU memory.</param>
/// <param name="hdcX">Specifies the handle to the cell input diff in GPU memory.</param>
/// <param name="hWt">Specifies the handle to the weights in GPU memory.</param>
/// <param name="hdWt">Specifies the handle to the weight diff in GPU memory.</param>
/// <param name="hWork">Specifies the handle to the workspace in GPU memory.</param>
/// <param name="hReserved">Specifies the handle to the reserved memory in GPU memory.</param>
public void Rnn8Backward(long hCuDnn, long hRnn, long hY, long hdY, long hX, long hdX, long hhX, long hdhY, long hdhX, long hcX, long hdcY, long hdcX, long hWt, long hdWt, long hWork, long hReserved)
{
    // Pack the handle arguments once, then dispatch on the base data type.
    long[] rgArg = m_param.AsLong(hCuDnn, hRnn, hY, hdY, hX, hdX, hhX, hdhY, hdhX, hcX, hdcY, hdcX, hWt, hdWt, hWork, hReserved);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.RNN8_BWD, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.RNN8_BWD, null, rgArg);
}
5307
/// <summary>
/// Allocate GPU memory for the PCA data matrix (M x N items).
/// </summary>
/// <param name="nM">Specifies the number of data items.</param>
/// <param name="nN">Specifies the dimension of each data item.</param>
/// <param name="nK">Unused here; kept for signature symmetry with the other PCA allocation helpers.</param>
/// <param name="nCount">Returns the number of items allocated.</param>
/// <returns>Returns a handle to the allocated GPU memory.</returns>
public long AllocPCAData(int nM, int nN, int nK, out int nCount)
{
    int nItems = nM * nN;
    nCount = nItems;
    return AllocMemory(nItems);
}
5324
/// <summary>
/// Allocate GPU memory for the PCA scores matrix (M x K items).
/// </summary>
/// <param name="nM">Specifies the number of data items.</param>
/// <param name="nN">Unused here; kept for signature symmetry with the other PCA allocation helpers.</param>
/// <param name="nK">Specifies the number of components.</param>
/// <param name="nCount">Returns the number of items allocated.</param>
/// <returns>Returns a handle to the allocated GPU memory.</returns>
public long AllocPCAScores(int nM, int nN, int nK, out int nCount)
{
    int nItems = nM * nK;
    nCount = nItems;
    return AllocMemory(nItems);
}
5341
/// <summary>
/// Allocate GPU memory for the PCA loads matrix (N x K items).
/// </summary>
/// <param name="nM">Unused here; kept for signature symmetry with the other PCA allocation helpers.</param>
/// <param name="nN">Specifies the dimension of each data item.</param>
/// <param name="nK">Specifies the number of components.</param>
/// <param name="nCount">Returns the number of items allocated.</param>
/// <returns>Returns a handle to the allocated GPU memory.</returns>
public long AllocPCALoads(int nM, int nN, int nK, out int nCount)
{
    int nItems = nN * nK;
    nCount = nItems;
    return AllocMemory(nItems);
}
5358
/// <summary>
/// Allocate a host buffer for the PCA eigenvalues (K items). Note this allocates
/// a host buffer, unlike the other PCA helpers which allocate GPU memory.
/// </summary>
/// <param name="nM">Unused here; kept for signature symmetry with the other PCA allocation helpers.</param>
/// <param name="nN">Unused here; kept for signature symmetry with the other PCA allocation helpers.</param>
/// <param name="nK">Specifies the number of components.</param>
/// <param name="nCount">Returns the number of items allocated.</param>
/// <returns>Returns a handle to the allocated host buffer.</returns>
public long AllocPCAEigenvalues(int nM, int nN, int nK, out int nCount)
{
    int nItems = nK * 1;
    nCount = nItems;
    return AllocHostBuffer(nItems);
}
5375
/// <summary>
/// Create a new PCA instance within the low-level kernel.
/// </summary>
/// <param name="nMaxIterations">Specifies the maximum number of iterations to run.</param>
/// <param name="nM">Specifies the number of data items.</param>
/// <param name="nN">Specifies the dimension of each data item.</param>
/// <param name="nK">Specifies the number of components.</param>
/// <param name="hData">Specifies a handle to the data in GPU memory.</param>
/// <param name="hScoresResult">Specifies a handle to the scores result in GPU memory.</param>
/// <param name="hLoadsResult">Specifies a handle to the loads result in GPU memory.</param>
/// <param name="hResiduals">Optionally, specifies a handle to the residuals in GPU memory (default = 0).</param>
/// <param name="hEigenvalues">Optionally, specifies a handle to the eigenvalues buffer (default = 0).</param>
/// <returns>Returns the handle to the newly created PCA instance.</returns>
public long CreatePCA(int nMaxIterations, int nM, int nN, int nK, long hData, long hScoresResult, long hLoadsResult, long hResiduals = 0, long hEigenvalues = 0)
{
    // Pack the argument list once, then dispatch on the base data type.
    long[] rgArg = m_param.AsLong(nMaxIterations, nM, nN, nK, hData, hScoresResult, hLoadsResult, hResiduals, hEigenvalues);

    if (m_dt == DataType.DOUBLE)
    {
        double[] rgResult = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CREATE_PCA, null, rgArg);
        return (long)rgResult[0];
    }

    // FLOAT path.
    float[] rgResultF = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CREATE_PCA, null, rgArg);
    return (long)rgResultF[0];
}
5405
/// <summary>
/// Run a number of steps of an iterative PCA instance.
/// </summary>
/// <param name="hPCA">Specifies the handle to the PCA instance.</param>
/// <param name="nSteps">Specifies the number of steps to run.</param>
/// <param name="nCurrentK">Returns the current component being calculated.</param>
/// <param name="nCurrentIteration">Returns the current iteration.</param>
/// <returns>Returns <c>true</c> when the PCA has completed (kernel result value 1), <c>false</c> otherwise.</returns>
public bool RunPCA(long hPCA, int nSteps, out int nCurrentK, out int nCurrentIteration)
{
    bool bDone;

    if (m_dt == DataType.DOUBLE)
    {
        double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_RUN_PCA, null, m_param.AsLong(hPCA, nSteps));
        // Simplified from the redundant '(x == 1.0) ? true : false' form.
        bDone = (rg[0] == 1.0);
        nCurrentIteration = (int)rg[1];
        nCurrentK = (int)rg[2];
    }
    else
    {
        float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_RUN_PCA, null, m_param.AsLong(hPCA, nSteps));
        bDone = (rg[0] == 1.0f);
        nCurrentIteration = (int)rg[1];
        nCurrentK = (int)rg[2];
    }

    return bDone;
}
5438
/// <summary>
/// Free a previously created PCA instance.
/// </summary>
/// <param name="hPCA">Specifies the handle to the PCA instance to free.</param>
public void FreePCA(long hPCA)
{
    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CUDA_FREE_PCA, m_param.AsDouble(hPCA));
        return;
    }

    m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CUDA_FREE_PCA, m_param.AsFloat(hPCA));
}
5453
/// <summary>
/// Create a new SSD (single-shot multibox detection) instance within the low-level kernel.
/// </summary>
/// <param name="nNumClasses">Specifies the number of classes.</param>
/// <param name="bShareLocation">Specifies whether the location is shared across classes.</param>
/// <param name="nLocClasses">Specifies the number of location classes.</param>
/// <param name="nBackgroundLabelId">Specifies the background label id.</param>
/// <param name="bUseDiffcultGt">Specifies whether to use difficult ground truths.</param>
/// <param name="miningType">Specifies the hard-example mining type.</param>
/// <param name="matchType">Specifies the matching type.</param>
/// <param name="fOverlapThreshold">Specifies the overlap threshold.</param>
/// <param name="bUsePriorForMatching">Specifies whether to use the prior for matching.</param>
/// <param name="codeType">Specifies the coding type.</param>
/// <param name="bEncodeVariantInTgt">Specifies whether to encode the variance in the target.</param>
/// <param name="bBpInside">Specifies whether to back-propagate inside.</param>
/// <param name="bIgnoreCrossBoundaryBbox">Specifies whether to ignore cross-boundary bounding boxes.</param>
/// <param name="bUsePriorForNms">Specifies whether to use the prior for NMS.</param>
/// <param name="confLossType">Specifies the confidence loss type.</param>
/// <param name="locLossType">Specifies the location loss type.</param>
/// <param name="fNegPosRatio">Specifies the negative/positive ratio.</param>
/// <param name="fNegOverlap">Specifies the negative overlap.</param>
/// <param name="nSampleSize">Specifies the sample size.</param>
/// <param name="bMapObjectToAgnostic">Specifies whether to map objects to agnostic.</param>
/// <param name="bNmsParam">Specifies whether the NMS parameters are supplied.</param>
/// <param name="fNmsThreshold">Optionally, specifies the NMS threshold (required when bNmsParam is true).</param>
/// <param name="nNmsTopK">Optionally, specifies the NMS top-k value (default = -1 when unset).</param>
/// <param name="fNmsEta">Optionally, specifies the NMS eta value (default = 1 when unset).</param>
/// <returns>Returns the handle to the newly created SSD instance.</returns>
/// <exception cref="Exception">Thrown when bNmsParam is true but no NMS threshold is given.</exception>
public long CreateSSD(int nNumClasses, bool bShareLocation, int nLocClasses, int nBackgroundLabelId, bool bUseDiffcultGt, SSD_MINING_TYPE miningType, SSD_MATCH_TYPE matchType, float fOverlapThreshold, bool bUsePriorForMatching, SSD_CODE_TYPE codeType, bool bEncodeVariantInTgt, bool bBpInside, bool bIgnoreCrossBoundaryBbox, bool bUsePriorForNms, SSD_CONF_LOSS_TYPE confLossType, SSD_LOC_LOSS_TYPE locLossType, float fNegPosRatio, float fNegOverlap, int nSampleSize, bool bMapObjectToAgnostic, bool bNmsParam, float? fNmsThreshold = null, int? nNmsTopK = null, float? fNmsEta = null)
{
    int nGpuID = GetDeviceID();

    // Build the argument list once in double precision (the original duplicated
    // these 25 entries per data type).  All inputs are int/float values, so the
    // double -> float conversion used on the float path below is exact.
    List<double> rgArg = new List<double>()
    {
        /*  0 */ nGpuID,
        /*  1 */ nNumClasses,
        /*  2 */ (bShareLocation) ? 1 : 0,
        /*  3 */ nLocClasses,
        /*  4 */ nBackgroundLabelId,
        /*  5 */ (bUseDiffcultGt) ? 1 : 0,
        /*  6 */ (int)miningType,
        /*  7 */ (int)matchType,
        /*  8 */ fOverlapThreshold,
        /*  9 */ (bUsePriorForMatching) ? 1 : 0,
        /* 10 */ (int)codeType,
        /* 11 */ (bEncodeVariantInTgt) ? 1 : 0,
        /* 12 */ (bBpInside) ? 1 : 0,
        /* 13 */ (bIgnoreCrossBoundaryBbox) ? 1 : 0,
        /* 14 */ (bUsePriorForNms) ? 1 : 0,
        /* 15 */ (int)confLossType,
        /* 16 */ (int)locLossType,
        /* 17 */ fNegPosRatio,
        /* 18 */ fNegOverlap,
        /* 19 */ nSampleSize,
        /* 20 */ (bMapObjectToAgnostic) ? 1 : 0,
        /* 21 */ (bNmsParam) ? 1 : 0
    };

    if (bNmsParam)
    {
        if (!fNmsThreshold.HasValue)
            throw new Exception("An NMS threshold must be specified when the 'bNmsParam' is true.");

        /* 22 */ rgArg.Add(fNmsThreshold.GetValueOrDefault(0));
        /* 23 */ rgArg.Add(nNmsTopK.GetValueOrDefault(-1));
        /* 24 */ rgArg.Add(fNmsEta.GetValueOrDefault(1));
    }

    if (m_dt == DataType.DOUBLE)
    {
        double[] rg = m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CUDA_CREATE_SSD, rgArg.ToArray());
        return (long)rg[0];
    }
    else
    {
        float[] rg = m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CUDA_CREATE_SSD, rgArg.Select(d => (float)d).ToArray());
        return (long)rg[0];
    }
}
5617
/// <summary>
/// Setup the SSD GPU support for the given item counts.
/// </summary>
/// <param name="hSSD">Handle to the SSD instance previously created.</param>
/// <param name="nNum">Specifies the number of items.</param>
/// <param name="nNumPriors">Specifies the number of priors.</param>
/// <param name="nNumGt">Specifies the number of ground truths.</param>
public void SetupSSD(long hSSD, int nNum, int nNumPriors, int nNumGt)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_SETUP_SSD;

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDouble(nKernel, nFn, m_param.AsDouble(hSSD, nNum, nNumPriors, nNumGt));
    else
        m_cuda.RunFloat(nKernel, nFn, m_param.AsFloat(hSSD, nNum, nNumPriors, nNumGt));
}
5632
/// <summary>
/// Free the SSD GPU instance associated with the handle.
/// </summary>
/// <param name="hSSD">Handle to the SSD instance to free.</param>
public void FreeSSD(long hSSD)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_FREE_SSD;

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDouble(nKernel, nFn, m_param.AsDouble(hSSD));
    else
        m_cuda.RunFloat(nKernel, nFn, m_param.AsFloat(hSSD));
}
5644
/// <summary>
/// Run the SSD MultiBox loss forward pass on the GPU and decode the match/negative
/// index lists returned in the kernel's serialized result buffer.
/// </summary>
/// <param name="hSSD">Handle to the SSD instance.</param>
/// <param name="nLocDataCount">Count of the location data items.</param>
/// <param name="hLocGpuData">Handle to the location GPU data.</param>
/// <param name="nConfDataCount">Count of the confidence data items.</param>
/// <param name="hConfGpuData">Handle to the confidence GPU data.</param>
/// <param name="nPriorDataCount">Count of the prior data items.</param>
/// <param name="hPriorGpuData">Handle to the prior GPU data.</param>
/// <param name="nGtDataCount">Count of the ground-truth data items.</param>
/// <param name="hGtGpuData">Handle to the ground-truth GPU data.</param>
/// <param name="rgAllMatchIndices">Returns, per image, a map of label to matched prior indices.</param>
/// <param name="rgrgAllNegIndices">Returns, per image, the list of negative (unmatched) indices.</param>
/// <param name="nNumNegs">Returns the total number of negatives.</param>
/// <returns>Returns the total match count reported by the kernel.</returns>
/// <remarks>
/// The kernel returns a flat array with the layout:
///   [matchCount, numNegs,
///    mapListCount, { mapCount, { label, itemCount, { itemIdx }* }* }*,
///    negListCount, { itemCount, { itemIdx }* }*]
/// and nIdx walks this buffer sequentially.  The double and float branches are
/// identical aside from the element type of the returned buffer.
/// </remarks>
public int SsdMultiBoxLossForward(long hSSD, int nLocDataCount, long hLocGpuData, int nConfDataCount, long hConfGpuData, int nPriorDataCount, long hPriorGpuData, int nGtDataCount, long hGtGpuData, out List<DictionaryMap<List<int>>> rgAllMatchIndices, out List<List<int>> rgrgAllNegIndices, out int nNumNegs)
{
    int nIdx = 0;           // Read cursor into the serialized result buffer.
    int nMatchCount = 0;
    rgAllMatchIndices = new List<DictionaryMap<List<int>>>();
    rgrgAllNegIndices = new List<List<int>>();

    if (m_dt == DataType.DOUBLE)
    {
        double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SSD_FWD_MULTIBOXLOSS, null, m_param.AsLong(hSSD, nLocDataCount, hLocGpuData, nConfDataCount, hConfGpuData, nPriorDataCount, hPriorGpuData, nGtDataCount, hGtGpuData));
        nMatchCount = (int)rg[nIdx];
        nIdx++;
        nNumNegs = (int)rg[nIdx];
        nIdx++;

        // Get the match indices.
        int nNumAllMatchIndices = (int)rg[nIdx];
        nIdx++;
        for (int i = 0; i < nNumAllMatchIndices; i++)
        {
            DictionaryMap<List<int>> map = new DictionaryMap<List<int>>(null);

            // Each map entry is: label, item count, then the item indices.
            int nMapCount = (int)rg[nIdx];
            nIdx++;
            for (int j = 0; j < nMapCount; j++)
            {
                int nLabel = (int)rg[nIdx];
                nIdx++;
                List<int> rgIdx = new List<int>();

                int nItemCount = (int)rg[nIdx];
                nIdx++;
                for (int k = 0; k < nItemCount; k++)
                {
                    int nItemIdx = (int)rg[nIdx];
                    nIdx++;
                    rgIdx.Add(nItemIdx);
                }

                map[nLabel] = rgIdx;
            }

            rgAllMatchIndices.Add(map);
        }

        // Get the neg indices.
        int nNegListCount = (int)rg[nIdx];
        nIdx++;
        for (int i = 0; i < nNegListCount; i++)
        {
            int nItemCount = (int)rg[nIdx];
            nIdx++;
            List<int> rgItems = new List<int>();

            for (int j = 0; j < nItemCount; j++)
            {
                int nItemIdx = (int)rg[nIdx];
                nIdx++;
                rgItems.Add(nItemIdx);
            }

            rgrgAllNegIndices.Add(rgItems);
        }
    }
    else
    {
        // Float path: same serialized layout, float-typed buffer.
        float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SSD_FWD_MULTIBOXLOSS, null, m_param.AsLong(hSSD, nLocDataCount, hLocGpuData, nConfDataCount, hConfGpuData, nPriorDataCount, hPriorGpuData, nGtDataCount, hGtGpuData));
        nMatchCount = (int)rg[nIdx];
        nIdx++;
        nNumNegs = (int)rg[nIdx];
        nIdx++;

        // Get the match indices.
        int nMapListCount = (int)rg[nIdx];
        nIdx++;
        for (int i = 0; i < nMapListCount; i++)
        {
            DictionaryMap<List<int>> map = new DictionaryMap<List<int>>(null);

            int nMapCount = (int)rg[nIdx];
            nIdx++;
            for (int j = 0; j < nMapCount; j++)
            {
                int nLabel = (int)rg[nIdx];
                nIdx++;
                List<int> rgIdx = new List<int>();

                int nItemCount = (int)rg[nIdx];
                nIdx++;
                for (int k = 0; k < nItemCount; k++)
                {
                    int nItemIdx = (int)rg[nIdx];
                    nIdx++;
                    rgIdx.Add(nItemIdx);
                }

                map[nLabel] = rgIdx;
            }

            rgAllMatchIndices.Add(map);
        }

        // Get the neg indices.
        int nNegListCount = (int)rg[nIdx];
        nIdx++;
        for (int i = 0; i < nNegListCount; i++)
        {
            int nItemCount = (int)rg[nIdx];
            nIdx++;
            List<int> rgItems = new List<int>();

            for (int j = 0; j < nItemCount; j++)
            {
                int nItemIdx = (int)rg[nIdx];
                nIdx++;
                rgItems.Add(nItemIdx);
            }

            rgrgAllNegIndices.Add(rgItems);
        }
    }

    return nMatchCount;
}
5785
/// <summary>
/// Encode the SSD location prediction and ground-truth data on the GPU.
/// </summary>
/// <param name="hSSD">Handle to the SSD instance.</param>
/// <param name="nLocPredCount">Count of the location prediction items.</param>
/// <param name="hLocPred">Handle to the location prediction GPU data.</param>
/// <param name="nLocGtCount">Count of the location ground-truth items.</param>
/// <param name="hLocGt">Handle to the location ground-truth GPU data.</param>
public void SsdEncodeLocPrediction(long hSSD, int nLocPredCount, long hLocPred, int nLocGtCount, long hLocGt)
{
    var rgArg = m_param.AsLong(hSSD, nLocPredCount, hLocPred, nLocGtCount, hLocGt);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SSD_ENCODE_LOCPRED, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SSD_ENCODE_LOCPRED, null, rgArg);
}
5801
/// <summary>
/// Encode the SSD confidence prediction and ground-truth data on the GPU.
/// </summary>
/// <param name="hSSD">Handle to the SSD instance.</param>
/// <param name="nConfPredCount">Count of the confidence prediction items.</param>
/// <param name="hConfPred">Handle to the confidence prediction GPU data.</param>
/// <param name="nConfGtCount">Count of the confidence ground-truth items.</param>
/// <param name="hConfGt">Handle to the confidence ground-truth GPU data.</param>
public void SsdEncodeConfPrediction(long hSSD, int nConfPredCount, long hConfPred, int nConfGtCount, long hConfGt)
{
    var rgArg = m_param.AsLong(hSSD, nConfPredCount, hConfPred, nConfGtCount, hConfGt);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SSD_ENCODE_CONFPRED, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SSD_ENCODE_CONFPRED, null, rgArg);
}
5817
/// <summary>
/// Create a LayerNorm instance on the GPU.
/// </summary>
/// <param name="nGpuID">Specifies the GPU device id.</param>
/// <param name="nCount">Specifies the total item count.</param>
/// <param name="nOuterNum">Specifies the outer (batch) count.</param>
/// <param name="nChannels">Specifies the channel count.</param>
/// <param name="nInnerNum">Specifies the inner (spatial) count.</param>
/// <param name="fEps">Optionally, specifies the epsilon used for numerical stability (default = 1e-10f).</param>
/// <returns>Returns the handle to the new LayerNorm instance.</returns>
public long CreateLayerNorm(int nGpuID, int nCount, int nOuterNum, int nChannels, int nInnerNum, float fEps = 1e-10f)
{
    var rgArg = m_param.AsLong(nGpuID, nCount, nOuterNum, nChannels, nInnerNum, 0);

    if (m_dt == DataType.DOUBLE)
        return (long)m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CREATE_LAYERNORM, m_param.AsDouble(fEps), rgArg)[0];

    return (long)m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CREATE_LAYERNORM, m_param.AsFloat(fEps), rgArg)[0];
}
5841
/// <summary>
/// Free the LayerNorm instance associated with the handle.
/// </summary>
/// <param name="hLayerNorm">Handle to the LayerNorm instance to free.</param>
public void FreeLayerNorm(long hLayerNorm)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_FREE_LAYERNORM;

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDouble(nKernel, nFn, m_param.AsDouble(hLayerNorm));
    else
        m_cuda.RunFloat(nKernel, nFn, m_param.AsFloat(hLayerNorm));
}
5853
/// <summary>
/// Run the LayerNorm forward pass.
/// </summary>
/// <param name="hLayerNorm">Handle to the LayerNorm instance.</param>
/// <param name="hXdata">Handle to the input data on the GPU.</param>
/// <param name="hYdata">Handle to the output data on the GPU.</param>
public void LayerNormForward(long hLayerNorm, long hXdata, long hYdata)
{
    var rgArg = m_param.AsLong(hLayerNorm, hXdata, hYdata);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LAYERNORM_FWD, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LAYERNORM_FWD, null, rgArg);
}
5867
/// <summary>
/// Run the LayerNorm backward pass.
/// </summary>
/// <param name="hLayerNorm">Handle to the LayerNorm instance.</param>
/// <param name="hYdata">Handle to the output data on the GPU.</param>
/// <param name="hYdiff">Handle to the output gradients on the GPU.</param>
/// <param name="hXdiff">Handle to the input gradients on the GPU.</param>
public void LayerNormBackward(long hLayerNorm, long hYdata, long hYdiff, long hXdiff)
{
    var rgArg = m_param.AsLong(hLayerNorm, hYdata, hYdiff, hXdiff);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LAYERNORM_BWD, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LAYERNORM_BWD, null, rgArg);
}
5882
5883 #endregion
5884
5885 //---------------------------------------------------------------------
5886 // ICudaMath Methods
5887 //---------------------------------------------------------------------
5888 #region ICudaMath Methods
5889
/// <summary>
/// Set all (or one) item(s) of GPU memory to a double value, converted to the base type 'T'.
/// </summary>
/// <param name="nCount">Specifies the number of items to set.</param>
/// <param name="hHandle">Handle to the GPU memory.</param>
/// <param name="fVal">Specifies the value to set.</param>
/// <param name="nIdx">Optionally, specifies a single index to set (default = -1, set all).</param>
public void set(int nCount, long hHandle, double fVal, int nIdx = -1)
{
    T fValT = (T)Convert.ChangeType(fVal, typeof(T));
    set(nCount, hHandle, fValT, nIdx);
}
5901
/// <summary>
/// Set all (or one) item(s) of GPU memory to a float value, converted to the base type 'T'.
/// </summary>
/// <param name="nCount">Specifies the number of items to set.</param>
/// <param name="hHandle">Handle to the GPU memory.</param>
/// <param name="fVal">Specifies the value to set.</param>
/// <param name="nIdx">Optionally, specifies a single index to set (default = -1, set all).</param>
public void set(int nCount, long hHandle, float fVal, int nIdx = -1)
{
    T fValT = (T)Convert.ChangeType(fVal, typeof(T));
    set(nCount, hHandle, fValT, nIdx);
}
5913
/// <summary>
/// Set all (or one) item(s) of GPU memory to a value of type 'T'.
/// </summary>
/// <param name="nCount">Specifies the number of items to set.</param>
/// <param name="hHandle">Handle to the GPU memory (or ghost-memory key when ghost memory is active).</param>
/// <param name="fVal">Specifies the value to set.</param>
/// <param name="nIdx">Optionally, specifies a single index to set (default = -1, set all).</param>
/// <param name="nXOff">Optionally, specifies an offset into the memory (default = 0).</param>
public void set(int nCount, long hHandle, T fVal, int nIdx = -1, int nXOff = 0)
{
    // Ghost-memory path: writes go to the host-side store instead of the GPU.
    // Previously this branch was duplicated verbatim under both data types;
    // it is data-type independent, so handle it once up front.
    if (m_rgGhostMemory != null)
    {
        if (nIdx >= 0)
            m_rgGhostMemory[hHandle][nIdx] = fVal;
        else
            Utility.Set<T>(m_rgGhostMemory[hHandle], fVal);
        return;
    }

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SET, m_param.AsDouble(convertD(fVal)), m_param.AsLong(nCount, hHandle, 0, nIdx, nXOff));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SET, m_param.AsFloat(convertF(fVal)), m_param.AsLong(nCount, hHandle, 0, nIdx, nXOff));
}
5953
/// <summary>
/// Retrieve GPU memory as an array of doubles.
/// </summary>
/// <param name="nCount">Specifies the number of items to retrieve.</param>
/// <param name="hHandle">Handle to the GPU memory.</param>
/// <param name="nIdx">Optionally, specifies a single index to retrieve (default = -1, all).</param>
/// <returns>Returns the data as a double array.</returns>
public double[] get_double(int nCount, long hHandle, int nIdx = -1)
{
    T[] rgData = get(nCount, hHandle, nIdx);
    return convertD(rgData);
}
5965
/// <summary>
/// Retrieve GPU memory as an array of floats.
/// </summary>
/// <param name="nCount">Specifies the number of items to retrieve.</param>
/// <param name="hHandle">Handle to the GPU memory.</param>
/// <param name="nIdx">Optionally, specifies a single index to retrieve (default = -1, all).</param>
/// <returns>Returns the data as a float array.</returns>
public float[] get_float(int nCount, long hHandle, int nIdx = -1)
{
    T[] rgData = get(nCount, hHandle, nIdx);
    return convertF(rgData);
}
5977
/// <summary>
/// Retrieve GPU memory as an array of the base type 'T'.
/// </summary>
/// <param name="nCount">Specifies the number of items to retrieve.</param>
/// <param name="hHandle">Handle to the GPU memory.</param>
/// <param name="nIdx">Optionally, specifies a single index to retrieve (default = -1, all).</param>
/// <returns>Returns the data as an array of 'T'.</returns>
public T[] get(int nCount, long hHandle, int nIdx = -1)
{
    var rgArg = m_param.AsLong(nCount, hHandle, nIdx);

    return (m_dt == DataType.DOUBLE)
        ? convert(m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GET, null, rgArg))
        : convert(m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GET, null, rgArg));
}
5992
/// <summary>
/// Copy data from one GPU memory block to another, optionally on a stream and with half-size overrides.
/// </summary>
/// <param name="nCount">Specifies the number of items to copy.</param>
/// <param name="hSrc">Handle to the source GPU memory.</param>
/// <param name="hDst">Handle to the destination GPU memory.</param>
/// <param name="nSrcOffset">Optionally, specifies the source offset (default = 0).</param>
/// <param name="nDstOffset">Optionally, specifies the destination offset (default = 0).</param>
/// <param name="hStream">Optionally, specifies a stream handle to copy on (default = -1).</param>
/// <param name="bSrcHalfSizeOverride">Optionally, overrides the half-size setting of the source (default = null, no override).</param>
/// <param name="bDstHalfSizeOverride">Optionally, overrides the half-size setting of the destination (default = null, no override).</param>
public void copy(int nCount, long hSrc, long hDst, int nSrcOffset = 0, int nDstOffset = 0, long hStream = -1, bool? bSrcHalfSizeOverride = null, bool? bDstHalfSizeOverride = null)
{
    // Encode tri-state overrides: -1 = no override, 0 = false, 1 = true.
    int nSrcHalf = bSrcHalfSizeOverride.HasValue ? (bSrcHalfSizeOverride.Value ? 1 : 0) : -1;
    int nDstHalf = bDstHalfSizeOverride.HasValue ? (bDstHalfSizeOverride.Value ? 1 : 0) : -1;

    var rgArg = m_param.AsLong(nCount, hSrc, hDst, nSrcOffset, nDstOffset, hStream, nSrcHalf, nDstHalf);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY, null, rgArg);
}
6023
/// <summary>
/// Copy from one of two sources into the destination based on a similarity selector.
/// </summary>
/// <param name="nCount">Specifies the number of items to copy.</param>
/// <param name="nNum">Specifies the number of items.</param>
/// <param name="nDim">Specifies the dimension of each item.</param>
/// <param name="hSrc1">Handle to the first source GPU memory.</param>
/// <param name="hSrc2">Handle to the second source GPU memory.</param>
/// <param name="hDst">Handle to the destination GPU memory.</param>
/// <param name="hSimilar">Handle to the similarity selector GPU memory.</param>
/// <param name="bInvert">Optionally, inverts the selector (default = false).</param>
public void copy(int nCount, int nNum, int nDim, long hSrc1, long hSrc2, long hDst, long hSimilar, bool bInvert = false)
{
    int nInvert = bInvert ? 1 : 0;
    var rgArg = m_param.AsLong(nCount, nNum, nDim, hSrc1, hSrc2, hDst, hSimilar, nInvert);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_SIM, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_SIM, null, rgArg);
}
6042
/// <summary>
/// Copy a batch of labeled data into a cache organized by label.
/// </summary>
/// <param name="nCount">Specifies the number of items to copy.</param>
/// <param name="nNum">Specifies the number of items.</param>
/// <param name="nDim">Specifies the dimension of each item.</param>
/// <param name="hSrcData">Handle to the source data GPU memory.</param>
/// <param name="hSrcLbl">Handle to the source label GPU memory.</param>
/// <param name="nDstCount">Specifies the destination item count.</param>
/// <param name="hDstCache">Handle to the destination cache GPU memory.</param>
/// <param name="hWorkDevData">Handle to device-side work memory.</param>
/// <param name="nLabelStart">Specifies the starting label.</param>
/// <param name="nLabelCount">Specifies the number of labels.</param>
/// <param name="nCacheSize">Specifies the per-label cache size.</param>
/// <param name="hCacheHostCursors">Handle to host-side cache cursors.</param>
/// <param name="hWorkDataHost">Handle to host-side work memory.</param>
public void copy_batch(int nCount, int nNum, int nDim, long hSrcData, long hSrcLbl, int nDstCount, long hDstCache, long hWorkDevData, int nLabelStart, int nLabelCount, int nCacheSize, long hCacheHostCursors, long hWorkDataHost)
{
    var rgArg = m_param.AsLong(nCount, nNum, nDim, hSrcData, hSrcLbl, nDstCount, hDstCache, hWorkDevData, nLabelStart, nLabelCount, nCacheSize, hCacheHostCursors, hWorkDataHost);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_BATCH, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_BATCH, null, rgArg);
}
6069
/// <summary>
/// Copy sequence data (anchor plus nK comparison items, optionally labels) from the cache
/// into the top blobs.
/// </summary>
/// <param name="nK">Specifies the number of comparison items (range [0,10]).</param>
/// <param name="nNum">Specifies the number of items.</param>
/// <param name="nDim">Specifies the dimension of each item.</param>
/// <param name="hSrcData">Handle to the source data GPU memory.</param>
/// <param name="hSrcLbl">Handle to the source label GPU memory.</param>
/// <param name="nSrcCacheCount">Specifies the source cache item count.</param>
/// <param name="hSrcCache">Handle to the source cache GPU memory.</param>
/// <param name="nLabelStart">Specifies the starting label.</param>
/// <param name="nLabelCount">Specifies the number of labels.</param>
/// <param name="nCacheSize">Specifies the per-label cache size.</param>
/// <param name="hCacheHostCursors">Handle to host-side cache cursors.</param>
/// <param name="bOutputLabels">Specifies whether labels are output as an additional top.</param>
/// <param name="rghTop">Specifies the handles of the top blobs; count must equal 2 + nK (+1 when outputting labels).</param>
/// <param name="rgnTopCount">Specifies the item counts of the top blobs; count must match 'rghTop'.</param>
/// <param name="hWorkDataHost">Handle to host-side work memory.</param>
/// <param name="bCombinePositiveAndNegative">Optionally, combines positive and negative items (requires nK == 0; default = false).</param>
/// <param name="nSeed">Optionally, specifies a random seed (default = 0).</param>
/// <exception cref="ArgumentOutOfRangeException">Thrown when the parameter constraints above are violated.</exception>
public void copy_sequence(int nK, int nNum, int nDim, long hSrcData, long hSrcLbl, int nSrcCacheCount, long hSrcCache, int nLabelStart, int nLabelCount, int nCacheSize, long hCacheHostCursors, bool bOutputLabels, List<long> rghTop, List<int> rgnTopCount, long hWorkDataHost, bool bCombinePositiveAndNegative = false, int nSeed = 0)
{
    int nTopCount = 2 + nK;

    if (bOutputLabels)
        nTopCount++;

    if (bCombinePositiveAndNegative && nK != 0)
        throw new ArgumentOutOfRangeException("nK", "When using 'bCombinePositiveAndNegative', nK should be 0.");

    if (nK < 0 || nK > 10)
        throw new ArgumentOutOfRangeException("nK", "The 'nK' parameter must be within the range [0,10]!");

    if (rghTop.Count != nTopCount)
        throw new ArgumentOutOfRangeException("rghTop", "The 'rghTop' count must equal '" + nTopCount.ToString() + "' given nK = " + nK.ToString() + " and bOutputLabels = " + bOutputLabels.ToString() + "!");

    if (rgnTopCount.Count != rghTop.Count)
        throw new ArgumentOutOfRangeException("rgnTopCount", "The 'rgnTopCount' count must equal the 'rghTop' count!");

    // The argument list is identical for both data types, so build it once
    // (previously duplicated verbatim in both branches).
    List<long> rgarg = new List<long>() { nK, nNum, nDim, hSrcData, hSrcLbl, nSrcCacheCount, hSrcCache, nLabelStart, nLabelCount, nCacheSize, hCacheHostCursors, (bOutputLabels) ? 1 : 0, hWorkDataHost, (bCombinePositiveAndNegative) ? 1 : 0, nSeed };

    rgarg.AddRange(rghTop);
    rgarg.AddRange(rgnTopCount.Select(n => (long)n));

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_SEQUENCE, null, rgarg.ToArray());
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_SEQUENCE, null, rgarg.ToArray());
}
6147
/// <summary>
/// Copy a strided sequence from the source into the destination, optionally copying a spatial-dim sub-range.
/// </summary>
/// <param name="n">Specifies the total item count.</param>
/// <param name="hSrc">Handle to the source GPU memory.</param>
/// <param name="nSrcStep">Specifies the source step.</param>
/// <param name="nSrcStartIdx">Specifies the source starting index.</param>
/// <param name="nCopyCount">Specifies the number of items to copy.</param>
/// <param name="nCopyDim">Specifies the dimension of each copied item.</param>
/// <param name="hDst">Handle to the destination GPU memory.</param>
/// <param name="nDstStep">Specifies the destination step.</param>
/// <param name="nDstStartIdx">Specifies the destination starting index.</param>
/// <param name="nSrcSpatialDim">Specifies the source spatial dimension.</param>
/// <param name="nDstSpatialDim">Specifies the destination spatial dimension.</param>
/// <param name="nSrcSpatialDimStartIdx">Optionally, specifies the source spatial-dim starting index (default = 0).</param>
/// <param name="nDstSpatialDimStartIdx">Optionally, specifies the destination spatial-dim starting index (default = 0).</param>
/// <param name="nSpatialDimCount">Optionally, specifies the spatial-dim count to copy (default = -1, all).</param>
public void copy_sequence(int n, long hSrc, int nSrcStep, int nSrcStartIdx, int nCopyCount, int nCopyDim, long hDst, int nDstStep, int nDstStartIdx, int nSrcSpatialDim, int nDstSpatialDim, int nSrcSpatialDimStartIdx = 0, int nDstSpatialDimStartIdx = 0, int nSpatialDimCount = -1)
{
    var rgArg = m_param.AsLong(n, hSrc, nSrcStep, nSrcStartIdx, nCopyCount, nCopyDim, hDst, nDstStep, nDstStartIdx, nSrcSpatialDim, nDstSpatialDim, nSrcSpatialDimStartIdx, nDstSpatialDimStartIdx, nSpatialDimCount);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_SEQUENCE2, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_SEQUENCE2, null, rgArg);
}
6172
/// <summary>
/// Expand-copy the source into the destination, replicating each source item across the dimension.
/// </summary>
/// <param name="n">Specifies the total item count.</param>
/// <param name="nNum">Specifies the number of items.</param>
/// <param name="nDim">Specifies the dimension of each item.</param>
/// <param name="hX">Handle to the source GPU memory.</param>
/// <param name="hA">Handle to the destination GPU memory.</param>
public void copy_expand(int n, int nNum, int nDim, long hX, long hA)
{
    var rgArg = m_param.AsLong(n, nNum, nDim, hX, hA);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_EXPAND, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_EXPAND, null, rgArg);
}
6189
/// <summary>
/// Fill the destination by repeating a source slice.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="nDim">Specifies the dimension of each item.</param>
/// <param name="hSrc">Handle to the source GPU memory.</param>
/// <param name="nSrcOff">Specifies the source offset.</param>
/// <param name="nCount">Specifies the total destination count.</param>
/// <param name="hDst">Handle to the destination GPU memory.</param>
public void fill(int n, int nDim, long hSrc, int nSrcOff, int nCount, long hDst)
{
    var rgArg = m_param.AsLong(n, nDim, hSrc, nSrcOff, nCount, hDst);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_FILL, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COPY_FILL, null, rgArg);
}
6206
/// <summary>
/// Sort the items of GPU memory in place.
/// </summary>
/// <param name="nCount">Specifies the number of items to sort.</param>
/// <param name="hY">Handle to the GPU memory to sort.</param>
public void sort(int nCount, long hY)
{
    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SORT, null, m_param.AsLong(nCount, hY));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SORT, null, m_param.AsLong(nCount, hY)); // removed stray empty statement (extra ';')
}
6219
/// <summary>
/// Perform C = alpha * op(A) * op(B) + beta * C with double scalars converted to the base type 'T'.
/// </summary>
/// <param name="bTransA">Specifies whether to transpose A.</param>
/// <param name="bTransB">Specifies whether to transpose B.</param>
/// <param name="m">Specifies the row count of op(A) and C.</param>
/// <param name="n">Specifies the column count of op(B) and C.</param>
/// <param name="k">Specifies the shared dimension.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
/// <param name="hB">Handle to matrix B on the GPU.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
/// <param name="hC">Handle to matrix C on the GPU.</param>
public void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    T fBetaT = (T)Convert.ChangeType(fBeta, typeof(T));
    gemm(bTransA, bTransB, m, n, k, fAlphaT, hA, hB, fBetaT, hC);
}
6240
/// <summary>
/// Perform C = alpha * op(A) * op(B) + beta * C with float scalars converted to the base type 'T'.
/// </summary>
/// <param name="bTransA">Specifies whether to transpose A.</param>
/// <param name="bTransB">Specifies whether to transpose B.</param>
/// <param name="m">Specifies the row count of op(A) and C.</param>
/// <param name="n">Specifies the column count of op(B) and C.</param>
/// <param name="k">Specifies the shared dimension.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
/// <param name="hB">Handle to matrix B on the GPU.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
/// <param name="hC">Handle to matrix C on the GPU.</param>
public void gemm(bool bTransA, bool bTransB, int m, int n, int k, float fAlpha, long hA, long hB, float fBeta, long hC)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    T fBetaT = (T)Convert.ChangeType(fBeta, typeof(T));
    gemm(bTransA, bTransB, m, n, k, fAlphaT, hA, hB, fBetaT, hC);
}
6261
/// <summary>
/// Perform C = alpha * op(A) * op(B) + beta * C, with optional offsets and grouping.
/// </summary>
/// <param name="bTransA">Specifies whether to transpose A.</param>
/// <param name="bTransB">Specifies whether to transpose B.</param>
/// <param name="m">Specifies the row count of op(A) and C.</param>
/// <param name="n">Specifies the column count of op(B) and C.</param>
/// <param name="k">Specifies the shared dimension.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
/// <param name="hB">Handle to matrix B on the GPU.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
/// <param name="hC">Handle to matrix C on the GPU.</param>
/// <param name="nAOffset">Optionally, specifies the offset into A (default = 0).</param>
/// <param name="nBOffset">Optionally, specifies the offset into B (default = 0).</param>
/// <param name="nCOffset">Optionally, specifies the offset into C (default = 0).</param>
/// <param name="nGroups">Optionally, specifies the group count (default = 1).</param>
/// <param name="nGroupOffsetA">Optionally, specifies the per-group offset for A (default = 0).</param>
/// <param name="nGroupOffsetB">Optionally, specifies the per-group offset for B (default = 0).</param>
/// <param name="nGroupOffsetC">Optionally, specifies the per-group offset for C (default = 0).</param>
public void gemm(bool bTransA, bool bTransB, int m, int n, int k, T fAlpha, long hA, long hB, T fBeta, long hC, int nAOffset = 0, int nBOffset = 0, int nCOffset = 0, int nGroups = 1, int nGroupOffsetA = 0, int nGroupOffsetB = 0, int nGroupOffsetC = 0)
{
    int nTransA = bTransA ? 1 : 0;
    int nTransB = bTransB ? 1 : 0;
    var rgArg = m_param.AsLong(nTransA, nTransB, m, n, k, 0, hA, hB, 0, hC, nAOffset, nBOffset, nCOffset, nGroups, nGroupOffsetA, nGroupOffsetB, nGroupOffsetC);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMM, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMM, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
6292
/// <summary>
/// Perform C = alpha * op(A) * op(B) + beta * C with explicit leading dimensions.
/// </summary>
/// <param name="bTransA">Specifies whether to transpose A.</param>
/// <param name="bTransB">Specifies whether to transpose B.</param>
/// <param name="m">Specifies the row count of op(A) and C.</param>
/// <param name="n">Specifies the column count of op(B) and C.</param>
/// <param name="k">Specifies the shared dimension.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
/// <param name="hB">Handle to matrix B on the GPU.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
/// <param name="hC">Handle to matrix C on the GPU.</param>
/// <param name="lda">Specifies the leading dimension of A.</param>
/// <param name="ldb">Specifies the leading dimension of B.</param>
/// <param name="ldc">Specifies the leading dimension of C.</param>
public void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC, uint lda, uint ldb, uint ldc)
{
    int nTransA = bTransA ? 1 : 0;
    int nTransB = bTransB ? 1 : 0;
    var rgArg = m_param.AsLong(nTransA, nTransB, m, n, k, 0, hA, hB, 0, hC, lda, ldb, ldc);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMM2, m_param.AsDouble(fAlpha, fBeta), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMM2, m_param.AsFloat((float)fAlpha, (float)fBeta), rgArg);
}
6319
/// <summary>
/// Perform a strided, batched C = alpha * op(A) * op(B) + beta * C.
/// </summary>
/// <param name="bTransA">Specifies whether to transpose A.</param>
/// <param name="bTransB">Specifies whether to transpose B.</param>
/// <param name="m">Specifies the row count of op(A) and C.</param>
/// <param name="n">Specifies the column count of op(B) and C.</param>
/// <param name="k">Specifies the shared dimension.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
/// <param name="hB">Handle to matrix B on the GPU.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
/// <param name="hC">Handle to matrix C on the GPU.</param>
/// <param name="lda">Specifies the leading dimension of A.</param>
/// <param name="ldb">Specifies the leading dimension of B.</param>
/// <param name="ldc">Specifies the leading dimension of C.</param>
/// <param name="stridea">Specifies the per-batch stride of A.</param>
/// <param name="strideb">Specifies the per-batch stride of B.</param>
/// <param name="stridec">Specifies the per-batch stride of C.</param>
/// <param name="batch_count">Specifies the number of batches.</param>
public void gemm(bool bTransA, bool bTransB, int m, int n, int k, double fAlpha, long hA, long hB, double fBeta, long hC, uint lda, uint ldb, uint ldc, uint stridea, uint strideb, uint stridec, uint batch_count)
{
    int nTransA = bTransA ? 1 : 0;
    int nTransB = bTransB ? 1 : 0;
    var rgArg = m_param.AsLong(nTransA, nTransB, m, n, k, 0, hA, hB, 0, hC, lda, ldb, ldc, stridea, strideb, stridec, batch_count);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMM2, m_param.AsDouble(fAlpha, fBeta), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMM2, m_param.AsFloat((float)fAlpha, (float)fBeta), rgArg);
}
6350
/// <summary>
/// Perform C = alpha * op(A) + beta * op(B) with double scalars converted to the base type 'T'.
/// </summary>
/// <param name="bTransA">Specifies whether to transpose A.</param>
/// <param name="bTransB">Specifies whether to transpose B.</param>
/// <param name="m">Specifies the row count.</param>
/// <param name="n">Specifies the column count.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
/// <param name="hB">Handle to matrix B on the GPU.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
/// <param name="hC">Handle to matrix C on the GPU.</param>
public void geam(bool bTransA, bool bTransB, int m, int n, double fAlpha, long hA, long hB, double fBeta, long hC)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    T fBetaT = (T)Convert.ChangeType(fBeta, typeof(T));
    geam(bTransA, bTransB, m, n, fAlphaT, hA, hB, fBetaT, hC);
}
6370
/// <summary>
/// Perform C = alpha * op(A) + beta * op(B) with float scalars converted to the base type 'T'.
/// </summary>
/// <param name="bTransA">Specifies whether to transpose A.</param>
/// <param name="bTransB">Specifies whether to transpose B.</param>
/// <param name="m">Specifies the row count.</param>
/// <param name="n">Specifies the column count.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
/// <param name="hB">Handle to matrix B on the GPU.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
/// <param name="hC">Handle to matrix C on the GPU.</param>
public void geam(bool bTransA, bool bTransB, int m, int n, float fAlpha, long hA, long hB, float fBeta, long hC)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    T fBetaT = (T)Convert.ChangeType(fBeta, typeof(T));
    geam(bTransA, bTransB, m, n, fAlphaT, hA, hB, fBetaT, hC);
}
6390
/// <summary>
/// Perform C = alpha * op(A) + beta * op(B), with optional offsets.
/// </summary>
/// <param name="bTransA">Specifies whether to transpose A.</param>
/// <param name="bTransB">Specifies whether to transpose B.</param>
/// <param name="m">Specifies the row count.</param>
/// <param name="n">Specifies the column count.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
/// <param name="hB">Handle to matrix B on the GPU.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
/// <param name="hC">Handle to matrix C on the GPU.</param>
/// <param name="nAOffset">Optionally, specifies the offset into A (default = 0).</param>
/// <param name="nBOffset">Optionally, specifies the offset into B (default = 0).</param>
/// <param name="nCOffset">Optionally, specifies the offset into C (default = 0).</param>
public void geam(bool bTransA, bool bTransB, int m, int n, T fAlpha, long hA, long hB, T fBeta, long hC, int nAOffset = 0, int nBOffset = 0, int nCOffset = 0)
{
    int nTransA = bTransA ? 1 : 0;
    int nTransB = bTransB ? 1 : 0;
    var rgArg = m_param.AsLong(nTransA, nTransB, m, n, 0, hA, hB, 0, hC, nAOffset, nBOffset, nCOffset);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEAM, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEAM, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
6416
/// <summary>
/// Perform y = alpha * op(A) * x + beta * y with double scalars converted to the base type 'T'.
/// </summary>
/// <param name="bTransA">Specifies whether to transpose A.</param>
/// <param name="m">Specifies the row count of A.</param>
/// <param name="n">Specifies the column count of A.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
public void gemv(bool bTransA, int m, int n, double fAlpha, long hA, long hX, double fBeta, long hY)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    T fBetaT = (T)Convert.ChangeType(fBeta, typeof(T));
    gemv(bTransA, m, n, fAlphaT, hA, hX, fBetaT, hY);
}
6435
/// <summary>
/// Perform y = alpha * op(A) * x + beta * y with float scalars converted to the base type 'T'.
/// </summary>
/// <param name="bTransA">Specifies whether to transpose A.</param>
/// <param name="m">Specifies the row count of A.</param>
/// <param name="n">Specifies the column count of A.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
public void gemv(bool bTransA, int m, int n, float fAlpha, long hA, long hX, float fBeta, long hY)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    T fBetaT = (T)Convert.ChangeType(fBeta, typeof(T));
    gemv(bTransA, m, n, fAlphaT, hA, hX, fBetaT, hY);
}
6454
/// <summary>
/// Perform y = alpha * op(A) * x + beta * y, with optional offsets.
/// </summary>
/// <param name="bTransA">Specifies whether to transpose A.</param>
/// <param name="m">Specifies the row count of A.</param>
/// <param name="n">Specifies the column count of A.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
/// <param name="nAOffset">Optionally, specifies the offset into A (default = 0).</param>
/// <param name="nXOffset">Optionally, specifies the offset into x (default = 0).</param>
/// <param name="nYOffset">Optionally, specifies the offset into y (default = 0).</param>
public void gemv(bool bTransA, int m, int n, T fAlpha, long hA, long hX, T fBeta, long hY, int nAOffset = 0, int nXOffset = 0, int nYOffset = 0)
{
    int nTransA = bTransA ? 1 : 0;
    var rgArg = m_param.AsLong(nTransA, m, n, 0, hA, hX, 0, hY, nAOffset, nXOffset, nYOffset);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMV, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GEMV, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
6479
/// <summary>
/// Perform the rank-1 update A = alpha * x * y^T + A with a double scalar converted to the base type 'T'.
/// </summary>
/// <param name="m">Specifies the length of x.</param>
/// <param name="n">Specifies the length of y.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
public void ger(int m, int n, double fAlpha, long hX, long hY, long hA)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    ger(m, n, fAlphaT, hX, hY, hA);
}
6496
/// <summary>
/// Perform the rank-1 update A = alpha * x * y^T + A with a float scalar converted to the base type 'T'.
/// </summary>
/// <param name="m">Specifies the length of x.</param>
/// <param name="n">Specifies the length of y.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
public void ger(int m, int n, float fAlpha, long hX, long hY, long hA)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    ger(m, n, fAlphaT, hX, hY, hA);
}
6513
/// <summary>
/// Perform the rank-1 update A = alpha * x * y^T + A.
/// </summary>
/// <param name="m">Specifies the length of x.</param>
/// <param name="n">Specifies the length of y.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
public void ger(int m, int n, T fAlpha, long hX, long hY, long hA)
{
    var rgArg = m_param.AsLong(m, n, 0, hX, hY, hA);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_GER, m_param.AsDouble(convertD(fAlpha)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_GER, m_param.AsFloat(convertF(fAlpha)), rgArg);
}
6533
/// <summary>
/// Perform y = alpha * x + y with a double scalar converted to the base type 'T'.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
public void axpy(int n, double fAlpha, long hX, long hY)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    axpy(n, fAlphaT, hX, hY);
}
6548
/// <summary>
/// Perform y = alpha * x + y with a float scalar converted to the base type 'T'.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
public void axpy(int n, float fAlpha, long hX, long hY)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    axpy(n, fAlphaT, hX, hY);
}
6563
/// <summary>
/// Perform y = alpha * x + y, with optional offsets.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
/// <param name="nXOff">Optionally, specifies the offset into x (default = 0).</param>
/// <param name="nYOff">Optionally, specifies the offset into y (default = 0).</param>
public void axpy(int n, T fAlpha, long hX, long hY, int nXOff = 0, int nYOff = 0)
{
    var rgArg = m_param.AsLong(n, 0, hX, hY, nXOff, nYOff);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_AXPY, m_param.AsDouble(convertD(fAlpha)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_AXPY, m_param.AsFloat(convertF(fAlpha)), rgArg);
}
6583
/// <summary>
/// Perform y = alpha * x + beta * y with double scalars converted to the base type 'T'.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
public void axpby(int n, double fAlpha, long hX, double fBeta, long hY)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    T fBetaT = (T)Convert.ChangeType(fBeta, typeof(T));
    axpby(n, fAlphaT, hX, fBetaT, hY);
}
6599
/// <summary>
/// Perform y = alpha * x + beta * y with float scalars converted to the base type 'T'.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
public void axpby(int n, float fAlpha, long hX, float fBeta, long hY)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    T fBetaT = (T)Convert.ChangeType(fBeta, typeof(T));
    axpby(n, fAlphaT, hX, fBetaT, hY);
}
6615
/// <summary>
/// Perform y = alpha * x + beta * y.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="fBeta">Specifies the beta scalar.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
public void axpby(int n, T fAlpha, long hX, T fBeta, long hY)
{
    var rgArg = m_param.AsLong(n, 0, hX, 0, hY);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_AXPBY, m_param.AsDouble(convertD(fAlpha), convertD(fBeta)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_AXPBY, m_param.AsFloat(convertF(fAlpha), convertF(fBeta)), rgArg);
}
6636
/// <summary>
/// Multiply A element-wise by the broadcast vector x, writing the result to B.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
/// <param name="nAOff">Specifies the offset into A.</param>
/// <param name="hX">Handle to the broadcast vector x on the GPU.</param>
/// <param name="nXOff">Specifies the offset into x.</param>
/// <param name="nC">Specifies the channel count.</param>
/// <param name="nSpatialDim">Specifies the spatial dimension.</param>
/// <param name="bTranspose">Specifies whether to broadcast transposed.</param>
/// <param name="hB">Handle to the output B on the GPU.</param>
/// <param name="nBOff">Specifies the offset into B.</param>
public void mulbsx(int n, long hA, int nAOff, long hX, int nXOff, int nC, int nSpatialDim, bool bTranspose, long hB, int nBOff)
{
    int nTranspose = bTranspose ? 1 : 0;
    var rgArg = m_param.AsLong(n, hA, nAOff, hX, nXOff, nC, nSpatialDim, nTranspose, hB, nBOff);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MULBSX, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MULBSX, null, rgArg);
}
6657
/// <summary>
/// Divide A element-wise by the broadcast vector x, writing the result to B.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
/// <param name="nAOff">Specifies the offset into A.</param>
/// <param name="hX">Handle to the broadcast vector x on the GPU.</param>
/// <param name="nXOff">Specifies the offset into x.</param>
/// <param name="nC">Specifies the channel count.</param>
/// <param name="nSpatialDim">Specifies the spatial dimension.</param>
/// <param name="bTranspose">Specifies whether to broadcast transposed.</param>
/// <param name="hB">Handle to the output B on the GPU.</param>
/// <param name="nBOff">Specifies the offset into B.</param>
public void divbsx(int n, long hA, int nAOff, long hX, int nXOff, int nC, int nSpatialDim, bool bTranspose, long hB, int nBOff)
{
    int nTranspose = bTranspose ? 1 : 0;
    var rgArg = m_param.AsLong(n, hA, nAOff, hX, nXOff, nC, nSpatialDim, nTranspose, hB, nBOff);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_DIVBSX, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_DIVBSX, null, rgArg);
}
6678
/// <summary>
/// Perform a batched matrix multiply C = scale * A * B for row-major data by delegating to the
/// strided, batched column-major gemm with A and B swapped.
/// </summary>
/// <param name="nOuterCount">Specifies the number of batches.</param>
/// <param name="m">Specifies the row count of A and C.</param>
/// <param name="n">Specifies the column count of B and C.</param>
/// <param name="k">Specifies the shared dimension.</param>
/// <param name="hA">Handle to matrix A on the GPU.</param>
/// <param name="hB">Handle to matrix B on the GPU.</param>
/// <param name="hC">Handle to matrix C on the GPU.</param>
/// <param name="dfScale">Optionally, specifies the scale applied to the product (default = 1.0).</param>
/// <param name="bTransA">Optionally, specifies whether to transpose A (default = false).</param>
/// <param name="bTransB">Optionally, specifies whether to transpose B (default = false).</param>
public void matmul(uint nOuterCount, int m, int n, int k, long hA, long hB, long hC, double dfScale = 1.0, bool bTransA = false, bool bTransB = false)
{
    // Leading dimensions and per-batch strides for the swapped (B, A) call.
    uint lda = (uint)k;
    uint ldb = (uint)n;
    uint ldc = (uint)n;
    uint stridea = (uint)(m * k);
    uint strideb = (uint)(k * n);
    uint stridec = (uint)(m * n);

    // Swapping the operands (and m/n) computes the row-major product via the column-major gemm.
    gemm(bTransB, bTransA, n, m, k, dfScale, hB, hA, 0.0, hC, ldb, lda, ldc, strideb, stridea, stridec, nOuterCount);
}
6706
/// <summary>
/// Transpose the H and W axes of an N x C x H x W blob.
/// </summary>
/// <param name="n">Specifies the num.</param>
/// <param name="c">Specifies the channels.</param>
/// <param name="h">Specifies the height.</param>
/// <param name="w">Specifies the width.</param>
/// <param name="hSrc">Handle to the source GPU memory.</param>
/// <param name="hDst">Handle to the destination GPU memory.</param>
public void transposeHW(int n, int c, int h, int w, long hSrc, long hDst)
{
    var rgArg = m_param.AsLong(n, c, h, w, hSrc, hDst);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TRANSPOSE_HW, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TRANSPOSE_HW, null, rgArg);
}
6723
6724
/// <summary>
/// Clamp all items of GPU memory into the range [dfMin, dfMax].
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="dfMin">Specifies the minimum bound.</param>
/// <param name="dfMax">Specifies the maximum bound.</param>
/// <param name="hX">Handle to the GPU memory to clamp.</param>
public void set_bounds(int n, double dfMin, double dfMax, long hX)
{
    if (m_dt == DataType.DOUBLE)
    {
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SET_BOUNDS, m_param.AsDouble(dfMin, dfMax), m_param.AsLong(n, 0, 0, hX));
    }
    else
    {
        // Saturate the double bounds into the float range before dispatching.
        float fMin = -float.MaxValue;
        float fMax = float.MaxValue;

        if (dfMin > -float.MaxValue && dfMin < float.MaxValue)
            fMin = (float)dfMin;
        else if (dfMin > float.MaxValue)
            fMin = float.MaxValue;

        if (dfMax > -float.MaxValue && dfMax < float.MaxValue)
            fMax = (float)dfMax;
        else if (dfMax < -float.MaxValue)   // BUGFIX: previously tested dfMin here (copy-paste error), leaving fMax unsaturated.
            fMax = -float.MaxValue;

        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SET_BOUNDS, m_param.AsFloat(fMin, fMax), m_param.AsLong(n, 0, 0, hX));
    }
}
6756
/// <summary>
/// Scale x in place by a double alpha converted to the base type 'T'.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="nXOff">Optionally, specifies the offset into x (default = 0).</param>
public void scal(int n, double fAlpha, long hX, int nXOff = 0)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    scal(n, fAlphaT, hX, nXOff);
}
6771
/// <summary>
/// Scale x in place by a float alpha converted to the base type 'T'.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="nXOff">Optionally, specifies the offset into x (default = 0).</param>
public void scal(int n, float fAlpha, long hX, int nXOff = 0)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    scal(n, fAlphaT, hX, nXOff);
}
6786
/// <summary>
/// Scale x in place by alpha.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="nXOff">Optionally, specifies the offset into x (default = 0).</param>
public void scal(int n, T fAlpha, long hX, int nXOff = 0)
{
    var rgArg = m_param.AsLong(n, 0, hX, nXOff);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SCAL, m_param.AsDouble(convertD(fAlpha)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SCAL, m_param.AsFloat(convertF(fAlpha)), rgArg);
}
6804
/// <summary>
/// Compute the dot product of x and y, returned as a double.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
/// <returns>Returns the dot product as a double.</returns>
public double dot_double(int n, long hX, long hY)
{
    T fResult = dot(n, hX, hY);
    return (double)Convert.ChangeType(fResult, typeof(double));
}
6819
/// <summary>
/// Compute the dot product of x and y, returned as a float.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
/// <returns>Returns the dot product as a float.</returns>
public float dot_float(int n, long hX, long hY)
{
    T fResult = dot(n, hX, hY);
    return (float)Convert.ChangeType(fResult, typeof(float));
}
6834
/// <summary>
/// Compute the dot product of x and y.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
/// <param name="nXOff">Optionally, specifies the offset into x (default = 0).</param>
/// <param name="nYOff">Optionally, specifies the offset into y (default = 0).</param>
/// <returns>Returns the dot product as the base type 'T'.</returns>
public T dot(int n, long hX, long hY, int nXOff = 0, int nYOff = 0)
{
    var rgArg = m_param.AsLong(n, hX, hY, nXOff, nYOff);

    if (m_dt == DataType.DOUBLE)
        return (T)Convert.ChangeType(m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_DOT, null, rgArg)[0], typeof(T));

    return (T)Convert.ChangeType(m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_DOT, null, rgArg)[0], typeof(T));
}
6860
/// <summary>
/// Compute the sum of absolute values of x, returned as a double.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="nXOff">Optionally, specifies the offset into x (default = 0).</param>
/// <returns>Returns the absolute-value sum as a double.</returns>
public double asum_double(int n, long hX, int nXOff = 0)
{
    T fResult = asum(n, hX, nXOff);
    return (double)Convert.ChangeType(fResult, typeof(double));
}
6875
/// <summary>
/// Compute the sum of absolute values of x, returned as a float.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="nXOff">Optionally, specifies the offset into x (default = 0).</param>
/// <returns>Returns the absolute-value sum as a float.</returns>
public float asum_float(int n, long hX, int nXOff = 0)
{
    T fResult = asum(n, hX, nXOff);
    return (float)Convert.ChangeType(fResult, typeof(float));
}
6890
/// <summary>
/// Compute the sum of absolute values of x.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="nXOff">Optionally, specifies the offset into x (default = 0).</param>
/// <returns>Returns the absolute-value sum as the base type 'T'.</returns>
public T asum(int n, long hX, int nXOff = 0)
{
    var rgArg = m_param.AsLong(n, hX, nXOff);

    if (m_dt == DataType.DOUBLE)
        return (T)Convert.ChangeType(m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ASUM, null, rgArg)[0], typeof(T));

    return (T)Convert.ChangeType(m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ASUM, null, rgArg)[0], typeof(T));
}
6914
/// <summary>
/// Compute y = alpha * x with a double alpha converted to the base type 'T'.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
public void scale(int n, double fAlpha, long hX, long hY)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    scale(n, fAlphaT, hX, hY);
}
6929
/// <summary>
/// Compute y = alpha * x with a float alpha converted to the base type 'T'.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
public void scale(int n, float fAlpha, long hX, long hY)
{
    T fAlphaT = (T)Convert.ChangeType(fAlpha, typeof(T));
    scale(n, fAlphaT, hX, hY);
}
6944
/// <summary>
/// Compute y = alpha * x, with optional offsets.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fAlpha">Specifies the alpha scalar.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
/// <param name="nXOff">Optionally, specifies the offset into x (default = 0).</param>
/// <param name="nYOff">Optionally, specifies the offset into y (default = 0).</param>
public void scale(int n, T fAlpha, long hX, long hY, int nXOff = 0, int nYOff = 0)
{
    var rgArg = m_param.AsLong(n, 0, hX, hY, nXOff, nYOff);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SCALE, m_param.AsDouble(convertD(fAlpha)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SCALE, m_param.AsFloat(convertF(fAlpha)), rgArg);
}
6964
/// <summary>
/// Scale the values of x into the range [fMin, fMax], writing the result to y.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="hX">Handle to vector x on the GPU.</param>
/// <param name="hY">Handle to vector y on the GPU.</param>
/// <param name="fMin">Specifies the target minimum.</param>
/// <param name="fMax">Specifies the target maximum.</param>
public void scale_to_range(int n, long hX, long hY, double fMin, double fMax)
{
    var rgArg = m_param.AsLong(n, hX, hY, 0, 0);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SCALE_TO_RANGE, m_param.AsDouble(fMin, fMax), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SCALE_TO_RANGE, m_param.AsFloat((float)fMin, (float)fMax), rgArg);
}
6980
/// <summary>
/// Compute the error function erf of a double value on the GPU.
/// </summary>
/// <param name="dfVal">Specifies the input value.</param>
/// <returns>Returns erf(dfVal) as a double.</returns>
public double erf(double dfVal)
{
    T fVal = convertD1(dfVal);
    return convertD(erf(fVal));
}
6990
/// <summary>
/// Compute the error function erf of a float value on the GPU.
/// </summary>
/// <param name="fVal">Specifies the input value.</param>
/// <returns>Returns erf(fVal) as a float.</returns>
public float erf(float fVal)
{
    T fValT = convertF1(fVal);
    return convertF(erf(fValT));
}
7000
/// <summary>
/// Compute the error function erf of a value of the base type 'T' on the GPU.
/// </summary>
/// <param name="fVal">Specifies the input value.</param>
/// <returns>Returns erf(fVal) as the base type 'T'.</returns>
public T erf(T fVal)
{
    if (m_dt == DataType.DOUBLE)
        return convert(m_cuda.RunDouble((int)m_hKernel, (int)CUDAFN.CUDA_ERF, m_param.AsDouble(convertD(fVal))))[0];

    return convert(m_cuda.RunFloat((int)m_hKernel, (int)CUDAFN.CUDA_ERF, m_param.AsFloat(convertF(fVal))))[0];
}
7019
/// <summary>
/// Replace values in x matching 'fSearch' (per the mask) with 'fReplace', writing the result to y.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="nMaskDim">Specifies the mask dimension.</param>
/// <param name="fSearch">Specifies the value searched for.</param>
/// <param name="fReplace">Specifies the replacement value.</param>
/// <param name="hX">Handle to the input x on the GPU.</param>
/// <param name="hMask">Handle to the mask on the GPU.</param>
/// <param name="hY">Handle to the output y on the GPU.</param>
public void mask(int n, int nMaskDim, T fSearch, T fReplace, long hX, long hMask, long hY)
{
    var rgArg = m_param.AsLong(n, nMaskDim, 0, 0, hX, hMask, hY);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MASK, m_param.AsDouble(convertD(fSearch), convertD(fReplace)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MASK, m_param.AsFloat(convertF(fSearch), convertF(fReplace)), rgArg);
}
7037
/// <summary>
/// Masked replace with double search/replace values converted to the base type 'T'.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="nMaskDim">Specifies the mask dimension.</param>
/// <param name="fSearch">Specifies the value searched for.</param>
/// <param name="fReplace">Specifies the replacement value.</param>
/// <param name="hX">Handle to the input x on the GPU.</param>
/// <param name="hMask">Handle to the mask on the GPU.</param>
/// <param name="hY">Handle to the output y on the GPU.</param>
public void mask(int n, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY)
{
    T fSearchT = (T)Convert.ChangeType(fSearch, typeof(T));
    T fReplaceT = (T)Convert.ChangeType(fReplace, typeof(T));
    mask(n, nMaskDim, fSearchT, fReplaceT, hX, hMask, hY);
}
7052
/// <summary>
/// Masked replace with float search/replace values converted to the base type 'T'.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="nMaskDim">Specifies the mask dimension.</param>
/// <param name="fSearch">Specifies the value searched for.</param>
/// <param name="fReplace">Specifies the replacement value.</param>
/// <param name="hX">Handle to the input x on the GPU.</param>
/// <param name="hMask">Handle to the mask on the GPU.</param>
/// <param name="hY">Handle to the output y on the GPU.</param>
public void mask(int n, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY)
{
    T fSearchT = (T)Convert.ChangeType(fSearch, typeof(T));
    T fReplaceT = (T)Convert.ChangeType(fReplace, typeof(T));
    mask(n, nMaskDim, fSearchT, fReplaceT, hX, hMask, hY);
}
7067
/// <summary>
/// Batched masked replace: replace values in x matching 'fSearch' (per the mask, applied per batch) with 'fReplace'.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="nBatch">Specifies the batch count.</param>
/// <param name="nMaskDim">Specifies the mask dimension.</param>
/// <param name="fSearch">Specifies the value searched for.</param>
/// <param name="fReplace">Specifies the replacement value.</param>
/// <param name="hX">Handle to the input x on the GPU.</param>
/// <param name="hMask">Handle to the mask on the GPU.</param>
/// <param name="hY">Handle to the output y on the GPU.</param>
public void mask_batch(int n, int nBatch, int nMaskDim, T fSearch, T fReplace, long hX, long hMask, long hY)
{
    var rgArg = m_param.AsLong(n, nBatch, nMaskDim, 0, 0, hX, hMask, hY);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MASK_BATCH, m_param.AsDouble(convertD(fSearch), convertD(fReplace)), rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MASK_BATCH, m_param.AsFloat(convertF(fSearch), convertF(fReplace)), rgArg);
}
7086
/// <summary>
/// Batched masked replace with double search/replace values converted to the base type 'T'.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="nBatch">Specifies the batch count.</param>
/// <param name="nMaskDim">Specifies the mask dimension.</param>
/// <param name="fSearch">Specifies the value searched for.</param>
/// <param name="fReplace">Specifies the replacement value.</param>
/// <param name="hX">Handle to the input x on the GPU.</param>
/// <param name="hMask">Handle to the mask on the GPU.</param>
/// <param name="hY">Handle to the output y on the GPU.</param>
public void mask_batch(int n, int nBatch, int nMaskDim, double fSearch, double fReplace, long hX, long hMask, long hY)
{
    T fSearchT = (T)Convert.ChangeType(fSearch, typeof(T));
    T fReplaceT = (T)Convert.ChangeType(fReplace, typeof(T));
    mask_batch(n, nBatch, nMaskDim, fSearchT, fReplaceT, hX, hMask, hY);
}
7102
/// <summary>
/// Batched masked replace with float search/replace values converted to the base type 'T'.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="nBatch">Specifies the batch count.</param>
/// <param name="nMaskDim">Specifies the mask dimension.</param>
/// <param name="fSearch">Specifies the value searched for.</param>
/// <param name="fReplace">Specifies the replacement value.</param>
/// <param name="hX">Handle to the input x on the GPU.</param>
/// <param name="hMask">Handle to the mask on the GPU.</param>
/// <param name="hY">Handle to the output y on the GPU.</param>
public void mask_batch(int n, int nBatch, int nMaskDim, float fSearch, float fReplace, long hX, long hMask, long hY)
{
    T fSearchT = (T)Convert.ChangeType(fSearch, typeof(T));
    T fReplaceT = (T)Convert.ChangeType(fReplace, typeof(T));
    mask_batch(n, nBatch, nMaskDim, fSearchT, fReplaceT, hX, hMask, hY);
}
7118
/// <summary>
/// Perform bilinear interpolation between two data regions, forward or backward.
/// </summary>
/// <param name="nChannels">Specifies the channel count.</param>
/// <param name="hData1">Handle to the first data region on the GPU.</param>
/// <param name="nX1">Specifies the x start of region 1 (>= 0).</param>
/// <param name="nY1">Specifies the y start of region 1 (>= 0).</param>
/// <param name="nHeight1">Specifies the height of region 1 (> 0).</param>
/// <param name="nWidth1">Specifies the width of region 1 (> 0).</param>
/// <param name="nHeight1A">Specifies the allocated height of region 1 (must contain nY1 + nHeight1).</param>
/// <param name="nWidth1A">Specifies the allocated width of region 1 (must contain nX1 + nWidth1).</param>
/// <param name="hData2">Handle to the second data region on the GPU.</param>
/// <param name="nX2">Specifies the x start of region 2 (>= 0).</param>
/// <param name="nY2">Specifies the y start of region 2 (>= 0).</param>
/// <param name="nHeight2">Specifies the height of region 2 (> 0).</param>
/// <param name="nWidth2">Specifies the width of region 2 (> 0).</param>
/// <param name="nHeight2A">Specifies the allocated height of region 2 (must contain nY2 + nHeight2).</param>
/// <param name="nWidth2A">Specifies the allocated width of region 2 (must contain nX2 + nWidth2).</param>
/// <param name="bBwd">Optionally, runs the backward pass (default = false, forward).</param>
/// <exception cref="ArgumentOutOfRangeException">Thrown when a region argument violates the constraints above.</exception>
public void interp2(int nChannels, long hData1, int nX1, int nY1, int nHeight1, int nWidth1, int nHeight1A, int nWidth1A, long hData2, int nX2, int nY2, int nHeight2, int nWidth2, int nHeight2A, int nWidth2A, bool bBwd = false)
{
    // CA2208 fix: the message was previously passed as the paramName argument;
    // use the (paramName, message) overload instead.
    if (!(nX1 >= 0 && nY1 >= 0 && nHeight1 > 0 && nWidth1 > 0 && nX2 >= 0 && nY2 >= 0 && nHeight2 > 0 && nWidth2 > 0))
        throw new ArgumentOutOfRangeException(nameof(nChannels), "interp2: Invalid arguments found.");

    if (!(nWidth1A >= nWidth1 + nX1 && nHeight1A >= nHeight1 + nY1 && nWidth2A >= nWidth2 + nX2 && nHeight2A >= nHeight2 + nY2))
        throw new ArgumentOutOfRangeException(nameof(nChannels), "interp2: Invalid arguments found.");

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_INTERP2, null, m_param.AsLong(nChannels, hData1, nX1, nY1, nHeight1, nWidth1, nHeight1A, nWidth1A, hData2, nX2, nY2, nHeight2, nWidth2, nHeight2A, nWidth2A, (bBwd) ? 1 : 0));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_INTERP2, null, m_param.AsLong(nChannels, hData1, nX1, nY1, nHeight1, nWidth1, nHeight1A, nWidth1A, hData2, nX2, nY2, nHeight2, nWidth2, nHeight2A, nWidth2A, (bBwd) ? 1 : 0));
}
7151
7161 public void add_scalar(int n, double fAlpha, long hY)
7162 {
7163 add_scalar(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hY);
7164 }
7165
7175 public void add_scalar(int n, float fAlpha, long hY)
7176 {
7177 add_scalar(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hY);
7178 }
7179
        /// <summary>Dispatches the CUDA_ADD_SCALAR kernel adding <paramref name="fAlpha"/> to <paramref name="n"/> elements of the GPU buffer hY, starting at element offset <paramref name="nYOff"/>.</summary>
        public void add_scalar(int n, T fAlpha, long hY, int nYOff = 0)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD_SCALAR, m_param.AsDouble(convertD(fAlpha)), m_param.AsLong(n, 0, hY, nYOff));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD_SCALAR, m_param.AsFloat(convertF(fAlpha)), m_param.AsLong(n, 0, hY, nYOff));
        }
7197
        /// <summary>Dispatches the CUDA_ADD3 kernel over <paramref name="n"/> elements combining the three GPU buffers hA, hB and hC into hY (presumably Y = A + B + C — confirm against the kernel source).</summary>
        public void add(int n, long hA, long hB, long hC, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD3, null, m_param.AsLong(n, hA, hB, hC, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD3, null, m_param.AsLong(n, hA, hB, hC, hY));
        }
7216
        /// <summary>Dispatches the CUDA_ADD kernel over <paramref name="n"/> elements (element-wise Y = A + B on GPU buffers hA, hB, hY).</summary>
        public void add(int n, long hA, long hB, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD, null, m_param.AsLong(n, hA, hB, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD, null, m_param.AsLong(n, hA, hB, hY));
        }
7234
        /// <summary>Dispatches the CUDA_ADD kernel over <paramref name="n"/> elements with a double scaling factor <paramref name="dfAlpha"/> passed as the single value parameter (trailing 0 in the arg list is its placeholder slot).</summary>
        public void add(int n, long hA, long hB, long hY, double dfAlpha)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD, m_param.AsDouble(dfAlpha), m_param.AsLong(n, hA, hB, hY, 0));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD, m_param.AsFloat((float)dfAlpha), m_param.AsLong(n, hA, hB, hY, 0));
        }
7253
7265 public void add(int n, long hA, long hB, long hY, float fAlpha)
7266 {
7267 if (m_dt == DataType.DOUBLE)
7268 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD, m_param.AsDouble(fAlpha), m_param.AsLong(n, hA, hB, hY, 0));
7269 else
7270 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD, m_param.AsFloat(fAlpha), m_param.AsLong(n, hA, hB, hY, 0));
7271 }
7272
        /// <summary>Dispatches the CUDA_ADD2 kernel over <paramref name="n"/> elements with per-operand scale factors <paramref name="dfAlphaA"/> and <paramref name="dfAlphaB"/> and optional element offsets into hA, hB and hY (the two 0s are placeholder slots for the scalar values).</summary>
        public void add(int n, long hA, long hB, long hY, double dfAlphaA, double dfAlphaB, int nAOff = 0, int nBOff = 0, int nYOff = 0)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD2, m_param.AsDouble(dfAlphaA, dfAlphaB), m_param.AsLong(n, hA, hB, hY, 0, 0, nAOff, nBOff, nYOff));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ADD2, m_param.AsFloat((float)dfAlphaA, (float)dfAlphaB), m_param.AsLong(n, hA, hB, hY, 0, 0, nAOff, nBOff, nYOff));
        }
7295
        /// <summary>Dispatches the CUDA_SUB kernel over <paramref name="n"/> elements (element-wise Y = A - B) with optional element offsets; <paramref name="nB"/> is an extra kernel parameter (semantics defined by the native kernel — presumably a B-broadcast block size; confirm against the kernel source).</summary>
        public void sub(int n, long hA, long hB, long hY, int nAOff = 0, int nBOff = 0, int nYOff = 0, int nB = 0)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SUB, null, m_param.AsLong(n, hA, hB, hY, nAOff, nBOff, nYOff, nB));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SUB, null, m_param.AsLong(n, hA, hB, hY, nAOff, nBOff, nYOff, nB));
        }
7319
7320
        /// <summary>Dispatches the CUDA_MUL kernel over <paramref name="n"/> elements (element-wise Y = A * B) with optional element offsets into each buffer.</summary>
        public void mul(int n, long hA, long hB, long hY, int nAOff = 0, int nBOff = 0, int nYOff = 0)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MUL, null, m_param.AsLong(n, hA, hB, hY, nAOff, nBOff, nYOff));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MUL, null, m_param.AsLong(n, hA, hB, hY, nAOff, nBOff, nYOff));
        }
7341
        /// <summary>Dispatches the CUDA_SUB_AND_DOT kernel over <paramref name="n"/> elements with outer count <paramref name="nN"/> and inner dimension <paramref name="nInnerNum"/>; name suggests a fused subtract-then-dot-product — confirm exact semantics against the kernel source.</summary>
        public void sub_and_dot(int n, int nN, int nInnerNum, long hA, long hB, long hY, int nAOff, int nBOff, int nYOff)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SUB_AND_DOT, null, m_param.AsLong(n, nN, nInnerNum, hA, hB, hY, nAOff, nBOff, nYOff));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SUB_AND_DOT, null, m_param.AsLong(n, nN, nInnerNum, hA, hB, hY, nAOff, nBOff, nYOff));
        }
7364
7374 public void mul_scalar(int n, double fAlpha, long hY)
7375 {
7376 mul_scalar(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hY);
7377 }
7378
7388 public void mul_scalar(int n, float fAlpha, long hY)
7389 {
7390 mul_scalar(n, (T)Convert.ChangeType(fAlpha, typeof(T)), hY);
7391 }
7392
        /// <summary>Dispatches the CUDA_MUL_SCALAR kernel multiplying <paramref name="n"/> elements of hY by <paramref name="fAlpha"/> (the 0 in the arg list is the scalar's placeholder slot).</summary>
        public void mul_scalar(int n, T fAlpha, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MUL_SCALAR, m_param.AsDouble(convertD(fAlpha)), m_param.AsLong(n, 0, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MUL_SCALAR, m_param.AsFloat(convertF(fAlpha)), m_param.AsLong(n, 0, hY));
        }
7409
        /// <summary>Dispatches the CUDA_DIV kernel over <paramref name="n"/> elements (element-wise Y = A / B on GPU buffers hA, hB, hY).</summary>
        public void div(int n, long hA, long hB, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_DIV, null, m_param.AsLong(n, hA, hB, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_DIV, null, m_param.AsLong(n, hA, hB, hY));
        }
7427
        /// <summary>Dispatches the CUDA_ABS kernel over <paramref name="n"/> elements (element-wise Y = |A|).</summary>
        public void abs(int n, long hA, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ABS, null, m_param.AsLong(n, hA, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ABS, null, m_param.AsLong(n, hA, hY));
        }
7444
7454 public void exp(int n, long hA, long hY)
7455 {
7456 exp(n, hA, hY, 0, 0, 1.0);
7457 }
7458
        /// <summary>Dispatches the CUDA_EXP kernel over <paramref name="n"/> elements of hA into hY with element offsets and a scalar <paramref name="dfBeta"/> (trailing 0 is the scalar's placeholder slot); presumably Y = exp(beta * A) — confirm against the kernel source.</summary>
        public void exp(int n, long hA, long hY, int nAOff, int nYOff, double dfBeta)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_EXP, m_param.AsDouble(dfBeta), m_param.AsLong(n, hA, hY, nAOff, nYOff, 0));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_EXP, m_param.AsFloat((float)dfBeta), m_param.AsLong(n, hA, hY, nAOff, nYOff, 0));
        }
7478
7488 public void log(int n, long hA, long hY)
7489 {
7490 log(n, hA, hY, 1.0, 0.0);
7491 }
7492
        /// <summary>Dispatches the CUDA_LOG kernel over <paramref name="n"/> elements of hA into hY with scalars <paramref name="dfBeta"/> and <paramref name="dfAlpha"/> (the two 0s are their placeholder slots); presumably Y = log(beta * A + alpha) — confirm against the kernel source.</summary>
        public void log(int n, long hA, long hY, double dfBeta, double dfAlpha = 0)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LOG, m_param.AsDouble(dfBeta, dfAlpha), m_param.AsLong(n, hA, hY, 0, 0));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LOG, m_param.AsFloat((float)dfBeta, (float)dfAlpha), m_param.AsLong(n, hA, hY, 0, 0));
        }
7511
7524 public void powx(int n, long hA, double fAlpha, long hY, int nAOff = 0, int nYOff = 0)
7525 {
7526 powx(n, hA, (T)Convert.ChangeType(fAlpha, typeof(T)), hY, nAOff, nYOff);
7527 }
7528
7541 public void powx(int n, long hA, float fAlpha, long hY, int nAOff = 0, int nYOff = 0)
7542 {
7543 powx(n, hA, (T)Convert.ChangeType(fAlpha, typeof(T)), hY, nAOff, nYOff);
7544 }
7545
        /// <summary>Dispatches the CUDA_POWX kernel over <paramref name="n"/> elements (element-wise Y = A ^ alpha) with optional offsets; the 0 in the arg list is the exponent's placeholder slot.</summary>
        public void powx(int n, long hA, T fAlpha, long hY, int nAOff = 0, int nYOff = 0)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_POWX, m_param.AsDouble(convertD(fAlpha)), m_param.AsLong(n, hA, 0, hY, nAOff, nYOff));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_POWX, m_param.AsFloat(convertF(fAlpha)), m_param.AsLong(n, hA, 0, hY, nAOff, nYOff));
        }
7565
        /// <summary>Dispatches the CUDA_SIGN kernel over <paramref name="n"/> elements of hX into hY with optional element offsets (element-wise sign function).</summary>
        public void sign(int n, long hX, long hY, int nXOff = 0, int nYOff = 0)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SIGN, null, m_param.AsLong(n, hX, hY, nXOff, nYOff));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SIGN, null, m_param.AsLong(n, hX, hY, nXOff, nYOff));
        }
7581
7582#pragma warning disable 1591
7583
        /// <summary>Dispatches the CUDA_STUDENT kernel over <paramref name="n"/> elements of hX into hY (undocumented in the original; name suggests a Student-t related transform — confirm against the kernel source).</summary>
        public void student(int n, long hX, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_STUDENT, null, m_param.AsLong(n, hX, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_STUDENT, null, m_param.AsLong(n, hX, hY));
        }
7591
        /// <summary>Dispatches the CUDA_LOGISTIC1 kernel over <paramref name="n"/> elements of hX into hY (undocumented in the original; name suggests a logistic-function variant — confirm against the kernel source).</summary>
        public void logistic1(int n, long hX, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LOGISTIC1, null, m_param.AsLong(n, hX, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LOGISTIC1, null, m_param.AsLong(n, hX, hY));
        }
7599
        /// <summary>Dispatches the CUDA_LOGISTIC2 kernel over <paramref name="n"/> elements of hX into hY (undocumented in the original; second logistic-function variant — confirm against the kernel source).</summary>
        public void logistic2(int n, long hX, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_LOGISTIC2, null, m_param.AsLong(n, hX, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_LOGISTIC2, null, m_param.AsLong(n, hX, hY));
        }
7607
        /// <summary>Dispatches the CUDA_RECIPROCOL kernel over <paramref name="n"/> elements of hX into hY (presumably element-wise reciprocal).
        /// NOTE(review): "reciprocol" is a misspelling of "reciprocal", but the name matches the CUDAFN entry and public callers — renaming would break the interface.</summary>
        public void reciprocol(int n, long hX, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_RECIPROCOL, null, m_param.AsLong(n, hX, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_RECIPROCOL, null, m_param.AsLong(n, hX, hY));
        }
7615
7616#pragma warning restore 1591
7617
        /// <summary>Dispatches the CUDA_SQRT kernel over <paramref name="n"/> elements (element-wise square root of hX into hY).</summary>
        public void sqrt(int n, long hX, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SQRT, null, m_param.AsLong(n, hX, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SQRT, null, m_param.AsLong(n, hX, hY));
        }
7631
        /// <summary>Dispatches the CUDA_SQRT_SCALE kernel over <paramref name="nCount"/> elements of hX into hY (scaled square root; exact scaling defined by the native kernel).</summary>
        public void sqrt_scale(int nCount, long hX, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SQRT_SCALE, null, m_param.AsLong(nCount, hX, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SQRT_SCALE, null, m_param.AsLong(nCount, hX, hY));
        }
7645
        /// <summary>Dispatches the CUDA_COMPARE_SIGNS kernel over <paramref name="n"/> elements, comparing signs of hA and hB into hY.</summary>
        public void compare_signs(int n, long hA, long hB, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COMPARE_SIGNS, null, m_param.AsLong(n, hA, hB, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COMPARE_SIGNS, null, m_param.AsLong(n, hA, hB, hY));
        }
7660
        /// <summary>Dispatches the CUDA_MAX kernel over <paramref name="n"/> elements (element-wise Y = max(A, B)).</summary>
        public void max(int n, long hA, long hB, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAX, null, m_param.AsLong(n, hA, hB, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAX, null, m_param.AsLong(n, hA, hB, hY));
        }
7676
        /// <summary>Dispatches the CUDA_MAX_BWD2 kernel: backward pass for element-wise max, routing hYdiff gradients into hAdiff/hBdiff based on which of hAdata/hBdata won the forward max (per the kernel contract).</summary>
        public void max_bwd(int n, long hAdata, long hBdata, long hYdiff, long hAdiff, long hBdiff)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAX_BWD2, null, m_param.AsLong(n, hAdata, hBdata, hYdiff, hAdiff, hBdiff));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAX_BWD2, null, m_param.AsLong(n, hAdata, hBdata, hYdiff, hAdiff, hBdiff));
        }
7693
        /// <summary>Dispatches the CUDA_MIN kernel over <paramref name="n"/> elements (element-wise Y = min(A, B)).</summary>
        public void min(int n, long hA, long hB, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MIN, null, m_param.AsLong(n, hA, hB, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MIN, null, m_param.AsLong(n, hA, hB, hY));
        }
7709
7724 public double max(int n, long hA, out long lPos, int nAOff = 0, long hWork = 0)
7725 {
7726 if (hWork != 0)
7727 {
7728 if (m_dt == DataType.DOUBLE)
7729 {
7730 double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAXVAL, null, m_param.AsLong(n, hA, nAOff, hWork));
7731 lPos = (long)rg[1];
7732 return rg[0];
7733 }
7734 else
7735 {
7736 float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAXVAL, null, m_param.AsLong(n, hA, nAOff, hWork));
7737 lPos = (long)rg[1];
7738 return rg[0];
7739 }
7740 }
7741 else
7742 {
7743 if (m_dt == DataType.DOUBLE)
7744 {
7745 double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAXVAL, null, m_param.AsLong(n, hA, nAOff));
7746 lPos = (long)rg[1];
7747 return rg[0];
7748 }
7749 else
7750 {
7751 float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MAXVAL, null, m_param.AsLong(n, hA, nAOff));
7752 lPos = (long)rg[1];
7753 return rg[0];
7754 }
7755 }
7756 }
7757
7772 public double min(int n, long hA, out long lPos, int nAOff = 0, long hWork = 0)
7773 {
7774 if (hWork != 0)
7775 {
7776 if (m_dt == DataType.DOUBLE)
7777 {
7778 double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MINVAL, null, m_param.AsLong(n, hA, nAOff, hWork));
7779 lPos = (long)rg[1];
7780 return rg[0];
7781 }
7782 else
7783 {
7784 float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MINVAL, null, m_param.AsLong(n, hA, nAOff, hWork));
7785 lPos = (long)rg[1];
7786 return rg[0];
7787 }
7788 }
7789 else
7790 {
7791 if (m_dt == DataType.DOUBLE)
7792 {
7793 double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MINVAL, null, m_param.AsLong(n, hA, nAOff));
7794 lPos = (long)rg[1];
7795 return rg[0];
7796 }
7797 else
7798 {
7799 float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MINVAL, null, m_param.AsLong(n, hA, nAOff));
7800 lPos = (long)rg[1];
7801 return rg[0];
7802 }
7803 }
7804 }
7805
        /// <summary>Runs the CUDA_MINMAXVAL reduction over <paramref name="n"/> elements of hA (offset <paramref name="nAOff"/>) using the two workspace handles; optionally detects NaNs when <paramref name="bDetectNans"/> is set.</summary>
        /// <returns>A 4-tuple of the kernel's result values rg[0..3] — min, max and two NaN/inf related counts per the native kernel contract (confirm exact ordering against the kernel source).</returns>
        public Tuple<double, double, double, double> minmax(int n, long hA, long hWork1, long hWork2, bool bDetectNans = false, int nAOff = 0)
        {
            if (m_dt == DataType.DOUBLE)
            {
                double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MINMAXVAL, null, m_param.AsLong(n, hA, hWork1, hWork2, (bDetectNans) ? 1 : 0, nAOff));
                return new Tuple<double, double, double, double>(rg[0], rg[1], rg[2], rg[3]);
            }
            else
            {
                float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MINMAXVAL, null, m_param.AsLong(n, hA, hWork1, hWork2, (bDetectNans) ? 1 : 0, nAOff));
                return new Tuple<double, double, double, double>(rg[0], rg[1], rg[2], rg[3]);
            }
        }
7831
        /// <summary>Dispatches the CUDA_MINMAXVEC kernel collecting the top-<paramref name="nK"/> min/max values of hA into the hMin/hMax buffers, using two workspace handles; <paramref name="bNonZeroOnly"/> restricts the scan to non-zero values.</summary>
        public void minmax(int n, long hA, long hWork1, long hWork2, int nK, long hMin, long hMax, bool bNonZeroOnly)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_MINMAXVEC, null, m_param.AsLong(n, hA, hWork1, hWork2, nK, hMin, hMax, (bNonZeroOnly) ? 1 : 0));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_MINMAXVEC, null, m_param.AsLong(n, hA, hWork1, hWork2, nK, hMin, hMax, (bNonZeroOnly) ? 1 : 0));
        }
7850
        /// <summary>Dispatches the CUDA_TRANSPOSE kernel transposing <paramref name="n"/> elements of hX into hY, driven by per-axis count buffers (hXCounts/hYCounts), an axis-mapping buffer (hMapping), the axis count and a scratch buffer.</summary>
        public void transpose(int n, long hX, long hY, long hXCounts, long hYCounts, long hMapping, int nNumAxes, long hBuffer)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_TRANSPOSE, null, m_param.AsLong(n, hX, hY, hXCounts, hYCounts, hMapping, nNumAxes, hBuffer));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_TRANSPOSE, null, m_param.AsLong(n, hX, hY, hXCounts, hYCounts, hMapping, nNumAxes, hBuffer));
        }
7869
        /// <summary>Runs the CUDA_SUMSQ reduction over <paramref name="n"/> elements of hA (offset <paramref name="nAOff"/>) using workspace hW.</summary>
        /// <returns>The sum of squares (rg[0] of the kernel result).</returns>
        public double sumsq(int n, long hW, long hA, int nAOff = 0)
        {
            if (m_dt == DataType.DOUBLE)
            {
                double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SUMSQ, null, m_param.AsLong(n, hW, hA, nAOff));
                return rg[0];
            }
            else
            {
                float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SUMSQ, null, m_param.AsLong(n, hW, hA, nAOff));
                return rg[0];
            }
        }
7891
        /// <summary>Runs the CUDA_SUMSQDIFF reduction over <paramref name="n"/> elements: sum of squared differences between hA (offset <paramref name="nAOff"/>) and hB (offset <paramref name="nBOff"/>), using workspace hW.</summary>
        /// <returns>The sum of squared differences (rg[0] of the kernel result).</returns>
        public double sumsqdiff(int n, long hW, long hA, long hB, int nAOff = 0, int nBOff = 0)
        {
            if (m_dt == DataType.DOUBLE)
            {
                double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SUMSQDIFF, null, m_param.AsLong(n, hW, hA, hB, nAOff, nBOff));
                return rg[0];
            }
            else
            {
                float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SUMSQDIFF, null, m_param.AsLong(n, hW, hA, hB, nAOff, nBOff));
                return rg[0];
            }
        }
7915
        /// <summary>Dispatches the CUDA_WIDTH kernel computing hWidth from hMean/hMin/hMax over <paramref name="n"/> elements with scalar <paramref name="dfAlpha"/> (the 0 is its placeholder slot); exact width formula is defined by the native kernel.</summary>
        public void width(int n, long hMean, long hMin, long hMax, double dfAlpha, long hWidth)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_WIDTH, m_param.AsDouble(dfAlpha), m_param.AsLong(n, hMean, hMin, hMax, 0, hWidth));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_WIDTH, m_param.AsFloat((float)dfAlpha), m_param.AsLong(n, hMean, hMin, hMax, 0, hWidth));
        }
7932
7943 public bool contains_point(int n, long hMean, long hWidth, long hX, long hWork, int nXOff = 0)
7944 {
7945 if (m_dt == DataType.DOUBLE)
7946 {
7947 double[] rg = m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CONTAINS_POINT, null, m_param.AsLong(n, hMean, hWidth, hX, hWork, nXOff));
7948 return (rg[0] == 0) ? false : true;
7949 }
7950 else
7951 {
7952 float[] rg = m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CONTAINS_POINT, null, m_param.AsLong(n, hMean, hWidth, hX, hWork, nXOff));
7953 return (rg[0] == 0) ? false : true;
7954 }
7955 }
7956
        /// <summary>Dispatches the CUDA_DENAN kernel replacing NaN values in hX (in place) with <paramref name="dfReplacement"/> over <paramref name="n"/> elements (the 0 is the scalar's placeholder slot).</summary>
        public void denan(int n, long hX, double dfReplacement)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_DENAN, m_param.AsDouble(dfReplacement), m_param.AsLong(n, hX, 0));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_DENAN, m_param.AsFloat((float)dfReplacement), m_param.AsLong(n, hX, 0));
        }
7970
        /// <summary>Dispatches the CUDA_IM2COL kernel unfolding the image buffer hDataIm (C x H x W, with kernel/pad/stride/dilation geometry) into the column buffer hDataCol for convolution-as-GEMM; offsets select the starting element in each buffer.</summary>
        public void im2col(long hDataIm, int nDataImOffset, int nChannels, int nHeight, int nWidth, int nKernelH, int nKernelW, int nPadH, int nPadW, int nStrideH, int nStrideW, int nDilationH, int nDilationW, long hDataCol, int nDataColOffset)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_IM2COL, null, m_param.AsLong(hDataIm, nDataImOffset, nChannels, nHeight, nWidth, nKernelH, nKernelW, nPadH, nPadW, nStrideH, nStrideW, nDilationH, nDilationW, hDataCol, nDataColOffset));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_IM2COL, null, m_param.AsLong(hDataIm, nDataImOffset, nChannels, nHeight, nWidth, nKernelH, nKernelW, nPadH, nPadW, nStrideH, nStrideW, nDilationH, nDilationW, hDataCol, nDataColOffset));
        }
7996
        /// <summary>Dispatches the CUDA_IM2COL_ND kernel: N-dimensional im2col where shape/kernel/pad/stride/dilation are supplied as GPU buffers rather than scalars, unfolding hDataIm into hDataCol.</summary>
        public void im2col_nd(long hDataIm, int nDataImOffset, int nNumSpatialAxes, int nImCount, int nChannelAxis, long hImShape, long hColShape, long hKernelShape, long hPad, long hStride, long hDilation, long hDataCol, int nDataColOffset)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_IM2COL_ND, null, m_param.AsLong(hDataIm, nDataImOffset, nNumSpatialAxes, nImCount, nChannelAxis, hImShape, hColShape, hKernelShape, hPad, hStride, hDilation, hDataCol, nDataColOffset));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_IM2COL_ND, null, m_param.AsLong(hDataIm, nDataImOffset, nNumSpatialAxes, nImCount, nChannelAxis, hImShape, hColShape, hKernelShape, hPad, hStride, hDilation, hDataCol, nDataColOffset));
        }
8020
        /// <summary>Dispatches the CUDA_COL2IM kernel folding the column buffer hDataCol back into the image buffer hDataIm (inverse of im2col, same C/H/W and kernel/pad/stride/dilation geometry).</summary>
        public void col2im(long hDataCol, int nDataColOffset, int nChannels, int nHeight, int nWidth, int nKernelH, int nKernelW, int nPadH, int nPadW, int nStrideH, int nStrideW, int nDilationH, int nDilationW, long hDataIm, int nDataImOffset)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COL2IM, null, m_param.AsLong(hDataCol, nDataColOffset, nChannels, nHeight, nWidth, nKernelH, nKernelW, nPadH, nPadW, nStrideH, nStrideW, nDilationH, nDilationW, hDataIm, nDataImOffset));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COL2IM, null, m_param.AsLong(hDataCol, nDataColOffset, nChannels, nHeight, nWidth, nKernelH, nKernelW, nPadH, nPadW, nStrideH, nStrideW, nDilationH, nDilationW, hDataIm, nDataImOffset));
        }
8046
        /// <summary>Dispatches the CUDA_COL2IM_ND kernel: N-dimensional col2im where geometry is supplied as GPU buffers, folding hDataCol back into hDataIm (inverse of im2col_nd).</summary>
        public void col2im_nd(long hDataCol, int nDataColOffset, int nNumSpatialAxes, int nColCount, int nChannelAxis, long hImShape, long hColShape, long hKernelShape, long hPad, long hStride, long hDilation, long hDataIm, int nDataImOffset)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_COL2IM_ND, null, m_param.AsLong(hDataCol, nDataColOffset, nNumSpatialAxes, nColCount, nChannelAxis, hImShape, hColShape, hKernelShape, hPad, hStride, hDilation, hDataIm, nDataImOffset));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_COL2IM_ND, null, m_param.AsLong(hDataCol, nDataColOffset, nNumSpatialAxes, nColCount, nChannelAxis, hImShape, hColShape, hKernelShape, hPad, hStride, hDilation, hDataIm, nDataImOffset));
        }
8070
        /// <summary>Dispatches the CUDA_CHANNEL_MIN kernel over the (outer x channels x inner) layout of hX into hY; when <paramref name="bReturnIdx"/> is set the kernel returns the index of the minimum rather than its value (per the flag passed).</summary>
        public void channel_min(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, bool bReturnIdx = false)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_MIN, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (bReturnIdx) ? 1 : 0));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_MIN, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (bReturnIdx) ? 1 : 0));
        }
8088
        /// <summary>Dispatches the CUDA_CHANNEL_MAX kernel over the (outer x channels x inner) layout of hX into hY; when <paramref name="bReturnIdx"/> is set the kernel returns the index of the maximum rather than its value (per the flag passed).</summary>
        public void channel_max(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, bool bReturnIdx = false)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_MAX, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (bReturnIdx) ? 1 : 0));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_MAX, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (bReturnIdx) ? 1 : 0));
        }
8106
        /// <summary>Dispatches the CUDA_CHANNEL_MEAN kernel computing per-channel means of hX into hY over the (outer x channels x inner) layout.</summary>
        public void channel_mean(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_MEAN, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_MEAN, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
        }
8123
        /// <summary>Dispatches the CUDA_CHANNEL_COMPARE kernel comparing channels of hX, writing results to hY, over the (outer x channels x inner) layout; exact comparison semantics are defined by the native kernel.</summary>
        public void channel_compare(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_COMPARE, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_COMPARE, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
        }
8140
        /// <summary>Dispatches the CUDA_CHANNEL_FILLFROM kernel copying/filling between hX and hY over the (outer x channels x inner) layout; <paramref name="dir"/> (FWD/BWD) selects the direction.</summary>
        public void channel_fillfrom(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, DIR dir)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_FILLFROM, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (int)dir));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_FILLFROM, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (int)dir));
        }
8159
        /// <summary>Dispatches the CUDA_CHANNEL_FILL kernel filling hY from hX using the label buffer hLabels (label dimension <paramref name="nLabelDim"/>) over the (outer x channels x inner) layout.</summary>
        public void channel_fill(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, int nLabelDim, long hLabels, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_FILL, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, nLabelDim, hLabels, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_FILL, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, nLabelDim, hLabels, hY));
        }
8186
        /// <summary>Dispatches the CUDA_CHANNEL_SUB kernel with an extra operand buffer hA.
        /// NOTE(review): hA is marshalled LAST (after hX and hY) even though it is the first handle parameter — this ordering is the native kernel's contract; do not "fix" it.</summary>
        public void channel_sub(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hA, long hX, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_SUB, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, hA));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_SUB, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, hA));
        }
8204
        /// <summary>Dispatches the CUDA_CHANNEL_SUB kernel (two-buffer form) subtracting across the (outer x channels x inner) layout of hX into hY.</summary>
        public void channel_sub(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_SUB, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_SUB, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
        }
8221
        /// <summary>Dispatches the CUDA_CHANNEL_SUM kernel summing hX into hY over the (outer x channels x inner) layout; <paramref name="bSumAcrossChannels"/> selects the reduction axis, <paramref name="dir"/> the FWD/BWD direction, and <paramref name="nChannelsY"/> (default -1) an optional output channel count.</summary>
        public void channel_sum(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, bool bSumAcrossChannels = true, DIR dir = DIR.FWD, int nChannelsY = -1)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_SUM, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (bSumAcrossChannels) ? 1 : 0, (int)dir, nChannelsY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_SUM, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, (bSumAcrossChannels) ? 1 : 0, (int)dir, nChannelsY));
        }
8243
        /// <summary>Dispatches the CUDA_CHANNEL_DIV kernel dividing across the (outer x channels x inner) layout of hX into hY; <paramref name="nMethod"/> selects the kernel's division variant (default 1).</summary>
        public void channel_div(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, int nMethod = 1)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_DIV, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, nMethod));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_DIV, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, nMethod));
        }
8261
        /// <summary>Dispatches the CUDA_CHANNEL_MUL kernel multiplying across the (outer x channels x inner) layout of hX into hY; <paramref name="nMethod"/> selects the kernel's multiplication variant (default 1).</summary>
        public void channel_mul(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, int nMethod = 1)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_MUL, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, nMethod));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_MUL, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY, nMethod));
        }
8279
        /// <summary>Dispatches the CUDA_CHANNEL_MULV kernel multiplying hA by the vector hX into hC over the (outer x channels x inner) layout.</summary>
        public void channel_mulv(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hA, long hX, long hC)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_MULV, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hA, hX, hC));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_MULV, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hA, hX, hC));
        }
8297
        /// <summary>Dispatches the CUDA_CHANNEL_SCALE kernel scaling hX by hA into hY over the (outer x channels x inner) layout.</summary>
        public void channel_scale(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hA, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_SCALE, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hA, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_SCALE, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hA, hY));
        }
8315
        /// <summary>Dispatches the CUDA_CHANNEL_DOT kernel computing per-channel dot products of hX and hA into hY over the (outer x channels x inner) layout.</summary>
        public void channel_dot(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hA, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_DOT, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hA, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_DOT, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hA, hY));
        }
8333
        /// <summary>Dispatches the CUDA_CHANNEL_DUP kernel duplicating hX across channels into hY over the (outer x channels x inner) layout.</summary>
        public void channel_duplicate(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_DUP, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_DUP, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
        }
8350
        /// <summary>Dispatches the CUDA_CHANNEL_PERCENTILE kernel computing the <paramref name="dfPercentile"/> percentile of hX into hY over the (outer x channels x inner) layout; the percentile is passed through the scalar value channel.</summary>
        public void channel_percentile(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY, double dfPercentile)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_PERCENTILE, m_param.AsDouble(dfPercentile), m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_PERCENTILE, m_param.AsFloat((float)dfPercentile), m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
        }
8368
8382 public void channel_op_fwd(OP op, int nCount, int nC, int nN1, int nSD1, int nN2, int nSD2, long hA, long hB, long hY)
8383 {
8384 int nCount1 = Math.Max(nN1, nN2) * nC * Math.Max(nSD1, nSD2);
8385 if (nCount1 != nCount)
8386 throw new Exception("The nCount must equal max(nN1, nN2) x nC x max(nSD1, nSD2).");
8387
8388 if (m_dt == DataType.DOUBLE)
8389 m_cuda.RunDoubleEx2((int) m_hKernel, (int) CUDAFN.CUDA_CHANNEL_OP_FWD, null, m_param.AsLong((int)op, nCount, nC, nN1, nSD1, nN2, nSD2, hA, hB, hY));
8390 else
8391 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_OP_FWD, null, m_param.AsLong((int)op, nCount, nC, nN1, nSD1, nN2, nSD2, hA, hB, hY));
8392 }
8393
8413 public void channel_op_bwd(OP op, int nCount, int nC, int nN1, int nSD1, int nN2, int nSD2, int nCy,int nSDy, long hA, long hB, long hY, long hAd, long hBd, long hYd, long hWork)
8414 {
8415 int nCount1 = Math.Max(nN1, nN2) * nC * Math.Max(nSD1, nSD2);
8416 if (nCount1 != nCount)
8417 throw new Exception("The nCount must equal max(nN1, nN2) x nC x max(nSD1, nSD2).");
8418
8419 if (m_dt == DataType.DOUBLE)
8420 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_OP_BWD, null, m_param.AsLong((int)op, nCount, nC, nN1, nSD1, nN2, nSD2, nCy, nSDy, hA, hB, hY, hAd, hBd, hYd, hWork));
8421 else
8422 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_OP_BWD, null, m_param.AsLong((int)op, nCount, nC, nN1, nSD1, nN2, nSD2, nCy, nSDy, hA, hB, hY, hAd, hBd, hYd, hWork));
8423 }
8424
        /// <summary>Dispatches the CUDA_CHANNEL_ADD kernel adding hX into hY across <paramref name="nBlocks"/> channel blocks at <paramref name="nOffset"/>; <paramref name="dir"/> (FWD/BWD) selects the direction.</summary>
        public void channel_add(int nCount, int nOuterNum, int nChannels, int nBlocks, int nInnerNum, int nOffset, long hX, long hY, DIR dir)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_ADD, null, m_param.AsLong(nCount, nOuterNum, nChannels, nBlocks, nInnerNum, nOffset, hX, hY, (int)dir));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_ADD, null, m_param.AsLong(nCount, nOuterNum, nChannels, nBlocks, nInnerNum, nOffset, hX, hY, (int)dir));
        }
8444
        /// <summary>Dispatches the CUDA_CHANNEL_COPY kernel copying between hX and hY across <paramref name="nBlocks"/> channel blocks at <paramref name="nOffset"/>; <paramref name="dir"/> (FWD/BWD) selects the copy direction.</summary>
        public void channel_copy(int nCount, int nOuterNum, int nChannels, int nBlocks, int nInnerNum, int nOffset, long hX, long hY, DIR dir)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_COPY, null, m_param.AsLong(nCount, nOuterNum, nChannels, nBlocks, nInnerNum, nOffset, hX, hY, (int)dir));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_COPY, null, m_param.AsLong(nCount, nOuterNum, nChannels, nBlocks, nInnerNum, nOffset, hX, hY, (int)dir));
        }
8464
        /// <summary>Dispatches the CUDA_CHANNEL_COPYALL kernel copying all channels of hX into hY over the (outer x channels x inner) layout.</summary>
        public void channel_copyall(int nCount, int nOuterNum, int nChannels, int nInnerNum, long hX, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_COPYALL, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_CHANNEL_COPYALL, null, m_param.AsLong(nCount, nOuterNum, nChannels, nInnerNum, hX, hY));
        }
8481
8482
        /// <summary>Dispatches the CUDA_SUM kernel summing hX into hY over an (outer x inner) layout (<paramref name="nCount"/> total elements).</summary>
        public void sum(int nCount, int nOuterNum, int nInnerNum, long hX, long hY)
        {
            if (m_dt == DataType.DOUBLE)
                m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_SUM, null, m_param.AsLong(nCount, nOuterNum, nInnerNum, hX, hY));
            else
                m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_SUM, null, m_param.AsLong(nCount, nOuterNum, nInnerNum, hX, hY));
        }
8498
/// <summary>
/// Run the CUDA_RNG_SETSEED kernel function to seed the random number generator.
/// </summary>
/// <param name="lSeed">Specifies the seed value.</param>
/// <remarks>
/// NOTE(review): the seed is marshalled as a floating-point value (AsDouble/AsFloat),
/// so seeds larger than 2^53 (double mode) or 2^24 (float mode) lose precision -
/// confirm this is acceptable for the DLL-side contract.
/// </remarks>
public void rng_setseed(long lSeed)
{
    int nKernel = (int)m_hKernel;
    int nFn = (int)CUDAFN.CUDA_RNG_SETSEED;

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDouble(nKernel, nFn, m_param.AsDouble(lSeed));
    else
        m_cuda.RunFloat(nKernel, nFn, m_param.AsFloat(lSeed));
}
8513
/// <summary>
/// Convenience overload of <see cref="rng_uniform(int, T, T, long)"/> taking double bounds.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fMin">Specifies the lower bound.</param>
/// <param name="fMax">Specifies the upper bound.</param>
/// <param name="hY">Specifies a handle to the destination data in GPU memory.</param>
public void rng_uniform(int n, double fMin, double fMax, long hY)
{
    T tMin = (T)Convert.ChangeType(fMin, typeof(T));
    T tMax = (T)Convert.ChangeType(fMax, typeof(T));

    rng_uniform(n, tMin, tMax, hY);
}
8528
/// <summary>
/// Convenience overload of <see cref="rng_uniform(int, T, T, long)"/> taking float bounds.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fMin">Specifies the lower bound.</param>
/// <param name="fMax">Specifies the upper bound.</param>
/// <param name="hY">Specifies a handle to the destination data in GPU memory.</param>
public void rng_uniform(int n, float fMin, float fMax, long hY)
{
    T tMin = (T)Convert.ChangeType(fMin, typeof(T));
    T tMax = (T)Convert.ChangeType(fMax, typeof(T));

    rng_uniform(n, tMin, tMax, hY);
}
8543
/// <summary>
/// Run the CUDA_RNG_UNIFORM kernel function to fill GPU memory with uniform random values.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fMin">Specifies the lower bound.</param>
/// <param name="fMax">Specifies the upper bound.</param>
/// <param name="hY">Specifies a handle to the destination data in GPU memory.</param>
public void rng_uniform(int n, T fMin, T fMax, long hY)
{
    // Skip the kernel call entirely when ghost memory is active.
    if (m_rgGhostMemory != null && m_bGhostMemoryEnabled)
        return;

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_RNG_UNIFORM, m_param.AsDouble(convertD(fMin), convertD(fMax)), m_param.AsLong(n, 0, 0, hY));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_RNG_UNIFORM, m_param.AsFloat(convertF(fMin), convertF(fMax)), m_param.AsLong(n, 0, 0, hY));
}
8567
/// <summary>
/// Convenience overload of <see cref="rng_gaussian(int, T, T, long)"/> taking double parameters.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fMu">Specifies the mean.</param>
/// <param name="fSigma">Specifies the standard deviation.</param>
/// <param name="hY">Specifies a handle to the destination data in GPU memory.</param>
public void rng_gaussian(int n, double fMu, double fSigma, long hY)
{
    T tMu = (T)Convert.ChangeType(fMu, typeof(T));
    T tSigma = (T)Convert.ChangeType(fSigma, typeof(T));

    rng_gaussian(n, tMu, tSigma, hY);
}
8582
/// <summary>
/// Convenience overload of <see cref="rng_gaussian(int, T, T, long)"/> taking float parameters.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fMu">Specifies the mean.</param>
/// <param name="fSigma">Specifies the standard deviation.</param>
/// <param name="hY">Specifies a handle to the destination data in GPU memory.</param>
public void rng_gaussian(int n, float fMu, float fSigma, long hY)
{
    T tMu = (T)Convert.ChangeType(fMu, typeof(T));
    T tSigma = (T)Convert.ChangeType(fSigma, typeof(T));

    rng_gaussian(n, tMu, tSigma, hY);
}
8597
/// <summary>
/// Run the CUDA_RNG_GAUSSIAN kernel function to fill GPU memory with gaussian random values.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fMu">Specifies the mean.</param>
/// <param name="fSigma">Specifies the standard deviation.</param>
/// <param name="hY">Specifies a handle to the destination data in GPU memory.</param>
public void rng_gaussian(int n, T fMu, T fSigma, long hY)
{
    // Skip the kernel call entirely when ghost memory is active.
    if (m_rgGhostMemory != null && m_bGhostMemoryEnabled)
        return;

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_RNG_GAUSSIAN, m_param.AsDouble(convertD(fMu), convertD(fSigma)), m_param.AsLong(n, 0, 0, hY));
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_RNG_GAUSSIAN, m_param.AsFloat(convertF(fMu), convertF(fSigma)), m_param.AsLong(n, 0, 0, hY));
}
8621
/// <summary>
/// Convenience overload of <see cref="rng_bernoulli(int, T, long)"/> taking a double probability.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fNonZeroProb">Specifies the probability of a non-zero value.</param>
/// <param name="hY">Specifies a handle to the destination data in GPU memory.</param>
public void rng_bernoulli(int n, double fNonZeroProb, long hY)
{
    T tProb = (T)Convert.ChangeType(fNonZeroProb, typeof(T));

    rng_bernoulli(n, tProb, hY);
}
8635
/// <summary>
/// Convenience overload of <see cref="rng_bernoulli(int, T, long)"/> taking a float probability.
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fNonZeroProb">Specifies the probability of a non-zero value.</param>
/// <param name="hY">Specifies a handle to the destination data in GPU memory.</param>
public void rng_bernoulli(int n, float fNonZeroProb, long hY)
{
    T tProb = (T)Convert.ChangeType(fNonZeroProb, typeof(T));

    rng_bernoulli(n, tProb, hY);
}
8649
/// <summary>
/// Fill the memory at <paramref name="hY"/> with bernoulli random values, computed on
/// the CPU then copied back to GPU memory (no GPU kernel is invoked).
/// </summary>
/// <param name="n">Specifies the number of items.</param>
/// <param name="fNonZeroProb">Specifies the probability of a non-zero value.</param>
/// <param name="hY">Specifies a handle to the destination data in GPU memory.</param>
/// <remarks>
/// NOTE(review): <paramref name="n"/> is not used - the entire buffer at
/// <paramref name="hY"/> is filled regardless. Confirm this is intended.
/// </remarks>
public void rng_bernoulli(int n, T fNonZeroProb, long hY)
{
    T[] rgData = GetMemory(hY);

    fill_random(fNonZeroProb, rgData);
    SetMemory(hY, rgData);
}
8670
8671#pragma warning disable 1591
8672
/// <summary>
/// Fill the array with bernoulli draws: each element becomes one with probability
/// <paramref name="fNonZeroProb"/> and zero otherwise, using the CPU-side RNG.
/// </summary>
/// <param name="fNonZeroProb">Specifies the probability of a non-zero value.</param>
/// <param name="rg">Specifies the array to fill in place.</param>
public void fill_random(T fNonZeroProb, T[] rg)
{
    double dfThreshold = Utility.ConvertVal<T>(fNonZeroProb);

    for (int i = 0; i < rg.Length; i++)
        rg[i] = (m_random.NextDouble() <= dfThreshold) ? m_tOne : m_tZero;
}
8683
8684#pragma warning restore 1591
8685
8686
/// <summary>
/// Run the CUDA_ACCURACY_FWD kernel function on the GPU.
/// </summary>
/// <param name="nCount">Specifies the number of items to process.</param>
/// <param name="nOuterNum">Specifies the outer dimension.</param>
/// <param name="nInnerNum">Specifies the inner dimension.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="hBottomLabel">Specifies a handle to the bottom label data in GPU memory.</param>
/// <param name="hAccData">Specifies a handle to the accuracy data in GPU memory.</param>
/// <param name="hAccTotals">Specifies a handle to the accuracy totals in GPU memory.</param>
/// <param name="nIgnoreLabel">Optionally specifies a label to ignore (appended to the argument list only when set).</param>
/// <param name="bLastElementOnly">Specifies whether to process the last element only.</param>
/// <param name="nBatch">Specifies the batch size.</param>
public void accuracy_fwd(int nCount, int nOuterNum, int nInnerNum, long hBottomData, long hBottomLabel, long hAccData, long hAccTotals, int? nIgnoreLabel, bool bLastElementOnly, int nBatch)
{
    // Build the argument list once - it is identical for both data types; the
    // optional ignore-label is appended as a trailing argument only when present.
    List<long> rgArg = new List<long>() { nCount, nOuterNum, nInnerNum, hBottomData, hBottomLabel, hAccData, hAccTotals, bLastElementOnly ? 1 : 0, nBatch };

    if (nIgnoreLabel.HasValue)
        rgArg.Add(nIgnoreLabel.Value);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_ACCURACY_FWD, null, rgArg.ToArray());
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_ACCURACY_FWD, null, rgArg.ToArray());
}
8717
8718
/// <summary>
/// Run the CUDA_BATCHREIDX_FWD kernel function on the GPU.
/// </summary>
/// <param name="nCount">Specifies the number of items to process.</param>
/// <param name="nInnerDim">Specifies the inner dimension.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="hPermutData">Specifies a handle to the permutation data in GPU memory.</param>
/// <param name="hTopData">Specifies a handle to the top data in GPU memory.</param>
public void batchreidx_fwd(int nCount, int nInnerDim, long hBottomData, long hPermutData, long hTopData)
{
    long[] rgArg = m_param.AsLong(nCount, nInnerDim, hBottomData, hPermutData, hTopData);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_BATCHREIDX_FWD, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_BATCHREIDX_FWD, null, rgArg);
}
8734
/// <summary>
/// Run the CUDA_BATCHREIDX_BWD kernel function on the GPU.
/// </summary>
/// <param name="nCount">Specifies the number of items to process.</param>
/// <param name="nInnerDim">Specifies the inner dimension.</param>
/// <param name="hTopDiff">Specifies a handle to the top diff in GPU memory.</param>
/// <param name="hTopIdx">Specifies a handle to the top index data in GPU memory.</param>
/// <param name="hBegins">Specifies a handle to the begins data in GPU memory.</param>
/// <param name="hCounts">Specifies a handle to the counts data in GPU memory.</param>
/// <param name="hBottomDiff">Specifies a handle to the bottom diff in GPU memory.</param>
public void batchreidx_bwd(int nCount, int nInnerDim, long hTopDiff, long hTopIdx, long hBegins, long hCounts, long hBottomDiff)
{
    long[] rgArg = m_param.AsLong(nCount, nInnerDim, hTopDiff, hTopIdx, hBegins, hCounts, hBottomDiff);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_BATCHREIDX_BWD, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_BATCHREIDX_BWD, null, rgArg);
}
8752
/// <summary>
/// Run the CUDA_EMBED_FWD kernel function on the GPU.
/// </summary>
/// <param name="nCount">Specifies the number of items to process.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="hWeight">Specifies a handle to the weight data in GPU memory.</param>
/// <param name="nM">Specifies the M dimension.</param>
/// <param name="nN">Specifies the N dimension.</param>
/// <param name="nK">Specifies the K dimension.</param>
/// <param name="hTopData">Specifies a handle to the top data in GPU memory.</param>
public void embed_fwd(int nCount, long hBottomData, long hWeight, int nM, int nN, int nK, long hTopData)
{
    long[] rgArg = m_param.AsLong(nCount, hBottomData, hWeight, nM, nN, nK, hTopData);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_EMBED_FWD, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_EMBED_FWD, null, rgArg);
}
8770
/// <summary>
/// Run the CUDA_EMBED_BWD kernel function on the GPU.
/// </summary>
/// <param name="nCount">Specifies the number of items to process.</param>
/// <param name="hBottomData">Specifies a handle to the bottom data in GPU memory.</param>
/// <param name="hTopDiff">Specifies a handle to the top diff in GPU memory.</param>
/// <param name="nM">Specifies the M dimension.</param>
/// <param name="nN">Specifies the N dimension.</param>
/// <param name="nK">Specifies the K dimension.</param>
/// <param name="hWeightDiff">Specifies a handle to the weight diff in GPU memory.</param>
public void embed_bwd(int nCount, long hBottomData, long hTopDiff, int nM, int nN, int nK, long hWeightDiff)
{
    long[] rgArg = m_param.AsLong(nCount, hBottomData, hTopDiff, nM, nN, nK, hWeightDiff);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_EMBED_BWD, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_EMBED_BWD, null, rgArg);
}
8788
/// <summary>
/// Run the CUDA_POOL_FWD kernel function on the GPU using the pooling method
/// specified (MAX, AVE, STO_TRAIN or STO_TEST).
/// </summary>
public void pooling_fwd(POOLING_METHOD method, int nCount, long hBottomData, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hTopData, long hMask, long hTopMask)
{
    // The pooling method is passed as the leading kernel argument.
    long[] rgArg = m_param.AsLong((int)method, nCount, hBottomData, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hTopData, hMask, hTopMask);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_POOL_FWD, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_POOL_FWD, null, rgArg);
}
8817
/// <summary>
/// Run the CUDA_POOL_BWD kernel function on the GPU using the pooling method
/// specified (MAX, AVE, STO_TRAIN or STO_TEST).
/// </summary>
public void pooling_bwd(POOLING_METHOD method, int nCount, long hTopDiff, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hBottomDiff, long hMask, long hTopMask)
{
    // The pooling method is passed as the leading kernel argument.
    long[] rgArg = m_param.AsLong((int)method, nCount, hTopDiff, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hBottomDiff, hMask, hTopMask);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_POOL_BWD, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_POOL_BWD, null, rgArg);
}
8846
/// <summary>
/// Run the CUDA_UNPOOL_FWD kernel function on the GPU using the pooling method
/// specified (MAX, AVE, STO_TRAIN or STO_TEST).
/// </summary>
public void unpooling_fwd(POOLING_METHOD method, int nCount, long hBottomData, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hTopData, long hMask)
{
    // The pooling method is passed as the leading kernel argument.
    long[] rgArg = m_param.AsLong((int)method, nCount, hBottomData, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hTopData, hMask);

    if (m_dt == DataType.DOUBLE)
        m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_UNPOOL_FWD, null, rgArg);
    else
        m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_UNPOOL_FWD, null, rgArg);
}
8874
8895 public void unpooling_bwd(POOLING_METHOD method, int nCount, long hTopDiff, int num, int nChannels, int nHeight, int nWidth, int nPooledHeight, int nPooledWidth, int nKernelH, int nKernelW, int nStrideH, int nStrideW, int nPadH, int nPadW, long hBottomDiff, long hMask)
8896 {
8897 if (m_dt == DataType.DOUBLE)
8898 m_cuda.RunDoubleEx2((int)m_hKernel, (int)CUDAFN.CUDA_UNPOOL_BWD, null, m_param.AsLong((int)method, nCount, hTopDiff, num, nChannels, nHeight, nWidth, nPooledHeight, nPooledWidth, nKernelH, nKernelW, nStrideH, nStrideW, nPadH, nPadW, hBottomDiff, hMask));
8899 else
8900 m_cuda.RunFloatEx2((int)m_hKernel, (int)CUDAFN.CUDA_UNPOOL_BWD, null, m_param.AsLong((int)method, nCount, hTopDiff, num, nChannels, nHeight