MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
TransformerBlockLayer.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using MyCaffe.basecode;
using MyCaffe.common;
using MyCaffe.param;
using MyCaffe.fillers;
using System.Diagnostics;
using MyCaffe.param.gpt;
using System.Runtime.InteropServices.WindowsRuntime;

namespace MyCaffe.layers.gpt
{
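    /// <summary>
    /// The TransformerBlockLayer provides a generic transformer block.
    /// </summary>
    /// <typeparam name="T">Specifies the base type of <i>float</i> or <i>double</i>.</typeparam>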
    public class TransformerBlockLayer<T> : Layer<T>
    {
        Blob<T> m_blobLn1;
        Blob<T> m_blobAttn1;
        Blob<T> m_blobLn2;
        Blob<T> m_blobAttn2 = null;
        Blob<T> m_blobLn3 = null;
        Blob<T> m_blobMlp;
        Blob<T> m_blobMlpOut;
        Blob<T> m_blobX = null;
        Layer<T> m_ln1;             // Input layer normalization.
        Layer<T> m_attn1;           // Attention block used with the encoder and decoder.
        Layer<T> m_ln2;             // Layer normalization after the first attention block.
        Layer<T> m_attn2 = null;    // Attention block used with the decoder only.
        Layer<T> m_ln3 = null;      // Layer normalization after the second attention block, used with the decoder only.
        // MLP block
        Layer<T> m_fc;              // Initial linear layer.
        Layer<T> m_proj;            // Projection layer.
        Layer<T> m_act;             // Activation layer.
        Layer<T> m_dropout = null;  // Residual dropout layer.

        BlobCollection<T> m_colInternalBottom = new BlobCollection<T>();
        BlobCollection<T> m_colInternalTop = new BlobCollection<T>();

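        /// <summary>
        /// The TransformerBlock constructor.
        /// </summary>
        /// <param name="cuda">Specifies the CudaDnn connection to Cuda.</param>
        /// <param name="log">Specifies the Log for output.</param>
        /// <param name="p">Specifies the LayerParameter, which uses the transformer_block_param settings.</param>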
        public TransformerBlockLayer(CudaDnn<T> cuda, Log log, LayerParameter p)
            : base(cuda, log, p)
        {
            CancelEvent evtCancel = new CancelEvent();

            m_type = LayerParameter.LayerType.TRANSFORMER_BLOCK;

            m_blobLn1 = new Blob<T>(cuda, log);
            m_blobLn1.Name = m_param.name + " ln1";
            m_blobAttn1 = new Blob<T>(cuda, log);
            m_blobAttn1.Name = m_param.name + " attn1";
            m_blobLn2 = new Blob<T>(cuda, log);
            m_blobLn2.Name = m_param.name + " ln2";
            m_blobMlp = new Blob<T>(cuda, log);
            m_blobMlp.Name = m_param.name + " mlp";
            m_blobMlpOut = new Blob<T>(cuda, log);
            m_blobMlpOut.Name = m_param.name + " mlp_out";
            m_blobX = new Blob<T>(cuda, log);
            m_blobX.Name = m_param.name + " xB";

            LayerParameter ln1 = new LayerParameter(LayerParameter.LayerType.LAYERNORM, p.name + ".ln1");
            ln1.layer_norm_param.enable_cuda_impl = p.transformer_block_param.enable_layernorm_cuda_impl;
            m_ln1 = Layer<T>.Create(cuda, log, convertLayerParam(ln1, p), evtCancel) as Layer<T>;

            LayerParameter ln2 = new LayerParameter(LayerParameter.LayerType.LAYERNORM, p.name + ".ln2");
            ln2.layer_norm_param.enable_cuda_impl = p.transformer_block_param.enable_layernorm_cuda_impl;
            m_ln2 = Layer<T>.Create(cuda, log, convertLayerParam(ln2, p), evtCancel) as Layer<T>;

            if (p.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.CAUSAL_SELF_ATTENTION)
            {
                LayerParameter attn = new LayerParameter(LayerParameter.LayerType.CAUSAL_SELF_ATTENTION, p.name + ".attn");
                attn.parameters.Add((m_param.parameters.Count > 0) ? m_param.parameters[0] : new ParamSpec(1.0, 1.0));
                attn.parameters.Add((m_param.parameters.Count > 1) ? m_param.parameters[1] : new ParamSpec(1.0, 0.0));
                m_attn1 = Layer<T>.Create(cuda, log, convertLayerParam(attn, p), evtCancel);
            }
            else if (p.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.ENCODER)
            {
                LayerParameter attn = new LayerParameter(LayerParameter.LayerType.MULTIHEAD_ATTENTION, p.name + ".attn");
                attn.parameters.Add((m_param.parameters.Count > 0) ? m_param.parameters[0] : new ParamSpec(1.0, 1.0));
                attn.parameters.Add((m_param.parameters.Count > 1) ? m_param.parameters[1] : new ParamSpec(1.0, 0.0));
                m_attn1 = Layer<T>.Create(cuda, log, convertLayerParam(attn, p), evtCancel);
            }
            else if (p.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
            {
                m_blobAttn2 = new Blob<T>(cuda, log);
                m_blobAttn2.Name = m_param.name + " attn2";
                m_blobLn3 = new Blob<T>(cuda, log);
                m_blobLn3.Name = m_param.name + " ln3";

                LayerParameter ln3 = new LayerParameter(LayerParameter.LayerType.LAYERNORM, p.name + ".ln3");
                ln3.layer_norm_param.enable_cuda_impl = p.transformer_block_param.enable_layernorm_cuda_impl;
                m_ln3 = Layer<T>.Create(cuda, log, convertLayerParam(ln3, p), evtCancel) as Layer<T>;

                LayerParameter attn1 = new LayerParameter(LayerParameter.LayerType.MULTIHEAD_ATTENTION, p.name + ".attn1");
                attn1.parameters.Add((m_param.parameters.Count > 0) ? m_param.parameters[0] : new ParamSpec(1.0, 1.0));
                attn1.parameters.Add((m_param.parameters.Count > 1) ? m_param.parameters[1] : new ParamSpec(1.0, 0.0));
                m_attn1 = Layer<T>.Create(cuda, log, convertLayerParam(attn1, p), evtCancel);

                LayerParameter attn2 = new LayerParameter(LayerParameter.LayerType.MULTIHEAD_ATTENTION, p.name + ".attn2");
                attn2.parameters.Add((m_param.parameters.Count > 0) ? m_param.parameters[0] : new ParamSpec(1.0, 1.0));
                attn2.parameters.Add((m_param.parameters.Count > 1) ? m_param.parameters[1] : new ParamSpec(1.0, 0.0));
                m_attn2 = Layer<T>.Create(cuda, log, convertLayerParam(attn2, p), evtCancel);
            }
            else
            {
                throw new Exception("The block type '" + p.transformer_block_param.block_type.ToString() + "' is not supported!");
            }

            LayerParameter fc = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT, p.name + ".fc");
            // Assumed condition: GPT-style initialization is used with the causal self-attention block type.
            if (p.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.CAUSAL_SELF_ATTENTION)
            {
                fc.inner_product_param.weight_filler = new FillerParameter("gaussian", 0, 0, 0.02);
                fc.inner_product_param.bias_filler = new FillerParameter("constant", 0.0);
            }
            else
            {
                // Assumed default initialization for the encoder/decoder path.
                fc.inner_product_param.weight_filler = new FillerParameter("xavier");
                fc.inner_product_param.bias_filler = new FillerParameter("constant", 0.0);
            }
            fc.parameters.Add((m_param.parameters.Count > 0) ? m_param.parameters[0] : new ParamSpec(1.0, 1.0));
            fc.parameters.Add((m_param.parameters.Count > 1) ? m_param.parameters[1] : new ParamSpec(1.0, 0.0));
            m_fc = Layer<T>.Create(cuda, log, convertLayerParam(fc, p), evtCancel);

            LayerParameter proj = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT, p.name + ".proj");
            proj.inner_product_param.axis = 2;
            proj.inner_product_param.bias_term = true;
            // Assumed condition: GPT-style initialization is used with the causal self-attention block type.
            if (p.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.CAUSAL_SELF_ATTENTION)
            {
                // Apply the special scaled init to the residual projections, per the GPT-2 paper.
                proj.inner_product_param.weight_filler = new FillerParameter("gaussian", 0, 0, 0.02 / Math.Sqrt(2 * m_param.transformer_block_param.layers));
                proj.inner_product_param.bias_filler = new FillerParameter("constant", 0.0);
            }
            else
            {
                // Assumed default initialization for the encoder/decoder path.
                proj.inner_product_param.weight_filler = new FillerParameter("xavier");
                proj.inner_product_param.bias_filler = new FillerParameter("constant", 0.0);
            }
            proj.parameters.Add((m_param.parameters.Count > 0) ? m_param.parameters[0] : new ParamSpec(1.0, 1.0));
            proj.parameters.Add((m_param.parameters.Count > 1) ? m_param.parameters[1] : new ParamSpec(1.0, 0.0));
            m_proj = Layer<T>.Create(cuda, log, convertLayerParam(proj, p), evtCancel);

            // ReLU has a very similar curve to GELU, and is faster.
            LayerParameter.LayerType actType = LayerParameter.LayerType.RELU;
            bool? bEnableBert = null;

            if (p.transformer_block_param.activation == param.gpt.TransformerBlockParameter.ACTIVATION.GELU_BERT)
            {
                actType = LayerParameter.LayerType.GELU;
                bEnableBert = true;
            }
            else if (p.transformer_block_param.activation == param.gpt.TransformerBlockParameter.ACTIVATION.GELU)
            {
                actType = LayerParameter.LayerType.GELU;
                bEnableBert = false;
            }

            LayerParameter act = new LayerParameter(actType, p.name + ".act");
            if (bEnableBert.HasValue)
                act.gelu_param.enable_bert_version = bEnableBert.Value;

            m_act = Layer<T>.Create(cuda, log, convertLayerParam(act, p), evtCancel);

            if (p.transformer_block_param.resid_dropout > 0)
            {
                LayerParameter dropout = new LayerParameter(LayerParameter.LayerType.DROPOUT, p.name + ".drop");
                dropout.dropout_param.dropout_ratio = p.transformer_block_param.resid_dropout;
                m_dropout = Layer<T>.Create(cuda, log, convertLayerParam(dropout, p), evtCancel);
            }

            setup_internal_blobs(m_colInternalBlobs);
        }

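        /// <summary>
        /// Releases all GPU and host resources used by the Layer.
        /// </summary>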
        protected override void dispose()
        {
            dispose(ref m_blobLn1);
            dispose(ref m_blobAttn1);
            dispose(ref m_blobLn2);
            dispose(ref m_blobAttn2);
            dispose(ref m_blobLn3);
            dispose(ref m_blobMlp);
            dispose(ref m_blobMlpOut);
            dispose(ref m_blobX);

            dispose(ref m_ln1);
            dispose(ref m_attn1);
            dispose(ref m_ln2);
            dispose(ref m_attn2);
            dispose(ref m_ln3);
            dispose(ref m_fc);
            dispose(ref m_proj);
            dispose(ref m_act);
            dispose(ref m_dropout);

            base.dispose();
        }

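        /// <summary>
        /// Derivative layers should add all internal blobs to the 'col' provided.
        /// </summary>
        /// <param name="col">Specifies the collection to which the internal blobs are added.</param>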
        protected override void setup_internal_blobs(BlobCollection<T> col)
        {
            if (col.Count > 0)
                return;

            col.Add(m_blobLn1);
            col.Add(m_blobAttn1);
            col.Add(m_blobLn2);
            if (m_blobAttn2 != null)
                col.Add(m_blobAttn2);
            col.Add(m_blobX);
            if (m_blobLn3 != null)
                col.Add(m_blobLn3);
            col.Add(m_blobMlp);
            col.Add(m_blobMlpOut);

            col.Add(m_ln1.internal_blobs);
            col.Add(m_attn1.internal_blobs);
            col.Add(m_ln2.internal_blobs);
            if (m_attn2 != null)
                col.Add(m_attn2.internal_blobs);
            if (m_ln3 != null)
                col.Add(m_ln3.internal_blobs);
            col.Add(m_fc.internal_blobs);
            col.Add(m_act.internal_blobs);
            col.Add(m_proj.internal_blobs);
            if (m_dropout != null)
                col.Add(m_dropout.internal_blobs);
        }

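        /// <summary>
        /// Returns the minimum number of required bottom (input) Blobs: input.
        /// </summary>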
        public override int MinBottomBlobs
        {
            get { return 1; }
        }

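        /// <summary>
        /// Returns the maximum number of required bottom (input) Blobs: input, e_mask (when ENCODER);
        /// input, d_mask, e_output, e_mask (when DECODER); otherwise input.
        /// </summary>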
        public override int MaxBottomBlobs
        {
            get
            {
                switch (m_param.transformer_block_param.block_type)
                {
                    case TransformerBlockParameter.BLOCK_TYPE.ENCODER:
                        return 2;

                    case TransformerBlockParameter.BLOCK_TYPE.DECODER:
                        return 4;

                    default:
                        return 1;
                }
            }
        }

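        /// <summary>
        /// Returns the exact number of required top (output) Blobs: trans.
        /// </summary>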
        public override int ExactNumTopBlobs
        {
            get { return 1; }
        }

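        /// <summary>
        /// Re-initialize the parameters of the layer.
        /// </summary>
        /// <param name="target">Specifies the type of weight to target in the re-initialization.</param>
        /// <returns>Returns <i>true</i> after the internal layers have been re-initialized.</returns>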
        public override bool ReInitializeParameters(WEIGHT_TARGET target)
        {
            base.ReInitializeParameters(target);

            m_ln1.ReInitializeParameters(target);
            m_attn1.ReInitializeParameters(target);
            m_ln2.ReInitializeParameters(target);
            if (m_attn2 != null)
                m_attn2.ReInitializeParameters(target);
            if (m_ln3 != null)
                m_ln3.ReInitializeParameters(target);
            m_fc.ReInitializeParameters(target);
            m_proj.ReInitializeParameters(target);

            return true;
        }

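        /// <summary>
        /// Set the internal bottom and top collections to a single bottom and top blob before running one of the internal layers.
        /// </summary>
        /// <param name="bottom">Specifies the bottom (input) blob.</param>
        /// <param name="top">Specifies the top (output) blob.</param>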
        private void addInternal(Blob<T> bottom, Blob<T> top)
        {
            m_colInternalBottom.Clear();
            m_colInternalBottom.Add(bottom);

            m_colInternalTop.Clear();
            m_colInternalTop.Add(top);
        }

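        /// <summary>
        /// Set the internal bottom collection to a list of bottom blobs and the internal top collection to a single top blob before running one of the internal layers.
        /// </summary>
        /// <param name="rgBottom">Specifies the list of bottom (input) blobs.</param>
        /// <param name="top">Specifies the top (output) blob.</param>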
        private void addInternal(List<Blob<T>> rgBottom, Blob<T> top)
        {
            m_colInternalBottom.Clear();

            for (int i = 0; i < rgBottom.Count; i++)
            {
                m_colInternalBottom.Add(rgBottom[i]);
            }

            m_colInternalTop.Clear();
            m_colInternalTop.Add(top);
        }

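        /// <summary>
        /// Setup the layer.
        /// </summary>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs: x and, depending on the block type, the attention mask(s) and encoder output.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs: trans.</param>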
        public override void LayerSetUp(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            colTop[0].ReshapeLike(colBottom[0]);

            shareLayerBlob(m_blobLn1, colBottom[0].shape());
            m_blobLn1.ReshapeLike(colBottom[0]);
            shareLayerBlob(m_blobAttn1, colBottom[0].shape());
            m_blobAttn1.ReshapeLike(colBottom[0]);
            shareLayerBlob(m_blobLn2, colBottom[0].shape());
            m_blobLn2.ReshapeLike(colBottom[0]);
            shareLayerBlob(m_blobX, colBottom[0].shape());
            m_blobX.ReshapeLike(colBottom[0]);

            if (m_blobAttn2 != null)
            {
                shareLayerBlob(m_blobAttn2, colBottom[0].shape());
                m_blobAttn2.ReshapeLike(colBottom[0]);
            }

            if (m_blobLn3 != null)
            {
                shareLayerBlob(m_blobLn3, colBottom[0].shape());
                m_blobLn3.ReshapeLike(colBottom[0]);
            }

            shareLayerBlob(m_blobMlp, colBottom[0].shape());
            m_blobMlp.ReshapeLike(colBottom[0]);
            shareLayerBlob(m_blobMlpOut, colBottom[0].shape());
            m_blobMlpOut.ReshapeLike(colBottom[0]);

            addInternal(colBottom[0], m_blobLn1);
            m_ln1.LayerSetUp(m_colInternalBottom, m_colInternalTop);

            if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.CAUSAL_SELF_ATTENTION)
            {
                // self.attn(self.ln_1(x))
                addInternal(m_blobLn1, m_blobAttn1);
                m_attn1.LayerSetUp(m_colInternalBottom, m_colInternalTop);
            }
            else if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.ENCODER)
            {
                // self.attn(x_1, x_1, x_1, e_mask)
                addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, colBottom[1] }, m_blobAttn1);
                m_attn1.LayerSetUp(m_colInternalBottom, m_colInternalTop);
            }
            else if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
            {
                // self.attn1(x_1, x_1, x_1, d_mask)
                addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, colBottom[1] }, m_blobAttn1);
                m_attn1.LayerSetUp(m_colInternalBottom, m_colInternalTop);
            }
            else
            {
                throw new Exception("Unknown block type '" + m_param.transformer_block_param.block_type.ToString() + "'!");
            }

            addInternal(colTop[0], m_blobLn2);
            m_ln2.LayerSetUp(m_colInternalBottom, m_colInternalTop);
            Blob<T> blobLn = m_blobLn2;

            if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
            {
                // self.attn2(x_2, e_output, e_output, e_mask)
                addInternal(new List<Blob<T>>() { m_blobLn2, colBottom[2], colBottom[2], colBottom[3] }, m_blobAttn2);
                m_attn2.LayerSetUp(m_colInternalBottom, m_colInternalTop);

                addInternal(m_blobAttn2, m_blobLn3);
                m_ln3.LayerSetUp(m_colInternalBottom, m_colInternalTop);
                blobLn = m_blobLn3;
            }

            addInternal(blobLn, m_blobMlp);
            m_fc.LayerSetUp(m_colInternalBottom, m_colInternalTop);
            addInternal(m_blobLn2, m_blobMlp);
            m_fc.Reshape(m_colInternalBottom, m_colInternalTop);
            addInternal(m_blobMlp, m_blobMlp);
            m_act.LayerSetUp(m_colInternalBottom, m_colInternalTop);
            addInternal(m_blobMlp, m_blobMlpOut);
            m_proj.LayerSetUp(m_colInternalBottom, m_colInternalTop);

            if (m_dropout != null)
            {
                addInternal(m_blobMlpOut, m_blobMlpOut);
                m_dropout.LayerSetUp(m_colInternalBottom, m_colInternalTop);
            }

            colTop[0].ReshapeLike(m_blobMlpOut);

            blobs.Add(m_attn1.blobs);
            if (m_attn2 != null)
                blobs.Add(m_attn2.blobs);
            blobs.Add(m_fc.blobs);
            blobs.Add(m_proj.blobs);

            foreach (Blob<T> blob in blobs)
            {
                if (!blob.Name.StartsWith(m_param.name + "_"))
                    blob.Name = m_param.name + "_" + blob.Name;
            }
        }

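        /// <summary>
        /// Reshape the bottom (input) and top (output) blobs.
        /// </summary>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs.</param>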
        public override void Reshape(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            colTop[0].ReshapeLike(colBottom[0]);

            m_blobLn1.ReshapeLike(colBottom[0]);
            m_blobAttn1.ReshapeLike(colBottom[0]);
            m_blobLn2.ReshapeLike(colBottom[0]);
            m_blobX.ReshapeLike(colBottom[0]);

            if (m_blobAttn2 != null)
                m_blobAttn2.ReshapeLike(colBottom[0]);

            if (m_blobLn3 != null)
                m_blobLn3.ReshapeLike(colBottom[0]);

            m_blobMlp.ReshapeLike(colBottom[0]);
            m_blobMlpOut.ReshapeLike(colBottom[0]);

            addInternal(colBottom[0], m_blobLn1);
            m_ln1.Reshape(m_colInternalBottom, m_colInternalTop);

            if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.CAUSAL_SELF_ATTENTION)
            {
                // self.attn(self.ln_1(x))
                addInternal(m_blobLn1, m_blobAttn1);
                m_attn1.Reshape(m_colInternalBottom, m_colInternalTop);
            }
            else if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.ENCODER)
            {
                // self.attn(x_1, x_1, x_1, e_mask)
                addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, colBottom[1] }, m_blobAttn1);
                m_attn1.Reshape(m_colInternalBottom, m_colInternalTop);
            }
            else if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
            {
                // self.attn1(x_1, x_1, x_1, d_mask)
                addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, colBottom[1] }, m_blobAttn1);
                m_attn1.Reshape(m_colInternalBottom, m_colInternalTop);
            }
            else
            {
                throw new Exception("Unknown block type '" + m_param.transformer_block_param.block_type.ToString() + "'!");
            }

            addInternal(colTop[0], m_blobLn2);
            m_ln2.Reshape(m_colInternalBottom, m_colInternalTop);
            Blob<T> blobLn = m_blobLn2;

            if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
            {
                // self.attn2(x_2, e_output, e_output, e_mask)
                addInternal(new List<Blob<T>>() { m_blobLn2, colBottom[2], colBottom[2], colBottom[3] }, m_blobAttn2);
                m_attn2.Reshape(m_colInternalBottom, m_colInternalTop);

                addInternal(m_blobAttn2, m_blobLn3);
                m_ln3.Reshape(m_colInternalBottom, m_colInternalTop);
                blobLn = m_blobLn3;
            }

            addInternal(blobLn, m_blobMlp);
            m_fc.Reshape(m_colInternalBottom, m_colInternalTop);
            addInternal(m_blobMlp, m_blobMlp);
            m_act.Reshape(m_colInternalBottom, m_colInternalTop);
            addInternal(m_blobMlp, m_blobMlpOut);
            m_proj.Reshape(m_colInternalBottom, m_colInternalTop);

            if (m_dropout != null)
            {
                addInternal(m_blobMlpOut, m_blobMlpOut);
                m_dropout.Reshape(m_colInternalBottom, m_colInternalTop);
            }

            colTop[0].ReshapeLike(colBottom[0]);
        }

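        /// <summary>
        /// The forward computation of the transformer block.
        /// </summary>
        /// <remarks>
        /// CAUSAL_SELF_ATTENTION and ENCODER blocks compute x = x + attn(ln_1(x)) followed by x = x + mlp(ln_2(x)).
        /// DECODER blocks add a second attention step over the encoder output: x = x + attn2(ln_2(x), e_output, e_output, e_mask), followed by x = x + mlp(ln_3(x)).
        /// </remarks>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs: x and, depending on the block type, the attention mask(s) and encoder output.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs: trans.</param>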
        protected override void forward(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            int nCount = colBottom[0].count();
            Blob<T> blobX = colBottom[0];
            Blob<T> blobXMask = (colBottom.Count > 1) ? colBottom[1] : null;
            Blob<T> blobEncOut = (colBottom.Count > 3) ? colBottom[2] : null;
            Blob<T> blobEncMask = (colBottom.Count > 3) ? colBottom[3] : null;

            //-------------------------------------------
            // x = x + self.attn(self.ln_1(x))

            // x_1 = self.ln_1(x)
            addInternal(blobX, m_blobLn1);
            m_ln1.Forward(m_colInternalBottom, m_colInternalTop);

            if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.CAUSAL_SELF_ATTENTION)
            {
                // attn1 = self.attn(self.ln_1(x))
                addInternal(m_blobLn1, m_blobAttn1);
                m_attn1.Forward(m_colInternalBottom, m_colInternalTop);
            }
            else if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.ENCODER)
            {
                // attn1 = self.attn(x_1, x_1, x_1, e_mask)
                addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, blobXMask }, m_blobAttn1);
                m_attn1.Forward(m_colInternalBottom, m_colInternalTop);
            }
            else if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
            {
                // attn1 = self.attn1(x_1, x_1, x_1, d_mask)
                addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, blobXMask }, m_blobAttn1);
                m_attn1.Forward(m_colInternalBottom, m_colInternalTop);
            }
            else
            {
                throw new Exception("Unknown block type '" + m_param.transformer_block_param.block_type.ToString() + "'!");
            }

            // xB = x + self.attn1(self.ln_1(x))
            m_cuda.add(nCount, blobX.gpu_data, m_blobAttn1.gpu_data, m_blobX.mutable_gpu_data);

            // x_2 = self.ln_2(xB)
            addInternal(m_blobX, m_blobLn2);
            m_ln2.Forward(m_colInternalBottom, m_colInternalTop);
            Blob<T> blobLn = m_blobLn2;

            if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
            {
                // attn2 = self.attn2(x_2, e_output, e_output, e_mask)
                addInternal(new List<Blob<T>>() { m_blobLn2, blobEncOut, blobEncOut, blobEncMask }, m_blobAttn2);
                m_attn2.Forward(m_colInternalBottom, m_colInternalTop);

                // xC = xB + self.attn2(self.ln_2(x))
                m_cuda.add(nCount, m_blobX.gpu_data, m_blobAttn2.gpu_data, m_blobX.mutable_gpu_data);

                // x_3 = self.ln3(xC)
                addInternal(m_blobX, m_blobLn3);
                m_ln3.Forward(m_colInternalBottom, m_colInternalTop);
                blobLn = m_blobLn3;
            }

            // CSA | ENCODER: ff = self.mlpf(self.ln_2(x_2)),
            // DECODER:       ff = self.mlpf(self.ln_3(x_3))
            addInternal(blobLn, m_blobMlp);
            m_fc.Forward(m_colInternalBottom, m_colInternalTop);
            addInternal(m_blobMlp, m_blobMlp);
            m_act.Forward(m_colInternalBottom, m_colInternalTop);
            addInternal(m_blobMlp, m_blobMlpOut);
            m_proj.Forward(m_colInternalBottom, m_colInternalTop);

            if (m_dropout != null)
            {
                addInternal(m_blobMlpOut, m_blobMlpOut);
                m_dropout.Forward(m_colInternalBottom, m_colInternalTop);
            }

            // CSA | ENCODER: xC = xB + self.mlpf(self.ln_2(x_2)),
            // DECODER:       xD = xC + self.mlpf(self.ln_3(x_3))
            m_cuda.add(nCount, m_blobX.gpu_data, m_blobMlpOut.gpu_data, colTop[0].mutable_gpu_data);
        }

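        /// <summary>
        /// Computes the loss error gradient w.r.t the outputs.
        /// </summary>
        /// <param name="colTop">Specifies the collection of top (output) Blobs, whose diff contains the error gradient.</param>
        /// <param name="rgbPropagateDown">Specifies whether or not to propagate the gradient down to each bottom Blob.</param>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>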
        protected override void backward(BlobCollection<T> colTop, List<bool> rgbPropagateDown, BlobCollection<T> colBottom)
        {
            int nCount = colBottom[0].count();
            Blob<T> blobX = colBottom[0];
            Blob<T> blobXMask = (colBottom.Count > 1) ? colBottom[1] : null;
            Blob<T> blobEncOut = (colBottom.Count > 3) ? colBottom[2] : null;
            Blob<T> blobEncMask = (colBottom.Count > 3) ? colBottom[3] : null;

            // Gradient with respect to state then data.
            if (rgbPropagateDown[0])
            {
                List<bool> rgbPropagate = new List<bool>() { true, true };

                // CSA | ENCODER gradient for xC = xB + self.mlpf(self.ln_2(x_2))
                // DECODER gradient for xD = xC + self.mlpf(self.ln_3(x_3))
                // xD -> ff (decoder), otherwise xC -> ff (encoder)
                m_cuda.copy(nCount, colTop[0].gpu_diff, m_blobMlpOut.mutable_gpu_diff);
                // xD -> xC (decoder), otherwise xC -> xB (encoder)
                m_cuda.copy(nCount, colTop[0].gpu_diff, m_blobX.mutable_gpu_diff); // xB, xC

                if (m_dropout != null)
                {
                    addInternal(m_blobMlpOut, m_blobMlpOut);
                    m_dropout.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                }

                // Gradient for the MLP.
                addInternal(m_blobMlp, m_blobMlpOut);
                m_proj.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                addInternal(m_blobMlp, m_blobMlp);
                m_act.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                Blob<T> blobLn = (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER) ? m_blobLn3 : m_blobLn2;
                // ff -> x_3 (decoder), otherwise x_2 (encoder)
                addInternal(blobLn, m_blobMlp);
                m_fc.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

                if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
                {
                    // x_3 = self.ln3(xC)
                    // x_3 -> xC1
                    m_blobAttn2.CopyFrom(m_blobX, true);
                    addInternal(m_blobX, m_blobLn3);
                    m_ln3.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

                    // x_3 + xC1 -> xC
                    // xC -> xB (implied)
                    m_cuda.add(nCount, m_blobAttn2.gpu_diff, m_blobX.gpu_diff, m_blobX.mutable_gpu_diff);
                    // xC -> attn2
                    m_blobAttn2.CopyFrom(m_blobX, true);

                    // attn2 = self.attn2(x_2, e_output, e_output, e_mask)
                    // attn2 -> x_2 (ln2), e_output1, e_output2
                    addInternal(new List<Blob<T>>() { m_blobLn2, blobEncOut, blobEncOut, blobEncMask }, m_blobAttn2);
                    m_attn2.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                }

                // x_2 = self.ln_2(xB)
                // x_2 -> xB1
                m_blobAttn1.CopyFrom(m_blobX, true);
                addInternal(m_blobX, m_blobLn2);
                m_ln2.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

                // xC + xB1 -> xB
                // xB -> x (implied)
                m_cuda.add(nCount, m_blobAttn1.gpu_diff, m_blobX.gpu_diff, m_blobX.mutable_gpu_diff);
                // xB -> attn1
                m_blobAttn1.CopyFrom(m_blobX, true);

                if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.CAUSAL_SELF_ATTENTION)
                {
                    // Gradient for self.attn(self.ln_1(x))
                    addInternal(m_blobLn1, m_blobAttn1);
                    m_attn1.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                }
                else if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.ENCODER ||
                         m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
                {
                    // Gradient for self.attn(x_1, x_1, x_1, e_mask)
                    addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, blobXMask }, m_blobAttn1);
                    m_attn1.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                }
                else
                {
                    throw new Exception("Unknown block type '" + m_param.transformer_block_param.block_type.ToString() + "'!");
                }

                // x_1 = ln1(x)
                // x_1 -> x1
                addInternal(blobX, m_blobLn1);
                m_ln1.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

                // Accumulate the attention gradient with the others in bottom[0].
                // x1 + xB -> x
                m_cuda.add(nCount, blobX.gpu_diff, m_blobX.gpu_diff, blobX.mutable_gpu_diff);
            }
        }
    }
}