MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
CausalSelfAttentionLayer2.cs
1using System;
2using System.Collections.Generic;
3using System.Linq;
4using System.Text;
5using MyCaffe.basecode;
6using MyCaffe.common;
7using MyCaffe.param;
8using MyCaffe.fillers;
9using System.Diagnostics;
10
11namespace MyCaffe.layers.gpt
12{
20 public class CausalSelfAttentionLayer2<T> : Layer<T>
21 {
// Causal mask to ensure that attention is only applied to the left in the input sequence.
23 Blob<T> m_blobBias;
24 Layer<T> m_mh_att = null;
25
26 BlobCollection<T> m_colInternalBottom = new BlobCollection<T>();
27 BlobCollection<T> m_colInternalTop = new BlobCollection<T>();
28
: base(cuda, log, p)
{
    m_type = LayerParameter.LayerType.CAUSAL_SELF_ATTENTION;

    // Configure the inner multi-head attention layer from the causal self-attention
    // settings (block size and dropout rates are carried over directly).
    // NOTE(review): a few configuration lines are not visible in this view
    // (e.g. head/embed settings) - confirm against the full file.
    LayerParameter p1 = new LayerParameter(LayerParameter.LayerType.MULTIHEAD_ATTENTION);
    p1.multihead_attention_param.block_size = p.causal_self_attention_param.block_size;
    p1.multihead_attention_param.attn_dropout = p.causal_self_attention_param.attn_dropout;
    p1.multihead_attention_param.resid_dropout = p.causal_self_attention_param.resid_dropout;
    // GPT-style weight initialization for the attention projections.
    p1.multihead_attention_param.weight_init = param.gpt.MultiheadAttentionParameter.WEIGHT_INIT.GPT;
    m_mh_att = new MultiheadAttentionLayer<T>(m_cuda, m_log, p1);

    // Causal mask to ensure that attention is only applied to the left in the input sequence.
    m_blobBias = new Blob<T>(cuda, log);
    m_blobBias.Name = m_param.name + " bias";

    // The mask is a 1 x 1 x block_size x block_size lower-triangular matrix,
    // shared with a matching blob when one exists, then filled by fillBias.
    List<int> rgShape = new List<int>() { 1, 1, (int)p.causal_self_attention_param.block_size, (int)p.causal_self_attention_param.block_size };
    shareLayerBlob(m_blobBias, rgShape);
    m_blobBias.Reshape(rgShape);
    fillBias(m_blobBias);
}
61
/// <summary>
/// Releases all GPU and host resources used by the Layer.
/// </summary>
protected override void dispose()
{
    // The mask blob and the inner attention layer are independent resources.
    dispose(ref m_blobBias);
    dispose(ref m_mh_att);

    base.dispose();
}
70
/// <summary>
/// Add all internal blobs used by this layer (the causal mask plus the inner
/// multi-head attention layer's internal blobs) to the collection provided.
/// </summary>
/// <param name="col">Specifies the collection to fill; left untouched when already populated.</param>
protected override void setup_internal_blobs(BlobCollection<T> col)
{
    if (col.Count != 0)
        return;

    col.Add(m_blobBias);
    col.Add(m_mh_att.internal_blobs);
}
81
/// <summary>
/// Fill the causal mask blob with a lower-triangular matrix of ones:
/// every entry above the main diagonal (in the height x width plane)
/// is zeroed so attention cannot look ahead in the sequence.
/// </summary>
/// <param name="b">Specifies the blob to fill.</param>
private void fillBias(Blob<T> b)
{
    // Start with all ones, then zero the strictly-upper triangle.
    b.SetData(1.0);

    float[] rgData = convertF(b.mutable_cpu_data);
    int nWid = b.width;

    for (int nRow = 0; nRow < b.height; nRow++)
    {
        int nRowStart = nRow * nWid;

        for (int nCol = nRow + 1; nCol < nWid; nCol++)
        {
            rgData[nRowStart + nCol] = 0;
        }
    }

    b.mutable_cpu_data = convert(rgData);
}
98
/// <summary>
/// Returns the exact number of required bottom (input) Blobs: input
/// </summary>
public override int ExactNumBottomBlobs
{
    get { return 1; }
}
106
/// <summary>
/// Returns the exact number of required top (output) Blobs: attn
/// </summary>
public override int ExactNumTopBlobs
{
    get { return 1; }
}
114
/// <summary>
/// Re-initialize the parameters of the layer, including those of the
/// inner multi-head attention layer.
/// </summary>
/// <param name="target">Specifies the weights to target.</param>
/// <returns>Always returns <c>true</c>.</returns>
public override bool ReInitializeParameters(WEIGHT_TARGET target)
{
    base.ReInitializeParameters(target);
    m_mh_att.ReInitializeParameters(target);
    return true;
}
128
/// <summary>
/// Reset the internal bottom/top collections to hold the single bottom
/// and top blob given, ready for a call into the inner layer.
/// </summary>
/// <param name="bottom">Specifies the bottom (input) blob.</param>
/// <param name="top">Specifies the top (output) blob.</param>
private void addInternal(Blob<T> bottom, Blob<T> top)
{
    m_colInternalBottom.Clear();
    m_colInternalTop.Clear();

    m_colInternalBottom.Add(bottom);
    m_colInternalTop.Add(top);
}
137
/// <summary>
/// Reset the internal bottom/top collections to hold the list of bottom
/// blobs and the single top blob given, ready for a call into the inner layer.
/// </summary>
/// <param name="rgBottom">Specifies the bottom (input) blobs, in order.</param>
/// <param name="top">Specifies the top (output) blob.</param>
private void addInternal(List<Blob<T>> rgBottom, Blob<T> top)
{
    m_colInternalBottom.Clear();

    foreach (Blob<T> b in rgBottom)
    {
        m_colInternalBottom.Add(b);
    }

    m_colInternalTop.Clear();
    m_colInternalTop.Add(top);
}
150
/// <summary>
/// Setup the layer: wires bottom[0] as the query, key and value inputs
/// (self-attention) together with the causal mask, sets up the inner
/// multi-head attention layer, and adopts its learnable blobs (prefixing
/// their names with this layer's name).
/// </summary>
/// <param name="colBottom">Specifies the collection of bottom (input) Blobs: input.</param>
/// <param name="colTop">Specifies the collection of top (output) Blobs: attn.</param>
public override void LayerSetUp(BlobCollection<T> colBottom, BlobCollection<T> colTop)
{
    Blob<T> blobX = colBottom[0];

    // Self-attention: the same input serves as q, k and v.
    addInternal(new List<Blob<T>> { blobX, blobX, blobX, m_blobBias }, colTop[0]);
    m_mh_att.LayerSetUp(m_colInternalBottom, m_colInternalTop);

    // Adopt the inner layer's learnable parameters as this layer's own.
    blobs.Add(m_mh_att.blobs);

    // Prefix the parameter blob names so they are unique to this layer instance.
    // The prefix is an identifier, not display text, so use an ordinal
    // comparison (the single-argument StartsWith is culture-sensitive; CA1310).
    string strPrefix = m_param.name + "_";

    foreach (Blob<T> blob in blobs)
    {
        if (!blob.Name.StartsWith(strPrefix, StringComparison.Ordinal))
            blob.Name = strPrefix + blob.Name;
    }
}
171
/// <summary>
/// Reshape the top (output) blob and internal buffers to accommodate
/// the shape of the bottom (input) blob.
/// </summary>
/// <param name="colBottom">Specifies the collection of bottom (input) Blobs: input.</param>
/// <param name="colTop">Specifies the collection of top (output) Blobs: attn.</param>
public override void Reshape(BlobCollection<T> colBottom, BlobCollection<T> colTop)
{
    Blob<T> blobInput = colBottom[0];
    List<Blob<T>> rgBtm = new List<Blob<T>>() { blobInput, blobInput, blobInput, m_blobBias };

    addInternal(rgBtm, colTop[0]);
    m_mh_att.Reshape(m_colInternalBottom, m_colInternalTop);
}
184
/// <summary>
/// The forward computation: runs the inner multi-head attention layer
/// with the input used as query, key and value plus the causal mask.
/// </summary>
/// <param name="colBottom">Specifies the collection of bottom (input) Blobs: input.</param>
/// <param name="colTop">Specifies the collection of top (output) Blobs: attn.</param>
protected override void forward(BlobCollection<T> colBottom, BlobCollection<T> colTop)
{
    Blob<T> blobInput = colBottom[0];
    List<Blob<T>> rgBtm = new List<Blob<T>>() { blobInput, blobInput, blobInput, m_blobBias };

    addInternal(rgBtm, colTop[0]);
    m_mh_att.Forward(m_colInternalBottom, m_colInternalTop);
}
202
/// <summary>
/// Computes the loss error gradient w.r.t. the outputs by delegating to the
/// inner multi-head attention layer's backward pass.
/// </summary>
/// <param name="colTop">Specifies the collection of top (output) Blobs, whose diff contains the error gradient.</param>
/// <param name="rgbPropagateDown">Specifies whether to propagate the gradient down to bottom[0].</param>
/// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
protected override void backward(BlobCollection<T> colTop, List<bool> rgbPropagateDown, BlobCollection<T> colBottom)
{
    // Nothing to do when no gradient is requested for the input.
    if (!rgbPropagateDown[0])
        return;

    Blob<T> blobInput = colBottom[0];
    List<bool> rgbPropagate = new List<bool>() { true, true };

    addInternal(new List<Blob<T>> { blobInput, blobInput, blobInput, m_blobBias }, colTop[0]);
    m_mh_att.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
}
227 }
228}
The Log class provides general output in text form.
Definition: Log.cs:13
The BlobCollection contains a list of Blobs.
void Add(Blob< T > b)
Add a new Blob to the collection.
int Count
Returns the number of items in the collection.
void Clear(bool bDispose=false)
Remove all items from the collection.
The Blob is the main holder of data that moves through the Layers of the Net.
Definition: Blob.cs:25
void SetData(T[] rgData, int nCount=-1, bool bSetCount=true)
Sets a number of items within the Blob's data.
Definition: Blob.cs:1922
int height
DEPRECATED; legacy shape accessor height: use shape(2) instead.
Definition: Blob.cs:808
T[] mutable_cpu_data
Get data from the GPU and bring it over to the host, or Set data from the Host and send it over to th...
Definition: Blob.cs:1461
int width
DEPRECATED; legacy shape accessor width: use shape(3) instead.
Definition: Blob.cs:816
string Name
Get/set the name of the Blob.
Definition: Blob.cs:2184
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
Definition: CudaDnn.cs:969
An interface for the units of computation which can be composed into a Net.
Definition: Layer.cs:31
Log m_log
Specifies the Log for output.
Definition: Layer.cs:43
LayerParameter m_param
Specifies the LayerParameter describing the Layer.
Definition: Layer.cs:47
void convert(BlobCollection< T > col)
Convert a collection of blobs from / to half size.
Definition: Layer.cs:535
abstract void LayerSetUp(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Performs Layer specific setup. Derived layers should override this function as well as the Reshape fu...
bool shareLayerBlob(Blob< T > b, List< int > rgMinShape)
Attempts to share a Layer Blob if another parameter Blob with the same name and acceptable size is fo...
Definition: Layer.cs:1170
void Backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Given the top Blob error gradients, compute the bottom Blob error gradients.
Definition: Layer.cs:815
virtual bool ReInitializeParameters(WEIGHT_TARGET target)
Re-initialize the parameters of the layer.
Definition: Layer.cs:389
double Forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Given the bottom (input) Blobs, this function computes the top (output) Blobs and the loss.
Definition: Layer.cs:728
float convertF(T df)
Converts a generic to a float value.
Definition: Layer.cs:1359
abstract void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Adjust the shapes of top blobs and internal buffers to accomodate the shapes of the bottom blobs.
BlobCollection< T > m_colInternalBlobs
Specifies internal blobs used by the layer.
Definition: Layer.cs:59
CudaDnn< T > m_cuda
Specifies the CudaDnn connection to Cuda.
Definition: Layer.cs:39
LayerParameter.LayerType m_type
Specifies the Layer type.
Definition: Layer.cs:35
BlobCollection< T > blobs
Returns the collection of learnable parameter Blobs for the Layer.
Definition: Layer.cs:875
BlobCollection< T > internal_blobs
Returns the collection of internal Blobs used by the Layer.
Definition: Layer.cs:883
The CausalSelfAttention provides a vanilla multi-head self-attention layer with projection at the end...
CausalSelfAttentionLayer2(CudaDnn< T > cuda, Log log, LayerParameter p)
The CausalSelfAttention constructor.
override int ExactNumTopBlobs
Returns the exact number of required top (output) Blobs: attn
override void backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Computes the loss error gradient w.r.t the outputs.
override void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Reshape the bottom (input) and top (output) blobs.
override void LayerSetUp(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Setup the layer.
override bool ReInitializeParameters(WEIGHT_TARGET target)
Re-initialize the parameters of the layer.
override void forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
The forward computation.
override void dispose()
Releases all GPU and host resources used by the Layer.
override int ExactNumBottomBlobs
Returns the exact number of required bottom (input) Blobs: input
override void setup_internal_blobs(BlobCollection< T > col)
Derivative layers should add all internal blobws to the 'col' provided.
The MultiheadAttention provides a vanilla multi-head layer.
Specifies the base parameter for all layers.
string name
Specifies the name of this LayerParameter.
MultiheadAttentionParameter multihead_attention_param
Returns the parameter set when initialized with LayerType.MULTIHEAD_ATTENTION
CausalSelfAttentionParameter causal_self_attention_param
Returns the parameter set when initialized with LayerType.CAUSAL_SELF_ATTENTION
LayerType
Specifies the layer type.
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
Definition: Annotation.cs:12
The MyCaffe.common namespace contains common MyCaffe classes.
Definition: BatchInput.cs:8
WEIGHT_TARGET
Defines the type of weight to target in re-initializations.
Definition: Interfaces.cs:38
The MyCaffe.fillers namespace contains all fillers including the Filler class.
The MyCaffe.layers.gpt namespace contains all GPT related layers.
Definition: LayerFactory.cs:15
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe namespace contains the main body of MyCaffe code that closesly tracks the C++ Caffe open-...
Definition: Annotation.cs:12