MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
CausalSelfAttentionLayer2.cs
1using System;
2using System.Collections.Generic;
3using System.Linq;
4using System.Text;
5using MyCaffe.basecode;
6using MyCaffe.common;
7using MyCaffe.param;
8using MyCaffe.fillers;
9using System.Diagnostics;
10
11namespace MyCaffe.layers.gpt
12{
20 public class CausalSelfAttentionLayer2<T> : Layer<T>
21 {
// Causal mask to ensure that attention is only applied to the left in the input sequence.
23 Blob<T> m_blobBias;
24 Layer<T> m_mh_att = null;
25
26 BlobCollection<T> m_colInternalBottom = new BlobCollection<T>();
27 BlobCollection<T> m_colInternalTop = new BlobCollection<T>();
28
: base(cuda, log, p)
{
    m_type = LayerParameter.LayerType.CAUSAL_SELF_ATTENTION;

    // Configure the inner multi-head attention layer from the causal self-attention
    // settings (block size and dropout rates are carried over directly).
    // NOTE(review): a few configuration lines are not visible in this view
    // (e.g. head/embed settings) - confirm against the full file.
    LayerParameter p1 = new LayerParameter(LayerParameter.LayerType.MULTIHEAD_ATTENTION);
    p1.multihead_attention_param.block_size = p.causal_self_attention_param.block_size;
    p1.multihead_attention_param.attn_dropout = p.causal_self_attention_param.attn_dropout;
    p1.multihead_attention_param.resid_dropout = p.causal_self_attention_param.resid_dropout;
    // GPT-style weight initialization for the attention projections.
    p1.multihead_attention_param.weight_init = param.gpt.MultiheadAttentionParameter.WEIGHT_INIT.GPT;
    m_mh_att = new MultiheadAttentionLayer<T>(m_cuda, m_log, p1);

    // Causal mask to ensure that attention is only applied to the left in the input sequence.
    m_blobBias = new Blob<T>(cuda, log);
    m_blobBias.Name = m_param.name + " bias";

    // The mask is a 1 x 1 x block_size x block_size lower-triangular matrix,
    // shared with a matching blob when one exists, then filled by fillBias.
    List<int> rgShape = new List<int>() { 1, 1, (int)p.causal_self_attention_param.block_size, (int)p.causal_self_attention_param.block_size };
    shareLayerBlob(m_blobBias, rgShape);
    m_blobBias.Reshape(rgShape);
    fillBias(m_blobBias);
}
61
/// <summary>
/// Releases all GPU and host resources used by the Layer.
/// </summary>
protected override void dispose()
{
    // The mask blob and the inner attention layer are independent resources.
    dispose(ref m_blobBias);
    dispose(ref m_mh_att);

    base.dispose();
}
70
/// <summary>
/// Add all internal blobs used by this layer (the causal mask plus the inner
/// multi-head attention layer's internal blobs) to the collection provided.
/// </summary>
/// <param name="col">Specifies the collection to fill; left untouched when already populated.</param>
protected override void setup_internal_blobs(BlobCollection<T> col)
{
    if (col.Count != 0)
        return;

    col.Add(m_blobBias);
    col.Add(m_mh_att.internal_blobs);
}
81
/// <summary>
/// Fill the causal mask blob with a lower-triangular matrix of ones:
/// every entry above the main diagonal (in the height x width plane)
/// is zeroed so attention cannot look ahead in the sequence.
/// </summary>
/// <param name="b">Specifies the blob to fill.</param>
private void fillBias(Blob<T> b)
{
    // Start with all ones, then zero the strictly-upper triangle.
    b.SetData(1.0);

    float[] rgData = convertF(b.mutable_cpu_data);
    int nWid = b.width;

    for (int nRow = 0; nRow < b.height; nRow++)
    {
        int nRowStart = nRow * nWid;

        for (int nCol = nRow + 1; nCol < nWid; nCol++)
        {
            rgData[nRowStart + nCol] = 0;
        }
    }

    b.mutable_cpu_data = convert(rgData);
}
98
/// <summary>
/// Returns the exact number of required bottom (input) Blobs: input
/// </summary>
public override int ExactNumBottomBlobs
{
    get { return 1; }
}
106
/// <summary>
/// Returns the exact number of required top (output) Blobs: attn
/// </summary>
public override int ExactNumTopBlobs
{
    get { return 1; }
}
114
/// <summary>
/// Re-initialize the parameters of the layer, including those of the
/// inner multi-head attention layer.
/// </summary>
/// <param name="target">Specifies the weights to target.</param>
/// <returns>Always returns <c>true</c>.</returns>
public override bool ReInitializeParameters(WEIGHT_TARGET target)
{
    base.ReInitializeParameters(target);
    m_mh_att.ReInitializeParameters(target);
    return true;
}
128
/// <summary>
/// Reset the internal bottom/top collections to hold the single bottom
/// and top blob given, ready for a call into the inner layer.
/// </summary>
/// <param name="bottom">Specifies the bottom (input) blob.</param>
/// <param name="top">Specifies the top (output) blob.</param>
private void addInternal(Blob<T> bottom, Blob<T> top)
{
    m_colInternalBottom.Clear();
    m_colInternalTop.Clear();

    m_colInternalBottom.Add(bottom);
    m_colInternalTop.Add(top);
}
137
/// <summary>
/// Reset the internal bottom/top collections to hold the list of bottom
/// blobs and the single top blob given, ready for a call into the inner layer.
/// </summary>
/// <param name="rgBottom">Specifies the bottom (input) blobs, in order.</param>
/// <param name="top">Specifies the top (output) blob.</param>
private void addInternal(List<Blob<T>> rgBottom, Blob<T> top)
{
    m_colInternalBottom.Clear();

    foreach (Blob<T> b in rgBottom)
    {
        m_colInternalBottom.Add(b);
    }

    m_colInternalTop.Clear();
    m_colInternalTop.Add(top);
}
150
/// <summary>
/// Setup the layer: wires bottom[0] as the query, key and value inputs
/// (self-attention) together with the causal mask, sets up the inner
/// multi-head attention layer, and adopts its learnable blobs (prefixing
/// their names with this layer's name).
/// </summary>
/// <param name="colBottom">Specifies the collection of bottom (input) Blobs: input.</param>
/// <param name="colTop">Specifies the collection of top (output) Blobs: attn.</param>
public override void LayerSetUp(BlobCollection<T> colBottom, BlobCollection<T> colTop)
{
    Blob<T> blobX = colBottom[0];

    // Self-attention: the same input serves as q, k and v.
    addInternal(new List<Blob<T>> { blobX, blobX, blobX, m_blobBias }, colTop[0]);
    m_mh_att.LayerSetUp(m_colInternalBottom, m_colInternalTop);

    // Adopt the inner layer's learnable parameters as this layer's own.
    blobs.Add(m_mh_att.blobs);

    // Prefix the parameter blob names so they are unique to this layer instance.
    // The prefix is an identifier, not display text, so use an ordinal
    // comparison (the single-argument StartsWith is culture-sensitive; CA1310).
    string strPrefix = m_param.name + "_";

    foreach (Blob<T> blob in blobs)
    {
        if (!blob.Name.StartsWith(strPrefix, StringComparison.Ordinal))
            blob.Name = strPrefix + blob.Name;
    }
}
171
/// <summary>
/// Reshape the top (output) blob and internal buffers to accommodate
/// the shape of the bottom (input) blob.
/// </summary>
/// <param name="colBottom">Specifies the collection of bottom (input) Blobs: input.</param>
/// <param name="colTop">Specifies the collection of top (output) Blobs: attn.</param>
public override void Reshape(BlobCollection<T> colBottom, BlobCollection<T> colTop)
{
    Blob<T> blobInput = colBottom[0];
    List<Blob<T>> rgBtm = new List<Blob<T>>() { blobInput, blobInput, blobInput, m_blobBias };

    addInternal(rgBtm, colTop[0]);
    m_mh_att.Reshape(m_colInternalBottom, m_colInternalTop);
}
184
/// <summary>
/// The forward computation: runs the inner multi-head attention layer
/// with the input used as query, key and value plus the causal mask.
/// </summary>
/// <param name="colBottom">Specifies the collection of bottom (input) Blobs: input.</param>
/// <param name="colTop">Specifies the collection of top (output) Blobs: attn.</param>
protected override void forward(BlobCollection<T> colBottom, BlobCollection<T> colTop)
{
    Blob<T> blobInput = colBottom[0];
    List<Blob<T>> rgBtm = new List<Blob<T>>() { blobInput, blobInput, blobInput, m_blobBias };

    addInternal(rgBtm, colTop[0]);
    m_mh_att.Forward(m_colInternalBottom, m_colInternalTop);
}
202
/// <summary>
/// Computes the loss error gradient w.r.t. the outputs by delegating to the
/// inner multi-head attention layer's backward pass.
/// </summary>
/// <param name="colTop">Specifies the collection of top (output) Blobs, whose diff contains the error gradient.</param>
/// <param name="rgbPropagateDown">Specifies whether to propagate the gradient down to bottom[0].</param>
/// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
protected override void backward(BlobCollection<T> colTop, List<bool> rgbPropagateDown, BlobCollection<T> colBottom)
{
    // Nothing to do when no gradient is requested for the input.
    if (!rgbPropagateDown[0])
        return;

    Blob<T> blobInput = colBottom[0];
    List<bool> rgbPropagate = new List<bool>() { true, true };

    addInternal(new List<Blob<T>> { blobInput, blobInput, blobInput, m_blobBias }, colTop[0]);
    m_mh_att.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
}
227 }
228}
The Log class provides general output in text form.
Definition: Log.cs:13
The BlobCollection contains a list of Blobs.
void Add(Blob< T > b)
Add a new Blob to the collection.
int Count
Returns the number of items in the collection.
void Clear(bool bDispose=false)
Remove all items from the collection.
The Blob is the main holder of data that moves through the Layers of the Net.
Definition: Blob.cs:25
void SetData(T[] rgData, int nCount=-1, bool bSetCount=true)
Sets a number of items within the Blob's data.
Definition: Blob.cs:1922
int height
DEPRECATED; legacy shape accessor height: use shape(2) instead.
Definition: Blob.cs:808
T[] mutable_cpu_data
Get data from the GPU and bring it over to the host, or Set data from the Host and send it over to th...
Definition: Blob.cs:1461
int width
DEPRECATED; legacy shape accessor width: use shape(3) instead.
Definition: Blob.cs:816
string Name
Get/set the name of the Blob.
Definition: Blob.cs:2184
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
Definition: CudaDnn.cs:969
An interface for the units of computation which can be composed into a Net.
Definition: Layer.cs:31
Log m_log
Specifies the Log for output.
Definition: Layer.cs:43
LayerParameter m_param
Specifies the LayerParameter describing the Layer.
Definition: Layer.cs:47
void convert(BlobCollection< T > col)
Convert a collection of blobs from / to half size.
Definition: Layer.cs:535
abstract void LayerSetUp(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Performs Layer specific setup. Derived layers should override this function as well as the Reshape fu...
bool shareLayerBlob(Blob< T > b, List< int > rgMinShape)
Attempts to share a Layer Blob if another parameter Blob with the same name and acceptable size is fo...
Definition: Layer.cs:1170
void Backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Given the top Blob error gradients, compute the bottom Blob error gradients.
Definition: Layer.cs:815
virtual bool ReInitializeParameters(WEIGHT_TARGET target)
Re-initialize the parameters of the layer.
Definition: Layer.cs:389
double Forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Given the bottom (input) Blobs, this function computes the top (output) Blobs and the loss.
Definition: Layer.cs:728
float convertF(T df)
Converts a generic to a float value.
Definition: Layer.cs:1359
abstract void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Adjust the shapes of top blobs and internal buffers to accomodate the shapes of the bottom blobs.
BlobCollection< T > m_colInternalBlobs
Specifies internal blobs used by the layer.
Definition: Layer.cs:59
CudaDnn< T > m_cuda
Specifies the CudaDnn connection to Cuda.
Definition: Layer.cs:39
LayerParameter.LayerType m_type
Specifies the Layer type.
Definition: Layer.cs:35
BlobCollection< T > blobs
Returns the collection of learnable parameter Blobs for the Layer.
Definition: Layer.cs:875
BlobCollection< T > internal_blobs
Returns the collection of internal Blobs used by the Layer.
Definition: Layer.cs:883
The CausalSelfAttention provides a vanilla multi-head self-attention layer with projection at the end...
CausalSelfAttentionLayer2(CudaDnn< T > cuda, Log log, LayerParameter p)
The CausalSelfAttention constructor.
override int ExactNumTopBlobs
Returns the exact number of required top (output) Blobs: attn
override void backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Computes the loss error gradient w.r.t the outputs.
override void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Reshape the bottom (input) and top (output) blobs.
override void LayerSetUp(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Setup the layer.
override bool ReInitializeParameters(WEIGHT_TARGET target)
Re-initialize the parameters of the layer.
override void forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
The forward computation.
override void dispose()
Releases all GPU and host resources used by the Layer.
override int ExactNumBottomBlobs
Returns the exact number of required bottom (input) Blobs: input
override void setup_internal_blobs(BlobCollection< T > col)
Derivative layers should add all internal blobws to the 'col' provided.
The MultiheadAttention provides a vanilla multi-head layer.
Specifies the base parameter for all layers.
string name
Specifies the name of this LayerParameter.
MultiheadAttentionParameter multihead_attention_param
Returns the parameter set when initialized with LayerType.MULTIHEAD_ATTENTION
CausalSelfAttentionParameter causal_self_attention_param
Returns the parameter set when initialized with LayerType.CAUSAL_SELF_ATTENTION
LayerType
Specifies the layer type.
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
Definition: Annotation.cs:12
The MyCaffe.common namespace contains common MyCaffe classes.
Definition: BatchInput.cs:8
WEIGHT_TARGET
Defines the type of weight to target in re-initializations.
Definition: Interfaces.cs:38
The MyCaffe.fillers namespace contains all fillers including the Filler class.
The MyCaffe.layers.gpt namespace contains all GPT related layers.
Definition: LayerFactory.cs:15
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe namespace contains the main body of MyCaffe code that closesly tracks the C++ Caffe open-...
Definition: Annotation.cs:12