MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
CausalSelfAttentionLayer.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using MyCaffe.basecode;
using MyCaffe.common;
using MyCaffe.param;
using MyCaffe.fillers;
using System.Diagnostics;

namespace MyCaffe.layers.gpt
{
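    /// <summary>
    /// The CausalSelfAttentionLayer provides a vanilla multi-head self-attention layer with a projection at the end.
    /// </summary>
    /// <typeparam name="T">Specifies the base type <i>float</i> or <i>double</i>.</typeparam>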
    public class CausalSelfAttentionLayer<T> : Layer<T>
    {
        List<int> m_rgShape = new List<int>() { 1, 1, 1, 1 };
        // Key, query, value projections for all heads, but in a batch.
        Layer<T> m_c_attn = null;
        // Output projection.
        Layer<T> m_c_proj = null;
        // Regularization
        Layer<T> m_attn_dropout = null;
        Layer<T> m_resid_dropout = null;
        // Transpose
        Layer<T> m_transpose;
        Layer<T> m_transposeQ;
        // Softmax
        Layer<T> m_softmax = null;
        // Causal mask to ensure that attention is only applied to the left in the input sequence.
        Blob<T> m_blobBias;
        Blob<T> m_blobQ;
        Blob<T> m_blobK;
        Blob<T> m_blobV;
        Blob<T> m_blobQt;
        Blob<T> m_blobKt;
        Blob<T> m_blobKt1;
        Blob<T> m_blobVt;
        Blob<T> m_blobWork;
        Blob<T> m_blobAttA;
        Blob<T> m_blobAttB;
        Blob<T> m_blobIpAttn;
        Blob<T> m_blobY;
        // The number of heads.
        int m_nHeads;
        int m_nEmbed;
        int m_nBlockSize;
        double m_dfAttnDropout;
        double m_dfResidDropout;
        double m_dfIgnoreVal = -1e+29;

        int m_nSize;
        int m_nDataSize;
        int m_nB;
        int m_nT;
        int m_nC;

        BlobCollection<T> m_colInternalBottom = new BlobCollection<T>();
        BlobCollection<T> m_colInternalTop = new BlobCollection<T>();

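        /// <summary>
        /// The CausalSelfAttentionLayer constructor.
        /// </summary>
        /// <param name="cuda">Specifies the CudaDnn connection to Cuda.</param>
        /// <param name="log">Specifies the output Log.</param>
        /// <param name="p">Specifies the LayerParameter of type CAUSAL_SELF_ATTENTION.</param>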
        public CausalSelfAttentionLayer(CudaDnn<T> cuda, Log log, LayerParameter p)
            : base(cuda, log, p)
        {
            m_type = LayerParameter.LayerType.CAUSAL_SELF_ATTENTION;

            m_nHeads = (int)p.causal_self_attention_param.heads;
            m_nEmbed = (int)p.causal_self_attention_param.embed;
            m_nBlockSize = (int)p.causal_self_attention_param.block_size;
            m_dfAttnDropout = p.causal_self_attention_param.attn_dropout;
            m_dfResidDropout = p.causal_self_attention_param.resid_dropout;

            log.CHECK_EQ(m_nEmbed % m_nHeads, 0, "The embedding size must be divisible by the number of heads.");

            // Key, query, value projections for all heads, but in a batch.
            // input features = m_nEmbed
            LayerParameter ipAttn = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT, m_param.name + ".c_attn");
            ipAttn.inner_product_param.num_output = (uint)(3 * m_nEmbed);
            ipAttn.inner_product_param.bias_term = true;
            ipAttn.inner_product_param.weight_filler = new FillerParameter("gaussian", 0, 0, 0.02);
            ipAttn.inner_product_param.bias_filler = new FillerParameter("constant", 0.0);
            ipAttn.inner_product_param.axis = 2;
            ipAttn.parameters.Add(new ParamSpec(1.0, 1.0));
            ipAttn.parameters.Add(new ParamSpec(1.0, 0.0));
            m_c_attn = Layer<T>.Create(cuda, log, convertLayerParam(ipAttn, p), null);

            // Output projection.
            // input features = m_nEmbed
            LayerParameter ipProj = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT, m_param.name + ".c_proj");
            ipProj.inner_product_param.num_output = (uint)m_nEmbed;
            ipProj.inner_product_param.bias_term = true;
            ipProj.inner_product_param.weight_filler = new FillerParameter("gaussian", 0, 0, 0.02 / Math.Sqrt(2 * m_param.causal_self_attention_param.layers));
            ipProj.inner_product_param.bias_filler = new FillerParameter("constant", 0.0);
            ipProj.inner_product_param.axis = 2;
            ipProj.parameters.Add(new ParamSpec(1.0, 1.0));
            ipProj.parameters.Add(new ParamSpec(1.0, 0.0));
            m_c_proj = Layer<T>.Create(cuda, log, convertLayerParam(ipProj, p), null);

            // Regularization
            if (m_dfAttnDropout > 0)
            {
                LayerParameter dropoutAttn = new LayerParameter(LayerParameter.LayerType.DROPOUT);
                dropoutAttn.dropout_param.dropout_ratio = m_dfAttnDropout;
                m_attn_dropout = Layer<T>.Create(cuda, log, convertLayerParam(dropoutAttn, p), null);
            }

            if (m_dfResidDropout > 0)
            {
                LayerParameter dropoutResid = new LayerParameter(LayerParameter.LayerType.DROPOUT);
                dropoutResid.dropout_param.dropout_ratio = m_dfResidDropout;
                m_resid_dropout = Layer<T>.Create(cuda, log, convertLayerParam(dropoutResid, p), null);
            }

            // Transpose
            LayerParameter transpose = new LayerParameter(LayerParameter.LayerType.TRANSPOSE);
            transpose.transpose_param.dim[1] = 2;
            transpose.transpose_param.dim[2] = 1;
            m_transpose = Layer<T>.Create(cuda, log, convertLayerParam(transpose, p), null);

            LayerParameter transposeK = new LayerParameter(LayerParameter.LayerType.TRANSPOSE);
            transposeK.transpose_param.dim[2] = 3;
            transposeK.transpose_param.dim[3] = 2;
            m_transposeQ = Layer<T>.Create(cuda, log, convertLayerParam(transposeK, p), null);

            // Softmax
            LayerParameter softmax = new LayerParameter(LayerParameter.LayerType.SOFTMAX);
            softmax.softmax_param.axis = -1;
            softmax.softmax_param.engine = EngineParameter.Engine.CUDNN;
            m_softmax = Layer<T>.Create(cuda, log, convertLayerParam(softmax, p), null);

            // Causal mask to ensure that attention is only applied to the left in the input sequence.
            m_blobBias = new Blob<T>(cuda, log);
            m_blobBias.Name = m_param.name + " bias";

            List<int> rgShape = new List<int>() { 1, 1, m_nBlockSize, m_nBlockSize };
            m_blobBias.Reshape(rgShape);
            fillBias(m_blobBias);

            m_blobQ = new Blob<T>(cuda, log);
            m_blobQ.Name = m_param.name + " Q";
            m_blobK = new Blob<T>(cuda, log);
            m_blobK.Name = m_param.name + " K";
            m_blobV = new Blob<T>(cuda, log);
            m_blobV.Name = m_param.name + " V";
            m_blobQt = new Blob<T>(cuda, log);
            m_blobQt.Name = m_param.name + " Qt";
            m_blobKt = new Blob<T>(cuda, log);
            m_blobKt.Name = m_param.name + " Kt";
            m_blobKt1 = new Blob<T>(cuda, log);
            m_blobKt1.Name = m_param.name + " Kt1";
            m_blobVt = new Blob<T>(cuda, log);
            m_blobVt.Name = m_param.name + " Vt";
            m_blobAttA = new Blob<T>(cuda, log);
            m_blobAttA.Name = m_param.name + " AttA";
            m_blobAttB = new Blob<T>(cuda, log);
            m_blobAttB.Name = m_param.name + " AttB";
            m_blobWork = new Blob<T>(cuda, log);
            m_blobWork.Name = m_param.name + " Work";

            m_blobIpAttn = new Blob<T>(cuda, log);
            m_blobIpAttn.Name = m_param.name + " IpAttn";
            m_blobY = new Blob<T>(cuda, log);
            m_blobY.Name = m_param.name + " Y";

            setup_internal_blobs(m_colInternalBlobs);
        }

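        /// <summary>
        /// Releases all GPU and host resources used by the Layer.
        /// </summary>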
        protected override void dispose()
        {
            dispose(ref m_c_attn);
            dispose(ref m_c_proj);
            dispose(ref m_attn_dropout);
            dispose(ref m_resid_dropout);
            dispose(ref m_transpose);
            dispose(ref m_transposeQ);
            dispose(ref m_softmax);

            dispose(ref m_blobBias);
            dispose(ref m_blobQ);
            dispose(ref m_blobK);
            dispose(ref m_blobV);
            dispose(ref m_blobQt);
            dispose(ref m_blobKt);
            dispose(ref m_blobKt1);
            dispose(ref m_blobVt);
            dispose(ref m_blobAttA);
            dispose(ref m_blobAttB);
            dispose(ref m_blobWork);
            dispose(ref m_blobIpAttn);
            dispose(ref m_blobY);

            base.dispose();
        }

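        /// <summary>
        /// Derivative layers should add all internal blobs to the 'col' provided.
        /// </summary>
        /// <param name="col">Specifies the collection to which the internal blobs are added.</param>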
        protected override void setup_internal_blobs(BlobCollection<T> col)
        {
            if (col.Count > 0)
                return;

            col.Add(m_blobIpAttn);
            col.Add(m_blobQ);
            col.Add(m_blobK);
            col.Add(m_blobV);
            col.Add(m_blobQt);
            col.Add(m_blobKt);
            col.Add(m_blobVt);
            col.Add(m_blobKt1);
            col.Add(m_blobAttA);
            col.Add(m_blobBias);
            col.Add(m_blobAttB);
            col.Add(m_blobWork);
            col.Add(m_blobY);

            col.Add(m_c_attn.internal_blobs);
            col.Add(m_transpose.internal_blobs);
            col.Add(m_transposeQ.internal_blobs);
            col.Add(m_softmax.internal_blobs);
            if (m_attn_dropout != null)
                col.Add(m_attn_dropout.internal_blobs);
            col.Add(m_c_proj.internal_blobs);
            if (m_resid_dropout != null)
                col.Add(m_resid_dropout.internal_blobs);
        }

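        /// <summary>
        /// Fill the causal mask blob with a lower-triangular matrix of ones (1 on and below the diagonal, 0 above it).
        /// </summary>
        /// <param name="b">Specifies the mask blob to fill.</param>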
        private void fillBias(Blob<T> b)
        {
            b.SetData(1.0);

            float[] rgBiasData = convertF(b.mutable_cpu_data);

            for (int i = 0; i < b.height; i++)
            {
                for (int j = i + 1; j < b.width; j++)
                {
                    rgBiasData[i * b.width + j] = 0;
                }
            }

            b.mutable_cpu_data = convert(rgBiasData);
        }

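        /// <summary>
        /// Returns the exact number of required bottom (input) Blobs: input
        /// </summary>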
        public override int ExactNumBottomBlobs
        {
            get { return 1; }
        }

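        /// <summary>
        /// Returns the exact number of required top (output) Blobs: attn
        /// </summary>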
        public override int ExactNumTopBlobs
        {
            get { return 1; }
        }

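        /// <summary>
        /// Re-initialize the parameters of the layer.
        /// </summary>
        /// <param name="target">Specifies the type of weight to target in the re-initialization.</param>
        /// <returns>Returns <i>true</i>.</returns>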
        public override bool ReInitializeParameters(WEIGHT_TARGET target)
        {
            base.ReInitializeParameters(target);

            m_c_attn.ReInitializeParameters(target);
            m_c_proj.ReInitializeParameters(target);

            return true;
        }

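        /// <summary>
        /// Set the internal bottom and top blob collections used when calling the internal layers.
        /// </summary>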
        private void addInternal(Blob<T> bottom, Blob<T> top)
        {
            m_colInternalBottom.Clear();
            m_colInternalBottom.Add(bottom);

            m_colInternalTop.Clear();
            m_colInternalTop.Add(top);
        }

        private void addInternal(List<Blob<T>> rgBottom, Blob<T> top)
        {
            m_colInternalBottom.Clear();

            for (int i = 0; i < rgBottom.Count; i++)
            {
                m_colInternalBottom.Add(rgBottom[i]);
            }

            m_colInternalTop.Clear();
            m_colInternalTop.Add(top);
        }

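        /// <summary>
        /// Setup the layer.
        /// </summary>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs.</param>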
        public override void LayerSetUp(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            Blob<T> blobX = colBottom[0];

            m_nB = blobX.num;       // batch size
            m_nT = blobX.channels;  // sequence length
            m_nC = blobX.height;    // embedding dim (m_nEmbed)
            m_nSize = m_nC / (int)m_nHeads;

            m_nDataSize = blobX.count(3);

            m_nSize *= m_nDataSize;

            addInternal(blobX, m_blobIpAttn);
            m_c_attn.Setup(m_colInternalBottom, m_colInternalTop);

            blobs.Add(m_c_attn.blobs[0]);
            blobs.Add(m_c_attn.blobs[1]);

            m_rgShape[0] = m_nB;
            m_rgShape[1] = m_nT;
            m_rgShape[2] = m_nHeads;
            m_rgShape[3] = m_nSize;

            shareLayerBlob(m_blobQ, m_rgShape);
            m_blobQ.Reshape(m_rgShape);
            addInternal(m_blobQ, m_blobQt);
            m_transpose.Setup(m_colInternalBottom, m_colInternalTop); // (B, nh, T, hs)

            shareLayerBlob(m_blobAttA, blobX.shape());
            m_blobAttA.ReshapeLike(blobX);
            shareLayerBlob(m_blobAttB, blobX.shape());
            m_blobAttB.ReshapeLike(blobX);
            addInternal(m_blobAttA, m_blobAttB);
            m_softmax.Setup(m_colInternalBottom, m_colInternalTop);

            if (m_attn_dropout != null)
            {
                addInternal(m_blobAttB, m_blobAttB);
                m_attn_dropout.Setup(m_colInternalBottom, m_colInternalTop);
            }

            m_rgShape[0] = m_nB;
            m_rgShape[1] = m_nT;
            m_rgShape[2] = m_nC;
            m_rgShape[3] = m_nDataSize;

            shareLayerBlob(m_blobY, m_rgShape);
            m_blobY.Reshape(m_rgShape);

            addInternal(m_blobY, colTop[0]);
            m_c_proj.Setup(m_colInternalBottom, m_colInternalTop);

            blobs.Add(m_c_proj.blobs[0]);
            blobs.Add(m_c_proj.blobs[1]);

            if (m_resid_dropout != null)
            {
                addInternal(colTop[0], colTop[0]);
                m_resid_dropout.Setup(m_colInternalBottom, m_colInternalTop);
            }

            foreach (Blob<T> blob in blobs)
            {
                if (!blob.Name.StartsWith(m_param.name + "_"))
                    blob.Name = m_param.name + "_" + blob.Name;
            }
        }

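        /// <summary>
        /// Reshape the bottom (input) and top (output) blobs.
        /// </summary>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs.</param>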
        public override void Reshape(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            Blob<T> blobX = colBottom[0];

            m_nB = blobX.num;       // batch size
            m_nT = blobX.channels;  // sequence length
            m_nC = blobX.height;    // embedding dim (m_nEmbed)
            m_nSize = m_nC / m_nHeads;

            m_nDataSize = blobX.count(3);

            m_nSize *= m_nDataSize;

            m_rgShape[0] = m_nB;
            m_rgShape[1] = m_nT;
            m_rgShape[2] = m_nHeads;
            m_rgShape[3] = m_nSize;

            shareLayerBlob(m_blobK, m_rgShape);
            m_blobK.Reshape(m_rgShape);
            shareLayerBlob(m_blobKt1, m_rgShape);
            m_blobKt1.Reshape(m_rgShape);
            shareLayerBlob(m_blobKt, m_rgShape);

            addInternal(m_blobK, m_blobKt);
            m_transpose.Reshape(m_colInternalBottom, m_colInternalTop); // (B, nh, T, hs)
            m_blobKt1.ReshapeLike(m_blobKt);

            shareLayerBlob(m_blobQ, m_rgShape);
            m_blobQ.Reshape(m_rgShape);
            shareLayerBlob(m_blobQt, m_rgShape);

            addInternal(m_blobQ, m_blobQt);
            m_transpose.Reshape(m_colInternalBottom, m_colInternalTop); // (B, nh, T, hs)

            shareLayerBlob(m_blobV, m_rgShape);
            m_blobV.Reshape(m_rgShape);
            shareLayerBlob(m_blobVt, m_rgShape);

            m_blobV.Reshape(m_nB, m_nT, m_nHeads, m_nSize);
            addInternal(m_blobV, m_blobVt);
            m_transpose.Reshape(m_colInternalBottom, m_colInternalTop); // (B, nh, T, hs)

            m_rgShape[0] = m_nB;
            m_rgShape[1] = m_nHeads;
            m_rgShape[2] = m_nT;
            m_rgShape[3] = m_nT;

            shareLayerBlob(m_blobAttA, m_rgShape);
            m_blobAttA.Reshape(m_rgShape);
            shareLayerBlob(m_blobAttB, m_rgShape);
            m_blobAttB.Reshape(m_rgShape);

            m_rgShape[0] = m_blobVt.num;
            m_rgShape[1] = m_blobVt.channels;
            m_rgShape[2] = m_blobVt.width; // col major
            m_rgShape[3] = m_blobVt.height;

            shareLayerBlob(m_blobWork, m_rgShape);
            m_blobWork.Reshape(m_rgShape); // col major

            addInternal(m_blobWork, m_blobY);
            m_transposeQ.Reshape(m_colInternalBottom, m_colInternalTop);

            m_rgShape[0] = m_nB;
            m_rgShape[1] = m_nT;
            m_rgShape[2] = m_nC;
            m_rgShape[3] = m_nDataSize;

            shareLayerBlob(m_blobY, m_rgShape);
            m_blobY.Reshape(m_rgShape);

            addInternal(m_blobY, colTop[0]);
            m_c_proj.Reshape(m_colInternalBottom, m_colInternalTop);

            if (m_resid_dropout != null)
            {
                addInternal(colTop[0], colTop[0]);
                m_resid_dropout.Reshape(m_colInternalBottom, m_colInternalTop);
            }

            if (m_blobBias.height != m_nT || m_blobBias.width != m_nT)
            {
                List<int> rgShape = new List<int>() { 1, 1, m_nT, m_nT };
                m_blobBias.Reshape(rgShape);
                fillBias(m_blobBias);
            }
        }

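        /// <summary>
        /// The forward computation.
        /// </summary>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs.</param>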
        protected override void forward(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            Blob<T> blobX = colBottom[0];

            // Calculate query, key, values for all heads in batch and move head forward to be the batch dim.
            // q, k ,v = self.c_attn(x).split(self.n_embd, dim=2)
            addInternal(blobX, m_blobIpAttn);
            m_c_attn.Forward(m_colInternalBottom, m_colInternalTop);

            // Split IP output (3 * nEmbed) into query, key, values.
            int nCount = m_blobQ.count();
            m_cuda.channel_copy(nCount, m_blobIpAttn.num, m_blobIpAttn.channels, 3, m_nEmbed, 0, m_blobIpAttn.gpu_data, m_blobQ.mutable_gpu_data, DIR.FWD);
            m_cuda.channel_copy(nCount, m_blobIpAttn.num, m_blobIpAttn.channels, 3, m_nEmbed, 1, m_blobIpAttn.gpu_data, m_blobK.mutable_gpu_data, DIR.FWD);
            m_cuda.channel_copy(nCount, m_blobIpAttn.num, m_blobIpAttn.channels, 3, m_nEmbed, 2, m_blobIpAttn.gpu_data, m_blobV.mutable_gpu_data, DIR.FWD);

            // Transpose query, key and values along axes 1 & 2
            // k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
            // q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
            // v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
            addInternal(m_blobK, m_blobKt);
            m_transpose.Forward(m_colInternalBottom, m_colInternalTop); // (B, nh, T, hs)
            addInternal(m_blobQ, m_blobQt);
            m_transpose.Forward(m_colInternalBottom, m_colInternalTop); // (B, nh, T, hs)
            addInternal(m_blobV, m_blobVt);
            m_transpose.Forward(m_colInternalBottom, m_colInternalTop); // (B, nh, T, hs)

            // Multiply query and key(T) matrices and scale
            // att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))

            addInternal(m_blobKt, m_blobKt1);
            m_transposeQ.Forward(m_colInternalBottom, m_colInternalTop);

            double dfScale = 1.0 / Math.Sqrt(m_nSize);
            m_blobAttA.MatMul(m_blobQt, m_blobKt1);
            m_blobAttA.scale_data(dfScale);

            // Apply mask to attention matrix
            // att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            m_cuda.mask(m_blobAttA.count(), m_blobBias.count(), convert(0.0), convert(m_dfIgnoreVal), m_blobAttA.gpu_data, m_blobBias.gpu_data, m_blobAttA.mutable_gpu_data); // all masked items set to -inf.

            // Take softmax of attention along the last axis.
            // att = F.softmax(att, dim = -1)
            addInternal(m_blobAttA, m_blobAttB);
            m_softmax.Forward(m_colInternalBottom, m_colInternalTop);

            // Apply attention dropout.
            // att = self.attn_dropout(att)
            if (m_attn_dropout != null)
            {
                addInternal(m_blobAttB, m_blobAttB);
                m_attn_dropout.Forward(m_colInternalBottom, m_colInternalTop);
            }

            m_blobWork.Reshape(m_blobVt.num, m_blobVt.channels, m_blobVt.height, m_blobVt.width);

            // Multiply attention matrix with values
            // y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
            m_blobWork.MatMul(m_blobAttB, m_blobVt);

            // Reassemble all head outputs side by side.
            // y = y.transpose(1, 2).contiguous().view(B, T, C)
            addInternal(m_blobWork, m_blobY);
            m_transpose.Forward(m_colInternalBottom, m_colInternalTop);
            m_blobY.Reshape(m_nB, m_nT, m_nC, m_nDataSize);

            // Apply output projection.
            // y = self.resid_dropout(self.c_proj(y))
            addInternal(m_blobY, colTop[0]);
            m_c_proj.Forward(m_colInternalBottom, m_colInternalTop);

            // Apply resid dropout
            if (m_resid_dropout != null)
            {
                addInternal(colTop[0], colTop[0]);
                m_resid_dropout.Forward(m_colInternalBottom, m_colInternalTop);
            }
        }

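        /// <summary>
        /// Computes the loss error gradient w.r.t the outputs.
        /// </summary>
        /// <param name="colTop">Specifies the collection of top (output) Blobs, whose diff contains the error gradient.</param>
        /// <param name="rgbPropagateDown">Specifies whether or not to propagate the gradient down to each bottom Blob.</param>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>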
        protected override void backward(BlobCollection<T> colTop, List<bool> rgbPropagateDown, BlobCollection<T> colBottom)
        {
            // Gradient with respect to state then data.
            if (rgbPropagateDown[0])
            {
                List<bool> rgbPropagate = new List<bool>() { true, true };

                // Apply resid dropout
                if (m_resid_dropout != null)
                {
                    addInternal(colTop[0], colTop[0]);
                    m_resid_dropout.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                }

                // Apply output projection.
                // y = self.resid_dropout(self.c_proj(y))
                addInternal(m_blobY, colTop[0]);
                m_c_proj.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

                // Reassemble all head outputs side by side.
                // y = y.transpose(1, 2).contiguous().view(B, T, C)
                addInternal(m_blobWork, m_blobY);
                m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

                // Perform Self Attention backward pass
                {
                    // Multiply attention matrix with values
                    // y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
                    m_blobY.CopyFrom(m_blobWork, true, true);

                    // Multiply attention matrix with values
                    // y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
                    // Gradient with respect to att
                    // att' = y' @ v^T
                    // Gradient with respect to vt
                    // vt' = att^T @ y'
                    m_blobY.MatMulGrad(m_blobAttB, m_blobVt, m_blobWork);

                    // Apply attention dropout.
                    // att = self.attn_dropout(att)
                    if (m_attn_dropout != null)
                    {
                        addInternal(m_blobAttB, m_blobAttB);
                        m_attn_dropout.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                    }

                    // Take softmax of attention along the last axis.
                    // att = F.softmax(att, dim = -1)
                    addInternal(m_blobAttA, m_blobAttB);
                    m_softmax.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

                    // Multiply qt with kt^T to create attention matrix
                    // att = qt @ kt^T
                    // Gradient with respect to qt
                    // qt' = att' @ kt
                    // Gradient with respect to kt1
                    // kt1' = qt^T @ att'
                    double dfScale = 1.0 / Math.Sqrt(m_nSize);
                    m_blobAttA.MatMulGrad(m_blobQt, m_blobKt1, m_blobWork, dfScale);

                    // Transpose Kt1 back to Kt
                    addInternal(m_blobKt, m_blobKt1);
                    m_transposeQ.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                }

                // Transpose query, key and values along axes 1 & 2
                // k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
                // q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
                // v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
                addInternal(m_blobK, m_blobKt);
                m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom); // (B, nh, T, hs)
                addInternal(m_blobQ, m_blobQt);
                m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom); // (B, nh, T, hs)
                addInternal(m_blobV, m_blobVt);
                m_transpose.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom); // (B, nh, T, hs)

                // Split IP output (3 * nEmbed) into query, key, values.
                int nCount = m_blobQ.count();
                m_cuda.channel_copy(nCount, m_blobIpAttn.num, m_blobIpAttn.channels, 3, m_nEmbed, 0, m_blobIpAttn.mutable_gpu_diff, m_blobQ.gpu_diff, DIR.BWD);
                m_cuda.channel_copy(nCount, m_blobIpAttn.num, m_blobIpAttn.channels, 3, m_nEmbed, 1, m_blobIpAttn.mutable_gpu_diff, m_blobK.gpu_diff, DIR.BWD);
                m_cuda.channel_copy(nCount, m_blobIpAttn.num, m_blobIpAttn.channels, 3, m_nEmbed, 2, m_blobIpAttn.mutable_gpu_diff, m_blobV.gpu_diff, DIR.BWD);

                // Calculate query, key, values for all heads in batch and move head forward to be the batch dim.
                // q, k ,v = self.c_attn(x).split(self.n_embd, dim=2)
                addInternal(colBottom[0], m_blobIpAttn);
                m_c_attn.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
            }
        }
    }
}