MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
TransformerBlockLayer.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using MyCaffe.basecode;
using MyCaffe.common;
using MyCaffe.param;
using MyCaffe.fillers;
using System.Diagnostics;
using MyCaffe.param.gpt;
using System.Runtime.InteropServices.WindowsRuntime;

namespace MyCaffe.layers.gpt
{
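    /// <summary>
    /// The TransformerBlockLayer provides a generic transformer block.
    /// </summary>
    /// <typeparam name="T">Specifies the base type of <i>float</i> or <i>double</i>.</typeparam>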
    public class TransformerBlockLayer<T> : Layer<T>
    {
        Blob<T> m_blobLn1;
        Blob<T> m_blobAttn1;
        Blob<T> m_blobLn2;
        Blob<T> m_blobAttn2 = null;
        Blob<T> m_blobLn3 = null;
        Blob<T> m_blobMlp;
        Blob<T> m_blobMlpOut;
        Blob<T> m_blobX = null;
        Layer<T> m_ln1;             // Input layer normalization.
        Layer<T> m_attn1;           // Attention block used with the encoder and decoder.
        Layer<T> m_ln2;             // Layer normalization after the first attention block.
        Layer<T> m_attn2 = null;    // Attention block used with the decoder only.
        Layer<T> m_ln3 = null;      // Layer normalization after the second attention block, used with the decoder only.
        // MLP block
        Layer<T> m_fc;              // Initial linear layer.
        Layer<T> m_proj;            // Projection layer.
        Layer<T> m_act;             // Activation layer.
        Layer<T> m_dropout = null;  // Residual dropout layer.

        BlobCollection<T> m_colInternalBottom = new BlobCollection<T>();
        BlobCollection<T> m_colInternalTop = new BlobCollection<T>();

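        /// <summary>
        /// The TransformerBlock constructor.
        /// </summary>
        /// <param name="cuda">Specifies the CudaDnn connection to Cuda.</param>
        /// <param name="log">Specifies the Log for output.</param>
        /// <param name="p">Specifies the LayerParameter, which uses the transformer_block_param settings.</param>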
        public TransformerBlockLayer(CudaDnn<T> cuda, Log log, LayerParameter p)
            : base(cuda, log, p)
        {
            CancelEvent evtCancel = new CancelEvent();

            m_type = LayerParameter.LayerType.TRANSFORMER_BLOCK;

            m_blobLn1 = new Blob<T>(cuda, log);
            m_blobLn1.Name = m_param.name + " ln1";
            m_blobAttn1 = new Blob<T>(cuda, log);
            m_blobAttn1.Name = m_param.name + " attn1";
            m_blobLn2 = new Blob<T>(cuda, log);
            m_blobLn2.Name = m_param.name + " ln2";
            m_blobMlp = new Blob<T>(cuda, log);
            m_blobMlp.Name = m_param.name + " mlp";
            m_blobMlpOut = new Blob<T>(cuda, log);
            m_blobMlpOut.Name = m_param.name + " mlp_out";
            m_blobX = new Blob<T>(cuda, log);
            m_blobX.Name = m_param.name + " xB";

            LayerParameter ln1 = new LayerParameter(LayerParameter.LayerType.LAYERNORM, p.name + ".ln1");
            ln1.layer_norm_param.enable_cuda_impl = p.transformer_block_param.enable_layernorm_cuda_impl;
            m_ln1 = Layer<T>.Create(cuda, log, convertLayerParam(ln1, p), evtCancel) as Layer<T>;

            LayerParameter ln2 = new LayerParameter(LayerParameter.LayerType.LAYERNORM, p.name + ".ln2");
            ln2.layer_norm_param.enable_cuda_impl = p.transformer_block_param.enable_layernorm_cuda_impl;
            m_ln2 = Layer<T>.Create(cuda, log, convertLayerParam(ln2, p), evtCancel) as Layer<T>;

            if (p.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.CAUSAL_SELF_ATTENTION)
            {
                LayerParameter attn = new LayerParameter(LayerParameter.LayerType.CAUSAL_SELF_ATTENTION, p.name + ".attn");
                attn.parameters.Add((m_param.parameters.Count > 0) ? m_param.parameters[0] : new ParamSpec(1.0, 1.0));
                attn.parameters.Add((m_param.parameters.Count > 1) ? m_param.parameters[1] : new ParamSpec(1.0, 0.0));
                m_attn1 = Layer<T>.Create(cuda, log, convertLayerParam(attn, p), evtCancel);
            }
            else if (p.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.ENCODER)
            {
                LayerParameter attn = new LayerParameter(LayerParameter.LayerType.MULTIHEAD_ATTENTION, p.name + ".attn");
                attn.parameters.Add((m_param.parameters.Count > 0) ? m_param.parameters[0] : new ParamSpec(1.0, 1.0));
                attn.parameters.Add((m_param.parameters.Count > 1) ? m_param.parameters[1] : new ParamSpec(1.0, 0.0));
                m_attn1 = Layer<T>.Create(cuda, log, convertLayerParam(attn, p), evtCancel);
            }
            else if (p.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
            {
                m_blobAttn2 = new Blob<T>(cuda, log);
                m_blobAttn2.Name = m_param.name + " attn2";
                m_blobLn3 = new Blob<T>(cuda, log);
                m_blobLn3.Name = m_param.name + " ln3";

                LayerParameter ln3 = new LayerParameter(LayerParameter.LayerType.LAYERNORM, p.name + ".ln3");
                ln3.layer_norm_param.enable_cuda_impl = p.transformer_block_param.enable_layernorm_cuda_impl;
                m_ln3 = Layer<T>.Create(cuda, log, convertLayerParam(ln3, p), evtCancel) as Layer<T>;

                LayerParameter attn1 = new LayerParameter(LayerParameter.LayerType.MULTIHEAD_ATTENTION, p.name + ".attn1");
                attn1.parameters.Add((m_param.parameters.Count > 0) ? m_param.parameters[0] : new ParamSpec(1.0, 1.0));
                attn1.parameters.Add((m_param.parameters.Count > 1) ? m_param.parameters[1] : new ParamSpec(1.0, 0.0));
                m_attn1 = Layer<T>.Create(cuda, log, convertLayerParam(attn1, p), evtCancel);

                LayerParameter attn2 = new LayerParameter(LayerParameter.LayerType.MULTIHEAD_ATTENTION, p.name + ".attn2");
                attn2.parameters.Add((m_param.parameters.Count > 0) ? m_param.parameters[0] : new ParamSpec(1.0, 1.0));
                attn2.parameters.Add((m_param.parameters.Count > 1) ? m_param.parameters[1] : new ParamSpec(1.0, 0.0));
                m_attn2 = Layer<T>.Create(cuda, log, convertLayerParam(attn2, p), evtCancel);
            }
            else
            {
                throw new Exception("The block type '" + p.transformer_block_param.block_type.ToString() + "' is not supported!");
            }

            LayerParameter fc = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT, p.name + ".fc");
            // Assumed condition: GPT-style initialization is used with the causal self-attention block type.
            if (p.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.CAUSAL_SELF_ATTENTION)
            {
                fc.inner_product_param.weight_filler = new FillerParameter("gaussian", 0, 0, 0.02);
                fc.inner_product_param.bias_filler = new FillerParameter("constant", 0.0);
            }
            else
            {
                // Assumed default initialization for the encoder/decoder path.
                fc.inner_product_param.weight_filler = new FillerParameter("xavier");
                fc.inner_product_param.bias_filler = new FillerParameter("constant", 0.0);
            }
            fc.parameters.Add((m_param.parameters.Count > 0) ? m_param.parameters[0] : new ParamSpec(1.0, 1.0));
            fc.parameters.Add((m_param.parameters.Count > 1) ? m_param.parameters[1] : new ParamSpec(1.0, 0.0));
            m_fc = Layer<T>.Create(cuda, log, convertLayerParam(fc, p), evtCancel);

            LayerParameter proj = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT, p.name + ".proj");
            proj.inner_product_param.axis = 2;
            proj.inner_product_param.bias_term = true;
            // Assumed condition: GPT-style initialization is used with the causal self-attention block type.
            if (p.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.CAUSAL_SELF_ATTENTION)
            {
                // Apply the special scaled init to the residual projections, per the GPT-2 paper.
                proj.inner_product_param.weight_filler = new FillerParameter("gaussian", 0, 0, 0.02 / Math.Sqrt(2 * m_param.transformer_block_param.layers));
                proj.inner_product_param.bias_filler = new FillerParameter("constant", 0.0);
            }
            else
            {
                // Assumed default initialization for the encoder/decoder path.
                proj.inner_product_param.weight_filler = new FillerParameter("xavier");
                proj.inner_product_param.bias_filler = new FillerParameter("constant", 0.0);
            }
            proj.parameters.Add((m_param.parameters.Count > 0) ? m_param.parameters[0] : new ParamSpec(1.0, 1.0));
            proj.parameters.Add((m_param.parameters.Count > 1) ? m_param.parameters[1] : new ParamSpec(1.0, 0.0));
            m_proj = Layer<T>.Create(cuda, log, convertLayerParam(proj, p), evtCancel);

            // ReLU has a very similar curve to GELU, and is faster.
            LayerParameter.LayerType actType = LayerParameter.LayerType.RELU;
            bool? bEnableBert = null;

            if (p.transformer_block_param.activation == param.gpt.TransformerBlockParameter.ACTIVATION.GELU_BERT)
            {
                actType = LayerParameter.LayerType.GELU;
                bEnableBert = true;
            }
            else if (p.transformer_block_param.activation == param.gpt.TransformerBlockParameter.ACTIVATION.GELU)
            {
                actType = LayerParameter.LayerType.GELU;
                bEnableBert = false;
            }

            LayerParameter act = new LayerParameter(actType, p.name + ".act");
            if (bEnableBert.HasValue)
                act.gelu_param.enable_bert_version = bEnableBert.Value;

            m_act = Layer<T>.Create(cuda, log, convertLayerParam(act, p), evtCancel);

            if (p.transformer_block_param.resid_dropout > 0)
            {
                LayerParameter dropout = new LayerParameter(LayerParameter.LayerType.DROPOUT, p.name + ".drop");
                dropout.dropout_param.dropout_ratio = p.transformer_block_param.resid_dropout;
                m_dropout = Layer<T>.Create(cuda, log, convertLayerParam(dropout, p), evtCancel);
            }

            setup_internal_blobs(m_colInternalBlobs);
        }

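        /// <summary>
        /// Releases all GPU and host resources used by the Layer.
        /// </summary>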
        protected override void dispose()
        {
            dispose(ref m_blobLn1);
            dispose(ref m_blobAttn1);
            dispose(ref m_blobLn2);
            dispose(ref m_blobAttn2);
            dispose(ref m_blobLn3);
            dispose(ref m_blobMlp);
            dispose(ref m_blobMlpOut);
            dispose(ref m_blobX);

            dispose(ref m_ln1);
            dispose(ref m_attn1);
            dispose(ref m_ln2);
            dispose(ref m_attn2);
            dispose(ref m_ln3);
            dispose(ref m_fc);
            dispose(ref m_proj);
            dispose(ref m_act);
            dispose(ref m_dropout);

            base.dispose();
        }

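        /// <summary>
        /// Derivative layers should add all internal blobs to the 'col' provided.
        /// </summary>
        /// <param name="col">Specifies the collection to which the internal blobs are added.</param>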
        protected override void setup_internal_blobs(BlobCollection<T> col)
        {
            if (col.Count > 0)
                return;

            col.Add(m_blobLn1);
            col.Add(m_blobAttn1);
            col.Add(m_blobLn2);
            if (m_blobAttn2 != null)
                col.Add(m_blobAttn2);
            col.Add(m_blobX);
            if (m_blobLn3 != null)
                col.Add(m_blobLn3);
            col.Add(m_blobMlp);
            col.Add(m_blobMlpOut);

            col.Add(m_ln1.internal_blobs);
            col.Add(m_attn1.internal_blobs);
            col.Add(m_ln2.internal_blobs);
            if (m_attn2 != null)
                col.Add(m_attn2.internal_blobs);
            if (m_ln3 != null)
                col.Add(m_ln3.internal_blobs);
            col.Add(m_fc.internal_blobs);
            col.Add(m_act.internal_blobs);
            col.Add(m_proj.internal_blobs);
            if (m_dropout != null)
                col.Add(m_dropout.internal_blobs);
        }

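        /// <summary>
        /// Returns the minimum number of required bottom (input) Blobs: input.
        /// </summary>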
        public override int MinBottomBlobs
        {
            get { return 1; }
        }

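        /// <summary>
        /// Returns the maximum number of required bottom (input) Blobs: input, e_mask (when ENCODER);
        /// input, d_mask, e_output, e_mask (when DECODER); otherwise input.
        /// </summary>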
        public override int MaxBottomBlobs
        {
            get
            {
                switch (m_param.transformer_block_param.block_type)
                {
                    case TransformerBlockParameter.BLOCK_TYPE.ENCODER:
                        return 2;

                    case TransformerBlockParameter.BLOCK_TYPE.DECODER:
                        return 4;

                    default:
                        return 1;
                }
            }
        }

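        /// <summary>
        /// Returns the exact number of required top (output) Blobs: trans.
        /// </summary>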
        public override int ExactNumTopBlobs
        {
            get { return 1; }
        }

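        /// <summary>
        /// Re-initialize the parameters of the layer.
        /// </summary>
        /// <param name="target">Specifies the type of weight to target in the re-initialization.</param>
        /// <returns>Returns <i>true</i> after the internal layers have been re-initialized.</returns>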
        public override bool ReInitializeParameters(WEIGHT_TARGET target)
        {
            base.ReInitializeParameters(target);

            m_ln1.ReInitializeParameters(target);
            m_attn1.ReInitializeParameters(target);
            m_ln2.ReInitializeParameters(target);
            if (m_attn2 != null)
                m_attn2.ReInitializeParameters(target);
            if (m_ln3 != null)
                m_ln3.ReInitializeParameters(target);
            m_fc.ReInitializeParameters(target);
            m_proj.ReInitializeParameters(target);

            return true;
        }

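        /// <summary>
        /// Set the internal bottom and top collections to a single bottom and top blob before running one of the internal layers.
        /// </summary>
        /// <param name="bottom">Specifies the bottom (input) blob.</param>
        /// <param name="top">Specifies the top (output) blob.</param>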
        private void addInternal(Blob<T> bottom, Blob<T> top)
        {
            m_colInternalBottom.Clear();
            m_colInternalBottom.Add(bottom);

            m_colInternalTop.Clear();
            m_colInternalTop.Add(top);
        }

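        /// <summary>
        /// Set the internal bottom collection to a list of bottom blobs and the internal top collection to a single top blob before running one of the internal layers.
        /// </summary>
        /// <param name="rgBottom">Specifies the list of bottom (input) blobs.</param>
        /// <param name="top">Specifies the top (output) blob.</param>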
        private void addInternal(List<Blob<T>> rgBottom, Blob<T> top)
        {
            m_colInternalBottom.Clear();

            for (int i = 0; i < rgBottom.Count; i++)
            {
                m_colInternalBottom.Add(rgBottom[i]);
            }

            m_colInternalTop.Clear();
            m_colInternalTop.Add(top);
        }

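        /// <summary>
        /// Setup the layer.
        /// </summary>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs: x and, depending on the block type, the attention mask(s) and encoder output.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs: trans.</param>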
        public override void LayerSetUp(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            colTop[0].ReshapeLike(colBottom[0]);

            shareLayerBlob(m_blobLn1, colBottom[0].shape());
            m_blobLn1.ReshapeLike(colBottom[0]);
            shareLayerBlob(m_blobAttn1, colBottom[0].shape());
            m_blobAttn1.ReshapeLike(colBottom[0]);
            shareLayerBlob(m_blobLn2, colBottom[0].shape());
            m_blobLn2.ReshapeLike(colBottom[0]);
            shareLayerBlob(m_blobX, colBottom[0].shape());
            m_blobX.ReshapeLike(colBottom[0]);

            if (m_blobAttn2 != null)
            {
                shareLayerBlob(m_blobAttn2, colBottom[0].shape());
                m_blobAttn2.ReshapeLike(colBottom[0]);
            }

            if (m_blobLn3 != null)
            {
                shareLayerBlob(m_blobLn3, colBottom[0].shape());
                m_blobLn3.ReshapeLike(colBottom[0]);
            }

            shareLayerBlob(m_blobMlp, colBottom[0].shape());
            m_blobMlp.ReshapeLike(colBottom[0]);
            shareLayerBlob(m_blobMlpOut, colBottom[0].shape());
            m_blobMlpOut.ReshapeLike(colBottom[0]);

            addInternal(colBottom[0], m_blobLn1);
            m_ln1.LayerSetUp(m_colInternalBottom, m_colInternalTop);

            if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.CAUSAL_SELF_ATTENTION)
            {
                // self.attn(self.ln_1(x))
                addInternal(m_blobLn1, m_blobAttn1);
                m_attn1.LayerSetUp(m_colInternalBottom, m_colInternalTop);
            }
            else if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.ENCODER)
            {
                // self.attn(x_1, x_1, x_1, e_mask)
                addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, colBottom[1] }, m_blobAttn1);
                m_attn1.LayerSetUp(m_colInternalBottom, m_colInternalTop);
            }
            else if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
            {
                // self.attn1(x_1, x_1, x_1, d_mask)
                addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, colBottom[1] }, m_blobAttn1);
                m_attn1.LayerSetUp(m_colInternalBottom, m_colInternalTop);
            }
            else
            {
                throw new Exception("Unknown block type '" + m_param.transformer_block_param.block_type.ToString() + "'!");
            }

            addInternal(colTop[0], m_blobLn2);
            m_ln2.LayerSetUp(m_colInternalBottom, m_colInternalTop);
            Blob<T> blobLn = m_blobLn2;

            if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
            {
                // self.attn2(x_2, e_output, e_output, e_mask)
                addInternal(new List<Blob<T>>() { m_blobLn2, colBottom[2], colBottom[2], colBottom[3] }, m_blobAttn2);
                m_attn2.LayerSetUp(m_colInternalBottom, m_colInternalTop);

                addInternal(m_blobAttn2, m_blobLn3);
                m_ln3.LayerSetUp(m_colInternalBottom, m_colInternalTop);
                blobLn = m_blobLn3;
            }

            addInternal(blobLn, m_blobMlp);
            m_fc.LayerSetUp(m_colInternalBottom, m_colInternalTop);
            addInternal(m_blobLn2, m_blobMlp);
            m_fc.Reshape(m_colInternalBottom, m_colInternalTop);
            addInternal(m_blobMlp, m_blobMlp);
            m_act.LayerSetUp(m_colInternalBottom, m_colInternalTop);
            addInternal(m_blobMlp, m_blobMlpOut);
            m_proj.LayerSetUp(m_colInternalBottom, m_colInternalTop);

            if (m_dropout != null)
            {
                addInternal(m_blobMlpOut, m_blobMlpOut);
                m_dropout.LayerSetUp(m_colInternalBottom, m_colInternalTop);
            }

            colTop[0].ReshapeLike(m_blobMlpOut);

            blobs.Add(m_attn1.blobs);
            if (m_attn2 != null)
                blobs.Add(m_attn2.blobs);
            blobs.Add(m_fc.blobs);
            blobs.Add(m_proj.blobs);

            foreach (Blob<T> blob in blobs)
            {
                if (!blob.Name.StartsWith(m_param.name + "_"))
                    blob.Name = m_param.name + "_" + blob.Name;
            }
        }

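        /// <summary>
        /// Reshape the bottom (input) and top (output) blobs.
        /// </summary>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs.</param>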
        public override void Reshape(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            colTop[0].ReshapeLike(colBottom[0]);

            m_blobLn1.ReshapeLike(colBottom[0]);
            m_blobAttn1.ReshapeLike(colBottom[0]);
            m_blobLn2.ReshapeLike(colBottom[0]);
            m_blobX.ReshapeLike(colBottom[0]);

            if (m_blobAttn2 != null)
                m_blobAttn2.ReshapeLike(colBottom[0]);

            if (m_blobLn3 != null)
                m_blobLn3.ReshapeLike(colBottom[0]);

            m_blobMlp.ReshapeLike(colBottom[0]);
            m_blobMlpOut.ReshapeLike(colBottom[0]);

            addInternal(colBottom[0], m_blobLn1);
            m_ln1.Reshape(m_colInternalBottom, m_colInternalTop);

            if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.CAUSAL_SELF_ATTENTION)
            {
                // self.attn(self.ln_1(x))
                addInternal(m_blobLn1, m_blobAttn1);
                m_attn1.Reshape(m_colInternalBottom, m_colInternalTop);
            }
            else if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.ENCODER)
            {
                // self.attn(x_1, x_1, x_1, e_mask)
                addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, colBottom[1] }, m_blobAttn1);
                m_attn1.Reshape(m_colInternalBottom, m_colInternalTop);
            }
            else if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
            {
                // self.attn1(x_1, x_1, x_1, d_mask)
                addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, colBottom[1] }, m_blobAttn1);
                m_attn1.Reshape(m_colInternalBottom, m_colInternalTop);
            }
            else
            {
                throw new Exception("Unknown block type '" + m_param.transformer_block_param.block_type.ToString() + "'!");
            }

            addInternal(colTop[0], m_blobLn2);
            m_ln2.Reshape(m_colInternalBottom, m_colInternalTop);
            Blob<T> blobLn = m_blobLn2;

            if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
            {
                // self.attn2(x_2, e_output, e_output, e_mask)
                addInternal(new List<Blob<T>>() { m_blobLn2, colBottom[2], colBottom[2], colBottom[3] }, m_blobAttn2);
                m_attn2.Reshape(m_colInternalBottom, m_colInternalTop);

                addInternal(m_blobAttn2, m_blobLn3);
                m_ln3.Reshape(m_colInternalBottom, m_colInternalTop);
                blobLn = m_blobLn3;
            }

            addInternal(blobLn, m_blobMlp);
            m_fc.Reshape(m_colInternalBottom, m_colInternalTop);
            addInternal(m_blobMlp, m_blobMlp);
            m_act.Reshape(m_colInternalBottom, m_colInternalTop);
            addInternal(m_blobMlp, m_blobMlpOut);
            m_proj.Reshape(m_colInternalBottom, m_colInternalTop);

            if (m_dropout != null)
            {
                addInternal(m_blobMlpOut, m_blobMlpOut);
                m_dropout.Reshape(m_colInternalBottom, m_colInternalTop);
            }

            colTop[0].ReshapeLike(colBottom[0]);
        }

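        /// <summary>
        /// The forward computation of the transformer block.
        /// </summary>
        /// <remarks>
        /// CAUSAL_SELF_ATTENTION and ENCODER blocks compute x = x + attn(ln_1(x)) followed by x = x + mlp(ln_2(x)).
        /// DECODER blocks add a second attention step over the encoder output: x = x + attn2(ln_2(x), e_output, e_output, e_mask), followed by x = x + mlp(ln_3(x)).
        /// </remarks>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs: x and, depending on the block type, the attention mask(s) and encoder output.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs: trans.</param>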
        protected override void forward(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            int nCount = colBottom[0].count();
            Blob<T> blobX = colBottom[0];
            Blob<T> blobXMask = (colBottom.Count > 1) ? colBottom[1] : null;
            Blob<T> blobEncOut = (colBottom.Count > 3) ? colBottom[2] : null;
            Blob<T> blobEncMask = (colBottom.Count > 3) ? colBottom[3] : null;

            //-------------------------------------------
            // x = x + self.attn(self.ln_1(x))

            // x_1 = self.ln_1(x)
            addInternal(blobX, m_blobLn1);
            m_ln1.Forward(m_colInternalBottom, m_colInternalTop);

            if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.CAUSAL_SELF_ATTENTION)
            {
                // attn1 = self.attn(self.ln_1(x))
                addInternal(m_blobLn1, m_blobAttn1);
                m_attn1.Forward(m_colInternalBottom, m_colInternalTop);
            }
            else if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.ENCODER)
            {
                // attn1 = self.attn(x_1, x_1, x_1, e_mask)
                addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, blobXMask }, m_blobAttn1);
                m_attn1.Forward(m_colInternalBottom, m_colInternalTop);
            }
            else if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
            {
                // attn1 = self.attn1(x_1, x_1, x_1, d_mask)
                addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, blobXMask }, m_blobAttn1);
                m_attn1.Forward(m_colInternalBottom, m_colInternalTop);
            }
            else
            {
                throw new Exception("Unknown block type '" + m_param.transformer_block_param.block_type.ToString() + "'!");
            }

            // xB = x + self.attn1(self.ln_1(x))
            m_cuda.add(nCount, blobX.gpu_data, m_blobAttn1.gpu_data, m_blobX.mutable_gpu_data);

            // x_2 = self.ln_2(xB)
            addInternal(m_blobX, m_blobLn2);
            m_ln2.Forward(m_colInternalBottom, m_colInternalTop);
            Blob<T> blobLn = m_blobLn2;

            if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
            {
                // attn2 = self.attn2(x_2, e_output, e_output, e_mask)
                addInternal(new List<Blob<T>>() { m_blobLn2, blobEncOut, blobEncOut, blobEncMask }, m_blobAttn2);
                m_attn2.Forward(m_colInternalBottom, m_colInternalTop);

                // xC = xB + self.attn2(self.ln_2(x))
                m_cuda.add(nCount, m_blobX.gpu_data, m_blobAttn2.gpu_data, m_blobX.mutable_gpu_data);

                // x_3 = self.ln3(xC)
                addInternal(m_blobX, m_blobLn3);
                m_ln3.Forward(m_colInternalBottom, m_colInternalTop);
                blobLn = m_blobLn3;
            }

            // CSA | ENCODER: ff = self.mlpf(self.ln_2(x_2)),
            // DECODER:       ff = self.mlpf(self.ln_3(x_3))
            addInternal(blobLn, m_blobMlp);
            m_fc.Forward(m_colInternalBottom, m_colInternalTop);
            addInternal(m_blobMlp, m_blobMlp);
            m_act.Forward(m_colInternalBottom, m_colInternalTop);
            addInternal(m_blobMlp, m_blobMlpOut);
            m_proj.Forward(m_colInternalBottom, m_colInternalTop);

            if (m_dropout != null)
            {
                addInternal(m_blobMlpOut, m_blobMlpOut);
                m_dropout.Forward(m_colInternalBottom, m_colInternalTop);
            }

            // CSA | ENCODER: xC = xB + self.mlpf(self.ln_2(x_2)),
            // DECODER:       xD = xC + self.mlpf(self.ln_3(x_3))
            m_cuda.add(nCount, m_blobX.gpu_data, m_blobMlpOut.gpu_data, colTop[0].mutable_gpu_data);
        }

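        /// <summary>
        /// Computes the loss error gradient w.r.t the outputs.
        /// </summary>
        /// <param name="colTop">Specifies the collection of top (output) Blobs, whose diff contains the error gradient.</param>
        /// <param name="rgbPropagateDown">Specifies whether or not to propagate the gradient down to each bottom Blob.</param>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>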
        protected override void backward(BlobCollection<T> colTop, List<bool> rgbPropagateDown, BlobCollection<T> colBottom)
        {
            int nCount = colBottom[0].count();
            Blob<T> blobX = colBottom[0];
            Blob<T> blobXMask = (colBottom.Count > 1) ? colBottom[1] : null;
            Blob<T> blobEncOut = (colBottom.Count > 3) ? colBottom[2] : null;
            Blob<T> blobEncMask = (colBottom.Count > 3) ? colBottom[3] : null;

            // Gradient with respect to state then data.
            if (rgbPropagateDown[0])
            {
                List<bool> rgbPropagate = new List<bool>() { true, true };

                // CSA | ENCODER gradient for xC = xB + self.mlpf(self.ln_2(x_2))
                // DECODER gradient for xD = xC + self.mlpf(self.ln_3(x_3))
                // xD -> ff (decoder), otherwise xC -> ff (encoder)
                m_cuda.copy(nCount, colTop[0].gpu_diff, m_blobMlpOut.mutable_gpu_diff);
                // xD -> xC (decoder), otherwise xC -> xB (encoder)
                m_cuda.copy(nCount, colTop[0].gpu_diff, m_blobX.mutable_gpu_diff); // xB, xC

                if (m_dropout != null)
                {
                    addInternal(m_blobMlpOut, m_blobMlpOut);
                    m_dropout.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                }

                // Gradient for the MLP.
                addInternal(m_blobMlp, m_blobMlpOut);
                m_proj.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                addInternal(m_blobMlp, m_blobMlp);
                m_act.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                Blob<T> blobLn = (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER) ? m_blobLn3 : m_blobLn2;
                // ff -> x_3 (decoder), otherwise x_2 (encoder)
                addInternal(blobLn, m_blobMlp);
                m_fc.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

                if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
                {
                    // x_3 = self.ln3(xC)
                    // x_3 -> xC1
                    m_blobAttn2.CopyFrom(m_blobX, true);
                    addInternal(m_blobX, m_blobLn3);
                    m_ln3.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

                    // x_3 + xC1 -> xC
                    // xC -> xB (implied)
                    m_cuda.add(nCount, m_blobAttn2.gpu_diff, m_blobX.gpu_diff, m_blobX.mutable_gpu_diff);
                    // xC -> attn2
                    m_blobAttn2.CopyFrom(m_blobX, true);

                    // attn2 = self.attn2(x_2, e_output, e_output, e_mask)
                    // attn2 -> x_2 (ln2), e_output1, e_output2
                    addInternal(new List<Blob<T>>() { m_blobLn2, blobEncOut, blobEncOut, blobEncMask }, m_blobAttn2);
                    m_attn2.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                }

                // x_2 = self.ln_2(xB)
                // x_2 -> xB1
                m_blobAttn1.CopyFrom(m_blobX, true);
                addInternal(m_blobX, m_blobLn2);
                m_ln2.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

                // xC + xB1 -> xB
                // xB -> x (implied)
                m_cuda.add(nCount, m_blobAttn1.gpu_diff, m_blobX.gpu_diff, m_blobX.mutable_gpu_diff);
                // xB -> attn1
                m_blobAttn1.CopyFrom(m_blobX, true);

                if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.CAUSAL_SELF_ATTENTION)
                {
                    // Gradient for self.attn(self.ln_1(x))
                    addInternal(m_blobLn1, m_blobAttn1);
                    m_attn1.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                }
                else if (m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.ENCODER ||
                         m_param.transformer_block_param.block_type == TransformerBlockParameter.BLOCK_TYPE.DECODER)
                {
                    // Gradient for self.attn(x_1, x_1, x_1, e_mask)
                    addInternal(new List<Blob<T>>() { m_blobLn1, m_blobLn1, m_blobLn1, blobXMask }, m_blobAttn1);
                    m_attn1.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                }
                else
                {
                    throw new Exception("Unknown block type '" + m_param.transformer_block_param.block_type.ToString() + "'!");
                }

                // x_1 = ln1(x)
                // x_1 -> x1
                addInternal(blobX, m_blobLn1);
                m_ln1.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

                // Accumulate the attention gradient with the others in bottom[0].
                // x1 + xB -> x
                m_cuda.add(nCount, blobX.gpu_diff, m_blobX.gpu_diff, blobX.mutable_gpu_diff);
            }
        }
    }
}