MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
MultiHeadAttentionInterpLayer.cs
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Reflection;
6using System.Text;
7using MyCaffe.basecode;
8using MyCaffe.common;
9using MyCaffe.param;
10
11namespace MyCaffe.layers.tft
12{
30 {
31 List<int> m_rgShapeQ;
32 List<int> m_rgShapeK;
33 List<int> m_rgShapeV;
34 List<int> m_rgShapeMask;
35 int m_nNumHeads;
36 int m_nDModel;
37 int m_nAllHeadsDim;
38 int m_nNumFut = 0;
39 int m_nNumHist = 0;
40 int m_nBlocks = 0;
41 double m_dfScale;
42 Layer<T> m_ipQLayer;
43 Layer<T> m_ipKLayer;
44 Layer<T> m_ipVLayer;
45 Layer<T> m_transpose;
46 Layer<T> m_softmax;
47 Layer<T> m_ipOutLayer;
48 Blob<T> m_blobQ;
49 Blob<T> m_blobK;
50 Blob<T> m_blobV;
51 Blob<T> m_blobIpQ;
52 Blob<T> m_blobIpK;
53 Blob<T> m_blobIpV;
54 Blob<T> m_blobMask;
55 Blob<T> m_blobIpVfull;
56 Blob<T> m_blobIpQt;
57 Blob<T> m_blobIpKt;
58 Blob<T> m_blobIpKt1;
59 Blob<T> m_blobIpVt;
60 Blob<T> m_blobAttnScores1;
61 Blob<T> m_blobAttnScoresAllHeads;
62 Blob<T> m_blobAttnOutputAllHeads;
63 Blob<T> m_blobWork;
64 BlobCollection<T> m_colTop = new BlobCollection<T>();
65 BlobCollection<T> m_colBtm = new BlobCollection<T>();
66 List<int> m_rgShape = new List<int>(4);
67
75 : base(cuda, log, p)
76 {
77 m_type = LayerParameter.LayerType.MULTIHEAD_ATTENTION_INTERP;
78
79 m_blobQ = new Blob<T>(cuda, log);
80 m_blobQ.Name = p.name + ".q";
81 m_blobK = new Blob<T>(cuda, log);
82 m_blobQ.Name = p.name + ".k";
83 m_blobV = new Blob<T>(cuda, log);
84 m_blobV.Name = p.name + ".v";
85 m_blobIpQ = new Blob<T>(cuda, log);
86 m_blobIpQ.Name = p.name + ".ipq";
87 m_blobIpK = new Blob<T>(cuda, log);
88 m_blobIpK.Name = p.name + ".ipk";
89 m_blobIpV = new Blob<T>(cuda, log);
90 m_blobIpV.Name = p.name + ".ipv";
91 m_blobMask = new Blob<T>(cuda, log, false);
92 m_blobMask.Name = p.name + ".mask";
93 m_blobIpVfull = new Blob<T>(cuda, log);
94 m_blobIpVfull.Name = p.name + ".ipvfull";
95 m_blobIpQt = new Blob<T>(cuda, log);
96 m_blobIpQt.Name = p.name + ".ipqt";
97 m_blobIpKt = new Blob<T>(cuda, log);
98 m_blobIpKt.Name = p.name + ".ipkt";
99 m_blobIpKt1 = new Blob<T>(cuda, log);
100 m_blobIpKt1.Name = p.name + ".ipkt1";
101 m_blobIpVt = new Blob<T>(cuda, log);
102 m_blobIpVt.Name = p.name + ".ipvt";
103 m_blobAttnScores1 = new Blob<T>(cuda, log);
104 m_blobAttnScores1.Name = p.name + ".attn_scores";
105 m_blobAttnScoresAllHeads = new Blob<T>(cuda, log);
106 m_blobAttnScoresAllHeads.Name = p.name + ".attn_scr_allhd";
107 m_blobAttnOutputAllHeads = new Blob<T>(cuda, log);
108 m_blobAttnOutputAllHeads.Name = p.name + ".attn_out_allhd";
109 m_blobWork = new Blob<T>(cuda, log);
110 m_blobWork.Name = p.name + ".work";
111 }
112
        /// <summary>
        /// Releases all GPU and host resources used by the Layer: every internal blob
        /// and every internal sub-layer created during LayerSetUp.
        /// </summary>
        protected override void dispose()
        {
            // Free the internal working blobs (dispose(ref ...) frees and nulls the field).
            dispose(ref m_blobQ);
            dispose(ref m_blobK);
            dispose(ref m_blobV);
            dispose(ref m_blobIpQ);
            dispose(ref m_blobIpK);
            dispose(ref m_blobIpV);
            dispose(ref m_blobMask);
            dispose(ref m_blobIpVfull);
            dispose(ref m_blobIpQt);
            dispose(ref m_blobIpKt);
            dispose(ref m_blobIpKt1);
            dispose(ref m_blobIpVt);
            dispose(ref m_blobAttnScores1);
            dispose(ref m_blobAttnScoresAllHeads);
            dispose(ref m_blobAttnOutputAllHeads);
            dispose(ref m_blobWork);

            // Free the internal sub-layers (q/k/v projections, transpose, softmax, output projection).
            dispose(ref m_ipQLayer);
            dispose(ref m_ipKLayer);
            dispose(ref m_ipVLayer);
            dispose(ref m_transpose);
            dispose(ref m_softmax);
            dispose(ref m_ipOutLayer);
        }
140
142 protected override void setup_internal_blobs(BlobCollection<T> col)
143 {
144 if (col.Count > 0)
145 return;
146
147 col.Add(m_blobQ);
148 col.Add(m_blobK);
149 col.Add(m_blobV);
150 col.Add(m_blobIpQ);
151 col.Add(m_blobIpK);
152 col.Add(m_blobIpV);
153 col.Add(m_blobMask);
154 col.Add(m_blobIpVfull);
155 col.Add(m_blobIpQt);
156 col.Add(m_blobIpKt);
157 col.Add(m_blobIpKt1);
158 col.Add(m_blobIpVt);
159 col.Add(m_blobAttnScores1);
160 col.Add(m_blobAttnScoresAllHeads);
161 col.Add(m_blobAttnOutputAllHeads);
162 col.Add(m_blobWork);
163 }
164
168 public override int MinBottomBlobs
169 {
170 get { return 1; }
171 }
172
176 public override int MaxBottomBlobs
177 {
178 get { return 4; }
179 }
180
184 public override int ExactNumTopBlobs
185 {
186 get { return 3; }
187 }
188
189 private void addBtmTop(Blob<T> btm, Blob<T> top)
190 {
191 m_colBtm.Clear();
192 m_colBtm.Add(btm);
193 m_colTop.Clear();
194 m_colTop.Add(top);
195 }
196
197 private void reshapeRepeat(Blob<T> b, List<int> rgShape, int nRepeat)
198 {
199 m_rgShape.Clear();
200 m_rgShape.AddRange(rgShape);
201 m_rgShape[3] *= nRepeat;
202 b.Reshape(m_rgShape);
203 }
204
205 private void reshapeFwd(Blob<T> b, int nNumHeads, List<int> rgShape = null)
206 {
207 m_rgShape.Clear();
208
209 if (rgShape == null)
210 rgShape = b.shape();
211
212 m_rgShape.Add(rgShape[0]);
213 m_rgShape.Add(rgShape[1]);
214 m_rgShape.Add(nNumHeads);
215 m_rgShape.Add(rgShape[2] / nNumHeads);
216 b.Reshape(m_rgShape);
217 }
218
219 private void reshapeBwd(Blob<T> b, int nNumHeads, List<int> rgShape = null)
220 {
221 m_rgShape.Clear();
222
223 if (rgShape == null)
224 rgShape = b.shape();
225
226 m_rgShape.Add(rgShape[0]);
227 m_rgShape.Add(rgShape[1]);
228 m_rgShape.Add(rgShape[2] * rgShape[3]);
229 b.Reshape(m_rgShape);
230 }
231
232 private void reshapeSansHead(Blob<T> b, List<int> rgShape)
233 {
234 m_rgShape.Clear();
235 m_rgShape.AddRange(rgShape);
236 m_rgShape.RemoveAt(1);
237 b.Reshape(m_rgShape);
238 }
239
240 private void calculateChannelMeanAcrossChannelsFwd(Blob<T> bBtm, Blob<T> bTop)
241 {
242 int nN = bBtm.num;
243 int nC = bBtm.channels;
244 int nSpatialDim = bBtm.count(2);
245 int nSpatialDimDst = bTop.count(1);
246
247 m_log.CHECK_EQ(bTop.num, nN, "Both src and dst must have same 'num'.");
248 m_log.CHECK_EQ(nSpatialDim, bTop.count(1), "Both src and dst must have the same spatial dim.");
249
250 bTop.SetData(0);
251 m_blobWork.ReshapeLike(bTop);
252
253 for (int i = 0; i < nC; i++)
254 {
255 m_cuda.channel_copy(m_blobWork.count(), nN, 1, nC, nSpatialDim, i, bBtm.gpu_data, m_blobWork.gpu_data, DIR.FWD);
256 m_cuda.add(m_blobWork.count(), m_blobWork.gpu_data, bTop.gpu_data, bTop.mutable_gpu_data);
257 }
258
259 bTop.scale_data(1.0 / nC);
260 }
261
        /// <summary>
        /// Backward of the channel-mean: distribute the gradient in bTop.diff to each of the
        /// nC channels of bBtm.diff, then scale by 1/nC.
        /// </summary>
        /// <param name="bBtm">Specifies the destination blob (diff written) with a channel (head) axis.</param>
        /// <param name="bTop">Specifies the source blob (diff read) without the channel axis.</param>
        private void calculateChannelMeanAcrossChannelsBwd(Blob<T> bBtm, Blob<T> bTop)
        {
            int nN = bBtm.num;
            int nC = bBtm.channels;
            int nSpatialDim = bBtm.count(2);

            m_log.CHECK_EQ(bTop.num, nN, "Both src and dst must have same 'num'.");
            m_log.CHECK_EQ(nSpatialDim, bTop.count(1), "Both src and dst must have the same spatial dim.");

            // Clear the destination diff before scattering the gradient into each channel.
            bBtm.SetDiff(0);

            for (int i = 0; i < nC; i++)
            {
                // NOTE(review): with DIR.BWD the src/dst roles of the two buffers appear reversed
                // relative to DIR.FWD (bTop.diff copied into channel i of bBtm.diff) — confirm
                // against CudaDnn.channel_copy documentation.
                m_cuda.channel_copy(bTop.count(), nN, 1, nC, nSpatialDim, i, bBtm.gpu_diff, bTop.gpu_diff, DIR.BWD);
            }

            // Each channel receives 1/nC of the mean's gradient.
            bBtm.scale_diff(1.0 / nC);
        }
280
281 private void generate_mask(Blob<T> mask)
282 {
283 m_rgShape.Clear();
284 m_rgShape.Add(m_nNumFut);
285 m_rgShape.Add(m_nNumFut + m_nNumHist);
286 mask.Reshape(m_rgShape);
287
288 int nRow = m_nNumFut + m_nNumHist;
289 int nOutSeqLen = m_nNumFut; //- m_nTargetWindowStartIdx; not used
290 float[] rgData = new float[mask.count()];
291
292 for (int i = 0; i < m_nNumFut; i++)
293 {
294 for (int j = 0; j < m_nNumHist + nOutSeqLen; j++)
295 {
296 int nIdx = i * nRow + j;
297
298 if (j > m_nNumHist && j-m_nNumHist > i)
299 rgData[nIdx] = 1;
300 }
301 }
302
303 mask.mutable_cpu_data = convert(rgData);
304 }
305
        /// <summary>
        /// Setup the layer: validate the bottom configuration, create the q/k/v inner-product
        /// projections, the shared transpose layer, the softmax, and the output projection,
        /// and size all internal blobs for the first forward pass.
        /// </summary>
        /// <param name="colBottom">Specifies the bottom blobs: either 1 (self-attention input; mask generated)
        /// or 4 (q, k, v, mask).</param>
        /// <param name="colTop">Specifies the top blobs: y, attn_out, attn_scores.</param>
        public override void LayerSetUp(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            if (m_param.multihead_attention_interp_param.enable_self_attention)
                m_log.CHECK_EQ(colBottom.Count, 1, "When using self-attention, there should only be one bottom.");
            else
                m_log.CHECK_EQ(colBottom.Count, 3, "When not using self-attention, there should be three bottom values: q, k, v");
            // NOTE(review): the CHECK_EQ(count, 3) above contradicts the CHECK below requiring count == 1 or 4,
            // and the colBottom[3] mask access further down — the non-self-attention path appears to expect
            // 4 bottoms (q, k, v, mask); confirm the intended count.

            m_nNumHeads = (int)m_param.multihead_attention_interp_param.num_heads;
            m_nDModel = (int)m_param.multihead_attention_interp_param.embed_dim;
            m_nAllHeadsDim = m_nNumHeads * m_nDModel;
            // Scaled dot-product attention factor 1/sqrt(d_model).
            m_dfScale = 1.0 / Math.Sqrt(m_nDModel);

            m_log.CHECK(colBottom.Count == 1 || colBottom.Count == 4, "The bottom count must be 1 (input ->q,k,v, mask generated) or 4 for q,k,q,mask");

            m_nNumFut = (int)m_param.multihead_attention_interp_param.num_future_steps;
            m_log.CHECK_GT(m_nNumFut, 0, "The number of future steps must be greater than zero.");
            m_nNumHist = (int)m_param.multihead_attention_interp_param.num_historical_steps;
            m_log.CHECK_GT(m_nNumHist, 0, "The number of historical steps must be greater than zero.");
            m_log.CHECK_EQ(m_nNumFut + m_nNumHist, colBottom[0].channels, "The number of future + historical steps must equal the bottom(0).channels.");
            m_log.CHECK_EQ(m_nNumHist % m_nNumFut, 0, "The historical steps must be a multiple of the future steps! For example, historical steps = 90 and future steps = 30.");
            // Number of future-sized blocks spanning the full (hist + fut) sequence.
            m_nBlocks = (m_nNumHist + m_nNumFut) / m_nNumFut;

            // With a single bottom the causal mask is generated; otherwise it is shared from bottom(3).
            if (colBottom.Count == 1)
                generate_mask(m_blobMask);
            else
                m_blobMask.ShareData(colBottom[3]);

            // Q projection: input -> all-heads dimension.
            if (m_ipQLayer == null)
            {
                LayerParameter ipQ = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT, m_param.name + ".ipQ");
                ipQ.inner_product_param.num_output = (uint)m_nAllHeadsDim;
                ipQ.inner_product_param.bias_grad_scale = 1000000.0; // helps improve bias gradient accuracy.

                m_ipQLayer = Layer<T>.Create(m_cuda, m_log, convertLayerParam(ipQ, m_param), null);

                // In self-attention mode only the future steps of bottom(0) feed the query.
                if (colBottom.Count == 1)
                {
                    m_rgShape.Clear();
                    m_rgShape.Add(colBottom[0].num);
                    m_rgShape.Add(m_nNumFut);
                    m_rgShape.Add(colBottom[0].count(2));
                    m_blobQ.Reshape(m_rgShape);
                }
                else
                {
                    m_blobQ.ReshapeLike(colBottom[0]);
                }

                addBtmTop(m_blobQ, m_blobIpQ);
                m_ipQLayer.Setup(m_colBtm, m_colTop);
                blobs.Add(m_ipQLayer.blobs);
            }

            // K projection: full (hist + fut) sequence -> all-heads dimension.
            if (m_ipKLayer == null)
            {
                LayerParameter ipK = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT, m_param.name + ".ipK");
                ipK.inner_product_param.num_output = (uint)m_nAllHeadsDim;
                ipK.inner_product_param.bias_grad_scale = 1000000.0; // helps improve bias gradient accuracy.

                m_ipKLayer = Layer<T>.Create(m_cuda, m_log, convertLayerParam(ipK, m_param), null);
                m_blobK.ReshapeLike((colBottom.Count == 1) ? colBottom[0] : colBottom[1]);

                addBtmTop(m_blobK, m_blobIpK);
                m_ipKLayer.Setup(m_colBtm, m_colTop);
                blobs.Add(m_ipKLayer.blobs);
            }

            // V projection: full (hist + fut) sequence -> projection shared across heads.
            if (m_ipVLayer == null)
            {
                LayerParameter ipV = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT, m_param.name + ".ipV");

                m_ipVLayer = Layer<T>.Create(m_cuda, m_log, convertLayerParam(ipV, m_param), null);
                // NOTE(review): looks like this should use colBottom[2] for the explicit-V case
                // (matches Reshape below) — confirm; colBottom[1] is used here.
                m_blobV.ReshapeLike((colBottom.Count == 1) ? colBottom[0] : colBottom[1]);

                addBtmTop(m_blobV, m_blobIpV);
                m_ipVLayer.Setup(m_colBtm, m_colTop);
                blobs.Add(m_ipVLayer.blobs);
            }

            // Transpose
            if (m_transpose == null)
            {
                // Reshape q, k, v projections to the following sizes
                // queries tensor - q: [num_samples x num_future_steps x state_size]
                // keys tensor - k: [num_samples x num_total_steps x state_size]
                // values tensor - v: [num_samples x num_total_steps x state_size]
                reshapeFwd(m_blobIpQ, m_nNumHeads);
                reshapeFwd(m_blobIpK, m_nNumHeads);
                reshapeFwd(m_blobIpV, m_nNumHeads);
                reshapeRepeat(m_blobIpVfull, m_blobIpV.shape(), m_nNumHeads);

                // Swap axes 1 and 2 to move the head axis before the time axis.
                LayerParameter transpose = new LayerParameter(LayerParameter.LayerType.TRANSPOSE, m_param.name + ".trans");
                transpose.transpose_param.dim[1] = 2;
                transpose.transpose_param.dim[2] = 1;
                m_transpose = Layer<T>.Create(m_cuda, m_log, convertLayerParam(transpose, m_param), null);

                // A single transpose layer is shared by q, k and v-full.
                addBtmTop(m_blobIpQ, m_blobIpQt);
                m_transpose.Setup(m_colBtm, m_colTop);
                addBtmTop(m_blobIpK, m_blobIpKt);
                m_transpose.Setup(m_colBtm, m_colTop);
                addBtmTop(m_blobIpVfull, m_blobIpVt);
                m_transpose.Setup(m_colBtm, m_colTop);
            }

            // Transpose
            if (m_blobIpKt1.count() == 0)
            {
                // Kt1 is Kt with height and width swapped, so that scores = Qt x Kt1.
                List<int> rgShape = Utility.Clone<int>(m_blobIpKt.shape());
                int nTemp = rgShape[2];
                rgShape[2] = rgShape[3];
                rgShape[3] = nTemp;
                m_blobIpKt1.Reshape(rgShape);

                // MatMul with bReshape=true sizes m_blobAttnScores1 for the score tensor.
                m_blobAttnScores1.MatMul(m_blobIpQt, m_blobIpKt1, true);
            }

            // Softmax
            if (m_softmax == null)
            {
                // Softmax over the last axis (the attended-to time steps).
                LayerParameter softmax = new LayerParameter(LayerParameter.LayerType.SOFTMAX, m_param.name + ".softmax");
                softmax.softmax_param.axis = -1;
                m_softmax = Layer<T>.Create(m_cuda, m_log, convertLayerParam(softmax, m_param), null);

                addBtmTop(m_blobAttnScores1, m_blobAttnScoresAllHeads);
                m_softmax.Setup(m_colBtm, m_colTop);

                // Sizes m_blobAttnOutputAllHeads = softmax(scores) x Vt.
                m_blobAttnOutputAllHeads.MatMul(m_blobAttnScoresAllHeads, m_blobIpVt, true);
            }

            // Output projection applied to the head-averaged attention output.
            if (m_ipOutLayer == null)
            {
                LayerParameter ipOut = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT, m_param.name + ".ipOut");
                ipOut.inner_product_param.axis = 2;
                ipOut.inner_product_param.bias_term = true;

                m_ipOutLayer = Layer<T>.Create(m_cuda, m_log, convertLayerParam(ipOut, m_param), null);

                // Tops 1 and 2 hold the head-averaged attention output and scores (head axis removed).
                reshapeSansHead(colTop[1], m_blobAttnOutputAllHeads.shape());
                reshapeSansHead(colTop[2], m_blobAttnScoresAllHeads.shape());

                addBtmTop(colTop[1], colTop[0]);
                m_ipOutLayer.Setup(m_colBtm, m_colTop);
                blobs.Add(m_ipOutLayer.blobs);
            }
        }
481
489 protected override bool reshapeNeeded(BlobCollection<T> colBottom, BlobCollection<T> colTop, bool bReset = false)
490 {
491 if (bReset)
492 return true;
493
494 bool bShapeQDirty = m_rgShapeQ == null || !colBottom[0].CompareShape(m_rgShapeQ);
495 bool bShapeKDirty = (colBottom.Count == 1) ? bShapeQDirty : m_rgShapeK == null || !colBottom[1].CompareShape(m_rgShapeK);
496 bool bShapeVDirty = (colBottom.Count == 1) ? bShapeQDirty : m_rgShapeV == null || !colBottom[2].CompareShape(m_rgShapeV);
497 bool bShapeMaskDirty = false;
498
499 m_rgShapeQ = Utility.Clone<int>(colBottom[0].shape());
500 m_rgShapeK = m_rgShapeQ;
501 m_rgShapeV = m_rgShapeQ;
502
503 if (colBottom.Count > 1)
504 m_rgShapeK = Utility.Clone<int>(colBottom[1].shape());
505 if (colBottom.Count > 2)
506 m_rgShapeV = Utility.Clone<int>(colBottom[2].shape());
507
508 if (colBottom.Count > 3)
509 {
510 bShapeMaskDirty = m_rgShapeMask == null || !colBottom[3].CompareShape(m_rgShapeMask);
511 m_rgShapeMask = Utility.Clone<int>(colBottom[3].shape());
512 }
513
514 if (bShapeQDirty || bShapeKDirty || bShapeVDirty || bShapeMaskDirty)
515 return true;
516
517 return false;
518 }
519
        /// <summary>
        /// Reshape the top (output) blobs and all internal blobs to match the current bottom shapes.
        /// The order mirrors forward(): q/k/v projections, per-head split, transpose, score MatMul,
        /// softmax, output MatMul, head removal, output projection.
        /// </summary>
        /// <param name="colBottom">Specifies the bottom blobs.</param>
        /// <param name="colTop">Specifies the top blobs: y, attn_out, attn_scores.</param>
        public override void Reshape(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            // Skip the (expensive) reshape chain when no bottom shape changed.
            if (!reshapeNeeded(colBottom, colTop))
                return;

            // Self-attention: only the future steps of bottom(0) feed the query.
            if (colBottom.Count == 1)
            {
                m_rgShape.Clear();
                m_rgShape.Add(colBottom[0].num);
                m_rgShape.Add(m_nNumFut);
                m_rgShape.Add(colBottom[0].count(2));
                m_blobQ.Reshape(m_rgShape);
            }
            else
            {
                m_blobQ.ReshapeLike(colBottom[0]);
            }

            addBtmTop(m_blobQ, m_blobIpQ);
            m_ipQLayer.Reshape(m_colBtm, m_colTop);

            m_blobK.ReshapeLike((colBottom.Count == 1) ? colBottom[0] : colBottom[1]);
            addBtmTop(m_blobK, m_blobIpK);
            m_ipKLayer.Reshape(m_colBtm, m_colTop);

            m_blobV.ReshapeLike((colBottom.Count == 1) ? colBottom[0] : colBottom[2]);
            addBtmTop(m_blobV, m_blobIpV);
            m_ipVLayer.Reshape(m_colBtm, m_colTop);

            // Reshape q, k, v projections to the following sizes
            // queries tensor - q: [num_samples x num_future_steps x state_size]
            // keys tensor - k: [num_samples x num_total_steps x state_size]
            // values tensor - v: [num_samples x num_total_steps x state_size]
            reshapeFwd(m_blobIpQ, m_nNumHeads);
            reshapeFwd(m_blobIpK, m_nNumHeads);
            reshapeFwd(m_blobIpV, m_nNumHeads);
            reshapeRepeat(m_blobIpVfull, m_blobIpV.shape(), m_nNumHeads);

            addBtmTop(m_blobIpQ, m_blobIpQt);
            m_transpose.Reshape(m_colBtm, m_colTop);

            addBtmTop(m_blobIpK, m_blobIpKt);
            m_transpose.Reshape(m_colBtm, m_colTop);

            addBtmTop(m_blobIpVfull, m_blobIpVt);
            m_transpose.Reshape(m_colBtm, m_colTop);

            // Kt1 is Kt with height and width swapped, so that scores = Qt x Kt1.
            List<int> rgShape = Utility.Clone<int>(m_blobIpKt.shape());
            int nTemp = rgShape[2];
            rgShape[2] = rgShape[3];
            rgShape[3] = nTemp;
            m_blobIpKt1.Reshape(rgShape);

            // MatMul with bReshape=true sizes m_blobAttnScores1 for the score tensor.
            m_blobAttnScores1.MatMul(m_blobIpQt, m_blobIpKt1, true);

            addBtmTop(m_blobAttnScores1, m_blobAttnScoresAllHeads);
            m_softmax.Reshape(m_colBtm, m_colTop);

            // Temporarily sizes colTop[1] to the full per-head output so the work blob can mirror it.
            colTop[1].MatMul(m_blobAttnScoresAllHeads, m_blobIpVt, true);
            m_blobWork.ReshapeLike(colTop[1]);

            // NOTE(review): these use the shapes of m_blobAttnOutputAllHeads / m_blobAttnScoresAllHeads,
            // which are assumed to have been sized by the MatMul/softmax calls above or during setup —
            // confirm they are current at this point.
            reshapeSansHead(colTop[1], m_blobAttnOutputAllHeads.shape());
            reshapeSansHead(colTop[2], m_blobAttnScoresAllHeads.shape());
            colTop[2].type = BLOB_TYPE.ATTENTION;

            addBtmTop(colTop[1], colTop[0]);
            m_ipOutLayer.Reshape(m_colBtm, m_colTop);
        }
593
594 private void copy_to_q_fwd(int nCount, Blob<T> bBtm, Blob<T> bTop)
595 {
596 if (nCount == 1)
597 {
598 // Copy just the future items to the top, so if future = 30,
599 // with input shape is btm(256,120,64) just the last (256,30,64) are copied to top
600 int nOuterNum = bBtm.num;
601 int nChannels = m_nBlocks;
602 int nInnerNum = (bBtm.channels / m_nBlocks) * bBtm.count(2);
603 m_cuda.channel_copy(bTop.count(), nOuterNum, nChannels, m_nBlocks, nInnerNum, m_nBlocks-1, bBtm.gpu_data, bTop.mutable_gpu_data, DIR.FWD);
604 }
605 else
606 {
607 bTop.CopyFrom(bBtm);
608 }
609 }
610
611 private void copy_to_q_bwd(int nCount, Blob<T> bBtm, Blob<T> bTop)
612 {
613 if (nCount == 1)
614 {
615 // Copy just the future items to the top, so if future = 30,
616 // with input shape is btm(256,120,64) just the last (256,30,64) are copied to top
617 int nOuterNum = bBtm.num;
618 int nChannels = m_nBlocks;
619 int nInnerNum = (bBtm.channels / m_nBlocks) * bBtm.count(2);
620 m_cuda.channel_add(bTop.count(), nOuterNum, nChannels, m_nBlocks, nInnerNum, m_nBlocks-1, bBtm.mutable_gpu_diff, bTop.gpu_diff, DIR.BWD);
621 }
622 else
623 {
624 bTop.CopyFrom(bBtm, true);
625 }
626 }
627
        /// <summary>
        /// Forward computation: project q/k/v, split into heads, transpose, compute masked
        /// scaled dot-product attention, average across heads, and apply the output projection.
        /// </summary>
        /// <param name="colBottom">Specifies the bottom blobs: either 1 (self-attention input) or 4 (q, k, v, mask).</param>
        /// <param name="colTop">Specifies the top blobs: y, attn_out, attn_scores.</param>
        protected override void forward(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            // Calculate q, k, v projections
            copy_to_q_fwd(colBottom.Count, colBottom[0], m_blobQ);

            addBtmTop(m_blobQ, m_blobIpQ);
            m_ipQLayer.Forward(m_colBtm, m_colTop);

            m_blobK.CopyFrom((colBottom.Count == 1) ? colBottom[0] : colBottom[1]);

            addBtmTop(m_blobK, m_blobIpK);
            m_ipKLayer.Forward(m_colBtm, m_colTop);

            m_blobV.CopyFrom((colBottom.Count == 1) ? colBottom[0] : colBottom[2]);

            addBtmTop(m_blobV, m_blobIpV);
            m_ipVLayer.Forward(m_colBtm, m_colTop);

            // Reshape q, k, v projections to the following sizes
            // queries tensor - q: [num_samples x num_future_steps x num_heads x state_size]
            // keys tensor - k: [num_samples x num_total_steps x num_heads x state_size]
            // values tensor - v: [num_samples x num_total_steps x num_heads x state_size]
            reshapeFwd(m_blobIpQ, m_nNumHeads);
            reshapeFwd(m_blobIpK, m_nNumHeads);
            reshapeFwd(m_blobIpV, m_nNumHeads);
            reshapeRepeat(m_blobIpVfull, m_blobIpV.shape(), m_nNumHeads);

            // repeat blobIpV width to V full.
            // NOTE(review): m_blobIpVfull.mutable_gpu_data appears in the 'src' position with DIR.BWD —
            // presumably DIR.BWD reverses the src/dst roles so IpV is replicated into slice i of IpVfull;
            // confirm against CudaDnn.channel_copy documentation.
            int nInnerNum = m_blobIpV.count(2);
            for (int i = 0; i < m_nNumHeads; i++)
            {
                m_cuda.channel_copy(m_blobIpV.count(), m_blobIpV.num, m_blobIpV.channels, m_nNumHeads, nInnerNum, i, m_blobIpVfull.mutable_gpu_data, m_blobIpV.gpu_data, DIR.BWD);
            }

            // Transpose to get the new shapes
            // queries tensor - q: [num_samples x num_heads x num_future_steps x state_size]
            // keys tensor - k: [num_samples x num_heads x num_total_steps x state_size]
            // values tensor - v: [num_samples x num_heads x num_total_steps x state_size]

            addBtmTop(m_blobIpQ, m_blobIpQt);
            m_transpose.Forward(m_colBtm, m_colTop);

            addBtmTop(m_blobIpK, m_blobIpKt);
            m_transpose.Forward(m_colBtm, m_colTop);

            addBtmTop(m_blobIpVfull, m_blobIpVt);
            m_transpose.Forward(m_colBtm, m_colTop);

            //-----------------------------------------
            // Calculate the attention
            //-----------------------------------------
            {
                // Apply the scaled dot product: scores = (Qt x Kt^T) * (1/sqrt(d_model)).
                m_blobIpKt1.CopyFromAndTransposeHeightWidth(m_blobIpKt);
                m_blobAttnScores1.MatMul(m_blobIpQt, m_blobIpKt1, true);
                m_blobAttnScores1.scale_data(m_dfScale);

                // Decoder masking is applied to the multi-head attention layer to ensure that each temporal dimension can
                // only attend to the preceding features.
                if (m_blobMask != null)
                {
                    // Apply mask to attention matrix
                    // att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
                    float fInf = 1e29f;  // large finite stand-in for +inf
                    // all masked items set to -inf.
                    m_cuda.mask_batch(m_blobAttnScores1.count(), 1, m_blobMask.count(), convert(1.0), convert(-1 * fInf), m_blobAttnScores1.gpu_data, m_blobMask.gpu_data, m_blobAttnScores1.mutable_gpu_data);
                }

                // Calculate the softmax to find the most important parts of the data (e.g. where to focus the attention)
                addBtmTop(m_blobAttnScores1, m_blobAttnScoresAllHeads);
                m_softmax.Forward(m_colBtm, m_colTop);

                // Multiply the softmax with the values to get the attention outputs.
                m_blobAttnOutputAllHeads.MatMul(m_blobAttnScoresAllHeads, m_blobIpVt, true);

                // attention scores -> colTop[2], shape [num_samples x num_heads x num_future_steps x num_total_steps]
                // attention output -> colTop[1], shape [num_samples x num_heads x num_future_steps x state_size]
            }

            // Average along all heads.
            calculateChannelMeanAcrossChannelsFwd(m_blobAttnOutputAllHeads, colTop[1]);
            calculateChannelMeanAcrossChannelsFwd(m_blobAttnScoresAllHeads, colTop[2]);

            // Weight the attention outputs (in colTop[1]) placing the results in colTop[0]
            addBtmTop(colTop[1], colTop[0]);
            m_ipOutLayer.Forward(m_colBtm, m_colTop);
        }
726
        /// <summary>
        /// Computes the error gradient w.r.t. the inputs, reversing forward() step by step:
        /// output projection, head-mean, attention MatMuls and softmax, transposes, head merge,
        /// and finally the q/k/v projection gradients routed back to the bottom blobs.
        /// </summary>
        /// <param name="colTop">Specifies the top blobs carrying the incoming gradients.</param>
        /// <param name="rgbPropagateDown">Specifies whether to propagate down each bottom.</param>
        /// <param name="colBottom">Specifies the bottom blobs receiving the gradients.</param>
        protected override void backward(BlobCollection<T> colTop, List<bool> rgbPropagateDown, BlobCollection<T> colBottom)
        {
            // Calculate grad for the attention output weights (colTop[0] grad -> colTop[1] attn output grad)
            addBtmTop(colTop[1], colTop[0]);
            m_ipOutLayer.Backward(m_colTop, rgbPropagateDown, m_colBtm);

            // Average along all heads.
            calculateChannelMeanAcrossChannelsBwd(m_blobAttnOutputAllHeads, colTop[1]);

            //-----------------------------------------
            // Calculate the attention gradients
            //-----------------------------------------
            {
                // Multiply the softmax with the values to get the attention outputs.
                m_blobAttnOutputAllHeads.MatMulGrad(m_blobAttnScoresAllHeads, m_blobIpVt, m_blobWork);

                // Calculate the softmax gradient for the most important parts of the data (e.g. where to focus the attention)
                addBtmTop(m_blobAttnScores1, m_blobAttnScoresAllHeads);
                m_softmax.Backward(m_colTop, rgbPropagateDown, m_colBtm);

                // Calculate the Qt and Kt1 gradients (m_dfScale folds in the 1/sqrt(d_model) factor).
                m_blobAttnScores1.MatMulGrad(m_blobIpQt, m_blobIpKt1, m_blobWork, m_dfScale);

                // Transform the gradients back to Kt.
                m_blobIpKt.CopyFromAndTransposeHeightWidth(m_blobIpKt1, true);
            }

            // Transpose the gradients back to Q, K and V
            addBtmTop(m_blobIpQ, m_blobIpQt);
            m_transpose.Backward(m_colTop, rgbPropagateDown, m_colBtm);

            addBtmTop(m_blobIpK, m_blobIpKt);
            m_transpose.Backward(m_colTop, rgbPropagateDown, m_colBtm);

            addBtmTop(m_blobIpVfull, m_blobIpVt);
            m_transpose.Backward(m_colTop, rgbPropagateDown, m_colBtm);

            // Copy each IpVFull block to IpV
            m_blobIpV.SetDiff(0);

            // Accumulate the per-head diff slices of IpVfull into IpV (head 0 copied, heads 1..n-1 added).
            int nOuterNum = m_blobIpVfull.count(0, 2);
            m_cuda.channel_copy(m_blobIpV.count(), nOuterNum, 1, m_nNumHeads, m_blobIpVfull.width, 0, m_blobIpVfull.gpu_diff, m_blobIpV.mutable_gpu_diff, DIR.FWD);

            for (int i = 1; i < m_nNumHeads; i++)
            {
                m_cuda.channel_add(m_blobIpV.count(), nOuterNum, 1, m_nNumHeads, m_blobIpVfull.width, i, m_blobIpVfull.gpu_diff, m_blobIpV.mutable_gpu_diff, DIR.FWD);
            }

            // Reshape back to original q, k, v projection shapes
            // queries tensor - q: [num_samples x num_future_steps x state_size]
            // keys tensor - k: [num_samples x num_total_steps x state_size]
            // values tensor - v: [num_samples x num_total_steps x state_size]
            reshapeBwd(m_blobIpQ, m_nNumHeads);
            reshapeBwd(m_blobIpK, m_nNumHeads);
            reshapeBwd(m_blobIpV, m_nNumHeads);

            // Calculate q, k, v projection gradients
            addBtmTop(m_blobQ, m_blobIpQ);
            m_ipQLayer.Backward(m_colTop, rgbPropagateDown, m_colBtm);

            addBtmTop(m_blobK, m_blobIpK);
            m_ipKLayer.Backward(m_colTop, rgbPropagateDown, m_colBtm);

            addBtmTop(m_blobV, m_blobIpV);
            m_ipVLayer.Backward(m_colTop, rgbPropagateDown, m_colBtm);

            // Self-attention: q, k and v all came from bottom(0), so their gradients are summed there.
            if (colBottom.Count == 1)
            {
                colBottom[0].SetDiff(0);
                copy_to_q_bwd(colBottom.Count, colBottom[0], m_blobQ);
                m_cuda.add(colBottom[0].count(), colBottom[0].gpu_diff, m_blobK.gpu_diff, colBottom[0].mutable_gpu_diff);
                m_cuda.add(colBottom[0].count(), colBottom[0].gpu_diff, m_blobV.gpu_diff, colBottom[0].mutable_gpu_diff);
            }
            else
            {
                colBottom[0].CopyFrom(m_blobQ, true);
                colBottom[1].CopyFrom(m_blobK, true);
                colBottom[2].CopyFrom(m_blobV, true);
            }
        }
821 }
822}
The Log class provides general output in text form.
Definition: Log.cs:13
void CHECK(bool b, string str)
Test a flag for true.
Definition: Log.cs:227
void CHECK_EQ(double df1, double df2, string str)
Test whether one number is equal to another.
Definition: Log.cs:239
void CHECK_GT(double df1, double df2, string str)
Test whether one number is greater than another.
Definition: Log.cs:299
The Utility class provides general utility functions.
Definition: Utility.cs:35
The BlobCollection contains a list of Blobs.
void Add(Blob< T > b)
Add a new Blob to the collection.
void SetDiff(double df)
Set all blob diff to the value specified.
int Count
Returns the number of items in the collection.
void Clear(bool bDispose=false)
Remove all items from the collection.
void CopyFrom(BlobCollection< T > bSrc, bool bCopyDiff=false)
Copy the data or diff from another BlobCollection into this one.
The Blob is the main holder of data that moves through the Layers of the Net.
Definition: Blob.cs:25
int channels
DEPRECIATED; legacy shape accessor channels: use shape(1) instead.
Definition: Blob.cs:800
void SetData(T[] rgData, int nCount=-1, bool bSetCount=true)
Sets a number of items within the Blob's data.
Definition: Blob.cs:1922
void ShareData(Blob< T > b)
Set the data to point to the data of the other blob – useful in Layers which simply perform a copy in...
Definition: Blob.cs:1813
void MatMul(Blob< T > blobA, Blob< T > blobB, bool bReshape=false, bool bTransA=false, bool bTransB=false, double dfScale=1.0, bool bADiff=false, bool bBDiff=false, bool bCDiff=false)
MatMul blobA with blobB and place the result in this blob (e.g. this = matmul(A, B))....
Definition: Blob.cs:3922
void MatMulGrad(Blob< T > blobA, Blob< T > blobB, Blob< T > blobWork, double dfScale=1.0)
Calculates and propagates the gradient for blobA and BlobB given the input gradient in this blob's di...
Definition: Blob.cs:3974
long mutable_gpu_diff
Returns the diff GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1555
long mutable_gpu_data
Returns the data GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1487
void CopyFromAndTransposeHeightWidth(Blob< T > blobSrc, bool bCopyDiff=false, bool bUseCuda=true)
Copy from a source Blob and transpose the height and width of the copy.
Definition: Blob.cs:1002
T[] mutable_cpu_data
Get data from the GPU and bring it over to the host, or Set data from the Host and send it over to th...
Definition: Blob.cs:1461
void Reshape(int nNum, int nChannels, int nHeight, int nWidth, bool? bUseHalfSize=null)
DEPRECIATED; use
Definition: Blob.cs:442
void CopyFrom(Blob< T > src, int nSrcOffset, int nDstOffset, int nCount, bool bCopyData, bool bCopyDiff)
Copy from a source Blob.
Definition: Blob.cs:903
void scale_data(double df)
Scale the data by a scaling factor.
Definition: Blob.cs:1754
int width
DEPRECIATED; legacy shape accessor width: use shape(3) instead.
Definition: Blob.cs:816
List< int > shape()
Returns an array where each element contains the shape of an axis of the Blob.
Definition: Blob.cs:684
int count()
Returns the total number of items in the Blob.
Definition: Blob.cs:739
void ReshapeLike(Blob< T > b, bool? bUseHalfSize=null)
Reshape this Blob to have the same shape as another Blob.
Definition: Blob.cs:648
string Name
Get/set the name of the Blob.
Definition: Blob.cs:2184
long gpu_diff
Returns the diff GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1541
void scale_diff(double df)
Scale the diff by a scaling factor.
Definition: Blob.cs:1763
void SetDiff(double dfVal, int nIdx=-1)
Either sets all of the diff items in the Blob to a given value, or alternatively only sets a single i...
Definition: Blob.cs:1981
int num
DEPRECIATED; legacy shape accessor num: use shape(0) instead.
Definition: Blob.cs:792
long gpu_data
Returns the data GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1479
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
Definition: CudaDnn.cs:969
An interface for the units of computation which can be composed into a Net.
Definition: Layer.cs:31
Log m_log
Specifies the Log for output.
Definition: Layer.cs:43
LayerParameter m_param
Specifies the LayerParameter describing the Layer.
Definition: Layer.cs:47
void convert(BlobCollection< T > col)
Convert a collection of blobs from / to half size.
Definition: Layer.cs:535
void Backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Given the top Blob error gradients, compute the bottom Blob error gradients.
Definition: Layer.cs:815
double Forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Given the bottom (input) Blobs, this function computes the top (output) Blobs and the loss.
Definition: Layer.cs:728
abstract void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Adjust the shapes of top blobs and internal buffers to accomodate the shapes of the bottom blobs.
CudaDnn< T > m_cuda
Specifies the CudaDnn connection to Cuda.
Definition: Layer.cs:39
void Setup(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Implements common Layer setup functionality.
Definition: Layer.cs:439
static Layer< T > Create(CudaDnn< T > cuda, Log log, LayerParameter p, CancelEvent evtCancel, IXDatabaseBase db=null, TransferInput trxinput=null)
Create a new Layer based on the LayerParameter.
Definition: Layer.cs:1468
LayerParameter.LayerType m_type
Specifies the Layer type.
Definition: Layer.cs:35
BlobCollection< T > blobs
Returns the collection of learnable parameter Blobs for the Layer.
Definition: Layer.cs:875
LayerParameter convertLayerParam(LayerParameter pChild, LayerParameter pParent)
Called to convert a parent LayerParameterEx, used in blob sharing, with a child layer parameter.
Definition: Layer.cs:1134
The MultiHeadAttentionInterpLayer implements the Multi-head Attention Interpretive Layer
override void dispose()
Releases all GPU and host resources used by the Layer.
override int MaxBottomBlobs
Returns the max number of required bottom (input) Blobs: q, k, v, mask
override void setup_internal_blobs(BlobCollection< T > col)
Derivative layers should add all internal blobws to the 'col' provided.
override void forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Forward computation
override void backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Computes the error gradient w.r.t. the stacked embedding numeric and categorical value inputs.
override void LayerSetUp(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Setup the layer.
override bool reshapeNeeded(BlobCollection< T > colBottom, BlobCollection< T > colTop, bool bReset=false)
Determines if a reshape is needed or not.
override void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Reshape the top (output) blobs.
override int MinBottomBlobs
Returns the min number of required bottom (input) Blobs: input -> q,k,v, mask is generated
MultiHeadAttentionInterpLayer(CudaDnn< T > cuda, Log log, LayerParameter p)
The constructor.
override int ExactNumTopBlobs
Returns the exact number of required top (output) Blobs: y, attn_out, attn_scores
Specifies whether to use the NVIDIA cuDnn version or Caffe version of a given forward/backward operat...
Engine engine
Specifies the Engine in use.
Engine
Defines the type of engine to use.
double sigma_init
Specifies the initialization value for the sigma weight and sigma bias used when 'enable_noise' = tru...
FillerParameter weight_filler
The filler for the weights.
int axis
Specifies the first axis to be lumped into a single inner product computation; all preceding axes are...
bool enable_noise
Enable/disable noise in the inner-product layer (default = false).
double bias_grad_scale
Specifies a scaling value applied to the bias multiplier and then unapplied after calculating the bia...
FillerParameter bias_filler
The filler for the bias.
uint num_output
The number of outputs for the layer.
bool bias_term
Whether to have bias terms or not.
Specifies the base parameter for all layers.
string name
Specifies the name of this LayerParameter.
SoftmaxParameter softmax_param
Returns the parameter set when initialized with LayerType.SOFTMAX
MultiHeadAttentionInterpParameter multihead_attention_interp_param
Returns the parameter set when initialized with LayerType.MULTIHEAD_ATTENTION_INTERP
InnerProductParameter inner_product_param
Returns the parameter set when initialized with LayerType.INNERPRODUCT
TransposeParameter transpose_param
Returns the parameter set when initialized with LayerType.TRANSPOSE
LayerType
Specifies the layer type.
int axis
The axis along which to perform the softmax – may be negative to index from the end (e....
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
Definition: Annotation.cs:12
The MyCaffe.common namespace contains common MyCaffe classes.
Definition: BatchInput.cs:8
DIR
Defines the direction of data flow.
Definition: CudaDnn.cs:22
BLOB_TYPE
Defines the type of data held by a given Blob.
Definition: Interfaces.cs:62
The MyCaffe.layers.tft namespace contains all TFT related layers.
Definition: LayerFactory.cs:15
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe namespace contains the main body of MyCaffe code that closesly tracks the C++ Caffe open-...
Definition: Annotation.cs:12