MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
AttentionLayer.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using MyCaffe.basecode;
using MyCaffe.common;
using MyCaffe.param;
using MyCaffe.fillers;
using System.Diagnostics;

namespace MyCaffe.layers
{
    /// <summary>
    /// [DEPRECATED] The AttentionLayer provides focus for LSTM based encoder/decoder models.
    /// </summary>
    public class AttentionLayer<T> : Layer<T>
    {
        Layer<T> m_transposeX = null;
        Layer<T> m_transposeClip = null;
        Layer<T> m_ipUa = null;
        Layer<T> m_ipWa = null;
        Layer<T> m_tanh = null;
        Layer<T> m_add1 = null;
        Layer<T> m_ipV = null;

        Blob<T> m_blobX = null;
        Blob<T> m_blobClip = null;
        Blob<T> m_blobX1 = null;
        Blob<T> m_blobState = null;
        Blob<T> m_blobUh = null;
        Blob<T> m_blobWc = null;
        Blob<T> m_blobFullWc = null;
        Blob<T> m_blobAddOutput = null;
        Blob<T> m_blobGG = null;
        Blob<T> m_blobAA = null;
        Blob<T> m_blobScale = null;
        Blob<T> m_blobSoftmax = null;
        Blob<T> m_blobFocusedInput = null;
        Blob<T> m_blobContext = null;
        Blob<T> m_blobWork = null;

        BlobCollection<T> m_colInternalBottom = new BlobCollection<T>();
        BlobCollection<T> m_colInternalTop = new BlobCollection<T>();

        /// <summary>
        /// The AttentionLayer constructor.
        /// </summary>
        public AttentionLayer(CudaDnn<T> cuda, Log log, LayerParameter p)
            : base(cuda, log, p)
        {
            m_type = LayerParameter.LayerType.ATTENTION;

            List<int> rgDimClip = new List<int>() { 1, 0 };
            LayerParameter transposeClipparam = new LayerParameter(LayerParameter.LayerType.TRANSPOSE);
            transposeClipparam.transpose_param.dim = new List<int>(rgDimClip);

            m_transposeClip = new TransposeLayer<T>(cuda, log, transposeClipparam);

            LayerParameter ipUaParam = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT);
            ipUaParam.name = "ipUa";
            ipUaParam.inner_product_param.axis = 2;
            ipUaParam.inner_product_param.num_output = m_param.attention_param.dim;
            ipUaParam.inner_product_param.weight_filler = m_param.attention_param.weight_filler;
            ipUaParam.inner_product_param.bias_filler = m_param.attention_param.bias_filler;

            if (p is LayerParameterEx<T>)
            {
                LayerParameterEx<T> pEx = (LayerParameterEx<T>)p;
                ipUaParam = new LayerParameterEx<T>(ipUaParam, pEx.SharedBlobs, pEx.SharedLayerBlobs, pEx.SharedLayer);
            }

            m_ipUa = new InnerProductLayer<T>(cuda, log, ipUaParam);

            LayerParameter ipWaParam = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT);
            ipWaParam.name = "ipWa";
            ipWaParam.inner_product_param.axis = 1;
            ipWaParam.inner_product_param.num_output = m_param.attention_param.dim;
            ipWaParam.inner_product_param.weight_filler = m_param.attention_param.weight_filler;
            ipWaParam.inner_product_param.bias_filler = m_param.attention_param.bias_filler;

            if (p is LayerParameterEx<T>)
            {
                LayerParameterEx<T> pEx = (LayerParameterEx<T>)p;
                ipWaParam = new LayerParameterEx<T>(ipWaParam, pEx.SharedBlobs, pEx.SharedLayerBlobs, pEx.SharedLayer);
            }

            m_ipWa = new InnerProductLayer<T>(cuda, log, ipWaParam);

            LayerParameter addParam = new LayerParameter(LayerParameter.LayerType.ELTWISE);
            addParam.name = "add";
            addParam.eltwise_param.operation = EltwiseParameter.EltwiseOp.SUM;

            m_add1 = new EltwiseLayer<T>(cuda, log, addParam);

            LayerParameter tanhParam = new LayerParameter(LayerParameter.LayerType.TANH);
            tanhParam.name = "tanh";
            tanhParam.tanh_param.engine = EngineParameter.Engine.CUDNN;

            m_tanh = new TanhLayer<T>(cuda, log, tanhParam);

            LayerParameter ipVParam = new LayerParameter(LayerParameter.LayerType.INNERPRODUCT);
            ipVParam.name = "ipV";
            ipVParam.inner_product_param.axis = 2;
            ipVParam.inner_product_param.num_output = 1;
            ipVParam.inner_product_param.bias_term = false;

            if (p is LayerParameterEx<T>)
            {
                LayerParameterEx<T> pEx = (LayerParameterEx<T>)p;
                ipVParam = new LayerParameterEx<T>(ipVParam, pEx.SharedBlobs, pEx.SharedLayerBlobs, pEx.SharedLayer);
            }

            m_ipV = new InnerProductLayer<T>(cuda, log, ipVParam);

            m_blobX = new Blob<T>(cuda, log);
            m_blobX.Name = m_param.name + ".x";

            m_blobClip = new Blob<T>(cuda, log);
            m_blobClip.Name = m_param.name + ".clip";

            m_blobX1 = new Blob<T>(cuda, log);
            m_blobX1.Name = m_param.name + ".x1";

            m_blobState = new Blob<T>(cuda, log);
            m_blobState.Name = m_param.name + ".state";

            m_blobUh = new Blob<T>(cuda, log);
            m_blobUh.Name = m_param.name + ".Uh";

            m_blobWc = new Blob<T>(cuda, log);
            m_blobWc.Name = m_param.name + ".Wc";

            m_blobFullWc = new Blob<T>(cuda, log);
            m_blobFullWc.Name = m_param.name + ".Full Wc";

            m_blobAddOutput = new Blob<T>(cuda, log);
            m_blobAddOutput.Name = m_param.name + ".addOut";

            m_blobGG = new Blob<T>(cuda, log);
            m_blobGG.Name = m_param.name + ".gg";

            m_blobAA = new Blob<T>(cuda, log);
            m_blobAA.Name = m_param.name + ".aa";

            m_blobScale = new Blob<T>(cuda, log, false);
            m_blobScale.Name = m_param.name + ".scale";

            m_blobSoftmax = new Blob<T>(cuda, log);
            m_blobSoftmax.Name = m_param.name + ".softmax";

            m_blobFocusedInput = new Blob<T>(cuda, log);
            m_blobFocusedInput.Name = m_param.name + ".softmax_full";

            m_blobContext = new Blob<T>(cuda, log);
            m_blobContext.Name = m_param.name + ".context";

            m_blobWork = new Blob<T>(cuda, log);
            m_blobWork.Name = m_param.name + ".work";
        }

        protected override void dispose()
        {
            dispose(ref m_transposeX);
            dispose(ref m_transposeClip);
            dispose(ref m_ipUa);
            dispose(ref m_ipWa);
            dispose(ref m_tanh);
            dispose(ref m_add1);
            dispose(ref m_ipV);

            dispose(ref m_blobState);
            dispose(ref m_blobX);
            dispose(ref m_blobClip);
            dispose(ref m_blobX1);
            dispose(ref m_blobUh);
            dispose(ref m_blobWc);
            dispose(ref m_blobFullWc);
            dispose(ref m_blobAddOutput);
            dispose(ref m_blobGG);
            dispose(ref m_blobAA);
            dispose(ref m_blobScale);
            dispose(ref m_blobSoftmax);
            dispose(ref m_blobFocusedInput);
            dispose(ref m_blobContext);
            dispose(ref m_blobWork);

            base.dispose();
        }

        protected override void setup_internal_blobs(BlobCollection<T> col)
        {
            if (col.Count > 0)
                return;

            col.Add(m_blobState);
            col.Add(m_blobUh);
            col.Add(m_blobWc);
            col.Add(m_blobFullWc);
            col.Add(m_blobAddOutput);
            col.Add(m_blobGG);
            col.Add(m_blobAA);
            col.Add(m_blobScale);
            col.Add(m_blobFocusedInput);
            col.Add(m_blobContext);
        }

        public override int ExactNumBottomBlobs
        {
            get { return 3; }
        }

        public override int ExactNumTopBlobs
        {
            get { return 1; }
        }

        public override bool ReInitializeParameters(WEIGHT_TARGET target)
        {
            base.ReInitializeParameters(target);

            m_ipUa.ReInitializeParameters(target);
            m_ipWa.ReInitializeParameters(target);

            return true;
        }

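        // addInternal() re-points the reusable internal bottom/top collections at the given
        // blobs; these collections are then handed to the child layers' Setup/Reshape/
        // Forward/Backward calls so each sub-layer sees the intended inputs and outputs.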
        private void addInternal(Blob<T> bottom, Blob<T> top)
        {
            m_colInternalBottom.Clear();
            m_colInternalBottom.Add(bottom);

            m_colInternalTop.Clear();
            m_colInternalTop.Add(top);
        }

        private void addInternal(List<Blob<T>> rgBottom, Blob<T> top)
        {
            m_colInternalBottom.Clear();

            for (int i = 0; i < rgBottom.Count; i++)
            {
                m_colInternalBottom.Add(rgBottom[i]);
            }

            m_colInternalTop.Clear();
            m_colInternalTop.Add(top);
        }

        public override void LayerSetUp(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            Blob<T> blobX = colBottom[0];
            Blob<T> blobCy = colBottom[1];
            Blob<T> blobClip = colBottom[2];

            m_log.CHECK_EQ(blobX.shape(1), 1, "Currently, only batch size = 1 is supported.");

            m_rgbParamPropagateDown = new DictionaryMap<bool>(m_colBlobs.Count, true);

            List<int> rgDimX = new List<int>() { 1, 0 };
            while (rgDimX.Count < colBottom[0].num_axes)
            {
                rgDimX.Add(rgDimX.Count);
            }

            LayerParameter transposeXparam = new LayerParameter(LayerParameter.LayerType.TRANSPOSE);
            transposeXparam.transpose_param.dim = new List<int>(rgDimX);

            m_transposeX = new TransposeLayer<T>(m_cuda, m_log, transposeXparam);

            addInternal(blobX, m_blobX);
            m_transposeX.Setup(m_colInternalBottom, m_colInternalTop);
            m_blobX1.ReshapeLike(m_blobX);

            addInternal(m_blobX, m_blobUh);
            m_ipUa.Setup(m_colInternalBottom, m_colInternalTop);

            addInternal(blobClip, m_blobClip);
            m_transposeClip.Setup(m_colInternalBottom, m_colInternalTop);
            // Make sure the first item is set to 1.
            m_blobClip.SetData(1, 0);

            m_blobState.ReshapeLike(blobCy);

            addInternal(m_blobState, m_blobWc);
            m_ipWa.Setup(m_colInternalBottom, m_colInternalTop);

            m_blobFullWc.ReshapeLike(m_blobUh);

            addInternal(new List<Blob<T>>() { m_blobUh, m_blobFullWc }, m_blobAddOutput);
            m_add1.Setup(m_colInternalBottom, m_colInternalTop);

            addInternal(m_blobAddOutput, m_blobGG);
            m_tanh.Setup(m_colInternalBottom, m_colInternalTop);

            addInternal(m_blobGG, m_blobAA);
            m_ipV.Setup(m_colInternalBottom, m_colInternalTop);

            List<int> rgFocusShape = Utility.Clone<int>(blobX.shape());
            rgFocusShape[0] = blobX.shape(1);
            rgFocusShape[1] = blobX.shape(0);
            m_blobFocusedInput.Reshape(rgFocusShape);

            List<int> rgContextShape = Utility.Clone<int>(blobX.shape());
            rgContextShape[0] = rgContextShape[1];
            rgContextShape[1] = 1;
            m_blobContext.Reshape(rgContextShape);

            List<int> rgTopShape = Utility.Clone<int>(m_blobContext.shape());
            rgTopShape[0] = m_blobContext.shape(1);
            rgTopShape[1] = m_blobContext.shape(0);
            colTop[0].Reshape(rgTopShape);

            blobs.Clear();

            foreach (Blob<T> blob in m_ipUa.blobs)
            {
                blobs.Add(blob);
            }

            foreach (Blob<T> blob in m_ipWa.blobs)
            {
                blobs.Add(blob);
            }

            // V
            blobs.Add(m_ipV.blobs[0]);
        }

        public override void Reshape(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            if (!reshapeNeeded(colBottom, colTop))
                return;

            Blob<T> blobX = colBottom[0];
            Blob<T> blobCy = colBottom[1];
            Blob<T> blobClip = colBottom[2];

            m_log.CHECK_EQ(blobClip.count(), blobX.count(0, 2), "The bottom[2] 'clip' must have shape T,B.");

            addInternal(blobX, m_blobX);
            m_transposeX.Reshape(m_colInternalBottom, m_colInternalTop);
            m_blobX1.ReshapeLike(m_blobX);

            addInternal(m_blobX, m_blobUh);
            m_ipUa.Reshape(m_colInternalBottom, m_colInternalTop);

            addInternal(blobClip, m_blobClip);
            m_transposeClip.Reshape(m_colInternalBottom, m_colInternalTop);
            // Make sure the first item is set to 1.
            m_blobClip.SetData(1, 0);

            m_blobState.ReshapeLike(blobCy);

            addInternal(m_blobState, m_blobWc);
            m_ipWa.Reshape(m_colInternalBottom, m_colInternalTop);

            m_blobFullWc.ReshapeLike(m_blobUh);

            addInternal(new List<Blob<T>>() { m_blobUh, m_blobFullWc }, m_blobAddOutput);
            m_add1.Reshape(m_colInternalBottom, m_colInternalTop);

            addInternal(m_blobAddOutput, m_blobGG);
            m_tanh.Reshape(m_colInternalBottom, m_colInternalTop);

            addInternal(m_blobGG, m_blobAA);
            m_ipV.Reshape(m_colInternalBottom, m_colInternalTop);

            m_blobSoftmax.ReshapeLike(m_blobAA);
            m_blobScale.ReshapeLike(m_blobSoftmax);

            List<int> rgFocusShape = Utility.Clone<int>(blobX.shape());
            rgFocusShape[0] = blobX.shape(1);
            rgFocusShape[1] = blobX.shape(0);
            m_blobFocusedInput.Reshape(rgFocusShape);

            List<int> rgContextShape = Utility.Clone<int>(blobX.shape());
            rgContextShape[0] = rgContextShape[1];
            rgContextShape[1] = 1;
            m_blobContext.Reshape(rgContextShape);

            List<int> rgTopShape = Utility.Clone<int>(m_blobContext.shape());
            rgTopShape[0] = m_blobContext.shape(1);
            rgTopShape[1] = m_blobContext.shape(0);
            colTop[0].Reshape(rgTopShape);
        }

        private void apply_clip(Blob<T> blobInput, Blob<T> blobClip, Blob<T> blobOutput, bool bDiff = false)
        {
            long hSrc = (bDiff) ? blobInput.gpu_diff : blobInput.gpu_data;
            long hClip = blobClip.gpu_data;
            long hDst = (bDiff) ? blobOutput.mutable_gpu_diff : blobOutput.mutable_gpu_data;

            m_cuda.channel_scale(blobInput.count(), blobInput.num, blobInput.channels, blobInput.count(2), hSrc, hClip, hDst);
        }

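        // softmax_fwd() computes a numerically stable softmax over the attention scores along
        // 'nAxis', masked by the clip:  a_i = exp(e_i - max_j e_j) / sum_j exp(e_j - max_j e_j).
        // Positions beyond the valid count given by the clip are first pushed to a large
        // negative value so that exp() drives their weight to ~0, and any NaN produced by an
        // all-masked row (divide by zero) is replaced with 0.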
        private void softmax_fwd(Blob<T> blobBottom, Blob<T> blobClip, Blob<T> blobScale, Blob<T> blobTop, int nAxis)
        {
            int nCount = blobBottom.count();
            int nOuterNum = blobBottom.count(0, nAxis);
            int nInnerNum = blobBottom.count(nAxis + 1);
            int nChannels = blobTop.shape(nAxis);
            long hBottomData = blobBottom.gpu_data;
            long hTopData = blobTop.mutable_gpu_data;
            long hScaleData = blobScale.mutable_gpu_data;

            m_cuda.copy(nCount, hBottomData, hTopData);

            // Mask all values past the valid ones specified in the clip, setting them to -1 billion.
            int nValidValues = (int)convertF(blobClip.asum_data());
            if (nValidValues < nCount)
                blobTop.SetData(-1000000000.0, nValidValues, nCount - nValidValues);

            // We need to subtract the max to avoid numerical issues, compute the exp,
            // and then normalize.
            // Compute the max.
            m_cuda.channel_max(nOuterNum * nInnerNum, nOuterNum, nChannels, nInnerNum, hTopData, hScaleData);

            // Subtract the max.
            m_cuda.channel_sub(nCount, nOuterNum, nChannels, nInnerNum, hScaleData, hTopData);

            // Exponentiate.
            m_cuda.exp(nCount, hTopData, hTopData);

            // Sum after exp.
            m_cuda.channel_sum(nOuterNum * nInnerNum, nOuterNum, nChannels, nInnerNum, hTopData, hScaleData);

            // Divide by the sum.
            m_cuda.channel_div(nCount, nOuterNum, nChannels, nInnerNum, hScaleData, hTopData);

            // Remove any NaN values caused by divide by zero.
            m_cuda.denan(nCount, blobTop.mutable_gpu_data, 0);
        }

        private void softmax_bwd(Blob<T> blobTop, Blob<T> blobClip, Blob<T> blobScale, Blob<T> blobBottom, int nAxis)
        {
            int nOuterNum = blobBottom.count(0, nAxis);
            int nInnerNum = blobBottom.count(nAxis + 1);
            long hTopDiff = blobTop.gpu_diff;
            long hTopData = blobTop.gpu_data;
            long hBottomDiff = blobBottom.mutable_gpu_diff;
            long hScaleData = m_blobScale.mutable_gpu_data;
            int nCount = blobTop.count();
            int nChannels = blobTop.shape(nAxis);

            m_cuda.copy(nCount, hTopDiff, hBottomDiff);

            // Compute inner1d(top_diff, top_data) and subtract them from the bottom diff.
            m_cuda.channel_dot(nOuterNum * nInnerNum, nOuterNum, nChannels, nInnerNum, hTopDiff, hTopData, hScaleData);
            m_cuda.channel_sub(nCount, nOuterNum, nChannels, nInnerNum, hScaleData, hBottomDiff);

            // Apply the clip.
            m_cuda.channel_scale(nCount, nOuterNum, nChannels, nInnerNum, hTopData, blobClip.gpu_data, hTopData);

            // Elementwise multiplication.
            m_cuda.mul(nCount, hBottomDiff, hTopData, hBottomDiff);
        }

        // TODO: Move this CPU-side summation to the GPU.
        private float sum_diff(int nCount, Blob<T> b, int nOffset)
        {
            float[] rg = convertF(b.mutable_cpu_diff);
            float fSum = 0;

            for (int i = 0; i < nCount; i++)
            {
                fSum += rg[nOffset + i];
            }

            return fSum;
        }

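        // fill() broadcasts blob1 across the second axis of blobFull: it multiplies blob1 by
        // blob2 (or by a ones vector when blob2 is null) with a GEMM, then transposes the
        // result back into blobFull's layout with a GEAM.  In the forward pass this duplicates
        // the 1 x dim 'Wc' state projection across all T time steps; in the backward pass it
        // spreads the context diff across the time steps weighted by the softmax.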
        private void fill(Blob<T> blob1, Blob<T> blob2, Blob<T> blobFull, bool bUseDiff = false, int nBlob1AxisOffset = 0)
        {
            int nAxis = 1;
            int nM = blob1.shape(nAxis + nBlob1AxisOffset);
            int nN = blobFull.shape(nAxis);
            int nK = blob1.count(nAxis + nBlob1AxisOffset + 1);

            if (blob2 == null)
            {
                List<int> rgShape = new List<int>();
                rgShape.Add(blob1.count(0, nAxis));
                rgShape.Add(nN);
                m_blobWork.Reshape(rgShape);
                m_blobWork.SetData(1.0);
                blob2 = m_blobWork;
            }

            if (bUseDiff)
                m_cuda.gemm(true, true, nM, nN, nK, 1.0, blob1.gpu_diff, blob2.gpu_data, 0.0, blobFull.mutable_gpu_diff);
            else
                m_cuda.gemm(true, false, nM, nN, nK, 1.0, blob1.gpu_data, blob2.gpu_data, 0.0, blobFull.mutable_gpu_data);

            // Transpose result.
            m_blobWork.ReshapeLike(blobFull);

            if (bUseDiff)
            {
                m_cuda.geam(true, false, nM, nN, 1.0, blobFull.gpu_diff, blobFull.gpu_diff, 0.0, m_blobWork.mutable_gpu_data);
                m_cuda.copy(m_blobWork.count(), m_blobWork.gpu_data, blobFull.mutable_gpu_diff);
            }
            else
            {
                m_cuda.geam(true, false, nM, nN, 1.0, blobFull.gpu_data, blobFull.gpu_data, 0.0, m_blobWork.mutable_gpu_data);
                m_cuda.copy(m_blobWork.count(), m_blobWork.gpu_data, blobFull.mutable_gpu_data);
            }
        }

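        // The forward pass implements additive (Bahdanau-style) attention over the T encoder
        // steps in bottom[0], conditioned on the last state in bottom[1]:
        //    e_t = V * tanh(Ua * x_t + Wa * state)     (unnormalized score per step t)
        //    a_t = softmax(e_t), masked by the clip    (attention weights)
        //    context = sum over t of a_t * x_t         (weighted sum returned as top[0])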
        protected override void forward(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            Blob<T> blobX = colBottom[0];
            Blob<T> blobCy = colBottom[1];
            Blob<T> blobClip = colBottom[2];

            // Force values to 1 or 0.
            m_cuda.sign(blobClip.count(), blobClip.gpu_data, blobClip.mutable_gpu_data);

            addInternal(blobX, m_blobX);
            m_transposeX.Forward(m_colInternalBottom, m_colInternalTop);

            addInternal(blobClip, m_blobClip);
            m_transposeClip.Forward(m_colInternalBottom, m_colInternalTop);
            // Make sure the first item is set to 1.
            m_blobClip.SetData(1, 0);

            // Apply the clip.
            apply_clip(m_blobX, blobClip, m_blobX);

            // No need to transpose for state T = 1.
            m_cuda.copy(blobCy.count(), blobCy.gpu_data, m_blobState.mutable_gpu_data);

            addInternal(m_blobX, m_blobUh);
            m_ipUa.Forward(m_colInternalBottom, m_colInternalTop);

            addInternal(m_blobState, m_blobWc);
            m_ipWa.Forward(m_colInternalBottom, m_colInternalTop);

            // Duplicate Wc across all T.
            fill(m_blobWc, null, m_blobFullWc);

            addInternal(new List<Blob<T>>() { m_blobUh, m_blobFullWc }, m_blobAddOutput);
            m_add1.Forward(m_colInternalBottom, m_colInternalTop);

            addInternal(m_blobAddOutput, m_blobGG);
            m_tanh.Forward(m_colInternalBottom, m_colInternalTop);

            addInternal(m_blobGG, m_blobAA);
            m_ipV.Forward(m_colInternalBottom, m_colInternalTop);

            softmax_fwd(m_blobAA, m_blobClip, m_blobScale, m_blobSoftmax, 1);

            // Apply softmax to each channel.
            m_blobFocusedInput.CopyFrom(m_blobX);
            m_blobContext.SetData(0);
            int nCount = m_blobFocusedInput.count(2);
            int nOuterNum = m_blobFocusedInput.count(0, 2);
            // Scale by softmax and sum.
            m_cuda.gemv(true, nOuterNum, nCount, 1.0, m_blobFocusedInput.gpu_data, m_blobSoftmax.gpu_data, 0.0, m_blobContext.mutable_gpu_data);

            // Reshape not needed for T = 1 in topT and top(0).
            m_cuda.copy(m_blobContext.count(), m_blobContext.gpu_data, colTop[0].mutable_gpu_data);
        }

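        // The backward pass reverses the chain above: the diff on the context output is split
        // into a gradient w.r.t. the inputs (the context diff spread over time by the softmax
        // weights, via fill) and a gradient w.r.t. the attention weights, which is pushed back
        // through the masked softmax, ipV, tanh and the eltwise add, then through ipWa to
        // reach the state (bottom[1]) and through ipUa and the transpose to reach bottom[0].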
        protected override void backward(BlobCollection<T> colTop, List<bool> rgbPropagateDown, BlobCollection<T> colBottom)
        {
            // Gradient with respect to state then data.
            if (rgbPropagateDown[0])
            {
                Blob<T> blobX = colBottom[0];
                Blob<T> blobCy = colBottom[1];
                Blob<T> blobClip = colBottom[2];

                m_blobGG.SetDiff(0);
                m_blobAA.SetDiff(0);
                m_blobAddOutput.SetDiff(0);
                m_blobUh.SetDiff(0);
                m_blobFullWc.SetDiff(0);
                m_blobWc.SetDiff(0);
                m_blobScale.SetDiff(0);

                List<bool> rgbPropagate = new List<bool>() { true, true };

                // Reshape not needed for T = 1 in topT and top(0).
                m_cuda.copy(colTop[0].count(), colTop[0].gpu_data, m_blobContext.mutable_gpu_data);
                m_cuda.copy(colTop[0].count(), colTop[0].gpu_diff, m_blobContext.mutable_gpu_diff);

                // Apply gradient w.r.t input. (x = context x softmax)
                fill(m_blobContext, m_blobSoftmax, m_blobX1, true, 1);

                // Apply gradient w.r.t softmax.
                m_cuda.channel_mulv(m_blobX.count(), m_blobX.num, m_blobX.channels, m_blobX.count(2), m_blobX.gpu_data, m_blobContext.gpu_diff, m_blobFocusedInput.mutable_gpu_diff);
                m_cuda.channel_sum(m_blobX.count(), m_blobX.count(0, 2), m_blobX.shape(2), m_blobX.count(3), m_blobFocusedInput.gpu_diff, m_blobSoftmax.mutable_gpu_diff);

                softmax_bwd(m_blobSoftmax, m_blobClip, m_blobScale, m_blobAA, 1);

                addInternal(m_blobGG, m_blobAA);
                m_ipV.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

                addInternal(m_blobAddOutput, m_blobGG);
                m_tanh.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

                addInternal(new List<Blob<T>>() { m_blobUh, m_blobFullWc }, m_blobAddOutput);
                m_add1.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

                // Sum weights by channel.
                m_cuda.channel_sum(m_blobFullWc.count(), m_blobFullWc.num, m_blobFullWc.channels, m_blobWc.count(), m_blobFullWc.gpu_diff, m_blobWc.mutable_gpu_diff);

                addInternal(m_blobState, m_blobWc);
                m_ipWa.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);

                addInternal(m_blobX, m_blobUh);
                m_ipUa.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
                m_cuda.add(m_blobX.count(), m_blobX1.gpu_diff, m_blobX.gpu_diff, m_blobX.mutable_gpu_diff);

                // No need to transpose for state T = 1.
                m_cuda.copy(blobCy.count(), m_blobState.gpu_diff, blobCy.mutable_gpu_diff);

                addInternal(blobX, m_blobX);
                m_transposeX.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
            }
        }
    }
}
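The sketch below shows one way the layer might be wired up on its own, outside of a Net. It is a minimal sketch, not taken from the MyCaffe samples: the device ID, the attention dim, and the shapes (T = 10 steps, batch size 1, input size 16, state size 32) are hypothetical assumptions, and a CUDA-capable device plus the CudaDnn DLL are assumed to be available.

// Hypothetical usage sketch - shapes, dim and device ID are assumptions for illustration.
CudaDnn<float> cuda = new CudaDnn<float>(0);
Log log = new Log("attention_example");

LayerParameter p = new LayerParameter(LayerParameter.LayerType.ATTENTION);
p.attention_param.dim = 32;                          // should match the LSTM output size.

Blob<float> blobX = new Blob<float>(cuda, log);      // encoder outputs: T x B x I = 10 x 1 x 16.
blobX.Reshape(new List<int>() { 10, 1, 16 });
Blob<float> blobState = new Blob<float>(cuda, log);  // last cell state: 1 x 32.
blobState.Reshape(new List<int>() { 1, 32 });
Blob<float> blobClip = new Blob<float>(cuda, log);   // clip mask: T x B = 10 x 1.
blobClip.Reshape(new List<int>() { 10, 1 });
blobClip.SetData(1.0);

BlobCollection<float> colBottom = new BlobCollection<float>();
colBottom.Add(blobX);
colBottom.Add(blobState);
colBottom.Add(blobClip);

BlobCollection<float> colTop = new BlobCollection<float>();
colTop.Add(new Blob<float>(cuda, log));

Layer<float> attention = new AttentionLayer<float>(cuda, log, p);
attention.Setup(colBottom, colTop);
attention.Forward(colBottom, colTop);                // colTop[0] now holds the 1 x 1 x 16 context.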
The Log class provides general output in text form.
Definition: Log.cs:13
void CHECK_EQ(double df1, double df2, string str)
Test whether one number is equal to another.
Definition: Log.cs:239
The Utility class provides general utility functions.
Definition: Utility.cs:35
The BlobCollection contains a list of Blobs.
void Add(Blob< T > b)
Add a new Blob to the collection.
int Count
Returns the number of items in the collection.
void Clear(bool bDispose=false)
Remove all items from the collection.
void Reshape(int[] rgShape)
Reshapes all blobs in the collection to the given shape.
The Blob is the main holder of data that moves through the Layers of the Net.
Definition: Blob.cs:25
int channels
DEPRECATED; legacy shape accessor channels: use shape(1) instead.
Definition: Blob.cs:800
void SetData(T[] rgData, int nCount=-1, bool bSetCount=true)
Sets a number of items within the Blob's data.
Definition: Blob.cs:1922
long mutable_gpu_diff
Returns the diff GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1555
T[] mutable_cpu_diff
Get diff from the GPU and bring it over to the host, or Set diff from the Host and send it over to th...
Definition: Blob.cs:1511
long mutable_gpu_data
Returns the data GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1487
void Reshape(int nNum, int nChannels, int nHeight, int nWidth, bool? bUseHalfSize=null)
DEPRECATED; use
Definition: Blob.cs:442
void CopyFrom(Blob< T > src, int nSrcOffset, int nDstOffset, int nCount, bool bCopyData, bool bCopyDiff)
Copy from a source Blob.
Definition: Blob.cs:903
List< int > shape()
Returns an array where each element contains the shape of an axis of the Blob.
Definition: Blob.cs:684
T asum_data()
Compute the sum of absolute values (L1 norm) of the data.
Definition: Blob.cs:1706
int count()
Returns the total number of items in the Blob.
Definition: Blob.cs:739
void ReshapeLike(Blob< T > b, bool? bUseHalfSize=null)
Reshape this Blob to have the same shape as another Blob.
Definition: Blob.cs:648
string Name
Get/set the name of the Blob.
Definition: Blob.cs:2184
long gpu_diff
Returns the diff GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1541
void SetDiff(double dfVal, int nIdx=-1)
Either sets all of the diff items in the Blob to a given value, or alternatively only sets a single i...
Definition: Blob.cs:1981
int num
DEPRECATED; legacy shape accessor num: use shape(0) instead.
Definition: Blob.cs:792
long gpu_data
Returns the data GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1479
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
Definition: CudaDnn.cs:969
[DEPRECATED] The AttentionLayer provides focus for LSTM based encoder/decoder models.
AttentionLayer(CudaDnn< T > cuda, Log log, LayerParameter p)
The AttentionLayer constructor.
override void setup_internal_blobs(BlobCollection< T > col)
Derivative layers should add all internal blobs to the 'col' provided.
override void backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Computes the loss error gradient w.r.t the outputs.
override void forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
The forward computation.
override bool ReInitializeParameters(WEIGHT_TARGET target)
Re-initialize the parameters of the layer.
override int ExactNumTopBlobs
Returns the exact number of required top (output) Blobs: ip
override void LayerSetUp(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Setup the layer.
override void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Reshape the bottom (input) and top (output) blobs.
override void dispose()
Releases all GPU and host resources used by the Layer.
override int ExactNumBottomBlobs
Returns the exact number of required bottom (input) Blobs: input, state (last ct),...
The EltwiseLayer computes elementwise operations, such as product and sum, along multiple input blobs...
Definition: EltwiseLayer.cs:23
The InnerProductLayer, also known as a 'fully-connected' layer, computes the inner product with a set ...
An interface for the units of computation which can be composed into a Net.
Definition: Layer.cs:31
Log m_log
Specifies the Log for output.
Definition: Layer.cs:43
LayerParameter m_param
Specifies the LayerParameter describing the Layer.
Definition: Layer.cs:47
void Backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Given the top Blob error gradients, compute the bottom Blob error gradients.
Definition: Layer.cs:815
virtual bool ReInitializeParameters(WEIGHT_TARGET target)
Re-initialize the parameters of the layer.
Definition: Layer.cs:389
double Forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Given the bottom (input) Blobs, this function computes the top (output) Blobs and the loss.
Definition: Layer.cs:728
float convertF(T df)
Converts a generic to a float value.
Definition: Layer.cs:1359
abstract void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Adjust the shapes of top blobs and internal buffers to accommodate the shapes of the bottom blobs.
virtual bool reshapeNeeded(BlobCollection< T > colBottom, BlobCollection< T > colTop, bool bReset=true)
Tests the shapes of both the bottom and top blobs and if they are the same as the previous sizing,...
Definition: Layer.cs:622
CudaDnn< T > m_cuda
Specifies the CudaDnn connection to Cuda.
Definition: Layer.cs:39
void Setup(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Implements common Layer setup functionality.
Definition: Layer.cs:439
LayerParameter.LayerType m_type
Specifies the Layer type.
Definition: Layer.cs:35
BlobCollection< T > blobs
Returns the collection of learnable parameter Blobs for the Layer.
Definition: Layer.cs:875
BlobCollection< T > m_colBlobs
Specifies the learnable parameter Blobs of the Layer.
Definition: Layer.cs:55
DictionaryMap< bool > m_rgbParamPropagateDown
Specifies whether or not to compute the learnable diff of each parameter Blob.
Definition: Layer.cs:63
The LayerParameterEx class is used when sharing another Net to conserve GPU memory and extends the La...
Definition: Layer.cs:1750
BlobCollection< T > SharedBlobs
Returns the shared parameter Blobs.
Definition: Layer.cs:1782
Layer< T > SharedLayer
Returns the layer in the shared Net that matches this one.
Definition: Layer.cs:1774
BlobCollection< T > SharedLayerBlobs
Returns the shared Layer Blobs.
Definition: Layer.cs:1790
The TanhLayer is a neuron layer that calculates the tanh function, popular with auto-encoders....
Definition: TanhLayer.cs:28
The TransposeLayer performs a permute and transpose operation similar to numpy.transpose.
FillerParameter bias_filler
The filler for the bias.
FillerParameter weight_filler
The filler for the weights.
uint dim
Specifies the dim of the attention unit which should match the LSTM output size.
Specifies the parameters for the EltwiseLayer.
EltwiseOp
Defines the operation to perform.
EltwiseOp operation
Specifies the element-wise operation.
Specifies whether to use the NVIDIA cuDnn version or Caffe version of a given forward/backward operat...
Engine engine
Specifies the Engine in use.
Engine
Defines the type of engine to use.
FillerParameter weight_filler
The filler for the weights.
int axis
Specifies the first axis to be lumped into a single inner product computation; all preceding axes are...
FillerParameter bias_filler
The filler for the bias.
uint num_output
The number of outputs for the layer.
bool bias_term
Whether to have bias terms or not.
Specifies the base parameter for all layers.
string name
Specifies the name of this LayerParameter.
EltwiseParameter eltwise_param
Returns the parameter set when initialized with LayerType.ELTWISE
InnerProductParameter inner_product_param
Returns the parameter set when initialized with LayerType.INNERPRODUCT
TransposeParameter transpose_param
Returns the parameter set when initialized with LayerType.TRANSPOSE
AttentionParameter attention_param
Returns the parameter set when initialized with LayerType.ATTENTION
LayerType
Specifies the layer type.
TanhParameter tanh_param
Returns the parameter set when initialized with LayerType.TANH
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
Definition: Annotation.cs:12
The MyCaffe.common namespace contains common MyCaffe classes.
Definition: BatchInput.cs:8
WEIGHT_TARGET
Defines the type of weight to target in re-initializations.
Definition: Interfaces.cs:38
The MyCaffe.fillers namespace contains all fillers including the Filler class.
The MyCaffe.layers.beta namespace contains all beta stage layers.
Definition: LayerFactory.cs:9
The MyCaffe.layers namespace contains all layers that have a solidified code base,...
Definition: LayerFactory.cs:15
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...
Definition: Annotation.cs:12