MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
LSTMAttentionLayer.cs
1using System;
2using System.Collections.Generic;
3using System.Linq;
4using System.Text;
5using MyCaffe.basecode;
6using MyCaffe.common;
7using MyCaffe.param;
8using MyCaffe.fillers;
10
11namespace MyCaffe.layers
12{
// The LSTMAttentionLayer adds attention to the long-short term memory layer and is used
// in encoder/decoder models.
// NOTE(review): this text is a Doxygen source dump - the leading number on each line is
// the original file's line number, and several original lines (doc comments and some
// 'if' guard lines) were dropped by the extraction.
44 public class LSTMAttentionLayer<T> : Layer<T>
45 {
46 int m_nI; // input dimension.
47 int m_nH; // number of hidden units.
48 int m_nT; // length of sequence.
49 int m_nN; // batch size.
50
51 double m_dfClippingThreshold; // threshold for clipped gradient.
52 Blob<T> m_blobBiasMultiplier; // blob of 1's used to broadcast the bias via GEMM.
53
54 Blob<T> m_blobCell; // Memory cell.
55 Blob<T> m_blobPreGate; // gate values before nonlinearity.
56 Blob<T> m_blobGate; // gate values after nonlinearity.
57
58 Blob<T> m_blob_C_0; // previous cell state value.
59 Blob<T> m_blob_H_0; // previous hidden activation value.
60 Blob<T> m_blob_C_T; // next cell state value.
61 Blob<T> m_blob_H_T; // next hidden activation value.
62
63 // Intermediate values.
64 Blob<T> m_blob_H_to_Gate; // hidden-to-gate product for the current step.
65 Blob<T> m_blob_H_to_H; // hidden-to-hidden product (used during backward).
66 Blob<T> m_blob_C_to_Gate = null; // context-to-gate product (attention only).
67 Blob<T> m_blobEOutputWhd = null; // inner-product output (only when num_output_ip > 0).
// Indexes of the learnable weight blobs within m_colBlobs (set during LayerSetUp).
68 int m_nWeightItoHidx; // input-to-hidden weights.
69 int m_nWeightHtoHidx; // hidden-to-hidden weights.
70 int m_nWeightBiasidx; // bias weights.
71 int m_nWeightCtoHidx; // context-to-hidden weights (attention only).
72 int m_nWeightWhdidx; // inner-product weights (num_output_ip only).
73 int m_nWeightWhdbidx; // inner-product bias (num_output_ip only).
74
75 // MaxT
76 Blob<T> m_blobMaxT = null; // per-batch-item clip accumulation used by calculate_maxT.
77 int? m_nMaxT = null; // cached longest clip length found during forward.
78
79 // Attention values
80 Layer<T> m_attention = null; // internal attention sub-layer.
81 Blob<T> m_blobContext = null; // attention context for the current time-step.
82 Blob<T> m_blobPrevCt = null; // previous cell state fed to the attention layer.
83 Blob<T> m_blobContextFull = null; // contexts for all time-steps (one per t).
// Scratch collections used to call the attention sub-layer's Setup/Reshape/Forward/Backward.
84 BlobCollection<T> m_colInternalBottom = new BlobCollection<T>();
85 BlobCollection<T> m_colInternalTop = new BlobCollection<T>();
86
// NOTE(review): the constructor signature (original lines ~104-108) is missing from this
// dump; per the Doxygen index at the bottom of the file it is:
//     LSTMAttentionLayer(CudaDnn<T> cuda, Log log, LayerParameter p)
// The constructor creates every working blob and names each after the layer for debugging.
109 : base(cuda, log, p)
110 {
111 m_type = LayerParameter.LayerType.LSTM_ATTENTION;
112
113 m_blobBiasMultiplier = new Blob<T>(m_cuda, m_log);
114 m_blobBiasMultiplier.Name = m_param.name + " biasmult";
115 m_blobCell = new Blob<T>(m_cuda, m_log);
116 m_blobCell.Name = m_param.name + " cell";
117 m_blobPreGate = new Blob<T>(m_cuda, m_log);
118 m_blobPreGate.Name = m_param.name + " pregate";
119 m_blobGate = new Blob<T>(m_cuda, m_log);
120 m_blobGate.Name = m_param.name + " gate";
121 m_blob_C_0 = new Blob<T>(m_cuda, m_log);
122 m_blob_C_0.Name = m_param.name + " c_0";
123 m_blob_H_0 = new Blob<T>(m_cuda, m_log);
124 m_blob_H_0.Name = m_param.name + " h_0";
125 m_blob_C_T = new Blob<T>(m_cuda, m_log);
126 m_blob_C_T.Name = m_param.name + " c_t";
127 m_blob_H_T = new Blob<T>(m_cuda, m_log);
128 m_blob_H_T.Name = m_param.name + " h_t";
129 m_blob_H_to_Gate = new Blob<T>(m_cuda, m_log);
// NOTE(review): unlike the sibling names, "h_to_gate" lacks the leading space
// (m_param.name + "h_to_gate") - likely a cosmetic slip; confirm before changing.
130 m_blob_H_to_Gate.Name = m_param.name + "h_to_gate";
131 m_blob_H_to_H = new Blob<T>(m_cuda, m_log);
132 m_blob_H_to_H.Name = m_param.name + " h_to_h";
133 m_blobMaxT = new Blob<T>(m_cuda, m_log);
134 m_blobMaxT.Name = m_param.name + " maxT";
135
// NOTE(review): the guard on original line 136 is missing from this dump - presumably
// 'if (p.lstm_attention_param.num_output_ip > 0)', since m_blobEOutputWhd is only used
// by the optional inner-product output path; confirm against the original source.
137 {
138 m_blobEOutputWhd = new Blob<T>(m_cuda, m_log);
139 m_blobEOutputWhd.Name = m_param.name + " ip";
140 }
141 }
142
/// <summary>
/// Releases all GPU and host resources used by the layer: the internal attention
/// sub-layer and its related blobs, then every core LSTM working blob.
/// </summary>
144 protected override void dispose()
145 {
146 base.dispose();
147
// Free the attention sub-layer and the blobs that only exist when attention is enabled
// (dispose(ref ...) is presumably null-safe for the ones that were never created - confirm).
148 dispose(ref m_attention);
149 dispose(ref m_blobContext);
150 dispose(ref m_blobPrevCt);
151 dispose(ref m_blobContextFull);
152
// Free the core LSTM working blobs created by the constructor.
153 dispose(ref m_blobBiasMultiplier);
154 dispose(ref m_blobCell);
155 dispose(ref m_blobPreGate);
156 dispose(ref m_blobGate);
157 dispose(ref m_blob_C_0);
158 dispose(ref m_blob_C_T);
159 dispose(ref m_blob_H_0);
160 dispose(ref m_blob_H_T);
161 dispose(ref m_blob_H_to_Gate);
162 dispose(ref m_blob_H_to_H);
163 dispose(ref m_blob_C_to_Gate);
164
165 dispose(ref m_blobMaxT);
166 dispose(ref m_blobEOutputWhd);
167 }
168
/// <summary>
/// Adds every internal blob used by this layer to the collection provided.
/// Does nothing when the collection has already been filled.
/// </summary>
/// <param name="col">Specifies the collection to fill with the internal blobs.</param>
protected override void setup_internal_blobs(BlobCollection<T> col)
{
    // Only fill the collection once.
    if (col.Count > 0)
        return;

    // Core LSTM working blobs, added in the same order as the original implementation.
    Blob<T>[] rgCoreBlobs = new Blob<T>[]
    {
        m_blobBiasMultiplier,
        m_blobCell,
        m_blobPreGate,
        m_blobGate,
        m_blob_C_0,
        m_blob_H_0,
        m_blob_C_T,
        m_blob_H_T,
        m_blob_H_to_Gate,
        m_blob_H_to_H,
        m_blobMaxT
    };

    foreach (Blob<T> blob in rgCoreBlobs)
    {
        col.Add(blob);
    }

    // Optional inner-product output blob (only created when num_output_ip > 0).
    if (m_blobEOutputWhd != null)
        col.Add(m_blobEOutputWhd);

    // Attention-related blobs, including those internal to the attention sub-layer.
    if (m_attention != null)
    {
        col.Add(m_blob_C_to_Gate);
        col.Add(m_blobPrevCt);

        foreach (Blob<T> blob in m_attention.internal_blobs)
        {
            col.Add(blob);
        }
    }
}
201
/// <summary>
/// Returns the minimum number of required bottom (input) Blobs: x (the input).
/// </summary>
public override int MinBottomBlobs => 1;
209
/// <summary>
/// Returns the maximum number of allowed bottom (input) Blobs:
/// x, xClip, encoding, encodingClip, vocabcount (optional).
/// </summary>
public override int MaxBottomBlobs => 5;
222
/// <summary>
/// Returns the exact number of required top (output) Blobs: the output (ht).
/// </summary>
public override int ExactNumTopBlobs => 1;
230
/// <summary>
/// Loads the internal scratch collections with a single bottom/top pair, used when
/// running the attention sub-layer.
/// </summary>
/// <param name="bottom">Specifies the single bottom blob.</param>
/// <param name="top">Specifies the single top blob.</param>
private void addInternal(Blob<T> bottom, Blob<T> top)
{
    // Reset both scratch collections, then load the pair.
    m_colInternalBottom.Clear();
    m_colInternalTop.Clear();

    m_colInternalBottom.Add(bottom);
    m_colInternalTop.Add(top);
}
239
/// <summary>
/// Loads the internal scratch collections with several bottoms and a single top, used
/// when running the attention sub-layer.
/// </summary>
/// <param name="rgBottom">Specifies the list of bottom blobs.</param>
/// <param name="top">Specifies the single top blob.</param>
private void addInternal(List<Blob<T>> rgBottom, Blob<T> top)
{
    // Reset both scratch collections.
    m_colInternalBottom.Clear();
    m_colInternalTop.Clear();

    // Load the bottoms in order, then the single top.
    foreach (Blob<T> blob in rgBottom)
    {
        m_colInternalBottom.Add(blob);
    }

    m_colInternalTop.Add(top);
}
252
/// <summary>
/// Setup the layer: validate the bottom count, cache the dimensions (T, N, H, I),
/// create/share the learnable weight blobs, and (when enabled) set up the internal
/// attention sub-layer.
/// </summary>
258 public override void LayerSetUp(BlobCollection<T> colBottom, BlobCollection<T> colTop)
259 {
// NOTE(review): original lines 260 and 262 are missing from this dump. They presumably
// declared the local 'p' used throughout (likely p = m_param.lstm_attention_param) and
// the guard 'if (p.enable_attention)' for the block that follows - confirm against the
// original LSTMAttentionLayer.cs.
261
263 {
264 m_log.CHECK_GE(colBottom.Count, 4, "When using attention, four bottoms are required: x, xClip, encoding, encodingClip.");
265 m_log.CHECK_LE(colBottom.Count, 5, "When using attention, four bottoms are required: x, xClip, encoding, encodingClip, vocabcount (optional).");
266
// When a 5th bottom is supplied, it carries the vocabulary count which overrides
// the configured num_output_ip (only when num_output_ip is non-zero).
267 if (colBottom.Count == 5)
268 {
269 if (p.num_output_ip != 0)
270 p.num_output_ip = (uint)convertF(colBottom[4].GetData(0));
271 }
272 }
273 else
274 {
275 m_log.CHECK_GE(colBottom.Count, 1, "When not using attention, at least one bottom is required: x.");
276 m_log.CHECK_LE(colBottom.Count, 2, "When not using attention, no more than two bottoms is required: x, clip.");
277 }
278
// Cache dimensions: N = batch (bottom channels), H = hidden count, I = input size.
279 m_dfClippingThreshold = p.clipping_threshold;
280 m_nN = colBottom[0].channels;
281 m_nH = (int)p.num_output; // number of hidden units.
282 m_nI = colBottom[0].count(2); // input dimension.
283
284 // Check if we need to set up the weights.
285 if (m_colBlobs.Count > 0)
286 {
287 m_log.WriteLine("Skipping parameter initialization.");
288 }
289 else
290 {
// NOTE(review): original lines 291 and 294 are missing from this dump; line 294
// presumably created the 'bias_filler' used below, e.g.
// Filler<T> bias_filler = Filler<T>.Create(m_cuda, m_log, p.bias_filler); - confirm.
292
293 Filler<T> weight_filler = Filler<T>.Create(m_cuda, m_log, p.weight_filler);
295
296 // input-to-hidden weights
297 // Initialize the weight.
298 List<int> rgShape1 = new List<int>() { 4 * m_nH, m_nI };
299 Blob<T> blobWeights_I_H = new Blob<T>(m_cuda, m_log);
300 blobWeights_I_H.Name = m_param.name + " weights I to H";
301 blobWeights_I_H.type = BLOB_TYPE.WEIGHT;
302
// Only reshape/fill when a compatible shared parameter blob is not available.
303 if (!shareParameter(blobWeights_I_H, rgShape1))
304 {
305 blobWeights_I_H.Reshape(rgShape1);
306 weight_filler.Fill(blobWeights_I_H);
307 }
308 m_nWeightItoHidx = m_colBlobs.Count;
309 m_colBlobs.Add(blobWeights_I_H);
310
311 // hidden-to-hidden weights
312 // Initialize the weight.
313 List<int> rgShape2 = new List<int>() { 4 * m_nH, m_nH };
314 Blob<T> blobWeights_H_H = new Blob<T>(m_cuda, m_log);
315 blobWeights_H_H.Name = m_param.name + " weights H to H";
316 blobWeights_H_H.type = BLOB_TYPE.WEIGHT;
317
318 if (!shareParameter(blobWeights_H_H, rgShape2))
319 {
320 blobWeights_H_H.Reshape(rgShape2);
321 weight_filler.Fill(blobWeights_H_H);
322 }
323 m_nWeightHtoHidx = m_colBlobs.Count;
324 m_colBlobs.Add(blobWeights_H_H);
325
326 // If necessary, initialize and fill the bias term.
327 List<int> rgShape3 = new List<int>() { 4 * m_nH };
328 Blob<T> blobBias = new Blob<T>(m_cuda, m_log);
329 blobBias.Name = m_param.name + " bias weights";
330 blobBias.type = BLOB_TYPE.WEIGHT;
331
332 if (!shareParameter(blobBias, rgShape3))
333 {
334 blobBias.Reshape(rgShape3);
335 bias_filler.Fill(blobBias);
336 }
337 m_nWeightBiasidx = m_colBlobs.Count;
338 m_colBlobs.Add(blobBias);
339
340 // Initialize the bias for the forget gate to 5.0 as described in the
341 // Clockwork RNN paper:
342 // [1] Koutnik, J., Greff, K., Gomez, F., Schmidhuber, J., 'A Clockwork RNN', 2014"
// NOTE(review): the guard on original line 343 is missing from this dump - presumably
// 'if (p.enable_clockwork_forgetgate_bias)' (see the LSTMAttentionParameter docs); confirm.
344 {
// The forget-gate slice of the 4*H bias vector is the range [H, 2*H).
345 double[] rgBias = convertD(blobBias.mutable_cpu_data);
346
347 for (int i=m_nH; i<2*m_nH; i++)
348 {
349 rgBias[i] = 5.0;
350 }
351
352 blobBias.mutable_cpu_data = convert(rgBias);
353 }
354
// NOTE(review): the guard on original line 355 is missing - presumably
// 'if (p.num_output_ip > 0)' since the Whd/Whdb blobs belong to the optional
// inner-product output path; confirm.
356 {
357 Blob<T> blobWeightWhd = new Blob<T>(m_cuda, m_log);
358 blobWeightWhd.Name = m_param.name + " weights Whd";
359 blobWeightWhd.type = BLOB_TYPE.WEIGHT;
360
361 List<int> rgShapeWhd = new List<int>() { m_nH, (int)m_param.lstm_attention_param.num_output_ip };
362 if (!shareParameter(blobWeightWhd, rgShapeWhd))
363 {
364 blobWeightWhd.Reshape(rgShapeWhd);
365 weight_filler.Fill(blobWeightWhd);
366 }
367 m_nWeightWhdidx = m_colBlobs.Count;
368 m_colBlobs.Add(blobWeightWhd);
369
370 Blob<T> blobWeightWhdb = new Blob<T>(m_cuda, m_log);
371 blobWeightWhdb.Name = m_param.name + " weights Whdb";
372 blobWeightWhdb.type = BLOB_TYPE.WEIGHT;
373
374 List<int> rgShapeWhdb = new List<int>() { 1, (int)m_param.lstm_attention_param.num_output_ip };
// BUG(review): 'rgShape1' here looks like a copy/paste slip - the share check should
// use 'rgShapeWhdb' (the shape actually applied in the Reshape below). As written, a
// shared blob of the wrong shape could be accepted, or a valid share wrongly rejected.
// Confirm against the original source and fix there.
375 if (!shareParameter(blobWeightWhdb, rgShape1))
376 {
377 blobWeightWhdb.Reshape(rgShapeWhdb);
378 bias_filler.Fill(blobWeightWhdb);
379 }
380 m_nWeightWhdbidx = m_colBlobs.Count;
381 m_colBlobs.Add(blobWeightWhdb);
382 }
383
// NOTE(review): the guard on original line 384 is missing - presumably
// 'if (p.enable_attention)' since the C-to-H weights are attention-only; confirm.
385 {
386 // context-to-hidden weights
387 // Initialize the weight.
388 Blob<T> blobWeights_C_H = new Blob<T>(m_cuda, m_log);
389 blobWeights_C_H.Name = m_param.name + " weights C to H";
390 blobWeights_C_H.type = BLOB_TYPE.WEIGHT;
391
392 if (!shareParameter(blobWeights_C_H, rgShape1))
393 {
394 blobWeights_C_H.Reshape(rgShape1); // same shape as I to H
395 weight_filler.Fill(blobWeights_C_H);
396 }
397 m_nWeightCtoHidx = m_colBlobs.Count;
398 m_colBlobs.Add(blobWeights_C_H);
399 }
400 }
401
// By default, compute gradients for every learnable parameter blob.
402 m_rgbParamPropagateDown = new DictionaryMap<bool>(m_colBlobs.Count, true);
403
404 List<int> rgCellShape = new List<int>() { m_nN, m_nH };
405 m_blob_C_0.Reshape(rgCellShape);
406 m_blob_H_0.Reshape(rgCellShape);
407 m_blob_C_T.Reshape(rgCellShape);
408 m_blob_H_T.Reshape(rgCellShape);
409 m_blob_H_to_H.Reshape(rgCellShape);
410
411 List<int> rgGateShape = new List<int>() { m_nN, 4, m_nH };
412 m_blob_H_to_Gate.Reshape(rgGateShape);
413
414 // Attention settings
// NOTE(review): the guard on original line 415 is missing (presumably
// 'if (p.enable_attention)'), as are lines 432-434 (remaining attentionParam setup)
// and 436/438 (the 'm_param is LayerParameterEx<T>' check producing 'pEx') - confirm.
416 {
417 m_blob_C_to_Gate = new Blob<T>(m_cuda, m_log, false);
418 m_blob_C_to_Gate.Name = m_param.name + "c_to_gate";
419 m_blob_C_to_Gate.Reshape(rgGateShape);
420
421 m_blobContext = new Blob<T>(m_cuda, m_log);
422 m_blobContext.Name = "context_out";
423
424 m_blobContextFull = new Blob<T>(m_cuda, m_log);
425 m_blobContextFull.Name = "context_full";
426
427 m_blobPrevCt = new Blob<T>(m_cuda, m_log);
428 m_blobPrevCt.Name = "prev_ct";
429
430 LayerParameter attentionParam = new LayerParameter(LayerParameter.LayerType.ATTENTION);
431 attentionParam.attention_param.axis = 2;
435
437 {
// When sharing another net's layer, wrap the parameter so the attention layer can
// share the other net's blobs (conserves GPU memory).
439 attentionParam = new LayerParameterEx<T>(attentionParam, pEx.SharedBlobs, pEx.SharedLayerBlobs, pEx.SharedLayer);
440 }
441
442 m_attention = new AttentionLayer<T>(m_cuda, m_log, attentionParam);
443
// The attention sub-layer takes the encoding, the current cell state and the encoding
// clip as bottoms, and produces the context blob as its top.
444 Blob<T> blobEncoding = colBottom[2];
445 Blob<T> blobEncodingClip = colBottom[3];
446 addInternal(new List<Blob<T>>() { blobEncoding, m_blob_C_T, blobEncodingClip }, m_blobContext);
447 m_attention.Setup(m_colInternalBottom, m_colInternalTop);
448
// Expose the attention sub-layer's learnable blobs through this layer.
449 foreach (Blob<T> b in m_attention.blobs)
450 {
451 m_colBlobs.Add(b);
452 }
453 }
454 }
455
/// <summary>
/// Reshape the bottom (input) and top (output) blobs and all internal working blobs
/// to match the current sequence length (T), batch (N), and hidden size (H).
/// </summary>
461 public override void Reshape(BlobCollection<T> colBottom, BlobCollection<T> colTop)
462 {
// NOTE(review): the guard on original line 463 is missing from this dump; since
// m_bNetReshapeRequest is cleared inside, it was presumably 'if (m_bNetReshapeRequest)'
// - confirm against the original source.
464 {
// On a net-initiated reshape, invalidate the cached max clip length and re-read N.
465 m_nMaxT = null;
466 m_nN = colBottom[0].channels;
467 m_bNetReshapeRequest = false;
468 }
469 else
470 {
// Skip the work when neither bottom nor top shapes have changed.
471 if (!reshapeNeeded(colBottom, colTop))
472 return;
473 }
474
475 // Figure out the dimensions.
476 m_nT = colBottom[0].num; // length of sequence.
477 m_log.CHECK_EQ(colBottom[0].count() / m_nT / m_nN, m_nI, "The input size is incompatible with inner product parameters.");
478
479 // Gate initialization.
480 List<int> rgGateShape = new List<int>() { m_nT, m_nN, 4, m_nH };
481 m_blobPreGate.Reshape(rgGateShape);
482 m_blobGate.Reshape(rgGateShape);
483 m_blob_H_to_Gate.Reshape(rgGateShape);
484
485 List<int> rgTopShape = new List<int>() { m_nT, m_nN, m_nH };
486 m_blobCell.Reshape(rgTopShape);
487 colTop[0].Reshape(rgTopShape);
488
489 // Setup the bias multipler.
490 List<int> rgMultiplierShape = new List<int>() { m_nT, m_nN };
491 m_blobBiasMultiplier.Reshape(rgMultiplierShape);
492 m_blobBiasMultiplier.SetData(1.0);
493
494 List<int> rgCellShape = new List<int>() { m_nN, m_nH };
495 m_blob_C_0.Reshape(rgCellShape);
496 m_blob_H_0.Reshape(rgCellShape);
497 m_blob_C_T.Reshape(rgCellShape);
498 m_blob_H_T.Reshape(rgCellShape);
499 m_blob_H_to_H.Reshape(rgCellShape);
500
// The maxT accumulator spans one clip value per batch item.
501 if (colBottom.Count > 1)
502 m_blobMaxT.Reshape(new List<int>() { 1, colBottom[1].channels });
503
// NOTE(review): the guard on original line 504 is missing - presumably
// 'if (m_param.lstm_attention_param.num_output_ip > 0)'; confirm.
505 {
// When the inner-product output is enabled, the top takes its shape instead.
506 List<int> rgIpShape = new List<int>() { m_nT, m_nN, (int)m_param.lstm_attention_param.num_output_ip };
507 m_blobEOutputWhd.Reshape(rgIpShape);
508 colTop[0].Reshape(rgIpShape);
509 }
510
511 // Attention reshape
// NOTE(review): the guard on original line 512 is missing - presumably
// 'if (m_param.lstm_attention_param.enable_attention)'; confirm.
513 {
514 m_blob_C_to_Gate.Reshape(rgGateShape);
515
516 Blob<T> blobEncoding = colBottom[2];
517 Blob<T> blobEncodingClip = colBottom[3];
518 addInternal(new List<Blob<T>>() { blobEncoding, m_blob_C_T, blobEncodingClip }, m_blobContext);
519 m_attention.Reshape(m_colInternalBottom, m_colInternalTop);
520
// The full-context buffer holds one attention context per time-step.
521 List<int> rgShape = Utility.Clone<int>(m_blobContext.shape());
522 rgShape[0] = m_nT;
523 m_blobContextFull.Reshape(rgShape);
524
525 m_blobPrevCt.ReshapeLike(m_blobCell);
526 }
527 }
528
/// <summary>
/// Finds the longest clip length within the clip blob, and also returns the very
/// first clip value of the sequence.
/// </summary>
/// <param name="blob">Specifies the clip blob, shaped (T, N).</param>
/// <param name="nInitialClip">Returns the clip value at the first position.</param>
/// <returns>The longest clip length found (at least 1).</returns>
private int calculate_maxT(Blob<T> blob, out int nInitialClip)
{
    int nLongest = 1;

    if (blob.count() > 1)
    {
        m_blobMaxT.SetData(0);

        // Sum the clip values across all time-steps into the per-batch-item
        // accumulator; each batch item's total is its clip run length.
        for (int t = 0; t < blob.num; t++)
        {
            int nSrcOffset = t * blob.channels;
            m_cuda.add(m_blobMaxT.count(), blob.gpu_data, m_blobMaxT.gpu_data, m_blobMaxT.mutable_gpu_data, 1.0, 1.0, nSrcOffset, 0, 0);
        }

        // The longest clip is the maximum accumulated total.
        long lPos;
        nLongest = (int)m_cuda.max(m_blobMaxT.count(), m_blobMaxT.gpu_data, out lPos);

        // A leading zero clip occupies an extra step that the sum did not count.
        if (convertF(blob.GetData(0)) == 0)
            nLongest++;
    }

    nInitialClip = (int)convertF(blob.GetData(0));

    return nLongest;
}
554
/// <summary>
/// Forward computation: runs the recurrent LSTM over up to nMaxT time-steps (optionally
/// applying attention each step), then preserves the final cell/hidden state for
/// truncated BPTT and optionally applies the inner-product output transform.
/// </summary>
565 protected override void forward(BlobCollection<T> colBottom, BlobCollection<T> colTop)
566 {
567 long hTopData = colTop[0].mutable_gpu_data;
568 long hBottomData = colBottom[0].gpu_data;
569 long hClipData = 0;
570 int nMaxT = m_nT;
571 int nInitialClip = 0;
572 double dfOriginalClip = 0;
573
// When a clip bottom is supplied, only run up to the longest clip length.
574 if (colBottom.Count > 1)
575 {
576 hClipData = colBottom[1].gpu_data;
577 m_log.CHECK_EQ(colBottom[0].count(0, 2), colBottom[1].count(), "The bottom[1].count() should equal the bottom[0].count(0,2).");
578
579 m_nMaxT = calculate_maxT(colBottom[1], out nInitialClip);
580 nMaxT = m_nMaxT.Value;
581 }
582
583 long hWeight_i = m_colBlobs[m_nWeightItoHidx].gpu_data;
584 long hWeight_h = m_colBlobs[m_nWeightHtoHidx].gpu_data;
585 long hBias = m_colBlobs[m_nWeightBiasidx].gpu_data;
586 long hWeight_c = (m_param.lstm_attention_param.enable_attention) ? m_colBlobs[m_nWeightCtoHidx].gpu_data : 0;
587 long hPreGateData = m_blobPreGate.mutable_gpu_data;
588 long hGateData = m_blobGate.mutable_gpu_data;
589 long hCellData = m_blobCell.mutable_gpu_data;
590 long hHtoGateData = m_blob_H_to_Gate.mutable_gpu_data;
591 long hCtoGateData = 0;
592
593 // Initialize previous state.
// Carry state across calls (truncated BPTT) when clipping continues; otherwise zero it.
594 if (hClipData != 0 && nInitialClip != 0)
595 {
596 m_cuda.copy(m_blob_C_0.count(), m_blob_C_T.gpu_data, m_blob_C_0.mutable_gpu_data);
597 m_cuda.copy(m_blob_H_0.count(), m_blob_H_T.gpu_data, m_blob_H_0.mutable_gpu_data);
598 }
599 else
600 {
601 m_blob_C_0.SetData(0.0);
602 m_blob_H_0.SetData(0.0);
603 }
604
// Pre-compute input contributions for all time-steps at once, then add the bias
// (broadcast via the all-ones bias multiplier).
605 m_cuda.gemm(false, true, m_nT * m_nN, 4 * m_nH, m_nI, m_tOne, hBottomData, hWeight_i, m_tZero, hPreGateData);
606 m_cuda.gemm(false, false, m_nT * m_nN, 4 * m_nH, 1, m_tOne, m_blobBiasMultiplier.gpu_data, hBias, m_tOne, hPreGateData);
607
// NOTE(review): the guard on original line 608 is missing from this dump - presumably
// 'if (m_param.lstm_attention_param.enable_attention)'; confirm.
609 {
610 m_blobContextFull.SetData(0);
611 if (nInitialClip == 0)
612 m_blobPrevCt.SetData(0);
613
614 // Reset the clip for we want to use the initial context.
615 dfOriginalClip = convertD(colBottom[1].GetData(0));
616 colBottom[1].SetData(1.0, 0);
617 }
618
619 // Compute recurrent forward propagation
620 for (int t = 0; t < nMaxT; t++)
621 {
622 int nTopOffset = colTop[0].offset(t);
623 int nCellOffset = m_blobCell.offset(t);
624 int nPreGateOffset = m_blobPreGate.offset(t);
625 int nGateOffset = m_blobGate.offset(t);
626 int nClipOffset = (hClipData != 0) ? colBottom[1].offset(t) : 0;
627 int nHT1Offset;
628 long hHT1Data;
629 int nCT1Offset;
630 long hCT1Data;
631 long hContext = 0;
632
// The previous step's state: the initial blobs at t == 0, otherwise the prior slice
// of the running output/cell blobs (negative offset steps back one time-slice).
633 if (t == 0)
634 {
635 hHT1Data = m_blob_H_0.gpu_data;
636 nHT1Offset = 0;
637 hCT1Data = m_blob_C_0.gpu_data;
638 nCT1Offset = 0;
639 }
640 else
641 {
642 hHT1Data = m_blob_H_T.gpu_data;
643 nHT1Offset = -colTop[0].offset(1);
644 hCT1Data = m_blob_C_T.gpu_data;
645 nCT1Offset = -m_blobCell.offset(1);
646 }
647
// NOTE(review): the guard on original line 648 is missing - presumably
// 'if (m_param.lstm_attention_param.enable_attention)'; confirm.
649 {
650 Blob<T> blobEncoding = colBottom[2];
651 Blob<T> blobEncodingClip = colBottom[3];
652
// Run attention over the encoding using the previous cell state, producing the
// context for this step; the result is also saved into the full-context buffer.
653 addInternal(new List<Blob<T>>() { blobEncoding, m_blobPrevCt, blobEncodingClip }, m_blobContext);
654 m_attention.Forward(m_colInternalBottom, m_colInternalTop);
655 hContext = m_blobContext.gpu_data;
656 hCtoGateData = m_blob_C_to_Gate.mutable_gpu_data;
657
658 int nCount = m_blobContext.count();
659 m_cuda.copy(nCount, hContext, m_blobContextFull.mutable_gpu_data, 0, t * nCount);
660 }
661
// Single fused CUDA kernel computing the LSTM cell for this time-step.
662 m_cuda.lstm_fwd(t,
663 m_nN,
664 m_nH,
665 m_nI,
666 hWeight_h,
667 hWeight_i,
668 hClipData,
669 nClipOffset,
670 hTopData, // h_t data
671 nTopOffset, // h_t offset
672 hCellData, // c_t data
673 nCellOffset, // c_t offset
674 hPreGateData,
675 nPreGateOffset,
676 hGateData,
677 nGateOffset,
678 hHT1Data,
679 nHT1Offset,
680 hCT1Data,
681 nCT1Offset,
682 hHtoGateData,
683 hContext,
684 hWeight_c,
685 hCtoGateData);
686
// NOTE(review): original line 687 is missing - presumably the single-line guard
// 'if (m_param.lstm_attention_param.enable_attention)' for the CopyFrom below; confirm.
688 m_blobPrevCt.CopyFrom(m_blobCell);
689 }
690
691 // Preserve cell state and output value for truncated BPTT
692 m_cuda.copy(m_nN * m_nH, hCellData, m_blob_C_T.mutable_gpu_data, m_blobCell.offset(nMaxT - 1));
693 m_cuda.copy(m_nN * m_nH, hTopData, m_blob_H_T.mutable_gpu_data, colTop[0].offset(nMaxT - 1));
694
// NOTE(review): original lines 695 and 698 are missing - presumably the guard
// 'if (m_param.lstm_attention_param.num_output_ip > 0)' and the local
// 'int nN = (int)m_param.lstm_attention_param.num_output_ip;' used by the gemm; confirm.
696 {
697 int nM = m_nT * m_nN;
699 int nK = m_nH;
// Inner-product output: top = h * Whd + Whdb (bias added element-wise; verify the
// broadcast semantics of m_cuda.add since Whdb holds only num_output_ip elements).
700 m_cuda.gemm(false, false, nM, nN, nK, Blob<T>.One, hTopData, m_colBlobs[m_nWeightWhdidx].gpu_data, Blob<T>.Zero, m_blobEOutputWhd.mutable_gpu_data);
701 m_cuda.add(colTop[0].count(), m_blobEOutputWhd.gpu_data, m_colBlobs[m_nWeightWhdbidx].gpu_data, m_blobEOutputWhd.mutable_gpu_data);
702 colTop[0].CopyFrom(m_blobEOutputWhd);
703 }
704
// NOTE(review): the guard on original line 705 is missing - presumably
// 'if (m_param.lstm_attention_param.enable_attention)'; confirm.
706 {
707 // Reset the clip to original value.
708 colBottom[1].SetData(dfOriginalClip, 0);
709 }
710 }
711
/// <summary>
/// Computes the error gradient w.r.t. the inputs: back-propagates through the optional
/// inner-product output, then through each LSTM time-step in reverse (including the
/// attention sub-layer when enabled), and finally accumulates the weight/bias gradients.
/// </summary>
722 protected override void backward(BlobCollection<T> colTop, List<bool> rgbPropagateDown, BlobCollection<T> colBottom)
723 {
724 long hTopData = colTop[0].gpu_data;
725 long hBottomData = colBottom[0].gpu_data;
726 long hClipData = 0;
727 int nMaxT = m_nT;
728
// Always propagate into both internal bottoms of the attention sub-layer.
729 List<bool> rgbPropagate = new List<bool>() { true, true };
730
731 if (colBottom.Count > 1)
732 {
733 hClipData = colBottom[1].gpu_data;
734 m_cuda.sign(colBottom[1].count(), hClipData, hClipData); // Set to 1 or 0.
735 m_log.CHECK_EQ(colBottom[0].count(0, 2), colBottom[1].count(), "The bottom[1].count() should equal the bottom[0].count(0,2).");
736 nMaxT = m_nMaxT.Value;
737 }
738
739 long hWeight_i = m_colBlobs[m_nWeightItoHidx].gpu_data;
740 long hWeight_h = m_colBlobs[m_nWeightHtoHidx].gpu_data;
741 long hGateData = m_blobGate.gpu_data;
742 long hCellData = m_blobCell.gpu_data;
743
744 long hTopDiff = colTop[0].mutable_gpu_diff;
745 long hPreGateDiff = m_blobPreGate.mutable_gpu_diff;
746 long hGateDiff = m_blobGate.mutable_gpu_diff;
747 long hCellDiff = m_blobCell.mutable_gpu_diff;
748 long hHtoHDiff = m_blob_H_to_H.mutable_gpu_diff;
749
// Clear all working diffs before accumulation.
750 m_blobCell.SetDiff(0);
751 m_blobGate.SetDiff(0);
752 m_blobPreGate.SetDiff(0);
753 m_blob_H_to_H.SetDiff(0);
754 m_blob_H_to_Gate.SetDiff(0);
755
756 long hWeight_c = 0;
757 long hContextData = 0;
758 long hContextDiff = 0;
759
// NOTE(review): original lines 760 and 763 are missing from this dump - presumably the
// guard 'if (m_param.lstm_attention_param.num_output_ip > 0)' and the local
// 'int nN = (int)m_param.lstm_attention_param.num_output_ip;' used by the gemms; confirm.
761 {
762 int nM = m_nT * m_nN;
764 int nK = m_nH;
765
// Back-propagate through the inner-product output: accumulate the bias diff, then
// compute the hidden diff (into m_blob_H_T.diff) and the Whd weight diff.
// NOTE(review): the add below uses colTop[0].count() while the Whdb blob holds only
// num_output_ip elements - verify this CUDA add performs the intended accumulation.
766 m_cuda.copy(colTop[0].count(), colTop[0].gpu_diff, m_blobEOutputWhd.mutable_gpu_diff);
767 m_cuda.add(colTop[0].count(), colTop[0].gpu_diff, m_colBlobs[m_nWeightWhdbidx].gpu_diff, m_colBlobs[m_nWeightWhdbidx].mutable_gpu_diff);
768 m_cuda.gemm(false, true, nM, nK, nN, Blob<T>.One, m_blobEOutputWhd.gpu_diff, m_colBlobs[m_nWeightWhdidx].gpu_data, Blob<T>.Zero, m_blob_H_T.mutable_gpu_diff);
769 m_cuda.gemm(true, false, nK, nN, nM, Blob<T>.One, m_blob_H_T.gpu_data, m_blobEOutputWhd.gpu_diff, Blob<T>.One, m_colBlobs[m_nWeightWhdidx].mutable_gpu_diff);
770 hTopDiff = m_blob_H_T.gpu_diff;
771 hTopData = m_blob_H_T.gpu_data;
772 }
773
// NOTE(review): the guard on original line 774 is missing - presumably
// 'if (m_param.lstm_attention_param.enable_attention)'; confirm.
775 {
776 m_blobContext.SetDiff(0);
777 m_blob_C_to_Gate.SetDiff(0);
778 hWeight_c = m_colBlobs[m_nWeightCtoHidx].gpu_data;
779 hContextData = m_blobContext.gpu_data;
780 hContextDiff = m_blobContext.mutable_gpu_diff;
781 m_cuda.sign(colBottom[3].count(), colBottom[3].gpu_data, colBottom[3].mutable_gpu_data); // Set to 1 or 0.
782 }
783
// Seed the last time-step's cell diff from the preserved C_T diff (zeroed here).
784 m_blob_C_T.SetDiff(0);
785 m_cuda.copy(m_nN * m_nH, m_blob_C_T.gpu_diff, hCellDiff, 0, m_blobCell.offset(nMaxT - 1));
786
// Walk the time-steps in reverse (BPTT).
787 for (int t = nMaxT - 1; t >= 0; t--)
788 {
789 int nTopOffset = colTop[0].offset(t);
790 int nCellOffset = m_blobCell.offset(t);
791 int nGateOffset = m_blobGate.offset(t);
792 int nPreGateOffset = m_blobPreGate.offset(t);
793 int nClipOffset = (hClipData == 0) ? 0 : colBottom[1].offset(t);
794 int nCT1Offset;
795 long hCT1Data;
796 int nDHT1Offset;
797 long hDHT1Diff;
798 int nDCT1Offset;
799 long hDCT1Diff;
800
// At t == 0 the "previous" state/diffs are the initial-state blobs; otherwise they
// are the prior time-slice of the running cell/output blobs.
801 if (t == 0)
802 {
803 nCT1Offset = 0;
804 hCT1Data = m_blob_C_0.gpu_data;
805 nDHT1Offset = 0;
806 hDHT1Diff = m_blob_H_0.mutable_gpu_diff;
807 nDCT1Offset = 0;
808 hDCT1Diff = m_blob_C_0.mutable_gpu_diff;
809 }
810 else
811 {
812 nCT1Offset = m_blobCell.offset(t - 1);
813 hCT1Data = hCellData;
814 nDHT1Offset = colTop[0].offset(t - 1);
815 hDHT1Diff = hTopDiff;
816 nDCT1Offset = m_blobCell.offset(t - 1);
817 hDCT1Diff = hCellDiff;
818 }
819
// Single fused CUDA kernel computing the LSTM backward pass for this time-step.
820 m_cuda.lstm_bwd(t,
821 m_nN,
822 m_nH,
823 m_nI,
824 m_dfClippingThreshold,
825 hWeight_h,
826 hClipData,
827 nClipOffset,
828 hTopDiff,
829 nTopOffset,
830 hCellData,
831 hCellDiff,
832 nCellOffset,
833 hPreGateDiff,
834 nPreGateOffset,
835 hGateData,
836 hGateDiff,
837 nGateOffset,
838 hCT1Data,
839 nCT1Offset,
840 hDHT1Diff,
841 nDHT1Offset,
842 hDCT1Diff,
843 nDCT1Offset,
844 hHtoHDiff,
845 hContextDiff,
846 hWeight_c);
847
// NOTE(review): the guard on original line 848 is missing - presumably
// 'if (m_param.lstm_attention_param.enable_attention)'; confirm.
849 {
// Back-propagate through the attention sub-layer, saving the context diff for this
// time-step into the full-context diff buffer.
850 Blob<T> blobEncoding = colBottom[2];
851 Blob<T> blobEncodingClip = colBottom[3];
852 addInternal(new List<Blob<T>>() { blobEncoding, m_blob_C_T, blobEncodingClip }, m_blobContext);
853 m_attention.Backward(m_colInternalTop, rgbPropagate, m_colInternalBottom);
854
855 int nCount = m_blobContext.count();
856 m_cuda.copy(nCount, hContextDiff, m_blobContextFull.mutable_gpu_diff, 0, t * nCount);
857 }
858 }
859
// NOTE(review): the guards on original lines 860, 866, 875 and 881 are missing - given
// the comments inside each block, they were presumably the per-parameter checks
// 'if (m_rgbParamPropagateDown[m_nWeightItoHidx])' (and the H-to-H, bias and C-to-H
// equivalents); confirm against the original source.
861 {
862 // Gradient w.r.t input-to-hidden weight
863 m_cuda.gemm(true, false, 4 * m_nH, m_nI, m_nT * m_nN, m_tOne, hPreGateDiff, hBottomData, m_tOne, m_colBlobs[m_nWeightItoHidx].mutable_gpu_diff);
864 }
865
867 {
868 // Gradient w.r.t. hidden-to-hidden weight
869 m_cuda.gemm(true, false, 4 * m_nH, m_nH, (m_nT - 1) * m_nN, m_tOne, hPreGateDiff, hTopData, m_tOne, m_colBlobs[m_nWeightHtoHidx].mutable_gpu_diff, m_blobPreGate.offset(1));
870
871 // Add gradient from previous time-step.
872 m_cuda.gemm(true, false, 4 * m_nH, m_nH, 1, m_tOne, hPreGateDiff, m_blob_H_0.gpu_data, m_tOne, m_colBlobs[m_nWeightHtoHidx].mutable_gpu_diff);
873 }
874
876 {
877 // Gradient w.r.t. bias.
878 m_cuda.gemv(true, m_nT * m_nN, 4 * m_nH, m_tOne, hPreGateDiff, m_blobBiasMultiplier.gpu_data, m_tOne, m_colBlobs[m_nWeightBiasidx].mutable_gpu_diff);
879 }
880
882 {
883 // Gradient w.r.t. context data.
884 m_cuda.gemm(true, false, 4 * m_nH, m_nI, m_nT * m_nN, m_tOne, hPreGateDiff, m_blobContextFull.gpu_data, m_tOne, m_colBlobs[m_nWeightCtoHidx].mutable_gpu_diff);
885 }
886
887 if (rgbPropagateDown[0])
888 {
889 // Gradient w.r.t. bottom data.
890 m_cuda.gemm(false, false, m_nT * m_nN, m_nI, 4 * m_nH, m_tOne, hPreGateDiff, hWeight_i, m_tZero, colBottom[0].mutable_gpu_diff);
891 }
892 }
893 }
894}
The Log class provides general output in text form.
Definition: Log.cs:13
void WriteLine(string str, bool bOverrideEnabled=false, bool bHeader=false, bool bError=false, bool bDisable=false)
Write a line of output.
Definition: Log.cs:80
void CHECK_EQ(double df1, double df2, string str)
Test whether one number is equal to another.
Definition: Log.cs:239
void CHECK_LE(double df1, double df2, string str)
Test whether one number is less than or equal to another.
Definition: Log.cs:263
void CHECK_GE(double df1, double df2, string str)
Test whether one number is greater than or equal to another.
Definition: Log.cs:287
The Utility class provides general utility functions.
Definition: Utility.cs:35
The BlobCollection contains a list of Blobs.
void Add(Blob< T > b)
Add a new Blob to the collection.
void SetData(double df)
Set all blob data to the value specified.
int Count
Returns the number of items in the collection.
void Clear(bool bDispose=false)
Remove all items from the collection.
void Reshape(int[] rgShape)
Reshapes all blobs in the collection to the given shape.
void CopyFrom(BlobCollection< T > bSrc, bool bCopyDiff=false)
Copy the data or diff from another BlobCollection into this one.
The Blob is the main holder of data that moves through the Layers of the Net.
Definition: Blob.cs:25
int channels
DEPRECIATED; legacy shape accessor channels: use shape(1) instead.
Definition: Blob.cs:800
void SetData(T[] rgData, int nCount=-1, bool bSetCount=true)
Sets a number of items within the Blob's data.
Definition: Blob.cs:1922
long mutable_gpu_diff
Returns the diff GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1555
long mutable_gpu_data
Returns the data GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1487
T[] mutable_cpu_data
Get data from the GPU and bring it over to the host, or Set data from the Host and send it over to th...
Definition: Blob.cs:1461
static T One
Returns One (1) in type T.
Definition: Blob.cs:268
void Reshape(int nNum, int nChannels, int nHeight, int nWidth, bool? bUseHalfSize=null)
DEPRECIATED; use
Definition: Blob.cs:442
BLOB_TYPE type
Returns the BLOB_TYPE of the Blob.
Definition: Blob.cs:2761
void CopyFrom(Blob< T > src, int nSrcOffset, int nDstOffset, int nCount, bool bCopyData, bool bCopyDiff)
Copy from a source Blob.
Definition: Blob.cs:903
List< int > shape()
Returns an array where each element contains the shape of an axis of the Blob.
Definition: Blob.cs:684
static T Zero
Returns Zero (0) in type T.
Definition: Blob.cs:260
T GetData(int nIdx)
Returns the data at a given flat index within the Blob.
Definition: Blob.cs:1893
int count()
Returns the total number of items in the Blob.
Definition: Blob.cs:739
void ReshapeLike(Blob< T > b, bool? bUseHalfSize=null)
Reshape this Blob to have the same shape as another Blob.
Definition: Blob.cs:648
string Name
Get/set the name of the Blob.
Definition: Blob.cs:2184
int offset(int n, int c=0, int h=0, int w=0)
Returns the flat offset given the number, channel, height and width.
Definition: Blob.cs:850
long gpu_diff
Returns the diff GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1541
void SetDiff(double dfVal, int nIdx=-1)
Either sets all of the diff items in the Blob to a given value, or alternatively only sets a single i...
Definition: Blob.cs:1981
int num
DEPRECIATED; legacy shape accessor num: use shape(0) instead.
Definition: Blob.cs:792
long gpu_data
Returns the data GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1479
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
Definition: CudaDnn.cs:969
Abstract Filler class used to fill blobs with values.
Definition: Filler.cs:19
void Fill(Blob< T > b)
Fill the blob with values based on the actual filler used.
Definition: Filler.cs:50
static Filler< T > Create(CudaDnn< T > cuda, Log log, FillerParameter p)
Create a new Filler instance.
Definition: Filler.cs:79
[DEPRECIATED] The AttentionLayer provides focus for LSTM based encoder/decoder models.
The LSTMAttentionLayer adds attention to the long-short term memory layer and is used in encoder/deco...
override void dispose()
Releases all GPU and host resources used by the Layer.
override void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Reshape the bottom (input) and top (output) blobs.
override int ExactNumTopBlobs
Returns the exact number of required top (output) Blobs: output (ht).
override int MinBottomBlobs
Returns the minimum number of required bottom (input) Blobs: input
override int MaxBottomBlobs
Returns the maximum number of required bottom (input) Blobs: input, inputClip, encoding,...
LSTMAttentionLayer(CudaDnn< T > cuda, Log log, LayerParameter p)
The AttentionDecodeLayer constructor.
override void LayerSetUp(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Setup the layer.
override void backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Computes the error gradient w.r.t. the inputs.
override void forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Forward computation.
override void setup_internal_blobs(BlobCollection< T > col)
Derivative layers should add all internal blobs to the 'col' provided.
An interface for the units of computation which can be composed into a Net.
Definition: Layer.cs:31
Log m_log
Specifies the Log for output.
Definition: Layer.cs:43
LayerParameter m_param
Specifies the LayerParameter describing the Layer.
Definition: Layer.cs:47
void convert(BlobCollection< T > col)
Convert a collection of blobs from / to half size.
Definition: Layer.cs:535
T m_tZero
Specifies a generic type equal to 0.0.
Definition: Layer.cs:76
void Backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Given the top Blob error gradients, compute the bottom Blob error gradients.
Definition: Layer.cs:815
T m_tOne
Specifies a generic type equal to 1.0.
Definition: Layer.cs:72
bool shareParameter(Blob< T > b, List< int > rgMinShape, bool bAllowEndsWithComparison=false)
Attempts to share a parameter Blob if another parameter Blob with the same name and acceptable size i...
Definition: Layer.cs:1152
double Forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Given the bottom (input) Blobs, this function computes the top (output) Blobs and the loss.
Definition: Layer.cs:728
float convertF(T df)
Converts a generic to a float value.
Definition: Layer.cs:1359
abstract void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Adjust the shapes of top blobs and internal buffers to accommodate the shapes of the bottom blobs.
double convertD(T df)
Converts a generic to a double value.
Definition: Layer.cs:1349
virtual bool reshapeNeeded(BlobCollection< T > colBottom, BlobCollection< T > colTop, bool bReset=true)
Tests the shapes of both the bottom and top blobs and if they are the same as the previous sizing,...
Definition: Layer.cs:622
CudaDnn< T > m_cuda
Specifies the CudaDnn connection to Cuda.
Definition: Layer.cs:39
void Setup(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Implements common Layer setup functionality.
Definition: Layer.cs:439
LayerParameter.LayerType m_type
Specifies the Layer type.
Definition: Layer.cs:35
BlobCollection< T > blobs
Returns the collection of learnable parameter Blobs for the Layer.
Definition: Layer.cs:875
BlobCollection< T > internal_blobs
Returns the collection of internal Blobs used by the Layer.
Definition: Layer.cs:883
BlobCollection< T > m_colBlobs
Specifies the learnable parameter Blobs of the Layer.
Definition: Layer.cs:55
DictionaryMap< bool > m_rgbParamPropagateDown
Specifies whether or not to compute the learnable diff of each parameter Blob.
Definition: Layer.cs:63
bool m_bNetReshapeRequest
Specifies whether the reshape is requested from a Net.Reshape call or not.
Definition: Layer.cs:104
The LayerParameterEx class is used when sharing another Net to conserve GPU memory and extends the La...
Definition: Layer.cs:1750
BlobCollection< T > SharedBlobs
Returns the shared parameter Blobs.
Definition: Layer.cs:1782
Layer< T > SharedLayer
Returns the layer in the shared Net that matches this one.
Definition: Layer.cs:1774
BlobCollection< T > SharedLayerBlobs
Returns the shared Layer Blobs.
Definition: Layer.cs:1790
int axis
The axis along which to perform the softmax – may be negative to index from the end (e....
FillerParameter bias_filler
The filler for the bias.
FillerParameter weight_filler
The filler for the weights.
uint dim
Specifies the dim of the attention unit which should match the LSTM output size.
Specifies the parameters for the LSTMAttentionLayer that provides an attention based LSTM layer used ...
double clipping_threshold
Specifies the gradient clipping threshold, default = 0.0 (i.e. no clipping).
bool enable_clockwork_forgetgate_bias
When enabled, the forget gate bias is set to 5.0.
uint num_output_ip
Specifies the number of IP outputs for the layer. Note, when 0, no inner product is performed.
FillerParameter bias_filler
Specifies the filler parameters for the bias filler.
FillerParameter weight_filler
Specifies the filler parameters for the weight filler.
uint num_output
Specifies the number of outputs for the layer.
bool enable_attention
(default=false) When enabled, attention is applied to the input state on each cycle through the LSTM....
Specifies the base parameter for all layers.
string name
Specifies the name of this LayerParameter.
AttentionParameter attention_param
Returns the parameter set when initialized with LayerType.ATTENTION
LayerType
Specifies the layer type.
LSTMAttentionParameter lstm_attention_param
Returns the parameter set when initialized with LayerType.LSTM_ATTENTION
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
Definition: Annotation.cs:12
The MyCaffe.common namespace contains common MyCaffe classes.
Definition: BatchInput.cs:8
BLOB_TYPE
Defines the tpe of data held by a given Blob.
Definition: Interfaces.cs:62
The MyCaffe.fillers namespace contains all fillers including the Filler class.
The MyCaffe.layers.beta namespace contains all beta stage layers.
Definition: LayerFactory.cs:9
The MyCaffe.layers namespace contains all layers that have a solidified code base,...
Definition: LayerFactory.cs:15
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...
Definition: Annotation.cs:12