MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
LayerNormLayer.cs
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using MyCaffe.basecode;
using MyCaffe.common;
using MyCaffe.param;

namespace MyCaffe.layers.gpt
{
    /// <summary>
    /// The LayerNormalizationLayer performs layer normalization similar to the PyTorch LayerNorm layer.
    /// </summary>
    /// <typeparam name="T">Specifies the base type, float or double.</typeparam>
    public class LayerNormLayer<T> : Layer<T>
    {
        Blob<T> m_blobWork;
        Blob<T> m_blobMu;
        Blob<T> m_blobXmu;
        Blob<T> m_blobXmuSq;
        Blob<T> m_blobVar;
        Blob<T> m_blobStdev;
        Blob<T> m_blobStdevFull;
        long m_hLayerNorm = 0;
        int m_nCount = 0;
        int m_nOuterNum = 0;
        int m_nChannels = 0;
        int m_nInnerNum = 0;
        List<int> m_rgShape = new List<int>(4);

        /// <summary>
        /// The LayerNormalizationLayer constructor.
        /// </summary>
        /// <param name="cuda">Specifies the CudaDnn connection to Cuda.</param>
        /// <param name="log">Specifies the Log for output.</param>
        /// <param name="p">Specifies the LayerParameter describing the Layer.</param>
        public LayerNormLayer(CudaDnn<T> cuda, Log log, LayerParameter p)
            : base(cuda, log, p)
        {
            m_type = LayerParameter.LayerType.LAYERNORM;

            m_blobWork = new Blob<T>(cuda, log);
            m_blobWork.Name = m_param.name + " work";
            m_blobMu = new Blob<T>(cuda, log);
            m_blobMu.Name = m_param.name + " mu";
            m_blobXmu = new Blob<T>(cuda, log);
            m_blobXmu.Name = m_param.name + " xmu";
            m_blobXmuSq = new Blob<T>(cuda, log);
            m_blobXmuSq.Name = m_param.name + " xmu_sq";
            m_blobVar = new Blob<T>(cuda, log);
            m_blobVar.Name = m_param.name + " var";
            m_blobStdev = new Blob<T>(cuda, log);
            m_blobStdev.Name = m_param.name + " stdev";
            m_blobStdevFull = new Blob<T>(cuda, log);
            m_blobStdevFull.Name = m_param.name + " stdev_full";

            setup_internal_blobs(m_colInternalBlobs);
        }

        /// <summary>
        /// Releases all GPU and host resources used by the Layer.
        /// </summary>
        protected override void dispose()
        {
            dispose(ref m_blobWork);
            dispose(ref m_blobMu);
            dispose(ref m_blobXmu);
            dispose(ref m_blobXmuSq);
            dispose(ref m_blobVar);
            dispose(ref m_blobStdev);
            dispose(ref m_blobStdevFull);

            if (m_hLayerNorm != 0)
            {
                m_cuda.FreeLayerNorm(m_hLayerNorm);
                m_hLayerNorm = 0;
            }

            base.dispose();
        }

        /// <summary>
        /// Derivative layers should add all internal blobs to the 'col' provided.
        /// </summary>
        /// <param name="col">Specifies the collection to which the internal blobs are added.</param>
        protected override void setup_internal_blobs(BlobCollection<T> col)
        {
            if (col.Count > 0)
                return;

            col.Add(m_blobWork);
            col.Add(m_blobMu);
            col.Add(m_blobXmu);
            col.Add(m_blobXmuSq);
            col.Add(m_blobVar);
            col.Add(m_blobStdev);
            col.Add(m_blobStdevFull);
        }

        /// <summary>
        /// Returns the exact number of required bottom (input) Blobs: data
        /// </summary>
        public override int ExactNumBottomBlobs
        {
            get { return 1; }
        }

        /// <summary>
        /// Returns the exact number of required top (output) Blobs: norm
        /// </summary>
        public override int ExactNumTopBlobs
        {
            get { return 1; }
        }

        /// <summary>
        /// Setup the layer.
        /// </summary>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs.</param>
        public override void LayerSetUp(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            if (m_param.layer_norm_param.enable_passthrough)
                m_log.WriteLine("WARNING: LayerNormLayer '" + m_param.name + "' is using passthrough mode which is only used when debugging.");
        }

        /// <summary>
        /// Reshape the bottom (input) and top (output) blobs.
        /// </summary>
        /// <param name="colBottom">Specifies the collection of bottom (input) Blobs.</param>
        /// <param name="colTop">Specifies the collection of top (output) Blobs.</param>
        public override void Reshape(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            // Normalization statistics are computed over the inner dimension: for a 2D input
            // (N, D) the inner dimension is D; for higher-rank inputs it is count(2), i.e.
            // all axes after the channel axis.
            int nAxes = colBottom[0].num_axes;
            int nCount = colBottom[0].count();
            int nOuterNum = colBottom[0].num;
            int nChannels = (nAxes == 2) ? 1 : colBottom[0].channels;
            int nInnerNum = (nAxes == 2) ? colBottom[0].channels : colBottom[0].count(2);

            // Compare against the cached sizes *before* updating them so that a shape change
            // actually triggers recreation of the CUDA handle.
            bool bShapeChanged = (nCount != m_nCount || nOuterNum != m_nOuterNum || nChannels != m_nChannels || nInnerNum != m_nInnerNum);
            m_nCount = nCount;
            m_nOuterNum = nOuterNum;
            m_nChannels = nChannels;
            m_nInnerNum = nInnerNum;

            if (m_param.layer_norm_param.enable_cuda_impl)
            {
                if (m_hLayerNorm == 0 || bShapeChanged)
                {
                    if (m_hLayerNorm != 0)
                        m_cuda.FreeLayerNorm(m_hLayerNorm);

                    int nGpuID = m_cuda.GetDeviceID();
                    m_hLayerNorm = m_cuda.CreateLayerNorm(nGpuID, m_nCount, m_nOuterNum, m_nChannels, m_nInnerNum, (float)m_param.layer_norm_param.epsilon);
                    if (m_hLayerNorm == 0)
                        m_log.FAIL("Failed to create CUDA version LayerNorm!");
                }
            }
            else
            {
                shareLayerBlob(m_blobWork, colBottom[0].shape());
                m_blobWork.ReshapeLike(colBottom[0]);
                shareLayerBlob(m_blobMu, colBottom[0].shape());
                m_blobMu.ReshapeLike(colBottom[0]);
                shareLayerBlob(m_blobXmu, colBottom[0].shape());
                m_blobXmu.ReshapeLike(colBottom[0]);
                shareLayerBlob(m_blobXmuSq, colBottom[0].shape());
                m_blobXmuSq.ReshapeLike(colBottom[0]);
                shareLayerBlob(m_blobVar, colBottom[0].shape());
                m_blobVar.ReshapeLike(colBottom[0]);
                shareLayerBlob(m_blobStdev, colBottom[0].shape());
                m_blobStdev.ReshapeLike(colBottom[0]);
                shareLayerBlob(m_blobStdevFull, colBottom[0].shape());
                m_blobStdevFull.ReshapeLike(colBottom[0]);

                // The statistic blobs hold one value per (outer, channel) group.
                m_rgShape.Clear();
                m_rgShape.Add(m_nOuterNum);
                m_rgShape.Add(m_nChannels);
                if (nAxes > 2)
                    m_rgShape.Add(1);
                m_blobMu.Reshape(m_rgShape);
                m_blobVar.Reshape(m_rgShape);
                m_blobStdev.Reshape(m_rgShape);
            }

            colTop[0].ReshapeLike(colBottom[0]);
        }

        /// <summary>
        /// Computes the forward calculation.
        /// </summary>
        /// <param name="colBottom">bottom input Blob vector (length 1) containing the data to normalize.</param>
        /// <param name="colTop">top output Blob vector (length 1) containing the normalized data.</param>
        protected override void forward(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            if (m_param.layer_norm_param.enable_passthrough)
            {
                colTop[0].CopyFrom(colBottom[0]);
                return;
            }

            if (m_param.layer_norm_param.enable_cuda_impl)
                m_cuda.LayerNormForward(m_hLayerNorm, colBottom[0].gpu_data, colTop[0].mutable_gpu_data);
            else
                forward_local(colBottom, colTop);
        }

        private void forward_local(BlobCollection<T> colBottom, BlobCollection<T> colTop)
        {
            //-----------------------------------
            // Calculate the mean across the last dim.
            // mean = x.mean(dim=-1, keepdim=True)
            // --step1--
            m_cuda.channel_mean(m_nCount, m_nOuterNum, m_nChannels, m_nInnerNum, colBottom[0].gpu_data, m_blobMu.mutable_gpu_data);
            m_blobMu.Reshape(m_blobMu.num, m_blobMu.channels, 1, 1);

            //-----------------------------------
            // var = ((x - mean) ** 2).mean(dim=-1, keepdim=True)
            // Copy each mean value per channel across all items in the channel (e.g. 1 -> channel items)
            m_cuda.channel_fillfrom(m_nCount, m_nOuterNum, m_nChannels, m_nInnerNum, m_blobMu.gpu_data, m_blobXmu.mutable_gpu_data, DIR.FWD);

            // --step2--
            // Subtract the mean from the input.
            // xmu = x - mean
            m_cuda.sub(m_nCount, colBottom[0].gpu_data, m_blobXmu.gpu_data, m_blobXmu.mutable_gpu_data);

            // --step3--
            // Square the values.
            // xmusq = xmu ** 2
            m_cuda.powx(m_nCount, m_blobXmu.gpu_data, 2.0, m_blobXmuSq.mutable_gpu_data);

            // --step4--
            // Calculate the mean across the last dim.
            // var = xmusq.mean(dim=-1, keepdim=True)
            // var shape = (n, c, 1)
            m_cuda.channel_mean(m_nCount, m_nOuterNum, m_nChannels, m_nInnerNum, m_blobXmuSq.gpu_data, m_blobVar.mutable_gpu_data);
            m_blobVar.Reshape(m_blobVar.num, m_blobVar.channels, 1, 1);

            //-----------------------------------
            // Calculate the stdev across the last dim.
            // std = (var + self.epsilon).sqrt()
            // stdev shape: (n, c, 1)
            // --step5--
            m_blobStdev.Reshape(m_blobStdev.num, m_blobStdev.channels, 1, 1);
            m_cuda.add_scalar(m_nOuterNum * m_nChannels, m_param.layer_norm_param.epsilon, m_blobVar.mutable_gpu_data);
            m_cuda.sqrt(m_nOuterNum * m_nChannels, m_blobVar.gpu_data, m_blobStdev.mutable_gpu_data);

            //-----------------------------------
            // Normalize the input by centering and dividing by the stdev.
            // y = (x - mean) / std
            // Copy each stdev value per channel across all items in the channel (e.g. 1 -> channel items)
            // --step6, step7--
            m_cuda.channel_fillfrom(m_nCount, m_nOuterNum, m_nChannels, m_nInnerNum, m_blobStdev.gpu_data, m_blobStdevFull.mutable_gpu_data, DIR.FWD);
            m_cuda.div(m_nCount, m_blobXmu.gpu_data, m_blobStdevFull.gpu_data, colTop[0].mutable_gpu_data);
        }

        /// <summary>
        /// Computes the error gradient w.r.t. the inputs.
        /// </summary>
        /// <param name="colTop">top output Blob vector (length 1) containing the gradient to back-propagate.</param>
        /// <param name="rgbPropagateDown">Specifies whether or not to propagate the gradient down to the bottom.</param>
        /// <param name="colBottom">bottom input Blob vector (length 1) that receives the computed gradient.</param>
        protected override void backward(BlobCollection<T> colTop, List<bool> rgbPropagateDown, BlobCollection<T> colBottom)
        {
            if (rgbPropagateDown[0])
            {
                if (m_param.layer_norm_param.enable_passthrough)
                {
                    colBottom[0].CopyFrom(colTop[0], true);
                    return;
                }

                if (m_param.layer_norm_param.enable_cuda_impl)
                    m_cuda.LayerNormBackward(m_hLayerNorm, colTop[0].gpu_data, colTop[0].gpu_diff, colBottom[0].mutable_gpu_diff);
                else
                    backward_local(colTop, rgbPropagateDown, colBottom);
            }
        }

        private void backward_local(BlobCollection<T> colTop, List<bool> rgbPropagateDown, BlobCollection<T> colBottom)
        {
            // Multiply the normalized output y (held in the top data) by the incoming gradient dy.
            // work = y * dy
            m_blobWork.ReshapeLike(colTop[0]);
            m_cuda.mul(m_nCount, colTop[0].gpu_data, colTop[0].gpu_diff, m_blobWork.mutable_gpu_diff);

            // Average (y * dy) across the normalized dim: m1 = (y * dy).mean(dim=-1)
            m_cuda.channel_mean(m_nCount, m_nOuterNum, m_nChannels, m_nInnerNum, m_blobWork.gpu_diff, m_blobVar.mutable_gpu_diff);

            // Average dy across the normalized dim: m2 = dy.mean(dim=-1)
            m_cuda.channel_mean(m_nCount, m_nOuterNum, m_nChannels, m_nInnerNum, colTop[0].gpu_diff, m_blobStdev.mutable_gpu_diff);

            // Broadcast m1 across the normalized dim and multiply by y: work = y * m1
            m_cuda.channel_fillfrom(m_nCount, m_nOuterNum, m_nChannels, m_nInnerNum, m_blobVar.gpu_diff, m_blobStdevFull.mutable_gpu_diff, DIR.FWD);
            m_cuda.mul(m_nCount, colTop[0].gpu_data, m_blobStdevFull.gpu_diff, m_blobWork.mutable_gpu_diff);

            // Broadcast m2 and add it in: work = (y * m1) + m2
            m_cuda.channel_fillfrom(m_nCount, m_nOuterNum, m_nChannels, m_nInnerNum, m_blobStdev.gpu_diff, m_blobStdevFull.mutable_gpu_diff, DIR.FWD);
            m_cuda.add(m_nCount, m_blobWork.gpu_diff, m_blobStdevFull.gpu_diff, m_blobWork.mutable_gpu_diff);

            // Subtract from the original gradient: work = dy - ((y * m1) + m2)
            m_cuda.sub(m_nCount, colTop[0].gpu_diff, m_blobWork.gpu_diff, m_blobWork.mutable_gpu_diff);

            // Divide by the stdev saved during the forward pass (broadcast in m_blobStdevFull data,
            // with epsilon added for numerical stability): dx = (dy - ((y * m1) + m2)) / std
            m_blobStdevFull.add_scalar(m_param.layer_norm_param.epsilon);
            m_cuda.div(m_nCount, m_blobWork.gpu_diff, m_blobStdevFull.gpu_data, colBottom[0].mutable_gpu_diff);
        }
    }
}
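Note on shape handling: in Reshape above, a 2D bottom blob of shape (N, D) maps to m_nOuterNum = N, m_nChannels = 1 and m_nInnerNum = D, so the statistics are taken over D. A higher-rank blob such as (B, T, C) maps to m_nOuterNum = B, m_nChannels = T and m_nInnerNum = C (count(2)), which matches PyTorch's LayerNorm(C) applied to a (batch, sequence, embedding) tensor.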
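For reference, the computation implemented by forward_local and backward_local can be written per normalization group of size K = m_nInnerNum. The backward formula below is reconstructed from the code (the m1 and m2 terms in backward_local), not quoted from the MyCaffe documentation:

    \mu = \frac{1}{K}\sum_{k=1}^{K} x_k, \qquad
    \sigma^2 = \frac{1}{K}\sum_{k=1}^{K}(x_k - \mu)^2, \qquad
    y_k = \frac{x_k - \mu}{\sqrt{\sigma^2 + \epsilon}}

    \frac{\partial L}{\partial x_k} = \frac{1}{\sqrt{\sigma^2 + \epsilon}}
        \left( \frac{\partial L}{\partial y_k}
             - \overline{\partial L/\partial y}
             - y_k\, \overline{y \odot \partial L/\partial y} \right)

where the bar denotes the mean over the group; \overline{y \odot \partial L/\partial y} is m1 = mean(y * dy) and \overline{\partial L/\partial y} is m2 = mean(dy).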
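As a sanity check for the local path, here is a minimal CPU sketch of the same forward computation. The class and method names are hypothetical (this helper is not part of MyCaffe), and it assumes the input has already been flattened into nOuter groups of nInner values per the mapping above:

using System;

// Hypothetical reference implementation (not part of MyCaffe) that normalizes each
// inner group of a flattened buffer, mirroring forward_local above.
public static class LayerNormReference
{
    // rgX holds nOuter * nInner values; each group of nInner values is normalized
    // with its own mean and variance.
    public static float[] Forward(float[] rgX, int nOuter, int nInner, float fEps)
    {
        float[] rgY = new float[rgX.Length];

        for (int i = 0; i < nOuter; i++)
        {
            int nOffset = i * nInner;

            // mean = x.mean(dim=-1)
            float fMu = 0;
            for (int k = 0; k < nInner; k++)
                fMu += rgX[nOffset + k];
            fMu /= nInner;

            // var = ((x - mean) ** 2).mean(dim=-1)
            float fVar = 0;
            for (int k = 0; k < nInner; k++)
            {
                float fXmu = rgX[nOffset + k] - fMu;
                fVar += fXmu * fXmu;
            }
            fVar /= nInner;

            // y = (x - mean) / sqrt(var + eps)
            float fStd = (float)Math.Sqrt(fVar + fEps);
            for (int k = 0; k < nInner; k++)
                rgY[nOffset + k] = (rgX[nOffset + k] - fMu) / fStd;
        }

        return rgY;
    }
}

Comparing this reference against the contents of colTop[0] within a small tolerance (e.g. 1e-5 for float) is an easy way to unit-test the GPU path.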
The Log class provides general output in text form.
Definition: Log.cs:13
void WriteLine(string str, bool bOverrideEnabled=false, bool bHeader=false, bool bError=false, bool bDisable=false)
Write a line of output.
Definition: Log.cs:80
void FAIL(string str)
Causes a failure which throws an exception with the descriptive text.
Definition: Log.cs:394
The BlobCollection contains a list of Blobs.
void Add(Blob< T > b)
Add a new Blob to the collection.
int Count
Returns the number of items in the collection.
void ReshapeLike(BlobCollection< T > src)
Reshapes all blobs in the collection to the sizes of the source.
void CopyFrom(BlobCollection< T > bSrc, bool bCopyDiff=false)
Copy the data or diff from another BlobCollection into this one.
The Blob is the main holder of data that moves through the Layers of the Net.
Definition: Blob.cs:25
int channels
DEPRECATED; legacy shape accessor channels: use shape(1) instead.
Definition: Blob.cs:800
long mutable_gpu_diff
Returns the diff GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1555
long mutable_gpu_data
Returns the data GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1487
void Reshape(int nNum, int nChannels, int nHeight, int nWidth, bool? bUseHalfSize=null)
DEPRECATED; use the List<int> Reshape overload instead.
Definition: Blob.cs:442
void add_scalar(double dfVal)
Adds a scalar value to the Blob.
Definition: Blob.cs:2779
void ReshapeLike(Blob< T > b, bool? bUseHalfSize=null)
Reshape this Blob to have the same shape as another Blob.
Definition: Blob.cs:648
string Name
Get/set the name of the Blob.
Definition: Blob.cs:2184
long gpu_diff
Returns the diff GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1541
int num
DEPRECATED; legacy shape accessor num: use shape(0) instead.
Definition: Blob.cs:792
long gpu_data
Returns the data GPU handle used by the CudaDnn connection.
Definition: Blob.cs:1479
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
Definition: CudaDnn.cs:969
An interface for the units of computation which can be composed into a Net.
Definition: Layer.cs:31
Log m_log
Specifies the Log for output.
Definition: Layer.cs:43
LayerParameter m_param
Specifies the LayerParameter describing the Layer.
Definition: Layer.cs:47
bool shareLayerBlob(Blob< T > b, List< int > rgMinShape)
Attempts to share a Layer Blob if another parameter Blob with the same name and acceptable size is found.
Definition: Layer.cs:1170
BlobCollection< T > m_colInternalBlobs
Specifies internal blobs used by the layer.
Definition: Layer.cs:59
CudaDnn< T > m_cuda
Specifies the CudaDnn connection to Cuda.
Definition: Layer.cs:39
LayerParameter.LayerType m_type
Specifies the Layer type.
Definition: Layer.cs:35
The LayerNormalizationLayer performs layer normalization similar to the PyTorch LayerNorm layer.
override void forward(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Computes the forward calculation.
override void backward(BlobCollection< T > colTop, List< bool > rgbPropagateDown, BlobCollection< T > colBottom)
Computes the error gradient w.r.t the inputs.
LayerNormLayer(CudaDnn< T > cuda, Log log, LayerParameter p)
The LayerNormalizationLayer constructor.
override void setup_internal_blobs(BlobCollection< T > col)
Derivative layers should add all internal blobs to the 'col' provided.
override int ExactNumTopBlobs
Returns the exact number of required top (output) Blobs: norm
override void LayerSetUp(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Setup the layer.
override void dispose()
Releases all GPU and host resources used by the Layer.
override void Reshape(BlobCollection< T > colBottom, BlobCollection< T > colTop)
Reshape the bottom (input) and top (output) blobs.
override int ExactNumBottomBlobs
Returns the exact number of required bottom (input) Blobs: data
Specifies the base parameter for all layers.
string name
Specifies the name of this LayerParameter.
LayerNormParameter layer_norm_param
Returns the parameter set when initialized with LayerType.LAYERNORM
LayerType
Specifies the layer type.
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
Definition: Annotation.cs:12
The MyCaffe.common namespace contains common MyCaffe classes.
Definition: BatchInput.cs:8
DIR
Defines the direction of data flow.
Definition: CudaDnn.cs:22
The MyCaffe.layers.gpt namespace contains all GPT related layers.
Definition: LayerFactory.cs:15
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-source project.
Definition: Annotation.cs:12