MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
TransformerBlockParameter.cs
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.ComponentModel;
using MyCaffe.basecode;

namespace MyCaffe.param.gpt
{
    /// <summary>
    /// Specifies the parameters for the TransformerBlockLayer.
    /// </summary>
    [Serializable]
    [TypeConverter(typeof(ExpandableObjectConverter))]
    public class TransformerBlockParameter : LayerParameterBase
    {
        uint m_nHeads = 6;
        uint m_nEmbed = 192;
        double m_dfAttnDropout = 0.1;
        double m_dfResidDropout = 0.1;
        uint m_nBlockSize = 128;
        uint m_nLayers = 6;
        ACTIVATION m_activation = ACTIVATION.RELU;
        BLOCK_TYPE m_type = BLOCK_TYPE.CAUSAL_SELF_ATTENTION;
        bool m_bEnableLayerNormCudaImplementation = false;

        /// <summary>
        /// Defines the type of transformer block.
        /// </summary>
        public enum BLOCK_TYPE
        {
            CAUSAL_SELF_ATTENTION = 0,
            ENCODER,
            DECODER
        }

        /// <summary>
        /// Defines the various activations supported by the TransformerBlock.
        /// </summary>
        public enum ACTIVATION
        {
            RELU = 0,
            GELU = 1,
            GELU_BERT = 2
        }

        /// <summary>
        /// Constructor for the parameter.
        /// </summary>
        public TransformerBlockParameter()
        {
        }

        /// <summary>
        /// Specifies to use the low-level full cuda implementation of LayerNorm (default = false).
        /// </summary>
        public bool enable_layernorm_cuda_impl
        {
            get { return m_bEnableLayerNormCudaImplementation; }
            set { m_bEnableLayerNormCudaImplementation = value; }
        }

        /// <summary>
        /// Specifies the activation type to use (default = RELU).
        /// </summary>
        public ACTIVATION activation
        {
            get { return m_activation; }
            set { m_activation = value; }
        }

        /// <summary>
        /// Specifies the type of transformer block to configure.
        /// </summary>
        public BLOCK_TYPE block_type
        {
            get { return m_type; }
            set { m_type = value; }
        }

        /// <summary>
        /// The number of layers (transformer blocks) used.
        /// </summary>
        [Description("Specifies number of layers (transformer blocks) used.")]
        public uint layers
        {
            get { return m_nLayers; }
            set { m_nLayers = value; }
        }

        /// <summary>
        /// The number of heads used.
        /// </summary>
        [Description("Specifies number of heads used.")]
        public uint heads
        {
            get { return m_nHeads; }
            set { m_nHeads = value; }
        }

        public uint embed
        {
            get { return m_nEmbed; }
            set { m_nEmbed = value; }
        }

        public uint block_size
        {
            get { return m_nBlockSize; }
            set { m_nBlockSize = value; }
        }

        /// <summary>
        /// Specifies dropout probability used on the attention weights.
        /// </summary>
        public double attn_dropout
        {
            get { return m_dfAttnDropout; }
            set { m_dfAttnDropout = value; }
        }

        /// <summary>
        /// Specifies dropout probability used on the residual weights.
        /// </summary>
        public double resid_dropout
        {
            get { return m_dfResidDropout; }
            set { m_dfResidDropout = value; }
        }

        /// <summary>
        /// Load the parameter from a binary reader.
        /// </summary>
        public override object Load(System.IO.BinaryReader br, bool bNewInstance = true)
        {
            RawProto proto = RawProto.Parse(br.ReadString());
            TransformerBlockParameter p = FromProto(proto);

            if (!bNewInstance)
                Copy(p);

            return p;
        }

        /// <summary>
        /// Copy one parameter to another.
        /// </summary>
        public override void Copy(LayerParameterBase src)
        {
            TransformerBlockParameter p = (TransformerBlockParameter)src;

            m_nLayers = p.layers;
            m_nHeads = p.heads;
            m_nEmbed = p.embed;
            m_nBlockSize = p.block_size;
            m_dfAttnDropout = p.attn_dropout;
            m_dfResidDropout = p.resid_dropout;
            m_activation = p.activation;
            m_type = p.block_type;
            m_bEnableLayerNormCudaImplementation = p.enable_layernorm_cuda_impl;
        }

        /// <summary>
        /// Creates a new copy of this instance of the parameter.
        /// </summary>
        public override LayerParameterBase Clone()
        {
            TransformerBlockParameter p = new TransformerBlockParameter();
            p.Copy(this);
            return p;
        }

        /// <summary>
        /// Convert the parameter into a RawProto.
        /// </summary>
        public override RawProto ToProto(string strName)
        {
            RawProtoCollection rgChildren = new RawProtoCollection();

            rgChildren.Add("layers", layers.ToString());
            rgChildren.Add("heads", heads.ToString());
            rgChildren.Add("embed", embed.ToString());
            rgChildren.Add("block_size", block_size.ToString());
            rgChildren.Add("attn_dropout", attn_dropout.ToString());
            rgChildren.Add("resid_dropout", resid_dropout.ToString());
            rgChildren.Add("activation", activation.ToString());
            rgChildren.Add("block_type", block_type.ToString());
            rgChildren.Add("enable_ln_cuda_impl", enable_layernorm_cuda_impl.ToString());

            return new RawProto(strName, "", rgChildren);
        }

        /// <summary>
        /// Parses the parameter from a RawProto.
        /// </summary>
        public static TransformerBlockParameter FromProto(RawProto rp)
        {
            string strVal;
            TransformerBlockParameter p = new TransformerBlockParameter();

            if ((strVal = rp.FindValue("layers")) != null)
                p.layers = uint.Parse(strVal);

            if ((strVal = rp.FindValue("heads")) != null)
                p.heads = uint.Parse(strVal);

            if ((strVal = rp.FindValue("embed")) != null)
                p.embed = uint.Parse(strVal);

            if ((strVal = rp.FindValue("block_size")) != null)
                p.block_size = uint.Parse(strVal);

            if ((strVal = rp.FindValue("attn_dropout")) != null)
                p.attn_dropout = double.Parse(strVal);

            if ((strVal = rp.FindValue("resid_dropout")) != null)
                p.resid_dropout = double.Parse(strVal);

            if ((strVal = rp.FindValue("activation")) != null)
            {
                if (strVal == ACTIVATION.GELU.ToString())
                    p.activation = ACTIVATION.GELU;
                else if (strVal == ACTIVATION.GELU_BERT.ToString())
                    p.activation = ACTIVATION.GELU_BERT;
                else
                    p.activation = ACTIVATION.RELU;
            }

            if ((strVal = rp.FindValue("block_type")) != null)
            {
                if (strVal == BLOCK_TYPE.CAUSAL_SELF_ATTENTION.ToString())
                    p.block_type = BLOCK_TYPE.CAUSAL_SELF_ATTENTION;
                else if (strVal == BLOCK_TYPE.ENCODER.ToString())
                    p.block_type = BLOCK_TYPE.ENCODER;
                else if (strVal == BLOCK_TYPE.DECODER.ToString())
                    p.block_type = BLOCK_TYPE.DECODER;
            }

            if ((strVal = rp.FindValue("enable_ln_cuda_impl")) != null)
                p.enable_layernorm_cuda_impl = bool.Parse(strVal);

            return p;
        }
    }
}
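Example usage: a minimal sketch (not part of the source file) that configures the parameter and round-trips it through its RawProto form, using only the members shown in the listing above.

using MyCaffe.basecode;
using MyCaffe.param.gpt;

public class TransformerBlockParameterExample
{
    public static void Run()
    {
        // Configure a GPT-style causal self-attention block.
        TransformerBlockParameter p = new TransformerBlockParameter();
        p.layers = 6;
        p.heads = 6;
        p.embed = 192;
        p.block_size = 128;
        p.activation = TransformerBlockParameter.ACTIVATION.GELU;
        p.block_type = TransformerBlockParameter.BLOCK_TYPE.CAUSAL_SELF_ATTENTION;

        // Serialize to a RawProto, then parse back into a new instance.
        RawProto proto = p.ToProto("transformer_block");
        TransformerBlockParameter p2 = TransformerBlockParameter.FromProto(proto);

        // Clone also produces an independent copy via Copy().
        TransformerBlockParameter p3 = (TransformerBlockParameter)p.Clone();
    }
}

Because ToProto returns the node whose children hold the field values, that same node can be handed straight back to FromProto.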
The RawProtoCollection class is a list of RawProto objects.
void Add(RawProto p)
Adds a RawProto to the collection.
The RawProto class is used to parse and output Google prototxt file data.
Definition: RawProto.cs:17
static RawProto Parse(string str)
Parses a prototxt and places it in a new RawProto.
Definition: RawProto.cs:306
string FindValue(string strName)
Searches for a value of a node within this node's children.
Definition: RawProto.cs:105
The LayerParameterBase is the base class for all other layer specific parameters.
Specifies the parameters for the TransformerBlockLayer.
override object Load(System.IO.BinaryReader br, bool bNewInstance=true)
Load the parameter from a binary reader.
double resid_dropout
Specifies dropout probability used on the residual weights.
ACTIVATION
Defines the various activations supported by the TransformerBlock.
uint layers
The number of layers (transformer blocks) used.
static TransformerBlockParameter FromProto(RawProto rp)
Parses the parameter from a RawProto.
override RawProto ToProto(string strName)
Convert the parameter into a RawProto.
override void Copy(LayerParameterBase src)
Copy one parameter to another.
BLOCK_TYPE block_type
Specifies the type of transformer block to configure.
ACTIVATION activation
Specifies the activation type to use (default = RELU)
bool enable_layernorm_cuda_impl
Specifies to use the low-level full cuda implementation of LayerNorm (default = false).
TransformerBlockParameter()
Constructor for the parameter.
double attn_dropout
Specifies dropout probability used on the attention weights.
BLOCK_TYPE
Defines the type of transformer block
override LayerParameterBase Clone()
Creates a new copy of this instance of the parameter.
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
Definition: Annotation.cs:12
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...
Definition: Annotation.cs:12
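A parameter can also be parsed from prototxt text directly. A minimal sketch, assuming RawProto.Parse places top-level fields as children of the node it returns; the field names match those written by ToProto, including the abbreviated enable_ln_cuda_impl key.

// Hypothetical prototxt body; only the keys read by FromProto are used.
string strProto =
    "layers: 12 heads: 8 embed: 512 block_size: 256 " +
    "attn_dropout: 0.1 resid_dropout: 0.1 " +
    "activation: GELU_BERT block_type: ENCODER " +
    "enable_ln_cuda_impl: True";

RawProto rp = RawProto.Parse(strProto); // assumption: fields become children of rp.
TransformerBlockParameter p = TransformerBlockParameter.FromProto(rp);

Note from the FromProto source above: an unrecognized activation string falls back to RELU, while an unrecognized block_type leaves the default CAUSAL_SELF_ATTENTION in place.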