MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
MultiheadAttentionParameter.cs
1using System;
2using System.Collections.Generic;
3using System.Linq;
4using System.Text;
6using MyCaffe.basecode;
7
8namespace MyCaffe.param.gpt
9{
13 [Serializable]
14 [TypeConverter(typeof(ExpandableObjectConverter))]
16 {
// Backing field for the 'heads' property; starts at 6.
private uint m_nHeads = 6;
// Backing field for the 'embed' property (d_model); starts at 192.
private uint m_nEmbed = 192;
// Backing field for the 'attn_dropout' property; no initializer, so it starts at 0.
private double m_dfAttnDropout;
// Backing field for the 'resid_dropout' property; no initializer, so it starts at 0.
private double m_dfResidDropout;
// Backing field for the 'block_size' property; starts at 128.
private uint m_nBlockSize = 128;
// Backing field for the 'layers' property; starts at 6.
private uint m_nLayers = 6;
// Backing field for the 'weight_init' property; starts at ENCODER_DECODER.
private WEIGHT_INIT m_weightInit = WEIGHT_INIT.ENCODER_DECODER;
24
/// <summary>
/// Defines the weight initialization strategy.
/// </summary>
public enum WEIGHT_INIT
{
    /// <summary>
    /// GPT strategy (first member, underlying value 0).
    /// NOTE(review): the exact initialization performed is decided by the consuming layer - confirm there.
    /// </summary>
    GPT = 0,

    /// <summary>
    /// Encoder/decoder strategy (second member, underlying value 1).
    /// NOTE(review): the exact initialization performed is decided by the consuming layer - confirm there.
    /// </summary>
    ENCODER_DECODER = 1
}
39
// Empty constructor body: it assigns nothing, so the fields keep the values given by their initializers.
// NOTE(review): the constructor's signature line is not visible in this listing -
// presumably 'public MultiheadAttentionParameter()'; confirm against the original file.
42 {
43 }
44
/// <summary>
/// Gets or sets the number of layers (transformer blocks) used.
/// </summary>
[Description("Specifies number of layers (transformer blocks) used.")]
public uint layers
{
    get
    {
        return this.m_nLayers;
    }
    set
    {
        this.m_nLayers = value;
    }
}
54
/// <summary>
/// Gets or sets the number of heads used.
/// </summary>
[Description("Specifies number of heads used.")]
public uint heads
{
    get
    {
        return this.m_nHeads;
    }
    set
    {
        this.m_nHeads = value;
    }
}
64
/// <summary>
/// Gets or sets the embed size (d_model).
/// </summary>
// The [Description] attribute is added for consistency with the 'layers' and 'heads'
// properties, so this setting also shows a description in property-grid style editors.
[Description("Specifies the embed size (d_model).")]
public uint embed
{
    get { return m_nEmbed; }
    set { m_nEmbed = value; }
}
73
/// <summary>
/// Gets or sets the block size.
/// NOTE(review): presumably the maximum sequence length handled by the layer - confirm against the consuming layer.
/// </summary>
// The [Description] attribute is added for consistency with the 'layers' and 'heads'
// properties, so this setting also shows a description in property-grid style editors.
[Description("Specifies the block size.")]
public uint block_size
{
    get { return m_nBlockSize; }
    set { m_nBlockSize = value; }
}
82
/// <summary>
/// Gets or sets the dropout probability used on the attention weights.
/// </summary>
// The [Description] attribute is added for consistency with the 'layers' and 'heads'
// properties, so this setting also shows a description in property-grid style editors.
[Description("Specifies dropout probability used on the attention weights.")]
public double attn_dropout
{
    get { return m_dfAttnDropout; }
    set { m_dfAttnDropout = value; }
}
91
/// <summary>
/// Gets or sets the dropout probability used on the residual weights.
/// </summary>
// The [Description] attribute is added for consistency with the 'layers' and 'heads'
// properties, so this setting also shows a description in property-grid style editors.
[Description("Specifies dropout probability used on the residual weights.")]
public double resid_dropout
{
    get { return m_dfResidDropout; }
    set { m_dfResidDropout = value; }
}
100
// Accessors that read and write the m_weightInit field (the weight initialization strategy).
// NOTE(review): the property declaration line is not visible in this listing -
// presumably 'public WEIGHT_INIT weight_init'; confirm against the original file.
105 {
106 get { return m_weightInit; }
107 set { m_weightInit = value; }
108 }
109
/// <summary>
/// Load the parameter from a binary reader.
/// </summary>
/// <param name="br">Reader from which one string is read; that string is parsed into a RawProto.</param>
/// <param name="bNewInstance">When <i>false</i>, the loaded values are also copied into this instance via Copy.</param>
/// <returns>The object held in the local 'p' is returned in both cases.</returns>
111 public override object Load(System.IO.BinaryReader br, bool bNewInstance = true)
112 {
113 RawProto proto = RawProto.Parse(br.ReadString());
// NOTE(review): the line declaring 'p' is not visible in this listing - presumably a
// MultiheadAttentionParameter created from 'proto' via FromProto; confirm against the original file.
115
116 if (!bNewInstance)
117 Copy(p);
118
119 return p;
120 }
121
/// <summary>
/// Copy the settings of another parameter into this one.
/// </summary>
/// <param name="src">Parameter whose values are copied.</param>
123 public override void Copy(LayerParameterBase src)
124 {
// NOTE(review): the line declaring 'p' is not visible in this listing - presumably 'src'
// cast to MultiheadAttentionParameter; confirm against the original file.
126
// Every setting is read through the source's public property and stored directly in the local field.
127 m_nLayers = p.layers;
128 m_nHeads = p.heads;
129 m_nEmbed = p.embed;
130 m_nBlockSize = p.block_size;
131 m_dfAttnDropout = p.attn_dropout;
132 m_dfResidDropout = p.resid_dropout;
133 m_weightInit = p.weight_init;
134 }
135
/// <summary>
/// Creates a new copy of this instance of the parameter.
/// </summary>
/// <returns>The object held in 'p' after this instance's values were copied into it.</returns>
137 public override LayerParameterBase Clone()
138 {
// NOTE(review): the line declaring 'p' is not visible in this listing - presumably a newly
// constructed MultiheadAttentionParameter; confirm against the original file.
140 p.Copy(this);
141 return p;
142 }
143
/// <summary>
/// Convert the parameter into a RawProto.
/// </summary>
/// <param name="strName">Name passed as the first argument to the resulting RawProto.</param>
/// <returns>A RawProto whose children hold one name/value entry per setting.</returns>
public override RawProto ToProto(string strName)
{
    // Name/value pairs in the order in which they are added to the child collection.
    // NOTE(review): the two dropout values are formatted with the parameterless double.ToString(),
    // which uses the current culture; FromProto parses them with the current culture as well.
    KeyValuePair<string, string>[] rgEntries = new KeyValuePair<string, string>[]
    {
        new KeyValuePair<string, string>("layers", layers.ToString()),
        new KeyValuePair<string, string>("heads", heads.ToString()),
        new KeyValuePair<string, string>("embed", embed.ToString()),
        new KeyValuePair<string, string>("block_size", block_size.ToString()),
        new KeyValuePair<string, string>("attn_dropout", attn_dropout.ToString()),
        new KeyValuePair<string, string>("resid_dropout", resid_dropout.ToString()),
        new KeyValuePair<string, string>("weight_init", weight_init.ToString())
    };

    RawProtoCollection col = new RawProtoCollection();

    foreach (KeyValuePair<string, string> kv in rgEntries)
    {
        col.Add(kv.Key, kv.Value);
    }

    return new RawProto(strName, "", col);
}
163
// Body of FromProto: parses the parameter from a RawProto.
// NOTE(review): the signature line is not visible in this listing - presumably
// 'public static MultiheadAttentionParameter FromProto(RawProto rp)'; confirm against the original file.
// Each setting is optional: it is assigned only when rp.FindValue returns a non-null string for its name.
170 {
171 string strVal;
// NOTE(review): the line declaring 'p' is not visible in this listing - presumably a newly
// constructed MultiheadAttentionParameter; confirm against the original file.
173
174 if ((strVal = rp.FindValue("layers")) != null)
175 p.layers = uint.Parse(strVal);
176
177 if ((strVal = rp.FindValue("heads")) != null)
178 p.heads = uint.Parse(strVal);
179
180 if ((strVal = rp.FindValue("embed")) != null)
181 p.embed = uint.Parse(strVal);
182
183 if ((strVal = rp.FindValue("block_size")) != null)
184 p.block_size = uint.Parse(strVal);
185
// NOTE(review): double.Parse(string) without a format provider parses with the current culture,
// so the accepted decimal separator depends on the machine's regional settings.
186 if ((strVal = rp.FindValue("attn_dropout")) != null)
187 p.attn_dropout = double.Parse(strVal);
188
189 if ((strVal = rp.FindValue("resid_dropout")) != null)
190 p.resid_dropout = double.Parse(strVal);
191
// The weight_init text must match an enum member name exactly (case-sensitive string comparison);
// any other value throws.
192 if ((strVal = rp.FindValue("weight_init")) != null)
193 {
194 if (strVal == WEIGHT_INIT.GPT.ToString())
195 p.weight_init = WEIGHT_INIT.GPT;
196 else if (strVal == WEIGHT_INIT.ENCODER_DECODER.ToString())
197 p.weight_init = WEIGHT_INIT.ENCODER_DECODER;
198 else
199 throw new Exception("Unknown weight init strategy '" + strVal + "'!");
200 }
201
202 return p;
203 }
204 }
205}
The RawProtoCollection class is a list of RawProto objects.
void Add(RawProto p)
Adds a RawProto to the collection.
The RawProto class is used to parse and output Google prototxt file data.
Definition: RawProto.cs:17
static RawProto Parse(string str)
Parses a prototxt and places it in a new RawProto.
Definition: RawProto.cs:306
string FindValue(string strName)
Searches for a value of a node within this node's children.
Definition: RawProto.cs:105
The LayerParameterBase is the base class for all other layer specific parameters.
Specifies the parameters for the MultiheadAttentionLayer.
WEIGHT_INIT
Defines the weight initialization strategy.
double attn_dropout
Specifies dropout probability used on the attention weights.
override RawProto ToProto(string strName)
Convert the parameter into a RawProto.
override object Load(System.IO.BinaryReader br, bool bNewInstance=true)
Load the parameter from a binary reader.
uint layers
The number of layers (transformer blocks) used.
double resid_dropout
Specifies dropout probability used on the residual weights.
override LayerParameterBase Clone()
Creates a new copy of this instance of the parameter.
MultiheadAttentionParameter()
Constructor for the parameter.
override void Copy(LayerParameterBase src)
Copy one parameter to another.
static MultiheadAttentionParameter FromProto(RawProto rp)
Parses the parameter from a RawProto.
WEIGHT_INIT weight_init
Specifies the weight initialization strategy (default = ENCODER_DECODER).
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
Definition: Annotation.cs:12
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...
Definition: Annotation.cs:12