MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
VocabularyCharacter.cs
1using System;
2using System.Collections.Generic;
3using System.Linq;
4using System.Text;
5using System.Threading.Tasks;
6
7namespace MyCaffe.layers.gpt
8{
13 {
// When true, index 0 is reserved for the pad token: Count includes one extra slot and Build() offsets every index by 1.
14 bool m_bEnablePad = true;
// Random source used by Tokenize to pick an index for characters missing from the vocabulary when they are not required to exist.
15 Random m_random;
// Maps each vocabulary character to its index token; final indices are assigned by Build().
16 Dictionary<char, int> m_rgVocabKeyToIdx = new Dictionary<char, int>();
// Maps each index token back to its character; populated by Build().
17 Dictionary<int, char> m_rgVocabIdxToKey = new Dictionary<int, char>();
// When true, the BOS character is registered by the constructor and the BOS token is recognized by Detokenize.
18 bool m_bAddBos;
// When true, the EOS character is registered by the constructor and the EOS token is recognized by Detokenize.
19 bool m_bAddEos;
20
/// <summary>
/// The constructor.
/// </summary>
/// <param name="random">Specifies the random number generator used when tokenizing characters that are not in the vocabulary.</param>
/// <param name="bAddBos">Specifies to register the special BOS character in the vocabulary.</param>
/// <param name="bAddEos">Specifies to register the special EOS character in the vocabulary.</param>
/// <param name="bEnablePad">Specifies to reserve index 0 for the pad token.</param>
public VocabularyCharacter(Random random, bool bAddBos, bool bAddEos, bool bEnablePad)
{
    m_bEnablePad = bEnablePad;
    m_bAddEos = bAddEos;
    m_bAddBos = bAddBos;
    m_random = random;

    // The indices stored here are placeholders; Build() assigns the final index of every key.
    if (m_bAddBos)
        m_rgVocabKeyToIdx.Add(BOS, 1);

    if (m_bAddEos)
        m_rgVocabKeyToIdx.Add(EOS, 2);
}
41
/// <summary>
/// Returns the size of the vocabulary: the number of registered characters plus one slot for the pad token when padding is enabled.
/// </summary>
public int Count
{
    get
    {
        int nCount = m_rgVocabKeyToIdx.Count;

        if (m_bEnablePad)
            nCount++;

        return nCount;
    }
}
49
/// <summary>
/// Adds a new character to the vocabulary; characters already present are left untouched.
/// </summary>
/// <param name="ch">Specifies the character to add.</param>
public void Add(char ch)
{
    if (m_rgVocabKeyToIdx.ContainsKey(ch))
        return;

    // The index stored here is a placeholder; Build() assigns the final index.
    m_rgVocabKeyToIdx.Add(ch, 1);
}
59
/// <summary>
/// Adds every character of a string to the vocabulary.
/// </summary>
/// <param name="str">Specifies the string whose characters are added.</param>
public void Add(string str)
{
    for (int i = 0; i < str.Length; i++)
        Add(str[i]);
}
71
/// <summary>
/// Builds the vocabulary from all characters added so far, assigning indices in sorted character order.
/// </summary>
/// <remarks>
/// Both lookup tables are rebuilt from scratch, so Build may be called again after more characters have been added.
/// </remarks>
/// <returns>The vocabulary size (see Count) is returned.</returns>
public int Build()
{
    List<char> rgKeys = m_rgVocabKeyToIdx.Keys.ToList();
    rgKeys.Sort();

    // Reset both directions of the mapping; leaving the index-to-key table populated
    // would make a second Build() fail with a duplicate key error.
    m_rgVocabKeyToIdx.Clear();
    m_rgVocabIdxToKey.Clear();

    // When padding is enabled, index 0 is reserved for the pad token and all keys shift up by one.
    int nPadOffset = (m_bEnablePad) ? 1 : 0;

    for (int i = 0; i < rgKeys.Count; i++)
    {
        int nIdx = i + nPadOffset;
        m_rgVocabKeyToIdx.Add(rgKeys[i], nIdx);
        m_rgVocabIdxToKey.Add(nIdx, rgKeys[i]);
    }

    return Count;
}
94
/// <summary>
/// Builds the vocabulary from a string by adding each of its characters and then building the index tables.
/// </summary>
/// <param name="strData">Specifies the data whose characters make up the vocabulary.</param>
/// <returns>The vocabulary size returned by Build() is returned.</returns>
public int BuildFromString(string strData)
{
    // Add(string) registers every character of the string, one at a time.
    Add(strData);

    int nVocabSize = Build();
    return nVocabSize;
}
109
/// <summary>
/// Returns the special BOS character (character code 1).
/// </summary>
public char BOS
{
    get
    {
        return '\u0001';
    }
}
117
/// <summary>
/// Returns the special EOS character (character code 2).
/// </summary>
public char EOS
{
    get
    {
        return '\u0002';
    }
}
125
/// <summary>
/// Creates a target that is offset from the source by one and ends with an EOS.
/// </summary>
/// <remarks>
/// The first source token is dropped and the EOS value is appended, so the result has the same length as the source.
/// An empty source is rejected by the list's RemoveAt call.
/// </remarks>
/// <param name="rgSrc">Specifies the source tokens.</param>
/// <returns>The shifted target tokens are returned.</returns>
public int[] CreateTarget(int[] rgSrc)
{
    List<int> rgShifted = new List<int>(rgSrc);

    // Shift left by one position...
    rgShifted.RemoveAt(0);

    // ...and terminate with the numeric value of the EOS character.
    int nEos = EOS;
    rgShifted.Add(nEos);

    return rgShifted.ToArray();
}
140
/// <summary>
/// Tokenizes a single character into its corresponding index token.
/// </summary>
/// <param name="str1">Specifies a string containing exactly one character.</param>
/// <param name="bMustExist">When true (default), a character missing from the vocabulary is an error; when false, a random index in [0, Count) is returned in its place.</param>
/// <returns>A list holding the single index token is returned.</returns>
/// <exception cref="Exception">Thrown when the string is not exactly one character long, or when the character is not in the vocabulary and bMustExist is true.</exception>
public List<int> Tokenize(string str1, bool bMustExist = true)
{
    if (str1.Length != 1)
        throw new Exception("The character must be a single character!");

    List<int> rgTokens = new List<int>();
    char ch = str1[0];
    int nIdx;

    if (m_rgVocabKeyToIdx.TryGetValue(ch, out nIdx))
    {
        rgTokens.Add(nIdx);
    }
    else if (bMustExist)
    {
        throw new Exception("The character '" + ch.ToString() + "' is not in the vocabulary!");
    }
    else
    {
        // Unknown character: substitute a random token and stop here. Looking the
        // character up afterwards would always fail because it is not in the table.
        rgTokens.Add(m_random.Next(Count));
    }

    return rgTokens;
}
166
/// <summary>
/// Tokenizes a string of data, one character at a time.
/// </summary>
/// <param name="str">Specifies the string to tokenize; every character must exist in the vocabulary.</param>
/// <param name="bAddBos">Specifies to place the BOS value at the start of the tokens.</param>
/// <param name="bAddEos">Specifies to place the EOS value at the end of the tokens.</param>
/// <returns>The array of index tokens is returned.</returns>
public int[] Tokenize(string str, bool bAddBos, bool bAddEos)
{
    List<int> rgTokens = new List<int>();

    if (bAddBos)
        rgTokens.Add(BOS);

    for (int i = 0; i < str.Length; i++)
    {
        List<int> rgCharTokens = Tokenize(str[i].ToString());
        rgTokens.AddRange(rgCharTokens);
    }

    if (bAddEos)
        rgTokens.Add(EOS);

    return rgTokens.ToArray();
}
191
/// <summary>
/// Detokenizes an index token into its corresponding character.
/// </summary>
/// <param name="nIdxToken">Specifies the index token to convert.</param>
/// <param name="bIgnoreBos">When true, a BOS token produces an empty string instead of the "&lt;BOS&gt;" marker.</param>
/// <param name="bIgnoreEos">When true, an EOS token produces an empty string instead of the "&lt;EOS&gt;" marker.</param>
/// <returns>The character as a string, a BOS/EOS marker, or an empty string (ignored marker or token 0) is returned.</returns>
/// <exception cref="Exception">Thrown when the token is non-zero and not found in the vocabulary.</exception>
public string Detokenize(int nIdxToken, bool bIgnoreBos, bool bIgnoreEos)
{
    // The special tokens are only recognized when the vocabulary was created with them.
    if (m_bAddBos && nIdxToken == BOS)
        return (bIgnoreBos) ? "" : "<BOS>";

    if (m_bAddEos && nIdxToken == EOS)
        return (bIgnoreEos) ? "" : "<EOS>";

    char chKey;
    if (m_rgVocabIdxToKey.TryGetValue(nIdxToken, out chKey))
        return chKey.ToString();

    // Token 0 without a mapped character yields no output.
    if (nIdxToken == 0)
        return "";

    throw new Exception("The token '" + nIdxToken.ToString() + "' is not in the vocabulary!");
}
227
/// <summary>
/// Detokenizes an array of index tokens into a string.
/// </summary>
/// <remarks>
/// Each value is truncated to an int and detokenized individually; only the first character of each
/// result is considered. Processing stops when that character is the EOS character, and the null and
/// BOS characters are skipped.
/// </remarks>
/// <param name="rgf">Specifies the index tokens, stored as floats.</param>
/// <param name="bIgnoreBos">Passed through to the single-token Detokenize.</param>
/// <param name="bIgnoreEos">Passed through to the single-token Detokenize.</param>
/// <returns>The detokenized string is returned.</returns>
public string Detokenize(float[] rgf, bool bIgnoreBos, bool bIgnoreEos)
{
    // Accumulate in a builder: repeated string concatenation copies the whole result on every token.
    StringBuilder sb = new StringBuilder();

    foreach (float f in rgf)
    {
        string str1 = Detokenize((int)f, bIgnoreBos, bIgnoreEos);

        if (string.IsNullOrEmpty(str1))
            continue;

        // NOTE(review): only the first character is used, so a "<BOS>"/"<EOS>" marker returned
        // when the corresponding ignore flag is false contributes just '<' - confirm intent.
        char ch = str1[0];

        if (ch == EOS)
            break;

        if (ch != 0 && ch != BOS)
            sb.Append(ch);
    }

    return sb.ToString();
}
257 }
258}
The VocabularyCharacter class manages the data vocabulary of characters.
void Add(char ch)
Adds a new character to the vocabulary.
string Detokenize(int nIdxToken, bool bIgnoreBos, bool bIgnoreEos)
Detokenize an index token into its corresponding character.
int[] Tokenize(string str, bool bAddBos, bool bAddEos)
Tokenize a string of data.
int BuildFromString(string strData)
Build the vocabulary from a string.
List< int > Tokenize(string str1, bool bMustExist=true)
Tokenize a character into its corresponding index token.
char BOS
Returns the special BOS character.
int Build()
Builds the vocabulary from all characters added.
void Add(string str)
Add a string of characters to the vocabulary.
string Detokenize(float[] rgf, bool bIgnoreBos, bool bIgnoreEos)
Detokenize an array into a string.
VocabularyCharacter(Random random, bool bAddBos, bool bAddEos, bool bEnablePad)
The constructor.
int[] CreateTarget(int[] rgSrc)
Create a target that is offset from the source by one and ends with an EOS.
char EOS
Returns the special EOS character.
int Count
Returns the size of the vocabulary.
The IVocabulary interface specifies the interface that all Vocabularies implement.
Definition: Interfaces.cs:14
The MyCaffe.layers.gpt namespace contains all GPT related layers.
Definition: LayerFactory.cs:15