MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
VocabularyWord.cs
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using System.Threading.Tasks;
7
8namespace MyCaffe.layers.gpt
9{
14 {
/// <summary>Random number generator handed to the constructor.</summary>
private Random m_random;
/// <summary>Maps each vocabulary key (a word or a single symbol) to its index.</summary>
private Dictionary<string, int> m_rgVocabKeyToIdx = new Dictionary<string, int>();
/// <summary>Maps each index back to its vocabulary key; populated by Build().</summary>
private Dictionary<int, string> m_rgVocabIdxToKey = new Dictionary<int, string>();
/// <summary>True when the BOS key was registered at construction.</summary>
private bool m_bAddBos;
/// <summary>True when the EOS key was registered at construction.</summary>
private bool m_bAddEos;
20
/// <summary>
/// The constructor.
/// </summary>
/// <param name="random">Random number generator stored for later use.</param>
/// <param name="bAddBos">When true, the BOS key is registered in the vocabulary.</param>
/// <param name="bAddEos">When true, the EOS key is registered in the vocabulary.</param>
public VocabularyWord(Random random, bool bAddBos, bool bAddEos)
{
    m_random = random;
    m_bAddBos = bAddBos;
    m_bAddEos = bAddEos;

    // The values stored here are placeholders; Build() re-assigns every index.
    if (m_bAddBos)
    {
        string strBosKey = BOS.ToString();
        m_rgVocabKeyToIdx.Add(strBosKey, 1);
    }

    if (m_bAddEos)
    {
        string strEosKey = EOS.ToString();
        m_rgVocabKeyToIdx.Add(strEosKey, 2);
    }
}
39
/// <summary>
/// Returns the size of the vocabulary: the number of registered keys plus one
/// slot for index 0, which Build() leaves reserved for padding.
/// </summary>
public int Count
{
    get
    {
        int nKeyCount = m_rgVocabKeyToIdx.Count;
        return nKeyCount + 1;
    }
}
47
/// <summary>
/// Returns true when the character is a digit, a punctuation mark or a symbol.
/// </summary>
/// <param name="ch">Specifies the character to classify.</param>
/// <returns>True for digits, punctuation and symbols; false otherwise.</returns>
private bool isSymbol(char ch)
{
    if (char.IsDigit(ch) || char.IsPunctuation(ch) || char.IsSymbol(ch))
        return true;

    // Explicit category check retained from the original implementation.
    switch (char.GetUnicodeCategory(ch))
    {
        case System.Globalization.UnicodeCategory.OtherPunctuation:
        case System.Globalization.UnicodeCategory.OtherSymbol:
        case System.Globalization.UnicodeCategory.DecimalDigitNumber:
            return true;

        default:
            return false;
    }
}
67
/// <summary>
/// Removes every white-space character from the string (not only leading and trailing ones).
/// </summary>
/// <param name="str">Specifies the string to clean; must not be null.</param>
/// <returns>The input with all white-space characters removed.</returns>
private string trim(string str)
{
    // A StringBuilder avoids re-allocating the result string once per character.
    StringBuilder sb = new StringBuilder(str.Length);

    foreach (char ch in str)
    {
        // char.IsWhiteSpace already covers the SpaceSeparator category,
        // so no separate Unicode category test is required.
        if (!char.IsWhiteSpace(ch))
            sb.Append(ch);
    }

    return sb.ToString();
}
82
/// <summary>
/// Adds the words of a space-separated string to the vocabulary. Each word is
/// lower-cased, stripped of surrounding quote characters and white-space, and
/// any symbols at its end or start are registered as separate single-character keys.
/// </summary>
/// <param name="str">Specifies the space-separated text to add.</param>
public void Add(string str)
{
    foreach (string strRaw in str.Split(' '))
    {
        if (string.IsNullOrEmpty(strRaw))
            continue;

        string strClean = trim(strRaw.ToLower().Trim('\'', '\"'));

        // Peel symbols off the end, registering each one as its own key.
        int nEnd = strClean.Length;
        while (nEnd > 0 && isSymbol(strClean[nEnd - 1]))
        {
            string strTail = strClean[nEnd - 1].ToString();
            if (!m_rgVocabKeyToIdx.ContainsKey(strTail))
                m_rgVocabKeyToIdx.Add(strTail, 1);

            nEnd--;
        }

        // Nothing but symbols (or nothing at all) was present.
        if (nEnd == 0)
            continue;

        // Peel symbols off the start; the loop stops before nEnd because the
        // character at nEnd - 1 is known not to be a symbol.
        int nStart = 0;
        while (nStart < nEnd && isSymbol(strClean[nStart]))
        {
            string strHead = strClean[nStart].ToString();
            if (!m_rgVocabKeyToIdx.ContainsKey(strHead))
                m_rgVocabKeyToIdx.Add(strHead, 1);

            nStart++;
        }

        // The index value is a placeholder; Build() assigns the final indexes.
        string strCore = strClean.Substring(nStart, nEnd - nStart);
        if (!m_rgVocabKeyToIdx.ContainsKey(strCore))
            m_rgVocabKeyToIdx.Add(strCore, 1);
    }
}
130
/// <summary>
/// Builds the vocabulary from all keys added so far by sorting the keys and
/// assigning consecutive indexes starting at 1 (index 0 is reserved for pad).
/// </summary>
/// <remarks>
/// Keys are sorted ordinally so that the index assignment does not depend on the
/// current culture. Under ordinal ordering the BOS (char 1) and EOS (char 2) keys
/// sort ahead of ordinary text, so when both are registered they receive
/// indexes 1 and 2 respectively.
/// </remarks>
/// <returns>The vocabulary size (see Count).</returns>
public int Build()
{
    List<string> rgKeys = new List<string>(m_rgVocabKeyToIdx.Keys);
    rgKeys.Sort(StringComparer.Ordinal);

    // Reset both maps so that Build() may safely be called more than once.
    m_rgVocabKeyToIdx.Clear();
    m_rgVocabIdxToKey.Clear();

    // index 0 reserved for pad.
    for (int i = 0; i < rgKeys.Count; i++)
    {
        int nIdx = i + 1;
        m_rgVocabKeyToIdx.Add(rgKeys[i], nIdx);
        m_rgVocabIdxToKey.Add(nIdx, rgKeys[i]);
    }

    return Count;
}
151
/// <summary>
/// Build the vocabulary from a string of space-separated words.
/// </summary>
/// <param name="strData">Specifies the text whose words make up the vocabulary.</param>
/// <returns>The vocabulary size returned by Build().</returns>
public int BuildFromString(string strData)
{
    // Add() splits its argument on spaces itself, so the whole text is handed over in one call.
    Add(strData);

    int nVocabSize = Build();
    return nVocabSize;
}
167
/// <summary>
/// Returns the special BOS character (character code 1).
/// </summary>
public char BOS
{
    get
    {
        return '\u0001';
    }
}
175
/// <summary>
/// Returns the special EOS character (character code 2).
/// </summary>
public char EOS
{
    get
    {
        return '\u0002';
    }
}
183
/// <summary>
/// Create a target that is offset from the source by one and ends with an EOS.
/// </summary>
/// <param name="rgSrc">Specifies the source tokens; must contain at least one token.</param>
/// <returns>An array of the same length as the source: source[1..] followed by EOS.</returns>
/// <exception cref="ArgumentNullException">Thrown when the source is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown when the source is empty.</exception>
public int[] CreateTarget(int[] rgSrc)
{
    if (rgSrc == null)
        throw new ArgumentNullException("rgSrc");

    // There is no first token to drop from an empty source.
    if (rgSrc.Length == 0)
        throw new ArgumentOutOfRangeException("rgSrc", "The source must contain at least one token.");

    int[] rgTrg = new int[rgSrc.Length];

    // Shift left by one position, then terminate with EOS.
    Array.Copy(rgSrc, 1, rgTrg, 0, rgSrc.Length - 1);
    rgTrg[rgTrg.Length - 1] = EOS;

    return rgTrg;
}
198
/// <summary>
/// Tokenize a single word into its index tokens. The word is lower-cased and
/// stripped of surrounding quote characters and white-space; symbols at its start
/// and end become separate tokens. Tokens are returned in text order: leading
/// symbols, then the word, then trailing symbols.
/// </summary>
/// <param name="strWord">Specifies the word to tokenize.</param>
/// <param name="bMustExist">Currently unused: keys missing from the vocabulary are silently skipped.</param>
/// <returns>The index tokens found for the word (empty when nothing matches).</returns>
public List<int> Tokenize(string strWord, bool bMustExist = true)
{
    List<int> rgTokens = new List<int>();

    if (string.IsNullOrEmpty(strWord))
        return rgTokens;

    string strWord1 = trim(strWord.ToLower().Trim('\'', '\"'));

    // Locate the word between the trailing and leading runs of symbols.
    int nEnd = strWord1.Length;
    while (nEnd > 0 && isSymbol(strWord1[nEnd - 1]))
    {
        nEnd--;
    }

    int nStart = 0;
    while (nStart < nEnd && isSymbol(strWord1[nStart]))
    {
        nStart++;
    }

    int nIdx;

    // Leading symbols, in the order they appear in the text.
    for (int i = 0; i < nStart; i++)
    {
        if (m_rgVocabKeyToIdx.TryGetValue(strWord1[i].ToString(), out nIdx))
            rgTokens.Add(nIdx);
    }

    // The word itself (absent when the input consisted only of symbols).
    if (nStart < nEnd)
    {
        if (m_rgVocabKeyToIdx.TryGetValue(strWord1.Substring(nStart, nEnd - nStart), out nIdx))
            rgTokens.Add(nIdx);
    }

    // Trailing symbols follow the word so that detokenizing the sequence
    // reproduces the original character order.
    for (int i = nEnd; i < strWord1.Length; i++)
    {
        if (m_rgVocabKeyToIdx.TryGetValue(strWord1[i].ToString(), out nIdx))
            rgTokens.Add(nIdx);
    }

    return rgTokens;
}
247
/// <summary>
/// Tokenize a string of space-separated words.
/// </summary>
/// <param name="str">Specifies the text to tokenize.</param>
/// <param name="bAddBos">When true, the BOS token is placed at the start of the result.</param>
/// <param name="bAddEos">When true, the EOS token is placed at the end of the result.</param>
/// <returns>The tokens of every word, optionally framed by BOS and EOS.</returns>
public int[] Tokenize(string str, bool bAddBos, bool bAddEos)
{
    string[] rgWords = str.Split(' ');
    List<int> rgAll = new List<int>();

    if (bAddBos)
        rgAll.Add(BOS);

    for (int i = 0; i < rgWords.Length; i++)
    {
        List<int> rgWordTokens = Tokenize(rgWords[i]);
        rgAll.AddRange(rgWordTokens);
    }

    if (bAddEos)
        rgAll.Add(EOS);

    return rgAll.ToArray();
}
273
/// <summary>
/// Detokenize an index token into its corresponding vocabulary key.
/// </summary>
/// <param name="nIdxToken">Specifies the index token; 0 is the pad index.</param>
/// <param name="bIgnoreBos">When true, a BOS token yields an empty string instead of "&lt;BOS&gt;".</param>
/// <param name="bIgnoreEos">When true, an EOS token yields an empty string instead of "&lt;EOS&gt;".</param>
/// <returns>Null for the pad index, otherwise the text of the token.</returns>
/// <exception cref="Exception">Thrown when the token is not in the vocabulary.</exception>
public string Detokenize(int nIdxToken, bool bIgnoreBos, bool bIgnoreEos)
{
    // The pad index carries no text.
    if (nIdxToken == 0)
        return null;

    if (m_bAddBos && nIdxToken == BOS)
        return bIgnoreBos ? "" : "<BOS>";

    if (m_bAddEos && nIdxToken == EOS)
        return bIgnoreEos ? "" : "<EOS>";

    string strKey;
    if (!m_rgVocabIdxToKey.TryGetValue(nIdxToken, out strKey))
        throw new Exception("The token '" + nIdxToken.ToString() + "' is not in the vocabulary!");

    return strKey;
}
312
/// <summary>
/// Detokenize an array of index tokens into a string by concatenating the text
/// of each token in order.
/// </summary>
/// <param name="rgf">Specifies the tokens; each value is truncated to an integer index.</param>
/// <param name="bIgnoreBos">When true, BOS tokens contribute no text.</param>
/// <param name="bIgnoreEos">When true, EOS tokens contribute no text.</param>
/// <returns>The concatenated text of all tokens.</returns>
public string Detokenize(float[] rgf, bool bIgnoreBos, bool bIgnoreEos)
{
    // A StringBuilder avoids re-allocating the result string once per token.
    StringBuilder sb = new StringBuilder();

    foreach (float f in rgf)
    {
        string str1 = Detokenize((int)f, bIgnoreBos, bIgnoreEos);

        // Pad tokens return null and ignored BOS/EOS return empty; both add nothing.
        if (!string.IsNullOrEmpty(str1))
            sb.Append(str1);
    }

    return sb.ToString();
}
334 }
335}
The VocabularyWord class manages the data vocabulary of words.
string Detokenize(float[] rgf, bool bIgnoreBos, bool bIgnoreEos)
Detokenize an array into a string.
List< int > Tokenize(string strWord, bool bMustExist=true)
Tokenize a word into its corresponding index tokens.
char EOS
Returns the special EOS character.
VocabularyWord(Random random, bool bAddBos, bool bAddEos)
The constructor.
int BuildFromString(string strData)
Build the vocabulary from a string.
int Count
Returns the size of the vocabulary.
char BOS
Returns the special BOS character.
string Detokenize(int nIdxToken, bool bIgnoreBos, bool bIgnoreEos)
Detokenize an index token into its corresponding word.
int[] Tokenize(string str, bool bAddBos, bool bAddEos)
Tokenize a string of data.
int Build()
Builds the vocabulary from all words added.
int[] CreateTarget(int[] rgSrc)
Create a target that is offset from the source by one and ends with an EOS.
void Add(string str)
Adds the words of a string (and any leading or trailing symbols) to the vocabulary.
The IVocabulary interface specifies the interface that all Vocabularies implement.
Definition: Interfaces.cs:14
The MyCaffe.layers.gpt namespace contains all GPT related layers.
Definition: LayerFactory.cs:15