MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
NumpyFile.cs
1using MyCaffe.basecode;
2using System;
3using System.Collections.Generic;
4using System.Diagnostics;
5using System.IO;
6using System.Linq;
7using System.Runtime.InteropServices;
8using System.Text;
9using System.Threading.Tasks;
10
11namespace MyCaffe.common
12{
17 public class NumpyFile<T> : IDisposable
18 {
19 FileStream m_fs = null;
20 BinaryReader m_br = null;
21 Type m_dataType;
22 int m_nDataTypeSize;
23 int[] m_rgShape;
24 long m_nHeaderSize;
25 int m_nCount = 0;
26 int m_nRows = 0;
27 int m_nColumns = 0;
28 int m_nFieldCount = 1;
29 Tuple<int, int> m_count;
30 Stopwatch m_sw = new Stopwatch();
31 Log m_log;
32 string m_strFile;
33
/// <summary>
/// The constructor.  No file is opened here; call OpenRead to attach the reader to a file.
/// </summary>
/// <param name="log">Specifies the output log, stored for the progress messages written by Load (Load skips progress output when this is null).</param>
public NumpyFile(Log log)
{
    this.m_log = log;
}
42
/// <summary>
/// Dispose all resources and close the file.  This simply forwards to Close.
/// </summary>
public void Dispose()
{
    this.Close();
}
50
/// <summary>
/// Close the file if open.  The binary reader is released first, then the file stream,
/// and both fields are set back to null so that the method can safely be called again.
/// </summary>
public void Close()
{
    BinaryReader reader = m_br;
    if (reader != null)
    {
        reader.Close();
        reader.Dispose();
        m_br = null;
    }

    FileStream stream = m_fs;
    if (stream != null)
    {
        stream.Close();
        stream.Dispose();
        m_fs = null;
    }
}
70
/// <summary>
/// Returns the .NET type parsed from the 'descr' entry of the numpy header by OpenRead.
/// </summary>
public Type DataType
{
    get
    {
        return m_dataType;
    }
}
78
/// <summary>
/// Return the data shape of the data in the Numpy file, as parsed from the header by OpenRead.
/// The internal array itself is returned (not a copy).
/// </summary>
public int[] Shape
{
    get
    {
        return m_rgShape;
    }
}
86
/// <summary>
/// Returns the number of rows, which OpenRead takes from the first dimension of the shape.
/// </summary>
public int Rows
{
    get
    {
        return m_nRows;
    }
}
94
/// <summary>
/// Returns the number of items per row, which OpenRead takes from the second dimension
/// of the shape (or sets to 1 when the shape has a single dimension).
/// </summary>
public int Columns
{
    get
    {
        return m_nColumns;
    }
}
102
/// <summary>
/// Returns the number of fields per column item, which OpenRead computes as the product
/// of all shape dimensions after the second one (1 when there are none).
/// </summary>
public int Fields
{
    get
    {
        return m_nFieldCount;
    }
}
110
/// <summary>
/// Returns the total number of items * fields in the data, computed as the product
/// of every dimension in the shape.
/// </summary>
public int TotalCount
{
    get
    {
        int nTotal = 1;

        foreach (int nDim in m_rgShape)
        {
            nTotal = nTotal * nDim;
        }

        return nTotal;
    }
}
126
/// <summary>
/// Open the numpy file for reading, and read in the header information.
/// </summary>
/// <remarks>
/// Only version 1.0 of the numpy file format with C (row-major) ordering is accepted.
/// Any file previously opened by this instance is closed first, and the file is closed
/// again if the header cannot be read, so a failed call never leaves a half-open reader.
/// </remarks>
/// <param name="strFile">Specifies the path of the numpy file to open.</param>
/// <exception cref="Exception">Thrown when the magic bytes or version are not recognized,
/// when the data uses fortran ordering, or when the header defines no dimensions.</exception>
/// <exception cref="EndOfStreamException">Thrown when the file ends before the header is complete.</exception>
public void OpenRead(string strFile)
{
    // Release a file left open by an earlier call so that its handle is not leaked.
    Close();

    // These are accumulated below, so they must restart from 1 on every open
    // (m_nFieldCount was previously only initialized once, at construction).
    m_nCount = 1;
    m_nFieldCount = 1;

    try
    {
        m_strFile = strFile;
        m_fs = File.OpenRead(strFile);
        m_br = new BinaryReader(m_fs);

        // Magic string: 0x93 followed by the ASCII characters 'NUMPY'.
        byte[] rgMagic = m_br.ReadBytes(6);
        bool bMagicOk = rgMagic.Length == 6 &&
                        rgMagic[0] == 0x93 && rgMagic[1] == 0x4E && rgMagic[2] == 0x55 &&
                        rgMagic[3] == 0x4D && rgMagic[4] == 0x50 && rgMagic[5] == 0x59;
        if (!bMagicOk)
            throw new Exception("The file is not a valid Numpy file!");

        byte bMajor = m_br.ReadByte();
        byte bMinor = m_br.ReadByte();

        if (bMajor != 1 || bMinor != 0)
            throw new Exception("Unsupported Numpy file format version '" + bMajor.ToString() + "." + bMinor.ToString() + "', only version 1.0 is supported!");

        // In format version 1.0 the header length is a little-endian unsigned 16-bit value.
        int nHeaderLen = m_br.ReadUInt16();

        byte[] rgHeader = m_br.ReadBytes(nHeaderLen);
        if (rgHeader.Length != nHeaderLen)
            throw new EndOfStreamException("The Numpy file header is truncated!");

        string strHeader = Encoding.ASCII.GetString(rgHeader);

        bool bFortranOrder;
        m_count = parseHeaderEx(strHeader, out bFortranOrder, out m_rgShape, out m_dataType, out m_nDataTypeSize);

        if (bFortranOrder)
            throw new Exception("Currently the fortran ordering is not supported");

        // Rows/Columns are read from the first two dimensions, so at least one must exist.
        if (m_rgShape == null || m_rgShape.Length == 0)
            throw new Exception("The Numpy file header does not define a shape with at least one dimension!");

        m_nRows = m_rgShape[0];
        m_nColumns = (m_rgShape.Length == 1) ? 1 : m_rgShape[1];

        for (int i = 0; i < m_rgShape.Length; i++)
        {
            m_nCount *= m_rgShape[i];

            // Every dimension past the second is folded into the per-column field count.
            if (i > 1)
                m_nFieldCount *= m_rgShape[i];
        }

        // The data starts immediately after the header.
        m_nHeaderSize = m_fs.Position;
    }
    catch
    {
        Close();
        throw;
    }
}
187
/// <summary>
/// Parse the text header of a numpy file (a Python dictionary literal with the keys
/// 'descr', 'fortran_order' and 'shape').
/// </summary>
/// <param name="str">Specifies the header text.</param>
/// <param name="bFortranOrder">Returns the value of the 'fortran_order' entry (false when absent).</param>
/// <param name="rgShape">Returns the dimensions listed in the 'shape' entry (empty when absent).</param>
/// <param name="dataType">Returns the .NET type matching the 'descr' entry.</param>
/// <param name="nDataTypeSize">Returns the size in bytes of one element as stored in the file.</param>
/// <param name="nMax">Optionally, specifies a maximum used to clamp the first dimension.</param>
/// <returns>A tuple containing the first dimension (Item1) and the product of all remaining dimensions (Item2).</returns>
/// <exception cref="Exception">Thrown when the header is malformed, has no 'descr' entry, or uses an unsupported data type.</exception>
private static Tuple<int, int> parseHeaderEx(string str, out bool bFortranOrder, out int[] rgShape, out Type dataType, out int nDataTypeSize, int nMax = int.MaxValue)
{
    int nNum = 1;
    int nCount = 1;
    List<int> rgShape1 = new List<int>();
    str = str.Trim('{', '}', ' ', '\n', ',');

    dataType = typeof(object);
    nDataTypeSize = 1;
    bFortranOrder = false;

    // The shape tuple contains commas, so it is cut out of the header before the
    // remaining text is split into key/value pairs on ','.
    string strTarget = "'shape':";
    int nPos = str.IndexOf(strTarget, StringComparison.Ordinal);
    if (nPos >= 0) // 'shape' may be the very first key (index 0).
    {
        string strShape = str.Substring(nPos + strTarget.Length);
        str = str.Substring(0, nPos);

        int nEnd = strShape.IndexOf(')');
        if (nEnd < 0)
            throw new Exception("Invalid header shape, '" + strShape + "'!");

        str += strShape.Substring(nEnd + 1);
        str = str.Trim(',', ' ');

        strShape = strShape.Substring(0, nEnd).Trim(' ', '(', ')');
        string[] rgShapeStr = strShape.Split(',');

        for (int i = 0; i < rgShapeStr.Length; i++)
        {
            string strDim = rgShapeStr[i];

            // A one-dimensional shape is written as '(n,)', which leaves a blank entry.
            if (string.IsNullOrWhiteSpace(strDim))
                continue;

            int nShape = int.Parse(strDim.Trim(), System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture);

            if (i == 0 && nShape > nMax)
                nShape = nMax;

            rgShape1.Add(nShape);

            if (i == 0)
                nNum = nShape;
            else
                nCount *= nShape;
        }
    }

    rgShape = rgShape1.ToArray();

    bool bFoundDescr = false;

    foreach (string strEntry in str.Split(','))
    {
        // Removing the shape from the middle of the header leaves an empty entry behind.
        if (string.IsNullOrWhiteSpace(strEntry))
            continue;

        string[] rgstrKeyVal = strEntry.Split(':');
        if (rgstrKeyVal.Length != 2)
            throw new Exception("Invalid header key value, '" + strEntry + "'!");

        string strKey = rgstrKeyVal[0].Trim('\'', ' ');
        string strVal = rgstrKeyVal[1].Trim('\'', ' ');

        switch (strKey)
        {
            case "descr":
                // The element size is taken from the file format itself rather than from
                // Marshal.SizeOf, which reports 4 for bool and throws for string.
                if (strVal == "<f4")
                {
                    dataType = typeof(float);
                    nDataTypeSize = 4;
                }
                else if (strVal == "<f8")
                {
                    dataType = typeof(double);
                    nDataTypeSize = 8;
                }
                else if (strVal == "<i4")
                {
                    dataType = typeof(int);
                    nDataTypeSize = 4;
                }
                else if (strVal == "<i8")
                {
                    dataType = typeof(long);
                    nDataTypeSize = 8;
                }
                else if (strVal == "|b1")
                {
                    dataType = typeof(bool);
                    nDataTypeSize = 1;
                }
                else if (strVal.StartsWith("<U", StringComparison.Ordinal))
                {
                    // '<Un' is a fixed-width unicode string of n characters, which numpy
                    // stores using 4 bytes per character.
                    int nChars = int.Parse(strVal.Substring(2), System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture);
                    nDataTypeSize = nChars * 4;
                    dataType = typeof(string);
                }
                else
                {
                    throw new Exception("Unsupported data type '" + strVal + "', currently only '<f4', '<f8', '<i4', '<i8', '|b1' and '<U#' are supported.");
                }
                bFoundDescr = true;
                break;

            case "fortran_order":
                bFortranOrder = bool.Parse(strVal);
                break;
        }
    }

    if (!bFoundDescr)
        throw new Exception("Invalid header, the 'descr' entry is missing!");

    return new Tuple<int, int>(nNum, nCount);
}
280
/// <summary>
/// Load a single row (or portion of a row) from the numpy file.
/// </summary>
/// <remarks>
/// The file is always positioned at the requested row and column before reading, so rows
/// may be loaded in any order.  The raw bytes are copied directly into the returned array,
/// which presumes that the size of 'T' matches the element size stored in the file.
/// </remarks>
/// <param name="rgVal">Specifies an array to fill; it is reused when its length matches the number of items read, otherwise a new array is allocated (may be null).</param>
/// <param name="nRowIdx">Specifies the zero-based index of the row to load.</param>
/// <param name="nStartIdx">Optionally, specifies the zero-based index of the first column to load (default = 0).</param>
/// <param name="nColumnCount">Optionally, specifies the number of columns to load (default = int.MaxValue, which loads through the end of the row).</param>
/// <returns>The array holding the loaded items (columns * fields), or null when an explicit column count runs past the end of the row.</returns>
/// <exception cref="Exception">Thrown when the file is not open, an index is out of range, or the request is too large for a single read.</exception>
/// <exception cref="EndOfStreamException">Thrown when the file ends before the requested data has been read.</exception>
public T[] LoadRow(T[] rgVal, int nRowIdx, int nStartIdx = 0, int nColumnCount = int.MaxValue)
{
    if (m_br == null)
        throw new Exception("The file is not open!");

    if (nRowIdx < 0 || nRowIdx >= m_nRows)
        throw new Exception("The row index '" + nRowIdx.ToString() + "' is out of range!");

    if (nStartIdx < 0 || nStartIdx >= m_nColumns)
        throw new Exception("The start index '" + nStartIdx.ToString() + "' is out of range!");

    if (nColumnCount == int.MaxValue)
        nColumnCount = m_nColumns - nStartIdx;
    else if ((long)nStartIdx + nColumnCount > m_nColumns) // 64-bit sum so the check cannot wrap around.
        return null;

    long lItemCount = (long)nColumnCount * m_nFieldCount;
    long lSize = lItemCount * m_nDataTypeSize;

    if (lSize > int.MaxValue)
        throw new Exception("The requested data is too large to load with a single read!");

    int nItemCount = (int)lItemCount;
    int nSize = (int)lSize;

    // The offset is computed with 64-bit math (files may exceed 2GB) and is applied for
    // every call, so that nRowIdx is honored even when nStartIdx is zero.
    long lOffset = m_nHeaderSize + ((long)nRowIdx * m_nColumns + nStartIdx) * m_nFieldCount * m_nDataTypeSize;
    if (m_fs.Position != lOffset)
        m_fs.Seek(lOffset, SeekOrigin.Begin);

    byte[] rgData = m_br.ReadBytes(nSize);

    // ReadBytes returns fewer bytes at the end of the file; fail instead of returning a
    // partially filled (or partially stale) row.
    if (rgData.Length != nSize)
        throw new EndOfStreamException("The file '" + m_strFile + "' ended before row '" + nRowIdx.ToString() + "' could be fully read!");

    if (rgVal == null || rgVal.Length != nItemCount)
        rgVal = new T[nItemCount];

    Buffer.BlockCopy(rgData, 0, rgVal, 0, rgData.Length);

    return rgVal;
}
324
/// <summary>
/// Load the data from the numpy file, optionally specifying the starting row index and
/// number of rows to load.
/// </summary>
/// <remarks>
/// Each returned array holds one row, i.e. one entry along the first dimension with all
/// remaining dimensions flattened.  The raw bytes are copied directly into each array,
/// which presumes that the size of 'T' matches the element size stored in the file.
/// When a log was supplied, a progress message is written roughly once per second.
/// </remarks>
/// <param name="nStartIdx">Optionally, specifies the zero-based index of the first row to load (default = 0).</param>
/// <param name="nCount">Optionally, specifies the number of rows to load; the value is clamped to the rows available after nStartIdx (default = int.MaxValue, which loads all remaining rows).</param>
/// <returns>The list of loaded rows (empty when the file contains no items).</returns>
/// <exception cref="Exception">Thrown when the file is not open, the start index is out of range, or the file contains string data.</exception>
/// <exception cref="EndOfStreamException">Thrown when the file ends before the requested rows have been read.</exception>
public List<T[]> Load(int nStartIdx = 0, int nCount = int.MaxValue)
{
    if (m_br == null)
        throw new Exception("The file is not open!");

    int nRows = m_rgShape[0];

    if (nStartIdx < 0 || nStartIdx >= nRows)
        throw new Exception("The start index '" + nStartIdx.ToString() + "' is out of range!");

    if (m_dataType == typeof(string))
        throw new Exception("String data types not supported.");

    // Clamp to the rows remaining after the start index.  Comparing against the remainder
    // (instead of adding nStartIdx + nCount) avoids overflow with the int.MaxValue default.
    int nRemaining = nRows - nStartIdx;
    if (nCount > nRemaining)
        nCount = nRemaining;

    List<T[]> rgVal = new List<T[]>();

    if (m_nCount > 0)
    {
        // Item2 is the number of items in one row (product of all dimensions after the first).
        int nItemsPerRow = m_count.Item2;
        int nBytesPerRow = nItemsPerRow * m_nDataTypeSize;

        // Always position the file at the first requested row so that the result does
        // not depend on any earlier read.
        long lSeekPos = m_nHeaderSize + (long)nStartIdx * nItemsPerRow * m_nDataTypeSize;
        if (m_fs.Position != lSeekPos)
            m_fs.Seek(lSeekPos, SeekOrigin.Begin);

        // The stopwatch must be running for the once-per-second progress check below.
        m_sw.Restart();

        for (int i = 0; i < nCount; i++)
        {
            byte[] rgItem = m_br.ReadBytes(nBytesPerRow);

            if (rgItem.Length != nBytesPerRow)
                throw new EndOfStreamException("The file '" + m_strFile + "' ended before row '" + (nStartIdx + i).ToString() + "' could be fully read!");

            // One array element per item (not per byte).
            T[] rgItemT = new T[nItemsPerRow];
            Buffer.BlockCopy(rgItem, 0, rgItemT, 0, rgItem.Length);

            if (m_log != null)
            {
                if (m_sw.Elapsed.TotalMilliseconds > 1000)
                {
                    double dfPct = (double)i / nCount;
                    string strOut = "Loading '" + m_strFile + "' at " + dfPct.ToString("P5") + "...";
                    m_log.WriteLine(strOut, true);
                    m_sw.Restart();
                }
            }

            rgVal.Add(rgItemT);
        }
    }

    return rgVal;
}
387 }
388}
The Log class provides general output in text form.
Definition: Log.cs:13
void WriteLine(string str, bool bOverrideEnabled=false, bool bHeader=false, bool bError=false, bool bDisable=false)
Write a line of output.
Definition: Log.cs:80
The NumpyFile reads data from a numpy file in the base type specified.
Definition: NumpyFile.cs:18
NumpyFile(Log log)
The constructor.
Definition: NumpyFile.cs:38
List< T[]> Load(int nStartIdx=0, int nCount=int.MaxValue)
Load the data from the numpy file, optionally specifying the starting row index and number of rows to...
Definition: NumpyFile.cs:332
T[] LoadRow(T[] rgVal, int nRowIdx, int nStartIdx=0, int nColumnCount=int.MaxValue)
Load a single row (or portion of a row) from the numpy file.
Definition: NumpyFile.cs:290
int Columns
Returns the number of items per row.
Definition: NumpyFile.cs:99
void OpenRead(string strFile)
Open the numpy file for reading, and read in the header information.
Definition: NumpyFile.cs:132
int Rows
Returns the number of rows.
Definition: NumpyFile.cs:91
void Dispose()
Dispose all resources and close the file.
Definition: NumpyFile.cs:46
int[] Shape
Return the data shape of the data in the Numpy file.
Definition: NumpyFile.cs:83
int TotalCount
Returns the total number of items * fields in the data.
Definition: NumpyFile.cs:115
int Fields
Returns the number of fields per column item.
Definition: NumpyFile.cs:107
void Close()
Close the file if open.
Definition: NumpyFile.cs:54
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
Definition: Annotation.cs:12
The MyCaffe.common namespace contains common MyCaffe classes.
Definition: BatchInput.cs:8
DataType
Specifies the base datatype corresponding to the template type 'T'. Currently, only
Definition: CudaDnn.cs:192
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...
Definition: Annotation.cs:12