MyCaffe  1.12.2.41
Deep learning software for Windows C# programmers.
Parallel.cs
1using System;
2using System.Collections.Generic;
3using System.Diagnostics;
4using System.Linq;
5using System.Text;
6using System.Threading;
7using System.Threading.Tasks;
8using MyCaffe.basecode;
9using MyCaffe.param;
10using MyCaffe.solvers;
11
12namespace MyCaffe.common
13{
18 public class Params<T>
19 {
23 protected long m_lCount;
24
28 protected long m_lExtra;
29
33 protected long m_hData;
37 protected long m_hDiff;
41 protected int m_nDeviceID;
42
47 public Params(Solver<T> root_solver)
48 {
49 m_lCount = total_size(root_solver.net.learnable_parameters);
50 m_lExtra = 1000;
51 m_hData = 0;
52 m_hDiff = 0;
53
55 }
56
60 public long count
61 {
62 get { return m_lCount; }
63 }
64
68 public long data
69 {
70 get { return m_hData; }
71 }
72
76 public long diff
77 {
78 get { return m_hDiff; }
79 }
80
81 private long total_size(BlobCollection<T> rgParam)
82 {
83 long nSize = 0;
84
85 for (int i = 0; i < rgParam.Count; i++)
86 {
87 nSize += (long)rgParam[i].count();
88 }
89
90 // Size should have at least one byte, otherwise malloc fails
91 // if net has no learnable parameters.
92 if (nSize == 0)
93 nSize++;
94
95 return nSize;
96 }
97 }
98
103 public class GPUParams<T> : Params<T>, IDisposable
104 {
112 protected Log m_log;
116 protected long m_hStream;
117
121 public enum Op
122 {
126 copy,
130 replace_gpu,
134 replace_gpu_diff
135 }
136
144 public GPUParams(CudaDnn<T> cuda, Log log, Solver<T> root_solver, int nDeviceID)
145 : base(root_solver)
146 {
147 m_cuda = cuda;
148 m_log = log;
149
150 m_nDeviceID = m_cuda.GetDeviceID();
151
152 if (nDeviceID != m_nDeviceID)
153 m_cuda.SetDeviceID(nDeviceID);
154
155 // Allocate device buffers
156 m_hData = m_cuda.AllocMemory(m_lCount);
157
158 // Copy blob values
159 BlobCollection<T> net = root_solver.net.learnable_parameters;
160 apply_buffers(net, m_hData, m_lCount, Op.copy);
161
162 m_hDiff = m_cuda.AllocMemory(m_lCount);
163 m_cuda.set((int)m_lCount, m_hDiff, 0);
164
165 m_hStream = m_cuda.CreateStream();
166
167 if (m_nDeviceID != nDeviceID)
168 m_cuda.SetDeviceID(m_nDeviceID);
169 }
170
174 public void Dispose()
175 {
176 if (m_hData != 0)
177 {
178 m_cuda.FreeMemory(m_hData);
179 m_hData = 0;
180 }
181
182 if (m_hDiff != 0)
183 {
184 m_cuda.FreeMemory(m_hDiff);
185 m_hDiff = 0;
186 }
187
188 if (m_hStream != 0)
189 {
190 m_cuda.FreeStream(m_hStream);
191 m_hStream = 0;
192 }
193 }
194
198 public void SynchronizeStream()
199 {
200 m_cuda.SynchronizeStream(m_hStream);
201 }
202
207 public void Configure(Solver<T> solver)
208 {
209 BlobCollection<T> net = solver.net.learnable_parameters;
210 apply_buffers(net, m_hData, m_lCount, Op.replace_gpu);
211 apply_buffers(net, m_hDiff, m_lCount, Op.replace_gpu_diff);
212 }
213
221 public void apply_buffers(BlobCollection<T> rgBlobs, long hBuffer, long lTotalSize, Op op)
222 {
223 long lOffset = 0;
224
225 for (int i = 0; i < rgBlobs.Count; i++)
226 {
227 int nCount = rgBlobs[i].count();
228
229 switch (op)
230 {
231 // Init buffer to current values of blobs
232 case Op.copy:
233 m_cuda.copy(nCount, rgBlobs[i].data.gpu_data, hBuffer, 0, (int)lOffset);
234 break;
235
236 case Op.replace_gpu:
237 rgBlobs[i].data.set_gpu_data(hBuffer, nCount, lOffset);
238 break;
239
240 case Op.replace_gpu_diff:
241 if (rgBlobs[i].DiffExists)
242 rgBlobs[i].diff.set_gpu_data(hBuffer, nCount, lOffset);
243 break;
244 }
245
246 lOffset += nCount;
247 }
248
249 // total_size is at least one byte
250 // We allocate extra items past the items used as a pad.
251 m_log.CHECK_EQ(lTotalSize - m_lExtra, (lOffset == 0) ? 1 : lOffset, "The total memory doesn't match.");
252 }
253 }
254
    /// <summary>
    /// The NCCL class manages the multi-GPU operations using the low-level NCCL functionality provided by the Cuda DLL.
    /// </summary>
    /// <typeparam name="T">Specifies the base type of <i>float</i> or <i>double</i>.</typeparam>
    public class NCCL<T> : GPUParams<T>, IDisposable
    {
        // Handle to the low-level NCCL instance owned by this object.
        long m_hNccl;
        // The Solver coordinated by this NCCL instance.
        Solver<T> m_solver;
        // Event set when this solver's gradients are ready for the all-reduce.
        ManualResetEvent m_evtGradientsReady = new ManualResetEvent(false);
        // One 'gradients ready' event per participating solver; all must be set before reducing.
        List<ManualResetEvent> m_rgGradientReady = new List<ManualResetEvent>();

        /// <summary>
        /// The NCCL constructor.
        /// </summary>
        /// <param name="cuda">Specifies the connection to Cuda.</param>
        /// <param name="log">Specifies the Log for output.</param>
        /// <param name="root_solver">Specifies the Solver run on this instance's device.</param>
        /// <param name="nDeviceID">Specifies the device ID used by this instance.</param>
        /// <param name="hNccl">Specifies the handle to the low-level NCCL instance.</param>
        /// <param name="rgGradientReadyEvents">Specifies the 'gradients ready' events of all participating solvers (may be null when created via Run).</param>
        public NCCL(CudaDnn<T> cuda, Log log, Solver<T> root_solver, int nDeviceID, long hNccl, List<ManualResetEvent> rgGradientReadyEvents)
            : base(cuda, log, root_solver, nDeviceID)
        {
            m_rgGradientReady = rgGradientReadyEvents;
            // This solver signals readiness on the event at its own rank.
            if (rgGradientReadyEvents != null && rgGradientReadyEvents.Count > 0)
                m_evtGradientsReady = rgGradientReadyEvents[root_solver.solver_rank];

            m_solver = root_solver;
            m_hNccl = hNccl;
            Configure(root_solver);

            root_solver.OnGradientsReady += Solver_OnGradientsReady;
        }

        /// <summary>
        /// Release all GPU and Host resources used.
        /// </summary>
        public new void Dispose()
        {
            base.Dispose();

            if (m_hNccl != 0)
            {
                m_cuda.FreeNCCL(m_hNccl);
                m_hNccl = 0;
            }
        }

        /// <summary>
        /// Broadcast the data to all other solvers participating in the multi-GPU session.
        /// </summary>
        public void Broadcast()
        {
            m_cuda.NcclBroadcast(m_hNccl, m_hStream, m_hData, (int)m_lCount);
            m_cuda.SynchronizeStream(m_hStream);
        }

        /// <summary>
        /// Fires at the end of each Solver backward pass: waits for all solvers' gradients,
        /// then sum-reduces (scaled by 1/solver_count) across all GPUs.
        /// </summary>
        /// <param name="sender">Specifies the sender (the Solver).</param>
        /// <param name="e">Specifies the event arguments.</param>
        private void Solver_OnGradientsReady(object sender, GradientsReadyArgs e)
        {
            try
            {
                // Ensure all pending work on the default stream is complete before signaling.
                m_cuda.SynchronizeStream();
                m_evtGradientsReady.Set();

                // Poll so a cancel request is noticed even while waiting on peers.
                while (!WaitHandle.WaitAll(m_rgGradientReady.ToArray(), 250))
                {
                    if (m_solver.CancelEvent.WaitOne(0))
                        return;
                }

                // Average the summed gradients across all participating solvers.
                double dfScale = 1.0 / m_solver.solver_count;
                m_cuda.NcclAllReduce(m_hNccl, m_hStream, m_hDiff, (int)m_lCount, NCCL_REDUCTION_OP.SUM, dfScale);
                m_cuda.SynchronizeStream(m_hStream);
            }
            finally
            {
                // Reset so the next iteration's handshake starts unsignaled.
                m_evtGradientsReady.Reset();
            }
        }

        /// <summary>
        /// Run the root Solver and coordinate with all other Solvers participating in the multi-GPU training.
        /// </summary>
        /// <param name="rgGpus">Specifies the GPU device IDs to use; index 0 runs on the current thread.</param>
        /// <param name="nIterationOverride">Optionally, specifies a training iteration override (default = -1, use the solver settings).</param>
        public void Run(List<int> rgGpus, int nIterationOverride = -1)
        {
            List<long> rghNccl = new List<long>();
            Guid guid = Guid.NewGuid();

            m_rgGradientReady = new List<ManualResetEvent>();

            // Create one NCCL handle and one 'ready' event per GPU.
            for (int i = 0; i < rgGpus.Count; i++)
            {
                long hNccl = m_cuda.CreateNCCL(rgGpus[i], m_solver.solver_count, i, guid);
                rghNccl.Add(hNccl);
                m_rgGradientReady.Add(new ManualResetEvent(false));
            }

            m_cuda.NcclInitializeSingleProcess(rghNccl.ToArray());
            // Rank 0 (this thread) keeps the first handle/event.
            m_hNccl = rghNccl[0];
            m_evtGradientsReady = m_rgGradientReady[0];

            List<WaitHandle> rgWaitAllInit = new List<WaitHandle>();
            List<Worker<T>> rgWorkers = new List<common.Worker<T>>();
            ManualResetEvent evtAllCreated = new ManualResetEvent(false);

            // Spin up one Worker thread per remaining GPU (ranks 1..N-1).
            for (int i = 1; i < rghNccl.Count; i++)
            {
                Worker<T> worker = new Worker<T>();

                SolverInfo<T> info = new common.SolverInfo<T>(m_solver, m_cuda.KernelHandle, rghNccl[i], i, nIterationOverride, m_cuda.Path, m_rgGradientReady, evtAllCreated);
                worker.StartInternalThread(null, null, rgGpus[i], info);

                // Wait for: cancel (first handles), error (count-2), or started (count-1).
                List<WaitHandle> rgWait = new List<WaitHandle>();
                rgWait.AddRange(m_solver.CancelEvent.Handles);
                rgWait.Add(info.ErrorEvent);
                rgWait.Add(info.StartedEvent);

                int nWait = WaitHandle.WaitAny(rgWait.ToArray());
                if (nWait < rgWait.Count - 2)
                    return;

                if (nWait == rgWait.Count - 2)
                {
                    if (info.Error != null)
                        throw info.Error;
                    else
                        throw new Exception("Error starting the solver.");
                }

                rgWaitAllInit.Add(info.InitializedEvent);
                rgWorkers.Add(worker);
            }

            // Wait for all workers to initialize
            while (!WaitHandle.WaitAll(rgWaitAllInit.ToArray(), 250))
            {
                if (m_solver.CancelEvent.WaitOne(0))
                    return;
            }

            m_cuda.SynchronizeDevice();
            evtAllCreated.Set();

            // Run first solver on current thread.
            Broadcast();

            m_solver.Solve(nIterationOverride);

            // Wait for shutdown
            for (int i = 0; i < rgWorkers.Count; i++)
            {
                rgWorkers[i].StopInternalThread();
            }
        }
    }
423
    /// <summary>
    /// The Worker manages each non-root solver, where each Worker operates on a different GPU.
    /// </summary>
    /// <typeparam name="T">Specifies the base type of <i>float</i> or <i>double</i>.</typeparam>
    public class Worker<T> : InternalThread<T>
    {
        // This thread's own CudaDnn connection (created and disposed within DoWork).
        CudaDnn<T> m_cuda;

        /// <summary>
        /// The Worker constructor.
        /// </summary>
        public Worker()
        {
            this.DoWork += Worker_DoWork;
        }

        /// <summary>
        /// The working thread function: creates a solver on this thread's device,
        /// attaches it to the shared NCCL session, and steps it until done.
        /// </summary>
        /// <param name="sender">Specifies the sender of the event.</param>
        /// <param name="e">Specifies the event arguments containing the device ID and the SolverInfo argument.</param>
        private void Worker_DoWork(object sender, ActionStateArgs<T> e)
        {
            SolverInfo<T> info = e.Arg as SolverInfo<T>;
            NCCL<T> nccl = null;

            m_cuda = new common.CudaDnn<T>(e.DeviceID, DEVINIT.CUBLAS | DEVINIT.CURAND, null, info.CudaPath);

            try
            {
                Solver<T> rank0 = info.Rank0;
                Log log = new Log("Worker solver for DeviceID = " + e.DeviceID.ToString());

                //-----------------------------------------
                // Transfer the NCCL handle from the
                // main kernel that created it to the
                // one used by the CudaDnn on this thread.
                //
                // After the copy, this thread will 'own'
                // the nccl and be responsible for its
                // destruction.
                //-----------------------------------------
                long hNccl = m_cuda.KernelCopyNccl(info.KernelHandle, info.NcclHandle);

                // Create solver and install callbacks
                SolverParameter param = rank0.parameter.Clone();
                param.device_id = e.DeviceID;
                param.type = rank0.parameter.type;
                Solver<T> solver = Solver<T>.Create(m_cuda, log, param, rank0.CancelEvent, null, null, rank0.Database, null, rank0.solver_count, info.SolverRank);
                info.StartedEvent.Set();
                log.CHECK_EQ((int)solver.type, (int)rank0.type, "The solver types should be the same.");

                //-----------------------------------------
                // Turn off logging for all other
                // operations on the worker thread.
                //-----------------------------------------
                log.Enable = false;

                nccl = new NCCL<T>(m_cuda, log, solver, e.DeviceID, hNccl, info.GradientReadyEvents);

                info.InitializedEvent.Set();
                m_cuda.SynchronizeDevice();

                // Wait for: cancel (first handles) or 'all created' (last index).
                List<WaitHandle> rgWait = new List<WaitHandle>();
                rgWait.AddRange(rank0.CancelEvent.Handles);
                rgWait.Add(info.AllCreatedEvent);

                int nWait = WaitHandle.WaitAny(rgWait.ToArray());
                if (nWait < rgWait.Count - 1)
                    return;

                // Receive the initial weights from the root solver.
                nccl.Broadcast();

                int nIterations = param.max_iter - solver.iter;
                if (info.IterationOverride > 0)
                    nIterations = info.IterationOverride;

                solver.Step(nIterations);
                solver.Dispose();
            }
            catch (Exception excpt)
            {
                // Record the error and signal the spawning thread so it can surface it.
                info.Error = excpt;
                info.ErrorEvent.Set();
            }
            finally
            {
                if (nccl != null)
                    nccl.Dispose();

                m_cuda.Dispose();
                m_cuda = null;
            }
        }
    }
514
519 public class SolverInfo<T>
520 {
521 string m_strCudaPath;
522 Solver<T> m_rank0;
523 long m_hSrcKernel;
524 long m_hSrcNccl;
525 int m_nSolverRank;
526 int m_nIterationOverride;
527 ManualResetEvent m_evtInitialized = new ManualResetEvent(false);
528 ManualResetEvent m_evtStarted = new ManualResetEvent(false);
529 ManualResetEvent m_evtAllCreated = new ManualResetEvent(false);
530 AutoResetEvent m_evtError = new AutoResetEvent(false);
531 List<ManualResetEvent> m_rgGradientReadyEvents = null;
532 Exception m_error = null;
533
545 public SolverInfo(Solver<T> rank0, long hSrcKernel, long hSrcNccl, int nSolverRank, int nIterationOverride, string strCudaPath, List<ManualResetEvent> rgGradientReadyEvents, ManualResetEvent evtAllCreated)
546 {
547 m_strCudaPath = strCudaPath;
548 m_rank0 = rank0;
549 m_hSrcKernel = hSrcKernel;
550 m_hSrcNccl = hSrcNccl;
551 m_nSolverRank = nSolverRank;
552 m_nIterationOverride = nIterationOverride;
553 m_rgGradientReadyEvents = rgGradientReadyEvents;
554 m_evtAllCreated = evtAllCreated;
555 }
556
561 {
562 get { return m_rank0; }
563 }
564
568 public string CudaPath
569 {
570 get { return m_strCudaPath; }
571 }
572
577 {
578 get { return m_nIterationOverride; }
579 }
580
584 public long KernelHandle
585 {
586 get { return m_hSrcKernel; }
587 }
588
592 public long NcclHandle
593 {
594 get { return m_hSrcNccl; }
595 }
596
600 public int SolverRank
601 {
602 get { return m_nSolverRank; }
603 }
604
608 public ManualResetEvent InitializedEvent
609 {
610 get { return m_evtInitialized; }
611 }
612
616 public ManualResetEvent StartedEvent
617 {
618 get { return m_evtStarted; }
619 }
620
624 public ManualResetEvent AllCreatedEvent
625 {
626 get { return m_evtAllCreated; }
627 }
628
632 public List<ManualResetEvent> GradientReadyEvents
633 {
634 get { return m_rgGradientReadyEvents; }
635 }
636
640 public Exception Error
641 {
642 get { return m_error; }
643 set { m_error = value; }
644 }
645
649 public AutoResetEvent ErrorEvent
650 {
651 get { return m_evtError; }
652 }
653 }
654}
WaitHandle[] Handles
Returns the internal wait handle of the CancelEvent.
Definition: CancelEvent.cs:302
bool WaitOne(int nMs=int.MaxValue)
Waits for the signal state to occur.
Definition: CancelEvent.cs:290
The Log class provides general output in text form.
Definition: Log.cs:13
bool Enable
Enables/disables the Log. When disabled, the Log does not output any data.
Definition: Log.cs:42
void CHECK_EQ(double df1, double df2, string str)
Test whether one number is equal to another.
Definition: Log.cs:239
The ActionStateArgs are sent to the DoWork event when fired from the InternalThreadEntry.
object Arg
Returns the user supplied argument.
int DeviceID
Returns the Device ID of the device to use in the thread.
The BlobCollection contains a list of Blobs.
int Count
Returns the number of items in the collection.
The CudaDnn object is the main interface to the Low-Level Cuda C++ DLL.
Definition: CudaDnn.cs:969
The GPUParams contains the connection to the low-level Cuda, and the stream associated with this inst...
Definition: Parallel.cs:104
long m_hStream
The handle to the Cuda stream used for synchronization.
Definition: Parallel.cs:116
void Configure(Solver< T > solver)
Configure the GPU Params by copying the Solver training Net parameters into the data and diff buffers...
Definition: Parallel.cs:207
GPUParams(CudaDnn< T > cuda, Log log, Solver< T > root_solver, int nDeviceID)
The GPUParams constructor.
Definition: Parallel.cs:144
void apply_buffers(BlobCollection< T > rgBlobs, long hBuffer, long lTotalSize, Op op)
Transfer between the data/diff buffers and a collection of Blobs (e.g. the learnable parameters).
Definition: Parallel.cs:221
Log m_log
The Log used for output.
Definition: Parallel.cs:112
void SynchronizeStream()
Synchronize with the Cuda stream.
Definition: Parallel.cs:198
void Dispose()
Release all GPU and Host resources used.
Definition: Parallel.cs:174
CudaDnn< T > m_cuda
The instance of CudaDnn that provides the connection to Cuda.
Definition: Parallel.cs:108
The GradientsReadyArgs is sent to the Solver::OnGradientsReady event which fires at the end of each S...
Definition: EventArgs.cs:734
The InternalThread manages an internal thread used for Parallel and data collection operations.
void StartInternalThread(CudaDnn< T > cuda, Log log, int nDeviceID=0, object arg=null, int nInitialDelay=0)
Starts running the internal thread function which then calls the DoWork event.
EventHandler< ActionStateArgs< T > > DoWork
The DoWork event is the working thread function.
The NCCL class manages the multi-GPU operations using the low-level NCCL functionality provided by th...
Definition: Parallel.cs:267
new void Dispose()
Release all GPU and Host resources used.
Definition: Parallel.cs:299
NCCL(CudaDnn< T > cuda, Log log, Solver< T > root_solver, int nDeviceID, long hNccl, List< ManualResetEvent > rgGradientReadyEvents)
The NCCL constructor.
Definition: Parallel.cs:282
void Run(List< int > rgGpus, int nIterationOverride=-1)
Run the root Solver and coordinate with all other Solver's participating in the multi-GPU training.
Definition: Parallel.cs:351
void Broadcast()
Broadcast the data to all other solvers participating in the multi-GPU session.
Definition: Parallel.cs:313
The Params contains the base parameters used in multi-GPU training.
Definition: Parallel.cs:19
long data
Returns the handle to the GPU memory containing the Net parameters.
Definition: Parallel.cs:69
Params(Solver< T > root_solver)
The Param constructor.
Definition: Parallel.cs:47
long m_lExtra
size of the padding added to the memory buffers.
Definition: Parallel.cs:28
long count
Returns the size of the buffers (in items).
Definition: Parallel.cs:61
long m_lCount
size of the buffers (in items).
Definition: Parallel.cs:23
long diff
Returns the handle to the GPU memory containing the Net gradients.
Definition: Parallel.cs:77
long m_hDiff
Handle to GPU memory containing the Net gradient.
Definition: Parallel.cs:37
long m_hData
Handle to GPU memory containing the Net parameters.
Definition: Parallel.cs:33
int m_nDeviceID
The Device ID.
Definition: Parallel.cs:41
The SolverInfo defines the user supplied arguments passed to each Worker.
Definition: Parallel.cs:520
int IterationOverride
Returns the training iteration override to use.
Definition: Parallel.cs:577
AutoResetEvent ErrorEvent
Returns the event that is set when an error occurs.
Definition: Parallel.cs:650
Exception Error
Returns the error (if any) that occurred when running the solver thread.
Definition: Parallel.cs:641
SolverInfo(Solver< T > rank0, long hSrcKernel, long hSrcNccl, int nSolverRank, int nIterationOverride, string strCudaPath, List< ManualResetEvent > rgGradientReadyEvents, ManualResetEvent evtAllCreated)
The SolverInfo constructor.
Definition: Parallel.cs:545
string CudaPath
Returns the file path to the low-level CudaDnnDll.DLL file to use. Note, when null or empty,...
Definition: Parallel.cs:569
ManualResetEvent StartedEvent
Returns the event that is set after the Worker has started running.
Definition: Parallel.cs:617
long KernelHandle
Returns a handle to the kernel where the NCCL for this Solver was created (typically this is the kern...
Definition: Parallel.cs:585
Solver< T > Rank0
Returns rank Solver that will run in the Worker.
Definition: Parallel.cs:561
List< ManualResetEvent > GradientReadyEvents
Returns the event that is set after the gradients of the Solver in this Worker are ready.
Definition: Parallel.cs:633
long NcclHandle
Returns the handle to the NCCL instance for this Solver (typically this is created on the kernel that...
Definition: Parallel.cs:593
ManualResetEvent AllCreatedEvent
Returns the event that is set after all Workers have been created.
Definition: Parallel.cs:625
ManualResetEvent InitializedEvent
Returns the event that is set after the Worker has completed initializing.
Definition: Parallel.cs:609
int SolverRank
Returns the rank of this Solver.
Definition: Parallel.cs:601
The Worker manages each non-root solver running, where each Worker operates on a different GPU.
Definition: Parallel.cs:429
Worker()
The Worker constructor.
Definition: Parallel.cs:435
The SolverParameter is a parameter for the solver, specifying the train and test networks.
int max_iter
The maximum number of iterations.
SolverParameter Clone()
Creates a new copy of the SolverParameter.
int device_id
The device id that will be used when run on the GPU.
SolverType type
Specifies the solver type.
An interface for classes that perform optimization on Nets - this class serves as the base class for ...
Definition: Solver.cs:28
void Dispose()
Discards the resources (GPU and Host) used by this Solver.
Definition: Solver.cs:218
static SGDSolver< T > Create(CudaDnn< T > cuda, Log log, ProjectEx p, CancelEvent evtCancel, AutoResetEvent evtForceSnapshot, AutoResetEvent evtForceTest, IXDatabaseBase db, IXPersist< T > persist, int nSolverCount=1, int nSolverRank=0, Net< T > shareNet=null, onGetWorkspace getws=null, onSetWorkspace setws=null)
Create a new Solver based on the project containing the SolverParameter.
Definition: Solver.cs:1889
int iter
Returns the current training iteration.
Definition: Solver.cs:1245
SolverParameter.SolverType type
Returns the type of solver.
Definition: Solver.cs:1253
Net< T > net
Returns the main training Net.
Definition: Solver.cs:1229
int solver_count
Returns the solver count in a multi-GPU session.
Definition: Solver.cs:1290
CancelEvent CancelEvent
Returns the cancel event which when set cancels the current operation run by the Solver.
Definition: Solver.cs:1205
SolverParameter parameter
Returns the SolverParameter used.
Definition: Solver.cs:1221
bool Step(int nIters, TRAIN_STEP step=TRAIN_STEP.NONE, bool bZeroDiffs=true, bool bApplyUpdates=true, bool bDisableOutput=false, bool bDisableProgress=false, double? dfLossOverride=null, bool? bAllowSnapshot=null)
Steps a set of iterations through a training cycle.
Definition: Solver.cs:818
EventHandler< GradientsReadyArgs > OnGradientsReady
The OnGradientsReady event fires after the gradients of a Solver are ready for distribution to other ...
Definition: Solver.cs:126
int solver_rank
Returns this Solver's rank in a multi-GPU session.
Definition: Solver.cs:1298
IXDatabaseBase Database
Returns the in-memory MyCaffeDatabase used.
Definition: Solver.cs:310
virtual void Solve(int nIterationOverride=-1, byte[] rgWeights=null, byte[] rgState=null, TRAIN_STEP step=TRAIN_STEP.NONE)
The main entry of the solver function. In default, iter will be zero. Pass in a non-zero iter number ...
Definition: Solver.cs:744
The MyCaffe.basecode contains all generic types used throughout MyCaffe.
Definition: Annotation.cs:12
The MyCaffe.common namespace contains common MyCaffe classes.
Definition: BatchInput.cs:8
DEVINIT
Specifies the initialization flags used when initializing CUDA.
Definition: CudaDnn.cs:207
NCCL_REDUCTION_OP
Specifies the reduction operation to use with 'Nickel' NCCL.
Definition: CudaDnn.cs:513
The MyCaffe.param namespace contains parameters used to create models.
The MyCaffe.solvers namespace contains all solver classes, including the base Solver.
The MyCaffe namespace contains the main body of MyCaffe code that closely tracks the C++ Caffe open-...
Definition: Annotation.cs:12