@@ -258,7 +258,7 @@ let parallel_matrix_multiply pool a b =
  let k_n = Array.length b in
  let res = Array.make_matrix i_n j_n 0 in
- Task.parallel_for pool ~chunk_size:chunk_size ~start:0 ~finish:(i_n - 1) ~body:(fun i ->
+ Task.parallel_for pool ~start:0 ~finish:(i_n - 1) ~body:(fun i ->
    for j = 0 to j_n - 1 do
      for k = 0 to k_n - 1 do
        res.(i).(j) <- res.(i).(j) + a.(i).(k) * b.(k).(j)
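For context, the full function this hunk patches can be pieced together as a runnable sketch. The `i_n` and `j_n` bindings below are reconstructed assumptions; only the lines visible in the hunk come verbatim from the diff.

```ocaml
module Task = Domainslib.Task

(* A sketch of the patched function; i_n and j_n are assumed bindings. *)
let parallel_matrix_multiply pool a b =
  let i_n = Array.length a in            (* rows of a *)
  let j_n = Array.length b.(0) in        (* columns of b *)
  let k_n = Array.length b in            (* rows of b *)
  let res = Array.make_matrix i_n j_n 0 in
  Task.parallel_for pool ~start:0 ~finish:(i_n - 1) ~body:(fun i ->
    for j = 0 to j_n - 1 do
      for k = 0 to k_n - 1 do
        res.(i).(j) <- res.(i).(j) + a.(i).(k) * b.(k).(j)
      done
    done);
  res
```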
@@ -282,10 +282,12 @@ discussed earlier, `start` and `finish` as the names suggest are the starting
and ending values of the loop iterations, `body` contains the actual loop body
to be executed.

- One parameter that doesn't exist in the sequential version is
- the `chunk_size`. Chunk size determines the granularity of tasks when executing
- on multiple cores. The ideal `chunk_size` depends on a combination
- of factors:
+ `parallel_for` also has an optional parameter `chunk_size`. It determines the
+ granularity of tasks when executing them on multiple domains. If no value is
+ given for `chunk_size`, a default chunk size is computed that performs well
+ in most cases. It is recommended to experiment with different chunk sizes
+ only when the default does not work well. The ideal `chunk_size` depends
+ on a combination of factors:

* **Nature of the loop:** There are two things to consider pertaining to the
loop while deciding on a `chunk_size` to use, the *number of iterations* in the
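To make the new default-versus-explicit behaviour concrete, here is a minimal sketch against the same Domainslib `Task` API used in the hunks above; `n`, the pool size, and the no-op `work` body are illustrative placeholders.

```ocaml
module Task = Domainslib.Task

let n = 1_000_000
let work _i = ()  (* placeholder loop body *)

let _ =
  let pool = Task.setup_pool ~num_domains:3 in
  (* Preferred: omit ~chunk_size and let the library pick a default. *)
  Task.parallel_for pool ~start:0 ~finish:(n - 1) ~body:work;
  (* Tuning: pass ~chunk_size explicitly only when the default
     granularity turns out to perform poorly for this loop. *)
  Task.parallel_for pool ~chunk_size:(n / 4) ~start:0 ~finish:(n - 1) ~body:work;
  Task.teardown_pool pool
```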
@@ -308,16 +310,16 @@ Let us find how the parallel matrix multiplication scales on multiple cores.

The speedup vs cores is tabulated below for input matrices of size 1024x1024.

- | Cores | Time (s) | Speedup     |
- |-------|----------|-------------|
- | 1     | 10.153   | 1           |
- | 2     | 5.166    | 1.965350368 |
- | 4     | 2.65     | 3.831320755 |
- | 8     | 1.35     | 7.520740741 |
- | 12    | 0.957    | 10.6091954  |
- | 16    | 0.742    | 13.68328841 |
- | 20    | 0.634    | 16.01419558 |
- | 24    | 0.655    | 15.50076336 |
+ | Cores | Time (s) | Speedup     |
+ |-------|----------|-------------|
+ | 1     | 9.172    | 1           |
+ | 2     | 4.692    | 1.954816709 |
+ | 4     | 2.293    | 4           |
+ | 8     | 1.196    | 7.668896321 |
+ | 12    | 0.854    | 10.74004684 |
+ | 16    | 0.76     | 12.06842105 |
+ | 20    | 0.66     | 13.8969697  |
+ | 24    | 0.587    | 15.62521295 |

![matrix-graph](images/matrix_multiplication.png)
@@ -703,7 +705,7 @@ let a = Array.create_float n

let _ =
  let pool = Task.setup_pool ~num_domains:(num_domains - 1) in
- Task.parallel_for pool ~chunk_size:(n/num_domains) ~start:0
+ Task.parallel_for pool ~start:0
    ~finish:(n - 1) ~body:(fun i -> Array.set a i (Random.float 1000.));
  Task.teardown_pool pool
```
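The timings below can be gathered with a small helper; this is a sketch assuming the `Unix` library is available, and the name `time_it` is hypothetical.

```ocaml
(* Hypothetical timing helper around Unix.gettimeofday. *)
let time_it label f =
  let t0 = Unix.gettimeofday () in
  f ();
  Printf.printf "%s: %.3fs\n" label (Unix.gettimeofday () -. t0)
```

Wrapping the `parallel_for` call in such a helper reproduces measurements like the table that follows.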
@@ -713,7 +715,7 @@ Let us measure how it scales.

| #Cores | Time (s) |
|--------|----------|
| 1      | 3.136    |
- | 2     | 7.648    |
+ | 2     | 10.19    |
| 4      | 11.815   |

While we expected to see a speedup when executing on multiple cores, what we see
@@ -751,7 +753,7 @@ let arr = Array.create_float n
let _ =
  let domains = T.setup_pool ~num_domains:(num_domains - 1) in
  let states = Array.init num_domains (fun _ -> Random.State.make_self_init ()) in
- T.parallel_for domains ~chunk_size:(n/num_domains) ~start:0 ~finish:(n-1)
+ T.parallel_for domains ~start:0 ~finish:(n-1)
    ~body:(fun i ->
      let d = (Domain.self () :> int) mod num_domains in
      Array.unsafe_set arr i (Random.State.float states.(d) 100.))
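Pieced together from this hunk, here is a self-contained sketch of the fixed program; `num_domains`, `n`, and the closing `teardown_pool` are assumptions filled in for completeness, since the hunk cuts off before them.

```ocaml
module T = Domainslib.Task

let num_domains = 4    (* illustrative *)
let n = 10_000_000     (* illustrative *)
let arr = Array.create_float n

let _ =
  let domains = T.setup_pool ~num_domains:(num_domains - 1) in
  (* One independent PRNG state per domain avoids all domains
     contending on a single shared Random state. *)
  let states = Array.init num_domains (fun _ -> Random.State.make_self_init ()) in
  T.parallel_for domains ~start:0 ~finish:(n - 1)
    ~body:(fun i ->
      let d = (Domain.self () :> int) mod num_domains in
      Array.unsafe_set arr i (Random.State.float states.(d) 100.));
  T.teardown_pool domains
```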