# ------------------------------------ snip ------------------
# ToDo: http://docs.sun.com/source/819-7480-11/appb-mca.html oob_tcp_peer_limit
# ChangeLog:
# 20.08.2008 fixed race condition which had led
#            to speed-down-jumps, MPI_ERR_TRUNCATE and dead-locks at end
#
# ToDo: ethernet kernel bypass: http://www.linux-mag.com/id/7253
#       Open-MX 117MB/s 20us (all NICs)
#       GAMMA 123MB/s 11us (only some NICs)
#
# Plot 1: MPI_Sendrecv bandwidth (data column 6) versus number of cores
# (data column 1) for seven inline data sets, on log-log axes.
#
# set perCore 0 or 1
#   perCore=0: y is column 6 as given
#   perCore=1: y is column 6 divided by column 1; label positions and the
#              y range are scaled the same way through the "/N**perCore" terms
perCore=0
#
set title "MPI_Sendrecv for maxSpeed OR saturatedSpeed(msgsize) vs. cores"
if (perCore) set title "MPI_Sendrecv for maxSpeed(msgsize) per core vs. cores"
set xlabel "Cores"
# Annotation labels; "tc lt N" uses the same line type N as the matching
# curve in the plot command below.
# GbE
set label "1p 76us" at 24+3,0.8*3410/24**perCore tc lt 2
# 3200. and 400. are written as floating point so that the division is not
# truncated to an integer when perCore=1
set label "4p 187us" at 96+3,3200./96**perCore tc lt 2
# 100MbE
set label "1p 233us" at 1.0*56,1.2*600/56**perCore tc lt 1
set label "2p 359us" at 1.2*112,400./112**perCore tc lt 1
# SMP
# comp1. 4*2Opteron-SMP: OpenMPI-1.2.6 3 other jobs
# 8 131072 18 68 259.40 4042.32 1.5% 1.9us (alone) 64K..1M
# x4600: 8*QuadOpt-2.3GHz
# 32 262144 16 108 1647.95 5090.33 0.9% 3.1us
set label "8p 1.9us" at 1.2* 8, 4042*0.8/8**perCore left tc lt 4
set label "32p 3.1us" at 1.2*32, 5090*0.9/32**perCore left tc lt 4
# QO-IB
set label "1p 9.6us" at 64+2, 32*800*0.8/32**perCore center tc lt 6
set label "8p 292us" at 500+2,256*110*0.8/256**perCore center tc lt 6
# SC
set label "1p 2.9us" at 958+2, 958*310*1.4/958**perCore center tc lt 7
set label "6p 7.4us" at 5748+2,5748*52*1.4/5748**perCore center tc lt 7
#
# Altix:
# 2032 131072 9 3.43 6706.49 39713.51 19.54 0.01% 11.4us
set label "4*510p 11us" at 2032, 2032*20*1.4/2032**perCore center tc lt 5
set label "510p 5.7us" at 510, 510*120*1.4/510**perCore center tc lt 5
#
set key left Left reverse
if (perCore) set key right Left reverse
set logscale xy
#set ytics 2
# NOTE(review): the following label is kept disabled; this copy of the file
# does not show whether it was meant to be active - confirm against the original.
# set label "constant routing" at 512, 512*6000 right
set xtics 2
set ylabel "aggregate bandwidth [MB/s]"
if (perCore) set ylabel "bandwidth per Core [MB/s]"
# NOTE(review): explicit tic list kept disabled in favour of "set xtics 2"
# above; confirm against the original which of the two was intended.
# set xtics ("1" 1,"4" 4,"16" 16,"64" 64,"256" 256,"1k" 1024,"4k" 4096,"16k" 4*4096,"64k" 65536,"256k" 262144,"1M" 1048576,"4M" 4*1024*1024,"16M" 16*1024*1024)
# Each "-" reads one inline data block terminated by a line holding "e".
# Line and point types are given with explicit lt/pt keywords; the bare
# positional form ("w lp 5 3") is deprecated and rejected by newer gnuplot.
# The constant 0.01 with an empty title lies below the y range; presumably it
# only terminates the comma-continued list.
plot [2:1.2*2**13][16/4**perCore:4*2**20/50**perCore] \
 "-" u 1:($6/$1**perCore) t "Altix4700-IA64-SMP 1.43us 2.6GB/s" w lp lt 5 pt 3,\
 "-" u 1:($6/$1**perCore) t "24*2DualOpt2.2GHz-2*GE 56us 235MB/s" w lp lt 2 pt 4,\
 "-" u 1:($6/$1**perCore) t "4*DualOpt-SMP, 8*Quad 1.5us 1.6GB/s" w lp lt 4 pt 3,\
 "-" u 1:($6/$1**perCore) t "2*Quad-Nehalem-SMP, 3.7us 3.3GB/s(sat)" w lp lt 8 pt 3,\
 "-" u 1:($6/$1**perCore) t "58*2Xeon3GHz-2*100ME 190us 23MB/s" w lp lt 1 pt 4,\
 "-" u 1:($6/$1**perCore) t "60*2QuadOpt2.1GHz-IB20G 4.8us 2.2GB/s" w lp lt 6 pt 4,\
 "-" u 1:($6/$1**perCore) t "SC 972*6MIPS-700MHz 1.45us 3GB/s" w lp lt 7 pt 4,\
 0.01 t "" w l lt 1
#
#"-" u 1:($6/$1**perCore) t "Altix4700 (bad test)" w lp 8 8,\
# Altix4700=altix4700
# localmem=8.5GB/s(shared for 2-4CPUs), link=6.4GB/s(2/Blade,1-5us)
# using MPI 1.2
# Measure speed of MPI_sendrecv transactions:
#
## !!! --- old wrong data (1 MPI_Sendrecv per loop) --- !!!
## - transfers pointer only (SMP trick)
## threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] error
# 2 1048576 18 33 125.89 16659.27 3.0% 1.4us job=198388 cput=17m wall=9m 0.5MFLOPS 2220MIPS
# 4 131072 21 50 23.84 21990.23 2.0% 2.6us job=198383 cput=40m wall=11m
# 8 262144 20 59 56.27 37271.58 1.7% job=198382 cput=1h20m all=11m a07/133..140
# 16 262144 19 36 68.66 61083.98 2.8% job=198093 cput=3h wall=11m 1MFLOPS 2260MIPS
# 32 131072 20 43 41.01 102280.15 2.3% job=197909 13MFLOPS 1124MIPS???
# 64 65536 20 27 25.75 162890.61 3.7% job 198225 a08/166..228
# 256 1048576 16 16.88 257.61 1042038.5 4070.46 2.7us 0ns!
## 508 524288 17 17.24 131.51 2025278.8 3986.77 2.8us 0ns max=6.4GB/s ## 1016 524288 17 17.40 132.72 4013422.1 3950.22 2.9us 0ns ## 2032 524288 17 17.19 131.18 8121128.0 3996.62 3.2us 0ns ## exec_host = a10/508+a11/508+a15/508+a16/508 over 4 partitions #e # ----------------------------------------------- # new good data for ALtix4700 (2 MPI_Sendrecv per loop) # ...,cpu/2,speed128k,minSpeed,maxSpeed,speed128k,2*cpu,... # threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] # 2 131072 12 1.45 353.72 741.11 370.55 0.00% interactive # 2 524288 12 5.02 1226.42 854.99 427.49 0.00% interactive 2 131072 14 1.65 100.55 2607.16 1303.58 0.00% 1.43us pr28yaSPEED.o224928 4 131072 13 1.59 193.64 2707.51 676.88 0.00% pr28yaSPEED.o223065 # 4 131072 13 1.69 206.85 2534.60 633.65 0.00% pr28yaSPEED.o224939 4 131072 12 1.27 310.21 1690.09 422.52 0.00% pr28yaSPEED.o224940 4 65536 14 1.61 98.37 2664.87 666.22 0.00% pr28yaSPEED.o224940 # 8 131072 12 1.47 358.92 2921.48 365.19 0.00% pr28yaSPEED.o223064 8 131072 12 1.08 263.45 3980.19 497.52 0.01% pr28yaSPEED.o224943 8 131072 13 3.06 373.73 2805.68 350.71 0.00% slow 8 4194304 7 1.02 7979.61 4205.02 525.63 0.01% pr28yaSPEED.o224943 8 131072 12 1.08 263.45 3980.19 497.52 0.01% pr28yaSPEED.o224943 # 16 131072 12 1.87 456.46 4594.34 287.15 0.00% pr28yaSPEED.o224953 # only one stress job per time! 
16 131072 13 2.52 308.10 6806.66 425.42 0.00% dec08 16 131072 12 1.88 459.15 4567.49 285.47 0.00% pr28yaSPEED.o223063 16 131072 13 2.52 308.10 6806.66 425.42 0.00% dec08 32 131072 12 2.56 626.02 6699.96 209.37 0.00% dec08 32 8388608 6 2.25 35188.23 7628.56 238.39 0.01% 32 131072 12 2.56 626.02 6699.96 209.37 0.00% dec08 64 131072 11 1.62 792.84 10580.50 165.32 0.01% pr28yaSPEED.o223066 64 131072 12 3.60 878.84 9545.05 149.14 0.01% dec08 64 134217728 2 2.38 596034.47 14411.81 225.18 0.00% dec08 64 16777216 4 1.15 72028.73 14907.13 232.92 0.01% pr28yaSPEED.o223066 64 131072 11 1.62 792.84 10580.50 165.32 0.01% pr28yaSPEED.o223066 128 131072 11 1.18 576.88 29082.44 227.21 0.02% pr28yaSPEED.o223067 128 32768 14 2.90 177.25 23663.11 184.87 0.01% dec08 slow 128 16777216 5 1.49 46708.63 45976.16 359.19 0.01% pr28yaSPEED.o223067 128 131072 11 1.18 576.88 29082.44 227.21 0.02% 256 131072 11 1.59 776.03 43238.84 168.90 0.01% pr28yaSPEED.o223068 256 1048576 7 2.78 21713.21 12362.77 48.29 0.01% dec08 slow 256 134217728 3 3.65 456498.94 75267.95 294.02 0.01% dec08 2**28 256 16777216 5 1.53 47679.44 90080.07 351.88 0.02% pr28yaSPEED.o223068 256 131072 12 3.19 779.88 43025.31 168.07 0.01% dec08 2**28 510 131072 10 1.05 1028.47 64996.38 127.44 0.02% pr28yaSPEED.o223382 510 262144 7 3.46 27026.79 4946.70 9.70 0.00% dec08 lowest 510 16777216 5 1.58 49272.10 173655.7 340.50 0.02% pr28yaSPEED.o223382 510 131072 10 1.05 1028.47 64996.38 127.44 0.02% pr28yaSPEED.o223382 1016 131072 11 5.81 2839.10 46905.39 46.17 0.01% 1**24 a17/508+a19/508 1016 262144 5 1.02 31737.13 8392.01 8.26 0.01% dec08 lowest #1016 1073741824 1 8.05 4025178 271024.45 266.76 0.00% 2**30 maxS 1016 16777216 3 1.36 169457.58 100589.49 99.01 0.01% 9.9us 1016 131072 11 5.81 2839.10 46905.39 46.17 0.01% 1**24 a17/508+a19/508 #a07+a12+a14+a17 2**30 #a08+a09+a10+a11 2**30 (zeitgleich!) 
2032 131072 9 3.43 6706.49 39713.51 19.54 0.01% 11.4us 2032 32768 8 2.41 9410.24 7075.76 3.48 0.01% 2032 33554432 3 3.39 423148.15 161131.76 79.30 0.01% # 2032 1073741824 1 7.94 3968903.03 549734.62 270.54 0.00% ??? 2032 131072 9 3.43 6706.49 39713.51 19.54 0.01% e ################################################################## #"-" u 1:6 t "12*2DualXeon3GHz-1GbE" w lp 1 1,\ #"-" u 1:6 t "" w lp 2 1,\ # using MPI 2.0 with 4 nodes, Jun08 leonardo-1GbE,2*DualXeon3GHz # leonardo (1 of 4 GbE used) # Measure speed of MPI_sendrecv transactions: # threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] error # 4 1048576 12 56 13671.88 306.78 1.8% # 8 2097152 10 24 23437.50 715.83 4.2% # 12 1048576 11 24 11718.75 1073.74 4.2% # #e # Measure speed of MPI_alltoall transactions: # 4 65536 16 37 564.58 464.32 2.7% # 8 65536 16 43 656.13 799.06 2.3% # 12 32760 16 25 381.47 1030.54 4.0% #e ################################################################## # ibio-cc # # ibio-cc 2*DualOpteron-275-2.2GHz L2=1MB 2*1Gb-Eth lam-7.1.2/mpich-1.2.7 64bit # PATH=$HOME/openmpi-1.2.6/bin:$PATH mpicc -O2 -o mpi_speed_ompi mpi_speed.c # PATH=$HOME/openmpi-1.2.6/bin:$PATH mpirun -v -hostfile bhostfile -np 24 ./mpi_speed_ompi # PATH=/opt/lam-7.1.2/bin:/usr/bin mpicc -o mpi_speed_lam -O2 mpi_speed.c # LAM_MPI_SSI_rpi=tcp LAM_MPI_SSI_rpi_tcp_short=1048576 default=64K no loop! # LAM_MPI_SSI_rpi=lamd (via deamon lamd) higher latencies, less tcp-traffic? # ToDo: try 7.1.5, rpi=lamd (10.0.0.x+udp) slow! enlarged lam-tcp-buffer # usysv uses SMP local, TCP remote # -ssi mpi_hostmap lam-hostmap.txt # nodeX mpi=nodeX-mpi (but odd=nodeX) # /opt/openmpi-1.2.6/mpicc -O2 -o mpi_stress mpi_stress.c # qsub -j y -cwd -pe openmpi 4-100 ./job # SGE, qrsh works! 
# for(j)MPI_Sendrecv(from i-j, to i+j) ToDo: (from/to n-i+j) # j=0: 0-0 1-4 2-3 3-2 4-1 0: 0-0 1-3 2-2 3-1 # 1: 0-1 1-0 2-4 3-3 4-2 1: 0-1 1-0 2-3 3-2 # 2: 0-2 1-1 2-0 3-4 4-3 2: 0-2 1-1 2-0 3-3 # 3: 0-3 1-2 2-1 3-0 4-4 3: 0-3 1-2 2-1 3-0 # 4: 0-4 1-3 2-2 3-1 4-0 # Measure speed of MPI_sendrecv transactions: # threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] error # 2 65536 17 96 732.42 178.96 1.0% 51.5us lam-7.1.2-tcp 2*eth1 # 4 1048576 13 116 14160.16 296.20 0.9% 51.5us # 8 1048576 11 27 13183.59 636.29 3.7% 52us # 16 1048576 13 105 12817.38 1308.94 1.0% 51us # 24 1048576 11 25 12207.03 2061.58 4.0% 52us ## 24 524288 12 37 9033.20 1392.96 2.7% (from/to n-i+j) # 48 1048576 12 104 25390.62 1982.29 1.0% 67us 2jobs/node ## 48 262144 13 48 5859.38 2147.48 2.1% odd=eth0,even=eth1 # 19.08.08: msgsize .gt. 16MB: # 50 268435456 0 2.70 2702967.9 4965.57 99.31 MPI_ERR_TRUNCATE! # btl_tcp_endpoint.c:415:mca_btl_tcp_endpoint_recv_blocking] recv() failed with errno=104 # same for 2^24 # [node6:20472] [0,0,0]-[0,1,24] mca_oob_tcp_msg_recv: readv failed: Connection reset by peer (104) # 25nodes: [node22:12113] *** MPI_ERR_TRUNCATE: message truncated # 25nodes 2^20 = ok bis 2^10 ??? # 2nd mpirun produces: A daemon on node node14 failed to start as expected (2nd mpirun) # 25*22_2_5: OK (no 2nd mpirun) # 25 1048576 8 2.22 8661.49 3026.55 121.06 # 72 262144 13 84 10253.91 1840.70 1.2% 82us 3jobs/node ## 96 hanging? 
try mpitask # 96 131072 13 55 6713.87 1874.17 1.8% 97us 4jobs/node ## 96 131072 13 46 5615.23 2240.85 2.2% 97us odd=eth0,even=eth1 ## 96 65536 14 50 3051.76 2061.58 2.0% 101us (from/to n-i+j) odd=eth0,even=eth1 # # /opt/mpich2-1.0.7/bin/mpicc -O2 -o mpi_stress_mpich mpi_stress.c # mpich-1.2.7p1 (try mpich2-1.0.7) # 2 131072 15 87 2655.03 98.73 1.1% # 4 131072 13 27 3295.90 159.07 3.7% # 8 65536 14 29 1770.02 296.20 3.4% # 16 131072 13 30 3662.11 572.66 3.3% # 24 131072 13 30 3662.11 858.99 3.3% # 48 131072 12 25 6103.52 1030.79 4.0% 88us 2jobs/node # 72 2097152 9 61 119140.62 1267.37 1.6% # hanging! 3j/n # 72*3 (netstat -atn | wc = 234) tcpdump eth1=no eth0=(123.UDP=NTP,UDP=NFS) #Proto Recv-Q Send-Q Local Address Foreign Address State #tcp 0 492 10.0.0.1:513 10.0.0.254:1023 ESTABLISHED # ibio-cc MPI_Sendrecv openmpi-1.2.6 k2.6.16 16*2Gbit/8=4GB/s max=4016MB/s 2 67108864 8 146 570312.50 235.34 0.7% 56us 4 16777216 9 62 121093.75 554.19 1.6% 56us # 4 16777216 3 1.07 135284.01 500.00 _replace 60us 8 4194304 12 120 29296.88 1145.32 0.8% 61us 16 67108864 8 120 468750.00 2290.65 0.8% 69us 24 67108864 8 121 472656.25 3407.58 0.8% 76us # 25 268435456 3 15.36 1920562.8 3494.23 139.77 92us no-nodeshift! 25 33554432 4 3.75 234593.38 3575.81 143.03 48 16777216 8 55 214843.75 3748.34 1.8% 118us 1..16M 2jobs/node # 48 262144 12 21 5126.95 2454.27 4.8% 183us from/to -i+j 72 16777216 8 77 300781.25 4016.07 1.3% 160us 3jobs/node 96 67108864 8 360 1406250.0 4581.30 0.3% 187us 1M-64M 4j/n # 96 65536 9 13 25390.62 247.79 7.7% hanging! # hanging: notraffic, sys=100% 2* strace:polling fd=1.. e #################################################################### # comp1. 
4*2Opteron-SMP: using MPI 1.2 with 2 nodes # mpich-1.2.7p1-2005/11/04 MPI_sendrecv (ToDo: try MPICH2-1.0.7) # threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] error # 2 8192 20 47 44.82 365.53 2.1% # mpich-1.2.7p1 # 4 32768 19 99 188.83 694.14 1.0% # 8 32768 23 1717 204.68 1280.74 0.1% # # OpenMPI-1.2.6 3 other jobs # LD_LIBRARY_PATH=$HOME/openmpi-1.2.6/lib ./openmpi-1.2.6/bin/mpirun -v -np 4 ./mpi_speed_ompi 20 15 2 262144 19 177 337.60 1552.98 0.6% 1.5us 4K..16M 3 131072 19 99 188.83 2082.41 1.0% 1.6us 64K..1M 4 262144 18 106 404.36 2593.19 0.9% 1.8us 64K..16M 5 262144 18 114 434.88 3014.01 0.9% 1.9us 64K..1M 6 262144 17 61 465.39 3379.65 1.6% 1.9us 4K..1M 8 131072 18 68 259.40 4042.32 1.5% 1.9us (alone) 64K..1M # x4600: 8*QuadOpt-2.3GHz 32 262144 16 108 1647.95 5090.33 0.9% 3.1us e # 2*Quad-Nehalem-4MB-2GHz-72GB=18*4GB SMP gcc-4.2.2 ompi-1.3.3 (bigger size) # worse latencies, better bandwidth especially with few cores, # strongly size dependent + processor pinning # the max. is much larger than saturation value! cpu tricks? # what to show? think most realistic is saturation value! # Measure speed of MPI_sendrecv transactions: # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast # saturated speeds (bigger msgsize) 2 2097152 10 1.31 1274.69 3290.46 1645.23 0.00% 3.7us ..64M 0x11 # 2 2097152 10 1.30 1270.35 3301.69 1650.84 0.00% 3.7us ..64M 0x55 3 2097152 10 1.87 1823.88 3449.49 1149.83 0.00% 4.8us ..64M 0x55 4 1048576 10 1.23 1205.79 3478.47 869.62 0.01% 6.0us ..64M 0x55 5 2097152 9 1.94 3781.12 2773.19 554.64 0.01% 2.6us 0xfd # 6 1048576 9 1.22 2378.43 2645.22 440.87 0.01% 3.9us 0xff 6 1048576 9 1.05 2054.96 3061.59 510.27 0.01% 3.2us 0xfd 7 1048576 9 1.38 2689.46 2729.19 389.88 0.00% 4.1us 0xff 8 1048576 9 1.64 3199.40 2621.93 327.74 0.01% 4.6us 0xff saturation! # maximum speed (SMP only? no real or special (cache-to-cache?) transfer?) 
# 2 524288 12 1.28 311.60 3365.09 1682.55 0.00% 1.5us ..64M 0xff 2 65536 16 1.28 19.59 6690.40 3345.20 0.00% 3.7us ..64M 0x11 cache+SMP-effect? # 2 65536 16 1.28 19.51 6717.39 3358.70 0.00% 3.7us ..64M 0x55 cache? # 3 524288 12 1.41 343.54 4578.41 1526.14 0.00% 1.8us ..64M 0xff 3 32768 17 1.60 12.23 8035.31 2678.44 0.00% 4.8us ..64M 0x55 # 4 262144 13 1.73 210.79 4974.61 1243.65 0.00% 2.9us ..64M 0xff 4 32768 17 1.97 15.06 8702.45 2175.61 0.00% 6.0us ..64M 0x55 5 131072 14 1.82 111.11 5898.40 1179.68 0.00% 3.0us ..64M 0xff # 5 131072 14 1.85 112.96 5801.65 1160.33 0.00% 2.6us ..64M 0xfd # 6 65536 14 1.13 68.81 5714.68 952.45 0.00% 3.9us ..64M 0xff 6 65536 15 1.97 60.22 6529.17 1088.20 0.00% 3.2us ..64M 0xfd # 7 65536 14 1.17 71.32 6432.13 918.88 0.00% 4.1us ..64M 0xff 8 65536 14 1.39 84.68 6191.74 773.97 0.00% 4.6us ..64M 0xff SMP+cache tricks? e ############################################################# # tina=56*2Xeon3GHz-100MbE OpenMPI-1.2.5 16*200Mbit/8=400MB/s max=245MB/s # !!! shows instabilities OpenMPI-1.2.6 num_nodes=31... 
16*2M # ENTERASYS Matrix E5 # Measure speed of MPI_sendrecv transactions: # threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] error 2 16777216 8 365 1425781.2 23.53 0.3% 189us 4K..16M 4 16777216 8 263 1027343.7 65.32 0.4% 214us 4K..16M 8 1048576 10 67 65429.69 128.21 1.5% 219us 16 524288 11 70 34179.69 245.43 1.4% 217us 28 524288 11 104 50781.25 289.08 1.0% 221us 32 524288 11 104 50781.25 330.38 1.0% 221us 4K..16M 56 524288 10 57 55664.06 527.45 1.8% 233us 112 131072 12 119 29052.73 505.29 0.8% 359us 2jobs/node 64K..16M # Aug08 node=32 hanging (new ompi-1.2.6 eth0+1) # threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] # 2 65536 9 1.85 3622.72 36.18 18.09 125us # 4 1048576 4 1.15 72081.25 58.19 14.55 212us # 8 1048576 4 1.08 67753.44 123.81 15.48 210us # 16 524288 9 17.50 34176.64 245.45 15.34 216us # 24 524288 9 17.71 34594.54 363.73 15.16 221us # 30 262144 10 18.68 18240.41 431.15 14.37 221us # 31 1048576 7 10.05 78543.65 413.86 13.35 1of2hanging (ok if fixed) # 31 262144 10 18.61 18176.14 447.10 14.42 221us # 32 4194304 4 14.05 878045.63 152.86 4.78 hanging # 32 16777216 1 1.79 896047.00 599.15 18.72 hanging # 112: from/to n-i+j: ERROR: MPI_ERR_TRUNCATE: message truncated (1of2) OOM? 
# 2 16777216 8 366 1429687.5 23.47 0.3% 185us 4K..16M # 4 16777216 8 262 1023437.5 65.57 0.4% 212us 4K..16M # 8 1048576 10 68 66406.25 126.32 1.5% 216us # 16 524288 11 70 34179.69 245.43 1.4% 217us # 28 524288 11 103 50292.97 291.89 1.0% 221us # 32 524288 11 104 50781.25 330.38 1.0% 225us 4K..16M # 56 65536 12 35 8544.92 429.50 2.9% 233us # 112 131072 12 118 28808.59 509.57 0.8% 355us 2jobs/node 64K..16M e # ---------------------------------------------------------------- # uni Halle x4600 2*IB/DDR2 (2*20GBbits/s) IB/Voltaire 2*24 # "18*8DualOpt2.6GHz-2*IB20G" w lp 6 6,\ # see ../y2008/hpc_antrag2007/halle_BW_vs_msgsize.gpl # localBW=410..666MB/s # 2 262144 15 12.90 393.56 1332.15 666.08 N07c4+8 ## 8 131072 13 2.62 319.42 3282.78 410.35 N07c0,2,..14 2.4us # ## IB-BW: 490..510MB/s 780MB/s(2jobs/node) 1300MB/s(15jobs/node) # 2 131072 13 2.11 256.99 1020.05 510.02 N02+N07 4.4us # 4 65536 14 2.73 166.74 1572.17 393.04 N02+N07 c6+7 # 8 262144 11 2.18 1062.50 1973.80 246.72 N02+N07 c1,2,6,10 # 16 65536 13 3.45 421.56 2487.40 155.46 N02+N07 # 30 65536 12 3.11 758.94 2590.56 86.35 N02+N07 maximum? ## 2590.56/2=1300MB/s Saettigung? ## mpich test64.o311791 ERROR ### mode[mask]: 5 (set by arg3) #### +8=MPI_Alltoall +4=MPI_Sendrecv +1=ShiftPartners +2=Pairs(~Rings) ##[29] Abort: message truncated. ask 8388608 got 16777216 at line 1198 in file viacheck.c ##[5] Abort: [GE1N11:5] Got completion with error, code=VAPI_RETRY_EXC_ERR,\ ## vendor code=81, dest host rank = 16 ## at line 2157 in file viacheck.c ## ... # # karman: IB Infiniband-Cluster Feb09 # 2 * Quad-Core AMD Opteron(tm) Processor 2352 2100MHz L2=512kB # 32GB/node, 80GB local disk, 5TB NFS # InfiniBand: Mellanox Technologies MT25204 [InfiniHost III Lx HCA] # module add dot maui torque # module add ofed/1.3.1/base/64/1.3.1 # module add ofed/1.3.1/intel/openmpi/1.2.6 ??? mpicc failes # cascade of 2 switches? 
# 2 4194304 - - - 2306 1153 4.8us mvapich2 # 2 16777216 7 1.86 14526.90 2310 1155 4.8us # 2*1 2 16777216 7 1.95 15218.33 2204.87 1102.43 4.8us 2*1 ompi # 4 524288 11 1.18 573.74 3655.26 913.81 0.01% 4 8388608 7 1.03 8029.47 4178.91 1044.73 4.9us 4*1 ompi # 4 8388608 - - - 3856 964 5.8us # 4*1 # 4 2097152 8 1.04 4043.54 2074.57 518.64 node001..4 # 8 2097152 8 1.00 3924.12 4275.41 534.43 node001..8 13us # 8 16777216 5 1.26 39303.03 3414.95 426.87 node059..66 19us 8 8388608 7 1.08 8406.46 7983.01 997.88 8*1 5us ompi 16 8388608 7 1.22 9520.97 14097.07 881.07 16*1 14us ompi # 16 4194304 - - - 10752 672 11.5us # 16*1 32 8388608 7 1.41 11045.80 24302.05 759.44 32*1 14us ompi # 32 4194304 - - - 20224 632 9.6us # 32*1 mv # 32 4194304 3 1.30 161972.40 828.65 25.90 38ms # 32*1 # 56 8388608 3 1.09 136470.50 3442.22 61.47 100ms ? # 56*1 # 60 1024 16 1.20 18.26 3364.58 56.08 (after 07.03.) # 32 8388608 - - - 4992 156 156us # 4*8 # openmpi 60 8388608 7 1.68 13134.12 38321.28 638.69 60*1 12.6..14us 120 4194304 7 1.68 13140.77 38301.89 319.18 60*2 45us #120 1048576 9 1.79 3495.80 35994.38 299.95 60*2 52us 240 1048576 8 1.70 6654.82 37815.93 157.57 60*4 131us #240 262144 10 1.84 1799.10 34970.03 145.71 60*4 165us #480 4194304 5 1.90 59367.44 33911.95 70.65 60*8 265us 480 131072 10 1.84 1799.54 34961.39 72.84 60*8 260..276us #480 131072 3 3.14 392169.00 160.43 0.33 60*8 264us ???? # mvapich2? #480 16777216 3 1.07 133533.87 60307.28 125.64 +error # 60*8 mvapich #07.03.09 22:26 send desc error ???? MV2_DEFAULT_TIME_OUT=16 ??? # [172] Abort: [] Got completion with error 12, vendor code=81, dest rank=230 # at line 519 in file ibv_channel_manager.c # 480 1048576 3 1.43 178769.11 2815.46 5.87 671s ??? 
60*8 e # -------------------------------------------------------- # SiCortex SC5832-700MHz # 2 16777216 7 1.32 10331.25 3247.86 1623.93 0.00% 2 1048576 11 1.47 718.59 2918.45 1459.22 0.00% 1.45us # 4 16777216 7 1.34 10469.66 6409.84 1602.46 0.00% 1.5us 4 1048576 11 1.52 741.82 5654.09 1413.52 0.00% # 8 16777216 7 1.44 11216.57 11966.02 1495.75 0.00% 8 1048576 11 1.57 764.53 10972.25 1371.53 0.00% # 16 16777216 7 1.62 12617.59 21274.70 1329.67 0.00% 16 1048576 11 1.84 896.90 18705.73 1169.11 0.00% 1.5us # what about lot of parallel N=16 jobs? # 32 16777216 6 1.07 16683.84 32179.10 1005.60 0.00% 32 1048576 10 1.21 1178.34 28476.10 889.88 0.00% 700MHz # 192 1048576 8 1.82 7090.57 28393.57 147.88 0.00% 2.9us !!! -N 32 # 32 1048576 10 1.21 1178.34 28476.10 889.88 0.00% 1.7us 700MHz # old values? 32 1048576 10 1.22 1189.75 28202.92 881.34 0.00% 64 1048576 10 1.48 1441.37 46559.09 727.49 0.00% 128 1048576 10 1.93 1887.97 71091.03 555.40 0.00% 256 1048576 9 1.23 2410.14 111377.38 435.07 0.00% # start=4s 512 1048576 9 1.54 3015.13 178059.22 347.77 0.00% 2.3us # 512 33554432 9 43.01 84007.06 204505.06 399.42 0.00% ppn=1 2.3us # after repair m20n13-tx2 # 1536 4194304 10 34.04 33239.35 193820.00 126.18 0.00% ppn=3 4.2us # 3072 8388608 8 34.85 136119.53 189317.46 61.63 0.00% ppn=6 6.3us 958 1048576 9 1.92 3747.18 268077.90 279.83 0.00% ppn=1 958 4194304 7 1.73 13542.50 296706.18 309.71 0.00% ppn=1 2.9us -N 958 700MHz 5748 2097152 5 1.28 39956.11 301691.77 52.49 0.00% 7.4us -N 958 # -N 958 -n 5748 # bei 64kB erste Saettigung 3*55MB/s, dann multi-rail bis 3*130MB/s (N=512) # 6*26MB/s, 6* 60MB/s # 1*175MB/s, 1*400MB/s # zu kurze messzeiten!? (580GB/s nur bei 64MB --bsize 26 + 2s)!??? # we need to make enough steps to have used all possible roots # i.e. 2^5=32 is less than 5748! have to multiply error with num nodes! # and use longer runtimes! 
# --------------------- end_Sendrecv SC ---------------------------
# "e" terminates the inline data block that precedes it
e
# pause -1 waits for the user to press return before the script continues
pause -1
# ------------- snip -------------------
#gnuplot
# Plot 2: MPI_Sendrecv bandwidth per task (data column 6 / column 1) versus
# message size (data column 2) for six inline data sets; x axis logarithmic,
# y axis linear.
#set logscale xy
#set ytics 2
# unset replaces the deprecated "set noXXX" spelling
unset label
set ytics auto
unset logscale y
set logscale x
set key left
set title "MPI_Sendrecv vs. msgsize - 1Gb-Ethernet"
set xlabel "msgsize [B]"
set ylabel "BW/task [MB/s]"
set xtics ("1" 1,"4" 4,"16" 16,"64" 64,"256" 256,"1k" 1024,"4k" 4096,"16k" 4*4096,"64k" 65536,"256k" 262144,"1M" 1048576,"4M" 4*1024*1024,"16M" 16*1024*1024,"64M" 64*1024*1024,"256M" 256*1024*1024)
# leonardo, ibio-cc
# Each "-" reads one inline data block terminated by a line holding "e".
# Line and point types are given with explicit lt/pt keywords; the bare
# positional form ("w lp 1 1") is deprecated and rejected by newer gnuplot.
# The constant 0.01 with an empty title lies below the y range minimum
# (.0625/4); presumably it only terminates the comma-continued list.
plot [1:256*1024*1024] [.0625/4:] \
 "-" u 2:($6/$1) t " 4x1: 2*DualXeon3GHz-GbE ompi-1.0.1" w lp lt 1 pt 1,\
 "-" u 2:($6/$1) t " 8x1: 2*DualXeon3GHz-GbE ompi-1.0.1" w lp lt 1 pt 2,\
 "-" u 2:($6/$1) t "14x1: 2*DualXeon3GHz-GbE ompi-1.0.1" w lp lt 1 pt 5,\
 "-" u 2:($6/$1) t "25x1: amd2.2GHz-2*GbE-ompi1.2.6" w lp lt 3 pt 1,\
 "-" u 2:($6/$1) t "25x2: amd2.2GHz-2*GbE-ompi1.2.6" w lp lt 5 pt 2,\
 "-" u 2:($6/$1) t "16x1: 2*HTXeon3GHz-100MbE-ompi1.2.6" w lp lt 4 pt 1,\
 0.01 t "" w l lt 2
#
# leonardo
# using MPI 2.0 with 4 nodes, Jun08 leonardo-1GbE,4*Xeon3GHz
# lx2.6.10-6-686-smp OMPI-1.0.1 gcc-3.5
# (near speed drops! 
n12,n14=32K) also 2jobs/node # node01-eth0 errors=22 # eth0: Tigon3 [BCM95704A41 rev 2002 PHY(serdes)] (PCIX:133MHz:64-bit) 10/100/1000BaseT # eth0: RXcsums[1] LinkChgREG[0] MIirq[0] ASF[1] Split[0] WireSpeed[1] TSOcap[0] # ompi_info --param all all # ompi_info --param btl tcp # Measure speed of MPI_sendrecv transactions: 20.8.08 # threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] error 4 268435456 0 2.50 2498537.92 429.75 107.44 0.00% 4 134217728 0 1.31 1308583.98 410.27 102.57 0.00% 4 67108864 1 1.82 908517.44 295.47 73.87 0.00% 4 33554432 2 1.57 391484.76 342.84 85.71 0.01% 4 16777216 3 1.56 195190.12 343.81 85.95 0.01% 4 8388608 4 1.75 109346.19 306.86 76.72 0.01% 4 4194304 5 1.67 52043.81 322.37 80.59 0.01% 4 2097152 6 1.67 26090.24 321.52 80.38 0.01% 4 1048576 7 1.73 13526.23 310.09 77.52 0.01% 4 524288 8 1.83 7134.70 293.94 73.48 0.02% 4 262144 9 1.54 3011.12 348.23 87.06 0.02% 4 131072 9 1.03 2008.95 260.98 65.24 0.03% 4 65536 10 1.09 1059.91 247.33 61.83 0.02% 4 32768 12 1.86 455.03 288.05 72.01 0.01% 4 16384 12 1.26 308.09 212.72 53.18 0.02% 4 8192 13 1.55 189.57 172.86 43.21 0.02% 4 4096 13 1.03 125.35 130.71 32.68 0.02% 4 2048 14 1.60 97.58 83.95 20.99 0.02% 4 1024 14 1.29 78.69 52.05 13.01 0.02% 4 512 14 1.06 64.61 31.70 7.92 0.02% 4 256 15 1.81 55.10 18.58 4.65 0.01% 4 128 15 1.63 49.69 10.30 2.58 0.01% 4 64 15 1.59 48.62 5.27 1.32 0.01% 4 32 15 1.51 46.07 2.78 0.69 0.02% 4 16 15 1.48 45.07 1.42 0.36 0.02% 4 8 15 1.48 45.02 0.71 0.18 0.02% 4 4 15 1.47 44.85 0.36 0.09 0.02% 4 2 15 1.46 44.58 0.18 0.04 0.02% 4 1 15 1.47 44.77 0.09 0.02 0.02% e # using MPI 2.0 with 8 nodes, leonardo 20.8.2008 # Measure speed of MPI_sendrecv transactions: # threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] error 8 268435456 0 2.70 2698937.97 795.68 99.46 0.00% 8 134217728 0 1.42 1418050.90 757.20 94.65 0.00% 8 67108864 0 1.17 1174354.06 457.16 57.15 0.00% 8 33554432 2 1.38 345605.25 776.71 97.09 0.01% 8 16777216 3 1.54 192749.00 696.33 87.04 0.01% 
8 8388608 4 1.59 99521.81 674.31 84.29 0.01% 8 4194304 5 1.51 47233.62 710.39 88.80 0.01% 8 2097152 6 1.51 23642.50 709.62 88.70 0.01% 8 1048576 7 1.58 12337.41 679.93 84.99 0.02% 8 524288 8 1.74 6813.19 615.62 76.95 0.02% 8 262144 9 1.99 3890.79 539.00 67.38 0.02% 8 131072 9 1.06 2072.78 505.88 63.23 0.03% 8 65536 10 1.14 1109.76 472.43 59.05 0.02% 8 32768 7 1.29 10094.52 25.97 3.25 0.01% 8 16384 12 1.26 307.55 426.18 53.27 0.02% 8 8192 13 1.58 192.84 339.84 42.48 0.02% 8 4096 13 1.00 122.34 267.85 33.48 0.02% 8 2048 14 1.55 94.83 172.78 21.60 0.02% 8 1024 14 1.27 77.51 105.69 13.21 0.02% 8 512 14 1.09 66.39 61.69 7.71 0.02% 8 256 15 1.82 55.67 36.79 4.60 0.01% 8 128 15 1.73 52.80 19.40 2.42 0.01% 8 64 15 1.62 49.41 10.36 1.30 0.02% 8 32 15 1.69 51.53 4.97 0.62 0.02% 8 16 15 1.69 51.52 2.48 0.31 0.01% 8 8 15 1.55 47.40 1.35 0.17 0.02% 8 4 15 1.51 46.02 0.70 0.09 0.02% 8 2 15 1.58 48.30 0.33 0.04 0.02% 8 1 15 1.56 47.66 0.17 0.02 0.02% e #14 nodes Senndrecv leonardo 20.8.2008 # switching src/dst # threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] 14 268435456 0 2.71 2712346.08 1385.55 98.97 0.00% 14 134217728 0 1.42 1419497.09 1323.74 94.55 0.02% 14 67108864 1 1.43 715049.03 1313.93 93.85 0.01% 14 33554432 2 1.64 409956.50 1145.88 81.85 0.01% 14 16777216 3 1.39 173135.24 1356.63 96.90 0.01% 14 8388608 4 1.59 99304.00 1182.64 84.47 0.01% 14 4194304 5 1.53 47923.16 1225.30 87.52 0.02% 14 2097152 6 1.52 23714.64 1238.06 88.43 0.02% 14 1048576 7 1.57 12259.62 1197.43 85.53 0.02% 14 524288 8 1.68 6568.22 1117.51 79.82 0.02% 14 262144 9 1.94 3792.40 967.73 69.12 0.02% 14 131072 9 1.07 2089.17 878.34 62.74 0.04% 14 65536 10 1.16 1130.61 811.51 57.96 0.03% 14 32768 8 1.01 3960.53 115.83 8.27 0.03% 14 16384 12 1.79 437.36 524.46 37.46 0.02% 14 8192 13 1.70 207.81 551.89 39.42 0.02% 14 4096 13 1.06 129.04 444.38 31.74 0.03% 14 2048 14 1.72 104.84 273.49 19.54 0.02% 14 1024 14 1.35 82.48 173.80 12.41 0.02% 14 512 14 1.19 72.64 98.68 7.05 0.02% 14 256 14 
1.20 72.94 49.14 3.51 0.02% 14 128 14 1.06 64.63 27.73 1.98 0.02% 14 64 15 1.80 54.94 16.31 1.16 0.02% 14 32 15 1.79 54.54 8.21 0.59 0.02% 14 16 15 1.73 52.93 4.23 0.30 0.02% 14 8 15 1.84 56.23 1.99 0.14 0.02% 14 4 15 1.81 55.09 1.02 0.07 0.02% 14 2 15 1.76 53.69 0.52 0.04 0.02% 14 1 15 1.84 56.06 0.25 0.02 0.02% # again 14 268435456 0 2.89 2886033.10 1302.17 93.01 0.00% 14 134217728 0 1.43 1426194.93 1317.53 94.11 0.02% 14 67108864 1 1.45 725654.45 1294.73 92.48 0.01% 14 33554432 2 1.63 406619.78 1155.29 82.52 0.01% 14 16777216 3 1.43 178433.26 1316.35 94.03 0.01% 14 8388608 4 1.58 98526.87 1191.96 85.14 0.01% 14 4194304 5 1.55 48419.16 1212.75 86.62 0.02% 14 2097152 6 1.52 23781.58 1234.57 88.18 0.02% 14 1048576 7 1.53 11966.02 1226.81 87.63 0.02% 14 524288 8 1.68 6552.42 1120.20 80.01 0.02% 14 262144 9 1.95 3801.21 965.49 68.96 0.02% 14 131072 9 1.07 2085.23 880.00 62.86 0.04% 14 65536 10 1.16 1133.87 809.18 57.80 0.04% 14 32768 9 2.60 5074.49 90.40 6.46 0.01% 14 16384 12 1.65 403.46 568.52 40.61 0.02% 14 8192 13 1.67 204.09 561.95 40.14 0.02% 14 4096 13 1.06 129.23 443.73 31.69 0.03% 14 2048 14 1.72 104.91 273.30 19.52 0.02% 14 1024 14 1.35 82.65 173.45 12.39 0.02% 14 512 14 1.19 72.63 98.70 7.05 0.02% 14 256 14 1.05 64.01 55.99 4.00 0.03% 14 128 15 1.87 57.00 31.44 2.25 0.02% 14 64 15 1.84 56.14 15.96 1.14 0.02% 14 32 15 1.85 56.45 7.94 0.57 0.02% 14 16 15 1.78 54.44 4.11 0.29 0.02% 14 8 15 1.86 56.64 1.98 0.14 0.02% 14 4 15 1.75 53.55 1.05 0.07 0.02% 14 2 15 1.79 54.66 0.51 0.04 0.02% 14 1 15 1.76 53.61 0.26 0.02 0.02% e # # ibio-cc OpenMPI-1.2.6 2*1Gb-eth 2DualOpt.-2.2GHz via SGE-qsh # openmpi-1.2.6/ompi/mpi/c/sendrecv.c # irecv(..,&req) + send + ompi_request_wait(&req,..) 
# # ToDo: Striping over multiple TCP # interconnects and here is my mca-params.conf file: # btl_tcp_if_include = eth0,eth1 # btl_tcp_max_rdma_size = 524288 # btl_tcp_latency_eth0 = 47 # btl_tcp_bandwidth_eth0 = 587 # btl_tcp_latency_eth1 = 51 # btl_tcp_bandwidth_eth1 = 233 # Something similar has to be done for openib and gm, in order to allow # us to strip the messages correctly. # # 20.8.08 # Measure speed of MPI_sendrecv transactions: # threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] 25 268435456 0 1.57 1570151.09 4274.04 170.96 0.27% 25 134217728 1 1.71 857333.42 3913.81 156.55 0.01% 25 67108864 1 1.11 554214.48 3027.21 121.09 0.01% 25 33554432 2 1.08 270599.48 3100.01 124.00 0.02% 25 16777216 4 1.85 115588.87 3628.64 145.15 0.01% 25 8388608 4 1.09 68107.87 3079.16 123.17 0.02% 25 4194304 5 1.03 32065.25 3270.13 130.81 0.03% 25 2097152 6 1.02 15969.79 3283.00 131.32 0.03% 25 1048576 7 1.02 7956.52 3294.71 131.79 0.04% 25 524288 8 1.17 4559.88 2874.46 114.98 0.04% 25 262144 9 1.30 2545.99 2574.08 102.96 0.03% 25 131072 10 1.41 1376.41 2380.69 95.23 0.03% 25 65536 11 1.66 808.97 2025.29 81.01 0.03% 25 32768 11 1.13 553.30 1480.57 59.22 0.04% 25 16384 12 1.32 322.91 1268.45 50.74 0.03% 25 8192 13 1.76 214.48 954.88 38.20 0.02% 25 4096 13 1.26 154.33 663.50 26.54 0.03% 25 2048 13 1.05 128.34 398.93 15.96 0.03% 25 1024 14 1.71 104.53 244.91 9.80 0.02% 25 512 14 1.49 90.80 140.96 5.64 0.02% 25 256 14 1.38 84.25 75.96 3.04 0.03% 25 128 14 1.33 80.88 39.57 1.58 0.03% 25 64 14 1.30 79.55 20.11 0.80 0.03% 25 32 14 1.28 78.06 10.25 0.41 0.03% 25 16 14 1.27 77.57 5.16 0.21 0.03% 25 8 14 1.27 77.67 2.57 0.10 0.03% 25 4 14 1.27 77.52 1.29 0.05 0.03% 25 2 14 1.27 77.51 0.65 0.03 0.03% 25 1 14 1.27 77.56 0.32 0.01 0.03% 25 1073741824 0 6.98 6979337.93 3846.14 153.85 0.03% 25 536870912 0 3.35 3346509.93 4010.68 160.43 0.00% 25 268435456 0 2.18 2183218.96 3073.85 122.95 0.00% 25 134217728 0 1.02 1019609.93 3290.91 131.64 0.00% 25 67108864 1 1.20 597723.01 
2806.85 112.27 0.01% 25 33554432 2 1.03 256616.47 3268.93 130.76 0.02% 25 16777216 3 1.00 125383.97 3345.17 133.81 0.02% 25 8388608 4 1.01 63282.74 3313.94 132.56 0.03% 25 4194304 5 1.06 33074.97 3170.30 126.81 0.03% 25 2097152 6 1.03 16126.55 3251.09 130.04 0.03% 25 1048576 7 1.05 8200.53 3196.67 127.87 0.04% 25 524288 8 1.19 4648.16 2819.87 112.79 0.04% 25 262144 9 1.33 2588.30 2532.00 101.28 0.04% 25 131072 10 1.43 1398.70 2342.74 93.71 0.03% 25 65536 11 1.68 822.49 1991.99 79.68 0.03% 25 32768 11 1.13 552.63 1482.38 59.30 0.04% 25 16384 12 1.33 324.10 1263.80 50.55 0.03% 25 8192 13 1.75 214.01 956.97 38.28 0.02% 25 4096 13 1.26 154.26 663.80 26.55 0.03% 25 2048 13 1.04 127.24 402.39 16.10 0.03% 25 1024 14 1.68 102.82 248.97 9.96 0.02% 25 512 14 1.46 89.40 143.18 5.73 0.03% 25 256 14 1.36 82.70 77.39 3.10 0.03% 25 128 14 1.30 79.43 40.29 1.61 0.03% 25 64 14 1.25 76.48 20.92 0.84 0.03% 25 32 14 1.24 75.69 10.57 0.42 0.03% 25 16 14 1.24 75.70 5.28 0.21 0.03% 25 8 14 1.23 75.36 2.65 0.11 0.03% 25 4 14 1.23 75.14 1.33 0.05 0.03% 25 2 14 1.23 75.28 0.66 0.03 0.03% 25 1 14 1.23 75.05 0.33 0.01 0.03% 25 268435456 0 1.64 1639766.93 4092.59 163.70 0.37% 25 134217728 1 1.97 984678.03 3407.66 136.31 0.01% 25 67108864 2 1.80 451077.22 3719.37 148.77 0.01% 25 33554432 2 1.03 256617.01 3268.92 130.76 0.02% 25 16777216 4 1.96 122432.13 3425.82 137.03 0.01% 25 8388608 4 1.01 63003.61 3328.62 133.14 0.03% 25 4194304 5 1.04 32457.97 3230.57 129.22 0.03% 25 2097152 6 1.01 15712.34 3336.79 133.47 0.04% 25 1048576 7 1.03 8071.10 3247.93 129.92 0.04% 25 524288 8 1.19 4630.09 2830.87 113.23 0.04% 25 262144 9 1.32 2568.97 2551.06 102.04 0.04% 25 131072 10 1.43 1392.89 2352.52 94.10 0.03% 25 65536 11 1.68 819.96 1998.16 79.93 0.03% 25 32768 11 1.14 556.39 1472.34 58.89 0.04% 25 16384 12 1.32 323.43 1266.41 50.66 0.03% 25 8192 13 1.76 214.30 955.69 38.23 0.02% 25 4096 13 1.28 155.83 657.11 26.28 0.03% 25 67108864 2 1.77 442867.99 3788.31 151.53 0.41% 25 33554432 3 1.93 241367.01 3475.46 
139.02 0.01% 25 16777216 3 1.02 128118.75 3273.76 130.95 0.02% 25 8388608 4 1.04 64832.31 3234.73 129.39 0.02% 25 4194304 5 1.04 32552.53 3221.18 128.85 0.03% 25 2097152 6 1.03 16060.64 3264.43 130.58 0.03% 25 1048576 7 1.05 8165.38 3210.43 128.42 0.04% 25 524288 8 1.20 4694.00 2792.33 111.69 0.04% 25 262144 9 1.33 2596.04 2524.46 100.98 0.03% 25 131072 10 1.44 1401.86 2337.46 93.50 0.03% 25 65536 11 1.69 826.17 1983.13 79.33 0.03% 25 32768 11 1.14 556.98 1470.78 58.83 0.03% 25 16384 12 1.34 326.21 1255.65 50.23 0.03% 25 8192 13 1.76 214.42 955.14 38.21 0.02% 25 4096 13 1.28 155.66 657.82 26.31 0.03% 25 2048 13 1.06 129.61 395.02 15.80 0.03% 25 1024 14 1.72 104.95 243.93 9.76 0.02% # taskset -c 0 speed-6% # 25 1048576 7 1.10 8580.15 3055.24 122.21 0.04% # 25 262144 9 1.43 2793.66 2345.88 93.84 0.03% # 25 65536 11 1.83 895.68 1829.22 73.17 0.02% # # 2nd mpirun call within SGE-qsub always produce: # error: executing task of job 21904 failed: # ERROR: A daemon on node node18 failed to start as expected. 
e # taskset -c 0,2 25*2 ibio-cc 20.8.08 # /opt/openmpi-1.2.6/bin/mpirun taskset -c 0,2 ./mpi_stress_ompi_dbg --bsize 28 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast # 50 268435456 0 3.24 3241911.89 4140.08 82.80 0.21% # 50 16777216 1 1.11 553655.03 1515.13 30.30 0.02% # 50 1048576 4 1.69 105369.18 497.57 9.95 0.03% # 50 65536 4 1.79 111890.00 29.29 0.59 0.01% # no taskset 50 268435456 0 2.57 2572445.15 5217.52 104.35 0.11% 50 134217728 0 1.73 1726869.11 3886.16 77.72 0.01% 50 67108864 0 1.32 1317403.08 2547.01 50.94 0.01% 50 33554432 2 1.70 424415.23 3953.02 79.06 0.01% 50 16777216 3 1.74 217303.51 3860.32 77.21 0.02% 50 8388608 4 1.79 111873.63 3749.14 74.98 0.02% 50 4194304 5 1.87 58472.60 3586.56 71.73 0.02% 50 2097152 5 1.04 32384.19 3237.93 64.76 0.05% 50 1048576 7 1.99 15541.87 3373.39 67.47 0.03% 50 524288 8 1.96 7669.38 3418.06 68.36 0.03% 50 262144 8 1.05 4113.81 3186.15 63.72 0.05% 50 131072 9 1.16 2258.25 2902.07 58.04 0.04% 50 65536 10 1.30 1268.41 2583.38 51.67 0.04% 50 32768 9 1.04 2024.16 809.42 16.19 0.04% 50 16384 11 1.37 670.32 1222.10 24.44 0.03% 50 8192 12 1.26 307.41 1332.42 26.65 0.04% 50 4096 13 1.74 212.66 963.04 19.26 0.03% 50 2048 13 1.45 177.56 576.72 11.53 0.03% 50 1024 13 1.24 151.19 338.66 6.77 0.03% # no taskset 50 67108864 1 1.59 795148.97 4219.89 84.40 0.31% 50 33554432 2 1.75 437940.54 3830.93 76.62 0.01% 50 16777216 3 1.86 232073.87 3614.63 72.29 0.01% 50 8388608 4 1.77 110813.87 3785.00 75.70 0.02% 50 4194304 5 1.82 57015.26 3678.23 73.56 0.02% 50 2097152 5 1.25 38957.72 2691.57 53.83 0.03% 50 1048576 6 1.19 18635.03 2813.45 56.27 0.04% 50 524288 7 1.03 8009.25 3273.02 65.46 0.04% 50 262144 8 1.05 4102.06 3195.27 63.91 0.05% 50 131072 9 1.16 2265.08 2893.32 57.87 0.04% 50 65536 10 1.30 1265.11 2590.14 51.80 0.04% 50 32768 10 1.65 1611.19 1016.89 20.34 0.03% 50 16384 11 1.05 512.34 1598.95 31.98 0.04% 50 8192 12 1.28 312.34 1311.37 26.23 0.04% 50 4096 13 1.80 219.49 933.09 18.66 0.03% 50 2048 13 
1.50 182.61 560.77 11.22 0.03% 50 1024 13 1.27 155.24 329.81 6.60 0.03% 50 67108864 1 1.44 718067.53 4672.88 93.46 0.40% 50 33554432 1 1.07 536415.93 3127.65 62.55 0.01% 50 16777216 3 1.75 218450.13 3840.06 76.80 0.02% 50 8388608 4 1.89 117986.87 3554.89 71.10 0.02% 50 4194304 4 1.14 71330.38 2940.05 58.80 0.03% 50 2097152 5 1.05 32841.09 3192.88 63.86 0.03% 50 1048576 6 1.02 15870.11 3303.62 66.07 0.04% 50 524288 7 1.00 7835.00 3345.81 66.92 0.05% 50 262144 8 1.29 5057.56 2591.61 51.83 0.04% 50 131072 9 1.16 2262.09 2897.15 57.94 0.04% 50 65536 10 1.31 1277.53 2564.94 51.30 0.04% 50 32768 9 1.12 2194.23 746.69 14.93 0.03% 50 16384 11 1.58 769.71 1064.30 21.29 0.03% 50 8192 12 1.27 309.17 1324.84 26.50 0.04% 50 4096 13 1.80 219.90 931.33 18.63 0.03% 50 2048 13 1.49 181.93 562.86 11.26 0.03% 50 1024 13 1.27 155.37 329.54 6.59 0.03% e # -------------------------------- tina -------------------------------- # tina openmpi-1.2.6 100MbE # ompi_info --param btl tcp # mpi all # mpirun --mca btl_tcp_if_exclude lo,eth0 --mca btl_base_debug 9 # --mca mpi_yield_when_idle 1, mpi_show_handle_leaks 1 # mpi_preconnect_all 1, mpi_preconnect_oob 1 # mpi_leave_pinned 1 (IB only?) # # no help! 
# # Measure speed of MPI_sendrecv transactions: # 32*2 taskset -c 0,2 # threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] # 64 67108864 0 3.56 3560389.89 1206.32 18.85 0.13% # 64 16777216 0 7.12 7124847.97 150.70 2.35 0.00% # 64 4194304 1 2.44 1218858.03 220.24 3.44 0.01% # 64 1048576 1 3.12 1559737.48 43.03 0.67 0.01% # # 64 67108864 0 6.13 6131572.09 700.47 10.94 0.48% # 64 16777216 0 2.06 2060620.96 521.08 8.14 0.01% # 64 4194304 0 2.42 2418980.92 110.97 1.73 0.00% # 64 1048576 1 2.78 1389362.51 48.30 0.75 0.01% # # threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] 32 67108864 0 3.88 3877085.96 553.89 17.31 0.24% 32 33554432 0 4.11 4109095.02 261.31 8.17 0.00% 32 16777216 0 1.76 1764898.07 304.19 9.51 0.00% 32 8388608 1 1.22 612131.01 438.53 13.70 0.01% 32 4194304 2 1.40 350329.52 383.12 11.97 0.01% 32 2097152 3 1.54 192683.11 348.29 10.88 0.02% 32 1048576 4 1.45 90619.82 370.28 11.57 0.02% 32 524288 5 1.26 39230.34 427.66 13.36 0.03% 32 262144 6 1.26 19726.59 425.24 13.29 0.03% 32 131072 7 1.33 10399.34 403.32 12.60 0.03% 32 65536 8 1.47 5723.41 366.42 11.45 0.03% 32 32768 8 2.03 7910.33 132.56 4.14 0.02% 32 16384 8 1.13 4411.03 118.86 3.71 0.03% 32 8192 10 1.43 1398.23 187.48 5.86 0.03% 32 4096 11 1.70 829.42 158.03 4.94 0.02% 32 2048 11 1.18 578.36 113.31 3.54 0.03% 32 1024 12 1.71 418.49 78.30 2.45 0.02% 32 512 12 1.32 322.73 50.77 1.59 0.03% 32 256 12 1.11 271.30 30.20 0.94 0.04% 32 128 12 1.00 245.33 16.70 0.52 0.04% 32 64 13 1.95 238.53 8.59 0.27 0.02% 32 32 13 1.90 232.18 4.41 0.14 0.02% 32 16 13 1.85 225.69 2.27 0.07 0.02% 32 8 13 1.82 222.26 1.15 0.04 0.02% 32 4 13 1.81 220.35 0.58 0.02 0.02% 32 2 13 1.83 223.83 0.29 0.01 0.02% 32 1 13 1.82 221.93 0.14 0.00 0.02% 32 134217728 0 6.98 6978298.93 615.47 19.23 0.41% 32 67108864 0 8.18 8180681.96 262.51 8.20 0.00% 32 33554432 0 2.56 2557002.02 419.92 13.12 0.00% 32 16777216 0 1.28 1281852.03 418.82 13.09 0.01% 32 8388608 2 2.01 502521.99 534.18 16.69 0.01% 32 
4194304 2 1.52 380768.26 352.49 11.02 0.01% 32 2097152 3 1.55 193894.64 346.11 10.82 0.02% 32 1048576 4 1.41 88360.25 379.75 11.87 0.02% 32 524288 5 1.35 42191.57 397.64 12.43 0.03% 32 262144 6 1.25 19600.61 427.98 13.37 0.03% 32 131072 7 1.33 10383.51 403.94 12.62 0.03% 32 65536 8 1.47 5754.11 364.46 11.39 0.03% 32 32768 8 1.87 7297.02 143.70 4.49 0.02% 32 16384 8 1.16 4526.36 115.83 3.62 0.03% 32 8192 10 1.43 1392.61 188.24 5.88 0.03% 32 4096 11 1.71 833.18 157.31 4.92 0.02% 32 2048 11 1.18 575.54 113.87 3.56 0.03% 32 1024 12 1.72 418.92 78.22 2.44 0.02% # 20.8.08 16 67108864 0 3.04 3041053.91 353.08 22.07 0.03% 16 33554432 0 1.75 1745872.06 307.51 19.22 0.00% 16 16777216 1 1.81 903929.03 296.97 18.56 0.01% 16 8388608 2 2.08 518794.24 258.71 16.17 0.01% 16 4194304 2 1.05 263654.26 254.53 15.91 0.01% 16 2097152 3 1.29 161641.39 207.59 12.97 0.01% 16 1048576 4 1.17 72857.00 230.28 14.39 0.02% 16 524288 5 1.12 35016.94 239.56 14.97 0.03% 16 262144 6 1.25 19486.14 215.25 13.45 0.03% 16 131072 7 1.35 10566.01 198.48 12.41 0.03% 16 65536 8 1.45 5665.98 185.07 11.57 0.02% 16 32768 8 1.11 4337.49 120.87 7.55 0.03% 16 16384 9 1.15 2238.58 117.10 7.32 0.03% 16 8192 10 1.29 1260.49 103.98 6.50 0.02% 16 4096 11 1.61 785.89 83.39 5.21 0.02% 16 2048 11 1.14 557.85 58.74 3.67 0.03% 16 1024 12 1.70 413.88 39.59 2.47 0.02% 16 512 12 1.28 311.81 26.27 1.64 0.03% 16 256 12 1.06 259.61 15.78 0.99 0.03% 16 128 13 1.98 241.50 8.48 0.53 0.02% 16 64 13 1.98 242.25 4.23 0.26 0.02% 16 32 13 1.93 235.82 2.17 0.14 0.02% 16 16 13 1.81 220.79 1.16 0.07 0.02% 16 8 13 1.78 217.61 0.59 0.04 0.02% 16 4 13 1.78 217.55 0.29 0.02 0.02% 16 2 13 1.82 222.16 0.14 0.01 0.02% 16 1 13 1.85 225.33 0.07 0.00 0.02% 16 134217728 0 6.07 6070886.09 353.73 22.11 0.04% 16 67108864 0 3.38 3383237.06 317.37 19.84 0.00% 16 33554432 0 1.57 1574216.09 341.04 21.32 0.00% 16 16777216 1 1.85 925273.06 290.11 18.13 0.01% 16 8388608 1 1.26 632332.55 212.26 13.27 0.01% 16 4194304 2 1.12 280621.23 239.14 14.95 0.01% 16 
2097152 3 1.08 135580.87 247.49 15.47 0.02%
16 1048576 4 1.18 73520.81 228.20 14.26 0.02%
16 524288 5 1.12 34860.03 240.64 15.04 0.03%
16 262144 6 1.22 19042.05 220.27 13.77 0.03%
16 131072 7 1.35 10524.39 199.27 12.45 0.03%
16 65536 8 1.44 5640.80 185.89 11.62 0.02%
16 32768 8 1.11 4349.77 120.53 7.53 0.03%
16 16384 9 1.15 2253.90 116.31 7.27 0.03%
16 8192 10 1.28 1254.77 104.46 6.53 0.03%
16 4096 11 1.59 778.67 84.16 5.26 0.02%
16 2048 11 1.15 559.71 58.54 3.66 0.03%
16 1024 12 1.67 408.79 40.08 2.50 0.02%
e
pause -1
# ------------- snip -------------------
#gnuplot
#
# Plot: per-task bandwidth versus message size on the SMP machines.
# Reset the labels and the y axis scaling first, then use a logarithmic
# x axis with explicit power-of-two tics.
#set logscale xy
#set ytics 2
unset label
set ytics auto
unset logscale y
set logscale x
set key left
set title "Performance(msgsize) MPI"
set xlabel "message size [B]"
set ylabel "nodeBW [MB/s]"
set xtics ("1" 1,"4" 4,"16" 16,"64" 64,"256" 256,"1k" 1024,"4k" 4096,"16k" 4*4096,"64k" 65536,"256k" 262144,"1M" 1048576,"4M" 4*1024*1024,"16M" 16*1024*1024)
# Three inline data blocks follow the plot command, each terminated by "e".
#   x = column 2 (msgsize[B]); y = column 6 (SumBW[MB/s]) / column 1 (threads).
#   The third block (MPI_alltoall run) also divides x by the thread count.
# Line and point types are given with explicit lt/pt keywords; the bare
# positional form ("w lp 1 1") is deprecated and not accepted by newer
# gnuplot releases.
plot [1:1024*1024*32] [.0625/4:] \
 "-" u 2:($6/$1) t "32: 8*QuadOpteron-SMP " w lp lt 1 pt 1,\
 "-" u 2:($6/$1) t " 8: 4*DualOpteron-SMP " w lp lt 1 pt 4,\
 "-" u ($2/$1):($6/$1) t "" w lp lt 2 pt 4,\
 0.01 t "Sendrecv" w l lt 1
# x4600 8*Quad
# Measure speed of MPI_sendrecv transactions:
# threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] error
32 1048576 14 172 10498.05 3196.25 0.6%
32 524288 15 148 4516.60 3714.57 0.7%
32 262144 16 108 1647.95 5090.33 0.9%
32 131072 17 109 831.60 5043.63 0.9%
32 65536 18 112 427.25 4908.53 0.9%
32 32768 17 29 221.25 4739.27 3.4%
32 16384 18 29 110.63 4739.27 3.4%
32 8192 19 32 61.04 4294.97 3.1%
32 4096 20 36 34.33 3817.75 2.8%
32 2048 21 32 15.26 4294.97 3.1% 3.1us
e
# comp1. 4*2opteron-SMP: use taskset to avoid task-to-CPU scheduling
# using MPI 1.2 with 2 nodes, mpich -p4pg via loop-device
# using MPI 1.2 with 8 nodes
# works badly, because sometimes 2 jobs land on the same CPU or get moved; use taskset?
# blocksize: 2^19 = 524288 (set by 2^arg1) # mintime[s]: 30 (set by arg2) # Measure speed of MPI_sendrecv transactions: # threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] error 8 33554432 8 73 285156.25 941.36 1.4% (two jobs/cpu possible!) 8 16777216 8 37 144531.25 928.64 2.7% 8 8388608 9 37 72265.62 928.64 2.7% 8 4194304 10 38 37109.38 904.20 2.6% 8 2097152 11 40 19531.25 858.99 2.5% 8 1048576 13 80 9765.62 858.99 1.2% 8 524288 13 38 4638.67 904.20 2.6% 838..904 8 262144 19 1123 2141.95 979.08 0.1% 8 131072 20 1060 1010.89 1037.28 0.1% 8 65536 21 1312 625.61 838.04 0.1% 8 32768 23 1717 204.68 1280.74 0.1% 8 16384 23 1107 131.96 993.24 0.1% 8 8192 20 60 57.22 1145.32 1.7% 8 4096 20 49 46.73 701.22 2.0% 8 2048 20 41 39.10 419.02 2.4% 8 1024 20 39 37.19 220.25 2.6% 8 512 20 37 35.29 116.08 2.7% 8 256 20 36 34.33 59.65 2.8% 8 128 20 36 34.33 29.83 2.8% 8 64 20 36 34.33 14.91 2.8% 8 32 20 36 34.33 7.46 2.8% 8 2 25 1161 34.60 0.46 0.1% 8 1 25 1161 34.60 0.23 0.1% e # Measure speed of MPI_alltoall transactions: 8 33554432 8 58 226562.50 1184.82 1.7% 8 16777216 9 60 117187.50 1145.32 1.7% 8 8388608 9 33 64453.12 1041.20 3.0% 8 4194304 10 34 33203.12 1010.58 2.9% 8 2097152 11 32 15625.00 1073.74 3.1% 8 1048576 13 64 7812.50 1073.74 1.6% 1073..1145 8 524288 13 33 4028.32 1041.20 3.0% 1010..1041 8 262144 19 1009 1924.51 1089.70 0.1% 8 131072 20 1086 1035.69 1012.44 0.1% 8 65536 22 1637 390.29 1343.33 0.1% 8 32768 22 1230 293.25 893.91 0.1% 8 16384 22 1018 242.71 540.04 0.1% 8 8192 18 60 228.88 286.33 1.7% 8 4096 18 58 221.25 148.10 1.7% 8 2048 18 34 129.70 126.32 2.9% 8 1024 18 33 125.89 65.08 3.0% 8 512 18 31 118.26 34.64 3.2% 8 256 18 31 118.26 17.32 3.2% 8 128 18 32 122.07 8.39 3.1% 8 64 18 31 118.26 4.33 3.2% 8 32 19 61 116.35 2.20 1.6% # 8 16 19 1537 2931.59 0.04 0.1% ?? # 8 8 19 1534 2925.87 0.02 0.1% ?? 8 16 18 31 118.26 1.08 3.2% lt=118..3662 ?? 8 8 18 31 118.26 0.54 3.2% lt=118..3357 ?? 
e
pause -1
#
#-----------------------------------------------------------
# Plot: per-task MPI_Sendrecv bandwidth versus message size on the
# Altix4700, one curve per task count, on log-log axes.
set ytics auto
unset logscale y
unset label
#set label "fixed nn-pairs" at 200*1024,8200 right
#set label "fixed rings" at 300*1024,2300 left
#set label "floating routes" at 160*1024,3200 left # rings + pairs
set logscale xy
set key left
set title "MPI_Sendrecv Bandwidth vs. message size"
set xlabel "message size [B]"
set ylabel "nodeBW [MB/s]"
set xtics ("1" 1,"4" 4,"16" 16,"64" 64,"256" 256,"1k" 1024,"4k" 4096,"16k" 2**14,"64k" 2**16,"256k" 2**18,"1M" 2**20,"4M" 2**22,"16M" 2**24,"64M" 2**26,"256M" 2**28)
# Eight inline data blocks follow the plot command, each terminated by "e".
#   x = column 2 (msgsize[B]); y = column 6 (SumBW[MB/s]) / column 1 (tasks).
# Line and point types are given with explicit lt/pt keywords; the bare
# positional form ("w lp 7 7") is deprecated and not accepted by newer
# gnuplot releases.
plot [1:2.**30] [.0625/4:] \
 "-" u 2:($6/$1) t " 8: Altix4700" w lp lt 7 pt 7,\
 "-" u 2:($6/$1) t " 16: Altix4700" w lp lt 3 pt 3,\
 "-" u 2:($6/$1) t " 32: Altix4700" w lp lt 5 pt 5,\
 "-" u 2:($6/$1) t " 128: Altix4700" w lp lt 6 pt 6,\
 "-" u 2:($6/$1) t " 256: Altix4700" w lp lt 1 pt 1,\
 "-" u 2:($6/$1) t " 510: Altix4700" w lp lt 2 pt 2,\
 "-" u 2:($6/$1) t "1016: Altix4700" w lp lt 4 pt 4,\
 "-" u 2:($6/$1) t "2032: Altix4700" w lp lt 8 pt 8,\
 0.01 t "" w l lt 2
#
# NCPUS=1 ToDo: memalign!?=no, full_partition510
# /lrz/sys/mpi_altix/lib/libmpi.so altix4700
# SGI MPT 1.19 - Using the XPMEM NUMAlink interconnect
# using MPI 1.2 with 32 nodes
# blocksize: 2^24 = 16777216 (set by 2^arg1)
# mintime[s]: 5 (set by arg2)
# mode[mask]: 13 (set by arg3)
# +8=MPI_Alltoall +4=MPI_Sendrecv +1=ShiftPartners +2=Pairs(~Rings)
# Measure speed of MPI_sendrecv transactions:
# new version v2.37 16Sep08
# exec_host = a06/295*8
# tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v0
8 1073741824 1 9.59 4796567.20 1790.85 223.86 0.00%
8 536870912 1 6.55 3272640.75 1312.39 164.05 0.00%
8 268435456 1 2.07 1034105.05 2076.66 259.58 0.00%
8 134217728 2 2.16 540697.59 1985.85 248.23 0.00%
8 67108864 4 3.86 241503.44 2223.04 277.88 0.00%
8 33554432 5 3.78 118248.81 2270.09 283.76 0.00%
8 16777216 6 3.90 60984.22 2200.86 275.11 0.00%
8 8388608 7 3.83 29926.95 2242.42 280.30 0.00%
8 4194304 8 3.36 13126.12 2556.31 319.54 0.00%
8 2097152 9 3.10 
6056.86 2769.95 346.24 0.00% 8 1048576 10 3.08 3011.24 2785.76 348.22 0.00% 8 524288 11 3.08 1504.74 2787.40 348.42 0.00% 8 262144 12 3.08 752.20 2788.01 348.50 0.00% 8 131072 13 3.06 373.73 2805.68 350.71 0.00% 8 65536 14 3.09 188.30 2784.39 348.05 0.00% 8 32768 15 3.14 95.82 2735.68 341.96 0.00% 8 16384 16 3.23 49.29 2659.01 332.38 0.00% 8 8192 17 3.43 26.18 2503.01 312.88 0.00% 8 4096 18 3.94 15.02 2182.19 272.77 0.00% 8 2048 18 2.59 9.88 1657.90 207.24 0.00% 8 1024 18 2.01 7.66 1069.74 133.72 0.00% 8 512 19 3.60 6.86 597.10 74.64 0.00% 8 256 19 3.31 6.32 324.24 40.53 0.00% 8 128 19 3.15 6.00 170.58 21.32 0.00% 8 64 20 3.17 3.03 169.10 21.14 0.00% 8 32 20 3.00 2.86 89.57 11.20 0.00% 8 16 20 2.93 2.79 45.85 5.73 0.00% 8 8 20 2.91 2.77 23.10 2.89 0.00% 8 4 20 3.00 2.86 11.20 1.40 0.00% 8 2 20 2.97 2.83 5.66 0.71 0.00% 8 1 20 2.95 2.81 2.85 0.36 0.00% e # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v0 16 67108864 4 2.50 156308.25 6869.39 429.34 0.00% 16 33554432 5 2.42 75562.49 7104.99 444.06 0.00% 16 16777216 6 2.40 37478.17 7162.45 447.65 0.01% 16 8388608 7 2.35 18391.90 7297.65 456.10 0.01% 16 4194304 8 2.34 9143.84 7339.24 458.70 0.01% 16 2097152 9 2.44 4774.36 7028.04 439.25 0.00% 16 1048576 10 2.48 2420.34 6931.75 433.23 0.00% 16 524288 11 2.48 1213.09 6915.09 432.19 0.00% 16 262144 12 2.50 610.65 6868.56 429.28 0.00% 16 131072 13 2.52 308.10 6806.66 425.42 0.00% 16 65536 14 2.57 157.03 6677.52 417.34 0.00% 16 32768 15 2.82 86.15 6085.71 380.36 0.00% 16 16384 16 3.00 45.81 5722.59 357.66 0.00% 16 8192 17 3.37 25.67 5105.25 319.08 0.00% 16 4096 17 2.14 16.30 4021.30 251.33 0.01% 16 2048 18 3.00 11.46 2860.54 178.78 0.00% 16 1024 18 2.44 9.32 1757.20 109.82 0.00% 16 512 18 2.24 8.54 959.02 59.94 0.00% 16 256 18 2.04 7.80 525.15 32.82 0.01% 16 128 19 3.97 7.58 270.15 16.88 0.00% 16 64 20 3.72 3.55 288.85 18.05 0.00% 16 32 20 3.51 3.34 153.11 9.57 0.00% 16 16 20 3.42 3.26 78.52 4.91 0.00% 16 8 20 3.39 3.23 39.58 2.47 0.00% 16 4 20 
3.46 3.30 19.42 1.21 0.00% 16 2 20 3.45 3.29 9.73 0.61 0.00% 16 1 20 3.42 3.26 4.91 0.31 0.00% e # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v0 32 16777216 4 1.15 72034.72 7452.95 232.90 0.01% 32 8388608 5 1.13 35381.68 7586.85 237.09 0.02% 32 4194304 6 1.21 18874.93 7110.90 222.22 0.01% 32 2097152 7 1.33 10356.80 6479.69 202.49 0.01% 32 1048576 8 1.31 5130.88 6539.71 204.37 0.01% 32 524288 9 1.33 2604.43 6441.79 201.31 0.01% 32 262144 10 1.32 1292.16 6491.93 202.87 0.01% 32 131072 11 1.33 648.26 6470.08 202.19 0.01% 32 65536 12 1.34 326.57 6421.76 200.68 0.01% 32 32768 13 1.43 174.80 5998.61 187.46 0.01% 32 16384 14 1.47 89.99 5826.32 182.07 0.01% 32 8192 15 1.57 47.96 5465.61 170.80 0.01% 32 4096 16 1.75 26.67 4915.38 153.61 0.01% 32 2048 16 1.08 16.44 3985.83 124.56 0.01% 32 1024 17 1.56 11.88 2757.31 86.17 0.01% 32 512 17 1.34 10.20 1606.68 50.21 0.01% 32 256 17 1.20 9.17 892.91 27.90 0.01% 32 128 17 1.16 8.88 461.20 14.41 0.01% 32 64 18 1.04 3.97 515.42 16.11 0.01% 32 32 18 1.00 3.83 267.42 8.36 0.02% 32 16 19 2.01 3.83 133.65 4.18 0.01% 32 8 18 1.01 3.87 66.18 2.07 0.02% 32 4 18 1.04 3.98 32.14 1.00 0.01% 32 2 19 2.01 3.83 16.70 0.52 0.01% 32 1 19 2.04 3.89 8.23 0.26 0.01% # only one stress job 32 67108864 3 2.02 252909.44 8491.12 265.35 0.01% 32 33554432 4 2.43 152009.91 7063.63 220.74 0.00% 32 16777216 5 2.28 71114.22 7549.42 235.92 0.01% 32 8388608 6 2.25 35188.23 7628.56 238.39 0.01% 32 4194304 7 2.30 17930.96 7485.25 233.91 0.01% 32 2097152 8 2.49 9707.57 6913.04 216.03 0.00% 32 1048576 9 2.57 5022.98 6680.18 208.76 0.00% 32 524288 10 2.54 2484.90 6751.66 210.99 0.00% 32 262144 11 2.55 1244.78 6739.01 210.59 0.00% 32 131072 12 2.56 626.02 6699.96 209.37 0.00% 32 65536 13 2.57 314.31 6672.26 208.51 0.01% 32 32768 14 2.78 169.56 6184.04 193.25 0.01% 32 16384 15 2.85 86.87 6035.28 188.60 0.01% 32 8192 16 2.99 45.63 5745.04 179.53 0.00% 32 4096 17 3.30 25.19 5202.44 162.58 0.00% 32 2048 17 2.02 15.38 4260.16 133.13 0.01% 32 
1024 18 2.82 10.76 3045.06 95.16 0.01% 32 512 18 2.34 8.92 1836.53 57.39 0.01% 32 256 18 2.07 7.91 1035.22 32.35 0.01% 32 128 18 2.01 7.66 534.54 16.70 0.01% 32 64 20 3.66 3.49 586.17 18.32 0.00% 32 32 20 3.50 3.34 307.03 9.59 0.00% 32 16 20 3.43 3.27 156.47 4.89 0.00% 32 8 20 3.41 3.25 78.72 2.46 0.00% 32 4 20 3.48 3.32 38.52 1.20 0.00% 32 2 20 3.47 3.31 19.36 0.60 0.00% 32 1 20 3.45 3.29 9.72 0.30 0.00% e # dec08 a14/... # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v0 128 1048576 9 2.60 5068.44 26481.07 206.88 0.01% 128 524288 10 2.54 2476.58 27097.41 211.70 0.01% 128 262144 11 2.55 1244.22 26968.34 210.69 0.01% 128 131072 12 2.57 626.80 26766.51 209.11 0.01% 128 65536 13 2.85 347.47 24142.07 188.61 0.01% 128 32768 14 2.90 177.25 23663.11 184.87 0.01% 128 16384 15 2.82 86.09 24360.53 190.32 0.01% 128 8192 16 3.02 46.08 22756.07 177.78 0.01% 128 4096 17 3.44 26.26 19961.82 155.95 0.01% 128 2048 17 2.24 17.09 15339.80 119.84 0.01% 128 1024 18 3.39 12.94 10133.01 79.16 0.01% 128 512 18 3.00 11.45 5721.50 44.70 0.01% 128 256 18 2.81 10.74 3051.49 23.84 0.01% 128 128 18 2.65 10.11 1620.60 12.66 0.01% 128 64 19 2.36 4.51 1818.24 14.21 0.01% 128 32 19 2.28 4.34 943.92 7.37 0.01% 128 16 19 2.24 4.27 479.77 3.75 0.01% 128 8 19 2.23 4.25 240.75 1.88 0.01% 128 4 19 2.26 4.30 118.96 0.93 0.01% 128 2 19 2.26 4.32 59.32 0.46 0.01% 128 1 19 2.25 4.29 29.86 0.23 0.01% # star 2**28 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v0 128 268435456 2 3.71 928625.22 37000.65 289.07 0.00% 128 134217728 2 2.07 518692.80 33121.47 258.76 0.00% 128 67108864 3 2.16 270213.99 31789.38 248.35 0.01% 128 33554432 4 2.67 166720.90 25761.42 201.26 0.01% 128 16777216 5 2.72 85141.12 25222.64 197.05 0.01% 128 8388608 6 2.44 38070.10 28204.33 220.35 0.01% 128 4194304 7 2.51 19605.67 27383.46 213.93 0.01% 128 2097152 8 2.57 10023.92 26779.48 209.21 0.01% 128 1048576 9 2.57 5024.54 26712.46 208.69 0.01% 128 524288 10 2.56 2502.30 26818.90 
209.52 0.01% 128 262144 11 2.57 1256.01 26715.19 208.71 0.01% 128 131072 12 2.61 637.14 26331.87 205.72 0.01% 128 65536 13 2.77 337.54 24851.84 194.16 0.01% 128 32768 14 2.75 167.90 24980.78 195.16 0.01% 128 16384 15 2.82 86.16 24340.06 190.16 0.01% 128 8192 16 3.03 46.21 22689.19 177.26 0.01% 128 4096 17 3.44 26.26 19962.33 155.96 0.01% 128 2048 17 2.24 17.11 15325.07 119.73 0.01% 128 1024 18 3.39 12.92 10146.20 79.27 0.01% 128 512 18 3.00 11.45 5724.45 44.72 0.01% 128 256 18 2.82 10.75 3047.02 23.80 0.01% 128 128 18 2.65 10.11 1621.14 12.67 0.01% 128 64 19 2.36 4.50 1820.82 14.23 0.01% 128 32 19 2.27 4.33 945.60 7.39 0.01% 128 16 19 2.23 4.26 480.65 3.76 0.01% 128 8 19 2.23 4.26 240.55 1.88 0.01% 128 4 19 2.25 4.30 119.07 0.93 0.01% 128 2 19 2.25 4.29 59.64 0.47 0.01% 128 1 19 2.24 4.28 29.92 0.23 0.01% 128 1073741824 1 6.93 3463351.88 39683.80 310.03 0.00% 128 536870912 1 3.93 1965081.22 34970.30 273.21 0.00% 128 268435456 1 2.29 1147415.08 29945.34 233.95 0.00% 128 134217728 2 2.31 577826.67 29731.87 232.28 0.00% 128 67108864 3 2.69 335884.10 25574.10 199.80 0.01% 128 33554432 4 2.65 165626.76 25931.60 202.59 0.01% 128 16777216 5 2.65 82857.31 25917.85 202.48 0.01% 128 8388608 6 2.37 37008.26 29013.57 226.67 0.01% 128 4194304 7 2.50 19511.84 27515.13 214.96 0.01% 128 2097152 8 2.57 10044.47 26724.71 208.79 0.01% 128 1048576 9 2.58 5035.93 26652.03 208.22 0.01% 128 524288 10 2.55 2492.17 26927.86 210.37 0.01% 128 262144 11 2.56 1250.93 26823.60 209.56 0.01% 128 131072 12 2.62 638.78 26264.67 205.19 0.01% 128 65536 13 2.77 338.13 24809.02 193.82 0.01% 128 32768 14 2.82 172.16 24363.02 190.34 0.01% 128 16384 15 2.88 88.01 23829.79 186.17 0.01% 128 8192 16 3.14 47.87 21903.50 171.12 0.01% 128 4096 17 3.44 26.24 19980.17 156.10 0.01% 128 2048 17 2.24 17.08 15352.12 119.94 0.01% 128 1024 18 3.39 12.92 10148.58 79.29 0.01% 128 512 18 3.10 11.83 5539.18 43.27 0.01% 128 256 18 3.00 11.46 2859.26 22.34 0.01% 128 128 18 2.71 10.34 1585.10 12.38 0.01% 128 64 19 2.37 4.52 
1811.55 14.15 0.01% 128 32 19 2.27 4.34 944.25 7.38 0.01% 128 16 19 2.24 4.27 480.10 3.75 0.01% 128 8 19 2.23 4.25 241.09 1.88 0.01% 128 4 19 2.25 4.30 119.17 0.93 0.01% 128 2 19 2.53 4.82 53.11 0.41 0.01% 128 1 19 2.41 4.60 27.83 0.22 0.01% e # dec08 exec_host = a13/232*256 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v0 256 268435456 1 2.02 1010852.67 67981.69 265.55 0.00% 256 134217728 3 3.65 456498.94 75267.95 294.02 0.01% 256 67108864 3 2.35 293622.10 58510.14 228.56 0.01% 256 33554432 4 2.86 178826.67 48034.98 187.64 0.01% 256 16777216 5 2.68 83825.42 51237.05 200.14 0.01% 256 8388608 6 3.50 54634.00 39306.73 153.54 0.02% 256 4194304 7 3.33 26021.44 41263.74 161.19 0.01% 256 2097152 7 3.05 23810.69 22547.47 88.08 0.01% 256 1048576 7 2.78 21713.21 12362.77 48.29 0.01% 256 524288 10 3.15 3075.90 43635.22 170.45 0.01% 256 262144 11 3.19 1559.63 43028.75 168.08 0.01% 256 131072 12 3.19 779.88 43025.31 168.07 0.01% 256 65536 12 2.29 560.08 29955.27 117.01 0.01% 256 32768 14 3.58 218.53 38386.47 149.95 0.01% 256 16384 15 3.57 108.99 38481.75 150.32 0.01% 256 8192 16 3.84 58.55 35816.03 139.91 0.01% 256 4096 16 2.10 31.98 32792.47 128.10 0.01% 256 2048 17 2.67 20.38 25726.45 100.49 0.01% 256 1024 18 3.91 14.93 17559.15 68.59 0.01% 256 512 17 2.51 19.12 6856.38 26.78 0.01% 256 256 18 3.20 12.21 5368.13 20.97 0.01% 256 128 18 3.00 11.43 2867.87 11.20 0.01% 256 64 19 2.71 5.17 3168.33 12.38 0.01% 256 32 19 2.61 4.99 1642.47 6.42 0.01% 256 16 19 2.60 4.95 827.54 3.23 0.01% 256 8 19 2.58 4.92 416.30 1.63 0.01% 256 4 19 2.62 4.99 205.16 0.80 0.01% 256 2 19 2.58 4.92 104.16 0.41 0.01% 256 1 19 2.59 4.94 51.77 0.20 0.01% # exec_host = a13/232*256 256 268435456 1 2.03 1015303.70 67683.67 264.39 0.00% 256 134217728 3 3.94 492667.31 69742.27 272.43 0.00% 256 67108864 3 2.53 316300.11 54315.09 212.17 0.01% 256 33554432 4 2.71 169525.33 50670.51 197.93 0.01% 256 16777216 5 2.65 82762.60 51895.03 202.71 0.02% 256 8388608 6 3.50 54715.29 39248.33 
153.31 0.01% 256 4194304 7 3.32 25937.84 41396.73 161.71 0.01% 256 2097152 7 3.00 23428.74 22915.06 89.51 0.01% 256 1048576 7 2.77 21638.11 12405.68 48.46 0.01% 256 524288 10 3.13 3059.27 43872.51 171.38 0.01% 256 262144 11 3.13 1529.88 43865.50 171.35 0.01% 256 131072 12 3.20 781.02 42962.15 167.82 0.01% 256 65536 12 2.30 561.31 29889.62 116.76 0.01% 256 32768 14 3.59 219.03 38299.50 149.61 0.01% 256 16384 15 3.57 109.06 38457.61 150.23 0.01% 256 8192 15 2.06 62.93 33325.16 130.18 0.01% 256 4096 16 2.10 32.05 32719.01 127.81 0.02% 256 2048 17 2.69 20.51 25561.62 99.85 0.01% 256 1024 18 3.92 14.96 17523.33 68.45 0.01% 256 512 17 2.49 19.01 6893.43 26.93 0.01% 256 256 18 3.20 12.20 5369.74 20.98 0.01% 256 128 18 3.00 11.43 2866.94 11.20 0.01% 256 64 19 2.71 5.18 3165.95 12.37 0.01% 256 32 19 2.63 5.01 1634.30 6.38 0.01% 256 16 19 2.58 4.93 831.08 3.25 0.01% 256 8 19 2.57 4.91 417.44 1.63 0.01% 256 4 19 2.60 4.96 206.58 0.81 0.01% 256 2 19 2.59 4.95 103.53 0.40 0.01% 256 1 19 2.57 4.90 52.20 0.20 0.01% # other start block size! 
exec_host = a13/232*256 256 134217728 3 3.52 439732.14 78137.88 305.23 0.01% 256 67108864 4 3.34 208528.95 82386.02 321.82 0.01% 256 33554432 5 4.23 132137.92 65007.34 253.93 0.01% 256 16777216 5 2.22 69246.92 62023.94 242.28 0.01% 256 8388608 6 2.93 45817.70 46870.18 183.09 0.02% 256 4194304 7 2.31 18008.80 59623.19 232.90 0.01% 256 2097152 8 3.11 12150.22 44186.12 172.60 0.01% 256 1048576 9 3.12 6087.59 44095.50 172.25 0.01% 256 524288 10 3.13 3054.33 43943.47 171.65 0.01% 256 262144 11 3.12 1524.53 44019.51 171.95 0.01% 256 131072 12 3.14 767.82 43701.02 170.71 0.01% 256 65536 13 3.16 386.19 43442.83 169.70 0.01% 256 32768 12 2.03 496.67 16889.82 65.98 0.01% 256 16384 15 3.57 109.02 38471.55 150.28 0.01% 256 8192 16 3.83 58.41 35905.36 140.26 0.01% 256 4096 16 2.10 32.09 32672.82 127.63 0.01% 256 2048 17 2.68 20.45 25633.03 100.13 0.01% 256 1024 18 3.95 15.05 17418.13 68.04 0.01% 256 512 18 3.40 12.97 10105.84 39.48 0.01% 256 256 18 3.18 12.14 5400.47 21.10 0.01% 256 128 18 3.01 11.47 2855.62 11.15 0.01% 256 64 19 2.70 5.16 3176.82 12.41 0.01% 256 32 19 2.61 4.99 1642.87 6.42 0.01% 256 16 19 2.60 4.96 826.45 3.23 0.01% 256 8 19 2.57 4.90 417.65 1.63 0.01% 256 4 19 2.58 4.91 208.38 0.81 0.01% 256 2 19 2.59 4.94 103.59 0.40 0.01% 256 1 19 2.58 4.92 52.05 0.20 0.01% # exec_host = a13/232*256 256 134217728 3 3.62 451948.89 76025.72 296.98 0.01% 256 67108864 4 3.28 205143.67 83745.55 327.13 0.01% 256 33554432 5 3.60 112446.43 76391.35 298.40 0.01% 256 16777216 5 2.84 88618.30 48465.92 189.32 0.01% 256 8388608 6 2.90 45366.48 47336.35 184.91 0.02% 256 4194304 7 2.29 17907.00 59962.11 234.23 0.02% 256 2097152 8 3.12 12204.84 43988.35 171.83 0.01% 256 1048576 9 3.14 6124.15 43832.29 171.22 0.01% 256 524288 10 3.13 3053.69 43952.61 171.69 0.01% 256 262144 11 3.14 1532.60 43787.67 171.05 0.01% 256 131072 12 3.15 768.11 43684.67 170.64 0.01% 256 65536 13 3.18 387.85 43256.76 168.97 0.01% 256 32768 12 2.06 503.98 16644.80 65.02 0.01% 256 16384 15 3.58 109.27 38385.30 149.94 
0.01% 256 8192 16 3.84 58.61 35778.72 139.76 0.01% 256 4096 16 2.10 32.00 32772.81 128.02 0.01% 256 2048 17 2.68 20.42 25669.70 100.27 0.01% 256 1024 18 3.92 14.96 17521.13 68.44 0.01% 256 512 18 3.40 12.96 10112.99 39.50 0.01% 256 256 18 3.18 12.14 5400.02 21.09 0.01% 256 128 18 3.00 11.43 2865.63 11.19 0.01% 256 64 19 2.70 5.15 3180.17 12.42 0.01% 256 32 19 2.63 5.01 1636.04 6.39 0.01% 256 16 19 2.59 4.94 829.96 3.24 0.01% 256 8 19 2.56 4.88 419.60 1.64 0.01% 256 4 19 2.58 4.93 207.90 0.81 0.01% 256 2 19 2.59 4.95 103.52 0.40 0.01% 256 1 19 2.58 4.91 52.10 0.20 0.01% # a13/232*256 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v0 256 4194304 7 2.60 20292.90 52912.20 206.69 0.01% 256 2097152 8 3.11 12152.96 44176.13 172.56 0.01% 256 1048576 9 3.15 6144.24 43688.96 170.66 0.01% 256 524288 9 2.39 4668.03 28752.55 112.31 0.01% 256 262144 11 3.14 1532.16 43800.15 171.09 0.01% 256 131072 12 3.17 773.25 43394.03 169.51 0.01% 256 65536 13 3.22 393.27 42660.64 166.64 0.01% 256 32768 12 2.02 492.73 17024.92 66.50 0.01% 256 16384 15 3.66 111.79 37519.55 146.56 0.01% 256 8192 15 2.00 61.14 34303.37 134.00 0.02% 256 4096 16 2.10 32.00 32763.79 127.98 0.01% 256 2048 17 2.68 20.47 25613.28 100.05 0.01% 256 1024 18 4.03 15.36 17069.25 66.68 0.01% 256 512 18 4.45 16.98 7717.23 30.15 0.01% 256 256 18 3.35 12.78 5129.04 20.04 0.01% 256 128 18 2.99 11.40 2873.53 11.22 0.01% 256 64 19 2.72 5.18 3162.16 12.35 0.01% 256 32 19 2.66 5.08 1614.09 6.31 0.01% 256 16 19 2.90 5.52 741.50 2.90 0.01% 256 8 19 2.83 5.40 379.50 1.48 0.01% 256 4 19 2.62 5.00 204.65 0.80 0.01% 256 2 19 3.51 6.70 76.40 0.30 0.01% 256 1 19 2.58 4.91 52.09 0.20 0.01% 256 4194304 7 2.62 20492.27 52397.40 204.68 0.02% 256 2097152 8 3.09 12076.10 44457.32 173.66 0.01% 256 1048576 9 3.18 6203.06 43274.68 169.04 0.01% 256 524288 9 2.39 4663.57 28780.02 112.42 0.01% 256 262144 11 3.13 1530.16 43857.55 171.32 0.01% 256 131072 12 3.15 767.86 43698.46 170.70 0.01% 256 65536 13 3.17 387.30 43318.44 
169.21 0.01% 256 32768 13 2.82 344.44 24354.06 95.13 0.01% 256 16384 15 3.57 108.88 38521.69 150.48 0.01% 256 8192 15 2.01 61.28 34224.41 133.69 0.02% 256 4096 16 2.10 32.00 32770.01 128.01 0.01% 256 2048 17 2.66 20.30 25830.87 100.90 0.01% 256 1024 18 3.90 14.88 17613.34 68.80 0.01% 256 512 18 5.10 19.44 6742.76 26.34 0.01% 256 256 18 3.23 12.33 5316.30 20.77 0.01% 256 128 18 3.00 11.45 2861.84 11.18 0.01% 256 64 19 2.71 5.17 3170.37 12.38 0.01% 256 32 19 2.65 5.05 1622.15 6.34 0.01% 256 16 18 2.22 8.45 484.62 1.89 0.01% 256 8 19 2.57 4.90 417.88 1.63 0.01% 256 4 19 2.60 4.95 206.73 0.81 0.01% 256 2 19 2.60 4.96 103.25 0.40 0.01% 256 1 18 2.19 8.34 30.71 0.12 0.01% # other start block 2**9 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v0 256 512 17 2.27 17.34 7559.20 29.53 0.01% 256 256 18 3.40 12.95 5060.23 19.77 0.01% 256 128 18 3.12 11.91 2750.63 10.74 0.01% 256 64 18 2.24 8.55 1916.98 7.49 0.01% 256 32 19 2.63 5.02 1631.65 6.37 0.01% 256 16 19 2.57 4.91 834.23 3.26 0.01% 256 8 19 2.54 4.85 422.20 1.65 0.01% 256 4 19 2.55 4.86 210.81 0.82 0.01% 256 2 18 4.64 17.69 28.94 0.11 0.01% 256 1 19 2.59 4.94 51.83 0.20 0.01% e # dec08 a07/0*510 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v0 510 268435456 1 2.62 1309365.30 104556.06 205.01 0.00% 510 134217728 3 3.91 489052.49 139966.65 274.44 0.01% 510 67108864 3 2.06 258113.96 132598.49 260.00 0.01% 510 33554432 4 2.54 158956.25 107657.05 211.09 0.01% 510 16777216 5 2.82 88259.14 96946.11 190.09 0.02% 510 8388608 6 3.66 57135.09 74878.50 146.82 0.01% 510 4194304 6 2.76 43048.52 49690.33 97.43 0.02% 510 2097152 6 2.75 42960.42 24896.11 48.82 0.01% 510 1048576 6 3.49 54526.31 9807.63 19.23 0.00% 510 524288 6 3.05 47734.35 5601.56 10.98 0.00% 510 262144 7 3.46 27026.79 4946.70 9.70 0.00% 510 131072 8 3.01 11750.54 5688.82 11.15 0.00% 510 65536 9 2.28 4452.89 7505.99 14.72 0.01% 510 32768 10 2.78 2714.82 6155.73 12.07 0.01% 510 16384 14 3.12 190.38 43890.45 
86.06 0.01% 510 8192 15 2.93 89.55 46655.65 91.48 0.01% 510 4096 16 2.72 41.54 50284.91 98.60 0.01% 510 2048 17 3.36 25.66 40702.09 79.81 0.01% 510 1024 17 2.35 17.90 29169.75 57.20 0.01% 510 512 18 3.93 14.99 17419.39 34.16 0.01% 510 256 18 3.63 13.83 9441.06 18.51 0.01% 510 128 18 3.39 12.93 5047.67 9.90 0.01% 510 64 19 3.13 5.97 5463.42 10.71 0.01% 510 32 19 3.07 5.85 2788.70 5.47 0.01% 510 16 19 3.02 5.76 1416.77 2.78 0.01% 510 8 19 3.03 5.78 705.39 1.38 0.01% 510 4 19 3.01 5.74 355.51 0.70 0.01% 510 2 19 3.01 5.74 177.79 0.35 0.01% 510 1 19 3.02 5.76 88.53 0.17 0.01% e # Dez08 Altix4700 a17/0*508+a18/0*508 blk=2^28=256MB # Measure speed of MPI_sendrecv transactions: # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v0 1016 268435456 1 2.54 1268034.52 215081.23 211.69 0.00% 1016 134217728 1 3.39 1694931.30 80454.71 79.19 0.00% 1016 67108864 1 1.43 716342.22 95181.61 93.68 0.01% 1016 33554432 2 1.54 384922.21 88566.73 87.17 0.01% 1016 16777216 3 1.80 224653.09 75875.44 74.68 0.01% 1016 8388608 3 1.14 142955.67 59618.66 58.68 0.02% 1016 4194304 4 1.48 92704.69 45967.61 45.24 0.01% 1016 2097152 4 1.26 78630.73 27097.63 26.67 0.01% 1016 1048576 4 1.13 70655.29 15078.18 14.84 0.01% 1016 524288 4 1.06 66234.32 8042.31 7.92 0.01% 1016 262144 5 1.08 33790.91 7881.95 7.76 0.01% 1016 131072 7 1.54 12024.16 11075.13 10.90 0.02% 1016 65536 9 1.64 3203.55 20784.64 20.46 0.02% 1016 32768 8 2.27 8859.62 3757.76 3.70 0.02% 1016 16384 8 1.14 4462.84 3729.94 3.67 0.02% 1016 8192 10 1.70 1656.07 5025.79 4.95 0.01% 1016 4096 10 1.13 1099.42 3785.20 3.73 0.02% 1016 2048 14 1.21 74.15 28059.84 27.62 0.03% 1016 1024 15 1.32 40.31 25811.59 25.41 0.04% 1016 512 16 1.96 29.85 17424.48 17.15 0.02% 1016 256 16 1.80 27.45 9473.61 9.32 0.02% 1016 128 16 1.61 24.60 5286.01 5.20 0.03% 1016 64 17 1.23 9.40 6916.89 6.81 0.03% 1016 32 17 1.22 9.32 3488.34 3.43 0.03% 1016 16 17 1.22 9.28 1752.25 1.72 0.03% 1016 8 17 1.20 9.18 885.01 0.87 0.03% 1016 4 17 1.21 9.20 441.61 
0.43 0.03% 1016 2 17 1.21 9.26 219.48 0.22 0.03% 1016 1 17 1.21 9.27 109.64 0.11 0.03% # a17/0*254+a17/1*254+a18/0*254+a18/1*254 2**28 # one run, may measure only good or bad paths ??? 1016 268435456 1 2.51 1254400.27 217418.98 214.00 0.00% 1016 134217728 1 3.31 1653363.60 82477.45 81.18 0.00% 1016 67108864 1 1.59 794465.82 85821.95 84.47 0.00% 1016 33554432 2 1.70 424016.00 80400.98 79.13 0.01% 1016 16777216 2 1.00 250158.00 68139.54 67.07 0.01% 1016 8388608 3 1.27 158634.34 53726.23 52.88 0.02% 1016 4194304 4 1.61 100521.52 42393.04 41.73 0.01% 1016 2097152 4 1.24 77339.86 27549.91 27.12 0.01% 1016 1048576 4 1.09 67891.07 15692.10 15.44 0.01% 1016 524288 5 1.74 54450.80 9782.71 9.63 0.01% 1016 262144 5 1.05 32760.55 8129.85 8.00 0.01% 1016 131072 7 1.41 11017.68 12086.85 11.90 0.01% 1016 65536 9 1.63 3173.95 20978.47 20.65 0.02% 1016 32768 8 2.31 9032.54 3685.82 3.63 0.01% 1016 16384 9 1.11 2177.49 7644.65 7.52 0.03% 1016 8192 9 1.12 2195.07 3791.70 3.73 0.03% 1016 4096 10 1.53 1496.03 2781.71 2.74 0.02% 1016 2048 14 1.23 75.04 27729.34 27.29 0.03% 1016 1024 15 1.33 40.52 25677.47 25.27 0.03% 1016 512 16 1.98 30.14 17258.54 16.99 0.02% 1016 256 16 1.65 25.20 10321.51 10.16 0.02% 1016 128 16 1.62 24.68 5268.86 5.19 0.03% 1016 64 17 1.22 9.31 6984.57 6.87 0.03% 1016 32 17 1.21 9.26 3511.07 3.46 0.03% 1016 16 17 1.22 9.28 1751.46 1.72 0.03% 1016 8 17 1.21 9.20 883.87 0.87 0.03% 1016 4 17 1.21 9.26 439.00 0.43 0.03% 1016 2 17 1.21 9.22 220.30 0.22 0.03% 1016 1 17 1.21 9.23 110.07 0.11 0.03% #a17+a19 2**28 10s # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v0 1016 268435456 4 18.54 1158858.35 235344.06 231.64 0.00% 1016 134217728 2 11.58 2895400.30 47097.19 46.36 0.00% 1016 67108864 4 11.74 733707.40 92928.88 91.47 0.00% 1016 33554432 5 11.75 367224.58 92835.03 91.37 0.00% 1016 16777216 6 12.14 189633.06 89887.55 88.47 0.00% 1016 8388608 7 16.99 132707.09 64222.84 63.21 0.00% 1016 4194304 7 13.65 106628.69 39964.98 39.34 0.00% 1016 
2097152 7 10.34 80798.52 26370.61 25.96 0.00% 1016 1048576 9 20.83 40674.80 26191.97 25.78 0.00% 1016 524288 8 11.71 45746.75 11644.03 11.46 0.00% 1016 262144 10 14.11 13780.36 19327.38 19.02 0.00% 1016 131072 12 12.02 2933.85 45390.56 44.68 0.01% 1016 65536 13 14.66 1790.12 37195.56 36.61 0.01% 1016 32768 14 13.18 804.60 41377.32 40.73 0.01% 1016 16384 15 12.89 393.45 42308.30 41.64 0.01% 1016 8192 16 13.53 206.50 40305.40 39.67 0.00% 1016 4096 17 14.25 108.71 38282.51 37.68 0.01% 1016 2048 18 16.85 64.28 32372.06 31.86 0.00% 1016 1024 18 11.13 42.45 24506.22 24.12 0.00% 1016 512 19 16.67 31.80 16358.76 16.10 0.00% 1016 256 19 14.01 26.72 9735.87 9.58 0.00% 1016 128 19 13.72 26.16 4971.26 4.89 0.00% 1016 64 20 10.46 9.97 6521.28 6.42 0.00% 1016 32 20 10.36 9.88 3289.29 3.24 0.00% 1016 16 20 10.33 9.85 1650.73 1.62 0.00% 1016 8 20 10.31 9.83 826.88 0.81 0.00% 1016 4 20 10.36 9.88 411.30 0.40 0.00% 1016 2 20 10.33 9.85 206.21 0.20 0.00% 1016 1 20 10.32 9.85 103.19 0.10 0.00% # a16/0*508+a19/0*508 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v08 0923 1016 1073741824 1 8.05 4025178.08 271024.45 266.76 0.00% 1016 536870912 1 4.16 2081933.23 261997.28 257.87 0.00% 1016 268435456 1 2.18 1089406.42 250347.73 246.41 0.00% 1016 134217728 1 3.46 1728578.75 78888.63 77.65 0.00% 1016 67108864 1 1.86 929529.12 73351.77 72.20 0.00% 1016 33554432 2 1.36 341006.13 99972.70 98.40 0.01% 1016 16777216 3 1.36 169457.58 100589.49 99.01 0.01% 1016 8388608 4 1.57 98413.33 86602.35 85.24 0.02% 1016 4194304 4 1.22 76096.48 56000.13 55.12 0.02% 1016 2097152 4 1.02 63693.58 33452.45 32.93 0.01% 1016 1048576 4 1.12 69694.19 15286.11 15.05 0.01% 1016 524288 5 1.80 56102.51 9494.70 9.35 0.01% 1016 262144 5 1.02 31737.13 8392.01 8.26 0.01% 1016 131072 7 1.28 10027.27 13280.70 13.07 0.01% 1016 65536 9 1.68 3285.94 20263.49 19.94 0.02% 1016 32768 8 2.35 9199.13 3619.07 3.56 0.01% 1016 16384 9 1.03 2006.53 8295.97 8.17 0.03% 1016 8192 9 1.19 2320.62 3586.57 3.53 
0.04% 1016 4096 10 1.55 1510.42 2755.21 2.71 0.02% 1016 2048 14 1.38 83.98 24775.97 24.39 0.03% 1016 1024 15 1.50 45.67 22780.33 22.42 0.03% 1016 512 15 1.13 34.63 15023.39 14.79 0.04% 1016 256 16 1.79 27.36 9506.95 9.36 0.02% 1016 128 16 1.73 26.34 4936.98 4.86 0.02% 1016 64 17 1.70 12.95 5020.08 4.94 0.02% 1016 32 17 1.32 10.07 3230.09 3.18 0.03% 1016 16 17 1.32 10.05 1617.36 1.59 0.03% 1016 8 17 1.30 9.89 821.91 0.81 0.03% 1016 4 17 1.30 9.91 409.93 0.40 0.03% 1016 2 17 1.29 9.85 206.25 0.20 0.03% 1016 1 17 1.30 9.91 102.49 0.10 0.03% #a17+a19 2**24 # faster? # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v0 1016 16777216 4 4.68 292646.60 58246.54 57.33 0.00% 1016 8388608 5 4.79 149647.88 56952.53 56.06 0.01% 1016 4194304 6 4.98 77774.84 54791.66 53.93 0.01% 1016 2097152 7 5.66 44216.14 48188.43 47.43 0.00% 1016 1048576 8 9.13 35648.78 29884.70 29.41 0.00% 1016 524288 8 4.61 18009.95 29576.80 29.11 0.01% 1016 262144 10 6.08 5934.75 44877.79 44.17 0.01% 1016 131072 11 5.81 2839.10 46905.39 46.17 0.01% 1016 65536 12 5.83 1422.74 46800.26 46.06 0.02% 1016 32768 11 7.21 3518.16 9463.00 9.31 0.00% 1016 16384 13 5.06 617.08 26975.70 26.55 0.01% 1016 8192 15 7.18 219.26 37959.79 37.36 0.01% 1016 4096 16 7.14 109.00 38178.29 37.58 0.01% 1016 2048 16 4.21 64.16 32428.63 31.92 0.01% 1016 1024 17 5.56 42.46 24505.25 24.12 0.01% 1016 512 17 4.18 31.89 16314.04 16.06 0.01% 1016 256 18 7.01 26.74 9726.32 9.57 0.01% 1016 128 18 6.91 26.35 4935.62 4.86 0.01% 1016 64 19 5.28 10.07 6460.24 6.36 0.01% 1016 32 19 5.20 9.91 3280.25 3.23 0.01% 1016 16 19 5.19 9.90 1641.48 1.62 0.01% 1016 8 19 5.18 9.88 822.71 0.81 0.01% 1016 4 19 5.19 9.89 410.83 0.40 0.01% 1016 2 19 5.18 9.88 205.59 0.20 0.01% 1016 1 19 5.16 9.84 103.21 0.10 0.01% #a17+a19 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v0 1016 32768 11 8.23 4019.52 8282.66 8.15 0.01% 1016 16384 14 8.18 499.29 33339.90 32.81 0.00% 1016 8192 16 13.96 212.98 39079.88 38.46 
0.00% 1016 4096 17 14.33 109.30 38075.00 37.48 0.01% 1016 2048 17 8.44 64.36 32331.18 31.82 0.01% 1016 1024 18 11.11 42.37 24555.18 24.17 0.00% 1016 512 18 8.33 31.76 16376.85 16.12 0.01% 1016 256 19 13.97 26.64 9762.93 9.61 0.00% 1016 128 19 13.72 26.17 4969.29 4.89 0.00% 1016 64 20 10.46 9.97 6519.01 6.42 0.00% 1016 32 20 10.37 9.89 3288.97 3.24 0.00% 1016 16 20 10.35 9.87 1647.36 1.62 0.00% 1016 8 20 10.35 9.87 823.62 0.81 0.00% 1016 4 20 10.36 9.88 411.30 0.40 0.00% 1016 2 20 10.35 9.87 205.95 0.20 0.00% 1016 1 20 10.34 9.86 103.07 0.10 0.00% e #a08+a09+a10+a11 2**30 (zeitgleich!) #a07+a12+a14+a17 2**28 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v0 2032 268435456 1 2.36 1178063.77 463014.70 227.86 0.00% 2032 134217728 1 3.36 1679964.15 162343.00 79.89 0.00% 2032 67108864 2 3.43 856378.45 159234.75 78.36 0.00% 2032 33554432 3 3.39 423148.15 161131.76 79.30 0.01% 2032 16777216 4 3.38 211102.45 161491.74 79.47 0.01% 2032 8388608 4 2.36 147504.49 115560.22 56.87 0.01% 2032 4194304 5 2.03 63327.53 134583.28 66.23 0.02% 2032 2097152 6 3.46 54136.82 78715.62 38.74 0.01% 2032 1048576 6 2.64 41264.48 51635.36 25.41 0.01% 2032 524288 7 3.36 26239.21 40601.57 19.98 0.01% 2032 262144 8 3.03 11847.16 44962.37 22.13 0.01% 2032 131072 8 2.42 9438.86 28217.21 13.89 0.01% 2032 65536 10 3.41 3328.69 40006.43 19.69 0.02% 2032 32768 8 2.56 10012.12 6650.40 3.27 0.01% 2032 16384 11 4.34 2118.70 15713.55 7.73 0.01% 2032 8192 11 2.31 1130.05 14730.40 7.25 0.02% 2032 4096 12 2.32 565.90 14707.70 7.24 0.02% 2032 2048 14 2.10 128.03 32505.30 16.00 0.03% 2032 1024 16 3.99 60.81 34218.10 16.84 0.02% 2032 512 16 3.14 47.88 21729.92 10.69 0.03% 2032 256 16 3.42 52.22 9960.86 4.90 0.02% 2032 128 16 3.16 48.25 5390.59 2.65 0.02% 2032 64 17 2.46 18.74 6939.72 3.42 0.03% 2032 32 17 2.40 18.34 3545.48 1.74 0.03% 2032 16 17 2.29 17.46 1862.23 0.92 0.03% 2032 8 17 2.07 15.81 1028.29 0.51 0.03% 2032 4 17 2.11 16.08 505.46 0.25 0.03% 2032 2 17 2.16 16.50 246.37 0.12 
0.03% 2032 1 17 2.09 15.97 127.27 0.06 0.03% #a07+a12+a14+a17 2**27 #a08+a09+a10+a11 2**30 (zeitgleich!) 2032 1073741824 1 7.94 3968903.03 549734.62 270.54 0.00% 2032 536870912 1 4.22 2110274.25 516957.31 254.41 0.00% 2032 268435456 1 2.65 1326452.62 411217.74 202.37 0.00% 2032 134217728 1 3.38 1692433.73 161146.89 79.30 0.00% 2032 67108864 1 2.26 1128250.20 120864.34 59.48 0.00% 2032 33554432 3 3.46 432456.61 157663.46 77.59 0.01% 2032 16777216 3 2.11 263191.34 129530.49 63.75 0.01% 2032 8388608 4 2.49 155687.00 109486.67 53.88 0.01% 2032 4194304 5 2.90 90700.04 93967.17 46.24 0.01% 2032 2097152 5 2.83 88339.57 48239.00 23.74 0.01% 2032 1048576 6 3.29 51395.21 41457.30 20.40 0.01% 2032 524288 6 2.16 33775.84 31541.87 15.52 0.01% 2032 262144 7 2.08 16265.60 32748.66 16.12 0.01% 2032 131072 9 3.43 6706.49 39713.51 19.54 0.01% 2032 65536 10 3.76 3671.87 36267.37 17.85 0.02% 2032 32768 8 2.41 9410.24 7075.76 3.48 0.01% 2032 16384 11 3.61 1760.31 18912.74 9.31 0.01% 2032 8192 11 2.64 1286.74 12936.72 6.37 0.02% 2032 4096 12 2.76 672.69 12372.86 6.09 0.01% 2032 2048 14 2.14 130.80 31816.10 15.66 0.02% 2032 1024 15 2.46 75.05 27726.12 13.64 0.02% 2032 512 16 3.50 53.38 19489.23 9.59 0.02% 2032 256 16 2.73 41.61 12501.84 6.15 0.02% 2032 128 16 2.66 40.64 6400.02 3.15 0.02% 2032 64 17 2.04 15.56 8355.71 4.11 0.03% 2032 32 17 2.02 15.43 4213.53 2.07 0.03% 2032 16 17 2.02 15.38 2113.51 1.04 0.03% 2032 8 17 2.05 15.64 1039.33 0.51 0.03% 2032 4 18 3.34 12.75 637.56 0.31 0.02% 2032 2 18 2.99 11.41 356.19 0.18 0.02% 2032 1 18 3.00 11.44 177.56 0.09 0.02% # exec_host = a08/0*508+a09/0*508+a10/0*508+a11/0*508 2**20 einzeln # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast v0 2032 1048576 7 2.95 23071.79 92351.16 45.45 0.01% 2032 524288 8 3.68 14364.78 74164.25 36.50 0.01% 2032 262144 8 2.83 11057.42 48173.66 23.71 0.01% 2032 131072 8 2.44 9537.50 27925.39 13.74 0.01% 2032 65536 10 2.71 2648.64 50278.25 24.74 0.02% 2032 32768 9 2.42 4732.35 14070.10 6.92 
0.01% 2032 16384 11 3.15 1537.53 21653.14 10.66 0.02% 2032 8192 11 2.22 1085.92 15329.12 7.54 0.02% 2032 4096 12 2.39 582.67 14284.29 7.03 0.02% 2032 2048 15 2.89 88.10 47233.85 23.25 0.02% 2032 1024 16 3.42 52.26 39818.89 19.60 0.02% 2032 512 16 2.45 37.42 27801.34 13.68 0.03% 2032 256 16 2.00 30.55 17027.37 8.38 0.03% 2032 128 17 3.92 29.90 8699.68 4.28 0.02% 2032 64 17 2.58 19.66 6615.04 3.26 0.02% 2032 32 18 3.00 11.45 5678.65 2.79 0.02% 2032 16 18 2.98 11.35 2863.75 1.41 0.02% 2032 8 18 2.97 11.34 1433.02 0.71 0.02% 2032 4 18 2.98 11.38 714.34 0.35 0.02% 2032 2 18 3.03 11.57 351.21 0.17 0.02% 2032 1 18 2.99 11.40 178.21 0.09 0.02% e # old wrong version! # Job Id: 206922.pbs1 memalign # pr28yaSPEED.o207014 pause -1
# ------------- snip -------------------
#gnuplot
#
# Second plot: bandwidth per node versus message size on the SiCortex SC5832.
#
# Every "-" dataset below is read from the inline rows that follow the plot
# command; each dataset ends at a line containing only "e".  The rows use the
# column layout printed in the measurement headers:
#   $1 tasks  $2 msgsize[B]  $3 loops[2^x]  $4 time[s]  $5 latency[us]
#   $6 SumBW[MB/s]  $7 BW[MB/s]  $8 Bcast
# x is the message size ($2).  y is k*$6/$1, i.e. the summed bandwidth divided
# by the task count and multiplied by k; k matches the "ppn" value given in the
# key text of the respective curve (k=1 for the multi-node ppn=1 runs), which
# is what the y label calls "nodeBW".
#
# Line and point types are spelled out as "lt N pt M".  The positional form
# ("w lp 1 1") used previously is deprecated gnuplot 3.x syntax and is rejected
# by current gnuplot releases.
#
#set logscale xy
#set ytics 2

# Drop the labels left over from the preceding plot.
unset label
set ytics auto
# Both axes logarithmic (replaces the former sequence
# "set nologscale y" / "set logscale x" / "set logscale xy").
set logscale xy
set key left
set title "SiCortex SC5832 ice9 Performance(msgsize) MPI_Sendrecv(i-j,i+j)"
set xlabel "message size [B]"
set ylabel "nodeBW [MB/s]"
set label "one rail" at 32*1024,300 right
set label "multi rail" at 256*1024,100
# Explicit tics at every factor of 4 from 1 B to 16 MiB.
set xtics ("1" 1,"4" 4,"16" 16,"64" 64,"256" 256,"1k" 1024,"4k" 4096,"16k" 4*4096,"64k" 65536,"256k" 262144,"1M" 1048576,"4M" 4*1024*1024,"16M" 16*1024*1024)

# NOTE(review): the x range stops at 32 MiB, while the inline data contains
# rows with msgsize 67108864 (64 MiB); those points fall outside the range.
# Confirm whether that is intended.
#
# The trailing constant 0.01 lies below the lower y bound (1.0), so it draws
# nothing visible; presumably it is only there so that every dataset line can
# end in ",\" -- verify before removing it.
plot [1:1024*1024*32] [1.0:2000] \
 "-" u 2:(1*$6/$1) t "n=512 ppn=1 n* 400MB/s 2.3us" w lp lt 1 pt 1,\
 "-" u 2:(1*$6/$1) t "" w p lt 2 pt 2,\
 "-" u 2:(3*$6/$1) t "n=512 ppn=3 n* 380MB/s 4.3us" w lp lt 3 pt 3,\
 "-" u 2:(6*$6/$1) t "n=512 ppn=6 n* 370MB/s 6.0us" w lp lt 4 pt 4,\
 "-" u 2:(2*$6/$1) t "n=1 ppn=2 1190MB/s 1.20us" w lp lt 5 pt 1,\
 "-" u 2:(6*$6/$1) t "n=1 ppn=6 1060MB/s 2.4us" w lp lt 6 pt 1,\
 "-" u 2:(1*$6/$1) t "n=2 ppn=1 2*1560MB/s 1.21-1.3us (sc072)" w lp lt 7 pt 2,\
 "-" u 2:(1*$6/$1) t "n=3 ppn=1 3*1740MB/s (sc072)" w lp lt 8 pt 2,\
 0.01 t "" w l lt 1
# 972*6 SC5832 ToDo: split on chip / off chip graphs?
# Measure speed of MPI_sendrecv transactions: # threads msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] error # mpi_stress v2008-09-23 # using MPI 2.0 with 512 nodes # blocksize: 2^26 = 67108864 = 6.711e+07 # mintime[s]: 30 # pairwise: 0 # nodeshift: 1 # align: 1 # 0: dst 1 2 3 4 5 6 7 8 ... src 511 0 1 2 3 4 5 6 ... # 1: dst 2 3 4 5 6 7 8 9 ... src 510 511 0 1 2 3 4 5 ... # Measure speed of MPI_sendrecv transactions: # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast # enough relaxation time? Apr 25 18:24 slurm-2838.out # 512 67108864 6 10.89 170186.64 201894.45 394.33 0.35% # 512 67108864 7 21.54 168260.63 204205.45 398.84 0.21% 512 67108864 8 43.20 168740.18 203625.12 397.71 0.00% # 512 33554432 7 10.73 83811.23 204982.90 400.36 0.36% # 512 33554432 8 21.32 83284.96 206278.17 402.89 0.21% 512 33554432 9 43.01 84007.06 204505.06 399.42 0.00% # 512 16777216 8 10.77 42076.23 204151.72 398.73 0.37% # 512 16777216 9 21.62 42228.60 203415.10 397.30 0.21% 512 16777216 10 43.34 42321.66 202967.81 396.42 0.00% # 512 8388608 9 10.90 21283.47 201798.23 394.14 0.39% # 512 8388608 10 21.82 21309.81 201548.80 393.65 0.22% 512 8388608 11 43.66 21320.06 201451.95 393.46 0.00% 512 4194304 12 44.41 10843.21 198048.75 386.81 0.00% 512 2097152 13 45.95 5608.92 191434.77 373.90 0.00% 512 1048576 14 49.27 3007.22 178527.23 348.69 0.00% 512 524288 15 57.05 1740.95 154188.83 301.15 0.00% 512 262144 15 34.69 1058.72 126773.24 247.60 0.00% 512 131072 16 37.41 570.82 117565.43 229.62 0.00% # above=multirail, below=onerail 512 65536 17 49.08 374.46 89607.93 175.02 0.00% 512 32768 18 48.94 186.69 89866.61 175.52 0.00% 512 16384 19 48.98 93.43 89787.41 175.37 0.00% 512 8192 20 49.52 47.23 88814.73 173.47 0.00% 512 4096 21 52.26 24.92 84163.58 164.38 0.00% 512 2048 21 30.77 14.67 71475.22 139.60 0.00% 512 1024 21 52.13 24.86 21092.42 41.20 0.00% 512 512 22 59.47 14.18 18488.53 36.11 0.00% 512 256 22 30.88 7.36 17800.46 34.77 0.00% 512 128 23 41.01 4.89 13405.48 
26.18 0.00% 512 64 24 45.85 2.73 11991.24 23.42 0.00% 512 32 24 41.76 2.49 6581.69 12.85 0.00% 512 16 24 39.75 2.37 3457.39 6.75 0.00% 512 8 24 39.68 2.37 1731.87 3.38 0.00% 512 4 24 39.06 2.33 879.75 1.72 0.00% 512 2 24 38.73 2.31 443.54 0.87 0.00% 512 1 24 38.33 2.28 224.11 0.44 0.00% e # Apr 14 11:24 slurm-2842.out# tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast # 512 67108864 6 10.89 170186.64 201894.45 394.33 0.35% # 512 67108864 7 21.54 168260.63 204205.45 398.84 0.21% 512 67108864 8 43.20 168740.18 203625.12 397.71 0.00% # 512 33554432 7 10.73 83811.23 204982.90 400.36 0.36% # 512 33554432 8 21.32 83284.96 206278.17 402.89 0.21% 512 33554432 9 43.01 84007.06 204505.06 399.42 0.00% 512 16777216 10 43.34 42321.66 202967.81 396.42 0.00% 512 8388608 11 43.66 21320.06 201451.95 393.46 0.00% 512 4194304 12 44.41 10843.21 198048.75 386.81 0.00% 512 2097152 13 45.95 5608.92 191434.77 373.90 0.00% 512 1048576 14 49.27 3007.22 178527.23 348.69 0.00% 512 524288 15 57.05 1740.95 154188.83 301.15 0.00% 512 262144 15 34.69 1058.72 126773.24 247.60 0.00% 512 131072 16 37.41 570.82 117565.43 229.62 0.00% # 64K 512 65536 17 49.08 374.46 89607.93 175.02 0.00% 512 32768 18 48.94 186.69 89866.61 175.52 0.00% 512 16384 19 48.98 93.43 89787.41 175.37 0.00% 512 8192 20 49.52 47.23 88814.73 173.47 0.00% 512 4096 21 52.26 24.92 84163.58 164.38 0.00% 512 2048 21 30.77 14.67 71475.22 139.60 0.00% 512 1024 21 52.13 24.86 21092.42 41.20 0.00% 512 512 22 59.47 14.18 18488.53 36.11 0.00% 512 256 22 30.88 7.36 17800.46 34.77 0.00% 512 128 23 41.01 4.89 13405.48 26.18 0.00% 512 64 24 45.85 2.73 11991.24 23.42 0.00% 512 32 24 41.76 2.49 6581.69 12.85 0.00% 512 16 24 39.75 2.37 3457.39 6.75 0.00% 512 8 24 39.68 2.37 1731.87 3.38 0.00% 512 4 24 39.06 2.33 879.75 1.72 0.00% 512 2 24 38.73 2.31 443.54 0.87 0.00% 512 1 24 38.33 2.28 224.11 0.44 0.00% e # Apr 14 11:42 slurm-2843.out # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast # 1536 
67108864 5 13.62 425504.80 242251.59 157.72 0.69% 1536 67108864 6 32.75 511747.85 201425.79 131.14 0.00% ??? # 1536 33554432 6 17.48 273163.32 188676.89 122.84 0.56% 1536 33554432 7 34.52 269677.54 191115.68 124.42 0.00% # 1536 16777216 7 17.31 135261.50 190518.39 124.04 0.61% 1536 16777216 8 34.23 133704.78 192736.59 125.48 0.00% 1536 8388608 9 34.58 67548.26 190751.04 124.19 0.00% 1536 4194304 10 34.04 33239.35 193820.00 126.18 0.00% 1536 2097152 11 35.28 17226.63 186991.03 121.74 0.00% 1536 1048576 12 37.33 9114.47 176709.39 115.05 0.00% 1536 524288 13 44.42 5421.79 148531.39 96.70 0.00% 1536 262144 14 54.81 3345.58 120353.95 78.36 0.00% 1536 131072 15 59.15 1804.96 111540.46 72.62 0.00% # 64K # 1536 65536 14 19.69 1201.60 83774.05 54.54 0.75% 1536 65536 15 39.38 1201.68 83768.48 54.54 0.00% 1536 32768 16 39.31 599.84 83908.35 54.63 0.00% 1536 16384 17 39.31 299.94 83902.45 54.62 0.00% 1536 8192 18 39.46 150.53 83593.28 54.42 0.00% 1536 4096 19 39.95 76.19 82573.70 53.76 0.00% 1536 2048 20 41.43 39.51 79623.14 51.84 0.00% 1536 1024 20 40.03 38.17 41204.75 26.83 0.00% 1536 512 21 45.26 21.58 36440.34 23.72 0.00% 1536 256 22 52.95 12.62 31147.90 20.28 0.00% 1536 128 22 37.26 8.88 22131.39 14.41 0.00% 1536 64 23 39.06 4.66 21114.15 13.75 0.00% 1536 32 23 35.07 4.18 11758.56 7.66 0.00% 1536 16 23 35.91 4.28 5740.69 3.74 0.00% 1536 8 23 36.04 4.30 2859.76 1.86 0.00% 1536 4 23 35.28 4.21 1460.90 0.95 0.00% 1536 2 23 37.04 4.41 695.82 0.45 0.00% 1536 1 23 36.78 4.39 350.28 0.23 0.00% e # Apr 14 12:03 slurm-2844.out # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast # 3072 67108864 4 12.06 753865.35 273468.50 89.02 1.43% # 3072 67108864 5 25.36 792422.58 260162.24 84.69 0.84% 3072 67108864 6 57.81 903272.23 228235.10 74.30 0.00% # 3072 33554432 5 17.11 534668.00 192791.07 62.76 1.23% 3072 33554432 6 34.95 546072.42 188764.73 61.45 0.00% # 3072 16777216 6 16.39 256168.53 201194.14 65.49 1.39% 3072 16777216 7 36.06 281681.23 182971.39 59.56 0.00% 
# 3072 8388608 7 17.87 139583.13 184619.76 60.10 1.46% 3072 8388608 8 34.85 136119.53 189317.46 61.63 0.00% 3072 4194304 9 34.87 68107.86 189183.77 61.58 0.00% 3072 2097152 10 35.84 35000.18 184069.07 59.92 0.00% 3072 1048576 11 37.24 18182.50 177160.71 57.67 0.00% 3072 524288 12 45.17 11027.93 146048.52 47.54 0.00% 3072 262144 13 55.86 6819.05 118096.59 38.44 0.00% 3072 131072 13 30.22 3689.35 109139.20 35.53 0.00% # 64K 3072 65536 14 40.69 2483.64 81061.17 26.39 0.00% 3072 32768 15 40.56 1237.82 81323.13 26.47 0.00% 3072 16384 16 40.91 624.26 80626.26 26.25 0.00% 3072 8192 17 41.50 316.59 79490.06 25.88 0.00% 3072 4096 18 42.09 160.55 78374.26 25.51 0.00% 3072 2048 19 44.56 84.99 74023.60 24.10 0.00% 3072 1024 20 57.07 54.42 57801.20 18.82 0.00% 3072 512 20 33.74 32.17 48884.83 15.91 0.00% 3072 256 21 37.54 17.90 43929.85 14.30 0.00% 3072 128 22 49.91 11.90 33044.37 10.76 0.00% 3072 64 23 57.68 6.88 28595.32 9.31 0.00% 3072 32 23 51.07 6.09 16147.08 5.26 0.00% 3072 16 23 50.07 5.97 8235.03 2.68 0.00% 3072 8 23 50.26 5.99 4101.98 1.34 0.00% 3072 4 23 49.94 5.95 2064.04 0.67 0.00% 3072 2 23 53.11 6.33 970.44 0.32 0.00% 3072 1 23 52.08 6.21 494.77 0.16 0.00% e # 8.5.09 sc072 # srun -v -p sca -N 1 -n 2 ../utils/mpi_stress090412 --bsize 26 --mintime 30 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast # 2 67108864 7 14.49 113214.41 1185.52 592.76 0.00% # 2 67108864 8 28.98 113214.86 1185.51 592.76 0.00% 2 67108864 9 57.97 113227.45 1185.38 592.69 0.00% 2 33554432 10 57.87 56514.18 1187.47 593.73 0.00% 2 16777216 11 57.85 28245.85 1187.94 593.97 0.00% 2 8388608 12 58.08 14178.72 1183.27 591.63 0.00% 2 4194304 13 58.92 7192.33 1166.33 583.16 0.00% 2 2097152 13 30.29 3697.58 1134.34 567.17 0.00% 2 1048576 14 31.99 1952.64 1074.01 537.00 0.00% 2 524288 15 33.35 1017.62 1030.42 515.21 0.00% 2 262144 16 33.97 518.38 1011.39 505.70 0.00% 2 131072 17 33.20 253.27 1035.03 517.51 0.00% 2 65536 19 59.24 112.99 1160.05 580.03 0.00% 2 32768 19 31.07 
59.26 1105.87 552.93 0.00% 2 16384 20 34.43 32.84 997.84 498.92 0.00% 2 8192 21 41.50 19.79 828.02 414.01 0.00% 2 8192 21 41.50 19.79 828.02 414.01 0.00% 2 4096 22 54.79 13.06 627.10 313.55 0.00% 2 2048 22 40.28 9.60 426.54 213.27 0.00% 2 1024 22 41.96 10.00 204.72 102.36 0.00% 2 512 23 40.50 4.83 212.10 106.05 0.00% 2 256 24 48.82 2.91 175.95 87.97 0.00% 2 128 24 35.47 2.11 121.10 60.55 0.00% 2 64 25 55.76 1.66 77.03 38.51 0.00% 2 32 25 47.71 1.42 45.01 22.51 0.00% 2 16 25 43.62 1.30 24.61 12.31 0.00% 2 8 25 42.46 1.27 12.64 6.32 0.00% 2 4 25 42.75 1.27 6.28 3.14 0.00% 2 2 25 40.63 1.21 3.30 1.65 0.00% 2 1 25 40.28 1.20 1.67 0.83 0.00% e # srun -v -p sca -N 1 -n 6 ../utils/mpi_stress090412 --bsize 26 --mintime 30 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast # if started on 11 Nodes: 169..181MB/s (const. per Node) # means: intranode communication generates no internode traffic!?! (DMA only) # 6 67108864 5 12.16 379850.89 1060.03 176.67 0.00% # 6 67108864 6 24.32 379956.75 1059.73 176.62 0.00% 6 67108864 7 48.63 379950.62 1059.75 176.63 0.00% 6 33554432 8 48.62 189908.51 1060.12 176.69 0.00% 6 16777216 9 48.49 94703.98 1062.93 177.15 0.00% 6 8388608 10 48.76 47621.98 1056.90 176.15 0.00% 6 4194304 11 48.81 23835.38 1055.82 175.97 0.00% 6 2097152 12 48.53 11847.98 1062.03 177.01 0.00% 6 1048576 13 48.73 5948.35 1057.68 176.28 0.00% 6 524288 14 49.56 3024.61 1040.04 173.34 0.00% 6 262144 15 49.47 1509.77 1041.79 173.63 0.00% 6 131072 16 49.65 757.67 1037.96 172.99 0.00% 6 65536 17 50.80 387.61 1014.46 169.08 0.00% 6 32768 18 50.82 193.86 1014.18 169.03 0.00% 6 16384 19 52.72 100.56 977.52 162.92 0.00% 6 8192 20 56.92 54.29 905.40 150.90 0.00% 6 4096 20 32.77 31.26 786.28 131.05 0.00% 6 2048 21 43.06 20.53 598.51 99.75 0.00% 6 1024 21 46.43 22.14 277.51 46.25 0.00% 6 512 22 51.09 12.18 252.22 42.04 0.00% 6 256 23 54.08 6.45 238.26 39.71 0.00% 6 128 23 37.47 4.47 171.95 28.66 0.00% 6 64 24 39.38 2.35 163.60 27.27 0.00% 6 32 24 40.24 2.40 
80.04 13.34 0.00% 6 16 24 37.73 2.25 42.69 7.11 0.00% 6 8 24 41.24 2.46 19.53 3.25 0.00% 6 4 24 39.64 2.36 10.16 1.69 0.00% 6 2 24 39.80 2.37 5.06 0.84 0.00% 6 1 24 40.14 2.39 2.51 0.42 0.00% e # srun -v -p sca -N 2 -n 2 ../utils/mpi_stress090412 --bsize 26 --mintime 30 # welche Knoten!? m0n0 - m0n1 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast # in case of 5*started: 1620..1741MB/s n1-n2,n3-n4,n5-n6,... # 2 67108864 8 11.03 43068.62 3116.37 1558.18 0.00% # 2 67108864 9 22.03 43036.47 3118.70 1559.35 0.00% 2 67108864 10 44.07 43036.19 3118.72 1559.36 0.00% 2 33554432 11 44.31 21634.31 3101.96 1550.98 0.00% 2 16777216 12 44.27 10807.40 3104.76 1552.38 0.00% 2 8388608 13 45.70 5578.40 3007.53 1503.77 0.00% 2 4194304 14 47.03 2870.33 2922.53 1461.26 0.00% 2 2097152 15 46.91 1431.68 2929.63 1464.81 0.00% 2 1048576 16 48.77 744.19 2818.05 1409.03 0.00% 2 524288 17 47.87 365.21 2871.16 1435.58 0.00% 2 262144 18 54.28 207.07 2531.91 1265.96 0.00% 2 131072 19 58.88 112.31 2334.09 1167.04 0.00% # 64k multirail switch 2 65536 19 39.36 75.07 1746.01 873.00 0.00% 2 32768 20 42.72 40.74 1608.67 804.34 0.00% 2 16384 21 49.36 23.54 1392.23 696.11 0.00% 2 8192 21 31.41 14.98 1093.75 546.87 0.00% 2 4096 22 44.92 10.71 764.92 382.46 0.00% 2 2048 22 35.76 8.53 480.45 240.23 0.00% 2 1024 22 38.09 9.08 225.51 112.75 0.00% 2 512 23 37.97 4.53 226.26 113.13 0.00% 2 256 24 43.31 2.58 198.34 99.17 0.00% 2 128 24 32.38 1.93 132.64 66.32 0.00% 2 64 25 54.53 1.63 78.76 39.38 0.00% 2 32 25 46.64 1.39 46.04 23.02 0.00% 2 16 25 43.57 1.30 24.64 12.32 0.00% 2 8 25 42.36 1.26 12.67 6.34 0.00% 2 4 25 42.79 1.28 6.27 3.14 0.00% 2 2 25 43.23 1.29 3.10 1.55 0.00% 2 1 25 42.54 1.27 1.58 0.79 0.00% # m0n0 - m0n11 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast # 2 67108864 8 11.10 43369.63 3094.74 1547.37 0.00% # 2 67108864 9 22.20 43364.32 3095.12 1547.56 0.00% 2 67108864 10 44.44 43399.30 3092.62 1546.31 0.00% 2 33554432 11 44.80 21873.26 
3068.08 1534.04 0.00% 2 16777216 12 44.54 10873.87 3085.79 1542.89 0.00% 2 8388608 13 45.83 5593.92 2999.19 1499.59 0.00% 2 4194304 14 46.67 2848.58 2944.84 1472.42 0.00% 2 2097152 15 47.38 1445.99 2900.64 1450.32 0.00% 2 1048576 16 48.11 734.07 2856.88 1428.44 0.00% --bsize 26 # 2 1048576 16 48.08 733.65 2858.52 1429.26 0.00% --bsize 20 2 524288 17 48.82 372.47 2815.23 1407.62 0.00% --bsize 26 2 262144 18 53.35 203.51 2576.18 1288.09 0.00% 2 131072 19 55.31 105.49 2484.93 1242.47 0.00% 2 65536 19 31.59 60.25 2175.39 1087.70 0.00% 2 32768 20 34.63 33.02 1984.57 992.29 0.00% 2 16384 21 41.12 19.61 1671.38 835.69 0.00% --bsize 26 # 621MB/s ... 692MB/s (= n0-n1) schwingt sich ein? --bsize 14 2 8192 22 53.65 12.79 1280.98 640.49 0.00% --bsize 26 2 4096 22 39.30 9.37 874.22 437.11 0.00% 2 2048 22 32.95 7.86 521.35 260.68 0.00% 2 1024 22 38.02 9.06 225.95 112.97 0.00% 2 512 23 37.54 4.48 228.83 114.41 0.00% 2 256 24 43.25 2.58 198.62 99.31 0.00% 2 128 24 31.56 1.88 136.11 68.05 0.00% 2 64 25 52.87 1.58 81.23 40.62 0.00% 2 32 25 45.10 1.34 47.61 23.81 0.00% 2 16 25 41.96 1.25 25.59 12.80 0.00% 2 8 25 40.71 1.21 13.19 6.59 0.00% 2 4 25 41.12 1.23 6.53 3.26 0.00% 2 2 25 41.46 1.24 3.24 1.62 0.00% 2 1 25 40.85 1.22 1.64 0.82 0.00% # 8.5.09 sc072 n0-n11 start_msgsize < cache # srun -p sca -w sca-m0n0,sca-m0n11 ../utils/mpi_stress090412 --bsize 14 --mintime 30 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast # 2 16384 19 13.81 26.35 1243.76 621.88 0.00% --bsize 14 # 2 16384 20 26.00 24.79 1321.67 660.83 0.00% --bsize 14 2 16384 21 49.58 23.64 1385.97 692.99 0.00% --bsize 14 einschwingen? 
# 2 8192 20 14.98 14.29 1146.89 573.44 0.00% # 2 8192 21 29.83 14.22 1151.90 575.95 0.00% 2 8192 22 59.35 14.15 1157.92 578.96 0.00% # 2 4096 21 19.80 9.44 867.46 433.73 0.00% 2 4096 22 39.62 9.45 867.33 433.66 0.00% 2 2048 22 30.98 7.39 554.50 277.25 0.00% 2 1024 22 38.01 9.06 225.98 112.99 0.00% # also for --bsize 10 2 512 23 37.55 4.48 228.73 114.37 0.00% 2 256 24 43.25 2.58 198.63 99.32 0.00% 2 128 24 31.55 1.88 136.15 68.08 0.00% 2 64 25 52.84 1.57 81.28 40.64 0.00% 2 32 25 45.44 1.35 47.26 23.63 0.00% 2 16 25 41.92 1.25 25.62 12.81 0.00% 2 8 25 40.73 1.21 13.18 6.59 0.00% 2 4 25 41.18 1.23 6.52 3.26 0.00% 2 2 25 41.45 1.24 3.24 1.62 0.00% 2 1 25 40.84 1.22 1.64 0.82 0.00% # n0-n11 --bsize 15 --mintime 30 # 2 32768 18 10.68 40.75 1608.29 804.14 0.00% # 2 32768 19 20.36 38.83 1687.69 843.84 0.00% 2 32768 20 39.48 37.65 1740.60 870.30 0.00% # 2 16384 19 11.51 21.96 1492.43 746.22 0.00% # 2 16384 20 22.90 21.84 1500.15 750.08 0.00% 2 16384 21 45.35 21.63 1515.20 757.60 0.00% # sca -w sca-m0n0,sca-m0n11 ../utils/mpi_stress090412 --bsize 15 --mintime 600 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast # 2 32768 19 17.92 34.18 1917.50 958.75 0.00% # 2 32768 20 35.05 33.43 1960.68 980.34 0.00% # 2 32768 21 69.06 32.93 1990.03 995.02 0.00% # 2 32768 22 136.76 32.61 2009.88 1004.94 0.00% # 2 32768 23 270.31 32.22 2033.76 1016.88 0.00% # 2 32768 24 536.94 32.00 2047.75 1023.87 0.00% 2 32768 25 1071.99 31.95 2051.35 1025.67 0.00% # 2 16384 20 19.79 18.87 1736.47 868.23 0.00% # 2 16384 21 39.57 18.87 1736.47 868.24 0.00% # 2 16384 22 79.15 18.87 1736.48 868.24 0.00% # 2 16384 23 158.30 18.87 1736.47 868.24 0.00% # 2 16384 24 316.59 18.87 1736.49 868.24 0.00% 2 16384 25 633.18 18.87 1736.49 868.25 0.00% # sca -w sca-m0n0,sca-m0n11 ../utils/mpi_stress090412 --bsize 14 --mintime 600 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast # 2 16384 19 12.13 23.14 1415.80 707.90 0.00% # 2 16384 20 23.52 22.43 1461.17 730.58 
0.00% # 2 16384 21 45.87 21.87 1498.25 749.12 0.00% # 2 16384 22 88.08 21.00 1560.46 780.23 0.00% # 2 16384 23 169.69 20.23 1619.93 809.96 0.00% # 2 16384 24 328.58 19.58 1673.12 836.56 0.00% 2 16384 25 645.17 19.23 1704.22 852.11 0.00% # 2 8192 20 12.64 12.05 1359.42 679.71 0.00% # 2 8192 21 25.27 12.05 1359.44 679.72 0.00% # 2 8192 22 50.55 12.05 1359.43 679.71 0.00% # 2 8192 23 101.10 12.05 1359.47 679.73 0.00% # 2 8192 24 202.19 12.05 1359.49 679.74 0.00% # 2 8192 25 404.39 12.05 1359.48 679.74 0.00% 2 8192 26 808.77 12.05 1359.49 679.75 0.00% e # SC072 -N 3 -n 3 # tasks msgsize[B] loops[2^x] time[s] latency[us] SumBW[MB/s] BW[MB/s] Bcast # 3 67108864 9 19.78 38627.47 5212.01 1737.34 0.00% 3 67108864 10 39.56 38632.69 5211.30 1737.10 0.00% 3 33554432 11 40.35 19703.36 5108.94 1702.98 0.00% 3 16777216 12 40.83 9969.37 5048.63 1682.88 0.00% 3 8388608 13 43.37 5294.60 4753.11 1584.37 0.00% e # pause -1