/* measure speed of memory (best and worst case)

   INSTALL: mpicc [-D CFG_CPUSET=<hex>] -O2 -o memspeed memspeed.c
     # cpuset is a bit pattern (hex) of the available cpu cores per node,
     # useful to bind tasks to cores using sched_setaffinity()
   RUN:     mpirun -np <tasks> ./memspeed <log2size> <mintime>
     # needs: memory = tasks*2^log2size Bytes
     #        time   = (1..2)*2*mintime*(log2size-8) seconds
     mpirun -np 8 ./memspeed 29 2   # 8 CPUs, 4GB,   512MB/core, 22*2s
     mpirun -np 2 ./memspeed 27 2   # 2 CPUs, 256MB, 128MB/core, 20*2s

   ToDo: better read the random index from an array?

   ChangeLog:
   2008-08-22: add MPI functionality to fill all cores of a node,
               add a randomly shaken pointer chain (fewer instructions)
     SRC-RND: do { ll=aa[ll];    } while(--k);
     SRC-STR: do { x^=*ap; ap++; } while(--k);
     ; gcc-4.3.3-x86-64  .L41 = 3 instr/loop, .L59 = 4 instr/loop
     .L52:  subl $1, %eax                ; gcc-4.1.1-i386   3 instr/loop
            movl (%edx,%ebx,4), %ebx
            jne  .L52
     .L50:  subq $1, %rax                ; gcc-4.1.0-x86-64 3 instr/loop
            movq (%rdx,%rbp,8), %rbp
            jne  .L50
     .L55:  ... shladd r14 = r37, 3, r15 ; gcc-4.1.1-ia64   3 instr/loop
            ... ld8    r37 = [r14]
            ... br.cloop.sptk.few .L55
     L$66:  s8addq $10, $0, $10          ; cxx-6.5-alpha   19 instr/8loop
            lda    $2, -8($2)            ;  + 5 instr/loop
            ldq    $10, ($10)
            cmple  $2, 7, $3
            s8addq $10, $0, $10
            ldq    $10, ($10)
            ... above 2 lines repeated 7 times (unrolled loop)
            beq    $3, L$66
   2007-04-27:  4 x86-instructions     per loop (gcc-4.1.1 -O2)
                5 opteron-instructions per loop (gcc-4.1.0 -O2)
                4 sparcv9-instructions per loop (gcc-4.1.1 -O2)
               32 alpha-instructions   per loop (cxx-6.5 -fast)
     .L15:  xorl aa(,%edx,4), %edi       ; gcc-x86-p4
            addl %ebx, %edx
            subl $1, %eax
            jne  .L15
     .L16:  ldx  [%g3], %g1              ; gcc-sparcv9
            add  %g2, -1, %g2
            xor  %l0, %g1, %l0
            brnz,pt %g2, .L16
     .L900000128:                        ; CC-5.3-sparcv9
            add   %o1,%i3,%o1
            xor   %l0,%o2,%l0
            add   %i2,%i4,%i2
            addcc %o4,-1,%o4
            bne,a,pt %xcc,.L900000128
   # P4-2.6GHz/2 maxspeed = (1.3GHz/(loop=5instr) = 260MHz)*(long=4Byte) = 1040MB/s
   # see also memspeed.gpl
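
   Note: SRC-RND measures pure latency only if aa[] forms one closed ring with
   no short cycles. A minimal sketch of one way to build such a ring (an
   illustration only; main() below uses its own prev_index based shuffle, and
   n, i, j, tmp are placeholder names here):
     for (i=0; i<n; i++) aa[i] = i;             // start from the identity
     for (i=n-1; i>0; i--) {                    // Sattolo shuffle: j strictly < i
       j = rand()%i;
       tmp = aa[i]; aa[i] = aa[j]; aa[j] = tmp;
     }
     // the result is a single n-cycle, so ll=aa[ll] touches every element
     // exactly once per round, in pseudo-random order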

   # System               Array      Rand MaxMB/s  Err    lat   remarks          # new 4x->4summed
   Nehalem-2GHz-4MB       256MB        70    6338        114ns  64bit-gcc-4.3.3 72GB -O2
   Nehalem-2GHz-4MB       256MB        70    6340        114ns  64bit-gcc-4.3.3 72GB -O1
   Nehalem-2GHz-4MB       256MB        68    1850        118ns  64bit-gcc-4.3.3 72GB -O0 more x86-instr.
   Nehalem-2GHz-4MB    2x 256MB        66    4707        122ns  64bit-gcc-4.3.3 72GB -O0 more x86-instr.
   PM- 600MHz-1MB         512kB       284     393    -    14ns  32bit-gcc-4.1.2 2GB hdparm=385MB/s 0808
   PM- 600MHz-1MB         256MB        23     382    -   174ns  32bit-gcc-4.1.2 2GB hdparm=385MB/s 0808
   PM-1400MHz-1MB          64MB        31     870    -   127ns  32bit-gcc-4.1.2 2GB
   P4-2.6GHz/2-512kB      256MB        20    1090    -   197ns  32bit-gcc-4.1.1 hdparm=1181MB/s FSB=4*100MHz Mem=266MHz*8=2128MB/s CPU=13*100MHz (default=13*200MHz) L2=512k=2^19 0808
   P4-2.6GHz/2-512kB HT 2x256MB        26    1533    -   307ns  32bit-gcc-4.1.1 hdparm=1181MB/s FSB=4*100MHz Mem=266MHz*8=2128MB/s CPU=13*100MHz (default=13*200MHz) L2=512k=2^19 0808
   SunFire-880             32MB        23     220    6   346ns  64bit-gcc-4.1.1 USparcIII 750MHz
   SunFire-880             32MB        23     232    7   333ns  64bit-CC-5.3-fast-xarch=v9
   SunFire-V490            32MB        92     798   19    84ns  64bit-gcc-4.1.1 USparcIV+ 10*150MHz L2=32MB 8-way
   DOpteron275-2.2GHz      32MB        89    2130   34    88ns  64bit-gcc-4.1.1 L2=1MB hdparm-T=1945MB/s 2.6.18 2DualCore
   DOpteron275-2.2GHz   2x 32MB       177    4129   16    88ns  64bit-gcc-4.1.1 L2=1MB hdparm-T=1945MB/s 2.6.18 2DualCore 4*44MB/s+2*85MB/s
   DOpteron275-2.2GHz   4x 32MB       340    7156   92    91ns  64bit-gcc-4.1.1 L2=1MB hdparm-T=1945MB/s 2.6.18 2DualCore
   DOpteron275-2.2GHz   6x 32MB       346    7200   98   177ns  64bit-gcc-4.1.1 L2=1MB hdparm-T=1945MB/s 2.6.18 2DualCore 4*44MB/s+2*85MB/s
   DOpteron275-2.2GHz   8x 32MB       332    7248   92   189ns  64bit-gcc-4.1.1 L2=1MB hdparm-T=1945MB/s 2.6.18 2DualCore
   DOpteron885-2.6GHz      32MB        35     838   21   223ns  64bit-gcc-4.1.0 8user hdparm-T=1660MB/s BW=6.4GB/s(1CPU)
   4HOpteron8431-2.4GHz 8x 32MB      8*79  8*1820    -   101ns  64bit-gcc-4.1.2 3user hdparm-T=5800MB/s (summed)
   Xeon-2333MHz-4MB        32MB        69    3295        116ns  64bit-gcc-4.1.1-long thor model=15 stepping=4 i5000P (maxBW=21GB/s) 2Quad
   Xeon-2333MHz-4MB     4x 32MB      4*62  4*1466        131ns  64bit-gcc-4.1.1-long thor model=15 stepping=4 i5000P (maxBW=21GB/s) 2Quad
   Xeon-2333MHz-4MB     8x 32MB      8*55   8*734        144ns  64bit-gcc-4.1.1-long thor model=15 stepping=4 i5000P (maxBW=21GB/s) 2Quad
   Xeon-2333MHz-4MB    1x 512MB        60    3269        133ns  64bit-gcc-4.1.1-long thor model=15 stepping=4 i5000P (maxBW=21GB/s) 2Quad
   Xeon-2333MHz-4MB    8x 512MB      8*39   8*735        204ns  64bit-gcc-4.1.1-long thor model=15 stepping=4 i5000P (maxBW=21GB/s) 2Quad Aug08
   alpha-gs160-731MHz      16MB        20     295   49   397ns  64bit-gcc-4.1.1-long
   alpha-gs160-731MHz      16MB        20     295   49   397ns  64bit-cxx-6.5-long
   alpha-gs1280            32MB        31     453   50     -    64bit-cxx-6.5-long (32cpus_per_rad) 0user
   alpha-gs1280            32MB        46     644   22   170ns  64bit-cxx-6.5-long (8cpus_per_rad) 8user
   alpha-gs1280            32MB        46     339  170   170ns  64bit-gcc-4.1.1-long (8cpus_per_rad) 8user
   alpha-gs1280            32MB        77    1109    9   101ns  64bit-cxx-6.5-long (1cpus_per_rad) 0user
   alpha-gs1280         32x32MB      2432   14400 1600     -    64bit-cxx-6.5-long (32cpus_per_rad) 0user
   alpha-gs1280         32x32MB      2432   35520  640   102ns  64bit-cxx-6.5-long (1cpus_per_rad) 0user
   alpha-es45-1250MHz      32MB        38     462   56   204ns  64bit-cxx-6.5-long 4user
   altix330-IA64-1.5GHz    32MB        22     309   12   349ns  64bit-gcc-4.1.1 Apr07
   altix330-IA64-1.5GHz 8x 32MB       168    2304   80   970ns  64bit-gcc-4.1.1 Apr07
   altix330-IA64-1.5GHz 15x32MB       660    9300  300   176ns  64bit-gcc-4.1.1 Apr07
   altix330-IA64-1.5GHz    32MB        54     670   14   143ns  numactl --localalloc
   altix330-IA64-1.5GHz 8x 32MB       392    4760  360   158ns  numactl --localalloc
   altix330-IA64-1.5GHz 15x32MB       735   10050  300   158ns  numactl --localalloc
   altix330-IA64-1.5GHz 8x512MB       418    5750        153ns  min= 8*52MB/s aff+local
   altix330-IA64-1.5GHz 8x128MB       427    5748        150ns  min= 8*53MB/s aff+local
   altix330-IA64-1.5GHz 2x128MB        91     709        174ns  min= 2*45MB/s c0,1+local
   altix330-IA64-1.5GHz 2x128MB       107    1438        150ns  min= 2*53MB/s aff0,8
   altix330-IA64-1.5GHz 2x512MB        89     710        179ns  min= 2*45MB/s c0,1+local
   altix330-IA64-1.5G  16x128MB       448    6928        285ns  min=16*28MB/s localalloc
   altix330-IA64-1.5G  16x128MB       700   10800        182ns  min=16*44MB/s localalloc
   altix330-IA64-1.5G  16x128MB       729   11335        182ns  min=16*45MB/s aff+lalloc
   altix330-IA64-1.5G  16x512MB       480    7456        264ns  min=16*30MB/s localalloc
   # check with linkstat!
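
   # How these numbers relate to the program output (formulas taken from the
   # printf calls in main below): a task reading an array of a2 longs "loops"
   # times in t2 seconds reports
   #    BW[MB/s]  = a2*sizeof(long)*loops/t2 * 1e-6   (one task)
   #    aBW[MB/s] = mpi_n * BW[MB/s]                  (all tasks summed)
   #    lat[ns]   = t2*1e9/(loops*a2)                 (time per single access)
   # e.g. the first Nehalem line above: 8 Byte / 114 ns = 70 MB/s random rate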

   # ----------- old data (step only) --------------------
   # System               [RAM]    Array   Min MaxMB/s  Err ln2MinStep   # ---- old(!) ----
   PM- 600MHz-1MB         512MB     16MB    52     364   16  12
   PM-1400MHz-1MB         512MB     16MB    50     364    8  12
   Xeon-3GHz-1MB            2GB    256MB    39    1800   48  16  32bit-gcc-2.96 connect model=3 stepping=4 hdparm=800MB/s busclock=200MHz
   Xeon-3GHz-2MB            2GB    256MB    44    2700   57  20  32bit-gcc-4.0.1 bb model=4 stepping=3 hdparm=1472MB/s
   Xeon-3GHz-2MB           12GB    256MB    39    1422   31  20  64bit-gcc-4.1.1 morus model=4 stepping=10 memtest86+1.65=1257MB/s hdparm=1286MB/s
   Xeon-3GHz-2MB           12GB    512MB    38    1450   31  20  32bit-gcc-4.1.1-long morus model=4 stepping=10 memtest86+1.65=1257MB/s hdparm=1286MB/s
   Xeon-3GHz-2MB           12GB   8192MB    80    1170   23  19  64bit-gcc-4.1.1-long morus model=4 stepping=10 memtest86+1.65=1257MB/s hdparm=1286MB/s
   alpha-gs1280           128GB     32MB    36    1117   39  15  64bit-gcc-3.4.3-long (32cpus_per_rad) 2/82ns==25MB/s
   alpha-gs1280           128GB     32MB    41    2485   45  15  64bit-cxx-6.5-long (32cpus_per_rad) (32x2.5GB/s=80GB/s local) remote.maxBW=6.3GB/s local.maxBW=12.3GB/s(767MHz*8channels*2Byte) 8B/(82ns..250ns)=>32..98MB/s
   alpha-gs1280           128GB     32MB    67    1117   39  16  64bit-gcc-3.4.3-long (1cpus_per_rad) 2/82ns==25MB/s
   alpha-gs1280           128GB     32MB    61    2485   47  16  64bit-cxx-6.5-long (1cpus_per_rad) (32x2.5GB/s=80GB/s local)
   alpha-gs1280           128GB     16GB    24     572    7  23  64bit-cxx-6.5-long (8cpus_per_rad)
   alpha-gs1280           128GB     16GB    29    1094   24  23  64bit-gcc-4.1.1-long (8cpus_per_rad)
   alpha-es45-1250MHz                16MB    87     818   20  11  cxx-int? 4/5CPU
   DOpteron275-2.2GHz-1MB            16MB    35    1500   34  13  64bit-gcc-3.4.6 hdparm-T=1900MB/s
   DOpteron885-2.6GHz-1MB           256MB    38    1120   39  15  64bit-gcc-4.0.3 hdparm-T=1660MB/s BW=6.4GB/s(1CPU)
   DOpteron885-2.6GHz-1MB           512MB    68    2300   82  15  64bit-gcc-4.0.3-long hdparm-T=1660MB/s
   DOpteron885-2.6GHz-1MB         512MB*4    46    2220   77  15  CPU3-6! 64bit-gcc-4.0.3-long hdparm-T=1660MB/s MinMax for 2 jobs etc.
   # linkstat? cpu.model=generation stepping=version (same mask)

   ToDo: try __*_prefetch and improve loop (tune for generated asm-code)
*/
#define _GNU_SOURCE 1    // this defines __USE_GNU in features.h for sched.h
#include <stdio.h>       // printf
#include <stdlib.h>      // malloc, atoi, rand, exit
#include <time.h>        // time() as fallback timer for CFG_noMPI

#ifdef CFG_noMPI         // for the case of pure SMP where MPI is not available
#define MPI_Init(a,b) 1
#define MPI_SUCCESS 1
#define MPI_Comm_size(MPI_COMM_WORLD, mpi_n)
#define MPI_Comm_rank(MPI_COMM_WORLD, mpi_i)
#define MPI_Barrier(MPI_COMM_WORLD)
#define MPI_Bcast(buf, count, type, root, comm)  // stub: one task, nothing to broadcast
#define MPI_Wtime() ((double)time(NULL))
#define MPI_Finalize()
#define MPI_Abort(MPI_COMM_WORLD,xx) exit(xx)
#else
#include <mpi.h>
#endif

#ifdef CFG_CPUSET        // switch on if sched_setaffinity will benefit
# include <unistd.h>     // declare sleep()
# include <sched.h>      // declare sched_setaffinity()
# ifndef _FEATURES_H
#  warning "_FEATURES_H is not defined, try to add #include <features.h>"
# endif
# ifndef __USE_GNU
#  warning "expect __USE_GNU defined, but it isn't, try -D_GNU_SOURCE=1"
# endif
#endif
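
/* Example build lines for the two optional switches above; only the plain
   mpicc line is given in the INSTALL notes, the SMP-only gcc invocation and
   the 0x0f mask are illustrative assumptions:
     mpicc -D CFG_CPUSET=0x0f -O2 -o memspeed memspeed.c  # bind tasks to cores 0..3
     gcc   -D CFG_noMPI       -O2 -o memspeed memspeed.c  # single-task build without MPI
*/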

long int *aa, alen;              // long array and number of elements in it

void check_ring( long int a2, int vvv, int mpi_i )
{
  long int k, l;
  // check that we still have a full ring (walking a2 links returns to index 0)
  if (vvv && !mpi_i) printf("# chain:");
  for (l=0, k=0; k<a2; k++) {
    if (vvv && !mpi_i && k<16) printf(" %ld", l);  // show the first few links
    l = aa[l];
  }
  if (vvv && !mpi_i) printf(" ...\n");
  if (l != 0) {
    printf("# task %d: broken chain\n", mpi_i);
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
}

int main( int argn, char *argv[] )
{
  long int a2, k, ll=0, loops, i, j, x=0, r00=0;
  int log2size=27, t=2;          // assumed defaults if no arguments are given
  int log2long, mpi_n=1, mpi_i=0;
  double t1, t2;
#ifdef CFG_CPUSET
  cpu_set_t cpuset;
  int ii, in=0, ncores=0;
#endif

  if (MPI_Init(&argn, &argv) != MPI_SUCCESS) exit(1);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_n);
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_i);

  if (argn>1) log2size=atoi(argv[1]);
  if (log2size<10) log2size=10;  // min 1K
  if (argn>2) t=atoi(argv[2]);
  if (t<0) t=0;
  for (log2long=0; (1L<<log2long) < (long)sizeof(long int); log2long++) ;  // log2(sizeof(long))
  alen = 1L<<(log2size-log2long);               // elements per task = 2^log2size Bytes
  aa = (long int *)malloc(alen*sizeof(long int));
  if (!aa) { printf("# task %d: malloc failed\n", mpi_i); MPI_Abort(MPI_COMM_WORLD, 1); }

#ifdef CFG_CPUSET
  for (ii=0; CFG_CPUSET>>ii; ii++) if (1&(CFG_CPUSET>>ii)) ncores++;  // count usable cores
  for (ii=0; CFG_CPUSET>>ii; ii++)
    if (1&(CFG_CPUSET>>ii)) {
      if (mpi_i%ncores==in) break;              // mycore = 0,1,2,...,0,1,2,... (--byslot)
      in++;
    }
  CPU_ZERO(&cpuset);
  CPU_SET(ii, &cpuset);
  sleep(1);
  if (mpi_i<ncores) printf("# task %d bound to core %d\n", mpi_i, ii);
  if (sched_setaffinity(0, sizeof(cpuset), &cpuset))
    printf("# task %d: sched_setaffinity failed\n", mpi_i);
#endif

  MPI_Barrier(MPI_COMM_WORLD);
  if (!mpi_i) printf("# Tasks Mem/Task[kB] loops t[s] aBW[MB/s] BW[MB/s] lat[ns]\n");
  // measure memory latency, worst case: walk a random pointer chain (SRC-RND)
  for (a2=alen; a2>>5; a2>>=1) {                // reduce array size step by step
    // it's tricky to get a random "next_index" table without shortcuts (short cycles);
    // we need a 2nd array for prev_index: unlink random elements from the
    // ordered ring and re-insert them elsewhere, so the ring stays closed
    long int *prev = (long int *)malloc(a2*sizeof(long int));
    for (k=0; k<a2; k++) aa[k] = (k+1)%a2;      // ordered ring to start from
    if (prev) {
      for (k=0; k<a2; k++) prev[aa[k]] = k;
      for (k=0; k<a2; k++) {
        long int u = rand()%a2, v = rand()%a2;
        if (u==v) continue;
        aa[prev[u]] = aa[u]; prev[aa[u]] = prev[u];   // unlink u
        aa[u] = aa[v]; prev[aa[v]] = u;               // re-insert u behind v
        aa[v] = u;     prev[u] = v;
      }
      free(prev);
    }
    check_ring(a2, (a2==alen), mpi_i);          // verify: still one full ring
    loops=0;
    t1=MPI_Wtime();
    for (i=1; i<(1l<<30); i*=2) {
      j=i;
      do {                                      // repeat until some seconds are needed
        ll=0; k=a2;
        do { ll=aa[ll]; } while(--k);           // SRC-RND, see header comment
      } while(--j);
      loops+=i;
      t2=MPI_Wtime()-t1;
      MPI_Bcast(&t2, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
      if ((t2>t || t2>10) && !mpi_i)            // alive and final message
        printf("%c%3d %9g %9ld %5.2f %9.2f %8.2f %6.1f random\n",
               ((t2>t)?' ':'#'), mpi_n, a2*sizeof(long int)/(1024.), loops, t2,
               mpi_n*a2*sizeof(long)*1e-6*loops/t2,
               a2*sizeof(long)*1e-6*loops/t2, t2*1e9/(loops*1.*a2));
      // ll must be used, some compilers remove the mem access otherwise
      if (t2>t) break;                          // all tasks leave together, t2 was broadcast
    }
  }
  r00=ll;                                       // avoid optimizations which skip the loop

  MPI_Barrier(MPI_COMM_WORLD);
  if (!mpi_i) printf("# Tasks Mem/Task[kB] loops t[s] aBW[MB/s] BW[MB/s] lat[ns]\n");
  // measure memory throughput, best case: linear stream with stride 1 (SRC-STR)
  for (a2=alen; a2>>5; a2>>=1) {                // reduce msgsize
    long int *ap=aa;                            // moving pointer
    loops=0;
    t1=MPI_Wtime();
    for (i=1; i<(1l<<30); i*=2) {
      j=i;
      do {                                      // repeat until some seconds are needed
        ap=aa; k=a2;                            // gcc-4.1 x86: 2 instr.
        do {
          x^=*ap;                               // gcc-4.1 x86: 1 instr. + latency
          ap++;                                 // gcc-4.1 x86: 1 instr.
          // __builtin_prefetch(aa);            // gcc-4.1 x86 -msse2: 2 instr.
          // ToDo: better prefetch every 4th access only + unroll loop
        } while(--k);                           // gcc-4.1 x86: 2 instr.
      } while(--j);                             // gcc-4.1 x86: 2 instr.
      loops+=i;
      t2=MPI_Wtime()-t1;
      MPI_Bcast(&t2, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
      if ((t2>t || t2>10) && !mpi_i)            // alive and final message
        printf("%c%3d %9g %9ld %5.2f %9.2f %8.2f %6.1f lstream\n",
               ((t2>t)?' ':'#'), mpi_n, a2/(1024.)*sizeof(long int), loops, t2,
               mpi_n*a2*1e-6*sizeof(long int)*loops/t2,
               a2*1e-6*sizeof(long int)*loops/t2, t2*1e9/(loops*1.*a2));
      // x must be used, some compilers remove the mem access otherwise
      if (t2>t) break;                          // all tasks leave together, t2 was broadcast
    }
  }
  MPI_Finalize();
  return x+r00;                                 // use x and r00 so the loops are not optimized away
}
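
/* Sketch for the prefetch/unroll ToDo in the stream loop above (an untested
   illustration, not part of the measurement; assumes a2 is a multiple of 4
   and a prefetch distance of 8 longs, i.e. one 64-Byte cache line ahead):
     long *ap = aa, k = a2/4, x = 0;
     do {
       __builtin_prefetch(ap + 8);              // fetch the next line early
       x ^= ap[0]; x ^= ap[1]; x ^= ap[2]; x ^= ap[3];
       ap += 4;                                 // prefetch only every 4th access
     } while (--k);
*/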