diff -ru ATLAS/CONFIG/include/atlconf.h ATLAS-3.8.4-arm/CONFIG/include/atlconf.h --- ATLAS/CONFIG/include/atlconf.h 2011-05-14 19:33:24.000000000 +0200 +++ ATLAS-3.8.4-arm/CONFIG/include/atlconf.h 2011-05-16 13:50:44.000000000 +0200 @@ -14,9 +14,9 @@ OSWin9x, OSWinNT, OSHPUX, OSFreeBSD, OSOSX}; #define OSIsWin(OS_) (((OS_) == OSWinNT) || ((OS_) == OSWin9x)) -enum ARCHFAM {AFOther=0, AFPPC, AFSPARC, AFALPHA, AFX86, AFIA64, AFMIPS}; +enum ARCHFAM {AFOther=0, AFPPC, AFSPARC, AFALPHA, AFX86, AFIA64, AFMIPS, AFARM}; -#define NMACH 37 +#define NMACH 39 static char *machnam[NMACH] = {"UNKNOWN", "POWER3", "POWER4", "POWER5", "PPCG4", "PPCG5", "POWER6", "POWER7", @@ -25,7 +25,7 @@ "Efficeon", "K7", "HAMMER", "AMD64K10h", "UNKNOWNx86", "IA64Itan", "IA64Itan2", "USI", "USII", "USIII", "USIV", "UST2", "UnknownUS", - "MIPSR1xK", "MIPSICE9"}; + "MIPSR1xK", "MIPSICE9", "ARMv7VFP", "ARMv7VFPNEON"}; enum MACHTYPE {MACHOther, IbmPwr3, IbmPwr4, IbmPwr5, PPCG4, PPCG5, IbmPwr6, IbmPwr7, IntP5, IntP5MMX, IntPPRO, IntPII, IntPIII, IntPM, IntCoreS, @@ -34,7 +34,8 @@ IA64Itan, IA64Itan2, SunUSI, SunUSII, SunUSIII, SunUSIV, SunUST2, SunUSX, MIPSR1xK, /* includes R10K, R12K, R14K, R16K */ - MIPSICE9 /* SiCortex ICE9 -- like MIPS5K */ + MIPSICE9, /* SiCortex ICE9 -- like MIPS5K */ + ARMv7VFP, ARMv7VFPNEON }; #define MachIsX86(mach_) \ ( (mach_) >= IntP5 && (mach_) <= x86X ) @@ -51,6 +52,8 @@ #endif #define MachIsPPC(mach_) \ ( (mach_) >= PPCG4 && (mach_) <= PPCG5 ) +#define MachIsARM(mach_) \ + ( (mach_) >= ARMv7VFP && (mach_) <= ARMv7VFPNEON ) static char *f2c_namestr[5] = {"UNKNOWN","Add_", "Add__", "NoChange", "UpCase"}; static char *f2c_intstr[5] = @@ -63,18 +66,18 @@ enum F2CINT {f2c_IntErr=0, FintCint, FintClong, FintClonglong, FintCshort}; enum F2CSTRING {f2c_StrErr=0, fstrSun, fstrCray, fstrStructVal, fstrStructPtr}; -#define NISA 6 +#define NISA 7 static char *ISAXNAM[NISA] = - {"", "AltiVec", "SSE3", "SSE2", "SSE1", "3DNow"}; -enum ISAEXT {ISA_None=0, ISA_AV, ISA_SSE3, ISA_SSE2, 
ISA_SSE1, ISA_3DNow}; + {"", "AltiVec", "SSE3", "SSE2", "SSE1", "3DNow", "NEON"}; +enum ISAEXT {ISA_None=0, ISA_AV, ISA_SSE3, ISA_SSE2, ISA_SSE1, ISA_3DNow, ISA_NEON}; -#define NASMD 7 +#define NASMD 8 enum ASMDIA {ASM_None=0, gas_x86_32, gas_x86_64, gas_sparc, gas_ppc, gas_parisc, - gas_mips}; + gas_mips, gas_arm}; static char *ASMNAM[NASMD] = {"", "GAS_x8632", "GAS_x8664", "GAS_SPARC", "GAS_PPC", "GAS_PARISC", - "GAS_MIPS"}; + "GAS_MIPS", "GAS_ARM"}; /* diff -ru ATLAS/CONFIG/src/Makefile ATLAS-3.8.4-arm/CONFIG/src/Makefile --- ATLAS/CONFIG/src/Makefile 2011-05-14 19:33:24.000000000 +0200 +++ ATLAS-3.8.4-arm/CONFIG/src/Makefile 2011-04-20 16:43:26.000000000 +0200 @@ -151,6 +151,11 @@ $(MAKE) $(atlrun) atldir=$(mydir) exe=xprobe_sse3 args="$(args)" \ redir=config0.out - cat config0.out +IRun_NEON : + $(CC) $(CCFLAGS) -mfpu=neon -o xprobe_neon $(SRCdir)/backend/probe_svec.c $(SRCdir)/backend/probe_NEON.S + $(MAKE) $(atlrun) atldir=$(mydir) exe=xprobe_neon args="$(args)" \ + redir=config0.out + - cat config0.out IRun_GAS_SPARC : $(CC) $(CCFLAGS) -o xprobe_gas_sparc $(SRCdir)/backend/probe_this_asm.c $(SRCdir)/backend/probe_gas_sparc.S @@ -177,6 +182,11 @@ $(MAKE) $(atlrun) atldir=$(mydir) exe=xprobe_gas_x8632 args="$(args)" \ redir=config0.out - cat config0.out +IRun_GAS_ARM : + $(CC) $(CCFLAGS) -o xprobe_gas_arm $(SRCdir)/backend/probe_this_asm.c $(SRCdir)/backend/probe_gas_arm.S + $(MAKE) $(atlrun) atldir=$(mydir) exe=xprobe_gas_arm args="$(args)" \ + redir=config0.out + - cat config0.out IRunC2C : - rm -f config0.out xc2c c2cslave.o diff -ru ATLAS/CONFIG/src/atlcomp.txt ATLAS-3.8.4-arm/CONFIG/src/atlcomp.txt --- ATLAS/CONFIG/src/atlcomp.txt 2011-05-14 19:33:24.000000000 +0200 +++ ATLAS-3.8.4-arm/CONFIG/src/atlcomp.txt 2012-05-16 19:47:42.000000000 +0200 @@ -164,6 +164,19 @@ MACH=P4,PM OS=WinNT LVL=0 COMPS=icc,dmc,smc,dkc,skc,xcc 'icl' '-QxN -O3 -Qprec -fp:extended -fp:except -nologo -Oy' # +# ARM defaults +# +MACH=ARMv7VFP,ARMv7VFPNEON OS=ALL LVL=1000 
COMPS=icc,xcc + 'gcc' '-march=armv7-a -O1' +MACH=ARMv7VFPNEON OS=ALL LVL=1000 COMPS=smc,skc + 'gcc' '-march=armv7-a -mfpu=neon -ffast-math -O3 -fschedule-insns -fschedule-insns2 -fprefetch-loop-arrays' +MACH=ARMv7VFP OS=ALL LVL=1000 COMPS=smc,skc + 'gcc' '-march=armv7-a -mfpu=vfpv3 -O1 -fno-expensive-optimizations' +MACH=ARMv7VFP,ARMv7VFPNEON OS=ALL LVL=1000 COMPS=dmc,dkc + 'gcc' '-march=armv7-a -mfpu=vfpv3 -O1 -fno-schedule-insns2' +MACH=ARMv7VFP,ARMv7VFPNEON OS=ALL LVL=1000 COMPS=f77 + 'gfortran' '-march=armv7-a -O1' +# # Generic defaults # MACH=ALL OS=ALL LVL=5 COMPS=icc,smc,dmc,skc,dkc,xcc diff -ru ATLAS/CONFIG/src/atlconf_misc.c ATLAS-3.8.4-arm/CONFIG/src/atlconf_misc.c --- ATLAS/CONFIG/src/atlconf_misc.c 2011-05-14 19:33:24.000000000 +0200 +++ ATLAS-3.8.4-arm/CONFIG/src/atlconf_misc.c 2011-04-20 16:43:26.000000000 +0200 @@ -480,6 +480,7 @@ else if (strstr(res, "alpha")) fam = AFALPHA; else if (strstr(res, "ia64")) fam = AFIA64; else if (strstr(res, "mips")) fam = AFMIPS; + else if (strstr(res, "arm")) fam = AFARM; else if ( strstr(res, "i686") || strstr(res, "i586") || strstr(res, "i486") || strstr(res, "i386") || strstr(res, "x86") || strstr(res, "x86_64") ) fam = AFX86; @@ -501,6 +502,7 @@ strstr(res, "i486") || strstr(res, "i386") || strstr(res, "x86_64") ) fam = AFX86; else if (strstr(res, "mips")) fam = AFMIPS; + else if (strstr(res, "arm")) fam = AFARM; } } return(fam); diff -ru ATLAS/CONFIG/src/backend/archinfo_linux.c ATLAS-3.8.4-arm/CONFIG/src/backend/archinfo_linux.c --- ATLAS/CONFIG/src/backend/archinfo_linux.c 2011-05-14 19:33:24.000000000 +0200 +++ ATLAS-3.8.4-arm/CONFIG/src/backend/archinfo_linux.c 2011-05-16 14:19:02.000000000 +0200 @@ -166,6 +166,20 @@ else if (strstr(res, "AMD-K7")) mach = AmdAthlon; } break; + case AFARM: + if ( !CmndOneLine(NULL, "fgrep 'Processor' /proc/cpuinfo", res) ) + { + if (strstr(res, "ARMv7") || strstr(res,"v7l")) mach = ARMv7VFP; + } + else if ( !CmndOneLine(NULL, "fgrep 'cpu' /proc/cpuinfo", res) ) + { + if 
(strstr(res, "ARMv7") || strstr(res,"v7l")) mach = ARMv7VFP; + } + if ( !CmndOneLine(NULL, "fgrep 'Features' /proc/cpuinfo", res) ) + { + if (strstr(res, "neon")) mach = ARMv7VFPNEON; + } + break; /* * Add these back if we get machine access and can test */ diff -ru ATLAS/README ATLAS-3.8.4-arm/README --- ATLAS/README 2011-05-14 19:33:23.000000000 +0200 +++ ATLAS-3.8.4-arm/README 2012-06-18 23:19:46.000000000 +0200 @@ -1,3 +1,19 @@ +ATLAS-3.8.4-arm +released 18 June 2012 + +Changes since beta 1: + + - assembly headers changed to link properly on Ubuntu 11.10 and 12.04 + - VFP single-precision GEMM by Clint Whaley backported from ATLAS 3.9.x + +Changes since beta 2: + + - VFP assembly GEMM kernels added for hard floating-point ABI (like Ubuntu 12.04) + +Changes since beta 3: + (none) + + This is the README index file. If you want install intructions, read ATLAS/INSTALL.txt. Windows users should read ATLAS/doc/Windows.txt as well. Note that all of this documentation is diff -ru ATLAS/tune/blas/gemm/CASES/ccases.flg ATLAS-3.8.4-arm/tune/blas/gemm/CASES/ccases.flg --- ATLAS/tune/blas/gemm/CASES/ccases.flg 2011-05-14 19:34:10.000000000 +0200 +++ ATLAS-3.8.4-arm/tune/blas/gemm/CASES/ccases.flg 2012-06-03 14:25:17.000000000 +0200 @@ -1,5 +1,5 @@ "" -20 +25 304 192 4 3 8 0 4 4 3 8 ATL_mm4x3x8p.c "R. Clint Whaley" \ gcc -mcpu=ultrasparc -mtune=ultrasparc -fomit-frame-pointer -O @@ -52,3 +52,18 @@ gcc -x assembler-with-cpp -mips4 332 192 8 2 4 1 0 8 2 4 ATL_smm8x2x4_av.c "IBM" +333 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2pf_arm.c "R. Clint Whaley" \ +gcc +-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a +334 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2dld_arm.c "R. Clint Whaley" \ +gcc +-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a +335 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2pf_armhf.c "R. Clint Whaley" \ +gcc +-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a +336 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2dld_armhf.c "R. 
Clint Whaley" \ +gcc +-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a +337 480 4 4 4 1 1 4 4 4 ATL_smm4x4x4p_neon.c "Vesperix Corporation" \ +gcc +-march=armv7-a -mfpu=neon -O2 diff -ru ATLAS/tune/blas/gemm/CASES/dcases.flg ATLAS-3.8.4-arm/tune/blas/gemm/CASES/dcases.flg --- ATLAS/tune/blas/gemm/CASES/dcases.flg 2011-05-14 19:34:10.000000000 +0200 +++ ATLAS-3.8.4-arm/tune/blas/gemm/CASES/dcases.flg 2012-06-03 14:23:02.000000000 +0200 @@ -1,5 +1,5 @@ "" -31 +33 306 192 4 3 8 0 4 4 3 8 ATL_mm4x3x8p.c "R. Clint Whaley" \ gcc -mcpu=ultrasparc -mtune=ultrasparc -fomit-frame-pointer -O -fno-schedule-insns -fno-schedule-insns2 @@ -85,6 +85,11 @@ 338 192 8 4 2 1 0 8 4 2 ATL_dmm8x4x2_vsx.c "IBM" \ gcc -O3 -mvsx - +339 448 4 4 2 1 1 4 4 2 ATL_dmm4x4x2pf_arm.c "R. Clint Whaley" \ +gcc +-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a +340 448 4 4 2 1 1 4 4 2 ATL_dmm4x4x2pf_armhf.c "R. Clint Whaley" \ +gcc +-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a # NB = 80 and NB = 120 are to force UltraSparc to try NB > L1 Cache, # needed for optimal performance on Ultra2 and Ultra4, respectively diff -ru ATLAS/tune/blas/gemm/CASES/scases.flg ATLAS-3.8.4-arm/tune/blas/gemm/CASES/scases.flg --- ATLAS/tune/blas/gemm/CASES/scases.flg 2011-05-14 19:34:10.000000000 +0200 +++ ATLAS-3.8.4-arm/tune/blas/gemm/CASES/scases.flg 2012-06-03 14:25:48.000000000 +0200 @@ -1,5 +1,5 @@ "" -22 +27 304 192 4 3 8 0 4 4 3 8 ATL_mm4x3x8p.c "R. Clint Whaley" \ gcc -mcpu=ultrasparc -mtune=ultrasparc -fomit-frame-pointer -O @@ -58,3 +58,18 @@ gcc -x assembler-with-cpp -mips4 332 192 8 2 4 1 0 8 2 4 ATL_smm8x2x4_av.c "IBM" +333 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2pf_arm.c "R. Clint Whaley" \ +gcc +-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a +334 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2dld_arm.c "R. Clint Whaley" \ +gcc +-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a +335 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2pf_armhf.c "R. 
Clint Whaley" \ +gcc +-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a +336 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2dld_armhf.c "R. Clint Whaley" \ +gcc +-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a +337 480 4 4 4 1 1 4 4 4 ATL_smm4x4x4p_neon.c "Vesperix Corporation" \ +gcc +-march=armv7-a -mfpu=neon -O2 diff -ru ATLAS/tune/blas/gemm/CASES/zcases.flg ATLAS-3.8.4-arm/tune/blas/gemm/CASES/zcases.flg --- ATLAS/tune/blas/gemm/CASES/zcases.flg 2011-05-14 19:34:10.000000000 +0200 +++ ATLAS-3.8.4-arm/tune/blas/gemm/CASES/zcases.flg 2012-06-03 14:23:36.000000000 +0200 @@ -1,5 +1,5 @@ "" -30 +32 306 192 4 3 8 0 4 4 3 8 ATL_mm4x3x8p.c "R. Clint Whaley" \ gcc -mcpu=ultrasparc -mtune=ultrasparc -fomit-frame-pointer -O -fno-schedule-insns -fno-schedule-insns2 @@ -82,6 +82,11 @@ 338 192 8 4 2 1 0 8 4 2 ATL_dmm8x4x2_vsx.c "IBM" \ gcc -O3 -mvsx - +339 448 4 4 2 1 1 4 4 2 ATL_dmm4x4x2pf_arm.c "R. Clint Whaley" \ +gcc +-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a +340 448 4 4 2 1 1 4 4 2 ATL_dmm4x4x2pf_armhf.c "R. 
Clint Whaley" \ +gcc +-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a # NB = 80 and NB = 120 are to force UltraSparc to try NB > L1 Cache, # needed for optimal performance on Ultra2 and Ultra4, respectively diff -ru ATLAS/tune/sysinfo/ATL_cputime.c ATLAS-3.8.4-arm/tune/sysinfo/ATL_cputime.c --- ATLAS/tune/sysinfo/ATL_cputime.c 2011-05-14 19:34:09.000000000 +0200 +++ ATLAS-3.8.4-arm/tune/sysinfo/ATL_cputime.c 2012-05-05 18:16:58.000000000 +0200 @@ -48,6 +48,26 @@ t0 = clock(); return(0.0); } +#elif defined(POSIX_HR) /* use the POSIX HR timers */ + #include <time.h> + double ATL_cputime(void) /* process CPU seconds since the first call (first call returns 0.0) */ + { + struct timespec ts; + static double t0; + double res; + static int INIT = 0; + + if (INIT) + { + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); + res = ts.tv_sec + 1.0e-9 * ts.tv_nsec; + return(res - t0); + } + clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&ts); + t0 = ts.tv_sec + 1.0e-9 * ts.tv_nsec; + INIT = 1; + return(0.0); + } #elif defined(UseTimes) #include #include diff -ru ATLAS/tune/sysinfo/ATL_walltime.c ATLAS-3.8.4-arm/tune/sysinfo/ATL_walltime.c --- ATLAS/tune/sysinfo/ATL_walltime.c 2011-05-14 19:34:09.000000000 +0200 +++ ATLAS-3.8.4-arm/tune/sysinfo/ATL_walltime.c 2012-05-05 18:23:20.000000000 +0200 @@ -79,6 +79,26 @@ myout = (myout<<32) | msout.LowPart; return(myout*freqRecip); } +#elif defined(POSIX_HR) /* use the POSIX HR timers */ + #include <time.h> + double ATL_walltime(void) /* CLOCK_REALTIME seconds since the first call (first call returns 0.0) */ + { + struct timespec ts; + static double t0; + double res; + static int INIT = 0; + + if (INIT) + { + clock_gettime(CLOCK_REALTIME, &ts); + res = ts.tv_sec + 1.0e-9 * ts.tv_nsec; + return(res - t0); + } + clock_gettime(CLOCK_REALTIME,&ts); + t0 = ts.tv_sec + 1.0e-9 * ts.tv_nsec; + INIT = 1; + return(0.0); + } #elif defined(UseTimes) #include #include @@ -97,6 +117,26 @@ { return(gethrtime()*1.0e-9); } +#elif defined(POSIX_HR) /* use the POSIX HR timers */ + #include <time.h> + double ATL_walltime(void) + { + struct timespec ts; + static double t0; + double res; + static int INIT = 0; + + if (INIT) + { + 
clock_gettime(CLOCK_REALTIME, &ts); + res = ts.tv_sec + 1.0e-9 * ts.tv_nsec; + return(res - t0); + } + clock_gettime(CLOCK_REALTIME, &ts); + t0 = ts.tv_sec + 1.0e-9 * ts.tv_nsec; + INIT = 1; + return(0.0); + } /* * Without gcc, I know no standard Windows wall-timer, so use cputime */