libatlas/atlas-3.8.4-arm.patch

376 lines
14 KiB
Diff

diff -ru ATLAS/CONFIG/include/atlconf.h ATLAS-3.8.4-arm/CONFIG/include/atlconf.h
--- ATLAS/CONFIG/include/atlconf.h 2011-05-14 19:33:24.000000000 +0200
+++ ATLAS-3.8.4-arm/CONFIG/include/atlconf.h 2011-05-16 13:50:44.000000000 +0200
@@ -14,9 +14,9 @@
OSWin9x, OSWinNT, OSHPUX, OSFreeBSD, OSOSX};
#define OSIsWin(OS_) (((OS_) == OSWinNT) || ((OS_) == OSWin9x))
-enum ARCHFAM {AFOther=0, AFPPC, AFSPARC, AFALPHA, AFX86, AFIA64, AFMIPS};
+enum ARCHFAM {AFOther=0, AFPPC, AFSPARC, AFALPHA, AFX86, AFIA64, AFMIPS, AFARM};
-#define NMACH 37
+#define NMACH 39
static char *machnam[NMACH] =
{"UNKNOWN", "POWER3", "POWER4", "POWER5", "PPCG4", "PPCG5",
"POWER6", "POWER7",
@@ -25,7 +25,7 @@
"Efficeon", "K7", "HAMMER", "AMD64K10h", "UNKNOWNx86",
"IA64Itan", "IA64Itan2",
"USI", "USII", "USIII", "USIV", "UST2", "UnknownUS",
- "MIPSR1xK", "MIPSICE9"};
+ "MIPSR1xK", "MIPSICE9", "ARMv7VFP", "ARMv7VFPNEON"};
enum MACHTYPE {MACHOther, IbmPwr3, IbmPwr4, IbmPwr5, PPCG4, PPCG5,
IbmPwr6, IbmPwr7,
IntP5, IntP5MMX, IntPPRO, IntPII, IntPIII, IntPM, IntCoreS,
@@ -34,7 +34,8 @@
IA64Itan, IA64Itan2,
SunUSI, SunUSII, SunUSIII, SunUSIV, SunUST2, SunUSX,
MIPSR1xK, /* includes R10K, R12K, R14K, R16K */
- MIPSICE9 /* SiCortex ICE9 -- like MIPS5K */
+ MIPSICE9, /* SiCortex ICE9 -- like MIPS5K */
+ ARMv7VFP, ARMv7VFPNEON
};
#define MachIsX86(mach_) \
( (mach_) >= IntP5 && (mach_) <= x86X )
@@ -51,6 +52,8 @@
#endif
#define MachIsPPC(mach_) \
( (mach_) >= PPCG4 && (mach_) <= PPCG5 )
+#define MachIsARM(mach_) \
+ ( (mach_) >= ARMv7VFP && (mach_) <= ARMv7VFPNEON ) /* nonzero when mach_ lies in the ARMv7VFP..ARMv7VFPNEON range of enum MACHTYPE */
static char *f2c_namestr[5] = {"UNKNOWN","Add_", "Add__", "NoChange", "UpCase"};
static char *f2c_intstr[5] =
@@ -63,18 +66,18 @@
enum F2CINT {f2c_IntErr=0, FintCint, FintClong, FintClonglong, FintCshort};
enum F2CSTRING {f2c_StrErr=0, fstrSun, fstrCray, fstrStructVal, fstrStructPtr};
-#define NISA 6
+#define NISA 7
static char *ISAXNAM[NISA] =
- {"", "AltiVec", "SSE3", "SSE2", "SSE1", "3DNow"};
-enum ISAEXT {ISA_None=0, ISA_AV, ISA_SSE3, ISA_SSE2, ISA_SSE1, ISA_3DNow};
+ {"", "AltiVec", "SSE3", "SSE2", "SSE1", "3DNow", "NEON"};
+enum ISAEXT {ISA_None=0, ISA_AV, ISA_SSE3, ISA_SSE2, ISA_SSE1, ISA_3DNow, ISA_NEON};
-#define NASMD 7
+#define NASMD 8
enum ASMDIA
{ASM_None=0, gas_x86_32, gas_x86_64, gas_sparc, gas_ppc, gas_parisc,
- gas_mips};
+ gas_mips, gas_arm};
static char *ASMNAM[NASMD] =
{"", "GAS_x8632", "GAS_x8664", "GAS_SPARC", "GAS_PPC", "GAS_PARISC",
- "GAS_MIPS"};
+ "GAS_MIPS", "GAS_ARM"};
/*
diff -ru ATLAS/CONFIG/src/Makefile ATLAS-3.8.4-arm/CONFIG/src/Makefile
--- ATLAS/CONFIG/src/Makefile 2011-05-14 19:33:24.000000000 +0200
+++ ATLAS-3.8.4-arm/CONFIG/src/Makefile 2011-04-20 16:43:26.000000000 +0200
@@ -151,6 +151,11 @@
$(MAKE) $(atlrun) atldir=$(mydir) exe=xprobe_sse3 args="$(args)" \
redir=config0.out
- cat config0.out
+IRun_NEON : # NEON probe: builds xprobe_neon with -mfpu=neon, runs a sub-make with the goal named by atlrun (output redirected to config0.out), then prints config0.out
+ $(CC) $(CCFLAGS) -mfpu=neon -o xprobe_neon $(SRCdir)/backend/probe_svec.c $(SRCdir)/backend/probe_NEON.S
+ $(MAKE) $(atlrun) atldir=$(mydir) exe=xprobe_neon args="$(args)" \
+ redir=config0.out
+ - cat config0.out
IRun_GAS_SPARC :
$(CC) $(CCFLAGS) -o xprobe_gas_sparc $(SRCdir)/backend/probe_this_asm.c $(SRCdir)/backend/probe_gas_sparc.S
@@ -177,6 +182,11 @@
$(MAKE) $(atlrun) atldir=$(mydir) exe=xprobe_gas_x8632 args="$(args)" \
redir=config0.out
- cat config0.out
+IRun_GAS_ARM : # ARM gas-dialect probe: builds xprobe_gas_arm from probe_this_asm.c and probe_gas_arm.S, runs a sub-make with the goal named by atlrun (output redirected to config0.out), then prints config0.out
+ $(CC) $(CCFLAGS) -o xprobe_gas_arm $(SRCdir)/backend/probe_this_asm.c $(SRCdir)/backend/probe_gas_arm.S
+ $(MAKE) $(atlrun) atldir=$(mydir) exe=xprobe_gas_arm args="$(args)" \
+ redir=config0.out
+ - cat config0.out
IRunC2C :
- rm -f config0.out xc2c c2cslave.o
diff -ru ATLAS/CONFIG/src/atlcomp.txt ATLAS-3.8.4-arm/CONFIG/src/atlcomp.txt
--- ATLAS/CONFIG/src/atlcomp.txt 2011-05-14 19:33:24.000000000 +0200
+++ ATLAS-3.8.4-arm/CONFIG/src/atlcomp.txt 2012-05-16 19:47:42.000000000 +0200
@@ -164,6 +164,19 @@
MACH=P4,PM OS=WinNT LVL=0 COMPS=icc,dmc,smc,dkc,skc,xcc
'icl' '-QxN -O3 -Qprec -fp:extended -fp:except -nologo -Oy'
#
+# ARM defaults
+#
+MACH=ARMv7VFP,ARMv7VFPNEON OS=ALL LVL=1000 COMPS=icc,xcc
+ 'gcc' '-march=armv7-a -O1'
+MACH=ARMv7VFPNEON OS=ALL LVL=1000 COMPS=smc,skc
+ 'gcc' '-march=armv7-a -mfpu=neon -ffast-math -O3 -fschedule-insns -fschedule-insns2 -fprefetch-loop-arrays'
+MACH=ARMv7VFP OS=ALL LVL=1000 COMPS=smc,skc
+ 'gcc' '-march=armv7-a -mfpu=vfpv3 -O1 -fno-expensive-optimizations'
+MACH=ARMv7VFP,ARMv7VFPNEON OS=ALL LVL=1000 COMPS=dmc,dkc
+ 'gcc' '-march=armv7-a -mfpu=vfpv3 -O1 -fno-schedule-insns2'
+MACH=ARMv7VFP,ARMv7VFPNEON OS=ALL LVL=1000 COMPS=f77
+ 'gfortran' '-march=armv7-a -O1'
+#
# Generic defaults
#
MACH=ALL OS=ALL LVL=5 COMPS=icc,smc,dmc,skc,dkc,xcc
diff -ru ATLAS/CONFIG/src/atlconf_misc.c ATLAS-3.8.4-arm/CONFIG/src/atlconf_misc.c
--- ATLAS/CONFIG/src/atlconf_misc.c 2011-05-14 19:33:24.000000000 +0200
+++ ATLAS-3.8.4-arm/CONFIG/src/atlconf_misc.c 2011-04-20 16:43:26.000000000 +0200
@@ -480,6 +480,7 @@
else if (strstr(res, "alpha")) fam = AFALPHA;
else if (strstr(res, "ia64")) fam = AFIA64;
else if (strstr(res, "mips")) fam = AFMIPS;
+ else if (strstr(res, "arm")) fam = AFARM;
else if ( strstr(res, "i686") || strstr(res, "i586") ||
strstr(res, "i486") || strstr(res, "i386") ||
strstr(res, "x86") || strstr(res, "x86_64") ) fam = AFX86;
@@ -501,6 +502,7 @@
strstr(res, "i486") || strstr(res, "i386") ||
strstr(res, "x86_64") ) fam = AFX86;
else if (strstr(res, "mips")) fam = AFMIPS;
+ else if (strstr(res, "arm")) fam = AFARM;
}
}
return(fam);
diff -ru ATLAS/CONFIG/src/backend/archinfo_linux.c ATLAS-3.8.4-arm/CONFIG/src/backend/archinfo_linux.c
--- ATLAS/CONFIG/src/backend/archinfo_linux.c 2011-05-14 19:33:24.000000000 +0200
+++ ATLAS-3.8.4-arm/CONFIG/src/backend/archinfo_linux.c 2011-05-16 14:19:02.000000000 +0200
@@ -166,6 +166,20 @@
else if (strstr(res, "AMD-K7")) mach = AmdAthlon;
}
break;
+ case AFARM: /* ARM family: pick ARMv7VFP or ARMv7VFPNEON from lines of /proc/cpuinfo */
+ if ( !CmndOneLine(NULL, "fgrep 'Processor' /proc/cpuinfo", res) ) /* presumably CmndOneLine returns 0 when the command succeeds -- TODO confirm */
+ {
+ if (strstr(res, "ARMv7") || strstr(res,"v7l")) mach = ARMv7VFP;
+ }
+ else if ( !CmndOneLine(NULL, "fgrep 'cpu' /proc/cpuinfo", res) ) /* tried only when the 'Processor' query returns nonzero, not when it merely lacks an ARMv7 string */
+ {
+ if (strstr(res, "ARMv7") || strstr(res,"v7l")) mach = ARMv7VFP;
+ }
+ if ( !CmndOneLine(NULL, "fgrep 'Features' /proc/cpuinfo", res) ) /* evaluated independently of the two queries above */
+ {
+ if (strstr(res, "neon")) mach = ARMv7VFPNEON; /* NOTE(review): assigned even if no ARMv7 string matched above -- confirm this is intended */
+ }
+ break;
/*
* Add these back if we get machine access and can test
*/
diff -ru ATLAS/README ATLAS-3.8.4-arm/README
--- ATLAS/README 2011-05-14 19:33:23.000000000 +0200
+++ ATLAS-3.8.4-arm/README 2012-06-18 23:19:46.000000000 +0200
@@ -1,3 +1,19 @@
+ATLAS-3.8.4-arm
+released 18 June 2012
+
+Changes since beta 1:
+
+ - assembly headers changed to link properly on Ubuntu 11.10 and 12.04
+ - VFP single-precision GEMM by Clint Whaley backported from ATLAS 3.9.x
+
+Changes since beta 2:
+
+ - VFP assembly GEMM kernels added for hard floating-point ABI (like Ubuntu 12.04)
+
+Changes since beta 3:
+ (none)
+
+
This is the README index file. If you want install intructions,
read ATLAS/INSTALL.txt. Windows users should read
ATLAS/doc/Windows.txt as well. Note that all of this documentation is
diff -ru ATLAS/tune/blas/gemm/CASES/ccases.flg ATLAS-3.8.4-arm/tune/blas/gemm/CASES/ccases.flg
--- ATLAS/tune/blas/gemm/CASES/ccases.flg 2011-05-14 19:34:10.000000000 +0200
+++ ATLAS-3.8.4-arm/tune/blas/gemm/CASES/ccases.flg 2012-06-03 14:25:17.000000000 +0200
@@ -1,5 +1,5 @@
<ID> <flag> <mb> <nb> <kb> <muladd> <lat> <mu> <nu> <ku> <rout> "<Contributer>"
-20
+25
304 192 4 3 8 0 4 4 3 8 ATL_mm4x3x8p.c "R. Clint Whaley" \
gcc
-mcpu=ultrasparc -mtune=ultrasparc -fomit-frame-pointer -O
@@ -52,3 +52,18 @@
gcc
-x assembler-with-cpp -mips4
332 192 8 2 4 1 0 8 2 4 ATL_smm8x2x4_av.c "IBM"
+333 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2pf_arm.c "R. Clint Whaley" \
+gcc
+-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a
+334 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2dld_arm.c "R. Clint Whaley" \
+gcc
+-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a
+335 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2pf_armhf.c "R. Clint Whaley" \
+gcc
+-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a
+336 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2dld_armhf.c "R. Clint Whaley" \
+gcc
+-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a
+337 480 4 4 4 1 1 4 4 4 ATL_smm4x4x4p_neon.c "Vesperix Corporation" \
+gcc
+-march=armv7-a -mfpu=neon -O2
diff -ru ATLAS/tune/blas/gemm/CASES/dcases.flg ATLAS-3.8.4-arm/tune/blas/gemm/CASES/dcases.flg
--- ATLAS/tune/blas/gemm/CASES/dcases.flg 2011-05-14 19:34:10.000000000 +0200
+++ ATLAS-3.8.4-arm/tune/blas/gemm/CASES/dcases.flg 2012-06-03 14:23:02.000000000 +0200
@@ -1,5 +1,5 @@
<ID> <flag> <mb> <nb> <kb> <muladd> <lat> <mu> <nu> <ku> <rout> "<Contributer>"
-31
+33
306 192 4 3 8 0 4 4 3 8 ATL_mm4x3x8p.c "R. Clint Whaley" \
gcc
-mcpu=ultrasparc -mtune=ultrasparc -fomit-frame-pointer -O -fno-schedule-insns -fno-schedule-insns2
@@ -85,6 +85,11 @@
338 192 8 4 2 1 0 8 4 2 ATL_dmm8x4x2_vsx.c "IBM" \
gcc
-O3 -mvsx
-
+339 448 4 4 2 1 1 4 4 2 ATL_dmm4x4x2pf_arm.c "R. Clint Whaley" \
+gcc
+-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a
+340 448 4 4 2 1 1 4 4 2 ATL_dmm4x4x2pf_armhf.c "R. Clint Whaley" \
+gcc
+-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a
# NB = 80 and NB = 120 are to force UltraSparc to try NB > L1 Cache,
# needed for optimal performance on Ultra2 and Ultra4, respectively
diff -ru ATLAS/tune/blas/gemm/CASES/scases.flg ATLAS-3.8.4-arm/tune/blas/gemm/CASES/scases.flg
--- ATLAS/tune/blas/gemm/CASES/scases.flg 2011-05-14 19:34:10.000000000 +0200
+++ ATLAS-3.8.4-arm/tune/blas/gemm/CASES/scases.flg 2012-06-03 14:25:48.000000000 +0200
@@ -1,5 +1,5 @@
<ID> <flag> <mb> <nb> <kb> <muladd> <lat> <mu> <nu> <ku> <rout> "<Contributer>"
-22
+27
304 192 4 3 8 0 4 4 3 8 ATL_mm4x3x8p.c "R. Clint Whaley" \
gcc
-mcpu=ultrasparc -mtune=ultrasparc -fomit-frame-pointer -O
@@ -58,3 +58,18 @@
gcc
-x assembler-with-cpp -mips4
332 192 8 2 4 1 0 8 2 4 ATL_smm8x2x4_av.c "IBM"
+333 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2pf_arm.c "R. Clint Whaley" \
+gcc
+-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a
+334 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2dld_arm.c "R. Clint Whaley" \
+gcc
+-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a
+335 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2pf_armhf.c "R. Clint Whaley" \
+gcc
+-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a
+336 448 4 4 2 1 1 4 4 2 ATL_smm4x4x2dld_armhf.c "R. Clint Whaley" \
+gcc
+-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a
+337 480 4 4 4 1 1 4 4 4 ATL_smm4x4x4p_neon.c "Vesperix Corporation" \
+gcc
+-march=armv7-a -mfpu=neon -O2
diff -ru ATLAS/tune/blas/gemm/CASES/zcases.flg ATLAS-3.8.4-arm/tune/blas/gemm/CASES/zcases.flg
--- ATLAS/tune/blas/gemm/CASES/zcases.flg 2011-05-14 19:34:10.000000000 +0200
+++ ATLAS-3.8.4-arm/tune/blas/gemm/CASES/zcases.flg 2012-06-03 14:23:36.000000000 +0200
@@ -1,5 +1,5 @@
<ID> <flag> <mb> <nb> <kb> <muladd> <lat> <mu> <nu> <ku> <rout> "<Contributer>"
-30
+32
306 192 4 3 8 0 4 4 3 8 ATL_mm4x3x8p.c "R. Clint Whaley" \
gcc
-mcpu=ultrasparc -mtune=ultrasparc -fomit-frame-pointer -O -fno-schedule-insns -fno-schedule-insns2
@@ -82,6 +82,11 @@
338 192 8 4 2 1 0 8 4 2 ATL_dmm8x4x2_vsx.c "IBM" \
gcc
-O3 -mvsx
-
+339 448 4 4 2 1 1 4 4 2 ATL_dmm4x4x2pf_arm.c "R. Clint Whaley" \
+gcc
+-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a
+340 448 4 4 2 1 1 4 4 2 ATL_dmm4x4x2pf_armhf.c "R. Clint Whaley" \
+gcc
+-x assembler-with-cpp -mfpu=vfpv3 -march=armv7-a
# NB = 80 and NB = 120 are to force UltraSparc to try NB > L1 Cache,
# needed for optimal performance on Ultra2 and Ultra4, respectively
diff -ru ATLAS/tune/sysinfo/ATL_cputime.c ATLAS-3.8.4-arm/tune/sysinfo/ATL_cputime.c
--- ATLAS/tune/sysinfo/ATL_cputime.c 2011-05-14 19:34:09.000000000 +0200
+++ ATLAS-3.8.4-arm/tune/sysinfo/ATL_cputime.c 2012-05-05 18:16:58.000000000 +0200
@@ -48,6 +48,26 @@
t0 = clock();
return(0.0);
}
+#elif defined(POSIX_HR) /* use the POSIX HR timers: clock_gettime() on the per-process CPU-time clock */
+ #include <time.h>
+ double ATL_cputime(void) /* returns 0.0 on the first call, afterwards the seconds elapsed on CLOCK_PROCESS_CPUTIME_ID since that first call */
+ {
+ struct timespec ts;
+ static double t0; /* clock reading recorded on the first call, in seconds */
+ double res;
+ static int INIT = 0; /* set to 1 once t0 has been recorded */
+
+ if (INIT)
+ {
+ clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); /* return value is not checked */
+ res = ts.tv_sec + 1.0e-9 * ts.tv_nsec; /* whole seconds plus nanoseconds scaled to seconds */
+ return(res - t0);
+ }
+ clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&ts); /* first call: record the zero point */
+ t0 = ts.tv_sec + 1.0e-9 * ts.tv_nsec;
+ INIT = 1;
+ return(0.0);
+ }
#elif defined(UseTimes)
#include <stdlib.h>
#include <sys/times.h>
diff -ru ATLAS/tune/sysinfo/ATL_walltime.c ATLAS-3.8.4-arm/tune/sysinfo/ATL_walltime.c
--- ATLAS/tune/sysinfo/ATL_walltime.c 2011-05-14 19:34:09.000000000 +0200
+++ ATLAS-3.8.4-arm/tune/sysinfo/ATL_walltime.c 2012-05-05 18:23:20.000000000 +0200
@@ -79,6 +79,26 @@
myout = (myout<<32) | msout.LowPart;
return(myout*freqRecip);
}
+#elif defined(POSIX_HR) /* use the POSIX HR timers: clock_gettime() on CLOCK_REALTIME */
+ #include <time.h>
+ double ATL_walltime(void) /* returns 0.0 on the first call, afterwards the seconds elapsed on CLOCK_REALTIME since that first call; named ATL_walltime because this is the wall-time file and ATL_cputime.c already defines ATL_cputime under POSIX_HR */
+ {
+ struct timespec ts;
+ static double t0; /* clock reading recorded on the first call, in seconds */
+ double res;
+ static int INIT = 0; /* set to 1 once t0 has been recorded */
+
+ if (INIT)
+ {
+ clock_gettime(CLOCK_REALTIME, &ts); /* return value is not checked */
+ res = ts.tv_sec + 1.0e-9 * ts.tv_nsec; /* whole seconds plus nanoseconds scaled to seconds */
+ return(res - t0);
+ }
+ clock_gettime(CLOCK_REALTIME,&ts); /* first call: record the zero point */
+ t0 = ts.tv_sec + 1.0e-9 * ts.tv_nsec;
+ INIT = 1;
+ return(0.0);
+ }
#elif defined(UseTimes)
#include <stdlib.h>
#include <sys/times.h>
@@ -97,6 +117,26 @@
{
return(gethrtime()*1.0e-9);
}
+#elif defined(POSIX_HR) /* use the POSIX HR timers; NOTE(review): an earlier #elif added to this file also tests POSIX_HR -- if both sit in the same #if chain this branch is never selected, confirm */
+ #include <time.h>
+ double ATL_walltime(void) /* returns 0.0 on the first call, afterwards the seconds elapsed on CLOCK_REALTIME since that first call */
+ {
+ struct timespec ts;
+ static double t0; /* clock reading recorded on the first call, in seconds */
+ double res;
+ static int INIT = 0; /* set to 1 once t0 has been recorded */
+
+ if (INIT)
+ {
+ clock_gettime(CLOCK_REALTIME, &ts); /* return value is not checked */
+ res = ts.tv_sec + 1.0e-9 * ts.tv_nsec; /* whole seconds plus nanoseconds scaled to seconds */
+ return(res - t0);
+ }
+ clock_gettime(CLOCK_REALTIME, &ts); /* first call: record the zero point */
+ t0 = ts.tv_sec + 1.0e-9 * ts.tv_nsec;
+ INIT = 1;
+ return(0.0);
+ }
/*
* Without gcc, I know no standard Windows wall-timer, so use cputime
*/