ispc/ispc-1.23.0-upstream-add_x86-isel-fix.patch

From c72af29f0bd82678918b941654aff892fbd7dc8a Mon Sep 17 00:00:00 2001
From: Aleksei Nurmukhametov <aleksei.nurmukhametov@intel.com>
Date: Thu, 29 Feb 2024 08:59:44 -0800
Subject: [PATCH] llvm_patches: add x86-isel fix

This backported patch fixes generation of redundant vmovd, vpinsrd
instructions.

Test case added as XFAIL until LLVM rebuild in CI.
---
 ...auxShuffleMask-handle-fp-int-bitcast.patch | 40 +++++++++++++++++++
 tests/lit-tests/2777.ispc                     | 34 ++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 llvm_patches/16_0_17_0-X86-getFauxShuffleMask-handle-fp-int-bitcast.patch
 create mode 100644 tests/lit-tests/2777.ispc

diff --git a/llvm_patches/16_0_17_0-X86-getFauxShuffleMask-handle-fp-int-bitcast.patch b/llvm_patches/16_0_17_0-X86-getFauxShuffleMask-handle-fp-int-bitcast.patch
new file mode 100644
index 0000000000..f197b35051
--- /dev/null
+++ b/llvm_patches/16_0_17_0-X86-getFauxShuffleMask-handle-fp-int-bitcast.patch
@@ -0,0 +1,40 @@
+# This patch is required to fix the issue #2777
+From 7aa6ddfea210b6b114ac649c4d3219138e9cc52a Mon Sep 17 00:00:00 2001
+From: Simon Pilgrim <llvm-dev@redking.me.uk>
+Date: Thu, 29 Feb 2024 10:32:37 +0000
+Subject: [PATCH] [X86] getFauxShuffleMask - handle
+ insert_vector_elt(bitcast(extract_vector_elt(x))) shuffle patterns
+
+If the bitcast is between types of equal scalar size (i.e. fp<->int bitcasts), then we can safely peek through them
+
+Fixes #83289
+---
+ llvm/lib/Target/X86/X86ISelLowering.cpp | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
+index e43b33eed470..22eea8b3d43d 100644
+--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
++++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
+@@ -8477,13 +8477,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
+       }
+     }
+ 
+-    // Peek through trunc/aext/zext.
++    // Peek through trunc/aext/zext/bitcast.
+     // TODO: aext shouldn't require SM_SentinelZero padding.
+     // TODO: handle shift of scalars.
+     unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
+     while (Scl.getOpcode() == ISD::TRUNCATE ||
+            Scl.getOpcode() == ISD::ANY_EXTEND ||
+-           Scl.getOpcode() == ISD::ZERO_EXTEND) {
++           Scl.getOpcode() == ISD::ZERO_EXTEND ||
++           (Scl.getOpcode() == ISD::BITCAST &&
++            Scl.getScalarValueSizeInBits() ==
++                Scl.getOperand(0).getScalarValueSizeInBits())) {
+       Scl = Scl.getOperand(0);
+       MinBitsPerElt =
+           std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
+-- 
+2.25.1
+
diff --git a/tests/lit-tests/2777.ispc b/tests/lit-tests/2777.ispc
new file mode 100644
index 0000000000..ec7656629a
--- /dev/null
+++ b/tests/lit-tests/2777.ispc
@@ -0,0 +1,34 @@
+// RUN: %{ispc} %s --target=avx2-i32x8 --emit-asm -o - | FileCheck %s
+
+// REQUIRES: X86_ENABLED && LLVM_16_0+
+
+// XFAIL: *
+
+// CHECK-NOT: vmovd
+// CHECK-NOT: vpinsrd
+
+struct FVector4f
+{
+	float V[4];
+};
+
+// Extra vmovd, vpinsrd after 2x vmaxps
+inline uniform FVector4f VectorMax(const uniform FVector4f& V1, const uniform FVector4f& V2)
+{
+	varying float S0, S1, Result;
+	*((uniform FVector4f *uniform)&S0) = *((uniform FVector4f *uniform)&V1);
+	*((uniform FVector4f *uniform)&S1) = *((uniform FVector4f *uniform)&V2);
+
+	Result = max(S0, S1);
+
+	return *((uniform FVector4f *uniform)&Result);
+}
+
+export void foo(uniform float A[], uniform float B[])
+{
+
+	uniform FVector4f *uniform pA = (uniform FVector4f *uniform)A;
+	uniform FVector4f *uniform pB = (uniform FVector4f *uniform)B;
+
+        *pA = VectorMax(*pA, *pB);
+}
package created using the webbuild interface [release 1.23.0-1mamba;Sat May 04 2024] 2024-05-05 19:44:56 +02:00			`From c72af29f0bd82678918b941654aff892fbd7dc8a Mon Sep 17 00:00:00 2001`
			`From: Aleksei Nurmukhametov <aleksei.nurmukhametov@intel.com>`
			`Date: Thu, 29 Feb 2024 08:59:44 -0800`
			`Subject: [PATCH] llvm_patches: add x86-isel fix`

			`This backported patch fixes generation of redundant vmovd, vpinsrd`
			`instructions.`

			`Test case added as XFAIL until LLVM rebuild in CI.`
			`---`
			`...auxShuffleMask-handle-fp-int-bitcast.patch \| 40 +++++++++++++++++++`
			`tests/lit-tests/2777.ispc \| 34 ++++++++++++++++`
			`2 files changed, 74 insertions(+)`
			`create mode 100644 llvm_patches/16_0_17_0-X86-getFauxShuffleMask-handle-fp-int-bitcast.patch`
			`create mode 100644 tests/lit-tests/2777.ispc`

			`diff --git a/llvm_patches/16_0_17_0-X86-getFauxShuffleMask-handle-fp-int-bitcast.patch b/llvm_patches/16_0_17_0-X86-getFauxShuffleMask-handle-fp-int-bitcast.patch`
			`new file mode 100644`
			`index 0000000000..f197b35051`
			`--- /dev/null`
			`+++ b/llvm_patches/16_0_17_0-X86-getFauxShuffleMask-handle-fp-int-bitcast.patch`
			`@@ -0,0 +1,40 @@`
			`+# This patch is required to fix the issue #2777`
			`+From 7aa6ddfea210b6b114ac649c4d3219138e9cc52a Mon Sep 17 00:00:00 2001`
			`+From: Simon Pilgrim <llvm-dev@redking.me.uk>`
			`+Date: Thu, 29 Feb 2024 10:32:37 +0000`
			`+Subject: [PATCH] [X86] getFauxShuffleMask - handle`
			`+ insert_vector_elt(bitcast(extract_vector_elt(x))) shuffle patterns`
			`+`
			`+If the bitcast is between types of equal scalar size (i.e. fp<->int bitcasts), then we can safely peek through them`
			`+`
			`+Fixes #83289`
			`+---`
			`+ llvm/lib/Target/X86/X86ISelLowering.cpp \| 7 +++++--`
			`+ 1 file changed, 5 insertions(+), 2 deletions(-)`
			`+`
			`+diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp`
			`+index e43b33eed470..22eea8b3d43d 100644`
			`+--- a/llvm/lib/Target/X86/X86ISelLowering.cpp`
			`++++ b/llvm/lib/Target/X86/X86ISelLowering.cpp`
			`+@@ -8477,13 +8477,16 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,`
			`+ }`
			`+ }`
			`+`
			`+- // Peek through trunc/aext/zext.`
			`++ // Peek through trunc/aext/zext/bitcast.`
			`+ // TODO: aext shouldn't require SM_SentinelZero padding.`
			`+ // TODO: handle shift of scalars.`
			`+ unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();`
			`+ while (Scl.getOpcode() == ISD::TRUNCATE \|\|`
			`+ Scl.getOpcode() == ISD::ANY_EXTEND \|\|`
			`+- Scl.getOpcode() == ISD::ZERO_EXTEND) {`
			`++ Scl.getOpcode() == ISD::ZERO_EXTEND \|\|`
			`++ (Scl.getOpcode() == ISD::BITCAST &&`
			`++ Scl.getScalarValueSizeInBits() ==`
			`++ Scl.getOperand(0).getScalarValueSizeInBits())) {`
			`+ Scl = Scl.getOperand(0);`
			`+ MinBitsPerElt =`
			`+ std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());`
			`+--`
			`+2.25.1`
			`+`
			`diff --git a/tests/lit-tests/2777.ispc b/tests/lit-tests/2777.ispc`
			`new file mode 100644`
			`index 0000000000..ec7656629a`
			`--- /dev/null`
			`+++ b/tests/lit-tests/2777.ispc`
			`@@ -0,0 +1,34 @@`
			`+// RUN: %{ispc} %s --target=avx2-i32x8 --emit-asm -o - \| FileCheck %s`
			`+`
			`+// REQUIRES: X86_ENABLED && LLVM_16_0+`
			`+`
			`+// XFAIL: *`
			`+`
			`+// CHECK-NOT: vmovd`
			`+// CHECK-NOT: vpinsrd`
			`+`
			`+struct FVector4f`
			`+{`
			`+ float V[4];`
			`+};`
			`+`
			`+// Extra vmovd, vpinsrd after 2x vmaxps`
			`+inline uniform FVector4f VectorMax(const uniform FVector4f& V1, const uniform FVector4f& V2)`
			`+{`
			`+ varying float S0, S1, Result;`
			`+ *((uniform FVector4f *uniform)&S0) = *((uniform FVector4f *uniform)&V1);`
			`+ *((uniform FVector4f *uniform)&S1) = *((uniform FVector4f *uniform)&V2);`
			`+`
			`+ Result = max(S0, S1);`
			`+`
			`+ return *((uniform FVector4f *uniform)&Result);`
			`+}`
			`+`
			`+export void foo(uniform float A[], uniform float B[])`
			`+{`
			`+`
			`+ uniform FVector4f *uniform pA = (uniform FVector4f *uniform)A;`
			`+ uniform FVector4f *uniform pB = (uniform FVector4f *uniform)B;`
			`+`
			`+ *pA = VectorMax(*pA, *pB);`
			`+}`