Skip to content

[AMDGPU] Add gfx1250 v_cvt_sr_pk_bf16_f32 instruction #151385

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,7 @@ TARGET_BUILTIN(__builtin_amdgcn_exp2_bf16, "yy", "nc", "bf16-trans-insts")
TARGET_BUILTIN(__builtin_amdgcn_sin_bf16, "yy", "nc", "bf16-trans-insts")
TARGET_BUILTIN(__builtin_amdgcn_cos_bf16, "yy", "nc", "bf16-trans-insts")

TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_pk_bf16_f32, "V2yffi", "nc", "bf16-cvt-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenOpenCL/amdgpu-features.cl
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32

// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"

Expand Down
28 changes: 28 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
typedef unsigned int uint;
typedef unsigned short int ushort;
typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;
typedef half __attribute__((ext_vector_type(2))) half2;

// CHECK-LABEL: @test_setprio_inc_wg(
Expand Down Expand Up @@ -254,6 +255,33 @@ void test_cos_bf16(global __bf16* out, __bf16 a)
*out = __builtin_amdgcn_cos_bf16(a);
}

// CHECK-LABEL: @test_cvt_sr_pk_bf16_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SR_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
// CHECK-NEXT: [[SR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SR_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: store float [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SR:%.*]], ptr [[SR_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[B_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float [[TMP0]], float [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store <2 x bfloat> [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_sr_pk_bf16_f32(global bfloat2* out, float a, float b, uint sr)
{
*out = __builtin_amdgcn_cvt_sr_pk_bf16_f32(a, b, sr);
}

// CHECK-LABEL: @test_cvt_f16_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
Expand Down
4 changes: 4 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -593,6 +593,10 @@ def int_amdgcn_tanh : DefaultAttrsIntrinsic<
[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_sr_pk_bf16_f32 : DefaultAttrsIntrinsic<
[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable]
>, ClangBuiltin<"__builtin_amdgcn_cvt_sr_pk_bf16_f32">;

def int_amdgcn_cvt_pk_f16_fp8 : DefaultAttrsIntrinsic<
[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrSpeculatable]
>, ClangBuiltin<"__builtin_amdgcn_cvt_pk_f16_fp8">;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4574,6 +4574,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_pknorm_u16:
case Intrinsic::amdgcn_cvt_pk_i16:
case Intrinsic::amdgcn_cvt_pk_u16:
case Intrinsic::amdgcn_cvt_sr_pk_bf16_f32:
case Intrinsic::amdgcn_cvt_pk_f16_fp8:
case Intrinsic::amdgcn_cvt_pk_f16_bf8:
case Intrinsic::amdgcn_sat_pk4_i4_i8:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2926,6 +2926,7 @@ def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>;
def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>;
def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>;
def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>;
def VOP_V2BF16_F32_F32_I32 : VOPProfile <[v2bf16, f32, f32, i32]>;
def VOP_V6I32_V32F16_F32 : VOPProfile<[v6i32, v32f16, f32, untyped]>;
def VOP_V6I32_V32BF16_F32 : VOPProfile<[v6i32, v32bf16, f32, untyped]>;
def VOP_V6I32_V16F32_V16F32_F32 : VOPProfile<[v6i32, v16f32, v16f32, f32]>;
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1610,6 +1610,7 @@ def bf16_fpround : PatFrag <(ops node:$src0), (fpround $src0), [{ return true;
let SubtargetPredicate = HasBF16ConversionInsts in {
let ReadsModeReg = 0 in {
defm V_CVT_PK_BF16_F32 : VOP3Inst<"v_cvt_pk_bf16_f32", VOP3_Profile<VOP_V2BF16_F32_F32>>;
defm V_CVT_SR_PK_BF16_F32 : VOP3Inst<"v_cvt_sr_pk_bf16_f32", VOP3_Profile<VOP_V2BF16_F32_F32_I32>, int_amdgcn_cvt_sr_pk_bf16_f32>;
}
def : GCNPat<(v2bf16 (bf16_fpround v2f32:$src)),
(V_CVT_PK_BF16_F32_e64 0, (EXTRACT_SUBREG VReg_64:$src, sub0), 0, (EXTRACT_SUBREG VReg_64:$src, sub1))>;
Expand Down Expand Up @@ -2013,6 +2014,7 @@ let AssemblerPredicate = isGFX11Plus in {
// These instructions differ from GFX12 variant by supporting DPP:
defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>;
defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>;
defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>;

//===----------------------------------------------------------------------===//
// GFX10.
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/TargetParser/TargetParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["tanh-insts"] = true;
Features["transpose-load-f4f6-insts"] = true;
Features["bf16-trans-insts"] = true;
Features["bf16-cvt-insts"] = true;
Features["fp8-conversion-insts"] = true;
Features["fp8e5m3-insts"] = true;
Features["permlane16-swap"] = true;
Expand Down
66 changes: 66 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.pk.bf16.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s

; FIXME: GlobalISel does not work with bf16

declare <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float, float, i32) #0

define amdgpu_ps float @cvt_sr_pk_bf16_f32_vvv(float %src0, float %src1, i32 %src2) #1 {
; GCN-LABEL: cvt_sr_pk_bf16_f32_vvv:
; GCN: ; %bb.0:
; GCN-NEXT: v_cvt_sr_pk_bf16_f32 v0, v0, v1, v2
; GCN-NEXT: ; return to shader part epilog
%cvt = call <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float %src0, float %src1, i32 %src2) #0
%ret = bitcast <2 x bfloat> %cvt to float
ret float %ret
}

define amdgpu_ps float @cvt_sr_pk_bf16_f32_sss(float inreg %src0, float inreg %src1, i32 inreg %src2) #1 {
; GCN-LABEL: cvt_sr_pk_bf16_f32_sss:
; GCN: ; %bb.0:
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_cvt_sr_pk_bf16_f32 v0, s0, s1, v0
; GCN-NEXT: ; return to shader part epilog
%cvt = call <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float %src0, float %src1, i32 %src2) #0
%ret = bitcast <2 x bfloat> %cvt to float
ret float %ret
}

define amdgpu_ps float @cvt_sr_pk_bf16_f32_vvi(float %src0, float %src1) #1 {
; GCN-LABEL: cvt_sr_pk_bf16_f32_vvi:
; GCN: ; %bb.0:
; GCN-NEXT: v_cvt_sr_pk_bf16_f32 v0, v0, v1, 0x10002
; GCN-NEXT: ; return to shader part epilog
%cvt = call <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float %src0, float %src1, i32 65538) #0
%ret = bitcast <2 x bfloat> %cvt to float
ret float %ret
}

define amdgpu_ps float @cvt_sr_pk_bf16_f32_vvi_mods(float %src0, float %src1) #1 {
; GCN-LABEL: cvt_sr_pk_bf16_f32_vvi_mods:
; GCN: ; %bb.0:
; GCN-NEXT: v_cvt_sr_pk_bf16_f32 v0, -v0, |v1|, 1
; GCN-NEXT: ; return to shader part epilog
%s0 = fneg float %src0
%s1 = call float @llvm.fabs.f32(float %src1) #0
%cvt = call <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float %s0, float %s1, i32 1) #0
%ret = bitcast <2 x bfloat> %cvt to float
ret float %ret
}

define amdgpu_ps float @cvt_sr_pk_bf16_f32_ssi(float inreg %src0, float inreg %src1) #1 {
; GCN-LABEL: cvt_sr_pk_bf16_f32_ssi:
; GCN: ; %bb.0:
; GCN-NEXT: v_cvt_sr_pk_bf16_f32 v0, s0, s1, 1
; GCN-NEXT: ; return to shader part epilog
%cvt = call <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float %src0, float %src1, i32 1) #0
%ret = bitcast <2 x bfloat> %cvt to float
ret float %ret
}

declare float @llvm.fabs.f32(float) #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
45 changes: 45 additions & 0 deletions llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s
Original file line number Diff line number Diff line change
Expand Up @@ -321,3 +321,48 @@ v_cvt_pk_bf16_f32 v5, src_scc, vcc_lo mul:4

v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2
// GFX1250: v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x6d,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf]

v_cvt_sr_pk_bf16_f32 v5, v1, v2, s3
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6e,0xd7,0x01,0x05,0x0e,0x00]

v_cvt_sr_pk_bf16_f32 v5, v255, s2, s105
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6e,0xd7,0xff,0x05,0xa4,0x01]

v_cvt_sr_pk_bf16_f32 v5, s1, v255, exec_hi
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6e,0xd7,0x01,0xfe,0xff,0x01]

v_cvt_sr_pk_bf16_f32 v5, s105, s105, exec_lo
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6e,0xd7,0x69,0xd2,0xf8,0x01]

v_cvt_sr_pk_bf16_f32 v5, vcc_lo, ttmp15, v3
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6e,0xd7,0x6a,0xf6,0x0c,0x04]

v_cvt_sr_pk_bf16_f32 v5, vcc_hi, 0xaf123456, v255
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x6e,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf]

v_cvt_sr_pk_bf16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15 ; encoding: [0x05,0x03,0x6e,0xd7,0x7b,0xfa,0xed,0x61]

v_cvt_sr_pk_bf16_f32 v5, m0, 0.5, m0
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6e,0xd7,0x7d,0xe0,0xf5,0x01]

v_cvt_sr_pk_bf16_f32 v5, |exec_lo|, -1, vcc_hi
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6e,0xd7,0x7e,0x82,0xad,0x01]

v_cvt_sr_pk_bf16_f32 v5, -|exec_hi|, null, vcc_lo
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, -|exec_hi|, null, vcc_lo ; encoding: [0x05,0x01,0x6e,0xd7,0x7f,0xf8,0xa8,0x21]

v_cvt_sr_pk_bf16_f32 v5, null, exec_lo, 0xaf123456
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x6e,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf]

v_cvt_sr_pk_bf16_f32 v5, -1, -|exec_hi|, src_scc
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, -1, -|exec_hi|, src_scc ; encoding: [0x05,0x02,0x6e,0xd7,0xc1,0xfe,0xf4,0x43]

v_cvt_sr_pk_bf16_f32 v5, 0.5, -m0, 0.5 mul:2
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6e,0xd7,0xf0,0xfa,0xc0,0x4b]

v_cvt_sr_pk_bf16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6e,0xd7,0xfd,0xd4,0x04,0x33]

v_cvt_sr_pk_bf16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2
// GFX1250: v_cvt_sr_pk_bf16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6e,0xd7,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
45 changes: 45 additions & 0 deletions llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s
Original file line number Diff line number Diff line change
Expand Up @@ -321,3 +321,48 @@ v_cvt_pk_bf16_f32 v5, src_scc, vcc_lo mul:4

v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2
// GFX1250: v_cvt_pk_bf16_f32 v255, -|0xaf123456|, vcc_hi clamp div:2 ; encoding: [0xff,0x81,0x6d,0xd7,0xff,0xd6,0x00,0x38,0x56,0x34,0x12,0xaf]

v_cvt_sr_pk_bf16_f32 v5, v1, v2, s3
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x6e,0xd7,0x01,0x05,0x0e,0x00]

v_cvt_sr_pk_bf16_f32 v5, v255, s2, s105
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, v255, s2, s105 ; encoding: [0x05,0x00,0x6e,0xd7,0xff,0x05,0xa4,0x01]

v_cvt_sr_pk_bf16_f32 v5, s1, v255, exec_hi
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, s1, v255, exec_hi ; encoding: [0x05,0x00,0x6e,0xd7,0x01,0xfe,0xff,0x01]

v_cvt_sr_pk_bf16_f32 v5, s105, s105, exec_lo
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, s105, s105, exec_lo ; encoding: [0x05,0x00,0x6e,0xd7,0x69,0xd2,0xf8,0x01]

v_cvt_sr_pk_bf16_f32 v5, vcc_lo, ttmp15, v3
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, vcc_lo, ttmp15, v3 ; encoding: [0x05,0x00,0x6e,0xd7,0x6a,0xf6,0x0c,0x04]

v_cvt_sr_pk_bf16_f32 v5, vcc_hi, 0xaf123456, v255
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, vcc_hi, 0xaf123456, v255 ; encoding: [0x05,0x00,0x6e,0xd7,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf]

v_cvt_sr_pk_bf16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, -|ttmp15|, -|src_scc|, ttmp15 ; encoding: [0x05,0x03,0x6e,0xd7,0x7b,0xfa,0xed,0x61]

v_cvt_sr_pk_bf16_f32 v5, m0, 0.5, m0
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, m0, 0.5, m0 ; encoding: [0x05,0x00,0x6e,0xd7,0x7d,0xe0,0xf5,0x01]

v_cvt_sr_pk_bf16_f32 v5, |exec_lo|, -1, vcc_hi
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, |exec_lo|, -1, vcc_hi ; encoding: [0x05,0x01,0x6e,0xd7,0x7e,0x82,0xad,0x01]

v_cvt_sr_pk_bf16_f32 v5, -|exec_hi|, null, vcc_lo
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, -|exec_hi|, null, vcc_lo ; encoding: [0x05,0x01,0x6e,0xd7,0x7f,0xf8,0xa8,0x21]

v_cvt_sr_pk_bf16_f32 v5, null, exec_lo, 0xaf123456
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, null, exec_lo, 0xaf123456 ; encoding: [0x05,0x00,0x6e,0xd7,0x7c,0xfc,0xfc,0x03,0x56,0x34,0x12,0xaf]

v_cvt_sr_pk_bf16_f32 v5, -1, -|exec_hi|, src_scc
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, -1, -|exec_hi|, src_scc ; encoding: [0x05,0x02,0x6e,0xd7,0xc1,0xfe,0xf4,0x43]

v_cvt_sr_pk_bf16_f32 v5, 0.5, -m0, 0.5 mul:2
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, 0.5, -m0, 0.5 mul:2 ; encoding: [0x05,0x00,0x6e,0xd7,0xf0,0xfa,0xc0,0x4b]

v_cvt_sr_pk_bf16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4
// GFX1250: v_cvt_sr_pk_bf16_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 ; encoding: [0x05,0x02,0x6e,0xd7,0xfd,0xd4,0x04,0x33]

v_cvt_sr_pk_bf16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2
// GFX1250: v_cvt_sr_pk_bf16_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x6e,0xd7,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
Loading