Crysis 2 Maximum edition: Audio thread completely saturated with x87, breaks audio #4252

Sonicadvance1 · 2025-01-03T20:14:33Z

The first level of the game completely saturates its audio thread, resulting in it dropping samples, even in reduced precision mode. Pretty much all the CPU time is spent in its fmodex.dll, and most of the blocks are just a ton of x87.

  19.30%  [JIT] tid 494822     [.] JIT_0x2e7a85c_0x413e867fc
  11.39%  [JIT] tid 494822     [.] JIT_0x2ea4521_0x413e642a4
  10.69%  [JIT] tid 494822     [.] JIT_0x2ea46e9_0x413e6d978
   7.19%  [JIT] tid 494822     [.] JIT_0x2e8e7af_0x413e94d28
   4.86%  [JIT] tid 494822     [.] JIT_0x2ea3931_0x413e39858
   4.58%  [JIT] tid 494822     [.] JIT_0x2e8e6e1_0x413ea3658
   3.84%  [JIT] tid 494822     [.] JIT_0x2e8e761_0x413ea3ce8
   2.84%  [JIT] tid 494822     [.] JIT_0x2ea4085_0x413e5119c
   2.74%  [JIT] tid 494822     [.] JIT_0x2e8eb0d_0x413e9a00c
   2.12%  [JIT] tid 494822     [.] JIT_0x2e735d4_0x413e9e090
   2.05%  [JIT] tid 494822     [.] JIT_0x2e8e659_0x413ea3330
   1.62%  [JIT] tid 494822     [.] JIT_0x2e8e373_0x413ea1cf0
   1.39%  FEXInterpreter       [.] 0x0000000000185480
   0.99%  [JIT] tid 494822     [.] JIT_0x2e8eb51_0x413e9a79c
   0.90%  [JIT] tid 494822     [.] JIT_0x2e8eb04_0x413e99a64
   0.83%  [JIT] tid 494822     [.] JIT_0x2e8e4dd_0x413e90e94
   0.78%  [JIT] tid 494822     [.] JIT_0x2ea46cf_0x413e69d04
   0.78%  [JIT] tid 494822     [.] JIT_0x2e8e4df_0x413e905ec
   0.70%  [JIT] tid 494822     [.] JIT_0x2ea44ed_0x413e605dc

fmodex.dll base address at 0x2e20000 in this capture.

First block:

02e7a85c  fld     st0, dword [edx]
02e7a85e  dec     dword [ebp+0x10 {arg4}]
02e7a861  fld     st0, dword [0x2efbf50]
02e7a867  add     edx, 0x8
02e7a86a  fld     st0, st0
02e7a86c  faddp   st2, st0
02e7a86e  fld     st0, dword [esi+0x1c0]
02e7a874  fmulp   st2, st0
02e7a876  fld     st0, dword [esi+0x1c4]
02e7a87c  fmul    st0, dword [ebp+0x14 {arg5}]
02e7a87f  faddp   st2, st0
02e7a881  fxch    st0, st1
02e7a883  fstp    dword [ebp+0xc {arg3}], st0
02e7a886  fld     st0, dword [edx-0x4]
02e7a889  fld     st0, st1
02e7a88b  faddp   st1, st0
02e7a88d  fmul    st0, dword [esi+0x1c0]
02e7a893  fld     st0, dword [esi+0x1c4]
02e7a899  fmul    st0, dword [ebp-0x10 {var_14_1}]
02e7a89c  faddp   st1, st0
02e7a89e  fstp    dword [ebp+0x8 {arg2}], st0
02e7a8a1  fld     st0, dword [ebp+0xc {arg3}]
02e7a8a4  fstp    dword [ebp+0x14 {arg5}], st0
02e7a8a7  fld     st0, dword [ebp+0x8 {arg2}]
02e7a8aa  fstp    dword [ebp-0x10 {var_14_1}], st0
02e7a8ad  fld     st0, dword [esi+0x1c0]
02e7a8b3  fmul    st0, dword [ebp+0xc {arg3}]
02e7a8b6  fld     st0, dword [esi+0x1c4]
02e7a8bc  fmul    st0, dword [ebp-0x14 {var_18_1}]
02e7a8bf  faddp   st1, st0
02e7a8c1  fstp    dword [ebp+0xc {arg3}], st0
02e7a8c4  fld     st0, dword [esi+0x1c0]
02e7a8ca  fmul    st0, dword [ebp+0x8 {arg2}]
02e7a8cd  fld     st0, dword [esi+0x1c4]
02e7a8d3  fmul    st0, dword [ebp-0xc {var_10_1}]
02e7a8d6  faddp   st1, st0
02e7a8d8  fstp    dword [ebp+0x8 {arg2}], st0
02e7a8db  fld     st0, dword [ebp+0xc {arg3}]
02e7a8de  fstp    dword [ebp-0x14 {var_18_1}], st0
02e7a8e1  fld     st0, dword [ebp+0x8 {arg2}]
02e7a8e4  fstp    dword [ebp-0xc {var_10_1}], st0
02e7a8e7  fld     st0, dword [ebp+0xc {arg3}]
02e7a8ea  fstp    dword [eax], st0
02e7a8ec  add     eax, 0x8
02e7a8ef  cmp     dword [ebp+0x10 {arg4}], 0x0
02e7a8f3  fld     st0, dword [ebp+0x8 {arg2}]
02e7a8f6  fstp    dword [eax-0x4], st0
02e7a8f9  fchs    
02e7a8fb  fstp    dword [data_2efbf50], st0
02e7a901  jne     0x2e7a85c

The text was updated successfully, but these errors were encountered:

pmatos · 2025-01-04T09:56:21Z

Interesting - this is a good example to extract potential blocks for optimization in the x87 stack optimization pass.

Sonicadvance1 · 2025-01-05T12:49:49Z

An easy win I noticed here is that vector stores aren't handling the small store ranges that are possible in arm64. Loads seem to handle it, though.

    "Test": {
      "x86InstructionCount": 4,
      "ExpectedInstructionCount": 15,
      "x86Insts": [
        "fld     dword [ebp+16380]",
        "fstp    dword [eax-0x4]",
        "fld     dword [ebp-0x8]",
        "fstp    dword [eax+16370]"
      ],
      "ExpectedArm64ASM": [
        "ldr s2, [x9, #16380]",
        "sub w20, w4, #0x4 (4)",
        "str s2, [x20]",
        "ldur s2, [x9, #-8]",
        "mov w20, #0x3ff2",
        "add w20, w4, w20",
        "str s2, [x20]",
        "ldrb w20, [x28, #1019]",
        "add w20, w20, #0x7 (7)",
        "and w20, w20, #0x7",
        "ldrb w21, [x28, #1298]",
        "mov w22, #0x1",
        "lsl w20, w22, w20",
        "bic w20, w21, w20",
        "strb w20, [x28, #1298]"
      ]
    }

Happens for both 64-bit mode with 64-bit addressing, and 32-bit mode with 32-bit addressing. Should be an easy win to fold those immediates into str and stur.

pmatos · 2025-01-06T13:04:24Z

An easy win I noticed here is that vector stores aren't handling the small store ranges that are possible in arm64. Loads seem to handle it although.

    "Test": {
      "x86InstructionCount": 4,
      "ExpectedInstructionCount": 15,
      "x86Insts": [
        "fld     dword [ebp+16380]",
        "fstp    dword [eax-0x4]",
        "fld     dword [ebp-0x8]",
        "fstp    dword [eax+16370]"
      ],
      "ExpectedArm64ASM": [
        "ldr s2, [x9, #16380]",
        "sub w20, w4, #0x4 (4)",
        "str s2, [x20]",
        "ldur s2, [x9, #-8]",
        "mov w20, #0x3ff2",
        "add w20, w4, w20",
        "str s2, [x20]",
        "ldrb w20, [x28, #1019]",
        "add w20, w20, #0x7 (7)",
        "and w20, w20, #0x7",
        "ldrb w21, [x28, #1298]",
        "mov w22, #0x1",
        "lsl w20, w22, w20",
        "bic w20, w21, w20",
        "strb w20, [x28, #1298]"
      ]
    }

Happens for both 64-bit mode with 64-bit addressing, and 32-bit mode with 32-bit addressing. Should be an easy win to fold those immediates in to str and stur.

Interesting - I am a bit confused about this. Are you on a private branch or sitting on uncommitted patches? I get 22 insts. This is the full json I am testing:

{
  "Features": {
    "Bitness": 64,
    "EnabledHostFeatures": [
      "SVE128",
      "SVE256",
      "AFP",
      "FLAGM",
      "FLAGM2",
      "CRYPTO"
    ],
    "DisabledHostFeatures": []
  },
  "Instructions": {
    "Test": {
      "x86InstructionCount": 4,
      "ExpectedInstructionCount": 22,
      "x86Insts": [
        "fld     dword [ebp+16380]",
        "fstp    dword [eax-0x4]",
        "fld     dword [ebp-0x8]",
        "fstp    dword [eax+16370]"
      ],
      "ExpectedArm64ASM": [
        "mov w20, #0x3ffc",
        "add x20, x9, x20",
        "mov w20, w20",
        "ldr s2, [x20]",
        "sub x20, x4, #0x4 (4)",
        "mov w20, w20",
        "str s2, [x20]",
        "sub x20, x9, #0x8 (8)",
        "mov w20, w20",
        "ldr s2, [x20]",
        "mov w20, #0x3ff2",
        "add x20, x4, x20",
        "mov w20, w20",
        "str s2, [x20]",
        "ldrb w20, [x28, #1019]",
        "add w20, w20, #0x7 (7)",
        "and w20, w20, #0x7",
        "ldrb w21, [x28, #1298]",
        "mov w22, #0x1",
        "lsl w20, w22, w20",
        "bic w20, w21, w20",
        "strb w20, [x28, #1298]"
      ]
    }
  }
}

Sonicadvance1 · 2025-01-06T19:10:34Z

An easy win I noticed here is that vector stores aren't handling the small store ranges that are possible in arm64. Loads seem to handle it although.

    "Test": {
      "x86InstructionCount": 4,
      "ExpectedInstructionCount": 15,
      "x86Insts": [
        "fld     dword [ebp+16380]",
        "fstp    dword [eax-0x4]",
        "fld     dword [ebp-0x8]",
        "fstp    dword [eax+16370]"
      ],
      "ExpectedArm64ASM": [
        "ldr s2, [x9, #16380]",
        "sub w20, w4, #0x4 (4)",
        "str s2, [x20]",
        "ldur s2, [x9, #-8]",
        "mov w20, #0x3ff2",
        "add w20, w4, w20",
        "str s2, [x20]",
        "ldrb w20, [x28, #1019]",
        "add w20, w20, #0x7 (7)",
        "and w20, w20, #0x7",
        "ldrb w21, [x28, #1298]",
        "mov w22, #0x1",
        "lsl w20, w22, w20",
        "bic w20, w21, w20",
        "strb w20, [x28, #1298]"
      ]
    }

Happens for both 64-bit mode with 64-bit addressing, and 32-bit mode with 32-bit addressing. Should be an easy win to fold those immediates in to str and stur.

Interesting - I am a bit confused about this. Are you on a pvt branch or sitting on uncommitted patches? I get 22 insts. This is the full json I am testing:

{
  "Features": {
    "Bitness": 64,
    "EnabledHostFeatures": [
      "SVE128",
      "SVE256",
      "AFP",
      "FLAGM",
      "FLAGM2",
      "CRYPTO"
    ],
    "DisabledHostFeatures": []
  },
  "Instructions": {
    "Test": {
      "x86InstructionCount": 4,
      "ExpectedInstructionCount": 22,
      "x86Insts": [
        "fld     dword [ebp+16380]",
        "fstp    dword [eax-0x4]",
        "fld     dword [ebp-0x8]",
        "fstp    dword [eax+16370]"
      ],
      "ExpectedArm64ASM": [
        "mov w20, #0x3ffc",
        "add x20, x9, x20",
        "mov w20, w20",
        "ldr s2, [x20]",
        "sub x20, x4, #0x4 (4)",
        "mov w20, w20",
        "str s2, [x20]",
        "sub x20, x9, #0x8 (8)",
        "mov w20, w20",
        "ldr s2, [x20]",
        "mov w20, #0x3ff2",
        "add x20, x4, x20",
        "mov w20, w20",
        "str s2, [x20]",
        "ldrb w20, [x28, #1019]",
        "add w20, w20, #0x7 (7)",
        "and w20, w20, #0x7",
        "ldrb w21, [x28, #1298]",
        "mov w22, #0x1",
        "lsl w20, w22, w20",
        "bic w20, w21, w20",
        "strb w20, [x28, #1298]"
      ]
    }
  }
}

Maybe because I had a few options enabled.

    "Env": {
      "FEX_X87REDUCEDPRECISION": "1",
      "FEX_TSOENABLED": "0",
      "FEX_TSOAUTOMIGRATION": "0"
    },

pmatos · 2025-01-07T08:40:51Z

An easy win I noticed here is that vector stores aren't handling the small store ranges that are possible in arm64. Loads seem to handle it although.

    "Test": {
      "x86InstructionCount": 4,
      "ExpectedInstructionCount": 15,
      "x86Insts": [
        "fld     dword [ebp+16380]",
        "fstp    dword [eax-0x4]",
        "fld     dword [ebp-0x8]",
        "fstp    dword [eax+16370]"
      ],
      "ExpectedArm64ASM": [
        "ldr s2, [x9, #16380]",
        "sub w20, w4, #0x4 (4)",
        "str s2, [x20]",
        "ldur s2, [x9, #-8]",
        "mov w20, #0x3ff2",
        "add w20, w4, w20",
        "str s2, [x20]",
        "ldrb w20, [x28, #1019]",
        "add w20, w20, #0x7 (7)",
        "and w20, w20, #0x7",
        "ldrb w21, [x28, #1298]",
        "mov w22, #0x1",
        "lsl w20, w22, w20",
        "bic w20, w21, w20",
        "strb w20, [x28, #1298]"
      ]
    }

Happens for both 64-bit mode with 64-bit addressing, and 32-bit mode with 32-bit addressing. Should be an easy win to fold those immediates in to str and stur.

Interesting - I am a bit confused about this. Are you on a pvt branch or sitting on uncommitted patches? I get 22 insts. This is the full json I am testing:

{
  "Features": {
    "Bitness": 64,
    "EnabledHostFeatures": [
      "SVE128",
      "SVE256",
      "AFP",
      "FLAGM",
      "FLAGM2",
      "CRYPTO"
    ],
    "DisabledHostFeatures": []
  },
  "Instructions": {
    "Test": {
      "x86InstructionCount": 4,
      "ExpectedInstructionCount": 22,
      "x86Insts": [
        "fld     dword [ebp+16380]",
        "fstp    dword [eax-0x4]",
        "fld     dword [ebp-0x8]",
        "fstp    dword [eax+16370]"
      ],
      "ExpectedArm64ASM": [
        "mov w20, #0x3ffc",
        "add x20, x9, x20",
        "mov w20, w20",
        "ldr s2, [x20]",
        "sub x20, x4, #0x4 (4)",
        "mov w20, w20",
        "str s2, [x20]",
        "sub x20, x9, #0x8 (8)",
        "mov w20, w20",
        "ldr s2, [x20]",
        "mov w20, #0x3ff2",
        "add x20, x4, x20",
        "mov w20, w20",
        "str s2, [x20]",
        "ldrb w20, [x28, #1019]",
        "add w20, w20, #0x7 (7)",
        "and w20, w20, #0x7",
        "ldrb w21, [x28, #1298]",
        "mov w22, #0x1",
        "lsl w20, w22, w20",
        "bic w20, w21, w20",
        "strb w20, [x28, #1298]"
      ]
    }
  }
}

Maybe because I had a few options enabled.

    "Env": {
      "FEX_X87REDUCEDPRECISION": "1",
      "FEX_TSOENABLED": "0",
      "FEX_TSOAUTOMIGRATION": "0"
    },

The difference was not actually those options but the bitness: 32-bit results in 15 insts, 64-bit results in 22 insts.

I have so many questions... :)

Sonicadvance1 · 2025-01-07T08:42:52Z

Oh right, the example block is from a 32-bit game, so yea would need a 32-bit option enabled.
Just occurs when address size equals GPR size and we know we can optimize those small offsets :)

pmatos · 2025-01-07T14:17:13Z

Oh right, the example block is from a 32-bit game, so yea would need a 32-bit option enabled. Just occurs when address size equals GPR size and we know we can optimize those small offsets :)

Yeah, I am fixing this. I made an incorrect assumption about memory optimization when I implemented the x87 stack optimization pass.

Mentioned initially in FEX-Emu#4252.

Includes tests and instcountci files. When the x87 optimizations were implemented, we missed optimizing different addressing modes. This commit addresses this issue. Discussed in FEX-Emu#4252.

Sonicadvance1 added Performance Things that will make FEX faster JIT Game related labels Jan 3, 2025

pmatos added a commit to pmatos/FEX that referenced this issue Jan 10, 2025

FST address modes optimization

8e26f86

Mentioned initially in FEX-Emu#4252.

pmatos mentioned this issue Jan 10, 2025

x87 fst/fld optimization for different addrmodes #4266

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Crysis 2 Maximum edition: Audio thread completely saturated with x87, breaks audio #4252

Crysis 2 Maximum edition: Audio thread completely saturated with x87, breaks audio #4252

Sonicadvance1 commented Jan 3, 2025

pmatos commented Jan 4, 2025

Sonicadvance1 commented Jan 5, 2025

pmatos commented Jan 6, 2025

Sonicadvance1 commented Jan 6, 2025

pmatos commented Jan 7, 2025

Sonicadvance1 commented Jan 7, 2025

pmatos commented Jan 7, 2025

Crysis 2 Maximum edition: Audio thread completely saturated with x87, breaks audio #4252

Crysis 2 Maximum edition: Audio thread completely saturated with x87, breaks audio #4252

Comments

Sonicadvance1 commented Jan 3, 2025

pmatos commented Jan 4, 2025

Sonicadvance1 commented Jan 5, 2025

pmatos commented Jan 6, 2025

Sonicadvance1 commented Jan 6, 2025

pmatos commented Jan 7, 2025

Sonicadvance1 commented Jan 7, 2025

pmatos commented Jan 7, 2025