Skip to content

[NPU]: Add support for the cross_entropy operator. #1148

Open
TianHao324 wants to merge 1 commit into linkedin:main from
TianHao324:cross
Open

[NPU]: Add support for the cross_entropy operator. #1148
TianHao324 wants to merge 1 commit into linkedin:main from
TianHao324:cross

Conversation

@TianHao324
Copy link
Contributor

Summary

To address the UB overflow issue in the benchmark, we have added an operator with an NPU-friendly implementation of cross_entropy.
The current performance is 5-6 times higher than that of the native GPU code, and only slightly lower than that of Hugging Face. Further investigation will be conducted in the future.

  • Memory-Aware Block Size
  • NPU Core-Aware Grid Sizing

Testing Done

image
  • Hardware Type: Atlas 800I A2
  • run make test to ensure correctness
  • run make checkstyle to ensure code style
  • run make test-convergence to ensure convergence

@TianHao324
Copy link
Contributor Author

**************************************
     BENCHMARKING SPEED for CROSS_ENTROPY
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "cross_entropy",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      1.7440799474716187,
      1.8807799816131592,
      2.040940046310425,
      2.57177996635437,
      3.609760046005249,
      5.667240142822266
    ],
    "y_values_20": [
      1.7324639558792114,
      1.8643879890441895,
      2.027184009552002,
      2.5571560859680176,
      3.5952601432800293,
      5.655300140380859
    ],
    "y_values_80": [
      1.7613120079040527,
      1.898427963256836,
      2.052704095840454,
      2.597895860671997,
      3.625904083251953,
      5.681312084197998
    ],
    "timestamp": "2026-03-13 09:29:18",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "cross_entropy",
    "kernel_provider": "huggingface",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      0.4331800043582916,
      0.6962400078773499,
      1.118880033493042,
      1.9523899555206299,
      3.8360800743103027,
      8.013080596923828
    ],
    "y_values_20": [
      0.4306679964065552,
      0.69377601146698,
      1.1175240278244019,
      1.9493600130081177,
      3.8318119049072266,
      8.00057601928711
    ],
    "y_values_80": [
      0.4352959990501404,
      0.6979519724845886,
      1.1214599609375,
      1.9549200534820557,
      3.8428759574890137,
      8.0300874710083
    ],
    "timestamp": "2026-03-13 09:29:19",
    "kernel_operation_mode": "forward",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "cross_entropy",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      1.8060400485992432,
      2.53193998336792,
      3.8266398906707764,
      6.3856000900268555,
      11.679719924926758,
      21.96086883544922
    ],
    "y_values_20": [
      1.7943999767303467,
      2.521375894546509,
      3.821068048477173,
      6.377971649169922,
      11.67396354675293,
      21.913740158081055
    ],
    "y_values_80": [
      1.8110599517822266,
      2.553691864013672,
      3.8379600048065186,
      6.407639980316162,
      11.716840744018555,
      22.007999420166016
    ],
    "timestamp": "2026-03-13 09:29:20",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "cross_entropy",
    "kernel_provider": "huggingface",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      0.49070000648498535,
      0.9231300354003906,
      1.7569499015808105,
      3.408130168914795,
      6.7164201736450195,
      13.680139541625977
    ],
    "y_values_20": [
      0.4895839989185333,
      0.9211400151252747,
      1.7556239366531372,
      3.403140068054199,
      6.711668014526367,
      13.656871795654297
    ],
    "y_values_80": [
      0.49239999055862427,
      0.9251400232315063,
      1.7586760520935059,
      3.4108800888061523,
      6.719099998474121,
      13.687387466430664
    ],
    "timestamp": "2026-03-13 09:29:21",
    "kernel_operation_mode": "backward",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "cross_entropy",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      3.3448400497436523,
      4.767339706420898,
      6.829190254211426,
      11.314140319824219,
      20.290599822998047,
      38.17082214355469
    ],
    "y_values_20": [
      3.320215940475464,
      4.748836040496826,
      6.819339752197266,
      11.29755973815918,
      20.268524169921875,
      38.16250228881836
    ],
    "y_values_80": [
      3.3568639755249023,
      4.786159992218018,
      6.842671871185303,
      11.339603424072266,
      20.296236038208008,
      38.17913818359375
    ],
    "timestamp": "2026-03-13 09:29:22",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "cross_entropy",
    "kernel_provider": "huggingface",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      1.1367499828338623,
      2.2049200534820557,
      4.125999927520752,
      7.9351701736450195,
      15.773730278015137,
      32.22629928588867
    ],
    "y_values_20": [
      1.134179949760437,
      2.2021000385284424,
      4.1238837242126465,
      7.931103706359863,
      15.762680053710938,
      32.22282028198242
    ],
    "y_values_80": [
      1.1402720212936401,
      2.2080399990081787,
      4.13293981552124,
      7.93828821182251,
      15.781660079956055,
      32.230777740478516
    ],
    "timestamp": "2026-03-13 09:29:23",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "cross_entropy",
    "kernel_provider": "liger",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      1.7095600366592407,
      1.8112000226974487,
      1.9851300716400146,
      2.5277600288391113,
      3.563430070877075,
      5.656720161437988
    ],
    "y_values_20": [
      1.6908079385757446,
      1.8061479330062866,
      1.975108027458191,
      2.5159120559692383,
      3.5496439933776855,
      5.649620056152344
    ],
    "y_values_80": [
      1.722656011581421,
      1.8196359872817993,
      1.9961079359054565,
      2.5367400646209717,
      3.5727920532226562,
      5.670703887939453
    ],
    "timestamp": "2026-03-13 09:29:24",
    "kernel_operation_mode": "no-grad-forward",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "cross_entropy",
    "kernel_provider": "huggingface",
    "metric_name": "speed",
    "metric_unit": "ms",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      0.4330799877643585,
      0.6965800523757935,
      1.1198999881744385,
      1.9514000415802002,
      3.8371801376342773,
      8.018779754638672
    ],
    "y_values_20": [
      0.4309599995613098,
      0.6944559812545776,
      1.1177079677581787,
      1.9488799571990967,
      3.8304638862609863,
      8.010972023010254
    ],
    "y_values_80": [
      0.4356200098991394,
      0.6985679864883423,
      1.1218160390853882,
      1.9553799629211426,
      3.843883991241455,
      8.023728370666504
    ],
    "timestamp": "2026-03-13 09:29:25",
    "kernel_operation_mode": "no-grad-forward",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  }
]
**************************************
     BENCHMARKING MEMORY for CROSS_ENTROPY
**************************************
********** Benchmark Data **********
[
  {
    "kernel_name": "cross_entropy",
    "kernel_provider": "liger",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      256.583984375,
      512.583984375,
      1024.583984375,
      2048.583984375,
      4096.583984375,
      8192.583984375
    ],
    "y_values_20": [
      256.583984375,
      512.583984375,
      1024.583984375,
      2048.583984375,
      4096.583984375,
      8192.583984375
    ],
    "y_values_80": [
      256.583984375,
      512.583984375,
      1024.583984375,
      2048.583984375,
      4096.583984375,
      8192.583984375
    ],
    "timestamp": "2026-03-13 09:29:26",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  },
  {
    "kernel_name": "cross_entropy",
    "kernel_provider": "huggingface",
    "metric_name": "memory",
    "metric_unit": "MB",
    "gpu_name": "Ascend910B4",
    "x_name": "V",
    "x_label": "vocab size",
    "x_values": [
      1024,
      2048,
      4096,
      8192,
      16384,
      32768
    ],
    "y_values_50": [
      320.12890625,
      640.12890625,
      1280.12890625,
      2560.12890625,
      5120.12890625,
      10240.12890625
    ],
    "y_values_20": [
      320.12890625,
      640.12890625,
      1280.12890625,
      2560.12890625,
      5120.12890625,
      10240.12890625
    ],
    "y_values_80": [
      320.12890625,
      640.12890625,
      1280.12890625,
      2560.12890625,
      5120.12890625,
      10240.12890625
    ],
    "timestamp": "2026-03-13 09:29:26",
    "kernel_operation_mode": "full",
    "extra_benchmark_config_str": "{\"B\": 8, \"T\": 2048}",
    "liger_version": "0.0.0"
  }
]

@TianHao324
Copy link
Contributor Author

@Tcc0403 would you mind taking a look?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant