|
| 1 | +Pytorch version: 2.4.1-rc1 |
| 2 | +dot product calculation test |
| 3 | +tensor([[[ 0.0769, 1.4105, 0.0824, 0.5644, 0.5710, 0.8619, -0.0698, |
| 4 | + -0.1378], |
| 5 | + [-0.0206, 1.3138, -0.5070, 0.3971, 0.5620, 0.8419, -0.2367, |
| 6 | + 0.0135], |
| 7 | + [-0.1797, 1.3761, 0.0258, 0.5147, 0.5673, 0.7445, -0.0543, |
| 8 | + -0.0028]], |
| 9 | + |
| 10 | + [[-0.4074, 0.4956, 0.0553, -0.7740, -0.3718, 1.3344, 0.8070, |
| 11 | + -0.3321], |
| 12 | + [-0.5268, 0.5001, 0.0537, -0.6846, -0.3624, 1.1640, 0.6590, |
| 13 | + -0.2191], |
| 14 | + [-0.5697, 0.5082, 0.0254, -0.6951, -0.3435, 1.0934, 0.7012, |
| 15 | + -0.2850]]], device='cuda:0') |
| 16 | + |
| 17 | +Benchmarking cuda and cpu with Default, Math, Flash Attention and Memory pytorch backends |
| 18 | +Device: AMD Radeon RX 7700S |
| 19 | + Default cuda:0 benchmark: |
| 20 | + 24404.260 microseconds, 0.02440425969834905 sec |
| 21 | + Math cuda:0 benchmark: |
| 22 | + 71419.426 microseconds, 0.07141942633703972 sec |
| 23 | + Flash Attention cuda:0 benchmark: |
| 24 | + 24076.089 microseconds, 0.02407608859939501 sec |
| 25 | + Memory Efficient cuda:0 benchmark: |
| 26 | + 24541.843 microseconds, 0.024541843199403956 sec |
| 27 | +Device: cpu-16 |
| 28 | + Default cpu benchmark: |
| 29 | + 26995025.818 microseconds, 26.99502581800334 sec |
| 30 | + Math cpu benchmark: |
| 31 | + 30105574.327 microseconds, 30.105574326997157 sec |
| 32 | + Flash Attention cpu benchmark: |
| 33 | + 26501703.386 microseconds, 26.501703385991274 sec |
| 34 | + Memory Efficient cpu benchmark: |
| 35 | + Memory Efficient cpu is not supported. See warnings for reasons. |
| 36 | +Summary |
| 37 | + |
| 38 | +Pytorch version: 2.4.1-rc1 |
| 39 | +ROCM HIP version: 6.1.40093-8099c494c |
| 40 | +Device: AMD Radeon RX 7700S |
| 41 | + Default cuda:0: 24404.260 microseconds |
| 42 | + Math cuda:0: 71419.426 microseconds |
| 43 | + Flash Attention cuda:0: 24076.089 microseconds |
| 44 | + Memory Efficient cuda:0: 24541.843 microseconds |
| 45 | + |
| 46 | +Device: cpu-16 |
| 47 | + Default cpu: 26995025.818 microseconds |
| 48 | + Math cpu: 30105574.327 microseconds |
| 49 | + Flash Attention cpu: 26501703.386 microseconds |
| 50 | + Memory Efficient cpu: -1.000 microseconds |
| 51 | + |
0 commit comments