Merged
Changes from 3 commits
1 change: 1 addition & 0 deletions .github/workflows/1xL4_tests.yml
@@ -51,3 +51,4 @@ jobs:
 pytest test/dtypes/test_affine_quantized_float.py --verbose -s
 ./test/float8/test_everything_single_gpu.sh
 python test/quantization/quantize_/workflows/float8/test_float8_tensor.py
+python test/kernel/test_blockwise_triton.py --verbose -s
2 changes: 1 addition & 1 deletion benchmarks/benchmark_blockwise_scaled_linear_triton.py
@@ -13,7 +13,7 @@
 from triton.testing import do_bench
 
 from torchao.float8.float8_utils import compute_error
-from torchao.prototype.blockwise_fp8_inference.blockwise_quantization import (
+from torchao.kernel.blockwise_quantization import (
     blockwise_fp8_gemm,
     fp8_blockwise_act_quant,
     fp8_blockwise_weight_quant,
@@ -11,7 +11,7 @@
 
 triton = pytest.importorskip("triton", reason="Triton required to run this test")
 
-from torchao.prototype.blockwise_fp8_inference.blockwise_quantization import (
+from torchao.kernel.blockwise_quantization import (
Contributor comment: note for later: let's rename this torchao.kernels* (plural)
     blockwise_fp8_gemm,
     fp8_blockwise_act_quant,
     fp8_blockwise_weight_dequant,
@@ -294,6 +294,7 @@ def test_slice_and_copy_similar_to_vllm(self, granularity):
         self._test_slice_and_copy_similar_to_vllm(config)
 
     @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
+    @unittest.skipIf(not _is_fbgemm_gpu_genai_available(), "Need fbgemm_gpu_genai")
     def test_bmm(self):
         # only support per row quantization
         config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
@@ -406,6 +407,7 @@ def test_cat(self, granularity, sizes):
         self.assertEqual(cat_qweight2.scale, ref_scale)
 
     @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
+    @unittest.skipIf(not _is_fbgemm_gpu_genai_available(), "Need fbgemm_gpu_genai")
     def test_moe_weight_reshape_ops(self):
         # only per row quantization is supported for bmm
         granularity = PerRow()
@@ -416,6 +418,7 @@ def test_moe_weight_reshape_ops(self):
     # that should be moved here after v1 config is deprecated:
     # https://github.com/pytorch/ao/issues/2649
     @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
+    @unittest.skipIf(not _is_fbgemm_gpu_genai_available(), "Need fbgemm_gpu_genai")
     def test_expected_gpu_kernel_fbgemm(self):
         """Making sure KernelPreference.FBGEMM calls correct quantize and gemm kernels
         and the bias add happens in the gemm kernel for per row quantization
5 changes: 3 additions & 2 deletions torchao/prototype/blockwise_fp8_inference/__init__.py
@@ -1,11 +1,12 @@
-from .blockwise_linear import BlockwiseQuantLinear
-from .blockwise_quantization import (
+from torchao.kernel.blockwise_quantization import (
     blockwise_fp8_gemm,
     fp8_blockwise_act_quant,
     fp8_blockwise_weight_dequant,
     fp8_blockwise_weight_quant,
 )
 
+from .blockwise_linear import BlockwiseQuantLinear
+
 __all__ = [
     "blockwise_fp8_gemm",
     "BlockwiseQuantLinear",
@@ -7,7 +7,7 @@
 import torch
 from torch import nn
 
-from torchao.prototype.blockwise_fp8_inference.blockwise_quantization import (
+from torchao.kernel.blockwise_quantization import (
     blockwise_fp8_gemm,
     fp8_blockwise_act_quant,
 )
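
For downstream users, the net effect of this PR is that the blockwise FP8 kernels are importable from torchao.kernel.blockwise_quantization rather than torchao.prototype.blockwise_fp8_inference.blockwise_quantization (the prototype package keeps re-exporting them for compatibility). A minimal usage sketch under the new path follows; the (quantized tensor, scales) return convention and the block_size argument shown here are assumptions based only on the names in this diff, so check the kernel module for the exact signatures.

# Sketch only: signatures and defaults are assumed, not taken from this diff.
import torch

from torchao.kernel.blockwise_quantization import (
    blockwise_fp8_gemm,
    fp8_blockwise_act_quant,
    fp8_blockwise_weight_quant,
)

block_size = 128
x = torch.randn(256, 512, device="cuda", dtype=torch.bfloat16)  # activations (M, K)
w = torch.randn(384, 512, device="cuda", dtype=torch.bfloat16)  # weight (N, K)

# Quantize to fp8 with per-block scales (assumed return: (fp8_tensor, scales)).
x_fp8, x_scale = fp8_blockwise_act_quant(x, block_size)
w_fp8, w_scale = fp8_blockwise_weight_quant(w, block_size)

# Blockwise-scaled fp8 GEMM; the result comes back in higher precision.
y = blockwise_fp8_gemm(x_fp8, x_scale, w_fp8, w_scale, block_size)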