From 097ef716a66143c6deb429a8dabc29afab1aab38 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Thu, 21 May 2026 11:22:29 +0200 Subject: [PATCH 1/6] Add Google Gemini 3.5 VLM support, reusing the existing Gemini 2.5 response parser. --- src/supervision/detection/core.py | 5 ++- src/supervision/detection/vlm.py | 7 ++++ tests/detection/test_vlm.py | 68 +++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 1 deletion(-) diff --git a/src/supervision/detection/core.py b/src/supervision/detection/core.py index 30baca78b8..3949ec3fc9 100644 --- a/src/supervision/detection/core.py +++ b/src/supervision/detection/core.py @@ -977,6 +977,7 @@ def from_lmm( | Qwen2.5-VL | `QWEN_2_5_VL` | detection | `resolution_wh`, `input_wh` | `classes` | | Google Gemini 2.0 | `GOOGLE_GEMINI_2_0` | detection | `resolution_wh` | `classes` | | Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` | + | Google Gemini 3.5 | `GOOGLE_GEMINI_3_5` | detection, segmentation | `resolution_wh` | `classes` | | Moondream | `MOONDREAM` | detection | `resolution_wh` | | | DeepSeek-VL2 | `DEEPSEEK_VL_2` | detection | `resolution_wh` | `classes` | @@ -1425,6 +1426,7 @@ def from_lmm( LMM.DEEPSEEK_VL_2: VLM.DEEPSEEK_VL_2, LMM.GOOGLE_GEMINI_2_0: VLM.GOOGLE_GEMINI_2_0, LMM.GOOGLE_GEMINI_2_5: VLM.GOOGLE_GEMINI_2_5, + LMM.GOOGLE_GEMINI_3_5: VLM.GOOGLE_GEMINI_3_5, } if isinstance(lmm, LMM): @@ -1464,6 +1466,7 @@ def from_vlm( | Qwen3-VL | `QWEN_3_VL` | detection | `resolution_wh`, | `classes` | | Google Gemini 2.0 | `GOOGLE_GEMINI_2_0` | detection | `resolution_wh` | `classes` | | Google Gemini 2.5 | `GOOGLE_GEMINI_2_5` | detection, segmentation | `resolution_wh` | `classes` | + | Google Gemini 3.5 | `GOOGLE_GEMINI_3_5` | detection, segmentation | `resolution_wh` | `classes` | | Moondream | `MOONDREAM` | detection | `resolution_wh` | | | DeepSeek-VL2 | `DEEPSEEK_VL_2` | detection | `resolution_wh` | `classes` | @@ -1931,7 +1934,7 @@ def from_vlm( xyxy = from_moondream(result, **kwargs) return cls(xyxy=xyxy) - if vlm == VLM.GOOGLE_GEMINI_2_5: + if vlm in (VLM.GOOGLE_GEMINI_2_5, VLM.GOOGLE_GEMINI_3_5): assert isinstance(result, str) gemini_result = from_google_gemini_2_5(result, **kwargs) data = {CLASS_NAME_DATA_FIELD: gemini_result[2]} diff --git a/src/supervision/detection/vlm.py b/src/supervision/detection/vlm.py index de53548d04..f655f73b4c 100644 --- a/src/supervision/detection/vlm.py +++ b/src/supervision/detection/vlm.py @@ -33,6 +33,7 @@ class LMM(Enum): QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba. GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model. GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model. + GOOGLE_GEMINI_3_5: Google Gemini 3.5 vision-language model. MOONDREAM: The Moondream vision-language model. """ @@ -43,6 +44,7 @@ class LMM(Enum): DEEPSEEK_VL_2 = "deepseek_vl_2" GOOGLE_GEMINI_2_0 = "gemini_2_0" GOOGLE_GEMINI_2_5 = "gemini_2_5" + GOOGLE_GEMINI_3_5 = "gemini_3_5" MOONDREAM = "moondream" @classmethod @@ -80,6 +82,7 @@ class VLM(Enum): QWEN_3_VL: Qwen3-VL open vision-language model from Alibaba. GOOGLE_GEMINI_2_0: Google Gemini 2.0 vision-language model. GOOGLE_GEMINI_2_5: Google Gemini 2.5 vision-language model. + GOOGLE_GEMINI_3_5: Google Gemini 3.5 vision-language model. MOONDREAM: The Moondream vision-language model. """ @@ -90,6 +93,7 @@ class VLM(Enum): DEEPSEEK_VL_2 = "deepseek_vl_2" GOOGLE_GEMINI_2_0 = "gemini_2_0" GOOGLE_GEMINI_2_5 = "gemini_2_5" + GOOGLE_GEMINI_3_5 = "gemini_3_5" MOONDREAM = "moondream" @classmethod @@ -120,6 +124,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.DEEPSEEK_VL_2: str, VLM.GOOGLE_GEMINI_2_0: str, VLM.GOOGLE_GEMINI_2_5: str, + VLM.GOOGLE_GEMINI_3_5: str, VLM.MOONDREAM: dict, } @@ -131,6 +136,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.DEEPSEEK_VL_2: ["resolution_wh"], VLM.GOOGLE_GEMINI_2_0: ["resolution_wh"], VLM.GOOGLE_GEMINI_2_5: ["resolution_wh"], + VLM.GOOGLE_GEMINI_3_5: ["resolution_wh"], VLM.MOONDREAM: ["resolution_wh"], } @@ -142,6 +148,7 @@ def from_value(cls, value: VLM | str) -> VLM: VLM.DEEPSEEK_VL_2: ["resolution_wh", "classes"], VLM.GOOGLE_GEMINI_2_0: ["resolution_wh", "classes"], VLM.GOOGLE_GEMINI_2_5: ["resolution_wh", "classes"], + VLM.GOOGLE_GEMINI_3_5: ["resolution_wh", "classes"], VLM.MOONDREAM: ["resolution_wh"], } diff --git a/tests/detection/test_vlm.py b/tests/detection/test_vlm.py index b5d035b065..73b3378115 100644 --- a/tests/detection/test_vlm.py +++ b/tests/detection/test_vlm.py @@ -1297,3 +1297,71 @@ def test_from_deepseek_vl_2( detections.data[CLASS_NAME_DATA_FIELD], expected_detections.data[CLASS_NAME_DATA_FIELD], ) + + +@pytest.mark.parametrize( + "result, resolution_wh, classes", + [ + ( + "random text", + (1000, 1000), + None, + ), + ( + "```json\n[]\n```", + (1000, 1000), + None, + ), + ( + """```json + [ + {"box_2d": [100, 200, 300, 400], "label": "cat", "confidence": 0.8} + ] + ```""", + (1000, 500), + None, + ), + ( + """```json + [ + {"box_2d": [10, 20, 110, 120], "label": "cat", "confidence": 0.8}, + {"box_2d": [50, 100, 150, 200], "label": "dog", "confidence": 0.9} + ] + ```""", + (640, 480), + ["cat", "dog"], + ), + ], +) +def test_from_google_gemini_3_5_matches_2_5( + result: str, + resolution_wh: tuple[int, int], + classes: list[str] | None, +): + detections_2_5 = Detections.from_vlm( + vlm=VLM.GOOGLE_GEMINI_2_5, + result=result, + resolution_wh=resolution_wh, + classes=classes, + ) + detections_3_5 = Detections.from_vlm( + vlm=VLM.GOOGLE_GEMINI_3_5, + result=result, + resolution_wh=resolution_wh, + classes=classes, + ) + + assert len(detections_2_5) == len(detections_3_5) + + if len(detections_2_5) == 0: + return + + assert np.allclose(detections_2_5.xyxy, detections_3_5.xyxy) + assert np.array_equal(detections_2_5.class_id, detections_3_5.class_id) + assert np.array_equal( + detections_2_5.data[CLASS_NAME_DATA_FIELD], + detections_3_5.data[CLASS_NAME_DATA_FIELD], + ) + if detections_2_5.confidence is not None: + assert np.allclose(detections_2_5.confidence, detections_3_5.confidence) + From e99ca5402012fc01d153b91a912a255f8a1693e8 Mon Sep 17 00:00:00 2001 From: SkalskiP Date: Thu, 21 May 2026 11:33:20 +0200 Subject: [PATCH 2/6] Fix ruff PT006 lint by using tuple for pytest.mark.parametrize argument. --- tests/detection/test_vlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/detection/test_vlm.py b/tests/detection/test_vlm.py index 73b3378115..beef23761b 100644 --- a/tests/detection/test_vlm.py +++ b/tests/detection/test_vlm.py @@ -1300,7 +1300,7 @@ def test_from_deepseek_vl_2( @pytest.mark.parametrize( - "result, resolution_wh, classes", + ("result", "resolution_wh", "classes"), [ ( "random text", From 4bd6cf7e357cc1d1abe1a8f15ec7180ad4aa6f3d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 21 May 2026 09:39:43 +0000 Subject: [PATCH 3/6] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto=20?= =?UTF-8?q?format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/detection/test_vlm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/detection/test_vlm.py b/tests/detection/test_vlm.py index beef23761b..6933c4b39a 100644 --- a/tests/detection/test_vlm.py +++ b/tests/detection/test_vlm.py @@ -1364,4 +1364,3 @@ def test_from_google_gemini_3_5_matches_2_5( ) if detections_2_5.confidence is not None: assert np.allclose(detections_2_5.confidence, detections_3_5.confidence) - From 338b0ec6908904d3711febf4b5c551ddecd12501 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Fri, 22 May 2026 21:27:40 +0200 Subject: [PATCH 4/6] Apply suggestions from code review Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> --- tests/detection/test_vlm.py | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/tests/detection/test_vlm.py b/tests/detection/test_vlm.py index 6933c4b39a..25b6386c4b 100644 --- a/tests/detection/test_vlm.py +++ b/tests/detection/test_vlm.py @@ -1305,33 +1305,10 @@ def test_from_deepseek_vl_2( ( "random text", (1000, 1000), - None, + None ), ( - "```json\n[]\n```", - (1000, 1000), - None, - ), - ( - """```json - [ - {"box_2d": [100, 200, 300, 400], "label": "cat", "confidence": 0.8} - ] - ```""", - (1000, 500), - None, - ), - ( - """```json - [ - {"box_2d": [10, 20, 110, 120], "label": "cat", "confidence": 0.8}, - {"box_2d": [50, 100, 150, 200], "label": "dog", "confidence": 0.9} - ] - ```""", - (640, 480), - ["cat", "dog"], - ), - ], + " ) def test_from_google_gemini_3_5_matches_2_5( result: str, From 1dd1def5133692480aaf63c748358426e0ef0cbf Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Fri, 22 May 2026 21:29:26 +0200 Subject: [PATCH 5/6] Update test cases in test_vlm.py for JSON inputs --- tests/detection/test_vlm.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tests/detection/test_vlm.py b/tests/detection/test_vlm.py index 25b6386c4b..e390737b7a 100644 --- a/tests/detection/test_vlm.py +++ b/tests/detection/test_vlm.py @@ -1308,7 +1308,30 @@ def test_from_deepseek_vl_2( None ), ( - " + "```json\n[]\n```", + (1000, 1000), + None + ), + ( + """```json + [ + {"box_2d": [100, 200, 300, 400], "label": "cat", "confidence": 0.8} + ] + ```""", + (1000, 500), + None + ), + ( + """```json + [ + {"box_2d": [10, 20, 110, 120], "label": "cat", "confidence": 0.8}, + {"box_2d": [50, 100, 150, 200], "label": "dog", "confidence": 0.9} + ] + ```""", + (640, 480), + ["cat", "dog"] + ), + ], ) def test_from_google_gemini_3_5_matches_2_5( result: str, From 4a56570315efef1c5efb1745a57150734acb6cc3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 22 May 2026 19:29:54 +0000 Subject: [PATCH 6/6] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto=20?= =?UTF-8?q?format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/detection/test_vlm.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/tests/detection/test_vlm.py b/tests/detection/test_vlm.py index e390737b7a..98497b7d40 100644 --- a/tests/detection/test_vlm.py +++ b/tests/detection/test_vlm.py @@ -1302,16 +1302,8 @@ def test_from_deepseek_vl_2( @pytest.mark.parametrize( ("result", "resolution_wh", "classes"), [ - ( - "random text", - (1000, 1000), - None - ), - ( - "```json\n[]\n```", - (1000, 1000), - None - ), + ("random text", (1000, 1000), None), + ("```json\n[]\n```", (1000, 1000), None), ( """```json [ @@ -1319,7 +1311,7 @@ def test_from_deepseek_vl_2( ] ```""", (1000, 500), - None + None, ), ( """```json @@ -1329,7 +1321,7 @@ def test_from_deepseek_vl_2( ] ```""", (640, 480), - ["cat", "dog"] + ["cat", "dog"], ), ], )