Apply dist_thresh to Genius and Google backends

snejus · snejus · commit da95e2ffb88d · 2024-10-30T19:24:13.000Z
This commit introduces a distance threshold mechanism for the Genius and Google backends. - Create a new `SearchBackend` base class with a method `check_match` that performs checking. - Start using undocumented `dist_thresh` configuration option for good, and mention it in the docs. This controls the maximum allowable distance for matching artist and title names. These changes aim to improve the accuracy of lyrics matching, especially when there are slight variations in artist or title names, see #4791.
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
@@ -16,10 +16,10 @@
 
 from __future__ import annotations
 
-import difflib
 import errno
 import itertools
 import json
+import math
 import os.path
 import re
 import struct
@@ -30,14 +30,15 @@
 from functools import cached_property, partial, total_ordering
 from http import HTTPStatus
 from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
-from urllib.parse import quote, urlencode
+from urllib.parse import quote, urlencode, urlparse
 
 import requests
 from typing_extensions import TypedDict
 from unidecode import unidecode
 
 import beets
 from beets import plugins, ui
+from beets.autotag.hooks import string_dist
 
 if TYPE_CHECKING:
     from beets.importer import ImportTask
@@ -58,6 +59,7 @@
 except ImportError:
     HAS_LANGDETECT = False
 
+
 DIV_RE = re.compile(r"<(/?)div>?", re.I)
 COMMENT_RE = re.compile(r"<!--.*-->", re.S)
 TAG_RE = re.compile(r"<[^>]*>")
@@ -485,15 +487,47 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
         return lyrics
 
 
-class Genius(Backend):
+class SearchBackend(Backend):
+    REQUIRES_BS = True
+
+    @cached_property
+    def dist_thresh(self) -> float:
+        return self.config["dist_thresh"].get(float)
+
+    def check_match(
+        self, target_artist: str, target_title: str, artist: str, title: str
+    ) -> bool:
+        """Check if the given artist and title are 'good enough' match."""
+        max_dist = max(
+            string_dist(target_artist, artist),
+            string_dist(target_title, title),
+        )
+
+        if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
+            return True
+
+        if math.isclose(max_dist, self.dist_thresh, abs_tol=0.4):
+            # log out the candidate that did not make it but was close.
+            # This may show a matching candidate with some noise in the name
+            self._log.debug(
+                "({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
+                artist,
+                title,
+                target_artist,
+                target_title,
+                max_dist,
+            )
+
+        return False
+
+
+class Genius(SearchBackend):
     """Fetch lyrics from Genius via genius-api.
 
     Simply adapted from
     bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
     """
 
-    REQUIRES_BS = True
-
     base_url = "https://api.genius.com"
 
     def __init__(self, config, log):
@@ -516,19 +550,15 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
             self._log.debug("Genius API request returned invalid JSON")
             return None
 
-        # find a matching artist in the json
+        check = partial(self.check_match, artist, title)
         for hit in json["response"]["hits"]:
-            hit_artist = hit["result"]["primary_artist"]["name"]
-
-            if slug(hit_artist) == slug(artist):
-                html = self.fetch_url(hit["result"]["url"])
+            result = hit["result"]
+            if check(result["primary_artist"]["name"], result["title"]):
+                html = self.fetch_url(result["url"])
                 if not html:
                     return None
                 return self._scrape_lyrics_from_html(html)
 
-        self._log.debug(
-            "Genius failed to find a matching artist for '{0}'", artist
-        )
         return None
 
     def _search(self, artist, title):
@@ -724,10 +754,9 @@ def is_text_notcode(text):
         return None
 
 
-class Google(Backend):
+class Google(SearchBackend):
     """Fetch lyrics from Google search results."""
 
-    REQUIRES_BS = True
     SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
 
     def is_lyrics(self, text, artist=None):
@@ -775,21 +804,20 @@ def slugify(self, text):
     BY_TRANS = ["by", "par", "de", "von"]
     LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
 
-    def is_page_candidate(self, url_link, url_title, title, artist):
+    def is_page_candidate(
+        self, artist: str, title: str, url_link: str, url_title: str
+    ) -> bool:
         """Return True if the URL title makes it a good candidate to be a
         page that contains lyrics of title by artist.
         """
-        title = self.slugify(title.lower())
-        artist = self.slugify(artist.lower())
-        sitename = re.search(
-            "//([^/]+)/.*", self.slugify(url_link.lower())
-        ).group(1)
-        url_title = self.slugify(url_title.lower())
-
-        # Check if URL title contains song title (exact match)
-        if url_title.find(title) != -1:
+        title_slug = self.slugify(title.lower())
+        url_title_slug = self.slugify(url_title.lower())
+        if title_slug in url_title_slug:
             return True
 
+        artist = self.slugify(artist.lower())
+        sitename = urlparse(url_link).netloc
+
         # or try extracting song title from URL title and check if
         # they are close enough
         tokens = (
@@ -798,12 +826,9 @@ def is_page_candidate(self, url_link, url_title, title, artist):
             + self.LYRICS_TRANS
         )
         tokens = [re.escape(t) for t in tokens]
-        song_title = re.sub("(%s)" % "|".join(tokens), "", url_title)
+        song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
 
-        song_title = song_title.strip("_|")
-        typo_ratio = 0.9
-        ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
-        return ratio >= typo_ratio
+        return self.check_match(artist, title_slug, artist, song_title)
 
     def fetch(self, artist: str, title: str, *_) -> str | None:
         params = {
@@ -825,24 +850,21 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
             self._log.debug("google backend error: {0}", reason)
             return None
 
-        if "items" in data.keys():
-            for item in data["items"]:
-                url_link = item["link"]
-                url_title = item.get("title", "")
-                if not self.is_page_candidate(
-                    url_link, url_title, title, artist
-                ):
-                    continue
-                html = self.fetch_url(url_link)
-                if not html:
-                    continue
-                lyrics = scrape_lyrics_from_html(html)
-                if not lyrics:
-                    continue
-
-                if self.is_lyrics(lyrics, artist):
-                    self._log.debug("got lyrics from {0}", item["displayLink"])
-                    return lyrics
+        check_candidate = partial(self.is_page_candidate, artist, title)
+        for item in data.get("items", []):
+            url_link = item["link"]
+            if not check_candidate(url_link, item.get("title", "")):
+                continue
+            html = self.fetch_url(url_link)
+            if not html:
+                continue
+            lyrics = scrape_lyrics_from_html(html)
+            if not lyrics:
+                continue
+
+            if self.is_lyrics(lyrics, artist):
+                self._log.debug("got lyrics from {0}", item["displayLink"])
+                return lyrics
 
         return None
 
@@ -866,6 +888,7 @@ def __init__(self):
                 "bing_client_secret": None,
                 "bing_lang_from": [],
                 "bing_lang_to": None,
+                "dist_thresh": 0.11,
                 "google_API_key": None,
                 "google_engine_ID": "009217259823014548361:lndtuqkycfu",
                 "genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
@@ -877,7 +900,6 @@ def __init__(self):
                 # Musixmatch is disabled by default as they are currently blocking
                 # requests with the beets user agent.
                 "sources": [s for s in self.SOURCES if s != "musixmatch"],
-                "dist_thresh": 0.1,
             }
         )
         self.config["bing_client_secret"].redact = True
diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -23,6 +23,10 @@ New features:
 * Beets now uses ``platformdirs`` to determine the default music directory.
   This location varies between systems -- for example, users can configure it
   on Unix systems via ``user-dirs.dirs(5)``.
+* :doc:`plugins/lyrics`: Add new configuration option ``dist_thresh`` to
+  control the maximum allowed distance between the lyrics search result and the
+  tagged item's artist and title. This is useful for preventing false positives
+  when fetching lyrics.
 
 Bug fixes:
 
@@ -65,6 +69,9 @@ Bug fixes:
   ``lrclib`` over other sources since it returns reliable results quicker than
   others.
   :bug:`5102`
+* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able
+  to match lyrics when there was a slight variation in the artist name.
+  :bug:`4791`
 
 For packagers:
 
diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst
@@ -42,6 +42,12 @@ configuration file. The available options are:
   Default: ``[]``
 - **bing_lang_to**: Language to translate lyrics into.
   Default: None.
+- **dist_thresh**: The maximum distance between the artist and title
+  combination of the music file and lyrics candidate to consider them a match.
+  Lower values will make the plugin more strict, higher values will make it
+  more lenient. This does not apply to the ``lrclib`` backend as it matches
+  durations.
+  Default: ``0.11``.
 - **fallback**: By default, the file will be left unchanged when no lyrics are
   found. Use the empty string ``''`` to reset the lyrics in such a case.
   Default: None.
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
@@ -161,6 +161,42 @@ def test_slug(self, text, expected):
         assert lyrics.slug(text) == expected
 
 
+class TestSearchBackend:
+    @pytest.fixture
+    def backend(self, dist_thresh):
+        plugin = lyrics.LyricsPlugin()
+        plugin.config.set({"dist_thresh": dist_thresh})
+        return lyrics.SearchBackend(plugin.config, plugin._log)
+
+    @pytest.mark.parametrize(
+        "dist_thresh, target_artist, artist, should_match",
+        [
+            (0.11, "Target Artist", "Target Artist", True),
+            (0.11, "Target Artist", "Target Artis", True),
+            (0.11, "Target Artist", "Target Arti", False),
+            (0.11, "Psychonaut", "Psychonaut (BEL)", True),
+            (0.11, "beets song", "beats song", True),
+            (0.10, "beets song", "beats song", False),
+            (
+                0.11,
+                "Lucid Dreams (Forget Me)",
+                "Lucid Dreams (Remix) ft. Lil Uzi Vert",
+                False,
+            ),
+            (
+                0.12,
+                "Lucid Dreams (Forget Me)",
+                "Lucid Dreams (Remix) ft. Lil Uzi Vert",
+                True,
+            ),
+        ],
+    )
+    def test_check_match(self, backend, target_artist, artist, should_match):
+        assert (
+            backend.check_match(target_artist, "", artist, "") == should_match
+        )
+
+
 @pytest.fixture(scope="module")
 def lyrics_root_dir(pytestconfig: pytest.Config):
     return pytestconfig.rootpath / "test" / "rsrc" / "lyrics"
@@ -275,10 +311,10 @@ def test_is_page_candidate(
         self, backend, lyrics_html, url_title, artist, should_be_candidate
     ):
         result = backend.is_page_candidate(
+            artist,
+            self.TITLE,
             "http://www.example.com/lyrics/beetssong",
             url_title,
-            self.TITLE,
-            artist,
         )
         assert bool(result) == should_be_candidate