Skip to content

Commit da95e2f

Browse files
committed
Apply dist_thresh to Genius and Google backends
This commit introduces a distance threshold mechanism for the Genius and Google backends.

- Create a new `SearchBackend` base class with a `check_match` method that performs the distance check.
- Start using the previously undocumented `dist_thresh` configuration option for good, and mention it in the docs. It controls the maximum allowable distance when matching artist and title names.

These changes aim to improve the accuracy of lyrics matching, especially when there are slight variations in artist or title names; see #4791.
1 parent 16188fa commit da95e2f

File tree

4 files changed

+122
-51
lines changed

4 files changed

+122
-51
lines changed

beetsplug/lyrics.py

Lines changed: 71 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@
1616

1717
from __future__ import annotations
1818

19-
import difflib
2019
import errno
2120
import itertools
2221
import json
22+
import math
2323
import os.path
2424
import re
2525
import struct
@@ -30,14 +30,15 @@
3030
from functools import cached_property, partial, total_ordering
3131
from http import HTTPStatus
3232
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
33-
from urllib.parse import quote, urlencode
33+
from urllib.parse import quote, urlencode, urlparse
3434

3535
import requests
3636
from typing_extensions import TypedDict
3737
from unidecode import unidecode
3838

3939
import beets
4040
from beets import plugins, ui
41+
from beets.autotag.hooks import string_dist
4142

4243
if TYPE_CHECKING:
4344
from beets.importer import ImportTask
@@ -58,6 +59,7 @@
5859
except ImportError:
5960
HAS_LANGDETECT = False
6061

62+
6163
DIV_RE = re.compile(r"<(/?)div>?", re.I)
6264
COMMENT_RE = re.compile(r"<!--.*-->", re.S)
6365
TAG_RE = re.compile(r"<[^>]*>")
@@ -485,15 +487,47 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
485487
return lyrics
486488

487489

488-
class Genius(Backend):
490+
class SearchBackend(Backend):
491+
REQUIRES_BS = True
492+
493+
@cached_property
494+
def dist_thresh(self) -> float:
495+
return self.config["dist_thresh"].get(float)
496+
497+
def check_match(
498+
self, target_artist: str, target_title: str, artist: str, title: str
499+
) -> bool:
500+
"""Check if the given artist and title are 'good enough' match."""
501+
max_dist = max(
502+
string_dist(target_artist, artist),
503+
string_dist(target_title, title),
504+
)
505+
506+
if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
507+
return True
508+
509+
if math.isclose(max_dist, self.dist_thresh, abs_tol=0.4):
510+
# Log the candidate that did not make it but was close.
511+
# This may show a matching candidate with some noise in the name
512+
self._log.debug(
513+
"({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
514+
artist,
515+
title,
516+
target_artist,
517+
target_title,
518+
max_dist,
519+
)
520+
521+
return False
522+
523+
524+
class Genius(SearchBackend):
489525
"""Fetch lyrics from Genius via genius-api.
490526
491527
Simply adapted from
492528
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
493529
"""
494530

495-
REQUIRES_BS = True
496-
497531
base_url = "https://api.genius.com"
498532

499533
def __init__(self, config, log):
@@ -516,19 +550,15 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
516550
self._log.debug("Genius API request returned invalid JSON")
517551
return None
518552

519-
# find a matching artist in the json
553+
check = partial(self.check_match, artist, title)
520554
for hit in json["response"]["hits"]:
521-
hit_artist = hit["result"]["primary_artist"]["name"]
522-
523-
if slug(hit_artist) == slug(artist):
524-
html = self.fetch_url(hit["result"]["url"])
555+
result = hit["result"]
556+
if check(result["primary_artist"]["name"], result["title"]):
557+
html = self.fetch_url(result["url"])
525558
if not html:
526559
return None
527560
return self._scrape_lyrics_from_html(html)
528561

529-
self._log.debug(
530-
"Genius failed to find a matching artist for '{0}'", artist
531-
)
532562
return None
533563

534564
def _search(self, artist, title):
@@ -724,10 +754,9 @@ def is_text_notcode(text):
724754
return None
725755

726756

727-
class Google(Backend):
757+
class Google(SearchBackend):
728758
"""Fetch lyrics from Google search results."""
729759

730-
REQUIRES_BS = True
731760
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
732761

733762
def is_lyrics(self, text, artist=None):
@@ -775,21 +804,20 @@ def slugify(self, text):
775804
BY_TRANS = ["by", "par", "de", "von"]
776805
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
777806

778-
def is_page_candidate(self, url_link, url_title, title, artist):
807+
def is_page_candidate(
808+
self, artist: str, title: str, url_link: str, url_title: str
809+
) -> bool:
779810
"""Return True if the URL title makes it a good candidate to be a
780811
page that contains lyrics of title by artist.
781812
"""
782-
title = self.slugify(title.lower())
783-
artist = self.slugify(artist.lower())
784-
sitename = re.search(
785-
"//([^/]+)/.*", self.slugify(url_link.lower())
786-
).group(1)
787-
url_title = self.slugify(url_title.lower())
788-
789-
# Check if URL title contains song title (exact match)
790-
if url_title.find(title) != -1:
813+
title_slug = self.slugify(title.lower())
814+
url_title_slug = self.slugify(url_title.lower())
815+
if title_slug in url_title_slug:
791816
return True
792817

818+
artist = self.slugify(artist.lower())
819+
sitename = urlparse(url_link).netloc
820+
793821
# or try extracting song title from URL title and check if
794822
# they are close enough
795823
tokens = (
@@ -798,12 +826,9 @@ def is_page_candidate(self, url_link, url_title, title, artist):
798826
+ self.LYRICS_TRANS
799827
)
800828
tokens = [re.escape(t) for t in tokens]
801-
song_title = re.sub("(%s)" % "|".join(tokens), "", url_title)
829+
song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
802830

803-
song_title = song_title.strip("_|")
804-
typo_ratio = 0.9
805-
ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
806-
return ratio >= typo_ratio
831+
return self.check_match(artist, title_slug, artist, song_title)
807832

808833
def fetch(self, artist: str, title: str, *_) -> str | None:
809834
params = {
@@ -825,24 +850,21 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
825850
self._log.debug("google backend error: {0}", reason)
826851
return None
827852

828-
if "items" in data.keys():
829-
for item in data["items"]:
830-
url_link = item["link"]
831-
url_title = item.get("title", "")
832-
if not self.is_page_candidate(
833-
url_link, url_title, title, artist
834-
):
835-
continue
836-
html = self.fetch_url(url_link)
837-
if not html:
838-
continue
839-
lyrics = scrape_lyrics_from_html(html)
840-
if not lyrics:
841-
continue
842-
843-
if self.is_lyrics(lyrics, artist):
844-
self._log.debug("got lyrics from {0}", item["displayLink"])
845-
return lyrics
853+
check_candidate = partial(self.is_page_candidate, artist, title)
854+
for item in data.get("items", []):
855+
url_link = item["link"]
856+
if not check_candidate(url_link, item.get("title", "")):
857+
continue
858+
html = self.fetch_url(url_link)
859+
if not html:
860+
continue
861+
lyrics = scrape_lyrics_from_html(html)
862+
if not lyrics:
863+
continue
864+
865+
if self.is_lyrics(lyrics, artist):
866+
self._log.debug("got lyrics from {0}", item["displayLink"])
867+
return lyrics
846868

847869
return None
848870

@@ -866,6 +888,7 @@ def __init__(self):
866888
"bing_client_secret": None,
867889
"bing_lang_from": [],
868890
"bing_lang_to": None,
891+
"dist_thresh": 0.11,
869892
"google_API_key": None,
870893
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
871894
"genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
@@ -877,7 +900,6 @@ def __init__(self):
877900
# Musixmatch is disabled by default as they are currently blocking
878901
# requests with the beets user agent.
879902
"sources": [s for s in self.SOURCES if s != "musixmatch"],
880-
"dist_thresh": 0.1,
881903
}
882904
)
883905
self.config["bing_client_secret"].redact = True

docs/changelog.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ New features:
2323
* Beets now uses ``platformdirs`` to determine the default music directory.
2424
This location varies between systems -- for example, users can configure it
2525
on Unix systems via ``user-dirs.dirs(5)``.
26+
* :doc:`plugins/lyrics`: Add new configuration option ``dist_thresh`` to
27+
control the maximum allowed distance between the lyrics search result and the
28+
tagged item's artist and title. This is useful for preventing false positives
29+
when fetching lyrics.
2630

2731
Bug fixes:
2832

@@ -65,6 +69,9 @@ Bug fixes:
6569
``lrclib`` over other sources since it returns reliable results quicker than
6670
others.
6771
:bug:`5102`
72+
* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able
73+
to match lyrics when there was a slight variation in the artist name.
74+
:bug:`4791`
6875

6976
For packagers:
7077

docs/plugins/lyrics.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,12 @@ configuration file. The available options are:
4242
Default: ``[]``
4343
- **bing_lang_to**: Language to translate lyrics into.
4444
Default: None.
45+
- **dist_thresh**: The maximum distance between the artist and title
46+
combination of the music file and lyrics candidate to consider them a match.
47+
Lower values will make the plugin more strict, higher values will make it
48+
more lenient. This does not apply to the ``lrclib`` backend as it matches
49+
durations.
50+
Default: ``0.11``.
4551
- **fallback**: By default, the file will be left unchanged when no lyrics are
4652
found. Use the empty string ``''`` to reset the lyrics in such a case.
4753
Default: None.

test/plugins/test_lyrics.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,42 @@ def test_slug(self, text, expected):
161161
assert lyrics.slug(text) == expected
162162

163163

164+
class TestSearchBackend:
165+
@pytest.fixture
166+
def backend(self, dist_thresh):
167+
plugin = lyrics.LyricsPlugin()
168+
plugin.config.set({"dist_thresh": dist_thresh})
169+
return lyrics.SearchBackend(plugin.config, plugin._log)
170+
171+
@pytest.mark.parametrize(
172+
"dist_thresh, target_artist, artist, should_match",
173+
[
174+
(0.11, "Target Artist", "Target Artist", True),
175+
(0.11, "Target Artist", "Target Artis", True),
176+
(0.11, "Target Artist", "Target Arti", False),
177+
(0.11, "Psychonaut", "Psychonaut (BEL)", True),
178+
(0.11, "beets song", "beats song", True),
179+
(0.10, "beets song", "beats song", False),
180+
(
181+
0.11,
182+
"Lucid Dreams (Forget Me)",
183+
"Lucid Dreams (Remix) ft. Lil Uzi Vert",
184+
False,
185+
),
186+
(
187+
0.12,
188+
"Lucid Dreams (Forget Me)",
189+
"Lucid Dreams (Remix) ft. Lil Uzi Vert",
190+
True,
191+
),
192+
],
193+
)
194+
def test_check_match(self, backend, target_artist, artist, should_match):
195+
assert (
196+
backend.check_match(target_artist, "", artist, "") == should_match
197+
)
198+
199+
164200
@pytest.fixture(scope="module")
165201
def lyrics_root_dir(pytestconfig: pytest.Config):
166202
return pytestconfig.rootpath / "test" / "rsrc" / "lyrics"
@@ -275,10 +311,10 @@ def test_is_page_candidate(
275311
self, backend, lyrics_html, url_title, artist, should_be_candidate
276312
):
277313
result = backend.is_page_candidate(
314+
artist,
315+
self.TITLE,
278316
"http://www.example.com/lyrics/beetssong",
279317
url_title,
280-
self.TITLE,
281-
artist,
282318
)
283319
assert bool(result) == should_be_candidate
284320

0 commit comments

Comments
 (0)