From 622ed3c7191e20072234748f4584a2f42dc4c460 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Fri, 6 Sep 2024 12:11:01 +0100 Subject: [PATCH] Use a single slug implementation Tidy up 'Google.is_page_candidate' method and remove 'Google.sluggify' method which was a duplicate of 'slug'. Since 'GeniusFetchTest' only tested whether the artist name is cleaned up (the rest of the functionality is patched), remove it and move its test cases to the 'test_slug' test. --- beetsplug/lyrics.py | 36 +++---------- test/plugins/test_lyrics.py | 105 +++++++----------------------------- 2 files changed, 27 insertions(+), 114 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index c662110e90..88e0616feb 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -22,12 +22,12 @@ import itertools import os.path import re -import unicodedata import urllib from contextlib import contextmanager from functools import cached_property from html import unescape from typing import Any, Iterator +from urllib.parse import urlparse import requests from typing_extensions import TypedDict @@ -183,7 +183,7 @@ def generate_alternatives(string, patterns): return itertools.product(artists, multi_titles) -def slug(text): +def slug(text: str) -> str: """Make a URL-safe, human-readable version of the given text This will do the following: @@ -193,10 +193,6 @@ def slug(text): 3. strip whitespace 4. replace other non-word characters with dashes 5. strip extra dashes - - This somewhat duplicates the :func:`Google.slugify` function but - slugify is not as generic as this one, which can be reused - elsewhere. """ return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-") @@ -666,19 +662,6 @@ def is_lyrics(self, text, artist=None): self.debug("Bad triggers detected: {}", bad_triggers_occ) return len(bad_triggers_occ) < 2 - def slugify(self, text): - """Normalize a string and remove non-alphanumeric characters.""" - text = re.sub(r"[-'_\s]", "_", text) - text = re.sub(r"_+", "_", text).strip("_") - pat = r"([^,\(]*)\((.*?)\)" # Remove content within parentheses - text = re.sub(pat, r"\g<1>", text).strip() - try: - text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore") - text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8"))) - except UnicodeDecodeError: - self.debug("Failed to normalize '{}'", text) - return text - BY_TRANS = ["by", "par", "de", "von"] LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"] @@ -686,12 +669,10 @@ def is_page_candidate(self, url_link, url_title, title, artist): """Return True if the URL title makes it a good candidate to be a page that contains lyrics of title by artist. """ - title = self.slugify(title.lower()) - artist = self.slugify(artist.lower()) - sitename = re.search( - "//([^/]+)/.*", self.slugify(url_link.lower()) - ).group(1) - url_title = self.slugify(url_title.lower()) + title = slug(title) + artist = re.escape(slug(artist)) + sitename = urlparse(url_link).netloc + url_title = slug(url_title) # Check if URL title contains song title (exact match) if url_title.find(title) != -1: @@ -700,14 +681,13 @@ def is_page_candidate(self, url_link, url_title, title, artist): # or try extracting song title from URL title and check if # they are close enough tokens = ( - [by + "_" + artist for by in self.BY_TRANS] + [by + "-" + artist for by in self.BY_TRANS] + [artist, sitename, sitename.replace("www.", "")] + self.LYRICS_TRANS ) - tokens = [re.escape(t) for t in tokens] song_title = re.sub("(%s)" % "|".join(tokens), "", url_title) - song_title = song_title.strip("_|") + song_title = song_title.strip("-|") typo_ratio = 0.9 ratio = difflib.SequenceMatcher(None, song_title, title).ratio() return ratio >= typo_ratio diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 0fc345e352..a93f271deb 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -14,13 +14,10 @@ """Tests for the 'lyrics' plugin.""" -import itertools import os import re -import unittest from functools import partial from http import HTTPStatus -from unittest.mock import patch import pytest @@ -43,11 +40,7 @@ ) -class LyricsPluginTest(unittest.TestCase): - def setUp(self): - """Set up configuration.""" - lyrics.LyricsPlugin() - +class TestLyricsUtils: def test_search_artist(self): item = Item(artist="Alice ft. Bob", title="song") assert ("Alice ft. Bob", ["song"]) in lyrics.search_pairs(item) @@ -159,6 +152,24 @@ def test_scrape_merge_paragraphs(self): text = "one

two

three" assert lyrics._scrape_merge_paragraphs(text) == "one\ntwo\nthree" + @pytest.mark.parametrize( + "text, expected", + [ + ("test", "test"), + ("Mørdag", "mordag"), + ("l'été c'est fait pour jouer", "l-ete-c-est-fait-pour-jouer"), + ("\xe7afe au lait (boisson)", "cafe-au-lait-boisson"), + ("Multiple spaces -- and symbols! -- merged", "multiple-spaces-and-symbols-merged"), # noqa: E501 + ("\u200bno-width-space", "no-width-space"), + ("El\u002dp", "el-p"), + ("\u200bblackbear", "blackbear"), + ("\u200d", ""), + ("\u2010", ""), + ], + ) # fmt: skip + def test_slug(self, text, expected): + assert lyrics.slug(text) == expected + @pytest.fixture(scope="module") def lyrics_root_dir(pytestconfig: pytest.Config): @@ -333,10 +344,6 @@ def test_is_page_candidate( def test_bad_lyrics(self, backend, lyrics): assert not backend.is_lyrics(lyrics) - def test_slugify(self, backend): - text = "http://site.com/\xe7afe-au_lait(boisson)" - assert backend.slugify(text) == "http://site.com/cafe_au_lait" - class TestGeniusLyrics(LyricsPluginBackendMixin): @pytest.fixture(scope="class") @@ -358,52 +365,6 @@ def test_scrape_genius_lyrics( assert len(result.splitlines()) == expected_line_count - @patch.object(lyrics.Genius, "scrape_lyrics") - @patch.object(lyrics.Backend, "fetch_text", return_value=True) - def test_json(self, mock_fetch_text, mock_scrape, backend): - """Ensure we're finding artist matches""" - with patch.object( - lyrics.Genius, - "_search", - return_value={ - "response": { - "hits": [ - { - "result": { - "primary_artist": { - "name": "\u200bblackbear", - }, - "url": "blackbear_url", - } - }, - { - "result": { - "primary_artist": {"name": "El\u002dp"}, - "url": "El-p_url", - } - }, - ] - } - }, - ) as mock_json: - # genius uses zero-width-spaces (\u200B) for lowercase - # artists so we make sure we can match those - assert backend.fetch("blackbear", "Idfc") is not None - mock_fetch_text.assert_called_once_with("blackbear_url") - mock_scrape.assert_called_once_with(True) - - # genius uses the hyphen minus (\u002D) as their dash - assert backend.fetch("El-p", "Idfc") is not None - mock_fetch_text.assert_called_with("El-p_url") - mock_scrape.assert_called_with(True) - - # test no matching artist - assert backend.fetch("doesntexist", "none") is None - - # test invalid json - mock_json.return_value = {"response": {"hits": []}} - assert backend.fetch("blackbear", "Idfc") is None - class TestTekstowoLyrics(LyricsPluginBackendMixin): @pytest.fixture(scope="class") @@ -519,31 +480,3 @@ def test_pick_lyrics_match(self, fetch_lyrics, expected_lyrics): ) def test_synced_config_option(self, fetch_lyrics, expected_lyrics): assert fetch_lyrics() == expected_lyrics - - -class SlugTests(unittest.TestCase): - def test_slug(self): - # plain ascii passthrough - text = "test" - assert lyrics.slug(text) == "test" - - # german unicode and capitals - text = "Mørdag" - assert lyrics.slug(text) == "mordag" - - # more accents and quotes - text = "l'été c'est fait pour jouer" - assert lyrics.slug(text) == "l-ete-c-est-fait-pour-jouer" - - # accents, parens and spaces - text = "\xe7afe au lait (boisson)" - assert lyrics.slug(text) == "cafe-au-lait-boisson" - text = "Multiple spaces -- and symbols! -- merged" - assert lyrics.slug(text) == "multiple-spaces-and-symbols-merged" - text = "\u200bno-width-space" - assert lyrics.slug(text) == "no-width-space" - - # variations of dashes should get standardized - dashes = ["\u200d", "\u2010"] - for dash1, dash2 in itertools.combinations(dashes, 2): - assert lyrics.slug(dash1) == lyrics.slug(dash2)