16
16
17
17
from __future__ import annotations
18
18
19
- import difflib
20
19
import errno
21
20
import itertools
22
21
import json
22
+ import math
23
23
import os .path
24
24
import re
25
25
import struct
30
30
from functools import cached_property , partial , total_ordering
31
31
from http import HTTPStatus
32
32
from typing import TYPE_CHECKING , ClassVar , Iterable , Iterator
33
- from urllib .parse import quote , urlencode
33
+ from urllib .parse import quote , urlencode , urlparse
34
34
35
35
import requests
36
36
from typing_extensions import TypedDict
37
37
from unidecode import unidecode
38
38
39
39
import beets
40
40
from beets import plugins , ui
41
+ from beets .autotag .hooks import string_dist
41
42
42
43
if TYPE_CHECKING :
43
44
from beets .importer import ImportTask
58
59
except ImportError :
59
60
HAS_LANGDETECT = False
60
61
62
+
61
63
DIV_RE = re .compile (r"<(/?)div>?" , re .I )
62
64
COMMENT_RE = re .compile (r"<!--.*-->" , re .S )
63
65
TAG_RE = re .compile (r"<[^>]*>" )
@@ -485,15 +487,47 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
485
487
return lyrics
486
488
487
489
488
- class Genius (Backend ):
490
class SearchBackend(Backend):
    """Base class for backends that select lyrics pages from search hits.

    Supplies the shared acceptance test used by subclasses: a candidate is
    kept only when both its artist and its title are close enough to the
    requested ones, as measured by ``string_dist`` against the configured
    ``dist_thresh``.
    """

    # NOTE(review): presumably signals that the backend needs BeautifulSoup;
    # confirm against the code that reads this flag.
    REQUIRES_BS = True

    @cached_property
    def dist_thresh(self) -> float:
        """Return the ``dist_thresh`` config value as a float.

        The value is looked up once per instance and then cached.
        """
        return self.config["dist_thresh"].get(float)

    def check_match(
        self, target_artist: str, target_title: str, artist: str, title: str
    ) -> bool:
        """Tell whether a candidate is a 'good enough' match for the target.

        The artist distance and the title distance are computed separately
        and the worse (larger) of the two, rounded to two decimals, decides
        the outcome.

        :param target_artist: artist that was searched for.
        :param target_title: title that was searched for.
        :param artist: artist reported by the candidate.
        :param title: title reported by the candidate.
        :returns: ``True`` when the rounded worst distance does not exceed
            ``dist_thresh``, ``False`` otherwise.
        """
        artist_dist = string_dist(target_artist, artist)
        title_dist = string_dist(target_title, title)
        worst_dist = round(max(artist_dist, title_dist), 2)

        threshold = self.dist_thresh
        if worst_dist <= threshold:
            return True

        # Near misses are logged: they may be a correct candidate whose name
        # merely carries some extra noise, which helps when tuning the
        # threshold.
        if math.isclose(worst_dist, threshold, abs_tol=0.4):
            self._log.debug(
                "({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
                artist,
                title,
                target_artist,
                target_title,
                worst_dist,
            )

        return False
522
+
523
+
524
+ class Genius (SearchBackend ):
489
525
"""Fetch lyrics from Genius via genius-api.
490
526
491
527
Simply adapted from
492
528
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
493
529
"""
494
530
495
- REQUIRES_BS = True
496
-
497
531
base_url = "https://api.genius.com"
498
532
499
533
def __init__ (self , config , log ):
@@ -516,19 +550,15 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
516
550
self ._log .debug ("Genius API request returned invalid JSON" )
517
551
return None
518
552
519
- # find a matching artist in the json
553
+ check = partial ( self . check_match , artist , title )
520
554
for hit in json ["response" ]["hits" ]:
521
- hit_artist = hit ["result" ]["primary_artist" ]["name" ]
522
-
523
- if slug (hit_artist ) == slug (artist ):
524
- html = self .fetch_url (hit ["result" ]["url" ])
555
+ result = hit ["result" ]
556
+ if check (result ["primary_artist" ]["name" ], result ["title" ]):
557
+ html = self .fetch_url (result ["url" ])
525
558
if not html :
526
559
return None
527
560
return self ._scrape_lyrics_from_html (html )
528
561
529
- self ._log .debug (
530
- "Genius failed to find a matching artist for '{0}'" , artist
531
- )
532
562
return None
533
563
534
564
def _search (self , artist , title ):
@@ -724,10 +754,9 @@ def is_text_notcode(text):
724
754
return None
725
755
726
756
727
- class Google (Backend ):
757
+ class Google (SearchBackend ):
728
758
"""Fetch lyrics from Google search results."""
729
759
730
- REQUIRES_BS = True
731
760
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
732
761
733
762
def is_lyrics (self , text , artist = None ):
@@ -775,21 +804,20 @@ def slugify(self, text):
775
804
BY_TRANS = ["by" , "par" , "de" , "von" ]
776
805
LYRICS_TRANS = ["lyrics" , "paroles" , "letras" , "liedtexte" ]
777
806
778
- def is_page_candidate (self , url_link , url_title , title , artist ):
807
+ def is_page_candidate (
808
+ self , artist : str , title : str , url_link : str , url_title : str
809
+ ) -> bool :
779
810
"""Return True if the URL title makes it a good candidate to be a
780
811
page that contains lyrics of title by artist.
781
812
"""
782
- title = self .slugify (title .lower ())
783
- artist = self .slugify (artist .lower ())
784
- sitename = re .search (
785
- "//([^/]+)/.*" , self .slugify (url_link .lower ())
786
- ).group (1 )
787
- url_title = self .slugify (url_title .lower ())
788
-
789
- # Check if URL title contains song title (exact match)
790
- if url_title .find (title ) != - 1 :
813
+ title_slug = self .slugify (title .lower ())
814
+ url_title_slug = self .slugify (url_title .lower ())
815
+ if title_slug in url_title_slug :
791
816
return True
792
817
818
+ artist = self .slugify (artist .lower ())
819
+ sitename = urlparse (url_link ).netloc
820
+
793
821
# or try extracting song title from URL title and check if
794
822
# they are close enough
795
823
tokens = (
@@ -798,12 +826,9 @@ def is_page_candidate(self, url_link, url_title, title, artist):
798
826
+ self .LYRICS_TRANS
799
827
)
800
828
tokens = [re .escape (t ) for t in tokens ]
801
- song_title = re .sub ("(%s)" % "|" .join (tokens ), "" , url_title )
829
+ song_title = re .sub ("(%s)" % "|" .join (tokens ), "" , url_title_slug )
802
830
803
- song_title = song_title .strip ("_|" )
804
- typo_ratio = 0.9
805
- ratio = difflib .SequenceMatcher (None , song_title , title ).ratio ()
806
- return ratio >= typo_ratio
831
+ return self .check_match (artist , title_slug , artist , song_title )
807
832
808
833
def fetch (self , artist : str , title : str , * _ ) -> str | None :
809
834
params = {
@@ -825,24 +850,21 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
825
850
self ._log .debug ("google backend error: {0}" , reason )
826
851
return None
827
852
828
- if "items" in data .keys ():
829
- for item in data ["items" ]:
830
- url_link = item ["link" ]
831
- url_title = item .get ("title" , "" )
832
- if not self .is_page_candidate (
833
- url_link , url_title , title , artist
834
- ):
835
- continue
836
- html = self .fetch_url (url_link )
837
- if not html :
838
- continue
839
- lyrics = scrape_lyrics_from_html (html )
840
- if not lyrics :
841
- continue
842
-
843
- if self .is_lyrics (lyrics , artist ):
844
- self ._log .debug ("got lyrics from {0}" , item ["displayLink" ])
845
- return lyrics
853
+ check_candidate = partial (self .is_page_candidate , artist , title )
854
+ for item in data .get ("items" , []):
855
+ url_link = item ["link" ]
856
+ if not check_candidate (url_link , item .get ("title" , "" )):
857
+ continue
858
+ html = self .fetch_url (url_link )
859
+ if not html :
860
+ continue
861
+ lyrics = scrape_lyrics_from_html (html )
862
+ if not lyrics :
863
+ continue
864
+
865
+ if self .is_lyrics (lyrics , artist ):
866
+ self ._log .debug ("got lyrics from {0}" , item ["displayLink" ])
867
+ return lyrics
846
868
847
869
return None
848
870
@@ -866,6 +888,7 @@ def __init__(self):
866
888
"bing_client_secret" : None ,
867
889
"bing_lang_from" : [],
868
890
"bing_lang_to" : None ,
891
+ "dist_thresh" : 0.11 ,
869
892
"google_API_key" : None ,
870
893
"google_engine_ID" : "009217259823014548361:lndtuqkycfu" ,
871
894
"genius_api_key" : "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
@@ -877,7 +900,6 @@ def __init__(self):
877
900
# Musixmatch is disabled by default as they are currently blocking
878
901
# requests with the beets user agent.
879
902
"sources" : [s for s in self .SOURCES if s != "musixmatch" ],
880
- "dist_thresh" : 0.1 ,
881
903
}
882
904
)
883
905
self .config ["bing_client_secret" ].redact = True
0 commit comments