Skip to content

Commit

Permalink
🕸 remove infinite redirect
Browse files Browse the repository at this point in the history
  • Loading branch information
Joshix-1 committed Sep 30, 2023
1 parent 78cc988 commit 9af3635
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 40 deletions.
27 changes: 27 additions & 0 deletions an_website/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
from tornado.httpserver import HTTPServer
from tornado.log import LogFormatter
from tornado.web import Application, RedirectHandler
from typed_stream import Stream

from . import (
DIR,
Expand Down Expand Up @@ -342,6 +343,31 @@ def ignore_modules(config: BetterConfigParser) -> None:
)


def get_normed_paths_from_module_infos(
    module_infos: Iterable[ModuleInfo],
) -> tuple[str, ...]:
    """Get all normalized paths from the module infos.

    Collects every path, alias and sub-page path of every module info,
    keeps only absolute paths (starting with "/"), strips the slashes,
    lower-cases them and removes duplicates while preserving the order
    of first occurrence.

    Paths that are a single character after stripping (e.g. "/z") are
    deliberately excluded, so they never become redirect targets.

    Returns a tuple of normalized paths without leading or trailing "/".
    """
    # dict keys preserve insertion order and deduplicate in one step
    normed: dict[str, None] = {}
    for info in module_infos:
        candidates: list[str | None] = [info.path, *info.aliases]
        candidates.extend(page.path for page in info.sub_pages)
        for candidate in candidates:
            # skip missing paths and non-absolute paths (e.g. "relative")
            if not candidate or not candidate.startswith("/"):
                continue
            stripped = candidate.strip("/")
            if len(stripped) > 1:  # ignore "/" and one-char paths like "/z"
                normed.setdefault(stripped.lower(), None)
    return tuple(normed)


def make_app(config: ConfigParser) -> str | Application:
"""Create the Tornado application and return it."""
module_infos, duration = time_function(get_module_infos)
Expand All @@ -356,6 +382,7 @@ def make_app(config: ConfigParser) -> str | Application:
return Application(
handlers, # type: ignore[arg-type]
MODULE_INFOS=module_infos,
NORMED_PATHS=get_normed_paths_from_module_infos(module_infos),
HANDLERS=handlers,
# General settings
autoreload=False,
Expand Down
14 changes: 10 additions & 4 deletions an_website/utils/base_request_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from datetime import date, datetime, timedelta, timezone, tzinfo
from functools import cached_property, partial, reduce
from typing import TYPE_CHECKING, Any, ClassVar, Final, cast
from urllib.parse import SplitResult, quote, urlsplit, urlunsplit
from urllib.parse import SplitResult, urlsplit, urlunsplit
from zoneinfo import ZoneInfo

import elasticapm # type: ignore[import]
Expand Down Expand Up @@ -299,8 +299,14 @@ def fix_url(
if isinstance(url, str):
url = urlsplit(url)
if url.netloc and url.netloc.lower() != self.request.host.lower():
url = urlsplit(f"/redirect?to={quote(url.geturl())}")
path = url.path if new_path is None else new_path # the path of the url
path = "/redirect"
query_args["to"] = url.geturl()
url = urlsplit(self.request.full_url())
else:
path = url.path if new_path is None else new_path
path = f"/{path.strip('/')}".lower()
if path == "/lolwut":
path = path.upper()
if path.startswith("/soundboard/files/") or path in FILE_HASHES_DICT:
query_args.update(
{key: None for key in self.user_settings.iter_option_names()}
Expand All @@ -322,7 +328,7 @@ def fix_url(
(
self.request.protocol,
self.request.host,
path.rstrip("/"),
"" if path == "/" else path,
url.query,
url.fragment,
)
Expand Down
41 changes: 5 additions & 36 deletions an_website/utils/request_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from .base_request_handler import BaseRequestHandler
from .utils import (
SUS_PATHS,
normalized_levenshtein,
get_close_matches,
remove_suffix_ignore_case,
replace_umlauts,
)
Expand Down Expand Up @@ -110,41 +110,10 @@ async def prepare(self) -> None:
if len(this_path_normalized) == 1:
return self.redirect(self.fix_url(new_path="/"))

distances: list[tuple[float, str]] = []
max_dist = 0.5

for module_info in self.get_module_infos():
if module_info.path is not None:
dist = min( # get the smallest distance with the aliases
normalized_levenshtein(
this_path_normalized, path.strip("/").lower()
)
for path in (*module_info.aliases, module_info.path)
if path != "/z" # do not redirect to /z
)
if dist <= max_dist:
# only if the distance is less than or equal {max_dist}
distances.append((dist, module_info.path))
if len(module_info.sub_pages) > 0:
distances.extend(
(
normalized_levenshtein(
this_path_normalized,
sub_page.path.strip("/").lower(),
),
sub_page.path,
)
for sub_page in module_info.sub_pages
if sub_page.path is not None
)

if len(distances) > 0:
# sort to get the one with the smallest distance in index 0
distances.sort()
dist, path = distances[0] # pylint: disable=redefined-outer-name
# redirect only if the distance is less than or equal {max_dist}
if dist <= max_dist:
return self.redirect(self.fix_url(new_path=path), False)
paths: tuple[str, ...] = self.settings.get("NORMED_PATHS") or ()
matches = get_close_matches(this_path_normalized, paths, count=1)
if matches:
return self.redirect(self.fix_url(new_path=matches[0]), False)

self.set_status(404)
self.write_error(404)
Expand Down
36 changes: 36 additions & 0 deletions an_website/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import argparse
import asyncio
import contextlib
import heapq
import logging
import os
import pathlib
Expand Down Expand Up @@ -577,6 +578,41 @@ def normalized_levenshtein(string1: str, string2: str) -> float:
return float(distance(string1, string2)) / max(len(string1), len(string2))


def get_close_matches(  # based on difflib.get_close_matches
    word: str,
    possibilities: Iterable[str],
    count: int = 3,
    cutoff: float = 0.5,
) -> tuple[str, ...]:
    """Use normalized_levenshtein to return the best "good enough" matches.

    word is the string for which close matches are desired.
    possibilities is an iterable of strings against which to match word.
    Optional arg count (default 3) is the maximum number of close matches
    to return; count must be > 0.
    Optional arg cutoff (default 0.5) is a float in [0, 1]. Because
    normalized_levenshtein is a distance (0 == identical, 1 == completely
    different), possibilities whose distance to word is greater than
    cutoff are ignored.
    The best (no more than count) matches among the possibilities are
    returned in a tuple, sorted by distance, most similar first.
    """
    if count <= 0:
        raise ValueError(f"count must be > 0: {count}")
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError(f"cutoff must be in [0.0, 1.0]: {cutoff}")
    # (distance, possibility) pairs; smaller distance == more similar
    scored: list[tuple[float, str]] = [
        (distance_, possibility)
        for possibility in possibilities
        if (distance_ := normalized_levenshtein(possibility, word)) <= cutoff
    ]
    # Strip distances from the best count matches
    return tuple(match for _, match in heapq.nsmallest(count, scored))


def parse_bumpscosity(value: str | int | None) -> BumpscosityValue:
"""Parse a string to a valid bumpscosity value."""
if isinstance(value, str):
Expand Down
6 changes: 6 additions & 0 deletions tests/test_request_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@ async def test_json_apis(fetch: FetchCallable) -> None: # noqa: F811
async def test_not_found_handler(fetch: FetchCallable) -> None: # noqa: F811
"""Check if the NotFoundHandler works."""
assert_valid_html_response(await fetch("/qwertzuiop"), {404})
assert_valid_html_response(
await fetch(
"/https:/github.com/asozialesnetzwerk/vertauschtewoerterplugin"
),
{404},
)

await assert_valid_redirect(fetch, "/services.html", "/services", {308})
await assert_valid_redirect(fetch, "/services/", "/services", {308})
Expand Down

0 comments on commit 9af3635

Please sign in to comment.