@@ -4,34 +4,6 @@ use std::str::Chars;
44use regex:: Regex ;
55use url:: Url ;
66
7- /// Defines how URL normalization will work. This struct offers reasonable defaults, as well as a fluent interface for building normalization.
8- ///
9- /// Construct an empty [`Options`] object and provide a query parameter:
10- ///
11- /// ```
12- /// # use urlnorm::*;
13- /// let options = Options::new().with_ignored_query_params(["fbclid"]);
14- /// ```
15- ///
16- /// Construct a default [`Options`] object and modify the query parameters:
17- ///
18- /// ```
19- /// # use urlnorm::*;
20- /// let options = Options::default().with_ignored_query_params(["fbclid"]);
21- /// ```
22- pub struct Options {
23- /// Query parameters to ignore. These are wrapped in the regular expression beginning and end-of-string markers (ie: `^...$`).
24- pub ignored_query_params : Vec < String > ,
25- /// Host prefixes to trim. These match only at the start of the URL's host, and repeated matches will be removed.
26- pub trimmed_host_prefixes : Vec < String > ,
27- /// Path extensions to trim. These match only at the end of the path, and an end-of-string marker (`$`) is added to the patterns
28- /// automatically.
29- pub trimmed_path_extension_suffixes : Vec < String > ,
30- /// Specifies the maximum length of a path extension to remove. Some paths may contain periods that signify identify or have some
31- /// other meaning than marking a file extension.
32- pub path_extension_length : usize ,
33- }
34-
357/// Default query parameters that are ignored.
368const DEFAULT_IGNORED_QUERY_PARAMS : [ & str ; 15 ] = [
379 "utm_source" ,
@@ -64,6 +36,53 @@ const DEFAULT_WWW_PREFIX: &str = r#"(?x)
6436/// By default, trim extensions that look like .html, .html5, etc.
6537const DEFAULT_EXTENSION_SUFFIX : & str = "[a-zA-Z]+[0-9]?$" ;
6638
39+ /// Defines how URL normalization will work. This struct offers reasonable defaults, as well as a fluent interface for building normalization.
40+ ///
41+ /// Construct an empty [`Options`] object and provide a query parameter:
42+ ///
43+ /// ```
44+ /// # use urlnorm::*;
45+ /// let options = Options::new().with_ignored_query_params(["fbclid"]);
46+ /// ```
47+ ///
48+ /// Construct a default [`Options`] object and modify the query parameters:
49+ ///
50+ /// ```
51+ /// # use urlnorm::*;
52+ /// let options = Options::default().with_ignored_query_params(["fbclid"]);
53+ /// ```
54+ ///
55+ /// And once you've constructed the [`Options`] object, you can [`Options::compile`] it
56+ /// to a [`UrlNormalizer`]. This may fail if the regular expressions fail to compile.
57+ ///
58+ /// ```
59+ /// # use urlnorm::*;
60+ /// let options: Options = Options::default().with_ignored_query_params(["fbclid"]);
61+ /// let normalizer: UrlNormalizer = options.compile().expect("Failed to compile");
62+ /// ```
63+ ///
64+ /// In most cases, however, you'll want to just use [`UrlNormalizer::default()`] and can skip [`Options`] entirely. The
65+ /// default [`UrlNormalizer`] is also infallible:
66+ ///
67+ /// ```
68+ /// # use url::Url;
69+ /// # use urlnorm::*;
70+ /// let normalizer = UrlNormalizer::default();
71+ /// let s = normalizer.compute_normalization_string(&Url::parse("http://google.com").unwrap());
72+ /// ```
73+ pub struct Options {
74+ /// Query parameters to ignore. These are wrapped in the regular expression beginning and end-of-string markers (i.e. `^...$`).
75+ pub ignored_query_params : Vec < String > ,
76+ /// Host prefixes to trim. These match only at the start of the URL's host, and repeated matches will be removed.
77+ pub trimmed_host_prefixes : Vec < String > ,
78+ /// Path extensions to trim. These match only at the end of the path, and an end-of-string marker (`$`) is added to the patterns
79+ /// automatically.
80+ pub trimmed_path_extension_suffixes : Vec < String > ,
81+ /// Specifies the maximum length of a path extension to remove. Some paths may contain periods that signify identity or have some
82+ /// other meaning than marking a file extension.
83+ pub path_extension_length : usize ,
84+ }
85+
6786impl Default for Options {
6887 fn default ( ) -> Self {
6988 let new = Self :: new ( ) ;
@@ -125,6 +144,7 @@ impl Options {
125144 } )
126145 }
127146
147+ /// Replaces the ignored query parameters.
128148 pub fn with_ignored_query_params < S : AsRef < str > , I : IntoIterator < Item = S > > (
129149 mut self ,
130150 iter : I ,
@@ -133,6 +153,7 @@ impl Options {
133153 self
134154 }
135155
156+ /// Replaces the trimmed host prefixes.
136157 pub fn with_trimmed_host_prefixes < S : AsRef < str > , I : IntoIterator < Item = S > > (
137158 mut self ,
138159 iter : I ,
@@ -141,6 +162,7 @@ impl Options {
141162 self
142163 }
143164
165+ /// Replaces the trimmed path extensions.
144166 pub fn with_trimmed_path_extension_suffixes < S : AsRef < str > , I : IntoIterator < Item = S > > (
145167 mut self ,
146168 iter : I ,
@@ -150,6 +172,7 @@ impl Options {
150172 self
151173 }
152174
175+ /// Replaces the path extension length.
153176 pub fn with_path_extension_length ( mut self , path_extension_length : usize ) -> Self {
154177 self . path_extension_length = path_extension_length;
155178 self
@@ -466,8 +489,10 @@ mod test {
466489 #[ case( "https://google.com/?page=1" , "https://google.com/?page=2" ) ]
467490 #[ case( "https://google.com/?page=%31" , "https://google.com/?page=%32" ) ]
468491 #[ case( "https://amazon.com/product/ref=a" , "https://amazon.com/product/ref=b" ) ]
469- // Slightly modified query string param
492+ // Negative case: slightly modified query string param
470493 #[ case( "http://x.com?xfbclid=foo" , "http://x.com?xfbclid=basdf" ) ]
494+ // Negative case: long extension
495+ #[ case( "http://x.com/file.html12345" , "http://x.com/file.html12346" ) ]
471496 // Examples of real URLs that should not be normalized together
472497 #[ case( "http://arxiv.org/abs/1405.0126" , "http://arxiv.org/abs/1405.0351" ) ]
473498 #[ case(
0 commit comments