@@ -4,34 +4,6 @@ use std::str::Chars;
 use regex::Regex;
 use url::Url;
 
-/// Defines how URL normalization will work. This struct offers reasonable defaults, as well as a fluent interface for building normalization.
-///
-/// Construct an empty [`Options`] object and provide a query parameter:
-///
-/// ```
-/// # use urlnorm::*;
-/// let options = Options::new().with_ignored_query_params(["fbclid"]);
-/// ```
-///
-/// Construct a default [`Options`] object and modify the query parameters:
-///
-/// ```
-/// # use urlnorm::*;
-/// let options = Options::default().with_ignored_query_params(["fbclid"]);
-/// ```
-pub struct Options {
-    /// Query parameters to ignore. These are wrapped in the regular expression beginning and end-of-string markers (i.e., `^...$`).
-    pub ignored_query_params: Vec<String>,
-    /// Host prefixes to trim. These match only at the start of the URL's host, and repeated matches will be removed.
-    pub trimmed_host_prefixes: Vec<String>,
-    /// Path extensions to trim. These match only at the end of the path, and an end-of-string marker (`$`) is added to the patterns
-    /// automatically.
-    pub trimmed_path_extension_suffixes: Vec<String>,
-    /// Specifies the maximum length of a path extension to remove. Some paths may contain periods that signify identity or have some
-    /// other meaning than marking a file extension.
-    pub path_extension_length: usize,
-}
-
 /// Default query parameters that are ignored.
 const DEFAULT_IGNORED_QUERY_PARAMS: [&str; 15] = [
     "utm_source",
@@ -64,6 +36,53 @@ const DEFAULT_WWW_PREFIX: &str = r#"(?x)
 /// By default, trim extensions that look like .html, .html5, etc.
 const DEFAULT_EXTENSION_SUFFIX: &str = "[a-zA-Z]+[0-9]?$";
 
+/// Defines how URL normalization will work. This struct offers reasonable defaults, as well as a fluent interface for building normalization.
+///
+/// Construct an empty [`Options`] object and provide a query parameter:
+///
+/// ```
+/// # use urlnorm::*;
+/// let options = Options::new().with_ignored_query_params(["fbclid"]);
+/// ```
+///
+/// Construct a default [`Options`] object and modify the query parameters:
+///
+/// ```
+/// # use urlnorm::*;
+/// let options = Options::default().with_ignored_query_params(["fbclid"]);
+/// ```
+///
+/// Once you've constructed the [`Options`] object, you can [`Options::compile`] it
+/// to a [`UrlNormalizer`]. This may fail if the regular expressions fail to compile.
+///
+/// ```
+/// # use urlnorm::*;
+/// let options: Options = Options::default().with_ignored_query_params(["fbclid"]);
+/// let normalizer: UrlNormalizer = options.compile().expect("Failed to compile");
+/// ```
+///
+/// In most cases, however, you'll want to just use [`UrlNormalizer::default()`] and can skip [`Options`] entirely. The
+/// default [`UrlNormalizer`] is also infallible:
+///
+/// ```
+/// # use url::Url;
+/// # use urlnorm::*;
+/// let normalizer = UrlNormalizer::default();
+/// let s = normalizer.compute_normalization_string(&Url::parse("http://google.com").unwrap());
+/// ```
+pub struct Options {
+    /// Query parameters to ignore. These are wrapped in the regular expression beginning and end-of-string markers (i.e., `^...$`).
+    pub ignored_query_params: Vec<String>,
+    /// Host prefixes to trim. These match only at the start of the URL's host, and repeated matches will be removed.
+    pub trimmed_host_prefixes: Vec<String>,
+    /// Path extensions to trim. These match only at the end of the path, and an end-of-string marker (`$`) is added to the patterns
+    /// automatically.
+    pub trimmed_path_extension_suffixes: Vec<String>,
+    /// Specifies the maximum length of a path extension to remove. Some paths may contain periods that signify identity or have some
+    /// other meaning than marking a file extension.
+    pub path_extension_length: usize,
+}
+
 impl Default for Options {
     fn default() -> Self {
         let new = Self::new();
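The hunk above relocates `Options` below the default constants and extends its docs with the `compile()`-to-`UrlNormalizer` flow. The sketch below is a non-authoritative illustration of that fluent API as documented in this commit; the specific patterns and the extension length passed to the `with_*` methods are assumptions for demonstration, not the crate's defaults.

```rust
// A minimal sketch of the fluent Options API documented in this commit.
// The regex patterns and extension length below are illustrative assumptions.
use url::Url;
use urlnorm::{Options, UrlNormalizer};

fn main() {
    let normalizer: UrlNormalizer = Options::default()
        // Per the new doc comments, each with_* call *replaces* the
        // corresponding list rather than appending to it.
        .with_ignored_query_params(["fbclid", "gclid"])
        .with_trimmed_host_prefixes([r"www\d*\."])
        .with_trimmed_path_extension_suffixes(["html?[0-9]?"])
        .with_path_extension_length(5)
        .compile()
        .expect("Failed to compile");

    // Compute the normalization string for a parsed URL, as in the doc example above.
    let url = Url::parse("http://www.example.com/page.html?fbclid=abc").unwrap();
    let _normalized = normalizer.compute_normalization_string(&url);
}
```

Because each `with_*` call replaces its list outright (per the new doc comments), any defaults you still want have to be passed explicitly.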
@@ -125,6 +144,7 @@ impl Options {
         })
     }
 
+    /// Replaces the ignored query parameters.
     pub fn with_ignored_query_params<S: AsRef<str>, I: IntoIterator<Item = S>>(
         mut self,
         iter: I,
@@ -133,6 +153,7 @@ impl Options {
         self
     }
 
+    /// Replaces the trimmed host prefixes.
     pub fn with_trimmed_host_prefixes<S: AsRef<str>, I: IntoIterator<Item = S>>(
         mut self,
         iter: I,
@@ -141,6 +162,7 @@ impl Options {
         self
     }
 
+    /// Replaces the trimmed path extensions.
     pub fn with_trimmed_path_extension_suffixes<S: AsRef<str>, I: IntoIterator<Item = S>>(
         mut self,
         iter: I,
@@ -150,6 +172,7 @@ impl Options {
         self
     }
 
+    /// Replaces the path extension length.
     pub fn with_path_extension_length(mut self, path_extension_length: usize) -> Self {
         self.path_extension_length = path_extension_length;
         self
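The test hunk that follows adds two negative cases: a query parameter that merely contains an ignored parameter's name, and a path whose suffix is too long to be trimmed as an extension. Below is a rough sketch of what those cases imply, assuming `compute_normalization_string` returns an ordinary `String` that can be compared for equality; the crate's actual test harness is not shown here.

```rust
// Hedged sketch of the new negative cases: URLs that must NOT produce the
// same normalization string. Assumes a String return value for comparison.
use url::Url;
use urlnorm::UrlNormalizer;

fn main() {
    let normalizer = UrlNormalizer::default();
    let norm = |s: &str| normalizer.compute_normalization_string(&Url::parse(s).unwrap());

    // `xfbclid` only contains an ignored parameter's name; since the ignored
    // query parameter patterns are anchored with ^...$, it is not stripped,
    // and the two URLs stay distinct.
    assert_ne!(
        norm("http://x.com?xfbclid=foo"),
        norm("http://x.com?xfbclid=basdf")
    );

    // The over-long ".html12345" suffix is not trimmed as a file extension
    // (presumably it exceeds path_extension_length), so these stay distinct too.
    assert_ne!(
        norm("http://x.com/file.html12345"),
        norm("http://x.com/file.html12346")
    );
}
```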
@@ -466,8 +489,10 @@ mod test {
     #[case("https://google.com/?page=1", "https://google.com/?page=2")]
     #[case("https://google.com/?page=%31", "https://google.com/?page=%32")]
     #[case("https://amazon.com/product/ref=a", "https://amazon.com/product/ref=b")]
-    // Slightly modified query string param
+    // Negative case: slightly modified query string param
     #[case("http://x.com?xfbclid=foo", "http://x.com?xfbclid=basdf")]
+    // Negative case: long extension
+    #[case("http://x.com/file.html12345", "http://x.com/file.html12346")]
     // Examples of real URLs that should not be normalized together
     #[case("http://arxiv.org/abs/1405.0126", "http://arxiv.org/abs/1405.0351")]
     #[case(