Skip to content

Commit 07214c2

Browse files
authored
Version 1.25.0 (#112)
1 parent adfdb7b commit 07214c2

File tree

4 files changed

+173
-23
lines changed

4 files changed

+173
-23
lines changed

README.rst

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
ada-url
22
========
33

4-
The `urlib.parse` module in Python does not follow the legacy RFC 3978 standard nor
5-
does it follow the newer WHATWG URL specification. It is also relatively slow.
6-
74
This is ``ada_url``, a fast standard-compliant Python library for working with URLs based on the ``Ada`` URL
85
parser.
96

@@ -27,7 +24,7 @@ Parsing URLs
2724
^^^^^^^^^^^^
2825

2926
The ``URL`` class is intended to match the one described in the
30-
`WHATWG URL spec <https://url.spec.whatwg.org/#url-class>`_:.
27+
`WHATWG URL spec <https://url.spec.whatwg.org/#url-class>`_.
3128

3229
.. code-block:: python
3330
@@ -127,7 +124,8 @@ that it properly encodes IDNs and resolves paths:
127124
>>> parsed_url.pathname
128125
'/path2/'
129126
130-
Contrast that with the Python standard library's ``urlib.parse`` module:
127+
Contrast that with the Python standard library's ``urllib.parse`` module, which loosely
128+
follows the older `RFC 3986 <https://datatracker.ietf.org/doc/html/rfc3986>`__ standard:
131129

132130
.. code-block:: python
133131
@@ -138,11 +136,13 @@ Contrast that with the Python standard library's ``urlib.parse`` module:
138136
>>> parsed_url.path
139137
'/./path/../path2/'
140138
141-
Alternative Python bindings
142-
---------------------------
139+
Performance
140+
-----------
143141

144142
This package uses `CFFI <https://cffi.readthedocs.io/en/latest/>`__ to call
145-
the ``Ada`` library's functions, which has a performance cost.
146-
The alternative `can_ada <https://github.com/tktech/can_ada>`__ (Canadian Ada)
147-
package uses `pybind11 <https://pybind11.readthedocs.io/en/stable/>`__ to generate a
148-
Python extension module, which is more performant.
143+
the ``Ada`` C library's functions, which makes it faster than the Python standard
144+
library's ``urllib.parse`` module for most applications.
145+
146+
An alternative package, `can_ada <https://github.com/tktech/can_ada>`__, uses
147+
`pybind11 <https://pybind11.readthedocs.io/en/stable/>`__ to interact with the ``Ada``
148+
C++ library functions, which is even faster.

ada_url/ada.cpp

Lines changed: 140 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* auto-generated on 2025-06-30 19:51:09 -0400. Do not edit! */
1+
/* auto-generated on 2025-07-16 22:15:14 -0400. Do not edit! */
22
/* begin file src/ada.cpp */
33
#include "ada.h"
44
/* begin file src/checkers.cpp */
@@ -67,7 +67,8 @@ static constexpr std::array<uint8_t, 256> path_signature_table =
6767
std::array<uint8_t, 256> result{};
6868
for (size_t i = 0; i < 256; i++) {
6969
if (i <= 0x20 || i == 0x22 || i == 0x23 || i == 0x3c || i == 0x3e ||
70-
i == 0x3f || i == 0x60 || i == 0x7b || i == 0x7d || i > 0x7e) {
70+
i == 0x3f || i == 0x5e || i == 0x60 || i == 0x7b || i == 0x7d ||
71+
i > 0x7e) {
7172
result[i] = 1;
7273
} else if (i == 0x25) {
7374
result[i] = 8;
@@ -10444,6 +10445,8 @@ ADA_POP_DISABLE_WARNINGS
1044410445
#include <arm_neon.h>
1044510446
#elif ADA_SSE2
1044610447
#include <emmintrin.h>
10448+
#elif ADA_LSX
10449+
#include <lsxintrin.h>
1044710450
#endif
1044810451

1044910452
#include <ranges>
@@ -10552,6 +10555,38 @@ ada_really_inline bool has_tabs_or_newline(
1055210555
}
1055310556
return _mm_movemask_epi8(running) != 0;
1055410557
}
10558+
#elif ADA_LSX
10559+
ada_really_inline bool has_tabs_or_newline(
10560+
std::string_view user_input) noexcept {
10561+
// first check for short strings in which case we do it naively.
10562+
if (user_input.size() < 16) { // slow path
10563+
return std::ranges::any_of(user_input, is_tabs_or_newline);
10564+
}
10565+
// fast path for long strings (expected to be common)
10566+
size_t i = 0;
10567+
const __m128i mask1 = __lsx_vrepli_b('\r');
10568+
const __m128i mask2 = __lsx_vrepli_b('\n');
10569+
const __m128i mask3 = __lsx_vrepli_b('\t');
10570+
// OR together the byte-wise matches for '\r', '\n' and '\t' from every 16-byte block.
10571+
__m128i running{0};
10572+
for (; i + 15 < user_input.size(); i += 16) {
10573+
__m128i word = __lsx_vld((const __m128i*)(user_input.data() + i), 0);
10574+
running = __lsx_vor_v(
10575+
__lsx_vor_v(running, __lsx_vor_v(__lsx_vseq_b(word, mask1),
10576+
__lsx_vseq_b(word, mask2))),
10577+
__lsx_vseq_b(word, mask3));
10578+
}
10579+
if (i < user_input.size()) {
10580+
__m128i word = __lsx_vld(
10581+
(const __m128i*)(user_input.data() + user_input.length() - 16), 0);
10582+
running = __lsx_vor_v(
10583+
__lsx_vor_v(running, __lsx_vor_v(__lsx_vseq_b(word, mask1),
10584+
__lsx_vseq_b(word, mask2))),
10585+
__lsx_vseq_b(word, mask3));
10586+
}
10587+
if (__lsx_bz_v(running)) return false;
10588+
return true;
10589+
}
1055510590
#else
1055610591
ada_really_inline bool has_tabs_or_newline(
1055710592
std::string_view user_input) noexcept {
@@ -11385,6 +11420,58 @@ ada_really_inline size_t find_next_host_delimiter_special(
1138511420
}
1138611421
return size_t(view.length());
1138711422
}
11423+
#elif ADA_LSX
11424+
ada_really_inline size_t find_next_host_delimiter_special(
11425+
std::string_view view, size_t location) noexcept {
11426+
// first check for short strings in which case we do it naively.
11427+
if (view.size() - location < 16) { // slow path
11428+
for (size_t i = location; i < view.size(); i++) {
11429+
if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
11430+
view[i] == '?' || view[i] == '[') {
11431+
return i;
11432+
}
11433+
}
11434+
return size_t(view.size());
11435+
}
11436+
// fast path for long strings (expected to be common)
11437+
size_t i = location;
11438+
const __m128i mask1 = __lsx_vrepli_b(':');
11439+
const __m128i mask2 = __lsx_vrepli_b('/');
11440+
const __m128i mask3 = __lsx_vrepli_b('\\');
11441+
const __m128i mask4 = __lsx_vrepli_b('?');
11442+
const __m128i mask5 = __lsx_vrepli_b('[');
11443+
11444+
for (; i + 15 < view.size(); i += 16) {
11445+
__m128i word = __lsx_vld((const __m128i*)(view.data() + i), 0);
11446+
__m128i m1 = __lsx_vseq_b(word, mask1);
11447+
__m128i m2 = __lsx_vseq_b(word, mask2);
11448+
__m128i m3 = __lsx_vseq_b(word, mask3);
11449+
__m128i m4 = __lsx_vseq_b(word, mask4);
11450+
__m128i m5 = __lsx_vseq_b(word, mask5);
11451+
__m128i m =
11452+
__lsx_vor_v(__lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m3, m4)), m5);
11453+
int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
11454+
if (mask != 0) {
11455+
return i + trailing_zeroes(mask);
11456+
}
11457+
}
11458+
if (i < view.size()) {
11459+
__m128i word =
11460+
__lsx_vld((const __m128i*)(view.data() + view.length() - 16), 0);
11461+
__m128i m1 = __lsx_vseq_b(word, mask1);
11462+
__m128i m2 = __lsx_vseq_b(word, mask2);
11463+
__m128i m3 = __lsx_vseq_b(word, mask3);
11464+
__m128i m4 = __lsx_vseq_b(word, mask4);
11465+
__m128i m5 = __lsx_vseq_b(word, mask5);
11466+
__m128i m =
11467+
__lsx_vor_v(__lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m3, m4)), m5);
11468+
int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
11469+
if (mask != 0) {
11470+
return view.length() - 16 + trailing_zeroes(mask);
11471+
}
11472+
}
11473+
return size_t(view.length());
11474+
}
1138811475
#else
1138911476
// : / [ \\ ?
1139011477
static constexpr std::array<uint8_t, 256> special_host_delimiters =
@@ -11518,6 +11605,53 @@ ada_really_inline size_t find_next_host_delimiter(std::string_view view,
1151811605
}
1151911606
return size_t(view.length());
1152011607
}
11608+
#elif ADA_LSX
11609+
ada_really_inline size_t find_next_host_delimiter(std::string_view view,
11610+
size_t location) noexcept {
11611+
// first check for short strings in which case we do it naively.
11612+
if (view.size() - location < 16) { // slow path
11613+
for (size_t i = location; i < view.size(); i++) {
11614+
if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
11615+
view[i] == '[') {
11616+
return i;
11617+
}
11618+
}
11619+
return size_t(view.size());
11620+
}
11621+
// fast path for long strings (expected to be common)
11622+
size_t i = location;
11623+
const __m128i mask1 = __lsx_vrepli_b(':');
11624+
const __m128i mask2 = __lsx_vrepli_b('/');
11625+
const __m128i mask4 = __lsx_vrepli_b('?');
11626+
const __m128i mask5 = __lsx_vrepli_b('[');
11627+
11628+
for (; i + 15 < view.size(); i += 16) {
11629+
__m128i word = __lsx_vld((const __m128i*)(view.data() + i), 0);
11630+
__m128i m1 = __lsx_vseq_b(word, mask1);
11631+
__m128i m2 = __lsx_vseq_b(word, mask2);
11632+
__m128i m4 = __lsx_vseq_b(word, mask4);
11633+
__m128i m5 = __lsx_vseq_b(word, mask5);
11634+
__m128i m = __lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m4, m5));
11635+
int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
11636+
if (mask != 0) {
11637+
return i + trailing_zeroes(mask);
11638+
}
11639+
}
11640+
if (i < view.size()) {
11641+
__m128i word =
11642+
__lsx_vld((const __m128i*)(view.data() + view.length() - 16), 0);
11643+
__m128i m1 = __lsx_vseq_b(word, mask1);
11644+
__m128i m2 = __lsx_vseq_b(word, mask2);
11645+
__m128i m4 = __lsx_vseq_b(word, mask4);
11646+
__m128i m5 = __lsx_vseq_b(word, mask5);
11647+
__m128i m = __lsx_vor_v(__lsx_vor_v(m1, m2), __lsx_vor_v(m4, m5));
11648+
int mask = __lsx_vpickve2gr_hu(__lsx_vmsknz_b(m), 0);
11649+
if (mask != 0) {
11650+
return view.length() - 16 + trailing_zeroes(mask);
11651+
}
11652+
}
11653+
return size_t(view.length());
11654+
}
1152111655
#else
1152211656
// : / [ ?
1152311657
static constexpr std::array<uint8_t, 256> host_delimiters = []() consteval {
@@ -11762,8 +11896,8 @@ ada_really_inline void parse_prepared_path(std::string_view input,
1176211896
? path_buffer_tmp
1176311897
: path_view;
1176411898
if (unicode::is_double_dot_path_segment(path_buffer)) {
11765-
if ((helpers::shorten_path(path, type) || special) &&
11766-
location == std::string_view::npos) {
11899+
helpers::shorten_path(path, type);
11900+
if (location == std::string_view::npos) {
1176711901
path += '/';
1176811902
}
1176911903
} else if (unicode::is_single_dot_path_segment(path_buffer) &&
@@ -15318,8 +15452,8 @@ inline void url_aggregator::consume_prepared_path(std::string_view input) {
1531815452
? path_buffer_tmp
1531915453
: path_view;
1532015454
if (unicode::is_double_dot_path_segment(path_buffer)) {
15321-
if ((helpers::shorten_path(path, type) || special) &&
15322-
location == std::string_view::npos) {
15455+
helpers::shorten_path(path, type);
15456+
if (location == std::string_view::npos) {
1532315457
path += '/';
1532415458
}
1532515459
} else if (unicode::is_single_dot_path_segment(path_buffer) &&

ada_url/ada.h

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* auto-generated on 2025-06-30 19:51:09 -0400. Do not edit! */
1+
/* auto-generated on 2025-07-16 22:15:14 -0400. Do not edit! */
22
/* begin file include/ada.h */
33
/**
44
* @file ada.h
@@ -431,6 +431,10 @@ namespace ada {
431431
#define ADA_NEON 1
432432
#endif
433433

434+
#if defined(__loongarch_sx)
435+
#define ADA_LSX 1
436+
#endif
437+
434438
#ifndef __has_cpp_attribute
435439
#define ada_lifetime_bound
436440
#elif __has_cpp_attribute(msvc::lifetimebound)
@@ -4204,6 +4208,7 @@ enum class errors : uint8_t { type_error };
42044208
#include <string_view>
42054209
#include <string>
42064210
#include <optional>
4211+
#include <iostream>
42074212

42084213
#if ADA_TESTING
42094214
#include <iostream>
@@ -4233,6 +4238,17 @@ struct url_pattern_init {
42334238
pattern,
42344239
};
42354240

4241+
friend std::ostream& operator<<(std::ostream& os, process_type type) {
4242+
switch (type) {
4243+
case process_type::url:
4244+
return os << "url";
4245+
case process_type::pattern:
4246+
return os << "pattern";
4247+
default:
4248+
return os << "unknown";
4249+
}
4250+
}
4251+
42364252
// All strings must be valid UTF-8.
42374253
// @see https://urlpattern.spec.whatwg.org/#process-a-urlpatterninit
42384254
static tl::expected<url_pattern_init, errors> process(
@@ -9410,7 +9426,7 @@ result<std::optional<url_pattern_result>> url_pattern<regex_provider>::match(
94109426

94119427
#if ADA_INCLUDE_URL_PATTERN
94129428
namespace ada::url_pattern_helpers {
9413-
#ifdef ADA_TESTING
9429+
#if defined(ADA_TESTING) || defined(ADA_LOGGING)
94149430
inline std::string to_string(token_type type) {
94159431
switch (type) {
94169432
case token_type::INVALID_CHAR:
@@ -9437,7 +9453,7 @@ inline std::string to_string(token_type type) {
94379453
ada::unreachable();
94389454
}
94399455
}
9440-
#endif // ADA_TESTING
9456+
#endif // defined(ADA_TESTING) || defined(ADA_LOGGING)
94419457

94429458
template <url_pattern_regex::regex_concept regex_provider>
94439459
constexpr void constructor_string_parser<regex_provider>::rewind() {
@@ -10498,14 +10514,14 @@ constructor_string_parser<regex_provider>::parse(std::string_view input) {
1049810514
#ifndef ADA_ADA_VERSION_H
1049910515
#define ADA_ADA_VERSION_H
1050010516

10501-
#define ADA_VERSION "3.2.5"
10517+
#define ADA_VERSION "3.2.6"
1050210518

1050310519
namespace ada {
1050410520

1050510521
enum {
1050610522
ADA_VERSION_MAJOR = 3,
1050710523
ADA_VERSION_MINOR = 2,
10508-
ADA_VERSION_REVISION = 5,
10524+
ADA_VERSION_REVISION = 6,
1050910525
};
1051010526

1051110527
} // namespace ada

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "ada-url"
7-
version = "1.24.0"
7+
version = "1.25.0"
88
authors = [
99
{name = "Bo Bayles", email = "[email protected]"},
1010
]

0 commit comments

Comments
 (0)