Skip to content

Commit c3c931f

Browse files
committed
Refactor channel destruction logic
- Use ares_queue_wait_empty to wait for queries to be complete before destruction - Make sure NO queries are cancelled as side effects on __del__ - Start the destruction thread early, as soon as a channel is created Cancelling pending queries while in a query callback seemingly causes heap corruption or double-free bugs, so delay the operation until no Python code if using the channel anymore, that is, the destructor thread. Fixes: aio-libs/aiodns#175 Fixes: #248
1 parent 0486f67 commit c3c931f

File tree

3 files changed

+76
-76
lines changed

3 files changed

+76
-76
lines changed

docs/channel.rst

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -77,19 +77,6 @@
7777
While channels will attempt automatic cleanup during garbage collection, explicit
7878
closing is safer as it gives you control over when resources are released.
7979

80-
.. warning::
81-
The channel destruction mechanism has a limited throughput of 60 channels per minute
82-
(one channel per second) to ensure thread safety and prevent use-after-free errors
83-
in c-ares. This means:
84-
85-
- Avoid creating transient channels for individual queries
86-
- Reuse channel instances whenever possible
87-
- For applications with high query volume, use a single long-lived channel
88-
- If you must create multiple channels, consider pooling them
89-
90-
Creating and destroying channels rapidly will result in a backlog as the destruction
91-
queue processes channels sequentially with a 1-second delay between each.
92-
9380
.. py:method:: getaddrinfo(host, port, callback, family=0, type=0, proto=0, flags=0)
9481
9582
:param string host: Hostname to resolve.

src/_cffi_src/build_cares.py

Lines changed: 50 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -90,34 +90,6 @@
9090
typedef int... ares_socket_t;
9191
typedef int... ares_socklen_t;
9292
93-
#define ARES_SUCCESS ...
94-
95-
#define ARES_ENODATA ...
96-
#define ARES_EFORMERR ...
97-
#define ARES_ESERVFAIL ...
98-
#define ARES_ENOTFOUND ...
99-
#define ARES_ENOTIMP ...
100-
#define ARES_EREFUSED ...
101-
#define ARES_EBADQUERY ...
102-
#define ARES_EBADNAME ...
103-
#define ARES_EBADFAMILY ...
104-
#define ARES_EBADRESP ...
105-
#define ARES_ECONNREFUSED ...
106-
#define ARES_ETIMEOUT ...
107-
#define ARES_EOF ...
108-
#define ARES_EFILE ...
109-
#define ARES_ENOMEM ...
110-
#define ARES_EDESTRUCTION ...
111-
#define ARES_EBADSTR ...
112-
#define ARES_EBADFLAGS ...
113-
#define ARES_ENONAME ...
114-
#define ARES_EBADHINTS ...
115-
#define ARES_ENOTINITIALIZED ...
116-
#define ARES_ELOADIPHLPAPI ...
117-
#define ARES_EADDRGETNETWORKPARAMS ...
118-
#define ARES_ECANCELLED ...
119-
#define ARES_ESERVICE ...
120-
12193
#define ARES_FLAG_USEVC ...
12294
#define ARES_FLAG_PRIMARY ...
12395
#define ARES_FLAG_IGNTC ...
@@ -229,6 +201,54 @@
229201
size_t retry_delay;
230202
};
231203
204+
typedef enum {
205+
ARES_SUCCESS = 0,
206+
207+
/* Server error codes (ARES_ENODATA indicates no relevant answer) */
208+
ARES_ENODATA = 1,
209+
ARES_EFORMERR = 2,
210+
ARES_ESERVFAIL = 3,
211+
ARES_ENOTFOUND = 4,
212+
ARES_ENOTIMP = 5,
213+
ARES_EREFUSED = 6,
214+
215+
/* Locally generated error codes */
216+
ARES_EBADQUERY = 7,
217+
ARES_EBADNAME = 8,
218+
ARES_EBADFAMILY = 9,
219+
ARES_EBADRESP = 10,
220+
ARES_ECONNREFUSED = 11,
221+
ARES_ETIMEOUT = 12,
222+
ARES_EOF = 13,
223+
ARES_EFILE = 14,
224+
ARES_ENOMEM = 15,
225+
ARES_EDESTRUCTION = 16,
226+
ARES_EBADSTR = 17,
227+
228+
/* ares_getnameinfo error codes */
229+
ARES_EBADFLAGS = 18,
230+
231+
/* ares_getaddrinfo error codes */
232+
ARES_ENONAME = 19,
233+
ARES_EBADHINTS = 20,
234+
235+
/* Uninitialized library error code */
236+
ARES_ENOTINITIALIZED = 21, /* introduced in 1.7.0 */
237+
238+
/* ares_library_init error codes */
239+
ARES_ELOADIPHLPAPI = 22, /* introduced in 1.7.0 */
240+
ARES_EADDRGETNETWORKPARAMS = 23, /* introduced in 1.7.0 */
241+
242+
/* More error codes */
243+
ARES_ECANCELLED = 24, /* introduced in 1.7.0 */
244+
245+
/* More ares_getaddrinfo error codes */
246+
ARES_ESERVICE = 25, /* ares_getaddrinfo() was passed a text service name that
247+
* is not recognized. introduced in 1.16.0 */
248+
249+
ARES_ENOSERVER = 26 /* No DNS servers were configured */
250+
} ares_status_t;
251+
232252
/*! Values for ARES_OPT_EVENT_THREAD */
233253
typedef enum {
234254
/*! Default (best choice) event system */
@@ -597,6 +617,8 @@
597617
int ares_inet_pton(int af, const char *src, void *dst);
598618
599619
ares_bool_t ares_threadsafety(void);
620+
621+
ares_status_t ares_queue_wait_empty(ares_channel channel, int timeout_ms);
600622
"""
601623

602624
CALLBACKS = """

src/pycares/__init__.py

Lines changed: 26 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,7 @@
1212
import math
1313
import socket
1414
import threading
15-
import time
1615
from collections.abc import Callable, Iterable
17-
from contextlib import suppress
1816
from typing import Any, Callable, Final, Optional, Dict, Union
1917
from queue import SimpleQueue
2018

@@ -341,42 +339,46 @@ class _ChannelShutdownManager:
341339
def __init__(self) -> None:
342340
self._queue: SimpleQueue = SimpleQueue()
343341
self._thread: Optional[threading.Thread] = None
344-
self._thread_started = False
342+
self._start_lock = threading.Lock()
345343

346344
def _run_safe_shutdown_loop(self) -> None:
347345
"""Process channel destruction requests from the queue."""
348346
while True:
349347
# Block forever until we get a channel to destroy
350348
channel = self._queue.get()
351349

352-
# Sleep for 1 second to ensure c-ares has finished processing
353-
# Its important that c-ares is past this critcial section
354-
# so we use a delay to ensure it has time to finish processing
355-
# https://github.com/c-ares/c-ares/blob/4f42928848e8b73d322b15ecbe3e8d753bf8734e/src/lib/ares_process.c#L1422
356-
time.sleep(1.0)
350+
# Cancel all pending queries - this will trigger callbacks with ARES_ECANCELLED
351+
_lib.ares_cancel(channel[0])
352+
353+
# Wait for all queries to finish
354+
_lib.ares_queue_wait_empty(channel[0], -1)
357355

358356
# Destroy the channel
359357
if channel is not None:
360358
_lib.ares_destroy(channel[0])
361359

360+
def start(self) -> None:
361+
"""Start the background thread if not already started."""
362+
if self._thread is not None:
363+
return
364+
with self._start_lock:
365+
if self._thread is not None:
366+
# Started by another thread while waiting for the lock
367+
return
368+
self._thread = threading.Thread(target=self._run_safe_shutdown_loop, daemon=True)
369+
self._thread.start()
370+
362371
def destroy_channel(self, channel) -> None:
363372
"""
364373
Schedule channel destruction on the background thread with a safety delay.
365374
366375
Thread Safety and Synchronization:
367376
This method uses SimpleQueue which is thread-safe for putting items
368377
from multiple threads. The background thread processes channels
369-
sequentially with a 1-second delay before each destruction.
378+
sequentially waiting for queries to end before each destruction.
370379
"""
371-
# Put the channel in the queue
372380
self._queue.put(channel)
373381

374-
# Start the background thread if not already started
375-
if not self._thread_started:
376-
self._thread_started = True
377-
self._thread = threading.Thread(target=self._run_safe_shutdown_loop, daemon=True)
378-
self._thread.start()
379-
380382

381383
# Global shutdown manager instance
382384
_shutdown_manager = _ChannelShutdownManager()
@@ -504,11 +506,12 @@ def __init__(self,
504506
if local_dev:
505507
self.set_local_dev(local_dev)
506508

509+
# Ensure the shutdown thread is started
510+
_shutdown_manager.start()
511+
507512
def __del__(self) -> None:
508513
"""Ensure the channel is destroyed when the object is deleted."""
509-
if self._channel is not None:
510-
# Schedule channel destruction using the global shutdown manager
511-
self._schedule_destruction()
514+
self.close()
512515

513516
def _create_callback_handle(self, callback_data):
514517
"""
@@ -758,24 +761,12 @@ def close(self) -> None:
758761
# Already destroyed
759762
return
760763

761-
# Cancel all pending queries - this will trigger callbacks with ARES_ECANCELLED
762-
self.cancel()
764+
# NB: don't cancel queries here, it may lead to problem if done from a
765+
# query callback.
763766

764767
# Schedule channel destruction
765-
self._schedule_destruction()
766-
767-
def _schedule_destruction(self) -> None:
768-
"""Schedule channel destruction using the global shutdown manager."""
769-
if self._channel is None:
770-
return
771-
channel = self._channel
772-
self._channel = None
773-
# Can't start threads during interpreter shutdown
774-
# The channel will be cleaned up by the OS
775-
# TODO: Change to PythonFinalizationError when Python 3.12 support is dropped
776-
with suppress(RuntimeError):
777-
_shutdown_manager.destroy_channel(channel)
778-
768+
channel, self._channel = self._channel, None
769+
_shutdown_manager.destroy_channel(channel)
779770

780771

781772
class AresResult:

0 commit comments

Comments
 (0)