Skip to content

Commit

Permalink
use the stabilized context to assess failures with STRICT and LIST op…
Browse files Browse the repository at this point in the history
…tions
  • Loading branch information
julien6387 committed Jan 18, 2025
1 parent 31e2f02 commit 69facc5
Show file tree
Hide file tree
Showing 9 changed files with 177 additions and 194 deletions.
46 changes: 0 additions & 46 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,52 +63,6 @@
* Fix a possible race condition in the Supervisor proxies management.


## 0.19 (2025-xx-xx)

* Fix a random Python crash due to race condition when restarting **Supvisors**.

* Fix the statistics compiler, so that it manages the **Supvisors** instances discovered.

* Refactoring of the **Supvisors** internal state machine.
The state `INITIALIZATION` has been renamed as `SYNCHRONIZATION` and a new state `ELECTION` has been added
to get more stability in the *Master* election.

* Fix the issue raised by the rejected [Pull Request #120](https://github.com/julien6387/supvisors/pull/120).
**Supvisors** uses the `ioctl` functions to get the relevant network data on all network interfaces and matches
the local instance on the whole information available.
During the handshake, the local network data is shared with the other **Supvisors** instances using the new XML-RPC
`get_local_supvisors_info`.

* Take into account the network data in the **Supvisors** Web UI, so that the same network interface is used
when navigating to another node.

* Add a session cookie to **Supvisors** Web UI client.
All statistics images served by the **Supvisors** Web UI are renamed to allow multiple auto-refreshed views
in the browser tabs.

* Add the XML-RPCs `get_instance_state_modes` and `get_all_instances_state_modes` in support of getting detailed state
and modes information from a single **Supvisors** instance.
This information has been removed from the XML-RPCs `get_instance_info` and `get_all_instances_info`.

* Add a new option `supvisors_failure_strategy` to decide what to do when the initial conditions are not met anymore.

* Add a new option `css_files` to allow user CSS in the **Supvisors** Web UI.

* Remove the Supvisors instance state `UNKNOWN`, and rename `SILENT` as `STOPPED` (default value).

* Add the process PID to the process statistics event.

* Add a timestamp to the events of the **Supvisors** event interface.
This timestamp is also added to the XML-RPCs `get_supvisors_state`, `get_instance_state_modes`,
`get_all_instances_state_modes`, `get_application_info`, `get_all_applications_info`, `get_process_info`,
`get_all_process_info`.
The `last_event_time` in the Process status event is renamed as `last_event_mtime`.

* Cancel individual subscriptions from the Supervisor events rather than perform a global clear.

* Consider variable CPU frequency value returned by `psutil`.


## 0.18.6 (2024-08-20)

* Completion of fix about process CPU statistics when using SOLARIS mode.
Expand Down
11 changes: 0 additions & 11 deletions supvisors/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,17 +144,6 @@ def get_nodes_load(self) -> LoadMap:
for machine_id, identifiers in self.mapper.nodes.items()}

# methods on instances
def initial_running(self) -> bool:
""" Return True if all declared Supervisor instances are in RUNNING state. """
return all(status.state == SupvisorsInstanceStates.RUNNING
for identifier, status in self.instances.items()
if identifier in self.mapper.initial_identifiers)

def all_running(self) -> bool:
""" Return True if all Supervisor instances are in RUNNING state. """
return all(status.state == SupvisorsInstanceStates.RUNNING
for status in self.instances.values())

def running_identifiers(self) -> NameList:
""" Return the identifiers of the Supervisor instances in RUNNING state. """
return self.identifiers_by_states([SupvisorsInstanceStates.RUNNING])
Expand Down
17 changes: 9 additions & 8 deletions supvisors/statemachine.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ def _check_strict_failure(self) -> Optional[bool]:
in the supvisors_list option are running, and True otherwise.
Return None if the STRICT option is not set. """
if SynchronizationOptions.STRICT in self.supvisors.options.synchro_options:
if self.context.initial_running():
if self.state_modes.initial_running():
if self.sync_alerts[SynchronizationOptions.STRICT]:
self.sync_alerts[SynchronizationOptions.STRICT] = False
self.logger.warn('OnState.check_strict_failure: all expected Supvisors instances are RUNNING')
Expand All @@ -235,7 +235,7 @@ def _check_list_failure(self) -> Optional[bool]:
declared in the supvisors_list option and those discovered) are running, and True otherwise.
Return None if the LIST option is not set. """
if SynchronizationOptions.LIST in self.supvisors.options.synchro_options:
if self.context.all_running():
if self.state_modes.all_running():
if self.sync_alerts[SynchronizationOptions.LIST]:
self.sync_alerts[SynchronizationOptions.LIST] = False
self.logger.warn('OnState.check_list_failure: all known Supvisors instances are RUNNING')
Expand All @@ -251,7 +251,11 @@ def _check_core_failure(self) -> Optional[bool]:
More particularly, if the CORE option is set, return False if all the core Supvisors instances are running,
and True otherwise.
Return None if the CORE option is not set. """
Return None if the CORE option is not set.
NOTE: the CORE option is unset at startup if core_identifiers is not set or empty, so it is considered
as a failure here (not meant to happen anyway).
"""
if SynchronizationOptions.CORE in self.supvisors.options.synchro_options:
if self.state_modes.core_instances_running():
if self.sync_alerts[SynchronizationOptions.CORE]:
Expand Down Expand Up @@ -310,7 +314,6 @@ def _check_end_sync_strict(self) -> Optional[bool]:
"""
failure = self._check_strict_failure()
if failure is False:
self.logger.info('SynchronizationState.check_end_sync_strict: all expected Supvisors instances are RUNNING')
return True
return False if failure else None

Expand All @@ -322,7 +325,6 @@ def _check_end_sync_list(self) -> Optional[bool]:
"""
failure = self._check_list_failure()
if failure is False:
self.logger.info('SynchronizationState.check_end_sync_list: all known Supvisors instances are RUNNING')
return True
return False if failure else None

Expand Down Expand Up @@ -361,10 +363,9 @@ def _check_end_sync_core(self, uptime: float) -> Optional[bool]:
# in case of late start, a security limit of SYNCHRO_TIMEOUT_MIN is kept to give a chance
# to other Supvisors instances and limit the number of re-distributions
if uptime >= SupvisorsOptions.SYNCHRO_TIMEOUT_MIN:
self.logger.info('SynchronizationState.check_end_sync_core: all core Supvisors instances are RUNNING')
return True
self.logger.debug('SynchronizationState.check_end_sync_core: all core Supvisors instances are RUNNING'
f' but wait ({uptime} < {SupvisorsOptions.SYNCHRO_TIMEOUT_MIN})')
self.logger.info('SynchronizationState.check_end_sync_core: all core Supvisors instances are RUNNING,'
f' waiting ({uptime} < {SupvisorsOptions.SYNCHRO_TIMEOUT_MIN})')
return False
return False if failure else None

Expand Down
60 changes: 45 additions & 15 deletions supvisors/statemodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,19 +63,21 @@ def nick_identifier(self):
""" Property getter for the 'nick_identifier' attribute of the Supvisors instance. """
return self.supvisors_id.nick_identifier

def get_stable_identifiers(self) -> NameSet:
def get_stable_running_identifiers(self) -> NameSet:
""" Check the context stability of the Supvisors instance, by returning all its known remote Supvisors instances
that are in a stable state (RUNNING, STOPPED, ISOLATED).
that are in a RUNNING state.
In the event where any remote Supvisors instance is NOT in a stable state, the method returns an empty set.
In the event where any remote Supvisors instance is NOT in a stable state (RUNNING, STOPPED, ISOLATED),
the method returns an empty set.
:return: the Supvisors identifiers if the context is completely stable.
:return: the running Supvisors identifiers if the context is completely stable.
"""
stable_identifiers = set()
for identifier, state in self.instance_states.items():
if state not in StateModes.STABLE_STATES:
return set()
stable_identifiers.add(identifier)
if state == SupvisorsInstanceStates.RUNNING:
stable_identifiers.add(identifier)
return stable_identifiers

def running_identifiers(self) -> NameSet:
Expand Down Expand Up @@ -356,16 +358,13 @@ def evaluate_stability(self) -> None:
""" Evaluate the Supvisors stability, i.e. if all stable identifiers are identical for all Supvisors instances.
This is called periodically from the Supvisors FSM.
NOTE: a stable Supvisors instance is not necessarily RUNNING.
"""
stable_identifiers = [sm.get_stable_identifiers()
stable_identifiers = [sm.get_stable_running_identifiers()
for identifier, sm in self.instance_state_modes.items()
if self.is_running(identifier)]
self.logger.debug(f'SupvisorsStateModes.update_stability: stable_identifiers={stable_identifiers}')
if stable_identifiers and all(identifiers == stable_identifiers[0]
for identifiers in stable_identifiers):
# TBC: store only running ones ?
self.stable_identifiers = stable_identifiers[0]
else:
self.stable_identifiers = set()
Expand Down Expand Up @@ -457,17 +456,48 @@ def accept_master(self) -> None:
# the important thing is to exit the SYNCHRONIZATION state if a USER Master is available
self.master_identifier = next(iter(masters))

# Core instances
# Consolidated running instances
def all_running(self) -> bool:
""" Check if all Supvisors instances are in RUNNING state.
This is checked against the stable identifiers to ensure that the Supvisors instances are seen as RUNNING
by all Supvisors instances.
That's why this method is not in Supvisors Context module anymore.
:return: True if all Supvisors instances are in RUNNING state.
"""
self.logger.debug(f'SupvisorsStateModes.all_running: stable_identifiers={self.stable_identifiers}'
f' (expected nb={len(self.instance_state_modes)})')
return len(self.stable_identifiers) == len(self.instance_state_modes)

def initial_running(self) -> bool:
""" Check if all declared Supvisors instances are in RUNNING state.
This is checked against the stable identifiers to ensure that the declared Supvisors instances are seen
as RUNNING by all Supvisors instances.
That's why this method is not in Supvisors Context module anymore.
:return: True if all declared Supvisors instances are in RUNNING state, False other or no declared instance.
"""
initial_identifiers = set(self.mapper.initial_identifiers)
if initial_identifiers:
self.logger.debug(f'SupvisorsStateModes.initial_running: stable_identifiers={self.stable_identifiers}'
f' initial_identifiers={initial_identifiers}')
return initial_identifiers.issubset(self.stable_identifiers)
return False

def core_instances_running(self) -> bool:
""" Check if all Supvisors Core instances are in RUNNING state.
This is checked against the stable identifiers to ensure that the Supvisors Core instances are seen as RUNNING
by all Supvisors instances.
That's why this method is not in Supvisors Context module.
That's why this method is not in Supvisors Context module anymore.
:return: True if all Supvisors Core instances are in RUNNING state.
:return: True if all Supvisors Core instances are in RUNNING state, False otherwise or no Core instance.
"""
core_identifiers = set(self.mapper.core_identifiers)
stable_running_identifiers = {identifier for identifier in self.stable_identifiers
if self.is_running(identifier)}
return core_identifiers.issubset(stable_running_identifiers)
if core_identifiers:
self.logger.debug(f'SupvisorsStateModes.core_instances_running: stable_identifiers={self.stable_identifiers}'
f' core_identifiers={core_identifiers}')
return core_identifiers.issubset(self.stable_identifiers)
return False
2 changes: 1 addition & 1 deletion supvisors/test/etc/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ auto_fence = false
event_link = ZMQ
event_port = 60002
synchro_options = LIST,CORE,TIMEOUT,USER
synchro_timeout = 20
synchro_timeout = 30
inactivity_ticks = 3
core_identifiers = supv-01,supv-03
disabilities_file = /tmp/disabilities.json
Expand Down
2 changes: 1 addition & 1 deletion supvisors/test/etc/supervisord_alt.conf
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ stereotypes = third
rules_files = etc/my_movies*.xml
auto_fence = false
synchro_options = LIST,CORE,TIMEOUT,USER
synchro_timeout = 20
synchro_timeout = 30
inactivity_ticks = 3
core_identifiers = supv-01,supv-03
starting_strategy = CONFIG
Expand Down
38 changes: 1 addition & 37 deletions supvisors/tests/test_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,46 +122,10 @@ def test_get_nodes_load(mocker, context):
assert context.get_nodes_load() == {'01:23:45:67:89:ab': 10, 'ab:cd:ef:01:23:45': 8}


def test_initial_running(supvisors_instance, context):
""" Test the check of initial Supvisors instances running. """
# update mapper
supvisors_instance.mapper.initial_identifiers = ['10.0.0.1:25000', '10.0.0.2:25000', '10.0.0.3:25000']
assert not context.initial_running()
# add some RUNNING
context.local_status._state = SupvisorsInstanceStates.RUNNING
context.instances['10.0.0.1:25000']._state = SupvisorsInstanceStates.RUNNING
context.instances['10.0.0.2:25000']._state = SupvisorsInstanceStates.RUNNING
context.instances['10.0.0.4:25000']._state = SupvisorsInstanceStates.RUNNING
assert not context.initial_running()
# add some RUNNING so that all initial instances are RUNNING
context.instances['10.0.0.3:25000']._state = SupvisorsInstanceStates.RUNNING
assert context.initial_running()
# set all RUNNING
for instance in context.instances.values():
instance._state = SupvisorsInstanceStates.RUNNING
assert context.initial_running()


def test_all_running(context):
""" Test the check of all Supvisors instances running. """
assert not context.all_running()
# add some RUNNING
context.local_status._state = SupvisorsInstanceStates.RUNNING
context.instances['10.0.0.1:25000']._state = SupvisorsInstanceStates.RUNNING
context.instances['10.0.0.2:25000']._state = SupvisorsInstanceStates.RUNNING
context.instances['10.0.0.4:25000']._state = SupvisorsInstanceStates.RUNNING
assert not context.all_running()
# set all RUNNING
for instance in context.instances.values():
instance._state = SupvisorsInstanceStates.RUNNING
assert context.all_running()


def test_instances_by_states(supvisors_instance, context):
""" Test the access to instances in unknown state. """
# test initial states
all_instances = sorted(supvisors_instance.mapper.instances.keys())
assert not context.all_running()
assert context.running_identifiers() == []
assert context.isolated_identifiers() == []
assert sorted(context.valid_identifiers()) == all_instances
Expand All @@ -176,7 +140,6 @@ def test_instances_by_states(supvisors_instance, context):
context.instances['10.0.0.5:25000']._state = SupvisorsInstanceStates.FAILED
context.instances['10.0.0.6:25000']._state = SupvisorsInstanceStates.CHECKED
# test new states
assert not context.all_running()
assert context.running_identifiers() == ['10.0.0.1:25000']
assert context.isolated_identifiers() == ['10.0.0.3:25000']
assert sorted(context.valid_identifiers()) == sorted(['10.0.0.1:25000', '10.0.0.2:25000', '10.0.0.4:25000',
Expand All @@ -186,6 +149,7 @@ def test_instances_by_states(supvisors_instance, context):
SupvisorsInstanceStates.ISOLATED]) == ['10.0.0.1:25000', '10.0.0.3:25000']
assert context.identifiers_by_states([SupvisorsInstanceStates.STOPPED]) == ['10.0.0.4:25000']


def test_activate_checked(context):
""" Test the activation of checked Supvisors instances. """
# change node states
Expand Down
Loading

0 comments on commit 69facc5

Please sign in to comment.