Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[IMP] queue_job_cron_jobrunner: channel #750

Draft
wants to merge 3 commits into
base: 14.0
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions queue_job_cron_jobrunner/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,6 @@ Parallel execution of jobs can be achieved by leveraging multiple ``ir.cron`` re
* Duplicate the ``queue_job_cron`` cron record as many times as needed, until you have
as many records as cron workers.

Known issues / Roadmap
======================

* Support channel capacity and priority. (See ``_acquire_one_job``)
* Gracefully handle CronWorker CPU timeouts. (See ``_job_runner``)
* Commit transaction after job state updated to started. (See ``_process``)

Bug Tracker
===========

Expand Down
181 changes: 151 additions & 30 deletions queue_job_cron_jobrunner/models/queue_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,24 @@

import logging
import traceback
from datetime import datetime
from datetime import datetime, timedelta
from io import StringIO

import psutil
from psycopg2 import OperationalError

from odoo import _, api, fields, models, tools
from odoo.service.model import PG_CONCURRENCY_ERRORS_TO_RETRY

from odoo.addons.base.models.ir_cron import _intervalTypes
from odoo.addons.queue_job.controllers.main import PG_RETRY
from odoo.addons.queue_job.exception import (
FailedJobError,
NothingToDoJob,
RetryableJobError,
)
from odoo.addons.queue_job.job import Job
from odoo.addons.queue_job.jobrunner import QueueJobRunner

_logger = logging.getLogger(__name__)

Expand All @@ -27,47 +30,86 @@
_inherit = "queue.job"

@api.model
def _acquire_one_job(self):
def _acquire_one_job(self, commit=False):
"""Acquire the next job to be run.

:returns: queue.job record (locked for update)
"""
# TODO: This method should respect channel priority and capacity,
# rather than just fetching them by creation date.
self.flush()
runner = QueueJobRunner.from_environ_or_config()
self.env.cr.execute(
"""
SELECT id
FROM queue_job
WHERE state = 'pending'
AND (eta IS NULL OR eta <= (now() AT TIME ZONE 'UTC'))
ORDER BY priority, date_created
LIMIT 1 FOR NO KEY UPDATE SKIP LOCKED
FOR NO KEY UPDATE
"""
)
row = self.env.cr.fetchone()
return self.browse(row and row[0])
rows = self.env.cr.fetchall()

channels = {}
for queue_job in self.search([("state", "=", "started")]):
if not queue_job.channel:
continue

Check warning on line 54 in queue_job_cron_jobrunner/models/queue_job.py

View check run for this annotation

Codecov / codecov/patch

queue_job_cron_jobrunner/models/queue_job.py#L54

Added line #L54 was not covered by tests
channels[queue_job.channel] = channels.get(queue_job.channel, 0) + 1
Comment on lines +51 to +55

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: what do you think?

Suggested change
channels = {}
for queue_job in self.search([("state", "=", "started")]):
if not queue_job.channel:
continue
channels[queue_job.channel] = channels.get(queue_job.channel, 0) + 1
channels = defaultdict(int)
for queue_job in self.search([("state", "=", "started")]):
if not queue_job.channel:
continue
channels[queue_job.channel] += 1

channels_without_capacity = set()
for channel_str, running in channels.items():
channel = runner.channel_manager.get_channel_by_name(
channel_str, autocreate=True
)
if channel.capacity and channel.capacity <= running:
channels_without_capacity.add(channel_str)
channels_without_capacity.discard(
"root"
) # root must be disabled to avoid normal jobrunner
_logger.info(
"_acquire_one_job channels_without_capacity %s",
channels_without_capacity,
)

result = self.browse()
for row in rows:
queue_job = self.browse(row[0])
if queue_job.channel and queue_job.channel in channels_without_capacity:
continue
job = Job._load_from_db_record(queue_job)
job.set_started()
job.store()
_logger.info(
"_acquire_one_job queue.job %s[channel=%s,uuid=%s] started",
row[0],
job.channel,
job.uuid,
)
result = queue_job
break
self.flush()
if commit: # pragma: no cover
self.env.cr.commit() # pylint: disable=invalid-commit
return result

def _process(self, commit=False):
"""Process the job"""
self.ensure_one()
job = Job._load_from_db_record(self)
# Set it as started
job.set_started()
job.store()
_logger.debug("%s started", job.uuid)
# TODO: Commit the state change so that the state can be read from the UI
# while the job is processing. However, doing this will release the
# lock on the db, so we need to find another way.
# if commit:
# self.flush()
# self.env.cr.commit()

# Actual processing
try:
try:
with self.env.cr.savepoint():
_logger.info(
"perform %s[channel=%s,uuid=%s]",
self.id,
self.channel,
self.uuid,
)
job.perform()
_logger.info(
"performed %s[channel=%s,uuid=%s]",
self.id,
self.channel,
self.uuid,
)
job.set_done()
job.store()
except OperationalError as err:
Expand All @@ -87,20 +129,28 @@
msg = _("Job interrupted and set to Done: nothing to do.")
job.set_done(msg)
job.store()
_logger.info(

Check warning on line 132 in queue_job_cron_jobrunner/models/queue_job.py

View check run for this annotation

Codecov / codecov/patch

queue_job_cron_jobrunner/models/queue_job.py#L132

Added line #L132 was not covered by tests
"interrupted %s[channel=%s,uuid=%s]", self.id, self.channel, self.uuid
)

except RetryableJobError as err:
# delay the job later, requeue
job.postpone(result=str(err), seconds=5)
job.set_pending(reset_retry=False)
job.store()
_logger.debug("%s postponed", job)
_logger.info(

Check warning on line 141 in queue_job_cron_jobrunner/models/queue_job.py

View check run for this annotation

Codecov / codecov/patch

queue_job_cron_jobrunner/models/queue_job.py#L141

Added line #L141 was not covered by tests
"postponed %s[channel=%s,uuid=%s]", self.id, self.channel, self.uuid
)

except (FailedJobError, Exception):
with StringIO() as buff:
traceback.print_exc(file=buff)
_logger.error(buff.getvalue())
job.set_failed(exc_info=buff.getvalue())
job.store()
_logger.info(
"failed %s[channel=%s,uuid=%s]", self.id, self.channel, self.uuid
)

if commit: # pragma: no cover
self.env["base"].flush()
Expand All @@ -113,18 +163,71 @@
@api.model
def _job_runner(self, commit=True):
"""Short-lived job runner, triggered by async crons"""
job = self._acquire_one_job()
self._release_started_jobs(commit=commit)
job = self._acquire_one_job(commit=commit)

while job:
job._process(commit=commit)
job = self._acquire_one_job()
# TODO: If limit_time_real_cron is reached before all the jobs are done,
# the worker will be killed abruptly.
# Ideally, find a way to know if we're close to reaching this limit,
# stop processing, and trigger a new execution to continue.
#
# if job and limit_time_real_cron_reached_or_about_to_reach:
# self._cron_trigger()
# break

if self._stop_processing():
_logger.info(
"Stop processing queue jobs in this "
"ir.cron call, waiting next ir.cron call.",
)
return

job = self._acquire_one_job(commit=commit)

@api.model
def _stop_processing(self):
    """Tell whether this cron call should stop picking up new jobs.

    The decision relies on an estimate of when the next job runner
    ``ir.cron`` is going to be triggered: once that date, minus a
    configurable safety threshold, is reached, processing stops and the
    remaining jobs are left to the next ``ir.cron`` call.

    One of the goals is to mitigate the case where, with a long list of
    queue jobs to process, the cron thread gets killed by Odoo (or
    odoo.sh) because of the ``limit_time_real_cron`` limit.

    We suggest setting the ir.cron interval lower than
    ``limit_time_real_cron``.

    The threshold is read, in seconds, from the system parameter
    ``queue_job_cron_jobrunner.stop_processing_threshold_seconds``; it
    defaults to ``0`` when the parameter is unset or is not an integer.

    :returns: ``True`` when no more jobs should be processed in this call,
        ``False`` otherwise
    """
    # In the current cursor (nor in a new one) we cannot see the fresh
    # nextcall, which is committed by Odoo at the end of the cron. So we
    # assume all crons are running, i.e. the stored nextcall is the date the
    # current run started, and add one interval to estimate the next trigger.
    runner_crons = (
        self.env["ir.cron"].sudo().search([("queue_job_runner", "=", True)])
    )
    next_calls = [
        cron.nextcall + _intervalTypes[cron.interval_type](cron.interval_number)
        for cron in runner_crons
    ]
    if not next_calls:
        _logger.info("Stopping queue job processing, no nextcall found.")
        return True

    next_cron_job_runner_trigger_date = min(next_calls)

    param_name = "queue_job_cron_jobrunner.stop_processing_threshold_seconds"
    raw_threshold = (
        self.env["ir.config_parameter"].sudo().get_param(param_name, "0")
    )
    try:
        stop_processing_threshold_seconds = int(raw_threshold)
    except (TypeError, ValueError):
        # A mistyped system parameter must not crash every cron run:
        # fall back to the documented default instead.
        _logger.warning(
            "Invalid value %r for %s, using 0 seconds.",
            raw_threshold,
            param_name,
        )
        stop_processing_threshold_seconds = 0

    end_process_queue_job_date = next_cron_job_runner_trigger_date - timedelta(
        seconds=stop_processing_threshold_seconds
    )
    now = fields.Datetime.now()
    _logger.debug(
        "now: %s - estimated cron nextcall: %s - "
        "Threshold: %ss - "
        "stop processing new job after %s",
        now,
        next_cron_job_runner_trigger_date,
        stop_processing_threshold_seconds,
        end_process_queue_job_date,
    )
    return now >= end_process_queue_job_date

@api.model
def _cron_trigger(self, at=None):
Expand Down Expand Up @@ -166,6 +269,24 @@
if delayed_etas:
self._cron_trigger(at=list(delayed_etas))

@api.model
def _release_started_jobs(self, commit=False):
"""Set back to pending the started jobs whose worker PID is not running.

Searches the jobs in state ``started`` whose ``worker_pid`` is not among
the PIDs reported by ``psutil.process_iter()``, and resets each of them
to pending through the ``Job`` API.

:param commit: when truthy, the cursor is committed
"""
# PIDs of the processes psutil currently reports.
# NOTE(review): presumably only meaningful when all workers run on the
# same host as this process - confirm for multi-host deployments.
pids = [x.pid for x in psutil.process_iter()]
for record in self.search(
[("state", "=", "started"), ("worker_pid", "not in", pids)]
):
# Load the job from its record, flag it pending and persist the change.
job = Job._load_from_db_record(record)
job.set_pending()
job.store()
_logger.info(
"release started job %s[channel=%s,uuid=%s]",
record.id,
record.channel,
record.uuid,
)
# NOTE(review): indentation is not visible in this view - confirm whether
# the commit happens once after the loop or after each released job.
if commit: # pragma: no cover
self.env.cr.commit() # pylint: disable=invalid-commit

@api.model_create_multi
def create(self, vals_list):
# When jobs are created, also create the cron trigger
Expand Down
3 changes: 0 additions & 3 deletions queue_job_cron_jobrunner/readme/ROADMAP.rst

This file was deleted.

29 changes: 10 additions & 19 deletions queue_job_cron_jobrunner/static/description/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -392,12 +392,11 @@ <h1 class="title">Queue Job Cron Jobrunner</h1>
<div class="contents local topic" id="contents">
<ul class="simple">
<li><a class="reference internal" href="#configuration" id="toc-entry-1">Configuration</a></li>
<li><a class="reference internal" href="#known-issues-roadmap" id="toc-entry-2">Known issues / Roadmap</a></li>
<li><a class="reference internal" href="#bug-tracker" id="toc-entry-3">Bug Tracker</a></li>
<li><a class="reference internal" href="#credits" id="toc-entry-4">Credits</a><ul>
<li><a class="reference internal" href="#authors" id="toc-entry-5">Authors</a></li>
<li><a class="reference internal" href="#contributors" id="toc-entry-6">Contributors</a></li>
<li><a class="reference internal" href="#maintainers" id="toc-entry-7">Maintainers</a></li>
<li><a class="reference internal" href="#bug-tracker" id="toc-entry-2">Bug Tracker</a></li>
<li><a class="reference internal" href="#credits" id="toc-entry-3">Credits</a><ul>
<li><a class="reference internal" href="#authors" id="toc-entry-4">Authors</a></li>
<li><a class="reference internal" href="#contributors" id="toc-entry-5">Contributors</a></li>
<li><a class="reference internal" href="#maintainers" id="toc-entry-6">Maintainers</a></li>
</ul>
</li>
</ul>
Expand All @@ -423,32 +422,24 @@ <h1><a class="toc-backref" href="#toc-entry-1">Configuration</a></h1>
as many records as cron workers.</li>
</ul>
</div>
<div class="section" id="known-issues-roadmap">
<h1><a class="toc-backref" href="#toc-entry-2">Known issues / Roadmap</a></h1>
<ul class="simple">
<li>Support channel capacity and priority. (See <tt class="docutils literal">_acquire_one_job</tt>)</li>
<li>Gracefully handle CronWorker CPU timeouts. (See <tt class="docutils literal">_job_runner</tt>)</li>
<li>Commit transaction after job state updated to started. (See <tt class="docutils literal">_process</tt>)</li>
</ul>
</div>
<div class="section" id="bug-tracker">
<h1><a class="toc-backref" href="#toc-entry-3">Bug Tracker</a></h1>
<h1><a class="toc-backref" href="#toc-entry-2">Bug Tracker</a></h1>
<p>Bugs are tracked on <a class="reference external" href="https://github.com/OCA/queue/issues">GitHub Issues</a>.
In case of trouble, please check there if your issue has already been reported.
If you spotted it first, help us to smash it by providing a detailed and welcomed
<a class="reference external" href="https://github.com/OCA/queue/issues/new?body=module:%20queue_job_cron_jobrunner%0Aversion:%2014.0%0A%0A**Steps%20to%20reproduce**%0A-%20...%0A%0A**Current%20behavior**%0A%0A**Expected%20behavior**">feedback</a>.</p>
<p>Do not contact contributors directly about support or help with technical issues.</p>
</div>
<div class="section" id="credits">
<h1><a class="toc-backref" href="#toc-entry-4">Credits</a></h1>
<h1><a class="toc-backref" href="#toc-entry-3">Credits</a></h1>
<div class="section" id="authors">
<h2><a class="toc-backref" href="#toc-entry-5">Authors</a></h2>
<h2><a class="toc-backref" href="#toc-entry-4">Authors</a></h2>
<ul class="simple">
<li>Camptocamp SA</li>
</ul>
</div>
<div class="section" id="contributors">
<h2><a class="toc-backref" href="#toc-entry-6">Contributors</a></h2>
<h2><a class="toc-backref" href="#toc-entry-5">Contributors</a></h2>
<ul>
<li><p class="first"><a class="reference external" href="https://www.camptocamp.com">Camptocamp</a></p>
<blockquote>
Expand All @@ -460,7 +451,7 @@ <h2><a class="toc-backref" href="#toc-entry-6">Contributors</a></h2>
</ul>
</div>
<div class="section" id="maintainers">
<h2><a class="toc-backref" href="#toc-entry-7">Maintainers</a></h2>
<h2><a class="toc-backref" href="#toc-entry-6">Maintainers</a></h2>
<p>This module is maintained by the OCA.</p>
<a class="reference external image-reference" href="https://odoo-community.org">
<img alt="Odoo Community Association" src="https://odoo-community.org/logo.png" />
Expand Down
Loading