Skip to content

Commit e35ed43

Browse files
committed
Auto merge of #392 - Zeegomo:master, r=pietroalbini
Report agent crashes to the server. Fixes #246. Any error encountered by the agent is reported to the server, which posts it to the relevant GitHub thread. Adds a new `failed` experiment status and a `retry` command to re-queue jobs marked as failed.
2 parents 2803f91 + d891acb commit e35ed43

File tree

9 files changed

+149
-5
lines changed

9 files changed

+149
-5
lines changed

docs/agent-http-api.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ behave this way:
6060
* `POST /agent-api/complete-experiment` should be called as soon as the agent
6161
has nothing left to do with the current experiment; after the method returns
6262
`next-experiment` will return a new experiment
63+
* `POST /agent-api/error` should be called only when the agent has encountered an error
6364

6465
## Available endpoints
6566

@@ -227,3 +228,28 @@ The endpoint replies with `true`.
227228
"result": true
228229
}
229230
```
231+
232+
### `POST /error`
233+
234+
This endpoint tells the Crater server the agent has encountered an error.
235+
The endpoint expects the error description to be provided as the request body,
236+
encoded in JSON:
237+
238+
* `error`: a description of the error
239+
240+
For example, this is valid request data:
241+
242+
```json
243+
{
244+
"error": "pc is not powered on"
245+
}
246+
```
247+
248+
The endpoint replies with `true`.
249+
250+
```json
251+
{
252+
"status": "success",
253+
"result": true
254+
}
255+
```

src/agent/api.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,4 +190,15 @@ impl AgentApi {
190190
Ok(())
191191
})
192192
}
193+
194+
pub fn report_error(&self, error: String) -> Fallible<()> {
195+
self.retry(|this| {
196+
let _: bool = this
197+
.build_request(Method::POST, "error")
198+
.json(&json!({ "error": error }))
199+
.send()?
200+
.to_api_response()?;
201+
Ok(())
202+
})
203+
}
193204
}

src/agent/mod.rs

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ mod api;
22
mod results;
33

44
use crate::agent::api::AgentApi;
5+
use crate::agent::results::ResultsUploader;
56
use crate::config::Config;
67
use crate::experiments::Experiment;
78
use crate::prelude::*;
@@ -47,15 +48,34 @@ fn run_heartbeat(url: &str, token: &str) {
4748
});
4849
}
4950

51+
fn run_experiment(
52+
agent: &Agent,
53+
db: &ResultsUploader,
54+
threads_count: usize,
55+
docker_env: &str,
56+
) -> Fallible<()> {
57+
let ex = agent.experiment()?;
58+
crate::runner::run_ex(&ex, db, threads_count, &agent.config, docker_env)?;
59+
agent.api.complete_experiment()?;
60+
Ok(())
61+
}
62+
5063
pub fn run(url: &str, token: &str, threads_count: usize, docker_env: &str) -> Fallible<()> {
5164
let agent = Agent::new(url, token)?;
5265
let db = results::ResultsUploader::new(&agent.api);
5366

5467
run_heartbeat(url, token);
5568

5669
loop {
57-
let ex = agent.experiment()?;
58-
crate::runner::run_ex(&ex, &db, threads_count, &agent.config, docker_env)?;
59-
agent.api.complete_experiment()?;
70+
if let Err(err) = run_experiment(&agent, &db, threads_count, docker_env) {
71+
utils::report_failure(&err);
72+
if let Err(e) = agent
73+
.api
74+
.report_error(format!("{}", err.find_root_cause()))
75+
.with_context(|_| "error encountered")
76+
{
77+
utils::report_failure(&e);
78+
}
79+
}
6080
}
6181
}

src/experiments.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ string_enum!(pub enum Status {
1212
Queued => "queued",
1313
Running => "running",
1414
NeedsReport => "needs-report",
15+
Failed => "failed",
1516
GeneratingReport => "generating-report",
1617
ReportFailed => "report-failed",
1718
Completed => "completed",
@@ -226,7 +227,10 @@ impl Experiment {
226227
)?;
227228
self.started_at = Some(now);
228229
// Check if the old status was "running" and there is no completed date
229-
} else if self.status == Status::Running && self.completed_at.is_none() {
230+
} else if self.status == Status::Running
231+
&& self.completed_at.is_none()
232+
&& status != Status::Failed
233+
{
230234
db.execute(
231235
"UPDATE experiments SET completed_at = ?1 WHERE name = ?2;",
232236
&[&now, &self.name.as_str()],

src/server/routes/agent.rs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use crate::server::{Data, HttpError};
88
use failure::Compat;
99
use http::{Response, StatusCode};
1010
use hyper::Body;
11+
use std::collections::HashMap;
1112
use std::sync::Arc;
1213
use warp::{self, Filter, Rejection};
1314

@@ -53,6 +54,14 @@ pub fn routes(
5354
.and(auth_filter(data.clone(), TokenType::Agent))
5455
.map(endpoint_heartbeat);
5556

57+
let error = warp::post2()
58+
.and(warp::path("error"))
59+
.and(warp::path::end())
60+
.and(warp::body::json())
61+
.and(data_filter.clone())
62+
.and(auth_filter(data.clone(), TokenType::Agent))
63+
.map(endpoint_error);
64+
5665
warp::any()
5766
.and(
5867
config
@@ -63,6 +72,8 @@ pub fn routes(
6372
.or(record_progress)
6473
.unify()
6574
.or(heartbeat)
75+
.unify()
76+
.or(error)
6677
.unify(),
6778
)
6879
.map(handle_results)
@@ -146,6 +157,39 @@ fn endpoint_heartbeat(data: Arc<Data>, auth: AuthDetails) -> Fallible<Response<B
146157
Ok(ApiResponse::Success { result: true }.into_response()?)
147158
}
148159

160+
fn endpoint_error(
161+
error: HashMap<String, String>,
162+
data: Arc<Data>,
163+
auth: AuthDetails,
164+
) -> Fallible<Response<Body>> {
165+
let mut ex = Experiment::run_by(&data.db, &Assignee::Agent(auth.name.clone()))?
166+
.ok_or_else(|| err_msg("no experiment run by this agent"))?;
167+
168+
ex.set_status(&data.db, Status::Failed)?;
169+
170+
if let Some(ref github_issue) = ex.github_issue {
171+
Message::new()
172+
.line(
173+
"rotating_light",
174+
format!(
175+
"Experiment **`{}`** has encountered an error: {}",
176+
ex.name,
177+
error.get("error").unwrap_or(&String::from("no error")),
178+
),
179+
)
180+
.line(
181+
"hammer_and_wrench",
182+
"If the error is fixed use the `retry` command.",
183+
)
184+
.note(
185+
"sos",
186+
"Can someone from the infra team check in on this? @rust-lang/infra",
187+
)
188+
.send(&github_issue.api_url, &data)?;
189+
}
190+
Ok(ApiResponse::Success { result: true }.into_response()?)
191+
}
192+
149193
fn handle_results(resp: Fallible<Response<Body>>) -> Response<Body> {
150194
match resp {
151195
Ok(resp) => resp,

src/server/routes/ui/experiments.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ impl ExperimentData {
2525
Status::Queued => ("", "Queued", true),
2626
Status::Running => ("orange", "Running", true),
2727
Status::NeedsReport => ("orange", "Needs report", false),
28+
Status::Failed => ("red", "Failed", false),
2829
Status::GeneratingReport => ("orange", "Generating report", false),
2930
Status::ReportFailed => ("red", "Report failed", false),
3031
Status::Completed => ("green", "Completed", false),
@@ -62,6 +63,7 @@ pub fn endpoint_queue(data: Arc<Data>) -> Fallible<Response<Body>> {
6263
let mut queued = Vec::new();
6364
let mut running = Vec::new();
6465
let mut needs_report = Vec::new();
66+
let mut failed = Vec::new();
6567
let mut generating_report = Vec::new();
6668
let mut report_failed = Vec::new();
6769

@@ -77,6 +79,7 @@ pub fn endpoint_queue(data: Arc<Data>) -> Fallible<Response<Body>> {
7779
Status::Queued => queued.push(ex),
7880
Status::Running => running.push(ex),
7981
Status::NeedsReport => needs_report.push(ex),
82+
Status::Failed => failed.push(ex),
8083
Status::GeneratingReport => generating_report.push(ex),
8184
Status::ReportFailed => report_failed.push(ex),
8285
Status::Completed => unreachable!(),
@@ -87,6 +90,7 @@ pub fn endpoint_queue(data: Arc<Data>) -> Fallible<Response<Body>> {
8790
experiments.append(&mut report_failed);
8891
experiments.append(&mut generating_report);
8992
experiments.append(&mut needs_report);
93+
experiments.append(&mut failed);
9094
experiments.append(&mut running);
9195
experiments.append(&mut queued);
9296

src/server/routes/webhooks/args.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,10 @@ generate_parser!(pub enum Command {
122122
name: Option<String> = "name",
123123
})
124124

125+
"retry" => Retry(RetryArgs {
126+
name: Option<String> = "name",
127+
})
128+
125129
"reload-acl" => ReloadACL(ReloadACLArgs {})
126130

127131
_ => Edit(EditArgs {

src/server/routes/webhooks/commands.rs

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@ use crate::experiments::{CapLints, CrateSelect, Experiment, GitHubIssue, Mode, S
44
use crate::prelude::*;
55
use crate::server::github::Issue;
66
use crate::server::messages::{Label, Message};
7-
use crate::server::routes::webhooks::args::{AbortArgs, EditArgs, RetryReportArgs, RunArgs};
7+
use crate::server::routes::webhooks::args::{
8+
AbortArgs, EditArgs, RetryArgs, RetryReportArgs, RunArgs,
9+
};
810
use crate::server::Data;
911

1012
pub fn ping(data: &Data, issue: &Issue) -> Fallible<()> {
@@ -108,6 +110,31 @@ pub fn retry_report(data: &Data, issue: &Issue, args: RetryReportArgs) -> Fallib
108110
}
109111
}
110112

113+
pub fn retry(data: &Data, issue: &Issue, args: RetryArgs) -> Fallible<()> {
114+
let name = get_name(&data.db, issue, args.name)?;
115+
116+
if let Some(mut experiment) = Experiment::get(&data.db, &name)? {
117+
if experiment.status != Status::Failed {
118+
bail!("Experiment **`{}`** didn't fail!", name);
119+
}
120+
121+
experiment.set_status(&data.db, Status::Queued)?;
122+
data.reports_worker.wake();
123+
124+
Message::new()
125+
.line(
126+
"hammer_and_wrench",
127+
format!("Experiment **`{}`** queued again.", name),
128+
)
129+
.set_label(Label::ExperimentQueued)
130+
.send(&issue.url, data)?;
131+
132+
Ok(())
133+
} else {
134+
bail!("an experiment named **`{}`** doesn't exist!", name);
135+
}
136+
}
137+
111138
pub fn abort(data: &Data, issue: &Issue, args: AbortArgs) -> Fallible<()> {
112139
let name = get_name(&data.db, issue, args.name)?;
113140

src/server/routes/webhooks/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,10 @@ fn process_command(
111111
commands::retry_report(data, issue, args)?;
112112
}
113113

114+
Command::Retry(args) => {
115+
commands::retry(data, issue, args)?;
116+
}
117+
114118
Command::Abort(args) => {
115119
commands::abort(data, issue, args)?;
116120
}

0 commit comments

Comments
 (0)