Skip to content

Commit d248a3a

Browse files
committed
LiveCodeBench v6
1 parent 0661f90 commit d248a3a

10 files changed

+80787
-173976
lines changed

build/asset-manifest.json

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
{
22
"files": {
33
"main.css": "./static/css/main.be588df3.css",
4-
"main.js": "./static/js/main.0d028f0f.js",
4+
"main.js": "./static/js/main.8c8c9279.js",
55
"index.html": "./index.html",
66
"main.be588df3.css.map": "./static/css/main.be588df3.css.map",
7-
"main.0d028f0f.js.map": "./static/js/main.0d028f0f.js.map"
7+
"main.8c8c9279.js.map": "./static/js/main.8c8c9279.js.map"
88
},
99
"entrypoints": [
1010
"static/css/main.be588df3.css",
11-
"static/js/main.0d028f0f.js"
11+
"static/js/main.8c8c9279.js"
1212
]
1313
}

build/index.html

+1-1
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
<div class="columns is-centered">
5555
<div class="column has-text-centered">
5656
<h1 class="title is-1 publication-title">
57-
LiveCodeBench: Holistic and Contamination-Free Evaluation of
57+
LiveCodeBench: Holistic and Contamination Free Evaluation of
5858
Large Language Models for Code
5959
</h1>
6060
<div class="is-size-5 publication-authors">

build/leaderboard.html

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
<!doctype html><html lang="en"><head><title>LiveCodeBench Leaderboard</title><meta charset="UTF-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="LiveCodeBench Leaderboard"/><link rel="stylesheet" href="bootstrap.min.css"/><link rel="stylesheet" href="./css/bulma.min.css"/><link rel="stylesheet" href="./css/bulma-carousel.min.css"/><link rel="stylesheet" href="./css/bulma-slider.min.css"/><link rel="stylesheet" href="./css/fontawesome.all.min.css"/><link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css"/><link rel="stylesheet" href="./css/index.css"/><link rel="icon" href="./images/favicon.svg"/><script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script><script defer="defer" src="./js/fontawesome.all.min.js"></script><script src="./js/bulma-carousel.min.js"></script><script src="./js/bulma-slider.min.js"></script><script src="./js/index.js"></script><script defer="defer" src="./static/js/main.0d028f0f.js"></script><link href="./static/css/main.be588df3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
1+
<!doctype html><html lang="en"><head><title>LiveCodeBench Leaderboard</title><meta charset="UTF-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="LiveCodeBench Leaderboard"/><link rel="stylesheet" href="bootstrap.min.css"/><link rel="stylesheet" href="./css/bulma.min.css"/><link rel="stylesheet" href="./css/bulma-carousel.min.css"/><link rel="stylesheet" href="./css/bulma-slider.min.css"/><link rel="stylesheet" href="./css/fontawesome.all.min.css"/><link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css"/><link rel="stylesheet" href="./css/index.css"/><link rel="icon" href="./images/favicon.svg"/><script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script><script defer="defer" src="./js/fontawesome.all.min.js"></script><script src="./js/bulma-carousel.min.js"></script><script src="./js/bulma-slider.min.js"></script><script src="./js/index.js"></script><script defer="defer" src="./static/js/main.8c8c9279.js"></script><link href="./static/css/main.be588df3.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>

build/static/js/main.0d028f0f.js renamed to build/static/js/main.8c8c9279.js

+3-3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

build/static/js/main.0d028f0f.js.map renamed to build/static/js/main.8c8c9279.js.map

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

public/index_home.html

+1-1
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
<div class="columns is-centered">
5555
<div class="column has-text-centered">
5656
<h1 class="title is-1 publication-title">
57-
LiveCodeBench: Holistic and Contamination-Free Evaluation of
57+
LiveCodeBench: Holistic and Contamination Free Evaluation of
5858
Large Language Models for Code
5959
</h1>
6060
<div class="is-size-5 publication-authors">

src/LeaderboardComp.tsx

+2-2
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ const Leaderboard = React.memo(function LeaderboardComponent(props: any) {
143143

144144
const numProblems = performances.filter(
145145
(result: any) =>
146-
result["model"] === "GPT-4O-2024-05-13" &&
146+
result["model"] === "GPT-4O-2024-08-06" &&
147147
result["date"] >= dateStartAndEnd[0] &&
148148
result["date"] <= dateStartAndEnd[1]
149149
).length;
@@ -237,7 +237,7 @@ const Leaderboard = React.memo(function LeaderboardComponent(props: any) {
237237

238238
message += "<br><br>We estimate cutoff dates based on release date and performance variation. Models highlighted in red are likely contaminated on some fraction of the problems in the given time-window. Feel free to adjust the slider to see the leaderboard at different time windows. Please offer feedback if you find any issues!"
239239

240-
message += "<br><br>Announcements: <br>1. We have made revisions to our official autograder, fixing some unhandled cases. In case you are performing local evaluations, please use the latest codebase. <br>2. We have been introducing larger fraction of difficult problems for the more recent releases in lines with model capability improvements. A drop in performance in the later months is expected."
240+
message += "<br><br>Note: We have been introducing a larger fraction of difficult problems for the more recent releases, in line with model capability improvements. A drop in performance in the later months is expected."
241241

242242

243243

src/leaderboardLib.tsx

+7
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ function get_pass_at_1(
5151
result["date"] >= start &&
5252
result["date"] <= end
5353
)
54+
console.log("Results: ", results)
5455

5556
const average_pass = formatNumber(
5657
mean(results.map((result) => result["pass@1"]))
@@ -86,6 +87,12 @@ function get_pass_at_1(
8687
)
8788

8889
// console.log("COT PASS: ", cot_pass, cot_pass != undefined, cot_pass != null, cot_pass.toString() != "NaN")
90+
console.log(
91+
"Model: ",
92+
model,
93+
"Average Pass: ",
94+
average_pass,
95+
);
8996

9097
return {
9198
average_pass,

0 commit comments

Comments
 (0)