|
| 1 | +task_name,sites,eval_types,task_id,browsergym_split |
| 2 | +timewarp.1,wiki,llm_judge,1,test |
| 3 | +timewarp.2,wiki,llm_judge,2,test |
| 4 | +timewarp.3,wiki,llm_judge,3,test |
| 5 | +timewarp.4,wiki,llm_judge,4,test |
| 6 | +timewarp.5,wiki,llm_judge,5,test |
| 7 | +timewarp.6,wiki,llm_judge,6,test |
| 8 | +timewarp.7,wiki,llm_judge,7,test |
| 9 | +timewarp.8,wiki,llm_judge,8,test |
| 10 | +timewarp.9,wiki,llm_judge,9,test |
| 11 | +timewarp.10,wiki,llm_judge,10,test |
| 12 | +timewarp.11,wiki,llm_judge,11,test |
| 13 | +timewarp.12,wiki,llm_judge,12,test |
| 14 | +timewarp.13,wiki,llm_judge,13,test |
| 15 | +timewarp.14,wiki,llm_judge,14,test |
| 16 | +timewarp.15,wiki,llm_judge,15,test |
| 17 | +timewarp.16,wiki,llm_judge,16,test |
| 18 | +timewarp.17,wiki,llm_judge,17,test |
| 19 | +timewarp.18,wiki,llm_judge,18,test |
| 20 | +timewarp.19,wiki,llm_judge,19,test |
| 21 | +timewarp.20,wiki,llm_judge,20,test |
| 22 | +timewarp.21,wiki,llm_judge,21,test |
| 23 | +timewarp.22,wiki,llm_judge,22,test |
| 24 | +timewarp.23,wiki,llm_judge,23,test |
| 25 | +timewarp.24,wiki,llm_judge,24,test |
| 26 | +timewarp.25,wiki,llm_judge,25,test |
| 27 | +timewarp.26,wiki,llm_judge,26,test |
| 28 | +timewarp.27,wiki,llm_judge,27,test |
| 29 | +timewarp.28,wiki,llm_judge,28,test |
| 30 | +timewarp.29,wiki,llm_judge,29,test |
| 31 | +timewarp.30,wiki,llm_judge,30,test |
| 32 | +timewarp.31,wiki,llm_judge,31,test |
| 33 | +timewarp.32,news,llm_judge,32,test |
| 34 | +timewarp.33,news,llm_judge,33,test |
| 35 | +timewarp.34,news,llm_judge,34,test |
| 36 | +timewarp.35,news,llm_judge,35,test |
| 37 | +timewarp.36,news,llm_judge,36,test |
| 38 | +timewarp.37,news,llm_judge,37,test |
| 39 | +timewarp.38,news,llm_judge,38,test |
| 40 | +timewarp.39,news,llm_judge,39,test |
| 41 | +timewarp.40,news,llm_judge,40,test |
| 42 | +timewarp.41,news,llm_judge,41,test |
| 43 | +timewarp.42,news,llm_judge,42,test |
| 44 | +timewarp.43,news,llm_judge,43,test |
| 45 | +timewarp.44,news,llm_judge,44,test |
| 46 | +timewarp.45,news,llm_judge,45,test |
| 47 | +timewarp.46,news,llm_judge,46,test |
| 48 | +timewarp.47,news,llm_judge,47,test |
| 49 | +timewarp.48,news,llm_judge,48,test |
| 50 | +timewarp.49,news,llm_judge,49,test |
| 51 | +timewarp.50,news,llm_judge,50,test |
| 52 | +timewarp.51,news,llm_judge,51,test |
| 53 | +timewarp.52,news,llm_judge,52,test |
| 54 | +timewarp.53,news,llm_judge,53,test |
| 55 | +timewarp.54,webshop,llm_judge,54,test |
| 56 | +timewarp.55,webshop,llm_judge,55,test |
| 57 | +timewarp.56,webshop,llm_judge,56,test |
| 58 | +timewarp.57,webshop,llm_judge,57,test |
| 59 | +timewarp.58,webshop,llm_judge,58,test |
| 60 | +timewarp.59,webshop,llm_judge,59,test |
| 61 | +timewarp.60,webshop,llm_judge,60,test |
| 62 | +timewarp.61,webshop,llm_judge,61,test |
| 63 | +timewarp.62,webshop,llm_judge,62,test |
| 64 | +timewarp.63,webshop,llm_judge,63,test |
| 65 | +timewarp.64,webshop,llm_judge,64,test |
| 66 | +timewarp.65,webshop,llm_judge,65,test |
| 67 | +timewarp.66,webshop,llm_judge,66,test |
| 68 | +timewarp.67,webshop,llm_judge,67,test |
| 69 | +timewarp.68,webshop,llm_judge,68,test |
| 70 | +timewarp.69,webshop,llm_judge,69,test |
| 71 | +timewarp.70,webshop,llm_judge,70,test |
| 72 | +timewarp.71,webshop,llm_judge,71,test |
| 73 | +timewarp.72,webshop,llm_judge,72,test |
| 74 | +timewarp.73,webshop,llm_judge,73,test |
| 75 | +timewarp.74,webshop,llm_judge,74,test |
| 76 | +timewarp.75,webshop,llm_judge,75,test |
| 77 | +timewarp.76,webshop,llm_judge,76,test |
| 78 | +timewarp.77,webshop,llm_judge,77,test |
| 79 | +timewarp.78,webshop,llm_judge,78,test |
| 80 | +timewarp.79,webshop,llm_judge,79,test |
| 81 | +timewarp.80,webshop,llm_judge,80,test |
| 82 | +timewarp.81,"wiki,news",llm_judge,81,test |
| 83 | +timewarp.82,"wiki,news",llm_judge,82,test |
| 84 | +timewarp.83,"wiki,webshop",llm_judge,83,test |
| 85 | +timewarp.84,"wiki,news,webshop",llm_judge,84,test |
| 86 | +timewarp.85,"news,webshop",llm_judge,85,test |
| 87 | +timewarp.86,"wiki,news",llm_judge,86,test |
| 88 | +timewarp.87,"wiki,webshop",llm_judge,87,test |
| 89 | +timewarp.88,"news,webshop",llm_judge,88,test |
| 90 | +timewarp.89,"wiki,webshop",llm_judge,89,test |
| 91 | +timewarp.90,"wiki,webshop",llm_judge,90,test |
| 92 | +timewarp.91,"wiki,news",llm_judge,91,test |
| 93 | +timewarp.92,"wiki,news",llm_judge,92,test |
| 94 | +timewarp.93,"wiki,news",llm_judge,93,test |
| 95 | +timewarp.94,"wiki,news,webshop",llm_judge,94,test |
| 96 | +timewarp.95,"wiki,news,webshop",llm_judge,95,test |
| 97 | +timewarp.96,"news,webshop",llm_judge,96,test |
| 98 | +timewarp.97,"wiki,news,webshop",llm_judge,97,test |
| 99 | +timewarp.98,"wiki,news",llm_judge,98,test |
| 100 | +timewarp.99,"wiki,news,webshop",llm_judge,99,test |
| 101 | +timewarp.100,"wiki,news,webshop",llm_judge,100,test |
| 102 | +timewarp.101,"wiki,webshop",llm_judge,101,test |
| 103 | +timewarp.102,"wiki,news",llm_judge,102,test |
| 104 | +timewarp.103,"wiki,news",llm_judge,103,test |
| 105 | +timewarp.104,wiki,llm_judge,104,train |
| 106 | +timewarp.105,wiki,llm_judge,105,train |
| 107 | +timewarp.106,wiki,llm_judge,106,train |
| 108 | +timewarp.107,wiki,llm_judge,107,train |
| 109 | +timewarp.108,wiki,llm_judge,108,train |
| 110 | +timewarp.109,wiki,llm_judge,109,train |
| 111 | +timewarp.110,wiki,llm_judge,110,train |
| 112 | +timewarp.111,wiki,llm_judge,111,train |
| 113 | +timewarp.112,wiki,llm_judge,112,train |
| 114 | +timewarp.113,wiki,llm_judge,113,train |
| 115 | +timewarp.114,wiki,llm_judge,114,train |
| 116 | +timewarp.115,wiki,llm_judge,115,train |
| 117 | +timewarp.116,wiki,llm_judge,116,train |
| 118 | +timewarp.117,wiki,llm_judge,117,train |
| 119 | +timewarp.118,wiki,llm_judge,118,train |
| 120 | +timewarp.119,wiki,llm_judge,119,train |
| 121 | +timewarp.120,wiki,llm_judge,120,train |
| 122 | +timewarp.121,wiki,llm_judge,121,train |
| 123 | +timewarp.122,wiki,llm_judge,122,train |
| 124 | +timewarp.123,wiki,llm_judge,123,train |
| 125 | +timewarp.124,wiki,llm_judge,124,train |
| 126 | +timewarp.125,wiki,llm_judge,125,train |
| 127 | +timewarp.126,wiki,llm_judge,126,train |
| 128 | +timewarp.127,wiki,llm_judge,127,train |
| 129 | +timewarp.128,wiki,llm_judge,128,train |
| 130 | +timewarp.129,wiki,llm_judge,129,train |
| 131 | +timewarp.130,wiki,llm_judge,130,train |
| 132 | +timewarp.131,wiki,llm_judge,131,train |
| 133 | +timewarp.132,wiki,llm_judge,132,train |
| 134 | +timewarp.133,wiki,llm_judge,133,train |
| 135 | +timewarp.134,wiki,llm_judge,134,train |
| 136 | +timewarp.135,wiki,llm_judge,135,train |
| 137 | +timewarp.136,wiki,llm_judge,136,train |
| 138 | +timewarp.137,wiki,llm_judge,137,train |
| 139 | +timewarp.138,wiki,llm_judge,138,train |
| 140 | +timewarp.139,wiki,llm_judge,139,train |
| 141 | +timewarp.140,wiki,llm_judge,140,train |
| 142 | +timewarp.141,wiki,llm_judge,141,train |
| 143 | +timewarp.142,wiki,llm_judge,142,train |
| 144 | +timewarp.143,news,llm_judge,143,train |
| 145 | +timewarp.144,news,llm_judge,144,train |
| 146 | +timewarp.145,news,llm_judge,145,train |
| 147 | +timewarp.146,news,llm_judge,146,train |
| 148 | +timewarp.147,news,llm_judge,147,train |
| 149 | +timewarp.148,news,llm_judge,148,train |
| 150 | +timewarp.149,news,llm_judge,149,train |
| 151 | +timewarp.150,news,llm_judge,150,train |
| 152 | +timewarp.151,news,llm_judge,151,train |
| 153 | +timewarp.152,news,llm_judge,152,train |
| 154 | +timewarp.153,news,llm_judge,153,train |
| 155 | +timewarp.154,news,llm_judge,154,train |
| 156 | +timewarp.155,news,llm_judge,155,train |
| 157 | +timewarp.156,news,llm_judge,156,train |
| 158 | +timewarp.157,news,llm_judge,157,train |
| 159 | +timewarp.158,news,llm_judge,158,train |
| 160 | +timewarp.159,news,llm_judge,159,train |
| 161 | +timewarp.160,news,llm_judge,160,train |
| 162 | +timewarp.161,news,llm_judge,161,train |
| 163 | +timewarp.162,news,llm_judge,162,train |
| 164 | +timewarp.163,news,llm_judge,163,train |
| 165 | +timewarp.164,news,llm_judge,164,train |
| 166 | +timewarp.165,news,llm_judge,165,train |
| 167 | +timewarp.166,news,llm_judge,166,train |
| 168 | +timewarp.167,news,llm_judge,167,train |
| 169 | +timewarp.168,webshop,llm_judge,168,train |
| 170 | +timewarp.169,webshop,llm_judge,169,train |
| 171 | +timewarp.170,webshop,llm_judge,170,train |
| 172 | +timewarp.171,webshop,llm_judge,171,train |
| 173 | +timewarp.172,webshop,llm_judge,172,train |
| 174 | +timewarp.173,webshop,llm_judge,173,train |
| 175 | +timewarp.174,webshop,llm_judge,174,train |
| 176 | +timewarp.175,webshop,llm_judge,175,train |
| 177 | +timewarp.176,webshop,llm_judge,176,train |
| 178 | +timewarp.177,webshop,llm_judge,177,train |
| 179 | +timewarp.178,webshop,llm_judge,178,train |
| 180 | +timewarp.179,webshop,llm_judge,179,train |
| 181 | +timewarp.180,webshop,llm_judge,180,train |
| 182 | +timewarp.181,webshop,llm_judge,181,train |
| 183 | +timewarp.182,webshop,llm_judge,182,train |
| 184 | +timewarp.183,webshop,llm_judge,183,train |
| 185 | +timewarp.184,webshop,llm_judge,184,train |
| 186 | +timewarp.185,webshop,llm_judge,185,train |
| 187 | +timewarp.186,webshop,llm_judge,186,train |
| 188 | +timewarp.187,webshop,llm_judge,187,train |
| 189 | +timewarp.188,webshop,llm_judge,188,train |
| 190 | +timewarp.189,webshop,llm_judge,189,train |
| 191 | +timewarp.190,webshop,llm_judge,190,train |
| 192 | +timewarp.191,webshop,llm_judge,191,train |
| 193 | +timewarp.192,webshop,llm_judge,192,train |
| 194 | +timewarp.193,webshop,llm_judge,193,train |
| 195 | +timewarp.194,webshop,llm_judge,194,train |
| 196 | +timewarp.195,webshop,llm_judge,195,train |
| 197 | +timewarp.196,webshop,llm_judge,196,train |
| 198 | +timewarp.197,webshop,llm_judge,197,train |
| 199 | +timewarp.198,webshop,llm_judge,198,train |
| 200 | +timewarp.199,webshop,llm_judge,199,train |
| 201 | +timewarp.200,webshop,llm_judge,200,train |
| 202 | +timewarp.201,webshop,llm_judge,201,train |
| 203 | +timewarp.202,webshop,llm_judge,202,train |
| 204 | +timewarp.203,webshop,llm_judge,203,train |
| 205 | +timewarp.204,webshop,llm_judge,204,train |
| 206 | +timewarp.205,"wiki,news",llm_judge,205,train |
| 207 | +timewarp.206,"wiki,news",llm_judge,206,train |
| 208 | +timewarp.207,"wiki,webshop,news",llm_judge,207,train |
| 209 | +timewarp.208,"news,webshop",llm_judge,208,train |
| 210 | +timewarp.209,"wiki,news",llm_judge,209,train |
| 211 | +timewarp.210,"wiki,webshop,news",llm_judge,210,train |
| 212 | +timewarp.211,"news,webshop",llm_judge,211,train |
| 213 | +timewarp.212,"wiki,webshop",llm_judge,212,train |
| 214 | +timewarp.213,"wiki,webshop",llm_judge,213,train |
| 215 | +timewarp.214,"wiki,news",llm_judge,214,train |
| 216 | +timewarp.215,"wiki,news",llm_judge,215,train |
| 217 | +timewarp.216,"wiki,news,webshop",llm_judge,216,train |
| 218 | +timewarp.217,"wiki,news,webshop",llm_judge,217,train |
| 219 | +timewarp.218,"news,webshop",llm_judge,218,train |
| 220 | +timewarp.219,"wiki,news,webshop",llm_judge,219,train |
| 221 | +timewarp.220,"wiki,news,webshop",llm_judge,220,train |
| 222 | +timewarp.221,"wiki,news,webshop",llm_judge,221,train |
| 223 | +timewarp.222,"wiki,webshop",llm_judge,222,train |
| 224 | +timewarp.223,"wiki,news",llm_judge,223,train |
| 225 | +timewarp.224,"wiki,news",llm_judge,224,train |
| 226 | +timewarp.225,"wiki,news",llm_judge,225,train |
| 227 | +timewarp.226,"news,webshop",llm_judge,226,train |
| 228 | +timewarp.227,"news,webshop",llm_judge,227,train |
| 229 | +timewarp.228,"wiki,news",llm_judge,228,train |
| 230 | +timewarp.229,"news,webshop",llm_judge,229,train |
| 231 | +timewarp.230,"wiki,news",llm_judge,230,train |
| 232 | +timewarp.231,"wiki,news,webshop",llm_judge,231,train |
0 commit comments