
Commit f757103

Moves left head for OpenCL backend (LeelaChessZero#1146)
1 parent a371dca commit f757103

File tree

src/neural/opencl/OpenCL.h
src/neural/opencl/OpenCLBuffers.cc
src/neural/opencl/OpenCLBuffers.h
src/neural/opencl/network_opencl.cc

4 files changed: +135 -24 lines

src/neural/opencl/OpenCL.h

+19

@@ -68,6 +68,7 @@ class Layer {
   bool is_policy{false};
   bool is_conv_policy{false};
   bool is_value{false};
+  bool is_moves_left{false};
   std::vector<cl::Buffer> weights;
 };
 
@@ -187,6 +188,24 @@ class OpenCL_Network {
     m_layers[layer].ip_out_size = ip_out;
   }
 
+  void push_moves_left(unsigned int channels, unsigned int outputs,
+                       unsigned int ip_in, unsigned int ip_out,
+                       const std::vector<float>& weights,
+                       const std::vector<float>& biases,
+                       const std::vector<float>& fc_w,
+                       const std::vector<float>& fc_b) {
+    size_t layer = get_layer_count();
+    push_weights(layer, weights);
+    push_weights(layer, biases);
+    push_weights(layer, fc_w);
+    push_weights(layer, fc_b);
+    m_layers[layer].is_moves_left = true;
+    m_layers[layer].outputs = outputs;
+    m_layers[layer].channels = channels;
+    m_layers[layer].ip_in_size = ip_in;
+    m_layers[layer].ip_out_size = ip_out;
+  }
+
   size_t get_layer_count() const { return m_layers.size(); }
 
  private:
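
Note on the API above: push_moves_left pushes the head's four weight vectors in a fixed order (1x1 conv weights, conv biases, first FC weights, first FC biases) and tags the layer with is_moves_left; the forward pass in OpenCLBuffers.cc later walks layer.weights by position, so this order is part of the contract. A minimal self-contained mock of that ordering, not lc0 code, with std::string standing in for cl::Buffer and made-up dimensions:

#include <cassert>
#include <string>
#include <vector>

// Mock of the relevant Layer / OpenCL_Network pieces; only the order of the
// pushed buffers matters here.
struct MockLayer {
  bool is_moves_left = false;
  unsigned int outputs = 0, channels = 0, ip_in_size = 0, ip_out_size = 0;
  std::vector<std::string> weights;  // the real class stores cl::Buffer
};

struct MockNetwork {
  std::vector<MockLayer> m_layers;
  void push_moves_left(unsigned int channels, unsigned int outputs,
                       unsigned int ip_in, unsigned int ip_out) {
    m_layers.emplace_back();
    auto& layer = m_layers.back();
    // Same order as the push_weights calls above: conv weights, conv biases,
    // then the first fully connected layer's weights and biases.
    layer.weights = {"moves_left.weights", "moves_left.biases",
                     "ip1_mov_w", "ip1_mov_b"};
    layer.is_moves_left = true;
    layer.outputs = outputs;
    layer.channels = channels;
    layer.ip_in_size = ip_in;
    layer.ip_out_size = ip_out;
  }
};

int main() {
  MockNetwork net;
  // Hypothetical dimensions: 256 input channels, an 8-plane head,
  // 128 channels in the first FC layer.
  net.push_moves_left(/*channels=*/256, /*outputs=*/8,
                      /*ip_in=*/8 * 8 * 8, /*ip_out=*/128);
  assert(net.m_layers.back().weights[2] == "ip1_mov_w");
}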

src/neural/opencl/OpenCLBuffers.cc

+58 -15

@@ -46,13 +46,23 @@ OpenCLBuffers::OpenCLBuffers(const OpenCL_Network& opencl_net)
   constexpr auto width = 8;
   constexpr auto height = 8;
 
-  auto finalSize_pol = layers[layers.size() - 2].ip_out_size * sizeof(net_t);
-  auto finalSize_val = layers.back().ip_out_size * sizeof(net_t);
+  m_finalSize_pol = 0;
+  m_finalSize_val = 0;
+  m_finalSize_mov = 0;
 
   auto max_channels = unsigned{0};
   for (const auto& layer : layers) {
     max_channels =
         std::max(max_channels, std::max(layer.channels, layer.outputs));
+    if (layer.is_policy || layer.is_conv_policy) {
+      m_finalSize_pol = layer.ip_out_size * sizeof(net_t);
+    }
+    if (layer.is_value) {
+      m_finalSize_val = layer.ip_out_size * sizeof(net_t);
+    }
+    if (layer.is_moves_left) {
+      m_finalSize_mov = layer.ip_out_size * sizeof(net_t);
+    }
   }
 
   const auto mwg = m_opencl.m_sgemm_tuners.mwg;

@@ -86,16 +96,35 @@ OpenCLBuffers::OpenCLBuffers(const OpenCL_Network& opencl_net)
   try {
     m_pinnedOutBuffer_pol = cl::Buffer(
         m_opencl.m_context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
-        max_batch_size * finalSize_pol);
+        max_batch_size * m_finalSize_pol);
   } catch (const cl::Error& e) {
     CERR << "Error in m_pinnedOutBuffer_pol: " << e.what() << ": " << e.err()
          << std::endl;
     throw;
   }
 
-  m_pinnedOutBuffer_val =
-      cl::Buffer(m_opencl.m_context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
-                 max_batch_size * finalSize_val);
+  try {
+    m_pinnedOutBuffer_val = cl::Buffer(
+        m_opencl.m_context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
+        max_batch_size * m_finalSize_val);
+  } catch (const cl::Error& e) {
+    CERR << "Error in m_pinnedOutBuffer_val: " << e.what() << ": " << e.err()
+         << std::endl;
+    throw;
+  }
+
+  if (m_finalSize_mov > 0) {
+    try {
+      m_pinnedOutBuffer_mov = cl::Buffer(
+          m_opencl.m_context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
+          max_batch_size * m_finalSize_mov);
+    } catch (const cl::Error& e) {
+      CERR << "Error in m_pinnedOutBuffer_mov: " << e.what() << ": " << e.err()
+           << std::endl;
+      throw;
+    }
+  }
+
   m_pool_buffer =
       cl::Buffer(m_opencl.m_context, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS,
                  alloc_pool_size);

@@ -104,12 +133,10 @@ OpenCLBuffers::OpenCLBuffers(const OpenCL_Network& opencl_net)
 void OpenCLBuffers::forward(const std::vector<net_t>& input,
                             std::vector<net_t>& output_pol,
                             std::vector<net_t>& output_val,
+                            std::vector<net_t>& output_mov,
                             const int batch_size) {
   auto& layers = m_opencl_net.m_layers;
 
-  auto finalSize_pol = layers[layers.size() - 2].ip_out_size * sizeof(net_t);
-  auto finalSize_val = layers.back().ip_out_size * sizeof(net_t);
-
   const auto inSize = sizeof(net_t) * input.size();
   m_commandqueue.enqueueWriteBuffer(m_inBuffer, CL_FALSE, 0, inSize,
                                     input.data());

@@ -241,13 +268,15 @@ void OpenCLBuffers::forward(const std::vector<net_t>& input,
           layer.outputs * 8 * 8, layer.ip_in_size, layer.ip_out_size);
 
     } else {
-      assert(layer.is_value || layer.is_policy);
+      assert(layer.is_value || layer.is_policy || layer.is_moves_left);
 
       cl::Buffer out_buffer;
       if (layer.is_policy) {
         out_buffer = m_pinnedOutBuffer_pol;
-      } else {
+      } else if (layer.is_value) {
        out_buffer = m_pinnedOutBuffer_val;
+      } else {
+        out_buffer = m_pinnedOutBuffer_mov;
       }
 
       auto conv_weights = begin(layer.weights);

@@ -265,22 +294,36 @@ void OpenCLBuffers::forward(const std::vector<net_t>& input,
 
   auto pinnedOutBufferHost_pol = m_commandqueue.enqueueMapBuffer(
       m_pinnedOutBuffer_pol, CL_FALSE, CL_MAP_READ, 0,
-      batch_size * finalSize_pol);
+      batch_size * m_finalSize_pol);
   auto pinnedOutBufferHost_val = m_commandqueue.enqueueMapBuffer(
       m_pinnedOutBuffer_val, CL_FALSE, CL_MAP_READ, 0,
-      batch_size * finalSize_val);
+      batch_size * m_finalSize_val);
+  void* pinnedOutBufferHost_mov;
+  if (m_finalSize_mov > 0) {
+    pinnedOutBufferHost_mov = m_commandqueue.enqueueMapBuffer(
+        m_pinnedOutBuffer_mov, CL_FALSE, CL_MAP_READ, 0,
+        batch_size * m_finalSize_mov);
+  }
 
   m_commandqueue.finish();
 
   std::memcpy(output_pol.data(), pinnedOutBufferHost_pol,
-              batch_size * finalSize_pol);
+              batch_size * m_finalSize_pol);
   std::memcpy(output_val.data(), pinnedOutBufferHost_val,
-              batch_size * finalSize_val);
+              batch_size * m_finalSize_val);
+  if (m_finalSize_mov > 0) {
+    std::memcpy(output_mov.data(), pinnedOutBufferHost_mov,
+                batch_size * m_finalSize_mov);
+  }
 
   m_commandqueue.enqueueUnmapMemObject(m_pinnedOutBuffer_pol,
                                        pinnedOutBufferHost_pol);
   m_commandqueue.enqueueUnmapMemObject(m_pinnedOutBuffer_val,
                                        pinnedOutBufferHost_val);
+  if (m_finalSize_mov > 0) {
+    m_commandqueue.enqueueUnmapMemObject(m_pinnedOutBuffer_mov,
+                                         pinnedOutBufferHost_mov);
+  }
 }
 
 void OpenCLBuffers::convolve3(int channels, int outputs, cl::Buffer& bufferIn,
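
Note on the constructor change above: the old code assumed the policy head was the second-to-last layer and the value head the last one; the new code scans the layer flags instead, so a moves-left head may be present or absent and m_finalSize_mov stays 0 when it is absent, which is what gates the extra allocation, map, memcpy and unmap in forward(). A standalone sketch of just that sizing logic (mock layer type, float for net_t, made-up sizes; no OpenCL involved):

#include <cstdio>
#include <vector>

struct MockLayer {
  bool is_policy = false, is_conv_policy = false;
  bool is_value = false, is_moves_left = false;
  unsigned int ip_out_size = 0;  // elements written per position
};

int main() {
  using net_t = float;  // assumption: the real net_t depends on the build

  // Made-up head sizes for illustration only.
  std::vector<MockLayer> layers(3);
  layers[0].is_policy = true;      layers[0].ip_out_size = 1858;
  layers[1].is_value = true;       layers[1].ip_out_size = 128;
  layers[2].is_moves_left = true;  layers[2].ip_out_size = 128;

  size_t finalSize_pol = 0, finalSize_val = 0, finalSize_mov = 0;
  for (const auto& layer : layers) {
    if (layer.is_policy || layer.is_conv_policy)
      finalSize_pol = layer.ip_out_size * sizeof(net_t);
    if (layer.is_value) finalSize_val = layer.ip_out_size * sizeof(net_t);
    if (layer.is_moves_left) finalSize_mov = layer.ip_out_size * sizeof(net_t);
  }

  // A zero finalSize_mov means "no moves-left head": the pinned output
  // buffer for it is then never allocated, mapped or copied.
  std::printf("bytes per position: pol=%zu val=%zu mov=%zu\n",
              finalSize_pol, finalSize_val, finalSize_mov);
}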

src/neural/opencl/OpenCLBuffers.h

+6 -1

@@ -49,7 +49,8 @@ class OpenCLBuffers {
   OpenCLBuffers(const OpenCL_Network& opencl_net);
 
   void forward(const std::vector<net_t>& input, std::vector<net_t>& output_pol,
-               std::vector<net_t>& output_val, const int batch_size);
+               std::vector<net_t>& output_val, std::vector<net_t>& output_mov,
+               const int batch_size);
 
  private:
   using weight_slice_t = std::vector<cl::Buffer>::const_iterator;

@@ -99,4 +100,8 @@ class OpenCLBuffers {
   cl::Buffer m_pool_buffer;
   cl::Buffer m_pinnedOutBuffer_pol;
   cl::Buffer m_pinnedOutBuffer_val;
+  cl::Buffer m_pinnedOutBuffer_mov;
+  size_t m_finalSize_pol;
+  size_t m_finalSize_val;
+  size_t m_finalSize_mov;
 };

src/neural/opencl/network_opencl.cc

+52 -8

@@ -45,24 +45,33 @@ class OpenCLNetwork;
 struct OpenCLWeights {
   const std::vector<float> ip2_val_w;
   const std::vector<float> ip2_val_b;
+  const std::vector<float> ip2_mov_w;
+  const std::vector<float> ip2_mov_b;
   const size_t num_output_policies = 1858;
   const size_t num_value_channels;
+  const size_t num_moves_channels;
 
   OpenCLWeights(const WeightsFile& file)
       : ip2_val_w(LayerAdapter(file.weights().ip2_val_w()).as_vector()),
         ip2_val_b(LayerAdapter(file.weights().ip2_val_b()).as_vector()),
-        num_value_channels(LayerAdapter(file.weights().ip1_val_b()).size()) {}
+        ip2_mov_w(LayerAdapter(file.weights().ip2_mov_w()).as_vector()),
+        ip2_mov_b(LayerAdapter(file.weights().ip2_mov_b()).as_vector()),
+        num_value_channels(LayerAdapter(file.weights().ip1_val_b()).size()),
+        num_moves_channels(LayerAdapter(file.weights().ip1_mov_b()).size()) {}
 };
 
 class OpenCLComputation : public NetworkComputation {
  public:
   OpenCLComputation(const OpenCL_Network& opencl_net,
-                    const OpenCLWeights& weights, const bool wdl)
+                    const OpenCLWeights& weights, const bool wdl,
+                    const bool moves_left)
       : opencl_net_(opencl_net),
         weights_(weights),
         policies_(),
         q_values_(),
-        wdl_(wdl) {
+        m_values_(),
+        wdl_(wdl),
+        moves_left_(moves_left) {
     buffers_ = opencl_net.acquire_buffers();
   }
 

@@ -82,6 +91,7 @@ class OpenCLComputation : public NetworkComputation {
 
     const auto num_output_policies = weights_.num_output_policies;
     const auto num_value_channels = weights_.num_value_channels;
+    const auto num_moves_channels = weights_.num_moves_channels;
 
     // Typically
     // input_channels = 112

@@ -90,6 +100,7 @@ class OpenCLComputation : public NetworkComputation {
 
     std::vector<float> output_pol(largest_batch_size * num_output_policies);
     std::vector<float> output_val(largest_batch_size * num_value_channels);
+    std::vector<float> output_mov(largest_batch_size * num_moves_channels);
    std::vector<float> input_data(largest_batch_size * kInputPlanes * kSquares);
 
     for (size_t i = 0; i < plane_count; i += largest_batch_size) {

@@ -98,7 +109,8 @@ class OpenCLComputation : public NetworkComputation {
         EncodePlanes(planes_[i + j], &input_data[j * kSquares * kInputPlanes]);
       }
 
-      buffers_->forward(input_data, output_pol, output_val, batch_size);
+      buffers_->forward(input_data, output_pol, output_val, output_mov,
+                        batch_size);
 
       for (size_t j = 0; j < batch_size; j++) {
         std::vector<float> policy(num_output_policies);

@@ -135,6 +147,16 @@ class OpenCLComputation : public NetworkComputation {
 
           q_values_.emplace_back(std::tanh(winrate));
         }
+
+        if (moves_left_) {
+          auto m = weights_.ip2_mov_b[0];
+          auto ptr_weights = weights_.ip2_mov_w.data();
+          auto ptr_outputs = &output_mov[j * num_moves_channels];
+          for (size_t i = 0; i < num_moves_channels; i++)
+            m += ptr_weights[i] * std::max(0.0f, ptr_outputs[i]);
+
+          m_values_.emplace_back(std::max(0.0f, m));
+        }
       }
     }
   }

@@ -162,8 +184,13 @@ class OpenCLComputation : public NetworkComputation {
     }
   }
 
-  float GetMVal(int /* sample */) const override {
-    return 0.0f;
+  float GetMVal(int sample) const override {
+    if (moves_left_) {
+      auto d = m_values_[sample];
+      return d;
+    } else {
+      return 0.0f;
+    }
   }
 
   // Returns P value @move_id of @sample.

@@ -185,9 +212,11 @@ class OpenCLComputation : public NetworkComputation {
 
   std::vector<std::vector<float>> policies_;
   std::vector<float> q_values_;
+  std::vector<float> m_values_;
 
   std::unique_ptr<OpenCLBuffers> buffers_;
   bool wdl_;
+  bool moves_left_;
 };
 
 void OpenCLComputation::EncodePlanes(const InputPlanes& sample, float* buffer) {

@@ -205,7 +234,7 @@ class OpenCLNetwork : public Network {
 
   OpenCLNetwork(const WeightsFile& file, const OptionsDict& options)
       : capabilities_{file.format().network_format().input(),
-                      pblczero::NetworkFormat::MOVES_LEFT_NONE},
+                      file.format().network_format().moves_left()},
        weights_(file),
         params_(),
         opencl_(),

@@ -222,6 +251,9 @@ class OpenCLNetwork : public Network {
     wdl_ = file.format().network_format().output() ==
            pblczero::NetworkFormat::OUTPUT_WDL;
 
+    moves_left_ = file.format().network_format().moves_left() ==
+                  pblczero::NetworkFormat::MOVES_LEFT_V1;
+
     auto max_batch_size_ =
         static_cast<size_t>(options.GetOrDefault<int>("batch_size", 16));
     if (max_batch_size_ > kHardMaxBatchSize) {

@@ -241,9 +273,11 @@ class OpenCLNetwork : public Network {
     const auto residual_blocks = weights.residual.size();
 
     const auto num_value_input_planes = weights.value.biases.size();
+    const auto num_moves_input_planes = weights.moves_left.biases.size();
     const auto num_policy_input_planes = weights.policy.biases.size();
     const auto num_output_policy = kPolicyOutputs;
     const auto num_value_channels = weights.ip1_val_b.size();
+    const auto num_moves_channels = weights.ip1_mov_b.size();
 
     // Typically
     // input_channels = 112

@@ -350,11 +384,20 @@ class OpenCLNetwork : public Network {
                            weights.value.biases, weights.ip1_val_w,
                            weights.ip1_val_b);
 
+    if (moves_left_) {
+      opencl_net_.push_moves_left(
+          channels, num_moves_input_planes,
+          num_moves_input_planes * width * height, num_moves_channels,
+          weights.moves_left.weights, weights.moves_left.biases,
+          weights.ip1_mov_w, weights.ip1_mov_b);
+    }
+
     opencl_net_.setMaxMatchSize(max_batch_size_);
   }
 
   std::unique_ptr<NetworkComputation> NewComputation() override {
-    return std::make_unique<OpenCLComputation>(opencl_net_, weights_, wdl_);
+    return std::make_unique<OpenCLComputation>(opencl_net_, weights_, wdl_,
+                                               moves_left_);
   }
 
   const NetworkCapabilities& GetCapabilities() const override {

@@ -372,6 +415,7 @@ class OpenCLNetwork : public Network {
   OpenCL opencl_;
   OpenCL_Network opencl_net_;
   bool wdl_;
+  bool moves_left_;
 };
 
 std::unique_ptr<Network> MakeOpenCLNetwork(const WeightsFile& weights,
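
Note on the host-side part of the head (the if (moves_left_) block added to the batch loop above): the GPU returns the first fully connected layer's pre-activations in output_mov, and the CPU finishes the head by applying ReLU, the final ip2_mov dot product plus bias, and a final ReLU so the estimate cannot go negative. A self-contained sketch of that arithmetic with made-up numbers (the real coefficients come from ip2_mov_w / ip2_mov_b in the weights file):

#include <algorithm>
#include <cstdio>
#include <vector>

// m = relu(b + sum_i w_i * relu(x_i)), computed per position.
float MovesLeftEstimate(const std::vector<float>& ip1_out,    // one position's slice of output_mov
                        const std::vector<float>& ip2_mov_w,  // final FC weights
                        float ip2_mov_b) {                    // final FC bias
  float m = ip2_mov_b;
  for (size_t i = 0; i < ip1_out.size(); ++i) {
    m += ip2_mov_w[i] * std::max(0.0f, ip1_out[i]);
  }
  return std::max(0.0f, m);  // moves remaining can't be negative
}

int main() {
  // Made-up values: 4 channels instead of the real num_moves_channels.
  const std::vector<float> ip1_out = {1.5f, -0.2f, 0.7f, 3.0f};
  const std::vector<float> ip2_mov_w = {10.0f, 5.0f, 2.0f, 8.0f};
  std::printf("predicted moves left: %.1f\n",
              MovesLeftEstimate(ip1_out, ip2_mov_w, /*ip2_mov_b=*/1.0f));
}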
