forked from giuse/DNE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathatari_ulerl_experiment.rb
293 lines (259 loc) · 12.1 KB
/
atari_ulerl_experiment.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
require 'forwardable'
require_relative 'gym_experiment'
require_relative 'observation_compressor'
require_relative 'atari_wrapper'
require_relative 'tools'
module DNE
# TODO: why doesn't it work when I use UInt8? We're in [0,255]!
NImage = Xumo::UInt32 # set a single data type for images
# Specialized GymExperiment class for Atari environments and UL-ERL.
class AtariUlerlExperiment < GymExperiment
attr_reader :compr, :resize, :preproc, :nobs_per_ind, :ntrials_per_ind
def initialize config
## Why would I wish:
# compr before super (current choice)
# - net.struct => compr.code_size
# super before compr
# - config[:run][:debug] # can live without
# - compr.dims => AtariWrapper.downsample # gonna duplicate the process, beware
# - obs size => AtariWrapper orig_size
puts "Initializing compressor" # if debug
compr_opts = config.delete :compr # otherwise unavailable for debug
seed_proport = compr_opts.delete :seed_proport
@nobs_per_ind = compr_opts.delete :nobs_per_ind
@preproc = compr_opts.delete :preproc
@ntrials_per_ind = config[:run].delete :ntrials_per_ind
@compr = ObservationCompressor.new **compr_opts
# default ninputs for network
config[:net][:ninputs] ||= compr.code_size
puts "Loading Atari OpenAI Gym environment" # if debug
super config
# initialize the centroids based on the env's reset obs
compr.reset_centrs single_env.reset_obs, proport: seed_proport if seed_proport
end
# Initializes the Atari environment
# @note python environments interfaced through pycall
# @param type [String] the type of environment as understood by OpenAI Gym
# @param [Array<Integer,Integer>] optional downsampling for rows and columns
# @return an initialized environment
# NEW ENVS CURRENTLY SPAWNED IN PARALL CHILD
# means that they'll get respawned every time
# should instead update @parall_env in parent after every pop
# for now keep like this, spawning is fast and gets deleted with child process
# we are ensuring there's one per ind and that's what matters
# note the same env is used for multiple evals on each ind
def init_env type:
# puts " initializing env" if debug
# print "(newenv) " #if debug
AtariWrapper.new gym.make(type), downsample: compr.downsample,
skip_type: skip_type, preproc: preproc
end
# How to aggregate observations coming from a sequence of noops
OBS_AGGR = {
avg: -> (obs_lst) { obs_lst.reduce(:+) / obs_lst.size},
new: -> (obs_lst) { obs_lst.first - env.reset_obs},
first: -> (obs_lst) { obs_lst.first },
last: -> (obs_lst) { obs_lst.last }
}
# Return the fitness of a single genotype
# @param genotype the individual to be evaluated
# @param env the environment to use for the evaluation
# @param render [bool] whether to render the evaluation on screen
# @param nsteps [Integer] how many interactions to run with the game. One interaction is one action choosing + enacting followed by `skip_frames` frame skips
def fitness_one genotype, env: single_env, render: false, nsteps: max_nsteps, aggr_type: :last
puts "Evaluating one individual" if debug
puts " Loading weights in network" if debug
net.deep_reset
net.load_weights genotype
observation = env.reset
# require 'pry'; binding.pry unless observation == env.reset_obs # => check passed, add to tests
env.render if render
tot_reward = 0
# # set of observations with highest novelty, representative of the ability of the individual
# # to obtain novel observations from the environment => hence reaching novel env states
# represent_obs = []
puts "IGNORING `nobs_per_ind=#{nobs_per_ind}` (random sampling obs)" if nobs_per_ind
represent_obs = observation
nobs = 1
puts " Running (max_nsteps: #{max_nsteps})" if debug
runtime = nsteps.times do |i|
code = compr.encode observation
# print code.to_a
selected_action = action_for code
obs_lst, rew, done, info_lst = env.execute selected_action, skip_frames: skip_frames
# puts "#{obs_lst}, #{rew}, #{done}, #{info_lst}" if debug
observation = OBS_AGGR[aggr_type].call obs_lst
tot_reward += rew
## NOTE: SWAP COMMENTS ON THE FOLLOWING to switch to novelty-based obs selection
# # The same observation represents the state both for action selection and for individual novelty
# # OPT: most obs will most likely have lower novelty, so place it first
# # TODO: I could add here a check if obs is already in represent_obs; in fact
# # though the probability is low (sequential markovian fully-observable env)
# novelty = compr.novelty observation, code
# represent_obs.unshift [observation, novelty]
# represent_obs.sort_by! &:last
# represent_obs.shift if represent_obs.size > nobs_per_ind
# Random sampling for representative obs
nobs += 1
represent_obs = observation if rand < 1.0/nobs
# Image selection by random sampling
env.render if render
break i if done
end
compr.train_set << represent_obs
# for novelty:
# represent_obs.each { |obs, _nov| compr.train_set << obs }
puts "=> Done! fitness: #{tot_reward}" if debug
# print tot_reward, ' ' # if debug
print "#{tot_reward}(#{runtime}) "
tot_reward
end
# Builds a function that return a list of fitnesses for a list of genotypes.
# Since Parallel runs in separate fork, this overload is needed to fetch out
# the training set before returning the fitness to the optimizer
# @param type the type of computation
# @return [lambda] function that evaluates the fitness of a list of genotype
# @note returned function has param genotypes [Array<gtype>] list of genotypes, return [Array<Numeric>] list of fitnesses for each genotype
def gen_fit_fn type, ntrials: ntrials_per_ind
return super unless type.nil? || type == :parallel
nprocs = Parallel.processor_count - 1 # it's actually faster this way
puts "Running in parallel on #{nprocs} processes"
-> (genotypes) do
print "Fits: "
fits, parall_infos = Parallel.map(0...genotypes.shape.first,
in_processes: nprocs, isolation: true) do |i|
# env = parall_envs[Parallel.worker_number]
env = parall_envs[i] # leveraging dynamic env allocation
# fit = fitness_one genotypes[i, true], env: env
fits = ntrials.times.map { fitness_one genotypes[i, true], env: env }
fit = fits.to_na.mean
print "[m#{fit}] "
[fit, compr.parall_info]
end.transpose
puts # newline here because I'm done `print`ing all ind fits
puts "Exporting training images"
parall_infos.each &compr.method(:add_from_parall_info)
puts "Training optimizer"
fits.to_na
end
end
# Return an action for an encoded observation
# The neural network is activated on the code, then its output is
# interpreted as a corresponding action
# @param code [Array] encoding for the current observation
# @return [Integer] action
# TODO: alternatives like softmax and such
def action_for code
output = net.activate code
nans = output.isnan
# this is a pretty reliable bug indicator
raise "\n\n\tNaN network output!!\n\n" if nans.any?
# action = output[0...6].max_index # limit to 6 actions
action = output.max_index
end
def update_opt
return false if @curr_ninputs == compr.code_size
puts " code_size: #{compr.code_size}"
diff = compr.code_size - @curr_ninputs
@curr_ninputs = compr.code_size
pl = net.struct.first(2).reduce(:*)
nw = diff * net.struct[1]
new_mu_val = 0 # value for the new means (0)
new_var_val = 0.0001 # value for the new variances (diagonal of covariance) (<<1)
new_cov_val = 0 # value for the other covariances (outside diagonal) (0)
old = case opt_type
when :XNES then opt
when :BDNES then opt.blocks.first
else raise NotImplementedError, "refactor and fill in this case block"
end
new_mu = old.mu.insert pl, [new_mu_val]*nw
new_sigma = old.sigma.insert [pl]*nw, new_cov_val, axis: 0
new_sigma = new_sigma.insert [pl]*nw, new_cov_val, axis: 1
new_sigma.diagonal[pl...(pl+nw)] = new_var_val
new_nes = NES::XNES.new new_mu.size, old.obj_fn, old.opt_type,
parallel_fit: old.parallel_fit, mu_init: new_mu, sigma_init: new_sigma,
**opt_opt
new_nes.instance_variable_set :@rng, old.rng # ensure rng continuity
new_nes.instance_variable_set :@best, old.best # ensure best continuity
case opt_type
when :XNES
@opt = new_nes
puts " new opt dims: #{opt.ndims}"
when :BDNES
opt.blocks[0] = new_nes
opt.ndims_lst[0] = new_mu.size
puts " new opt dims: #{opt.ndims_lst}"
else raise NotImplementedError, "refactor and fill in this case block"
end
puts " popsize: #{opt.popsize}"
puts " lrate: #{opt.lrate}"
# FIXME: I need to run these before I can use automatic popsize again!
# => update popsize in bdnes and its blocks before using it again
# if opt.kind_of? BDNES or something
# opt.instance_variable_set :popsize, blocks.map(&:popsize).max
# opt.blocks.each { |xnes| xnes.instance_variable_set :@popsize, opt.popsize }
# update net, since inputs have changed
@net = init_net netopts.merge({ninputs: compr.code_size})
puts " new net struct: #{net.struct}"
return true
end
# Run the experiment
def run ngens: max_ngens
@curr_ninputs = compr.code_size
ngens.times do |i|
$ngen = i # allows for conditional debugger calls `binding.pry if $ngen = n`
puts Time.now
puts "# Gen #{i+1}/#{ngens}"
# it just makes more sense run first, even though at first gen the trainset is empty
puts "Training compressor" if debug
compr.train
update_opt # if I have more centroids, I should update opt
opt.train
# Note: data analysis is done by extracting statistics from logs using regexes.
# Just `puts` anything you'd like to track, and save log to file
puts "Best fit so far: #{opt.best.first} -- " \
"Fit mean: #{opt.last_fits.mean} -- " \
"Fit stddev: #{opt.last_fits.stddev}\n" \
"Mu mean: #{opt.mu.mean} -- " \
"Mu stddev: #{opt.mu.stddev} -- " \
"Conv: #{opt.convergence}"
break if termination_criteria&.call(opt) # uhm currently unused
end
end
# Save experiment current state to file
def dump fname="dumps/atari_#{Time.now.strftime '%y%m%d_%H%M'}.bin"
raise NotImplementedError, "doesn't work with BDNES atm" if config[:opt][:type] == :BDNES
File.open(fname, 'wb') do |f|
Marshal.dump(
{ config: config,
best: opt.best.last,
mu: opt.mu,
sigma: opt.sigma,
centrs: compr.centrs
}, f)
end
puts "Experiment data dumped to `#{fname}`"
true
end
# Load experiment state from file
def load fname=Dir["dumps/atari_*.bin"].sort.last
hsh = File.open(fname, 'r') { |f| Marshal.load f }
initialize hsh[:config]
opt.instance_variable_set :@best, hsh[:best]
opt.instance_variable_set :@mu, hsh[:mu]
opt.instance_variable_set :@sigma, hsh[:sigma]
compr.instance_variable_set :@centrs, hsh[:centrs]
# Uhm haven't used that yet...
# what else needs to be done in order to be able to run `#show_ind` again?
puts "Experiment data loaded from `#{fname}`"
true
end
# Return an initialized exp from state on file
def self.load fname=Dir["atari_*.bin"].sort.last
# will initialize twice, but we're sure to have a conform `hsh[:config]`
hsh = File.open(fname, 'r') { |f| Marshal.load f }
new(hsh[:config]).tap { |exp| exp.load fname }
end
end
end
puts "USAGE: `bundle exec ruby experiments/atari.rb`" if __FILE__ == $0