diff --git a/python/deprecated/eval_sgf.py b/python/deprecated/eval_sgf.py old mode 100755 new mode 100644 index 13bcde60e..26381b19a --- a/python/deprecated/eval_sgf.py +++ b/python/deprecated/eval_sgf.py @@ -118,12 +118,12 @@ def play(pla,loc): print(board.to_string()) -saver = tf.train.Saver( +saver = tf.compat.v1.train.Saver( max_to_keep = 10000, save_relative_paths = True, ) -with tf.Session() as session: +with tf.compat.v1.Session() as session: if not debug: saver.restore(session, modelpath) diff --git a/python/deprecated/export_model.py b/python/deprecated/export_model.py old mode 100755 new mode 100644 index 522eeabc7..54141831e --- a/python/deprecated/export_model.py +++ b/python/deprecated/export_model.py @@ -49,7 +49,7 @@ def log(s): model = Model(model_config) total_parameters = 0 -for variable in tf.trainable_variables(): +for variable in tf.compat.v1.trainable_variables(): shape = variable.get_shape() variable_parameters = 1 for dim in shape: @@ -63,17 +63,17 @@ def log(s): print("Testing", flush=True) -saver = tf.train.Saver( +saver = tf.compat.v1.train.Saver( max_to_keep = 10000, save_relative_paths = True, ) #Some tensorflow options #tfconfig = tf.ConfigProto(log_device_placement=False,device_count={'GPU': 0}) -tfconfig = tf.ConfigProto(log_device_placement=False) +tfconfig = tf.compat.v1.ConfigProto(log_device_placement=False) #tfconfig.gpu_options.allow_growth = True #tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.4 -with tf.Session(config=tfconfig) as session: +with tf.compat.v1.Session(config=tfconfig) as session: saver.restore(session, model_file) sys.stdout.flush() @@ -85,7 +85,7 @@ def log(s): sys.stderr.flush() if not for_cuda: - tf.train.write_graph(session.graph_def,export_dir,filename_prefix + ".graph.pb") + tf.io.write_graph(session.graph_def,export_dir,filename_prefix + ".graph.pb") savepath = export_dir + "/" + filename_prefix saver.save(session, savepath + ".weights") with open(savepath + ".config.json","w") as f: @@ -112,7 +112,7 @@ def writeln(s): writeln(model.max_board_size) #y writeln(model.num_input_features) - variables = dict((variable.name,variable) for variable in tf.global_variables()) + variables = dict((variable.name,variable) for variable in tf.compat.v1.global_variables()) def get_weights(name): return np.array(variables[name+":0"].eval()) diff --git a/python/deprecated/find_poses.py b/python/deprecated/find_poses.py old mode 100755 new mode 100644 index 9e8bfbf8d..203130e52 --- a/python/deprecated/find_poses.py +++ b/python/deprecated/find_poses.py @@ -57,7 +57,7 @@ def log(s): policy_probs_output = tf.nn.softmax(model.policy_output) total_parameters = 0 -for variable in tf.trainable_variables(): +for variable in tf.compat.v1.trainable_variables(): shape = variable.get_shape() variable_parameters = 1 for dim in shape: @@ -94,17 +94,17 @@ def log(s): sgfhash_start = next_moves_start + next_moves_len sgfhash_len = 8 -saver = tf.train.Saver( +saver = tf.compat.v1.train.Saver( max_to_keep = 10000, save_relative_paths = True, ) #Some tensorflow options #tfconfig = tf.ConfigProto(log_device_placement=False,device_count={'GPU': 0}) -tfconfig = tf.ConfigProto(log_device_placement=False) +tfconfig = tf.compat.v1.ConfigProto(log_device_placement=False) #tfconfig.gpu_options.allow_growth = True #tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.4 -with tf.Session(config=tfconfig) as session: +with tf.compat.v1.Session(config=tfconfig) as session: saver.restore(session, model_file) log("Began session, loaded model") diff --git 
a/python/deprecated/mixmodels.py b/python/deprecated/mixmodels.py old mode 100755 new mode 100644 index 3de1f9672..64307f11d --- a/python/deprecated/mixmodels.py +++ b/python/deprecated/mixmodels.py @@ -45,7 +45,7 @@ def volume(variable): variables = {} total_parameters = 0 -for variable in tf.global_variables(): +for variable in tf.compat.v1.global_variables(): variable_parameters = volume(variable) total_parameters += variable_parameters variables[variable.name] = variable @@ -59,7 +59,7 @@ def volume(variable): print("Testing", flush=True) -saver = tf.train.Saver( +saver = tf.compat.v1.train.Saver( max_to_keep = 10000, save_relative_paths = True, ) @@ -67,8 +67,8 @@ def volume(variable): count = 0 accum_weights = {} -tfconfig = tf.ConfigProto(log_device_placement=False) -with tf.Session(config=tfconfig) as session: +tfconfig = tf.compat.v1.ConfigProto(log_device_placement=False) +with tf.compat.v1.Session(config=tfconfig) as session: for model_file in model_files: saver.restore(session, model_file) @@ -91,7 +91,7 @@ def run(fetches): assign_ops = dict([(name,variables[name].assign(accum_weights[name])) for name in accum_weights]) -with tf.Session(config=tfconfig) as session: +with tf.compat.v1.Session(config=tfconfig) as session: session.run(assign_ops) print("Saving to " + output_file) saver.save(session, output_file) diff --git a/python/deprecated/model.py b/python/deprecated/model.py index 8ccd27c2c..51c303067 100644 --- a/python/deprecated/model.py +++ b/python/deprecated/model.py @@ -26,7 +26,7 @@ def __init__(self,config): self.reg_variables = [] self.lr_adjusted_variables = {} - self.is_training = tf.placeholder(tf.bool,name="is_training") + self.is_training = tf.compat.v1.placeholder(tf.bool,name="is_training") #Accumulates outputs for printing stats about their activations self.outputs_by_layer = [] @@ -210,7 +210,7 @@ def addPrevPrevLadderFeature(loc,pos,workingMoves): # Build model ------------------------------------------------------------- def ensure_variable_exists(self,name): - for v in tf.trainable_variables(): + for v in tf.compat.v1.trainable_variables(): if v.name == name: return name raise Exception("Could not find variable " + name) @@ -227,7 +227,7 @@ def batchnorm(self,name,tensor): has_bias = True has_scale = False self.batch_norms[name] = (tensor.shape[-1].value,epsilon,has_bias,has_scale) - return tf.layers.batch_normalization( + return tf.compat.v1.layers.batch_normalization( tensor, axis=-1, #Because channels are our last axis, -1 refers to that via wacky python indexing momentum=0.99, @@ -246,7 +246,7 @@ def init_stdev(self,num_inputs,num_outputs): def init_weights(self, shape, num_inputs, num_outputs): stdev = self.init_stdev(num_inputs,num_outputs) / 1.0 - return tf.truncated_normal(shape=shape, stddev=stdev) + return tf.random.truncated_normal(shape=shape, stddev=stdev) def weight_variable_init_constant(self, name, shape, constant): init = tf.zeros(shape) @@ -268,7 +268,7 @@ def weight_variable(self, name, shape, num_inputs, num_outputs, scale_initial_we return variable def conv2d(self, x, w): - return tf.nn.conv2d(x, w, strides=[1,1,1,1], padding='SAME') + return tf.nn.conv2d(input=x, filters=w, strides=[1,1,1,1], padding='SAME') def dilated_conv2d(self, x, w, dilation): return tf.nn.atrous_conv2d(x, w, rate = dilation, padding='SAME') @@ -279,8 +279,8 @@ def apply_symmetry(self,tensor,symmetries,inverse): transp = symmetries[2] rev_axes = tf.concat([ - tf.cond(ud, lambda: tf.constant([1]), lambda: tf.constant([],dtype='int32')), - tf.cond(lr, lambda: 
tf.constant([2]), lambda: tf.constant([],dtype='int32')), + tf.cond(pred=ud, true_fn=lambda: tf.constant([1]), false_fn=lambda: tf.constant([],dtype='int32')), + tf.cond(pred=lr, true_fn=lambda: tf.constant([2]), false_fn=lambda: tf.constant([],dtype='int32')), ], axis=0) if not inverse: @@ -289,14 +289,14 @@ def apply_symmetry(self,tensor,symmetries,inverse): assert(len(tensor.shape) == 4 or len(tensor.shape) == 3) if len(tensor.shape) == 3: tensor = tf.cond( - transp, - lambda: tf.transpose(tensor, [0,2,1]), - lambda: tensor) + pred=transp, + true_fn=lambda: tf.transpose(a=tensor, perm=[0,2,1]), + false_fn=lambda: tensor) else: tensor = tf.cond( - transp, - lambda: tf.transpose(tensor, [0,2,1,3]), - lambda: tensor) + pred=transp, + true_fn=lambda: tf.transpose(a=tensor, perm=[0,2,1,3]), + false_fn=lambda: tensor) if inverse: tensor = tf.reverse(tensor, rev_axes) @@ -321,14 +321,14 @@ def chain_pool(self,tensor,chains,num_chain_segments,empty,nonempty,mode): #Each one needs max_chain_idxs different buckets. num_segments_by_batch_and_channel = tf.fill([1,num_channels],1) * tf.expand_dims(num_chain_segments,axis=1) shift = tf.cumsum(tf.reshape(num_segments_by_batch_and_channel,[-1]),exclusive=True) - num_segments = tf.reduce_sum(num_chain_segments) * num_channels + num_segments = tf.reduce_sum(input_tensor=num_chain_segments) * num_channels shift = tf.reshape(shift,[-1,1,1,num_channels]) segments = tf.expand_dims(chains,3) + shift if mode == "sum": - pools = tf.unsorted_segment_sum(tensor,segments,num_segments=num_segments) + pools = tf.math.unsorted_segment_sum(tensor,segments,num_segments=num_segments) elif mode == "max": - pools = tf.unsorted_segment_max(tensor,segments,num_segments=num_segments) + pools = tf.math.unsorted_segment_max(tensor,segments,num_segments=num_segments) else: assert False @@ -367,13 +367,13 @@ def conv_weight_variable(self, name, diam1, diam2, in_channels, out_channels, sc weights = self.weight_variable(name,[diam1,diam2,in_channels,out_channels],in_channels*diam1*diam2,out_channels,scale_initial_weights,reg=reg) else: extra_initial_weight = self.init_weights([1,1,in_channels,out_channels], in_channels, out_channels) * emphasize_center_weight - extra_initial_weight = tf.pad(extra_initial_weight, [(radius1,radius1),(radius2,radius2),(0,0),(0,0)]) + extra_initial_weight = tf.pad(tensor=extra_initial_weight, paddings=[(radius1,radius1),(radius2,radius2),(0,0),(0,0)]) weights = self.weight_variable(name,[diam1,diam2,in_channels,out_channels],in_channels*diam1*diam2,out_channels,scale_initial_weights,extra_initial_weight,reg=reg) if emphasize_center_lr is not None: factor = tf.constant([emphasize_center_lr],dtype=tf.float32) factor = tf.reshape(factor,[1,1,1,1]) - factor = tf.pad(factor, [(radius1,radius1),(radius2,radius2),(0,0),(0,0)], constant_values=1.0) + factor = tf.pad(tensor=factor, paddings=[(radius1,radius1),(radius2,radius2),(0,0),(0,0)], constant_values=1.0) self.add_lr_factor(weights.name, factor) return weights @@ -399,7 +399,7 @@ def conv_only_extra_center_block(self, name, in_layer, diam, in_channels, out_ch radius = diam // 2 center_weights = self.weight_variable(name+"/wcenter",[1,1,in_channels,out_channels],in_channels,out_channels,scale_initial_weights=0.3*scale_initial_weights) weights = self.weight_variable(name+"/w",[diam,diam,in_channels,out_channels],in_channels*diam*diam,out_channels,scale_initial_weights) - weights = weights + tf.pad(center_weights,[(radius,radius),(radius,radius),(0,0),(0,0)]) + weights = weights + 
tf.pad(tensor=center_weights,paddings=[(radius,radius),(radius,radius),(0,0),(0,0)]) out_layer = self.conv2d(in_layer, weights) self.outputs_by_layer.append((name,out_layer)) return out_layer @@ -435,8 +435,8 @@ def global_res_conv_block(self, name, in_layer, diam, main_channels, mid_channel self.outputs_by_layer.append((name+"/conv1b",conv1b_layer)) trans1b_layer = self.parametric_relu(name+"/trans1b",(self.batchnorm(name+"/norm1b",conv1b_layer))) - trans1b_mean = tf.reduce_mean(trans1b_layer,axis=[1,2],keepdims=True) - trans1b_max = tf.reduce_max(trans1b_layer,axis=[1,2],keepdims=True) + trans1b_mean = tf.reduce_mean(input_tensor=trans1b_layer,axis=[1,2],keepdims=True) + trans1b_max = tf.reduce_max(input_tensor=trans1b_layer,axis=[1,2],keepdims=True) trans1b_pooled = tf.concat([trans1b_mean,trans1b_max],axis=3) remix_weights = self.weight_variable(name+"/w1r",[global_mid_channels*2,mid_channels],global_mid_channels*2,mid_channels, scale_initial_weights = 0.5) @@ -552,7 +552,7 @@ def skew_right(tensor): assert(tensor.shape[1].value == n) assert(tensor.shape[2].value == n) c = tensor.shape[3].value - tensor = tf.pad(tensor,[[0,0],[0,0],[0,n],[0,0]]) #Pad 19x19 -> 19x38 + tensor = tf.pad(tensor=tensor,paddings=[[0,0],[0,0],[0,n],[0,0]]) #Pad 19x19 -> 19x38 tensor = tf.reshape(tensor,[-1,2*n*n,c]) #Linearize tensor = tensor[:,:((2*n-1)*n),:] #Chop off the 19 zeroes on the end tensor = tf.reshape(tensor,[-1,n,2*n-1,c]) #Now we are skewed 19x37 as desired @@ -564,7 +564,7 @@ def unskew_right(tensor): assert(tensor.shape[2].value == 2*n-1) c = tensor.shape[3].value tensor = tf.reshape(tensor,[-1,n*(2*n-1),c]) #Linearize - tensor = tf.pad(tensor,[[0,0],[0,n],[0,0]]) #Pad 19*37 -> 19*38 + tensor = tf.pad(tensor=tensor,paddings=[[0,0],[0,n],[0,0]]) #Pad 19*37 -> 19*38 tensor = tf.reshape(tensor,[-1,n,2*n,c]) #Convert back to 19x38 tensor = tensor[:,:,:n,:] #Chop off the extra, now we are 19x19 return tensor @@ -575,7 +575,7 @@ def skew_left(tensor): assert(tensor.shape[1].value == n) assert(tensor.shape[2].value == n) c = tensor.shape[3].value - tensor = tf.pad(tensor,[[0,0],[1,1],[n-2,0],[0,0]]) #Pad 19x19 -> 21x36 + tensor = tf.pad(tensor=tensor,paddings=[[0,0],[1,1],[n-2,0],[0,0]]) #Pad 19x19 -> 21x36 tensor = tf.reshape(tensor,[-1,(n+2)*(2*n-2),c]) #Linearize tensor = tensor[:,(2*n-3):(-n+1),:] #Chop off the 35 extra zeroes on the start and the 18 at the end. tensor = tf.reshape(tensor,[-1,n,2*n-1,c]) #Now we are skewed 19x37 as desired @@ -588,7 +588,7 @@ def unskew_left(tensor): assert(tensor.shape[2].value == 2*n-1) c = tensor.shape[3].value tensor = tf.reshape(tensor,[-1,n*(2*n-1),c]) #Linearize - tensor = tf.pad(tensor,[[0,0],[2*n-3,n-1],[0,0]]) #Pad 19*37 -> 21*36 + tensor = tf.pad(tensor=tensor,paddings=[[0,0],[2*n-3,n-1],[0,0]]) #Pad 19*37 -> 21*36 tensor = tf.reshape(tensor,[-1,n+2,2*n-2,c]) #Convert back to 21x36 tensor = tensor[:,1:(n+1),(n-2):,:] #Chop off the extra, now we are 19x19 return tensor @@ -683,10 +683,10 @@ def build_model(self, use_ranks, include_policy, include_value, predict_pass): self.version = 2 #V2 features, no internal architecture change. 
#Input layer--------------------------------------------------------------------------------- - inputs = tf.placeholder(tf.float32, [None] + self.input_shape, name="inputs") - ranks = tf.placeholder(tf.float32, [None] + self.rank_shape, name="ranks") - symmetries = tf.placeholder(tf.bool, [3], name="symmetries") - include_history = tf.placeholder(tf.float32, [None] + [5], name="include_history") + inputs = tf.compat.v1.placeholder(tf.float32, [None] + self.input_shape, name="inputs") + ranks = tf.compat.v1.placeholder(tf.float32, [None] + self.rank_shape, name="ranks") + symmetries = tf.compat.v1.placeholder(tf.bool, [3], name="symmetries") + include_history = tf.compat.v1.placeholder(tf.float32, [None] + [5], name="include_history") self.inputs = inputs self.ranks = ranks self.symmetries = symmetries @@ -879,8 +879,8 @@ def build_model(self, use_ranks, include_policy, include_value, predict_pass): #Fold g1 down to single values for the board. #For stdev, add a tiny constant to ensure numeric stability - g1_mean = tf.reduce_mean(g1_layer,axis=[1,2],keepdims=True) - g1_max = tf.reduce_max(g1_layer,axis=[1,2],keepdims=True) + g1_mean = tf.reduce_mean(input_tensor=g1_layer,axis=[1,2],keepdims=True) + g1_max = tf.reduce_max(input_tensor=g1_layer,axis=[1,2],keepdims=True) g2_layer = tf.concat([g1_mean,g1_max],axis=3) #shape [b,1,1,2*convg1num_channels] g2_num_channels = 2*g1_num_channels self.outputs_by_layer.append(("g2",g2_layer)) @@ -921,7 +921,7 @@ def build_model(self, use_ranks, include_policy, include_value, predict_pass): if not predict_pass: #Simply add the pass output on with a large negative constant that's probably way more negative than anything #else the neural net would output. - policy_output = tf.pad(policy_output,[(0,0),(0,1)], constant_values = -10000., name="policy_output") + policy_output = tf.pad(tensor=policy_output,paddings=[(0,0),(0,1)], constant_values = -10000., name="policy_output") else: #Add pass move based on the global g values matmulpass = self.weight_variable("matmulpass",[g2_num_channels,1],g2_num_channels*8,1) @@ -935,7 +935,7 @@ def build_model(self, use_ranks, include_policy, include_value, predict_pass): else: #Don't include policy? Just set the policy output to all zeros. 
policy_output = tf.zeros_like(inputs[:,:,0]) - policy_output = tf.pad(policy_output,[(0,0),(0,1)]) + policy_output = tf.pad(tensor=policy_output,paddings=[(0,0),(0,1)]) self.policy_output = policy_output if include_value: @@ -947,7 +947,7 @@ def build_model(self, use_ranks, include_policy, include_value, predict_pass): self.v1_conv = ("v1",3,trunk_num_channels,v1_num_channels) self.v1_num_channels = v1_num_channels - v1_layer_pooled = tf.reduce_mean(v1_layer,axis=[1,2],keepdims=False) + v1_layer_pooled = tf.reduce_mean(input_tensor=v1_layer,axis=[1,2],keepdims=False) v1_size = v1_num_channels v2_size = 12 @@ -983,43 +983,43 @@ def __init__(self,model,for_optimization,require_last_move): value_output = model.value_output #Loss function - self.policy_targets = tf.placeholder(tf.float32, [None] + model.policy_target_shape) - self.value_target = tf.placeholder(tf.float32, [None] + model.value_target_shape) - self.target_weights_from_data = tf.placeholder(tf.float32, [None] + model.target_weights_shape) + self.policy_targets = tf.compat.v1.placeholder(tf.float32, [None] + model.policy_target_shape) + self.value_target = tf.compat.v1.placeholder(tf.float32, [None] + model.value_target_shape) + self.target_weights_from_data = tf.compat.v1.placeholder(tf.float32, [None] + model.target_weights_shape) if require_last_move == "all": - self.target_weights_used = self.target_weights_from_data * tf.reduce_sum(model.inputs[:,:,14],axis=[1]) + self.target_weights_used = self.target_weights_from_data * tf.reduce_sum(input_tensor=model.inputs[:,:,14],axis=[1]) elif require_last_move is True: - self.target_weights_used = self.target_weights_from_data * tf.reduce_sum(model.inputs[:,:,10],axis=[1]) + self.target_weights_used = self.target_weights_from_data * tf.reduce_sum(input_tensor=model.inputs[:,:,10],axis=[1]) else: self.target_weights_used = self.target_weights_from_data self.policy_loss = tf.reduce_sum( - self.target_weights_used * - tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.policy_targets, logits=policy_output) + input_tensor=self.target_weights_used * + tf.nn.softmax_cross_entropy_with_logits(labels=self.policy_targets, logits=policy_output) ) cross_entropy_value_loss = 1.4*tf.reduce_sum( - self.target_weights_used * + input_tensor=self.target_weights_used * tf.nn.softmax_cross_entropy_with_logits( - labels=tf.stack([(1+self.value_target)/2,(1-self.value_target)/2],axis=1), + labels=tf.stop_gradient(tf.stack([(1+self.value_target)/2,(1-self.value_target)/2],axis=1)), logits=tf.stack([value_output,tf.zeros_like(value_output)],axis=1) ) ) l2_value_loss = tf.reduce_sum( - self.target_weights_used * + input_tensor=self.target_weights_used * tf.square(self.value_target - tf.tanh(value_output)) ) self.value_loss = 0.5 * (cross_entropy_value_loss + l2_value_loss) # self.value_loss = l2_value_loss - self.weight_sum = tf.reduce_sum(self.target_weights_used) + self.weight_sum = tf.reduce_sum(input_tensor=self.target_weights_used) if for_optimization: #Prior/Regularization - self.l2_reg_coeff = tf.placeholder(tf.float32) + self.l2_reg_coeff = tf.compat.v1.placeholder(tf.float32) self.reg_loss = self.l2_reg_coeff * tf.add_n([tf.nn.l2_loss(variable) for variable in model.reg_variables]) * self.weight_sum #The loss to optimize @@ -1028,36 +1028,36 @@ def __init__(self,model,for_optimization,require_last_move): class Metrics: def __init__(self,model,target_vars,include_debug_stats): #Training results - policy_target_idxs = tf.argmax(target_vars.policy_targets, 1) - self.top1_prediction = 
tf.equal(tf.argmax(model.policy_output, 1), policy_target_idxs) - self.top4_prediction = tf.nn.in_top_k(model.policy_output,policy_target_idxs,4) - self.accuracy1 = tf.reduce_sum(target_vars.target_weights_used * tf.cast(self.top1_prediction, tf.float32)) - self.accuracy4 = tf.reduce_sum(target_vars.target_weights_used * tf.cast(self.top4_prediction, tf.float32)) - self.valueconf = tf.reduce_sum(tf.square(model.value_output)) + policy_target_idxs = tf.argmax(input=target_vars.policy_targets, axis=1) + self.top1_prediction = tf.equal(tf.argmax(input=model.policy_output, axis=1), policy_target_idxs) + self.top4_prediction = tf.nn.in_top_k(predictions=model.policy_output,targets=policy_target_idxs,k=4) + self.accuracy1 = tf.reduce_sum(input_tensor=target_vars.target_weights_used * tf.cast(self.top1_prediction, tf.float32)) + self.accuracy4 = tf.reduce_sum(input_tensor=target_vars.target_weights_used * tf.cast(self.top4_prediction, tf.float32)) + self.valueconf = tf.reduce_sum(input_tensor=tf.square(model.value_output)) #Debugging stats if include_debug_stats: def reduce_norm(x, axis=None, keepdims=False): - return tf.sqrt(tf.reduce_mean(tf.square(x), axis=axis, keepdims=keepdims)) + return tf.sqrt(tf.reduce_mean(input_tensor=tf.square(x), axis=axis, keepdims=keepdims)) def reduce_stdev(x, axis=None, keepdims=False): - m = tf.reduce_mean(x, axis=axis, keepdims=True) + m = tf.reduce_mean(input_tensor=x, axis=axis, keepdims=True) devs_squared = tf.square(x - m) - return tf.sqrt(tf.reduce_mean(devs_squared, axis=axis, keepdims=keepdims)) + return tf.sqrt(tf.reduce_mean(input_tensor=devs_squared, axis=axis, keepdims=keepdims)) self.activated_prop_by_layer = dict([ - (name,tf.reduce_mean(tf.count_nonzero(layer,axis=[1,2])/layer.shape[1].value/layer.shape[2].value, axis=0)) for (name,layer) in model.outputs_by_layer + (name,tf.reduce_mean(input_tensor=tf.math.count_nonzero(layer,axis=[1,2])/layer.shape[1].value/layer.shape[2].value, axis=0)) for (name,layer) in model.outputs_by_layer ]) self.mean_output_by_layer = dict([ - (name,tf.reduce_mean(layer,axis=[0,1,2])) for (name,layer) in model.outputs_by_layer + (name,tf.reduce_mean(input_tensor=layer,axis=[0,1,2])) for (name,layer) in model.outputs_by_layer ]) self.stdev_output_by_layer = dict([ (name,reduce_stdev(layer,axis=[0,1,2])**2) for (name,layer) in model.outputs_by_layer ]) self.mean_weights_by_var = dict([ - (v.name,tf.reduce_mean(v)) for v in tf.trainable_variables() + (v.name,tf.reduce_mean(input_tensor=v)) for v in tf.compat.v1.trainable_variables() ]) self.norm_weights_by_var = dict([ - (v.name,reduce_norm(v)) for v in tf.trainable_variables() + (v.name,reduce_norm(v)) for v in tf.compat.v1.trainable_variables() ]) diff --git a/python/deprecated/play.py b/python/deprecated/play.py old mode 100755 new mode 100644 index 30583d81a..17a4198ca --- a/python/deprecated/play.py +++ b/python/deprecated/play.py @@ -265,7 +265,7 @@ def run_gtp(session): layerdict = dict(model.outputs_by_layer) weightdict = dict() - for v in tf.trainable_variables(): + for v in tf.compat.v1.trainable_variables(): weightdict[v.name] = v rank_policy_command_lookup = dict() @@ -528,12 +528,12 @@ def add_input_feature_visualizations(layer_name, feature_idx, normalization_div) print('?%s ???\n\n' % (cmdid,), end='') sys.stdout.flush() -saver = tf.train.Saver( +saver = tf.compat.v1.train.Saver( max_to_keep = 10000, save_relative_paths = True, ) -with tf.Session() as session: +with tf.compat.v1.Session() as session: saver.restore(session, modelpath) run_gtp(session) diff 
--git a/python/deprecated/test.py b/python/deprecated/test.py old mode 100755 new mode 100644 index 0b8711d31..26adf1145 --- a/python/deprecated/test.py +++ b/python/deprecated/test.py @@ -57,7 +57,7 @@ def log(s): metrics = Metrics(model,target_vars,include_debug_stats=False) total_parameters = 0 -for variable in tf.trainable_variables(): +for variable in tf.compat.v1.trainable_variables(): shape = variable.get_shape() variable_parameters = 1 for dim in shape: @@ -94,17 +94,17 @@ def log(s): print("Testing", flush=True) -saver = tf.train.Saver( +saver = tf.compat.v1.train.Saver( max_to_keep = 10000, save_relative_paths = True, ) #Some tensorflow options #tfconfig = tf.ConfigProto(log_device_placement=False,device_count={'GPU': 0}) -tfconfig = tf.ConfigProto(log_device_placement=False) +tfconfig = tf.compat.v1.ConfigProto(log_device_placement=False) #tfconfig.gpu_options.allow_growth = True #tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.4 -with tf.Session(config=tfconfig) as session: +with tf.compat.v1.Session(config=tfconfig) as session: saver.restore(session, model_file) sys.stdout.flush() diff --git a/python/deprecated/testlossbyhash.py b/python/deprecated/testlossbyhash.py index 6b3ce3a00..2424e1096 100644 --- a/python/deprecated/testlossbyhash.py +++ b/python/deprecated/testlossbyhash.py @@ -52,7 +52,7 @@ def log(s): target_vars = Target_vars(model,for_optimization=False,require_last_move=require_last_move) total_parameters = 0 -for variable in tf.trainable_variables(): +for variable in tf.compat.v1.trainable_variables(): shape = variable.get_shape() variable_parameters = 1 for dim in shape: @@ -89,17 +89,17 @@ def log(s): print("Testing", flush=True) -saver = tf.train.Saver( +saver = tf.compat.v1.train.Saver( max_to_keep = 10000, save_relative_paths = True, ) #Some tensorflow options #tfconfig = tf.ConfigProto(log_device_placement=False,device_count={'GPU': 0}) -tfconfig = tf.ConfigProto(log_device_placement=False) +tfconfig = tf.compat.v1.ConfigProto(log_device_placement=False) #tfconfig.gpu_options.allow_growth = True #tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.4 -with tf.Session(config=tfconfig) as session: +with tf.compat.v1.Session(config=tfconfig) as session: saver.restore(session, model_file) sys.stdout.flush() diff --git a/python/deprecated/testmagnitudes.py b/python/deprecated/testmagnitudes.py old mode 100755 new mode 100644 index 44b019fe0..b66bfe295 --- a/python/deprecated/testmagnitudes.py +++ b/python/deprecated/testmagnitudes.py @@ -57,7 +57,7 @@ def log(s): metrics = Metrics(model,target_vars,include_debug_stats=False) total_parameters = 0 -for variable in tf.trainable_variables(): +for variable in tf.compat.v1.trainable_variables(): shape = variable.get_shape() variable_parameters = 1 for dim in shape: @@ -94,17 +94,17 @@ def log(s): print("Testing", flush=True) -saver = tf.train.Saver( +saver = tf.compat.v1.train.Saver( max_to_keep = 10000, save_relative_paths = True, ) #Some tensorflow options #tfconfig = tf.ConfigProto(log_device_placement=False,device_count={'GPU': 0}) -tfconfig = tf.ConfigProto(log_device_placement=False) +tfconfig = tf.compat.v1.ConfigProto(log_device_placement=False) #tfconfig.gpu_options.allow_growth = True #tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.4 -with tf.Session(config=tfconfig) as session: +with tf.compat.v1.Session(config=tfconfig) as session: saver.restore(session, model_file) sys.stdout.flush() @@ -185,12 +185,12 @@ def run_validation_in_batches_and_print(fetches): sys.stdout.flush() vmetrics 
= {} - for variable in tf.trainable_variables(): - vmetrics[variable.name + "/maxabsvalue"] = tf.reduce_max(tf.abs(variable)) + for variable in tf.compat.v1.trainable_variables(): + vmetrics[variable.name + "/maxabsvalue"] = tf.reduce_max(input_tensor=tf.abs(variable)) for (layername,tensor) in model.outputs_by_layer: - vmetrics[layername + "/maxabsvalue"] = tf.reduce_max(tf.abs(tensor)) + vmetrics[layername + "/maxabsvalue"] = tf.reduce_max(input_tensor=tf.abs(tensor)) for (layername,tensor) in model.other_internal_outputs: - vmetrics[layername + "/maxabsvalue"] = tf.reduce_max(tf.abs(tensor)) + vmetrics[layername + "/maxabsvalue"] = tf.reduce_max(input_tensor=tf.abs(tensor)) run_validation_in_batches_and_print(vmetrics) diff --git a/python/deprecated/train.py b/python/deprecated/train.py old mode 100755 new mode 100644 index 6ff9bacb1..4bf401d18 --- a/python/deprecated/train.py +++ b/python/deprecated/train.py @@ -98,11 +98,11 @@ def detaillog(s): target_vars = Target_vars(model,for_optimization=True,require_last_move=False) #Training operation -per_sample_learning_rate = tf.placeholder(tf.float32) +per_sample_learning_rate = tf.compat.v1.placeholder(tf.float32) lr_adjusted_variables = model.lr_adjusted_variables -update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) #collect batch norm update operations +update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS) #collect batch norm update operations with tf.control_dependencies(update_ops): - optimizer = tf.train.MomentumOptimizer(per_sample_learning_rate, momentum=0.9, use_nesterov=True) + optimizer = tf.compat.v1.train.MomentumOptimizer(per_sample_learning_rate, momentum=0.9, use_nesterov=True) gradients = optimizer.compute_gradients(target_vars.opt_loss) adjusted_gradients = [] for (grad,x) in gradients: @@ -118,13 +118,13 @@ def detaillog(s): metrics = Metrics(model,target_vars,include_debug_stats=True) def reduce_norm(x, axis=None, keepdims=False): - return tf.sqrt(tf.reduce_mean(tf.square(x), axis=axis, keepdims=keepdims)) + return tf.sqrt(tf.reduce_mean(input_tensor=tf.square(x), axis=axis, keepdims=keepdims)) relative_update_by_var = dict([ (v.name,per_sample_learning_rate * reduce_norm(grad) / (1e-10 + reduce_norm(v))) for (grad,v) in adjusted_gradients if grad is not None ]) total_parameters = 0 -for variable in tf.trainable_variables(): +for variable in tf.compat.v1.trainable_variables(): shape = variable.get_shape() variable_parameters = 1 for dim in shape: @@ -134,7 +134,7 @@ def reduce_norm(x, axis=None, keepdims=False): trainlog("Built model, %d total parameters" % total_parameters) -for update_op in tf.get_collection(tf.GraphKeys.UPDATE_OPS): +for update_op in tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS): trainlog("Additional update op on train step: %s" % update_op.name) # Open H5 file--------------------------------------------------------- @@ -227,20 +227,20 @@ def report_epoch_done(self,epoch): # Training ------------------------------------------------------------ -saver = tf.train.Saver( +saver = tf.compat.v1.train.Saver( max_to_keep = 10000, save_relative_paths = True, ) #Some tensorflow options -tfconfig = tf.ConfigProto(log_device_placement=False) +tfconfig = tf.compat.v1.ConfigProto(log_device_placement=False) #tfconfig.gpu_options.allow_growth = True #tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.4 -with tf.Session(config=tfconfig) as session: +with tf.compat.v1.Session(config=tfconfig) as session: if restart_file is not None: saver.restore(session, restart_file) 
else: - session.run(tf.global_variables_initializer()) + session.run(tf.compat.v1.global_variables_initializer()) sys.stdout.flush() sys.stderr.flush() diff --git a/python/deprecated/visualize.py b/python/deprecated/visualize.py old mode 100755 new mode 100644 index 0f1239490..779e2f565 --- a/python/deprecated/visualize.py +++ b/python/deprecated/visualize.py @@ -54,7 +54,7 @@ def volume(variable): return variable_parameters total_parameters = 0 -for variable in tf.global_variables(): +for variable in tf.compat.v1.global_variables(): variable_parameters = volume(variable) total_parameters += variable_parameters log("Model variable %s, %d parameters" % (variable.name,variable_parameters)) @@ -66,17 +66,17 @@ def volume(variable): print("Testing", flush=True) -saver = tf.train.Saver( +saver = tf.compat.v1.train.Saver( max_to_keep = 10000, save_relative_paths = True, ) #Some tensorflow options #tfconfig = tf.ConfigProto(log_device_placement=False,device_count={'GPU': 0}) -tfconfig = tf.ConfigProto(log_device_placement=False) +tfconfig = tf.compat.v1.ConfigProto(log_device_placement=False) #tfconfig.gpu_options.allow_growth = True #tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.4 -with tf.Session(config=tfconfig) as session: +with tf.compat.v1.Session(config=tfconfig) as session: saver.restore(session, model_file) sys.stdout.flush() @@ -91,7 +91,7 @@ def run(fetches): return session.run(fetches, feed_dict={}) if dump is not None: - variables = dict((variable.name,variable) for variable in tf.trainable_variables()) + variables = dict((variable.name,variable) for variable in tf.compat.v1.trainable_variables()) for name in dump.split(","): variable = variables[name] variable = np.array(variable.eval()) @@ -116,13 +116,13 @@ def run(fetches): for x0 in range(variable.shape[0]))) if conv_norm_by_xy is not None: - variables = dict((variable.name,variable) for variable in tf.trainable_variables()) + variables = dict((variable.name,variable) for variable in tf.compat.v1.trainable_variables()) for name in conv_norm_by_xy.split(","): variable = variables[name] #Should be x,y,in_channels,out_channels assert(len(variable.shape) == 4) - norms = tf.sqrt(tf.reduce_mean(variable*variable,axis=[2,3])) + norms = tf.sqrt(tf.reduce_mean(input_tensor=variable*variable,axis=[2,3])) norms = np.array(run(norms)) print(name + " " + str(volume(variable)) + " parameters") for y in range(norms.shape[1]): @@ -131,7 +131,7 @@ def run(fetches): print("") if conv_norm_by_channel is not None: - variables = dict((variable.name,variable) for variable in tf.trainable_variables()) + variables = dict((variable.name,variable) for variable in tf.compat.v1.trainable_variables()) #Each convolution weight variable has a set of channels it takes in as input and a set of channels it produces #as output. This is a dictionary of the mapping. 
@@ -178,7 +178,7 @@ def run(fetches): #Should be x,y,in_channels,out_channels assert(len(variable.shape) == 4) - norm = tf.sqrt(tf.reduce_mean(variable*variable,axis=[0,1])) + norm = tf.sqrt(tf.reduce_mean(input_tensor=variable*variable,axis=[0,1])) norm = np.array(run(norm)) norms[var_name] = norm diff --git a/python/export_model.py b/python/export_model.py index bf01b8460..8ce1e63da 100644 --- a/python/export_model.py +++ b/python/export_model.py @@ -15,6 +15,7 @@ from model import Model, ModelUtils import common +tf.compat.v1.disable_eager_execution() #Command and args------------------------------------------------------------------- description = """ @@ -89,7 +90,7 @@ def log(s): sys.stderr.flush() if not for_cuda: - tf.train.write_graph(session.graph_def,export_dir,filename_prefix + ".graph.pb") + tf.io.write_graph(session.graph_def,export_dir,filename_prefix + ".graph.pb") savepath = export_dir + "/" + filename_prefix saver.save(session, savepath + ".weights") with open(savepath + ".config.json","w") as f: diff --git a/python/genboard_run.py b/python/genboard_run.py old mode 100755 new mode 100644 diff --git a/python/genboard_train.py b/python/genboard_train.py old mode 100755 new mode 100644 diff --git a/python/model.py b/python/model.py index 59b9070db..1f7e532a7 100644 --- a/python/model.py +++ b/python/model.py @@ -117,15 +117,15 @@ def __init__(self,config,pos_len,placeholders,is_training=False): def assert_batched_shape(self,name,tensor,shape): if (len(tensor.shape) != len(shape)+1 or - [int(tensor.shape[i+1].value) for i in range(len(shape))] != [int(x) for x in shape]): + [int(tensor.shape.as_list()[i+1]) for i in range(len(shape))] != [int(x) for x in shape]): raise Exception("%s should have shape %s after a batch dimension but instead it had shape %s" % ( - name, str(shape), str([str(x.value) for x in tensor.shape]))) + name, str(shape), str([str(x) for x in tensor.shape]))) def assert_shape(self,name,tensor,shape): if (len(tensor.shape) != len(shape) or - [int(x.value) for x in tensor.shape] != [int(x) for x in shape]): + [int(x) for x in tensor.shape] != [int(x) for x in shape]): raise Exception("%s should have shape %s but instead it had shape %s" % ( - name, str(shape), str([str(x.value) for x in tensor.shape]))) + name, str(shape), str([str(x) for x in tensor.shape]))) def xy_to_tensor_pos(self,x,y): return y * self.pos_len + x @@ -509,34 +509,34 @@ def add_lr_factor(self,name,factor): def batchnorm_and_mask(self,name,tensor,mask,mask_sum,use_gamma_in_fixup=False): if self.use_fixup: - self.batch_norms[name] = (tensor.shape[-1].value,1e-20,True,use_gamma_in_fixup,self.use_fixup) + self.batch_norms[name] = (tensor.shape.as_list()[-1],1e-20,True,use_gamma_in_fixup,self.use_fixup) if use_gamma_in_fixup: - gamma = self.weight_variable_init_constant(name+"/gamma", [tensor.shape[3].value], 1.0) - beta = self.weight_variable_init_constant(name+"/beta", [tensor.shape[3].value], 0.0, reg="tiny") + gamma = self.weight_variable_init_constant(name+"/gamma", [tensor.shape.as_list()[3]], 1.0) + beta = self.weight_variable_init_constant(name+"/beta", [tensor.shape.as_list()[3]], 0.0, reg="tiny") return (tensor * gamma + beta) * mask else: - beta = self.weight_variable_init_constant(name+"/beta", [tensor.shape[3].value], 0.0, reg="tiny") + beta = self.weight_variable_init_constant(name+"/beta", [tensor.shape.as_list()[3]], 0.0, reg="tiny") return (tensor + beta) * mask epsilon = 0.001 has_bias = True has_scale = False - self.batch_norms[name] = 
(tensor.shape[-1].value,epsilon,has_bias,has_scale,self.use_fixup) + self.batch_norms[name] = (tensor.shape.as_list()[-1],epsilon,has_bias,has_scale,self.use_fixup) - num_channels = tensor.shape[3].value + num_channels = tensor.shape.as_list()[3] collections = [tf.compat.v1.GraphKeys.GLOBAL_VARIABLES,tf.compat.v1.GraphKeys.MODEL_VARIABLES,tf.compat.v1.GraphKeys.MOVING_AVERAGE_VARIABLES] #Define variables to keep track of the mean and variance moving_mean = tf.compat.v1.get_variable(initializer=tf.zeros([num_channels]),name=(name+"/moving_mean"),trainable=False,collections=collections) moving_var = tf.compat.v1.get_variable(initializer=tf.ones([num_channels]),name=(name+"/moving_variance"),trainable=False,collections=collections) - beta = self.weight_variable_init_constant(name+"/beta", [tensor.shape[3].value], 0.0, reg=False) + beta = self.weight_variable_init_constant(name+"/beta", [tensor.shape.as_list()[3]], 0.0, reg=False) #This is the mean, computed only over exactly the areas of the mask, weighting each spot equally, #even across different elements in the batch that might have different board sizes. - mean = tf.reduce_sum(tensor * mask,axis=[0,1,2]) / mask_sum + mean = tf.reduce_sum(input_tensor=tensor * mask,axis=[0,1,2]) / mask_sum zmtensor = tensor-mean #Similarly, the variance computed exactly only over those spots - var = tf.reduce_sum(tf.square(zmtensor * mask),axis=[0,1,2]) / mask_sum + var = tf.reduce_sum(input_tensor=tf.square(zmtensor * mask),axis=[0,1,2]) / mask_sum with tf.compat.v1.variable_scope(name): mean_op = tf.keras.backend.moving_average_update(moving_mean,mean,0.998) @@ -550,24 +550,9 @@ def training_f(): def inference_f(): return (moving_mean,moving_var) - use_mean,use_var = tf.cond(self.is_training_tensor,training_f,inference_f) + use_mean,use_var = tf.cond(pred=self.is_training_tensor,true_fn=training_f,false_fn=inference_f) return tf.nn.batch_normalization(tensor,use_mean,use_var,beta,None,epsilon) * mask - # def batchnorm(self,name,tensor): - # epsilon = 0.001 - # has_bias = True - # has_scale = False - # self.batch_norms[name] = (tensor.shape[-1].value,epsilon,has_bias,has_scale) - # return tf.layers.batch_normalization( - # tensor, - # axis=-1, #Because channels are our last axis, -1 refers to that via wacky python indexing - # momentum=0.99, - # epsilon=epsilon, - # center=has_bias, - # scale=has_scale, - # training=self.is_training_tensor, - # name=name, - # ) def init_stdev(self,num_inputs,num_outputs): #xavier @@ -604,7 +589,7 @@ def weight_variable(self, name, shape, num_inputs, num_outputs, scale_initial_we return variable def conv2d(self, x, w): - return tf.nn.conv2d(x, w, strides=[1,1,1,1], padding='SAME') + return tf.nn.conv2d(input=x, filters=w, strides=[1,1,1,1], padding='SAME') def dilated_conv2d(self, x, w, dilation): return tf.nn.atrous_conv2d(x, w, rate = dilation, padding='SAME') @@ -616,31 +601,31 @@ def apply_symmetry(self,tensor,symmetries,inverse): if not inverse: tensor = tf.cond( - ud, - lambda: tf.reverse(tensor,[1]), - lambda: tensor + pred=ud, + true_fn=lambda: tf.reverse(tensor,[1]), + false_fn=lambda: tensor ) tensor = tf.cond( - lr, - lambda: tf.reverse(tensor,[2]), - lambda: tensor + pred=lr, + true_fn=lambda: tf.reverse(tensor,[2]), + false_fn=lambda: tensor ) tensor = tf.cond( - transp, - lambda: tf.transpose(tensor, [0,2,1,3]), - lambda: tensor) + pred=transp, + true_fn=lambda: tf.transpose(a=tensor, perm=[0,2,1,3]), + false_fn=lambda: tensor) if inverse: tensor = tf.cond( - ud, - lambda: tf.reverse(tensor,[1]), - lambda: 
tensor + pred=ud, + true_fn=lambda: tf.reverse(tensor,[1]), + false_fn=lambda: tensor ) tensor = tf.cond( - lr, - lambda: tf.reverse(tensor,[2]), - lambda: tensor + pred=lr, + true_fn=lambda: tf.reverse(tensor,[2]), + false_fn=lambda: tensor ) return tensor @@ -649,20 +634,14 @@ def apply_symmetry(self,tensor,symmetries,inverse): def relu(self, name, layer): assert(len(layer.shape) == 4) - #num_channels = layer.shape[3].value - #alphas = self.weight_variable_init_constant(name+"/relu",[1,1,1,num_channels],constant=0.0) return tf.nn.relu(layer) def relu_spatial1d(self, name, layer): assert(len(layer.shape) == 3) - #num_channels = layer.shape[1].value - #alphas = self.weight_variable_init_constant(name+"/relu",[1,num_channels],constant=0.0) return tf.nn.relu(layer) def relu_non_spatial(self, name, layer): assert(len(layer.shape) == 2) - #num_channels = layer.shape[1].value - #alphas = self.weight_variable_init_constant(name+"/relu",[1,num_channels],constant=0.0) return tf.nn.relu(layer) def merge_residual(self,name,trunk,residual): @@ -785,8 +764,8 @@ def global_pool(self, in_layer, mask_sum_hw, mask_sum_hw_sqrt): div = tf.reshape(mask_sum_hw,[-1,1,1,1]) div_sqrt = tf.reshape(mask_sum_hw_sqrt,[-1,1,1,1]) - layer_raw_mean = tf.reduce_sum(in_layer,axis=[1,2],keepdims=True) / div - layer_raw_max = tf.reduce_max(in_layer,axis=[1,2],keepdims=True) + layer_raw_mean = tf.reduce_sum(input_tensor=in_layer,axis=[1,2],keepdims=True) / div + layer_raw_max = tf.reduce_max(input_tensor=in_layer,axis=[1,2],keepdims=True) # 1, (x-14)/10, and (x-14)^2/100 - 0.1 are three orthogonal functions over [9,19], the range of reasonable board sizes. # We have the 14 in there since it's the midpoint of that range. The /10 is just sort of arbitrary normalization to keep things on the same scale. @@ -802,7 +781,7 @@ def value_head_pool(self, in_layer, mask_sum_hw, mask_sum_hw_sqrt): div = tf.reshape(mask_sum_hw,[-1,1]) div_sqrt = tf.reshape(mask_sum_hw_sqrt,[-1,1]) - layer_raw_mean = tf.reduce_sum(in_layer,axis=[1,2],keepdims=False) / div + layer_raw_mean = tf.reduce_sum(input_tensor=in_layer,axis=[1,2],keepdims=False) / div # 1, (x-14)/10, and (x-14)^2/100 - 0.1 are three orthogonal functions over [9,19], the range of reasonable board sizes. # We have the 14 in there since it's the midpoint of that range. 
The /10 and /100 are just sort of arbitrary normalization to keep things on the same scale @@ -931,17 +910,17 @@ def build_model(self,config,placeholders): assert(hist_matrix_base.dtype == tf.float32) assert(hist_matrix_builder.dtype == tf.float32) assert(len(hist_matrix_builder.shape) == 3) - assert(hist_matrix_builder.shape[0].value == 5) - assert(hist_matrix_builder.shape[1].value == self.num_bin_input_features) - assert(hist_matrix_builder.shape[2].value == self.num_bin_input_features) + assert(hist_matrix_builder.shape.as_list()[0] == 5) + assert(hist_matrix_builder.shape.as_list()[1] == self.num_bin_input_features) + assert(hist_matrix_builder.shape.as_list()[2] == self.num_bin_input_features) hist_filter_matrix = hist_matrix_base + tf.tensordot(include_history, hist_matrix_builder, axes=[[1],[0]]) #[batch,move] * [move,inc,outc] = [batch,inc,outc] cur_layer = tf.reshape(cur_layer,[-1,self.pos_len*self.pos_len,self.num_bin_input_features]) #[batch,xy,inc] cur_layer = tf.matmul(cur_layer,hist_filter_matrix) #[batch,xy,inc] * [batch,inc,outc] = [batch,xy,outc] cur_layer = tf.reshape(cur_layer,[-1,self.pos_len,self.pos_len,self.num_bin_input_features]) - assert(include_history.shape[1].value == 5) - transformed_global_inputs = global_inputs * tf.pad(include_history, [(0,0),(0,self.num_global_input_features - include_history.shape[1].value)], constant_values=1.0) + assert(include_history.shape.as_list()[1] == 5) + transformed_global_inputs = global_inputs * tf.pad(tensor=include_history, paddings=[(0,0),(0,self.num_global_input_features - include_history.shape.as_list()[1])], constant_values=1.0) self.transformed_bin_inputs = cur_layer self.transformed_global_inputs = transformed_global_inputs @@ -962,8 +941,8 @@ def build_model(self,config,placeholders): self.gpool_num_channels = gpool_num_channels mask = cur_layer[:,:,:,0:1] - mask_sum = tf.reduce_sum(mask) # Global sum - mask_sum_hw = tf.reduce_sum(mask,axis=[1,2,3]) # Sum per batch element + mask_sum = tf.reduce_sum(input_tensor=mask) # Global sum + mask_sum_hw = tf.reduce_sum(input_tensor=mask,axis=[1,2,3]) # Sum per batch element mask_sum_hw_sqrt = tf.sqrt(mask_sum_hw) #Initial convolutional layer------------------------------------------------------------------------------------- @@ -1137,7 +1116,7 @@ def scaletransform(tensor): #tf.where has a bug where nan values on the non-chosen side will still propagate nans back in gradients. 
#So we also abs the tensor, so that we never get a log of a negative value abstensor = tf.abs(tensor) - return tf.where(tensor > 0, 1.0 + tf.math.log(abstensor + 1.0), 1.0 / (1.0 + tf.math.log(abstensor + 1.0))) + return tf.compat.v1.where(tensor > 0, 1.0 + tf.math.log(abstensor + 1.0), 1.0 / (1.0 + tf.math.log(abstensor + 1.0))) scorebelief_len = self.scorebelief_target_shape[0] scorebelief_mid = self.pos_len*self.pos_len+Model.EXTRA_SCORE_DISTR_RADIUS @@ -1235,7 +1214,7 @@ def scaletransform(tensor): def huber_loss(x,y,delta): absdiff = tf.abs(x - y) - return tf.where(absdiff > delta, (0.5 * delta*delta) + delta * (absdiff - delta), 0.5 * absdiff * absdiff) + return tf.compat.v1.where(absdiff > delta, (0.5 * delta*delta) + delta * (absdiff - delta), 0.5 * absdiff * absdiff) class Target_vars: @@ -1342,42 +1321,42 @@ def __init__(self,model,for_optimization,placeholders): self.policy_loss_unreduced = self.policy_target_weight * ( - tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.policy_target, logits=policy_output[:,:,0]) + tf.nn.softmax_cross_entropy_with_logits(labels=self.policy_target, logits=policy_output[:,:,0]) ) self.policy1_loss_unreduced = self.policy_target_weight1 * 0.15 * ( - tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.policy_target1, logits=policy_output[:,:,1]) + tf.nn.softmax_cross_entropy_with_logits(labels=self.policy_target1, logits=policy_output[:,:,1]) ) - self.value_loss_unreduced = 1.20 * tf.nn.softmax_cross_entropy_with_logits_v2( + self.value_loss_unreduced = 1.20 * tf.nn.softmax_cross_entropy_with_logits( labels=self.value_target, logits=value_output ) self.td_value_loss_unreduced = tf.constant([0.55,0.55,0.15],dtype=tf.float32) * ( - tf.nn.softmax_cross_entropy_with_logits_v2( + tf.nn.softmax_cross_entropy_with_logits( labels=self.td_value_target, logits=td_value_prediction ) - # Subtract out the entropy, so as to get loss 0 at perfect prediction - tf.nn.softmax_cross_entropy_with_logits_v2( + tf.nn.softmax_cross_entropy_with_logits( labels=self.td_value_target, logits=tf.math.log(self.td_value_target + 1.0e-30) ) ) - self.td_value_loss_unreduced = tf.reduce_sum(self.td_value_loss_unreduced, axis=1) + self.td_value_loss_unreduced = tf.reduce_sum(input_tensor=self.td_value_loss_unreduced, axis=1) self.td_score_loss_unreduced = 0.0004 * self.ownership_target_weight * ( - tf.reduce_sum(huber_loss(self.td_score_target, td_score_prediction, delta = 12.0), axis=1) + tf.reduce_sum(input_tensor=huber_loss(self.td_score_target, td_score_prediction, delta = 12.0), axis=1) ) self.scorebelief_cdf_loss_unreduced = 0.020 * self.ownership_target_weight * ( tf.reduce_sum( - tf.square(tf.cumsum(self.scorebelief_target,axis=1) - tf.cumsum(tf.nn.softmax(scorebelief_output,axis=1),axis=1)), + input_tensor=tf.square(tf.cumsum(self.scorebelief_target,axis=1) - tf.cumsum(tf.nn.softmax(scorebelief_output,axis=1),axis=1)), axis=1 ) ) self.scorebelief_pdf_loss_unreduced = 0.020 * self.ownership_target_weight * ( - tf.nn.softmax_cross_entropy_with_logits_v2( + tf.nn.softmax_cross_entropy_with_logits( labels=self.scorebelief_target, logits=scorebelief_output ) @@ -1388,7 +1367,7 @@ def __init__(self,model,for_optimization,placeholders): #Not unlike the way that policy and value loss are also equal-weighted by batch element. 
self.ownership_loss_unreduced = 1.5 * self.ownership_target_weight * ( tf.reduce_sum( - tf.nn.softmax_cross_entropy_with_logits_v2( + input_tensor=tf.nn.softmax_cross_entropy_with_logits( labels=tf.stack([(1+self.ownership_target)/2,(1-self.ownership_target)/2],axis=3), logits=tf.stack([ownership_output,-ownership_output],axis=3) ) * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]), @@ -1398,7 +1377,7 @@ def __init__(self,model,for_optimization,placeholders): self.scoring_loss_unreduced = 1.0 * self.scoring_target_weight * ( tf.reduce_sum( - tf.square(self.scoring_target - scoring_output) * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]), + input_tensor=tf.square(self.scoring_target - scoring_output) * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]), axis=[1,2] ) / model.mask_sum_hw ) @@ -1416,7 +1395,7 @@ def __init__(self,model,for_optimization,placeholders): #due to simply being farther in the future, so multiply by [1,0.25]. self.futurepos_loss_unreduced = 0.25 * self.futurepos_target_weight * ( tf.reduce_sum( - tf.square(tf.tanh(futurepos_output) - self.futurepos_target) + input_tensor=tf.square(tf.tanh(futurepos_output) - self.futurepos_target) * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len,1]) * tf.reshape(tf.constant([1,0.25],dtype=tf.float32),[1,1,1,2]), axis=[1,2,3] @@ -1427,10 +1406,10 @@ def __init__(self,model,for_optimization,placeholders): owned_target = tf.square(self.ownership_target) unowned_target = 1.0 - owned_target unowned_proportion = ( - tf.reduce_sum(unowned_target * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]),axis=[1,2]) - / (1.0 + tf.reduce_sum(tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]),axis=[1,2])) + tf.reduce_sum(input_tensor=unowned_target * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]),axis=[1,2]) + / (1.0 + tf.reduce_sum(input_tensor=tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]),axis=[1,2])) ) - unowned_proportion = tf.reduce_mean(unowned_proportion * self.ownership_target_weight) + unowned_proportion = tf.reduce_mean(input_tensor=unowned_proportion * self.ownership_target_weight) if model.is_training: moving_unowned_proportion = tf.compat.v1.get_variable(initializer=1.0,name=("moving_unowned_proportion"),trainable=False) moving_unowned_op = tf.keras.backend.moving_average_update(moving_unowned_proportion,unowned_proportion,0.998) @@ -1441,7 +1420,7 @@ def __init__(self,model,for_optimization,placeholders): self.seki_loss_unreduced = ( tf.reduce_sum( - tf.nn.softmax_cross_entropy_with_logits_v2( + input_tensor=tf.nn.softmax_cross_entropy_with_logits( labels=tf.stack([1.0-tf.square(self.seki_target), tf.nn.relu(self.seki_target), tf.nn.relu(-self.seki_target)],axis=3), logits=seki_output[:,:,:,0:3] ) * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]), @@ -1450,7 +1429,7 @@ def __init__(self,model,for_optimization,placeholders): ) self.seki_loss_unreduced = self.seki_loss_unreduced + 0.5 * ( tf.reduce_sum( - tf.nn.softmax_cross_entropy_with_logits_v2( + input_tensor=tf.nn.softmax_cross_entropy_with_logits( labels=tf.stack([unowned_target, owned_target],axis=3), logits=tf.stack([seki_output[:,:,:,3],tf.zeros_like(self.ownership_target)],axis=3) ) * tf.reshape(model.mask_before_symmetry,[-1,model.pos_len,model.pos_len]), @@ -1461,7 +1440,7 @@ def __init__(self,model,for_optimization,placeholders): self.seki_weight_scale = 
seki_weight_scale #This is conditional upon there being a result - expected_score_from_belief = tf.reduce_sum(scorebelief_probs * model.score_belief_offset_vector,axis=1) + expected_score_from_belief = tf.reduce_sum(input_tensor=scorebelief_probs * model.score_belief_offset_vector,axis=1) #Huber will incentivize this to not actually converge to the mean, but rather something meanlike locally and something medianlike #for very large possible losses. This seems... okay - it might actually be what users want. @@ -1470,7 +1449,7 @@ def __init__(self,model,for_optimization,placeholders): self.variance_time_loss_unreduced = 0.0003 * huber_loss(self.variance_time_target, variance_time_prediction, delta = 50.0) stdev_of_belief = tf.sqrt(0.001 + tf.reduce_sum( - scorebelief_probs * tf.square( + input_tensor=scorebelief_probs * tf.square( tf.reshape(model.score_belief_offset_vector,[1,-1]) - tf.reshape(expected_score_from_belief,[-1,1]) ),axis=1)) beliefstdevdiff = stdev_of_belief - scorestdev_prediction @@ -1512,27 +1491,27 @@ def __init__(self,model,for_optimization,placeholders): self.scale_reg_loss_unreduced = tf.reshape(0.0004 * tf.add_n([tf.square(variable) for variable in model.prescale_variables]), [-1]) #self.scale_reg_loss_unreduced = tf.zeros_like(self.winloss_reg_loss_unreduced) - self.policy_loss = tf.reduce_sum(self.target_weight_used * self.policy_loss_unreduced, name="losses/policy_loss") - self.policy1_loss = tf.reduce_sum(self.target_weight_used * self.policy1_loss_unreduced, name="losses/policy1_loss") - self.value_loss = tf.reduce_sum(self.target_weight_used * self.value_loss_unreduced, name="losses/value_loss") - self.td_value_loss = tf.reduce_sum(self.target_weight_used * self.td_value_loss_unreduced, name="losses/td_value_loss") - self.td_score_loss = tf.reduce_sum(self.target_weight_used * self.td_score_loss_unreduced, name="losses/td_score_loss") - self.scoremean_loss = tf.reduce_sum(self.target_weight_used * self.scoremean_loss_unreduced, name="losses/scoremean_loss") - self.lead_loss = tf.reduce_sum(self.target_weight_used * self.lead_loss_unreduced, name="losses/lead_loss") - self.variance_time_loss = tf.reduce_sum(self.target_weight_used * self.variance_time_loss_unreduced, name="losses/variance_time_loss") - self.scorebelief_pdf_loss = tf.reduce_sum(self.target_weight_used * self.scorebelief_pdf_loss_unreduced, name="losses/scorebelief_pdf_loss") - self.scorebelief_cdf_loss = tf.reduce_sum(self.target_weight_used * self.scorebelief_cdf_loss_unreduced, name="losses/scorebelief_cdf_loss") - self.ownership_loss = tf.reduce_sum(self.target_weight_used * self.ownership_loss_unreduced, name="losses/ownership_loss") - self.scoring_loss = tf.reduce_sum(self.target_weight_used * self.scoring_loss_unreduced, name="losses/scoring_loss") - self.futurepos_loss = tf.reduce_sum(self.target_weight_used * self.futurepos_loss_unreduced, name="losses/futurepos_loss") - self.seki_loss = tf.reduce_sum(self.target_weight_used * self.seki_loss_unreduced, name="losses/seki_loss") - self.scorestdev_reg_loss = tf.reduce_sum(self.target_weight_used * self.scorestdev_reg_loss_unreduced, name="losses/scorestdev_reg_loss") - self.shortterm_value_error_loss = tf.reduce_sum(self.target_weight_used * self.shortterm_value_error_loss_unreduced, name="losses/sloss") - self.shortterm_score_error_loss = tf.reduce_sum(self.target_weight_used * self.shortterm_score_error_loss_unreduced, name="losses/shortterm_score_error_loss") + self.policy_loss = tf.reduce_sum(input_tensor=self.target_weight_used * 
self.policy_loss_unreduced, name="losses/policy_loss") + self.policy1_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.policy1_loss_unreduced, name="losses/policy1_loss") + self.value_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.value_loss_unreduced, name="losses/value_loss") + self.td_value_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.td_value_loss_unreduced, name="losses/td_value_loss") + self.td_score_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.td_score_loss_unreduced, name="losses/td_score_loss") + self.scoremean_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.scoremean_loss_unreduced, name="losses/scoremean_loss") + self.lead_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.lead_loss_unreduced, name="losses/lead_loss") + self.variance_time_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.variance_time_loss_unreduced, name="losses/variance_time_loss") + self.scorebelief_pdf_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.scorebelief_pdf_loss_unreduced, name="losses/scorebelief_pdf_loss") + self.scorebelief_cdf_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.scorebelief_cdf_loss_unreduced, name="losses/scorebelief_cdf_loss") + self.ownership_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.ownership_loss_unreduced, name="losses/ownership_loss") + self.scoring_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.scoring_loss_unreduced, name="losses/scoring_loss") + self.futurepos_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.futurepos_loss_unreduced, name="losses/futurepos_loss") + self.seki_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.seki_loss_unreduced, name="losses/seki_loss") + self.scorestdev_reg_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.scorestdev_reg_loss_unreduced, name="losses/scorestdev_reg_loss") + self.shortterm_value_error_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.shortterm_value_error_loss_unreduced, name="losses/sloss") + self.shortterm_score_error_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.shortterm_score_error_loss_unreduced, name="losses/shortterm_score_error_loss") # self.winloss_reg_loss = tf.reduce_sum(self.target_weight_used * self.winloss_reg_loss_unreduced, name="losses/winloss_reg_loss") - self.scale_reg_loss = tf.reduce_sum(self.target_weight_used * self.scale_reg_loss_unreduced, name="losses/scale_reg_loss") + self.scale_reg_loss = tf.reduce_sum(input_tensor=self.target_weight_used * self.scale_reg_loss_unreduced, name="losses/scale_reg_loss") - self.weight_sum = tf.reduce_sum(self.target_weight_used, name="losses/weight_sum") + self.weight_sum = tf.reduce_sum(input_tensor=self.target_weight_used, name="losses/weight_sum") if for_optimization: #Prior/Regularization @@ -1567,35 +1546,24 @@ def __init__(self,model,for_optimization,placeholders): self.scale_reg_loss ) - # self.opt_loss = tf.Print( - # self.opt_loss, - # [self.value_target[0:10], - # self.scorevalue_target[0:10], - # self.scorebelief_target[0] * tf.constant(model.score_belief_offset_vector,dtype=tf.float32), - # self.ownership_target_weight[0:10], - # self.selfkomi[0:10], - # ], - # summarize=2000 - # ) - class Metrics: def __init__(self,model,target_vars,include_debug_stats): #Training results - policy_target_idxs = tf.argmax(target_vars.policy_target, 1) - self.top1_prediction = 
tf.equal(tf.argmax(model.policy_output[:,:,0], 1), policy_target_idxs) - self.top4_prediction = tf.nn.in_top_k(model.policy_output[:,:,0],policy_target_idxs,4) + policy_target_idxs = tf.argmax(input=target_vars.policy_target, axis=1) + self.top1_prediction = tf.equal(tf.argmax(input=model.policy_output[:,:,0], axis=1), policy_target_idxs) + self.top4_prediction = tf.nn.in_top_k(predictions=model.policy_output[:,:,0],targets=policy_target_idxs,k=4) self.accuracy1_unreduced = tf.cast(self.top1_prediction, tf.float32) self.accuracy4_unreduced = tf.cast(self.top4_prediction, tf.float32) - self.value_entropy_unreduced = tf.nn.softmax_cross_entropy_with_logits_v2(labels=tf.nn.softmax(model.value_output,axis=1), logits=model.value_output) + self.value_entropy_unreduced = tf.nn.softmax_cross_entropy_with_logits(labels=tf.nn.softmax(model.value_output,axis=1), logits=model.value_output) self.value_conf_unreduced = 4 * tf.square(tf.nn.sigmoid(model.value_output[:,0] - model.value_output[:,1]) - 0.5) self.policy_target_entropy_unreduced = target_vars.policy_target_weight * ( - -tf.reduce_sum(target_vars.policy_target * tf.math.log(target_vars.policy_target+(1e-20)), axis=1) + -tf.reduce_sum(input_tensor=target_vars.policy_target * tf.math.log(target_vars.policy_target+(1e-20)), axis=1) ) - self.accuracy1 = tf.reduce_sum(target_vars.target_weight_used * self.accuracy1_unreduced, name="metrics/accuracy1") - self.accuracy4 = tf.reduce_sum(target_vars.target_weight_used * self.accuracy4_unreduced, name="metrics/accuracy4") - self.value_entropy = tf.reduce_sum(target_vars.target_weight_used * self.value_entropy_unreduced, name="metrics/value_entropy") - self.value_conf = tf.reduce_sum(target_vars.target_weight_used * self.value_conf_unreduced, name="metrics/value_conf") - self.policy_target_entropy = tf.reduce_sum(target_vars.target_weight_used * self.policy_target_entropy_unreduced, name="metrics/policy_target_entropy") + self.accuracy1 = tf.reduce_sum(input_tensor=target_vars.target_weight_used * self.accuracy1_unreduced, name="metrics/accuracy1") + self.accuracy4 = tf.reduce_sum(input_tensor=target_vars.target_weight_used * self.accuracy4_unreduced, name="metrics/accuracy4") + self.value_entropy = tf.reduce_sum(input_tensor=target_vars.target_weight_used * self.value_entropy_unreduced, name="metrics/value_entropy") + self.value_conf = tf.reduce_sum(input_tensor=target_vars.target_weight_used * self.value_conf_unreduced, name="metrics/value_conf") + self.policy_target_entropy = tf.reduce_sum(input_tensor=target_vars.target_weight_used * self.policy_target_entropy_unreduced, name="metrics/policy_target_entropy") # self.shortterm_value_error_mean_unreduced = target_vars.shortterm_diff_value # self.shortterm_score_error_mean_unreduced = target_vars.shortterm_diff_score @@ -1607,24 +1575,24 @@ def __init__(self,model,target_vars,include_debug_stats): if include_debug_stats: def reduce_norm(x, axis=None, keepdims=False): - return tf.sqrt(tf.reduce_mean(tf.square(x), axis=axis, keepdims=keepdims)) + return tf.sqrt(tf.reduce_mean(input_tensor=tf.square(x), axis=axis, keepdims=keepdims)) def reduce_stdev(x, axis=None, keepdims=False): - m = tf.reduce_mean(x, axis=axis, keepdims=True) + m = tf.reduce_mean(input_tensor=x, axis=axis, keepdims=True) devs_squared = tf.square(x - m) - return tf.sqrt(tf.reduce_mean(devs_squared, axis=axis, keepdims=keepdims)) + return tf.sqrt(tf.reduce_mean(input_tensor=devs_squared, axis=axis, keepdims=keepdims)) self.activated_prop_by_layer = dict([ - 
(name,tf.reduce_mean(tf.count_nonzero(layer,axis=[1,2])/layer.shape[1].value/layer.shape[2].value, axis=0)) for (name,layer) in model.outputs_by_layer + (name,tf.reduce_mean(input_tensor=tf.math.count_nonzero(layer,axis=[1,2])/layer.shape.as_list()[1]/layer.shape.as_list()[2], axis=0)) for (name,layer) in model.outputs_by_layer ]) self.mean_output_by_layer = dict([ - (name,tf.reduce_mean(layer,axis=[0,1,2])) for (name,layer) in model.outputs_by_layer + (name,tf.reduce_mean(input_tensor=layer,axis=[0,1,2])) for (name,layer) in model.outputs_by_layer ]) self.stdev_output_by_layer = dict([ (name,reduce_stdev(layer,axis=[0,1,2])) for (name,layer) in model.outputs_by_layer ]) self.mean_weights_by_var = dict([ - (v.name,tf.reduce_mean(v)) for v in tf.compat.v1.trainable_variables() + (v.name,tf.reduce_mean(input_tensor=v)) for v in tf.compat.v1.trainable_variables() ]) self.norm_weights_by_var = dict([ (v.name,reduce_norm(v)) for v in tf.compat.v1.trainable_variables() @@ -1638,7 +1606,7 @@ def print_trainable_variables(logf): shape = variable.get_shape() variable_parameters = 1 for dim in shape: - variable_parameters *= dim.value + variable_parameters *= dim total_parameters += variable_parameters logf("Model variable: %s, %d parameters" % (variable.name,variable_parameters)) @@ -1664,7 +1632,7 @@ def build_model_from_tfrecords_features(features,mode,print_model,trainlog,model bitmasks = tf.reshape(tf.constant([128,64,32,16,8,4,2,1],dtype=tf.uint8),[1,1,1,8]) binchw = tf.reshape(tf.bitwise.bitwise_and(tf.expand_dims(binchwp,axis=3),bitmasks),[-1,num_bin_input_features,((pos_len*pos_len+7)//8)*8]) binchw = binchw[:,:,:pos_len*pos_len] - binhwc = tf.cast(tf.transpose(binchw, [0,2,1]),tf.float32) + binhwc = tf.cast(tf.transpose(a=binchw, perm=[0,2,1]),tf.float32) binhwc = tf.math.minimum(binhwc,tf.constant(1.0)) placeholders["bin_inputs"] = binhwc @@ -1679,11 +1647,11 @@ def build_model_from_tfrecords_features(features,mode,print_model,trainlog,model placeholders["include_history"] = features["gtnc"][:,36:41] policy_target0 = features["ptncm"][:,0,:] - policy_target0 = policy_target0 / tf.reduce_sum(policy_target0,axis=1,keepdims=True) + policy_target0 = policy_target0 / tf.reduce_sum(input_tensor=policy_target0,axis=1,keepdims=True) placeholders["policy_target"] = policy_target0 placeholders["policy_target_weight"] = features["gtnc"][:,26] policy_target1 = features["ptncm"][:,1,:] - policy_target1 = policy_target1 / tf.reduce_sum(policy_target1,axis=1,keepdims=True) + policy_target1 = policy_target1 / tf.reduce_sum(input_tensor=policy_target1,axis=1,keepdims=True) placeholders["policy_target1"] = policy_target1 placeholders["policy_target_weight1"] = features["gtnc"][:,28] @@ -1696,7 +1664,7 @@ def build_model_from_tfrecords_features(features,mode,print_model,trainlog,model placeholders["scorebelief_target"] = features["sdn"] / 100.0 placeholders["ownership_target"] = features["vtnchw"][:,0] placeholders["scoring_target"] = features["vtnchw"][:,4] / 120.0 - placeholders["futurepos_target"] = tf.transpose(features["vtnchw"][:,2:4], [0,2,3,1]) + placeholders["futurepos_target"] = tf.transpose(a=features["vtnchw"][:,2:4], perm=[0,2,3,1]) placeholders["seki_target"] = features["vtnchw"][:,1] placeholders["target_weight_from_data"] = features["gtnc"][:,25] diff --git a/python/shuffle.py b/python/shuffle.py old mode 100755 new mode 100644 index 33402cf85..140150743 --- a/python/shuffle.py +++ b/python/shuffle.py @@ -15,7 +15,8 @@ import numpy as np import tensorflow as tf -from tensorflow.python_io 
import TFRecordOptions,TFRecordCompressionType,TFRecordWriter +from tensorflow.compat.v1.io import TFRecordCompressionType +from tensorflow.io import TFRecordOptions,TFRecordWriter import tfrecordio diff --git a/python/test.py b/python/test.py index b01cea900..72a355c00 100644 --- a/python/test.py +++ b/python/test.py @@ -65,7 +65,7 @@ def log(s): dataset = dataset.flat_map(lambda fname: tf.data.TFRecordDataset(fname,compression_type="ZLIB")) parse_input = tfrecordio.make_tf_record_parser(model_config,pos_len,batch_size) dataset = dataset.map(parse_input) - iterator = dataset.make_one_shot_iterator() + iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) features = iterator.get_next() elif using_npz: features = tfrecordio.make_raw_input_feature_placeholders(model_config,pos_len,batch_size) diff --git a/python/tfrecordio.py b/python/tfrecordio.py index a091c96a9..4289a5f47 100644 --- a/python/tfrecordio.py +++ b/python/tfrecordio.py @@ -40,8 +40,8 @@ def make_tf_record_parser(model_config,pos_len,batch_size,multi_num_gpus=None): raw_input_features = make_raw_input_features(model_config,pos_len,batch_size) def parse_input(serialized_example): - example = tf.io.parse_single_example(serialized_example,raw_input_features) - binchwp = tf.decode_raw(example["binchwp"],tf.uint8) + example = tf.io.parse_single_example(serialized=serialized_example,features=raw_input_features) + binchwp = tf.io.decode_raw(example["binchwp"],tf.uint8) ginc = example["ginc"] ptncm = example["ptncm"] gtnc = example["gtnc"] diff --git a/python/train.py b/python/train.py old mode 100755 new mode 100644 index 69c2c0040..39bc435ab --- a/python/train.py +++ b/python/train.py @@ -24,6 +24,8 @@ import modelconfigs import tfrecordio + +tf.compat.v1.disable_eager_execution() #Command and args------------------------------------------------------------------- description = """ @@ -163,7 +165,7 @@ def trainlog(s): placeholder = tf.compat.v1.placeholder(variable.dtype,variable.shape) assign_ops.append(tf.compat.v1.assign(variable,placeholder)) swa_assign_placeholders[variable.name] = placeholder - swa_wvalues[variable.name] = np.zeros([elt.value for elt in variable.shape]) + swa_wvalues[variable.name] = np.zeros([elt for elt in variable.shape]) swa_assign_op = tf.group(*assign_ops) trainlog("Build SWA graph for SWA update and saving, %d variables" % len(swa_assign_placeholders)) @@ -243,7 +245,7 @@ def model_fn(features,labels,mode,params): synchronization=tf.VariableSynchronization.ON_READ, aggregation=tf.VariableAggregation.SUM ) - wsum_op = tf.assign_add(wsum,target_vars.weight_sum) + wsum_op = tf.compat.v1.assign_add(wsum,target_vars.weight_sum) eval_metric_ops={ #"wsum": (wsum.read_value(),wsum_op), "p0loss": tf.compat.v1.metrics.mean(target_vars.policy_loss_unreduced, weights=target_vars.target_weight_used), @@ -282,8 +284,8 @@ def model_fn(features,labels,mode,params): printed_model_yet = True def moving_mean(name,x,weights): - sumwx = tf.reduce_sum(x*weights,name="printstats/wx/"+name) - sumw = tf.reduce_sum(weights,name="printstats/w/"+name) + sumwx = tf.reduce_sum(input_tensor=x*weights,name="printstats/wx/"+name) + sumw = tf.reduce_sum(input_tensor=weights,name="printstats/w/"+name) moving_wx = tf.compat.v1.get_variable(initializer=tf.zeros([]),name=(name+"/moving_wx"),trainable=False) moving_w = tf.compat.v1.get_variable(initializer=tf.zeros([]),name=(name+"/moving_w"),trainable=False) diff --git a/python/visualize.py b/python/visualize.py index 14ac47d4c..1605880ca 100644 --- a/python/visualize.py +++ 
b/python/visualize.py @@ -46,7 +46,7 @@ def log(s): pos_len = 19 # shouldn't matter, all we're doing is exporting weights that don't depend on this if name_scope is not None: - with tf.name_scope(name_scope): + with tf.compat.v1.name_scope(name_scope): model = Model(model_config,pos_len,{}) else: model = Model(model_config,pos_len,{}) @@ -59,7 +59,7 @@ def volume(variable): return variable_parameters total_parameters = 0 -for variable in tf.global_variables(): +for variable in tf.compat.v1.global_variables(): variable_parameters = volume(variable) total_parameters += variable_parameters log("Model variable %s, %d parameters" % (variable.name,variable_parameters)) @@ -120,7 +120,7 @@ def run(fetches): if show_all_weight_magnitudes: print("name,sumsq,l2regstrength,meansq,rms") - for variable in tf.trainable_variables(): + for variable in tf.compat.v1.trainable_variables(): values = np.array(variable.eval()) sq = np.square(values) reg = np.sum(sq) if any(v.name == variable.name for v in model.reg_variables) else 0
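
The score-belief hunks in the loss section reduce a categorical belief over score bins to a mean and a standard deviation, with a small epsilon inside the square root for numerical stability. A minimal NumPy sketch of that computation, using illustrative names (`probs`, `offsets`) rather than the model's actual tensors:

import numpy as np

# probs: (batch, num_bins) categorical score belief, each row sums to 1
# offsets: (num_bins,) score value represented by each bin
probs = np.array([[0.2, 0.5, 0.3]])
offsets = np.array([-1.0, 0.0, 1.0])

expected_score = np.sum(probs * offsets, axis=1)
stdev_of_belief = np.sqrt(0.001 + np.sum(
    probs * np.square(offsets[None, :] - expected_score[:, None]),
    axis=1))
# i.e. sqrt(eps + sum_i p_i * (x_i - mean)^2), matching the TF expression above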
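
Every per-head loss above is reduced the same way: multiply the unreduced per-sample loss by `target_weight_used`, sum over the batch, and track `weight_sum` separately so a weighted mean can be recovered when reporting. A rough TF2 sketch of that pattern, with illustrative names:

import tensorflow as tf

def weighted_loss_sum(per_sample_loss, sample_weight, name):
    # Sum of weight * loss, as in the per-head reductions above
    return tf.reduce_sum(input_tensor=sample_weight * per_sample_loss, name=name)

per_sample_loss = tf.constant([0.5, 2.0, 1.0])
sample_weight = tf.constant([1.0, 0.0, 1.0])

loss_sum = weighted_loss_sum(per_sample_loss, sample_weight, "losses/example_loss")
weight_sum = tf.reduce_sum(input_tensor=sample_weight)
weighted_mean = loss_sum / weight_sum  # 0.75 here; rows with weight 0 drop out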
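
The `value_entropy` metric feeds the model's own softmax back in as the labels of `softmax_cross_entropy_with_logits`; cross-entropy of a distribution with itself is just its Shannon entropy, so this measures how uncertain the value head is. A short check of that identity (the logits here are made up):

import tensorflow as tf

logits = tf.constant([[2.0, 0.5, -1.0]])
probs = tf.nn.softmax(logits, axis=1)

xent = tf.nn.softmax_cross_entropy_with_logits(labels=probs, logits=logits)
entropy = -tf.reduce_sum(input_tensor=probs * tf.math.log(probs), axis=1)
# xent and entropy agree up to floating point rounding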
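
Several hunks drop `.value` (e.g. `dim.value` → `dim`, `elt.value` → `elt`, `layer.shape[1].value` → `layer.shape.as_list()[1]`). Under TF2 behavior a `TensorShape` yields plain ints (or `None`) instead of `Dimension` objects, so `.value` no longer exists. A quick illustration, assuming TF2:

import numpy as np
import tensorflow as tf

v = tf.Variable(tf.zeros([3, 5]))

n_params = 1
for dim in v.shape:
    n_params *= dim             # dim is already an int in TF2
assert n_params == 15

shape_list = v.shape.as_list()  # plain Python list, e.g. [3, 5]
weights_accum = np.zeros(shape_list)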
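
The `train.py`, `test.py`, and `tfrecordio.py` hunks keep the TF1-style graph input pipeline working under TF2 by disabling eager execution and routing the removed symbols through `tf.compat.v1` and `tf.io`. A condensed sketch of that combination; the path and feature spec below are placeholders, not the project's real ones:

import tensorflow as tf

tf.compat.v1.disable_eager_execution()  # as at the top of train.py

feature_spec = {"binchwp": tf.io.FixedLenFeature([], tf.string)}

def parse_input(serialized_example):
    example = tf.io.parse_single_example(serialized=serialized_example, features=feature_spec)
    return tf.io.decode_raw(example["binchwp"], tf.uint8)

dataset = tf.data.TFRecordDataset("data.tfrecord", compression_type="ZLIB")  # placeholder path
dataset = dataset.map(parse_input)
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
next_batch = iterator.get_next()

with tf.compat.v1.Session() as session:
    batch = session.run(next_batch)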