Skip to content

Commit

Permalink
simpler cache, md5 id, gridshare touching, resolves #1282
Browse files Browse the repository at this point in the history
  • Loading branch information
MontrealSergiy committed Mar 15, 2024
1 parent deb6998 commit fe078ea
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 69 deletions.
38 changes: 0 additions & 38 deletions Bourreau/lib/bourreau_system_checks.rb
Original file line number Diff line number Diff line change
Expand Up @@ -485,44 +485,6 @@ def self.a100_ensure_dp_cache_symlink_exists #:nodoc:



# Touch to avoid deletion by the cluster's bimonthly scratch cleanup: 1) DataProvider cache dir, 2) DP_Cache_Key.md5, 3) DP_Cache_Rev.id,
# 4) gridshare dir, and 5) the DP_Cache symbolic link located in it.
# The gridshare and cache_dir are typically updated often, but we touch them just in case.
def self.a105_ensure_dp_cache_and_symlink_will_exists #:nodoc:
  # Refreshes the timestamps of the gridshare directory, the DataProvider
  # cache directory, its ID and MD5 files, and finally the cache symlink
  # itself. Progress and failures are reported on stdout; returns nil.

  myself        = RemoteResource.current_resource
  cache_dir     = myself.dp_cache_dir
  dp_cache_id   = File.join cache_dir, DataProvider::DP_CACHE_ID_FILE
  dp_cache_md5  = File.join cache_dir, DataProvider::DP_CACHE_MD5_FILE
  gridshare_dir = myself.cms_shared_dir
  sym_path      = File.join gridshare_dir, DataProvider::DP_CACHE_SYML

  puts "C> Updating timestamp for cache folder as well as its symlink, MD5 and ID files"

  begin
    # nocreate: only existing entries are refreshed, nothing is created.
    FileUtils.touch [gridshare_dir, cache_dir, dp_cache_id, dp_cache_md5], verbose: true, nocreate: true
    # files still might be deleted if a bourreau is not rebooted for a long time
    # some cluster can have policies countering touch abuse
    # touch command may fail for many reasons, e.g. resource issues
    # sometimes touch might fails even if timestamp update is successful
  rescue => e
    puts "C> Cache MD5 and ID files timestamp update FAILED: " + e.message
    return
  end

  # Update the timestamp of the symlink itself (-h) rather than the folder it
  # points to. The path is passed as a separate argument so that no shell is
  # involved: whitespace or shell metacharacters in the path can neither split
  # the argument nor be interpreted as commands.
  if system("touch", "-h", sym_path)
    puts "C> Timestamps are updated."
  else
    puts "C> Cache symlink timestamp update FAILED!!!"
    puts "C> Try to recreate the symlink manually!" # older version of touch or unix do no support symlink updates
  end

  # NOTE(review): this guard does not change the outcome (the method returns
  # nil either way); its only observable effect is that File.lstat raises when
  # the symlink is missing. Presumably a hard failure was intended when the
  # symlink is more than a day old -- confirm before changing.
  return if Time.now - File.lstat(sym_path).mtime > 1.day # fail only if symlink is seriously outdated
end



def self.a110_ensure_task_class_git_commits_cached

#----------------------------------------------------------------------------
Expand Down
38 changes: 37 additions & 1 deletion BrainPortal/app/models/data_provider.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1353,7 +1353,44 @@ def self.cleanup_leftover_cache_files(do_it=false, options={})
end
end

# Updates the time stamp for important auxiliary directories and files
# as workaround for HPC file deletion policies.
#
# Some Bourreaux systems are configured with disk allocations where files older than N days are erased automatically.
#
# To prevent such systems from deleting the top-level directories for the DP_Cache, and some CBRAIN-specific admin files, part of the boot process touches them to reset their timestamps.
#
# On a portal or bourreau:
#
# - the +DataProvider+ cache dir
# - the +DP_Cache_Key.md5+ and
# - +DP_Cache_Rev.id+ located in that cache dir
#
# On a bourreau:
#
# - the +gridshare+ dir
# - the +DP_Cache+ symbolic link located in it.
def self.system_touch
  # Returns true on a non-Bourreau resource once the cache entries have been
  # touched. On a Bourreau, returns the result of the external +touch+ run on
  # the cache symlink (true on success, false or nil otherwise).

  myself       = RemoteResource.current_resource
  cache_dir    = myself.dp_cache_dir
  dp_cache_id  = File.join cache_dir, DataProvider::DP_CACHE_ID_FILE
  dp_cache_md5 = File.join cache_dir, DataProvider::DP_CACHE_MD5_FILE

  # nocreate: only existing entries are refreshed, nothing is created.
  FileUtils.touch [cache_dir, dp_cache_id, dp_cache_md5], verbose: true, nocreate: true

  # touch only cache for Portal, for Bourreau touch gridshare
  return true unless myself.is_a? Bourreau

  gridshare_dir = myself.cms_shared_dir
  sym_path      = File.join gridshare_dir, DataProvider::DP_CACHE_SYML

  FileUtils.touch gridshare_dir, verbose: true, nocreate: true

  # Update the timestamp of the symlink itself rather than the folder it
  # points to (-h), and never create it when it is missing (-c).
  # The previous "--no-deference" was a misspelling of "--no-dereference":
  # touch rejected the unknown option, so the symlink was never refreshed and
  # this method always returned false on a Bourreau. The short flags are used
  # because they are spelled the same in GNU and BSD touch.
  system("touch", "-h", "-c", sym_path)

end

#################################################################
# Access restriction checking methods, using flags in meta-data.
Expand Down Expand Up @@ -1615,4 +1652,3 @@ def self.local_rsync_protects_args?
end

end

17 changes: 17 additions & 0 deletions BrainPortal/lib/cbrain_system_checks.rb
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,23 @@ def self.a050_check_data_provider_cache_wipe #:nodoc:
end
end

# Prevents archiving/deletion of CBRAIN system files and directories, such as the cache
def self.a060_ensure_system_files_will_not_be_deleted #:nodoc:

  #-----------------------------------------------------------------------------
  puts "C> Updating timestamp for important system files and directories"
  #-----------------------------------------------------------------------------

  # Looking up the cache root may raise; any such error is treated the same
  # way as a missing configuration.
  configured_root =
    begin
      DataProvider.cache_rootdir
    rescue StandardError
      nil
    end

  # The value is converted with `to_s` before the blank check because
  # `blank?` behaves unexpectedly when called directly on a `Pathname`.
  if configured_root.to_s.blank?
    puts "C> \t- SKIPPING! No cache root directory yet configured!"
    nil
  else
    DataProvider.system_touch
  end
end


def self.a080_ensure_set_starttime_revision #:nodoc:
Expand Down
30 changes: 0 additions & 30 deletions BrainPortal/lib/portal_system_checks.rb
Original file line number Diff line number Diff line change
Expand Up @@ -202,34 +202,4 @@ def self.z010_ensure_we_have_a_ssh_agent_locker #:nodoc:
}
)
end


# Touch to avoid deletion by the cluster's bimonthly scratch cleanup:
# 1) DataProvider cache dir
# 2) DP_Cache_Key.md5
# 3) DP_Cache_Rev.id
# 4) cache_dir, which is typically updated often, but we touch it just in case
def self.z020_dp_cache_and_symlink_will_exists #:nodoc:

myself = RemoteResource.current_resource
cache_dir = myself.dp_cache_dir
dp_cache_id = File.join cache_dir, DataProvider::DP_CACHE_ID_FILE
dp_cache_md5 = File.join cache_dir, DataProvider::DP_CACHE_MD5_FILE

puts "C> Updating timestamp for cache folder as well as its symlink, MD5 and ID files"

begin
FileUtils.touch [cache_dir, dp_cache_id, dp_cache_md5], verbose: true, nocreate: true
# files still might be deleted if a bourreau is not rebooted for a long time
# some cluster can have policies countering touch abuse
# touch command may fail for many reasons, e.g. resource issues
# sometimes touch might fails even if timestamp update is successful
puts "C> Timestamps are updated."
rescue => e
puts "C> Timestamps update FAILED: " + e.message
return
end
end

end

0 comments on commit fe078ea

Please sign in to comment.