-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathRakefile
More file actions
184 lines (156 loc) · 6.67 KB
/
Rakefile
File metadata and controls
184 lines (156 loc) · 6.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# frozen_string_literal: true

# Default run order matters: fetch/write updated metadata first, then remove
# layers that are no longer released, then regenerate the layers.json index.
task default: %i[update delete write_layers_json]
require 'json'
require 'faraday'
require 'faraday/net_http_persistent'
require 'faraday/retry'
require 'uri'
require 'debug'
require 'progress_bar'
require 'progress_bar/core_ext/enumerable_with_progress'
require 'time'

# Catalog serving the raw geoblacklight JSON documents (overridable via ENV)
CATALOG_URL = ENV.fetch('CATALOG_URL', 'https://earthworks.stanford.edu/catalog')
# Service listing which druids have been released to EarthWorks
PURL_FETCHER_URL = ENV.fetch('PURL_FETCHER_URL', 'https://purl-fetcher.stanford.edu')
# Root directory metadata is written under (OpenGeoMetadata layout)
BASE_DIR = ENV.fetch('BASE_DIR', 'metadata-aardvark')
INSTITUTION = 'Stanford'
# Solr-internal fields stripped from each document before it is written out
IGNORED_FIELDS = %w[timestamp layer_availability_score_f _version_ hashed_id_ssi solr_bboxtype__minX
                    solr_bboxtype__maxX solr_bboxtype__minY solr_bboxtype__maxY].freeze
# Matches IDs like 'stanford-bb058zh0946'; the four capture groups
# (bb / 058 / zh / 0946) become the nested directory path for the document.
DOC_ID_REGEX = /\Astanford-([b-df-hjkmnp-tv-z]{2})([0-9]{3})([b-df-hjkmnp-tv-z]{2})([0-9]{4})\z/i
# Run the given block with the timestamp of the previous run, then persist the
# timestamp the block returns so subsequent runs can skip already-seen documents.
#
# The block receives the previous run's Time (or nil when there is no record)
# and should return the most recent document timestamp it saw as a string,
# or nil to leave the timestamp file untouched.
def with_timestamp(timestamp_file = './last_run')
  return unless block_given?

  newest = yield(get_file_timestamp(timestamp_file))
  return unless newest

  # If the newest document is comfortably in the past (more than an hour,
  # i.e. beyond any reasonable clock skew), advance the saved timestamp by
  # one second so that document isn't re-processed on the next run.
  parsed = Time.parse(newest)
  newest = (parsed + 1).utc.iso8601 if (parsed + 3600) < Time.now.utc
  puts "Updating last run: #{newest}"
  File.open(timestamp_file, 'w') { |f| f.puts newest }
end
# Read and parse the last-run timestamp recorded in +file+.
#
# @param file [String] path to the timestamp file
# @return [Time, nil] the recorded time, or nil when the file is missing
#   or its contents cannot be parsed as a timestamp
def get_file_timestamp(file)
  parsed = Time.parse(File.read(file).strip)
  puts "Last run: #{parsed}"
  parsed
rescue Errno::ENOENT
  puts "No timestamp found in #{file}"
  nil
rescue ArgumentError
  puts "Invalid timestamp format in #{file}"
  nil
end
# Options for Faraday's retry middleware: up to 10 attempts with exponential
# backoff (1s, 3s, 9s, ...) on timeouts, connection failures, and HTTP 429s.
# See: https://github.com/lostisland/faraday-retry
def retry_options
  retriable = [Faraday::TimeoutError, Faraday::ConnectionFailed, Faraday::TooManyRequestsError]
  { max: 10, interval: 1, backoff_factor: 3, exceptions: retriable }
end
# Build a persistent-HTTP Faraday client for catalog requests, with retry
# and backoff. pool_size controls how many requests may run in parallel.
# Non-2xx responses raise via the :raise_error middleware.
def make_client(pool_size: 1)
  Faraday.new(CATALOG_URL) do |connection|
    # Middleware order matters: retry must wrap the adapter call.
    connection.request :retry, retry_options
    connection.adapter(:net_http_persistent, pool_size:)
    connection.response :raise_error
  end
end
# GET +url+ with optional query params and parse the response body as JSON.
def get_json(url, params: {}, client: make_client)
  response = client.get(url, params)
  JSON.parse(response.body.to_s)
end
# Yield the raw geoblacklight document (parsed JSON) of every released layer
# updated since +timestamp+, returning the non-nil block results.
#
# A nil timestamp (i.e. no previous run on record) means every released
# layer is treated as updated. Layers that are released but absent from the
# catalog index (e.g. because of bad metadata) are skipped silently.
def updated_docs_since(timestamp = Time.at(0), &block)
  layers = get_json("#{PURL_FETCHER_URL}/released/Earthworks.json")
  layers.filter! { |layer| Time.parse(layer['updated_at']) > timestamp } if timestamp
  puts layers.empty? ? '== No updated layers found ==' : "== Found #{layers.length} updated layers =="

  # Translate druids to catalog document IDs, then fetch each doc in parallel
  doc_ids = layers.map { |layer| layer['druid'].gsub('druid:', 'stanford-') }
  catalog = make_client(pool_size: 4)
  doc_ids.with_progress.map do |doc_id|
    block.call(get_json("#{CATALOG_URL}/#{doc_id}/raw", params: { format: :json }, client: catalog))
  rescue Faraday::ResourceNotFound
    nil
  end.compact
end
# Yield the document ID (e.g. 'stanford-bb058zh0946') of every layer present
# in layers.json but no longer released, returning the block results.
def deleted_docs(&block)
  # Compare the currently-released druids against the druids we last wrote out
  released = get_json("#{PURL_FETCHER_URL}/released/Earthworks").map { |layer| layer['druid'] }
  known = JSON.parse(File.read('layers.json')).keys
  removed = known.to_set - released.to_set
  puts removed.empty? ? '== No deleted layers found ==' : "== Found #{removed.length} deleted layers =="

  removed.map { |druid| druid.gsub('druid:', 'stanford-') }.with_progress.map(&block)
end
# Write the document's metadata to a nested directory derived from its ID.
#
# The druid portion of the ID is split into its four capture groups
# (e.g. 'stanford-bb058zh0946' -> bb/058/zh/0946) to form the directory
# tree under BASE_DIR. Solr-internal fields are stripped before writing;
# note the stripped fields are deleted from +doc+ in place.
#
# @param doc [Hash] parsed geoblacklight JSON; expected to contain an 'id' key
# @return [String, nil] the document's 'gbl_mdModified_dt' timestamp, or nil
#   when the ID is missing or doesn't match DOC_ID_REGEX
def write_doc_metadata(doc)
  # Guard against a missing or malformed ID instead of crashing on nil.captures
  match = doc['id']&.match(DOC_ID_REGEX)
  return unless match

  # Create the directory structure if it doesn't exist
  tree = File.expand_path("#{BASE_DIR}/#{match.captures.join('/')}")
  FileUtils.mkdir_p(tree)
  # Strip out ignored fields and write the document to the directory
  IGNORED_FIELDS.each { |field| doc.delete(field) }
  File.open("#{tree}/geoblacklight.json", 'w') { |f| f.puts JSON.pretty_generate(doc) }
  # Return the document's timestamp
  doc['gbl_mdModified_dt']
end
# Delete a document's metadata directory from the tree.
#
# @param doc_id [String, nil] document ID like 'stanford-bb058zh0946'
# @return [nil] when the ID is missing or doesn't match DOC_ID_REGEX;
#   otherwise the FileUtils.rm_rf result
def delete_doc(doc_id)
  # Guard against a missing or malformed ID instead of crashing on nil.captures
  match = doc_id&.match(DOC_ID_REGEX)
  return unless match

  # rm_rf is a no-op when the directory doesn't exist
  FileUtils.rm_rf(File.expand_path("#{BASE_DIR}/#{match.captures.join('/')}"))
end
# Runs first in the default task chain.
desc 'Update metadata for layers'
task :update do
  puts '== Updating metadata for layers =='
  with_timestamp do |previous_timestamp|
    # Write every document updated since the last run, collecting the
    # timestamps that write_doc_metadata returns.
    timestamps = updated_docs_since(previous_timestamp) { |doc| write_doc_metadata(doc) }
    # Hand the newest timestamp back so with_timestamp can persist it
    timestamps.max_by { |ts| Time.parse(ts) }
  end
end
# Runs after the :update task in the default task chain.
desc 'Delete metadata for layers no longer released'
task :delete do
  puts '== Deleting metadata for layers no longer released =='
  deleted_docs { |doc_id| delete_doc(doc_id) }
end
# See: https://opengeometadata.org/share-on-ogm/#naming-by-metadata-standard
# Runs after the :update and :delete tasks.
desc 'Write layers.json mapping layer IDs to file paths'
task :write_layers_json do
  data = {}
  # Crawl the tree: each document lives at BASE_DIR/bb/058/zh/0946/geoblacklight.json,
  # and the four path segments concatenate back into the druid.
  Dir.glob("#{BASE_DIR}/**/geoblacklight.json").each do |file|
    dir = File.dirname(file)
    # Strip the BASE_DIR prefix explicitly (rather than dropping one path
    # component) so an ENV-supplied BASE_DIR containing '/' still yields
    # just the druid segments.
    druid = dir.delete_prefix("#{BASE_DIR}/").tr('/', '')
    data["druid:#{druid}"] = dir
  end
  # Write the JSON to a file
  File.open('./layers.json', 'w') { |f| f.puts JSON.pretty_generate(data) }
  puts "== Wrote #{data.length} layers to layers.json =="
end