#!/usr/bin/env ruby
require "csv"
require "ostruct"
# # SundataParser
# A class to read through an array of sundata TXT files and produce a csv file
# as output.
#
# ### Examples:
# # Creating a new parser object. This is done when customizing the csv generation process.
# SundataParser.new ["data/file1.TXT", "data/file2.TXT"]
#
# # Using the automatic runner. This starts the process as if it were run from a terminal
# # prompt directly.
# SundataParser.run! ARGV
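#
# # A full customization workflow (illustrative sketch; the filenames, the output
# # filename, and the preprocess field shown here are placeholders, not part of
# # the parser itself):
# parser = SundataParser.new ["data/file1.TXT", "data/file2.TXT"]
# parser.preprocess { |filename, template| template.original_filename = filename }
# parser.read
# parser.parse
# parser.write_csv "combined.csv"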
class SundataParser
attr_reader :input_files, :input_data, :output_data
# Initialize a new parser object
#
# This method isn't called directly; it runs as part of SundataParser.new
#
# - `input_files` an array of filenames to read as sundata files.
#
def initialize input_files
@input_files = input_files
@input_data = Hash.new
@output_data = Array.new
end
# Read input files into memory.
#
# Read from each input file in `input_files` and store the file contents
# associated with the filename in a Ruby Hash Object.
#
# ### Examples:
# parser.read
# parser.input_data.each do |filename, file_contents|
# # ...
# end
#
# Filenames that do not exist are ignored without error.
#
# Returns nil; after reading, the data will be available in `input_data`.
def read
@input_files.each do |file|
if File.exist? file
input_data[file] = File.read(file)
end
end
nil
end
# Set a preprocess block to be run once per input file when parsing data.
#
# This can be used to add custom fields to the final output and is especially
# useful if you have information you need to extract from the filename.
#
# To set a preprocessor, call this method with a Ruby block that has two
# inputs, the filename of the current input file and the OpenStruct object
# that will be used as the template for all row data in the final output.
# ### Examples:
#
# parser.preprocess do |filename, rowdata_template|
# rowdata_template.original_filename = filename
# rowdata_template.site_number = filename.split(" ")[2]
# end
#
# Does not return anything but stores the provided block for use during
# parsing. Raises an ArgumentError if called without a block.
def preprocess &preprocess_block
if preprocess_block.nil?
raise ArgumentError.new "#preprocess called without a block argument."
end
@preprocess_block = preprocess_block
end
# Parse the input data and build output data.
#
# Transform the plaintext data that has been read and build an Array of
# output rows. If a preprocessor block was set, that will be used to set
# initial values for rows in each filename.
#
# Does not have a meaningful return value, but populates `output_data` with
# OpenStruct objects representing each entry in the original data table.
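#
# ### Examples:
# # A typical sequence (sketch; `read` must be called before `parse`, and the
# # fields printed below are just a few of those set on each row):
# parser.read
# parser.parse
# parser.output_data.each do |row|
# puts "#{row.time} #{row.plot} #{row.lai}"
# end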
def parse
input_data.each do |filename, data|
rowdata_template = OpenStruct.new
@preprocess_block.call(filename, rowdata_template) if @preprocess_block
arrived_at_table_data = false
data.split("\r\n").each do |line|
next if line.strip.empty? # Skip blank and whitespace lines
if !arrived_at_table_data
next if line =~ /\ACreated by SunData/ # Skip created by line
if date_match = line.match(/\A(\d\d\d\d-\d\d-\d\d)\t\tLocal time is (GMT.\d+ Hrs)/)
rowdata_template.date = date_match[1]
rowdata_template.timezone = date_match[2]
end
if sunscan_match = line.match(/\ASunScan probe (v.*)/)
rowdata_template.sunscan_version = sunscan_match[1]
end
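# Capture any remaining "Key : Value" metadata pairs from the header section
# and store them on the row template under normalized (lowercased, underscored) keys.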
if matches = line.scan(/\s*([^\t:]+)\s*:\s*([^\t:]+)/)
matches.flatten.map(&:strip).each_slice(2) do |key, value|
next if value.nil? || value.empty?
rowdata_template[key.downcase.gsub(" ", "_")] = value
end
end
# Once we hit the table header we can start processing tabular data.
# The header is two lines long because of the formatting.
next if line =~ /\ATime\tPlot\t/
arrived_at_table_data = true and next if line =~ /\s+mitted\s+ent/
else
rowdata = rowdata_template.dup
table_line = line.split("\t")
rowdata.time = table_line[0]
rowdata.plot = table_line[1]
rowdata.sample = table_line[2]
rowdata.transmitted = table_line[3]
rowdata.spread = table_line[4]
rowdata.incident = table_line[5]
rowdata.beam = table_line[6]
rowdata.zenith_angle = table_line[7]
rowdata.lai = table_line[8]
rowdata.notes = table_line[9]
# Only record output data once the full line data has been captured.
output_data << rowdata
end
end
end
end
# Write csv output to the given filename.
#
# - `filename` The name of the csv file to write to.
# - `fields` The ordered array of fields to include. Defaults to all
# available fields.
#
# ### Examples
#
# parser.write_csv("sample-output.csv")
# parser.write_csv("sample-output.csv", ["time", "incident", "beam", "transmitted")
# parser.write_csv("sample-output.csv", %w[zenith_angle plot sample lai notes]
#
# After running row data will be written to the filename specified with a header
# line for import into a preferred spreadsheet tool.
def write_csv filename, fields = nil
# By default all fields present in every row of output_data will be incorporated.
if fields.nil?
# Transform each output struct into a list of its keys, then take the intersection of each Array of keys.
# This ensures that only fields present for all rows will be incorporated.
fields = output_data.map{|o| o.to_h.keys}.inject do |last_keys, this_keys|
last_keys & this_keys
end
end
CSV.open filename, "wb", row_sep: "\r\n" do |csv|
# Header line
csv << fields
output_data.each do |out|
output_row = []
fields.each do |field|
output_row << out[field]
end
csv << output_row
end
end
end
# Class method for running the command-line program.
# Automatically called when this file is executed directly.
#
# Can also be called manually after requiring this file. When called from
# Ruby, this class method can take the preprocess block directly.
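#
# ### Examples:
# # Calling the runner from Ruby with a custom preprocess block (sketch; the
# # option values and filenames are placeholders):
# require_relative "sundata_parser"
# SundataParser.run!(["--outfile", "combined.csv", "data/file1.TXT"]) do |filename, template|
# template.original_filename = filename
# end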
def SundataParser.run! argument_values, &preprocess_block
require "optparse"
argument_values << "-h" if argument_values.empty?
optparser = OptionParser.new do |opts|
opts.banner = "Usage: sundata_parser.rb --outfile OUTPUT_FILENAME --fields FIELDS INPUT_FILE..."
opts.on "-oOUTFILE", "--outfile OUTFILE", "The name of the file output will be written to" do |outfile|
@outfile = outfile
end
opts.on "-fFIELDS", "--fields FIELDS", "A comma-separated list of fields to output" do |fields|
@fields = fields.split(",")
end
opts.on "-h", "--help", "Prints this help" do
puts opts
puts " for more information and help visit https://github.com/nuclearsandwich/sundata-parser"
exit
end
end.parse!(argument_values)
if argument_values.empty?
puts "Error: no input files. Run `#{$0} -h` for more information."
exit(1)
end
unless defined?(@outfile)
puts "Error: no output file. Run `#{$0} -h` for more information."
exit(1)
end
parser = SundataParser.new argument_values
parser.preprocess &preprocess_block unless preprocess_block.nil?
parser.read
parser.parse
if defined?(@fields)
parser.write_csv @outfile, @fields
else
parser.write_csv @outfile
end
end
end
if __FILE__ == $0
SundataParser.run! ARGV
end