forked from DIGI-VUB/scan2text
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.R
303 lines (284 loc) · 25.2 KB
/
app.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
library(shinydashboard)
library(shiny)
library(shinyFiles)
library(tools)
library(data.table)
library(jsonlite)
library(magick)
library(pdftools)
library(tesseract)
library(digest)
# Make sure the OCR language models this app offers by default are installed.
# Only models that tesseract does not already report as available are fetched.
download <- c("eng", "nld", "fra")
download <- setdiff(download, tesseract_info()$available)
for (model in download) {
  # A failed download (e.g. no network access) should not abort app start-up:
  # report it and continue with whichever models are present.
  tryCatch(
    tesseract_download(model),
    error = function(e) {
      warning(
        sprintf("Could not download tesseract model '%s': %s", model, conditionMessage(e)),
        call. = FALSE
      )
    }
  )
}
# Title bar shown at the top of the dashboard.
header <- dashboardHeader(
  title = "simple text extraction tool"
)
# Dashboard body: loads the client-side bounding-box annotator and lays out
# the image panel next to the control boxes.
#
# Two scripts are injected into the page head:
#  * the first wires the annotator to Shiny: it reports the drawn rectangles
#    through the "rectCoord" input and registers the custom message handlers
#    "change-img-url", "update-category-list" and "redraw-rects";
#  * the second inlines the compiled bbox_annotator code that defines
#    BBoxAnnotator.
body <- dashboardBody(
tags$head(tags$script(
HTML('
$(document).ready(function() {
// define options to pass to bounding box constructor
var options = {
url: "https://raw.githubusercontent.com/DIGI-VUB/scan2text/master/example/example.png",
input_method: "select",
labels: [""],
color_list: [""],
onchange: function(entries) {
Shiny.onInputChange("rectCoord", JSON.stringify(entries, null, " "));
}
};
// Initialize the bounding-box annotator.
var annotator = new BBoxAnnotator(options);
// Initialize the reset button.
$("#reset_button").click(function(e) {
annotator.clear_all();
})
// define function to reset the bbox
// ...upon choosing new label category or new url
function reset_bbox(options) {
document.getElementById("bbox_annotator").setAttribute("style", "display:inline-block");
$(".image_frame").remove();
annotator = new BBoxAnnotator(options);
}
// update image url from shiny
Shiny.addCustomMessageHandler("change-img-url", function(url) {
annotator.clear_all();
options.url = url;
options.width = null;
options.height = null;
reset_bbox(options);
});
// update colors and categories from shiny
Shiny.addCustomMessageHandler("update-category-list", function(vals) {
options.labels = Object.values(vals);
options.color_list = Object.keys(vals);
reset_bbox(options);
});
// redraw rectangles based on list of entries
Shiny.addCustomMessageHandler("redraw-rects", function(vals) {
var arr = JSON.parse(vals);
arr.forEach(function(rect){
annotator.add_entry(rect);
});
if (annotator.onchange) {
annotator.onchange(annotator.entries);
}
});
});')),
#tags$head(tags$script(src = "bbox_annotation.js"))),
tags$head(tags$script(HTML("// Generated by CoffeeScript 2.5.0\n(function() {\n // https://github.com/kyamagu/bbox-annotator/blob/master/bbox_annotator.coffee\n // Use coffee-script compiler to obtain a javascript file.\n\n // coffee -c bbox_annotator.coffee\n\n // See http://coffeescript.org/\n\n // BBox selection window.\n var BBoxSelector;\n\n BBoxSelector = class BBoxSelector {\n // Initializes selector in the image frame.\n constructor(image_frame, options) {\n if (options == null) {\n options = {};\n }\n options.input_method || (options.input_method = \"text\");\n this.image_frame = image_frame;\n this.border_width = options.border_width || 2;\n this.selector = $('<div class=\"bbox_selector\"></div>');\n this.selector.css({\n // rectangle color when dragging\n \"border\": this.border_width + \"px dotted rgb(127,255,127)\",\n \"position\": \"absolute\"\n });\n this.image_frame.append(this.selector);\n this.selector.css({\n \"border-width\": this.border_width\n });\n this.selector.hide();\n this.create_label_box(options);\n }\n\n // Initializes a label input box.\n create_label_box(options) {\n var i, label, len, ref;\n options.labels || (options.labels = [\"object\"]);\n this.label_box = $('<div class=\"label_box\" style=\"z-index: 1000\"></div>');\n this.label_box.css({\n \"position\": \"absolute\"\n });\n this.image_frame.append(this.label_box);\n switch (options.input_method) {\n case 'select':\n if (typeof options.labels === \"string\") {\n options.labels = [options.labels];\n }\n this.label_input = $('<select class=\"label_input\" name=\"label\"></select>');\n this.label_box.append(this.label_input);\n this.label_input.append($('<option value>choose an item</option>'));\n ref = options.labels;\n for (i = 0, len = ref.length; i < len; i++) {\n label = ref[i];\n this.label_input.append('<option value=\"' + label + '\">' + label + '</option>');\n }\n this.label_input.change(function(e) {\n return this.blur();\n });\n break;\n case 'text':\n if (typeof 
options.labels === \"string\") {\n options.labels = [options.labels];\n }\n this.label_input = $('<input class=\"label_input\" name=\"label\" ' + 'type=\"text\" value>');\n this.label_box.append(this.label_input);\n this.label_input.autocomplete({\n source: options.labels || [''],\n autoFocus: true\n });\n break;\n case 'fixed':\n if ($.isArray(options.labels)) {\n options.labels = options.labels[0];\n }\n this.label_input = $('<input class=\"label_input\" name=\"label\" type=\"text\">');\n this.label_box.append(this.label_input);\n this.label_input.val(options.labels);\n break;\n default:\n throw 'Invalid label_input parameter: ' + options.input_method;\n }\n return this.label_box.hide();\n }\n\n // Crop x and y to the image size.\n crop(pageX, pageY) {\n var point;\n return point = {\n x: Math.min(Math.max(Math.round(pageX - this.image_frame.offset().left), 0), Math.round(this.image_frame.width() - 1)),\n y: Math.min(Math.max(Math.round(pageY - this.image_frame.offset().top), 0), Math.round(this.image_frame.height() - 1))\n };\n }\n\n // When a new selection is made.\n start(pageX, pageY) {\n this.pointer = this.crop(pageX, pageY);\n this.offset = this.pointer;\n this.refresh();\n this.selector.show();\n $('body').css('cursor', 'crosshair');\n return document.onselectstart = function() {\n return false;\n };\n }\n\n // When a selection updates.\n update_rectangle(pageX, pageY) {\n this.pointer = this.crop(pageX, pageY);\n return this.refresh();\n }\n\n // When starting to input label.\n input_label(options) {\n $('body').css('cursor', 'default');\n document.onselectstart = function() {\n return true;\n };\n this.label_box.show();\n return this.label_input.focus();\n }\n\n // Finish and return the annotation.\n finish(options) {\n var data;\n this.label_box.hide();\n this.selector.hide();\n data = this.rectangle();\n data.label = $.trim(this.label_input.val().toLowerCase());\n if (options.input_method !== 'fixed') {\n this.label_input.val('');\n }\n return data;\n 
}\n\n // Get a rectangle.\n rectangle() {\n var rect, x1, x2, y1, y2;\n x1 = Math.min(this.offset.x, this.pointer.x);\n y1 = Math.min(this.offset.y, this.pointer.y);\n x2 = Math.max(this.offset.x, this.pointer.x);\n y2 = Math.max(this.offset.y, this.pointer.y);\n return rect = {\n left: x1,\n top: y1,\n width: x2 - x1 + 1,\n height: y2 - y1 + 1\n };\n }\n\n // Update css of the box.\n refresh() {\n var rect;\n rect = this.rectangle();\n this.selector.css({\n left: (rect.left - this.border_width) + 'px',\n top: (rect.top - this.border_width) + 'px',\n width: rect.width + 'px',\n height: rect.height + 'px'\n });\n return this.label_box.css({\n left: (rect.left - this.border_width) + 'px',\n top: (rect.top + rect.height + this.border_width) + 'px'\n });\n }\n\n // Return input element.\n get_input_element() {\n return this.label_input;\n }\n\n };\n\n // Annotator object definition.\n this.BBoxAnnotator = class BBoxAnnotator {\n // Initialize the annotator layout and events.\n constructor(options) {\n var annotator, image_element;\n annotator = this;\n this.annotator_element = $(options.id || \"#bbox_annotator\");\n // allow us to access colors and labels in future steps\n this.color_list = options.color_list;\n this.label_list = options.labels;\n this.border_width = options.border_width || 2;\n this.show_label = options.show_label || (options.input_method !== \"fixed\");\n if (options.multiple != null) {\n this.multiple = options.multiple;\n } else {\n this.multiple = true;\n }\n this.image_frame = $('<div class=\"image_frame\"></div>');\n this.annotator_element.append(this.image_frame);\n if (options.guide) {\n annotator.initialize_guide(options.guide);\n }\n image_element = new Image();\n image_element.src = options.url;\n image_element.onload = function() {\n options.width || (options.width = image_element.width);\n options.height || (options.height = image_element.height);\n annotator.annotator_element.css({\n \"width\": (options.width + annotator.border_width) + 
'px',\n \"height\": (options.height + annotator.border_width) + 'px',\n \"padding-left\": (annotator.border_width / 2) + 'px',\n \"padding-top\": (annotator.border_width / 2) + 'px',\n \"cursor\": \"crosshair\",\n \"overflow\": \"hidden\"\n });\n annotator.image_frame.css({\n \"background-image\": \"url('\" + image_element.src + \"')\",\n \"width\": options.width + \"px\",\n \"height\": options.height + \"px\",\n \"position\": \"relative\"\n });\n annotator.selector = new BBoxSelector(annotator.image_frame, options);\n return annotator.initialize_events(options);\n };\n image_element.onerror = function() {\n return annotator.annotator_element.text(\"Invalid image URL: \" + image_element.src);\n };\n this.entries = [];\n this.onchange = options.onchange;\n }\n\n // Initialize events.\n initialize_events(options) {\n var annotator, selector, status;\n status = 'free';\n this.hit_menuitem = false;\n annotator = this;\n selector = annotator.selector;\n this.annotator_element.mousedown(function(e) {\n if (!annotator.hit_menuitem) {\n switch (status) {\n case 'free':\n case 'input':\n if (status === 'input') {\n selector.get_input_element().blur();\n }\n if (e.which === 1) { // left button\n selector.start(e.pageX, e.pageY);\n status = 'hold';\n }\n }\n }\n annotator.hit_menuitem = false;\n return true;\n });\n $(window).mousemove(function(e) {\n var offset;\n switch (status) {\n case 'hold':\n selector.update_rectangle(e.pageX, e.pageY);\n }\n if (annotator.guide_h) {\n offset = annotator.image_frame.offset();\n annotator.guide_h.css('top', Math.floor(e.pageY - offset.top) + 'px');\n annotator.guide_v.css('left', Math.floor(e.pageX - offset.left) + 'px');\n }\n return true;\n });\n $(window).mouseup(function(e) {\n switch (status) {\n case 'hold':\n selector.update_rectangle(e.pageX, e.pageY);\n selector.input_label(options);\n status = 'input';\n if (options.input_method === 'fixed') {\n selector.get_input_element().blur();\n }\n }\n return true;\n });\n 
selector.get_input_element().blur(function(e) {\n var data;\n switch (status) {\n case 'input':\n data = selector.finish(options);\n if (data.label) {\n // store color with the entry\n // ...so we can redraw the rectangle upon changing label category\n data.color = annotator.color_list[annotator.label_list.indexOf(data.label)];\n annotator.add_entry(data);\n if (annotator.onchange) {\n annotator.onchange(annotator.entries);\n }\n }\n status = 'free';\n }\n return true;\n });\n selector.get_input_element().keypress(function(e) {\n switch (status) {\n case 'input':\n if (e.which === 13) {\n selector.get_input_element().blur();\n }\n }\n return e.which !== 13;\n });\n selector.get_input_element().mousedown(function(e) {\n return annotator.hit_menuitem = true;\n });\n selector.get_input_element().mousemove(function(e) {\n return annotator.hit_menuitem = true;\n });\n selector.get_input_element().mouseup(function(e) {\n return annotator.hit_menuitem = true;\n });\n return selector.get_input_element().parent().mousedown(function(e) {\n return annotator.hit_menuitem = true;\n });\n }\n\n // Add a new entry.\n add_entry(entry) {\n var annotator, box_element, close_button, text_box;\n if (!this.multiple) {\n this.annotator_element.find(\".annotated_bounding_box\").detach();\n this.entries.splice(0);\n }\n this.entries.push(entry);\n box_element = $('<div class=\"annotated_bounding_box\"></div>');\n box_element.appendTo(this.image_frame).css({\n // rectangle color -- when stopped dragging\n \"border\": this.border_width + \"px solid \" + entry.color,\n \"position\": \"absolute\",\n \"top\": (entry.top - this.border_width) + \"px\",\n \"left\": (entry.left - this.border_width) + \"px\",\n \"width\": entry.width + \"px\",\n \"height\": entry.height + \"px\",\n // text color when stopped dragging\n \"color\": entry.color,\n \"font-family\": \"monospace\",\n \"font-size\": \"small\"\n });\n close_button = $('<div></div>').appendTo(box_element).css({\n \"position\": 
\"absolute\",\n \"top\": \"-8px\",\n \"right\": \"-8px\",\n \"width\": \"16px\",\n \"height\": \"0\",\n \"padding\": \"16px 0 0 0\",\n \"overflow\": \"hidden\",\n \"color\": \"#fff\",\n \"background-color\": \"#030\",\n \"border\": \"2px solid #fff\",\n \"-moz-border-radius\": \"18px\",\n \"-webkit-border-radius\": \"18px\",\n \"border-radius\": \"18px\",\n \"cursor\": \"pointer\",\n \"-moz-user-select\": \"none\",\n \"-webkit-user-select\": \"none\",\n \"user-select\": \"none\",\n \"text-align\": \"center\"\n });\n $(\"<div></div>\").appendTo(close_button).html('×').css({\n \"display\": \"block\",\n \"text-align\": \"center\",\n \"width\": \"16px\",\n \"position\": \"absolute\",\n \"top\": \"-2px\",\n \"left\": \"0\",\n \"font-size\": \"16px\",\n \"line-height\": \"16px\",\n \"font-family\": '\"Helvetica Neue\", Consolas, Verdana, Tahoma, Calibri, ' + 'Helvetica, Menlo, \"Droid Sans\", sans-serif'\n });\n text_box = $('<div></div>').appendTo(box_element).css({\n \"overflow\": \"hidden\"\n });\n if (this.show_label) {\n text_box.text(entry.label);\n }\n annotator = this;\n box_element.hover((function(e) {\n return close_button.show();\n }), (function(e) {\n return close_button.hide();\n }));\n close_button.mousedown(function(e) {\n return annotator.hit_menuitem = true;\n });\n close_button.click(function(e) {\n var clicked_box, index;\n clicked_box = close_button.parent(\".annotated_bounding_box\");\n index = clicked_box.prevAll(\".annotated_bounding_box\").length;\n clicked_box.detach();\n annotator.entries.splice(index, 1);\n return annotator.onchange(annotator.entries);\n });\n return close_button.hide();\n }\n\n // Clear all entries.\n clear_all(e) {\n this.annotator_element.find(\".annotated_bounding_box\").detach();\n this.entries.splice(0);\n return this.onchange(this.entries);\n }\n\n // Add crosshair guide.\n initialize_guide(options) {\n this.guide_h = $('<div class=\"guide_h\"></div>').appendTo(this.image_frame).css({\n \"border\": \"1px dotted \" + 
(options.color || '#000'),\n \"height\": \"0\",\n \"width\": \"100%\",\n \"position\": \"absolute\",\n \"top\": \"0\",\n \"left\": \"0\"\n });\n return this.guide_v = $('<div class=\"guide_v\"></div>').appendTo(this.image_frame).css({\n \"border\": \"1px dotted \" + (options.color || '#000'),\n \"height\": \"100%\",\n \"width\": \"0\",\n \"position\": \"absolute\",\n \"top\": \"0\",\n \"left\": \"0\"\n });\n }\n\n };\n\n}).call(this);"
)))),
# Page layout: a wide column with the image and the printed selections,
# and a narrow column holding all controls.
fluidRow(
# Left column (9/12): annotator target and selection output.
column(width = 9,
box(title = "Image", width = NULL, status = "primary", solidHeader = FALSE,
# Empty container; the annotator script attaches to "#bbox_annotator".
div(id = "bbox_annotator", style = "display:inline-block")
),
box(title = "Selections", width = NULL, status = "primary", solidHeader = TRUE,
# Filled by output$out_areas in the server function.
verbatimTextOutput("out_areas")
)
),
# Right column (3/12): file selection, paging, reset, download and OCR options.
column(width = 3,
box(title = "Select file", width = NULL, status = "success",
shinyFilesButton(id = 'fileSelect', label = 'File', title = 'Select a file', multiple = FALSE),
p(
class = "text-muted",
paste("Note: this application takes either an image or .pdf and extracts text in bounding boxes"
)
),
uiOutput(outputId = "uo_selected_file"),
# The server passes this value as `dpi` to pdf_convert when rendering a pdf page.
sliderInput(inputId = "file_dpi", label = "dpi - dots per inch pdf rendering", value = 112, min = 36, max = 300)
),
box(title = "Browse pages", width = NULL, status = "success",
p("Showing page: ", textOutput(outputId = "uo_selected_page")),
wellPanel(
actionButton(inputId = "ui_page_next", label = "Next page", icon = icon("angle-double-right")),
hr(),
actionButton(inputId = "ui_page_previous", label = "Previous page", icon = icon("angle-double-left"))
)
),
box(title = "Reset", width = NULL, status = "danger",
div(
# Plain HTML button: the page script clears all rectangles on click and
# the server observes input$reset_button.
tags$button(id = "reset_button", type = "reset", class = "btn btn-default action-button", "Reset all selections of page")
)
),
box(title = "Download results", width = NULL, status = "success",
downloadButton(outputId = "ui_downloadresults_csv", label = "Download your work in .csv")
),
box(title = "Perform Optical Character Recognition", width = NULL, status = "danger",
checkboxInput(inputId = "ui_ocr_yes", label = "Perform OCR", value = TRUE),
# Choices are the language models tesseract reports as available.
selectInput(inputId = "ui_ocr_language", label = "Language", choices = tesseract_info()$available, selected = "nld"),
p(
class = "text-muted",
paste("Uses Tesseract by default with the Dutch model"
)
)
)
)
)
)
# Assemble the page from the header and body defined above; the sidebar is
# disabled, so only those two parts are shown.
ui <- dashboardPage(
  header = header,
  sidebar = dashboardSidebar(disable = TRUE),
  body = body
)
# Shiny server logic.
#
# Reads the selected image or pdf, renders the current page to a png for the
# bounding-box annotator, optionally runs OCR on every selected rectangle and
# keeps the results in a list local to this server call that is mirrored to a
# csv file.
#
# @param input,output,session The standard Shiny server arguments.
server <- function(input, output, session) {
  # Flatten the nested results list (file -> page -> data.frame of rectangles)
  # into a single data.frame with `file` and `page` id columns.
  #
  # @param x     Nested list as described above.
  # @param path  csv file to write to.
  # @param store If TRUE and there is at least one row, write the table to
  #              `path` with write.csv2.
  # @return The flattened data.frame, invisibly.
  saveDB <- function(x, path, store = TRUE) {
    x <- lapply(x, FUN = function(pages) {
      rbindlist(pages, use.names = TRUE, fill = TRUE, idcol = "page")
    })
    x <- rbindlist(x, use.names = TRUE, fill = TRUE, idcol = "file")
    x <- setDF(x)
    if (nrow(x) > 0 && isTRUE(store)) {
      write.csv2(x, file = path, na = "", row.names = FALSE)
    }
    invisible(x)
  }

  # results[[file basename]][[page number]] holds the data.frame of rectangles
  # last reported for that page.
  results <- list()

  # Tell the annotator which label categories exist (names are the colors).
  session$sendCustomMessage("update-category-list", list("orange" = "new text chunk", "pink" = "continuing text chunk"))

  volumes <- c(CURRENT_DIRECTORY = getwd(), UP = "..", USER_FOLDER = Sys.getenv("USERPROFILE"), HOME = "")
  shinyFiles::shinyFileChoose(input, id = 'fileSelect', roots = volumes, session = session)

  # Resolve the shinyFiles selection to a path; while nothing is selected the
  # `default` example image is returned instead.
  getSelectedFile <- function(inputui, default = "https://raw.githubusercontent.com/DIGI-VUB/scan2text/master/example/example.png") {
    f <- shinyFiles::parseFilePaths(volumes, inputui)
    f <- as.character(f$datapath)
    if (length(f) == 0) {
      f <- default
    }
    f
  }

  # Page currently shown (1-based).
  pagenumber <- reactiveVal(1)

  # Reads the selected file and returns a list describing it. `failed` is TRUE
  # when the file could not be read; all other fields are only present when
  # `failed` is FALSE.
  image_content <- reactive({
    input$fileSelect
    x <- getSelectedFile(input$fileSelect)
    if (length(x) == 0) {
      return(list(failed = TRUE))
    }
    showModal(modalDialog("Reading data, please wait. This popup will close automatically when this is finished.", easyClose = FALSE, footer = NULL))
    img <- tryCatch(
      image_read(x),
      error = function(e) {
        message("image_read failed: ", conditionMessage(e))
        NULL
      }
    )
    removeModal()
    if (is.null(img)) {
      # Stop here: there is no image to derive pages or dimensions from.
      showModal(modalDialog(sprintf("Could not read file: %s. Is this file an image or a pdf file?", basename(x))))
      return(list(failed = TRUE))
    }
    # Folder (below www/) in which the rendered page images are stored.
    appfolder <- file.path(getwd(), "www", "appfolder")
    if (!dir.exists(appfolder)) {
      dir.create(appfolder, recursive = TRUE)
    }
    # One png path per page; the digest of file info + current time keeps the
    # names distinct between reads.
    stamp <- cbind(file.info(x), Sys.time())
    path <- file.path("appfolder", sprintf("%s_%s.png", digest::digest(stamp), seq_len(length(img))))
    path_full <- file.path(getwd(), "www", path)
    # A newly read file always starts at page 1.
    isolate({
      pagenumber(1)
    })
    list(
      failed = FALSE,
      img = img,
      dpi = input$file_dpi,
      pages = length(img),
      width = head(image_info(img)$width, 1),
      height = head(image_info(img)$height, 1),
      path = path,
      path_full = path_full,
      file_results = file.path(getwd(), "www", sprintf("results_%s.csv", file_path_sans_ext(basename(x)))),
      file = x,
      file_basename = basename(x)
    )
  })

  # Paging: move forward/backward but stay within 1..pages.
  observeEvent(input$ui_page_next, {
    content <- image_content()
    if (!content$failed && pagenumber() < content$pages) {
      pagenumber(pagenumber() + 1)
    }
  })
  observeEvent(input$ui_page_previous, {
    content <- image_content()
    if (!content$failed && pagenumber() > 1) {
      pagenumber(pagenumber() - 1)
    }
  })

  # File name and pixel dimensions of the first page.
  output$uo_selected_file <- renderUI({
    content <- image_content()
    if (!content$failed) {
      p(content$file_basename, br(),
        sprintf("Dimension: %sx%s", content$width, content$height))
    }
  })

  # Writes the current page as png, points the annotator at it and returns the
  # "page / pages" label.
  output$uo_selected_page <- renderText({
    content <- image_content()
    if (!content$failed) {
      page <- pagenumber()
      page_url <- content$path[page]
      page_file <- content$path_full[page]
      # Compare case-insensitively so ".PDF" files are also rendered as pdf.
      if (tolower(file_ext(content$file)) == "pdf") {
        showNotification("Converting pdf to png", id = "note_pdf_convert", session = session)
        pdf_convert(pdf = content$file, pages = page, dpi = content$dpi, format = "png", filenames = page_file)
        removeNotification(id = "note_pdf_convert", session = session)
      } else {
        image_write(content$img[page], path = page_file, format = "png")
      }
      session$sendCustomMessage("change-img-url", page_url)
      sprintf("%s / %s", page, content$pages)
    }
  })

  # Results: prints the rectangles reported by the annotator, adds OCR text
  # when requested and stores them for the current file and page.
  output$out_areas <- renderPrint({
    # Isolated: only the rectangle input and the OCR options re-trigger this.
    isolate({
      content <- image_content()
      content$page <- pagenumber()
    })
    if (!is.null(input$rectCoord)) {
      x <- jsonlite::fromJSON(input$rectCoord)
      x <- as.data.frame(x)
      if (nrow(x) == 0) {
        invisible()
      } else {
        if (!content$failed) {
          content$img <- image_read(content$path_full[content$page])
          x$dpi <- content$dpi
          if (input$ui_ocr_yes) {
            showNotification("Performing OCR", id = "note_ocr", session = session)
            # Crop geometry in the form widthxheight+left+top.
            x$area <- sprintf("%sx%s+%s+%s", x$width, x$height, x$left, x$top)
            # Create the engine once instead of once per rectangle.
            engine <- tesseract::tesseract(input$ui_ocr_language)
            x$text <- vapply(x$area, FUN = function(crop) {
              img <- image_crop(content$img, geometry = crop)
              paste(tesseract::ocr(img, engine = engine, HOCR = FALSE), collapse = "\n")
            }, FUN.VALUE = character(1), USE.NAMES = FALSE)
            removeNotification(id = "note_ocr", session = session)
            # Show the text of the most recently added rectangle.
            showModal(modalDialog(title = "OCR", tail(x$text, n = 1), easyClose = TRUE))
          }
          # Only store when the file was read; a failed read has no basename
          # or results path to store under.
          results[[content$file_basename]][[content$page]] <<- x
          saveDB(results, path = content$file_results)
        }
        x
      }
    } else {
      invisible()
    }
  })

  # Drop the stored selections of the current page.
  observeEvent(input$reset_button, {
    content <- isolate(image_content())
    if (!content$failed) {
      page <- pagenumber()
      had_entry <- page <= length(results[[content$file_basename]])
      if (had_entry) {
        # `[[<-` with NULL would delete the element and shift the results of
        # every later page down by one; list(NULL) blanks the slot in place.
        results[[content$file_basename]][page] <<- list(NULL)
      }
      flat <- saveDB(results, path = content$file_results)
      # saveDB does not write an empty table, so remove the csv when the reset
      # emptied the results; otherwise it would keep the rows just reset.
      if (had_entry && nrow(flat) == 0 && file.exists(content$file_results)) {
        unlink(content$file_results)
      }
    }
  })

  # Download the csv stored for the current file.
  output$ui_downloadresults_csv <- downloadHandler(
    filename = function() {
      sprintf("scan2text_%s.csv", format(Sys.time(), "%Y%m%d_%H%M%S"))
    },
    content = function(filename) {
      info <- isolate(image_content())
      if (!info$failed && file.exists(info$file_results)) {
        file.copy(info$file_results, to = filename, overwrite = TRUE)
      } else {
        # Nothing stored yet: hand back an empty file instead of failing.
        file.create(filename)
      }
    }, contentType = "text/csv")
}
# Launch the application.
shinyApp(ui = ui, server = server)