Start of PDF merging

tatjam · Sep 28, 2022 · afc219a · afc219a
1 parent 9858b95
commit afc219a
Show file tree

Hide file tree

Showing 4 changed files with 123 additions and 6 deletions.
diff --git a/src/document/document.nim b/src/document/document.nim
@@ -33,7 +33,6 @@ proc generate*(path: string, preset: Preset): Document =
         # PDF files may not actually have the files if the page is empty, we generate
         # it anyway as we will superimpose the pdf files later
         if strm.isNil:
-            echo "What in the world?"
             discard
         else:
             npage = load_page(strm).get()

diff --git a/src/gui/base.nim b/src/gui/base.nim
@@ -17,14 +17,14 @@ def.save_preset()
 
 load_presets()
 
-let x = download("fa906f70-e0e7-4492-ac6a-1b735b2f251c", all_presets[parseUUID("66d6d990-2fd8-4e31-8260-a53c41a71429")])
+#[let x = download("fa906f70-e0e7-4492-ac6a-1b735b2f251c", all_presets[parseUUID("66d6d990-2fd8-4e31-8260-a53c41a71429")])
 let doc = sync(x)
-doc.generate_pdf("output.pdf")
+doc.generate_pdf("output.pdf")]#
 
 
-#[let y = download("d4bd814c-dc0c-4352-b3bd-e37e8b6576d1")
+let y = download("d4bd814c-dc0c-4352-b3bd-e37e8b6576d1", all_presets[parseUUID("66d6d990-2fd8-4e31-8260-a53c41a71429")])
 let doc2 = sync(y)
-doc2.generate_pdf("output-pdf.pdf")]#
+doc2.generate_pdf("output-pdf.pdf")
 
 
 let MARGINS* = 8

diff --git a/src/pdf/pdfcombine.nim b/src/pdf/pdfcombine.nim
@@ -0,0 +1,114 @@
+# Allows adding pages to PDF files / adding content on top of existing pages
+# very barebones implementation, it would be a better idea to wrap a PDF library that
+# can read files, but it works
+import std/tables
+import std/streams
+import std/parseutils
+import std/strutils
+import json
+
+type PDFObjType = enum
+    ARRAY,
+    TABLE,
+    OTHER
+
+# We only really care about array and table objects
+type PDFObj = ref object
+    case kind: PDFObjType
+    of ARRAY:
+        elems: seq[PDFObj]
+    of TABLE:
+        children: Table[string, PDFObj]
+    of OTHER: discard
+
+
+type PDFMap = ref object
+    objects: seq[PDFObj]
+
+# Ignores comments
+proc get_next_line(f: FileStream): string = 
+    var in_comment = false
+    var line_start = true
+    var c: char
+    var line: string
+    while true:
+        c = f.readChar()
+        line.add(c)
+        if c == '%' and line_start == true:
+            in_comment = true
+
+        if line_start == true:
+            line_start = false
+
+        if c == '\n':
+            line_start = true
+            if not in_comment:
+                return line
+            in_comment = false
+            line = ""
+
+# We convert the weird syntax into JSON and parse it
+proc parse_table(f: FileStream): JsonNode =
+    var as_json = "{"
+    var depth = 0
+    assert f.get_next_line().startsWith("<<")
+    while true:
+        let line = f.get_next_line().unindent()
+        if line.startsWith(">>"):
+            # remove ',' from last elem
+            as_json.delete(as_json.len - 2, as_json.len - 2)
+            if depth != 0:
+                as_json.add("},\n")
+                depth = depth - 1
+                continue
+            else:
+                as_json.add("}\n")
+                echo as_json
+                return parseJson(as_json)
+        var separator_loc = line.find(" ")
+        assert separator_loc >= 0
+        var key = line.substr(1, separator_loc)
+        as_json.add("\"" & key & "\": ")
+        # Value is a bit more complicated as it may be many stuff, but for parsing
+        # we lump everything into a string EXCEPT sub dictionaries
+        var value = line.substr(separator_loc + 1)
+        value.removeSuffix({'\n', '\r', ' '})
+        if value.startsWith("<<"):
+            depth = depth + 1
+            as_json.add("{\n")
+        else:
+            as_json.add("\"" & value & "\",\n")
+
+
+
+# TODO: This could break on HUGE pdf files?
+proc get_uid(first_num: int, second_num: int): uint64 =
+    return first_num.uint64 + second_num.uint64 * 4294967296'u64
+
+# We only parse obj streams. TODO: This could fail on some very weird PDFs
+proc parse_pdf*(path: string): PDFMap =
+    var file = newFileStream(path, fmRead)
+
+    while true:
+        var line = file.get_next_line()
+        # Parse an object
+        if line.endsWith(" obj\n") or line.endsWith(" obj\r\n"):
+            # First line defines object ID
+            var first_num, second_num: int
+            let advance = line.parseInt(first_num)
+            line = line.substr(advance)
+            discard line.parseInt(second_num)
+            let object_uid = get_uid(first_num, second_num)
+
+            # next lines should be a table
+            let table = file.parse_table()
+
+
+    file.close()
+
+# page_map indicates wether a page in over goes into a new page (true)
+# or if it goes over the old page (ie an overlay)
+# Afterwards we COULD linearize or optimize the PDF for performance, but for relatively
+# small updates (which these are, a few lines on top of a long pdf) it should be good
+proc overlap_pdf*(base: string, over: string, page_map: seq[bool]) = 
+    discard
diff --git a/src/remarkablenim.nim b/src/remarkablenim.nim
@@ -1 +1,5 @@
-import gui/base
+#import gui/base
+
+import pdf/pdfcombine
+
+discard parse_pdf("./retmp/data/d4bd814c-dc0c-4352-b3bd-e37e8b6576d1.pdf")