-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathannotation_parser.py
More file actions
258 lines (217 loc) · 10.4 KB
/
annotation_parser.py
File metadata and controls
258 lines (217 loc) · 10.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
import hashlib
import os
class AnnotationParser:
    """
    A parser for Kindle annotation ("My Clippings") text files.

    The input is a sequence of entries, each consisting of a book header
    line, one or more annotation lines, and a ``==========`` separator line.
    Entries are grouped per book header and written to one text file per
    book in the output directory.

    - Uses a single dictionary (annotation_block -> True) per book to
      deduplicate while preserving insertion order (Python 3.7+).
    - Follows the line-reading rules of the Java code this was ported from,
      but normalizes each block so that deduplication works consistently
      both within one run and when merging with previously written files.
    - Output filenames are derived from a stable digest of the book header,
      so repeated runs address the same file for the same book.
    """

    NEW_LINE = "\n"
    MAX_TITLE_LENGTH = 112
    # Line that terminates one annotation entry in the input file.
    BLOCK_SEPARATOR = "=========="
    # Number of hex digits of the header digest kept in the output filename.
    HASH_LENGTH = 16
    # Non-alphanumeric characters allowed to survive in an output filename.
    _FILENAME_SAFE_CHARS = " ._()-"

    def __init__(self, input_path, output_path, merge_option: bool = True):
        """
        :param input_path: Path to the Kindle "My Clippings" text file.
        :param output_path: Path to the directory where output files will be stored.
        :param merge_option: If True, merges new highlights with old (deduplicating them).
                             If False, overwrites existing files.
        """
        self.input_path = input_path
        self.output_path = output_path
        # We'll store data like:
        # {
        #   "RawBookHeader1": {annotation_block1: True, annotation_block2: True, ...},
        #   "RawBookHeader2": {...},
        #   ...
        # }
        self.books = {}
        self.merge_option = merge_option

    def parse(self) -> None:
        """
        Reads from the input file, then writes grouped annotations to the output folder.

        :raises FileNotFoundError: if ``input_path`` is not an existing file.
        """
        self._read()
        self._write()

    def number_of_books(self) -> int:
        """Return the number of distinct book entries parsed."""
        return len(self.books)

    # -------------------------------------------------------------------------
    # READING LOGIC
    # -------------------------------------------------------------------------
    def _read(self) -> None:
        """
        Reads the input file entry by entry and fills ``self.books``.

        Each non-blank line encountered at entry level is taken as a book
        header; the lines after it, up to the separator, form one annotation
        block (see ``_read_block``). The block is normalized before being
        stored so that duplicates compare equal.

        :raises FileNotFoundError: if ``input_path`` is not an existing file.
        """
        if not os.path.isfile(self.input_path):
            raise FileNotFoundError(f"Input file not found: {self.input_path}")

        with open(self.input_path, "r", encoding="utf-8") as infile:
            # iter(readline, "") ends at EOF and, unlike `for line in infile`,
            # can safely be interleaved with the readline() calls that
            # _read_block() makes on the same handle.
            for raw_line in iter(infile.readline, ""):
                header_line = raw_line.rstrip("\r\n")

                # Kindle quirk: if first char isn't alphanumeric, strip it
                # (this removes e.g. a leading byte-order mark).
                if header_line and not header_line[0].isalnum():
                    header_line = header_line[1:]

                # Skip blank or invalid headers
                if not header_line:
                    continue

                # Read one annotation block, then normalize it
                annotation_block = self._normalize_block(self._read_block(infile))

                # Create the per-book dict on first sight, then insert the
                # block; setdefault keeps the first insertion position, so
                # duplicates are dropped without disturbing order.
                self.books.setdefault(header_line, {}).setdefault(annotation_block, True)

    def _read_block(self, infile) -> str:
        """
        Reads lines from ``infile`` until the separator line or EOF.

        - Blank lines contribute nothing to the block.
        - Every other line is kept, followed by a single newline.
        - One extra newline is appended at the end of the block.

        Whitespace-only lines are treated as blank: ``_split_into_blocks``
        uses such lines as block boundaries when a written file is read back,
        so keeping them here would split the block on the next merge and
        defeat deduplication.

        :param infile: Open text-mode file positioned just after a header line.
        :return: The raw (not yet normalized) annotation block.
        """
        kept_lines = []
        for raw_line in iter(infile.readline, ""):
            if raw_line.strip() == self.BLOCK_SEPARATOR:
                break
            current_line = raw_line.rstrip("\r\n")
            if current_line.strip():
                kept_lines.append(current_line + self.NEW_LINE)
        # Exactly one additional newline closes the block.
        return "".join(kept_lines) + self.NEW_LINE

    # -------------------------------------------------------------------------
    # WRITING LOGIC
    # -------------------------------------------------------------------------
    def _write(self) -> None:
        """
        For each distinct book, merges with any existing file (if merge_option is True),
        then writes a single file with all unique annotation blocks in insertion order.

        The file starts with a ``"Title, Author"`` line and a blank line;
        each block is then written followed by one blank line.
        """
        os.makedirs(self.output_path, exist_ok=True)

        for raw_header, annotation_dict in self.books.items():
            title, author = self._get_title_author(raw_header)
            fullpath = os.path.join(
                self.output_path, self._output_filename(raw_header, title)
            )

            # If merging and file exists, unify old blocks with new
            if self.merge_option and os.path.exists(fullpath):
                self._merge_with_existing(fullpath, annotation_dict)

            # Now write the final blocks (overwrite file from scratch)
            with open(fullpath, "w", encoding="utf-8") as outfile:
                outfile.write(f"{title}, {author}\n\n")
                # Keys in a Python 3.7+ dict come out in insertion order
                outfile.writelines(block_text + "\n" for block_text in annotation_dict)

    def _output_filename(self, raw_header: str, title: str) -> str:
        """
        Builds the output filename for one book.

        The name is the sanitized, length-limited title plus a digest of the
        full raw header. The digest must be identical across interpreter
        runs: the built-in ``hash()`` is salted per process for strings, so
        using it would give the same book a different filename on every run
        and merging would never find the previously written file.

        :param raw_header: The book header exactly as stored in ``self.books``.
        :param title: The title extracted from that header.
        :return: A filename of the form ``"<clean title>_<hex digest>.txt"``.
        """
        clean_title = "".join(
            ch for ch in title if ch.isalnum() or ch in self._FILENAME_SAFE_CHARS
        )
        clean_title = clean_title[: self.MAX_TITLE_LENGTH]
        digest = hashlib.sha256(raw_header.encode("utf-8")).hexdigest()
        return f"{clean_title}_{digest[: self.HASH_LENGTH]}.txt"

    def _merge_with_existing(self, filepath, annotation_dict) -> None:
        """
        Reads the existing file for this book and adds its annotation blocks
        to ``annotation_dict`` in place (deduplicating).

        Old blocks are normalized the same way as newly parsed ones, so a
        block already present in ``annotation_dict`` is not added again.
        Blocks found only in the old file are appended after the new ones.

        :param filepath: Path of the previously written output file.
        :param annotation_dict: Mapping of block text -> True for this book.
        """
        for old_block in self._read_existing_blocks(filepath):
            annotation_dict.setdefault(self._normalize_block(old_block), True)

    def _read_existing_blocks(self, filepath) -> list:
        """
        Loads the annotation blocks of a previously written output file.

        Skips:
          - line 0: "Title, Author"
          - line 1: blank line (only if it is actually blank)
        Then extracts the rest as annotation blocks.

        :param filepath: Path of the previously written output file.
        :return: List of block strings, each ending in one newline.
        """
        with open(filepath, "r", encoding="utf-8") as existing:
            content = existing.read()

        # Split on "\n" only. str.splitlines() would also break on characters
        # such as U+2028 or form feed, which the reader keeps inside a line;
        # a block containing one would then not match its stored form.
        body_lines = content.split("\n")[1:]  # drop "Title, Author"
        if body_lines and not body_lines[0].strip():
            body_lines = body_lines[1:]  # drop the blank line after the title

        return self._split_into_blocks("\n".join(body_lines))

    def _split_into_blocks(self, annotation_text: str) -> list:
        """
        Splits the body of an output file into annotation blocks.

        Blocks are written with a blank line after each one, so a blank
        (or whitespace-only) line marks the end of the current block.
        Consecutive blank lines do not produce empty blocks.

        :param annotation_text: File body without the title line.
        :return: List of blocks; each is its lines joined by "\\n" with
                 exactly one trailing newline.
        """
        blocks = []
        current_block = []
        for line in annotation_text.split("\n"):
            if line.strip():
                # part of current block
                current_block.append(line)
            elif current_block:
                # blank line => end of current block
                blocks.append("\n".join(current_block) + "\n")
                current_block = []

        # If there's something left in current_block, finalize it
        if current_block:
            blocks.append("\n".join(current_block) + "\n")
        return blocks

    # -------------------------------------------------------------------------
    # NORMALIZATION
    # -------------------------------------------------------------------------
    def _normalize_block(self, block_text: str) -> str:
        """
        Ensures that two identical highlights that differ only in trailing
        blank lines or trailing whitespace are recognized as the same.

        - Strips all trailing whitespace (including newlines).
        - Then adds exactly one newline at the end.

        :param block_text: Raw block text.
        :return: The block with exactly one trailing newline.
        """
        return block_text.rstrip() + "\n"

    # -------------------------------------------------------------------------
    # HELPER: Extract (title, author) from "Title (Author)"
    # -------------------------------------------------------------------------
    def _get_title_author(self, book_header: str) -> tuple:
        """
        Splits a header of the form ``"Title (Author)"`` into its parts.

        The LAST ``(`` and ``)`` in the header delimit the author, so
        parentheses inside the title are left alone. If there is no such
        pair (or the header starts with ``(``), the whole header is the
        title and the author is empty.

        :param book_header: Raw book header line.
        :return: ``(title, author)``, both stripped of surrounding whitespace.
        """
        open_paren = book_header.rfind("(")
        close_paren = book_header.rfind(")")
        if open_paren > 0 and close_paren > open_paren:
            title = book_header[:open_paren].strip()
            author = book_header[open_paren + 1 : close_paren].strip()
        else:
            title = book_header.strip()
            author = ""
        return title, author