@@ -6,6 +6,7 @@ Common recipes and customizations for djot-php.
66
- [External Links](#external-links)
- [Custom Emoji/Symbols](#custom-emojisymbols)
- [Unicode Codepoints](#unicode-codepoints)
- [Abbreviations](#abbreviations)
- [Syntax Highlighting](#syntax-highlighting)
- [Table of Contents Generation](#table-of-contents-generation)
@@ -115,6 +116,171 @@ Output:
115116<p >I <span class =" emoji" title =" heart" >❤️</span > this <span class =" emoji" title =" rocket" >🚀</span > feature!</p >
116117```
117118
119+ ## Unicode Codepoints
120+
Insert Unicode characters by codepoint by handling the `:symbol:` syntax in a
`render.symbol` listener. This is useful for hard-to-type characters like directional
marks, variation selectors, zero-width joiners, and other invisible or special Unicode
characters.
124+
See [djot issue #44](https://github.com/jgm/djot/issues/44) for background on this use case.
126+
127+ ### Supported Formats
128+
129+ ``` php
use Djot\DjotConverter;
use Djot\Event\RenderEvent;
use Djot\Node\Inline\Symbol;

$converter = new DjotConverter();

// Render :U+XXXX:, :0xXXXX: and :NNN: symbols as the Unicode character they
// name. Anything that does not resolve to an encodable code point is echoed
// back as the literal :name: text.
$converter->on('render.symbol', function (RenderEvent $event): void {
    $symbol = $event->getNode();
    if (!$symbol instanceof Symbol) {
        return;
    }

    $name = $symbol->getName();

    $codepoint = null;
    if (preg_match('/^(?:U\+|0x)([0-9A-Fa-f]+)$/', $name, $m) === 1) {
        // Hex with U+ or 0x prefix: :U+2192: → "→", :0x14b: → "ŋ"
        $codepoint = hexdec($m[1]);
    } elseif (preg_match('/^[0-9]+$/', $name) === 1) {
        // Decimal: :331: → "ŋ"
        $codepoint = (int) $name;
    }

    // hexdec() returns a float once the value no longer fits in an int, so
    // is_int() doubles as an overflow guard ahead of the range check.
    if (is_int($codepoint) && $codepoint <= 0x10FFFF) {
        // mb_chr() returns false for code points that cannot be encoded in
        // UTF-8 (the surrogate range U+D800–U+DFFF); fall through in that case.
        $char = mb_chr($codepoint, 'UTF-8');
        if ($char !== false) {
            // Escape before handing the value to setHtml(), exactly as the
            // fallback below does: otherwise :U+3C: or :38: would put a raw
            // "<" or "&" into the generated markup.
            $event->setHtml(htmlspecialchars($char, ENT_QUOTES, 'UTF-8'));

            return;
        }
    }

    // Unknown symbol - keep original
    $event->setHtml(':' . htmlspecialchars($name, ENT_QUOTES, 'UTF-8') . ':');
});

echo $converter->convert('Arrow: :U+2192: Eng: :0x14b: or :331:');
179+ ```
180+
181+ Output:
182+ ``` html
<p>Arrow: → Eng: ŋ or ŋ</p>
184+ ```
185+
186+ ### Use Cases
187+
**Bidirectional text markers** (essential for mixed RTL/LTR content):
189+ ``` djot
190+ English text :U+200F: متن فارسی :U+200E: more English
191+ ```
192+
193+ - ` :U+200E: ` - Left-to-right mark (LRM)
194+ - ` :U+200F: ` - Right-to-left mark (RLM)
195+ - ` :U+200B: ` - Zero-width space (allows line breaks)
196+ - ` :U+2060: ` - Word joiner (prevents line breaks)
197+
**Variation selectors** (control glyph variants):
199+ ``` djot
200+ The character 㐂:U+E0102: uses the third registered variant.
201+ ```
202+
**Soft hyphens** (invisible until a line break is needed):
204+ ``` djot
205+ super:U+AD:cali:U+AD:fragi:U+AD:listic
206+ ```
207+
208+ ### Combining with Emoji
209+
210+ Handle both emoji names and codepoints:
211+
212+ ``` php
$emojis = [
    'heart' => '❤️',
    'star' => '⭐',
];

// Resolve a symbol in two stages: named emoji first, then numeric code points
// (:U+XXXX:, :0xXXXX:, :NNN:). Unresolved symbols are echoed back as :name:.
$converter->on('render.symbol', function (RenderEvent $event) use ($emojis): void {
    $symbol = $event->getNode();
    if (!$symbol instanceof Symbol) {
        return;
    }

    $name = $symbol->getName();

    // Check emoji map first
    if (isset($emojis[$name])) {
        $event->setHtml($emojis[$name]);

        return;
    }

    // Then try codepoint formats
    $codepoint = null;
    if (preg_match('/^U\+([0-9A-Fa-f]+)$/', $name, $m)) {
        $codepoint = hexdec($m[1]);
    } elseif (preg_match('/^0x([0-9A-Fa-f]+)$/', $name, $m)) {
        $codepoint = hexdec($m[1]);
    } elseif (preg_match('/^[0-9]+$/', $name)) {
        $codepoint = (int) $name;
    }

    // is_int() rejects the float that hexdec() yields for oversized input.
    if (is_int($codepoint) && $codepoint >= 0 && $codepoint <= 0x10FFFF) {
        // mb_chr() returns false for unencodable code points such as the
        // surrogates U+D800–U+DFFF; those fall through to the fallback.
        $char = mb_chr($codepoint, 'UTF-8');
        if ($char !== false) {
            // Escape the character like the fallback does, so that :U+3C: or
            // :0x26: cannot place a raw "<" or "&" in the output.
            $event->setHtml(htmlspecialchars($char, ENT_QUOTES, 'UTF-8'));

            return;
        }
    }

    // Unknown - keep original
    $event->setHtml(':' . htmlspecialchars($name, ENT_QUOTES, 'UTF-8') . ':');
});

echo $converter->convert('I :heart: arrows :U+2192: and :star:');
254+ ```
255+
256+ Output:
257+ ``` html
<p>I ❤️ arrows → and ⭐</p>
259+ ```
260+
261+ ### Alternatives
262+
263+ For simpler cases, djot provides built-in alternatives:
264+
**Non-breaking space** - use an escaped space (`\ `):
266+ ``` djot
267+ 100\ km
268+ ```
269+
Output: `<p>100&nbsp;km</p>`
271+
**HTML entities** - use raw HTML syntax:
273+ ``` djot
`&mdash;`{=html} for em-dash, `&copy;`{=html} for ©
275+ ```
276+
Output: `<p>&mdash; for em-dash, &copy; for ©</p>`
278+
279+ The codepoint approach above is most useful when you need:
280+ - Invisible Unicode characters (directional marks, joiners)
281+ - Characters without named HTML entities
282+ - A consistent syntax for all special characters
283+
118284## Abbreviations
119285
120286Convert spans with ` abbr ` attribute to semantic ` <abbr> ` elements:
0 commit comments