Refactoring with bidi_fetch_char tested and debugged.
[bpt/emacs.git] / src / bidi.c
CommitLineData
b7b65b15 1/* Low-level bidirectional buffer-scanning functions for GNU Emacs.
73b0cd50 2 Copyright (C) 2000-2001, 2004-2005, 2009-2011
b118e65f 3 Free Software Foundation, Inc.
b7b65b15
EZ
4
5This file is part of GNU Emacs.
6
a8d11bd3 7GNU Emacs is free software: you can redistribute it and/or modify
b7b65b15 8it under the terms of the GNU General Public License as published by
a8d11bd3
EZ
9the Free Software Foundation, either version 3 of the License, or
10(at your option) any later version.
b7b65b15
EZ
11
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
b7b65b15 17You should have received a copy of the GNU General Public License
a8d11bd3 18along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
b7b65b15 19
2d6e4628
EZ
20/* Written by Eli Zaretskii <eliz@gnu.org>.
21
22 A sequential implementation of the Unicode Bidirectional algorithm,
b7b65b15
EZ
23 as per UAX#9, a part of the Unicode Standard.
24
25 Unlike the reference and most other implementations, this one is
940afb59
EZ
26 designed to be called once for every character in the buffer or
27 string.
b7b65b15 28
4b292a22 29 The main entry point is bidi_move_to_visually_next. Each time it
b7b65b15
EZ
30 is called, it finds the next character in the visual order, and
31 returns its information in a special structure. The caller is then
32 expected to process this character for display or any other
4b292a22
EZ
33 purposes, and call bidi_move_to_visually_next for the next
34 character. See the comments in bidi_move_to_visually_next for more
35 details about its algorithm that finds the next visual-order
b7b65b15
EZ
36 character by resolving their levels on the fly.
37
940afb59
EZ
38 The two other entry points are bidi_paragraph_init and
39 bidi_mirror_char. The first determines the base direction of a
40 paragraph, while the second returns the mirrored version of its
41 argument character.
42
89d3374a
EZ
43 If you want to understand the code, you will have to read it
44 together with the relevant portions of UAX#9. The comments include
45 references to UAX#9 rules, for that very reason.
46
b7b65b15
EZ
47 A note about references to UAX#9 rules: if the reference says
48 something like "X9/Retaining", it means that you need to refer to
49 rule X9 and to its modifications decribed in the "Implementation
50 Notes" section of UAX#9, under "Retaining Format Codes". */
51
b7b65b15 52#include <config.h>
b7b65b15 53#include <stdio.h>
29e3d8d1
EZ
54#include <setjmp.h>
55
b7b65b15
EZ
56#include "lisp.h"
57#include "buffer.h"
58#include "character.h"
59#include "dispextern.h"
60
61static int bidi_initialized = 0;
62
cbc4fd20 63static Lisp_Object bidi_type_table, bidi_mirror_table;
b7b65b15
EZ
64
65#define LRM_CHAR 0x200E
66#define RLM_CHAR 0x200F
b7b65b15 67#define BIDI_EOB -1
b7b65b15 68
b7b65b15
EZ
69/* Local data structures. (Look in dispextern.h for the rest.) */
70
71/* What we need to know about the current paragraph. */
72struct bidi_paragraph_info {
61bfec98
EZ
73 EMACS_INT start_bytepos; /* byte position where it begins */
74 EMACS_INT end_bytepos; /* byte position where it ends */
75 int embedding_level; /* its basic embedding level */
76 bidi_dir_t base_dir; /* its base direction */
b7b65b15
EZ
77};
78
79/* Data type for describing the bidirectional character categories. */
80typedef enum {
81 UNKNOWN_BC,
82 NEUTRAL,
83 WEAK,
84 STRONG
85} bidi_category_t;
86
e78aecca 87extern int bidi_ignore_explicit_marks_for_paragraph_level EXTERNALLY_VISIBLE;
b7b65b15
EZ
88int bidi_ignore_explicit_marks_for_paragraph_level = 1;
89
372b7a95 90static Lisp_Object paragraph_start_re, paragraph_separate_re;
6bff6497 91static Lisp_Object Qparagraph_start, Qparagraph_separate;
b7b65b15 92
b7b65b15 93static void
971de7fb 94bidi_initialize (void)
b7b65b15 95{
317fbf33
EZ
96
97#include "biditype.h"
cbc4fd20 98#include "bidimirror.h"
317fbf33 99
b7b65b15
EZ
100 int i;
101
102 bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L));
2d6e4628 103 staticpro (&bidi_type_table);
b7b65b15
EZ
104
105 for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++)
317fbf33 106 char_table_set_range (bidi_type_table, bidi_type[i].from, bidi_type[i].to,
b7b65b15 107 make_number (bidi_type[i].type));
6bff6497 108
cbc4fd20
EZ
109 bidi_mirror_table = Fmake_char_table (Qnil, Qnil);
110 staticpro (&bidi_mirror_table);
111
112 for (i = 0; i < sizeof bidi_mirror / sizeof bidi_mirror[0]; i++)
113 char_table_set (bidi_mirror_table, bidi_mirror[i].from,
114 make_number (bidi_mirror[i].to));
115
b4bf28b7
SM
116 Qparagraph_start = intern ("paragraph-start");
117 staticpro (&Qparagraph_start);
372b7a95
EZ
118 paragraph_start_re = Fsymbol_value (Qparagraph_start);
119 if (!STRINGP (paragraph_start_re))
120 paragraph_start_re = build_string ("\f\\|[ \t]*$");
121 staticpro (&paragraph_start_re);
b4bf28b7
SM
122 Qparagraph_separate = intern ("paragraph-separate");
123 staticpro (&Qparagraph_separate);
372b7a95
EZ
124 paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
125 if (!STRINGP (paragraph_separate_re))
126 paragraph_separate_re = build_string ("[ \t\f]*$");
127 staticpro (&paragraph_separate_re);
b7b65b15
EZ
128 bidi_initialized = 1;
129}
130
6bff6497
EZ
131/* Return the bidi type of a character CH, subject to the current
132 directional OVERRIDE. */
fd3998ff 133static INLINE bidi_type_t
6bff6497 134bidi_get_type (int ch, bidi_dir_t override)
b7b65b15 135{
6bff6497
EZ
136 bidi_type_t default_type;
137
e7402cb2
EZ
138 if (ch == BIDI_EOB)
139 return NEUTRAL_B;
e342a24d
EZ
140 if (ch < 0 || ch > MAX_CHAR)
141 abort ();
6bff6497
EZ
142
143 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
144
145 if (override == NEUTRAL_DIR)
146 return default_type;
147
148 switch (default_type)
149 {
150 /* Although UAX#9 does not tell, it doesn't make sense to
151 override NEUTRAL_B and LRM/RLM characters. */
152 case NEUTRAL_B:
153 case LRE:
154 case LRO:
155 case RLE:
156 case RLO:
157 case PDF:
158 return default_type;
159 default:
160 switch (ch)
161 {
162 case LRM_CHAR:
163 case RLM_CHAR:
164 return default_type;
165 default:
166 if (override == L2R) /* X6 */
167 return STRONG_L;
168 else if (override == R2L)
169 return STRONG_R;
170 else
171 abort (); /* can't happen: handled above */
172 }
173 }
b7b65b15
EZ
174}
175
7d3b3862 176static void
2d6e4628
EZ
177bidi_check_type (bidi_type_t type)
178{
179 if (type < UNKNOWN_BT || type > NEUTRAL_ON)
180 abort ();
181}
182
b7b65b15 183/* Given a bidi TYPE of a character, return its category. */
fd3998ff 184static INLINE bidi_category_t
b7b65b15
EZ
185bidi_get_category (bidi_type_t type)
186{
187 switch (type)
188 {
189 case UNKNOWN_BT:
190 return UNKNOWN_BC;
191 case STRONG_L:
192 case STRONG_R:
193 case STRONG_AL:
194 case LRE:
195 case LRO:
196 case RLE:
197 case RLO:
198 return STRONG;
199 case PDF: /* ??? really?? */
200 case WEAK_EN:
201 case WEAK_ES:
202 case WEAK_ET:
203 case WEAK_AN:
204 case WEAK_CS:
205 case WEAK_NSM:
206 case WEAK_BN:
207 return WEAK;
208 case NEUTRAL_B:
209 case NEUTRAL_S:
210 case NEUTRAL_WS:
211 case NEUTRAL_ON:
212 return NEUTRAL;
213 default:
214 abort ();
215 }
216}
217
a6e8d97c
EZ
218/* Return the mirrored character of C, if it has one. If C has no
219 mirrored counterpart, return C.
220 Note: The conditions in UAX#9 clause L4 regarding the surrounding
221 context must be tested by the caller. */
b7b65b15
EZ
222int
223bidi_mirror_char (int c)
224{
cbc4fd20
EZ
225 Lisp_Object val;
226
227 if (c == BIDI_EOB)
228 return c;
229 if (c < 0 || c > MAX_CHAR)
230 abort ();
b7b65b15 231
cbc4fd20
EZ
232 val = CHAR_TABLE_REF (bidi_mirror_table, c);
233 if (INTEGERP (val))
b7b65b15 234 {
cbc4fd20 235 int v = XINT (val);
b7b65b15 236
cbc4fd20
EZ
237 if (v < 0 || v > MAX_CHAR)
238 abort ();
239
240 return v;
b7b65b15 241 }
cbc4fd20 242
b7b65b15
EZ
243 return c;
244}
245
246/* Copy the bidi iterator from FROM to TO. To save cycles, this only
247 copies the part of the level stack that is actually in use. */
fd3998ff 248static INLINE void
b7b65b15
EZ
249bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
250{
251 int i;
252
e69a9370 253 /* Copy everything except the level stack and beyond. */
182ce2d2 254 memcpy (to, from, offsetof (struct bidi_it, level_stack[0]));
b7b65b15
EZ
255
256 /* Copy the active part of the level stack. */
257 to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
258 for (i = 1; i <= from->stack_idx; i++)
259 to->level_stack[i] = from->level_stack[i];
260}
261
262/* Caching the bidi iterator states. */
263
2fe72643
EZ
264#define BIDI_CACHE_CHUNK 200
265static struct bidi_it *bidi_cache;
266static size_t bidi_cache_size = 0;
0416466c 267static size_t elsz = sizeof (struct bidi_it);
2fe72643
EZ
268static int bidi_cache_idx; /* next unused cache slot */
269static int bidi_cache_last_idx; /* slot of last cache hit */
b7b65b15 270
fd3998ff 271static INLINE void
b7b65b15
EZ
272bidi_cache_reset (void)
273{
274 bidi_cache_idx = 0;
275 bidi_cache_last_idx = -1;
276}
277
2fe72643
EZ
278static INLINE void
279bidi_cache_shrink (void)
280{
281 if (bidi_cache_size > BIDI_CACHE_CHUNK)
282 {
0416466c
EZ
283 bidi_cache_size = BIDI_CACHE_CHUNK;
284 bidi_cache =
285 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
2fe72643
EZ
286 }
287 bidi_cache_reset ();
288}
289
fd3998ff 290static INLINE void
b7b65b15
EZ
291bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it)
292{
293 int current_scan_dir = bidi_it->scan_dir;
294
295 if (idx < 0 || idx >= bidi_cache_idx)
296 abort ();
297
298 bidi_copy_it (bidi_it, &bidi_cache[idx]);
299 bidi_it->scan_dir = current_scan_dir;
300 bidi_cache_last_idx = idx;
301}
302
303/* Find a cached state with a given CHARPOS and resolved embedding
304 level less or equal to LEVEL. if LEVEL is -1, disregard the
305 resolved levels in cached states. DIR, if non-zero, means search
306 in that direction from the last cache hit. */
fd3998ff 307static INLINE int
61bfec98 308bidi_cache_search (EMACS_INT charpos, int level, int dir)
b7b65b15
EZ
309{
310 int i, i_start;
311
312 if (bidi_cache_idx)
313 {
314 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
182ce2d2
EZ
315 {
316 dir = -1;
317 i_start = bidi_cache_last_idx - 1;
318 }
319 else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
320 + bidi_cache[bidi_cache_last_idx].nchars - 1))
321 {
322 dir = 1;
323 i_start = bidi_cache_last_idx + 1;
324 }
325 else if (dir)
b7b65b15
EZ
326 i_start = bidi_cache_last_idx;
327 else
328 {
329 dir = -1;
330 i_start = bidi_cache_idx - 1;
331 }
332
333 if (dir < 0)
334 {
335 /* Linear search for now; FIXME! */
336 for (i = i_start; i >= 0; i--)
182ce2d2
EZ
337 if (bidi_cache[i].charpos <= charpos
338 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
339 && (level == -1 || bidi_cache[i].resolved_level <= level))
340 return i;
341 }
342 else
343 {
344 for (i = i_start; i < bidi_cache_idx; i++)
182ce2d2
EZ
345 if (bidi_cache[i].charpos <= charpos
346 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
347 && (level == -1 || bidi_cache[i].resolved_level <= level))
348 return i;
349 }
350 }
351
352 return -1;
353}
354
355/* Find a cached state where the resolved level changes to a value
356 that is lower than LEVEL, and return its cache slot index. DIR is
357 the direction to search, starting with the last used cache slot.
358 BEFORE, if non-zero, means return the index of the slot that is
359 ``before'' the level change in the search direction. That is,
360 given the cached levels like this:
361
362 1122333442211
363 AB C
364
365 and assuming we are at the position cached at the slot marked with
366 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
367 index of slot B or A, depending whether BEFORE is, respectively,
368 non-zero or zero. */
369static int
370bidi_cache_find_level_change (int level, int dir, int before)
371{
372 if (bidi_cache_idx)
373 {
374 int i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
375 int incr = before ? 1 : 0;
376
377 if (!dir)
378 dir = -1;
379 else if (!incr)
380 i += dir;
381
382 if (dir < 0)
383 {
384 while (i >= incr)
385 {
386 if (bidi_cache[i - incr].resolved_level >= 0
387 && bidi_cache[i - incr].resolved_level < level)
388 return i;
389 i--;
390 }
391 }
392 else
393 {
394 while (i < bidi_cache_idx - incr)
395 {
396 if (bidi_cache[i + incr].resolved_level >= 0
397 && bidi_cache[i + incr].resolved_level < level)
398 return i;
399 i++;
400 }
401 }
402 }
403
404 return -1;
405}
406
fd3998ff 407static INLINE void
b7b65b15
EZ
408bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
409{
410 int idx;
411
412 /* We should never cache on backward scans. */
413 if (bidi_it->scan_dir == -1)
414 abort ();
415 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
416
417 if (idx < 0)
418 {
419 idx = bidi_cache_idx;
2fe72643
EZ
420 /* Enlarge the cache as needed. */
421 if (idx >= bidi_cache_size)
422 {
0416466c 423 bidi_cache_size += BIDI_CACHE_CHUNK;
2fe72643 424 bidi_cache =
0416466c 425 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
2fe72643 426 }
bd924a5d
EZ
427 /* Character positions should correspond to cache positions 1:1.
428 If we are outside the range of cached positions, the cache is
429 useless and must be reset. */
430 if (idx > 0 &&
182ce2d2
EZ
431 (bidi_it->charpos > (bidi_cache[idx - 1].charpos
432 + bidi_cache[idx - 1].nchars)
bd924a5d
EZ
433 || bidi_it->charpos < bidi_cache[0].charpos))
434 {
435 bidi_cache_reset ();
436 idx = 0;
437 }
102ebb00
EZ
438 if (bidi_it->nchars <= 0)
439 abort ();
b7b65b15
EZ
440 bidi_copy_it (&bidi_cache[idx], bidi_it);
441 if (!resolved)
442 bidi_cache[idx].resolved_level = -1;
443 }
444 else
445 {
446 /* Copy only the members which could have changed, to avoid
447 costly copying of the entire struct. */
448 bidi_cache[idx].type = bidi_it->type;
2d6e4628 449 bidi_check_type (bidi_it->type);
89d3374a
EZ
450 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
451 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
452 if (resolved)
453 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
454 else
455 bidi_cache[idx].resolved_level = -1;
456 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
457 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
458 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
459 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
460 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
461 }
462
463 bidi_cache_last_idx = idx;
464 if (idx >= bidi_cache_idx)
465 bidi_cache_idx = idx + 1;
466}
467
fd3998ff 468static INLINE bidi_type_t
61bfec98 469bidi_cache_find (EMACS_INT charpos, int level, struct bidi_it *bidi_it)
b7b65b15
EZ
470{
471 int i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
472
473 if (i >= 0)
474 {
475 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
476
5009f85e 477 bidi_copy_it (bidi_it, &bidi_cache[i]);
b7b65b15
EZ
478 bidi_cache_last_idx = i;
479 /* Don't let scan direction from from the cached state override
480 the current scan direction. */
481 bidi_it->scan_dir = current_scan_dir;
482 return bidi_it->type;
483 }
484
485 return UNKNOWN_BT;
486}
487
fd3998ff 488static INLINE int
b7b65b15
EZ
489bidi_peek_at_next_level (struct bidi_it *bidi_it)
490{
491 if (bidi_cache_idx == 0 || bidi_cache_last_idx == -1)
492 abort ();
493 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
494}
495
be39f003
EZ
496/* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
497 Value is the non-negative length of the paragraph separator
498 following the buffer position, -1 if position is at the beginning
499 of a new paragraph, or -2 if position is neither at beginning nor
500 at end of a paragraph. */
fd3998ff 501static EMACS_INT
6bff6497 502bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
b7b65b15 503{
372b7a95
EZ
504 Lisp_Object sep_re;
505 Lisp_Object start_re;
be39f003
EZ
506 EMACS_INT val;
507
372b7a95
EZ
508 sep_re = paragraph_separate_re;
509 start_re = paragraph_start_re;
be39f003
EZ
510
511 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
512 if (val < 0)
513 {
514 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
515 val = -1;
516 else
517 val = -2;
518 }
b7b65b15 519
be39f003 520 return val;
b7b65b15
EZ
521}
522
523/* Determine the start-of-run (sor) directional type given the two
524 embedding levels on either side of the run boundary. Also, update
525 the saved info about previously seen characters, since that info is
526 generally valid for a single level run. */
fd3998ff 527static INLINE void
b7b65b15
EZ
528bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
529{
530 int higher_level = level_before > level_after ? level_before : level_after;
531
532 /* The prev_was_pdf gork is required for when we have several PDFs
533 in a row. In that case, we want to compute the sor type for the
534 next level run only once: when we see the first PDF. That's
535 because the sor type depends only on the higher of the two levels
536 that we find on the two sides of the level boundary (see UAX#9,
537 clause X10), and so we don't need to know the final embedding
538 level to which we descend after processing all the PDFs. */
e342a24d 539 if (!bidi_it->prev_was_pdf || level_before < level_after)
b7b65b15
EZ
540 /* FIXME: should the default sor direction be user selectable? */
541 bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
542 if (level_before > level_after)
543 bidi_it->prev_was_pdf = 1;
544
545 bidi_it->prev.type = UNKNOWN_BT;
89d3374a
EZ
546 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
547 bidi_it->last_strong.orig_type = UNKNOWN_BT;
b7b65b15
EZ
548 bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
549 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
550 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
89d3374a
EZ
551 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
552 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15
EZ
553 bidi_it->ignore_bn_limit = 0; /* meaning it's unknown */
554}
555
182ce2d2 556/* Perform initializations for reordering a new line of bidi text. */
6bff6497 557static void
be39f003
EZ
558bidi_line_init (struct bidi_it *bidi_it)
559{
560 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
561 bidi_it->resolved_level = bidi_it->level_stack[0].level;
562 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
563 bidi_it->invalid_levels = 0;
564 bidi_it->invalid_rl_levels = -1;
565 bidi_it->next_en_pos = -1;
566 bidi_it->next_for_ws.type = UNKNOWN_BT;
b44d9321
EZ
567 bidi_set_sor_type (bidi_it,
568 bidi_it->paragraph_dir == R2L ? 1 : 0,
be39f003
EZ
569 bidi_it->level_stack[0].level); /* X10 */
570
571 bidi_cache_reset ();
572}
573
102ebb00
EZ
574/* Fetch and return the character at BYTEPOS/CHARPOS. If that
575 character is covered by a display string, treat the entire run of
576 covered characters as a single character u+FFFC, and return their
577 combined length in CH_LEN and NCHARS. DISP_POS specifies the
578 character position of the next display string, or -1 if not yet
579 computed. When the next character is at or beyond that position,
580 the function updates DISP_POS with the position of the next display
581 string. */
182ce2d2 582static INLINE int
102ebb00 583bidi_fetch_char (EMACS_INT bytepos, EMACS_INT charpos, EMACS_INT *disp_pos,
182ce2d2
EZ
584 EMACS_INT *ch_len, EMACS_INT *nchars)
585{
586 int ch;
587
588 /* FIXME: Support strings in addition to buffers. */
589 /* If we got past the last known position of display string, compute
590 the position of the next one. That position could be at BYTEPOS. */
102ebb00
EZ
591 if (charpos < ZV && charpos > *disp_pos)
592 *disp_pos = compute_display_string_pos (charpos);
593
594 /* Fetch the character at BYTEPOS. */
182ce2d2
EZ
595 if (bytepos >= ZV_BYTE)
596 {
597 ch = BIDI_EOB;
598 *ch_len = 1;
599 *nchars = 1;
600 }
601#if 0
102ebb00 602 else if (charpos >= *disp_pos)
182ce2d2
EZ
603 {
604 /* support characters covered by a display string */
605 ch = 0xFFFC; /* Unicode Object Replacement Character */
606 }
607#endif
608 else
609 {
610 ch = FETCH_MULTIBYTE_CHAR (bytepos);
611 *ch_len = CHAR_BYTES (ch);
612 *nchars = 1;
613 }
614
615 /* If we just entered a run of characters covered by a display
616 string, compute the position of the next display string. */
102ebb00
EZ
617 if (charpos + *nchars <= ZV && charpos + *nchars > *disp_pos)
618 *disp_pos = compute_display_string_pos (charpos + *nchars);
182ce2d2
EZ
619
620 return ch;
621}
622
623/* Looks like we won't need this one. */
624#if 0
625/* Fetch character at CHARPOS/BYTEPOS. Return the character, and
626 advance CHARPOS and BYTEPOS to the next character in logical
627 order. */
628static INLINE int
629bidi_fetch_char_advance (EMACS_INT *charpos, EMACS_INT *bytepos)
630{
631 int ch;
632
633 /* FIXME: Support strings in addition to buffers. */
634 FETCH_CHAR_ADVANCE_NO_CHECK (ch, charpos, bytepos);
635
636#if 0
637 if (...)
638 {
639 /* FIXME: Support characters covered by display strings. */
640 ch = 0xFFFC;
641 }
642#endif
643
644 return ch;
645}
646#endif
647
be39f003
EZ
648/* Find the beginning of this paragraph by looking back in the buffer.
649 Value is the byte position of the paragraph's beginning. */
650static EMACS_INT
b44d9321 651bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
6bff6497 652{
372b7a95 653 Lisp_Object re = paragraph_start_re;
6bff6497
EZ
654 EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
655
6bff6497
EZ
656 while (pos_byte > BEGV_BYTE
657 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
658 {
182ce2d2
EZ
659 /* FIXME: What if the paragraph beginning is covered by a
660 display string? And what if a display string covering some
661 of the text over which we scan back includes
662 paragraph_start_re? */
be39f003
EZ
663 pos = find_next_newline_no_quit (pos - 1, -1);
664 pos_byte = CHAR_TO_BYTE (pos);
6bff6497 665 }
be39f003 666 return pos_byte;
6bff6497
EZ
667}
668
bea4f10c 669/* Determine the base direction, a.k.a. base embedding level, of the
b44d9321
EZ
670 paragraph we are about to iterate through. If DIR is either L2R or
671 R2L, just use that. Otherwise, determine the paragraph direction
bea4f10c
EZ
672 from the first strong directional character of the paragraph.
673
182ce2d2 674 NO_DEFAULT_P non-zero means don't default to L2R if the paragraph
bea4f10c
EZ
675 has no strong directional characters and both DIR and
676 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
677 in the buffer until a paragraph is found with a strong character,
678 or until hitting BEGV. In the latter case, fall back to L2R. This
679 flag is used in current-bidi-paragraph-direction.
680
681 Note that this function gives the paragraph separator the same
682 direction as the preceding paragraph, even though Emacs generally
683 views the separartor as not belonging to any paragraph. */
b7b65b15 684void
bea4f10c 685bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p)
b7b65b15 686{
6bff6497 687 EMACS_INT bytepos = bidi_it->bytepos;
bea4f10c 688 EMACS_INT pstartbyte;
e7402cb2 689
5e65aec0
EZ
690 /* Special case for an empty buffer. */
691 if (bytepos == BEGV_BYTE && bytepos == ZV_BYTE)
692 dir = L2R;
9c82e145 693 /* We should never be called at EOB or before BEGV. */
5e65aec0 694 else if (bytepos >= ZV_BYTE || bytepos < BEGV_BYTE)
9c82e145
EZ
695 abort ();
696
be39f003
EZ
697 if (dir == L2R)
698 {
699 bidi_it->paragraph_dir = L2R;
700 bidi_it->new_paragraph = 0;
701 }
702 else if (dir == R2L)
703 {
704 bidi_it->paragraph_dir = R2L;
705 bidi_it->new_paragraph = 0;
706 }
b7b65b15
EZ
707 else if (dir == NEUTRAL_DIR) /* P2 */
708 {
182ce2d2
EZ
709 int ch;
710 EMACS_INT ch_len, nchars;
711 EMACS_INT pos, disp_pos = -1;
6bff6497 712 bidi_type_t type;
be39f003 713
d20e1419
EZ
714 if (!bidi_initialized)
715 bidi_initialize ();
716
be39f003
EZ
717 /* If we are inside a paragraph separator, we are just waiting
718 for the separator to be exhausted; use the previous paragraph
e5a2fec7
EZ
719 direction. But don't do that if we have been just reseated,
720 because we need to reinitialize below in that case. */
721 if (!bidi_it->first_elt
722 && bidi_it->charpos < bidi_it->separator_limit)
be39f003
EZ
723 return;
724
b44d9321 725 /* If we are on a newline, get past it to where the next
5e65aec0
EZ
726 paragraph might start. But don't do that at BEGV since then
727 we are potentially in a new paragraph that doesn't yet
728 exist. */
c143c213 729 pos = bidi_it->charpos;
5e65aec0 730 if (bytepos > BEGV_BYTE && FETCH_CHAR (bytepos) == '\n')
be39f003 731 {
b44d9321 732 bytepos++;
c143c213 733 pos++;
be39f003 734 }
b44d9321
EZ
735
736 /* We are either at the beginning of a paragraph or in the
737 middle of it. Find where this paragraph starts. */
bea4f10c 738 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
be39f003
EZ
739 bidi_it->separator_limit = -1;
740 bidi_it->new_paragraph = 0;
bea4f10c
EZ
741
742 /* The following loop is run more than once only if NO_DEFAULT_P
743 is non-zero. */
744 do {
745 bytepos = pstartbyte;
bea4f10c 746 pos = BYTE_TO_CHAR (bytepos);
102ebb00 747 ch = bidi_fetch_char (bytepos, pos, &disp_pos, &ch_len, &nchars);
bea4f10c
EZ
748 type = bidi_get_type (ch, NEUTRAL_DIR);
749
182ce2d2 750 for (pos += nchars, bytepos += ch_len;
bea4f10c
EZ
751 /* NOTE: UAX#9 says to search only for L, AL, or R types
752 of characters, and ignore RLE, RLO, LRE, and LRO.
753 However, I'm not sure it makes sense to omit those 4;
754 should try with and without that to see the effect. */
755 (bidi_get_category (type) != STRONG)
756 || (bidi_ignore_explicit_marks_for_paragraph_level
757 && (type == RLE || type == RLO
758 || type == LRE || type == LRO));
759 type = bidi_get_type (ch, NEUTRAL_DIR))
760 {
761 if (type == NEUTRAL_B && bidi_at_paragraph_end (pos, bytepos) >= -1)
21fce5ab 762 break;
bea4f10c
EZ
763 if (bytepos >= ZV_BYTE)
764 {
765 /* Pretend there's a paragraph separator at end of
766 buffer. */
767 type = NEUTRAL_B;
768 break;
769 }
102ebb00
EZ
770 /* Fetch next character and advance to get past it. */
771 ch = bidi_fetch_char (bytepos, pos, &disp_pos, &ch_len, &nchars);
182ce2d2
EZ
772 pos += nchars;
773 bytepos += ch_len;
bea4f10c
EZ
774 }
775 if (type == STRONG_R || type == STRONG_AL) /* P3 */
776 bidi_it->paragraph_dir = R2L;
777 else if (type == STRONG_L)
778 bidi_it->paragraph_dir = L2R;
779 if (no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
780 {
781 /* If this paragraph is at BEGV, default to L2R. */
782 if (pstartbyte == BEGV_BYTE)
783 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
784 else
785 {
786 EMACS_INT prevpbyte = pstartbyte;
787 EMACS_INT p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
788
789 /* Find the beginning of the previous paragraph, if any. */
790 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
791 {
182ce2d2
EZ
792 /* FXIME: What if p is covered by a display
793 string? See also a FIXME inside
794 bidi_find_paragraph_start. */
bea4f10c
EZ
795 p--;
796 pbyte = CHAR_TO_BYTE (p);
797 prevpbyte = bidi_find_paragraph_start (p, pbyte);
798 }
799 pstartbyte = prevpbyte;
800 }
801 }
802 } while (no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
b7b65b15 803 }
be39f003
EZ
804 else
805 abort ();
b7b65b15 806
b44d9321
EZ
807 /* Contrary to UAX#9 clause P3, we only default the paragraph
808 direction to L2R if we have no previous usable paragraph
bea4f10c 809 direction. This is allowed by the HL1 clause. */
d20e1419 810 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
bea4f10c 811 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
be39f003 812 if (bidi_it->paragraph_dir == R2L)
b44d9321 813 bidi_it->level_stack[0].level = 1;
be39f003 814 else
b44d9321 815 bidi_it->level_stack[0].level = 0;
be39f003
EZ
816
817 bidi_line_init (bidi_it);
b7b65b15
EZ
818}
819
6bff6497
EZ
820/* Do whatever UAX#9 clause X8 says should be done at paragraph's
821 end. */
fd3998ff 822static INLINE void
b7b65b15
EZ
823bidi_set_paragraph_end (struct bidi_it *bidi_it)
824{
825 bidi_it->invalid_levels = 0;
826 bidi_it->invalid_rl_levels = -1;
827 bidi_it->stack_idx = 0;
828 bidi_it->resolved_level = bidi_it->level_stack[0].level;
b7b65b15
EZ
829}
830
182ce2d2 831/* Initialize the bidi iterator from buffer/string position CHARPOS. */
b7b65b15 832void
6bff6497 833bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, struct bidi_it *bidi_it)
b7b65b15
EZ
834{
835 if (! bidi_initialized)
836 bidi_initialize ();
89d3374a
EZ
837 bidi_it->charpos = charpos;
838 bidi_it->bytepos = bytepos;
182ce2d2 839 bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */
6bff6497
EZ
840 bidi_it->first_elt = 1;
841 bidi_set_paragraph_end (bidi_it);
842 bidi_it->new_paragraph = 1;
be39f003 843 bidi_it->separator_limit = -1;
b7b65b15 844 bidi_it->type = NEUTRAL_B;
ebb5722e
EZ
845 bidi_it->type_after_w1 = NEUTRAL_B;
846 bidi_it->orig_type = NEUTRAL_B;
b7b65b15 847 bidi_it->prev_was_pdf = 0;
ebb5722e
EZ
848 bidi_it->prev.type = bidi_it->prev.type_after_w1 =
849 bidi_it->prev.orig_type = UNKNOWN_BT;
89d3374a
EZ
850 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
851 bidi_it->last_strong.orig_type = UNKNOWN_BT;
b7b65b15
EZ
852 bidi_it->next_for_neutral.charpos = -1;
853 bidi_it->next_for_neutral.type =
89d3374a
EZ
854 bidi_it->next_for_neutral.type_after_w1 =
855 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15
EZ
856 bidi_it->prev_for_neutral.charpos = -1;
857 bidi_it->prev_for_neutral.type =
89d3374a
EZ
858 bidi_it->prev_for_neutral.type_after_w1 =
859 bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15 860 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
182ce2d2 861 bidi_it->disp_pos = -1; /* invalid/unknown */
2fe72643 862 bidi_cache_shrink ();
b7b65b15
EZ
863}
864
865/* Push the current embedding level and override status; reset the
866 current level to LEVEL and the current override status to OVERRIDE. */
fd3998ff 867static INLINE void
b7b65b15
EZ
868bidi_push_embedding_level (struct bidi_it *bidi_it,
869 int level, bidi_dir_t override)
870{
871 bidi_it->stack_idx++;
872 if (bidi_it->stack_idx >= BIDI_MAXLEVEL)
873 abort ();
874 bidi_it->level_stack[bidi_it->stack_idx].level = level;
875 bidi_it->level_stack[bidi_it->stack_idx].override = override;
876}
877
878/* Pop the embedding level and directional override status from the
879 stack, and return the new level. */
fd3998ff 880static INLINE int
b7b65b15
EZ
881bidi_pop_embedding_level (struct bidi_it *bidi_it)
882{
883 /* UAX#9 says to ignore invalid PDFs. */
884 if (bidi_it->stack_idx > 0)
885 bidi_it->stack_idx--;
886 return bidi_it->level_stack[bidi_it->stack_idx].level;
887}
888
889/* Record in SAVED_INFO the information about the current character. */
fd3998ff 890static INLINE void
b7b65b15
EZ
891bidi_remember_char (struct bidi_saved_info *saved_info,
892 struct bidi_it *bidi_it)
893{
894 saved_info->charpos = bidi_it->charpos;
895 saved_info->bytepos = bidi_it->bytepos;
896 saved_info->type = bidi_it->type;
2d6e4628 897 bidi_check_type (bidi_it->type);
89d3374a
EZ
898 saved_info->type_after_w1 = bidi_it->type_after_w1;
899 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 900 saved_info->orig_type = bidi_it->orig_type;
2d6e4628 901 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
902}
903
904/* Resolve the type of a neutral character according to the type of
905 surrounding strong text and the current embedding level. */
fd3998ff 906static INLINE bidi_type_t
b7b65b15
EZ
907bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
908{
909 /* N1: European and Arabic numbers are treated as though they were R. */
910 if (next_type == WEAK_EN || next_type == WEAK_AN)
911 next_type = STRONG_R;
912 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
913 prev_type = STRONG_R;
914
915 if (next_type == prev_type) /* N1 */
916 return next_type;
917 else if ((lev & 1) == 0) /* N2 */
918 return STRONG_L;
919 else
920 return STRONG_R;
921}
922
fd3998ff 923static INLINE int
182ce2d2 924bidi_explicit_dir_char (int ch)
b7b65b15 925{
182ce2d2
EZ
926 bidi_type_t ch_type;
927
928 if (!bidi_initialized)
929 abort ();
930 ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
931 return (ch_type == LRE || ch_type == LRO
932 || ch_type == RLE || ch_type == RLO
933 || ch_type == PDF);
b7b65b15
EZ
934}
935
936/* A helper function for bidi_resolve_explicit. It advances to the
937 next character in logical order and determines the new embedding
938 level and directional override, but does not take into account
939 empty embeddings. */
940static int
941bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
942{
943 int curchar;
944 bidi_type_t type;
945 int current_level;
946 int new_level;
947 bidi_dir_t override;
948
182ce2d2
EZ
949 /* If reseat()'ed, don't advance, so as to start iteration from the
950 position where we were reseated. bidi_it->bytepos can be less
951 than BEGV_BYTE after reseat to BEGV. */
952 if (bidi_it->bytepos < BEGV_BYTE
9c82e145 953 || bidi_it->first_elt)
e7402cb2 954 {
9c82e145
EZ
955 bidi_it->first_elt = 0;
956 if (bidi_it->charpos < BEGV)
957 bidi_it->charpos = BEGV;
958 bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
e7402cb2 959 }
9c82e145 960 else if (bidi_it->bytepos < ZV_BYTE) /* don't move at ZV */
b7b65b15 961 {
182ce2d2
EZ
962 /* Advance to the next character, skipping characters covered by
963 display strings (nchars > 1). */
102ebb00
EZ
964 if (bidi_it->nchars <= 0)
965 abort ();
182ce2d2 966 bidi_it->charpos += bidi_it->nchars;
e7402cb2
EZ
967 if (bidi_it->ch_len == 0)
968 abort ();
b7b65b15
EZ
969 bidi_it->bytepos += bidi_it->ch_len;
970 }
971
972 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
973 override = bidi_it->level_stack[bidi_it->stack_idx].override;
974 new_level = current_level;
975
e7402cb2
EZ
976 if (bidi_it->bytepos >= ZV_BYTE)
977 {
978 curchar = BIDI_EOB;
979 bidi_it->ch_len = 1;
182ce2d2
EZ
980 bidi_it->nchars = 1;
981 bidi_it->disp_pos = ZV_BYTE;
e7402cb2
EZ
982 }
983 else
984 {
182ce2d2
EZ
985 /* Fetch the character at BYTEPOS. If it is covered by a
986 display string, treat the entire run of covered characters as
987 a single character u+FFFC. */
102ebb00
EZ
988 curchar = bidi_fetch_char (bidi_it->bytepos, bidi_it->charpos,
989 &bidi_it->disp_pos,
990 &bidi_it->ch_len, &bidi_it->nchars);
e7402cb2 991 }
b7b65b15 992 bidi_it->ch = curchar;
b7b65b15 993
6bff6497
EZ
994 /* Don't apply directional override here, as all the types we handle
995 below will not be affected by the override anyway, and we need
996 the original type unaltered. The override will be applied in
997 bidi_resolve_weak. */
998 type = bidi_get_type (curchar, NEUTRAL_DIR);
89d3374a
EZ
999 bidi_it->orig_type = type;
1000 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
1001
1002 if (type != PDF)
1003 bidi_it->prev_was_pdf = 0;
1004
89d3374a 1005 bidi_it->type_after_w1 = UNKNOWN_BT;
b7b65b15
EZ
1006
1007 switch (type)
1008 {
1009 case RLE: /* X2 */
1010 case RLO: /* X4 */
89d3374a
EZ
1011 bidi_it->type_after_w1 = type;
1012 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1013 type = WEAK_BN; /* X9/Retaining */
1014 if (bidi_it->ignore_bn_limit <= 0)
1015 {
1016 if (current_level <= BIDI_MAXLEVEL - 4)
1017 {
1018 /* Compute the least odd embedding level greater than
1019 the current level. */
1020 new_level = ((current_level + 1) & ~1) + 1;
89d3374a 1021 if (bidi_it->type_after_w1 == RLE)
b7b65b15
EZ
1022 override = NEUTRAL_DIR;
1023 else
1024 override = R2L;
1025 if (current_level == BIDI_MAXLEVEL - 4)
1026 bidi_it->invalid_rl_levels = 0;
1027 bidi_push_embedding_level (bidi_it, new_level, override);
1028 }
1029 else
1030 {
1031 bidi_it->invalid_levels++;
1032 /* See the commentary about invalid_rl_levels below. */
1033 if (bidi_it->invalid_rl_levels < 0)
1034 bidi_it->invalid_rl_levels = 0;
1035 bidi_it->invalid_rl_levels++;
1036 }
1037 }
89d3374a 1038 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1039 || bidi_it->next_en_pos > bidi_it->charpos)
1040 type = WEAK_EN;
1041 break;
1042 case LRE: /* X3 */
1043 case LRO: /* X5 */
89d3374a
EZ
1044 bidi_it->type_after_w1 = type;
1045 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1046 type = WEAK_BN; /* X9/Retaining */
1047 if (bidi_it->ignore_bn_limit <= 0)
1048 {
1049 if (current_level <= BIDI_MAXLEVEL - 5)
1050 {
1051 /* Compute the least even embedding level greater than
1052 the current level. */
1053 new_level = ((current_level + 2) & ~1);
89d3374a 1054 if (bidi_it->type_after_w1 == LRE)
b7b65b15
EZ
1055 override = NEUTRAL_DIR;
1056 else
1057 override = L2R;
1058 bidi_push_embedding_level (bidi_it, new_level, override);
1059 }
1060 else
1061 {
1062 bidi_it->invalid_levels++;
1063 /* invalid_rl_levels counts invalid levels encountered
1064 while the embedding level was already too high for
1065 LRE/LRO, but not for RLE/RLO. That is because
1066 there may be exactly one PDF which we should not
1067 ignore even though invalid_levels is non-zero.
1068 invalid_rl_levels helps to know what PDF is
1069 that. */
1070 if (bidi_it->invalid_rl_levels >= 0)
1071 bidi_it->invalid_rl_levels++;
1072 }
1073 }
89d3374a 1074 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1075 || bidi_it->next_en_pos > bidi_it->charpos)
1076 type = WEAK_EN;
1077 break;
1078 case PDF: /* X7 */
89d3374a
EZ
1079 bidi_it->type_after_w1 = type;
1080 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1081 type = WEAK_BN; /* X9/Retaining */
1082 if (bidi_it->ignore_bn_limit <= 0)
1083 {
1084 if (!bidi_it->invalid_rl_levels)
1085 {
1086 new_level = bidi_pop_embedding_level (bidi_it);
1087 bidi_it->invalid_rl_levels = -1;
1088 if (bidi_it->invalid_levels)
1089 bidi_it->invalid_levels--;
1090 /* else nothing: UAX#9 says to ignore invalid PDFs */
1091 }
1092 if (!bidi_it->invalid_levels)
1093 new_level = bidi_pop_embedding_level (bidi_it);
1094 else
1095 {
1096 bidi_it->invalid_levels--;
1097 bidi_it->invalid_rl_levels--;
1098 }
1099 }
89d3374a 1100 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1101 || bidi_it->next_en_pos > bidi_it->charpos)
1102 type = WEAK_EN;
1103 break;
1104 default:
1105 /* Nothing. */
1106 break;
1107 }
1108
1109 bidi_it->type = type;
2d6e4628 1110 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1111
1112 return new_level;
1113}
1114
1115/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1116 in the buffer/string to the next character (in the logical order),
1117 resolve any explicit embeddings and directional overrides, and
1118 return the embedding level of the character after resolving
1119 explicit directives and ignoring empty embeddings. */
b7b65b15
EZ
1120static int
1121bidi_resolve_explicit (struct bidi_it *bidi_it)
1122{
1123 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1124 int new_level = bidi_resolve_explicit_1 (bidi_it);
1125
1126 if (prev_level < new_level
1127 && bidi_it->type == WEAK_BN
1128 && bidi_it->ignore_bn_limit == 0 /* only if not already known */
1502b819 1129 && bidi_it->bytepos < ZV_BYTE /* not already at EOB */
182ce2d2
EZ
1130 && bidi_explicit_dir_char (FETCH_MULTIBYTE_CHAR (bidi_it->bytepos
1131 + bidi_it->ch_len)))
b7b65b15
EZ
1132 {
1133 /* Avoid pushing and popping embedding levels if the level run
1134 is empty, as this breaks level runs where it shouldn't.
1135 UAX#9 removes all the explicit embedding and override codes,
1136 so empty embeddings disappear without a trace. We need to
1137 behave as if we did the same. */
1138 struct bidi_it saved_it;
1139 int level = prev_level;
1140
1141 bidi_copy_it (&saved_it, bidi_it);
1142
182ce2d2
EZ
1143 while (bidi_explicit_dir_char (FETCH_MULTIBYTE_CHAR (bidi_it->bytepos
1144 + bidi_it->ch_len)))
b7b65b15 1145 {
182ce2d2
EZ
1146 /* This advances to the next character, skipping any
1147 characters covered by display strings. */
b7b65b15
EZ
1148 level = bidi_resolve_explicit_1 (bidi_it);
1149 }
1150
102ebb00
EZ
1151 if (bidi_it->nchars <= 0)
1152 abort ();
b7b65b15 1153 if (level == prev_level) /* empty embedding */
182ce2d2 1154 saved_it.ignore_bn_limit = bidi_it->charpos + bidi_it->nchars;
b7b65b15
EZ
1155 else /* this embedding is non-empty */
1156 saved_it.ignore_bn_limit = -1;
1157
1158 bidi_copy_it (bidi_it, &saved_it);
1159 if (bidi_it->ignore_bn_limit > 0)
1160 {
1161 /* We pushed a level, but we shouldn't have. Undo that. */
1162 if (!bidi_it->invalid_rl_levels)
1163 {
1164 new_level = bidi_pop_embedding_level (bidi_it);
1165 bidi_it->invalid_rl_levels = -1;
1166 if (bidi_it->invalid_levels)
1167 bidi_it->invalid_levels--;
1168 }
1169 if (!bidi_it->invalid_levels)
1170 new_level = bidi_pop_embedding_level (bidi_it);
1171 else
1172 {
1173 bidi_it->invalid_levels--;
1174 bidi_it->invalid_rl_levels--;
1175 }
1176 }
1177 }
1178
b7b65b15
EZ
1179 if (bidi_it->type == NEUTRAL_B) /* X8 */
1180 {
21fce5ab 1181 bidi_set_paragraph_end (bidi_it);
6bff6497
EZ
1182 /* This is needed by bidi_resolve_weak below, and in L1. */
1183 bidi_it->type_after_w1 = bidi_it->type;
89d3374a 1184 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1185 }
1186
1187 return new_level;
1188}
1189
182ce2d2
EZ
1190/* Advance in the buffer/string, resolve weak types and return the
1191 type of the next character after weak type resolution. */
fd3998ff 1192static bidi_type_t
b7b65b15
EZ
1193bidi_resolve_weak (struct bidi_it *bidi_it)
1194{
1195 bidi_type_t type;
1196 bidi_dir_t override;
1197 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1198 int new_level = bidi_resolve_explicit (bidi_it);
1199 int next_char;
1200 bidi_type_t type_of_next;
1201 struct bidi_it saved_it;
1202
1203 type = bidi_it->type;
1204 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1205
1206 if (type == UNKNOWN_BT
1207 || type == LRE
1208 || type == LRO
1209 || type == RLE
1210 || type == RLO
1211 || type == PDF)
1212 abort ();
1213
1214 if (new_level != prev_level
1215 || bidi_it->type == NEUTRAL_B)
1216 {
1217 /* We've got a new embedding level run, compute the directional
1218 type of sor and initialize per-run variables (UAX#9, clause
1219 X10). */
1220 bidi_set_sor_type (bidi_it, prev_level, new_level);
1221 }
1222 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1223 || type == WEAK_BN || type == STRONG_AL)
89d3374a
EZ
1224 bidi_it->type_after_w1 = type; /* needed in L1 */
1225 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1226
1227 /* Level and directional override status are already recorded in
1228 bidi_it, and do not need any change; see X6. */
1229 if (override == R2L) /* X6 */
1230 type = STRONG_R;
1231 else if (override == L2R)
1232 type = STRONG_L;
bc5a45f3 1233 else
b7b65b15 1234 {
bc5a45f3 1235 if (type == WEAK_NSM) /* W1 */
b7b65b15 1236 {
bc5a45f3 1237 /* Note that we don't need to consider the case where the
5930fe97
EZ
1238 prev character has its type overridden by an RLO or LRO,
1239 because then either the type of this NSM would have been
1240 also overridden, or the previous character is outside the
1241 current level run, and thus not relevant to this NSM.
1242 This is why NSM gets the type_after_w1 of the previous
1243 character. */
ebb5722e
EZ
1244 if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1245 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1246 && bidi_it->prev.type_after_w1 != NEUTRAL_B)
5930fe97 1247 type = bidi_it->prev.type_after_w1;
bc5a45f3
EZ
1248 else if (bidi_it->sor == R2L)
1249 type = STRONG_R;
1250 else if (bidi_it->sor == L2R)
1251 type = STRONG_L;
1252 else /* shouldn't happen! */
1253 abort ();
b7b65b15 1254 }
bc5a45f3
EZ
1255 if (type == WEAK_EN /* W2 */
1256 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1257 type = WEAK_AN;
1258 else if (type == STRONG_AL) /* W3 */
1259 type = STRONG_R;
1260 else if ((type == WEAK_ES /* W4 */
1261 && bidi_it->prev.type_after_w1 == WEAK_EN
1262 && bidi_it->prev.orig_type == WEAK_EN)
1263 || (type == WEAK_CS
1264 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1265 && bidi_it->prev.orig_type == WEAK_EN)
1266 || bidi_it->prev.type_after_w1 == WEAK_AN)))
b7b65b15 1267 {
e7402cb2
EZ
1268 next_char =
1269 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
182ce2d2
EZ
1270 ? BIDI_EOB : FETCH_MULTIBYTE_CHAR (bidi_it->bytepos
1271 + bidi_it->ch_len);
6bff6497 1272 type_of_next = bidi_get_type (next_char, override);
b7b65b15 1273
bc5a45f3 1274 if (type_of_next == WEAK_BN
b7b65b15
EZ
1275 || bidi_explicit_dir_char (next_char))
1276 {
1277 bidi_copy_it (&saved_it, bidi_it);
1278 while (bidi_resolve_explicit (bidi_it) == new_level
bc5a45f3 1279 && bidi_it->type == WEAK_BN)
b7b65b15
EZ
1280 ;
1281 type_of_next = bidi_it->type;
b7b65b15
EZ
1282 bidi_copy_it (bidi_it, &saved_it);
1283 }
bc5a45f3
EZ
1284
1285 /* If the next character is EN, but the last strong-type
1286 character is AL, that next EN will be changed to AN when
1287 we process it in W2 above. So in that case, this ES
1288 should not be changed into EN. */
1289 if (type == WEAK_ES
1290 && type_of_next == WEAK_EN
1291 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1292 type = WEAK_EN;
1293 else if (type == WEAK_CS)
b7b65b15 1294 {
bc5a45f3
EZ
1295 if (bidi_it->prev.type_after_w1 == WEAK_AN
1296 && (type_of_next == WEAK_AN
1297 /* If the next character is EN, but the last
1298 strong-type character is AL, EN will be later
1299 changed to AN when we process it in W2 above.
1300 So in that case, this ES should not be
1301 changed into EN. */
1302 || (type_of_next == WEAK_EN
1303 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1304 type = WEAK_AN;
1305 else if (bidi_it->prev.type_after_w1 == WEAK_EN
1306 && type_of_next == WEAK_EN
1307 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1308 type = WEAK_EN;
1309 }
1310 }
1311 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1312 || type == WEAK_BN) /* W5/Retaining */
1313 {
1314 if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
1315 || bidi_it->next_en_pos > bidi_it->charpos)
1316 type = WEAK_EN;
1317 else /* W5: ET/BN with EN after it. */
1318 {
182ce2d2 1319 EMACS_INT en_pos = bidi_it->charpos + bidi_it->nchars;
bc5a45f3 1320
102ebb00
EZ
1321 if (bidi_it->nchars <= 0)
1322 abort ();
bc5a45f3
EZ
1323 next_char =
1324 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
182ce2d2
EZ
1325 ? BIDI_EOB : FETCH_MULTIBYTE_CHAR (bidi_it->bytepos
1326 + bidi_it->ch_len);
bc5a45f3
EZ
1327 type_of_next = bidi_get_type (next_char, override);
1328
1329 if (type_of_next == WEAK_ET
1330 || type_of_next == WEAK_BN
1331 || bidi_explicit_dir_char (next_char))
1332 {
1333 bidi_copy_it (&saved_it, bidi_it);
1334 while (bidi_resolve_explicit (bidi_it) == new_level
1335 && (bidi_it->type == WEAK_BN
1336 || bidi_it->type == WEAK_ET))
1337 ;
1338 type_of_next = bidi_it->type;
1339 en_pos = bidi_it->charpos;
1340 bidi_copy_it (bidi_it, &saved_it);
1341 }
1342 if (type_of_next == WEAK_EN)
b7b65b15 1343 {
bc5a45f3
EZ
1344 /* If the last strong character is AL, the EN we've
1345 found will become AN when we get to it (W2). */
1346 if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
1347 {
1348 type = WEAK_EN;
1349 /* Remember this EN position, to speed up processing
1350 of the next ETs. */
1351 bidi_it->next_en_pos = en_pos;
1352 }
1353 else if (type == WEAK_BN)
1354 type = NEUTRAL_ON; /* W6/Retaining */
b7b65b15 1355 }
b7b65b15
EZ
1356 }
1357 }
1358 }
1359
1360 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
89d3374a
EZ
1361 || (type == WEAK_BN
1362 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1363 || bidi_it->prev.type_after_w1 == WEAK_ES
1364 || bidi_it->prev.type_after_w1 == WEAK_ET)))
b7b65b15
EZ
1365 type = NEUTRAL_ON;
1366
1367 /* Store the type we've got so far, before we clobber it with strong
1368 types in W7 and while resolving neutral types. But leave alone
1369 the original types that were recorded above, because we will need
1370 them for the L1 clause. */
89d3374a
EZ
1371 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1372 bidi_it->type_after_w1 = type;
1373 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1374
1375 if (type == WEAK_EN) /* W7 */
1376 {
89d3374a 1377 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
b7b65b15
EZ
1378 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1379 type = STRONG_L;
1380 }
1381
1382 bidi_it->type = type;
2d6e4628 1383 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1384 return type;
1385}
1386
fd3998ff 1387static bidi_type_t
b7b65b15
EZ
1388bidi_resolve_neutral (struct bidi_it *bidi_it)
1389{
1390 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1391 bidi_type_t type = bidi_resolve_weak (bidi_it);
1392 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1393
1394 if (!(type == STRONG_R
1395 || type == STRONG_L
1396 || type == WEAK_BN
1397 || type == WEAK_EN
1398 || type == WEAK_AN
1399 || type == NEUTRAL_B
1400 || type == NEUTRAL_S
1401 || type == NEUTRAL_WS
1402 || type == NEUTRAL_ON))
1403 abort ();
1404
1405 if (bidi_get_category (type) == NEUTRAL
1406 || (type == WEAK_BN && prev_level == current_level))
1407 {
1408 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1409 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1410 bidi_it->next_for_neutral.type,
1411 current_level);
1412 else
1413 {
1414 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1415 the assumption of batch-style processing; see clauses W4,
1416 W5, and especially N1, which require to look far forward
182ce2d2
EZ
1417 (as well as back) in the buffer/string. May the fleas of
1418 a thousand camels infest the armpits of those who design
b7b65b15
EZ
1419 supposedly general-purpose algorithms by looking at their
1420 own implementations, and fail to consider other possible
1421 implementations! */
1422 struct bidi_it saved_it;
1423 bidi_type_t next_type;
1424
1425 if (bidi_it->scan_dir == -1)
1426 abort ();
1427
1428 bidi_copy_it (&saved_it, bidi_it);
1429 /* Scan the text forward until we find the first non-neutral
1430 character, and then use that to resolve the neutral we
1431 are dealing with now. We also cache the scanned iterator
1432 states, to salvage some of the effort later. */
1433 bidi_cache_iterator_state (bidi_it, 0);
1434 do {
1435 /* Record the info about the previous character, so that
1436 it will be cached below with this state. */
89d3374a 1437 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1438 && bidi_it->type != WEAK_BN)
1439 bidi_remember_char (&bidi_it->prev, bidi_it);
1440 type = bidi_resolve_weak (bidi_it);
1441 /* Paragraph separators have their levels fully resolved
1442 at this point, so cache them as resolved. */
1443 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1444 /* FIXME: implement L1 here, by testing for a newline and
1445 resetting the level for any sequence of whitespace
1446 characters adjacent to it. */
1447 } while (!(type == NEUTRAL_B
1448 || (type != WEAK_BN
1449 && bidi_get_category (type) != NEUTRAL)
1450 /* This is all per level run, so stop when we
1451 reach the end of this level run. */
1452 || bidi_it->level_stack[bidi_it->stack_idx].level !=
1453 current_level));
1454
1455 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1456
1457 switch (type)
1458 {
1459 case STRONG_L:
1460 case STRONG_R:
1461 case STRONG_AL:
1462 next_type = type;
1463 break;
1464 case WEAK_EN:
1465 case WEAK_AN:
1466 /* N1: ``European and Arabic numbers are treated as
1467 though they were R.'' */
1468 next_type = STRONG_R;
1469 saved_it.next_for_neutral.type = STRONG_R;
1470 break;
1471 case WEAK_BN:
1472 if (!bidi_explicit_dir_char (bidi_it->ch))
1473 abort (); /* can't happen: BNs are skipped */
1474 /* FALLTHROUGH */
1475 case NEUTRAL_B:
1476 /* Marched all the way to the end of this level run.
1477 We need to use the eor type, whose information is
1478 stored by bidi_set_sor_type in the prev_for_neutral
1479 member. */
1480 if (saved_it.type != WEAK_BN
89d3374a 1481 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
b7b65b15
EZ
1482 {
1483 next_type = bidi_it->prev_for_neutral.type;
1484 saved_it.next_for_neutral.type = next_type;
2d6e4628 1485 bidi_check_type (next_type);
b7b65b15
EZ
1486 }
1487 else
1488 {
1489 /* This is a BN which does not adjoin neutrals.
1490 Leave its type alone. */
1491 bidi_copy_it (bidi_it, &saved_it);
1492 return bidi_it->type;
1493 }
1494 break;
1495 default:
1496 abort ();
1497 }
1498 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1499 next_type, current_level);
1500 saved_it.type = type;
2d6e4628 1501 bidi_check_type (type);
b7b65b15
EZ
1502 bidi_copy_it (bidi_it, &saved_it);
1503 }
1504 }
1505 return type;
1506}
1507
1508/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1509 in the buffer/string to the next character (in the logical order),
1510 resolve the bidi type of that next character, and return that
1511 type. */
fd3998ff 1512static bidi_type_t
b7b65b15
EZ
1513bidi_type_of_next_char (struct bidi_it *bidi_it)
1514{
1515 bidi_type_t type;
1516
1517 /* This should always be called during a forward scan. */
1518 if (bidi_it->scan_dir != 1)
1519 abort ();
1520
1521 /* Reset the limit until which to ignore BNs if we step out of the
1522 area where we found only empty levels. */
1523 if ((bidi_it->ignore_bn_limit > 0
1524 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
1525 || (bidi_it->ignore_bn_limit == -1
1526 && !bidi_explicit_dir_char (bidi_it->ch)))
1527 bidi_it->ignore_bn_limit = 0;
1528
1529 type = bidi_resolve_neutral (bidi_it);
1530
1531 return type;
1532}
1533
1534/* Given an iterator state BIDI_IT, advance one character position in
182ce2d2
EZ
1535 the buffer/string to the next character (in the current scan
1536 direction), resolve the embedding and implicit levels of that next
1537 character, and return the resulting level. */
fd3998ff 1538static int
b7b65b15
EZ
1539bidi_level_of_next_char (struct bidi_it *bidi_it)
1540{
1541 bidi_type_t type;
1542 int level, prev_level = -1;
1543 struct bidi_saved_info next_for_neutral;
182ce2d2 1544 EMACS_INT next_char_pos;
b7b65b15
EZ
1545
1546 if (bidi_it->scan_dir == 1)
1547 {
1548 /* There's no sense in trying to advance if we hit end of text. */
1502b819 1549 if (bidi_it->bytepos >= ZV_BYTE)
b7b65b15
EZ
1550 return bidi_it->resolved_level;
1551
1552 /* Record the info about the previous character. */
89d3374a 1553 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1554 && bidi_it->type != WEAK_BN)
1555 bidi_remember_char (&bidi_it->prev, bidi_it);
89d3374a
EZ
1556 if (bidi_it->type_after_w1 == STRONG_R
1557 || bidi_it->type_after_w1 == STRONG_L
1558 || bidi_it->type_after_w1 == STRONG_AL)
b7b65b15
EZ
1559 bidi_remember_char (&bidi_it->last_strong, bidi_it);
1560 /* FIXME: it sounds like we don't need both prev and
1561 prev_for_neutral members, but I'm leaving them both for now. */
1562 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1563 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1564 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1565
1566 /* If we overstepped the characters used for resolving neutrals
1567 and whitespace, invalidate their info in the iterator. */
1568 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1569 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1570 if (bidi_it->next_en_pos >= 0
1571 && bidi_it->charpos >= bidi_it->next_en_pos)
1572 bidi_it->next_en_pos = -1;
1573 if (bidi_it->next_for_ws.type != UNKNOWN_BT
1574 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1575 bidi_it->next_for_ws.type = UNKNOWN_BT;
1576
1577 /* This must be taken before we fill the iterator with the info
1578 about the next char. If we scan backwards, the iterator
1579 state must be already cached, so there's no need to know the
1580 embedding level of the previous character, since we will be
1581 returning to our caller shortly. */
1582 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1583 }
1584 next_for_neutral = bidi_it->next_for_neutral;
1585
182ce2d2
EZ
1586 /* Perhaps the character we want is already cached. If it is, the
1587 call to bidi_cache_find below will return a type other than
1588 UNKNOWN_BT. */
102ebb00
EZ
1589 if (bidi_cache_idx && !bidi_it->first_elt)
1590 {
1591 if (bidi_it->scan_dir > 0)
1592 {
1593 if (bidi_it->nchars <= 0)
1594 abort ();
1595 next_char_pos = bidi_it->charpos + bidi_it->nchars;
1596 }
1597 else
1598 next_char_pos = bidi_it->charpos - 1;
1599 type = bidi_cache_find (next_char_pos, -1, bidi_it);
1600 }
182ce2d2 1601 else
102ebb00 1602 type = UNKNOWN_BT;
b7b65b15
EZ
1603 if (type != UNKNOWN_BT)
1604 {
1605 /* Don't lose the information for resolving neutrals! The
1606 cached states could have been cached before their
1607 next_for_neutral member was computed. If we are on our way
1608 forward, we can simply take the info from the previous
1609 state. */
1610 if (bidi_it->scan_dir == 1
1611 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
1612 bidi_it->next_for_neutral = next_for_neutral;
1613
1614 /* If resolved_level is -1, it means this state was cached
1615 before it was completely resolved, so we cannot return
1616 it. */
1617 if (bidi_it->resolved_level != -1)
1618 return bidi_it->resolved_level;
1619 }
1620 if (bidi_it->scan_dir == -1)
1621 /* If we are going backwards, the iterator state is already cached
1622 from previous scans, and should be fully resolved. */
1623 abort ();
1624
1625 if (type == UNKNOWN_BT)
1626 type = bidi_type_of_next_char (bidi_it);
1627
1628 if (type == NEUTRAL_B)
1629 return bidi_it->resolved_level;
1630
1631 level = bidi_it->level_stack[bidi_it->stack_idx].level;
1632 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
1633 || (type == WEAK_BN && prev_level == level))
1634 {
1635 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1636 abort ();
1637
1638 /* If the cached state shows a neutral character, it was not
1639 resolved by bidi_resolve_neutral, so do it now. */
1640 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1641 bidi_it->next_for_neutral.type,
1642 level);
1643 }
1644
1645 if (!(type == STRONG_R
1646 || type == STRONG_L
1647 || type == WEAK_BN
1648 || type == WEAK_EN
1649 || type == WEAK_AN))
1650 abort ();
1651 bidi_it->type = type;
2d6e4628 1652 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1653
1654 /* For L1 below, we need to know, for each WS character, whether
0d327994 1655 it belongs to a sequence of WS characters preceding a newline
b7b65b15 1656 or a TAB or a paragraph separator. */
89d3374a 1657 if (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1658 && bidi_it->next_for_ws.type == UNKNOWN_BT)
1659 {
1660 int ch;
1661 int clen = bidi_it->ch_len;
6bff6497
EZ
1662 EMACS_INT bpos = bidi_it->bytepos;
1663 EMACS_INT cpos = bidi_it->charpos;
182ce2d2 1664 EMACS_INT disp_pos = bidi_it->disp_pos;
102ebb00 1665 EMACS_INT nc = bidi_it->nchars;
b7b65b15
EZ
1666 bidi_type_t chtype;
1667
102ebb00
EZ
1668 if (bidi_it->nchars <= 0)
1669 abort ();
b7b65b15 1670 do {
102ebb00 1671 ch = bidi_fetch_char (bpos += clen, cpos += nc, &disp_pos, &clen, &nc);
e7402cb2 1672 if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
b7b65b15
EZ
1673 chtype = NEUTRAL_B;
1674 else
6bff6497 1675 chtype = bidi_get_type (ch, NEUTRAL_DIR);
b7b65b15
EZ
1676 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
1677 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
1678 bidi_it->next_for_ws.type = chtype;
2d6e4628 1679 bidi_check_type (bidi_it->next_for_ws.type);
b7b65b15
EZ
1680 bidi_it->next_for_ws.charpos = cpos;
1681 bidi_it->next_for_ws.bytepos = bpos;
1682 }
1683
1684 /* Resolve implicit levels, with a twist: PDFs get the embedding
1685 level of the enbedding they terminate. See below for the
1686 reason. */
89d3374a 1687 if (bidi_it->orig_type == PDF
b7b65b15
EZ
1688 /* Don't do this if this formatting code didn't change the
1689 embedding level due to invalid or empty embeddings. */
1690 && prev_level != level)
1691 {
1692 /* Don't look in UAX#9 for the reason for this: it's our own
1693 private quirk. The reason is that we want the formatting
1694 codes to be delivered so that they bracket the text of their
1695 embedding. For example, given the text
1696
1697 {RLO}teST{PDF}
1698
1699 we want it to be displayed as
1700
0d68907d 1701 {PDF}STet{RLO}
b7b65b15
EZ
1702
1703 not as
1704
1705 STet{RLO}{PDF}
1706
1707 which will result because we bump up the embedding level as
1708 soon as we see the RLO and pop it as soon as we see the PDF,
1709 so RLO itself has the same embedding level as "teST", and
1710 thus would be normally delivered last, just before the PDF.
1711 The switch below fiddles with the level of PDF so that this
1712 ugly side effect does not happen.
1713
1714 (This is, of course, only important if the formatting codes
e7402cb2
EZ
1715 are actually displayed, but Emacs does need to display them
1716 if the user wants to.) */
b7b65b15
EZ
1717 level = prev_level;
1718 }
89d3374a
EZ
1719 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
1720 || bidi_it->orig_type == NEUTRAL_S
e7402cb2
EZ
1721 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
1722 /* || bidi_it->ch == LINESEP_CHAR */
89d3374a 1723 || (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1724 && (bidi_it->next_for_ws.type == NEUTRAL_B
1725 || bidi_it->next_for_ws.type == NEUTRAL_S)))
1726 level = bidi_it->level_stack[0].level;
1727 else if ((level & 1) == 0) /* I1 */
1728 {
1729 if (type == STRONG_R)
1730 level++;
1731 else if (type == WEAK_EN || type == WEAK_AN)
1732 level += 2;
1733 }
1734 else /* I2 */
1735 {
1736 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
1737 level++;
1738 }
1739
1740 bidi_it->resolved_level = level;
1741 return level;
1742}
1743
1744/* Move to the other edge of a level given by LEVEL. If END_FLAG is
1745 non-zero, we are at the end of a level, and we need to prepare to
1746 resume the scan of the lower level.
1747
1748 If this level's other edge is cached, we simply jump to it, filling
1749 the iterator structure with the iterator state on the other edge.
182ce2d2
EZ
1750 Otherwise, we walk the buffer or string until we come back to the
1751 same level as LEVEL.
b7b65b15
EZ
1752
1753 Note: we are not talking here about a ``level run'' in the UAX#9
1754 sense of the term, but rather about a ``level'' which includes
1755 all the levels higher than it. In other words, given the levels
1756 like this:
1757
1758 11111112222222333333334443343222222111111112223322111
1759 A B C
1760
1761 and assuming we are at point A scanning left to right, this
1762 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
1763 at point B. */
1764static void
1765bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
1766{
1767 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
1768 int idx;
1769
1770 /* Try the cache first. */
1771 if ((idx = bidi_cache_find_level_change (level, dir, end_flag)) >= 0)
1772 bidi_cache_fetch_state (idx, bidi_it);
1773 else
1774 {
1775 int new_level;
1776
1777 if (end_flag)
1778 abort (); /* if we are at end of level, its edges must be cached */
1779
1780 bidi_cache_iterator_state (bidi_it, 1);
1781 do {
1782 new_level = bidi_level_of_next_char (bidi_it);
1783 bidi_cache_iterator_state (bidi_it, 1);
1784 } while (new_level >= level);
1785 }
1786}
1787
1788void
4b292a22 1789bidi_move_to_visually_next (struct bidi_it *bidi_it)
b7b65b15
EZ
1790{
1791 int old_level, new_level, next_level;
9c82e145 1792 struct bidi_it sentinel;
b7b65b15
EZ
1793
1794 if (bidi_it->scan_dir == 0)
1795 {
1796 bidi_it->scan_dir = 1; /* default to logical order */
1797 }
1798
be39f003
EZ
1799 /* If we just passed a newline, initialize for the next line. */
1800 if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
1801 bidi_line_init (bidi_it);
1802
6dcfd253
EZ
1803 /* Prepare the sentinel iterator state, and cache it. When we bump
1804 into it, scanning backwards, we'll know that the last non-base
1805 level is exhausted. */
b7b65b15 1806 if (bidi_cache_idx == 0)
9c82e145
EZ
1807 {
1808 bidi_copy_it (&sentinel, bidi_it);
1809 if (bidi_it->first_elt)
1810 {
1811 sentinel.charpos--; /* cached charpos needs to be monotonic */
1812 sentinel.bytepos--;
1813 sentinel.ch = '\n'; /* doesn't matter, but why not? */
1814 sentinel.ch_len = 1;
182ce2d2 1815 sentinel.nchars = 1;
9c82e145 1816 }
6dcfd253 1817 bidi_cache_iterator_state (&sentinel, 1);
9c82e145 1818 }
b7b65b15
EZ
1819
1820 old_level = bidi_it->resolved_level;
1821 new_level = bidi_level_of_next_char (bidi_it);
b7b65b15
EZ
1822
1823 /* Reordering of resolved levels (clause L2) is implemented by
1824 jumping to the other edge of the level and flipping direction of
c0546589 1825 scanning the text whenever we find a level change. */
b7b65b15
EZ
1826 if (new_level != old_level)
1827 {
1828 int ascending = new_level > old_level;
1829 int level_to_search = ascending ? old_level + 1 : old_level;
1830 int incr = ascending ? 1 : -1;
1831 int expected_next_level = old_level + incr;
1832
b7b65b15
EZ
1833 /* Jump (or walk) to the other edge of this level. */
1834 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1835 /* Switch scan direction and peek at the next character in the
1836 new direction. */
1837 bidi_it->scan_dir = -bidi_it->scan_dir;
1838
1839 /* The following loop handles the case where the resolved level
1840 jumps by more than one. This is typical for numbers inside a
1841 run of text with left-to-right embedding direction, but can
1842 also happen in other situations. In those cases the decision
1843 where to continue after a level change, and in what direction,
1844 is tricky. For example, given a text like below:
1845
1846 abcdefgh
1847 11336622
1848
1849 (where the numbers below the text show the resolved levels),
1850 the result of reordering according to UAX#9 should be this:
1851
1852 efdcghba
1853
1854 This is implemented by the loop below which flips direction
1855 and jumps to the other edge of the level each time it finds
1856 the new level not to be the expected one. The expected level
1857 is always one more or one less than the previous one. */
1858 next_level = bidi_peek_at_next_level (bidi_it);
1859 while (next_level != expected_next_level)
1860 {
1861 expected_next_level += incr;
1862 level_to_search += incr;
1863 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1864 bidi_it->scan_dir = -bidi_it->scan_dir;
1865 next_level = bidi_peek_at_next_level (bidi_it);
1866 }
1867
1868 /* Finally, deliver the next character in the new direction. */
1869 next_level = bidi_level_of_next_char (bidi_it);
1870 }
1871
b44d9321
EZ
1872 /* Take note when we have just processed the newline that precedes
1873 the end of the paragraph. The next time we are about to be
1874 called, set_iterator_to_next will automatically reinit the
1875 paragraph direction, if needed. We do this at the newline before
1876 the paragraph separator, because the next character might not be
1877 the first character of the next paragraph, due to the bidi
c0546589
EZ
1878 reordering, whereas we _must_ know the paragraph base direction
1879 _before_ we process the paragraph's text, since the base
1880 direction affects the reordering. */
6bff6497 1881 if (bidi_it->scan_dir == 1
be39f003
EZ
1882 && bidi_it->orig_type == NEUTRAL_B
1883 && bidi_it->bytepos < ZV_BYTE)
1884 {
1885 EMACS_INT sep_len =
182ce2d2 1886 bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
be39f003 1887 bidi_it->bytepos + bidi_it->ch_len);
102ebb00
EZ
1888 if (bidi_it->nchars <= 0)
1889 abort ();
be39f003
EZ
1890 if (sep_len >= 0)
1891 {
1892 bidi_it->new_paragraph = 1;
b44d9321
EZ
1893 /* Record the buffer position of the last character of the
1894 paragraph separator. */
182ce2d2
EZ
1895 bidi_it->separator_limit =
1896 bidi_it->charpos + bidi_it->nchars + sep_len;
be39f003
EZ
1897 }
1898 }
6bff6497 1899
b7b65b15
EZ
1900 if (bidi_it->scan_dir == 1 && bidi_cache_idx)
1901 {
1902 /* If we are at paragraph's base embedding level and beyond the
1903 last cached position, the cache's job is done and we can
1904 discard it. */
1905 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
182ce2d2
EZ
1906 && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
1907 + bidi_cache[bidi_cache_idx - 1].nchars - 1))
b7b65b15
EZ
1908 bidi_cache_reset ();
1909 /* But as long as we are caching during forward scan, we must
1910 cache each state, or else the cache integrity will be
1911 compromised: it assumes cached states correspond to buffer
1912 positions 1:1. */
1913 else
1914 bidi_cache_iterator_state (bidi_it, 1);
1915 }
1916}
1917
1918/* This is meant to be called from within the debugger, whenever you
1919 wish to examine the cache contents. */
e78aecca 1920void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
b7b65b15
EZ
1921void
1922bidi_dump_cached_states (void)
1923{
1924 int i;
1925 int ndigits = 1;
1926
1927 if (bidi_cache_idx == 0)
1928 {
1929 fprintf (stderr, "The cache is empty.\n");
1930 return;
1931 }
1932 fprintf (stderr, "Total of %d state%s in cache:\n",
1933 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
1934
1935 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
1936 ndigits++;
1937 fputs ("ch ", stderr);
1938 for (i = 0; i < bidi_cache_idx; i++)
1939 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
1940 fputs ("\n", stderr);
1941 fputs ("lvl ", stderr);
1942 for (i = 0; i < bidi_cache_idx; i++)
1943 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
1944 fputs ("\n", stderr);
1945 fputs ("pos ", stderr);
1946 for (i = 0; i < bidi_cache_idx; i++)
c2982e87 1947 fprintf (stderr, "%*"pI"d", ndigits, bidi_cache[i].charpos);
b7b65b15
EZ
1948 fputs ("\n", stderr);
1949}