Tested with buffer reordering, fixed one crash.
[bpt/emacs.git] / src / bidi.c
CommitLineData
b7b65b15 1/* Low-level bidirectional buffer-scanning functions for GNU Emacs.
73b0cd50 2 Copyright (C) 2000-2001, 2004-2005, 2009-2011
b118e65f 3 Free Software Foundation, Inc.
b7b65b15
EZ
4
5This file is part of GNU Emacs.
6
a8d11bd3 7GNU Emacs is free software: you can redistribute it and/or modify
b7b65b15 8it under the terms of the GNU General Public License as published by
a8d11bd3
EZ
9the Free Software Foundation, either version 3 of the License, or
10(at your option) any later version.
b7b65b15
EZ
11
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
b7b65b15 17You should have received a copy of the GNU General Public License
a8d11bd3 18along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
b7b65b15 19
2d6e4628
EZ
20/* Written by Eli Zaretskii <eliz@gnu.org>.
21
22 A sequential implementation of the Unicode Bidirectional algorithm,
b7b65b15
EZ
23 as per UAX#9, a part of the Unicode Standard.
24
25 Unlike the reference and most other implementations, this one is
940afb59
EZ
26 designed to be called once for every character in the buffer or
27 string.
b7b65b15 28
4b292a22 29 The main entry point is bidi_move_to_visually_next. Each time it
b7b65b15
EZ
30 is called, it finds the next character in the visual order, and
31 returns its information in a special structure. The caller is then
32 expected to process this character for display or any other
4b292a22
EZ
33 purposes, and call bidi_move_to_visually_next for the next
34 character. See the comments in bidi_move_to_visually_next for more
35 details about its algorithm that finds the next visual-order
b7b65b15
EZ
36 character by resolving their levels on the fly.
37
940afb59
EZ
38 The two other entry points are bidi_paragraph_init and
39 bidi_mirror_char. The first determines the base direction of a
40 paragraph, while the second returns the mirrored version of its
41 argument character.
42
89d3374a
EZ
43 If you want to understand the code, you will have to read it
44 together with the relevant portions of UAX#9. The comments include
45 references to UAX#9 rules, for that very reason.
46
b7b65b15
EZ
47 A note about references to UAX#9 rules: if the reference says
48 something like "X9/Retaining", it means that you need to refer to
49 rule X9 and to its modifications decribed in the "Implementation
50 Notes" section of UAX#9, under "Retaining Format Codes". */
51
b7b65b15 52#include <config.h>
b7b65b15 53#include <stdio.h>
29e3d8d1
EZ
54#include <setjmp.h>
55
b7b65b15
EZ
56#include "lisp.h"
57#include "buffer.h"
58#include "character.h"
59#include "dispextern.h"
60
61static int bidi_initialized = 0;
62
cbc4fd20 63static Lisp_Object bidi_type_table, bidi_mirror_table;
b7b65b15
EZ
64
65#define LRM_CHAR 0x200E
66#define RLM_CHAR 0x200F
b7b65b15 67#define BIDI_EOB -1
b7b65b15 68
b7b65b15
EZ
69/* Data type for describing the bidirectional character categories. */
70typedef enum {
71 UNKNOWN_BC,
72 NEUTRAL,
73 WEAK,
74 STRONG
75} bidi_category_t;
76
e78aecca 77extern int bidi_ignore_explicit_marks_for_paragraph_level EXTERNALLY_VISIBLE;
b7b65b15
EZ
78int bidi_ignore_explicit_marks_for_paragraph_level = 1;
79
372b7a95 80static Lisp_Object paragraph_start_re, paragraph_separate_re;
6bff6497 81static Lisp_Object Qparagraph_start, Qparagraph_separate;
b7b65b15 82
b7b65b15 83static void
971de7fb 84bidi_initialize (void)
b7b65b15 85{
317fbf33
EZ
86
87#include "biditype.h"
cbc4fd20 88#include "bidimirror.h"
317fbf33 89
b7b65b15
EZ
90 int i;
91
92 bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L));
2d6e4628 93 staticpro (&bidi_type_table);
b7b65b15
EZ
94
95 for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++)
317fbf33 96 char_table_set_range (bidi_type_table, bidi_type[i].from, bidi_type[i].to,
b7b65b15 97 make_number (bidi_type[i].type));
6bff6497 98
cbc4fd20
EZ
99 bidi_mirror_table = Fmake_char_table (Qnil, Qnil);
100 staticpro (&bidi_mirror_table);
101
102 for (i = 0; i < sizeof bidi_mirror / sizeof bidi_mirror[0]; i++)
103 char_table_set (bidi_mirror_table, bidi_mirror[i].from,
104 make_number (bidi_mirror[i].to));
105
b4bf28b7
SM
106 Qparagraph_start = intern ("paragraph-start");
107 staticpro (&Qparagraph_start);
372b7a95
EZ
108 paragraph_start_re = Fsymbol_value (Qparagraph_start);
109 if (!STRINGP (paragraph_start_re))
110 paragraph_start_re = build_string ("\f\\|[ \t]*$");
111 staticpro (&paragraph_start_re);
b4bf28b7
SM
112 Qparagraph_separate = intern ("paragraph-separate");
113 staticpro (&Qparagraph_separate);
372b7a95
EZ
114 paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
115 if (!STRINGP (paragraph_separate_re))
116 paragraph_separate_re = build_string ("[ \t\f]*$");
117 staticpro (&paragraph_separate_re);
b7b65b15
EZ
118 bidi_initialized = 1;
119}
120
6bff6497
EZ
121/* Return the bidi type of a character CH, subject to the current
122 directional OVERRIDE. */
fd3998ff 123static INLINE bidi_type_t
6bff6497 124bidi_get_type (int ch, bidi_dir_t override)
b7b65b15 125{
6bff6497
EZ
126 bidi_type_t default_type;
127
e7402cb2
EZ
128 if (ch == BIDI_EOB)
129 return NEUTRAL_B;
e342a24d
EZ
130 if (ch < 0 || ch > MAX_CHAR)
131 abort ();
6bff6497
EZ
132
133 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
134
135 if (override == NEUTRAL_DIR)
136 return default_type;
137
138 switch (default_type)
139 {
140 /* Although UAX#9 does not tell, it doesn't make sense to
141 override NEUTRAL_B and LRM/RLM characters. */
142 case NEUTRAL_B:
143 case LRE:
144 case LRO:
145 case RLE:
146 case RLO:
147 case PDF:
148 return default_type;
149 default:
150 switch (ch)
151 {
152 case LRM_CHAR:
153 case RLM_CHAR:
154 return default_type;
155 default:
156 if (override == L2R) /* X6 */
157 return STRONG_L;
158 else if (override == R2L)
159 return STRONG_R;
160 else
161 abort (); /* can't happen: handled above */
162 }
163 }
b7b65b15
EZ
164}
165
7d3b3862 166static void
2d6e4628
EZ
167bidi_check_type (bidi_type_t type)
168{
169 if (type < UNKNOWN_BT || type > NEUTRAL_ON)
170 abort ();
171}
172
b7b65b15 173/* Given a bidi TYPE of a character, return its category. */
fd3998ff 174static INLINE bidi_category_t
b7b65b15
EZ
175bidi_get_category (bidi_type_t type)
176{
177 switch (type)
178 {
179 case UNKNOWN_BT:
180 return UNKNOWN_BC;
181 case STRONG_L:
182 case STRONG_R:
183 case STRONG_AL:
184 case LRE:
185 case LRO:
186 case RLE:
187 case RLO:
188 return STRONG;
189 case PDF: /* ??? really?? */
190 case WEAK_EN:
191 case WEAK_ES:
192 case WEAK_ET:
193 case WEAK_AN:
194 case WEAK_CS:
195 case WEAK_NSM:
196 case WEAK_BN:
197 return WEAK;
198 case NEUTRAL_B:
199 case NEUTRAL_S:
200 case NEUTRAL_WS:
201 case NEUTRAL_ON:
202 return NEUTRAL;
203 default:
204 abort ();
205 }
206}
207
a6e8d97c
EZ
208/* Return the mirrored character of C, if it has one. If C has no
209 mirrored counterpart, return C.
210 Note: The conditions in UAX#9 clause L4 regarding the surrounding
211 context must be tested by the caller. */
b7b65b15
EZ
212int
213bidi_mirror_char (int c)
214{
cbc4fd20
EZ
215 Lisp_Object val;
216
217 if (c == BIDI_EOB)
218 return c;
219 if (c < 0 || c > MAX_CHAR)
220 abort ();
b7b65b15 221
cbc4fd20
EZ
222 val = CHAR_TABLE_REF (bidi_mirror_table, c);
223 if (INTEGERP (val))
b7b65b15 224 {
cbc4fd20 225 int v = XINT (val);
b7b65b15 226
cbc4fd20
EZ
227 if (v < 0 || v > MAX_CHAR)
228 abort ();
229
230 return v;
b7b65b15 231 }
cbc4fd20 232
b7b65b15
EZ
233 return c;
234}
235
236/* Copy the bidi iterator from FROM to TO. To save cycles, this only
237 copies the part of the level stack that is actually in use. */
fd3998ff 238static INLINE void
b7b65b15
EZ
239bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
240{
241 int i;
242
e69a9370 243 /* Copy everything except the level stack and beyond. */
182ce2d2 244 memcpy (to, from, offsetof (struct bidi_it, level_stack[0]));
b7b65b15
EZ
245
246 /* Copy the active part of the level stack. */
247 to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
248 for (i = 1; i <= from->stack_idx; i++)
249 to->level_stack[i] = from->level_stack[i];
250}
251
252/* Caching the bidi iterator states. */
253
2fe72643
EZ
254#define BIDI_CACHE_CHUNK 200
255static struct bidi_it *bidi_cache;
256static size_t bidi_cache_size = 0;
0416466c 257static size_t elsz = sizeof (struct bidi_it);
87e67904
EZ
258static EMACS_INT bidi_cache_idx; /* next unused cache slot */
259static EMACS_INT bidi_cache_last_idx; /* slot of last cache hit */
260static EMACS_INT bidi_cache_start = 0; /* start of cache for this
261 "stack" level */
262
263/* Reset the cache state to the empty state. We only reset the part
264 of the cache relevant to iteration of the current object. Previous
265 objects, which are pushed on the display iterator's stack, are left
266 intact. This is called when the cached information is no more
267 useful for the current iteration, e.g. when we were reseated to a
268 new position on the same object. */
fd3998ff 269static INLINE void
b7b65b15
EZ
270bidi_cache_reset (void)
271{
87e67904 272 bidi_cache_idx = bidi_cache_start;
b7b65b15
EZ
273 bidi_cache_last_idx = -1;
274}
275
87e67904
EZ
276/* Shrink the cache to its minimal size. Called when we init the bidi
277 iterator for reordering a buffer or a string that does not come
278 from display properties, because that means all the previously
279 cached info is of no further use. */
2fe72643
EZ
280static INLINE void
281bidi_cache_shrink (void)
282{
283 if (bidi_cache_size > BIDI_CACHE_CHUNK)
284 {
0416466c
EZ
285 bidi_cache_size = BIDI_CACHE_CHUNK;
286 bidi_cache =
287 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
2fe72643
EZ
288 }
289 bidi_cache_reset ();
290}
291
fd3998ff 292static INLINE void
b7b65b15
EZ
293bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it)
294{
295 int current_scan_dir = bidi_it->scan_dir;
296
87e67904 297 if (idx < bidi_cache_start || idx >= bidi_cache_idx)
b7b65b15
EZ
298 abort ();
299
300 bidi_copy_it (bidi_it, &bidi_cache[idx]);
301 bidi_it->scan_dir = current_scan_dir;
302 bidi_cache_last_idx = idx;
303}
304
305/* Find a cached state with a given CHARPOS and resolved embedding
306 level less or equal to LEVEL. if LEVEL is -1, disregard the
307 resolved levels in cached states. DIR, if non-zero, means search
308 in that direction from the last cache hit. */
fd3998ff 309static INLINE int
61bfec98 310bidi_cache_search (EMACS_INT charpos, int level, int dir)
b7b65b15
EZ
311{
312 int i, i_start;
313
314 if (bidi_cache_idx)
315 {
316 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
182ce2d2
EZ
317 {
318 dir = -1;
319 i_start = bidi_cache_last_idx - 1;
320 }
321 else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
322 + bidi_cache[bidi_cache_last_idx].nchars - 1))
323 {
324 dir = 1;
325 i_start = bidi_cache_last_idx + 1;
326 }
327 else if (dir)
b7b65b15
EZ
328 i_start = bidi_cache_last_idx;
329 else
330 {
331 dir = -1;
332 i_start = bidi_cache_idx - 1;
333 }
334
335 if (dir < 0)
336 {
337 /* Linear search for now; FIXME! */
87e67904 338 for (i = i_start; i >= bidi_cache_start; i--)
182ce2d2
EZ
339 if (bidi_cache[i].charpos <= charpos
340 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
341 && (level == -1 || bidi_cache[i].resolved_level <= level))
342 return i;
343 }
344 else
345 {
346 for (i = i_start; i < bidi_cache_idx; i++)
182ce2d2
EZ
347 if (bidi_cache[i].charpos <= charpos
348 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
349 && (level == -1 || bidi_cache[i].resolved_level <= level))
350 return i;
351 }
352 }
353
354 return -1;
355}
356
357/* Find a cached state where the resolved level changes to a value
358 that is lower than LEVEL, and return its cache slot index. DIR is
359 the direction to search, starting with the last used cache slot.
87e67904
EZ
360 If DIR is zero, we search backwards from the last occupied cache
361 slot. BEFORE, if non-zero, means return the index of the slot that
362 is ``before'' the level change in the search direction. That is,
b7b65b15
EZ
363 given the cached levels like this:
364
365 1122333442211
366 AB C
367
368 and assuming we are at the position cached at the slot marked with
369 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
370 index of slot B or A, depending whether BEFORE is, respectively,
371 non-zero or zero. */
372static int
373bidi_cache_find_level_change (int level, int dir, int before)
374{
375 if (bidi_cache_idx)
376 {
377 int i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
378 int incr = before ? 1 : 0;
379
380 if (!dir)
381 dir = -1;
382 else if (!incr)
383 i += dir;
384
385 if (dir < 0)
386 {
87e67904 387 while (i >= bidi_cache_start + incr)
b7b65b15
EZ
388 {
389 if (bidi_cache[i - incr].resolved_level >= 0
390 && bidi_cache[i - incr].resolved_level < level)
391 return i;
392 i--;
393 }
394 }
395 else
396 {
397 while (i < bidi_cache_idx - incr)
398 {
399 if (bidi_cache[i + incr].resolved_level >= 0
400 && bidi_cache[i + incr].resolved_level < level)
401 return i;
402 i++;
403 }
404 }
405 }
406
407 return -1;
408}
409
fd3998ff 410static INLINE void
b7b65b15
EZ
411bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
412{
413 int idx;
414
415 /* We should never cache on backward scans. */
416 if (bidi_it->scan_dir == -1)
417 abort ();
418 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
419
420 if (idx < 0)
421 {
422 idx = bidi_cache_idx;
2fe72643
EZ
423 /* Enlarge the cache as needed. */
424 if (idx >= bidi_cache_size)
425 {
0416466c 426 bidi_cache_size += BIDI_CACHE_CHUNK;
2fe72643 427 bidi_cache =
0416466c 428 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
2fe72643 429 }
bd924a5d
EZ
430 /* Character positions should correspond to cache positions 1:1.
431 If we are outside the range of cached positions, the cache is
432 useless and must be reset. */
87e67904 433 if (idx > bidi_cache_start &&
182ce2d2
EZ
434 (bidi_it->charpos > (bidi_cache[idx - 1].charpos
435 + bidi_cache[idx - 1].nchars)
bd924a5d
EZ
436 || bidi_it->charpos < bidi_cache[0].charpos))
437 {
438 bidi_cache_reset ();
87e67904 439 idx = bidi_cache_start;
bd924a5d 440 }
102ebb00
EZ
441 if (bidi_it->nchars <= 0)
442 abort ();
b7b65b15
EZ
443 bidi_copy_it (&bidi_cache[idx], bidi_it);
444 if (!resolved)
445 bidi_cache[idx].resolved_level = -1;
446 }
447 else
448 {
449 /* Copy only the members which could have changed, to avoid
450 costly copying of the entire struct. */
451 bidi_cache[idx].type = bidi_it->type;
2d6e4628 452 bidi_check_type (bidi_it->type);
89d3374a
EZ
453 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
454 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
455 if (resolved)
456 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
457 else
458 bidi_cache[idx].resolved_level = -1;
459 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
460 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
461 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
462 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
463 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
464 }
465
466 bidi_cache_last_idx = idx;
467 if (idx >= bidi_cache_idx)
468 bidi_cache_idx = idx + 1;
469}
470
fd3998ff 471static INLINE bidi_type_t
61bfec98 472bidi_cache_find (EMACS_INT charpos, int level, struct bidi_it *bidi_it)
b7b65b15
EZ
473{
474 int i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
475
87e67904 476 if (i >= bidi_cache_start)
b7b65b15
EZ
477 {
478 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
479
5009f85e 480 bidi_copy_it (bidi_it, &bidi_cache[i]);
b7b65b15
EZ
481 bidi_cache_last_idx = i;
482 /* Don't let scan direction from from the cached state override
483 the current scan direction. */
484 bidi_it->scan_dir = current_scan_dir;
485 return bidi_it->type;
486 }
487
488 return UNKNOWN_BT;
489}
490
fd3998ff 491static INLINE int
b7b65b15
EZ
492bidi_peek_at_next_level (struct bidi_it *bidi_it)
493{
87e67904 494 if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
b7b65b15
EZ
495 abort ();
496 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
497}
498
be39f003
EZ
499/* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
500 Value is the non-negative length of the paragraph separator
501 following the buffer position, -1 if position is at the beginning
502 of a new paragraph, or -2 if position is neither at beginning nor
503 at end of a paragraph. */
fd3998ff 504static EMACS_INT
6bff6497 505bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
b7b65b15 506{
372b7a95
EZ
507 Lisp_Object sep_re;
508 Lisp_Object start_re;
be39f003
EZ
509 EMACS_INT val;
510
372b7a95
EZ
511 sep_re = paragraph_separate_re;
512 start_re = paragraph_start_re;
be39f003
EZ
513
514 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
515 if (val < 0)
516 {
517 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
518 val = -1;
519 else
520 val = -2;
521 }
b7b65b15 522
be39f003 523 return val;
b7b65b15
EZ
524}
525
526/* Determine the start-of-run (sor) directional type given the two
527 embedding levels on either side of the run boundary. Also, update
528 the saved info about previously seen characters, since that info is
529 generally valid for a single level run. */
fd3998ff 530static INLINE void
b7b65b15
EZ
531bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
532{
533 int higher_level = level_before > level_after ? level_before : level_after;
534
535 /* The prev_was_pdf gork is required for when we have several PDFs
536 in a row. In that case, we want to compute the sor type for the
537 next level run only once: when we see the first PDF. That's
538 because the sor type depends only on the higher of the two levels
539 that we find on the two sides of the level boundary (see UAX#9,
540 clause X10), and so we don't need to know the final embedding
541 level to which we descend after processing all the PDFs. */
e342a24d 542 if (!bidi_it->prev_was_pdf || level_before < level_after)
b7b65b15
EZ
543 /* FIXME: should the default sor direction be user selectable? */
544 bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
545 if (level_before > level_after)
546 bidi_it->prev_was_pdf = 1;
547
548 bidi_it->prev.type = UNKNOWN_BT;
89d3374a
EZ
549 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
550 bidi_it->last_strong.orig_type = UNKNOWN_BT;
b7b65b15
EZ
551 bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
552 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
553 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
89d3374a
EZ
554 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
555 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
87e67904 556 bidi_it->ignore_bn_limit = -1; /* meaning it's unknown */
b7b65b15
EZ
557}
558
182ce2d2 559/* Perform initializations for reordering a new line of bidi text. */
6bff6497 560static void
be39f003
EZ
561bidi_line_init (struct bidi_it *bidi_it)
562{
563 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
564 bidi_it->resolved_level = bidi_it->level_stack[0].level;
565 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
566 bidi_it->invalid_levels = 0;
567 bidi_it->invalid_rl_levels = -1;
568 bidi_it->next_en_pos = -1;
569 bidi_it->next_for_ws.type = UNKNOWN_BT;
b44d9321
EZ
570 bidi_set_sor_type (bidi_it,
571 bidi_it->paragraph_dir == R2L ? 1 : 0,
be39f003
EZ
572 bidi_it->level_stack[0].level); /* X10 */
573
574 bidi_cache_reset ();
575}
576
87e67904
EZ
577/* Count bytes in multibyte string S between BEG/BEGBYTE and END. BEG
578 and END are zero-based character positions in S, BEGBYTE is byte
579 position corresponding to BEG. */
580static inline EMACS_INT
581bidi_count_bytes (const unsigned char *s, const EMACS_INT beg,
582 const EMACS_INT begbyte, const EMACS_INT end)
583{
584 EMACS_INT pos = beg;
585 const unsigned char *p = s + begbyte, *start = p;
586
587 if (!CHAR_HEAD_P (*p))
588 abort ();
589
590 while (pos < end)
591 {
592 p += BYTES_BY_CHAR_HEAD (*p);
593 pos++;
594 }
595
596 return p - start;
597}
598
599/* Fetch and returns the character at byte position BYTEPOS. If S is
600 non-NULL, fetch the character from string S; otherwise fetch the
601 character from the current buffer. */
602static inline int
603bidi_char_at_pos (EMACS_INT bytepos, const unsigned char *s)
604{
605 if (s)
606 return STRING_CHAR (s + bytepos);
607 else
608 return FETCH_MULTIBYTE_CHAR (bytepos);
609}
610
102ebb00
EZ
611/* Fetch and return the character at BYTEPOS/CHARPOS. If that
612 character is covered by a display string, treat the entire run of
613 covered characters as a single character u+FFFC, and return their
614 combined length in CH_LEN and NCHARS. DISP_POS specifies the
615 character position of the next display string, or -1 if not yet
616 computed. When the next character is at or beyond that position,
617 the function updates DISP_POS with the position of the next display
87e67904
EZ
618 string. STRING->s is the string to iterate, or NULL if iterating over
619 a buffer. */
620static inline int
102ebb00 621bidi_fetch_char (EMACS_INT bytepos, EMACS_INT charpos, EMACS_INT *disp_pos,
87e67904 622 struct bidi_string_data *string,
fc6f18ce 623 int frame_window_p, EMACS_INT *ch_len, EMACS_INT *nchars)
182ce2d2
EZ
624{
625 int ch;
87e67904 626 EMACS_INT endpos = string->s ? string->schars : ZV;
182ce2d2 627
182ce2d2 628 /* If we got past the last known position of display string, compute
87e67904
EZ
629 the position of the next one. That position could be at CHARPOS. */
630 if (charpos < endpos && charpos > *disp_pos)
631 *disp_pos = compute_display_string_pos (charpos, string, frame_window_p);
102ebb00
EZ
632
633 /* Fetch the character at BYTEPOS. */
87e67904 634 if (charpos >= endpos)
182ce2d2
EZ
635 {
636 ch = BIDI_EOB;
637 *ch_len = 1;
638 *nchars = 1;
87e67904 639 *disp_pos = endpos;
182ce2d2 640 }
102ebb00 641 else if (charpos >= *disp_pos)
182ce2d2 642 {
7b600102
EZ
643 EMACS_INT disp_end_pos;
644
645 /* We don't expect to find ourselves in the middle of a display
646 property. Hopefully, it will never be needed. */
647 if (charpos > *disp_pos)
648 abort ();
649 /* Return the Unicode Object Replacement Character to represent
87e67904 650 the entire run of characters covered by the display string. */
7b600102 651 ch = 0xFFFC;
87e67904 652 disp_end_pos = compute_display_string_end (*disp_pos, string);
7b600102 653 *nchars = disp_end_pos - *disp_pos;
87e67904
EZ
654 if (string->s)
655 *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
656 disp_end_pos);
657 else
658 *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
182ce2d2 659 }
182ce2d2
EZ
660 else
661 {
87e67904
EZ
662 if (string->s)
663 {
664 EMACS_INT len;
665
666 ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
667 *ch_len = len;
668 }
669 else
670 {
671 ch = FETCH_MULTIBYTE_CHAR (bytepos);
672 *ch_len = CHAR_BYTES (ch);
673 }
182ce2d2
EZ
674 *nchars = 1;
675 }
676
677 /* If we just entered a run of characters covered by a display
678 string, compute the position of the next display string. */
87e67904
EZ
679 if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos)
680 *disp_pos = compute_display_string_pos (charpos + *nchars, string,
681 frame_window_p);
182ce2d2
EZ
682
683 return ch;
684}
685
be39f003
EZ
686/* Find the beginning of this paragraph by looking back in the buffer.
687 Value is the byte position of the paragraph's beginning. */
688static EMACS_INT
b44d9321 689bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
6bff6497 690{
372b7a95 691 Lisp_Object re = paragraph_start_re;
6bff6497
EZ
692 EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
693
6bff6497
EZ
694 while (pos_byte > BEGV_BYTE
695 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
696 {
182ce2d2
EZ
697 /* FIXME: What if the paragraph beginning is covered by a
698 display string? And what if a display string covering some
699 of the text over which we scan back includes
700 paragraph_start_re? */
be39f003
EZ
701 pos = find_next_newline_no_quit (pos - 1, -1);
702 pos_byte = CHAR_TO_BYTE (pos);
6bff6497 703 }
be39f003 704 return pos_byte;
6bff6497
EZ
705}
706
bea4f10c 707/* Determine the base direction, a.k.a. base embedding level, of the
b44d9321
EZ
708 paragraph we are about to iterate through. If DIR is either L2R or
709 R2L, just use that. Otherwise, determine the paragraph direction
bea4f10c
EZ
710 from the first strong directional character of the paragraph.
711
182ce2d2 712 NO_DEFAULT_P non-zero means don't default to L2R if the paragraph
bea4f10c
EZ
713 has no strong directional characters and both DIR and
714 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
715 in the buffer until a paragraph is found with a strong character,
716 or until hitting BEGV. In the latter case, fall back to L2R. This
717 flag is used in current-bidi-paragraph-direction.
718
719 Note that this function gives the paragraph separator the same
720 direction as the preceding paragraph, even though Emacs generally
721 views the separartor as not belonging to any paragraph. */
b7b65b15 722void
bea4f10c 723bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p)
b7b65b15 724{
6bff6497 725 EMACS_INT bytepos = bidi_it->bytepos;
87e67904 726 int string_p = bidi_it->string.s != NULL;
bea4f10c 727 EMACS_INT pstartbyte;
87e67904
EZ
728 /* Note that begbyte is a byte position, while end is a character
729 position. Yes, this is ugly, but we are trying to avoid costly
730 calls to BYTE_TO_CHAR and its ilk. */
731 EMACS_INT begbyte = string_p ? 0 : BEGV_BYTE;
732 EMACS_INT end = string_p ? bidi_it->string.schars : ZV;
e7402cb2 733
5e65aec0 734 /* Special case for an empty buffer. */
87e67904 735 if (bytepos == begbyte && bidi_it->charpos == end)
5e65aec0 736 dir = L2R;
9c82e145 737 /* We should never be called at EOB or before BEGV. */
87e67904 738 else if (bidi_it->charpos >= end || bytepos < begbyte)
9c82e145
EZ
739 abort ();
740
be39f003
EZ
741 if (dir == L2R)
742 {
743 bidi_it->paragraph_dir = L2R;
744 bidi_it->new_paragraph = 0;
745 }
746 else if (dir == R2L)
747 {
748 bidi_it->paragraph_dir = R2L;
749 bidi_it->new_paragraph = 0;
750 }
b7b65b15
EZ
751 else if (dir == NEUTRAL_DIR) /* P2 */
752 {
182ce2d2
EZ
753 int ch;
754 EMACS_INT ch_len, nchars;
755 EMACS_INT pos, disp_pos = -1;
6bff6497 756 bidi_type_t type;
be39f003 757
d20e1419
EZ
758 if (!bidi_initialized)
759 bidi_initialize ();
760
be39f003
EZ
761 /* If we are inside a paragraph separator, we are just waiting
762 for the separator to be exhausted; use the previous paragraph
e5a2fec7
EZ
763 direction. But don't do that if we have been just reseated,
764 because we need to reinitialize below in that case. */
765 if (!bidi_it->first_elt
766 && bidi_it->charpos < bidi_it->separator_limit)
be39f003
EZ
767 return;
768
b44d9321 769 /* If we are on a newline, get past it to where the next
5e65aec0
EZ
770 paragraph might start. But don't do that at BEGV since then
771 we are potentially in a new paragraph that doesn't yet
772 exist. */
c143c213 773 pos = bidi_it->charpos;
87e67904
EZ
774 if (bytepos > begbyte
775 && bidi_char_at_pos (bytepos, bidi_it->string.s) == '\n')
be39f003 776 {
b44d9321 777 bytepos++;
c143c213 778 pos++;
be39f003 779 }
b44d9321
EZ
780
781 /* We are either at the beginning of a paragraph or in the
782 middle of it. Find where this paragraph starts. */
87e67904
EZ
783 if (string_p)
784 {
785 /* We don't support changes of paragraph direction inside a
786 string. It is treated as a single paragraph. */
787 pstartbyte = 0;
788 }
789 else
790 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
be39f003
EZ
791 bidi_it->separator_limit = -1;
792 bidi_it->new_paragraph = 0;
bea4f10c
EZ
793
794 /* The following loop is run more than once only if NO_DEFAULT_P
87e67904 795 is non-zero, and only if we are iterating on a buffer. */
bea4f10c
EZ
796 do {
797 bytepos = pstartbyte;
87e67904
EZ
798 if (!string_p)
799 pos = BYTE_TO_CHAR (bytepos);
800 ch = bidi_fetch_char (bytepos, pos, &disp_pos, &bidi_it->string,
801 bidi_it->frame_window_p, &ch_len, &nchars);
bea4f10c
EZ
802 type = bidi_get_type (ch, NEUTRAL_DIR);
803
182ce2d2 804 for (pos += nchars, bytepos += ch_len;
bea4f10c
EZ
805 /* NOTE: UAX#9 says to search only for L, AL, or R types
806 of characters, and ignore RLE, RLO, LRE, and LRO.
807 However, I'm not sure it makes sense to omit those 4;
808 should try with and without that to see the effect. */
809 (bidi_get_category (type) != STRONG)
810 || (bidi_ignore_explicit_marks_for_paragraph_level
811 && (type == RLE || type == RLO
812 || type == LRE || type == LRO));
813 type = bidi_get_type (ch, NEUTRAL_DIR))
814 {
87e67904
EZ
815 if (!string_p
816 && type == NEUTRAL_B
817 && bidi_at_paragraph_end (pos, bytepos) >= -1)
21fce5ab 818 break;
87e67904 819 if (pos >= end)
bea4f10c
EZ
820 {
821 /* Pretend there's a paragraph separator at end of
87e67904 822 buffer/string. */
bea4f10c
EZ
823 type = NEUTRAL_B;
824 break;
825 }
102ebb00 826 /* Fetch next character and advance to get past it. */
87e67904 827 ch = bidi_fetch_char (bytepos, pos, &disp_pos, &bidi_it->string,
fc6f18ce 828 bidi_it->frame_window_p, &ch_len, &nchars);
182ce2d2
EZ
829 pos += nchars;
830 bytepos += ch_len;
bea4f10c
EZ
831 }
832 if (type == STRONG_R || type == STRONG_AL) /* P3 */
833 bidi_it->paragraph_dir = R2L;
834 else if (type == STRONG_L)
835 bidi_it->paragraph_dir = L2R;
87e67904
EZ
836 if (!string_p
837 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
bea4f10c
EZ
838 {
839 /* If this paragraph is at BEGV, default to L2R. */
840 if (pstartbyte == BEGV_BYTE)
841 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
842 else
843 {
844 EMACS_INT prevpbyte = pstartbyte;
845 EMACS_INT p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
846
847 /* Find the beginning of the previous paragraph, if any. */
848 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
849 {
182ce2d2
EZ
850 /* FXIME: What if p is covered by a display
851 string? See also a FIXME inside
852 bidi_find_paragraph_start. */
bea4f10c
EZ
853 p--;
854 pbyte = CHAR_TO_BYTE (p);
855 prevpbyte = bidi_find_paragraph_start (p, pbyte);
856 }
857 pstartbyte = prevpbyte;
858 }
859 }
87e67904
EZ
860 } while (!string_p
861 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
b7b65b15 862 }
be39f003
EZ
863 else
864 abort ();
b7b65b15 865
b44d9321
EZ
866 /* Contrary to UAX#9 clause P3, we only default the paragraph
867 direction to L2R if we have no previous usable paragraph
bea4f10c 868 direction. This is allowed by the HL1 clause. */
d20e1419 869 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
bea4f10c 870 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
be39f003 871 if (bidi_it->paragraph_dir == R2L)
b44d9321 872 bidi_it->level_stack[0].level = 1;
be39f003 873 else
b44d9321 874 bidi_it->level_stack[0].level = 0;
be39f003
EZ
875
876 bidi_line_init (bidi_it);
b7b65b15
EZ
877}
878
6bff6497
EZ
879/* Do whatever UAX#9 clause X8 says should be done at paragraph's
880 end. */
fd3998ff 881static INLINE void
b7b65b15
EZ
882bidi_set_paragraph_end (struct bidi_it *bidi_it)
883{
884 bidi_it->invalid_levels = 0;
885 bidi_it->invalid_rl_levels = -1;
886 bidi_it->stack_idx = 0;
887 bidi_it->resolved_level = bidi_it->level_stack[0].level;
b7b65b15
EZ
888}
889
182ce2d2 890/* Initialize the bidi iterator from buffer/string position CHARPOS. */
b7b65b15 891void
fc6f18ce
EZ
892bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, int frame_window_p,
893 struct bidi_it *bidi_it)
b7b65b15
EZ
894{
895 if (! bidi_initialized)
896 bidi_initialize ();
87e67904
EZ
897 if (charpos >= 0)
898 bidi_it->charpos = charpos;
899 if (bytepos >= 0)
900 bidi_it->bytepos = bytepos;
fc6f18ce 901 bidi_it->frame_window_p = frame_window_p;
182ce2d2 902 bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */
6bff6497
EZ
903 bidi_it->first_elt = 1;
904 bidi_set_paragraph_end (bidi_it);
905 bidi_it->new_paragraph = 1;
be39f003 906 bidi_it->separator_limit = -1;
b7b65b15 907 bidi_it->type = NEUTRAL_B;
ebb5722e
EZ
908 bidi_it->type_after_w1 = NEUTRAL_B;
909 bidi_it->orig_type = NEUTRAL_B;
b7b65b15 910 bidi_it->prev_was_pdf = 0;
ebb5722e
EZ
911 bidi_it->prev.type = bidi_it->prev.type_after_w1 =
912 bidi_it->prev.orig_type = UNKNOWN_BT;
89d3374a
EZ
913 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
914 bidi_it->last_strong.orig_type = UNKNOWN_BT;
b7b65b15
EZ
915 bidi_it->next_for_neutral.charpos = -1;
916 bidi_it->next_for_neutral.type =
89d3374a
EZ
917 bidi_it->next_for_neutral.type_after_w1 =
918 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15
EZ
919 bidi_it->prev_for_neutral.charpos = -1;
920 bidi_it->prev_for_neutral.type =
89d3374a
EZ
921 bidi_it->prev_for_neutral.type_after_w1 =
922 bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15 923 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
182ce2d2 924 bidi_it->disp_pos = -1; /* invalid/unknown */
87e67904
EZ
925 /* We can only shrink the cache if we are at the bottom level of its
926 "stack". */
927 if (bidi_cache_start == 0)
928 bidi_cache_shrink ();
b7b65b15
EZ
929}
930
931/* Push the current embedding level and override status; reset the
932 current level to LEVEL and the current override status to OVERRIDE. */
fd3998ff 933static INLINE void
b7b65b15
EZ
934bidi_push_embedding_level (struct bidi_it *bidi_it,
935 int level, bidi_dir_t override)
936{
937 bidi_it->stack_idx++;
938 if (bidi_it->stack_idx >= BIDI_MAXLEVEL)
939 abort ();
940 bidi_it->level_stack[bidi_it->stack_idx].level = level;
941 bidi_it->level_stack[bidi_it->stack_idx].override = override;
942}
943
944/* Pop the embedding level and directional override status from the
945 stack, and return the new level. */
fd3998ff 946static INLINE int
b7b65b15
EZ
947bidi_pop_embedding_level (struct bidi_it *bidi_it)
948{
949 /* UAX#9 says to ignore invalid PDFs. */
950 if (bidi_it->stack_idx > 0)
951 bidi_it->stack_idx--;
952 return bidi_it->level_stack[bidi_it->stack_idx].level;
953}
954
955/* Record in SAVED_INFO the information about the current character. */
fd3998ff 956static INLINE void
b7b65b15
EZ
957bidi_remember_char (struct bidi_saved_info *saved_info,
958 struct bidi_it *bidi_it)
959{
960 saved_info->charpos = bidi_it->charpos;
961 saved_info->bytepos = bidi_it->bytepos;
962 saved_info->type = bidi_it->type;
2d6e4628 963 bidi_check_type (bidi_it->type);
89d3374a
EZ
964 saved_info->type_after_w1 = bidi_it->type_after_w1;
965 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 966 saved_info->orig_type = bidi_it->orig_type;
2d6e4628 967 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
968}
969
970/* Resolve the type of a neutral character according to the type of
971 surrounding strong text and the current embedding level. */
fd3998ff 972static INLINE bidi_type_t
b7b65b15
EZ
973bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
974{
975 /* N1: European and Arabic numbers are treated as though they were R. */
976 if (next_type == WEAK_EN || next_type == WEAK_AN)
977 next_type = STRONG_R;
978 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
979 prev_type = STRONG_R;
980
981 if (next_type == prev_type) /* N1 */
982 return next_type;
983 else if ((lev & 1) == 0) /* N2 */
984 return STRONG_L;
985 else
986 return STRONG_R;
987}
988
fd3998ff 989static INLINE int
182ce2d2 990bidi_explicit_dir_char (int ch)
b7b65b15 991{
182ce2d2
EZ
992 bidi_type_t ch_type;
993
994 if (!bidi_initialized)
995 abort ();
996 ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
997 return (ch_type == LRE || ch_type == LRO
998 || ch_type == RLE || ch_type == RLO
999 || ch_type == PDF);
b7b65b15
EZ
1000}
1001
1002/* A helper function for bidi_resolve_explicit. It advances to the
1003 next character in logical order and determines the new embedding
1004 level and directional override, but does not take into account
1005 empty embeddings. */
1006static int
1007bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1008{
1009 int curchar;
1010 bidi_type_t type;
1011 int current_level;
1012 int new_level;
1013 bidi_dir_t override;
87e67904 1014 int string_p = bidi_it->string.s != NULL;
b7b65b15 1015
182ce2d2
EZ
1016 /* If reseat()'ed, don't advance, so as to start iteration from the
1017 position where we were reseated. bidi_it->bytepos can be less
1018 than BEGV_BYTE after reseat to BEGV. */
87e67904 1019 if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
9c82e145 1020 || bidi_it->first_elt)
e7402cb2 1021 {
9c82e145 1022 bidi_it->first_elt = 0;
87e67904
EZ
1023 if (string_p)
1024 {
1025 if (bidi_it->charpos < 0)
1026 bidi_it->charpos = 0;
1027 bidi_it->bytepos = bidi_count_bytes (bidi_it->string.s, 0, 0,
1028 bidi_it->charpos);
1029 }
1030 else
1031 {
1032 if (bidi_it->charpos < BEGV)
1033 bidi_it->charpos = BEGV;
1034 bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
1035 }
e7402cb2 1036 }
87e67904
EZ
1037 /* Don't move at end of buffer/string. */
1038 else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
b7b65b15 1039 {
182ce2d2
EZ
1040 /* Advance to the next character, skipping characters covered by
1041 display strings (nchars > 1). */
102ebb00
EZ
1042 if (bidi_it->nchars <= 0)
1043 abort ();
182ce2d2 1044 bidi_it->charpos += bidi_it->nchars;
e7402cb2
EZ
1045 if (bidi_it->ch_len == 0)
1046 abort ();
b7b65b15
EZ
1047 bidi_it->bytepos += bidi_it->ch_len;
1048 }
1049
1050 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1051 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1052 new_level = current_level;
1053
87e67904 1054 if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
e7402cb2
EZ
1055 {
1056 curchar = BIDI_EOB;
1057 bidi_it->ch_len = 1;
182ce2d2 1058 bidi_it->nchars = 1;
87e67904 1059 bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
e7402cb2
EZ
1060 }
1061 else
1062 {
182ce2d2
EZ
1063 /* Fetch the character at BYTEPOS. If it is covered by a
1064 display string, treat the entire run of covered characters as
1065 a single character u+FFFC. */
102ebb00 1066 curchar = bidi_fetch_char (bidi_it->bytepos, bidi_it->charpos,
87e67904
EZ
1067 &bidi_it->disp_pos, &bidi_it->string,
1068 bidi_it->frame_window_p,
102ebb00 1069 &bidi_it->ch_len, &bidi_it->nchars);
e7402cb2 1070 }
b7b65b15 1071 bidi_it->ch = curchar;
b7b65b15 1072
6bff6497
EZ
1073 /* Don't apply directional override here, as all the types we handle
1074 below will not be affected by the override anyway, and we need
1075 the original type unaltered. The override will be applied in
1076 bidi_resolve_weak. */
1077 type = bidi_get_type (curchar, NEUTRAL_DIR);
89d3374a
EZ
1078 bidi_it->orig_type = type;
1079 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
1080
1081 if (type != PDF)
1082 bidi_it->prev_was_pdf = 0;
1083
89d3374a 1084 bidi_it->type_after_w1 = UNKNOWN_BT;
b7b65b15
EZ
1085
1086 switch (type)
1087 {
1088 case RLE: /* X2 */
1089 case RLO: /* X4 */
89d3374a
EZ
1090 bidi_it->type_after_w1 = type;
1091 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1092 type = WEAK_BN; /* X9/Retaining */
87e67904 1093 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1094 {
1095 if (current_level <= BIDI_MAXLEVEL - 4)
1096 {
1097 /* Compute the least odd embedding level greater than
1098 the current level. */
1099 new_level = ((current_level + 1) & ~1) + 1;
89d3374a 1100 if (bidi_it->type_after_w1 == RLE)
b7b65b15
EZ
1101 override = NEUTRAL_DIR;
1102 else
1103 override = R2L;
1104 if (current_level == BIDI_MAXLEVEL - 4)
1105 bidi_it->invalid_rl_levels = 0;
1106 bidi_push_embedding_level (bidi_it, new_level, override);
1107 }
1108 else
1109 {
1110 bidi_it->invalid_levels++;
1111 /* See the commentary about invalid_rl_levels below. */
1112 if (bidi_it->invalid_rl_levels < 0)
1113 bidi_it->invalid_rl_levels = 0;
1114 bidi_it->invalid_rl_levels++;
1115 }
1116 }
89d3374a 1117 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1118 || bidi_it->next_en_pos > bidi_it->charpos)
1119 type = WEAK_EN;
1120 break;
1121 case LRE: /* X3 */
1122 case LRO: /* X5 */
89d3374a
EZ
1123 bidi_it->type_after_w1 = type;
1124 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1125 type = WEAK_BN; /* X9/Retaining */
87e67904 1126 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1127 {
1128 if (current_level <= BIDI_MAXLEVEL - 5)
1129 {
1130 /* Compute the least even embedding level greater than
1131 the current level. */
1132 new_level = ((current_level + 2) & ~1);
89d3374a 1133 if (bidi_it->type_after_w1 == LRE)
b7b65b15
EZ
1134 override = NEUTRAL_DIR;
1135 else
1136 override = L2R;
1137 bidi_push_embedding_level (bidi_it, new_level, override);
1138 }
1139 else
1140 {
1141 bidi_it->invalid_levels++;
1142 /* invalid_rl_levels counts invalid levels encountered
1143 while the embedding level was already too high for
1144 LRE/LRO, but not for RLE/RLO. That is because
1145 there may be exactly one PDF which we should not
1146 ignore even though invalid_levels is non-zero.
1147 invalid_rl_levels helps to know what PDF is
1148 that. */
1149 if (bidi_it->invalid_rl_levels >= 0)
1150 bidi_it->invalid_rl_levels++;
1151 }
1152 }
89d3374a 1153 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1154 || bidi_it->next_en_pos > bidi_it->charpos)
1155 type = WEAK_EN;
1156 break;
1157 case PDF: /* X7 */
89d3374a
EZ
1158 bidi_it->type_after_w1 = type;
1159 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1160 type = WEAK_BN; /* X9/Retaining */
87e67904 1161 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1162 {
1163 if (!bidi_it->invalid_rl_levels)
1164 {
1165 new_level = bidi_pop_embedding_level (bidi_it);
1166 bidi_it->invalid_rl_levels = -1;
1167 if (bidi_it->invalid_levels)
1168 bidi_it->invalid_levels--;
1169 /* else nothing: UAX#9 says to ignore invalid PDFs */
1170 }
1171 if (!bidi_it->invalid_levels)
1172 new_level = bidi_pop_embedding_level (bidi_it);
1173 else
1174 {
1175 bidi_it->invalid_levels--;
1176 bidi_it->invalid_rl_levels--;
1177 }
1178 }
89d3374a 1179 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1180 || bidi_it->next_en_pos > bidi_it->charpos)
1181 type = WEAK_EN;
1182 break;
1183 default:
1184 /* Nothing. */
1185 break;
1186 }
1187
1188 bidi_it->type = type;
2d6e4628 1189 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1190
1191 return new_level;
1192}
1193
1194/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1195 in the buffer/string to the next character (in the logical order),
1196 resolve any explicit embeddings and directional overrides, and
1197 return the embedding level of the character after resolving
1198 explicit directives and ignoring empty embeddings. */
b7b65b15
EZ
1199static int
1200bidi_resolve_explicit (struct bidi_it *bidi_it)
1201{
1202 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1203 int new_level = bidi_resolve_explicit_1 (bidi_it);
87e67904 1204 EMACS_INT eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
b7b65b15
EZ
1205
1206 if (prev_level < new_level
1207 && bidi_it->type == WEAK_BN
87e67904
EZ
1208 && bidi_it->ignore_bn_limit == -1 /* only if not already known */
1209 && bidi_it->charpos < eob /* not already at EOB */
1210 && bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
1211 + bidi_it->ch_len,
1212 bidi_it->string.s)))
b7b65b15
EZ
1213 {
1214 /* Avoid pushing and popping embedding levels if the level run
1215 is empty, as this breaks level runs where it shouldn't.
1216 UAX#9 removes all the explicit embedding and override codes,
1217 so empty embeddings disappear without a trace. We need to
1218 behave as if we did the same. */
1219 struct bidi_it saved_it;
1220 int level = prev_level;
1221
1222 bidi_copy_it (&saved_it, bidi_it);
1223
87e67904
EZ
1224 while (bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
1225 + bidi_it->ch_len,
1226 bidi_it->string.s)))
b7b65b15 1227 {
182ce2d2
EZ
1228 /* This advances to the next character, skipping any
1229 characters covered by display strings. */
b7b65b15
EZ
1230 level = bidi_resolve_explicit_1 (bidi_it);
1231 }
1232
102ebb00
EZ
1233 if (bidi_it->nchars <= 0)
1234 abort ();
b7b65b15 1235 if (level == prev_level) /* empty embedding */
182ce2d2 1236 saved_it.ignore_bn_limit = bidi_it->charpos + bidi_it->nchars;
b7b65b15 1237 else /* this embedding is non-empty */
87e67904 1238 saved_it.ignore_bn_limit = -2;
b7b65b15
EZ
1239
1240 bidi_copy_it (bidi_it, &saved_it);
87e67904 1241 if (bidi_it->ignore_bn_limit > -1)
b7b65b15
EZ
1242 {
1243 /* We pushed a level, but we shouldn't have. Undo that. */
1244 if (!bidi_it->invalid_rl_levels)
1245 {
1246 new_level = bidi_pop_embedding_level (bidi_it);
1247 bidi_it->invalid_rl_levels = -1;
1248 if (bidi_it->invalid_levels)
1249 bidi_it->invalid_levels--;
1250 }
1251 if (!bidi_it->invalid_levels)
1252 new_level = bidi_pop_embedding_level (bidi_it);
1253 else
1254 {
1255 bidi_it->invalid_levels--;
1256 bidi_it->invalid_rl_levels--;
1257 }
1258 }
1259 }
1260
b7b65b15
EZ
1261 if (bidi_it->type == NEUTRAL_B) /* X8 */
1262 {
21fce5ab 1263 bidi_set_paragraph_end (bidi_it);
6bff6497
EZ
1264 /* This is needed by bidi_resolve_weak below, and in L1. */
1265 bidi_it->type_after_w1 = bidi_it->type;
89d3374a 1266 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1267 }
1268
1269 return new_level;
1270}
1271
182ce2d2
EZ
1272/* Advance in the buffer/string, resolve weak types and return the
1273 type of the next character after weak type resolution. */
fd3998ff 1274static bidi_type_t
b7b65b15
EZ
1275bidi_resolve_weak (struct bidi_it *bidi_it)
1276{
1277 bidi_type_t type;
1278 bidi_dir_t override;
1279 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1280 int new_level = bidi_resolve_explicit (bidi_it);
1281 int next_char;
1282 bidi_type_t type_of_next;
1283 struct bidi_it saved_it;
87e67904 1284 EMACS_INT eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
b7b65b15
EZ
1285
1286 type = bidi_it->type;
1287 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1288
1289 if (type == UNKNOWN_BT
1290 || type == LRE
1291 || type == LRO
1292 || type == RLE
1293 || type == RLO
1294 || type == PDF)
1295 abort ();
1296
1297 if (new_level != prev_level
1298 || bidi_it->type == NEUTRAL_B)
1299 {
1300 /* We've got a new embedding level run, compute the directional
1301 type of sor and initialize per-run variables (UAX#9, clause
1302 X10). */
1303 bidi_set_sor_type (bidi_it, prev_level, new_level);
1304 }
1305 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1306 || type == WEAK_BN || type == STRONG_AL)
89d3374a
EZ
1307 bidi_it->type_after_w1 = type; /* needed in L1 */
1308 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1309
1310 /* Level and directional override status are already recorded in
1311 bidi_it, and do not need any change; see X6. */
1312 if (override == R2L) /* X6 */
1313 type = STRONG_R;
1314 else if (override == L2R)
1315 type = STRONG_L;
bc5a45f3 1316 else
b7b65b15 1317 {
bc5a45f3 1318 if (type == WEAK_NSM) /* W1 */
b7b65b15 1319 {
bc5a45f3 1320 /* Note that we don't need to consider the case where the
5930fe97
EZ
1321 prev character has its type overridden by an RLO or LRO,
1322 because then either the type of this NSM would have been
1323 also overridden, or the previous character is outside the
1324 current level run, and thus not relevant to this NSM.
1325 This is why NSM gets the type_after_w1 of the previous
1326 character. */
ebb5722e
EZ
1327 if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1328 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1329 && bidi_it->prev.type_after_w1 != NEUTRAL_B)
5930fe97 1330 type = bidi_it->prev.type_after_w1;
bc5a45f3
EZ
1331 else if (bidi_it->sor == R2L)
1332 type = STRONG_R;
1333 else if (bidi_it->sor == L2R)
1334 type = STRONG_L;
1335 else /* shouldn't happen! */
1336 abort ();
b7b65b15 1337 }
bc5a45f3
EZ
1338 if (type == WEAK_EN /* W2 */
1339 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1340 type = WEAK_AN;
1341 else if (type == STRONG_AL) /* W3 */
1342 type = STRONG_R;
1343 else if ((type == WEAK_ES /* W4 */
1344 && bidi_it->prev.type_after_w1 == WEAK_EN
1345 && bidi_it->prev.orig_type == WEAK_EN)
1346 || (type == WEAK_CS
1347 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1348 && bidi_it->prev.orig_type == WEAK_EN)
1349 || bidi_it->prev.type_after_w1 == WEAK_AN)))
b7b65b15 1350 {
e7402cb2 1351 next_char =
87e67904
EZ
1352 bidi_it->charpos + bidi_it->nchars >= eob
1353 ? BIDI_EOB
1354 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
1355 bidi_it->string.s);
6bff6497 1356 type_of_next = bidi_get_type (next_char, override);
b7b65b15 1357
bc5a45f3 1358 if (type_of_next == WEAK_BN
b7b65b15
EZ
1359 || bidi_explicit_dir_char (next_char))
1360 {
1361 bidi_copy_it (&saved_it, bidi_it);
1362 while (bidi_resolve_explicit (bidi_it) == new_level
bc5a45f3 1363 && bidi_it->type == WEAK_BN)
b7b65b15
EZ
1364 ;
1365 type_of_next = bidi_it->type;
b7b65b15
EZ
1366 bidi_copy_it (bidi_it, &saved_it);
1367 }
bc5a45f3
EZ
1368
1369 /* If the next character is EN, but the last strong-type
1370 character is AL, that next EN will be changed to AN when
1371 we process it in W2 above. So in that case, this ES
1372 should not be changed into EN. */
1373 if (type == WEAK_ES
1374 && type_of_next == WEAK_EN
1375 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1376 type = WEAK_EN;
1377 else if (type == WEAK_CS)
b7b65b15 1378 {
bc5a45f3
EZ
1379 if (bidi_it->prev.type_after_w1 == WEAK_AN
1380 && (type_of_next == WEAK_AN
1381 /* If the next character is EN, but the last
1382 strong-type character is AL, EN will be later
1383 changed to AN when we process it in W2 above.
1384 So in that case, this ES should not be
1385 changed into EN. */
1386 || (type_of_next == WEAK_EN
1387 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1388 type = WEAK_AN;
1389 else if (bidi_it->prev.type_after_w1 == WEAK_EN
1390 && type_of_next == WEAK_EN
1391 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1392 type = WEAK_EN;
1393 }
1394 }
1395 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1396 || type == WEAK_BN) /* W5/Retaining */
1397 {
1398 if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
1399 || bidi_it->next_en_pos > bidi_it->charpos)
1400 type = WEAK_EN;
1401 else /* W5: ET/BN with EN after it. */
1402 {
182ce2d2 1403 EMACS_INT en_pos = bidi_it->charpos + bidi_it->nchars;
bc5a45f3 1404
102ebb00
EZ
1405 if (bidi_it->nchars <= 0)
1406 abort ();
bc5a45f3 1407 next_char =
87e67904
EZ
1408 bidi_it->charpos + bidi_it->nchars >= eob
1409 ? BIDI_EOB
1410 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
1411 bidi_it->string.s);
bc5a45f3
EZ
1412 type_of_next = bidi_get_type (next_char, override);
1413
1414 if (type_of_next == WEAK_ET
1415 || type_of_next == WEAK_BN
1416 || bidi_explicit_dir_char (next_char))
1417 {
1418 bidi_copy_it (&saved_it, bidi_it);
1419 while (bidi_resolve_explicit (bidi_it) == new_level
1420 && (bidi_it->type == WEAK_BN
1421 || bidi_it->type == WEAK_ET))
1422 ;
1423 type_of_next = bidi_it->type;
1424 en_pos = bidi_it->charpos;
1425 bidi_copy_it (bidi_it, &saved_it);
1426 }
1427 if (type_of_next == WEAK_EN)
b7b65b15 1428 {
bc5a45f3
EZ
1429 /* If the last strong character is AL, the EN we've
1430 found will become AN when we get to it (W2). */
1431 if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
1432 {
1433 type = WEAK_EN;
1434 /* Remember this EN position, to speed up processing
1435 of the next ETs. */
1436 bidi_it->next_en_pos = en_pos;
1437 }
1438 else if (type == WEAK_BN)
1439 type = NEUTRAL_ON; /* W6/Retaining */
b7b65b15 1440 }
b7b65b15
EZ
1441 }
1442 }
1443 }
1444
1445 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
89d3374a
EZ
1446 || (type == WEAK_BN
1447 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1448 || bidi_it->prev.type_after_w1 == WEAK_ES
1449 || bidi_it->prev.type_after_w1 == WEAK_ET)))
b7b65b15
EZ
1450 type = NEUTRAL_ON;
1451
1452 /* Store the type we've got so far, before we clobber it with strong
1453 types in W7 and while resolving neutral types. But leave alone
1454 the original types that were recorded above, because we will need
1455 them for the L1 clause. */
89d3374a
EZ
1456 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1457 bidi_it->type_after_w1 = type;
1458 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1459
1460 if (type == WEAK_EN) /* W7 */
1461 {
89d3374a 1462 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
b7b65b15
EZ
1463 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1464 type = STRONG_L;
1465 }
1466
1467 bidi_it->type = type;
2d6e4628 1468 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1469 return type;
1470}
1471
fd3998ff 1472static bidi_type_t
b7b65b15
EZ
1473bidi_resolve_neutral (struct bidi_it *bidi_it)
1474{
1475 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1476 bidi_type_t type = bidi_resolve_weak (bidi_it);
1477 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1478
1479 if (!(type == STRONG_R
1480 || type == STRONG_L
1481 || type == WEAK_BN
1482 || type == WEAK_EN
1483 || type == WEAK_AN
1484 || type == NEUTRAL_B
1485 || type == NEUTRAL_S
1486 || type == NEUTRAL_WS
1487 || type == NEUTRAL_ON))
1488 abort ();
1489
1490 if (bidi_get_category (type) == NEUTRAL
1491 || (type == WEAK_BN && prev_level == current_level))
1492 {
1493 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1494 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1495 bidi_it->next_for_neutral.type,
1496 current_level);
1497 else
1498 {
1499 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1500 the assumption of batch-style processing; see clauses W4,
1501 W5, and especially N1, which require to look far forward
182ce2d2
EZ
1502 (as well as back) in the buffer/string. May the fleas of
1503 a thousand camels infest the armpits of those who design
b7b65b15
EZ
1504 supposedly general-purpose algorithms by looking at their
1505 own implementations, and fail to consider other possible
1506 implementations! */
1507 struct bidi_it saved_it;
1508 bidi_type_t next_type;
1509
1510 if (bidi_it->scan_dir == -1)
1511 abort ();
1512
1513 bidi_copy_it (&saved_it, bidi_it);
1514 /* Scan the text forward until we find the first non-neutral
1515 character, and then use that to resolve the neutral we
1516 are dealing with now. We also cache the scanned iterator
1517 states, to salvage some of the effort later. */
1518 bidi_cache_iterator_state (bidi_it, 0);
1519 do {
1520 /* Record the info about the previous character, so that
1521 it will be cached below with this state. */
89d3374a 1522 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1523 && bidi_it->type != WEAK_BN)
1524 bidi_remember_char (&bidi_it->prev, bidi_it);
1525 type = bidi_resolve_weak (bidi_it);
1526 /* Paragraph separators have their levels fully resolved
1527 at this point, so cache them as resolved. */
1528 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1529 /* FIXME: implement L1 here, by testing for a newline and
1530 resetting the level for any sequence of whitespace
1531 characters adjacent to it. */
1532 } while (!(type == NEUTRAL_B
1533 || (type != WEAK_BN
1534 && bidi_get_category (type) != NEUTRAL)
1535 /* This is all per level run, so stop when we
1536 reach the end of this level run. */
1537 || bidi_it->level_stack[bidi_it->stack_idx].level !=
1538 current_level));
1539
1540 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1541
1542 switch (type)
1543 {
1544 case STRONG_L:
1545 case STRONG_R:
1546 case STRONG_AL:
1547 next_type = type;
1548 break;
1549 case WEAK_EN:
1550 case WEAK_AN:
1551 /* N1: ``European and Arabic numbers are treated as
1552 though they were R.'' */
1553 next_type = STRONG_R;
1554 saved_it.next_for_neutral.type = STRONG_R;
1555 break;
1556 case WEAK_BN:
1557 if (!bidi_explicit_dir_char (bidi_it->ch))
1558 abort (); /* can't happen: BNs are skipped */
1559 /* FALLTHROUGH */
1560 case NEUTRAL_B:
1561 /* Marched all the way to the end of this level run.
1562 We need to use the eor type, whose information is
1563 stored by bidi_set_sor_type in the prev_for_neutral
1564 member. */
1565 if (saved_it.type != WEAK_BN
89d3374a 1566 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
b7b65b15
EZ
1567 {
1568 next_type = bidi_it->prev_for_neutral.type;
1569 saved_it.next_for_neutral.type = next_type;
2d6e4628 1570 bidi_check_type (next_type);
b7b65b15
EZ
1571 }
1572 else
1573 {
1574 /* This is a BN which does not adjoin neutrals.
1575 Leave its type alone. */
1576 bidi_copy_it (bidi_it, &saved_it);
1577 return bidi_it->type;
1578 }
1579 break;
1580 default:
1581 abort ();
1582 }
1583 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1584 next_type, current_level);
1585 saved_it.type = type;
2d6e4628 1586 bidi_check_type (type);
b7b65b15
EZ
1587 bidi_copy_it (bidi_it, &saved_it);
1588 }
1589 }
1590 return type;
1591}
1592
1593/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1594 in the buffer/string to the next character (in the logical order),
1595 resolve the bidi type of that next character, and return that
1596 type. */
fd3998ff 1597static bidi_type_t
b7b65b15
EZ
1598bidi_type_of_next_char (struct bidi_it *bidi_it)
1599{
1600 bidi_type_t type;
1601
1602 /* This should always be called during a forward scan. */
1603 if (bidi_it->scan_dir != 1)
1604 abort ();
1605
1606 /* Reset the limit until which to ignore BNs if we step out of the
1607 area where we found only empty levels. */
87e67904 1608 if ((bidi_it->ignore_bn_limit > -1
b7b65b15 1609 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
87e67904 1610 || (bidi_it->ignore_bn_limit == -2
b7b65b15 1611 && !bidi_explicit_dir_char (bidi_it->ch)))
87e67904 1612 bidi_it->ignore_bn_limit = -1;
b7b65b15
EZ
1613
1614 type = bidi_resolve_neutral (bidi_it);
1615
1616 return type;
1617}
1618
1619/* Given an iterator state BIDI_IT, advance one character position in
182ce2d2
EZ
1620 the buffer/string to the next character (in the current scan
1621 direction), resolve the embedding and implicit levels of that next
1622 character, and return the resulting level. */
fd3998ff 1623static int
b7b65b15
EZ
1624bidi_level_of_next_char (struct bidi_it *bidi_it)
1625{
1626 bidi_type_t type;
1627 int level, prev_level = -1;
1628 struct bidi_saved_info next_for_neutral;
87e67904 1629 EMACS_INT next_char_pos = -1;
b7b65b15
EZ
1630
1631 if (bidi_it->scan_dir == 1)
1632 {
1633 /* There's no sense in trying to advance if we hit end of text. */
87e67904 1634 if (bidi_it->charpos >= (bidi_it->string.s ? bidi_it->string.schars : ZV))
b7b65b15
EZ
1635 return bidi_it->resolved_level;
1636
1637 /* Record the info about the previous character. */
89d3374a 1638 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1639 && bidi_it->type != WEAK_BN)
1640 bidi_remember_char (&bidi_it->prev, bidi_it);
89d3374a
EZ
1641 if (bidi_it->type_after_w1 == STRONG_R
1642 || bidi_it->type_after_w1 == STRONG_L
1643 || bidi_it->type_after_w1 == STRONG_AL)
b7b65b15
EZ
1644 bidi_remember_char (&bidi_it->last_strong, bidi_it);
1645 /* FIXME: it sounds like we don't need both prev and
1646 prev_for_neutral members, but I'm leaving them both for now. */
1647 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1648 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1649 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1650
1651 /* If we overstepped the characters used for resolving neutrals
1652 and whitespace, invalidate their info in the iterator. */
1653 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1654 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1655 if (bidi_it->next_en_pos >= 0
1656 && bidi_it->charpos >= bidi_it->next_en_pos)
1657 bidi_it->next_en_pos = -1;
1658 if (bidi_it->next_for_ws.type != UNKNOWN_BT
1659 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1660 bidi_it->next_for_ws.type = UNKNOWN_BT;
1661
1662 /* This must be taken before we fill the iterator with the info
1663 about the next char. If we scan backwards, the iterator
1664 state must be already cached, so there's no need to know the
1665 embedding level of the previous character, since we will be
1666 returning to our caller shortly. */
1667 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1668 }
1669 next_for_neutral = bidi_it->next_for_neutral;
1670
182ce2d2
EZ
1671 /* Perhaps the character we want is already cached. If it is, the
1672 call to bidi_cache_find below will return a type other than
1673 UNKNOWN_BT. */
87e67904 1674 if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
102ebb00
EZ
1675 {
1676 if (bidi_it->scan_dir > 0)
1677 {
1678 if (bidi_it->nchars <= 0)
1679 abort ();
1680 next_char_pos = bidi_it->charpos + bidi_it->nchars;
1681 }
87e67904 1682 else if (bidi_it->charpos > (bidi_it->string.s ? 0 : 1))
102ebb00 1683 next_char_pos = bidi_it->charpos - 1;
87e67904
EZ
1684 if (next_char_pos >= 0)
1685 type = bidi_cache_find (next_char_pos, -1, bidi_it);
1686 else
1687 type = UNKNOWN_BT;
102ebb00 1688 }
182ce2d2 1689 else
102ebb00 1690 type = UNKNOWN_BT;
b7b65b15
EZ
1691 if (type != UNKNOWN_BT)
1692 {
1693 /* Don't lose the information for resolving neutrals! The
1694 cached states could have been cached before their
1695 next_for_neutral member was computed. If we are on our way
1696 forward, we can simply take the info from the previous
1697 state. */
1698 if (bidi_it->scan_dir == 1
1699 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
1700 bidi_it->next_for_neutral = next_for_neutral;
1701
1702 /* If resolved_level is -1, it means this state was cached
1703 before it was completely resolved, so we cannot return
1704 it. */
1705 if (bidi_it->resolved_level != -1)
1706 return bidi_it->resolved_level;
1707 }
1708 if (bidi_it->scan_dir == -1)
1709 /* If we are going backwards, the iterator state is already cached
1710 from previous scans, and should be fully resolved. */
1711 abort ();
1712
1713 if (type == UNKNOWN_BT)
1714 type = bidi_type_of_next_char (bidi_it);
1715
1716 if (type == NEUTRAL_B)
1717 return bidi_it->resolved_level;
1718
1719 level = bidi_it->level_stack[bidi_it->stack_idx].level;
1720 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
1721 || (type == WEAK_BN && prev_level == level))
1722 {
1723 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1724 abort ();
1725
1726 /* If the cached state shows a neutral character, it was not
1727 resolved by bidi_resolve_neutral, so do it now. */
1728 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1729 bidi_it->next_for_neutral.type,
1730 level);
1731 }
1732
1733 if (!(type == STRONG_R
1734 || type == STRONG_L
1735 || type == WEAK_BN
1736 || type == WEAK_EN
1737 || type == WEAK_AN))
1738 abort ();
1739 bidi_it->type = type;
2d6e4628 1740 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1741
1742 /* For L1 below, we need to know, for each WS character, whether
0d327994 1743 it belongs to a sequence of WS characters preceding a newline
b7b65b15 1744 or a TAB or a paragraph separator. */
89d3374a 1745 if (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1746 && bidi_it->next_for_ws.type == UNKNOWN_BT)
1747 {
1748 int ch;
9d68c2a9 1749 EMACS_INT clen = bidi_it->ch_len;
6bff6497
EZ
1750 EMACS_INT bpos = bidi_it->bytepos;
1751 EMACS_INT cpos = bidi_it->charpos;
182ce2d2 1752 EMACS_INT disp_pos = bidi_it->disp_pos;
102ebb00 1753 EMACS_INT nc = bidi_it->nchars;
87e67904 1754 struct bidi_string_data bs = bidi_it->string;
b7b65b15 1755 bidi_type_t chtype;
fc6f18ce 1756 int fwp = bidi_it->frame_window_p;
b7b65b15 1757
102ebb00
EZ
1758 if (bidi_it->nchars <= 0)
1759 abort ();
b7b65b15 1760 do {
87e67904 1761 ch = bidi_fetch_char (bpos += clen, cpos += nc, &disp_pos, &bs, fwp,
fc6f18ce 1762 &clen, &nc);
e7402cb2 1763 if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
b7b65b15
EZ
1764 chtype = NEUTRAL_B;
1765 else
6bff6497 1766 chtype = bidi_get_type (ch, NEUTRAL_DIR);
b7b65b15
EZ
1767 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
1768 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
1769 bidi_it->next_for_ws.type = chtype;
2d6e4628 1770 bidi_check_type (bidi_it->next_for_ws.type);
b7b65b15
EZ
1771 bidi_it->next_for_ws.charpos = cpos;
1772 bidi_it->next_for_ws.bytepos = bpos;
1773 }
1774
1775 /* Resolve implicit levels, with a twist: PDFs get the embedding
1776 level of the enbedding they terminate. See below for the
1777 reason. */
89d3374a 1778 if (bidi_it->orig_type == PDF
b7b65b15
EZ
1779 /* Don't do this if this formatting code didn't change the
1780 embedding level due to invalid or empty embeddings. */
1781 && prev_level != level)
1782 {
1783 /* Don't look in UAX#9 for the reason for this: it's our own
1784 private quirk. The reason is that we want the formatting
1785 codes to be delivered so that they bracket the text of their
1786 embedding. For example, given the text
1787
1788 {RLO}teST{PDF}
1789
1790 we want it to be displayed as
1791
0d68907d 1792 {PDF}STet{RLO}
b7b65b15
EZ
1793
1794 not as
1795
1796 STet{RLO}{PDF}
1797
1798 which will result because we bump up the embedding level as
1799 soon as we see the RLO and pop it as soon as we see the PDF,
1800 so RLO itself has the same embedding level as "teST", and
1801 thus would be normally delivered last, just before the PDF.
1802 The switch below fiddles with the level of PDF so that this
1803 ugly side effect does not happen.
1804
1805 (This is, of course, only important if the formatting codes
e7402cb2
EZ
1806 are actually displayed, but Emacs does need to display them
1807 if the user wants to.) */
b7b65b15
EZ
1808 level = prev_level;
1809 }
89d3374a
EZ
1810 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
1811 || bidi_it->orig_type == NEUTRAL_S
e7402cb2
EZ
1812 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
1813 /* || bidi_it->ch == LINESEP_CHAR */
89d3374a 1814 || (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1815 && (bidi_it->next_for_ws.type == NEUTRAL_B
1816 || bidi_it->next_for_ws.type == NEUTRAL_S)))
1817 level = bidi_it->level_stack[0].level;
1818 else if ((level & 1) == 0) /* I1 */
1819 {
1820 if (type == STRONG_R)
1821 level++;
1822 else if (type == WEAK_EN || type == WEAK_AN)
1823 level += 2;
1824 }
1825 else /* I2 */
1826 {
1827 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
1828 level++;
1829 }
1830
1831 bidi_it->resolved_level = level;
1832 return level;
1833}
1834
1835/* Move to the other edge of a level given by LEVEL. If END_FLAG is
1836 non-zero, we are at the end of a level, and we need to prepare to
1837 resume the scan of the lower level.
1838
1839 If this level's other edge is cached, we simply jump to it, filling
1840 the iterator structure with the iterator state on the other edge.
182ce2d2
EZ
1841 Otherwise, we walk the buffer or string until we come back to the
1842 same level as LEVEL.
b7b65b15
EZ
1843
1844 Note: we are not talking here about a ``level run'' in the UAX#9
1845 sense of the term, but rather about a ``level'' which includes
1846 all the levels higher than it. In other words, given the levels
1847 like this:
1848
1849 11111112222222333333334443343222222111111112223322111
1850 A B C
1851
1852 and assuming we are at point A scanning left to right, this
1853 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
1854 at point B. */
1855static void
1856bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
1857{
1858 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
1859 int idx;
1860
1861 /* Try the cache first. */
87e67904
EZ
1862 if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
1863 >= bidi_cache_start)
b7b65b15
EZ
1864 bidi_cache_fetch_state (idx, bidi_it);
1865 else
1866 {
1867 int new_level;
1868
1869 if (end_flag)
1870 abort (); /* if we are at end of level, its edges must be cached */
1871
1872 bidi_cache_iterator_state (bidi_it, 1);
1873 do {
1874 new_level = bidi_level_of_next_char (bidi_it);
1875 bidi_cache_iterator_state (bidi_it, 1);
1876 } while (new_level >= level);
1877 }
1878}
1879
1880void
4b292a22 1881bidi_move_to_visually_next (struct bidi_it *bidi_it)
b7b65b15
EZ
1882{
1883 int old_level, new_level, next_level;
9c82e145 1884 struct bidi_it sentinel;
b7b65b15 1885
87e67904
EZ
1886 if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
1887 abort ();
1888
b7b65b15
EZ
1889 if (bidi_it->scan_dir == 0)
1890 {
1891 bidi_it->scan_dir = 1; /* default to logical order */
1892 }
1893
be39f003
EZ
1894 /* If we just passed a newline, initialize for the next line. */
1895 if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
1896 bidi_line_init (bidi_it);
1897
6dcfd253
EZ
1898 /* Prepare the sentinel iterator state, and cache it. When we bump
1899 into it, scanning backwards, we'll know that the last non-base
1900 level is exhausted. */
87e67904 1901 if (bidi_cache_idx == bidi_cache_start)
9c82e145
EZ
1902 {
1903 bidi_copy_it (&sentinel, bidi_it);
1904 if (bidi_it->first_elt)
1905 {
1906 sentinel.charpos--; /* cached charpos needs to be monotonic */
1907 sentinel.bytepos--;
1908 sentinel.ch = '\n'; /* doesn't matter, but why not? */
1909 sentinel.ch_len = 1;
182ce2d2 1910 sentinel.nchars = 1;
9c82e145 1911 }
6dcfd253 1912 bidi_cache_iterator_state (&sentinel, 1);
9c82e145 1913 }
b7b65b15
EZ
1914
1915 old_level = bidi_it->resolved_level;
1916 new_level = bidi_level_of_next_char (bidi_it);
b7b65b15
EZ
1917
1918 /* Reordering of resolved levels (clause L2) is implemented by
1919 jumping to the other edge of the level and flipping direction of
c0546589 1920 scanning the text whenever we find a level change. */
b7b65b15
EZ
1921 if (new_level != old_level)
1922 {
1923 int ascending = new_level > old_level;
1924 int level_to_search = ascending ? old_level + 1 : old_level;
1925 int incr = ascending ? 1 : -1;
1926 int expected_next_level = old_level + incr;
1927
b7b65b15
EZ
1928 /* Jump (or walk) to the other edge of this level. */
1929 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1930 /* Switch scan direction and peek at the next character in the
1931 new direction. */
1932 bidi_it->scan_dir = -bidi_it->scan_dir;
1933
1934 /* The following loop handles the case where the resolved level
1935 jumps by more than one. This is typical for numbers inside a
1936 run of text with left-to-right embedding direction, but can
1937 also happen in other situations. In those cases the decision
1938 where to continue after a level change, and in what direction,
1939 is tricky. For example, given a text like below:
1940
1941 abcdefgh
1942 11336622
1943
1944 (where the numbers below the text show the resolved levels),
1945 the result of reordering according to UAX#9 should be this:
1946
1947 efdcghba
1948
1949 This is implemented by the loop below which flips direction
1950 and jumps to the other edge of the level each time it finds
1951 the new level not to be the expected one. The expected level
1952 is always one more or one less than the previous one. */
1953 next_level = bidi_peek_at_next_level (bidi_it);
1954 while (next_level != expected_next_level)
1955 {
1956 expected_next_level += incr;
1957 level_to_search += incr;
1958 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1959 bidi_it->scan_dir = -bidi_it->scan_dir;
1960 next_level = bidi_peek_at_next_level (bidi_it);
1961 }
1962
1963 /* Finally, deliver the next character in the new direction. */
1964 next_level = bidi_level_of_next_char (bidi_it);
1965 }
1966
b44d9321
EZ
1967 /* Take note when we have just processed the newline that precedes
1968 the end of the paragraph. The next time we are about to be
1969 called, set_iterator_to_next will automatically reinit the
1970 paragraph direction, if needed. We do this at the newline before
1971 the paragraph separator, because the next character might not be
1972 the first character of the next paragraph, due to the bidi
c0546589
EZ
1973 reordering, whereas we _must_ know the paragraph base direction
1974 _before_ we process the paragraph's text, since the base
1975 direction affects the reordering. */
87e67904 1976 if (bidi_it->scan_dir == 1 && bidi_it->orig_type == NEUTRAL_B)
be39f003 1977 {
87e67904
EZ
1978 /* The paragraph direction of the entire string, once
1979 determined, is in effect for the entire string. Setting the
1980 separator limit to the end of the string prevents
1981 bidi_paragraph_init from being called automatically on this
1982 string. */
1983 if (bidi_it->string.s)
1984 bidi_it->separator_limit = bidi_it->string.schars;
1985 else if (bidi_it->bytepos < ZV_BYTE)
be39f003 1986 {
87e67904
EZ
1987 EMACS_INT sep_len =
1988 bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
1989 bidi_it->bytepos + bidi_it->ch_len);
1990 if (bidi_it->nchars <= 0)
1991 abort ();
1992 if (sep_len >= 0)
1993 {
1994 bidi_it->new_paragraph = 1;
1995 /* Record the buffer position of the last character of the
1996 paragraph separator. */
1997 bidi_it->separator_limit =
1998 bidi_it->charpos + bidi_it->nchars + sep_len;
1999 }
be39f003
EZ
2000 }
2001 }
6bff6497 2002
87e67904 2003 if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
b7b65b15
EZ
2004 {
2005 /* If we are at paragraph's base embedding level and beyond the
2006 last cached position, the cache's job is done and we can
2007 discard it. */
2008 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
182ce2d2
EZ
2009 && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
2010 + bidi_cache[bidi_cache_idx - 1].nchars - 1))
b7b65b15
EZ
2011 bidi_cache_reset ();
2012 /* But as long as we are caching during forward scan, we must
2013 cache each state, or else the cache integrity will be
2014 compromised: it assumes cached states correspond to buffer
2015 positions 1:1. */
2016 else
2017 bidi_cache_iterator_state (bidi_it, 1);
2018 }
2019}
2020
2021/* This is meant to be called from within the debugger, whenever you
2022 wish to examine the cache contents. */
e78aecca 2023void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
b7b65b15
EZ
2024void
2025bidi_dump_cached_states (void)
2026{
2027 int i;
2028 int ndigits = 1;
2029
2030 if (bidi_cache_idx == 0)
2031 {
2032 fprintf (stderr, "The cache is empty.\n");
2033 return;
2034 }
2035 fprintf (stderr, "Total of %d state%s in cache:\n",
2036 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2037
2038 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2039 ndigits++;
2040 fputs ("ch ", stderr);
2041 for (i = 0; i < bidi_cache_idx; i++)
2042 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2043 fputs ("\n", stderr);
2044 fputs ("lvl ", stderr);
2045 for (i = 0; i < bidi_cache_idx; i++)
2046 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2047 fputs ("\n", stderr);
2048 fputs ("pos ", stderr);
2049 for (i = 0; i < bidi_cache_idx; i++)
c2982e87 2050 fprintf (stderr, "%*"pI"d", ndigits, bidi_cache[i].charpos);
b7b65b15
EZ
2051 fputs ("\n", stderr);
2052}