Fixed a bug with displaying strings padded with blanks.
[bpt/emacs.git] / src / bidi.c
CommitLineData
b7b65b15 1/* Low-level bidirectional buffer-scanning functions for GNU Emacs.
73b0cd50 2 Copyright (C) 2000-2001, 2004-2005, 2009-2011
b118e65f 3 Free Software Foundation, Inc.
b7b65b15
EZ
4
5This file is part of GNU Emacs.
6
a8d11bd3 7GNU Emacs is free software: you can redistribute it and/or modify
b7b65b15 8it under the terms of the GNU General Public License as published by
a8d11bd3
EZ
9the Free Software Foundation, either version 3 of the License, or
10(at your option) any later version.
b7b65b15
EZ
11
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
b7b65b15 17You should have received a copy of the GNU General Public License
a8d11bd3 18along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
b7b65b15 19
2d6e4628
EZ
20/* Written by Eli Zaretskii <eliz@gnu.org>.
21
22 A sequential implementation of the Unicode Bidirectional algorithm,
b7b65b15
EZ
23 as per UAX#9, a part of the Unicode Standard.
24
25 Unlike the reference and most other implementations, this one is
940afb59
EZ
26 designed to be called once for every character in the buffer or
27 string.
b7b65b15 28
4b292a22 29 The main entry point is bidi_move_to_visually_next. Each time it
b7b65b15
EZ
30 is called, it finds the next character in the visual order, and
31 returns its information in a special structure. The caller is then
32 expected to process this character for display or any other
4b292a22
EZ
33 purposes, and call bidi_move_to_visually_next for the next
34 character. See the comments in bidi_move_to_visually_next for more
35 details about its algorithm that finds the next visual-order
b7b65b15
EZ
36 character by resolving their levels on the fly.
37
940afb59
EZ
38 The two other entry points are bidi_paragraph_init and
39 bidi_mirror_char. The first determines the base direction of a
40 paragraph, while the second returns the mirrored version of its
41 argument character.
42
89d3374a
EZ
43 If you want to understand the code, you will have to read it
44 together with the relevant portions of UAX#9. The comments include
45 references to UAX#9 rules, for that very reason.
46
b7b65b15
EZ
47 A note about references to UAX#9 rules: if the reference says
48 something like "X9/Retaining", it means that you need to refer to
49 rule X9 and to its modifications decribed in the "Implementation
50 Notes" section of UAX#9, under "Retaining Format Codes". */
51
b7b65b15 52#include <config.h>
b7b65b15 53#include <stdio.h>
29e3d8d1
EZ
54#include <setjmp.h>
55
b7b65b15
EZ
56#include "lisp.h"
57#include "buffer.h"
58#include "character.h"
59#include "dispextern.h"
60
61static int bidi_initialized = 0;
62
cbc4fd20 63static Lisp_Object bidi_type_table, bidi_mirror_table;
b7b65b15
EZ
64
65#define LRM_CHAR 0x200E
66#define RLM_CHAR 0x200F
b7b65b15 67#define BIDI_EOB -1
b7b65b15 68
b7b65b15
EZ
69/* Data type for describing the bidirectional character categories. */
70typedef enum {
71 UNKNOWN_BC,
72 NEUTRAL,
73 WEAK,
74 STRONG
75} bidi_category_t;
76
e78aecca 77extern int bidi_ignore_explicit_marks_for_paragraph_level EXTERNALLY_VISIBLE;
b7b65b15
EZ
78int bidi_ignore_explicit_marks_for_paragraph_level = 1;
79
372b7a95 80static Lisp_Object paragraph_start_re, paragraph_separate_re;
6bff6497 81static Lisp_Object Qparagraph_start, Qparagraph_separate;
b7b65b15 82
b7b65b15 83static void
971de7fb 84bidi_initialize (void)
b7b65b15 85{
317fbf33
EZ
86
87#include "biditype.h"
cbc4fd20 88#include "bidimirror.h"
317fbf33 89
b7b65b15
EZ
90 int i;
91
92 bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L));
2d6e4628 93 staticpro (&bidi_type_table);
b7b65b15
EZ
94
95 for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++)
317fbf33 96 char_table_set_range (bidi_type_table, bidi_type[i].from, bidi_type[i].to,
b7b65b15 97 make_number (bidi_type[i].type));
6bff6497 98
cbc4fd20
EZ
99 bidi_mirror_table = Fmake_char_table (Qnil, Qnil);
100 staticpro (&bidi_mirror_table);
101
102 for (i = 0; i < sizeof bidi_mirror / sizeof bidi_mirror[0]; i++)
103 char_table_set (bidi_mirror_table, bidi_mirror[i].from,
104 make_number (bidi_mirror[i].to));
105
b4bf28b7
SM
106 Qparagraph_start = intern ("paragraph-start");
107 staticpro (&Qparagraph_start);
372b7a95
EZ
108 paragraph_start_re = Fsymbol_value (Qparagraph_start);
109 if (!STRINGP (paragraph_start_re))
110 paragraph_start_re = build_string ("\f\\|[ \t]*$");
111 staticpro (&paragraph_start_re);
b4bf28b7
SM
112 Qparagraph_separate = intern ("paragraph-separate");
113 staticpro (&Qparagraph_separate);
372b7a95
EZ
114 paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
115 if (!STRINGP (paragraph_separate_re))
116 paragraph_separate_re = build_string ("[ \t\f]*$");
117 staticpro (&paragraph_separate_re);
b7b65b15
EZ
118 bidi_initialized = 1;
119}
120
6bff6497
EZ
121/* Return the bidi type of a character CH, subject to the current
122 directional OVERRIDE. */
fd3998ff 123static INLINE bidi_type_t
6bff6497 124bidi_get_type (int ch, bidi_dir_t override)
b7b65b15 125{
6bff6497
EZ
126 bidi_type_t default_type;
127
e7402cb2
EZ
128 if (ch == BIDI_EOB)
129 return NEUTRAL_B;
e342a24d
EZ
130 if (ch < 0 || ch > MAX_CHAR)
131 abort ();
6bff6497
EZ
132
133 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
134
135 if (override == NEUTRAL_DIR)
136 return default_type;
137
138 switch (default_type)
139 {
140 /* Although UAX#9 does not tell, it doesn't make sense to
141 override NEUTRAL_B and LRM/RLM characters. */
142 case NEUTRAL_B:
143 case LRE:
144 case LRO:
145 case RLE:
146 case RLO:
147 case PDF:
148 return default_type;
149 default:
150 switch (ch)
151 {
152 case LRM_CHAR:
153 case RLM_CHAR:
154 return default_type;
155 default:
156 if (override == L2R) /* X6 */
157 return STRONG_L;
158 else if (override == R2L)
159 return STRONG_R;
160 else
161 abort (); /* can't happen: handled above */
162 }
163 }
b7b65b15
EZ
164}
165
7d3b3862 166static void
2d6e4628
EZ
167bidi_check_type (bidi_type_t type)
168{
169 if (type < UNKNOWN_BT || type > NEUTRAL_ON)
170 abort ();
171}
172
b7b65b15 173/* Given a bidi TYPE of a character, return its category. */
fd3998ff 174static INLINE bidi_category_t
b7b65b15
EZ
175bidi_get_category (bidi_type_t type)
176{
177 switch (type)
178 {
179 case UNKNOWN_BT:
180 return UNKNOWN_BC;
181 case STRONG_L:
182 case STRONG_R:
183 case STRONG_AL:
184 case LRE:
185 case LRO:
186 case RLE:
187 case RLO:
188 return STRONG;
189 case PDF: /* ??? really?? */
190 case WEAK_EN:
191 case WEAK_ES:
192 case WEAK_ET:
193 case WEAK_AN:
194 case WEAK_CS:
195 case WEAK_NSM:
196 case WEAK_BN:
197 return WEAK;
198 case NEUTRAL_B:
199 case NEUTRAL_S:
200 case NEUTRAL_WS:
201 case NEUTRAL_ON:
202 return NEUTRAL;
203 default:
204 abort ();
205 }
206}
207
a6e8d97c
EZ
208/* Return the mirrored character of C, if it has one. If C has no
209 mirrored counterpart, return C.
210 Note: The conditions in UAX#9 clause L4 regarding the surrounding
211 context must be tested by the caller. */
b7b65b15
EZ
212int
213bidi_mirror_char (int c)
214{
cbc4fd20
EZ
215 Lisp_Object val;
216
217 if (c == BIDI_EOB)
218 return c;
219 if (c < 0 || c > MAX_CHAR)
220 abort ();
b7b65b15 221
cbc4fd20
EZ
222 val = CHAR_TABLE_REF (bidi_mirror_table, c);
223 if (INTEGERP (val))
b7b65b15 224 {
cbc4fd20 225 int v = XINT (val);
b7b65b15 226
cbc4fd20
EZ
227 if (v < 0 || v > MAX_CHAR)
228 abort ();
229
230 return v;
b7b65b15 231 }
cbc4fd20 232
b7b65b15
EZ
233 return c;
234}
235
236/* Copy the bidi iterator from FROM to TO. To save cycles, this only
237 copies the part of the level stack that is actually in use. */
fd3998ff 238static INLINE void
b7b65b15
EZ
239bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
240{
241 int i;
242
e69a9370 243 /* Copy everything except the level stack and beyond. */
182ce2d2 244 memcpy (to, from, offsetof (struct bidi_it, level_stack[0]));
b7b65b15
EZ
245
246 /* Copy the active part of the level stack. */
247 to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
248 for (i = 1; i <= from->stack_idx; i++)
249 to->level_stack[i] = from->level_stack[i];
250}
251
252/* Caching the bidi iterator states. */
253
2fe72643
EZ
254#define BIDI_CACHE_CHUNK 200
255static struct bidi_it *bidi_cache;
256static size_t bidi_cache_size = 0;
0416466c 257static size_t elsz = sizeof (struct bidi_it);
87e67904
EZ
258static EMACS_INT bidi_cache_idx; /* next unused cache slot */
259static EMACS_INT bidi_cache_last_idx; /* slot of last cache hit */
260static EMACS_INT bidi_cache_start = 0; /* start of cache for this
261 "stack" level */
262
263/* Reset the cache state to the empty state. We only reset the part
264 of the cache relevant to iteration of the current object. Previous
265 objects, which are pushed on the display iterator's stack, are left
266 intact. This is called when the cached information is no more
267 useful for the current iteration, e.g. when we were reseated to a
268 new position on the same object. */
fd3998ff 269static INLINE void
b7b65b15
EZ
270bidi_cache_reset (void)
271{
87e67904 272 bidi_cache_idx = bidi_cache_start;
b7b65b15
EZ
273 bidi_cache_last_idx = -1;
274}
275
87e67904
EZ
276/* Shrink the cache to its minimal size. Called when we init the bidi
277 iterator for reordering a buffer or a string that does not come
278 from display properties, because that means all the previously
279 cached info is of no further use. */
2fe72643
EZ
280static INLINE void
281bidi_cache_shrink (void)
282{
283 if (bidi_cache_size > BIDI_CACHE_CHUNK)
284 {
0416466c
EZ
285 bidi_cache_size = BIDI_CACHE_CHUNK;
286 bidi_cache =
287 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
2fe72643
EZ
288 }
289 bidi_cache_reset ();
290}
291
fd3998ff 292static INLINE void
b7b65b15
EZ
293bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it)
294{
295 int current_scan_dir = bidi_it->scan_dir;
296
87e67904 297 if (idx < bidi_cache_start || idx >= bidi_cache_idx)
b7b65b15
EZ
298 abort ();
299
300 bidi_copy_it (bidi_it, &bidi_cache[idx]);
301 bidi_it->scan_dir = current_scan_dir;
302 bidi_cache_last_idx = idx;
303}
304
305/* Find a cached state with a given CHARPOS and resolved embedding
306 level less or equal to LEVEL. if LEVEL is -1, disregard the
307 resolved levels in cached states. DIR, if non-zero, means search
308 in that direction from the last cache hit. */
fd3998ff 309static INLINE int
61bfec98 310bidi_cache_search (EMACS_INT charpos, int level, int dir)
b7b65b15
EZ
311{
312 int i, i_start;
313
314 if (bidi_cache_idx)
315 {
316 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
182ce2d2
EZ
317 {
318 dir = -1;
319 i_start = bidi_cache_last_idx - 1;
320 }
321 else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
322 + bidi_cache[bidi_cache_last_idx].nchars - 1))
323 {
324 dir = 1;
325 i_start = bidi_cache_last_idx + 1;
326 }
327 else if (dir)
b7b65b15
EZ
328 i_start = bidi_cache_last_idx;
329 else
330 {
331 dir = -1;
332 i_start = bidi_cache_idx - 1;
333 }
334
335 if (dir < 0)
336 {
337 /* Linear search for now; FIXME! */
87e67904 338 for (i = i_start; i >= bidi_cache_start; i--)
182ce2d2
EZ
339 if (bidi_cache[i].charpos <= charpos
340 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
341 && (level == -1 || bidi_cache[i].resolved_level <= level))
342 return i;
343 }
344 else
345 {
346 for (i = i_start; i < bidi_cache_idx; i++)
182ce2d2
EZ
347 if (bidi_cache[i].charpos <= charpos
348 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
349 && (level == -1 || bidi_cache[i].resolved_level <= level))
350 return i;
351 }
352 }
353
354 return -1;
355}
356
357/* Find a cached state where the resolved level changes to a value
358 that is lower than LEVEL, and return its cache slot index. DIR is
359 the direction to search, starting with the last used cache slot.
87e67904
EZ
360 If DIR is zero, we search backwards from the last occupied cache
361 slot. BEFORE, if non-zero, means return the index of the slot that
362 is ``before'' the level change in the search direction. That is,
b7b65b15
EZ
363 given the cached levels like this:
364
365 1122333442211
366 AB C
367
368 and assuming we are at the position cached at the slot marked with
369 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
370 index of slot B or A, depending whether BEFORE is, respectively,
371 non-zero or zero. */
372static int
373bidi_cache_find_level_change (int level, int dir, int before)
374{
375 if (bidi_cache_idx)
376 {
377 int i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
378 int incr = before ? 1 : 0;
379
380 if (!dir)
381 dir = -1;
382 else if (!incr)
383 i += dir;
384
385 if (dir < 0)
386 {
87e67904 387 while (i >= bidi_cache_start + incr)
b7b65b15
EZ
388 {
389 if (bidi_cache[i - incr].resolved_level >= 0
390 && bidi_cache[i - incr].resolved_level < level)
391 return i;
392 i--;
393 }
394 }
395 else
396 {
397 while (i < bidi_cache_idx - incr)
398 {
399 if (bidi_cache[i + incr].resolved_level >= 0
400 && bidi_cache[i + incr].resolved_level < level)
401 return i;
402 i++;
403 }
404 }
405 }
406
407 return -1;
408}
409
fd3998ff 410static INLINE void
b7b65b15
EZ
411bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
412{
413 int idx;
414
415 /* We should never cache on backward scans. */
416 if (bidi_it->scan_dir == -1)
417 abort ();
418 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
419
420 if (idx < 0)
421 {
422 idx = bidi_cache_idx;
2fe72643
EZ
423 /* Enlarge the cache as needed. */
424 if (idx >= bidi_cache_size)
425 {
0416466c 426 bidi_cache_size += BIDI_CACHE_CHUNK;
2fe72643 427 bidi_cache =
0416466c 428 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
2fe72643 429 }
bd924a5d
EZ
430 /* Character positions should correspond to cache positions 1:1.
431 If we are outside the range of cached positions, the cache is
432 useless and must be reset. */
87e67904 433 if (idx > bidi_cache_start &&
182ce2d2
EZ
434 (bidi_it->charpos > (bidi_cache[idx - 1].charpos
435 + bidi_cache[idx - 1].nchars)
bd924a5d
EZ
436 || bidi_it->charpos < bidi_cache[0].charpos))
437 {
438 bidi_cache_reset ();
87e67904 439 idx = bidi_cache_start;
bd924a5d 440 }
102ebb00
EZ
441 if (bidi_it->nchars <= 0)
442 abort ();
b7b65b15
EZ
443 bidi_copy_it (&bidi_cache[idx], bidi_it);
444 if (!resolved)
445 bidi_cache[idx].resolved_level = -1;
446 }
447 else
448 {
449 /* Copy only the members which could have changed, to avoid
450 costly copying of the entire struct. */
451 bidi_cache[idx].type = bidi_it->type;
2d6e4628 452 bidi_check_type (bidi_it->type);
89d3374a
EZ
453 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
454 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
455 if (resolved)
456 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
457 else
458 bidi_cache[idx].resolved_level = -1;
459 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
460 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
461 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
462 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
463 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
464 }
465
466 bidi_cache_last_idx = idx;
467 if (idx >= bidi_cache_idx)
468 bidi_cache_idx = idx + 1;
469}
470
fd3998ff 471static INLINE bidi_type_t
61bfec98 472bidi_cache_find (EMACS_INT charpos, int level, struct bidi_it *bidi_it)
b7b65b15
EZ
473{
474 int i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
475
87e67904 476 if (i >= bidi_cache_start)
b7b65b15
EZ
477 {
478 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
479
5009f85e 480 bidi_copy_it (bidi_it, &bidi_cache[i]);
b7b65b15
EZ
481 bidi_cache_last_idx = i;
482 /* Don't let scan direction from from the cached state override
483 the current scan direction. */
484 bidi_it->scan_dir = current_scan_dir;
485 return bidi_it->type;
486 }
487
488 return UNKNOWN_BT;
489}
490
fd3998ff 491static INLINE int
b7b65b15
EZ
492bidi_peek_at_next_level (struct bidi_it *bidi_it)
493{
87e67904 494 if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
b7b65b15
EZ
495 abort ();
496 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
497}
498
be39f003
EZ
499/* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
500 Value is the non-negative length of the paragraph separator
501 following the buffer position, -1 if position is at the beginning
502 of a new paragraph, or -2 if position is neither at beginning nor
503 at end of a paragraph. */
fd3998ff 504static EMACS_INT
6bff6497 505bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
b7b65b15 506{
372b7a95
EZ
507 Lisp_Object sep_re;
508 Lisp_Object start_re;
be39f003
EZ
509 EMACS_INT val;
510
372b7a95
EZ
511 sep_re = paragraph_separate_re;
512 start_re = paragraph_start_re;
be39f003
EZ
513
514 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
515 if (val < 0)
516 {
517 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
518 val = -1;
519 else
520 val = -2;
521 }
b7b65b15 522
be39f003 523 return val;
b7b65b15
EZ
524}
525
526/* Determine the start-of-run (sor) directional type given the two
527 embedding levels on either side of the run boundary. Also, update
528 the saved info about previously seen characters, since that info is
529 generally valid for a single level run. */
fd3998ff 530static INLINE void
b7b65b15
EZ
531bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
532{
533 int higher_level = level_before > level_after ? level_before : level_after;
534
535 /* The prev_was_pdf gork is required for when we have several PDFs
536 in a row. In that case, we want to compute the sor type for the
537 next level run only once: when we see the first PDF. That's
538 because the sor type depends only on the higher of the two levels
539 that we find on the two sides of the level boundary (see UAX#9,
540 clause X10), and so we don't need to know the final embedding
541 level to which we descend after processing all the PDFs. */
e342a24d 542 if (!bidi_it->prev_was_pdf || level_before < level_after)
b7b65b15
EZ
543 /* FIXME: should the default sor direction be user selectable? */
544 bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
545 if (level_before > level_after)
546 bidi_it->prev_was_pdf = 1;
547
548 bidi_it->prev.type = UNKNOWN_BT;
89d3374a
EZ
549 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
550 bidi_it->last_strong.orig_type = UNKNOWN_BT;
b7b65b15
EZ
551 bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
552 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
553 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
89d3374a
EZ
554 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
555 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
87e67904 556 bidi_it->ignore_bn_limit = -1; /* meaning it's unknown */
b7b65b15
EZ
557}
558
182ce2d2 559/* Perform initializations for reordering a new line of bidi text. */
6bff6497 560static void
be39f003
EZ
561bidi_line_init (struct bidi_it *bidi_it)
562{
563 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
564 bidi_it->resolved_level = bidi_it->level_stack[0].level;
565 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
566 bidi_it->invalid_levels = 0;
567 bidi_it->invalid_rl_levels = -1;
568 bidi_it->next_en_pos = -1;
569 bidi_it->next_for_ws.type = UNKNOWN_BT;
b44d9321
EZ
570 bidi_set_sor_type (bidi_it,
571 bidi_it->paragraph_dir == R2L ? 1 : 0,
be39f003
EZ
572 bidi_it->level_stack[0].level); /* X10 */
573
574 bidi_cache_reset ();
575}
576
87e67904
EZ
577/* Count bytes in multibyte string S between BEG/BEGBYTE and END. BEG
578 and END are zero-based character positions in S, BEGBYTE is byte
579 position corresponding to BEG. */
580static inline EMACS_INT
581bidi_count_bytes (const unsigned char *s, const EMACS_INT beg,
582 const EMACS_INT begbyte, const EMACS_INT end)
583{
584 EMACS_INT pos = beg;
585 const unsigned char *p = s + begbyte, *start = p;
586
587 if (!CHAR_HEAD_P (*p))
588 abort ();
589
590 while (pos < end)
591 {
592 p += BYTES_BY_CHAR_HEAD (*p);
593 pos++;
594 }
595
596 return p - start;
597}
598
599/* Fetch and returns the character at byte position BYTEPOS. If S is
600 non-NULL, fetch the character from string S; otherwise fetch the
601 character from the current buffer. */
602static inline int
603bidi_char_at_pos (EMACS_INT bytepos, const unsigned char *s)
604{
605 if (s)
606 return STRING_CHAR (s + bytepos);
607 else
608 return FETCH_MULTIBYTE_CHAR (bytepos);
609}
610
102ebb00
EZ
611/* Fetch and return the character at BYTEPOS/CHARPOS. If that
612 character is covered by a display string, treat the entire run of
613 covered characters as a single character u+FFFC, and return their
614 combined length in CH_LEN and NCHARS. DISP_POS specifies the
615 character position of the next display string, or -1 if not yet
616 computed. When the next character is at or beyond that position,
617 the function updates DISP_POS with the position of the next display
87e67904
EZ
618 string. STRING->s is the string to iterate, or NULL if iterating over
619 a buffer. */
620static inline int
102ebb00 621bidi_fetch_char (EMACS_INT bytepos, EMACS_INT charpos, EMACS_INT *disp_pos,
87e67904 622 struct bidi_string_data *string,
fc6f18ce 623 int frame_window_p, EMACS_INT *ch_len, EMACS_INT *nchars)
182ce2d2
EZ
624{
625 int ch;
87e67904 626 EMACS_INT endpos = string->s ? string->schars : ZV;
6db161be 627 struct text_pos pos;
182ce2d2 628
182ce2d2 629 /* If we got past the last known position of display string, compute
87e67904
EZ
630 the position of the next one. That position could be at CHARPOS. */
631 if (charpos < endpos && charpos > *disp_pos)
6db161be
EZ
632 {
633 SET_TEXT_POS (pos, charpos, bytepos);
634 *disp_pos = compute_display_string_pos (&pos, string, frame_window_p);
635 }
102ebb00
EZ
636
637 /* Fetch the character at BYTEPOS. */
87e67904 638 if (charpos >= endpos)
182ce2d2
EZ
639 {
640 ch = BIDI_EOB;
641 *ch_len = 1;
642 *nchars = 1;
87e67904 643 *disp_pos = endpos;
182ce2d2 644 }
102ebb00 645 else if (charpos >= *disp_pos)
182ce2d2 646 {
7b600102
EZ
647 EMACS_INT disp_end_pos;
648
649 /* We don't expect to find ourselves in the middle of a display
650 property. Hopefully, it will never be needed. */
651 if (charpos > *disp_pos)
652 abort ();
653 /* Return the Unicode Object Replacement Character to represent
87e67904 654 the entire run of characters covered by the display string. */
7b600102 655 ch = 0xFFFC;
87e67904 656 disp_end_pos = compute_display_string_end (*disp_pos, string);
7b600102 657 *nchars = disp_end_pos - *disp_pos;
87e67904
EZ
658 if (string->s)
659 *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
660 disp_end_pos);
661 else
662 *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
182ce2d2 663 }
182ce2d2
EZ
664 else
665 {
87e67904
EZ
666 if (string->s)
667 {
668 EMACS_INT len;
669
670 ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
671 *ch_len = len;
672 }
673 else
674 {
675 ch = FETCH_MULTIBYTE_CHAR (bytepos);
676 *ch_len = CHAR_BYTES (ch);
677 }
182ce2d2
EZ
678 *nchars = 1;
679 }
680
681 /* If we just entered a run of characters covered by a display
682 string, compute the position of the next display string. */
87e67904 683 if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos)
6db161be
EZ
684 {
685 SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
686 *disp_pos = compute_display_string_pos (&pos, string, frame_window_p);
687 }
182ce2d2
EZ
688
689 return ch;
690}
691
be39f003
EZ
692/* Find the beginning of this paragraph by looking back in the buffer.
693 Value is the byte position of the paragraph's beginning. */
694static EMACS_INT
b44d9321 695bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
6bff6497 696{
372b7a95 697 Lisp_Object re = paragraph_start_re;
6bff6497
EZ
698 EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
699
6bff6497
EZ
700 while (pos_byte > BEGV_BYTE
701 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
702 {
182ce2d2
EZ
703 /* FIXME: What if the paragraph beginning is covered by a
704 display string? And what if a display string covering some
705 of the text over which we scan back includes
706 paragraph_start_re? */
be39f003
EZ
707 pos = find_next_newline_no_quit (pos - 1, -1);
708 pos_byte = CHAR_TO_BYTE (pos);
6bff6497 709 }
be39f003 710 return pos_byte;
6bff6497
EZ
711}
712
bea4f10c 713/* Determine the base direction, a.k.a. base embedding level, of the
b44d9321
EZ
714 paragraph we are about to iterate through. If DIR is either L2R or
715 R2L, just use that. Otherwise, determine the paragraph direction
bea4f10c
EZ
716 from the first strong directional character of the paragraph.
717
182ce2d2 718 NO_DEFAULT_P non-zero means don't default to L2R if the paragraph
bea4f10c
EZ
719 has no strong directional characters and both DIR and
720 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
721 in the buffer until a paragraph is found with a strong character,
722 or until hitting BEGV. In the latter case, fall back to L2R. This
723 flag is used in current-bidi-paragraph-direction.
724
725 Note that this function gives the paragraph separator the same
726 direction as the preceding paragraph, even though Emacs generally
727 views the separartor as not belonging to any paragraph. */
b7b65b15 728void
bea4f10c 729bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p)
b7b65b15 730{
6bff6497 731 EMACS_INT bytepos = bidi_it->bytepos;
87e67904 732 int string_p = bidi_it->string.s != NULL;
bea4f10c 733 EMACS_INT pstartbyte;
87e67904
EZ
734 /* Note that begbyte is a byte position, while end is a character
735 position. Yes, this is ugly, but we are trying to avoid costly
736 calls to BYTE_TO_CHAR and its ilk. */
737 EMACS_INT begbyte = string_p ? 0 : BEGV_BYTE;
738 EMACS_INT end = string_p ? bidi_it->string.schars : ZV;
e7402cb2 739
5e65aec0 740 /* Special case for an empty buffer. */
87e67904 741 if (bytepos == begbyte && bidi_it->charpos == end)
5e65aec0 742 dir = L2R;
9c82e145 743 /* We should never be called at EOB or before BEGV. */
87e67904 744 else if (bidi_it->charpos >= end || bytepos < begbyte)
9c82e145
EZ
745 abort ();
746
be39f003
EZ
747 if (dir == L2R)
748 {
749 bidi_it->paragraph_dir = L2R;
750 bidi_it->new_paragraph = 0;
751 }
752 else if (dir == R2L)
753 {
754 bidi_it->paragraph_dir = R2L;
755 bidi_it->new_paragraph = 0;
756 }
b7b65b15
EZ
757 else if (dir == NEUTRAL_DIR) /* P2 */
758 {
182ce2d2
EZ
759 int ch;
760 EMACS_INT ch_len, nchars;
761 EMACS_INT pos, disp_pos = -1;
6bff6497 762 bidi_type_t type;
be39f003 763
d20e1419
EZ
764 if (!bidi_initialized)
765 bidi_initialize ();
766
be39f003
EZ
767 /* If we are inside a paragraph separator, we are just waiting
768 for the separator to be exhausted; use the previous paragraph
e5a2fec7
EZ
769 direction. But don't do that if we have been just reseated,
770 because we need to reinitialize below in that case. */
771 if (!bidi_it->first_elt
772 && bidi_it->charpos < bidi_it->separator_limit)
be39f003
EZ
773 return;
774
b44d9321 775 /* If we are on a newline, get past it to where the next
5e65aec0
EZ
776 paragraph might start. But don't do that at BEGV since then
777 we are potentially in a new paragraph that doesn't yet
778 exist. */
c143c213 779 pos = bidi_it->charpos;
87e67904
EZ
780 if (bytepos > begbyte
781 && bidi_char_at_pos (bytepos, bidi_it->string.s) == '\n')
be39f003 782 {
b44d9321 783 bytepos++;
c143c213 784 pos++;
be39f003 785 }
b44d9321
EZ
786
787 /* We are either at the beginning of a paragraph or in the
788 middle of it. Find where this paragraph starts. */
87e67904
EZ
789 if (string_p)
790 {
791 /* We don't support changes of paragraph direction inside a
792 string. It is treated as a single paragraph. */
793 pstartbyte = 0;
794 }
795 else
796 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
be39f003
EZ
797 bidi_it->separator_limit = -1;
798 bidi_it->new_paragraph = 0;
bea4f10c
EZ
799
800 /* The following loop is run more than once only if NO_DEFAULT_P
87e67904 801 is non-zero, and only if we are iterating on a buffer. */
bea4f10c
EZ
802 do {
803 bytepos = pstartbyte;
87e67904
EZ
804 if (!string_p)
805 pos = BYTE_TO_CHAR (bytepos);
806 ch = bidi_fetch_char (bytepos, pos, &disp_pos, &bidi_it->string,
807 bidi_it->frame_window_p, &ch_len, &nchars);
bea4f10c
EZ
808 type = bidi_get_type (ch, NEUTRAL_DIR);
809
182ce2d2 810 for (pos += nchars, bytepos += ch_len;
bea4f10c
EZ
811 /* NOTE: UAX#9 says to search only for L, AL, or R types
812 of characters, and ignore RLE, RLO, LRE, and LRO.
813 However, I'm not sure it makes sense to omit those 4;
814 should try with and without that to see the effect. */
815 (bidi_get_category (type) != STRONG)
816 || (bidi_ignore_explicit_marks_for_paragraph_level
817 && (type == RLE || type == RLO
818 || type == LRE || type == LRO));
819 type = bidi_get_type (ch, NEUTRAL_DIR))
820 {
87e67904
EZ
821 if (!string_p
822 && type == NEUTRAL_B
823 && bidi_at_paragraph_end (pos, bytepos) >= -1)
21fce5ab 824 break;
87e67904 825 if (pos >= end)
bea4f10c
EZ
826 {
827 /* Pretend there's a paragraph separator at end of
87e67904 828 buffer/string. */
bea4f10c
EZ
829 type = NEUTRAL_B;
830 break;
831 }
102ebb00 832 /* Fetch next character and advance to get past it. */
87e67904 833 ch = bidi_fetch_char (bytepos, pos, &disp_pos, &bidi_it->string,
fc6f18ce 834 bidi_it->frame_window_p, &ch_len, &nchars);
182ce2d2
EZ
835 pos += nchars;
836 bytepos += ch_len;
bea4f10c
EZ
837 }
838 if (type == STRONG_R || type == STRONG_AL) /* P3 */
839 bidi_it->paragraph_dir = R2L;
840 else if (type == STRONG_L)
841 bidi_it->paragraph_dir = L2R;
87e67904
EZ
842 if (!string_p
843 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
bea4f10c
EZ
844 {
845 /* If this paragraph is at BEGV, default to L2R. */
846 if (pstartbyte == BEGV_BYTE)
847 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
848 else
849 {
850 EMACS_INT prevpbyte = pstartbyte;
851 EMACS_INT p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
852
853 /* Find the beginning of the previous paragraph, if any. */
854 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
855 {
182ce2d2
EZ
856 /* FXIME: What if p is covered by a display
857 string? See also a FIXME inside
858 bidi_find_paragraph_start. */
bea4f10c
EZ
859 p--;
860 pbyte = CHAR_TO_BYTE (p);
861 prevpbyte = bidi_find_paragraph_start (p, pbyte);
862 }
863 pstartbyte = prevpbyte;
864 }
865 }
87e67904
EZ
866 } while (!string_p
867 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
b7b65b15 868 }
be39f003
EZ
869 else
870 abort ();
b7b65b15 871
b44d9321
EZ
872 /* Contrary to UAX#9 clause P3, we only default the paragraph
873 direction to L2R if we have no previous usable paragraph
bea4f10c 874 direction. This is allowed by the HL1 clause. */
d20e1419 875 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
bea4f10c 876 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
be39f003 877 if (bidi_it->paragraph_dir == R2L)
b44d9321 878 bidi_it->level_stack[0].level = 1;
be39f003 879 else
b44d9321 880 bidi_it->level_stack[0].level = 0;
be39f003
EZ
881
882 bidi_line_init (bidi_it);
b7b65b15
EZ
883}
884
6bff6497
EZ
885/* Do whatever UAX#9 clause X8 says should be done at paragraph's
886 end. */
fd3998ff 887static INLINE void
b7b65b15
EZ
888bidi_set_paragraph_end (struct bidi_it *bidi_it)
889{
890 bidi_it->invalid_levels = 0;
891 bidi_it->invalid_rl_levels = -1;
892 bidi_it->stack_idx = 0;
893 bidi_it->resolved_level = bidi_it->level_stack[0].level;
b7b65b15
EZ
894}
895
182ce2d2 896/* Initialize the bidi iterator from buffer/string position CHARPOS. */
b7b65b15 897void
fc6f18ce
EZ
898bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, int frame_window_p,
899 struct bidi_it *bidi_it)
b7b65b15
EZ
900{
901 if (! bidi_initialized)
902 bidi_initialize ();
87e67904
EZ
903 if (charpos >= 0)
904 bidi_it->charpos = charpos;
905 if (bytepos >= 0)
906 bidi_it->bytepos = bytepos;
fc6f18ce 907 bidi_it->frame_window_p = frame_window_p;
182ce2d2 908 bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */
6bff6497
EZ
909 bidi_it->first_elt = 1;
910 bidi_set_paragraph_end (bidi_it);
911 bidi_it->new_paragraph = 1;
be39f003 912 bidi_it->separator_limit = -1;
b7b65b15 913 bidi_it->type = NEUTRAL_B;
ebb5722e
EZ
914 bidi_it->type_after_w1 = NEUTRAL_B;
915 bidi_it->orig_type = NEUTRAL_B;
b7b65b15 916 bidi_it->prev_was_pdf = 0;
ebb5722e
EZ
917 bidi_it->prev.type = bidi_it->prev.type_after_w1 =
918 bidi_it->prev.orig_type = UNKNOWN_BT;
89d3374a
EZ
919 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
920 bidi_it->last_strong.orig_type = UNKNOWN_BT;
b7b65b15
EZ
921 bidi_it->next_for_neutral.charpos = -1;
922 bidi_it->next_for_neutral.type =
89d3374a
EZ
923 bidi_it->next_for_neutral.type_after_w1 =
924 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15
EZ
925 bidi_it->prev_for_neutral.charpos = -1;
926 bidi_it->prev_for_neutral.type =
89d3374a
EZ
927 bidi_it->prev_for_neutral.type_after_w1 =
928 bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15 929 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
182ce2d2 930 bidi_it->disp_pos = -1; /* invalid/unknown */
87e67904
EZ
931 /* We can only shrink the cache if we are at the bottom level of its
932 "stack". */
933 if (bidi_cache_start == 0)
934 bidi_cache_shrink ();
b7b65b15
EZ
935}
936
937/* Push the current embedding level and override status; reset the
938 current level to LEVEL and the current override status to OVERRIDE. */
fd3998ff 939static INLINE void
b7b65b15
EZ
940bidi_push_embedding_level (struct bidi_it *bidi_it,
941 int level, bidi_dir_t override)
942{
943 bidi_it->stack_idx++;
944 if (bidi_it->stack_idx >= BIDI_MAXLEVEL)
945 abort ();
946 bidi_it->level_stack[bidi_it->stack_idx].level = level;
947 bidi_it->level_stack[bidi_it->stack_idx].override = override;
948}
949
950/* Pop the embedding level and directional override status from the
951 stack, and return the new level. */
fd3998ff 952static INLINE int
b7b65b15
EZ
953bidi_pop_embedding_level (struct bidi_it *bidi_it)
954{
955 /* UAX#9 says to ignore invalid PDFs. */
956 if (bidi_it->stack_idx > 0)
957 bidi_it->stack_idx--;
958 return bidi_it->level_stack[bidi_it->stack_idx].level;
959}
960
961/* Record in SAVED_INFO the information about the current character. */
fd3998ff 962static INLINE void
b7b65b15
EZ
963bidi_remember_char (struct bidi_saved_info *saved_info,
964 struct bidi_it *bidi_it)
965{
966 saved_info->charpos = bidi_it->charpos;
967 saved_info->bytepos = bidi_it->bytepos;
968 saved_info->type = bidi_it->type;
2d6e4628 969 bidi_check_type (bidi_it->type);
89d3374a
EZ
970 saved_info->type_after_w1 = bidi_it->type_after_w1;
971 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 972 saved_info->orig_type = bidi_it->orig_type;
2d6e4628 973 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
974}
975
976/* Resolve the type of a neutral character according to the type of
977 surrounding strong text and the current embedding level. */
fd3998ff 978static INLINE bidi_type_t
b7b65b15
EZ
979bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
980{
981 /* N1: European and Arabic numbers are treated as though they were R. */
982 if (next_type == WEAK_EN || next_type == WEAK_AN)
983 next_type = STRONG_R;
984 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
985 prev_type = STRONG_R;
986
987 if (next_type == prev_type) /* N1 */
988 return next_type;
989 else if ((lev & 1) == 0) /* N2 */
990 return STRONG_L;
991 else
992 return STRONG_R;
993}
994
fd3998ff 995static INLINE int
182ce2d2 996bidi_explicit_dir_char (int ch)
b7b65b15 997{
182ce2d2
EZ
998 bidi_type_t ch_type;
999
1000 if (!bidi_initialized)
1001 abort ();
1002 ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1003 return (ch_type == LRE || ch_type == LRO
1004 || ch_type == RLE || ch_type == RLO
1005 || ch_type == PDF);
b7b65b15
EZ
1006}
1007
1008/* A helper function for bidi_resolve_explicit. It advances to the
1009 next character in logical order and determines the new embedding
1010 level and directional override, but does not take into account
1011 empty embeddings. */
1012static int
1013bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1014{
1015 int curchar;
1016 bidi_type_t type;
1017 int current_level;
1018 int new_level;
1019 bidi_dir_t override;
87e67904 1020 int string_p = bidi_it->string.s != NULL;
b7b65b15 1021
182ce2d2
EZ
1022 /* If reseat()'ed, don't advance, so as to start iteration from the
1023 position where we were reseated. bidi_it->bytepos can be less
1024 than BEGV_BYTE after reseat to BEGV. */
87e67904 1025 if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
9c82e145 1026 || bidi_it->first_elt)
e7402cb2 1027 {
9c82e145 1028 bidi_it->first_elt = 0;
87e67904
EZ
1029 if (string_p)
1030 {
1031 if (bidi_it->charpos < 0)
1032 bidi_it->charpos = 0;
1033 bidi_it->bytepos = bidi_count_bytes (bidi_it->string.s, 0, 0,
1034 bidi_it->charpos);
1035 }
1036 else
1037 {
1038 if (bidi_it->charpos < BEGV)
1039 bidi_it->charpos = BEGV;
1040 bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
1041 }
e7402cb2 1042 }
87e67904
EZ
1043 /* Don't move at end of buffer/string. */
1044 else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
b7b65b15 1045 {
182ce2d2
EZ
1046 /* Advance to the next character, skipping characters covered by
1047 display strings (nchars > 1). */
102ebb00
EZ
1048 if (bidi_it->nchars <= 0)
1049 abort ();
182ce2d2 1050 bidi_it->charpos += bidi_it->nchars;
e7402cb2
EZ
1051 if (bidi_it->ch_len == 0)
1052 abort ();
b7b65b15
EZ
1053 bidi_it->bytepos += bidi_it->ch_len;
1054 }
1055
1056 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1057 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1058 new_level = current_level;
1059
87e67904 1060 if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
e7402cb2
EZ
1061 {
1062 curchar = BIDI_EOB;
1063 bidi_it->ch_len = 1;
182ce2d2 1064 bidi_it->nchars = 1;
87e67904 1065 bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
e7402cb2
EZ
1066 }
1067 else
1068 {
182ce2d2
EZ
1069 /* Fetch the character at BYTEPOS. If it is covered by a
1070 display string, treat the entire run of covered characters as
1071 a single character u+FFFC. */
102ebb00 1072 curchar = bidi_fetch_char (bidi_it->bytepos, bidi_it->charpos,
87e67904
EZ
1073 &bidi_it->disp_pos, &bidi_it->string,
1074 bidi_it->frame_window_p,
102ebb00 1075 &bidi_it->ch_len, &bidi_it->nchars);
e7402cb2 1076 }
b7b65b15 1077 bidi_it->ch = curchar;
b7b65b15 1078
6bff6497
EZ
1079 /* Don't apply directional override here, as all the types we handle
1080 below will not be affected by the override anyway, and we need
1081 the original type unaltered. The override will be applied in
1082 bidi_resolve_weak. */
1083 type = bidi_get_type (curchar, NEUTRAL_DIR);
89d3374a
EZ
1084 bidi_it->orig_type = type;
1085 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
1086
1087 if (type != PDF)
1088 bidi_it->prev_was_pdf = 0;
1089
89d3374a 1090 bidi_it->type_after_w1 = UNKNOWN_BT;
b7b65b15
EZ
1091
1092 switch (type)
1093 {
1094 case RLE: /* X2 */
1095 case RLO: /* X4 */
89d3374a
EZ
1096 bidi_it->type_after_w1 = type;
1097 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1098 type = WEAK_BN; /* X9/Retaining */
87e67904 1099 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1100 {
1101 if (current_level <= BIDI_MAXLEVEL - 4)
1102 {
1103 /* Compute the least odd embedding level greater than
1104 the current level. */
1105 new_level = ((current_level + 1) & ~1) + 1;
89d3374a 1106 if (bidi_it->type_after_w1 == RLE)
b7b65b15
EZ
1107 override = NEUTRAL_DIR;
1108 else
1109 override = R2L;
1110 if (current_level == BIDI_MAXLEVEL - 4)
1111 bidi_it->invalid_rl_levels = 0;
1112 bidi_push_embedding_level (bidi_it, new_level, override);
1113 }
1114 else
1115 {
1116 bidi_it->invalid_levels++;
1117 /* See the commentary about invalid_rl_levels below. */
1118 if (bidi_it->invalid_rl_levels < 0)
1119 bidi_it->invalid_rl_levels = 0;
1120 bidi_it->invalid_rl_levels++;
1121 }
1122 }
89d3374a 1123 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1124 || bidi_it->next_en_pos > bidi_it->charpos)
1125 type = WEAK_EN;
1126 break;
1127 case LRE: /* X3 */
1128 case LRO: /* X5 */
89d3374a
EZ
1129 bidi_it->type_after_w1 = type;
1130 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1131 type = WEAK_BN; /* X9/Retaining */
87e67904 1132 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1133 {
1134 if (current_level <= BIDI_MAXLEVEL - 5)
1135 {
1136 /* Compute the least even embedding level greater than
1137 the current level. */
1138 new_level = ((current_level + 2) & ~1);
89d3374a 1139 if (bidi_it->type_after_w1 == LRE)
b7b65b15
EZ
1140 override = NEUTRAL_DIR;
1141 else
1142 override = L2R;
1143 bidi_push_embedding_level (bidi_it, new_level, override);
1144 }
1145 else
1146 {
1147 bidi_it->invalid_levels++;
1148 /* invalid_rl_levels counts invalid levels encountered
1149 while the embedding level was already too high for
1150 LRE/LRO, but not for RLE/RLO. That is because
1151 there may be exactly one PDF which we should not
1152 ignore even though invalid_levels is non-zero.
1153 invalid_rl_levels helps to know what PDF is
1154 that. */
1155 if (bidi_it->invalid_rl_levels >= 0)
1156 bidi_it->invalid_rl_levels++;
1157 }
1158 }
89d3374a 1159 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1160 || bidi_it->next_en_pos > bidi_it->charpos)
1161 type = WEAK_EN;
1162 break;
1163 case PDF: /* X7 */
89d3374a
EZ
1164 bidi_it->type_after_w1 = type;
1165 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1166 type = WEAK_BN; /* X9/Retaining */
87e67904 1167 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1168 {
1169 if (!bidi_it->invalid_rl_levels)
1170 {
1171 new_level = bidi_pop_embedding_level (bidi_it);
1172 bidi_it->invalid_rl_levels = -1;
1173 if (bidi_it->invalid_levels)
1174 bidi_it->invalid_levels--;
1175 /* else nothing: UAX#9 says to ignore invalid PDFs */
1176 }
1177 if (!bidi_it->invalid_levels)
1178 new_level = bidi_pop_embedding_level (bidi_it);
1179 else
1180 {
1181 bidi_it->invalid_levels--;
1182 bidi_it->invalid_rl_levels--;
1183 }
1184 }
89d3374a 1185 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1186 || bidi_it->next_en_pos > bidi_it->charpos)
1187 type = WEAK_EN;
1188 break;
1189 default:
1190 /* Nothing. */
1191 break;
1192 }
1193
1194 bidi_it->type = type;
2d6e4628 1195 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1196
1197 return new_level;
1198}
1199
1200/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1201 in the buffer/string to the next character (in the logical order),
1202 resolve any explicit embeddings and directional overrides, and
1203 return the embedding level of the character after resolving
1204 explicit directives and ignoring empty embeddings. */
b7b65b15
EZ
1205static int
1206bidi_resolve_explicit (struct bidi_it *bidi_it)
1207{
1208 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1209 int new_level = bidi_resolve_explicit_1 (bidi_it);
87e67904 1210 EMACS_INT eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
b7b65b15
EZ
1211
1212 if (prev_level < new_level
1213 && bidi_it->type == WEAK_BN
87e67904
EZ
1214 && bidi_it->ignore_bn_limit == -1 /* only if not already known */
1215 && bidi_it->charpos < eob /* not already at EOB */
1216 && bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
1217 + bidi_it->ch_len,
1218 bidi_it->string.s)))
b7b65b15
EZ
1219 {
1220 /* Avoid pushing and popping embedding levels if the level run
1221 is empty, as this breaks level runs where it shouldn't.
1222 UAX#9 removes all the explicit embedding and override codes,
1223 so empty embeddings disappear without a trace. We need to
1224 behave as if we did the same. */
1225 struct bidi_it saved_it;
1226 int level = prev_level;
1227
1228 bidi_copy_it (&saved_it, bidi_it);
1229
87e67904
EZ
1230 while (bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
1231 + bidi_it->ch_len,
1232 bidi_it->string.s)))
b7b65b15 1233 {
182ce2d2
EZ
1234 /* This advances to the next character, skipping any
1235 characters covered by display strings. */
b7b65b15
EZ
1236 level = bidi_resolve_explicit_1 (bidi_it);
1237 }
1238
102ebb00
EZ
1239 if (bidi_it->nchars <= 0)
1240 abort ();
b7b65b15 1241 if (level == prev_level) /* empty embedding */
182ce2d2 1242 saved_it.ignore_bn_limit = bidi_it->charpos + bidi_it->nchars;
b7b65b15 1243 else /* this embedding is non-empty */
87e67904 1244 saved_it.ignore_bn_limit = -2;
b7b65b15
EZ
1245
1246 bidi_copy_it (bidi_it, &saved_it);
87e67904 1247 if (bidi_it->ignore_bn_limit > -1)
b7b65b15
EZ
1248 {
1249 /* We pushed a level, but we shouldn't have. Undo that. */
1250 if (!bidi_it->invalid_rl_levels)
1251 {
1252 new_level = bidi_pop_embedding_level (bidi_it);
1253 bidi_it->invalid_rl_levels = -1;
1254 if (bidi_it->invalid_levels)
1255 bidi_it->invalid_levels--;
1256 }
1257 if (!bidi_it->invalid_levels)
1258 new_level = bidi_pop_embedding_level (bidi_it);
1259 else
1260 {
1261 bidi_it->invalid_levels--;
1262 bidi_it->invalid_rl_levels--;
1263 }
1264 }
1265 }
1266
b7b65b15
EZ
1267 if (bidi_it->type == NEUTRAL_B) /* X8 */
1268 {
21fce5ab 1269 bidi_set_paragraph_end (bidi_it);
6bff6497
EZ
1270 /* This is needed by bidi_resolve_weak below, and in L1. */
1271 bidi_it->type_after_w1 = bidi_it->type;
89d3374a 1272 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1273 }
1274
1275 return new_level;
1276}
1277
182ce2d2
EZ
1278/* Advance in the buffer/string, resolve weak types and return the
1279 type of the next character after weak type resolution. */
fd3998ff 1280static bidi_type_t
b7b65b15
EZ
1281bidi_resolve_weak (struct bidi_it *bidi_it)
1282{
1283 bidi_type_t type;
1284 bidi_dir_t override;
1285 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1286 int new_level = bidi_resolve_explicit (bidi_it);
1287 int next_char;
1288 bidi_type_t type_of_next;
1289 struct bidi_it saved_it;
87e67904 1290 EMACS_INT eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
b7b65b15
EZ
1291
1292 type = bidi_it->type;
1293 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1294
1295 if (type == UNKNOWN_BT
1296 || type == LRE
1297 || type == LRO
1298 || type == RLE
1299 || type == RLO
1300 || type == PDF)
1301 abort ();
1302
1303 if (new_level != prev_level
1304 || bidi_it->type == NEUTRAL_B)
1305 {
1306 /* We've got a new embedding level run, compute the directional
1307 type of sor and initialize per-run variables (UAX#9, clause
1308 X10). */
1309 bidi_set_sor_type (bidi_it, prev_level, new_level);
1310 }
1311 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1312 || type == WEAK_BN || type == STRONG_AL)
89d3374a
EZ
1313 bidi_it->type_after_w1 = type; /* needed in L1 */
1314 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1315
1316 /* Level and directional override status are already recorded in
1317 bidi_it, and do not need any change; see X6. */
1318 if (override == R2L) /* X6 */
1319 type = STRONG_R;
1320 else if (override == L2R)
1321 type = STRONG_L;
bc5a45f3 1322 else
b7b65b15 1323 {
bc5a45f3 1324 if (type == WEAK_NSM) /* W1 */
b7b65b15 1325 {
bc5a45f3 1326 /* Note that we don't need to consider the case where the
5930fe97
EZ
1327 prev character has its type overridden by an RLO or LRO,
1328 because then either the type of this NSM would have been
1329 also overridden, or the previous character is outside the
1330 current level run, and thus not relevant to this NSM.
1331 This is why NSM gets the type_after_w1 of the previous
1332 character. */
ebb5722e
EZ
1333 if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1334 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1335 && bidi_it->prev.type_after_w1 != NEUTRAL_B)
5930fe97 1336 type = bidi_it->prev.type_after_w1;
bc5a45f3
EZ
1337 else if (bidi_it->sor == R2L)
1338 type = STRONG_R;
1339 else if (bidi_it->sor == L2R)
1340 type = STRONG_L;
1341 else /* shouldn't happen! */
1342 abort ();
b7b65b15 1343 }
bc5a45f3
EZ
1344 if (type == WEAK_EN /* W2 */
1345 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1346 type = WEAK_AN;
1347 else if (type == STRONG_AL) /* W3 */
1348 type = STRONG_R;
1349 else if ((type == WEAK_ES /* W4 */
1350 && bidi_it->prev.type_after_w1 == WEAK_EN
1351 && bidi_it->prev.orig_type == WEAK_EN)
1352 || (type == WEAK_CS
1353 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1354 && bidi_it->prev.orig_type == WEAK_EN)
1355 || bidi_it->prev.type_after_w1 == WEAK_AN)))
b7b65b15 1356 {
e7402cb2 1357 next_char =
87e67904
EZ
1358 bidi_it->charpos + bidi_it->nchars >= eob
1359 ? BIDI_EOB
1360 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
1361 bidi_it->string.s);
6bff6497 1362 type_of_next = bidi_get_type (next_char, override);
b7b65b15 1363
bc5a45f3 1364 if (type_of_next == WEAK_BN
b7b65b15
EZ
1365 || bidi_explicit_dir_char (next_char))
1366 {
1367 bidi_copy_it (&saved_it, bidi_it);
1368 while (bidi_resolve_explicit (bidi_it) == new_level
bc5a45f3 1369 && bidi_it->type == WEAK_BN)
b7b65b15
EZ
1370 ;
1371 type_of_next = bidi_it->type;
b7b65b15
EZ
1372 bidi_copy_it (bidi_it, &saved_it);
1373 }
bc5a45f3
EZ
1374
1375 /* If the next character is EN, but the last strong-type
1376 character is AL, that next EN will be changed to AN when
1377 we process it in W2 above. So in that case, this ES
1378 should not be changed into EN. */
1379 if (type == WEAK_ES
1380 && type_of_next == WEAK_EN
1381 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1382 type = WEAK_EN;
1383 else if (type == WEAK_CS)
b7b65b15 1384 {
bc5a45f3
EZ
1385 if (bidi_it->prev.type_after_w1 == WEAK_AN
1386 && (type_of_next == WEAK_AN
1387 /* If the next character is EN, but the last
1388 strong-type character is AL, EN will be later
1389 changed to AN when we process it in W2 above.
1390 So in that case, this ES should not be
1391 changed into EN. */
1392 || (type_of_next == WEAK_EN
1393 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1394 type = WEAK_AN;
1395 else if (bidi_it->prev.type_after_w1 == WEAK_EN
1396 && type_of_next == WEAK_EN
1397 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1398 type = WEAK_EN;
1399 }
1400 }
1401 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1402 || type == WEAK_BN) /* W5/Retaining */
1403 {
1404 if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
1405 || bidi_it->next_en_pos > bidi_it->charpos)
1406 type = WEAK_EN;
1407 else /* W5: ET/BN with EN after it. */
1408 {
182ce2d2 1409 EMACS_INT en_pos = bidi_it->charpos + bidi_it->nchars;
bc5a45f3 1410
102ebb00
EZ
1411 if (bidi_it->nchars <= 0)
1412 abort ();
bc5a45f3 1413 next_char =
87e67904
EZ
1414 bidi_it->charpos + bidi_it->nchars >= eob
1415 ? BIDI_EOB
1416 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
1417 bidi_it->string.s);
bc5a45f3
EZ
1418 type_of_next = bidi_get_type (next_char, override);
1419
1420 if (type_of_next == WEAK_ET
1421 || type_of_next == WEAK_BN
1422 || bidi_explicit_dir_char (next_char))
1423 {
1424 bidi_copy_it (&saved_it, bidi_it);
1425 while (bidi_resolve_explicit (bidi_it) == new_level
1426 && (bidi_it->type == WEAK_BN
1427 || bidi_it->type == WEAK_ET))
1428 ;
1429 type_of_next = bidi_it->type;
1430 en_pos = bidi_it->charpos;
1431 bidi_copy_it (bidi_it, &saved_it);
1432 }
1433 if (type_of_next == WEAK_EN)
b7b65b15 1434 {
bc5a45f3
EZ
1435 /* If the last strong character is AL, the EN we've
1436 found will become AN when we get to it (W2). */
1437 if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
1438 {
1439 type = WEAK_EN;
1440 /* Remember this EN position, to speed up processing
1441 of the next ETs. */
1442 bidi_it->next_en_pos = en_pos;
1443 }
1444 else if (type == WEAK_BN)
1445 type = NEUTRAL_ON; /* W6/Retaining */
b7b65b15 1446 }
b7b65b15
EZ
1447 }
1448 }
1449 }
1450
1451 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
89d3374a
EZ
1452 || (type == WEAK_BN
1453 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1454 || bidi_it->prev.type_after_w1 == WEAK_ES
1455 || bidi_it->prev.type_after_w1 == WEAK_ET)))
b7b65b15
EZ
1456 type = NEUTRAL_ON;
1457
1458 /* Store the type we've got so far, before we clobber it with strong
1459 types in W7 and while resolving neutral types. But leave alone
1460 the original types that were recorded above, because we will need
1461 them for the L1 clause. */
89d3374a
EZ
1462 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1463 bidi_it->type_after_w1 = type;
1464 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1465
1466 if (type == WEAK_EN) /* W7 */
1467 {
89d3374a 1468 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
b7b65b15
EZ
1469 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1470 type = STRONG_L;
1471 }
1472
1473 bidi_it->type = type;
2d6e4628 1474 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1475 return type;
1476}
1477
fd3998ff 1478static bidi_type_t
b7b65b15
EZ
1479bidi_resolve_neutral (struct bidi_it *bidi_it)
1480{
1481 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1482 bidi_type_t type = bidi_resolve_weak (bidi_it);
1483 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1484
1485 if (!(type == STRONG_R
1486 || type == STRONG_L
1487 || type == WEAK_BN
1488 || type == WEAK_EN
1489 || type == WEAK_AN
1490 || type == NEUTRAL_B
1491 || type == NEUTRAL_S
1492 || type == NEUTRAL_WS
1493 || type == NEUTRAL_ON))
1494 abort ();
1495
1496 if (bidi_get_category (type) == NEUTRAL
1497 || (type == WEAK_BN && prev_level == current_level))
1498 {
1499 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1500 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1501 bidi_it->next_for_neutral.type,
1502 current_level);
1503 else
1504 {
1505 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1506 the assumption of batch-style processing; see clauses W4,
1507 W5, and especially N1, which require to look far forward
182ce2d2
EZ
1508 (as well as back) in the buffer/string. May the fleas of
1509 a thousand camels infest the armpits of those who design
b7b65b15
EZ
1510 supposedly general-purpose algorithms by looking at their
1511 own implementations, and fail to consider other possible
1512 implementations! */
1513 struct bidi_it saved_it;
1514 bidi_type_t next_type;
1515
1516 if (bidi_it->scan_dir == -1)
1517 abort ();
1518
1519 bidi_copy_it (&saved_it, bidi_it);
1520 /* Scan the text forward until we find the first non-neutral
1521 character, and then use that to resolve the neutral we
1522 are dealing with now. We also cache the scanned iterator
1523 states, to salvage some of the effort later. */
1524 bidi_cache_iterator_state (bidi_it, 0);
1525 do {
1526 /* Record the info about the previous character, so that
1527 it will be cached below with this state. */
89d3374a 1528 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1529 && bidi_it->type != WEAK_BN)
1530 bidi_remember_char (&bidi_it->prev, bidi_it);
1531 type = bidi_resolve_weak (bidi_it);
1532 /* Paragraph separators have their levels fully resolved
1533 at this point, so cache them as resolved. */
1534 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1535 /* FIXME: implement L1 here, by testing for a newline and
1536 resetting the level for any sequence of whitespace
1537 characters adjacent to it. */
1538 } while (!(type == NEUTRAL_B
1539 || (type != WEAK_BN
1540 && bidi_get_category (type) != NEUTRAL)
1541 /* This is all per level run, so stop when we
1542 reach the end of this level run. */
1543 || bidi_it->level_stack[bidi_it->stack_idx].level !=
1544 current_level));
1545
1546 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1547
1548 switch (type)
1549 {
1550 case STRONG_L:
1551 case STRONG_R:
1552 case STRONG_AL:
1553 next_type = type;
1554 break;
1555 case WEAK_EN:
1556 case WEAK_AN:
1557 /* N1: ``European and Arabic numbers are treated as
1558 though they were R.'' */
1559 next_type = STRONG_R;
1560 saved_it.next_for_neutral.type = STRONG_R;
1561 break;
1562 case WEAK_BN:
1563 if (!bidi_explicit_dir_char (bidi_it->ch))
1564 abort (); /* can't happen: BNs are skipped */
1565 /* FALLTHROUGH */
1566 case NEUTRAL_B:
1567 /* Marched all the way to the end of this level run.
1568 We need to use the eor type, whose information is
1569 stored by bidi_set_sor_type in the prev_for_neutral
1570 member. */
1571 if (saved_it.type != WEAK_BN
89d3374a 1572 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
b7b65b15
EZ
1573 {
1574 next_type = bidi_it->prev_for_neutral.type;
1575 saved_it.next_for_neutral.type = next_type;
2d6e4628 1576 bidi_check_type (next_type);
b7b65b15
EZ
1577 }
1578 else
1579 {
1580 /* This is a BN which does not adjoin neutrals.
1581 Leave its type alone. */
1582 bidi_copy_it (bidi_it, &saved_it);
1583 return bidi_it->type;
1584 }
1585 break;
1586 default:
1587 abort ();
1588 }
1589 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1590 next_type, current_level);
1591 saved_it.type = type;
2d6e4628 1592 bidi_check_type (type);
b7b65b15
EZ
1593 bidi_copy_it (bidi_it, &saved_it);
1594 }
1595 }
1596 return type;
1597}
1598
1599/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1600 in the buffer/string to the next character (in the logical order),
1601 resolve the bidi type of that next character, and return that
1602 type. */
fd3998ff 1603static bidi_type_t
b7b65b15
EZ
1604bidi_type_of_next_char (struct bidi_it *bidi_it)
1605{
1606 bidi_type_t type;
1607
1608 /* This should always be called during a forward scan. */
1609 if (bidi_it->scan_dir != 1)
1610 abort ();
1611
1612 /* Reset the limit until which to ignore BNs if we step out of the
1613 area where we found only empty levels. */
87e67904 1614 if ((bidi_it->ignore_bn_limit > -1
b7b65b15 1615 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
87e67904 1616 || (bidi_it->ignore_bn_limit == -2
b7b65b15 1617 && !bidi_explicit_dir_char (bidi_it->ch)))
87e67904 1618 bidi_it->ignore_bn_limit = -1;
b7b65b15
EZ
1619
1620 type = bidi_resolve_neutral (bidi_it);
1621
1622 return type;
1623}
1624
1625/* Given an iterator state BIDI_IT, advance one character position in
182ce2d2
EZ
1626 the buffer/string to the next character (in the current scan
1627 direction), resolve the embedding and implicit levels of that next
1628 character, and return the resulting level. */
fd3998ff 1629static int
b7b65b15
EZ
1630bidi_level_of_next_char (struct bidi_it *bidi_it)
1631{
1632 bidi_type_t type;
1633 int level, prev_level = -1;
1634 struct bidi_saved_info next_for_neutral;
87e67904 1635 EMACS_INT next_char_pos = -1;
b7b65b15
EZ
1636
1637 if (bidi_it->scan_dir == 1)
1638 {
1639 /* There's no sense in trying to advance if we hit end of text. */
87e67904 1640 if (bidi_it->charpos >= (bidi_it->string.s ? bidi_it->string.schars : ZV))
b7b65b15
EZ
1641 return bidi_it->resolved_level;
1642
1643 /* Record the info about the previous character. */
89d3374a 1644 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1645 && bidi_it->type != WEAK_BN)
1646 bidi_remember_char (&bidi_it->prev, bidi_it);
89d3374a
EZ
1647 if (bidi_it->type_after_w1 == STRONG_R
1648 || bidi_it->type_after_w1 == STRONG_L
1649 || bidi_it->type_after_w1 == STRONG_AL)
b7b65b15
EZ
1650 bidi_remember_char (&bidi_it->last_strong, bidi_it);
1651 /* FIXME: it sounds like we don't need both prev and
1652 prev_for_neutral members, but I'm leaving them both for now. */
1653 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1654 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1655 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1656
1657 /* If we overstepped the characters used for resolving neutrals
1658 and whitespace, invalidate their info in the iterator. */
1659 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1660 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1661 if (bidi_it->next_en_pos >= 0
1662 && bidi_it->charpos >= bidi_it->next_en_pos)
1663 bidi_it->next_en_pos = -1;
1664 if (bidi_it->next_for_ws.type != UNKNOWN_BT
1665 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1666 bidi_it->next_for_ws.type = UNKNOWN_BT;
1667
1668 /* This must be taken before we fill the iterator with the info
1669 about the next char. If we scan backwards, the iterator
1670 state must be already cached, so there's no need to know the
1671 embedding level of the previous character, since we will be
1672 returning to our caller shortly. */
1673 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1674 }
1675 next_for_neutral = bidi_it->next_for_neutral;
1676
182ce2d2
EZ
1677 /* Perhaps the character we want is already cached. If it is, the
1678 call to bidi_cache_find below will return a type other than
1679 UNKNOWN_BT. */
87e67904 1680 if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
102ebb00
EZ
1681 {
1682 if (bidi_it->scan_dir > 0)
1683 {
1684 if (bidi_it->nchars <= 0)
1685 abort ();
1686 next_char_pos = bidi_it->charpos + bidi_it->nchars;
1687 }
bb269206
EZ
1688 else if (bidi_it->charpos >= (bidi_it->string.s ? 0 : 1))
1689 /* Implementation note: we allow next_char_pos to be as low as
1690 0 for buffers or -1 for strings, and that is okay because
1691 that's the "position" of the sentinel iterator state we
1692 cached at the beginning of the iteration. */
102ebb00 1693 next_char_pos = bidi_it->charpos - 1;
87e67904
EZ
1694 if (next_char_pos >= 0)
1695 type = bidi_cache_find (next_char_pos, -1, bidi_it);
1696 else
1697 type = UNKNOWN_BT;
102ebb00 1698 }
182ce2d2 1699 else
102ebb00 1700 type = UNKNOWN_BT;
b7b65b15
EZ
1701 if (type != UNKNOWN_BT)
1702 {
1703 /* Don't lose the information for resolving neutrals! The
1704 cached states could have been cached before their
1705 next_for_neutral member was computed. If we are on our way
1706 forward, we can simply take the info from the previous
1707 state. */
1708 if (bidi_it->scan_dir == 1
1709 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
1710 bidi_it->next_for_neutral = next_for_neutral;
1711
1712 /* If resolved_level is -1, it means this state was cached
1713 before it was completely resolved, so we cannot return
1714 it. */
1715 if (bidi_it->resolved_level != -1)
1716 return bidi_it->resolved_level;
1717 }
1718 if (bidi_it->scan_dir == -1)
1719 /* If we are going backwards, the iterator state is already cached
1720 from previous scans, and should be fully resolved. */
1721 abort ();
1722
1723 if (type == UNKNOWN_BT)
1724 type = bidi_type_of_next_char (bidi_it);
1725
1726 if (type == NEUTRAL_B)
1727 return bidi_it->resolved_level;
1728
1729 level = bidi_it->level_stack[bidi_it->stack_idx].level;
1730 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
1731 || (type == WEAK_BN && prev_level == level))
1732 {
1733 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1734 abort ();
1735
1736 /* If the cached state shows a neutral character, it was not
1737 resolved by bidi_resolve_neutral, so do it now. */
1738 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1739 bidi_it->next_for_neutral.type,
1740 level);
1741 }
1742
1743 if (!(type == STRONG_R
1744 || type == STRONG_L
1745 || type == WEAK_BN
1746 || type == WEAK_EN
1747 || type == WEAK_AN))
1748 abort ();
1749 bidi_it->type = type;
2d6e4628 1750 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1751
1752 /* For L1 below, we need to know, for each WS character, whether
0d327994 1753 it belongs to a sequence of WS characters preceding a newline
b7b65b15 1754 or a TAB or a paragraph separator. */
89d3374a 1755 if (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1756 && bidi_it->next_for_ws.type == UNKNOWN_BT)
1757 {
1758 int ch;
9d68c2a9 1759 EMACS_INT clen = bidi_it->ch_len;
6bff6497
EZ
1760 EMACS_INT bpos = bidi_it->bytepos;
1761 EMACS_INT cpos = bidi_it->charpos;
182ce2d2 1762 EMACS_INT disp_pos = bidi_it->disp_pos;
102ebb00 1763 EMACS_INT nc = bidi_it->nchars;
87e67904 1764 struct bidi_string_data bs = bidi_it->string;
b7b65b15 1765 bidi_type_t chtype;
fc6f18ce 1766 int fwp = bidi_it->frame_window_p;
b7b65b15 1767
102ebb00
EZ
1768 if (bidi_it->nchars <= 0)
1769 abort ();
b7b65b15 1770 do {
87e67904 1771 ch = bidi_fetch_char (bpos += clen, cpos += nc, &disp_pos, &bs, fwp,
fc6f18ce 1772 &clen, &nc);
e7402cb2 1773 if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
b7b65b15
EZ
1774 chtype = NEUTRAL_B;
1775 else
6bff6497 1776 chtype = bidi_get_type (ch, NEUTRAL_DIR);
b7b65b15
EZ
1777 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
1778 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
1779 bidi_it->next_for_ws.type = chtype;
2d6e4628 1780 bidi_check_type (bidi_it->next_for_ws.type);
b7b65b15
EZ
1781 bidi_it->next_for_ws.charpos = cpos;
1782 bidi_it->next_for_ws.bytepos = bpos;
1783 }
1784
1785 /* Resolve implicit levels, with a twist: PDFs get the embedding
1786 level of the enbedding they terminate. See below for the
1787 reason. */
89d3374a 1788 if (bidi_it->orig_type == PDF
b7b65b15
EZ
1789 /* Don't do this if this formatting code didn't change the
1790 embedding level due to invalid or empty embeddings. */
1791 && prev_level != level)
1792 {
1793 /* Don't look in UAX#9 for the reason for this: it's our own
1794 private quirk. The reason is that we want the formatting
1795 codes to be delivered so that they bracket the text of their
1796 embedding. For example, given the text
1797
1798 {RLO}teST{PDF}
1799
1800 we want it to be displayed as
1801
0d68907d 1802 {PDF}STet{RLO}
b7b65b15
EZ
1803
1804 not as
1805
1806 STet{RLO}{PDF}
1807
1808 which will result because we bump up the embedding level as
1809 soon as we see the RLO and pop it as soon as we see the PDF,
1810 so RLO itself has the same embedding level as "teST", and
1811 thus would be normally delivered last, just before the PDF.
1812 The switch below fiddles with the level of PDF so that this
1813 ugly side effect does not happen.
1814
1815 (This is, of course, only important if the formatting codes
e7402cb2
EZ
1816 are actually displayed, but Emacs does need to display them
1817 if the user wants to.) */
b7b65b15
EZ
1818 level = prev_level;
1819 }
89d3374a
EZ
1820 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
1821 || bidi_it->orig_type == NEUTRAL_S
e7402cb2
EZ
1822 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
1823 /* || bidi_it->ch == LINESEP_CHAR */
89d3374a 1824 || (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1825 && (bidi_it->next_for_ws.type == NEUTRAL_B
1826 || bidi_it->next_for_ws.type == NEUTRAL_S)))
1827 level = bidi_it->level_stack[0].level;
1828 else if ((level & 1) == 0) /* I1 */
1829 {
1830 if (type == STRONG_R)
1831 level++;
1832 else if (type == WEAK_EN || type == WEAK_AN)
1833 level += 2;
1834 }
1835 else /* I2 */
1836 {
1837 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
1838 level++;
1839 }
1840
1841 bidi_it->resolved_level = level;
1842 return level;
1843}
1844
1845/* Move to the other edge of a level given by LEVEL. If END_FLAG is
1846 non-zero, we are at the end of a level, and we need to prepare to
1847 resume the scan of the lower level.
1848
1849 If this level's other edge is cached, we simply jump to it, filling
1850 the iterator structure with the iterator state on the other edge.
182ce2d2
EZ
1851 Otherwise, we walk the buffer or string until we come back to the
1852 same level as LEVEL.
b7b65b15
EZ
1853
1854 Note: we are not talking here about a ``level run'' in the UAX#9
1855 sense of the term, but rather about a ``level'' which includes
1856 all the levels higher than it. In other words, given the levels
1857 like this:
1858
1859 11111112222222333333334443343222222111111112223322111
1860 A B C
1861
1862 and assuming we are at point A scanning left to right, this
1863 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
1864 at point B. */
1865static void
1866bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
1867{
1868 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
1869 int idx;
1870
1871 /* Try the cache first. */
87e67904
EZ
1872 if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
1873 >= bidi_cache_start)
b7b65b15
EZ
1874 bidi_cache_fetch_state (idx, bidi_it);
1875 else
1876 {
1877 int new_level;
1878
1879 if (end_flag)
1880 abort (); /* if we are at end of level, its edges must be cached */
1881
1882 bidi_cache_iterator_state (bidi_it, 1);
1883 do {
1884 new_level = bidi_level_of_next_char (bidi_it);
1885 bidi_cache_iterator_state (bidi_it, 1);
1886 } while (new_level >= level);
1887 }
1888}
1889
1890void
4b292a22 1891bidi_move_to_visually_next (struct bidi_it *bidi_it)
b7b65b15
EZ
1892{
1893 int old_level, new_level, next_level;
9c82e145 1894 struct bidi_it sentinel;
b7b65b15 1895
87e67904
EZ
1896 if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
1897 abort ();
1898
b7b65b15
EZ
1899 if (bidi_it->scan_dir == 0)
1900 {
1901 bidi_it->scan_dir = 1; /* default to logical order */
1902 }
1903
be39f003
EZ
1904 /* If we just passed a newline, initialize for the next line. */
1905 if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
1906 bidi_line_init (bidi_it);
1907
6dcfd253
EZ
1908 /* Prepare the sentinel iterator state, and cache it. When we bump
1909 into it, scanning backwards, we'll know that the last non-base
1910 level is exhausted. */
87e67904 1911 if (bidi_cache_idx == bidi_cache_start)
9c82e145
EZ
1912 {
1913 bidi_copy_it (&sentinel, bidi_it);
1914 if (bidi_it->first_elt)
1915 {
1916 sentinel.charpos--; /* cached charpos needs to be monotonic */
1917 sentinel.bytepos--;
1918 sentinel.ch = '\n'; /* doesn't matter, but why not? */
1919 sentinel.ch_len = 1;
182ce2d2 1920 sentinel.nchars = 1;
9c82e145 1921 }
6dcfd253 1922 bidi_cache_iterator_state (&sentinel, 1);
9c82e145 1923 }
b7b65b15
EZ
1924
1925 old_level = bidi_it->resolved_level;
1926 new_level = bidi_level_of_next_char (bidi_it);
b7b65b15
EZ
1927
1928 /* Reordering of resolved levels (clause L2) is implemented by
1929 jumping to the other edge of the level and flipping direction of
c0546589 1930 scanning the text whenever we find a level change. */
b7b65b15
EZ
1931 if (new_level != old_level)
1932 {
1933 int ascending = new_level > old_level;
1934 int level_to_search = ascending ? old_level + 1 : old_level;
1935 int incr = ascending ? 1 : -1;
1936 int expected_next_level = old_level + incr;
1937
b7b65b15
EZ
1938 /* Jump (or walk) to the other edge of this level. */
1939 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1940 /* Switch scan direction and peek at the next character in the
1941 new direction. */
1942 bidi_it->scan_dir = -bidi_it->scan_dir;
1943
1944 /* The following loop handles the case where the resolved level
1945 jumps by more than one. This is typical for numbers inside a
1946 run of text with left-to-right embedding direction, but can
1947 also happen in other situations. In those cases the decision
1948 where to continue after a level change, and in what direction,
1949 is tricky. For example, given a text like below:
1950
1951 abcdefgh
1952 11336622
1953
1954 (where the numbers below the text show the resolved levels),
1955 the result of reordering according to UAX#9 should be this:
1956
1957 efdcghba
1958
1959 This is implemented by the loop below which flips direction
1960 and jumps to the other edge of the level each time it finds
1961 the new level not to be the expected one. The expected level
1962 is always one more or one less than the previous one. */
1963 next_level = bidi_peek_at_next_level (bidi_it);
1964 while (next_level != expected_next_level)
1965 {
1966 expected_next_level += incr;
1967 level_to_search += incr;
1968 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1969 bidi_it->scan_dir = -bidi_it->scan_dir;
1970 next_level = bidi_peek_at_next_level (bidi_it);
1971 }
1972
1973 /* Finally, deliver the next character in the new direction. */
1974 next_level = bidi_level_of_next_char (bidi_it);
1975 }
1976
b44d9321
EZ
1977 /* Take note when we have just processed the newline that precedes
1978 the end of the paragraph. The next time we are about to be
1979 called, set_iterator_to_next will automatically reinit the
1980 paragraph direction, if needed. We do this at the newline before
1981 the paragraph separator, because the next character might not be
1982 the first character of the next paragraph, due to the bidi
c0546589
EZ
1983 reordering, whereas we _must_ know the paragraph base direction
1984 _before_ we process the paragraph's text, since the base
1985 direction affects the reordering. */
87e67904 1986 if (bidi_it->scan_dir == 1 && bidi_it->orig_type == NEUTRAL_B)
be39f003 1987 {
87e67904
EZ
1988 /* The paragraph direction of the entire string, once
1989 determined, is in effect for the entire string. Setting the
1990 separator limit to the end of the string prevents
1991 bidi_paragraph_init from being called automatically on this
1992 string. */
1993 if (bidi_it->string.s)
1994 bidi_it->separator_limit = bidi_it->string.schars;
1995 else if (bidi_it->bytepos < ZV_BYTE)
be39f003 1996 {
87e67904
EZ
1997 EMACS_INT sep_len =
1998 bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
1999 bidi_it->bytepos + bidi_it->ch_len);
2000 if (bidi_it->nchars <= 0)
2001 abort ();
2002 if (sep_len >= 0)
2003 {
2004 bidi_it->new_paragraph = 1;
2005 /* Record the buffer position of the last character of the
2006 paragraph separator. */
2007 bidi_it->separator_limit =
2008 bidi_it->charpos + bidi_it->nchars + sep_len;
2009 }
be39f003
EZ
2010 }
2011 }
6bff6497 2012
87e67904 2013 if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
b7b65b15
EZ
2014 {
2015 /* If we are at paragraph's base embedding level and beyond the
2016 last cached position, the cache's job is done and we can
2017 discard it. */
2018 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
182ce2d2
EZ
2019 && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
2020 + bidi_cache[bidi_cache_idx - 1].nchars - 1))
b7b65b15
EZ
2021 bidi_cache_reset ();
2022 /* But as long as we are caching during forward scan, we must
2023 cache each state, or else the cache integrity will be
2024 compromised: it assumes cached states correspond to buffer
2025 positions 1:1. */
2026 else
2027 bidi_cache_iterator_state (bidi_it, 1);
2028 }
2029}
2030
2031/* This is meant to be called from within the debugger, whenever you
2032 wish to examine the cache contents. */
e78aecca 2033void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
b7b65b15
EZ
2034void
2035bidi_dump_cached_states (void)
2036{
2037 int i;
2038 int ndigits = 1;
2039
2040 if (bidi_cache_idx == 0)
2041 {
2042 fprintf (stderr, "The cache is empty.\n");
2043 return;
2044 }
2045 fprintf (stderr, "Total of %d state%s in cache:\n",
2046 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2047
2048 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2049 ndigits++;
2050 fputs ("ch ", stderr);
2051 for (i = 0; i < bidi_cache_idx; i++)
2052 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2053 fputs ("\n", stderr);
2054 fputs ("lvl ", stderr);
2055 for (i = 0; i < bidi_cache_idx; i++)
2056 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2057 fputs ("\n", stderr);
2058 fputs ("pos ", stderr);
2059 for (i = 0; i < bidi_cache_idx; i++)
c2982e87 2060 fprintf (stderr, "%*"pI"d", ndigits, bidi_cache[i].charpos);
b7b65b15
EZ
2061 fputs ("\n", stderr);
2062}