Fix a gratuitous crash in emacs -nw with iso-safe terminal encoding.
[bpt/emacs.git] / src / bidi.c
CommitLineData
b7b65b15 1/* Low-level bidirectional buffer-scanning functions for GNU Emacs.
73b0cd50 2 Copyright (C) 2000-2001, 2004-2005, 2009-2011
b118e65f 3 Free Software Foundation, Inc.
b7b65b15
EZ
4
5This file is part of GNU Emacs.
6
a8d11bd3 7GNU Emacs is free software: you can redistribute it and/or modify
b7b65b15 8it under the terms of the GNU General Public License as published by
a8d11bd3
EZ
9the Free Software Foundation, either version 3 of the License, or
10(at your option) any later version.
b7b65b15
EZ
11
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
b7b65b15 17You should have received a copy of the GNU General Public License
a8d11bd3 18along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
b7b65b15 19
2d6e4628
EZ
20/* Written by Eli Zaretskii <eliz@gnu.org>.
21
22 A sequential implementation of the Unicode Bidirectional algorithm,
b7b65b15
EZ
23 as per UAX#9, a part of the Unicode Standard.
24
25 Unlike the reference and most other implementations, this one is
940afb59
EZ
26 designed to be called once for every character in the buffer or
27 string.
b7b65b15 28
4b292a22 29 The main entry point is bidi_move_to_visually_next. Each time it
b7b65b15
EZ
30 is called, it finds the next character in the visual order, and
31 returns its information in a special structure. The caller is then
32 expected to process this character for display or any other
4b292a22
EZ
33 purposes, and call bidi_move_to_visually_next for the next
34 character. See the comments in bidi_move_to_visually_next for more
35 details about its algorithm that finds the next visual-order
b7b65b15
EZ
36 character by resolving their levels on the fly.
37
940afb59
EZ
38 The two other entry points are bidi_paragraph_init and
39 bidi_mirror_char. The first determines the base direction of a
40 paragraph, while the second returns the mirrored version of its
41 argument character.
42
89d3374a
EZ
43 If you want to understand the code, you will have to read it
44 together with the relevant portions of UAX#9. The comments include
45 references to UAX#9 rules, for that very reason.
46
b7b65b15
EZ
47 A note about references to UAX#9 rules: if the reference says
48 something like "X9/Retaining", it means that you need to refer to
49 rule X9 and to its modifications decribed in the "Implementation
50 Notes" section of UAX#9, under "Retaining Format Codes". */
51
b7b65b15 52#include <config.h>
b7b65b15 53#include <stdio.h>
29e3d8d1
EZ
54#include <setjmp.h>
55
b7b65b15
EZ
56#include "lisp.h"
57#include "buffer.h"
58#include "character.h"
59#include "dispextern.h"
60
61static int bidi_initialized = 0;
62
cbc4fd20 63static Lisp_Object bidi_type_table, bidi_mirror_table;
b7b65b15
EZ
64
65#define LRM_CHAR 0x200E
66#define RLM_CHAR 0x200F
b7b65b15 67#define BIDI_EOB -1
b7b65b15 68
b7b65b15
EZ
69/* Data type for describing the bidirectional character categories. */
70typedef enum {
71 UNKNOWN_BC,
72 NEUTRAL,
73 WEAK,
74 STRONG
75} bidi_category_t;
76
e78aecca 77extern int bidi_ignore_explicit_marks_for_paragraph_level EXTERNALLY_VISIBLE;
b7b65b15
EZ
78int bidi_ignore_explicit_marks_for_paragraph_level = 1;
79
372b7a95 80static Lisp_Object paragraph_start_re, paragraph_separate_re;
6bff6497 81static Lisp_Object Qparagraph_start, Qparagraph_separate;
b7b65b15 82
b7b65b15 83static void
971de7fb 84bidi_initialize (void)
b7b65b15 85{
317fbf33
EZ
86
87#include "biditype.h"
cbc4fd20 88#include "bidimirror.h"
317fbf33 89
b7b65b15
EZ
90 int i;
91
92 bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L));
2d6e4628 93 staticpro (&bidi_type_table);
b7b65b15
EZ
94
95 for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++)
317fbf33 96 char_table_set_range (bidi_type_table, bidi_type[i].from, bidi_type[i].to,
b7b65b15 97 make_number (bidi_type[i].type));
6bff6497 98
cbc4fd20
EZ
99 bidi_mirror_table = Fmake_char_table (Qnil, Qnil);
100 staticpro (&bidi_mirror_table);
101
102 for (i = 0; i < sizeof bidi_mirror / sizeof bidi_mirror[0]; i++)
103 char_table_set (bidi_mirror_table, bidi_mirror[i].from,
104 make_number (bidi_mirror[i].to));
105
b4bf28b7
SM
106 Qparagraph_start = intern ("paragraph-start");
107 staticpro (&Qparagraph_start);
372b7a95
EZ
108 paragraph_start_re = Fsymbol_value (Qparagraph_start);
109 if (!STRINGP (paragraph_start_re))
110 paragraph_start_re = build_string ("\f\\|[ \t]*$");
111 staticpro (&paragraph_start_re);
b4bf28b7
SM
112 Qparagraph_separate = intern ("paragraph-separate");
113 staticpro (&Qparagraph_separate);
372b7a95
EZ
114 paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
115 if (!STRINGP (paragraph_separate_re))
116 paragraph_separate_re = build_string ("[ \t\f]*$");
117 staticpro (&paragraph_separate_re);
b7b65b15
EZ
118 bidi_initialized = 1;
119}
120
6bff6497
EZ
121/* Return the bidi type of a character CH, subject to the current
122 directional OVERRIDE. */
fd3998ff 123static INLINE bidi_type_t
6bff6497 124bidi_get_type (int ch, bidi_dir_t override)
b7b65b15 125{
6bff6497
EZ
126 bidi_type_t default_type;
127
e7402cb2
EZ
128 if (ch == BIDI_EOB)
129 return NEUTRAL_B;
e342a24d
EZ
130 if (ch < 0 || ch > MAX_CHAR)
131 abort ();
6bff6497
EZ
132
133 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
134
135 if (override == NEUTRAL_DIR)
136 return default_type;
137
138 switch (default_type)
139 {
140 /* Although UAX#9 does not tell, it doesn't make sense to
141 override NEUTRAL_B and LRM/RLM characters. */
142 case NEUTRAL_B:
143 case LRE:
144 case LRO:
145 case RLE:
146 case RLO:
147 case PDF:
148 return default_type;
149 default:
150 switch (ch)
151 {
152 case LRM_CHAR:
153 case RLM_CHAR:
154 return default_type;
155 default:
156 if (override == L2R) /* X6 */
157 return STRONG_L;
158 else if (override == R2L)
159 return STRONG_R;
160 else
161 abort (); /* can't happen: handled above */
162 }
163 }
b7b65b15
EZ
164}
165
7d3b3862 166static void
2d6e4628
EZ
167bidi_check_type (bidi_type_t type)
168{
169 if (type < UNKNOWN_BT || type > NEUTRAL_ON)
170 abort ();
171}
172
b7b65b15 173/* Given a bidi TYPE of a character, return its category. */
fd3998ff 174static INLINE bidi_category_t
b7b65b15
EZ
175bidi_get_category (bidi_type_t type)
176{
177 switch (type)
178 {
179 case UNKNOWN_BT:
180 return UNKNOWN_BC;
181 case STRONG_L:
182 case STRONG_R:
183 case STRONG_AL:
184 case LRE:
185 case LRO:
186 case RLE:
187 case RLO:
188 return STRONG;
189 case PDF: /* ??? really?? */
190 case WEAK_EN:
191 case WEAK_ES:
192 case WEAK_ET:
193 case WEAK_AN:
194 case WEAK_CS:
195 case WEAK_NSM:
196 case WEAK_BN:
197 return WEAK;
198 case NEUTRAL_B:
199 case NEUTRAL_S:
200 case NEUTRAL_WS:
201 case NEUTRAL_ON:
202 return NEUTRAL;
203 default:
204 abort ();
205 }
206}
207
a6e8d97c
EZ
208/* Return the mirrored character of C, if it has one. If C has no
209 mirrored counterpart, return C.
210 Note: The conditions in UAX#9 clause L4 regarding the surrounding
211 context must be tested by the caller. */
b7b65b15
EZ
212int
213bidi_mirror_char (int c)
214{
cbc4fd20
EZ
215 Lisp_Object val;
216
217 if (c == BIDI_EOB)
218 return c;
219 if (c < 0 || c > MAX_CHAR)
220 abort ();
b7b65b15 221
cbc4fd20
EZ
222 val = CHAR_TABLE_REF (bidi_mirror_table, c);
223 if (INTEGERP (val))
b7b65b15 224 {
cbc4fd20 225 int v = XINT (val);
b7b65b15 226
cbc4fd20
EZ
227 if (v < 0 || v > MAX_CHAR)
228 abort ();
229
230 return v;
b7b65b15 231 }
cbc4fd20 232
b7b65b15
EZ
233 return c;
234}
235
236/* Copy the bidi iterator from FROM to TO. To save cycles, this only
237 copies the part of the level stack that is actually in use. */
fd3998ff 238static INLINE void
b7b65b15
EZ
239bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
240{
241 int i;
242
e69a9370 243 /* Copy everything except the level stack and beyond. */
182ce2d2 244 memcpy (to, from, offsetof (struct bidi_it, level_stack[0]));
b7b65b15
EZ
245
246 /* Copy the active part of the level stack. */
247 to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
248 for (i = 1; i <= from->stack_idx; i++)
249 to->level_stack[i] = from->level_stack[i];
250}
251
252/* Caching the bidi iterator states. */
253
2fe72643
EZ
254#define BIDI_CACHE_CHUNK 200
255static struct bidi_it *bidi_cache;
256static size_t bidi_cache_size = 0;
0416466c 257static size_t elsz = sizeof (struct bidi_it);
87e67904
EZ
258static EMACS_INT bidi_cache_idx; /* next unused cache slot */
259static EMACS_INT bidi_cache_last_idx; /* slot of last cache hit */
260static EMACS_INT bidi_cache_start = 0; /* start of cache for this
261 "stack" level */
262
263/* Reset the cache state to the empty state. We only reset the part
264 of the cache relevant to iteration of the current object. Previous
265 objects, which are pushed on the display iterator's stack, are left
266 intact. This is called when the cached information is no more
267 useful for the current iteration, e.g. when we were reseated to a
268 new position on the same object. */
fd3998ff 269static INLINE void
b7b65b15
EZ
270bidi_cache_reset (void)
271{
87e67904 272 bidi_cache_idx = bidi_cache_start;
b7b65b15
EZ
273 bidi_cache_last_idx = -1;
274}
275
87e67904
EZ
276/* Shrink the cache to its minimal size. Called when we init the bidi
277 iterator for reordering a buffer or a string that does not come
278 from display properties, because that means all the previously
279 cached info is of no further use. */
2fe72643
EZ
280static INLINE void
281bidi_cache_shrink (void)
282{
283 if (bidi_cache_size > BIDI_CACHE_CHUNK)
284 {
0416466c
EZ
285 bidi_cache_size = BIDI_CACHE_CHUNK;
286 bidi_cache =
287 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
2fe72643
EZ
288 }
289 bidi_cache_reset ();
290}
291
fd3998ff 292static INLINE void
b7b65b15
EZ
293bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it)
294{
295 int current_scan_dir = bidi_it->scan_dir;
296
87e67904 297 if (idx < bidi_cache_start || idx >= bidi_cache_idx)
b7b65b15
EZ
298 abort ();
299
300 bidi_copy_it (bidi_it, &bidi_cache[idx]);
301 bidi_it->scan_dir = current_scan_dir;
302 bidi_cache_last_idx = idx;
303}
304
305/* Find a cached state with a given CHARPOS and resolved embedding
306 level less or equal to LEVEL. if LEVEL is -1, disregard the
307 resolved levels in cached states. DIR, if non-zero, means search
308 in that direction from the last cache hit. */
fd3998ff 309static INLINE int
61bfec98 310bidi_cache_search (EMACS_INT charpos, int level, int dir)
b7b65b15
EZ
311{
312 int i, i_start;
313
314 if (bidi_cache_idx)
315 {
316 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
182ce2d2
EZ
317 {
318 dir = -1;
319 i_start = bidi_cache_last_idx - 1;
320 }
321 else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
322 + bidi_cache[bidi_cache_last_idx].nchars - 1))
323 {
324 dir = 1;
325 i_start = bidi_cache_last_idx + 1;
326 }
327 else if (dir)
b7b65b15
EZ
328 i_start = bidi_cache_last_idx;
329 else
330 {
331 dir = -1;
332 i_start = bidi_cache_idx - 1;
333 }
334
335 if (dir < 0)
336 {
337 /* Linear search for now; FIXME! */
87e67904 338 for (i = i_start; i >= bidi_cache_start; i--)
182ce2d2
EZ
339 if (bidi_cache[i].charpos <= charpos
340 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
341 && (level == -1 || bidi_cache[i].resolved_level <= level))
342 return i;
343 }
344 else
345 {
346 for (i = i_start; i < bidi_cache_idx; i++)
182ce2d2
EZ
347 if (bidi_cache[i].charpos <= charpos
348 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
349 && (level == -1 || bidi_cache[i].resolved_level <= level))
350 return i;
351 }
352 }
353
354 return -1;
355}
356
357/* Find a cached state where the resolved level changes to a value
358 that is lower than LEVEL, and return its cache slot index. DIR is
359 the direction to search, starting with the last used cache slot.
87e67904
EZ
360 If DIR is zero, we search backwards from the last occupied cache
361 slot. BEFORE, if non-zero, means return the index of the slot that
362 is ``before'' the level change in the search direction. That is,
b7b65b15
EZ
363 given the cached levels like this:
364
365 1122333442211
366 AB C
367
368 and assuming we are at the position cached at the slot marked with
369 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
370 index of slot B or A, depending whether BEFORE is, respectively,
371 non-zero or zero. */
372static int
373bidi_cache_find_level_change (int level, int dir, int before)
374{
375 if (bidi_cache_idx)
376 {
377 int i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
378 int incr = before ? 1 : 0;
379
380 if (!dir)
381 dir = -1;
382 else if (!incr)
383 i += dir;
384
385 if (dir < 0)
386 {
87e67904 387 while (i >= bidi_cache_start + incr)
b7b65b15
EZ
388 {
389 if (bidi_cache[i - incr].resolved_level >= 0
390 && bidi_cache[i - incr].resolved_level < level)
391 return i;
392 i--;
393 }
394 }
395 else
396 {
397 while (i < bidi_cache_idx - incr)
398 {
399 if (bidi_cache[i + incr].resolved_level >= 0
400 && bidi_cache[i + incr].resolved_level < level)
401 return i;
402 i++;
403 }
404 }
405 }
406
407 return -1;
408}
409
fd3998ff 410static INLINE void
b7b65b15
EZ
411bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
412{
413 int idx;
414
415 /* We should never cache on backward scans. */
416 if (bidi_it->scan_dir == -1)
417 abort ();
418 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
419
420 if (idx < 0)
421 {
422 idx = bidi_cache_idx;
2fe72643
EZ
423 /* Enlarge the cache as needed. */
424 if (idx >= bidi_cache_size)
425 {
0416466c 426 bidi_cache_size += BIDI_CACHE_CHUNK;
2fe72643 427 bidi_cache =
0416466c 428 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
2fe72643 429 }
bd924a5d
EZ
430 /* Character positions should correspond to cache positions 1:1.
431 If we are outside the range of cached positions, the cache is
432 useless and must be reset. */
87e67904 433 if (idx > bidi_cache_start &&
182ce2d2
EZ
434 (bidi_it->charpos > (bidi_cache[idx - 1].charpos
435 + bidi_cache[idx - 1].nchars)
bd924a5d
EZ
436 || bidi_it->charpos < bidi_cache[0].charpos))
437 {
438 bidi_cache_reset ();
87e67904 439 idx = bidi_cache_start;
bd924a5d 440 }
102ebb00
EZ
441 if (bidi_it->nchars <= 0)
442 abort ();
b7b65b15
EZ
443 bidi_copy_it (&bidi_cache[idx], bidi_it);
444 if (!resolved)
445 bidi_cache[idx].resolved_level = -1;
446 }
447 else
448 {
449 /* Copy only the members which could have changed, to avoid
450 costly copying of the entire struct. */
451 bidi_cache[idx].type = bidi_it->type;
2d6e4628 452 bidi_check_type (bidi_it->type);
89d3374a
EZ
453 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
454 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
455 if (resolved)
456 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
457 else
458 bidi_cache[idx].resolved_level = -1;
459 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
460 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
461 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
462 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
463 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
464 }
465
466 bidi_cache_last_idx = idx;
467 if (idx >= bidi_cache_idx)
468 bidi_cache_idx = idx + 1;
469}
470
fd3998ff 471static INLINE bidi_type_t
61bfec98 472bidi_cache_find (EMACS_INT charpos, int level, struct bidi_it *bidi_it)
b7b65b15
EZ
473{
474 int i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
475
87e67904 476 if (i >= bidi_cache_start)
b7b65b15
EZ
477 {
478 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
479
5009f85e 480 bidi_copy_it (bidi_it, &bidi_cache[i]);
b7b65b15
EZ
481 bidi_cache_last_idx = i;
482 /* Don't let scan direction from from the cached state override
483 the current scan direction. */
484 bidi_it->scan_dir = current_scan_dir;
485 return bidi_it->type;
486 }
487
488 return UNKNOWN_BT;
489}
490
fd3998ff 491static INLINE int
b7b65b15
EZ
492bidi_peek_at_next_level (struct bidi_it *bidi_it)
493{
87e67904 494 if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
b7b65b15
EZ
495 abort ();
496 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
497}
498
be39f003
EZ
499/* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
500 Value is the non-negative length of the paragraph separator
501 following the buffer position, -1 if position is at the beginning
502 of a new paragraph, or -2 if position is neither at beginning nor
503 at end of a paragraph. */
fd3998ff 504static EMACS_INT
6bff6497 505bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
b7b65b15 506{
372b7a95
EZ
507 Lisp_Object sep_re;
508 Lisp_Object start_re;
be39f003
EZ
509 EMACS_INT val;
510
372b7a95
EZ
511 sep_re = paragraph_separate_re;
512 start_re = paragraph_start_re;
be39f003
EZ
513
514 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
515 if (val < 0)
516 {
517 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
518 val = -1;
519 else
520 val = -2;
521 }
b7b65b15 522
be39f003 523 return val;
b7b65b15
EZ
524}
525
526/* Determine the start-of-run (sor) directional type given the two
527 embedding levels on either side of the run boundary. Also, update
528 the saved info about previously seen characters, since that info is
529 generally valid for a single level run. */
fd3998ff 530static INLINE void
b7b65b15
EZ
531bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
532{
533 int higher_level = level_before > level_after ? level_before : level_after;
534
535 /* The prev_was_pdf gork is required for when we have several PDFs
536 in a row. In that case, we want to compute the sor type for the
537 next level run only once: when we see the first PDF. That's
538 because the sor type depends only on the higher of the two levels
539 that we find on the two sides of the level boundary (see UAX#9,
540 clause X10), and so we don't need to know the final embedding
541 level to which we descend after processing all the PDFs. */
e342a24d 542 if (!bidi_it->prev_was_pdf || level_before < level_after)
b7b65b15
EZ
543 /* FIXME: should the default sor direction be user selectable? */
544 bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
545 if (level_before > level_after)
546 bidi_it->prev_was_pdf = 1;
547
548 bidi_it->prev.type = UNKNOWN_BT;
89d3374a
EZ
549 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
550 bidi_it->last_strong.orig_type = UNKNOWN_BT;
b7b65b15
EZ
551 bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
552 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
553 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
89d3374a
EZ
554 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
555 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
87e67904 556 bidi_it->ignore_bn_limit = -1; /* meaning it's unknown */
b7b65b15
EZ
557}
558
182ce2d2 559/* Perform initializations for reordering a new line of bidi text. */
6bff6497 560static void
be39f003
EZ
561bidi_line_init (struct bidi_it *bidi_it)
562{
563 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
564 bidi_it->resolved_level = bidi_it->level_stack[0].level;
565 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
566 bidi_it->invalid_levels = 0;
567 bidi_it->invalid_rl_levels = -1;
568 bidi_it->next_en_pos = -1;
569 bidi_it->next_for_ws.type = UNKNOWN_BT;
b44d9321
EZ
570 bidi_set_sor_type (bidi_it,
571 bidi_it->paragraph_dir == R2L ? 1 : 0,
be39f003
EZ
572 bidi_it->level_stack[0].level); /* X10 */
573
574 bidi_cache_reset ();
575}
576
87e67904
EZ
577/* Count bytes in multibyte string S between BEG/BEGBYTE and END. BEG
578 and END are zero-based character positions in S, BEGBYTE is byte
579 position corresponding to BEG. */
580static inline EMACS_INT
581bidi_count_bytes (const unsigned char *s, const EMACS_INT beg,
582 const EMACS_INT begbyte, const EMACS_INT end)
583{
584 EMACS_INT pos = beg;
585 const unsigned char *p = s + begbyte, *start = p;
586
587 if (!CHAR_HEAD_P (*p))
588 abort ();
589
590 while (pos < end)
591 {
592 p += BYTES_BY_CHAR_HEAD (*p);
593 pos++;
594 }
595
596 return p - start;
597}
598
599/* Fetch and returns the character at byte position BYTEPOS. If S is
600 non-NULL, fetch the character from string S; otherwise fetch the
601 character from the current buffer. */
602static inline int
603bidi_char_at_pos (EMACS_INT bytepos, const unsigned char *s)
604{
605 if (s)
606 return STRING_CHAR (s + bytepos);
607 else
608 return FETCH_MULTIBYTE_CHAR (bytepos);
609}
610
102ebb00
EZ
611/* Fetch and return the character at BYTEPOS/CHARPOS. If that
612 character is covered by a display string, treat the entire run of
613 covered characters as a single character u+FFFC, and return their
614 combined length in CH_LEN and NCHARS. DISP_POS specifies the
615 character position of the next display string, or -1 if not yet
616 computed. When the next character is at or beyond that position,
617 the function updates DISP_POS with the position of the next display
9f257352
EZ
618 string. STRING->s is the C string to iterate, or NULL if iterating
619 over a buffer or a Lisp string; in the latter case, STRING->lstring
620 is the Lisp string. */
87e67904 621static inline int
102ebb00 622bidi_fetch_char (EMACS_INT bytepos, EMACS_INT charpos, EMACS_INT *disp_pos,
87e67904 623 struct bidi_string_data *string,
fc6f18ce 624 int frame_window_p, EMACS_INT *ch_len, EMACS_INT *nchars)
182ce2d2
EZ
625{
626 int ch;
9f257352
EZ
627 EMACS_INT endpos =
628 (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
6db161be 629 struct text_pos pos;
182ce2d2 630
182ce2d2 631 /* If we got past the last known position of display string, compute
87e67904
EZ
632 the position of the next one. That position could be at CHARPOS. */
633 if (charpos < endpos && charpos > *disp_pos)
6db161be
EZ
634 {
635 SET_TEXT_POS (pos, charpos, bytepos);
636 *disp_pos = compute_display_string_pos (&pos, string, frame_window_p);
637 }
102ebb00
EZ
638
639 /* Fetch the character at BYTEPOS. */
87e67904 640 if (charpos >= endpos)
182ce2d2
EZ
641 {
642 ch = BIDI_EOB;
643 *ch_len = 1;
644 *nchars = 1;
87e67904 645 *disp_pos = endpos;
182ce2d2 646 }
102ebb00 647 else if (charpos >= *disp_pos)
182ce2d2 648 {
7b600102
EZ
649 EMACS_INT disp_end_pos;
650
651 /* We don't expect to find ourselves in the middle of a display
652 property. Hopefully, it will never be needed. */
653 if (charpos > *disp_pos)
654 abort ();
655 /* Return the Unicode Object Replacement Character to represent
87e67904 656 the entire run of characters covered by the display string. */
7b600102 657 ch = 0xFFFC;
87e67904 658 disp_end_pos = compute_display_string_end (*disp_pos, string);
7b600102 659 *nchars = disp_end_pos - *disp_pos;
87e67904
EZ
660 if (string->s)
661 *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
662 disp_end_pos);
9f257352
EZ
663 else if (STRINGP (string->lstring))
664 *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
665 bytepos, disp_end_pos);
87e67904
EZ
666 else
667 *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
182ce2d2 668 }
182ce2d2
EZ
669 else
670 {
87e67904
EZ
671 if (string->s)
672 {
673 EMACS_INT len;
674
675 ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
676 *ch_len = len;
677 }
9f257352
EZ
678 else if (STRINGP (string->lstring))
679 {
680 EMACS_INT len;
681
682 ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos, len);
683 *ch_len = len;
684 }
87e67904
EZ
685 else
686 {
687 ch = FETCH_MULTIBYTE_CHAR (bytepos);
688 *ch_len = CHAR_BYTES (ch);
689 }
182ce2d2
EZ
690 *nchars = 1;
691 }
692
693 /* If we just entered a run of characters covered by a display
694 string, compute the position of the next display string. */
87e67904 695 if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos)
6db161be
EZ
696 {
697 SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
698 *disp_pos = compute_display_string_pos (&pos, string, frame_window_p);
699 }
182ce2d2
EZ
700
701 return ch;
702}
703
be39f003
EZ
704/* Find the beginning of this paragraph by looking back in the buffer.
705 Value is the byte position of the paragraph's beginning. */
706static EMACS_INT
b44d9321 707bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
6bff6497 708{
372b7a95 709 Lisp_Object re = paragraph_start_re;
6bff6497
EZ
710 EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
711
6bff6497
EZ
712 while (pos_byte > BEGV_BYTE
713 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
714 {
182ce2d2
EZ
715 /* FIXME: What if the paragraph beginning is covered by a
716 display string? And what if a display string covering some
717 of the text over which we scan back includes
718 paragraph_start_re? */
be39f003
EZ
719 pos = find_next_newline_no_quit (pos - 1, -1);
720 pos_byte = CHAR_TO_BYTE (pos);
6bff6497 721 }
be39f003 722 return pos_byte;
6bff6497
EZ
723}
724
bea4f10c 725/* Determine the base direction, a.k.a. base embedding level, of the
b44d9321
EZ
726 paragraph we are about to iterate through. If DIR is either L2R or
727 R2L, just use that. Otherwise, determine the paragraph direction
bea4f10c
EZ
728 from the first strong directional character of the paragraph.
729
182ce2d2 730 NO_DEFAULT_P non-zero means don't default to L2R if the paragraph
bea4f10c
EZ
731 has no strong directional characters and both DIR and
732 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
733 in the buffer until a paragraph is found with a strong character,
734 or until hitting BEGV. In the latter case, fall back to L2R. This
735 flag is used in current-bidi-paragraph-direction.
736
737 Note that this function gives the paragraph separator the same
738 direction as the preceding paragraph, even though Emacs generally
739 views the separartor as not belonging to any paragraph. */
b7b65b15 740void
bea4f10c 741bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p)
b7b65b15 742{
6bff6497 743 EMACS_INT bytepos = bidi_it->bytepos;
9f257352 744 int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
bea4f10c 745 EMACS_INT pstartbyte;
87e67904
EZ
746 /* Note that begbyte is a byte position, while end is a character
747 position. Yes, this is ugly, but we are trying to avoid costly
748 calls to BYTE_TO_CHAR and its ilk. */
749 EMACS_INT begbyte = string_p ? 0 : BEGV_BYTE;
750 EMACS_INT end = string_p ? bidi_it->string.schars : ZV;
e7402cb2 751
5e65aec0 752 /* Special case for an empty buffer. */
87e67904 753 if (bytepos == begbyte && bidi_it->charpos == end)
5e65aec0 754 dir = L2R;
9c82e145 755 /* We should never be called at EOB or before BEGV. */
87e67904 756 else if (bidi_it->charpos >= end || bytepos < begbyte)
9c82e145
EZ
757 abort ();
758
be39f003
EZ
759 if (dir == L2R)
760 {
761 bidi_it->paragraph_dir = L2R;
762 bidi_it->new_paragraph = 0;
763 }
764 else if (dir == R2L)
765 {
766 bidi_it->paragraph_dir = R2L;
767 bidi_it->new_paragraph = 0;
768 }
b7b65b15
EZ
769 else if (dir == NEUTRAL_DIR) /* P2 */
770 {
182ce2d2
EZ
771 int ch;
772 EMACS_INT ch_len, nchars;
773 EMACS_INT pos, disp_pos = -1;
6bff6497 774 bidi_type_t type;
9f257352 775 const unsigned char *s;
be39f003 776
d20e1419
EZ
777 if (!bidi_initialized)
778 bidi_initialize ();
779
be39f003
EZ
780 /* If we are inside a paragraph separator, we are just waiting
781 for the separator to be exhausted; use the previous paragraph
e5a2fec7
EZ
782 direction. But don't do that if we have been just reseated,
783 because we need to reinitialize below in that case. */
784 if (!bidi_it->first_elt
785 && bidi_it->charpos < bidi_it->separator_limit)
be39f003
EZ
786 return;
787
b44d9321 788 /* If we are on a newline, get past it to where the next
5e65aec0
EZ
789 paragraph might start. But don't do that at BEGV since then
790 we are potentially in a new paragraph that doesn't yet
791 exist. */
c143c213 792 pos = bidi_it->charpos;
9f257352
EZ
793 s = STRINGP (bidi_it->string.lstring) ?
794 SDATA (bidi_it->string.lstring) : bidi_it->string.s;
795 if (bytepos > begbyte && bidi_char_at_pos (bytepos, s) == '\n')
be39f003 796 {
b44d9321 797 bytepos++;
c143c213 798 pos++;
be39f003 799 }
b44d9321
EZ
800
801 /* We are either at the beginning of a paragraph or in the
802 middle of it. Find where this paragraph starts. */
87e67904
EZ
803 if (string_p)
804 {
805 /* We don't support changes of paragraph direction inside a
806 string. It is treated as a single paragraph. */
807 pstartbyte = 0;
808 }
809 else
810 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
be39f003
EZ
811 bidi_it->separator_limit = -1;
812 bidi_it->new_paragraph = 0;
bea4f10c
EZ
813
814 /* The following loop is run more than once only if NO_DEFAULT_P
87e67904 815 is non-zero, and only if we are iterating on a buffer. */
bea4f10c
EZ
816 do {
817 bytepos = pstartbyte;
87e67904
EZ
818 if (!string_p)
819 pos = BYTE_TO_CHAR (bytepos);
820 ch = bidi_fetch_char (bytepos, pos, &disp_pos, &bidi_it->string,
821 bidi_it->frame_window_p, &ch_len, &nchars);
bea4f10c
EZ
822 type = bidi_get_type (ch, NEUTRAL_DIR);
823
182ce2d2 824 for (pos += nchars, bytepos += ch_len;
bea4f10c
EZ
825 /* NOTE: UAX#9 says to search only for L, AL, or R types
826 of characters, and ignore RLE, RLO, LRE, and LRO.
827 However, I'm not sure it makes sense to omit those 4;
828 should try with and without that to see the effect. */
829 (bidi_get_category (type) != STRONG)
830 || (bidi_ignore_explicit_marks_for_paragraph_level
831 && (type == RLE || type == RLO
832 || type == LRE || type == LRO));
833 type = bidi_get_type (ch, NEUTRAL_DIR))
834 {
87e67904
EZ
835 if (!string_p
836 && type == NEUTRAL_B
837 && bidi_at_paragraph_end (pos, bytepos) >= -1)
21fce5ab 838 break;
87e67904 839 if (pos >= end)
bea4f10c
EZ
840 {
841 /* Pretend there's a paragraph separator at end of
87e67904 842 buffer/string. */
bea4f10c
EZ
843 type = NEUTRAL_B;
844 break;
845 }
102ebb00 846 /* Fetch next character and advance to get past it. */
87e67904 847 ch = bidi_fetch_char (bytepos, pos, &disp_pos, &bidi_it->string,
fc6f18ce 848 bidi_it->frame_window_p, &ch_len, &nchars);
182ce2d2
EZ
849 pos += nchars;
850 bytepos += ch_len;
bea4f10c
EZ
851 }
852 if (type == STRONG_R || type == STRONG_AL) /* P3 */
853 bidi_it->paragraph_dir = R2L;
854 else if (type == STRONG_L)
855 bidi_it->paragraph_dir = L2R;
87e67904
EZ
856 if (!string_p
857 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
bea4f10c
EZ
858 {
859 /* If this paragraph is at BEGV, default to L2R. */
860 if (pstartbyte == BEGV_BYTE)
861 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
862 else
863 {
864 EMACS_INT prevpbyte = pstartbyte;
865 EMACS_INT p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
866
867 /* Find the beginning of the previous paragraph, if any. */
868 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
869 {
182ce2d2
EZ
870 /* FXIME: What if p is covered by a display
871 string? See also a FIXME inside
872 bidi_find_paragraph_start. */
bea4f10c
EZ
873 p--;
874 pbyte = CHAR_TO_BYTE (p);
875 prevpbyte = bidi_find_paragraph_start (p, pbyte);
876 }
877 pstartbyte = prevpbyte;
878 }
879 }
87e67904
EZ
880 } while (!string_p
881 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
b7b65b15 882 }
be39f003
EZ
883 else
884 abort ();
b7b65b15 885
b44d9321
EZ
886 /* Contrary to UAX#9 clause P3, we only default the paragraph
887 direction to L2R if we have no previous usable paragraph
bea4f10c 888 direction. This is allowed by the HL1 clause. */
d20e1419 889 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
bea4f10c 890 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
be39f003 891 if (bidi_it->paragraph_dir == R2L)
b44d9321 892 bidi_it->level_stack[0].level = 1;
be39f003 893 else
b44d9321 894 bidi_it->level_stack[0].level = 0;
be39f003
EZ
895
896 bidi_line_init (bidi_it);
b7b65b15
EZ
897}
898
6bff6497
EZ
899/* Do whatever UAX#9 clause X8 says should be done at paragraph's
900 end. */
fd3998ff 901static INLINE void
b7b65b15
EZ
902bidi_set_paragraph_end (struct bidi_it *bidi_it)
903{
904 bidi_it->invalid_levels = 0;
905 bidi_it->invalid_rl_levels = -1;
906 bidi_it->stack_idx = 0;
907 bidi_it->resolved_level = bidi_it->level_stack[0].level;
b7b65b15
EZ
908}
909
182ce2d2 910/* Initialize the bidi iterator from buffer/string position CHARPOS. */
b7b65b15 911void
fc6f18ce
EZ
912bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, int frame_window_p,
913 struct bidi_it *bidi_it)
b7b65b15
EZ
914{
915 if (! bidi_initialized)
916 bidi_initialize ();
87e67904
EZ
917 if (charpos >= 0)
918 bidi_it->charpos = charpos;
919 if (bytepos >= 0)
920 bidi_it->bytepos = bytepos;
fc6f18ce 921 bidi_it->frame_window_p = frame_window_p;
182ce2d2 922 bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */
6bff6497
EZ
923 bidi_it->first_elt = 1;
924 bidi_set_paragraph_end (bidi_it);
925 bidi_it->new_paragraph = 1;
be39f003 926 bidi_it->separator_limit = -1;
b7b65b15 927 bidi_it->type = NEUTRAL_B;
ebb5722e
EZ
928 bidi_it->type_after_w1 = NEUTRAL_B;
929 bidi_it->orig_type = NEUTRAL_B;
b7b65b15 930 bidi_it->prev_was_pdf = 0;
ebb5722e
EZ
931 bidi_it->prev.type = bidi_it->prev.type_after_w1 =
932 bidi_it->prev.orig_type = UNKNOWN_BT;
89d3374a
EZ
933 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
934 bidi_it->last_strong.orig_type = UNKNOWN_BT;
b7b65b15
EZ
935 bidi_it->next_for_neutral.charpos = -1;
936 bidi_it->next_for_neutral.type =
89d3374a
EZ
937 bidi_it->next_for_neutral.type_after_w1 =
938 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15
EZ
939 bidi_it->prev_for_neutral.charpos = -1;
940 bidi_it->prev_for_neutral.type =
89d3374a
EZ
941 bidi_it->prev_for_neutral.type_after_w1 =
942 bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15 943 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
182ce2d2 944 bidi_it->disp_pos = -1; /* invalid/unknown */
87e67904
EZ
945 /* We can only shrink the cache if we are at the bottom level of its
946 "stack". */
947 if (bidi_cache_start == 0)
948 bidi_cache_shrink ();
b7b65b15
EZ
949}
950
951/* Push the current embedding level and override status; reset the
952 current level to LEVEL and the current override status to OVERRIDE. */
fd3998ff 953static INLINE void
b7b65b15
EZ
954bidi_push_embedding_level (struct bidi_it *bidi_it,
955 int level, bidi_dir_t override)
956{
957 bidi_it->stack_idx++;
958 if (bidi_it->stack_idx >= BIDI_MAXLEVEL)
959 abort ();
960 bidi_it->level_stack[bidi_it->stack_idx].level = level;
961 bidi_it->level_stack[bidi_it->stack_idx].override = override;
962}
963
964/* Pop the embedding level and directional override status from the
965 stack, and return the new level. */
fd3998ff 966static INLINE int
b7b65b15
EZ
967bidi_pop_embedding_level (struct bidi_it *bidi_it)
968{
969 /* UAX#9 says to ignore invalid PDFs. */
970 if (bidi_it->stack_idx > 0)
971 bidi_it->stack_idx--;
972 return bidi_it->level_stack[bidi_it->stack_idx].level;
973}
974
975/* Record in SAVED_INFO the information about the current character. */
fd3998ff 976static INLINE void
b7b65b15
EZ
977bidi_remember_char (struct bidi_saved_info *saved_info,
978 struct bidi_it *bidi_it)
979{
980 saved_info->charpos = bidi_it->charpos;
981 saved_info->bytepos = bidi_it->bytepos;
982 saved_info->type = bidi_it->type;
2d6e4628 983 bidi_check_type (bidi_it->type);
89d3374a
EZ
984 saved_info->type_after_w1 = bidi_it->type_after_w1;
985 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 986 saved_info->orig_type = bidi_it->orig_type;
2d6e4628 987 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
988}
989
990/* Resolve the type of a neutral character according to the type of
991 surrounding strong text and the current embedding level. */
fd3998ff 992static INLINE bidi_type_t
b7b65b15
EZ
993bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
994{
995 /* N1: European and Arabic numbers are treated as though they were R. */
996 if (next_type == WEAK_EN || next_type == WEAK_AN)
997 next_type = STRONG_R;
998 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
999 prev_type = STRONG_R;
1000
1001 if (next_type == prev_type) /* N1 */
1002 return next_type;
1003 else if ((lev & 1) == 0) /* N2 */
1004 return STRONG_L;
1005 else
1006 return STRONG_R;
1007}
1008
fd3998ff 1009static INLINE int
182ce2d2 1010bidi_explicit_dir_char (int ch)
b7b65b15 1011{
182ce2d2
EZ
1012 bidi_type_t ch_type;
1013
1014 if (!bidi_initialized)
1015 abort ();
1016 ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1017 return (ch_type == LRE || ch_type == LRO
1018 || ch_type == RLE || ch_type == RLO
1019 || ch_type == PDF);
b7b65b15
EZ
1020}
1021
1022/* A helper function for bidi_resolve_explicit. It advances to the
1023 next character in logical order and determines the new embedding
1024 level and directional override, but does not take into account
1025 empty embeddings. */
1026static int
1027bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1028{
1029 int curchar;
1030 bidi_type_t type;
1031 int current_level;
1032 int new_level;
1033 bidi_dir_t override;
9f257352 1034 int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
b7b65b15 1035
182ce2d2
EZ
1036 /* If reseat()'ed, don't advance, so as to start iteration from the
1037 position where we were reseated. bidi_it->bytepos can be less
1038 than BEGV_BYTE after reseat to BEGV. */
87e67904 1039 if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
9c82e145 1040 || bidi_it->first_elt)
e7402cb2 1041 {
9c82e145 1042 bidi_it->first_elt = 0;
87e67904
EZ
1043 if (string_p)
1044 {
9f257352
EZ
1045 const unsigned char *p =
1046 STRINGP (bidi_it->string.lstring)
1047 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1048
87e67904
EZ
1049 if (bidi_it->charpos < 0)
1050 bidi_it->charpos = 0;
9f257352 1051 bidi_it->bytepos = bidi_count_bytes (p, 0, 0, bidi_it->charpos);
87e67904
EZ
1052 }
1053 else
1054 {
1055 if (bidi_it->charpos < BEGV)
1056 bidi_it->charpos = BEGV;
1057 bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
1058 }
e7402cb2 1059 }
87e67904
EZ
1060 /* Don't move at end of buffer/string. */
1061 else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
b7b65b15 1062 {
182ce2d2
EZ
1063 /* Advance to the next character, skipping characters covered by
1064 display strings (nchars > 1). */
102ebb00
EZ
1065 if (bidi_it->nchars <= 0)
1066 abort ();
182ce2d2 1067 bidi_it->charpos += bidi_it->nchars;
e7402cb2
EZ
1068 if (bidi_it->ch_len == 0)
1069 abort ();
b7b65b15
EZ
1070 bidi_it->bytepos += bidi_it->ch_len;
1071 }
1072
1073 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1074 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1075 new_level = current_level;
1076
87e67904 1077 if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
e7402cb2
EZ
1078 {
1079 curchar = BIDI_EOB;
1080 bidi_it->ch_len = 1;
182ce2d2 1081 bidi_it->nchars = 1;
87e67904 1082 bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
e7402cb2
EZ
1083 }
1084 else
1085 {
182ce2d2
EZ
1086 /* Fetch the character at BYTEPOS. If it is covered by a
1087 display string, treat the entire run of covered characters as
1088 a single character u+FFFC. */
102ebb00 1089 curchar = bidi_fetch_char (bidi_it->bytepos, bidi_it->charpos,
87e67904
EZ
1090 &bidi_it->disp_pos, &bidi_it->string,
1091 bidi_it->frame_window_p,
102ebb00 1092 &bidi_it->ch_len, &bidi_it->nchars);
e7402cb2 1093 }
b7b65b15 1094 bidi_it->ch = curchar;
b7b65b15 1095
6bff6497
EZ
1096 /* Don't apply directional override here, as all the types we handle
1097 below will not be affected by the override anyway, and we need
1098 the original type unaltered. The override will be applied in
1099 bidi_resolve_weak. */
1100 type = bidi_get_type (curchar, NEUTRAL_DIR);
89d3374a
EZ
1101 bidi_it->orig_type = type;
1102 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
1103
1104 if (type != PDF)
1105 bidi_it->prev_was_pdf = 0;
1106
89d3374a 1107 bidi_it->type_after_w1 = UNKNOWN_BT;
b7b65b15
EZ
1108
1109 switch (type)
1110 {
1111 case RLE: /* X2 */
1112 case RLO: /* X4 */
89d3374a
EZ
1113 bidi_it->type_after_w1 = type;
1114 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1115 type = WEAK_BN; /* X9/Retaining */
87e67904 1116 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1117 {
1118 if (current_level <= BIDI_MAXLEVEL - 4)
1119 {
1120 /* Compute the least odd embedding level greater than
1121 the current level. */
1122 new_level = ((current_level + 1) & ~1) + 1;
89d3374a 1123 if (bidi_it->type_after_w1 == RLE)
b7b65b15
EZ
1124 override = NEUTRAL_DIR;
1125 else
1126 override = R2L;
1127 if (current_level == BIDI_MAXLEVEL - 4)
1128 bidi_it->invalid_rl_levels = 0;
1129 bidi_push_embedding_level (bidi_it, new_level, override);
1130 }
1131 else
1132 {
1133 bidi_it->invalid_levels++;
1134 /* See the commentary about invalid_rl_levels below. */
1135 if (bidi_it->invalid_rl_levels < 0)
1136 bidi_it->invalid_rl_levels = 0;
1137 bidi_it->invalid_rl_levels++;
1138 }
1139 }
89d3374a 1140 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1141 || bidi_it->next_en_pos > bidi_it->charpos)
1142 type = WEAK_EN;
1143 break;
1144 case LRE: /* X3 */
1145 case LRO: /* X5 */
89d3374a
EZ
1146 bidi_it->type_after_w1 = type;
1147 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1148 type = WEAK_BN; /* X9/Retaining */
87e67904 1149 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1150 {
1151 if (current_level <= BIDI_MAXLEVEL - 5)
1152 {
1153 /* Compute the least even embedding level greater than
1154 the current level. */
1155 new_level = ((current_level + 2) & ~1);
89d3374a 1156 if (bidi_it->type_after_w1 == LRE)
b7b65b15
EZ
1157 override = NEUTRAL_DIR;
1158 else
1159 override = L2R;
1160 bidi_push_embedding_level (bidi_it, new_level, override);
1161 }
1162 else
1163 {
1164 bidi_it->invalid_levels++;
1165 /* invalid_rl_levels counts invalid levels encountered
1166 while the embedding level was already too high for
1167 LRE/LRO, but not for RLE/RLO. That is because
1168 there may be exactly one PDF which we should not
1169 ignore even though invalid_levels is non-zero.
1170 invalid_rl_levels helps to know what PDF is
1171 that. */
1172 if (bidi_it->invalid_rl_levels >= 0)
1173 bidi_it->invalid_rl_levels++;
1174 }
1175 }
89d3374a 1176 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1177 || bidi_it->next_en_pos > bidi_it->charpos)
1178 type = WEAK_EN;
1179 break;
1180 case PDF: /* X7 */
89d3374a
EZ
1181 bidi_it->type_after_w1 = type;
1182 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1183 type = WEAK_BN; /* X9/Retaining */
87e67904 1184 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1185 {
1186 if (!bidi_it->invalid_rl_levels)
1187 {
1188 new_level = bidi_pop_embedding_level (bidi_it);
1189 bidi_it->invalid_rl_levels = -1;
1190 if (bidi_it->invalid_levels)
1191 bidi_it->invalid_levels--;
1192 /* else nothing: UAX#9 says to ignore invalid PDFs */
1193 }
1194 if (!bidi_it->invalid_levels)
1195 new_level = bidi_pop_embedding_level (bidi_it);
1196 else
1197 {
1198 bidi_it->invalid_levels--;
1199 bidi_it->invalid_rl_levels--;
1200 }
1201 }
89d3374a 1202 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1203 || bidi_it->next_en_pos > bidi_it->charpos)
1204 type = WEAK_EN;
1205 break;
1206 default:
1207 /* Nothing. */
1208 break;
1209 }
1210
1211 bidi_it->type = type;
2d6e4628 1212 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1213
1214 return new_level;
1215}
1216
1217/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1218 in the buffer/string to the next character (in the logical order),
1219 resolve any explicit embeddings and directional overrides, and
1220 return the embedding level of the character after resolving
1221 explicit directives and ignoring empty embeddings. */
b7b65b15
EZ
1222static int
1223bidi_resolve_explicit (struct bidi_it *bidi_it)
1224{
1225 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1226 int new_level = bidi_resolve_explicit_1 (bidi_it);
87e67904 1227 EMACS_INT eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
9f257352
EZ
1228 const unsigned char *s = STRINGP (bidi_it->string.lstring)
1229 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
b7b65b15
EZ
1230
1231 if (prev_level < new_level
1232 && bidi_it->type == WEAK_BN
87e67904
EZ
1233 && bidi_it->ignore_bn_limit == -1 /* only if not already known */
1234 && bidi_it->charpos < eob /* not already at EOB */
1235 && bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
9f257352 1236 + bidi_it->ch_len, s)))
b7b65b15
EZ
1237 {
1238 /* Avoid pushing and popping embedding levels if the level run
1239 is empty, as this breaks level runs where it shouldn't.
1240 UAX#9 removes all the explicit embedding and override codes,
1241 so empty embeddings disappear without a trace. We need to
1242 behave as if we did the same. */
1243 struct bidi_it saved_it;
1244 int level = prev_level;
1245
1246 bidi_copy_it (&saved_it, bidi_it);
1247
87e67904 1248 while (bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
9f257352 1249 + bidi_it->ch_len, s)))
b7b65b15 1250 {
182ce2d2
EZ
1251 /* This advances to the next character, skipping any
1252 characters covered by display strings. */
b7b65b15 1253 level = bidi_resolve_explicit_1 (bidi_it);
9f257352
EZ
1254 /* If string.lstring was relocated inside bidi_resolve_explicit_1,
1255 a pointer to its data is no longer valid. */
1256 if (STRINGP (bidi_it->string.lstring))
1257 s = SDATA (bidi_it->string.lstring);
b7b65b15
EZ
1258 }
1259
102ebb00
EZ
1260 if (bidi_it->nchars <= 0)
1261 abort ();
b7b65b15 1262 if (level == prev_level) /* empty embedding */
182ce2d2 1263 saved_it.ignore_bn_limit = bidi_it->charpos + bidi_it->nchars;
b7b65b15 1264 else /* this embedding is non-empty */
87e67904 1265 saved_it.ignore_bn_limit = -2;
b7b65b15
EZ
1266
1267 bidi_copy_it (bidi_it, &saved_it);
87e67904 1268 if (bidi_it->ignore_bn_limit > -1)
b7b65b15
EZ
1269 {
1270 /* We pushed a level, but we shouldn't have. Undo that. */
1271 if (!bidi_it->invalid_rl_levels)
1272 {
1273 new_level = bidi_pop_embedding_level (bidi_it);
1274 bidi_it->invalid_rl_levels = -1;
1275 if (bidi_it->invalid_levels)
1276 bidi_it->invalid_levels--;
1277 }
1278 if (!bidi_it->invalid_levels)
1279 new_level = bidi_pop_embedding_level (bidi_it);
1280 else
1281 {
1282 bidi_it->invalid_levels--;
1283 bidi_it->invalid_rl_levels--;
1284 }
1285 }
1286 }
1287
b7b65b15
EZ
1288 if (bidi_it->type == NEUTRAL_B) /* X8 */
1289 {
21fce5ab 1290 bidi_set_paragraph_end (bidi_it);
6bff6497
EZ
1291 /* This is needed by bidi_resolve_weak below, and in L1. */
1292 bidi_it->type_after_w1 = bidi_it->type;
89d3374a 1293 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1294 }
1295
1296 return new_level;
1297}
1298
182ce2d2
EZ
1299/* Advance in the buffer/string, resolve weak types and return the
1300 type of the next character after weak type resolution. */
fd3998ff 1301static bidi_type_t
b7b65b15
EZ
1302bidi_resolve_weak (struct bidi_it *bidi_it)
1303{
1304 bidi_type_t type;
1305 bidi_dir_t override;
1306 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1307 int new_level = bidi_resolve_explicit (bidi_it);
1308 int next_char;
1309 bidi_type_t type_of_next;
1310 struct bidi_it saved_it;
9f257352
EZ
1311 EMACS_INT eob =
1312 (STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
1313 ? bidi_it->string.schars : ZV;
b7b65b15
EZ
1314
1315 type = bidi_it->type;
1316 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1317
1318 if (type == UNKNOWN_BT
1319 || type == LRE
1320 || type == LRO
1321 || type == RLE
1322 || type == RLO
1323 || type == PDF)
1324 abort ();
1325
1326 if (new_level != prev_level
1327 || bidi_it->type == NEUTRAL_B)
1328 {
1329 /* We've got a new embedding level run, compute the directional
1330 type of sor and initialize per-run variables (UAX#9, clause
1331 X10). */
1332 bidi_set_sor_type (bidi_it, prev_level, new_level);
1333 }
1334 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1335 || type == WEAK_BN || type == STRONG_AL)
89d3374a
EZ
1336 bidi_it->type_after_w1 = type; /* needed in L1 */
1337 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1338
1339 /* Level and directional override status are already recorded in
1340 bidi_it, and do not need any change; see X6. */
1341 if (override == R2L) /* X6 */
1342 type = STRONG_R;
1343 else if (override == L2R)
1344 type = STRONG_L;
bc5a45f3 1345 else
b7b65b15 1346 {
bc5a45f3 1347 if (type == WEAK_NSM) /* W1 */
b7b65b15 1348 {
bc5a45f3 1349 /* Note that we don't need to consider the case where the
5930fe97
EZ
1350 prev character has its type overridden by an RLO or LRO,
1351 because then either the type of this NSM would have been
1352 also overridden, or the previous character is outside the
1353 current level run, and thus not relevant to this NSM.
1354 This is why NSM gets the type_after_w1 of the previous
1355 character. */
ebb5722e
EZ
1356 if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1357 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1358 && bidi_it->prev.type_after_w1 != NEUTRAL_B)
5930fe97 1359 type = bidi_it->prev.type_after_w1;
bc5a45f3
EZ
1360 else if (bidi_it->sor == R2L)
1361 type = STRONG_R;
1362 else if (bidi_it->sor == L2R)
1363 type = STRONG_L;
1364 else /* shouldn't happen! */
1365 abort ();
b7b65b15 1366 }
bc5a45f3
EZ
1367 if (type == WEAK_EN /* W2 */
1368 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1369 type = WEAK_AN;
1370 else if (type == STRONG_AL) /* W3 */
1371 type = STRONG_R;
1372 else if ((type == WEAK_ES /* W4 */
1373 && bidi_it->prev.type_after_w1 == WEAK_EN
1374 && bidi_it->prev.orig_type == WEAK_EN)
1375 || (type == WEAK_CS
1376 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1377 && bidi_it->prev.orig_type == WEAK_EN)
1378 || bidi_it->prev.type_after_w1 == WEAK_AN)))
b7b65b15 1379 {
9f257352
EZ
1380 const unsigned char *s =
1381 STRINGP (bidi_it->string.lstring)
1382 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1383
e7402cb2 1384 next_char =
87e67904
EZ
1385 bidi_it->charpos + bidi_it->nchars >= eob
1386 ? BIDI_EOB
9f257352 1387 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s);
6bff6497 1388 type_of_next = bidi_get_type (next_char, override);
b7b65b15 1389
bc5a45f3 1390 if (type_of_next == WEAK_BN
b7b65b15
EZ
1391 || bidi_explicit_dir_char (next_char))
1392 {
1393 bidi_copy_it (&saved_it, bidi_it);
1394 while (bidi_resolve_explicit (bidi_it) == new_level
bc5a45f3 1395 && bidi_it->type == WEAK_BN)
b7b65b15
EZ
1396 ;
1397 type_of_next = bidi_it->type;
b7b65b15
EZ
1398 bidi_copy_it (bidi_it, &saved_it);
1399 }
bc5a45f3
EZ
1400
1401 /* If the next character is EN, but the last strong-type
1402 character is AL, that next EN will be changed to AN when
1403 we process it in W2 above. So in that case, this ES
1404 should not be changed into EN. */
1405 if (type == WEAK_ES
1406 && type_of_next == WEAK_EN
1407 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1408 type = WEAK_EN;
1409 else if (type == WEAK_CS)
b7b65b15 1410 {
bc5a45f3
EZ
1411 if (bidi_it->prev.type_after_w1 == WEAK_AN
1412 && (type_of_next == WEAK_AN
1413 /* If the next character is EN, but the last
1414 strong-type character is AL, EN will be later
1415 changed to AN when we process it in W2 above.
1416 So in that case, this ES should not be
1417 changed into EN. */
1418 || (type_of_next == WEAK_EN
1419 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1420 type = WEAK_AN;
1421 else if (bidi_it->prev.type_after_w1 == WEAK_EN
1422 && type_of_next == WEAK_EN
1423 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1424 type = WEAK_EN;
1425 }
1426 }
1427 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1428 || type == WEAK_BN) /* W5/Retaining */
1429 {
1430 if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
1431 || bidi_it->next_en_pos > bidi_it->charpos)
1432 type = WEAK_EN;
1433 else /* W5: ET/BN with EN after it. */
1434 {
182ce2d2 1435 EMACS_INT en_pos = bidi_it->charpos + bidi_it->nchars;
9f257352
EZ
1436 const unsigned char *s =
1437 STRINGP (bidi_it->string.lstring)
1438 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
bc5a45f3 1439
102ebb00
EZ
1440 if (bidi_it->nchars <= 0)
1441 abort ();
bc5a45f3 1442 next_char =
87e67904
EZ
1443 bidi_it->charpos + bidi_it->nchars >= eob
1444 ? BIDI_EOB
9f257352 1445 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s);
bc5a45f3
EZ
1446 type_of_next = bidi_get_type (next_char, override);
1447
1448 if (type_of_next == WEAK_ET
1449 || type_of_next == WEAK_BN
1450 || bidi_explicit_dir_char (next_char))
1451 {
1452 bidi_copy_it (&saved_it, bidi_it);
1453 while (bidi_resolve_explicit (bidi_it) == new_level
1454 && (bidi_it->type == WEAK_BN
1455 || bidi_it->type == WEAK_ET))
1456 ;
1457 type_of_next = bidi_it->type;
1458 en_pos = bidi_it->charpos;
1459 bidi_copy_it (bidi_it, &saved_it);
1460 }
1461 if (type_of_next == WEAK_EN)
b7b65b15 1462 {
bc5a45f3
EZ
1463 /* If the last strong character is AL, the EN we've
1464 found will become AN when we get to it (W2). */
1465 if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
1466 {
1467 type = WEAK_EN;
1468 /* Remember this EN position, to speed up processing
1469 of the next ETs. */
1470 bidi_it->next_en_pos = en_pos;
1471 }
1472 else if (type == WEAK_BN)
1473 type = NEUTRAL_ON; /* W6/Retaining */
b7b65b15 1474 }
b7b65b15
EZ
1475 }
1476 }
1477 }
1478
1479 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
89d3374a
EZ
1480 || (type == WEAK_BN
1481 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1482 || bidi_it->prev.type_after_w1 == WEAK_ES
1483 || bidi_it->prev.type_after_w1 == WEAK_ET)))
b7b65b15
EZ
1484 type = NEUTRAL_ON;
1485
1486 /* Store the type we've got so far, before we clobber it with strong
1487 types in W7 and while resolving neutral types. But leave alone
1488 the original types that were recorded above, because we will need
1489 them for the L1 clause. */
89d3374a
EZ
1490 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1491 bidi_it->type_after_w1 = type;
1492 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1493
1494 if (type == WEAK_EN) /* W7 */
1495 {
89d3374a 1496 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
b7b65b15
EZ
1497 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1498 type = STRONG_L;
1499 }
1500
1501 bidi_it->type = type;
2d6e4628 1502 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1503 return type;
1504}
1505
fd3998ff 1506static bidi_type_t
b7b65b15
EZ
1507bidi_resolve_neutral (struct bidi_it *bidi_it)
1508{
1509 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1510 bidi_type_t type = bidi_resolve_weak (bidi_it);
1511 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1512
1513 if (!(type == STRONG_R
1514 || type == STRONG_L
1515 || type == WEAK_BN
1516 || type == WEAK_EN
1517 || type == WEAK_AN
1518 || type == NEUTRAL_B
1519 || type == NEUTRAL_S
1520 || type == NEUTRAL_WS
1521 || type == NEUTRAL_ON))
1522 abort ();
1523
1524 if (bidi_get_category (type) == NEUTRAL
1525 || (type == WEAK_BN && prev_level == current_level))
1526 {
1527 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1528 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1529 bidi_it->next_for_neutral.type,
1530 current_level);
1531 else
1532 {
1533 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1534 the assumption of batch-style processing; see clauses W4,
1535 W5, and especially N1, which require to look far forward
182ce2d2
EZ
1536 (as well as back) in the buffer/string. May the fleas of
1537 a thousand camels infest the armpits of those who design
b7b65b15
EZ
1538 supposedly general-purpose algorithms by looking at their
1539 own implementations, and fail to consider other possible
1540 implementations! */
1541 struct bidi_it saved_it;
1542 bidi_type_t next_type;
1543
1544 if (bidi_it->scan_dir == -1)
1545 abort ();
1546
1547 bidi_copy_it (&saved_it, bidi_it);
1548 /* Scan the text forward until we find the first non-neutral
1549 character, and then use that to resolve the neutral we
1550 are dealing with now. We also cache the scanned iterator
1551 states, to salvage some of the effort later. */
1552 bidi_cache_iterator_state (bidi_it, 0);
1553 do {
1554 /* Record the info about the previous character, so that
1555 it will be cached below with this state. */
89d3374a 1556 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1557 && bidi_it->type != WEAK_BN)
1558 bidi_remember_char (&bidi_it->prev, bidi_it);
1559 type = bidi_resolve_weak (bidi_it);
1560 /* Paragraph separators have their levels fully resolved
1561 at this point, so cache them as resolved. */
1562 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1563 /* FIXME: implement L1 here, by testing for a newline and
1564 resetting the level for any sequence of whitespace
1565 characters adjacent to it. */
1566 } while (!(type == NEUTRAL_B
1567 || (type != WEAK_BN
1568 && bidi_get_category (type) != NEUTRAL)
1569 /* This is all per level run, so stop when we
1570 reach the end of this level run. */
1571 || bidi_it->level_stack[bidi_it->stack_idx].level !=
1572 current_level));
1573
1574 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1575
1576 switch (type)
1577 {
1578 case STRONG_L:
1579 case STRONG_R:
1580 case STRONG_AL:
1581 next_type = type;
1582 break;
1583 case WEAK_EN:
1584 case WEAK_AN:
1585 /* N1: ``European and Arabic numbers are treated as
1586 though they were R.'' */
1587 next_type = STRONG_R;
1588 saved_it.next_for_neutral.type = STRONG_R;
1589 break;
1590 case WEAK_BN:
1591 if (!bidi_explicit_dir_char (bidi_it->ch))
1592 abort (); /* can't happen: BNs are skipped */
1593 /* FALLTHROUGH */
1594 case NEUTRAL_B:
1595 /* Marched all the way to the end of this level run.
1596 We need to use the eor type, whose information is
1597 stored by bidi_set_sor_type in the prev_for_neutral
1598 member. */
1599 if (saved_it.type != WEAK_BN
89d3374a 1600 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
b7b65b15
EZ
1601 {
1602 next_type = bidi_it->prev_for_neutral.type;
1603 saved_it.next_for_neutral.type = next_type;
2d6e4628 1604 bidi_check_type (next_type);
b7b65b15
EZ
1605 }
1606 else
1607 {
1608 /* This is a BN which does not adjoin neutrals.
1609 Leave its type alone. */
1610 bidi_copy_it (bidi_it, &saved_it);
1611 return bidi_it->type;
1612 }
1613 break;
1614 default:
1615 abort ();
1616 }
1617 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1618 next_type, current_level);
1619 saved_it.type = type;
2d6e4628 1620 bidi_check_type (type);
b7b65b15
EZ
1621 bidi_copy_it (bidi_it, &saved_it);
1622 }
1623 }
1624 return type;
1625}
1626
1627/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1628 in the buffer/string to the next character (in the logical order),
1629 resolve the bidi type of that next character, and return that
1630 type. */
fd3998ff 1631static bidi_type_t
b7b65b15
EZ
1632bidi_type_of_next_char (struct bidi_it *bidi_it)
1633{
1634 bidi_type_t type;
1635
1636 /* This should always be called during a forward scan. */
1637 if (bidi_it->scan_dir != 1)
1638 abort ();
1639
1640 /* Reset the limit until which to ignore BNs if we step out of the
1641 area where we found only empty levels. */
87e67904 1642 if ((bidi_it->ignore_bn_limit > -1
b7b65b15 1643 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
87e67904 1644 || (bidi_it->ignore_bn_limit == -2
b7b65b15 1645 && !bidi_explicit_dir_char (bidi_it->ch)))
87e67904 1646 bidi_it->ignore_bn_limit = -1;
b7b65b15
EZ
1647
1648 type = bidi_resolve_neutral (bidi_it);
1649
1650 return type;
1651}
1652
1653/* Given an iterator state BIDI_IT, advance one character position in
182ce2d2
EZ
1654 the buffer/string to the next character (in the current scan
1655 direction), resolve the embedding and implicit levels of that next
1656 character, and return the resulting level. */
fd3998ff 1657static int
b7b65b15
EZ
1658bidi_level_of_next_char (struct bidi_it *bidi_it)
1659{
1660 bidi_type_t type;
1661 int level, prev_level = -1;
1662 struct bidi_saved_info next_for_neutral;
578b494e 1663 EMACS_INT next_char_pos = -2;
b7b65b15
EZ
1664
1665 if (bidi_it->scan_dir == 1)
1666 {
9f257352
EZ
1667 EMACS_INT eob =
1668 (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
1669 ? bidi_it->string.schars : ZV;
1670
b7b65b15 1671 /* There's no sense in trying to advance if we hit end of text. */
9f257352 1672 if (bidi_it->charpos >= eob)
b7b65b15
EZ
1673 return bidi_it->resolved_level;
1674
1675 /* Record the info about the previous character. */
89d3374a 1676 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1677 && bidi_it->type != WEAK_BN)
1678 bidi_remember_char (&bidi_it->prev, bidi_it);
89d3374a
EZ
1679 if (bidi_it->type_after_w1 == STRONG_R
1680 || bidi_it->type_after_w1 == STRONG_L
1681 || bidi_it->type_after_w1 == STRONG_AL)
b7b65b15
EZ
1682 bidi_remember_char (&bidi_it->last_strong, bidi_it);
1683 /* FIXME: it sounds like we don't need both prev and
1684 prev_for_neutral members, but I'm leaving them both for now. */
1685 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1686 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1687 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1688
1689 /* If we overstepped the characters used for resolving neutrals
1690 and whitespace, invalidate their info in the iterator. */
1691 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1692 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1693 if (bidi_it->next_en_pos >= 0
1694 && bidi_it->charpos >= bidi_it->next_en_pos)
1695 bidi_it->next_en_pos = -1;
1696 if (bidi_it->next_for_ws.type != UNKNOWN_BT
1697 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1698 bidi_it->next_for_ws.type = UNKNOWN_BT;
1699
1700 /* This must be taken before we fill the iterator with the info
1701 about the next char. If we scan backwards, the iterator
1702 state must be already cached, so there's no need to know the
1703 embedding level of the previous character, since we will be
1704 returning to our caller shortly. */
1705 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1706 }
1707 next_for_neutral = bidi_it->next_for_neutral;
1708
182ce2d2
EZ
1709 /* Perhaps the character we want is already cached. If it is, the
1710 call to bidi_cache_find below will return a type other than
1711 UNKNOWN_BT. */
87e67904 1712 if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
102ebb00 1713 {
9f257352
EZ
1714 int bob =
1715 (bidi_it->string.s || STRINGP (bidi_it->string.lstring)) ? 0 : 1;
1716
102ebb00
EZ
1717 if (bidi_it->scan_dir > 0)
1718 {
1719 if (bidi_it->nchars <= 0)
1720 abort ();
1721 next_char_pos = bidi_it->charpos + bidi_it->nchars;
1722 }
9f257352 1723 else if (bidi_it->charpos >= bob)
bb269206
EZ
1724 /* Implementation note: we allow next_char_pos to be as low as
1725 0 for buffers or -1 for strings, and that is okay because
1726 that's the "position" of the sentinel iterator state we
1727 cached at the beginning of the iteration. */
102ebb00 1728 next_char_pos = bidi_it->charpos - 1;
578b494e 1729 if (next_char_pos >= bob - 1)
87e67904
EZ
1730 type = bidi_cache_find (next_char_pos, -1, bidi_it);
1731 else
1732 type = UNKNOWN_BT;
102ebb00 1733 }
182ce2d2 1734 else
102ebb00 1735 type = UNKNOWN_BT;
b7b65b15
EZ
1736 if (type != UNKNOWN_BT)
1737 {
1738 /* Don't lose the information for resolving neutrals! The
1739 cached states could have been cached before their
1740 next_for_neutral member was computed. If we are on our way
1741 forward, we can simply take the info from the previous
1742 state. */
1743 if (bidi_it->scan_dir == 1
1744 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
1745 bidi_it->next_for_neutral = next_for_neutral;
1746
1747 /* If resolved_level is -1, it means this state was cached
1748 before it was completely resolved, so we cannot return
1749 it. */
1750 if (bidi_it->resolved_level != -1)
1751 return bidi_it->resolved_level;
1752 }
1753 if (bidi_it->scan_dir == -1)
1754 /* If we are going backwards, the iterator state is already cached
1755 from previous scans, and should be fully resolved. */
1756 abort ();
1757
1758 if (type == UNKNOWN_BT)
1759 type = bidi_type_of_next_char (bidi_it);
1760
1761 if (type == NEUTRAL_B)
1762 return bidi_it->resolved_level;
1763
1764 level = bidi_it->level_stack[bidi_it->stack_idx].level;
1765 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
1766 || (type == WEAK_BN && prev_level == level))
1767 {
1768 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1769 abort ();
1770
1771 /* If the cached state shows a neutral character, it was not
1772 resolved by bidi_resolve_neutral, so do it now. */
1773 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1774 bidi_it->next_for_neutral.type,
1775 level);
1776 }
1777
1778 if (!(type == STRONG_R
1779 || type == STRONG_L
1780 || type == WEAK_BN
1781 || type == WEAK_EN
1782 || type == WEAK_AN))
1783 abort ();
1784 bidi_it->type = type;
2d6e4628 1785 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1786
1787 /* For L1 below, we need to know, for each WS character, whether
0d327994 1788 it belongs to a sequence of WS characters preceding a newline
b7b65b15 1789 or a TAB or a paragraph separator. */
89d3374a 1790 if (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1791 && bidi_it->next_for_ws.type == UNKNOWN_BT)
1792 {
1793 int ch;
9d68c2a9 1794 EMACS_INT clen = bidi_it->ch_len;
6bff6497
EZ
1795 EMACS_INT bpos = bidi_it->bytepos;
1796 EMACS_INT cpos = bidi_it->charpos;
182ce2d2 1797 EMACS_INT disp_pos = bidi_it->disp_pos;
102ebb00 1798 EMACS_INT nc = bidi_it->nchars;
87e67904 1799 struct bidi_string_data bs = bidi_it->string;
b7b65b15 1800 bidi_type_t chtype;
fc6f18ce 1801 int fwp = bidi_it->frame_window_p;
b7b65b15 1802
102ebb00
EZ
1803 if (bidi_it->nchars <= 0)
1804 abort ();
b7b65b15 1805 do {
87e67904 1806 ch = bidi_fetch_char (bpos += clen, cpos += nc, &disp_pos, &bs, fwp,
fc6f18ce 1807 &clen, &nc);
e7402cb2 1808 if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
b7b65b15
EZ
1809 chtype = NEUTRAL_B;
1810 else
6bff6497 1811 chtype = bidi_get_type (ch, NEUTRAL_DIR);
b7b65b15
EZ
1812 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
1813 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
1814 bidi_it->next_for_ws.type = chtype;
2d6e4628 1815 bidi_check_type (bidi_it->next_for_ws.type);
b7b65b15
EZ
1816 bidi_it->next_for_ws.charpos = cpos;
1817 bidi_it->next_for_ws.bytepos = bpos;
1818 }
1819
1820 /* Resolve implicit levels, with a twist: PDFs get the embedding
1821 level of the enbedding they terminate. See below for the
1822 reason. */
89d3374a 1823 if (bidi_it->orig_type == PDF
b7b65b15
EZ
1824 /* Don't do this if this formatting code didn't change the
1825 embedding level due to invalid or empty embeddings. */
1826 && prev_level != level)
1827 {
1828 /* Don't look in UAX#9 for the reason for this: it's our own
1829 private quirk. The reason is that we want the formatting
1830 codes to be delivered so that they bracket the text of their
1831 embedding. For example, given the text
1832
1833 {RLO}teST{PDF}
1834
1835 we want it to be displayed as
1836
0d68907d 1837 {PDF}STet{RLO}
b7b65b15
EZ
1838
1839 not as
1840
1841 STet{RLO}{PDF}
1842
1843 which will result because we bump up the embedding level as
1844 soon as we see the RLO and pop it as soon as we see the PDF,
1845 so RLO itself has the same embedding level as "teST", and
1846 thus would be normally delivered last, just before the PDF.
1847 The switch below fiddles with the level of PDF so that this
1848 ugly side effect does not happen.
1849
1850 (This is, of course, only important if the formatting codes
e7402cb2
EZ
1851 are actually displayed, but Emacs does need to display them
1852 if the user wants to.) */
b7b65b15
EZ
1853 level = prev_level;
1854 }
89d3374a
EZ
1855 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
1856 || bidi_it->orig_type == NEUTRAL_S
e7402cb2
EZ
1857 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
1858 /* || bidi_it->ch == LINESEP_CHAR */
89d3374a 1859 || (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1860 && (bidi_it->next_for_ws.type == NEUTRAL_B
1861 || bidi_it->next_for_ws.type == NEUTRAL_S)))
1862 level = bidi_it->level_stack[0].level;
1863 else if ((level & 1) == 0) /* I1 */
1864 {
1865 if (type == STRONG_R)
1866 level++;
1867 else if (type == WEAK_EN || type == WEAK_AN)
1868 level += 2;
1869 }
1870 else /* I2 */
1871 {
1872 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
1873 level++;
1874 }
1875
1876 bidi_it->resolved_level = level;
1877 return level;
1878}
1879
1880/* Move to the other edge of a level given by LEVEL. If END_FLAG is
1881 non-zero, we are at the end of a level, and we need to prepare to
1882 resume the scan of the lower level.
1883
1884 If this level's other edge is cached, we simply jump to it, filling
1885 the iterator structure with the iterator state on the other edge.
182ce2d2
EZ
1886 Otherwise, we walk the buffer or string until we come back to the
1887 same level as LEVEL.
b7b65b15
EZ
1888
1889 Note: we are not talking here about a ``level run'' in the UAX#9
1890 sense of the term, but rather about a ``level'' which includes
1891 all the levels higher than it. In other words, given the levels
1892 like this:
1893
1894 11111112222222333333334443343222222111111112223322111
1895 A B C
1896
1897 and assuming we are at point A scanning left to right, this
1898 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
1899 at point B. */
1900static void
1901bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
1902{
1903 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
1904 int idx;
1905
1906 /* Try the cache first. */
87e67904
EZ
1907 if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
1908 >= bidi_cache_start)
b7b65b15
EZ
1909 bidi_cache_fetch_state (idx, bidi_it);
1910 else
1911 {
1912 int new_level;
1913
1914 if (end_flag)
1915 abort (); /* if we are at end of level, its edges must be cached */
1916
1917 bidi_cache_iterator_state (bidi_it, 1);
1918 do {
1919 new_level = bidi_level_of_next_char (bidi_it);
1920 bidi_cache_iterator_state (bidi_it, 1);
1921 } while (new_level >= level);
1922 }
1923}
1924
1925void
4b292a22 1926bidi_move_to_visually_next (struct bidi_it *bidi_it)
b7b65b15
EZ
1927{
1928 int old_level, new_level, next_level;
9c82e145 1929 struct bidi_it sentinel;
b7b65b15 1930
87e67904
EZ
1931 if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
1932 abort ();
1933
b7b65b15
EZ
1934 if (bidi_it->scan_dir == 0)
1935 {
1936 bidi_it->scan_dir = 1; /* default to logical order */
1937 }
1938
be39f003
EZ
1939 /* If we just passed a newline, initialize for the next line. */
1940 if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
1941 bidi_line_init (bidi_it);
1942
6dcfd253
EZ
1943 /* Prepare the sentinel iterator state, and cache it. When we bump
1944 into it, scanning backwards, we'll know that the last non-base
1945 level is exhausted. */
87e67904 1946 if (bidi_cache_idx == bidi_cache_start)
9c82e145
EZ
1947 {
1948 bidi_copy_it (&sentinel, bidi_it);
1949 if (bidi_it->first_elt)
1950 {
1951 sentinel.charpos--; /* cached charpos needs to be monotonic */
1952 sentinel.bytepos--;
1953 sentinel.ch = '\n'; /* doesn't matter, but why not? */
1954 sentinel.ch_len = 1;
182ce2d2 1955 sentinel.nchars = 1;
9c82e145 1956 }
6dcfd253 1957 bidi_cache_iterator_state (&sentinel, 1);
9c82e145 1958 }
b7b65b15
EZ
1959
1960 old_level = bidi_it->resolved_level;
1961 new_level = bidi_level_of_next_char (bidi_it);
b7b65b15
EZ
1962
1963 /* Reordering of resolved levels (clause L2) is implemented by
1964 jumping to the other edge of the level and flipping direction of
c0546589 1965 scanning the text whenever we find a level change. */
b7b65b15
EZ
1966 if (new_level != old_level)
1967 {
1968 int ascending = new_level > old_level;
1969 int level_to_search = ascending ? old_level + 1 : old_level;
1970 int incr = ascending ? 1 : -1;
1971 int expected_next_level = old_level + incr;
1972
b7b65b15
EZ
1973 /* Jump (or walk) to the other edge of this level. */
1974 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1975 /* Switch scan direction and peek at the next character in the
1976 new direction. */
1977 bidi_it->scan_dir = -bidi_it->scan_dir;
1978
1979 /* The following loop handles the case where the resolved level
1980 jumps by more than one. This is typical for numbers inside a
1981 run of text with left-to-right embedding direction, but can
1982 also happen in other situations. In those cases the decision
1983 where to continue after a level change, and in what direction,
1984 is tricky. For example, given a text like below:
1985
1986 abcdefgh
1987 11336622
1988
1989 (where the numbers below the text show the resolved levels),
1990 the result of reordering according to UAX#9 should be this:
1991
1992 efdcghba
1993
1994 This is implemented by the loop below which flips direction
1995 and jumps to the other edge of the level each time it finds
1996 the new level not to be the expected one. The expected level
1997 is always one more or one less than the previous one. */
1998 next_level = bidi_peek_at_next_level (bidi_it);
1999 while (next_level != expected_next_level)
2000 {
2001 expected_next_level += incr;
2002 level_to_search += incr;
2003 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2004 bidi_it->scan_dir = -bidi_it->scan_dir;
2005 next_level = bidi_peek_at_next_level (bidi_it);
2006 }
2007
2008 /* Finally, deliver the next character in the new direction. */
2009 next_level = bidi_level_of_next_char (bidi_it);
2010 }
2011
b44d9321
EZ
2012 /* Take note when we have just processed the newline that precedes
2013 the end of the paragraph. The next time we are about to be
2014 called, set_iterator_to_next will automatically reinit the
2015 paragraph direction, if needed. We do this at the newline before
2016 the paragraph separator, because the next character might not be
2017 the first character of the next paragraph, due to the bidi
c0546589
EZ
2018 reordering, whereas we _must_ know the paragraph base direction
2019 _before_ we process the paragraph's text, since the base
2020 direction affects the reordering. */
87e67904 2021 if (bidi_it->scan_dir == 1 && bidi_it->orig_type == NEUTRAL_B)
be39f003 2022 {
87e67904
EZ
2023 /* The paragraph direction of the entire string, once
2024 determined, is in effect for the entire string. Setting the
2025 separator limit to the end of the string prevents
2026 bidi_paragraph_init from being called automatically on this
2027 string. */
9f257352 2028 if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
87e67904
EZ
2029 bidi_it->separator_limit = bidi_it->string.schars;
2030 else if (bidi_it->bytepos < ZV_BYTE)
be39f003 2031 {
87e67904
EZ
2032 EMACS_INT sep_len =
2033 bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
2034 bidi_it->bytepos + bidi_it->ch_len);
2035 if (bidi_it->nchars <= 0)
2036 abort ();
2037 if (sep_len >= 0)
2038 {
2039 bidi_it->new_paragraph = 1;
2040 /* Record the buffer position of the last character of the
2041 paragraph separator. */
2042 bidi_it->separator_limit =
2043 bidi_it->charpos + bidi_it->nchars + sep_len;
2044 }
be39f003
EZ
2045 }
2046 }
6bff6497 2047
87e67904 2048 if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
b7b65b15
EZ
2049 {
2050 /* If we are at paragraph's base embedding level and beyond the
2051 last cached position, the cache's job is done and we can
2052 discard it. */
2053 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
182ce2d2
EZ
2054 && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
2055 + bidi_cache[bidi_cache_idx - 1].nchars - 1))
b7b65b15
EZ
2056 bidi_cache_reset ();
2057 /* But as long as we are caching during forward scan, we must
2058 cache each state, or else the cache integrity will be
2059 compromised: it assumes cached states correspond to buffer
2060 positions 1:1. */
2061 else
2062 bidi_cache_iterator_state (bidi_it, 1);
2063 }
2064}
2065
2066/* This is meant to be called from within the debugger, whenever you
2067 wish to examine the cache contents. */
e78aecca 2068void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
b7b65b15
EZ
2069void
2070bidi_dump_cached_states (void)
2071{
2072 int i;
2073 int ndigits = 1;
2074
2075 if (bidi_cache_idx == 0)
2076 {
2077 fprintf (stderr, "The cache is empty.\n");
2078 return;
2079 }
2080 fprintf (stderr, "Total of %d state%s in cache:\n",
2081 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2082
2083 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2084 ndigits++;
2085 fputs ("ch ", stderr);
2086 for (i = 0; i < bidi_cache_idx; i++)
2087 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2088 fputs ("\n", stderr);
2089 fputs ("lvl ", stderr);
2090 for (i = 0; i < bidi_cache_idx; i++)
2091 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2092 fputs ("\n", stderr);
2093 fputs ("pos ", stderr);
2094 for (i = 0; i < bidi_cache_idx; i++)
c2982e87 2095 fprintf (stderr, "%*"pI"d", ndigits, bidi_cache[i].charpos);
b7b65b15
EZ
2096 fputs ("\n", stderr);
2097}