Initial version of display/overlay strings is working.
[bpt/emacs.git] / src / bidi.c
CommitLineData
cbb09f04 1/* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
73b0cd50 2 Copyright (C) 2000-2001, 2004-2005, 2009-2011
b118e65f 3 Free Software Foundation, Inc.
b7b65b15
EZ
4
5This file is part of GNU Emacs.
6
a8d11bd3 7GNU Emacs is free software: you can redistribute it and/or modify
b7b65b15 8it under the terms of the GNU General Public License as published by
a8d11bd3
EZ
9the Free Software Foundation, either version 3 of the License, or
10(at your option) any later version.
b7b65b15
EZ
11
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
b7b65b15 17You should have received a copy of the GNU General Public License
a8d11bd3 18along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
b7b65b15 19
2d6e4628
EZ
20/* Written by Eli Zaretskii <eliz@gnu.org>.
21
22 A sequential implementation of the Unicode Bidirectional algorithm,
cbb09f04 23 (UBA) as per UAX#9, a part of the Unicode Standard.
b7b65b15
EZ
24
25 Unlike the reference and most other implementations, this one is
940afb59
EZ
26 designed to be called once for every character in the buffer or
27 string.
b7b65b15 28
4b292a22 29 The main entry point is bidi_move_to_visually_next. Each time it
b7b65b15
EZ
30 is called, it finds the next character in the visual order, and
31 returns its information in a special structure. The caller is then
32 expected to process this character for display or any other
4b292a22
EZ
33 purposes, and call bidi_move_to_visually_next for the next
34 character. See the comments in bidi_move_to_visually_next for more
35 details about its algorithm that finds the next visual-order
b7b65b15
EZ
36 character by resolving their levels on the fly.
37
940afb59
EZ
38 The two other entry points are bidi_paragraph_init and
39 bidi_mirror_char. The first determines the base direction of a
40 paragraph, while the second returns the mirrored version of its
41 argument character.
42
89d3374a
EZ
43 If you want to understand the code, you will have to read it
44 together with the relevant portions of UAX#9. The comments include
45 references to UAX#9 rules, for that very reason.
46
b7b65b15
EZ
47 A note about references to UAX#9 rules: if the reference says
48 something like "X9/Retaining", it means that you need to refer to
49 rule X9 and to its modifications decribed in the "Implementation
50 Notes" section of UAX#9, under "Retaining Format Codes". */
51
b7b65b15 52#include <config.h>
b7b65b15 53#include <stdio.h>
29e3d8d1
EZ
54#include <setjmp.h>
55
b7b65b15
EZ
56#include "lisp.h"
57#include "buffer.h"
58#include "character.h"
59#include "dispextern.h"
60
61static int bidi_initialized = 0;
62
cbc4fd20 63static Lisp_Object bidi_type_table, bidi_mirror_table;
b7b65b15
EZ
64
65#define LRM_CHAR 0x200E
66#define RLM_CHAR 0x200F
b7b65b15 67#define BIDI_EOB -1
b7b65b15 68
b7b65b15
EZ
69/* Data type for describing the bidirectional character categories. */
70typedef enum {
71 UNKNOWN_BC,
72 NEUTRAL,
73 WEAK,
74 STRONG
75} bidi_category_t;
76
e78aecca 77extern int bidi_ignore_explicit_marks_for_paragraph_level EXTERNALLY_VISIBLE;
b7b65b15
EZ
78int bidi_ignore_explicit_marks_for_paragraph_level = 1;
79
372b7a95 80static Lisp_Object paragraph_start_re, paragraph_separate_re;
6bff6497 81static Lisp_Object Qparagraph_start, Qparagraph_separate;
b7b65b15 82
cbb09f04
EZ
83\f
84/***********************************************************************
85 Utilities
86 ***********************************************************************/
b7b65b15 87
6bff6497
EZ
88/* Return the bidi type of a character CH, subject to the current
89 directional OVERRIDE. */
58b9f433 90static inline bidi_type_t
6bff6497 91bidi_get_type (int ch, bidi_dir_t override)
b7b65b15 92{
6bff6497
EZ
93 bidi_type_t default_type;
94
e7402cb2
EZ
95 if (ch == BIDI_EOB)
96 return NEUTRAL_B;
e342a24d
EZ
97 if (ch < 0 || ch > MAX_CHAR)
98 abort ();
6bff6497
EZ
99
100 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
101
102 if (override == NEUTRAL_DIR)
103 return default_type;
104
105 switch (default_type)
106 {
107 /* Although UAX#9 does not tell, it doesn't make sense to
108 override NEUTRAL_B and LRM/RLM characters. */
109 case NEUTRAL_B:
110 case LRE:
111 case LRO:
112 case RLE:
113 case RLO:
114 case PDF:
115 return default_type;
116 default:
117 switch (ch)
118 {
119 case LRM_CHAR:
120 case RLM_CHAR:
121 return default_type;
122 default:
123 if (override == L2R) /* X6 */
124 return STRONG_L;
125 else if (override == R2L)
126 return STRONG_R;
127 else
128 abort (); /* can't happen: handled above */
129 }
130 }
b7b65b15
EZ
131}
132
7d3b3862 133static void
2d6e4628
EZ
134bidi_check_type (bidi_type_t type)
135{
136 if (type < UNKNOWN_BT || type > NEUTRAL_ON)
137 abort ();
138}
139
b7b65b15 140/* Given a bidi TYPE of a character, return its category. */
58b9f433 141static inline bidi_category_t
b7b65b15
EZ
142bidi_get_category (bidi_type_t type)
143{
144 switch (type)
145 {
146 case UNKNOWN_BT:
147 return UNKNOWN_BC;
148 case STRONG_L:
149 case STRONG_R:
150 case STRONG_AL:
151 case LRE:
152 case LRO:
153 case RLE:
154 case RLO:
155 return STRONG;
156 case PDF: /* ??? really?? */
157 case WEAK_EN:
158 case WEAK_ES:
159 case WEAK_ET:
160 case WEAK_AN:
161 case WEAK_CS:
162 case WEAK_NSM:
163 case WEAK_BN:
164 return WEAK;
165 case NEUTRAL_B:
166 case NEUTRAL_S:
167 case NEUTRAL_WS:
168 case NEUTRAL_ON:
169 return NEUTRAL;
170 default:
171 abort ();
172 }
173}
174
a6e8d97c
EZ
175/* Return the mirrored character of C, if it has one. If C has no
176 mirrored counterpart, return C.
177 Note: The conditions in UAX#9 clause L4 regarding the surrounding
178 context must be tested by the caller. */
b7b65b15
EZ
179int
180bidi_mirror_char (int c)
181{
cbc4fd20
EZ
182 Lisp_Object val;
183
184 if (c == BIDI_EOB)
185 return c;
186 if (c < 0 || c > MAX_CHAR)
187 abort ();
b7b65b15 188
cbc4fd20
EZ
189 val = CHAR_TABLE_REF (bidi_mirror_table, c);
190 if (INTEGERP (val))
b7b65b15 191 {
cbc4fd20 192 int v = XINT (val);
b7b65b15 193
cbc4fd20
EZ
194 if (v < 0 || v > MAX_CHAR)
195 abort ();
196
197 return v;
b7b65b15 198 }
cbc4fd20 199
b7b65b15
EZ
200 return c;
201}
202
cbb09f04
EZ
203/* Determine the start-of-run (sor) directional type given the two
204 embedding levels on either side of the run boundary. Also, update
205 the saved info about previously seen characters, since that info is
206 generally valid for a single level run. */
58b9f433 207static inline void
cbb09f04
EZ
208bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
209{
210 int higher_level = level_before > level_after ? level_before : level_after;
211
212 /* The prev_was_pdf gork is required for when we have several PDFs
213 in a row. In that case, we want to compute the sor type for the
214 next level run only once: when we see the first PDF. That's
215 because the sor type depends only on the higher of the two levels
216 that we find on the two sides of the level boundary (see UAX#9,
217 clause X10), and so we don't need to know the final embedding
218 level to which we descend after processing all the PDFs. */
219 if (!bidi_it->prev_was_pdf || level_before < level_after)
220 /* FIXME: should the default sor direction be user selectable? */
221 bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
222 if (level_before > level_after)
223 bidi_it->prev_was_pdf = 1;
224
225 bidi_it->prev.type = UNKNOWN_BT;
226 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
227 bidi_it->last_strong.orig_type = UNKNOWN_BT;
228 bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
229 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
230 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
231 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
232 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
233 bidi_it->ignore_bn_limit = -1; /* meaning it's unknown */
234}
235
236/* Push the current embedding level and override status; reset the
237 current level to LEVEL and the current override status to OVERRIDE. */
58b9f433 238static inline void
cbb09f04
EZ
239bidi_push_embedding_level (struct bidi_it *bidi_it,
240 int level, bidi_dir_t override)
241{
242 bidi_it->stack_idx++;
243 if (bidi_it->stack_idx >= BIDI_MAXLEVEL)
244 abort ();
245 bidi_it->level_stack[bidi_it->stack_idx].level = level;
246 bidi_it->level_stack[bidi_it->stack_idx].override = override;
247}
248
249/* Pop the embedding level and directional override status from the
250 stack, and return the new level. */
58b9f433 251static inline int
cbb09f04
EZ
252bidi_pop_embedding_level (struct bidi_it *bidi_it)
253{
254 /* UAX#9 says to ignore invalid PDFs. */
255 if (bidi_it->stack_idx > 0)
256 bidi_it->stack_idx--;
257 return bidi_it->level_stack[bidi_it->stack_idx].level;
258}
259
260/* Record in SAVED_INFO the information about the current character. */
58b9f433 261static inline void
cbb09f04
EZ
262bidi_remember_char (struct bidi_saved_info *saved_info,
263 struct bidi_it *bidi_it)
264{
265 saved_info->charpos = bidi_it->charpos;
266 saved_info->bytepos = bidi_it->bytepos;
267 saved_info->type = bidi_it->type;
268 bidi_check_type (bidi_it->type);
269 saved_info->type_after_w1 = bidi_it->type_after_w1;
270 bidi_check_type (bidi_it->type_after_w1);
271 saved_info->orig_type = bidi_it->orig_type;
272 bidi_check_type (bidi_it->orig_type);
273}
274
b7b65b15
EZ
275/* Copy the bidi iterator from FROM to TO. To save cycles, this only
276 copies the part of the level stack that is actually in use. */
58b9f433 277static inline void
b7b65b15
EZ
278bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
279{
280 int i;
281
e69a9370 282 /* Copy everything except the level stack and beyond. */
182ce2d2 283 memcpy (to, from, offsetof (struct bidi_it, level_stack[0]));
b7b65b15
EZ
284
285 /* Copy the active part of the level stack. */
286 to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
287 for (i = 1; i <= from->stack_idx; i++)
288 to->level_stack[i] = from->level_stack[i];
289}
290
cbb09f04
EZ
291\f
292/***********************************************************************
293 Caching the bidi iterator states
294 ***********************************************************************/
b7b65b15 295
2fe72643
EZ
296#define BIDI_CACHE_CHUNK 200
297static struct bidi_it *bidi_cache;
298static size_t bidi_cache_size = 0;
0416466c 299static size_t elsz = sizeof (struct bidi_it);
87e67904
EZ
300static EMACS_INT bidi_cache_idx; /* next unused cache slot */
301static EMACS_INT bidi_cache_last_idx; /* slot of last cache hit */
302static EMACS_INT bidi_cache_start = 0; /* start of cache for this
303 "stack" level */
304
305/* Reset the cache state to the empty state. We only reset the part
306 of the cache relevant to iteration of the current object. Previous
307 objects, which are pushed on the display iterator's stack, are left
308 intact. This is called when the cached information is no more
309 useful for the current iteration, e.g. when we were reseated to a
310 new position on the same object. */
58b9f433 311static inline void
b7b65b15
EZ
312bidi_cache_reset (void)
313{
87e67904 314 bidi_cache_idx = bidi_cache_start;
b7b65b15
EZ
315 bidi_cache_last_idx = -1;
316}
317
87e67904
EZ
318/* Shrink the cache to its minimal size. Called when we init the bidi
319 iterator for reordering a buffer or a string that does not come
320 from display properties, because that means all the previously
321 cached info is of no further use. */
58b9f433 322static inline void
2fe72643
EZ
323bidi_cache_shrink (void)
324{
325 if (bidi_cache_size > BIDI_CACHE_CHUNK)
326 {
0416466c
EZ
327 bidi_cache_size = BIDI_CACHE_CHUNK;
328 bidi_cache =
329 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
2fe72643
EZ
330 }
331 bidi_cache_reset ();
332}
333
58b9f433 334static inline void
b7b65b15
EZ
335bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it)
336{
337 int current_scan_dir = bidi_it->scan_dir;
338
87e67904 339 if (idx < bidi_cache_start || idx >= bidi_cache_idx)
b7b65b15
EZ
340 abort ();
341
342 bidi_copy_it (bidi_it, &bidi_cache[idx]);
343 bidi_it->scan_dir = current_scan_dir;
344 bidi_cache_last_idx = idx;
345}
346
347/* Find a cached state with a given CHARPOS and resolved embedding
348 level less or equal to LEVEL. if LEVEL is -1, disregard the
349 resolved levels in cached states. DIR, if non-zero, means search
350 in that direction from the last cache hit. */
58b9f433 351static inline int
61bfec98 352bidi_cache_search (EMACS_INT charpos, int level, int dir)
b7b65b15
EZ
353{
354 int i, i_start;
355
356 if (bidi_cache_idx)
357 {
358 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
182ce2d2
EZ
359 {
360 dir = -1;
361 i_start = bidi_cache_last_idx - 1;
362 }
363 else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
364 + bidi_cache[bidi_cache_last_idx].nchars - 1))
365 {
366 dir = 1;
367 i_start = bidi_cache_last_idx + 1;
368 }
369 else if (dir)
b7b65b15
EZ
370 i_start = bidi_cache_last_idx;
371 else
372 {
373 dir = -1;
374 i_start = bidi_cache_idx - 1;
375 }
376
377 if (dir < 0)
378 {
379 /* Linear search for now; FIXME! */
87e67904 380 for (i = i_start; i >= bidi_cache_start; i--)
182ce2d2
EZ
381 if (bidi_cache[i].charpos <= charpos
382 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
383 && (level == -1 || bidi_cache[i].resolved_level <= level))
384 return i;
385 }
386 else
387 {
388 for (i = i_start; i < bidi_cache_idx; i++)
182ce2d2
EZ
389 if (bidi_cache[i].charpos <= charpos
390 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
391 && (level == -1 || bidi_cache[i].resolved_level <= level))
392 return i;
393 }
394 }
395
396 return -1;
397}
398
399/* Find a cached state where the resolved level changes to a value
400 that is lower than LEVEL, and return its cache slot index. DIR is
401 the direction to search, starting with the last used cache slot.
87e67904
EZ
402 If DIR is zero, we search backwards from the last occupied cache
403 slot. BEFORE, if non-zero, means return the index of the slot that
404 is ``before'' the level change in the search direction. That is,
b7b65b15
EZ
405 given the cached levels like this:
406
407 1122333442211
408 AB C
409
410 and assuming we are at the position cached at the slot marked with
411 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
412 index of slot B or A, depending whether BEFORE is, respectively,
413 non-zero or zero. */
414static int
415bidi_cache_find_level_change (int level, int dir, int before)
416{
417 if (bidi_cache_idx)
418 {
419 int i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
420 int incr = before ? 1 : 0;
421
422 if (!dir)
423 dir = -1;
424 else if (!incr)
425 i += dir;
426
427 if (dir < 0)
428 {
87e67904 429 while (i >= bidi_cache_start + incr)
b7b65b15
EZ
430 {
431 if (bidi_cache[i - incr].resolved_level >= 0
432 && bidi_cache[i - incr].resolved_level < level)
433 return i;
434 i--;
435 }
436 }
437 else
438 {
439 while (i < bidi_cache_idx - incr)
440 {
441 if (bidi_cache[i + incr].resolved_level >= 0
442 && bidi_cache[i + incr].resolved_level < level)
443 return i;
444 i++;
445 }
446 }
447 }
448
449 return -1;
450}
451
58b9f433
EZ
452static inline void
453bidi_cache_ensure_space (int idx)
454{
455 /* Enlarge the cache as needed. */
456 if (idx >= bidi_cache_size)
457 {
458 bidi_cache_size += BIDI_CACHE_CHUNK;
459 bidi_cache =
460 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
461 }
462}
463
464static inline void
b7b65b15
EZ
465bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
466{
467 int idx;
468
469 /* We should never cache on backward scans. */
470 if (bidi_it->scan_dir == -1)
471 abort ();
472 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
473
474 if (idx < 0)
475 {
476 idx = bidi_cache_idx;
58b9f433 477 bidi_cache_ensure_space (idx);
bd924a5d
EZ
478 /* Character positions should correspond to cache positions 1:1.
479 If we are outside the range of cached positions, the cache is
480 useless and must be reset. */
87e67904 481 if (idx > bidi_cache_start &&
182ce2d2
EZ
482 (bidi_it->charpos > (bidi_cache[idx - 1].charpos
483 + bidi_cache[idx - 1].nchars)
a1344e7d 484 || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
bd924a5d
EZ
485 {
486 bidi_cache_reset ();
87e67904 487 idx = bidi_cache_start;
bd924a5d 488 }
102ebb00
EZ
489 if (bidi_it->nchars <= 0)
490 abort ();
b7b65b15
EZ
491 bidi_copy_it (&bidi_cache[idx], bidi_it);
492 if (!resolved)
493 bidi_cache[idx].resolved_level = -1;
494 }
495 else
496 {
497 /* Copy only the members which could have changed, to avoid
498 costly copying of the entire struct. */
499 bidi_cache[idx].type = bidi_it->type;
2d6e4628 500 bidi_check_type (bidi_it->type);
89d3374a
EZ
501 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
502 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
503 if (resolved)
504 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
505 else
506 bidi_cache[idx].resolved_level = -1;
507 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
508 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
509 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
510 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
511 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
512 }
513
514 bidi_cache_last_idx = idx;
515 if (idx >= bidi_cache_idx)
516 bidi_cache_idx = idx + 1;
517}
518
58b9f433 519static inline bidi_type_t
61bfec98 520bidi_cache_find (EMACS_INT charpos, int level, struct bidi_it *bidi_it)
b7b65b15
EZ
521{
522 int i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
523
87e67904 524 if (i >= bidi_cache_start)
b7b65b15
EZ
525 {
526 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
527
5009f85e 528 bidi_copy_it (bidi_it, &bidi_cache[i]);
b7b65b15
EZ
529 bidi_cache_last_idx = i;
530 /* Don't let scan direction from from the cached state override
531 the current scan direction. */
532 bidi_it->scan_dir = current_scan_dir;
533 return bidi_it->type;
534 }
535
536 return UNKNOWN_BT;
537}
538
58b9f433 539static inline int
b7b65b15
EZ
540bidi_peek_at_next_level (struct bidi_it *bidi_it)
541{
87e67904 542 if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
b7b65b15
EZ
543 abort ();
544 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
545}
546
cbb09f04 547\f
58b9f433
EZ
548/***********************************************************************
549 Pushing and popping the bidi iterator state
550 ***********************************************************************/
0c22566f
EZ
551/* 5-slot stack for saving the start of the previous level of the
552 cache. xdisp.c maintains a 5-slot stack for its iterator state,
553 and we need the same size of our stack. */
554static int bidi_cache_start_stack[IT_STACK_SIZE];
58b9f433
EZ
555static int bidi_cache_sp;
556
557/* Push the bidi iterator state in preparation for reordering a
558 different object, e.g. display string found at certain buffer
559 position. Pushing the bidi iterator boils to saving its entire
560 state on the cache and starting a new cache "stacked" on top of the
561 current cache. */
562void
563bidi_push_it (struct bidi_it *bidi_it)
564{
565 /* Save the current iterator state in its entirety after the last
566 used cache slot. */
567 bidi_cache_ensure_space (bidi_cache_idx);
568 memcpy (&bidi_cache[bidi_cache_idx++], bidi_it, sizeof (struct bidi_it));
569
570 /* Push the current cache start onto the stack. */
0c22566f 571 if (bidi_cache_sp >= IT_STACK_SIZE)
58b9f433
EZ
572 abort ();
573 bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
574
575 /* Start a new level of cache, and make it empty. */
576 bidi_cache_start = bidi_cache_idx;
577 bidi_cache_last_idx = -1;
578}
579
580/* Restore the iterator state saved by bidi_push_it and return the
581 cache to the corresponding state. */
582void
583bidi_pop_it (struct bidi_it *bidi_it)
584{
585 if (bidi_cache_start <= 0)
586 abort ();
587
588 /* Reset the next free cache slot index to what it was before the
589 call to bidi_push_it. */
590 bidi_cache_idx = bidi_cache_start - 1;
591
592 /* Restore the bidi iterator state saved in the cache. */
593 memcpy (bidi_it, &bidi_cache[bidi_cache_idx], sizeof (struct bidi_it));
594
595 /* Pop the previous cache start from the stack. */
596 if (bidi_cache_sp <= 0)
597 abort ();
598 bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
599
600 /* Invalidate the last-used cache slot data. */
601 bidi_cache_last_idx = -1;
602}
603
604\f
cbb09f04
EZ
605/***********************************************************************
606 Initialization
607 ***********************************************************************/
608static void
609bidi_initialize (void)
b7b65b15 610{
be39f003 611
cbb09f04
EZ
612#include "biditype.h"
613#include "bidimirror.h"
be39f003 614
cbb09f04 615 int i;
b7b65b15 616
cbb09f04
EZ
617 bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L));
618 staticpro (&bidi_type_table);
619
620 for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++)
621 char_table_set_range (bidi_type_table, bidi_type[i].from, bidi_type[i].to,
622 make_number (bidi_type[i].type));
623
624 bidi_mirror_table = Fmake_char_table (Qnil, Qnil);
625 staticpro (&bidi_mirror_table);
626
627 for (i = 0; i < sizeof bidi_mirror / sizeof bidi_mirror[0]; i++)
628 char_table_set (bidi_mirror_table, bidi_mirror[i].from,
629 make_number (bidi_mirror[i].to));
630
631 Qparagraph_start = intern ("paragraph-start");
632 staticpro (&Qparagraph_start);
633 paragraph_start_re = Fsymbol_value (Qparagraph_start);
634 if (!STRINGP (paragraph_start_re))
635 paragraph_start_re = build_string ("\f\\|[ \t]*$");
636 staticpro (&paragraph_start_re);
637 Qparagraph_separate = intern ("paragraph-separate");
638 staticpro (&Qparagraph_separate);
639 paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
640 if (!STRINGP (paragraph_separate_re))
641 paragraph_separate_re = build_string ("[ \t\f]*$");
642 staticpro (&paragraph_separate_re);
58b9f433
EZ
643
644 bidi_cache_sp = 0;
645
cbb09f04 646 bidi_initialized = 1;
b7b65b15
EZ
647}
648
cbb09f04
EZ
649/* Do whatever UAX#9 clause X8 says should be done at paragraph's
650 end. */
58b9f433 651static inline void
cbb09f04 652bidi_set_paragraph_end (struct bidi_it *bidi_it)
b7b65b15 653{
cbb09f04
EZ
654 bidi_it->invalid_levels = 0;
655 bidi_it->invalid_rl_levels = -1;
656 bidi_it->stack_idx = 0;
657 bidi_it->resolved_level = bidi_it->level_stack[0].level;
658}
b7b65b15 659
cbb09f04
EZ
660/* Initialize the bidi iterator from buffer/string position CHARPOS. */
661void
662bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, int frame_window_p,
663 struct bidi_it *bidi_it)
664{
665 if (! bidi_initialized)
666 bidi_initialize ();
667 if (charpos >= 0)
668 bidi_it->charpos = charpos;
669 if (bytepos >= 0)
670 bidi_it->bytepos = bytepos;
671 bidi_it->frame_window_p = frame_window_p;
672 bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */
673 bidi_it->first_elt = 1;
674 bidi_set_paragraph_end (bidi_it);
675 bidi_it->new_paragraph = 1;
676 bidi_it->separator_limit = -1;
677 bidi_it->type = NEUTRAL_B;
678 bidi_it->type_after_w1 = NEUTRAL_B;
679 bidi_it->orig_type = NEUTRAL_B;
680 bidi_it->prev_was_pdf = 0;
681 bidi_it->prev.type = bidi_it->prev.type_after_w1 =
682 bidi_it->prev.orig_type = UNKNOWN_BT;
89d3374a
EZ
683 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
684 bidi_it->last_strong.orig_type = UNKNOWN_BT;
cbb09f04
EZ
685 bidi_it->next_for_neutral.charpos = -1;
686 bidi_it->next_for_neutral.type =
687 bidi_it->next_for_neutral.type_after_w1 =
89d3374a 688 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
cbb09f04
EZ
689 bidi_it->prev_for_neutral.charpos = -1;
690 bidi_it->prev_for_neutral.type =
691 bidi_it->prev_for_neutral.type_after_w1 =
692 bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
693 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
694 bidi_it->disp_pos = -1; /* invalid/unknown */
695 /* We can only shrink the cache if we are at the bottom level of its
696 "stack". */
697 if (bidi_cache_start == 0)
698 bidi_cache_shrink ();
58b9f433
EZ
699 else
700 bidi_cache_reset ();
b7b65b15
EZ
701}
702
182ce2d2 703/* Perform initializations for reordering a new line of bidi text. */
6bff6497 704static void
be39f003
EZ
705bidi_line_init (struct bidi_it *bidi_it)
706{
707 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
708 bidi_it->resolved_level = bidi_it->level_stack[0].level;
709 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
710 bidi_it->invalid_levels = 0;
711 bidi_it->invalid_rl_levels = -1;
712 bidi_it->next_en_pos = -1;
713 bidi_it->next_for_ws.type = UNKNOWN_BT;
b44d9321
EZ
714 bidi_set_sor_type (bidi_it,
715 bidi_it->paragraph_dir == R2L ? 1 : 0,
be39f003
EZ
716 bidi_it->level_stack[0].level); /* X10 */
717
718 bidi_cache_reset ();
719}
720
cbb09f04
EZ
721\f
722/***********************************************************************
723 Fetching characters
724 ***********************************************************************/
725
87e67904
EZ
726/* Count bytes in multibyte string S between BEG/BEGBYTE and END. BEG
727 and END are zero-based character positions in S, BEGBYTE is byte
728 position corresponding to BEG. */
729static inline EMACS_INT
730bidi_count_bytes (const unsigned char *s, const EMACS_INT beg,
731 const EMACS_INT begbyte, const EMACS_INT end)
732{
733 EMACS_INT pos = beg;
734 const unsigned char *p = s + begbyte, *start = p;
735
736 if (!CHAR_HEAD_P (*p))
737 abort ();
738
739 while (pos < end)
740 {
741 p += BYTES_BY_CHAR_HEAD (*p);
742 pos++;
743 }
744
745 return p - start;
746}
747
748/* Fetch and returns the character at byte position BYTEPOS. If S is
749 non-NULL, fetch the character from string S; otherwise fetch the
750 character from the current buffer. */
751static inline int
752bidi_char_at_pos (EMACS_INT bytepos, const unsigned char *s)
753{
754 if (s)
755 return STRING_CHAR (s + bytepos);
756 else
757 return FETCH_MULTIBYTE_CHAR (bytepos);
758}
759
102ebb00
EZ
760/* Fetch and return the character at BYTEPOS/CHARPOS. If that
761 character is covered by a display string, treat the entire run of
762 covered characters as a single character u+FFFC, and return their
763 combined length in CH_LEN and NCHARS. DISP_POS specifies the
764 character position of the next display string, or -1 if not yet
765 computed. When the next character is at or beyond that position,
766 the function updates DISP_POS with the position of the next display
9f257352
EZ
767 string. STRING->s is the C string to iterate, or NULL if iterating
768 over a buffer or a Lisp string; in the latter case, STRING->lstring
769 is the Lisp string. */
87e67904 770static inline int
102ebb00 771bidi_fetch_char (EMACS_INT bytepos, EMACS_INT charpos, EMACS_INT *disp_pos,
87e67904 772 struct bidi_string_data *string,
fc6f18ce 773 int frame_window_p, EMACS_INT *ch_len, EMACS_INT *nchars)
182ce2d2
EZ
774{
775 int ch;
9f257352
EZ
776 EMACS_INT endpos =
777 (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
6db161be 778 struct text_pos pos;
182ce2d2 779
182ce2d2 780 /* If we got past the last known position of display string, compute
87e67904
EZ
781 the position of the next one. That position could be at CHARPOS. */
782 if (charpos < endpos && charpos > *disp_pos)
6db161be
EZ
783 {
784 SET_TEXT_POS (pos, charpos, bytepos);
785 *disp_pos = compute_display_string_pos (&pos, string, frame_window_p);
786 }
102ebb00
EZ
787
788 /* Fetch the character at BYTEPOS. */
87e67904 789 if (charpos >= endpos)
182ce2d2
EZ
790 {
791 ch = BIDI_EOB;
792 *ch_len = 1;
793 *nchars = 1;
87e67904 794 *disp_pos = endpos;
182ce2d2 795 }
102ebb00 796 else if (charpos >= *disp_pos)
182ce2d2 797 {
7b600102
EZ
798 EMACS_INT disp_end_pos;
799
800 /* We don't expect to find ourselves in the middle of a display
801 property. Hopefully, it will never be needed. */
802 if (charpos > *disp_pos)
803 abort ();
804 /* Return the Unicode Object Replacement Character to represent
87e67904 805 the entire run of characters covered by the display string. */
7b600102 806 ch = 0xFFFC;
87e67904 807 disp_end_pos = compute_display_string_end (*disp_pos, string);
7b600102 808 *nchars = disp_end_pos - *disp_pos;
87e67904
EZ
809 if (string->s)
810 *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
811 disp_end_pos);
9f257352
EZ
812 else if (STRINGP (string->lstring))
813 *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
814 bytepos, disp_end_pos);
87e67904
EZ
815 else
816 *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
182ce2d2 817 }
182ce2d2
EZ
818 else
819 {
87e67904
EZ
820 if (string->s)
821 {
822 EMACS_INT len;
823
824 ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
825 *ch_len = len;
826 }
9f257352
EZ
827 else if (STRINGP (string->lstring))
828 {
829 EMACS_INT len;
830
831 ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos, len);
832 *ch_len = len;
833 }
87e67904
EZ
834 else
835 {
836 ch = FETCH_MULTIBYTE_CHAR (bytepos);
837 *ch_len = CHAR_BYTES (ch);
838 }
182ce2d2
EZ
839 *nchars = 1;
840 }
841
842 /* If we just entered a run of characters covered by a display
843 string, compute the position of the next display string. */
87e67904 844 if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos)
6db161be
EZ
845 {
846 SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
847 *disp_pos = compute_display_string_pos (&pos, string, frame_window_p);
848 }
182ce2d2
EZ
849
850 return ch;
851}
852
cbb09f04
EZ
853\f
854/***********************************************************************
855 Determining paragraph direction
856 ***********************************************************************/
857
858/* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
859 Value is the non-negative length of the paragraph separator
860 following the buffer position, -1 if position is at the beginning
861 of a new paragraph, or -2 if position is neither at beginning nor
862 at end of a paragraph. */
863static EMACS_INT
864bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
865{
866 Lisp_Object sep_re;
867 Lisp_Object start_re;
868 EMACS_INT val;
869
870 sep_re = paragraph_separate_re;
871 start_re = paragraph_start_re;
872
873 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
874 if (val < 0)
875 {
876 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
877 val = -1;
878 else
879 val = -2;
880 }
881
882 return val;
883}
884
be39f003
EZ
885/* Find the beginning of this paragraph by looking back in the buffer.
886 Value is the byte position of the paragraph's beginning. */
887static EMACS_INT
b44d9321 888bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
6bff6497 889{
372b7a95 890 Lisp_Object re = paragraph_start_re;
6bff6497
EZ
891 EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
892
6bff6497
EZ
893 while (pos_byte > BEGV_BYTE
894 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
895 {
182ce2d2
EZ
896 /* FIXME: What if the paragraph beginning is covered by a
897 display string? And what if a display string covering some
898 of the text over which we scan back includes
899 paragraph_start_re? */
be39f003
EZ
900 pos = find_next_newline_no_quit (pos - 1, -1);
901 pos_byte = CHAR_TO_BYTE (pos);
6bff6497 902 }
be39f003 903 return pos_byte;
6bff6497
EZ
904}
905
bea4f10c 906/* Determine the base direction, a.k.a. base embedding level, of the
b44d9321
EZ
907 paragraph we are about to iterate through. If DIR is either L2R or
908 R2L, just use that. Otherwise, determine the paragraph direction
bea4f10c
EZ
909 from the first strong directional character of the paragraph.
910
182ce2d2 911 NO_DEFAULT_P non-zero means don't default to L2R if the paragraph
bea4f10c
EZ
912 has no strong directional characters and both DIR and
913 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
914 in the buffer until a paragraph is found with a strong character,
915 or until hitting BEGV. In the latter case, fall back to L2R. This
916 flag is used in current-bidi-paragraph-direction.
917
918 Note that this function gives the paragraph separator the same
919 direction as the preceding paragraph, even though Emacs generally
920 views the separartor as not belonging to any paragraph. */
b7b65b15 921void
bea4f10c 922bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p)
b7b65b15 923{
6bff6497 924 EMACS_INT bytepos = bidi_it->bytepos;
9f257352 925 int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
bea4f10c 926 EMACS_INT pstartbyte;
87e67904
EZ
927 /* Note that begbyte is a byte position, while end is a character
928 position. Yes, this is ugly, but we are trying to avoid costly
929 calls to BYTE_TO_CHAR and its ilk. */
930 EMACS_INT begbyte = string_p ? 0 : BEGV_BYTE;
931 EMACS_INT end = string_p ? bidi_it->string.schars : ZV;
e7402cb2 932
5e65aec0 933 /* Special case for an empty buffer. */
87e67904 934 if (bytepos == begbyte && bidi_it->charpos == end)
5e65aec0 935 dir = L2R;
9c82e145 936 /* We should never be called at EOB or before BEGV. */
87e67904 937 else if (bidi_it->charpos >= end || bytepos < begbyte)
9c82e145
EZ
938 abort ();
939
be39f003
EZ
940 if (dir == L2R)
941 {
942 bidi_it->paragraph_dir = L2R;
943 bidi_it->new_paragraph = 0;
944 }
945 else if (dir == R2L)
946 {
947 bidi_it->paragraph_dir = R2L;
948 bidi_it->new_paragraph = 0;
949 }
b7b65b15
EZ
950 else if (dir == NEUTRAL_DIR) /* P2 */
951 {
182ce2d2
EZ
952 int ch;
953 EMACS_INT ch_len, nchars;
954 EMACS_INT pos, disp_pos = -1;
6bff6497 955 bidi_type_t type;
9f257352 956 const unsigned char *s;
be39f003 957
d20e1419
EZ
958 if (!bidi_initialized)
959 bidi_initialize ();
960
be39f003
EZ
961 /* If we are inside a paragraph separator, we are just waiting
962 for the separator to be exhausted; use the previous paragraph
e5a2fec7
EZ
963 direction. But don't do that if we have been just reseated,
964 because we need to reinitialize below in that case. */
965 if (!bidi_it->first_elt
966 && bidi_it->charpos < bidi_it->separator_limit)
be39f003
EZ
967 return;
968
b44d9321 969 /* If we are on a newline, get past it to where the next
5e65aec0
EZ
970 paragraph might start. But don't do that at BEGV since then
971 we are potentially in a new paragraph that doesn't yet
972 exist. */
c143c213 973 pos = bidi_it->charpos;
9f257352
EZ
974 s = STRINGP (bidi_it->string.lstring) ?
975 SDATA (bidi_it->string.lstring) : bidi_it->string.s;
976 if (bytepos > begbyte && bidi_char_at_pos (bytepos, s) == '\n')
be39f003 977 {
b44d9321 978 bytepos++;
c143c213 979 pos++;
be39f003 980 }
b44d9321
EZ
981
982 /* We are either at the beginning of a paragraph or in the
983 middle of it. Find where this paragraph starts. */
87e67904
EZ
984 if (string_p)
985 {
986 /* We don't support changes of paragraph direction inside a
987 string. It is treated as a single paragraph. */
988 pstartbyte = 0;
989 }
990 else
991 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
be39f003
EZ
992 bidi_it->separator_limit = -1;
993 bidi_it->new_paragraph = 0;
bea4f10c
EZ
994
995 /* The following loop is run more than once only if NO_DEFAULT_P
87e67904 996 is non-zero, and only if we are iterating on a buffer. */
bea4f10c
EZ
997 do {
998 bytepos = pstartbyte;
87e67904
EZ
999 if (!string_p)
1000 pos = BYTE_TO_CHAR (bytepos);
1001 ch = bidi_fetch_char (bytepos, pos, &disp_pos, &bidi_it->string,
1002 bidi_it->frame_window_p, &ch_len, &nchars);
bea4f10c
EZ
1003 type = bidi_get_type (ch, NEUTRAL_DIR);
1004
182ce2d2 1005 for (pos += nchars, bytepos += ch_len;
bea4f10c
EZ
1006 /* NOTE: UAX#9 says to search only for L, AL, or R types
1007 of characters, and ignore RLE, RLO, LRE, and LRO.
1008 However, I'm not sure it makes sense to omit those 4;
1009 should try with and without that to see the effect. */
1010 (bidi_get_category (type) != STRONG)
1011 || (bidi_ignore_explicit_marks_for_paragraph_level
1012 && (type == RLE || type == RLO
1013 || type == LRE || type == LRO));
1014 type = bidi_get_type (ch, NEUTRAL_DIR))
1015 {
87e67904
EZ
1016 if (!string_p
1017 && type == NEUTRAL_B
1018 && bidi_at_paragraph_end (pos, bytepos) >= -1)
21fce5ab 1019 break;
87e67904 1020 if (pos >= end)
bea4f10c
EZ
1021 {
1022 /* Pretend there's a paragraph separator at end of
87e67904 1023 buffer/string. */
bea4f10c
EZ
1024 type = NEUTRAL_B;
1025 break;
1026 }
102ebb00 1027 /* Fetch next character and advance to get past it. */
87e67904 1028 ch = bidi_fetch_char (bytepos, pos, &disp_pos, &bidi_it->string,
fc6f18ce 1029 bidi_it->frame_window_p, &ch_len, &nchars);
182ce2d2
EZ
1030 pos += nchars;
1031 bytepos += ch_len;
bea4f10c
EZ
1032 }
1033 if (type == STRONG_R || type == STRONG_AL) /* P3 */
1034 bidi_it->paragraph_dir = R2L;
1035 else if (type == STRONG_L)
1036 bidi_it->paragraph_dir = L2R;
87e67904
EZ
1037 if (!string_p
1038 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
bea4f10c
EZ
1039 {
1040 /* If this paragraph is at BEGV, default to L2R. */
1041 if (pstartbyte == BEGV_BYTE)
1042 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1043 else
1044 {
1045 EMACS_INT prevpbyte = pstartbyte;
1046 EMACS_INT p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
1047
1048 /* Find the beginning of the previous paragraph, if any. */
1049 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1050 {
182ce2d2
EZ
1051 /* FXIME: What if p is covered by a display
1052 string? See also a FIXME inside
1053 bidi_find_paragraph_start. */
bea4f10c
EZ
1054 p--;
1055 pbyte = CHAR_TO_BYTE (p);
1056 prevpbyte = bidi_find_paragraph_start (p, pbyte);
1057 }
1058 pstartbyte = prevpbyte;
1059 }
1060 }
87e67904
EZ
1061 } while (!string_p
1062 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
b7b65b15 1063 }
be39f003
EZ
1064 else
1065 abort ();
b7b65b15 1066
b44d9321
EZ
1067 /* Contrary to UAX#9 clause P3, we only default the paragraph
1068 direction to L2R if we have no previous usable paragraph
bea4f10c 1069 direction. This is allowed by the HL1 clause. */
d20e1419 1070 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
bea4f10c 1071 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
be39f003 1072 if (bidi_it->paragraph_dir == R2L)
b44d9321 1073 bidi_it->level_stack[0].level = 1;
be39f003 1074 else
b44d9321 1075 bidi_it->level_stack[0].level = 0;
be39f003
EZ
1076
1077 bidi_line_init (bidi_it);
b7b65b15
EZ
1078}
1079
cbb09f04
EZ
1080\f
1081/***********************************************************************
1082 Resolving explicit and implicit levels.
58b9f433 1083 The rest of this file constitutes the core of the UBA implementation.
cbb09f04 1084 ***********************************************************************/
b7b65b15 1085
58b9f433 1086static inline int
182ce2d2 1087bidi_explicit_dir_char (int ch)
b7b65b15 1088{
182ce2d2
EZ
1089 bidi_type_t ch_type;
1090
1091 if (!bidi_initialized)
1092 abort ();
1093 ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1094 return (ch_type == LRE || ch_type == LRO
1095 || ch_type == RLE || ch_type == RLO
1096 || ch_type == PDF);
b7b65b15
EZ
1097}
1098
1099/* A helper function for bidi_resolve_explicit. It advances to the
1100 next character in logical order and determines the new embedding
1101 level and directional override, but does not take into account
1102 empty embeddings. */
1103static int
1104bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1105{
1106 int curchar;
1107 bidi_type_t type;
1108 int current_level;
1109 int new_level;
1110 bidi_dir_t override;
9f257352 1111 int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
b7b65b15 1112
182ce2d2
EZ
1113 /* If reseat()'ed, don't advance, so as to start iteration from the
1114 position where we were reseated. bidi_it->bytepos can be less
1115 than BEGV_BYTE after reseat to BEGV. */
87e67904 1116 if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
9c82e145 1117 || bidi_it->first_elt)
e7402cb2 1118 {
9c82e145 1119 bidi_it->first_elt = 0;
87e67904
EZ
1120 if (string_p)
1121 {
9f257352
EZ
1122 const unsigned char *p =
1123 STRINGP (bidi_it->string.lstring)
1124 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1125
87e67904
EZ
1126 if (bidi_it->charpos < 0)
1127 bidi_it->charpos = 0;
9f257352 1128 bidi_it->bytepos = bidi_count_bytes (p, 0, 0, bidi_it->charpos);
87e67904
EZ
1129 }
1130 else
1131 {
1132 if (bidi_it->charpos < BEGV)
1133 bidi_it->charpos = BEGV;
1134 bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
1135 }
e7402cb2 1136 }
87e67904
EZ
1137 /* Don't move at end of buffer/string. */
1138 else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
b7b65b15 1139 {
182ce2d2
EZ
1140 /* Advance to the next character, skipping characters covered by
1141 display strings (nchars > 1). */
102ebb00
EZ
1142 if (bidi_it->nchars <= 0)
1143 abort ();
182ce2d2 1144 bidi_it->charpos += bidi_it->nchars;
e7402cb2
EZ
1145 if (bidi_it->ch_len == 0)
1146 abort ();
b7b65b15
EZ
1147 bidi_it->bytepos += bidi_it->ch_len;
1148 }
1149
1150 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1151 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1152 new_level = current_level;
1153
87e67904 1154 if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
e7402cb2
EZ
1155 {
1156 curchar = BIDI_EOB;
1157 bidi_it->ch_len = 1;
182ce2d2 1158 bidi_it->nchars = 1;
87e67904 1159 bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
e7402cb2
EZ
1160 }
1161 else
1162 {
182ce2d2
EZ
1163 /* Fetch the character at BYTEPOS. If it is covered by a
1164 display string, treat the entire run of covered characters as
1165 a single character u+FFFC. */
102ebb00 1166 curchar = bidi_fetch_char (bidi_it->bytepos, bidi_it->charpos,
87e67904
EZ
1167 &bidi_it->disp_pos, &bidi_it->string,
1168 bidi_it->frame_window_p,
102ebb00 1169 &bidi_it->ch_len, &bidi_it->nchars);
e7402cb2 1170 }
b7b65b15 1171 bidi_it->ch = curchar;
b7b65b15 1172
6bff6497
EZ
1173 /* Don't apply directional override here, as all the types we handle
1174 below will not be affected by the override anyway, and we need
1175 the original type unaltered. The override will be applied in
1176 bidi_resolve_weak. */
1177 type = bidi_get_type (curchar, NEUTRAL_DIR);
89d3374a
EZ
1178 bidi_it->orig_type = type;
1179 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
1180
1181 if (type != PDF)
1182 bidi_it->prev_was_pdf = 0;
1183
89d3374a 1184 bidi_it->type_after_w1 = UNKNOWN_BT;
b7b65b15
EZ
1185
1186 switch (type)
1187 {
1188 case RLE: /* X2 */
1189 case RLO: /* X4 */
89d3374a
EZ
1190 bidi_it->type_after_w1 = type;
1191 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1192 type = WEAK_BN; /* X9/Retaining */
87e67904 1193 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1194 {
1195 if (current_level <= BIDI_MAXLEVEL - 4)
1196 {
1197 /* Compute the least odd embedding level greater than
1198 the current level. */
1199 new_level = ((current_level + 1) & ~1) + 1;
89d3374a 1200 if (bidi_it->type_after_w1 == RLE)
b7b65b15
EZ
1201 override = NEUTRAL_DIR;
1202 else
1203 override = R2L;
1204 if (current_level == BIDI_MAXLEVEL - 4)
1205 bidi_it->invalid_rl_levels = 0;
1206 bidi_push_embedding_level (bidi_it, new_level, override);
1207 }
1208 else
1209 {
1210 bidi_it->invalid_levels++;
1211 /* See the commentary about invalid_rl_levels below. */
1212 if (bidi_it->invalid_rl_levels < 0)
1213 bidi_it->invalid_rl_levels = 0;
1214 bidi_it->invalid_rl_levels++;
1215 }
1216 }
89d3374a 1217 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1218 || bidi_it->next_en_pos > bidi_it->charpos)
1219 type = WEAK_EN;
1220 break;
1221 case LRE: /* X3 */
1222 case LRO: /* X5 */
89d3374a
EZ
1223 bidi_it->type_after_w1 = type;
1224 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1225 type = WEAK_BN; /* X9/Retaining */
87e67904 1226 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1227 {
1228 if (current_level <= BIDI_MAXLEVEL - 5)
1229 {
1230 /* Compute the least even embedding level greater than
1231 the current level. */
1232 new_level = ((current_level + 2) & ~1);
89d3374a 1233 if (bidi_it->type_after_w1 == LRE)
b7b65b15
EZ
1234 override = NEUTRAL_DIR;
1235 else
1236 override = L2R;
1237 bidi_push_embedding_level (bidi_it, new_level, override);
1238 }
1239 else
1240 {
1241 bidi_it->invalid_levels++;
1242 /* invalid_rl_levels counts invalid levels encountered
1243 while the embedding level was already too high for
1244 LRE/LRO, but not for RLE/RLO. That is because
1245 there may be exactly one PDF which we should not
1246 ignore even though invalid_levels is non-zero.
1247 invalid_rl_levels helps to know what PDF is
1248 that. */
1249 if (bidi_it->invalid_rl_levels >= 0)
1250 bidi_it->invalid_rl_levels++;
1251 }
1252 }
89d3374a 1253 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1254 || bidi_it->next_en_pos > bidi_it->charpos)
1255 type = WEAK_EN;
1256 break;
1257 case PDF: /* X7 */
89d3374a
EZ
1258 bidi_it->type_after_w1 = type;
1259 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1260 type = WEAK_BN; /* X9/Retaining */
87e67904 1261 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1262 {
1263 if (!bidi_it->invalid_rl_levels)
1264 {
1265 new_level = bidi_pop_embedding_level (bidi_it);
1266 bidi_it->invalid_rl_levels = -1;
1267 if (bidi_it->invalid_levels)
1268 bidi_it->invalid_levels--;
1269 /* else nothing: UAX#9 says to ignore invalid PDFs */
1270 }
1271 if (!bidi_it->invalid_levels)
1272 new_level = bidi_pop_embedding_level (bidi_it);
1273 else
1274 {
1275 bidi_it->invalid_levels--;
1276 bidi_it->invalid_rl_levels--;
1277 }
1278 }
89d3374a 1279 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1280 || bidi_it->next_en_pos > bidi_it->charpos)
1281 type = WEAK_EN;
1282 break;
1283 default:
1284 /* Nothing. */
1285 break;
1286 }
1287
1288 bidi_it->type = type;
2d6e4628 1289 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1290
1291 return new_level;
1292}
1293
1294/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1295 in the buffer/string to the next character (in the logical order),
1296 resolve any explicit embeddings and directional overrides, and
1297 return the embedding level of the character after resolving
1298 explicit directives and ignoring empty embeddings. */
b7b65b15
EZ
1299static int
1300bidi_resolve_explicit (struct bidi_it *bidi_it)
1301{
1302 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1303 int new_level = bidi_resolve_explicit_1 (bidi_it);
87e67904 1304 EMACS_INT eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
9f257352
EZ
1305 const unsigned char *s = STRINGP (bidi_it->string.lstring)
1306 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
b7b65b15
EZ
1307
1308 if (prev_level < new_level
1309 && bidi_it->type == WEAK_BN
87e67904
EZ
1310 && bidi_it->ignore_bn_limit == -1 /* only if not already known */
1311 && bidi_it->charpos < eob /* not already at EOB */
1312 && bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
9f257352 1313 + bidi_it->ch_len, s)))
b7b65b15
EZ
1314 {
1315 /* Avoid pushing and popping embedding levels if the level run
1316 is empty, as this breaks level runs where it shouldn't.
1317 UAX#9 removes all the explicit embedding and override codes,
1318 so empty embeddings disappear without a trace. We need to
1319 behave as if we did the same. */
1320 struct bidi_it saved_it;
1321 int level = prev_level;
1322
1323 bidi_copy_it (&saved_it, bidi_it);
1324
87e67904 1325 while (bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
9f257352 1326 + bidi_it->ch_len, s)))
b7b65b15 1327 {
182ce2d2
EZ
1328 /* This advances to the next character, skipping any
1329 characters covered by display strings. */
b7b65b15 1330 level = bidi_resolve_explicit_1 (bidi_it);
9f257352
EZ
1331 /* If string.lstring was relocated inside bidi_resolve_explicit_1,
1332 a pointer to its data is no longer valid. */
1333 if (STRINGP (bidi_it->string.lstring))
1334 s = SDATA (bidi_it->string.lstring);
b7b65b15
EZ
1335 }
1336
102ebb00
EZ
1337 if (bidi_it->nchars <= 0)
1338 abort ();
b7b65b15 1339 if (level == prev_level) /* empty embedding */
182ce2d2 1340 saved_it.ignore_bn_limit = bidi_it->charpos + bidi_it->nchars;
b7b65b15 1341 else /* this embedding is non-empty */
87e67904 1342 saved_it.ignore_bn_limit = -2;
b7b65b15
EZ
1343
1344 bidi_copy_it (bidi_it, &saved_it);
87e67904 1345 if (bidi_it->ignore_bn_limit > -1)
b7b65b15
EZ
1346 {
1347 /* We pushed a level, but we shouldn't have. Undo that. */
1348 if (!bidi_it->invalid_rl_levels)
1349 {
1350 new_level = bidi_pop_embedding_level (bidi_it);
1351 bidi_it->invalid_rl_levels = -1;
1352 if (bidi_it->invalid_levels)
1353 bidi_it->invalid_levels--;
1354 }
1355 if (!bidi_it->invalid_levels)
1356 new_level = bidi_pop_embedding_level (bidi_it);
1357 else
1358 {
1359 bidi_it->invalid_levels--;
1360 bidi_it->invalid_rl_levels--;
1361 }
1362 }
1363 }
1364
b7b65b15
EZ
1365 if (bidi_it->type == NEUTRAL_B) /* X8 */
1366 {
21fce5ab 1367 bidi_set_paragraph_end (bidi_it);
6bff6497
EZ
1368 /* This is needed by bidi_resolve_weak below, and in L1. */
1369 bidi_it->type_after_w1 = bidi_it->type;
89d3374a 1370 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1371 }
1372
1373 return new_level;
1374}
1375
182ce2d2
EZ
1376/* Advance in the buffer/string, resolve weak types and return the
1377 type of the next character after weak type resolution. */
fd3998ff 1378static bidi_type_t
b7b65b15
EZ
1379bidi_resolve_weak (struct bidi_it *bidi_it)
1380{
1381 bidi_type_t type;
1382 bidi_dir_t override;
1383 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1384 int new_level = bidi_resolve_explicit (bidi_it);
1385 int next_char;
1386 bidi_type_t type_of_next;
1387 struct bidi_it saved_it;
9f257352
EZ
1388 EMACS_INT eob =
1389 (STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
1390 ? bidi_it->string.schars : ZV;
b7b65b15
EZ
1391
1392 type = bidi_it->type;
1393 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1394
1395 if (type == UNKNOWN_BT
1396 || type == LRE
1397 || type == LRO
1398 || type == RLE
1399 || type == RLO
1400 || type == PDF)
1401 abort ();
1402
1403 if (new_level != prev_level
1404 || bidi_it->type == NEUTRAL_B)
1405 {
1406 /* We've got a new embedding level run, compute the directional
1407 type of sor and initialize per-run variables (UAX#9, clause
1408 X10). */
1409 bidi_set_sor_type (bidi_it, prev_level, new_level);
1410 }
1411 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1412 || type == WEAK_BN || type == STRONG_AL)
89d3374a
EZ
1413 bidi_it->type_after_w1 = type; /* needed in L1 */
1414 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1415
1416 /* Level and directional override status are already recorded in
1417 bidi_it, and do not need any change; see X6. */
1418 if (override == R2L) /* X6 */
1419 type = STRONG_R;
1420 else if (override == L2R)
1421 type = STRONG_L;
bc5a45f3 1422 else
b7b65b15 1423 {
bc5a45f3 1424 if (type == WEAK_NSM) /* W1 */
b7b65b15 1425 {
bc5a45f3 1426 /* Note that we don't need to consider the case where the
5930fe97
EZ
1427 prev character has its type overridden by an RLO or LRO,
1428 because then either the type of this NSM would have been
1429 also overridden, or the previous character is outside the
1430 current level run, and thus not relevant to this NSM.
1431 This is why NSM gets the type_after_w1 of the previous
1432 character. */
ebb5722e
EZ
1433 if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1434 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1435 && bidi_it->prev.type_after_w1 != NEUTRAL_B)
5930fe97 1436 type = bidi_it->prev.type_after_w1;
bc5a45f3
EZ
1437 else if (bidi_it->sor == R2L)
1438 type = STRONG_R;
1439 else if (bidi_it->sor == L2R)
1440 type = STRONG_L;
1441 else /* shouldn't happen! */
1442 abort ();
b7b65b15 1443 }
bc5a45f3
EZ
1444 if (type == WEAK_EN /* W2 */
1445 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1446 type = WEAK_AN;
1447 else if (type == STRONG_AL) /* W3 */
1448 type = STRONG_R;
1449 else if ((type == WEAK_ES /* W4 */
1450 && bidi_it->prev.type_after_w1 == WEAK_EN
1451 && bidi_it->prev.orig_type == WEAK_EN)
1452 || (type == WEAK_CS
1453 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1454 && bidi_it->prev.orig_type == WEAK_EN)
1455 || bidi_it->prev.type_after_w1 == WEAK_AN)))
b7b65b15 1456 {
9f257352
EZ
1457 const unsigned char *s =
1458 STRINGP (bidi_it->string.lstring)
1459 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
1460
e7402cb2 1461 next_char =
87e67904
EZ
1462 bidi_it->charpos + bidi_it->nchars >= eob
1463 ? BIDI_EOB
9f257352 1464 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s);
6bff6497 1465 type_of_next = bidi_get_type (next_char, override);
b7b65b15 1466
bc5a45f3 1467 if (type_of_next == WEAK_BN
b7b65b15
EZ
1468 || bidi_explicit_dir_char (next_char))
1469 {
1470 bidi_copy_it (&saved_it, bidi_it);
1471 while (bidi_resolve_explicit (bidi_it) == new_level
bc5a45f3 1472 && bidi_it->type == WEAK_BN)
b7b65b15
EZ
1473 ;
1474 type_of_next = bidi_it->type;
b7b65b15
EZ
1475 bidi_copy_it (bidi_it, &saved_it);
1476 }
bc5a45f3
EZ
1477
1478 /* If the next character is EN, but the last strong-type
1479 character is AL, that next EN will be changed to AN when
1480 we process it in W2 above. So in that case, this ES
1481 should not be changed into EN. */
1482 if (type == WEAK_ES
1483 && type_of_next == WEAK_EN
1484 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1485 type = WEAK_EN;
1486 else if (type == WEAK_CS)
b7b65b15 1487 {
bc5a45f3
EZ
1488 if (bidi_it->prev.type_after_w1 == WEAK_AN
1489 && (type_of_next == WEAK_AN
1490 /* If the next character is EN, but the last
1491 strong-type character is AL, EN will be later
1492 changed to AN when we process it in W2 above.
1493 So in that case, this ES should not be
1494 changed into EN. */
1495 || (type_of_next == WEAK_EN
1496 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1497 type = WEAK_AN;
1498 else if (bidi_it->prev.type_after_w1 == WEAK_EN
1499 && type_of_next == WEAK_EN
1500 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1501 type = WEAK_EN;
1502 }
1503 }
1504 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1505 || type == WEAK_BN) /* W5/Retaining */
1506 {
1507 if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
1508 || bidi_it->next_en_pos > bidi_it->charpos)
1509 type = WEAK_EN;
1510 else /* W5: ET/BN with EN after it. */
1511 {
182ce2d2 1512 EMACS_INT en_pos = bidi_it->charpos + bidi_it->nchars;
9f257352
EZ
1513 const unsigned char *s =
1514 STRINGP (bidi_it->string.lstring)
1515 ? SDATA (bidi_it->string.lstring) : bidi_it->string.s;
bc5a45f3 1516
102ebb00
EZ
1517 if (bidi_it->nchars <= 0)
1518 abort ();
bc5a45f3 1519 next_char =
87e67904
EZ
1520 bidi_it->charpos + bidi_it->nchars >= eob
1521 ? BIDI_EOB
9f257352 1522 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s);
bc5a45f3
EZ
1523 type_of_next = bidi_get_type (next_char, override);
1524
1525 if (type_of_next == WEAK_ET
1526 || type_of_next == WEAK_BN
1527 || bidi_explicit_dir_char (next_char))
1528 {
1529 bidi_copy_it (&saved_it, bidi_it);
1530 while (bidi_resolve_explicit (bidi_it) == new_level
1531 && (bidi_it->type == WEAK_BN
1532 || bidi_it->type == WEAK_ET))
1533 ;
1534 type_of_next = bidi_it->type;
1535 en_pos = bidi_it->charpos;
1536 bidi_copy_it (bidi_it, &saved_it);
1537 }
1538 if (type_of_next == WEAK_EN)
b7b65b15 1539 {
bc5a45f3
EZ
1540 /* If the last strong character is AL, the EN we've
1541 found will become AN when we get to it (W2). */
1542 if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
1543 {
1544 type = WEAK_EN;
1545 /* Remember this EN position, to speed up processing
1546 of the next ETs. */
1547 bidi_it->next_en_pos = en_pos;
1548 }
1549 else if (type == WEAK_BN)
1550 type = NEUTRAL_ON; /* W6/Retaining */
b7b65b15 1551 }
b7b65b15
EZ
1552 }
1553 }
1554 }
1555
1556 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
89d3374a
EZ
1557 || (type == WEAK_BN
1558 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1559 || bidi_it->prev.type_after_w1 == WEAK_ES
1560 || bidi_it->prev.type_after_w1 == WEAK_ET)))
b7b65b15
EZ
1561 type = NEUTRAL_ON;
1562
1563 /* Store the type we've got so far, before we clobber it with strong
1564 types in W7 and while resolving neutral types. But leave alone
1565 the original types that were recorded above, because we will need
1566 them for the L1 clause. */
89d3374a
EZ
1567 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1568 bidi_it->type_after_w1 = type;
1569 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1570
1571 if (type == WEAK_EN) /* W7 */
1572 {
89d3374a 1573 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
b7b65b15
EZ
1574 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1575 type = STRONG_L;
1576 }
1577
1578 bidi_it->type = type;
2d6e4628 1579 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1580 return type;
1581}
1582
cbb09f04
EZ
1583/* Resolve the type of a neutral character according to the type of
1584 surrounding strong text and the current embedding level. */
58b9f433 1585static inline bidi_type_t
cbb09f04
EZ
1586bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
1587{
1588 /* N1: European and Arabic numbers are treated as though they were R. */
1589 if (next_type == WEAK_EN || next_type == WEAK_AN)
1590 next_type = STRONG_R;
1591 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
1592 prev_type = STRONG_R;
1593
1594 if (next_type == prev_type) /* N1 */
1595 return next_type;
1596 else if ((lev & 1) == 0) /* N2 */
1597 return STRONG_L;
1598 else
1599 return STRONG_R;
1600}
1601
fd3998ff 1602static bidi_type_t
b7b65b15
EZ
1603bidi_resolve_neutral (struct bidi_it *bidi_it)
1604{
1605 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1606 bidi_type_t type = bidi_resolve_weak (bidi_it);
1607 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1608
1609 if (!(type == STRONG_R
1610 || type == STRONG_L
1611 || type == WEAK_BN
1612 || type == WEAK_EN
1613 || type == WEAK_AN
1614 || type == NEUTRAL_B
1615 || type == NEUTRAL_S
1616 || type == NEUTRAL_WS
1617 || type == NEUTRAL_ON))
1618 abort ();
1619
1620 if (bidi_get_category (type) == NEUTRAL
1621 || (type == WEAK_BN && prev_level == current_level))
1622 {
1623 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1624 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1625 bidi_it->next_for_neutral.type,
1626 current_level);
1627 else
1628 {
1629 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1630 the assumption of batch-style processing; see clauses W4,
1631 W5, and especially N1, which require to look far forward
182ce2d2
EZ
1632 (as well as back) in the buffer/string. May the fleas of
1633 a thousand camels infest the armpits of those who design
b7b65b15
EZ
1634 supposedly general-purpose algorithms by looking at their
1635 own implementations, and fail to consider other possible
1636 implementations! */
1637 struct bidi_it saved_it;
1638 bidi_type_t next_type;
1639
1640 if (bidi_it->scan_dir == -1)
1641 abort ();
1642
1643 bidi_copy_it (&saved_it, bidi_it);
1644 /* Scan the text forward until we find the first non-neutral
1645 character, and then use that to resolve the neutral we
1646 are dealing with now. We also cache the scanned iterator
1647 states, to salvage some of the effort later. */
1648 bidi_cache_iterator_state (bidi_it, 0);
1649 do {
1650 /* Record the info about the previous character, so that
1651 it will be cached below with this state. */
89d3374a 1652 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1653 && bidi_it->type != WEAK_BN)
1654 bidi_remember_char (&bidi_it->prev, bidi_it);
1655 type = bidi_resolve_weak (bidi_it);
1656 /* Paragraph separators have their levels fully resolved
1657 at this point, so cache them as resolved. */
1658 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1659 /* FIXME: implement L1 here, by testing for a newline and
1660 resetting the level for any sequence of whitespace
1661 characters adjacent to it. */
1662 } while (!(type == NEUTRAL_B
1663 || (type != WEAK_BN
1664 && bidi_get_category (type) != NEUTRAL)
1665 /* This is all per level run, so stop when we
1666 reach the end of this level run. */
1667 || bidi_it->level_stack[bidi_it->stack_idx].level !=
1668 current_level));
1669
1670 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1671
1672 switch (type)
1673 {
1674 case STRONG_L:
1675 case STRONG_R:
1676 case STRONG_AL:
1677 next_type = type;
1678 break;
1679 case WEAK_EN:
1680 case WEAK_AN:
1681 /* N1: ``European and Arabic numbers are treated as
1682 though they were R.'' */
1683 next_type = STRONG_R;
1684 saved_it.next_for_neutral.type = STRONG_R;
1685 break;
1686 case WEAK_BN:
1687 if (!bidi_explicit_dir_char (bidi_it->ch))
1688 abort (); /* can't happen: BNs are skipped */
1689 /* FALLTHROUGH */
1690 case NEUTRAL_B:
1691 /* Marched all the way to the end of this level run.
1692 We need to use the eor type, whose information is
1693 stored by bidi_set_sor_type in the prev_for_neutral
1694 member. */
1695 if (saved_it.type != WEAK_BN
89d3374a 1696 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
b7b65b15
EZ
1697 {
1698 next_type = bidi_it->prev_for_neutral.type;
1699 saved_it.next_for_neutral.type = next_type;
2d6e4628 1700 bidi_check_type (next_type);
b7b65b15
EZ
1701 }
1702 else
1703 {
1704 /* This is a BN which does not adjoin neutrals.
1705 Leave its type alone. */
1706 bidi_copy_it (bidi_it, &saved_it);
1707 return bidi_it->type;
1708 }
1709 break;
1710 default:
1711 abort ();
1712 }
1713 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1714 next_type, current_level);
1715 saved_it.type = type;
2d6e4628 1716 bidi_check_type (type);
b7b65b15
EZ
1717 bidi_copy_it (bidi_it, &saved_it);
1718 }
1719 }
1720 return type;
1721}
1722
1723/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1724 in the buffer/string to the next character (in the logical order),
1725 resolve the bidi type of that next character, and return that
1726 type. */
fd3998ff 1727static bidi_type_t
b7b65b15
EZ
1728bidi_type_of_next_char (struct bidi_it *bidi_it)
1729{
1730 bidi_type_t type;
1731
1732 /* This should always be called during a forward scan. */
1733 if (bidi_it->scan_dir != 1)
1734 abort ();
1735
1736 /* Reset the limit until which to ignore BNs if we step out of the
1737 area where we found only empty levels. */
87e67904 1738 if ((bidi_it->ignore_bn_limit > -1
b7b65b15 1739 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
87e67904 1740 || (bidi_it->ignore_bn_limit == -2
b7b65b15 1741 && !bidi_explicit_dir_char (bidi_it->ch)))
87e67904 1742 bidi_it->ignore_bn_limit = -1;
b7b65b15
EZ
1743
1744 type = bidi_resolve_neutral (bidi_it);
1745
1746 return type;
1747}
1748
1749/* Given an iterator state BIDI_IT, advance one character position in
182ce2d2
EZ
1750 the buffer/string to the next character (in the current scan
1751 direction), resolve the embedding and implicit levels of that next
1752 character, and return the resulting level. */
fd3998ff 1753static int
b7b65b15
EZ
1754bidi_level_of_next_char (struct bidi_it *bidi_it)
1755{
1756 bidi_type_t type;
1757 int level, prev_level = -1;
1758 struct bidi_saved_info next_for_neutral;
578b494e 1759 EMACS_INT next_char_pos = -2;
b7b65b15
EZ
1760
1761 if (bidi_it->scan_dir == 1)
1762 {
9f257352
EZ
1763 EMACS_INT eob =
1764 (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
1765 ? bidi_it->string.schars : ZV;
1766
b7b65b15 1767 /* There's no sense in trying to advance if we hit end of text. */
9f257352 1768 if (bidi_it->charpos >= eob)
b7b65b15
EZ
1769 return bidi_it->resolved_level;
1770
1771 /* Record the info about the previous character. */
89d3374a 1772 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1773 && bidi_it->type != WEAK_BN)
1774 bidi_remember_char (&bidi_it->prev, bidi_it);
89d3374a
EZ
1775 if (bidi_it->type_after_w1 == STRONG_R
1776 || bidi_it->type_after_w1 == STRONG_L
1777 || bidi_it->type_after_w1 == STRONG_AL)
b7b65b15
EZ
1778 bidi_remember_char (&bidi_it->last_strong, bidi_it);
1779 /* FIXME: it sounds like we don't need both prev and
1780 prev_for_neutral members, but I'm leaving them both for now. */
1781 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1782 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1783 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1784
1785 /* If we overstepped the characters used for resolving neutrals
1786 and whitespace, invalidate their info in the iterator. */
1787 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1788 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1789 if (bidi_it->next_en_pos >= 0
1790 && bidi_it->charpos >= bidi_it->next_en_pos)
1791 bidi_it->next_en_pos = -1;
1792 if (bidi_it->next_for_ws.type != UNKNOWN_BT
1793 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1794 bidi_it->next_for_ws.type = UNKNOWN_BT;
1795
1796 /* This must be taken before we fill the iterator with the info
1797 about the next char. If we scan backwards, the iterator
1798 state must be already cached, so there's no need to know the
1799 embedding level of the previous character, since we will be
1800 returning to our caller shortly. */
1801 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1802 }
1803 next_for_neutral = bidi_it->next_for_neutral;
1804
182ce2d2
EZ
1805 /* Perhaps the character we want is already cached. If it is, the
1806 call to bidi_cache_find below will return a type other than
1807 UNKNOWN_BT. */
87e67904 1808 if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
102ebb00 1809 {
9f257352
EZ
1810 int bob =
1811 (bidi_it->string.s || STRINGP (bidi_it->string.lstring)) ? 0 : 1;
1812
102ebb00
EZ
1813 if (bidi_it->scan_dir > 0)
1814 {
1815 if (bidi_it->nchars <= 0)
1816 abort ();
1817 next_char_pos = bidi_it->charpos + bidi_it->nchars;
1818 }
9f257352 1819 else if (bidi_it->charpos >= bob)
bb269206
EZ
1820 /* Implementation note: we allow next_char_pos to be as low as
1821 0 for buffers or -1 for strings, and that is okay because
1822 that's the "position" of the sentinel iterator state we
1823 cached at the beginning of the iteration. */
102ebb00 1824 next_char_pos = bidi_it->charpos - 1;
578b494e 1825 if (next_char_pos >= bob - 1)
87e67904
EZ
1826 type = bidi_cache_find (next_char_pos, -1, bidi_it);
1827 else
1828 type = UNKNOWN_BT;
102ebb00 1829 }
182ce2d2 1830 else
102ebb00 1831 type = UNKNOWN_BT;
b7b65b15
EZ
1832 if (type != UNKNOWN_BT)
1833 {
1834 /* Don't lose the information for resolving neutrals! The
1835 cached states could have been cached before their
1836 next_for_neutral member was computed. If we are on our way
1837 forward, we can simply take the info from the previous
1838 state. */
1839 if (bidi_it->scan_dir == 1
1840 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
1841 bidi_it->next_for_neutral = next_for_neutral;
1842
1843 /* If resolved_level is -1, it means this state was cached
1844 before it was completely resolved, so we cannot return
1845 it. */
1846 if (bidi_it->resolved_level != -1)
1847 return bidi_it->resolved_level;
1848 }
1849 if (bidi_it->scan_dir == -1)
1850 /* If we are going backwards, the iterator state is already cached
1851 from previous scans, and should be fully resolved. */
1852 abort ();
1853
1854 if (type == UNKNOWN_BT)
1855 type = bidi_type_of_next_char (bidi_it);
1856
1857 if (type == NEUTRAL_B)
1858 return bidi_it->resolved_level;
1859
1860 level = bidi_it->level_stack[bidi_it->stack_idx].level;
1861 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
1862 || (type == WEAK_BN && prev_level == level))
1863 {
1864 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1865 abort ();
1866
1867 /* If the cached state shows a neutral character, it was not
1868 resolved by bidi_resolve_neutral, so do it now. */
1869 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1870 bidi_it->next_for_neutral.type,
1871 level);
1872 }
1873
1874 if (!(type == STRONG_R
1875 || type == STRONG_L
1876 || type == WEAK_BN
1877 || type == WEAK_EN
1878 || type == WEAK_AN))
1879 abort ();
1880 bidi_it->type = type;
2d6e4628 1881 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1882
1883 /* For L1 below, we need to know, for each WS character, whether
0d327994 1884 it belongs to a sequence of WS characters preceding a newline
b7b65b15 1885 or a TAB or a paragraph separator. */
89d3374a 1886 if (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1887 && bidi_it->next_for_ws.type == UNKNOWN_BT)
1888 {
1889 int ch;
9d68c2a9 1890 EMACS_INT clen = bidi_it->ch_len;
6bff6497
EZ
1891 EMACS_INT bpos = bidi_it->bytepos;
1892 EMACS_INT cpos = bidi_it->charpos;
182ce2d2 1893 EMACS_INT disp_pos = bidi_it->disp_pos;
102ebb00 1894 EMACS_INT nc = bidi_it->nchars;
87e67904 1895 struct bidi_string_data bs = bidi_it->string;
b7b65b15 1896 bidi_type_t chtype;
fc6f18ce 1897 int fwp = bidi_it->frame_window_p;
b7b65b15 1898
102ebb00
EZ
1899 if (bidi_it->nchars <= 0)
1900 abort ();
b7b65b15 1901 do {
87e67904 1902 ch = bidi_fetch_char (bpos += clen, cpos += nc, &disp_pos, &bs, fwp,
fc6f18ce 1903 &clen, &nc);
e7402cb2 1904 if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
b7b65b15
EZ
1905 chtype = NEUTRAL_B;
1906 else
6bff6497 1907 chtype = bidi_get_type (ch, NEUTRAL_DIR);
b7b65b15
EZ
1908 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
1909 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
1910 bidi_it->next_for_ws.type = chtype;
2d6e4628 1911 bidi_check_type (bidi_it->next_for_ws.type);
b7b65b15
EZ
1912 bidi_it->next_for_ws.charpos = cpos;
1913 bidi_it->next_for_ws.bytepos = bpos;
1914 }
1915
1916 /* Resolve implicit levels, with a twist: PDFs get the embedding
1917 level of the enbedding they terminate. See below for the
1918 reason. */
89d3374a 1919 if (bidi_it->orig_type == PDF
b7b65b15
EZ
1920 /* Don't do this if this formatting code didn't change the
1921 embedding level due to invalid or empty embeddings. */
1922 && prev_level != level)
1923 {
1924 /* Don't look in UAX#9 for the reason for this: it's our own
1925 private quirk. The reason is that we want the formatting
1926 codes to be delivered so that they bracket the text of their
1927 embedding. For example, given the text
1928
1929 {RLO}teST{PDF}
1930
1931 we want it to be displayed as
1932
0d68907d 1933 {PDF}STet{RLO}
b7b65b15
EZ
1934
1935 not as
1936
1937 STet{RLO}{PDF}
1938
1939 which will result because we bump up the embedding level as
1940 soon as we see the RLO and pop it as soon as we see the PDF,
1941 so RLO itself has the same embedding level as "teST", and
1942 thus would be normally delivered last, just before the PDF.
1943 The switch below fiddles with the level of PDF so that this
1944 ugly side effect does not happen.
1945
1946 (This is, of course, only important if the formatting codes
e7402cb2
EZ
1947 are actually displayed, but Emacs does need to display them
1948 if the user wants to.) */
b7b65b15
EZ
1949 level = prev_level;
1950 }
89d3374a
EZ
1951 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
1952 || bidi_it->orig_type == NEUTRAL_S
e7402cb2
EZ
1953 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
1954 /* || bidi_it->ch == LINESEP_CHAR */
89d3374a 1955 || (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1956 && (bidi_it->next_for_ws.type == NEUTRAL_B
1957 || bidi_it->next_for_ws.type == NEUTRAL_S)))
1958 level = bidi_it->level_stack[0].level;
1959 else if ((level & 1) == 0) /* I1 */
1960 {
1961 if (type == STRONG_R)
1962 level++;
1963 else if (type == WEAK_EN || type == WEAK_AN)
1964 level += 2;
1965 }
1966 else /* I2 */
1967 {
1968 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
1969 level++;
1970 }
1971
1972 bidi_it->resolved_level = level;
1973 return level;
1974}
1975
1976/* Move to the other edge of a level given by LEVEL. If END_FLAG is
1977 non-zero, we are at the end of a level, and we need to prepare to
1978 resume the scan of the lower level.
1979
1980 If this level's other edge is cached, we simply jump to it, filling
1981 the iterator structure with the iterator state on the other edge.
182ce2d2
EZ
1982 Otherwise, we walk the buffer or string until we come back to the
1983 same level as LEVEL.
b7b65b15
EZ
1984
1985 Note: we are not talking here about a ``level run'' in the UAX#9
1986 sense of the term, but rather about a ``level'' which includes
1987 all the levels higher than it. In other words, given the levels
1988 like this:
1989
1990 11111112222222333333334443343222222111111112223322111
1991 A B C
1992
1993 and assuming we are at point A scanning left to right, this
1994 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
1995 at point B. */
1996static void
1997bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
1998{
1999 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
2000 int idx;
2001
2002 /* Try the cache first. */
87e67904
EZ
2003 if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
2004 >= bidi_cache_start)
b7b65b15
EZ
2005 bidi_cache_fetch_state (idx, bidi_it);
2006 else
2007 {
2008 int new_level;
2009
2010 if (end_flag)
2011 abort (); /* if we are at end of level, its edges must be cached */
2012
2013 bidi_cache_iterator_state (bidi_it, 1);
2014 do {
2015 new_level = bidi_level_of_next_char (bidi_it);
2016 bidi_cache_iterator_state (bidi_it, 1);
2017 } while (new_level >= level);
2018 }
2019}
2020
2021void
4b292a22 2022bidi_move_to_visually_next (struct bidi_it *bidi_it)
b7b65b15
EZ
2023{
2024 int old_level, new_level, next_level;
9c82e145 2025 struct bidi_it sentinel;
acb28818 2026 struct gcpro gcpro1;
b7b65b15 2027
87e67904
EZ
2028 if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
2029 abort ();
2030
b7b65b15
EZ
2031 if (bidi_it->scan_dir == 0)
2032 {
2033 bidi_it->scan_dir = 1; /* default to logical order */
2034 }
2035
acb28818
EZ
2036 /* The code below can call eval, and thus cause GC. If we are
2037 iterating a Lisp string, make sure it won't GCed. */
2038 if (STRINGP (bidi_it->string.lstring))
2039 GCPRO1 (bidi_it->string.lstring);
2040
be39f003
EZ
2041 /* If we just passed a newline, initialize for the next line. */
2042 if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
2043 bidi_line_init (bidi_it);
2044
6dcfd253
EZ
2045 /* Prepare the sentinel iterator state, and cache it. When we bump
2046 into it, scanning backwards, we'll know that the last non-base
2047 level is exhausted. */
87e67904 2048 if (bidi_cache_idx == bidi_cache_start)
9c82e145
EZ
2049 {
2050 bidi_copy_it (&sentinel, bidi_it);
2051 if (bidi_it->first_elt)
2052 {
2053 sentinel.charpos--; /* cached charpos needs to be monotonic */
2054 sentinel.bytepos--;
2055 sentinel.ch = '\n'; /* doesn't matter, but why not? */
2056 sentinel.ch_len = 1;
182ce2d2 2057 sentinel.nchars = 1;
9c82e145 2058 }
6dcfd253 2059 bidi_cache_iterator_state (&sentinel, 1);
9c82e145 2060 }
b7b65b15
EZ
2061
2062 old_level = bidi_it->resolved_level;
2063 new_level = bidi_level_of_next_char (bidi_it);
b7b65b15
EZ
2064
2065 /* Reordering of resolved levels (clause L2) is implemented by
2066 jumping to the other edge of the level and flipping direction of
c0546589 2067 scanning the text whenever we find a level change. */
b7b65b15
EZ
2068 if (new_level != old_level)
2069 {
2070 int ascending = new_level > old_level;
2071 int level_to_search = ascending ? old_level + 1 : old_level;
2072 int incr = ascending ? 1 : -1;
2073 int expected_next_level = old_level + incr;
2074
b7b65b15
EZ
2075 /* Jump (or walk) to the other edge of this level. */
2076 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2077 /* Switch scan direction and peek at the next character in the
2078 new direction. */
2079 bidi_it->scan_dir = -bidi_it->scan_dir;
2080
2081 /* The following loop handles the case where the resolved level
2082 jumps by more than one. This is typical for numbers inside a
2083 run of text with left-to-right embedding direction, but can
2084 also happen in other situations. In those cases the decision
2085 where to continue after a level change, and in what direction,
2086 is tricky. For example, given a text like below:
2087
2088 abcdefgh
2089 11336622
2090
2091 (where the numbers below the text show the resolved levels),
2092 the result of reordering according to UAX#9 should be this:
2093
2094 efdcghba
2095
2096 This is implemented by the loop below which flips direction
2097 and jumps to the other edge of the level each time it finds
2098 the new level not to be the expected one. The expected level
2099 is always one more or one less than the previous one. */
2100 next_level = bidi_peek_at_next_level (bidi_it);
2101 while (next_level != expected_next_level)
2102 {
2103 expected_next_level += incr;
2104 level_to_search += incr;
2105 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2106 bidi_it->scan_dir = -bidi_it->scan_dir;
2107 next_level = bidi_peek_at_next_level (bidi_it);
2108 }
2109
2110 /* Finally, deliver the next character in the new direction. */
2111 next_level = bidi_level_of_next_char (bidi_it);
2112 }
2113
b44d9321
EZ
2114 /* Take note when we have just processed the newline that precedes
2115 the end of the paragraph. The next time we are about to be
2116 called, set_iterator_to_next will automatically reinit the
2117 paragraph direction, if needed. We do this at the newline before
2118 the paragraph separator, because the next character might not be
2119 the first character of the next paragraph, due to the bidi
c0546589
EZ
2120 reordering, whereas we _must_ know the paragraph base direction
2121 _before_ we process the paragraph's text, since the base
2122 direction affects the reordering. */
87e67904 2123 if (bidi_it->scan_dir == 1 && bidi_it->orig_type == NEUTRAL_B)
be39f003 2124 {
87e67904
EZ
2125 /* The paragraph direction of the entire string, once
2126 determined, is in effect for the entire string. Setting the
2127 separator limit to the end of the string prevents
2128 bidi_paragraph_init from being called automatically on this
2129 string. */
9f257352 2130 if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
87e67904
EZ
2131 bidi_it->separator_limit = bidi_it->string.schars;
2132 else if (bidi_it->bytepos < ZV_BYTE)
be39f003 2133 {
87e67904
EZ
2134 EMACS_INT sep_len =
2135 bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
2136 bidi_it->bytepos + bidi_it->ch_len);
2137 if (bidi_it->nchars <= 0)
2138 abort ();
2139 if (sep_len >= 0)
2140 {
2141 bidi_it->new_paragraph = 1;
2142 /* Record the buffer position of the last character of the
2143 paragraph separator. */
2144 bidi_it->separator_limit =
2145 bidi_it->charpos + bidi_it->nchars + sep_len;
2146 }
be39f003
EZ
2147 }
2148 }
6bff6497 2149
87e67904 2150 if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
b7b65b15
EZ
2151 {
2152 /* If we are at paragraph's base embedding level and beyond the
2153 last cached position, the cache's job is done and we can
2154 discard it. */
2155 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
182ce2d2
EZ
2156 && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
2157 + bidi_cache[bidi_cache_idx - 1].nchars - 1))
b7b65b15
EZ
2158 bidi_cache_reset ();
2159 /* But as long as we are caching during forward scan, we must
2160 cache each state, or else the cache integrity will be
2161 compromised: it assumes cached states correspond to buffer
2162 positions 1:1. */
2163 else
2164 bidi_cache_iterator_state (bidi_it, 1);
2165 }
acb28818
EZ
2166
2167 if (STRINGP (bidi_it->string.lstring))
2168 UNGCPRO;
b7b65b15
EZ
2169}
2170
2171/* This is meant to be called from within the debugger, whenever you
2172 wish to examine the cache contents. */
e78aecca 2173void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
b7b65b15
EZ
2174void
2175bidi_dump_cached_states (void)
2176{
2177 int i;
2178 int ndigits = 1;
2179
2180 if (bidi_cache_idx == 0)
2181 {
2182 fprintf (stderr, "The cache is empty.\n");
2183 return;
2184 }
2185 fprintf (stderr, "Total of %d state%s in cache:\n",
2186 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2187
2188 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2189 ndigits++;
2190 fputs ("ch ", stderr);
2191 for (i = 0; i < bidi_cache_idx; i++)
2192 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2193 fputs ("\n", stderr);
2194 fputs ("lvl ", stderr);
2195 for (i = 0; i < bidi_cache_idx; i++)
2196 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2197 fputs ("\n", stderr);
2198 fputs ("pos ", stderr);
2199 for (i = 0; i < bidi_cache_idx; i++)
c2982e87 2200 fprintf (stderr, "%*"pI"d", ndigits, bidi_cache[i].charpos);
b7b65b15
EZ
2201 fputs ("\n", stderr);
2202}