2 GiB, not 4 GiB, for buffer sizes.
[bpt/emacs.git] / src / bidi.c
CommitLineData
cbb09f04 1/* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
73b0cd50 2 Copyright (C) 2000-2001, 2004-2005, 2009-2011
b118e65f 3 Free Software Foundation, Inc.
b7b65b15
EZ
4
5This file is part of GNU Emacs.
6
a8d11bd3 7GNU Emacs is free software: you can redistribute it and/or modify
b7b65b15 8it under the terms of the GNU General Public License as published by
a8d11bd3
EZ
9the Free Software Foundation, either version 3 of the License, or
10(at your option) any later version.
b7b65b15
EZ
11
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
b7b65b15 17You should have received a copy of the GNU General Public License
a8d11bd3 18along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
b7b65b15 19
2d6e4628
EZ
20/* Written by Eli Zaretskii <eliz@gnu.org>.
21
22 A sequential implementation of the Unicode Bidirectional algorithm,
cbb09f04 23 (UBA) as per UAX#9, a part of the Unicode Standard.
b7b65b15
EZ
24
25 Unlike the reference and most other implementations, this one is
940afb59
EZ
26 designed to be called once for every character in the buffer or
27 string.
b7b65b15 28
4b292a22 29 The main entry point is bidi_move_to_visually_next. Each time it
b7b65b15
EZ
30 is called, it finds the next character in the visual order, and
31 returns its information in a special structure. The caller is then
32 expected to process this character for display or any other
4b292a22
EZ
33 purposes, and call bidi_move_to_visually_next for the next
34 character. See the comments in bidi_move_to_visually_next for more
35 details about its algorithm that finds the next visual-order
b7b65b15
EZ
36 character by resolving their levels on the fly.
37
95e1afc6 38 Two other entry points are bidi_paragraph_init and
940afb59
EZ
39 bidi_mirror_char. The first determines the base direction of a
40 paragraph, while the second returns the mirrored version of its
41 argument character.
42
95e1afc6
EZ
43 A few auxiliary entry points are used to initialize the bidi
44 iterator for iterating an object (buffer or string), push and pop
45 the bidi iterator state, and save and restore the state of the bidi
46 cache.
47
89d3374a
EZ
48 If you want to understand the code, you will have to read it
49 together with the relevant portions of UAX#9. The comments include
50 references to UAX#9 rules, for that very reason.
51
b7b65b15
EZ
52 A note about references to UAX#9 rules: if the reference says
53 something like "X9/Retaining", it means that you need to refer to
54 rule X9 and to its modifications decribed in the "Implementation
55 Notes" section of UAX#9, under "Retaining Format Codes". */
56
b7b65b15 57#include <config.h>
b7b65b15 58#include <stdio.h>
29e3d8d1
EZ
59#include <setjmp.h>
60
b7b65b15
EZ
61#include "lisp.h"
62#include "buffer.h"
63#include "character.h"
64#include "dispextern.h"
65
66static int bidi_initialized = 0;
67
cbc4fd20 68static Lisp_Object bidi_type_table, bidi_mirror_table;
b7b65b15
EZ
69
70#define LRM_CHAR 0x200E
71#define RLM_CHAR 0x200F
b7b65b15 72#define BIDI_EOB -1
b7b65b15 73
b7b65b15
EZ
74/* Data type for describing the bidirectional character categories. */
75typedef enum {
76 UNKNOWN_BC,
77 NEUTRAL,
78 WEAK,
79 STRONG
80} bidi_category_t;
81
32413314
EZ
82/* UAX#9 says to search only for L, AL, or R types of characters, and
83 ignore RLE, RLO, LRE, and LRO, when determining the base paragraph
84 level. Yudit indeed ignores them. This variable is therefore set
85 by default to ignore them, but setting it to zero will take them
86 into account. */
e78aecca 87extern int bidi_ignore_explicit_marks_for_paragraph_level EXTERNALLY_VISIBLE;
b7b65b15
EZ
88int bidi_ignore_explicit_marks_for_paragraph_level = 1;
89
372b7a95 90static Lisp_Object paragraph_start_re, paragraph_separate_re;
6bff6497 91static Lisp_Object Qparagraph_start, Qparagraph_separate;
b7b65b15 92
cbb09f04
EZ
93\f
94/***********************************************************************
95 Utilities
96 ***********************************************************************/
b7b65b15 97
6bff6497
EZ
98/* Return the bidi type of a character CH, subject to the current
99 directional OVERRIDE. */
55d4c1b2 100static inline bidi_type_t
6bff6497 101bidi_get_type (int ch, bidi_dir_t override)
b7b65b15 102{
6bff6497
EZ
103 bidi_type_t default_type;
104
e7402cb2
EZ
105 if (ch == BIDI_EOB)
106 return NEUTRAL_B;
e342a24d
EZ
107 if (ch < 0 || ch > MAX_CHAR)
108 abort ();
6bff6497
EZ
109
110 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
bca633fb
EZ
111 /* Every valid character code, even those that are unassigned by the
112 UCD, have some bidi-class property, according to
113 DerivedBidiClass.txt file. Therefore, if we ever get UNKNOWN_BT
114 (= zero) code from CHAR_TABLE_REF, that's a bug. */
115 if (default_type == UNKNOWN_BT)
116 abort ();
6bff6497
EZ
117
118 if (override == NEUTRAL_DIR)
119 return default_type;
120
121 switch (default_type)
122 {
123 /* Although UAX#9 does not tell, it doesn't make sense to
124 override NEUTRAL_B and LRM/RLM characters. */
125 case NEUTRAL_B:
126 case LRE:
127 case LRO:
128 case RLE:
129 case RLO:
130 case PDF:
131 return default_type;
132 default:
133 switch (ch)
134 {
135 case LRM_CHAR:
136 case RLM_CHAR:
137 return default_type;
138 default:
139 if (override == L2R) /* X6 */
140 return STRONG_L;
141 else if (override == R2L)
142 return STRONG_R;
143 else
144 abort (); /* can't happen: handled above */
145 }
146 }
b7b65b15
EZ
147}
148
f67cdd7f 149static inline void
2d6e4628
EZ
150bidi_check_type (bidi_type_t type)
151{
f67cdd7f 152 xassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
2d6e4628
EZ
153}
154
b7b65b15 155/* Given a bidi TYPE of a character, return its category. */
55d4c1b2 156static inline bidi_category_t
b7b65b15
EZ
157bidi_get_category (bidi_type_t type)
158{
159 switch (type)
160 {
161 case UNKNOWN_BT:
162 return UNKNOWN_BC;
163 case STRONG_L:
164 case STRONG_R:
165 case STRONG_AL:
166 case LRE:
167 case LRO:
168 case RLE:
169 case RLO:
170 return STRONG;
171 case PDF: /* ??? really?? */
172 case WEAK_EN:
173 case WEAK_ES:
174 case WEAK_ET:
175 case WEAK_AN:
176 case WEAK_CS:
177 case WEAK_NSM:
178 case WEAK_BN:
179 return WEAK;
180 case NEUTRAL_B:
181 case NEUTRAL_S:
182 case NEUTRAL_WS:
183 case NEUTRAL_ON:
184 return NEUTRAL;
185 default:
186 abort ();
187 }
188}
189
a6e8d97c
EZ
190/* Return the mirrored character of C, if it has one. If C has no
191 mirrored counterpart, return C.
192 Note: The conditions in UAX#9 clause L4 regarding the surrounding
193 context must be tested by the caller. */
b7b65b15
EZ
194int
195bidi_mirror_char (int c)
196{
cbc4fd20
EZ
197 Lisp_Object val;
198
199 if (c == BIDI_EOB)
200 return c;
201 if (c < 0 || c > MAX_CHAR)
202 abort ();
b7b65b15 203
cbc4fd20
EZ
204 val = CHAR_TABLE_REF (bidi_mirror_table, c);
205 if (INTEGERP (val))
b7b65b15 206 {
cbc4fd20 207 int v = XINT (val);
b7b65b15 208
cbc4fd20
EZ
209 if (v < 0 || v > MAX_CHAR)
210 abort ();
211
212 return v;
b7b65b15 213 }
cbc4fd20 214
b7b65b15
EZ
215 return c;
216}
217
cbb09f04
EZ
218/* Determine the start-of-run (sor) directional type given the two
219 embedding levels on either side of the run boundary. Also, update
220 the saved info about previously seen characters, since that info is
221 generally valid for a single level run. */
58b9f433 222static inline void
cbb09f04
EZ
223bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
224{
512a289d 225 int higher_level = (level_before > level_after ? level_before : level_after);
cbb09f04
EZ
226
227 /* The prev_was_pdf gork is required for when we have several PDFs
228 in a row. In that case, we want to compute the sor type for the
229 next level run only once: when we see the first PDF. That's
230 because the sor type depends only on the higher of the two levels
231 that we find on the two sides of the level boundary (see UAX#9,
232 clause X10), and so we don't need to know the final embedding
233 level to which we descend after processing all the PDFs. */
234 if (!bidi_it->prev_was_pdf || level_before < level_after)
235 /* FIXME: should the default sor direction be user selectable? */
512a289d 236 bidi_it->sor = ((higher_level & 1) != 0 ? R2L : L2R);
cbb09f04
EZ
237 if (level_before > level_after)
238 bidi_it->prev_was_pdf = 1;
239
240 bidi_it->prev.type = UNKNOWN_BT;
512a289d
RS
241 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1
242 = bidi_it->last_strong.orig_type = UNKNOWN_BT;
243 bidi_it->prev_for_neutral.type = (bidi_it->sor == R2L ? STRONG_R : STRONG_L);
cbb09f04
EZ
244 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
245 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
512a289d
RS
246 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1
247 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
cbb09f04
EZ
248 bidi_it->ignore_bn_limit = -1; /* meaning it's unknown */
249}
250
251/* Push the current embedding level and override status; reset the
252 current level to LEVEL and the current override status to OVERRIDE. */
58b9f433 253static inline void
cbb09f04
EZ
254bidi_push_embedding_level (struct bidi_it *bidi_it,
255 int level, bidi_dir_t override)
256{
257 bidi_it->stack_idx++;
7e2ad32c 258 xassert (bidi_it->stack_idx < BIDI_MAXLEVEL);
cbb09f04
EZ
259 bidi_it->level_stack[bidi_it->stack_idx].level = level;
260 bidi_it->level_stack[bidi_it->stack_idx].override = override;
261}
262
263/* Pop the embedding level and directional override status from the
264 stack, and return the new level. */
58b9f433 265static inline int
cbb09f04
EZ
266bidi_pop_embedding_level (struct bidi_it *bidi_it)
267{
268 /* UAX#9 says to ignore invalid PDFs. */
269 if (bidi_it->stack_idx > 0)
270 bidi_it->stack_idx--;
271 return bidi_it->level_stack[bidi_it->stack_idx].level;
272}
273
274/* Record in SAVED_INFO the information about the current character. */
58b9f433 275static inline void
cbb09f04
EZ
276bidi_remember_char (struct bidi_saved_info *saved_info,
277 struct bidi_it *bidi_it)
278{
279 saved_info->charpos = bidi_it->charpos;
280 saved_info->bytepos = bidi_it->bytepos;
281 saved_info->type = bidi_it->type;
282 bidi_check_type (bidi_it->type);
283 saved_info->type_after_w1 = bidi_it->type_after_w1;
284 bidi_check_type (bidi_it->type_after_w1);
285 saved_info->orig_type = bidi_it->orig_type;
286 bidi_check_type (bidi_it->orig_type);
287}
288
b7b65b15
EZ
289/* Copy the bidi iterator from FROM to TO. To save cycles, this only
290 copies the part of the level stack that is actually in use. */
55d4c1b2 291static inline void
b7b65b15
EZ
292bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
293{
294 int i;
295
e69a9370 296 /* Copy everything except the level stack and beyond. */
182ce2d2 297 memcpy (to, from, offsetof (struct bidi_it, level_stack[0]));
b7b65b15
EZ
298
299 /* Copy the active part of the level stack. */
300 to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
301 for (i = 1; i <= from->stack_idx; i++)
302 to->level_stack[i] = from->level_stack[i];
303}
304
cbb09f04
EZ
305\f
306/***********************************************************************
307 Caching the bidi iterator states
308 ***********************************************************************/
b7b65b15 309
2fe72643
EZ
310#define BIDI_CACHE_CHUNK 200
311static struct bidi_it *bidi_cache;
39e378da 312static ptrdiff_t bidi_cache_size = 0;
ad6042bb 313enum { elsz = sizeof (struct bidi_it) };
39e378da
PE
314static ptrdiff_t bidi_cache_idx; /* next unused cache slot */
315static ptrdiff_t bidi_cache_last_idx; /* slot of last cache hit */
316static ptrdiff_t bidi_cache_start = 0; /* start of cache for this
87e67904
EZ
317 "stack" level */
318
bc18e09d
PE
319/* 5-slot stack for saving the start of the previous level of the
320 cache. xdisp.c maintains a 5-slot stack for its iterator state,
321 and we need the same size of our stack. */
322static ptrdiff_t bidi_cache_start_stack[IT_STACK_SIZE];
323static int bidi_cache_sp;
324
325/* Size of header used by bidi_shelve_cache. */
326enum
327 {
512a289d
RS
328 bidi_shelve_header_size
329 = (sizeof (bidi_cache_idx) + sizeof (bidi_cache_start_stack)
330 + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start)
331 + sizeof (bidi_cache_last_idx))
bc18e09d
PE
332 };
333
87e67904
EZ
334/* Reset the cache state to the empty state. We only reset the part
335 of the cache relevant to iteration of the current object. Previous
336 objects, which are pushed on the display iterator's stack, are left
337 intact. This is called when the cached information is no more
338 useful for the current iteration, e.g. when we were reseated to a
339 new position on the same object. */
55d4c1b2 340static inline void
b7b65b15
EZ
341bidi_cache_reset (void)
342{
87e67904 343 bidi_cache_idx = bidi_cache_start;
b7b65b15
EZ
344 bidi_cache_last_idx = -1;
345}
346
87e67904
EZ
347/* Shrink the cache to its minimal size. Called when we init the bidi
348 iterator for reordering a buffer or a string that does not come
349 from display properties, because that means all the previously
350 cached info is of no further use. */
55d4c1b2 351static inline void
2fe72643
EZ
352bidi_cache_shrink (void)
353{
354 if (bidi_cache_size > BIDI_CACHE_CHUNK)
355 {
512a289d
RS
356 bidi_cache
357 = (struct bidi_it *) xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
51f30bc5 358 bidi_cache_size = BIDI_CACHE_CHUNK;
2fe72643
EZ
359 }
360 bidi_cache_reset ();
361}
362
55d4c1b2 363static inline void
39e378da 364bidi_cache_fetch_state (ptrdiff_t idx, struct bidi_it *bidi_it)
b7b65b15
EZ
365{
366 int current_scan_dir = bidi_it->scan_dir;
367
87e67904 368 if (idx < bidi_cache_start || idx >= bidi_cache_idx)
b7b65b15
EZ
369 abort ();
370
371 bidi_copy_it (bidi_it, &bidi_cache[idx]);
372 bidi_it->scan_dir = current_scan_dir;
373 bidi_cache_last_idx = idx;
374}
375
376/* Find a cached state with a given CHARPOS and resolved embedding
377 level less or equal to LEVEL. if LEVEL is -1, disregard the
378 resolved levels in cached states. DIR, if non-zero, means search
379 in that direction from the last cache hit. */
39e378da 380static inline ptrdiff_t
61bfec98 381bidi_cache_search (EMACS_INT charpos, int level, int dir)
b7b65b15 382{
39e378da 383 ptrdiff_t i, i_start;
b7b65b15 384
6eec7596 385 if (bidi_cache_idx > bidi_cache_start)
b7b65b15 386 {
6eec7596
EZ
387 if (bidi_cache_last_idx == -1)
388 bidi_cache_last_idx = bidi_cache_idx - 1;
b7b65b15 389 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
182ce2d2
EZ
390 {
391 dir = -1;
392 i_start = bidi_cache_last_idx - 1;
393 }
394 else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
395 + bidi_cache[bidi_cache_last_idx].nchars - 1))
396 {
397 dir = 1;
398 i_start = bidi_cache_last_idx + 1;
399 }
400 else if (dir)
b7b65b15
EZ
401 i_start = bidi_cache_last_idx;
402 else
403 {
404 dir = -1;
405 i_start = bidi_cache_idx - 1;
406 }
407
408 if (dir < 0)
409 {
410 /* Linear search for now; FIXME! */
87e67904 411 for (i = i_start; i >= bidi_cache_start; i--)
182ce2d2
EZ
412 if (bidi_cache[i].charpos <= charpos
413 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
414 && (level == -1 || bidi_cache[i].resolved_level <= level))
415 return i;
416 }
417 else
418 {
419 for (i = i_start; i < bidi_cache_idx; i++)
182ce2d2
EZ
420 if (bidi_cache[i].charpos <= charpos
421 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
422 && (level == -1 || bidi_cache[i].resolved_level <= level))
423 return i;
424 }
425 }
426
427 return -1;
428}
429
430/* Find a cached state where the resolved level changes to a value
431 that is lower than LEVEL, and return its cache slot index. DIR is
432 the direction to search, starting with the last used cache slot.
87e67904
EZ
433 If DIR is zero, we search backwards from the last occupied cache
434 slot. BEFORE, if non-zero, means return the index of the slot that
435 is ``before'' the level change in the search direction. That is,
b7b65b15
EZ
436 given the cached levels like this:
437
438 1122333442211
439 AB C
440
441 and assuming we are at the position cached at the slot marked with
442 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
443 index of slot B or A, depending whether BEFORE is, respectively,
444 non-zero or zero. */
39e378da 445static ptrdiff_t
b7b65b15
EZ
446bidi_cache_find_level_change (int level, int dir, int before)
447{
448 if (bidi_cache_idx)
449 {
39e378da 450 ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
b7b65b15
EZ
451 int incr = before ? 1 : 0;
452
6eec7596
EZ
453 xassert (!dir || bidi_cache_last_idx >= 0);
454
b7b65b15
EZ
455 if (!dir)
456 dir = -1;
457 else if (!incr)
458 i += dir;
459
460 if (dir < 0)
461 {
87e67904 462 while (i >= bidi_cache_start + incr)
b7b65b15
EZ
463 {
464 if (bidi_cache[i - incr].resolved_level >= 0
465 && bidi_cache[i - incr].resolved_level < level)
466 return i;
467 i--;
468 }
469 }
470 else
471 {
472 while (i < bidi_cache_idx - incr)
473 {
474 if (bidi_cache[i + incr].resolved_level >= 0
475 && bidi_cache[i + incr].resolved_level < level)
476 return i;
477 i++;
478 }
479 }
480 }
481
482 return -1;
483}
484
58b9f433 485static inline void
39e378da 486bidi_cache_ensure_space (ptrdiff_t idx)
58b9f433
EZ
487{
488 /* Enlarge the cache as needed. */
489 if (idx >= bidi_cache_size)
490 {
f0eb61e9
PE
491 /* The bidi cache cannot be larger than the largest Lisp string
492 or buffer. */
512a289d
RS
493 ptrdiff_t string_or_buffer_bound
494 = max (BUF_BYTES_MAX, STRING_BYTES_BOUND);
f0eb61e9
PE
495
496 /* Also, it cannot be larger than what C can represent. */
512a289d
RS
497 ptrdiff_t c_bound
498 = (min (PTRDIFF_MAX, SIZE_MAX) - bidi_shelve_header_size) / elsz;
f0eb61e9 499
512a289d
RS
500 bidi_cache
501 = xpalloc (bidi_cache, &bidi_cache_size,
502 max (BIDI_CACHE_CHUNK, idx - bidi_cache_size + 1),
503 min (string_or_buffer_bound, c_bound), elsz);
58b9f433
EZ
504 }
505}
506
55d4c1b2 507static inline void
b7b65b15
EZ
508bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
509{
39e378da 510 ptrdiff_t idx;
b7b65b15
EZ
511
512 /* We should never cache on backward scans. */
513 if (bidi_it->scan_dir == -1)
514 abort ();
515 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
516
517 if (idx < 0)
518 {
519 idx = bidi_cache_idx;
58b9f433 520 bidi_cache_ensure_space (idx);
bd924a5d
EZ
521 /* Character positions should correspond to cache positions 1:1.
522 If we are outside the range of cached positions, the cache is
523 useless and must be reset. */
87e67904 524 if (idx > bidi_cache_start &&
182ce2d2
EZ
525 (bidi_it->charpos > (bidi_cache[idx - 1].charpos
526 + bidi_cache[idx - 1].nchars)
a1344e7d 527 || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
bd924a5d
EZ
528 {
529 bidi_cache_reset ();
87e67904 530 idx = bidi_cache_start;
bd924a5d 531 }
102ebb00
EZ
532 if (bidi_it->nchars <= 0)
533 abort ();
b7b65b15
EZ
534 bidi_copy_it (&bidi_cache[idx], bidi_it);
535 if (!resolved)
536 bidi_cache[idx].resolved_level = -1;
537 }
538 else
539 {
540 /* Copy only the members which could have changed, to avoid
541 costly copying of the entire struct. */
542 bidi_cache[idx].type = bidi_it->type;
2d6e4628 543 bidi_check_type (bidi_it->type);
89d3374a
EZ
544 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
545 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
546 if (resolved)
547 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
548 else
549 bidi_cache[idx].resolved_level = -1;
550 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
551 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
552 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
553 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
554 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
f67cdd7f 555 bidi_cache[idx].disp_pos = bidi_it->disp_pos;
0c95fcf7 556 bidi_cache[idx].disp_prop = bidi_it->disp_prop;
b7b65b15
EZ
557 }
558
559 bidi_cache_last_idx = idx;
560 if (idx >= bidi_cache_idx)
561 bidi_cache_idx = idx + 1;
562}
563
55d4c1b2 564static inline bidi_type_t
61bfec98 565bidi_cache_find (EMACS_INT charpos, int level, struct bidi_it *bidi_it)
b7b65b15 566{
39e378da 567 ptrdiff_t i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
b7b65b15 568
87e67904 569 if (i >= bidi_cache_start)
b7b65b15
EZ
570 {
571 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
572
5009f85e 573 bidi_copy_it (bidi_it, &bidi_cache[i]);
b7b65b15
EZ
574 bidi_cache_last_idx = i;
575 /* Don't let scan direction from from the cached state override
576 the current scan direction. */
577 bidi_it->scan_dir = current_scan_dir;
578 return bidi_it->type;
579 }
580
581 return UNKNOWN_BT;
582}
583
55d4c1b2 584static inline int
b7b65b15
EZ
585bidi_peek_at_next_level (struct bidi_it *bidi_it)
586{
87e67904 587 if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
b7b65b15
EZ
588 abort ();
589 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
590}
591
cbb09f04 592\f
58b9f433
EZ
593/***********************************************************************
594 Pushing and popping the bidi iterator state
595 ***********************************************************************/
58b9f433
EZ
596
597/* Push the bidi iterator state in preparation for reordering a
598 different object, e.g. display string found at certain buffer
7e2ad32c
EZ
599 position. Pushing the bidi iterator boils down to saving its
600 entire state on the cache and starting a new cache "stacked" on top
601 of the current cache. */
58b9f433
EZ
602void
603bidi_push_it (struct bidi_it *bidi_it)
b7b65b15 604{
58b9f433
EZ
605 /* Save the current iterator state in its entirety after the last
606 used cache slot. */
607 bidi_cache_ensure_space (bidi_cache_idx);
608 memcpy (&bidi_cache[bidi_cache_idx++], bidi_it, sizeof (struct bidi_it));
be39f003 609
58b9f433 610 /* Push the current cache start onto the stack. */
7e2ad32c 611 xassert (bidi_cache_sp < IT_STACK_SIZE);
58b9f433 612 bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
be39f003 613
58b9f433
EZ
614 /* Start a new level of cache, and make it empty. */
615 bidi_cache_start = bidi_cache_idx;
616 bidi_cache_last_idx = -1;
617}
618
619/* Restore the iterator state saved by bidi_push_it and return the
620 cache to the corresponding state. */
621void
622bidi_pop_it (struct bidi_it *bidi_it)
623{
624 if (bidi_cache_start <= 0)
625 abort ();
626
627 /* Reset the next free cache slot index to what it was before the
628 call to bidi_push_it. */
629 bidi_cache_idx = bidi_cache_start - 1;
630
631 /* Restore the bidi iterator state saved in the cache. */
632 memcpy (bidi_it, &bidi_cache[bidi_cache_idx], sizeof (struct bidi_it));
633
634 /* Pop the previous cache start from the stack. */
635 if (bidi_cache_sp <= 0)
636 abort ();
637 bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
638
639 /* Invalidate the last-used cache slot data. */
640 bidi_cache_last_idx = -1;
641}
642
ec7cc85b 643static ptrdiff_t bidi_cache_total_alloc;
35928349 644
ed94e6d7
EZ
645/* Stash away a copy of the cache and its control variables. */
646void *
647bidi_shelve_cache (void)
648{
649 unsigned char *databuf;
458bfed3 650 ptrdiff_t alloc;
ed94e6d7 651
35928349 652 /* Empty cache. */
ed94e6d7
EZ
653 if (bidi_cache_idx == 0)
654 return NULL;
655
458bfed3
PE
656 alloc = (bidi_shelve_header_size
657 + bidi_cache_idx * sizeof (struct bidi_it));
658 databuf = xmalloc (alloc);
659 bidi_cache_total_alloc += alloc;
35928349 660
ed94e6d7
EZ
661 memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx));
662 memcpy (databuf + sizeof (bidi_cache_idx),
663 bidi_cache, bidi_cache_idx * sizeof (struct bidi_it));
664 memcpy (databuf + sizeof (bidi_cache_idx)
665 + bidi_cache_idx * sizeof (struct bidi_it),
666 bidi_cache_start_stack, sizeof (bidi_cache_start_stack));
667 memcpy (databuf + sizeof (bidi_cache_idx)
668 + bidi_cache_idx * sizeof (struct bidi_it)
669 + sizeof (bidi_cache_start_stack),
670 &bidi_cache_sp, sizeof (bidi_cache_sp));
671 memcpy (databuf + sizeof (bidi_cache_idx)
672 + bidi_cache_idx * sizeof (struct bidi_it)
673 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
674 &bidi_cache_start, sizeof (bidi_cache_start));
675 memcpy (databuf + sizeof (bidi_cache_idx)
676 + bidi_cache_idx * sizeof (struct bidi_it)
677 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
678 + sizeof (bidi_cache_start),
679 &bidi_cache_last_idx, sizeof (bidi_cache_last_idx));
680
681 return databuf;
682}
683
d1410150
EZ
684/* Restore the cache state from a copy stashed away by
685 bidi_shelve_cache, and free the buffer used to stash that copy.
686 JUST_FREE non-zero means free the buffer, but don't restore the
687 cache; used when the corresponding iterator is discarded instead of
688 being restored. */
ed94e6d7 689void
35928349 690bidi_unshelve_cache (void *databuf, int just_free)
ed94e6d7
EZ
691{
692 unsigned char *p = databuf;
693
694 if (!p)
be39f003 695 {
d1410150
EZ
696 if (!just_free)
697 {
698 /* A NULL pointer means an empty cache. */
699 bidi_cache_start = 0;
700 bidi_cache_sp = 0;
701 bidi_cache_reset ();
702 }
be39f003 703 }
ed94e6d7
EZ
704 else
705 {
35928349
EZ
706 if (just_free)
707 {
708 ptrdiff_t idx;
709
710 memcpy (&idx, p, sizeof (bidi_cache_idx));
512a289d
RS
711 bidi_cache_total_alloc
712 -= bidi_shelve_header_size + idx * sizeof (struct bidi_it);
35928349
EZ
713 }
714 else
715 {
716 memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx));
717 bidi_cache_ensure_space (bidi_cache_idx);
718 memcpy (bidi_cache, p + sizeof (bidi_cache_idx),
719 bidi_cache_idx * sizeof (struct bidi_it));
720 memcpy (bidi_cache_start_stack,
721 p + sizeof (bidi_cache_idx)
722 + bidi_cache_idx * sizeof (struct bidi_it),
723 sizeof (bidi_cache_start_stack));
724 memcpy (&bidi_cache_sp,
725 p + sizeof (bidi_cache_idx)
726 + bidi_cache_idx * sizeof (struct bidi_it)
727 + sizeof (bidi_cache_start_stack),
728 sizeof (bidi_cache_sp));
729 memcpy (&bidi_cache_start,
730 p + sizeof (bidi_cache_idx)
731 + bidi_cache_idx * sizeof (struct bidi_it)
732 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
733 sizeof (bidi_cache_start));
734 memcpy (&bidi_cache_last_idx,
735 p + sizeof (bidi_cache_idx)
736 + bidi_cache_idx * sizeof (struct bidi_it)
737 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
738 + sizeof (bidi_cache_start),
739 sizeof (bidi_cache_last_idx));
512a289d
RS
740 bidi_cache_total_alloc
741 -= (bidi_shelve_header_size
742 + bidi_cache_idx * sizeof (struct bidi_it));
35928349 743 }
ed94e6d7
EZ
744
745 xfree (p);
746 }
747}
b7b65b15 748
58b9f433 749\f
cbb09f04
EZ
750/***********************************************************************
751 Initialization
752 ***********************************************************************/
753static void
754bidi_initialize (void)
b7b65b15 755{
474a8465
EZ
756 bidi_type_table = uniprop_table (intern ("bidi-class"));
757 if (NILP (bidi_type_table))
758 abort ();
cbb09f04
EZ
759 staticpro (&bidi_type_table);
760
474a8465
EZ
761 bidi_mirror_table = uniprop_table (intern ("mirroring"));
762 if (NILP (bidi_mirror_table))
763 abort ();
cbb09f04
EZ
764 staticpro (&bidi_mirror_table);
765
cbb09f04
EZ
766 Qparagraph_start = intern ("paragraph-start");
767 staticpro (&Qparagraph_start);
768 paragraph_start_re = Fsymbol_value (Qparagraph_start);
769 if (!STRINGP (paragraph_start_re))
770 paragraph_start_re = build_string ("\f\\|[ \t]*$");
771 staticpro (&paragraph_start_re);
772 Qparagraph_separate = intern ("paragraph-separate");
773 staticpro (&Qparagraph_separate);
774 paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
775 if (!STRINGP (paragraph_separate_re))
776 paragraph_separate_re = build_string ("[ \t\f]*$");
777 staticpro (&paragraph_separate_re);
58b9f433
EZ
778
779 bidi_cache_sp = 0;
ec7cc85b 780 bidi_cache_total_alloc = 0;
58b9f433 781
cbb09f04 782 bidi_initialized = 1;
b7b65b15
EZ
783}
784
cbb09f04
EZ
785/* Do whatever UAX#9 clause X8 says should be done at paragraph's
786 end. */
55d4c1b2 787static inline void
cbb09f04 788bidi_set_paragraph_end (struct bidi_it *bidi_it)
b7b65b15 789{
cbb09f04
EZ
790 bidi_it->invalid_levels = 0;
791 bidi_it->invalid_rl_levels = -1;
792 bidi_it->stack_idx = 0;
793 bidi_it->resolved_level = bidi_it->level_stack[0].level;
794}
b7b65b15 795
cbb09f04
EZ
796/* Initialize the bidi iterator from buffer/string position CHARPOS. */
797void
798bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, int frame_window_p,
799 struct bidi_it *bidi_it)
800{
801 if (! bidi_initialized)
802 bidi_initialize ();
803 if (charpos >= 0)
804 bidi_it->charpos = charpos;
805 if (bytepos >= 0)
806 bidi_it->bytepos = bytepos;
807 bidi_it->frame_window_p = frame_window_p;
808 bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */
809 bidi_it->first_elt = 1;
810 bidi_set_paragraph_end (bidi_it);
811 bidi_it->new_paragraph = 1;
812 bidi_it->separator_limit = -1;
813 bidi_it->type = NEUTRAL_B;
814 bidi_it->type_after_w1 = NEUTRAL_B;
815 bidi_it->orig_type = NEUTRAL_B;
816 bidi_it->prev_was_pdf = 0;
512a289d
RS
817 bidi_it->prev.type = bidi_it->prev.type_after_w1
818 = bidi_it->prev.orig_type = UNKNOWN_BT;
819 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1
820 = bidi_it->last_strong.orig_type = UNKNOWN_BT;
cbb09f04 821 bidi_it->next_for_neutral.charpos = -1;
512a289d
RS
822 bidi_it->next_for_neutral.type
823 = bidi_it->next_for_neutral.type_after_w1
824 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
cbb09f04 825 bidi_it->prev_for_neutral.charpos = -1;
512a289d
RS
826 bidi_it->prev_for_neutral.type
827 = bidi_it->prev_for_neutral.type_after_w1
828 = bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
cbb09f04
EZ
829 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
830 bidi_it->disp_pos = -1; /* invalid/unknown */
0c95fcf7 831 bidi_it->disp_prop = 0;
cbb09f04
EZ
832 /* We can only shrink the cache if we are at the bottom level of its
833 "stack". */
834 if (bidi_cache_start == 0)
835 bidi_cache_shrink ();
58b9f433
EZ
836 else
837 bidi_cache_reset ();
b7b65b15
EZ
838}
839
182ce2d2 840/* Perform initializations for reordering a new line of bidi text. */
6bff6497 841static void
be39f003
EZ
842bidi_line_init (struct bidi_it *bidi_it)
843{
844 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
845 bidi_it->resolved_level = bidi_it->level_stack[0].level;
846 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
847 bidi_it->invalid_levels = 0;
848 bidi_it->invalid_rl_levels = -1;
849 bidi_it->next_en_pos = -1;
850 bidi_it->next_for_ws.type = UNKNOWN_BT;
b44d9321 851 bidi_set_sor_type (bidi_it,
512a289d 852 (bidi_it->paragraph_dir == R2L ? 1 : 0),
be39f003
EZ
853 bidi_it->level_stack[0].level); /* X10 */
854
855 bidi_cache_reset ();
856}
857
cbb09f04
EZ
858\f
859/***********************************************************************
860 Fetching characters
861 ***********************************************************************/
862
f3014ef5
EZ
863/* Count bytes in string S between BEG/BEGBYTE and END. BEG and END
864 are zero-based character positions in S, BEGBYTE is byte position
865 corresponding to BEG. UNIBYTE, if non-zero, means S is a unibyte
866 string. */
87e67904
EZ
867static inline EMACS_INT
868bidi_count_bytes (const unsigned char *s, const EMACS_INT beg,
f3014ef5 869 const EMACS_INT begbyte, const EMACS_INT end, int unibyte)
87e67904
EZ
870{
871 EMACS_INT pos = beg;
872 const unsigned char *p = s + begbyte, *start = p;
873
f3014ef5
EZ
874 if (unibyte)
875 p = s + end;
876 else
87e67904 877 {
f3014ef5
EZ
878 if (!CHAR_HEAD_P (*p))
879 abort ();
880
881 while (pos < end)
882 {
883 p += BYTES_BY_CHAR_HEAD (*p);
884 pos++;
885 }
87e67904
EZ
886 }
887
888 return p - start;
889}
890
891/* Fetch and returns the character at byte position BYTEPOS. If S is
892 non-NULL, fetch the character from string S; otherwise fetch the
f3014ef5
EZ
893 character from the current buffer. UNIBYTE non-zero means S is a
894 unibyte string. */
87e67904 895static inline int
f3014ef5 896bidi_char_at_pos (EMACS_INT bytepos, const unsigned char *s, int unibyte)
87e67904
EZ
897{
898 if (s)
f3014ef5
EZ
899 {
900 if (unibyte)
901 return s[bytepos];
902 else
903 return STRING_CHAR (s + bytepos);
904 }
87e67904
EZ
905 else
906 return FETCH_MULTIBYTE_CHAR (bytepos);
907}
908
102ebb00
EZ
909/* Fetch and return the character at BYTEPOS/CHARPOS. If that
910 character is covered by a display string, treat the entire run of
0c95fcf7
EZ
911 covered characters as a single character, either u+2029 or u+FFFC,
912 and return their combined length in CH_LEN and NCHARS. DISP_POS
913 specifies the character position of the next display string, or -1
b75258b3
EZ
914 if not yet computed. When the next character is at or beyond that
915 position, the function updates DISP_POS with the position of the
916 next display string. DISP_PROP non-zero means that there's really
0c95fcf7
EZ
917 a display string at DISP_POS, as opposed to when we searched till
918 DISP_POS without finding one. If DISP_PROP is 2, it means the
919 display spec is of the form `(space ...)', which is replaced with
b75258b3
EZ
920 u+2029 to handle it as a paragraph separator. STRING->s is the C
921 string to iterate, or NULL if iterating over a buffer or a Lisp
922 string; in the latter case, STRING->lstring is the Lisp string. */
fec2107c 923static inline int
102ebb00 924bidi_fetch_char (EMACS_INT bytepos, EMACS_INT charpos, EMACS_INT *disp_pos,
0c95fcf7 925 int *disp_prop, struct bidi_string_data *string,
fc6f18ce 926 int frame_window_p, EMACS_INT *ch_len, EMACS_INT *nchars)
182ce2d2
EZ
927{
928 int ch;
512a289d
RS
929 EMACS_INT endpos
930 = (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
6db161be 931 struct text_pos pos;
182ce2d2 932
182ce2d2 933 /* If we got past the last known position of display string, compute
87e67904
EZ
934 the position of the next one. That position could be at CHARPOS. */
935 if (charpos < endpos && charpos > *disp_pos)
6db161be
EZ
936 {
937 SET_TEXT_POS (pos, charpos, bytepos);
55439c61 938 *disp_pos = compute_display_string_pos (&pos, string, frame_window_p,
0c95fcf7 939 disp_prop);
6db161be 940 }
102ebb00
EZ
941
942 /* Fetch the character at BYTEPOS. */
87e67904 943 if (charpos >= endpos)
182ce2d2
EZ
944 {
945 ch = BIDI_EOB;
946 *ch_len = 1;
947 *nchars = 1;
87e67904 948 *disp_pos = endpos;
0c95fcf7 949 *disp_prop = 0;
182ce2d2 950 }
0c95fcf7 951 else if (charpos >= *disp_pos && *disp_prop)
182ce2d2 952 {
7b600102
EZ
953 EMACS_INT disp_end_pos;
954
955 /* We don't expect to find ourselves in the middle of a display
956 property. Hopefully, it will never be needed. */
957 if (charpos > *disp_pos)
958 abort ();
0c95fcf7
EZ
959 /* Text covered by `display' properties and overlays with
960 display properties or display strings is handled as a single
961 character that represents the entire run of characters
962 covered by the display property. */
963 if (*disp_prop == 2)
964 {
965 /* `(space ...)' display specs are handled as paragraph
966 separators for the purposes of the reordering; see UAX#9
967 section 3 and clause HL1 in section 4.3 there. */
968 ch = 0x2029;
969 }
970 else
971 {
972 /* All other display specs are handled as the Unicode Object
973 Replacement Character. */
974 ch = 0xFFFC;
975 }
87e67904 976 disp_end_pos = compute_display_string_end (*disp_pos, string);
fbcaa2f3
EZ
977 if (disp_end_pos < 0)
978 {
979 /* Somebody removed the display string from the buffer
980 behind our back. Recover by processing this buffer
981 position as if no display property were present there to
982 begin with. */
983 *disp_prop = 0;
984 goto normal_char;
985 }
7b600102 986 *nchars = disp_end_pos - *disp_pos;
f3014ef5
EZ
987 if (*nchars <= 0)
988 abort ();
87e67904
EZ
989 if (string->s)
990 *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
f3014ef5 991 disp_end_pos, string->unibyte);
9f257352
EZ
992 else if (STRINGP (string->lstring))
993 *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
f3014ef5 994 bytepos, disp_end_pos, string->unibyte);
87e67904
EZ
995 else
996 *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
182ce2d2 997 }
182ce2d2
EZ
998 else
999 {
fbcaa2f3 1000 normal_char:
87e67904
EZ
1001 if (string->s)
1002 {
66ff7ddf 1003 int len;
87e67904 1004
f3014ef5
EZ
1005 if (!string->unibyte)
1006 {
1007 ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
1008 *ch_len = len;
1009 }
1010 else
1011 {
1012 ch = UNIBYTE_TO_CHAR (string->s[bytepos]);
1013 *ch_len = 1;
1014 }
87e67904 1015 }
9f257352
EZ
1016 else if (STRINGP (string->lstring))
1017 {
66ff7ddf 1018 int len;
9f257352 1019
f3014ef5
EZ
1020 if (!string->unibyte)
1021 {
1022 ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
1023 len);
1024 *ch_len = len;
1025 }
1026 else
1027 {
1028 ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos));
1029 *ch_len = 1;
1030 }
9f257352 1031 }
87e67904
EZ
1032 else
1033 {
1034 ch = FETCH_MULTIBYTE_CHAR (bytepos);
1035 *ch_len = CHAR_BYTES (ch);
1036 }
182ce2d2
EZ
1037 *nchars = 1;
1038 }
1039
1040 /* If we just entered a run of characters covered by a display
1041 string, compute the position of the next display string. */
55439c61 1042 if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos
0c95fcf7 1043 && *disp_prop)
6db161be
EZ
1044 {
1045 SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
55439c61 1046 *disp_pos = compute_display_string_pos (&pos, string, frame_window_p,
0c95fcf7 1047 disp_prop);
6db161be 1048 }
182ce2d2
EZ
1049
1050 return ch;
1051}
1052
cbb09f04
EZ
1053\f
1054/***********************************************************************
1055 Determining paragraph direction
1056 ***********************************************************************/
1057
1058/* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
1059 Value is the non-negative length of the paragraph separator
1060 following the buffer position, -1 if position is at the beginning
1061 of a new paragraph, or -2 if position is neither at beginning nor
1062 at end of a paragraph. */
1063static EMACS_INT
1064bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
1065{
1066 Lisp_Object sep_re;
1067 Lisp_Object start_re;
1068 EMACS_INT val;
1069
1070 sep_re = paragraph_separate_re;
1071 start_re = paragraph_start_re;
1072
1073 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
1074 if (val < 0)
1075 {
1076 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
1077 val = -1;
1078 else
1079 val = -2;
1080 }
1081
1082 return val;
1083}
1084
1137e8b8
EZ
1085/* On my 2005-vintage machine, searching back for paragraph start
1086 takes ~1 ms per line. And bidi_paragraph_init is called 4 times
1087 when user types C-p. The number below limits each call to
1088 bidi_paragraph_init to about 10 ms. */
1089#define MAX_PARAGRAPH_SEARCH 7500
1090
be39f003 1091/* Find the beginning of this paragraph by looking back in the buffer.
1137e8b8
EZ
1092 Value is the byte position of the paragraph's beginning, or
1093 BEGV_BYTE if paragraph_start_re is still not found after looking
1094 back MAX_PARAGRAPH_SEARCH lines in the buffer. */
be39f003 1095static EMACS_INT
b44d9321 1096bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
6bff6497 1097{
372b7a95 1098 Lisp_Object re = paragraph_start_re;
6bff6497 1099 EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
1137e8b8 1100 EMACS_INT n = 0;
6bff6497 1101
6bff6497 1102 while (pos_byte > BEGV_BYTE
1137e8b8 1103 && n++ < MAX_PARAGRAPH_SEARCH
6bff6497
EZ
1104 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
1105 {
182ce2d2
EZ
1106 /* FIXME: What if the paragraph beginning is covered by a
1107 display string? And what if a display string covering some
1108 of the text over which we scan back includes
1109 paragraph_start_re? */
be39f003
EZ
1110 pos = find_next_newline_no_quit (pos - 1, -1);
1111 pos_byte = CHAR_TO_BYTE (pos);
6bff6497 1112 }
1137e8b8
EZ
1113 if (n >= MAX_PARAGRAPH_SEARCH)
1114 pos_byte = BEGV_BYTE;
be39f003 1115 return pos_byte;
6bff6497
EZ
1116}
1117
bea4f10c 1118/* Determine the base direction, a.k.a. base embedding level, of the
b44d9321
EZ
1119 paragraph we are about to iterate through. If DIR is either L2R or
1120 R2L, just use that. Otherwise, determine the paragraph direction
bea4f10c
EZ
1121 from the first strong directional character of the paragraph.
1122
182ce2d2 1123 NO_DEFAULT_P non-zero means don't default to L2R if the paragraph
bea4f10c
EZ
1124 has no strong directional characters and both DIR and
1125 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
1126 in the buffer until a paragraph is found with a strong character,
1127 or until hitting BEGV. In the latter case, fall back to L2R. This
1128 flag is used in current-bidi-paragraph-direction.
1129
1130 Note that this function gives the paragraph separator the same
1131 direction as the preceding paragraph, even though Emacs generally
1132 views the separartor as not belonging to any paragraph. */
b7b65b15 1133void
bea4f10c 1134bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p)
b7b65b15 1135{
6bff6497 1136 EMACS_INT bytepos = bidi_it->bytepos;
9f257352 1137 int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
bea4f10c 1138 EMACS_INT pstartbyte;
87e67904
EZ
1139 /* Note that begbyte is a byte position, while end is a character
1140 position. Yes, this is ugly, but we are trying to avoid costly
1141 calls to BYTE_TO_CHAR and its ilk. */
1142 EMACS_INT begbyte = string_p ? 0 : BEGV_BYTE;
1143 EMACS_INT end = string_p ? bidi_it->string.schars : ZV;
e7402cb2 1144
5e65aec0 1145 /* Special case for an empty buffer. */
87e67904 1146 if (bytepos == begbyte && bidi_it->charpos == end)
5e65aec0 1147 dir = L2R;
9c82e145 1148 /* We should never be called at EOB or before BEGV. */
87e67904 1149 else if (bidi_it->charpos >= end || bytepos < begbyte)
9c82e145
EZ
1150 abort ();
1151
be39f003
EZ
1152 if (dir == L2R)
1153 {
1154 bidi_it->paragraph_dir = L2R;
1155 bidi_it->new_paragraph = 0;
1156 }
1157 else if (dir == R2L)
1158 {
1159 bidi_it->paragraph_dir = R2L;
1160 bidi_it->new_paragraph = 0;
1161 }
b7b65b15
EZ
1162 else if (dir == NEUTRAL_DIR) /* P2 */
1163 {
182ce2d2
EZ
1164 int ch;
1165 EMACS_INT ch_len, nchars;
1166 EMACS_INT pos, disp_pos = -1;
0c95fcf7 1167 int disp_prop = 0;
6bff6497 1168 bidi_type_t type;
9f257352 1169 const unsigned char *s;
be39f003 1170
d20e1419
EZ
1171 if (!bidi_initialized)
1172 bidi_initialize ();
1173
be39f003
EZ
1174 /* If we are inside a paragraph separator, we are just waiting
1175 for the separator to be exhausted; use the previous paragraph
e5a2fec7
EZ
1176 direction. But don't do that if we have been just reseated,
1177 because we need to reinitialize below in that case. */
1178 if (!bidi_it->first_elt
1179 && bidi_it->charpos < bidi_it->separator_limit)
be39f003
EZ
1180 return;
1181
b44d9321 1182 /* If we are on a newline, get past it to where the next
5e65aec0
EZ
1183 paragraph might start. But don't do that at BEGV since then
1184 we are potentially in a new paragraph that doesn't yet
1185 exist. */
c143c213 1186 pos = bidi_it->charpos;
512a289d
RS
1187 s = (STRINGP (bidi_it->string.lstring)
1188 ? SDATA (bidi_it->string.lstring)
1189 : bidi_it->string.s);
f3014ef5
EZ
1190 if (bytepos > begbyte
1191 && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n')
be39f003 1192 {
b44d9321 1193 bytepos++;
c143c213 1194 pos++;
be39f003 1195 }
b44d9321
EZ
1196
1197 /* We are either at the beginning of a paragraph or in the
1198 middle of it. Find where this paragraph starts. */
87e67904
EZ
1199 if (string_p)
1200 {
1201 /* We don't support changes of paragraph direction inside a
1202 string. It is treated as a single paragraph. */
1203 pstartbyte = 0;
1204 }
1205 else
1206 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
be39f003
EZ
1207 bidi_it->separator_limit = -1;
1208 bidi_it->new_paragraph = 0;
bea4f10c
EZ
1209
1210 /* The following loop is run more than once only if NO_DEFAULT_P
87e67904 1211 is non-zero, and only if we are iterating on a buffer. */
bea4f10c
EZ
1212 do {
1213 bytepos = pstartbyte;
87e67904
EZ
1214 if (!string_p)
1215 pos = BYTE_TO_CHAR (bytepos);
0c95fcf7 1216 ch = bidi_fetch_char (bytepos, pos, &disp_pos, &disp_prop,
55439c61 1217 &bidi_it->string,
87e67904 1218 bidi_it->frame_window_p, &ch_len, &nchars);
bea4f10c
EZ
1219 type = bidi_get_type (ch, NEUTRAL_DIR);
1220
182ce2d2 1221 for (pos += nchars, bytepos += ch_len;
bea4f10c
EZ
1222 (bidi_get_category (type) != STRONG)
1223 || (bidi_ignore_explicit_marks_for_paragraph_level
1224 && (type == RLE || type == RLO
1225 || type == LRE || type == LRO));
1226 type = bidi_get_type (ch, NEUTRAL_DIR))
1227 {
87e67904 1228 if (pos >= end)
bea4f10c
EZ
1229 {
1230 /* Pretend there's a paragraph separator at end of
87e67904 1231 buffer/string. */
bea4f10c
EZ
1232 type = NEUTRAL_B;
1233 break;
1234 }
0bb23927
EZ
1235 if (!string_p
1236 && type == NEUTRAL_B
1237 && bidi_at_paragraph_end (pos, bytepos) >= -1)
cf99dcf8 1238 break;
102ebb00 1239 /* Fetch next character and advance to get past it. */
55439c61 1240 ch = bidi_fetch_char (bytepos, pos, &disp_pos,
0c95fcf7 1241 &disp_prop, &bidi_it->string,
fc6f18ce 1242 bidi_it->frame_window_p, &ch_len, &nchars);
182ce2d2
EZ
1243 pos += nchars;
1244 bytepos += ch_len;
bea4f10c 1245 }
32413314
EZ
1246 if ((type == STRONG_R || type == STRONG_AL) /* P3 */
1247 || (!bidi_ignore_explicit_marks_for_paragraph_level
1248 && (type == RLO || type == RLE)))
bea4f10c 1249 bidi_it->paragraph_dir = R2L;
32413314
EZ
1250 else if (type == STRONG_L
1251 || (!bidi_ignore_explicit_marks_for_paragraph_level
1252 && (type == LRO || type == LRE)))
bea4f10c 1253 bidi_it->paragraph_dir = L2R;
87e67904
EZ
1254 if (!string_p
1255 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
bea4f10c
EZ
1256 {
1257 /* If this paragraph is at BEGV, default to L2R. */
1258 if (pstartbyte == BEGV_BYTE)
1259 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1260 else
1261 {
1262 EMACS_INT prevpbyte = pstartbyte;
1263 EMACS_INT p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
1264
1265 /* Find the beginning of the previous paragraph, if any. */
1266 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1267 {
182ce2d2
EZ
1268 /* FXIME: What if p is covered by a display
1269 string? See also a FIXME inside
1270 bidi_find_paragraph_start. */
bea4f10c
EZ
1271 p--;
1272 pbyte = CHAR_TO_BYTE (p);
1273 prevpbyte = bidi_find_paragraph_start (p, pbyte);
1274 }
1275 pstartbyte = prevpbyte;
1276 }
1277 }
87e67904
EZ
1278 } while (!string_p
1279 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
b7b65b15 1280 }
be39f003
EZ
1281 else
1282 abort ();
b7b65b15 1283
b44d9321
EZ
1284 /* Contrary to UAX#9 clause P3, we only default the paragraph
1285 direction to L2R if we have no previous usable paragraph
bea4f10c 1286 direction. This is allowed by the HL1 clause. */
d20e1419 1287 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
bea4f10c 1288 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
be39f003 1289 if (bidi_it->paragraph_dir == R2L)
b44d9321 1290 bidi_it->level_stack[0].level = 1;
be39f003 1291 else
b44d9321 1292 bidi_it->level_stack[0].level = 0;
be39f003
EZ
1293
1294 bidi_line_init (bidi_it);
b7b65b15
EZ
1295}
1296
cbb09f04
EZ
1297\f
1298/***********************************************************************
1299 Resolving explicit and implicit levels.
58b9f433 1300 The rest of this file constitutes the core of the UBA implementation.
cbb09f04 1301 ***********************************************************************/
b7b65b15 1302
55d4c1b2 1303static inline int
182ce2d2 1304bidi_explicit_dir_char (int ch)
b7b65b15 1305{
182ce2d2
EZ
1306 bidi_type_t ch_type;
1307
1308 if (!bidi_initialized)
1309 abort ();
1310 ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1311 return (ch_type == LRE || ch_type == LRO
1312 || ch_type == RLE || ch_type == RLO
1313 || ch_type == PDF);
b7b65b15
EZ
1314}
1315
1316/* A helper function for bidi_resolve_explicit. It advances to the
1317 next character in logical order and determines the new embedding
1318 level and directional override, but does not take into account
1319 empty embeddings. */
1320static int
1321bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1322{
1323 int curchar;
1324 bidi_type_t type;
1325 int current_level;
1326 int new_level;
1327 bidi_dir_t override;
9f257352 1328 int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
b7b65b15 1329
182ce2d2
EZ
1330 /* If reseat()'ed, don't advance, so as to start iteration from the
1331 position where we were reseated. bidi_it->bytepos can be less
1332 than BEGV_BYTE after reseat to BEGV. */
87e67904 1333 if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
9c82e145 1334 || bidi_it->first_elt)
e7402cb2 1335 {
9c82e145 1336 bidi_it->first_elt = 0;
87e67904
EZ
1337 if (string_p)
1338 {
512a289d
RS
1339 const unsigned char *p
1340 = (STRINGP (bidi_it->string.lstring)
1341 ? SDATA (bidi_it->string.lstring)
1342 : bidi_it->string.s);
9f257352 1343
87e67904
EZ
1344 if (bidi_it->charpos < 0)
1345 bidi_it->charpos = 0;
f3014ef5
EZ
1346 bidi_it->bytepos = bidi_count_bytes (p, 0, 0, bidi_it->charpos,
1347 bidi_it->string.unibyte);
87e67904
EZ
1348 }
1349 else
1350 {
1351 if (bidi_it->charpos < BEGV)
1352 bidi_it->charpos = BEGV;
1353 bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
1354 }
e7402cb2 1355 }
87e67904
EZ
1356 /* Don't move at end of buffer/string. */
1357 else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
b7b65b15 1358 {
182ce2d2
EZ
1359 /* Advance to the next character, skipping characters covered by
1360 display strings (nchars > 1). */
102ebb00
EZ
1361 if (bidi_it->nchars <= 0)
1362 abort ();
182ce2d2 1363 bidi_it->charpos += bidi_it->nchars;
e7402cb2
EZ
1364 if (bidi_it->ch_len == 0)
1365 abort ();
b7b65b15
EZ
1366 bidi_it->bytepos += bidi_it->ch_len;
1367 }
1368
1369 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1370 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1371 new_level = current_level;
1372
87e67904 1373 if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
e7402cb2
EZ
1374 {
1375 curchar = BIDI_EOB;
1376 bidi_it->ch_len = 1;
182ce2d2 1377 bidi_it->nchars = 1;
87e67904 1378 bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
0c95fcf7 1379 bidi_it->disp_prop = 0;
e7402cb2
EZ
1380 }
1381 else
1382 {
182ce2d2
EZ
1383 /* Fetch the character at BYTEPOS. If it is covered by a
1384 display string, treat the entire run of covered characters as
1385 a single character u+FFFC. */
102ebb00 1386 curchar = bidi_fetch_char (bidi_it->bytepos, bidi_it->charpos,
0c95fcf7 1387 &bidi_it->disp_pos, &bidi_it->disp_prop,
55439c61 1388 &bidi_it->string, bidi_it->frame_window_p,
102ebb00 1389 &bidi_it->ch_len, &bidi_it->nchars);
e7402cb2 1390 }
b7b65b15 1391 bidi_it->ch = curchar;
b7b65b15 1392
6bff6497
EZ
1393 /* Don't apply directional override here, as all the types we handle
1394 below will not be affected by the override anyway, and we need
1395 the original type unaltered. The override will be applied in
1396 bidi_resolve_weak. */
1397 type = bidi_get_type (curchar, NEUTRAL_DIR);
89d3374a
EZ
1398 bidi_it->orig_type = type;
1399 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
1400
1401 if (type != PDF)
1402 bidi_it->prev_was_pdf = 0;
1403
89d3374a 1404 bidi_it->type_after_w1 = UNKNOWN_BT;
b7b65b15
EZ
1405
1406 switch (type)
1407 {
1408 case RLE: /* X2 */
1409 case RLO: /* X4 */
89d3374a
EZ
1410 bidi_it->type_after_w1 = type;
1411 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1412 type = WEAK_BN; /* X9/Retaining */
87e67904 1413 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1414 {
1415 if (current_level <= BIDI_MAXLEVEL - 4)
1416 {
1417 /* Compute the least odd embedding level greater than
1418 the current level. */
1419 new_level = ((current_level + 1) & ~1) + 1;
89d3374a 1420 if (bidi_it->type_after_w1 == RLE)
b7b65b15
EZ
1421 override = NEUTRAL_DIR;
1422 else
1423 override = R2L;
1424 if (current_level == BIDI_MAXLEVEL - 4)
1425 bidi_it->invalid_rl_levels = 0;
1426 bidi_push_embedding_level (bidi_it, new_level, override);
1427 }
1428 else
1429 {
1430 bidi_it->invalid_levels++;
1431 /* See the commentary about invalid_rl_levels below. */
1432 if (bidi_it->invalid_rl_levels < 0)
1433 bidi_it->invalid_rl_levels = 0;
1434 bidi_it->invalid_rl_levels++;
1435 }
1436 }
89d3374a 1437 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1438 || bidi_it->next_en_pos > bidi_it->charpos)
1439 type = WEAK_EN;
1440 break;
1441 case LRE: /* X3 */
1442 case LRO: /* X5 */
89d3374a
EZ
1443 bidi_it->type_after_w1 = type;
1444 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1445 type = WEAK_BN; /* X9/Retaining */
87e67904 1446 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1447 {
1448 if (current_level <= BIDI_MAXLEVEL - 5)
1449 {
1450 /* Compute the least even embedding level greater than
1451 the current level. */
1452 new_level = ((current_level + 2) & ~1);
89d3374a 1453 if (bidi_it->type_after_w1 == LRE)
b7b65b15
EZ
1454 override = NEUTRAL_DIR;
1455 else
1456 override = L2R;
1457 bidi_push_embedding_level (bidi_it, new_level, override);
1458 }
1459 else
1460 {
1461 bidi_it->invalid_levels++;
1462 /* invalid_rl_levels counts invalid levels encountered
1463 while the embedding level was already too high for
1464 LRE/LRO, but not for RLE/RLO. That is because
1465 there may be exactly one PDF which we should not
1466 ignore even though invalid_levels is non-zero.
1467 invalid_rl_levels helps to know what PDF is
1468 that. */
1469 if (bidi_it->invalid_rl_levels >= 0)
1470 bidi_it->invalid_rl_levels++;
1471 }
1472 }
89d3374a 1473 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1474 || bidi_it->next_en_pos > bidi_it->charpos)
1475 type = WEAK_EN;
1476 break;
1477 case PDF: /* X7 */
89d3374a
EZ
1478 bidi_it->type_after_w1 = type;
1479 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1480 type = WEAK_BN; /* X9/Retaining */
87e67904 1481 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1482 {
1483 if (!bidi_it->invalid_rl_levels)
1484 {
1485 new_level = bidi_pop_embedding_level (bidi_it);
1486 bidi_it->invalid_rl_levels = -1;
1487 if (bidi_it->invalid_levels)
1488 bidi_it->invalid_levels--;
1489 /* else nothing: UAX#9 says to ignore invalid PDFs */
1490 }
1491 if (!bidi_it->invalid_levels)
1492 new_level = bidi_pop_embedding_level (bidi_it);
1493 else
1494 {
1495 bidi_it->invalid_levels--;
1496 bidi_it->invalid_rl_levels--;
1497 }
1498 }
89d3374a 1499 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1500 || bidi_it->next_en_pos > bidi_it->charpos)
1501 type = WEAK_EN;
1502 break;
1503 default:
1504 /* Nothing. */
1505 break;
1506 }
1507
1508 bidi_it->type = type;
2d6e4628 1509 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1510
1511 return new_level;
1512}
1513
1514/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1515 in the buffer/string to the next character (in the logical order),
1516 resolve any explicit embeddings and directional overrides, and
1517 return the embedding level of the character after resolving
1518 explicit directives and ignoring empty embeddings. */
b7b65b15
EZ
1519static int
1520bidi_resolve_explicit (struct bidi_it *bidi_it)
1521{
1522 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1523 int new_level = bidi_resolve_explicit_1 (bidi_it);
87e67904 1524 EMACS_INT eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
512a289d
RS
1525 const unsigned char *s
1526 = (STRINGP (bidi_it->string.lstring)
1527 ? SDATA (bidi_it->string.lstring)
1528 : bidi_it->string.s);
b7b65b15
EZ
1529
1530 if (prev_level < new_level
1531 && bidi_it->type == WEAK_BN
87e67904
EZ
1532 && bidi_it->ignore_bn_limit == -1 /* only if not already known */
1533 && bidi_it->charpos < eob /* not already at EOB */
1534 && bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
f3014ef5
EZ
1535 + bidi_it->ch_len, s,
1536 bidi_it->string.unibyte)))
b7b65b15
EZ
1537 {
1538 /* Avoid pushing and popping embedding levels if the level run
1539 is empty, as this breaks level runs where it shouldn't.
1540 UAX#9 removes all the explicit embedding and override codes,
1541 so empty embeddings disappear without a trace. We need to
1542 behave as if we did the same. */
1543 struct bidi_it saved_it;
1544 int level = prev_level;
1545
1546 bidi_copy_it (&saved_it, bidi_it);
1547
87e67904 1548 while (bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
f3014ef5
EZ
1549 + bidi_it->ch_len, s,
1550 bidi_it->string.unibyte)))
b7b65b15 1551 {
182ce2d2
EZ
1552 /* This advances to the next character, skipping any
1553 characters covered by display strings. */
b7b65b15 1554 level = bidi_resolve_explicit_1 (bidi_it);
9f257352
EZ
1555 /* If string.lstring was relocated inside bidi_resolve_explicit_1,
1556 a pointer to its data is no longer valid. */
1557 if (STRINGP (bidi_it->string.lstring))
1558 s = SDATA (bidi_it->string.lstring);
b7b65b15
EZ
1559 }
1560
102ebb00
EZ
1561 if (bidi_it->nchars <= 0)
1562 abort ();
b7b65b15 1563 if (level == prev_level) /* empty embedding */
182ce2d2 1564 saved_it.ignore_bn_limit = bidi_it->charpos + bidi_it->nchars;
b7b65b15 1565 else /* this embedding is non-empty */
87e67904 1566 saved_it.ignore_bn_limit = -2;
b7b65b15
EZ
1567
1568 bidi_copy_it (bidi_it, &saved_it);
87e67904 1569 if (bidi_it->ignore_bn_limit > -1)
b7b65b15
EZ
1570 {
1571 /* We pushed a level, but we shouldn't have. Undo that. */
1572 if (!bidi_it->invalid_rl_levels)
1573 {
1574 new_level = bidi_pop_embedding_level (bidi_it);
1575 bidi_it->invalid_rl_levels = -1;
1576 if (bidi_it->invalid_levels)
1577 bidi_it->invalid_levels--;
1578 }
1579 if (!bidi_it->invalid_levels)
1580 new_level = bidi_pop_embedding_level (bidi_it);
1581 else
1582 {
1583 bidi_it->invalid_levels--;
1584 bidi_it->invalid_rl_levels--;
1585 }
1586 }
1587 }
1588
b7b65b15
EZ
1589 if (bidi_it->type == NEUTRAL_B) /* X8 */
1590 {
21fce5ab 1591 bidi_set_paragraph_end (bidi_it);
6bff6497
EZ
1592 /* This is needed by bidi_resolve_weak below, and in L1. */
1593 bidi_it->type_after_w1 = bidi_it->type;
89d3374a 1594 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1595 }
1596
1597 return new_level;
1598}
1599
182ce2d2
EZ
1600/* Advance in the buffer/string, resolve weak types and return the
1601 type of the next character after weak type resolution. */
fd3998ff 1602static bidi_type_t
b7b65b15
EZ
1603bidi_resolve_weak (struct bidi_it *bidi_it)
1604{
1605 bidi_type_t type;
1606 bidi_dir_t override;
1607 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1608 int new_level = bidi_resolve_explicit (bidi_it);
1609 int next_char;
1610 bidi_type_t type_of_next;
1611 struct bidi_it saved_it;
512a289d
RS
1612 EMACS_INT eob
1613 = ((STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
1614 ? bidi_it->string.schars : ZV);
b7b65b15
EZ
1615
1616 type = bidi_it->type;
1617 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1618
1619 if (type == UNKNOWN_BT
1620 || type == LRE
1621 || type == LRO
1622 || type == RLE
1623 || type == RLO
1624 || type == PDF)
1625 abort ();
1626
1627 if (new_level != prev_level
1628 || bidi_it->type == NEUTRAL_B)
1629 {
1630 /* We've got a new embedding level run, compute the directional
1631 type of sor and initialize per-run variables (UAX#9, clause
1632 X10). */
1633 bidi_set_sor_type (bidi_it, prev_level, new_level);
1634 }
1635 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1636 || type == WEAK_BN || type == STRONG_AL)
89d3374a
EZ
1637 bidi_it->type_after_w1 = type; /* needed in L1 */
1638 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1639
1640 /* Level and directional override status are already recorded in
1641 bidi_it, and do not need any change; see X6. */
1642 if (override == R2L) /* X6 */
1643 type = STRONG_R;
1644 else if (override == L2R)
1645 type = STRONG_L;
bc5a45f3 1646 else
b7b65b15 1647 {
bc5a45f3 1648 if (type == WEAK_NSM) /* W1 */
b7b65b15 1649 {
bc5a45f3 1650 /* Note that we don't need to consider the case where the
5930fe97
EZ
1651 prev character has its type overridden by an RLO or LRO,
1652 because then either the type of this NSM would have been
1653 also overridden, or the previous character is outside the
1654 current level run, and thus not relevant to this NSM.
1655 This is why NSM gets the type_after_w1 of the previous
1656 character. */
ebb5722e
EZ
1657 if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1658 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1659 && bidi_it->prev.type_after_w1 != NEUTRAL_B)
5930fe97 1660 type = bidi_it->prev.type_after_w1;
bc5a45f3
EZ
1661 else if (bidi_it->sor == R2L)
1662 type = STRONG_R;
1663 else if (bidi_it->sor == L2R)
1664 type = STRONG_L;
1665 else /* shouldn't happen! */
1666 abort ();
b7b65b15 1667 }
bc5a45f3
EZ
1668 if (type == WEAK_EN /* W2 */
1669 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1670 type = WEAK_AN;
1671 else if (type == STRONG_AL) /* W3 */
1672 type = STRONG_R;
1673 else if ((type == WEAK_ES /* W4 */
1674 && bidi_it->prev.type_after_w1 == WEAK_EN
1675 && bidi_it->prev.orig_type == WEAK_EN)
1676 || (type == WEAK_CS
1677 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1678 && bidi_it->prev.orig_type == WEAK_EN)
1679 || bidi_it->prev.type_after_w1 == WEAK_AN)))
b7b65b15 1680 {
512a289d
RS
1681 const unsigned char *s
1682 = (STRINGP (bidi_it->string.lstring)
1683 ? SDATA (bidi_it->string.lstring)
1684 : bidi_it->string.s);
1685
1686 next_char = (bidi_it->charpos + bidi_it->nchars >= eob
1687 ? BIDI_EOB
1688 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
1689 s, bidi_it->string.unibyte));
6bff6497 1690 type_of_next = bidi_get_type (next_char, override);
b7b65b15 1691
bc5a45f3 1692 if (type_of_next == WEAK_BN
b7b65b15
EZ
1693 || bidi_explicit_dir_char (next_char))
1694 {
1695 bidi_copy_it (&saved_it, bidi_it);
1696 while (bidi_resolve_explicit (bidi_it) == new_level
bc5a45f3 1697 && bidi_it->type == WEAK_BN)
b7b65b15
EZ
1698 ;
1699 type_of_next = bidi_it->type;
b7b65b15
EZ
1700 bidi_copy_it (bidi_it, &saved_it);
1701 }
bc5a45f3
EZ
1702
1703 /* If the next character is EN, but the last strong-type
1704 character is AL, that next EN will be changed to AN when
1705 we process it in W2 above. So in that case, this ES
1706 should not be changed into EN. */
1707 if (type == WEAK_ES
1708 && type_of_next == WEAK_EN
1709 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1710 type = WEAK_EN;
1711 else if (type == WEAK_CS)
b7b65b15 1712 {
bc5a45f3
EZ
1713 if (bidi_it->prev.type_after_w1 == WEAK_AN
1714 && (type_of_next == WEAK_AN
1715 /* If the next character is EN, but the last
1716 strong-type character is AL, EN will be later
1717 changed to AN when we process it in W2 above.
1718 So in that case, this ES should not be
1719 changed into EN. */
1720 || (type_of_next == WEAK_EN
1721 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1722 type = WEAK_AN;
1723 else if (bidi_it->prev.type_after_w1 == WEAK_EN
1724 && type_of_next == WEAK_EN
1725 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1726 type = WEAK_EN;
1727 }
1728 }
1729 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1730 || type == WEAK_BN) /* W5/Retaining */
1731 {
1732 if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
1733 || bidi_it->next_en_pos > bidi_it->charpos)
1734 type = WEAK_EN;
1735 else /* W5: ET/BN with EN after it. */
1736 {
182ce2d2 1737 EMACS_INT en_pos = bidi_it->charpos + bidi_it->nchars;
512a289d
RS
1738 const unsigned char *s = (STRINGP (bidi_it->string.lstring)
1739 ? SDATA (bidi_it->string.lstring)
1740 : bidi_it->string.s);
bc5a45f3 1741
102ebb00
EZ
1742 if (bidi_it->nchars <= 0)
1743 abort ();
512a289d
RS
1744 next_char
1745 = (bidi_it->charpos + bidi_it->nchars >= eob
1746 ? BIDI_EOB
1747 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
1748 bidi_it->string.unibyte));
bc5a45f3
EZ
1749 type_of_next = bidi_get_type (next_char, override);
1750
1751 if (type_of_next == WEAK_ET
1752 || type_of_next == WEAK_BN
1753 || bidi_explicit_dir_char (next_char))
1754 {
1755 bidi_copy_it (&saved_it, bidi_it);
1756 while (bidi_resolve_explicit (bidi_it) == new_level
1757 && (bidi_it->type == WEAK_BN
1758 || bidi_it->type == WEAK_ET))
1759 ;
1760 type_of_next = bidi_it->type;
1761 en_pos = bidi_it->charpos;
1762 bidi_copy_it (bidi_it, &saved_it);
1763 }
1764 if (type_of_next == WEAK_EN)
b7b65b15 1765 {
bc5a45f3
EZ
1766 /* If the last strong character is AL, the EN we've
1767 found will become AN when we get to it (W2). */
1768 if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
1769 {
1770 type = WEAK_EN;
1771 /* Remember this EN position, to speed up processing
1772 of the next ETs. */
1773 bidi_it->next_en_pos = en_pos;
1774 }
1775 else if (type == WEAK_BN)
1776 type = NEUTRAL_ON; /* W6/Retaining */
b7b65b15 1777 }
b7b65b15
EZ
1778 }
1779 }
1780 }
1781
1782 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
89d3374a
EZ
1783 || (type == WEAK_BN
1784 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1785 || bidi_it->prev.type_after_w1 == WEAK_ES
1786 || bidi_it->prev.type_after_w1 == WEAK_ET)))
b7b65b15
EZ
1787 type = NEUTRAL_ON;
1788
1789 /* Store the type we've got so far, before we clobber it with strong
1790 types in W7 and while resolving neutral types. But leave alone
1791 the original types that were recorded above, because we will need
1792 them for the L1 clause. */
89d3374a
EZ
1793 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1794 bidi_it->type_after_w1 = type;
1795 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1796
1797 if (type == WEAK_EN) /* W7 */
1798 {
89d3374a 1799 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
b7b65b15
EZ
1800 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1801 type = STRONG_L;
1802 }
1803
1804 bidi_it->type = type;
2d6e4628 1805 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1806 return type;
1807}
1808
cbb09f04
EZ
1809/* Resolve the type of a neutral character according to the type of
1810 surrounding strong text and the current embedding level. */
58b9f433 1811static inline bidi_type_t
cbb09f04
EZ
1812bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
1813{
1814 /* N1: European and Arabic numbers are treated as though they were R. */
1815 if (next_type == WEAK_EN || next_type == WEAK_AN)
1816 next_type = STRONG_R;
1817 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
1818 prev_type = STRONG_R;
1819
1820 if (next_type == prev_type) /* N1 */
1821 return next_type;
1822 else if ((lev & 1) == 0) /* N2 */
1823 return STRONG_L;
1824 else
1825 return STRONG_R;
1826}
1827
fd3998ff 1828static bidi_type_t
b7b65b15
EZ
1829bidi_resolve_neutral (struct bidi_it *bidi_it)
1830{
1831 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1832 bidi_type_t type = bidi_resolve_weak (bidi_it);
1833 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1834
1835 if (!(type == STRONG_R
1836 || type == STRONG_L
1837 || type == WEAK_BN
1838 || type == WEAK_EN
1839 || type == WEAK_AN
1840 || type == NEUTRAL_B
1841 || type == NEUTRAL_S
1842 || type == NEUTRAL_WS
1843 || type == NEUTRAL_ON))
1844 abort ();
1845
1846 if (bidi_get_category (type) == NEUTRAL
1847 || (type == WEAK_BN && prev_level == current_level))
1848 {
1849 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1850 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1851 bidi_it->next_for_neutral.type,
1852 current_level);
1853 else
1854 {
1855 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1856 the assumption of batch-style processing; see clauses W4,
1857 W5, and especially N1, which require to look far forward
182ce2d2
EZ
1858 (as well as back) in the buffer/string. May the fleas of
1859 a thousand camels infest the armpits of those who design
b7b65b15
EZ
1860 supposedly general-purpose algorithms by looking at their
1861 own implementations, and fail to consider other possible
1862 implementations! */
1863 struct bidi_it saved_it;
1864 bidi_type_t next_type;
1865
1866 if (bidi_it->scan_dir == -1)
1867 abort ();
1868
1869 bidi_copy_it (&saved_it, bidi_it);
1870 /* Scan the text forward until we find the first non-neutral
1871 character, and then use that to resolve the neutral we
1872 are dealing with now. We also cache the scanned iterator
1873 states, to salvage some of the effort later. */
1874 bidi_cache_iterator_state (bidi_it, 0);
1875 do {
1876 /* Record the info about the previous character, so that
1877 it will be cached below with this state. */
89d3374a 1878 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1879 && bidi_it->type != WEAK_BN)
1880 bidi_remember_char (&bidi_it->prev, bidi_it);
1881 type = bidi_resolve_weak (bidi_it);
1882 /* Paragraph separators have their levels fully resolved
1883 at this point, so cache them as resolved. */
1884 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1885 /* FIXME: implement L1 here, by testing for a newline and
1886 resetting the level for any sequence of whitespace
1887 characters adjacent to it. */
1888 } while (!(type == NEUTRAL_B
1889 || (type != WEAK_BN
1890 && bidi_get_category (type) != NEUTRAL)
1891 /* This is all per level run, so stop when we
1892 reach the end of this level run. */
512a289d
RS
1893 || (bidi_it->level_stack[bidi_it->stack_idx].level
1894 != current_level)));
b7b65b15
EZ
1895
1896 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1897
1898 switch (type)
1899 {
1900 case STRONG_L:
1901 case STRONG_R:
1902 case STRONG_AL:
1903 next_type = type;
1904 break;
1905 case WEAK_EN:
1906 case WEAK_AN:
1907 /* N1: ``European and Arabic numbers are treated as
1908 though they were R.'' */
1909 next_type = STRONG_R;
1910 saved_it.next_for_neutral.type = STRONG_R;
1911 break;
1912 case WEAK_BN:
1913 if (!bidi_explicit_dir_char (bidi_it->ch))
1914 abort (); /* can't happen: BNs are skipped */
1915 /* FALLTHROUGH */
1916 case NEUTRAL_B:
1917 /* Marched all the way to the end of this level run.
1918 We need to use the eor type, whose information is
1919 stored by bidi_set_sor_type in the prev_for_neutral
1920 member. */
1921 if (saved_it.type != WEAK_BN
89d3374a 1922 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
b7b65b15
EZ
1923 {
1924 next_type = bidi_it->prev_for_neutral.type;
1925 saved_it.next_for_neutral.type = next_type;
2d6e4628 1926 bidi_check_type (next_type);
b7b65b15
EZ
1927 }
1928 else
1929 {
1930 /* This is a BN which does not adjoin neutrals.
1931 Leave its type alone. */
1932 bidi_copy_it (bidi_it, &saved_it);
1933 return bidi_it->type;
1934 }
1935 break;
1936 default:
1937 abort ();
1938 }
1939 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1940 next_type, current_level);
1941 saved_it.type = type;
2d6e4628 1942 bidi_check_type (type);
b7b65b15
EZ
1943 bidi_copy_it (bidi_it, &saved_it);
1944 }
1945 }
1946 return type;
1947}
1948
1949/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1950 in the buffer/string to the next character (in the logical order),
1951 resolve the bidi type of that next character, and return that
1952 type. */
fd3998ff 1953static bidi_type_t
b7b65b15
EZ
1954bidi_type_of_next_char (struct bidi_it *bidi_it)
1955{
1956 bidi_type_t type;
1957
1958 /* This should always be called during a forward scan. */
1959 if (bidi_it->scan_dir != 1)
1960 abort ();
1961
1962 /* Reset the limit until which to ignore BNs if we step out of the
1963 area where we found only empty levels. */
87e67904 1964 if ((bidi_it->ignore_bn_limit > -1
b7b65b15 1965 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
87e67904 1966 || (bidi_it->ignore_bn_limit == -2
b7b65b15 1967 && !bidi_explicit_dir_char (bidi_it->ch)))
87e67904 1968 bidi_it->ignore_bn_limit = -1;
b7b65b15
EZ
1969
1970 type = bidi_resolve_neutral (bidi_it);
1971
1972 return type;
1973}
1974
1975/* Given an iterator state BIDI_IT, advance one character position in
182ce2d2
EZ
1976 the buffer/string to the next character (in the current scan
1977 direction), resolve the embedding and implicit levels of that next
1978 character, and return the resulting level. */
fd3998ff 1979static int
b7b65b15
EZ
1980bidi_level_of_next_char (struct bidi_it *bidi_it)
1981{
1982 bidi_type_t type;
1983 int level, prev_level = -1;
1984 struct bidi_saved_info next_for_neutral;
578b494e 1985 EMACS_INT next_char_pos = -2;
b7b65b15
EZ
1986
1987 if (bidi_it->scan_dir == 1)
1988 {
512a289d
RS
1989 EMACS_INT eob
1990 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
1991 ? bidi_it->string.schars : ZV);
9f257352 1992
b7b65b15 1993 /* There's no sense in trying to advance if we hit end of text. */
9f257352 1994 if (bidi_it->charpos >= eob)
b7b65b15
EZ
1995 return bidi_it->resolved_level;
1996
1997 /* Record the info about the previous character. */
89d3374a 1998 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1999 && bidi_it->type != WEAK_BN)
2000 bidi_remember_char (&bidi_it->prev, bidi_it);
89d3374a
EZ
2001 if (bidi_it->type_after_w1 == STRONG_R
2002 || bidi_it->type_after_w1 == STRONG_L
2003 || bidi_it->type_after_w1 == STRONG_AL)
b7b65b15
EZ
2004 bidi_remember_char (&bidi_it->last_strong, bidi_it);
2005 /* FIXME: it sounds like we don't need both prev and
2006 prev_for_neutral members, but I'm leaving them both for now. */
2007 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
2008 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
2009 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
2010
2011 /* If we overstepped the characters used for resolving neutrals
2012 and whitespace, invalidate their info in the iterator. */
2013 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
2014 bidi_it->next_for_neutral.type = UNKNOWN_BT;
2015 if (bidi_it->next_en_pos >= 0
2016 && bidi_it->charpos >= bidi_it->next_en_pos)
2017 bidi_it->next_en_pos = -1;
2018 if (bidi_it->next_for_ws.type != UNKNOWN_BT
2019 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
2020 bidi_it->next_for_ws.type = UNKNOWN_BT;
2021
2022 /* This must be taken before we fill the iterator with the info
2023 about the next char. If we scan backwards, the iterator
2024 state must be already cached, so there's no need to know the
2025 embedding level of the previous character, since we will be
2026 returning to our caller shortly. */
2027 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2028 }
2029 next_for_neutral = bidi_it->next_for_neutral;
2030
182ce2d2
EZ
2031 /* Perhaps the character we want is already cached. If it is, the
2032 call to bidi_cache_find below will return a type other than
2033 UNKNOWN_BT. */
87e67904 2034 if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
102ebb00 2035 {
512a289d
RS
2036 int bob = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2037 ? 0 : 1);
102ebb00
EZ
2038 if (bidi_it->scan_dir > 0)
2039 {
2040 if (bidi_it->nchars <= 0)
2041 abort ();
2042 next_char_pos = bidi_it->charpos + bidi_it->nchars;
2043 }
9f257352 2044 else if (bidi_it->charpos >= bob)
bb269206
EZ
2045 /* Implementation note: we allow next_char_pos to be as low as
2046 0 for buffers or -1 for strings, and that is okay because
2047 that's the "position" of the sentinel iterator state we
2048 cached at the beginning of the iteration. */
102ebb00 2049 next_char_pos = bidi_it->charpos - 1;
578b494e 2050 if (next_char_pos >= bob - 1)
87e67904
EZ
2051 type = bidi_cache_find (next_char_pos, -1, bidi_it);
2052 else
2053 type = UNKNOWN_BT;
102ebb00 2054 }
182ce2d2 2055 else
102ebb00 2056 type = UNKNOWN_BT;
b7b65b15
EZ
2057 if (type != UNKNOWN_BT)
2058 {
2059 /* Don't lose the information for resolving neutrals! The
2060 cached states could have been cached before their
2061 next_for_neutral member was computed. If we are on our way
2062 forward, we can simply take the info from the previous
2063 state. */
2064 if (bidi_it->scan_dir == 1
2065 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
2066 bidi_it->next_for_neutral = next_for_neutral;
2067
2068 /* If resolved_level is -1, it means this state was cached
2069 before it was completely resolved, so we cannot return
2070 it. */
2071 if (bidi_it->resolved_level != -1)
2072 return bidi_it->resolved_level;
2073 }
2074 if (bidi_it->scan_dir == -1)
2075 /* If we are going backwards, the iterator state is already cached
2076 from previous scans, and should be fully resolved. */
2077 abort ();
2078
2079 if (type == UNKNOWN_BT)
2080 type = bidi_type_of_next_char (bidi_it);
2081
2082 if (type == NEUTRAL_B)
2083 return bidi_it->resolved_level;
2084
2085 level = bidi_it->level_stack[bidi_it->stack_idx].level;
2086 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
2087 || (type == WEAK_BN && prev_level == level))
2088 {
2089 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
2090 abort ();
2091
2092 /* If the cached state shows a neutral character, it was not
2093 resolved by bidi_resolve_neutral, so do it now. */
2094 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2095 bidi_it->next_for_neutral.type,
2096 level);
2097 }
2098
2099 if (!(type == STRONG_R
2100 || type == STRONG_L
2101 || type == WEAK_BN
2102 || type == WEAK_EN
2103 || type == WEAK_AN))
2104 abort ();
2105 bidi_it->type = type;
2d6e4628 2106 bidi_check_type (bidi_it->type);
b7b65b15
EZ
2107
2108 /* For L1 below, we need to know, for each WS character, whether
0d327994 2109 it belongs to a sequence of WS characters preceding a newline
b7b65b15 2110 or a TAB or a paragraph separator. */
89d3374a 2111 if (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
2112 && bidi_it->next_for_ws.type == UNKNOWN_BT)
2113 {
2114 int ch;
8264569d 2115 EMACS_INT clen = bidi_it->ch_len;
6bff6497
EZ
2116 EMACS_INT bpos = bidi_it->bytepos;
2117 EMACS_INT cpos = bidi_it->charpos;
182ce2d2 2118 EMACS_INT disp_pos = bidi_it->disp_pos;
102ebb00 2119 EMACS_INT nc = bidi_it->nchars;
87e67904 2120 struct bidi_string_data bs = bidi_it->string;
b7b65b15 2121 bidi_type_t chtype;
fc6f18ce 2122 int fwp = bidi_it->frame_window_p;
0c95fcf7 2123 int dpp = bidi_it->disp_prop;
b7b65b15 2124
102ebb00
EZ
2125 if (bidi_it->nchars <= 0)
2126 abort ();
b7b65b15 2127 do {
55439c61
EZ
2128 ch = bidi_fetch_char (bpos += clen, cpos += nc, &disp_pos, &dpp, &bs,
2129 fwp, &clen, &nc);
79beb178 2130 if (ch == '\n' || ch == BIDI_EOB)
b7b65b15
EZ
2131 chtype = NEUTRAL_B;
2132 else
6bff6497 2133 chtype = bidi_get_type (ch, NEUTRAL_DIR);
b7b65b15
EZ
2134 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
2135 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
2136 bidi_it->next_for_ws.type = chtype;
2d6e4628 2137 bidi_check_type (bidi_it->next_for_ws.type);
b7b65b15
EZ
2138 bidi_it->next_for_ws.charpos = cpos;
2139 bidi_it->next_for_ws.bytepos = bpos;
2140 }
2141
2142 /* Resolve implicit levels, with a twist: PDFs get the embedding
2143 level of the enbedding they terminate. See below for the
2144 reason. */
89d3374a 2145 if (bidi_it->orig_type == PDF
b7b65b15
EZ
2146 /* Don't do this if this formatting code didn't change the
2147 embedding level due to invalid or empty embeddings. */
2148 && prev_level != level)
2149 {
2150 /* Don't look in UAX#9 for the reason for this: it's our own
2151 private quirk. The reason is that we want the formatting
2152 codes to be delivered so that they bracket the text of their
2153 embedding. For example, given the text
2154
2155 {RLO}teST{PDF}
2156
2157 we want it to be displayed as
2158
0d68907d 2159 {PDF}STet{RLO}
b7b65b15
EZ
2160
2161 not as
2162
2163 STet{RLO}{PDF}
2164
2165 which will result because we bump up the embedding level as
2166 soon as we see the RLO and pop it as soon as we see the PDF,
2167 so RLO itself has the same embedding level as "teST", and
2168 thus would be normally delivered last, just before the PDF.
2169 The switch below fiddles with the level of PDF so that this
2170 ugly side effect does not happen.
2171
2172 (This is, of course, only important if the formatting codes
e7402cb2
EZ
2173 are actually displayed, but Emacs does need to display them
2174 if the user wants to.) */
b7b65b15
EZ
2175 level = prev_level;
2176 }
89d3374a
EZ
2177 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
2178 || bidi_it->orig_type == NEUTRAL_S
e7402cb2 2179 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
89d3374a 2180 || (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
2181 && (bidi_it->next_for_ws.type == NEUTRAL_B
2182 || bidi_it->next_for_ws.type == NEUTRAL_S)))
2183 level = bidi_it->level_stack[0].level;
2184 else if ((level & 1) == 0) /* I1 */
2185 {
2186 if (type == STRONG_R)
2187 level++;
2188 else if (type == WEAK_EN || type == WEAK_AN)
2189 level += 2;
2190 }
2191 else /* I2 */
2192 {
2193 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
2194 level++;
2195 }
2196
2197 bidi_it->resolved_level = level;
2198 return level;
2199}
2200
2201/* Move to the other edge of a level given by LEVEL. If END_FLAG is
2202 non-zero, we are at the end of a level, and we need to prepare to
2203 resume the scan of the lower level.
2204
2205 If this level's other edge is cached, we simply jump to it, filling
2206 the iterator structure with the iterator state on the other edge.
182ce2d2
EZ
2207 Otherwise, we walk the buffer or string until we come back to the
2208 same level as LEVEL.
b7b65b15
EZ
2209
2210 Note: we are not talking here about a ``level run'' in the UAX#9
2211 sense of the term, but rather about a ``level'' which includes
2212 all the levels higher than it. In other words, given the levels
2213 like this:
2214
2215 11111112222222333333334443343222222111111112223322111
2216 A B C
2217
2218 and assuming we are at point A scanning left to right, this
2219 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
2220 at point B. */
2221static void
2222bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
2223{
2224 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
39e378da 2225 ptrdiff_t idx;
b7b65b15
EZ
2226
2227 /* Try the cache first. */
87e67904
EZ
2228 if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
2229 >= bidi_cache_start)
b7b65b15
EZ
2230 bidi_cache_fetch_state (idx, bidi_it);
2231 else
2232 {
2233 int new_level;
2234
2235 if (end_flag)
2236 abort (); /* if we are at end of level, its edges must be cached */
2237
2238 bidi_cache_iterator_state (bidi_it, 1);
2239 do {
2240 new_level = bidi_level_of_next_char (bidi_it);
2241 bidi_cache_iterator_state (bidi_it, 1);
2242 } while (new_level >= level);
2243 }
2244}
2245
2246void
4b292a22 2247bidi_move_to_visually_next (struct bidi_it *bidi_it)
b7b65b15
EZ
2248{
2249 int old_level, new_level, next_level;
9c82e145 2250 struct bidi_it sentinel;
acb28818 2251 struct gcpro gcpro1;
b7b65b15 2252
87e67904
EZ
2253 if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
2254 abort ();
b7b65b15
EZ
2255
2256 if (bidi_it->scan_dir == 0)
2257 {
2258 bidi_it->scan_dir = 1; /* default to logical order */
2259 }
2260
acb28818 2261 /* The code below can call eval, and thus cause GC. If we are
7e2ad32c 2262 iterating a Lisp string, make sure it won't be GCed. */
acb28818
EZ
2263 if (STRINGP (bidi_it->string.lstring))
2264 GCPRO1 (bidi_it->string.lstring);
2265
be39f003 2266 /* If we just passed a newline, initialize for the next line. */
1137e8b8
EZ
2267 if (!bidi_it->first_elt
2268 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
be39f003
EZ
2269 bidi_line_init (bidi_it);
2270
6dcfd253
EZ
2271 /* Prepare the sentinel iterator state, and cache it. When we bump
2272 into it, scanning backwards, we'll know that the last non-base
2273 level is exhausted. */
87e67904 2274 if (bidi_cache_idx == bidi_cache_start)
9c82e145
EZ
2275 {
2276 bidi_copy_it (&sentinel, bidi_it);
2277 if (bidi_it->first_elt)
2278 {
2279 sentinel.charpos--; /* cached charpos needs to be monotonic */
2280 sentinel.bytepos--;
2281 sentinel.ch = '\n'; /* doesn't matter, but why not? */
2282 sentinel.ch_len = 1;
182ce2d2 2283 sentinel.nchars = 1;
9c82e145 2284 }
6dcfd253 2285 bidi_cache_iterator_state (&sentinel, 1);
9c82e145 2286 }
b7b65b15
EZ
2287
2288 old_level = bidi_it->resolved_level;
2289 new_level = bidi_level_of_next_char (bidi_it);
b7b65b15
EZ
2290
2291 /* Reordering of resolved levels (clause L2) is implemented by
2292 jumping to the other edge of the level and flipping direction of
c0546589 2293 scanning the text whenever we find a level change. */
b7b65b15
EZ
2294 if (new_level != old_level)
2295 {
2296 int ascending = new_level > old_level;
2297 int level_to_search = ascending ? old_level + 1 : old_level;
2298 int incr = ascending ? 1 : -1;
2299 int expected_next_level = old_level + incr;
2300
b7b65b15
EZ
2301 /* Jump (or walk) to the other edge of this level. */
2302 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2303 /* Switch scan direction and peek at the next character in the
2304 new direction. */
2305 bidi_it->scan_dir = -bidi_it->scan_dir;
2306
2307 /* The following loop handles the case where the resolved level
2308 jumps by more than one. This is typical for numbers inside a
2309 run of text with left-to-right embedding direction, but can
2310 also happen in other situations. In those cases the decision
2311 where to continue after a level change, and in what direction,
2312 is tricky. For example, given a text like below:
2313
2314 abcdefgh
2315 11336622
2316
2317 (where the numbers below the text show the resolved levels),
2318 the result of reordering according to UAX#9 should be this:
2319
2320 efdcghba
2321
2322 This is implemented by the loop below which flips direction
2323 and jumps to the other edge of the level each time it finds
2324 the new level not to be the expected one. The expected level
2325 is always one more or one less than the previous one. */
2326 next_level = bidi_peek_at_next_level (bidi_it);
2327 while (next_level != expected_next_level)
2328 {
2329 expected_next_level += incr;
2330 level_to_search += incr;
2331 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2332 bidi_it->scan_dir = -bidi_it->scan_dir;
2333 next_level = bidi_peek_at_next_level (bidi_it);
2334 }
2335
2336 /* Finally, deliver the next character in the new direction. */
2337 next_level = bidi_level_of_next_char (bidi_it);
2338 }
2339
b44d9321
EZ
2340 /* Take note when we have just processed the newline that precedes
2341 the end of the paragraph. The next time we are about to be
2342 called, set_iterator_to_next will automatically reinit the
2343 paragraph direction, if needed. We do this at the newline before
2344 the paragraph separator, because the next character might not be
2345 the first character of the next paragraph, due to the bidi
c0546589
EZ
2346 reordering, whereas we _must_ know the paragraph base direction
2347 _before_ we process the paragraph's text, since the base
2348 direction affects the reordering. */
1137e8b8
EZ
2349 if (bidi_it->scan_dir == 1
2350 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
be39f003 2351 {
87e67904
EZ
2352 /* The paragraph direction of the entire string, once
2353 determined, is in effect for the entire string. Setting the
2354 separator limit to the end of the string prevents
2355 bidi_paragraph_init from being called automatically on this
2356 string. */
9f257352 2357 if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
87e67904
EZ
2358 bidi_it->separator_limit = bidi_it->string.schars;
2359 else if (bidi_it->bytepos < ZV_BYTE)
be39f003 2360 {
512a289d
RS
2361 EMACS_INT sep_len
2362 = bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
2363 bidi_it->bytepos + bidi_it->ch_len);
87e67904
EZ
2364 if (bidi_it->nchars <= 0)
2365 abort ();
2366 if (sep_len >= 0)
2367 {
2368 bidi_it->new_paragraph = 1;
2369 /* Record the buffer position of the last character of the
2370 paragraph separator. */
512a289d
RS
2371 bidi_it->separator_limit
2372 = bidi_it->charpos + bidi_it->nchars + sep_len;
87e67904 2373 }
be39f003
EZ
2374 }
2375 }
6bff6497 2376
87e67904 2377 if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
b7b65b15
EZ
2378 {
2379 /* If we are at paragraph's base embedding level and beyond the
2380 last cached position, the cache's job is done and we can
2381 discard it. */
2382 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
182ce2d2
EZ
2383 && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
2384 + bidi_cache[bidi_cache_idx - 1].nchars - 1))
b7b65b15
EZ
2385 bidi_cache_reset ();
2386 /* But as long as we are caching during forward scan, we must
2387 cache each state, or else the cache integrity will be
2388 compromised: it assumes cached states correspond to buffer
2389 positions 1:1. */
2390 else
2391 bidi_cache_iterator_state (bidi_it, 1);
2392 }
acb28818
EZ
2393
2394 if (STRINGP (bidi_it->string.lstring))
2395 UNGCPRO;
b7b65b15
EZ
2396}
2397
2398/* This is meant to be called from within the debugger, whenever you
2399 wish to examine the cache contents. */
e78aecca 2400void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
b7b65b15
EZ
2401void
2402bidi_dump_cached_states (void)
2403{
39e378da 2404 ptrdiff_t i;
b7b65b15
EZ
2405 int ndigits = 1;
2406
2407 if (bidi_cache_idx == 0)
2408 {
2409 fprintf (stderr, "The cache is empty.\n");
2410 return;
2411 }
df9733bf 2412 fprintf (stderr, "Total of %"pD"d state%s in cache:\n",
b7b65b15
EZ
2413 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2414
2415 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2416 ndigits++;
2417 fputs ("ch ", stderr);
2418 for (i = 0; i < bidi_cache_idx; i++)
2419 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2420 fputs ("\n", stderr);
2421 fputs ("lvl ", stderr);
2422 for (i = 0; i < bidi_cache_idx; i++)
2423 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2424 fputs ("\n", stderr);
2425 fputs ("pos ", stderr);
2426 for (i = 0; i < bidi_cache_idx; i++)
c2982e87 2427 fprintf (stderr, "%*"pI"d", ndigits, bidi_cache[i].charpos);
b7b65b15
EZ
2428 fputs ("\n", stderr);
2429}