Revert last change in bidi.c.
[bpt/emacs.git] / src / bidi.c
CommitLineData
cbb09f04 1/* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
acaf905b 2 Copyright (C) 2000-2001, 2004-2005, 2009-2012
b118e65f 3 Free Software Foundation, Inc.
b7b65b15
EZ
4
5This file is part of GNU Emacs.
6
a8d11bd3 7GNU Emacs is free software: you can redistribute it and/or modify
b7b65b15 8it under the terms of the GNU General Public License as published by
a8d11bd3
EZ
9the Free Software Foundation, either version 3 of the License, or
10(at your option) any later version.
b7b65b15
EZ
11
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
b7b65b15 17You should have received a copy of the GNU General Public License
a8d11bd3 18along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
b7b65b15 19
2d6e4628
EZ
20/* Written by Eli Zaretskii <eliz@gnu.org>.
21
22 A sequential implementation of the Unicode Bidirectional algorithm,
cbb09f04 23 (UBA) as per UAX#9, a part of the Unicode Standard.
b7b65b15
EZ
24
25 Unlike the reference and most other implementations, this one is
940afb59
EZ
26 designed to be called once for every character in the buffer or
27 string.
b7b65b15 28
4b292a22 29 The main entry point is bidi_move_to_visually_next. Each time it
b7b65b15
EZ
30 is called, it finds the next character in the visual order, and
31 returns its information in a special structure. The caller is then
32 expected to process this character for display or any other
4b292a22
EZ
33 purposes, and call bidi_move_to_visually_next for the next
34 character. See the comments in bidi_move_to_visually_next for more
35 details about its algorithm that finds the next visual-order
b7b65b15
EZ
36 character by resolving their levels on the fly.
37
95e1afc6 38 Two other entry points are bidi_paragraph_init and
940afb59
EZ
39 bidi_mirror_char. The first determines the base direction of a
40 paragraph, while the second returns the mirrored version of its
41 argument character.
42
95e1afc6
EZ
43 A few auxiliary entry points are used to initialize the bidi
44 iterator for iterating an object (buffer or string), push and pop
45 the bidi iterator state, and save and restore the state of the bidi
46 cache.
47
89d3374a
EZ
48 If you want to understand the code, you will have to read it
49 together with the relevant portions of UAX#9. The comments include
50 references to UAX#9 rules, for that very reason.
51
b7b65b15
EZ
52 A note about references to UAX#9 rules: if the reference says
53 something like "X9/Retaining", it means that you need to refer to
cd1181db 54 rule X9 and to its modifications described in the "Implementation
b7b65b15
EZ
55 Notes" section of UAX#9, under "Retaining Format Codes". */
56
b7b65b15 57#include <config.h>
b7b65b15 58#include <stdio.h>
29e3d8d1
EZ
59#include <setjmp.h>
60
b7b65b15
EZ
61#include "lisp.h"
62#include "buffer.h"
63#include "character.h"
64#include "dispextern.h"
65
66static int bidi_initialized = 0;
67
cbc4fd20 68static Lisp_Object bidi_type_table, bidi_mirror_table;
b7b65b15
EZ
69
70#define LRM_CHAR 0x200E
71#define RLM_CHAR 0x200F
b7b65b15 72#define BIDI_EOB -1
b7b65b15 73
b7b65b15
EZ
74/* Data type for describing the bidirectional character categories. */
75typedef enum {
76 UNKNOWN_BC,
77 NEUTRAL,
78 WEAK,
79 STRONG
80} bidi_category_t;
81
32413314
EZ
82/* UAX#9 says to search only for L, AL, or R types of characters, and
83 ignore RLE, RLO, LRE, and LRO, when determining the base paragraph
84 level. Yudit indeed ignores them. This variable is therefore set
85 by default to ignore them, but setting it to zero will take them
86 into account. */
e78aecca 87extern int bidi_ignore_explicit_marks_for_paragraph_level EXTERNALLY_VISIBLE;
b7b65b15
EZ
88int bidi_ignore_explicit_marks_for_paragraph_level = 1;
89
372b7a95 90static Lisp_Object paragraph_start_re, paragraph_separate_re;
6bff6497 91static Lisp_Object Qparagraph_start, Qparagraph_separate;
b7b65b15 92
cbb09f04
EZ
93\f
94/***********************************************************************
95 Utilities
96 ***********************************************************************/
b7b65b15 97
6bff6497
EZ
98/* Return the bidi type of a character CH, subject to the current
99 directional OVERRIDE. */
55d4c1b2 100static inline bidi_type_t
6bff6497 101bidi_get_type (int ch, bidi_dir_t override)
b7b65b15 102{
6bff6497
EZ
103 bidi_type_t default_type;
104
e7402cb2
EZ
105 if (ch == BIDI_EOB)
106 return NEUTRAL_B;
e342a24d
EZ
107 if (ch < 0 || ch > MAX_CHAR)
108 abort ();
6bff6497
EZ
109
110 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
bca633fb
EZ
111 /* Every valid character code, even those that are unassigned by the
112 UCD, have some bidi-class property, according to
113 DerivedBidiClass.txt file. Therefore, if we ever get UNKNOWN_BT
114 (= zero) code from CHAR_TABLE_REF, that's a bug. */
115 if (default_type == UNKNOWN_BT)
116 abort ();
6bff6497
EZ
117
118 if (override == NEUTRAL_DIR)
119 return default_type;
120
121 switch (default_type)
122 {
123 /* Although UAX#9 does not tell, it doesn't make sense to
124 override NEUTRAL_B and LRM/RLM characters. */
125 case NEUTRAL_B:
126 case LRE:
127 case LRO:
128 case RLE:
129 case RLO:
130 case PDF:
131 return default_type;
132 default:
133 switch (ch)
134 {
135 case LRM_CHAR:
136 case RLM_CHAR:
137 return default_type;
138 default:
139 if (override == L2R) /* X6 */
140 return STRONG_L;
141 else if (override == R2L)
142 return STRONG_R;
143 else
144 abort (); /* can't happen: handled above */
145 }
146 }
b7b65b15
EZ
147}
148
f67cdd7f 149static inline void
2d6e4628
EZ
150bidi_check_type (bidi_type_t type)
151{
f67cdd7f 152 xassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
2d6e4628
EZ
153}
154
b7b65b15 155/* Given a bidi TYPE of a character, return its category. */
55d4c1b2 156static inline bidi_category_t
b7b65b15
EZ
157bidi_get_category (bidi_type_t type)
158{
159 switch (type)
160 {
161 case UNKNOWN_BT:
162 return UNKNOWN_BC;
163 case STRONG_L:
164 case STRONG_R:
165 case STRONG_AL:
166 case LRE:
167 case LRO:
168 case RLE:
169 case RLO:
170 return STRONG;
171 case PDF: /* ??? really?? */
172 case WEAK_EN:
173 case WEAK_ES:
174 case WEAK_ET:
175 case WEAK_AN:
176 case WEAK_CS:
177 case WEAK_NSM:
178 case WEAK_BN:
179 return WEAK;
180 case NEUTRAL_B:
181 case NEUTRAL_S:
182 case NEUTRAL_WS:
183 case NEUTRAL_ON:
184 return NEUTRAL;
185 default:
186 abort ();
187 }
188}
189
a6e8d97c
EZ
190/* Return the mirrored character of C, if it has one. If C has no
191 mirrored counterpart, return C.
192 Note: The conditions in UAX#9 clause L4 regarding the surrounding
193 context must be tested by the caller. */
b7b65b15
EZ
194int
195bidi_mirror_char (int c)
196{
cbc4fd20
EZ
197 Lisp_Object val;
198
199 if (c == BIDI_EOB)
200 return c;
201 if (c < 0 || c > MAX_CHAR)
202 abort ();
b7b65b15 203
cbc4fd20
EZ
204 val = CHAR_TABLE_REF (bidi_mirror_table, c);
205 if (INTEGERP (val))
b7b65b15 206 {
4446092a 207 int v = XINT (val);
b7b65b15 208
cbc4fd20
EZ
209 if (v < 0 || v > MAX_CHAR)
210 abort ();
211
212 return v;
b7b65b15 213 }
cbc4fd20 214
b7b65b15
EZ
215 return c;
216}
217
cbb09f04
EZ
218/* Determine the start-of-run (sor) directional type given the two
219 embedding levels on either side of the run boundary. Also, update
220 the saved info about previously seen characters, since that info is
221 generally valid for a single level run. */
58b9f433 222static inline void
cbb09f04
EZ
223bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
224{
512a289d 225 int higher_level = (level_before > level_after ? level_before : level_after);
cbb09f04
EZ
226
227 /* The prev_was_pdf gork is required for when we have several PDFs
228 in a row. In that case, we want to compute the sor type for the
229 next level run only once: when we see the first PDF. That's
230 because the sor type depends only on the higher of the two levels
231 that we find on the two sides of the level boundary (see UAX#9,
232 clause X10), and so we don't need to know the final embedding
233 level to which we descend after processing all the PDFs. */
234 if (!bidi_it->prev_was_pdf || level_before < level_after)
235 /* FIXME: should the default sor direction be user selectable? */
512a289d 236 bidi_it->sor = ((higher_level & 1) != 0 ? R2L : L2R);
cbb09f04
EZ
237 if (level_before > level_after)
238 bidi_it->prev_was_pdf = 1;
239
240 bidi_it->prev.type = UNKNOWN_BT;
512a289d
RS
241 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1
242 = bidi_it->last_strong.orig_type = UNKNOWN_BT;
243 bidi_it->prev_for_neutral.type = (bidi_it->sor == R2L ? STRONG_R : STRONG_L);
cbb09f04
EZ
244 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
245 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
512a289d
RS
246 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1
247 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
cbb09f04
EZ
248 bidi_it->ignore_bn_limit = -1; /* meaning it's unknown */
249}
250
251/* Push the current embedding level and override status; reset the
252 current level to LEVEL and the current override status to OVERRIDE. */
58b9f433 253static inline void
cbb09f04
EZ
254bidi_push_embedding_level (struct bidi_it *bidi_it,
255 int level, bidi_dir_t override)
256{
257 bidi_it->stack_idx++;
7e2ad32c 258 xassert (bidi_it->stack_idx < BIDI_MAXLEVEL);
cbb09f04
EZ
259 bidi_it->level_stack[bidi_it->stack_idx].level = level;
260 bidi_it->level_stack[bidi_it->stack_idx].override = override;
261}
262
263/* Pop the embedding level and directional override status from the
264 stack, and return the new level. */
58b9f433 265static inline int
cbb09f04
EZ
266bidi_pop_embedding_level (struct bidi_it *bidi_it)
267{
268 /* UAX#9 says to ignore invalid PDFs. */
269 if (bidi_it->stack_idx > 0)
270 bidi_it->stack_idx--;
271 return bidi_it->level_stack[bidi_it->stack_idx].level;
272}
273
274/* Record in SAVED_INFO the information about the current character. */
58b9f433 275static inline void
cbb09f04
EZ
276bidi_remember_char (struct bidi_saved_info *saved_info,
277 struct bidi_it *bidi_it)
278{
279 saved_info->charpos = bidi_it->charpos;
280 saved_info->bytepos = bidi_it->bytepos;
281 saved_info->type = bidi_it->type;
282 bidi_check_type (bidi_it->type);
283 saved_info->type_after_w1 = bidi_it->type_after_w1;
284 bidi_check_type (bidi_it->type_after_w1);
285 saved_info->orig_type = bidi_it->orig_type;
286 bidi_check_type (bidi_it->orig_type);
287}
288
b7b65b15
EZ
289/* Copy the bidi iterator from FROM to TO. To save cycles, this only
290 copies the part of the level stack that is actually in use. */
55d4c1b2 291static inline void
b7b65b15
EZ
292bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
293{
294 int i;
295
e69a9370 296 /* Copy everything except the level stack and beyond. */
182ce2d2 297 memcpy (to, from, offsetof (struct bidi_it, level_stack[0]));
b7b65b15
EZ
298
299 /* Copy the active part of the level stack. */
300 to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
301 for (i = 1; i <= from->stack_idx; i++)
302 to->level_stack[i] = from->level_stack[i];
303}
304
cbb09f04
EZ
305\f
306/***********************************************************************
307 Caching the bidi iterator states
308 ***********************************************************************/
b7b65b15 309
2fe72643
EZ
310#define BIDI_CACHE_CHUNK 200
311static struct bidi_it *bidi_cache;
39e378da 312static ptrdiff_t bidi_cache_size = 0;
ad6042bb 313enum { elsz = sizeof (struct bidi_it) };
39e378da
PE
314static ptrdiff_t bidi_cache_idx; /* next unused cache slot */
315static ptrdiff_t bidi_cache_last_idx; /* slot of last cache hit */
316static ptrdiff_t bidi_cache_start = 0; /* start of cache for this
87e67904
EZ
317 "stack" level */
318
bc18e09d
PE
319/* 5-slot stack for saving the start of the previous level of the
320 cache. xdisp.c maintains a 5-slot stack for its iterator state,
321 and we need the same size of our stack. */
322static ptrdiff_t bidi_cache_start_stack[IT_STACK_SIZE];
323static int bidi_cache_sp;
324
325/* Size of header used by bidi_shelve_cache. */
326enum
327 {
512a289d
RS
328 bidi_shelve_header_size
329 = (sizeof (bidi_cache_idx) + sizeof (bidi_cache_start_stack)
330 + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start)
331 + sizeof (bidi_cache_last_idx))
bc18e09d
PE
332 };
333
87e67904
EZ
334/* Reset the cache state to the empty state. We only reset the part
335 of the cache relevant to iteration of the current object. Previous
336 objects, which are pushed on the display iterator's stack, are left
337 intact. This is called when the cached information is no more
338 useful for the current iteration, e.g. when we were reseated to a
339 new position on the same object. */
55d4c1b2 340static inline void
b7b65b15
EZ
341bidi_cache_reset (void)
342{
87e67904 343 bidi_cache_idx = bidi_cache_start;
b7b65b15
EZ
344 bidi_cache_last_idx = -1;
345}
346
87e67904
EZ
347/* Shrink the cache to its minimal size. Called when we init the bidi
348 iterator for reordering a buffer or a string that does not come
349 from display properties, because that means all the previously
350 cached info is of no further use. */
55d4c1b2 351static inline void
2fe72643
EZ
352bidi_cache_shrink (void)
353{
354 if (bidi_cache_size > BIDI_CACHE_CHUNK)
355 {
512a289d
RS
356 bidi_cache
357 = (struct bidi_it *) xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
51f30bc5 358 bidi_cache_size = BIDI_CACHE_CHUNK;
2fe72643
EZ
359 }
360 bidi_cache_reset ();
361}
362
55d4c1b2 363static inline void
39e378da 364bidi_cache_fetch_state (ptrdiff_t idx, struct bidi_it *bidi_it)
b7b65b15
EZ
365{
366 int current_scan_dir = bidi_it->scan_dir;
367
87e67904 368 if (idx < bidi_cache_start || idx >= bidi_cache_idx)
b7b65b15
EZ
369 abort ();
370
371 bidi_copy_it (bidi_it, &bidi_cache[idx]);
372 bidi_it->scan_dir = current_scan_dir;
373 bidi_cache_last_idx = idx;
374}
375
376/* Find a cached state with a given CHARPOS and resolved embedding
377 level less or equal to LEVEL. if LEVEL is -1, disregard the
378 resolved levels in cached states. DIR, if non-zero, means search
379 in that direction from the last cache hit. */
39e378da 380static inline ptrdiff_t
d311d28c 381bidi_cache_search (ptrdiff_t charpos, int level, int dir)
b7b65b15 382{
39e378da 383 ptrdiff_t i, i_start;
b7b65b15 384
6eec7596 385 if (bidi_cache_idx > bidi_cache_start)
b7b65b15 386 {
6eec7596
EZ
387 if (bidi_cache_last_idx == -1)
388 bidi_cache_last_idx = bidi_cache_idx - 1;
b7b65b15 389 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
182ce2d2
EZ
390 {
391 dir = -1;
392 i_start = bidi_cache_last_idx - 1;
393 }
394 else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
395 + bidi_cache[bidi_cache_last_idx].nchars - 1))
396 {
397 dir = 1;
398 i_start = bidi_cache_last_idx + 1;
399 }
400 else if (dir)
b7b65b15
EZ
401 i_start = bidi_cache_last_idx;
402 else
403 {
404 dir = -1;
405 i_start = bidi_cache_idx - 1;
406 }
407
408 if (dir < 0)
409 {
410 /* Linear search for now; FIXME! */
87e67904 411 for (i = i_start; i >= bidi_cache_start; i--)
182ce2d2
EZ
412 if (bidi_cache[i].charpos <= charpos
413 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
414 && (level == -1 || bidi_cache[i].resolved_level <= level))
415 return i;
416 }
417 else
418 {
419 for (i = i_start; i < bidi_cache_idx; i++)
182ce2d2
EZ
420 if (bidi_cache[i].charpos <= charpos
421 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
422 && (level == -1 || bidi_cache[i].resolved_level <= level))
423 return i;
424 }
425 }
426
427 return -1;
428}
429
430/* Find a cached state where the resolved level changes to a value
431 that is lower than LEVEL, and return its cache slot index. DIR is
432 the direction to search, starting with the last used cache slot.
87e67904
EZ
433 If DIR is zero, we search backwards from the last occupied cache
434 slot. BEFORE, if non-zero, means return the index of the slot that
435 is ``before'' the level change in the search direction. That is,
b7b65b15
EZ
436 given the cached levels like this:
437
438 1122333442211
439 AB C
440
441 and assuming we are at the position cached at the slot marked with
442 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
443 index of slot B or A, depending whether BEFORE is, respectively,
444 non-zero or zero. */
39e378da 445static ptrdiff_t
b7b65b15
EZ
446bidi_cache_find_level_change (int level, int dir, int before)
447{
448 if (bidi_cache_idx)
449 {
39e378da 450 ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
b7b65b15
EZ
451 int incr = before ? 1 : 0;
452
6eec7596
EZ
453 xassert (!dir || bidi_cache_last_idx >= 0);
454
b7b65b15
EZ
455 if (!dir)
456 dir = -1;
457 else if (!incr)
458 i += dir;
459
460 if (dir < 0)
461 {
87e67904 462 while (i >= bidi_cache_start + incr)
b7b65b15
EZ
463 {
464 if (bidi_cache[i - incr].resolved_level >= 0
465 && bidi_cache[i - incr].resolved_level < level)
466 return i;
467 i--;
468 }
469 }
470 else
471 {
472 while (i < bidi_cache_idx - incr)
473 {
474 if (bidi_cache[i + incr].resolved_level >= 0
475 && bidi_cache[i + incr].resolved_level < level)
476 return i;
477 i++;
478 }
479 }
480 }
481
482 return -1;
483}
484
58b9f433 485static inline void
39e378da 486bidi_cache_ensure_space (ptrdiff_t idx)
58b9f433
EZ
487{
488 /* Enlarge the cache as needed. */
489 if (idx >= bidi_cache_size)
490 {
f0eb61e9
PE
491 /* The bidi cache cannot be larger than the largest Lisp string
492 or buffer. */
512a289d
RS
493 ptrdiff_t string_or_buffer_bound
494 = max (BUF_BYTES_MAX, STRING_BYTES_BOUND);
f0eb61e9
PE
495
496 /* Also, it cannot be larger than what C can represent. */
512a289d
RS
497 ptrdiff_t c_bound
498 = (min (PTRDIFF_MAX, SIZE_MAX) - bidi_shelve_header_size) / elsz;
f0eb61e9 499
512a289d
RS
500 bidi_cache
501 = xpalloc (bidi_cache, &bidi_cache_size,
502 max (BIDI_CACHE_CHUNK, idx - bidi_cache_size + 1),
503 min (string_or_buffer_bound, c_bound), elsz);
58b9f433
EZ
504 }
505}
506
55d4c1b2 507static inline void
b7b65b15
EZ
508bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
509{
39e378da 510 ptrdiff_t idx;
b7b65b15
EZ
511
512 /* We should never cache on backward scans. */
513 if (bidi_it->scan_dir == -1)
514 abort ();
515 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
516
517 if (idx < 0)
518 {
519 idx = bidi_cache_idx;
58b9f433 520 bidi_cache_ensure_space (idx);
bd924a5d
EZ
521 /* Character positions should correspond to cache positions 1:1.
522 If we are outside the range of cached positions, the cache is
523 useless and must be reset. */
87e67904 524 if (idx > bidi_cache_start &&
182ce2d2
EZ
525 (bidi_it->charpos > (bidi_cache[idx - 1].charpos
526 + bidi_cache[idx - 1].nchars)
a1344e7d 527 || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
bd924a5d
EZ
528 {
529 bidi_cache_reset ();
87e67904 530 idx = bidi_cache_start;
bd924a5d 531 }
102ebb00
EZ
532 if (bidi_it->nchars <= 0)
533 abort ();
b7b65b15
EZ
534 bidi_copy_it (&bidi_cache[idx], bidi_it);
535 if (!resolved)
536 bidi_cache[idx].resolved_level = -1;
537 }
538 else
539 {
540 /* Copy only the members which could have changed, to avoid
541 costly copying of the entire struct. */
542 bidi_cache[idx].type = bidi_it->type;
2d6e4628 543 bidi_check_type (bidi_it->type);
89d3374a
EZ
544 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
545 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
546 if (resolved)
547 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
548 else
549 bidi_cache[idx].resolved_level = -1;
550 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
551 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
552 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
553 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
554 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
f67cdd7f 555 bidi_cache[idx].disp_pos = bidi_it->disp_pos;
0c95fcf7 556 bidi_cache[idx].disp_prop = bidi_it->disp_prop;
b7b65b15
EZ
557 }
558
559 bidi_cache_last_idx = idx;
560 if (idx >= bidi_cache_idx)
561 bidi_cache_idx = idx + 1;
562}
563
55d4c1b2 564static inline bidi_type_t
d311d28c 565bidi_cache_find (ptrdiff_t charpos, int level, struct bidi_it *bidi_it)
b7b65b15 566{
39e378da 567 ptrdiff_t i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
b7b65b15 568
87e67904 569 if (i >= bidi_cache_start)
b7b65b15
EZ
570 {
571 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
572
5009f85e 573 bidi_copy_it (bidi_it, &bidi_cache[i]);
b7b65b15 574 bidi_cache_last_idx = i;
5a5fa834 575 /* Don't let scan direction from the cached state override
b7b65b15
EZ
576 the current scan direction. */
577 bidi_it->scan_dir = current_scan_dir;
578 return bidi_it->type;
579 }
580
581 return UNKNOWN_BT;
582}
583
55d4c1b2 584static inline int
b7b65b15
EZ
585bidi_peek_at_next_level (struct bidi_it *bidi_it)
586{
87e67904 587 if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
b7b65b15
EZ
588 abort ();
589 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
590}
591
cbb09f04 592\f
58b9f433
EZ
593/***********************************************************************
594 Pushing and popping the bidi iterator state
595 ***********************************************************************/
58b9f433
EZ
596
597/* Push the bidi iterator state in preparation for reordering a
598 different object, e.g. display string found at certain buffer
7e2ad32c
EZ
599 position. Pushing the bidi iterator boils down to saving its
600 entire state on the cache and starting a new cache "stacked" on top
601 of the current cache. */
58b9f433
EZ
602void
603bidi_push_it (struct bidi_it *bidi_it)
b7b65b15 604{
58b9f433
EZ
605 /* Save the current iterator state in its entirety after the last
606 used cache slot. */
607 bidi_cache_ensure_space (bidi_cache_idx);
608 memcpy (&bidi_cache[bidi_cache_idx++], bidi_it, sizeof (struct bidi_it));
be39f003 609
58b9f433 610 /* Push the current cache start onto the stack. */
7e2ad32c 611 xassert (bidi_cache_sp < IT_STACK_SIZE);
58b9f433 612 bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
be39f003 613
58b9f433
EZ
614 /* Start a new level of cache, and make it empty. */
615 bidi_cache_start = bidi_cache_idx;
616 bidi_cache_last_idx = -1;
617}
618
619/* Restore the iterator state saved by bidi_push_it and return the
620 cache to the corresponding state. */
621void
622bidi_pop_it (struct bidi_it *bidi_it)
623{
624 if (bidi_cache_start <= 0)
625 abort ();
626
627 /* Reset the next free cache slot index to what it was before the
628 call to bidi_push_it. */
629 bidi_cache_idx = bidi_cache_start - 1;
630
631 /* Restore the bidi iterator state saved in the cache. */
632 memcpy (bidi_it, &bidi_cache[bidi_cache_idx], sizeof (struct bidi_it));
633
634 /* Pop the previous cache start from the stack. */
635 if (bidi_cache_sp <= 0)
636 abort ();
637 bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
638
639 /* Invalidate the last-used cache slot data. */
640 bidi_cache_last_idx = -1;
641}
642
ec7cc85b 643static ptrdiff_t bidi_cache_total_alloc;
35928349 644
ed94e6d7
EZ
645/* Stash away a copy of the cache and its control variables. */
646void *
647bidi_shelve_cache (void)
648{
649 unsigned char *databuf;
458bfed3 650 ptrdiff_t alloc;
ed94e6d7 651
35928349 652 /* Empty cache. */
ed94e6d7
EZ
653 if (bidi_cache_idx == 0)
654 return NULL;
655
458bfed3
PE
656 alloc = (bidi_shelve_header_size
657 + bidi_cache_idx * sizeof (struct bidi_it));
658 databuf = xmalloc (alloc);
659 bidi_cache_total_alloc += alloc;
35928349 660
ed94e6d7
EZ
661 memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx));
662 memcpy (databuf + sizeof (bidi_cache_idx),
663 bidi_cache, bidi_cache_idx * sizeof (struct bidi_it));
664 memcpy (databuf + sizeof (bidi_cache_idx)
665 + bidi_cache_idx * sizeof (struct bidi_it),
666 bidi_cache_start_stack, sizeof (bidi_cache_start_stack));
667 memcpy (databuf + sizeof (bidi_cache_idx)
668 + bidi_cache_idx * sizeof (struct bidi_it)
669 + sizeof (bidi_cache_start_stack),
670 &bidi_cache_sp, sizeof (bidi_cache_sp));
671 memcpy (databuf + sizeof (bidi_cache_idx)
672 + bidi_cache_idx * sizeof (struct bidi_it)
673 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
674 &bidi_cache_start, sizeof (bidi_cache_start));
675 memcpy (databuf + sizeof (bidi_cache_idx)
676 + bidi_cache_idx * sizeof (struct bidi_it)
677 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
678 + sizeof (bidi_cache_start),
679 &bidi_cache_last_idx, sizeof (bidi_cache_last_idx));
680
681 return databuf;
682}
683
d1410150
EZ
684/* Restore the cache state from a copy stashed away by
685 bidi_shelve_cache, and free the buffer used to stash that copy.
686 JUST_FREE non-zero means free the buffer, but don't restore the
687 cache; used when the corresponding iterator is discarded instead of
688 being restored. */
ed94e6d7 689void
35928349 690bidi_unshelve_cache (void *databuf, int just_free)
ed94e6d7
EZ
691{
692 unsigned char *p = databuf;
693
694 if (!p)
be39f003 695 {
d1410150
EZ
696 if (!just_free)
697 {
698 /* A NULL pointer means an empty cache. */
699 bidi_cache_start = 0;
700 bidi_cache_sp = 0;
701 bidi_cache_reset ();
702 }
be39f003 703 }
ed94e6d7
EZ
704 else
705 {
35928349
EZ
706 if (just_free)
707 {
708 ptrdiff_t idx;
709
710 memcpy (&idx, p, sizeof (bidi_cache_idx));
512a289d
RS
711 bidi_cache_total_alloc
712 -= bidi_shelve_header_size + idx * sizeof (struct bidi_it);
35928349
EZ
713 }
714 else
715 {
716 memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx));
717 bidi_cache_ensure_space (bidi_cache_idx);
718 memcpy (bidi_cache, p + sizeof (bidi_cache_idx),
719 bidi_cache_idx * sizeof (struct bidi_it));
720 memcpy (bidi_cache_start_stack,
721 p + sizeof (bidi_cache_idx)
722 + bidi_cache_idx * sizeof (struct bidi_it),
723 sizeof (bidi_cache_start_stack));
724 memcpy (&bidi_cache_sp,
725 p + sizeof (bidi_cache_idx)
726 + bidi_cache_idx * sizeof (struct bidi_it)
727 + sizeof (bidi_cache_start_stack),
728 sizeof (bidi_cache_sp));
729 memcpy (&bidi_cache_start,
730 p + sizeof (bidi_cache_idx)
731 + bidi_cache_idx * sizeof (struct bidi_it)
732 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
733 sizeof (bidi_cache_start));
734 memcpy (&bidi_cache_last_idx,
735 p + sizeof (bidi_cache_idx)
736 + bidi_cache_idx * sizeof (struct bidi_it)
737 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
738 + sizeof (bidi_cache_start),
739 sizeof (bidi_cache_last_idx));
512a289d
RS
740 bidi_cache_total_alloc
741 -= (bidi_shelve_header_size
742 + bidi_cache_idx * sizeof (struct bidi_it));
35928349 743 }
ed94e6d7
EZ
744
745 xfree (p);
746 }
747}
b7b65b15 748
58b9f433 749\f
cbb09f04
EZ
750/***********************************************************************
751 Initialization
752 ***********************************************************************/
753static void
754bidi_initialize (void)
b7b65b15 755{
474a8465
EZ
756 bidi_type_table = uniprop_table (intern ("bidi-class"));
757 if (NILP (bidi_type_table))
758 abort ();
cbb09f04
EZ
759 staticpro (&bidi_type_table);
760
474a8465
EZ
761 bidi_mirror_table = uniprop_table (intern ("mirroring"));
762 if (NILP (bidi_mirror_table))
763 abort ();
cbb09f04
EZ
764 staticpro (&bidi_mirror_table);
765
cbb09f04
EZ
766 Qparagraph_start = intern ("paragraph-start");
767 staticpro (&Qparagraph_start);
768 paragraph_start_re = Fsymbol_value (Qparagraph_start);
769 if (!STRINGP (paragraph_start_re))
770 paragraph_start_re = build_string ("\f\\|[ \t]*$");
771 staticpro (&paragraph_start_re);
772 Qparagraph_separate = intern ("paragraph-separate");
773 staticpro (&Qparagraph_separate);
774 paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
775 if (!STRINGP (paragraph_separate_re))
776 paragraph_separate_re = build_string ("[ \t\f]*$");
777 staticpro (&paragraph_separate_re);
58b9f433
EZ
778
779 bidi_cache_sp = 0;
ec7cc85b 780 bidi_cache_total_alloc = 0;
58b9f433 781
cbb09f04 782 bidi_initialized = 1;
b7b65b15
EZ
783}
784
cbb09f04
EZ
785/* Do whatever UAX#9 clause X8 says should be done at paragraph's
786 end. */
55d4c1b2 787static inline void
cbb09f04 788bidi_set_paragraph_end (struct bidi_it *bidi_it)
b7b65b15 789{
cbb09f04
EZ
790 bidi_it->invalid_levels = 0;
791 bidi_it->invalid_rl_levels = -1;
792 bidi_it->stack_idx = 0;
793 bidi_it->resolved_level = bidi_it->level_stack[0].level;
794}
b7b65b15 795
cbb09f04
EZ
796/* Initialize the bidi iterator from buffer/string position CHARPOS. */
797void
d311d28c 798bidi_init_it (ptrdiff_t charpos, ptrdiff_t bytepos, int frame_window_p,
cbb09f04
EZ
799 struct bidi_it *bidi_it)
800{
801 if (! bidi_initialized)
802 bidi_initialize ();
803 if (charpos >= 0)
804 bidi_it->charpos = charpos;
805 if (bytepos >= 0)
806 bidi_it->bytepos = bytepos;
807 bidi_it->frame_window_p = frame_window_p;
808 bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */
809 bidi_it->first_elt = 1;
810 bidi_set_paragraph_end (bidi_it);
811 bidi_it->new_paragraph = 1;
812 bidi_it->separator_limit = -1;
813 bidi_it->type = NEUTRAL_B;
814 bidi_it->type_after_w1 = NEUTRAL_B;
815 bidi_it->orig_type = NEUTRAL_B;
816 bidi_it->prev_was_pdf = 0;
512a289d
RS
817 bidi_it->prev.type = bidi_it->prev.type_after_w1
818 = bidi_it->prev.orig_type = UNKNOWN_BT;
819 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1
820 = bidi_it->last_strong.orig_type = UNKNOWN_BT;
cbb09f04 821 bidi_it->next_for_neutral.charpos = -1;
512a289d
RS
822 bidi_it->next_for_neutral.type
823 = bidi_it->next_for_neutral.type_after_w1
824 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
cbb09f04 825 bidi_it->prev_for_neutral.charpos = -1;
512a289d
RS
826 bidi_it->prev_for_neutral.type
827 = bidi_it->prev_for_neutral.type_after_w1
828 = bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
cbb09f04
EZ
829 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
830 bidi_it->disp_pos = -1; /* invalid/unknown */
0c95fcf7 831 bidi_it->disp_prop = 0;
cbb09f04
EZ
832 /* We can only shrink the cache if we are at the bottom level of its
833 "stack". */
834 if (bidi_cache_start == 0)
835 bidi_cache_shrink ();
58b9f433
EZ
836 else
837 bidi_cache_reset ();
b7b65b15
EZ
838}
839
182ce2d2 840/* Perform initializations for reordering a new line of bidi text. */
6bff6497 841static void
be39f003
EZ
842bidi_line_init (struct bidi_it *bidi_it)
843{
844 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
845 bidi_it->resolved_level = bidi_it->level_stack[0].level;
846 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
847 bidi_it->invalid_levels = 0;
848 bidi_it->invalid_rl_levels = -1;
4787455f
EZ
849 /* Setting this to zero will force its recomputation the first time
850 we need it for W5. */
851 bidi_it->next_en_pos = 0;
7b5d6677 852 bidi_it->next_en_type = UNKNOWN_BT;
be39f003 853 bidi_it->next_for_ws.type = UNKNOWN_BT;
b44d9321 854 bidi_set_sor_type (bidi_it,
512a289d 855 (bidi_it->paragraph_dir == R2L ? 1 : 0),
be39f003
EZ
856 bidi_it->level_stack[0].level); /* X10 */
857
858 bidi_cache_reset ();
859}
860
cbb09f04
EZ
861\f
862/***********************************************************************
863 Fetching characters
864 ***********************************************************************/
865
f3014ef5
EZ
866/* Count bytes in string S between BEG/BEGBYTE and END. BEG and END
867 are zero-based character positions in S, BEGBYTE is byte position
868 corresponding to BEG. UNIBYTE, if non-zero, means S is a unibyte
869 string. */
d311d28c
PE
870static inline ptrdiff_t
871bidi_count_bytes (const unsigned char *s, const ptrdiff_t beg,
872 const ptrdiff_t begbyte, const ptrdiff_t end, int unibyte)
87e67904 873{
d311d28c 874 ptrdiff_t pos = beg;
87e67904
EZ
875 const unsigned char *p = s + begbyte, *start = p;
876
f3014ef5
EZ
877 if (unibyte)
878 p = s + end;
879 else
87e67904 880 {
f3014ef5
EZ
881 if (!CHAR_HEAD_P (*p))
882 abort ();
883
884 while (pos < end)
885 {
886 p += BYTES_BY_CHAR_HEAD (*p);
887 pos++;
888 }
87e67904
EZ
889 }
890
891 return p - start;
892}
893
894/* Fetch and returns the character at byte position BYTEPOS. If S is
895 non-NULL, fetch the character from string S; otherwise fetch the
f3014ef5
EZ
896 character from the current buffer. UNIBYTE non-zero means S is a
897 unibyte string. */
87e67904 898static inline int
d311d28c 899bidi_char_at_pos (ptrdiff_t bytepos, const unsigned char *s, int unibyte)
87e67904
EZ
900{
901 if (s)
f3014ef5
EZ
902 {
903 if (unibyte)
904 return s[bytepos];
905 else
906 return STRING_CHAR (s + bytepos);
907 }
87e67904
EZ
908 else
909 return FETCH_MULTIBYTE_CHAR (bytepos);
910}
911
102ebb00
EZ
912/* Fetch and return the character at BYTEPOS/CHARPOS. If that
913 character is covered by a display string, treat the entire run of
0c95fcf7
EZ
914 covered characters as a single character, either u+2029 or u+FFFC,
915 and return their combined length in CH_LEN and NCHARS. DISP_POS
916 specifies the character position of the next display string, or -1
b75258b3
EZ
917 if not yet computed. When the next character is at or beyond that
918 position, the function updates DISP_POS with the position of the
919 next display string. DISP_PROP non-zero means that there's really
0c95fcf7
EZ
920 a display string at DISP_POS, as opposed to when we searched till
921 DISP_POS without finding one. If DISP_PROP is 2, it means the
922 display spec is of the form `(space ...)', which is replaced with
b75258b3
EZ
923 u+2029 to handle it as a paragraph separator. STRING->s is the C
924 string to iterate, or NULL if iterating over a buffer or a Lisp
925 string; in the latter case, STRING->lstring is the Lisp string. */
fec2107c 926static inline int
d311d28c 927bidi_fetch_char (ptrdiff_t bytepos, ptrdiff_t charpos, ptrdiff_t *disp_pos,
0c95fcf7 928 int *disp_prop, struct bidi_string_data *string,
d311d28c 929 int frame_window_p, ptrdiff_t *ch_len, ptrdiff_t *nchars)
182ce2d2
EZ
930{
931 int ch;
5895d7b9 932 ptrdiff_t endpos
512a289d 933 = (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
6db161be 934 struct text_pos pos;
e99a9b8b 935 int len;
182ce2d2 936
182ce2d2 937 /* If we got past the last known position of display string, compute
87e67904
EZ
938 the position of the next one. That position could be at CHARPOS. */
939 if (charpos < endpos && charpos > *disp_pos)
6db161be
EZ
940 {
941 SET_TEXT_POS (pos, charpos, bytepos);
55439c61 942 *disp_pos = compute_display_string_pos (&pos, string, frame_window_p,
0c95fcf7 943 disp_prop);
6db161be 944 }
102ebb00
EZ
945
946 /* Fetch the character at BYTEPOS. */
87e67904 947 if (charpos >= endpos)
182ce2d2
EZ
948 {
949 ch = BIDI_EOB;
950 *ch_len = 1;
951 *nchars = 1;
87e67904 952 *disp_pos = endpos;
0c95fcf7 953 *disp_prop = 0;
182ce2d2 954 }
0c95fcf7 955 else if (charpos >= *disp_pos && *disp_prop)
182ce2d2 956 {
d311d28c 957 ptrdiff_t disp_end_pos;
7b600102
EZ
958
959 /* We don't expect to find ourselves in the middle of a display
960 property. Hopefully, it will never be needed. */
961 if (charpos > *disp_pos)
962 abort ();
0c95fcf7
EZ
963 /* Text covered by `display' properties and overlays with
964 display properties or display strings is handled as a single
965 character that represents the entire run of characters
966 covered by the display property. */
967 if (*disp_prop == 2)
968 {
969 /* `(space ...)' display specs are handled as paragraph
970 separators for the purposes of the reordering; see UAX#9
971 section 3 and clause HL1 in section 4.3 there. */
972 ch = 0x2029;
973 }
974 else
975 {
976 /* All other display specs are handled as the Unicode Object
977 Replacement Character. */
978 ch = 0xFFFC;
979 }
87e67904 980 disp_end_pos = compute_display_string_end (*disp_pos, string);
fbcaa2f3
EZ
981 if (disp_end_pos < 0)
982 {
983 /* Somebody removed the display string from the buffer
984 behind our back. Recover by processing this buffer
985 position as if no display property were present there to
986 begin with. */
987 *disp_prop = 0;
988 goto normal_char;
989 }
7b600102 990 *nchars = disp_end_pos - *disp_pos;
f3014ef5
EZ
991 if (*nchars <= 0)
992 abort ();
87e67904
EZ
993 if (string->s)
994 *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
f3014ef5 995 disp_end_pos, string->unibyte);
9f257352
EZ
996 else if (STRINGP (string->lstring))
997 *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
f3014ef5 998 bytepos, disp_end_pos, string->unibyte);
87e67904
EZ
999 else
1000 *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
182ce2d2 1001 }
182ce2d2
EZ
1002 else
1003 {
fbcaa2f3 1004 normal_char:
87e67904
EZ
1005 if (string->s)
1006 {
87e67904 1007
f3014ef5
EZ
1008 if (!string->unibyte)
1009 {
1010 ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
1011 *ch_len = len;
1012 }
1013 else
1014 {
1015 ch = UNIBYTE_TO_CHAR (string->s[bytepos]);
1016 *ch_len = 1;
1017 }
87e67904 1018 }
9f257352
EZ
1019 else if (STRINGP (string->lstring))
1020 {
f3014ef5
EZ
1021 if (!string->unibyte)
1022 {
1023 ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
1024 len);
1025 *ch_len = len;
1026 }
1027 else
1028 {
1029 ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos));
1030 *ch_len = 1;
1031 }
9f257352 1032 }
87e67904
EZ
1033 else
1034 {
e99a9b8b
EZ
1035 ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (bytepos), len);
1036 *ch_len = len;
87e67904 1037 }
182ce2d2
EZ
1038 *nchars = 1;
1039 }
1040
1041 /* If we just entered a run of characters covered by a display
1042 string, compute the position of the next display string. */
55439c61 1043 if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos
0c95fcf7 1044 && *disp_prop)
6db161be
EZ
1045 {
1046 SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
55439c61 1047 *disp_pos = compute_display_string_pos (&pos, string, frame_window_p,
0c95fcf7 1048 disp_prop);
6db161be 1049 }
182ce2d2
EZ
1050
1051 return ch;
1052}
1053
cbb09f04
EZ
1054\f
1055/***********************************************************************
1056 Determining paragraph direction
1057 ***********************************************************************/
1058
1059/* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
1060 Value is the non-negative length of the paragraph separator
1061 following the buffer position, -1 if position is at the beginning
1062 of a new paragraph, or -2 if position is neither at beginning nor
1063 at end of a paragraph. */
d311d28c
PE
1064static ptrdiff_t
1065bidi_at_paragraph_end (ptrdiff_t charpos, ptrdiff_t bytepos)
cbb09f04
EZ
1066{
1067 Lisp_Object sep_re;
1068 Lisp_Object start_re;
d311d28c 1069 ptrdiff_t val;
cbb09f04
EZ
1070
1071 sep_re = paragraph_separate_re;
1072 start_re = paragraph_start_re;
1073
1074 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
1075 if (val < 0)
1076 {
1077 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
1078 val = -1;
1079 else
1080 val = -2;
1081 }
1082
1083 return val;
1084}
1085
1137e8b8
EZ
1086/* On my 2005-vintage machine, searching back for paragraph start
1087 takes ~1 ms per line. And bidi_paragraph_init is called 4 times
1088 when user types C-p. The number below limits each call to
1089 bidi_paragraph_init to about 10 ms. */
1090#define MAX_PARAGRAPH_SEARCH 7500
1091
be39f003 1092/* Find the beginning of this paragraph by looking back in the buffer.
1137e8b8
EZ
1093 Value is the byte position of the paragraph's beginning, or
1094 BEGV_BYTE if paragraph_start_re is still not found after looking
1095 back MAX_PARAGRAPH_SEARCH lines in the buffer. */
d311d28c
PE
1096static ptrdiff_t
1097bidi_find_paragraph_start (ptrdiff_t pos, ptrdiff_t pos_byte)
6bff6497 1098{
372b7a95 1099 Lisp_Object re = paragraph_start_re;
d311d28c 1100 ptrdiff_t limit = ZV, limit_byte = ZV_BYTE;
4c0078d4 1101 ptrdiff_t n = 0;
6bff6497 1102
6bff6497 1103 while (pos_byte > BEGV_BYTE
1137e8b8 1104 && n++ < MAX_PARAGRAPH_SEARCH
6bff6497
EZ
1105 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
1106 {
182ce2d2
EZ
1107 /* FIXME: What if the paragraph beginning is covered by a
1108 display string? And what if a display string covering some
1109 of the text over which we scan back includes
1110 paragraph_start_re? */
be39f003
EZ
1111 pos = find_next_newline_no_quit (pos - 1, -1);
1112 pos_byte = CHAR_TO_BYTE (pos);
6bff6497 1113 }
1137e8b8
EZ
1114 if (n >= MAX_PARAGRAPH_SEARCH)
1115 pos_byte = BEGV_BYTE;
be39f003 1116 return pos_byte;
6bff6497
EZ
1117}
1118
bea4f10c 1119/* Determine the base direction, a.k.a. base embedding level, of the
b44d9321
EZ
1120 paragraph we are about to iterate through. If DIR is either L2R or
1121 R2L, just use that. Otherwise, determine the paragraph direction
bea4f10c
EZ
1122 from the first strong directional character of the paragraph.
1123
182ce2d2 1124 NO_DEFAULT_P non-zero means don't default to L2R if the paragraph
bea4f10c
EZ
1125 has no strong directional characters and both DIR and
1126 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
1127 in the buffer until a paragraph is found with a strong character,
1128 or until hitting BEGV. In the latter case, fall back to L2R. This
1129 flag is used in current-bidi-paragraph-direction.
1130
1131 Note that this function gives the paragraph separator the same
1132 direction as the preceding paragraph, even though Emacs generally
9858f6c3 1133 views the separator as not belonging to any paragraph. */
b7b65b15 1134void
bea4f10c 1135bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p)
b7b65b15 1136{
d311d28c 1137 ptrdiff_t bytepos = bidi_it->bytepos;
9f257352 1138 int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
d311d28c 1139 ptrdiff_t pstartbyte;
87e67904
EZ
1140 /* Note that begbyte is a byte position, while end is a character
1141 position. Yes, this is ugly, but we are trying to avoid costly
1142 calls to BYTE_TO_CHAR and its ilk. */
d311d28c
PE
1143 ptrdiff_t begbyte = string_p ? 0 : BEGV_BYTE;
1144 ptrdiff_t end = string_p ? bidi_it->string.schars : ZV;
e7402cb2 1145
5e65aec0 1146 /* Special case for an empty buffer. */
87e67904 1147 if (bytepos == begbyte && bidi_it->charpos == end)
5e65aec0 1148 dir = L2R;
9c82e145 1149 /* We should never be called at EOB or before BEGV. */
87e67904 1150 else if (bidi_it->charpos >= end || bytepos < begbyte)
9c82e145
EZ
1151 abort ();
1152
be39f003
EZ
1153 if (dir == L2R)
1154 {
1155 bidi_it->paragraph_dir = L2R;
1156 bidi_it->new_paragraph = 0;
1157 }
1158 else if (dir == R2L)
1159 {
1160 bidi_it->paragraph_dir = R2L;
1161 bidi_it->new_paragraph = 0;
1162 }
b7b65b15
EZ
1163 else if (dir == NEUTRAL_DIR) /* P2 */
1164 {
182ce2d2 1165 int ch;
d311d28c
PE
1166 ptrdiff_t ch_len, nchars;
1167 ptrdiff_t pos, disp_pos = -1;
0c95fcf7 1168 int disp_prop = 0;
6bff6497 1169 bidi_type_t type;
9f257352 1170 const unsigned char *s;
be39f003 1171
d20e1419
EZ
1172 if (!bidi_initialized)
1173 bidi_initialize ();
1174
be39f003
EZ
1175 /* If we are inside a paragraph separator, we are just waiting
1176 for the separator to be exhausted; use the previous paragraph
e5a2fec7
EZ
1177 direction. But don't do that if we have been just reseated,
1178 because we need to reinitialize below in that case. */
1179 if (!bidi_it->first_elt
1180 && bidi_it->charpos < bidi_it->separator_limit)
be39f003
EZ
1181 return;
1182
b44d9321 1183 /* If we are on a newline, get past it to where the next
5e65aec0
EZ
1184 paragraph might start. But don't do that at BEGV since then
1185 we are potentially in a new paragraph that doesn't yet
1186 exist. */
c143c213 1187 pos = bidi_it->charpos;
512a289d
RS
1188 s = (STRINGP (bidi_it->string.lstring)
1189 ? SDATA (bidi_it->string.lstring)
1190 : bidi_it->string.s);
f3014ef5
EZ
1191 if (bytepos > begbyte
1192 && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n')
be39f003 1193 {
b44d9321 1194 bytepos++;
c143c213 1195 pos++;
be39f003 1196 }
b44d9321
EZ
1197
1198 /* We are either at the beginning of a paragraph or in the
1199 middle of it. Find where this paragraph starts. */
87e67904
EZ
1200 if (string_p)
1201 {
1202 /* We don't support changes of paragraph direction inside a
1203 string. It is treated as a single paragraph. */
1204 pstartbyte = 0;
1205 }
1206 else
1207 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
be39f003
EZ
1208 bidi_it->separator_limit = -1;
1209 bidi_it->new_paragraph = 0;
bea4f10c
EZ
1210
1211 /* The following loop is run more than once only if NO_DEFAULT_P
87e67904 1212 is non-zero, and only if we are iterating on a buffer. */
bea4f10c
EZ
1213 do {
1214 bytepos = pstartbyte;
87e67904
EZ
1215 if (!string_p)
1216 pos = BYTE_TO_CHAR (bytepos);
0c95fcf7 1217 ch = bidi_fetch_char (bytepos, pos, &disp_pos, &disp_prop,
55439c61 1218 &bidi_it->string,
87e67904 1219 bidi_it->frame_window_p, &ch_len, &nchars);
bea4f10c
EZ
1220 type = bidi_get_type (ch, NEUTRAL_DIR);
1221
182ce2d2 1222 for (pos += nchars, bytepos += ch_len;
bea4f10c
EZ
1223 (bidi_get_category (type) != STRONG)
1224 || (bidi_ignore_explicit_marks_for_paragraph_level
1225 && (type == RLE || type == RLO
1226 || type == LRE || type == LRO));
1227 type = bidi_get_type (ch, NEUTRAL_DIR))
1228 {
87e67904 1229 if (pos >= end)
bea4f10c
EZ
1230 {
1231 /* Pretend there's a paragraph separator at end of
87e67904 1232 buffer/string. */
bea4f10c
EZ
1233 type = NEUTRAL_B;
1234 break;
1235 }
0bb23927
EZ
1236 if (!string_p
1237 && type == NEUTRAL_B
1238 && bidi_at_paragraph_end (pos, bytepos) >= -1)
cf99dcf8 1239 break;
102ebb00 1240 /* Fetch next character and advance to get past it. */
55439c61 1241 ch = bidi_fetch_char (bytepos, pos, &disp_pos,
0c95fcf7 1242 &disp_prop, &bidi_it->string,
fc6f18ce 1243 bidi_it->frame_window_p, &ch_len, &nchars);
182ce2d2
EZ
1244 pos += nchars;
1245 bytepos += ch_len;
bea4f10c 1246 }
32413314
EZ
1247 if ((type == STRONG_R || type == STRONG_AL) /* P3 */
1248 || (!bidi_ignore_explicit_marks_for_paragraph_level
1249 && (type == RLO || type == RLE)))
bea4f10c 1250 bidi_it->paragraph_dir = R2L;
32413314
EZ
1251 else if (type == STRONG_L
1252 || (!bidi_ignore_explicit_marks_for_paragraph_level
1253 && (type == LRO || type == LRE)))
bea4f10c 1254 bidi_it->paragraph_dir = L2R;
87e67904
EZ
1255 if (!string_p
1256 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
bea4f10c
EZ
1257 {
1258 /* If this paragraph is at BEGV, default to L2R. */
1259 if (pstartbyte == BEGV_BYTE)
1260 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1261 else
1262 {
d311d28c
PE
1263 ptrdiff_t prevpbyte = pstartbyte;
1264 ptrdiff_t p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
bea4f10c
EZ
1265
1266 /* Find the beginning of the previous paragraph, if any. */
1267 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1268 {
182ce2d2
EZ
1269 /* FXIME: What if p is covered by a display
1270 string? See also a FIXME inside
1271 bidi_find_paragraph_start. */
bea4f10c
EZ
1272 p--;
1273 pbyte = CHAR_TO_BYTE (p);
1274 prevpbyte = bidi_find_paragraph_start (p, pbyte);
1275 }
1276 pstartbyte = prevpbyte;
1277 }
1278 }
87e67904
EZ
1279 } while (!string_p
1280 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
b7b65b15 1281 }
be39f003
EZ
1282 else
1283 abort ();
b7b65b15 1284
b44d9321
EZ
1285 /* Contrary to UAX#9 clause P3, we only default the paragraph
1286 direction to L2R if we have no previous usable paragraph
bea4f10c 1287 direction. This is allowed by the HL1 clause. */
d20e1419 1288 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
bea4f10c 1289 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
be39f003 1290 if (bidi_it->paragraph_dir == R2L)
b44d9321 1291 bidi_it->level_stack[0].level = 1;
be39f003 1292 else
b44d9321 1293 bidi_it->level_stack[0].level = 0;
be39f003
EZ
1294
1295 bidi_line_init (bidi_it);
b7b65b15
EZ
1296}
1297
cbb09f04
EZ
1298\f
1299/***********************************************************************
1300 Resolving explicit and implicit levels.
58b9f433 1301 The rest of this file constitutes the core of the UBA implementation.
cbb09f04 1302 ***********************************************************************/
b7b65b15 1303
55d4c1b2 1304static inline int
182ce2d2 1305bidi_explicit_dir_char (int ch)
b7b65b15 1306{
182ce2d2
EZ
1307 bidi_type_t ch_type;
1308
1309 if (!bidi_initialized)
1310 abort ();
1311 ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1312 return (ch_type == LRE || ch_type == LRO
1313 || ch_type == RLE || ch_type == RLO
1314 || ch_type == PDF);
b7b65b15
EZ
1315}
1316
1317/* A helper function for bidi_resolve_explicit. It advances to the
1318 next character in logical order and determines the new embedding
1319 level and directional override, but does not take into account
1320 empty embeddings. */
1321static int
1322bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1323{
1324 int curchar;
1325 bidi_type_t type;
1326 int current_level;
1327 int new_level;
1328 bidi_dir_t override;
9f257352 1329 int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
b7b65b15 1330
182ce2d2
EZ
1331 /* If reseat()'ed, don't advance, so as to start iteration from the
1332 position where we were reseated. bidi_it->bytepos can be less
1333 than BEGV_BYTE after reseat to BEGV. */
87e67904 1334 if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
9c82e145 1335 || bidi_it->first_elt)
e7402cb2 1336 {
9c82e145 1337 bidi_it->first_elt = 0;
87e67904
EZ
1338 if (string_p)
1339 {
512a289d
RS
1340 const unsigned char *p
1341 = (STRINGP (bidi_it->string.lstring)
1342 ? SDATA (bidi_it->string.lstring)
1343 : bidi_it->string.s);
9f257352 1344
87e67904
EZ
1345 if (bidi_it->charpos < 0)
1346 bidi_it->charpos = 0;
f3014ef5
EZ
1347 bidi_it->bytepos = bidi_count_bytes (p, 0, 0, bidi_it->charpos,
1348 bidi_it->string.unibyte);
87e67904
EZ
1349 }
1350 else
1351 {
1352 if (bidi_it->charpos < BEGV)
1353 bidi_it->charpos = BEGV;
1354 bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
1355 }
e7402cb2 1356 }
87e67904
EZ
1357 /* Don't move at end of buffer/string. */
1358 else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
b7b65b15 1359 {
182ce2d2
EZ
1360 /* Advance to the next character, skipping characters covered by
1361 display strings (nchars > 1). */
102ebb00
EZ
1362 if (bidi_it->nchars <= 0)
1363 abort ();
182ce2d2 1364 bidi_it->charpos += bidi_it->nchars;
e7402cb2
EZ
1365 if (bidi_it->ch_len == 0)
1366 abort ();
b7b65b15
EZ
1367 bidi_it->bytepos += bidi_it->ch_len;
1368 }
1369
1370 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1371 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1372 new_level = current_level;
1373
87e67904 1374 if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
e7402cb2
EZ
1375 {
1376 curchar = BIDI_EOB;
1377 bidi_it->ch_len = 1;
182ce2d2 1378 bidi_it->nchars = 1;
87e67904 1379 bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
0c95fcf7 1380 bidi_it->disp_prop = 0;
e7402cb2
EZ
1381 }
1382 else
1383 {
182ce2d2
EZ
1384 /* Fetch the character at BYTEPOS. If it is covered by a
1385 display string, treat the entire run of covered characters as
1386 a single character u+FFFC. */
102ebb00 1387 curchar = bidi_fetch_char (bidi_it->bytepos, bidi_it->charpos,
0c95fcf7 1388 &bidi_it->disp_pos, &bidi_it->disp_prop,
55439c61 1389 &bidi_it->string, bidi_it->frame_window_p,
102ebb00 1390 &bidi_it->ch_len, &bidi_it->nchars);
e7402cb2 1391 }
b7b65b15 1392 bidi_it->ch = curchar;
b7b65b15 1393
6bff6497
EZ
1394 /* Don't apply directional override here, as all the types we handle
1395 below will not be affected by the override anyway, and we need
1396 the original type unaltered. The override will be applied in
1397 bidi_resolve_weak. */
1398 type = bidi_get_type (curchar, NEUTRAL_DIR);
89d3374a
EZ
1399 bidi_it->orig_type = type;
1400 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
1401
1402 if (type != PDF)
1403 bidi_it->prev_was_pdf = 0;
1404
89d3374a 1405 bidi_it->type_after_w1 = UNKNOWN_BT;
b7b65b15
EZ
1406
1407 switch (type)
1408 {
1409 case RLE: /* X2 */
1410 case RLO: /* X4 */
89d3374a
EZ
1411 bidi_it->type_after_w1 = type;
1412 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1413 type = WEAK_BN; /* X9/Retaining */
87e67904 1414 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1415 {
1416 if (current_level <= BIDI_MAXLEVEL - 4)
1417 {
1418 /* Compute the least odd embedding level greater than
1419 the current level. */
1420 new_level = ((current_level + 1) & ~1) + 1;
89d3374a 1421 if (bidi_it->type_after_w1 == RLE)
b7b65b15
EZ
1422 override = NEUTRAL_DIR;
1423 else
1424 override = R2L;
1425 if (current_level == BIDI_MAXLEVEL - 4)
1426 bidi_it->invalid_rl_levels = 0;
1427 bidi_push_embedding_level (bidi_it, new_level, override);
1428 }
1429 else
1430 {
1431 bidi_it->invalid_levels++;
1432 /* See the commentary about invalid_rl_levels below. */
1433 if (bidi_it->invalid_rl_levels < 0)
1434 bidi_it->invalid_rl_levels = 0;
1435 bidi_it->invalid_rl_levels++;
1436 }
1437 }
89d3374a 1438 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
7b5d6677
EZ
1439 || (bidi_it->next_en_pos > bidi_it->charpos
1440 && bidi_it->next_en_type == WEAK_EN))
b7b65b15
EZ
1441 type = WEAK_EN;
1442 break;
1443 case LRE: /* X3 */
1444 case LRO: /* X5 */
89d3374a
EZ
1445 bidi_it->type_after_w1 = type;
1446 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1447 type = WEAK_BN; /* X9/Retaining */
87e67904 1448 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1449 {
1450 if (current_level <= BIDI_MAXLEVEL - 5)
1451 {
1452 /* Compute the least even embedding level greater than
1453 the current level. */
1454 new_level = ((current_level + 2) & ~1);
89d3374a 1455 if (bidi_it->type_after_w1 == LRE)
b7b65b15
EZ
1456 override = NEUTRAL_DIR;
1457 else
1458 override = L2R;
1459 bidi_push_embedding_level (bidi_it, new_level, override);
1460 }
1461 else
1462 {
1463 bidi_it->invalid_levels++;
1464 /* invalid_rl_levels counts invalid levels encountered
1465 while the embedding level was already too high for
1466 LRE/LRO, but not for RLE/RLO. That is because
1467 there may be exactly one PDF which we should not
1468 ignore even though invalid_levels is non-zero.
1469 invalid_rl_levels helps to know what PDF is
1470 that. */
1471 if (bidi_it->invalid_rl_levels >= 0)
1472 bidi_it->invalid_rl_levels++;
1473 }
1474 }
89d3374a 1475 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
7b5d6677
EZ
1476 || (bidi_it->next_en_pos > bidi_it->charpos
1477 && bidi_it->next_en_type == WEAK_EN))
b7b65b15
EZ
1478 type = WEAK_EN;
1479 break;
1480 case PDF: /* X7 */
89d3374a
EZ
1481 bidi_it->type_after_w1 = type;
1482 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1483 type = WEAK_BN; /* X9/Retaining */
87e67904 1484 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1485 {
1486 if (!bidi_it->invalid_rl_levels)
1487 {
1488 new_level = bidi_pop_embedding_level (bidi_it);
1489 bidi_it->invalid_rl_levels = -1;
1490 if (bidi_it->invalid_levels)
1491 bidi_it->invalid_levels--;
1492 /* else nothing: UAX#9 says to ignore invalid PDFs */
1493 }
1494 if (!bidi_it->invalid_levels)
1495 new_level = bidi_pop_embedding_level (bidi_it);
1496 else
1497 {
1498 bidi_it->invalid_levels--;
1499 bidi_it->invalid_rl_levels--;
1500 }
1501 }
89d3374a 1502 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
7b5d6677
EZ
1503 || (bidi_it->next_en_pos > bidi_it->charpos
1504 && bidi_it->next_en_type == WEAK_EN))
b7b65b15
EZ
1505 type = WEAK_EN;
1506 break;
1507 default:
1508 /* Nothing. */
1509 break;
1510 }
1511
1512 bidi_it->type = type;
2d6e4628 1513 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1514
1515 return new_level;
1516}
1517
1518/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1519 in the buffer/string to the next character (in the logical order),
1520 resolve any explicit embeddings and directional overrides, and
1521 return the embedding level of the character after resolving
1522 explicit directives and ignoring empty embeddings. */
b7b65b15
EZ
1523static int
1524bidi_resolve_explicit (struct bidi_it *bidi_it)
1525{
1526 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1527 int new_level = bidi_resolve_explicit_1 (bidi_it);
d311d28c 1528 ptrdiff_t eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
512a289d
RS
1529 const unsigned char *s
1530 = (STRINGP (bidi_it->string.lstring)
1531 ? SDATA (bidi_it->string.lstring)
1532 : bidi_it->string.s);
b7b65b15
EZ
1533
1534 if (prev_level < new_level
1535 && bidi_it->type == WEAK_BN
87e67904
EZ
1536 && bidi_it->ignore_bn_limit == -1 /* only if not already known */
1537 && bidi_it->charpos < eob /* not already at EOB */
1538 && bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
f3014ef5
EZ
1539 + bidi_it->ch_len, s,
1540 bidi_it->string.unibyte)))
b7b65b15
EZ
1541 {
1542 /* Avoid pushing and popping embedding levels if the level run
1543 is empty, as this breaks level runs where it shouldn't.
1544 UAX#9 removes all the explicit embedding and override codes,
1545 so empty embeddings disappear without a trace. We need to
1546 behave as if we did the same. */
1547 struct bidi_it saved_it;
1548 int level = prev_level;
1549
1550 bidi_copy_it (&saved_it, bidi_it);
1551
87e67904 1552 while (bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
f3014ef5
EZ
1553 + bidi_it->ch_len, s,
1554 bidi_it->string.unibyte)))
b7b65b15 1555 {
182ce2d2
EZ
1556 /* This advances to the next character, skipping any
1557 characters covered by display strings. */
b7b65b15 1558 level = bidi_resolve_explicit_1 (bidi_it);
9f257352
EZ
1559 /* If string.lstring was relocated inside bidi_resolve_explicit_1,
1560 a pointer to its data is no longer valid. */
1561 if (STRINGP (bidi_it->string.lstring))
1562 s = SDATA (bidi_it->string.lstring);
b7b65b15
EZ
1563 }
1564
102ebb00
EZ
1565 if (bidi_it->nchars <= 0)
1566 abort ();
b7b65b15 1567 if (level == prev_level) /* empty embedding */
182ce2d2 1568 saved_it.ignore_bn_limit = bidi_it->charpos + bidi_it->nchars;
b7b65b15 1569 else /* this embedding is non-empty */
87e67904 1570 saved_it.ignore_bn_limit = -2;
b7b65b15
EZ
1571
1572 bidi_copy_it (bidi_it, &saved_it);
87e67904 1573 if (bidi_it->ignore_bn_limit > -1)
b7b65b15
EZ
1574 {
1575 /* We pushed a level, but we shouldn't have. Undo that. */
1576 if (!bidi_it->invalid_rl_levels)
1577 {
1578 new_level = bidi_pop_embedding_level (bidi_it);
1579 bidi_it->invalid_rl_levels = -1;
1580 if (bidi_it->invalid_levels)
1581 bidi_it->invalid_levels--;
1582 }
1583 if (!bidi_it->invalid_levels)
1584 new_level = bidi_pop_embedding_level (bidi_it);
1585 else
1586 {
1587 bidi_it->invalid_levels--;
1588 bidi_it->invalid_rl_levels--;
1589 }
1590 }
1591 }
1592
b7b65b15
EZ
1593 if (bidi_it->type == NEUTRAL_B) /* X8 */
1594 {
21fce5ab 1595 bidi_set_paragraph_end (bidi_it);
6bff6497
EZ
1596 /* This is needed by bidi_resolve_weak below, and in L1. */
1597 bidi_it->type_after_w1 = bidi_it->type;
89d3374a 1598 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1599 }
1600
1601 return new_level;
1602}
1603
182ce2d2
EZ
1604/* Advance in the buffer/string, resolve weak types and return the
1605 type of the next character after weak type resolution. */
fd3998ff 1606static bidi_type_t
b7b65b15
EZ
1607bidi_resolve_weak (struct bidi_it *bidi_it)
1608{
1609 bidi_type_t type;
1610 bidi_dir_t override;
1611 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1612 int new_level = bidi_resolve_explicit (bidi_it);
1613 int next_char;
1614 bidi_type_t type_of_next;
1615 struct bidi_it saved_it;
5895d7b9 1616 ptrdiff_t eob
512a289d
RS
1617 = ((STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
1618 ? bidi_it->string.schars : ZV);
b7b65b15
EZ
1619
1620 type = bidi_it->type;
1621 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1622
1623 if (type == UNKNOWN_BT
1624 || type == LRE
1625 || type == LRO
1626 || type == RLE
1627 || type == RLO
1628 || type == PDF)
1629 abort ();
1630
1631 if (new_level != prev_level
1632 || bidi_it->type == NEUTRAL_B)
1633 {
1634 /* We've got a new embedding level run, compute the directional
1635 type of sor and initialize per-run variables (UAX#9, clause
1636 X10). */
1637 bidi_set_sor_type (bidi_it, prev_level, new_level);
1638 }
1639 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1640 || type == WEAK_BN || type == STRONG_AL)
89d3374a
EZ
1641 bidi_it->type_after_w1 = type; /* needed in L1 */
1642 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1643
1644 /* Level and directional override status are already recorded in
1645 bidi_it, and do not need any change; see X6. */
1646 if (override == R2L) /* X6 */
1647 type = STRONG_R;
1648 else if (override == L2R)
1649 type = STRONG_L;
bc5a45f3 1650 else
b7b65b15 1651 {
bc5a45f3 1652 if (type == WEAK_NSM) /* W1 */
b7b65b15 1653 {
bc5a45f3 1654 /* Note that we don't need to consider the case where the
5930fe97
EZ
1655 prev character has its type overridden by an RLO or LRO,
1656 because then either the type of this NSM would have been
1657 also overridden, or the previous character is outside the
1658 current level run, and thus not relevant to this NSM.
1659 This is why NSM gets the type_after_w1 of the previous
1660 character. */
ebb5722e
EZ
1661 if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1662 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1663 && bidi_it->prev.type_after_w1 != NEUTRAL_B)
5930fe97 1664 type = bidi_it->prev.type_after_w1;
bc5a45f3
EZ
1665 else if (bidi_it->sor == R2L)
1666 type = STRONG_R;
1667 else if (bidi_it->sor == L2R)
1668 type = STRONG_L;
1669 else /* shouldn't happen! */
1670 abort ();
b7b65b15 1671 }
bc5a45f3
EZ
1672 if (type == WEAK_EN /* W2 */
1673 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1674 type = WEAK_AN;
1675 else if (type == STRONG_AL) /* W3 */
1676 type = STRONG_R;
1677 else if ((type == WEAK_ES /* W4 */
1678 && bidi_it->prev.type_after_w1 == WEAK_EN
1679 && bidi_it->prev.orig_type == WEAK_EN)
1680 || (type == WEAK_CS
1681 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1682 && bidi_it->prev.orig_type == WEAK_EN)
1683 || bidi_it->prev.type_after_w1 == WEAK_AN)))
b7b65b15 1684 {
512a289d
RS
1685 const unsigned char *s
1686 = (STRINGP (bidi_it->string.lstring)
1687 ? SDATA (bidi_it->string.lstring)
1688 : bidi_it->string.s);
1689
1690 next_char = (bidi_it->charpos + bidi_it->nchars >= eob
1691 ? BIDI_EOB
1692 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
1693 s, bidi_it->string.unibyte));
6bff6497 1694 type_of_next = bidi_get_type (next_char, override);
b7b65b15 1695
bc5a45f3 1696 if (type_of_next == WEAK_BN
b7b65b15
EZ
1697 || bidi_explicit_dir_char (next_char))
1698 {
1699 bidi_copy_it (&saved_it, bidi_it);
1700 while (bidi_resolve_explicit (bidi_it) == new_level
bc5a45f3 1701 && bidi_it->type == WEAK_BN)
b7b65b15
EZ
1702 ;
1703 type_of_next = bidi_it->type;
b7b65b15
EZ
1704 bidi_copy_it (bidi_it, &saved_it);
1705 }
bc5a45f3
EZ
1706
1707 /* If the next character is EN, but the last strong-type
1708 character is AL, that next EN will be changed to AN when
1709 we process it in W2 above. So in that case, this ES
1710 should not be changed into EN. */
1711 if (type == WEAK_ES
1712 && type_of_next == WEAK_EN
1713 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1714 type = WEAK_EN;
1715 else if (type == WEAK_CS)
b7b65b15 1716 {
bc5a45f3
EZ
1717 if (bidi_it->prev.type_after_w1 == WEAK_AN
1718 && (type_of_next == WEAK_AN
1719 /* If the next character is EN, but the last
1720 strong-type character is AL, EN will be later
1721 changed to AN when we process it in W2 above.
1722 So in that case, this ES should not be
1723 changed into EN. */
1724 || (type_of_next == WEAK_EN
1725 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1726 type = WEAK_AN;
1727 else if (bidi_it->prev.type_after_w1 == WEAK_EN
1728 && type_of_next == WEAK_EN
1729 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1730 type = WEAK_EN;
1731 }
1732 }
1733 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1734 || type == WEAK_BN) /* W5/Retaining */
1735 {
7b5d6677 1736 if (bidi_it->prev.type_after_w1 == WEAK_EN) /* ET/BN w/EN before it */
bc5a45f3 1737 type = WEAK_EN;
7b5d6677
EZ
1738 else if (bidi_it->next_en_pos > bidi_it->charpos
1739 && bidi_it->next_en_type != WEAK_BN)
1740 {
1741 if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */
1742 type = WEAK_EN;
1743 }
1744 else if (bidi_it->next_en_pos >=0)
bc5a45f3 1745 {
d311d28c 1746 ptrdiff_t en_pos = bidi_it->charpos + bidi_it->nchars;
512a289d
RS
1747 const unsigned char *s = (STRINGP (bidi_it->string.lstring)
1748 ? SDATA (bidi_it->string.lstring)
1749 : bidi_it->string.s);
bc5a45f3 1750
102ebb00
EZ
1751 if (bidi_it->nchars <= 0)
1752 abort ();
512a289d
RS
1753 next_char
1754 = (bidi_it->charpos + bidi_it->nchars >= eob
1755 ? BIDI_EOB
1756 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
1757 bidi_it->string.unibyte));
bc5a45f3
EZ
1758 type_of_next = bidi_get_type (next_char, override);
1759
1760 if (type_of_next == WEAK_ET
1761 || type_of_next == WEAK_BN
1762 || bidi_explicit_dir_char (next_char))
1763 {
1764 bidi_copy_it (&saved_it, bidi_it);
1765 while (bidi_resolve_explicit (bidi_it) == new_level
1766 && (bidi_it->type == WEAK_BN
1767 || bidi_it->type == WEAK_ET))
1768 ;
1769 type_of_next = bidi_it->type;
1770 en_pos = bidi_it->charpos;
1771 bidi_copy_it (bidi_it, &saved_it);
1772 }
7b5d6677
EZ
1773 /* Remember this position, to speed up processing of the
1774 next ETs. */
1775 bidi_it->next_en_pos = en_pos;
bc5a45f3 1776 if (type_of_next == WEAK_EN)
b7b65b15 1777 {
bc5a45f3
EZ
1778 /* If the last strong character is AL, the EN we've
1779 found will become AN when we get to it (W2). */
7b5d6677
EZ
1780 if (bidi_it->last_strong.type_after_w1 == STRONG_AL)
1781 type_of_next = WEAK_AN;
bc5a45f3
EZ
1782 else if (type == WEAK_BN)
1783 type = NEUTRAL_ON; /* W6/Retaining */
7b5d6677
EZ
1784 else
1785 type = WEAK_EN;
b7b65b15 1786 }
4787455f
EZ
1787 else if (type_of_next == NEUTRAL_B)
1788 /* Record the fact that there are no more ENs from
1789 here to the end of paragraph, to avoid entering the
1790 loop above ever again in this paragraph. */
1791 bidi_it->next_en_pos = -1;
7b5d6677
EZ
1792 /* Record the type of the character where we ended our search. */
1793 bidi_it->next_en_type = type_of_next;
b7b65b15
EZ
1794 }
1795 }
1796 }
1797
1798 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
89d3374a
EZ
1799 || (type == WEAK_BN
1800 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1801 || bidi_it->prev.type_after_w1 == WEAK_ES
1802 || bidi_it->prev.type_after_w1 == WEAK_ET)))
b7b65b15
EZ
1803 type = NEUTRAL_ON;
1804
1805 /* Store the type we've got so far, before we clobber it with strong
1806 types in W7 and while resolving neutral types. But leave alone
1807 the original types that were recorded above, because we will need
1808 them for the L1 clause. */
89d3374a
EZ
1809 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1810 bidi_it->type_after_w1 = type;
1811 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1812
1813 if (type == WEAK_EN) /* W7 */
1814 {
89d3374a 1815 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
b7b65b15
EZ
1816 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1817 type = STRONG_L;
1818 }
1819
1820 bidi_it->type = type;
2d6e4628 1821 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1822 return type;
1823}
1824
cbb09f04
EZ
1825/* Resolve the type of a neutral character according to the type of
1826 surrounding strong text and the current embedding level. */
58b9f433 1827static inline bidi_type_t
cbb09f04
EZ
1828bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
1829{
1830 /* N1: European and Arabic numbers are treated as though they were R. */
1831 if (next_type == WEAK_EN || next_type == WEAK_AN)
1832 next_type = STRONG_R;
1833 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
1834 prev_type = STRONG_R;
1835
1836 if (next_type == prev_type) /* N1 */
1837 return next_type;
1838 else if ((lev & 1) == 0) /* N2 */
1839 return STRONG_L;
1840 else
1841 return STRONG_R;
1842}
1843
fd3998ff 1844static bidi_type_t
b7b65b15
EZ
1845bidi_resolve_neutral (struct bidi_it *bidi_it)
1846{
1847 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1848 bidi_type_t type = bidi_resolve_weak (bidi_it);
1849 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1850
1851 if (!(type == STRONG_R
1852 || type == STRONG_L
1853 || type == WEAK_BN
1854 || type == WEAK_EN
1855 || type == WEAK_AN
1856 || type == NEUTRAL_B
1857 || type == NEUTRAL_S
1858 || type == NEUTRAL_WS
1859 || type == NEUTRAL_ON))
1860 abort ();
1861
4787455f
EZ
1862 if ((type != NEUTRAL_B /* Don't risk entering the long loop below if
1863 we are already at paragraph end. */
1864 && bidi_get_category (type) == NEUTRAL)
b7b65b15
EZ
1865 || (type == WEAK_BN && prev_level == current_level))
1866 {
1867 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1868 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1869 bidi_it->next_for_neutral.type,
1870 current_level);
4787455f
EZ
1871 /* The next two "else if" clauses are shortcuts for the
1872 important special case when we have a long sequence of
1873 neutral or WEAK_BN characters, such as whitespace or nulls or
1874 other control characters, on the base embedding level of the
1875 paragraph, and that sequence goes all the way to the end of
1876 the paragraph and follows a character whose resolved
1877 directionality is identical to the base embedding level.
1878 (This is what happens in a buffer with plain L2R text that
1879 happens to include long sequences of control characters.) By
1880 virtue of N1, the result of examining this long sequence will
1881 always be either STRONG_L or STRONG_R, depending on the base
1882 embedding level. So we use this fact directly instead of
1883 entering the expensive loop in the "else" clause. */
1884 else if (current_level == 0
1885 && bidi_it->prev_for_neutral.type == STRONG_L
1886 && !bidi_explicit_dir_char (bidi_it->ch))
1887 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1888 STRONG_L, current_level);
1889 else if (/* current level is 1 */
1890 current_level == 1
1891 /* base embedding level is also 1 */
1892 && bidi_it->level_stack[0].level == 1
1893 /* previous character is one of those considered R for
1894 the purposes of W5 */
1895 && (bidi_it->prev_for_neutral.type == STRONG_R
1896 || bidi_it->prev_for_neutral.type == WEAK_EN
1897 || bidi_it->prev_for_neutral.type == WEAK_AN)
1898 && !bidi_explicit_dir_char (bidi_it->ch))
1899 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1900 STRONG_R, current_level);
b7b65b15
EZ
1901 else
1902 {
1903 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1904 the assumption of batch-style processing; see clauses W4,
1905 W5, and especially N1, which require to look far forward
182ce2d2
EZ
1906 (as well as back) in the buffer/string. May the fleas of
1907 a thousand camels infest the armpits of those who design
b7b65b15
EZ
1908 supposedly general-purpose algorithms by looking at their
1909 own implementations, and fail to consider other possible
1910 implementations! */
1911 struct bidi_it saved_it;
1912 bidi_type_t next_type;
1913
1914 if (bidi_it->scan_dir == -1)
1915 abort ();
1916
1917 bidi_copy_it (&saved_it, bidi_it);
1918 /* Scan the text forward until we find the first non-neutral
1919 character, and then use that to resolve the neutral we
1920 are dealing with now. We also cache the scanned iterator
1921 states, to salvage some of the effort later. */
1922 bidi_cache_iterator_state (bidi_it, 0);
1923 do {
1924 /* Record the info about the previous character, so that
1925 it will be cached below with this state. */
89d3374a 1926 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1927 && bidi_it->type != WEAK_BN)
1928 bidi_remember_char (&bidi_it->prev, bidi_it);
1929 type = bidi_resolve_weak (bidi_it);
1930 /* Paragraph separators have their levels fully resolved
1931 at this point, so cache them as resolved. */
1932 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1933 /* FIXME: implement L1 here, by testing for a newline and
1934 resetting the level for any sequence of whitespace
1935 characters adjacent to it. */
1936 } while (!(type == NEUTRAL_B
1937 || (type != WEAK_BN
1938 && bidi_get_category (type) != NEUTRAL)
1939 /* This is all per level run, so stop when we
1940 reach the end of this level run. */
512a289d
RS
1941 || (bidi_it->level_stack[bidi_it->stack_idx].level
1942 != current_level)));
b7b65b15
EZ
1943
1944 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1945
1946 switch (type)
1947 {
1948 case STRONG_L:
1949 case STRONG_R:
1950 case STRONG_AL:
4787455f
EZ
1951 /* Actually, STRONG_AL cannot happen here, because
1952 bidi_resolve_weak converts it to STRONG_R, per W3. */
1953 xassert (type != STRONG_AL);
b7b65b15
EZ
1954 next_type = type;
1955 break;
1956 case WEAK_EN:
1957 case WEAK_AN:
1958 /* N1: ``European and Arabic numbers are treated as
1959 though they were R.'' */
1960 next_type = STRONG_R;
b7b65b15
EZ
1961 break;
1962 case WEAK_BN:
1963 if (!bidi_explicit_dir_char (bidi_it->ch))
1964 abort (); /* can't happen: BNs are skipped */
1965 /* FALLTHROUGH */
1966 case NEUTRAL_B:
1967 /* Marched all the way to the end of this level run.
1968 We need to use the eor type, whose information is
1969 stored by bidi_set_sor_type in the prev_for_neutral
1970 member. */
1971 if (saved_it.type != WEAK_BN
89d3374a 1972 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
4787455f 1973 next_type = bidi_it->prev_for_neutral.type;
b7b65b15
EZ
1974 else
1975 {
1976 /* This is a BN which does not adjoin neutrals.
1977 Leave its type alone. */
1978 bidi_copy_it (bidi_it, &saved_it);
1979 return bidi_it->type;
1980 }
1981 break;
1982 default:
1983 abort ();
1984 }
1985 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1986 next_type, current_level);
4787455f 1987 saved_it.next_for_neutral.type = next_type;
b7b65b15 1988 saved_it.type = type;
4787455f 1989 bidi_check_type (next_type);
2d6e4628 1990 bidi_check_type (type);
b7b65b15
EZ
1991 bidi_copy_it (bidi_it, &saved_it);
1992 }
1993 }
1994 return type;
1995}
1996
1997/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1998 in the buffer/string to the next character (in the logical order),
1999 resolve the bidi type of that next character, and return that
2000 type. */
fd3998ff 2001static bidi_type_t
b7b65b15
EZ
2002bidi_type_of_next_char (struct bidi_it *bidi_it)
2003{
2004 bidi_type_t type;
2005
2006 /* This should always be called during a forward scan. */
2007 if (bidi_it->scan_dir != 1)
2008 abort ();
2009
2010 /* Reset the limit until which to ignore BNs if we step out of the
2011 area where we found only empty levels. */
87e67904 2012 if ((bidi_it->ignore_bn_limit > -1
b7b65b15 2013 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
87e67904 2014 || (bidi_it->ignore_bn_limit == -2
b7b65b15 2015 && !bidi_explicit_dir_char (bidi_it->ch)))
87e67904 2016 bidi_it->ignore_bn_limit = -1;
b7b65b15
EZ
2017
2018 type = bidi_resolve_neutral (bidi_it);
2019
2020 return type;
2021}
2022
2023/* Given an iterator state BIDI_IT, advance one character position in
182ce2d2
EZ
2024 the buffer/string to the next character (in the current scan
2025 direction), resolve the embedding and implicit levels of that next
2026 character, and return the resulting level. */
fd3998ff 2027static int
b7b65b15
EZ
2028bidi_level_of_next_char (struct bidi_it *bidi_it)
2029{
2030 bidi_type_t type;
2031 int level, prev_level = -1;
2032 struct bidi_saved_info next_for_neutral;
d311d28c 2033 ptrdiff_t next_char_pos = -2;
b7b65b15
EZ
2034
2035 if (bidi_it->scan_dir == 1)
2036 {
5895d7b9 2037 ptrdiff_t eob
512a289d
RS
2038 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2039 ? bidi_it->string.schars : ZV);
9f257352 2040
b7b65b15 2041 /* There's no sense in trying to advance if we hit end of text. */
9f257352 2042 if (bidi_it->charpos >= eob)
b7b65b15
EZ
2043 return bidi_it->resolved_level;
2044
2045 /* Record the info about the previous character. */
89d3374a 2046 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
2047 && bidi_it->type != WEAK_BN)
2048 bidi_remember_char (&bidi_it->prev, bidi_it);
89d3374a
EZ
2049 if (bidi_it->type_after_w1 == STRONG_R
2050 || bidi_it->type_after_w1 == STRONG_L
2051 || bidi_it->type_after_w1 == STRONG_AL)
b7b65b15
EZ
2052 bidi_remember_char (&bidi_it->last_strong, bidi_it);
2053 /* FIXME: it sounds like we don't need both prev and
2054 prev_for_neutral members, but I'm leaving them both for now. */
2055 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
2056 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
2057 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
2058
2059 /* If we overstepped the characters used for resolving neutrals
2060 and whitespace, invalidate their info in the iterator. */
2061 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
2062 bidi_it->next_for_neutral.type = UNKNOWN_BT;
2063 if (bidi_it->next_en_pos >= 0
2064 && bidi_it->charpos >= bidi_it->next_en_pos)
7b5d6677
EZ
2065 {
2066 bidi_it->next_en_pos = 0;
2067 bidi_it->next_en_type = UNKNOWN_BT;
2068 }
b7b65b15
EZ
2069 if (bidi_it->next_for_ws.type != UNKNOWN_BT
2070 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
2071 bidi_it->next_for_ws.type = UNKNOWN_BT;
2072
2073 /* This must be taken before we fill the iterator with the info
2074 about the next char. If we scan backwards, the iterator
2075 state must be already cached, so there's no need to know the
2076 embedding level of the previous character, since we will be
2077 returning to our caller shortly. */
2078 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2079 }
2080 next_for_neutral = bidi_it->next_for_neutral;
2081
182ce2d2
EZ
2082 /* Perhaps the character we want is already cached. If it is, the
2083 call to bidi_cache_find below will return a type other than
2084 UNKNOWN_BT. */
87e67904 2085 if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
102ebb00 2086 {
512a289d
RS
2087 int bob = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2088 ? 0 : 1);
102ebb00
EZ
2089 if (bidi_it->scan_dir > 0)
2090 {
2091 if (bidi_it->nchars <= 0)
2092 abort ();
2093 next_char_pos = bidi_it->charpos + bidi_it->nchars;
2094 }
9f257352 2095 else if (bidi_it->charpos >= bob)
bb269206
EZ
2096 /* Implementation note: we allow next_char_pos to be as low as
2097 0 for buffers or -1 for strings, and that is okay because
2098 that's the "position" of the sentinel iterator state we
2099 cached at the beginning of the iteration. */
102ebb00 2100 next_char_pos = bidi_it->charpos - 1;
578b494e 2101 if (next_char_pos >= bob - 1)
87e67904
EZ
2102 type = bidi_cache_find (next_char_pos, -1, bidi_it);
2103 else
2104 type = UNKNOWN_BT;
102ebb00 2105 }
182ce2d2 2106 else
102ebb00 2107 type = UNKNOWN_BT;
b7b65b15
EZ
2108 if (type != UNKNOWN_BT)
2109 {
2110 /* Don't lose the information for resolving neutrals! The
2111 cached states could have been cached before their
2112 next_for_neutral member was computed. If we are on our way
2113 forward, we can simply take the info from the previous
2114 state. */
2115 if (bidi_it->scan_dir == 1
2116 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
2117 bidi_it->next_for_neutral = next_for_neutral;
2118
2119 /* If resolved_level is -1, it means this state was cached
2120 before it was completely resolved, so we cannot return
2121 it. */
2122 if (bidi_it->resolved_level != -1)
2123 return bidi_it->resolved_level;
2124 }
2125 if (bidi_it->scan_dir == -1)
2126 /* If we are going backwards, the iterator state is already cached
2127 from previous scans, and should be fully resolved. */
2128 abort ();
2129
2130 if (type == UNKNOWN_BT)
2131 type = bidi_type_of_next_char (bidi_it);
2132
2133 if (type == NEUTRAL_B)
2134 return bidi_it->resolved_level;
2135
2136 level = bidi_it->level_stack[bidi_it->stack_idx].level;
2137 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
2138 || (type == WEAK_BN && prev_level == level))
2139 {
2140 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
2141 abort ();
2142
2143 /* If the cached state shows a neutral character, it was not
2144 resolved by bidi_resolve_neutral, so do it now. */
2145 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2146 bidi_it->next_for_neutral.type,
2147 level);
2148 }
2149
2150 if (!(type == STRONG_R
2151 || type == STRONG_L
2152 || type == WEAK_BN
2153 || type == WEAK_EN
2154 || type == WEAK_AN))
2155 abort ();
2156 bidi_it->type = type;
2d6e4628 2157 bidi_check_type (bidi_it->type);
b7b65b15
EZ
2158
2159 /* For L1 below, we need to know, for each WS character, whether
0d327994 2160 it belongs to a sequence of WS characters preceding a newline
b7b65b15 2161 or a TAB or a paragraph separator. */
89d3374a 2162 if (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
2163 && bidi_it->next_for_ws.type == UNKNOWN_BT)
2164 {
2165 int ch;
d311d28c
PE
2166 ptrdiff_t clen = bidi_it->ch_len;
2167 ptrdiff_t bpos = bidi_it->bytepos;
2168 ptrdiff_t cpos = bidi_it->charpos;
2169 ptrdiff_t disp_pos = bidi_it->disp_pos;
2170 ptrdiff_t nc = bidi_it->nchars;
87e67904 2171 struct bidi_string_data bs = bidi_it->string;
b7b65b15 2172 bidi_type_t chtype;
fc6f18ce 2173 int fwp = bidi_it->frame_window_p;
0c95fcf7 2174 int dpp = bidi_it->disp_prop;
b7b65b15 2175
102ebb00
EZ
2176 if (bidi_it->nchars <= 0)
2177 abort ();
b7b65b15 2178 do {
55439c61
EZ
2179 ch = bidi_fetch_char (bpos += clen, cpos += nc, &disp_pos, &dpp, &bs,
2180 fwp, &clen, &nc);
79beb178 2181 if (ch == '\n' || ch == BIDI_EOB)
b7b65b15
EZ
2182 chtype = NEUTRAL_B;
2183 else
6bff6497 2184 chtype = bidi_get_type (ch, NEUTRAL_DIR);
b7b65b15
EZ
2185 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
2186 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
2187 bidi_it->next_for_ws.type = chtype;
2d6e4628 2188 bidi_check_type (bidi_it->next_for_ws.type);
b7b65b15
EZ
2189 bidi_it->next_for_ws.charpos = cpos;
2190 bidi_it->next_for_ws.bytepos = bpos;
2191 }
2192
2193 /* Resolve implicit levels, with a twist: PDFs get the embedding
55622b93 2194 level of the embedding they terminate. See below for the
b7b65b15 2195 reason. */
89d3374a 2196 if (bidi_it->orig_type == PDF
b7b65b15
EZ
2197 /* Don't do this if this formatting code didn't change the
2198 embedding level due to invalid or empty embeddings. */
2199 && prev_level != level)
2200 {
2201 /* Don't look in UAX#9 for the reason for this: it's our own
2202 private quirk. The reason is that we want the formatting
2203 codes to be delivered so that they bracket the text of their
2204 embedding. For example, given the text
2205
2206 {RLO}teST{PDF}
2207
2208 we want it to be displayed as
2209
0d68907d 2210 {PDF}STet{RLO}
b7b65b15
EZ
2211
2212 not as
2213
2214 STet{RLO}{PDF}
2215
2216 which will result because we bump up the embedding level as
2217 soon as we see the RLO and pop it as soon as we see the PDF,
2218 so RLO itself has the same embedding level as "teST", and
2219 thus would be normally delivered last, just before the PDF.
2220 The switch below fiddles with the level of PDF so that this
2221 ugly side effect does not happen.
2222
2223 (This is, of course, only important if the formatting codes
e7402cb2
EZ
2224 are actually displayed, but Emacs does need to display them
2225 if the user wants to.) */
b7b65b15
EZ
2226 level = prev_level;
2227 }
89d3374a
EZ
2228 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
2229 || bidi_it->orig_type == NEUTRAL_S
e7402cb2 2230 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
89d3374a 2231 || (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
2232 && (bidi_it->next_for_ws.type == NEUTRAL_B
2233 || bidi_it->next_for_ws.type == NEUTRAL_S)))
2234 level = bidi_it->level_stack[0].level;
2235 else if ((level & 1) == 0) /* I1 */
2236 {
2237 if (type == STRONG_R)
2238 level++;
2239 else if (type == WEAK_EN || type == WEAK_AN)
2240 level += 2;
2241 }
2242 else /* I2 */
2243 {
2244 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
2245 level++;
2246 }
2247
2248 bidi_it->resolved_level = level;
2249 return level;
2250}
2251
2252/* Move to the other edge of a level given by LEVEL. If END_FLAG is
2253 non-zero, we are at the end of a level, and we need to prepare to
2254 resume the scan of the lower level.
2255
2256 If this level's other edge is cached, we simply jump to it, filling
2257 the iterator structure with the iterator state on the other edge.
182ce2d2
EZ
2258 Otherwise, we walk the buffer or string until we come back to the
2259 same level as LEVEL.
b7b65b15
EZ
2260
2261 Note: we are not talking here about a ``level run'' in the UAX#9
2262 sense of the term, but rather about a ``level'' which includes
2263 all the levels higher than it. In other words, given the levels
2264 like this:
2265
2266 11111112222222333333334443343222222111111112223322111
2267 A B C
2268
2269 and assuming we are at point A scanning left to right, this
2270 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
2271 at point B. */
2272static void
2273bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
2274{
2275 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
39e378da 2276 ptrdiff_t idx;
b7b65b15
EZ
2277
2278 /* Try the cache first. */
87e67904
EZ
2279 if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
2280 >= bidi_cache_start)
b7b65b15
EZ
2281 bidi_cache_fetch_state (idx, bidi_it);
2282 else
2283 {
2284 int new_level;
2285
2286 if (end_flag)
2287 abort (); /* if we are at end of level, its edges must be cached */
2288
2289 bidi_cache_iterator_state (bidi_it, 1);
2290 do {
2291 new_level = bidi_level_of_next_char (bidi_it);
2292 bidi_cache_iterator_state (bidi_it, 1);
2293 } while (new_level >= level);
2294 }
2295}
2296
2297void
4b292a22 2298bidi_move_to_visually_next (struct bidi_it *bidi_it)
b7b65b15
EZ
2299{
2300 int old_level, new_level, next_level;
9c82e145 2301 struct bidi_it sentinel;
acb28818 2302 struct gcpro gcpro1;
b7b65b15 2303
87e67904
EZ
2304 if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
2305 abort ();
b7b65b15
EZ
2306
2307 if (bidi_it->scan_dir == 0)
2308 {
2309 bidi_it->scan_dir = 1; /* default to logical order */
2310 }
2311
acb28818 2312 /* The code below can call eval, and thus cause GC. If we are
7e2ad32c 2313 iterating a Lisp string, make sure it won't be GCed. */
acb28818
EZ
2314 if (STRINGP (bidi_it->string.lstring))
2315 GCPRO1 (bidi_it->string.lstring);
2316
be39f003 2317 /* If we just passed a newline, initialize for the next line. */
1137e8b8
EZ
2318 if (!bidi_it->first_elt
2319 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
be39f003
EZ
2320 bidi_line_init (bidi_it);
2321
6dcfd253
EZ
2322 /* Prepare the sentinel iterator state, and cache it. When we bump
2323 into it, scanning backwards, we'll know that the last non-base
2324 level is exhausted. */
87e67904 2325 if (bidi_cache_idx == bidi_cache_start)
9c82e145
EZ
2326 {
2327 bidi_copy_it (&sentinel, bidi_it);
2328 if (bidi_it->first_elt)
2329 {
2330 sentinel.charpos--; /* cached charpos needs to be monotonic */
2331 sentinel.bytepos--;
2332 sentinel.ch = '\n'; /* doesn't matter, but why not? */
2333 sentinel.ch_len = 1;
182ce2d2 2334 sentinel.nchars = 1;
9c82e145 2335 }
6dcfd253 2336 bidi_cache_iterator_state (&sentinel, 1);
9c82e145 2337 }
b7b65b15
EZ
2338
2339 old_level = bidi_it->resolved_level;
2340 new_level = bidi_level_of_next_char (bidi_it);
b7b65b15
EZ
2341
2342 /* Reordering of resolved levels (clause L2) is implemented by
2343 jumping to the other edge of the level and flipping direction of
c0546589 2344 scanning the text whenever we find a level change. */
b7b65b15
EZ
2345 if (new_level != old_level)
2346 {
2347 int ascending = new_level > old_level;
2348 int level_to_search = ascending ? old_level + 1 : old_level;
2349 int incr = ascending ? 1 : -1;
2350 int expected_next_level = old_level + incr;
2351
b7b65b15
EZ
2352 /* Jump (or walk) to the other edge of this level. */
2353 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2354 /* Switch scan direction and peek at the next character in the
2355 new direction. */
2356 bidi_it->scan_dir = -bidi_it->scan_dir;
2357
2358 /* The following loop handles the case where the resolved level
2359 jumps by more than one. This is typical for numbers inside a
2360 run of text with left-to-right embedding direction, but can
2361 also happen in other situations. In those cases the decision
2362 where to continue after a level change, and in what direction,
2363 is tricky. For example, given a text like below:
2364
2365 abcdefgh
2366 11336622
2367
2368 (where the numbers below the text show the resolved levels),
2369 the result of reordering according to UAX#9 should be this:
2370
2371 efdcghba
2372
2373 This is implemented by the loop below which flips direction
2374 and jumps to the other edge of the level each time it finds
2375 the new level not to be the expected one. The expected level
2376 is always one more or one less than the previous one. */
2377 next_level = bidi_peek_at_next_level (bidi_it);
2378 while (next_level != expected_next_level)
2379 {
2380 expected_next_level += incr;
2381 level_to_search += incr;
2382 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2383 bidi_it->scan_dir = -bidi_it->scan_dir;
2384 next_level = bidi_peek_at_next_level (bidi_it);
2385 }
2386
2387 /* Finally, deliver the next character in the new direction. */
2388 next_level = bidi_level_of_next_char (bidi_it);
2389 }
2390
b44d9321
EZ
2391 /* Take note when we have just processed the newline that precedes
2392 the end of the paragraph. The next time we are about to be
2393 called, set_iterator_to_next will automatically reinit the
2394 paragraph direction, if needed. We do this at the newline before
2395 the paragraph separator, because the next character might not be
2396 the first character of the next paragraph, due to the bidi
c0546589
EZ
2397 reordering, whereas we _must_ know the paragraph base direction
2398 _before_ we process the paragraph's text, since the base
2399 direction affects the reordering. */
1137e8b8
EZ
2400 if (bidi_it->scan_dir == 1
2401 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
be39f003 2402 {
87e67904
EZ
2403 /* The paragraph direction of the entire string, once
2404 determined, is in effect for the entire string. Setting the
2405 separator limit to the end of the string prevents
2406 bidi_paragraph_init from being called automatically on this
2407 string. */
9f257352 2408 if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
87e67904
EZ
2409 bidi_it->separator_limit = bidi_it->string.schars;
2410 else if (bidi_it->bytepos < ZV_BYTE)
be39f003 2411 {
5895d7b9 2412 ptrdiff_t sep_len
512a289d
RS
2413 = bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
2414 bidi_it->bytepos + bidi_it->ch_len);
87e67904
EZ
2415 if (bidi_it->nchars <= 0)
2416 abort ();
2417 if (sep_len >= 0)
2418 {
2419 bidi_it->new_paragraph = 1;
2420 /* Record the buffer position of the last character of the
2421 paragraph separator. */
512a289d
RS
2422 bidi_it->separator_limit
2423 = bidi_it->charpos + bidi_it->nchars + sep_len;
87e67904 2424 }
be39f003
EZ
2425 }
2426 }
6bff6497 2427
87e67904 2428 if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
b7b65b15
EZ
2429 {
2430 /* If we are at paragraph's base embedding level and beyond the
2431 last cached position, the cache's job is done and we can
2432 discard it. */
2433 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
182ce2d2
EZ
2434 && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
2435 + bidi_cache[bidi_cache_idx - 1].nchars - 1))
b7b65b15
EZ
2436 bidi_cache_reset ();
2437 /* But as long as we are caching during forward scan, we must
2438 cache each state, or else the cache integrity will be
2439 compromised: it assumes cached states correspond to buffer
2440 positions 1:1. */
2441 else
2442 bidi_cache_iterator_state (bidi_it, 1);
2443 }
acb28818
EZ
2444
2445 if (STRINGP (bidi_it->string.lstring))
2446 UNGCPRO;
b7b65b15
EZ
2447}
2448
2449/* This is meant to be called from within the debugger, whenever you
2450 wish to examine the cache contents. */
e78aecca 2451void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
b7b65b15
EZ
2452void
2453bidi_dump_cached_states (void)
2454{
39e378da 2455 ptrdiff_t i;
b7b65b15
EZ
2456 int ndigits = 1;
2457
2458 if (bidi_cache_idx == 0)
2459 {
2460 fprintf (stderr, "The cache is empty.\n");
2461 return;
2462 }
df9733bf 2463 fprintf (stderr, "Total of %"pD"d state%s in cache:\n",
b7b65b15
EZ
2464 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2465
2466 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2467 ndigits++;
2468 fputs ("ch ", stderr);
2469 for (i = 0; i < bidi_cache_idx; i++)
2470 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2471 fputs ("\n", stderr);
2472 fputs ("lvl ", stderr);
2473 for (i = 0; i < bidi_cache_idx; i++)
2474 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2475 fputs ("\n", stderr);
2476 fputs ("pos ", stderr);
2477 for (i = 0; i < bidi_cache_idx; i++)
7c85f529 2478 fprintf (stderr, "%*"pD"d", ndigits, bidi_cache[i].charpos);
b7b65b15
EZ
2479 fputs ("\n", stderr);
2480}