Fix bug #16043 with crashes when displaying new bidi control characters.
[bpt/emacs.git] / src / bidi.c
CommitLineData
cbb09f04 1/* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
ab422c4d
PE
2 Copyright (C) 2000-2001, 2004-2005, 2009-2013 Free Software
3 Foundation, Inc.
b7b65b15
EZ
4
5This file is part of GNU Emacs.
6
a8d11bd3 7GNU Emacs is free software: you can redistribute it and/or modify
b7b65b15 8it under the terms of the GNU General Public License as published by
a8d11bd3
EZ
9the Free Software Foundation, either version 3 of the License, or
10(at your option) any later version.
b7b65b15
EZ
11
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
b7b65b15 17You should have received a copy of the GNU General Public License
a8d11bd3 18along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
b7b65b15 19
2d6e4628
EZ
20/* Written by Eli Zaretskii <eliz@gnu.org>.
21
22 A sequential implementation of the Unicode Bidirectional algorithm,
cbb09f04 23 (UBA) as per UAX#9, a part of the Unicode Standard.
b7b65b15
EZ
24
25 Unlike the reference and most other implementations, this one is
940afb59
EZ
26 designed to be called once for every character in the buffer or
27 string.
b7b65b15 28
4b292a22 29 The main entry point is bidi_move_to_visually_next. Each time it
b7b65b15
EZ
30 is called, it finds the next character in the visual order, and
31 returns its information in a special structure. The caller is then
32 expected to process this character for display or any other
4b292a22
EZ
33 purposes, and call bidi_move_to_visually_next for the next
34 character. See the comments in bidi_move_to_visually_next for more
35 details about its algorithm that finds the next visual-order
b7b65b15
EZ
36 character by resolving their levels on the fly.
37
95e1afc6 38 Two other entry points are bidi_paragraph_init and
940afb59
EZ
39 bidi_mirror_char. The first determines the base direction of a
40 paragraph, while the second returns the mirrored version of its
41 argument character.
42
95e1afc6
EZ
43 A few auxiliary entry points are used to initialize the bidi
44 iterator for iterating an object (buffer or string), push and pop
45 the bidi iterator state, and save and restore the state of the bidi
46 cache.
47
89d3374a
EZ
48 If you want to understand the code, you will have to read it
49 together with the relevant portions of UAX#9. The comments include
50 references to UAX#9 rules, for that very reason.
51
b7b65b15
EZ
52 A note about references to UAX#9 rules: if the reference says
53 something like "X9/Retaining", it means that you need to refer to
cd1181db 54 rule X9 and to its modifications described in the "Implementation
b7b65b15
EZ
55 Notes" section of UAX#9, under "Retaining Format Codes". */
56
b7b65b15 57#include <config.h>
b7b65b15 58#include <stdio.h>
29e3d8d1 59
b7b65b15 60#include "lisp.h"
b7b65b15 61#include "character.h"
e5560ff7 62#include "buffer.h"
b7b65b15 63#include "dispextern.h"
e30b79c1 64#include "region-cache.h"
b7b65b15 65
2cc21167 66static bool bidi_initialized = 0;
b7b65b15 67
cbc4fd20 68static Lisp_Object bidi_type_table, bidi_mirror_table;
b7b65b15
EZ
69
70#define LRM_CHAR 0x200E
71#define RLM_CHAR 0x200F
b7b65b15 72#define BIDI_EOB -1
b7b65b15 73
b7b65b15
EZ
74/* Data type for describing the bidirectional character categories. */
75typedef enum {
76 UNKNOWN_BC,
77 NEUTRAL,
78 WEAK,
ad826124
EZ
79 STRONG,
80 EXPLICIT_FORMATTING
b7b65b15
EZ
81} bidi_category_t;
82
32413314
EZ
83/* UAX#9 says to search only for L, AL, or R types of characters, and
84 ignore RLE, RLO, LRE, and LRO, when determining the base paragraph
85 level. Yudit indeed ignores them. This variable is therefore set
2cc21167
PE
86 by default to ignore them, but clearing it will take them into
87 account. */
88extern bool bidi_ignore_explicit_marks_for_paragraph_level EXTERNALLY_VISIBLE;
89bool bidi_ignore_explicit_marks_for_paragraph_level = 1;
b7b65b15 90
372b7a95 91static Lisp_Object paragraph_start_re, paragraph_separate_re;
6bff6497 92static Lisp_Object Qparagraph_start, Qparagraph_separate;
b7b65b15 93
cbb09f04
EZ
94\f
95/***********************************************************************
96 Utilities
97 ***********************************************************************/
b7b65b15 98
6bff6497
EZ
99/* Return the bidi type of a character CH, subject to the current
100 directional OVERRIDE. */
0a99eee1 101static bidi_type_t
6bff6497 102bidi_get_type (int ch, bidi_dir_t override)
b7b65b15 103{
6bff6497
EZ
104 bidi_type_t default_type;
105
e7402cb2
EZ
106 if (ch == BIDI_EOB)
107 return NEUTRAL_B;
e342a24d 108 if (ch < 0 || ch > MAX_CHAR)
1088b922 109 emacs_abort ();
6bff6497
EZ
110
111 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
bca633fb
EZ
112 /* Every valid character code, even those that are unassigned by the
113 UCD, have some bidi-class property, according to
114 DerivedBidiClass.txt file. Therefore, if we ever get UNKNOWN_BT
115 (= zero) code from CHAR_TABLE_REF, that's a bug. */
116 if (default_type == UNKNOWN_BT)
1088b922 117 emacs_abort ();
6bff6497 118
6bff6497
EZ
119 switch (default_type)
120 {
ad826124 121 case WEAK_BN:
6bff6497
EZ
122 case NEUTRAL_B:
123 case LRE:
124 case LRO:
125 case RLE:
126 case RLO:
127 case PDF:
128 return default_type;
ad826124
EZ
129 /* FIXME: The isolate controls are treated as BN until we add
130 support for UBA v6.3. */
131 case LRI:
132 case RLI:
133 case FSI:
134 case PDI:
135 return WEAK_BN;
6bff6497 136 default:
ad826124
EZ
137 if (override == L2R)
138 return STRONG_L;
139 else if (override == R2L)
140 return STRONG_R;
141 else
142 return default_type;
6bff6497 143 }
b7b65b15
EZ
144}
145
0a99eee1 146static void
2d6e4628
EZ
147bidi_check_type (bidi_type_t type)
148{
a54e2c05 149 eassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
2d6e4628
EZ
150}
151
b7b65b15 152/* Given a bidi TYPE of a character, return its category. */
0a99eee1 153static bidi_category_t
b7b65b15
EZ
154bidi_get_category (bidi_type_t type)
155{
156 switch (type)
157 {
158 case UNKNOWN_BT:
159 return UNKNOWN_BC;
160 case STRONG_L:
161 case STRONG_R:
162 case STRONG_AL:
b7b65b15 163 return STRONG;
b7b65b15
EZ
164 case WEAK_EN:
165 case WEAK_ES:
166 case WEAK_ET:
167 case WEAK_AN:
168 case WEAK_CS:
169 case WEAK_NSM:
170 case WEAK_BN:
ad826124
EZ
171 /* FIXME */
172 case LRI:
173 case RLI:
174 case FSI:
175 case PDI:
b7b65b15
EZ
176 return WEAK;
177 case NEUTRAL_B:
178 case NEUTRAL_S:
179 case NEUTRAL_WS:
180 case NEUTRAL_ON:
181 return NEUTRAL;
ad826124
EZ
182 case LRE:
183 case LRO:
184 case RLE:
185 case RLO:
186 case PDF:
187#if 0
188 /* FIXME: This awaits implementation of isolate support. */
189 case LRI:
190 case RLI:
191 case FSI:
192 case PDI:
193#endif
194 return EXPLICIT_FORMATTING;
b7b65b15 195 default:
1088b922 196 emacs_abort ();
b7b65b15
EZ
197 }
198}
199
a6e8d97c
EZ
200/* Return the mirrored character of C, if it has one. If C has no
201 mirrored counterpart, return C.
202 Note: The conditions in UAX#9 clause L4 regarding the surrounding
203 context must be tested by the caller. */
b7b65b15
EZ
204int
205bidi_mirror_char (int c)
206{
cbc4fd20
EZ
207 Lisp_Object val;
208
209 if (c == BIDI_EOB)
210 return c;
211 if (c < 0 || c > MAX_CHAR)
1088b922 212 emacs_abort ();
b7b65b15 213
cbc4fd20
EZ
214 val = CHAR_TABLE_REF (bidi_mirror_table, c);
215 if (INTEGERP (val))
b7b65b15 216 {
da92a98c 217 int v;
9e1a06fc 218
da92a98c
PE
219 /* When debugging, check before assigning to V, so that the check
220 isn't broken by undefined behavior due to int overflow. */
81899c91 221 eassert (CHAR_VALID_P (XINT (val)));
9e1a06fc 222
da92a98c
PE
223 v = XINT (val);
224
9e1a06fc
EZ
225 /* Minimal test we must do in optimized builds, to prevent weird
226 crashes further down the road. */
227 if (v < 0 || v > MAX_CHAR)
1088b922 228 emacs_abort ();
9e1a06fc
EZ
229
230 return v;
b7b65b15 231 }
cbc4fd20 232
b7b65b15
EZ
233 return c;
234}
235
cbb09f04
EZ
236/* Determine the start-of-run (sor) directional type given the two
237 embedding levels on either side of the run boundary. Also, update
238 the saved info about previously seen characters, since that info is
239 generally valid for a single level run. */
0a99eee1 240static void
cbb09f04
EZ
241bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
242{
512a289d 243 int higher_level = (level_before > level_after ? level_before : level_after);
cbb09f04
EZ
244
245 /* The prev_was_pdf gork is required for when we have several PDFs
246 in a row. In that case, we want to compute the sor type for the
247 next level run only once: when we see the first PDF. That's
248 because the sor type depends only on the higher of the two levels
249 that we find on the two sides of the level boundary (see UAX#9,
250 clause X10), and so we don't need to know the final embedding
251 level to which we descend after processing all the PDFs. */
252 if (!bidi_it->prev_was_pdf || level_before < level_after)
253 /* FIXME: should the default sor direction be user selectable? */
512a289d 254 bidi_it->sor = ((higher_level & 1) != 0 ? R2L : L2R);
cbb09f04
EZ
255 if (level_before > level_after)
256 bidi_it->prev_was_pdf = 1;
257
258 bidi_it->prev.type = UNKNOWN_BT;
512a289d
RS
259 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1
260 = bidi_it->last_strong.orig_type = UNKNOWN_BT;
261 bidi_it->prev_for_neutral.type = (bidi_it->sor == R2L ? STRONG_R : STRONG_L);
cbb09f04
EZ
262 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
263 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
512a289d
RS
264 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1
265 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
cbb09f04
EZ
266 bidi_it->ignore_bn_limit = -1; /* meaning it's unknown */
267}
268
269/* Push the current embedding level and override status; reset the
270 current level to LEVEL and the current override status to OVERRIDE. */
0a99eee1 271static void
cbb09f04
EZ
272bidi_push_embedding_level (struct bidi_it *bidi_it,
273 int level, bidi_dir_t override)
274{
275 bidi_it->stack_idx++;
a54e2c05 276 eassert (bidi_it->stack_idx < BIDI_MAXLEVEL);
cbb09f04
EZ
277 bidi_it->level_stack[bidi_it->stack_idx].level = level;
278 bidi_it->level_stack[bidi_it->stack_idx].override = override;
279}
280
281/* Pop the embedding level and directional override status from the
282 stack, and return the new level. */
0a99eee1 283static int
cbb09f04
EZ
284bidi_pop_embedding_level (struct bidi_it *bidi_it)
285{
286 /* UAX#9 says to ignore invalid PDFs. */
287 if (bidi_it->stack_idx > 0)
288 bidi_it->stack_idx--;
289 return bidi_it->level_stack[bidi_it->stack_idx].level;
290}
291
292/* Record in SAVED_INFO the information about the current character. */
0a99eee1 293static void
cbb09f04
EZ
294bidi_remember_char (struct bidi_saved_info *saved_info,
295 struct bidi_it *bidi_it)
296{
297 saved_info->charpos = bidi_it->charpos;
298 saved_info->bytepos = bidi_it->bytepos;
299 saved_info->type = bidi_it->type;
300 bidi_check_type (bidi_it->type);
301 saved_info->type_after_w1 = bidi_it->type_after_w1;
302 bidi_check_type (bidi_it->type_after_w1);
303 saved_info->orig_type = bidi_it->orig_type;
304 bidi_check_type (bidi_it->orig_type);
305}
306
b7b65b15
EZ
307/* Copy the bidi iterator from FROM to TO. To save cycles, this only
308 copies the part of the level stack that is actually in use. */
0a99eee1 309static void
b7b65b15
EZ
310bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
311{
05584c31
PE
312 /* Copy everything from the start through the active part of
313 the level stack. */
314 memcpy (to, from,
315 (offsetof (struct bidi_it, level_stack[1])
316 + from->stack_idx * sizeof from->level_stack[0]));
b7b65b15
EZ
317}
318
cbb09f04
EZ
319\f
320/***********************************************************************
321 Caching the bidi iterator states
322 ***********************************************************************/
b7b65b15 323
2fe72643
EZ
324#define BIDI_CACHE_CHUNK 200
325static struct bidi_it *bidi_cache;
39e378da 326static ptrdiff_t bidi_cache_size = 0;
ad6042bb 327enum { elsz = sizeof (struct bidi_it) };
39e378da
PE
328static ptrdiff_t bidi_cache_idx; /* next unused cache slot */
329static ptrdiff_t bidi_cache_last_idx; /* slot of last cache hit */
330static ptrdiff_t bidi_cache_start = 0; /* start of cache for this
87e67904
EZ
331 "stack" level */
332
bc18e09d
PE
333/* 5-slot stack for saving the start of the previous level of the
334 cache. xdisp.c maintains a 5-slot stack for its iterator state,
335 and we need the same size of our stack. */
336static ptrdiff_t bidi_cache_start_stack[IT_STACK_SIZE];
337static int bidi_cache_sp;
338
339/* Size of header used by bidi_shelve_cache. */
340enum
341 {
512a289d
RS
342 bidi_shelve_header_size
343 = (sizeof (bidi_cache_idx) + sizeof (bidi_cache_start_stack)
344 + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start)
345 + sizeof (bidi_cache_last_idx))
bc18e09d
PE
346 };
347
87e67904
EZ
348/* Reset the cache state to the empty state. We only reset the part
349 of the cache relevant to iteration of the current object. Previous
350 objects, which are pushed on the display iterator's stack, are left
351 intact. This is called when the cached information is no more
352 useful for the current iteration, e.g. when we were reseated to a
353 new position on the same object. */
0a99eee1 354static void
b7b65b15
EZ
355bidi_cache_reset (void)
356{
87e67904 357 bidi_cache_idx = bidi_cache_start;
b7b65b15
EZ
358 bidi_cache_last_idx = -1;
359}
360
87e67904
EZ
361/* Shrink the cache to its minimal size. Called when we init the bidi
362 iterator for reordering a buffer or a string that does not come
363 from display properties, because that means all the previously
364 cached info is of no further use. */
0a99eee1 365static void
2fe72643
EZ
366bidi_cache_shrink (void)
367{
368 if (bidi_cache_size > BIDI_CACHE_CHUNK)
369 {
38182d90 370 bidi_cache = xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
51f30bc5 371 bidi_cache_size = BIDI_CACHE_CHUNK;
2fe72643
EZ
372 }
373 bidi_cache_reset ();
374}
375
0a99eee1 376static void
39e378da 377bidi_cache_fetch_state (ptrdiff_t idx, struct bidi_it *bidi_it)
b7b65b15
EZ
378{
379 int current_scan_dir = bidi_it->scan_dir;
380
87e67904 381 if (idx < bidi_cache_start || idx >= bidi_cache_idx)
1088b922 382 emacs_abort ();
b7b65b15
EZ
383
384 bidi_copy_it (bidi_it, &bidi_cache[idx]);
385 bidi_it->scan_dir = current_scan_dir;
386 bidi_cache_last_idx = idx;
387}
388
389/* Find a cached state with a given CHARPOS and resolved embedding
390 level less or equal to LEVEL. if LEVEL is -1, disregard the
391 resolved levels in cached states. DIR, if non-zero, means search
392 in that direction from the last cache hit. */
0a99eee1 393static ptrdiff_t
d311d28c 394bidi_cache_search (ptrdiff_t charpos, int level, int dir)
b7b65b15 395{
39e378da 396 ptrdiff_t i, i_start;
b7b65b15 397
6eec7596 398 if (bidi_cache_idx > bidi_cache_start)
b7b65b15 399 {
6eec7596
EZ
400 if (bidi_cache_last_idx == -1)
401 bidi_cache_last_idx = bidi_cache_idx - 1;
b7b65b15 402 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
182ce2d2
EZ
403 {
404 dir = -1;
405 i_start = bidi_cache_last_idx - 1;
406 }
407 else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
408 + bidi_cache[bidi_cache_last_idx].nchars - 1))
409 {
410 dir = 1;
411 i_start = bidi_cache_last_idx + 1;
412 }
413 else if (dir)
b7b65b15
EZ
414 i_start = bidi_cache_last_idx;
415 else
416 {
417 dir = -1;
418 i_start = bidi_cache_idx - 1;
419 }
420
421 if (dir < 0)
422 {
423 /* Linear search for now; FIXME! */
87e67904 424 for (i = i_start; i >= bidi_cache_start; i--)
182ce2d2
EZ
425 if (bidi_cache[i].charpos <= charpos
426 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
427 && (level == -1 || bidi_cache[i].resolved_level <= level))
428 return i;
429 }
430 else
431 {
432 for (i = i_start; i < bidi_cache_idx; i++)
182ce2d2
EZ
433 if (bidi_cache[i].charpos <= charpos
434 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
435 && (level == -1 || bidi_cache[i].resolved_level <= level))
436 return i;
437 }
438 }
439
440 return -1;
441}
442
443/* Find a cached state where the resolved level changes to a value
444 that is lower than LEVEL, and return its cache slot index. DIR is
445 the direction to search, starting with the last used cache slot.
87e67904 446 If DIR is zero, we search backwards from the last occupied cache
2cc21167 447 slot. BEFORE means return the index of the slot that
87e67904 448 is ``before'' the level change in the search direction. That is,
b7b65b15
EZ
449 given the cached levels like this:
450
451 1122333442211
452 AB C
453
454 and assuming we are at the position cached at the slot marked with
455 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
456 index of slot B or A, depending whether BEFORE is, respectively,
2cc21167 457 true or false. */
39e378da 458static ptrdiff_t
2cc21167 459bidi_cache_find_level_change (int level, int dir, bool before)
b7b65b15
EZ
460{
461 if (bidi_cache_idx)
462 {
39e378da 463 ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
b7b65b15
EZ
464 int incr = before ? 1 : 0;
465
a54e2c05 466 eassert (!dir || bidi_cache_last_idx >= 0);
6eec7596 467
b7b65b15
EZ
468 if (!dir)
469 dir = -1;
470 else if (!incr)
471 i += dir;
472
473 if (dir < 0)
474 {
87e67904 475 while (i >= bidi_cache_start + incr)
b7b65b15
EZ
476 {
477 if (bidi_cache[i - incr].resolved_level >= 0
478 && bidi_cache[i - incr].resolved_level < level)
479 return i;
480 i--;
481 }
482 }
483 else
484 {
485 while (i < bidi_cache_idx - incr)
486 {
487 if (bidi_cache[i + incr].resolved_level >= 0
488 && bidi_cache[i + incr].resolved_level < level)
489 return i;
490 i++;
491 }
492 }
493 }
494
495 return -1;
496}
497
0a99eee1 498static void
39e378da 499bidi_cache_ensure_space (ptrdiff_t idx)
58b9f433
EZ
500{
501 /* Enlarge the cache as needed. */
502 if (idx >= bidi_cache_size)
503 {
f0eb61e9
PE
504 /* The bidi cache cannot be larger than the largest Lisp string
505 or buffer. */
512a289d
RS
506 ptrdiff_t string_or_buffer_bound
507 = max (BUF_BYTES_MAX, STRING_BYTES_BOUND);
f0eb61e9
PE
508
509 /* Also, it cannot be larger than what C can represent. */
512a289d
RS
510 ptrdiff_t c_bound
511 = (min (PTRDIFF_MAX, SIZE_MAX) - bidi_shelve_header_size) / elsz;
f0eb61e9 512
512a289d
RS
513 bidi_cache
514 = xpalloc (bidi_cache, &bidi_cache_size,
515 max (BIDI_CACHE_CHUNK, idx - bidi_cache_size + 1),
516 min (string_or_buffer_bound, c_bound), elsz);
58b9f433
EZ
517 }
518}
519
0a99eee1 520static void
2cc21167 521bidi_cache_iterator_state (struct bidi_it *bidi_it, bool resolved)
b7b65b15 522{
39e378da 523 ptrdiff_t idx;
b7b65b15
EZ
524
525 /* We should never cache on backward scans. */
526 if (bidi_it->scan_dir == -1)
1088b922 527 emacs_abort ();
b7b65b15
EZ
528 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
529
530 if (idx < 0)
531 {
532 idx = bidi_cache_idx;
58b9f433 533 bidi_cache_ensure_space (idx);
bd924a5d
EZ
534 /* Character positions should correspond to cache positions 1:1.
535 If we are outside the range of cached positions, the cache is
536 useless and must be reset. */
87e67904 537 if (idx > bidi_cache_start &&
182ce2d2
EZ
538 (bidi_it->charpos > (bidi_cache[idx - 1].charpos
539 + bidi_cache[idx - 1].nchars)
a1344e7d 540 || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
bd924a5d
EZ
541 {
542 bidi_cache_reset ();
87e67904 543 idx = bidi_cache_start;
bd924a5d 544 }
102ebb00 545 if (bidi_it->nchars <= 0)
1088b922 546 emacs_abort ();
b7b65b15
EZ
547 bidi_copy_it (&bidi_cache[idx], bidi_it);
548 if (!resolved)
549 bidi_cache[idx].resolved_level = -1;
550 }
551 else
552 {
553 /* Copy only the members which could have changed, to avoid
554 costly copying of the entire struct. */
555 bidi_cache[idx].type = bidi_it->type;
2d6e4628 556 bidi_check_type (bidi_it->type);
89d3374a
EZ
557 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
558 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
559 if (resolved)
560 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
561 else
562 bidi_cache[idx].resolved_level = -1;
563 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
564 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
565 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
566 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
567 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
f67cdd7f 568 bidi_cache[idx].disp_pos = bidi_it->disp_pos;
0c95fcf7 569 bidi_cache[idx].disp_prop = bidi_it->disp_prop;
b7b65b15
EZ
570 }
571
572 bidi_cache_last_idx = idx;
573 if (idx >= bidi_cache_idx)
574 bidi_cache_idx = idx + 1;
575}
576
0a99eee1 577static bidi_type_t
d311d28c 578bidi_cache_find (ptrdiff_t charpos, int level, struct bidi_it *bidi_it)
b7b65b15 579{
39e378da 580 ptrdiff_t i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
b7b65b15 581
87e67904 582 if (i >= bidi_cache_start)
b7b65b15
EZ
583 {
584 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
585
5009f85e 586 bidi_copy_it (bidi_it, &bidi_cache[i]);
b7b65b15 587 bidi_cache_last_idx = i;
5a5fa834 588 /* Don't let scan direction from the cached state override
b7b65b15
EZ
589 the current scan direction. */
590 bidi_it->scan_dir = current_scan_dir;
591 return bidi_it->type;
592 }
593
594 return UNKNOWN_BT;
595}
596
0a99eee1 597static int
b7b65b15
EZ
598bidi_peek_at_next_level (struct bidi_it *bidi_it)
599{
87e67904 600 if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
1088b922 601 emacs_abort ();
b7b65b15
EZ
602 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
603}
604
cbb09f04 605\f
58b9f433
EZ
606/***********************************************************************
607 Pushing and popping the bidi iterator state
608 ***********************************************************************/
58b9f433
EZ
609
610/* Push the bidi iterator state in preparation for reordering a
611 different object, e.g. display string found at certain buffer
7e2ad32c
EZ
612 position. Pushing the bidi iterator boils down to saving its
613 entire state on the cache and starting a new cache "stacked" on top
614 of the current cache. */
58b9f433
EZ
615void
616bidi_push_it (struct bidi_it *bidi_it)
b7b65b15 617{
58b9f433
EZ
618 /* Save the current iterator state in its entirety after the last
619 used cache slot. */
620 bidi_cache_ensure_space (bidi_cache_idx);
ae1d87e2 621 bidi_cache[bidi_cache_idx++] = *bidi_it;
be39f003 622
58b9f433 623 /* Push the current cache start onto the stack. */
a54e2c05 624 eassert (bidi_cache_sp < IT_STACK_SIZE);
58b9f433 625 bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
be39f003 626
58b9f433
EZ
627 /* Start a new level of cache, and make it empty. */
628 bidi_cache_start = bidi_cache_idx;
629 bidi_cache_last_idx = -1;
630}
631
632/* Restore the iterator state saved by bidi_push_it and return the
633 cache to the corresponding state. */
634void
635bidi_pop_it (struct bidi_it *bidi_it)
636{
637 if (bidi_cache_start <= 0)
1088b922 638 emacs_abort ();
58b9f433
EZ
639
640 /* Reset the next free cache slot index to what it was before the
641 call to bidi_push_it. */
642 bidi_cache_idx = bidi_cache_start - 1;
643
644 /* Restore the bidi iterator state saved in the cache. */
ae1d87e2 645 *bidi_it = bidi_cache[bidi_cache_idx];
58b9f433
EZ
646
647 /* Pop the previous cache start from the stack. */
648 if (bidi_cache_sp <= 0)
1088b922 649 emacs_abort ();
58b9f433
EZ
650 bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
651
652 /* Invalidate the last-used cache slot data. */
653 bidi_cache_last_idx = -1;
654}
655
ec7cc85b 656static ptrdiff_t bidi_cache_total_alloc;
35928349 657
ed94e6d7
EZ
658/* Stash away a copy of the cache and its control variables. */
659void *
660bidi_shelve_cache (void)
661{
662 unsigned char *databuf;
458bfed3 663 ptrdiff_t alloc;
ed94e6d7 664
35928349 665 /* Empty cache. */
ed94e6d7
EZ
666 if (bidi_cache_idx == 0)
667 return NULL;
668
458bfed3
PE
669 alloc = (bidi_shelve_header_size
670 + bidi_cache_idx * sizeof (struct bidi_it));
671 databuf = xmalloc (alloc);
672 bidi_cache_total_alloc += alloc;
35928349 673
ed94e6d7
EZ
674 memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx));
675 memcpy (databuf + sizeof (bidi_cache_idx),
676 bidi_cache, bidi_cache_idx * sizeof (struct bidi_it));
677 memcpy (databuf + sizeof (bidi_cache_idx)
678 + bidi_cache_idx * sizeof (struct bidi_it),
679 bidi_cache_start_stack, sizeof (bidi_cache_start_stack));
680 memcpy (databuf + sizeof (bidi_cache_idx)
681 + bidi_cache_idx * sizeof (struct bidi_it)
682 + sizeof (bidi_cache_start_stack),
683 &bidi_cache_sp, sizeof (bidi_cache_sp));
684 memcpy (databuf + sizeof (bidi_cache_idx)
685 + bidi_cache_idx * sizeof (struct bidi_it)
686 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
687 &bidi_cache_start, sizeof (bidi_cache_start));
688 memcpy (databuf + sizeof (bidi_cache_idx)
689 + bidi_cache_idx * sizeof (struct bidi_it)
690 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
691 + sizeof (bidi_cache_start),
692 &bidi_cache_last_idx, sizeof (bidi_cache_last_idx));
693
694 return databuf;
695}
696
d1410150
EZ
697/* Restore the cache state from a copy stashed away by
698 bidi_shelve_cache, and free the buffer used to stash that copy.
2cc21167 699 JUST_FREE means free the buffer, but don't restore the
d1410150
EZ
700 cache; used when the corresponding iterator is discarded instead of
701 being restored. */
ed94e6d7 702void
2cc21167 703bidi_unshelve_cache (void *databuf, bool just_free)
ed94e6d7
EZ
704{
705 unsigned char *p = databuf;
706
707 if (!p)
be39f003 708 {
d1410150
EZ
709 if (!just_free)
710 {
711 /* A NULL pointer means an empty cache. */
712 bidi_cache_start = 0;
713 bidi_cache_sp = 0;
714 bidi_cache_reset ();
715 }
be39f003 716 }
ed94e6d7
EZ
717 else
718 {
35928349
EZ
719 if (just_free)
720 {
721 ptrdiff_t idx;
722
723 memcpy (&idx, p, sizeof (bidi_cache_idx));
512a289d
RS
724 bidi_cache_total_alloc
725 -= bidi_shelve_header_size + idx * sizeof (struct bidi_it);
35928349
EZ
726 }
727 else
728 {
729 memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx));
730 bidi_cache_ensure_space (bidi_cache_idx);
731 memcpy (bidi_cache, p + sizeof (bidi_cache_idx),
732 bidi_cache_idx * sizeof (struct bidi_it));
733 memcpy (bidi_cache_start_stack,
734 p + sizeof (bidi_cache_idx)
735 + bidi_cache_idx * sizeof (struct bidi_it),
736 sizeof (bidi_cache_start_stack));
737 memcpy (&bidi_cache_sp,
738 p + sizeof (bidi_cache_idx)
739 + bidi_cache_idx * sizeof (struct bidi_it)
740 + sizeof (bidi_cache_start_stack),
741 sizeof (bidi_cache_sp));
742 memcpy (&bidi_cache_start,
743 p + sizeof (bidi_cache_idx)
744 + bidi_cache_idx * sizeof (struct bidi_it)
745 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
746 sizeof (bidi_cache_start));
747 memcpy (&bidi_cache_last_idx,
748 p + sizeof (bidi_cache_idx)
749 + bidi_cache_idx * sizeof (struct bidi_it)
750 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
751 + sizeof (bidi_cache_start),
752 sizeof (bidi_cache_last_idx));
512a289d
RS
753 bidi_cache_total_alloc
754 -= (bidi_shelve_header_size
755 + bidi_cache_idx * sizeof (struct bidi_it));
35928349 756 }
ed94e6d7
EZ
757
758 xfree (p);
759 }
760}
b7b65b15 761
58b9f433 762\f
cbb09f04
EZ
763/***********************************************************************
764 Initialization
765 ***********************************************************************/
766static void
767bidi_initialize (void)
b7b65b15 768{
474a8465
EZ
769 bidi_type_table = uniprop_table (intern ("bidi-class"));
770 if (NILP (bidi_type_table))
1088b922 771 emacs_abort ();
cbb09f04
EZ
772 staticpro (&bidi_type_table);
773
474a8465
EZ
774 bidi_mirror_table = uniprop_table (intern ("mirroring"));
775 if (NILP (bidi_mirror_table))
1088b922 776 emacs_abort ();
cbb09f04
EZ
777 staticpro (&bidi_mirror_table);
778
cbb09f04
EZ
779 Qparagraph_start = intern ("paragraph-start");
780 staticpro (&Qparagraph_start);
781 paragraph_start_re = Fsymbol_value (Qparagraph_start);
782 if (!STRINGP (paragraph_start_re))
783 paragraph_start_re = build_string ("\f\\|[ \t]*$");
784 staticpro (&paragraph_start_re);
785 Qparagraph_separate = intern ("paragraph-separate");
786 staticpro (&Qparagraph_separate);
787 paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
788 if (!STRINGP (paragraph_separate_re))
789 paragraph_separate_re = build_string ("[ \t\f]*$");
790 staticpro (&paragraph_separate_re);
58b9f433
EZ
791
792 bidi_cache_sp = 0;
ec7cc85b 793 bidi_cache_total_alloc = 0;
58b9f433 794
cbb09f04 795 bidi_initialized = 1;
b7b65b15
EZ
796}
797
cbb09f04
EZ
798/* Do whatever UAX#9 clause X8 says should be done at paragraph's
799 end. */
0a99eee1 800static void
cbb09f04 801bidi_set_paragraph_end (struct bidi_it *bidi_it)
b7b65b15 802{
cbb09f04
EZ
803 bidi_it->invalid_levels = 0;
804 bidi_it->invalid_rl_levels = -1;
805 bidi_it->stack_idx = 0;
806 bidi_it->resolved_level = bidi_it->level_stack[0].level;
807}
b7b65b15 808
cbb09f04
EZ
809/* Initialize the bidi iterator from buffer/string position CHARPOS. */
810void
2cc21167 811bidi_init_it (ptrdiff_t charpos, ptrdiff_t bytepos, bool frame_window_p,
cbb09f04
EZ
812 struct bidi_it *bidi_it)
813{
814 if (! bidi_initialized)
815 bidi_initialize ();
816 if (charpos >= 0)
817 bidi_it->charpos = charpos;
818 if (bytepos >= 0)
819 bidi_it->bytepos = bytepos;
820 bidi_it->frame_window_p = frame_window_p;
821 bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */
822 bidi_it->first_elt = 1;
823 bidi_set_paragraph_end (bidi_it);
824 bidi_it->new_paragraph = 1;
825 bidi_it->separator_limit = -1;
826 bidi_it->type = NEUTRAL_B;
827 bidi_it->type_after_w1 = NEUTRAL_B;
828 bidi_it->orig_type = NEUTRAL_B;
829 bidi_it->prev_was_pdf = 0;
512a289d
RS
830 bidi_it->prev.type = bidi_it->prev.type_after_w1
831 = bidi_it->prev.orig_type = UNKNOWN_BT;
832 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1
833 = bidi_it->last_strong.orig_type = UNKNOWN_BT;
cbb09f04 834 bidi_it->next_for_neutral.charpos = -1;
512a289d
RS
835 bidi_it->next_for_neutral.type
836 = bidi_it->next_for_neutral.type_after_w1
837 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
cbb09f04 838 bidi_it->prev_for_neutral.charpos = -1;
512a289d
RS
839 bidi_it->prev_for_neutral.type
840 = bidi_it->prev_for_neutral.type_after_w1
841 = bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
cbb09f04
EZ
842 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
843 bidi_it->disp_pos = -1; /* invalid/unknown */
0c95fcf7 844 bidi_it->disp_prop = 0;
cbb09f04
EZ
845 /* We can only shrink the cache if we are at the bottom level of its
846 "stack". */
847 if (bidi_cache_start == 0)
848 bidi_cache_shrink ();
58b9f433
EZ
849 else
850 bidi_cache_reset ();
b7b65b15
EZ
851}
852
182ce2d2 853/* Perform initializations for reordering a new line of bidi text. */
6bff6497 854static void
be39f003
EZ
855bidi_line_init (struct bidi_it *bidi_it)
856{
857 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
858 bidi_it->resolved_level = bidi_it->level_stack[0].level;
859 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
860 bidi_it->invalid_levels = 0;
861 bidi_it->invalid_rl_levels = -1;
4787455f
EZ
862 /* Setting this to zero will force its recomputation the first time
863 we need it for W5. */
864 bidi_it->next_en_pos = 0;
7b5d6677 865 bidi_it->next_en_type = UNKNOWN_BT;
be39f003 866 bidi_it->next_for_ws.type = UNKNOWN_BT;
b44d9321 867 bidi_set_sor_type (bidi_it,
512a289d 868 (bidi_it->paragraph_dir == R2L ? 1 : 0),
be39f003
EZ
869 bidi_it->level_stack[0].level); /* X10 */
870
871 bidi_cache_reset ();
872}
873
cbb09f04
EZ
874\f
875/***********************************************************************
876 Fetching characters
877 ***********************************************************************/
878
f3014ef5
EZ
879/* Count bytes in string S between BEG/BEGBYTE and END. BEG and END
880 are zero-based character positions in S, BEGBYTE is byte position
2cc21167 881 corresponding to BEG. UNIBYTE means S is a unibyte string. */
0a99eee1 882static ptrdiff_t
b7ad2f74
PE
883bidi_count_bytes (const unsigned char *s, ptrdiff_t beg,
884 ptrdiff_t begbyte, ptrdiff_t end, bool unibyte)
87e67904 885{
d311d28c 886 ptrdiff_t pos = beg;
87e67904
EZ
887 const unsigned char *p = s + begbyte, *start = p;
888
f3014ef5
EZ
889 if (unibyte)
890 p = s + end;
891 else
87e67904 892 {
f3014ef5 893 if (!CHAR_HEAD_P (*p))
1088b922 894 emacs_abort ();
f3014ef5
EZ
895
896 while (pos < end)
897 {
898 p += BYTES_BY_CHAR_HEAD (*p);
899 pos++;
900 }
87e67904
EZ
901 }
902
903 return p - start;
904}
905
05584c31 906/* Fetch and return the character at byte position BYTEPOS. If S is
87e67904 907 non-NULL, fetch the character from string S; otherwise fetch the
2cc21167 908 character from the current buffer. UNIBYTE means S is a
f3014ef5 909 unibyte string. */
0a99eee1 910static int
2cc21167 911bidi_char_at_pos (ptrdiff_t bytepos, const unsigned char *s, bool unibyte)
87e67904
EZ
912{
913 if (s)
f3014ef5 914 {
05584c31 915 s += bytepos;
f3014ef5 916 if (unibyte)
05584c31 917 return *s;
f3014ef5 918 }
87e67904 919 else
05584c31
PE
920 s = BYTE_POS_ADDR (bytepos);
921 return STRING_CHAR (s);
87e67904
EZ
922}
923
36075a19 924/* Fetch and return the character at CHARPOS/BYTEPOS. If that
102ebb00 925 character is covered by a display string, treat the entire run of
0c95fcf7
EZ
926 covered characters as a single character, either u+2029 or u+FFFC,
927 and return their combined length in CH_LEN and NCHARS. DISP_POS
928 specifies the character position of the next display string, or -1
b75258b3
EZ
929 if not yet computed. When the next character is at or beyond that
930 position, the function updates DISP_POS with the position of the
2cc21167 931 next display string. *DISP_PROP non-zero means that there's really
0c95fcf7 932 a display string at DISP_POS, as opposed to when we searched till
2cc21167 933 DISP_POS without finding one. If *DISP_PROP is 2, it means the
0c95fcf7 934 display spec is of the form `(space ...)', which is replaced with
b75258b3
EZ
935 u+2029 to handle it as a paragraph separator. STRING->s is the C
936 string to iterate, or NULL if iterating over a buffer or a Lisp
937 string; in the latter case, STRING->lstring is the Lisp string. */
0a99eee1 938static int
36075a19 939bidi_fetch_char (ptrdiff_t charpos, ptrdiff_t bytepos, ptrdiff_t *disp_pos,
0c95fcf7 940 int *disp_prop, struct bidi_string_data *string,
5bf97bfc 941 struct window *w,
2cc21167 942 bool frame_window_p, ptrdiff_t *ch_len, ptrdiff_t *nchars)
182ce2d2
EZ
943{
944 int ch;
5895d7b9 945 ptrdiff_t endpos
512a289d 946 = (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
6db161be 947 struct text_pos pos;
e99a9b8b 948 int len;
182ce2d2 949
182ce2d2 950 /* If we got past the last known position of display string, compute
87e67904
EZ
951 the position of the next one. That position could be at CHARPOS. */
952 if (charpos < endpos && charpos > *disp_pos)
6db161be
EZ
953 {
954 SET_TEXT_POS (pos, charpos, bytepos);
5bf97bfc 955 *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
0c95fcf7 956 disp_prop);
6db161be 957 }
102ebb00
EZ
958
959 /* Fetch the character at BYTEPOS. */
87e67904 960 if (charpos >= endpos)
182ce2d2
EZ
961 {
962 ch = BIDI_EOB;
963 *ch_len = 1;
964 *nchars = 1;
87e67904 965 *disp_pos = endpos;
0c95fcf7 966 *disp_prop = 0;
182ce2d2 967 }
0c95fcf7 968 else if (charpos >= *disp_pos && *disp_prop)
182ce2d2 969 {
d311d28c 970 ptrdiff_t disp_end_pos;
7b600102
EZ
971
972 /* We don't expect to find ourselves in the middle of a display
973 property. Hopefully, it will never be needed. */
974 if (charpos > *disp_pos)
1088b922 975 emacs_abort ();
0c95fcf7
EZ
976 /* Text covered by `display' properties and overlays with
977 display properties or display strings is handled as a single
978 character that represents the entire run of characters
979 covered by the display property. */
980 if (*disp_prop == 2)
981 {
982 /* `(space ...)' display specs are handled as paragraph
983 separators for the purposes of the reordering; see UAX#9
984 section 3 and clause HL1 in section 4.3 there. */
985 ch = 0x2029;
986 }
987 else
988 {
989 /* All other display specs are handled as the Unicode Object
990 Replacement Character. */
991 ch = 0xFFFC;
992 }
87e67904 993 disp_end_pos = compute_display_string_end (*disp_pos, string);
fbcaa2f3
EZ
994 if (disp_end_pos < 0)
995 {
996 /* Somebody removed the display string from the buffer
997 behind our back. Recover by processing this buffer
998 position as if no display property were present there to
999 begin with. */
1000 *disp_prop = 0;
1001 goto normal_char;
1002 }
7b600102 1003 *nchars = disp_end_pos - *disp_pos;
f3014ef5 1004 if (*nchars <= 0)
1088b922 1005 emacs_abort ();
87e67904
EZ
1006 if (string->s)
1007 *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
f3014ef5 1008 disp_end_pos, string->unibyte);
9f257352
EZ
1009 else if (STRINGP (string->lstring))
1010 *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
f3014ef5 1011 bytepos, disp_end_pos, string->unibyte);
87e67904
EZ
1012 else
1013 *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
182ce2d2 1014 }
182ce2d2
EZ
1015 else
1016 {
fbcaa2f3 1017 normal_char:
87e67904
EZ
1018 if (string->s)
1019 {
87e67904 1020
f3014ef5
EZ
1021 if (!string->unibyte)
1022 {
1023 ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
1024 *ch_len = len;
1025 }
1026 else
1027 {
1028 ch = UNIBYTE_TO_CHAR (string->s[bytepos]);
1029 *ch_len = 1;
1030 }
87e67904 1031 }
9f257352
EZ
1032 else if (STRINGP (string->lstring))
1033 {
f3014ef5
EZ
1034 if (!string->unibyte)
1035 {
1036 ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
1037 len);
1038 *ch_len = len;
1039 }
1040 else
1041 {
1042 ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos));
1043 *ch_len = 1;
1044 }
9f257352 1045 }
87e67904
EZ
1046 else
1047 {
e99a9b8b
EZ
1048 ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (bytepos), len);
1049 *ch_len = len;
87e67904 1050 }
182ce2d2
EZ
1051 *nchars = 1;
1052 }
1053
1054 /* If we just entered a run of characters covered by a display
1055 string, compute the position of the next display string. */
55439c61 1056 if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos
0c95fcf7 1057 && *disp_prop)
6db161be
EZ
1058 {
1059 SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
5bf97bfc 1060 *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
0c95fcf7 1061 disp_prop);
6db161be 1062 }
182ce2d2
EZ
1063
1064 return ch;
1065}
1066
cbb09f04
EZ
1067\f
1068/***********************************************************************
1069 Determining paragraph direction
1070 ***********************************************************************/
1071
1072/* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
1073 Value is the non-negative length of the paragraph separator
1074 following the buffer position, -1 if position is at the beginning
1075 of a new paragraph, or -2 if position is neither at beginning nor
1076 at end of a paragraph. */
d311d28c
PE
1077static ptrdiff_t
1078bidi_at_paragraph_end (ptrdiff_t charpos, ptrdiff_t bytepos)
cbb09f04
EZ
1079{
1080 Lisp_Object sep_re;
1081 Lisp_Object start_re;
d311d28c 1082 ptrdiff_t val;
cbb09f04
EZ
1083
1084 sep_re = paragraph_separate_re;
1085 start_re = paragraph_start_re;
1086
1087 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
1088 if (val < 0)
1089 {
1090 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
1091 val = -1;
1092 else
1093 val = -2;
1094 }
1095
1096 return val;
1097}
1098
e30b79c1
DA
1099/* If the user has requested the long scans caching, make sure that
1100 BIDI cache is enabled. Otherwise, make sure it's disabled. */
1101
1102static struct region_cache *
1103bidi_paragraph_cache_on_off (void)
1104{
1105 if (NILP (BVAR (current_buffer, cache_long_scans)))
1106 {
1107 if (current_buffer->bidi_paragraph_cache)
1108 {
1109 free_region_cache (current_buffer->bidi_paragraph_cache);
1110 current_buffer->bidi_paragraph_cache = 0;
1111 }
1112 return NULL;
1113 }
1114 else
1115 {
1116 if (!current_buffer->bidi_paragraph_cache)
1117 current_buffer->bidi_paragraph_cache = new_region_cache ();
1118 return current_buffer->bidi_paragraph_cache;
1119 }
1120}
1121
1137e8b8
EZ
1122/* On my 2005-vintage machine, searching back for paragraph start
1123 takes ~1 ms per line. And bidi_paragraph_init is called 4 times
1124 when user types C-p. The number below limits each call to
1125 bidi_paragraph_init to about 10 ms. */
1126#define MAX_PARAGRAPH_SEARCH 7500
1127
be39f003 1128/* Find the beginning of this paragraph by looking back in the buffer.
1137e8b8
EZ
1129 Value is the byte position of the paragraph's beginning, or
1130 BEGV_BYTE if paragraph_start_re is still not found after looking
1131 back MAX_PARAGRAPH_SEARCH lines in the buffer. */
d311d28c
PE
1132static ptrdiff_t
1133bidi_find_paragraph_start (ptrdiff_t pos, ptrdiff_t pos_byte)
6bff6497 1134{
372b7a95 1135 Lisp_Object re = paragraph_start_re;
d311d28c 1136 ptrdiff_t limit = ZV, limit_byte = ZV_BYTE;
e30b79c1
DA
1137 struct region_cache *bpc = bidi_paragraph_cache_on_off ();
1138 ptrdiff_t n = 0, oldpos = pos, next;
6bff6497 1139
6bff6497 1140 while (pos_byte > BEGV_BYTE
1137e8b8 1141 && n++ < MAX_PARAGRAPH_SEARCH
6bff6497 1142 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
b5426561
DA
1143 {
1144 /* FIXME: What if the paragraph beginning is covered by a
1145 display string? And what if a display string covering some
1146 of the text over which we scan back includes
1147 paragraph_start_re? */
1148 DEC_BOTH (pos, pos_byte);
e30b79c1
DA
1149 if (bpc && region_cache_backward (current_buffer, bpc, pos, &next))
1150 {
1151 pos = next, pos_byte = CHAR_TO_BYTE (pos);
1152 break;
1153 }
1154 else
1155 pos = find_newline_no_quit (pos, pos_byte, -1, &pos_byte);
b5426561 1156 }
1137e8b8 1157 if (n >= MAX_PARAGRAPH_SEARCH)
e30b79c1
DA
1158 pos = BEGV, pos_byte = BEGV_BYTE;
1159 if (bpc)
1160 know_region_cache (current_buffer, bpc, pos, oldpos);
cd88d682
EZ
1161 /* Positions returned by the region cache are not limited to
1162 BEGV..ZV range, so we limit them here. */
1163 pos_byte = clip_to_bounds (BEGV_BYTE, pos_byte, ZV_BYTE);
be39f003 1164 return pos_byte;
6bff6497
EZ
1165}
1166
ce811ad9
EZ
1167/* On a 3.4 GHz machine, searching forward for a strong directional
1168 character in a long paragraph full of weaks or neutrals takes about
1169 1 ms for each 20K characters. The number below limits each call to
1170 bidi_paragraph_init to less than 10 ms even on slow machines. */
1171#define MAX_STRONG_CHAR_SEARCH 100000
1172
bea4f10c 1173/* Determine the base direction, a.k.a. base embedding level, of the
b44d9321
EZ
1174 paragraph we are about to iterate through. If DIR is either L2R or
1175 R2L, just use that. Otherwise, determine the paragraph direction
bea4f10c
EZ
1176 from the first strong directional character of the paragraph.
1177
2cc21167 1178 NO_DEFAULT_P means don't default to L2R if the paragraph
bea4f10c
EZ
1179 has no strong directional characters and both DIR and
1180 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
1181 in the buffer until a paragraph is found with a strong character,
1182 or until hitting BEGV. In the latter case, fall back to L2R. This
1183 flag is used in current-bidi-paragraph-direction.
1184
1185 Note that this function gives the paragraph separator the same
1186 direction as the preceding paragraph, even though Emacs generally
9858f6c3 1187 views the separator as not belonging to any paragraph. */
b7b65b15 1188void
2cc21167 1189bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, bool no_default_p)
b7b65b15 1190{
d311d28c 1191 ptrdiff_t bytepos = bidi_it->bytepos;
2cc21167 1192 bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
d311d28c 1193 ptrdiff_t pstartbyte;
87e67904
EZ
1194 /* Note that begbyte is a byte position, while end is a character
1195 position. Yes, this is ugly, but we are trying to avoid costly
1196 calls to BYTE_TO_CHAR and its ilk. */
d311d28c
PE
1197 ptrdiff_t begbyte = string_p ? 0 : BEGV_BYTE;
1198 ptrdiff_t end = string_p ? bidi_it->string.schars : ZV;
e7402cb2 1199
5e65aec0 1200 /* Special case for an empty buffer. */
87e67904 1201 if (bytepos == begbyte && bidi_it->charpos == end)
5e65aec0 1202 dir = L2R;
9c82e145 1203 /* We should never be called at EOB or before BEGV. */
87e67904 1204 else if (bidi_it->charpos >= end || bytepos < begbyte)
1088b922 1205 emacs_abort ();
9c82e145 1206
be39f003
EZ
1207 if (dir == L2R)
1208 {
1209 bidi_it->paragraph_dir = L2R;
1210 bidi_it->new_paragraph = 0;
1211 }
1212 else if (dir == R2L)
1213 {
1214 bidi_it->paragraph_dir = R2L;
1215 bidi_it->new_paragraph = 0;
1216 }
b7b65b15
EZ
1217 else if (dir == NEUTRAL_DIR) /* P2 */
1218 {
182ce2d2 1219 int ch;
d311d28c
PE
1220 ptrdiff_t ch_len, nchars;
1221 ptrdiff_t pos, disp_pos = -1;
0c95fcf7 1222 int disp_prop = 0;
6bff6497 1223 bidi_type_t type;
9f257352 1224 const unsigned char *s;
be39f003 1225
d20e1419
EZ
1226 if (!bidi_initialized)
1227 bidi_initialize ();
1228
be39f003
EZ
1229 /* If we are inside a paragraph separator, we are just waiting
1230 for the separator to be exhausted; use the previous paragraph
e5a2fec7
EZ
1231 direction. But don't do that if we have been just reseated,
1232 because we need to reinitialize below in that case. */
1233 if (!bidi_it->first_elt
1234 && bidi_it->charpos < bidi_it->separator_limit)
be39f003
EZ
1235 return;
1236
b44d9321 1237 /* If we are on a newline, get past it to where the next
5e65aec0
EZ
1238 paragraph might start. But don't do that at BEGV since then
1239 we are potentially in a new paragraph that doesn't yet
1240 exist. */
c143c213 1241 pos = bidi_it->charpos;
512a289d
RS
1242 s = (STRINGP (bidi_it->string.lstring)
1243 ? SDATA (bidi_it->string.lstring)
1244 : bidi_it->string.s);
f3014ef5
EZ
1245 if (bytepos > begbyte
1246 && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n')
be39f003 1247 {
b44d9321 1248 bytepos++;
c143c213 1249 pos++;
be39f003 1250 }
b44d9321
EZ
1251
1252 /* We are either at the beginning of a paragraph or in the
1253 middle of it. Find where this paragraph starts. */
87e67904
EZ
1254 if (string_p)
1255 {
1256 /* We don't support changes of paragraph direction inside a
1257 string. It is treated as a single paragraph. */
1258 pstartbyte = 0;
1259 }
1260 else
1261 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
be39f003
EZ
1262 bidi_it->separator_limit = -1;
1263 bidi_it->new_paragraph = 0;
bea4f10c 1264
2cc21167
PE
1265 /* The following loop is run more than once only if NO_DEFAULT_P,
1266 and only if we are iterating on a buffer. */
bea4f10c 1267 do {
ce811ad9
EZ
1268 ptrdiff_t pos1;
1269
bea4f10c 1270 bytepos = pstartbyte;
87e67904
EZ
1271 if (!string_p)
1272 pos = BYTE_TO_CHAR (bytepos);
36075a19 1273 ch = bidi_fetch_char (pos, bytepos, &disp_pos, &disp_prop,
5bf97bfc 1274 &bidi_it->string, bidi_it->w,
87e67904 1275 bidi_it->frame_window_p, &ch_len, &nchars);
bea4f10c
EZ
1276 type = bidi_get_type (ch, NEUTRAL_DIR);
1277
ce811ad9 1278 pos1 = pos;
182ce2d2 1279 for (pos += nchars, bytepos += ch_len;
ce811ad9
EZ
1280 ((bidi_get_category (type) != STRONG)
1281 || (bidi_ignore_explicit_marks_for_paragraph_level
1282 && (type == RLE || type == RLO
1283 || type == LRE || type == LRO)))
1284 /* Stop when searched too far into an abnormally large
1285 paragraph full of weak or neutral characters. */
1286 && pos - pos1 < MAX_STRONG_CHAR_SEARCH;
bea4f10c
EZ
1287 type = bidi_get_type (ch, NEUTRAL_DIR))
1288 {
87e67904 1289 if (pos >= end)
bea4f10c
EZ
1290 {
1291 /* Pretend there's a paragraph separator at end of
87e67904 1292 buffer/string. */
bea4f10c
EZ
1293 type = NEUTRAL_B;
1294 break;
1295 }
0bb23927
EZ
1296 if (!string_p
1297 && type == NEUTRAL_B
1298 && bidi_at_paragraph_end (pos, bytepos) >= -1)
cf99dcf8 1299 break;
102ebb00 1300 /* Fetch next character and advance to get past it. */
36075a19 1301 ch = bidi_fetch_char (pos, bytepos, &disp_pos,
5bf97bfc 1302 &disp_prop, &bidi_it->string, bidi_it->w,
fc6f18ce 1303 bidi_it->frame_window_p, &ch_len, &nchars);
182ce2d2
EZ
1304 pos += nchars;
1305 bytepos += ch_len;
bea4f10c 1306 }
32413314
EZ
1307 if ((type == STRONG_R || type == STRONG_AL) /* P3 */
1308 || (!bidi_ignore_explicit_marks_for_paragraph_level
1309 && (type == RLO || type == RLE)))
bea4f10c 1310 bidi_it->paragraph_dir = R2L;
32413314
EZ
1311 else if (type == STRONG_L
1312 || (!bidi_ignore_explicit_marks_for_paragraph_level
1313 && (type == LRO || type == LRE)))
bea4f10c 1314 bidi_it->paragraph_dir = L2R;
87e67904
EZ
1315 if (!string_p
1316 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
bea4f10c
EZ
1317 {
1318 /* If this paragraph is at BEGV, default to L2R. */
1319 if (pstartbyte == BEGV_BYTE)
1320 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1321 else
1322 {
d311d28c
PE
1323 ptrdiff_t prevpbyte = pstartbyte;
1324 ptrdiff_t p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
bea4f10c
EZ
1325
1326 /* Find the beginning of the previous paragraph, if any. */
1327 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1328 {
182ce2d2
EZ
1329 /* FXIME: What if p is covered by a display
1330 string? See also a FIXME inside
1331 bidi_find_paragraph_start. */
36075a19 1332 DEC_BOTH (p, pbyte);
bea4f10c
EZ
1333 prevpbyte = bidi_find_paragraph_start (p, pbyte);
1334 }
1335 pstartbyte = prevpbyte;
1336 }
1337 }
87e67904
EZ
1338 } while (!string_p
1339 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
b7b65b15 1340 }
be39f003 1341 else
1088b922 1342 emacs_abort ();
b7b65b15 1343
b44d9321
EZ
1344 /* Contrary to UAX#9 clause P3, we only default the paragraph
1345 direction to L2R if we have no previous usable paragraph
bea4f10c 1346 direction. This is allowed by the HL1 clause. */
d20e1419 1347 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
bea4f10c 1348 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
be39f003 1349 if (bidi_it->paragraph_dir == R2L)
b44d9321 1350 bidi_it->level_stack[0].level = 1;
be39f003 1351 else
b44d9321 1352 bidi_it->level_stack[0].level = 0;
be39f003
EZ
1353
1354 bidi_line_init (bidi_it);
b7b65b15
EZ
1355}
1356
cbb09f04
EZ
1357\f
1358/***********************************************************************
1359 Resolving explicit and implicit levels.
58b9f433 1360 The rest of this file constitutes the core of the UBA implementation.
cbb09f04 1361 ***********************************************************************/
b7b65b15 1362
0a99eee1 1363static bool
182ce2d2 1364bidi_explicit_dir_char (int ch)
b7b65b15 1365{
182ce2d2
EZ
1366 bidi_type_t ch_type;
1367
1368 if (!bidi_initialized)
1088b922 1369 emacs_abort ();
182ce2d2
EZ
1370 ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1371 return (ch_type == LRE || ch_type == LRO
1372 || ch_type == RLE || ch_type == RLO
1373 || ch_type == PDF);
b7b65b15
EZ
1374}
1375
1376/* A helper function for bidi_resolve_explicit. It advances to the
1377 next character in logical order and determines the new embedding
1378 level and directional override, but does not take into account
1379 empty embeddings. */
1380static int
1381bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1382{
1383 int curchar;
1384 bidi_type_t type;
1385 int current_level;
1386 int new_level;
1387 bidi_dir_t override;
2cc21167 1388 bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
b7b65b15 1389
182ce2d2
EZ
1390 /* If reseat()'ed, don't advance, so as to start iteration from the
1391 position where we were reseated. bidi_it->bytepos can be less
1392 than BEGV_BYTE after reseat to BEGV. */
87e67904 1393 if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
9c82e145 1394 || bidi_it->first_elt)
e7402cb2 1395 {
9c82e145 1396 bidi_it->first_elt = 0;
87e67904
EZ
1397 if (string_p)
1398 {
512a289d
RS
1399 const unsigned char *p
1400 = (STRINGP (bidi_it->string.lstring)
1401 ? SDATA (bidi_it->string.lstring)
1402 : bidi_it->string.s);
9f257352 1403
87e67904 1404 if (bidi_it->charpos < 0)
05e193f1
EZ
1405 bidi_it->charpos = bidi_it->bytepos = 0;
1406 eassert (bidi_it->bytepos == bidi_count_bytes (p, 0, 0,
1407 bidi_it->charpos,
1408 bidi_it->string.unibyte));
87e67904
EZ
1409 }
1410 else
1411 {
1412 if (bidi_it->charpos < BEGV)
05e193f1
EZ
1413 {
1414 bidi_it->charpos = BEGV;
1415 bidi_it->bytepos = BEGV_BYTE;
1416 }
1417 eassert (bidi_it->bytepos == CHAR_TO_BYTE (bidi_it->charpos));
87e67904 1418 }
e7402cb2 1419 }
87e67904
EZ
1420 /* Don't move at end of buffer/string. */
1421 else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
b7b65b15 1422 {
182ce2d2
EZ
1423 /* Advance to the next character, skipping characters covered by
1424 display strings (nchars > 1). */
102ebb00 1425 if (bidi_it->nchars <= 0)
1088b922 1426 emacs_abort ();
182ce2d2 1427 bidi_it->charpos += bidi_it->nchars;
e7402cb2 1428 if (bidi_it->ch_len == 0)
1088b922 1429 emacs_abort ();
b7b65b15
EZ
1430 bidi_it->bytepos += bidi_it->ch_len;
1431 }
1432
1433 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1434 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1435 new_level = current_level;
1436
87e67904 1437 if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
e7402cb2
EZ
1438 {
1439 curchar = BIDI_EOB;
1440 bidi_it->ch_len = 1;
182ce2d2 1441 bidi_it->nchars = 1;
87e67904 1442 bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
0c95fcf7 1443 bidi_it->disp_prop = 0;
e7402cb2
EZ
1444 }
1445 else
1446 {
182ce2d2
EZ
1447 /* Fetch the character at BYTEPOS. If it is covered by a
1448 display string, treat the entire run of covered characters as
1449 a single character u+FFFC. */
36075a19 1450 curchar = bidi_fetch_char (bidi_it->charpos, bidi_it->bytepos,
0c95fcf7 1451 &bidi_it->disp_pos, &bidi_it->disp_prop,
5bf97bfc
EZ
1452 &bidi_it->string, bidi_it->w,
1453 bidi_it->frame_window_p,
102ebb00 1454 &bidi_it->ch_len, &bidi_it->nchars);
e7402cb2 1455 }
b7b65b15 1456 bidi_it->ch = curchar;
b7b65b15 1457
6bff6497
EZ
1458 /* Don't apply directional override here, as all the types we handle
1459 below will not be affected by the override anyway, and we need
1460 the original type unaltered. The override will be applied in
1461 bidi_resolve_weak. */
1462 type = bidi_get_type (curchar, NEUTRAL_DIR);
89d3374a
EZ
1463 bidi_it->orig_type = type;
1464 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
1465
1466 if (type != PDF)
1467 bidi_it->prev_was_pdf = 0;
1468
89d3374a 1469 bidi_it->type_after_w1 = UNKNOWN_BT;
b7b65b15
EZ
1470
1471 switch (type)
1472 {
1473 case RLE: /* X2 */
1474 case RLO: /* X4 */
89d3374a
EZ
1475 bidi_it->type_after_w1 = type;
1476 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1477 type = WEAK_BN; /* X9/Retaining */
87e67904 1478 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1479 {
1480 if (current_level <= BIDI_MAXLEVEL - 4)
1481 {
1482 /* Compute the least odd embedding level greater than
1483 the current level. */
1484 new_level = ((current_level + 1) & ~1) + 1;
89d3374a 1485 if (bidi_it->type_after_w1 == RLE)
b7b65b15
EZ
1486 override = NEUTRAL_DIR;
1487 else
1488 override = R2L;
1489 if (current_level == BIDI_MAXLEVEL - 4)
1490 bidi_it->invalid_rl_levels = 0;
1491 bidi_push_embedding_level (bidi_it, new_level, override);
1492 }
1493 else
1494 {
1495 bidi_it->invalid_levels++;
1496 /* See the commentary about invalid_rl_levels below. */
1497 if (bidi_it->invalid_rl_levels < 0)
1498 bidi_it->invalid_rl_levels = 0;
1499 bidi_it->invalid_rl_levels++;
1500 }
1501 }
89d3374a 1502 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
7b5d6677
EZ
1503 || (bidi_it->next_en_pos > bidi_it->charpos
1504 && bidi_it->next_en_type == WEAK_EN))
b7b65b15
EZ
1505 type = WEAK_EN;
1506 break;
1507 case LRE: /* X3 */
1508 case LRO: /* X5 */
89d3374a
EZ
1509 bidi_it->type_after_w1 = type;
1510 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1511 type = WEAK_BN; /* X9/Retaining */
87e67904 1512 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1513 {
1514 if (current_level <= BIDI_MAXLEVEL - 5)
1515 {
1516 /* Compute the least even embedding level greater than
1517 the current level. */
1518 new_level = ((current_level + 2) & ~1);
89d3374a 1519 if (bidi_it->type_after_w1 == LRE)
b7b65b15
EZ
1520 override = NEUTRAL_DIR;
1521 else
1522 override = L2R;
1523 bidi_push_embedding_level (bidi_it, new_level, override);
1524 }
1525 else
1526 {
1527 bidi_it->invalid_levels++;
1528 /* invalid_rl_levels counts invalid levels encountered
1529 while the embedding level was already too high for
1530 LRE/LRO, but not for RLE/RLO. That is because
1531 there may be exactly one PDF which we should not
1532 ignore even though invalid_levels is non-zero.
1533 invalid_rl_levels helps to know what PDF is
1534 that. */
1535 if (bidi_it->invalid_rl_levels >= 0)
1536 bidi_it->invalid_rl_levels++;
1537 }
1538 }
89d3374a 1539 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
7b5d6677
EZ
1540 || (bidi_it->next_en_pos > bidi_it->charpos
1541 && bidi_it->next_en_type == WEAK_EN))
b7b65b15
EZ
1542 type = WEAK_EN;
1543 break;
1544 case PDF: /* X7 */
89d3374a
EZ
1545 bidi_it->type_after_w1 = type;
1546 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1547 type = WEAK_BN; /* X9/Retaining */
87e67904 1548 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1549 {
1550 if (!bidi_it->invalid_rl_levels)
1551 {
1552 new_level = bidi_pop_embedding_level (bidi_it);
1553 bidi_it->invalid_rl_levels = -1;
1554 if (bidi_it->invalid_levels)
1555 bidi_it->invalid_levels--;
1556 /* else nothing: UAX#9 says to ignore invalid PDFs */
1557 }
1558 if (!bidi_it->invalid_levels)
1559 new_level = bidi_pop_embedding_level (bidi_it);
1560 else
1561 {
1562 bidi_it->invalid_levels--;
1563 bidi_it->invalid_rl_levels--;
1564 }
1565 }
89d3374a 1566 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
7b5d6677
EZ
1567 || (bidi_it->next_en_pos > bidi_it->charpos
1568 && bidi_it->next_en_type == WEAK_EN))
b7b65b15
EZ
1569 type = WEAK_EN;
1570 break;
1571 default:
1572 /* Nothing. */
1573 break;
1574 }
1575
1576 bidi_it->type = type;
2d6e4628 1577 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1578
1579 return new_level;
1580}
1581
1582/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1583 in the buffer/string to the next character (in the logical order),
1584 resolve any explicit embeddings and directional overrides, and
1585 return the embedding level of the character after resolving
1586 explicit directives and ignoring empty embeddings. */
b7b65b15
EZ
1587static int
1588bidi_resolve_explicit (struct bidi_it *bidi_it)
1589{
1590 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1591 int new_level = bidi_resolve_explicit_1 (bidi_it);
d311d28c 1592 ptrdiff_t eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
512a289d
RS
1593 const unsigned char *s
1594 = (STRINGP (bidi_it->string.lstring)
1595 ? SDATA (bidi_it->string.lstring)
1596 : bidi_it->string.s);
b7b65b15
EZ
1597
1598 if (prev_level < new_level
1599 && bidi_it->type == WEAK_BN
87e67904
EZ
1600 && bidi_it->ignore_bn_limit == -1 /* only if not already known */
1601 && bidi_it->charpos < eob /* not already at EOB */
1602 && bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
f3014ef5
EZ
1603 + bidi_it->ch_len, s,
1604 bidi_it->string.unibyte)))
b7b65b15
EZ
1605 {
1606 /* Avoid pushing and popping embedding levels if the level run
1607 is empty, as this breaks level runs where it shouldn't.
1608 UAX#9 removes all the explicit embedding and override codes,
1609 so empty embeddings disappear without a trace. We need to
1610 behave as if we did the same. */
1611 struct bidi_it saved_it;
1612 int level = prev_level;
1613
1614 bidi_copy_it (&saved_it, bidi_it);
1615
87e67904 1616 while (bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
f3014ef5
EZ
1617 + bidi_it->ch_len, s,
1618 bidi_it->string.unibyte)))
b7b65b15 1619 {
182ce2d2
EZ
1620 /* This advances to the next character, skipping any
1621 characters covered by display strings. */
b7b65b15 1622 level = bidi_resolve_explicit_1 (bidi_it);
9f257352
EZ
1623 /* If string.lstring was relocated inside bidi_resolve_explicit_1,
1624 a pointer to its data is no longer valid. */
1625 if (STRINGP (bidi_it->string.lstring))
1626 s = SDATA (bidi_it->string.lstring);
b7b65b15
EZ
1627 }
1628
102ebb00 1629 if (bidi_it->nchars <= 0)
1088b922 1630 emacs_abort ();
b7b65b15 1631 if (level == prev_level) /* empty embedding */
182ce2d2 1632 saved_it.ignore_bn_limit = bidi_it->charpos + bidi_it->nchars;
b7b65b15 1633 else /* this embedding is non-empty */
87e67904 1634 saved_it.ignore_bn_limit = -2;
b7b65b15
EZ
1635
1636 bidi_copy_it (bidi_it, &saved_it);
87e67904 1637 if (bidi_it->ignore_bn_limit > -1)
b7b65b15
EZ
1638 {
1639 /* We pushed a level, but we shouldn't have. Undo that. */
1640 if (!bidi_it->invalid_rl_levels)
1641 {
1642 new_level = bidi_pop_embedding_level (bidi_it);
1643 bidi_it->invalid_rl_levels = -1;
1644 if (bidi_it->invalid_levels)
1645 bidi_it->invalid_levels--;
1646 }
1647 if (!bidi_it->invalid_levels)
1648 new_level = bidi_pop_embedding_level (bidi_it);
1649 else
1650 {
1651 bidi_it->invalid_levels--;
1652 bidi_it->invalid_rl_levels--;
1653 }
1654 }
1655 }
1656
b7b65b15
EZ
1657 if (bidi_it->type == NEUTRAL_B) /* X8 */
1658 {
21fce5ab 1659 bidi_set_paragraph_end (bidi_it);
6bff6497
EZ
1660 /* This is needed by bidi_resolve_weak below, and in L1. */
1661 bidi_it->type_after_w1 = bidi_it->type;
89d3374a 1662 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1663 }
1664
1665 return new_level;
1666}
1667
182ce2d2
EZ
1668/* Advance in the buffer/string, resolve weak types and return the
1669 type of the next character after weak type resolution. */
fd3998ff 1670static bidi_type_t
b7b65b15
EZ
1671bidi_resolve_weak (struct bidi_it *bidi_it)
1672{
1673 bidi_type_t type;
1674 bidi_dir_t override;
1675 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1676 int new_level = bidi_resolve_explicit (bidi_it);
1677 int next_char;
1678 bidi_type_t type_of_next;
1679 struct bidi_it saved_it;
5895d7b9 1680 ptrdiff_t eob
512a289d
RS
1681 = ((STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
1682 ? bidi_it->string.schars : ZV);
b7b65b15
EZ
1683
1684 type = bidi_it->type;
1685 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1686
1687 if (type == UNKNOWN_BT
1688 || type == LRE
1689 || type == LRO
1690 || type == RLE
1691 || type == RLO
1692 || type == PDF)
1088b922 1693 emacs_abort ();
b7b65b15
EZ
1694
1695 if (new_level != prev_level
1696 || bidi_it->type == NEUTRAL_B)
1697 {
1698 /* We've got a new embedding level run, compute the directional
1699 type of sor and initialize per-run variables (UAX#9, clause
1700 X10). */
1701 bidi_set_sor_type (bidi_it, prev_level, new_level);
1702 }
1703 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1704 || type == WEAK_BN || type == STRONG_AL)
89d3374a
EZ
1705 bidi_it->type_after_w1 = type; /* needed in L1 */
1706 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1707
1708 /* Level and directional override status are already recorded in
1709 bidi_it, and do not need any change; see X6. */
1710 if (override == R2L) /* X6 */
1711 type = STRONG_R;
1712 else if (override == L2R)
1713 type = STRONG_L;
bc5a45f3 1714 else
b7b65b15 1715 {
bc5a45f3 1716 if (type == WEAK_NSM) /* W1 */
b7b65b15 1717 {
bc5a45f3 1718 /* Note that we don't need to consider the case where the
5930fe97
EZ
1719 prev character has its type overridden by an RLO or LRO,
1720 because then either the type of this NSM would have been
1721 also overridden, or the previous character is outside the
1722 current level run, and thus not relevant to this NSM.
1723 This is why NSM gets the type_after_w1 of the previous
1724 character. */
ebb5722e
EZ
1725 if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1726 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1727 && bidi_it->prev.type_after_w1 != NEUTRAL_B)
5930fe97 1728 type = bidi_it->prev.type_after_w1;
bc5a45f3
EZ
1729 else if (bidi_it->sor == R2L)
1730 type = STRONG_R;
1731 else if (bidi_it->sor == L2R)
1732 type = STRONG_L;
1733 else /* shouldn't happen! */
1088b922 1734 emacs_abort ();
b7b65b15 1735 }
bc5a45f3
EZ
1736 if (type == WEAK_EN /* W2 */
1737 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1738 type = WEAK_AN;
1739 else if (type == STRONG_AL) /* W3 */
1740 type = STRONG_R;
1741 else if ((type == WEAK_ES /* W4 */
1742 && bidi_it->prev.type_after_w1 == WEAK_EN
1743 && bidi_it->prev.orig_type == WEAK_EN)
1744 || (type == WEAK_CS
1745 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1746 && bidi_it->prev.orig_type == WEAK_EN)
1747 || bidi_it->prev.type_after_w1 == WEAK_AN)))
b7b65b15 1748 {
512a289d
RS
1749 const unsigned char *s
1750 = (STRINGP (bidi_it->string.lstring)
1751 ? SDATA (bidi_it->string.lstring)
1752 : bidi_it->string.s);
1753
1754 next_char = (bidi_it->charpos + bidi_it->nchars >= eob
1755 ? BIDI_EOB
1756 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
1757 s, bidi_it->string.unibyte));
6bff6497 1758 type_of_next = bidi_get_type (next_char, override);
b7b65b15 1759
bc5a45f3 1760 if (type_of_next == WEAK_BN
b7b65b15
EZ
1761 || bidi_explicit_dir_char (next_char))
1762 {
1763 bidi_copy_it (&saved_it, bidi_it);
1764 while (bidi_resolve_explicit (bidi_it) == new_level
bc5a45f3 1765 && bidi_it->type == WEAK_BN)
b7b65b15
EZ
1766 ;
1767 type_of_next = bidi_it->type;
b7b65b15
EZ
1768 bidi_copy_it (bidi_it, &saved_it);
1769 }
bc5a45f3
EZ
1770
1771 /* If the next character is EN, but the last strong-type
1772 character is AL, that next EN will be changed to AN when
1773 we process it in W2 above. So in that case, this ES
1774 should not be changed into EN. */
1775 if (type == WEAK_ES
1776 && type_of_next == WEAK_EN
1777 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1778 type = WEAK_EN;
1779 else if (type == WEAK_CS)
b7b65b15 1780 {
bc5a45f3
EZ
1781 if (bidi_it->prev.type_after_w1 == WEAK_AN
1782 && (type_of_next == WEAK_AN
1783 /* If the next character is EN, but the last
1784 strong-type character is AL, EN will be later
1785 changed to AN when we process it in W2 above.
1786 So in that case, this ES should not be
1787 changed into EN. */
1788 || (type_of_next == WEAK_EN
1789 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1790 type = WEAK_AN;
1791 else if (bidi_it->prev.type_after_w1 == WEAK_EN
1792 && type_of_next == WEAK_EN
1793 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1794 type = WEAK_EN;
1795 }
1796 }
1797 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1798 || type == WEAK_BN) /* W5/Retaining */
1799 {
7b5d6677 1800 if (bidi_it->prev.type_after_w1 == WEAK_EN) /* ET/BN w/EN before it */
bc5a45f3 1801 type = WEAK_EN;
7b5d6677
EZ
1802 else if (bidi_it->next_en_pos > bidi_it->charpos
1803 && bidi_it->next_en_type != WEAK_BN)
1804 {
1805 if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */
1806 type = WEAK_EN;
1807 }
1808 else if (bidi_it->next_en_pos >=0)
bc5a45f3 1809 {
d311d28c 1810 ptrdiff_t en_pos = bidi_it->charpos + bidi_it->nchars;
512a289d
RS
1811 const unsigned char *s = (STRINGP (bidi_it->string.lstring)
1812 ? SDATA (bidi_it->string.lstring)
1813 : bidi_it->string.s);
bc5a45f3 1814
102ebb00 1815 if (bidi_it->nchars <= 0)
1088b922 1816 emacs_abort ();
512a289d
RS
1817 next_char
1818 = (bidi_it->charpos + bidi_it->nchars >= eob
1819 ? BIDI_EOB
1820 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
1821 bidi_it->string.unibyte));
bc5a45f3
EZ
1822 type_of_next = bidi_get_type (next_char, override);
1823
1824 if (type_of_next == WEAK_ET
1825 || type_of_next == WEAK_BN
1826 || bidi_explicit_dir_char (next_char))
1827 {
1828 bidi_copy_it (&saved_it, bidi_it);
1829 while (bidi_resolve_explicit (bidi_it) == new_level
1830 && (bidi_it->type == WEAK_BN
1831 || bidi_it->type == WEAK_ET))
1832 ;
1833 type_of_next = bidi_it->type;
1834 en_pos = bidi_it->charpos;
1835 bidi_copy_it (bidi_it, &saved_it);
1836 }
7b5d6677
EZ
1837 /* Remember this position, to speed up processing of the
1838 next ETs. */
1839 bidi_it->next_en_pos = en_pos;
bc5a45f3 1840 if (type_of_next == WEAK_EN)
b7b65b15 1841 {
bc5a45f3
EZ
1842 /* If the last strong character is AL, the EN we've
1843 found will become AN when we get to it (W2). */
7b5d6677
EZ
1844 if (bidi_it->last_strong.type_after_w1 == STRONG_AL)
1845 type_of_next = WEAK_AN;
bc5a45f3
EZ
1846 else if (type == WEAK_BN)
1847 type = NEUTRAL_ON; /* W6/Retaining */
7b5d6677
EZ
1848 else
1849 type = WEAK_EN;
b7b65b15 1850 }
4787455f
EZ
1851 else if (type_of_next == NEUTRAL_B)
1852 /* Record the fact that there are no more ENs from
1853 here to the end of paragraph, to avoid entering the
1854 loop above ever again in this paragraph. */
1855 bidi_it->next_en_pos = -1;
7b5d6677
EZ
1856 /* Record the type of the character where we ended our search. */
1857 bidi_it->next_en_type = type_of_next;
b7b65b15
EZ
1858 }
1859 }
1860 }
1861
1862 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
89d3374a
EZ
1863 || (type == WEAK_BN
1864 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1865 || bidi_it->prev.type_after_w1 == WEAK_ES
1866 || bidi_it->prev.type_after_w1 == WEAK_ET)))
b7b65b15
EZ
1867 type = NEUTRAL_ON;
1868
1869 /* Store the type we've got so far, before we clobber it with strong
1870 types in W7 and while resolving neutral types. But leave alone
1871 the original types that were recorded above, because we will need
1872 them for the L1 clause. */
89d3374a
EZ
1873 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1874 bidi_it->type_after_w1 = type;
1875 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1876
1877 if (type == WEAK_EN) /* W7 */
1878 {
89d3374a 1879 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
b7b65b15
EZ
1880 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1881 type = STRONG_L;
1882 }
1883
1884 bidi_it->type = type;
2d6e4628 1885 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1886 return type;
1887}
1888
cbb09f04
EZ
1889/* Resolve the type of a neutral character according to the type of
1890 surrounding strong text and the current embedding level. */
0a99eee1 1891static bidi_type_t
cbb09f04
EZ
1892bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
1893{
1894 /* N1: European and Arabic numbers are treated as though they were R. */
1895 if (next_type == WEAK_EN || next_type == WEAK_AN)
1896 next_type = STRONG_R;
1897 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
1898 prev_type = STRONG_R;
1899
1900 if (next_type == prev_type) /* N1 */
1901 return next_type;
1902 else if ((lev & 1) == 0) /* N2 */
1903 return STRONG_L;
1904 else
1905 return STRONG_R;
1906}
1907
fd3998ff 1908static bidi_type_t
b7b65b15
EZ
1909bidi_resolve_neutral (struct bidi_it *bidi_it)
1910{
1911 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1912 bidi_type_t type = bidi_resolve_weak (bidi_it);
1913 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1914
1915 if (!(type == STRONG_R
1916 || type == STRONG_L
1917 || type == WEAK_BN
1918 || type == WEAK_EN
1919 || type == WEAK_AN
1920 || type == NEUTRAL_B
1921 || type == NEUTRAL_S
1922 || type == NEUTRAL_WS
1923 || type == NEUTRAL_ON))
1088b922 1924 emacs_abort ();
b7b65b15 1925
4787455f
EZ
1926 if ((type != NEUTRAL_B /* Don't risk entering the long loop below if
1927 we are already at paragraph end. */
1928 && bidi_get_category (type) == NEUTRAL)
b7b65b15
EZ
1929 || (type == WEAK_BN && prev_level == current_level))
1930 {
1931 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1932 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1933 bidi_it->next_for_neutral.type,
1934 current_level);
4787455f
EZ
1935 /* The next two "else if" clauses are shortcuts for the
1936 important special case when we have a long sequence of
1937 neutral or WEAK_BN characters, such as whitespace or nulls or
1938 other control characters, on the base embedding level of the
1939 paragraph, and that sequence goes all the way to the end of
1940 the paragraph and follows a character whose resolved
1941 directionality is identical to the base embedding level.
1942 (This is what happens in a buffer with plain L2R text that
1943 happens to include long sequences of control characters.) By
1944 virtue of N1, the result of examining this long sequence will
1945 always be either STRONG_L or STRONG_R, depending on the base
1946 embedding level. So we use this fact directly instead of
1947 entering the expensive loop in the "else" clause. */
1948 else if (current_level == 0
1949 && bidi_it->prev_for_neutral.type == STRONG_L
1950 && !bidi_explicit_dir_char (bidi_it->ch))
1951 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1952 STRONG_L, current_level);
1953 else if (/* current level is 1 */
1954 current_level == 1
1955 /* base embedding level is also 1 */
1956 && bidi_it->level_stack[0].level == 1
1957 /* previous character is one of those considered R for
1958 the purposes of W5 */
1959 && (bidi_it->prev_for_neutral.type == STRONG_R
1960 || bidi_it->prev_for_neutral.type == WEAK_EN
1961 || bidi_it->prev_for_neutral.type == WEAK_AN)
1962 && !bidi_explicit_dir_char (bidi_it->ch))
1963 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1964 STRONG_R, current_level);
b7b65b15
EZ
1965 else
1966 {
1967 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1968 the assumption of batch-style processing; see clauses W4,
1969 W5, and especially N1, which require to look far forward
182ce2d2
EZ
1970 (as well as back) in the buffer/string. May the fleas of
1971 a thousand camels infest the armpits of those who design
b7b65b15
EZ
1972 supposedly general-purpose algorithms by looking at their
1973 own implementations, and fail to consider other possible
1974 implementations! */
1975 struct bidi_it saved_it;
1976 bidi_type_t next_type;
1977
1978 if (bidi_it->scan_dir == -1)
1088b922 1979 emacs_abort ();
b7b65b15
EZ
1980
1981 bidi_copy_it (&saved_it, bidi_it);
1982 /* Scan the text forward until we find the first non-neutral
1983 character, and then use that to resolve the neutral we
1984 are dealing with now. We also cache the scanned iterator
1985 states, to salvage some of the effort later. */
1986 bidi_cache_iterator_state (bidi_it, 0);
1987 do {
1988 /* Record the info about the previous character, so that
1989 it will be cached below with this state. */
89d3374a 1990 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1991 && bidi_it->type != WEAK_BN)
1992 bidi_remember_char (&bidi_it->prev, bidi_it);
1993 type = bidi_resolve_weak (bidi_it);
1994 /* Paragraph separators have their levels fully resolved
1995 at this point, so cache them as resolved. */
1996 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1997 /* FIXME: implement L1 here, by testing for a newline and
1998 resetting the level for any sequence of whitespace
1999 characters adjacent to it. */
2000 } while (!(type == NEUTRAL_B
2001 || (type != WEAK_BN
2002 && bidi_get_category (type) != NEUTRAL)
2003 /* This is all per level run, so stop when we
2004 reach the end of this level run. */
512a289d
RS
2005 || (bidi_it->level_stack[bidi_it->stack_idx].level
2006 != current_level)));
b7b65b15
EZ
2007
2008 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
2009
2010 switch (type)
2011 {
2012 case STRONG_L:
2013 case STRONG_R:
2014 case STRONG_AL:
4787455f
EZ
2015 /* Actually, STRONG_AL cannot happen here, because
2016 bidi_resolve_weak converts it to STRONG_R, per W3. */
a54e2c05 2017 eassert (type != STRONG_AL);
b7b65b15
EZ
2018 next_type = type;
2019 break;
2020 case WEAK_EN:
2021 case WEAK_AN:
2022 /* N1: ``European and Arabic numbers are treated as
2023 though they were R.'' */
2024 next_type = STRONG_R;
b7b65b15
EZ
2025 break;
2026 case WEAK_BN:
713bfeaa 2027 case NEUTRAL_ON: /* W6/Retaining */
b7b65b15 2028 if (!bidi_explicit_dir_char (bidi_it->ch))
1088b922 2029 emacs_abort (); /* can't happen: BNs are skipped */
b7b65b15
EZ
2030 /* FALLTHROUGH */
2031 case NEUTRAL_B:
2032 /* Marched all the way to the end of this level run.
2033 We need to use the eor type, whose information is
2034 stored by bidi_set_sor_type in the prev_for_neutral
2035 member. */
2036 if (saved_it.type != WEAK_BN
89d3374a 2037 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
4787455f 2038 next_type = bidi_it->prev_for_neutral.type;
b7b65b15
EZ
2039 else
2040 {
2041 /* This is a BN which does not adjoin neutrals.
2042 Leave its type alone. */
2043 bidi_copy_it (bidi_it, &saved_it);
2044 return bidi_it->type;
2045 }
2046 break;
2047 default:
1088b922 2048 emacs_abort ();
b7b65b15
EZ
2049 }
2050 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
2051 next_type, current_level);
4787455f 2052 saved_it.next_for_neutral.type = next_type;
b7b65b15 2053 saved_it.type = type;
4787455f 2054 bidi_check_type (next_type);
2d6e4628 2055 bidi_check_type (type);
b7b65b15
EZ
2056 bidi_copy_it (bidi_it, &saved_it);
2057 }
2058 }
2059 return type;
2060}
2061
2062/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
2063 in the buffer/string to the next character (in the logical order),
2064 resolve the bidi type of that next character, and return that
2065 type. */
fd3998ff 2066static bidi_type_t
b7b65b15
EZ
2067bidi_type_of_next_char (struct bidi_it *bidi_it)
2068{
2069 bidi_type_t type;
2070
2071 /* This should always be called during a forward scan. */
2072 if (bidi_it->scan_dir != 1)
1088b922 2073 emacs_abort ();
b7b65b15
EZ
2074
2075 /* Reset the limit until which to ignore BNs if we step out of the
2076 area where we found only empty levels. */
87e67904 2077 if ((bidi_it->ignore_bn_limit > -1
b7b65b15 2078 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
87e67904 2079 || (bidi_it->ignore_bn_limit == -2
b7b65b15 2080 && !bidi_explicit_dir_char (bidi_it->ch)))
87e67904 2081 bidi_it->ignore_bn_limit = -1;
b7b65b15
EZ
2082
2083 type = bidi_resolve_neutral (bidi_it);
2084
2085 return type;
2086}
2087
2088/* Given an iterator state BIDI_IT, advance one character position in
182ce2d2
EZ
2089 the buffer/string to the next character (in the current scan
2090 direction), resolve the embedding and implicit levels of that next
2091 character, and return the resulting level. */
fd3998ff 2092static int
b7b65b15
EZ
2093bidi_level_of_next_char (struct bidi_it *bidi_it)
2094{
2095 bidi_type_t type;
2096 int level, prev_level = -1;
2097 struct bidi_saved_info next_for_neutral;
d311d28c 2098 ptrdiff_t next_char_pos = -2;
b7b65b15
EZ
2099
2100 if (bidi_it->scan_dir == 1)
2101 {
5895d7b9 2102 ptrdiff_t eob
512a289d
RS
2103 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2104 ? bidi_it->string.schars : ZV);
9f257352 2105
b7b65b15 2106 /* There's no sense in trying to advance if we hit end of text. */
9f257352 2107 if (bidi_it->charpos >= eob)
b7b65b15
EZ
2108 return bidi_it->resolved_level;
2109
2110 /* Record the info about the previous character. */
89d3374a 2111 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
2112 && bidi_it->type != WEAK_BN)
2113 bidi_remember_char (&bidi_it->prev, bidi_it);
89d3374a
EZ
2114 if (bidi_it->type_after_w1 == STRONG_R
2115 || bidi_it->type_after_w1 == STRONG_L
2116 || bidi_it->type_after_w1 == STRONG_AL)
b7b65b15
EZ
2117 bidi_remember_char (&bidi_it->last_strong, bidi_it);
2118 /* FIXME: it sounds like we don't need both prev and
2119 prev_for_neutral members, but I'm leaving them both for now. */
2120 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
2121 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
2122 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
2123
2124 /* If we overstepped the characters used for resolving neutrals
2125 and whitespace, invalidate their info in the iterator. */
2126 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
2127 bidi_it->next_for_neutral.type = UNKNOWN_BT;
2128 if (bidi_it->next_en_pos >= 0
2129 && bidi_it->charpos >= bidi_it->next_en_pos)
7b5d6677
EZ
2130 {
2131 bidi_it->next_en_pos = 0;
2132 bidi_it->next_en_type = UNKNOWN_BT;
2133 }
b7b65b15
EZ
2134 if (bidi_it->next_for_ws.type != UNKNOWN_BT
2135 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
2136 bidi_it->next_for_ws.type = UNKNOWN_BT;
2137
2138 /* This must be taken before we fill the iterator with the info
2139 about the next char. If we scan backwards, the iterator
2140 state must be already cached, so there's no need to know the
2141 embedding level of the previous character, since we will be
2142 returning to our caller shortly. */
2143 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2144 }
2145 next_for_neutral = bidi_it->next_for_neutral;
2146
182ce2d2
EZ
2147 /* Perhaps the character we want is already cached. If it is, the
2148 call to bidi_cache_find below will return a type other than
2149 UNKNOWN_BT. */
87e67904 2150 if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
102ebb00 2151 {
512a289d
RS
2152 int bob = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2153 ? 0 : 1);
102ebb00
EZ
2154 if (bidi_it->scan_dir > 0)
2155 {
2156 if (bidi_it->nchars <= 0)
1088b922 2157 emacs_abort ();
102ebb00
EZ
2158 next_char_pos = bidi_it->charpos + bidi_it->nchars;
2159 }
9f257352 2160 else if (bidi_it->charpos >= bob)
bb269206
EZ
2161 /* Implementation note: we allow next_char_pos to be as low as
2162 0 for buffers or -1 for strings, and that is okay because
2163 that's the "position" of the sentinel iterator state we
2164 cached at the beginning of the iteration. */
102ebb00 2165 next_char_pos = bidi_it->charpos - 1;
578b494e 2166 if (next_char_pos >= bob - 1)
87e67904
EZ
2167 type = bidi_cache_find (next_char_pos, -1, bidi_it);
2168 else
2169 type = UNKNOWN_BT;
102ebb00 2170 }
182ce2d2 2171 else
102ebb00 2172 type = UNKNOWN_BT;
b7b65b15
EZ
2173 if (type != UNKNOWN_BT)
2174 {
2175 /* Don't lose the information for resolving neutrals! The
2176 cached states could have been cached before their
2177 next_for_neutral member was computed. If we are on our way
2178 forward, we can simply take the info from the previous
2179 state. */
2180 if (bidi_it->scan_dir == 1
2181 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
2182 bidi_it->next_for_neutral = next_for_neutral;
2183
2184 /* If resolved_level is -1, it means this state was cached
2185 before it was completely resolved, so we cannot return
2186 it. */
2187 if (bidi_it->resolved_level != -1)
2188 return bidi_it->resolved_level;
2189 }
2190 if (bidi_it->scan_dir == -1)
2191 /* If we are going backwards, the iterator state is already cached
2192 from previous scans, and should be fully resolved. */
1088b922 2193 emacs_abort ();
b7b65b15
EZ
2194
2195 if (type == UNKNOWN_BT)
2196 type = bidi_type_of_next_char (bidi_it);
2197
2198 if (type == NEUTRAL_B)
2199 return bidi_it->resolved_level;
2200
2201 level = bidi_it->level_stack[bidi_it->stack_idx].level;
2202 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
2203 || (type == WEAK_BN && prev_level == level))
2204 {
2205 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1088b922 2206 emacs_abort ();
b7b65b15
EZ
2207
2208 /* If the cached state shows a neutral character, it was not
2209 resolved by bidi_resolve_neutral, so do it now. */
2210 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2211 bidi_it->next_for_neutral.type,
2212 level);
2213 }
2214
2215 if (!(type == STRONG_R
2216 || type == STRONG_L
2217 || type == WEAK_BN
2218 || type == WEAK_EN
2219 || type == WEAK_AN))
1088b922 2220 emacs_abort ();
b7b65b15 2221 bidi_it->type = type;
2d6e4628 2222 bidi_check_type (bidi_it->type);
b7b65b15
EZ
2223
2224 /* For L1 below, we need to know, for each WS character, whether
0d327994 2225 it belongs to a sequence of WS characters preceding a newline
b7b65b15 2226 or a TAB or a paragraph separator. */
89d3374a 2227 if (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
2228 && bidi_it->next_for_ws.type == UNKNOWN_BT)
2229 {
2230 int ch;
d311d28c
PE
2231 ptrdiff_t clen = bidi_it->ch_len;
2232 ptrdiff_t bpos = bidi_it->bytepos;
2233 ptrdiff_t cpos = bidi_it->charpos;
2234 ptrdiff_t disp_pos = bidi_it->disp_pos;
2235 ptrdiff_t nc = bidi_it->nchars;
87e67904 2236 struct bidi_string_data bs = bidi_it->string;
b7b65b15 2237 bidi_type_t chtype;
2cc21167 2238 bool fwp = bidi_it->frame_window_p;
0c95fcf7 2239 int dpp = bidi_it->disp_prop;
b7b65b15 2240
102ebb00 2241 if (bidi_it->nchars <= 0)
1088b922 2242 emacs_abort ();
b7b65b15 2243 do {
36075a19 2244 ch = bidi_fetch_char (cpos += nc, bpos += clen, &disp_pos, &dpp, &bs,
5bf97bfc 2245 bidi_it->w, fwp, &clen, &nc);
79beb178 2246 if (ch == '\n' || ch == BIDI_EOB)
b7b65b15
EZ
2247 chtype = NEUTRAL_B;
2248 else
6bff6497 2249 chtype = bidi_get_type (ch, NEUTRAL_DIR);
b7b65b15
EZ
2250 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
2251 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
2252 bidi_it->next_for_ws.type = chtype;
2d6e4628 2253 bidi_check_type (bidi_it->next_for_ws.type);
b7b65b15
EZ
2254 bidi_it->next_for_ws.charpos = cpos;
2255 bidi_it->next_for_ws.bytepos = bpos;
2256 }
2257
2258 /* Resolve implicit levels, with a twist: PDFs get the embedding
55622b93 2259 level of the embedding they terminate. See below for the
b7b65b15 2260 reason. */
89d3374a 2261 if (bidi_it->orig_type == PDF
b7b65b15
EZ
2262 /* Don't do this if this formatting code didn't change the
2263 embedding level due to invalid or empty embeddings. */
2264 && prev_level != level)
2265 {
2266 /* Don't look in UAX#9 for the reason for this: it's our own
2267 private quirk. The reason is that we want the formatting
2268 codes to be delivered so that they bracket the text of their
2269 embedding. For example, given the text
2270
2271 {RLO}teST{PDF}
2272
2273 we want it to be displayed as
2274
0d68907d 2275 {PDF}STet{RLO}
b7b65b15
EZ
2276
2277 not as
2278
2279 STet{RLO}{PDF}
2280
2281 which will result because we bump up the embedding level as
2282 soon as we see the RLO and pop it as soon as we see the PDF,
2283 so RLO itself has the same embedding level as "teST", and
2284 thus would be normally delivered last, just before the PDF.
2285 The switch below fiddles with the level of PDF so that this
2286 ugly side effect does not happen.
2287
2288 (This is, of course, only important if the formatting codes
e7402cb2
EZ
2289 are actually displayed, but Emacs does need to display them
2290 if the user wants to.) */
b7b65b15
EZ
2291 level = prev_level;
2292 }
89d3374a
EZ
2293 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
2294 || bidi_it->orig_type == NEUTRAL_S
e7402cb2 2295 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
89d3374a 2296 || (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
2297 && (bidi_it->next_for_ws.type == NEUTRAL_B
2298 || bidi_it->next_for_ws.type == NEUTRAL_S)))
2299 level = bidi_it->level_stack[0].level;
2300 else if ((level & 1) == 0) /* I1 */
2301 {
2302 if (type == STRONG_R)
2303 level++;
2304 else if (type == WEAK_EN || type == WEAK_AN)
2305 level += 2;
2306 }
2307 else /* I2 */
2308 {
2309 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
2310 level++;
2311 }
2312
2313 bidi_it->resolved_level = level;
2314 return level;
2315}
2316
2cc21167
PE
2317/* Move to the other edge of a level given by LEVEL. If END_FLAG,
2318 we are at the end of a level, and we need to prepare to
b7b65b15
EZ
2319 resume the scan of the lower level.
2320
2321 If this level's other edge is cached, we simply jump to it, filling
2322 the iterator structure with the iterator state on the other edge.
182ce2d2
EZ
2323 Otherwise, we walk the buffer or string until we come back to the
2324 same level as LEVEL.
b7b65b15
EZ
2325
2326 Note: we are not talking here about a ``level run'' in the UAX#9
2327 sense of the term, but rather about a ``level'' which includes
2328 all the levels higher than it. In other words, given the levels
2329 like this:
2330
2331 11111112222222333333334443343222222111111112223322111
2332 A B C
2333
2334 and assuming we are at point A scanning left to right, this
2335 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
2336 at point B. */
2337static void
2cc21167 2338bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, bool end_flag)
b7b65b15
EZ
2339{
2340 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
39e378da 2341 ptrdiff_t idx;
b7b65b15
EZ
2342
2343 /* Try the cache first. */
87e67904
EZ
2344 if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
2345 >= bidi_cache_start)
b7b65b15
EZ
2346 bidi_cache_fetch_state (idx, bidi_it);
2347 else
2348 {
2349 int new_level;
2350
1088b922 2351 /* If we are at end of level, its edges must be cached. */
b7b65b15 2352 if (end_flag)
1088b922 2353 emacs_abort ();
b7b65b15
EZ
2354
2355 bidi_cache_iterator_state (bidi_it, 1);
2356 do {
2357 new_level = bidi_level_of_next_char (bidi_it);
2358 bidi_cache_iterator_state (bidi_it, 1);
2359 } while (new_level >= level);
2360 }
2361}
2362
2363void
4b292a22 2364bidi_move_to_visually_next (struct bidi_it *bidi_it)
b7b65b15
EZ
2365{
2366 int old_level, new_level, next_level;
9c82e145 2367 struct bidi_it sentinel;
acb28818 2368 struct gcpro gcpro1;
b7b65b15 2369
87e67904 2370 if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
1088b922 2371 emacs_abort ();
b7b65b15
EZ
2372
2373 if (bidi_it->scan_dir == 0)
2374 {
2375 bidi_it->scan_dir = 1; /* default to logical order */
2376 }
2377
acb28818 2378 /* The code below can call eval, and thus cause GC. If we are
7e2ad32c 2379 iterating a Lisp string, make sure it won't be GCed. */
acb28818
EZ
2380 if (STRINGP (bidi_it->string.lstring))
2381 GCPRO1 (bidi_it->string.lstring);
2382
be39f003 2383 /* If we just passed a newline, initialize for the next line. */
1137e8b8
EZ
2384 if (!bidi_it->first_elt
2385 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
be39f003
EZ
2386 bidi_line_init (bidi_it);
2387
6dcfd253
EZ
2388 /* Prepare the sentinel iterator state, and cache it. When we bump
2389 into it, scanning backwards, we'll know that the last non-base
2390 level is exhausted. */
87e67904 2391 if (bidi_cache_idx == bidi_cache_start)
9c82e145
EZ
2392 {
2393 bidi_copy_it (&sentinel, bidi_it);
2394 if (bidi_it->first_elt)
2395 {
2396 sentinel.charpos--; /* cached charpos needs to be monotonic */
2397 sentinel.bytepos--;
2398 sentinel.ch = '\n'; /* doesn't matter, but why not? */
2399 sentinel.ch_len = 1;
182ce2d2 2400 sentinel.nchars = 1;
9c82e145 2401 }
6dcfd253 2402 bidi_cache_iterator_state (&sentinel, 1);
9c82e145 2403 }
b7b65b15
EZ
2404
2405 old_level = bidi_it->resolved_level;
2406 new_level = bidi_level_of_next_char (bidi_it);
b7b65b15
EZ
2407
2408 /* Reordering of resolved levels (clause L2) is implemented by
2409 jumping to the other edge of the level and flipping direction of
c0546589 2410 scanning the text whenever we find a level change. */
b7b65b15
EZ
2411 if (new_level != old_level)
2412 {
2cc21167 2413 bool ascending = new_level > old_level;
b7b65b15
EZ
2414 int level_to_search = ascending ? old_level + 1 : old_level;
2415 int incr = ascending ? 1 : -1;
2416 int expected_next_level = old_level + incr;
2417
b7b65b15
EZ
2418 /* Jump (or walk) to the other edge of this level. */
2419 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2420 /* Switch scan direction and peek at the next character in the
2421 new direction. */
2422 bidi_it->scan_dir = -bidi_it->scan_dir;
2423
2424 /* The following loop handles the case where the resolved level
2425 jumps by more than one. This is typical for numbers inside a
2426 run of text with left-to-right embedding direction, but can
2427 also happen in other situations. In those cases the decision
2428 where to continue after a level change, and in what direction,
2429 is tricky. For example, given a text like below:
2430
2431 abcdefgh
2432 11336622
2433
2434 (where the numbers below the text show the resolved levels),
2435 the result of reordering according to UAX#9 should be this:
2436
2437 efdcghba
2438
2439 This is implemented by the loop below which flips direction
2440 and jumps to the other edge of the level each time it finds
2441 the new level not to be the expected one. The expected level
2442 is always one more or one less than the previous one. */
2443 next_level = bidi_peek_at_next_level (bidi_it);
2444 while (next_level != expected_next_level)
2445 {
713bfeaa
EZ
2446 /* If next_level is -1, it means we have an unresolved level
2447 in the cache, which at this point should not happen. If
2448 it does, we will infloop. */
2449 eassert (next_level >= 0);
b7b65b15
EZ
2450 expected_next_level += incr;
2451 level_to_search += incr;
2452 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2453 bidi_it->scan_dir = -bidi_it->scan_dir;
2454 next_level = bidi_peek_at_next_level (bidi_it);
2455 }
2456
2457 /* Finally, deliver the next character in the new direction. */
2458 next_level = bidi_level_of_next_char (bidi_it);
2459 }
2460
b44d9321
EZ
2461 /* Take note when we have just processed the newline that precedes
2462 the end of the paragraph. The next time we are about to be
2463 called, set_iterator_to_next will automatically reinit the
2464 paragraph direction, if needed. We do this at the newline before
2465 the paragraph separator, because the next character might not be
2466 the first character of the next paragraph, due to the bidi
c0546589
EZ
2467 reordering, whereas we _must_ know the paragraph base direction
2468 _before_ we process the paragraph's text, since the base
2469 direction affects the reordering. */
1137e8b8
EZ
2470 if (bidi_it->scan_dir == 1
2471 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
be39f003 2472 {
87e67904
EZ
2473 /* The paragraph direction of the entire string, once
2474 determined, is in effect for the entire string. Setting the
2475 separator limit to the end of the string prevents
2476 bidi_paragraph_init from being called automatically on this
2477 string. */
9f257352 2478 if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
87e67904
EZ
2479 bidi_it->separator_limit = bidi_it->string.schars;
2480 else if (bidi_it->bytepos < ZV_BYTE)
be39f003 2481 {
5895d7b9 2482 ptrdiff_t sep_len
512a289d
RS
2483 = bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
2484 bidi_it->bytepos + bidi_it->ch_len);
87e67904 2485 if (bidi_it->nchars <= 0)
1088b922 2486 emacs_abort ();
87e67904
EZ
2487 if (sep_len >= 0)
2488 {
2489 bidi_it->new_paragraph = 1;
2490 /* Record the buffer position of the last character of the
2491 paragraph separator. */
512a289d
RS
2492 bidi_it->separator_limit
2493 = bidi_it->charpos + bidi_it->nchars + sep_len;
87e67904 2494 }
be39f003
EZ
2495 }
2496 }
6bff6497 2497
87e67904 2498 if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
b7b65b15
EZ
2499 {
2500 /* If we are at paragraph's base embedding level and beyond the
2501 last cached position, the cache's job is done and we can
2502 discard it. */
2503 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
182ce2d2
EZ
2504 && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
2505 + bidi_cache[bidi_cache_idx - 1].nchars - 1))
b7b65b15
EZ
2506 bidi_cache_reset ();
2507 /* But as long as we are caching during forward scan, we must
2508 cache each state, or else the cache integrity will be
2509 compromised: it assumes cached states correspond to buffer
2510 positions 1:1. */
2511 else
2512 bidi_cache_iterator_state (bidi_it, 1);
2513 }
acb28818
EZ
2514
2515 if (STRINGP (bidi_it->string.lstring))
2516 UNGCPRO;
b7b65b15
EZ
2517}
2518
2519/* This is meant to be called from within the debugger, whenever you
2520 wish to examine the cache contents. */
e78aecca 2521void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
b7b65b15
EZ
2522void
2523bidi_dump_cached_states (void)
2524{
39e378da 2525 ptrdiff_t i;
b7b65b15
EZ
2526 int ndigits = 1;
2527
2528 if (bidi_cache_idx == 0)
2529 {
2530 fprintf (stderr, "The cache is empty.\n");
2531 return;
2532 }
df9733bf 2533 fprintf (stderr, "Total of %"pD"d state%s in cache:\n",
b7b65b15
EZ
2534 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2535
2536 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2537 ndigits++;
2538 fputs ("ch ", stderr);
2539 for (i = 0; i < bidi_cache_idx; i++)
2540 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2541 fputs ("\n", stderr);
2542 fputs ("lvl ", stderr);
2543 for (i = 0; i < bidi_cache_idx; i++)
2544 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2545 fputs ("\n", stderr);
2546 fputs ("pos ", stderr);
2547 for (i = 0; i < bidi_cache_idx; i++)
7c85f529 2548 fprintf (stderr, "%*"pD"d", ndigits, bidi_cache[i].charpos);
b7b65b15
EZ
2549 fputs ("\n", stderr);
2550}