guile-elisp bootstrap (lisp)
[bpt/emacs.git] / src / bidi.c
CommitLineData
cbb09f04 1/* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
ba318903 2 Copyright (C) 2000-2001, 2004-2005, 2009-2014 Free Software
ab422c4d 3 Foundation, Inc.
b7b65b15
EZ
4
5This file is part of GNU Emacs.
6
a8d11bd3 7GNU Emacs is free software: you can redistribute it and/or modify
b7b65b15 8it under the terms of the GNU General Public License as published by
a8d11bd3
EZ
9the Free Software Foundation, either version 3 of the License, or
10(at your option) any later version.
b7b65b15
EZ
11
12GNU Emacs is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
b7b65b15 17You should have received a copy of the GNU General Public License
a8d11bd3 18along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
b7b65b15 19
2d6e4628
EZ
20/* Written by Eli Zaretskii <eliz@gnu.org>.
21
22 A sequential implementation of the Unicode Bidirectional algorithm,
cbb09f04 23 (UBA) as per UAX#9, a part of the Unicode Standard.
b7b65b15 24
aa5ccb01
EZ
25 Unlike the Reference Implementation and most other implementations,
26 this one is designed to be called once for every character in the
27 buffer or string. That way, we can leave intact the design of the
28 Emacs display engine, whereby an iterator object is used to
29 traverse buffer or string text character by character, and generate
30 the necessary data for displaying each character in 'struct glyph'
31 objects. (See xdisp.c for the details of that iteration.) The
32 functions on this file replace the original linear iteration in the
33 logical order of the text with a non-linear iteration in the visual
34 order, i.e. in the order characters should be shown on display.
b7b65b15 35
4b292a22 36 The main entry point is bidi_move_to_visually_next. Each time it
b7b65b15
EZ
37 is called, it finds the next character in the visual order, and
38 returns its information in a special structure. The caller is then
39 expected to process this character for display or any other
4b292a22
EZ
40 purposes, and call bidi_move_to_visually_next for the next
41 character. See the comments in bidi_move_to_visually_next for more
42 details about its algorithm that finds the next visual-order
b7b65b15
EZ
43 character by resolving their levels on the fly.
44
95e1afc6 45 Two other entry points are bidi_paragraph_init and
940afb59
EZ
46 bidi_mirror_char. The first determines the base direction of a
47 paragraph, while the second returns the mirrored version of its
48 argument character.
49
95e1afc6
EZ
50 A few auxiliary entry points are used to initialize the bidi
51 iterator for iterating an object (buffer or string), push and pop
52 the bidi iterator state, and save and restore the state of the bidi
53 cache.
54
89d3374a
EZ
55 If you want to understand the code, you will have to read it
56 together with the relevant portions of UAX#9. The comments include
57 references to UAX#9 rules, for that very reason.
58
b7b65b15
EZ
59 A note about references to UAX#9 rules: if the reference says
60 something like "X9/Retaining", it means that you need to refer to
cd1181db 61 rule X9 and to its modifications described in the "Implementation
aa5ccb01
EZ
62 Notes" section of UAX#9, under "Retaining Format Codes".
63
64 Here's the overview of the design of the reordering engine
65 implemented by this file.
66
67 Basic implementation structure
68 ------------------------------
69
70 The sequential processing steps described by UAX#9 are implemented
71 as recursive levels of processing, all of which examine the next
72 character in the logical order. This hierarchy of processing looks
73 as follows, from the innermost (deepest) to the outermost level,
74 omitting some subroutines used by each level:
75
76 bidi_fetch_char -- fetch next character
77 bidi_resolve_explicit -- resolve explicit levels and directions
78 bidi_resolve_weak -- resolve weak types
79 bidi_resolve_neutral -- resolve neutral types
80 bidi_level_of_next_char -- resolve implicit levels
81
82 Each level calls the level below it, and works on the result
83 returned by the lower level, including all of its sub-levels.
84
85 Unlike all the levels below it, bidi_level_of_next_char can return
86 the information about either the next or previous character in the
87 logical order, depending on the current direction of scanning the
88 buffer or string. For the next character, it calls all the levels
89 below it; for the previous character, it uses the cache, described
90 below.
91
92 Thus, the result of calling bidi_level_of_next_char is the resolved
93 level of the next or the previous character in the logical order.
94 Based on this information, the function bidi_move_to_visually_next
95 finds the next character in the visual order and updates the
96 direction in which the buffer is scanned, either forward or
97 backward, to find the next character to be displayed. (Text is
98 scanned backwards when it needs to be reversed for display, i.e. if
99 the visual order is the inverse of the logical order.) This
100 implements the last, reordering steps of the UBA, by successively
101 calling bidi_level_of_next_char until the character of the required
102 embedding level is found; the scan direction is dynamically updated
103 as a side effect. See the commentary before the 'while' loop in
104 bidi_move_to_visually_next, for the details.
105
106 Fetching characters
107 -------------------
108
109 In a nutshell, fetching the next character boils down to calling
110 STRING_CHAR_AND_LENGTH, passing it the address of a buffer or
111 string position. See bidi_fetch_char. However, if the next
112 character is "covered" by a display property of some kind,
113 bidi_fetch_char returns the u+FFFC "object replacement character"
114 that represents the entire run of text covered by the display
115 property. (The ch_len and nchars members of 'struct bidi_it'
116 reflect the length in bytes and characters of that text.) This is
117 so we reorder text on both sides of the display property as
118 appropriate for an image or embedded string. Similarly, text
119 covered by a display spec of the form '(space ...)', is replaced
120 with the u+2029 paragraph separator character, so such display
121 specs produce the same effect as a TAB under UBA. Both these
122 special characters are not actually displayed -- the display
123 property is displayed instead -- but just used to compute the
124 embedding level of the surrounding text so as to produce the
125 required effect.
126
127 Bidi iterator states
128 --------------------
129
130 The UBA is highly context dependent in some of its parts,
131 i.e. results of processing a character can generally depend on
132 characters very far away. The UAX#9 description of the UBA
133 prescribes a stateful processing of each character, whereby the
134 results of this processing depend on various state variables, such
135 as the current embedding level, level stack, and directional
136 override status. In addition, the UAX#9 description includes many
137 passages like this (from rule W2 in this case):
138
139 Search backward from each instance of a European number until the
140 first strong type (R, L, AL, or sos) is found. If an AL is found,
141 change the type of the European number to Arabic number.
142
143 To support this, we use a bidi iterator object, 'struct bidi_it',
144 which is a sub-structure of 'struct it' used by xdisp.c (see
145 dispextern.h for the definition of both of these structures). The
146 bidi iterator holds the entire state of the iteration required by
147 the UBA, and is updated as the text is traversed. In particular,
148 the embedding level of the current character being resolved is
149 recorded in the iterator state. To avoid costly searches backward
150 in support of rules like W2 above, the necessary character types
151 are also recorded in the iterator state as they are found during
152 the forward scan, and then used when such rules need to be applied.
153 (Forward scans cannot be avoided in this way; they need to be
154 performed at least once, and the results recorded in the iterator
155 state, to be reused until the forward scan oversteps the recorded
156 position.)
157
158 In this manner, the iterator state acts as a mini-cache of
159 contextual information required for resolving the level of the
160 current character by various UBA rules.
161
162 Caching of bidi iterator states
163 -------------------------------
164
165 As described above, the reordering engine uses the information
166 recorded in the bidi iterator state in order to resolve the
167 embedding level of the current character. When the reordering
168 engine needs to process the next character in the logical order, it
169 fetches it and applies to it all the UBA levels, updating the
170 iterator state as it goes. But when the buffer or string is
171 scanned backwards, i.e. in the reverse order of buffer/string
172 positions, the scanned characters were already processed during the
173 preceding forward scan (see bidi_find_other_level_edge). To avoid
174 costly re-processing of characters that were already processed
175 during the forward scan, the iterator states computed while
176 scanning forward are cached.
177
178 The cache is just a linear array of 'struct bidi_it' objects, which
179 is dynamically allocated and reallocated as needed, since the size
180 of the cache depends on the text being processed. We only need the
181 cache while processing embedded levels higher than the base
182 paragraph embedding level, because these higher levels require
183 changes in scan direction. Therefore, as soon as we are back to
184 the base embedding level, we can free the cache; see the calls to
185 bidi_cache_reset and bidi_cache_shrink, for the conditions to do
186 this.
187
188 The cache maintains the index of the next unused cache slot -- this
189 is where the next iterator state will be cached. The function
190 bidi_cache_iterator_state saves an instance of the state in the
191 cache and increments the unused slot index. The companion function
192 bidi_cache_find looks up a cached state that corresponds to a given
193 buffer/string position. All of the cached states must correspond
194 1:1 to the buffer or string region whose processing they reflect;
195 bidi.c will abort if it finds cache slots that violate this 1:1
196 correspondence.
197
198 When the parent iterator 'struct it' is pushed (see push_it in
199 xdisp.c) to pause the current iteration and start iterating over a
200 different object (e.g., a 'display' string that covers some buffer
201 text), the bidi iterator cache needs to be "pushed" as well, so
202 that a new empty cache could be used while iterating over the new
203 object. Later, when the new object is exhausted, and xdisp.c calls
204 pop_it, we need to "pop" the bidi cache as well and return to the
205 original cache. See bidi_push_it and bidi_pop_it for how this is
206 done.
207
208 Some functions of the display engine save copies of 'struct it' in
209 local variables, and restore them later. For examples, see
210 pos_visible_p and move_it_in_display_line_to in xdisp.c, and
211 window_scroll_pixel_based in window.c. When this happens, we need
212 to save and restore the bidi cache as well, because conceptually
213 the cache is part of the 'struct it' state, and needs to be in
214 perfect sync with the portion of the buffer/string that is being
215 processed. This saving and restoring of the cache state is handled
216 by bidi_shelve_cache and bidi_unshelve_cache, and the helper macros
217 SAVE_IT and RESTORE_IT defined on xdisp.c.
218
219 Note that, because reordering is implemented below the level in
220 xdisp.c that breaks glyphs into screen lines, we are violating
221 paragraph 3.4 of UAX#9. which mandates that line breaking shall be
222 done before reordering each screen line separately. However,
223 following UAX#9 to the letter in this matter goes against the basic
224 design of the Emacs display engine, and so we choose here this
225 minor deviation from the UBA letter in preference to redesign of
226 the display engine. The effect of this is only seen in continued
227 lines that are broken into screen lines in the middle of a run
228 whose direction is opposite to the paragraph's base direction.
229
230 Important design and implementation note: when the code needs to
231 scan far ahead, be sure to avoid such scans as much as possible
232 when the buffer/string doesn't contain any RTL characters. Users
233 of left-to-right scripts will never forgive you if you introduce
234 some slow-down due to bidi in situations that don't involve any
235 bidirectional text. See the large comment near the beginning of
236 bidi_resolve_neutral, for one situation where such shortcut was
237 necessary. */
b7b65b15 238
b7b65b15 239#include <config.h>
b7b65b15 240#include <stdio.h>
29e3d8d1 241
b7b65b15 242#include "lisp.h"
b7b65b15 243#include "character.h"
e5560ff7 244#include "buffer.h"
b7b65b15 245#include "dispextern.h"
e30b79c1 246#include "region-cache.h"
b7b65b15 247
2cc21167 248static bool bidi_initialized = 0;
b7b65b15 249
cbc4fd20 250static Lisp_Object bidi_type_table, bidi_mirror_table;
b7b65b15 251
c521fe29 252#define BIDI_EOB (-1)
b7b65b15 253
b7b65b15
EZ
254/* Data type for describing the bidirectional character categories. */
255typedef enum {
256 UNKNOWN_BC,
257 NEUTRAL,
258 WEAK,
ad826124
EZ
259 STRONG,
260 EXPLICIT_FORMATTING
b7b65b15
EZ
261} bidi_category_t;
262
32413314
EZ
263/* UAX#9 says to search only for L, AL, or R types of characters, and
264 ignore RLE, RLO, LRE, and LRO, when determining the base paragraph
265 level. Yudit indeed ignores them. This variable is therefore set
2cc21167
PE
266 by default to ignore them, but clearing it will take them into
267 account. */
268extern bool bidi_ignore_explicit_marks_for_paragraph_level EXTERNALLY_VISIBLE;
269bool bidi_ignore_explicit_marks_for_paragraph_level = 1;
b7b65b15 270
372b7a95 271static Lisp_Object paragraph_start_re, paragraph_separate_re;
6bff6497 272static Lisp_Object Qparagraph_start, Qparagraph_separate;
b7b65b15 273
cbb09f04
EZ
274\f
275/***********************************************************************
276 Utilities
277 ***********************************************************************/
b7b65b15 278
6bff6497
EZ
279/* Return the bidi type of a character CH, subject to the current
280 directional OVERRIDE. */
0a99eee1 281static bidi_type_t
6bff6497 282bidi_get_type (int ch, bidi_dir_t override)
b7b65b15 283{
6bff6497
EZ
284 bidi_type_t default_type;
285
e7402cb2
EZ
286 if (ch == BIDI_EOB)
287 return NEUTRAL_B;
e342a24d 288 if (ch < 0 || ch > MAX_CHAR)
1088b922 289 emacs_abort ();
6bff6497
EZ
290
291 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
bca633fb
EZ
292 /* Every valid character code, even those that are unassigned by the
293 UCD, have some bidi-class property, according to
294 DerivedBidiClass.txt file. Therefore, if we ever get UNKNOWN_BT
295 (= zero) code from CHAR_TABLE_REF, that's a bug. */
296 if (default_type == UNKNOWN_BT)
1088b922 297 emacs_abort ();
6bff6497 298
6bff6497
EZ
299 switch (default_type)
300 {
ad826124 301 case WEAK_BN:
6bff6497
EZ
302 case NEUTRAL_B:
303 case LRE:
304 case LRO:
305 case RLE:
306 case RLO:
307 case PDF:
308 return default_type;
ad826124
EZ
309 /* FIXME: The isolate controls are treated as BN until we add
310 support for UBA v6.3. */
311 case LRI:
312 case RLI:
313 case FSI:
314 case PDI:
315 return WEAK_BN;
6bff6497 316 default:
ad826124
EZ
317 if (override == L2R)
318 return STRONG_L;
319 else if (override == R2L)
320 return STRONG_R;
321 else
322 return default_type;
6bff6497 323 }
b7b65b15
EZ
324}
325
0a99eee1 326static void
2d6e4628
EZ
327bidi_check_type (bidi_type_t type)
328{
a54e2c05 329 eassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
2d6e4628
EZ
330}
331
b7b65b15 332/* Given a bidi TYPE of a character, return its category. */
0a99eee1 333static bidi_category_t
b7b65b15
EZ
334bidi_get_category (bidi_type_t type)
335{
336 switch (type)
337 {
338 case UNKNOWN_BT:
339 return UNKNOWN_BC;
340 case STRONG_L:
341 case STRONG_R:
342 case STRONG_AL:
b7b65b15 343 return STRONG;
b7b65b15
EZ
344 case WEAK_EN:
345 case WEAK_ES:
346 case WEAK_ET:
347 case WEAK_AN:
348 case WEAK_CS:
349 case WEAK_NSM:
350 case WEAK_BN:
ad826124
EZ
351 /* FIXME */
352 case LRI:
353 case RLI:
354 case FSI:
355 case PDI:
b7b65b15
EZ
356 return WEAK;
357 case NEUTRAL_B:
358 case NEUTRAL_S:
359 case NEUTRAL_WS:
360 case NEUTRAL_ON:
361 return NEUTRAL;
ad826124
EZ
362 case LRE:
363 case LRO:
364 case RLE:
365 case RLO:
366 case PDF:
367#if 0
368 /* FIXME: This awaits implementation of isolate support. */
369 case LRI:
370 case RLI:
371 case FSI:
372 case PDI:
373#endif
374 return EXPLICIT_FORMATTING;
b7b65b15 375 default:
1088b922 376 emacs_abort ();
b7b65b15
EZ
377 }
378}
379
a6e8d97c
EZ
380/* Return the mirrored character of C, if it has one. If C has no
381 mirrored counterpart, return C.
382 Note: The conditions in UAX#9 clause L4 regarding the surrounding
383 context must be tested by the caller. */
b7b65b15
EZ
384int
385bidi_mirror_char (int c)
386{
cbc4fd20
EZ
387 Lisp_Object val;
388
389 if (c == BIDI_EOB)
390 return c;
391 if (c < 0 || c > MAX_CHAR)
1088b922 392 emacs_abort ();
b7b65b15 393
cbc4fd20
EZ
394 val = CHAR_TABLE_REF (bidi_mirror_table, c);
395 if (INTEGERP (val))
b7b65b15 396 {
da92a98c 397 int v;
9e1a06fc 398
da92a98c
PE
399 /* When debugging, check before assigning to V, so that the check
400 isn't broken by undefined behavior due to int overflow. */
81899c91 401 eassert (CHAR_VALID_P (XINT (val)));
9e1a06fc 402
da92a98c
PE
403 v = XINT (val);
404
9e1a06fc
EZ
405 /* Minimal test we must do in optimized builds, to prevent weird
406 crashes further down the road. */
407 if (v < 0 || v > MAX_CHAR)
1088b922 408 emacs_abort ();
9e1a06fc
EZ
409
410 return v;
b7b65b15 411 }
cbc4fd20 412
b7b65b15
EZ
413 return c;
414}
415
cbb09f04
EZ
416/* Determine the start-of-run (sor) directional type given the two
417 embedding levels on either side of the run boundary. Also, update
418 the saved info about previously seen characters, since that info is
419 generally valid for a single level run. */
0a99eee1 420static void
cbb09f04
EZ
421bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
422{
512a289d 423 int higher_level = (level_before > level_after ? level_before : level_after);
cbb09f04
EZ
424
425 /* The prev_was_pdf gork is required for when we have several PDFs
426 in a row. In that case, we want to compute the sor type for the
427 next level run only once: when we see the first PDF. That's
428 because the sor type depends only on the higher of the two levels
429 that we find on the two sides of the level boundary (see UAX#9,
430 clause X10), and so we don't need to know the final embedding
431 level to which we descend after processing all the PDFs. */
432 if (!bidi_it->prev_was_pdf || level_before < level_after)
433 /* FIXME: should the default sor direction be user selectable? */
512a289d 434 bidi_it->sor = ((higher_level & 1) != 0 ? R2L : L2R);
cbb09f04
EZ
435 if (level_before > level_after)
436 bidi_it->prev_was_pdf = 1;
437
438 bidi_it->prev.type = UNKNOWN_BT;
512a289d
RS
439 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1
440 = bidi_it->last_strong.orig_type = UNKNOWN_BT;
441 bidi_it->prev_for_neutral.type = (bidi_it->sor == R2L ? STRONG_R : STRONG_L);
cbb09f04
EZ
442 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
443 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
512a289d
RS
444 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1
445 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
cbb09f04
EZ
446 bidi_it->ignore_bn_limit = -1; /* meaning it's unknown */
447}
448
449/* Push the current embedding level and override status; reset the
450 current level to LEVEL and the current override status to OVERRIDE. */
0a99eee1 451static void
cbb09f04
EZ
452bidi_push_embedding_level (struct bidi_it *bidi_it,
453 int level, bidi_dir_t override)
454{
455 bidi_it->stack_idx++;
a54e2c05 456 eassert (bidi_it->stack_idx < BIDI_MAXLEVEL);
cbb09f04
EZ
457 bidi_it->level_stack[bidi_it->stack_idx].level = level;
458 bidi_it->level_stack[bidi_it->stack_idx].override = override;
459}
460
461/* Pop the embedding level and directional override status from the
462 stack, and return the new level. */
0a99eee1 463static int
cbb09f04
EZ
464bidi_pop_embedding_level (struct bidi_it *bidi_it)
465{
466 /* UAX#9 says to ignore invalid PDFs. */
467 if (bidi_it->stack_idx > 0)
468 bidi_it->stack_idx--;
469 return bidi_it->level_stack[bidi_it->stack_idx].level;
470}
471
472/* Record in SAVED_INFO the information about the current character. */
0a99eee1 473static void
cbb09f04
EZ
474bidi_remember_char (struct bidi_saved_info *saved_info,
475 struct bidi_it *bidi_it)
476{
477 saved_info->charpos = bidi_it->charpos;
478 saved_info->bytepos = bidi_it->bytepos;
479 saved_info->type = bidi_it->type;
480 bidi_check_type (bidi_it->type);
481 saved_info->type_after_w1 = bidi_it->type_after_w1;
482 bidi_check_type (bidi_it->type_after_w1);
483 saved_info->orig_type = bidi_it->orig_type;
484 bidi_check_type (bidi_it->orig_type);
485}
486
b7b65b15
EZ
487/* Copy the bidi iterator from FROM to TO. To save cycles, this only
488 copies the part of the level stack that is actually in use. */
0a99eee1 489static void
b7b65b15
EZ
490bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
491{
05584c31
PE
492 /* Copy everything from the start through the active part of
493 the level stack. */
494 memcpy (to, from,
495 (offsetof (struct bidi_it, level_stack[1])
496 + from->stack_idx * sizeof from->level_stack[0]));
b7b65b15
EZ
497}
498
cbb09f04
EZ
499\f
500/***********************************************************************
501 Caching the bidi iterator states
502 ***********************************************************************/
b7b65b15 503
2fe72643
EZ
504#define BIDI_CACHE_CHUNK 200
505static struct bidi_it *bidi_cache;
39e378da 506static ptrdiff_t bidi_cache_size = 0;
ad6042bb 507enum { elsz = sizeof (struct bidi_it) };
39e378da
PE
508static ptrdiff_t bidi_cache_idx; /* next unused cache slot */
509static ptrdiff_t bidi_cache_last_idx; /* slot of last cache hit */
510static ptrdiff_t bidi_cache_start = 0; /* start of cache for this
87e67904
EZ
511 "stack" level */
512
bc18e09d
PE
513/* 5-slot stack for saving the start of the previous level of the
514 cache. xdisp.c maintains a 5-slot stack for its iterator state,
515 and we need the same size of our stack. */
516static ptrdiff_t bidi_cache_start_stack[IT_STACK_SIZE];
517static int bidi_cache_sp;
518
519/* Size of header used by bidi_shelve_cache. */
520enum
521 {
512a289d
RS
522 bidi_shelve_header_size
523 = (sizeof (bidi_cache_idx) + sizeof (bidi_cache_start_stack)
524 + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start)
525 + sizeof (bidi_cache_last_idx))
bc18e09d
PE
526 };
527
87e67904
EZ
528/* Reset the cache state to the empty state. We only reset the part
529 of the cache relevant to iteration of the current object. Previous
530 objects, which are pushed on the display iterator's stack, are left
531 intact. This is called when the cached information is no more
532 useful for the current iteration, e.g. when we were reseated to a
533 new position on the same object. */
0a99eee1 534static void
b7b65b15
EZ
535bidi_cache_reset (void)
536{
87e67904 537 bidi_cache_idx = bidi_cache_start;
b7b65b15
EZ
538 bidi_cache_last_idx = -1;
539}
540
87e67904
EZ
541/* Shrink the cache to its minimal size. Called when we init the bidi
542 iterator for reordering a buffer or a string that does not come
543 from display properties, because that means all the previously
544 cached info is of no further use. */
0a99eee1 545static void
2fe72643
EZ
546bidi_cache_shrink (void)
547{
548 if (bidi_cache_size > BIDI_CACHE_CHUNK)
549 {
38182d90 550 bidi_cache = xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
51f30bc5 551 bidi_cache_size = BIDI_CACHE_CHUNK;
2fe72643
EZ
552 }
553 bidi_cache_reset ();
554}
555
0a99eee1 556static void
39e378da 557bidi_cache_fetch_state (ptrdiff_t idx, struct bidi_it *bidi_it)
b7b65b15
EZ
558{
559 int current_scan_dir = bidi_it->scan_dir;
560
87e67904 561 if (idx < bidi_cache_start || idx >= bidi_cache_idx)
1088b922 562 emacs_abort ();
b7b65b15
EZ
563
564 bidi_copy_it (bidi_it, &bidi_cache[idx]);
565 bidi_it->scan_dir = current_scan_dir;
566 bidi_cache_last_idx = idx;
567}
568
569/* Find a cached state with a given CHARPOS and resolved embedding
570 level less or equal to LEVEL. if LEVEL is -1, disregard the
571 resolved levels in cached states. DIR, if non-zero, means search
572 in that direction from the last cache hit. */
0a99eee1 573static ptrdiff_t
d311d28c 574bidi_cache_search (ptrdiff_t charpos, int level, int dir)
b7b65b15 575{
39e378da 576 ptrdiff_t i, i_start;
b7b65b15 577
6eec7596 578 if (bidi_cache_idx > bidi_cache_start)
b7b65b15 579 {
6eec7596
EZ
580 if (bidi_cache_last_idx == -1)
581 bidi_cache_last_idx = bidi_cache_idx - 1;
b7b65b15 582 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
182ce2d2
EZ
583 {
584 dir = -1;
585 i_start = bidi_cache_last_idx - 1;
586 }
587 else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
588 + bidi_cache[bidi_cache_last_idx].nchars - 1))
589 {
590 dir = 1;
591 i_start = bidi_cache_last_idx + 1;
592 }
593 else if (dir)
b7b65b15
EZ
594 i_start = bidi_cache_last_idx;
595 else
596 {
597 dir = -1;
598 i_start = bidi_cache_idx - 1;
599 }
600
601 if (dir < 0)
602 {
603 /* Linear search for now; FIXME! */
87e67904 604 for (i = i_start; i >= bidi_cache_start; i--)
182ce2d2
EZ
605 if (bidi_cache[i].charpos <= charpos
606 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
607 && (level == -1 || bidi_cache[i].resolved_level <= level))
608 return i;
609 }
610 else
611 {
612 for (i = i_start; i < bidi_cache_idx; i++)
182ce2d2
EZ
613 if (bidi_cache[i].charpos <= charpos
614 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
b7b65b15
EZ
615 && (level == -1 || bidi_cache[i].resolved_level <= level))
616 return i;
617 }
618 }
619
620 return -1;
621}
622
623/* Find a cached state where the resolved level changes to a value
624 that is lower than LEVEL, and return its cache slot index. DIR is
625 the direction to search, starting with the last used cache slot.
87e67904 626 If DIR is zero, we search backwards from the last occupied cache
2cc21167 627 slot. BEFORE means return the index of the slot that
87e67904 628 is ``before'' the level change in the search direction. That is,
b7b65b15
EZ
629 given the cached levels like this:
630
631 1122333442211
632 AB C
633
634 and assuming we are at the position cached at the slot marked with
635 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
636 index of slot B or A, depending whether BEFORE is, respectively,
2cc21167 637 true or false. */
39e378da 638static ptrdiff_t
2cc21167 639bidi_cache_find_level_change (int level, int dir, bool before)
b7b65b15
EZ
640{
641 if (bidi_cache_idx)
642 {
39e378da 643 ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
b7b65b15
EZ
644 int incr = before ? 1 : 0;
645
a54e2c05 646 eassert (!dir || bidi_cache_last_idx >= 0);
6eec7596 647
b7b65b15
EZ
648 if (!dir)
649 dir = -1;
650 else if (!incr)
651 i += dir;
652
653 if (dir < 0)
654 {
87e67904 655 while (i >= bidi_cache_start + incr)
b7b65b15
EZ
656 {
657 if (bidi_cache[i - incr].resolved_level >= 0
658 && bidi_cache[i - incr].resolved_level < level)
659 return i;
660 i--;
661 }
662 }
663 else
664 {
665 while (i < bidi_cache_idx - incr)
666 {
667 if (bidi_cache[i + incr].resolved_level >= 0
668 && bidi_cache[i + incr].resolved_level < level)
669 return i;
670 i++;
671 }
672 }
673 }
674
675 return -1;
676}
677
0a99eee1 678static void
39e378da 679bidi_cache_ensure_space (ptrdiff_t idx)
58b9f433
EZ
680{
681 /* Enlarge the cache as needed. */
682 if (idx >= bidi_cache_size)
683 {
f0eb61e9
PE
684 /* The bidi cache cannot be larger than the largest Lisp string
685 or buffer. */
512a289d
RS
686 ptrdiff_t string_or_buffer_bound
687 = max (BUF_BYTES_MAX, STRING_BYTES_BOUND);
f0eb61e9
PE
688
689 /* Also, it cannot be larger than what C can represent. */
512a289d
RS
690 ptrdiff_t c_bound
691 = (min (PTRDIFF_MAX, SIZE_MAX) - bidi_shelve_header_size) / elsz;
f0eb61e9 692
512a289d
RS
693 bidi_cache
694 = xpalloc (bidi_cache, &bidi_cache_size,
695 max (BIDI_CACHE_CHUNK, idx - bidi_cache_size + 1),
696 min (string_or_buffer_bound, c_bound), elsz);
58b9f433
EZ
697 }
698}
699
0a99eee1 700static void
2cc21167 701bidi_cache_iterator_state (struct bidi_it *bidi_it, bool resolved)
b7b65b15 702{
39e378da 703 ptrdiff_t idx;
b7b65b15
EZ
704
705 /* We should never cache on backward scans. */
706 if (bidi_it->scan_dir == -1)
1088b922 707 emacs_abort ();
b7b65b15
EZ
708 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
709
710 if (idx < 0)
711 {
712 idx = bidi_cache_idx;
58b9f433 713 bidi_cache_ensure_space (idx);
bd924a5d
EZ
714 /* Character positions should correspond to cache positions 1:1.
715 If we are outside the range of cached positions, the cache is
716 useless and must be reset. */
87e67904 717 if (idx > bidi_cache_start &&
182ce2d2
EZ
718 (bidi_it->charpos > (bidi_cache[idx - 1].charpos
719 + bidi_cache[idx - 1].nchars)
a1344e7d 720 || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
bd924a5d
EZ
721 {
722 bidi_cache_reset ();
87e67904 723 idx = bidi_cache_start;
bd924a5d 724 }
102ebb00 725 if (bidi_it->nchars <= 0)
1088b922 726 emacs_abort ();
b7b65b15
EZ
727 bidi_copy_it (&bidi_cache[idx], bidi_it);
728 if (!resolved)
729 bidi_cache[idx].resolved_level = -1;
730 }
731 else
732 {
733 /* Copy only the members which could have changed, to avoid
734 costly copying of the entire struct. */
735 bidi_cache[idx].type = bidi_it->type;
2d6e4628 736 bidi_check_type (bidi_it->type);
89d3374a
EZ
737 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
738 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
739 if (resolved)
740 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
741 else
742 bidi_cache[idx].resolved_level = -1;
743 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
744 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
745 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
746 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
747 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
f67cdd7f 748 bidi_cache[idx].disp_pos = bidi_it->disp_pos;
0c95fcf7 749 bidi_cache[idx].disp_prop = bidi_it->disp_prop;
b7b65b15
EZ
750 }
751
752 bidi_cache_last_idx = idx;
753 if (idx >= bidi_cache_idx)
754 bidi_cache_idx = idx + 1;
755}
756
0a99eee1 757static bidi_type_t
d311d28c 758bidi_cache_find (ptrdiff_t charpos, int level, struct bidi_it *bidi_it)
b7b65b15 759{
39e378da 760 ptrdiff_t i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
b7b65b15 761
87e67904 762 if (i >= bidi_cache_start)
b7b65b15
EZ
763 {
764 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
765
5009f85e 766 bidi_copy_it (bidi_it, &bidi_cache[i]);
b7b65b15 767 bidi_cache_last_idx = i;
5a5fa834 768 /* Don't let scan direction from the cached state override
b7b65b15
EZ
769 the current scan direction. */
770 bidi_it->scan_dir = current_scan_dir;
771 return bidi_it->type;
772 }
773
774 return UNKNOWN_BT;
775}
776
0a99eee1 777static int
b7b65b15
EZ
778bidi_peek_at_next_level (struct bidi_it *bidi_it)
779{
87e67904 780 if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
1088b922 781 emacs_abort ();
b7b65b15
EZ
782 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
783}
784
cbb09f04 785\f
58b9f433
EZ
786/***********************************************************************
787 Pushing and popping the bidi iterator state
788 ***********************************************************************/
58b9f433
EZ
789
790/* Push the bidi iterator state in preparation for reordering a
791 different object, e.g. display string found at certain buffer
7e2ad32c
EZ
792 position. Pushing the bidi iterator boils down to saving its
793 entire state on the cache and starting a new cache "stacked" on top
794 of the current cache. */
58b9f433
EZ
795void
796bidi_push_it (struct bidi_it *bidi_it)
b7b65b15 797{
58b9f433
EZ
798 /* Save the current iterator state in its entirety after the last
799 used cache slot. */
800 bidi_cache_ensure_space (bidi_cache_idx);
ae1d87e2 801 bidi_cache[bidi_cache_idx++] = *bidi_it;
be39f003 802
58b9f433 803 /* Push the current cache start onto the stack. */
a54e2c05 804 eassert (bidi_cache_sp < IT_STACK_SIZE);
58b9f433 805 bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
be39f003 806
58b9f433
EZ
807 /* Start a new level of cache, and make it empty. */
808 bidi_cache_start = bidi_cache_idx;
809 bidi_cache_last_idx = -1;
810}
811
812/* Restore the iterator state saved by bidi_push_it and return the
813 cache to the corresponding state. */
814void
815bidi_pop_it (struct bidi_it *bidi_it)
816{
817 if (bidi_cache_start <= 0)
1088b922 818 emacs_abort ();
58b9f433
EZ
819
820 /* Reset the next free cache slot index to what it was before the
821 call to bidi_push_it. */
822 bidi_cache_idx = bidi_cache_start - 1;
823
824 /* Restore the bidi iterator state saved in the cache. */
ae1d87e2 825 *bidi_it = bidi_cache[bidi_cache_idx];
58b9f433
EZ
826
827 /* Pop the previous cache start from the stack. */
828 if (bidi_cache_sp <= 0)
1088b922 829 emacs_abort ();
58b9f433
EZ
830 bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
831
832 /* Invalidate the last-used cache slot data. */
833 bidi_cache_last_idx = -1;
834}
835
ec7cc85b 836static ptrdiff_t bidi_cache_total_alloc;
35928349 837
ed94e6d7
EZ
838/* Stash away a copy of the cache and its control variables. */
839void *
840bidi_shelve_cache (void)
841{
842 unsigned char *databuf;
458bfed3 843 ptrdiff_t alloc;
ed94e6d7 844
35928349 845 /* Empty cache. */
ed94e6d7
EZ
846 if (bidi_cache_idx == 0)
847 return NULL;
848
458bfed3
PE
849 alloc = (bidi_shelve_header_size
850 + bidi_cache_idx * sizeof (struct bidi_it));
851 databuf = xmalloc (alloc);
852 bidi_cache_total_alloc += alloc;
35928349 853
ed94e6d7
EZ
854 memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx));
855 memcpy (databuf + sizeof (bidi_cache_idx),
856 bidi_cache, bidi_cache_idx * sizeof (struct bidi_it));
857 memcpy (databuf + sizeof (bidi_cache_idx)
858 + bidi_cache_idx * sizeof (struct bidi_it),
859 bidi_cache_start_stack, sizeof (bidi_cache_start_stack));
860 memcpy (databuf + sizeof (bidi_cache_idx)
861 + bidi_cache_idx * sizeof (struct bidi_it)
862 + sizeof (bidi_cache_start_stack),
863 &bidi_cache_sp, sizeof (bidi_cache_sp));
864 memcpy (databuf + sizeof (bidi_cache_idx)
865 + bidi_cache_idx * sizeof (struct bidi_it)
866 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
867 &bidi_cache_start, sizeof (bidi_cache_start));
868 memcpy (databuf + sizeof (bidi_cache_idx)
869 + bidi_cache_idx * sizeof (struct bidi_it)
870 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
871 + sizeof (bidi_cache_start),
872 &bidi_cache_last_idx, sizeof (bidi_cache_last_idx));
873
874 return databuf;
875}
876
d1410150
EZ
877/* Restore the cache state from a copy stashed away by
878 bidi_shelve_cache, and free the buffer used to stash that copy.
2cc21167 879 JUST_FREE means free the buffer, but don't restore the
d1410150
EZ
880 cache; used when the corresponding iterator is discarded instead of
881 being restored. */
ed94e6d7 882void
2cc21167 883bidi_unshelve_cache (void *databuf, bool just_free)
ed94e6d7
EZ
884{
885 unsigned char *p = databuf;
886
887 if (!p)
be39f003 888 {
d1410150
EZ
889 if (!just_free)
890 {
891 /* A NULL pointer means an empty cache. */
892 bidi_cache_start = 0;
893 bidi_cache_sp = 0;
894 bidi_cache_reset ();
895 }
be39f003 896 }
ed94e6d7
EZ
897 else
898 {
35928349
EZ
899 if (just_free)
900 {
901 ptrdiff_t idx;
902
903 memcpy (&idx, p, sizeof (bidi_cache_idx));
512a289d
RS
904 bidi_cache_total_alloc
905 -= bidi_shelve_header_size + idx * sizeof (struct bidi_it);
35928349
EZ
906 }
907 else
908 {
909 memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx));
910 bidi_cache_ensure_space (bidi_cache_idx);
911 memcpy (bidi_cache, p + sizeof (bidi_cache_idx),
912 bidi_cache_idx * sizeof (struct bidi_it));
913 memcpy (bidi_cache_start_stack,
914 p + sizeof (bidi_cache_idx)
915 + bidi_cache_idx * sizeof (struct bidi_it),
916 sizeof (bidi_cache_start_stack));
917 memcpy (&bidi_cache_sp,
918 p + sizeof (bidi_cache_idx)
919 + bidi_cache_idx * sizeof (struct bidi_it)
920 + sizeof (bidi_cache_start_stack),
921 sizeof (bidi_cache_sp));
922 memcpy (&bidi_cache_start,
923 p + sizeof (bidi_cache_idx)
924 + bidi_cache_idx * sizeof (struct bidi_it)
925 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
926 sizeof (bidi_cache_start));
927 memcpy (&bidi_cache_last_idx,
928 p + sizeof (bidi_cache_idx)
929 + bidi_cache_idx * sizeof (struct bidi_it)
930 + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
931 + sizeof (bidi_cache_start),
932 sizeof (bidi_cache_last_idx));
512a289d
RS
933 bidi_cache_total_alloc
934 -= (bidi_shelve_header_size
935 + bidi_cache_idx * sizeof (struct bidi_it));
35928349 936 }
ed94e6d7
EZ
937
938 xfree (p);
939 }
940}
b7b65b15 941
58b9f433 942\f
cbb09f04
EZ
943/***********************************************************************
944 Initialization
945 ***********************************************************************/
946static void
947bidi_initialize (void)
b7b65b15 948{
474a8465
EZ
949 bidi_type_table = uniprop_table (intern ("bidi-class"));
950 if (NILP (bidi_type_table))
1088b922 951 emacs_abort ();
cbb09f04
EZ
952 staticpro (&bidi_type_table);
953
474a8465
EZ
954 bidi_mirror_table = uniprop_table (intern ("mirroring"));
955 if (NILP (bidi_mirror_table))
1088b922 956 emacs_abort ();
cbb09f04
EZ
957 staticpro (&bidi_mirror_table);
958
cbb09f04
EZ
959 Qparagraph_start = intern ("paragraph-start");
960 staticpro (&Qparagraph_start);
961 paragraph_start_re = Fsymbol_value (Qparagraph_start);
962 if (!STRINGP (paragraph_start_re))
963 paragraph_start_re = build_string ("\f\\|[ \t]*$");
964 staticpro (&paragraph_start_re);
965 Qparagraph_separate = intern ("paragraph-separate");
966 staticpro (&Qparagraph_separate);
967 paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
968 if (!STRINGP (paragraph_separate_re))
969 paragraph_separate_re = build_string ("[ \t\f]*$");
970 staticpro (&paragraph_separate_re);
58b9f433
EZ
971
972 bidi_cache_sp = 0;
ec7cc85b 973 bidi_cache_total_alloc = 0;
58b9f433 974
cbb09f04 975 bidi_initialized = 1;
b7b65b15
EZ
976}
977
cbb09f04
EZ
978/* Do whatever UAX#9 clause X8 says should be done at paragraph's
979 end. */
0a99eee1 980static void
cbb09f04 981bidi_set_paragraph_end (struct bidi_it *bidi_it)
b7b65b15 982{
cbb09f04
EZ
983 bidi_it->invalid_levels = 0;
984 bidi_it->invalid_rl_levels = -1;
985 bidi_it->stack_idx = 0;
986 bidi_it->resolved_level = bidi_it->level_stack[0].level;
987}
b7b65b15 988
cbb09f04
EZ
989/* Initialize the bidi iterator from buffer/string position CHARPOS. */
990void
2cc21167 991bidi_init_it (ptrdiff_t charpos, ptrdiff_t bytepos, bool frame_window_p,
cbb09f04
EZ
992 struct bidi_it *bidi_it)
993{
994 if (! bidi_initialized)
995 bidi_initialize ();
996 if (charpos >= 0)
997 bidi_it->charpos = charpos;
998 if (bytepos >= 0)
999 bidi_it->bytepos = bytepos;
1000 bidi_it->frame_window_p = frame_window_p;
1001 bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */
1002 bidi_it->first_elt = 1;
1003 bidi_set_paragraph_end (bidi_it);
1004 bidi_it->new_paragraph = 1;
1005 bidi_it->separator_limit = -1;
1006 bidi_it->type = NEUTRAL_B;
1007 bidi_it->type_after_w1 = NEUTRAL_B;
1008 bidi_it->orig_type = NEUTRAL_B;
1009 bidi_it->prev_was_pdf = 0;
512a289d
RS
1010 bidi_it->prev.type = bidi_it->prev.type_after_w1
1011 = bidi_it->prev.orig_type = UNKNOWN_BT;
1012 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1
1013 = bidi_it->last_strong.orig_type = UNKNOWN_BT;
cbb09f04 1014 bidi_it->next_for_neutral.charpos = -1;
512a289d
RS
1015 bidi_it->next_for_neutral.type
1016 = bidi_it->next_for_neutral.type_after_w1
1017 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
cbb09f04 1018 bidi_it->prev_for_neutral.charpos = -1;
512a289d
RS
1019 bidi_it->prev_for_neutral.type
1020 = bidi_it->prev_for_neutral.type_after_w1
1021 = bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
cbb09f04
EZ
1022 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
1023 bidi_it->disp_pos = -1; /* invalid/unknown */
0c95fcf7 1024 bidi_it->disp_prop = 0;
cbb09f04
EZ
1025 /* We can only shrink the cache if we are at the bottom level of its
1026 "stack". */
1027 if (bidi_cache_start == 0)
1028 bidi_cache_shrink ();
58b9f433
EZ
1029 else
1030 bidi_cache_reset ();
b7b65b15
EZ
1031}
1032
182ce2d2 1033/* Perform initializations for reordering a new line of bidi text. */
6bff6497 1034static void
be39f003
EZ
1035bidi_line_init (struct bidi_it *bidi_it)
1036{
1037 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
1038 bidi_it->resolved_level = bidi_it->level_stack[0].level;
1039 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
1040 bidi_it->invalid_levels = 0;
1041 bidi_it->invalid_rl_levels = -1;
4787455f
EZ
1042 /* Setting this to zero will force its recomputation the first time
1043 we need it for W5. */
1044 bidi_it->next_en_pos = 0;
7b5d6677 1045 bidi_it->next_en_type = UNKNOWN_BT;
be39f003 1046 bidi_it->next_for_ws.type = UNKNOWN_BT;
b44d9321 1047 bidi_set_sor_type (bidi_it,
512a289d 1048 (bidi_it->paragraph_dir == R2L ? 1 : 0),
be39f003
EZ
1049 bidi_it->level_stack[0].level); /* X10 */
1050
1051 bidi_cache_reset ();
1052}
1053
cbb09f04
EZ
1054\f
1055/***********************************************************************
1056 Fetching characters
1057 ***********************************************************************/
1058
f3014ef5
EZ
1059/* Count bytes in string S between BEG/BEGBYTE and END. BEG and END
1060 are zero-based character positions in S, BEGBYTE is byte position
2cc21167 1061 corresponding to BEG. UNIBYTE means S is a unibyte string. */
0a99eee1 1062static ptrdiff_t
b7ad2f74
PE
1063bidi_count_bytes (const unsigned char *s, ptrdiff_t beg,
1064 ptrdiff_t begbyte, ptrdiff_t end, bool unibyte)
87e67904 1065{
d311d28c 1066 ptrdiff_t pos = beg;
87e67904
EZ
1067 const unsigned char *p = s + begbyte, *start = p;
1068
f3014ef5
EZ
1069 if (unibyte)
1070 p = s + end;
1071 else
87e67904 1072 {
f3014ef5 1073 if (!CHAR_HEAD_P (*p))
1088b922 1074 emacs_abort ();
f3014ef5
EZ
1075
1076 while (pos < end)
1077 {
1078 p += BYTES_BY_CHAR_HEAD (*p);
1079 pos++;
1080 }
87e67904
EZ
1081 }
1082
1083 return p - start;
1084}
1085
05584c31 1086/* Fetch and return the character at byte position BYTEPOS. If S is
87e67904 1087 non-NULL, fetch the character from string S; otherwise fetch the
2cc21167 1088 character from the current buffer. UNIBYTE means S is a
f3014ef5 1089 unibyte string. */
0a99eee1 1090static int
2cc21167 1091bidi_char_at_pos (ptrdiff_t bytepos, const unsigned char *s, bool unibyte)
87e67904
EZ
1092{
1093 if (s)
f3014ef5 1094 {
05584c31 1095 s += bytepos;
f3014ef5 1096 if (unibyte)
05584c31 1097 return *s;
f3014ef5 1098 }
87e67904 1099 else
05584c31
PE
1100 s = BYTE_POS_ADDR (bytepos);
1101 return STRING_CHAR (s);
87e67904
EZ
1102}
1103
36075a19 1104/* Fetch and return the character at CHARPOS/BYTEPOS. If that
102ebb00 1105 character is covered by a display string, treat the entire run of
0c95fcf7
EZ
1106 covered characters as a single character, either u+2029 or u+FFFC,
1107 and return their combined length in CH_LEN and NCHARS. DISP_POS
1108 specifies the character position of the next display string, or -1
b75258b3
EZ
1109 if not yet computed. When the next character is at or beyond that
1110 position, the function updates DISP_POS with the position of the
2cc21167 1111 next display string. *DISP_PROP non-zero means that there's really
0c95fcf7 1112 a display string at DISP_POS, as opposed to when we searched till
2cc21167 1113 DISP_POS without finding one. If *DISP_PROP is 2, it means the
0c95fcf7 1114 display spec is of the form `(space ...)', which is replaced with
b75258b3
EZ
1115 u+2029 to handle it as a paragraph separator. STRING->s is the C
1116 string to iterate, or NULL if iterating over a buffer or a Lisp
1117 string; in the latter case, STRING->lstring is the Lisp string. */
0a99eee1 1118static int
36075a19 1119bidi_fetch_char (ptrdiff_t charpos, ptrdiff_t bytepos, ptrdiff_t *disp_pos,
0c95fcf7 1120 int *disp_prop, struct bidi_string_data *string,
5bf97bfc 1121 struct window *w,
2cc21167 1122 bool frame_window_p, ptrdiff_t *ch_len, ptrdiff_t *nchars)
182ce2d2
EZ
1123{
1124 int ch;
5895d7b9 1125 ptrdiff_t endpos
512a289d 1126 = (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
6db161be 1127 struct text_pos pos;
e99a9b8b 1128 int len;
182ce2d2 1129
182ce2d2 1130 /* If we got past the last known position of display string, compute
87e67904
EZ
1131 the position of the next one. That position could be at CHARPOS. */
1132 if (charpos < endpos && charpos > *disp_pos)
6db161be
EZ
1133 {
1134 SET_TEXT_POS (pos, charpos, bytepos);
5bf97bfc 1135 *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
0c95fcf7 1136 disp_prop);
6db161be 1137 }
102ebb00
EZ
1138
1139 /* Fetch the character at BYTEPOS. */
87e67904 1140 if (charpos >= endpos)
182ce2d2
EZ
1141 {
1142 ch = BIDI_EOB;
1143 *ch_len = 1;
1144 *nchars = 1;
87e67904 1145 *disp_pos = endpos;
0c95fcf7 1146 *disp_prop = 0;
182ce2d2 1147 }
0c95fcf7 1148 else if (charpos >= *disp_pos && *disp_prop)
182ce2d2 1149 {
d311d28c 1150 ptrdiff_t disp_end_pos;
7b600102
EZ
1151
1152 /* We don't expect to find ourselves in the middle of a display
1153 property. Hopefully, it will never be needed. */
1154 if (charpos > *disp_pos)
1088b922 1155 emacs_abort ();
0c95fcf7
EZ
1156 /* Text covered by `display' properties and overlays with
1157 display properties or display strings is handled as a single
1158 character that represents the entire run of characters
1159 covered by the display property. */
1160 if (*disp_prop == 2)
1161 {
1162 /* `(space ...)' display specs are handled as paragraph
1163 separators for the purposes of the reordering; see UAX#9
1164 section 3 and clause HL1 in section 4.3 there. */
1165 ch = 0x2029;
1166 }
1167 else
1168 {
1169 /* All other display specs are handled as the Unicode Object
1170 Replacement Character. */
1171 ch = 0xFFFC;
1172 }
87e67904 1173 disp_end_pos = compute_display_string_end (*disp_pos, string);
fbcaa2f3
EZ
1174 if (disp_end_pos < 0)
1175 {
1176 /* Somebody removed the display string from the buffer
1177 behind our back. Recover by processing this buffer
1178 position as if no display property were present there to
1179 begin with. */
1180 *disp_prop = 0;
1181 goto normal_char;
1182 }
7b600102 1183 *nchars = disp_end_pos - *disp_pos;
f3014ef5 1184 if (*nchars <= 0)
1088b922 1185 emacs_abort ();
87e67904
EZ
1186 if (string->s)
1187 *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
f3014ef5 1188 disp_end_pos, string->unibyte);
9f257352
EZ
1189 else if (STRINGP (string->lstring))
1190 *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
f3014ef5 1191 bytepos, disp_end_pos, string->unibyte);
87e67904
EZ
1192 else
1193 *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
182ce2d2 1194 }
182ce2d2
EZ
1195 else
1196 {
fbcaa2f3 1197 normal_char:
87e67904
EZ
1198 if (string->s)
1199 {
87e67904 1200
f3014ef5
EZ
1201 if (!string->unibyte)
1202 {
1203 ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
1204 *ch_len = len;
1205 }
1206 else
1207 {
1208 ch = UNIBYTE_TO_CHAR (string->s[bytepos]);
1209 *ch_len = 1;
1210 }
87e67904 1211 }
9f257352
EZ
1212 else if (STRINGP (string->lstring))
1213 {
f3014ef5
EZ
1214 if (!string->unibyte)
1215 {
1216 ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
1217 len);
1218 *ch_len = len;
1219 }
1220 else
1221 {
1222 ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos));
1223 *ch_len = 1;
1224 }
9f257352 1225 }
87e67904
EZ
1226 else
1227 {
e99a9b8b
EZ
1228 ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (bytepos), len);
1229 *ch_len = len;
87e67904 1230 }
182ce2d2
EZ
1231 *nchars = 1;
1232 }
1233
1234 /* If we just entered a run of characters covered by a display
1235 string, compute the position of the next display string. */
55439c61 1236 if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos
0c95fcf7 1237 && *disp_prop)
6db161be
EZ
1238 {
1239 SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
5bf97bfc 1240 *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
0c95fcf7 1241 disp_prop);
6db161be 1242 }
182ce2d2
EZ
1243
1244 return ch;
1245}
1246
cbb09f04
EZ
1247\f
1248/***********************************************************************
1249 Determining paragraph direction
1250 ***********************************************************************/
1251
1252/* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
1253 Value is the non-negative length of the paragraph separator
1254 following the buffer position, -1 if position is at the beginning
1255 of a new paragraph, or -2 if position is neither at beginning nor
1256 at end of a paragraph. */
d311d28c
PE
1257static ptrdiff_t
1258bidi_at_paragraph_end (ptrdiff_t charpos, ptrdiff_t bytepos)
cbb09f04
EZ
1259{
1260 Lisp_Object sep_re;
1261 Lisp_Object start_re;
d311d28c 1262 ptrdiff_t val;
cbb09f04
EZ
1263
1264 sep_re = paragraph_separate_re;
1265 start_re = paragraph_start_re;
1266
1267 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
1268 if (val < 0)
1269 {
1270 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
1271 val = -1;
1272 else
1273 val = -2;
1274 }
1275
1276 return val;
1277}
1278
e30b79c1
DA
1279/* If the user has requested the long scans caching, make sure that
1280 BIDI cache is enabled. Otherwise, make sure it's disabled. */
1281
1282static struct region_cache *
1283bidi_paragraph_cache_on_off (void)
1284{
c10e9ece
EZ
1285 struct buffer *cache_buffer = current_buffer;
1286 bool indirect_p = false;
1287
1288 /* For indirect buffers, make sure to use the cache of their base
1289 buffer. */
1290 if (cache_buffer->base_buffer)
1291 {
1292 cache_buffer = cache_buffer->base_buffer;
1293 indirect_p = true;
1294 }
1295
1296 /* Don't turn on or off the cache in the base buffer, if the value
1297 of cache-long-scans of the base buffer is inconsistent with that.
1298 This is because doing so will just make the cache pure overhead,
1299 since if we turn it on via indirect buffer, it will be
1300 immediately turned off by its base buffer. */
e30b79c1
DA
1301 if (NILP (BVAR (current_buffer, cache_long_scans)))
1302 {
c10e9ece
EZ
1303 if (!indirect_p
1304 || NILP (BVAR (cache_buffer, cache_long_scans)))
e30b79c1 1305 {
c10e9ece
EZ
1306 if (cache_buffer->bidi_paragraph_cache)
1307 {
1308 free_region_cache (cache_buffer->bidi_paragraph_cache);
1309 cache_buffer->bidi_paragraph_cache = 0;
1310 }
e30b79c1
DA
1311 }
1312 return NULL;
1313 }
1314 else
1315 {
c10e9ece
EZ
1316 if (!indirect_p
1317 || !NILP (BVAR (cache_buffer, cache_long_scans)))
1318 {
1319 if (!cache_buffer->bidi_paragraph_cache)
1320 cache_buffer->bidi_paragraph_cache = new_region_cache ();
1321 }
1322 return cache_buffer->bidi_paragraph_cache;
e30b79c1
DA
1323 }
1324}
1325
1137e8b8
EZ
1326/* On my 2005-vintage machine, searching back for paragraph start
1327 takes ~1 ms per line. And bidi_paragraph_init is called 4 times
1328 when user types C-p. The number below limits each call to
1329 bidi_paragraph_init to about 10 ms. */
1330#define MAX_PARAGRAPH_SEARCH 7500
1331
be39f003 1332/* Find the beginning of this paragraph by looking back in the buffer.
1137e8b8
EZ
1333 Value is the byte position of the paragraph's beginning, or
1334 BEGV_BYTE if paragraph_start_re is still not found after looking
1335 back MAX_PARAGRAPH_SEARCH lines in the buffer. */
d311d28c
PE
1336static ptrdiff_t
1337bidi_find_paragraph_start (ptrdiff_t pos, ptrdiff_t pos_byte)
6bff6497 1338{
372b7a95 1339 Lisp_Object re = paragraph_start_re;
d311d28c 1340 ptrdiff_t limit = ZV, limit_byte = ZV_BYTE;
e30b79c1
DA
1341 struct region_cache *bpc = bidi_paragraph_cache_on_off ();
1342 ptrdiff_t n = 0, oldpos = pos, next;
c10e9ece
EZ
1343 struct buffer *cache_buffer = current_buffer;
1344
1345 if (cache_buffer->base_buffer)
1346 cache_buffer = cache_buffer->base_buffer;
6bff6497 1347
6bff6497 1348 while (pos_byte > BEGV_BYTE
1137e8b8 1349 && n++ < MAX_PARAGRAPH_SEARCH
6bff6497 1350 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
b5426561
DA
1351 {
1352 /* FIXME: What if the paragraph beginning is covered by a
1353 display string? And what if a display string covering some
1354 of the text over which we scan back includes
1355 paragraph_start_re? */
1356 DEC_BOTH (pos, pos_byte);
c10e9ece 1357 if (bpc && region_cache_backward (cache_buffer, bpc, pos, &next))
e30b79c1
DA
1358 {
1359 pos = next, pos_byte = CHAR_TO_BYTE (pos);
1360 break;
1361 }
1362 else
1363 pos = find_newline_no_quit (pos, pos_byte, -1, &pos_byte);
b5426561 1364 }
1137e8b8 1365 if (n >= MAX_PARAGRAPH_SEARCH)
e30b79c1
DA
1366 pos = BEGV, pos_byte = BEGV_BYTE;
1367 if (bpc)
c10e9ece 1368 know_region_cache (cache_buffer, bpc, pos, oldpos);
cd88d682
EZ
1369 /* Positions returned by the region cache are not limited to
1370 BEGV..ZV range, so we limit them here. */
1371 pos_byte = clip_to_bounds (BEGV_BYTE, pos_byte, ZV_BYTE);
be39f003 1372 return pos_byte;
6bff6497
EZ
1373}
1374
ce811ad9
EZ
1375/* On a 3.4 GHz machine, searching forward for a strong directional
1376 character in a long paragraph full of weaks or neutrals takes about
1377 1 ms for each 20K characters. The number below limits each call to
1378 bidi_paragraph_init to less than 10 ms even on slow machines. */
1379#define MAX_STRONG_CHAR_SEARCH 100000
1380
bea4f10c 1381/* Determine the base direction, a.k.a. base embedding level, of the
b44d9321
EZ
1382 paragraph we are about to iterate through. If DIR is either L2R or
1383 R2L, just use that. Otherwise, determine the paragraph direction
bea4f10c
EZ
1384 from the first strong directional character of the paragraph.
1385
2cc21167 1386 NO_DEFAULT_P means don't default to L2R if the paragraph
bea4f10c
EZ
1387 has no strong directional characters and both DIR and
1388 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
1389 in the buffer until a paragraph is found with a strong character,
1390 or until hitting BEGV. In the latter case, fall back to L2R. This
1391 flag is used in current-bidi-paragraph-direction.
1392
1393 Note that this function gives the paragraph separator the same
1394 direction as the preceding paragraph, even though Emacs generally
9858f6c3 1395 views the separator as not belonging to any paragraph. */
b7b65b15 1396void
2cc21167 1397bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, bool no_default_p)
b7b65b15 1398{
d311d28c 1399 ptrdiff_t bytepos = bidi_it->bytepos;
2cc21167 1400 bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
d311d28c 1401 ptrdiff_t pstartbyte;
87e67904
EZ
1402 /* Note that begbyte is a byte position, while end is a character
1403 position. Yes, this is ugly, but we are trying to avoid costly
1404 calls to BYTE_TO_CHAR and its ilk. */
d311d28c
PE
1405 ptrdiff_t begbyte = string_p ? 0 : BEGV_BYTE;
1406 ptrdiff_t end = string_p ? bidi_it->string.schars : ZV;
e7402cb2 1407
5e65aec0 1408 /* Special case for an empty buffer. */
87e67904 1409 if (bytepos == begbyte && bidi_it->charpos == end)
5e65aec0 1410 dir = L2R;
9c82e145 1411 /* We should never be called at EOB or before BEGV. */
87e67904 1412 else if (bidi_it->charpos >= end || bytepos < begbyte)
1088b922 1413 emacs_abort ();
9c82e145 1414
be39f003
EZ
1415 if (dir == L2R)
1416 {
1417 bidi_it->paragraph_dir = L2R;
1418 bidi_it->new_paragraph = 0;
1419 }
1420 else if (dir == R2L)
1421 {
1422 bidi_it->paragraph_dir = R2L;
1423 bidi_it->new_paragraph = 0;
1424 }
b7b65b15
EZ
1425 else if (dir == NEUTRAL_DIR) /* P2 */
1426 {
182ce2d2 1427 int ch;
d311d28c
PE
1428 ptrdiff_t ch_len, nchars;
1429 ptrdiff_t pos, disp_pos = -1;
0c95fcf7 1430 int disp_prop = 0;
6bff6497 1431 bidi_type_t type;
9f257352 1432 const unsigned char *s;
be39f003 1433
d20e1419
EZ
1434 if (!bidi_initialized)
1435 bidi_initialize ();
1436
be39f003
EZ
1437 /* If we are inside a paragraph separator, we are just waiting
1438 for the separator to be exhausted; use the previous paragraph
e5a2fec7
EZ
1439 direction. But don't do that if we have been just reseated,
1440 because we need to reinitialize below in that case. */
1441 if (!bidi_it->first_elt
1442 && bidi_it->charpos < bidi_it->separator_limit)
be39f003
EZ
1443 return;
1444
b44d9321 1445 /* If we are on a newline, get past it to where the next
5e65aec0
EZ
1446 paragraph might start. But don't do that at BEGV since then
1447 we are potentially in a new paragraph that doesn't yet
1448 exist. */
c143c213 1449 pos = bidi_it->charpos;
512a289d
RS
1450 s = (STRINGP (bidi_it->string.lstring)
1451 ? SDATA (bidi_it->string.lstring)
1452 : bidi_it->string.s);
f3014ef5
EZ
1453 if (bytepos > begbyte
1454 && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n')
be39f003 1455 {
b44d9321 1456 bytepos++;
c143c213 1457 pos++;
be39f003 1458 }
b44d9321
EZ
1459
1460 /* We are either at the beginning of a paragraph or in the
1461 middle of it. Find where this paragraph starts. */
87e67904
EZ
1462 if (string_p)
1463 {
1464 /* We don't support changes of paragraph direction inside a
1465 string. It is treated as a single paragraph. */
1466 pstartbyte = 0;
1467 }
1468 else
1469 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
be39f003
EZ
1470 bidi_it->separator_limit = -1;
1471 bidi_it->new_paragraph = 0;
bea4f10c 1472
2cc21167
PE
1473 /* The following loop is run more than once only if NO_DEFAULT_P,
1474 and only if we are iterating on a buffer. */
bea4f10c 1475 do {
ce811ad9
EZ
1476 ptrdiff_t pos1;
1477
bea4f10c 1478 bytepos = pstartbyte;
87e67904
EZ
1479 if (!string_p)
1480 pos = BYTE_TO_CHAR (bytepos);
36075a19 1481 ch = bidi_fetch_char (pos, bytepos, &disp_pos, &disp_prop,
5bf97bfc 1482 &bidi_it->string, bidi_it->w,
87e67904 1483 bidi_it->frame_window_p, &ch_len, &nchars);
bea4f10c
EZ
1484 type = bidi_get_type (ch, NEUTRAL_DIR);
1485
ce811ad9 1486 pos1 = pos;
182ce2d2 1487 for (pos += nchars, bytepos += ch_len;
ce811ad9
EZ
1488 ((bidi_get_category (type) != STRONG)
1489 || (bidi_ignore_explicit_marks_for_paragraph_level
1490 && (type == RLE || type == RLO
1491 || type == LRE || type == LRO)))
1492 /* Stop when searched too far into an abnormally large
1493 paragraph full of weak or neutral characters. */
1494 && pos - pos1 < MAX_STRONG_CHAR_SEARCH;
bea4f10c
EZ
1495 type = bidi_get_type (ch, NEUTRAL_DIR))
1496 {
87e67904 1497 if (pos >= end)
bea4f10c
EZ
1498 {
1499 /* Pretend there's a paragraph separator at end of
87e67904 1500 buffer/string. */
bea4f10c
EZ
1501 type = NEUTRAL_B;
1502 break;
1503 }
0bb23927
EZ
1504 if (!string_p
1505 && type == NEUTRAL_B
1506 && bidi_at_paragraph_end (pos, bytepos) >= -1)
cf99dcf8 1507 break;
102ebb00 1508 /* Fetch next character and advance to get past it. */
36075a19 1509 ch = bidi_fetch_char (pos, bytepos, &disp_pos,
5bf97bfc 1510 &disp_prop, &bidi_it->string, bidi_it->w,
fc6f18ce 1511 bidi_it->frame_window_p, &ch_len, &nchars);
182ce2d2
EZ
1512 pos += nchars;
1513 bytepos += ch_len;
bea4f10c 1514 }
32413314
EZ
1515 if ((type == STRONG_R || type == STRONG_AL) /* P3 */
1516 || (!bidi_ignore_explicit_marks_for_paragraph_level
1517 && (type == RLO || type == RLE)))
bea4f10c 1518 bidi_it->paragraph_dir = R2L;
32413314
EZ
1519 else if (type == STRONG_L
1520 || (!bidi_ignore_explicit_marks_for_paragraph_level
1521 && (type == LRO || type == LRE)))
bea4f10c 1522 bidi_it->paragraph_dir = L2R;
87e67904
EZ
1523 if (!string_p
1524 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
bea4f10c
EZ
1525 {
1526 /* If this paragraph is at BEGV, default to L2R. */
1527 if (pstartbyte == BEGV_BYTE)
1528 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1529 else
1530 {
d311d28c
PE
1531 ptrdiff_t prevpbyte = pstartbyte;
1532 ptrdiff_t p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
bea4f10c
EZ
1533
1534 /* Find the beginning of the previous paragraph, if any. */
1535 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1536 {
182ce2d2
EZ
1537 /* FXIME: What if p is covered by a display
1538 string? See also a FIXME inside
1539 bidi_find_paragraph_start. */
36075a19 1540 DEC_BOTH (p, pbyte);
bea4f10c
EZ
1541 prevpbyte = bidi_find_paragraph_start (p, pbyte);
1542 }
1543 pstartbyte = prevpbyte;
1544 }
1545 }
87e67904
EZ
1546 } while (!string_p
1547 && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
b7b65b15 1548 }
be39f003 1549 else
1088b922 1550 emacs_abort ();
b7b65b15 1551
b44d9321
EZ
1552 /* Contrary to UAX#9 clause P3, we only default the paragraph
1553 direction to L2R if we have no previous usable paragraph
bea4f10c 1554 direction. This is allowed by the HL1 clause. */
d20e1419 1555 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
bea4f10c 1556 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
be39f003 1557 if (bidi_it->paragraph_dir == R2L)
b44d9321 1558 bidi_it->level_stack[0].level = 1;
be39f003 1559 else
b44d9321 1560 bidi_it->level_stack[0].level = 0;
be39f003
EZ
1561
1562 bidi_line_init (bidi_it);
b7b65b15
EZ
1563}
1564
cbb09f04
EZ
1565\f
1566/***********************************************************************
1567 Resolving explicit and implicit levels.
58b9f433 1568 The rest of this file constitutes the core of the UBA implementation.
cbb09f04 1569 ***********************************************************************/
b7b65b15 1570
0a99eee1 1571static bool
182ce2d2 1572bidi_explicit_dir_char (int ch)
b7b65b15 1573{
182ce2d2
EZ
1574 bidi_type_t ch_type;
1575
1576 if (!bidi_initialized)
1088b922 1577 emacs_abort ();
182ce2d2
EZ
1578 ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1579 return (ch_type == LRE || ch_type == LRO
1580 || ch_type == RLE || ch_type == RLO
1581 || ch_type == PDF);
b7b65b15
EZ
1582}
1583
1584/* A helper function for bidi_resolve_explicit. It advances to the
1585 next character in logical order and determines the new embedding
1586 level and directional override, but does not take into account
1587 empty embeddings. */
1588static int
1589bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1590{
1591 int curchar;
1592 bidi_type_t type;
1593 int current_level;
1594 int new_level;
1595 bidi_dir_t override;
2cc21167 1596 bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
b7b65b15 1597
182ce2d2
EZ
1598 /* If reseat()'ed, don't advance, so as to start iteration from the
1599 position where we were reseated. bidi_it->bytepos can be less
1600 than BEGV_BYTE after reseat to BEGV. */
87e67904 1601 if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
9c82e145 1602 || bidi_it->first_elt)
e7402cb2 1603 {
9c82e145 1604 bidi_it->first_elt = 0;
87e67904
EZ
1605 if (string_p)
1606 {
512a289d
RS
1607 const unsigned char *p
1608 = (STRINGP (bidi_it->string.lstring)
1609 ? SDATA (bidi_it->string.lstring)
1610 : bidi_it->string.s);
9f257352 1611
87e67904 1612 if (bidi_it->charpos < 0)
05e193f1
EZ
1613 bidi_it->charpos = bidi_it->bytepos = 0;
1614 eassert (bidi_it->bytepos == bidi_count_bytes (p, 0, 0,
1615 bidi_it->charpos,
1616 bidi_it->string.unibyte));
87e67904
EZ
1617 }
1618 else
1619 {
1620 if (bidi_it->charpos < BEGV)
05e193f1
EZ
1621 {
1622 bidi_it->charpos = BEGV;
1623 bidi_it->bytepos = BEGV_BYTE;
1624 }
1625 eassert (bidi_it->bytepos == CHAR_TO_BYTE (bidi_it->charpos));
87e67904 1626 }
e7402cb2 1627 }
87e67904
EZ
1628 /* Don't move at end of buffer/string. */
1629 else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
b7b65b15 1630 {
182ce2d2
EZ
1631 /* Advance to the next character, skipping characters covered by
1632 display strings (nchars > 1). */
102ebb00 1633 if (bidi_it->nchars <= 0)
1088b922 1634 emacs_abort ();
182ce2d2 1635 bidi_it->charpos += bidi_it->nchars;
e7402cb2 1636 if (bidi_it->ch_len == 0)
1088b922 1637 emacs_abort ();
b7b65b15
EZ
1638 bidi_it->bytepos += bidi_it->ch_len;
1639 }
1640
1641 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1642 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1643 new_level = current_level;
1644
87e67904 1645 if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
e7402cb2
EZ
1646 {
1647 curchar = BIDI_EOB;
1648 bidi_it->ch_len = 1;
182ce2d2 1649 bidi_it->nchars = 1;
87e67904 1650 bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
0c95fcf7 1651 bidi_it->disp_prop = 0;
e7402cb2
EZ
1652 }
1653 else
1654 {
182ce2d2
EZ
1655 /* Fetch the character at BYTEPOS. If it is covered by a
1656 display string, treat the entire run of covered characters as
1657 a single character u+FFFC. */
36075a19 1658 curchar = bidi_fetch_char (bidi_it->charpos, bidi_it->bytepos,
0c95fcf7 1659 &bidi_it->disp_pos, &bidi_it->disp_prop,
5bf97bfc
EZ
1660 &bidi_it->string, bidi_it->w,
1661 bidi_it->frame_window_p,
102ebb00 1662 &bidi_it->ch_len, &bidi_it->nchars);
e7402cb2 1663 }
b7b65b15 1664 bidi_it->ch = curchar;
b7b65b15 1665
6bff6497
EZ
1666 /* Don't apply directional override here, as all the types we handle
1667 below will not be affected by the override anyway, and we need
1668 the original type unaltered. The override will be applied in
1669 bidi_resolve_weak. */
1670 type = bidi_get_type (curchar, NEUTRAL_DIR);
89d3374a
EZ
1671 bidi_it->orig_type = type;
1672 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
1673
1674 if (type != PDF)
1675 bidi_it->prev_was_pdf = 0;
1676
89d3374a 1677 bidi_it->type_after_w1 = UNKNOWN_BT;
b7b65b15
EZ
1678
1679 switch (type)
1680 {
1681 case RLE: /* X2 */
1682 case RLO: /* X4 */
89d3374a
EZ
1683 bidi_it->type_after_w1 = type;
1684 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1685 type = WEAK_BN; /* X9/Retaining */
87e67904 1686 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1687 {
1688 if (current_level <= BIDI_MAXLEVEL - 4)
1689 {
1690 /* Compute the least odd embedding level greater than
1691 the current level. */
1692 new_level = ((current_level + 1) & ~1) + 1;
89d3374a 1693 if (bidi_it->type_after_w1 == RLE)
b7b65b15
EZ
1694 override = NEUTRAL_DIR;
1695 else
1696 override = R2L;
1697 if (current_level == BIDI_MAXLEVEL - 4)
1698 bidi_it->invalid_rl_levels = 0;
1699 bidi_push_embedding_level (bidi_it, new_level, override);
1700 }
1701 else
1702 {
1703 bidi_it->invalid_levels++;
1704 /* See the commentary about invalid_rl_levels below. */
1705 if (bidi_it->invalid_rl_levels < 0)
1706 bidi_it->invalid_rl_levels = 0;
1707 bidi_it->invalid_rl_levels++;
1708 }
1709 }
89d3374a 1710 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
7b5d6677
EZ
1711 || (bidi_it->next_en_pos > bidi_it->charpos
1712 && bidi_it->next_en_type == WEAK_EN))
b7b65b15
EZ
1713 type = WEAK_EN;
1714 break;
1715 case LRE: /* X3 */
1716 case LRO: /* X5 */
89d3374a
EZ
1717 bidi_it->type_after_w1 = type;
1718 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1719 type = WEAK_BN; /* X9/Retaining */
87e67904 1720 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1721 {
1722 if (current_level <= BIDI_MAXLEVEL - 5)
1723 {
1724 /* Compute the least even embedding level greater than
1725 the current level. */
1726 new_level = ((current_level + 2) & ~1);
89d3374a 1727 if (bidi_it->type_after_w1 == LRE)
b7b65b15
EZ
1728 override = NEUTRAL_DIR;
1729 else
1730 override = L2R;
1731 bidi_push_embedding_level (bidi_it, new_level, override);
1732 }
1733 else
1734 {
1735 bidi_it->invalid_levels++;
1736 /* invalid_rl_levels counts invalid levels encountered
1737 while the embedding level was already too high for
1738 LRE/LRO, but not for RLE/RLO. That is because
1739 there may be exactly one PDF which we should not
1740 ignore even though invalid_levels is non-zero.
1741 invalid_rl_levels helps to know what PDF is
1742 that. */
1743 if (bidi_it->invalid_rl_levels >= 0)
1744 bidi_it->invalid_rl_levels++;
1745 }
1746 }
89d3374a 1747 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
7b5d6677
EZ
1748 || (bidi_it->next_en_pos > bidi_it->charpos
1749 && bidi_it->next_en_type == WEAK_EN))
b7b65b15
EZ
1750 type = WEAK_EN;
1751 break;
1752 case PDF: /* X7 */
89d3374a
EZ
1753 bidi_it->type_after_w1 = type;
1754 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1755 type = WEAK_BN; /* X9/Retaining */
87e67904 1756 if (bidi_it->ignore_bn_limit <= -1)
b7b65b15
EZ
1757 {
1758 if (!bidi_it->invalid_rl_levels)
1759 {
1760 new_level = bidi_pop_embedding_level (bidi_it);
1761 bidi_it->invalid_rl_levels = -1;
1762 if (bidi_it->invalid_levels)
1763 bidi_it->invalid_levels--;
1764 /* else nothing: UAX#9 says to ignore invalid PDFs */
1765 }
1766 if (!bidi_it->invalid_levels)
1767 new_level = bidi_pop_embedding_level (bidi_it);
1768 else
1769 {
1770 bidi_it->invalid_levels--;
1771 bidi_it->invalid_rl_levels--;
1772 }
1773 }
89d3374a 1774 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
7b5d6677
EZ
1775 || (bidi_it->next_en_pos > bidi_it->charpos
1776 && bidi_it->next_en_type == WEAK_EN))
b7b65b15
EZ
1777 type = WEAK_EN;
1778 break;
1779 default:
1780 /* Nothing. */
1781 break;
1782 }
1783
1784 bidi_it->type = type;
2d6e4628 1785 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1786
1787 return new_level;
1788}
1789
1790/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
1791 in the buffer/string to the next character (in the logical order),
1792 resolve any explicit embeddings and directional overrides, and
1793 return the embedding level of the character after resolving
1794 explicit directives and ignoring empty embeddings. */
b7b65b15
EZ
1795static int
1796bidi_resolve_explicit (struct bidi_it *bidi_it)
1797{
1798 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1799 int new_level = bidi_resolve_explicit_1 (bidi_it);
d311d28c 1800 ptrdiff_t eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
512a289d
RS
1801 const unsigned char *s
1802 = (STRINGP (bidi_it->string.lstring)
1803 ? SDATA (bidi_it->string.lstring)
1804 : bidi_it->string.s);
b7b65b15
EZ
1805
1806 if (prev_level < new_level
1807 && bidi_it->type == WEAK_BN
87e67904
EZ
1808 && bidi_it->ignore_bn_limit == -1 /* only if not already known */
1809 && bidi_it->charpos < eob /* not already at EOB */
1810 && bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
f3014ef5
EZ
1811 + bidi_it->ch_len, s,
1812 bidi_it->string.unibyte)))
b7b65b15
EZ
1813 {
1814 /* Avoid pushing and popping embedding levels if the level run
1815 is empty, as this breaks level runs where it shouldn't.
1816 UAX#9 removes all the explicit embedding and override codes,
1817 so empty embeddings disappear without a trace. We need to
1818 behave as if we did the same. */
1819 struct bidi_it saved_it;
1820 int level = prev_level;
1821
1822 bidi_copy_it (&saved_it, bidi_it);
1823
87e67904 1824 while (bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
f3014ef5
EZ
1825 + bidi_it->ch_len, s,
1826 bidi_it->string.unibyte)))
b7b65b15 1827 {
182ce2d2
EZ
1828 /* This advances to the next character, skipping any
1829 characters covered by display strings. */
b7b65b15 1830 level = bidi_resolve_explicit_1 (bidi_it);
9f257352
EZ
1831 /* If string.lstring was relocated inside bidi_resolve_explicit_1,
1832 a pointer to its data is no longer valid. */
1833 if (STRINGP (bidi_it->string.lstring))
1834 s = SDATA (bidi_it->string.lstring);
b7b65b15
EZ
1835 }
1836
102ebb00 1837 if (bidi_it->nchars <= 0)
1088b922 1838 emacs_abort ();
b7b65b15 1839 if (level == prev_level) /* empty embedding */
182ce2d2 1840 saved_it.ignore_bn_limit = bidi_it->charpos + bidi_it->nchars;
b7b65b15 1841 else /* this embedding is non-empty */
87e67904 1842 saved_it.ignore_bn_limit = -2;
b7b65b15
EZ
1843
1844 bidi_copy_it (bidi_it, &saved_it);
87e67904 1845 if (bidi_it->ignore_bn_limit > -1)
b7b65b15
EZ
1846 {
1847 /* We pushed a level, but we shouldn't have. Undo that. */
1848 if (!bidi_it->invalid_rl_levels)
1849 {
1850 new_level = bidi_pop_embedding_level (bidi_it);
1851 bidi_it->invalid_rl_levels = -1;
1852 if (bidi_it->invalid_levels)
1853 bidi_it->invalid_levels--;
1854 }
1855 if (!bidi_it->invalid_levels)
1856 new_level = bidi_pop_embedding_level (bidi_it);
1857 else
1858 {
1859 bidi_it->invalid_levels--;
1860 bidi_it->invalid_rl_levels--;
1861 }
1862 }
1863 }
1864
b7b65b15
EZ
1865 if (bidi_it->type == NEUTRAL_B) /* X8 */
1866 {
21fce5ab 1867 bidi_set_paragraph_end (bidi_it);
6bff6497
EZ
1868 /* This is needed by bidi_resolve_weak below, and in L1. */
1869 bidi_it->type_after_w1 = bidi_it->type;
89d3374a 1870 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1871 }
1872
1873 return new_level;
1874}
1875
182ce2d2
EZ
1876/* Advance in the buffer/string, resolve weak types and return the
1877 type of the next character after weak type resolution. */
fd3998ff 1878static bidi_type_t
b7b65b15
EZ
1879bidi_resolve_weak (struct bidi_it *bidi_it)
1880{
1881 bidi_type_t type;
1882 bidi_dir_t override;
1883 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1884 int new_level = bidi_resolve_explicit (bidi_it);
1885 int next_char;
1886 bidi_type_t type_of_next;
1887 struct bidi_it saved_it;
5895d7b9 1888 ptrdiff_t eob
512a289d
RS
1889 = ((STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
1890 ? bidi_it->string.schars : ZV);
b7b65b15
EZ
1891
1892 type = bidi_it->type;
1893 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1894
1895 if (type == UNKNOWN_BT
1896 || type == LRE
1897 || type == LRO
1898 || type == RLE
1899 || type == RLO
1900 || type == PDF)
1088b922 1901 emacs_abort ();
b7b65b15
EZ
1902
1903 if (new_level != prev_level
1904 || bidi_it->type == NEUTRAL_B)
1905 {
1906 /* We've got a new embedding level run, compute the directional
1907 type of sor and initialize per-run variables (UAX#9, clause
1908 X10). */
1909 bidi_set_sor_type (bidi_it, prev_level, new_level);
1910 }
1911 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1912 || type == WEAK_BN || type == STRONG_AL)
89d3374a
EZ
1913 bidi_it->type_after_w1 = type; /* needed in L1 */
1914 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1915
1916 /* Level and directional override status are already recorded in
1917 bidi_it, and do not need any change; see X6. */
1918 if (override == R2L) /* X6 */
1919 type = STRONG_R;
1920 else if (override == L2R)
1921 type = STRONG_L;
bc5a45f3 1922 else
b7b65b15 1923 {
bc5a45f3 1924 if (type == WEAK_NSM) /* W1 */
b7b65b15 1925 {
bc5a45f3 1926 /* Note that we don't need to consider the case where the
5930fe97
EZ
1927 prev character has its type overridden by an RLO or LRO,
1928 because then either the type of this NSM would have been
1929 also overridden, or the previous character is outside the
1930 current level run, and thus not relevant to this NSM.
1931 This is why NSM gets the type_after_w1 of the previous
1932 character. */
ebb5722e
EZ
1933 if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1934 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1935 && bidi_it->prev.type_after_w1 != NEUTRAL_B)
5930fe97 1936 type = bidi_it->prev.type_after_w1;
bc5a45f3
EZ
1937 else if (bidi_it->sor == R2L)
1938 type = STRONG_R;
1939 else if (bidi_it->sor == L2R)
1940 type = STRONG_L;
1941 else /* shouldn't happen! */
1088b922 1942 emacs_abort ();
b7b65b15 1943 }
bc5a45f3
EZ
1944 if (type == WEAK_EN /* W2 */
1945 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1946 type = WEAK_AN;
1947 else if (type == STRONG_AL) /* W3 */
1948 type = STRONG_R;
1949 else if ((type == WEAK_ES /* W4 */
1950 && bidi_it->prev.type_after_w1 == WEAK_EN
1951 && bidi_it->prev.orig_type == WEAK_EN)
1952 || (type == WEAK_CS
1953 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1954 && bidi_it->prev.orig_type == WEAK_EN)
1955 || bidi_it->prev.type_after_w1 == WEAK_AN)))
b7b65b15 1956 {
512a289d
RS
1957 const unsigned char *s
1958 = (STRINGP (bidi_it->string.lstring)
1959 ? SDATA (bidi_it->string.lstring)
1960 : bidi_it->string.s);
1961
1962 next_char = (bidi_it->charpos + bidi_it->nchars >= eob
1963 ? BIDI_EOB
1964 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
1965 s, bidi_it->string.unibyte));
6bff6497 1966 type_of_next = bidi_get_type (next_char, override);
b7b65b15 1967
bc5a45f3 1968 if (type_of_next == WEAK_BN
b7b65b15
EZ
1969 || bidi_explicit_dir_char (next_char))
1970 {
1971 bidi_copy_it (&saved_it, bidi_it);
1972 while (bidi_resolve_explicit (bidi_it) == new_level
bc5a45f3 1973 && bidi_it->type == WEAK_BN)
b7b65b15
EZ
1974 ;
1975 type_of_next = bidi_it->type;
b7b65b15
EZ
1976 bidi_copy_it (bidi_it, &saved_it);
1977 }
bc5a45f3
EZ
1978
1979 /* If the next character is EN, but the last strong-type
1980 character is AL, that next EN will be changed to AN when
1981 we process it in W2 above. So in that case, this ES
1982 should not be changed into EN. */
1983 if (type == WEAK_ES
1984 && type_of_next == WEAK_EN
1985 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1986 type = WEAK_EN;
1987 else if (type == WEAK_CS)
b7b65b15 1988 {
bc5a45f3
EZ
1989 if (bidi_it->prev.type_after_w1 == WEAK_AN
1990 && (type_of_next == WEAK_AN
1991 /* If the next character is EN, but the last
1992 strong-type character is AL, EN will be later
1993 changed to AN when we process it in W2 above.
1994 So in that case, this ES should not be
1995 changed into EN. */
1996 || (type_of_next == WEAK_EN
1997 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1998 type = WEAK_AN;
1999 else if (bidi_it->prev.type_after_w1 == WEAK_EN
2000 && type_of_next == WEAK_EN
2001 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
2002 type = WEAK_EN;
2003 }
2004 }
2005 else if (type == WEAK_ET /* W5: ET with EN before or after it */
2006 || type == WEAK_BN) /* W5/Retaining */
2007 {
7b5d6677 2008 if (bidi_it->prev.type_after_w1 == WEAK_EN) /* ET/BN w/EN before it */
bc5a45f3 2009 type = WEAK_EN;
7b5d6677
EZ
2010 else if (bidi_it->next_en_pos > bidi_it->charpos
2011 && bidi_it->next_en_type != WEAK_BN)
2012 {
2013 if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */
2014 type = WEAK_EN;
2015 }
2016 else if (bidi_it->next_en_pos >=0)
bc5a45f3 2017 {
d311d28c 2018 ptrdiff_t en_pos = bidi_it->charpos + bidi_it->nchars;
512a289d
RS
2019 const unsigned char *s = (STRINGP (bidi_it->string.lstring)
2020 ? SDATA (bidi_it->string.lstring)
2021 : bidi_it->string.s);
bc5a45f3 2022
102ebb00 2023 if (bidi_it->nchars <= 0)
1088b922 2024 emacs_abort ();
512a289d
RS
2025 next_char
2026 = (bidi_it->charpos + bidi_it->nchars >= eob
2027 ? BIDI_EOB
2028 : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
2029 bidi_it->string.unibyte));
bc5a45f3
EZ
2030 type_of_next = bidi_get_type (next_char, override);
2031
2032 if (type_of_next == WEAK_ET
2033 || type_of_next == WEAK_BN
2034 || bidi_explicit_dir_char (next_char))
2035 {
2036 bidi_copy_it (&saved_it, bidi_it);
2037 while (bidi_resolve_explicit (bidi_it) == new_level
2038 && (bidi_it->type == WEAK_BN
2039 || bidi_it->type == WEAK_ET))
2040 ;
2041 type_of_next = bidi_it->type;
2042 en_pos = bidi_it->charpos;
2043 bidi_copy_it (bidi_it, &saved_it);
2044 }
7b5d6677
EZ
2045 /* Remember this position, to speed up processing of the
2046 next ETs. */
2047 bidi_it->next_en_pos = en_pos;
bc5a45f3 2048 if (type_of_next == WEAK_EN)
b7b65b15 2049 {
bc5a45f3
EZ
2050 /* If the last strong character is AL, the EN we've
2051 found will become AN when we get to it (W2). */
7b5d6677
EZ
2052 if (bidi_it->last_strong.type_after_w1 == STRONG_AL)
2053 type_of_next = WEAK_AN;
bc5a45f3
EZ
2054 else if (type == WEAK_BN)
2055 type = NEUTRAL_ON; /* W6/Retaining */
7b5d6677
EZ
2056 else
2057 type = WEAK_EN;
b7b65b15 2058 }
4787455f
EZ
2059 else if (type_of_next == NEUTRAL_B)
2060 /* Record the fact that there are no more ENs from
2061 here to the end of paragraph, to avoid entering the
2062 loop above ever again in this paragraph. */
2063 bidi_it->next_en_pos = -1;
7b5d6677
EZ
2064 /* Record the type of the character where we ended our search. */
2065 bidi_it->next_en_type = type_of_next;
b7b65b15
EZ
2066 }
2067 }
2068 }
2069
2070 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
89d3374a
EZ
2071 || (type == WEAK_BN
2072 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
2073 || bidi_it->prev.type_after_w1 == WEAK_ES
2074 || bidi_it->prev.type_after_w1 == WEAK_ET)))
b7b65b15
EZ
2075 type = NEUTRAL_ON;
2076
2077 /* Store the type we've got so far, before we clobber it with strong
2078 types in W7 and while resolving neutral types. But leave alone
2079 the original types that were recorded above, because we will need
2080 them for the L1 clause. */
89d3374a
EZ
2081 if (bidi_it->type_after_w1 == UNKNOWN_BT)
2082 bidi_it->type_after_w1 = type;
2083 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
2084
2085 if (type == WEAK_EN) /* W7 */
2086 {
89d3374a 2087 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
b7b65b15
EZ
2088 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
2089 type = STRONG_L;
2090 }
2091
2092 bidi_it->type = type;
2d6e4628 2093 bidi_check_type (bidi_it->type);
b7b65b15
EZ
2094 return type;
2095}
2096
cbb09f04
EZ
2097/* Resolve the type of a neutral character according to the type of
2098 surrounding strong text and the current embedding level. */
0a99eee1 2099static bidi_type_t
cbb09f04
EZ
2100bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
2101{
2102 /* N1: European and Arabic numbers are treated as though they were R. */
2103 if (next_type == WEAK_EN || next_type == WEAK_AN)
2104 next_type = STRONG_R;
2105 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
2106 prev_type = STRONG_R;
2107
2108 if (next_type == prev_type) /* N1 */
2109 return next_type;
2110 else if ((lev & 1) == 0) /* N2 */
2111 return STRONG_L;
2112 else
2113 return STRONG_R;
2114}
2115
fd3998ff 2116static bidi_type_t
b7b65b15
EZ
2117bidi_resolve_neutral (struct bidi_it *bidi_it)
2118{
2119 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2120 bidi_type_t type = bidi_resolve_weak (bidi_it);
2121 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2122
2123 if (!(type == STRONG_R
2124 || type == STRONG_L
2125 || type == WEAK_BN
2126 || type == WEAK_EN
2127 || type == WEAK_AN
2128 || type == NEUTRAL_B
2129 || type == NEUTRAL_S
2130 || type == NEUTRAL_WS
2131 || type == NEUTRAL_ON))
1088b922 2132 emacs_abort ();
b7b65b15 2133
4787455f
EZ
2134 if ((type != NEUTRAL_B /* Don't risk entering the long loop below if
2135 we are already at paragraph end. */
2136 && bidi_get_category (type) == NEUTRAL)
b7b65b15
EZ
2137 || (type == WEAK_BN && prev_level == current_level))
2138 {
2139 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
2140 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2141 bidi_it->next_for_neutral.type,
2142 current_level);
4787455f
EZ
2143 /* The next two "else if" clauses are shortcuts for the
2144 important special case when we have a long sequence of
2145 neutral or WEAK_BN characters, such as whitespace or nulls or
2146 other control characters, on the base embedding level of the
2147 paragraph, and that sequence goes all the way to the end of
2148 the paragraph and follows a character whose resolved
2149 directionality is identical to the base embedding level.
2150 (This is what happens in a buffer with plain L2R text that
2151 happens to include long sequences of control characters.) By
2152 virtue of N1, the result of examining this long sequence will
2153 always be either STRONG_L or STRONG_R, depending on the base
2154 embedding level. So we use this fact directly instead of
2155 entering the expensive loop in the "else" clause. */
2156 else if (current_level == 0
2157 && bidi_it->prev_for_neutral.type == STRONG_L
2158 && !bidi_explicit_dir_char (bidi_it->ch))
2159 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2160 STRONG_L, current_level);
2161 else if (/* current level is 1 */
2162 current_level == 1
2163 /* base embedding level is also 1 */
2164 && bidi_it->level_stack[0].level == 1
2165 /* previous character is one of those considered R for
2166 the purposes of W5 */
2167 && (bidi_it->prev_for_neutral.type == STRONG_R
2168 || bidi_it->prev_for_neutral.type == WEAK_EN
2169 || bidi_it->prev_for_neutral.type == WEAK_AN)
2170 && !bidi_explicit_dir_char (bidi_it->ch))
2171 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2172 STRONG_R, current_level);
b7b65b15
EZ
2173 else
2174 {
2175 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
2176 the assumption of batch-style processing; see clauses W4,
2177 W5, and especially N1, which require to look far forward
182ce2d2
EZ
2178 (as well as back) in the buffer/string. May the fleas of
2179 a thousand camels infest the armpits of those who design
b7b65b15
EZ
2180 supposedly general-purpose algorithms by looking at their
2181 own implementations, and fail to consider other possible
2182 implementations! */
2183 struct bidi_it saved_it;
2184 bidi_type_t next_type;
2185
2186 if (bidi_it->scan_dir == -1)
1088b922 2187 emacs_abort ();
b7b65b15
EZ
2188
2189 bidi_copy_it (&saved_it, bidi_it);
2190 /* Scan the text forward until we find the first non-neutral
2191 character, and then use that to resolve the neutral we
2192 are dealing with now. We also cache the scanned iterator
2193 states, to salvage some of the effort later. */
2194 bidi_cache_iterator_state (bidi_it, 0);
2195 do {
2196 /* Record the info about the previous character, so that
2197 it will be cached below with this state. */
89d3374a 2198 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
2199 && bidi_it->type != WEAK_BN)
2200 bidi_remember_char (&bidi_it->prev, bidi_it);
2201 type = bidi_resolve_weak (bidi_it);
2202 /* Paragraph separators have their levels fully resolved
2203 at this point, so cache them as resolved. */
2204 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
2205 /* FIXME: implement L1 here, by testing for a newline and
2206 resetting the level for any sequence of whitespace
2207 characters adjacent to it. */
2208 } while (!(type == NEUTRAL_B
2209 || (type != WEAK_BN
2210 && bidi_get_category (type) != NEUTRAL)
2211 /* This is all per level run, so stop when we
2212 reach the end of this level run. */
512a289d
RS
2213 || (bidi_it->level_stack[bidi_it->stack_idx].level
2214 != current_level)));
b7b65b15
EZ
2215
2216 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
2217
2218 switch (type)
2219 {
2220 case STRONG_L:
2221 case STRONG_R:
2222 case STRONG_AL:
4787455f
EZ
2223 /* Actually, STRONG_AL cannot happen here, because
2224 bidi_resolve_weak converts it to STRONG_R, per W3. */
a54e2c05 2225 eassert (type != STRONG_AL);
b7b65b15
EZ
2226 next_type = type;
2227 break;
2228 case WEAK_EN:
2229 case WEAK_AN:
2230 /* N1: ``European and Arabic numbers are treated as
2231 though they were R.'' */
2232 next_type = STRONG_R;
b7b65b15
EZ
2233 break;
2234 case WEAK_BN:
713bfeaa 2235 case NEUTRAL_ON: /* W6/Retaining */
b7b65b15 2236 if (!bidi_explicit_dir_char (bidi_it->ch))
1088b922 2237 emacs_abort (); /* can't happen: BNs are skipped */
b7b65b15
EZ
2238 /* FALLTHROUGH */
2239 case NEUTRAL_B:
2240 /* Marched all the way to the end of this level run.
2241 We need to use the eor type, whose information is
2242 stored by bidi_set_sor_type in the prev_for_neutral
2243 member. */
2244 if (saved_it.type != WEAK_BN
89d3374a 2245 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
4787455f 2246 next_type = bidi_it->prev_for_neutral.type;
b7b65b15
EZ
2247 else
2248 {
2249 /* This is a BN which does not adjoin neutrals.
2250 Leave its type alone. */
2251 bidi_copy_it (bidi_it, &saved_it);
2252 return bidi_it->type;
2253 }
2254 break;
2255 default:
1088b922 2256 emacs_abort ();
b7b65b15
EZ
2257 }
2258 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
2259 next_type, current_level);
4787455f 2260 saved_it.next_for_neutral.type = next_type;
b7b65b15 2261 saved_it.type = type;
4787455f 2262 bidi_check_type (next_type);
2d6e4628 2263 bidi_check_type (type);
b7b65b15
EZ
2264 bidi_copy_it (bidi_it, &saved_it);
2265 }
2266 }
2267 return type;
2268}
2269
2270/* Given an iterator state in BIDI_IT, advance one character position
182ce2d2
EZ
2271 in the buffer/string to the next character (in the logical order),
2272 resolve the bidi type of that next character, and return that
2273 type. */
fd3998ff 2274static bidi_type_t
b7b65b15
EZ
2275bidi_type_of_next_char (struct bidi_it *bidi_it)
2276{
2277 bidi_type_t type;
2278
2279 /* This should always be called during a forward scan. */
2280 if (bidi_it->scan_dir != 1)
1088b922 2281 emacs_abort ();
b7b65b15
EZ
2282
2283 /* Reset the limit until which to ignore BNs if we step out of the
2284 area where we found only empty levels. */
87e67904 2285 if ((bidi_it->ignore_bn_limit > -1
b7b65b15 2286 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
87e67904 2287 || (bidi_it->ignore_bn_limit == -2
b7b65b15 2288 && !bidi_explicit_dir_char (bidi_it->ch)))
87e67904 2289 bidi_it->ignore_bn_limit = -1;
b7b65b15
EZ
2290
2291 type = bidi_resolve_neutral (bidi_it);
2292
2293 return type;
2294}
2295
2296/* Given an iterator state BIDI_IT, advance one character position in
182ce2d2
EZ
2297 the buffer/string to the next character (in the current scan
2298 direction), resolve the embedding and implicit levels of that next
2299 character, and return the resulting level. */
fd3998ff 2300static int
b7b65b15
EZ
2301bidi_level_of_next_char (struct bidi_it *bidi_it)
2302{
2303 bidi_type_t type;
2304 int level, prev_level = -1;
2305 struct bidi_saved_info next_for_neutral;
d311d28c 2306 ptrdiff_t next_char_pos = -2;
b7b65b15
EZ
2307
2308 if (bidi_it->scan_dir == 1)
2309 {
5895d7b9 2310 ptrdiff_t eob
512a289d
RS
2311 = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2312 ? bidi_it->string.schars : ZV);
9f257352 2313
b7b65b15 2314 /* There's no sense in trying to advance if we hit end of text. */
9f257352 2315 if (bidi_it->charpos >= eob)
b7b65b15
EZ
2316 return bidi_it->resolved_level;
2317
2318 /* Record the info about the previous character. */
89d3374a 2319 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
2320 && bidi_it->type != WEAK_BN)
2321 bidi_remember_char (&bidi_it->prev, bidi_it);
89d3374a
EZ
2322 if (bidi_it->type_after_w1 == STRONG_R
2323 || bidi_it->type_after_w1 == STRONG_L
2324 || bidi_it->type_after_w1 == STRONG_AL)
b7b65b15
EZ
2325 bidi_remember_char (&bidi_it->last_strong, bidi_it);
2326 /* FIXME: it sounds like we don't need both prev and
2327 prev_for_neutral members, but I'm leaving them both for now. */
2328 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
2329 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
2330 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
2331
2332 /* If we overstepped the characters used for resolving neutrals
2333 and whitespace, invalidate their info in the iterator. */
2334 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
2335 bidi_it->next_for_neutral.type = UNKNOWN_BT;
2336 if (bidi_it->next_en_pos >= 0
2337 && bidi_it->charpos >= bidi_it->next_en_pos)
7b5d6677
EZ
2338 {
2339 bidi_it->next_en_pos = 0;
2340 bidi_it->next_en_type = UNKNOWN_BT;
2341 }
b7b65b15
EZ
2342 if (bidi_it->next_for_ws.type != UNKNOWN_BT
2343 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
2344 bidi_it->next_for_ws.type = UNKNOWN_BT;
2345
2346 /* This must be taken before we fill the iterator with the info
2347 about the next char. If we scan backwards, the iterator
2348 state must be already cached, so there's no need to know the
2349 embedding level of the previous character, since we will be
2350 returning to our caller shortly. */
2351 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2352 }
2353 next_for_neutral = bidi_it->next_for_neutral;
2354
182ce2d2
EZ
2355 /* Perhaps the character we want is already cached. If it is, the
2356 call to bidi_cache_find below will return a type other than
2357 UNKNOWN_BT. */
87e67904 2358 if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
102ebb00 2359 {
512a289d
RS
2360 int bob = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2361 ? 0 : 1);
102ebb00
EZ
2362 if (bidi_it->scan_dir > 0)
2363 {
2364 if (bidi_it->nchars <= 0)
1088b922 2365 emacs_abort ();
102ebb00
EZ
2366 next_char_pos = bidi_it->charpos + bidi_it->nchars;
2367 }
9f257352 2368 else if (bidi_it->charpos >= bob)
bb269206
EZ
2369 /* Implementation note: we allow next_char_pos to be as low as
2370 0 for buffers or -1 for strings, and that is okay because
2371 that's the "position" of the sentinel iterator state we
2372 cached at the beginning of the iteration. */
102ebb00 2373 next_char_pos = bidi_it->charpos - 1;
578b494e 2374 if (next_char_pos >= bob - 1)
87e67904
EZ
2375 type = bidi_cache_find (next_char_pos, -1, bidi_it);
2376 else
2377 type = UNKNOWN_BT;
102ebb00 2378 }
182ce2d2 2379 else
102ebb00 2380 type = UNKNOWN_BT;
b7b65b15
EZ
2381 if (type != UNKNOWN_BT)
2382 {
2383 /* Don't lose the information for resolving neutrals! The
2384 cached states could have been cached before their
2385 next_for_neutral member was computed. If we are on our way
2386 forward, we can simply take the info from the previous
2387 state. */
2388 if (bidi_it->scan_dir == 1
2389 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
2390 bidi_it->next_for_neutral = next_for_neutral;
2391
2392 /* If resolved_level is -1, it means this state was cached
2393 before it was completely resolved, so we cannot return
2394 it. */
2395 if (bidi_it->resolved_level != -1)
2396 return bidi_it->resolved_level;
2397 }
2398 if (bidi_it->scan_dir == -1)
2399 /* If we are going backwards, the iterator state is already cached
2400 from previous scans, and should be fully resolved. */
1088b922 2401 emacs_abort ();
b7b65b15
EZ
2402
2403 if (type == UNKNOWN_BT)
2404 type = bidi_type_of_next_char (bidi_it);
2405
2406 if (type == NEUTRAL_B)
2407 return bidi_it->resolved_level;
2408
2409 level = bidi_it->level_stack[bidi_it->stack_idx].level;
2410 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
2411 || (type == WEAK_BN && prev_level == level))
2412 {
2413 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1088b922 2414 emacs_abort ();
b7b65b15
EZ
2415
2416 /* If the cached state shows a neutral character, it was not
2417 resolved by bidi_resolve_neutral, so do it now. */
2418 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2419 bidi_it->next_for_neutral.type,
2420 level);
2421 }
2422
2423 if (!(type == STRONG_R
2424 || type == STRONG_L
2425 || type == WEAK_BN
2426 || type == WEAK_EN
2427 || type == WEAK_AN))
1088b922 2428 emacs_abort ();
b7b65b15 2429 bidi_it->type = type;
2d6e4628 2430 bidi_check_type (bidi_it->type);
b7b65b15
EZ
2431
2432 /* For L1 below, we need to know, for each WS character, whether
0d327994 2433 it belongs to a sequence of WS characters preceding a newline
b7b65b15 2434 or a TAB or a paragraph separator. */
89d3374a 2435 if (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
2436 && bidi_it->next_for_ws.type == UNKNOWN_BT)
2437 {
2438 int ch;
d311d28c
PE
2439 ptrdiff_t clen = bidi_it->ch_len;
2440 ptrdiff_t bpos = bidi_it->bytepos;
2441 ptrdiff_t cpos = bidi_it->charpos;
2442 ptrdiff_t disp_pos = bidi_it->disp_pos;
2443 ptrdiff_t nc = bidi_it->nchars;
87e67904 2444 struct bidi_string_data bs = bidi_it->string;
b7b65b15 2445 bidi_type_t chtype;
2cc21167 2446 bool fwp = bidi_it->frame_window_p;
0c95fcf7 2447 int dpp = bidi_it->disp_prop;
b7b65b15 2448
102ebb00 2449 if (bidi_it->nchars <= 0)
1088b922 2450 emacs_abort ();
b7b65b15 2451 do {
36075a19 2452 ch = bidi_fetch_char (cpos += nc, bpos += clen, &disp_pos, &dpp, &bs,
5bf97bfc 2453 bidi_it->w, fwp, &clen, &nc);
79beb178 2454 if (ch == '\n' || ch == BIDI_EOB)
b7b65b15
EZ
2455 chtype = NEUTRAL_B;
2456 else
6bff6497 2457 chtype = bidi_get_type (ch, NEUTRAL_DIR);
b7b65b15
EZ
2458 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
2459 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
2460 bidi_it->next_for_ws.type = chtype;
2d6e4628 2461 bidi_check_type (bidi_it->next_for_ws.type);
b7b65b15
EZ
2462 bidi_it->next_for_ws.charpos = cpos;
2463 bidi_it->next_for_ws.bytepos = bpos;
2464 }
2465
2466 /* Resolve implicit levels, with a twist: PDFs get the embedding
55622b93 2467 level of the embedding they terminate. See below for the
b7b65b15 2468 reason. */
89d3374a 2469 if (bidi_it->orig_type == PDF
b7b65b15
EZ
2470 /* Don't do this if this formatting code didn't change the
2471 embedding level due to invalid or empty embeddings. */
2472 && prev_level != level)
2473 {
2474 /* Don't look in UAX#9 for the reason for this: it's our own
2475 private quirk. The reason is that we want the formatting
2476 codes to be delivered so that they bracket the text of their
2477 embedding. For example, given the text
2478
2479 {RLO}teST{PDF}
2480
2481 we want it to be displayed as
2482
0d68907d 2483 {PDF}STet{RLO}
b7b65b15
EZ
2484
2485 not as
2486
2487 STet{RLO}{PDF}
2488
2489 which will result because we bump up the embedding level as
2490 soon as we see the RLO and pop it as soon as we see the PDF,
2491 so RLO itself has the same embedding level as "teST", and
2492 thus would be normally delivered last, just before the PDF.
2493 The switch below fiddles with the level of PDF so that this
2494 ugly side effect does not happen.
2495
2496 (This is, of course, only important if the formatting codes
e7402cb2
EZ
2497 are actually displayed, but Emacs does need to display them
2498 if the user wants to.) */
b7b65b15
EZ
2499 level = prev_level;
2500 }
89d3374a
EZ
2501 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
2502 || bidi_it->orig_type == NEUTRAL_S
e7402cb2 2503 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
89d3374a 2504 || (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
2505 && (bidi_it->next_for_ws.type == NEUTRAL_B
2506 || bidi_it->next_for_ws.type == NEUTRAL_S)))
2507 level = bidi_it->level_stack[0].level;
2508 else if ((level & 1) == 0) /* I1 */
2509 {
2510 if (type == STRONG_R)
2511 level++;
2512 else if (type == WEAK_EN || type == WEAK_AN)
2513 level += 2;
2514 }
2515 else /* I2 */
2516 {
2517 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
2518 level++;
2519 }
2520
2521 bidi_it->resolved_level = level;
2522 return level;
2523}
2524
2cc21167
PE
2525/* Move to the other edge of a level given by LEVEL. If END_FLAG,
2526 we are at the end of a level, and we need to prepare to
b7b65b15
EZ
2527 resume the scan of the lower level.
2528
2529 If this level's other edge is cached, we simply jump to it, filling
2530 the iterator structure with the iterator state on the other edge.
182ce2d2
EZ
2531 Otherwise, we walk the buffer or string until we come back to the
2532 same level as LEVEL.
b7b65b15
EZ
2533
2534 Note: we are not talking here about a ``level run'' in the UAX#9
2535 sense of the term, but rather about a ``level'' which includes
2536 all the levels higher than it. In other words, given the levels
2537 like this:
2538
2539 11111112222222333333334443343222222111111112223322111
2540 A B C
2541
2542 and assuming we are at point A scanning left to right, this
2543 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
2544 at point B. */
2545static void
2cc21167 2546bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, bool end_flag)
b7b65b15
EZ
2547{
2548 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
39e378da 2549 ptrdiff_t idx;
b7b65b15
EZ
2550
2551 /* Try the cache first. */
87e67904
EZ
2552 if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
2553 >= bidi_cache_start)
b7b65b15
EZ
2554 bidi_cache_fetch_state (idx, bidi_it);
2555 else
2556 {
2557 int new_level;
2558
1088b922 2559 /* If we are at end of level, its edges must be cached. */
b7b65b15 2560 if (end_flag)
1088b922 2561 emacs_abort ();
b7b65b15
EZ
2562
2563 bidi_cache_iterator_state (bidi_it, 1);
2564 do {
2565 new_level = bidi_level_of_next_char (bidi_it);
2566 bidi_cache_iterator_state (bidi_it, 1);
2567 } while (new_level >= level);
2568 }
2569}
2570
2571void
4b292a22 2572bidi_move_to_visually_next (struct bidi_it *bidi_it)
b7b65b15
EZ
2573{
2574 int old_level, new_level, next_level;
9c82e145 2575 struct bidi_it sentinel;
acb28818 2576 struct gcpro gcpro1;
b7b65b15 2577
87e67904 2578 if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
1088b922 2579 emacs_abort ();
b7b65b15
EZ
2580
2581 if (bidi_it->scan_dir == 0)
2582 {
2583 bidi_it->scan_dir = 1; /* default to logical order */
2584 }
2585
acb28818 2586 /* The code below can call eval, and thus cause GC. If we are
7e2ad32c 2587 iterating a Lisp string, make sure it won't be GCed. */
acb28818
EZ
2588 if (STRINGP (bidi_it->string.lstring))
2589 GCPRO1 (bidi_it->string.lstring);
2590
be39f003 2591 /* If we just passed a newline, initialize for the next line. */
1137e8b8
EZ
2592 if (!bidi_it->first_elt
2593 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
be39f003
EZ
2594 bidi_line_init (bidi_it);
2595
6dcfd253
EZ
2596 /* Prepare the sentinel iterator state, and cache it. When we bump
2597 into it, scanning backwards, we'll know that the last non-base
2598 level is exhausted. */
87e67904 2599 if (bidi_cache_idx == bidi_cache_start)
9c82e145
EZ
2600 {
2601 bidi_copy_it (&sentinel, bidi_it);
2602 if (bidi_it->first_elt)
2603 {
2604 sentinel.charpos--; /* cached charpos needs to be monotonic */
2605 sentinel.bytepos--;
2606 sentinel.ch = '\n'; /* doesn't matter, but why not? */
2607 sentinel.ch_len = 1;
182ce2d2 2608 sentinel.nchars = 1;
9c82e145 2609 }
6dcfd253 2610 bidi_cache_iterator_state (&sentinel, 1);
9c82e145 2611 }
b7b65b15
EZ
2612
2613 old_level = bidi_it->resolved_level;
2614 new_level = bidi_level_of_next_char (bidi_it);
b7b65b15
EZ
2615
2616 /* Reordering of resolved levels (clause L2) is implemented by
2617 jumping to the other edge of the level and flipping direction of
c0546589 2618 scanning the text whenever we find a level change. */
b7b65b15
EZ
2619 if (new_level != old_level)
2620 {
2cc21167 2621 bool ascending = new_level > old_level;
b7b65b15
EZ
2622 int level_to_search = ascending ? old_level + 1 : old_level;
2623 int incr = ascending ? 1 : -1;
2624 int expected_next_level = old_level + incr;
2625
b7b65b15
EZ
2626 /* Jump (or walk) to the other edge of this level. */
2627 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2628 /* Switch scan direction and peek at the next character in the
2629 new direction. */
2630 bidi_it->scan_dir = -bidi_it->scan_dir;
2631
2632 /* The following loop handles the case where the resolved level
2633 jumps by more than one. This is typical for numbers inside a
2634 run of text with left-to-right embedding direction, but can
2635 also happen in other situations. In those cases the decision
2636 where to continue after a level change, and in what direction,
2637 is tricky. For example, given a text like below:
2638
2639 abcdefgh
2640 11336622
2641
2642 (where the numbers below the text show the resolved levels),
2643 the result of reordering according to UAX#9 should be this:
2644
2645 efdcghba
2646
2647 This is implemented by the loop below which flips direction
2648 and jumps to the other edge of the level each time it finds
2649 the new level not to be the expected one. The expected level
2650 is always one more or one less than the previous one. */
2651 next_level = bidi_peek_at_next_level (bidi_it);
2652 while (next_level != expected_next_level)
2653 {
713bfeaa
EZ
2654 /* If next_level is -1, it means we have an unresolved level
2655 in the cache, which at this point should not happen. If
2656 it does, we will infloop. */
2657 eassert (next_level >= 0);
b7b65b15
EZ
2658 expected_next_level += incr;
2659 level_to_search += incr;
2660 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2661 bidi_it->scan_dir = -bidi_it->scan_dir;
2662 next_level = bidi_peek_at_next_level (bidi_it);
2663 }
2664
2665 /* Finally, deliver the next character in the new direction. */
2666 next_level = bidi_level_of_next_char (bidi_it);
2667 }
2668
b44d9321
EZ
2669 /* Take note when we have just processed the newline that precedes
2670 the end of the paragraph. The next time we are about to be
2671 called, set_iterator_to_next will automatically reinit the
2672 paragraph direction, if needed. We do this at the newline before
2673 the paragraph separator, because the next character might not be
2674 the first character of the next paragraph, due to the bidi
c0546589
EZ
2675 reordering, whereas we _must_ know the paragraph base direction
2676 _before_ we process the paragraph's text, since the base
2677 direction affects the reordering. */
1137e8b8
EZ
2678 if (bidi_it->scan_dir == 1
2679 && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
be39f003 2680 {
87e67904
EZ
2681 /* The paragraph direction of the entire string, once
2682 determined, is in effect for the entire string. Setting the
2683 separator limit to the end of the string prevents
2684 bidi_paragraph_init from being called automatically on this
2685 string. */
9f257352 2686 if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
87e67904
EZ
2687 bidi_it->separator_limit = bidi_it->string.schars;
2688 else if (bidi_it->bytepos < ZV_BYTE)
be39f003 2689 {
5895d7b9 2690 ptrdiff_t sep_len
512a289d
RS
2691 = bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
2692 bidi_it->bytepos + bidi_it->ch_len);
87e67904 2693 if (bidi_it->nchars <= 0)
1088b922 2694 emacs_abort ();
87e67904
EZ
2695 if (sep_len >= 0)
2696 {
2697 bidi_it->new_paragraph = 1;
2698 /* Record the buffer position of the last character of the
2699 paragraph separator. */
512a289d
RS
2700 bidi_it->separator_limit
2701 = bidi_it->charpos + bidi_it->nchars + sep_len;
87e67904 2702 }
be39f003
EZ
2703 }
2704 }
6bff6497 2705
87e67904 2706 if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
b7b65b15
EZ
2707 {
2708 /* If we are at paragraph's base embedding level and beyond the
2709 last cached position, the cache's job is done and we can
2710 discard it. */
2711 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
182ce2d2
EZ
2712 && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
2713 + bidi_cache[bidi_cache_idx - 1].nchars - 1))
b7b65b15
EZ
2714 bidi_cache_reset ();
2715 /* But as long as we are caching during forward scan, we must
2716 cache each state, or else the cache integrity will be
2717 compromised: it assumes cached states correspond to buffer
2718 positions 1:1. */
2719 else
2720 bidi_cache_iterator_state (bidi_it, 1);
2721 }
acb28818
EZ
2722
2723 if (STRINGP (bidi_it->string.lstring))
2724 UNGCPRO;
b7b65b15
EZ
2725}
2726
2727/* This is meant to be called from within the debugger, whenever you
2728 wish to examine the cache contents. */
e78aecca 2729void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
b7b65b15
EZ
2730void
2731bidi_dump_cached_states (void)
2732{
39e378da 2733 ptrdiff_t i;
b7b65b15
EZ
2734 int ndigits = 1;
2735
2736 if (bidi_cache_idx == 0)
2737 {
2738 fprintf (stderr, "The cache is empty.\n");
2739 return;
2740 }
df9733bf 2741 fprintf (stderr, "Total of %"pD"d state%s in cache:\n",
b7b65b15
EZ
2742 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2743
2744 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2745 ndigits++;
2746 fputs ("ch ", stderr);
2747 for (i = 0; i < bidi_cache_idx; i++)
2748 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2749 fputs ("\n", stderr);
2750 fputs ("lvl ", stderr);
2751 for (i = 0; i < bidi_cache_idx; i++)
2752 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2753 fputs ("\n", stderr);
2754 fputs ("pos ", stderr);
2755 for (i = 0; i < bidi_cache_idx; i++)
7c85f529 2756 fprintf (stderr, "%*"pD"d", ndigits, bidi_cache[i].charpos);
b7b65b15
EZ
2757 fputs ("\n", stderr);
2758}