Declare Lisp_Object Q* variables to be 'static' if not exproted.
[bpt/emacs.git] / src / bidi.c
1 /* Low-level bidirectional buffer-scanning functions for GNU Emacs.
2 Copyright (C) 2000-2001, 2004-2005, 2009-2011
3 Free Software Foundation, Inc.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
19
20 /* Written by Eli Zaretskii <eliz@gnu.org>.
21
22 A sequential implementation of the Unicode Bidirectional algorithm,
23 as per UAX#9, a part of the Unicode Standard.
24
25 Unlike the reference and most other implementations, this one is
26 designed to be called once for every character in the buffer or
27 string.
28
29 The main entry point is bidi_move_to_visually_next. Each time it
30 is called, it finds the next character in the visual order, and
31 returns its information in a special structure. The caller is then
32 expected to process this character for display or any other
33 purposes, and call bidi_move_to_visually_next for the next
34 character. See the comments in bidi_move_to_visually_next for more
35 details about its algorithm that finds the next visual-order
36 character by resolving their levels on the fly.
37
38 The two other entry points are bidi_paragraph_init and
39 bidi_mirror_char. The first determines the base direction of a
40 paragraph, while the second returns the mirrored version of its
41 argument character.
42
43 If you want to understand the code, you will have to read it
44 together with the relevant portions of UAX#9. The comments include
45 references to UAX#9 rules, for that very reason.
46
47 A note about references to UAX#9 rules: if the reference says
48 something like "X9/Retaining", it means that you need to refer to
49 rule X9 and to its modifications decribed in the "Implementation
50 Notes" section of UAX#9, under "Retaining Format Codes". */
51
52 #include <config.h>
53 #include <stdio.h>
54 #include <setjmp.h>
55
56 #include "lisp.h"
57 #include "buffer.h"
58 #include "character.h"
59 #include "dispextern.h"
60
61 static int bidi_initialized = 0;
62
63 static Lisp_Object bidi_type_table, bidi_mirror_table;
64
65 /* FIXME: Remove these when bidi_explicit_dir_char uses a lookup table. */
66 #define LRM_CHAR 0x200E
67 #define RLM_CHAR 0x200F
68 #define LRE_CHAR 0x202A
69 #define RLE_CHAR 0x202B
70 #define PDF_CHAR 0x202C
71 #define LRO_CHAR 0x202D
72 #define RLO_CHAR 0x202E
73
74 #define BIDI_EOB -1
75
76 /* Local data structures. (Look in dispextern.h for the rest.) */
77
78 /* What we need to know about the current paragraph. */
79 struct bidi_paragraph_info {
80 EMACS_INT start_bytepos; /* byte position where it begins */
81 EMACS_INT end_bytepos; /* byte position where it ends */
82 int embedding_level; /* its basic embedding level */
83 bidi_dir_t base_dir; /* its base direction */
84 };
85
86 /* Data type for describing the bidirectional character categories. */
87 typedef enum {
88 UNKNOWN_BC,
89 NEUTRAL,
90 WEAK,
91 STRONG
92 } bidi_category_t;
93
94 int bidi_ignore_explicit_marks_for_paragraph_level = 1;
95
96 static Lisp_Object paragraph_start_re, paragraph_separate_re;
97 static Lisp_Object Qparagraph_start, Qparagraph_separate;
98
99 static void
100 bidi_initialize (void)
101 {
102
103 #include "biditype.h"
104 #include "bidimirror.h"
105
106 int i;
107
108 bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L));
109 staticpro (&bidi_type_table);
110
111 for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++)
112 char_table_set_range (bidi_type_table, bidi_type[i].from, bidi_type[i].to,
113 make_number (bidi_type[i].type));
114
115 bidi_mirror_table = Fmake_char_table (Qnil, Qnil);
116 staticpro (&bidi_mirror_table);
117
118 for (i = 0; i < sizeof bidi_mirror / sizeof bidi_mirror[0]; i++)
119 char_table_set (bidi_mirror_table, bidi_mirror[i].from,
120 make_number (bidi_mirror[i].to));
121
122 Qparagraph_start = intern ("paragraph-start");
123 staticpro (&Qparagraph_start);
124 paragraph_start_re = Fsymbol_value (Qparagraph_start);
125 if (!STRINGP (paragraph_start_re))
126 paragraph_start_re = build_string ("\f\\|[ \t]*$");
127 staticpro (&paragraph_start_re);
128 Qparagraph_separate = intern ("paragraph-separate");
129 staticpro (&Qparagraph_separate);
130 paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
131 if (!STRINGP (paragraph_separate_re))
132 paragraph_separate_re = build_string ("[ \t\f]*$");
133 staticpro (&paragraph_separate_re);
134 bidi_initialized = 1;
135 }
136
137 /* Return the bidi type of a character CH, subject to the current
138 directional OVERRIDE. */
139 static INLINE bidi_type_t
140 bidi_get_type (int ch, bidi_dir_t override)
141 {
142 bidi_type_t default_type;
143
144 if (ch == BIDI_EOB)
145 return NEUTRAL_B;
146 if (ch < 0 || ch > MAX_CHAR)
147 abort ();
148
149 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
150
151 if (override == NEUTRAL_DIR)
152 return default_type;
153
154 switch (default_type)
155 {
156 /* Although UAX#9 does not tell, it doesn't make sense to
157 override NEUTRAL_B and LRM/RLM characters. */
158 case NEUTRAL_B:
159 case LRE:
160 case LRO:
161 case RLE:
162 case RLO:
163 case PDF:
164 return default_type;
165 default:
166 switch (ch)
167 {
168 case LRM_CHAR:
169 case RLM_CHAR:
170 return default_type;
171 default:
172 if (override == L2R) /* X6 */
173 return STRONG_L;
174 else if (override == R2L)
175 return STRONG_R;
176 else
177 abort (); /* can't happen: handled above */
178 }
179 }
180 }
181
182 static void
183 bidi_check_type (bidi_type_t type)
184 {
185 if (type < UNKNOWN_BT || type > NEUTRAL_ON)
186 abort ();
187 }
188
189 /* Given a bidi TYPE of a character, return its category. */
190 static INLINE bidi_category_t
191 bidi_get_category (bidi_type_t type)
192 {
193 switch (type)
194 {
195 case UNKNOWN_BT:
196 return UNKNOWN_BC;
197 case STRONG_L:
198 case STRONG_R:
199 case STRONG_AL:
200 case LRE:
201 case LRO:
202 case RLE:
203 case RLO:
204 return STRONG;
205 case PDF: /* ??? really?? */
206 case WEAK_EN:
207 case WEAK_ES:
208 case WEAK_ET:
209 case WEAK_AN:
210 case WEAK_CS:
211 case WEAK_NSM:
212 case WEAK_BN:
213 return WEAK;
214 case NEUTRAL_B:
215 case NEUTRAL_S:
216 case NEUTRAL_WS:
217 case NEUTRAL_ON:
218 return NEUTRAL;
219 default:
220 abort ();
221 }
222 }
223
224 /* Return the mirrored character of C, if it has one. If C has no
225 mirrored counterpart, return C.
226 Note: The conditions in UAX#9 clause L4 regarding the surrounding
227 context must be tested by the caller. */
228 int
229 bidi_mirror_char (int c)
230 {
231 Lisp_Object val;
232
233 if (c == BIDI_EOB)
234 return c;
235 if (c < 0 || c > MAX_CHAR)
236 abort ();
237
238 val = CHAR_TABLE_REF (bidi_mirror_table, c);
239 if (INTEGERP (val))
240 {
241 int v = XINT (val);
242
243 if (v < 0 || v > MAX_CHAR)
244 abort ();
245
246 return v;
247 }
248
249 return c;
250 }
251
252 /* Copy the bidi iterator from FROM to TO. To save cycles, this only
253 copies the part of the level stack that is actually in use. */
254 static INLINE void
255 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
256 {
257 int i;
258
259 /* Copy everything except the level stack and beyond. */
260 memcpy (to, from, ((size_t)&((struct bidi_it *)0)->level_stack[0]));
261
262 /* Copy the active part of the level stack. */
263 to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
264 for (i = 1; i <= from->stack_idx; i++)
265 to->level_stack[i] = from->level_stack[i];
266 }
267
268 /* Caching the bidi iterator states. */
269
270 #define BIDI_CACHE_CHUNK 200
271 static struct bidi_it *bidi_cache;
272 static size_t bidi_cache_size = 0;
273 static size_t elsz = sizeof (struct bidi_it);
274 static int bidi_cache_idx; /* next unused cache slot */
275 static int bidi_cache_last_idx; /* slot of last cache hit */
276
277 static INLINE void
278 bidi_cache_reset (void)
279 {
280 bidi_cache_idx = 0;
281 bidi_cache_last_idx = -1;
282 }
283
284 static INLINE void
285 bidi_cache_shrink (void)
286 {
287 if (bidi_cache_size > BIDI_CACHE_CHUNK)
288 {
289 bidi_cache_size = BIDI_CACHE_CHUNK;
290 bidi_cache =
291 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
292 }
293 bidi_cache_reset ();
294 }
295
296 static INLINE void
297 bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it)
298 {
299 int current_scan_dir = bidi_it->scan_dir;
300
301 if (idx < 0 || idx >= bidi_cache_idx)
302 abort ();
303
304 bidi_copy_it (bidi_it, &bidi_cache[idx]);
305 bidi_it->scan_dir = current_scan_dir;
306 bidi_cache_last_idx = idx;
307 }
308
309 /* Find a cached state with a given CHARPOS and resolved embedding
310 level less or equal to LEVEL. if LEVEL is -1, disregard the
311 resolved levels in cached states. DIR, if non-zero, means search
312 in that direction from the last cache hit. */
313 static INLINE int
314 bidi_cache_search (EMACS_INT charpos, int level, int dir)
315 {
316 int i, i_start;
317
318 if (bidi_cache_idx)
319 {
320 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
321 dir = -1;
322 else if (charpos > bidi_cache[bidi_cache_last_idx].charpos)
323 dir = 1;
324 if (dir)
325 i_start = bidi_cache_last_idx;
326 else
327 {
328 dir = -1;
329 i_start = bidi_cache_idx - 1;
330 }
331
332 if (dir < 0)
333 {
334 /* Linear search for now; FIXME! */
335 for (i = i_start; i >= 0; i--)
336 if (bidi_cache[i].charpos == charpos
337 && (level == -1 || bidi_cache[i].resolved_level <= level))
338 return i;
339 }
340 else
341 {
342 for (i = i_start; i < bidi_cache_idx; i++)
343 if (bidi_cache[i].charpos == charpos
344 && (level == -1 || bidi_cache[i].resolved_level <= level))
345 return i;
346 }
347 }
348
349 return -1;
350 }
351
352 /* Find a cached state where the resolved level changes to a value
353 that is lower than LEVEL, and return its cache slot index. DIR is
354 the direction to search, starting with the last used cache slot.
355 BEFORE, if non-zero, means return the index of the slot that is
356 ``before'' the level change in the search direction. That is,
357 given the cached levels like this:
358
359 1122333442211
360 AB C
361
362 and assuming we are at the position cached at the slot marked with
363 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
364 index of slot B or A, depending whether BEFORE is, respectively,
365 non-zero or zero. */
366 static int
367 bidi_cache_find_level_change (int level, int dir, int before)
368 {
369 if (bidi_cache_idx)
370 {
371 int i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
372 int incr = before ? 1 : 0;
373
374 if (!dir)
375 dir = -1;
376 else if (!incr)
377 i += dir;
378
379 if (dir < 0)
380 {
381 while (i >= incr)
382 {
383 if (bidi_cache[i - incr].resolved_level >= 0
384 && bidi_cache[i - incr].resolved_level < level)
385 return i;
386 i--;
387 }
388 }
389 else
390 {
391 while (i < bidi_cache_idx - incr)
392 {
393 if (bidi_cache[i + incr].resolved_level >= 0
394 && bidi_cache[i + incr].resolved_level < level)
395 return i;
396 i++;
397 }
398 }
399 }
400
401 return -1;
402 }
403
404 static INLINE void
405 bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
406 {
407 int idx;
408
409 /* We should never cache on backward scans. */
410 if (bidi_it->scan_dir == -1)
411 abort ();
412 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
413
414 if (idx < 0)
415 {
416 idx = bidi_cache_idx;
417 /* Enlarge the cache as needed. */
418 if (idx >= bidi_cache_size)
419 {
420 bidi_cache_size += BIDI_CACHE_CHUNK;
421 bidi_cache =
422 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
423 }
424 /* Character positions should correspond to cache positions 1:1.
425 If we are outside the range of cached positions, the cache is
426 useless and must be reset. */
427 if (idx > 0 &&
428 (bidi_it->charpos > bidi_cache[idx - 1].charpos + 1
429 || bidi_it->charpos < bidi_cache[0].charpos))
430 {
431 bidi_cache_reset ();
432 idx = 0;
433 }
434 bidi_copy_it (&bidi_cache[idx], bidi_it);
435 if (!resolved)
436 bidi_cache[idx].resolved_level = -1;
437 }
438 else
439 {
440 /* Copy only the members which could have changed, to avoid
441 costly copying of the entire struct. */
442 bidi_cache[idx].type = bidi_it->type;
443 bidi_check_type (bidi_it->type);
444 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
445 bidi_check_type (bidi_it->type_after_w1);
446 if (resolved)
447 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
448 else
449 bidi_cache[idx].resolved_level = -1;
450 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
451 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
452 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
453 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
454 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
455 }
456
457 bidi_cache_last_idx = idx;
458 if (idx >= bidi_cache_idx)
459 bidi_cache_idx = idx + 1;
460 }
461
462 static INLINE bidi_type_t
463 bidi_cache_find (EMACS_INT charpos, int level, struct bidi_it *bidi_it)
464 {
465 int i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
466
467 if (i >= 0)
468 {
469 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
470
471 bidi_copy_it (bidi_it, &bidi_cache[i]);
472 bidi_cache_last_idx = i;
473 /* Don't let scan direction from from the cached state override
474 the current scan direction. */
475 bidi_it->scan_dir = current_scan_dir;
476 return bidi_it->type;
477 }
478
479 return UNKNOWN_BT;
480 }
481
482 static INLINE int
483 bidi_peek_at_next_level (struct bidi_it *bidi_it)
484 {
485 if (bidi_cache_idx == 0 || bidi_cache_last_idx == -1)
486 abort ();
487 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
488 }
489
490 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
491 Value is the non-negative length of the paragraph separator
492 following the buffer position, -1 if position is at the beginning
493 of a new paragraph, or -2 if position is neither at beginning nor
494 at end of a paragraph. */
495 static EMACS_INT
496 bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
497 {
498 Lisp_Object sep_re;
499 Lisp_Object start_re;
500 EMACS_INT val;
501
502 sep_re = paragraph_separate_re;
503 start_re = paragraph_start_re;
504
505 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
506 if (val < 0)
507 {
508 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
509 val = -1;
510 else
511 val = -2;
512 }
513
514 return val;
515 }
516
517 /* Determine the start-of-run (sor) directional type given the two
518 embedding levels on either side of the run boundary. Also, update
519 the saved info about previously seen characters, since that info is
520 generally valid for a single level run. */
521 static INLINE void
522 bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
523 {
524 int higher_level = level_before > level_after ? level_before : level_after;
525
526 /* The prev_was_pdf gork is required for when we have several PDFs
527 in a row. In that case, we want to compute the sor type for the
528 next level run only once: when we see the first PDF. That's
529 because the sor type depends only on the higher of the two levels
530 that we find on the two sides of the level boundary (see UAX#9,
531 clause X10), and so we don't need to know the final embedding
532 level to which we descend after processing all the PDFs. */
533 if (!bidi_it->prev_was_pdf || level_before < level_after)
534 /* FIXME: should the default sor direction be user selectable? */
535 bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
536 if (level_before > level_after)
537 bidi_it->prev_was_pdf = 1;
538
539 bidi_it->prev.type = UNKNOWN_BT;
540 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
541 bidi_it->last_strong.orig_type = UNKNOWN_BT;
542 bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
543 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
544 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
545 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
546 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
547 bidi_it->ignore_bn_limit = 0; /* meaning it's unknown */
548 }
549
550 static void
551 bidi_line_init (struct bidi_it *bidi_it)
552 {
553 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
554 bidi_it->resolved_level = bidi_it->level_stack[0].level;
555 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
556 bidi_it->invalid_levels = 0;
557 bidi_it->invalid_rl_levels = -1;
558 bidi_it->next_en_pos = -1;
559 bidi_it->next_for_ws.type = UNKNOWN_BT;
560 bidi_set_sor_type (bidi_it,
561 bidi_it->paragraph_dir == R2L ? 1 : 0,
562 bidi_it->level_stack[0].level); /* X10 */
563
564 bidi_cache_reset ();
565 }
566
567 /* Find the beginning of this paragraph by looking back in the buffer.
568 Value is the byte position of the paragraph's beginning. */
569 static EMACS_INT
570 bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
571 {
572 Lisp_Object re = paragraph_start_re;
573 EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
574
575 while (pos_byte > BEGV_BYTE
576 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
577 {
578 pos = find_next_newline_no_quit (pos - 1, -1);
579 pos_byte = CHAR_TO_BYTE (pos);
580 }
581 return pos_byte;
582 }
583
584 /* Determine the base direction, a.k.a. base embedding level, of the
585 paragraph we are about to iterate through. If DIR is either L2R or
586 R2L, just use that. Otherwise, determine the paragraph direction
587 from the first strong directional character of the paragraph.
588
589 NO_DEFAULT_P non-nil means don't default to L2R if the paragraph
590 has no strong directional characters and both DIR and
591 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
592 in the buffer until a paragraph is found with a strong character,
593 or until hitting BEGV. In the latter case, fall back to L2R. This
594 flag is used in current-bidi-paragraph-direction.
595
596 Note that this function gives the paragraph separator the same
597 direction as the preceding paragraph, even though Emacs generally
598 views the separartor as not belonging to any paragraph. */
599 void
600 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p)
601 {
602 EMACS_INT bytepos = bidi_it->bytepos;
603 EMACS_INT pstartbyte;
604
605 /* Special case for an empty buffer. */
606 if (bytepos == BEGV_BYTE && bytepos == ZV_BYTE)
607 dir = L2R;
608 /* We should never be called at EOB or before BEGV. */
609 else if (bytepos >= ZV_BYTE || bytepos < BEGV_BYTE)
610 abort ();
611
612 if (dir == L2R)
613 {
614 bidi_it->paragraph_dir = L2R;
615 bidi_it->new_paragraph = 0;
616 }
617 else if (dir == R2L)
618 {
619 bidi_it->paragraph_dir = R2L;
620 bidi_it->new_paragraph = 0;
621 }
622 else if (dir == NEUTRAL_DIR) /* P2 */
623 {
624 int ch, ch_len;
625 EMACS_INT pos;
626 bidi_type_t type;
627
628 if (!bidi_initialized)
629 bidi_initialize ();
630
631 /* If we are inside a paragraph separator, we are just waiting
632 for the separator to be exhausted; use the previous paragraph
633 direction. But don't do that if we have been just reseated,
634 because we need to reinitialize below in that case. */
635 if (!bidi_it->first_elt
636 && bidi_it->charpos < bidi_it->separator_limit)
637 return;
638
639 /* If we are on a newline, get past it to where the next
640 paragraph might start. But don't do that at BEGV since then
641 we are potentially in a new paragraph that doesn't yet
642 exist. */
643 pos = bidi_it->charpos;
644 if (bytepos > BEGV_BYTE && FETCH_CHAR (bytepos) == '\n')
645 {
646 bytepos++;
647 pos++;
648 }
649
650 /* We are either at the beginning of a paragraph or in the
651 middle of it. Find where this paragraph starts. */
652 pstartbyte = bidi_find_paragraph_start (pos, bytepos);
653 bidi_it->separator_limit = -1;
654 bidi_it->new_paragraph = 0;
655
656 /* The following loop is run more than once only if NO_DEFAULT_P
657 is non-zero. */
658 do {
659 bytepos = pstartbyte;
660 ch = FETCH_CHAR (bytepos);
661 ch_len = CHAR_BYTES (ch);
662 pos = BYTE_TO_CHAR (bytepos);
663 type = bidi_get_type (ch, NEUTRAL_DIR);
664
665 for (pos++, bytepos += ch_len;
666 /* NOTE: UAX#9 says to search only for L, AL, or R types
667 of characters, and ignore RLE, RLO, LRE, and LRO.
668 However, I'm not sure it makes sense to omit those 4;
669 should try with and without that to see the effect. */
670 (bidi_get_category (type) != STRONG)
671 || (bidi_ignore_explicit_marks_for_paragraph_level
672 && (type == RLE || type == RLO
673 || type == LRE || type == LRO));
674 type = bidi_get_type (ch, NEUTRAL_DIR))
675 {
676 if (type == NEUTRAL_B && bidi_at_paragraph_end (pos, bytepos) >= -1)
677 break;
678 if (bytepos >= ZV_BYTE)
679 {
680 /* Pretend there's a paragraph separator at end of
681 buffer. */
682 type = NEUTRAL_B;
683 break;
684 }
685 FETCH_CHAR_ADVANCE (ch, pos, bytepos);
686 }
687 if (type == STRONG_R || type == STRONG_AL) /* P3 */
688 bidi_it->paragraph_dir = R2L;
689 else if (type == STRONG_L)
690 bidi_it->paragraph_dir = L2R;
691 if (no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
692 {
693 /* If this paragraph is at BEGV, default to L2R. */
694 if (pstartbyte == BEGV_BYTE)
695 bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
696 else
697 {
698 EMACS_INT prevpbyte = pstartbyte;
699 EMACS_INT p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
700
701 /* Find the beginning of the previous paragraph, if any. */
702 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
703 {
704 p--;
705 pbyte = CHAR_TO_BYTE (p);
706 prevpbyte = bidi_find_paragraph_start (p, pbyte);
707 }
708 pstartbyte = prevpbyte;
709 }
710 }
711 } while (no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
712 }
713 else
714 abort ();
715
716 /* Contrary to UAX#9 clause P3, we only default the paragraph
717 direction to L2R if we have no previous usable paragraph
718 direction. This is allowed by the HL1 clause. */
719 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
720 bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
721 if (bidi_it->paragraph_dir == R2L)
722 bidi_it->level_stack[0].level = 1;
723 else
724 bidi_it->level_stack[0].level = 0;
725
726 bidi_line_init (bidi_it);
727 }
728
729 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
730 end. */
731 static INLINE void
732 bidi_set_paragraph_end (struct bidi_it *bidi_it)
733 {
734 bidi_it->invalid_levels = 0;
735 bidi_it->invalid_rl_levels = -1;
736 bidi_it->stack_idx = 0;
737 bidi_it->resolved_level = bidi_it->level_stack[0].level;
738 }
739
740 /* Initialize the bidi iterator from buffer position CHARPOS. */
741 void
742 bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, struct bidi_it *bidi_it)
743 {
744 if (! bidi_initialized)
745 bidi_initialize ();
746 bidi_it->charpos = charpos;
747 bidi_it->bytepos = bytepos;
748 bidi_it->first_elt = 1;
749 bidi_set_paragraph_end (bidi_it);
750 bidi_it->new_paragraph = 1;
751 bidi_it->separator_limit = -1;
752 bidi_it->type = NEUTRAL_B;
753 bidi_it->type_after_w1 = NEUTRAL_B;
754 bidi_it->orig_type = NEUTRAL_B;
755 bidi_it->prev_was_pdf = 0;
756 bidi_it->prev.type = bidi_it->prev.type_after_w1 =
757 bidi_it->prev.orig_type = UNKNOWN_BT;
758 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
759 bidi_it->last_strong.orig_type = UNKNOWN_BT;
760 bidi_it->next_for_neutral.charpos = -1;
761 bidi_it->next_for_neutral.type =
762 bidi_it->next_for_neutral.type_after_w1 =
763 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
764 bidi_it->prev_for_neutral.charpos = -1;
765 bidi_it->prev_for_neutral.type =
766 bidi_it->prev_for_neutral.type_after_w1 =
767 bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
768 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
769 bidi_cache_shrink ();
770 }
771
772 /* Push the current embedding level and override status; reset the
773 current level to LEVEL and the current override status to OVERRIDE. */
774 static INLINE void
775 bidi_push_embedding_level (struct bidi_it *bidi_it,
776 int level, bidi_dir_t override)
777 {
778 bidi_it->stack_idx++;
779 if (bidi_it->stack_idx >= BIDI_MAXLEVEL)
780 abort ();
781 bidi_it->level_stack[bidi_it->stack_idx].level = level;
782 bidi_it->level_stack[bidi_it->stack_idx].override = override;
783 }
784
785 /* Pop the embedding level and directional override status from the
786 stack, and return the new level. */
787 static INLINE int
788 bidi_pop_embedding_level (struct bidi_it *bidi_it)
789 {
790 /* UAX#9 says to ignore invalid PDFs. */
791 if (bidi_it->stack_idx > 0)
792 bidi_it->stack_idx--;
793 return bidi_it->level_stack[bidi_it->stack_idx].level;
794 }
795
796 /* Record in SAVED_INFO the information about the current character. */
797 static INLINE void
798 bidi_remember_char (struct bidi_saved_info *saved_info,
799 struct bidi_it *bidi_it)
800 {
801 saved_info->charpos = bidi_it->charpos;
802 saved_info->bytepos = bidi_it->bytepos;
803 saved_info->type = bidi_it->type;
804 bidi_check_type (bidi_it->type);
805 saved_info->type_after_w1 = bidi_it->type_after_w1;
806 bidi_check_type (bidi_it->type_after_w1);
807 saved_info->orig_type = bidi_it->orig_type;
808 bidi_check_type (bidi_it->orig_type);
809 }
810
811 /* Resolve the type of a neutral character according to the type of
812 surrounding strong text and the current embedding level. */
813 static INLINE bidi_type_t
814 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
815 {
816 /* N1: European and Arabic numbers are treated as though they were R. */
817 if (next_type == WEAK_EN || next_type == WEAK_AN)
818 next_type = STRONG_R;
819 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
820 prev_type = STRONG_R;
821
822 if (next_type == prev_type) /* N1 */
823 return next_type;
824 else if ((lev & 1) == 0) /* N2 */
825 return STRONG_L;
826 else
827 return STRONG_R;
828 }
829
830 static INLINE int
831 bidi_explicit_dir_char (int c)
832 {
833 /* FIXME: this should be replaced with a lookup table with suitable
834 bits set, like standard C ctype macros do. */
835 return (c == LRE_CHAR || c == LRO_CHAR
836 || c == RLE_CHAR || c == RLO_CHAR || c == PDF_CHAR);
837 }
838
839 /* A helper function for bidi_resolve_explicit. It advances to the
840 next character in logical order and determines the new embedding
841 level and directional override, but does not take into account
842 empty embeddings. */
843 static int
844 bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
845 {
846 int curchar;
847 bidi_type_t type;
848 int current_level;
849 int new_level;
850 bidi_dir_t override;
851
852 if (bidi_it->bytepos < BEGV_BYTE /* after reseat to BEGV? */
853 || bidi_it->first_elt)
854 {
855 bidi_it->first_elt = 0;
856 if (bidi_it->charpos < BEGV)
857 bidi_it->charpos = BEGV;
858 bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
859 }
860 else if (bidi_it->bytepos < ZV_BYTE) /* don't move at ZV */
861 {
862 bidi_it->charpos++;
863 if (bidi_it->ch_len == 0)
864 abort ();
865 bidi_it->bytepos += bidi_it->ch_len;
866 }
867
868 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
869 override = bidi_it->level_stack[bidi_it->stack_idx].override;
870 new_level = current_level;
871
872 /* in case it is a unibyte character (not yet implemented) */
873 /* _fetch_multibyte_char_len = 1; */
874 if (bidi_it->bytepos >= ZV_BYTE)
875 {
876 curchar = BIDI_EOB;
877 bidi_it->ch_len = 1;
878 }
879 else
880 {
881 curchar = FETCH_CHAR (bidi_it->bytepos);
882 bidi_it->ch_len = CHAR_BYTES (curchar);
883 }
884 bidi_it->ch = curchar;
885
886 /* Don't apply directional override here, as all the types we handle
887 below will not be affected by the override anyway, and we need
888 the original type unaltered. The override will be applied in
889 bidi_resolve_weak. */
890 type = bidi_get_type (curchar, NEUTRAL_DIR);
891 bidi_it->orig_type = type;
892 bidi_check_type (bidi_it->orig_type);
893
894 if (type != PDF)
895 bidi_it->prev_was_pdf = 0;
896
897 bidi_it->type_after_w1 = UNKNOWN_BT;
898
899 switch (type)
900 {
901 case RLE: /* X2 */
902 case RLO: /* X4 */
903 bidi_it->type_after_w1 = type;
904 bidi_check_type (bidi_it->type_after_w1);
905 type = WEAK_BN; /* X9/Retaining */
906 if (bidi_it->ignore_bn_limit <= 0)
907 {
908 if (current_level <= BIDI_MAXLEVEL - 4)
909 {
910 /* Compute the least odd embedding level greater than
911 the current level. */
912 new_level = ((current_level + 1) & ~1) + 1;
913 if (bidi_it->type_after_w1 == RLE)
914 override = NEUTRAL_DIR;
915 else
916 override = R2L;
917 if (current_level == BIDI_MAXLEVEL - 4)
918 bidi_it->invalid_rl_levels = 0;
919 bidi_push_embedding_level (bidi_it, new_level, override);
920 }
921 else
922 {
923 bidi_it->invalid_levels++;
924 /* See the commentary about invalid_rl_levels below. */
925 if (bidi_it->invalid_rl_levels < 0)
926 bidi_it->invalid_rl_levels = 0;
927 bidi_it->invalid_rl_levels++;
928 }
929 }
930 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
931 || bidi_it->next_en_pos > bidi_it->charpos)
932 type = WEAK_EN;
933 break;
934 case LRE: /* X3 */
935 case LRO: /* X5 */
936 bidi_it->type_after_w1 = type;
937 bidi_check_type (bidi_it->type_after_w1);
938 type = WEAK_BN; /* X9/Retaining */
939 if (bidi_it->ignore_bn_limit <= 0)
940 {
941 if (current_level <= BIDI_MAXLEVEL - 5)
942 {
943 /* Compute the least even embedding level greater than
944 the current level. */
945 new_level = ((current_level + 2) & ~1);
946 if (bidi_it->type_after_w1 == LRE)
947 override = NEUTRAL_DIR;
948 else
949 override = L2R;
950 bidi_push_embedding_level (bidi_it, new_level, override);
951 }
952 else
953 {
954 bidi_it->invalid_levels++;
955 /* invalid_rl_levels counts invalid levels encountered
956 while the embedding level was already too high for
957 LRE/LRO, but not for RLE/RLO. That is because
958 there may be exactly one PDF which we should not
959 ignore even though invalid_levels is non-zero.
960 invalid_rl_levels helps to know what PDF is
961 that. */
962 if (bidi_it->invalid_rl_levels >= 0)
963 bidi_it->invalid_rl_levels++;
964 }
965 }
966 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
967 || bidi_it->next_en_pos > bidi_it->charpos)
968 type = WEAK_EN;
969 break;
970 case PDF: /* X7 */
971 bidi_it->type_after_w1 = type;
972 bidi_check_type (bidi_it->type_after_w1);
973 type = WEAK_BN; /* X9/Retaining */
974 if (bidi_it->ignore_bn_limit <= 0)
975 {
976 if (!bidi_it->invalid_rl_levels)
977 {
978 new_level = bidi_pop_embedding_level (bidi_it);
979 bidi_it->invalid_rl_levels = -1;
980 if (bidi_it->invalid_levels)
981 bidi_it->invalid_levels--;
982 /* else nothing: UAX#9 says to ignore invalid PDFs */
983 }
984 if (!bidi_it->invalid_levels)
985 new_level = bidi_pop_embedding_level (bidi_it);
986 else
987 {
988 bidi_it->invalid_levels--;
989 bidi_it->invalid_rl_levels--;
990 }
991 }
992 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
993 || bidi_it->next_en_pos > bidi_it->charpos)
994 type = WEAK_EN;
995 break;
996 default:
997 /* Nothing. */
998 break;
999 }
1000
1001 bidi_it->type = type;
1002 bidi_check_type (bidi_it->type);
1003
1004 return new_level;
1005 }
1006
1007 /* Given an iterator state in BIDI_IT, advance one character position
1008 in the buffer to the next character (in the logical order), resolve
1009 any explicit embeddings and directional overrides, and return the
1010 embedding level of the character after resolving explicit
1011 directives and ignoring empty embeddings. */
1012 static int
1013 bidi_resolve_explicit (struct bidi_it *bidi_it)
1014 {
1015 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1016 int new_level = bidi_resolve_explicit_1 (bidi_it);
1017
1018 if (prev_level < new_level
1019 && bidi_it->type == WEAK_BN
1020 && bidi_it->ignore_bn_limit == 0 /* only if not already known */
1021 && bidi_it->bytepos < ZV_BYTE /* not already at EOB */
1022 && bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
1023 + bidi_it->ch_len)))
1024 {
1025 /* Avoid pushing and popping embedding levels if the level run
1026 is empty, as this breaks level runs where it shouldn't.
1027 UAX#9 removes all the explicit embedding and override codes,
1028 so empty embeddings disappear without a trace. We need to
1029 behave as if we did the same. */
1030 struct bidi_it saved_it;
1031 int level = prev_level;
1032
1033 bidi_copy_it (&saved_it, bidi_it);
1034
1035 while (bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
1036 + bidi_it->ch_len)))
1037 {
1038 level = bidi_resolve_explicit_1 (bidi_it);
1039 }
1040
1041 if (level == prev_level) /* empty embedding */
1042 saved_it.ignore_bn_limit = bidi_it->charpos + 1;
1043 else /* this embedding is non-empty */
1044 saved_it.ignore_bn_limit = -1;
1045
1046 bidi_copy_it (bidi_it, &saved_it);
1047 if (bidi_it->ignore_bn_limit > 0)
1048 {
1049 /* We pushed a level, but we shouldn't have. Undo that. */
1050 if (!bidi_it->invalid_rl_levels)
1051 {
1052 new_level = bidi_pop_embedding_level (bidi_it);
1053 bidi_it->invalid_rl_levels = -1;
1054 if (bidi_it->invalid_levels)
1055 bidi_it->invalid_levels--;
1056 }
1057 if (!bidi_it->invalid_levels)
1058 new_level = bidi_pop_embedding_level (bidi_it);
1059 else
1060 {
1061 bidi_it->invalid_levels--;
1062 bidi_it->invalid_rl_levels--;
1063 }
1064 }
1065 }
1066
1067 if (bidi_it->type == NEUTRAL_B) /* X8 */
1068 {
1069 bidi_set_paragraph_end (bidi_it);
1070 /* This is needed by bidi_resolve_weak below, and in L1. */
1071 bidi_it->type_after_w1 = bidi_it->type;
1072 bidi_check_type (bidi_it->type_after_w1);
1073 }
1074
1075 return new_level;
1076 }
1077
1078 /* Advance in the buffer, resolve weak types and return the type of
1079 the next character after weak type resolution. */
1080 static bidi_type_t
1081 bidi_resolve_weak (struct bidi_it *bidi_it)
1082 {
1083 bidi_type_t type;
1084 bidi_dir_t override;
1085 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1086 int new_level = bidi_resolve_explicit (bidi_it);
1087 int next_char;
1088 bidi_type_t type_of_next;
1089 struct bidi_it saved_it;
1090
1091 type = bidi_it->type;
1092 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1093
1094 if (type == UNKNOWN_BT
1095 || type == LRE
1096 || type == LRO
1097 || type == RLE
1098 || type == RLO
1099 || type == PDF)
1100 abort ();
1101
1102 if (new_level != prev_level
1103 || bidi_it->type == NEUTRAL_B)
1104 {
1105 /* We've got a new embedding level run, compute the directional
1106 type of sor and initialize per-run variables (UAX#9, clause
1107 X10). */
1108 bidi_set_sor_type (bidi_it, prev_level, new_level);
1109 }
1110 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1111 || type == WEAK_BN || type == STRONG_AL)
1112 bidi_it->type_after_w1 = type; /* needed in L1 */
1113 bidi_check_type (bidi_it->type_after_w1);
1114
1115 /* Level and directional override status are already recorded in
1116 bidi_it, and do not need any change; see X6. */
1117 if (override == R2L) /* X6 */
1118 type = STRONG_R;
1119 else if (override == L2R)
1120 type = STRONG_L;
1121 else
1122 {
1123 if (type == WEAK_NSM) /* W1 */
1124 {
1125 /* Note that we don't need to consider the case where the
1126 prev character has its type overridden by an RLO or LRO,
1127 because then either the type of this NSM would have been
1128 also overridden, or the previous character is outside the
1129 current level run, and thus not relevant to this NSM.
1130 This is why NSM gets the type_after_w1 of the previous
1131 character. */
1132 if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1133 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1134 && bidi_it->prev.type_after_w1 != NEUTRAL_B)
1135 type = bidi_it->prev.type_after_w1;
1136 else if (bidi_it->sor == R2L)
1137 type = STRONG_R;
1138 else if (bidi_it->sor == L2R)
1139 type = STRONG_L;
1140 else /* shouldn't happen! */
1141 abort ();
1142 }
1143 if (type == WEAK_EN /* W2 */
1144 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1145 type = WEAK_AN;
1146 else if (type == STRONG_AL) /* W3 */
1147 type = STRONG_R;
1148 else if ((type == WEAK_ES /* W4 */
1149 && bidi_it->prev.type_after_w1 == WEAK_EN
1150 && bidi_it->prev.orig_type == WEAK_EN)
1151 || (type == WEAK_CS
1152 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1153 && bidi_it->prev.orig_type == WEAK_EN)
1154 || bidi_it->prev.type_after_w1 == WEAK_AN)))
1155 {
1156 next_char =
1157 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1158 ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
1159 type_of_next = bidi_get_type (next_char, override);
1160
1161 if (type_of_next == WEAK_BN
1162 || bidi_explicit_dir_char (next_char))
1163 {
1164 bidi_copy_it (&saved_it, bidi_it);
1165 while (bidi_resolve_explicit (bidi_it) == new_level
1166 && bidi_it->type == WEAK_BN)
1167 ;
1168 type_of_next = bidi_it->type;
1169 bidi_copy_it (bidi_it, &saved_it);
1170 }
1171
1172 /* If the next character is EN, but the last strong-type
1173 character is AL, that next EN will be changed to AN when
1174 we process it in W2 above. So in that case, this ES
1175 should not be changed into EN. */
1176 if (type == WEAK_ES
1177 && type_of_next == WEAK_EN
1178 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1179 type = WEAK_EN;
1180 else if (type == WEAK_CS)
1181 {
1182 if (bidi_it->prev.type_after_w1 == WEAK_AN
1183 && (type_of_next == WEAK_AN
1184 /* If the next character is EN, but the last
1185 strong-type character is AL, EN will be later
1186 changed to AN when we process it in W2 above.
1187 So in that case, this ES should not be
1188 changed into EN. */
1189 || (type_of_next == WEAK_EN
1190 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1191 type = WEAK_AN;
1192 else if (bidi_it->prev.type_after_w1 == WEAK_EN
1193 && type_of_next == WEAK_EN
1194 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1195 type = WEAK_EN;
1196 }
1197 }
1198 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1199 || type == WEAK_BN) /* W5/Retaining */
1200 {
1201 if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
1202 || bidi_it->next_en_pos > bidi_it->charpos)
1203 type = WEAK_EN;
1204 else /* W5: ET/BN with EN after it. */
1205 {
1206 EMACS_INT en_pos = bidi_it->charpos + 1;
1207
1208 next_char =
1209 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1210 ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
1211 type_of_next = bidi_get_type (next_char, override);
1212
1213 if (type_of_next == WEAK_ET
1214 || type_of_next == WEAK_BN
1215 || bidi_explicit_dir_char (next_char))
1216 {
1217 bidi_copy_it (&saved_it, bidi_it);
1218 while (bidi_resolve_explicit (bidi_it) == new_level
1219 && (bidi_it->type == WEAK_BN
1220 || bidi_it->type == WEAK_ET))
1221 ;
1222 type_of_next = bidi_it->type;
1223 en_pos = bidi_it->charpos;
1224 bidi_copy_it (bidi_it, &saved_it);
1225 }
1226 if (type_of_next == WEAK_EN)
1227 {
1228 /* If the last strong character is AL, the EN we've
1229 found will become AN when we get to it (W2). */
1230 if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
1231 {
1232 type = WEAK_EN;
1233 /* Remember this EN position, to speed up processing
1234 of the next ETs. */
1235 bidi_it->next_en_pos = en_pos;
1236 }
1237 else if (type == WEAK_BN)
1238 type = NEUTRAL_ON; /* W6/Retaining */
1239 }
1240 }
1241 }
1242 }
1243
1244 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
1245 || (type == WEAK_BN
1246 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1247 || bidi_it->prev.type_after_w1 == WEAK_ES
1248 || bidi_it->prev.type_after_w1 == WEAK_ET)))
1249 type = NEUTRAL_ON;
1250
1251 /* Store the type we've got so far, before we clobber it with strong
1252 types in W7 and while resolving neutral types. But leave alone
1253 the original types that were recorded above, because we will need
1254 them for the L1 clause. */
1255 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1256 bidi_it->type_after_w1 = type;
1257 bidi_check_type (bidi_it->type_after_w1);
1258
1259 if (type == WEAK_EN) /* W7 */
1260 {
1261 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
1262 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1263 type = STRONG_L;
1264 }
1265
1266 bidi_it->type = type;
1267 bidi_check_type (bidi_it->type);
1268 return type;
1269 }
1270
1271 static bidi_type_t
1272 bidi_resolve_neutral (struct bidi_it *bidi_it)
1273 {
1274 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1275 bidi_type_t type = bidi_resolve_weak (bidi_it);
1276 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1277
1278 if (!(type == STRONG_R
1279 || type == STRONG_L
1280 || type == WEAK_BN
1281 || type == WEAK_EN
1282 || type == WEAK_AN
1283 || type == NEUTRAL_B
1284 || type == NEUTRAL_S
1285 || type == NEUTRAL_WS
1286 || type == NEUTRAL_ON))
1287 abort ();
1288
1289 if (bidi_get_category (type) == NEUTRAL
1290 || (type == WEAK_BN && prev_level == current_level))
1291 {
1292 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1293 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1294 bidi_it->next_for_neutral.type,
1295 current_level);
1296 else
1297 {
1298 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1299 the assumption of batch-style processing; see clauses W4,
1300 W5, and especially N1, which require to look far forward
1301 (as well as back) in the buffer. May the fleas of a
1302 thousand camels infest the armpits of those who design
1303 supposedly general-purpose algorithms by looking at their
1304 own implementations, and fail to consider other possible
1305 implementations! */
1306 struct bidi_it saved_it;
1307 bidi_type_t next_type;
1308
1309 if (bidi_it->scan_dir == -1)
1310 abort ();
1311
1312 bidi_copy_it (&saved_it, bidi_it);
1313 /* Scan the text forward until we find the first non-neutral
1314 character, and then use that to resolve the neutral we
1315 are dealing with now. We also cache the scanned iterator
1316 states, to salvage some of the effort later. */
1317 bidi_cache_iterator_state (bidi_it, 0);
1318 do {
1319 /* Record the info about the previous character, so that
1320 it will be cached below with this state. */
1321 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1322 && bidi_it->type != WEAK_BN)
1323 bidi_remember_char (&bidi_it->prev, bidi_it);
1324 type = bidi_resolve_weak (bidi_it);
1325 /* Paragraph separators have their levels fully resolved
1326 at this point, so cache them as resolved. */
1327 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1328 /* FIXME: implement L1 here, by testing for a newline and
1329 resetting the level for any sequence of whitespace
1330 characters adjacent to it. */
1331 } while (!(type == NEUTRAL_B
1332 || (type != WEAK_BN
1333 && bidi_get_category (type) != NEUTRAL)
1334 /* This is all per level run, so stop when we
1335 reach the end of this level run. */
1336 || bidi_it->level_stack[bidi_it->stack_idx].level !=
1337 current_level));
1338
1339 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1340
1341 switch (type)
1342 {
1343 case STRONG_L:
1344 case STRONG_R:
1345 case STRONG_AL:
1346 next_type = type;
1347 break;
1348 case WEAK_EN:
1349 case WEAK_AN:
1350 /* N1: ``European and Arabic numbers are treated as
1351 though they were R.'' */
1352 next_type = STRONG_R;
1353 saved_it.next_for_neutral.type = STRONG_R;
1354 break;
1355 case WEAK_BN:
1356 if (!bidi_explicit_dir_char (bidi_it->ch))
1357 abort (); /* can't happen: BNs are skipped */
1358 /* FALLTHROUGH */
1359 case NEUTRAL_B:
1360 /* Marched all the way to the end of this level run.
1361 We need to use the eor type, whose information is
1362 stored by bidi_set_sor_type in the prev_for_neutral
1363 member. */
1364 if (saved_it.type != WEAK_BN
1365 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
1366 {
1367 next_type = bidi_it->prev_for_neutral.type;
1368 saved_it.next_for_neutral.type = next_type;
1369 bidi_check_type (next_type);
1370 }
1371 else
1372 {
1373 /* This is a BN which does not adjoin neutrals.
1374 Leave its type alone. */
1375 bidi_copy_it (bidi_it, &saved_it);
1376 return bidi_it->type;
1377 }
1378 break;
1379 default:
1380 abort ();
1381 }
1382 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1383 next_type, current_level);
1384 saved_it.type = type;
1385 bidi_check_type (type);
1386 bidi_copy_it (bidi_it, &saved_it);
1387 }
1388 }
1389 return type;
1390 }
1391
1392 /* Given an iterator state in BIDI_IT, advance one character position
1393 in the buffer to the next character (in the logical order), resolve
1394 the bidi type of that next character, and return that type. */
1395 static bidi_type_t
1396 bidi_type_of_next_char (struct bidi_it *bidi_it)
1397 {
1398 bidi_type_t type;
1399
1400 /* This should always be called during a forward scan. */
1401 if (bidi_it->scan_dir != 1)
1402 abort ();
1403
1404 /* Reset the limit until which to ignore BNs if we step out of the
1405 area where we found only empty levels. */
1406 if ((bidi_it->ignore_bn_limit > 0
1407 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
1408 || (bidi_it->ignore_bn_limit == -1
1409 && !bidi_explicit_dir_char (bidi_it->ch)))
1410 bidi_it->ignore_bn_limit = 0;
1411
1412 type = bidi_resolve_neutral (bidi_it);
1413
1414 return type;
1415 }
1416
1417 /* Given an iterator state BIDI_IT, advance one character position in
1418 the buffer to the next character (in the logical order), resolve
1419 the embedding and implicit levels of that next character, and
1420 return the resulting level. */
1421 static int
1422 bidi_level_of_next_char (struct bidi_it *bidi_it)
1423 {
1424 bidi_type_t type;
1425 int level, prev_level = -1;
1426 struct bidi_saved_info next_for_neutral;
1427
1428 if (bidi_it->scan_dir == 1)
1429 {
1430 /* There's no sense in trying to advance if we hit end of text. */
1431 if (bidi_it->bytepos >= ZV_BYTE)
1432 return bidi_it->resolved_level;
1433
1434 /* Record the info about the previous character. */
1435 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1436 && bidi_it->type != WEAK_BN)
1437 bidi_remember_char (&bidi_it->prev, bidi_it);
1438 if (bidi_it->type_after_w1 == STRONG_R
1439 || bidi_it->type_after_w1 == STRONG_L
1440 || bidi_it->type_after_w1 == STRONG_AL)
1441 bidi_remember_char (&bidi_it->last_strong, bidi_it);
1442 /* FIXME: it sounds like we don't need both prev and
1443 prev_for_neutral members, but I'm leaving them both for now. */
1444 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1445 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1446 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1447
1448 /* If we overstepped the characters used for resolving neutrals
1449 and whitespace, invalidate their info in the iterator. */
1450 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1451 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1452 if (bidi_it->next_en_pos >= 0
1453 && bidi_it->charpos >= bidi_it->next_en_pos)
1454 bidi_it->next_en_pos = -1;
1455 if (bidi_it->next_for_ws.type != UNKNOWN_BT
1456 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1457 bidi_it->next_for_ws.type = UNKNOWN_BT;
1458
1459 /* This must be taken before we fill the iterator with the info
1460 about the next char. If we scan backwards, the iterator
1461 state must be already cached, so there's no need to know the
1462 embedding level of the previous character, since we will be
1463 returning to our caller shortly. */
1464 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1465 }
1466 next_for_neutral = bidi_it->next_for_neutral;
1467
1468 /* Perhaps it is already cached. */
1469 type = bidi_cache_find (bidi_it->charpos + bidi_it->scan_dir, -1, bidi_it);
1470 if (type != UNKNOWN_BT)
1471 {
1472 /* Don't lose the information for resolving neutrals! The
1473 cached states could have been cached before their
1474 next_for_neutral member was computed. If we are on our way
1475 forward, we can simply take the info from the previous
1476 state. */
1477 if (bidi_it->scan_dir == 1
1478 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
1479 bidi_it->next_for_neutral = next_for_neutral;
1480
1481 /* If resolved_level is -1, it means this state was cached
1482 before it was completely resolved, so we cannot return
1483 it. */
1484 if (bidi_it->resolved_level != -1)
1485 return bidi_it->resolved_level;
1486 }
1487 if (bidi_it->scan_dir == -1)
1488 /* If we are going backwards, the iterator state is already cached
1489 from previous scans, and should be fully resolved. */
1490 abort ();
1491
1492 if (type == UNKNOWN_BT)
1493 type = bidi_type_of_next_char (bidi_it);
1494
1495 if (type == NEUTRAL_B)
1496 return bidi_it->resolved_level;
1497
1498 level = bidi_it->level_stack[bidi_it->stack_idx].level;
1499 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
1500 || (type == WEAK_BN && prev_level == level))
1501 {
1502 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1503 abort ();
1504
1505 /* If the cached state shows a neutral character, it was not
1506 resolved by bidi_resolve_neutral, so do it now. */
1507 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1508 bidi_it->next_for_neutral.type,
1509 level);
1510 }
1511
1512 if (!(type == STRONG_R
1513 || type == STRONG_L
1514 || type == WEAK_BN
1515 || type == WEAK_EN
1516 || type == WEAK_AN))
1517 abort ();
1518 bidi_it->type = type;
1519 bidi_check_type (bidi_it->type);
1520
1521 /* For L1 below, we need to know, for each WS character, whether
1522 it belongs to a sequence of WS characters preceding a newline
1523 or a TAB or a paragraph separator. */
1524 if (bidi_it->orig_type == NEUTRAL_WS
1525 && bidi_it->next_for_ws.type == UNKNOWN_BT)
1526 {
1527 int ch;
1528 int clen = bidi_it->ch_len;
1529 EMACS_INT bpos = bidi_it->bytepos;
1530 EMACS_INT cpos = bidi_it->charpos;
1531 bidi_type_t chtype;
1532
1533 do {
1534 /*_fetch_multibyte_char_len = 1;*/
1535 ch = bpos + clen >= ZV_BYTE ? BIDI_EOB : FETCH_CHAR (bpos + clen);
1536 bpos += clen;
1537 cpos++;
1538 clen = (ch == BIDI_EOB ? 1 : CHAR_BYTES (ch));
1539 if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
1540 chtype = NEUTRAL_B;
1541 else
1542 chtype = bidi_get_type (ch, NEUTRAL_DIR);
1543 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
1544 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
1545 bidi_it->next_for_ws.type = chtype;
1546 bidi_check_type (bidi_it->next_for_ws.type);
1547 bidi_it->next_for_ws.charpos = cpos;
1548 bidi_it->next_for_ws.bytepos = bpos;
1549 }
1550
1551 /* Resolve implicit levels, with a twist: PDFs get the embedding
1552 level of the enbedding they terminate. See below for the
1553 reason. */
1554 if (bidi_it->orig_type == PDF
1555 /* Don't do this if this formatting code didn't change the
1556 embedding level due to invalid or empty embeddings. */
1557 && prev_level != level)
1558 {
1559 /* Don't look in UAX#9 for the reason for this: it's our own
1560 private quirk. The reason is that we want the formatting
1561 codes to be delivered so that they bracket the text of their
1562 embedding. For example, given the text
1563
1564 {RLO}teST{PDF}
1565
1566 we want it to be displayed as
1567
1568 {PDF}STet{RLO}
1569
1570 not as
1571
1572 STet{RLO}{PDF}
1573
1574 which will result because we bump up the embedding level as
1575 soon as we see the RLO and pop it as soon as we see the PDF,
1576 so RLO itself has the same embedding level as "teST", and
1577 thus would be normally delivered last, just before the PDF.
1578 The switch below fiddles with the level of PDF so that this
1579 ugly side effect does not happen.
1580
1581 (This is, of course, only important if the formatting codes
1582 are actually displayed, but Emacs does need to display them
1583 if the user wants to.) */
1584 level = prev_level;
1585 }
1586 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
1587 || bidi_it->orig_type == NEUTRAL_S
1588 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
1589 /* || bidi_it->ch == LINESEP_CHAR */
1590 || (bidi_it->orig_type == NEUTRAL_WS
1591 && (bidi_it->next_for_ws.type == NEUTRAL_B
1592 || bidi_it->next_for_ws.type == NEUTRAL_S)))
1593 level = bidi_it->level_stack[0].level;
1594 else if ((level & 1) == 0) /* I1 */
1595 {
1596 if (type == STRONG_R)
1597 level++;
1598 else if (type == WEAK_EN || type == WEAK_AN)
1599 level += 2;
1600 }
1601 else /* I2 */
1602 {
1603 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
1604 level++;
1605 }
1606
1607 bidi_it->resolved_level = level;
1608 return level;
1609 }
1610
1611 /* Move to the other edge of a level given by LEVEL. If END_FLAG is
1612 non-zero, we are at the end of a level, and we need to prepare to
1613 resume the scan of the lower level.
1614
1615 If this level's other edge is cached, we simply jump to it, filling
1616 the iterator structure with the iterator state on the other edge.
1617 Otherwise, we walk the buffer until we come back to the same level
1618 as LEVEL.
1619
1620 Note: we are not talking here about a ``level run'' in the UAX#9
1621 sense of the term, but rather about a ``level'' which includes
1622 all the levels higher than it. In other words, given the levels
1623 like this:
1624
1625 11111112222222333333334443343222222111111112223322111
1626 A B C
1627
1628 and assuming we are at point A scanning left to right, this
1629 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
1630 at point B. */
1631 static void
1632 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
1633 {
1634 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
1635 int idx;
1636
1637 /* Try the cache first. */
1638 if ((idx = bidi_cache_find_level_change (level, dir, end_flag)) >= 0)
1639 bidi_cache_fetch_state (idx, bidi_it);
1640 else
1641 {
1642 int new_level;
1643
1644 if (end_flag)
1645 abort (); /* if we are at end of level, its edges must be cached */
1646
1647 bidi_cache_iterator_state (bidi_it, 1);
1648 do {
1649 new_level = bidi_level_of_next_char (bidi_it);
1650 bidi_cache_iterator_state (bidi_it, 1);
1651 } while (new_level >= level);
1652 }
1653 }
1654
1655 void
1656 bidi_move_to_visually_next (struct bidi_it *bidi_it)
1657 {
1658 int old_level, new_level, next_level;
1659 struct bidi_it sentinel;
1660
1661 if (bidi_it->scan_dir == 0)
1662 {
1663 bidi_it->scan_dir = 1; /* default to logical order */
1664 }
1665
1666 /* If we just passed a newline, initialize for the next line. */
1667 if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
1668 bidi_line_init (bidi_it);
1669
1670 /* Prepare the sentinel iterator state, and cache it. When we bump
1671 into it, scanning backwards, we'll know that the last non-base
1672 level is exhausted. */
1673 if (bidi_cache_idx == 0)
1674 {
1675 bidi_copy_it (&sentinel, bidi_it);
1676 if (bidi_it->first_elt)
1677 {
1678 sentinel.charpos--; /* cached charpos needs to be monotonic */
1679 sentinel.bytepos--;
1680 sentinel.ch = '\n'; /* doesn't matter, but why not? */
1681 sentinel.ch_len = 1;
1682 }
1683 bidi_cache_iterator_state (&sentinel, 1);
1684 }
1685
1686 old_level = bidi_it->resolved_level;
1687 new_level = bidi_level_of_next_char (bidi_it);
1688
1689 /* Reordering of resolved levels (clause L2) is implemented by
1690 jumping to the other edge of the level and flipping direction of
1691 scanning the text whenever we find a level change. */
1692 if (new_level != old_level)
1693 {
1694 int ascending = new_level > old_level;
1695 int level_to_search = ascending ? old_level + 1 : old_level;
1696 int incr = ascending ? 1 : -1;
1697 int expected_next_level = old_level + incr;
1698
1699 /* Jump (or walk) to the other edge of this level. */
1700 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1701 /* Switch scan direction and peek at the next character in the
1702 new direction. */
1703 bidi_it->scan_dir = -bidi_it->scan_dir;
1704
1705 /* The following loop handles the case where the resolved level
1706 jumps by more than one. This is typical for numbers inside a
1707 run of text with left-to-right embedding direction, but can
1708 also happen in other situations. In those cases the decision
1709 where to continue after a level change, and in what direction,
1710 is tricky. For example, given a text like below:
1711
1712 abcdefgh
1713 11336622
1714
1715 (where the numbers below the text show the resolved levels),
1716 the result of reordering according to UAX#9 should be this:
1717
1718 efdcghba
1719
1720 This is implemented by the loop below which flips direction
1721 and jumps to the other edge of the level each time it finds
1722 the new level not to be the expected one. The expected level
1723 is always one more or one less than the previous one. */
1724 next_level = bidi_peek_at_next_level (bidi_it);
1725 while (next_level != expected_next_level)
1726 {
1727 expected_next_level += incr;
1728 level_to_search += incr;
1729 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1730 bidi_it->scan_dir = -bidi_it->scan_dir;
1731 next_level = bidi_peek_at_next_level (bidi_it);
1732 }
1733
1734 /* Finally, deliver the next character in the new direction. */
1735 next_level = bidi_level_of_next_char (bidi_it);
1736 }
1737
1738 /* Take note when we have just processed the newline that precedes
1739 the end of the paragraph. The next time we are about to be
1740 called, set_iterator_to_next will automatically reinit the
1741 paragraph direction, if needed. We do this at the newline before
1742 the paragraph separator, because the next character might not be
1743 the first character of the next paragraph, due to the bidi
1744 reordering, whereas we _must_ know the paragraph base direction
1745 _before_ we process the paragraph's text, since the base
1746 direction affects the reordering. */
1747 if (bidi_it->scan_dir == 1
1748 && bidi_it->orig_type == NEUTRAL_B
1749 && bidi_it->bytepos < ZV_BYTE)
1750 {
1751 EMACS_INT sep_len =
1752 bidi_at_paragraph_end (bidi_it->charpos + 1,
1753 bidi_it->bytepos + bidi_it->ch_len);
1754 if (sep_len >= 0)
1755 {
1756 bidi_it->new_paragraph = 1;
1757 /* Record the buffer position of the last character of the
1758 paragraph separator. */
1759 bidi_it->separator_limit = bidi_it->charpos + 1 + sep_len;
1760 }
1761 }
1762
1763 if (bidi_it->scan_dir == 1 && bidi_cache_idx)
1764 {
1765 /* If we are at paragraph's base embedding level and beyond the
1766 last cached position, the cache's job is done and we can
1767 discard it. */
1768 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
1769 && bidi_it->charpos > bidi_cache[bidi_cache_idx - 1].charpos)
1770 bidi_cache_reset ();
1771 /* But as long as we are caching during forward scan, we must
1772 cache each state, or else the cache integrity will be
1773 compromised: it assumes cached states correspond to buffer
1774 positions 1:1. */
1775 else
1776 bidi_cache_iterator_state (bidi_it, 1);
1777 }
1778 }
1779
1780 /* This is meant to be called from within the debugger, whenever you
1781 wish to examine the cache contents. */
1782 void
1783 bidi_dump_cached_states (void)
1784 {
1785 int i;
1786 int ndigits = 1;
1787
1788 if (bidi_cache_idx == 0)
1789 {
1790 fprintf (stderr, "The cache is empty.\n");
1791 return;
1792 }
1793 fprintf (stderr, "Total of %d state%s in cache:\n",
1794 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
1795
1796 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
1797 ndigits++;
1798 fputs ("ch ", stderr);
1799 for (i = 0; i < bidi_cache_idx; i++)
1800 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
1801 fputs ("\n", stderr);
1802 fputs ("lvl ", stderr);
1803 for (i = 0; i < bidi_cache_idx; i++)
1804 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
1805 fputs ("\n", stderr);
1806 fputs ("pos ", stderr);
1807 for (i = 0; i < bidi_cache_idx; i++)
1808 fprintf (stderr, "%*ld", ndigits, (long)bidi_cache[i].charpos);
1809 fputs ("\n", stderr);
1810 }