1 /* Low-level bidirectional buffer-scanning functions for GNU Emacs.
2 Copyright (C) 2000-2001, 2004-2005, 2009-2011
3 Free Software Foundation, Inc.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
20 /* Written by Eli Zaretskii <eliz@gnu.org>.
22 A sequential implementation of the Unicode Bidirectional algorithm,
23 as per UAX#9, a part of the Unicode Standard.
25 Unlike the reference and most other implementations, this one is
26 designed to be called once for every character in the buffer or
29 The main entry point is bidi_move_to_visually_next. Each time it
30 is called, it finds the next character in the visual order, and
31 returns its information in a special structure. The caller is then
32 expected to process this character for display or any other
33 purposes, and call bidi_move_to_visually_next for the next
34 character. See the comments in bidi_move_to_visually_next for more
35 details about its algorithm that finds the next visual-order
36 character by resolving their levels on the fly.
38 The two other entry points are bidi_paragraph_init and
39 bidi_mirror_char. The first determines the base direction of a
40 paragraph, while the second returns the mirrored version of its
43 If you want to understand the code, you will have to read it
44 together with the relevant portions of UAX#9. The comments include
45 references to UAX#9 rules, for that very reason.
47 A note about references to UAX#9 rules: if the reference says
48 something like "X9/Retaining", it means that you need to refer to
49 rule X9 and to its modifications decribed in the "Implementation
50 Notes" section of UAX#9, under "Retaining Format Codes". */
58 #include "character.h"
59 #include "dispextern.h"
61 static int bidi_initialized
= 0;
63 static Lisp_Object bidi_type_table
, bidi_mirror_table
;
65 #define LRM_CHAR 0x200E
66 #define RLM_CHAR 0x200F
69 /* Local data structures. (Look in dispextern.h for the rest.) */
71 /* What we need to know about the current paragraph. */
72 struct bidi_paragraph_info
{
73 EMACS_INT start_bytepos
; /* byte position where it begins */
74 EMACS_INT end_bytepos
; /* byte position where it ends */
75 int embedding_level
; /* its basic embedding level */
76 bidi_dir_t base_dir
; /* its base direction */
79 /* Data type for describing the bidirectional character categories. */
87 extern int bidi_ignore_explicit_marks_for_paragraph_level EXTERNALLY_VISIBLE
;
88 int bidi_ignore_explicit_marks_for_paragraph_level
= 1;
90 static Lisp_Object paragraph_start_re
, paragraph_separate_re
;
91 static Lisp_Object Qparagraph_start
, Qparagraph_separate
;
94 bidi_initialize (void)
98 #include "bidimirror.h"
102 bidi_type_table
= Fmake_char_table (Qnil
, make_number (STRONG_L
));
103 staticpro (&bidi_type_table
);
105 for (i
= 0; i
< sizeof bidi_type
/ sizeof bidi_type
[0]; i
++)
106 char_table_set_range (bidi_type_table
, bidi_type
[i
].from
, bidi_type
[i
].to
,
107 make_number (bidi_type
[i
].type
));
109 bidi_mirror_table
= Fmake_char_table (Qnil
, Qnil
);
110 staticpro (&bidi_mirror_table
);
112 for (i
= 0; i
< sizeof bidi_mirror
/ sizeof bidi_mirror
[0]; i
++)
113 char_table_set (bidi_mirror_table
, bidi_mirror
[i
].from
,
114 make_number (bidi_mirror
[i
].to
));
116 Qparagraph_start
= intern ("paragraph-start");
117 staticpro (&Qparagraph_start
);
118 paragraph_start_re
= Fsymbol_value (Qparagraph_start
);
119 if (!STRINGP (paragraph_start_re
))
120 paragraph_start_re
= build_string ("\f\\|[ \t]*$");
121 staticpro (¶graph_start_re
);
122 Qparagraph_separate
= intern ("paragraph-separate");
123 staticpro (&Qparagraph_separate
);
124 paragraph_separate_re
= Fsymbol_value (Qparagraph_separate
);
125 if (!STRINGP (paragraph_separate_re
))
126 paragraph_separate_re
= build_string ("[ \t\f]*$");
127 staticpro (¶graph_separate_re
);
128 bidi_initialized
= 1;
131 /* Return the bidi type of a character CH, subject to the current
132 directional OVERRIDE. */
133 static inline bidi_type_t
134 bidi_get_type (int ch
, bidi_dir_t override
)
136 bidi_type_t default_type
;
140 if (ch
< 0 || ch
> MAX_CHAR
)
143 default_type
= (bidi_type_t
) XINT (CHAR_TABLE_REF (bidi_type_table
, ch
));
145 if (override
== NEUTRAL_DIR
)
148 switch (default_type
)
150 /* Although UAX#9 does not tell, it doesn't make sense to
151 override NEUTRAL_B and LRM/RLM characters. */
166 if (override
== L2R
) /* X6 */
168 else if (override
== R2L
)
171 abort (); /* can't happen: handled above */
177 bidi_check_type (bidi_type_t type
)
179 if (type
< UNKNOWN_BT
|| type
> NEUTRAL_ON
)
183 /* Given a bidi TYPE of a character, return its category. */
184 static inline bidi_category_t
185 bidi_get_category (bidi_type_t type
)
199 case PDF
: /* ??? really?? */
218 /* Return the mirrored character of C, if it has one. If C has no
219 mirrored counterpart, return C.
220 Note: The conditions in UAX#9 clause L4 regarding the surrounding
221 context must be tested by the caller. */
223 bidi_mirror_char (int c
)
229 if (c
< 0 || c
> MAX_CHAR
)
232 val
= CHAR_TABLE_REF (bidi_mirror_table
, c
);
237 if (v
< 0 || v
> MAX_CHAR
)
246 /* Copy the bidi iterator from FROM to TO. To save cycles, this only
247 copies the part of the level stack that is actually in use. */
249 bidi_copy_it (struct bidi_it
*to
, struct bidi_it
*from
)
253 /* Copy everything except the level stack and beyond. */
254 memcpy (to
, from
, offsetof (struct bidi_it
, level_stack
[0]));
256 /* Copy the active part of the level stack. */
257 to
->level_stack
[0] = from
->level_stack
[0]; /* level zero is always in use */
258 for (i
= 1; i
<= from
->stack_idx
; i
++)
259 to
->level_stack
[i
] = from
->level_stack
[i
];
262 /* Caching the bidi iterator states. */
264 #define BIDI_CACHE_CHUNK 200
265 static struct bidi_it
*bidi_cache
;
266 static ptrdiff_t bidi_cache_size
= 0;
267 enum { elsz
= sizeof (struct bidi_it
) };
268 static ptrdiff_t bidi_cache_idx
; /* next unused cache slot */
269 static ptrdiff_t bidi_cache_last_idx
; /* slot of last cache hit */
272 bidi_cache_reset (void)
275 bidi_cache_last_idx
= -1;
279 bidi_cache_shrink (void)
281 if (bidi_cache_size
> BIDI_CACHE_CHUNK
)
283 bidi_cache_size
= BIDI_CACHE_CHUNK
;
285 (struct bidi_it
*) xrealloc (bidi_cache
, bidi_cache_size
* elsz
);
291 bidi_cache_fetch_state (ptrdiff_t idx
, struct bidi_it
*bidi_it
)
293 int current_scan_dir
= bidi_it
->scan_dir
;
295 if (idx
< 0 || idx
>= bidi_cache_idx
)
298 bidi_copy_it (bidi_it
, &bidi_cache
[idx
]);
299 bidi_it
->scan_dir
= current_scan_dir
;
300 bidi_cache_last_idx
= idx
;
303 /* Find a cached state with a given CHARPOS and resolved embedding
304 level less or equal to LEVEL. if LEVEL is -1, disregard the
305 resolved levels in cached states. DIR, if non-zero, means search
306 in that direction from the last cache hit. */
307 static inline ptrdiff_t
308 bidi_cache_search (EMACS_INT charpos
, int level
, int dir
)
310 ptrdiff_t i
, i_start
;
314 if (charpos
< bidi_cache
[bidi_cache_last_idx
].charpos
)
317 i_start
= bidi_cache_last_idx
- 1;
319 else if (charpos
> (bidi_cache
[bidi_cache_last_idx
].charpos
320 + bidi_cache
[bidi_cache_last_idx
].nchars
- 1))
323 i_start
= bidi_cache_last_idx
+ 1;
326 i_start
= bidi_cache_last_idx
;
330 i_start
= bidi_cache_idx
- 1;
335 /* Linear search for now; FIXME! */
336 for (i
= i_start
; i
>= 0; i
--)
337 if (bidi_cache
[i
].charpos
<= charpos
338 && charpos
< bidi_cache
[i
].charpos
+ bidi_cache
[i
].nchars
339 && (level
== -1 || bidi_cache
[i
].resolved_level
<= level
))
344 for (i
= i_start
; i
< bidi_cache_idx
; i
++)
345 if (bidi_cache
[i
].charpos
<= charpos
346 && charpos
< bidi_cache
[i
].charpos
+ bidi_cache
[i
].nchars
347 && (level
== -1 || bidi_cache
[i
].resolved_level
<= level
))
355 /* Find a cached state where the resolved level changes to a value
356 that is lower than LEVEL, and return its cache slot index. DIR is
357 the direction to search, starting with the last used cache slot.
358 BEFORE, if non-zero, means return the index of the slot that is
359 ``before'' the level change in the search direction. That is,
360 given the cached levels like this:
365 and assuming we are at the position cached at the slot marked with
366 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
367 index of slot B or A, depending whether BEFORE is, respectively,
370 bidi_cache_find_level_change (int level
, int dir
, int before
)
374 ptrdiff_t i
= dir
? bidi_cache_last_idx
: bidi_cache_idx
- 1;
375 int incr
= before
? 1 : 0;
386 if (bidi_cache
[i
- incr
].resolved_level
>= 0
387 && bidi_cache
[i
- incr
].resolved_level
< level
)
394 while (i
< bidi_cache_idx
- incr
)
396 if (bidi_cache
[i
+ incr
].resolved_level
>= 0
397 && bidi_cache
[i
+ incr
].resolved_level
< level
)
408 bidi_cache_iterator_state (struct bidi_it
*bidi_it
, int resolved
)
412 /* We should never cache on backward scans. */
413 if (bidi_it
->scan_dir
== -1)
415 idx
= bidi_cache_search (bidi_it
->charpos
, -1, 1);
419 idx
= bidi_cache_idx
;
420 /* Enlarge the cache as needed. */
421 if (idx
>= bidi_cache_size
)
423 if (min (PTRDIFF_MAX
, SIZE_MAX
) / elsz
- BIDI_CACHE_CHUNK
425 memory_full (SIZE_MAX
);
426 bidi_cache_size
+= BIDI_CACHE_CHUNK
;
428 (struct bidi_it
*) xrealloc (bidi_cache
, bidi_cache_size
* elsz
);
430 /* Character positions should correspond to cache positions 1:1.
431 If we are outside the range of cached positions, the cache is
432 useless and must be reset. */
434 (bidi_it
->charpos
> (bidi_cache
[idx
- 1].charpos
435 + bidi_cache
[idx
- 1].nchars
)
436 || bidi_it
->charpos
< bidi_cache
[0].charpos
))
441 if (bidi_it
->nchars
<= 0)
443 bidi_copy_it (&bidi_cache
[idx
], bidi_it
);
445 bidi_cache
[idx
].resolved_level
= -1;
449 /* Copy only the members which could have changed, to avoid
450 costly copying of the entire struct. */
451 bidi_cache
[idx
].type
= bidi_it
->type
;
452 bidi_check_type (bidi_it
->type
);
453 bidi_cache
[idx
].type_after_w1
= bidi_it
->type_after_w1
;
454 bidi_check_type (bidi_it
->type_after_w1
);
456 bidi_cache
[idx
].resolved_level
= bidi_it
->resolved_level
;
458 bidi_cache
[idx
].resolved_level
= -1;
459 bidi_cache
[idx
].invalid_levels
= bidi_it
->invalid_levels
;
460 bidi_cache
[idx
].invalid_rl_levels
= bidi_it
->invalid_rl_levels
;
461 bidi_cache
[idx
].next_for_neutral
= bidi_it
->next_for_neutral
;
462 bidi_cache
[idx
].next_for_ws
= bidi_it
->next_for_ws
;
463 bidi_cache
[idx
].ignore_bn_limit
= bidi_it
->ignore_bn_limit
;
466 bidi_cache_last_idx
= idx
;
467 if (idx
>= bidi_cache_idx
)
468 bidi_cache_idx
= idx
+ 1;
471 static inline bidi_type_t
472 bidi_cache_find (EMACS_INT charpos
, int level
, struct bidi_it
*bidi_it
)
474 ptrdiff_t i
= bidi_cache_search (charpos
, level
, bidi_it
->scan_dir
);
478 bidi_dir_t current_scan_dir
= bidi_it
->scan_dir
;
480 bidi_copy_it (bidi_it
, &bidi_cache
[i
]);
481 bidi_cache_last_idx
= i
;
482 /* Don't let scan direction from from the cached state override
483 the current scan direction. */
484 bidi_it
->scan_dir
= current_scan_dir
;
485 return bidi_it
->type
;
492 bidi_peek_at_next_level (struct bidi_it
*bidi_it
)
494 if (bidi_cache_idx
== 0 || bidi_cache_last_idx
== -1)
496 return bidi_cache
[bidi_cache_last_idx
+ bidi_it
->scan_dir
].resolved_level
;
499 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
500 Value is the non-negative length of the paragraph separator
501 following the buffer position, -1 if position is at the beginning
502 of a new paragraph, or -2 if position is neither at beginning nor
503 at end of a paragraph. */
505 bidi_at_paragraph_end (EMACS_INT charpos
, EMACS_INT bytepos
)
508 Lisp_Object start_re
;
511 sep_re
= paragraph_separate_re
;
512 start_re
= paragraph_start_re
;
514 val
= fast_looking_at (sep_re
, charpos
, bytepos
, ZV
, ZV_BYTE
, Qnil
);
517 if (fast_looking_at (start_re
, charpos
, bytepos
, ZV
, ZV_BYTE
, Qnil
) >= 0)
526 /* Determine the start-of-run (sor) directional type given the two
527 embedding levels on either side of the run boundary. Also, update
528 the saved info about previously seen characters, since that info is
529 generally valid for a single level run. */
531 bidi_set_sor_type (struct bidi_it
*bidi_it
, int level_before
, int level_after
)
533 int higher_level
= level_before
> level_after
? level_before
: level_after
;
535 /* The prev_was_pdf gork is required for when we have several PDFs
536 in a row. In that case, we want to compute the sor type for the
537 next level run only once: when we see the first PDF. That's
538 because the sor type depends only on the higher of the two levels
539 that we find on the two sides of the level boundary (see UAX#9,
540 clause X10), and so we don't need to know the final embedding
541 level to which we descend after processing all the PDFs. */
542 if (!bidi_it
->prev_was_pdf
|| level_before
< level_after
)
543 /* FIXME: should the default sor direction be user selectable? */
544 bidi_it
->sor
= (higher_level
& 1) != 0 ? R2L
: L2R
;
545 if (level_before
> level_after
)
546 bidi_it
->prev_was_pdf
= 1;
548 bidi_it
->prev
.type
= UNKNOWN_BT
;
549 bidi_it
->last_strong
.type
= bidi_it
->last_strong
.type_after_w1
=
550 bidi_it
->last_strong
.orig_type
= UNKNOWN_BT
;
551 bidi_it
->prev_for_neutral
.type
= bidi_it
->sor
== R2L
? STRONG_R
: STRONG_L
;
552 bidi_it
->prev_for_neutral
.charpos
= bidi_it
->charpos
;
553 bidi_it
->prev_for_neutral
.bytepos
= bidi_it
->bytepos
;
554 bidi_it
->next_for_neutral
.type
= bidi_it
->next_for_neutral
.type_after_w1
=
555 bidi_it
->next_for_neutral
.orig_type
= UNKNOWN_BT
;
556 bidi_it
->ignore_bn_limit
= 0; /* meaning it's unknown */
559 /* Perform initializations for reordering a new line of bidi text. */
561 bidi_line_init (struct bidi_it
*bidi_it
)
563 bidi_it
->scan_dir
= 1; /* FIXME: do we need to have control on this? */
564 bidi_it
->resolved_level
= bidi_it
->level_stack
[0].level
;
565 bidi_it
->level_stack
[0].override
= NEUTRAL_DIR
; /* X1 */
566 bidi_it
->invalid_levels
= 0;
567 bidi_it
->invalid_rl_levels
= -1;
568 bidi_it
->next_en_pos
= -1;
569 bidi_it
->next_for_ws
.type
= UNKNOWN_BT
;
570 bidi_set_sor_type (bidi_it
,
571 bidi_it
->paragraph_dir
== R2L
? 1 : 0,
572 bidi_it
->level_stack
[0].level
); /* X10 */
577 /* Fetch and return the character at BYTEPOS/CHARPOS. If that
578 character is covered by a display string, treat the entire run of
579 covered characters as a single character u+FFFC, and return their
580 combined length in CH_LEN and NCHARS. DISP_POS specifies the
581 character position of the next display string, or -1 if not yet
582 computed. When the next character is at or beyond that position,
583 the function updates DISP_POS with the position of the next display
586 bidi_fetch_char (EMACS_INT bytepos
, EMACS_INT charpos
, EMACS_INT
*disp_pos
,
587 int frame_window_p
, EMACS_INT
*ch_len
, EMACS_INT
*nchars
)
591 /* FIXME: Support strings in addition to buffers. */
592 /* If we got past the last known position of display string, compute
593 the position of the next one. That position could be at BYTEPOS. */
594 if (charpos
< ZV
&& charpos
> *disp_pos
)
595 *disp_pos
= compute_display_string_pos (charpos
, frame_window_p
);
597 /* Fetch the character at BYTEPOS. */
598 if (bytepos
>= ZV_BYTE
)
605 else if (charpos
>= *disp_pos
)
607 EMACS_INT disp_end_pos
;
609 /* We don't expect to find ourselves in the middle of a display
610 property. Hopefully, it will never be needed. */
611 if (charpos
> *disp_pos
)
613 /* Return the Unicode Object Replacement Character to represent
614 the entire run of characters covered by the display
617 disp_end_pos
= compute_display_string_end (*disp_pos
);
618 *nchars
= disp_end_pos
- *disp_pos
;
619 *ch_len
= CHAR_TO_BYTE (disp_end_pos
) - bytepos
;
623 ch
= FETCH_MULTIBYTE_CHAR (bytepos
);
625 *ch_len
= CHAR_BYTES (ch
);
628 /* If we just entered a run of characters covered by a display
629 string, compute the position of the next display string. */
630 if (charpos
+ *nchars
<= ZV
&& charpos
+ *nchars
> *disp_pos
)
631 *disp_pos
= compute_display_string_pos (charpos
+ *nchars
, frame_window_p
);
636 /* Find the beginning of this paragraph by looking back in the buffer.
637 Value is the byte position of the paragraph's beginning. */
639 bidi_find_paragraph_start (EMACS_INT pos
, EMACS_INT pos_byte
)
641 Lisp_Object re
= paragraph_start_re
;
642 EMACS_INT limit
= ZV
, limit_byte
= ZV_BYTE
;
644 while (pos_byte
> BEGV_BYTE
645 && fast_looking_at (re
, pos
, pos_byte
, limit
, limit_byte
, Qnil
) < 0)
647 /* FIXME: What if the paragraph beginning is covered by a
648 display string? And what if a display string covering some
649 of the text over which we scan back includes
650 paragraph_start_re? */
651 pos
= find_next_newline_no_quit (pos
- 1, -1);
652 pos_byte
= CHAR_TO_BYTE (pos
);
657 /* Determine the base direction, a.k.a. base embedding level, of the
658 paragraph we are about to iterate through. If DIR is either L2R or
659 R2L, just use that. Otherwise, determine the paragraph direction
660 from the first strong directional character of the paragraph.
662 NO_DEFAULT_P non-zero means don't default to L2R if the paragraph
663 has no strong directional characters and both DIR and
664 bidi_it->paragraph_dir are NEUTRAL_DIR. In that case, search back
665 in the buffer until a paragraph is found with a strong character,
666 or until hitting BEGV. In the latter case, fall back to L2R. This
667 flag is used in current-bidi-paragraph-direction.
669 Note that this function gives the paragraph separator the same
670 direction as the preceding paragraph, even though Emacs generally
671 views the separartor as not belonging to any paragraph. */
673 bidi_paragraph_init (bidi_dir_t dir
, struct bidi_it
*bidi_it
, int no_default_p
)
675 EMACS_INT bytepos
= bidi_it
->bytepos
;
676 EMACS_INT pstartbyte
;
678 /* Special case for an empty buffer. */
679 if (bytepos
== BEGV_BYTE
&& bytepos
== ZV_BYTE
)
681 /* We should never be called at EOB or before BEGV. */
682 else if (bytepos
>= ZV_BYTE
|| bytepos
< BEGV_BYTE
)
687 bidi_it
->paragraph_dir
= L2R
;
688 bidi_it
->new_paragraph
= 0;
692 bidi_it
->paragraph_dir
= R2L
;
693 bidi_it
->new_paragraph
= 0;
695 else if (dir
== NEUTRAL_DIR
) /* P2 */
698 EMACS_INT ch_len
, nchars
;
699 EMACS_INT pos
, disp_pos
= -1;
702 if (!bidi_initialized
)
705 /* If we are inside a paragraph separator, we are just waiting
706 for the separator to be exhausted; use the previous paragraph
707 direction. But don't do that if we have been just reseated,
708 because we need to reinitialize below in that case. */
709 if (!bidi_it
->first_elt
710 && bidi_it
->charpos
< bidi_it
->separator_limit
)
713 /* If we are on a newline, get past it to where the next
714 paragraph might start. But don't do that at BEGV since then
715 we are potentially in a new paragraph that doesn't yet
717 pos
= bidi_it
->charpos
;
718 if (bytepos
> BEGV_BYTE
&& FETCH_CHAR (bytepos
) == '\n')
724 /* We are either at the beginning of a paragraph or in the
725 middle of it. Find where this paragraph starts. */
726 pstartbyte
= bidi_find_paragraph_start (pos
, bytepos
);
727 bidi_it
->separator_limit
= -1;
728 bidi_it
->new_paragraph
= 0;
730 /* The following loop is run more than once only if NO_DEFAULT_P
733 bytepos
= pstartbyte
;
734 pos
= BYTE_TO_CHAR (bytepos
);
735 ch
= bidi_fetch_char (bytepos
, pos
, &disp_pos
, bidi_it
->frame_window_p
,
737 type
= bidi_get_type (ch
, NEUTRAL_DIR
);
739 for (pos
+= nchars
, bytepos
+= ch_len
;
740 /* NOTE: UAX#9 says to search only for L, AL, or R types
741 of characters, and ignore RLE, RLO, LRE, and LRO.
742 However, I'm not sure it makes sense to omit those 4;
743 should try with and without that to see the effect. */
744 (bidi_get_category (type
) != STRONG
)
745 || (bidi_ignore_explicit_marks_for_paragraph_level
746 && (type
== RLE
|| type
== RLO
747 || type
== LRE
|| type
== LRO
));
748 type
= bidi_get_type (ch
, NEUTRAL_DIR
))
750 if (bytepos
>= ZV_BYTE
)
752 /* Pretend there's a paragraph separator at end of
757 if (type
== NEUTRAL_B
&& bidi_at_paragraph_end (pos
, bytepos
) >= -1)
759 /* Fetch next character and advance to get past it. */
760 ch
= bidi_fetch_char (bytepos
, pos
, &disp_pos
,
761 bidi_it
->frame_window_p
, &ch_len
, &nchars
);
765 if (type
== STRONG_R
|| type
== STRONG_AL
) /* P3 */
766 bidi_it
->paragraph_dir
= R2L
;
767 else if (type
== STRONG_L
)
768 bidi_it
->paragraph_dir
= L2R
;
769 if (no_default_p
&& bidi_it
->paragraph_dir
== NEUTRAL_DIR
)
771 /* If this paragraph is at BEGV, default to L2R. */
772 if (pstartbyte
== BEGV_BYTE
)
773 bidi_it
->paragraph_dir
= L2R
; /* P3 and HL1 */
776 EMACS_INT prevpbyte
= pstartbyte
;
777 EMACS_INT p
= BYTE_TO_CHAR (pstartbyte
), pbyte
= pstartbyte
;
779 /* Find the beginning of the previous paragraph, if any. */
780 while (pbyte
> BEGV_BYTE
&& prevpbyte
>= pstartbyte
)
782 /* FXIME: What if p is covered by a display
783 string? See also a FIXME inside
784 bidi_find_paragraph_start. */
786 pbyte
= CHAR_TO_BYTE (p
);
787 prevpbyte
= bidi_find_paragraph_start (p
, pbyte
);
789 pstartbyte
= prevpbyte
;
792 } while (no_default_p
&& bidi_it
->paragraph_dir
== NEUTRAL_DIR
);
797 /* Contrary to UAX#9 clause P3, we only default the paragraph
798 direction to L2R if we have no previous usable paragraph
799 direction. This is allowed by the HL1 clause. */
800 if (bidi_it
->paragraph_dir
!= L2R
&& bidi_it
->paragraph_dir
!= R2L
)
801 bidi_it
->paragraph_dir
= L2R
; /* P3 and HL1 ``higher-level protocols'' */
802 if (bidi_it
->paragraph_dir
== R2L
)
803 bidi_it
->level_stack
[0].level
= 1;
805 bidi_it
->level_stack
[0].level
= 0;
807 bidi_line_init (bidi_it
);
810 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
813 bidi_set_paragraph_end (struct bidi_it
*bidi_it
)
815 bidi_it
->invalid_levels
= 0;
816 bidi_it
->invalid_rl_levels
= -1;
817 bidi_it
->stack_idx
= 0;
818 bidi_it
->resolved_level
= bidi_it
->level_stack
[0].level
;
821 /* Initialize the bidi iterator from buffer/string position CHARPOS. */
823 bidi_init_it (EMACS_INT charpos
, EMACS_INT bytepos
, int frame_window_p
,
824 struct bidi_it
*bidi_it
)
826 if (! bidi_initialized
)
828 bidi_it
->charpos
= charpos
;
829 bidi_it
->bytepos
= bytepos
;
830 bidi_it
->frame_window_p
= frame_window_p
;
831 bidi_it
->nchars
= -1; /* to be computed in bidi_resolve_explicit_1 */
832 bidi_it
->first_elt
= 1;
833 bidi_set_paragraph_end (bidi_it
);
834 bidi_it
->new_paragraph
= 1;
835 bidi_it
->separator_limit
= -1;
836 bidi_it
->type
= NEUTRAL_B
;
837 bidi_it
->type_after_w1
= NEUTRAL_B
;
838 bidi_it
->orig_type
= NEUTRAL_B
;
839 bidi_it
->prev_was_pdf
= 0;
840 bidi_it
->prev
.type
= bidi_it
->prev
.type_after_w1
=
841 bidi_it
->prev
.orig_type
= UNKNOWN_BT
;
842 bidi_it
->last_strong
.type
= bidi_it
->last_strong
.type_after_w1
=
843 bidi_it
->last_strong
.orig_type
= UNKNOWN_BT
;
844 bidi_it
->next_for_neutral
.charpos
= -1;
845 bidi_it
->next_for_neutral
.type
=
846 bidi_it
->next_for_neutral
.type_after_w1
=
847 bidi_it
->next_for_neutral
.orig_type
= UNKNOWN_BT
;
848 bidi_it
->prev_for_neutral
.charpos
= -1;
849 bidi_it
->prev_for_neutral
.type
=
850 bidi_it
->prev_for_neutral
.type_after_w1
=
851 bidi_it
->prev_for_neutral
.orig_type
= UNKNOWN_BT
;
852 bidi_it
->sor
= L2R
; /* FIXME: should it be user-selectable? */
853 bidi_it
->disp_pos
= -1; /* invalid/unknown */
854 bidi_cache_shrink ();
857 /* Push the current embedding level and override status; reset the
858 current level to LEVEL and the current override status to OVERRIDE. */
860 bidi_push_embedding_level (struct bidi_it
*bidi_it
,
861 int level
, bidi_dir_t override
)
863 bidi_it
->stack_idx
++;
864 if (bidi_it
->stack_idx
>= BIDI_MAXLEVEL
)
866 bidi_it
->level_stack
[bidi_it
->stack_idx
].level
= level
;
867 bidi_it
->level_stack
[bidi_it
->stack_idx
].override
= override
;
870 /* Pop the embedding level and directional override status from the
871 stack, and return the new level. */
873 bidi_pop_embedding_level (struct bidi_it
*bidi_it
)
875 /* UAX#9 says to ignore invalid PDFs. */
876 if (bidi_it
->stack_idx
> 0)
877 bidi_it
->stack_idx
--;
878 return bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
881 /* Record in SAVED_INFO the information about the current character. */
883 bidi_remember_char (struct bidi_saved_info
*saved_info
,
884 struct bidi_it
*bidi_it
)
886 saved_info
->charpos
= bidi_it
->charpos
;
887 saved_info
->bytepos
= bidi_it
->bytepos
;
888 saved_info
->type
= bidi_it
->type
;
889 bidi_check_type (bidi_it
->type
);
890 saved_info
->type_after_w1
= bidi_it
->type_after_w1
;
891 bidi_check_type (bidi_it
->type_after_w1
);
892 saved_info
->orig_type
= bidi_it
->orig_type
;
893 bidi_check_type (bidi_it
->orig_type
);
896 /* Resolve the type of a neutral character according to the type of
897 surrounding strong text and the current embedding level. */
898 static inline bidi_type_t
899 bidi_resolve_neutral_1 (bidi_type_t prev_type
, bidi_type_t next_type
, int lev
)
901 /* N1: European and Arabic numbers are treated as though they were R. */
902 if (next_type
== WEAK_EN
|| next_type
== WEAK_AN
)
903 next_type
= STRONG_R
;
904 if (prev_type
== WEAK_EN
|| prev_type
== WEAK_AN
)
905 prev_type
= STRONG_R
;
907 if (next_type
== prev_type
) /* N1 */
909 else if ((lev
& 1) == 0) /* N2 */
916 bidi_explicit_dir_char (int ch
)
920 if (!bidi_initialized
)
922 ch_type
= (bidi_type_t
) XINT (CHAR_TABLE_REF (bidi_type_table
, ch
));
923 return (ch_type
== LRE
|| ch_type
== LRO
924 || ch_type
== RLE
|| ch_type
== RLO
928 /* A helper function for bidi_resolve_explicit. It advances to the
929 next character in logical order and determines the new embedding
930 level and directional override, but does not take into account
933 bidi_resolve_explicit_1 (struct bidi_it
*bidi_it
)
941 /* If reseat()'ed, don't advance, so as to start iteration from the
942 position where we were reseated. bidi_it->bytepos can be less
943 than BEGV_BYTE after reseat to BEGV. */
944 if (bidi_it
->bytepos
< BEGV_BYTE
945 || bidi_it
->first_elt
)
947 bidi_it
->first_elt
= 0;
948 if (bidi_it
->charpos
< BEGV
)
949 bidi_it
->charpos
= BEGV
;
950 bidi_it
->bytepos
= CHAR_TO_BYTE (bidi_it
->charpos
);
952 else if (bidi_it
->bytepos
< ZV_BYTE
) /* don't move at ZV */
954 /* Advance to the next character, skipping characters covered by
955 display strings (nchars > 1). */
956 if (bidi_it
->nchars
<= 0)
958 bidi_it
->charpos
+= bidi_it
->nchars
;
959 if (bidi_it
->ch_len
== 0)
961 bidi_it
->bytepos
+= bidi_it
->ch_len
;
964 current_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
; /* X1 */
965 override
= bidi_it
->level_stack
[bidi_it
->stack_idx
].override
;
966 new_level
= current_level
;
968 if (bidi_it
->bytepos
>= ZV_BYTE
)
973 bidi_it
->disp_pos
= ZV
;
977 /* Fetch the character at BYTEPOS. If it is covered by a
978 display string, treat the entire run of covered characters as
979 a single character u+FFFC. */
980 curchar
= bidi_fetch_char (bidi_it
->bytepos
, bidi_it
->charpos
,
981 &bidi_it
->disp_pos
, bidi_it
->frame_window_p
,
982 &bidi_it
->ch_len
, &bidi_it
->nchars
);
984 bidi_it
->ch
= curchar
;
986 /* Don't apply directional override here, as all the types we handle
987 below will not be affected by the override anyway, and we need
988 the original type unaltered. The override will be applied in
989 bidi_resolve_weak. */
990 type
= bidi_get_type (curchar
, NEUTRAL_DIR
);
991 bidi_it
->orig_type
= type
;
992 bidi_check_type (bidi_it
->orig_type
);
995 bidi_it
->prev_was_pdf
= 0;
997 bidi_it
->type_after_w1
= UNKNOWN_BT
;
1003 bidi_it
->type_after_w1
= type
;
1004 bidi_check_type (bidi_it
->type_after_w1
);
1005 type
= WEAK_BN
; /* X9/Retaining */
1006 if (bidi_it
->ignore_bn_limit
<= 0)
1008 if (current_level
<= BIDI_MAXLEVEL
- 4)
1010 /* Compute the least odd embedding level greater than
1011 the current level. */
1012 new_level
= ((current_level
+ 1) & ~1) + 1;
1013 if (bidi_it
->type_after_w1
== RLE
)
1014 override
= NEUTRAL_DIR
;
1017 if (current_level
== BIDI_MAXLEVEL
- 4)
1018 bidi_it
->invalid_rl_levels
= 0;
1019 bidi_push_embedding_level (bidi_it
, new_level
, override
);
1023 bidi_it
->invalid_levels
++;
1024 /* See the commentary about invalid_rl_levels below. */
1025 if (bidi_it
->invalid_rl_levels
< 0)
1026 bidi_it
->invalid_rl_levels
= 0;
1027 bidi_it
->invalid_rl_levels
++;
1030 else if (bidi_it
->prev
.type_after_w1
== WEAK_EN
/* W5/Retaining */
1031 || bidi_it
->next_en_pos
> bidi_it
->charpos
)
1036 bidi_it
->type_after_w1
= type
;
1037 bidi_check_type (bidi_it
->type_after_w1
);
1038 type
= WEAK_BN
; /* X9/Retaining */
1039 if (bidi_it
->ignore_bn_limit
<= 0)
1041 if (current_level
<= BIDI_MAXLEVEL
- 5)
1043 /* Compute the least even embedding level greater than
1044 the current level. */
1045 new_level
= ((current_level
+ 2) & ~1);
1046 if (bidi_it
->type_after_w1
== LRE
)
1047 override
= NEUTRAL_DIR
;
1050 bidi_push_embedding_level (bidi_it
, new_level
, override
);
1054 bidi_it
->invalid_levels
++;
1055 /* invalid_rl_levels counts invalid levels encountered
1056 while the embedding level was already too high for
1057 LRE/LRO, but not for RLE/RLO. That is because
1058 there may be exactly one PDF which we should not
1059 ignore even though invalid_levels is non-zero.
1060 invalid_rl_levels helps to know what PDF is
1062 if (bidi_it
->invalid_rl_levels
>= 0)
1063 bidi_it
->invalid_rl_levels
++;
1066 else if (bidi_it
->prev
.type_after_w1
== WEAK_EN
/* W5/Retaining */
1067 || bidi_it
->next_en_pos
> bidi_it
->charpos
)
1071 bidi_it
->type_after_w1
= type
;
1072 bidi_check_type (bidi_it
->type_after_w1
);
1073 type
= WEAK_BN
; /* X9/Retaining */
1074 if (bidi_it
->ignore_bn_limit
<= 0)
1076 if (!bidi_it
->invalid_rl_levels
)
1078 new_level
= bidi_pop_embedding_level (bidi_it
);
1079 bidi_it
->invalid_rl_levels
= -1;
1080 if (bidi_it
->invalid_levels
)
1081 bidi_it
->invalid_levels
--;
1082 /* else nothing: UAX#9 says to ignore invalid PDFs */
1084 if (!bidi_it
->invalid_levels
)
1085 new_level
= bidi_pop_embedding_level (bidi_it
);
1088 bidi_it
->invalid_levels
--;
1089 bidi_it
->invalid_rl_levels
--;
1092 else if (bidi_it
->prev
.type_after_w1
== WEAK_EN
/* W5/Retaining */
1093 || bidi_it
->next_en_pos
> bidi_it
->charpos
)
1101 bidi_it
->type
= type
;
1102 bidi_check_type (bidi_it
->type
);
1107 /* Given an iterator state in BIDI_IT, advance one character position
1108 in the buffer/string to the next character (in the logical order),
1109 resolve any explicit embeddings and directional overrides, and
1110 return the embedding level of the character after resolving
1111 explicit directives and ignoring empty embeddings. */
1113 bidi_resolve_explicit (struct bidi_it
*bidi_it
)
1115 int prev_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
1116 int new_level
= bidi_resolve_explicit_1 (bidi_it
);
1118 if (prev_level
< new_level
1119 && bidi_it
->type
== WEAK_BN
1120 && bidi_it
->ignore_bn_limit
== 0 /* only if not already known */
1121 && bidi_it
->bytepos
< ZV_BYTE
/* not already at EOB */
1122 && bidi_explicit_dir_char (FETCH_MULTIBYTE_CHAR (bidi_it
->bytepos
1123 + bidi_it
->ch_len
)))
1125 /* Avoid pushing and popping embedding levels if the level run
1126 is empty, as this breaks level runs where it shouldn't.
1127 UAX#9 removes all the explicit embedding and override codes,
1128 so empty embeddings disappear without a trace. We need to
1129 behave as if we did the same. */
1130 struct bidi_it saved_it
;
1131 int level
= prev_level
;
1133 bidi_copy_it (&saved_it
, bidi_it
);
1135 while (bidi_explicit_dir_char (FETCH_MULTIBYTE_CHAR (bidi_it
->bytepos
1136 + bidi_it
->ch_len
)))
1138 /* This advances to the next character, skipping any
1139 characters covered by display strings. */
1140 level
= bidi_resolve_explicit_1 (bidi_it
);
1143 if (bidi_it
->nchars
<= 0)
1145 if (level
== prev_level
) /* empty embedding */
1146 saved_it
.ignore_bn_limit
= bidi_it
->charpos
+ bidi_it
->nchars
;
1147 else /* this embedding is non-empty */
1148 saved_it
.ignore_bn_limit
= -1;
1150 bidi_copy_it (bidi_it
, &saved_it
);
1151 if (bidi_it
->ignore_bn_limit
> 0)
1153 /* We pushed a level, but we shouldn't have. Undo that. */
1154 if (!bidi_it
->invalid_rl_levels
)
1156 new_level
= bidi_pop_embedding_level (bidi_it
);
1157 bidi_it
->invalid_rl_levels
= -1;
1158 if (bidi_it
->invalid_levels
)
1159 bidi_it
->invalid_levels
--;
1161 if (!bidi_it
->invalid_levels
)
1162 new_level
= bidi_pop_embedding_level (bidi_it
);
1165 bidi_it
->invalid_levels
--;
1166 bidi_it
->invalid_rl_levels
--;
1171 if (bidi_it
->type
== NEUTRAL_B
) /* X8 */
1173 bidi_set_paragraph_end (bidi_it
);
1174 /* This is needed by bidi_resolve_weak below, and in L1. */
1175 bidi_it
->type_after_w1
= bidi_it
->type
;
1176 bidi_check_type (bidi_it
->type_after_w1
);
1182 /* Advance in the buffer/string, resolve weak types and return the
1183 type of the next character after weak type resolution. */
1185 bidi_resolve_weak (struct bidi_it
*bidi_it
)
1188 bidi_dir_t override
;
1189 int prev_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
1190 int new_level
= bidi_resolve_explicit (bidi_it
);
1192 bidi_type_t type_of_next
;
1193 struct bidi_it saved_it
;
1195 type
= bidi_it
->type
;
1196 override
= bidi_it
->level_stack
[bidi_it
->stack_idx
].override
;
1198 if (type
== UNKNOWN_BT
1206 if (new_level
!= prev_level
1207 || bidi_it
->type
== NEUTRAL_B
)
1209 /* We've got a new embedding level run, compute the directional
1210 type of sor and initialize per-run variables (UAX#9, clause
1212 bidi_set_sor_type (bidi_it
, prev_level
, new_level
);
1214 else if (type
== NEUTRAL_S
|| type
== NEUTRAL_WS
1215 || type
== WEAK_BN
|| type
== STRONG_AL
)
1216 bidi_it
->type_after_w1
= type
; /* needed in L1 */
1217 bidi_check_type (bidi_it
->type_after_w1
);
1219 /* Level and directional override status are already recorded in
1220 bidi_it, and do not need any change; see X6. */
1221 if (override
== R2L
) /* X6 */
1223 else if (override
== L2R
)
1227 if (type
== WEAK_NSM
) /* W1 */
1229 /* Note that we don't need to consider the case where the
1230 prev character has its type overridden by an RLO or LRO,
1231 because then either the type of this NSM would have been
1232 also overridden, or the previous character is outside the
1233 current level run, and thus not relevant to this NSM.
1234 This is why NSM gets the type_after_w1 of the previous
1236 if (bidi_it
->prev
.type_after_w1
!= UNKNOWN_BT
1237 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1238 && bidi_it
->prev
.type_after_w1
!= NEUTRAL_B
)
1239 type
= bidi_it
->prev
.type_after_w1
;
1240 else if (bidi_it
->sor
== R2L
)
1242 else if (bidi_it
->sor
== L2R
)
1244 else /* shouldn't happen! */
1247 if (type
== WEAK_EN
/* W2 */
1248 && bidi_it
->last_strong
.type_after_w1
== STRONG_AL
)
1250 else if (type
== STRONG_AL
) /* W3 */
1252 else if ((type
== WEAK_ES
/* W4 */
1253 && bidi_it
->prev
.type_after_w1
== WEAK_EN
1254 && bidi_it
->prev
.orig_type
== WEAK_EN
)
1256 && ((bidi_it
->prev
.type_after_w1
== WEAK_EN
1257 && bidi_it
->prev
.orig_type
== WEAK_EN
)
1258 || bidi_it
->prev
.type_after_w1
== WEAK_AN
)))
1261 bidi_it
->bytepos
+ bidi_it
->ch_len
>= ZV_BYTE
1262 ? BIDI_EOB
: FETCH_MULTIBYTE_CHAR (bidi_it
->bytepos
1264 type_of_next
= bidi_get_type (next_char
, override
);
1266 if (type_of_next
== WEAK_BN
1267 || bidi_explicit_dir_char (next_char
))
1269 bidi_copy_it (&saved_it
, bidi_it
);
1270 while (bidi_resolve_explicit (bidi_it
) == new_level
1271 && bidi_it
->type
== WEAK_BN
)
1273 type_of_next
= bidi_it
->type
;
1274 bidi_copy_it (bidi_it
, &saved_it
);
1277 /* If the next character is EN, but the last strong-type
1278 character is AL, that next EN will be changed to AN when
1279 we process it in W2 above. So in that case, this ES
1280 should not be changed into EN. */
1282 && type_of_next
== WEAK_EN
1283 && bidi_it
->last_strong
.type_after_w1
!= STRONG_AL
)
1285 else if (type
== WEAK_CS
)
1287 if (bidi_it
->prev
.type_after_w1
== WEAK_AN
1288 && (type_of_next
== WEAK_AN
1289 /* If the next character is EN, but the last
1290 strong-type character is AL, EN will be later
1291 changed to AN when we process it in W2 above.
1292 So in that case, this ES should not be
1294 || (type_of_next
== WEAK_EN
1295 && bidi_it
->last_strong
.type_after_w1
== STRONG_AL
)))
1297 else if (bidi_it
->prev
.type_after_w1
== WEAK_EN
1298 && type_of_next
== WEAK_EN
1299 && bidi_it
->last_strong
.type_after_w1
!= STRONG_AL
)
1303 else if (type
== WEAK_ET
/* W5: ET with EN before or after it */
1304 || type
== WEAK_BN
) /* W5/Retaining */
1306 if (bidi_it
->prev
.type_after_w1
== WEAK_EN
/* ET/BN w/EN before it */
1307 || bidi_it
->next_en_pos
> bidi_it
->charpos
)
1309 else /* W5: ET/BN with EN after it. */
1311 EMACS_INT en_pos
= bidi_it
->charpos
+ bidi_it
->nchars
;
1313 if (bidi_it
->nchars
<= 0)
1316 bidi_it
->bytepos
+ bidi_it
->ch_len
>= ZV_BYTE
1317 ? BIDI_EOB
: FETCH_MULTIBYTE_CHAR (bidi_it
->bytepos
1319 type_of_next
= bidi_get_type (next_char
, override
);
1321 if (type_of_next
== WEAK_ET
1322 || type_of_next
== WEAK_BN
1323 || bidi_explicit_dir_char (next_char
))
1325 bidi_copy_it (&saved_it
, bidi_it
);
1326 while (bidi_resolve_explicit (bidi_it
) == new_level
1327 && (bidi_it
->type
== WEAK_BN
1328 || bidi_it
->type
== WEAK_ET
))
1330 type_of_next
= bidi_it
->type
;
1331 en_pos
= bidi_it
->charpos
;
1332 bidi_copy_it (bidi_it
, &saved_it
);
1334 if (type_of_next
== WEAK_EN
)
1336 /* If the last strong character is AL, the EN we've
1337 found will become AN when we get to it (W2). */
1338 if (bidi_it
->last_strong
.type_after_w1
!= STRONG_AL
)
1341 /* Remember this EN position, to speed up processing
1343 bidi_it
->next_en_pos
= en_pos
;
1345 else if (type
== WEAK_BN
)
1346 type
= NEUTRAL_ON
; /* W6/Retaining */
1352 if (type
== WEAK_ES
|| type
== WEAK_ET
|| type
== WEAK_CS
/* W6 */
1354 && (bidi_it
->prev
.type_after_w1
== WEAK_CS
/* W6/Retaining */
1355 || bidi_it
->prev
.type_after_w1
== WEAK_ES
1356 || bidi_it
->prev
.type_after_w1
== WEAK_ET
)))
1359 /* Store the type we've got so far, before we clobber it with strong
1360 types in W7 and while resolving neutral types. But leave alone
1361 the original types that were recorded above, because we will need
1362 them for the L1 clause. */
1363 if (bidi_it
->type_after_w1
== UNKNOWN_BT
)
1364 bidi_it
->type_after_w1
= type
;
1365 bidi_check_type (bidi_it
->type_after_w1
);
1367 if (type
== WEAK_EN
) /* W7 */
1369 if ((bidi_it
->last_strong
.type_after_w1
== STRONG_L
)
1370 || (bidi_it
->last_strong
.type
== UNKNOWN_BT
&& bidi_it
->sor
== L2R
))
1374 bidi_it
->type
= type
;
1375 bidi_check_type (bidi_it
->type
);
1380 bidi_resolve_neutral (struct bidi_it
*bidi_it
)
1382 int prev_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
1383 bidi_type_t type
= bidi_resolve_weak (bidi_it
);
1384 int current_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
1386 if (!(type
== STRONG_R
1391 || type
== NEUTRAL_B
1392 || type
== NEUTRAL_S
1393 || type
== NEUTRAL_WS
1394 || type
== NEUTRAL_ON
))
1397 if (bidi_get_category (type
) == NEUTRAL
1398 || (type
== WEAK_BN
&& prev_level
== current_level
))
1400 if (bidi_it
->next_for_neutral
.type
!= UNKNOWN_BT
)
1401 type
= bidi_resolve_neutral_1 (bidi_it
->prev_for_neutral
.type
,
1402 bidi_it
->next_for_neutral
.type
,
1406 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1407 the assumption of batch-style processing; see clauses W4,
1408 W5, and especially N1, which require to look far forward
1409 (as well as back) in the buffer/string. May the fleas of
1410 a thousand camels infest the armpits of those who design
1411 supposedly general-purpose algorithms by looking at their
1412 own implementations, and fail to consider other possible
1414 struct bidi_it saved_it
;
1415 bidi_type_t next_type
;
1417 if (bidi_it
->scan_dir
== -1)
1420 bidi_copy_it (&saved_it
, bidi_it
);
1421 /* Scan the text forward until we find the first non-neutral
1422 character, and then use that to resolve the neutral we
1423 are dealing with now. We also cache the scanned iterator
1424 states, to salvage some of the effort later. */
1425 bidi_cache_iterator_state (bidi_it
, 0);
1427 /* Record the info about the previous character, so that
1428 it will be cached below with this state. */
1429 if (bidi_it
->type_after_w1
!= WEAK_BN
/* W1/Retaining */
1430 && bidi_it
->type
!= WEAK_BN
)
1431 bidi_remember_char (&bidi_it
->prev
, bidi_it
);
1432 type
= bidi_resolve_weak (bidi_it
);
1433 /* Paragraph separators have their levels fully resolved
1434 at this point, so cache them as resolved. */
1435 bidi_cache_iterator_state (bidi_it
, type
== NEUTRAL_B
);
1436 /* FIXME: implement L1 here, by testing for a newline and
1437 resetting the level for any sequence of whitespace
1438 characters adjacent to it. */
1439 } while (!(type
== NEUTRAL_B
1441 && bidi_get_category (type
) != NEUTRAL
)
1442 /* This is all per level run, so stop when we
1443 reach the end of this level run. */
1444 || bidi_it
->level_stack
[bidi_it
->stack_idx
].level
!=
1447 bidi_remember_char (&saved_it
.next_for_neutral
, bidi_it
);
1458 /* N1: ``European and Arabic numbers are treated as
1459 though they were R.'' */
1460 next_type
= STRONG_R
;
1461 saved_it
.next_for_neutral
.type
= STRONG_R
;
1464 if (!bidi_explicit_dir_char (bidi_it
->ch
))
1465 abort (); /* can't happen: BNs are skipped */
1468 /* Marched all the way to the end of this level run.
1469 We need to use the eor type, whose information is
1470 stored by bidi_set_sor_type in the prev_for_neutral
1472 if (saved_it
.type
!= WEAK_BN
1473 || bidi_get_category (bidi_it
->prev
.type_after_w1
) == NEUTRAL
)
1475 next_type
= bidi_it
->prev_for_neutral
.type
;
1476 saved_it
.next_for_neutral
.type
= next_type
;
1477 bidi_check_type (next_type
);
1481 /* This is a BN which does not adjoin neutrals.
1482 Leave its type alone. */
1483 bidi_copy_it (bidi_it
, &saved_it
);
1484 return bidi_it
->type
;
1490 type
= bidi_resolve_neutral_1 (saved_it
.prev_for_neutral
.type
,
1491 next_type
, current_level
);
1492 saved_it
.type
= type
;
1493 bidi_check_type (type
);
1494 bidi_copy_it (bidi_it
, &saved_it
);
1500 /* Given an iterator state in BIDI_IT, advance one character position
1501 in the buffer/string to the next character (in the logical order),
1502 resolve the bidi type of that next character, and return that
1505 bidi_type_of_next_char (struct bidi_it
*bidi_it
)
1509 /* This should always be called during a forward scan. */
1510 if (bidi_it
->scan_dir
!= 1)
1513 /* Reset the limit until which to ignore BNs if we step out of the
1514 area where we found only empty levels. */
1515 if ((bidi_it
->ignore_bn_limit
> 0
1516 && bidi_it
->ignore_bn_limit
<= bidi_it
->charpos
)
1517 || (bidi_it
->ignore_bn_limit
== -1
1518 && !bidi_explicit_dir_char (bidi_it
->ch
)))
1519 bidi_it
->ignore_bn_limit
= 0;
1521 type
= bidi_resolve_neutral (bidi_it
);
1526 /* Given an iterator state BIDI_IT, advance one character position in
1527 the buffer/string to the next character (in the current scan
1528 direction), resolve the embedding and implicit levels of that next
1529 character, and return the resulting level. */
1531 bidi_level_of_next_char (struct bidi_it
*bidi_it
)
1534 int level
, prev_level
= -1;
1535 struct bidi_saved_info next_for_neutral
;
1536 EMACS_INT next_char_pos
;
1538 if (bidi_it
->scan_dir
== 1)
1540 /* There's no sense in trying to advance if we hit end of text. */
1541 if (bidi_it
->bytepos
>= ZV_BYTE
)
1542 return bidi_it
->resolved_level
;
1544 /* Record the info about the previous character. */
1545 if (bidi_it
->type_after_w1
!= WEAK_BN
/* W1/Retaining */
1546 && bidi_it
->type
!= WEAK_BN
)
1547 bidi_remember_char (&bidi_it
->prev
, bidi_it
);
1548 if (bidi_it
->type_after_w1
== STRONG_R
1549 || bidi_it
->type_after_w1
== STRONG_L
1550 || bidi_it
->type_after_w1
== STRONG_AL
)
1551 bidi_remember_char (&bidi_it
->last_strong
, bidi_it
);
1552 /* FIXME: it sounds like we don't need both prev and
1553 prev_for_neutral members, but I'm leaving them both for now. */
1554 if (bidi_it
->type
== STRONG_R
|| bidi_it
->type
== STRONG_L
1555 || bidi_it
->type
== WEAK_EN
|| bidi_it
->type
== WEAK_AN
)
1556 bidi_remember_char (&bidi_it
->prev_for_neutral
, bidi_it
);
1558 /* If we overstepped the characters used for resolving neutrals
1559 and whitespace, invalidate their info in the iterator. */
1560 if (bidi_it
->charpos
>= bidi_it
->next_for_neutral
.charpos
)
1561 bidi_it
->next_for_neutral
.type
= UNKNOWN_BT
;
1562 if (bidi_it
->next_en_pos
>= 0
1563 && bidi_it
->charpos
>= bidi_it
->next_en_pos
)
1564 bidi_it
->next_en_pos
= -1;
1565 if (bidi_it
->next_for_ws
.type
!= UNKNOWN_BT
1566 && bidi_it
->charpos
>= bidi_it
->next_for_ws
.charpos
)
1567 bidi_it
->next_for_ws
.type
= UNKNOWN_BT
;
1569 /* This must be taken before we fill the iterator with the info
1570 about the next char. If we scan backwards, the iterator
1571 state must be already cached, so there's no need to know the
1572 embedding level of the previous character, since we will be
1573 returning to our caller shortly. */
1574 prev_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
1576 next_for_neutral
= bidi_it
->next_for_neutral
;
1578 /* Perhaps the character we want is already cached. If it is, the
1579 call to bidi_cache_find below will return a type other than
1581 if (bidi_cache_idx
&& !bidi_it
->first_elt
)
1583 if (bidi_it
->scan_dir
> 0)
1585 if (bidi_it
->nchars
<= 0)
1587 next_char_pos
= bidi_it
->charpos
+ bidi_it
->nchars
;
1590 next_char_pos
= bidi_it
->charpos
- 1;
1591 type
= bidi_cache_find (next_char_pos
, -1, bidi_it
);
1595 if (type
!= UNKNOWN_BT
)
1597 /* Don't lose the information for resolving neutrals! The
1598 cached states could have been cached before their
1599 next_for_neutral member was computed. If we are on our way
1600 forward, we can simply take the info from the previous
1602 if (bidi_it
->scan_dir
== 1
1603 && bidi_it
->next_for_neutral
.type
== UNKNOWN_BT
)
1604 bidi_it
->next_for_neutral
= next_for_neutral
;
1606 /* If resolved_level is -1, it means this state was cached
1607 before it was completely resolved, so we cannot return
1609 if (bidi_it
->resolved_level
!= -1)
1610 return bidi_it
->resolved_level
;
1612 if (bidi_it
->scan_dir
== -1)
1613 /* If we are going backwards, the iterator state is already cached
1614 from previous scans, and should be fully resolved. */
1617 if (type
== UNKNOWN_BT
)
1618 type
= bidi_type_of_next_char (bidi_it
);
1620 if (type
== NEUTRAL_B
)
1621 return bidi_it
->resolved_level
;
1623 level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
1624 if ((bidi_get_category (type
) == NEUTRAL
/* && type != NEUTRAL_B */)
1625 || (type
== WEAK_BN
&& prev_level
== level
))
1627 if (bidi_it
->next_for_neutral
.type
== UNKNOWN_BT
)
1630 /* If the cached state shows a neutral character, it was not
1631 resolved by bidi_resolve_neutral, so do it now. */
1632 type
= bidi_resolve_neutral_1 (bidi_it
->prev_for_neutral
.type
,
1633 bidi_it
->next_for_neutral
.type
,
1637 if (!(type
== STRONG_R
1641 || type
== WEAK_AN
))
1643 bidi_it
->type
= type
;
1644 bidi_check_type (bidi_it
->type
);
1646 /* For L1 below, we need to know, for each WS character, whether
1647 it belongs to a sequence of WS characters preceding a newline
1648 or a TAB or a paragraph separator. */
1649 if (bidi_it
->orig_type
== NEUTRAL_WS
1650 && bidi_it
->next_for_ws
.type
== UNKNOWN_BT
)
1653 EMACS_INT clen
= bidi_it
->ch_len
;
1654 EMACS_INT bpos
= bidi_it
->bytepos
;
1655 EMACS_INT cpos
= bidi_it
->charpos
;
1656 EMACS_INT disp_pos
= bidi_it
->disp_pos
;
1657 EMACS_INT nc
= bidi_it
->nchars
;
1659 int fwp
= bidi_it
->frame_window_p
;
1661 if (bidi_it
->nchars
<= 0)
1664 ch
= bidi_fetch_char (bpos
+= clen
, cpos
+= nc
, &disp_pos
, fwp
,
1666 if (ch
== '\n' || ch
== BIDI_EOB
/* || ch == LINESEP_CHAR */)
1669 chtype
= bidi_get_type (ch
, NEUTRAL_DIR
);
1670 } while (chtype
== NEUTRAL_WS
|| chtype
== WEAK_BN
1671 || bidi_explicit_dir_char (ch
)); /* L1/Retaining */
1672 bidi_it
->next_for_ws
.type
= chtype
;
1673 bidi_check_type (bidi_it
->next_for_ws
.type
);
1674 bidi_it
->next_for_ws
.charpos
= cpos
;
1675 bidi_it
->next_for_ws
.bytepos
= bpos
;
1678 /* Resolve implicit levels, with a twist: PDFs get the embedding
1679 level of the enbedding they terminate. See below for the
1681 if (bidi_it
->orig_type
== PDF
1682 /* Don't do this if this formatting code didn't change the
1683 embedding level due to invalid or empty embeddings. */
1684 && prev_level
!= level
)
1686 /* Don't look in UAX#9 for the reason for this: it's our own
1687 private quirk. The reason is that we want the formatting
1688 codes to be delivered so that they bracket the text of their
1689 embedding. For example, given the text
1693 we want it to be displayed as
1701 which will result because we bump up the embedding level as
1702 soon as we see the RLO and pop it as soon as we see the PDF,
1703 so RLO itself has the same embedding level as "teST", and
1704 thus would be normally delivered last, just before the PDF.
1705 The switch below fiddles with the level of PDF so that this
1706 ugly side effect does not happen.
1708 (This is, of course, only important if the formatting codes
1709 are actually displayed, but Emacs does need to display them
1710 if the user wants to.) */
1713 else if (bidi_it
->orig_type
== NEUTRAL_B
/* L1 */
1714 || bidi_it
->orig_type
== NEUTRAL_S
1715 || bidi_it
->ch
== '\n' || bidi_it
->ch
== BIDI_EOB
1716 /* || bidi_it->ch == LINESEP_CHAR */
1717 || (bidi_it
->orig_type
== NEUTRAL_WS
1718 && (bidi_it
->next_for_ws
.type
== NEUTRAL_B
1719 || bidi_it
->next_for_ws
.type
== NEUTRAL_S
)))
1720 level
= bidi_it
->level_stack
[0].level
;
1721 else if ((level
& 1) == 0) /* I1 */
1723 if (type
== STRONG_R
)
1725 else if (type
== WEAK_EN
|| type
== WEAK_AN
)
1730 if (type
== STRONG_L
|| type
== WEAK_EN
|| type
== WEAK_AN
)
1734 bidi_it
->resolved_level
= level
;
1738 /* Move to the other edge of a level given by LEVEL. If END_FLAG is
1739 non-zero, we are at the end of a level, and we need to prepare to
1740 resume the scan of the lower level.
1742 If this level's other edge is cached, we simply jump to it, filling
1743 the iterator structure with the iterator state on the other edge.
1744 Otherwise, we walk the buffer or string until we come back to the
1745 same level as LEVEL.
1747 Note: we are not talking here about a ``level run'' in the UAX#9
1748 sense of the term, but rather about a ``level'' which includes
1749 all the levels higher than it. In other words, given the levels
1752 11111112222222333333334443343222222111111112223322111
1755 and assuming we are at point A scanning left to right, this
1756 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
1759 bidi_find_other_level_edge (struct bidi_it
*bidi_it
, int level
, int end_flag
)
1761 int dir
= end_flag
? -bidi_it
->scan_dir
: bidi_it
->scan_dir
;
1764 /* Try the cache first. */
1765 if ((idx
= bidi_cache_find_level_change (level
, dir
, end_flag
)) >= 0)
1766 bidi_cache_fetch_state (idx
, bidi_it
);
1772 abort (); /* if we are at end of level, its edges must be cached */
1774 bidi_cache_iterator_state (bidi_it
, 1);
1776 new_level
= bidi_level_of_next_char (bidi_it
);
1777 bidi_cache_iterator_state (bidi_it
, 1);
1778 } while (new_level
>= level
);
1783 bidi_move_to_visually_next (struct bidi_it
*bidi_it
)
1785 int old_level
, new_level
, next_level
;
1786 struct bidi_it sentinel
;
1788 if (bidi_it
->scan_dir
== 0)
1790 bidi_it
->scan_dir
= 1; /* default to logical order */
1793 /* If we just passed a newline, initialize for the next line. */
1794 if (!bidi_it
->first_elt
&& bidi_it
->orig_type
== NEUTRAL_B
)
1795 bidi_line_init (bidi_it
);
1797 /* Prepare the sentinel iterator state, and cache it. When we bump
1798 into it, scanning backwards, we'll know that the last non-base
1799 level is exhausted. */
1800 if (bidi_cache_idx
== 0)
1802 bidi_copy_it (&sentinel
, bidi_it
);
1803 if (bidi_it
->first_elt
)
1805 sentinel
.charpos
--; /* cached charpos needs to be monotonic */
1807 sentinel
.ch
= '\n'; /* doesn't matter, but why not? */
1808 sentinel
.ch_len
= 1;
1809 sentinel
.nchars
= 1;
1811 bidi_cache_iterator_state (&sentinel
, 1);
1814 old_level
= bidi_it
->resolved_level
;
1815 new_level
= bidi_level_of_next_char (bidi_it
);
1817 /* Reordering of resolved levels (clause L2) is implemented by
1818 jumping to the other edge of the level and flipping direction of
1819 scanning the text whenever we find a level change. */
1820 if (new_level
!= old_level
)
1822 int ascending
= new_level
> old_level
;
1823 int level_to_search
= ascending
? old_level
+ 1 : old_level
;
1824 int incr
= ascending
? 1 : -1;
1825 int expected_next_level
= old_level
+ incr
;
1827 /* Jump (or walk) to the other edge of this level. */
1828 bidi_find_other_level_edge (bidi_it
, level_to_search
, !ascending
);
1829 /* Switch scan direction and peek at the next character in the
1831 bidi_it
->scan_dir
= -bidi_it
->scan_dir
;
1833 /* The following loop handles the case where the resolved level
1834 jumps by more than one. This is typical for numbers inside a
1835 run of text with left-to-right embedding direction, but can
1836 also happen in other situations. In those cases the decision
1837 where to continue after a level change, and in what direction,
1838 is tricky. For example, given a text like below:
1843 (where the numbers below the text show the resolved levels),
1844 the result of reordering according to UAX#9 should be this:
1848 This is implemented by the loop below which flips direction
1849 and jumps to the other edge of the level each time it finds
1850 the new level not to be the expected one. The expected level
1851 is always one more or one less than the previous one. */
1852 next_level
= bidi_peek_at_next_level (bidi_it
);
1853 while (next_level
!= expected_next_level
)
1855 expected_next_level
+= incr
;
1856 level_to_search
+= incr
;
1857 bidi_find_other_level_edge (bidi_it
, level_to_search
, !ascending
);
1858 bidi_it
->scan_dir
= -bidi_it
->scan_dir
;
1859 next_level
= bidi_peek_at_next_level (bidi_it
);
1862 /* Finally, deliver the next character in the new direction. */
1863 next_level
= bidi_level_of_next_char (bidi_it
);
1866 /* Take note when we have just processed the newline that precedes
1867 the end of the paragraph. The next time we are about to be
1868 called, set_iterator_to_next will automatically reinit the
1869 paragraph direction, if needed. We do this at the newline before
1870 the paragraph separator, because the next character might not be
1871 the first character of the next paragraph, due to the bidi
1872 reordering, whereas we _must_ know the paragraph base direction
1873 _before_ we process the paragraph's text, since the base
1874 direction affects the reordering. */
1875 if (bidi_it
->scan_dir
== 1
1876 && bidi_it
->orig_type
== NEUTRAL_B
1877 && bidi_it
->bytepos
< ZV_BYTE
)
1880 bidi_at_paragraph_end (bidi_it
->charpos
+ bidi_it
->nchars
,
1881 bidi_it
->bytepos
+ bidi_it
->ch_len
);
1882 if (bidi_it
->nchars
<= 0)
1886 bidi_it
->new_paragraph
= 1;
1887 /* Record the buffer position of the last character of the
1888 paragraph separator. */
1889 bidi_it
->separator_limit
=
1890 bidi_it
->charpos
+ bidi_it
->nchars
+ sep_len
;
1894 if (bidi_it
->scan_dir
== 1 && bidi_cache_idx
)
1896 /* If we are at paragraph's base embedding level and beyond the
1897 last cached position, the cache's job is done and we can
1899 if (bidi_it
->resolved_level
== bidi_it
->level_stack
[0].level
1900 && bidi_it
->charpos
> (bidi_cache
[bidi_cache_idx
- 1].charpos
1901 + bidi_cache
[bidi_cache_idx
- 1].nchars
- 1))
1902 bidi_cache_reset ();
1903 /* But as long as we are caching during forward scan, we must
1904 cache each state, or else the cache integrity will be
1905 compromised: it assumes cached states correspond to buffer
1908 bidi_cache_iterator_state (bidi_it
, 1);
1912 /* This is meant to be called from within the debugger, whenever you
1913 wish to examine the cache contents. */
1914 void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE
;
1916 bidi_dump_cached_states (void)
1921 if (bidi_cache_idx
== 0)
1923 fprintf (stderr
, "The cache is empty.\n");
1926 fprintf (stderr
, "Total of %d state%s in cache:\n",
1927 bidi_cache_idx
, bidi_cache_idx
== 1 ? "" : "s");
1929 for (i
= bidi_cache
[bidi_cache_idx
- 1].charpos
; i
> 0; i
/= 10)
1931 fputs ("ch ", stderr
);
1932 for (i
= 0; i
< bidi_cache_idx
; i
++)
1933 fprintf (stderr
, "%*c", ndigits
, bidi_cache
[i
].ch
);
1934 fputs ("\n", stderr
);
1935 fputs ("lvl ", stderr
);
1936 for (i
= 0; i
< bidi_cache_idx
; i
++)
1937 fprintf (stderr
, "%*d", ndigits
, bidi_cache
[i
].resolved_level
);
1938 fputs ("\n", stderr
);
1939 fputs ("pos ", stderr
);
1940 for (i
= 0; i
< bidi_cache_idx
; i
++)
1941 fprintf (stderr
, "%*"pI
"d", ndigits
, bidi_cache
[i
].charpos
);
1942 fputs ("\n", stderr
);