X-Git-Url: https://git.hcoop.net/bpt/emacs.git/blobdiff_plain/57f97249c8e8673df19219e1f5cba478ea23024d..ad6042bb6f905a38c1c5b7a77981894355496e5c:/src/bidi.c diff --git a/src/bidi.c b/src/bidi.c index de189f0cb1..c83ee54992 100644 --- a/src/bidi.c +++ b/src/bidi.c @@ -1,4 +1,4 @@ -/* Low-level bidirectional buffer-scanning functions for GNU Emacs. +/* Low-level bidirectional buffer/string-scanning functions for GNU Emacs. Copyright (C) 2000-2001, 2004-2005, 2009-2011 Free Software Foundation, Inc. @@ -20,7 +20,7 @@ along with GNU Emacs. If not, see . */ /* Written by Eli Zaretskii . A sequential implementation of the Unicode Bidirectional algorithm, - as per UAX#9, a part of the Unicode Standard. + (UBA) as per UAX#9, a part of the Unicode Standard. Unlike the reference and most other implementations, this one is designed to be called once for every character in the buffer or @@ -35,11 +35,16 @@ along with GNU Emacs. If not, see . */ details about its algorithm that finds the next visual-order character by resolving their levels on the fly. - The two other entry points are bidi_paragraph_init and + Two other entry points are bidi_paragraph_init and bidi_mirror_char. The first determines the base direction of a paragraph, while the second returns the mirrored version of its argument character. + A few auxiliary entry points are used to initialize the bidi + iterator for iterating an object (buffer or string), push and pop + the bidi iterator state, and save and restore the state of the bidi + cache. + If you want to understand the code, you will have to read it together with the relevant portions of UAX#9. The comments include references to UAX#9 rules, for that very reason. @@ -66,16 +71,6 @@ static Lisp_Object bidi_type_table, bidi_mirror_table; #define RLM_CHAR 0x200F #define BIDI_EOB -1 -/* Local data structures. (Look in dispextern.h for the rest.) */ - -/* What we need to know about the current paragraph. 
*/ -struct bidi_paragraph_info { - EMACS_INT start_bytepos; /* byte position where it begins */ - EMACS_INT end_bytepos; /* byte position where it ends */ - int embedding_level; /* its basic embedding level */ - bidi_dir_t base_dir; /* its base direction */ -}; - /* Data type for describing the bidirectional character categories. */ typedef enum { UNKNOWN_BC, @@ -90,47 +85,14 @@ int bidi_ignore_explicit_marks_for_paragraph_level = 1; static Lisp_Object paragraph_start_re, paragraph_separate_re; static Lisp_Object Qparagraph_start, Qparagraph_separate; -static void -bidi_initialize (void) -{ - -#include "biditype.h" -#include "bidimirror.h" - - int i; - - bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L)); - staticpro (&bidi_type_table); - - for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++) - char_table_set_range (bidi_type_table, bidi_type[i].from, bidi_type[i].to, - make_number (bidi_type[i].type)); - - bidi_mirror_table = Fmake_char_table (Qnil, Qnil); - staticpro (&bidi_mirror_table); - - for (i = 0; i < sizeof bidi_mirror / sizeof bidi_mirror[0]; i++) - char_table_set (bidi_mirror_table, bidi_mirror[i].from, - make_number (bidi_mirror[i].to)); - - Qparagraph_start = intern ("paragraph-start"); - staticpro (&Qparagraph_start); - paragraph_start_re = Fsymbol_value (Qparagraph_start); - if (!STRINGP (paragraph_start_re)) - paragraph_start_re = build_string ("\f\\|[ \t]*$"); - staticpro (&paragraph_start_re); - Qparagraph_separate = intern ("paragraph-separate"); - staticpro (&Qparagraph_separate); - paragraph_separate_re = Fsymbol_value (Qparagraph_separate); - if (!STRINGP (paragraph_separate_re)) - paragraph_separate_re = build_string ("[ \t\f]*$"); - staticpro (&paragraph_separate_re); - bidi_initialized = 1; -} + +/*********************************************************************** + Utilities + ***********************************************************************/ /* Return the bidi type of a character CH, subject to the current 
directional OVERRIDE. */ -static INLINE bidi_type_t +static inline bidi_type_t bidi_get_type (int ch, bidi_dir_t override) { bidi_type_t default_type; @@ -181,7 +143,7 @@ bidi_check_type (bidi_type_t type) } /* Given a bidi TYPE of a character, return its category. */ -static INLINE bidi_category_t +static inline bidi_category_t bidi_get_category (bidi_type_t type) { switch (type) @@ -243,9 +205,80 @@ bidi_mirror_char (int c) return c; } +/* Determine the start-of-run (sor) directional type given the two + embedding levels on either side of the run boundary. Also, update + the saved info about previously seen characters, since that info is + generally valid for a single level run. */ +static inline void +bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after) +{ + int higher_level = level_before > level_after ? level_before : level_after; + + /* The prev_was_pdf gork is required for when we have several PDFs + in a row. In that case, we want to compute the sor type for the + next level run only once: when we see the first PDF. That's + because the sor type depends only on the higher of the two levels + that we find on the two sides of the level boundary (see UAX#9, + clause X10), and so we don't need to know the final embedding + level to which we descend after processing all the PDFs. */ + if (!bidi_it->prev_was_pdf || level_before < level_after) + /* FIXME: should the default sor direction be user selectable? */ + bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R; + if (level_before > level_after) + bidi_it->prev_was_pdf = 1; + + bidi_it->prev.type = UNKNOWN_BT; + bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 = + bidi_it->last_strong.orig_type = UNKNOWN_BT; + bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? 
STRONG_R : STRONG_L; + bidi_it->prev_for_neutral.charpos = bidi_it->charpos; + bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos; + bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 = + bidi_it->next_for_neutral.orig_type = UNKNOWN_BT; + bidi_it->ignore_bn_limit = -1; /* meaning it's unknown */ +} + +/* Push the current embedding level and override status; reset the + current level to LEVEL and the current override status to OVERRIDE. */ +static inline void +bidi_push_embedding_level (struct bidi_it *bidi_it, + int level, bidi_dir_t override) +{ + bidi_it->stack_idx++; + xassert (bidi_it->stack_idx < BIDI_MAXLEVEL); + bidi_it->level_stack[bidi_it->stack_idx].level = level; + bidi_it->level_stack[bidi_it->stack_idx].override = override; +} + +/* Pop the embedding level and directional override status from the + stack, and return the new level. */ +static inline int +bidi_pop_embedding_level (struct bidi_it *bidi_it) +{ + /* UAX#9 says to ignore invalid PDFs. */ + if (bidi_it->stack_idx > 0) + bidi_it->stack_idx--; + return bidi_it->level_stack[bidi_it->stack_idx].level; +} + +/* Record in SAVED_INFO the information about the current character. */ +static inline void +bidi_remember_char (struct bidi_saved_info *saved_info, + struct bidi_it *bidi_it) +{ + saved_info->charpos = bidi_it->charpos; + saved_info->bytepos = bidi_it->bytepos; + saved_info->type = bidi_it->type; + bidi_check_type (bidi_it->type); + saved_info->type_after_w1 = bidi_it->type_after_w1; + bidi_check_type (bidi_it->type_after_w1); + saved_info->orig_type = bidi_it->orig_type; + bidi_check_type (bidi_it->orig_type); +} + /* Copy the bidi iterator from FROM to TO. To save cycles, this only copies the part of the level stack that is actually in use. 
*/ -static INLINE void +static inline void bidi_copy_it (struct bidi_it *to, struct bidi_it *from) { int i; @@ -259,23 +292,38 @@ bidi_copy_it (struct bidi_it *to, struct bidi_it *from) to->level_stack[i] = from->level_stack[i]; } -/* Caching the bidi iterator states. */ + +/*********************************************************************** + Caching the bidi iterator states + ***********************************************************************/ #define BIDI_CACHE_CHUNK 200 static struct bidi_it *bidi_cache; -static size_t bidi_cache_size = 0; -static size_t elsz = sizeof (struct bidi_it); -static int bidi_cache_idx; /* next unused cache slot */ -static int bidi_cache_last_idx; /* slot of last cache hit */ - -static INLINE void +static EMACS_INT bidi_cache_size = 0; +enum { elsz = sizeof (struct bidi_it) }; +static EMACS_INT bidi_cache_idx; /* next unused cache slot */ +static EMACS_INT bidi_cache_last_idx; /* slot of last cache hit */ +static EMACS_INT bidi_cache_start = 0; /* start of cache for this + "stack" level */ + +/* Reset the cache state to the empty state. We only reset the part + of the cache relevant to iteration of the current object. Previous + objects, which are pushed on the display iterator's stack, are left + intact. This is called when the cached information is no more + useful for the current iteration, e.g. when we were reseated to a + new position on the same object. */ +static inline void bidi_cache_reset (void) { - bidi_cache_idx = 0; + bidi_cache_idx = bidi_cache_start; bidi_cache_last_idx = -1; } -static INLINE void +/* Shrink the cache to its minimal size. Called when we init the bidi + iterator for reordering a buffer or a string that does not come + from display properties, because that means all the previously + cached info is of no further use. 
*/ +static inline void bidi_cache_shrink (void) { if (bidi_cache_size > BIDI_CACHE_CHUNK) @@ -287,12 +335,12 @@ bidi_cache_shrink (void) bidi_cache_reset (); } -static INLINE void -bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it) +static inline void +bidi_cache_fetch_state (EMACS_INT idx, struct bidi_it *bidi_it) { int current_scan_dir = bidi_it->scan_dir; - if (idx < 0 || idx >= bidi_cache_idx) + if (idx < bidi_cache_start || idx >= bidi_cache_idx) abort (); bidi_copy_it (bidi_it, &bidi_cache[idx]); @@ -304,13 +352,15 @@ bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it) level less or equal to LEVEL. if LEVEL is -1, disregard the resolved levels in cached states. DIR, if non-zero, means search in that direction from the last cache hit. */ -static INLINE int +static inline EMACS_INT bidi_cache_search (EMACS_INT charpos, int level, int dir) { - int i, i_start; + EMACS_INT i, i_start; - if (bidi_cache_idx) + if (bidi_cache_idx > bidi_cache_start) { + if (bidi_cache_last_idx == -1) + bidi_cache_last_idx = bidi_cache_idx - 1; if (charpos < bidi_cache[bidi_cache_last_idx].charpos) { dir = -1; @@ -333,7 +383,7 @@ bidi_cache_search (EMACS_INT charpos, int level, int dir) if (dir < 0) { /* Linear search for now; FIXME! */ - for (i = i_start; i >= 0; i--) + for (i = i_start; i >= bidi_cache_start; i--) if (bidi_cache[i].charpos <= charpos && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars && (level == -1 || bidi_cache[i].resolved_level <= level)) @@ -355,8 +405,9 @@ bidi_cache_search (EMACS_INT charpos, int level, int dir) /* Find a cached state where the resolved level changes to a value that is lower than LEVEL, and return its cache slot index. DIR is the direction to search, starting with the last used cache slot. - BEFORE, if non-zero, means return the index of the slot that is - ``before'' the level change in the search direction. That is, + If DIR is zero, we search backwards from the last occupied cache + slot. 
BEFORE, if non-zero, means return the index of the slot that + is ``before'' the level change in the search direction. That is, given the cached levels like this: 1122333442211 @@ -366,14 +417,16 @@ bidi_cache_search (EMACS_INT charpos, int level, int dir) C, searching backwards (DIR = -1) for LEVEL = 2 will return the index of slot B or A, depending whether BEFORE is, respectively, non-zero or zero. */ -static int +static EMACS_INT bidi_cache_find_level_change (int level, int dir, int before) { if (bidi_cache_idx) { - int i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1; + EMACS_INT i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1; int incr = before ? 1 : 0; + xassert (!dir || bidi_cache_last_idx >= 0); + if (!dir) dir = -1; else if (!incr) @@ -381,7 +434,7 @@ bidi_cache_find_level_change (int level, int dir, int before) if (dir < 0) { - while (i >= incr) + while (i >= bidi_cache_start + incr) { if (bidi_cache[i - incr].resolved_level >= 0 && bidi_cache[i - incr].resolved_level < level) @@ -404,10 +457,23 @@ bidi_cache_find_level_change (int level, int dir, int before) return -1; } -static INLINE void +static inline void +bidi_cache_ensure_space (EMACS_INT idx) +{ + /* Enlarge the cache as needed. */ + if (idx >= bidi_cache_size) + { + while (idx >= bidi_cache_size) + bidi_cache_size += BIDI_CACHE_CHUNK; + bidi_cache = + (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz); + } +} + +static inline void bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved) { - int idx; + EMACS_INT idx; /* We should never cache on backward scans. */ if (bidi_it->scan_dir == -1) @@ -417,23 +483,17 @@ bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved) if (idx < 0) { idx = bidi_cache_idx; - /* Enlarge the cache as needed. 
*/ - if (idx >= bidi_cache_size) - { - bidi_cache_size += BIDI_CACHE_CHUNK; - bidi_cache = - (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz); - } + bidi_cache_ensure_space (idx); /* Character positions should correspond to cache positions 1:1. If we are outside the range of cached positions, the cache is useless and must be reset. */ - if (idx > 0 && + if (idx > bidi_cache_start && (bidi_it->charpos > (bidi_cache[idx - 1].charpos + bidi_cache[idx - 1].nchars) - || bidi_it->charpos < bidi_cache[0].charpos)) + || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos)) { bidi_cache_reset (); - idx = 0; + idx = bidi_cache_start; } if (bidi_it->nchars <= 0) abort (); @@ -465,12 +525,12 @@ bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved) bidi_cache_idx = idx + 1; } -static INLINE bidi_type_t +static inline bidi_type_t bidi_cache_find (EMACS_INT charpos, int level, struct bidi_it *bidi_it) { - int i = bidi_cache_search (charpos, level, bidi_it->scan_dir); + EMACS_INT i = bidi_cache_search (charpos, level, bidi_it->scan_dir); - if (i >= 0) + if (i >= bidi_cache_start) { bidi_dir_t current_scan_dir = bidi_it->scan_dir; @@ -485,72 +545,248 @@ bidi_cache_find (EMACS_INT charpos, int level, struct bidi_it *bidi_it) return UNKNOWN_BT; } -static INLINE int +static inline int bidi_peek_at_next_level (struct bidi_it *bidi_it) { - if (bidi_cache_idx == 0 || bidi_cache_last_idx == -1) + if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1) abort (); return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level; } -/* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph. - Value is the non-negative length of the paragraph separator - following the buffer position, -1 if position is at the beginning - of a new paragraph, or -2 if position is neither at beginning nor - at end of a paragraph. 
*/ -static EMACS_INT -bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos) + +/*********************************************************************** + Pushing and popping the bidi iterator state + ***********************************************************************/ +/* 5-slot stack for saving the start of the previous level of the + cache. xdisp.c maintains a 5-slot stack for its iterator state, + and we need the same size of our stack. */ +static EMACS_INT bidi_cache_start_stack[IT_STACK_SIZE]; +static int bidi_cache_sp; + +/* Push the bidi iterator state in preparation for reordering a + different object, e.g. display string found at certain buffer + position. Pushing the bidi iterator boils down to saving its + entire state on the cache and starting a new cache "stacked" on top + of the current cache. */ +void +bidi_push_it (struct bidi_it *bidi_it) { - Lisp_Object sep_re; - Lisp_Object start_re; - EMACS_INT val; + /* Save the current iterator state in its entirety after the last + used cache slot. */ + bidi_cache_ensure_space (bidi_cache_idx); + memcpy (&bidi_cache[bidi_cache_idx++], bidi_it, sizeof (struct bidi_it)); - sep_re = paragraph_separate_re; - start_re = paragraph_start_re; + /* Push the current cache start onto the stack. */ + xassert (bidi_cache_sp < IT_STACK_SIZE); + bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start; - val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil); - if (val < 0) + /* Start a new level of cache, and make it empty. */ + bidi_cache_start = bidi_cache_idx; + bidi_cache_last_idx = -1; +} + +/* Restore the iterator state saved by bidi_push_it and return the + cache to the corresponding state. */ +void +bidi_pop_it (struct bidi_it *bidi_it) +{ + if (bidi_cache_start <= 0) + abort (); + + /* Reset the next free cache slot index to what it was before the + call to bidi_push_it. */ + bidi_cache_idx = bidi_cache_start - 1; + + /* Restore the bidi iterator state saved in the cache. 
*/ + memcpy (bidi_it, &bidi_cache[bidi_cache_idx], sizeof (struct bidi_it)); + + /* Pop the previous cache start from the stack. */ + if (bidi_cache_sp <= 0) + abort (); + bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp]; + + /* Invalidate the last-used cache slot data. */ + bidi_cache_last_idx = -1; +} + +/* Stash away a copy of the cache and its control variables. */ +void * +bidi_shelve_cache (void) +{ + unsigned char *databuf; + + if (bidi_cache_idx == 0) + return NULL; + + databuf = xmalloc (sizeof (bidi_cache_idx) + + bidi_cache_idx * sizeof (struct bidi_it) + + sizeof (bidi_cache_start_stack) + + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start) + + sizeof (bidi_cache_last_idx)); + memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx)); + memcpy (databuf + sizeof (bidi_cache_idx), + bidi_cache, bidi_cache_idx * sizeof (struct bidi_it)); + memcpy (databuf + sizeof (bidi_cache_idx) + + bidi_cache_idx * sizeof (struct bidi_it), + bidi_cache_start_stack, sizeof (bidi_cache_start_stack)); + memcpy (databuf + sizeof (bidi_cache_idx) + + bidi_cache_idx * sizeof (struct bidi_it) + + sizeof (bidi_cache_start_stack), + &bidi_cache_sp, sizeof (bidi_cache_sp)); + memcpy (databuf + sizeof (bidi_cache_idx) + + bidi_cache_idx * sizeof (struct bidi_it) + + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp), + &bidi_cache_start, sizeof (bidi_cache_start)); + memcpy (databuf + sizeof (bidi_cache_idx) + + bidi_cache_idx * sizeof (struct bidi_it) + + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp) + + sizeof (bidi_cache_start), + &bidi_cache_last_idx, sizeof (bidi_cache_last_idx)); + + return databuf; +} + +/* Restore the cache state from a copy stashed away by bidi_shelve_cache. */ +void +bidi_unshelve_cache (void *databuf) +{ + unsigned char *p = databuf; + + if (!p) { - if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0) - val = -1; - else - val = -2; + /* A NULL pointer means an empty cache. 
*/ + bidi_cache_start = 0; + bidi_cache_sp = 0; + bidi_cache_reset (); + } + else + { + memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx)); + bidi_cache_ensure_space (bidi_cache_idx); + memcpy (bidi_cache, p + sizeof (bidi_cache_idx), + bidi_cache_idx * sizeof (struct bidi_it)); + memcpy (bidi_cache_start_stack, + p + sizeof (bidi_cache_idx) + + bidi_cache_idx * sizeof (struct bidi_it), + sizeof (bidi_cache_start_stack)); + memcpy (&bidi_cache_sp, + p + sizeof (bidi_cache_idx) + + bidi_cache_idx * sizeof (struct bidi_it) + + sizeof (bidi_cache_start_stack), + sizeof (bidi_cache_sp)); + memcpy (&bidi_cache_start, + p + sizeof (bidi_cache_idx) + + bidi_cache_idx * sizeof (struct bidi_it) + + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp), + sizeof (bidi_cache_start)); + memcpy (&bidi_cache_last_idx, + p + sizeof (bidi_cache_idx) + + bidi_cache_idx * sizeof (struct bidi_it) + + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp) + + sizeof (bidi_cache_start), + sizeof (bidi_cache_last_idx)); + + xfree (p); } - - return val; } -/* Determine the start-of-run (sor) directional type given the two - embedding levels on either side of the run boundary. Also, update - the saved info about previously seen characters, since that info is - generally valid for a single level run. */ -static INLINE void -bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after) + +/*********************************************************************** + Initialization + ***********************************************************************/ +static void +bidi_initialize (void) { - int higher_level = level_before > level_after ? level_before : level_after; - /* The prev_was_pdf gork is required for when we have several PDFs - in a row. In that case, we want to compute the sor type for the - next level run only once: when we see the first PDF. 
That's - because the sor type depends only on the higher of the two levels - that we find on the two sides of the level boundary (see UAX#9, - clause X10), and so we don't need to know the final embedding - level to which we descend after processing all the PDFs. */ - if (!bidi_it->prev_was_pdf || level_before < level_after) - /* FIXME: should the default sor direction be user selectable? */ - bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R; - if (level_before > level_after) - bidi_it->prev_was_pdf = 1; +#include "biditype.h" +#include "bidimirror.h" - bidi_it->prev.type = UNKNOWN_BT; + int i; + + bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L)); + staticpro (&bidi_type_table); + + for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++) + char_table_set_range (bidi_type_table, bidi_type[i].from, bidi_type[i].to, + make_number (bidi_type[i].type)); + + bidi_mirror_table = Fmake_char_table (Qnil, Qnil); + staticpro (&bidi_mirror_table); + + for (i = 0; i < sizeof bidi_mirror / sizeof bidi_mirror[0]; i++) + char_table_set (bidi_mirror_table, bidi_mirror[i].from, + make_number (bidi_mirror[i].to)); + + Qparagraph_start = intern ("paragraph-start"); + staticpro (&Qparagraph_start); + paragraph_start_re = Fsymbol_value (Qparagraph_start); + if (!STRINGP (paragraph_start_re)) + paragraph_start_re = build_string ("\f\\|[ \t]*$"); + staticpro (&paragraph_start_re); + Qparagraph_separate = intern ("paragraph-separate"); + staticpro (&Qparagraph_separate); + paragraph_separate_re = Fsymbol_value (Qparagraph_separate); + if (!STRINGP (paragraph_separate_re)) + paragraph_separate_re = build_string ("[ \t\f]*$"); + staticpro (&paragraph_separate_re); + + bidi_cache_sp = 0; + + bidi_initialized = 1; +} + +/* Do whatever UAX#9 clause X8 says should be done at paragraph's + end. 
*/ +static inline void +bidi_set_paragraph_end (struct bidi_it *bidi_it) +{ + bidi_it->invalid_levels = 0; + bidi_it->invalid_rl_levels = -1; + bidi_it->stack_idx = 0; + bidi_it->resolved_level = bidi_it->level_stack[0].level; +} + +/* Initialize the bidi iterator from buffer/string position CHARPOS. */ +void +bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, int frame_window_p, + struct bidi_it *bidi_it) +{ + if (! bidi_initialized) + bidi_initialize (); + if (charpos >= 0) + bidi_it->charpos = charpos; + if (bytepos >= 0) + bidi_it->bytepos = bytepos; + bidi_it->frame_window_p = frame_window_p; + bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */ + bidi_it->first_elt = 1; + bidi_set_paragraph_end (bidi_it); + bidi_it->new_paragraph = 1; + bidi_it->separator_limit = -1; + bidi_it->type = NEUTRAL_B; + bidi_it->type_after_w1 = NEUTRAL_B; + bidi_it->orig_type = NEUTRAL_B; + bidi_it->prev_was_pdf = 0; + bidi_it->prev.type = bidi_it->prev.type_after_w1 = + bidi_it->prev.orig_type = UNKNOWN_BT; bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 = bidi_it->last_strong.orig_type = UNKNOWN_BT; - bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L; - bidi_it->prev_for_neutral.charpos = bidi_it->charpos; - bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos; - bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 = + bidi_it->next_for_neutral.charpos = -1; + bidi_it->next_for_neutral.type = + bidi_it->next_for_neutral.type_after_w1 = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT; - bidi_it->ignore_bn_limit = 0; /* meaning it's unknown */ + bidi_it->prev_for_neutral.charpos = -1; + bidi_it->prev_for_neutral.type = + bidi_it->prev_for_neutral.type_after_w1 = + bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT; + bidi_it->sor = L2R; /* FIXME: should it be user-selectable? 
*/ + bidi_it->disp_pos = -1; /* invalid/unknown */ + /* We can only shrink the cache if we are at the bottom level of its + "stack". */ + if (bidi_cache_start == 0) + bidi_cache_shrink (); + else + bidi_cache_reset (); } /* Perform initializations for reordering a new line of bidi text. */ @@ -571,6 +807,57 @@ bidi_line_init (struct bidi_it *bidi_it) bidi_cache_reset (); } + +/*********************************************************************** + Fetching characters + ***********************************************************************/ + +/* Count bytes in string S between BEG/BEGBYTE and END. BEG and END + are zero-based character positions in S, BEGBYTE is byte position + corresponding to BEG. UNIBYTE, if non-zero, means S is a unibyte + string. */ +static inline EMACS_INT +bidi_count_bytes (const unsigned char *s, const EMACS_INT beg, + const EMACS_INT begbyte, const EMACS_INT end, int unibyte) +{ + EMACS_INT pos = beg; + const unsigned char *p = s + begbyte, *start = p; + + if (unibyte) + p = s + end; + else + { + if (!CHAR_HEAD_P (*p)) + abort (); + + while (pos < end) + { + p += BYTES_BY_CHAR_HEAD (*p); + pos++; + } + } + + return p - start; +} + +/* Fetch and return the character at byte position BYTEPOS. If S is + non-NULL, fetch the character from string S; otherwise fetch the + character from the current buffer. UNIBYTE non-zero means S is a + unibyte string. */ +static inline int +bidi_char_at_pos (EMACS_INT bytepos, const unsigned char *s, int unibyte) +{ + if (s) + { + if (unibyte) + return s[bytepos]; + else + return STRING_CHAR (s + bytepos); + } + else + return FETCH_MULTIBYTE_CHAR (bytepos); +} + /* Fetch and return the character at BYTEPOS/CHARPOS. If that character is covered by a display string, treat the entire run of covered characters as a single character u+FFFC, and return their @@ -578,26 +865,34 @@ bidi_line_init (struct bidi_it *bidi_it) character position of the next display string, or -1 if not yet computed. 
When the next character is at or beyond that position, the function updates DISP_POS with the position of the next display - string. */ -static INLINE int + string. STRING->s is the C string to iterate, or NULL if iterating + over a buffer or a Lisp string; in the latter case, STRING->lstring + is the Lisp string. */ +static inline int bidi_fetch_char (EMACS_INT bytepos, EMACS_INT charpos, EMACS_INT *disp_pos, + struct bidi_string_data *string, int frame_window_p, EMACS_INT *ch_len, EMACS_INT *nchars) { int ch; + EMACS_INT endpos = + (string->s || STRINGP (string->lstring)) ? string->schars : ZV; + struct text_pos pos; - /* FIXME: Support strings in addition to buffers. */ /* If we got past the last known position of display string, compute - the position of the next one. That position could be at BYTEPOS. */ - if (charpos < ZV && charpos > *disp_pos) - *disp_pos = compute_display_string_pos (charpos, frame_window_p); + the position of the next one. That position could be at CHARPOS. */ + if (charpos < endpos && charpos > *disp_pos) + { + SET_TEXT_POS (pos, charpos, bytepos); + *disp_pos = compute_display_string_pos (&pos, string, frame_window_p); + } /* Fetch the character at BYTEPOS. */ - if (bytepos >= ZV_BYTE) + if (charpos >= endpos) { ch = BIDI_EOB; *ch_len = 1; *nchars = 1; - *disp_pos = ZV; + *disp_pos = endpos; } else if (charpos >= *disp_pos) { @@ -608,28 +903,105 @@ bidi_fetch_char (EMACS_INT bytepos, EMACS_INT charpos, EMACS_INT *disp_pos, if (charpos > *disp_pos) abort (); /* Return the Unicode Object Replacement Character to represent - the entire run of characters covered by the display - string. */ + the entire run of characters covered by the display string. 
*/ ch = 0xFFFC; - disp_end_pos = compute_display_string_end (*disp_pos); + disp_end_pos = compute_display_string_end (*disp_pos, string); *nchars = disp_end_pos - *disp_pos; - *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos; + if (*nchars <= 0) + abort (); + if (string->s) + *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos, + disp_end_pos, string->unibyte); + else if (STRINGP (string->lstring)) + *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos, + bytepos, disp_end_pos, string->unibyte); + else + *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos; } else { - ch = FETCH_MULTIBYTE_CHAR (bytepos); + if (string->s) + { + int len; + + if (!string->unibyte) + { + ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len); + *ch_len = len; + } + else + { + ch = UNIBYTE_TO_CHAR (string->s[bytepos]); + *ch_len = 1; + } + } + else if (STRINGP (string->lstring)) + { + int len; + + if (!string->unibyte) + { + ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos, + len); + *ch_len = len; + } + else + { + ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos)); + *ch_len = 1; + } + } + else + { + ch = FETCH_MULTIBYTE_CHAR (bytepos); + *ch_len = CHAR_BYTES (ch); + } *nchars = 1; - *ch_len = CHAR_BYTES (ch); } /* If we just entered a run of characters covered by a display string, compute the position of the next display string. */ - if (charpos + *nchars <= ZV && charpos + *nchars > *disp_pos) - *disp_pos = compute_display_string_pos (charpos + *nchars, frame_window_p); + if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos) + { + SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len); + *disp_pos = compute_display_string_pos (&pos, string, frame_window_p); + } return ch; } + +/*********************************************************************** + Determining paragraph direction + ***********************************************************************/ + +/* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph. 
+ Value is the non-negative length of the paragraph separator + following the buffer position, -1 if position is at the beginning + of a new paragraph, or -2 if position is neither at beginning nor + at end of a paragraph. */ +static EMACS_INT +bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos) +{ + Lisp_Object sep_re; + Lisp_Object start_re; + EMACS_INT val; + + sep_re = paragraph_separate_re; + start_re = paragraph_start_re; + + val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil); + if (val < 0) + { + if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0) + val = -1; + else + val = -2; + } + + return val; +} + /* Find the beginning of this paragraph by looking back in the buffer. Value is the byte position of the paragraph's beginning. */ static EMACS_INT @@ -670,13 +1042,19 @@ void bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p) { EMACS_INT bytepos = bidi_it->bytepos; + int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring); EMACS_INT pstartbyte; + /* Note that begbyte is a byte position, while end is a character + position. Yes, this is ugly, but we are trying to avoid costly + calls to BYTE_TO_CHAR and its ilk. */ + EMACS_INT begbyte = string_p ? 0 : BEGV_BYTE; + EMACS_INT end = string_p ? bidi_it->string.schars : ZV; /* Special case for an empty buffer. */ - if (bytepos == BEGV_BYTE && bytepos == ZV_BYTE) + if (bytepos == begbyte && bidi_it->charpos == end) dir = L2R; /* We should never be called at EOB or before BEGV. 
*/ - else if (bytepos >= ZV_BYTE || bytepos < BEGV_BYTE) + else if (bidi_it->charpos >= end || bytepos < begbyte) abort (); if (dir == L2R) @@ -695,6 +1073,7 @@ bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p) EMACS_INT ch_len, nchars; EMACS_INT pos, disp_pos = -1; bidi_type_t type; + const unsigned char *s; if (!bidi_initialized) bidi_initialize (); @@ -712,7 +1091,10 @@ bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p) we are potentially in a new paragraph that doesn't yet exist. */ pos = bidi_it->charpos; - if (bytepos > BEGV_BYTE && FETCH_CHAR (bytepos) == '\n') + s = STRINGP (bidi_it->string.lstring) ? + SDATA (bidi_it->string.lstring) : bidi_it->string.s; + if (bytepos > begbyte + && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n') { bytepos++; pos++; @@ -720,17 +1102,25 @@ bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p) /* We are either at the beginning of a paragraph or in the middle of it. Find where this paragraph starts. */ - pstartbyte = bidi_find_paragraph_start (pos, bytepos); + if (string_p) + { + /* We don't support changes of paragraph direction inside a + string. It is treated as a single paragraph. */ + pstartbyte = 0; + } + else + pstartbyte = bidi_find_paragraph_start (pos, bytepos); bidi_it->separator_limit = -1; bidi_it->new_paragraph = 0; /* The following loop is run more than once only if NO_DEFAULT_P - is non-zero. */ + is non-zero, and only if we are iterating on a buffer. 
*/ do { bytepos = pstartbyte; - pos = BYTE_TO_CHAR (bytepos); - ch = bidi_fetch_char (bytepos, pos, &disp_pos, bidi_it->frame_window_p, - &ch_len, &nchars); + if (!string_p) + pos = BYTE_TO_CHAR (bytepos); + ch = bidi_fetch_char (bytepos, pos, &disp_pos, &bidi_it->string, + bidi_it->frame_window_p, &ch_len, &nchars); type = bidi_get_type (ch, NEUTRAL_DIR); for (pos += nchars, bytepos += ch_len; @@ -744,17 +1134,19 @@ bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p) || type == LRE || type == LRO)); type = bidi_get_type (ch, NEUTRAL_DIR)) { - if (type == NEUTRAL_B && bidi_at_paragraph_end (pos, bytepos) >= -1) - break; - if (bytepos >= ZV_BYTE) + if (pos >= end) { /* Pretend there's a paragraph separator at end of - buffer. */ + buffer/string. */ type = NEUTRAL_B; break; } + if (!string_p + && type == NEUTRAL_B + && bidi_at_paragraph_end (pos, bytepos) >= -1) + break; /* Fetch next character and advance to get past it. */ - ch = bidi_fetch_char (bytepos, pos, &disp_pos, + ch = bidi_fetch_char (bytepos, pos, &disp_pos, &bidi_it->string, bidi_it->frame_window_p, &ch_len, &nchars); pos += nchars; bytepos += ch_len; @@ -763,7 +1155,8 @@ bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p) bidi_it->paragraph_dir = R2L; else if (type == STRONG_L) bidi_it->paragraph_dir = L2R; - if (no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR) + if (!string_p + && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR) { /* If this paragraph is at BEGV, default to L2R. 
*/ if (pstartbyte == BEGV_BYTE) @@ -786,7 +1179,8 @@ bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p) pstartbyte = prevpbyte; } } - } while (no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR); + } while (!string_p + && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR); } else abort (); @@ -804,112 +1198,13 @@ bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p) bidi_line_init (bidi_it); } -/* Do whatever UAX#9 clause X8 says should be done at paragraph's - end. */ -static INLINE void -bidi_set_paragraph_end (struct bidi_it *bidi_it) -{ - bidi_it->invalid_levels = 0; - bidi_it->invalid_rl_levels = -1; - bidi_it->stack_idx = 0; - bidi_it->resolved_level = bidi_it->level_stack[0].level; -} - -/* Initialize the bidi iterator from buffer/string position CHARPOS. */ -void -bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, int frame_window_p, - struct bidi_it *bidi_it) -{ - if (! bidi_initialized) - bidi_initialize (); - bidi_it->charpos = charpos; - bidi_it->bytepos = bytepos; - bidi_it->frame_window_p = frame_window_p; - bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */ - bidi_it->first_elt = 1; - bidi_set_paragraph_end (bidi_it); - bidi_it->new_paragraph = 1; - bidi_it->separator_limit = -1; - bidi_it->type = NEUTRAL_B; - bidi_it->type_after_w1 = NEUTRAL_B; - bidi_it->orig_type = NEUTRAL_B; - bidi_it->prev_was_pdf = 0; - bidi_it->prev.type = bidi_it->prev.type_after_w1 = - bidi_it->prev.orig_type = UNKNOWN_BT; - bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 = - bidi_it->last_strong.orig_type = UNKNOWN_BT; - bidi_it->next_for_neutral.charpos = -1; - bidi_it->next_for_neutral.type = - bidi_it->next_for_neutral.type_after_w1 = - bidi_it->next_for_neutral.orig_type = UNKNOWN_BT; - bidi_it->prev_for_neutral.charpos = -1; - bidi_it->prev_for_neutral.type = - bidi_it->prev_for_neutral.type_after_w1 = - bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT; - bidi_it->sor = 
L2R; /* FIXME: should it be user-selectable? */ - bidi_it->disp_pos = -1; /* invalid/unknown */ - bidi_cache_shrink (); -} - -/* Push the current embedding level and override status; reset the - current level to LEVEL and the current override status to OVERRIDE. */ -static INLINE void -bidi_push_embedding_level (struct bidi_it *bidi_it, - int level, bidi_dir_t override) -{ - bidi_it->stack_idx++; - if (bidi_it->stack_idx >= BIDI_MAXLEVEL) - abort (); - bidi_it->level_stack[bidi_it->stack_idx].level = level; - bidi_it->level_stack[bidi_it->stack_idx].override = override; -} - -/* Pop the embedding level and directional override status from the - stack, and return the new level. */ -static INLINE int -bidi_pop_embedding_level (struct bidi_it *bidi_it) -{ - /* UAX#9 says to ignore invalid PDFs. */ - if (bidi_it->stack_idx > 0) - bidi_it->stack_idx--; - return bidi_it->level_stack[bidi_it->stack_idx].level; -} - -/* Record in SAVED_INFO the information about the current character. */ -static INLINE void -bidi_remember_char (struct bidi_saved_info *saved_info, - struct bidi_it *bidi_it) -{ - saved_info->charpos = bidi_it->charpos; - saved_info->bytepos = bidi_it->bytepos; - saved_info->type = bidi_it->type; - bidi_check_type (bidi_it->type); - saved_info->type_after_w1 = bidi_it->type_after_w1; - bidi_check_type (bidi_it->type_after_w1); - saved_info->orig_type = bidi_it->orig_type; - bidi_check_type (bidi_it->orig_type); -} - -/* Resolve the type of a neutral character according to the type of - surrounding strong text and the current embedding level. */ -static INLINE bidi_type_t -bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev) -{ - /* N1: European and Arabic numbers are treated as though they were R. 
*/ - if (next_type == WEAK_EN || next_type == WEAK_AN) - next_type = STRONG_R; - if (prev_type == WEAK_EN || prev_type == WEAK_AN) - prev_type = STRONG_R; - - if (next_type == prev_type) /* N1 */ - return next_type; - else if ((lev & 1) == 0) /* N2 */ - return STRONG_L; - else - return STRONG_R; -} + +/*********************************************************************** + Resolving explicit and implicit levels. + The rest of this file constitutes the core of the UBA implementation. + ***********************************************************************/ -static INLINE int +static inline int bidi_explicit_dir_char (int ch) { bidi_type_t ch_type; @@ -934,19 +1229,35 @@ bidi_resolve_explicit_1 (struct bidi_it *bidi_it) int current_level; int new_level; bidi_dir_t override; + int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring); /* If reseat()'ed, don't advance, so as to start iteration from the position where we were reseated. bidi_it->bytepos can be less than BEGV_BYTE after reseat to BEGV. */ - if (bidi_it->bytepos < BEGV_BYTE + if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE) || bidi_it->first_elt) { bidi_it->first_elt = 0; - if (bidi_it->charpos < BEGV) - bidi_it->charpos = BEGV; - bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos); + if (string_p) + { + const unsigned char *p = + STRINGP (bidi_it->string.lstring) + ? SDATA (bidi_it->string.lstring) : bidi_it->string.s; + + if (bidi_it->charpos < 0) + bidi_it->charpos = 0; + bidi_it->bytepos = bidi_count_bytes (p, 0, 0, bidi_it->charpos, + bidi_it->string.unibyte); + } + else + { + if (bidi_it->charpos < BEGV) + bidi_it->charpos = BEGV; + bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos); + } } - else if (bidi_it->bytepos < ZV_BYTE) /* don't move at ZV */ + /* Don't move at end of buffer/string. */ + else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV)) { /* Advance to the next character, skipping characters covered by display strings (nchars > 1). 
*/ @@ -962,12 +1273,12 @@ bidi_resolve_explicit_1 (struct bidi_it *bidi_it) override = bidi_it->level_stack[bidi_it->stack_idx].override; new_level = current_level; - if (bidi_it->bytepos >= ZV_BYTE) + if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV)) { curchar = BIDI_EOB; bidi_it->ch_len = 1; bidi_it->nchars = 1; - bidi_it->disp_pos = ZV; + bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV); } else { @@ -975,7 +1286,8 @@ bidi_resolve_explicit_1 (struct bidi_it *bidi_it) display string, treat the entire run of covered characters as a single character u+FFFC. */ curchar = bidi_fetch_char (bidi_it->bytepos, bidi_it->charpos, - &bidi_it->disp_pos, bidi_it->frame_window_p, + &bidi_it->disp_pos, &bidi_it->string, + bidi_it->frame_window_p, &bidi_it->ch_len, &bidi_it->nchars); } bidi_it->ch = curchar; @@ -1000,7 +1312,7 @@ bidi_resolve_explicit_1 (struct bidi_it *bidi_it) bidi_it->type_after_w1 = type; bidi_check_type (bidi_it->type_after_w1); type = WEAK_BN; /* X9/Retaining */ - if (bidi_it->ignore_bn_limit <= 0) + if (bidi_it->ignore_bn_limit <= -1) { if (current_level <= BIDI_MAXLEVEL - 4) { @@ -1033,7 +1345,7 @@ bidi_resolve_explicit_1 (struct bidi_it *bidi_it) bidi_it->type_after_w1 = type; bidi_check_type (bidi_it->type_after_w1); type = WEAK_BN; /* X9/Retaining */ - if (bidi_it->ignore_bn_limit <= 0) + if (bidi_it->ignore_bn_limit <= -1) { if (current_level <= BIDI_MAXLEVEL - 5) { @@ -1068,7 +1380,7 @@ bidi_resolve_explicit_1 (struct bidi_it *bidi_it) bidi_it->type_after_w1 = type; bidi_check_type (bidi_it->type_after_w1); type = WEAK_BN; /* X9/Retaining */ - if (bidi_it->ignore_bn_limit <= 0) + if (bidi_it->ignore_bn_limit <= -1) { if (!bidi_it->invalid_rl_levels) { @@ -1111,13 +1423,17 @@ bidi_resolve_explicit (struct bidi_it *bidi_it) { int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level; int new_level = bidi_resolve_explicit_1 (bidi_it); + EMACS_INT eob = bidi_it->string.s ? 
bidi_it->string.schars : ZV; + const unsigned char *s = STRINGP (bidi_it->string.lstring) + ? SDATA (bidi_it->string.lstring) : bidi_it->string.s; if (prev_level < new_level && bidi_it->type == WEAK_BN - && bidi_it->ignore_bn_limit == 0 /* only if not already known */ - && bidi_it->bytepos < ZV_BYTE /* not already at EOB */ - && bidi_explicit_dir_char (FETCH_MULTIBYTE_CHAR (bidi_it->bytepos - + bidi_it->ch_len))) + && bidi_it->ignore_bn_limit == -1 /* only if not already known */ + && bidi_it->charpos < eob /* not already at EOB */ + && bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos + + bidi_it->ch_len, s, + bidi_it->string.unibyte))) { /* Avoid pushing and popping embedding levels if the level run is empty, as this breaks level runs where it shouldn't. @@ -1129,12 +1445,17 @@ bidi_resolve_explicit (struct bidi_it *bidi_it) bidi_copy_it (&saved_it, bidi_it); - while (bidi_explicit_dir_char (FETCH_MULTIBYTE_CHAR (bidi_it->bytepos - + bidi_it->ch_len))) + while (bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos + + bidi_it->ch_len, s, + bidi_it->string.unibyte))) { /* This advances to the next character, skipping any characters covered by display strings. */ level = bidi_resolve_explicit_1 (bidi_it); + /* If string.lstring was relocated inside bidi_resolve_explicit_1, + a pointer to its data is no longer valid. */ + if (STRINGP (bidi_it->string.lstring)) + s = SDATA (bidi_it->string.lstring); } if (bidi_it->nchars <= 0) @@ -1142,10 +1463,10 @@ bidi_resolve_explicit (struct bidi_it *bidi_it) if (level == prev_level) /* empty embedding */ saved_it.ignore_bn_limit = bidi_it->charpos + bidi_it->nchars; else /* this embedding is non-empty */ - saved_it.ignore_bn_limit = -1; + saved_it.ignore_bn_limit = -2; bidi_copy_it (bidi_it, &saved_it); - if (bidi_it->ignore_bn_limit > 0) + if (bidi_it->ignore_bn_limit > -1) { /* We pushed a level, but we shouldn't have. Undo that. 
*/ if (!bidi_it->invalid_rl_levels) @@ -1188,6 +1509,9 @@ bidi_resolve_weak (struct bidi_it *bidi_it) int next_char; bidi_type_t type_of_next; struct bidi_it saved_it; + EMACS_INT eob = + (STRINGP (bidi_it->string.lstring) || bidi_it->string.s) + ? bidi_it->string.schars : ZV; type = bidi_it->type; override = bidi_it->level_stack[bidi_it->stack_idx].override; @@ -1254,10 +1578,15 @@ bidi_resolve_weak (struct bidi_it *bidi_it) && bidi_it->prev.orig_type == WEAK_EN) || bidi_it->prev.type_after_w1 == WEAK_AN))) { + const unsigned char *s = + STRINGP (bidi_it->string.lstring) + ? SDATA (bidi_it->string.lstring) : bidi_it->string.s; + next_char = - bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE - ? BIDI_EOB : FETCH_MULTIBYTE_CHAR (bidi_it->bytepos - + bidi_it->ch_len); + bidi_it->charpos + bidi_it->nchars >= eob + ? BIDI_EOB + : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s, + bidi_it->string.unibyte); type_of_next = bidi_get_type (next_char, override); if (type_of_next == WEAK_BN @@ -1306,13 +1635,17 @@ bidi_resolve_weak (struct bidi_it *bidi_it) else /* W5: ET/BN with EN after it. */ { EMACS_INT en_pos = bidi_it->charpos + bidi_it->nchars; + const unsigned char *s = + STRINGP (bidi_it->string.lstring) + ? SDATA (bidi_it->string.lstring) : bidi_it->string.s; if (bidi_it->nchars <= 0) abort (); next_char = - bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE - ? BIDI_EOB : FETCH_MULTIBYTE_CHAR (bidi_it->bytepos - + bidi_it->ch_len); + bidi_it->charpos + bidi_it->nchars >= eob + ? BIDI_EOB + : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s, + bidi_it->string.unibyte); type_of_next = bidi_get_type (next_char, override); if (type_of_next == WEAK_ET @@ -1373,6 +1706,25 @@ bidi_resolve_weak (struct bidi_it *bidi_it) return type; } +/* Resolve the type of a neutral character according to the type of + surrounding strong text and the current embedding level. 
*/ +static inline bidi_type_t +bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev) +{ + /* N1: European and Arabic numbers are treated as though they were R. */ + if (next_type == WEAK_EN || next_type == WEAK_AN) + next_type = STRONG_R; + if (prev_type == WEAK_EN || prev_type == WEAK_AN) + prev_type = STRONG_R; + + if (next_type == prev_type) /* N1 */ + return next_type; + else if ((lev & 1) == 0) /* N2 */ + return STRONG_L; + else + return STRONG_R; +} + static bidi_type_t bidi_resolve_neutral (struct bidi_it *bidi_it) { @@ -1509,11 +1861,11 @@ bidi_type_of_next_char (struct bidi_it *bidi_it) /* Reset the limit until which to ignore BNs if we step out of the area where we found only empty levels. */ - if ((bidi_it->ignore_bn_limit > 0 + if ((bidi_it->ignore_bn_limit > -1 && bidi_it->ignore_bn_limit <= bidi_it->charpos) - || (bidi_it->ignore_bn_limit == -1 + || (bidi_it->ignore_bn_limit == -2 && !bidi_explicit_dir_char (bidi_it->ch))) - bidi_it->ignore_bn_limit = 0; + bidi_it->ignore_bn_limit = -1; type = bidi_resolve_neutral (bidi_it); @@ -1530,12 +1882,16 @@ bidi_level_of_next_char (struct bidi_it *bidi_it) bidi_type_t type; int level, prev_level = -1; struct bidi_saved_info next_for_neutral; - EMACS_INT next_char_pos; + EMACS_INT next_char_pos = -2; if (bidi_it->scan_dir == 1) { + EMACS_INT eob = + (bidi_it->string.s || STRINGP (bidi_it->string.lstring)) + ? bidi_it->string.schars : ZV; + /* There's no sense in trying to advance if we hit end of text. */ - if (bidi_it->bytepos >= ZV_BYTE) + if (bidi_it->charpos >= eob) return bidi_it->resolved_level; /* Record the info about the previous character. */ @@ -1575,17 +1931,27 @@ bidi_level_of_next_char (struct bidi_it *bidi_it) /* Perhaps the character we want is already cached. If it is, the call to bidi_cache_find below will return a type other than UNKNOWN_BT. 
*/ - if (bidi_cache_idx && !bidi_it->first_elt) + if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt) { + int bob = + (bidi_it->string.s || STRINGP (bidi_it->string.lstring)) ? 0 : 1; + if (bidi_it->scan_dir > 0) { if (bidi_it->nchars <= 0) abort (); next_char_pos = bidi_it->charpos + bidi_it->nchars; } - else + else if (bidi_it->charpos >= bob) + /* Implementation note: we allow next_char_pos to be as low as + 0 for buffers or -1 for strings, and that is okay because + that's the "position" of the sentinel iterator state we + cached at the beginning of the iteration. */ next_char_pos = bidi_it->charpos - 1; - type = bidi_cache_find (next_char_pos, -1, bidi_it); + if (next_char_pos >= bob - 1) + type = bidi_cache_find (next_char_pos, -1, bidi_it); + else + type = UNKNOWN_BT; } else type = UNKNOWN_BT; @@ -1647,18 +2013,19 @@ bidi_level_of_next_char (struct bidi_it *bidi_it) && bidi_it->next_for_ws.type == UNKNOWN_BT) { int ch; - int clen = bidi_it->ch_len; + EMACS_INT clen = bidi_it->ch_len; EMACS_INT bpos = bidi_it->bytepos; EMACS_INT cpos = bidi_it->charpos; EMACS_INT disp_pos = bidi_it->disp_pos; EMACS_INT nc = bidi_it->nchars; + struct bidi_string_data bs = bidi_it->string; bidi_type_t chtype; int fwp = bidi_it->frame_window_p; if (bidi_it->nchars <= 0) abort (); do { - ch = bidi_fetch_char (bpos += clen, cpos += nc, &disp_pos, fwp, + ch = bidi_fetch_char (bpos += clen, cpos += nc, &disp_pos, &bs, fwp, &clen, &nc); if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */) chtype = NEUTRAL_B; @@ -1756,10 +2123,11 @@ static void bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag) { int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir; - int idx; + EMACS_INT idx; /* Try the cache first. 
*/ - if ((idx = bidi_cache_find_level_change (level, dir, end_flag)) >= 0) + if ((idx = bidi_cache_find_level_change (level, dir, end_flag)) + >= bidi_cache_start) bidi_cache_fetch_state (idx, bidi_it); else { @@ -1781,12 +2149,21 @@ bidi_move_to_visually_next (struct bidi_it *bidi_it) { int old_level, new_level, next_level; struct bidi_it sentinel; + struct gcpro gcpro1; + + if (bidi_it->charpos < 0 || bidi_it->bytepos < 0) + abort (); if (bidi_it->scan_dir == 0) { bidi_it->scan_dir = 1; /* default to logical order */ } + /* The code below can call eval, and thus cause GC. If we are + iterating a Lisp string, make sure it won't be GCed. */ + if (STRINGP (bidi_it->string.lstring)) + GCPRO1 (bidi_it->string.lstring); + /* If we just passed a newline, initialize for the next line. */ if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B) bidi_line_init (bidi_it); @@ -1794,7 +2171,7 @@ bidi_move_to_visually_next (struct bidi_it *bidi_it) /* Prepare the sentinel iterator state, and cache it. When we bump into it, scanning backwards, we'll know that the last non-base level is exhausted. */ - if (bidi_cache_idx == 0) + if (bidi_cache_idx == bidi_cache_start) { bidi_copy_it (&sentinel, bidi_it); if (bidi_it->first_elt) @@ -1869,26 +2246,34 @@ bidi_move_to_visually_next (struct bidi_it *bidi_it) reordering, whereas we _must_ know the paragraph base direction _before_ we process the paragraph's text, since the base direction affects the reordering. */ - if (bidi_it->scan_dir == 1 - && bidi_it->orig_type == NEUTRAL_B - && bidi_it->bytepos < ZV_BYTE) + if (bidi_it->scan_dir == 1 && bidi_it->orig_type == NEUTRAL_B) { - EMACS_INT sep_len = - bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars, - bidi_it->bytepos + bidi_it->ch_len); - if (bidi_it->nchars <= 0) - abort (); - if (sep_len >= 0) + /* The paragraph direction of the entire string, once + determined, is in effect for the entire string. 
Setting the + separator limit to the end of the string prevents + bidi_paragraph_init from being called automatically on this + string. */ + if (bidi_it->string.s || STRINGP (bidi_it->string.lstring)) + bidi_it->separator_limit = bidi_it->string.schars; + else if (bidi_it->bytepos < ZV_BYTE) { - bidi_it->new_paragraph = 1; - /* Record the buffer position of the last character of the - paragraph separator. */ - bidi_it->separator_limit = - bidi_it->charpos + bidi_it->nchars + sep_len; + EMACS_INT sep_len = + bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars, + bidi_it->bytepos + bidi_it->ch_len); + if (bidi_it->nchars <= 0) + abort (); + if (sep_len >= 0) + { + bidi_it->new_paragraph = 1; + /* Record the buffer position of the last character of the + paragraph separator. */ + bidi_it->separator_limit = + bidi_it->charpos + bidi_it->nchars + sep_len; + } } } - if (bidi_it->scan_dir == 1 && bidi_cache_idx) + if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start) { /* If we are at paragraph's base embedding level and beyond the last cached position, the cache's job is done and we can @@ -1904,6 +2289,9 @@ bidi_move_to_visually_next (struct bidi_it *bidi_it) else bidi_cache_iterator_state (bidi_it, 1); } + + if (STRINGP (bidi_it->string.lstring)) + UNGCPRO; } /* This is meant to be called from within the debugger, whenever you @@ -1920,7 +2308,7 @@ bidi_dump_cached_states (void) fprintf (stderr, "The cache is empty.\n"); return; } - fprintf (stderr, "Total of %d state%s in cache:\n", + fprintf (stderr, "Total of %"pD"d state%s in cache:\n", bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s"); for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)