src/bidi.c

   1 /* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
   2    Copyright (C) 2000-2001, 2004-2005, 2009-2014 Free Software
   3    Foundation, Inc.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software: you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation, either version 3 of the License, or
  10 (at your option) any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20 /* Written by Eli Zaretskii <eliz@gnu.org>.
  21
  22    A sequential implementation of the Unicode Bidirectional algorithm,
  23    (UBA) as per UAX#9, a part of the Unicode Standard.
  24
  25    Unlike the Reference Implementation and most other implementations,
  26    this one is designed to be called once for every character in the
  27    buffer or string.  That way, we can leave intact the design of the
  28    Emacs display engine, whereby an iterator object is used to
  29    traverse buffer or string text character by character, and generate
  30    the necessary data for displaying each character in 'struct glyph'
  31    objects.  (See xdisp.c for the details of that iteration.)  The
  32    functions on this file replace the original linear iteration in the
  33    logical order of the text with a non-linear iteration in the visual
  34    order, i.e. in the order characters should be shown on display.
  35
  36    The main entry point is bidi_move_to_visually_next.  Each time it
  37    is called, it finds the next character in the visual order, and
  38    returns its information in a special structure.  The caller is then
  39    expected to process this character for display or any other
  40    purposes, and call bidi_move_to_visually_next for the next
  41    character.  See the comments in bidi_move_to_visually_next for more
  42    details about its algorithm that finds the next visual-order
  43    character by resolving their levels on the fly.
  44
  45    Two other entry points are bidi_paragraph_init and
  46    bidi_mirror_char.  The first determines the base direction of a
  47    paragraph, while the second returns the mirrored version of its
  48    argument character.
  49
  50    A few auxiliary entry points are used to initialize the bidi
  51    iterator for iterating an object (buffer or string), push and pop
  52    the bidi iterator state, and save and restore the state of the bidi
  53    cache.
  54
  55    If you want to understand the code, you will have to read it
  56    together with the relevant portions of UAX#9.  The comments include
  57    references to UAX#9 rules, for that very reason.
  58
  59    A note about references to UAX#9 rules: if the reference says
  60    something like "X9/Retaining", it means that you need to refer to
  61    rule X9 and to its modifications described in the "Implementation
  62    Notes" section of UAX#9, under "Retaining Format Codes".
  63
  64    Here's the overview of the design of the reordering engine
  65    implemented by this file.
  66
  67    Basic implementation structure
  68    ------------------------------
  69
  70    The sequential processing steps described by UAX#9 are implemented
  71    as recursive levels of processing, all of which examine the next
  72    character in the logical order.  This hierarchy of processing looks
  73    as follows, from the innermost (deepest) to the outermost level,
  74    omitting some subroutines used by each level:
  75
  76      bidi_fetch_char         -- fetch next character
  77      bidi_resolve_explicit   -- resolve explicit levels and directions
  78      bidi_resolve_weak       -- resolve weak types
  79      bidi_resolve_neutral    -- resolve neutral types
  80      bidi_level_of_next_char -- resolve implicit levels
  81
  82    Each level calls the level below it, and works on the result
  83    returned by the lower level, including all of its sub-levels.
  84
  85    Unlike all the levels below it, bidi_level_of_next_char can return
  86    the information about either the next or previous character in the
  87    logical order, depending on the current direction of scanning the
  88    buffer or string.  For the next character, it calls all the levels
  89    below it; for the previous character, it uses the cache, described
  90    below.
  91
  92    Thus, the result of calling bidi_level_of_next_char is the resolved
  93    level of the next or the previous character in the logical order.
  94    Based on this information, the function bidi_move_to_visually_next
  95    finds the next character in the visual order and updates the
  96    direction in which the buffer is scanned, either forward or
  97    backward, to find the next character to be displayed.  (Text is
  98    scanned backwards when it needs to be reversed for display, i.e. if
  99    the visual order is the inverse of the logical order.)  This
 100    implements the last, reordering steps of the UBA, by successively
 101    calling bidi_level_of_next_char until the character of the required
 102    embedding level is found; the scan direction is dynamically updated
 103    as a side effect.  See the commentary before the 'while' loop in
 104    bidi_move_to_visually_next, for the details.
 105
 106    Fetching characters
 107    -------------------
 108
 109    In a nutshell, fetching the next character boils down to calling
 110    STRING_CHAR_AND_LENGTH, passing it the address of a buffer or
 111    string position.  See bidi_fetch_char.  However, if the next
 112    character is "covered" by a display property of some kind,
 113    bidi_fetch_char returns the u+FFFC "object replacement character"
 114    that represents the entire run of text covered by the display
 115    property.  (The ch_len and nchars members of 'struct bidi_it'
 116    reflect the length in bytes and characters of that text.)  This is
 117    so we reorder text on both sides of the display property as
 118    appropriate for an image or embedded string.  Similarly, text
 119    covered by a display spec of the form '(space ...)', is replaced
 120    with the u+2029 paragraph separator character, so such display
 121    specs produce the same effect as a TAB under UBA.  Both these
 122    special characters are not actually displayed -- the display
 123    property is displayed instead -- but just used to compute the
 124    embedding level of the surrounding text so as to produce the
 125    required effect.
 126
 127    Bidi iterator states
 128    --------------------
 129
 130    The UBA is highly context dependent in some of its parts,
 131    i.e. results of processing a character can generally depend on
 132    characters very far away.  The UAX#9 description of the UBA
 133    prescribes a stateful processing of each character, whereby the
 134    results of this processing depend on various state variables, such
 135    as the current embedding level, level stack, and directional
 136    override status.  In addition, the UAX#9 description includes many
 137    passages like this (from rule W2 in this case):
 138
 139      Search backward from each instance of a European number until the
 140      first strong type (R, L, AL, or sos) is found. If an AL is found,
 141      change the type of the European number to Arabic number.
 142
 143    To support this, we use a bidi iterator object, 'struct bidi_it',
 144    which is a sub-structure of 'struct it' used by xdisp.c (see
 145    dispextern.h for the definition of both of these structures).  The
 146    bidi iterator holds the entire state of the iteration required by
 147    the UBA, and is updated as the text is traversed.  In particular,
 148    the embedding level of the current character being resolved is
 149    recorded in the iterator state.  To avoid costly searches backward
 150    in support of rules like W2 above, the necessary character types
 151    are also recorded in the iterator state as they are found during
 152    the forward scan, and then used when such rules need to be applied.
 153    (Forward scans cannot be avoided in this way; they need to be
 154    performed at least once, and the results recorded in the iterator
 155    state, to be reused until the forward scan oversteps the recorded
 156    position.)
 157
 158    In this manner, the iterator state acts as a mini-cache of
 159    contextual information required for resolving the level of the
 160    current character by various UBA rules.
 161
 162    Caching of bidi iterator states
 163    -------------------------------
 164
 165    As described above, the reordering engine uses the information
 166    recorded in the bidi iterator state in order to resolve the
 167    embedding level of the current character.  When the reordering
 168    engine needs to process the next character in the logical order, it
 169    fetches it and applies to it all the UBA levels, updating the
 170    iterator state as it goes.  But when the buffer or string is
 171    scanned backwards, i.e. in the reverse order of buffer/string
 172    positions, the scanned characters were already processed during the
 173    preceding forward scan (see bidi_find_other_level_edge).  To avoid
 174    costly re-processing of characters that were already processed
 175    during the forward scan, the iterator states computed while
 176    scanning forward are cached.
 177
 178    The cache is just a linear array of 'struct bidi_it' objects, which
 179    is dynamically allocated and reallocated as needed, since the size
 180    of the cache depends on the text being processed.  We only need the
 181    cache while processing embedded levels higher than the base
 182    paragraph embedding level, because these higher levels require
 183    changes in scan direction.  Therefore, as soon as we are back to
 184    the base embedding level, we can free the cache; see the calls to
 185    bidi_cache_reset and bidi_cache_shrink, for the conditions to do
 186    this.
 187
 188    The cache maintains the index of the next unused cache slot -- this
 189    is where the next iterator state will be cached.  The function
 190    bidi_cache_iterator_state saves an instance of the state in the
 191    cache and increments the unused slot index.  The companion function
 192    bidi_cache_find looks up a cached state that corresponds to a given
 193    buffer/string position.  All of the cached states must correspond
 194    1:1 to the buffer or string region whose processing they reflect;
 195    bidi.c will abort if it finds cache slots that violate this 1:1
 196    correspondence.
 197
 198    When the parent iterator 'struct it' is pushed (see push_it in
 199    xdisp.c) to pause the current iteration and start iterating over a
 200    different object (e.g., a 'display' string that covers some buffer
 201    text), the bidi iterator cache needs to be "pushed" as well, so
 202    that a new empty cache could be used while iterating over the new
 203    object.  Later, when the new object is exhausted, and xdisp.c calls
 204    pop_it, we need to "pop" the bidi cache as well and return to the
 205    original cache.  See bidi_push_it and bidi_pop_it for how this is
 206    done.
 207
 208    Some functions of the display engine save copies of 'struct it' in
 209    local variables, and restore them later.  For examples, see
 210    pos_visible_p and move_it_in_display_line_to in xdisp.c, and
 211    window_scroll_pixel_based in window.c.  When this happens, we need
 212    to save and restore the bidi cache as well, because conceptually
 213    the cache is part of the 'struct it' state, and needs to be in
 214    perfect sync with the portion of the buffer/string that is being
 215    processed.  This saving and restoring of the cache state is handled
 216    by bidi_shelve_cache and bidi_unshelve_cache, and the helper macros
 217    SAVE_IT and RESTORE_IT defined on xdisp.c.
 218
 219    Note that, because reordering is implemented below the level in
 220    xdisp.c that breaks glyphs into screen lines, we are violating
 221    paragraph 3.4 of UAX#9. which mandates that line breaking shall be
 222    done before reordering each screen line separately.  However,
 223    following UAX#9 to the letter in this matter goes against the basic
 224    design of the Emacs display engine, and so we choose here this
 225    minor deviation from the UBA letter in preference to redesign of
 226    the display engine.  The effect of this is only seen in continued
 227    lines that are broken into screen lines in the middle of a run
 228    whose direction is opposite to the paragraph's base direction.
 229
 230    Important design and implementation note: when the code needs to
 231    scan far ahead, be sure to avoid such scans as much as possible
 232    when the buffer/string doesn't contain any RTL characters.  Users
 233    of left-to-right scripts will never forgive you if you introduce
 234    some slow-down due to bidi in situations that don't involve any
 235    bidirectional text.  See the large comment near the beginning of
 236    bidi_resolve_neutral, for one situation where such shortcut was
 237    necessary.  */
 238
 239 #include <config.h>
 240 #include <stdio.h>
 241
 242 #include "lisp.h"
 243 #include "character.h"
 244 #include "buffer.h"
 245 #include "dispextern.h"
 246 #include "region-cache.h"
 247
 248 static bool bidi_initialized = 0;
 249
 250 static Lisp_Object bidi_type_table, bidi_mirror_table;
 251
 252 #define BIDI_EOB   (-1)
 253
 254 /* Data type for describing the bidirectional character categories.  */
 255 typedef enum {
 256   UNKNOWN_BC,
 257   NEUTRAL,
 258   WEAK,
 259   STRONG,
 260   EXPLICIT_FORMATTING
 261 } bidi_category_t;
 262
 263 /* UAX#9 says to search only for L, AL, or R types of characters, and
 264    ignore RLE, RLO, LRE, and LRO, when determining the base paragraph
 265    level.  Yudit indeed ignores them.  This variable is therefore set
 266    by default to ignore them, but clearing it will take them into
 267    account.  */
 268 extern bool bidi_ignore_explicit_marks_for_paragraph_level EXTERNALLY_VISIBLE;
 269 bool bidi_ignore_explicit_marks_for_paragraph_level = 1;
 270
 271 static Lisp_Object paragraph_start_re, paragraph_separate_re;
 272 static Lisp_Object Qparagraph_start, Qparagraph_separate;
 273
 274 \f
 275 /***********************************************************************
 276                         Utilities
 277  ***********************************************************************/
 278
 279 /* Return the bidi type of a character CH, subject to the current
 280    directional OVERRIDE.  */
 281 static bidi_type_t
 282 bidi_get_type (int ch, bidi_dir_t override)
 283 {
 284   bidi_type_t default_type;
 285
 286   if (ch == BIDI_EOB)
 287     return NEUTRAL_B;
 288   if (ch < 0 || ch > MAX_CHAR)
 289     emacs_abort ();
 290
 291   default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
 292   /* Every valid character code, even those that are unassigned by the
 293      UCD, have some bidi-class property, according to
 294      DerivedBidiClass.txt file.  Therefore, if we ever get UNKNOWN_BT
 295      (= zero) code from CHAR_TABLE_REF, that's a bug.  */
 296   if (default_type == UNKNOWN_BT)
 297     emacs_abort ();
 298
 299   switch (default_type)
 300     {
 301       case WEAK_BN:
 302       case NEUTRAL_B:
 303       case LRE:
 304       case LRO:
 305       case RLE:
 306       case RLO:
 307       case PDF:
 308         return default_type;
 309         /* FIXME: The isolate controls are treated as BN until we add
 310            support for UBA v6.3.  */
 311       case LRI:
 312       case RLI:
 313       case FSI:
 314       case PDI:
 315         return WEAK_BN;
 316       default:
 317         if (override == L2R)
 318           return STRONG_L;
 319         else if (override == R2L)
 320           return STRONG_R;
 321         else
 322           return default_type;
 323     }
 324 }
 325
 326 static void
 327 bidi_check_type (bidi_type_t type)
 328 {
 329   eassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
 330 }
 331
 332 /* Given a bidi TYPE of a character, return its category.  */
 333 static bidi_category_t
 334 bidi_get_category (bidi_type_t type)
 335 {
 336   switch (type)
 337     {
 338       case UNKNOWN_BT:
 339         return UNKNOWN_BC;
 340       case STRONG_L:
 341       case STRONG_R:
 342       case STRONG_AL:
 343         return STRONG;
 344       case WEAK_EN:
 345       case WEAK_ES:
 346       case WEAK_ET:
 347       case WEAK_AN:
 348       case WEAK_CS:
 349       case WEAK_NSM:
 350       case WEAK_BN:
 351         /* FIXME */
 352       case LRI:
 353       case RLI:
 354       case FSI:
 355       case PDI:
 356         return WEAK;
 357       case NEUTRAL_B:
 358       case NEUTRAL_S:
 359       case NEUTRAL_WS:
 360       case NEUTRAL_ON:
 361         return NEUTRAL;
 362       case LRE:
 363       case LRO:
 364       case RLE:
 365       case RLO:
 366       case PDF:
 367 #if 0
 368         /* FIXME: This awaits implementation of isolate support.  */
 369       case LRI:
 370       case RLI:
 371       case FSI:
 372       case PDI:
 373 #endif
 374         return EXPLICIT_FORMATTING;
 375       default:
 376         emacs_abort ();
 377     }
 378 }
 379
 380 /* Return the mirrored character of C, if it has one.  If C has no
 381    mirrored counterpart, return C.
 382    Note: The conditions in UAX#9 clause L4 regarding the surrounding
 383    context must be tested by the caller.  */
 384 int
 385 bidi_mirror_char (int c)
 386 {
 387   Lisp_Object val;
 388
 389   if (c == BIDI_EOB)
 390     return c;
 391   if (c < 0 || c > MAX_CHAR)
 392     emacs_abort ();
 393
 394   val = CHAR_TABLE_REF (bidi_mirror_table, c);
 395   if (INTEGERP (val))
 396     {
 397       int v;
 398
 399       /* When debugging, check before assigning to V, so that the check
 400          isn't broken by undefined behavior due to int overflow.  */
 401       eassert (CHAR_VALID_P (XINT (val)));
 402
 403       v = XINT (val);
 404
 405       /* Minimal test we must do in optimized builds, to prevent weird
 406          crashes further down the road.  */
 407       if (v < 0 || v > MAX_CHAR)
 408         emacs_abort ();
 409
 410       return v;
 411     }
 412
 413   return c;
 414 }
 415
 416 /* Determine the start-of-run (sor) directional type given the two
 417    embedding levels on either side of the run boundary.  Also, update
 418    the saved info about previously seen characters, since that info is
 419    generally valid for a single level run.  */
 420 static void
 421 bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
 422 {
 423   int higher_level = (level_before > level_after ? level_before : level_after);
 424
 425   /* The prev_was_pdf gork is required for when we have several PDFs
 426      in a row.  In that case, we want to compute the sor type for the
 427      next level run only once: when we see the first PDF.  That's
 428      because the sor type depends only on the higher of the two levels
 429      that we find on the two sides of the level boundary (see UAX#9,
 430      clause X10), and so we don't need to know the final embedding
 431      level to which we descend after processing all the PDFs.  */
 432   if (!bidi_it->prev_was_pdf || level_before < level_after)
 433     /* FIXME: should the default sor direction be user selectable?  */
 434     bidi_it->sor = ((higher_level & 1) != 0 ? R2L : L2R);
 435   if (level_before > level_after)
 436     bidi_it->prev_was_pdf = 1;
 437
 438   bidi_it->prev.type = UNKNOWN_BT;
 439   bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1
 440     = bidi_it->last_strong.orig_type = UNKNOWN_BT;
 441   bidi_it->prev_for_neutral.type = (bidi_it->sor == R2L ? STRONG_R : STRONG_L);
 442   bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
 443   bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
 444   bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1
 445     = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
 446   bidi_it->ignore_bn_limit = -1; /* meaning it's unknown */
 447 }
 448
 449 /* Push the current embedding level and override status; reset the
 450    current level to LEVEL and the current override status to OVERRIDE.  */
 451 static void
 452 bidi_push_embedding_level (struct bidi_it *bidi_it,
 453                            int level, bidi_dir_t override)
 454 {
 455   bidi_it->stack_idx++;
 456   eassert (bidi_it->stack_idx < BIDI_MAXLEVEL);
 457   bidi_it->level_stack[bidi_it->stack_idx].level = level;
 458   bidi_it->level_stack[bidi_it->stack_idx].override = override;
 459 }
 460
 461 /* Pop the embedding level and directional override status from the
 462    stack, and return the new level.  */
 463 static int
 464 bidi_pop_embedding_level (struct bidi_it *bidi_it)
 465 {
 466   /* UAX#9 says to ignore invalid PDFs.  */
 467   if (bidi_it->stack_idx > 0)
 468     bidi_it->stack_idx--;
 469   return bidi_it->level_stack[bidi_it->stack_idx].level;
 470 }
 471
 472 /* Record in SAVED_INFO the information about the current character.  */
 473 static void
 474 bidi_remember_char (struct bidi_saved_info *saved_info,
 475                     struct bidi_it *bidi_it)
 476 {
 477   saved_info->charpos = bidi_it->charpos;
 478   saved_info->bytepos = bidi_it->bytepos;
 479   saved_info->type = bidi_it->type;
 480   bidi_check_type (bidi_it->type);
 481   saved_info->type_after_w1 = bidi_it->type_after_w1;
 482   bidi_check_type (bidi_it->type_after_w1);
 483   saved_info->orig_type = bidi_it->orig_type;
 484   bidi_check_type (bidi_it->orig_type);
 485 }
 486
 487 /* Copy the bidi iterator from FROM to TO.  To save cycles, this only
 488    copies the part of the level stack that is actually in use.  */
 489 static void
 490 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
 491 {
 492   /* Copy everything from the start through the active part of
 493      the level stack.  */
 494   memcpy (to, from,
 495           (offsetof (struct bidi_it, level_stack[1])
 496            + from->stack_idx * sizeof from->level_stack[0]));
 497 }
 498
 499 \f
 500 /***********************************************************************
 501                         Caching the bidi iterator states
 502  ***********************************************************************/
 503
 504 #define BIDI_CACHE_CHUNK 200
 505 static struct bidi_it *bidi_cache;
 506 static ptrdiff_t bidi_cache_size = 0;
 507 enum { elsz = sizeof (struct bidi_it) };
 508 static ptrdiff_t bidi_cache_idx;        /* next unused cache slot */
 509 static ptrdiff_t bidi_cache_last_idx;   /* slot of last cache hit */
 510 static ptrdiff_t bidi_cache_start = 0;  /* start of cache for this
 511                                            "stack" level */
 512
 513 /* 5-slot stack for saving the start of the previous level of the
 514    cache.  xdisp.c maintains a 5-slot stack for its iterator state,
 515    and we need the same size of our stack.  */
 516 static ptrdiff_t bidi_cache_start_stack[IT_STACK_SIZE];
 517 static int bidi_cache_sp;
 518
 519 /* Size of header used by bidi_shelve_cache.  */
 520 enum
 521   {
 522     bidi_shelve_header_size
 523       = (sizeof (bidi_cache_idx) + sizeof (bidi_cache_start_stack)
 524          + sizeof (bidi_cache_sp) + sizeof (bidi_cache_start)
 525          + sizeof (bidi_cache_last_idx))
 526   };
 527
 528 /* Reset the cache state to the empty state.  We only reset the part
 529    of the cache relevant to iteration of the current object.  Previous
 530    objects, which are pushed on the display iterator's stack, are left
 531    intact.  This is called when the cached information is no more
 532    useful for the current iteration, e.g. when we were reseated to a
 533    new position on the same object.  */
 534 static void
 535 bidi_cache_reset (void)
 536 {
 537   bidi_cache_idx = bidi_cache_start;
 538   bidi_cache_last_idx = -1;
 539 }
 540
 541 /* Shrink the cache to its minimal size.  Called when we init the bidi
 542    iterator for reordering a buffer or a string that does not come
 543    from display properties, because that means all the previously
 544    cached info is of no further use.  */
 545 static void
 546 bidi_cache_shrink (void)
 547 {
 548   if (bidi_cache_size > BIDI_CACHE_CHUNK)
 549     {
 550       bidi_cache = xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
 551       bidi_cache_size = BIDI_CACHE_CHUNK;
 552     }
 553   bidi_cache_reset ();
 554 }
 555
 556 static void
 557 bidi_cache_fetch_state (ptrdiff_t idx, struct bidi_it *bidi_it)
 558 {
 559   int current_scan_dir = bidi_it->scan_dir;
 560
 561   if (idx < bidi_cache_start || idx >= bidi_cache_idx)
 562     emacs_abort ();
 563
 564   bidi_copy_it (bidi_it, &bidi_cache[idx]);
 565   bidi_it->scan_dir = current_scan_dir;
 566   bidi_cache_last_idx = idx;
 567 }
 568
 569 /* Find a cached state with a given CHARPOS and resolved embedding
 570    level less or equal to LEVEL.  if LEVEL is -1, disregard the
 571    resolved levels in cached states.  DIR, if non-zero, means search
 572    in that direction from the last cache hit.  */
 573 static ptrdiff_t
 574 bidi_cache_search (ptrdiff_t charpos, int level, int dir)
 575 {
 576   ptrdiff_t i, i_start;
 577
 578   if (bidi_cache_idx > bidi_cache_start)
 579     {
 580       if (bidi_cache_last_idx == -1)
 581         bidi_cache_last_idx = bidi_cache_idx - 1;
 582       if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
 583         {
 584           dir = -1;
 585           i_start = bidi_cache_last_idx - 1;
 586         }
 587       else if (charpos > (bidi_cache[bidi_cache_last_idx].charpos
 588                           + bidi_cache[bidi_cache_last_idx].nchars - 1))
 589         {
 590           dir = 1;
 591           i_start = bidi_cache_last_idx + 1;
 592         }
 593       else if (dir)
 594         i_start = bidi_cache_last_idx;
 595       else
 596         {
 597           dir = -1;
 598           i_start = bidi_cache_idx - 1;
 599         }
 600
 601       if (dir < 0)
 602         {
 603           /* Linear search for now; FIXME!  */
 604           for (i = i_start; i >= bidi_cache_start; i--)
 605             if (bidi_cache[i].charpos <= charpos
 606                 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
 607                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 608               return i;
 609         }
 610       else
 611         {
 612           for (i = i_start; i < bidi_cache_idx; i++)
 613             if (bidi_cache[i].charpos <= charpos
 614                 && charpos < bidi_cache[i].charpos + bidi_cache[i].nchars
 615                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 616               return i;
 617         }
 618     }
 619
 620   return -1;
 621 }
 622
 623 /* Find a cached state where the resolved level changes to a value
 624    that is lower than LEVEL, and return its cache slot index.  DIR is
 625    the direction to search, starting with the last used cache slot.
 626    If DIR is zero, we search backwards from the last occupied cache
 627    slot.  BEFORE means return the index of the slot that
 628    is ``before'' the level change in the search direction.  That is,
 629    given the cached levels like this:
 630
 631          1122333442211
 632           AB        C
 633
 634    and assuming we are at the position cached at the slot marked with
 635    C, searching backwards (DIR = -1) for LEVEL = 2 will return the
 636    index of slot B or A, depending whether BEFORE is, respectively,
 637    true or false.  */
 638 static ptrdiff_t
 639 bidi_cache_find_level_change (int level, int dir, bool before)
 640 {
 641   if (bidi_cache_idx)
 642     {
 643       ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
 644       int incr = before ? 1 : 0;
 645
 646       eassert (!dir || bidi_cache_last_idx >= 0);
 647
 648       if (!dir)
 649         dir = -1;
 650       else if (!incr)
 651         i += dir;
 652
 653       if (dir < 0)
 654         {
 655           while (i >= bidi_cache_start + incr)
 656             {
 657               if (bidi_cache[i - incr].resolved_level >= 0
 658                   && bidi_cache[i - incr].resolved_level < level)
 659                 return i;
 660               i--;
 661             }
 662         }
 663       else
 664         {
 665           while (i < bidi_cache_idx - incr)
 666             {
 667               if (bidi_cache[i + incr].resolved_level >= 0
 668                   && bidi_cache[i + incr].resolved_level < level)
 669                 return i;
 670               i++;
 671             }
 672         }
 673     }
 674
 675   return -1;
 676 }
 677
 678 static void
 679 bidi_cache_ensure_space (ptrdiff_t idx)
 680 {
 681   /* Enlarge the cache as needed.  */
 682   if (idx >= bidi_cache_size)
 683     {
 684       /* The bidi cache cannot be larger than the largest Lisp string
 685          or buffer.  */
 686       ptrdiff_t string_or_buffer_bound
 687         = max (BUF_BYTES_MAX, STRING_BYTES_BOUND);
 688
 689       /* Also, it cannot be larger than what C can represent.  */
 690       ptrdiff_t c_bound
 691         = (min (PTRDIFF_MAX, SIZE_MAX) - bidi_shelve_header_size) / elsz;
 692
 693       bidi_cache
 694         = xpalloc (bidi_cache, &bidi_cache_size,
 695                    max (BIDI_CACHE_CHUNK, idx - bidi_cache_size + 1),
 696                    min (string_or_buffer_bound, c_bound), elsz);
 697     }
 698 }
 699
 700 static void
 701 bidi_cache_iterator_state (struct bidi_it *bidi_it, bool resolved)
 702 {
 703   ptrdiff_t idx;
 704
 705   /* We should never cache on backward scans.  */
 706   if (bidi_it->scan_dir == -1)
 707     emacs_abort ();
 708   idx = bidi_cache_search (bidi_it->charpos, -1, 1);
 709
 710   if (idx < 0)
 711     {
 712       idx = bidi_cache_idx;
 713       bidi_cache_ensure_space (idx);
 714       /* Character positions should correspond to cache positions 1:1.
 715          If we are outside the range of cached positions, the cache is
 716          useless and must be reset.  */
 717       if (idx > bidi_cache_start &&
 718           (bidi_it->charpos > (bidi_cache[idx - 1].charpos
 719                                + bidi_cache[idx - 1].nchars)
 720            || bidi_it->charpos < bidi_cache[bidi_cache_start].charpos))
 721         {
 722           bidi_cache_reset ();
 723           idx = bidi_cache_start;
 724         }
 725       if (bidi_it->nchars <= 0)
 726         emacs_abort ();
 727       bidi_copy_it (&bidi_cache[idx], bidi_it);
 728       if (!resolved)
 729         bidi_cache[idx].resolved_level = -1;
 730     }
 731   else
 732     {
 733       /* Copy only the members which could have changed, to avoid
 734          costly copying of the entire struct.  */
 735       bidi_cache[idx].type = bidi_it->type;
 736       bidi_check_type (bidi_it->type);
 737       bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
 738       bidi_check_type (bidi_it->type_after_w1);
 739       if (resolved)
 740         bidi_cache[idx].resolved_level = bidi_it->resolved_level;
 741       else
 742         bidi_cache[idx].resolved_level = -1;
 743       bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
 744       bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
 745       bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
 746       bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
 747       bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
 748       bidi_cache[idx].disp_pos = bidi_it->disp_pos;
 749       bidi_cache[idx].disp_prop = bidi_it->disp_prop;
 750     }
 751
 752   bidi_cache_last_idx = idx;
 753   if (idx >= bidi_cache_idx)
 754     bidi_cache_idx = idx + 1;
 755 }
 756
 757 static bidi_type_t
 758 bidi_cache_find (ptrdiff_t charpos, int level, struct bidi_it *bidi_it)
 759 {
 760   ptrdiff_t i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
 761
 762   if (i >= bidi_cache_start)
 763     {
 764       bidi_dir_t current_scan_dir = bidi_it->scan_dir;
 765
 766       bidi_copy_it (bidi_it, &bidi_cache[i]);
 767       bidi_cache_last_idx = i;
 768       /* Don't let scan direction from the cached state override
 769          the current scan direction.  */
 770       bidi_it->scan_dir = current_scan_dir;
 771       return bidi_it->type;
 772     }
 773
 774   return UNKNOWN_BT;
 775 }
 776
 777 static int
 778 bidi_peek_at_next_level (struct bidi_it *bidi_it)
 779 {
 780   if (bidi_cache_idx == bidi_cache_start || bidi_cache_last_idx == -1)
 781     emacs_abort ();
 782   return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
 783 }
 784
 785 \f
 786 /***********************************************************************
 787              Pushing and popping the bidi iterator state
 788  ***********************************************************************/
 789
 790 /* Push the bidi iterator state in preparation for reordering a
 791    different object, e.g. display string found at certain buffer
 792    position.  Pushing the bidi iterator boils down to saving its
 793    entire state on the cache and starting a new cache "stacked" on top
 794    of the current cache.  */
 795 void
 796 bidi_push_it (struct bidi_it *bidi_it)
 797 {
 798   /* Save the current iterator state in its entirety after the last
 799      used cache slot.  */
 800   bidi_cache_ensure_space (bidi_cache_idx);
 801   bidi_cache[bidi_cache_idx++] = *bidi_it;
 802
 803   /* Push the current cache start onto the stack.  */
 804   eassert (bidi_cache_sp < IT_STACK_SIZE);
 805   bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
 806
 807   /* Start a new level of cache, and make it empty.  */
 808   bidi_cache_start = bidi_cache_idx;
 809   bidi_cache_last_idx = -1;
 810 }
 811
 812 /* Restore the iterator state saved by bidi_push_it and return the
 813    cache to the corresponding state.  */
 814 void
 815 bidi_pop_it (struct bidi_it *bidi_it)
 816 {
 817   if (bidi_cache_start <= 0)
 818     emacs_abort ();
 819
 820   /* Reset the next free cache slot index to what it was before the
 821      call to bidi_push_it.  */
 822   bidi_cache_idx = bidi_cache_start - 1;
 823
 824   /* Restore the bidi iterator state saved in the cache.  */
 825   *bidi_it = bidi_cache[bidi_cache_idx];
 826
 827   /* Pop the previous cache start from the stack.  */
 828   if (bidi_cache_sp <= 0)
 829     emacs_abort ();
 830   bidi_cache_start = bidi_cache_start_stack[--bidi_cache_sp];
 831
 832   /* Invalidate the last-used cache slot data.  */
 833   bidi_cache_last_idx = -1;
 834 }
 835
 836 static ptrdiff_t bidi_cache_total_alloc;
 837
 838 /* Stash away a copy of the cache and its control variables.  */
 839 void *
 840 bidi_shelve_cache (void)
 841 {
 842   unsigned char *databuf;
 843   ptrdiff_t alloc;
 844
 845   /* Empty cache.  */
 846   if (bidi_cache_idx == 0)
 847     return NULL;
 848
 849   alloc = (bidi_shelve_header_size
 850            + bidi_cache_idx * sizeof (struct bidi_it));
 851   databuf = xmalloc (alloc);
 852   bidi_cache_total_alloc += alloc;
 853
 854   memcpy (databuf, &bidi_cache_idx, sizeof (bidi_cache_idx));
 855   memcpy (databuf + sizeof (bidi_cache_idx),
 856           bidi_cache, bidi_cache_idx * sizeof (struct bidi_it));
 857   memcpy (databuf + sizeof (bidi_cache_idx)
 858           + bidi_cache_idx * sizeof (struct bidi_it),
 859           bidi_cache_start_stack, sizeof (bidi_cache_start_stack));
 860   memcpy (databuf + sizeof (bidi_cache_idx)
 861           + bidi_cache_idx * sizeof (struct bidi_it)
 862           + sizeof (bidi_cache_start_stack),
 863           &bidi_cache_sp, sizeof (bidi_cache_sp));
 864   memcpy (databuf + sizeof (bidi_cache_idx)
 865           + bidi_cache_idx * sizeof (struct bidi_it)
 866           + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
 867           &bidi_cache_start, sizeof (bidi_cache_start));
 868   memcpy (databuf + sizeof (bidi_cache_idx)
 869           + bidi_cache_idx * sizeof (struct bidi_it)
 870           + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
 871           + sizeof (bidi_cache_start),
 872           &bidi_cache_last_idx, sizeof (bidi_cache_last_idx));
 873
 874   return databuf;
 875 }
 876
 877 /* Restore the cache state from a copy stashed away by
 878    bidi_shelve_cache, and free the buffer used to stash that copy.
 879    JUST_FREE means free the buffer, but don't restore the
 880    cache; used when the corresponding iterator is discarded instead of
 881    being restored.  */
 882 void
 883 bidi_unshelve_cache (void *databuf, bool just_free)
 884 {
 885   unsigned char *p = databuf;
 886
 887   if (!p)
 888     {
 889       if (!just_free)
 890         {
 891           /* A NULL pointer means an empty cache.  */
 892           bidi_cache_start = 0;
 893           bidi_cache_sp = 0;
 894           bidi_cache_reset ();
 895         }
 896     }
 897   else
 898     {
 899       if (just_free)
 900         {
 901           ptrdiff_t idx;
 902
 903           memcpy (&idx, p, sizeof (bidi_cache_idx));
 904           bidi_cache_total_alloc
 905             -= bidi_shelve_header_size + idx * sizeof (struct bidi_it);
 906         }
 907       else
 908         {
 909           memcpy (&bidi_cache_idx, p, sizeof (bidi_cache_idx));
 910           bidi_cache_ensure_space (bidi_cache_idx);
 911           memcpy (bidi_cache, p + sizeof (bidi_cache_idx),
 912                   bidi_cache_idx * sizeof (struct bidi_it));
 913           memcpy (bidi_cache_start_stack,
 914                   p + sizeof (bidi_cache_idx)
 915                   + bidi_cache_idx * sizeof (struct bidi_it),
 916                   sizeof (bidi_cache_start_stack));
 917           memcpy (&bidi_cache_sp,
 918                   p + sizeof (bidi_cache_idx)
 919                   + bidi_cache_idx * sizeof (struct bidi_it)
 920                   + sizeof (bidi_cache_start_stack),
 921                   sizeof (bidi_cache_sp));
 922           memcpy (&bidi_cache_start,
 923                   p + sizeof (bidi_cache_idx)
 924                   + bidi_cache_idx * sizeof (struct bidi_it)
 925                   + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp),
 926                   sizeof (bidi_cache_start));
 927           memcpy (&bidi_cache_last_idx,
 928                   p + sizeof (bidi_cache_idx)
 929                   + bidi_cache_idx * sizeof (struct bidi_it)
 930                   + sizeof (bidi_cache_start_stack) + sizeof (bidi_cache_sp)
 931                   + sizeof (bidi_cache_start),
 932                   sizeof (bidi_cache_last_idx));
 933           bidi_cache_total_alloc
 934             -= (bidi_shelve_header_size
 935                 + bidi_cache_idx * sizeof (struct bidi_it));
 936         }
 937
 938       xfree (p);
 939     }
 940 }
 941
 942 \f
 943 /***********************************************************************
 944                         Initialization
 945  ***********************************************************************/
 946 static void
 947 bidi_initialize (void)
 948 {
 949   bidi_type_table = uniprop_table (intern ("bidi-class"));
 950   if (NILP (bidi_type_table))
 951     emacs_abort ();
 952   staticpro (&bidi_type_table);
 953
 954   bidi_mirror_table = uniprop_table (intern ("mirroring"));
 955   if (NILP (bidi_mirror_table))
 956     emacs_abort ();
 957   staticpro (&bidi_mirror_table);
 958
 959   Qparagraph_start = intern ("paragraph-start");
 960   staticpro (&Qparagraph_start);
 961   paragraph_start_re = Fsymbol_value (Qparagraph_start);
 962   if (!STRINGP (paragraph_start_re))
 963     paragraph_start_re = build_string ("\f\\|[ \t]*$");
 964   staticpro (&paragraph_start_re);
 965   Qparagraph_separate = intern ("paragraph-separate");
 966   staticpro (&Qparagraph_separate);
 967   paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
 968   if (!STRINGP (paragraph_separate_re))
 969     paragraph_separate_re = build_string ("[ \t\f]*$");
 970   staticpro (&paragraph_separate_re);
 971
 972   bidi_cache_sp = 0;
 973   bidi_cache_total_alloc = 0;
 974
 975   bidi_initialized = 1;
 976 }
 977
 978 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
 979    end.  */
 980 static void
 981 bidi_set_paragraph_end (struct bidi_it *bidi_it)
 982 {
 983   bidi_it->invalid_levels = 0;
 984   bidi_it->invalid_rl_levels = -1;
 985   bidi_it->stack_idx = 0;
 986   bidi_it->resolved_level = bidi_it->level_stack[0].level;
 987 }
 988
 989 /* Initialize the bidi iterator from buffer/string position CHARPOS.  */
 990 void
 991 bidi_init_it (ptrdiff_t charpos, ptrdiff_t bytepos, bool frame_window_p,
 992               struct bidi_it *bidi_it)
 993 {
 994   if (! bidi_initialized)
 995     bidi_initialize ();
 996   if (charpos >= 0)
 997     bidi_it->charpos = charpos;
 998   if (bytepos >= 0)
 999     bidi_it->bytepos = bytepos;
1000   bidi_it->frame_window_p = frame_window_p;
1001   bidi_it->nchars = -1; /* to be computed in bidi_resolve_explicit_1 */
1002   bidi_it->first_elt = 1;
1003   bidi_set_paragraph_end (bidi_it);
1004   bidi_it->new_paragraph = 1;
1005   bidi_it->separator_limit = -1;
1006   bidi_it->type = NEUTRAL_B;
1007   bidi_it->type_after_w1 = NEUTRAL_B;
1008   bidi_it->orig_type = NEUTRAL_B;
1009   bidi_it->prev_was_pdf = 0;
1010   bidi_it->prev.type = bidi_it->prev.type_after_w1
1011     = bidi_it->prev.orig_type = UNKNOWN_BT;
1012   bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1
1013     = bidi_it->last_strong.orig_type = UNKNOWN_BT;
1014   bidi_it->next_for_neutral.charpos = -1;
1015   bidi_it->next_for_neutral.type
1016     = bidi_it->next_for_neutral.type_after_w1
1017     = bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
1018   bidi_it->prev_for_neutral.charpos = -1;
1019   bidi_it->prev_for_neutral.type
1020     = bidi_it->prev_for_neutral.type_after_w1
1021     = bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
1022   bidi_it->sor = L2R;    /* FIXME: should it be user-selectable? */
1023   bidi_it->disp_pos = -1;       /* invalid/unknown */
1024   bidi_it->disp_prop = 0;
1025   /* We can only shrink the cache if we are at the bottom level of its
1026      "stack".  */
1027   if (bidi_cache_start == 0)
1028     bidi_cache_shrink ();
1029   else
1030     bidi_cache_reset ();
1031 }
1032
1033 /* Perform initializations for reordering a new line of bidi text.  */
1034 static void
1035 bidi_line_init (struct bidi_it *bidi_it)
1036 {
1037   bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
1038   bidi_it->resolved_level = bidi_it->level_stack[0].level;
1039   bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
1040   bidi_it->invalid_levels = 0;
1041   bidi_it->invalid_rl_levels = -1;
1042   /* Setting this to zero will force its recomputation the first time
1043      we need it for W5.  */
1044   bidi_it->next_en_pos = 0;
1045   bidi_it->next_en_type = UNKNOWN_BT;
1046   bidi_it->next_for_ws.type = UNKNOWN_BT;
1047   bidi_set_sor_type (bidi_it,
1048                      (bidi_it->paragraph_dir == R2L ? 1 : 0),
1049                      bidi_it->level_stack[0].level); /* X10 */
1050
1051   bidi_cache_reset ();
1052 }
1053
1054 \f
1055 /***********************************************************************
1056                         Fetching characters
1057  ***********************************************************************/
1058
1059 /* Count bytes in string S between BEG/BEGBYTE and END.  BEG and END
1060    are zero-based character positions in S, BEGBYTE is byte position
1061    corresponding to BEG.  UNIBYTE means S is a unibyte string.  */
1062 static ptrdiff_t
1063 bidi_count_bytes (const unsigned char *s, ptrdiff_t beg,
1064                   ptrdiff_t begbyte, ptrdiff_t end, bool unibyte)
1065 {
1066   ptrdiff_t pos = beg;
1067   const unsigned char *p = s + begbyte, *start = p;
1068
1069   if (unibyte)
1070     p = s + end;
1071   else
1072     {
1073       if (!CHAR_HEAD_P (*p))
1074         emacs_abort ();
1075
1076       while (pos < end)
1077         {
1078           p += BYTES_BY_CHAR_HEAD (*p);
1079           pos++;
1080         }
1081     }
1082
1083   return p - start;
1084 }
1085
1086 /* Fetch and return the character at byte position BYTEPOS.  If S is
1087    non-NULL, fetch the character from string S; otherwise fetch the
1088    character from the current buffer.  UNIBYTE means S is a
1089    unibyte string.  */
1090 static int
1091 bidi_char_at_pos (ptrdiff_t bytepos, const unsigned char *s, bool unibyte)
1092 {
1093   if (s)
1094     {
1095       s += bytepos;
1096       if (unibyte)
1097         return *s;
1098     }
1099   else
1100     s = BYTE_POS_ADDR (bytepos);
1101   return STRING_CHAR (s);
1102 }
1103
1104 /* Fetch and return the character at CHARPOS/BYTEPOS.  If that
1105    character is covered by a display string, treat the entire run of
1106    covered characters as a single character, either u+2029 or u+FFFC,
1107    and return their combined length in CH_LEN and NCHARS.  DISP_POS
1108    specifies the character position of the next display string, or -1
1109    if not yet computed.  When the next character is at or beyond that
1110    position, the function updates DISP_POS with the position of the
1111    next display string.  *DISP_PROP non-zero means that there's really
1112    a display string at DISP_POS, as opposed to when we searched till
1113    DISP_POS without finding one.  If *DISP_PROP is 2, it means the
1114    display spec is of the form `(space ...)', which is replaced with
1115    u+2029 to handle it as a paragraph separator.  STRING->s is the C
1116    string to iterate, or NULL if iterating over a buffer or a Lisp
1117    string; in the latter case, STRING->lstring is the Lisp string.  */
1118 static int
1119 bidi_fetch_char (ptrdiff_t charpos, ptrdiff_t bytepos, ptrdiff_t *disp_pos,
1120                  int *disp_prop, struct bidi_string_data *string,
1121                  struct window *w,
1122                  bool frame_window_p, ptrdiff_t *ch_len, ptrdiff_t *nchars)
1123 {
1124   int ch;
1125   ptrdiff_t endpos
1126     = (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
1127   struct text_pos pos;
1128   int len;
1129
1130   /* If we got past the last known position of display string, compute
1131      the position of the next one.  That position could be at CHARPOS.  */
1132   if (charpos < endpos && charpos > *disp_pos)
1133     {
1134       SET_TEXT_POS (pos, charpos, bytepos);
1135       *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1136                                               disp_prop);
1137     }
1138
1139   /* Fetch the character at BYTEPOS.  */
1140   if (charpos >= endpos)
1141     {
1142       ch = BIDI_EOB;
1143       *ch_len = 1;
1144       *nchars = 1;
1145       *disp_pos = endpos;
1146       *disp_prop = 0;
1147     }
1148   else if (charpos >= *disp_pos && *disp_prop)
1149     {
1150       ptrdiff_t disp_end_pos;
1151
1152       /* We don't expect to find ourselves in the middle of a display
1153          property.  Hopefully, it will never be needed.  */
1154       if (charpos > *disp_pos)
1155         emacs_abort ();
1156       /* Text covered by `display' properties and overlays with
1157          display properties or display strings is handled as a single
1158          character that represents the entire run of characters
1159          covered by the display property.  */
1160       if (*disp_prop == 2)
1161         {
1162           /* `(space ...)' display specs are handled as paragraph
1163              separators for the purposes of the reordering; see UAX#9
1164              section 3 and clause HL1 in section 4.3 there.  */
1165           ch = 0x2029;
1166         }
1167       else
1168         {
1169           /* All other display specs are handled as the Unicode Object
1170              Replacement Character.  */
1171           ch = 0xFFFC;
1172         }
1173       disp_end_pos = compute_display_string_end (*disp_pos, string);
1174       if (disp_end_pos < 0)
1175         {
1176           /* Somebody removed the display string from the buffer
1177              behind our back.  Recover by processing this buffer
1178              position as if no display property were present there to
1179              begin with.  */
1180           *disp_prop = 0;
1181           goto normal_char;
1182         }
1183       *nchars = disp_end_pos - *disp_pos;
1184       if (*nchars <= 0)
1185         emacs_abort ();
1186       if (string->s)
1187         *ch_len = bidi_count_bytes (string->s, *disp_pos, bytepos,
1188                                     disp_end_pos, string->unibyte);
1189       else if (STRINGP (string->lstring))
1190         *ch_len = bidi_count_bytes (SDATA (string->lstring), *disp_pos,
1191                                     bytepos, disp_end_pos, string->unibyte);
1192       else
1193         *ch_len = CHAR_TO_BYTE (disp_end_pos) - bytepos;
1194     }
1195   else
1196     {
1197     normal_char:
1198       if (string->s)
1199         {
1200
1201           if (!string->unibyte)
1202             {
1203               ch = STRING_CHAR_AND_LENGTH (string->s + bytepos, len);
1204               *ch_len = len;
1205             }
1206           else
1207             {
1208               ch = UNIBYTE_TO_CHAR (string->s[bytepos]);
1209               *ch_len = 1;
1210             }
1211         }
1212       else if (STRINGP (string->lstring))
1213         {
1214           if (!string->unibyte)
1215             {
1216               ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
1217                                            len);
1218               *ch_len = len;
1219             }
1220           else
1221             {
1222               ch = UNIBYTE_TO_CHAR (SREF (string->lstring, bytepos));
1223               *ch_len = 1;
1224             }
1225         }
1226       else
1227         {
1228           ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (bytepos), len);
1229           *ch_len = len;
1230         }
1231       *nchars = 1;
1232     }
1233
1234   /* If we just entered a run of characters covered by a display
1235      string, compute the position of the next display string.  */
1236   if (charpos + *nchars <= endpos && charpos + *nchars > *disp_pos
1237       && *disp_prop)
1238     {
1239       SET_TEXT_POS (pos, charpos + *nchars, bytepos + *ch_len);
1240       *disp_pos = compute_display_string_pos (&pos, string, w, frame_window_p,
1241                                               disp_prop);
1242     }
1243
1244   return ch;
1245 }
1246
1247 \f
1248 /***********************************************************************
1249                         Determining paragraph direction
1250  ***********************************************************************/
1251
1252 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
1253    Value is the non-negative length of the paragraph separator
1254    following the buffer position, -1 if position is at the beginning
1255    of a new paragraph, or -2 if position is neither at beginning nor
1256    at end of a paragraph.  */
1257 static ptrdiff_t
1258 bidi_at_paragraph_end (ptrdiff_t charpos, ptrdiff_t bytepos)
1259 {
1260   Lisp_Object sep_re;
1261   Lisp_Object start_re;
1262   ptrdiff_t val;
1263
1264   sep_re = paragraph_separate_re;
1265   start_re = paragraph_start_re;
1266
1267   val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
1268   if (val < 0)
1269     {
1270       if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
1271         val = -1;
1272       else
1273         val = -2;
1274     }
1275
1276   return val;
1277 }
1278
1279 /* If the user has requested the long scans caching, make sure that
1280    BIDI cache is enabled.  Otherwise, make sure it's disabled.  */
1281
1282 static struct region_cache *
1283 bidi_paragraph_cache_on_off (void)
1284 {
1285   struct buffer *cache_buffer = current_buffer;
1286   bool indirect_p = false;
1287
1288   /* For indirect buffers, make sure to use the cache of their base
1289      buffer.  */
1290   if (cache_buffer->base_buffer)
1291     {
1292       cache_buffer = cache_buffer->base_buffer;
1293       indirect_p = true;
1294     }
1295
1296   /* Don't turn on or off the cache in the base buffer, if the value
1297      of cache-long-scans of the base buffer is inconsistent with that.
1298      This is because doing so will just make the cache pure overhead,
1299      since if we turn it on via indirect buffer, it will be
1300      immediately turned off by its base buffer.  */
1301   if (NILP (BVAR (current_buffer, cache_long_scans)))
1302     {
1303       if (!indirect_p
1304           || NILP (BVAR (cache_buffer, cache_long_scans)))
1305         {
1306           if (cache_buffer->bidi_paragraph_cache)
1307             {
1308               free_region_cache (cache_buffer->bidi_paragraph_cache);
1309               cache_buffer->bidi_paragraph_cache = 0;
1310             }
1311         }
1312       return NULL;
1313     }
1314   else
1315     {
1316       if (!indirect_p
1317           || !NILP (BVAR (cache_buffer, cache_long_scans)))
1318         {
1319           if (!cache_buffer->bidi_paragraph_cache)
1320             cache_buffer->bidi_paragraph_cache = new_region_cache ();
1321         }
1322       return cache_buffer->bidi_paragraph_cache;
1323     }
1324 }
1325
1326 /* On my 2005-vintage machine, searching back for paragraph start
1327    takes ~1 ms per line.  And bidi_paragraph_init is called 4 times
1328    when user types C-p.  The number below limits each call to
1329    bidi_paragraph_init to about 10 ms.  */
1330 #define MAX_PARAGRAPH_SEARCH 7500
1331
1332 /* Find the beginning of this paragraph by looking back in the buffer.
1333    Value is the byte position of the paragraph's beginning, or
1334    BEGV_BYTE if paragraph_start_re is still not found after looking
1335    back MAX_PARAGRAPH_SEARCH lines in the buffer.  */
1336 static ptrdiff_t
1337 bidi_find_paragraph_start (ptrdiff_t pos, ptrdiff_t pos_byte)
1338 {
1339   Lisp_Object re = paragraph_start_re;
1340   ptrdiff_t limit = ZV, limit_byte = ZV_BYTE;
1341   struct region_cache *bpc = bidi_paragraph_cache_on_off ();
1342   ptrdiff_t n = 0, oldpos = pos, next;
1343   struct buffer *cache_buffer = current_buffer;
1344
1345   if (cache_buffer->base_buffer)
1346     cache_buffer = cache_buffer->base_buffer;
1347
1348   while (pos_byte > BEGV_BYTE
1349          && n++ < MAX_PARAGRAPH_SEARCH
1350          && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
1351     {
1352       /* FIXME: What if the paragraph beginning is covered by a
1353          display string?  And what if a display string covering some
1354          of the text over which we scan back includes
1355          paragraph_start_re?  */
1356       DEC_BOTH (pos, pos_byte);
1357       if (bpc && region_cache_backward (cache_buffer, bpc, pos, &next))
1358         {
1359           pos = next, pos_byte = CHAR_TO_BYTE (pos);
1360           break;
1361         }
1362       else
1363         pos = find_newline_no_quit (pos, pos_byte, -1, &pos_byte);
1364     }
1365   if (n >= MAX_PARAGRAPH_SEARCH)
1366     pos = BEGV, pos_byte = BEGV_BYTE;
1367   if (bpc)
1368     know_region_cache (cache_buffer, bpc, pos, oldpos);
1369   /* Positions returned by the region cache are not limited to
1370      BEGV..ZV range, so we limit them here.  */
1371   pos_byte = clip_to_bounds (BEGV_BYTE, pos_byte, ZV_BYTE);
1372   return pos_byte;
1373 }
1374
1375 /* On a 3.4 GHz machine, searching forward for a strong directional
1376    character in a long paragraph full of weaks or neutrals takes about
1377    1 ms for each 20K characters.  The number below limits each call to
1378    bidi_paragraph_init to less than 10 ms even on slow machines.  */
1379 #define MAX_STRONG_CHAR_SEARCH 100000
1380
1381 /* Determine the base direction, a.k.a. base embedding level, of the
1382    paragraph we are about to iterate through.  If DIR is either L2R or
1383    R2L, just use that.  Otherwise, determine the paragraph direction
1384    from the first strong directional character of the paragraph.
1385
1386    NO_DEFAULT_P means don't default to L2R if the paragraph
1387    has no strong directional characters and both DIR and
1388    bidi_it->paragraph_dir are NEUTRAL_DIR.  In that case, search back
1389    in the buffer until a paragraph is found with a strong character,
1390    or until hitting BEGV.  In the latter case, fall back to L2R.  This
1391    flag is used in current-bidi-paragraph-direction.
1392
1393    Note that this function gives the paragraph separator the same
1394    direction as the preceding paragraph, even though Emacs generally
1395    views the separator as not belonging to any paragraph.  */
1396 void
1397 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, bool no_default_p)
1398 {
1399   ptrdiff_t bytepos = bidi_it->bytepos;
1400   bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1401   ptrdiff_t pstartbyte;
1402   /* Note that begbyte is a byte position, while end is a character
1403      position.  Yes, this is ugly, but we are trying to avoid costly
1404      calls to BYTE_TO_CHAR and its ilk.  */
1405   ptrdiff_t begbyte = string_p ? 0 : BEGV_BYTE;
1406   ptrdiff_t end = string_p ? bidi_it->string.schars : ZV;
1407
1408   /* Special case for an empty buffer. */
1409   if (bytepos == begbyte && bidi_it->charpos == end)
1410     dir = L2R;
1411   /* We should never be called at EOB or before BEGV.  */
1412   else if (bidi_it->charpos >= end || bytepos < begbyte)
1413     emacs_abort ();
1414
1415   if (dir == L2R)
1416     {
1417       bidi_it->paragraph_dir = L2R;
1418       bidi_it->new_paragraph = 0;
1419     }
1420   else if (dir == R2L)
1421     {
1422       bidi_it->paragraph_dir = R2L;
1423       bidi_it->new_paragraph = 0;
1424     }
1425   else if (dir == NEUTRAL_DIR)  /* P2 */
1426     {
1427       int ch;
1428       ptrdiff_t ch_len, nchars;
1429       ptrdiff_t pos, disp_pos = -1;
1430       int disp_prop = 0;
1431       bidi_type_t type;
1432       const unsigned char *s;
1433
1434       if (!bidi_initialized)
1435         bidi_initialize ();
1436
1437       /* If we are inside a paragraph separator, we are just waiting
1438          for the separator to be exhausted; use the previous paragraph
1439          direction.  But don't do that if we have been just reseated,
1440          because we need to reinitialize below in that case.  */
1441       if (!bidi_it->first_elt
1442           && bidi_it->charpos < bidi_it->separator_limit)
1443         return;
1444
1445       /* If we are on a newline, get past it to where the next
1446          paragraph might start.  But don't do that at BEGV since then
1447          we are potentially in a new paragraph that doesn't yet
1448          exist.  */
1449       pos = bidi_it->charpos;
1450       s = (STRINGP (bidi_it->string.lstring)
1451            ? SDATA (bidi_it->string.lstring)
1452            : bidi_it->string.s);
1453       if (bytepos > begbyte
1454           && bidi_char_at_pos (bytepos, s, bidi_it->string.unibyte) == '\n')
1455         {
1456           bytepos++;
1457           pos++;
1458         }
1459
1460       /* We are either at the beginning of a paragraph or in the
1461          middle of it.  Find where this paragraph starts.  */
1462       if (string_p)
1463         {
1464           /* We don't support changes of paragraph direction inside a
1465              string.  It is treated as a single paragraph.  */
1466           pstartbyte = 0;
1467         }
1468       else
1469         pstartbyte = bidi_find_paragraph_start (pos, bytepos);
1470       bidi_it->separator_limit = -1;
1471       bidi_it->new_paragraph = 0;
1472
1473       /* The following loop is run more than once only if NO_DEFAULT_P,
1474          and only if we are iterating on a buffer.  */
1475       do {
1476         ptrdiff_t pos1;
1477
1478         bytepos = pstartbyte;
1479         if (!string_p)
1480           pos = BYTE_TO_CHAR (bytepos);
1481         ch = bidi_fetch_char (pos, bytepos, &disp_pos, &disp_prop,
1482                               &bidi_it->string, bidi_it->w,
1483                               bidi_it->frame_window_p, &ch_len, &nchars);
1484         type = bidi_get_type (ch, NEUTRAL_DIR);
1485
1486         pos1 = pos;
1487         for (pos += nchars, bytepos += ch_len;
1488              ((bidi_get_category (type) != STRONG)
1489               || (bidi_ignore_explicit_marks_for_paragraph_level
1490                   && (type == RLE || type == RLO
1491                       || type == LRE || type == LRO)))
1492                /* Stop when searched too far into an abnormally large
1493                   paragraph full of weak or neutral characters.  */
1494                && pos - pos1 < MAX_STRONG_CHAR_SEARCH;
1495              type = bidi_get_type (ch, NEUTRAL_DIR))
1496           {
1497             if (pos >= end)
1498               {
1499                 /* Pretend there's a paragraph separator at end of
1500                    buffer/string.  */
1501                 type = NEUTRAL_B;
1502                 break;
1503               }
1504             if (!string_p
1505                 && type == NEUTRAL_B
1506                 && bidi_at_paragraph_end (pos, bytepos) >= -1)
1507               break;
1508             /* Fetch next character and advance to get past it.  */
1509             ch = bidi_fetch_char (pos, bytepos, &disp_pos,
1510                                   &disp_prop, &bidi_it->string, bidi_it->w,
1511                                   bidi_it->frame_window_p, &ch_len, &nchars);
1512             pos += nchars;
1513             bytepos += ch_len;
1514           }
1515         if ((type == STRONG_R || type == STRONG_AL) /* P3 */
1516             || (!bidi_ignore_explicit_marks_for_paragraph_level
1517                 && (type == RLO || type == RLE)))
1518           bidi_it->paragraph_dir = R2L;
1519         else if (type == STRONG_L
1520                  || (!bidi_ignore_explicit_marks_for_paragraph_level
1521                      && (type == LRO || type == LRE)))
1522           bidi_it->paragraph_dir = L2R;
1523         if (!string_p
1524             && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR)
1525           {
1526             /* If this paragraph is at BEGV, default to L2R.  */
1527             if (pstartbyte == BEGV_BYTE)
1528               bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
1529             else
1530               {
1531                 ptrdiff_t prevpbyte = pstartbyte;
1532                 ptrdiff_t p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
1533
1534                 /* Find the beginning of the previous paragraph, if any.  */
1535                 while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
1536                   {
1537                     /* FXIME: What if p is covered by a display
1538                        string?  See also a FIXME inside
1539                        bidi_find_paragraph_start.  */
1540                     DEC_BOTH (p, pbyte);
1541                     prevpbyte = bidi_find_paragraph_start (p, pbyte);
1542                   }
1543                 pstartbyte = prevpbyte;
1544               }
1545           }
1546       } while (!string_p
1547                && no_default_p && bidi_it->paragraph_dir == NEUTRAL_DIR);
1548     }
1549   else
1550     emacs_abort ();
1551
1552   /* Contrary to UAX#9 clause P3, we only default the paragraph
1553      direction to L2R if we have no previous usable paragraph
1554      direction.  This is allowed by the HL1 clause.  */
1555   if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
1556     bidi_it->paragraph_dir = L2R; /* P3 and HL1 ``higher-level protocols'' */
1557   if (bidi_it->paragraph_dir == R2L)
1558     bidi_it->level_stack[0].level = 1;
1559   else
1560     bidi_it->level_stack[0].level = 0;
1561
1562   bidi_line_init (bidi_it);
1563 }
1564
1565 \f
1566 /***********************************************************************
1567                  Resolving explicit and implicit levels.
1568   The rest of this file constitutes the core of the UBA implementation.
1569  ***********************************************************************/
1570
1571 static bool
1572 bidi_explicit_dir_char (int ch)
1573 {
1574   bidi_type_t ch_type;
1575
1576   if (!bidi_initialized)
1577     emacs_abort ();
1578   ch_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
1579   return (ch_type == LRE || ch_type == LRO
1580           || ch_type == RLE || ch_type == RLO
1581           || ch_type == PDF);
1582 }
1583
1584 /* A helper function for bidi_resolve_explicit.  It advances to the
1585    next character in logical order and determines the new embedding
1586    level and directional override, but does not take into account
1587    empty embeddings.  */
1588 static int
1589 bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1590 {
1591   int curchar;
1592   bidi_type_t type;
1593   int current_level;
1594   int new_level;
1595   bidi_dir_t override;
1596   bool string_p = bidi_it->string.s || STRINGP (bidi_it->string.lstring);
1597
1598   /* If reseat()'ed, don't advance, so as to start iteration from the
1599      position where we were reseated.  bidi_it->bytepos can be less
1600      than BEGV_BYTE after reseat to BEGV.  */
1601   if (bidi_it->bytepos < (string_p ? 0 : BEGV_BYTE)
1602       || bidi_it->first_elt)
1603     {
1604       bidi_it->first_elt = 0;
1605       if (string_p)
1606         {
1607           const unsigned char *p
1608             = (STRINGP (bidi_it->string.lstring)
1609                ? SDATA (bidi_it->string.lstring)
1610                : bidi_it->string.s);
1611
1612           if (bidi_it->charpos < 0)
1613             bidi_it->charpos = bidi_it->bytepos = 0;
1614           eassert (bidi_it->bytepos == bidi_count_bytes (p, 0, 0,
1615                                                          bidi_it->charpos,
1616                                                          bidi_it->string.unibyte));
1617         }
1618       else
1619         {
1620           if (bidi_it->charpos < BEGV)
1621             {
1622               bidi_it->charpos = BEGV;
1623               bidi_it->bytepos = BEGV_BYTE;
1624             }
1625           eassert (bidi_it->bytepos == CHAR_TO_BYTE (bidi_it->charpos));
1626         }
1627     }
1628   /* Don't move at end of buffer/string.  */
1629   else if (bidi_it->charpos < (string_p ? bidi_it->string.schars : ZV))
1630     {
1631       /* Advance to the next character, skipping characters covered by
1632          display strings (nchars > 1).  */
1633       if (bidi_it->nchars <= 0)
1634         emacs_abort ();
1635       bidi_it->charpos += bidi_it->nchars;
1636       if (bidi_it->ch_len == 0)
1637         emacs_abort ();
1638       bidi_it->bytepos += bidi_it->ch_len;
1639     }
1640
1641   current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1642   override = bidi_it->level_stack[bidi_it->stack_idx].override;
1643   new_level = current_level;
1644
1645   if (bidi_it->charpos >= (string_p ? bidi_it->string.schars : ZV))
1646     {
1647       curchar = BIDI_EOB;
1648       bidi_it->ch_len = 1;
1649       bidi_it->nchars = 1;
1650       bidi_it->disp_pos = (string_p ? bidi_it->string.schars : ZV);
1651       bidi_it->disp_prop = 0;
1652     }
1653   else
1654     {
1655       /* Fetch the character at BYTEPOS.  If it is covered by a
1656          display string, treat the entire run of covered characters as
1657          a single character u+FFFC.  */
1658       curchar = bidi_fetch_char (bidi_it->charpos, bidi_it->bytepos,
1659                                  &bidi_it->disp_pos, &bidi_it->disp_prop,
1660                                  &bidi_it->string, bidi_it->w,
1661                                  bidi_it->frame_window_p,
1662                                  &bidi_it->ch_len, &bidi_it->nchars);
1663     }
1664   bidi_it->ch = curchar;
1665
1666   /* Don't apply directional override here, as all the types we handle
1667      below will not be affected by the override anyway, and we need
1668      the original type unaltered.  The override will be applied in
1669      bidi_resolve_weak.  */
1670   type = bidi_get_type (curchar, NEUTRAL_DIR);
1671   bidi_it->orig_type = type;
1672   bidi_check_type (bidi_it->orig_type);
1673
1674   if (type != PDF)
1675     bidi_it->prev_was_pdf = 0;
1676
1677   bidi_it->type_after_w1 = UNKNOWN_BT;
1678
1679   switch (type)
1680     {
1681       case RLE: /* X2 */
1682       case RLO: /* X4 */
1683         bidi_it->type_after_w1 = type;
1684         bidi_check_type (bidi_it->type_after_w1);
1685         type = WEAK_BN; /* X9/Retaining */
1686         if (bidi_it->ignore_bn_limit <= -1)
1687           {
1688             if (current_level <= BIDI_MAXLEVEL - 4)
1689               {
1690                 /* Compute the least odd embedding level greater than
1691                    the current level.  */
1692                 new_level = ((current_level + 1) & ~1) + 1;
1693                 if (bidi_it->type_after_w1 == RLE)
1694                   override = NEUTRAL_DIR;
1695                 else
1696                   override = R2L;
1697                 if (current_level == BIDI_MAXLEVEL - 4)
1698                   bidi_it->invalid_rl_levels = 0;
1699                 bidi_push_embedding_level (bidi_it, new_level, override);
1700               }
1701             else
1702               {
1703                 bidi_it->invalid_levels++;
1704                 /* See the commentary about invalid_rl_levels below.  */
1705                 if (bidi_it->invalid_rl_levels < 0)
1706                   bidi_it->invalid_rl_levels = 0;
1707                 bidi_it->invalid_rl_levels++;
1708               }
1709           }
1710         else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1711                  || (bidi_it->next_en_pos > bidi_it->charpos
1712                      && bidi_it->next_en_type == WEAK_EN))
1713           type = WEAK_EN;
1714         break;
1715       case LRE: /* X3 */
1716       case LRO: /* X5 */
1717         bidi_it->type_after_w1 = type;
1718         bidi_check_type (bidi_it->type_after_w1);
1719         type = WEAK_BN; /* X9/Retaining */
1720         if (bidi_it->ignore_bn_limit <= -1)
1721           {
1722             if (current_level <= BIDI_MAXLEVEL - 5)
1723               {
1724                 /* Compute the least even embedding level greater than
1725                    the current level.  */
1726                 new_level = ((current_level + 2) & ~1);
1727                 if (bidi_it->type_after_w1 == LRE)
1728                   override = NEUTRAL_DIR;
1729                 else
1730                   override = L2R;
1731                 bidi_push_embedding_level (bidi_it, new_level, override);
1732               }
1733             else
1734               {
1735                 bidi_it->invalid_levels++;
1736                 /* invalid_rl_levels counts invalid levels encountered
1737                    while the embedding level was already too high for
1738                    LRE/LRO, but not for RLE/RLO.  That is because
1739                    there may be exactly one PDF which we should not
1740                    ignore even though invalid_levels is non-zero.
1741                    invalid_rl_levels helps to know what PDF is
1742                    that.  */
1743                 if (bidi_it->invalid_rl_levels >= 0)
1744                   bidi_it->invalid_rl_levels++;
1745               }
1746           }
1747         else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1748                  || (bidi_it->next_en_pos > bidi_it->charpos
1749                      && bidi_it->next_en_type == WEAK_EN))
1750           type = WEAK_EN;
1751         break;
1752       case PDF: /* X7 */
1753         bidi_it->type_after_w1 = type;
1754         bidi_check_type (bidi_it->type_after_w1);
1755         type = WEAK_BN; /* X9/Retaining */
1756         if (bidi_it->ignore_bn_limit <= -1)
1757           {
1758             if (!bidi_it->invalid_rl_levels)
1759               {
1760                 new_level = bidi_pop_embedding_level (bidi_it);
1761                 bidi_it->invalid_rl_levels = -1;
1762                 if (bidi_it->invalid_levels)
1763                   bidi_it->invalid_levels--;
1764                 /* else nothing: UAX#9 says to ignore invalid PDFs */
1765               }
1766             if (!bidi_it->invalid_levels)
1767               new_level = bidi_pop_embedding_level (bidi_it);
1768             else
1769               {
1770                 bidi_it->invalid_levels--;
1771                 bidi_it->invalid_rl_levels--;
1772               }
1773           }
1774         else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1775                  || (bidi_it->next_en_pos > bidi_it->charpos
1776                      && bidi_it->next_en_type == WEAK_EN))
1777           type = WEAK_EN;
1778         break;
1779       default:
1780         /* Nothing.  */
1781         break;
1782     }
1783
1784   bidi_it->type = type;
1785   bidi_check_type (bidi_it->type);
1786
1787   return new_level;
1788 }
1789
1790 /* Given an iterator state in BIDI_IT, advance one character position
1791    in the buffer/string to the next character (in the logical order),
1792    resolve any explicit embeddings and directional overrides, and
1793    return the embedding level of the character after resolving
1794    explicit directives and ignoring empty embeddings.  */
1795 static int
1796 bidi_resolve_explicit (struct bidi_it *bidi_it)
1797 {
1798   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1799   int new_level  = bidi_resolve_explicit_1 (bidi_it);
1800   ptrdiff_t eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
1801   const unsigned char *s
1802     = (STRINGP (bidi_it->string.lstring)
1803        ? SDATA (bidi_it->string.lstring)
1804        : bidi_it->string.s);
1805
1806   if (prev_level < new_level
1807       && bidi_it->type == WEAK_BN
1808       && bidi_it->ignore_bn_limit == -1 /* only if not already known */
1809       && bidi_it->charpos < eob         /* not already at EOB */
1810       && bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
1811                                                    + bidi_it->ch_len, s,
1812                                                    bidi_it->string.unibyte)))
1813     {
1814       /* Avoid pushing and popping embedding levels if the level run
1815          is empty, as this breaks level runs where it shouldn't.
1816          UAX#9 removes all the explicit embedding and override codes,
1817          so empty embeddings disappear without a trace.  We need to
1818          behave as if we did the same.  */
1819       struct bidi_it saved_it;
1820       int level = prev_level;
1821
1822       bidi_copy_it (&saved_it, bidi_it);
1823
1824       while (bidi_explicit_dir_char (bidi_char_at_pos (bidi_it->bytepos
1825                                                        + bidi_it->ch_len, s,
1826                                                        bidi_it->string.unibyte)))
1827         {
1828           /* This advances to the next character, skipping any
1829              characters covered by display strings.  */
1830           level = bidi_resolve_explicit_1 (bidi_it);
1831           /* If string.lstring was relocated inside bidi_resolve_explicit_1,
1832              a pointer to its data is no longer valid.  */
1833           if (STRINGP (bidi_it->string.lstring))
1834             s = SDATA (bidi_it->string.lstring);
1835         }
1836
1837       if (bidi_it->nchars <= 0)
1838         emacs_abort ();
1839       if (level == prev_level)  /* empty embedding */
1840         saved_it.ignore_bn_limit = bidi_it->charpos + bidi_it->nchars;
1841       else                      /* this embedding is non-empty */
1842         saved_it.ignore_bn_limit = -2;
1843
1844       bidi_copy_it (bidi_it, &saved_it);
1845       if (bidi_it->ignore_bn_limit > -1)
1846         {
1847           /* We pushed a level, but we shouldn't have.  Undo that. */
1848           if (!bidi_it->invalid_rl_levels)
1849             {
1850               new_level = bidi_pop_embedding_level (bidi_it);
1851               bidi_it->invalid_rl_levels = -1;
1852               if (bidi_it->invalid_levels)
1853                 bidi_it->invalid_levels--;
1854             }
1855           if (!bidi_it->invalid_levels)
1856             new_level = bidi_pop_embedding_level (bidi_it);
1857           else
1858             {
1859               bidi_it->invalid_levels--;
1860               bidi_it->invalid_rl_levels--;
1861             }
1862         }
1863     }
1864
1865   if (bidi_it->type == NEUTRAL_B)       /* X8 */
1866     {
1867       bidi_set_paragraph_end (bidi_it);
1868       /* This is needed by bidi_resolve_weak below, and in L1.  */
1869       bidi_it->type_after_w1 = bidi_it->type;
1870       bidi_check_type (bidi_it->type_after_w1);
1871     }
1872
1873   return new_level;
1874 }
1875
1876 /* Advance in the buffer/string, resolve weak types and return the
1877    type of the next character after weak type resolution.  */
1878 static bidi_type_t
1879 bidi_resolve_weak (struct bidi_it *bidi_it)
1880 {
1881   bidi_type_t type;
1882   bidi_dir_t override;
1883   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1884   int new_level  = bidi_resolve_explicit (bidi_it);
1885   int next_char;
1886   bidi_type_t type_of_next;
1887   struct bidi_it saved_it;
1888   ptrdiff_t eob
1889     = ((STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
1890        ? bidi_it->string.schars : ZV);
1891
1892   type = bidi_it->type;
1893   override = bidi_it->level_stack[bidi_it->stack_idx].override;
1894
1895   if (type == UNKNOWN_BT
1896       || type == LRE
1897       || type == LRO
1898       || type == RLE
1899       || type == RLO
1900       || type == PDF)
1901     emacs_abort ();
1902
1903   if (new_level != prev_level
1904       || bidi_it->type == NEUTRAL_B)
1905     {
1906       /* We've got a new embedding level run, compute the directional
1907          type of sor and initialize per-run variables (UAX#9, clause
1908          X10).  */
1909       bidi_set_sor_type (bidi_it, prev_level, new_level);
1910     }
1911   else if (type == NEUTRAL_S || type == NEUTRAL_WS
1912            || type == WEAK_BN || type == STRONG_AL)
1913     bidi_it->type_after_w1 = type;      /* needed in L1 */
1914   bidi_check_type (bidi_it->type_after_w1);
1915
1916   /* Level and directional override status are already recorded in
1917      bidi_it, and do not need any change; see X6.  */
1918   if (override == R2L)          /* X6 */
1919     type = STRONG_R;
1920   else if (override == L2R)
1921     type = STRONG_L;
1922   else
1923     {
1924       if (type == WEAK_NSM)     /* W1 */
1925         {
1926           /* Note that we don't need to consider the case where the
1927              prev character has its type overridden by an RLO or LRO,
1928              because then either the type of this NSM would have been
1929              also overridden, or the previous character is outside the
1930              current level run, and thus not relevant to this NSM.
1931              This is why NSM gets the type_after_w1 of the previous
1932              character.  */
1933           if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1934               /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1935               && bidi_it->prev.type_after_w1 != NEUTRAL_B)
1936             type = bidi_it->prev.type_after_w1;
1937           else if (bidi_it->sor == R2L)
1938             type = STRONG_R;
1939           else if (bidi_it->sor == L2R)
1940             type = STRONG_L;
1941           else /* shouldn't happen! */
1942             emacs_abort ();
1943         }
1944       if (type == WEAK_EN       /* W2 */
1945           && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1946         type = WEAK_AN;
1947       else if (type == STRONG_AL) /* W3 */
1948         type = STRONG_R;
1949       else if ((type == WEAK_ES /* W4 */
1950                 && bidi_it->prev.type_after_w1 == WEAK_EN
1951                 && bidi_it->prev.orig_type == WEAK_EN)
1952                || (type == WEAK_CS
1953                    && ((bidi_it->prev.type_after_w1 == WEAK_EN
1954                         && bidi_it->prev.orig_type == WEAK_EN)
1955                        || bidi_it->prev.type_after_w1 == WEAK_AN)))
1956         {
1957           const unsigned char *s
1958             = (STRINGP (bidi_it->string.lstring)
1959                ? SDATA (bidi_it->string.lstring)
1960                : bidi_it->string.s);
1961
1962           next_char = (bidi_it->charpos + bidi_it->nchars >= eob
1963                        ? BIDI_EOB
1964                        : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len,
1965                                            s, bidi_it->string.unibyte));
1966           type_of_next = bidi_get_type (next_char, override);
1967
1968           if (type_of_next == WEAK_BN
1969               || bidi_explicit_dir_char (next_char))
1970             {
1971               bidi_copy_it (&saved_it, bidi_it);
1972               while (bidi_resolve_explicit (bidi_it) == new_level
1973                      && bidi_it->type == WEAK_BN)
1974                 ;
1975               type_of_next = bidi_it->type;
1976               bidi_copy_it (bidi_it, &saved_it);
1977             }
1978
1979           /* If the next character is EN, but the last strong-type
1980              character is AL, that next EN will be changed to AN when
1981              we process it in W2 above.  So in that case, this ES
1982              should not be changed into EN.  */
1983           if (type == WEAK_ES
1984               && type_of_next == WEAK_EN
1985               && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1986             type = WEAK_EN;
1987           else if (type == WEAK_CS)
1988             {
1989               if (bidi_it->prev.type_after_w1 == WEAK_AN
1990                   && (type_of_next == WEAK_AN
1991                       /* If the next character is EN, but the last
1992                          strong-type character is AL, EN will be later
1993                          changed to AN when we process it in W2 above.
1994                          So in that case, this ES should not be
1995                          changed into EN.  */
1996                       || (type_of_next == WEAK_EN
1997                           && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1998                 type = WEAK_AN;
1999               else if (bidi_it->prev.type_after_w1 == WEAK_EN
2000                        && type_of_next == WEAK_EN
2001                        && bidi_it->last_strong.type_after_w1 != STRONG_AL)
2002                 type = WEAK_EN;
2003             }
2004         }
2005       else if (type == WEAK_ET  /* W5: ET with EN before or after it */
2006                || type == WEAK_BN)      /* W5/Retaining */
2007         {
2008           if (bidi_it->prev.type_after_w1 == WEAK_EN) /* ET/BN w/EN before it */
2009             type = WEAK_EN;
2010           else if (bidi_it->next_en_pos > bidi_it->charpos
2011                    && bidi_it->next_en_type != WEAK_BN)
2012             {
2013               if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */
2014                 type = WEAK_EN;
2015             }
2016           else if (bidi_it->next_en_pos >=0)
2017             {
2018               ptrdiff_t en_pos = bidi_it->charpos + bidi_it->nchars;
2019               const unsigned char *s = (STRINGP (bidi_it->string.lstring)
2020                                         ? SDATA (bidi_it->string.lstring)
2021                                         : bidi_it->string.s);
2022
2023               if (bidi_it->nchars <= 0)
2024                 emacs_abort ();
2025               next_char
2026                 = (bidi_it->charpos + bidi_it->nchars >= eob
2027                    ? BIDI_EOB
2028                    : bidi_char_at_pos (bidi_it->bytepos + bidi_it->ch_len, s,
2029                                        bidi_it->string.unibyte));
2030               type_of_next = bidi_get_type (next_char, override);
2031
2032               if (type_of_next == WEAK_ET
2033                   || type_of_next == WEAK_BN
2034                   || bidi_explicit_dir_char (next_char))
2035                 {
2036                   bidi_copy_it (&saved_it, bidi_it);
2037                   while (bidi_resolve_explicit (bidi_it) == new_level
2038                          && (bidi_it->type == WEAK_BN
2039                              || bidi_it->type == WEAK_ET))
2040                     ;
2041                   type_of_next = bidi_it->type;
2042                   en_pos = bidi_it->charpos;
2043                   bidi_copy_it (bidi_it, &saved_it);
2044                 }
2045               /* Remember this position, to speed up processing of the
2046                  next ETs.  */
2047               bidi_it->next_en_pos = en_pos;
2048               if (type_of_next == WEAK_EN)
2049                 {
2050                   /* If the last strong character is AL, the EN we've
2051                      found will become AN when we get to it (W2). */
2052                   if (bidi_it->last_strong.type_after_w1 == STRONG_AL)
2053                     type_of_next = WEAK_AN;
2054                   else if (type == WEAK_BN)
2055                     type = NEUTRAL_ON; /* W6/Retaining */
2056                   else
2057                     type = WEAK_EN;
2058                 }
2059               else if (type_of_next == NEUTRAL_B)
2060                 /* Record the fact that there are no more ENs from
2061                    here to the end of paragraph, to avoid entering the
2062                    loop above ever again in this paragraph.  */
2063                 bidi_it->next_en_pos = -1;
2064               /* Record the type of the character where we ended our search.  */
2065               bidi_it->next_en_type = type_of_next;
2066             }
2067         }
2068     }
2069
2070   if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
2071       || (type == WEAK_BN
2072           && (bidi_it->prev.type_after_w1 == WEAK_CS        /* W6/Retaining */
2073               || bidi_it->prev.type_after_w1 == WEAK_ES
2074               || bidi_it->prev.type_after_w1 == WEAK_ET)))
2075     type = NEUTRAL_ON;
2076
2077   /* Store the type we've got so far, before we clobber it with strong
2078      types in W7 and while resolving neutral types.  But leave alone
2079      the original types that were recorded above, because we will need
2080      them for the L1 clause.  */
2081   if (bidi_it->type_after_w1 == UNKNOWN_BT)
2082     bidi_it->type_after_w1 = type;
2083   bidi_check_type (bidi_it->type_after_w1);
2084
2085   if (type == WEAK_EN)  /* W7 */
2086     {
2087       if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
2088           || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
2089         type = STRONG_L;
2090     }
2091
2092   bidi_it->type = type;
2093   bidi_check_type (bidi_it->type);
2094   return type;
2095 }
2096
2097 /* Resolve the type of a neutral character according to the type of
2098    surrounding strong text and the current embedding level.  */
2099 static bidi_type_t
2100 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
2101 {
2102   /* N1: European and Arabic numbers are treated as though they were R.  */
2103   if (next_type == WEAK_EN || next_type == WEAK_AN)
2104     next_type = STRONG_R;
2105   if (prev_type == WEAK_EN || prev_type == WEAK_AN)
2106     prev_type = STRONG_R;
2107
2108   if (next_type == prev_type)   /* N1 */
2109     return next_type;
2110   else if ((lev & 1) == 0)      /* N2 */
2111     return STRONG_L;
2112   else
2113     return STRONG_R;
2114 }
2115
2116 static bidi_type_t
2117 bidi_resolve_neutral (struct bidi_it *bidi_it)
2118 {
2119   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2120   bidi_type_t type = bidi_resolve_weak (bidi_it);
2121   int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2122
2123   if (!(type == STRONG_R
2124         || type == STRONG_L
2125         || type == WEAK_BN
2126         || type == WEAK_EN
2127         || type == WEAK_AN
2128         || type == NEUTRAL_B
2129         || type == NEUTRAL_S
2130         || type == NEUTRAL_WS
2131         || type == NEUTRAL_ON))
2132     emacs_abort ();
2133
2134   if ((type != NEUTRAL_B /* Don't risk entering the long loop below if
2135                             we are already at paragraph end.  */
2136        && bidi_get_category (type) == NEUTRAL)
2137       || (type == WEAK_BN && prev_level == current_level))
2138     {
2139       if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
2140         type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2141                                        bidi_it->next_for_neutral.type,
2142                                        current_level);
2143       /* The next two "else if" clauses are shortcuts for the
2144          important special case when we have a long sequence of
2145          neutral or WEAK_BN characters, such as whitespace or nulls or
2146          other control characters, on the base embedding level of the
2147          paragraph, and that sequence goes all the way to the end of
2148          the paragraph and follows a character whose resolved
2149          directionality is identical to the base embedding level.
2150          (This is what happens in a buffer with plain L2R text that
2151          happens to include long sequences of control characters.)  By
2152          virtue of N1, the result of examining this long sequence will
2153          always be either STRONG_L or STRONG_R, depending on the base
2154          embedding level.  So we use this fact directly instead of
2155          entering the expensive loop in the "else" clause.  */
2156       else if (current_level == 0
2157                && bidi_it->prev_for_neutral.type == STRONG_L
2158                && !bidi_explicit_dir_char (bidi_it->ch))
2159         type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2160                                        STRONG_L, current_level);
2161       else if (/* current level is 1 */
2162                current_level == 1
2163                /* base embedding level is also 1 */
2164                && bidi_it->level_stack[0].level == 1
2165                /* previous character is one of those considered R for
2166                   the purposes of W5 */
2167                && (bidi_it->prev_for_neutral.type == STRONG_R
2168                    || bidi_it->prev_for_neutral.type == WEAK_EN
2169                    || bidi_it->prev_for_neutral.type == WEAK_AN)
2170                && !bidi_explicit_dir_char (bidi_it->ch))
2171         type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2172                                        STRONG_R, current_level);
2173       else
2174         {
2175           /* Arrrgh!!  The UAX#9 algorithm is too deeply entrenched in
2176              the assumption of batch-style processing; see clauses W4,
2177              W5, and especially N1, which require to look far forward
2178              (as well as back) in the buffer/string.  May the fleas of
2179              a thousand camels infest the armpits of those who design
2180              supposedly general-purpose algorithms by looking at their
2181              own implementations, and fail to consider other possible
2182              implementations!  */
2183           struct bidi_it saved_it;
2184           bidi_type_t next_type;
2185
2186           if (bidi_it->scan_dir == -1)
2187             emacs_abort ();
2188
2189           bidi_copy_it (&saved_it, bidi_it);
2190           /* Scan the text forward until we find the first non-neutral
2191              character, and then use that to resolve the neutral we
2192              are dealing with now.  We also cache the scanned iterator
2193              states, to salvage some of the effort later.  */
2194           bidi_cache_iterator_state (bidi_it, 0);
2195           do {
2196             /* Record the info about the previous character, so that
2197                it will be cached below with this state.  */
2198             if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
2199                 && bidi_it->type != WEAK_BN)
2200               bidi_remember_char (&bidi_it->prev, bidi_it);
2201             type = bidi_resolve_weak (bidi_it);
2202             /* Paragraph separators have their levels fully resolved
2203                at this point, so cache them as resolved.  */
2204             bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
2205             /* FIXME: implement L1 here, by testing for a newline and
2206                resetting the level for any sequence of whitespace
2207                characters adjacent to it.  */
2208           } while (!(type == NEUTRAL_B
2209                      || (type != WEAK_BN
2210                          && bidi_get_category (type) != NEUTRAL)
2211                      /* This is all per level run, so stop when we
2212                         reach the end of this level run.  */
2213                      || (bidi_it->level_stack[bidi_it->stack_idx].level
2214                          != current_level)));
2215
2216           bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
2217
2218           switch (type)
2219             {
2220               case STRONG_L:
2221               case STRONG_R:
2222               case STRONG_AL:
2223                 /* Actually, STRONG_AL cannot happen here, because
2224                    bidi_resolve_weak converts it to STRONG_R, per W3.  */
2225                 eassert (type != STRONG_AL);
2226                 next_type = type;
2227                 break;
2228               case WEAK_EN:
2229               case WEAK_AN:
2230                 /* N1: ``European and Arabic numbers are treated as
2231                    though they were R.''  */
2232                 next_type = STRONG_R;
2233                 break;
2234               case WEAK_BN:
2235               case NEUTRAL_ON:  /* W6/Retaining */
2236                 if (!bidi_explicit_dir_char (bidi_it->ch))
2237                   emacs_abort (); /* can't happen: BNs are skipped */
2238                 /* FALLTHROUGH */
2239               case NEUTRAL_B:
2240                 /* Marched all the way to the end of this level run.
2241                    We need to use the eor type, whose information is
2242                    stored by bidi_set_sor_type in the prev_for_neutral
2243                    member.  */
2244                 if (saved_it.type != WEAK_BN
2245                     || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
2246                   next_type = bidi_it->prev_for_neutral.type;
2247                 else
2248                   {
2249                     /* This is a BN which does not adjoin neutrals.
2250                        Leave its type alone.  */
2251                     bidi_copy_it (bidi_it, &saved_it);
2252                     return bidi_it->type;
2253                   }
2254                 break;
2255               default:
2256                 emacs_abort ();
2257             }
2258           type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
2259                                          next_type, current_level);
2260           saved_it.next_for_neutral.type = next_type;
2261           saved_it.type = type;
2262           bidi_check_type (next_type);
2263           bidi_check_type (type);
2264           bidi_copy_it (bidi_it, &saved_it);
2265         }
2266     }
2267   return type;
2268 }
2269
2270 /* Given an iterator state in BIDI_IT, advance one character position
2271    in the buffer/string to the next character (in the logical order),
2272    resolve the bidi type of that next character, and return that
2273    type.  */
2274 static bidi_type_t
2275 bidi_type_of_next_char (struct bidi_it *bidi_it)
2276 {
2277   bidi_type_t type;
2278
2279   /* This should always be called during a forward scan.  */
2280   if (bidi_it->scan_dir != 1)
2281     emacs_abort ();
2282
2283   /* Reset the limit until which to ignore BNs if we step out of the
2284      area where we found only empty levels.  */
2285   if ((bidi_it->ignore_bn_limit > -1
2286        && bidi_it->ignore_bn_limit <= bidi_it->charpos)
2287       || (bidi_it->ignore_bn_limit == -2
2288           && !bidi_explicit_dir_char (bidi_it->ch)))
2289     bidi_it->ignore_bn_limit = -1;
2290
2291   type = bidi_resolve_neutral (bidi_it);
2292
2293   return type;
2294 }
2295
2296 /* Given an iterator state BIDI_IT, advance one character position in
2297    the buffer/string to the next character (in the current scan
2298    direction), resolve the embedding and implicit levels of that next
2299    character, and return the resulting level.  */
2300 static int
2301 bidi_level_of_next_char (struct bidi_it *bidi_it)
2302 {
2303   bidi_type_t type;
2304   int level, prev_level = -1;
2305   struct bidi_saved_info next_for_neutral;
2306   ptrdiff_t next_char_pos = -2;
2307
2308   if (bidi_it->scan_dir == 1)
2309     {
2310       ptrdiff_t eob
2311         = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2312            ? bidi_it->string.schars : ZV);
2313
2314       /* There's no sense in trying to advance if we hit end of text.  */
2315       if (bidi_it->charpos >= eob)
2316         return bidi_it->resolved_level;
2317
2318       /* Record the info about the previous character.  */
2319       if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
2320           && bidi_it->type != WEAK_BN)
2321         bidi_remember_char (&bidi_it->prev, bidi_it);
2322       if (bidi_it->type_after_w1 == STRONG_R
2323           || bidi_it->type_after_w1 == STRONG_L
2324           || bidi_it->type_after_w1 == STRONG_AL)
2325         bidi_remember_char (&bidi_it->last_strong, bidi_it);
2326       /* FIXME: it sounds like we don't need both prev and
2327          prev_for_neutral members, but I'm leaving them both for now.  */
2328       if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
2329           || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
2330         bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
2331
2332       /* If we overstepped the characters used for resolving neutrals
2333          and whitespace, invalidate their info in the iterator.  */
2334       if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
2335         bidi_it->next_for_neutral.type = UNKNOWN_BT;
2336       if (bidi_it->next_en_pos >= 0
2337           && bidi_it->charpos >= bidi_it->next_en_pos)
2338         {
2339           bidi_it->next_en_pos = 0;
2340           bidi_it->next_en_type = UNKNOWN_BT;
2341         }
2342       if (bidi_it->next_for_ws.type != UNKNOWN_BT
2343           && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
2344         bidi_it->next_for_ws.type = UNKNOWN_BT;
2345
2346       /* This must be taken before we fill the iterator with the info
2347          about the next char.  If we scan backwards, the iterator
2348          state must be already cached, so there's no need to know the
2349          embedding level of the previous character, since we will be
2350          returning to our caller shortly.  */
2351       prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
2352     }
2353   next_for_neutral = bidi_it->next_for_neutral;
2354
2355   /* Perhaps the character we want is already cached.  If it is, the
2356      call to bidi_cache_find below will return a type other than
2357      UNKNOWN_BT.  */
2358   if (bidi_cache_idx > bidi_cache_start && !bidi_it->first_elt)
2359     {
2360       int bob = ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2361                  ? 0 : 1);
2362       if (bidi_it->scan_dir > 0)
2363         {
2364           if (bidi_it->nchars <= 0)
2365             emacs_abort ();
2366           next_char_pos = bidi_it->charpos + bidi_it->nchars;
2367         }
2368       else if (bidi_it->charpos >= bob)
2369         /* Implementation note: we allow next_char_pos to be as low as
2370            0 for buffers or -1 for strings, and that is okay because
2371            that's the "position" of the sentinel iterator state we
2372            cached at the beginning of the iteration.  */
2373         next_char_pos = bidi_it->charpos - 1;
2374       if (next_char_pos >= bob - 1)
2375         type = bidi_cache_find (next_char_pos, -1, bidi_it);
2376       else
2377         type = UNKNOWN_BT;
2378     }
2379   else
2380     type = UNKNOWN_BT;
2381   if (type != UNKNOWN_BT)
2382     {
2383       /* Don't lose the information for resolving neutrals!  The
2384          cached states could have been cached before their
2385          next_for_neutral member was computed.  If we are on our way
2386          forward, we can simply take the info from the previous
2387          state.  */
2388       if (bidi_it->scan_dir == 1
2389           && bidi_it->next_for_neutral.type == UNKNOWN_BT)
2390         bidi_it->next_for_neutral = next_for_neutral;
2391
2392       /* If resolved_level is -1, it means this state was cached
2393          before it was completely resolved, so we cannot return
2394          it.  */
2395       if (bidi_it->resolved_level != -1)
2396         return bidi_it->resolved_level;
2397     }
2398   if (bidi_it->scan_dir == -1)
2399     /* If we are going backwards, the iterator state is already cached
2400        from previous scans, and should be fully resolved.  */
2401     emacs_abort ();
2402
2403   if (type == UNKNOWN_BT)
2404     type = bidi_type_of_next_char (bidi_it);
2405
2406   if (type == NEUTRAL_B)
2407     return bidi_it->resolved_level;
2408
2409   level = bidi_it->level_stack[bidi_it->stack_idx].level;
2410   if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
2411       || (type == WEAK_BN && prev_level == level))
2412     {
2413       if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
2414         emacs_abort ();
2415
2416       /* If the cached state shows a neutral character, it was not
2417          resolved by bidi_resolve_neutral, so do it now.  */
2418       type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
2419                                      bidi_it->next_for_neutral.type,
2420                                      level);
2421     }
2422
2423   if (!(type == STRONG_R
2424         || type == STRONG_L
2425         || type == WEAK_BN
2426         || type == WEAK_EN
2427         || type == WEAK_AN))
2428     emacs_abort ();
2429   bidi_it->type = type;
2430   bidi_check_type (bidi_it->type);
2431
2432   /* For L1 below, we need to know, for each WS character, whether
2433      it belongs to a sequence of WS characters preceding a newline
2434      or a TAB or a paragraph separator.  */
2435   if (bidi_it->orig_type == NEUTRAL_WS
2436       && bidi_it->next_for_ws.type == UNKNOWN_BT)
2437     {
2438       int ch;
2439       ptrdiff_t clen = bidi_it->ch_len;
2440       ptrdiff_t bpos = bidi_it->bytepos;
2441       ptrdiff_t cpos = bidi_it->charpos;
2442       ptrdiff_t disp_pos = bidi_it->disp_pos;
2443       ptrdiff_t nc = bidi_it->nchars;
2444       struct bidi_string_data bs = bidi_it->string;
2445       bidi_type_t chtype;
2446       bool fwp = bidi_it->frame_window_p;
2447       int dpp = bidi_it->disp_prop;
2448
2449       if (bidi_it->nchars <= 0)
2450         emacs_abort ();
2451       do {
2452         ch = bidi_fetch_char (cpos += nc, bpos += clen, &disp_pos, &dpp, &bs,
2453                               bidi_it->w, fwp, &clen, &nc);
2454         if (ch == '\n' || ch == BIDI_EOB)
2455           chtype = NEUTRAL_B;
2456         else
2457           chtype = bidi_get_type (ch, NEUTRAL_DIR);
2458       } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
2459                || bidi_explicit_dir_char (ch)); /* L1/Retaining */
2460       bidi_it->next_for_ws.type = chtype;
2461       bidi_check_type (bidi_it->next_for_ws.type);
2462       bidi_it->next_for_ws.charpos = cpos;
2463       bidi_it->next_for_ws.bytepos = bpos;
2464     }
2465
2466   /* Resolve implicit levels, with a twist: PDFs get the embedding
2467      level of the embedding they terminate.  See below for the
2468      reason.  */
2469   if (bidi_it->orig_type == PDF
2470       /* Don't do this if this formatting code didn't change the
2471          embedding level due to invalid or empty embeddings.  */
2472       && prev_level != level)
2473     {
2474       /* Don't look in UAX#9 for the reason for this: it's our own
2475          private quirk.  The reason is that we want the formatting
2476          codes to be delivered so that they bracket the text of their
2477          embedding.  For example, given the text
2478
2479              {RLO}teST{PDF}
2480
2481          we want it to be displayed as
2482
2483              {PDF}STet{RLO}
2484
2485          not as
2486
2487              STet{RLO}{PDF}
2488
2489          which will result because we bump up the embedding level as
2490          soon as we see the RLO and pop it as soon as we see the PDF,
2491          so RLO itself has the same embedding level as "teST", and
2492          thus would be normally delivered last, just before the PDF.
2493          The switch below fiddles with the level of PDF so that this
2494          ugly side effect does not happen.
2495
2496          (This is, of course, only important if the formatting codes
2497          are actually displayed, but Emacs does need to display them
2498          if the user wants to.)  */
2499       level = prev_level;
2500     }
2501   else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
2502            || bidi_it->orig_type == NEUTRAL_S
2503            || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
2504            || (bidi_it->orig_type == NEUTRAL_WS
2505                && (bidi_it->next_for_ws.type == NEUTRAL_B
2506                    || bidi_it->next_for_ws.type == NEUTRAL_S)))
2507     level = bidi_it->level_stack[0].level;
2508   else if ((level & 1) == 0) /* I1 */
2509     {
2510       if (type == STRONG_R)
2511         level++;
2512       else if (type == WEAK_EN || type == WEAK_AN)
2513         level += 2;
2514     }
2515   else                  /* I2 */
2516     {
2517       if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
2518         level++;
2519     }
2520
2521   bidi_it->resolved_level = level;
2522   return level;
2523 }
2524
2525 /* Move to the other edge of a level given by LEVEL.  If END_FLAG,
2526    we are at the end of a level, and we need to prepare to
2527    resume the scan of the lower level.
2528
2529    If this level's other edge is cached, we simply jump to it, filling
2530    the iterator structure with the iterator state on the other edge.
2531    Otherwise, we walk the buffer or string until we come back to the
2532    same level as LEVEL.
2533
2534    Note: we are not talking here about a ``level run'' in the UAX#9
2535    sense of the term, but rather about a ``level'' which includes
2536    all the levels higher than it.  In other words, given the levels
2537    like this:
2538
2539          11111112222222333333334443343222222111111112223322111
2540                 A      B                    C
2541
2542    and assuming we are at point A scanning left to right, this
2543    function moves to point C, whereas the UAX#9 ``level 2 run'' ends
2544    at point B.  */
2545 static void
2546 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, bool end_flag)
2547 {
2548   int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
2549   ptrdiff_t idx;
2550
2551   /* Try the cache first.  */
2552   if ((idx = bidi_cache_find_level_change (level, dir, end_flag))
2553       >= bidi_cache_start)
2554     bidi_cache_fetch_state (idx, bidi_it);
2555   else
2556     {
2557       int new_level;
2558
2559       /* If we are at end of level, its edges must be cached.  */
2560       if (end_flag)
2561         emacs_abort ();
2562
2563       bidi_cache_iterator_state (bidi_it, 1);
2564       do {
2565         new_level = bidi_level_of_next_char (bidi_it);
2566         bidi_cache_iterator_state (bidi_it, 1);
2567       } while (new_level >= level);
2568     }
2569 }
2570
2571 void
2572 bidi_move_to_visually_next (struct bidi_it *bidi_it)
2573 {
2574   int old_level, new_level, next_level;
2575   struct bidi_it sentinel;
2576   struct gcpro gcpro1;
2577
2578   if (bidi_it->charpos < 0 || bidi_it->bytepos < 0)
2579     emacs_abort ();
2580
2581   if (bidi_it->scan_dir == 0)
2582     {
2583       bidi_it->scan_dir = 1;    /* default to logical order */
2584     }
2585
2586   /* The code below can call eval, and thus cause GC.  If we are
2587      iterating a Lisp string, make sure it won't be GCed.  */
2588   if (STRINGP (bidi_it->string.lstring))
2589     GCPRO1 (bidi_it->string.lstring);
2590
2591   /* If we just passed a newline, initialize for the next line.  */
2592   if (!bidi_it->first_elt
2593       && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
2594     bidi_line_init (bidi_it);
2595
2596   /* Prepare the sentinel iterator state, and cache it.  When we bump
2597      into it, scanning backwards, we'll know that the last non-base
2598      level is exhausted.  */
2599   if (bidi_cache_idx == bidi_cache_start)
2600     {
2601       bidi_copy_it (&sentinel, bidi_it);
2602       if (bidi_it->first_elt)
2603         {
2604           sentinel.charpos--;   /* cached charpos needs to be monotonic */
2605           sentinel.bytepos--;
2606           sentinel.ch = '\n';   /* doesn't matter, but why not? */
2607           sentinel.ch_len = 1;
2608           sentinel.nchars = 1;
2609         }
2610       bidi_cache_iterator_state (&sentinel, 1);
2611     }
2612
2613   old_level = bidi_it->resolved_level;
2614   new_level = bidi_level_of_next_char (bidi_it);
2615
2616   /* Reordering of resolved levels (clause L2) is implemented by
2617      jumping to the other edge of the level and flipping direction of
2618      scanning the text whenever we find a level change.  */
2619   if (new_level != old_level)
2620     {
2621       bool ascending = new_level > old_level;
2622       int level_to_search = ascending ? old_level + 1 : old_level;
2623       int incr = ascending ? 1 : -1;
2624       int expected_next_level = old_level + incr;
2625
2626       /* Jump (or walk) to the other edge of this level.  */
2627       bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2628       /* Switch scan direction and peek at the next character in the
2629          new direction.  */
2630       bidi_it->scan_dir = -bidi_it->scan_dir;
2631
2632       /* The following loop handles the case where the resolved level
2633          jumps by more than one.  This is typical for numbers inside a
2634          run of text with left-to-right embedding direction, but can
2635          also happen in other situations.  In those cases the decision
2636          where to continue after a level change, and in what direction,
2637          is tricky.  For example, given a text like below:
2638
2639                   abcdefgh
2640                   11336622
2641
2642          (where the numbers below the text show the resolved levels),
2643          the result of reordering according to UAX#9 should be this:
2644
2645                   efdcghba
2646
2647          This is implemented by the loop below which flips direction
2648          and jumps to the other edge of the level each time it finds
2649          the new level not to be the expected one.  The expected level
2650          is always one more or one less than the previous one.  */
2651       next_level = bidi_peek_at_next_level (bidi_it);
2652       while (next_level != expected_next_level)
2653         {
2654           /* If next_level is -1, it means we have an unresolved level
2655              in the cache, which at this point should not happen.  If
2656              it does, we will infloop.  */
2657           eassert (next_level >= 0);
2658           expected_next_level += incr;
2659           level_to_search += incr;
2660           bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
2661           bidi_it->scan_dir = -bidi_it->scan_dir;
2662           next_level = bidi_peek_at_next_level (bidi_it);
2663         }
2664
2665       /* Finally, deliver the next character in the new direction.  */
2666       next_level = bidi_level_of_next_char (bidi_it);
2667     }
2668
2669   /* Take note when we have just processed the newline that precedes
2670      the end of the paragraph.  The next time we are about to be
2671      called, set_iterator_to_next will automatically reinit the
2672      paragraph direction, if needed.  We do this at the newline before
2673      the paragraph separator, because the next character might not be
2674      the first character of the next paragraph, due to the bidi
2675      reordering, whereas we _must_ know the paragraph base direction
2676      _before_ we process the paragraph's text, since the base
2677      direction affects the reordering.  */
2678   if (bidi_it->scan_dir == 1
2679       && (bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB))
2680     {
2681       /* The paragraph direction of the entire string, once
2682          determined, is in effect for the entire string.  Setting the
2683          separator limit to the end of the string prevents
2684          bidi_paragraph_init from being called automatically on this
2685          string.  */
2686       if (bidi_it->string.s || STRINGP (bidi_it->string.lstring))
2687         bidi_it->separator_limit = bidi_it->string.schars;
2688       else if (bidi_it->bytepos < ZV_BYTE)
2689         {
2690           ptrdiff_t sep_len
2691             = bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
2692                                      bidi_it->bytepos + bidi_it->ch_len);
2693           if (bidi_it->nchars <= 0)
2694             emacs_abort ();
2695           if (sep_len >= 0)
2696             {
2697               bidi_it->new_paragraph = 1;
2698               /* Record the buffer position of the last character of the
2699                  paragraph separator.  */
2700               bidi_it->separator_limit
2701                 = bidi_it->charpos + bidi_it->nchars + sep_len;
2702             }
2703         }
2704     }
2705
2706   if (bidi_it->scan_dir == 1 && bidi_cache_idx > bidi_cache_start)
2707     {
2708       /* If we are at paragraph's base embedding level and beyond the
2709          last cached position, the cache's job is done and we can
2710          discard it.  */
2711       if (bidi_it->resolved_level == bidi_it->level_stack[0].level
2712           && bidi_it->charpos > (bidi_cache[bidi_cache_idx - 1].charpos
2713                                  + bidi_cache[bidi_cache_idx - 1].nchars - 1))
2714         bidi_cache_reset ();
2715         /* But as long as we are caching during forward scan, we must
2716            cache each state, or else the cache integrity will be
2717            compromised: it assumes cached states correspond to buffer
2718            positions 1:1.  */
2719       else
2720         bidi_cache_iterator_state (bidi_it, 1);
2721     }
2722
2723   if (STRINGP (bidi_it->string.lstring))
2724     UNGCPRO;
2725 }
2726
2727 /* This is meant to be called from within the debugger, whenever you
2728    wish to examine the cache contents.  */
2729 void bidi_dump_cached_states (void) EXTERNALLY_VISIBLE;
2730 void
2731 bidi_dump_cached_states (void)
2732 {
2733   ptrdiff_t i;
2734   int ndigits = 1;
2735
2736   if (bidi_cache_idx == 0)
2737     {
2738       fprintf (stderr, "The cache is empty.\n");
2739       return;
2740     }
2741   fprintf (stderr, "Total of  %"pD"d state%s in cache:\n",
2742            bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2743
2744   for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2745     ndigits++;
2746   fputs ("ch  ", stderr);
2747   for (i = 0; i < bidi_cache_idx; i++)
2748     fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2749   fputs ("\n", stderr);
2750   fputs ("lvl ", stderr);
2751   for (i = 0; i < bidi_cache_idx; i++)
2752     fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2753   fputs ("\n", stderr);
2754   fputs ("pos ", stderr);
2755   for (i = 0; i < bidi_cache_idx; i++)
2756     fprintf (stderr, "%*"pD"d", ndigits, bidi_cache[i].charpos);
2757   fputs ("\n", stderr);
2758 }