src/bidi.c

   1 /* Low-level bidirectional buffer-scanning functions for GNU Emacs.
   2    Copyright (C) 2000, 2001, 2004, 2005, 2009, 2010
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software: you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation, either version 3 of the License, or
  10 (at your option) any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20 /* Written by Eli Zaretskii <eliz@gnu.org>.
  21
  22    A sequential implementation of the Unicode Bidirectional algorithm,
  23    as per UAX#9, a part of the Unicode Standard.
  24
  25    Unlike the reference and most other implementations, this one is
  26    designed to be called once for every character in the buffer or
  27    string.
  28
  29    The main entry point is bidi_move_to_visually_next.  Each time it
  30    is called, it finds the next character in the visual order, and
  31    returns its information in a special structure.  The caller is then
  32    expected to process this character for display or any other
  33    purposes, and call bidi_move_to_visually_next for the next
  34    character.  See the comments in bidi_move_to_visually_next for more
  35    details about its algorithm that finds the next visual-order
  36    character by resolving their levels on the fly.
  37
  38    The two other entry points are bidi_paragraph_init and
  39    bidi_mirror_char.  The first determines the base direction of a
  40    paragraph, while the second returns the mirrored version of its
  41    argument character.
  42
  43    If you want to understand the code, you will have to read it
  44    together with the relevant portions of UAX#9.  The comments include
  45    references to UAX#9 rules, for that very reason.
  46
  47    A note about references to UAX#9 rules: if the reference says
  48    something like "X9/Retaining", it means that you need to refer to
  49    rule X9 and to its modifications decribed in the "Implementation
  50    Notes" section of UAX#9, under "Retaining Format Codes".  */
  51
  52 #ifdef HAVE_CONFIG_H
  53 #include <config.h>
  54 #endif
  55
  56 #include <stdio.h>
  57
  58 #ifdef HAVE_STRING_H
  59 #include <string.h>
  60 #endif
  61
  62 #include <setjmp.h>
  63
  64 #include "lisp.h"
  65 #include "buffer.h"
  66 #include "character.h"
  67 #include "dispextern.h"
  68
  69 static int bidi_initialized = 0;
  70
  71 static Lisp_Object bidi_type_table, bidi_mirror_table;
  72
  73 /* FIXME: Remove these when bidi_explicit_dir_char uses a lookup table.  */
  74 #define LRM_CHAR   0x200E
  75 #define RLM_CHAR   0x200F
  76 #define LRE_CHAR   0x202A
  77 #define RLE_CHAR   0x202B
  78 #define PDF_CHAR   0x202C
  79 #define LRO_CHAR   0x202D
  80 #define RLO_CHAR   0x202E
  81
  82 #define BIDI_EOB   -1
  83 #define BIDI_BOB   -2           /* FIXME: Is this needed? */
  84
  85 /* Local data structures.  (Look in dispextern.h for the rest.)  */
  86
  87 /* What we need to know about the current paragraph.  */
  88 struct bidi_paragraph_info {
  89   int start_bytepos;    /* byte position where it begins */
  90   int end_bytepos;      /* byte position where it ends */
  91   int embedding_level;  /* its basic embedding level */
  92   bidi_dir_t base_dir;  /* its base direction */
  93 };
  94
  95 /* Data type for describing the bidirectional character categories.  */
  96 typedef enum {
  97   UNKNOWN_BC,
  98   NEUTRAL,
  99   WEAK,
 100   STRONG
 101 } bidi_category_t;
 102
 103 int bidi_ignore_explicit_marks_for_paragraph_level = 1;
 104
 105 static Lisp_Object paragraph_start_re, paragraph_separate_re;
 106 static Lisp_Object Qparagraph_start, Qparagraph_separate;
 107
 108 static void
 109 bidi_initialize (void)
 110 {
 111
 112 #include "biditype.h"
 113 #include "bidimirror.h"
 114
 115   int i;
 116
 117   bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L));
 118   staticpro (&bidi_type_table);
 119
 120   for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++)
 121     char_table_set_range (bidi_type_table, bidi_type[i].from, bidi_type[i].to,
 122                           make_number (bidi_type[i].type));
 123
 124   bidi_mirror_table = Fmake_char_table (Qnil, Qnil);
 125   staticpro (&bidi_mirror_table);
 126
 127   for (i = 0; i < sizeof bidi_mirror / sizeof bidi_mirror[0]; i++)
 128     char_table_set (bidi_mirror_table, bidi_mirror[i].from,
 129                     make_number (bidi_mirror[i].to));
 130
 131   Qparagraph_start = intern ("paragraph-start");
 132   staticpro (&Qparagraph_start);
 133   paragraph_start_re = Fsymbol_value (Qparagraph_start);
 134   if (!STRINGP (paragraph_start_re))
 135     paragraph_start_re = build_string ("\f\\|[ \t]*$");
 136   staticpro (&paragraph_start_re);
 137   Qparagraph_separate = intern ("paragraph-separate");
 138   staticpro (&Qparagraph_separate);
 139   paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
 140   if (!STRINGP (paragraph_separate_re))
 141     paragraph_separate_re = build_string ("[ \t\f]*$");
 142   staticpro (&paragraph_separate_re);
 143   bidi_initialized = 1;
 144 }
 145
 146 /* Return the bidi type of a character CH, subject to the current
 147    directional OVERRIDE.  */
 148 static INLINE bidi_type_t
 149 bidi_get_type (int ch, bidi_dir_t override)
 150 {
 151   bidi_type_t default_type;
 152
 153   if (ch == BIDI_EOB)
 154     return NEUTRAL_B;
 155   if (ch < 0 || ch > MAX_CHAR)
 156     abort ();
 157
 158   default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
 159
 160   if (override == NEUTRAL_DIR)
 161     return default_type;
 162
 163   switch (default_type)
 164     {
 165       /* Although UAX#9 does not tell, it doesn't make sense to
 166          override NEUTRAL_B and LRM/RLM characters.  */
 167       case NEUTRAL_B:
 168       case LRE:
 169       case LRO:
 170       case RLE:
 171       case RLO:
 172       case PDF:
 173         return default_type;
 174       default:
 175         switch (ch)
 176           {
 177             case LRM_CHAR:
 178             case RLM_CHAR:
 179               return default_type;
 180             default:
 181               if (override == L2R) /* X6 */
 182                 return STRONG_L;
 183               else if (override == R2L)
 184                 return STRONG_R;
 185               else
 186                 abort ();       /* can't happen: handled above */
 187           }
 188     }
 189 }
 190
 191 void
 192 bidi_check_type (bidi_type_t type)
 193 {
 194   if (type < UNKNOWN_BT || type > NEUTRAL_ON)
 195     abort ();
 196 }
 197
 198 /* Given a bidi TYPE of a character, return its category.  */
 199 static INLINE bidi_category_t
 200 bidi_get_category (bidi_type_t type)
 201 {
 202   switch (type)
 203     {
 204       case UNKNOWN_BT:
 205         return UNKNOWN_BC;
 206       case STRONG_L:
 207       case STRONG_R:
 208       case STRONG_AL:
 209       case LRE:
 210       case LRO:
 211       case RLE:
 212       case RLO:
 213         return STRONG;
 214       case PDF:         /* ??? really?? */
 215       case WEAK_EN:
 216       case WEAK_ES:
 217       case WEAK_ET:
 218       case WEAK_AN:
 219       case WEAK_CS:
 220       case WEAK_NSM:
 221       case WEAK_BN:
 222         return WEAK;
 223       case NEUTRAL_B:
 224       case NEUTRAL_S:
 225       case NEUTRAL_WS:
 226       case NEUTRAL_ON:
 227         return NEUTRAL;
 228       default:
 229         abort ();
 230     }
 231 }
 232
 233 /* Return the mirrored character of C, if it has one.  If C has no
 234    mirrored counterpart, return C.
 235    Note: The conditions in UAX#9 clause L4 regarding the surrounding
 236    context must be tested by the caller.  */
 237 int
 238 bidi_mirror_char (int c)
 239 {
 240   Lisp_Object val;
 241
 242   if (c == BIDI_EOB)
 243     return c;
 244   if (c < 0 || c > MAX_CHAR)
 245     abort ();
 246
 247   val = CHAR_TABLE_REF (bidi_mirror_table, c);
 248   if (INTEGERP (val))
 249     {
 250       int v = XINT (val);
 251
 252       if (v < 0 || v > MAX_CHAR)
 253         abort ();
 254
 255       return v;
 256     }
 257
 258   return c;
 259 }
 260
 261 /* Copy the bidi iterator from FROM to TO.  To save cycles, this only
 262    copies the part of the level stack that is actually in use.  */
 263 static INLINE void
 264 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
 265 {
 266   int i;
 267
 268   /* Copy everything except the level stack and beyond.  */
 269   memcpy (to, from, ((size_t)&((struct bidi_it *)0)->level_stack[0]));
 270
 271   /* Copy the active part of the level stack.  */
 272   to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
 273   for (i = 1; i <= from->stack_idx; i++)
 274     to->level_stack[i] = from->level_stack[i];
 275 }
 276
 277 /* Caching the bidi iterator states.  */
 278
 279 #define BIDI_CACHE_CHUNK 200
 280 static struct bidi_it *bidi_cache;
 281 static size_t bidi_cache_size = 0;
 282 static size_t elsz = sizeof (struct bidi_it);
 283 static int bidi_cache_idx;      /* next unused cache slot */
 284 static int bidi_cache_last_idx; /* slot of last cache hit */
 285
 286 static INLINE void
 287 bidi_cache_reset (void)
 288 {
 289   bidi_cache_idx = 0;
 290   bidi_cache_last_idx = -1;
 291 }
 292
 293 static INLINE void
 294 bidi_cache_shrink (void)
 295 {
 296   if (bidi_cache_size > BIDI_CACHE_CHUNK)
 297     {
 298       bidi_cache_size = BIDI_CACHE_CHUNK;
 299       bidi_cache =
 300         (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
 301     }
 302   bidi_cache_reset ();
 303 }
 304
 305 static INLINE void
 306 bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it)
 307 {
 308   int current_scan_dir = bidi_it->scan_dir;
 309
 310   if (idx < 0 || idx >= bidi_cache_idx)
 311     abort ();
 312
 313   bidi_copy_it (bidi_it, &bidi_cache[idx]);
 314   bidi_it->scan_dir = current_scan_dir;
 315   bidi_cache_last_idx = idx;
 316 }
 317
 318 /* Find a cached state with a given CHARPOS and resolved embedding
 319    level less or equal to LEVEL.  if LEVEL is -1, disregard the
 320    resolved levels in cached states.  DIR, if non-zero, means search
 321    in that direction from the last cache hit.  */
 322 static INLINE int
 323 bidi_cache_search (int charpos, int level, int dir)
 324 {
 325   int i, i_start;
 326
 327   if (bidi_cache_idx)
 328     {
 329       if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
 330         dir = -1;
 331       else if (charpos > bidi_cache[bidi_cache_last_idx].charpos)
 332         dir = 1;
 333       if (dir)
 334         i_start = bidi_cache_last_idx;
 335       else
 336         {
 337           dir = -1;
 338           i_start = bidi_cache_idx - 1;
 339         }
 340
 341       if (dir < 0)
 342         {
 343           /* Linear search for now; FIXME!  */
 344           for (i = i_start; i >= 0; i--)
 345             if (bidi_cache[i].charpos == charpos
 346                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 347               return i;
 348         }
 349       else
 350         {
 351           for (i = i_start; i < bidi_cache_idx; i++)
 352             if (bidi_cache[i].charpos == charpos
 353                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 354               return i;
 355         }
 356     }
 357
 358   return -1;
 359 }
 360
 361 /* Find a cached state where the resolved level changes to a value
 362    that is lower than LEVEL, and return its cache slot index.  DIR is
 363    the direction to search, starting with the last used cache slot.
 364    BEFORE, if non-zero, means return the index of the slot that is
 365    ``before'' the level change in the search direction.  That is,
 366    given the cached levels like this:
 367
 368          1122333442211
 369           AB        C
 370
 371    and assuming we are at the position cached at the slot marked with
 372    C, searching backwards (DIR = -1) for LEVEL = 2 will return the
 373    index of slot B or A, depending whether BEFORE is, respectively,
 374    non-zero or zero.  */
 375 static int
 376 bidi_cache_find_level_change (int level, int dir, int before)
 377 {
 378   if (bidi_cache_idx)
 379     {
 380       int i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
 381       int incr = before ? 1 : 0;
 382
 383       if (!dir)
 384         dir = -1;
 385       else if (!incr)
 386         i += dir;
 387
 388       if (dir < 0)
 389         {
 390           while (i >= incr)
 391             {
 392               if (bidi_cache[i - incr].resolved_level >= 0
 393                   && bidi_cache[i - incr].resolved_level < level)
 394                 return i;
 395               i--;
 396             }
 397         }
 398       else
 399         {
 400           while (i < bidi_cache_idx - incr)
 401             {
 402               if (bidi_cache[i + incr].resolved_level >= 0
 403                   && bidi_cache[i + incr].resolved_level < level)
 404                 return i;
 405               i++;
 406             }
 407         }
 408     }
 409
 410   return -1;
 411 }
 412
 413 static INLINE void
 414 bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
 415 {
 416   int idx;
 417
 418   /* We should never cache on backward scans.  */
 419   if (bidi_it->scan_dir == -1)
 420     abort ();
 421   idx = bidi_cache_search (bidi_it->charpos, -1, 1);
 422
 423   if (idx < 0)
 424     {
 425       idx = bidi_cache_idx;
 426       /* Enlarge the cache as needed.  */
 427       if (idx >= bidi_cache_size)
 428         {
 429           bidi_cache_size += BIDI_CACHE_CHUNK;
 430           bidi_cache =
 431             (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
 432         }
 433       /* Character positions should correspond to cache positions 1:1.
 434          If we are outside the range of cached positions, the cache is
 435          useless and must be reset.  */
 436       if (idx > 0 &&
 437           (bidi_it->charpos > bidi_cache[idx - 1].charpos + 1
 438            || bidi_it->charpos < bidi_cache[0].charpos))
 439         {
 440           bidi_cache_reset ();
 441           idx = 0;
 442         }
 443       bidi_copy_it (&bidi_cache[idx], bidi_it);
 444       if (!resolved)
 445         bidi_cache[idx].resolved_level = -1;
 446     }
 447   else
 448     {
 449       /* Copy only the members which could have changed, to avoid
 450          costly copying of the entire struct.  */
 451       bidi_cache[idx].type = bidi_it->type;
 452       bidi_check_type (bidi_it->type);
 453       bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
 454       bidi_check_type (bidi_it->type_after_w1);
 455       if (resolved)
 456         bidi_cache[idx].resolved_level = bidi_it->resolved_level;
 457       else
 458         bidi_cache[idx].resolved_level = -1;
 459       bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
 460       bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
 461       bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
 462       bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
 463       bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
 464     }
 465
 466   bidi_cache_last_idx = idx;
 467   if (idx >= bidi_cache_idx)
 468     bidi_cache_idx = idx + 1;
 469 }
 470
 471 static INLINE bidi_type_t
 472 bidi_cache_find (int charpos, int level, struct bidi_it *bidi_it)
 473 {
 474   int i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
 475
 476   if (i >= 0)
 477     {
 478       bidi_dir_t current_scan_dir = bidi_it->scan_dir;
 479
 480       bidi_copy_it (bidi_it, &bidi_cache[i]);
 481       bidi_cache_last_idx = i;
 482       /* Don't let scan direction from from the cached state override
 483          the current scan direction.  */
 484       bidi_it->scan_dir = current_scan_dir;
 485       return bidi_it->type;
 486     }
 487
 488   return UNKNOWN_BT;
 489 }
 490
 491 static INLINE int
 492 bidi_peek_at_next_level (struct bidi_it *bidi_it)
 493 {
 494   if (bidi_cache_idx == 0 || bidi_cache_last_idx == -1)
 495     abort ();
 496   return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
 497 }
 498
 499 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
 500    Value is the non-negative length of the paragraph separator
 501    following the buffer position, -1 if position is at the beginning
 502    of a new paragraph, or -2 if position is neither at beginning nor
 503    at end of a paragraph.  */
 504 static EMACS_INT
 505 bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
 506 {
 507   /* FIXME: Why Fbuffer_local_value rather than just Fsymbol_value?  */
 508   Lisp_Object sep_re;
 509   Lisp_Object start_re;
 510   EMACS_INT val;
 511
 512   sep_re = paragraph_separate_re;
 513   start_re = paragraph_start_re;
 514
 515   val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
 516   if (val < 0)
 517     {
 518       if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
 519         val = -1;
 520       else
 521         val = -2;
 522     }
 523
 524   return val;
 525 }
 526
 527 /* Determine the start-of-run (sor) directional type given the two
 528    embedding levels on either side of the run boundary.  Also, update
 529    the saved info about previously seen characters, since that info is
 530    generally valid for a single level run.  */
 531 static INLINE void
 532 bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
 533 {
 534   int higher_level = level_before > level_after ? level_before : level_after;
 535
 536   /* The prev_was_pdf gork is required for when we have several PDFs
 537      in a row.  In that case, we want to compute the sor type for the
 538      next level run only once: when we see the first PDF.  That's
 539      because the sor type depends only on the higher of the two levels
 540      that we find on the two sides of the level boundary (see UAX#9,
 541      clause X10), and so we don't need to know the final embedding
 542      level to which we descend after processing all the PDFs.  */
 543   if (!bidi_it->prev_was_pdf || level_before < level_after)
 544     /* FIXME: should the default sor direction be user selectable?  */
 545     bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
 546   if (level_before > level_after)
 547     bidi_it->prev_was_pdf = 1;
 548
 549   bidi_it->prev.type = UNKNOWN_BT;
 550   bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
 551     bidi_it->last_strong.orig_type = UNKNOWN_BT;
 552   bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
 553   bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
 554   bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
 555   bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
 556     bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
 557   bidi_it->ignore_bn_limit = 0; /* meaning it's unknown */
 558 }
 559
 560 static void
 561 bidi_line_init (struct bidi_it *bidi_it)
 562 {
 563   bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
 564   bidi_it->resolved_level = bidi_it->level_stack[0].level;
 565   bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
 566   bidi_it->invalid_levels = 0;
 567   bidi_it->invalid_rl_levels = -1;
 568   bidi_it->next_en_pos = -1;
 569   bidi_it->next_for_ws.type = UNKNOWN_BT;
 570   bidi_set_sor_type (bidi_it,
 571                      bidi_it->paragraph_dir == R2L ? 1 : 0,
 572                      bidi_it->level_stack[0].level); /* X10 */
 573
 574   bidi_cache_reset ();
 575 }
 576
 577 /* Find the beginning of this paragraph by looking back in the buffer.
 578    Value is the byte position of the paragraph's beginning.  */
 579 static EMACS_INT
 580 bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
 581 {
 582   Lisp_Object re = paragraph_start_re;
 583   EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
 584
 585   while (pos_byte > BEGV_BYTE
 586          && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
 587     {
 588       pos = find_next_newline_no_quit (pos - 1, -1);
 589       pos_byte = CHAR_TO_BYTE (pos);
 590     }
 591   return pos_byte;
 592 }
 593
 594 /* Determine the direction, a.k.a. base embedding level, of the
 595    paragraph we are about to iterate through.  If DIR is either L2R or
 596    R2L, just use that.  Otherwise, determine the paragraph direction
 597    from the first strong character of the paragraph.
 598
 599    Note that this gives the paragraph separator the same direction as
 600    the preceding paragraph, even though Emacs generally views the
 601    separartor as not belonging to any paragraph.  */
 602 void
 603 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it)
 604 {
 605   EMACS_INT bytepos = bidi_it->bytepos;
 606
 607   /* Special case for an empty buffer. */
 608   if (bytepos == BEGV_BYTE && bytepos == ZV_BYTE)
 609     dir = L2R;
 610   /* We should never be called at EOB or before BEGV.  */
 611   else if (bytepos >= ZV_BYTE || bytepos < BEGV_BYTE)
 612     abort ();
 613
 614   if (dir == L2R)
 615     {
 616       bidi_it->paragraph_dir = L2R;
 617       bidi_it->new_paragraph = 0;
 618     }
 619   else if (dir == R2L)
 620     {
 621       bidi_it->paragraph_dir = R2L;
 622       bidi_it->new_paragraph = 0;
 623     }
 624   else if (dir == NEUTRAL_DIR)  /* P2 */
 625     {
 626       int ch, ch_len;
 627       EMACS_INT pos;
 628       bidi_type_t type;
 629
 630       if (!bidi_initialized)
 631         bidi_initialize ();
 632
 633       /* If we are inside a paragraph separator, we are just waiting
 634          for the separator to be exhausted; use the previous paragraph
 635          direction.  But don't do that if we have been just reseated,
 636          because we need to reinitialize below in that case.  */
 637       if (!bidi_it->first_elt
 638           && bidi_it->charpos < bidi_it->separator_limit)
 639         return;
 640
 641       /* If we are on a newline, get past it to where the next
 642          paragraph might start.  But don't do that at BEGV since then
 643          we are potentially in a new paragraph that doesn't yet
 644          exist.  */
 645       pos = bidi_it->charpos;
 646       if (bytepos > BEGV_BYTE && FETCH_CHAR (bytepos) == '\n')
 647         {
 648           bytepos++;
 649           pos++;
 650         }
 651
 652       /* We are either at the beginning of a paragraph or in the
 653          middle of it.  Find where this paragraph starts.  */
 654       bytepos = bidi_find_paragraph_start (pos, bytepos);
 655
 656       bidi_it->separator_limit = -1;
 657       bidi_it->new_paragraph = 0;
 658       ch = FETCH_CHAR (bytepos);
 659       ch_len = CHAR_BYTES (ch);
 660       pos = BYTE_TO_CHAR (bytepos);
 661       type = bidi_get_type (ch, NEUTRAL_DIR);
 662
 663       for (pos++, bytepos += ch_len;
 664            /* NOTE: UAX#9 says to search only for L, AL, or R types of
 665               characters, and ignore RLE, RLO, LRE, and LRO.  However,
 666               I'm not sure it makes sense to omit those 4; should try
 667               with and without that to see the effect.  */
 668            (bidi_get_category (type) != STRONG)
 669              || (bidi_ignore_explicit_marks_for_paragraph_level
 670                  && (type == RLE || type == RLO
 671                      || type == LRE || type == LRO));
 672            type = bidi_get_type (ch, NEUTRAL_DIR))
 673         {
 674           if (type == NEUTRAL_B && bidi_at_paragraph_end (pos, bytepos) >= -1)
 675             break;
 676           if (bytepos >= ZV_BYTE)
 677             {
 678               /* Pretend there's a paragraph separator at end of buffer.  */
 679               type = NEUTRAL_B;
 680               break;
 681             }
 682           FETCH_CHAR_ADVANCE (ch, pos, bytepos);
 683         }
 684       if (type == STRONG_R || type == STRONG_AL) /* P3 */
 685         bidi_it->paragraph_dir = R2L;
 686       else if (type == STRONG_L)
 687         bidi_it->paragraph_dir = L2R;
 688     }
 689   else
 690     abort ();
 691
 692   /* Contrary to UAX#9 clause P3, we only default the paragraph
 693      direction to L2R if we have no previous usable paragraph
 694      direction.  */
 695   if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
 696     bidi_it->paragraph_dir = L2R; /* P3 and ``higher protocols'' */
 697   if (bidi_it->paragraph_dir == R2L)
 698     bidi_it->level_stack[0].level = 1;
 699   else
 700     bidi_it->level_stack[0].level = 0;
 701
 702   bidi_line_init (bidi_it);
 703 }
 704
 705 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
 706    end.  */
 707 static INLINE void
 708 bidi_set_paragraph_end (struct bidi_it *bidi_it)
 709 {
 710   bidi_it->invalid_levels = 0;
 711   bidi_it->invalid_rl_levels = -1;
 712   bidi_it->stack_idx = 0;
 713   bidi_it->resolved_level = bidi_it->level_stack[0].level;
 714 }
 715
 716 /* Initialize the bidi iterator from buffer position CHARPOS.  */
 717 void
 718 bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, struct bidi_it *bidi_it)
 719 {
 720   if (! bidi_initialized)
 721     bidi_initialize ();
 722   bidi_it->charpos = charpos;
 723   bidi_it->bytepos = bytepos;
 724   bidi_it->first_elt = 1;
 725   bidi_set_paragraph_end (bidi_it);
 726   bidi_it->new_paragraph = 1;
 727   bidi_it->separator_limit = -1;
 728   bidi_it->type = NEUTRAL_B;
 729   bidi_it->type_after_w1 = NEUTRAL_B;
 730   bidi_it->orig_type = NEUTRAL_B;
 731   bidi_it->prev_was_pdf = 0;
 732   bidi_it->prev.type = bidi_it->prev.type_after_w1 =
 733     bidi_it->prev.orig_type = UNKNOWN_BT;
 734   bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
 735     bidi_it->last_strong.orig_type = UNKNOWN_BT;
 736   bidi_it->next_for_neutral.charpos = -1;
 737   bidi_it->next_for_neutral.type =
 738     bidi_it->next_for_neutral.type_after_w1 =
 739     bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
 740   bidi_it->prev_for_neutral.charpos = -1;
 741   bidi_it->prev_for_neutral.type =
 742     bidi_it->prev_for_neutral.type_after_w1 =
 743     bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
 744   bidi_it->sor = L2R;    /* FIXME: should it be user-selectable? */
 745   bidi_cache_shrink ();
 746 }
 747
 748 /* Push the current embedding level and override status; reset the
 749    current level to LEVEL and the current override status to OVERRIDE.  */
 750 static INLINE void
 751 bidi_push_embedding_level (struct bidi_it *bidi_it,
 752                            int level, bidi_dir_t override)
 753 {
 754   bidi_it->stack_idx++;
 755   if (bidi_it->stack_idx >= BIDI_MAXLEVEL)
 756     abort ();
 757   bidi_it->level_stack[bidi_it->stack_idx].level = level;
 758   bidi_it->level_stack[bidi_it->stack_idx].override = override;
 759 }
 760
 761 /* Pop the embedding level and directional override status from the
 762    stack, and return the new level.  */
 763 static INLINE int
 764 bidi_pop_embedding_level (struct bidi_it *bidi_it)
 765 {
 766   /* UAX#9 says to ignore invalid PDFs.  */
 767   if (bidi_it->stack_idx > 0)
 768     bidi_it->stack_idx--;
 769   return bidi_it->level_stack[bidi_it->stack_idx].level;
 770 }
 771
 772 /* Record in SAVED_INFO the information about the current character.  */
 773 static INLINE void
 774 bidi_remember_char (struct bidi_saved_info *saved_info,
 775                     struct bidi_it *bidi_it)
 776 {
 777   saved_info->charpos = bidi_it->charpos;
 778   saved_info->bytepos = bidi_it->bytepos;
 779   saved_info->type = bidi_it->type;
 780   bidi_check_type (bidi_it->type);
 781   saved_info->type_after_w1 = bidi_it->type_after_w1;
 782   bidi_check_type (bidi_it->type_after_w1);
 783   saved_info->orig_type = bidi_it->orig_type;
 784   bidi_check_type (bidi_it->orig_type);
 785 }
 786
 787 /* Resolve the type of a neutral character according to the type of
 788    surrounding strong text and the current embedding level.  */
 789 static INLINE bidi_type_t
 790 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
 791 {
 792   /* N1: European and Arabic numbers are treated as though they were R.  */
 793   if (next_type == WEAK_EN || next_type == WEAK_AN)
 794     next_type = STRONG_R;
 795   if (prev_type == WEAK_EN || prev_type == WEAK_AN)
 796     prev_type = STRONG_R;
 797
 798   if (next_type == prev_type)   /* N1 */
 799     return next_type;
 800   else if ((lev & 1) == 0)      /* N2 */
 801     return STRONG_L;
 802   else
 803     return STRONG_R;
 804 }
 805
 806 static INLINE int
 807 bidi_explicit_dir_char (int c)
 808 {
 809   /* FIXME: this should be replaced with a lookup table with suitable
 810      bits set, like standard C ctype macros do.  */
 811   return (c == LRE_CHAR || c == LRO_CHAR
 812           || c == RLE_CHAR || c == RLO_CHAR || c == PDF_CHAR);
 813 }
 814
 815 /* A helper function for bidi_resolve_explicit.  It advances to the
 816    next character in logical order and determines the new embedding
 817    level and directional override, but does not take into account
 818    empty embeddings.  */
 819 static int
 820 bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
 821 {
 822   int curchar;
 823   bidi_type_t type;
 824   int current_level;
 825   int new_level;
 826   bidi_dir_t override;
 827
 828   if (bidi_it->bytepos < BEGV_BYTE      /* after reseat to BEGV? */
 829       || bidi_it->first_elt)
 830     {
 831       bidi_it->first_elt = 0;
 832       if (bidi_it->charpos < BEGV)
 833         bidi_it->charpos = BEGV;
 834       bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
 835     }
 836   else if (bidi_it->bytepos < ZV_BYTE)  /* don't move at ZV */
 837     {
 838       bidi_it->charpos++;
 839       if (bidi_it->ch_len == 0)
 840         abort ();
 841       bidi_it->bytepos += bidi_it->ch_len;
 842     }
 843
 844   current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
 845   override = bidi_it->level_stack[bidi_it->stack_idx].override;
 846   new_level = current_level;
 847
 848   /* in case it is a unibyte character (not yet implemented) */
 849   /* _fetch_multibyte_char_len = 1; */
 850   if (bidi_it->bytepos >= ZV_BYTE)
 851     {
 852       curchar = BIDI_EOB;
 853       bidi_it->ch_len = 1;
 854     }
 855   else
 856     {
 857       curchar = FETCH_CHAR (bidi_it->bytepos);
 858       bidi_it->ch_len = CHAR_BYTES (curchar);
 859     }
 860   bidi_it->ch = curchar;
 861
 862   /* Don't apply directional override here, as all the types we handle
 863      below will not be affected by the override anyway, and we need
 864      the original type unaltered.  The override will be applied in
 865      bidi_resolve_weak.  */
 866   type = bidi_get_type (curchar, NEUTRAL_DIR);
 867   bidi_it->orig_type = type;
 868   bidi_check_type (bidi_it->orig_type);
 869
 870   if (type != PDF)
 871     bidi_it->prev_was_pdf = 0;
 872
 873   bidi_it->type_after_w1 = UNKNOWN_BT;
 874
 875   switch (type)
 876     {
 877       case RLE: /* X2 */
 878       case RLO: /* X4 */
 879         bidi_it->type_after_w1 = type;
 880         bidi_check_type (bidi_it->type_after_w1);
 881         type = WEAK_BN; /* X9/Retaining */
 882         if (bidi_it->ignore_bn_limit <= 0)
 883           {
 884             if (current_level <= BIDI_MAXLEVEL - 4)
 885               {
 886                 /* Compute the least odd embedding level greater than
 887                    the current level.  */
 888                 new_level = ((current_level + 1) & ~1) + 1;
 889                 if (bidi_it->type_after_w1 == RLE)
 890                   override = NEUTRAL_DIR;
 891                 else
 892                   override = R2L;
 893                 if (current_level == BIDI_MAXLEVEL - 4)
 894                   bidi_it->invalid_rl_levels = 0;
 895                 bidi_push_embedding_level (bidi_it, new_level, override);
 896               }
 897             else
 898               {
 899                 bidi_it->invalid_levels++;
 900                 /* See the commentary about invalid_rl_levels below.  */
 901                 if (bidi_it->invalid_rl_levels < 0)
 902                   bidi_it->invalid_rl_levels = 0;
 903                 bidi_it->invalid_rl_levels++;
 904               }
 905           }
 906         else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
 907                  || bidi_it->next_en_pos > bidi_it->charpos)
 908           type = WEAK_EN;
 909         break;
 910       case LRE: /* X3 */
 911       case LRO: /* X5 */
 912         bidi_it->type_after_w1 = type;
 913         bidi_check_type (bidi_it->type_after_w1);
 914         type = WEAK_BN; /* X9/Retaining */
 915         if (bidi_it->ignore_bn_limit <= 0)
 916           {
 917             if (current_level <= BIDI_MAXLEVEL - 5)
 918               {
 919                 /* Compute the least even embedding level greater than
 920                    the current level.  */
 921                 new_level = ((current_level + 2) & ~1);
 922                 if (bidi_it->type_after_w1 == LRE)
 923                   override = NEUTRAL_DIR;
 924                 else
 925                   override = L2R;
 926                 bidi_push_embedding_level (bidi_it, new_level, override);
 927               }
 928             else
 929               {
 930                 bidi_it->invalid_levels++;
 931                 /* invalid_rl_levels counts invalid levels encountered
 932                    while the embedding level was already too high for
 933                    LRE/LRO, but not for RLE/RLO.  That is because
 934                    there may be exactly one PDF which we should not
 935                    ignore even though invalid_levels is non-zero.
 936                    invalid_rl_levels helps to know what PDF is
 937                    that.  */
 938                 if (bidi_it->invalid_rl_levels >= 0)
 939                   bidi_it->invalid_rl_levels++;
 940               }
 941           }
 942         else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
 943                  || bidi_it->next_en_pos > bidi_it->charpos)
 944           type = WEAK_EN;
 945         break;
 946       case PDF: /* X7 */
 947         bidi_it->type_after_w1 = type;
 948         bidi_check_type (bidi_it->type_after_w1);
 949         type = WEAK_BN; /* X9/Retaining */
 950         if (bidi_it->ignore_bn_limit <= 0)
 951           {
 952             if (!bidi_it->invalid_rl_levels)
 953               {
 954                 new_level = bidi_pop_embedding_level (bidi_it);
 955                 bidi_it->invalid_rl_levels = -1;
 956                 if (bidi_it->invalid_levels)
 957                   bidi_it->invalid_levels--;
 958                 /* else nothing: UAX#9 says to ignore invalid PDFs */
 959               }
 960             if (!bidi_it->invalid_levels)
 961               new_level = bidi_pop_embedding_level (bidi_it);
 962             else
 963               {
 964                 bidi_it->invalid_levels--;
 965                 bidi_it->invalid_rl_levels--;
 966               }
 967           }
 968         else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
 969                  || bidi_it->next_en_pos > bidi_it->charpos)
 970           type = WEAK_EN;
 971         break;
 972       default:
 973         /* Nothing.  */
 974         break;
 975     }
 976
 977   bidi_it->type = type;
 978   bidi_check_type (bidi_it->type);
 979
 980   return new_level;
 981 }
 982
 983 /* Given an iterator state in BIDI_IT, advance one character position
 984    in the buffer to the next character (in the logical order), resolve
 985    any explicit embeddings and directional overrides, and return the
 986    embedding level of the character after resolving explicit
 987    directives and ignoring empty embeddings.  */
 988 static int
 989 bidi_resolve_explicit (struct bidi_it *bidi_it)
 990 {
 991   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
 992   int new_level  = bidi_resolve_explicit_1 (bidi_it);
 993
 994   if (prev_level < new_level
 995       && bidi_it->type == WEAK_BN
 996       && bidi_it->ignore_bn_limit == 0 /* only if not already known */
 997       && bidi_it->bytepos < ZV_BYTE    /* not already at EOB */
 998       && bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
 999                                              + bidi_it->ch_len)))
1000     {
1001       /* Avoid pushing and popping embedding levels if the level run
1002          is empty, as this breaks level runs where it shouldn't.
1003          UAX#9 removes all the explicit embedding and override codes,
1004          so empty embeddings disappear without a trace.  We need to
1005          behave as if we did the same.  */
1006       struct bidi_it saved_it;
1007       int level = prev_level;
1008
1009       bidi_copy_it (&saved_it, bidi_it);
1010
1011       while (bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
1012                                                  + bidi_it->ch_len)))
1013         {
1014           level = bidi_resolve_explicit_1 (bidi_it);
1015         }
1016
1017       if (level == prev_level)  /* empty embedding */
1018         saved_it.ignore_bn_limit = bidi_it->charpos + 1;
1019       else                      /* this embedding is non-empty */
1020         saved_it.ignore_bn_limit = -1;
1021
1022       bidi_copy_it (bidi_it, &saved_it);
1023       if (bidi_it->ignore_bn_limit > 0)
1024         {
1025           /* We pushed a level, but we shouldn't have.  Undo that. */
1026           if (!bidi_it->invalid_rl_levels)
1027             {
1028               new_level = bidi_pop_embedding_level (bidi_it);
1029               bidi_it->invalid_rl_levels = -1;
1030               if (bidi_it->invalid_levels)
1031                 bidi_it->invalid_levels--;
1032             }
1033           if (!bidi_it->invalid_levels)
1034             new_level = bidi_pop_embedding_level (bidi_it);
1035           else
1036             {
1037               bidi_it->invalid_levels--;
1038               bidi_it->invalid_rl_levels--;
1039             }
1040         }
1041     }
1042
1043   if (bidi_it->type == NEUTRAL_B)       /* X8 */
1044     {
1045       bidi_set_paragraph_end (bidi_it);
1046       /* This is needed by bidi_resolve_weak below, and in L1.  */
1047       bidi_it->type_after_w1 = bidi_it->type;
1048       bidi_check_type (bidi_it->type_after_w1);
1049     }
1050
1051   return new_level;
1052 }
1053
1054 /* Advance in the buffer, resolve weak types and return the type of
1055    the next character after weak type resolution.  */
1056 static bidi_type_t
1057 bidi_resolve_weak (struct bidi_it *bidi_it)
1058 {
1059   bidi_type_t type;
1060   bidi_dir_t override;
1061   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1062   int new_level  = bidi_resolve_explicit (bidi_it);
1063   int next_char;
1064   bidi_type_t type_of_next;
1065   struct bidi_it saved_it;
1066
1067   type = bidi_it->type;
1068   override = bidi_it->level_stack[bidi_it->stack_idx].override;
1069
1070   if (type == UNKNOWN_BT
1071       || type == LRE
1072       || type == LRO
1073       || type == RLE
1074       || type == RLO
1075       || type == PDF)
1076     abort ();
1077
1078   if (new_level != prev_level
1079       || bidi_it->type == NEUTRAL_B)
1080     {
1081       /* We've got a new embedding level run, compute the directional
1082          type of sor and initialize per-run variables (UAX#9, clause
1083          X10).  */
1084       bidi_set_sor_type (bidi_it, prev_level, new_level);
1085     }
1086   else if (type == NEUTRAL_S || type == NEUTRAL_WS
1087            || type == WEAK_BN || type == STRONG_AL)
1088     bidi_it->type_after_w1 = type;      /* needed in L1 */
1089   bidi_check_type (bidi_it->type_after_w1);
1090
1091   /* Level and directional override status are already recorded in
1092      bidi_it, and do not need any change; see X6.  */
1093   if (override == R2L)          /* X6 */
1094     type = STRONG_R;
1095   else if (override == L2R)
1096     type = STRONG_L;
1097   else
1098     {
1099       if (type == WEAK_NSM)     /* W1 */
1100         {
1101           /* Note that we don't need to consider the case where the
1102              prev character has its type overridden by an RLO or LRO,
1103              because then either the type of this NSM would have been
1104              also overridden, or the previous character is outside the
1105              current level run, and thus not relevant to this NSM.
1106              This is why NSM gets the type_after_w1 of the previous
1107              character.  */
1108           if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1109               /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1110               && bidi_it->prev.type_after_w1 != NEUTRAL_B)
1111             type = bidi_it->prev.type_after_w1;
1112           else if (bidi_it->sor == R2L)
1113             type = STRONG_R;
1114           else if (bidi_it->sor == L2R)
1115             type = STRONG_L;
1116           else /* shouldn't happen! */
1117             abort ();
1118         }
1119       if (type == WEAK_EN       /* W2 */
1120           && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1121         type = WEAK_AN;
1122       else if (type == STRONG_AL) /* W3 */
1123         type = STRONG_R;
1124       else if ((type == WEAK_ES /* W4 */
1125                 && bidi_it->prev.type_after_w1 == WEAK_EN
1126                 && bidi_it->prev.orig_type == WEAK_EN)
1127                || (type == WEAK_CS
1128                    && ((bidi_it->prev.type_after_w1 == WEAK_EN
1129                         && bidi_it->prev.orig_type == WEAK_EN)
1130                        || bidi_it->prev.type_after_w1 == WEAK_AN)))
1131         {
1132           next_char =
1133             bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1134             ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
1135           type_of_next = bidi_get_type (next_char, override);
1136
1137           if (type_of_next == WEAK_BN
1138               || bidi_explicit_dir_char (next_char))
1139             {
1140               bidi_copy_it (&saved_it, bidi_it);
1141               while (bidi_resolve_explicit (bidi_it) == new_level
1142                      && bidi_it->type == WEAK_BN)
1143                 ;
1144               type_of_next = bidi_it->type;
1145               bidi_copy_it (bidi_it, &saved_it);
1146             }
1147
1148           /* If the next character is EN, but the last strong-type
1149              character is AL, that next EN will be changed to AN when
1150              we process it in W2 above.  So in that case, this ES
1151              should not be changed into EN.  */
1152           if (type == WEAK_ES
1153               && type_of_next == WEAK_EN
1154               && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1155             type = WEAK_EN;
1156           else if (type == WEAK_CS)
1157             {
1158               if (bidi_it->prev.type_after_w1 == WEAK_AN
1159                   && (type_of_next == WEAK_AN
1160                       /* If the next character is EN, but the last
1161                          strong-type character is AL, EN will be later
1162                          changed to AN when we process it in W2 above.
1163                          So in that case, this ES should not be
1164                          changed into EN.  */
1165                       || (type_of_next == WEAK_EN
1166                           && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1167                 type = WEAK_AN;
1168               else if (bidi_it->prev.type_after_w1 == WEAK_EN
1169                        && type_of_next == WEAK_EN
1170                        && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1171                 type = WEAK_EN;
1172             }
1173         }
1174       else if (type == WEAK_ET  /* W5: ET with EN before or after it */
1175                || type == WEAK_BN)      /* W5/Retaining */
1176         {
1177           if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
1178               || bidi_it->next_en_pos > bidi_it->charpos)
1179             type = WEAK_EN;
1180           else                  /* W5: ET/BN with EN after it.  */
1181             {
1182               EMACS_INT en_pos = bidi_it->charpos + 1;
1183
1184               next_char =
1185                 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1186                 ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
1187               type_of_next = bidi_get_type (next_char, override);
1188
1189               if (type_of_next == WEAK_ET
1190                   || type_of_next == WEAK_BN
1191                   || bidi_explicit_dir_char (next_char))
1192                 {
1193                   bidi_copy_it (&saved_it, bidi_it);
1194                   while (bidi_resolve_explicit (bidi_it) == new_level
1195                          && (bidi_it->type == WEAK_BN
1196                              || bidi_it->type == WEAK_ET))
1197                     ;
1198                   type_of_next = bidi_it->type;
1199                   en_pos = bidi_it->charpos;
1200                   bidi_copy_it (bidi_it, &saved_it);
1201                 }
1202               if (type_of_next == WEAK_EN)
1203                 {
1204                   /* If the last strong character is AL, the EN we've
1205                      found will become AN when we get to it (W2). */
1206                   if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
1207                     {
1208                       type = WEAK_EN;
1209                       /* Remember this EN position, to speed up processing
1210                          of the next ETs.  */
1211                       bidi_it->next_en_pos = en_pos;
1212                     }
1213                   else if (type == WEAK_BN)
1214                     type = NEUTRAL_ON; /* W6/Retaining */
1215                 }
1216             }
1217         }
1218     }
1219
1220   if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
1221       || (type == WEAK_BN
1222           && (bidi_it->prev.type_after_w1 == WEAK_CS        /* W6/Retaining */
1223               || bidi_it->prev.type_after_w1 == WEAK_ES
1224               || bidi_it->prev.type_after_w1 == WEAK_ET)))
1225     type = NEUTRAL_ON;
1226
1227   /* Store the type we've got so far, before we clobber it with strong
1228      types in W7 and while resolving neutral types.  But leave alone
1229      the original types that were recorded above, because we will need
1230      them for the L1 clause.  */
1231   if (bidi_it->type_after_w1 == UNKNOWN_BT)
1232     bidi_it->type_after_w1 = type;
1233   bidi_check_type (bidi_it->type_after_w1);
1234
1235   if (type == WEAK_EN)  /* W7 */
1236     {
1237       if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
1238           || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1239         type = STRONG_L;
1240     }
1241
1242   bidi_it->type = type;
1243   bidi_check_type (bidi_it->type);
1244   return type;
1245 }
1246
1247 static bidi_type_t
1248 bidi_resolve_neutral (struct bidi_it *bidi_it)
1249 {
1250   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1251   bidi_type_t type = bidi_resolve_weak (bidi_it);
1252   int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1253
1254   if (!(type == STRONG_R
1255         || type == STRONG_L
1256         || type == WEAK_BN
1257         || type == WEAK_EN
1258         || type == WEAK_AN
1259         || type == NEUTRAL_B
1260         || type == NEUTRAL_S
1261         || type == NEUTRAL_WS
1262         || type == NEUTRAL_ON))
1263     abort ();
1264
1265   if (bidi_get_category (type) == NEUTRAL
1266       || (type == WEAK_BN && prev_level == current_level))
1267     {
1268       if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1269         type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1270                                        bidi_it->next_for_neutral.type,
1271                                        current_level);
1272       else
1273         {
1274           /* Arrrgh!!  The UAX#9 algorithm is too deeply entrenched in
1275              the assumption of batch-style processing; see clauses W4,
1276              W5, and especially N1, which require to look far forward
1277              (as well as back) in the buffer.  May the fleas of a
1278              thousand camels infest the armpits of those who design
1279              supposedly general-purpose algorithms by looking at their
1280              own implementations, and fail to consider other possible
1281              implementations!  */
1282           struct bidi_it saved_it;
1283           bidi_type_t next_type;
1284
1285           if (bidi_it->scan_dir == -1)
1286             abort ();
1287
1288           bidi_copy_it (&saved_it, bidi_it);
1289           /* Scan the text forward until we find the first non-neutral
1290              character, and then use that to resolve the neutral we
1291              are dealing with now.  We also cache the scanned iterator
1292              states, to salvage some of the effort later.  */
1293           bidi_cache_iterator_state (bidi_it, 0);
1294           do {
1295             /* Record the info about the previous character, so that
1296                it will be cached below with this state.  */
1297             if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1298                 && bidi_it->type != WEAK_BN)
1299               bidi_remember_char (&bidi_it->prev, bidi_it);
1300             type = bidi_resolve_weak (bidi_it);
1301             /* Paragraph separators have their levels fully resolved
1302                at this point, so cache them as resolved.  */
1303             bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1304             /* FIXME: implement L1 here, by testing for a newline and
1305                resetting the level for any sequence of whitespace
1306                characters adjacent to it.  */
1307           } while (!(type == NEUTRAL_B
1308                      || (type != WEAK_BN
1309                          && bidi_get_category (type) != NEUTRAL)
1310                      /* This is all per level run, so stop when we
1311                         reach the end of this level run.  */
1312                      || bidi_it->level_stack[bidi_it->stack_idx].level !=
1313                      current_level));
1314
1315           bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1316
1317           switch (type)
1318             {
1319               case STRONG_L:
1320               case STRONG_R:
1321               case STRONG_AL:
1322                 next_type = type;
1323                 break;
1324               case WEAK_EN:
1325               case WEAK_AN:
1326                 /* N1: ``European and Arabic numbers are treated as
1327                    though they were R.''  */
1328                 next_type = STRONG_R;
1329                 saved_it.next_for_neutral.type = STRONG_R;
1330                 break;
1331               case WEAK_BN:
1332                 if (!bidi_explicit_dir_char (bidi_it->ch))
1333                   abort ();             /* can't happen: BNs are skipped */
1334                 /* FALLTHROUGH */
1335               case NEUTRAL_B:
1336                 /* Marched all the way to the end of this level run.
1337                    We need to use the eor type, whose information is
1338                    stored by bidi_set_sor_type in the prev_for_neutral
1339                    member.  */
1340                 if (saved_it.type != WEAK_BN
1341                     || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
1342                   {
1343                     next_type = bidi_it->prev_for_neutral.type;
1344                     saved_it.next_for_neutral.type = next_type;
1345                     bidi_check_type (next_type);
1346                   }
1347                 else
1348                   {
1349                     /* This is a BN which does not adjoin neutrals.
1350                        Leave its type alone.  */
1351                     bidi_copy_it (bidi_it, &saved_it);
1352                     return bidi_it->type;
1353                   }
1354                 break;
1355               default:
1356                 abort ();
1357             }
1358           type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1359                                          next_type, current_level);
1360           saved_it.type = type;
1361           bidi_check_type (type);
1362           bidi_copy_it (bidi_it, &saved_it);
1363         }
1364     }
1365   return type;
1366 }
1367
1368 /* Given an iterator state in BIDI_IT, advance one character position
1369    in the buffer to the next character (in the logical order), resolve
1370    the bidi type of that next character, and return that type.  */
1371 static bidi_type_t
1372 bidi_type_of_next_char (struct bidi_it *bidi_it)
1373 {
1374   bidi_type_t type;
1375
1376   /* This should always be called during a forward scan.  */
1377   if (bidi_it->scan_dir != 1)
1378     abort ();
1379
1380   /* Reset the limit until which to ignore BNs if we step out of the
1381      area where we found only empty levels.  */
1382   if ((bidi_it->ignore_bn_limit > 0
1383        && bidi_it->ignore_bn_limit <= bidi_it->charpos)
1384       || (bidi_it->ignore_bn_limit == -1
1385           && !bidi_explicit_dir_char (bidi_it->ch)))
1386     bidi_it->ignore_bn_limit = 0;
1387
1388   type = bidi_resolve_neutral (bidi_it);
1389
1390   return type;
1391 }
1392
1393 /* Given an iterator state BIDI_IT, advance one character position in
1394    the buffer to the next character (in the logical order), resolve
1395    the embedding and implicit levels of that next character, and
1396    return the resulting level.  */
1397 static int
1398 bidi_level_of_next_char (struct bidi_it *bidi_it)
1399 {
1400   bidi_type_t type;
1401   int level, prev_level = -1;
1402   struct bidi_saved_info next_for_neutral;
1403
1404   if (bidi_it->scan_dir == 1)
1405     {
1406       /* There's no sense in trying to advance if we hit end of text.  */
1407       if (bidi_it->bytepos >= ZV_BYTE)
1408         return bidi_it->resolved_level;
1409
1410       /* Record the info about the previous character.  */
1411       if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1412           && bidi_it->type != WEAK_BN)
1413         bidi_remember_char (&bidi_it->prev, bidi_it);
1414       if (bidi_it->type_after_w1 == STRONG_R
1415           || bidi_it->type_after_w1 == STRONG_L
1416           || bidi_it->type_after_w1 == STRONG_AL)
1417         bidi_remember_char (&bidi_it->last_strong, bidi_it);
1418       /* FIXME: it sounds like we don't need both prev and
1419          prev_for_neutral members, but I'm leaving them both for now.  */
1420       if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1421           || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1422         bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1423
1424       /* If we overstepped the characters used for resolving neutrals
1425          and whitespace, invalidate their info in the iterator.  */
1426       if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1427         bidi_it->next_for_neutral.type = UNKNOWN_BT;
1428       if (bidi_it->next_en_pos >= 0
1429           && bidi_it->charpos >= bidi_it->next_en_pos)
1430         bidi_it->next_en_pos = -1;
1431       if (bidi_it->next_for_ws.type != UNKNOWN_BT
1432           && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1433         bidi_it->next_for_ws.type = UNKNOWN_BT;
1434
1435       /* This must be taken before we fill the iterator with the info
1436          about the next char.  If we scan backwards, the iterator
1437          state must be already cached, so there's no need to know the
1438          embedding level of the previous character, since we will be
1439          returning to our caller shortly.  */
1440       prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1441     }
1442   next_for_neutral = bidi_it->next_for_neutral;
1443
1444   /* Perhaps it is already cached.  */
1445   type = bidi_cache_find (bidi_it->charpos + bidi_it->scan_dir, -1, bidi_it);
1446   if (type != UNKNOWN_BT)
1447     {
1448       /* Don't lose the information for resolving neutrals!  The
1449          cached states could have been cached before their
1450          next_for_neutral member was computed.  If we are on our way
1451          forward, we can simply take the info from the previous
1452          state.  */
1453       if (bidi_it->scan_dir == 1
1454           && bidi_it->next_for_neutral.type == UNKNOWN_BT)
1455         bidi_it->next_for_neutral = next_for_neutral;
1456
1457       /* If resolved_level is -1, it means this state was cached
1458          before it was completely resolved, so we cannot return
1459          it.  */
1460       if (bidi_it->resolved_level != -1)
1461         return bidi_it->resolved_level;
1462     }
1463   if (bidi_it->scan_dir == -1)
1464     /* If we are going backwards, the iterator state is already cached
1465        from previous scans, and should be fully resolved.  */
1466     abort ();
1467
1468   if (type == UNKNOWN_BT)
1469     type = bidi_type_of_next_char (bidi_it);
1470
1471   if (type == NEUTRAL_B)
1472     return bidi_it->resolved_level;
1473
1474   level = bidi_it->level_stack[bidi_it->stack_idx].level;
1475   if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
1476       || (type == WEAK_BN && prev_level == level))
1477     {
1478       if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1479         abort ();
1480
1481       /* If the cached state shows a neutral character, it was not
1482          resolved by bidi_resolve_neutral, so do it now.  */
1483       type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1484                                      bidi_it->next_for_neutral.type,
1485                                      level);
1486     }
1487
1488   if (!(type == STRONG_R
1489         || type == STRONG_L
1490         || type == WEAK_BN
1491         || type == WEAK_EN
1492         || type == WEAK_AN))
1493     abort ();
1494   bidi_it->type = type;
1495   bidi_check_type (bidi_it->type);
1496
1497   /* For L1 below, we need to know, for each WS character, whether
1498      it belongs to a sequence of WS characters preceeding a newline
1499      or a TAB or a paragraph separator.  */
1500   if (bidi_it->orig_type == NEUTRAL_WS
1501       && bidi_it->next_for_ws.type == UNKNOWN_BT)
1502     {
1503       int ch;
1504       int clen = bidi_it->ch_len;
1505       EMACS_INT bpos = bidi_it->bytepos;
1506       EMACS_INT cpos = bidi_it->charpos;
1507       bidi_type_t chtype;
1508
1509       do {
1510         /*_fetch_multibyte_char_len = 1;*/
1511         ch = bpos + clen >= ZV_BYTE ? BIDI_EOB : FETCH_CHAR (bpos + clen);
1512         bpos += clen;
1513         cpos++;
1514         clen = (ch == BIDI_EOB ? 1 : CHAR_BYTES (ch));
1515         if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
1516           chtype = NEUTRAL_B;
1517         else
1518           chtype = bidi_get_type (ch, NEUTRAL_DIR);
1519       } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
1520                || bidi_explicit_dir_char (ch)); /* L1/Retaining */
1521       bidi_it->next_for_ws.type = chtype;
1522       bidi_check_type (bidi_it->next_for_ws.type);
1523       bidi_it->next_for_ws.charpos = cpos;
1524       bidi_it->next_for_ws.bytepos = bpos;
1525     }
1526
1527   /* Resolve implicit levels, with a twist: PDFs get the embedding
1528      level of the enbedding they terminate.  See below for the
1529      reason.  */
1530   if (bidi_it->orig_type == PDF
1531       /* Don't do this if this formatting code didn't change the
1532          embedding level due to invalid or empty embeddings.  */
1533       && prev_level != level)
1534     {
1535       /* Don't look in UAX#9 for the reason for this: it's our own
1536          private quirk.  The reason is that we want the formatting
1537          codes to be delivered so that they bracket the text of their
1538          embedding.  For example, given the text
1539
1540              {RLO}teST{PDF}
1541
1542          we want it to be displayed as
1543
1544              {RLO}STet{PDF}
1545
1546          not as
1547
1548              STet{RLO}{PDF}
1549
1550          which will result because we bump up the embedding level as
1551          soon as we see the RLO and pop it as soon as we see the PDF,
1552          so RLO itself has the same embedding level as "teST", and
1553          thus would be normally delivered last, just before the PDF.
1554          The switch below fiddles with the level of PDF so that this
1555          ugly side effect does not happen.
1556
1557          (This is, of course, only important if the formatting codes
1558          are actually displayed, but Emacs does need to display them
1559          if the user wants to.)  */
1560       level = prev_level;
1561     }
1562   else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
1563            || bidi_it->orig_type == NEUTRAL_S
1564            || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
1565            /* || bidi_it->ch == LINESEP_CHAR */
1566            || (bidi_it->orig_type == NEUTRAL_WS
1567                && (bidi_it->next_for_ws.type == NEUTRAL_B
1568                    || bidi_it->next_for_ws.type == NEUTRAL_S)))
1569     level = bidi_it->level_stack[0].level;
1570   else if ((level & 1) == 0) /* I1 */
1571     {
1572       if (type == STRONG_R)
1573         level++;
1574       else if (type == WEAK_EN || type == WEAK_AN)
1575         level += 2;
1576     }
1577   else                  /* I2 */
1578     {
1579       if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
1580         level++;
1581     }
1582
1583   bidi_it->resolved_level = level;
1584   return level;
1585 }
1586
1587 /* Move to the other edge of a level given by LEVEL.  If END_FLAG is
1588    non-zero, we are at the end of a level, and we need to prepare to
1589    resume the scan of the lower level.
1590
1591    If this level's other edge is cached, we simply jump to it, filling
1592    the iterator structure with the iterator state on the other edge.
1593    Otherwise, we walk the buffer until we come back to the same level
1594    as LEVEL.
1595
1596    Note: we are not talking here about a ``level run'' in the UAX#9
1597    sense of the term, but rather about a ``level'' which includes
1598    all the levels higher than it.  In other words, given the levels
1599    like this:
1600
1601          11111112222222333333334443343222222111111112223322111
1602                 A      B                    C
1603
1604    and assuming we are at point A scanning left to right, this
1605    function moves to point C, whereas the UAX#9 ``level 2 run'' ends
1606    at point B.  */
1607 static void
1608 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
1609 {
1610   int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
1611   int idx;
1612
1613   /* Try the cache first.  */
1614   if ((idx = bidi_cache_find_level_change (level, dir, end_flag)) >= 0)
1615     bidi_cache_fetch_state (idx, bidi_it);
1616   else
1617     {
1618       int new_level;
1619
1620       if (end_flag)
1621         abort (); /* if we are at end of level, its edges must be cached */
1622
1623       bidi_cache_iterator_state (bidi_it, 1);
1624       do {
1625         new_level = bidi_level_of_next_char (bidi_it);
1626         bidi_cache_iterator_state (bidi_it, 1);
1627       } while (new_level >= level);
1628     }
1629 }
1630
1631 void
1632 bidi_move_to_visually_next (struct bidi_it *bidi_it)
1633 {
1634   int old_level, new_level, next_level;
1635   struct bidi_it sentinel;
1636
1637   if (bidi_it->scan_dir == 0)
1638     {
1639       bidi_it->scan_dir = 1;    /* default to logical order */
1640     }
1641
1642   /* If we just passed a newline, initialize for the next line.  */
1643   if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
1644     bidi_line_init (bidi_it);
1645
1646   /* Prepare the sentinel iterator state, and cache it.  When we bump
1647      into it, scanning backwards, we'll know that the last non-base
1648      level is exhausted.  */
1649   if (bidi_cache_idx == 0)
1650     {
1651       bidi_copy_it (&sentinel, bidi_it);
1652       if (bidi_it->first_elt)
1653         {
1654           sentinel.charpos--;   /* cached charpos needs to be monotonic */
1655           sentinel.bytepos--;
1656           sentinel.ch = '\n';   /* doesn't matter, but why not? */
1657           sentinel.ch_len = 1;
1658         }
1659       bidi_cache_iterator_state (&sentinel, 1);
1660     }
1661
1662   old_level = bidi_it->resolved_level;
1663   new_level = bidi_level_of_next_char (bidi_it);
1664
1665   /* Reordering of resolved levels (clause L2) is implemented by
1666      jumping to the other edge of the level and flipping direction of
1667      scanning the text whenever we find a level change.  */
1668   if (new_level != old_level)
1669     {
1670       int ascending = new_level > old_level;
1671       int level_to_search = ascending ? old_level + 1 : old_level;
1672       int incr = ascending ? 1 : -1;
1673       int expected_next_level = old_level + incr;
1674
1675       /* Jump (or walk) to the other edge of this level.  */
1676       bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1677       /* Switch scan direction and peek at the next character in the
1678          new direction.  */
1679       bidi_it->scan_dir = -bidi_it->scan_dir;
1680
1681       /* The following loop handles the case where the resolved level
1682          jumps by more than one.  This is typical for numbers inside a
1683          run of text with left-to-right embedding direction, but can
1684          also happen in other situations.  In those cases the decision
1685          where to continue after a level change, and in what direction,
1686          is tricky.  For example, given a text like below:
1687
1688                   abcdefgh
1689                   11336622
1690
1691          (where the numbers below the text show the resolved levels),
1692          the result of reordering according to UAX#9 should be this:
1693
1694                   efdcghba
1695
1696          This is implemented by the loop below which flips direction
1697          and jumps to the other edge of the level each time it finds
1698          the new level not to be the expected one.  The expected level
1699          is always one more or one less than the previous one.  */
1700       next_level = bidi_peek_at_next_level (bidi_it);
1701       while (next_level != expected_next_level)
1702         {
1703           expected_next_level += incr;
1704           level_to_search += incr;
1705           bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1706           bidi_it->scan_dir = -bidi_it->scan_dir;
1707           next_level = bidi_peek_at_next_level (bidi_it);
1708         }
1709
1710       /* Finally, deliver the next character in the new direction.  */
1711       next_level = bidi_level_of_next_char (bidi_it);
1712     }
1713
1714   /* Take note when we have just processed the newline that precedes
1715      the end of the paragraph.  The next time we are about to be
1716      called, set_iterator_to_next will automatically reinit the
1717      paragraph direction, if needed.  We do this at the newline before
1718      the paragraph separator, because the next character might not be
1719      the first character of the next paragraph, due to the bidi
1720      reordering, whereas we _must_ know the paragraph base direction
1721      _before_ we process the paragraph's text, since the base
1722      direction affects the reordering.  */
1723   if (bidi_it->scan_dir == 1
1724       && bidi_it->orig_type == NEUTRAL_B
1725       && bidi_it->bytepos < ZV_BYTE)
1726     {
1727       EMACS_INT sep_len =
1728         bidi_at_paragraph_end (bidi_it->charpos + 1,
1729                                bidi_it->bytepos + bidi_it->ch_len);
1730       if (sep_len >= 0)
1731         {
1732           bidi_it->new_paragraph = 1;
1733           /* Record the buffer position of the last character of the
1734              paragraph separator.  */
1735           bidi_it->separator_limit = bidi_it->charpos + 1 + sep_len;
1736         }
1737     }
1738
1739   if (bidi_it->scan_dir == 1 && bidi_cache_idx)
1740     {
1741       /* If we are at paragraph's base embedding level and beyond the
1742          last cached position, the cache's job is done and we can
1743          discard it.  */
1744       if (bidi_it->resolved_level == bidi_it->level_stack[0].level
1745           && bidi_it->charpos > bidi_cache[bidi_cache_idx - 1].charpos)
1746         bidi_cache_reset ();
1747         /* But as long as we are caching during forward scan, we must
1748            cache each state, or else the cache integrity will be
1749            compromised: it assumes cached states correspond to buffer
1750            positions 1:1.  */
1751       else
1752         bidi_cache_iterator_state (bidi_it, 1);
1753     }
1754 }
1755
1756 /* This is meant to be called from within the debugger, whenever you
1757    wish to examine the cache contents.  */
1758 void
1759 bidi_dump_cached_states (void)
1760 {
1761   int i;
1762   int ndigits = 1;
1763
1764   if (bidi_cache_idx == 0)
1765     {
1766       fprintf (stderr, "The cache is empty.\n");
1767       return;
1768     }
1769   fprintf (stderr, "Total of %d state%s in cache:\n",
1770            bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
1771
1772   for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
1773     ndigits++;
1774   fputs ("ch  ", stderr);
1775   for (i = 0; i < bidi_cache_idx; i++)
1776     fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
1777   fputs ("\n", stderr);
1778   fputs ("lvl ", stderr);
1779   for (i = 0; i < bidi_cache_idx; i++)
1780     fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
1781   fputs ("\n", stderr);
1782   fputs ("pos ", stderr);
1783   for (i = 0; i < bidi_cache_idx; i++)
1784     fprintf (stderr, "%*d", ndigits, bidi_cache[i].charpos);
1785   fputs ("\n", stderr);
1786 }