src/bidi.c

   1 /* Low-level bidirectional buffer-scanning functions for GNU Emacs.
   2    Copyright (C) 2000, 2001, 2004, 2005, 2009, 2010
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GNU Emacs.
   6
   7 GNU Emacs is free software: you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation, either version 3 of the License, or
  10 (at your option) any later version.
  11
  12 GNU Emacs is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  19
  20 /* Written by Eli Zaretskii <eliz@gnu.org>.
  21
  22    A sequential implementation of the Unicode Bidirectional algorithm,
  23    as per UAX#9, a part of the Unicode Standard.
  24
  25    Unlike the reference and most other implementations, this one is
  26    designed to be called once for every character in the buffer or
  27    string.
  28
  29    The main entry point is bidi_move_to_visually_next.  Each time it
  30    is called, it finds the next character in the visual order, and
  31    returns its information in a special structure.  The caller is then
  32    expected to process this character for display or any other
  33    purposes, and call bidi_move_to_visually_next for the next
  34    character.  See the comments in bidi_move_to_visually_next for more
  35    details about its algorithm that finds the next visual-order
  36    character by resolving their levels on the fly.
  37
  38    The two other entry points are bidi_paragraph_init and
  39    bidi_mirror_char.  The first determines the base direction of a
  40    paragraph, while the second returns the mirrored version of its
  41    argument character.
  42
  43    If you want to understand the code, you will have to read it
  44    together with the relevant portions of UAX#9.  The comments include
  45    references to UAX#9 rules, for that very reason.
  46
  47    A note about references to UAX#9 rules: if the reference says
  48    something like "X9/Retaining", it means that you need to refer to
   49    rule X9 and to its modifications described in the "Implementation
  50    Notes" section of UAX#9, under "Retaining Format Codes".  */
  51
#include <config.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <setjmp.h>

#include "lisp.h"
#include "buffer.h"
#include "character.h"
#include "dispextern.h"
  61
/* Non-zero once bidi_initialize has run and the tables and regexps
   below have been set up.  */
static int bidi_initialized = 0;

/* Char-tables built by bidi_initialize.  bidi_type_table maps a
   character to its bidi type, stored as a Lisp integer (default:
   STRONG_L).  bidi_mirror_table maps a character to its mirrored
   counterpart, or to nil when it has none.  */
static Lisp_Object bidi_type_table, bidi_mirror_table;

/* Code points of the directional marks (LRM, RLM) and of the explicit
   embedding, override and pop formatting characters.  */
/* FIXME: Remove these when bidi_explicit_dir_char uses a lookup table.  */
#define LRM_CHAR   0x200E
#define RLM_CHAR   0x200F
#define LRE_CHAR   0x202A
#define RLE_CHAR   0x202B
#define PDF_CHAR   0x202C
#define LRO_CHAR   0x202D
#define RLO_CHAR   0x202E

/* Negative pseudo-character codes.  BIDI_EOB is used as the
   "character" at or beyond the end of the accessible buffer text;
   bidi_get_type reports it as NEUTRAL_B.  */
#define BIDI_EOB   -1
#define BIDI_BOB   -2           /* FIXME: Is this needed? */
  77
  78 /* Local data structures.  (Look in dispextern.h for the rest.)  */
  79
  80 /* What we need to know about the current paragraph.  */
  81 struct bidi_paragraph_info {
  82   int start_bytepos;    /* byte position where it begins */
  83   int end_bytepos;      /* byte position where it ends */
  84   int embedding_level;  /* its basic embedding level */
  85   bidi_dir_t base_dir;  /* its base direction */
  86 };
  87
  88 /* Data type for describing the bidirectional character categories.  */
  89 typedef enum {
  90   UNKNOWN_BC,
  91   NEUTRAL,
  92   WEAK,
  93   STRONG
  94 } bidi_category_t;
  95
/* If non-zero, bidi_paragraph_init skips RLE, RLO, LRE and LRO when it
   looks for the first strong character that determines the paragraph
   direction.  */
int bidi_ignore_explicit_marks_for_paragraph_level = 1;

/* Regexps used to find paragraph boundaries.  bidi_initialize takes
   them from the values of the symbols `paragraph-start' and
   `paragraph-separate', falling back to built-in defaults when those
   values are not strings.  */
static Lisp_Object paragraph_start_re, paragraph_separate_re;
/* The symbols `paragraph-start' and `paragraph-separate', interned by
   bidi_initialize.  */
static Lisp_Object Qparagraph_start, Qparagraph_separate;
 100
 101 static void
 102 bidi_initialize (void)
 103 {
 104
 105 #include "biditype.h"
 106 #include "bidimirror.h"
 107
 108   int i;
 109
 110   bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L));
 111   staticpro (&bidi_type_table);
 112
 113   for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++)
 114     char_table_set_range (bidi_type_table, bidi_type[i].from, bidi_type[i].to,
 115                           make_number (bidi_type[i].type));
 116
 117   bidi_mirror_table = Fmake_char_table (Qnil, Qnil);
 118   staticpro (&bidi_mirror_table);
 119
 120   for (i = 0; i < sizeof bidi_mirror / sizeof bidi_mirror[0]; i++)
 121     char_table_set (bidi_mirror_table, bidi_mirror[i].from,
 122                     make_number (bidi_mirror[i].to));
 123
 124   Qparagraph_start = intern ("paragraph-start");
 125   staticpro (&Qparagraph_start);
 126   paragraph_start_re = Fsymbol_value (Qparagraph_start);
 127   if (!STRINGP (paragraph_start_re))
 128     paragraph_start_re = build_string ("\f\\|[ \t]*$");
 129   staticpro (&paragraph_start_re);
 130   Qparagraph_separate = intern ("paragraph-separate");
 131   staticpro (&Qparagraph_separate);
 132   paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
 133   if (!STRINGP (paragraph_separate_re))
 134     paragraph_separate_re = build_string ("[ \t\f]*$");
 135   staticpro (&paragraph_separate_re);
 136   bidi_initialized = 1;
 137 }
 138
 139 /* Return the bidi type of a character CH, subject to the current
 140    directional OVERRIDE.  */
 141 static INLINE bidi_type_t
 142 bidi_get_type (int ch, bidi_dir_t override)
 143 {
 144   bidi_type_t default_type;
 145
 146   if (ch == BIDI_EOB)
 147     return NEUTRAL_B;
 148   if (ch < 0 || ch > MAX_CHAR)
 149     abort ();
 150
 151   default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
 152
 153   if (override == NEUTRAL_DIR)
 154     return default_type;
 155
 156   switch (default_type)
 157     {
 158       /* Although UAX#9 does not tell, it doesn't make sense to
 159          override NEUTRAL_B and LRM/RLM characters.  */
 160       case NEUTRAL_B:
 161       case LRE:
 162       case LRO:
 163       case RLE:
 164       case RLO:
 165       case PDF:
 166         return default_type;
 167       default:
 168         switch (ch)
 169           {
 170             case LRM_CHAR:
 171             case RLM_CHAR:
 172               return default_type;
 173             default:
 174               if (override == L2R) /* X6 */
 175                 return STRONG_L;
 176               else if (override == R2L)
 177                 return STRONG_R;
 178               else
 179                 abort ();       /* can't happen: handled above */
 180           }
 181     }
 182 }
 183
 184 void
 185 bidi_check_type (bidi_type_t type)
 186 {
 187   if (type < UNKNOWN_BT || type > NEUTRAL_ON)
 188     abort ();
 189 }
 190
 191 /* Given a bidi TYPE of a character, return its category.  */
 192 static INLINE bidi_category_t
 193 bidi_get_category (bidi_type_t type)
 194 {
 195   switch (type)
 196     {
 197       case UNKNOWN_BT:
 198         return UNKNOWN_BC;
 199       case STRONG_L:
 200       case STRONG_R:
 201       case STRONG_AL:
 202       case LRE:
 203       case LRO:
 204       case RLE:
 205       case RLO:
 206         return STRONG;
 207       case PDF:         /* ??? really?? */
 208       case WEAK_EN:
 209       case WEAK_ES:
 210       case WEAK_ET:
 211       case WEAK_AN:
 212       case WEAK_CS:
 213       case WEAK_NSM:
 214       case WEAK_BN:
 215         return WEAK;
 216       case NEUTRAL_B:
 217       case NEUTRAL_S:
 218       case NEUTRAL_WS:
 219       case NEUTRAL_ON:
 220         return NEUTRAL;
 221       default:
 222         abort ();
 223     }
 224 }
 225
 226 /* Return the mirrored character of C, if it has one.  If C has no
 227    mirrored counterpart, return C.
 228    Note: The conditions in UAX#9 clause L4 regarding the surrounding
 229    context must be tested by the caller.  */
 230 int
 231 bidi_mirror_char (int c)
 232 {
 233   Lisp_Object val;
 234
 235   if (c == BIDI_EOB)
 236     return c;
 237   if (c < 0 || c > MAX_CHAR)
 238     abort ();
 239
 240   val = CHAR_TABLE_REF (bidi_mirror_table, c);
 241   if (INTEGERP (val))
 242     {
 243       int v = XINT (val);
 244
 245       if (v < 0 || v > MAX_CHAR)
 246         abort ();
 247
 248       return v;
 249     }
 250
 251   return c;
 252 }
 253
 254 /* Copy the bidi iterator from FROM to TO.  To save cycles, this only
 255    copies the part of the level stack that is actually in use.  */
 256 static INLINE void
 257 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
 258 {
 259   int i;
 260
 261   /* Copy everything except the level stack and beyond.  */
 262   memcpy (to, from, ((size_t)&((struct bidi_it *)0)->level_stack[0]));
 263
 264   /* Copy the active part of the level stack.  */
 265   to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
 266   for (i = 1; i <= from->stack_idx; i++)
 267     to->level_stack[i] = from->level_stack[i];
 268 }
 269
/* Caching the bidi iterator states.  */

/* Number of slots by which the cache grows, and the size down to which
   bidi_cache_shrink trims it.  */
#define BIDI_CACHE_CHUNK 200
/* The cache itself: an array of iterator states, resized with
   xrealloc.  */
static struct bidi_it *bidi_cache;
/* Number of slots currently allocated in bidi_cache.  */
static size_t bidi_cache_size = 0;
/* Size in bytes of one cache slot.  */
static size_t elsz = sizeof (struct bidi_it);
static int bidi_cache_idx;      /* next unused cache slot */
static int bidi_cache_last_idx; /* slot of last cache hit */
 278
 279 static INLINE void
 280 bidi_cache_reset (void)
 281 {
 282   bidi_cache_idx = 0;
 283   bidi_cache_last_idx = -1;
 284 }
 285
 286 static INLINE void
 287 bidi_cache_shrink (void)
 288 {
 289   if (bidi_cache_size > BIDI_CACHE_CHUNK)
 290     {
 291       bidi_cache_size = BIDI_CACHE_CHUNK;
 292       bidi_cache =
 293         (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
 294     }
 295   bidi_cache_reset ();
 296 }
 297
 298 static INLINE void
 299 bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it)
 300 {
 301   int current_scan_dir = bidi_it->scan_dir;
 302
 303   if (idx < 0 || idx >= bidi_cache_idx)
 304     abort ();
 305
 306   bidi_copy_it (bidi_it, &bidi_cache[idx]);
 307   bidi_it->scan_dir = current_scan_dir;
 308   bidi_cache_last_idx = idx;
 309 }
 310
 311 /* Find a cached state with a given CHARPOS and resolved embedding
 312    level less or equal to LEVEL.  if LEVEL is -1, disregard the
 313    resolved levels in cached states.  DIR, if non-zero, means search
 314    in that direction from the last cache hit.  */
 315 static INLINE int
 316 bidi_cache_search (int charpos, int level, int dir)
 317 {
 318   int i, i_start;
 319
 320   if (bidi_cache_idx)
 321     {
 322       if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
 323         dir = -1;
 324       else if (charpos > bidi_cache[bidi_cache_last_idx].charpos)
 325         dir = 1;
 326       if (dir)
 327         i_start = bidi_cache_last_idx;
 328       else
 329         {
 330           dir = -1;
 331           i_start = bidi_cache_idx - 1;
 332         }
 333
 334       if (dir < 0)
 335         {
 336           /* Linear search for now; FIXME!  */
 337           for (i = i_start; i >= 0; i--)
 338             if (bidi_cache[i].charpos == charpos
 339                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 340               return i;
 341         }
 342       else
 343         {
 344           for (i = i_start; i < bidi_cache_idx; i++)
 345             if (bidi_cache[i].charpos == charpos
 346                 && (level == -1 || bidi_cache[i].resolved_level <= level))
 347               return i;
 348         }
 349     }
 350
 351   return -1;
 352 }
 353
 354 /* Find a cached state where the resolved level changes to a value
 355    that is lower than LEVEL, and return its cache slot index.  DIR is
 356    the direction to search, starting with the last used cache slot.
 357    BEFORE, if non-zero, means return the index of the slot that is
 358    ``before'' the level change in the search direction.  That is,
 359    given the cached levels like this:
 360
 361          1122333442211
 362           AB        C
 363
 364    and assuming we are at the position cached at the slot marked with
 365    C, searching backwards (DIR = -1) for LEVEL = 2 will return the
 366    index of slot B or A, depending whether BEFORE is, respectively,
 367    non-zero or zero.  */
 368 static int
 369 bidi_cache_find_level_change (int level, int dir, int before)
 370 {
 371   if (bidi_cache_idx)
 372     {
 373       int i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
 374       int incr = before ? 1 : 0;
 375
 376       if (!dir)
 377         dir = -1;
 378       else if (!incr)
 379         i += dir;
 380
 381       if (dir < 0)
 382         {
 383           while (i >= incr)
 384             {
 385               if (bidi_cache[i - incr].resolved_level >= 0
 386                   && bidi_cache[i - incr].resolved_level < level)
 387                 return i;
 388               i--;
 389             }
 390         }
 391       else
 392         {
 393           while (i < bidi_cache_idx - incr)
 394             {
 395               if (bidi_cache[i + incr].resolved_level >= 0
 396                   && bidi_cache[i + incr].resolved_level < level)
 397                 return i;
 398               i++;
 399             }
 400         }
 401     }
 402
 403   return -1;
 404 }
 405
 406 static INLINE void
 407 bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
 408 {
 409   int idx;
 410
 411   /* We should never cache on backward scans.  */
 412   if (bidi_it->scan_dir == -1)
 413     abort ();
 414   idx = bidi_cache_search (bidi_it->charpos, -1, 1);
 415
 416   if (idx < 0)
 417     {
 418       idx = bidi_cache_idx;
 419       /* Enlarge the cache as needed.  */
 420       if (idx >= bidi_cache_size)
 421         {
 422           bidi_cache_size += BIDI_CACHE_CHUNK;
 423           bidi_cache =
 424             (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
 425         }
 426       /* Character positions should correspond to cache positions 1:1.
 427          If we are outside the range of cached positions, the cache is
 428          useless and must be reset.  */
 429       if (idx > 0 &&
 430           (bidi_it->charpos > bidi_cache[idx - 1].charpos + 1
 431            || bidi_it->charpos < bidi_cache[0].charpos))
 432         {
 433           bidi_cache_reset ();
 434           idx = 0;
 435         }
 436       bidi_copy_it (&bidi_cache[idx], bidi_it);
 437       if (!resolved)
 438         bidi_cache[idx].resolved_level = -1;
 439     }
 440   else
 441     {
 442       /* Copy only the members which could have changed, to avoid
 443          costly copying of the entire struct.  */
 444       bidi_cache[idx].type = bidi_it->type;
 445       bidi_check_type (bidi_it->type);
 446       bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
 447       bidi_check_type (bidi_it->type_after_w1);
 448       if (resolved)
 449         bidi_cache[idx].resolved_level = bidi_it->resolved_level;
 450       else
 451         bidi_cache[idx].resolved_level = -1;
 452       bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
 453       bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
 454       bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
 455       bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
 456       bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
 457     }
 458
 459   bidi_cache_last_idx = idx;
 460   if (idx >= bidi_cache_idx)
 461     bidi_cache_idx = idx + 1;
 462 }
 463
 464 static INLINE bidi_type_t
 465 bidi_cache_find (int charpos, int level, struct bidi_it *bidi_it)
 466 {
 467   int i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
 468
 469   if (i >= 0)
 470     {
 471       bidi_dir_t current_scan_dir = bidi_it->scan_dir;
 472
 473       bidi_copy_it (bidi_it, &bidi_cache[i]);
 474       bidi_cache_last_idx = i;
 475       /* Don't let scan direction from from the cached state override
 476          the current scan direction.  */
 477       bidi_it->scan_dir = current_scan_dir;
 478       return bidi_it->type;
 479     }
 480
 481   return UNKNOWN_BT;
 482 }
 483
 484 static INLINE int
 485 bidi_peek_at_next_level (struct bidi_it *bidi_it)
 486 {
 487   if (bidi_cache_idx == 0 || bidi_cache_last_idx == -1)
 488     abort ();
 489   return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
 490 }
 491
 492 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
 493    Value is the non-negative length of the paragraph separator
 494    following the buffer position, -1 if position is at the beginning
 495    of a new paragraph, or -2 if position is neither at beginning nor
 496    at end of a paragraph.  */
 497 static EMACS_INT
 498 bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
 499 {
 500   /* FIXME: Why Fbuffer_local_value rather than just Fsymbol_value?  */
 501   Lisp_Object sep_re;
 502   Lisp_Object start_re;
 503   EMACS_INT val;
 504
 505   sep_re = paragraph_separate_re;
 506   start_re = paragraph_start_re;
 507
 508   val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
 509   if (val < 0)
 510     {
 511       if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
 512         val = -1;
 513       else
 514         val = -2;
 515     }
 516
 517   return val;
 518 }
 519
 520 /* Determine the start-of-run (sor) directional type given the two
 521    embedding levels on either side of the run boundary.  Also, update
 522    the saved info about previously seen characters, since that info is
 523    generally valid for a single level run.  */
 524 static INLINE void
 525 bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
 526 {
 527   int higher_level = level_before > level_after ? level_before : level_after;
 528
 529   /* The prev_was_pdf gork is required for when we have several PDFs
 530      in a row.  In that case, we want to compute the sor type for the
 531      next level run only once: when we see the first PDF.  That's
 532      because the sor type depends only on the higher of the two levels
 533      that we find on the two sides of the level boundary (see UAX#9,
 534      clause X10), and so we don't need to know the final embedding
 535      level to which we descend after processing all the PDFs.  */
 536   if (!bidi_it->prev_was_pdf || level_before < level_after)
 537     /* FIXME: should the default sor direction be user selectable?  */
 538     bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
 539   if (level_before > level_after)
 540     bidi_it->prev_was_pdf = 1;
 541
 542   bidi_it->prev.type = UNKNOWN_BT;
 543   bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
 544     bidi_it->last_strong.orig_type = UNKNOWN_BT;
 545   bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
 546   bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
 547   bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
 548   bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
 549     bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
 550   bidi_it->ignore_bn_limit = 0; /* meaning it's unknown */
 551 }
 552
 553 static void
 554 bidi_line_init (struct bidi_it *bidi_it)
 555 {
 556   bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
 557   bidi_it->resolved_level = bidi_it->level_stack[0].level;
 558   bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
 559   bidi_it->invalid_levels = 0;
 560   bidi_it->invalid_rl_levels = -1;
 561   bidi_it->next_en_pos = -1;
 562   bidi_it->next_for_ws.type = UNKNOWN_BT;
 563   bidi_set_sor_type (bidi_it,
 564                      bidi_it->paragraph_dir == R2L ? 1 : 0,
 565                      bidi_it->level_stack[0].level); /* X10 */
 566
 567   bidi_cache_reset ();
 568 }
 569
 570 /* Find the beginning of this paragraph by looking back in the buffer.
 571    Value is the byte position of the paragraph's beginning.  */
 572 static EMACS_INT
 573 bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
 574 {
 575   Lisp_Object re = paragraph_start_re;
 576   EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
 577
 578   while (pos_byte > BEGV_BYTE
 579          && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
 580     {
 581       pos = find_next_newline_no_quit (pos - 1, -1);
 582       pos_byte = CHAR_TO_BYTE (pos);
 583     }
 584   return pos_byte;
 585 }
 586
 587 /* Determine the direction, a.k.a. base embedding level, of the
 588    paragraph we are about to iterate through.  If DIR is either L2R or
 589    R2L, just use that.  Otherwise, determine the paragraph direction
 590    from the first strong character of the paragraph.
 591
 592    Note that this gives the paragraph separator the same direction as
 593    the preceding paragraph, even though Emacs generally views the
 594    separartor as not belonging to any paragraph.  */
 595 void
 596 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it)
 597 {
 598   EMACS_INT bytepos = bidi_it->bytepos;
 599
 600   /* Special case for an empty buffer. */
 601   if (bytepos == BEGV_BYTE && bytepos == ZV_BYTE)
 602     dir = L2R;
 603   /* We should never be called at EOB or before BEGV.  */
 604   else if (bytepos >= ZV_BYTE || bytepos < BEGV_BYTE)
 605     abort ();
 606
 607   if (dir == L2R)
 608     {
 609       bidi_it->paragraph_dir = L2R;
 610       bidi_it->new_paragraph = 0;
 611     }
 612   else if (dir == R2L)
 613     {
 614       bidi_it->paragraph_dir = R2L;
 615       bidi_it->new_paragraph = 0;
 616     }
 617   else if (dir == NEUTRAL_DIR)  /* P2 */
 618     {
 619       int ch, ch_len;
 620       EMACS_INT pos;
 621       bidi_type_t type;
 622
 623       if (!bidi_initialized)
 624         bidi_initialize ();
 625
 626       /* If we are inside a paragraph separator, we are just waiting
 627          for the separator to be exhausted; use the previous paragraph
 628          direction.  But don't do that if we have been just reseated,
 629          because we need to reinitialize below in that case.  */
 630       if (!bidi_it->first_elt
 631           && bidi_it->charpos < bidi_it->separator_limit)
 632         return;
 633
 634       /* If we are on a newline, get past it to where the next
 635          paragraph might start.  But don't do that at BEGV since then
 636          we are potentially in a new paragraph that doesn't yet
 637          exist.  */
 638       pos = bidi_it->charpos;
 639       if (bytepos > BEGV_BYTE && FETCH_CHAR (bytepos) == '\n')
 640         {
 641           bytepos++;
 642           pos++;
 643         }
 644
 645       /* We are either at the beginning of a paragraph or in the
 646          middle of it.  Find where this paragraph starts.  */
 647       bytepos = bidi_find_paragraph_start (pos, bytepos);
 648
 649       bidi_it->separator_limit = -1;
 650       bidi_it->new_paragraph = 0;
 651       ch = FETCH_CHAR (bytepos);
 652       ch_len = CHAR_BYTES (ch);
 653       pos = BYTE_TO_CHAR (bytepos);
 654       type = bidi_get_type (ch, NEUTRAL_DIR);
 655
 656       for (pos++, bytepos += ch_len;
 657            /* NOTE: UAX#9 says to search only for L, AL, or R types of
 658               characters, and ignore RLE, RLO, LRE, and LRO.  However,
 659               I'm not sure it makes sense to omit those 4; should try
 660               with and without that to see the effect.  */
 661            (bidi_get_category (type) != STRONG)
 662              || (bidi_ignore_explicit_marks_for_paragraph_level
 663                  && (type == RLE || type == RLO
 664                      || type == LRE || type == LRO));
 665            type = bidi_get_type (ch, NEUTRAL_DIR))
 666         {
 667           if (type == NEUTRAL_B && bidi_at_paragraph_end (pos, bytepos) >= -1)
 668             break;
 669           if (bytepos >= ZV_BYTE)
 670             {
 671               /* Pretend there's a paragraph separator at end of buffer.  */
 672               type = NEUTRAL_B;
 673               break;
 674             }
 675           FETCH_CHAR_ADVANCE (ch, pos, bytepos);
 676         }
 677       if (type == STRONG_R || type == STRONG_AL) /* P3 */
 678         bidi_it->paragraph_dir = R2L;
 679       else if (type == STRONG_L)
 680         bidi_it->paragraph_dir = L2R;
 681     }
 682   else
 683     abort ();
 684
 685   /* Contrary to UAX#9 clause P3, we only default the paragraph
 686      direction to L2R if we have no previous usable paragraph
 687      direction.  */
 688   if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
 689     bidi_it->paragraph_dir = L2R; /* P3 and ``higher protocols'' */
 690   if (bidi_it->paragraph_dir == R2L)
 691     bidi_it->level_stack[0].level = 1;
 692   else
 693     bidi_it->level_stack[0].level = 0;
 694
 695   bidi_line_init (bidi_it);
 696 }
 697
 698 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
 699    end.  */
 700 static INLINE void
 701 bidi_set_paragraph_end (struct bidi_it *bidi_it)
 702 {
 703   bidi_it->invalid_levels = 0;
 704   bidi_it->invalid_rl_levels = -1;
 705   bidi_it->stack_idx = 0;
 706   bidi_it->resolved_level = bidi_it->level_stack[0].level;
 707 }
 708
 709 /* Initialize the bidi iterator from buffer position CHARPOS.  */
 710 void
 711 bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, struct bidi_it *bidi_it)
 712 {
 713   if (! bidi_initialized)
 714     bidi_initialize ();
 715   bidi_it->charpos = charpos;
 716   bidi_it->bytepos = bytepos;
 717   bidi_it->first_elt = 1;
 718   bidi_set_paragraph_end (bidi_it);
 719   bidi_it->new_paragraph = 1;
 720   bidi_it->separator_limit = -1;
 721   bidi_it->type = NEUTRAL_B;
 722   bidi_it->type_after_w1 = NEUTRAL_B;
 723   bidi_it->orig_type = NEUTRAL_B;
 724   bidi_it->prev_was_pdf = 0;
 725   bidi_it->prev.type = bidi_it->prev.type_after_w1 =
 726     bidi_it->prev.orig_type = UNKNOWN_BT;
 727   bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
 728     bidi_it->last_strong.orig_type = UNKNOWN_BT;
 729   bidi_it->next_for_neutral.charpos = -1;
 730   bidi_it->next_for_neutral.type =
 731     bidi_it->next_for_neutral.type_after_w1 =
 732     bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
 733   bidi_it->prev_for_neutral.charpos = -1;
 734   bidi_it->prev_for_neutral.type =
 735     bidi_it->prev_for_neutral.type_after_w1 =
 736     bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
 737   bidi_it->sor = L2R;    /* FIXME: should it be user-selectable? */
 738   bidi_cache_shrink ();
 739 }
 740
 741 /* Push the current embedding level and override status; reset the
 742    current level to LEVEL and the current override status to OVERRIDE.  */
 743 static INLINE void
 744 bidi_push_embedding_level (struct bidi_it *bidi_it,
 745                            int level, bidi_dir_t override)
 746 {
 747   bidi_it->stack_idx++;
 748   if (bidi_it->stack_idx >= BIDI_MAXLEVEL)
 749     abort ();
 750   bidi_it->level_stack[bidi_it->stack_idx].level = level;
 751   bidi_it->level_stack[bidi_it->stack_idx].override = override;
 752 }
 753
 754 /* Pop the embedding level and directional override status from the
 755    stack, and return the new level.  */
 756 static INLINE int
 757 bidi_pop_embedding_level (struct bidi_it *bidi_it)
 758 {
 759   /* UAX#9 says to ignore invalid PDFs.  */
 760   if (bidi_it->stack_idx > 0)
 761     bidi_it->stack_idx--;
 762   return bidi_it->level_stack[bidi_it->stack_idx].level;
 763 }
 764
 765 /* Record in SAVED_INFO the information about the current character.  */
 766 static INLINE void
 767 bidi_remember_char (struct bidi_saved_info *saved_info,
 768                     struct bidi_it *bidi_it)
 769 {
 770   saved_info->charpos = bidi_it->charpos;
 771   saved_info->bytepos = bidi_it->bytepos;
 772   saved_info->type = bidi_it->type;
 773   bidi_check_type (bidi_it->type);
 774   saved_info->type_after_w1 = bidi_it->type_after_w1;
 775   bidi_check_type (bidi_it->type_after_w1);
 776   saved_info->orig_type = bidi_it->orig_type;
 777   bidi_check_type (bidi_it->orig_type);
 778 }
 779
 780 /* Resolve the type of a neutral character according to the type of
 781    surrounding strong text and the current embedding level.  */
 782 static INLINE bidi_type_t
 783 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
 784 {
 785   /* N1: European and Arabic numbers are treated as though they were R.  */
 786   if (next_type == WEAK_EN || next_type == WEAK_AN)
 787     next_type = STRONG_R;
 788   if (prev_type == WEAK_EN || prev_type == WEAK_AN)
 789     prev_type = STRONG_R;
 790
 791   if (next_type == prev_type)   /* N1 */
 792     return next_type;
 793   else if ((lev & 1) == 0)      /* N2 */
 794     return STRONG_L;
 795   else
 796     return STRONG_R;
 797 }
 798
 799 static INLINE int
 800 bidi_explicit_dir_char (int c)
 801 {
 802   /* FIXME: this should be replaced with a lookup table with suitable
 803      bits set, like standard C ctype macros do.  */
 804   return (c == LRE_CHAR || c == LRO_CHAR
 805           || c == RLE_CHAR || c == RLO_CHAR || c == PDF_CHAR);
 806 }
 807
 808 /* A helper function for bidi_resolve_explicit.  It advances to the
 809    next character in logical order and determines the new embedding
 810    level and directional override, but does not take into account
 811    empty embeddings.  */
 812 static int
 813 bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
 814 {
 815   int curchar;
 816   bidi_type_t type;
 817   int current_level;
 818   int new_level;
 819   bidi_dir_t override;
 820
 821   if (bidi_it->bytepos < BEGV_BYTE      /* after reseat to BEGV? */
 822       || bidi_it->first_elt)
 823     {
 824       bidi_it->first_elt = 0;
 825       if (bidi_it->charpos < BEGV)
 826         bidi_it->charpos = BEGV;
 827       bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
 828     }
 829   else if (bidi_it->bytepos < ZV_BYTE)  /* don't move at ZV */
 830     {
 831       bidi_it->charpos++;
 832       if (bidi_it->ch_len == 0)
 833         abort ();
 834       bidi_it->bytepos += bidi_it->ch_len;
 835     }
 836
 837   current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
 838   override = bidi_it->level_stack[bidi_it->stack_idx].override;
 839   new_level = current_level;
 840
 841   /* in case it is a unibyte character (not yet implemented) */
 842   /* _fetch_multibyte_char_len = 1; */
 843   if (bidi_it->bytepos >= ZV_BYTE)
 844     {
 845       curchar = BIDI_EOB;
 846       bidi_it->ch_len = 1;
 847     }
 848   else
 849     {
 850       curchar = FETCH_CHAR (bidi_it->bytepos);
 851       bidi_it->ch_len = CHAR_BYTES (curchar);
 852     }
 853   bidi_it->ch = curchar;
 854
 855   /* Don't apply directional override here, as all the types we handle
 856      below will not be affected by the override anyway, and we need
 857      the original type unaltered.  The override will be applied in
 858      bidi_resolve_weak.  */
 859   type = bidi_get_type (curchar, NEUTRAL_DIR);
 860   bidi_it->orig_type = type;
 861   bidi_check_type (bidi_it->orig_type);
 862
 863   if (type != PDF)
 864     bidi_it->prev_was_pdf = 0;
 865
 866   bidi_it->type_after_w1 = UNKNOWN_BT;
 867
 868   switch (type)
 869     {
 870       case RLE: /* X2 */
 871       case RLO: /* X4 */
 872         bidi_it->type_after_w1 = type;
 873         bidi_check_type (bidi_it->type_after_w1);
 874         type = WEAK_BN; /* X9/Retaining */
 875         if (bidi_it->ignore_bn_limit <= 0)
 876           {
 877             if (current_level <= BIDI_MAXLEVEL - 4)
 878               {
 879                 /* Compute the least odd embedding level greater than
 880                    the current level.  */
 881                 new_level = ((current_level + 1) & ~1) + 1;
 882                 if (bidi_it->type_after_w1 == RLE)
 883                   override = NEUTRAL_DIR;
 884                 else
 885                   override = R2L;
 886                 if (current_level == BIDI_MAXLEVEL - 4)
 887                   bidi_it->invalid_rl_levels = 0;
 888                 bidi_push_embedding_level (bidi_it, new_level, override);
 889               }
 890             else
 891               {
 892                 bidi_it->invalid_levels++;
 893                 /* See the commentary about invalid_rl_levels below.  */
 894                 if (bidi_it->invalid_rl_levels < 0)
 895                   bidi_it->invalid_rl_levels = 0;
 896                 bidi_it->invalid_rl_levels++;
 897               }
 898           }
 899         else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
 900                  || bidi_it->next_en_pos > bidi_it->charpos)
 901           type = WEAK_EN;
 902         break;
 903       case LRE: /* X3 */
 904       case LRO: /* X5 */
 905         bidi_it->type_after_w1 = type;
 906         bidi_check_type (bidi_it->type_after_w1);
 907         type = WEAK_BN; /* X9/Retaining */
 908         if (bidi_it->ignore_bn_limit <= 0)
 909           {
 910             if (current_level <= BIDI_MAXLEVEL - 5)
 911               {
 912                 /* Compute the least even embedding level greater than
 913                    the current level.  */
 914                 new_level = ((current_level + 2) & ~1);
 915                 if (bidi_it->type_after_w1 == LRE)
 916                   override = NEUTRAL_DIR;
 917                 else
 918                   override = L2R;
 919                 bidi_push_embedding_level (bidi_it, new_level, override);
 920               }
 921             else
 922               {
 923                 bidi_it->invalid_levels++;
 924                 /* invalid_rl_levels counts invalid levels encountered
 925                    while the embedding level was already too high for
 926                    LRE/LRO, but not for RLE/RLO.  That is because
 927                    there may be exactly one PDF which we should not
 928                    ignore even though invalid_levels is non-zero.
 929                    invalid_rl_levels helps to know what PDF is
 930                    that.  */
 931                 if (bidi_it->invalid_rl_levels >= 0)
 932                   bidi_it->invalid_rl_levels++;
 933               }
 934           }
 935         else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
 936                  || bidi_it->next_en_pos > bidi_it->charpos)
 937           type = WEAK_EN;
 938         break;
 939       case PDF: /* X7 */
 940         bidi_it->type_after_w1 = type;
 941         bidi_check_type (bidi_it->type_after_w1);
 942         type = WEAK_BN; /* X9/Retaining */
 943         if (bidi_it->ignore_bn_limit <= 0)
 944           {
 945             if (!bidi_it->invalid_rl_levels)
 946               {
 947                 new_level = bidi_pop_embedding_level (bidi_it);
 948                 bidi_it->invalid_rl_levels = -1;
 949                 if (bidi_it->invalid_levels)
 950                   bidi_it->invalid_levels--;
 951                 /* else nothing: UAX#9 says to ignore invalid PDFs */
 952               }
 953             if (!bidi_it->invalid_levels)
 954               new_level = bidi_pop_embedding_level (bidi_it);
 955             else
 956               {
 957                 bidi_it->invalid_levels--;
 958                 bidi_it->invalid_rl_levels--;
 959               }
 960           }
 961         else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
 962                  || bidi_it->next_en_pos > bidi_it->charpos)
 963           type = WEAK_EN;
 964         break;
 965       default:
 966         /* Nothing.  */
 967         break;
 968     }
 969
 970   bidi_it->type = type;
 971   bidi_check_type (bidi_it->type);
 972
 973   return new_level;
 974 }
 975
 976 /* Given an iterator state in BIDI_IT, advance one character position
 977    in the buffer to the next character (in the logical order), resolve
 978    any explicit embeddings and directional overrides, and return the
 979    embedding level of the character after resolving explicit
 980    directives and ignoring empty embeddings.

        If the character just fetched raised the embedding level and is
        directly followed by more explicit directional characters, those
        are scanned ahead to find out whether the embedding is empty.
        The iterator is then restored to its state before that scan, with
        only ignore_bn_limit updated: it is set to the position after the
        last character scanned if the embedding turned out to be empty,
        and to -1 otherwise.  For an empty embedding, the level pushed for
        the character just fetched is popped again.  The look-ahead never
        fetches a character at or beyond ZV_BYTE.  */
 981 static int
 982 bidi_resolve_explicit (struct bidi_it *bidi_it)
 983 {
 984   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
 985   int new_level  = bidi_resolve_explicit_1 (bidi_it);
 986
 987   if (prev_level < new_level
 988       && bidi_it->type == WEAK_BN
 989       && bidi_it->ignore_bn_limit == 0 /* only if not already known */
 990       && bidi_it->bytepos < ZV_BYTE    /* not already at EOB */
           /* The following character must lie before ZV_BYTE, or else
              there is nothing we are allowed to look at.  */
           && bidi_it->bytepos + bidi_it->ch_len < ZV_BYTE
 991       && bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
 992                                              + bidi_it->ch_len)))
 993     {
 994       /* Avoid pushing and popping embedding levels if the level run
 995          is empty, as this breaks level runs where it shouldn't.
 996          UAX#9 removes all the explicit embedding and override codes,
 997          so empty embeddings disappear without a trace.  We need to
 998          behave as if we did the same.  */
 999       struct bidi_it saved_it;
1000       int level = prev_level;
1001
1002       bidi_copy_it (&saved_it, bidi_it);
1003
           /* Consume the explicit directional characters that follow,
              but stop before ZV_BYTE: bidi_resolve_explicit_1 does not
              move once it is there, so fetching beyond that point could
              both read text outside of the accessible region and keep
              this loop from terminating.  */
1004       while (bidi_it->bytepos + bidi_it->ch_len < ZV_BYTE
                  && bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
1005                                                     + bidi_it->ch_len)))
1006         {
1007           level = bidi_resolve_explicit_1 (bidi_it);
1008         }
1009
1010       if (level == prev_level)  /* empty embedding */
1011         saved_it.ignore_bn_limit = bidi_it->charpos + 1;
1012       else                      /* this embedding is non-empty */
1013         saved_it.ignore_bn_limit = -1;
1014
           /* Go back to the character we were called to examine; only
              the ignore_bn_limit value computed above survives.  */
1015       bidi_copy_it (bidi_it, &saved_it);
1016       if (bidi_it->ignore_bn_limit > 0)
1017         {
1018           /* We pushed a level, but we shouldn't have.  Undo that.
                  These are the same steps that the PDF (X7) case of
                  bidi_resolve_explicit_1 performs.  */
1019           if (!bidi_it->invalid_rl_levels)
1020             {
1021               new_level = bidi_pop_embedding_level (bidi_it);
1022               bidi_it->invalid_rl_levels = -1;
1023               if (bidi_it->invalid_levels)
1024                 bidi_it->invalid_levels--;
1025             }
1026           if (!bidi_it->invalid_levels)
1027             new_level = bidi_pop_embedding_level (bidi_it);
1028           else
1029             {
1030               bidi_it->invalid_levels--;
1031               bidi_it->invalid_rl_levels--;
1032             }
1033         }
1034     }
1035
1036   if (bidi_it->type == NEUTRAL_B)       /* X8 */
1037     {
1038       bidi_set_paragraph_end (bidi_it);
1039       /* This is needed by bidi_resolve_weak below, and in L1.  */
1040       bidi_it->type_after_w1 = bidi_it->type;
1041       bidi_check_type (bidi_it->type_after_w1);
1042     }
1043
1044   return new_level;
1045 }
1046
1047 /* Advance in the buffer, resolve weak types and return the type of
1048    the next character after weak type resolution.

        Advancing and the resolution of explicit directives are left to
        bidi_resolve_explicit; this function then applies the directional
        override (X6) or the weak-type clauses W1 through W7 to the type
        found in BIDI_IT->type.  The result is both returned and stored
        back in BIDI_IT->type.  BIDI_IT->type_after_w1 is filled in along
        the way if it was not recorded already.

        Aborts if, after explicit resolution, the type is UNKNOWN_BT or
        one of LRE, LRO, RLE, RLO and PDF.  */
1049 static bidi_type_t
1050 bidi_resolve_weak (struct bidi_it *bidi_it)
1051 {
1052   bidi_type_t type;
1053   bidi_dir_t override;
1054   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1055   int new_level  = bidi_resolve_explicit (bidi_it);
1056   int next_char;
1057   bidi_type_t type_of_next;
1058   struct bidi_it saved_it;
1059
1060   type = bidi_it->type;
1061   override = bidi_it->level_stack[bidi_it->stack_idx].override;
1062
       /* None of these types is acceptable input for the weak-type
          clauses below.  */
1063   if (type == UNKNOWN_BT
1064       || type == LRE
1065       || type == LRO
1066       || type == RLE
1067       || type == RLO
1068       || type == PDF)
1069     abort ();
1070
1071   if (new_level != prev_level
1072       || bidi_it->type == NEUTRAL_B)
1073     {
1074       /* We've got a new embedding level run, compute the directional
1075          type of sor and initialize per-run variables (UAX#9, clause
1076          X10).  */
1077       bidi_set_sor_type (bidi_it, prev_level, new_level);
1078     }
1079   else if (type == NEUTRAL_S || type == NEUTRAL_WS
1080            || type == WEAK_BN || type == STRONG_AL)
1081     bidi_it->type_after_w1 = type;      /* needed in L1 */
1082   bidi_check_type (bidi_it->type_after_w1);
1083
1084   /* Level and directional override status are already recorded in
1085      bidi_it, and do not need any change; see X6.  */
1086   if (override == R2L)          /* X6 */
1087     type = STRONG_R;
1088   else if (override == L2R)
1089     type = STRONG_L;
1090   else
1091     {
1092       if (type == WEAK_NSM)     /* W1 */
1093         {
1094           /* Note that we don't need to consider the case where the
1095              prev character has its type overridden by an RLO or LRO,
1096              because then either the type of this NSM would have been
1097              also overridden, or the previous character is outside the
1098              current level run, and thus not relevant to this NSM.
1099              This is why NSM gets the type_after_w1 of the previous
1100              character.  */
1101           if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1102               /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1103               && bidi_it->prev.type_after_w1 != NEUTRAL_B)
1104             type = bidi_it->prev.type_after_w1;
1105           else if (bidi_it->sor == R2L)
1106             type = STRONG_R;
1107           else if (bidi_it->sor == L2R)
1108             type = STRONG_L;
1109           else /* shouldn't happen! */
1110             abort ();
1111         }
1112       if (type == WEAK_EN       /* W2 */
1113           && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1114         type = WEAK_AN;
1115       else if (type == STRONG_AL) /* W3 */
1116         type = STRONG_R;
1117       else if ((type == WEAK_ES /* W4 */
1118                 && bidi_it->prev.type_after_w1 == WEAK_EN
1119                 && bidi_it->prev.orig_type == WEAK_EN)
1120                || (type == WEAK_CS
1121                    && ((bidi_it->prev.type_after_w1 == WEAK_EN
1122                         && bidi_it->prev.orig_type == WEAK_EN)
1123                        || bidi_it->prev.type_after_w1 == WEAK_AN)))
1124         {
               /* W4 also depends on the character that follows; treat
                  the end of the accessible text as BIDI_EOB.  */
1125           next_char =
1126             bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1127             ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
1128           type_of_next = bidi_get_type (next_char, override);
1129
1130           if (type_of_next == WEAK_BN
1131               || bidi_explicit_dir_char (next_char))
1132             {
                   /* Peek further ahead: keep resolving explicit
                      directives for as long as the level stays at
                      NEW_LEVEL and the type is BN, use the type found
                      where that stops, and then put the iterator back
                      where it was.  */
1133               bidi_copy_it (&saved_it, bidi_it);
1134               while (bidi_resolve_explicit (bidi_it) == new_level
1135                      && bidi_it->type == WEAK_BN)
1136                 ;
1137               type_of_next = bidi_it->type;
1138               bidi_copy_it (bidi_it, &saved_it);
1139             }
1140
1141           /* If the next character is EN, but the last strong-type
1142              character is AL, that next EN will be changed to AN when
1143              we process it in W2 above.  So in that case, this ES
1144              should not be changed into EN.  */
1145           if (type == WEAK_ES
1146               && type_of_next == WEAK_EN
1147               && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1148             type = WEAK_EN;
1149           else if (type == WEAK_CS)
1150             {
1151               if (bidi_it->prev.type_after_w1 == WEAK_AN
1152                   && (type_of_next == WEAK_AN
1153                       /* If the next character is EN, but the last
1154                          strong-type character is AL, EN will be later
1155                          changed to AN when we process it in W2 above.
1156                          So in that case, this CS sits between two
1157                          ANs and is changed into AN as well.  */
1158                       || (type_of_next == WEAK_EN
1159                           && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1160                 type = WEAK_AN;
1161               else if (bidi_it->prev.type_after_w1 == WEAK_EN
1162                        && type_of_next == WEAK_EN
1163                        && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1164                 type = WEAK_EN;
1165             }
1166         }
1167       else if (type == WEAK_ET  /* W5: ET with EN before or after it */
1168                || type == WEAK_BN)      /* W5/Retaining */
1169         {
1170           if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
1171               || bidi_it->next_en_pos > bidi_it->charpos)
1172             type = WEAK_EN;
1173           else                  /* W5: ET/BN with EN after it.  */
1174             {
1175               EMACS_INT en_pos = bidi_it->charpos + 1;
1176
1177               next_char =
1178                 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1179                 ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
1180               type_of_next = bidi_get_type (next_char, override);
1181
1182               if (type_of_next == WEAK_ET
1183                   || type_of_next == WEAK_BN
1184                   || bidi_explicit_dir_char (next_char))
1185                 {
                       /* Same kind of look-ahead as for W4 above, except
                          that ETs are skipped as well and the position
                          where the scan stopped is noted in en_pos.  */
1186                   bidi_copy_it (&saved_it, bidi_it);
1187                   while (bidi_resolve_explicit (bidi_it) == new_level
1188                          && (bidi_it->type == WEAK_BN
1189                              || bidi_it->type == WEAK_ET))
1190                     ;
1191                   type_of_next = bidi_it->type;
1192                   en_pos = bidi_it->charpos;
1193                   bidi_copy_it (bidi_it, &saved_it);
1194                 }
1195               if (type_of_next == WEAK_EN)
1196                 {
1197                   /* If the last strong character is AL, the EN we've
1198                      found will become AN when we get to it (W2). */
1199                   if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
1200                     {
1201                       type = WEAK_EN;
1202                       /* Remember this EN position, to speed up processing
1203                          of the next ETs.  */
1204                       bidi_it->next_en_pos = en_pos;
1205                     }
1206                   else if (type == WEAK_BN)
1207                     type = NEUTRAL_ON; /* W6/Retaining */
1208                 }
1209             }
1210         }
1211     }
1212
1213   if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
1214       || (type == WEAK_BN
1215           && (bidi_it->prev.type_after_w1 == WEAK_CS        /* W6/Retaining */
1216               || bidi_it->prev.type_after_w1 == WEAK_ES
1217               || bidi_it->prev.type_after_w1 == WEAK_ET)))
1218     type = NEUTRAL_ON;
1219
1220   /* Store the type we've got so far, before we clobber it with strong
1221      types in W7 and while resolving neutral types.  But leave alone
1222      the original types that were recorded above, because we will need
1223      them for the L1 clause.  */
1224   if (bidi_it->type_after_w1 == UNKNOWN_BT)
1225     bidi_it->type_after_w1 = type;
1226   bidi_check_type (bidi_it->type_after_w1);
1227
1228   if (type == WEAK_EN)  /* W7 */
1229     {
1230       if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
1231           || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1232         type = STRONG_L;
1233     }
1234
1235   bidi_it->type = type;
1236   bidi_check_type (bidi_it->type);
1237   return type;
1238 }
1239
1240 static bidi_type_t
1241 bidi_resolve_neutral (struct bidi_it *bidi_it)
1242 {
       /* Advance in the buffer by calling bidi_resolve_weak, then resolve
          the type of a neutral character (or of a BN that did not change
          the embedding level) with bidi_resolve_neutral_1, and return
          the resulting type.  Any other type is returned as
          bidi_resolve_weak left it.

          If the character needed for the resolution is not yet known
          (next_for_neutral.type is UNKNOWN_BT), the text is scanned
          forward for it, the scanned states are cached, and BIDI_IT is
          then restored from a saved copy in which next_for_neutral and
          the resolved type have been recorded.  That forward scan aborts
          if scan_dir is -1.  */
1243   int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1244   bidi_type_t type = bidi_resolve_weak (bidi_it);
1245   int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1246
       /* These are the only types we accept from bidi_resolve_weak.  */
1247   if (!(type == STRONG_R
1248         || type == STRONG_L
1249         || type == WEAK_BN
1250         || type == WEAK_EN
1251         || type == WEAK_AN
1252         || type == NEUTRAL_B
1253         || type == NEUTRAL_S
1254         || type == NEUTRAL_WS
1255         || type == NEUTRAL_ON))
1256     abort ();
1257
1258   if (bidi_get_category (type) == NEUTRAL
1259       || (type == WEAK_BN && prev_level == current_level))
1260     {
           /* The character that follows this run of neutrals is known
              already: resolve right away.  */
1261       if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1262         type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1263                                        bidi_it->next_for_neutral.type,
1264                                        current_level);
1265       else
1266         {
1267           /* Arrrgh!!  The UAX#9 algorithm is too deeply entrenched in
1268              the assumption of batch-style processing; see clauses W4,
1269              W5, and especially N1, which require to look far forward
1270              (as well as back) in the buffer.  May the fleas of a
1271              thousand camels infest the armpits of those who design
1272              supposedly general-purpose algorithms by looking at their
1273              own implementations, and fail to consider other possible
1274              implementations!  */
1275           struct bidi_it saved_it;
1276           bidi_type_t next_type;
1277
1278           if (bidi_it->scan_dir == -1)
1279             abort ();
1280
1281           bidi_copy_it (&saved_it, bidi_it);
1282           /* Scan the text forward until we find the first non-neutral
1283              character, and then use that to resolve the neutral we
1284              are dealing with now.  We also cache the scanned iterator
1285              states, to salvage some of the effort later.  */
1286           bidi_cache_iterator_state (bidi_it, 0);
1287           do {
1288             /* Record the info about the previous character, so that
1289                it will be cached below with this state.  */
1290             if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1291                 && bidi_it->type != WEAK_BN)
1292               bidi_remember_char (&bidi_it->prev, bidi_it);
1293             type = bidi_resolve_weak (bidi_it);
1294             /* Paragraph separators have their levels fully resolved
1295                at this point, so cache them as resolved.  */
1296             bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1297             /* FIXME: implement L1 here, by testing for a newline and
1298                resetting the level for any sequence of whitespace
1299                characters adjacent to it.  */
1300           } while (!(type == NEUTRAL_B
1301                      || (type != WEAK_BN
1302                          && bidi_get_category (type) != NEUTRAL)
1303                      /* This is all per level run, so stop when we
1304                         reach the end of this level run.  */
1305                      || bidi_it->level_stack[bidi_it->stack_idx].level !=
1306                      current_level));
1307
               /* Note, in the state we will return to, the character at
                  which the scan stopped.  */
1308           bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1309
1310           switch (type)
1311             {
1312               case STRONG_L:
1313               case STRONG_R:
1314               case STRONG_AL:
1315                 next_type = type;
1316                 break;
1317               case WEAK_EN:
1318               case WEAK_AN:
1319                 /* N1: ``European and Arabic numbers are treated as
1320                    though they were R.''  */
1321                 next_type = STRONG_R;
1322                 saved_it.next_for_neutral.type = STRONG_R;
1323                 break;
1324               case WEAK_BN:
1325                 if (!bidi_explicit_dir_char (bidi_it->ch))
1326                   abort ();             /* can't happen: BNs are skipped */
1327                 /* FALLTHROUGH */
1328               case NEUTRAL_B:
1329                 /* Marched all the way to the end of this level run.
1330                    We need to use the eor type, whose information is
1331                    stored by bidi_set_sor_type in the prev_for_neutral
1332                    member.  */
1333                 if (saved_it.type != WEAK_BN
1334                     || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
1335                   {
1336                     next_type = bidi_it->prev_for_neutral.type;
1337                     saved_it.next_for_neutral.type = next_type;
1338                     bidi_check_type (next_type);
1339                   }
1340                 else
1341                   {
1342                     /* This is a BN which does not adjoin neutrals.
1343                        Leave its type alone.  */
1344                     bidi_copy_it (bidi_it, &saved_it);
1345                     return bidi_it->type;
1346                   }
1347                 break;
1348               default:
1349                 abort ();
1350             }
1351           type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1352                                          next_type, current_level);
               /* Go back to the character we started from, now with its
                  type resolved and next_for_neutral filled in.  */
1353           saved_it.type = type;
1354           bidi_check_type (type);
1355           bidi_copy_it (bidi_it, &saved_it);
1356         }
1357     }
1358   return type;
1359 }
1360
1361 /* Step BIDI_IT forward, in the logical order, to the next character
1362    of the buffer and resolve the bidi type of that character.  The
1363    value returned is the type computed by bidi_resolve_neutral.  */
1364 static bidi_type_t
1365 bidi_type_of_next_char (struct bidi_it *bidi_it)
1366 {
1367   int past_ignored_bns;
1368
1369   /* Only a forward scan may get here.  */
1370   if (bidi_it->scan_dir != 1)
1371     abort ();
1372
1373   /* A positive ignore_bn_limit expires once charpos has reached it;
1374      the value -1 is dropped at the first non-explicit character.  */
1375   if (bidi_it->ignore_bn_limit > 0)
1376     past_ignored_bns = bidi_it->ignore_bn_limit <= bidi_it->charpos;
1377   else if (bidi_it->ignore_bn_limit == -1)
1378     past_ignored_bns = !bidi_explicit_dir_char (bidi_it->ch);
1379   else
1380     past_ignored_bns = 0;
1381   if (past_ignored_bns)
1382     bidi_it->ignore_bn_limit = 0;
1383   return bidi_resolve_neutral (bidi_it);
1384 }
1385
1386 /* Given an iterator state BIDI_IT, advance one character position in
1387    the buffer to the next character (in the logical order), resolve
1388    the embedding and implicit levels of that next character, and
1389    return the resulting level.

        The value returned is always what BIDI_IT->resolved_level holds
        on return: a level fetched from the cache, the level already
        stored in BIDI_IT (at end of text and for a paragraph
        separator), or the level computed and stored here.

        When scan_dir is -1, the state must come fully resolved from the
        cache; the function aborts otherwise.  */
1390 static int
1391 bidi_level_of_next_char (struct bidi_it *bidi_it)
1392 {
1393   bidi_type_t type;
1394   int level, prev_level = -1;
1395   struct bidi_saved_info next_for_neutral;
1396
1397   if (bidi_it->scan_dir == 1)
1398     {
1399       /* There's no sense in trying to advance if we hit end of text.  */
1400       if (bidi_it->bytepos >= ZV_BYTE)
1401         return bidi_it->resolved_level;
1402
1403       /* Record the info about the previous character.  */
1404       if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1405           && bidi_it->type != WEAK_BN)
1406         bidi_remember_char (&bidi_it->prev, bidi_it);
1407       if (bidi_it->type_after_w1 == STRONG_R
1408           || bidi_it->type_after_w1 == STRONG_L
1409           || bidi_it->type_after_w1 == STRONG_AL)
1410         bidi_remember_char (&bidi_it->last_strong, bidi_it);
1411       /* FIXME: it sounds like we don't need both prev and
1412          prev_for_neutral members, but I'm leaving them both for now.  */
1413       if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1414           || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1415         bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1416
1417       /* If we overstepped the characters used for resolving neutrals
1418          and whitespace, invalidate their info in the iterator.  */
1419       if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1420         bidi_it->next_for_neutral.type = UNKNOWN_BT;
1421       if (bidi_it->next_en_pos >= 0
1422           && bidi_it->charpos >= bidi_it->next_en_pos)
1423         bidi_it->next_en_pos = -1;
1424       if (bidi_it->next_for_ws.type != UNKNOWN_BT
1425           && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1426         bidi_it->next_for_ws.type = UNKNOWN_BT;
1427
1428       /* This must be taken before we fill the iterator with the info
1429          about the next char.  If we scan backwards, the iterator
1430          state must be already cached, so there's no need to know the
1431          embedding level of the previous character, since we will be
1432          returning to our caller shortly.  */
1433       prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1434     }
       /* Keep a copy: a successful cache lookup below overwrites the
          iterator state, including this member.  */
1435   next_for_neutral = bidi_it->next_for_neutral;
1436
1437   /* Perhaps it is already cached.  */
1438   type = bidi_cache_find (bidi_it->charpos + bidi_it->scan_dir, -1, bidi_it);
1439   if (type != UNKNOWN_BT)
1440     {
1441       /* Don't lose the information for resolving neutrals!  The
1442          cached states could have been cached before their
1443          next_for_neutral member was computed.  If we are on our way
1444          forward, we can simply take the info from the previous
1445          state.  */
1446       if (bidi_it->scan_dir == 1
1447           && bidi_it->next_for_neutral.type == UNKNOWN_BT)
1448         bidi_it->next_for_neutral = next_for_neutral;
1449
1450       /* If resolved_level is -1, it means this state was cached
1451          before it was completely resolved, so we cannot return
1452          it.  */
1453       if (bidi_it->resolved_level != -1)
1454         return bidi_it->resolved_level;
1455     }
1456   if (bidi_it->scan_dir == -1)
1457     /* If we are going backwards, the iterator state is already cached
1458        from previous scans, and should be fully resolved.  */
1459     abort ();
1460
1461   if (type == UNKNOWN_BT)
1462     type = bidi_type_of_next_char (bidi_it);
1463
       /* For a paragraph separator, return the level that is already
          in the iterator.  */
1464   if (type == NEUTRAL_B)
1465     return bidi_it->resolved_level;
1466
1467   level = bidi_it->level_stack[bidi_it->stack_idx].level;
1468   if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
1469       || (type == WEAK_BN && prev_level == level))
1470     {
1471       if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1472         abort ();
1473
1474       /* If the cached state shows a neutral character, it was not
1475          resolved by bidi_resolve_neutral, so do it now.  */
1476       type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1477                                      bidi_it->next_for_neutral.type,
1478                                      level);
1479     }
1480
       /* Only these types may reach the resolution of implicit levels
          below.  */
1481   if (!(type == STRONG_R
1482         || type == STRONG_L
1483         || type == WEAK_BN
1484         || type == WEAK_EN
1485         || type == WEAK_AN))
1486     abort ();
1487   bidi_it->type = type;
1488   bidi_check_type (bidi_it->type);
1489
1490   /* For L1 below, we need to know, for each WS character, whether
1491      it belongs to a sequence of WS characters preceding a newline
1492      or a TAB or a paragraph separator.  */
1493   if (bidi_it->orig_type == NEUTRAL_WS
1494       && bidi_it->next_for_ws.type == UNKNOWN_BT)
1495     {
1496       int ch;
1497       int clen = bidi_it->ch_len;
1498       EMACS_INT bpos = bidi_it->bytepos;
1499       EMACS_INT cpos = bidi_it->charpos;
1500       bidi_type_t chtype;
1501
           /* Walk forward over WS, BN and explicit directional
              characters; a newline and the end of the accessible text
              both count as NEUTRAL_B.  The type and the position of the
              character that stops the walk are saved in next_for_ws.  */
1502       do {
1503         /*_fetch_multibyte_char_len = 1;*/
1504         ch = bpos + clen >= ZV_BYTE ? BIDI_EOB : FETCH_CHAR (bpos + clen);
1505         bpos += clen;
1506         cpos++;
1507         clen = (ch == BIDI_EOB ? 1 : CHAR_BYTES (ch));
1508         if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
1509           chtype = NEUTRAL_B;
1510         else
1511           chtype = bidi_get_type (ch, NEUTRAL_DIR);
1512       } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
1513                || bidi_explicit_dir_char (ch)); /* L1/Retaining */
1514       bidi_it->next_for_ws.type = chtype;
1515       bidi_check_type (bidi_it->next_for_ws.type);
1516       bidi_it->next_for_ws.charpos = cpos;
1517       bidi_it->next_for_ws.bytepos = bpos;
1518     }
1519
1520   /* Resolve implicit levels, with a twist: PDFs get the embedding
1521      level of the embedding they terminate.  See below for the
1522      reason.  */
1523   if (bidi_it->orig_type == PDF
1524       /* Don't do this if this formatting code didn't change the
1525          embedding level due to invalid or empty embeddings.  */
1526       && prev_level != level)
1527     {
1528       /* Don't look in UAX#9 for the reason for this: it's our own
1529          private quirk.  The reason is that we want the formatting
1530          codes to be delivered so that they bracket the text of their
1531          embedding.  For example, given the text
1532
1533              {RLO}teST{PDF}
1534
1535          we want it to be displayed as
1536
1537              {RLO}STet{PDF}
1538
1539          not as
1540
1541              STet{RLO}{PDF}
1542
1543          which will result because we bump up the embedding level as
1544          soon as we see the RLO and pop it as soon as we see the PDF,
1545          so RLO itself has the same embedding level as "teST", and
1546          thus would be normally delivered last, just before the PDF.
1547          The assignment below fiddles with the level of PDF so that
1548          this ugly side effect does not happen.
1549
1550          (This is, of course, only important if the formatting codes
1551          are actually displayed, but Emacs does need to display them
1552          if the user wants to.)  */
1553       level = prev_level;
1554     }
1555   else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
1556            || bidi_it->orig_type == NEUTRAL_S
1557            || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
1558            /* || bidi_it->ch == LINESEP_CHAR */
1559            || (bidi_it->orig_type == NEUTRAL_WS
1560                && (bidi_it->next_for_ws.type == NEUTRAL_B
1561                    || bidi_it->next_for_ws.type == NEUTRAL_S)))
1562     level = bidi_it->level_stack[0].level;
1563   else if ((level & 1) == 0) /* I1 */
1564     {
1565       if (type == STRONG_R)
1566         level++;
1567       else if (type == WEAK_EN || type == WEAK_AN)
1568         level += 2;
1569     }
1570   else                  /* I2 */
1571     {
1572       if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
1573         level++;
1574     }
1575
1576   bidi_it->resolved_level = level;
1577   return level;
1578 }
1579
1580 /* Jump to the opposite edge of the level LEVEL.  A non-zero END_FLAG
1581    means we are standing at the end of a level and are about to
1582    resume scanning the lower level.
1583
1584    When the opposite edge is in the cache, the iterator is simply
1585    loaded with the state cached for that edge.  Otherwise the buffer
1586    is walked, caching every state on the way, until a character whose
1587    level is below LEVEL has been reached.
1588
1589    Mind that ``level'' is not used here for a ``level run'' in the
1590    UAX#9 sense of the term; it stands for a level together with all
1591    the levels higher than it.  That is, for a sequence of levels
1592    such as
1593
1594          11111112222222333333334443343222222111111112223322111
1595                 A      B                    C
1596
1597    and a left-to-right scan standing at point A, this function ends
1598    up at point C, although the UAX#9 ``level 2 run'' is over already
1599    at point B.  */
1600 static void
1601 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
1602 {
1603   int search_dir = bidi_it->scan_dir;
1604   int slot, lvl;
1605
1606   if (end_flag)
1607     search_dir = -search_dir;
1608   /* A cached edge lets us jump straight to it.  */
1609   slot = bidi_cache_find_level_change (level, search_dir, end_flag);
1610   if (slot >= 0)
1611     {
1612       bidi_cache_fetch_state (slot, bidi_it);
1613       return;
1614     }
1615   if (end_flag)
1616     abort (); /* the edges of a level we are leaving must be cached */
1617   bidi_cache_iterator_state (bidi_it, 1);
1618   do {
1619     lvl = bidi_level_of_next_char (bidi_it);
1620     bidi_cache_iterator_state (bidi_it, 1);
1621   } while (lvl >= level);
1622 }
1623
1624 void
1625 bidi_move_to_visually_next (struct bidi_it *bidi_it)
1626 {
1627   int old_level, new_level, next_level;
1628   struct bidi_it sentinel;
1629
1630   if (bidi_it->scan_dir == 0)
1631     {
1632       bidi_it->scan_dir = 1;    /* default to logical order */
1633     }
1634
1635   /* If we just passed a newline, initialize for the next line.  */
1636   if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
1637     bidi_line_init (bidi_it);
1638
1639   /* Prepare the sentinel iterator state, and cache it.  When we bump
1640      into it, scanning backwards, we'll know that the last non-base
1641      level is exhausted.  */
1642   if (bidi_cache_idx == 0)
1643     {
1644       bidi_copy_it (&sentinel, bidi_it);
1645       if (bidi_it->first_elt)
1646         {
1647           sentinel.charpos--;   /* cached charpos needs to be monotonic */
1648           sentinel.bytepos--;
1649           sentinel.ch = '\n';   /* doesn't matter, but why not? */
1650           sentinel.ch_len = 1;
1651         }
1652       bidi_cache_iterator_state (&sentinel, 1);
1653     }
1654
1655   old_level = bidi_it->resolved_level;
1656   new_level = bidi_level_of_next_char (bidi_it);
1657
1658   /* Reordering of resolved levels (clause L2) is implemented by
1659      jumping to the other edge of the level and flipping direction of
1660      scanning the text whenever we find a level change.  */
1661   if (new_level != old_level)
1662     {
1663       int ascending = new_level > old_level;
1664       int level_to_search = ascending ? old_level + 1 : old_level;
1665       int incr = ascending ? 1 : -1;
1666       int expected_next_level = old_level + incr;
1667
1668       /* Jump (or walk) to the other edge of this level.  */
1669       bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1670       /* Switch scan direction and peek at the next character in the
1671          new direction.  */
1672       bidi_it->scan_dir = -bidi_it->scan_dir;
1673
1674       /* The following loop handles the case where the resolved level
1675          jumps by more than one.  This is typical for numbers inside a
1676          run of text with left-to-right embedding direction, but can
1677          also happen in other situations.  In those cases the decision
1678          where to continue after a level change, and in what direction,
1679          is tricky.  For example, given a text like below:
1680
1681                   abcdefgh
1682                   11336622
1683
1684          (where the numbers below the text show the resolved levels),
1685          the result of reordering according to UAX#9 should be this:
1686
1687                   efdcghba
1688
1689          This is implemented by the loop below which flips direction
1690          and jumps to the other edge of the level each time it finds
1691          the new level not to be the expected one.  The expected level
1692          is always one more or one less than the previous one.  */
1693       next_level = bidi_peek_at_next_level (bidi_it);
1694       while (next_level != expected_next_level)
1695         {
1696           expected_next_level += incr;
1697           level_to_search += incr;
1698           bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1699           bidi_it->scan_dir = -bidi_it->scan_dir;
1700           next_level = bidi_peek_at_next_level (bidi_it);
1701         }
1702
1703       /* Finally, deliver the next character in the new direction.  */
1704       next_level = bidi_level_of_next_char (bidi_it);
1705     }
1706
1707   /* Take note when we have just processed the newline that precedes
1708      the end of the paragraph.  The next time we are about to be
1709      called, set_iterator_to_next will automatically reinit the
1710      paragraph direction, if needed.  We do this at the newline before
1711      the paragraph separator, because the next character might not be
1712      the first character of the next paragraph, due to the bidi
1713      reordering, whereas we _must_ know the paragraph base direction
1714      _before_ we process the paragraph's text, since the base
1715      direction affects the reordering.  */
1716   if (bidi_it->scan_dir == 1
1717       && bidi_it->orig_type == NEUTRAL_B
1718       && bidi_it->bytepos < ZV_BYTE)
1719     {
1720       EMACS_INT sep_len =
1721         bidi_at_paragraph_end (bidi_it->charpos + 1,
1722                                bidi_it->bytepos + bidi_it->ch_len);
1723       if (sep_len >= 0)
1724         {
1725           bidi_it->new_paragraph = 1;
1726           /* Record the buffer position of the last character of the
1727              paragraph separator.  */
1728           bidi_it->separator_limit = bidi_it->charpos + 1 + sep_len;
1729         }
1730     }
1731
1732   if (bidi_it->scan_dir == 1 && bidi_cache_idx)
1733     {
1734       /* If we are at paragraph's base embedding level and beyond the
1735          last cached position, the cache's job is done and we can
1736          discard it.  */
1737       if (bidi_it->resolved_level == bidi_it->level_stack[0].level
1738           && bidi_it->charpos > bidi_cache[bidi_cache_idx - 1].charpos)
1739         bidi_cache_reset ();
1740         /* But as long as we are caching during forward scan, we must
1741            cache each state, or else the cache integrity will be
1742            compromised: it assumes cached states correspond to buffer
1743            positions 1:1.  */
1744       else
1745         bidi_cache_iterator_state (bidi_it, 1);
1746     }
1747 }
1748
1749 /* This is meant to be called from within the debugger, whenever you
1750    wish to examine the cache contents.  */
1751 void
1752 bidi_dump_cached_states (void)
1753 {
1754   int i;
1755   int ndigits = 1;
1756
1757   if (bidi_cache_idx == 0)
1758     {
1759       fprintf (stderr, "The cache is empty.\n");
1760       return;
1761     }
1762   fprintf (stderr, "Total of %d state%s in cache:\n",
1763            bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
1764
1765   for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
1766     ndigits++;
1767   fputs ("ch  ", stderr);
1768   for (i = 0; i < bidi_cache_idx; i++)
1769     fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
1770   fputs ("\n", stderr);
1771   fputs ("lvl ", stderr);
1772   for (i = 0; i < bidi_cache_idx; i++)
1773     fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
1774   fputs ("\n", stderr);
1775   fputs ("pos ", stderr);
1776   for (i = 0; i < bidi_cache_idx; i++)
1777     fprintf (stderr, "%*ld", ndigits, (long)bidi_cache[i].charpos);
1778   fputs ("\n", stderr);
1779 }