/* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
- Copyright (C) 2000-2001, 2004-2005, 2009-2011
+ Copyright (C) 2000-2001, 2004-2005, 2009-2012
Free Software Foundation, Inc.
This file is part of GNU Emacs.
A note about references to UAX#9 rules: if the reference says
something like "X9/Retaining", it means that you need to refer to
- rule X9 and to its modifications decribed in the "Implementation
+ rule X9 and to its modifications described in the "Implementation
Notes" section of UAX#9, under "Retaining Format Codes". */
#include <config.h>
#include <setjmp.h>
#include "lisp.h"
-#include "buffer.h"
#include "character.h"
+#include "buffer.h"
#include "dispextern.h"
static int bidi_initialized = 0;
static inline void
bidi_check_type (bidi_type_t type)
{
- xassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
+ eassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
}
/* Given a bidi TYPE of a character, return its category. */
val = CHAR_TABLE_REF (bidi_mirror_table, c);
if (INTEGERP (val))
{
- EMACS_INT v = XINT (val);
+ int v;
+
+ /* When debugging, check before assigning to V, so that the check
+ isn't broken by undefined behavior due to int overflow. */
+ eassert (CHAR_VALID_P (XINT (val)));
+ v = XINT (val);
+
+ /* Minimal test we must do in optimized builds, to prevent weird
+ crashes further down the road. */
if (v < 0 || v > MAX_CHAR)
abort ();
int level, bidi_dir_t override)
{
bidi_it->stack_idx++;
- xassert (bidi_it->stack_idx < BIDI_MAXLEVEL);
+ eassert (bidi_it->stack_idx < BIDI_MAXLEVEL);
bidi_it->level_stack[bidi_it->stack_idx].level = level;
bidi_it->level_stack[bidi_it->stack_idx].override = override;
}
{
if (bidi_cache_size > BIDI_CACHE_CHUNK)
{
- bidi_cache
- = (struct bidi_it *) xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
+ bidi_cache = xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
bidi_cache_size = BIDI_CACHE_CHUNK;
}
bidi_cache_reset ();
ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
int incr = before ? 1 : 0;
- xassert (!dir || bidi_cache_last_idx >= 0);
+ eassert (!dir || bidi_cache_last_idx >= 0);
if (!dir)
dir = -1;
bidi_copy_it (bidi_it, &bidi_cache[i]);
bidi_cache_last_idx = i;
- /* Don't let scan direction from from the cached state override
+ /* Don't let scan direction from the cached state override
the current scan direction. */
bidi_it->scan_dir = current_scan_dir;
return bidi_it->type;
memcpy (&bidi_cache[bidi_cache_idx++], bidi_it, sizeof (struct bidi_it));
/* Push the current cache start onto the stack. */
- xassert (bidi_cache_sp < IT_STACK_SIZE);
+ eassert (bidi_cache_sp < IT_STACK_SIZE);
bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
/* Start a new level of cache, and make it empty. */
bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
bidi_it->invalid_levels = 0;
bidi_it->invalid_rl_levels = -1;
- bidi_it->next_en_pos = -1;
+ /* Setting this to zero will force its recomputation the first time
+ we need it for W5. */
+ bidi_it->next_en_pos = 0;
+ bidi_it->next_en_type = UNKNOWN_BT;
bidi_it->next_for_ws.type = UNKNOWN_BT;
bidi_set_sor_type (bidi_it,
(bidi_it->paragraph_dir == R2L ? 1 : 0),
ptrdiff_t endpos
= (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
struct text_pos pos;
+ int len;
/* If we got past the last known position of display string, compute
the position of the next one. That position could be at CHARPOS. */
normal_char:
if (string->s)
{
- int len;
if (!string->unibyte)
{
}
else if (STRINGP (string->lstring))
{
- int len;
-
if (!string->unibyte)
{
ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
}
else
{
- ch = FETCH_MULTIBYTE_CHAR (bytepos);
- *ch_len = CHAR_BYTES (ch);
+ ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (bytepos), len);
+ *ch_len = len;
}
*nchars = 1;
}
return pos_byte;
}
+/* On a 3.4 GHz machine, searching forward for a strong directional
+ character in a long paragraph full of weaks or neutrals takes about
+ 1 ms for each 20K characters. The number below limits each call to
+ bidi_paragraph_init to less than 10 ms even on slow machines. */
+#define MAX_STRONG_CHAR_SEARCH 100000
+
/* Determine the base direction, a.k.a. base embedding level, of the
paragraph we are about to iterate through. If DIR is either L2R or
R2L, just use that. Otherwise, determine the paragraph direction
Note that this function gives the paragraph separator the same
direction as the preceding paragraph, even though Emacs generally
- views the separartor as not belonging to any paragraph. */
+ views the separator as not belonging to any paragraph. */
void
bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p)
{
/* The following loop is run more than once only if NO_DEFAULT_P
is non-zero, and only if we are iterating on a buffer. */
do {
+ ptrdiff_t pos1;
+
bytepos = pstartbyte;
if (!string_p)
pos = BYTE_TO_CHAR (bytepos);
bidi_it->frame_window_p, &ch_len, &nchars);
type = bidi_get_type (ch, NEUTRAL_DIR);
+ pos1 = pos;
for (pos += nchars, bytepos += ch_len;
- (bidi_get_category (type) != STRONG)
- || (bidi_ignore_explicit_marks_for_paragraph_level
- && (type == RLE || type == RLO
- || type == LRE || type == LRO));
+ ((bidi_get_category (type) != STRONG)
+ || (bidi_ignore_explicit_marks_for_paragraph_level
+ && (type == RLE || type == RLO
+ || type == LRE || type == LRO)))
+ /* Stop when searched too far into an abnormally large
+ paragraph full of weak or neutral characters. */
+ && pos - pos1 < MAX_STRONG_CHAR_SEARCH;
type = bidi_get_type (ch, NEUTRAL_DIR))
{
if (pos >= end)
}
}
else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
- || bidi_it->next_en_pos > bidi_it->charpos)
+ || (bidi_it->next_en_pos > bidi_it->charpos
+ && bidi_it->next_en_type == WEAK_EN))
type = WEAK_EN;
break;
case LRE: /* X3 */
}
}
else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
- || bidi_it->next_en_pos > bidi_it->charpos)
+ || (bidi_it->next_en_pos > bidi_it->charpos
+ && bidi_it->next_en_type == WEAK_EN))
type = WEAK_EN;
break;
case PDF: /* X7 */
}
}
else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
- || bidi_it->next_en_pos > bidi_it->charpos)
+ || (bidi_it->next_en_pos > bidi_it->charpos
+ && bidi_it->next_en_type == WEAK_EN))
type = WEAK_EN;
break;
default:
else if (type == WEAK_ET /* W5: ET with EN before or after it */
|| type == WEAK_BN) /* W5/Retaining */
{
- if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
- || bidi_it->next_en_pos > bidi_it->charpos)
+ if (bidi_it->prev.type_after_w1 == WEAK_EN) /* ET/BN w/EN before it */
type = WEAK_EN;
- else /* W5: ET/BN with EN after it. */
+ else if (bidi_it->next_en_pos > bidi_it->charpos
+ && bidi_it->next_en_type != WEAK_BN)
+ {
+ if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */
+ type = WEAK_EN;
+ }
+ else if (bidi_it->next_en_pos >=0)
{
ptrdiff_t en_pos = bidi_it->charpos + bidi_it->nchars;
const unsigned char *s = (STRINGP (bidi_it->string.lstring)
en_pos = bidi_it->charpos;
bidi_copy_it (bidi_it, &saved_it);
}
+ /* Remember this position, to speed up processing of the
+ next ETs. */
+ bidi_it->next_en_pos = en_pos;
if (type_of_next == WEAK_EN)
{
/* If the last strong character is AL, the EN we've
found will become AN when we get to it (W2). */
- if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
- {
- type = WEAK_EN;
- /* Remember this EN position, to speed up processing
- of the next ETs. */
- bidi_it->next_en_pos = en_pos;
- }
+ if (bidi_it->last_strong.type_after_w1 == STRONG_AL)
+ type_of_next = WEAK_AN;
else if (type == WEAK_BN)
type = NEUTRAL_ON; /* W6/Retaining */
+ else
+ type = WEAK_EN;
}
+ else if (type_of_next == NEUTRAL_B)
+ /* Record the fact that there are no more ENs from
+ here to the end of paragraph, to avoid entering the
+ loop above ever again in this paragraph. */
+ bidi_it->next_en_pos = -1;
+ /* Record the type of the character where we ended our search. */
+ bidi_it->next_en_type = type_of_next;
}
}
}
|| type == NEUTRAL_ON))
abort ();
- if (bidi_get_category (type) == NEUTRAL
+ if ((type != NEUTRAL_B /* Don't risk entering the long loop below if
+ we are already at paragraph end. */
+ && bidi_get_category (type) == NEUTRAL)
|| (type == WEAK_BN && prev_level == current_level))
{
if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
bidi_it->next_for_neutral.type,
current_level);
+ /* The next two "else if" clauses are shortcuts for the
+ important special case when we have a long sequence of
+ neutral or WEAK_BN characters, such as whitespace or nulls or
+ other control characters, on the base embedding level of the
+ paragraph, and that sequence goes all the way to the end of
+ the paragraph and follows a character whose resolved
+ directionality is identical to the base embedding level.
+ (This is what happens in a buffer with plain L2R text that
+ happens to include long sequences of control characters.) By
+ virtue of N1, the result of examining this long sequence will
+ always be either STRONG_L or STRONG_R, depending on the base
+ embedding level. So we use this fact directly instead of
+ entering the expensive loop in the "else" clause. */
+ else if (current_level == 0
+ && bidi_it->prev_for_neutral.type == STRONG_L
+ && !bidi_explicit_dir_char (bidi_it->ch))
+ type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
+ STRONG_L, current_level);
+ else if (/* current level is 1 */
+ current_level == 1
+ /* base embedding level is also 1 */
+ && bidi_it->level_stack[0].level == 1
+ /* previous character is one of those considered R for
+ the purposes of W5 */
+ && (bidi_it->prev_for_neutral.type == STRONG_R
+ || bidi_it->prev_for_neutral.type == WEAK_EN
+ || bidi_it->prev_for_neutral.type == WEAK_AN)
+ && !bidi_explicit_dir_char (bidi_it->ch))
+ type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
+ STRONG_R, current_level);
else
{
/* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
case STRONG_L:
case STRONG_R:
case STRONG_AL:
+ /* Actually, STRONG_AL cannot happen here, because
+ bidi_resolve_weak converts it to STRONG_R, per W3. */
+ eassert (type != STRONG_AL);
next_type = type;
break;
case WEAK_EN:
/* N1: ``European and Arabic numbers are treated as
though they were R.'' */
next_type = STRONG_R;
- saved_it.next_for_neutral.type = STRONG_R;
break;
case WEAK_BN:
if (!bidi_explicit_dir_char (bidi_it->ch))
member. */
if (saved_it.type != WEAK_BN
|| bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
- {
- next_type = bidi_it->prev_for_neutral.type;
- saved_it.next_for_neutral.type = next_type;
- bidi_check_type (next_type);
- }
+ next_type = bidi_it->prev_for_neutral.type;
else
{
/* This is a BN which does not adjoin neutrals.
}
type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
next_type, current_level);
+ saved_it.next_for_neutral.type = next_type;
saved_it.type = type;
+ bidi_check_type (next_type);
bidi_check_type (type);
bidi_copy_it (bidi_it, &saved_it);
}
bidi_it->next_for_neutral.type = UNKNOWN_BT;
if (bidi_it->next_en_pos >= 0
&& bidi_it->charpos >= bidi_it->next_en_pos)
- bidi_it->next_en_pos = -1;
+ {
+ bidi_it->next_en_pos = 0;
+ bidi_it->next_en_type = UNKNOWN_BT;
+ }
if (bidi_it->next_for_ws.type != UNKNOWN_BT
&& bidi_it->charpos >= bidi_it->next_for_ws.charpos)
bidi_it->next_for_ws.type = UNKNOWN_BT;
do {
ch = bidi_fetch_char (bpos += clen, cpos += nc, &disp_pos, &dpp, &bs,
fwp, &clen, &nc);
- if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
+ if (ch == '\n' || ch == BIDI_EOB)
chtype = NEUTRAL_B;
else
chtype = bidi_get_type (ch, NEUTRAL_DIR);
}
/* Resolve implicit levels, with a twist: PDFs get the embedding
- level of the enbedding they terminate. See below for the
+ level of the embedding they terminate. See below for the
reason. */
if (bidi_it->orig_type == PDF
/* Don't do this if this formatting code didn't change the
else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
|| bidi_it->orig_type == NEUTRAL_S
|| bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
- /* || bidi_it->ch == LINESEP_CHAR */
|| (bidi_it->orig_type == NEUTRAL_WS
&& (bidi_it->next_for_ws.type == NEUTRAL_B
|| bidi_it->next_for_ws.type == NEUTRAL_S)))