/* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
- Copyright (C) 2000-2001, 2004-2005, 2009-2011
+ Copyright (C) 2000-2001, 2004-2005, 2009-2012
Free Software Foundation, Inc.
This file is part of GNU Emacs.
A note about references to UAX#9 rules: if the reference says
something like "X9/Retaining", it means that you need to refer to
- rule X9 and to its modifications decribed in the "Implementation
+ rule X9 and to its modifications described in the "Implementation
Notes" section of UAX#9, under "Retaining Format Codes". */
#include <config.h>
#include <setjmp.h>
#include "lisp.h"
-#include "buffer.h"
#include "character.h"
+#include "buffer.h"
#include "dispextern.h"
static int bidi_initialized = 0;
val = CHAR_TABLE_REF (bidi_mirror_table, c);
if (INTEGERP (val))
{
- EMACS_INT v = XINT (val);
+ int v;
+
+ /* When debugging, check before assigning to V, so that the check
+ isn't broken by undefined behavior due to int overflow. */
+ eassert (CHAR_VALID_P (XINT (val)));
+
+ v = XINT (val);
+ /* Minimal test we must do in optimized builds, to prevent weird
+ crashes further down the road. */
if (v < 0 || v > MAX_CHAR)
abort ();
bidi_copy_it (bidi_it, &bidi_cache[i]);
bidi_cache_last_idx = i;
- /* Don't let scan direction from from the cached state override
+ /* Don't let scan direction from the cached state override
the current scan direction. */
bidi_it->scan_dir = current_scan_dir;
return bidi_it->type;
bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
bidi_it->invalid_levels = 0;
bidi_it->invalid_rl_levels = -1;
- bidi_it->next_en_pos = -1;
+ /* Setting this to zero will force its recomputation the first time
+ we need it for W5. */
+ bidi_it->next_en_pos = 0;
+ bidi_it->next_en_type = UNKNOWN_BT;
bidi_it->next_for_ws.type = UNKNOWN_BT;
bidi_set_sor_type (bidi_it,
(bidi_it->paragraph_dir == R2L ? 1 : 0),
ptrdiff_t endpos
= (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
struct text_pos pos;
+ int len;
/* If we got past the last known position of display string, compute
the position of the next one. That position could be at CHARPOS. */
normal_char:
if (string->s)
{
- int len;
if (!string->unibyte)
{
}
else if (STRINGP (string->lstring))
{
- int len;
-
if (!string->unibyte)
{
ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
}
else
{
- ch = FETCH_MULTIBYTE_CHAR (bytepos);
- *ch_len = CHAR_BYTES (ch);
+ ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (bytepos), len);
+ *ch_len = len;
}
*nchars = 1;
}
Note that this function gives the paragraph separator the same
direction as the preceding paragraph, even though Emacs generally
- views the separartor as not belonging to any paragraph. */
+ views the separator as not belonging to any paragraph. */
void
bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p)
{
}
}
else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
- || bidi_it->next_en_pos > bidi_it->charpos)
+ || (bidi_it->next_en_pos > bidi_it->charpos
+ && bidi_it->next_en_type == WEAK_EN))
type = WEAK_EN;
break;
case LRE: /* X3 */
}
}
else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
- || bidi_it->next_en_pos > bidi_it->charpos)
+ || (bidi_it->next_en_pos > bidi_it->charpos
+ && bidi_it->next_en_type == WEAK_EN))
type = WEAK_EN;
break;
case PDF: /* X7 */
}
}
else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
- || bidi_it->next_en_pos > bidi_it->charpos)
+ || (bidi_it->next_en_pos > bidi_it->charpos
+ && bidi_it->next_en_type == WEAK_EN))
type = WEAK_EN;
break;
default:
else if (type == WEAK_ET /* W5: ET with EN before or after it */
|| type == WEAK_BN) /* W5/Retaining */
{
- if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
- || bidi_it->next_en_pos > bidi_it->charpos)
+ if (bidi_it->prev.type_after_w1 == WEAK_EN) /* ET/BN w/EN before it */
type = WEAK_EN;
- else /* W5: ET/BN with EN after it. */
+ else if (bidi_it->next_en_pos > bidi_it->charpos
+ && bidi_it->next_en_type != WEAK_BN)
+ {
+ if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */
+ type = WEAK_EN;
+ }
+ else if (bidi_it->next_en_pos >=0)
{
ptrdiff_t en_pos = bidi_it->charpos + bidi_it->nchars;
const unsigned char *s = (STRINGP (bidi_it->string.lstring)
en_pos = bidi_it->charpos;
bidi_copy_it (bidi_it, &saved_it);
}
+ /* Remember this position, to speed up processing of the
+ next ETs. */
+ bidi_it->next_en_pos = en_pos;
if (type_of_next == WEAK_EN)
{
/* If the last strong character is AL, the EN we've
found will become AN when we get to it (W2). */
- if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
- {
- type = WEAK_EN;
- /* Remember this EN position, to speed up processing
- of the next ETs. */
- bidi_it->next_en_pos = en_pos;
- }
+ if (bidi_it->last_strong.type_after_w1 == STRONG_AL)
+ type_of_next = WEAK_AN;
else if (type == WEAK_BN)
type = NEUTRAL_ON; /* W6/Retaining */
+ else
+ type = WEAK_EN;
}
+ else if (type_of_next == NEUTRAL_B)
+ /* Record the fact that there are no more ENs from
+ here to the end of paragraph, to avoid entering the
+ loop above ever again in this paragraph. */
+ bidi_it->next_en_pos = -1;
+ /* Record the type of the character where we ended our search. */
+ bidi_it->next_en_type = type_of_next;
}
}
}
|| type == NEUTRAL_ON))
abort ();
- if (bidi_get_category (type) == NEUTRAL
+ if ((type != NEUTRAL_B /* Don't risk entering the long loop below if
+ we are already at paragraph end. */
+ && bidi_get_category (type) == NEUTRAL)
|| (type == WEAK_BN && prev_level == current_level))
{
if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
bidi_it->next_for_neutral.type,
current_level);
+ /* The next two "else if" clauses are shortcuts for the
+ important special case when we have a long sequence of
+ neutral or WEAK_BN characters, such as whitespace or nulls or
+ other control characters, on the base embedding level of the
+ paragraph, and that sequence goes all the way to the end of
+ the paragraph and follows a character whose resolved
+ directionality is identical to the base embedding level.
+ (This is what happens in a buffer with plain L2R text that
+ happens to include long sequences of control characters.) By
+ virtue of N1, the result of examining this long sequence will
+ always be either STRONG_L or STRONG_R, depending on the base
+ embedding level. So we use this fact directly instead of
+ entering the expensive loop in the "else" clause. */
+ else if (current_level == 0
+ && bidi_it->prev_for_neutral.type == STRONG_L
+ && !bidi_explicit_dir_char (bidi_it->ch))
+ type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
+ STRONG_L, current_level);
+ else if (/* current level is 1 */
+ current_level == 1
+ /* base embedding level is also 1 */
+ && bidi_it->level_stack[0].level == 1
+ /* previous character is one of those considered R for
+ the purposes of N1 */
+ && (bidi_it->prev_for_neutral.type == STRONG_R
+ || bidi_it->prev_for_neutral.type == WEAK_EN
+ || bidi_it->prev_for_neutral.type == WEAK_AN)
+ && !bidi_explicit_dir_char (bidi_it->ch))
+ type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
+ STRONG_R, current_level);
else
{
/* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
case STRONG_L:
case STRONG_R:
case STRONG_AL:
+ /* Actually, STRONG_AL cannot happen here, because
+ bidi_resolve_weak converts it to STRONG_R, per W3. */
+ xassert (type != STRONG_AL);
next_type = type;
break;
case WEAK_EN:
/* N1: ``European and Arabic numbers are treated as
though they were R.'' */
next_type = STRONG_R;
- saved_it.next_for_neutral.type = STRONG_R;
break;
case WEAK_BN:
if (!bidi_explicit_dir_char (bidi_it->ch))
member. */
if (saved_it.type != WEAK_BN
|| bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
- {
- next_type = bidi_it->prev_for_neutral.type;
- saved_it.next_for_neutral.type = next_type;
- bidi_check_type (next_type);
- }
+ next_type = bidi_it->prev_for_neutral.type;
else
{
/* This is a BN which does not adjoin neutrals.
}
type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
next_type, current_level);
+ saved_it.next_for_neutral.type = next_type;
saved_it.type = type;
+ bidi_check_type (next_type);
bidi_check_type (type);
bidi_copy_it (bidi_it, &saved_it);
}
bidi_it->next_for_neutral.type = UNKNOWN_BT;
if (bidi_it->next_en_pos >= 0
&& bidi_it->charpos >= bidi_it->next_en_pos)
- bidi_it->next_en_pos = -1;
+ {
+ bidi_it->next_en_pos = 0;
+ bidi_it->next_en_type = UNKNOWN_BT;
+ }
if (bidi_it->next_for_ws.type != UNKNOWN_BT
&& bidi_it->charpos >= bidi_it->next_for_ws.charpos)
bidi_it->next_for_ws.type = UNKNOWN_BT;
do {
ch = bidi_fetch_char (bpos += clen, cpos += nc, &disp_pos, &dpp, &bs,
fwp, &clen, &nc);
- if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
+ if (ch == '\n' || ch == BIDI_EOB)
chtype = NEUTRAL_B;
else
chtype = bidi_get_type (ch, NEUTRAL_DIR);
}
/* Resolve implicit levels, with a twist: PDFs get the embedding
- level of the enbedding they terminate. See below for the
+ level of the embedding they terminate. See below for the
reason. */
if (bidi_it->orig_type == PDF
/* Don't do this if this formatting code didn't change the
else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
|| bidi_it->orig_type == NEUTRAL_S
|| bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
- /* || bidi_it->ch == LINESEP_CHAR */
|| (bidi_it->orig_type == NEUTRAL_WS
&& (bidi_it->next_for_ws.type == NEUTRAL_B
|| bidi_it->next_for_ws.type == NEUTRAL_S)))