/* Low-level bidirectional buffer/string-scanning functions for GNU Emacs.
- Copyright (C) 2000-2001, 2004-2005, 2009-2011
+ Copyright (C) 2000-2001, 2004-2005, 2009-2012
Free Software Foundation, Inc.
This file is part of GNU Emacs.
A note about references to UAX#9 rules: if the reference says
something like "X9/Retaining", it means that you need to refer to
- rule X9 and to its modifications decribed in the "Implementation
+ rule X9 and to its modifications described in the "Implementation
Notes" section of UAX#9, under "Retaining Format Codes". */
#include <config.h>
#include <setjmp.h>
#include "lisp.h"
-#include "buffer.h"
#include "character.h"
+#include "buffer.h"
#include "dispextern.h"
static int bidi_initialized = 0;
/* Sanity check: assert that TYPE lies within the range of bidi types
   from UNKNOWN_BT through NEUTRAL_ON, inclusive.  Returns nothing.  */
static inline void
bidi_check_type (bidi_type_t type)
{
- xassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
+ eassert (UNKNOWN_BT <= type && type <= NEUTRAL_ON);
}
/* Given a bidi TYPE of a character, return its category. */
val = CHAR_TABLE_REF (bidi_mirror_table, c);
if (INTEGERP (val))
{
- int v = XINT (val);
+ int v;
+
+ /* When debugging, check before assigning to V, so that the check
+ isn't broken by undefined behavior due to int overflow. */
+ eassert (CHAR_VALID_P (XINT (val)));
+
+ v = XINT (val);
+ /* Minimal test we must do in optimized builds, to prevent weird
+ crashes further down the road. */
if (v < 0 || v > MAX_CHAR)
abort ();
int level, bidi_dir_t override)
{
bidi_it->stack_idx++;
- xassert (bidi_it->stack_idx < BIDI_MAXLEVEL);
+ eassert (bidi_it->stack_idx < BIDI_MAXLEVEL);
bidi_it->level_stack[bidi_it->stack_idx].level = level;
bidi_it->level_stack[bidi_it->stack_idx].override = override;
}
{
if (bidi_cache_size > BIDI_CACHE_CHUNK)
{
- bidi_cache
- = (struct bidi_it *) xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
+ bidi_cache = xrealloc (bidi_cache, BIDI_CACHE_CHUNK * elsz);
bidi_cache_size = BIDI_CACHE_CHUNK;
}
bidi_cache_reset ();
resolved levels in cached states. DIR, if non-zero, means search
in that direction from the last cache hit. */
static inline ptrdiff_t
-bidi_cache_search (EMACS_INT charpos, int level, int dir)
+bidi_cache_search (ptrdiff_t charpos, int level, int dir)
{
ptrdiff_t i, i_start;
ptrdiff_t i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
int incr = before ? 1 : 0;
- xassert (!dir || bidi_cache_last_idx >= 0);
+ eassert (!dir || bidi_cache_last_idx >= 0);
if (!dir)
dir = -1;
}
static inline bidi_type_t
-bidi_cache_find (EMACS_INT charpos, int level, struct bidi_it *bidi_it)
+bidi_cache_find (ptrdiff_t charpos, int level, struct bidi_it *bidi_it)
{
ptrdiff_t i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
bidi_copy_it (bidi_it, &bidi_cache[i]);
bidi_cache_last_idx = i;
- /* Don't let scan direction from from the cached state override
+ /* Don't let scan direction from the cached state override
the current scan direction. */
bidi_it->scan_dir = current_scan_dir;
return bidi_it->type;
memcpy (&bidi_cache[bidi_cache_idx++], bidi_it, sizeof (struct bidi_it));
/* Push the current cache start onto the stack. */
- xassert (bidi_cache_sp < IT_STACK_SIZE);
+ eassert (bidi_cache_sp < IT_STACK_SIZE);
bidi_cache_start_stack[bidi_cache_sp++] = bidi_cache_start;
/* Start a new level of cache, and make it empty. */
/* Initialize the bidi iterator from buffer/string position CHARPOS. */
void
-bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, int frame_window_p,
+bidi_init_it (ptrdiff_t charpos, ptrdiff_t bytepos, int frame_window_p,
struct bidi_it *bidi_it)
{
if (! bidi_initialized)
/* Setting this to zero will force its recomputation the first time
we need it for W5. */
bidi_it->next_en_pos = 0;
+ bidi_it->next_en_type = UNKNOWN_BT;
bidi_it->next_for_ws.type = UNKNOWN_BT;
bidi_set_sor_type (bidi_it,
(bidi_it->paragraph_dir == R2L ? 1 : 0),
are zero-based character positions in S, BEGBYTE is byte position
corresponding to BEG. UNIBYTE, if non-zero, means S is a unibyte
string. */
-static inline EMACS_INT
-bidi_count_bytes (const unsigned char *s, const EMACS_INT beg,
- const EMACS_INT begbyte, const EMACS_INT end, int unibyte)
+static inline ptrdiff_t
+bidi_count_bytes (const unsigned char *s, const ptrdiff_t beg,
+ const ptrdiff_t begbyte, const ptrdiff_t end, int unibyte)
{
- EMACS_INT pos = beg;
+ ptrdiff_t pos = beg;
const unsigned char *p = s + begbyte, *start = p;
if (unibyte)
character from the current buffer. UNIBYTE non-zero means S is a
unibyte string. */
static inline int
-bidi_char_at_pos (EMACS_INT bytepos, const unsigned char *s, int unibyte)
+bidi_char_at_pos (ptrdiff_t bytepos, const unsigned char *s, int unibyte)
{
if (s)
{
string to iterate, or NULL if iterating over a buffer or a Lisp
string; in the latter case, STRING->lstring is the Lisp string. */
static inline int
-bidi_fetch_char (EMACS_INT bytepos, EMACS_INT charpos, EMACS_INT *disp_pos,
+bidi_fetch_char (ptrdiff_t bytepos, ptrdiff_t charpos, ptrdiff_t *disp_pos,
int *disp_prop, struct bidi_string_data *string,
- int frame_window_p, EMACS_INT *ch_len, EMACS_INT *nchars)
+ int frame_window_p, ptrdiff_t *ch_len, ptrdiff_t *nchars)
{
int ch;
- EMACS_INT endpos
+ ptrdiff_t endpos
= (string->s || STRINGP (string->lstring)) ? string->schars : ZV;
struct text_pos pos;
+ int len;
/* If we got past the last known position of display string, compute
the position of the next one. That position could be at CHARPOS. */
}
else if (charpos >= *disp_pos && *disp_prop)
{
- EMACS_INT disp_end_pos;
+ ptrdiff_t disp_end_pos;
/* We don't expect to find ourselves in the middle of a display
property. Hopefully, it will never be needed. */
normal_char:
if (string->s)
{
- int len;
if (!string->unibyte)
{
}
else if (STRINGP (string->lstring))
{
- int len;
-
if (!string->unibyte)
{
ch = STRING_CHAR_AND_LENGTH (SDATA (string->lstring) + bytepos,
}
else
{
- ch = FETCH_MULTIBYTE_CHAR (bytepos);
- *ch_len = CHAR_BYTES (ch);
+ ch = STRING_CHAR_AND_LENGTH (BYTE_POS_ADDR (bytepos), len);
+ *ch_len = len;
}
*nchars = 1;
}
following the buffer position, -1 if position is at the beginning
of a new paragraph, or -2 if position is neither at beginning nor
at end of a paragraph. */
-static EMACS_INT
-bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
+static ptrdiff_t
+bidi_at_paragraph_end (ptrdiff_t charpos, ptrdiff_t bytepos)
{
Lisp_Object sep_re;
Lisp_Object start_re;
- EMACS_INT val;
+ ptrdiff_t val;
sep_re = paragraph_separate_re;
start_re = paragraph_start_re;
Value is the byte position of the paragraph's beginning, or
BEGV_BYTE if paragraph_start_re is still not found after looking
back MAX_PARAGRAPH_SEARCH lines in the buffer. */
-static EMACS_INT
-bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
+static ptrdiff_t
+bidi_find_paragraph_start (ptrdiff_t pos, ptrdiff_t pos_byte)
{
Lisp_Object re = paragraph_start_re;
- EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
- EMACS_INT n = 0;
+ ptrdiff_t limit = ZV, limit_byte = ZV_BYTE;
+ ptrdiff_t n = 0;
while (pos_byte > BEGV_BYTE
&& n++ < MAX_PARAGRAPH_SEARCH
return pos_byte;
}
+/* On a 3.4 GHz machine, searching forward for a strong directional
+ character in a long paragraph full of weaks or neutrals takes about
+ 1 ms for each 20K characters. The number below limits each call to
+ bidi_paragraph_init to less than 10 ms even on slow machines. */
+#define MAX_STRONG_CHAR_SEARCH 100000
+
/* Determine the base direction, a.k.a. base embedding level, of the
paragraph we are about to iterate through. If DIR is either L2R or
R2L, just use that. Otherwise, determine the paragraph direction
Note that this function gives the paragraph separator the same
direction as the preceding paragraph, even though Emacs generally
- views the separartor as not belonging to any paragraph. */
+ views the separator as not belonging to any paragraph. */
void
bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it, int no_default_p)
{
- EMACS_INT bytepos = bidi_it->bytepos;
+ ptrdiff_t bytepos = bidi_it->bytepos;
int string_p = bidi_it->string.s != NULL || STRINGP (bidi_it->string.lstring);
- EMACS_INT pstartbyte;
+ ptrdiff_t pstartbyte;
/* Note that begbyte is a byte position, while end is a character
position. Yes, this is ugly, but we are trying to avoid costly
calls to BYTE_TO_CHAR and its ilk. */
- EMACS_INT begbyte = string_p ? 0 : BEGV_BYTE;
- EMACS_INT end = string_p ? bidi_it->string.schars : ZV;
+ ptrdiff_t begbyte = string_p ? 0 : BEGV_BYTE;
+ ptrdiff_t end = string_p ? bidi_it->string.schars : ZV;
/* Special case for an empty buffer. */
if (bytepos == begbyte && bidi_it->charpos == end)
else if (dir == NEUTRAL_DIR) /* P2 */
{
int ch;
- EMACS_INT ch_len, nchars;
- EMACS_INT pos, disp_pos = -1;
+ ptrdiff_t ch_len, nchars;
+ ptrdiff_t pos, disp_pos = -1;
int disp_prop = 0;
bidi_type_t type;
const unsigned char *s;
/* The following loop is run more than once only if NO_DEFAULT_P
is non-zero, and only if we are iterating on a buffer. */
do {
+ ptrdiff_t pos1;
+
bytepos = pstartbyte;
if (!string_p)
pos = BYTE_TO_CHAR (bytepos);
bidi_it->frame_window_p, &ch_len, &nchars);
type = bidi_get_type (ch, NEUTRAL_DIR);
+ pos1 = pos;
for (pos += nchars, bytepos += ch_len;
- (bidi_get_category (type) != STRONG)
- || (bidi_ignore_explicit_marks_for_paragraph_level
- && (type == RLE || type == RLO
- || type == LRE || type == LRO));
+ ((bidi_get_category (type) != STRONG)
+ || (bidi_ignore_explicit_marks_for_paragraph_level
+ && (type == RLE || type == RLO
+ || type == LRE || type == LRO)))
+ /* Stop when searched too far into an abnormally large
+ paragraph full of weak or neutral characters. */
+ && pos - pos1 < MAX_STRONG_CHAR_SEARCH;
type = bidi_get_type (ch, NEUTRAL_DIR))
{
if (pos >= end)
bidi_it->paragraph_dir = L2R; /* P3 and HL1 */
else
{
- EMACS_INT prevpbyte = pstartbyte;
- EMACS_INT p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
+ ptrdiff_t prevpbyte = pstartbyte;
+ ptrdiff_t p = BYTE_TO_CHAR (pstartbyte), pbyte = pstartbyte;
/* Find the beginning of the previous paragraph, if any. */
while (pbyte > BEGV_BYTE && prevpbyte >= pstartbyte)
}
}
else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
- || bidi_it->next_en_pos > bidi_it->charpos)
+ || (bidi_it->next_en_pos > bidi_it->charpos
+ && bidi_it->next_en_type == WEAK_EN))
type = WEAK_EN;
break;
case LRE: /* X3 */
}
}
else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
- || bidi_it->next_en_pos > bidi_it->charpos)
+ || (bidi_it->next_en_pos > bidi_it->charpos
+ && bidi_it->next_en_type == WEAK_EN))
type = WEAK_EN;
break;
case PDF: /* X7 */
}
}
else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
- || bidi_it->next_en_pos > bidi_it->charpos)
+ || (bidi_it->next_en_pos > bidi_it->charpos
+ && bidi_it->next_en_type == WEAK_EN))
type = WEAK_EN;
break;
default:
{
int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
int new_level = bidi_resolve_explicit_1 (bidi_it);
- EMACS_INT eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
+ ptrdiff_t eob = bidi_it->string.s ? bidi_it->string.schars : ZV;
const unsigned char *s
= (STRINGP (bidi_it->string.lstring)
? SDATA (bidi_it->string.lstring)
int next_char;
bidi_type_t type_of_next;
struct bidi_it saved_it;
- EMACS_INT eob
+ ptrdiff_t eob
= ((STRINGP (bidi_it->string.lstring) || bidi_it->string.s)
? bidi_it->string.schars : ZV);
else if (type == WEAK_ET /* W5: ET with EN before or after it */
|| type == WEAK_BN) /* W5/Retaining */
{
- if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
- || bidi_it->next_en_pos > bidi_it->charpos)
+ if (bidi_it->prev.type_after_w1 == WEAK_EN) /* ET/BN w/EN before it */
type = WEAK_EN;
- else if (bidi_it->next_en_pos >=0) /* W5: ET/BN with EN after it. */
+ else if (bidi_it->next_en_pos > bidi_it->charpos
+ && bidi_it->next_en_type != WEAK_BN)
+ {
+ if (bidi_it->next_en_type == WEAK_EN) /* ET/BN with EN after it */
+ type = WEAK_EN;
+ }
+ else if (bidi_it->next_en_pos >=0)
{
- EMACS_INT en_pos = bidi_it->charpos + bidi_it->nchars;
+ ptrdiff_t en_pos = bidi_it->charpos + bidi_it->nchars;
const unsigned char *s = (STRINGP (bidi_it->string.lstring)
? SDATA (bidi_it->string.lstring)
: bidi_it->string.s);
en_pos = bidi_it->charpos;
bidi_copy_it (bidi_it, &saved_it);
}
+ /* Remember this position, to speed up processing of the
+ next ETs. */
+ bidi_it->next_en_pos = en_pos;
if (type_of_next == WEAK_EN)
{
/* If the last strong character is AL, the EN we've
found will become AN when we get to it (W2). */
- if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
- {
- type = WEAK_EN;
- /* Remember this EN position, to speed up processing
- of the next ETs. */
- bidi_it->next_en_pos = en_pos;
- }
+ if (bidi_it->last_strong.type_after_w1 == STRONG_AL)
+ type_of_next = WEAK_AN;
else if (type == WEAK_BN)
type = NEUTRAL_ON; /* W6/Retaining */
+ else
+ type = WEAK_EN;
}
else if (type_of_next == NEUTRAL_B)
/* Record the fact that there are no more ENs from
here to the end of paragraph, to avoid entering the
loop above ever again in this paragraph. */
bidi_it->next_en_pos = -1;
+ /* Record the type of the character where we ended our search. */
+ bidi_it->next_en_type = type_of_next;
}
}
}
case STRONG_AL:
/* Actually, STRONG_AL cannot happen here, because
bidi_resolve_weak converts it to STRONG_R, per W3. */
- xassert (type != STRONG_AL);
+ eassert (type != STRONG_AL);
next_type = type;
break;
case WEAK_EN:
bidi_type_t type;
int level, prev_level = -1;
struct bidi_saved_info next_for_neutral;
- EMACS_INT next_char_pos = -2;
+ ptrdiff_t next_char_pos = -2;
if (bidi_it->scan_dir == 1)
{
- EMACS_INT eob
+ ptrdiff_t eob
= ((bidi_it->string.s || STRINGP (bidi_it->string.lstring))
? bidi_it->string.schars : ZV);
bidi_it->next_for_neutral.type = UNKNOWN_BT;
if (bidi_it->next_en_pos >= 0
&& bidi_it->charpos >= bidi_it->next_en_pos)
- bidi_it->next_en_pos = 0;
+ {
+ bidi_it->next_en_pos = 0;
+ bidi_it->next_en_type = UNKNOWN_BT;
+ }
if (bidi_it->next_for_ws.type != UNKNOWN_BT
&& bidi_it->charpos >= bidi_it->next_for_ws.charpos)
bidi_it->next_for_ws.type = UNKNOWN_BT;
&& bidi_it->next_for_ws.type == UNKNOWN_BT)
{
int ch;
- EMACS_INT clen = bidi_it->ch_len;
- EMACS_INT bpos = bidi_it->bytepos;
- EMACS_INT cpos = bidi_it->charpos;
- EMACS_INT disp_pos = bidi_it->disp_pos;
- EMACS_INT nc = bidi_it->nchars;
+ ptrdiff_t clen = bidi_it->ch_len;
+ ptrdiff_t bpos = bidi_it->bytepos;
+ ptrdiff_t cpos = bidi_it->charpos;
+ ptrdiff_t disp_pos = bidi_it->disp_pos;
+ ptrdiff_t nc = bidi_it->nchars;
struct bidi_string_data bs = bidi_it->string;
bidi_type_t chtype;
int fwp = bidi_it->frame_window_p;
}
/* Resolve implicit levels, with a twist: PDFs get the embedding
- level of the enbedding they terminate. See below for the
+ level of the embedding they terminate. See below for the
reason. */
if (bidi_it->orig_type == PDF
/* Don't do this if this formatting code didn't change the
bidi_it->separator_limit = bidi_it->string.schars;
else if (bidi_it->bytepos < ZV_BYTE)
{
- EMACS_INT sep_len
+ ptrdiff_t sep_len
= bidi_at_paragraph_end (bidi_it->charpos + bidi_it->nchars,
bidi_it->bytepos + bidi_it->ch_len);
if (bidi_it->nchars <= 0)
fputs ("\n", stderr);
fputs ("pos ", stderr);
for (i = 0; i < bidi_cache_idx; i++)
- fprintf (stderr, "%*"pI"d", ndigits, bidi_cache[i].charpos);
+ fprintf (stderr, "%*"pD"d", ndigits, bidi_cache[i].charpos);
fputs ("\n", stderr);
}