Convert (most) functions in src to standard C.
[bpt/emacs.git] / src / bidi.c
1 /* Low-level bidirectional buffer-scanning functions for GNU Emacs.
2 Copyright (C) 2000, 2001, 2004, 2005, 2009, 2010
3 Free Software Foundation, Inc.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
19
20 /* Written by Eli Zaretskii <eliz@gnu.org>.
21
22 A sequential implementation of the Unicode Bidirectional algorithm,
23 as per UAX#9, a part of the Unicode Standard.
24
25 Unlike the reference and most other implementations, this one is
26 designed to be called once for every character in the buffer or
27 string.
28
29 The main entry point is bidi_move_to_visually_next. Each time it
30 is called, it finds the next character in the visual order, and
31 returns its information in a special structure. The caller is then
32 expected to process this character for display or any other
33 purposes, and call bidi_move_to_visually_next for the next
34 character. See the comments in bidi_move_to_visually_next for more
35 details about its algorithm that finds the next visual-order
36 character by resolving their levels on the fly.
37
38 The two other entry points are bidi_paragraph_init and
39 bidi_mirror_char. The first determines the base direction of a
40 paragraph, while the second returns the mirrored version of its
41 argument character.
42
43 If you want to understand the code, you will have to read it
44 together with the relevant portions of UAX#9. The comments include
45 references to UAX#9 rules, for that very reason.
46
47 A note about references to UAX#9 rules: if the reference says
48 something like "X9/Retaining", it means that you need to refer to
49 rule X9 and to its modifications decribed in the "Implementation
50 Notes" section of UAX#9, under "Retaining Format Codes". */
51
52 #ifdef HAVE_CONFIG_H
53 #include <config.h>
54 #endif
55
56 #include <stdio.h>
57
58 #ifdef HAVE_STRING_H
59 #include <string.h>
60 #endif
61
62 #include <setjmp.h>
63
64 #include "lisp.h"
65 #include "buffer.h"
66 #include "character.h"
67 #include "dispextern.h"
68
69 static int bidi_initialized = 0;
70
71 static Lisp_Object bidi_type_table, bidi_mirror_table;
72
73 /* FIXME: Remove these when bidi_explicit_dir_char uses a lookup table. */
74 #define LRM_CHAR 0x200E
75 #define RLM_CHAR 0x200F
76 #define LRE_CHAR 0x202A
77 #define RLE_CHAR 0x202B
78 #define PDF_CHAR 0x202C
79 #define LRO_CHAR 0x202D
80 #define RLO_CHAR 0x202E
81
82 #define BIDI_EOB -1
83 #define BIDI_BOB -2 /* FIXME: Is this needed? */
84
85 /* Local data structures. (Look in dispextern.h for the rest.) */
86
87 /* What we need to know about the current paragraph. */
88 struct bidi_paragraph_info {
89 int start_bytepos; /* byte position where it begins */
90 int end_bytepos; /* byte position where it ends */
91 int embedding_level; /* its basic embedding level */
92 bidi_dir_t base_dir; /* its base direction */
93 };
94
95 /* Data type for describing the bidirectional character categories. */
96 typedef enum {
97 UNKNOWN_BC,
98 NEUTRAL,
99 WEAK,
100 STRONG
101 } bidi_category_t;
102
103 int bidi_ignore_explicit_marks_for_paragraph_level = 1;
104
105 static Lisp_Object paragraph_start_re, paragraph_separate_re;
106 static Lisp_Object Qparagraph_start, Qparagraph_separate;
107
108 static void
109 bidi_initialize (void)
110 {
111
112 #include "biditype.h"
113 #include "bidimirror.h"
114
115 int i;
116
117 bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L));
118 staticpro (&bidi_type_table);
119
120 for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++)
121 char_table_set_range (bidi_type_table, bidi_type[i].from, bidi_type[i].to,
122 make_number (bidi_type[i].type));
123
124 bidi_mirror_table = Fmake_char_table (Qnil, Qnil);
125 staticpro (&bidi_mirror_table);
126
127 for (i = 0; i < sizeof bidi_mirror / sizeof bidi_mirror[0]; i++)
128 char_table_set (bidi_mirror_table, bidi_mirror[i].from,
129 make_number (bidi_mirror[i].to));
130
131 Qparagraph_start = intern ("paragraph-start");
132 staticpro (&Qparagraph_start);
133 paragraph_start_re = Fsymbol_value (Qparagraph_start);
134 if (!STRINGP (paragraph_start_re))
135 paragraph_start_re = build_string ("\f\\|[ \t]*$");
136 staticpro (&paragraph_start_re);
137 Qparagraph_separate = intern ("paragraph-separate");
138 staticpro (&Qparagraph_separate);
139 paragraph_separate_re = Fsymbol_value (Qparagraph_separate);
140 if (!STRINGP (paragraph_separate_re))
141 paragraph_separate_re = build_string ("[ \t\f]*$");
142 staticpro (&paragraph_separate_re);
143 bidi_initialized = 1;
144 }
145
146 /* Return the bidi type of a character CH, subject to the current
147 directional OVERRIDE. */
148 static INLINE bidi_type_t
149 bidi_get_type (int ch, bidi_dir_t override)
150 {
151 bidi_type_t default_type;
152
153 if (ch == BIDI_EOB)
154 return NEUTRAL_B;
155 if (ch < 0 || ch > MAX_CHAR)
156 abort ();
157
158 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
159
160 if (override == NEUTRAL_DIR)
161 return default_type;
162
163 switch (default_type)
164 {
165 /* Although UAX#9 does not tell, it doesn't make sense to
166 override NEUTRAL_B and LRM/RLM characters. */
167 case NEUTRAL_B:
168 case LRE:
169 case LRO:
170 case RLE:
171 case RLO:
172 case PDF:
173 return default_type;
174 default:
175 switch (ch)
176 {
177 case LRM_CHAR:
178 case RLM_CHAR:
179 return default_type;
180 default:
181 if (override == L2R) /* X6 */
182 return STRONG_L;
183 else if (override == R2L)
184 return STRONG_R;
185 else
186 abort (); /* can't happen: handled above */
187 }
188 }
189 }
190
191 void
192 bidi_check_type (bidi_type_t type)
193 {
194 if (type < UNKNOWN_BT || type > NEUTRAL_ON)
195 abort ();
196 }
197
198 /* Given a bidi TYPE of a character, return its category. */
199 static INLINE bidi_category_t
200 bidi_get_category (bidi_type_t type)
201 {
202 switch (type)
203 {
204 case UNKNOWN_BT:
205 return UNKNOWN_BC;
206 case STRONG_L:
207 case STRONG_R:
208 case STRONG_AL:
209 case LRE:
210 case LRO:
211 case RLE:
212 case RLO:
213 return STRONG;
214 case PDF: /* ??? really?? */
215 case WEAK_EN:
216 case WEAK_ES:
217 case WEAK_ET:
218 case WEAK_AN:
219 case WEAK_CS:
220 case WEAK_NSM:
221 case WEAK_BN:
222 return WEAK;
223 case NEUTRAL_B:
224 case NEUTRAL_S:
225 case NEUTRAL_WS:
226 case NEUTRAL_ON:
227 return NEUTRAL;
228 default:
229 abort ();
230 }
231 }
232
233 /* Return the mirrored character of C, if it has one. If C has no
234 mirrored counterpart, return C.
235 Note: The conditions in UAX#9 clause L4 regarding the surrounding
236 context must be tested by the caller. */
237 int
238 bidi_mirror_char (int c)
239 {
240 Lisp_Object val;
241
242 if (c == BIDI_EOB)
243 return c;
244 if (c < 0 || c > MAX_CHAR)
245 abort ();
246
247 val = CHAR_TABLE_REF (bidi_mirror_table, c);
248 if (INTEGERP (val))
249 {
250 int v = XINT (val);
251
252 if (v < 0 || v > MAX_CHAR)
253 abort ();
254
255 return v;
256 }
257
258 return c;
259 }
260
261 /* Copy the bidi iterator from FROM to TO. To save cycles, this only
262 copies the part of the level stack that is actually in use. */
263 static INLINE void
264 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
265 {
266 int i;
267
268 /* Copy everything except the level stack and beyond. */
269 memcpy (to, from, ((size_t)&((struct bidi_it *)0)->level_stack[0]));
270
271 /* Copy the active part of the level stack. */
272 to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
273 for (i = 1; i <= from->stack_idx; i++)
274 to->level_stack[i] = from->level_stack[i];
275 }
276
277 /* Caching the bidi iterator states. */
278
279 #define BIDI_CACHE_CHUNK 200
280 static struct bidi_it *bidi_cache;
281 static size_t bidi_cache_size = 0;
282 static size_t elsz = sizeof (struct bidi_it);
283 static int bidi_cache_idx; /* next unused cache slot */
284 static int bidi_cache_last_idx; /* slot of last cache hit */
285
286 static INLINE void
287 bidi_cache_reset (void)
288 {
289 bidi_cache_idx = 0;
290 bidi_cache_last_idx = -1;
291 }
292
293 static INLINE void
294 bidi_cache_shrink (void)
295 {
296 if (bidi_cache_size > BIDI_CACHE_CHUNK)
297 {
298 bidi_cache_size = BIDI_CACHE_CHUNK;
299 bidi_cache =
300 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
301 }
302 bidi_cache_reset ();
303 }
304
305 static INLINE void
306 bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it)
307 {
308 int current_scan_dir = bidi_it->scan_dir;
309
310 if (idx < 0 || idx >= bidi_cache_idx)
311 abort ();
312
313 bidi_copy_it (bidi_it, &bidi_cache[idx]);
314 bidi_it->scan_dir = current_scan_dir;
315 bidi_cache_last_idx = idx;
316 }
317
318 /* Find a cached state with a given CHARPOS and resolved embedding
319 level less or equal to LEVEL. if LEVEL is -1, disregard the
320 resolved levels in cached states. DIR, if non-zero, means search
321 in that direction from the last cache hit. */
322 static INLINE int
323 bidi_cache_search (int charpos, int level, int dir)
324 {
325 int i, i_start;
326
327 if (bidi_cache_idx)
328 {
329 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
330 dir = -1;
331 else if (charpos > bidi_cache[bidi_cache_last_idx].charpos)
332 dir = 1;
333 if (dir)
334 i_start = bidi_cache_last_idx;
335 else
336 {
337 dir = -1;
338 i_start = bidi_cache_idx - 1;
339 }
340
341 if (dir < 0)
342 {
343 /* Linear search for now; FIXME! */
344 for (i = i_start; i >= 0; i--)
345 if (bidi_cache[i].charpos == charpos
346 && (level == -1 || bidi_cache[i].resolved_level <= level))
347 return i;
348 }
349 else
350 {
351 for (i = i_start; i < bidi_cache_idx; i++)
352 if (bidi_cache[i].charpos == charpos
353 && (level == -1 || bidi_cache[i].resolved_level <= level))
354 return i;
355 }
356 }
357
358 return -1;
359 }
360
361 /* Find a cached state where the resolved level changes to a value
362 that is lower than LEVEL, and return its cache slot index. DIR is
363 the direction to search, starting with the last used cache slot.
364 BEFORE, if non-zero, means return the index of the slot that is
365 ``before'' the level change in the search direction. That is,
366 given the cached levels like this:
367
368 1122333442211
369 AB C
370
371 and assuming we are at the position cached at the slot marked with
372 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
373 index of slot B or A, depending whether BEFORE is, respectively,
374 non-zero or zero. */
375 static int
376 bidi_cache_find_level_change (int level, int dir, int before)
377 {
378 if (bidi_cache_idx)
379 {
380 int i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
381 int incr = before ? 1 : 0;
382
383 if (!dir)
384 dir = -1;
385 else if (!incr)
386 i += dir;
387
388 if (dir < 0)
389 {
390 while (i >= incr)
391 {
392 if (bidi_cache[i - incr].resolved_level >= 0
393 && bidi_cache[i - incr].resolved_level < level)
394 return i;
395 i--;
396 }
397 }
398 else
399 {
400 while (i < bidi_cache_idx - incr)
401 {
402 if (bidi_cache[i + incr].resolved_level >= 0
403 && bidi_cache[i + incr].resolved_level < level)
404 return i;
405 i++;
406 }
407 }
408 }
409
410 return -1;
411 }
412
413 static INLINE void
414 bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
415 {
416 int idx;
417
418 /* We should never cache on backward scans. */
419 if (bidi_it->scan_dir == -1)
420 abort ();
421 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
422
423 if (idx < 0)
424 {
425 idx = bidi_cache_idx;
426 /* Enlarge the cache as needed. */
427 if (idx >= bidi_cache_size)
428 {
429 bidi_cache_size += BIDI_CACHE_CHUNK;
430 bidi_cache =
431 (struct bidi_it *) xrealloc (bidi_cache, bidi_cache_size * elsz);
432 }
433 /* Character positions should correspond to cache positions 1:1.
434 If we are outside the range of cached positions, the cache is
435 useless and must be reset. */
436 if (idx > 0 &&
437 (bidi_it->charpos > bidi_cache[idx - 1].charpos + 1
438 || bidi_it->charpos < bidi_cache[0].charpos))
439 {
440 bidi_cache_reset ();
441 idx = 0;
442 }
443 bidi_copy_it (&bidi_cache[idx], bidi_it);
444 if (!resolved)
445 bidi_cache[idx].resolved_level = -1;
446 }
447 else
448 {
449 /* Copy only the members which could have changed, to avoid
450 costly copying of the entire struct. */
451 bidi_cache[idx].type = bidi_it->type;
452 bidi_check_type (bidi_it->type);
453 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
454 bidi_check_type (bidi_it->type_after_w1);
455 if (resolved)
456 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
457 else
458 bidi_cache[idx].resolved_level = -1;
459 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
460 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
461 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
462 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
463 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
464 }
465
466 bidi_cache_last_idx = idx;
467 if (idx >= bidi_cache_idx)
468 bidi_cache_idx = idx + 1;
469 }
470
471 static INLINE bidi_type_t
472 bidi_cache_find (int charpos, int level, struct bidi_it *bidi_it)
473 {
474 int i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
475
476 if (i >= 0)
477 {
478 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
479
480 bidi_copy_it (bidi_it, &bidi_cache[i]);
481 bidi_cache_last_idx = i;
482 /* Don't let scan direction from from the cached state override
483 the current scan direction. */
484 bidi_it->scan_dir = current_scan_dir;
485 return bidi_it->type;
486 }
487
488 return UNKNOWN_BT;
489 }
490
491 static INLINE int
492 bidi_peek_at_next_level (struct bidi_it *bidi_it)
493 {
494 if (bidi_cache_idx == 0 || bidi_cache_last_idx == -1)
495 abort ();
496 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
497 }
498
499 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
500 Value is the non-negative length of the paragraph separator
501 following the buffer position, -1 if position is at the beginning
502 of a new paragraph, or -2 if position is neither at beginning nor
503 at end of a paragraph. */
504 static EMACS_INT
505 bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
506 {
507 /* FIXME: Why Fbuffer_local_value rather than just Fsymbol_value? */
508 Lisp_Object sep_re;
509 Lisp_Object start_re;
510 EMACS_INT val;
511
512 sep_re = paragraph_separate_re;
513 start_re = paragraph_start_re;
514
515 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
516 if (val < 0)
517 {
518 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
519 val = -1;
520 else
521 val = -2;
522 }
523
524 return val;
525 }
526
527 /* Determine the start-of-run (sor) directional type given the two
528 embedding levels on either side of the run boundary. Also, update
529 the saved info about previously seen characters, since that info is
530 generally valid for a single level run. */
531 static INLINE void
532 bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
533 {
534 int higher_level = level_before > level_after ? level_before : level_after;
535
536 /* The prev_was_pdf gork is required for when we have several PDFs
537 in a row. In that case, we want to compute the sor type for the
538 next level run only once: when we see the first PDF. That's
539 because the sor type depends only on the higher of the two levels
540 that we find on the two sides of the level boundary (see UAX#9,
541 clause X10), and so we don't need to know the final embedding
542 level to which we descend after processing all the PDFs. */
543 if (!bidi_it->prev_was_pdf || level_before < level_after)
544 /* FIXME: should the default sor direction be user selectable? */
545 bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
546 if (level_before > level_after)
547 bidi_it->prev_was_pdf = 1;
548
549 bidi_it->prev.type = UNKNOWN_BT;
550 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
551 bidi_it->last_strong.orig_type = UNKNOWN_BT;
552 bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
553 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
554 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
555 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
556 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
557 bidi_it->ignore_bn_limit = 0; /* meaning it's unknown */
558 }
559
560 static void
561 bidi_line_init (struct bidi_it *bidi_it)
562 {
563 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
564 bidi_it->resolved_level = bidi_it->level_stack[0].level;
565 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
566 bidi_it->invalid_levels = 0;
567 bidi_it->invalid_rl_levels = -1;
568 bidi_it->next_en_pos = -1;
569 bidi_it->next_for_ws.type = UNKNOWN_BT;
570 bidi_set_sor_type (bidi_it,
571 bidi_it->paragraph_dir == R2L ? 1 : 0,
572 bidi_it->level_stack[0].level); /* X10 */
573
574 bidi_cache_reset ();
575 }
576
577 /* Find the beginning of this paragraph by looking back in the buffer.
578 Value is the byte position of the paragraph's beginning. */
579 static EMACS_INT
580 bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
581 {
582 Lisp_Object re = paragraph_start_re;
583 EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
584
585 while (pos_byte > BEGV_BYTE
586 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
587 {
588 pos = find_next_newline_no_quit (pos - 1, -1);
589 pos_byte = CHAR_TO_BYTE (pos);
590 }
591 return pos_byte;
592 }
593
594 /* Determine the direction, a.k.a. base embedding level, of the
595 paragraph we are about to iterate through. If DIR is either L2R or
596 R2L, just use that. Otherwise, determine the paragraph direction
597 from the first strong character of the paragraph.
598
599 Note that this gives the paragraph separator the same direction as
600 the preceding paragraph, even though Emacs generally views the
601 separartor as not belonging to any paragraph. */
602 void
603 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it)
604 {
605 EMACS_INT bytepos = bidi_it->bytepos;
606
607 /* Special case for an empty buffer. */
608 if (bytepos == BEGV_BYTE && bytepos == ZV_BYTE)
609 dir = L2R;
610 /* We should never be called at EOB or before BEGV. */
611 else if (bytepos >= ZV_BYTE || bytepos < BEGV_BYTE)
612 abort ();
613
614 if (dir == L2R)
615 {
616 bidi_it->paragraph_dir = L2R;
617 bidi_it->new_paragraph = 0;
618 }
619 else if (dir == R2L)
620 {
621 bidi_it->paragraph_dir = R2L;
622 bidi_it->new_paragraph = 0;
623 }
624 else if (dir == NEUTRAL_DIR) /* P2 */
625 {
626 int ch, ch_len;
627 EMACS_INT pos;
628 bidi_type_t type;
629
630 if (!bidi_initialized)
631 bidi_initialize ();
632
633 /* If we are inside a paragraph separator, we are just waiting
634 for the separator to be exhausted; use the previous paragraph
635 direction. But don't do that if we have been just reseated,
636 because we need to reinitialize below in that case. */
637 if (!bidi_it->first_elt
638 && bidi_it->charpos < bidi_it->separator_limit)
639 return;
640
641 /* If we are on a newline, get past it to where the next
642 paragraph might start. But don't do that at BEGV since then
643 we are potentially in a new paragraph that doesn't yet
644 exist. */
645 pos = bidi_it->charpos;
646 if (bytepos > BEGV_BYTE && FETCH_CHAR (bytepos) == '\n')
647 {
648 bytepos++;
649 pos++;
650 }
651
652 /* We are either at the beginning of a paragraph or in the
653 middle of it. Find where this paragraph starts. */
654 bytepos = bidi_find_paragraph_start (pos, bytepos);
655
656 bidi_it->separator_limit = -1;
657 bidi_it->new_paragraph = 0;
658 ch = FETCH_CHAR (bytepos);
659 ch_len = CHAR_BYTES (ch);
660 pos = BYTE_TO_CHAR (bytepos);
661 type = bidi_get_type (ch, NEUTRAL_DIR);
662
663 for (pos++, bytepos += ch_len;
664 /* NOTE: UAX#9 says to search only for L, AL, or R types of
665 characters, and ignore RLE, RLO, LRE, and LRO. However,
666 I'm not sure it makes sense to omit those 4; should try
667 with and without that to see the effect. */
668 (bidi_get_category (type) != STRONG)
669 || (bidi_ignore_explicit_marks_for_paragraph_level
670 && (type == RLE || type == RLO
671 || type == LRE || type == LRO));
672 type = bidi_get_type (ch, NEUTRAL_DIR))
673 {
674 if (type == NEUTRAL_B && bidi_at_paragraph_end (pos, bytepos) >= -1)
675 break;
676 if (bytepos >= ZV_BYTE)
677 {
678 /* Pretend there's a paragraph separator at end of buffer. */
679 type = NEUTRAL_B;
680 break;
681 }
682 FETCH_CHAR_ADVANCE (ch, pos, bytepos);
683 }
684 if (type == STRONG_R || type == STRONG_AL) /* P3 */
685 bidi_it->paragraph_dir = R2L;
686 else if (type == STRONG_L)
687 bidi_it->paragraph_dir = L2R;
688 }
689 else
690 abort ();
691
692 /* Contrary to UAX#9 clause P3, we only default the paragraph
693 direction to L2R if we have no previous usable paragraph
694 direction. */
695 if (bidi_it->paragraph_dir != L2R && bidi_it->paragraph_dir != R2L)
696 bidi_it->paragraph_dir = L2R; /* P3 and ``higher protocols'' */
697 if (bidi_it->paragraph_dir == R2L)
698 bidi_it->level_stack[0].level = 1;
699 else
700 bidi_it->level_stack[0].level = 0;
701
702 bidi_line_init (bidi_it);
703 }
704
705 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
706 end. */
707 static INLINE void
708 bidi_set_paragraph_end (struct bidi_it *bidi_it)
709 {
710 bidi_it->invalid_levels = 0;
711 bidi_it->invalid_rl_levels = -1;
712 bidi_it->stack_idx = 0;
713 bidi_it->resolved_level = bidi_it->level_stack[0].level;
714 }
715
716 /* Initialize the bidi iterator from buffer position CHARPOS. */
717 void
718 bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, struct bidi_it *bidi_it)
719 {
720 if (! bidi_initialized)
721 bidi_initialize ();
722 bidi_it->charpos = charpos;
723 bidi_it->bytepos = bytepos;
724 bidi_it->first_elt = 1;
725 bidi_set_paragraph_end (bidi_it);
726 bidi_it->new_paragraph = 1;
727 bidi_it->separator_limit = -1;
728 bidi_it->type = NEUTRAL_B;
729 bidi_it->type_after_w1 = NEUTRAL_B;
730 bidi_it->orig_type = NEUTRAL_B;
731 bidi_it->prev_was_pdf = 0;
732 bidi_it->prev.type = bidi_it->prev.type_after_w1 =
733 bidi_it->prev.orig_type = UNKNOWN_BT;
734 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
735 bidi_it->last_strong.orig_type = UNKNOWN_BT;
736 bidi_it->next_for_neutral.charpos = -1;
737 bidi_it->next_for_neutral.type =
738 bidi_it->next_for_neutral.type_after_w1 =
739 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
740 bidi_it->prev_for_neutral.charpos = -1;
741 bidi_it->prev_for_neutral.type =
742 bidi_it->prev_for_neutral.type_after_w1 =
743 bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
744 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
745 bidi_cache_shrink ();
746 }
747
748 /* Push the current embedding level and override status; reset the
749 current level to LEVEL and the current override status to OVERRIDE. */
750 static INLINE void
751 bidi_push_embedding_level (struct bidi_it *bidi_it,
752 int level, bidi_dir_t override)
753 {
754 bidi_it->stack_idx++;
755 if (bidi_it->stack_idx >= BIDI_MAXLEVEL)
756 abort ();
757 bidi_it->level_stack[bidi_it->stack_idx].level = level;
758 bidi_it->level_stack[bidi_it->stack_idx].override = override;
759 }
760
761 /* Pop the embedding level and directional override status from the
762 stack, and return the new level. */
763 static INLINE int
764 bidi_pop_embedding_level (struct bidi_it *bidi_it)
765 {
766 /* UAX#9 says to ignore invalid PDFs. */
767 if (bidi_it->stack_idx > 0)
768 bidi_it->stack_idx--;
769 return bidi_it->level_stack[bidi_it->stack_idx].level;
770 }
771
772 /* Record in SAVED_INFO the information about the current character. */
773 static INLINE void
774 bidi_remember_char (struct bidi_saved_info *saved_info,
775 struct bidi_it *bidi_it)
776 {
777 saved_info->charpos = bidi_it->charpos;
778 saved_info->bytepos = bidi_it->bytepos;
779 saved_info->type = bidi_it->type;
780 bidi_check_type (bidi_it->type);
781 saved_info->type_after_w1 = bidi_it->type_after_w1;
782 bidi_check_type (bidi_it->type_after_w1);
783 saved_info->orig_type = bidi_it->orig_type;
784 bidi_check_type (bidi_it->orig_type);
785 }
786
787 /* Resolve the type of a neutral character according to the type of
788 surrounding strong text and the current embedding level. */
789 static INLINE bidi_type_t
790 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
791 {
792 /* N1: European and Arabic numbers are treated as though they were R. */
793 if (next_type == WEAK_EN || next_type == WEAK_AN)
794 next_type = STRONG_R;
795 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
796 prev_type = STRONG_R;
797
798 if (next_type == prev_type) /* N1 */
799 return next_type;
800 else if ((lev & 1) == 0) /* N2 */
801 return STRONG_L;
802 else
803 return STRONG_R;
804 }
805
806 static INLINE int
807 bidi_explicit_dir_char (int c)
808 {
809 /* FIXME: this should be replaced with a lookup table with suitable
810 bits set, like standard C ctype macros do. */
811 return (c == LRE_CHAR || c == LRO_CHAR
812 || c == RLE_CHAR || c == RLO_CHAR || c == PDF_CHAR);
813 }
814
815 /* A helper function for bidi_resolve_explicit. It advances to the
816 next character in logical order and determines the new embedding
817 level and directional override, but does not take into account
818 empty embeddings. */
819 static int
820 bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
821 {
822 int curchar;
823 bidi_type_t type;
824 int current_level;
825 int new_level;
826 bidi_dir_t override;
827
828 if (bidi_it->bytepos < BEGV_BYTE /* after reseat to BEGV? */
829 || bidi_it->first_elt)
830 {
831 bidi_it->first_elt = 0;
832 if (bidi_it->charpos < BEGV)
833 bidi_it->charpos = BEGV;
834 bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
835 }
836 else if (bidi_it->bytepos < ZV_BYTE) /* don't move at ZV */
837 {
838 bidi_it->charpos++;
839 if (bidi_it->ch_len == 0)
840 abort ();
841 bidi_it->bytepos += bidi_it->ch_len;
842 }
843
844 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
845 override = bidi_it->level_stack[bidi_it->stack_idx].override;
846 new_level = current_level;
847
848 /* in case it is a unibyte character (not yet implemented) */
849 /* _fetch_multibyte_char_len = 1; */
850 if (bidi_it->bytepos >= ZV_BYTE)
851 {
852 curchar = BIDI_EOB;
853 bidi_it->ch_len = 1;
854 }
855 else
856 {
857 curchar = FETCH_CHAR (bidi_it->bytepos);
858 bidi_it->ch_len = CHAR_BYTES (curchar);
859 }
860 bidi_it->ch = curchar;
861
862 /* Don't apply directional override here, as all the types we handle
863 below will not be affected by the override anyway, and we need
864 the original type unaltered. The override will be applied in
865 bidi_resolve_weak. */
866 type = bidi_get_type (curchar, NEUTRAL_DIR);
867 bidi_it->orig_type = type;
868 bidi_check_type (bidi_it->orig_type);
869
870 if (type != PDF)
871 bidi_it->prev_was_pdf = 0;
872
873 bidi_it->type_after_w1 = UNKNOWN_BT;
874
875 switch (type)
876 {
877 case RLE: /* X2 */
878 case RLO: /* X4 */
879 bidi_it->type_after_w1 = type;
880 bidi_check_type (bidi_it->type_after_w1);
881 type = WEAK_BN; /* X9/Retaining */
882 if (bidi_it->ignore_bn_limit <= 0)
883 {
884 if (current_level <= BIDI_MAXLEVEL - 4)
885 {
886 /* Compute the least odd embedding level greater than
887 the current level. */
888 new_level = ((current_level + 1) & ~1) + 1;
889 if (bidi_it->type_after_w1 == RLE)
890 override = NEUTRAL_DIR;
891 else
892 override = R2L;
893 if (current_level == BIDI_MAXLEVEL - 4)
894 bidi_it->invalid_rl_levels = 0;
895 bidi_push_embedding_level (bidi_it, new_level, override);
896 }
897 else
898 {
899 bidi_it->invalid_levels++;
900 /* See the commentary about invalid_rl_levels below. */
901 if (bidi_it->invalid_rl_levels < 0)
902 bidi_it->invalid_rl_levels = 0;
903 bidi_it->invalid_rl_levels++;
904 }
905 }
906 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
907 || bidi_it->next_en_pos > bidi_it->charpos)
908 type = WEAK_EN;
909 break;
910 case LRE: /* X3 */
911 case LRO: /* X5 */
912 bidi_it->type_after_w1 = type;
913 bidi_check_type (bidi_it->type_after_w1);
914 type = WEAK_BN; /* X9/Retaining */
915 if (bidi_it->ignore_bn_limit <= 0)
916 {
917 if (current_level <= BIDI_MAXLEVEL - 5)
918 {
919 /* Compute the least even embedding level greater than
920 the current level. */
921 new_level = ((current_level + 2) & ~1);
922 if (bidi_it->type_after_w1 == LRE)
923 override = NEUTRAL_DIR;
924 else
925 override = L2R;
926 bidi_push_embedding_level (bidi_it, new_level, override);
927 }
928 else
929 {
930 bidi_it->invalid_levels++;
931 /* invalid_rl_levels counts invalid levels encountered
932 while the embedding level was already too high for
933 LRE/LRO, but not for RLE/RLO. That is because
934 there may be exactly one PDF which we should not
935 ignore even though invalid_levels is non-zero.
936 invalid_rl_levels helps to know what PDF is
937 that. */
938 if (bidi_it->invalid_rl_levels >= 0)
939 bidi_it->invalid_rl_levels++;
940 }
941 }
942 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
943 || bidi_it->next_en_pos > bidi_it->charpos)
944 type = WEAK_EN;
945 break;
946 case PDF: /* X7 */
947 bidi_it->type_after_w1 = type;
948 bidi_check_type (bidi_it->type_after_w1);
949 type = WEAK_BN; /* X9/Retaining */
950 if (bidi_it->ignore_bn_limit <= 0)
951 {
952 if (!bidi_it->invalid_rl_levels)
953 {
954 new_level = bidi_pop_embedding_level (bidi_it);
955 bidi_it->invalid_rl_levels = -1;
956 if (bidi_it->invalid_levels)
957 bidi_it->invalid_levels--;
958 /* else nothing: UAX#9 says to ignore invalid PDFs */
959 }
960 if (!bidi_it->invalid_levels)
961 new_level = bidi_pop_embedding_level (bidi_it);
962 else
963 {
964 bidi_it->invalid_levels--;
965 bidi_it->invalid_rl_levels--;
966 }
967 }
968 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
969 || bidi_it->next_en_pos > bidi_it->charpos)
970 type = WEAK_EN;
971 break;
972 default:
973 /* Nothing. */
974 break;
975 }
976
977 bidi_it->type = type;
978 bidi_check_type (bidi_it->type);
979
980 return new_level;
981 }
982
983 /* Given an iterator state in BIDI_IT, advance one character position
984 in the buffer to the next character (in the logical order), resolve
985 any explicit embeddings and directional overrides, and return the
986 embedding level of the character after resolving explicit
987 directives and ignoring empty embeddings. */
988 static int
989 bidi_resolve_explicit (struct bidi_it *bidi_it)
990 {
991 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
992 int new_level = bidi_resolve_explicit_1 (bidi_it);
993
994 if (prev_level < new_level
995 && bidi_it->type == WEAK_BN
996 && bidi_it->ignore_bn_limit == 0 /* only if not already known */
997 && bidi_it->bytepos < ZV_BYTE /* not already at EOB */
998 && bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
999 + bidi_it->ch_len)))
1000 {
1001 /* Avoid pushing and popping embedding levels if the level run
1002 is empty, as this breaks level runs where it shouldn't.
1003 UAX#9 removes all the explicit embedding and override codes,
1004 so empty embeddings disappear without a trace. We need to
1005 behave as if we did the same. */
1006 struct bidi_it saved_it;
1007 int level = prev_level;
1008
1009 bidi_copy_it (&saved_it, bidi_it);
1010
1011 while (bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
1012 + bidi_it->ch_len)))
1013 {
1014 level = bidi_resolve_explicit_1 (bidi_it);
1015 }
1016
1017 if (level == prev_level) /* empty embedding */
1018 saved_it.ignore_bn_limit = bidi_it->charpos + 1;
1019 else /* this embedding is non-empty */
1020 saved_it.ignore_bn_limit = -1;
1021
1022 bidi_copy_it (bidi_it, &saved_it);
1023 if (bidi_it->ignore_bn_limit > 0)
1024 {
1025 /* We pushed a level, but we shouldn't have. Undo that. */
1026 if (!bidi_it->invalid_rl_levels)
1027 {
1028 new_level = bidi_pop_embedding_level (bidi_it);
1029 bidi_it->invalid_rl_levels = -1;
1030 if (bidi_it->invalid_levels)
1031 bidi_it->invalid_levels--;
1032 }
1033 if (!bidi_it->invalid_levels)
1034 new_level = bidi_pop_embedding_level (bidi_it);
1035 else
1036 {
1037 bidi_it->invalid_levels--;
1038 bidi_it->invalid_rl_levels--;
1039 }
1040 }
1041 }
1042
1043 if (bidi_it->type == NEUTRAL_B) /* X8 */
1044 {
1045 bidi_set_paragraph_end (bidi_it);
1046 /* This is needed by bidi_resolve_weak below, and in L1. */
1047 bidi_it->type_after_w1 = bidi_it->type;
1048 bidi_check_type (bidi_it->type_after_w1);
1049 }
1050
1051 return new_level;
1052 }
1053
1054 /* Advance in the buffer, resolve weak types and return the type of
1055 the next character after weak type resolution. */
1056 static bidi_type_t
1057 bidi_resolve_weak (struct bidi_it *bidi_it)
1058 {
1059 bidi_type_t type;
1060 bidi_dir_t override;
1061 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1062 int new_level = bidi_resolve_explicit (bidi_it);
1063 int next_char;
1064 bidi_type_t type_of_next;
1065 struct bidi_it saved_it;
1066
1067 type = bidi_it->type;
1068 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1069
1070 if (type == UNKNOWN_BT
1071 || type == LRE
1072 || type == LRO
1073 || type == RLE
1074 || type == RLO
1075 || type == PDF)
1076 abort ();
1077
1078 if (new_level != prev_level
1079 || bidi_it->type == NEUTRAL_B)
1080 {
1081 /* We've got a new embedding level run, compute the directional
1082 type of sor and initialize per-run variables (UAX#9, clause
1083 X10). */
1084 bidi_set_sor_type (bidi_it, prev_level, new_level);
1085 }
1086 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1087 || type == WEAK_BN || type == STRONG_AL)
1088 bidi_it->type_after_w1 = type; /* needed in L1 */
1089 bidi_check_type (bidi_it->type_after_w1);
1090
1091 /* Level and directional override status are already recorded in
1092 bidi_it, and do not need any change; see X6. */
1093 if (override == R2L) /* X6 */
1094 type = STRONG_R;
1095 else if (override == L2R)
1096 type = STRONG_L;
1097 else
1098 {
1099 if (type == WEAK_NSM) /* W1 */
1100 {
1101 /* Note that we don't need to consider the case where the
1102 prev character has its type overridden by an RLO or LRO,
1103 because then either the type of this NSM would have been
1104 also overridden, or the previous character is outside the
1105 current level run, and thus not relevant to this NSM.
1106 This is why NSM gets the type_after_w1 of the previous
1107 character. */
1108 if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1109 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1110 && bidi_it->prev.type_after_w1 != NEUTRAL_B)
1111 type = bidi_it->prev.type_after_w1;
1112 else if (bidi_it->sor == R2L)
1113 type = STRONG_R;
1114 else if (bidi_it->sor == L2R)
1115 type = STRONG_L;
1116 else /* shouldn't happen! */
1117 abort ();
1118 }
1119 if (type == WEAK_EN /* W2 */
1120 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1121 type = WEAK_AN;
1122 else if (type == STRONG_AL) /* W3 */
1123 type = STRONG_R;
1124 else if ((type == WEAK_ES /* W4 */
1125 && bidi_it->prev.type_after_w1 == WEAK_EN
1126 && bidi_it->prev.orig_type == WEAK_EN)
1127 || (type == WEAK_CS
1128 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1129 && bidi_it->prev.orig_type == WEAK_EN)
1130 || bidi_it->prev.type_after_w1 == WEAK_AN)))
1131 {
1132 next_char =
1133 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1134 ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
1135 type_of_next = bidi_get_type (next_char, override);
1136
1137 if (type_of_next == WEAK_BN
1138 || bidi_explicit_dir_char (next_char))
1139 {
1140 bidi_copy_it (&saved_it, bidi_it);
1141 while (bidi_resolve_explicit (bidi_it) == new_level
1142 && bidi_it->type == WEAK_BN)
1143 ;
1144 type_of_next = bidi_it->type;
1145 bidi_copy_it (bidi_it, &saved_it);
1146 }
1147
1148 /* If the next character is EN, but the last strong-type
1149 character is AL, that next EN will be changed to AN when
1150 we process it in W2 above. So in that case, this ES
1151 should not be changed into EN. */
1152 if (type == WEAK_ES
1153 && type_of_next == WEAK_EN
1154 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1155 type = WEAK_EN;
1156 else if (type == WEAK_CS)
1157 {
1158 if (bidi_it->prev.type_after_w1 == WEAK_AN
1159 && (type_of_next == WEAK_AN
1160 /* If the next character is EN, but the last
1161 strong-type character is AL, EN will be later
1162 changed to AN when we process it in W2 above.
1163 So in that case, this ES should not be
1164 changed into EN. */
1165 || (type_of_next == WEAK_EN
1166 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1167 type = WEAK_AN;
1168 else if (bidi_it->prev.type_after_w1 == WEAK_EN
1169 && type_of_next == WEAK_EN
1170 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1171 type = WEAK_EN;
1172 }
1173 }
1174 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1175 || type == WEAK_BN) /* W5/Retaining */
1176 {
1177 if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
1178 || bidi_it->next_en_pos > bidi_it->charpos)
1179 type = WEAK_EN;
1180 else /* W5: ET/BN with EN after it. */
1181 {
1182 EMACS_INT en_pos = bidi_it->charpos + 1;
1183
1184 next_char =
1185 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1186 ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
1187 type_of_next = bidi_get_type (next_char, override);
1188
1189 if (type_of_next == WEAK_ET
1190 || type_of_next == WEAK_BN
1191 || bidi_explicit_dir_char (next_char))
1192 {
1193 bidi_copy_it (&saved_it, bidi_it);
1194 while (bidi_resolve_explicit (bidi_it) == new_level
1195 && (bidi_it->type == WEAK_BN
1196 || bidi_it->type == WEAK_ET))
1197 ;
1198 type_of_next = bidi_it->type;
1199 en_pos = bidi_it->charpos;
1200 bidi_copy_it (bidi_it, &saved_it);
1201 }
1202 if (type_of_next == WEAK_EN)
1203 {
1204 /* If the last strong character is AL, the EN we've
1205 found will become AN when we get to it (W2). */
1206 if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
1207 {
1208 type = WEAK_EN;
1209 /* Remember this EN position, to speed up processing
1210 of the next ETs. */
1211 bidi_it->next_en_pos = en_pos;
1212 }
1213 else if (type == WEAK_BN)
1214 type = NEUTRAL_ON; /* W6/Retaining */
1215 }
1216 }
1217 }
1218 }
1219
1220 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
1221 || (type == WEAK_BN
1222 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1223 || bidi_it->prev.type_after_w1 == WEAK_ES
1224 || bidi_it->prev.type_after_w1 == WEAK_ET)))
1225 type = NEUTRAL_ON;
1226
1227 /* Store the type we've got so far, before we clobber it with strong
1228 types in W7 and while resolving neutral types. But leave alone
1229 the original types that were recorded above, because we will need
1230 them for the L1 clause. */
1231 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1232 bidi_it->type_after_w1 = type;
1233 bidi_check_type (bidi_it->type_after_w1);
1234
1235 if (type == WEAK_EN) /* W7 */
1236 {
1237 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
1238 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1239 type = STRONG_L;
1240 }
1241
1242 bidi_it->type = type;
1243 bidi_check_type (bidi_it->type);
1244 return type;
1245 }
1246
1247 static bidi_type_t
1248 bidi_resolve_neutral (struct bidi_it *bidi_it)
1249 {
1250 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1251 bidi_type_t type = bidi_resolve_weak (bidi_it);
1252 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1253
1254 if (!(type == STRONG_R
1255 || type == STRONG_L
1256 || type == WEAK_BN
1257 || type == WEAK_EN
1258 || type == WEAK_AN
1259 || type == NEUTRAL_B
1260 || type == NEUTRAL_S
1261 || type == NEUTRAL_WS
1262 || type == NEUTRAL_ON))
1263 abort ();
1264
1265 if (bidi_get_category (type) == NEUTRAL
1266 || (type == WEAK_BN && prev_level == current_level))
1267 {
1268 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1269 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1270 bidi_it->next_for_neutral.type,
1271 current_level);
1272 else
1273 {
1274 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1275 the assumption of batch-style processing; see clauses W4,
1276 W5, and especially N1, which require to look far forward
1277 (as well as back) in the buffer. May the fleas of a
1278 thousand camels infest the armpits of those who design
1279 supposedly general-purpose algorithms by looking at their
1280 own implementations, and fail to consider other possible
1281 implementations! */
1282 struct bidi_it saved_it;
1283 bidi_type_t next_type;
1284
1285 if (bidi_it->scan_dir == -1)
1286 abort ();
1287
1288 bidi_copy_it (&saved_it, bidi_it);
1289 /* Scan the text forward until we find the first non-neutral
1290 character, and then use that to resolve the neutral we
1291 are dealing with now. We also cache the scanned iterator
1292 states, to salvage some of the effort later. */
1293 bidi_cache_iterator_state (bidi_it, 0);
1294 do {
1295 /* Record the info about the previous character, so that
1296 it will be cached below with this state. */
1297 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1298 && bidi_it->type != WEAK_BN)
1299 bidi_remember_char (&bidi_it->prev, bidi_it);
1300 type = bidi_resolve_weak (bidi_it);
1301 /* Paragraph separators have their levels fully resolved
1302 at this point, so cache them as resolved. */
1303 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1304 /* FIXME: implement L1 here, by testing for a newline and
1305 resetting the level for any sequence of whitespace
1306 characters adjacent to it. */
1307 } while (!(type == NEUTRAL_B
1308 || (type != WEAK_BN
1309 && bidi_get_category (type) != NEUTRAL)
1310 /* This is all per level run, so stop when we
1311 reach the end of this level run. */
1312 || bidi_it->level_stack[bidi_it->stack_idx].level !=
1313 current_level));
1314
1315 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1316
1317 switch (type)
1318 {
1319 case STRONG_L:
1320 case STRONG_R:
1321 case STRONG_AL:
1322 next_type = type;
1323 break;
1324 case WEAK_EN:
1325 case WEAK_AN:
1326 /* N1: ``European and Arabic numbers are treated as
1327 though they were R.'' */
1328 next_type = STRONG_R;
1329 saved_it.next_for_neutral.type = STRONG_R;
1330 break;
1331 case WEAK_BN:
1332 if (!bidi_explicit_dir_char (bidi_it->ch))
1333 abort (); /* can't happen: BNs are skipped */
1334 /* FALLTHROUGH */
1335 case NEUTRAL_B:
1336 /* Marched all the way to the end of this level run.
1337 We need to use the eor type, whose information is
1338 stored by bidi_set_sor_type in the prev_for_neutral
1339 member. */
1340 if (saved_it.type != WEAK_BN
1341 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
1342 {
1343 next_type = bidi_it->prev_for_neutral.type;
1344 saved_it.next_for_neutral.type = next_type;
1345 bidi_check_type (next_type);
1346 }
1347 else
1348 {
1349 /* This is a BN which does not adjoin neutrals.
1350 Leave its type alone. */
1351 bidi_copy_it (bidi_it, &saved_it);
1352 return bidi_it->type;
1353 }
1354 break;
1355 default:
1356 abort ();
1357 }
1358 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1359 next_type, current_level);
1360 saved_it.type = type;
1361 bidi_check_type (type);
1362 bidi_copy_it (bidi_it, &saved_it);
1363 }
1364 }
1365 return type;
1366 }
1367
1368 /* Given an iterator state in BIDI_IT, advance one character position
1369 in the buffer to the next character (in the logical order), resolve
1370 the bidi type of that next character, and return that type. */
1371 static bidi_type_t
1372 bidi_type_of_next_char (struct bidi_it *bidi_it)
1373 {
1374 bidi_type_t type;
1375
1376 /* This should always be called during a forward scan. */
1377 if (bidi_it->scan_dir != 1)
1378 abort ();
1379
1380 /* Reset the limit until which to ignore BNs if we step out of the
1381 area where we found only empty levels. */
1382 if ((bidi_it->ignore_bn_limit > 0
1383 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
1384 || (bidi_it->ignore_bn_limit == -1
1385 && !bidi_explicit_dir_char (bidi_it->ch)))
1386 bidi_it->ignore_bn_limit = 0;
1387
1388 type = bidi_resolve_neutral (bidi_it);
1389
1390 return type;
1391 }
1392
1393 /* Given an iterator state BIDI_IT, advance one character position in
1394 the buffer to the next character (in the logical order), resolve
1395 the embedding and implicit levels of that next character, and
1396 return the resulting level. */
1397 static int
1398 bidi_level_of_next_char (struct bidi_it *bidi_it)
1399 {
1400 bidi_type_t type;
1401 int level, prev_level = -1;
1402 struct bidi_saved_info next_for_neutral;
1403
1404 if (bidi_it->scan_dir == 1)
1405 {
1406 /* There's no sense in trying to advance if we hit end of text. */
1407 if (bidi_it->bytepos >= ZV_BYTE)
1408 return bidi_it->resolved_level;
1409
1410 /* Record the info about the previous character. */
1411 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1412 && bidi_it->type != WEAK_BN)
1413 bidi_remember_char (&bidi_it->prev, bidi_it);
1414 if (bidi_it->type_after_w1 == STRONG_R
1415 || bidi_it->type_after_w1 == STRONG_L
1416 || bidi_it->type_after_w1 == STRONG_AL)
1417 bidi_remember_char (&bidi_it->last_strong, bidi_it);
1418 /* FIXME: it sounds like we don't need both prev and
1419 prev_for_neutral members, but I'm leaving them both for now. */
1420 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1421 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1422 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1423
1424 /* If we overstepped the characters used for resolving neutrals
1425 and whitespace, invalidate their info in the iterator. */
1426 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1427 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1428 if (bidi_it->next_en_pos >= 0
1429 && bidi_it->charpos >= bidi_it->next_en_pos)
1430 bidi_it->next_en_pos = -1;
1431 if (bidi_it->next_for_ws.type != UNKNOWN_BT
1432 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1433 bidi_it->next_for_ws.type = UNKNOWN_BT;
1434
1435 /* This must be taken before we fill the iterator with the info
1436 about the next char. If we scan backwards, the iterator
1437 state must be already cached, so there's no need to know the
1438 embedding level of the previous character, since we will be
1439 returning to our caller shortly. */
1440 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1441 }
1442 next_for_neutral = bidi_it->next_for_neutral;
1443
1444 /* Perhaps it is already cached. */
1445 type = bidi_cache_find (bidi_it->charpos + bidi_it->scan_dir, -1, bidi_it);
1446 if (type != UNKNOWN_BT)
1447 {
1448 /* Don't lose the information for resolving neutrals! The
1449 cached states could have been cached before their
1450 next_for_neutral member was computed. If we are on our way
1451 forward, we can simply take the info from the previous
1452 state. */
1453 if (bidi_it->scan_dir == 1
1454 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
1455 bidi_it->next_for_neutral = next_for_neutral;
1456
1457 /* If resolved_level is -1, it means this state was cached
1458 before it was completely resolved, so we cannot return
1459 it. */
1460 if (bidi_it->resolved_level != -1)
1461 return bidi_it->resolved_level;
1462 }
1463 if (bidi_it->scan_dir == -1)
1464 /* If we are going backwards, the iterator state is already cached
1465 from previous scans, and should be fully resolved. */
1466 abort ();
1467
1468 if (type == UNKNOWN_BT)
1469 type = bidi_type_of_next_char (bidi_it);
1470
1471 if (type == NEUTRAL_B)
1472 return bidi_it->resolved_level;
1473
1474 level = bidi_it->level_stack[bidi_it->stack_idx].level;
1475 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
1476 || (type == WEAK_BN && prev_level == level))
1477 {
1478 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1479 abort ();
1480
1481 /* If the cached state shows a neutral character, it was not
1482 resolved by bidi_resolve_neutral, so do it now. */
1483 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1484 bidi_it->next_for_neutral.type,
1485 level);
1486 }
1487
1488 if (!(type == STRONG_R
1489 || type == STRONG_L
1490 || type == WEAK_BN
1491 || type == WEAK_EN
1492 || type == WEAK_AN))
1493 abort ();
1494 bidi_it->type = type;
1495 bidi_check_type (bidi_it->type);
1496
1497 /* For L1 below, we need to know, for each WS character, whether
1498 it belongs to a sequence of WS characters preceeding a newline
1499 or a TAB or a paragraph separator. */
1500 if (bidi_it->orig_type == NEUTRAL_WS
1501 && bidi_it->next_for_ws.type == UNKNOWN_BT)
1502 {
1503 int ch;
1504 int clen = bidi_it->ch_len;
1505 EMACS_INT bpos = bidi_it->bytepos;
1506 EMACS_INT cpos = bidi_it->charpos;
1507 bidi_type_t chtype;
1508
1509 do {
1510 /*_fetch_multibyte_char_len = 1;*/
1511 ch = bpos + clen >= ZV_BYTE ? BIDI_EOB : FETCH_CHAR (bpos + clen);
1512 bpos += clen;
1513 cpos++;
1514 clen = (ch == BIDI_EOB ? 1 : CHAR_BYTES (ch));
1515 if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
1516 chtype = NEUTRAL_B;
1517 else
1518 chtype = bidi_get_type (ch, NEUTRAL_DIR);
1519 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
1520 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
1521 bidi_it->next_for_ws.type = chtype;
1522 bidi_check_type (bidi_it->next_for_ws.type);
1523 bidi_it->next_for_ws.charpos = cpos;
1524 bidi_it->next_for_ws.bytepos = bpos;
1525 }
1526
1527 /* Resolve implicit levels, with a twist: PDFs get the embedding
1528 level of the enbedding they terminate. See below for the
1529 reason. */
1530 if (bidi_it->orig_type == PDF
1531 /* Don't do this if this formatting code didn't change the
1532 embedding level due to invalid or empty embeddings. */
1533 && prev_level != level)
1534 {
1535 /* Don't look in UAX#9 for the reason for this: it's our own
1536 private quirk. The reason is that we want the formatting
1537 codes to be delivered so that they bracket the text of their
1538 embedding. For example, given the text
1539
1540 {RLO}teST{PDF}
1541
1542 we want it to be displayed as
1543
1544 {RLO}STet{PDF}
1545
1546 not as
1547
1548 STet{RLO}{PDF}
1549
1550 which will result because we bump up the embedding level as
1551 soon as we see the RLO and pop it as soon as we see the PDF,
1552 so RLO itself has the same embedding level as "teST", and
1553 thus would be normally delivered last, just before the PDF.
1554 The switch below fiddles with the level of PDF so that this
1555 ugly side effect does not happen.
1556
1557 (This is, of course, only important if the formatting codes
1558 are actually displayed, but Emacs does need to display them
1559 if the user wants to.) */
1560 level = prev_level;
1561 }
1562 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
1563 || bidi_it->orig_type == NEUTRAL_S
1564 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
1565 /* || bidi_it->ch == LINESEP_CHAR */
1566 || (bidi_it->orig_type == NEUTRAL_WS
1567 && (bidi_it->next_for_ws.type == NEUTRAL_B
1568 || bidi_it->next_for_ws.type == NEUTRAL_S)))
1569 level = bidi_it->level_stack[0].level;
1570 else if ((level & 1) == 0) /* I1 */
1571 {
1572 if (type == STRONG_R)
1573 level++;
1574 else if (type == WEAK_EN || type == WEAK_AN)
1575 level += 2;
1576 }
1577 else /* I2 */
1578 {
1579 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
1580 level++;
1581 }
1582
1583 bidi_it->resolved_level = level;
1584 return level;
1585 }
1586
1587 /* Move to the other edge of a level given by LEVEL. If END_FLAG is
1588 non-zero, we are at the end of a level, and we need to prepare to
1589 resume the scan of the lower level.
1590
1591 If this level's other edge is cached, we simply jump to it, filling
1592 the iterator structure with the iterator state on the other edge.
1593 Otherwise, we walk the buffer until we come back to the same level
1594 as LEVEL.
1595
1596 Note: we are not talking here about a ``level run'' in the UAX#9
1597 sense of the term, but rather about a ``level'' which includes
1598 all the levels higher than it. In other words, given the levels
1599 like this:
1600
1601 11111112222222333333334443343222222111111112223322111
1602 A B C
1603
1604 and assuming we are at point A scanning left to right, this
1605 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
1606 at point B. */
1607 static void
1608 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
1609 {
1610 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
1611 int idx;
1612
1613 /* Try the cache first. */
1614 if ((idx = bidi_cache_find_level_change (level, dir, end_flag)) >= 0)
1615 bidi_cache_fetch_state (idx, bidi_it);
1616 else
1617 {
1618 int new_level;
1619
1620 if (end_flag)
1621 abort (); /* if we are at end of level, its edges must be cached */
1622
1623 bidi_cache_iterator_state (bidi_it, 1);
1624 do {
1625 new_level = bidi_level_of_next_char (bidi_it);
1626 bidi_cache_iterator_state (bidi_it, 1);
1627 } while (new_level >= level);
1628 }
1629 }
1630
1631 void
1632 bidi_move_to_visually_next (struct bidi_it *bidi_it)
1633 {
1634 int old_level, new_level, next_level;
1635 struct bidi_it sentinel;
1636
1637 if (bidi_it->scan_dir == 0)
1638 {
1639 bidi_it->scan_dir = 1; /* default to logical order */
1640 }
1641
1642 /* If we just passed a newline, initialize for the next line. */
1643 if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
1644 bidi_line_init (bidi_it);
1645
1646 /* Prepare the sentinel iterator state, and cache it. When we bump
1647 into it, scanning backwards, we'll know that the last non-base
1648 level is exhausted. */
1649 if (bidi_cache_idx == 0)
1650 {
1651 bidi_copy_it (&sentinel, bidi_it);
1652 if (bidi_it->first_elt)
1653 {
1654 sentinel.charpos--; /* cached charpos needs to be monotonic */
1655 sentinel.bytepos--;
1656 sentinel.ch = '\n'; /* doesn't matter, but why not? */
1657 sentinel.ch_len = 1;
1658 }
1659 bidi_cache_iterator_state (&sentinel, 1);
1660 }
1661
1662 old_level = bidi_it->resolved_level;
1663 new_level = bidi_level_of_next_char (bidi_it);
1664
1665 /* Reordering of resolved levels (clause L2) is implemented by
1666 jumping to the other edge of the level and flipping direction of
1667 scanning the text whenever we find a level change. */
1668 if (new_level != old_level)
1669 {
1670 int ascending = new_level > old_level;
1671 int level_to_search = ascending ? old_level + 1 : old_level;
1672 int incr = ascending ? 1 : -1;
1673 int expected_next_level = old_level + incr;
1674
1675 /* Jump (or walk) to the other edge of this level. */
1676 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1677 /* Switch scan direction and peek at the next character in the
1678 new direction. */
1679 bidi_it->scan_dir = -bidi_it->scan_dir;
1680
1681 /* The following loop handles the case where the resolved level
1682 jumps by more than one. This is typical for numbers inside a
1683 run of text with left-to-right embedding direction, but can
1684 also happen in other situations. In those cases the decision
1685 where to continue after a level change, and in what direction,
1686 is tricky. For example, given a text like below:
1687
1688 abcdefgh
1689 11336622
1690
1691 (where the numbers below the text show the resolved levels),
1692 the result of reordering according to UAX#9 should be this:
1693
1694 efdcghba
1695
1696 This is implemented by the loop below which flips direction
1697 and jumps to the other edge of the level each time it finds
1698 the new level not to be the expected one. The expected level
1699 is always one more or one less than the previous one. */
1700 next_level = bidi_peek_at_next_level (bidi_it);
1701 while (next_level != expected_next_level)
1702 {
1703 expected_next_level += incr;
1704 level_to_search += incr;
1705 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1706 bidi_it->scan_dir = -bidi_it->scan_dir;
1707 next_level = bidi_peek_at_next_level (bidi_it);
1708 }
1709
1710 /* Finally, deliver the next character in the new direction. */
1711 next_level = bidi_level_of_next_char (bidi_it);
1712 }
1713
1714 /* Take note when we have just processed the newline that precedes
1715 the end of the paragraph. The next time we are about to be
1716 called, set_iterator_to_next will automatically reinit the
1717 paragraph direction, if needed. We do this at the newline before
1718 the paragraph separator, because the next character might not be
1719 the first character of the next paragraph, due to the bidi
1720 reordering, whereas we _must_ know the paragraph base direction
1721 _before_ we process the paragraph's text, since the base
1722 direction affects the reordering. */
1723 if (bidi_it->scan_dir == 1
1724 && bidi_it->orig_type == NEUTRAL_B
1725 && bidi_it->bytepos < ZV_BYTE)
1726 {
1727 EMACS_INT sep_len =
1728 bidi_at_paragraph_end (bidi_it->charpos + 1,
1729 bidi_it->bytepos + bidi_it->ch_len);
1730 if (sep_len >= 0)
1731 {
1732 bidi_it->new_paragraph = 1;
1733 /* Record the buffer position of the last character of the
1734 paragraph separator. */
1735 bidi_it->separator_limit = bidi_it->charpos + 1 + sep_len;
1736 }
1737 }
1738
1739 if (bidi_it->scan_dir == 1 && bidi_cache_idx)
1740 {
1741 /* If we are at paragraph's base embedding level and beyond the
1742 last cached position, the cache's job is done and we can
1743 discard it. */
1744 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
1745 && bidi_it->charpos > bidi_cache[bidi_cache_idx - 1].charpos)
1746 bidi_cache_reset ();
1747 /* But as long as we are caching during forward scan, we must
1748 cache each state, or else the cache integrity will be
1749 compromised: it assumes cached states correspond to buffer
1750 positions 1:1. */
1751 else
1752 bidi_cache_iterator_state (bidi_it, 1);
1753 }
1754 }
1755
1756 /* This is meant to be called from within the debugger, whenever you
1757 wish to examine the cache contents. */
1758 void
1759 bidi_dump_cached_states (void)
1760 {
1761 int i;
1762 int ndigits = 1;
1763
1764 if (bidi_cache_idx == 0)
1765 {
1766 fprintf (stderr, "The cache is empty.\n");
1767 return;
1768 }
1769 fprintf (stderr, "Total of %d state%s in cache:\n",
1770 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
1771
1772 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
1773 ndigits++;
1774 fputs ("ch ", stderr);
1775 for (i = 0; i < bidi_cache_idx; i++)
1776 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
1777 fputs ("\n", stderr);
1778 fputs ("lvl ", stderr);
1779 for (i = 0; i < bidi_cache_idx; i++)
1780 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
1781 fputs ("\n", stderr);
1782 fputs ("pos ", stderr);
1783 for (i = 0; i < bidi_cache_idx; i++)
1784 fprintf (stderr, "%*d", ndigits, bidi_cache[i].charpos);
1785 fputs ("\n", stderr);
1786 }