fee97ae0c8e3c7c8069631b51fb1d5d5a8984ac6
[bpt/emacs.git] / src / bidi.c
1 /* Low-level bidirectional buffer-scanning functions for GNU Emacs.
2 Copyright (C) 2000, 2001, 2004, 2005, 2009, 2010
3 Free Software Foundation, Inc.
4
5 This file is part of GNU Emacs.
6
7 GNU Emacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
19
20 /* Written by Eli Zaretskii <eliz@gnu.org>.
21
22 A sequential implementation of the Unicode Bidirectional algorithm,
23 as per UAX#9, a part of the Unicode Standard.
24
25 Unlike the reference and most other implementations, this one is
26 designed to be called once for every character in the buffer or
27 string.
28
29 The main entry point is bidi_get_next_char_visually. Each time it
30 is called, it finds the next character in the visual order, and
31 returns its information in a special structure. The caller is then
32 expected to process this character for display or any other
33 purposes, and call bidi_get_next_char_visually for the next
34 character. See the comments in bidi_get_next_char_visually for
35 more details about its algorithm that finds the next visual-order
36 character by resolving their levels on the fly.
37
38 The two other entry points are bidi_paragraph_init and
39 bidi_mirror_char. The first determines the base direction of a
40 paragraph, while the second returns the mirrored version of its
41 argument character.
42
43 If you want to understand the code, you will have to read it
44 together with the relevant portions of UAX#9. The comments include
45 references to UAX#9 rules, for that very reason.
46
47 A note about references to UAX#9 rules: if the reference says
48 something like "X9/Retaining", it means that you need to refer to
49 rule X9 and to its modifications decribed in the "Implementation
50 Notes" section of UAX#9, under "Retaining Format Codes". */
51
52 #ifdef HAVE_CONFIG_H
53 #include <config.h>
54 #endif
55
56 #include <stdio.h>
57
58 #ifdef HAVE_STRING_H
59 #include <string.h>
60 #endif
61
62 #include <setjmp.h>
63
64 #include "lisp.h"
65 #include "buffer.h"
66 #include "character.h"
67 #include "dispextern.h"
68
69 static int bidi_initialized = 0;
70
71 static Lisp_Object bidi_type_table;
72
73 /* FIXME: Remove these when bidi_explicit_dir_char uses a lookup table. */
74 #define LRM_CHAR 0x200E
75 #define RLM_CHAR 0x200F
76 #define LRE_CHAR 0x202A
77 #define RLE_CHAR 0x202B
78 #define PDF_CHAR 0x202C
79 #define LRO_CHAR 0x202D
80 #define RLO_CHAR 0x202E
81
82 #define BIDI_EOB -1
83 #define BIDI_BOB -2 /* FIXME: Is this needed? */
84
85 /* Local data structures. (Look in dispextern.h for the rest.) */
86
87 /* What we need to know about the current paragraph. */
88 struct bidi_paragraph_info {
89 int start_bytepos; /* byte position where it begins */
90 int end_bytepos; /* byte position where it ends */
91 int embedding_level; /* its basic embedding level */
92 bidi_dir_t base_dir; /* its base direction */
93 };
94
95 /* Data type for describing the bidirectional character categories. */
96 typedef enum {
97 UNKNOWN_BC,
98 NEUTRAL,
99 WEAK,
100 STRONG
101 } bidi_category_t;
102
103 int bidi_ignore_explicit_marks_for_paragraph_level = 1;
104
105 static Lisp_Object fallback_paragraph_start_re, fallback_paragraph_separate_re;
106 static Lisp_Object Qparagraph_start, Qparagraph_separate;
107
108 static void
109 bidi_initialize ()
110 {
111 /* FIXME: This should come from the Unicode Database. */
112 struct {
113 int from, to;
114 bidi_type_t type;
115 } bidi_type[] =
116 { { 0x0000, 0x0008, WEAK_BN },
117 { 0x0009, 0x0000, NEUTRAL_S },
118 { 0x000A, 0x0000, NEUTRAL_B },
119 { 0x000B, 0x0000, NEUTRAL_S },
120 { 0x000C, 0x0000, NEUTRAL_WS },
121 { 0x000D, 0x0000, NEUTRAL_B },
122 { 0x000E, 0x001B, WEAK_BN },
123 { 0x001C, 0x001E, NEUTRAL_B },
124 { 0x001F, 0x0000, NEUTRAL_S },
125 { 0x0020, 0x0000, NEUTRAL_WS },
126 { 0x0021, 0x0022, NEUTRAL_ON },
127 { 0x0023, 0x0025, WEAK_ET },
128 { 0x0026, 0x002A, NEUTRAL_ON },
129 { 0x002B, 0x0000, WEAK_ES },
130 { 0x002C, 0x0000, WEAK_CS },
131 { 0x002D, 0x0000, WEAK_ES },
132 { 0x002E, 0x002F, WEAK_CS },
133 { 0x0030, 0x0039, WEAK_EN },
134 { 0x003A, 0x0000, WEAK_CS },
135 { 0x003B, 0x0040, NEUTRAL_ON },
136 { 0x005B, 0x0060, NEUTRAL_ON },
137 { 0x007B, 0x007E, NEUTRAL_ON },
138 { 0x007F, 0x0084, WEAK_BN },
139 { 0x0085, 0x0000, NEUTRAL_B },
140 { 0x0086, 0x009F, WEAK_BN },
141 { 0x00A0, 0x0000, WEAK_CS },
142 { 0x00A1, 0x0000, NEUTRAL_ON },
143 { 0x00A2, 0x00A5, WEAK_ET },
144 { 0x00A6, 0x00A9, NEUTRAL_ON },
145 { 0x00AB, 0x00AC, NEUTRAL_ON },
146 { 0x00AD, 0x0000, WEAK_BN },
147 { 0x00AE, 0x00Af, NEUTRAL_ON },
148 { 0x00B0, 0x00B1, WEAK_ET },
149 { 0x00B2, 0x00B3, WEAK_EN },
150 { 0x00B4, 0x0000, NEUTRAL_ON },
151 { 0x00B6, 0x00B8, NEUTRAL_ON },
152 { 0x00B9, 0x0000, WEAK_EN },
153 { 0x00BB, 0x00BF, NEUTRAL_ON },
154 { 0x00D7, 0x0000, NEUTRAL_ON },
155 { 0x00F7, 0x0000, NEUTRAL_ON },
156 { 0x02B9, 0x02BA, NEUTRAL_ON },
157 { 0x02C2, 0x02CF, NEUTRAL_ON },
158 { 0x02D2, 0x02DF, NEUTRAL_ON },
159 { 0x02E5, 0x02ED, NEUTRAL_ON },
160 { 0x0300, 0x036F, WEAK_NSM },
161 { 0x0374, 0x0375, NEUTRAL_ON },
162 { 0x037E, 0x0385, NEUTRAL_ON },
163 { 0x0387, 0x0000, NEUTRAL_ON },
164 { 0x03F6, 0x0000, NEUTRAL_ON },
165 { 0x0483, 0x0489, WEAK_NSM },
166 { 0x058A, 0x0000, NEUTRAL_ON },
167 { 0x0591, 0x05BD, WEAK_NSM },
168 { 0x05BE, 0x0000, STRONG_R },
169 { 0x05BF, 0x0000, WEAK_NSM },
170 { 0x05C0, 0x0000, STRONG_R },
171 { 0x05C1, 0x05C2, WEAK_NSM },
172 { 0x05C3, 0x0000, STRONG_R },
173 { 0x05C4, 0x05C5, WEAK_NSM },
174 { 0x05C6, 0x0000, STRONG_R },
175 { 0x05C7, 0x0000, WEAK_NSM },
176 { 0x05D0, 0x05F4, STRONG_R },
177 { 0x060C, 0x0000, WEAK_CS },
178 { 0x061B, 0x064A, STRONG_AL },
179 { 0x064B, 0x0655, WEAK_NSM },
180 { 0x0660, 0x0669, WEAK_AN },
181 { 0x066A, 0x0000, WEAK_ET },
182 { 0x066B, 0x066C, WEAK_AN },
183 { 0x066D, 0x066F, STRONG_AL },
184 { 0x0670, 0x0000, WEAK_NSM },
185 { 0x0671, 0x06D5, STRONG_AL },
186 { 0x06D6, 0x06DC, WEAK_NSM },
187 { 0x06DD, 0x0000, STRONG_AL },
188 { 0x06DE, 0x06E4, WEAK_NSM },
189 { 0x06E5, 0x06E6, STRONG_AL },
190 { 0x06E7, 0x06E8, WEAK_NSM },
191 { 0x06E9, 0x0000, NEUTRAL_ON },
192 { 0x06EA, 0x06ED, WEAK_NSM },
193 { 0x06F0, 0x06F9, WEAK_EN },
194 { 0x06FA, 0x070D, STRONG_AL },
195 { 0x070F, 0x0000, WEAK_BN },
196 { 0x0710, 0x0000, STRONG_AL },
197 { 0x0711, 0x0000, WEAK_NSM },
198 { 0x0712, 0x072C, STRONG_AL },
199 { 0x0730, 0x074A, WEAK_NSM },
200 { 0x0780, 0x07A5, STRONG_AL },
201 { 0x07A6, 0x07B0, WEAK_NSM },
202 { 0x07B1, 0x0000, STRONG_AL },
203 { 0x0901, 0x0902, WEAK_NSM },
204 { 0x093C, 0x0000, WEAK_NSM },
205 { 0x0941, 0x0948, WEAK_NSM },
206 { 0x094D, 0x0000, WEAK_NSM },
207 { 0x0951, 0x0954, WEAK_NSM },
208 { 0x0962, 0x0963, WEAK_NSM },
209 { 0x0981, 0x0000, WEAK_NSM },
210 { 0x09BC, 0x0000, WEAK_NSM },
211 { 0x09C1, 0x09C4, WEAK_NSM },
212 { 0x09CD, 0x0000, WEAK_NSM },
213 { 0x09E2, 0x09E3, WEAK_NSM },
214 { 0x09F2, 0x09F3, WEAK_ET },
215 { 0x0A02, 0x0000, WEAK_NSM },
216 { 0x0A3C, 0x0000, WEAK_NSM },
217 { 0x0A41, 0x0A4D, WEAK_NSM },
218 { 0x0A70, 0x0A71, WEAK_NSM },
219 { 0x0A81, 0x0A82, WEAK_NSM },
220 { 0x0ABC, 0x0000, WEAK_NSM },
221 { 0x0AC1, 0x0AC8, WEAK_NSM },
222 { 0x0ACD, 0x0000, WEAK_NSM },
223 { 0x0B01, 0x0000, WEAK_NSM },
224 { 0x0B3C, 0x0000, WEAK_NSM },
225 { 0x0B3F, 0x0000, WEAK_NSM },
226 { 0x0B41, 0x0B43, WEAK_NSM },
227 { 0x0B4D, 0x0B56, WEAK_NSM },
228 { 0x0B82, 0x0000, WEAK_NSM },
229 { 0x0BC0, 0x0000, WEAK_NSM },
230 { 0x0BCD, 0x0000, WEAK_NSM },
231 { 0x0C3E, 0x0C40, WEAK_NSM },
232 { 0x0C46, 0x0C56, WEAK_NSM },
233 { 0x0CBF, 0x0000, WEAK_NSM },
234 { 0x0CC6, 0x0000, WEAK_NSM },
235 { 0x0CCC, 0x0CCD, WEAK_NSM },
236 { 0x0D41, 0x0D43, WEAK_NSM },
237 { 0x0D4D, 0x0000, WEAK_NSM },
238 { 0x0DCA, 0x0000, WEAK_NSM },
239 { 0x0DD2, 0x0DD6, WEAK_NSM },
240 { 0x0E31, 0x0000, WEAK_NSM },
241 { 0x0E34, 0x0E3A, WEAK_NSM },
242 { 0x0E3F, 0x0000, WEAK_ET },
243 { 0x0E47, 0x0E4E, WEAK_NSM },
244 { 0x0EB1, 0x0000, WEAK_NSM },
245 { 0x0EB4, 0x0EBC, WEAK_NSM },
246 { 0x0EC8, 0x0ECD, WEAK_NSM },
247 { 0x0F18, 0x0F19, WEAK_NSM },
248 { 0x0F35, 0x0000, WEAK_NSM },
249 { 0x0F37, 0x0000, WEAK_NSM },
250 { 0x0F39, 0x0000, WEAK_NSM },
251 { 0x0F3A, 0x0F3D, NEUTRAL_ON },
252 { 0x0F71, 0x0F7E, WEAK_NSM },
253 { 0x0F80, 0x0F84, WEAK_NSM },
254 { 0x0F86, 0x0F87, WEAK_NSM },
255 { 0x0F90, 0x0FBC, WEAK_NSM },
256 { 0x0FC6, 0x0000, WEAK_NSM },
257 { 0x102D, 0x1030, WEAK_NSM },
258 { 0x1032, 0x1037, WEAK_NSM },
259 { 0x1039, 0x0000, WEAK_NSM },
260 { 0x1058, 0x1059, WEAK_NSM },
261 { 0x1680, 0x0000, NEUTRAL_WS },
262 { 0x169B, 0x169C, NEUTRAL_ON },
263 { 0x1712, 0x1714, WEAK_NSM },
264 { 0x1732, 0x1734, WEAK_NSM },
265 { 0x1752, 0x1753, WEAK_NSM },
266 { 0x1772, 0x1773, WEAK_NSM },
267 { 0x17B7, 0x17BD, WEAK_NSM },
268 { 0x17C6, 0x0000, WEAK_NSM },
269 { 0x17C9, 0x17D3, WEAK_NSM },
270 { 0x17DB, 0x0000, WEAK_ET },
271 { 0x1800, 0x180A, NEUTRAL_ON },
272 { 0x180B, 0x180D, WEAK_NSM },
273 { 0x180E, 0x0000, WEAK_BN },
274 { 0x18A9, 0x0000, WEAK_NSM },
275 { 0x1FBD, 0x0000, NEUTRAL_ON },
276 { 0x1FBF, 0x1FC1, NEUTRAL_ON },
277 { 0x1FCD, 0x1FCF, NEUTRAL_ON },
278 { 0x1FDD, 0x1FDF, NEUTRAL_ON },
279 { 0x1FED, 0x1FEF, NEUTRAL_ON },
280 { 0x1FFD, 0x1FFE, NEUTRAL_ON },
281 { 0x2000, 0x200A, NEUTRAL_WS },
282 { 0x200B, 0x200D, WEAK_BN },
283 { 0x200F, 0x0000, STRONG_R },
284 { 0x2010, 0x2027, NEUTRAL_ON },
285 { 0x2028, 0x0000, NEUTRAL_WS },
286 { 0x2029, 0x0000, NEUTRAL_B },
287 { 0x202A, 0x0000, LRE },
288 { 0x202B, 0x0000, RLE },
289 { 0x202C, 0x0000, PDF },
290 { 0x202D, 0x0000, LRO },
291 { 0x202E, 0x0000, RLO },
292 { 0x202F, 0x0000, NEUTRAL_WS },
293 { 0x2030, 0x2034, WEAK_ET },
294 { 0x2035, 0x2057, NEUTRAL_ON },
295 { 0x205F, 0x0000, NEUTRAL_WS },
296 { 0x2060, 0x206F, WEAK_BN },
297 { 0x2070, 0x0000, WEAK_EN },
298 { 0x2074, 0x2079, WEAK_EN },
299 { 0x207A, 0x207B, WEAK_ET },
300 { 0x207C, 0x207E, NEUTRAL_ON },
301 { 0x2080, 0x2089, WEAK_EN },
302 { 0x208A, 0x208B, WEAK_ET },
303 { 0x208C, 0x208E, NEUTRAL_ON },
304 { 0x20A0, 0x20B1, WEAK_ET },
305 { 0x20D0, 0x20EA, WEAK_NSM },
306 { 0x2100, 0x2101, NEUTRAL_ON },
307 { 0x2103, 0x2106, NEUTRAL_ON },
308 { 0x2108, 0x2109, NEUTRAL_ON },
309 { 0x2114, 0x0000, NEUTRAL_ON },
310 { 0x2116, 0x2118, NEUTRAL_ON },
311 { 0x211E, 0x2123, NEUTRAL_ON },
312 { 0x2125, 0x0000, NEUTRAL_ON },
313 { 0x2127, 0x0000, NEUTRAL_ON },
314 { 0x2129, 0x0000, NEUTRAL_ON },
315 { 0x212E, 0x0000, WEAK_ET },
316 { 0x2132, 0x0000, NEUTRAL_ON },
317 { 0x213A, 0x0000, NEUTRAL_ON },
318 { 0x2140, 0x2144, NEUTRAL_ON },
319 { 0x214A, 0x215F, NEUTRAL_ON },
320 { 0x2190, 0x2211, NEUTRAL_ON },
321 { 0x2212, 0x2213, WEAK_ET },
322 { 0x2214, 0x2335, NEUTRAL_ON },
323 { 0x237B, 0x2394, NEUTRAL_ON },
324 { 0x2396, 0x244A, NEUTRAL_ON },
325 { 0x2460, 0x249B, WEAK_EN },
326 { 0x24EA, 0x0000, WEAK_EN },
327 { 0x24EB, 0x2FFB, NEUTRAL_ON },
328 { 0x3000, 0x0000, NEUTRAL_WS },
329 { 0x3001, 0x3004, NEUTRAL_ON },
330 { 0x3008, 0x3020, NEUTRAL_ON },
331 { 0x302A, 0x302F, WEAK_NSM },
332 { 0x3030, 0x0000, NEUTRAL_ON },
333 { 0x3036, 0x3037, NEUTRAL_ON },
334 { 0x303D, 0x303F, NEUTRAL_ON },
335 { 0x3099, 0x309A, WEAK_NSM },
336 { 0x309B, 0x309C, NEUTRAL_ON },
337 { 0x30A0, 0x0000, NEUTRAL_ON },
338 { 0x30FB, 0x0000, NEUTRAL_ON },
339 { 0x3251, 0x325F, NEUTRAL_ON },
340 { 0x32B1, 0x32BF, NEUTRAL_ON },
341 { 0xA490, 0xA4C6, NEUTRAL_ON },
342 { 0xFB1D, 0x0000, STRONG_R },
343 { 0xFB1E, 0x0000, WEAK_NSM },
344 { 0xFB1F, 0xFB28, STRONG_R },
345 { 0xFB29, 0x0000, WEAK_ET },
346 { 0xFB2A, 0xFB4F, STRONG_R },
347 { 0xFB50, 0xFD3D, STRONG_AL },
348 { 0xFD3E, 0xFD3F, NEUTRAL_ON },
349 { 0xFD50, 0xFDFC, STRONG_AL },
350 { 0xFE00, 0xFE23, WEAK_NSM },
351 { 0xFE30, 0xFE4F, NEUTRAL_ON },
352 { 0xFE50, 0x0000, WEAK_CS },
353 { 0xFE51, 0x0000, NEUTRAL_ON },
354 { 0xFE52, 0x0000, WEAK_CS },
355 { 0xFE54, 0x0000, NEUTRAL_ON },
356 { 0xFE55, 0x0000, WEAK_CS },
357 { 0xFE56, 0xFE5E, NEUTRAL_ON },
358 { 0xFE5F, 0x0000, WEAK_ET },
359 { 0xFE60, 0xFE61, NEUTRAL_ON },
360 { 0xFE62, 0xFE63, WEAK_ET },
361 { 0xFE64, 0xFE68, NEUTRAL_ON },
362 { 0xFE69, 0xFE6A, WEAK_ET },
363 { 0xFE6B, 0x0000, NEUTRAL_ON },
364 { 0xFE70, 0xFEFC, STRONG_AL },
365 { 0xFEFF, 0x0000, WEAK_BN },
366 { 0xFF01, 0xFF02, NEUTRAL_ON },
367 { 0xFF03, 0xFF05, WEAK_ET },
368 { 0xFF06, 0xFF0A, NEUTRAL_ON },
369 { 0xFF0B, 0x0000, WEAK_ET },
370 { 0xFF0C, 0x0000, WEAK_CS },
371 { 0xFF0D, 0x0000, WEAK_ET },
372 { 0xFF0E, 0x0000, WEAK_CS },
373 { 0xFF0F, 0x0000, WEAK_ES },
374 { 0xFF10, 0xFF19, WEAK_EN },
375 { 0xFF1A, 0x0000, WEAK_CS },
376 { 0xFF1B, 0xFF20, NEUTRAL_ON },
377 { 0xFF3B, 0xFF40, NEUTRAL_ON },
378 { 0xFF5B, 0xFF65, NEUTRAL_ON },
379 { 0xFFE0, 0xFFE1, WEAK_ET },
380 { 0xFFE2, 0xFFE4, NEUTRAL_ON },
381 { 0xFFE5, 0xFFE6, WEAK_ET },
382 { 0xFFE8, 0xFFEE, NEUTRAL_ON },
383 { 0xFFF9, 0xFFFB, WEAK_BN },
384 { 0xFFFC, 0xFFFD, NEUTRAL_ON },
385 { 0x1D167, 0x1D169, WEAK_NSM },
386 { 0x1D173, 0x1D17A, WEAK_BN },
387 { 0x1D17B, 0x1D182, WEAK_NSM },
388 { 0x1D185, 0x1D18B, WEAK_NSM },
389 { 0x1D1AA, 0x1D1AD, WEAK_NSM },
390 { 0x1D7CE, 0x1D7FF, WEAK_EN },
391 { 0xE0001, 0xE007F, WEAK_BN } };
392 int i;
393
394 bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L));
395 staticpro (&bidi_type_table);
396
397 for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++)
398 char_table_set_range (bidi_type_table, bidi_type[i].from,
399 bidi_type[i].to ? bidi_type[i].to : bidi_type[i].from,
400 make_number (bidi_type[i].type));
401
402 fallback_paragraph_start_re =
403 Fsymbol_value (Fintern_soft (build_string ("paragraph-start"), Qnil));
404 if (!STRINGP (fallback_paragraph_start_re))
405 fallback_paragraph_start_re = build_string ("\f\\|[ \t]*$");
406 staticpro (&fallback_paragraph_start_re);
407 Qparagraph_start = intern ("paragraph-start");
408 staticpro (&Qparagraph_start);
409 fallback_paragraph_separate_re =
410 Fsymbol_value (Fintern_soft (build_string ("paragraph-separate"), Qnil));
411 if (!STRINGP (fallback_paragraph_separate_re))
412 fallback_paragraph_separate_re = build_string ("[ \t\f]*$");
413 staticpro (&fallback_paragraph_separate_re);
414 Qparagraph_separate = intern ("paragraph-separate");
415 staticpro (&Qparagraph_separate);
416 bidi_initialized = 1;
417 }
418
419 /* Return the bidi type of a character CH, subject to the current
420 directional OVERRIDE. */
421 static INLINE bidi_type_t
422 bidi_get_type (int ch, bidi_dir_t override)
423 {
424 bidi_type_t default_type;
425
426 if (ch == BIDI_EOB)
427 return NEUTRAL_B;
428 if (ch < 0 || ch > MAX_CHAR)
429 abort ();
430
431 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
432
433 if (override == NEUTRAL_DIR)
434 return default_type;
435
436 switch (default_type)
437 {
438 /* Although UAX#9 does not tell, it doesn't make sense to
439 override NEUTRAL_B and LRM/RLM characters. */
440 case NEUTRAL_B:
441 case LRE:
442 case LRO:
443 case RLE:
444 case RLO:
445 case PDF:
446 return default_type;
447 default:
448 switch (ch)
449 {
450 case LRM_CHAR:
451 case RLM_CHAR:
452 return default_type;
453 default:
454 if (override == L2R) /* X6 */
455 return STRONG_L;
456 else if (override == R2L)
457 return STRONG_R;
458 else
459 abort (); /* can't happen: handled above */
460 }
461 }
462 }
463
464 void
465 bidi_check_type (bidi_type_t type)
466 {
467 if (type < UNKNOWN_BT || type > NEUTRAL_ON)
468 abort ();
469 }
470
471 /* Given a bidi TYPE of a character, return its category. */
472 static INLINE bidi_category_t
473 bidi_get_category (bidi_type_t type)
474 {
475 switch (type)
476 {
477 case UNKNOWN_BT:
478 return UNKNOWN_BC;
479 case STRONG_L:
480 case STRONG_R:
481 case STRONG_AL:
482 case LRE:
483 case LRO:
484 case RLE:
485 case RLO:
486 return STRONG;
487 case PDF: /* ??? really?? */
488 case WEAK_EN:
489 case WEAK_ES:
490 case WEAK_ET:
491 case WEAK_AN:
492 case WEAK_CS:
493 case WEAK_NSM:
494 case WEAK_BN:
495 return WEAK;
496 case NEUTRAL_B:
497 case NEUTRAL_S:
498 case NEUTRAL_WS:
499 case NEUTRAL_ON:
500 return NEUTRAL;
501 default:
502 abort ();
503 }
504 }
505
506 /* Return the mirrored character of C, if any.
507
508 Note: The conditions in UAX#9 clause L4 must be tested by the
509 caller. */
510 /* FIXME: exceedingly temporary! Should consult the Unicode database
511 of character properties. */
512 int
513 bidi_mirror_char (int c)
514 {
515 static const char mirrored_pairs[] = "()<>[]{}";
516 const char *p = c > 0 && c < 128 ? strchr (mirrored_pairs, c) : NULL;
517
518 if (p)
519 {
520 size_t i = p - mirrored_pairs;
521
522 return mirrored_pairs [(i ^ 1)];
523 }
524 return c;
525 }
526
527 /* Copy the bidi iterator from FROM to TO. To save cycles, this only
528 copies the part of the level stack that is actually in use. */
529 static INLINE void
530 bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
531 {
532 int i;
533
534 /* Copy everything except the level stack and beyond. */
535 memcpy (to, from, ((size_t)&((struct bidi_it *)0)->level_stack[0]));
536
537 /* Copy the active part of the level stack. */
538 to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
539 for (i = 1; i <= from->stack_idx; i++)
540 to->level_stack[i] = from->level_stack[i];
541 }
542
543 /* Caching the bidi iterator states. */
544
545 static struct bidi_it bidi_cache[1000]; /* FIXME: make this dynamically allocated! */
546 static int bidi_cache_idx;
547 static int bidi_cache_last_idx;
548
549 static INLINE void
550 bidi_cache_reset (void)
551 {
552 bidi_cache_idx = 0;
553 bidi_cache_last_idx = -1;
554 }
555
556 static INLINE void
557 bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it)
558 {
559 int current_scan_dir = bidi_it->scan_dir;
560
561 if (idx < 0 || idx >= bidi_cache_idx)
562 abort ();
563
564 bidi_copy_it (bidi_it, &bidi_cache[idx]);
565 bidi_it->scan_dir = current_scan_dir;
566 bidi_cache_last_idx = idx;
567 }
568
569 /* Find a cached state with a given CHARPOS and resolved embedding
570 level less or equal to LEVEL. if LEVEL is -1, disregard the
571 resolved levels in cached states. DIR, if non-zero, means search
572 in that direction from the last cache hit. */
573 static INLINE int
574 bidi_cache_search (int charpos, int level, int dir)
575 {
576 int i, i_start;
577
578 if (bidi_cache_idx)
579 {
580 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
581 dir = -1;
582 else if (charpos > bidi_cache[bidi_cache_last_idx].charpos)
583 dir = 1;
584 if (dir)
585 i_start = bidi_cache_last_idx;
586 else
587 {
588 dir = -1;
589 i_start = bidi_cache_idx - 1;
590 }
591
592 if (dir < 0)
593 {
594 /* Linear search for now; FIXME! */
595 for (i = i_start; i >= 0; i--)
596 if (bidi_cache[i].charpos == charpos
597 && (level == -1 || bidi_cache[i].resolved_level <= level))
598 return i;
599 }
600 else
601 {
602 for (i = i_start; i < bidi_cache_idx; i++)
603 if (bidi_cache[i].charpos == charpos
604 && (level == -1 || bidi_cache[i].resolved_level <= level))
605 return i;
606 }
607 }
608
609 return -1;
610 }
611
612 /* Find a cached state where the resolved level changes to a value
613 that is lower than LEVEL, and return its cache slot index. DIR is
614 the direction to search, starting with the last used cache slot.
615 BEFORE, if non-zero, means return the index of the slot that is
616 ``before'' the level change in the search direction. That is,
617 given the cached levels like this:
618
619 1122333442211
620 AB C
621
622 and assuming we are at the position cached at the slot marked with
623 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
624 index of slot B or A, depending whether BEFORE is, respectively,
625 non-zero or zero. */
626 static int
627 bidi_cache_find_level_change (int level, int dir, int before)
628 {
629 if (bidi_cache_idx)
630 {
631 int i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
632 int incr = before ? 1 : 0;
633
634 if (!dir)
635 dir = -1;
636 else if (!incr)
637 i += dir;
638
639 if (dir < 0)
640 {
641 while (i >= incr)
642 {
643 if (bidi_cache[i - incr].resolved_level >= 0
644 && bidi_cache[i - incr].resolved_level < level)
645 return i;
646 i--;
647 }
648 }
649 else
650 {
651 while (i < bidi_cache_idx - incr)
652 {
653 if (bidi_cache[i + incr].resolved_level >= 0
654 && bidi_cache[i + incr].resolved_level < level)
655 return i;
656 i++;
657 }
658 }
659 }
660
661 return -1;
662 }
663
664 static INLINE void
665 bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
666 {
667 int idx;
668
669 /* We should never cache on backward scans. */
670 if (bidi_it->scan_dir == -1)
671 abort ();
672 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
673
674 if (idx < 0)
675 {
676 idx = bidi_cache_idx;
677 /* Don't overrun the cache limit. */
678 if (idx > sizeof (bidi_cache) / sizeof (bidi_cache[0]) - 1)
679 abort ();
680 /* Character positions should correspond to cache positions 1:1.
681 If we are outside the range of cached positions, the cache is
682 useless and must be reset. */
683 if (idx > 0 &&
684 (bidi_it->charpos > bidi_cache[idx - 1].charpos + 1
685 || bidi_it->charpos < bidi_cache[0].charpos))
686 {
687 bidi_cache_reset ();
688 idx = 0;
689 }
690 bidi_copy_it (&bidi_cache[idx], bidi_it);
691 if (!resolved)
692 bidi_cache[idx].resolved_level = -1;
693 bidi_cache[idx].new_paragraph = 0;
694 }
695 else
696 {
697 /* Copy only the members which could have changed, to avoid
698 costly copying of the entire struct. */
699 bidi_cache[idx].type = bidi_it->type;
700 bidi_check_type (bidi_it->type);
701 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
702 bidi_check_type (bidi_it->type_after_w1);
703 if (resolved)
704 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
705 else
706 bidi_cache[idx].resolved_level = -1;
707 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
708 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
709 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
710 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
711 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
712 }
713
714 bidi_cache_last_idx = idx;
715 if (idx >= bidi_cache_idx)
716 bidi_cache_idx = idx + 1;
717 }
718
719 static INLINE bidi_type_t
720 bidi_cache_find (int charpos, int level, struct bidi_it *bidi_it)
721 {
722 int i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
723
724 if (i >= 0)
725 {
726 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
727
728 bidi_copy_it (bidi_it, &bidi_cache[i]);
729 bidi_cache_last_idx = i;
730 /* Don't let scan direction from from the cached state override
731 the current scan direction. */
732 bidi_it->scan_dir = current_scan_dir;
733 return bidi_it->type;
734 }
735
736 return UNKNOWN_BT;
737 }
738
739 static INLINE int
740 bidi_peek_at_next_level (struct bidi_it *bidi_it)
741 {
742 if (bidi_cache_idx == 0 || bidi_cache_last_idx == -1)
743 abort ();
744 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
745 }
746
747 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
748 Value is the non-negative length of the paragraph separator
749 following the buffer position, -1 if position is at the beginning
750 of a new paragraph, or -2 if position is neither at beginning nor
751 at end of a paragraph. */
752 static EMACS_INT
753 bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
754 {
755 Lisp_Object sep_re = Fbuffer_local_value (Qparagraph_separate,
756 Fcurrent_buffer ());
757 Lisp_Object start_re = Fbuffer_local_value (Qparagraph_start,
758 Fcurrent_buffer ());
759 EMACS_INT val;
760
761 if (!STRINGP (sep_re))
762 sep_re = fallback_paragraph_separate_re;
763 if (!STRINGP (start_re))
764 start_re = fallback_paragraph_start_re;
765
766 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
767 if (val < 0)
768 {
769 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
770 val = -1;
771 else
772 val = -2;
773 }
774
775 return val;
776 }
777
778 /* Determine the start-of-run (sor) directional type given the two
779 embedding levels on either side of the run boundary. Also, update
780 the saved info about previously seen characters, since that info is
781 generally valid for a single level run. */
782 static INLINE void
783 bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
784 {
785 int higher_level = level_before > level_after ? level_before : level_after;
786
787 /* The prev_was_pdf gork is required for when we have several PDFs
788 in a row. In that case, we want to compute the sor type for the
789 next level run only once: when we see the first PDF. That's
790 because the sor type depends only on the higher of the two levels
791 that we find on the two sides of the level boundary (see UAX#9,
792 clause X10), and so we don't need to know the final embedding
793 level to which we descend after processing all the PDFs. */
794 if (!bidi_it->prev_was_pdf || level_before < level_after)
795 /* FIXME: should the default sor direction be user selectable? */
796 bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
797 if (level_before > level_after)
798 bidi_it->prev_was_pdf = 1;
799
800 bidi_it->prev.type = UNKNOWN_BT;
801 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
802 bidi_it->last_strong.orig_type = UNKNOWN_BT;
803 bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
804 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
805 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
806 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
807 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
808 bidi_it->ignore_bn_limit = 0; /* meaning it's unknown */
809 }
810
811 static void
812 bidi_line_init (struct bidi_it *bidi_it)
813 {
814 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
815 bidi_it->resolved_level = bidi_it->level_stack[0].level;
816 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
817 bidi_it->invalid_levels = 0;
818 bidi_it->invalid_rl_levels = -1;
819 bidi_it->next_en_pos = -1;
820 bidi_it->next_for_ws.type = UNKNOWN_BT;
821 bidi_set_sor_type (bidi_it,
822 bidi_it->paragraph_dir == R2L ? 1 : 0,
823 bidi_it->level_stack[0].level); /* X10 */
824
825 bidi_cache_reset ();
826 }
827
828 /* Find the beginning of this paragraph by looking back in the buffer.
829 Value is the byte position of the paragraph's beginning. */
830 static EMACS_INT
831 bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
832 {
833 Lisp_Object re = Fbuffer_local_value (Qparagraph_start, Fcurrent_buffer ());
834 EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
835
836 if (!STRINGP (re))
837 re = fallback_paragraph_start_re;
838 while (pos_byte > BEGV_BYTE
839 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
840 {
841 pos = find_next_newline_no_quit (pos - 1, -1);
842 pos_byte = CHAR_TO_BYTE (pos);
843 }
844 return pos_byte;
845 }
846
847 /* Determine the direction, a.k.a. base embedding level, of the
848 paragraph we are about to iterate through. If DIR is either L2R or
849 R2L, just use that. Otherwise, determine the paragraph direction
850 from the first strong character of the paragraph.
851
852 Note that this gives the paragraph separator the same direction as
853 the preceding paragraph, even though Emacs generally views the
854 separartor as not belonging to any paragraph. */
855 void
856 bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it)
857 {
858 EMACS_INT bytepos = bidi_it->bytepos;
859
860 /* Special case for an empty buffer. */
861 if (bytepos == BEGV_BYTE && bytepos == ZV_BYTE)
862 dir = L2R;
863 /* We should never be called at EOB or before BEGV. */
864 else if (bytepos >= ZV_BYTE || bytepos < BEGV_BYTE)
865 abort ();
866
867 if (dir == L2R)
868 {
869 bidi_it->paragraph_dir = L2R;
870 bidi_it->new_paragraph = 0;
871 }
872 else if (dir == R2L)
873 {
874 bidi_it->paragraph_dir = R2L;
875 bidi_it->new_paragraph = 0;
876 }
877 else if (dir == NEUTRAL_DIR) /* P2 */
878 {
879 int ch, ch_len;
880 EMACS_INT pos;
881 bidi_type_t type;
882
883 /* If we are inside a paragraph separator, we are just waiting
884 for the separator to be exhausted; use the previous paragraph
885 direction. But don't do that if we have been just reseated,
886 because we need to reinitialize below in that case. */
887 if (!bidi_it->first_elt
888 && bidi_it->charpos < bidi_it->separator_limit)
889 return;
890
891 /* If we are on a newline, get past it to where the next
892 paragraph might start. But don't do that at BEGV since then
893 we are potentially in a new paragraph that doesn't yet
894 exist. */
895 pos = bidi_it->charpos;
896 if (bytepos > BEGV_BYTE && FETCH_CHAR (bytepos) == '\n')
897 {
898 bytepos++;
899 pos++;
900 }
901
902 /* We are either at the beginning of a paragraph or in the
903 middle of it. Find where this paragraph starts. */
904 bytepos = bidi_find_paragraph_start (pos, bytepos);
905
906 /* We should always be at the beginning of a new line at this
907 point. */
908 if (!(bytepos == BEGV_BYTE || FETCH_CHAR (bytepos - 1) == '\n'))
909 abort ();
910
911 bidi_it->separator_limit = -1;
912 bidi_it->new_paragraph = 0;
913 ch = FETCH_CHAR (bytepos);
914 ch_len = CHAR_BYTES (ch);
915 pos = BYTE_TO_CHAR (bytepos);
916 type = bidi_get_type (ch, NEUTRAL_DIR);
917
918 for (pos++, bytepos += ch_len;
919 /* NOTE: UAX#9 says to search only for L, AL, or R types of
920 characters, and ignore RLE, RLO, LRE, and LRO. However,
921 I'm not sure it makes sense to omit those 4; should try
922 with and without that to see the effect. */
923 (bidi_get_category (type) != STRONG)
924 || (bidi_ignore_explicit_marks_for_paragraph_level
925 && (type == RLE || type == RLO
926 || type == LRE || type == LRO));
927 type = bidi_get_type (ch, NEUTRAL_DIR))
928 {
929 if (type == NEUTRAL_B && bidi_at_paragraph_end (pos, bytepos) >= -1)
930 break;
931 if (bytepos >= ZV_BYTE)
932 {
933 /* Pretend there's a paragraph separator at end of buffer. */
934 type = NEUTRAL_B;
935 break;
936 }
937 FETCH_CHAR_ADVANCE (ch, pos, bytepos);
938 }
939 if (type == STRONG_R || type == STRONG_AL) /* P3 */
940 bidi_it->paragraph_dir = R2L;
941 else if (type == STRONG_L)
942 bidi_it->paragraph_dir = L2R;
943 }
944 else
945 abort ();
946
947 /* Contrary to UAX#9 clause P3, we only default the paragraph
948 direction to L2R if we have no previous usable paragraph
949 direction. */
950 if (bidi_it->paragraph_dir == NEUTRAL_DIR)
951 bidi_it->paragraph_dir = L2R; /* P3 and ``higher protocols'' */
952 if (bidi_it->paragraph_dir == R2L)
953 bidi_it->level_stack[0].level = 1;
954 else
955 bidi_it->level_stack[0].level = 0;
956
957 bidi_line_init (bidi_it);
958 }
959
960 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
961 end. */
962 static INLINE void
963 bidi_set_paragraph_end (struct bidi_it *bidi_it)
964 {
965 bidi_it->invalid_levels = 0;
966 bidi_it->invalid_rl_levels = -1;
967 bidi_it->stack_idx = 0;
968 bidi_it->resolved_level = bidi_it->level_stack[0].level;
969 }
970
971 /* Initialize the bidi iterator from buffer position CHARPOS. */
972 void
973 bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, struct bidi_it *bidi_it)
974 {
975 if (! bidi_initialized)
976 bidi_initialize ();
977 bidi_it->charpos = charpos;
978 bidi_it->bytepos = bytepos;
979 bidi_it->first_elt = 1;
980 bidi_set_paragraph_end (bidi_it);
981 bidi_it->new_paragraph = 1;
982 bidi_it->separator_limit = -1;
983 bidi_it->type = NEUTRAL_B;
984 bidi_it->type_after_w1 = NEUTRAL_B;
985 bidi_it->orig_type = NEUTRAL_B;
986 bidi_it->prev_was_pdf = 0;
987 bidi_it->prev.type = bidi_it->prev.type_after_w1 =
988 bidi_it->prev.orig_type = UNKNOWN_BT;
989 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
990 bidi_it->last_strong.orig_type = UNKNOWN_BT;
991 bidi_it->next_for_neutral.charpos = -1;
992 bidi_it->next_for_neutral.type =
993 bidi_it->next_for_neutral.type_after_w1 =
994 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
995 bidi_it->prev_for_neutral.charpos = -1;
996 bidi_it->prev_for_neutral.type =
997 bidi_it->prev_for_neutral.type_after_w1 =
998 bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
999 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
1000 }
1001
1002 /* Push the current embedding level and override status; reset the
1003 current level to LEVEL and the current override status to OVERRIDE. */
1004 static INLINE void
1005 bidi_push_embedding_level (struct bidi_it *bidi_it,
1006 int level, bidi_dir_t override)
1007 {
1008 bidi_it->stack_idx++;
1009 if (bidi_it->stack_idx >= BIDI_MAXLEVEL)
1010 abort ();
1011 bidi_it->level_stack[bidi_it->stack_idx].level = level;
1012 bidi_it->level_stack[bidi_it->stack_idx].override = override;
1013 }
1014
1015 /* Pop the embedding level and directional override status from the
1016 stack, and return the new level. */
1017 static INLINE int
1018 bidi_pop_embedding_level (struct bidi_it *bidi_it)
1019 {
1020 /* UAX#9 says to ignore invalid PDFs. */
1021 if (bidi_it->stack_idx > 0)
1022 bidi_it->stack_idx--;
1023 return bidi_it->level_stack[bidi_it->stack_idx].level;
1024 }
1025
1026 /* Record in SAVED_INFO the information about the current character. */
1027 static INLINE void
1028 bidi_remember_char (struct bidi_saved_info *saved_info,
1029 struct bidi_it *bidi_it)
1030 {
1031 saved_info->charpos = bidi_it->charpos;
1032 saved_info->bytepos = bidi_it->bytepos;
1033 saved_info->type = bidi_it->type;
1034 bidi_check_type (bidi_it->type);
1035 saved_info->type_after_w1 = bidi_it->type_after_w1;
1036 bidi_check_type (bidi_it->type_after_w1);
1037 saved_info->orig_type = bidi_it->orig_type;
1038 bidi_check_type (bidi_it->orig_type);
1039 }
1040
1041 /* Resolve the type of a neutral character according to the type of
1042 surrounding strong text and the current embedding level. */
1043 static INLINE bidi_type_t
1044 bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
1045 {
1046 /* N1: European and Arabic numbers are treated as though they were R. */
1047 if (next_type == WEAK_EN || next_type == WEAK_AN)
1048 next_type = STRONG_R;
1049 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
1050 prev_type = STRONG_R;
1051
1052 if (next_type == prev_type) /* N1 */
1053 return next_type;
1054 else if ((lev & 1) == 0) /* N2 */
1055 return STRONG_L;
1056 else
1057 return STRONG_R;
1058 }
1059
1060 static INLINE int
1061 bidi_explicit_dir_char (int c)
1062 {
1063 /* FIXME: this should be replaced with a lookup table with suitable
1064 bits set, like standard C ctype macros do. */
1065 return (c == LRE_CHAR || c == LRO_CHAR
1066 || c == RLE_CHAR || c == RLO_CHAR || c == PDF_CHAR);
1067 }
1068
1069 /* A helper function for bidi_resolve_explicit. It advances to the
1070 next character in logical order and determines the new embedding
1071 level and directional override, but does not take into account
1072 empty embeddings. */
1073 static int
1074 bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1075 {
1076 int curchar;
1077 bidi_type_t type;
1078 int current_level;
1079 int new_level;
1080 bidi_dir_t override;
1081
1082 if (bidi_it->bytepos < BEGV_BYTE /* after reseat to BEGV? */
1083 || bidi_it->first_elt)
1084 {
1085 bidi_it->first_elt = 0;
1086 if (bidi_it->charpos < BEGV)
1087 bidi_it->charpos = BEGV;
1088 bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
1089 }
1090 else if (bidi_it->bytepos < ZV_BYTE) /* don't move at ZV */
1091 {
1092 bidi_it->charpos++;
1093 if (bidi_it->ch_len == 0)
1094 abort ();
1095 bidi_it->bytepos += bidi_it->ch_len;
1096 }
1097
1098 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1099 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1100 new_level = current_level;
1101
1102 /* in case it is a unibyte character (not yet implemented) */
1103 /* _fetch_multibyte_char_len = 1; */
1104 if (bidi_it->bytepos >= ZV_BYTE)
1105 {
1106 curchar = BIDI_EOB;
1107 bidi_it->ch_len = 1;
1108 }
1109 else
1110 {
1111 curchar = FETCH_CHAR (bidi_it->bytepos);
1112 bidi_it->ch_len = CHAR_BYTES (curchar);
1113 }
1114 bidi_it->ch = curchar;
1115
1116 /* Don't apply directional override here, as all the types we handle
1117 below will not be affected by the override anyway, and we need
1118 the original type unaltered. The override will be applied in
1119 bidi_resolve_weak. */
1120 type = bidi_get_type (curchar, NEUTRAL_DIR);
1121 bidi_it->orig_type = type;
1122 bidi_check_type (bidi_it->orig_type);
1123
1124 if (type != PDF)
1125 bidi_it->prev_was_pdf = 0;
1126
1127 bidi_it->type_after_w1 = UNKNOWN_BT;
1128
1129 switch (type)
1130 {
1131 case RLE: /* X2 */
1132 case RLO: /* X4 */
1133 bidi_it->type_after_w1 = type;
1134 bidi_check_type (bidi_it->type_after_w1);
1135 type = WEAK_BN; /* X9/Retaining */
1136 if (bidi_it->ignore_bn_limit <= 0)
1137 {
1138 if (current_level <= BIDI_MAXLEVEL - 4)
1139 {
1140 /* Compute the least odd embedding level greater than
1141 the current level. */
1142 new_level = ((current_level + 1) & ~1) + 1;
1143 if (bidi_it->type_after_w1 == RLE)
1144 override = NEUTRAL_DIR;
1145 else
1146 override = R2L;
1147 if (current_level == BIDI_MAXLEVEL - 4)
1148 bidi_it->invalid_rl_levels = 0;
1149 bidi_push_embedding_level (bidi_it, new_level, override);
1150 }
1151 else
1152 {
1153 bidi_it->invalid_levels++;
1154 /* See the commentary about invalid_rl_levels below. */
1155 if (bidi_it->invalid_rl_levels < 0)
1156 bidi_it->invalid_rl_levels = 0;
1157 bidi_it->invalid_rl_levels++;
1158 }
1159 }
1160 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1161 || bidi_it->next_en_pos > bidi_it->charpos)
1162 type = WEAK_EN;
1163 break;
1164 case LRE: /* X3 */
1165 case LRO: /* X5 */
1166 bidi_it->type_after_w1 = type;
1167 bidi_check_type (bidi_it->type_after_w1);
1168 type = WEAK_BN; /* X9/Retaining */
1169 if (bidi_it->ignore_bn_limit <= 0)
1170 {
1171 if (current_level <= BIDI_MAXLEVEL - 5)
1172 {
1173 /* Compute the least even embedding level greater than
1174 the current level. */
1175 new_level = ((current_level + 2) & ~1);
1176 if (bidi_it->type_after_w1 == LRE)
1177 override = NEUTRAL_DIR;
1178 else
1179 override = L2R;
1180 bidi_push_embedding_level (bidi_it, new_level, override);
1181 }
1182 else
1183 {
1184 bidi_it->invalid_levels++;
1185 /* invalid_rl_levels counts invalid levels encountered
1186 while the embedding level was already too high for
1187 LRE/LRO, but not for RLE/RLO. That is because
1188 there may be exactly one PDF which we should not
1189 ignore even though invalid_levels is non-zero.
1190 invalid_rl_levels helps to know what PDF is
1191 that. */
1192 if (bidi_it->invalid_rl_levels >= 0)
1193 bidi_it->invalid_rl_levels++;
1194 }
1195 }
1196 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1197 || bidi_it->next_en_pos > bidi_it->charpos)
1198 type = WEAK_EN;
1199 break;
1200 case PDF: /* X7 */
1201 bidi_it->type_after_w1 = type;
1202 bidi_check_type (bidi_it->type_after_w1);
1203 type = WEAK_BN; /* X9/Retaining */
1204 if (bidi_it->ignore_bn_limit <= 0)
1205 {
1206 if (!bidi_it->invalid_rl_levels)
1207 {
1208 new_level = bidi_pop_embedding_level (bidi_it);
1209 bidi_it->invalid_rl_levels = -1;
1210 if (bidi_it->invalid_levels)
1211 bidi_it->invalid_levels--;
1212 /* else nothing: UAX#9 says to ignore invalid PDFs */
1213 }
1214 if (!bidi_it->invalid_levels)
1215 new_level = bidi_pop_embedding_level (bidi_it);
1216 else
1217 {
1218 bidi_it->invalid_levels--;
1219 bidi_it->invalid_rl_levels--;
1220 }
1221 }
1222 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
1223 || bidi_it->next_en_pos > bidi_it->charpos)
1224 type = WEAK_EN;
1225 break;
1226 default:
1227 /* Nothing. */
1228 break;
1229 }
1230
1231 bidi_it->type = type;
1232 bidi_check_type (bidi_it->type);
1233
1234 return new_level;
1235 }
1236
1237 /* Given an iterator state in BIDI_IT, advance one character position
1238 in the buffer to the next character (in the logical order), resolve
1239 any explicit embeddings and directional overrides, and return the
1240 embedding level of the character after resolving explicit
1241 directives and ignoring empty embeddings. */
1242 static int
1243 bidi_resolve_explicit (struct bidi_it *bidi_it)
1244 {
1245 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1246 int new_level = bidi_resolve_explicit_1 (bidi_it);
1247
1248 if (prev_level < new_level
1249 && bidi_it->type == WEAK_BN
1250 && bidi_it->ignore_bn_limit == 0 /* only if not already known */
1251 && bidi_it->bytepos < ZV_BYTE /* not already at EOB */
1252 && bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
1253 + bidi_it->ch_len)))
1254 {
1255 /* Avoid pushing and popping embedding levels if the level run
1256 is empty, as this breaks level runs where it shouldn't.
1257 UAX#9 removes all the explicit embedding and override codes,
1258 so empty embeddings disappear without a trace. We need to
1259 behave as if we did the same. */
1260 struct bidi_it saved_it;
1261 int level = prev_level;
1262
1263 bidi_copy_it (&saved_it, bidi_it);
1264
1265 while (bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
1266 + bidi_it->ch_len)))
1267 {
1268 level = bidi_resolve_explicit_1 (bidi_it);
1269 }
1270
1271 if (level == prev_level) /* empty embedding */
1272 saved_it.ignore_bn_limit = bidi_it->charpos + 1;
1273 else /* this embedding is non-empty */
1274 saved_it.ignore_bn_limit = -1;
1275
1276 bidi_copy_it (bidi_it, &saved_it);
1277 if (bidi_it->ignore_bn_limit > 0)
1278 {
1279 /* We pushed a level, but we shouldn't have. Undo that. */
1280 if (!bidi_it->invalid_rl_levels)
1281 {
1282 new_level = bidi_pop_embedding_level (bidi_it);
1283 bidi_it->invalid_rl_levels = -1;
1284 if (bidi_it->invalid_levels)
1285 bidi_it->invalid_levels--;
1286 }
1287 if (!bidi_it->invalid_levels)
1288 new_level = bidi_pop_embedding_level (bidi_it);
1289 else
1290 {
1291 bidi_it->invalid_levels--;
1292 bidi_it->invalid_rl_levels--;
1293 }
1294 }
1295 }
1296
1297 if (bidi_it->type == NEUTRAL_B) /* X8 */
1298 {
1299 bidi_set_paragraph_end (bidi_it);
1300 /* This is needed by bidi_resolve_weak below, and in L1. */
1301 bidi_it->type_after_w1 = bidi_it->type;
1302 bidi_check_type (bidi_it->type_after_w1);
1303 }
1304
1305 return new_level;
1306 }
1307
1308 /* Advance in the buffer, resolve weak types and return the type of
1309 the next character after weak type resolution. */
1310 static bidi_type_t
1311 bidi_resolve_weak (struct bidi_it *bidi_it)
1312 {
1313 bidi_type_t type;
1314 bidi_dir_t override;
1315 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1316 int new_level = bidi_resolve_explicit (bidi_it);
1317 int next_char;
1318 bidi_type_t type_of_next;
1319 struct bidi_it saved_it;
1320
1321 type = bidi_it->type;
1322 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1323
1324 if (type == UNKNOWN_BT
1325 || type == LRE
1326 || type == LRO
1327 || type == RLE
1328 || type == RLO
1329 || type == PDF)
1330 abort ();
1331
1332 if (new_level != prev_level
1333 || bidi_it->type == NEUTRAL_B)
1334 {
1335 /* We've got a new embedding level run, compute the directional
1336 type of sor and initialize per-run variables (UAX#9, clause
1337 X10). */
1338 bidi_set_sor_type (bidi_it, prev_level, new_level);
1339 }
1340 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1341 || type == WEAK_BN || type == STRONG_AL)
1342 bidi_it->type_after_w1 = type; /* needed in L1 */
1343 bidi_check_type (bidi_it->type_after_w1);
1344
1345 /* Level and directional override status are already recorded in
1346 bidi_it, and do not need any change; see X6. */
1347 if (override == R2L) /* X6 */
1348 type = STRONG_R;
1349 else if (override == L2R)
1350 type = STRONG_L;
1351 else
1352 {
1353 if (type == WEAK_NSM) /* W1 */
1354 {
1355 /* Note that we don't need to consider the case where the
1356 prev character has its type overridden by an RLO or LRO,
1357 because then either the type of this NSM would have been
1358 also overridden, or the previous character is outside the
1359 current level run, and thus not relevant to this NSM.
1360 This is why NSM gets the type_after_w1 of the previous
1361 character. */
1362 if (bidi_it->prev.type_after_w1 != UNKNOWN_BT
1363 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1364 && bidi_it->prev.type_after_w1 != NEUTRAL_B)
1365 type = bidi_it->prev.type_after_w1;
1366 else if (bidi_it->sor == R2L)
1367 type = STRONG_R;
1368 else if (bidi_it->sor == L2R)
1369 type = STRONG_L;
1370 else /* shouldn't happen! */
1371 abort ();
1372 }
1373 if (type == WEAK_EN /* W2 */
1374 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
1375 type = WEAK_AN;
1376 else if (type == STRONG_AL) /* W3 */
1377 type = STRONG_R;
1378 else if ((type == WEAK_ES /* W4 */
1379 && bidi_it->prev.type_after_w1 == WEAK_EN
1380 && bidi_it->prev.orig_type == WEAK_EN)
1381 || (type == WEAK_CS
1382 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1383 && bidi_it->prev.orig_type == WEAK_EN)
1384 || bidi_it->prev.type_after_w1 == WEAK_AN)))
1385 {
1386 next_char =
1387 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1388 ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
1389 type_of_next = bidi_get_type (next_char, override);
1390
1391 if (type_of_next == WEAK_BN
1392 || bidi_explicit_dir_char (next_char))
1393 {
1394 bidi_copy_it (&saved_it, bidi_it);
1395 while (bidi_resolve_explicit (bidi_it) == new_level
1396 && bidi_it->type == WEAK_BN)
1397 ;
1398 type_of_next = bidi_it->type;
1399 bidi_copy_it (bidi_it, &saved_it);
1400 }
1401
1402 /* If the next character is EN, but the last strong-type
1403 character is AL, that next EN will be changed to AN when
1404 we process it in W2 above. So in that case, this ES
1405 should not be changed into EN. */
1406 if (type == WEAK_ES
1407 && type_of_next == WEAK_EN
1408 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1409 type = WEAK_EN;
1410 else if (type == WEAK_CS)
1411 {
1412 if (bidi_it->prev.type_after_w1 == WEAK_AN
1413 && (type_of_next == WEAK_AN
1414 /* If the next character is EN, but the last
1415 strong-type character is AL, EN will be later
1416 changed to AN when we process it in W2 above.
1417 So in that case, this ES should not be
1418 changed into EN. */
1419 || (type_of_next == WEAK_EN
1420 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
1421 type = WEAK_AN;
1422 else if (bidi_it->prev.type_after_w1 == WEAK_EN
1423 && type_of_next == WEAK_EN
1424 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
1425 type = WEAK_EN;
1426 }
1427 }
1428 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1429 || type == WEAK_BN) /* W5/Retaining */
1430 {
1431 if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN w/EN before it */
1432 || bidi_it->next_en_pos > bidi_it->charpos)
1433 type = WEAK_EN;
1434 else /* W5: ET/BN with EN after it. */
1435 {
1436 EMACS_INT en_pos = bidi_it->charpos + 1;
1437
1438 next_char =
1439 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1440 ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
1441 type_of_next = bidi_get_type (next_char, override);
1442
1443 if (type_of_next == WEAK_ET
1444 || type_of_next == WEAK_BN
1445 || bidi_explicit_dir_char (next_char))
1446 {
1447 bidi_copy_it (&saved_it, bidi_it);
1448 while (bidi_resolve_explicit (bidi_it) == new_level
1449 && (bidi_it->type == WEAK_BN
1450 || bidi_it->type == WEAK_ET))
1451 ;
1452 type_of_next = bidi_it->type;
1453 en_pos = bidi_it->charpos;
1454 bidi_copy_it (bidi_it, &saved_it);
1455 }
1456 if (type_of_next == WEAK_EN)
1457 {
1458 /* If the last strong character is AL, the EN we've
1459 found will become AN when we get to it (W2). */
1460 if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
1461 {
1462 type = WEAK_EN;
1463 /* Remember this EN position, to speed up processing
1464 of the next ETs. */
1465 bidi_it->next_en_pos = en_pos;
1466 }
1467 else if (type == WEAK_BN)
1468 type = NEUTRAL_ON; /* W6/Retaining */
1469 }
1470 }
1471 }
1472 }
1473
1474 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
1475 || (type == WEAK_BN
1476 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1477 || bidi_it->prev.type_after_w1 == WEAK_ES
1478 || bidi_it->prev.type_after_w1 == WEAK_ET)))
1479 type = NEUTRAL_ON;
1480
1481 /* Store the type we've got so far, before we clobber it with strong
1482 types in W7 and while resolving neutral types. But leave alone
1483 the original types that were recorded above, because we will need
1484 them for the L1 clause. */
1485 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1486 bidi_it->type_after_w1 = type;
1487 bidi_check_type (bidi_it->type_after_w1);
1488
1489 if (type == WEAK_EN) /* W7 */
1490 {
1491 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
1492 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1493 type = STRONG_L;
1494 }
1495
1496 bidi_it->type = type;
1497 bidi_check_type (bidi_it->type);
1498 return type;
1499 }
1500
1501 static bidi_type_t
1502 bidi_resolve_neutral (struct bidi_it *bidi_it)
1503 {
1504 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1505 bidi_type_t type = bidi_resolve_weak (bidi_it);
1506 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1507
1508 if (!(type == STRONG_R
1509 || type == STRONG_L
1510 || type == WEAK_BN
1511 || type == WEAK_EN
1512 || type == WEAK_AN
1513 || type == NEUTRAL_B
1514 || type == NEUTRAL_S
1515 || type == NEUTRAL_WS
1516 || type == NEUTRAL_ON))
1517 abort ();
1518
1519 if (bidi_get_category (type) == NEUTRAL
1520 || (type == WEAK_BN && prev_level == current_level))
1521 {
1522 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1523 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1524 bidi_it->next_for_neutral.type,
1525 current_level);
1526 else
1527 {
1528 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1529 the assumption of batch-style processing; see clauses W4,
1530 W5, and especially N1, which require to look far forward
1531 (as well as back) in the buffer. May the fleas of a
1532 thousand camels infest the armpits of those who design
1533 supposedly general-purpose algorithms by looking at their
1534 own implementations, and fail to consider other possible
1535 implementations! */
1536 struct bidi_it saved_it;
1537 bidi_type_t next_type;
1538
1539 if (bidi_it->scan_dir == -1)
1540 abort ();
1541
1542 bidi_copy_it (&saved_it, bidi_it);
1543 /* Scan the text forward until we find the first non-neutral
1544 character, and then use that to resolve the neutral we
1545 are dealing with now. We also cache the scanned iterator
1546 states, to salvage some of the effort later. */
1547 bidi_cache_iterator_state (bidi_it, 0);
1548 do {
1549 /* Record the info about the previous character, so that
1550 it will be cached below with this state. */
1551 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1552 && bidi_it->type != WEAK_BN)
1553 bidi_remember_char (&bidi_it->prev, bidi_it);
1554 type = bidi_resolve_weak (bidi_it);
1555 /* Paragraph separators have their levels fully resolved
1556 at this point, so cache them as resolved. */
1557 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1558 /* FIXME: implement L1 here, by testing for a newline and
1559 resetting the level for any sequence of whitespace
1560 characters adjacent to it. */
1561 } while (!(type == NEUTRAL_B
1562 || (type != WEAK_BN
1563 && bidi_get_category (type) != NEUTRAL)
1564 /* This is all per level run, so stop when we
1565 reach the end of this level run. */
1566 || bidi_it->level_stack[bidi_it->stack_idx].level !=
1567 current_level));
1568
1569 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1570
1571 switch (type)
1572 {
1573 case STRONG_L:
1574 case STRONG_R:
1575 case STRONG_AL:
1576 next_type = type;
1577 break;
1578 case WEAK_EN:
1579 case WEAK_AN:
1580 /* N1: ``European and Arabic numbers are treated as
1581 though they were R.'' */
1582 next_type = STRONG_R;
1583 saved_it.next_for_neutral.type = STRONG_R;
1584 break;
1585 case WEAK_BN:
1586 if (!bidi_explicit_dir_char (bidi_it->ch))
1587 abort (); /* can't happen: BNs are skipped */
1588 /* FALLTHROUGH */
1589 case NEUTRAL_B:
1590 /* Marched all the way to the end of this level run.
1591 We need to use the eor type, whose information is
1592 stored by bidi_set_sor_type in the prev_for_neutral
1593 member. */
1594 if (saved_it.type != WEAK_BN
1595 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
1596 {
1597 next_type = bidi_it->prev_for_neutral.type;
1598 saved_it.next_for_neutral.type = next_type;
1599 bidi_check_type (next_type);
1600 }
1601 else
1602 {
1603 /* This is a BN which does not adjoin neutrals.
1604 Leave its type alone. */
1605 bidi_copy_it (bidi_it, &saved_it);
1606 return bidi_it->type;
1607 }
1608 break;
1609 default:
1610 abort ();
1611 }
1612 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1613 next_type, current_level);
1614 saved_it.type = type;
1615 bidi_check_type (type);
1616 bidi_copy_it (bidi_it, &saved_it);
1617 }
1618 }
1619 return type;
1620 }
1621
1622 /* Given an iterator state in BIDI_IT, advance one character position
1623 in the buffer to the next character (in the logical order), resolve
1624 the bidi type of that next character, and return that type. */
1625 static bidi_type_t
1626 bidi_type_of_next_char (struct bidi_it *bidi_it)
1627 {
1628 bidi_type_t type;
1629
1630 /* This should always be called during a forward scan. */
1631 if (bidi_it->scan_dir != 1)
1632 abort ();
1633
1634 /* Reset the limit until which to ignore BNs if we step out of the
1635 area where we found only empty levels. */
1636 if ((bidi_it->ignore_bn_limit > 0
1637 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
1638 || (bidi_it->ignore_bn_limit == -1
1639 && !bidi_explicit_dir_char (bidi_it->ch)))
1640 bidi_it->ignore_bn_limit = 0;
1641
1642 type = bidi_resolve_neutral (bidi_it);
1643
1644 return type;
1645 }
1646
1647 /* Given an iterator state BIDI_IT, advance one character position in
1648 the buffer to the next character (in the logical order), resolve
1649 the embedding and implicit levels of that next character, and
1650 return the resulting level. */
1651 static int
1652 bidi_level_of_next_char (struct bidi_it *bidi_it)
1653 {
1654 bidi_type_t type;
1655 int level, prev_level = -1;
1656 struct bidi_saved_info next_for_neutral;
1657
1658 if (bidi_it->scan_dir == 1)
1659 {
1660 /* There's no sense in trying to advance if we hit end of text. */
1661 if (bidi_it->bytepos >= ZV_BYTE)
1662 return bidi_it->resolved_level;
1663
1664 /* Record the info about the previous character. */
1665 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
1666 && bidi_it->type != WEAK_BN)
1667 bidi_remember_char (&bidi_it->prev, bidi_it);
1668 if (bidi_it->type_after_w1 == STRONG_R
1669 || bidi_it->type_after_w1 == STRONG_L
1670 || bidi_it->type_after_w1 == STRONG_AL)
1671 bidi_remember_char (&bidi_it->last_strong, bidi_it);
1672 /* FIXME: it sounds like we don't need both prev and
1673 prev_for_neutral members, but I'm leaving them both for now. */
1674 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1675 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1676 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1677
1678 /* If we overstepped the characters used for resolving neutrals
1679 and whitespace, invalidate their info in the iterator. */
1680 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1681 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1682 if (bidi_it->next_en_pos >= 0
1683 && bidi_it->charpos >= bidi_it->next_en_pos)
1684 bidi_it->next_en_pos = -1;
1685 if (bidi_it->next_for_ws.type != UNKNOWN_BT
1686 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1687 bidi_it->next_for_ws.type = UNKNOWN_BT;
1688
1689 /* This must be taken before we fill the iterator with the info
1690 about the next char. If we scan backwards, the iterator
1691 state must be already cached, so there's no need to know the
1692 embedding level of the previous character, since we will be
1693 returning to our caller shortly. */
1694 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1695 }
1696 next_for_neutral = bidi_it->next_for_neutral;
1697
1698 /* Perhaps it is already cached. */
1699 type = bidi_cache_find (bidi_it->charpos + bidi_it->scan_dir, -1, bidi_it);
1700 if (type != UNKNOWN_BT)
1701 {
1702 /* Don't lose the information for resolving neutrals! The
1703 cached states could have been cached before their
1704 next_for_neutral member was computed. If we are on our way
1705 forward, we can simply take the info from the previous
1706 state. */
1707 if (bidi_it->scan_dir == 1
1708 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
1709 bidi_it->next_for_neutral = next_for_neutral;
1710
1711 /* If resolved_level is -1, it means this state was cached
1712 before it was completely resolved, so we cannot return
1713 it. */
1714 if (bidi_it->resolved_level != -1)
1715 return bidi_it->resolved_level;
1716 }
1717 if (bidi_it->scan_dir == -1)
1718 /* If we are going backwards, the iterator state is already cached
1719 from previous scans, and should be fully resolved. */
1720 abort ();
1721
1722 if (type == UNKNOWN_BT)
1723 type = bidi_type_of_next_char (bidi_it);
1724
1725 if (type == NEUTRAL_B)
1726 return bidi_it->resolved_level;
1727
1728 level = bidi_it->level_stack[bidi_it->stack_idx].level;
1729 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
1730 || (type == WEAK_BN && prev_level == level))
1731 {
1732 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1733 abort ();
1734
1735 /* If the cached state shows a neutral character, it was not
1736 resolved by bidi_resolve_neutral, so do it now. */
1737 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1738 bidi_it->next_for_neutral.type,
1739 level);
1740 }
1741
1742 if (!(type == STRONG_R
1743 || type == STRONG_L
1744 || type == WEAK_BN
1745 || type == WEAK_EN
1746 || type == WEAK_AN))
1747 abort ();
1748 bidi_it->type = type;
1749 bidi_check_type (bidi_it->type);
1750
1751 /* For L1 below, we need to know, for each WS character, whether
1752 it belongs to a sequence of WS characters preceeding a newline
1753 or a TAB or a paragraph separator. */
1754 if (bidi_it->orig_type == NEUTRAL_WS
1755 && bidi_it->next_for_ws.type == UNKNOWN_BT)
1756 {
1757 int ch;
1758 int clen = bidi_it->ch_len;
1759 EMACS_INT bpos = bidi_it->bytepos;
1760 EMACS_INT cpos = bidi_it->charpos;
1761 bidi_type_t chtype;
1762
1763 do {
1764 /*_fetch_multibyte_char_len = 1;*/
1765 ch = bpos + clen >= ZV_BYTE ? BIDI_EOB : FETCH_CHAR (bpos + clen);
1766 bpos += clen;
1767 cpos++;
1768 clen = (ch == BIDI_EOB ? 1 : CHAR_BYTES (ch));
1769 if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
1770 chtype = NEUTRAL_B;
1771 else
1772 chtype = bidi_get_type (ch, NEUTRAL_DIR);
1773 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
1774 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
1775 bidi_it->next_for_ws.type = chtype;
1776 bidi_check_type (bidi_it->next_for_ws.type);
1777 bidi_it->next_for_ws.charpos = cpos;
1778 bidi_it->next_for_ws.bytepos = bpos;
1779 }
1780
1781 /* Resolve implicit levels, with a twist: PDFs get the embedding
1782 level of the enbedding they terminate. See below for the
1783 reason. */
1784 if (bidi_it->orig_type == PDF
1785 /* Don't do this if this formatting code didn't change the
1786 embedding level due to invalid or empty embeddings. */
1787 && prev_level != level)
1788 {
1789 /* Don't look in UAX#9 for the reason for this: it's our own
1790 private quirk. The reason is that we want the formatting
1791 codes to be delivered so that they bracket the text of their
1792 embedding. For example, given the text
1793
1794 {RLO}teST{PDF}
1795
1796 we want it to be displayed as
1797
1798 {RLO}STet{PDF}
1799
1800 not as
1801
1802 STet{RLO}{PDF}
1803
1804 which will result because we bump up the embedding level as
1805 soon as we see the RLO and pop it as soon as we see the PDF,
1806 so RLO itself has the same embedding level as "teST", and
1807 thus would be normally delivered last, just before the PDF.
1808 The switch below fiddles with the level of PDF so that this
1809 ugly side effect does not happen.
1810
1811 (This is, of course, only important if the formatting codes
1812 are actually displayed, but Emacs does need to display them
1813 if the user wants to.) */
1814 level = prev_level;
1815 }
1816 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
1817 || bidi_it->orig_type == NEUTRAL_S
1818 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
1819 /* || bidi_it->ch == LINESEP_CHAR */
1820 || (bidi_it->orig_type == NEUTRAL_WS
1821 && (bidi_it->next_for_ws.type == NEUTRAL_B
1822 || bidi_it->next_for_ws.type == NEUTRAL_S)))
1823 level = bidi_it->level_stack[0].level;
1824 else if ((level & 1) == 0) /* I1 */
1825 {
1826 if (type == STRONG_R)
1827 level++;
1828 else if (type == WEAK_EN || type == WEAK_AN)
1829 level += 2;
1830 }
1831 else /* I2 */
1832 {
1833 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
1834 level++;
1835 }
1836
1837 bidi_it->resolved_level = level;
1838 return level;
1839 }
1840
1841 /* Move to the other edge of a level given by LEVEL. If END_FLAG is
1842 non-zero, we are at the end of a level, and we need to prepare to
1843 resume the scan of the lower level.
1844
1845 If this level's other edge is cached, we simply jump to it, filling
1846 the iterator structure with the iterator state on the other edge.
1847 Otherwise, we walk the buffer until we come back to the same level
1848 as LEVEL.
1849
1850 Note: we are not talking here about a ``level run'' in the UAX#9
1851 sense of the term, but rather about a ``level'' which includes
1852 all the levels higher than it. In other words, given the levels
1853 like this:
1854
1855 11111112222222333333334443343222222111111112223322111
1856 A B C
1857
1858 and assuming we are at point A scanning left to right, this
1859 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
1860 at point B. */
1861 static void
1862 bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
1863 {
1864 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
1865 int idx;
1866
1867 /* Try the cache first. */
1868 if ((idx = bidi_cache_find_level_change (level, dir, end_flag)) >= 0)
1869 bidi_cache_fetch_state (idx, bidi_it);
1870 else
1871 {
1872 int new_level;
1873
1874 if (end_flag)
1875 abort (); /* if we are at end of level, its edges must be cached */
1876
1877 bidi_cache_iterator_state (bidi_it, 1);
1878 do {
1879 new_level = bidi_level_of_next_char (bidi_it);
1880 bidi_cache_iterator_state (bidi_it, 1);
1881 } while (new_level >= level);
1882 }
1883 }
1884
1885 void
1886 bidi_get_next_char_visually (struct bidi_it *bidi_it)
1887 {
1888 int old_level, new_level, next_level;
1889 struct bidi_it sentinel;
1890
1891 if (bidi_it->scan_dir == 0)
1892 {
1893 bidi_it->scan_dir = 1; /* default to logical order */
1894 }
1895
1896 /* If we just passed a newline, initialize for the next line. */
1897 if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
1898 bidi_line_init (bidi_it);
1899
1900 /* Prepare the sentinel iterator state. */
1901 if (bidi_cache_idx == 0)
1902 {
1903 bidi_copy_it (&sentinel, bidi_it);
1904 if (bidi_it->first_elt)
1905 {
1906 sentinel.charpos--; /* cached charpos needs to be monotonic */
1907 sentinel.bytepos--;
1908 sentinel.ch = '\n'; /* doesn't matter, but why not? */
1909 sentinel.ch_len = 1;
1910 }
1911 }
1912
1913 old_level = bidi_it->resolved_level;
1914 new_level = bidi_level_of_next_char (bidi_it);
1915
1916 /* Reordering of resolved levels (clause L2) is implemented by
1917 jumping to the other edge of the level and flipping direction of
1918 scanning the text whenever we find a level change. */
1919 if (new_level != old_level)
1920 {
1921 int ascending = new_level > old_level;
1922 int level_to_search = ascending ? old_level + 1 : old_level;
1923 int incr = ascending ? 1 : -1;
1924 int expected_next_level = old_level + incr;
1925
1926 /* If we don't have anything cached yet, we need to cache the
1927 sentinel state, since we'll need it to record where to jump
1928 when the last non-base level is exhausted. */
1929 if (bidi_cache_idx == 0)
1930 bidi_cache_iterator_state (&sentinel, 1);
1931 /* Jump (or walk) to the other edge of this level. */
1932 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1933 /* Switch scan direction and peek at the next character in the
1934 new direction. */
1935 bidi_it->scan_dir = -bidi_it->scan_dir;
1936
1937 /* The following loop handles the case where the resolved level
1938 jumps by more than one. This is typical for numbers inside a
1939 run of text with left-to-right embedding direction, but can
1940 also happen in other situations. In those cases the decision
1941 where to continue after a level change, and in what direction,
1942 is tricky. For example, given a text like below:
1943
1944 abcdefgh
1945 11336622
1946
1947 (where the numbers below the text show the resolved levels),
1948 the result of reordering according to UAX#9 should be this:
1949
1950 efdcghba
1951
1952 This is implemented by the loop below which flips direction
1953 and jumps to the other edge of the level each time it finds
1954 the new level not to be the expected one. The expected level
1955 is always one more or one less than the previous one. */
1956 next_level = bidi_peek_at_next_level (bidi_it);
1957 while (next_level != expected_next_level)
1958 {
1959 expected_next_level += incr;
1960 level_to_search += incr;
1961 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1962 bidi_it->scan_dir = -bidi_it->scan_dir;
1963 next_level = bidi_peek_at_next_level (bidi_it);
1964 }
1965
1966 /* Finally, deliver the next character in the new direction. */
1967 next_level = bidi_level_of_next_char (bidi_it);
1968 }
1969
1970 /* Take note when we have just processed the newline that precedes
1971 the end of the paragraph. The next time we are about to be
1972 called, set_iterator_to_next will automatically reinit the
1973 paragraph direction, if needed. We do this at the newline before
1974 the paragraph separator, because the next character might not be
1975 the first character of the next paragraph, due to the bidi
1976 reordering, whereas we _must_ know the paragraph base direction
1977 _before_ we process the paragraph's text, since the base
1978 direction affects the reordering. */
1979 if (bidi_it->scan_dir == 1
1980 && bidi_it->orig_type == NEUTRAL_B
1981 && bidi_it->bytepos < ZV_BYTE)
1982 {
1983 EMACS_INT sep_len =
1984 bidi_at_paragraph_end (bidi_it->charpos + 1,
1985 bidi_it->bytepos + bidi_it->ch_len);
1986 if (sep_len >= 0)
1987 {
1988 bidi_it->new_paragraph = 1;
1989 /* Record the buffer position of the last character of the
1990 paragraph separator. */
1991 bidi_it->separator_limit = bidi_it->charpos + 1 + sep_len;
1992 }
1993 }
1994
1995 if (bidi_it->scan_dir == 1 && bidi_cache_idx)
1996 {
1997 /* If we are at paragraph's base embedding level and beyond the
1998 last cached position, the cache's job is done and we can
1999 discard it. */
2000 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
2001 && bidi_it->charpos > bidi_cache[bidi_cache_idx - 1].charpos)
2002 bidi_cache_reset ();
2003 /* But as long as we are caching during forward scan, we must
2004 cache each state, or else the cache integrity will be
2005 compromised: it assumes cached states correspond to buffer
2006 positions 1:1. */
2007 else
2008 bidi_cache_iterator_state (bidi_it, 1);
2009 }
2010 }
2011
2012 /* This is meant to be called from within the debugger, whenever you
2013 wish to examine the cache contents. */
2014 void
2015 bidi_dump_cached_states (void)
2016 {
2017 int i;
2018 int ndigits = 1;
2019
2020 if (bidi_cache_idx == 0)
2021 {
2022 fprintf (stderr, "The cache is empty.\n");
2023 return;
2024 }
2025 fprintf (stderr, "Total of %d state%s in cache:\n",
2026 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2027
2028 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2029 ndigits++;
2030 fputs ("ch ", stderr);
2031 for (i = 0; i < bidi_cache_idx; i++)
2032 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2033 fputs ("\n", stderr);
2034 fputs ("lvl ", stderr);
2035 for (i = 0; i < bidi_cache_idx; i++)
2036 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2037 fputs ("\n", stderr);
2038 fputs ("pos ", stderr);
2039 for (i = 0; i < bidi_cache_idx; i++)
2040 fprintf (stderr, "%*d", ndigits, bidi_cache[i].charpos);
2041 fputs ("\n", stderr);
2042 }