Retrospective commit from 2009-11-21.
[bpt/emacs.git] / src / bidi.c
CommitLineData
b7b65b15 1/* Low-level bidirectional buffer-scanning functions for GNU Emacs.
c143c213 2 Copyright (C) 2000, 2001, 2004, 2005, 2009 Free Software Foundation, Inc.
b7b65b15
EZ
3
4This file is part of GNU Emacs.
5
6GNU Emacs is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation; either version 2, or (at your option)
9any later version.
10
11GNU Emacs is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14GNU General Public License for more details.
15
16
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20Boston, MA 02110-1301, USA. */
21
2d6e4628
EZ
22/* Written by Eli Zaretskii <eliz@gnu.org>.
23
24 A sequential implementation of the Unicode Bidirectional algorithm,
b7b65b15
EZ
25 as per UAX#9, a part of the Unicode Standard.
26
27 Unlike the reference and most other implementations, this one is
28 designed to be called once for every character in the buffer.
29
30 The main entry point is bidi_get_next_char_visually. Each time it
31 is called, it finds the next character in the visual order, and
32 returns its information in a special structure. The caller is then
33 expected to process this character for display or any other
34 purposes, and call bidi_get_next_char_visually for the next
35 character. See the comments in bidi_get_next_char_visually for
36 more details about its algorithm that finds the next visual-order
37 character by resolving their levels on the fly.
38
89d3374a
EZ
39 If you want to understand the code, you will have to read it
40 together with the relevant portions of UAX#9. The comments include
41 references to UAX#9 rules, for that very reason.
42
b7b65b15
EZ
43 A note about references to UAX#9 rules: if the reference says
44 something like "X9/Retaining", it means that you need to refer to
45 rule X9 and to its modifications decribed in the "Implementation
46 Notes" section of UAX#9, under "Retaining Format Codes". */
47
48#ifdef HAVE_CONFIG_H
49#include <config.h>
50#endif
51
52#include <stdio.h>
53
54#ifdef HAVE_STRING_H
55#include <string.h>
56#endif
57
29e3d8d1
EZ
58#include <setjmp.h>
59
b7b65b15
EZ
60#include "lisp.h"
61#include "buffer.h"
62#include "character.h"
63#include "dispextern.h"
64
65static int bidi_initialized = 0;
66
67static Lisp_Object bidi_type_table;
68
e7402cb2 69/* FIXME: Remove these when bidi_explicit_dir_char uses a lookup table. */
b7b65b15
EZ
70#define LRM_CHAR 0x200E
71#define RLM_CHAR 0x200F
72#define LRE_CHAR 0x202A
73#define RLE_CHAR 0x202B
74#define PDF_CHAR 0x202C
75#define LRO_CHAR 0x202D
76#define RLO_CHAR 0x202E
77
b7b65b15 78#define BIDI_EOB -1
e7402cb2 79#define BIDI_BOB -2 /* FIXME: Is this needed? */
b7b65b15 80
b7b65b15
EZ
81/* Local data structures. (Look in dispextern.h for the rest.) */
82
83/* What we need to know about the current paragraph. */
84struct bidi_paragraph_info {
85 int start_bytepos; /* byte position where it begins */
86 int end_bytepos; /* byte position where it ends */
87 int embedding_level; /* its basic embedding level */
88 bidi_dir_t base_dir; /* its base direction */
89};
90
91/* Data type for describing the bidirectional character categories. */
92typedef enum {
93 UNKNOWN_BC,
94 NEUTRAL,
95 WEAK,
96 STRONG
97} bidi_category_t;
98
99int bidi_ignore_explicit_marks_for_paragraph_level = 1;
100
6bff6497
EZ
101static Lisp_Object fallback_paragraph_start_re, fallback_paragraph_separate_re;
102static Lisp_Object Qparagraph_start, Qparagraph_separate;
b7b65b15 103
b7b65b15
EZ
104static void
105bidi_initialize ()
106{
107 /* FIXME: This should come from the Unicode Database. */
108 struct {
109 int from, to;
110 bidi_type_t type;
111 } bidi_type[] =
112 { { 0x0000, 0x0008, WEAK_BN },
113 { 0x0009, 0x0000, NEUTRAL_S },
114 { 0x000A, 0x0000, NEUTRAL_B },
115 { 0x000B, 0x0000, NEUTRAL_S },
116 { 0x000C, 0x0000, NEUTRAL_WS },
117 { 0x000D, 0x0000, NEUTRAL_B },
118 { 0x000E, 0x001B, WEAK_BN },
119 { 0x001C, 0x001E, NEUTRAL_B },
120 { 0x001F, 0x0000, NEUTRAL_S },
121 { 0x0020, 0x0000, NEUTRAL_WS },
122 { 0x0021, 0x0022, NEUTRAL_ON },
123 { 0x0023, 0x0025, WEAK_ET },
124 { 0x0026, 0x002A, NEUTRAL_ON },
e342a24d 125 { 0x002B, 0x0000, WEAK_ES },
b7b65b15 126 { 0x002C, 0x0000, WEAK_CS },
e342a24d
EZ
127 { 0x002D, 0x0000, WEAK_ES },
128 { 0x002E, 0x002F, WEAK_CS },
b7b65b15
EZ
129 { 0x0030, 0x0039, WEAK_EN },
130 { 0x003A, 0x0000, WEAK_CS },
131 { 0x003B, 0x0040, NEUTRAL_ON },
132 { 0x005B, 0x0060, NEUTRAL_ON },
133 { 0x007B, 0x007E, NEUTRAL_ON },
134 { 0x007F, 0x0084, WEAK_BN },
135 { 0x0085, 0x0000, NEUTRAL_B },
136 { 0x0086, 0x009F, WEAK_BN },
137 { 0x00A0, 0x0000, WEAK_CS },
138 { 0x00A1, 0x0000, NEUTRAL_ON },
139 { 0x00A2, 0x00A5, WEAK_ET },
140 { 0x00A6, 0x00A9, NEUTRAL_ON },
e342a24d
EZ
141 { 0x00AB, 0x00AC, NEUTRAL_ON },
142 { 0x00AD, 0x0000, WEAK_BN },
143 { 0x00AE, 0x00Af, NEUTRAL_ON },
b7b65b15
EZ
144 { 0x00B0, 0x00B1, WEAK_ET },
145 { 0x00B2, 0x00B3, WEAK_EN },
146 { 0x00B4, 0x0000, NEUTRAL_ON },
147 { 0x00B6, 0x00B8, NEUTRAL_ON },
148 { 0x00B9, 0x0000, WEAK_EN },
149 { 0x00BB, 0x00BF, NEUTRAL_ON },
150 { 0x00D7, 0x0000, NEUTRAL_ON },
151 { 0x00F7, 0x0000, NEUTRAL_ON },
152 { 0x02B9, 0x02BA, NEUTRAL_ON },
153 { 0x02C2, 0x02CF, NEUTRAL_ON },
154 { 0x02D2, 0x02DF, NEUTRAL_ON },
155 { 0x02E5, 0x02ED, NEUTRAL_ON },
156 { 0x0300, 0x036F, WEAK_NSM },
157 { 0x0374, 0x0375, NEUTRAL_ON },
158 { 0x037E, 0x0385, NEUTRAL_ON },
159 { 0x0387, 0x0000, NEUTRAL_ON },
160 { 0x03F6, 0x0000, NEUTRAL_ON },
161 { 0x0483, 0x0489, WEAK_NSM },
162 { 0x058A, 0x0000, NEUTRAL_ON },
163 { 0x0591, 0x05BD, WEAK_NSM },
164 { 0x05BE, 0x0000, STRONG_R },
165 { 0x05BF, 0x0000, WEAK_NSM },
166 { 0x05C0, 0x0000, STRONG_R },
167 { 0x05C1, 0x05C2, WEAK_NSM },
168 { 0x05C3, 0x0000, STRONG_R },
e342a24d
EZ
169 { 0x05C4, 0x05C5, WEAK_NSM },
170 { 0x05C6, 0x0000, STRONG_R },
171 { 0x05C7, 0x0000, WEAK_NSM },
b7b65b15
EZ
172 { 0x05D0, 0x05F4, STRONG_R },
173 { 0x060C, 0x0000, WEAK_CS },
174 { 0x061B, 0x064A, STRONG_AL },
175 { 0x064B, 0x0655, WEAK_NSM },
176 { 0x0660, 0x0669, WEAK_AN },
177 { 0x066A, 0x0000, WEAK_ET },
178 { 0x066B, 0x066C, WEAK_AN },
179 { 0x066D, 0x066F, STRONG_AL },
180 { 0x0670, 0x0000, WEAK_NSM },
181 { 0x0671, 0x06D5, STRONG_AL },
182 { 0x06D6, 0x06DC, WEAK_NSM },
183 { 0x06DD, 0x0000, STRONG_AL },
184 { 0x06DE, 0x06E4, WEAK_NSM },
185 { 0x06E5, 0x06E6, STRONG_AL },
186 { 0x06E7, 0x06E8, WEAK_NSM },
187 { 0x06E9, 0x0000, NEUTRAL_ON },
188 { 0x06EA, 0x06ED, WEAK_NSM },
189 { 0x06F0, 0x06F9, WEAK_EN },
190 { 0x06FA, 0x070D, STRONG_AL },
191 { 0x070F, 0x0000, WEAK_BN },
192 { 0x0710, 0x0000, STRONG_AL },
193 { 0x0711, 0x0000, WEAK_NSM },
194 { 0x0712, 0x072C, STRONG_AL },
195 { 0x0730, 0x074A, WEAK_NSM },
196 { 0x0780, 0x07A5, STRONG_AL },
197 { 0x07A6, 0x07B0, WEAK_NSM },
198 { 0x07B1, 0x0000, STRONG_AL },
199 { 0x0901, 0x0902, WEAK_NSM },
200 { 0x093C, 0x0000, WEAK_NSM },
201 { 0x0941, 0x0948, WEAK_NSM },
202 { 0x094D, 0x0000, WEAK_NSM },
203 { 0x0951, 0x0954, WEAK_NSM },
204 { 0x0962, 0x0963, WEAK_NSM },
205 { 0x0981, 0x0000, WEAK_NSM },
206 { 0x09BC, 0x0000, WEAK_NSM },
207 { 0x09C1, 0x09C4, WEAK_NSM },
208 { 0x09CD, 0x0000, WEAK_NSM },
209 { 0x09E2, 0x09E3, WEAK_NSM },
210 { 0x09F2, 0x09F3, WEAK_ET },
211 { 0x0A02, 0x0000, WEAK_NSM },
212 { 0x0A3C, 0x0000, WEAK_NSM },
213 { 0x0A41, 0x0A4D, WEAK_NSM },
214 { 0x0A70, 0x0A71, WEAK_NSM },
215 { 0x0A81, 0x0A82, WEAK_NSM },
216 { 0x0ABC, 0x0000, WEAK_NSM },
217 { 0x0AC1, 0x0AC8, WEAK_NSM },
218 { 0x0ACD, 0x0000, WEAK_NSM },
219 { 0x0B01, 0x0000, WEAK_NSM },
220 { 0x0B3C, 0x0000, WEAK_NSM },
221 { 0x0B3F, 0x0000, WEAK_NSM },
222 { 0x0B41, 0x0B43, WEAK_NSM },
223 { 0x0B4D, 0x0B56, WEAK_NSM },
224 { 0x0B82, 0x0000, WEAK_NSM },
225 { 0x0BC0, 0x0000, WEAK_NSM },
226 { 0x0BCD, 0x0000, WEAK_NSM },
227 { 0x0C3E, 0x0C40, WEAK_NSM },
228 { 0x0C46, 0x0C56, WEAK_NSM },
229 { 0x0CBF, 0x0000, WEAK_NSM },
230 { 0x0CC6, 0x0000, WEAK_NSM },
231 { 0x0CCC, 0x0CCD, WEAK_NSM },
232 { 0x0D41, 0x0D43, WEAK_NSM },
233 { 0x0D4D, 0x0000, WEAK_NSM },
234 { 0x0DCA, 0x0000, WEAK_NSM },
235 { 0x0DD2, 0x0DD6, WEAK_NSM },
236 { 0x0E31, 0x0000, WEAK_NSM },
237 { 0x0E34, 0x0E3A, WEAK_NSM },
238 { 0x0E3F, 0x0000, WEAK_ET },
239 { 0x0E47, 0x0E4E, WEAK_NSM },
240 { 0x0EB1, 0x0000, WEAK_NSM },
241 { 0x0EB4, 0x0EBC, WEAK_NSM },
242 { 0x0EC8, 0x0ECD, WEAK_NSM },
243 { 0x0F18, 0x0F19, WEAK_NSM },
244 { 0x0F35, 0x0000, WEAK_NSM },
245 { 0x0F37, 0x0000, WEAK_NSM },
246 { 0x0F39, 0x0000, WEAK_NSM },
247 { 0x0F3A, 0x0F3D, NEUTRAL_ON },
248 { 0x0F71, 0x0F7E, WEAK_NSM },
249 { 0x0F80, 0x0F84, WEAK_NSM },
250 { 0x0F86, 0x0F87, WEAK_NSM },
251 { 0x0F90, 0x0FBC, WEAK_NSM },
252 { 0x0FC6, 0x0000, WEAK_NSM },
253 { 0x102D, 0x1030, WEAK_NSM },
254 { 0x1032, 0x1037, WEAK_NSM },
255 { 0x1039, 0x0000, WEAK_NSM },
256 { 0x1058, 0x1059, WEAK_NSM },
257 { 0x1680, 0x0000, NEUTRAL_WS },
258 { 0x169B, 0x169C, NEUTRAL_ON },
259 { 0x1712, 0x1714, WEAK_NSM },
260 { 0x1732, 0x1734, WEAK_NSM },
261 { 0x1752, 0x1753, WEAK_NSM },
262 { 0x1772, 0x1773, WEAK_NSM },
263 { 0x17B7, 0x17BD, WEAK_NSM },
264 { 0x17C6, 0x0000, WEAK_NSM },
265 { 0x17C9, 0x17D3, WEAK_NSM },
266 { 0x17DB, 0x0000, WEAK_ET },
267 { 0x1800, 0x180A, NEUTRAL_ON },
268 { 0x180B, 0x180D, WEAK_NSM },
269 { 0x180E, 0x0000, WEAK_BN },
270 { 0x18A9, 0x0000, WEAK_NSM },
271 { 0x1FBD, 0x0000, NEUTRAL_ON },
272 { 0x1FBF, 0x1FC1, NEUTRAL_ON },
273 { 0x1FCD, 0x1FCF, NEUTRAL_ON },
274 { 0x1FDD, 0x1FDF, NEUTRAL_ON },
275 { 0x1FED, 0x1FEF, NEUTRAL_ON },
276 { 0x1FFD, 0x1FFE, NEUTRAL_ON },
277 { 0x2000, 0x200A, NEUTRAL_WS },
278 { 0x200B, 0x200D, WEAK_BN },
279 { 0x200F, 0x0000, STRONG_R },
280 { 0x2010, 0x2027, NEUTRAL_ON },
281 { 0x2028, 0x0000, NEUTRAL_WS },
282 { 0x2029, 0x0000, NEUTRAL_B },
283 { 0x202A, 0x0000, LRE },
284 { 0x202B, 0x0000, RLE },
285 { 0x202C, 0x0000, PDF },
286 { 0x202D, 0x0000, LRO },
287 { 0x202E, 0x0000, RLO },
288 { 0x202F, 0x0000, NEUTRAL_WS },
289 { 0x2030, 0x2034, WEAK_ET },
290 { 0x2035, 0x2057, NEUTRAL_ON },
291 { 0x205F, 0x0000, NEUTRAL_WS },
292 { 0x2060, 0x206F, WEAK_BN },
293 { 0x2070, 0x0000, WEAK_EN },
294 { 0x2074, 0x2079, WEAK_EN },
295 { 0x207A, 0x207B, WEAK_ET },
296 { 0x207C, 0x207E, NEUTRAL_ON },
297 { 0x2080, 0x2089, WEAK_EN },
298 { 0x208A, 0x208B, WEAK_ET },
299 { 0x208C, 0x208E, NEUTRAL_ON },
300 { 0x20A0, 0x20B1, WEAK_ET },
301 { 0x20D0, 0x20EA, WEAK_NSM },
302 { 0x2100, 0x2101, NEUTRAL_ON },
303 { 0x2103, 0x2106, NEUTRAL_ON },
304 { 0x2108, 0x2109, NEUTRAL_ON },
305 { 0x2114, 0x0000, NEUTRAL_ON },
306 { 0x2116, 0x2118, NEUTRAL_ON },
307 { 0x211E, 0x2123, NEUTRAL_ON },
308 { 0x2125, 0x0000, NEUTRAL_ON },
309 { 0x2127, 0x0000, NEUTRAL_ON },
310 { 0x2129, 0x0000, NEUTRAL_ON },
311 { 0x212E, 0x0000, WEAK_ET },
312 { 0x2132, 0x0000, NEUTRAL_ON },
313 { 0x213A, 0x0000, NEUTRAL_ON },
314 { 0x2140, 0x2144, NEUTRAL_ON },
315 { 0x214A, 0x215F, NEUTRAL_ON },
316 { 0x2190, 0x2211, NEUTRAL_ON },
317 { 0x2212, 0x2213, WEAK_ET },
318 { 0x2214, 0x2335, NEUTRAL_ON },
319 { 0x237B, 0x2394, NEUTRAL_ON },
320 { 0x2396, 0x244A, NEUTRAL_ON },
321 { 0x2460, 0x249B, WEAK_EN },
322 { 0x24EA, 0x0000, WEAK_EN },
323 { 0x24EB, 0x2FFB, NEUTRAL_ON },
324 { 0x3000, 0x0000, NEUTRAL_WS },
325 { 0x3001, 0x3004, NEUTRAL_ON },
326 { 0x3008, 0x3020, NEUTRAL_ON },
327 { 0x302A, 0x302F, WEAK_NSM },
328 { 0x3030, 0x0000, NEUTRAL_ON },
329 { 0x3036, 0x3037, NEUTRAL_ON },
330 { 0x303D, 0x303F, NEUTRAL_ON },
331 { 0x3099, 0x309A, WEAK_NSM },
332 { 0x309B, 0x309C, NEUTRAL_ON },
333 { 0x30A0, 0x0000, NEUTRAL_ON },
334 { 0x30FB, 0x0000, NEUTRAL_ON },
335 { 0x3251, 0x325F, NEUTRAL_ON },
336 { 0x32B1, 0x32BF, NEUTRAL_ON },
337 { 0xA490, 0xA4C6, NEUTRAL_ON },
338 { 0xFB1D, 0x0000, STRONG_R },
339 { 0xFB1E, 0x0000, WEAK_NSM },
340 { 0xFB1F, 0xFB28, STRONG_R },
341 { 0xFB29, 0x0000, WEAK_ET },
342 { 0xFB2A, 0xFB4F, STRONG_R },
343 { 0xFB50, 0xFD3D, STRONG_AL },
344 { 0xFD3E, 0xFD3F, NEUTRAL_ON },
345 { 0xFD50, 0xFDFC, STRONG_AL },
346 { 0xFE00, 0xFE23, WEAK_NSM },
347 { 0xFE30, 0xFE4F, NEUTRAL_ON },
348 { 0xFE50, 0x0000, WEAK_CS },
349 { 0xFE51, 0x0000, NEUTRAL_ON },
350 { 0xFE52, 0x0000, WEAK_CS },
351 { 0xFE54, 0x0000, NEUTRAL_ON },
352 { 0xFE55, 0x0000, WEAK_CS },
353 { 0xFE56, 0xFE5E, NEUTRAL_ON },
354 { 0xFE5F, 0x0000, WEAK_ET },
355 { 0xFE60, 0xFE61, NEUTRAL_ON },
356 { 0xFE62, 0xFE63, WEAK_ET },
357 { 0xFE64, 0xFE68, NEUTRAL_ON },
358 { 0xFE69, 0xFE6A, WEAK_ET },
359 { 0xFE6B, 0x0000, NEUTRAL_ON },
360 { 0xFE70, 0xFEFC, STRONG_AL },
361 { 0xFEFF, 0x0000, WEAK_BN },
362 { 0xFF01, 0xFF02, NEUTRAL_ON },
363 { 0xFF03, 0xFF05, WEAK_ET },
364 { 0xFF06, 0xFF0A, NEUTRAL_ON },
365 { 0xFF0B, 0x0000, WEAK_ET },
366 { 0xFF0C, 0x0000, WEAK_CS },
367 { 0xFF0D, 0x0000, WEAK_ET },
368 { 0xFF0E, 0x0000, WEAK_CS },
369 { 0xFF0F, 0x0000, WEAK_ES },
370 { 0xFF10, 0xFF19, WEAK_EN },
371 { 0xFF1A, 0x0000, WEAK_CS },
372 { 0xFF1B, 0xFF20, NEUTRAL_ON },
373 { 0xFF3B, 0xFF40, NEUTRAL_ON },
374 { 0xFF5B, 0xFF65, NEUTRAL_ON },
375 { 0xFFE0, 0xFFE1, WEAK_ET },
376 { 0xFFE2, 0xFFE4, NEUTRAL_ON },
377 { 0xFFE5, 0xFFE6, WEAK_ET },
378 { 0xFFE8, 0xFFEE, NEUTRAL_ON },
379 { 0xFFF9, 0xFFFB, WEAK_BN },
380 { 0xFFFC, 0xFFFD, NEUTRAL_ON },
381 { 0x1D167, 0x1D169, WEAK_NSM },
382 { 0x1D173, 0x1D17A, WEAK_BN },
383 { 0x1D17B, 0x1D182, WEAK_NSM },
384 { 0x1D185, 0x1D18B, WEAK_NSM },
385 { 0x1D1AA, 0x1D1AD, WEAK_NSM },
386 { 0x1D7CE, 0x1D7FF, WEAK_EN },
387 { 0xE0001, 0xE007F, WEAK_BN } };
388 int i;
389
390 bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L));
2d6e4628 391 staticpro (&bidi_type_table);
b7b65b15
EZ
392
393 for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++)
f44e260c
EZ
394 char_table_set_range (bidi_type_table, bidi_type[i].from,
395 bidi_type[i].to ? bidi_type[i].to : bidi_type[i].from,
b7b65b15 396 make_number (bidi_type[i].type));
6bff6497
EZ
397
398 fallback_paragraph_start_re =
399 XSYMBOL (Fintern_soft (build_string ("paragraph-start"), Qnil))->value;
400 if (!STRINGP (fallback_paragraph_start_re))
401 fallback_paragraph_start_re = build_string ("\f\\|[ \t]*$");
402 staticpro (&fallback_paragraph_start_re);
403 Qparagraph_start = intern ("paragraph-start");
404 staticpro (&Qparagraph_start);
405 fallback_paragraph_separate_re =
406 XSYMBOL (Fintern_soft (build_string ("paragraph-separate"), Qnil))->value;
407 if (!STRINGP (fallback_paragraph_separate_re))
408 fallback_paragraph_separate_re = build_string ("[ \t\f]*$");
409 staticpro (&fallback_paragraph_separate_re);
410 Qparagraph_separate = intern ("paragraph-separate");
411 staticpro (&Qparagraph_separate);
b7b65b15
EZ
412 bidi_initialized = 1;
413}
414
6bff6497
EZ
415/* Return the bidi type of a character CH, subject to the current
416 directional OVERRIDE. */
b7b65b15 417bidi_type_t
6bff6497 418bidi_get_type (int ch, bidi_dir_t override)
b7b65b15 419{
6bff6497
EZ
420 bidi_type_t default_type;
421
e7402cb2
EZ
422 if (ch == BIDI_EOB)
423 return NEUTRAL_B;
e342a24d
EZ
424 if (ch < 0 || ch > MAX_CHAR)
425 abort ();
6bff6497
EZ
426
427 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
428
429 if (override == NEUTRAL_DIR)
430 return default_type;
431
432 switch (default_type)
433 {
434 /* Although UAX#9 does not tell, it doesn't make sense to
435 override NEUTRAL_B and LRM/RLM characters. */
436 case NEUTRAL_B:
437 case LRE:
438 case LRO:
439 case RLE:
440 case RLO:
441 case PDF:
442 return default_type;
443 default:
444 switch (ch)
445 {
446 case LRM_CHAR:
447 case RLM_CHAR:
448 return default_type;
449 default:
450 if (override == L2R) /* X6 */
451 return STRONG_L;
452 else if (override == R2L)
453 return STRONG_R;
454 else
455 abort (); /* can't happen: handled above */
456 }
457 }
b7b65b15
EZ
458}
459
2d6e4628
EZ
460void
461bidi_check_type (bidi_type_t type)
462{
463 if (type < UNKNOWN_BT || type > NEUTRAL_ON)
464 abort ();
465}
466
b7b65b15
EZ
467/* Given a bidi TYPE of a character, return its category. */
468bidi_category_t
469bidi_get_category (bidi_type_t type)
470{
471 switch (type)
472 {
473 case UNKNOWN_BT:
474 return UNKNOWN_BC;
475 case STRONG_L:
476 case STRONG_R:
477 case STRONG_AL:
478 case LRE:
479 case LRO:
480 case RLE:
481 case RLO:
482 return STRONG;
483 case PDF: /* ??? really?? */
484 case WEAK_EN:
485 case WEAK_ES:
486 case WEAK_ET:
487 case WEAK_AN:
488 case WEAK_CS:
489 case WEAK_NSM:
490 case WEAK_BN:
491 return WEAK;
492 case NEUTRAL_B:
493 case NEUTRAL_S:
494 case NEUTRAL_WS:
495 case NEUTRAL_ON:
496 return NEUTRAL;
497 default:
498 abort ();
499 }
500}
501
e342a24d
EZ
502/* Return the mirrored character of C, if any.
503
504 Note: The conditions in UAX#9 clause L4 must be tested by the
505 caller. */
b7b65b15
EZ
506/* FIXME: exceedingly temporary! Should consult the Unicode database
507 of character properties. */
508int
509bidi_mirror_char (int c)
510{
511 static const char mirrored_pairs[] = "()<>[]{}";
512 const char *p = strchr (mirrored_pairs, c);
513
514 if (p)
515 {
516 size_t i = p - mirrored_pairs;
517
518 if ((i & 1) == 0)
519 return mirrored_pairs[i + 1];
520 else
521 return mirrored_pairs[i - 1];
522 }
523 return c;
524}
525
526/* Copy the bidi iterator from FROM to TO. To save cycles, this only
527 copies the part of the level stack that is actually in use. */
528static inline void
529bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
530{
9c82e145 531 int save_first_elt = to->first_elt;
b7b65b15
EZ
532 int i;
533
534 /* Copy everything except the level stack. */
535 memcpy (to, from, ((int)&((struct bidi_it *)0)->level_stack[0]));
e5a2fec7 536 /* Don't copy FIRST_ELT flag. */
9c82e145
EZ
537 to->first_elt = save_first_elt;
538 if (to->first_elt != 0 && to->first_elt != 1)
539 to->first_elt = 0;
b7b65b15
EZ
540
541 /* Copy the active part of the level stack. */
542 to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
543 for (i = 1; i <= from->stack_idx; i++)
544 to->level_stack[i] = from->level_stack[i];
545}
546
547/* Caching the bidi iterator states. */
548
549static struct bidi_it bidi_cache[1000]; /* FIXME: make this dynamically allocated! */
550static int bidi_cache_idx;
551static int bidi_cache_last_idx;
552
553static inline void
554bidi_cache_reset (void)
555{
556 bidi_cache_idx = 0;
557 bidi_cache_last_idx = -1;
558}
559
560static inline void
561bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it)
562{
563 int current_scan_dir = bidi_it->scan_dir;
564
565 if (idx < 0 || idx >= bidi_cache_idx)
566 abort ();
567
568 bidi_copy_it (bidi_it, &bidi_cache[idx]);
569 bidi_it->scan_dir = current_scan_dir;
570 bidi_cache_last_idx = idx;
571}
572
573/* Find a cached state with a given CHARPOS and resolved embedding
574 level less or equal to LEVEL. if LEVEL is -1, disregard the
575 resolved levels in cached states. DIR, if non-zero, means search
576 in that direction from the last cache hit. */
577static inline int
578bidi_cache_search (int charpos, int level, int dir)
579{
580 int i, i_start;
581
582 if (bidi_cache_idx)
583 {
584 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
585 dir = -1;
586 else if (charpos > bidi_cache[bidi_cache_last_idx].charpos)
587 dir = 1;
588 if (dir)
589 i_start = bidi_cache_last_idx;
590 else
591 {
592 dir = -1;
593 i_start = bidi_cache_idx - 1;
594 }
595
596 if (dir < 0)
597 {
598 /* Linear search for now; FIXME! */
599 for (i = i_start; i >= 0; i--)
600 if (bidi_cache[i].charpos == charpos
601 && (level == -1 || bidi_cache[i].resolved_level <= level))
602 return i;
603 }
604 else
605 {
606 for (i = i_start; i < bidi_cache_idx; i++)
607 if (bidi_cache[i].charpos == charpos
608 && (level == -1 || bidi_cache[i].resolved_level <= level))
609 return i;
610 }
611 }
612
613 return -1;
614}
615
616/* Find a cached state where the resolved level changes to a value
617 that is lower than LEVEL, and return its cache slot index. DIR is
618 the direction to search, starting with the last used cache slot.
619 BEFORE, if non-zero, means return the index of the slot that is
620 ``before'' the level change in the search direction. That is,
621 given the cached levels like this:
622
623 1122333442211
624 AB C
625
626 and assuming we are at the position cached at the slot marked with
627 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
628 index of slot B or A, depending whether BEFORE is, respectively,
629 non-zero or zero. */
630static int
631bidi_cache_find_level_change (int level, int dir, int before)
632{
633 if (bidi_cache_idx)
634 {
635 int i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
636 int incr = before ? 1 : 0;
637
638 if (!dir)
639 dir = -1;
640 else if (!incr)
641 i += dir;
642
643 if (dir < 0)
644 {
645 while (i >= incr)
646 {
647 if (bidi_cache[i - incr].resolved_level >= 0
648 && bidi_cache[i - incr].resolved_level < level)
649 return i;
650 i--;
651 }
652 }
653 else
654 {
655 while (i < bidi_cache_idx - incr)
656 {
657 if (bidi_cache[i + incr].resolved_level >= 0
658 && bidi_cache[i + incr].resolved_level < level)
659 return i;
660 i++;
661 }
662 }
663 }
664
665 return -1;
666}
667
668static inline void
669bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
670{
671 int idx;
672
673 /* We should never cache on backward scans. */
674 if (bidi_it->scan_dir == -1)
675 abort ();
676 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
677
678 if (idx < 0)
679 {
680 idx = bidi_cache_idx;
e5a2fec7 681 /* Don't overrun the cache limit. */
b7b65b15
EZ
682 if (idx > sizeof (bidi_cache) / sizeof (bidi_cache[0]) - 1)
683 abort ();
e5a2fec7
EZ
684 /* Don't violate cache integrity: character positions should
685 correspond to cache positions 1:1. */
686 if (idx > 0 && bidi_it->charpos != bidi_cache[idx - 1].charpos + 1)
687 abort ();
b7b65b15
EZ
688 bidi_copy_it (&bidi_cache[idx], bidi_it);
689 if (!resolved)
690 bidi_cache[idx].resolved_level = -1;
e5a2fec7 691 bidi_cache[idx].new_paragraph = 0;
b7b65b15
EZ
692 }
693 else
694 {
695 /* Copy only the members which could have changed, to avoid
696 costly copying of the entire struct. */
697 bidi_cache[idx].type = bidi_it->type;
2d6e4628 698 bidi_check_type (bidi_it->type);
89d3374a
EZ
699 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
700 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
701 if (resolved)
702 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
703 else
704 bidi_cache[idx].resolved_level = -1;
705 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
706 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
707 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
708 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
709 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
710 }
711
712 bidi_cache_last_idx = idx;
713 if (idx >= bidi_cache_idx)
714 bidi_cache_idx = idx + 1;
715}
716
717static inline bidi_type_t
718bidi_cache_find (int charpos, int level, struct bidi_it *bidi_it)
719{
720 int i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
721
722 if (i >= 0)
723 {
724 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
725
726 *bidi_it = bidi_cache[i];
727 bidi_cache_last_idx = i;
728 /* Don't let scan direction from from the cached state override
729 the current scan direction. */
730 bidi_it->scan_dir = current_scan_dir;
731 return bidi_it->type;
732 }
733
734 return UNKNOWN_BT;
735}
736
737static inline int
738bidi_peek_at_next_level (struct bidi_it *bidi_it)
739{
740 if (bidi_cache_idx == 0 || bidi_cache_last_idx == -1)
741 abort ();
742 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
743}
744
be39f003
EZ
745/* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
746 Value is the non-negative length of the paragraph separator
747 following the buffer position, -1 if position is at the beginning
748 of a new paragraph, or -2 if position is neither at beginning nor
749 at end of a paragraph. */
750EMACS_INT
6bff6497 751bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
b7b65b15 752{
be39f003
EZ
753 Lisp_Object sep_re = Fbuffer_local_value (Qparagraph_separate,
754 Fcurrent_buffer ());
755 Lisp_Object start_re = Fbuffer_local_value (Qparagraph_start,
756 Fcurrent_buffer ());
757 EMACS_INT val;
758
759 if (!STRINGP (sep_re))
760 sep_re = fallback_paragraph_separate_re;
761 if (!STRINGP (start_re))
762 start_re = fallback_paragraph_start_re;
763
764 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
765 if (val < 0)
766 {
767 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
768 val = -1;
769 else
770 val = -2;
771 }
b7b65b15 772
be39f003 773 return val;
b7b65b15
EZ
774}
775
776/* Determine the start-of-run (sor) directional type given the two
777 embedding levels on either side of the run boundary. Also, update
778 the saved info about previously seen characters, since that info is
779 generally valid for a single level run. */
780static inline void
781bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
782{
783 int higher_level = level_before > level_after ? level_before : level_after;
784
785 /* The prev_was_pdf gork is required for when we have several PDFs
786 in a row. In that case, we want to compute the sor type for the
787 next level run only once: when we see the first PDF. That's
788 because the sor type depends only on the higher of the two levels
789 that we find on the two sides of the level boundary (see UAX#9,
790 clause X10), and so we don't need to know the final embedding
791 level to which we descend after processing all the PDFs. */
e342a24d 792 if (!bidi_it->prev_was_pdf || level_before < level_after)
b7b65b15
EZ
793 /* FIXME: should the default sor direction be user selectable? */
794 bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
795 if (level_before > level_after)
796 bidi_it->prev_was_pdf = 1;
797
798 bidi_it->prev.type = UNKNOWN_BT;
89d3374a
EZ
799 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
800 bidi_it->last_strong.orig_type = UNKNOWN_BT;
b7b65b15
EZ
801 bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
802 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
803 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
89d3374a
EZ
804 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
805 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15
EZ
806 bidi_it->ignore_bn_limit = 0; /* meaning it's unknown */
807}
808
6bff6497 809static void
be39f003
EZ
810bidi_line_init (struct bidi_it *bidi_it)
811{
812 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
813 bidi_it->resolved_level = bidi_it->level_stack[0].level;
814 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
815 bidi_it->invalid_levels = 0;
816 bidi_it->invalid_rl_levels = -1;
817 bidi_it->next_en_pos = -1;
818 bidi_it->next_for_ws.type = UNKNOWN_BT;
b44d9321
EZ
819 bidi_set_sor_type (bidi_it,
820 bidi_it->paragraph_dir == R2L ? 1 : 0,
be39f003
EZ
821 bidi_it->level_stack[0].level); /* X10 */
822
823 bidi_cache_reset ();
824}
825
826/* Find the beginning of this paragraph by looking back in the buffer.
827 Value is the byte position of the paragraph's beginning. */
828static EMACS_INT
b44d9321 829bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
6bff6497 830{
be39f003 831 Lisp_Object re = Fbuffer_local_value (Qparagraph_start, Fcurrent_buffer ());
6bff6497
EZ
832 EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
833
834 if (!STRINGP (re))
835 re = fallback_paragraph_start_re;
836 while (pos_byte > BEGV_BYTE
837 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
838 {
be39f003
EZ
839 pos = find_next_newline_no_quit (pos - 1, -1);
840 pos_byte = CHAR_TO_BYTE (pos);
6bff6497 841 }
be39f003 842 return pos_byte;
6bff6497
EZ
843}
844
be39f003 845/* Determine the direction, a.k.a. base embedding level, of the
b44d9321
EZ
846 paragraph we are about to iterate through. If DIR is either L2R or
847 R2L, just use that. Otherwise, determine the paragraph direction
848 from the first strong character of the paragraph.
849
850 Note that this gives the paragraph separator the same direction as
851 the preceding paragraph, even though Emacs generally views the
852 separartor as not belonging to any paragraph. */
b7b65b15
EZ
853void
854bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it)
855{
6bff6497 856 EMACS_INT bytepos = bidi_it->bytepos;
e7402cb2 857
5e65aec0
EZ
858 /* Special case for an empty buffer. */
859 if (bytepos == BEGV_BYTE && bytepos == ZV_BYTE)
860 dir = L2R;
9c82e145 861 /* We should never be called at EOB or before BEGV. */
5e65aec0 862 else if (bytepos >= ZV_BYTE || bytepos < BEGV_BYTE)
9c82e145
EZ
863 abort ();
864
be39f003
EZ
865 if (dir == L2R)
866 {
867 bidi_it->paragraph_dir = L2R;
868 bidi_it->new_paragraph = 0;
869 }
870 else if (dir == R2L)
871 {
872 bidi_it->paragraph_dir = R2L;
873 bidi_it->new_paragraph = 0;
874 }
b7b65b15
EZ
875 else if (dir == NEUTRAL_DIR) /* P2 */
876 {
6bff6497
EZ
877 int ch, ch_len;
878 EMACS_INT pos;
879 bidi_type_t type;
be39f003
EZ
880 EMACS_INT sep_len;
881
882 /* If we are inside a paragraph separator, we are just waiting
883 for the separator to be exhausted; use the previous paragraph
e5a2fec7
EZ
884 direction. But don't do that if we have been just reseated,
885 because we need to reinitialize below in that case. */
886 if (!bidi_it->first_elt
887 && bidi_it->charpos < bidi_it->separator_limit)
be39f003
EZ
888 return;
889
b44d9321 890 /* If we are on a newline, get past it to where the next
5e65aec0
EZ
891 paragraph might start. But don't do that at BEGV since then
892 we are potentially in a new paragraph that doesn't yet
893 exist. */
c143c213 894 pos = bidi_it->charpos;
5e65aec0 895 if (bytepos > BEGV_BYTE && FETCH_CHAR (bytepos) == '\n')
be39f003 896 {
b44d9321 897 bytepos++;
c143c213 898 pos++;
be39f003 899 }
b44d9321
EZ
900
901 /* We are either at the beginning of a paragraph or in the
902 middle of it. Find where this paragraph starts. */
903 bytepos = bidi_find_paragraph_start (pos, bytepos);
6bff6497
EZ
904
905 /* We should always be at the beginning of a new line at this
906 point. */
b44d9321 907 if (!(bytepos == BEGV_BYTE || FETCH_CHAR (bytepos - 1) == '\n'))
6bff6497
EZ
908 abort ();
909
be39f003
EZ
910 bidi_it->separator_limit = -1;
911 bidi_it->new_paragraph = 0;
6bff6497
EZ
912 ch = FETCH_CHAR (bytepos);
913 ch_len = CHAR_BYTES (ch);
be39f003 914 pos = BYTE_TO_CHAR (bytepos);
6bff6497 915 type = bidi_get_type (ch, NEUTRAL_DIR);
b7b65b15 916
e342a24d 917 for (pos++, bytepos += ch_len;
b7b65b15
EZ
918 /* NOTE: UAX#9 says to search only for L, AL, or R types of
919 characters, and ignore RLE, RLO, LRE, and LRO. However,
920 I'm not sure it makes sense to omit those 4; should try
921 with and without that to see the effect. */
922 (bidi_get_category (type) != STRONG)
923 || (bidi_ignore_explicit_marks_for_paragraph_level
924 && (type == RLE || type == RLO
925 || type == LRE || type == LRO));
6bff6497 926 type = bidi_get_type (ch, NEUTRAL_DIR))
b7b65b15 927 {
be39f003 928 if (type == NEUTRAL_B && bidi_at_paragraph_end (pos, bytepos) >= -1)
b7b65b15 929 break;
21fce5ab
EZ
930 if (bytepos >= ZV_BYTE)
931 {
932 /* Pretend there's a paragraph separator at end of buffer. */
933 type = NEUTRAL_B;
934 break;
935 }
b7b65b15
EZ
936 FETCH_CHAR_ADVANCE (ch, pos, bytepos);
937 }
938 if (type == STRONG_R || type == STRONG_AL) /* P3 */
be39f003
EZ
939 bidi_it->paragraph_dir = R2L;
940 else if (type == STRONG_L)
941 bidi_it->paragraph_dir = L2R;
b7b65b15 942 }
be39f003
EZ
943 else
944 abort ();
b7b65b15 945
b44d9321
EZ
946 /* Contrary to UAX#9 clause P3, we only default the paragraph
947 direction to L2R if we have no previous usable paragraph
948 direction. */
be39f003 949 if (bidi_it->paragraph_dir == NEUTRAL_DIR)
b44d9321 950 bidi_it->paragraph_dir = L2R; /* P3 and ``higher protocols'' */
be39f003 951 if (bidi_it->paragraph_dir == R2L)
b44d9321 952 bidi_it->level_stack[0].level = 1;
be39f003 953 else
b44d9321 954 bidi_it->level_stack[0].level = 0;
be39f003
EZ
955
956 bidi_line_init (bidi_it);
b7b65b15
EZ
957}
958
6bff6497
EZ
959/* Do whatever UAX#9 clause X8 says should be done at paragraph's
960 end. */
b7b65b15
EZ
961static inline void
962bidi_set_paragraph_end (struct bidi_it *bidi_it)
963{
964 bidi_it->invalid_levels = 0;
965 bidi_it->invalid_rl_levels = -1;
966 bidi_it->stack_idx = 0;
967 bidi_it->resolved_level = bidi_it->level_stack[0].level;
b7b65b15
EZ
968}
969
89d3374a 970/* Initialize the bidi iterator from buffer position CHARPOS. */
b7b65b15 971void
6bff6497 972bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, struct bidi_it *bidi_it)
b7b65b15
EZ
973{
974 if (! bidi_initialized)
975 bidi_initialize ();
89d3374a
EZ
976 bidi_it->charpos = charpos;
977 bidi_it->bytepos = bytepos;
6bff6497
EZ
978 bidi_it->first_elt = 1;
979 bidi_set_paragraph_end (bidi_it);
980 bidi_it->new_paragraph = 1;
be39f003 981 bidi_it->separator_limit = -1;
b44d9321 982 bidi_it->paragraph_dir = NEUTRAL_DIR;
b7b65b15 983 bidi_it->type = NEUTRAL_B;
89d3374a 984 bidi_it->type_after_w1 = UNKNOWN_BT;
b7b65b15 985 bidi_it->orig_type = UNKNOWN_BT;
b7b65b15 986 bidi_it->prev_was_pdf = 0;
89d3374a
EZ
987 bidi_it->prev.type = bidi_it->prev.type_after_w1 = UNKNOWN_BT;
988 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
989 bidi_it->last_strong.orig_type = UNKNOWN_BT;
b7b65b15
EZ
990 bidi_it->next_for_neutral.charpos = -1;
991 bidi_it->next_for_neutral.type =
89d3374a
EZ
992 bidi_it->next_for_neutral.type_after_w1 =
993 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15
EZ
994 bidi_it->prev_for_neutral.charpos = -1;
995 bidi_it->prev_for_neutral.type =
89d3374a
EZ
996 bidi_it->prev_for_neutral.type_after_w1 =
997 bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15 998 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
b7b65b15
EZ
999}
1000
1001/* Push the current embedding level and override status; reset the
1002 current level to LEVEL and the current override status to OVERRIDE. */
1003static inline void
1004bidi_push_embedding_level (struct bidi_it *bidi_it,
1005 int level, bidi_dir_t override)
1006{
1007 bidi_it->stack_idx++;
1008 if (bidi_it->stack_idx >= BIDI_MAXLEVEL)
1009 abort ();
1010 bidi_it->level_stack[bidi_it->stack_idx].level = level;
1011 bidi_it->level_stack[bidi_it->stack_idx].override = override;
1012}
1013
1014/* Pop the embedding level and directional override status from the
1015 stack, and return the new level. */
1016static inline int
1017bidi_pop_embedding_level (struct bidi_it *bidi_it)
1018{
1019 /* UAX#9 says to ignore invalid PDFs. */
1020 if (bidi_it->stack_idx > 0)
1021 bidi_it->stack_idx--;
1022 return bidi_it->level_stack[bidi_it->stack_idx].level;
1023}
1024
1025/* Record in SAVED_INFO the information about the current character. */
1026static inline void
1027bidi_remember_char (struct bidi_saved_info *saved_info,
1028 struct bidi_it *bidi_it)
1029{
1030 saved_info->charpos = bidi_it->charpos;
1031 saved_info->bytepos = bidi_it->bytepos;
1032 saved_info->type = bidi_it->type;
2d6e4628 1033 bidi_check_type (bidi_it->type);
89d3374a
EZ
1034 saved_info->type_after_w1 = bidi_it->type_after_w1;
1035 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1036 saved_info->orig_type = bidi_it->orig_type;
2d6e4628 1037 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
1038}
1039
1040/* Resolve the type of a neutral character according to the type of
1041 surrounding strong text and the current embedding level. */
1042static inline bidi_type_t
1043bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
1044{
1045 /* N1: European and Arabic numbers are treated as though they were R. */
1046 if (next_type == WEAK_EN || next_type == WEAK_AN)
1047 next_type = STRONG_R;
1048 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
1049 prev_type = STRONG_R;
1050
1051 if (next_type == prev_type) /* N1 */
1052 return next_type;
1053 else if ((lev & 1) == 0) /* N2 */
1054 return STRONG_L;
1055 else
1056 return STRONG_R;
1057}
1058
1059static inline int
1060bidi_explicit_dir_char (int c)
1061{
1062 /* FIXME: this should be replaced with a lookup table with suitable
1063 bits set, like standard C ctype macros do. */
1064 return (c == LRE_CHAR || c == LRO_CHAR
1065 || c == RLE_CHAR || c == RLO_CHAR || c == PDF_CHAR);
1066}
1067
1068/* A helper function for bidi_resolve_explicit. It advances to the
1069 next character in logical order and determines the new embedding
1070 level and directional override, but does not take into account
1071 empty embeddings. */
1072static int
1073bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1074{
1075 int curchar;
1076 bidi_type_t type;
1077 int current_level;
1078 int new_level;
1079 bidi_dir_t override;
1080
9c82e145
EZ
1081 if (bidi_it->bytepos < BEGV_BYTE /* after reseat to BEGV? */
1082 || bidi_it->first_elt)
e7402cb2 1083 {
9c82e145
EZ
1084 bidi_it->first_elt = 0;
1085 if (bidi_it->charpos < BEGV)
1086 bidi_it->charpos = BEGV;
1087 bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
e7402cb2 1088 }
9c82e145 1089 else if (bidi_it->bytepos < ZV_BYTE) /* don't move at ZV */
b7b65b15
EZ
1090 {
1091 bidi_it->charpos++;
e7402cb2
EZ
1092 if (bidi_it->ch_len == 0)
1093 abort ();
b7b65b15
EZ
1094 bidi_it->bytepos += bidi_it->ch_len;
1095 }
1096
1097 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1098 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1099 new_level = current_level;
1100
1101 /* in case it is a unibyte character (not yet implemented) */
1102 /* _fetch_multibyte_char_len = 1; */
e7402cb2
EZ
1103 if (bidi_it->bytepos >= ZV_BYTE)
1104 {
1105 curchar = BIDI_EOB;
1106 bidi_it->ch_len = 1;
1107 }
1108 else
1109 {
1110 curchar = FETCH_CHAR (bidi_it->bytepos);
1111 bidi_it->ch_len = CHAR_BYTES (curchar);
1112 }
b7b65b15 1113 bidi_it->ch = curchar;
b7b65b15 1114
6bff6497
EZ
1115 /* Don't apply directional override here, as all the types we handle
1116 below will not be affected by the override anyway, and we need
1117 the original type unaltered. The override will be applied in
1118 bidi_resolve_weak. */
1119 type = bidi_get_type (curchar, NEUTRAL_DIR);
89d3374a
EZ
1120 bidi_it->orig_type = type;
1121 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
1122
1123 if (type != PDF)
1124 bidi_it->prev_was_pdf = 0;
1125
89d3374a 1126 bidi_it->type_after_w1 = UNKNOWN_BT;
b7b65b15
EZ
1127
1128 switch (type)
1129 {
1130 case RLE: /* X2 */
1131 case RLO: /* X4 */
89d3374a
EZ
1132 bidi_it->type_after_w1 = type;
1133 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1134 type = WEAK_BN; /* X9/Retaining */
1135 if (bidi_it->ignore_bn_limit <= 0)
1136 {
1137 if (current_level <= BIDI_MAXLEVEL - 4)
1138 {
1139 /* Compute the least odd embedding level greater than
1140 the current level. */
1141 new_level = ((current_level + 1) & ~1) + 1;
89d3374a 1142 if (bidi_it->type_after_w1 == RLE)
b7b65b15
EZ
1143 override = NEUTRAL_DIR;
1144 else
1145 override = R2L;
1146 if (current_level == BIDI_MAXLEVEL - 4)
1147 bidi_it->invalid_rl_levels = 0;
1148 bidi_push_embedding_level (bidi_it, new_level, override);
1149 }
1150 else
1151 {
1152 bidi_it->invalid_levels++;
1153 /* See the commentary about invalid_rl_levels below. */
1154 if (bidi_it->invalid_rl_levels < 0)
1155 bidi_it->invalid_rl_levels = 0;
1156 bidi_it->invalid_rl_levels++;
1157 }
1158 }
89d3374a 1159 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1160 || bidi_it->next_en_pos > bidi_it->charpos)
1161 type = WEAK_EN;
1162 break;
1163 case LRE: /* X3 */
1164 case LRO: /* X5 */
89d3374a
EZ
1165 bidi_it->type_after_w1 = type;
1166 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1167 type = WEAK_BN; /* X9/Retaining */
1168 if (bidi_it->ignore_bn_limit <= 0)
1169 {
1170 if (current_level <= BIDI_MAXLEVEL - 5)
1171 {
1172 /* Compute the least even embedding level greater than
1173 the current level. */
1174 new_level = ((current_level + 2) & ~1);
89d3374a 1175 if (bidi_it->type_after_w1 == LRE)
b7b65b15
EZ
1176 override = NEUTRAL_DIR;
1177 else
1178 override = L2R;
1179 bidi_push_embedding_level (bidi_it, new_level, override);
1180 }
1181 else
1182 {
1183 bidi_it->invalid_levels++;
1184 /* invalid_rl_levels counts invalid levels encountered
1185 while the embedding level was already too high for
1186 LRE/LRO, but not for RLE/RLO. That is because
1187 there may be exactly one PDF which we should not
1188 ignore even though invalid_levels is non-zero.
1189 invalid_rl_levels helps to know what PDF is
1190 that. */
1191 if (bidi_it->invalid_rl_levels >= 0)
1192 bidi_it->invalid_rl_levels++;
1193 }
1194 }
89d3374a 1195 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1196 || bidi_it->next_en_pos > bidi_it->charpos)
1197 type = WEAK_EN;
1198 break;
1199 case PDF: /* X7 */
89d3374a
EZ
1200 bidi_it->type_after_w1 = type;
1201 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1202 type = WEAK_BN; /* X9/Retaining */
1203 if (bidi_it->ignore_bn_limit <= 0)
1204 {
1205 if (!bidi_it->invalid_rl_levels)
1206 {
1207 new_level = bidi_pop_embedding_level (bidi_it);
1208 bidi_it->invalid_rl_levels = -1;
1209 if (bidi_it->invalid_levels)
1210 bidi_it->invalid_levels--;
1211 /* else nothing: UAX#9 says to ignore invalid PDFs */
1212 }
1213 if (!bidi_it->invalid_levels)
1214 new_level = bidi_pop_embedding_level (bidi_it);
1215 else
1216 {
1217 bidi_it->invalid_levels--;
1218 bidi_it->invalid_rl_levels--;
1219 }
1220 }
89d3374a 1221 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1222 || bidi_it->next_en_pos > bidi_it->charpos)
1223 type = WEAK_EN;
1224 break;
1225 default:
1226 /* Nothing. */
1227 break;
1228 }
1229
1230 bidi_it->type = type;
2d6e4628 1231 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1232
1233 return new_level;
1234}
1235
1236/* Given an iterator state in BIDI_IT, advance one character position
1237 in the buffer to the next character (in the logical order), resolve
1238 any explicit embeddings and directional overrides, and return the
1239 embedding level of the character after resolving explicit
1240 directives and ignoring empty embeddings. */
1241static int
1242bidi_resolve_explicit (struct bidi_it *bidi_it)
1243{
1244 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1245 int new_level = bidi_resolve_explicit_1 (bidi_it);
1246
1247 if (prev_level < new_level
1248 && bidi_it->type == WEAK_BN
1249 && bidi_it->ignore_bn_limit == 0 /* only if not already known */
e7402cb2 1250 && bidi_it->ch != BIDI_EOB /* not already at EOB */
b7b65b15
EZ
1251 && bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
1252 + bidi_it->ch_len)))
1253 {
1254 /* Avoid pushing and popping embedding levels if the level run
1255 is empty, as this breaks level runs where it shouldn't.
1256 UAX#9 removes all the explicit embedding and override codes,
1257 so empty embeddings disappear without a trace. We need to
1258 behave as if we did the same. */
1259 struct bidi_it saved_it;
1260 int level = prev_level;
1261
1262 bidi_copy_it (&saved_it, bidi_it);
1263
1264 while (bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
1265 + bidi_it->ch_len)))
1266 {
1267 level = bidi_resolve_explicit_1 (bidi_it);
1268 }
1269
1270 if (level == prev_level) /* empty embedding */
1271 saved_it.ignore_bn_limit = bidi_it->charpos + 1;
1272 else /* this embedding is non-empty */
1273 saved_it.ignore_bn_limit = -1;
1274
1275 bidi_copy_it (bidi_it, &saved_it);
1276 if (bidi_it->ignore_bn_limit > 0)
1277 {
1278 /* We pushed a level, but we shouldn't have. Undo that. */
1279 if (!bidi_it->invalid_rl_levels)
1280 {
1281 new_level = bidi_pop_embedding_level (bidi_it);
1282 bidi_it->invalid_rl_levels = -1;
1283 if (bidi_it->invalid_levels)
1284 bidi_it->invalid_levels--;
1285 }
1286 if (!bidi_it->invalid_levels)
1287 new_level = bidi_pop_embedding_level (bidi_it);
1288 else
1289 {
1290 bidi_it->invalid_levels--;
1291 bidi_it->invalid_rl_levels--;
1292 }
1293 }
1294 }
1295
b7b65b15
EZ
1296 if (bidi_it->type == NEUTRAL_B) /* X8 */
1297 {
21fce5ab 1298 bidi_set_paragraph_end (bidi_it);
6bff6497
EZ
1299 /* This is needed by bidi_resolve_weak below, and in L1. */
1300 bidi_it->type_after_w1 = bidi_it->type;
89d3374a 1301 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1302 }
1303
1304 return new_level;
1305}
1306
1307/* Advance in the buffer, resolve weak types and return the type of
1308 the next character after weak type resolution. */
1309bidi_type_t
1310bidi_resolve_weak (struct bidi_it *bidi_it)
1311{
1312 bidi_type_t type;
1313 bidi_dir_t override;
1314 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1315 int new_level = bidi_resolve_explicit (bidi_it);
1316 int next_char;
1317 bidi_type_t type_of_next;
1318 struct bidi_it saved_it;
1319
1320 type = bidi_it->type;
1321 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1322
1323 if (type == UNKNOWN_BT
1324 || type == LRE
1325 || type == LRO
1326 || type == RLE
1327 || type == RLO
1328 || type == PDF)
1329 abort ();
1330
1331 if (new_level != prev_level
1332 || bidi_it->type == NEUTRAL_B)
1333 {
1334 /* We've got a new embedding level run, compute the directional
1335 type of sor and initialize per-run variables (UAX#9, clause
1336 X10). */
1337 bidi_set_sor_type (bidi_it, prev_level, new_level);
1338 }
1339 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1340 || type == WEAK_BN || type == STRONG_AL)
89d3374a
EZ
1341 bidi_it->type_after_w1 = type; /* needed in L1 */
1342 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1343
1344 /* Level and directional override status are already recorded in
1345 bidi_it, and do not need any change; see X6. */
1346 if (override == R2L) /* X6 */
1347 type = STRONG_R;
1348 else if (override == L2R)
1349 type = STRONG_L;
1350 else if (type == STRONG_AL)
1351 type = STRONG_R; /* W3 */
1352 else if (type == WEAK_NSM) /* W1 */
1353 {
1354 /* Note that we don't need to consider the case where the prev
1355 character has its type overridden by an RLO or LRO: such
1356 characters are outside the current level run, and thus not
89d3374a
EZ
1357 relevant to this NSM. Thus, NSM gets the orig_type of the
1358 previous character. */
b7b65b15 1359 if (bidi_it->prev.type != UNKNOWN_BT)
89d3374a 1360 type = bidi_it->prev.orig_type;
b7b65b15
EZ
1361 else if (bidi_it->sor == R2L)
1362 type = STRONG_R;
1363 else if (bidi_it->sor == L2R)
1364 type = STRONG_L;
1365 else /* shouldn't happen! */
1366 abort ();
1367 if (type == WEAK_EN /* W2 after W1 */
89d3374a 1368 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
b7b65b15
EZ
1369 type = WEAK_AN;
1370 }
1371 else if (type == WEAK_EN /* W2 */
89d3374a 1372 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
b7b65b15
EZ
1373 type = WEAK_AN;
1374 else if ((type == WEAK_ES
89d3374a
EZ
1375 && (bidi_it->prev.type_after_w1 == WEAK_EN /* W4 */
1376 && (bidi_it->prev.orig_type == WEAK_EN
1377 || bidi_it->prev.orig_type == WEAK_NSM))) /* aft W1 */
b7b65b15 1378 || (type == WEAK_CS
89d3374a
EZ
1379 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1380 && (bidi_it->prev.orig_type == WEAK_EN /* W4 */
1381 || bidi_it->prev.orig_type == WEAK_NSM)) /* a/W1 */
1382 || bidi_it->prev.type_after_w1 == WEAK_AN))) /* W4 */
b7b65b15 1383 {
e7402cb2
EZ
1384 next_char =
1385 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1386 ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
6bff6497 1387 type_of_next = bidi_get_type (next_char, override);
b7b65b15
EZ
1388
1389 if (type_of_next == WEAK_BN
1390 || bidi_explicit_dir_char (next_char))
1391 {
1392 bidi_copy_it (&saved_it, bidi_it);
1393 while (bidi_resolve_explicit (bidi_it) == new_level
1394 && bidi_it->type == WEAK_BN)
1395 ;
1396 type_of_next = bidi_it->type;
1397 bidi_copy_it (bidi_it, &saved_it);
1398 }
1399
1400 /* If the next character is EN, but the last strong-type
1401 character is AL, that next EN will be changed to AN when we
1402 process it in W2 above. So in that case, this ES should not
1403 be changed into EN. */
1404 if (type == WEAK_ES
1405 && type_of_next == WEAK_EN
89d3374a 1406 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
b7b65b15
EZ
1407 type = WEAK_EN;
1408 else if (type == WEAK_CS)
1409 {
89d3374a 1410 if (bidi_it->prev.type_after_w1 == WEAK_AN
b7b65b15
EZ
1411 && (type_of_next == WEAK_AN
1412 /* If the next character is EN, but the last
1413 strong-type character is AL, EN will be later
1414 changed to AN when we process it in W2 above. So
1415 in that case, this ES should not be changed into
1416 EN. */
1417 || (type_of_next == WEAK_EN
89d3374a 1418 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
b7b65b15 1419 type = WEAK_AN;
89d3374a 1420 else if (bidi_it->prev.type_after_w1 == WEAK_EN
b7b65b15 1421 && type_of_next == WEAK_EN
89d3374a 1422 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
b7b65b15
EZ
1423 type = WEAK_EN;
1424 }
1425 }
1426 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1427 || type == WEAK_BN) /* W5/Retaining */
1428 {
89d3374a 1429 if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN with EN before it */
b7b65b15
EZ
1430 || bidi_it->next_en_pos > bidi_it->charpos)
1431 type = WEAK_EN;
1432 /* W5: ET with EN after it. */
1433 else
1434 {
6bff6497 1435 EMACS_INT en_pos = bidi_it->charpos + 1;
b7b65b15 1436
e7402cb2
EZ
1437 next_char =
1438 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1439 ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
6bff6497 1440 type_of_next = bidi_get_type (next_char, override);
b7b65b15
EZ
1441
1442 if (type_of_next == WEAK_ET
1443 || type_of_next == WEAK_BN
1444 || bidi_explicit_dir_char (next_char))
1445 {
1446 bidi_copy_it (&saved_it, bidi_it);
1447 while (bidi_resolve_explicit (bidi_it) == new_level
1448 && (bidi_it->type == WEAK_BN || bidi_it->type == WEAK_ET))
1449 ;
1450 type_of_next = bidi_it->type;
1451 en_pos = bidi_it->charpos;
1452 bidi_copy_it (bidi_it, &saved_it);
1453 }
1454 if (type_of_next == WEAK_EN)
1455 {
1456 /* If the last strong character is AL, the EN we've
1457 found will become AN when we get to it (W2). */
89d3374a 1458 if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
b7b65b15
EZ
1459 {
1460 type = WEAK_EN;
1461 /* Remember this EN position, to speed up processing
1462 of the next ETs. */
1463 bidi_it->next_en_pos = en_pos;
1464 }
1465 else if (type == WEAK_BN)
1466 type = NEUTRAL_ON; /* W6/Retaining */
1467 }
1468 }
1469 }
1470
1471 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
89d3374a
EZ
1472 || (type == WEAK_BN
1473 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1474 || bidi_it->prev.type_after_w1 == WEAK_ES
1475 || bidi_it->prev.type_after_w1 == WEAK_ET)))
b7b65b15
EZ
1476 type = NEUTRAL_ON;
1477
1478 /* Store the type we've got so far, before we clobber it with strong
1479 types in W7 and while resolving neutral types. But leave alone
1480 the original types that were recorded above, because we will need
1481 them for the L1 clause. */
89d3374a
EZ
1482 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1483 bidi_it->type_after_w1 = type;
1484 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1485
1486 if (type == WEAK_EN) /* W7 */
1487 {
89d3374a 1488 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
b7b65b15
EZ
1489 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1490 type = STRONG_L;
1491 }
1492
1493 bidi_it->type = type;
2d6e4628 1494 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1495 return type;
1496}
1497
1498bidi_type_t
1499bidi_resolve_neutral (struct bidi_it *bidi_it)
1500{
1501 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1502 bidi_type_t type = bidi_resolve_weak (bidi_it);
1503 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1504
1505 if (!(type == STRONG_R
1506 || type == STRONG_L
1507 || type == WEAK_BN
1508 || type == WEAK_EN
1509 || type == WEAK_AN
1510 || type == NEUTRAL_B
1511 || type == NEUTRAL_S
1512 || type == NEUTRAL_WS
1513 || type == NEUTRAL_ON))
1514 abort ();
1515
1516 if (bidi_get_category (type) == NEUTRAL
1517 || (type == WEAK_BN && prev_level == current_level))
1518 {
1519 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1520 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1521 bidi_it->next_for_neutral.type,
1522 current_level);
1523 else
1524 {
1525 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1526 the assumption of batch-style processing; see clauses W4,
1527 W5, and especially N1, which require to look far forward
1528 (as well as back) in the buffer. May the fleas of a
1529 thousand camels infest the armpits of those who design
1530 supposedly general-purpose algorithms by looking at their
1531 own implementations, and fail to consider other possible
1532 implementations! */
1533 struct bidi_it saved_it;
1534 bidi_type_t next_type;
1535
1536 if (bidi_it->scan_dir == -1)
1537 abort ();
1538
1539 bidi_copy_it (&saved_it, bidi_it);
1540 /* Scan the text forward until we find the first non-neutral
1541 character, and then use that to resolve the neutral we
1542 are dealing with now. We also cache the scanned iterator
1543 states, to salvage some of the effort later. */
1544 bidi_cache_iterator_state (bidi_it, 0);
1545 do {
1546 /* Record the info about the previous character, so that
1547 it will be cached below with this state. */
89d3374a 1548 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1549 && bidi_it->type != WEAK_BN)
1550 bidi_remember_char (&bidi_it->prev, bidi_it);
1551 type = bidi_resolve_weak (bidi_it);
1552 /* Paragraph separators have their levels fully resolved
1553 at this point, so cache them as resolved. */
1554 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1555 /* FIXME: implement L1 here, by testing for a newline and
1556 resetting the level for any sequence of whitespace
1557 characters adjacent to it. */
1558 } while (!(type == NEUTRAL_B
1559 || (type != WEAK_BN
1560 && bidi_get_category (type) != NEUTRAL)
1561 /* This is all per level run, so stop when we
1562 reach the end of this level run. */
1563 || bidi_it->level_stack[bidi_it->stack_idx].level !=
1564 current_level));
1565
1566 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1567
1568 switch (type)
1569 {
1570 case STRONG_L:
1571 case STRONG_R:
1572 case STRONG_AL:
1573 next_type = type;
1574 break;
1575 case WEAK_EN:
1576 case WEAK_AN:
1577 /* N1: ``European and Arabic numbers are treated as
1578 though they were R.'' */
1579 next_type = STRONG_R;
1580 saved_it.next_for_neutral.type = STRONG_R;
1581 break;
1582 case WEAK_BN:
1583 if (!bidi_explicit_dir_char (bidi_it->ch))
1584 abort (); /* can't happen: BNs are skipped */
1585 /* FALLTHROUGH */
1586 case NEUTRAL_B:
1587 /* Marched all the way to the end of this level run.
1588 We need to use the eor type, whose information is
1589 stored by bidi_set_sor_type in the prev_for_neutral
1590 member. */
1591 if (saved_it.type != WEAK_BN
89d3374a 1592 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
b7b65b15
EZ
1593 {
1594 next_type = bidi_it->prev_for_neutral.type;
1595 saved_it.next_for_neutral.type = next_type;
2d6e4628 1596 bidi_check_type (next_type);
b7b65b15
EZ
1597 }
1598 else
1599 {
1600 /* This is a BN which does not adjoin neutrals.
1601 Leave its type alone. */
1602 bidi_copy_it (bidi_it, &saved_it);
1603 return bidi_it->type;
1604 }
1605 break;
1606 default:
1607 abort ();
1608 }
1609 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1610 next_type, current_level);
1611 saved_it.type = type;
2d6e4628 1612 bidi_check_type (type);
b7b65b15
EZ
1613 bidi_copy_it (bidi_it, &saved_it);
1614 }
1615 }
1616 return type;
1617}
1618
1619/* Given an iterator state in BIDI_IT, advance one character position
1620 in the buffer to the next character (in the logical order), resolve
1621 the bidi type of that next character, and return that type. */
1622bidi_type_t
1623bidi_type_of_next_char (struct bidi_it *bidi_it)
1624{
1625 bidi_type_t type;
1626
1627 /* This should always be called during a forward scan. */
1628 if (bidi_it->scan_dir != 1)
1629 abort ();
1630
1631 /* Reset the limit until which to ignore BNs if we step out of the
1632 area where we found only empty levels. */
1633 if ((bidi_it->ignore_bn_limit > 0
1634 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
1635 || (bidi_it->ignore_bn_limit == -1
1636 && !bidi_explicit_dir_char (bidi_it->ch)))
1637 bidi_it->ignore_bn_limit = 0;
1638
1639 type = bidi_resolve_neutral (bidi_it);
1640
1641 return type;
1642}
1643
1644/* Given an iterator state BIDI_IT, advance one character position in
1645 the buffer to the next character (in the logical order), resolve
1646 the embedding and implicit levels of that next character, and
1647 return the resulting level. */
1648int
1649bidi_level_of_next_char (struct bidi_it *bidi_it)
1650{
1651 bidi_type_t type;
1652 int level, prev_level = -1;
1653 struct bidi_saved_info next_for_neutral;
1654
1655 if (bidi_it->scan_dir == 1)
1656 {
1657 /* There's no sense in trying to advance if we hit end of text. */
1658 if (bidi_it->ch == BIDI_EOB)
1659 return bidi_it->resolved_level;
1660
1661 /* Record the info about the previous character. */
89d3374a 1662 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1663 && bidi_it->type != WEAK_BN)
1664 bidi_remember_char (&bidi_it->prev, bidi_it);
89d3374a
EZ
1665 if (bidi_it->type_after_w1 == STRONG_R
1666 || bidi_it->type_after_w1 == STRONG_L
1667 || bidi_it->type_after_w1 == STRONG_AL)
b7b65b15
EZ
1668 bidi_remember_char (&bidi_it->last_strong, bidi_it);
1669 /* FIXME: it sounds like we don't need both prev and
1670 prev_for_neutral members, but I'm leaving them both for now. */
1671 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1672 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1673 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1674
1675 /* If we overstepped the characters used for resolving neutrals
1676 and whitespace, invalidate their info in the iterator. */
1677 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1678 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1679 if (bidi_it->next_en_pos >= 0
1680 && bidi_it->charpos >= bidi_it->next_en_pos)
1681 bidi_it->next_en_pos = -1;
1682 if (bidi_it->next_for_ws.type != UNKNOWN_BT
1683 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1684 bidi_it->next_for_ws.type = UNKNOWN_BT;
1685
1686 /* This must be taken before we fill the iterator with the info
1687 about the next char. If we scan backwards, the iterator
1688 state must be already cached, so there's no need to know the
1689 embedding level of the previous character, since we will be
1690 returning to our caller shortly. */
1691 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1692 }
1693 next_for_neutral = bidi_it->next_for_neutral;
1694
1695 /* Perhaps it is already cached. */
1696 type = bidi_cache_find (bidi_it->charpos + bidi_it->scan_dir, -1, bidi_it);
1697 if (type != UNKNOWN_BT)
1698 {
1699 /* Don't lose the information for resolving neutrals! The
1700 cached states could have been cached before their
1701 next_for_neutral member was computed. If we are on our way
1702 forward, we can simply take the info from the previous
1703 state. */
1704 if (bidi_it->scan_dir == 1
1705 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
1706 bidi_it->next_for_neutral = next_for_neutral;
1707
1708 /* If resolved_level is -1, it means this state was cached
1709 before it was completely resolved, so we cannot return
1710 it. */
1711 if (bidi_it->resolved_level != -1)
1712 return bidi_it->resolved_level;
1713 }
1714 if (bidi_it->scan_dir == -1)
1715 /* If we are going backwards, the iterator state is already cached
1716 from previous scans, and should be fully resolved. */
1717 abort ();
1718
1719 if (type == UNKNOWN_BT)
1720 type = bidi_type_of_next_char (bidi_it);
1721
1722 if (type == NEUTRAL_B)
1723 return bidi_it->resolved_level;
1724
1725 level = bidi_it->level_stack[bidi_it->stack_idx].level;
1726 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
1727 || (type == WEAK_BN && prev_level == level))
1728 {
1729 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1730 abort ();
1731
1732 /* If the cached state shows a neutral character, it was not
1733 resolved by bidi_resolve_neutral, so do it now. */
1734 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1735 bidi_it->next_for_neutral.type,
1736 level);
1737 }
1738
1739 if (!(type == STRONG_R
1740 || type == STRONG_L
1741 || type == WEAK_BN
1742 || type == WEAK_EN
1743 || type == WEAK_AN))
1744 abort ();
1745 bidi_it->type = type;
2d6e4628 1746 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1747
1748 /* For L1 below, we need to know, for each WS character, whether
1749 it belongs to a sequence of WS characters preceeding a newline
1750 or a TAB or a paragraph separator. */
89d3374a 1751 if (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1752 && bidi_it->next_for_ws.type == UNKNOWN_BT)
1753 {
1754 int ch;
1755 int clen = bidi_it->ch_len;
6bff6497
EZ
1756 EMACS_INT bpos = bidi_it->bytepos;
1757 EMACS_INT cpos = bidi_it->charpos;
b7b65b15
EZ
1758 bidi_type_t chtype;
1759
1760 do {
1761 /*_fetch_multibyte_char_len = 1;*/
e7402cb2 1762 ch = bpos + clen >= ZV_BYTE ? BIDI_EOB : FETCH_CHAR (bpos + clen);
b7b65b15
EZ
1763 bpos += clen;
1764 cpos++;
e7402cb2
EZ
1765 clen = (ch == BIDI_EOB ? 1 : CHAR_BYTES (ch));
1766 if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
b7b65b15
EZ
1767 chtype = NEUTRAL_B;
1768 else
6bff6497 1769 chtype = bidi_get_type (ch, NEUTRAL_DIR);
b7b65b15
EZ
1770 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
1771 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
1772 bidi_it->next_for_ws.type = chtype;
2d6e4628 1773 bidi_check_type (bidi_it->next_for_ws.type);
b7b65b15
EZ
1774 bidi_it->next_for_ws.charpos = cpos;
1775 bidi_it->next_for_ws.bytepos = bpos;
1776 }
1777
1778 /* Resolve implicit levels, with a twist: PDFs get the embedding
1779 level of the enbedding they terminate. See below for the
1780 reason. */
89d3374a 1781 if (bidi_it->orig_type == PDF
b7b65b15
EZ
1782 /* Don't do this if this formatting code didn't change the
1783 embedding level due to invalid or empty embeddings. */
1784 && prev_level != level)
1785 {
1786 /* Don't look in UAX#9 for the reason for this: it's our own
1787 private quirk. The reason is that we want the formatting
1788 codes to be delivered so that they bracket the text of their
1789 embedding. For example, given the text
1790
1791 {RLO}teST{PDF}
1792
1793 we want it to be displayed as
1794
1795 {RLO}STet{PDF}
1796
1797 not as
1798
1799 STet{RLO}{PDF}
1800
1801 which will result because we bump up the embedding level as
1802 soon as we see the RLO and pop it as soon as we see the PDF,
1803 so RLO itself has the same embedding level as "teST", and
1804 thus would be normally delivered last, just before the PDF.
1805 The switch below fiddles with the level of PDF so that this
1806 ugly side effect does not happen.
1807
1808 (This is, of course, only important if the formatting codes
e7402cb2
EZ
1809 are actually displayed, but Emacs does need to display them
1810 if the user wants to.) */
b7b65b15
EZ
1811 level = prev_level;
1812 }
89d3374a
EZ
1813 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
1814 || bidi_it->orig_type == NEUTRAL_S
e7402cb2
EZ
1815 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
1816 /* || bidi_it->ch == LINESEP_CHAR */
89d3374a 1817 || (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1818 && (bidi_it->next_for_ws.type == NEUTRAL_B
1819 || bidi_it->next_for_ws.type == NEUTRAL_S)))
1820 level = bidi_it->level_stack[0].level;
1821 else if ((level & 1) == 0) /* I1 */
1822 {
1823 if (type == STRONG_R)
1824 level++;
1825 else if (type == WEAK_EN || type == WEAK_AN)
1826 level += 2;
1827 }
1828 else /* I2 */
1829 {
1830 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
1831 level++;
1832 }
1833
1834 bidi_it->resolved_level = level;
1835 return level;
1836}
1837
1838/* Move to the other edge of a level given by LEVEL. If END_FLAG is
1839 non-zero, we are at the end of a level, and we need to prepare to
1840 resume the scan of the lower level.
1841
1842 If this level's other edge is cached, we simply jump to it, filling
1843 the iterator structure with the iterator state on the other edge.
1844 Otherwise, we walk the buffer until we come back to the same level
1845 as LEVEL.
1846
1847 Note: we are not talking here about a ``level run'' in the UAX#9
1848 sense of the term, but rather about a ``level'' which includes
1849 all the levels higher than it. In other words, given the levels
1850 like this:
1851
1852 11111112222222333333334443343222222111111112223322111
1853 A B C
1854
1855 and assuming we are at point A scanning left to right, this
1856 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
1857 at point B. */
1858static void
1859bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
1860{
1861 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
1862 int idx;
1863
1864 /* Try the cache first. */
1865 if ((idx = bidi_cache_find_level_change (level, dir, end_flag)) >= 0)
1866 bidi_cache_fetch_state (idx, bidi_it);
1867 else
1868 {
1869 int new_level;
1870
1871 if (end_flag)
1872 abort (); /* if we are at end of level, its edges must be cached */
1873
1874 bidi_cache_iterator_state (bidi_it, 1);
1875 do {
1876 new_level = bidi_level_of_next_char (bidi_it);
1877 bidi_cache_iterator_state (bidi_it, 1);
1878 } while (new_level >= level);
1879 }
1880}
1881
1882void
1883bidi_get_next_char_visually (struct bidi_it *bidi_it)
1884{
1885 int old_level, new_level, next_level;
9c82e145 1886 struct bidi_it sentinel;
b7b65b15
EZ
1887
1888 if (bidi_it->scan_dir == 0)
1889 {
1890 bidi_it->scan_dir = 1; /* default to logical order */
1891 }
1892
be39f003
EZ
1893 /* If we just passed a newline, initialize for the next line. */
1894 if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
1895 bidi_line_init (bidi_it);
1896
9c82e145 1897 /* Prepare the sentinel iterator state. */
b7b65b15 1898 if (bidi_cache_idx == 0)
9c82e145
EZ
1899 {
1900 bidi_copy_it (&sentinel, bidi_it);
1901 if (bidi_it->first_elt)
1902 {
1903 sentinel.charpos--; /* cached charpos needs to be monotonic */
1904 sentinel.bytepos--;
1905 sentinel.ch = '\n'; /* doesn't matter, but why not? */
1906 sentinel.ch_len = 1;
1907 }
1908 }
b7b65b15
EZ
1909
1910 old_level = bidi_it->resolved_level;
1911 new_level = bidi_level_of_next_char (bidi_it);
b7b65b15
EZ
1912
1913 /* Reordering of resolved levels (clause L2) is implemented by
1914 jumping to the other edge of the level and flipping direction of
1915 scanning the buffer whenever we find a level change. */
1916 if (new_level != old_level)
1917 {
1918 int ascending = new_level > old_level;
1919 int level_to_search = ascending ? old_level + 1 : old_level;
1920 int incr = ascending ? 1 : -1;
1921 int expected_next_level = old_level + incr;
1922
1923 /* If we don't have anything cached yet, we need to cache the
9c82e145
EZ
1924 sentinel state, since we'll need it to record where to jump
1925 when the last non-base level is exhausted. */
b7b65b15 1926 if (bidi_cache_idx == 0)
9c82e145 1927 bidi_cache_iterator_state (&sentinel, 1);
b7b65b15
EZ
1928 /* Jump (or walk) to the other edge of this level. */
1929 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1930 /* Switch scan direction and peek at the next character in the
1931 new direction. */
1932 bidi_it->scan_dir = -bidi_it->scan_dir;
1933
1934 /* The following loop handles the case where the resolved level
1935 jumps by more than one. This is typical for numbers inside a
1936 run of text with left-to-right embedding direction, but can
1937 also happen in other situations. In those cases the decision
1938 where to continue after a level change, and in what direction,
1939 is tricky. For example, given a text like below:
1940
1941 abcdefgh
1942 11336622
1943
1944 (where the numbers below the text show the resolved levels),
1945 the result of reordering according to UAX#9 should be this:
1946
1947 efdcghba
1948
1949 This is implemented by the loop below which flips direction
1950 and jumps to the other edge of the level each time it finds
1951 the new level not to be the expected one. The expected level
1952 is always one more or one less than the previous one. */
1953 next_level = bidi_peek_at_next_level (bidi_it);
1954 while (next_level != expected_next_level)
1955 {
1956 expected_next_level += incr;
1957 level_to_search += incr;
1958 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1959 bidi_it->scan_dir = -bidi_it->scan_dir;
1960 next_level = bidi_peek_at_next_level (bidi_it);
1961 }
1962
1963 /* Finally, deliver the next character in the new direction. */
1964 next_level = bidi_level_of_next_char (bidi_it);
1965 }
1966
b44d9321
EZ
1967 /* Take note when we have just processed the newline that precedes
1968 the end of the paragraph. The next time we are about to be
1969 called, set_iterator_to_next will automatically reinit the
1970 paragraph direction, if needed. We do this at the newline before
1971 the paragraph separator, because the next character might not be
1972 the first character of the next paragraph, due to the bidi
1973 reordering. */
6bff6497 1974 if (bidi_it->scan_dir == 1
be39f003
EZ
1975 && bidi_it->orig_type == NEUTRAL_B
1976 && bidi_it->bytepos < ZV_BYTE)
1977 {
1978 EMACS_INT sep_len =
1979 bidi_at_paragraph_end (bidi_it->charpos + 1,
1980 bidi_it->bytepos + bidi_it->ch_len);
1981 if (sep_len >= 0)
1982 {
1983 bidi_it->new_paragraph = 1;
b44d9321
EZ
1984 /* Record the buffer position of the last character of the
1985 paragraph separator. */
1986 bidi_it->separator_limit = bidi_it->charpos + 1 + sep_len;
be39f003
EZ
1987 }
1988 }
6bff6497 1989
b7b65b15
EZ
1990 if (bidi_it->scan_dir == 1 && bidi_cache_idx)
1991 {
1992 /* If we are at paragraph's base embedding level and beyond the
1993 last cached position, the cache's job is done and we can
1994 discard it. */
1995 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
1996 && bidi_it->charpos > bidi_cache[bidi_cache_idx - 1].charpos)
1997 bidi_cache_reset ();
1998 /* But as long as we are caching during forward scan, we must
1999 cache each state, or else the cache integrity will be
2000 compromised: it assumes cached states correspond to buffer
2001 positions 1:1. */
2002 else
2003 bidi_cache_iterator_state (bidi_it, 1);
2004 }
2005}
2006
2007/* This is meant to be called from within the debugger, whenever you
2008 wish to examine the cache contents. */
2009void
2010bidi_dump_cached_states (void)
2011{
2012 int i;
2013 int ndigits = 1;
2014
2015 if (bidi_cache_idx == 0)
2016 {
2017 fprintf (stderr, "The cache is empty.\n");
2018 return;
2019 }
2020 fprintf (stderr, "Total of %d state%s in cache:\n",
2021 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2022
2023 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2024 ndigits++;
2025 fputs ("ch ", stderr);
2026 for (i = 0; i < bidi_cache_idx; i++)
2027 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2028 fputs ("\n", stderr);
2029 fputs ("lvl ", stderr);
2030 for (i = 0; i < bidi_cache_idx; i++)
2031 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2032 fputs ("\n", stderr);
2033 fputs ("pos ", stderr);
2034 for (i = 0; i < bidi_cache_idx; i++)
2035 fprintf (stderr, "%*d", ndigits, bidi_cache[i].charpos);
2036 fputs ("\n", stderr);
2037}