Retrospective commit from 2009-10-03.
[bpt/emacs.git] / src / bidi.c
CommitLineData
b7b65b15
EZ
1/* Low-level bidirectional buffer-scanning functions for GNU Emacs.
2 Copyright (C) 2000, 2001, 2004, 2005 Free Software Foundation, Inc.
3
4This file is part of GNU Emacs.
5
6GNU Emacs is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation; either version 2, or (at your option)
9any later version.
10
11GNU Emacs is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14GNU General Public License for more details.
15
16
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20Boston, MA 02110-1301, USA. */
21
2d6e4628
EZ
22/* Written by Eli Zaretskii <eliz@gnu.org>.
23
24 A sequential implementation of the Unicode Bidirectional algorithm,
b7b65b15
EZ
25 as per UAX#9, a part of the Unicode Standard.
26
27 Unlike the reference and most other implementations, this one is
28 designed to be called once for every character in the buffer.
29
30 The main entry point is bidi_get_next_char_visually. Each time it
31 is called, it finds the next character in the visual order, and
32 returns its information in a special structure. The caller is then
33 expected to process this character for display or any other
34 purposes, and call bidi_get_next_char_visually for the next
35 character. See the comments in bidi_get_next_char_visually for
36 more details about its algorithm that finds the next visual-order
37 character by resolving their levels on the fly.
38
89d3374a
EZ
39 If you want to understand the code, you will have to read it
40 together with the relevant portions of UAX#9. The comments include
41 references to UAX#9 rules, for that very reason.
42
b7b65b15
EZ
43 A note about references to UAX#9 rules: if the reference says
44 something like "X9/Retaining", it means that you need to refer to
45 rule X9 and to its modifications decribed in the "Implementation
46 Notes" section of UAX#9, under "Retaining Format Codes". */
47
48#ifdef HAVE_CONFIG_H
49#include <config.h>
50#endif
51
52#include <stdio.h>
53
54#ifdef HAVE_STRING_H
55#include <string.h>
56#endif
57
58#include "lisp.h"
59#include "buffer.h"
60#include "character.h"
61#include "dispextern.h"
62
63static int bidi_initialized = 0;
64
65static Lisp_Object bidi_type_table;
66
e7402cb2 67/* FIXME: Remove these when bidi_explicit_dir_char uses a lookup table. */
b7b65b15
EZ
68#define LRM_CHAR 0x200E
69#define RLM_CHAR 0x200F
70#define LRE_CHAR 0x202A
71#define RLE_CHAR 0x202B
72#define PDF_CHAR 0x202C
73#define LRO_CHAR 0x202D
74#define RLO_CHAR 0x202E
75
b7b65b15 76#define BIDI_EOB -1
e7402cb2 77#define BIDI_BOB -2 /* FIXME: Is this needed? */
b7b65b15 78
b7b65b15
EZ
79/* Local data structures. (Look in dispextern.h for the rest.) */
80
81/* What we need to know about the current paragraph. */
82struct bidi_paragraph_info {
83 int start_bytepos; /* byte position where it begins */
84 int end_bytepos; /* byte position where it ends */
85 int embedding_level; /* its basic embedding level */
86 bidi_dir_t base_dir; /* its base direction */
87};
88
89/* Data type for describing the bidirectional character categories. */
90typedef enum {
91 UNKNOWN_BC,
92 NEUTRAL,
93 WEAK,
94 STRONG
95} bidi_category_t;
96
97int bidi_ignore_explicit_marks_for_paragraph_level = 1;
98
6bff6497
EZ
99static Lisp_Object fallback_paragraph_start_re, fallback_paragraph_separate_re;
100static Lisp_Object Qparagraph_start, Qparagraph_separate;
b7b65b15 101
b7b65b15
EZ
102static void
103bidi_initialize ()
104{
105 /* FIXME: This should come from the Unicode Database. */
106 struct {
107 int from, to;
108 bidi_type_t type;
109 } bidi_type[] =
110 { { 0x0000, 0x0008, WEAK_BN },
111 { 0x0009, 0x0000, NEUTRAL_S },
112 { 0x000A, 0x0000, NEUTRAL_B },
113 { 0x000B, 0x0000, NEUTRAL_S },
114 { 0x000C, 0x0000, NEUTRAL_WS },
115 { 0x000D, 0x0000, NEUTRAL_B },
116 { 0x000E, 0x001B, WEAK_BN },
117 { 0x001C, 0x001E, NEUTRAL_B },
118 { 0x001F, 0x0000, NEUTRAL_S },
119 { 0x0020, 0x0000, NEUTRAL_WS },
120 { 0x0021, 0x0022, NEUTRAL_ON },
121 { 0x0023, 0x0025, WEAK_ET },
122 { 0x0026, 0x002A, NEUTRAL_ON },
e342a24d 123 { 0x002B, 0x0000, WEAK_ES },
b7b65b15 124 { 0x002C, 0x0000, WEAK_CS },
e342a24d
EZ
125 { 0x002D, 0x0000, WEAK_ES },
126 { 0x002E, 0x002F, WEAK_CS },
b7b65b15
EZ
127 { 0x0030, 0x0039, WEAK_EN },
128 { 0x003A, 0x0000, WEAK_CS },
129 { 0x003B, 0x0040, NEUTRAL_ON },
130 { 0x005B, 0x0060, NEUTRAL_ON },
131 { 0x007B, 0x007E, NEUTRAL_ON },
132 { 0x007F, 0x0084, WEAK_BN },
133 { 0x0085, 0x0000, NEUTRAL_B },
134 { 0x0086, 0x009F, WEAK_BN },
135 { 0x00A0, 0x0000, WEAK_CS },
136 { 0x00A1, 0x0000, NEUTRAL_ON },
137 { 0x00A2, 0x00A5, WEAK_ET },
138 { 0x00A6, 0x00A9, NEUTRAL_ON },
e342a24d
EZ
139 { 0x00AB, 0x00AC, NEUTRAL_ON },
140 { 0x00AD, 0x0000, WEAK_BN },
141 { 0x00AE, 0x00Af, NEUTRAL_ON },
b7b65b15
EZ
142 { 0x00B0, 0x00B1, WEAK_ET },
143 { 0x00B2, 0x00B3, WEAK_EN },
144 { 0x00B4, 0x0000, NEUTRAL_ON },
145 { 0x00B6, 0x00B8, NEUTRAL_ON },
146 { 0x00B9, 0x0000, WEAK_EN },
147 { 0x00BB, 0x00BF, NEUTRAL_ON },
148 { 0x00D7, 0x0000, NEUTRAL_ON },
149 { 0x00F7, 0x0000, NEUTRAL_ON },
150 { 0x02B9, 0x02BA, NEUTRAL_ON },
151 { 0x02C2, 0x02CF, NEUTRAL_ON },
152 { 0x02D2, 0x02DF, NEUTRAL_ON },
153 { 0x02E5, 0x02ED, NEUTRAL_ON },
154 { 0x0300, 0x036F, WEAK_NSM },
155 { 0x0374, 0x0375, NEUTRAL_ON },
156 { 0x037E, 0x0385, NEUTRAL_ON },
157 { 0x0387, 0x0000, NEUTRAL_ON },
158 { 0x03F6, 0x0000, NEUTRAL_ON },
159 { 0x0483, 0x0489, WEAK_NSM },
160 { 0x058A, 0x0000, NEUTRAL_ON },
161 { 0x0591, 0x05BD, WEAK_NSM },
162 { 0x05BE, 0x0000, STRONG_R },
163 { 0x05BF, 0x0000, WEAK_NSM },
164 { 0x05C0, 0x0000, STRONG_R },
165 { 0x05C1, 0x05C2, WEAK_NSM },
166 { 0x05C3, 0x0000, STRONG_R },
e342a24d
EZ
167 { 0x05C4, 0x05C5, WEAK_NSM },
168 { 0x05C6, 0x0000, STRONG_R },
169 { 0x05C7, 0x0000, WEAK_NSM },
b7b65b15
EZ
170 { 0x05D0, 0x05F4, STRONG_R },
171 { 0x060C, 0x0000, WEAK_CS },
172 { 0x061B, 0x064A, STRONG_AL },
173 { 0x064B, 0x0655, WEAK_NSM },
174 { 0x0660, 0x0669, WEAK_AN },
175 { 0x066A, 0x0000, WEAK_ET },
176 { 0x066B, 0x066C, WEAK_AN },
177 { 0x066D, 0x066F, STRONG_AL },
178 { 0x0670, 0x0000, WEAK_NSM },
179 { 0x0671, 0x06D5, STRONG_AL },
180 { 0x06D6, 0x06DC, WEAK_NSM },
181 { 0x06DD, 0x0000, STRONG_AL },
182 { 0x06DE, 0x06E4, WEAK_NSM },
183 { 0x06E5, 0x06E6, STRONG_AL },
184 { 0x06E7, 0x06E8, WEAK_NSM },
185 { 0x06E9, 0x0000, NEUTRAL_ON },
186 { 0x06EA, 0x06ED, WEAK_NSM },
187 { 0x06F0, 0x06F9, WEAK_EN },
188 { 0x06FA, 0x070D, STRONG_AL },
189 { 0x070F, 0x0000, WEAK_BN },
190 { 0x0710, 0x0000, STRONG_AL },
191 { 0x0711, 0x0000, WEAK_NSM },
192 { 0x0712, 0x072C, STRONG_AL },
193 { 0x0730, 0x074A, WEAK_NSM },
194 { 0x0780, 0x07A5, STRONG_AL },
195 { 0x07A6, 0x07B0, WEAK_NSM },
196 { 0x07B1, 0x0000, STRONG_AL },
197 { 0x0901, 0x0902, WEAK_NSM },
198 { 0x093C, 0x0000, WEAK_NSM },
199 { 0x0941, 0x0948, WEAK_NSM },
200 { 0x094D, 0x0000, WEAK_NSM },
201 { 0x0951, 0x0954, WEAK_NSM },
202 { 0x0962, 0x0963, WEAK_NSM },
203 { 0x0981, 0x0000, WEAK_NSM },
204 { 0x09BC, 0x0000, WEAK_NSM },
205 { 0x09C1, 0x09C4, WEAK_NSM },
206 { 0x09CD, 0x0000, WEAK_NSM },
207 { 0x09E2, 0x09E3, WEAK_NSM },
208 { 0x09F2, 0x09F3, WEAK_ET },
209 { 0x0A02, 0x0000, WEAK_NSM },
210 { 0x0A3C, 0x0000, WEAK_NSM },
211 { 0x0A41, 0x0A4D, WEAK_NSM },
212 { 0x0A70, 0x0A71, WEAK_NSM },
213 { 0x0A81, 0x0A82, WEAK_NSM },
214 { 0x0ABC, 0x0000, WEAK_NSM },
215 { 0x0AC1, 0x0AC8, WEAK_NSM },
216 { 0x0ACD, 0x0000, WEAK_NSM },
217 { 0x0B01, 0x0000, WEAK_NSM },
218 { 0x0B3C, 0x0000, WEAK_NSM },
219 { 0x0B3F, 0x0000, WEAK_NSM },
220 { 0x0B41, 0x0B43, WEAK_NSM },
221 { 0x0B4D, 0x0B56, WEAK_NSM },
222 { 0x0B82, 0x0000, WEAK_NSM },
223 { 0x0BC0, 0x0000, WEAK_NSM },
224 { 0x0BCD, 0x0000, WEAK_NSM },
225 { 0x0C3E, 0x0C40, WEAK_NSM },
226 { 0x0C46, 0x0C56, WEAK_NSM },
227 { 0x0CBF, 0x0000, WEAK_NSM },
228 { 0x0CC6, 0x0000, WEAK_NSM },
229 { 0x0CCC, 0x0CCD, WEAK_NSM },
230 { 0x0D41, 0x0D43, WEAK_NSM },
231 { 0x0D4D, 0x0000, WEAK_NSM },
232 { 0x0DCA, 0x0000, WEAK_NSM },
233 { 0x0DD2, 0x0DD6, WEAK_NSM },
234 { 0x0E31, 0x0000, WEAK_NSM },
235 { 0x0E34, 0x0E3A, WEAK_NSM },
236 { 0x0E3F, 0x0000, WEAK_ET },
237 { 0x0E47, 0x0E4E, WEAK_NSM },
238 { 0x0EB1, 0x0000, WEAK_NSM },
239 { 0x0EB4, 0x0EBC, WEAK_NSM },
240 { 0x0EC8, 0x0ECD, WEAK_NSM },
241 { 0x0F18, 0x0F19, WEAK_NSM },
242 { 0x0F35, 0x0000, WEAK_NSM },
243 { 0x0F37, 0x0000, WEAK_NSM },
244 { 0x0F39, 0x0000, WEAK_NSM },
245 { 0x0F3A, 0x0F3D, NEUTRAL_ON },
246 { 0x0F71, 0x0F7E, WEAK_NSM },
247 { 0x0F80, 0x0F84, WEAK_NSM },
248 { 0x0F86, 0x0F87, WEAK_NSM },
249 { 0x0F90, 0x0FBC, WEAK_NSM },
250 { 0x0FC6, 0x0000, WEAK_NSM },
251 { 0x102D, 0x1030, WEAK_NSM },
252 { 0x1032, 0x1037, WEAK_NSM },
253 { 0x1039, 0x0000, WEAK_NSM },
254 { 0x1058, 0x1059, WEAK_NSM },
255 { 0x1680, 0x0000, NEUTRAL_WS },
256 { 0x169B, 0x169C, NEUTRAL_ON },
257 { 0x1712, 0x1714, WEAK_NSM },
258 { 0x1732, 0x1734, WEAK_NSM },
259 { 0x1752, 0x1753, WEAK_NSM },
260 { 0x1772, 0x1773, WEAK_NSM },
261 { 0x17B7, 0x17BD, WEAK_NSM },
262 { 0x17C6, 0x0000, WEAK_NSM },
263 { 0x17C9, 0x17D3, WEAK_NSM },
264 { 0x17DB, 0x0000, WEAK_ET },
265 { 0x1800, 0x180A, NEUTRAL_ON },
266 { 0x180B, 0x180D, WEAK_NSM },
267 { 0x180E, 0x0000, WEAK_BN },
268 { 0x18A9, 0x0000, WEAK_NSM },
269 { 0x1FBD, 0x0000, NEUTRAL_ON },
270 { 0x1FBF, 0x1FC1, NEUTRAL_ON },
271 { 0x1FCD, 0x1FCF, NEUTRAL_ON },
272 { 0x1FDD, 0x1FDF, NEUTRAL_ON },
273 { 0x1FED, 0x1FEF, NEUTRAL_ON },
274 { 0x1FFD, 0x1FFE, NEUTRAL_ON },
275 { 0x2000, 0x200A, NEUTRAL_WS },
276 { 0x200B, 0x200D, WEAK_BN },
277 { 0x200F, 0x0000, STRONG_R },
278 { 0x2010, 0x2027, NEUTRAL_ON },
279 { 0x2028, 0x0000, NEUTRAL_WS },
280 { 0x2029, 0x0000, NEUTRAL_B },
281 { 0x202A, 0x0000, LRE },
282 { 0x202B, 0x0000, RLE },
283 { 0x202C, 0x0000, PDF },
284 { 0x202D, 0x0000, LRO },
285 { 0x202E, 0x0000, RLO },
286 { 0x202F, 0x0000, NEUTRAL_WS },
287 { 0x2030, 0x2034, WEAK_ET },
288 { 0x2035, 0x2057, NEUTRAL_ON },
289 { 0x205F, 0x0000, NEUTRAL_WS },
290 { 0x2060, 0x206F, WEAK_BN },
291 { 0x2070, 0x0000, WEAK_EN },
292 { 0x2074, 0x2079, WEAK_EN },
293 { 0x207A, 0x207B, WEAK_ET },
294 { 0x207C, 0x207E, NEUTRAL_ON },
295 { 0x2080, 0x2089, WEAK_EN },
296 { 0x208A, 0x208B, WEAK_ET },
297 { 0x208C, 0x208E, NEUTRAL_ON },
298 { 0x20A0, 0x20B1, WEAK_ET },
299 { 0x20D0, 0x20EA, WEAK_NSM },
300 { 0x2100, 0x2101, NEUTRAL_ON },
301 { 0x2103, 0x2106, NEUTRAL_ON },
302 { 0x2108, 0x2109, NEUTRAL_ON },
303 { 0x2114, 0x0000, NEUTRAL_ON },
304 { 0x2116, 0x2118, NEUTRAL_ON },
305 { 0x211E, 0x2123, NEUTRAL_ON },
306 { 0x2125, 0x0000, NEUTRAL_ON },
307 { 0x2127, 0x0000, NEUTRAL_ON },
308 { 0x2129, 0x0000, NEUTRAL_ON },
309 { 0x212E, 0x0000, WEAK_ET },
310 { 0x2132, 0x0000, NEUTRAL_ON },
311 { 0x213A, 0x0000, NEUTRAL_ON },
312 { 0x2140, 0x2144, NEUTRAL_ON },
313 { 0x214A, 0x215F, NEUTRAL_ON },
314 { 0x2190, 0x2211, NEUTRAL_ON },
315 { 0x2212, 0x2213, WEAK_ET },
316 { 0x2214, 0x2335, NEUTRAL_ON },
317 { 0x237B, 0x2394, NEUTRAL_ON },
318 { 0x2396, 0x244A, NEUTRAL_ON },
319 { 0x2460, 0x249B, WEAK_EN },
320 { 0x24EA, 0x0000, WEAK_EN },
321 { 0x24EB, 0x2FFB, NEUTRAL_ON },
322 { 0x3000, 0x0000, NEUTRAL_WS },
323 { 0x3001, 0x3004, NEUTRAL_ON },
324 { 0x3008, 0x3020, NEUTRAL_ON },
325 { 0x302A, 0x302F, WEAK_NSM },
326 { 0x3030, 0x0000, NEUTRAL_ON },
327 { 0x3036, 0x3037, NEUTRAL_ON },
328 { 0x303D, 0x303F, NEUTRAL_ON },
329 { 0x3099, 0x309A, WEAK_NSM },
330 { 0x309B, 0x309C, NEUTRAL_ON },
331 { 0x30A0, 0x0000, NEUTRAL_ON },
332 { 0x30FB, 0x0000, NEUTRAL_ON },
333 { 0x3251, 0x325F, NEUTRAL_ON },
334 { 0x32B1, 0x32BF, NEUTRAL_ON },
335 { 0xA490, 0xA4C6, NEUTRAL_ON },
336 { 0xFB1D, 0x0000, STRONG_R },
337 { 0xFB1E, 0x0000, WEAK_NSM },
338 { 0xFB1F, 0xFB28, STRONG_R },
339 { 0xFB29, 0x0000, WEAK_ET },
340 { 0xFB2A, 0xFB4F, STRONG_R },
341 { 0xFB50, 0xFD3D, STRONG_AL },
342 { 0xFD3E, 0xFD3F, NEUTRAL_ON },
343 { 0xFD50, 0xFDFC, STRONG_AL },
344 { 0xFE00, 0xFE23, WEAK_NSM },
345 { 0xFE30, 0xFE4F, NEUTRAL_ON },
346 { 0xFE50, 0x0000, WEAK_CS },
347 { 0xFE51, 0x0000, NEUTRAL_ON },
348 { 0xFE52, 0x0000, WEAK_CS },
349 { 0xFE54, 0x0000, NEUTRAL_ON },
350 { 0xFE55, 0x0000, WEAK_CS },
351 { 0xFE56, 0xFE5E, NEUTRAL_ON },
352 { 0xFE5F, 0x0000, WEAK_ET },
353 { 0xFE60, 0xFE61, NEUTRAL_ON },
354 { 0xFE62, 0xFE63, WEAK_ET },
355 { 0xFE64, 0xFE68, NEUTRAL_ON },
356 { 0xFE69, 0xFE6A, WEAK_ET },
357 { 0xFE6B, 0x0000, NEUTRAL_ON },
358 { 0xFE70, 0xFEFC, STRONG_AL },
359 { 0xFEFF, 0x0000, WEAK_BN },
360 { 0xFF01, 0xFF02, NEUTRAL_ON },
361 { 0xFF03, 0xFF05, WEAK_ET },
362 { 0xFF06, 0xFF0A, NEUTRAL_ON },
363 { 0xFF0B, 0x0000, WEAK_ET },
364 { 0xFF0C, 0x0000, WEAK_CS },
365 { 0xFF0D, 0x0000, WEAK_ET },
366 { 0xFF0E, 0x0000, WEAK_CS },
367 { 0xFF0F, 0x0000, WEAK_ES },
368 { 0xFF10, 0xFF19, WEAK_EN },
369 { 0xFF1A, 0x0000, WEAK_CS },
370 { 0xFF1B, 0xFF20, NEUTRAL_ON },
371 { 0xFF3B, 0xFF40, NEUTRAL_ON },
372 { 0xFF5B, 0xFF65, NEUTRAL_ON },
373 { 0xFFE0, 0xFFE1, WEAK_ET },
374 { 0xFFE2, 0xFFE4, NEUTRAL_ON },
375 { 0xFFE5, 0xFFE6, WEAK_ET },
376 { 0xFFE8, 0xFFEE, NEUTRAL_ON },
377 { 0xFFF9, 0xFFFB, WEAK_BN },
378 { 0xFFFC, 0xFFFD, NEUTRAL_ON },
379 { 0x1D167, 0x1D169, WEAK_NSM },
380 { 0x1D173, 0x1D17A, WEAK_BN },
381 { 0x1D17B, 0x1D182, WEAK_NSM },
382 { 0x1D185, 0x1D18B, WEAK_NSM },
383 { 0x1D1AA, 0x1D1AD, WEAK_NSM },
384 { 0x1D7CE, 0x1D7FF, WEAK_EN },
385 { 0xE0001, 0xE007F, WEAK_BN } };
386 int i;
387
388 bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L));
2d6e4628 389 staticpro (&bidi_type_table);
b7b65b15
EZ
390
391 for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++)
f44e260c
EZ
392 char_table_set_range (bidi_type_table, bidi_type[i].from,
393 bidi_type[i].to ? bidi_type[i].to : bidi_type[i].from,
b7b65b15 394 make_number (bidi_type[i].type));
6bff6497
EZ
395
396 fallback_paragraph_start_re =
397 XSYMBOL (Fintern_soft (build_string ("paragraph-start"), Qnil))->value;
398 if (!STRINGP (fallback_paragraph_start_re))
399 fallback_paragraph_start_re = build_string ("\f\\|[ \t]*$");
400 staticpro (&fallback_paragraph_start_re);
401 Qparagraph_start = intern ("paragraph-start");
402 staticpro (&Qparagraph_start);
403 fallback_paragraph_separate_re =
404 XSYMBOL (Fintern_soft (build_string ("paragraph-separate"), Qnil))->value;
405 if (!STRINGP (fallback_paragraph_separate_re))
406 fallback_paragraph_separate_re = build_string ("[ \t\f]*$");
407 staticpro (&fallback_paragraph_separate_re);
408 Qparagraph_separate = intern ("paragraph-separate");
409 staticpro (&Qparagraph_separate);
b7b65b15
EZ
410 bidi_initialized = 1;
411}
412
6bff6497
EZ
413/* Return the bidi type of a character CH, subject to the current
414 directional OVERRIDE. */
b7b65b15 415bidi_type_t
6bff6497 416bidi_get_type (int ch, bidi_dir_t override)
b7b65b15 417{
6bff6497
EZ
418 bidi_type_t default_type;
419
e7402cb2
EZ
420 if (ch == BIDI_EOB)
421 return NEUTRAL_B;
e342a24d
EZ
422 if (ch < 0 || ch > MAX_CHAR)
423 abort ();
6bff6497
EZ
424
425 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
426
427 if (override == NEUTRAL_DIR)
428 return default_type;
429
430 switch (default_type)
431 {
432 /* Although UAX#9 does not tell, it doesn't make sense to
433 override NEUTRAL_B and LRM/RLM characters. */
434 case NEUTRAL_B:
435 case LRE:
436 case LRO:
437 case RLE:
438 case RLO:
439 case PDF:
440 return default_type;
441 default:
442 switch (ch)
443 {
444 case LRM_CHAR:
445 case RLM_CHAR:
446 return default_type;
447 default:
448 if (override == L2R) /* X6 */
449 return STRONG_L;
450 else if (override == R2L)
451 return STRONG_R;
452 else
453 abort (); /* can't happen: handled above */
454 }
455 }
b7b65b15
EZ
456}
457
2d6e4628
EZ
458void
459bidi_check_type (bidi_type_t type)
460{
461 if (type < UNKNOWN_BT || type > NEUTRAL_ON)
462 abort ();
463}
464
b7b65b15
EZ
465/* Given a bidi TYPE of a character, return its category. */
466bidi_category_t
467bidi_get_category (bidi_type_t type)
468{
469 switch (type)
470 {
471 case UNKNOWN_BT:
472 return UNKNOWN_BC;
473 case STRONG_L:
474 case STRONG_R:
475 case STRONG_AL:
476 case LRE:
477 case LRO:
478 case RLE:
479 case RLO:
480 return STRONG;
481 case PDF: /* ??? really?? */
482 case WEAK_EN:
483 case WEAK_ES:
484 case WEAK_ET:
485 case WEAK_AN:
486 case WEAK_CS:
487 case WEAK_NSM:
488 case WEAK_BN:
489 return WEAK;
490 case NEUTRAL_B:
491 case NEUTRAL_S:
492 case NEUTRAL_WS:
493 case NEUTRAL_ON:
494 return NEUTRAL;
495 default:
496 abort ();
497 }
498}
499
e342a24d
EZ
500/* Return the mirrored character of C, if any.
501
502 Note: The conditions in UAX#9 clause L4 must be tested by the
503 caller. */
b7b65b15
EZ
504/* FIXME: exceedingly temporary! Should consult the Unicode database
505 of character properties. */
506int
507bidi_mirror_char (int c)
508{
509 static const char mirrored_pairs[] = "()<>[]{}";
510 const char *p = strchr (mirrored_pairs, c);
511
512 if (p)
513 {
514 size_t i = p - mirrored_pairs;
515
516 if ((i & 1) == 0)
517 return mirrored_pairs[i + 1];
518 else
519 return mirrored_pairs[i - 1];
520 }
521 return c;
522}
523
524/* Copy the bidi iterator from FROM to TO. To save cycles, this only
525 copies the part of the level stack that is actually in use. */
526static inline void
527bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
528{
9c82e145 529 int save_first_elt = to->first_elt;
b7b65b15
EZ
530 int i;
531
532 /* Copy everything except the level stack. */
533 memcpy (to, from, ((int)&((struct bidi_it *)0)->level_stack[0]));
9c82e145
EZ
534 to->first_elt = save_first_elt;
535 if (to->first_elt != 0 && to->first_elt != 1)
536 to->first_elt = 0;
b7b65b15
EZ
537
538 /* Copy the active part of the level stack. */
539 to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
540 for (i = 1; i <= from->stack_idx; i++)
541 to->level_stack[i] = from->level_stack[i];
542}
543
544/* Caching the bidi iterator states. */
545
546static struct bidi_it bidi_cache[1000]; /* FIXME: make this dynamically allocated! */
547static int bidi_cache_idx;
548static int bidi_cache_last_idx;
549
550static inline void
551bidi_cache_reset (void)
552{
553 bidi_cache_idx = 0;
554 bidi_cache_last_idx = -1;
555}
556
557static inline void
558bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it)
559{
560 int current_scan_dir = bidi_it->scan_dir;
561
562 if (idx < 0 || idx >= bidi_cache_idx)
563 abort ();
564
565 bidi_copy_it (bidi_it, &bidi_cache[idx]);
566 bidi_it->scan_dir = current_scan_dir;
567 bidi_cache_last_idx = idx;
568}
569
570/* Find a cached state with a given CHARPOS and resolved embedding
571 level less or equal to LEVEL. if LEVEL is -1, disregard the
572 resolved levels in cached states. DIR, if non-zero, means search
573 in that direction from the last cache hit. */
574static inline int
575bidi_cache_search (int charpos, int level, int dir)
576{
577 int i, i_start;
578
579 if (bidi_cache_idx)
580 {
581 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
582 dir = -1;
583 else if (charpos > bidi_cache[bidi_cache_last_idx].charpos)
584 dir = 1;
585 if (dir)
586 i_start = bidi_cache_last_idx;
587 else
588 {
589 dir = -1;
590 i_start = bidi_cache_idx - 1;
591 }
592
593 if (dir < 0)
594 {
595 /* Linear search for now; FIXME! */
596 for (i = i_start; i >= 0; i--)
597 if (bidi_cache[i].charpos == charpos
598 && (level == -1 || bidi_cache[i].resolved_level <= level))
599 return i;
600 }
601 else
602 {
603 for (i = i_start; i < bidi_cache_idx; i++)
604 if (bidi_cache[i].charpos == charpos
605 && (level == -1 || bidi_cache[i].resolved_level <= level))
606 return i;
607 }
608 }
609
610 return -1;
611}
612
613/* Find a cached state where the resolved level changes to a value
614 that is lower than LEVEL, and return its cache slot index. DIR is
615 the direction to search, starting with the last used cache slot.
616 BEFORE, if non-zero, means return the index of the slot that is
617 ``before'' the level change in the search direction. That is,
618 given the cached levels like this:
619
620 1122333442211
621 AB C
622
623 and assuming we are at the position cached at the slot marked with
624 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
625 index of slot B or A, depending whether BEFORE is, respectively,
626 non-zero or zero. */
627static int
628bidi_cache_find_level_change (int level, int dir, int before)
629{
630 if (bidi_cache_idx)
631 {
632 int i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
633 int incr = before ? 1 : 0;
634
635 if (!dir)
636 dir = -1;
637 else if (!incr)
638 i += dir;
639
640 if (dir < 0)
641 {
642 while (i >= incr)
643 {
644 if (bidi_cache[i - incr].resolved_level >= 0
645 && bidi_cache[i - incr].resolved_level < level)
646 return i;
647 i--;
648 }
649 }
650 else
651 {
652 while (i < bidi_cache_idx - incr)
653 {
654 if (bidi_cache[i + incr].resolved_level >= 0
655 && bidi_cache[i + incr].resolved_level < level)
656 return i;
657 i++;
658 }
659 }
660 }
661
662 return -1;
663}
664
665static inline void
666bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
667{
668 int idx;
669
670 /* We should never cache on backward scans. */
671 if (bidi_it->scan_dir == -1)
672 abort ();
673 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
674
675 if (idx < 0)
676 {
677 idx = bidi_cache_idx;
678 if (idx > sizeof (bidi_cache) / sizeof (bidi_cache[0]) - 1)
679 abort ();
680 bidi_copy_it (&bidi_cache[idx], bidi_it);
681 if (!resolved)
682 bidi_cache[idx].resolved_level = -1;
683 }
684 else
685 {
686 /* Copy only the members which could have changed, to avoid
687 costly copying of the entire struct. */
688 bidi_cache[idx].type = bidi_it->type;
2d6e4628 689 bidi_check_type (bidi_it->type);
89d3374a
EZ
690 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
691 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
692 if (resolved)
693 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
694 else
695 bidi_cache[idx].resolved_level = -1;
696 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
697 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
698 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
699 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
700 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
701 }
702
703 bidi_cache_last_idx = idx;
704 if (idx >= bidi_cache_idx)
705 bidi_cache_idx = idx + 1;
706}
707
708static inline bidi_type_t
709bidi_cache_find (int charpos, int level, struct bidi_it *bidi_it)
710{
711 int i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
712
713 if (i >= 0)
714 {
715 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
716
717 *bidi_it = bidi_cache[i];
718 bidi_cache_last_idx = i;
719 /* Don't let scan direction from from the cached state override
720 the current scan direction. */
721 bidi_it->scan_dir = current_scan_dir;
722 return bidi_it->type;
723 }
724
725 return UNKNOWN_BT;
726}
727
728static inline int
729bidi_peek_at_next_level (struct bidi_it *bidi_it)
730{
731 if (bidi_cache_idx == 0 || bidi_cache_last_idx == -1)
732 abort ();
733 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
734}
735
6bff6497
EZ
736/* Return non-zero if buffer's byte position POS is the end of a
737 paragraph. */
b7b65b15 738int
6bff6497 739bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
b7b65b15 740{
6bff6497 741 Lisp_Object re = XSYMBOL (Qparagraph_separate)->value;
e7402cb2 742
6bff6497
EZ
743 if (!STRINGP (re))
744 re = fallback_paragraph_separate_re;
b7b65b15 745
6bff6497 746 return fast_looking_at (re, charpos, bytepos, ZV, ZV_BYTE, Qnil) > 0;
b7b65b15
EZ
747}
748
749/* Determine the start-of-run (sor) directional type given the two
750 embedding levels on either side of the run boundary. Also, update
751 the saved info about previously seen characters, since that info is
752 generally valid for a single level run. */
753static inline void
754bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
755{
756 int higher_level = level_before > level_after ? level_before : level_after;
757
758 /* The prev_was_pdf gork is required for when we have several PDFs
759 in a row. In that case, we want to compute the sor type for the
760 next level run only once: when we see the first PDF. That's
761 because the sor type depends only on the higher of the two levels
762 that we find on the two sides of the level boundary (see UAX#9,
763 clause X10), and so we don't need to know the final embedding
764 level to which we descend after processing all the PDFs. */
e342a24d 765 if (!bidi_it->prev_was_pdf || level_before < level_after)
b7b65b15
EZ
766 /* FIXME: should the default sor direction be user selectable? */
767 bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
768 if (level_before > level_after)
769 bidi_it->prev_was_pdf = 1;
770
771 bidi_it->prev.type = UNKNOWN_BT;
89d3374a
EZ
772 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
773 bidi_it->last_strong.orig_type = UNKNOWN_BT;
b7b65b15
EZ
774 bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
775 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
776 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
89d3374a
EZ
777 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
778 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15
EZ
779 bidi_it->ignore_bn_limit = 0; /* meaning it's unknown */
780}
781
6bff6497
EZ
782/* Find the beginning of this paragraph by looking back in the
783 buffer. */
784static void
785bidi_find_paragraph_start (struct bidi_it *bidi_it)
786{
787 Lisp_Object re = XSYMBOL (Qparagraph_start)->value;
788 EMACS_INT pos = bidi_it->charpos;
789 EMACS_INT pos_byte = bidi_it->bytepos;
790 EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
791
792 if (!STRINGP (re))
793 re = fallback_paragraph_start_re;
794 while (pos_byte > BEGV_BYTE
795 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
796 {
797 find_next_newline_no_quit (pos, -1);
798 }
799}
800
b7b65b15
EZ
801void
802bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it)
803{
6bff6497 804 EMACS_INT bytepos = bidi_it->bytepos;
e7402cb2 805
9c82e145
EZ
806 /* We should never be called at EOB or before BEGV. */
807 if (bytepos >= ZV_BYTE || bytepos < BEGV_BYTE)
808 abort ();
809
e7402cb2 810 bidi_it->level_stack[0].level = 0; /* default for L2R */
6bff6497 811 bidi_it->paragraph_dir = L2R;
b7b65b15
EZ
812 if (dir == R2L)
813 bidi_it->level_stack[0].level = 1;
814 else if (dir == NEUTRAL_DIR) /* P2 */
815 {
6bff6497
EZ
816 int ch, ch_len;
817 EMACS_INT pos;
818 bidi_type_t type;
819
820 /* Search back to where this paragraph starts. */
821 bidi_find_paragraph_start (bidi_it);
822
823 /* We should always be at the beginning of a new line at this
824 point. */
825 if (!(bytepos == BEGV_BYTE
826 || FETCH_CHAR (bytepos) == '\n'
827 || FETCH_CHAR (bytepos - 1) == '\n'))
828 abort ();
829
830 ch = FETCH_CHAR (bytepos);
831 ch_len = CHAR_BYTES (ch);
832 pos = bidi_it->charpos;
833 type = bidi_get_type (ch, NEUTRAL_DIR);
b7b65b15 834
e342a24d 835 for (pos++, bytepos += ch_len;
b7b65b15
EZ
836 /* NOTE: UAX#9 says to search only for L, AL, or R types of
837 characters, and ignore RLE, RLO, LRE, and LRO. However,
838 I'm not sure it makes sense to omit those 4; should try
839 with and without that to see the effect. */
840 (bidi_get_category (type) != STRONG)
841 || (bidi_ignore_explicit_marks_for_paragraph_level
842 && (type == RLE || type == RLO
843 || type == LRE || type == LRO));
6bff6497 844 type = bidi_get_type (ch, NEUTRAL_DIR))
b7b65b15 845 {
6bff6497 846 if (type == NEUTRAL_B || bidi_at_paragraph_end (pos, bytepos))
b7b65b15
EZ
847 break;
848 FETCH_CHAR_ADVANCE (ch, pos, bytepos);
849 }
850 if (type == STRONG_R || type == STRONG_AL) /* P3 */
851 bidi_it->level_stack[0].level = 1;
852 }
6bff6497
EZ
853 if (bidi_it->level_stack[0].level == 1)
854 bidi_it->paragraph_dir = R2L;
b7b65b15
EZ
855 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
856 bidi_it->resolved_level = bidi_it->level_stack[0].level;
857 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
858 bidi_it->invalid_levels = 0;
859 bidi_it->invalid_rl_levels = -1;
860 bidi_it->new_paragraph = 0;
861 bidi_it->next_en_pos = -1;
862 bidi_it->next_for_ws.type = UNKNOWN_BT;
6bff6497 863 bidi_set_sor_type (bidi_it, bidi_it->paragraph_dir,
e342a24d 864 bidi_it->level_stack[0].level); /* X10 */
b7b65b15
EZ
865
866 bidi_cache_reset ();
867}
868
6bff6497
EZ
869/* Do whatever UAX#9 clause X8 says should be done at paragraph's
870 end. */
b7b65b15
EZ
871static inline void
872bidi_set_paragraph_end (struct bidi_it *bidi_it)
873{
874 bidi_it->invalid_levels = 0;
875 bidi_it->invalid_rl_levels = -1;
876 bidi_it->stack_idx = 0;
877 bidi_it->resolved_level = bidi_it->level_stack[0].level;
b7b65b15
EZ
878}
879
89d3374a 880/* Initialize the bidi iterator from buffer position CHARPOS. */
b7b65b15 881void
6bff6497 882bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, struct bidi_it *bidi_it)
b7b65b15
EZ
883{
884 if (! bidi_initialized)
885 bidi_initialize ();
89d3374a
EZ
886 bidi_it->charpos = charpos;
887 bidi_it->bytepos = bytepos;
6bff6497
EZ
888 bidi_it->first_elt = 1;
889 bidi_set_paragraph_end (bidi_it);
890 bidi_it->new_paragraph = 1;
b7b65b15 891 bidi_it->type = NEUTRAL_B;
89d3374a 892 bidi_it->type_after_w1 = UNKNOWN_BT;
b7b65b15 893 bidi_it->orig_type = UNKNOWN_BT;
b7b65b15 894 bidi_it->prev_was_pdf = 0;
89d3374a
EZ
895 bidi_it->prev.type = bidi_it->prev.type_after_w1 = UNKNOWN_BT;
896 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
897 bidi_it->last_strong.orig_type = UNKNOWN_BT;
b7b65b15
EZ
898 bidi_it->next_for_neutral.charpos = -1;
899 bidi_it->next_for_neutral.type =
89d3374a
EZ
900 bidi_it->next_for_neutral.type_after_w1 =
901 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15
EZ
902 bidi_it->prev_for_neutral.charpos = -1;
903 bidi_it->prev_for_neutral.type =
89d3374a
EZ
904 bidi_it->prev_for_neutral.type_after_w1 =
905 bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15 906 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
b7b65b15
EZ
907}
908
909/* Push the current embedding level and override status; reset the
910 current level to LEVEL and the current override status to OVERRIDE. */
911static inline void
912bidi_push_embedding_level (struct bidi_it *bidi_it,
913 int level, bidi_dir_t override)
914{
915 bidi_it->stack_idx++;
916 if (bidi_it->stack_idx >= BIDI_MAXLEVEL)
917 abort ();
918 bidi_it->level_stack[bidi_it->stack_idx].level = level;
919 bidi_it->level_stack[bidi_it->stack_idx].override = override;
920}
921
922/* Pop the embedding level and directional override status from the
923 stack, and return the new level. */
924static inline int
925bidi_pop_embedding_level (struct bidi_it *bidi_it)
926{
927 /* UAX#9 says to ignore invalid PDFs. */
928 if (bidi_it->stack_idx > 0)
929 bidi_it->stack_idx--;
930 return bidi_it->level_stack[bidi_it->stack_idx].level;
931}
932
933/* Record in SAVED_INFO the information about the current character. */
934static inline void
935bidi_remember_char (struct bidi_saved_info *saved_info,
936 struct bidi_it *bidi_it)
937{
938 saved_info->charpos = bidi_it->charpos;
939 saved_info->bytepos = bidi_it->bytepos;
940 saved_info->type = bidi_it->type;
2d6e4628 941 bidi_check_type (bidi_it->type);
89d3374a
EZ
942 saved_info->type_after_w1 = bidi_it->type_after_w1;
943 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 944 saved_info->orig_type = bidi_it->orig_type;
2d6e4628 945 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
946}
947
948/* Resolve the type of a neutral character according to the type of
949 surrounding strong text and the current embedding level. */
950static inline bidi_type_t
951bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
952{
953 /* N1: European and Arabic numbers are treated as though they were R. */
954 if (next_type == WEAK_EN || next_type == WEAK_AN)
955 next_type = STRONG_R;
956 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
957 prev_type = STRONG_R;
958
959 if (next_type == prev_type) /* N1 */
960 return next_type;
961 else if ((lev & 1) == 0) /* N2 */
962 return STRONG_L;
963 else
964 return STRONG_R;
965}
966
967static inline int
968bidi_explicit_dir_char (int c)
969{
970 /* FIXME: this should be replaced with a lookup table with suitable
971 bits set, like standard C ctype macros do. */
972 return (c == LRE_CHAR || c == LRO_CHAR
973 || c == RLE_CHAR || c == RLO_CHAR || c == PDF_CHAR);
974}
975
976/* A helper function for bidi_resolve_explicit. It advances to the
977 next character in logical order and determines the new embedding
978 level and directional override, but does not take into account
979 empty embeddings. */
980static int
981bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
982{
983 int curchar;
984 bidi_type_t type;
985 int current_level;
986 int new_level;
987 bidi_dir_t override;
988
9c82e145
EZ
989 if (bidi_it->bytepos < BEGV_BYTE /* after reseat to BEGV? */
990 || bidi_it->first_elt)
e7402cb2 991 {
9c82e145
EZ
992 bidi_it->first_elt = 0;
993 if (bidi_it->charpos < BEGV)
994 bidi_it->charpos = BEGV;
995 bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
e7402cb2 996 }
9c82e145 997 else if (bidi_it->bytepos < ZV_BYTE) /* don't move at ZV */
b7b65b15
EZ
998 {
999 bidi_it->charpos++;
e7402cb2
EZ
1000 if (bidi_it->ch_len == 0)
1001 abort ();
b7b65b15
EZ
1002 bidi_it->bytepos += bidi_it->ch_len;
1003 }
1004
1005 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1006 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1007 new_level = current_level;
1008
1009 /* in case it is a unibyte character (not yet implemented) */
1010 /* _fetch_multibyte_char_len = 1; */
e7402cb2
EZ
1011 if (bidi_it->bytepos >= ZV_BYTE)
1012 {
1013 curchar = BIDI_EOB;
1014 bidi_it->ch_len = 1;
1015 }
1016 else
1017 {
1018 curchar = FETCH_CHAR (bidi_it->bytepos);
1019 bidi_it->ch_len = CHAR_BYTES (curchar);
1020 }
b7b65b15 1021 bidi_it->ch = curchar;
b7b65b15 1022
6bff6497
EZ
1023 /* Don't apply directional override here, as all the types we handle
1024 below will not be affected by the override anyway, and we need
1025 the original type unaltered. The override will be applied in
1026 bidi_resolve_weak. */
1027 type = bidi_get_type (curchar, NEUTRAL_DIR);
89d3374a
EZ
1028 bidi_it->orig_type = type;
1029 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
1030
1031 if (type != PDF)
1032 bidi_it->prev_was_pdf = 0;
1033
89d3374a 1034 bidi_it->type_after_w1 = UNKNOWN_BT;
b7b65b15
EZ
1035
1036 switch (type)
1037 {
1038 case RLE: /* X2 */
1039 case RLO: /* X4 */
89d3374a
EZ
1040 bidi_it->type_after_w1 = type;
1041 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1042 type = WEAK_BN; /* X9/Retaining */
1043 if (bidi_it->ignore_bn_limit <= 0)
1044 {
1045 if (current_level <= BIDI_MAXLEVEL - 4)
1046 {
1047 /* Compute the least odd embedding level greater than
1048 the current level. */
1049 new_level = ((current_level + 1) & ~1) + 1;
89d3374a 1050 if (bidi_it->type_after_w1 == RLE)
b7b65b15
EZ
1051 override = NEUTRAL_DIR;
1052 else
1053 override = R2L;
1054 if (current_level == BIDI_MAXLEVEL - 4)
1055 bidi_it->invalid_rl_levels = 0;
1056 bidi_push_embedding_level (bidi_it, new_level, override);
1057 }
1058 else
1059 {
1060 bidi_it->invalid_levels++;
1061 /* See the commentary about invalid_rl_levels below. */
1062 if (bidi_it->invalid_rl_levels < 0)
1063 bidi_it->invalid_rl_levels = 0;
1064 bidi_it->invalid_rl_levels++;
1065 }
1066 }
89d3374a 1067 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1068 || bidi_it->next_en_pos > bidi_it->charpos)
1069 type = WEAK_EN;
1070 break;
1071 case LRE: /* X3 */
1072 case LRO: /* X5 */
89d3374a
EZ
1073 bidi_it->type_after_w1 = type;
1074 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1075 type = WEAK_BN; /* X9/Retaining */
1076 if (bidi_it->ignore_bn_limit <= 0)
1077 {
1078 if (current_level <= BIDI_MAXLEVEL - 5)
1079 {
1080 /* Compute the least even embedding level greater than
1081 the current level. */
1082 new_level = ((current_level + 2) & ~1);
89d3374a 1083 if (bidi_it->type_after_w1 == LRE)
b7b65b15
EZ
1084 override = NEUTRAL_DIR;
1085 else
1086 override = L2R;
1087 bidi_push_embedding_level (bidi_it, new_level, override);
1088 }
1089 else
1090 {
1091 bidi_it->invalid_levels++;
1092 /* invalid_rl_levels counts invalid levels encountered
1093 while the embedding level was already too high for
1094 LRE/LRO, but not for RLE/RLO. That is because
1095 there may be exactly one PDF which we should not
1096 ignore even though invalid_levels is non-zero.
1097 invalid_rl_levels helps to know what PDF is
1098 that. */
1099 if (bidi_it->invalid_rl_levels >= 0)
1100 bidi_it->invalid_rl_levels++;
1101 }
1102 }
89d3374a 1103 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1104 || bidi_it->next_en_pos > bidi_it->charpos)
1105 type = WEAK_EN;
1106 break;
1107 case PDF: /* X7 */
89d3374a
EZ
1108 bidi_it->type_after_w1 = type;
1109 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1110 type = WEAK_BN; /* X9/Retaining */
1111 if (bidi_it->ignore_bn_limit <= 0)
1112 {
1113 if (!bidi_it->invalid_rl_levels)
1114 {
1115 new_level = bidi_pop_embedding_level (bidi_it);
1116 bidi_it->invalid_rl_levels = -1;
1117 if (bidi_it->invalid_levels)
1118 bidi_it->invalid_levels--;
1119 /* else nothing: UAX#9 says to ignore invalid PDFs */
1120 }
1121 if (!bidi_it->invalid_levels)
1122 new_level = bidi_pop_embedding_level (bidi_it);
1123 else
1124 {
1125 bidi_it->invalid_levels--;
1126 bidi_it->invalid_rl_levels--;
1127 }
1128 }
89d3374a 1129 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1130 || bidi_it->next_en_pos > bidi_it->charpos)
1131 type = WEAK_EN;
1132 break;
1133 default:
1134 /* Nothing. */
1135 break;
1136 }
1137
1138 bidi_it->type = type;
2d6e4628 1139 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1140
1141 return new_level;
1142}
1143
1144/* Given an iterator state in BIDI_IT, advance one character position
1145 in the buffer to the next character (in the logical order), resolve
1146 any explicit embeddings and directional overrides, and return the
1147 embedding level of the character after resolving explicit
1148 directives and ignoring empty embeddings. */
1149static int
1150bidi_resolve_explicit (struct bidi_it *bidi_it)
1151{
1152 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1153 int new_level = bidi_resolve_explicit_1 (bidi_it);
1154
1155 if (prev_level < new_level
1156 && bidi_it->type == WEAK_BN
1157 && bidi_it->ignore_bn_limit == 0 /* only if not already known */
e7402cb2 1158 && bidi_it->ch != BIDI_EOB /* not already at EOB */
b7b65b15
EZ
1159 && bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
1160 + bidi_it->ch_len)))
1161 {
1162 /* Avoid pushing and popping embedding levels if the level run
1163 is empty, as this breaks level runs where it shouldn't.
1164 UAX#9 removes all the explicit embedding and override codes,
1165 so empty embeddings disappear without a trace. We need to
1166 behave as if we did the same. */
1167 struct bidi_it saved_it;
1168 int level = prev_level;
1169
1170 bidi_copy_it (&saved_it, bidi_it);
1171
1172 while (bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
1173 + bidi_it->ch_len)))
1174 {
1175 level = bidi_resolve_explicit_1 (bidi_it);
1176 }
1177
1178 if (level == prev_level) /* empty embedding */
1179 saved_it.ignore_bn_limit = bidi_it->charpos + 1;
1180 else /* this embedding is non-empty */
1181 saved_it.ignore_bn_limit = -1;
1182
1183 bidi_copy_it (bidi_it, &saved_it);
1184 if (bidi_it->ignore_bn_limit > 0)
1185 {
1186 /* We pushed a level, but we shouldn't have. Undo that. */
1187 if (!bidi_it->invalid_rl_levels)
1188 {
1189 new_level = bidi_pop_embedding_level (bidi_it);
1190 bidi_it->invalid_rl_levels = -1;
1191 if (bidi_it->invalid_levels)
1192 bidi_it->invalid_levels--;
1193 }
1194 if (!bidi_it->invalid_levels)
1195 new_level = bidi_pop_embedding_level (bidi_it);
1196 else
1197 {
1198 bidi_it->invalid_levels--;
1199 bidi_it->invalid_rl_levels--;
1200 }
1201 }
1202 }
1203
b7b65b15
EZ
1204 if (bidi_it->type == NEUTRAL_B) /* X8 */
1205 {
6bff6497
EZ
1206 /* End of buffer does _not_ indicate a new paragraph is coming.
1207 Otherwise, each character inserted at EOB will be processed
1208 as starting a new paragraph. */
1209 if (bidi_it->bytepos < ZV_BYTE)
1210 bidi_set_paragraph_end (bidi_it);
1211 /* This is needed by bidi_resolve_weak below, and in L1. */
1212 bidi_it->type_after_w1 = bidi_it->type;
89d3374a 1213 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1214 }
1215
1216 return new_level;
1217}
1218
1219/* Advance in the buffer, resolve weak types and return the type of
1220 the next character after weak type resolution. */
1221bidi_type_t
1222bidi_resolve_weak (struct bidi_it *bidi_it)
1223{
1224 bidi_type_t type;
1225 bidi_dir_t override;
1226 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1227 int new_level = bidi_resolve_explicit (bidi_it);
1228 int next_char;
1229 bidi_type_t type_of_next;
1230 struct bidi_it saved_it;
1231
1232 type = bidi_it->type;
1233 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1234
1235 if (type == UNKNOWN_BT
1236 || type == LRE
1237 || type == LRO
1238 || type == RLE
1239 || type == RLO
1240 || type == PDF)
1241 abort ();
1242
1243 if (new_level != prev_level
1244 || bidi_it->type == NEUTRAL_B)
1245 {
1246 /* We've got a new embedding level run, compute the directional
1247 type of sor and initialize per-run variables (UAX#9, clause
1248 X10). */
1249 bidi_set_sor_type (bidi_it, prev_level, new_level);
1250 }
1251 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1252 || type == WEAK_BN || type == STRONG_AL)
89d3374a
EZ
1253 bidi_it->type_after_w1 = type; /* needed in L1 */
1254 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1255
1256 /* Level and directional override status are already recorded in
1257 bidi_it, and do not need any change; see X6. */
1258 if (override == R2L) /* X6 */
1259 type = STRONG_R;
1260 else if (override == L2R)
1261 type = STRONG_L;
1262 else if (type == STRONG_AL)
1263 type = STRONG_R; /* W3 */
1264 else if (type == WEAK_NSM) /* W1 */
1265 {
1266 /* Note that we don't need to consider the case where the prev
1267 character has its type overridden by an RLO or LRO: such
1268 characters are outside the current level run, and thus not
89d3374a
EZ
1269 relevant to this NSM. Thus, NSM gets the orig_type of the
1270 previous character. */
b7b65b15 1271 if (bidi_it->prev.type != UNKNOWN_BT)
89d3374a 1272 type = bidi_it->prev.orig_type;
b7b65b15
EZ
1273 else if (bidi_it->sor == R2L)
1274 type = STRONG_R;
1275 else if (bidi_it->sor == L2R)
1276 type = STRONG_L;
1277 else /* shouldn't happen! */
1278 abort ();
1279 if (type == WEAK_EN /* W2 after W1 */
89d3374a 1280 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
b7b65b15
EZ
1281 type = WEAK_AN;
1282 }
1283 else if (type == WEAK_EN /* W2 */
89d3374a 1284 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
b7b65b15
EZ
1285 type = WEAK_AN;
1286 else if ((type == WEAK_ES
89d3374a
EZ
1287 && (bidi_it->prev.type_after_w1 == WEAK_EN /* W4 */
1288 && (bidi_it->prev.orig_type == WEAK_EN
1289 || bidi_it->prev.orig_type == WEAK_NSM))) /* aft W1 */
b7b65b15 1290 || (type == WEAK_CS
89d3374a
EZ
1291 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1292 && (bidi_it->prev.orig_type == WEAK_EN /* W4 */
1293 || bidi_it->prev.orig_type == WEAK_NSM)) /* a/W1 */
1294 || bidi_it->prev.type_after_w1 == WEAK_AN))) /* W4 */
b7b65b15 1295 {
e7402cb2
EZ
1296 next_char =
1297 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1298 ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
6bff6497 1299 type_of_next = bidi_get_type (next_char, override);
b7b65b15
EZ
1300
1301 if (type_of_next == WEAK_BN
1302 || bidi_explicit_dir_char (next_char))
1303 {
1304 bidi_copy_it (&saved_it, bidi_it);
1305 while (bidi_resolve_explicit (bidi_it) == new_level
1306 && bidi_it->type == WEAK_BN)
1307 ;
1308 type_of_next = bidi_it->type;
1309 bidi_copy_it (bidi_it, &saved_it);
1310 }
1311
1312 /* If the next character is EN, but the last strong-type
1313 character is AL, that next EN will be changed to AN when we
1314 process it in W2 above. So in that case, this ES should not
1315 be changed into EN. */
1316 if (type == WEAK_ES
1317 && type_of_next == WEAK_EN
89d3374a 1318 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
b7b65b15
EZ
1319 type = WEAK_EN;
1320 else if (type == WEAK_CS)
1321 {
89d3374a 1322 if (bidi_it->prev.type_after_w1 == WEAK_AN
b7b65b15
EZ
1323 && (type_of_next == WEAK_AN
1324 /* If the next character is EN, but the last
1325 strong-type character is AL, EN will be later
1326 changed to AN when we process it in W2 above. So
1327 in that case, this ES should not be changed into
1328 EN. */
1329 || (type_of_next == WEAK_EN
89d3374a 1330 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
b7b65b15 1331 type = WEAK_AN;
89d3374a 1332 else if (bidi_it->prev.type_after_w1 == WEAK_EN
b7b65b15 1333 && type_of_next == WEAK_EN
89d3374a 1334 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
b7b65b15
EZ
1335 type = WEAK_EN;
1336 }
1337 }
1338 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1339 || type == WEAK_BN) /* W5/Retaining */
1340 {
89d3374a 1341 if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN with EN before it */
b7b65b15
EZ
1342 || bidi_it->next_en_pos > bidi_it->charpos)
1343 type = WEAK_EN;
1344 /* W5: ET with EN after it. */
1345 else
1346 {
6bff6497 1347 EMACS_INT en_pos = bidi_it->charpos + 1;
b7b65b15 1348
e7402cb2
EZ
1349 next_char =
1350 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1351 ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
6bff6497 1352 type_of_next = bidi_get_type (next_char, override);
b7b65b15
EZ
1353
1354 if (type_of_next == WEAK_ET
1355 || type_of_next == WEAK_BN
1356 || bidi_explicit_dir_char (next_char))
1357 {
1358 bidi_copy_it (&saved_it, bidi_it);
1359 while (bidi_resolve_explicit (bidi_it) == new_level
1360 && (bidi_it->type == WEAK_BN || bidi_it->type == WEAK_ET))
1361 ;
1362 type_of_next = bidi_it->type;
1363 en_pos = bidi_it->charpos;
1364 bidi_copy_it (bidi_it, &saved_it);
1365 }
1366 if (type_of_next == WEAK_EN)
1367 {
1368 /* If the last strong character is AL, the EN we've
1369 found will become AN when we get to it (W2). */
89d3374a 1370 if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
b7b65b15
EZ
1371 {
1372 type = WEAK_EN;
1373 /* Remember this EN position, to speed up processing
1374 of the next ETs. */
1375 bidi_it->next_en_pos = en_pos;
1376 }
1377 else if (type == WEAK_BN)
1378 type = NEUTRAL_ON; /* W6/Retaining */
1379 }
1380 }
1381 }
1382
1383 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
89d3374a
EZ
1384 || (type == WEAK_BN
1385 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1386 || bidi_it->prev.type_after_w1 == WEAK_ES
1387 || bidi_it->prev.type_after_w1 == WEAK_ET)))
b7b65b15
EZ
1388 type = NEUTRAL_ON;
1389
1390 /* Store the type we've got so far, before we clobber it with strong
1391 types in W7 and while resolving neutral types. But leave alone
1392 the original types that were recorded above, because we will need
1393 them for the L1 clause. */
89d3374a
EZ
1394 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1395 bidi_it->type_after_w1 = type;
1396 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1397
1398 if (type == WEAK_EN) /* W7 */
1399 {
89d3374a 1400 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
b7b65b15
EZ
1401 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1402 type = STRONG_L;
1403 }
1404
1405 bidi_it->type = type;
2d6e4628 1406 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1407 return type;
1408}
1409
1410bidi_type_t
1411bidi_resolve_neutral (struct bidi_it *bidi_it)
1412{
1413 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1414 bidi_type_t type = bidi_resolve_weak (bidi_it);
1415 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1416
1417 if (!(type == STRONG_R
1418 || type == STRONG_L
1419 || type == WEAK_BN
1420 || type == WEAK_EN
1421 || type == WEAK_AN
1422 || type == NEUTRAL_B
1423 || type == NEUTRAL_S
1424 || type == NEUTRAL_WS
1425 || type == NEUTRAL_ON))
1426 abort ();
1427
1428 if (bidi_get_category (type) == NEUTRAL
1429 || (type == WEAK_BN && prev_level == current_level))
1430 {
1431 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1432 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1433 bidi_it->next_for_neutral.type,
1434 current_level);
1435 else
1436 {
1437 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1438 the assumption of batch-style processing; see clauses W4,
1439 W5, and especially N1, which require to look far forward
1440 (as well as back) in the buffer. May the fleas of a
1441 thousand camels infest the armpits of those who design
1442 supposedly general-purpose algorithms by looking at their
1443 own implementations, and fail to consider other possible
1444 implementations! */
1445 struct bidi_it saved_it;
1446 bidi_type_t next_type;
1447
1448 if (bidi_it->scan_dir == -1)
1449 abort ();
1450
1451 bidi_copy_it (&saved_it, bidi_it);
1452 /* Scan the text forward until we find the first non-neutral
1453 character, and then use that to resolve the neutral we
1454 are dealing with now. We also cache the scanned iterator
1455 states, to salvage some of the effort later. */
1456 bidi_cache_iterator_state (bidi_it, 0);
1457 do {
1458 /* Record the info about the previous character, so that
1459 it will be cached below with this state. */
89d3374a 1460 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1461 && bidi_it->type != WEAK_BN)
1462 bidi_remember_char (&bidi_it->prev, bidi_it);
1463 type = bidi_resolve_weak (bidi_it);
1464 /* Paragraph separators have their levels fully resolved
1465 at this point, so cache them as resolved. */
1466 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1467 /* FIXME: implement L1 here, by testing for a newline and
1468 resetting the level for any sequence of whitespace
1469 characters adjacent to it. */
1470 } while (!(type == NEUTRAL_B
1471 || (type != WEAK_BN
1472 && bidi_get_category (type) != NEUTRAL)
1473 /* This is all per level run, so stop when we
1474 reach the end of this level run. */
1475 || bidi_it->level_stack[bidi_it->stack_idx].level !=
1476 current_level));
1477
1478 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1479
1480 switch (type)
1481 {
1482 case STRONG_L:
1483 case STRONG_R:
1484 case STRONG_AL:
1485 next_type = type;
1486 break;
1487 case WEAK_EN:
1488 case WEAK_AN:
1489 /* N1: ``European and Arabic numbers are treated as
1490 though they were R.'' */
1491 next_type = STRONG_R;
1492 saved_it.next_for_neutral.type = STRONG_R;
1493 break;
1494 case WEAK_BN:
1495 if (!bidi_explicit_dir_char (bidi_it->ch))
1496 abort (); /* can't happen: BNs are skipped */
1497 /* FALLTHROUGH */
1498 case NEUTRAL_B:
1499 /* Marched all the way to the end of this level run.
1500 We need to use the eor type, whose information is
1501 stored by bidi_set_sor_type in the prev_for_neutral
1502 member. */
1503 if (saved_it.type != WEAK_BN
89d3374a 1504 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
b7b65b15
EZ
1505 {
1506 next_type = bidi_it->prev_for_neutral.type;
1507 saved_it.next_for_neutral.type = next_type;
2d6e4628 1508 bidi_check_type (next_type);
b7b65b15
EZ
1509 }
1510 else
1511 {
1512 /* This is a BN which does not adjoin neutrals.
1513 Leave its type alone. */
1514 bidi_copy_it (bidi_it, &saved_it);
1515 return bidi_it->type;
1516 }
1517 break;
1518 default:
1519 abort ();
1520 }
1521 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1522 next_type, current_level);
1523 saved_it.type = type;
2d6e4628 1524 bidi_check_type (type);
b7b65b15
EZ
1525 bidi_copy_it (bidi_it, &saved_it);
1526 }
1527 }
1528 return type;
1529}
1530
1531/* Given an iterator state in BIDI_IT, advance one character position
1532 in the buffer to the next character (in the logical order), resolve
1533 the bidi type of that next character, and return that type. */
1534bidi_type_t
1535bidi_type_of_next_char (struct bidi_it *bidi_it)
1536{
1537 bidi_type_t type;
1538
1539 /* This should always be called during a forward scan. */
1540 if (bidi_it->scan_dir != 1)
1541 abort ();
1542
1543 /* Reset the limit until which to ignore BNs if we step out of the
1544 area where we found only empty levels. */
1545 if ((bidi_it->ignore_bn_limit > 0
1546 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
1547 || (bidi_it->ignore_bn_limit == -1
1548 && !bidi_explicit_dir_char (bidi_it->ch)))
1549 bidi_it->ignore_bn_limit = 0;
1550
1551 type = bidi_resolve_neutral (bidi_it);
1552
1553 return type;
1554}
1555
1556/* Given an iterator state BIDI_IT, advance one character position in
1557 the buffer to the next character (in the logical order), resolve
1558 the embedding and implicit levels of that next character, and
1559 return the resulting level. */
1560int
1561bidi_level_of_next_char (struct bidi_it *bidi_it)
1562{
1563 bidi_type_t type;
1564 int level, prev_level = -1;
1565 struct bidi_saved_info next_for_neutral;
1566
1567 if (bidi_it->scan_dir == 1)
1568 {
1569 /* There's no sense in trying to advance if we hit end of text. */
1570 if (bidi_it->ch == BIDI_EOB)
1571 return bidi_it->resolved_level;
1572
1573 /* Record the info about the previous character. */
89d3374a 1574 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1575 && bidi_it->type != WEAK_BN)
1576 bidi_remember_char (&bidi_it->prev, bidi_it);
89d3374a
EZ
1577 if (bidi_it->type_after_w1 == STRONG_R
1578 || bidi_it->type_after_w1 == STRONG_L
1579 || bidi_it->type_after_w1 == STRONG_AL)
b7b65b15
EZ
1580 bidi_remember_char (&bidi_it->last_strong, bidi_it);
1581 /* FIXME: it sounds like we don't need both prev and
1582 prev_for_neutral members, but I'm leaving them both for now. */
1583 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1584 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1585 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1586
1587 /* If we overstepped the characters used for resolving neutrals
1588 and whitespace, invalidate their info in the iterator. */
1589 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1590 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1591 if (bidi_it->next_en_pos >= 0
1592 && bidi_it->charpos >= bidi_it->next_en_pos)
1593 bidi_it->next_en_pos = -1;
1594 if (bidi_it->next_for_ws.type != UNKNOWN_BT
1595 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1596 bidi_it->next_for_ws.type = UNKNOWN_BT;
1597
1598 /* This must be taken before we fill the iterator with the info
1599 about the next char. If we scan backwards, the iterator
1600 state must be already cached, so there's no need to know the
1601 embedding level of the previous character, since we will be
1602 returning to our caller shortly. */
1603 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1604 }
1605 next_for_neutral = bidi_it->next_for_neutral;
1606
1607 /* Perhaps it is already cached. */
1608 type = bidi_cache_find (bidi_it->charpos + bidi_it->scan_dir, -1, bidi_it);
1609 if (type != UNKNOWN_BT)
1610 {
1611 /* Don't lose the information for resolving neutrals! The
1612 cached states could have been cached before their
1613 next_for_neutral member was computed. If we are on our way
1614 forward, we can simply take the info from the previous
1615 state. */
1616 if (bidi_it->scan_dir == 1
1617 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
1618 bidi_it->next_for_neutral = next_for_neutral;
1619
1620 /* If resolved_level is -1, it means this state was cached
1621 before it was completely resolved, so we cannot return
1622 it. */
1623 if (bidi_it->resolved_level != -1)
1624 return bidi_it->resolved_level;
1625 }
1626 if (bidi_it->scan_dir == -1)
1627 /* If we are going backwards, the iterator state is already cached
1628 from previous scans, and should be fully resolved. */
1629 abort ();
1630
1631 if (type == UNKNOWN_BT)
1632 type = bidi_type_of_next_char (bidi_it);
1633
1634 if (type == NEUTRAL_B)
1635 return bidi_it->resolved_level;
1636
1637 level = bidi_it->level_stack[bidi_it->stack_idx].level;
1638 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
1639 || (type == WEAK_BN && prev_level == level))
1640 {
1641 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1642 abort ();
1643
1644 /* If the cached state shows a neutral character, it was not
1645 resolved by bidi_resolve_neutral, so do it now. */
1646 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1647 bidi_it->next_for_neutral.type,
1648 level);
1649 }
1650
1651 if (!(type == STRONG_R
1652 || type == STRONG_L
1653 || type == WEAK_BN
1654 || type == WEAK_EN
1655 || type == WEAK_AN))
1656 abort ();
1657 bidi_it->type = type;
2d6e4628 1658 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1659
1660 /* For L1 below, we need to know, for each WS character, whether
1661 it belongs to a sequence of WS characters preceeding a newline
1662 or a TAB or a paragraph separator. */
89d3374a 1663 if (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1664 && bidi_it->next_for_ws.type == UNKNOWN_BT)
1665 {
1666 int ch;
1667 int clen = bidi_it->ch_len;
6bff6497
EZ
1668 EMACS_INT bpos = bidi_it->bytepos;
1669 EMACS_INT cpos = bidi_it->charpos;
b7b65b15
EZ
1670 bidi_type_t chtype;
1671
1672 do {
1673 /*_fetch_multibyte_char_len = 1;*/
e7402cb2 1674 ch = bpos + clen >= ZV_BYTE ? BIDI_EOB : FETCH_CHAR (bpos + clen);
b7b65b15
EZ
1675 bpos += clen;
1676 cpos++;
e7402cb2
EZ
1677 clen = (ch == BIDI_EOB ? 1 : CHAR_BYTES (ch));
1678 if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
b7b65b15
EZ
1679 chtype = NEUTRAL_B;
1680 else
6bff6497 1681 chtype = bidi_get_type (ch, NEUTRAL_DIR);
b7b65b15
EZ
1682 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
1683 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
1684 bidi_it->next_for_ws.type = chtype;
2d6e4628 1685 bidi_check_type (bidi_it->next_for_ws.type);
b7b65b15
EZ
1686 bidi_it->next_for_ws.charpos = cpos;
1687 bidi_it->next_for_ws.bytepos = bpos;
1688 }
1689
1690 /* Resolve implicit levels, with a twist: PDFs get the embedding
1691 level of the enbedding they terminate. See below for the
1692 reason. */
89d3374a 1693 if (bidi_it->orig_type == PDF
b7b65b15
EZ
1694 /* Don't do this if this formatting code didn't change the
1695 embedding level due to invalid or empty embeddings. */
1696 && prev_level != level)
1697 {
1698 /* Don't look in UAX#9 for the reason for this: it's our own
1699 private quirk. The reason is that we want the formatting
1700 codes to be delivered so that they bracket the text of their
1701 embedding. For example, given the text
1702
1703 {RLO}teST{PDF}
1704
1705 we want it to be displayed as
1706
1707 {RLO}STet{PDF}
1708
1709 not as
1710
1711 STet{RLO}{PDF}
1712
1713 which will result because we bump up the embedding level as
1714 soon as we see the RLO and pop it as soon as we see the PDF,
1715 so RLO itself has the same embedding level as "teST", and
1716 thus would be normally delivered last, just before the PDF.
1717 The switch below fiddles with the level of PDF so that this
1718 ugly side effect does not happen.
1719
1720 (This is, of course, only important if the formatting codes
e7402cb2
EZ
1721 are actually displayed, but Emacs does need to display them
1722 if the user wants to.) */
b7b65b15
EZ
1723 level = prev_level;
1724 }
89d3374a
EZ
1725 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
1726 || bidi_it->orig_type == NEUTRAL_S
e7402cb2
EZ
1727 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
1728 /* || bidi_it->ch == LINESEP_CHAR */
89d3374a 1729 || (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1730 && (bidi_it->next_for_ws.type == NEUTRAL_B
1731 || bidi_it->next_for_ws.type == NEUTRAL_S)))
1732 level = bidi_it->level_stack[0].level;
1733 else if ((level & 1) == 0) /* I1 */
1734 {
1735 if (type == STRONG_R)
1736 level++;
1737 else if (type == WEAK_EN || type == WEAK_AN)
1738 level += 2;
1739 }
1740 else /* I2 */
1741 {
1742 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
1743 level++;
1744 }
1745
1746 bidi_it->resolved_level = level;
1747 return level;
1748}
1749
1750/* Move to the other edge of a level given by LEVEL. If END_FLAG is
1751 non-zero, we are at the end of a level, and we need to prepare to
1752 resume the scan of the lower level.
1753
1754 If this level's other edge is cached, we simply jump to it, filling
1755 the iterator structure with the iterator state on the other edge.
1756 Otherwise, we walk the buffer until we come back to the same level
1757 as LEVEL.
1758
1759 Note: we are not talking here about a ``level run'' in the UAX#9
1760 sense of the term, but rather about a ``level'' which includes
1761 all the levels higher than it. In other words, given the levels
1762 like this:
1763
1764 11111112222222333333334443343222222111111112223322111
1765 A B C
1766
1767 and assuming we are at point A scanning left to right, this
1768 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
1769 at point B. */
1770static void
1771bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
1772{
1773 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
1774 int idx;
1775
1776 /* Try the cache first. */
1777 if ((idx = bidi_cache_find_level_change (level, dir, end_flag)) >= 0)
1778 bidi_cache_fetch_state (idx, bidi_it);
1779 else
1780 {
1781 int new_level;
1782
1783 if (end_flag)
1784 abort (); /* if we are at end of level, its edges must be cached */
1785
1786 bidi_cache_iterator_state (bidi_it, 1);
1787 do {
1788 new_level = bidi_level_of_next_char (bidi_it);
1789 bidi_cache_iterator_state (bidi_it, 1);
1790 } while (new_level >= level);
1791 }
1792}
1793
1794void
1795bidi_get_next_char_visually (struct bidi_it *bidi_it)
1796{
1797 int old_level, new_level, next_level;
9c82e145 1798 struct bidi_it sentinel;
b7b65b15
EZ
1799
1800 if (bidi_it->scan_dir == 0)
1801 {
1802 bidi_it->scan_dir = 1; /* default to logical order */
1803 }
1804
9c82e145 1805 /* Prepare the sentinel iterator state. */
b7b65b15 1806 if (bidi_cache_idx == 0)
9c82e145
EZ
1807 {
1808 bidi_copy_it (&sentinel, bidi_it);
1809 if (bidi_it->first_elt)
1810 {
1811 sentinel.charpos--; /* cached charpos needs to be monotonic */
1812 sentinel.bytepos--;
1813 sentinel.ch = '\n'; /* doesn't matter, but why not? */
1814 sentinel.ch_len = 1;
1815 }
1816 }
b7b65b15
EZ
1817
1818 old_level = bidi_it->resolved_level;
1819 new_level = bidi_level_of_next_char (bidi_it);
1820 if (bidi_it->ch == BIDI_EOB)
1821 return;
1822
1823 /* Reordering of resolved levels (clause L2) is implemented by
1824 jumping to the other edge of the level and flipping direction of
1825 scanning the buffer whenever we find a level change. */
1826 if (new_level != old_level)
1827 {
1828 int ascending = new_level > old_level;
1829 int level_to_search = ascending ? old_level + 1 : old_level;
1830 int incr = ascending ? 1 : -1;
1831 int expected_next_level = old_level + incr;
1832
1833 /* If we don't have anything cached yet, we need to cache the
9c82e145
EZ
1834 sentinel state, since we'll need it to record where to jump
1835 when the last non-base level is exhausted. */
b7b65b15 1836 if (bidi_cache_idx == 0)
9c82e145 1837 bidi_cache_iterator_state (&sentinel, 1);
b7b65b15
EZ
1838 /* Jump (or walk) to the other edge of this level. */
1839 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1840 /* Switch scan direction and peek at the next character in the
1841 new direction. */
1842 bidi_it->scan_dir = -bidi_it->scan_dir;
1843
1844 /* The following loop handles the case where the resolved level
1845 jumps by more than one. This is typical for numbers inside a
1846 run of text with left-to-right embedding direction, but can
1847 also happen in other situations. In those cases the decision
1848 where to continue after a level change, and in what direction,
1849 is tricky. For example, given a text like below:
1850
1851 abcdefgh
1852 11336622
1853
1854 (where the numbers below the text show the resolved levels),
1855 the result of reordering according to UAX#9 should be this:
1856
1857 efdcghba
1858
1859 This is implemented by the loop below which flips direction
1860 and jumps to the other edge of the level each time it finds
1861 the new level not to be the expected one. The expected level
1862 is always one more or one less than the previous one. */
1863 next_level = bidi_peek_at_next_level (bidi_it);
1864 while (next_level != expected_next_level)
1865 {
1866 expected_next_level += incr;
1867 level_to_search += incr;
1868 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1869 bidi_it->scan_dir = -bidi_it->scan_dir;
1870 next_level = bidi_peek_at_next_level (bidi_it);
1871 }
1872
1873 /* Finally, deliver the next character in the new direction. */
1874 next_level = bidi_level_of_next_char (bidi_it);
1875 }
1876
6bff6497
EZ
1877 /* Take note when we are at the end of the paragraph. The next time
1878 we are about to be called, next_element_from_buffer will
1879 automatically reinit the paragraph direction, if needed. */
1880 if (bidi_it->scan_dir == 1
1881 && bidi_it->type == NEUTRAL_B
1882 && bidi_it->bytepos < ZV_BYTE
1883 && bidi_at_paragraph_end (bidi_it->charpos + 1,
1884 bidi_it->bytepos + bidi_it->ch_len))
1885 bidi_it->new_paragraph = 1;
1886
b7b65b15
EZ
1887 if (bidi_it->scan_dir == 1 && bidi_cache_idx)
1888 {
1889 /* If we are at paragraph's base embedding level and beyond the
1890 last cached position, the cache's job is done and we can
1891 discard it. */
1892 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
1893 && bidi_it->charpos > bidi_cache[bidi_cache_idx - 1].charpos)
1894 bidi_cache_reset ();
1895 /* But as long as we are caching during forward scan, we must
1896 cache each state, or else the cache integrity will be
1897 compromised: it assumes cached states correspond to buffer
1898 positions 1:1. */
1899 else
1900 bidi_cache_iterator_state (bidi_it, 1);
1901 }
1902}
1903
1904/* This is meant to be called from within the debugger, whenever you
1905 wish to examine the cache contents. */
1906void
1907bidi_dump_cached_states (void)
1908{
1909 int i;
1910 int ndigits = 1;
1911
1912 if (bidi_cache_idx == 0)
1913 {
1914 fprintf (stderr, "The cache is empty.\n");
1915 return;
1916 }
1917 fprintf (stderr, "Total of %d state%s in cache:\n",
1918 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
1919
1920 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
1921 ndigits++;
1922 fputs ("ch ", stderr);
1923 for (i = 0; i < bidi_cache_idx; i++)
1924 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
1925 fputs ("\n", stderr);
1926 fputs ("lvl ", stderr);
1927 for (i = 0; i < bidi_cache_idx; i++)
1928 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
1929 fputs ("\n", stderr);
1930 fputs ("pos ", stderr);
1931 for (i = 0; i < bidi_cache_idx; i++)
1932 fprintf (stderr, "%*d", ndigits, bidi_cache[i].charpos);
1933 fputs ("\n", stderr);
1934}