Retrospective commit from 2009-10-10.
[bpt/emacs.git] / src / bidi.c
CommitLineData
b7b65b15 1/* Low-level bidirectional buffer-scanning functions for GNU Emacs.
c143c213 2 Copyright (C) 2000, 2001, 2004, 2005, 2009 Free Software Foundation, Inc.
b7b65b15
EZ
3
4This file is part of GNU Emacs.
5
6GNU Emacs is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation; either version 2, or (at your option)
9any later version.
10
11GNU Emacs is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14GNU General Public License for more details.
15
16
17You should have received a copy of the GNU General Public License
18along with GNU Emacs; see the file COPYING. If not, write to
19the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20Boston, MA 02110-1301, USA. */
21
2d6e4628
EZ
22/* Written by Eli Zaretskii <eliz@gnu.org>.
23
24 A sequential implementation of the Unicode Bidirectional algorithm,
b7b65b15
EZ
25 as per UAX#9, a part of the Unicode Standard.
26
27 Unlike the reference and most other implementations, this one is
28 designed to be called once for every character in the buffer.
29
30 The main entry point is bidi_get_next_char_visually. Each time it
31 is called, it finds the next character in the visual order, and
32 returns its information in a special structure. The caller is then
33 expected to process this character for display or any other
34 purposes, and call bidi_get_next_char_visually for the next
35 character. See the comments in bidi_get_next_char_visually for
36 more details about its algorithm that finds the next visual-order
37 character by resolving their levels on the fly.
38
89d3374a
EZ
39 If you want to understand the code, you will have to read it
40 together with the relevant portions of UAX#9. The comments include
41 references to UAX#9 rules, for that very reason.
42
b7b65b15
EZ
43 A note about references to UAX#9 rules: if the reference says
44 something like "X9/Retaining", it means that you need to refer to
45 rule X9 and to its modifications decribed in the "Implementation
46 Notes" section of UAX#9, under "Retaining Format Codes". */
47
48#ifdef HAVE_CONFIG_H
49#include <config.h>
50#endif
51
52#include <stdio.h>
53
54#ifdef HAVE_STRING_H
55#include <string.h>
56#endif
57
58#include "lisp.h"
59#include "buffer.h"
60#include "character.h"
61#include "dispextern.h"
62
63static int bidi_initialized = 0;
64
65static Lisp_Object bidi_type_table;
66
e7402cb2 67/* FIXME: Remove these when bidi_explicit_dir_char uses a lookup table. */
b7b65b15
EZ
68#define LRM_CHAR 0x200E
69#define RLM_CHAR 0x200F
70#define LRE_CHAR 0x202A
71#define RLE_CHAR 0x202B
72#define PDF_CHAR 0x202C
73#define LRO_CHAR 0x202D
74#define RLO_CHAR 0x202E
75
b7b65b15 76#define BIDI_EOB -1
e7402cb2 77#define BIDI_BOB -2 /* FIXME: Is this needed? */
b7b65b15 78
b7b65b15
EZ
79/* Local data structures. (Look in dispextern.h for the rest.) */
80
81/* What we need to know about the current paragraph. */
82struct bidi_paragraph_info {
83 int start_bytepos; /* byte position where it begins */
84 int end_bytepos; /* byte position where it ends */
85 int embedding_level; /* its basic embedding level */
86 bidi_dir_t base_dir; /* its base direction */
87};
88
89/* Data type for describing the bidirectional character categories. */
90typedef enum {
91 UNKNOWN_BC,
92 NEUTRAL,
93 WEAK,
94 STRONG
95} bidi_category_t;
96
97int bidi_ignore_explicit_marks_for_paragraph_level = 1;
98
6bff6497
EZ
99static Lisp_Object fallback_paragraph_start_re, fallback_paragraph_separate_re;
100static Lisp_Object Qparagraph_start, Qparagraph_separate;
b7b65b15 101
b7b65b15
EZ
102static void
103bidi_initialize ()
104{
105 /* FIXME: This should come from the Unicode Database. */
106 struct {
107 int from, to;
108 bidi_type_t type;
109 } bidi_type[] =
110 { { 0x0000, 0x0008, WEAK_BN },
111 { 0x0009, 0x0000, NEUTRAL_S },
112 { 0x000A, 0x0000, NEUTRAL_B },
113 { 0x000B, 0x0000, NEUTRAL_S },
114 { 0x000C, 0x0000, NEUTRAL_WS },
115 { 0x000D, 0x0000, NEUTRAL_B },
116 { 0x000E, 0x001B, WEAK_BN },
117 { 0x001C, 0x001E, NEUTRAL_B },
118 { 0x001F, 0x0000, NEUTRAL_S },
119 { 0x0020, 0x0000, NEUTRAL_WS },
120 { 0x0021, 0x0022, NEUTRAL_ON },
121 { 0x0023, 0x0025, WEAK_ET },
122 { 0x0026, 0x002A, NEUTRAL_ON },
e342a24d 123 { 0x002B, 0x0000, WEAK_ES },
b7b65b15 124 { 0x002C, 0x0000, WEAK_CS },
e342a24d
EZ
125 { 0x002D, 0x0000, WEAK_ES },
126 { 0x002E, 0x002F, WEAK_CS },
b7b65b15
EZ
127 { 0x0030, 0x0039, WEAK_EN },
128 { 0x003A, 0x0000, WEAK_CS },
129 { 0x003B, 0x0040, NEUTRAL_ON },
130 { 0x005B, 0x0060, NEUTRAL_ON },
131 { 0x007B, 0x007E, NEUTRAL_ON },
132 { 0x007F, 0x0084, WEAK_BN },
133 { 0x0085, 0x0000, NEUTRAL_B },
134 { 0x0086, 0x009F, WEAK_BN },
135 { 0x00A0, 0x0000, WEAK_CS },
136 { 0x00A1, 0x0000, NEUTRAL_ON },
137 { 0x00A2, 0x00A5, WEAK_ET },
138 { 0x00A6, 0x00A9, NEUTRAL_ON },
e342a24d
EZ
139 { 0x00AB, 0x00AC, NEUTRAL_ON },
140 { 0x00AD, 0x0000, WEAK_BN },
141 { 0x00AE, 0x00Af, NEUTRAL_ON },
b7b65b15
EZ
142 { 0x00B0, 0x00B1, WEAK_ET },
143 { 0x00B2, 0x00B3, WEAK_EN },
144 { 0x00B4, 0x0000, NEUTRAL_ON },
145 { 0x00B6, 0x00B8, NEUTRAL_ON },
146 { 0x00B9, 0x0000, WEAK_EN },
147 { 0x00BB, 0x00BF, NEUTRAL_ON },
148 { 0x00D7, 0x0000, NEUTRAL_ON },
149 { 0x00F7, 0x0000, NEUTRAL_ON },
150 { 0x02B9, 0x02BA, NEUTRAL_ON },
151 { 0x02C2, 0x02CF, NEUTRAL_ON },
152 { 0x02D2, 0x02DF, NEUTRAL_ON },
153 { 0x02E5, 0x02ED, NEUTRAL_ON },
154 { 0x0300, 0x036F, WEAK_NSM },
155 { 0x0374, 0x0375, NEUTRAL_ON },
156 { 0x037E, 0x0385, NEUTRAL_ON },
157 { 0x0387, 0x0000, NEUTRAL_ON },
158 { 0x03F6, 0x0000, NEUTRAL_ON },
159 { 0x0483, 0x0489, WEAK_NSM },
160 { 0x058A, 0x0000, NEUTRAL_ON },
161 { 0x0591, 0x05BD, WEAK_NSM },
162 { 0x05BE, 0x0000, STRONG_R },
163 { 0x05BF, 0x0000, WEAK_NSM },
164 { 0x05C0, 0x0000, STRONG_R },
165 { 0x05C1, 0x05C2, WEAK_NSM },
166 { 0x05C3, 0x0000, STRONG_R },
e342a24d
EZ
167 { 0x05C4, 0x05C5, WEAK_NSM },
168 { 0x05C6, 0x0000, STRONG_R },
169 { 0x05C7, 0x0000, WEAK_NSM },
b7b65b15
EZ
170 { 0x05D0, 0x05F4, STRONG_R },
171 { 0x060C, 0x0000, WEAK_CS },
172 { 0x061B, 0x064A, STRONG_AL },
173 { 0x064B, 0x0655, WEAK_NSM },
174 { 0x0660, 0x0669, WEAK_AN },
175 { 0x066A, 0x0000, WEAK_ET },
176 { 0x066B, 0x066C, WEAK_AN },
177 { 0x066D, 0x066F, STRONG_AL },
178 { 0x0670, 0x0000, WEAK_NSM },
179 { 0x0671, 0x06D5, STRONG_AL },
180 { 0x06D6, 0x06DC, WEAK_NSM },
181 { 0x06DD, 0x0000, STRONG_AL },
182 { 0x06DE, 0x06E4, WEAK_NSM },
183 { 0x06E5, 0x06E6, STRONG_AL },
184 { 0x06E7, 0x06E8, WEAK_NSM },
185 { 0x06E9, 0x0000, NEUTRAL_ON },
186 { 0x06EA, 0x06ED, WEAK_NSM },
187 { 0x06F0, 0x06F9, WEAK_EN },
188 { 0x06FA, 0x070D, STRONG_AL },
189 { 0x070F, 0x0000, WEAK_BN },
190 { 0x0710, 0x0000, STRONG_AL },
191 { 0x0711, 0x0000, WEAK_NSM },
192 { 0x0712, 0x072C, STRONG_AL },
193 { 0x0730, 0x074A, WEAK_NSM },
194 { 0x0780, 0x07A5, STRONG_AL },
195 { 0x07A6, 0x07B0, WEAK_NSM },
196 { 0x07B1, 0x0000, STRONG_AL },
197 { 0x0901, 0x0902, WEAK_NSM },
198 { 0x093C, 0x0000, WEAK_NSM },
199 { 0x0941, 0x0948, WEAK_NSM },
200 { 0x094D, 0x0000, WEAK_NSM },
201 { 0x0951, 0x0954, WEAK_NSM },
202 { 0x0962, 0x0963, WEAK_NSM },
203 { 0x0981, 0x0000, WEAK_NSM },
204 { 0x09BC, 0x0000, WEAK_NSM },
205 { 0x09C1, 0x09C4, WEAK_NSM },
206 { 0x09CD, 0x0000, WEAK_NSM },
207 { 0x09E2, 0x09E3, WEAK_NSM },
208 { 0x09F2, 0x09F3, WEAK_ET },
209 { 0x0A02, 0x0000, WEAK_NSM },
210 { 0x0A3C, 0x0000, WEAK_NSM },
211 { 0x0A41, 0x0A4D, WEAK_NSM },
212 { 0x0A70, 0x0A71, WEAK_NSM },
213 { 0x0A81, 0x0A82, WEAK_NSM },
214 { 0x0ABC, 0x0000, WEAK_NSM },
215 { 0x0AC1, 0x0AC8, WEAK_NSM },
216 { 0x0ACD, 0x0000, WEAK_NSM },
217 { 0x0B01, 0x0000, WEAK_NSM },
218 { 0x0B3C, 0x0000, WEAK_NSM },
219 { 0x0B3F, 0x0000, WEAK_NSM },
220 { 0x0B41, 0x0B43, WEAK_NSM },
221 { 0x0B4D, 0x0B56, WEAK_NSM },
222 { 0x0B82, 0x0000, WEAK_NSM },
223 { 0x0BC0, 0x0000, WEAK_NSM },
224 { 0x0BCD, 0x0000, WEAK_NSM },
225 { 0x0C3E, 0x0C40, WEAK_NSM },
226 { 0x0C46, 0x0C56, WEAK_NSM },
227 { 0x0CBF, 0x0000, WEAK_NSM },
228 { 0x0CC6, 0x0000, WEAK_NSM },
229 { 0x0CCC, 0x0CCD, WEAK_NSM },
230 { 0x0D41, 0x0D43, WEAK_NSM },
231 { 0x0D4D, 0x0000, WEAK_NSM },
232 { 0x0DCA, 0x0000, WEAK_NSM },
233 { 0x0DD2, 0x0DD6, WEAK_NSM },
234 { 0x0E31, 0x0000, WEAK_NSM },
235 { 0x0E34, 0x0E3A, WEAK_NSM },
236 { 0x0E3F, 0x0000, WEAK_ET },
237 { 0x0E47, 0x0E4E, WEAK_NSM },
238 { 0x0EB1, 0x0000, WEAK_NSM },
239 { 0x0EB4, 0x0EBC, WEAK_NSM },
240 { 0x0EC8, 0x0ECD, WEAK_NSM },
241 { 0x0F18, 0x0F19, WEAK_NSM },
242 { 0x0F35, 0x0000, WEAK_NSM },
243 { 0x0F37, 0x0000, WEAK_NSM },
244 { 0x0F39, 0x0000, WEAK_NSM },
245 { 0x0F3A, 0x0F3D, NEUTRAL_ON },
246 { 0x0F71, 0x0F7E, WEAK_NSM },
247 { 0x0F80, 0x0F84, WEAK_NSM },
248 { 0x0F86, 0x0F87, WEAK_NSM },
249 { 0x0F90, 0x0FBC, WEAK_NSM },
250 { 0x0FC6, 0x0000, WEAK_NSM },
251 { 0x102D, 0x1030, WEAK_NSM },
252 { 0x1032, 0x1037, WEAK_NSM },
253 { 0x1039, 0x0000, WEAK_NSM },
254 { 0x1058, 0x1059, WEAK_NSM },
255 { 0x1680, 0x0000, NEUTRAL_WS },
256 { 0x169B, 0x169C, NEUTRAL_ON },
257 { 0x1712, 0x1714, WEAK_NSM },
258 { 0x1732, 0x1734, WEAK_NSM },
259 { 0x1752, 0x1753, WEAK_NSM },
260 { 0x1772, 0x1773, WEAK_NSM },
261 { 0x17B7, 0x17BD, WEAK_NSM },
262 { 0x17C6, 0x0000, WEAK_NSM },
263 { 0x17C9, 0x17D3, WEAK_NSM },
264 { 0x17DB, 0x0000, WEAK_ET },
265 { 0x1800, 0x180A, NEUTRAL_ON },
266 { 0x180B, 0x180D, WEAK_NSM },
267 { 0x180E, 0x0000, WEAK_BN },
268 { 0x18A9, 0x0000, WEAK_NSM },
269 { 0x1FBD, 0x0000, NEUTRAL_ON },
270 { 0x1FBF, 0x1FC1, NEUTRAL_ON },
271 { 0x1FCD, 0x1FCF, NEUTRAL_ON },
272 { 0x1FDD, 0x1FDF, NEUTRAL_ON },
273 { 0x1FED, 0x1FEF, NEUTRAL_ON },
274 { 0x1FFD, 0x1FFE, NEUTRAL_ON },
275 { 0x2000, 0x200A, NEUTRAL_WS },
276 { 0x200B, 0x200D, WEAK_BN },
277 { 0x200F, 0x0000, STRONG_R },
278 { 0x2010, 0x2027, NEUTRAL_ON },
279 { 0x2028, 0x0000, NEUTRAL_WS },
280 { 0x2029, 0x0000, NEUTRAL_B },
281 { 0x202A, 0x0000, LRE },
282 { 0x202B, 0x0000, RLE },
283 { 0x202C, 0x0000, PDF },
284 { 0x202D, 0x0000, LRO },
285 { 0x202E, 0x0000, RLO },
286 { 0x202F, 0x0000, NEUTRAL_WS },
287 { 0x2030, 0x2034, WEAK_ET },
288 { 0x2035, 0x2057, NEUTRAL_ON },
289 { 0x205F, 0x0000, NEUTRAL_WS },
290 { 0x2060, 0x206F, WEAK_BN },
291 { 0x2070, 0x0000, WEAK_EN },
292 { 0x2074, 0x2079, WEAK_EN },
293 { 0x207A, 0x207B, WEAK_ET },
294 { 0x207C, 0x207E, NEUTRAL_ON },
295 { 0x2080, 0x2089, WEAK_EN },
296 { 0x208A, 0x208B, WEAK_ET },
297 { 0x208C, 0x208E, NEUTRAL_ON },
298 { 0x20A0, 0x20B1, WEAK_ET },
299 { 0x20D0, 0x20EA, WEAK_NSM },
300 { 0x2100, 0x2101, NEUTRAL_ON },
301 { 0x2103, 0x2106, NEUTRAL_ON },
302 { 0x2108, 0x2109, NEUTRAL_ON },
303 { 0x2114, 0x0000, NEUTRAL_ON },
304 { 0x2116, 0x2118, NEUTRAL_ON },
305 { 0x211E, 0x2123, NEUTRAL_ON },
306 { 0x2125, 0x0000, NEUTRAL_ON },
307 { 0x2127, 0x0000, NEUTRAL_ON },
308 { 0x2129, 0x0000, NEUTRAL_ON },
309 { 0x212E, 0x0000, WEAK_ET },
310 { 0x2132, 0x0000, NEUTRAL_ON },
311 { 0x213A, 0x0000, NEUTRAL_ON },
312 { 0x2140, 0x2144, NEUTRAL_ON },
313 { 0x214A, 0x215F, NEUTRAL_ON },
314 { 0x2190, 0x2211, NEUTRAL_ON },
315 { 0x2212, 0x2213, WEAK_ET },
316 { 0x2214, 0x2335, NEUTRAL_ON },
317 { 0x237B, 0x2394, NEUTRAL_ON },
318 { 0x2396, 0x244A, NEUTRAL_ON },
319 { 0x2460, 0x249B, WEAK_EN },
320 { 0x24EA, 0x0000, WEAK_EN },
321 { 0x24EB, 0x2FFB, NEUTRAL_ON },
322 { 0x3000, 0x0000, NEUTRAL_WS },
323 { 0x3001, 0x3004, NEUTRAL_ON },
324 { 0x3008, 0x3020, NEUTRAL_ON },
325 { 0x302A, 0x302F, WEAK_NSM },
326 { 0x3030, 0x0000, NEUTRAL_ON },
327 { 0x3036, 0x3037, NEUTRAL_ON },
328 { 0x303D, 0x303F, NEUTRAL_ON },
329 { 0x3099, 0x309A, WEAK_NSM },
330 { 0x309B, 0x309C, NEUTRAL_ON },
331 { 0x30A0, 0x0000, NEUTRAL_ON },
332 { 0x30FB, 0x0000, NEUTRAL_ON },
333 { 0x3251, 0x325F, NEUTRAL_ON },
334 { 0x32B1, 0x32BF, NEUTRAL_ON },
335 { 0xA490, 0xA4C6, NEUTRAL_ON },
336 { 0xFB1D, 0x0000, STRONG_R },
337 { 0xFB1E, 0x0000, WEAK_NSM },
338 { 0xFB1F, 0xFB28, STRONG_R },
339 { 0xFB29, 0x0000, WEAK_ET },
340 { 0xFB2A, 0xFB4F, STRONG_R },
341 { 0xFB50, 0xFD3D, STRONG_AL },
342 { 0xFD3E, 0xFD3F, NEUTRAL_ON },
343 { 0xFD50, 0xFDFC, STRONG_AL },
344 { 0xFE00, 0xFE23, WEAK_NSM },
345 { 0xFE30, 0xFE4F, NEUTRAL_ON },
346 { 0xFE50, 0x0000, WEAK_CS },
347 { 0xFE51, 0x0000, NEUTRAL_ON },
348 { 0xFE52, 0x0000, WEAK_CS },
349 { 0xFE54, 0x0000, NEUTRAL_ON },
350 { 0xFE55, 0x0000, WEAK_CS },
351 { 0xFE56, 0xFE5E, NEUTRAL_ON },
352 { 0xFE5F, 0x0000, WEAK_ET },
353 { 0xFE60, 0xFE61, NEUTRAL_ON },
354 { 0xFE62, 0xFE63, WEAK_ET },
355 { 0xFE64, 0xFE68, NEUTRAL_ON },
356 { 0xFE69, 0xFE6A, WEAK_ET },
357 { 0xFE6B, 0x0000, NEUTRAL_ON },
358 { 0xFE70, 0xFEFC, STRONG_AL },
359 { 0xFEFF, 0x0000, WEAK_BN },
360 { 0xFF01, 0xFF02, NEUTRAL_ON },
361 { 0xFF03, 0xFF05, WEAK_ET },
362 { 0xFF06, 0xFF0A, NEUTRAL_ON },
363 { 0xFF0B, 0x0000, WEAK_ET },
364 { 0xFF0C, 0x0000, WEAK_CS },
365 { 0xFF0D, 0x0000, WEAK_ET },
366 { 0xFF0E, 0x0000, WEAK_CS },
367 { 0xFF0F, 0x0000, WEAK_ES },
368 { 0xFF10, 0xFF19, WEAK_EN },
369 { 0xFF1A, 0x0000, WEAK_CS },
370 { 0xFF1B, 0xFF20, NEUTRAL_ON },
371 { 0xFF3B, 0xFF40, NEUTRAL_ON },
372 { 0xFF5B, 0xFF65, NEUTRAL_ON },
373 { 0xFFE0, 0xFFE1, WEAK_ET },
374 { 0xFFE2, 0xFFE4, NEUTRAL_ON },
375 { 0xFFE5, 0xFFE6, WEAK_ET },
376 { 0xFFE8, 0xFFEE, NEUTRAL_ON },
377 { 0xFFF9, 0xFFFB, WEAK_BN },
378 { 0xFFFC, 0xFFFD, NEUTRAL_ON },
379 { 0x1D167, 0x1D169, WEAK_NSM },
380 { 0x1D173, 0x1D17A, WEAK_BN },
381 { 0x1D17B, 0x1D182, WEAK_NSM },
382 { 0x1D185, 0x1D18B, WEAK_NSM },
383 { 0x1D1AA, 0x1D1AD, WEAK_NSM },
384 { 0x1D7CE, 0x1D7FF, WEAK_EN },
385 { 0xE0001, 0xE007F, WEAK_BN } };
386 int i;
387
388 bidi_type_table = Fmake_char_table (Qnil, make_number (STRONG_L));
2d6e4628 389 staticpro (&bidi_type_table);
b7b65b15
EZ
390
391 for (i = 0; i < sizeof bidi_type / sizeof bidi_type[0]; i++)
f44e260c
EZ
392 char_table_set_range (bidi_type_table, bidi_type[i].from,
393 bidi_type[i].to ? bidi_type[i].to : bidi_type[i].from,
b7b65b15 394 make_number (bidi_type[i].type));
6bff6497
EZ
395
396 fallback_paragraph_start_re =
397 XSYMBOL (Fintern_soft (build_string ("paragraph-start"), Qnil))->value;
398 if (!STRINGP (fallback_paragraph_start_re))
399 fallback_paragraph_start_re = build_string ("\f\\|[ \t]*$");
400 staticpro (&fallback_paragraph_start_re);
401 Qparagraph_start = intern ("paragraph-start");
402 staticpro (&Qparagraph_start);
403 fallback_paragraph_separate_re =
404 XSYMBOL (Fintern_soft (build_string ("paragraph-separate"), Qnil))->value;
405 if (!STRINGP (fallback_paragraph_separate_re))
406 fallback_paragraph_separate_re = build_string ("[ \t\f]*$");
407 staticpro (&fallback_paragraph_separate_re);
408 Qparagraph_separate = intern ("paragraph-separate");
409 staticpro (&Qparagraph_separate);
b7b65b15
EZ
410 bidi_initialized = 1;
411}
412
6bff6497
EZ
413/* Return the bidi type of a character CH, subject to the current
414 directional OVERRIDE. */
b7b65b15 415bidi_type_t
6bff6497 416bidi_get_type (int ch, bidi_dir_t override)
b7b65b15 417{
6bff6497
EZ
418 bidi_type_t default_type;
419
e7402cb2
EZ
420 if (ch == BIDI_EOB)
421 return NEUTRAL_B;
e342a24d
EZ
422 if (ch < 0 || ch > MAX_CHAR)
423 abort ();
6bff6497
EZ
424
425 default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch));
426
427 if (override == NEUTRAL_DIR)
428 return default_type;
429
430 switch (default_type)
431 {
432 /* Although UAX#9 does not tell, it doesn't make sense to
433 override NEUTRAL_B and LRM/RLM characters. */
434 case NEUTRAL_B:
435 case LRE:
436 case LRO:
437 case RLE:
438 case RLO:
439 case PDF:
440 return default_type;
441 default:
442 switch (ch)
443 {
444 case LRM_CHAR:
445 case RLM_CHAR:
446 return default_type;
447 default:
448 if (override == L2R) /* X6 */
449 return STRONG_L;
450 else if (override == R2L)
451 return STRONG_R;
452 else
453 abort (); /* can't happen: handled above */
454 }
455 }
b7b65b15
EZ
456}
457
2d6e4628
EZ
458void
459bidi_check_type (bidi_type_t type)
460{
461 if (type < UNKNOWN_BT || type > NEUTRAL_ON)
462 abort ();
463}
464
b7b65b15
EZ
465/* Given a bidi TYPE of a character, return its category. */
466bidi_category_t
467bidi_get_category (bidi_type_t type)
468{
469 switch (type)
470 {
471 case UNKNOWN_BT:
472 return UNKNOWN_BC;
473 case STRONG_L:
474 case STRONG_R:
475 case STRONG_AL:
476 case LRE:
477 case LRO:
478 case RLE:
479 case RLO:
480 return STRONG;
481 case PDF: /* ??? really?? */
482 case WEAK_EN:
483 case WEAK_ES:
484 case WEAK_ET:
485 case WEAK_AN:
486 case WEAK_CS:
487 case WEAK_NSM:
488 case WEAK_BN:
489 return WEAK;
490 case NEUTRAL_B:
491 case NEUTRAL_S:
492 case NEUTRAL_WS:
493 case NEUTRAL_ON:
494 return NEUTRAL;
495 default:
496 abort ();
497 }
498}
499
e342a24d
EZ
500/* Return the mirrored character of C, if any.
501
502 Note: The conditions in UAX#9 clause L4 must be tested by the
503 caller. */
b7b65b15
EZ
504/* FIXME: exceedingly temporary! Should consult the Unicode database
505 of character properties. */
506int
507bidi_mirror_char (int c)
508{
509 static const char mirrored_pairs[] = "()<>[]{}";
510 const char *p = strchr (mirrored_pairs, c);
511
512 if (p)
513 {
514 size_t i = p - mirrored_pairs;
515
516 if ((i & 1) == 0)
517 return mirrored_pairs[i + 1];
518 else
519 return mirrored_pairs[i - 1];
520 }
521 return c;
522}
523
524/* Copy the bidi iterator from FROM to TO. To save cycles, this only
525 copies the part of the level stack that is actually in use. */
526static inline void
527bidi_copy_it (struct bidi_it *to, struct bidi_it *from)
528{
9c82e145 529 int save_first_elt = to->first_elt;
b7b65b15
EZ
530 int i;
531
532 /* Copy everything except the level stack. */
533 memcpy (to, from, ((int)&((struct bidi_it *)0)->level_stack[0]));
e5a2fec7 534 /* Don't copy FIRST_ELT flag. */
9c82e145
EZ
535 to->first_elt = save_first_elt;
536 if (to->first_elt != 0 && to->first_elt != 1)
537 to->first_elt = 0;
b7b65b15
EZ
538
539 /* Copy the active part of the level stack. */
540 to->level_stack[0] = from->level_stack[0]; /* level zero is always in use */
541 for (i = 1; i <= from->stack_idx; i++)
542 to->level_stack[i] = from->level_stack[i];
543}
544
545/* Caching the bidi iterator states. */
546
547static struct bidi_it bidi_cache[1000]; /* FIXME: make this dynamically allocated! */
548static int bidi_cache_idx;
549static int bidi_cache_last_idx;
550
551static inline void
552bidi_cache_reset (void)
553{
554 bidi_cache_idx = 0;
555 bidi_cache_last_idx = -1;
556}
557
558static inline void
559bidi_cache_fetch_state (int idx, struct bidi_it *bidi_it)
560{
561 int current_scan_dir = bidi_it->scan_dir;
562
563 if (idx < 0 || idx >= bidi_cache_idx)
564 abort ();
565
566 bidi_copy_it (bidi_it, &bidi_cache[idx]);
567 bidi_it->scan_dir = current_scan_dir;
568 bidi_cache_last_idx = idx;
569}
570
571/* Find a cached state with a given CHARPOS and resolved embedding
572 level less or equal to LEVEL. if LEVEL is -1, disregard the
573 resolved levels in cached states. DIR, if non-zero, means search
574 in that direction from the last cache hit. */
575static inline int
576bidi_cache_search (int charpos, int level, int dir)
577{
578 int i, i_start;
579
580 if (bidi_cache_idx)
581 {
582 if (charpos < bidi_cache[bidi_cache_last_idx].charpos)
583 dir = -1;
584 else if (charpos > bidi_cache[bidi_cache_last_idx].charpos)
585 dir = 1;
586 if (dir)
587 i_start = bidi_cache_last_idx;
588 else
589 {
590 dir = -1;
591 i_start = bidi_cache_idx - 1;
592 }
593
594 if (dir < 0)
595 {
596 /* Linear search for now; FIXME! */
597 for (i = i_start; i >= 0; i--)
598 if (bidi_cache[i].charpos == charpos
599 && (level == -1 || bidi_cache[i].resolved_level <= level))
600 return i;
601 }
602 else
603 {
604 for (i = i_start; i < bidi_cache_idx; i++)
605 if (bidi_cache[i].charpos == charpos
606 && (level == -1 || bidi_cache[i].resolved_level <= level))
607 return i;
608 }
609 }
610
611 return -1;
612}
613
614/* Find a cached state where the resolved level changes to a value
615 that is lower than LEVEL, and return its cache slot index. DIR is
616 the direction to search, starting with the last used cache slot.
617 BEFORE, if non-zero, means return the index of the slot that is
618 ``before'' the level change in the search direction. That is,
619 given the cached levels like this:
620
621 1122333442211
622 AB C
623
624 and assuming we are at the position cached at the slot marked with
625 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
626 index of slot B or A, depending whether BEFORE is, respectively,
627 non-zero or zero. */
628static int
629bidi_cache_find_level_change (int level, int dir, int before)
630{
631 if (bidi_cache_idx)
632 {
633 int i = dir ? bidi_cache_last_idx : bidi_cache_idx - 1;
634 int incr = before ? 1 : 0;
635
636 if (!dir)
637 dir = -1;
638 else if (!incr)
639 i += dir;
640
641 if (dir < 0)
642 {
643 while (i >= incr)
644 {
645 if (bidi_cache[i - incr].resolved_level >= 0
646 && bidi_cache[i - incr].resolved_level < level)
647 return i;
648 i--;
649 }
650 }
651 else
652 {
653 while (i < bidi_cache_idx - incr)
654 {
655 if (bidi_cache[i + incr].resolved_level >= 0
656 && bidi_cache[i + incr].resolved_level < level)
657 return i;
658 i++;
659 }
660 }
661 }
662
663 return -1;
664}
665
666static inline void
667bidi_cache_iterator_state (struct bidi_it *bidi_it, int resolved)
668{
669 int idx;
670
671 /* We should never cache on backward scans. */
672 if (bidi_it->scan_dir == -1)
673 abort ();
674 idx = bidi_cache_search (bidi_it->charpos, -1, 1);
675
676 if (idx < 0)
677 {
678 idx = bidi_cache_idx;
e5a2fec7 679 /* Don't overrun the cache limit. */
b7b65b15
EZ
680 if (idx > sizeof (bidi_cache) / sizeof (bidi_cache[0]) - 1)
681 abort ();
e5a2fec7
EZ
682 /* Don't violate cache integrity: character positions should
683 correspond to cache positions 1:1. */
684 if (idx > 0 && bidi_it->charpos != bidi_cache[idx - 1].charpos + 1)
685 abort ();
b7b65b15
EZ
686 bidi_copy_it (&bidi_cache[idx], bidi_it);
687 if (!resolved)
688 bidi_cache[idx].resolved_level = -1;
e5a2fec7 689 bidi_cache[idx].new_paragraph = 0;
b7b65b15
EZ
690 }
691 else
692 {
693 /* Copy only the members which could have changed, to avoid
694 costly copying of the entire struct. */
695 bidi_cache[idx].type = bidi_it->type;
2d6e4628 696 bidi_check_type (bidi_it->type);
89d3374a
EZ
697 bidi_cache[idx].type_after_w1 = bidi_it->type_after_w1;
698 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
699 if (resolved)
700 bidi_cache[idx].resolved_level = bidi_it->resolved_level;
701 else
702 bidi_cache[idx].resolved_level = -1;
703 bidi_cache[idx].invalid_levels = bidi_it->invalid_levels;
704 bidi_cache[idx].invalid_rl_levels = bidi_it->invalid_rl_levels;
705 bidi_cache[idx].next_for_neutral = bidi_it->next_for_neutral;
706 bidi_cache[idx].next_for_ws = bidi_it->next_for_ws;
707 bidi_cache[idx].ignore_bn_limit = bidi_it->ignore_bn_limit;
708 }
709
710 bidi_cache_last_idx = idx;
711 if (idx >= bidi_cache_idx)
712 bidi_cache_idx = idx + 1;
713}
714
715static inline bidi_type_t
716bidi_cache_find (int charpos, int level, struct bidi_it *bidi_it)
717{
718 int i = bidi_cache_search (charpos, level, bidi_it->scan_dir);
719
720 if (i >= 0)
721 {
722 bidi_dir_t current_scan_dir = bidi_it->scan_dir;
723
724 *bidi_it = bidi_cache[i];
725 bidi_cache_last_idx = i;
726 /* Don't let scan direction from from the cached state override
727 the current scan direction. */
728 bidi_it->scan_dir = current_scan_dir;
729 return bidi_it->type;
730 }
731
732 return UNKNOWN_BT;
733}
734
735static inline int
736bidi_peek_at_next_level (struct bidi_it *bidi_it)
737{
738 if (bidi_cache_idx == 0 || bidi_cache_last_idx == -1)
739 abort ();
740 return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level;
741}
742
be39f003
EZ
743/* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
744 Value is the non-negative length of the paragraph separator
745 following the buffer position, -1 if position is at the beginning
746 of a new paragraph, or -2 if position is neither at beginning nor
747 at end of a paragraph. */
748EMACS_INT
6bff6497 749bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos)
b7b65b15 750{
be39f003
EZ
751 Lisp_Object sep_re = Fbuffer_local_value (Qparagraph_separate,
752 Fcurrent_buffer ());
753 Lisp_Object start_re = Fbuffer_local_value (Qparagraph_start,
754 Fcurrent_buffer ());
755 EMACS_INT val;
756
757 if (!STRINGP (sep_re))
758 sep_re = fallback_paragraph_separate_re;
759 if (!STRINGP (start_re))
760 start_re = fallback_paragraph_start_re;
761
762 val = fast_looking_at (sep_re, charpos, bytepos, ZV, ZV_BYTE, Qnil);
763 if (val < 0)
764 {
765 if (fast_looking_at (start_re, charpos, bytepos, ZV, ZV_BYTE, Qnil) >= 0)
766 val = -1;
767 else
768 val = -2;
769 }
b7b65b15 770
be39f003 771 return val;
b7b65b15
EZ
772}
773
774/* Determine the start-of-run (sor) directional type given the two
775 embedding levels on either side of the run boundary. Also, update
776 the saved info about previously seen characters, since that info is
777 generally valid for a single level run. */
778static inline void
779bidi_set_sor_type (struct bidi_it *bidi_it, int level_before, int level_after)
780{
781 int higher_level = level_before > level_after ? level_before : level_after;
782
783 /* The prev_was_pdf gork is required for when we have several PDFs
784 in a row. In that case, we want to compute the sor type for the
785 next level run only once: when we see the first PDF. That's
786 because the sor type depends only on the higher of the two levels
787 that we find on the two sides of the level boundary (see UAX#9,
788 clause X10), and so we don't need to know the final embedding
789 level to which we descend after processing all the PDFs. */
e342a24d 790 if (!bidi_it->prev_was_pdf || level_before < level_after)
b7b65b15
EZ
791 /* FIXME: should the default sor direction be user selectable? */
792 bidi_it->sor = (higher_level & 1) != 0 ? R2L : L2R;
793 if (level_before > level_after)
794 bidi_it->prev_was_pdf = 1;
795
796 bidi_it->prev.type = UNKNOWN_BT;
89d3374a
EZ
797 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
798 bidi_it->last_strong.orig_type = UNKNOWN_BT;
b7b65b15
EZ
799 bidi_it->prev_for_neutral.type = bidi_it->sor == R2L ? STRONG_R : STRONG_L;
800 bidi_it->prev_for_neutral.charpos = bidi_it->charpos;
801 bidi_it->prev_for_neutral.bytepos = bidi_it->bytepos;
89d3374a
EZ
802 bidi_it->next_for_neutral.type = bidi_it->next_for_neutral.type_after_w1 =
803 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15
EZ
804 bidi_it->ignore_bn_limit = 0; /* meaning it's unknown */
805}
806
6bff6497 807static void
be39f003
EZ
808bidi_line_init (struct bidi_it *bidi_it)
809{
810 bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */
811 bidi_it->resolved_level = bidi_it->level_stack[0].level;
812 bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */
813 bidi_it->invalid_levels = 0;
814 bidi_it->invalid_rl_levels = -1;
815 bidi_it->next_en_pos = -1;
816 bidi_it->next_for_ws.type = UNKNOWN_BT;
b44d9321
EZ
817 bidi_set_sor_type (bidi_it,
818 bidi_it->paragraph_dir == R2L ? 1 : 0,
be39f003
EZ
819 bidi_it->level_stack[0].level); /* X10 */
820
821 bidi_cache_reset ();
822}
823
824/* Find the beginning of this paragraph by looking back in the buffer.
825 Value is the byte position of the paragraph's beginning. */
826static EMACS_INT
b44d9321 827bidi_find_paragraph_start (EMACS_INT pos, EMACS_INT pos_byte)
6bff6497 828{
be39f003 829 Lisp_Object re = Fbuffer_local_value (Qparagraph_start, Fcurrent_buffer ());
6bff6497
EZ
830 EMACS_INT limit = ZV, limit_byte = ZV_BYTE;
831
832 if (!STRINGP (re))
833 re = fallback_paragraph_start_re;
834 while (pos_byte > BEGV_BYTE
835 && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0)
836 {
be39f003
EZ
837 pos = find_next_newline_no_quit (pos - 1, -1);
838 pos_byte = CHAR_TO_BYTE (pos);
6bff6497 839 }
be39f003 840 return pos_byte;
6bff6497
EZ
841}
842
be39f003 843/* Determine the direction, a.k.a. base embedding level, of the
b44d9321
EZ
844 paragraph we are about to iterate through. If DIR is either L2R or
845 R2L, just use that. Otherwise, determine the paragraph direction
846 from the first strong character of the paragraph.
847
848 Note that this gives the paragraph separator the same direction as
849 the preceding paragraph, even though Emacs generally views the
850 separartor as not belonging to any paragraph. */
b7b65b15
EZ
851void
852bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it)
853{
6bff6497 854 EMACS_INT bytepos = bidi_it->bytepos;
e7402cb2 855
9c82e145
EZ
856 /* We should never be called at EOB or before BEGV. */
857 if (bytepos >= ZV_BYTE || bytepos < BEGV_BYTE)
858 abort ();
859
be39f003
EZ
860 if (dir == L2R)
861 {
862 bidi_it->paragraph_dir = L2R;
863 bidi_it->new_paragraph = 0;
864 }
865 else if (dir == R2L)
866 {
867 bidi_it->paragraph_dir = R2L;
868 bidi_it->new_paragraph = 0;
869 }
b7b65b15
EZ
870 else if (dir == NEUTRAL_DIR) /* P2 */
871 {
6bff6497
EZ
872 int ch, ch_len;
873 EMACS_INT pos;
874 bidi_type_t type;
be39f003
EZ
875 EMACS_INT sep_len;
876
877 /* If we are inside a paragraph separator, we are just waiting
878 for the separator to be exhausted; use the previous paragraph
e5a2fec7
EZ
879 direction. But don't do that if we have been just reseated,
880 because we need to reinitialize below in that case. */
881 if (!bidi_it->first_elt
882 && bidi_it->charpos < bidi_it->separator_limit)
be39f003
EZ
883 return;
884
b44d9321
EZ
885 /* If we are on a newline, get past it to where the next
886 paragraph might start. */
c143c213 887 pos = bidi_it->charpos;
b44d9321 888 if (FETCH_CHAR (bytepos) == '\n')
be39f003 889 {
b44d9321 890 bytepos++;
c143c213 891 pos++;
be39f003 892 }
b44d9321
EZ
893
894 /* We are either at the beginning of a paragraph or in the
895 middle of it. Find where this paragraph starts. */
896 bytepos = bidi_find_paragraph_start (pos, bytepos);
6bff6497
EZ
897
898 /* We should always be at the beginning of a new line at this
899 point. */
b44d9321 900 if (!(bytepos == BEGV_BYTE || FETCH_CHAR (bytepos - 1) == '\n'))
6bff6497
EZ
901 abort ();
902
be39f003
EZ
903 bidi_it->separator_limit = -1;
904 bidi_it->new_paragraph = 0;
6bff6497
EZ
905 ch = FETCH_CHAR (bytepos);
906 ch_len = CHAR_BYTES (ch);
be39f003 907 pos = BYTE_TO_CHAR (bytepos);
6bff6497 908 type = bidi_get_type (ch, NEUTRAL_DIR);
b7b65b15 909
e342a24d 910 for (pos++, bytepos += ch_len;
b7b65b15
EZ
911 /* NOTE: UAX#9 says to search only for L, AL, or R types of
912 characters, and ignore RLE, RLO, LRE, and LRO. However,
913 I'm not sure it makes sense to omit those 4; should try
914 with and without that to see the effect. */
915 (bidi_get_category (type) != STRONG)
916 || (bidi_ignore_explicit_marks_for_paragraph_level
917 && (type == RLE || type == RLO
918 || type == LRE || type == LRO));
6bff6497 919 type = bidi_get_type (ch, NEUTRAL_DIR))
b7b65b15 920 {
be39f003 921 if (type == NEUTRAL_B && bidi_at_paragraph_end (pos, bytepos) >= -1)
b7b65b15
EZ
922 break;
923 FETCH_CHAR_ADVANCE (ch, pos, bytepos);
924 }
925 if (type == STRONG_R || type == STRONG_AL) /* P3 */
be39f003
EZ
926 bidi_it->paragraph_dir = R2L;
927 else if (type == STRONG_L)
928 bidi_it->paragraph_dir = L2R;
b7b65b15 929 }
be39f003
EZ
930 else
931 abort ();
b7b65b15 932
b44d9321
EZ
933 /* Contrary to UAX#9 clause P3, we only default the paragraph
934 direction to L2R if we have no previous usable paragraph
935 direction. */
be39f003 936 if (bidi_it->paragraph_dir == NEUTRAL_DIR)
b44d9321 937 bidi_it->paragraph_dir = L2R; /* P3 and ``higher protocols'' */
be39f003 938 if (bidi_it->paragraph_dir == R2L)
b44d9321 939 bidi_it->level_stack[0].level = 1;
be39f003 940 else
b44d9321 941 bidi_it->level_stack[0].level = 0;
be39f003
EZ
942
943 bidi_line_init (bidi_it);
b7b65b15
EZ
944}
945
6bff6497
EZ
946/* Do whatever UAX#9 clause X8 says should be done at paragraph's
947 end. */
b7b65b15
EZ
948static inline void
949bidi_set_paragraph_end (struct bidi_it *bidi_it)
950{
951 bidi_it->invalid_levels = 0;
952 bidi_it->invalid_rl_levels = -1;
953 bidi_it->stack_idx = 0;
954 bidi_it->resolved_level = bidi_it->level_stack[0].level;
b7b65b15
EZ
955}
956
89d3374a 957/* Initialize the bidi iterator from buffer position CHARPOS. */
b7b65b15 958void
6bff6497 959bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, struct bidi_it *bidi_it)
b7b65b15
EZ
960{
961 if (! bidi_initialized)
962 bidi_initialize ();
89d3374a
EZ
963 bidi_it->charpos = charpos;
964 bidi_it->bytepos = bytepos;
6bff6497
EZ
965 bidi_it->first_elt = 1;
966 bidi_set_paragraph_end (bidi_it);
967 bidi_it->new_paragraph = 1;
be39f003 968 bidi_it->separator_limit = -1;
b44d9321 969 bidi_it->paragraph_dir = NEUTRAL_DIR;
b7b65b15 970 bidi_it->type = NEUTRAL_B;
89d3374a 971 bidi_it->type_after_w1 = UNKNOWN_BT;
b7b65b15 972 bidi_it->orig_type = UNKNOWN_BT;
b7b65b15 973 bidi_it->prev_was_pdf = 0;
89d3374a
EZ
974 bidi_it->prev.type = bidi_it->prev.type_after_w1 = UNKNOWN_BT;
975 bidi_it->last_strong.type = bidi_it->last_strong.type_after_w1 =
976 bidi_it->last_strong.orig_type = UNKNOWN_BT;
b7b65b15
EZ
977 bidi_it->next_for_neutral.charpos = -1;
978 bidi_it->next_for_neutral.type =
89d3374a
EZ
979 bidi_it->next_for_neutral.type_after_w1 =
980 bidi_it->next_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15
EZ
981 bidi_it->prev_for_neutral.charpos = -1;
982 bidi_it->prev_for_neutral.type =
89d3374a
EZ
983 bidi_it->prev_for_neutral.type_after_w1 =
984 bidi_it->prev_for_neutral.orig_type = UNKNOWN_BT;
b7b65b15 985 bidi_it->sor = L2R; /* FIXME: should it be user-selectable? */
b7b65b15
EZ
986}
987
988/* Push the current embedding level and override status; reset the
989 current level to LEVEL and the current override status to OVERRIDE. */
990static inline void
991bidi_push_embedding_level (struct bidi_it *bidi_it,
992 int level, bidi_dir_t override)
993{
994 bidi_it->stack_idx++;
995 if (bidi_it->stack_idx >= BIDI_MAXLEVEL)
996 abort ();
997 bidi_it->level_stack[bidi_it->stack_idx].level = level;
998 bidi_it->level_stack[bidi_it->stack_idx].override = override;
999}
1000
1001/* Pop the embedding level and directional override status from the
1002 stack, and return the new level. */
1003static inline int
1004bidi_pop_embedding_level (struct bidi_it *bidi_it)
1005{
1006 /* UAX#9 says to ignore invalid PDFs. */
1007 if (bidi_it->stack_idx > 0)
1008 bidi_it->stack_idx--;
1009 return bidi_it->level_stack[bidi_it->stack_idx].level;
1010}
1011
1012/* Record in SAVED_INFO the information about the current character. */
1013static inline void
1014bidi_remember_char (struct bidi_saved_info *saved_info,
1015 struct bidi_it *bidi_it)
1016{
1017 saved_info->charpos = bidi_it->charpos;
1018 saved_info->bytepos = bidi_it->bytepos;
1019 saved_info->type = bidi_it->type;
2d6e4628 1020 bidi_check_type (bidi_it->type);
89d3374a
EZ
1021 saved_info->type_after_w1 = bidi_it->type_after_w1;
1022 bidi_check_type (bidi_it->type_after_w1);
b7b65b15 1023 saved_info->orig_type = bidi_it->orig_type;
2d6e4628 1024 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
1025}
1026
1027/* Resolve the type of a neutral character according to the type of
1028 surrounding strong text and the current embedding level. */
1029static inline bidi_type_t
1030bidi_resolve_neutral_1 (bidi_type_t prev_type, bidi_type_t next_type, int lev)
1031{
1032 /* N1: European and Arabic numbers are treated as though they were R. */
1033 if (next_type == WEAK_EN || next_type == WEAK_AN)
1034 next_type = STRONG_R;
1035 if (prev_type == WEAK_EN || prev_type == WEAK_AN)
1036 prev_type = STRONG_R;
1037
1038 if (next_type == prev_type) /* N1 */
1039 return next_type;
1040 else if ((lev & 1) == 0) /* N2 */
1041 return STRONG_L;
1042 else
1043 return STRONG_R;
1044}
1045
1046static inline int
1047bidi_explicit_dir_char (int c)
1048{
1049 /* FIXME: this should be replaced with a lookup table with suitable
1050 bits set, like standard C ctype macros do. */
1051 return (c == LRE_CHAR || c == LRO_CHAR
1052 || c == RLE_CHAR || c == RLO_CHAR || c == PDF_CHAR);
1053}
1054
1055/* A helper function for bidi_resolve_explicit. It advances to the
1056 next character in logical order and determines the new embedding
1057 level and directional override, but does not take into account
1058 empty embeddings. */
1059static int
1060bidi_resolve_explicit_1 (struct bidi_it *bidi_it)
1061{
1062 int curchar;
1063 bidi_type_t type;
1064 int current_level;
1065 int new_level;
1066 bidi_dir_t override;
1067
9c82e145
EZ
1068 if (bidi_it->bytepos < BEGV_BYTE /* after reseat to BEGV? */
1069 || bidi_it->first_elt)
e7402cb2 1070 {
9c82e145
EZ
1071 bidi_it->first_elt = 0;
1072 if (bidi_it->charpos < BEGV)
1073 bidi_it->charpos = BEGV;
1074 bidi_it->bytepos = CHAR_TO_BYTE (bidi_it->charpos);
e7402cb2 1075 }
9c82e145 1076 else if (bidi_it->bytepos < ZV_BYTE) /* don't move at ZV */
b7b65b15
EZ
1077 {
1078 bidi_it->charpos++;
e7402cb2
EZ
1079 if (bidi_it->ch_len == 0)
1080 abort ();
b7b65b15
EZ
1081 bidi_it->bytepos += bidi_it->ch_len;
1082 }
1083
1084 current_level = bidi_it->level_stack[bidi_it->stack_idx].level; /* X1 */
1085 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1086 new_level = current_level;
1087
1088 /* in case it is a unibyte character (not yet implemented) */
1089 /* _fetch_multibyte_char_len = 1; */
e7402cb2
EZ
1090 if (bidi_it->bytepos >= ZV_BYTE)
1091 {
1092 curchar = BIDI_EOB;
1093 bidi_it->ch_len = 1;
1094 }
1095 else
1096 {
1097 curchar = FETCH_CHAR (bidi_it->bytepos);
1098 bidi_it->ch_len = CHAR_BYTES (curchar);
1099 }
b7b65b15 1100 bidi_it->ch = curchar;
b7b65b15 1101
6bff6497
EZ
1102 /* Don't apply directional override here, as all the types we handle
1103 below will not be affected by the override anyway, and we need
1104 the original type unaltered. The override will be applied in
1105 bidi_resolve_weak. */
1106 type = bidi_get_type (curchar, NEUTRAL_DIR);
89d3374a
EZ
1107 bidi_it->orig_type = type;
1108 bidi_check_type (bidi_it->orig_type);
b7b65b15
EZ
1109
1110 if (type != PDF)
1111 bidi_it->prev_was_pdf = 0;
1112
89d3374a 1113 bidi_it->type_after_w1 = UNKNOWN_BT;
b7b65b15
EZ
1114
1115 switch (type)
1116 {
1117 case RLE: /* X2 */
1118 case RLO: /* X4 */
89d3374a
EZ
1119 bidi_it->type_after_w1 = type;
1120 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1121 type = WEAK_BN; /* X9/Retaining */
1122 if (bidi_it->ignore_bn_limit <= 0)
1123 {
1124 if (current_level <= BIDI_MAXLEVEL - 4)
1125 {
1126 /* Compute the least odd embedding level greater than
1127 the current level. */
1128 new_level = ((current_level + 1) & ~1) + 1;
89d3374a 1129 if (bidi_it->type_after_w1 == RLE)
b7b65b15
EZ
1130 override = NEUTRAL_DIR;
1131 else
1132 override = R2L;
1133 if (current_level == BIDI_MAXLEVEL - 4)
1134 bidi_it->invalid_rl_levels = 0;
1135 bidi_push_embedding_level (bidi_it, new_level, override);
1136 }
1137 else
1138 {
1139 bidi_it->invalid_levels++;
1140 /* See the commentary about invalid_rl_levels below. */
1141 if (bidi_it->invalid_rl_levels < 0)
1142 bidi_it->invalid_rl_levels = 0;
1143 bidi_it->invalid_rl_levels++;
1144 }
1145 }
89d3374a 1146 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1147 || bidi_it->next_en_pos > bidi_it->charpos)
1148 type = WEAK_EN;
1149 break;
1150 case LRE: /* X3 */
1151 case LRO: /* X5 */
89d3374a
EZ
1152 bidi_it->type_after_w1 = type;
1153 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1154 type = WEAK_BN; /* X9/Retaining */
1155 if (bidi_it->ignore_bn_limit <= 0)
1156 {
1157 if (current_level <= BIDI_MAXLEVEL - 5)
1158 {
1159 /* Compute the least even embedding level greater than
1160 the current level. */
1161 new_level = ((current_level + 2) & ~1);
89d3374a 1162 if (bidi_it->type_after_w1 == LRE)
b7b65b15
EZ
1163 override = NEUTRAL_DIR;
1164 else
1165 override = L2R;
1166 bidi_push_embedding_level (bidi_it, new_level, override);
1167 }
1168 else
1169 {
1170 bidi_it->invalid_levels++;
1171 /* invalid_rl_levels counts invalid levels encountered
1172 while the embedding level was already too high for
1173 LRE/LRO, but not for RLE/RLO. That is because
1174 there may be exactly one PDF which we should not
1175 ignore even though invalid_levels is non-zero.
1176 invalid_rl_levels helps to know what PDF is
1177 that. */
1178 if (bidi_it->invalid_rl_levels >= 0)
1179 bidi_it->invalid_rl_levels++;
1180 }
1181 }
89d3374a 1182 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1183 || bidi_it->next_en_pos > bidi_it->charpos)
1184 type = WEAK_EN;
1185 break;
1186 case PDF: /* X7 */
89d3374a
EZ
1187 bidi_it->type_after_w1 = type;
1188 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1189 type = WEAK_BN; /* X9/Retaining */
1190 if (bidi_it->ignore_bn_limit <= 0)
1191 {
1192 if (!bidi_it->invalid_rl_levels)
1193 {
1194 new_level = bidi_pop_embedding_level (bidi_it);
1195 bidi_it->invalid_rl_levels = -1;
1196 if (bidi_it->invalid_levels)
1197 bidi_it->invalid_levels--;
1198 /* else nothing: UAX#9 says to ignore invalid PDFs */
1199 }
1200 if (!bidi_it->invalid_levels)
1201 new_level = bidi_pop_embedding_level (bidi_it);
1202 else
1203 {
1204 bidi_it->invalid_levels--;
1205 bidi_it->invalid_rl_levels--;
1206 }
1207 }
89d3374a 1208 else if (bidi_it->prev.type_after_w1 == WEAK_EN /* W5/Retaining */
b7b65b15
EZ
1209 || bidi_it->next_en_pos > bidi_it->charpos)
1210 type = WEAK_EN;
1211 break;
1212 default:
1213 /* Nothing. */
1214 break;
1215 }
1216
1217 bidi_it->type = type;
2d6e4628 1218 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1219
1220 return new_level;
1221}
1222
1223/* Given an iterator state in BIDI_IT, advance one character position
1224 in the buffer to the next character (in the logical order), resolve
1225 any explicit embeddings and directional overrides, and return the
1226 embedding level of the character after resolving explicit
1227 directives and ignoring empty embeddings. */
1228static int
1229bidi_resolve_explicit (struct bidi_it *bidi_it)
1230{
1231 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1232 int new_level = bidi_resolve_explicit_1 (bidi_it);
1233
1234 if (prev_level < new_level
1235 && bidi_it->type == WEAK_BN
1236 && bidi_it->ignore_bn_limit == 0 /* only if not already known */
e7402cb2 1237 && bidi_it->ch != BIDI_EOB /* not already at EOB */
b7b65b15
EZ
1238 && bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
1239 + bidi_it->ch_len)))
1240 {
1241 /* Avoid pushing and popping embedding levels if the level run
1242 is empty, as this breaks level runs where it shouldn't.
1243 UAX#9 removes all the explicit embedding and override codes,
1244 so empty embeddings disappear without a trace. We need to
1245 behave as if we did the same. */
1246 struct bidi_it saved_it;
1247 int level = prev_level;
1248
1249 bidi_copy_it (&saved_it, bidi_it);
1250
1251 while (bidi_explicit_dir_char (FETCH_CHAR (bidi_it->bytepos
1252 + bidi_it->ch_len)))
1253 {
1254 level = bidi_resolve_explicit_1 (bidi_it);
1255 }
1256
1257 if (level == prev_level) /* empty embedding */
1258 saved_it.ignore_bn_limit = bidi_it->charpos + 1;
1259 else /* this embedding is non-empty */
1260 saved_it.ignore_bn_limit = -1;
1261
1262 bidi_copy_it (bidi_it, &saved_it);
1263 if (bidi_it->ignore_bn_limit > 0)
1264 {
1265 /* We pushed a level, but we shouldn't have. Undo that. */
1266 if (!bidi_it->invalid_rl_levels)
1267 {
1268 new_level = bidi_pop_embedding_level (bidi_it);
1269 bidi_it->invalid_rl_levels = -1;
1270 if (bidi_it->invalid_levels)
1271 bidi_it->invalid_levels--;
1272 }
1273 if (!bidi_it->invalid_levels)
1274 new_level = bidi_pop_embedding_level (bidi_it);
1275 else
1276 {
1277 bidi_it->invalid_levels--;
1278 bidi_it->invalid_rl_levels--;
1279 }
1280 }
1281 }
1282
b7b65b15
EZ
1283 if (bidi_it->type == NEUTRAL_B) /* X8 */
1284 {
6bff6497
EZ
1285 /* End of buffer does _not_ indicate a new paragraph is coming.
1286 Otherwise, each character inserted at EOB will be processed
1287 as starting a new paragraph. */
1288 if (bidi_it->bytepos < ZV_BYTE)
1289 bidi_set_paragraph_end (bidi_it);
1290 /* This is needed by bidi_resolve_weak below, and in L1. */
1291 bidi_it->type_after_w1 = bidi_it->type;
89d3374a 1292 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1293 }
1294
1295 return new_level;
1296}
1297
1298/* Advance in the buffer, resolve weak types and return the type of
1299 the next character after weak type resolution. */
1300bidi_type_t
1301bidi_resolve_weak (struct bidi_it *bidi_it)
1302{
1303 bidi_type_t type;
1304 bidi_dir_t override;
1305 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1306 int new_level = bidi_resolve_explicit (bidi_it);
1307 int next_char;
1308 bidi_type_t type_of_next;
1309 struct bidi_it saved_it;
1310
1311 type = bidi_it->type;
1312 override = bidi_it->level_stack[bidi_it->stack_idx].override;
1313
1314 if (type == UNKNOWN_BT
1315 || type == LRE
1316 || type == LRO
1317 || type == RLE
1318 || type == RLO
1319 || type == PDF)
1320 abort ();
1321
1322 if (new_level != prev_level
1323 || bidi_it->type == NEUTRAL_B)
1324 {
1325 /* We've got a new embedding level run, compute the directional
1326 type of sor and initialize per-run variables (UAX#9, clause
1327 X10). */
1328 bidi_set_sor_type (bidi_it, prev_level, new_level);
1329 }
1330 else if (type == NEUTRAL_S || type == NEUTRAL_WS
1331 || type == WEAK_BN || type == STRONG_AL)
89d3374a
EZ
1332 bidi_it->type_after_w1 = type; /* needed in L1 */
1333 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1334
1335 /* Level and directional override status are already recorded in
1336 bidi_it, and do not need any change; see X6. */
1337 if (override == R2L) /* X6 */
1338 type = STRONG_R;
1339 else if (override == L2R)
1340 type = STRONG_L;
1341 else if (type == STRONG_AL)
1342 type = STRONG_R; /* W3 */
1343 else if (type == WEAK_NSM) /* W1 */
1344 {
1345 /* Note that we don't need to consider the case where the prev
1346 character has its type overridden by an RLO or LRO: such
1347 characters are outside the current level run, and thus not
89d3374a
EZ
1348 relevant to this NSM. Thus, NSM gets the orig_type of the
1349 previous character. */
b7b65b15 1350 if (bidi_it->prev.type != UNKNOWN_BT)
89d3374a 1351 type = bidi_it->prev.orig_type;
b7b65b15
EZ
1352 else if (bidi_it->sor == R2L)
1353 type = STRONG_R;
1354 else if (bidi_it->sor == L2R)
1355 type = STRONG_L;
1356 else /* shouldn't happen! */
1357 abort ();
1358 if (type == WEAK_EN /* W2 after W1 */
89d3374a 1359 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
b7b65b15
EZ
1360 type = WEAK_AN;
1361 }
1362 else if (type == WEAK_EN /* W2 */
89d3374a 1363 && bidi_it->last_strong.type_after_w1 == STRONG_AL)
b7b65b15
EZ
1364 type = WEAK_AN;
1365 else if ((type == WEAK_ES
89d3374a
EZ
1366 && (bidi_it->prev.type_after_w1 == WEAK_EN /* W4 */
1367 && (bidi_it->prev.orig_type == WEAK_EN
1368 || bidi_it->prev.orig_type == WEAK_NSM))) /* aft W1 */
b7b65b15 1369 || (type == WEAK_CS
89d3374a
EZ
1370 && ((bidi_it->prev.type_after_w1 == WEAK_EN
1371 && (bidi_it->prev.orig_type == WEAK_EN /* W4 */
1372 || bidi_it->prev.orig_type == WEAK_NSM)) /* a/W1 */
1373 || bidi_it->prev.type_after_w1 == WEAK_AN))) /* W4 */
b7b65b15 1374 {
e7402cb2
EZ
1375 next_char =
1376 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1377 ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
6bff6497 1378 type_of_next = bidi_get_type (next_char, override);
b7b65b15
EZ
1379
1380 if (type_of_next == WEAK_BN
1381 || bidi_explicit_dir_char (next_char))
1382 {
1383 bidi_copy_it (&saved_it, bidi_it);
1384 while (bidi_resolve_explicit (bidi_it) == new_level
1385 && bidi_it->type == WEAK_BN)
1386 ;
1387 type_of_next = bidi_it->type;
1388 bidi_copy_it (bidi_it, &saved_it);
1389 }
1390
1391 /* If the next character is EN, but the last strong-type
1392 character is AL, that next EN will be changed to AN when we
1393 process it in W2 above. So in that case, this ES should not
1394 be changed into EN. */
1395 if (type == WEAK_ES
1396 && type_of_next == WEAK_EN
89d3374a 1397 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
b7b65b15
EZ
1398 type = WEAK_EN;
1399 else if (type == WEAK_CS)
1400 {
89d3374a 1401 if (bidi_it->prev.type_after_w1 == WEAK_AN
b7b65b15
EZ
1402 && (type_of_next == WEAK_AN
1403 /* If the next character is EN, but the last
1404 strong-type character is AL, EN will be later
1405 changed to AN when we process it in W2 above. So
1406 in that case, this ES should not be changed into
1407 EN. */
1408 || (type_of_next == WEAK_EN
89d3374a 1409 && bidi_it->last_strong.type_after_w1 == STRONG_AL)))
b7b65b15 1410 type = WEAK_AN;
89d3374a 1411 else if (bidi_it->prev.type_after_w1 == WEAK_EN
b7b65b15 1412 && type_of_next == WEAK_EN
89d3374a 1413 && bidi_it->last_strong.type_after_w1 != STRONG_AL)
b7b65b15
EZ
1414 type = WEAK_EN;
1415 }
1416 }
1417 else if (type == WEAK_ET /* W5: ET with EN before or after it */
1418 || type == WEAK_BN) /* W5/Retaining */
1419 {
89d3374a 1420 if (bidi_it->prev.type_after_w1 == WEAK_EN /* ET/BN with EN before it */
b7b65b15
EZ
1421 || bidi_it->next_en_pos > bidi_it->charpos)
1422 type = WEAK_EN;
1423 /* W5: ET with EN after it. */
1424 else
1425 {
6bff6497 1426 EMACS_INT en_pos = bidi_it->charpos + 1;
b7b65b15 1427
e7402cb2
EZ
1428 next_char =
1429 bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE
1430 ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len);
6bff6497 1431 type_of_next = bidi_get_type (next_char, override);
b7b65b15
EZ
1432
1433 if (type_of_next == WEAK_ET
1434 || type_of_next == WEAK_BN
1435 || bidi_explicit_dir_char (next_char))
1436 {
1437 bidi_copy_it (&saved_it, bidi_it);
1438 while (bidi_resolve_explicit (bidi_it) == new_level
1439 && (bidi_it->type == WEAK_BN || bidi_it->type == WEAK_ET))
1440 ;
1441 type_of_next = bidi_it->type;
1442 en_pos = bidi_it->charpos;
1443 bidi_copy_it (bidi_it, &saved_it);
1444 }
1445 if (type_of_next == WEAK_EN)
1446 {
1447 /* If the last strong character is AL, the EN we've
1448 found will become AN when we get to it (W2). */
89d3374a 1449 if (bidi_it->last_strong.type_after_w1 != STRONG_AL)
b7b65b15
EZ
1450 {
1451 type = WEAK_EN;
1452 /* Remember this EN position, to speed up processing
1453 of the next ETs. */
1454 bidi_it->next_en_pos = en_pos;
1455 }
1456 else if (type == WEAK_BN)
1457 type = NEUTRAL_ON; /* W6/Retaining */
1458 }
1459 }
1460 }
1461
1462 if (type == WEAK_ES || type == WEAK_ET || type == WEAK_CS /* W6 */
89d3374a
EZ
1463 || (type == WEAK_BN
1464 && (bidi_it->prev.type_after_w1 == WEAK_CS /* W6/Retaining */
1465 || bidi_it->prev.type_after_w1 == WEAK_ES
1466 || bidi_it->prev.type_after_w1 == WEAK_ET)))
b7b65b15
EZ
1467 type = NEUTRAL_ON;
1468
1469 /* Store the type we've got so far, before we clobber it with strong
1470 types in W7 and while resolving neutral types. But leave alone
1471 the original types that were recorded above, because we will need
1472 them for the L1 clause. */
89d3374a
EZ
1473 if (bidi_it->type_after_w1 == UNKNOWN_BT)
1474 bidi_it->type_after_w1 = type;
1475 bidi_check_type (bidi_it->type_after_w1);
b7b65b15
EZ
1476
1477 if (type == WEAK_EN) /* W7 */
1478 {
89d3374a 1479 if ((bidi_it->last_strong.type_after_w1 == STRONG_L)
b7b65b15
EZ
1480 || (bidi_it->last_strong.type == UNKNOWN_BT && bidi_it->sor == L2R))
1481 type = STRONG_L;
1482 }
1483
1484 bidi_it->type = type;
2d6e4628 1485 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1486 return type;
1487}
1488
1489bidi_type_t
1490bidi_resolve_neutral (struct bidi_it *bidi_it)
1491{
1492 int prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1493 bidi_type_t type = bidi_resolve_weak (bidi_it);
1494 int current_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1495
1496 if (!(type == STRONG_R
1497 || type == STRONG_L
1498 || type == WEAK_BN
1499 || type == WEAK_EN
1500 || type == WEAK_AN
1501 || type == NEUTRAL_B
1502 || type == NEUTRAL_S
1503 || type == NEUTRAL_WS
1504 || type == NEUTRAL_ON))
1505 abort ();
1506
1507 if (bidi_get_category (type) == NEUTRAL
1508 || (type == WEAK_BN && prev_level == current_level))
1509 {
1510 if (bidi_it->next_for_neutral.type != UNKNOWN_BT)
1511 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1512 bidi_it->next_for_neutral.type,
1513 current_level);
1514 else
1515 {
1516 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1517 the assumption of batch-style processing; see clauses W4,
1518 W5, and especially N1, which require to look far forward
1519 (as well as back) in the buffer. May the fleas of a
1520 thousand camels infest the armpits of those who design
1521 supposedly general-purpose algorithms by looking at their
1522 own implementations, and fail to consider other possible
1523 implementations! */
1524 struct bidi_it saved_it;
1525 bidi_type_t next_type;
1526
1527 if (bidi_it->scan_dir == -1)
1528 abort ();
1529
1530 bidi_copy_it (&saved_it, bidi_it);
1531 /* Scan the text forward until we find the first non-neutral
1532 character, and then use that to resolve the neutral we
1533 are dealing with now. We also cache the scanned iterator
1534 states, to salvage some of the effort later. */
1535 bidi_cache_iterator_state (bidi_it, 0);
1536 do {
1537 /* Record the info about the previous character, so that
1538 it will be cached below with this state. */
89d3374a 1539 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1540 && bidi_it->type != WEAK_BN)
1541 bidi_remember_char (&bidi_it->prev, bidi_it);
1542 type = bidi_resolve_weak (bidi_it);
1543 /* Paragraph separators have their levels fully resolved
1544 at this point, so cache them as resolved. */
1545 bidi_cache_iterator_state (bidi_it, type == NEUTRAL_B);
1546 /* FIXME: implement L1 here, by testing for a newline and
1547 resetting the level for any sequence of whitespace
1548 characters adjacent to it. */
1549 } while (!(type == NEUTRAL_B
1550 || (type != WEAK_BN
1551 && bidi_get_category (type) != NEUTRAL)
1552 /* This is all per level run, so stop when we
1553 reach the end of this level run. */
1554 || bidi_it->level_stack[bidi_it->stack_idx].level !=
1555 current_level));
1556
1557 bidi_remember_char (&saved_it.next_for_neutral, bidi_it);
1558
1559 switch (type)
1560 {
1561 case STRONG_L:
1562 case STRONG_R:
1563 case STRONG_AL:
1564 next_type = type;
1565 break;
1566 case WEAK_EN:
1567 case WEAK_AN:
1568 /* N1: ``European and Arabic numbers are treated as
1569 though they were R.'' */
1570 next_type = STRONG_R;
1571 saved_it.next_for_neutral.type = STRONG_R;
1572 break;
1573 case WEAK_BN:
1574 if (!bidi_explicit_dir_char (bidi_it->ch))
1575 abort (); /* can't happen: BNs are skipped */
1576 /* FALLTHROUGH */
1577 case NEUTRAL_B:
1578 /* Marched all the way to the end of this level run.
1579 We need to use the eor type, whose information is
1580 stored by bidi_set_sor_type in the prev_for_neutral
1581 member. */
1582 if (saved_it.type != WEAK_BN
89d3374a 1583 || bidi_get_category (bidi_it->prev.type_after_w1) == NEUTRAL)
b7b65b15
EZ
1584 {
1585 next_type = bidi_it->prev_for_neutral.type;
1586 saved_it.next_for_neutral.type = next_type;
2d6e4628 1587 bidi_check_type (next_type);
b7b65b15
EZ
1588 }
1589 else
1590 {
1591 /* This is a BN which does not adjoin neutrals.
1592 Leave its type alone. */
1593 bidi_copy_it (bidi_it, &saved_it);
1594 return bidi_it->type;
1595 }
1596 break;
1597 default:
1598 abort ();
1599 }
1600 type = bidi_resolve_neutral_1 (saved_it.prev_for_neutral.type,
1601 next_type, current_level);
1602 saved_it.type = type;
2d6e4628 1603 bidi_check_type (type);
b7b65b15
EZ
1604 bidi_copy_it (bidi_it, &saved_it);
1605 }
1606 }
1607 return type;
1608}
1609
1610/* Given an iterator state in BIDI_IT, advance one character position
1611 in the buffer to the next character (in the logical order), resolve
1612 the bidi type of that next character, and return that type. */
1613bidi_type_t
1614bidi_type_of_next_char (struct bidi_it *bidi_it)
1615{
1616 bidi_type_t type;
1617
1618 /* This should always be called during a forward scan. */
1619 if (bidi_it->scan_dir != 1)
1620 abort ();
1621
1622 /* Reset the limit until which to ignore BNs if we step out of the
1623 area where we found only empty levels. */
1624 if ((bidi_it->ignore_bn_limit > 0
1625 && bidi_it->ignore_bn_limit <= bidi_it->charpos)
1626 || (bidi_it->ignore_bn_limit == -1
1627 && !bidi_explicit_dir_char (bidi_it->ch)))
1628 bidi_it->ignore_bn_limit = 0;
1629
1630 type = bidi_resolve_neutral (bidi_it);
1631
1632 return type;
1633}
1634
1635/* Given an iterator state BIDI_IT, advance one character position in
1636 the buffer to the next character (in the logical order), resolve
1637 the embedding and implicit levels of that next character, and
1638 return the resulting level. */
1639int
1640bidi_level_of_next_char (struct bidi_it *bidi_it)
1641{
1642 bidi_type_t type;
1643 int level, prev_level = -1;
1644 struct bidi_saved_info next_for_neutral;
1645
1646 if (bidi_it->scan_dir == 1)
1647 {
1648 /* There's no sense in trying to advance if we hit end of text. */
1649 if (bidi_it->ch == BIDI_EOB)
1650 return bidi_it->resolved_level;
1651
1652 /* Record the info about the previous character. */
89d3374a 1653 if (bidi_it->type_after_w1 != WEAK_BN /* W1/Retaining */
b7b65b15
EZ
1654 && bidi_it->type != WEAK_BN)
1655 bidi_remember_char (&bidi_it->prev, bidi_it);
89d3374a
EZ
1656 if (bidi_it->type_after_w1 == STRONG_R
1657 || bidi_it->type_after_w1 == STRONG_L
1658 || bidi_it->type_after_w1 == STRONG_AL)
b7b65b15
EZ
1659 bidi_remember_char (&bidi_it->last_strong, bidi_it);
1660 /* FIXME: it sounds like we don't need both prev and
1661 prev_for_neutral members, but I'm leaving them both for now. */
1662 if (bidi_it->type == STRONG_R || bidi_it->type == STRONG_L
1663 || bidi_it->type == WEAK_EN || bidi_it->type == WEAK_AN)
1664 bidi_remember_char (&bidi_it->prev_for_neutral, bidi_it);
1665
1666 /* If we overstepped the characters used for resolving neutrals
1667 and whitespace, invalidate their info in the iterator. */
1668 if (bidi_it->charpos >= bidi_it->next_for_neutral.charpos)
1669 bidi_it->next_for_neutral.type = UNKNOWN_BT;
1670 if (bidi_it->next_en_pos >= 0
1671 && bidi_it->charpos >= bidi_it->next_en_pos)
1672 bidi_it->next_en_pos = -1;
1673 if (bidi_it->next_for_ws.type != UNKNOWN_BT
1674 && bidi_it->charpos >= bidi_it->next_for_ws.charpos)
1675 bidi_it->next_for_ws.type = UNKNOWN_BT;
1676
1677 /* This must be taken before we fill the iterator with the info
1678 about the next char. If we scan backwards, the iterator
1679 state must be already cached, so there's no need to know the
1680 embedding level of the previous character, since we will be
1681 returning to our caller shortly. */
1682 prev_level = bidi_it->level_stack[bidi_it->stack_idx].level;
1683 }
1684 next_for_neutral = bidi_it->next_for_neutral;
1685
1686 /* Perhaps it is already cached. */
1687 type = bidi_cache_find (bidi_it->charpos + bidi_it->scan_dir, -1, bidi_it);
1688 if (type != UNKNOWN_BT)
1689 {
1690 /* Don't lose the information for resolving neutrals! The
1691 cached states could have been cached before their
1692 next_for_neutral member was computed. If we are on our way
1693 forward, we can simply take the info from the previous
1694 state. */
1695 if (bidi_it->scan_dir == 1
1696 && bidi_it->next_for_neutral.type == UNKNOWN_BT)
1697 bidi_it->next_for_neutral = next_for_neutral;
1698
1699 /* If resolved_level is -1, it means this state was cached
1700 before it was completely resolved, so we cannot return
1701 it. */
1702 if (bidi_it->resolved_level != -1)
1703 return bidi_it->resolved_level;
1704 }
1705 if (bidi_it->scan_dir == -1)
1706 /* If we are going backwards, the iterator state is already cached
1707 from previous scans, and should be fully resolved. */
1708 abort ();
1709
1710 if (type == UNKNOWN_BT)
1711 type = bidi_type_of_next_char (bidi_it);
1712
1713 if (type == NEUTRAL_B)
1714 return bidi_it->resolved_level;
1715
1716 level = bidi_it->level_stack[bidi_it->stack_idx].level;
1717 if ((bidi_get_category (type) == NEUTRAL /* && type != NEUTRAL_B */)
1718 || (type == WEAK_BN && prev_level == level))
1719 {
1720 if (bidi_it->next_for_neutral.type == UNKNOWN_BT)
1721 abort ();
1722
1723 /* If the cached state shows a neutral character, it was not
1724 resolved by bidi_resolve_neutral, so do it now. */
1725 type = bidi_resolve_neutral_1 (bidi_it->prev_for_neutral.type,
1726 bidi_it->next_for_neutral.type,
1727 level);
1728 }
1729
1730 if (!(type == STRONG_R
1731 || type == STRONG_L
1732 || type == WEAK_BN
1733 || type == WEAK_EN
1734 || type == WEAK_AN))
1735 abort ();
1736 bidi_it->type = type;
2d6e4628 1737 bidi_check_type (bidi_it->type);
b7b65b15
EZ
1738
1739 /* For L1 below, we need to know, for each WS character, whether
1740 it belongs to a sequence of WS characters preceeding a newline
1741 or a TAB or a paragraph separator. */
89d3374a 1742 if (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1743 && bidi_it->next_for_ws.type == UNKNOWN_BT)
1744 {
1745 int ch;
1746 int clen = bidi_it->ch_len;
6bff6497
EZ
1747 EMACS_INT bpos = bidi_it->bytepos;
1748 EMACS_INT cpos = bidi_it->charpos;
b7b65b15
EZ
1749 bidi_type_t chtype;
1750
1751 do {
1752 /*_fetch_multibyte_char_len = 1;*/
e7402cb2 1753 ch = bpos + clen >= ZV_BYTE ? BIDI_EOB : FETCH_CHAR (bpos + clen);
b7b65b15
EZ
1754 bpos += clen;
1755 cpos++;
e7402cb2
EZ
1756 clen = (ch == BIDI_EOB ? 1 : CHAR_BYTES (ch));
1757 if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */)
b7b65b15
EZ
1758 chtype = NEUTRAL_B;
1759 else
6bff6497 1760 chtype = bidi_get_type (ch, NEUTRAL_DIR);
b7b65b15
EZ
1761 } while (chtype == NEUTRAL_WS || chtype == WEAK_BN
1762 || bidi_explicit_dir_char (ch)); /* L1/Retaining */
1763 bidi_it->next_for_ws.type = chtype;
2d6e4628 1764 bidi_check_type (bidi_it->next_for_ws.type);
b7b65b15
EZ
1765 bidi_it->next_for_ws.charpos = cpos;
1766 bidi_it->next_for_ws.bytepos = bpos;
1767 }
1768
1769 /* Resolve implicit levels, with a twist: PDFs get the embedding
1770 level of the enbedding they terminate. See below for the
1771 reason. */
89d3374a 1772 if (bidi_it->orig_type == PDF
b7b65b15
EZ
1773 /* Don't do this if this formatting code didn't change the
1774 embedding level due to invalid or empty embeddings. */
1775 && prev_level != level)
1776 {
1777 /* Don't look in UAX#9 for the reason for this: it's our own
1778 private quirk. The reason is that we want the formatting
1779 codes to be delivered so that they bracket the text of their
1780 embedding. For example, given the text
1781
1782 {RLO}teST{PDF}
1783
1784 we want it to be displayed as
1785
1786 {RLO}STet{PDF}
1787
1788 not as
1789
1790 STet{RLO}{PDF}
1791
1792 which will result because we bump up the embedding level as
1793 soon as we see the RLO and pop it as soon as we see the PDF,
1794 so RLO itself has the same embedding level as "teST", and
1795 thus would be normally delivered last, just before the PDF.
1796 The switch below fiddles with the level of PDF so that this
1797 ugly side effect does not happen.
1798
1799 (This is, of course, only important if the formatting codes
e7402cb2
EZ
1800 are actually displayed, but Emacs does need to display them
1801 if the user wants to.) */
b7b65b15
EZ
1802 level = prev_level;
1803 }
89d3374a
EZ
1804 else if (bidi_it->orig_type == NEUTRAL_B /* L1 */
1805 || bidi_it->orig_type == NEUTRAL_S
e7402cb2
EZ
1806 || bidi_it->ch == '\n' || bidi_it->ch == BIDI_EOB
1807 /* || bidi_it->ch == LINESEP_CHAR */
89d3374a 1808 || (bidi_it->orig_type == NEUTRAL_WS
b7b65b15
EZ
1809 && (bidi_it->next_for_ws.type == NEUTRAL_B
1810 || bidi_it->next_for_ws.type == NEUTRAL_S)))
1811 level = bidi_it->level_stack[0].level;
1812 else if ((level & 1) == 0) /* I1 */
1813 {
1814 if (type == STRONG_R)
1815 level++;
1816 else if (type == WEAK_EN || type == WEAK_AN)
1817 level += 2;
1818 }
1819 else /* I2 */
1820 {
1821 if (type == STRONG_L || type == WEAK_EN || type == WEAK_AN)
1822 level++;
1823 }
1824
1825 bidi_it->resolved_level = level;
1826 return level;
1827}
1828
1829/* Move to the other edge of a level given by LEVEL. If END_FLAG is
1830 non-zero, we are at the end of a level, and we need to prepare to
1831 resume the scan of the lower level.
1832
1833 If this level's other edge is cached, we simply jump to it, filling
1834 the iterator structure with the iterator state on the other edge.
1835 Otherwise, we walk the buffer until we come back to the same level
1836 as LEVEL.
1837
1838 Note: we are not talking here about a ``level run'' in the UAX#9
1839 sense of the term, but rather about a ``level'' which includes
1840 all the levels higher than it. In other words, given the levels
1841 like this:
1842
1843 11111112222222333333334443343222222111111112223322111
1844 A B C
1845
1846 and assuming we are at point A scanning left to right, this
1847 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
1848 at point B. */
1849static void
1850bidi_find_other_level_edge (struct bidi_it *bidi_it, int level, int end_flag)
1851{
1852 int dir = end_flag ? -bidi_it->scan_dir : bidi_it->scan_dir;
1853 int idx;
1854
1855 /* Try the cache first. */
1856 if ((idx = bidi_cache_find_level_change (level, dir, end_flag)) >= 0)
1857 bidi_cache_fetch_state (idx, bidi_it);
1858 else
1859 {
1860 int new_level;
1861
1862 if (end_flag)
1863 abort (); /* if we are at end of level, its edges must be cached */
1864
1865 bidi_cache_iterator_state (bidi_it, 1);
1866 do {
1867 new_level = bidi_level_of_next_char (bidi_it);
1868 bidi_cache_iterator_state (bidi_it, 1);
1869 } while (new_level >= level);
1870 }
1871}
1872
1873void
1874bidi_get_next_char_visually (struct bidi_it *bidi_it)
1875{
1876 int old_level, new_level, next_level;
9c82e145 1877 struct bidi_it sentinel;
b7b65b15
EZ
1878
1879 if (bidi_it->scan_dir == 0)
1880 {
1881 bidi_it->scan_dir = 1; /* default to logical order */
1882 }
1883
be39f003
EZ
1884 /* If we just passed a newline, initialize for the next line. */
1885 if (!bidi_it->first_elt && bidi_it->orig_type == NEUTRAL_B)
1886 bidi_line_init (bidi_it);
1887
9c82e145 1888 /* Prepare the sentinel iterator state. */
b7b65b15 1889 if (bidi_cache_idx == 0)
9c82e145
EZ
1890 {
1891 bidi_copy_it (&sentinel, bidi_it);
1892 if (bidi_it->first_elt)
1893 {
1894 sentinel.charpos--; /* cached charpos needs to be monotonic */
1895 sentinel.bytepos--;
1896 sentinel.ch = '\n'; /* doesn't matter, but why not? */
1897 sentinel.ch_len = 1;
1898 }
1899 }
b7b65b15
EZ
1900
1901 old_level = bidi_it->resolved_level;
1902 new_level = bidi_level_of_next_char (bidi_it);
1903 if (bidi_it->ch == BIDI_EOB)
1904 return;
1905
1906 /* Reordering of resolved levels (clause L2) is implemented by
1907 jumping to the other edge of the level and flipping direction of
1908 scanning the buffer whenever we find a level change. */
1909 if (new_level != old_level)
1910 {
1911 int ascending = new_level > old_level;
1912 int level_to_search = ascending ? old_level + 1 : old_level;
1913 int incr = ascending ? 1 : -1;
1914 int expected_next_level = old_level + incr;
1915
1916 /* If we don't have anything cached yet, we need to cache the
9c82e145
EZ
1917 sentinel state, since we'll need it to record where to jump
1918 when the last non-base level is exhausted. */
b7b65b15 1919 if (bidi_cache_idx == 0)
9c82e145 1920 bidi_cache_iterator_state (&sentinel, 1);
b7b65b15
EZ
1921 /* Jump (or walk) to the other edge of this level. */
1922 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1923 /* Switch scan direction and peek at the next character in the
1924 new direction. */
1925 bidi_it->scan_dir = -bidi_it->scan_dir;
1926
1927 /* The following loop handles the case where the resolved level
1928 jumps by more than one. This is typical for numbers inside a
1929 run of text with left-to-right embedding direction, but can
1930 also happen in other situations. In those cases the decision
1931 where to continue after a level change, and in what direction,
1932 is tricky. For example, given a text like below:
1933
1934 abcdefgh
1935 11336622
1936
1937 (where the numbers below the text show the resolved levels),
1938 the result of reordering according to UAX#9 should be this:
1939
1940 efdcghba
1941
1942 This is implemented by the loop below which flips direction
1943 and jumps to the other edge of the level each time it finds
1944 the new level not to be the expected one. The expected level
1945 is always one more or one less than the previous one. */
1946 next_level = bidi_peek_at_next_level (bidi_it);
1947 while (next_level != expected_next_level)
1948 {
1949 expected_next_level += incr;
1950 level_to_search += incr;
1951 bidi_find_other_level_edge (bidi_it, level_to_search, !ascending);
1952 bidi_it->scan_dir = -bidi_it->scan_dir;
1953 next_level = bidi_peek_at_next_level (bidi_it);
1954 }
1955
1956 /* Finally, deliver the next character in the new direction. */
1957 next_level = bidi_level_of_next_char (bidi_it);
1958 }
1959
b44d9321
EZ
1960 /* Take note when we have just processed the newline that precedes
1961 the end of the paragraph. The next time we are about to be
1962 called, set_iterator_to_next will automatically reinit the
1963 paragraph direction, if needed. We do this at the newline before
1964 the paragraph separator, because the next character might not be
1965 the first character of the next paragraph, due to the bidi
1966 reordering. */
6bff6497 1967 if (bidi_it->scan_dir == 1
be39f003
EZ
1968 && bidi_it->orig_type == NEUTRAL_B
1969 && bidi_it->bytepos < ZV_BYTE)
1970 {
1971 EMACS_INT sep_len =
1972 bidi_at_paragraph_end (bidi_it->charpos + 1,
1973 bidi_it->bytepos + bidi_it->ch_len);
1974 if (sep_len >= 0)
1975 {
1976 bidi_it->new_paragraph = 1;
b44d9321
EZ
1977 /* Record the buffer position of the last character of the
1978 paragraph separator. */
1979 bidi_it->separator_limit = bidi_it->charpos + 1 + sep_len;
be39f003
EZ
1980 }
1981 }
6bff6497 1982
b7b65b15
EZ
1983 if (bidi_it->scan_dir == 1 && bidi_cache_idx)
1984 {
1985 /* If we are at paragraph's base embedding level and beyond the
1986 last cached position, the cache's job is done and we can
1987 discard it. */
1988 if (bidi_it->resolved_level == bidi_it->level_stack[0].level
1989 && bidi_it->charpos > bidi_cache[bidi_cache_idx - 1].charpos)
1990 bidi_cache_reset ();
1991 /* But as long as we are caching during forward scan, we must
1992 cache each state, or else the cache integrity will be
1993 compromised: it assumes cached states correspond to buffer
1994 positions 1:1. */
1995 else
1996 bidi_cache_iterator_state (bidi_it, 1);
1997 }
1998}
1999
2000/* This is meant to be called from within the debugger, whenever you
2001 wish to examine the cache contents. */
2002void
2003bidi_dump_cached_states (void)
2004{
2005 int i;
2006 int ndigits = 1;
2007
2008 if (bidi_cache_idx == 0)
2009 {
2010 fprintf (stderr, "The cache is empty.\n");
2011 return;
2012 }
2013 fprintf (stderr, "Total of %d state%s in cache:\n",
2014 bidi_cache_idx, bidi_cache_idx == 1 ? "" : "s");
2015
2016 for (i = bidi_cache[bidi_cache_idx - 1].charpos; i > 0; i /= 10)
2017 ndigits++;
2018 fputs ("ch ", stderr);
2019 for (i = 0; i < bidi_cache_idx; i++)
2020 fprintf (stderr, "%*c", ndigits, bidi_cache[i].ch);
2021 fputs ("\n", stderr);
2022 fputs ("lvl ", stderr);
2023 for (i = 0; i < bidi_cache_idx; i++)
2024 fprintf (stderr, "%*d", ndigits, bidi_cache[i].resolved_level);
2025 fputs ("\n", stderr);
2026 fputs ("pos ", stderr);
2027 for (i = 0; i < bidi_cache_idx; i++)
2028 fprintf (stderr, "%*d", ndigits, bidi_cache[i].charpos);
2029 fputs ("\n", stderr);
2030}