1 /* Low-level bidirectional buffer-scanning functions for GNU Emacs.
2 Copyright (C) 2000, 2001, 2004, 2005, 2009, 2010
3 Free Software Foundation, Inc.
5 This file is part of GNU Emacs.
7 GNU Emacs is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 GNU Emacs is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
20 /* Written by Eli Zaretskii <eliz@gnu.org>.
22 A sequential implementation of the Unicode Bidirectional algorithm,
23 as per UAX#9, a part of the Unicode Standard.
25 Unlike the reference and most other implementations, this one is
26 designed to be called once for every character in the buffer or
29 The main entry point is bidi_get_next_char_visually. Each time it
30 is called, it finds the next character in the visual order, and
31 returns its information in a special structure. The caller is then
32 expected to process this character for display or any other
33 purposes, and call bidi_get_next_char_visually for the next
34 character. See the comments in bidi_get_next_char_visually for
35 more details about its algorithm that finds the next visual-order
36 character by resolving their levels on the fly.
38 The two other entry points are bidi_paragraph_init and
39 bidi_mirror_char. The first determines the base direction of a
40 paragraph, while the second returns the mirrored version of its
43 If you want to understand the code, you will have to read it
44 together with the relevant portions of UAX#9. The comments include
45 references to UAX#9 rules, for that very reason.
47 A note about references to UAX#9 rules: if the reference says
48 something like "X9/Retaining", it means that you need to refer to
49 rule X9 and to its modifications decribed in the "Implementation
50 Notes" section of UAX#9, under "Retaining Format Codes". */
66 #include "character.h"
67 #include "dispextern.h"
69 static int bidi_initialized
= 0;
71 static Lisp_Object bidi_type_table
;
73 /* FIXME: Remove these when bidi_explicit_dir_char uses a lookup table. */
74 #define LRM_CHAR 0x200E
75 #define RLM_CHAR 0x200F
76 #define LRE_CHAR 0x202A
77 #define RLE_CHAR 0x202B
78 #define PDF_CHAR 0x202C
79 #define LRO_CHAR 0x202D
80 #define RLO_CHAR 0x202E
83 #define BIDI_BOB -2 /* FIXME: Is this needed? */
85 /* Local data structures. (Look in dispextern.h for the rest.) */
87 /* What we need to know about the current paragraph. */
88 struct bidi_paragraph_info
{
89 int start_bytepos
; /* byte position where it begins */
90 int end_bytepos
; /* byte position where it ends */
91 int embedding_level
; /* its basic embedding level */
92 bidi_dir_t base_dir
; /* its base direction */
95 /* Data type for describing the bidirectional character categories. */
103 int bidi_ignore_explicit_marks_for_paragraph_level
= 1;
105 static Lisp_Object fallback_paragraph_start_re
, fallback_paragraph_separate_re
;
106 static Lisp_Object Qparagraph_start
, Qparagraph_separate
;
111 /* FIXME: This should come from the Unicode Database. */
116 { { 0x0000, 0x0008, WEAK_BN
},
117 { 0x0009, 0x0000, NEUTRAL_S
},
118 { 0x000A, 0x0000, NEUTRAL_B
},
119 { 0x000B, 0x0000, NEUTRAL_S
},
120 { 0x000C, 0x0000, NEUTRAL_WS
},
121 { 0x000D, 0x0000, NEUTRAL_B
},
122 { 0x000E, 0x001B, WEAK_BN
},
123 { 0x001C, 0x001E, NEUTRAL_B
},
124 { 0x001F, 0x0000, NEUTRAL_S
},
125 { 0x0020, 0x0000, NEUTRAL_WS
},
126 { 0x0021, 0x0022, NEUTRAL_ON
},
127 { 0x0023, 0x0025, WEAK_ET
},
128 { 0x0026, 0x002A, NEUTRAL_ON
},
129 { 0x002B, 0x0000, WEAK_ES
},
130 { 0x002C, 0x0000, WEAK_CS
},
131 { 0x002D, 0x0000, WEAK_ES
},
132 { 0x002E, 0x002F, WEAK_CS
},
133 { 0x0030, 0x0039, WEAK_EN
},
134 { 0x003A, 0x0000, WEAK_CS
},
135 { 0x003B, 0x0040, NEUTRAL_ON
},
136 { 0x005B, 0x0060, NEUTRAL_ON
},
137 { 0x007B, 0x007E, NEUTRAL_ON
},
138 { 0x007F, 0x0084, WEAK_BN
},
139 { 0x0085, 0x0000, NEUTRAL_B
},
140 { 0x0086, 0x009F, WEAK_BN
},
141 { 0x00A0, 0x0000, WEAK_CS
},
142 { 0x00A1, 0x0000, NEUTRAL_ON
},
143 { 0x00A2, 0x00A5, WEAK_ET
},
144 { 0x00A6, 0x00A9, NEUTRAL_ON
},
145 { 0x00AB, 0x00AC, NEUTRAL_ON
},
146 { 0x00AD, 0x0000, WEAK_BN
},
147 { 0x00AE, 0x00Af, NEUTRAL_ON
},
148 { 0x00B0, 0x00B1, WEAK_ET
},
149 { 0x00B2, 0x00B3, WEAK_EN
},
150 { 0x00B4, 0x0000, NEUTRAL_ON
},
151 { 0x00B6, 0x00B8, NEUTRAL_ON
},
152 { 0x00B9, 0x0000, WEAK_EN
},
153 { 0x00BB, 0x00BF, NEUTRAL_ON
},
154 { 0x00D7, 0x0000, NEUTRAL_ON
},
155 { 0x00F7, 0x0000, NEUTRAL_ON
},
156 { 0x02B9, 0x02BA, NEUTRAL_ON
},
157 { 0x02C2, 0x02CF, NEUTRAL_ON
},
158 { 0x02D2, 0x02DF, NEUTRAL_ON
},
159 { 0x02E5, 0x02ED, NEUTRAL_ON
},
160 { 0x0300, 0x036F, WEAK_NSM
},
161 { 0x0374, 0x0375, NEUTRAL_ON
},
162 { 0x037E, 0x0385, NEUTRAL_ON
},
163 { 0x0387, 0x0000, NEUTRAL_ON
},
164 { 0x03F6, 0x0000, NEUTRAL_ON
},
165 { 0x0483, 0x0489, WEAK_NSM
},
166 { 0x058A, 0x0000, NEUTRAL_ON
},
167 { 0x0591, 0x05BD, WEAK_NSM
},
168 { 0x05BE, 0x0000, STRONG_R
},
169 { 0x05BF, 0x0000, WEAK_NSM
},
170 { 0x05C0, 0x0000, STRONG_R
},
171 { 0x05C1, 0x05C2, WEAK_NSM
},
172 { 0x05C3, 0x0000, STRONG_R
},
173 { 0x05C4, 0x05C5, WEAK_NSM
},
174 { 0x05C6, 0x0000, STRONG_R
},
175 { 0x05C7, 0x0000, WEAK_NSM
},
176 { 0x05D0, 0x05F4, STRONG_R
},
177 { 0x060C, 0x0000, WEAK_CS
},
178 { 0x061B, 0x064A, STRONG_AL
},
179 { 0x064B, 0x0655, WEAK_NSM
},
180 { 0x0660, 0x0669, WEAK_AN
},
181 { 0x066A, 0x0000, WEAK_ET
},
182 { 0x066B, 0x066C, WEAK_AN
},
183 { 0x066D, 0x066F, STRONG_AL
},
184 { 0x0670, 0x0000, WEAK_NSM
},
185 { 0x0671, 0x06D5, STRONG_AL
},
186 { 0x06D6, 0x06DC, WEAK_NSM
},
187 { 0x06DD, 0x0000, STRONG_AL
},
188 { 0x06DE, 0x06E4, WEAK_NSM
},
189 { 0x06E5, 0x06E6, STRONG_AL
},
190 { 0x06E7, 0x06E8, WEAK_NSM
},
191 { 0x06E9, 0x0000, NEUTRAL_ON
},
192 { 0x06EA, 0x06ED, WEAK_NSM
},
193 { 0x06F0, 0x06F9, WEAK_EN
},
194 { 0x06FA, 0x070D, STRONG_AL
},
195 { 0x070F, 0x0000, WEAK_BN
},
196 { 0x0710, 0x0000, STRONG_AL
},
197 { 0x0711, 0x0000, WEAK_NSM
},
198 { 0x0712, 0x072C, STRONG_AL
},
199 { 0x0730, 0x074A, WEAK_NSM
},
200 { 0x0780, 0x07A5, STRONG_AL
},
201 { 0x07A6, 0x07B0, WEAK_NSM
},
202 { 0x07B1, 0x0000, STRONG_AL
},
203 { 0x0901, 0x0902, WEAK_NSM
},
204 { 0x093C, 0x0000, WEAK_NSM
},
205 { 0x0941, 0x0948, WEAK_NSM
},
206 { 0x094D, 0x0000, WEAK_NSM
},
207 { 0x0951, 0x0954, WEAK_NSM
},
208 { 0x0962, 0x0963, WEAK_NSM
},
209 { 0x0981, 0x0000, WEAK_NSM
},
210 { 0x09BC, 0x0000, WEAK_NSM
},
211 { 0x09C1, 0x09C4, WEAK_NSM
},
212 { 0x09CD, 0x0000, WEAK_NSM
},
213 { 0x09E2, 0x09E3, WEAK_NSM
},
214 { 0x09F2, 0x09F3, WEAK_ET
},
215 { 0x0A02, 0x0000, WEAK_NSM
},
216 { 0x0A3C, 0x0000, WEAK_NSM
},
217 { 0x0A41, 0x0A4D, WEAK_NSM
},
218 { 0x0A70, 0x0A71, WEAK_NSM
},
219 { 0x0A81, 0x0A82, WEAK_NSM
},
220 { 0x0ABC, 0x0000, WEAK_NSM
},
221 { 0x0AC1, 0x0AC8, WEAK_NSM
},
222 { 0x0ACD, 0x0000, WEAK_NSM
},
223 { 0x0B01, 0x0000, WEAK_NSM
},
224 { 0x0B3C, 0x0000, WEAK_NSM
},
225 { 0x0B3F, 0x0000, WEAK_NSM
},
226 { 0x0B41, 0x0B43, WEAK_NSM
},
227 { 0x0B4D, 0x0B56, WEAK_NSM
},
228 { 0x0B82, 0x0000, WEAK_NSM
},
229 { 0x0BC0, 0x0000, WEAK_NSM
},
230 { 0x0BCD, 0x0000, WEAK_NSM
},
231 { 0x0C3E, 0x0C40, WEAK_NSM
},
232 { 0x0C46, 0x0C56, WEAK_NSM
},
233 { 0x0CBF, 0x0000, WEAK_NSM
},
234 { 0x0CC6, 0x0000, WEAK_NSM
},
235 { 0x0CCC, 0x0CCD, WEAK_NSM
},
236 { 0x0D41, 0x0D43, WEAK_NSM
},
237 { 0x0D4D, 0x0000, WEAK_NSM
},
238 { 0x0DCA, 0x0000, WEAK_NSM
},
239 { 0x0DD2, 0x0DD6, WEAK_NSM
},
240 { 0x0E31, 0x0000, WEAK_NSM
},
241 { 0x0E34, 0x0E3A, WEAK_NSM
},
242 { 0x0E3F, 0x0000, WEAK_ET
},
243 { 0x0E47, 0x0E4E, WEAK_NSM
},
244 { 0x0EB1, 0x0000, WEAK_NSM
},
245 { 0x0EB4, 0x0EBC, WEAK_NSM
},
246 { 0x0EC8, 0x0ECD, WEAK_NSM
},
247 { 0x0F18, 0x0F19, WEAK_NSM
},
248 { 0x0F35, 0x0000, WEAK_NSM
},
249 { 0x0F37, 0x0000, WEAK_NSM
},
250 { 0x0F39, 0x0000, WEAK_NSM
},
251 { 0x0F3A, 0x0F3D, NEUTRAL_ON
},
252 { 0x0F71, 0x0F7E, WEAK_NSM
},
253 { 0x0F80, 0x0F84, WEAK_NSM
},
254 { 0x0F86, 0x0F87, WEAK_NSM
},
255 { 0x0F90, 0x0FBC, WEAK_NSM
},
256 { 0x0FC6, 0x0000, WEAK_NSM
},
257 { 0x102D, 0x1030, WEAK_NSM
},
258 { 0x1032, 0x1037, WEAK_NSM
},
259 { 0x1039, 0x0000, WEAK_NSM
},
260 { 0x1058, 0x1059, WEAK_NSM
},
261 { 0x1680, 0x0000, NEUTRAL_WS
},
262 { 0x169B, 0x169C, NEUTRAL_ON
},
263 { 0x1712, 0x1714, WEAK_NSM
},
264 { 0x1732, 0x1734, WEAK_NSM
},
265 { 0x1752, 0x1753, WEAK_NSM
},
266 { 0x1772, 0x1773, WEAK_NSM
},
267 { 0x17B7, 0x17BD, WEAK_NSM
},
268 { 0x17C6, 0x0000, WEAK_NSM
},
269 { 0x17C9, 0x17D3, WEAK_NSM
},
270 { 0x17DB, 0x0000, WEAK_ET
},
271 { 0x1800, 0x180A, NEUTRAL_ON
},
272 { 0x180B, 0x180D, WEAK_NSM
},
273 { 0x180E, 0x0000, WEAK_BN
},
274 { 0x18A9, 0x0000, WEAK_NSM
},
275 { 0x1FBD, 0x0000, NEUTRAL_ON
},
276 { 0x1FBF, 0x1FC1, NEUTRAL_ON
},
277 { 0x1FCD, 0x1FCF, NEUTRAL_ON
},
278 { 0x1FDD, 0x1FDF, NEUTRAL_ON
},
279 { 0x1FED, 0x1FEF, NEUTRAL_ON
},
280 { 0x1FFD, 0x1FFE, NEUTRAL_ON
},
281 { 0x2000, 0x200A, NEUTRAL_WS
},
282 { 0x200B, 0x200D, WEAK_BN
},
283 { 0x200F, 0x0000, STRONG_R
},
284 { 0x2010, 0x2027, NEUTRAL_ON
},
285 { 0x2028, 0x0000, NEUTRAL_WS
},
286 { 0x2029, 0x0000, NEUTRAL_B
},
287 { 0x202A, 0x0000, LRE
},
288 { 0x202B, 0x0000, RLE
},
289 { 0x202C, 0x0000, PDF
},
290 { 0x202D, 0x0000, LRO
},
291 { 0x202E, 0x0000, RLO
},
292 { 0x202F, 0x0000, NEUTRAL_WS
},
293 { 0x2030, 0x2034, WEAK_ET
},
294 { 0x2035, 0x2057, NEUTRAL_ON
},
295 { 0x205F, 0x0000, NEUTRAL_WS
},
296 { 0x2060, 0x206F, WEAK_BN
},
297 { 0x2070, 0x0000, WEAK_EN
},
298 { 0x2074, 0x2079, WEAK_EN
},
299 { 0x207A, 0x207B, WEAK_ET
},
300 { 0x207C, 0x207E, NEUTRAL_ON
},
301 { 0x2080, 0x2089, WEAK_EN
},
302 { 0x208A, 0x208B, WEAK_ET
},
303 { 0x208C, 0x208E, NEUTRAL_ON
},
304 { 0x20A0, 0x20B1, WEAK_ET
},
305 { 0x20D0, 0x20EA, WEAK_NSM
},
306 { 0x2100, 0x2101, NEUTRAL_ON
},
307 { 0x2103, 0x2106, NEUTRAL_ON
},
308 { 0x2108, 0x2109, NEUTRAL_ON
},
309 { 0x2114, 0x0000, NEUTRAL_ON
},
310 { 0x2116, 0x2118, NEUTRAL_ON
},
311 { 0x211E, 0x2123, NEUTRAL_ON
},
312 { 0x2125, 0x0000, NEUTRAL_ON
},
313 { 0x2127, 0x0000, NEUTRAL_ON
},
314 { 0x2129, 0x0000, NEUTRAL_ON
},
315 { 0x212E, 0x0000, WEAK_ET
},
316 { 0x2132, 0x0000, NEUTRAL_ON
},
317 { 0x213A, 0x0000, NEUTRAL_ON
},
318 { 0x2140, 0x2144, NEUTRAL_ON
},
319 { 0x214A, 0x215F, NEUTRAL_ON
},
320 { 0x2190, 0x2211, NEUTRAL_ON
},
321 { 0x2212, 0x2213, WEAK_ET
},
322 { 0x2214, 0x2335, NEUTRAL_ON
},
323 { 0x237B, 0x2394, NEUTRAL_ON
},
324 { 0x2396, 0x244A, NEUTRAL_ON
},
325 { 0x2460, 0x249B, WEAK_EN
},
326 { 0x24EA, 0x0000, WEAK_EN
},
327 { 0x24EB, 0x2FFB, NEUTRAL_ON
},
328 { 0x3000, 0x0000, NEUTRAL_WS
},
329 { 0x3001, 0x3004, NEUTRAL_ON
},
330 { 0x3008, 0x3020, NEUTRAL_ON
},
331 { 0x302A, 0x302F, WEAK_NSM
},
332 { 0x3030, 0x0000, NEUTRAL_ON
},
333 { 0x3036, 0x3037, NEUTRAL_ON
},
334 { 0x303D, 0x303F, NEUTRAL_ON
},
335 { 0x3099, 0x309A, WEAK_NSM
},
336 { 0x309B, 0x309C, NEUTRAL_ON
},
337 { 0x30A0, 0x0000, NEUTRAL_ON
},
338 { 0x30FB, 0x0000, NEUTRAL_ON
},
339 { 0x3251, 0x325F, NEUTRAL_ON
},
340 { 0x32B1, 0x32BF, NEUTRAL_ON
},
341 { 0xA490, 0xA4C6, NEUTRAL_ON
},
342 { 0xFB1D, 0x0000, STRONG_R
},
343 { 0xFB1E, 0x0000, WEAK_NSM
},
344 { 0xFB1F, 0xFB28, STRONG_R
},
345 { 0xFB29, 0x0000, WEAK_ET
},
346 { 0xFB2A, 0xFB4F, STRONG_R
},
347 { 0xFB50, 0xFD3D, STRONG_AL
},
348 { 0xFD3E, 0xFD3F, NEUTRAL_ON
},
349 { 0xFD50, 0xFDFC, STRONG_AL
},
350 { 0xFE00, 0xFE23, WEAK_NSM
},
351 { 0xFE30, 0xFE4F, NEUTRAL_ON
},
352 { 0xFE50, 0x0000, WEAK_CS
},
353 { 0xFE51, 0x0000, NEUTRAL_ON
},
354 { 0xFE52, 0x0000, WEAK_CS
},
355 { 0xFE54, 0x0000, NEUTRAL_ON
},
356 { 0xFE55, 0x0000, WEAK_CS
},
357 { 0xFE56, 0xFE5E, NEUTRAL_ON
},
358 { 0xFE5F, 0x0000, WEAK_ET
},
359 { 0xFE60, 0xFE61, NEUTRAL_ON
},
360 { 0xFE62, 0xFE63, WEAK_ET
},
361 { 0xFE64, 0xFE68, NEUTRAL_ON
},
362 { 0xFE69, 0xFE6A, WEAK_ET
},
363 { 0xFE6B, 0x0000, NEUTRAL_ON
},
364 { 0xFE70, 0xFEFC, STRONG_AL
},
365 { 0xFEFF, 0x0000, WEAK_BN
},
366 { 0xFF01, 0xFF02, NEUTRAL_ON
},
367 { 0xFF03, 0xFF05, WEAK_ET
},
368 { 0xFF06, 0xFF0A, NEUTRAL_ON
},
369 { 0xFF0B, 0x0000, WEAK_ET
},
370 { 0xFF0C, 0x0000, WEAK_CS
},
371 { 0xFF0D, 0x0000, WEAK_ET
},
372 { 0xFF0E, 0x0000, WEAK_CS
},
373 { 0xFF0F, 0x0000, WEAK_ES
},
374 { 0xFF10, 0xFF19, WEAK_EN
},
375 { 0xFF1A, 0x0000, WEAK_CS
},
376 { 0xFF1B, 0xFF20, NEUTRAL_ON
},
377 { 0xFF3B, 0xFF40, NEUTRAL_ON
},
378 { 0xFF5B, 0xFF65, NEUTRAL_ON
},
379 { 0xFFE0, 0xFFE1, WEAK_ET
},
380 { 0xFFE2, 0xFFE4, NEUTRAL_ON
},
381 { 0xFFE5, 0xFFE6, WEAK_ET
},
382 { 0xFFE8, 0xFFEE, NEUTRAL_ON
},
383 { 0xFFF9, 0xFFFB, WEAK_BN
},
384 { 0xFFFC, 0xFFFD, NEUTRAL_ON
},
385 { 0x1D167, 0x1D169, WEAK_NSM
},
386 { 0x1D173, 0x1D17A, WEAK_BN
},
387 { 0x1D17B, 0x1D182, WEAK_NSM
},
388 { 0x1D185, 0x1D18B, WEAK_NSM
},
389 { 0x1D1AA, 0x1D1AD, WEAK_NSM
},
390 { 0x1D7CE, 0x1D7FF, WEAK_EN
},
391 { 0xE0001, 0xE007F, WEAK_BN
} };
394 bidi_type_table
= Fmake_char_table (Qnil
, make_number (STRONG_L
));
395 staticpro (&bidi_type_table
);
397 for (i
= 0; i
< sizeof bidi_type
/ sizeof bidi_type
[0]; i
++)
398 char_table_set_range (bidi_type_table
, bidi_type
[i
].from
,
399 bidi_type
[i
].to
? bidi_type
[i
].to
: bidi_type
[i
].from
,
400 make_number (bidi_type
[i
].type
));
402 fallback_paragraph_start_re
=
403 Fsymbol_value (Fintern_soft (build_string ("paragraph-start"), Qnil
));
404 if (!STRINGP (fallback_paragraph_start_re
))
405 fallback_paragraph_start_re
= build_string ("\f\\|[ \t]*$");
406 staticpro (&fallback_paragraph_start_re
);
407 Qparagraph_start
= intern ("paragraph-start");
408 staticpro (&Qparagraph_start
);
409 fallback_paragraph_separate_re
=
410 Fsymbol_value (Fintern_soft (build_string ("paragraph-separate"), Qnil
));
411 if (!STRINGP (fallback_paragraph_separate_re
))
412 fallback_paragraph_separate_re
= build_string ("[ \t\f]*$");
413 staticpro (&fallback_paragraph_separate_re
);
414 Qparagraph_separate
= intern ("paragraph-separate");
415 staticpro (&Qparagraph_separate
);
416 bidi_initialized
= 1;
419 /* Return the bidi type of a character CH, subject to the current
420 directional OVERRIDE. */
421 static INLINE bidi_type_t
422 bidi_get_type (int ch
, bidi_dir_t override
)
424 bidi_type_t default_type
;
428 if (ch
< 0 || ch
> MAX_CHAR
)
431 default_type
= (bidi_type_t
) XINT (CHAR_TABLE_REF (bidi_type_table
, ch
));
433 if (override
== NEUTRAL_DIR
)
436 switch (default_type
)
438 /* Although UAX#9 does not tell, it doesn't make sense to
439 override NEUTRAL_B and LRM/RLM characters. */
454 if (override
== L2R
) /* X6 */
456 else if (override
== R2L
)
459 abort (); /* can't happen: handled above */
465 bidi_check_type (bidi_type_t type
)
467 if (type
< UNKNOWN_BT
|| type
> NEUTRAL_ON
)
471 /* Given a bidi TYPE of a character, return its category. */
472 static INLINE bidi_category_t
473 bidi_get_category (bidi_type_t type
)
487 case PDF
: /* ??? really?? */
506 /* Return the mirrored character of C, if any.
508 Note: The conditions in UAX#9 clause L4 must be tested by the
510 /* FIXME: exceedingly temporary! Should consult the Unicode database
511 of character properties. */
513 bidi_mirror_char (int c
)
515 static const char mirrored_pairs
[] = "()<>[]{}";
516 const char *p
= c
> 0 && c
< 128 ? strchr (mirrored_pairs
, c
) : NULL
;
520 size_t i
= p
- mirrored_pairs
;
522 return mirrored_pairs
[(i
^ 1)];
527 /* Copy the bidi iterator from FROM to TO. To save cycles, this only
528 copies the part of the level stack that is actually in use. */
530 bidi_copy_it (struct bidi_it
*to
, struct bidi_it
*from
)
534 /* Copy everything except the level stack and beyond. */
535 memcpy (to
, from
, ((size_t)&((struct bidi_it
*)0)->level_stack
[0]));
537 /* Copy the active part of the level stack. */
538 to
->level_stack
[0] = from
->level_stack
[0]; /* level zero is always in use */
539 for (i
= 1; i
<= from
->stack_idx
; i
++)
540 to
->level_stack
[i
] = from
->level_stack
[i
];
543 /* Caching the bidi iterator states. */
545 static struct bidi_it bidi_cache
[1000]; /* FIXME: make this dynamically allocated! */
546 static int bidi_cache_idx
;
547 static int bidi_cache_last_idx
;
550 bidi_cache_reset (void)
553 bidi_cache_last_idx
= -1;
557 bidi_cache_fetch_state (int idx
, struct bidi_it
*bidi_it
)
559 int current_scan_dir
= bidi_it
->scan_dir
;
561 if (idx
< 0 || idx
>= bidi_cache_idx
)
564 bidi_copy_it (bidi_it
, &bidi_cache
[idx
]);
565 bidi_it
->scan_dir
= current_scan_dir
;
566 bidi_cache_last_idx
= idx
;
569 /* Find a cached state with a given CHARPOS and resolved embedding
570 level less or equal to LEVEL. if LEVEL is -1, disregard the
571 resolved levels in cached states. DIR, if non-zero, means search
572 in that direction from the last cache hit. */
574 bidi_cache_search (int charpos
, int level
, int dir
)
580 if (charpos
< bidi_cache
[bidi_cache_last_idx
].charpos
)
582 else if (charpos
> bidi_cache
[bidi_cache_last_idx
].charpos
)
585 i_start
= bidi_cache_last_idx
;
589 i_start
= bidi_cache_idx
- 1;
594 /* Linear search for now; FIXME! */
595 for (i
= i_start
; i
>= 0; i
--)
596 if (bidi_cache
[i
].charpos
== charpos
597 && (level
== -1 || bidi_cache
[i
].resolved_level
<= level
))
602 for (i
= i_start
; i
< bidi_cache_idx
; i
++)
603 if (bidi_cache
[i
].charpos
== charpos
604 && (level
== -1 || bidi_cache
[i
].resolved_level
<= level
))
612 /* Find a cached state where the resolved level changes to a value
613 that is lower than LEVEL, and return its cache slot index. DIR is
614 the direction to search, starting with the last used cache slot.
615 BEFORE, if non-zero, means return the index of the slot that is
616 ``before'' the level change in the search direction. That is,
617 given the cached levels like this:
622 and assuming we are at the position cached at the slot marked with
623 C, searching backwards (DIR = -1) for LEVEL = 2 will return the
624 index of slot B or A, depending whether BEFORE is, respectively,
627 bidi_cache_find_level_change (int level
, int dir
, int before
)
631 int i
= dir
? bidi_cache_last_idx
: bidi_cache_idx
- 1;
632 int incr
= before
? 1 : 0;
643 if (bidi_cache
[i
- incr
].resolved_level
>= 0
644 && bidi_cache
[i
- incr
].resolved_level
< level
)
651 while (i
< bidi_cache_idx
- incr
)
653 if (bidi_cache
[i
+ incr
].resolved_level
>= 0
654 && bidi_cache
[i
+ incr
].resolved_level
< level
)
665 bidi_cache_iterator_state (struct bidi_it
*bidi_it
, int resolved
)
669 /* We should never cache on backward scans. */
670 if (bidi_it
->scan_dir
== -1)
672 idx
= bidi_cache_search (bidi_it
->charpos
, -1, 1);
676 idx
= bidi_cache_idx
;
677 /* Don't overrun the cache limit. */
678 if (idx
> sizeof (bidi_cache
) / sizeof (bidi_cache
[0]) - 1)
680 /* Character positions should correspond to cache positions 1:1.
681 If we are outside the range of cached positions, the cache is
682 useless and must be reset. */
684 (bidi_it
->charpos
> bidi_cache
[idx
- 1].charpos
+ 1
685 || bidi_it
->charpos
< bidi_cache
[0].charpos
))
690 bidi_copy_it (&bidi_cache
[idx
], bidi_it
);
692 bidi_cache
[idx
].resolved_level
= -1;
693 bidi_cache
[idx
].new_paragraph
= 0;
697 /* Copy only the members which could have changed, to avoid
698 costly copying of the entire struct. */
699 bidi_cache
[idx
].type
= bidi_it
->type
;
700 bidi_check_type (bidi_it
->type
);
701 bidi_cache
[idx
].type_after_w1
= bidi_it
->type_after_w1
;
702 bidi_check_type (bidi_it
->type_after_w1
);
704 bidi_cache
[idx
].resolved_level
= bidi_it
->resolved_level
;
706 bidi_cache
[idx
].resolved_level
= -1;
707 bidi_cache
[idx
].invalid_levels
= bidi_it
->invalid_levels
;
708 bidi_cache
[idx
].invalid_rl_levels
= bidi_it
->invalid_rl_levels
;
709 bidi_cache
[idx
].next_for_neutral
= bidi_it
->next_for_neutral
;
710 bidi_cache
[idx
].next_for_ws
= bidi_it
->next_for_ws
;
711 bidi_cache
[idx
].ignore_bn_limit
= bidi_it
->ignore_bn_limit
;
714 bidi_cache_last_idx
= idx
;
715 if (idx
>= bidi_cache_idx
)
716 bidi_cache_idx
= idx
+ 1;
719 static INLINE bidi_type_t
720 bidi_cache_find (int charpos
, int level
, struct bidi_it
*bidi_it
)
722 int i
= bidi_cache_search (charpos
, level
, bidi_it
->scan_dir
);
726 bidi_dir_t current_scan_dir
= bidi_it
->scan_dir
;
728 bidi_copy_it (bidi_it
, &bidi_cache
[i
]);
729 bidi_cache_last_idx
= i
;
730 /* Don't let scan direction from from the cached state override
731 the current scan direction. */
732 bidi_it
->scan_dir
= current_scan_dir
;
733 return bidi_it
->type
;
740 bidi_peek_at_next_level (struct bidi_it
*bidi_it
)
742 if (bidi_cache_idx
== 0 || bidi_cache_last_idx
== -1)
744 return bidi_cache
[bidi_cache_last_idx
+ bidi_it
->scan_dir
].resolved_level
;
747 /* Check if buffer position CHARPOS/BYTEPOS is the end of a paragraph.
748 Value is the non-negative length of the paragraph separator
749 following the buffer position, -1 if position is at the beginning
750 of a new paragraph, or -2 if position is neither at beginning nor
751 at end of a paragraph. */
753 bidi_at_paragraph_end (EMACS_INT charpos
, EMACS_INT bytepos
)
755 Lisp_Object sep_re
= Fbuffer_local_value (Qparagraph_separate
,
757 Lisp_Object start_re
= Fbuffer_local_value (Qparagraph_start
,
761 if (!STRINGP (sep_re
))
762 sep_re
= fallback_paragraph_separate_re
;
763 if (!STRINGP (start_re
))
764 start_re
= fallback_paragraph_start_re
;
766 val
= fast_looking_at (sep_re
, charpos
, bytepos
, ZV
, ZV_BYTE
, Qnil
);
769 if (fast_looking_at (start_re
, charpos
, bytepos
, ZV
, ZV_BYTE
, Qnil
) >= 0)
778 /* Determine the start-of-run (sor) directional type given the two
779 embedding levels on either side of the run boundary. Also, update
780 the saved info about previously seen characters, since that info is
781 generally valid for a single level run. */
783 bidi_set_sor_type (struct bidi_it
*bidi_it
, int level_before
, int level_after
)
785 int higher_level
= level_before
> level_after
? level_before
: level_after
;
787 /* The prev_was_pdf gork is required for when we have several PDFs
788 in a row. In that case, we want to compute the sor type for the
789 next level run only once: when we see the first PDF. That's
790 because the sor type depends only on the higher of the two levels
791 that we find on the two sides of the level boundary (see UAX#9,
792 clause X10), and so we don't need to know the final embedding
793 level to which we descend after processing all the PDFs. */
794 if (!bidi_it
->prev_was_pdf
|| level_before
< level_after
)
795 /* FIXME: should the default sor direction be user selectable? */
796 bidi_it
->sor
= (higher_level
& 1) != 0 ? R2L
: L2R
;
797 if (level_before
> level_after
)
798 bidi_it
->prev_was_pdf
= 1;
800 bidi_it
->prev
.type
= UNKNOWN_BT
;
801 bidi_it
->last_strong
.type
= bidi_it
->last_strong
.type_after_w1
=
802 bidi_it
->last_strong
.orig_type
= UNKNOWN_BT
;
803 bidi_it
->prev_for_neutral
.type
= bidi_it
->sor
== R2L
? STRONG_R
: STRONG_L
;
804 bidi_it
->prev_for_neutral
.charpos
= bidi_it
->charpos
;
805 bidi_it
->prev_for_neutral
.bytepos
= bidi_it
->bytepos
;
806 bidi_it
->next_for_neutral
.type
= bidi_it
->next_for_neutral
.type_after_w1
=
807 bidi_it
->next_for_neutral
.orig_type
= UNKNOWN_BT
;
808 bidi_it
->ignore_bn_limit
= 0; /* meaning it's unknown */
812 bidi_line_init (struct bidi_it
*bidi_it
)
814 bidi_it
->scan_dir
= 1; /* FIXME: do we need to have control on this? */
815 bidi_it
->resolved_level
= bidi_it
->level_stack
[0].level
;
816 bidi_it
->level_stack
[0].override
= NEUTRAL_DIR
; /* X1 */
817 bidi_it
->invalid_levels
= 0;
818 bidi_it
->invalid_rl_levels
= -1;
819 bidi_it
->next_en_pos
= -1;
820 bidi_it
->next_for_ws
.type
= UNKNOWN_BT
;
821 bidi_set_sor_type (bidi_it
,
822 bidi_it
->paragraph_dir
== R2L
? 1 : 0,
823 bidi_it
->level_stack
[0].level
); /* X10 */
828 /* Find the beginning of this paragraph by looking back in the buffer.
829 Value is the byte position of the paragraph's beginning. */
831 bidi_find_paragraph_start (EMACS_INT pos
, EMACS_INT pos_byte
)
833 Lisp_Object re
= Fbuffer_local_value (Qparagraph_start
, Fcurrent_buffer ());
834 EMACS_INT limit
= ZV
, limit_byte
= ZV_BYTE
;
837 re
= fallback_paragraph_start_re
;
838 while (pos_byte
> BEGV_BYTE
839 && fast_looking_at (re
, pos
, pos_byte
, limit
, limit_byte
, Qnil
) < 0)
841 pos
= find_next_newline_no_quit (pos
- 1, -1);
842 pos_byte
= CHAR_TO_BYTE (pos
);
847 /* Determine the direction, a.k.a. base embedding level, of the
848 paragraph we are about to iterate through. If DIR is either L2R or
849 R2L, just use that. Otherwise, determine the paragraph direction
850 from the first strong character of the paragraph.
852 Note that this gives the paragraph separator the same direction as
853 the preceding paragraph, even though Emacs generally views the
854 separartor as not belonging to any paragraph. */
856 bidi_paragraph_init (bidi_dir_t dir
, struct bidi_it
*bidi_it
)
858 EMACS_INT bytepos
= bidi_it
->bytepos
;
860 /* Special case for an empty buffer. */
861 if (bytepos
== BEGV_BYTE
&& bytepos
== ZV_BYTE
)
863 /* We should never be called at EOB or before BEGV. */
864 else if (bytepos
>= ZV_BYTE
|| bytepos
< BEGV_BYTE
)
869 bidi_it
->paragraph_dir
= L2R
;
870 bidi_it
->new_paragraph
= 0;
874 bidi_it
->paragraph_dir
= R2L
;
875 bidi_it
->new_paragraph
= 0;
877 else if (dir
== NEUTRAL_DIR
) /* P2 */
883 /* If we are inside a paragraph separator, we are just waiting
884 for the separator to be exhausted; use the previous paragraph
885 direction. But don't do that if we have been just reseated,
886 because we need to reinitialize below in that case. */
887 if (!bidi_it
->first_elt
888 && bidi_it
->charpos
< bidi_it
->separator_limit
)
891 /* If we are on a newline, get past it to where the next
892 paragraph might start. But don't do that at BEGV since then
893 we are potentially in a new paragraph that doesn't yet
895 pos
= bidi_it
->charpos
;
896 if (bytepos
> BEGV_BYTE
&& FETCH_CHAR (bytepos
) == '\n')
902 /* We are either at the beginning of a paragraph or in the
903 middle of it. Find where this paragraph starts. */
904 bytepos
= bidi_find_paragraph_start (pos
, bytepos
);
906 /* We should always be at the beginning of a new line at this
908 if (!(bytepos
== BEGV_BYTE
|| FETCH_CHAR (bytepos
- 1) == '\n'))
911 bidi_it
->separator_limit
= -1;
912 bidi_it
->new_paragraph
= 0;
913 ch
= FETCH_CHAR (bytepos
);
914 ch_len
= CHAR_BYTES (ch
);
915 pos
= BYTE_TO_CHAR (bytepos
);
916 type
= bidi_get_type (ch
, NEUTRAL_DIR
);
918 for (pos
++, bytepos
+= ch_len
;
919 /* NOTE: UAX#9 says to search only for L, AL, or R types of
920 characters, and ignore RLE, RLO, LRE, and LRO. However,
921 I'm not sure it makes sense to omit those 4; should try
922 with and without that to see the effect. */
923 (bidi_get_category (type
) != STRONG
)
924 || (bidi_ignore_explicit_marks_for_paragraph_level
925 && (type
== RLE
|| type
== RLO
926 || type
== LRE
|| type
== LRO
));
927 type
= bidi_get_type (ch
, NEUTRAL_DIR
))
929 if (type
== NEUTRAL_B
&& bidi_at_paragraph_end (pos
, bytepos
) >= -1)
931 if (bytepos
>= ZV_BYTE
)
933 /* Pretend there's a paragraph separator at end of buffer. */
937 FETCH_CHAR_ADVANCE (ch
, pos
, bytepos
);
939 if (type
== STRONG_R
|| type
== STRONG_AL
) /* P3 */
940 bidi_it
->paragraph_dir
= R2L
;
941 else if (type
== STRONG_L
)
942 bidi_it
->paragraph_dir
= L2R
;
947 /* Contrary to UAX#9 clause P3, we only default the paragraph
948 direction to L2R if we have no previous usable paragraph
950 if (bidi_it
->paragraph_dir
== NEUTRAL_DIR
)
951 bidi_it
->paragraph_dir
= L2R
; /* P3 and ``higher protocols'' */
952 if (bidi_it
->paragraph_dir
== R2L
)
953 bidi_it
->level_stack
[0].level
= 1;
955 bidi_it
->level_stack
[0].level
= 0;
957 bidi_line_init (bidi_it
);
960 /* Do whatever UAX#9 clause X8 says should be done at paragraph's
963 bidi_set_paragraph_end (struct bidi_it
*bidi_it
)
965 bidi_it
->invalid_levels
= 0;
966 bidi_it
->invalid_rl_levels
= -1;
967 bidi_it
->stack_idx
= 0;
968 bidi_it
->resolved_level
= bidi_it
->level_stack
[0].level
;
971 /* Initialize the bidi iterator from buffer position CHARPOS. */
973 bidi_init_it (EMACS_INT charpos
, EMACS_INT bytepos
, struct bidi_it
*bidi_it
)
975 if (! bidi_initialized
)
977 bidi_it
->charpos
= charpos
;
978 bidi_it
->bytepos
= bytepos
;
979 bidi_it
->first_elt
= 1;
980 bidi_set_paragraph_end (bidi_it
);
981 bidi_it
->new_paragraph
= 1;
982 bidi_it
->separator_limit
= -1;
983 bidi_it
->type
= NEUTRAL_B
;
984 bidi_it
->type_after_w1
= NEUTRAL_B
;
985 bidi_it
->orig_type
= NEUTRAL_B
;
986 bidi_it
->prev_was_pdf
= 0;
987 bidi_it
->prev
.type
= bidi_it
->prev
.type_after_w1
=
988 bidi_it
->prev
.orig_type
= UNKNOWN_BT
;
989 bidi_it
->last_strong
.type
= bidi_it
->last_strong
.type_after_w1
=
990 bidi_it
->last_strong
.orig_type
= UNKNOWN_BT
;
991 bidi_it
->next_for_neutral
.charpos
= -1;
992 bidi_it
->next_for_neutral
.type
=
993 bidi_it
->next_for_neutral
.type_after_w1
=
994 bidi_it
->next_for_neutral
.orig_type
= UNKNOWN_BT
;
995 bidi_it
->prev_for_neutral
.charpos
= -1;
996 bidi_it
->prev_for_neutral
.type
=
997 bidi_it
->prev_for_neutral
.type_after_w1
=
998 bidi_it
->prev_for_neutral
.orig_type
= UNKNOWN_BT
;
999 bidi_it
->sor
= L2R
; /* FIXME: should it be user-selectable? */
1002 /* Push the current embedding level and override status; reset the
1003 current level to LEVEL and the current override status to OVERRIDE. */
1005 bidi_push_embedding_level (struct bidi_it
*bidi_it
,
1006 int level
, bidi_dir_t override
)
1008 bidi_it
->stack_idx
++;
1009 if (bidi_it
->stack_idx
>= BIDI_MAXLEVEL
)
1011 bidi_it
->level_stack
[bidi_it
->stack_idx
].level
= level
;
1012 bidi_it
->level_stack
[bidi_it
->stack_idx
].override
= override
;
1015 /* Pop the embedding level and directional override status from the
1016 stack, and return the new level. */
1018 bidi_pop_embedding_level (struct bidi_it
*bidi_it
)
1020 /* UAX#9 says to ignore invalid PDFs. */
1021 if (bidi_it
->stack_idx
> 0)
1022 bidi_it
->stack_idx
--;
1023 return bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
1026 /* Record in SAVED_INFO the information about the current character. */
1028 bidi_remember_char (struct bidi_saved_info
*saved_info
,
1029 struct bidi_it
*bidi_it
)
1031 saved_info
->charpos
= bidi_it
->charpos
;
1032 saved_info
->bytepos
= bidi_it
->bytepos
;
1033 saved_info
->type
= bidi_it
->type
;
1034 bidi_check_type (bidi_it
->type
);
1035 saved_info
->type_after_w1
= bidi_it
->type_after_w1
;
1036 bidi_check_type (bidi_it
->type_after_w1
);
1037 saved_info
->orig_type
= bidi_it
->orig_type
;
1038 bidi_check_type (bidi_it
->orig_type
);
1041 /* Resolve the type of a neutral character according to the type of
1042 surrounding strong text and the current embedding level. */
1043 static INLINE bidi_type_t
1044 bidi_resolve_neutral_1 (bidi_type_t prev_type
, bidi_type_t next_type
, int lev
)
1046 /* N1: European and Arabic numbers are treated as though they were R. */
1047 if (next_type
== WEAK_EN
|| next_type
== WEAK_AN
)
1048 next_type
= STRONG_R
;
1049 if (prev_type
== WEAK_EN
|| prev_type
== WEAK_AN
)
1050 prev_type
= STRONG_R
;
1052 if (next_type
== prev_type
) /* N1 */
1054 else if ((lev
& 1) == 0) /* N2 */
1061 bidi_explicit_dir_char (int c
)
1063 /* FIXME: this should be replaced with a lookup table with suitable
1064 bits set, like standard C ctype macros do. */
1065 return (c
== LRE_CHAR
|| c
== LRO_CHAR
1066 || c
== RLE_CHAR
|| c
== RLO_CHAR
|| c
== PDF_CHAR
);
1069 /* A helper function for bidi_resolve_explicit. It advances to the
1070 next character in logical order and determines the new embedding
1071 level and directional override, but does not take into account
1072 empty embeddings. */
1074 bidi_resolve_explicit_1 (struct bidi_it
*bidi_it
)
1080 bidi_dir_t override
;
1082 if (bidi_it
->bytepos
< BEGV_BYTE
/* after reseat to BEGV? */
1083 || bidi_it
->first_elt
)
1085 bidi_it
->first_elt
= 0;
1086 if (bidi_it
->charpos
< BEGV
)
1087 bidi_it
->charpos
= BEGV
;
1088 bidi_it
->bytepos
= CHAR_TO_BYTE (bidi_it
->charpos
);
1090 else if (bidi_it
->bytepos
< ZV_BYTE
) /* don't move at ZV */
1093 if (bidi_it
->ch_len
== 0)
1095 bidi_it
->bytepos
+= bidi_it
->ch_len
;
1098 current_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
; /* X1 */
1099 override
= bidi_it
->level_stack
[bidi_it
->stack_idx
].override
;
1100 new_level
= current_level
;
1102 /* in case it is a unibyte character (not yet implemented) */
1103 /* _fetch_multibyte_char_len = 1; */
1104 if (bidi_it
->bytepos
>= ZV_BYTE
)
1107 bidi_it
->ch_len
= 1;
1111 curchar
= FETCH_CHAR (bidi_it
->bytepos
);
1112 bidi_it
->ch_len
= CHAR_BYTES (curchar
);
1114 bidi_it
->ch
= curchar
;
1116 /* Don't apply directional override here, as all the types we handle
1117 below will not be affected by the override anyway, and we need
1118 the original type unaltered. The override will be applied in
1119 bidi_resolve_weak. */
1120 type
= bidi_get_type (curchar
, NEUTRAL_DIR
);
1121 bidi_it
->orig_type
= type
;
1122 bidi_check_type (bidi_it
->orig_type
);
1125 bidi_it
->prev_was_pdf
= 0;
1127 bidi_it
->type_after_w1
= UNKNOWN_BT
;
1133 bidi_it
->type_after_w1
= type
;
1134 bidi_check_type (bidi_it
->type_after_w1
);
1135 type
= WEAK_BN
; /* X9/Retaining */
1136 if (bidi_it
->ignore_bn_limit
<= 0)
1138 if (current_level
<= BIDI_MAXLEVEL
- 4)
1140 /* Compute the least odd embedding level greater than
1141 the current level. */
1142 new_level
= ((current_level
+ 1) & ~1) + 1;
1143 if (bidi_it
->type_after_w1
== RLE
)
1144 override
= NEUTRAL_DIR
;
1147 if (current_level
== BIDI_MAXLEVEL
- 4)
1148 bidi_it
->invalid_rl_levels
= 0;
1149 bidi_push_embedding_level (bidi_it
, new_level
, override
);
1153 bidi_it
->invalid_levels
++;
1154 /* See the commentary about invalid_rl_levels below. */
1155 if (bidi_it
->invalid_rl_levels
< 0)
1156 bidi_it
->invalid_rl_levels
= 0;
1157 bidi_it
->invalid_rl_levels
++;
1160 else if (bidi_it
->prev
.type_after_w1
== WEAK_EN
/* W5/Retaining */
1161 || bidi_it
->next_en_pos
> bidi_it
->charpos
)
1166 bidi_it
->type_after_w1
= type
;
1167 bidi_check_type (bidi_it
->type_after_w1
);
1168 type
= WEAK_BN
; /* X9/Retaining */
1169 if (bidi_it
->ignore_bn_limit
<= 0)
1171 if (current_level
<= BIDI_MAXLEVEL
- 5)
1173 /* Compute the least even embedding level greater than
1174 the current level. */
1175 new_level
= ((current_level
+ 2) & ~1);
1176 if (bidi_it
->type_after_w1
== LRE
)
1177 override
= NEUTRAL_DIR
;
1180 bidi_push_embedding_level (bidi_it
, new_level
, override
);
1184 bidi_it
->invalid_levels
++;
1185 /* invalid_rl_levels counts invalid levels encountered
1186 while the embedding level was already too high for
1187 LRE/LRO, but not for RLE/RLO. That is because
1188 there may be exactly one PDF which we should not
1189 ignore even though invalid_levels is non-zero.
1190 invalid_rl_levels helps to know what PDF is
1192 if (bidi_it
->invalid_rl_levels
>= 0)
1193 bidi_it
->invalid_rl_levels
++;
1196 else if (bidi_it
->prev
.type_after_w1
== WEAK_EN
/* W5/Retaining */
1197 || bidi_it
->next_en_pos
> bidi_it
->charpos
)
1201 bidi_it
->type_after_w1
= type
;
1202 bidi_check_type (bidi_it
->type_after_w1
);
1203 type
= WEAK_BN
; /* X9/Retaining */
1204 if (bidi_it
->ignore_bn_limit
<= 0)
1206 if (!bidi_it
->invalid_rl_levels
)
1208 new_level
= bidi_pop_embedding_level (bidi_it
);
1209 bidi_it
->invalid_rl_levels
= -1;
1210 if (bidi_it
->invalid_levels
)
1211 bidi_it
->invalid_levels
--;
1212 /* else nothing: UAX#9 says to ignore invalid PDFs */
1214 if (!bidi_it
->invalid_levels
)
1215 new_level
= bidi_pop_embedding_level (bidi_it
);
1218 bidi_it
->invalid_levels
--;
1219 bidi_it
->invalid_rl_levels
--;
1222 else if (bidi_it
->prev
.type_after_w1
== WEAK_EN
/* W5/Retaining */
1223 || bidi_it
->next_en_pos
> bidi_it
->charpos
)
1231 bidi_it
->type
= type
;
1232 bidi_check_type (bidi_it
->type
);
1237 /* Given an iterator state in BIDI_IT, advance one character position
1238 in the buffer to the next character (in the logical order), resolve
1239 any explicit embeddings and directional overrides, and return the
1240 embedding level of the character after resolving explicit
1241 directives and ignoring empty embeddings. */
1243 bidi_resolve_explicit (struct bidi_it
*bidi_it
)
1245 int prev_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
1246 int new_level
= bidi_resolve_explicit_1 (bidi_it
);
1248 if (prev_level
< new_level
1249 && bidi_it
->type
== WEAK_BN
1250 && bidi_it
->ignore_bn_limit
== 0 /* only if not already known */
1251 && bidi_it
->bytepos
< ZV_BYTE
/* not already at EOB */
1252 && bidi_explicit_dir_char (FETCH_CHAR (bidi_it
->bytepos
1253 + bidi_it
->ch_len
)))
1255 /* Avoid pushing and popping embedding levels if the level run
1256 is empty, as this breaks level runs where it shouldn't.
1257 UAX#9 removes all the explicit embedding and override codes,
1258 so empty embeddings disappear without a trace. We need to
1259 behave as if we did the same. */
1260 struct bidi_it saved_it
;
1261 int level
= prev_level
;
1263 bidi_copy_it (&saved_it
, bidi_it
);
1265 while (bidi_explicit_dir_char (FETCH_CHAR (bidi_it
->bytepos
1266 + bidi_it
->ch_len
)))
1268 level
= bidi_resolve_explicit_1 (bidi_it
);
1271 if (level
== prev_level
) /* empty embedding */
1272 saved_it
.ignore_bn_limit
= bidi_it
->charpos
+ 1;
1273 else /* this embedding is non-empty */
1274 saved_it
.ignore_bn_limit
= -1;
1276 bidi_copy_it (bidi_it
, &saved_it
);
1277 if (bidi_it
->ignore_bn_limit
> 0)
1279 /* We pushed a level, but we shouldn't have. Undo that. */
1280 if (!bidi_it
->invalid_rl_levels
)
1282 new_level
= bidi_pop_embedding_level (bidi_it
);
1283 bidi_it
->invalid_rl_levels
= -1;
1284 if (bidi_it
->invalid_levels
)
1285 bidi_it
->invalid_levels
--;
1287 if (!bidi_it
->invalid_levels
)
1288 new_level
= bidi_pop_embedding_level (bidi_it
);
1291 bidi_it
->invalid_levels
--;
1292 bidi_it
->invalid_rl_levels
--;
1297 if (bidi_it
->type
== NEUTRAL_B
) /* X8 */
1299 bidi_set_paragraph_end (bidi_it
);
1300 /* This is needed by bidi_resolve_weak below, and in L1. */
1301 bidi_it
->type_after_w1
= bidi_it
->type
;
1302 bidi_check_type (bidi_it
->type_after_w1
);
1308 /* Advance in the buffer, resolve weak types and return the type of
1309 the next character after weak type resolution. */
1311 bidi_resolve_weak (struct bidi_it
*bidi_it
)
1314 bidi_dir_t override
;
1315 int prev_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
1316 int new_level
= bidi_resolve_explicit (bidi_it
);
1318 bidi_type_t type_of_next
;
1319 struct bidi_it saved_it
;
1321 type
= bidi_it
->type
;
1322 override
= bidi_it
->level_stack
[bidi_it
->stack_idx
].override
;
1324 if (type
== UNKNOWN_BT
1332 if (new_level
!= prev_level
1333 || bidi_it
->type
== NEUTRAL_B
)
1335 /* We've got a new embedding level run, compute the directional
1336 type of sor and initialize per-run variables (UAX#9, clause
1338 bidi_set_sor_type (bidi_it
, prev_level
, new_level
);
1340 else if (type
== NEUTRAL_S
|| type
== NEUTRAL_WS
1341 || type
== WEAK_BN
|| type
== STRONG_AL
)
1342 bidi_it
->type_after_w1
= type
; /* needed in L1 */
1343 bidi_check_type (bidi_it
->type_after_w1
);
1345 /* Level and directional override status are already recorded in
1346 bidi_it, and do not need any change; see X6. */
1347 if (override
== R2L
) /* X6 */
1349 else if (override
== L2R
)
1353 if (type
== WEAK_NSM
) /* W1 */
1355 /* Note that we don't need to consider the case where the
1356 prev character has its type overridden by an RLO or LRO,
1357 because then either the type of this NSM would have been
1358 also overridden, or the previous character is outside the
1359 current level run, and thus not relevant to this NSM.
1360 This is why NSM gets the type_after_w1 of the previous
1362 if (bidi_it
->prev
.type_after_w1
!= UNKNOWN_BT
1363 /* if type_after_w1 is NEUTRAL_B, this NSM is at sor */
1364 && bidi_it
->prev
.type_after_w1
!= NEUTRAL_B
)
1365 type
= bidi_it
->prev
.type_after_w1
;
1366 else if (bidi_it
->sor
== R2L
)
1368 else if (bidi_it
->sor
== L2R
)
1370 else /* shouldn't happen! */
1373 if (type
== WEAK_EN
/* W2 */
1374 && bidi_it
->last_strong
.type_after_w1
== STRONG_AL
)
1376 else if (type
== STRONG_AL
) /* W3 */
1378 else if ((type
== WEAK_ES
/* W4 */
1379 && bidi_it
->prev
.type_after_w1
== WEAK_EN
1380 && bidi_it
->prev
.orig_type
== WEAK_EN
)
1382 && ((bidi_it
->prev
.type_after_w1
== WEAK_EN
1383 && bidi_it
->prev
.orig_type
== WEAK_EN
)
1384 || bidi_it
->prev
.type_after_w1
== WEAK_AN
)))
1387 bidi_it
->bytepos
+ bidi_it
->ch_len
>= ZV_BYTE
1388 ? BIDI_EOB
: FETCH_CHAR (bidi_it
->bytepos
+ bidi_it
->ch_len
);
1389 type_of_next
= bidi_get_type (next_char
, override
);
1391 if (type_of_next
== WEAK_BN
1392 || bidi_explicit_dir_char (next_char
))
1394 bidi_copy_it (&saved_it
, bidi_it
);
1395 while (bidi_resolve_explicit (bidi_it
) == new_level
1396 && bidi_it
->type
== WEAK_BN
)
1398 type_of_next
= bidi_it
->type
;
1399 bidi_copy_it (bidi_it
, &saved_it
);
1402 /* If the next character is EN, but the last strong-type
1403 character is AL, that next EN will be changed to AN when
1404 we process it in W2 above. So in that case, this ES
1405 should not be changed into EN. */
1407 && type_of_next
== WEAK_EN
1408 && bidi_it
->last_strong
.type_after_w1
!= STRONG_AL
)
1410 else if (type
== WEAK_CS
)
1412 if (bidi_it
->prev
.type_after_w1
== WEAK_AN
1413 && (type_of_next
== WEAK_AN
1414 /* If the next character is EN, but the last
1415 strong-type character is AL, EN will be later
1416 changed to AN when we process it in W2 above.
1417 So in that case, this ES should not be
1419 || (type_of_next
== WEAK_EN
1420 && bidi_it
->last_strong
.type_after_w1
== STRONG_AL
)))
1422 else if (bidi_it
->prev
.type_after_w1
== WEAK_EN
1423 && type_of_next
== WEAK_EN
1424 && bidi_it
->last_strong
.type_after_w1
!= STRONG_AL
)
1428 else if (type
== WEAK_ET
/* W5: ET with EN before or after it */
1429 || type
== WEAK_BN
) /* W5/Retaining */
1431 if (bidi_it
->prev
.type_after_w1
== WEAK_EN
/* ET/BN w/EN before it */
1432 || bidi_it
->next_en_pos
> bidi_it
->charpos
)
1434 else /* W5: ET/BN with EN after it. */
1436 EMACS_INT en_pos
= bidi_it
->charpos
+ 1;
1439 bidi_it
->bytepos
+ bidi_it
->ch_len
>= ZV_BYTE
1440 ? BIDI_EOB
: FETCH_CHAR (bidi_it
->bytepos
+ bidi_it
->ch_len
);
1441 type_of_next
= bidi_get_type (next_char
, override
);
1443 if (type_of_next
== WEAK_ET
1444 || type_of_next
== WEAK_BN
1445 || bidi_explicit_dir_char (next_char
))
1447 bidi_copy_it (&saved_it
, bidi_it
);
1448 while (bidi_resolve_explicit (bidi_it
) == new_level
1449 && (bidi_it
->type
== WEAK_BN
1450 || bidi_it
->type
== WEAK_ET
))
1452 type_of_next
= bidi_it
->type
;
1453 en_pos
= bidi_it
->charpos
;
1454 bidi_copy_it (bidi_it
, &saved_it
);
1456 if (type_of_next
== WEAK_EN
)
1458 /* If the last strong character is AL, the EN we've
1459 found will become AN when we get to it (W2). */
1460 if (bidi_it
->last_strong
.type_after_w1
!= STRONG_AL
)
1463 /* Remember this EN position, to speed up processing
1465 bidi_it
->next_en_pos
= en_pos
;
1467 else if (type
== WEAK_BN
)
1468 type
= NEUTRAL_ON
; /* W6/Retaining */
1474 if (type
== WEAK_ES
|| type
== WEAK_ET
|| type
== WEAK_CS
/* W6 */
1476 && (bidi_it
->prev
.type_after_w1
== WEAK_CS
/* W6/Retaining */
1477 || bidi_it
->prev
.type_after_w1
== WEAK_ES
1478 || bidi_it
->prev
.type_after_w1
== WEAK_ET
)))
1481 /* Store the type we've got so far, before we clobber it with strong
1482 types in W7 and while resolving neutral types. But leave alone
1483 the original types that were recorded above, because we will need
1484 them for the L1 clause. */
1485 if (bidi_it
->type_after_w1
== UNKNOWN_BT
)
1486 bidi_it
->type_after_w1
= type
;
1487 bidi_check_type (bidi_it
->type_after_w1
);
1489 if (type
== WEAK_EN
) /* W7 */
1491 if ((bidi_it
->last_strong
.type_after_w1
== STRONG_L
)
1492 || (bidi_it
->last_strong
.type
== UNKNOWN_BT
&& bidi_it
->sor
== L2R
))
1496 bidi_it
->type
= type
;
1497 bidi_check_type (bidi_it
->type
);
1502 bidi_resolve_neutral (struct bidi_it
*bidi_it
)
1504 int prev_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
1505 bidi_type_t type
= bidi_resolve_weak (bidi_it
);
1506 int current_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
1508 if (!(type
== STRONG_R
1513 || type
== NEUTRAL_B
1514 || type
== NEUTRAL_S
1515 || type
== NEUTRAL_WS
1516 || type
== NEUTRAL_ON
))
1519 if (bidi_get_category (type
) == NEUTRAL
1520 || (type
== WEAK_BN
&& prev_level
== current_level
))
1522 if (bidi_it
->next_for_neutral
.type
!= UNKNOWN_BT
)
1523 type
= bidi_resolve_neutral_1 (bidi_it
->prev_for_neutral
.type
,
1524 bidi_it
->next_for_neutral
.type
,
1528 /* Arrrgh!! The UAX#9 algorithm is too deeply entrenched in
1529 the assumption of batch-style processing; see clauses W4,
1530 W5, and especially N1, which require to look far forward
1531 (as well as back) in the buffer. May the fleas of a
1532 thousand camels infest the armpits of those who design
1533 supposedly general-purpose algorithms by looking at their
1534 own implementations, and fail to consider other possible
1536 struct bidi_it saved_it
;
1537 bidi_type_t next_type
;
1539 if (bidi_it
->scan_dir
== -1)
1542 bidi_copy_it (&saved_it
, bidi_it
);
1543 /* Scan the text forward until we find the first non-neutral
1544 character, and then use that to resolve the neutral we
1545 are dealing with now. We also cache the scanned iterator
1546 states, to salvage some of the effort later. */
1547 bidi_cache_iterator_state (bidi_it
, 0);
1549 /* Record the info about the previous character, so that
1550 it will be cached below with this state. */
1551 if (bidi_it
->type_after_w1
!= WEAK_BN
/* W1/Retaining */
1552 && bidi_it
->type
!= WEAK_BN
)
1553 bidi_remember_char (&bidi_it
->prev
, bidi_it
);
1554 type
= bidi_resolve_weak (bidi_it
);
1555 /* Paragraph separators have their levels fully resolved
1556 at this point, so cache them as resolved. */
1557 bidi_cache_iterator_state (bidi_it
, type
== NEUTRAL_B
);
1558 /* FIXME: implement L1 here, by testing for a newline and
1559 resetting the level for any sequence of whitespace
1560 characters adjacent to it. */
1561 } while (!(type
== NEUTRAL_B
1563 && bidi_get_category (type
) != NEUTRAL
)
1564 /* This is all per level run, so stop when we
1565 reach the end of this level run. */
1566 || bidi_it
->level_stack
[bidi_it
->stack_idx
].level
!=
1569 bidi_remember_char (&saved_it
.next_for_neutral
, bidi_it
);
1580 /* N1: ``European and Arabic numbers are treated as
1581 though they were R.'' */
1582 next_type
= STRONG_R
;
1583 saved_it
.next_for_neutral
.type
= STRONG_R
;
1586 if (!bidi_explicit_dir_char (bidi_it
->ch
))
1587 abort (); /* can't happen: BNs are skipped */
1590 /* Marched all the way to the end of this level run.
1591 We need to use the eor type, whose information is
1592 stored by bidi_set_sor_type in the prev_for_neutral
1594 if (saved_it
.type
!= WEAK_BN
1595 || bidi_get_category (bidi_it
->prev
.type_after_w1
) == NEUTRAL
)
1597 next_type
= bidi_it
->prev_for_neutral
.type
;
1598 saved_it
.next_for_neutral
.type
= next_type
;
1599 bidi_check_type (next_type
);
1603 /* This is a BN which does not adjoin neutrals.
1604 Leave its type alone. */
1605 bidi_copy_it (bidi_it
, &saved_it
);
1606 return bidi_it
->type
;
1612 type
= bidi_resolve_neutral_1 (saved_it
.prev_for_neutral
.type
,
1613 next_type
, current_level
);
1614 saved_it
.type
= type
;
1615 bidi_check_type (type
);
1616 bidi_copy_it (bidi_it
, &saved_it
);
1622 /* Given an iterator state in BIDI_IT, advance one character position
1623 in the buffer to the next character (in the logical order), resolve
1624 the bidi type of that next character, and return that type. */
1626 bidi_type_of_next_char (struct bidi_it
*bidi_it
)
1630 /* This should always be called during a forward scan. */
1631 if (bidi_it
->scan_dir
!= 1)
1634 /* Reset the limit until which to ignore BNs if we step out of the
1635 area where we found only empty levels. */
1636 if ((bidi_it
->ignore_bn_limit
> 0
1637 && bidi_it
->ignore_bn_limit
<= bidi_it
->charpos
)
1638 || (bidi_it
->ignore_bn_limit
== -1
1639 && !bidi_explicit_dir_char (bidi_it
->ch
)))
1640 bidi_it
->ignore_bn_limit
= 0;
1642 type
= bidi_resolve_neutral (bidi_it
);
1647 /* Given an iterator state BIDI_IT, advance one character position in
1648 the buffer to the next character (in the logical order), resolve
1649 the embedding and implicit levels of that next character, and
1650 return the resulting level. */
1652 bidi_level_of_next_char (struct bidi_it
*bidi_it
)
1655 int level
, prev_level
= -1;
1656 struct bidi_saved_info next_for_neutral
;
1658 if (bidi_it
->scan_dir
== 1)
1660 /* There's no sense in trying to advance if we hit end of text. */
1661 if (bidi_it
->bytepos
>= ZV_BYTE
)
1662 return bidi_it
->resolved_level
;
1664 /* Record the info about the previous character. */
1665 if (bidi_it
->type_after_w1
!= WEAK_BN
/* W1/Retaining */
1666 && bidi_it
->type
!= WEAK_BN
)
1667 bidi_remember_char (&bidi_it
->prev
, bidi_it
);
1668 if (bidi_it
->type_after_w1
== STRONG_R
1669 || bidi_it
->type_after_w1
== STRONG_L
1670 || bidi_it
->type_after_w1
== STRONG_AL
)
1671 bidi_remember_char (&bidi_it
->last_strong
, bidi_it
);
1672 /* FIXME: it sounds like we don't need both prev and
1673 prev_for_neutral members, but I'm leaving them both for now. */
1674 if (bidi_it
->type
== STRONG_R
|| bidi_it
->type
== STRONG_L
1675 || bidi_it
->type
== WEAK_EN
|| bidi_it
->type
== WEAK_AN
)
1676 bidi_remember_char (&bidi_it
->prev_for_neutral
, bidi_it
);
1678 /* If we overstepped the characters used for resolving neutrals
1679 and whitespace, invalidate their info in the iterator. */
1680 if (bidi_it
->charpos
>= bidi_it
->next_for_neutral
.charpos
)
1681 bidi_it
->next_for_neutral
.type
= UNKNOWN_BT
;
1682 if (bidi_it
->next_en_pos
>= 0
1683 && bidi_it
->charpos
>= bidi_it
->next_en_pos
)
1684 bidi_it
->next_en_pos
= -1;
1685 if (bidi_it
->next_for_ws
.type
!= UNKNOWN_BT
1686 && bidi_it
->charpos
>= bidi_it
->next_for_ws
.charpos
)
1687 bidi_it
->next_for_ws
.type
= UNKNOWN_BT
;
1689 /* This must be taken before we fill the iterator with the info
1690 about the next char. If we scan backwards, the iterator
1691 state must be already cached, so there's no need to know the
1692 embedding level of the previous character, since we will be
1693 returning to our caller shortly. */
1694 prev_level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
1696 next_for_neutral
= bidi_it
->next_for_neutral
;
1698 /* Perhaps it is already cached. */
1699 type
= bidi_cache_find (bidi_it
->charpos
+ bidi_it
->scan_dir
, -1, bidi_it
);
1700 if (type
!= UNKNOWN_BT
)
1702 /* Don't lose the information for resolving neutrals! The
1703 cached states could have been cached before their
1704 next_for_neutral member was computed. If we are on our way
1705 forward, we can simply take the info from the previous
1707 if (bidi_it
->scan_dir
== 1
1708 && bidi_it
->next_for_neutral
.type
== UNKNOWN_BT
)
1709 bidi_it
->next_for_neutral
= next_for_neutral
;
1711 /* If resolved_level is -1, it means this state was cached
1712 before it was completely resolved, so we cannot return
1714 if (bidi_it
->resolved_level
!= -1)
1715 return bidi_it
->resolved_level
;
1717 if (bidi_it
->scan_dir
== -1)
1718 /* If we are going backwards, the iterator state is already cached
1719 from previous scans, and should be fully resolved. */
1722 if (type
== UNKNOWN_BT
)
1723 type
= bidi_type_of_next_char (bidi_it
);
1725 if (type
== NEUTRAL_B
)
1726 return bidi_it
->resolved_level
;
1728 level
= bidi_it
->level_stack
[bidi_it
->stack_idx
].level
;
1729 if ((bidi_get_category (type
) == NEUTRAL
/* && type != NEUTRAL_B */)
1730 || (type
== WEAK_BN
&& prev_level
== level
))
1732 if (bidi_it
->next_for_neutral
.type
== UNKNOWN_BT
)
1735 /* If the cached state shows a neutral character, it was not
1736 resolved by bidi_resolve_neutral, so do it now. */
1737 type
= bidi_resolve_neutral_1 (bidi_it
->prev_for_neutral
.type
,
1738 bidi_it
->next_for_neutral
.type
,
1742 if (!(type
== STRONG_R
1746 || type
== WEAK_AN
))
1748 bidi_it
->type
= type
;
1749 bidi_check_type (bidi_it
->type
);
1751 /* For L1 below, we need to know, for each WS character, whether
1752 it belongs to a sequence of WS characters preceeding a newline
1753 or a TAB or a paragraph separator. */
1754 if (bidi_it
->orig_type
== NEUTRAL_WS
1755 && bidi_it
->next_for_ws
.type
== UNKNOWN_BT
)
1758 int clen
= bidi_it
->ch_len
;
1759 EMACS_INT bpos
= bidi_it
->bytepos
;
1760 EMACS_INT cpos
= bidi_it
->charpos
;
1764 /*_fetch_multibyte_char_len = 1;*/
1765 ch
= bpos
+ clen
>= ZV_BYTE
? BIDI_EOB
: FETCH_CHAR (bpos
+ clen
);
1768 clen
= (ch
== BIDI_EOB
? 1 : CHAR_BYTES (ch
));
1769 if (ch
== '\n' || ch
== BIDI_EOB
/* || ch == LINESEP_CHAR */)
1772 chtype
= bidi_get_type (ch
, NEUTRAL_DIR
);
1773 } while (chtype
== NEUTRAL_WS
|| chtype
== WEAK_BN
1774 || bidi_explicit_dir_char (ch
)); /* L1/Retaining */
1775 bidi_it
->next_for_ws
.type
= chtype
;
1776 bidi_check_type (bidi_it
->next_for_ws
.type
);
1777 bidi_it
->next_for_ws
.charpos
= cpos
;
1778 bidi_it
->next_for_ws
.bytepos
= bpos
;
1781 /* Resolve implicit levels, with a twist: PDFs get the embedding
1782 level of the enbedding they terminate. See below for the
1784 if (bidi_it
->orig_type
== PDF
1785 /* Don't do this if this formatting code didn't change the
1786 embedding level due to invalid or empty embeddings. */
1787 && prev_level
!= level
)
1789 /* Don't look in UAX#9 for the reason for this: it's our own
1790 private quirk. The reason is that we want the formatting
1791 codes to be delivered so that they bracket the text of their
1792 embedding. For example, given the text
1796 we want it to be displayed as
1804 which will result because we bump up the embedding level as
1805 soon as we see the RLO and pop it as soon as we see the PDF,
1806 so RLO itself has the same embedding level as "teST", and
1807 thus would be normally delivered last, just before the PDF.
1808 The switch below fiddles with the level of PDF so that this
1809 ugly side effect does not happen.
1811 (This is, of course, only important if the formatting codes
1812 are actually displayed, but Emacs does need to display them
1813 if the user wants to.) */
1816 else if (bidi_it
->orig_type
== NEUTRAL_B
/* L1 */
1817 || bidi_it
->orig_type
== NEUTRAL_S
1818 || bidi_it
->ch
== '\n' || bidi_it
->ch
== BIDI_EOB
1819 /* || bidi_it->ch == LINESEP_CHAR */
1820 || (bidi_it
->orig_type
== NEUTRAL_WS
1821 && (bidi_it
->next_for_ws
.type
== NEUTRAL_B
1822 || bidi_it
->next_for_ws
.type
== NEUTRAL_S
)))
1823 level
= bidi_it
->level_stack
[0].level
;
1824 else if ((level
& 1) == 0) /* I1 */
1826 if (type
== STRONG_R
)
1828 else if (type
== WEAK_EN
|| type
== WEAK_AN
)
1833 if (type
== STRONG_L
|| type
== WEAK_EN
|| type
== WEAK_AN
)
1837 bidi_it
->resolved_level
= level
;
1841 /* Move to the other edge of a level given by LEVEL. If END_FLAG is
1842 non-zero, we are at the end of a level, and we need to prepare to
1843 resume the scan of the lower level.
1845 If this level's other edge is cached, we simply jump to it, filling
1846 the iterator structure with the iterator state on the other edge.
1847 Otherwise, we walk the buffer until we come back to the same level
1850 Note: we are not talking here about a ``level run'' in the UAX#9
1851 sense of the term, but rather about a ``level'' which includes
1852 all the levels higher than it. In other words, given the levels
1855 11111112222222333333334443343222222111111112223322111
1858 and assuming we are at point A scanning left to right, this
1859 function moves to point C, whereas the UAX#9 ``level 2 run'' ends
1862 bidi_find_other_level_edge (struct bidi_it
*bidi_it
, int level
, int end_flag
)
1864 int dir
= end_flag
? -bidi_it
->scan_dir
: bidi_it
->scan_dir
;
1867 /* Try the cache first. */
1868 if ((idx
= bidi_cache_find_level_change (level
, dir
, end_flag
)) >= 0)
1869 bidi_cache_fetch_state (idx
, bidi_it
);
1875 abort (); /* if we are at end of level, its edges must be cached */
1877 bidi_cache_iterator_state (bidi_it
, 1);
1879 new_level
= bidi_level_of_next_char (bidi_it
);
1880 bidi_cache_iterator_state (bidi_it
, 1);
1881 } while (new_level
>= level
);
1886 bidi_get_next_char_visually (struct bidi_it
*bidi_it
)
1888 int old_level
, new_level
, next_level
;
1889 struct bidi_it sentinel
;
1891 if (bidi_it
->scan_dir
== 0)
1893 bidi_it
->scan_dir
= 1; /* default to logical order */
1896 /* If we just passed a newline, initialize for the next line. */
1897 if (!bidi_it
->first_elt
&& bidi_it
->orig_type
== NEUTRAL_B
)
1898 bidi_line_init (bidi_it
);
1900 /* Prepare the sentinel iterator state. */
1901 if (bidi_cache_idx
== 0)
1903 bidi_copy_it (&sentinel
, bidi_it
);
1904 if (bidi_it
->first_elt
)
1906 sentinel
.charpos
--; /* cached charpos needs to be monotonic */
1908 sentinel
.ch
= '\n'; /* doesn't matter, but why not? */
1909 sentinel
.ch_len
= 1;
1913 old_level
= bidi_it
->resolved_level
;
1914 new_level
= bidi_level_of_next_char (bidi_it
);
1916 /* Reordering of resolved levels (clause L2) is implemented by
1917 jumping to the other edge of the level and flipping direction of
1918 scanning the text whenever we find a level change. */
1919 if (new_level
!= old_level
)
1921 int ascending
= new_level
> old_level
;
1922 int level_to_search
= ascending
? old_level
+ 1 : old_level
;
1923 int incr
= ascending
? 1 : -1;
1924 int expected_next_level
= old_level
+ incr
;
1926 /* If we don't have anything cached yet, we need to cache the
1927 sentinel state, since we'll need it to record where to jump
1928 when the last non-base level is exhausted. */
1929 if (bidi_cache_idx
== 0)
1930 bidi_cache_iterator_state (&sentinel
, 1);
1931 /* Jump (or walk) to the other edge of this level. */
1932 bidi_find_other_level_edge (bidi_it
, level_to_search
, !ascending
);
1933 /* Switch scan direction and peek at the next character in the
1935 bidi_it
->scan_dir
= -bidi_it
->scan_dir
;
1937 /* The following loop handles the case where the resolved level
1938 jumps by more than one. This is typical for numbers inside a
1939 run of text with left-to-right embedding direction, but can
1940 also happen in other situations. In those cases the decision
1941 where to continue after a level change, and in what direction,
1942 is tricky. For example, given a text like below:
1947 (where the numbers below the text show the resolved levels),
1948 the result of reordering according to UAX#9 should be this:
1952 This is implemented by the loop below which flips direction
1953 and jumps to the other edge of the level each time it finds
1954 the new level not to be the expected one. The expected level
1955 is always one more or one less than the previous one. */
1956 next_level
= bidi_peek_at_next_level (bidi_it
);
1957 while (next_level
!= expected_next_level
)
1959 expected_next_level
+= incr
;
1960 level_to_search
+= incr
;
1961 bidi_find_other_level_edge (bidi_it
, level_to_search
, !ascending
);
1962 bidi_it
->scan_dir
= -bidi_it
->scan_dir
;
1963 next_level
= bidi_peek_at_next_level (bidi_it
);
1966 /* Finally, deliver the next character in the new direction. */
1967 next_level
= bidi_level_of_next_char (bidi_it
);
1970 /* Take note when we have just processed the newline that precedes
1971 the end of the paragraph. The next time we are about to be
1972 called, set_iterator_to_next will automatically reinit the
1973 paragraph direction, if needed. We do this at the newline before
1974 the paragraph separator, because the next character might not be
1975 the first character of the next paragraph, due to the bidi
1976 reordering, whereas we _must_ know the paragraph base direction
1977 _before_ we process the paragraph's text, since the base
1978 direction affects the reordering. */
1979 if (bidi_it
->scan_dir
== 1
1980 && bidi_it
->orig_type
== NEUTRAL_B
1981 && bidi_it
->bytepos
< ZV_BYTE
)
1984 bidi_at_paragraph_end (bidi_it
->charpos
+ 1,
1985 bidi_it
->bytepos
+ bidi_it
->ch_len
);
1988 bidi_it
->new_paragraph
= 1;
1989 /* Record the buffer position of the last character of the
1990 paragraph separator. */
1991 bidi_it
->separator_limit
= bidi_it
->charpos
+ 1 + sep_len
;
1995 if (bidi_it
->scan_dir
== 1 && bidi_cache_idx
)
1997 /* If we are at paragraph's base embedding level and beyond the
1998 last cached position, the cache's job is done and we can
2000 if (bidi_it
->resolved_level
== bidi_it
->level_stack
[0].level
2001 && bidi_it
->charpos
> bidi_cache
[bidi_cache_idx
- 1].charpos
)
2002 bidi_cache_reset ();
2003 /* But as long as we are caching during forward scan, we must
2004 cache each state, or else the cache integrity will be
2005 compromised: it assumes cached states correspond to buffer
2008 bidi_cache_iterator_state (bidi_it
, 1);
2012 /* This is meant to be called from within the debugger, whenever you
2013 wish to examine the cache contents. */
2015 bidi_dump_cached_states (void)
2020 if (bidi_cache_idx
== 0)
2022 fprintf (stderr
, "The cache is empty.\n");
2025 fprintf (stderr
, "Total of %d state%s in cache:\n",
2026 bidi_cache_idx
, bidi_cache_idx
== 1 ? "" : "s");
2028 for (i
= bidi_cache
[bidi_cache_idx
- 1].charpos
; i
> 0; i
/= 10)
2030 fputs ("ch ", stderr
);
2031 for (i
= 0; i
< bidi_cache_idx
; i
++)
2032 fprintf (stderr
, "%*c", ndigits
, bidi_cache
[i
].ch
);
2033 fputs ("\n", stderr
);
2034 fputs ("lvl ", stderr
);
2035 for (i
= 0; i
< bidi_cache_idx
; i
++)
2036 fprintf (stderr
, "%*d", ndigits
, bidi_cache
[i
].resolved_level
);
2037 fputs ("\n", stderr
);
2038 fputs ("pos ", stderr
);
2039 for (i
= 0; i
< bidi_cache_idx
; i
++)
2040 fprintf (stderr
, "%*d", ndigits
, bidi_cache
[i
].charpos
);
2041 fputs ("\n", stderr
);