# HG changeset patch # User Eli Zaretskii # Date 1262344008 18000 # Node ID a551e4109c043b02cfe66108c83c170dcec74b06 # Parent e6df672626c1faf09a6979c59bbf05a7eb5dd045 Retrospective commit from 2009-10-03. Detect paragraph start and paragraph separators (part of determining paragraph base direction). (The commit of window.h belongs to the original changes by Kenichi Handa, it was forgotten earlier.) bidi.c (bidi_set_paragraph_end): Don't set the new_paragraph flag in the iterator. (bidi_init_it): Set the new_paragraph flag. (bidi_at_paragraph_end): Arguments are now character and byte position of the next character. All callers changed. (bidi_resolve_explicit): Don't call bidi_at_paragraph_end, and don't behave as if at paragraph end if it returns true. (bidi_get_next_char_visually): Don't call bidi_paragraph_init if new_paragraph flag is set. Set new_paragraph flag when at end of a paragraph. <fallback_paragraph_start_re, fallback_paragraph_separate_re>: New variables. <Qparagraph_start, Qparagraph_separate>: New variables. (bidi_initialize): Initialize and staticpro them. dispextern.h <struct bidi_it>: New element paragraph_dir. Make positional elements EMACS_INT. bidi.c <bidi_overriding_paragraph_direction>: Delete. diff -r e6df672626c1 -r a551e4109c04 src/ChangeLog.bidi --- a/src/ChangeLog.bidi Fri Jan 01 06:01:34 2010 -0500 +++ b/src/ChangeLog.bidi Fri Jan 01 06:06:48 2010 -0500 @@ -1,3 +1,35 @@ +2009-10-03 Eli Zaretskii + + * bidi.c (bidi_set_paragraph_end): Don't set the new_paragraph + flag in the iterator. + (bidi_init_it): Set the new_paragraph flag. + (bidi_at_paragraph_end): Arguments are now character and byte + position of the next character. All callers changed. + (bidi_resolve_explicit): Don't call bidi_at_paragraph_end, and + don't behave as if at paragraph end if it returns true. + (bidi_get_next_char_visually): Don't call bidi_paragraph_init if + new_paragraph flag is set. Set new_paragraph flag when at end of + a paragraph. + <fallback_paragraph_start_re, fallback_paragraph_separate_re>: New + variables. + <Qparagraph_start, Qparagraph_separate>: New variables. + (bidi_initialize): Initialize and staticpro them. + + * dispextern.h <struct bidi_it>: New element paragraph_dir. 
Make + positional elements EMACS_INT. + + * bidi.c <bidi_overriding_paragraph_direction>: Delete. + +2009-09-28 Eli Zaretskii + + * bidi.c (bidi_init_it): Initialize charpos, bytepos, and + first_elt before calling bidi_set_paragraph_end. + (bidi_resolve_explicit): Don't call bidi_set_paragraph_end at + EOB. + (bidi_at_paragraph_end): Don't set new_paragraph flag at EOB. + (bidi_get_type): Accept an additional argument OVERRIDE, per UAX#9 + "Explicit Overrides". All callers changed. + 2009-09-27 Eli Zaretskii * xdisp.c (next_element_from_buffer): If called not at line @@ -129,8 +161,7 @@ 2004-03-04 Kenichi Handa - The following changes are to support for bi-directional text - display. + The following changes are to support bidirectional text display. * Makefile.in (obj): Include bidi.o. (bidi.o): New target. diff -r e6df672626c1 -r a551e4109c04 src/bidi.c --- a/src/bidi.c Fri Jan 01 06:01:34 2010 -0500 +++ b/src/bidi.c Fri Jan 01 06:06:48 2010 -0500 @@ -96,8 +96,8 @@ int bidi_ignore_explicit_marks_for_paragraph_level = 1; -/* FIXME: Should be user-definable. */ -bidi_dir_t bidi_overriding_paragraph_direction = L2R; +static Lisp_Object fallback_paragraph_start_re, fallback_paragraph_separate_re; +static Lisp_Object Qparagraph_start, Qparagraph_separate; static void bidi_initialize () @@ -392,18 +392,67 @@ char_table_set_range (bidi_type_table, bidi_type[i].from, bidi_type[i].to ? 
bidi_type[i].to : bidi_type[i].from, make_number (bidi_type[i].type)); + + fallback_paragraph_start_re = + XSYMBOL (Fintern_soft (build_string ("paragraph-start"), Qnil))->value; + if (!STRINGP (fallback_paragraph_start_re)) + fallback_paragraph_start_re = build_string ("\f\\|[ \t]*$"); + staticpro (&fallback_paragraph_start_re); + Qparagraph_start = intern ("paragraph-start"); + staticpro (&Qparagraph_start); + fallback_paragraph_separate_re = + XSYMBOL (Fintern_soft (build_string ("paragraph-separate"), Qnil))->value; + if (!STRINGP (fallback_paragraph_separate_re)) + fallback_paragraph_separate_re = build_string ("[ \t\f]*$"); + staticpro (&fallback_paragraph_separate_re); + Qparagraph_separate = intern ("paragraph-separate"); + staticpro (&Qparagraph_separate); bidi_initialized = 1; } -/* Return the bidi type of a character CH. */ +/* Return the bidi type of a character CH, subject to the current + directional OVERRIDE. */ bidi_type_t -bidi_get_type (int ch) +bidi_get_type (int ch, bidi_dir_t override) { + bidi_type_t default_type; + if (ch == BIDI_EOB) return NEUTRAL_B; if (ch < 0 || ch > MAX_CHAR) abort (); - return (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch)); + + default_type = (bidi_type_t) XINT (CHAR_TABLE_REF (bidi_type_table, ch)); + + if (override == NEUTRAL_DIR) + return default_type; + + switch (default_type) + { + /* Although UAX#9 does not tell, it doesn't make sense to + override NEUTRAL_B and LRM/RLM characters. 
*/ + case NEUTRAL_B: + case LRE: + case LRO: + case RLE: + case RLO: + case PDF: + return default_type; + default: + switch (ch) + { + case LRM_CHAR: + case RLM_CHAR: + return default_type; + default: + if (override == L2R) /* X6 */ + return STRONG_L; + else if (override == R2L) + return STRONG_R; + else + abort (); /* can't happen: handled above */ + } + } } void @@ -684,21 +733,17 @@ return bidi_cache[bidi_cache_last_idx + bidi_it->scan_dir].resolved_level; } -/* Return non-zero if buffer's byte position POS is the last character - of a paragraph. THIS_CH is the character preceding the one at POS in - the buffer. */ +/* Return non-zero if buffer's byte position POS is the end of a + paragraph. */ int -bidi_at_paragraph_end (int this_ch, int pos) +bidi_at_paragraph_end (EMACS_INT charpos, EMACS_INT bytepos) { - int next_ch; + Lisp_Object re = XSYMBOL (Qparagraph_separate)->value; - if (pos >= ZV_BYTE) - return 1; + if (!STRINGP (re)) + re = fallback_paragraph_separate_re; - next_ch = FETCH_CHAR (pos); - /* FIXME: This should support all Unicode characters that can end a - paragraph. */ - return (this_ch == '\n' && next_ch == '\n'); + return fast_looking_at (re, charpos, bytepos, ZV, ZV_BYTE, Qnil) > 0; } /* Determine the start-of-run (sor) directional type given the two @@ -734,30 +779,58 @@ bidi_it->ignore_bn_limit = 0; /* meaning it's unknown */ } +/* Find the beginning of this paragraph by looking back in the + buffer. 
*/ +static void +bidi_find_paragraph_start (struct bidi_it *bidi_it) +{ + Lisp_Object re = XSYMBOL (Qparagraph_start)->value; + EMACS_INT pos = bidi_it->charpos; + EMACS_INT pos_byte = bidi_it->bytepos; + EMACS_INT limit = ZV, limit_byte = ZV_BYTE; + + if (!STRINGP (re)) + re = fallback_paragraph_start_re; + while (pos_byte > BEGV_BYTE + && fast_looking_at (re, pos, pos_byte, limit, limit_byte, Qnil) < 0) + { + find_next_newline_no_quit (pos, -1); + } +} + void bidi_paragraph_init (bidi_dir_t dir, struct bidi_it *bidi_it) { - int bytepos = bidi_it->bytepos; + EMACS_INT bytepos = bidi_it->bytepos; /* We should never be called at EOB or before BEGV. */ if (bytepos >= ZV_BYTE || bytepos < BEGV_BYTE) abort (); - /* We should always be called at the beginning of a new - paragraph. */ - if (!(bytepos == BEGV_BYTE - || FETCH_CHAR (bytepos) == '\n' - || FETCH_CHAR (bytepos - 1) == '\n')) - abort (); - bidi_it->level_stack[0].level = 0; /* default for L2R */ + bidi_it->paragraph_dir = L2R; if (dir == R2L) bidi_it->level_stack[0].level = 1; else if (dir == NEUTRAL_DIR) /* P2 */ { - int ch = FETCH_CHAR (bytepos), ch_len = CHAR_BYTES (ch); - int pos = bidi_it->charpos; - bidi_type_t type = bidi_get_type (ch); + int ch, ch_len; + EMACS_INT pos; + bidi_type_t type; + + /* Search back to where this paragraph starts. */ + bidi_find_paragraph_start (bidi_it); + + /* We should always be at the beginning of a new line at this + point. 
*/ + if (!(bytepos == BEGV_BYTE + || FETCH_CHAR (bytepos) == '\n' + || FETCH_CHAR (bytepos - 1) == '\n')) + abort (); + + ch = FETCH_CHAR (bytepos); + ch_len = CHAR_BYTES (ch); + pos = bidi_it->charpos; + type = bidi_get_type (ch, NEUTRAL_DIR); for (pos++, bytepos += ch_len; /* NOTE: UAX#9 says to search only for L, AL, or R types of @@ -768,15 +841,17 @@ || (bidi_ignore_explicit_marks_for_paragraph_level && (type == RLE || type == RLO || type == LRE || type == LRO)); - type = bidi_get_type (ch)) + type = bidi_get_type (ch, NEUTRAL_DIR)) { - if (type == NEUTRAL_B || bidi_at_paragraph_end (ch, bytepos)) + if (type == NEUTRAL_B || bidi_at_paragraph_end (pos, bytepos)) break; FETCH_CHAR_ADVANCE (ch, pos, bytepos); } if (type == STRONG_R || type == STRONG_AL) /* P3 */ bidi_it->level_stack[0].level = 1; } + if (bidi_it->level_stack[0].level == 1) + bidi_it->paragraph_dir = R2L; bidi_it->scan_dir = 1; /* FIXME: do we need to have control on this? */ bidi_it->resolved_level = bidi_it->level_stack[0].level; bidi_it->level_stack[0].override = NEUTRAL_DIR; /* X1 */ @@ -785,14 +860,14 @@ bidi_it->new_paragraph = 0; bidi_it->next_en_pos = -1; bidi_it->next_for_ws.type = UNKNOWN_BT; - bidi_set_sor_type (bidi_it, bidi_overriding_paragraph_direction, + bidi_set_sor_type (bidi_it, bidi_it->paragraph_dir, bidi_it->level_stack[0].level); /* X10 */ bidi_cache_reset (); } -/* Do whatever UAX#9 clause X8 says should be done at paragraph's end, - and set the new paragraph flag in the iterator. */ +/* Do whatever UAX#9 clause X8 says should be done at paragraph's + end. */ static inline void bidi_set_paragraph_end (struct bidi_it *bidi_it) { @@ -800,19 +875,19 @@ bidi_it->invalid_rl_levels = -1; bidi_it->stack_idx = 0; bidi_it->resolved_level = bidi_it->level_stack[0].level; - bidi_it->new_paragraph = 1; } /* Initialize the bidi iterator from buffer position CHARPOS. 
*/ void -bidi_init_it (int charpos, int bytepos, struct bidi_it *bidi_it) +bidi_init_it (EMACS_INT charpos, EMACS_INT bytepos, struct bidi_it *bidi_it) { if (! bidi_initialized) bidi_initialize (); - bidi_set_paragraph_end (bidi_it); - bidi_it->first_elt = 1; bidi_it->charpos = charpos; bidi_it->bytepos = bytepos; + bidi_it->first_elt = 1; + bidi_set_paragraph_end (bidi_it); + bidi_it->new_paragraph = 1; bidi_it->type = NEUTRAL_B; bidi_it->type_after_w1 = UNKNOWN_BT; bidi_it->orig_type = UNKNOWN_BT; @@ -945,7 +1020,11 @@ } bidi_it->ch = curchar; - type = bidi_get_type (curchar); + /* Don't apply directional override here, as all the types we handle + below will not be affected by the override anyway, and we need + the original type unaltered. The override will be applied in + bidi_resolve_weak. */ + type = bidi_get_type (curchar, NEUTRAL_DIR); bidi_it->orig_type = type; bidi_check_type (bidi_it->orig_type); @@ -1122,17 +1201,15 @@ } } - /* For when the paragraph end is defined by anything other than a - special Unicode character (a.k.a. ``higher protocols''). */ - if (bidi_it->type != NEUTRAL_B) - if (bidi_at_paragraph_end (bidi_it->ch, - bidi_it->bytepos + bidi_it->ch_len)) - bidi_it->type = NEUTRAL_B; - if (bidi_it->type == NEUTRAL_B) /* X8 */ { - bidi_set_paragraph_end (bidi_it); - bidi_it->type_after_w1 = bidi_it->type; /* needed below and in L1 */ + /* End of buffer does _not_ indicate a new paragraph is coming. + Otherwise, each character inserted at EOB will be processed + as starting a new paragraph. */ + if (bidi_it->bytepos < ZV_BYTE) + bidi_set_paragraph_end (bidi_it); + /* This is needed by bidi_resolve_weak below, and in L1. */ + bidi_it->type_after_w1 = bidi_it->type; bidi_check_type (bidi_it->type_after_w1); } @@ -1219,7 +1296,7 @@ next_char = bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE ? 
BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len); - type_of_next = bidi_get_type (next_char); + type_of_next = bidi_get_type (next_char, override); if (type_of_next == WEAK_BN || bidi_explicit_dir_char (next_char)) @@ -1267,12 +1344,12 @@ /* W5: ET with EN after it. */ else { - int en_pos = bidi_it->charpos + 1; + EMACS_INT en_pos = bidi_it->charpos + 1; next_char = bidi_it->bytepos + bidi_it->ch_len >= ZV_BYTE ? BIDI_EOB : FETCH_CHAR (bidi_it->bytepos + bidi_it->ch_len); - type_of_next = bidi_get_type (next_char); + type_of_next = bidi_get_type (next_char, override); if (type_of_next == WEAK_ET || type_of_next == WEAK_BN @@ -1588,8 +1665,8 @@ { int ch; int clen = bidi_it->ch_len; - int bpos = bidi_it->bytepos; - int cpos = bidi_it->charpos; + EMACS_INT bpos = bidi_it->bytepos; + EMACS_INT cpos = bidi_it->charpos; bidi_type_t chtype; do { @@ -1601,7 +1678,7 @@ if (ch == '\n' || ch == BIDI_EOB /* || ch == LINESEP_CHAR */) chtype = NEUTRAL_B; else - chtype = bidi_get_type (ch); + chtype = bidi_get_type (ch, NEUTRAL_DIR); } while (chtype == NEUTRAL_WS || chtype == WEAK_BN || bidi_explicit_dir_char (ch)); /* L1/Retaining */ bidi_it->next_for_ws.type = chtype; @@ -1725,8 +1802,6 @@ bidi_it->scan_dir = 1; /* default to logical order */ } - if (bidi_it->new_paragraph) - bidi_paragraph_init (bidi_overriding_paragraph_direction, bidi_it); /* Prepare the sentinel iterator state. */ if (bidi_cache_idx == 0) { @@ -1799,6 +1874,16 @@ next_level = bidi_level_of_next_char (bidi_it); } + /* Take note when we are at the end of the paragraph. The next time + we are about to be called, next_element_from_buffer will + automatically reinit the paragraph direction, if needed. 
*/ + if (bidi_it->scan_dir == 1 + && bidi_it->type == NEUTRAL_B + && bidi_it->bytepos < ZV_BYTE + && bidi_at_paragraph_end (bidi_it->charpos + 1, + bidi_it->bytepos + bidi_it->ch_len)) + bidi_it->new_paragraph = 1; + if (bidi_it->scan_dir == 1 && bidi_cache_idx) { /* If we are at paragraph's base embedding level and beyond the diff -r e6df672626c1 -r a551e4109c04 src/dispextern.h --- a/src/dispextern.h Fri Jan 01 06:01:34 2010 -0500 +++ b/src/dispextern.h Fri Jan 01 06:06:48 2010 -0500 @@ -1761,8 +1761,8 @@ /* Data type for iterating over bidi text. */ struct bidi_it { int first_elt; /* if non-zero, examine current char first */ - int bytepos; /* iterator's position in buffer */ - int charpos; + EMACS_INT bytepos; /* iterator's position in buffer */ + EMACS_INT charpos; int ch; /* character itself */ int ch_len; /* length of its multibyte sequence */ bidi_type_t type; /* bidi type of this character, after @@ -1773,14 +1773,15 @@ int invalid_levels; /* how many PDFs to ignore */ int invalid_rl_levels; /* how many PDFs from RLE/RLO to ignore */ int new_paragraph; /* if non-zero, a new paragraph begins here */ + bidi_dir_t paragraph_dir; /* current paragraph direction */ int prev_was_pdf; /* if non-zero, previous char was PDF */ struct bidi_saved_info prev; /* info about previous character */ struct bidi_saved_info last_strong; /* last-seen strong directional char */ struct bidi_saved_info next_for_neutral; /* surrounding characters for... 
*/ struct bidi_saved_info prev_for_neutral; /* ...resolving neutrals */ struct bidi_saved_info next_for_ws; /* character after sequence of ws */ - int next_en_pos; /* position of next EN char for ET */ - int ignore_bn_limit; /* position until which to ignore BNs */ + EMACS_INT next_en_pos; /* position of next EN char for ET */ + EMACS_INT ignore_bn_limit; /* position until which to ignore BNs */ bidi_dir_t sor; /* direction of start-of-run in effect */ int scan_dir; /* direction of text scan */ int stack_idx; /* index of current data on the stack */ @@ -2798,7 +2799,7 @@ /* Defined in bidi.c */ -extern void bidi_init_it P_ ((int, int, struct bidi_it *)); +extern void bidi_init_it P_ ((EMACS_INT, EMACS_INT, struct bidi_it *)); extern void bidi_get_next_char_visually P_ ((struct bidi_it *)); extern void bidi_paragraph_init P_ ((bidi_dir_t, struct bidi_it *)); extern int bidi_mirror_char P_ ((int)); diff -r e6df672626c1 -r a551e4109c04 src/window.h --- a/src/window.h Fri Jan 01 06:01:34 2010 -0500 +++ b/src/window.h Fri Jan 01 06:06:48 2010 -0500 @@ -117,7 +117,10 @@ /* The buffer displayed in this window */ /* Of the fields vchild, hchild and buffer, only one is non-nil. */ Lisp_Object buffer; - /* A marker pointing to where in the text to start displaying */ + /* A marker pointing to where in the text to start displaying. + BIDI Note: This is the _logical-order_ start, i.e. the smallest + buffer position visible in the window, not necessarily the + character displayed in the top left corner of the window. */ Lisp_Object start; /* A marker pointing to where in the text point is in this window, used only when the window is not selected.