From xemacs-m  Wed Feb 26 19:54:52 1997
Received: from altair.xemacs.org (steve@xemacs.miranova.com [206.190.83.19])
	by xemacs.org (8.8.5/8.8.5) with ESMTP id TAA19200
	for <xemacs-beta@xemacs.org>; Wed, 26 Feb 1997 19:54:46 -0600 (CST)
Received: (from steve@localhost)
	by altair.xemacs.org (8.8.5/8.8.5) id SAA04741;
	Wed, 26 Feb 1997 18:06:28 -0800
Mail-Copies-To: never
To: xemacs-beta@xemacs.org
Subject: Re: regex.c patch
References: <rvwwrv3y0i.fsf@sdnp5.ucsd.edu>
X-Url: http://www.miranova.com/%7Esteve/
X-Face: #!T9!#9s-3o8)*uHlX{Ug[xW7E7Wr!*L46-OxqMu\xz23v|R9q}lH?cRS{rCNe^'[`^sr5"
 f8*@r4ipO6Jl!:Ccq<xoV[Qz2u8<8-+Vwf2gzJ44lf_/y9OaQ`@#Q65{U4/TC)i2`~/M&QI$X>p:9I
 OSS'2{-)-4wBnVeg0S\O4Al@)uC[pD|+
X-Attribution: sb
From: Steven L Baur <steve@miranova.com>
In-Reply-To: David Moore's message of 26 Feb 1997 16:15:09 -0800
Mime-Version: 1.0 (generated by tm-edit 7.105)
Content-Type: text/plain; charset=US-ASCII
Date: 26 Feb 1997 18:06:27 -0800
Message-ID: <m2iv3f0zq4.fsf@altair.xemacs.org>
Lines: 251
X-Mailer: Gnus v5.4.15/XEmacs 20.1

David Moore writes:

> This patch fixes some MULE specific bugs in regex.c and at least one
> non-mule specific bug.  It's not as efficient as possible; some of the
> INC_CHARPTR & charcount_to_bytecount's could probably be combined to do
> the check just once, if someone so wished.

> It fixes all the regexp bugs I've seen reported recently.  And this is
> the first time I've run Gnus under MULE, and that seems to be working
> still.

Yes!!!!  Thank you David!

There's one minor problem with your patch due to the unnatural
circumstances under which regex.o code is generated.  regex.o is built
twice: the first time without -Demacs and without a normal MULE
environment (even when configured for MULE) to build [ce]tags, and the
second time with -Demacs to build XEmacs.  I don't know whether this
should be considered a bug.  At any rate the correction is trivial,
and your patch definitely fixes the recently reported regexp
problems. :-)

Here is the corrected patch:

(If you've already applied David's patch, just cut & paste the code in
the first `#ifndef emacs' block.)

Index: regex.c
===================================================================
RCS file: /usr/local/xemacs/xemacs-20.0/src/regex.c,v
retrieving revision 1.2
diff -u -r1.2 regex.c
--- regex.c	1997/01/23 05:30:14	1.2
+++ regex.c	1997/02/27 01:15:35
@@ -3771,16 +3771,29 @@
 		      regs, size);
 }
 
+#ifndef emacs
+/* Taken from src/lisp.h for the [ce]tags build: byte and char counts convert as identity here. */
+# define bytecount_to_charcount(ptr, len) (len)
+# define charcount_to_bytecount(ptr, len) (len)
+typedef int Charcount;
+#endif
 
 /* Using the compiled pattern in BUFP->buffer, first tries to match the
    virtual concatenation of STRING1 and STRING2, starting first at index
    STARTPOS, then at STARTPOS + 1, and so on.
+
+   With MULE, STARTPOS is a byte position, not a char position.  And the
+   search will increment STARTPOS by the width of the current leading
+   character.
    
    STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
    
    RANGE is how far to scan while trying to match.  RANGE = 0 means try
    only at STARTPOS; in general, the last start tried is STARTPOS +
    RANGE.
+
+   With MULE, RANGE is a byte count, not a char count.  The last
+   start tried is the character starting <= STARTPOS + RANGE.
    
    In REGS, return the indices of the virtual concatenation of STRING1
    and STRING2 that matched the entire BUFP->buffer and its contained
@@ -3813,8 +3826,13 @@
     
   /* Fix up RANGE if it might eventually take us outside
      the virtual concatenation of STRING1 and STRING2.  */
+#if 0
   if (endpos < -1)
     range = -1 - startpos;
+#else
+  if (endpos < 0)
+    range = 0 - startpos;
+#endif
   else if (endpos > total_size)
     range = total_size - startpos;
 
@@ -3862,25 +3880,33 @@
 	  /* whose stupid idea was it anyway to make this
 	     function take two strings to match?? */
 	  int lim = 0;
-	  unsigned char *p;
+	  register CONST unsigned char *d;
 	  int irange = range;
+
 	  if (startpos < size1 && startpos + range >= size1)
 	    lim = range - (size1 - startpos);
 
-	  p = ((unsigned char *)
-	       &(startpos >= size1 ? string2 - size1 : string1)[startpos]);
-	  p--;
+	  d = ((CONST unsigned char *)
+	       (startpos >= size1 ? string2 - size1 : string1) + startpos);
+	  DEC_CHARPTR(d);
 
 	  if (translate)
-	    {
-	      while (range > lim && translate[*p++] != '\n')
-		range--;
-	    }
+#ifdef MULE
+	    while (range > lim && (*d >= 0x80 || translate[*d] != '\n'))
+#else
+	    while (range > lim && translate[*d] != '\n')
+#endif
+	      {
+		INC_CHARPTR(d);
+		range -= charcount_to_bytecount (d, 1);
+	      }
 	  else
-	    {
-	      while (range > lim && *p++ != '\n')
-		range--;
-	    }
+	    while (range > lim && *d != '\n')
+	      {
+		INC_CHARPTR(d);
+		range -= charcount_to_bytecount (d, 1);
+	      }
+
 	  startpos += irange - range;
 	}
 #endif /* REGEX_BEGLINE_CHECK */
@@ -3893,35 +3919,47 @@
 	{
 	  if (range > 0)	/* Searching forwards.  */
 	    {
-	      register CONST char *d;
+	      register CONST unsigned char *d;
 	      register int lim = 0;
 	      int irange = range;
 
               if (startpos < size1 && startpos + range >= size1)
                 lim = range - (size1 - startpos);
 
-	      d = (startpos >= size1 ? string2 - size1 : string1) + startpos;
+	      d = ((CONST unsigned char *)
+		   (startpos >= size1 ? string2 - size1 : string1) + startpos);
    
               /* Written out as an if-else to avoid testing `translate'
                  inside the loop.  */
 	      if (translate)
-                while (range > lim
-                       && !fastmap[(unsigned char)
-				   translate[(unsigned char) *d++]])
-                  range--;
+#ifdef MULE
+                while (range > lim && *d < 0x80 && !fastmap[translate[*d]])
+#else
+                while (range > lim && !fastmap[translate[*d]])
+#endif
+		  {
+		    range -= charcount_to_bytecount (d, 1);
+		    INC_CHARPTR(d);
+		  }
 	      else
-                while (range > lim && !fastmap[(unsigned char) *d++])
-                  range--;
+                while (range > lim && !fastmap[*d])
+		  {
+		    range -= charcount_to_bytecount (d, 1);
+		    INC_CHARPTR(d);
+		  }
 
 	      startpos += irange - range;
 	    }
 	  else				/* Searching backwards.  */
 	    {
-	      register char c = (size1 == 0 || startpos >= size1
-                                 ? string2[startpos - size1] 
-                                 : string1[startpos]);
-
+	      register unsigned char c = (size1 == 0 || startpos >= size1
+					  ? string2[startpos - size1] 
+					  : string1[startpos]);
+#ifdef MULE
+	      if (c < 0x80 && !fastmap[(unsigned char) TRANSLATE (c)])
+#else
 	      if (!fastmap[(unsigned char) TRANSLATE (c)])
+#endif
 		goto advance;
 	    }
 	}
@@ -3951,17 +3989,28 @@
 
     advance:
       if (!range) 
-        break;
-      else if (range > 0) 
-        {
-          range--; 
-          startpos++;
-        }
-      else
-        {
-          range++; 
-          startpos--;
-        }
+	break;
+      else {
+	register CONST unsigned char *d;
+	Charcount d_size;
+
+	d = ((CONST unsigned char *)
+	     (startpos >= size1 ? string2 - size1 : string1) + startpos);
+
+	if (range > 0) 
+	  {
+	    d_size = charcount_to_bytecount (d, 1);
+	    range -= d_size;
+	    startpos += d_size;
+	  }
+	else
+	  {
+	    DEC_CHARPTR(d);
+	    d_size = charcount_to_bytecount (d, 1);
+	    range += d_size;
+	    startpos -= d_size;
+	  }
+      }
     }
   return -1;
 } /* re_search_2 */
@@ -5075,10 +5124,19 @@
                   = *p2 == (unsigned char) endline ? '\n' : p2[2];
 #endif
 
+#if 1
+                /* dmoore@ucsd.edu - emacs 19.34 uses this: */
+
                 if ((re_opcode_t) p1[3] == exactn
-		    && ! ((int) p2[1] * BYTEWIDTH > (int) p1[4]
-			  && (p2[1 + p1[4] / BYTEWIDTH]
-			      & (1 << (p1[4] % BYTEWIDTH)))))
+                    && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
+                          && (p2[2 + p1[5] / BYTEWIDTH]
+                              & (1 << (p1[5] % BYTEWIDTH)))))
+#else
+                if ((re_opcode_t) p1[3] == exactn
+                    && ! ((int) p2[1] * BYTEWIDTH > (int) p1[4]
+                          && (p2[1 + p1[4] / BYTEWIDTH]
+                              & (1 << (p1[4] % BYTEWIDTH)))))
+#endif
                   {
   		    p[-3] = (unsigned char) pop_failure_jump;
                     DEBUG_PRINT3 ("  %c != %c => pop_failure_jump.\n",

-- 
steve@miranova.com baur
Unsolicited commercial e-mail will be billed at $250/message.

