From 99255182b33407b29edee5fc30c4495ee09ccb1f Mon Sep 17 00:00:00 2001
From: Per Hedbor <ph@opera.com>
Date: Fri, 31 May 2013 18:59:18 +0200
Subject: [PATCH] Added min- and max- range-indicators to strings.

Also added flags indicating if a string is lowercase or uppercase.

The min/max is only calculated on demand to avoid any slowdowns.

This makes upper_case(X)/lower_case(X) very fast when X is already in
the requested case ('search' also utilizes the ranges when available,
so search(string,"\0") is very fast if there are no null characters in
the string).

It also speeds up the %s format for get_all_args (at least the second
time a certain string is used) since the function checking for null
characters can now use the fields.

Currently most string operations simply reset the string to unchecked;
as an optimization they could instead copy the ranges/flags as appropriate.

The + operator already does keep track of the flags and ranges.

For wide-strings the min/max value is somewhat less correct, since
it's saved in a single byte. But '0' still works (you get string(0..X)
for strings with 0, and string(1..X) for strings without 0 but with
any other character from the first 255).
---
 src/builtin_functions.c |  29 +++++-
 src/operators.c         |  24 ++++-
 src/stralloc.c          | 214 ++++++++++++++++++++++++++++++++++++++--
 src/stralloc.h          |  64 ++++++++----
 4 files changed, 297 insertions(+), 34 deletions(-)

diff --git a/src/builtin_functions.c b/src/builtin_functions.c
index 3e3aa78a00..28451e5e5e 100644
--- a/src/builtin_functions.c
+++ b/src/builtin_functions.c
@@ -668,8 +668,12 @@ PMOD_EXPORT void f_lower_case(INT32 args)
     pop_n_elems(args-1);
     return;
   }
-  
+
   orig = Pike_sp[-args].u.string;
+
+  if( orig->flags & STRING_IS_LOWERCASE )
+      return;
+
   ret = begin_wide_shared_string(orig->len, orig->size_shift);
 
   MEMCPY(ret->str, orig->str, orig->len << orig->size_shift);
@@ -700,8 +704,10 @@ PMOD_EXPORT void f_lower_case(INT32 args)
 #endif
   }
 
+  ret = end_shared_string(ret);
+  ret->flags |= STRING_IS_LOWERCASE;
   pop_n_elems(args);
-  push_string(end_shared_string(ret));
+  push_string(ret);
 }
 
 /*! @decl string upper_case(string s)
@@ -738,8 +744,13 @@ PMOD_EXPORT void f_upper_case(INT32 args)
     pop_n_elems(args-1);
     return;
   }
-  
+
   orig = Pike_sp[-args].u.string;
+  if( orig->flags & STRING_IS_UPPERCASE )
+  {
+      return;
+  }
+
   ret=begin_wide_shared_string(orig->len,orig->size_shift);
   MEMCPY(ret->str, orig->str, orig->len << orig->size_shift);
 
@@ -800,7 +811,9 @@ PMOD_EXPORT void f_upper_case(INT32 args)
   }
 
   pop_n_elems(args);
-  push_string(end_shared_string(ret));
+  ret = end_shared_string(ret);
+  ret->flags |= STRING_IS_UPPERCASE;
+  push_string(ret);
 }
 
 /*! @decl string random_string(int len)
@@ -954,7 +967,13 @@ PMOD_EXPORT void f_search(INT32 args)
       } else {
 	val = index_shared_string(Pike_sp[1-args].u.string, 0);
       }
-      
+
+      if( !string_range_contains( haystack, val )  )
+      {
+          pop_n_elems(args);
+          push_int( -1 );
+          return;
+      }
       switch(Pike_sp[-args].u.string->size_shift) {
       case 0:
 	{
diff --git a/src/operators.c b/src/operators.c
index 5f0c741df1..1ebcb8cdf4 100644
--- a/src/operators.c
+++ b/src/operators.c
@@ -1555,7 +1555,7 @@ PMOD_EXPORT void f_add(INT32 args)
     PCHARP buf;
     ptrdiff_t tmp;
     int max_shift=0;
-
+    unsigned char tmp_flags, tmp_min, tmp_max;
     if(args==1) return;
 
     size=0;
@@ -1579,16 +1579,32 @@ PMOD_EXPORT void f_add(INT32 args)
     }
 
     tmp=sp[-args].u.string->len;
+    tmp_flags = sp[-args].u.string->flags;
+    tmp_min = sp[-args].u.string->min;
+    tmp_max = sp[-args].u.string->max;
+
     r=new_realloc_shared_string(sp[-args].u.string,size,max_shift);
+
+    r->flags |= tmp_flags & ~15;
+    r->min = tmp_min;
+    r->max = tmp_max;
+
     mark_free_svalue (sp - args);
     buf=MKPCHARP_STR_OFF(r,tmp);
     for(e=-args+1;e<0;e++)
     {
-      pike_string_cpy(buf,sp[e].u.string);
-      INC_PCHARP(buf,sp[e].u.string->len);
+      if( sp[e].u.string->len )
+      {
+        update_flags_for_add( r, sp[e].u.string );
+        pike_string_cpy(buf,sp[e].u.string);
+        INC_PCHARP(buf,sp[e].u.string->len);
+      }
     }
     SET_SVAL(sp[-args], T_STRING, 0, string, low_end_shared_string(r));
-    for(e=-args+1;e<0;e++) free_string(sp[e].u.string);
+
+    for(e=-args+1;e<0;e++)
+      free_string(sp[e].u.string);
+
     sp-=args-1;
 
     break;
diff --git a/src/stralloc.c b/src/stralloc.c
index c4661d57cb..1583439528 100644
--- a/src/stralloc.c
+++ b/src/stralloc.c
@@ -107,6 +107,137 @@ PMOD_EXPORT struct pike_string *empty_pike_string = 0;
 #define low_do_hash(STR,LEN,SHIFT) low_hashmem( (STR), (LEN)<<(SHIFT), HASH_PREFIX<<(SHIFT), hashkey )
 #define do_hash(STR) low_do_hash(STR->str,STR->len,STR->size_shift)
 
+/* Cheap range test: returns false only if n cannot occur in str. */
+PMOD_EXPORT int string_range_contains( struct pike_string *str, int n )
+{
+  INT32 lo, hi;
+  /* Loose lookup: bounds come from the cache or the size shift, no scan. */
+  check_string_range( str, 1, &lo, &hi );
+  /* A value outside [lo, hi] cannot occur in str. */
+  return (lo <= n) && (n <= hi);
+}
+
+/* Returns true if str2 could be in str1, judging by character ranges only.
+ * A return value of 0 means the ranges rule out any occurrence. */
+PMOD_EXPORT int string_range_contains_string( struct pike_string *str1,
+                                              struct pike_string *str2 )
+{
+  INT32 min1, max1, min2, max2;
+  /* loose == 1: use cached or size-shift derived bounds, never scan. */
+  check_string_range( str1, 1, &min1, &max1 );
+  check_string_range( str2, 1, &min2, &max2 );
+  if( (min2 < min1) || (max2 > max1) )
+  {
+    /* Both ranges come from the same source (cache or size shift): no match. */
+    if( (str1->flags & STRING_CONTENT_CHECKED) ==
+        (str2->flags & STRING_CONTENT_CHECKED) )
+      return 0;
+    /* fallback to simple size-shift check.  */
+    return str1->size_shift >= str2->size_shift;
+  }
+  return 1;
+}
+
+/* Report the character range of str via *min / *max (either may be NULL).
+ * If loose is set, or a usable range is cached, str is not scanned: cached
+ * bounds (rounded for wide strings) or size shift limits are returned.
+ * Otherwise str is scanned, the exact range returned and the cache set. */
+PMOD_EXPORT void check_string_range( struct pike_string *str,
+                                     int loose,
+                                     INT32 *min, INT32 *max )
+{
+  INT32 s_min = MAX_INT32;
+  INT32 s_max = MIN_INT32;
+  ssize_t i;
+  if( loose || ((str->flags & STRING_CONTENT_CHECKED ) && (!str->size_shift || !max)) )
+  {
+    if( str->flags & STRING_CONTENT_CHECKED )
+    {
+      s_min = str->min;
+      s_max = str->max;
+      if( str->size_shift )
+      {
+        const int bits = 8 * str->size_shift;
+        /* Bucket 0 also holds the clamped negatives of shift 2 strings. */
+        if( s_min )
+          s_min = (s_min << bits) - ((1 << bits) - 1);
+        else if( str->size_shift == 2 )
+          s_min = MIN_INT32;
+        /* The top bucket of a shift 2 string is saturated: no upper bound. */
+        if( str->size_shift == 2 && s_max == 255 )
+          s_max = MAX_INT32;
+        else
+          s_max = (s_max << bits) + ((1 << bits) - 1);
+      }
+    }
+    else
+    {
+      switch( str->size_shift )
+      {
+        case 2: s_min = MIN_INT32; s_max=MAX_INT32; break;
+        case 1: s_min = 0; s_max = 65535; break;
+        case 0: s_min = 0; s_max = 255; break;
+      }
+    }
+  }
+  else
+  {
+    str->flags |= STRING_CONTENT_CHECKED;
+    switch( str->size_shift )
+    {
+      case 0:
+       {
+         p_wchar0 *p = (p_wchar0*)str->str;
+         int upper = 0, lower = 0;
+         for( i=0; i<str->len; i++,p++ )
+         {
+           /* Case is cheap to track here; only used for 7-bit strings. */
+           if( *p >= 'A' && *p <= 'Z') upper++;
+           if( *p >= 'a' && *p <= 'z') lower++;
+           if( *p > s_max ) s_max = *p;
+           if( *p < s_min ) s_min = *p;
+         }
+         if( s_max < 128 )
+         {
+           /* A string without cased letters is both lower and upper. */
+           if( !lower ) str->flags |= STRING_IS_UPPERCASE;
+           if( !upper ) str->flags |= STRING_IS_LOWERCASE;
+         }
+       }
+       str->min = s_min;
+       str->max = s_max;
+       break;
+      case 1:
+       {
+         p_wchar1 *p = (p_wchar1*)str->str;
+         for( i=0; i<str->len; i++,p++ )
+         {
+           if( *p > s_max ) s_max = *p;
+           if( *p < s_min ) s_min = *p;
+         }
+       }
+       /* Saturate at 255: (65535+255)>>8 would wrap to 0 in one byte. */
+       str->min = s_min > 0xff00 ? 255 : (s_min+255) >> 8;
+       str->max = s_max > 0xff00 ? 255 : s_max < 0 ? 0 : (s_max+255) >> 8;
+       break;
+      case 2:
+       {
+         p_wchar2 *p = (p_wchar2*)str->str;
+         for( i=0; i<str->len; i++,p++ )
+         {
+           if( *p > s_max ) s_max = *p;
+           if( *p < s_min ) s_min = *p;
+         }
+       }
+       /* Clamp to 0..255 so the byte cannot wrap (and avoid overflow). */
+       str->min = s_min <= 0 ? 0 : s_min > 0xff0000 ? 255 : (s_min+65535) >> 16;
+       str->max = s_max <= 0 ? 0 : s_max > 0xff0000 ? 255 : (s_max+65535) >> 16;
+       break;
+    }
+  }
+  if( min ) *min = s_min;
+  if( max ) *max = s_max;
+}
 
 static INLINE int find_magnitude1(const p_wchar1 *s, ptrdiff_t len)
 {
@@ -628,6 +759,7 @@ PMOD_EXPORT struct pike_string *debug_begin_shared_string(size_t len)
   add_ref(t);	/* For DMALLOC */
   t->str[len]=0;
   t->len=len;
+/*  t->min = t->max = 0; */
   t->size_shift=0;
   DO_IF_DEBUG(t->next = NULL);
   return t;
@@ -776,6 +908,8 @@ PMOD_EXPORT struct pike_string *debug_begin_wide_shared_string(size_t len, int s
 PMOD_EXPORT void hash_string(struct pike_string *s)
 {
   if (!(s->flags & STRING_NOT_HASHED)) return;
+  /* if( s->len < HASH_PREFIX ) */
+  /*   check_string_range( s, 0, 0, 0 ); */
   s->hval=do_hash(s);
   s->flags &= ~STRING_NOT_HASHED;
 }
@@ -872,7 +1006,7 @@ PMOD_EXPORT struct pike_string *end_shared_string(struct pike_string *s)
 	  /* Fall though */
       }
       break;
-      
+
     case 1:
       if(!find_magnitude1(STR1(s),s->len))
       {
@@ -1295,7 +1429,7 @@ PMOD_EXPORT void check_string(struct pike_string *s)
       locate_problem(wrong_hash);
       Pike_fatal("Hash value changed?\n");
     }
-    
+
     if(debug_findstring(s) !=s)
       Pike_fatal("Shared string not shared.\n");
 
@@ -1728,6 +1862,7 @@ static struct pike_string *realloc_shared_string(struct pike_string *a,
   if(a->refs==1)
   {
     unlink_pike_string(a);
+    CLEAR_STRING_CHECKED(a);
     return realloc_unlinked_string(a, size);
   }else{
     r=begin_wide_shared_string(size,a->size_shift);
@@ -1875,6 +2010,7 @@ struct pike_string *modify_shared_string(struct pike_string *a,
 
     unlink_pike_string(a);
     low_set_index(a, index, c);
+    CLEAR_STRING_CHECKED(a);
     if((((unsigned int)index) >= HASH_PREFIX) && (index < a->len-8))
     {
       struct pike_string *old;
@@ -1906,9 +2042,63 @@ struct pike_string *modify_shared_string(struct pike_string *a,
   }
 }
 
+/* Set the cached range and case flags of ret, the concatenation of a string
+ * described by aflags/amin/amax and the string b.
+ * NOTE(review): amin/amax are assumed to use ret's size shift; the shift
+ * they were computed with is not passed in, so callers must ensure this. */
+PMOD_EXPORT void set_flags_for_add( struct pike_string *ret,
+                                    unsigned char aflags,
+                                    unsigned char amin,
+                                    unsigned char amax,
+                                    struct pike_string *b)
+{
+  if( !b->len ) {
+    ret->flags |= aflags & ~15;
+    ret->min = amin;
+    ret->max = amax;
+    return;
+  }
+  /* Cached bounds are scaled by size shift: only merge equal scales. */
+  if( (aflags & b->flags & STRING_CONTENT_CHECKED) &&
+      ret->size_shift == b->size_shift )
+  {
+    ret->min = MIN( amin, b->min );
+    ret->max = MAX( amax, b->max );
+    ret->flags |= STRING_CONTENT_CHECKED;
+  }
+  else
+    ret->flags &= ~STRING_CONTENT_CHECKED;
+
+  /* The result is lower/uppercase only if both parts are. */
+  ret->flags &= ~(STRING_IS_LOWERCASE|STRING_IS_UPPERCASE);
+  ret->flags |= aflags & b->flags & (STRING_IS_LOWERCASE|STRING_IS_UPPERCASE);
+}
+
+/* Fold b's cached range and case flags into a, which b is appended to.
+ * Bounds are scaled by size shift, so only equal shifts are merged. */
+PMOD_EXPORT void update_flags_for_add( struct pike_string *a, struct pike_string *b)
+{
+  if( !b->len ) return;
+  if( a->flags & STRING_CONTENT_CHECKED )
+  {
+    if( (b->flags & STRING_CONTENT_CHECKED) && a->size_shift == b->size_shift )
+    {
+      if( b->min < a->min ) a->min = b->min;
+      if( b->max > a->max ) a->max = b->max;
+    }
+    else
+      a->flags &= ~STRING_CONTENT_CHECKED;
+  }
+  /* The result is only lower/uppercase if every part is. */
+  if( !(b->flags & STRING_IS_LOWERCASE) )
+    a->flags &= ~STRING_IS_LOWERCASE;
+  if( !(b->flags & STRING_IS_UPPERCASE) )
+    a->flags &= ~STRING_IS_UPPERCASE;
+}
+
 /*** Add strings ***/
 PMOD_EXPORT struct pike_string *add_shared_strings(struct pike_string *a,
-					 struct pike_string *b)
+                                                   struct pike_string *b)
 {
   struct pike_string *ret;
   PCHARP tmp;
@@ -1919,16 +2109,21 @@ PMOD_EXPORT struct pike_string *add_shared_strings(struct pike_string *a,
   pike_string_cpy(tmp,a);
   INC_PCHARP(tmp,a->len);
   pike_string_cpy(tmp,b);
+  set_flags_for_add( ret, a->flags, a->min, a->max, b );
   return low_end_shared_string(ret);
 }
 
 PMOD_EXPORT struct pike_string *add_and_free_shared_strings(struct pike_string *a,
-						struct pike_string *b)
+                                                            struct pike_string *b)
 {
   ptrdiff_t alen = a->len;
   if(a->size_shift == b->size_shift)
   {
-    a = realloc_shared_string(a,alen + b->len);
+    unsigned char aflags = a->flags;
+    unsigned char amin = a->min;
+    unsigned char amax = a->max;
+    a = realloc_shared_string(a, alen + b->len);
+    set_flags_for_add( a, aflags, amin, amax, b );
     MEMCPY(a->str+(alen<<a->size_shift),b->str,b->len<<b->size_shift);
     free_string(b);
     a->flags |= STRING_NOT_HASHED;
@@ -1949,8 +2144,10 @@ PMOD_EXPORT ptrdiff_t string_search(struct pike_string *haystack,
   SearchMojt mojt;
   char *r;
 
-  if(needle->size_shift > haystack->size_shift ||
-     start + needle->len > haystack->len)
+  if( !string_range_contains_string( haystack, needle ) )
+    return -1;
+
+  if(start + needle->len > haystack->len)
     return -1;
 
   if(!needle->len) return start;
@@ -2147,6 +2344,8 @@ void init_shared_string_table(void)
   }
 #endif
   empty_pike_string = make_shared_string("");
+  empty_pike_string->flags |= STRING_CONTENT_CHECKED | STRING_IS_LOWERCASE | STRING_IS_UPPERCASE;
+  empty_pike_string->min = empty_pike_string->max = 0;
 }
 
 #ifdef DO_PIKE_CLEANUP
@@ -2357,6 +2556,7 @@ PMOD_EXPORT int init_string_builder_with_string (struct string_builder *s,
   if (str->refs == 1 && str->len > SHORT_STRING_THRESHOLD) {
     /* Unlink the string and use it as buffer directly. */
     unlink_pike_string (str);
+    str->flags = 0;
     s->s = str;
     s->malloced = str->len;
     s->known_shift = str->size_shift;
diff --git a/src/stralloc.h b/src/stralloc.h
index 779c6772b5..79681fe166 100644
--- a/src/stralloc.h
+++ b/src/stralloc.h
@@ -21,19 +21,23 @@
 #define PIKE_STRING_CONTENTS						\
   INT32 refs;								\
   INT32 ref_type;							\
-  INT16 flags;								\
-  INT16 size_shift; /* 14 bit waste, but good for alignment... */	\
+  unsigned char  flags;								\
+  unsigned char  size_shift; 	\
+  unsigned char  min;								\
+  unsigned char  max; 	\
   ptrdiff_t len; /* Not counting terminating NUL. */			\
   size_t hval;								\
-  struct pike_string *next 
+  struct pike_string *next
 #else /* !ATOMIC_SVALUE */
-#define PIKE_STRING_CONTENTS						\
-  INT32 refs;								\
-  INT16 flags;								\
-  INT16 size_shift; /* 14 bit waste, but good for alignment... */	\
-  ptrdiff_t len; /* Not counting terminating NUL. */			\
-  size_t hval;								\
-  struct pike_string *next 
+#define PIKE_STRING_CONTENTS                \
+    INT32 refs;                                     \
+    unsigned char  flags;                           \
+    unsigned char  size_shift;                      \
+    unsigned char  min;								\
+    unsigned char  max;                                             \
+    ptrdiff_t len; /* Not counting terminating NUL. */              \
+    size_t hval;                                                    \
+    struct pike_string *next
 #endif
 
 struct pike_string
@@ -50,10 +54,16 @@ struct string_builder
 };
 
 /* Flags used in pike_string->flags. */
-#define STRING_NOT_HASHED	1	/* Hash value is invalid. */
-#define STRING_NOT_SHARED	2	/* String not shared. */
-#define STRING_IS_SHORT		4	/* String is blockalloced. */
-#define STRING_CLEAR_ON_EXIT    8       /* Overwrite before free. */
+#define STRING_NOT_HASHED	    1	/* Hash value is invalid. */
+#define STRING_NOT_SHARED	    2	/* String not shared. */
+#define STRING_IS_SHORT		    4	/* String is blockalloced. */
+#define STRING_CLEAR_ON_EXIT    8   /* Overwrite before free. */
+
+#define STRING_CONTENT_CHECKED 16 /* if true, min and max are valid */
+#define STRING_IS_LOWERCASE    32 /* lower_case() returns the string as is */
+#define STRING_IS_UPPERCASE    64 /* upper_case() returns the string as is */
+
+#define CLEAR_STRING_CHECKED(X) do{(X)->flags &= 15;}while(0) /* keeps only the flags below 16 */
 
 /* Flags used by string_builder_append_integer() */
 #define APPEND_SIGNED		1	/* Value is signed */
@@ -96,10 +106,6 @@ struct pike_string *debug_findstring(const struct pike_string *foo);
 #define my_hash_string(X) PTR_TO_INT(X)
 #define is_same_string(X,Y) ((X)==(Y))
 
-/* NB: This intentionally only works for narrow strings. */
-#define string_has_null(X)						\
-  (STRNLEN((X)->str, (size_t)(X)->len) != (size_t)(X)->len)
-
 #ifdef PIKE_DEBUG
 #define STR0(X) ((p_wchar0 *)debug_check_size_shift((X),0)->str)
 #define STR1(X) ((p_wchar1 *)debug_check_size_shift((X),1)->str)
@@ -290,6 +296,14 @@ PMOD_EXPORT struct pike_string *debug_make_shared_string(const char *str);
 PMOD_EXPORT struct pike_string *debug_make_shared_string0(const p_wchar0 *str);
 PMOD_EXPORT struct pike_string *debug_make_shared_string1(const p_wchar1 *str);
 PMOD_EXPORT struct pike_string *debug_make_shared_string2(const p_wchar2 *str);
+PMOD_EXPORT void check_string_range( struct pike_string *str, int loose,
+                                     INT32 *min, INT32 *max );
+/* Returns true if str1 could contain str2. */
+PMOD_EXPORT int string_range_contains_string( struct pike_string *str1,
+                                              struct pike_string *str2 );
+/* Returns true if str could contain n. */
+PMOD_EXPORT int string_range_contains( struct pike_string *str, int n );
+
 PMOD_EXPORT void do_free_string(struct pike_string *s);
 PMOD_EXPORT void do_free_unlinked_pike_string(struct pike_string *s);
 PMOD_EXPORT void really_free_string(struct pike_string *s);
@@ -378,6 +392,11 @@ PMOD_EXPORT ptrdiff_t string_builder_quote_string(struct string_builder *buf,
 						  ptrdiff_t start,
 						  ptrdiff_t max_len,
 						  int flags);
+PMOD_EXPORT void update_flags_for_add( struct pike_string *a, struct pike_string *b);
+PMOD_EXPORT void set_flags_for_add( struct pike_string *ret,
+                                    unsigned char aflags, unsigned char amin,
+                                    unsigned char amax,
+                                    struct pike_string *b);
 PMOD_EXPORT void string_builder_append_integer(struct string_builder *s,
 					       LONGEST val,
 					       unsigned int base,
@@ -435,6 +454,15 @@ static INLINE void string_builder_binary_strcat(struct string_builder *s,
   string_builder_binary_strcat0 (s, (const p_wchar0 *) str, len);
 }
 
+/* True if the smallest character of x is <= 0. Not exact for shift==2. */
+static INLINE int string_has_null( struct pike_string *x )
+{
+    INT32 lowest;
+    if( x->len == 0 ) return 0;  /* empty string: nothing to find */
+    check_string_range( x, 0, &lowest, 0 );  /* strict: may scan x */
+    return lowest <= 0;
+}
+
 #define ISCONSTSTR(X,Y) c_compare_string((X),Y,sizeof(Y)-sizeof(""))
 
 #define visit_string_ref(S, REF_TYPE)				\
-- 
GitLab