From 4ec32c792bd9ae6569da6dda6a76530d80bbc41c Mon Sep 17 00:00:00 2001
From: Per Hedbor <ph@opera.com>
Date: Thu, 7 Aug 2014 18:22:52 +0200
Subject: [PATCH] Optimized access to private/final global variables

Especially the machine code version is now significantly faster, it
will simply read the variable directly from the known byte offset
instead of calling a function that resolves it in the vtable.

Gives about a 20x speedup of trivial code along the lines of
globala = globala + globalb;

Also tried to disable some of the optimizations that causes lvalues to
be generated instead of the desired global/assign_global opcodes.

For now this is only done if the global variabeles are known to not be
arrays, multiset, strings, mapping or objects, since those
optimizations are needed to quickly append things to arrays (and
mappings/multiset, but that is less common. It is also needed for
destructive modifications of strings, something that is even less
common).
---
 src/code/amd64.c          |  85 +++++++++++++++++++++++-----
 src/docode.c              | 113 ++++++++++++++++++++++++++++----------
 src/interpret_functions.h |  20 +++++++
 3 files changed, 173 insertions(+), 45 deletions(-)

diff --git a/src/code/amd64.c b/src/code/amd64.c
index e3638fbfe0..142b7cee8e 100644
--- a/src/code/amd64.c
+++ b/src/code/amd64.c
@@ -1119,6 +1119,26 @@ static void amd64_push_int_reg(enum amd64_reg reg )
   amd64_add_sp( 1 );
 }
 
+static void amd64_get_storage( enum amd64_reg reg, ptrdiff_t offset )
+{
+  amd64_load_fp_reg();
+#if 0
+  /* Note: We really should keep pike_frame->current_storage up to date instead.. */
+  mov_mem_reg( fp_reg, OFFSETOF(pike_frame,current_storage), P_REG_RAX );
+  add_reg_imm( reg, offset );
+#else
+  /* fp->current_object->storage */
+  mov_mem_reg(fp_reg, OFFSETOF(pike_frame, current_object), P_REG_RAX);
+  mov_mem_reg(P_REG_RAX, OFFSETOF(object,storage), reg );
+  mov_mem_reg(fp_reg, OFFSETOF(pike_frame, context), P_REG_RAX);
+  /* + fp->context->storage_offset */
+  add_reg_mem( reg, P_REG_RAX, OFFSETOF(inherit,storage_offset) );
+  /* + offset */
+  add_reg_imm( reg, offset);
+#endif
+}
+
+
 static void amd64_mark(int offset)
 {
   amd64_load_sp_reg();
@@ -1224,25 +1244,25 @@ void amd64_ref_svalue( enum amd64_reg src, int already_have_type )
  LABEL_A;
 }
 
+void amd64_assign_svalue_no_free( enum amd64_reg dst, enum amd64_reg src, ptrdiff_t src_offset )
+{
+  if( dst == P_REG_RAX ||src == P_REG_RAX )
+    Pike_fatal( "Clobbering src/dst in amd64_assign_svalue_no_free <%d,%d>\n", dst, src);
+  /* Copy src -> dst */
+  mov_mem_reg(src, src_offset, P_REG_RAX);
+  mov_reg_mem(P_REG_RAX, dst, 0 );
+  mov_mem_reg(src, src_offset+sizeof(long), P_REG_RAX);
+  mov_reg_mem(P_REG_RAX, dst, sizeof(long) );
+}
+
 void amd64_assign_local( int b )
 {
   amd64_load_fp_reg();
   amd64_load_sp_reg();
-
-  mov_mem_reg( fp_reg, OFFSETOF(pike_frame, locals), ARG1_REG);
-  add_reg_imm( ARG1_REG,b*sizeof(struct svalue) );
-  mov_reg_reg( ARG1_REG, P_REG_RBX );
-
-  /* Free old svalue. */
-  amd64_free_svalue(ARG1_REG, 0);
-
-  /* Copy sp[-1] -> local */
-  amd64_load_sp_reg();
-  mov_mem_reg(sp_reg, -1*sizeof(struct svalue), P_REG_RAX);
-  mov_mem_reg(sp_reg, -1*sizeof(struct svalue)+sizeof(long), P_REG_RCX);
-
-  mov_reg_mem( P_REG_RAX, P_REG_RBX, 0 );
-  mov_reg_mem( P_REG_RCX, P_REG_RBX, sizeof(long) );
+  mov_mem_reg( fp_reg, OFFSETOF(pike_frame, locals), P_REG_RBX);
+  add_reg_imm( P_REG_RBX, b*sizeof(struct svalue) );
+  amd64_free_svalue(P_REG_RBX, 0);
+  amd64_assign_svalue_no_free( P_REG_RBX, sp_reg, -1*sizeof(struct svalue));
 }
 
 static void amd64_pop_mark(void)
@@ -2576,6 +2596,25 @@ void ins_f_byte_with_arg(unsigned int a, INT32 b)
     amd64_add_sp(-1);
     return;
 
+  case F_ASSIGN_PRIVATE_GLOBAL_AND_POP:
+  case F_ASSIGN_PRIVATE_GLOBAL:
+    ins_debug_instr_prologue(a-F_OFFSET, b, 0);
+    amd64_get_storage( P_REG_RBX, b );
+    amd64_free_svalue( P_REG_RBX, 0 );
+    amd64_load_sp_reg();
+
+    if( a == F_ASSIGN_PRIVATE_GLOBAL_AND_POP )
+    {
+      amd64_add_sp(-1);
+      amd64_assign_svalue_no_free( P_REG_RBX, sp_reg, 0);
+    }
+    else
+    {
+      amd64_assign_svalue_no_free( P_REG_RBX, sp_reg, -sizeof(struct svalue));
+      amd64_ref_svalue(P_REG_RBX,1); /* already have type in RAX (from assign_svalue) */
+    }
+    return;
+
   case F_ASSIGN_GLOBAL:
   case F_ASSIGN_GLOBAL_AND_POP:
       /* arg1: pike_fp->current obj
@@ -2647,6 +2686,22 @@ void ins_f_byte_with_arg(unsigned int a, INT32 b)
     }
     return;
 
+  case F_PRIVATE_GLOBAL:
+/*
+  This is a private (or final) global which is not a 'short' one.
+
+  It is guaranteed to be available as an svalue at
+  pike_fp->current_object->storage + pike_fp->context->storage_offset + offset
+
+  (I really wish pike_fp->current_storage was kept up to date for pike
+  function..)
+*/
+    ins_debug_instr_prologue(a-F_OFFSET, b, 0);
+    amd64_load_sp_reg();
+    amd64_get_storage( P_REG_RBX, b );
+    amd64_push_svaluep( P_REG_RBX );
+    return;
+
   case F_GLOBAL:
     ins_debug_instr_prologue(a-F_OFFSET, b, 0);
     amd64_load_fp_reg();
diff --git a/src/docode.c b/src/docode.c
index dcb35d3652..5e8e7e8b4d 100644
--- a/src/docode.c
+++ b/src/docode.c
@@ -678,6 +678,46 @@ static void emit_range (node *n DO_IF_DEBUG (COMMA int num_args))
   emit1 (F_RANGE, bound_types);
 }
 
+static void emit_global( int n )
+{
+  struct compilation *c = THIS_COMPILATION;
+  struct reference *ref = PTR_FROM_INT(Pike_compiler->new_program, n);
+  struct identifier *id = ID_FROM_PTR(Pike_compiler->new_program, ref);
+  if( (ref->id_flags & (ID_PRIVATE|ID_FINAL))
+      && !(id->identifier_flags & IDENTIFIER_NO_THIS_REF)
+      && IDENTIFIER_IS_VARIABLE(id->identifier_flags)
+      && !ref->inherit_offset
+      && id->run_time_type == PIKE_T_MIXED )
+  {
+    /* fprintf( stderr, "private global %d\n", (INT32)id->func.offset  ); */
+    emit1(F_PRIVATE_GLOBAL, id->func.offset);
+  }
+  else
+    emit1(F_GLOBAL, n);
+}
+
+static void emit_assign_global( int n, int and_pop )
+{
+  struct compilation *c = THIS_COMPILATION;
+  struct reference *ref = PTR_FROM_INT(Pike_compiler->new_program, n);
+  struct identifier *id = ID_FROM_PTR(Pike_compiler->new_program, ref);
+
+  if( (ref->id_flags & (ID_PRIVATE|ID_FINAL))
+      && !(id->identifier_flags & IDENTIFIER_NO_THIS_REF)
+      && !ref->inherit_offset
+      && id->run_time_type == PIKE_T_MIXED )
+  {
+    /* fprintf( stderr, "assign private global and pop %d\n",  */
+    /*          (INT32)id->func.offset ); */
+    emit1((and_pop?F_ASSIGN_PRIVATE_GLOBAL_AND_POP:F_ASSIGN_PRIVATE_GLOBAL),
+          id->func.offset);
+  }
+  else
+  {
+    emit1((and_pop?F_ASSIGN_GLOBAL_AND_POP:F_ASSIGN_GLOBAL), n);
+  }
+}
+
 static void emit_multi_assign(node *vals, node *vars, int no)
 {
   struct compilation *c = THIS_COMPILATION;
@@ -726,7 +766,7 @@ static void emit_multi_assign(node *vals, node *vars, int no)
     }else{
       code_expression(val, 0, "RHS");
       emit_multi_assign(vals, vars, no+1);
-      emit1(F_ASSIGN_GLOBAL_AND_POP, var->u.id.number);
+      emit_assign_global( var->u.id.number, 1 );
     }
     break;
 
@@ -785,7 +825,7 @@ static void emit_multi_assign(node *vals, node *vars, int no)
       {
 	code_expression(val, 0, "RHS");
 	emit_multi_assign(vals, vars, no+1);
-	emit1(F_ASSIGN_GLOBAL_AND_POP, var->u.integer.b);
+        emit_assign_global( var->u.integer.b, 1 );
 	break;
       }
     }
@@ -991,7 +1031,7 @@ static int do_docode2(node *n, int flags)
 	    emit1(F_CONSTANT, id->func.const_info.offset);
 	  }
 	}else{
-	  emit1(F_GLOBAL, n->u.integer.b);
+          emit_global( n->u.integer.b );
 	}
       }
     }
@@ -1270,14 +1310,15 @@ static int do_docode2(node *n, int flags)
     case F_MULTIPLY:
       if(node_is_eq(CDR(n),CAAR(n)))
       {
-	int num_args;
-	tmp1=do_docode(CDR(n),DO_LVALUE);
+        int num_args;
+	/* tmp1=do_docode(CDR(n),DO_LVALUE); */
 	if(match_types(CDR(n)->type, array_type_string) ||
 	   match_types(CDR(n)->type, string_type_string) ||
 	   match_types(CDR(n)->type, object_type_string) ||
 	   match_types(CDR(n)->type, multiset_type_string) ||
 	   match_types(CDR(n)->type, mapping_type_string))
 	{
+          do_docode(CDR(n),DO_LVALUE);
 	  num_args = do_docode(CDAR(n), 0);
 	  switch (num_args)
 	  {
@@ -1290,6 +1331,7 @@ static int do_docode2(node *n, int flags)
 #endif
 	  }
 	}else{
+          goto do_not_suboptimize_assign;
 	  emit0(F_LTOSVAL);
 	  num_args = do_docode(CDAR(n), 0);
 	}
@@ -1314,32 +1356,45 @@ static int do_docode2(node *n, int flags)
 	  node **arg = my_get_arg(&args, 0);
 	  if (arg && node_is_eq(CDR(n), *arg) &&
 	      !(args->tree_info & OPT_ASSIGNMENT)) {
-	    /* First arg is the lvalue.
-	     *
-	     * We optimize this to allow for destructive operations.
-	     */
-	    int no = 0;
-	    tmp1 = do_docode(CDR(n), DO_LVALUE);
-	    emit0(F_MARK_AND_CONST0);
-	    PUSH_CLEANUP_FRAME(do_pop_mark, 0);
-	    while ((arg = my_get_arg(&args, ++no)) && *arg) {
-	      do_docode(*arg, 0);
-	    }
-	    tmp1=store_constant(&CAAR(n)->u.sval,
-				!(CAAR(n)->tree_info & OPT_EXTERNAL_DEPEND),
-				CAAR(n)->name);
-	    emit1(F_LTOSVAL_CALL_BUILTIN_AND_ASSIGN, DO_NOT_WARN((INT32)tmp1));
-	    POP_AND_DONT_CLEANUP;
-	    return 1;
-	  }
+            if(match_types(CDR(n)->type, array_type_string) ||
+               match_types(CDR(n)->type, string_type_string) ||
+               match_types(CDR(n)->type, object_type_string) ||
+               match_types(CDR(n)->type, multiset_type_string) ||
+               match_types(CDR(n)->type, mapping_type_string))
+            {
+              /* First arg is the lvalue.
+               *
+               * We optimize this to allow for destructive operations.
+               */
+              int no = 0;
+              tmp1 = do_docode(CDR(n), DO_LVALUE);
+              emit0(F_MARK_AND_CONST0);
+              PUSH_CLEANUP_FRAME(do_pop_mark, 0);
+              while ((arg = my_get_arg(&args, ++no)) && *arg) {
+                do_docode(*arg, 0);
+              }
+              tmp1=store_constant(&CAAR(n)->u.sval,
+                                  !(CAAR(n)->tree_info & OPT_EXTERNAL_DEPEND),
+                                  CAAR(n)->name);
+              emit1(F_LTOSVAL_CALL_BUILTIN_AND_ASSIGN, DO_NOT_WARN((INT32)tmp1));
+              POP_AND_DONT_CLEANUP;
+              return 1;
+            }
+          }
 	}
       }
       /* FALL_THROUGH */
     default:
+      do_not_suboptimize_assign:
       switch(CDR(n)->token)
       {
+      case F_GLOBAL:
+  	  if(CDR(n)->u.integer.b) goto normal_assign;
+	  code_expression(CAR(n), 0, "RHS");
+          emit_assign_global( CDR(n)->u.integer.a, flags & DO_POP );
+          break;
       case F_LOCAL:
-	if(CDR(n)->u.integer.a >= 
+	if(CDR(n)->u.integer.a >=
 	   find_local_frame(CDR(n)->u.integer.b)->max_number_of_locals)
 	  yyerror("Illegal to use local variable here.");
 
@@ -1362,8 +1417,7 @@ static int do_docode2(node *n, int flags)
 	  yyerror("Cannot assign functions or constants.\n");
 	}else{
 	  code_expression(CAR(n), 0, "RHS");
-	  emit1(flags & DO_POP ? F_ASSIGN_GLOBAL_AND_POP:F_ASSIGN_GLOBAL,
-		CDR(n)->u.id.number);
+          emit_assign_global( CDR(n)->u.id.number, flags & DO_POP );
 	}
 	break;
 
@@ -1435,8 +1489,7 @@ static int do_docode2(node *n, int flags)
 	       IDENTIFIER_IS_VARIABLE( ID_FROM_INT(Pike_compiler->new_program, CDR(n)->u.integer.b)->identifier_flags))
 	    {
 	      code_expression(CAR(n), 0, "RHS");
-	      emit1(flags & DO_POP ? F_ASSIGN_GLOBAL_AND_POP:F_ASSIGN_GLOBAL,
-		    CDR(n)->u.integer.b);
+              emit_assign_global(CDR(n)->u.integer.b, flags & DO_POP );
 	      break;
 	    }
 	  }
@@ -2770,7 +2823,7 @@ static int do_docode2(node *n, int flags)
 	   * prototype. */
 	  emit1(F_LFUN,n->u.id.number);
 	else
-	  emit1(F_GLOBAL,n->u.id.number);
+          emit_global(n->u.id.number);
       }
     }else{
       if(flags & WANT_LVALUE)
@@ -2778,7 +2831,7 @@ static int do_docode2(node *n, int flags)
 	emit1(F_GLOBAL_LVALUE,n->u.id.number);
 	return 2;
       }else{
-	emit1(F_GLOBAL,n->u.id.number);
+        emit_global(n->u.id.number);
       }
     }
     return 1;
diff --git a/src/interpret_functions.h b/src/interpret_functions.h
index 45d5234b20..087235f0c9 100644
--- a/src/interpret_functions.h
+++ b/src/interpret_functions.h
@@ -355,6 +355,13 @@ OPCODE1_TAIL(F_MARK_AND_GLOBAL, "mark & global", I_UPDATE_SP|I_UPDATE_M_SP, {
   });
 });
 
+OPCODE1(F_PRIVATE_GLOBAL, "global <private>", I_UPDATE_SP, {
+    struct svalue *sp;
+    sp = (struct svalue *)(Pike_fp->current_object->storage + Pike_fp->context->storage_offset + arg1);
+    push_svalue( sp );
+    print_return_value();
+});
+
 OPCODE2_TAIL(F_MARK_AND_EXTERNAL, "mark & external", I_UPDATE_SP|I_UPDATE_M_SP, {
   *(Pike_mark_sp++)=Pike_sp;
 
@@ -1075,6 +1082,19 @@ OPCODE1(F_ASSIGN_GLOBAL_AND_POP, "assign global and pop", I_UPDATE_SP, {
   pop_stack();
 });
 
+OPCODE1(F_ASSIGN_PRIVATE_GLOBAL_AND_POP, "assign private global and pop", I_UPDATE_SP, {
+  struct svalue *tmp;
+  tmp = (struct svalue *)(Pike_fp->current_object->storage + Pike_fp->context->storage_offset + arg1);
+  free_svalue(tmp);
+  *tmp = *--Pike_sp;
+});
+
+OPCODE1(F_ASSIGN_PRIVATE_GLOBAL, "assign private global", I_UPDATE_SP, {
+  struct svalue *tmp;
+  tmp = (struct svalue *)(Pike_fp->current_object->storage + Pike_fp->context->storage_offset + arg1);
+  assign_svalue( tmp, Pike_sp-1 );
+});
+
 OPCODE2(F_ASSIGN_GLOBAL_NUMBER_AND_POP, "assign global number and pop", 0, {
   struct svalue tmp;
   SET_SVAL(tmp,PIKE_T_INT,0,integer,arg2);
-- 
GitLab