Removing some duplicate code in THD::convert_string() & friends

abarkov · abarkov · commit 4a126bf3e184 · 2018-05-11T18:02:16.000+04:00
1. Adding THD::convert_string(LEX_CSTRING *to,...) as a wrapper for convert_string(LEX_STRING *to,...), as LEX_CSTRING is now frequently used for conversion purposes. This reduced duplicate code in the TEXT_STRING_sys, TEXT_STRING_literal, TEXT_STRING_filesystem grammar rules in *.yy 2. Adding yet another THD::convert_string() with an extra parameter "bool simple_copy_is_possible". This further reduced repeated code in the mentioned grammar rules in *.yy 3. Deriving Lex_ident_cli_st from Lex_string_with_metadata_st, as they have very similar functionality. Moving m_quote from Lex_ident_cli_st to Lex_string_with_metadata_st, as m_quote will be used later to optimize string literals anyway (e.g. to avoid redundant copying at the tokenizer stage). Adjusting Lex_input_stream::get_text() accordingly. 4. Moving the remainder of the code in the TEXT_STRING_sys, TEXT_STRING_literal, TEXT_STRING_filesystem grammar rules into new methods in THD: - make_text_string_sys() - make_text_string_connection() - make_text_string_filesystem() and changing *.yy to use these new methods. This reduced the amount of similar code in sql_yacc.yy and sql_yacc_ora.yy. 5. Removing duplicate code in Lex_input_stream::body_utf8_append_ident() by reusing THD::make_text_string_sys(), thanks to #3 and #4. 6. Making THD members charset_is_system_charset, charset_is_collation_connection, charset_is_character_set_filesystem private, as they are not needed externally any more.
diff --git a/sql/sql_class.h b/sql/sql_class.h
@@ -3130,8 +3130,10 @@ class THD :public Statement,
     is set if a statement accesses a temporary table created through
     CREATE TEMPORARY TABLE. 
   */
-  bool	     charset_is_system_charset, charset_is_collation_connection;
+private:
+  bool       charset_is_system_charset, charset_is_collation_connection;
   bool       charset_is_character_set_filesystem;
+public:
   bool       enable_slow_log;    /* Enable slow log for current statement */
   bool	     abort_on_warning;
   bool 	     got_warning;       /* Set on call to push_warning() */
@@ -3706,6 +3708,25 @@ class THD :public Statement,
   bool convert_string(LEX_STRING *to, CHARSET_INFO *to_cs,
 		      const char *from, size_t from_length,
 		      CHARSET_INFO *from_cs);
+  bool convert_string(LEX_CSTRING *to, CHARSET_INFO *to_cs,
+                      const char *from, size_t from_length,
+                      CHARSET_INFO *from_cs)
+  {
+    LEX_STRING tmp;
+    bool rc= convert_string(&tmp, to_cs, from, from_length, from_cs);
+    to->str= tmp.str;
+    to->length= tmp.length;
+    return rc;
+  }
+  bool convert_string(LEX_CSTRING *to, CHARSET_INFO *tocs,
+                      const LEX_CSTRING *from, CHARSET_INFO *fromcs,
+                      bool simple_copy_is_possible)
+  {
+    if (!simple_copy_is_possible)
+      return unlikely(convert_string(to, tocs, from->str, from->length, fromcs));
+    *to= *from;
+    return false;
+  }
   /*
     Convert a strings between character sets.
     Uses my_convert_fix(), which uses an mb_wc .. mc_mb loop internally.
@@ -3767,6 +3788,24 @@ class THD :public Statement,
   Item_basic_constant *make_string_literal_nchar(const Lex_string_with_metadata_st &str);
   Item_basic_constant *make_string_literal_charset(const Lex_string_with_metadata_st &str,
                                                    CHARSET_INFO *cs);
+  bool make_text_string_sys(LEX_CSTRING *to,
+                            const Lex_string_with_metadata_st *from)
+  {
+    return convert_string(to, system_charset_info,
+                          from, charset(), charset_is_system_charset);
+  }
+  bool make_text_string_connection(LEX_CSTRING *to,
+                                   const Lex_string_with_metadata_st *from)
+  {
+    return convert_string(to, variables.collation_connection,
+                          from, charset(), charset_is_collation_connection);
+  }
+  bool make_text_string_filesystem(LEX_CSTRING *to,
+                                   const Lex_string_with_metadata_st *from)
+  {
+    return convert_string(to, variables.character_set_filesystem,
+                          from, charset(), charset_is_character_set_filesystem);
+  }
   void add_changed_table(TABLE *table);
   void add_changed_table(const char *key, size_t key_length);
   CHANGED_TABLE_LIST * changed_table_dup(const char *key, size_t key_length);
diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
@@ -416,32 +416,18 @@ void Lex_input_stream::body_utf8_append(const char *ptr)
                   operation.
 */
 
-void Lex_input_stream::body_utf8_append_ident(THD *thd,
-                                              const LEX_CSTRING *txt,
-                                              const char *end_ptr)
+void
+Lex_input_stream::body_utf8_append_ident(THD *thd,
+                                         const Lex_string_with_metadata_st *txt,
+                                         const char *end_ptr)
 {
   if (!m_cpp_utf8_processed_ptr)
     return;
 
   LEX_CSTRING utf_txt;
-  CHARSET_INFO *txt_cs= thd->charset();
-
-  if (!my_charset_same(txt_cs, &my_charset_utf8_general_ci))
-  {
-    LEX_STRING to;
-    thd->convert_string(&to,
-                        &my_charset_utf8_general_ci,
-                        txt->str, (uint) txt->length,
-                        txt_cs);
-    utf_txt.str=    to.str;
-    utf_txt.length= to.length;
-
-  }
-  else
-    utf_txt= *txt;
+  thd->make_text_string_sys(&utf_txt, txt); // QQ: check return value?
 
   /* NOTE: utf_txt.length is in bytes, not in symbols. */
-
   memcpy(m_body_utf8_ptr, utf_txt.str, utf_txt.length);
   m_body_utf8_ptr += utf_txt.length;
   *m_body_utf8_ptr= 0;
@@ -1043,13 +1029,13 @@ bool Lex_input_stream::get_text(Lex_string_with_metadata_st *dst, uint sep,
   uchar c;
   uint found_escape=0;
   CHARSET_INFO *cs= m_thd->charset();
+  bool is_8bit= false;
 
-  dst->set_8bit(false);
   while (! eof())
   {
     c= yyGet();
     if (c & 0x80)
-      dst->set_8bit(true);
+      is_8bit= true;
 #ifdef USE_MB
     {
       int l;
@@ -1093,23 +1079,24 @@ bool Lex_input_stream::get_text(Lex_string_with_metadata_st *dst, uint sep,
 
       if (!(to= (char*) m_thd->alloc((uint) (end - str) + 1)))
       {
-        dst->str= "";        // Sql_alloc has set error flag
-        dst->length= 0;
-        return true;
+        dst->set(&empty_clex_str, 0, '\0');
+        return true;                   // Sql_alloc has set error flag
       }
-      dst->str= to;
 
       m_cpp_text_start= m_cpp_tok_start + pre_skip;
       m_cpp_text_end= get_cpp_ptr() - post_skip;
 
       if (!found_escape)
       {
-        memcpy(to, str, dst->length= (end - str));
-        to[dst->length]= 0;
+        size_t len= (end - str);
+        memcpy(to, str, len);
+        to[len]= '\0';
+        dst->set(to, len, is_8bit, '\0');
       }
       else
       {
-        dst->length= unescape(cs, to, str, end, sep);
+        size_t len= unescape(cs, to, str, end, sep);
+        dst->set(to, len, is_8bit, '\0');
       }
       return false;
     }
diff --git a/sql/sql_lex.h b/sql/sql_lex.h
@@ -37,22 +37,47 @@
 
 
 /**
-  A string with metadata.
+  A string with metadata. Usually points to a string in the client
+  character set, but unlike Lex_ident_cli_st (see below) it does not
+  necessarily point to a query fragment. It can also point to memory
+  of other kinds (e.g. an additional THD allocated memory buffer
+  not overlapping with the current query text).
+
   We'll add more flags here eventually, to know if the string has, e.g.:
   - multi-byte characters
   - bad byte sequences
   - backslash escapes:   'a\nb'
-  - separator escapes:   'a''b'
   and reuse the original query fragments instead of making the string
   copy too early, in Lex_input_stream::get_text().
   This will allow to avoid unnecessary copying, as well as
   create more optimal Item types in sql_yacc.yy
 */
 struct Lex_string_with_metadata_st: public LEX_CSTRING
 {
+private:
   bool m_is_8bit; // True if the string has 8bit characters
+  char m_quote;   // Quote character, or 0 if not quoted
 public:
   void set_8bit(bool is_8bit) { m_is_8bit= is_8bit; }
+  void set_metadata(bool is_8bit, char quote)
+  {
+    m_is_8bit= is_8bit;
+    m_quote= quote;
+  }
+  void set(const char *s, size_t len, bool is_8bit, char quote)
+  {
+    str= s;
+    length= len;
+    set_metadata(is_8bit, quote);
+  }
+  void set(const LEX_CSTRING *s, bool is_8bit, char quote)
+  {
+    ((LEX_CSTRING &)*this)= *s;
+    set_metadata(is_8bit, quote);
+  }
+  bool is_8bit() const { return m_is_8bit; }
+  bool is_quoted() const { return m_quote != '\0'; }
+  char quote() const { return m_quote; }
   // Get string repertoire by the 8-bit flag and the character set
   uint repertoire(CHARSET_INFO *cs) const
   {
@@ -71,44 +96,27 @@ struct Lex_string_with_metadata_st: public LEX_CSTRING
   Used to store identifiers in the client character set.
   Points to a query fragment.
 */
-struct Lex_ident_cli_st: public LEX_CSTRING
+struct Lex_ident_cli_st: public Lex_string_with_metadata_st
 {
-private:
-  bool m_is_8bit;
-  char m_quote;
 public:
   void set_keyword(const char *s, size_t len)
   {
-    str= s;
-    length= len;
-    m_is_8bit= false;
-    m_quote= '\0';
+    set(s, len, false, '\0');
   }
   void set_ident(const char *s, size_t len, bool is_8bit)
   {
-    str= s;
-    length= len;
-    m_is_8bit= is_8bit;
-    m_quote= '\0';
+    set(s, len, is_8bit, '\0');
   }
   void set_ident_quoted(const char *s, size_t len, bool is_8bit, char quote)
   {
-    str= s;
-    length= len;
-    m_is_8bit= is_8bit;
-    m_quote= quote;
+    set(s, len, is_8bit, quote);
   }
   void set_unquoted(const LEX_CSTRING *s, bool is_8bit)
   {
-    ((LEX_CSTRING &)*this)= *s;
-    m_is_8bit= is_8bit;
-    m_quote= '\0';
+    set(s, is_8bit, '\0');
   }
   const char *pos() const { return str - is_quoted(); }
   const char *end() const { return str + length + is_quoted(); }
-  bool is_quoted() const { return m_quote != '\0'; }
-  bool is_8bit() const { return m_is_8bit; }
-  char quote() const { return m_quote; }
 };
 
 
@@ -2453,7 +2461,7 @@ class Lex_input_stream
   void body_utf8_append(const char *ptr);
   void body_utf8_append(const char *ptr, const char *end_ptr);
   void body_utf8_append_ident(THD *thd,
-                              const LEX_CSTRING *txt,
+                              const Lex_string_with_metadata_st *txt,
                               const char *end_ptr);
   void body_utf8_append_escape(THD *thd,
                                const LEX_CSTRING *txt,
diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy
@@ -15170,57 +15170,26 @@ IDENT_sys:
 TEXT_STRING_sys:
           TEXT_STRING
           {
-            if (thd->charset_is_system_charset)
-              $$= $1;
-            else
-            {
-              LEX_STRING to;
-              if (unlikely(thd->convert_string(&to, system_charset_info,
-                                               $1.str, $1.length,
-                                               thd->charset())))
-                MYSQL_YYABORT;
-              $$.str=    to.str;
-	      $$.length= to.length;
-            }
+            if (thd->make_text_string_sys(&$$, &$1))
+              MYSQL_YYABORT;
           }
         ;
 
 TEXT_STRING_literal:
           TEXT_STRING
           {
-            if (thd->charset_is_collation_connection)
-              $$= $1;
-            else
-            {
-              LEX_STRING to;
-              if (unlikely(thd->convert_string(&to,
-                                               thd->variables.collation_connection,
-                                               $1.str, $1.length,
-                                               thd->charset())))
-                MYSQL_YYABORT;
-              $$.str=    to.str;
-	      $$.length= to.length;
-            }
+            if (thd->make_text_string_connection(&$$, &$1))
+              MYSQL_YYABORT;
           }
         ;
 
 TEXT_STRING_filesystem:
           TEXT_STRING
           {
-            if (thd->charset_is_character_set_filesystem)
-              $$= $1;
-            else
-            {
-              LEX_STRING to;
-              if (unlikely(thd->convert_string(&to,
-                                               thd->variables.character_set_filesystem,
-                                               $1.str, $1.length,
-                                               thd->charset())))
-                MYSQL_YYABORT;
-              $$.str=    to.str;
-	      $$.length= to.length;
-            }
+            if (thd->make_text_string_filesystem(&$$, &$1))
+              MYSQL_YYABORT;
           }
+        ;
 
 ident_table_alias:
           IDENT_sys
diff --git a/sql/sql_yacc_ora.yy b/sql/sql_yacc_ora.yy
@@ -14919,56 +14919,24 @@ IDENT_sys:
 TEXT_STRING_sys:
           TEXT_STRING
           {
-            if (thd->charset_is_system_charset)
-              $$= $1;
-            else
-            {
-              LEX_STRING to;
-              if (unlikely(thd->convert_string(&to, system_charset_info,
-                                               $1.str, $1.length,
-                                               thd->charset())))
-                MYSQL_YYABORT;
-              $$.str=    to.str;
-	      $$.length= to.length;
-            }
+            if (thd->make_text_string_sys(&$$, &$1))
+              MYSQL_YYABORT;
           }
         ;
 
 TEXT_STRING_literal:
           TEXT_STRING
           {
-            if (thd->charset_is_collation_connection)
-              $$= $1;
-            else
-            {
-              LEX_STRING to;
-              if (unlikely(thd->convert_string(&to,
-                                               thd->variables.collation_connection,
-                                               $1.str, $1.length,
-                                               thd->charset())))
-                MYSQL_YYABORT;
-              $$.str=    to.str;
-	      $$.length= to.length;
-            }
+            if (thd->make_text_string_connection(&$$, &$1))
+              MYSQL_YYABORT;
           }
         ;
 
 TEXT_STRING_filesystem:
           TEXT_STRING
           {
-            if (thd->charset_is_character_set_filesystem)
-              $$= $1;
-            else
-            {
-              LEX_STRING to;
-              if (unlikely(thd->convert_string(&to,
-                                               thd->variables.character_set_filesystem,
-                                               $1.str, $1.length,
-                                               thd->charset())))
-                MYSQL_YYABORT;
-              $$.str=    to.str;
-	      $$.length= to.length;
-            }
+            if (thd->make_text_string_filesystem(&$$, &$1))
+              MYSQL_YYABORT;
           }
         ;