Add encoding argument to `Integral#chr`

Currently, `Integral#chr` in mruby changes its behavior depending on the `MRB_UTF8_STRING` setting. Before this patch: $ bin/mruby -e 'p 171.chr' #=> "\xab" (`MRB_UTF8_STRING` is disabled) $ bin/mruby -e 'p 171.chr' #=> "«" (`MRB_UTF8_STRING` is enabled) This behavior is incompatible with Ruby, and it is a little inconvenient because the result can't be interpreted as ASCII-8BIT when `MRB_UTF8_STRING` is enabled, I think. So add an encoding argument, following Ruby. After this patch: $ bin/mruby -e 'p 171.chr' #=> "\xab" $ bin/mruby -e 'p 171.chr("ASCII-8BIT")' #=> "\xab" $ bin/mruby -e 'p 171.chr("UTF-8")' #=> "«" Only a `String` is allowed for the encoding because mruby doesn't have an `Encoding` class; `"ASCII-8BIT"` (`"BINARY"`) and `"UTF-8"` (only with `MRB_UTF8_STRING`) are the valid values (the default is `"ASCII-8BIT"`).

Add encoding argument to `Integral#chr`
Currently, `Integral#chr` in mruby changes its behavior depending on the `MRB_UTF8_STRING` setting. Before this patch: $ bin/mruby -e 'p 171.chr' #=> "\xab" (`MRB_UTF8_STRING` is disabled) $ bin/mruby -e 'p 171.chr' #=> "«" (`MRB_UTF8_STRING` is enabled) This behavior is incompatible with Ruby, and it is a little inconvenient because the result can't be interpreted as ASCII-8BIT when `MRB_UTF8_STRING` is enabled, I think. So add an encoding argument, following Ruby. After this patch: $ bin/mruby -e 'p 171.chr' #=> "\xab" $ bin/mruby -e 'p 171.chr("ASCII-8BIT")' #=> "\xab" $ bin/mruby -e 'p 171.chr("UTF-8")' #=> "«" Only a `String` is allowed for the encoding because mruby doesn't have an `Encoding` class; `"ASCII-8BIT"` (`"BINARY"`) and `"UTF-8"` (only with `MRB_UTF8_STRING`) are the valid values (the default is `"ASCII-8BIT"`).
e86aa61f · KOBAYASHI Shuji · 5166e316 · e86aa61f · e86aa61f · e86aa61f
Commit e86aa61f authored Jul 23, 2019 by KOBAYASHI Shuji
4 changed files
--- a/mrbgems/mruby-string-ext/mrblib/string.rb
+++ b/mrbgems/mruby-string-ext/mrblib/string.rb
@@ -414,7 +414,7 @@ class String
      e = max.ord
      while c <= e
        break if exclusive and c == e
-        yield c.chr
+        yield c.chr(__ENCODING__)
        c += 1
      end
      return self

--- a/mrbgems/mruby-string-ext/src/string.c
+++ b/mrbgems/mruby-string-ext/src/string.c
@@ -5,6 +5,91 @@
 #include <mruby/string.h>
 #include <mruby/range.h>

+#define ENC_ASCII_8BIT "ASCII-8BIT"  /* encoding name for the single-byte (binary) conversion */
+#define ENC_BINARY     "BINARY"  /* second accepted spelling for the binary conversion */
+#define ENC_UTF8       "UTF-8"  /* encoding name for the UTF-8 conversion */
+
+#define ENC_COMP_P(enc, enc_lit) \
+  str_casecmp_p(RSTRING_PTR(enc), RSTRING_LEN(enc), enc_lit, sizeof(enc_lit"")-1)  /* sizeof-1 drops the NUL; enc_lit"" only compiles for a string literal */
+
+#ifdef MRB_WITHOUT_FLOAT
+# define mrb_float_p(o) FALSE  /* stub so code can test for Float even when float support is compiled out */
+#endif
+
+static mrb_bool  /* TRUE when the two byte ranges have equal length and match ignoring case */
+str_casecmp_p(const char *s1, mrb_int len1, const char *s2, mrb_int len2)
+{
+  const char *e1, *e2;
+
+  if (len1 != len2) return FALSE;  /* different lengths can never match */
+  e1 = s1 + len1;  /* one past the last byte of s1 */
+  e2 = s2 + len2;  /* one past the last byte of s2 */
+  while (s1 < e1 && s2 < e2) {
+    if (*s1 != *s2 && TOUPPER(*s1) != TOUPPER(*s2)) return FALSE;  /* TOUPPER is defined outside this hunk; presumably ASCII-only -- confirm */
+    ++s1;
+    ++s2;
+  }
+  return TRUE;  /* every byte pair compared equal */
+}
+
+static mrb_value  /* builds a one-byte String from num; raises RangeError outside 0..0xff */
+int_chr_binary(mrb_state *mrb, mrb_value num)
+{
+  mrb_int cp = mrb_int(mrb, num);  /* mrb_int() is a macro defined outside this hunk; presumably converts num to an integer */
+  char c;
+  mrb_value str;
+
+  if (cp < 0 || 0xff < cp) {  /* only values that fit in a single byte are accepted */
+    mrb_raisef(mrb, E_RANGE_ERROR, "%S out of char range", num);
+  }
+  c = (char)cp;
+  str = mrb_str_new(mrb, &c, 1);  /* new String holding exactly this one byte */
+  RSTR_SET_ASCII_FLAG(mrb_str_ptr(str));  /* NOTE(review): flag is set even for bytes >= 0x80 -- confirm this is intended */
+  return str;
+}
+
+#ifdef MRB_UTF8_STRING
+static mrb_value  /* builds a String holding the UTF-8 encoding of code point num; raises RangeError outside 0..0x10FFFF */
+int_chr_utf8(mrb_state *mrb, mrb_value num)
+{
+  mrb_int cp = mrb_int(mrb, num);  /* mrb_int() is a macro defined outside this hunk; presumably converts num to an integer */
+  char utf8[4];  /* the longest sequence produced below is 4 bytes */
+  mrb_int len;
+  mrb_value str;
+  uint32_t ascii_flag = 0;  /* becomes MRB_STR_ASCII only for the one-byte case */
+
+  if (cp < 0 || 0x10FFFF < cp) {  /* NOTE(review): surrogates 0xD800-0xDFFF pass this check and are encoded as-is */
+    mrb_raisef(mrb, E_RANGE_ERROR, "%S out of char range", num);
+  }
+  if (cp < 0x80) {  /* one byte: the code point itself */
+    utf8[0] = (char)cp;
+    len = 1;
+    ascii_flag = MRB_STR_ASCII;
+  }
+  else if (cp < 0x800) {  /* two bytes: 110xxxxx 10xxxxxx */
+    utf8[0] = (char)(0xC0 | (cp >> 6));
+    utf8[1] = (char)(0x80 | (cp & 0x3F));
+    len = 2;
+  }
+  else if (cp < 0x10000) {  /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
+    utf8[0] = (char)(0xE0 |  (cp >> 12));
+    utf8[1] = (char)(0x80 | ((cp >>  6) & 0x3F));
+    utf8[2] = (char)(0x80 | ( cp        & 0x3F));
+    len = 3;
+  }
+  else {  /* four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+    utf8[0] = (char)(0xF0 |  (cp >> 18));
+    utf8[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
+    utf8[2] = (char)(0x80 | ((cp >>  6) & 0x3F));
+    utf8[3] = (char)(0x80 | ( cp        & 0x3F));
+    len = 4;
+  }
+  str = mrb_str_new(mrb, utf8, len);
+  mrb_str_ptr(str)->flags |= ascii_flag;  /* no-op unless cp < 0x80 */
+  return str;
+}
+#endif
+
 static mrb_value
 mrb_str_getbyte(mrb_state *mrb, mrb_value str)
 {
@@ -125,8 +210,6 @@ mrb_str_swapcase(mrb_state *mrb, mrb_value self)
  return str;
 }

-static mrb_value mrb_int_chr(mrb_state *mrb, mrb_value num);
-
 /*
 *  call-seq:
 *     str << integer       -> str
@@ -136,7 +219,8 @@ static mrb_value mrb_int_chr(mrb_state *mrb, mrb_value num);
 *
 *  Append---Concatenates the given object to <i>str</i>. If the object is a
 *  <code>Integer</code>, it is considered as a codepoint, and is converted
- *  to a character before concatenation.
+ *  to a character before concatenation
+ *  (equivalent to <code>str.concat(integer.chr(__ENCODING__))</code>).
 *
 *     a = "hello "
 *     a << "world"   #=> "hello world"
@@ -148,8 +232,12 @@ mrb_str_concat_m(mrb_state *mrb, mrb_value self)
  mrb_value str;

  mrb_get_args(mrb, "o", &str);
-  if (mrb_fixnum_p(str))
-    str = mrb_int_chr(mrb, str);
+  if (mrb_fixnum_p(str) || mrb_float_p(str))
+#ifdef MRB_UTF8_STRING
+    str = int_chr_utf8(mrb, str);
+#else
+    str = int_chr_binary(mrb, str);
+#endif
  else
    str = mrb_ensure_string_type(mrb, str);
  mrb_str_concat(mrb, self, str);
@@ -800,7 +888,7 @@ mrb_str_count(mrb_state *mrb, mrb_value str)
  tr_parse_pattern(mrb, &pat, v_pat, TRUE);
  tr_compile_pattern(&pat, v_pat, bitmap);
  tr_free_pattern(mrb, &pat);
-  
+
  s = RSTRING_PTR(str);
  len = RSTRING_LEN(str);
  for (i = 0; i < len; i++) {
@@ -836,49 +924,40 @@ mrb_str_chr(mrb_state *mrb, mrb_value self)
  return mrb_str_substr(mrb, self, 0, 1);
 }

+/*
+ *  call-seq:
+ *     int.chr([encoding])  ->  string
+ *
+ *  Returns a string containing the character represented by the +int+'s value
+ *  according to +encoding+. +"ASCII-8BIT"+ (+"BINARY"+) and +"UTF-8"+ (only
+ *  with +MRB_UTF8_STRING+) can be specified as +encoding+ (default is
+ *  +"ASCII-8BIT"+).
+ *
+ *     65.chr                  #=> "A"
+ *     230.chr                 #=> "\xE6"
+ *     230.chr("ASCII-8BIT")   #=> "\xE6"
+ *     230.chr("UTF-8")        #=> "\u00E6"
+ */
 static mrb_value
 mrb_int_chr(mrb_state *mrb, mrb_value num)
 {
-  mrb_int cp = mrb_fixnum(num);
-#ifdef MRB_UTF8_STRING
-  char utf8[4];
-  mrb_int len;
-
-  if (cp < 0 || 0x10FFFF < cp) {
-    mrb_raisef(mrb, E_RANGE_ERROR, "%S out of char range", num);
-  }
-  if (cp < 0x80) {
-    utf8[0] = (char)cp;
-    len = 1;
+  mrb_value enc;
+  mrb_bool enc_given;
+
+  mrb_get_args(mrb, "|S?", &enc, &enc_given);  /* one optional String argument; enc_given tells whether it was passed */
+  if (!enc_given ||  /* no argument selects the binary conversion */
+      ENC_COMP_P(enc, ENC_ASCII_8BIT) ||
+      ENC_COMP_P(enc, ENC_BINARY)) {
+    return int_chr_binary(mrb, num);
   }
-  else if (cp < 0x800) {
-    utf8[0] = (char)(0xC0 | (cp >> 6));
-    utf8[1] = (char)(0x80 | (cp & 0x3F));
-    len = 2;
-  }
-  else if (cp < 0x10000) {
-    utf8[0] = (char)(0xE0 |  (cp >> 12));
-    utf8[1] = (char)(0x80 | ((cp >>  6) & 0x3F));
-    utf8[2] = (char)(0x80 | ( cp        & 0x3F));
-    len = 3;
+#ifdef MRB_UTF8_STRING
+  else if (ENC_COMP_P(enc, ENC_UTF8)) {  /* branch exists only in MRB_UTF8_STRING builds */
+    return int_chr_utf8(mrb, num);
   }
+#endif
   else {
-    utf8[0] = (char)(0xF0 |  (cp >> 18));
-    utf8[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
-    utf8[2] = (char)(0x80 | ((cp >>  6) & 0x3F));
-    utf8[3] = (char)(0x80 | ( cp        & 0x3F));
-    len = 4;
-  }
-  return mrb_str_new(mrb, utf8, len);
-#else
-  char c;
-
-  if (cp < 0 || 0xff < cp) {
-    mrb_raisef(mrb, E_RANGE_ERROR, "%S out of char range", num);
+    mrb_raisef(mrb, E_ARGUMENT_ERROR, "unknown encoding name - %S", enc);  /* also reached for "UTF-8" when MRB_UTF8_STRING is not defined */
   }
-  c = (char)cp;
-  return mrb_str_new(mrb, &c, 1);
-#endif
 }

 /*
@@ -1219,7 +1298,8 @@ mrb_mruby_string_ext_gem_init(mrb_state* mrb)
  mrb_define_method(mrb, s, "delete_suffix",   mrb_str_del_suffix,      MRB_ARGS_REQ(1));

  mrb_define_method(mrb, s, "__lines",         mrb_str_lines,           MRB_ARGS_NONE());
-  mrb_define_method(mrb, mrb_module_get(mrb, "Integral"), "chr", mrb_int_chr, MRB_ARGS_NONE());
+
+  mrb_define_method(mrb, mrb_module_get(mrb, "Integral"), "chr", mrb_int_chr, MRB_ARGS_OPT(1));
 }

 void

--- a/mrbgems/mruby-string-ext/test/numeric.rb
+++ b/mrbgems/mruby-string-ext/test/numeric.rb
+# coding: utf-8
+
 assert('Integer#chr') do
  assert_equal("A", 65.chr)
  assert_equal("B", 0x42.chr)
+  assert_equal("\xab", 171.chr)
  assert_raise(RangeError) { -1.chr }
+  assert_raise(RangeError) { 256.chr }
+
+  assert_equal("A", 65.chr("ASCII-8BIT"))
+  assert_equal("B", 0x42.chr("BINARY"))
+  assert_equal("\xab", 171.chr("ascii-8bit"))
+  assert_raise(RangeError) { -1.chr("binary") }
+  assert_raise(RangeError) { 256.chr("Ascii-8bit") }
+  assert_raise(ArgumentError) { 65.chr("ASCII") }
+  assert_raise(ArgumentError) { 65.chr("ASCII-8BIT", 2) }
+  assert_raise(TypeError) { 65.chr(:BINARY) }
+
+  if __ENCODING__ == "ASCII-8BIT"
+    assert_raise(ArgumentError) { 65.chr("UTF-8") }
+  else
+    assert_equal("A", 65.chr("UTF-8"))
+    assert_equal("B", 0x42.chr("UTF-8"))
+    assert_equal("«", 171.chr("utf-8"))
+    assert_equal("あ", 12354.chr("Utf-8"))
+    assert_raise(RangeError) { -1.chr("utf-8") }
+    assert_raise(RangeError) { 0x110000.chr.chr("UTF-8") }
+  end
 end
--- a/mrbgems/mruby-string-ext/test/string.rb
+++ b/mrbgems/mruby-string-ext/test/string.rb
@@ -167,8 +167,15 @@ end
 assert('String#concat') do  # checks appending Strings and Integers with << and concat
  assert_equal "Hello World!", "Hello " << "World" << 33
  assert_equal "Hello World!", "Hello ".concat("World").concat(33)
-
  assert_raise(TypeError) { "".concat(Object.new) }
+
+  if UTF8STRING  # constant defined outside this hunk; presumably true for MRB_UTF8_STRING builds -- confirm
+    assert_equal "H«", "H" << 0xab
+    assert_equal "Hは", "H" << 12399
+  else
+    assert_equal "H\xab", "H" << 0xab  # Integer appended as one raw byte
+    assert_raise(RangeError) { "H" << 12399 }  # 12399 does not fit in one byte
+  end
 end

 assert('String#casecmp') do