UTF-8 string support in core

Define MRB_UTF8_STRING (in mrbconf.h) to enable UTF-8 support.
parent 101ec5eb
...@@ -26,6 +26,9 @@ ...@@ -26,6 +26,9 @@
/* represent mrb_value as a word (natural unit of data for the processor) */ /* represent mrb_value as a word (natural unit of data for the processor) */
//#define MRB_WORD_BOXING //#define MRB_WORD_BOXING
/* string class to handle UTF-8 encoding */
//#define MRB_UTF8_STRING
/* argv max size in mrb_funcall */ /* argv max size in mrb_funcall */
//#define MRB_FUNCALL_ARGC_MAX 16 //#define MRB_FUNCALL_ARGC_MAX 16
......
...@@ -310,4 +310,30 @@ class String ...@@ -310,4 +310,30 @@ class String
return self if excl && str == other_str return self if excl && str == other_str
end end
end end
def chars(&block)
if block_given?
self.split('').map do |i|
block.call(i)
end
self
else
self.split('')
end
end
alias each_char chars
def codepoints(&block)
len = self.size
if block_given?
self.split('').map do|x|
block.call(x.ord)
end
self
else
self.split('').map{|x| x.ord}
end
end
alias each_codepoint codepoints
end end
...@@ -245,6 +245,51 @@ mrb_str_chr(mrb_state *mrb, mrb_value self) ...@@ -245,6 +245,51 @@ mrb_str_chr(mrb_state *mrb, mrb_value self)
return mrb_str_substr(mrb, self, 0, 1); return mrb_str_substr(mrb, self, 0, 1);
} }
static mrb_value
mrb_fixnum_chr(mrb_state *mrb, mrb_value num)
{
mrb_int cp = mrb_fixnum(num);
#ifdef MRB_UTF8_STRING
char utf8[4];
mrb_int len;
if (cp < 0 || 0x10FFFF < cp) {
mrb_raisef(mrb, E_RANGE_ERROR, "%S out of char range", num);
}
if (cp < 0x80) {
utf8[0] = (char)cp;
len = 1;
}
else if (cp < 0x800) {
utf8[0] = (char)(0xC0 | (cp >> 6));
utf8[1] = (char)(0x80 | (cp & 0x3F));
len = 2;
}
else if (cp < 0x10000) {
utf8[0] = (char)(0xE0 | (cp >> 12));
utf8[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
utf8[2] = (char)(0x80 | ( cp & 0x3F));
len = 3;
}
else {
utf8[0] = (char)(0xF0 | (cp >> 18));
utf8[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
utf8[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
utf8[3] = (char)(0x80 | ( cp & 0x3F));
len = 4;
}
return mrb_str_new(mrb, utf8, len);
#else
char c;
if (cp < 0 || 0xff < cp) {
mrb_raisef(mrb, E_RANGE_ERROR, "%S out of char range", num);
}
c = (char)cp;
return mrb_str_new(mrb, &c, 1);
#endif
}
/* /*
* call-seq: * call-seq:
* string.lines -> array of string * string.lines -> array of string
...@@ -422,6 +467,72 @@ mrb_str_prepend(mrb_state *mrb, mrb_value self) ...@@ -422,6 +467,72 @@ mrb_str_prepend(mrb_state *mrb, mrb_value self)
return self; return self;
} }
#ifdef MRB_UTF8_STRING
/*
 * Sequence length of a UTF-8 character, indexed by its first byte.
 * Each row below covers 32 byte values:
 *   0x00-0x7F -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xF4 -> 4.
 * Bytes that cannot start a sequence here (0x80-0xBF and 0xF5-0xFF)
 * map to 0.
 */
static const char utf8len_codepage_zero[256] =
{
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,
};
/*
 * Decodes the UTF-8 sequence starting at p and returns its code point.
 *
 * ASCII bytes are returned as-is.  For any other byte the expected
 * sequence length is taken from utf8len_codepage_zero, and every
 * following byte must be a continuation byte (10xxxxxx).  When the
 * first byte cannot start a sequence, or a continuation byte is
 * missing, the value of the first byte is returned unchanged.
 *
 * The table never yields a length above 4, so the former 5- and 6-byte
 * branches could not be reached and have been dropped.
 *
 * p[k] is inspected only after p[1..k-1] were found to be continuation
 * bytes (and therefore non-zero); assumes the buffer is NUL-terminated
 * so that this never reads past its end.
 */
static mrb_int
utf8code(unsigned char* p)
{
  mrb_int len;

  if (p[0] < 0x80)
    return p[0];

  len = utf8len_codepage_zero[p[0]];
  if (len < 2 || (p[1] & 0xc0) != 0x80)
    return p[0];
  if (len == 2)
    return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);

  if ((p[2] & 0xc0) != 0x80)
    return p[0];
  if (len == 3)
    return ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
      + (p[2] & 0x3f);

  if (len == 4 && (p[3] & 0xc0) == 0x80)
    return ((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
      + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f);

  return p[0];
}
/*
 * String#ord (UTF-8 build): code point of the first character.
 * Raises ArgumentError for an empty string.
 */
static mrb_value
mrb_str_ord(mrb_state* mrb, mrb_value str)
{
  unsigned char *head;

  if (RSTRING_LEN(str) == 0) {
    mrb_raise(mrb, E_ARGUMENT_ERROR, "empty string");
  }
  head = (unsigned char*) RSTRING_PTR(str);
  return mrb_fixnum_value(utf8code(head));
}
#else
/*
 * String#ord (byte-string build): value of the first byte, 0..255.
 * Raises ArgumentError for an empty string.
 */
static mrb_value
mrb_str_ord(mrb_state* mrb, mrb_value str)
{
  if (RSTRING_LEN(str) == 0)
    mrb_raise(mrb, E_ARGUMENT_ERROR, "empty string");
  /* read the byte as unsigned: where plain char is signed, bytes
     >= 0x80 would otherwise come out as negative numbers */
  return mrb_fixnum_value((unsigned char)RSTRING_PTR(str)[0]);
}
#endif
void void
mrb_mruby_string_ext_gem_init(mrb_state* mrb) mrb_mruby_string_ext_gem_init(mrb_state* mrb)
{ {
...@@ -446,6 +557,9 @@ mrb_mruby_string_ext_gem_init(mrb_state* mrb) ...@@ -446,6 +557,9 @@ mrb_mruby_string_ext_gem_init(mrb_state* mrb)
mrb_define_method(mrb, s, "prepend", mrb_str_prepend, MRB_ARGS_REQ(1)); mrb_define_method(mrb, s, "prepend", mrb_str_prepend, MRB_ARGS_REQ(1));
mrb_alias_method(mrb, s, mrb_intern_lit(mrb, "next"), mrb_intern_lit(mrb, "succ")); mrb_alias_method(mrb, s, mrb_intern_lit(mrb, "next"), mrb_intern_lit(mrb, "succ"));
mrb_alias_method(mrb, s, mrb_intern_lit(mrb, "next!"), mrb_intern_lit(mrb, "succ!")); mrb_alias_method(mrb, s, mrb_intern_lit(mrb, "next!"), mrb_intern_lit(mrb, "succ!"));
mrb_define_method(mrb, s, "ord", mrb_str_ord, MRB_ARGS_NONE());
mrb_define_method(mrb, mrb->fixnum_class, "chr", mrb_fixnum_chr, MRB_ARGS_NONE());
} }
void void
......
## ##
# String(Ext) Test # String(Ext) Test
UTF8STRING = ("\343\201\202".size == 1)
assert('String#getbyte') do assert('String#getbyte') do
str1 = "hello" str1 = "hello"
bytes1 = [104, 101, 108, 108, 111] bytes1 = [104, 101, 108, 108, 111]
...@@ -180,6 +182,8 @@ end ...@@ -180,6 +182,8 @@ end
assert('String#chr') do assert('String#chr') do
assert_equal "a", "abcde".chr assert_equal "a", "abcde".chr
# test Fixnum#chr as well
assert_equal "a", 97.chr
end end
assert('String#lines') do assert('String#lines') do
...@@ -374,8 +378,8 @@ assert('String#succ') do ...@@ -374,8 +378,8 @@ assert('String#succ') do
assert_equal "-b-", a assert_equal "-b-", a
a = "-z-"; a.succ! a = "-z-"; a.succ!
assert_equal "-aa-", a assert_equal "-aa-", a
a = "あa"; a.succ! a = "あb"; a.succ!
assert_equal "あb", a assert_equal "あc", a
a = "あaz"; a.succ! a = "あaz"; a.succ!
assert_equal "あba", a assert_equal "あba", a
end end
...@@ -471,3 +475,96 @@ assert('String#upto') do ...@@ -471,3 +475,96 @@ assert('String#upto') do
}) })
assert_equal(2, count) assert_equal(2, count)
end end
# String#ord returns the byte value of each ASCII character.
assert('String#ord') do
  expected = [104, 101, 108, 108, 111, 33]
  actual = "hello!".split('').map { |ch| ch.ord }
  assert_equal expected, actual
end

# With UTF-8 strings, String#ord returns the code point.
assert('String#ord(UTF-8)') do
  expected = [0x3053,0x3093,0x306b,0x3061,0x306f,0x4e16,0x754c,0x21]
  actual = "こんにちは世界!".split('').map { |ch| ch.ord }
  assert_equal expected, actual
end if UTF8STRING

assert('String#chr') do
  assert_equal "h", "hello!".chr
end

assert('String#chr(UTF-8)') do
  assert_equal "こ", "こんにちは世界!".chr
end if UTF8STRING

# chars: array form and block form.
assert('String#chars') do
  assert_equal ["h", "e", "l", "l", "o", "!"], "hello!".chars
  joined = ""
  "hello!".chars { |ch| joined += ch }
  assert_equal "hello!", joined
end

assert('String#chars(UTF-8)') do
  assert_equal ['こ', 'ん', 'に', 'ち', 'は', '世', '界', '!'], "こんにちは世界!".chars
  joined = ""
  "こんにちは世界!".chars { |ch| joined += ch }
  assert_equal "こんにちは世界!", joined
end if UTF8STRING

assert('String#each_char') do
  joined = ""
  "hello!".each_char { |ch| joined += ch }
  assert_equal "hello!", joined
end

assert('String#each_char(UTF-8)') do
  joined = ""
  "こんにちは世界!".each_char { |ch| joined += ch }
  assert_equal "こんにちは世界!", joined
end if UTF8STRING

# codepoints: array form and block form.
assert('String#codepoints') do
  expected = [104, 101, 108, 108, 111, 33]
  assert_equal expected, "hello!".codepoints
  collected = []
  "hello!".codepoints { |code| collected << code }
  assert_equal expected, collected
end

assert('String#codepoints(UTF-8)') do
  expected = [12371, 12435, 12395, 12385, 12399, 19990, 30028, 33]
  assert_equal expected, "こんにちは世界!".codepoints
  collected = []
  "こんにちは世界!".codepoints { |code| collected << code }
  assert_equal expected, collected
end if UTF8STRING

assert('String#each_codepoint') do
  expected = [104, 101, 108, 108, 111, 33]
  collected = []
  "hello!".each_codepoint { |code| collected << code }
  assert_equal expected, collected
end

assert('String#each_codepoint(UTF-8)') do
  expected = [12371, 12435, 12395, 12385, 12399, 19990, 30028, 33]
  collected = []
  "こんにちは世界!".each_codepoint { |code| collected << code }
  assert_equal expected, collected
end if UTF8STRING
# Gem specification for mruby-string-utf8; it depends on the core gem
# mruby-string-ext.
MRuby::Gem::Specification.new('mruby-string-utf8') do |gem|
  gem.summary = 'UTF-8 support in String class'
  gem.author  = 'mruby developers'
  gem.license = 'MIT'
  gem.add_dependency('mruby-string-ext', :core => 'mruby-string-ext')
end
#include "mruby.h"
#include "mruby/array.h"
#include "mruby/class.h"
#include "mruby/string.h"
#include "mruby/range.h"
#include "mruby/numeric.h"
#include "mruby/re.h"
#include <string.h>
/*
 * Number of bytes to advance for a UTF-8 character, indexed by its
 * first byte.  Each row below covers 32 byte values:
 *   0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xF4 -> 4, everything else -> 1.
 * Unlike utf8len_codepage_zero, bytes that cannot start a sequence
 * (0x80-0xBF, 0xF5-0xFF) map to 1 rather than 0.
 */
static const char utf8len_codepage[256] =
{
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1,
};
/*
 * Sequence length of a UTF-8 character, indexed by its first byte.
 * Each row below covers 32 byte values:
 *   0x00-0x7F -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xF4 -> 4.
 * Bytes that cannot start a sequence here (0x80-0xBF and 0xF5-0xFF)
 * map to 0.
 */
static const char utf8len_codepage_zero[256] =
{
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,
};
/*
 * Decodes the UTF-8 sequence starting at p and returns its code point.
 *
 * ASCII bytes are returned as-is.  For any other byte the expected
 * sequence length is taken from utf8len_codepage_zero, and every
 * following byte must be a continuation byte (10xxxxxx).  When the
 * first byte cannot start a sequence, or a continuation byte is
 * missing, the value of the first byte is returned unchanged.
 *
 * The table never yields a length above 4, so the former 5- and 6-byte
 * branches could not be reached and have been dropped.
 *
 * p[k] is inspected only after p[1..k-1] were found to be continuation
 * bytes (and therefore non-zero); assumes the buffer is NUL-terminated
 * so that this never reads past its end.
 */
static mrb_int
utf8code(unsigned char* p)
{
  mrb_int len;

  if (p[0] < 0x80)
    return p[0];

  len = utf8len_codepage_zero[p[0]];
  if (len < 2 || (p[1] & 0xc0) != 0x80)
    return p[0];
  if (len == 2)
    return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);

  if ((p[2] & 0xc0) != 0x80)
    return p[0];
  if (len == 3)
    return ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
      + (p[2] & 0x3f);

  if (len == 4 && (p[3] & 0xc0) == 0x80)
    return ((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
      + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f);

  return p[0];
}
static mrb_value mrb_fixnum_chr(mrb_state*, mrb_value);
/*
 * Returns how many bytes the character starting at p occupies.
 * The length announced by the first byte (utf8len_codepage) is only
 * accepted when all following bytes are continuation bytes (10xxxxxx);
 * otherwise the byte counts as a single one-byte character.
 */
static mrb_int
utf8len(unsigned char* p)
{
  const mrb_int expected = (*p == 0) ? 1 : utf8len_codepage[*p];
  mrb_int k = 1;

  while (k < expected) {
    if ((p[k] & 0xc0) != 0x80) {
      return 1;
    }
    k++;
  }
  return expected;
}
/*
 * Counts the characters in the first len bytes of str, stepping with
 * utf8len.  A negative len means the whole string.
 */
static mrb_int
mrb_utf8_strlen(mrb_value str, mrb_int len)
{
  unsigned char *cur = (unsigned char*) RSTRING_PTR(str);
  unsigned char *stop = cur;
  mrb_int count;

  if (len < 0) {
    stop += RSTRING_LEN(str);
  }
  else {
    stop += len;
  }
  for (count = 0; cur < stop; count++) {
    cur += utf8len(cur);
  }
  return count;
}
/* String#size / String#length: number of characters, not bytes. */
static mrb_value
mrb_str_size(mrb_state *mrb, mrb_value str)
{
  mrb_int nchars = mrb_utf8_strlen(str, -1);

  return mrb_fixnum_value(nchars);
}
#define RSTRING_LEN_UTF8(s) mrb_utf8_strlen(s, -1)
/*
 * Searches the m-byte pattern xs in the n-byte buffer ys using a
 * per-byte shift table.  Returns the byte offset of the first match,
 * or -1 when there is none.
 *
 * NOTE(review): the shift lookup reads ys[pos + m], which is ys[n]
 * when the window sits at the very end -- callers must make that byte
 * readable (presumably the string's NUL terminator; confirm).
 */
static inline mrb_int
mrb_memsearch_qs(const unsigned char *xs, mrb_int m, const unsigned char *ys, mrb_int n)
{
  const unsigned char *win = ys;
  int shift[256];
  int c;
  mrb_int k;

  /* Preprocessing: default shift skips the whole window plus one byte;
     bytes that occur in the pattern shift by their distance from its
     end (the last occurrence wins) */
  for (c = 0; c < 256; ++c) {
    shift[c] = m + 1;
  }
  for (k = 0; k < m; ++k) {
    shift[xs[k]] = m - k;
  }

  /* Searching */
  while (win + m <= ys + n) {
    if (*xs == *win && memcmp(xs, win, m) == 0) {
      return win - ys;
    }
    win += shift[win[m]];
  }
  return -1;
}
/*
 * Finds the m-byte pattern x0 inside the n-byte buffer y0.
 * Returns the byte offset of the first match or -1.  Equal lengths,
 * empty patterns and one-byte patterns are handled directly; anything
 * longer is delegated to mrb_memsearch_qs.
 */
static mrb_int
mrb_memsearch(const void *x0, mrb_int m, const void *y0, mrb_int n)
{
  const unsigned char *needle = (const unsigned char *)x0;
  const unsigned char *hay = (const unsigned char *)y0;

  if (m > n) {
    return -1;
  }
  if (m == n) {
    return memcmp(x0, y0, m) == 0 ? 0 : -1;
  }
  if (m < 1) {
    return 0;
  }
  if (m == 1) {
    mrb_int at;

    for (at = 0; at < n; ++at) {
      if (hay[at] == *needle) {
        return at;
      }
    }
    return -1;
  }
  return mrb_memsearch_qs(needle, m, hay, n);
}
/*
 * Returns a new string made of len characters of str starting at
 * character index beg.  Both values are counted in characters; walking
 * stops at the end of the string, so out-of-range requests yield a
 * shorter (possibly empty) result.
 */
static mrb_value
str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
{
  unsigned char *head = (unsigned char*) RSTRING_PTR(str);
  unsigned char *limit = head + RSTRING_LEN(str);
  unsigned char *tail;
  mrb_int left;

  /* skip the first beg characters */
  for (left = beg; left > 0 && head < limit; left--) {
    head += utf8len(head);
  }
  /* then span len characters */
  tail = head;
  for (left = len; left > 0 && tail < limit; left--) {
    tail += utf8len(tail);
  }
  return mrb_str_new(mrb, (const char*)head, (size_t)(tail - head));
}
/*
 * Returns len characters of str starting at character index beg, or
 * nil when the request is out of range.
 *
 * A negative beg counts from the end of the string.  nil is returned
 * for a negative len, for beg past the end, and for a negative beg
 * that reaches before the start of the string.
 *
 * Fixes: a negative beg used to be wrapped twice (once in an else-if
 * and once more below it), so an index below -length wrapped around to
 * a valid position instead of producing nil.  The length clamp is also
 * written so that beg + len cannot overflow.
 */
static mrb_value
str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
{
  mrb_int len8 = RSTRING_LEN_UTF8(str);

  if (len < 0) return mrb_nil_value();
  if (len8 == 0) {
    len = 0;
  }
  if (beg > len8) return mrb_nil_value();
  if (beg < 0) {
    beg += len8;
    if (beg < 0) return mrb_nil_value();
  }
  /* here 0 <= beg <= len8, so len8 - beg is non-negative */
  if (len > len8 - beg)
    len = len8 - beg;
  return str_subseq(mrb, str, beg, len);
}
/*
 * Byte-wise search for sub in str starting at byte offset `offset`
 * (negative offsets count from the end).  Returns the byte offset of
 * the match relative to the start of str, or -1 if there is none.
 */
static mrb_int
str_index(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int offset)
{
  const mrb_int hay_len = RSTRING_LEN(str);
  const mrb_int needle_len = RSTRING_LEN(sub);
  mrb_int hit;

  if (offset < 0) {
    offset += hay_len;
    if (offset < 0) return -1;
  }
  if (hay_len - offset < needle_len) return -1;
  /* an empty pattern matches right at the starting offset */
  if (needle_len == 0) return offset;

  hit = mrb_memsearch(RSTRING_PTR(sub), needle_len,
                      RSTRING_PTR(str) + offset, hay_len - offset);
  return hit < 0 ? hit : hit + offset;
}
/*
 * Byte-wise backward search for sub in str, starting at byte offset
 * pos and moving towards the beginning.  Returns the byte offset of
 * the match, or -1.  For an empty sub the (clamped) pos is returned.
 */
static mrb_int
str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos)
{
  struct RString *hay = mrb_str_ptr(str);
  const mrb_int sub_len = RSTRING_LEN(sub);
  char *base, *cur, *needle;

  /* substring longer than string */
  if (RSTR_LEN(hay) < sub_len) return -1;
  /* never start so late that sub would run past the end */
  if (RSTR_LEN(hay) - pos < sub_len) {
    pos = RSTR_LEN(hay) - sub_len;
  }
  if (sub_len == 0) {
    return pos;
  }

  base = RSTR_PTR(hay);
  needle = RSTRING_PTR(sub);
  for (cur = base + pos; base <= cur; cur--) {
    if (memcmp(cur, needle, sub_len) == 0) {
      return cur - base;
    }
  }
  return -1;
}
/*
 * Implements String#[] for a single index argument, dispatching on the
 * type of indx:
 *   Float   - converted with mrb_flo_to_fixnum, then handled as Fixnum
 *   Fixnum  - one-character substring via str_substr; nil when that
 *             result is nil or empty
 *   String  - a duplicate of indx when str_index finds it in str,
 *             nil otherwise
 *   Range   - substring chosen by mrb_range_beg_len against the
 *             character count of str, nil when that call reports failure
 *   other   - the value is read with mrb_fixnum() without a type check
 *             and handled like a Fixnum
 */
static mrb_value
mrb_str_aref(mrb_state *mrb, mrb_value str, mrb_value indx)
{
mrb_int idx;
mrb_regexp_check(mrb, indx);
switch (mrb_type(indx)) {
case MRB_TT_FLOAT:
indx = mrb_flo_to_fixnum(mrb, indx);
/* fall through */
case MRB_TT_FIXNUM:
idx = mrb_fixnum(indx);
/* shared tail: the default case jumps here once idx is set */
num_index:
str = str_substr(mrb, str, idx, 1);
/* an empty (but non-nil) result also means "no such character" */
if (!mrb_nil_p(str) && RSTRING_LEN(str) == 0) return mrb_nil_value();
return str;
case MRB_TT_STRING:
if (str_index(mrb, str, indx, 0) != -1)
return mrb_str_dup(mrb, indx);
return mrb_nil_value();
case MRB_TT_RANGE:
/* check if indx is Range */
{
mrb_int beg, len;
mrb_value tmp;
/* the range is resolved against the character count, not bytes */
len = RSTRING_LEN_UTF8(str);
if (mrb_range_beg_len(mrb, indx, &beg, &len, len)) {
tmp = str_subseq(mrb, str, beg, len);
return tmp;
}
else {
return mrb_nil_value();
}
}
default:
idx = mrb_fixnum(indx);
goto num_index;
}
return mrb_nil_value(); /* not reached */
}
/*
 * String#[] / String#slice entry point.
 * With two arguments they are read as (start, length) fixnums and
 * passed to str_substr; with one argument mrb_str_aref decides what
 * to do based on its type.
 */
static mrb_value
mrb_str_aref_m(mrb_state *mrb, mrb_value str)
{
  mrb_value first, second;
  int nargs = mrb_get_args(mrb, "o|o", &first, &second);

  switch (nargs) {
  case 2:
    mrb_regexp_check(mrb, first);
    return str_substr(mrb, str, mrb_fixnum(first), mrb_fixnum(second));
  case 1:
    break;
  default:
    mrb_raisef(mrb, E_ARGUMENT_ERROR, "wrong number of arguments (%S for 1)", mrb_fixnum_value(nargs));
    break;
  }
  return mrb_str_aref(mrb, str, first);
}
/*
 * String#index(sub [, pos]): character index of the first occurrence
 * of sub at or after character position pos, or nil.
 *
 * sub may be a String, a Fixnum (converted with mrb_fixnum_chr) or
 * anything mrb_check_string_type can convert; other values raise
 * TypeError.  A negative pos counts from the end of the string.
 *
 * Fix: pos is a character index, but it used to be handed to the
 * byte-oriented str_index unchanged (and a negative pos was wrapped
 * using the byte length).  With multibyte text this started the search
 * at the wrong place, possibly in the middle of a character.  pos is
 * now translated to a byte offset first.
 */
static mrb_value
mrb_str_index_m(mrb_state *mrb, mrb_value str)
{
  mrb_value *argv;
  mrb_int argc;
  mrb_value sub;
  mrb_int pos;

  mrb_get_args(mrb, "*", &argv, &argc);
  if (argc == 2) {
    pos = mrb_fixnum(argv[1]);
    sub = argv[0];
  }
  else {
    pos = 0;
    if (argc > 0)
      sub = argv[0];
    else
      sub = mrb_nil_value();
  }
  mrb_regexp_check(mrb, sub);

  if (pos < 0) {
    pos += RSTRING_LEN_UTF8(str);
    if (pos < 0) {
      return mrb_nil_value();
    }
  }
  if (pos > 0) {
    /* character index -> byte offset */
    unsigned char *base = (unsigned char*) RSTRING_PTR(str);
    unsigned char *p = base;
    unsigned char *e = base + RSTRING_LEN(str);

    while (pos > 0 && p < e) {
      p += utf8len(p);
      pos--;
    }
    /* characters left over mean pos lies beyond the end; keeping them
       makes the byte offset exceed the byte length, so str_index
       reports "not found" */
    pos += (mrb_int)(p - base);
  }

  if (mrb_type(sub) == MRB_TT_FIXNUM) {
    sub = mrb_fixnum_chr(mrb, sub);
  }
  if (mrb_type(sub) != MRB_TT_STRING) {
    mrb_value tmp;

    tmp = mrb_check_string_type(mrb, sub);
    if (mrb_nil_p(tmp)) {
      mrb_raisef(mrb, E_TYPE_ERROR, "type mismatch: %S given", sub);
    }
    sub = tmp;
  }
  pos = str_index(mrb, str, sub, pos);

  if (pos == -1) return mrb_nil_value();
  /* byte offset of the match -> character index */
  return mrb_fixnum_value(mrb_utf8_strlen(str, pos));
}
/*
 * String#reverse!: reverses the receiver in place, character by
 * character (each character's bytes keep their order).  Returns the
 * receiver.  Strings of at most one character are left untouched.
 *
 * Fix: the scratch copy is now NUL-terminated.  utf8len() inspects the
 * bytes after a lead byte, so with an unterminated copy a truncated
 * multibyte sequence at the end of the string made it read past the
 * buffer; if the stray bytes looked like continuation bytes, the
 * resulting length moved the write pointer before the start of the
 * string.  The character length is additionally clamped to the bytes
 * that remain.
 */
static mrb_value
mrb_str_reverse_bang(mrb_state *mrb, mrb_value str)
{
  mrb_int utf8_len = mrb_utf8_strlen(str, -1);

  if (utf8_len > 1) {
    mrb_int len;
    char *buf;
    unsigned char *p, *e, *r;

    mrb_str_modify(mrb, mrb_str_ptr(str));
    len = RSTRING_LEN(str);
    buf = (char *)mrb_malloc(mrb, (size_t)len + 1);
    memcpy(buf, RSTRING_PTR(str), len);
    buf[len] = '\0';  /* sentinel for utf8len() */

    p = (unsigned char*)buf;
    e = (unsigned char*)buf + len;
    /* read characters front to back, write them back to front */
    r = (unsigned char*)RSTRING_PTR(str) + len;
    while (p<e) {
      mrb_int clen = utf8len(p);

      if (clen > e - p) clen = (mrb_int)(e - p);
      r -= clen;
      memcpy(r, p, clen);
      p += clen;
    }
    mrb_free(mrb, buf);
  }
  return str;
}
/*
 * String#rindex(sub [, pos]): character index of the last occurrence
 * of sub that starts at or before character position pos, or nil.
 * Without pos the search starts at the end of the string.
 *
 * sub may be a String, a Fixnum (converted with mrb_fixnum_chr) or
 * anything mrb_check_string_type can convert; other values raise
 * TypeError.  A negative pos counts from the end of the string.
 *
 * Fix: pos is a character index, but it used to be wrapped and clamped
 * against the byte length and passed to the byte-oriented str_rindex
 * unchanged.  It is now translated to a byte offset first.
 */
static mrb_value
mrb_str_rindex_m(mrb_state *mrb, mrb_value str)
{
  mrb_value *argv;
  mrb_int argc;
  mrb_value sub;
  mrb_int pos, len = RSTRING_LEN(str);

  mrb_get_args(mrb, "*", &argv, &argc);
  if (argc == 2) {
    mrb_int clen = RSTRING_LEN_UTF8(str);

    sub = argv[0];
    pos = mrb_fixnum(argv[1]);
    if (pos < 0) {
      pos += clen;
      if (pos < 0) {
        mrb_regexp_check(mrb, sub);
        return mrb_nil_value();
      }
    }
    if (pos >= clen) {
      /* at or past the last character: start from the very end */
      pos = len;
    }
    else {
      /* character index -> byte offset (pos < clen, so the walk
         stays inside the string) */
      unsigned char *base = (unsigned char*) RSTRING_PTR(str);
      unsigned char *p = base;

      while (pos-- > 0) {
        p += utf8len(p);
      }
      pos = (mrb_int)(p - base);
    }
  }
  else {
    pos = len;
    if (argc > 0)
      sub = argv[0];
    else
      sub = mrb_nil_value();
  }
  mrb_regexp_check(mrb, sub);

  if (mrb_type(sub) == MRB_TT_FIXNUM) {
    sub = mrb_fixnum_chr(mrb, sub);
  }
  if (mrb_type(sub) != MRB_TT_STRING) {
    mrb_value tmp;

    tmp = mrb_check_string_type(mrb, sub);
    if (mrb_nil_p(tmp)) {
      mrb_raisef(mrb, E_TYPE_ERROR, "type mismatch: %S given", sub);
    }
    sub = tmp;
  }
  pos = str_rindex(mrb, str, sub, pos);

  if (pos == -1) return mrb_nil_value();
  /* byte offset of the match -> character index */
  return mrb_fixnum_value(mrb_utf8_strlen(str, pos));
}
/* String#reverse: reverses a duplicate, leaving the receiver untouched. */
static mrb_value
mrb_str_reverse(mrb_state *mrb, mrb_value str)
{
  mrb_value copy = mrb_str_dup(mrb, str);

  return mrb_str_reverse_bang(mrb, copy);
}
/*
 * Fixnum#chr: encodes the receiver, taken as a code point in
 * 0..0x10FFFF, as a UTF-8 string of 1 to 4 bytes.  Values outside
 * that range raise RangeError.
 */
static mrb_value
mrb_fixnum_chr(mrb_state *mrb, mrb_value num)
{
  /* marker bits of the first byte, indexed by encoded length */
  static const unsigned char lead_mark[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
  mrb_int cp = mrb_fixnum(num);
  char utf8[4];
  mrb_int len, i, rest;

  if (cp < 0 || 0x10FFFF < cp) {
    mrb_raisef(mrb, E_RANGE_ERROR, "%S out of char range", num);
  }

  if (cp < 0x80)         len = 1;
  else if (cp < 0x800)   len = 2;
  else if (cp < 0x10000) len = 3;
  else                   len = 4;

  /* fill continuation bytes from the last one backwards, six payload
     bits each; what is left over goes into the lead byte */
  rest = cp;
  for (i = len - 1; i > 0; --i) {
    utf8[i] = (char)(0x80 | (rest & 0x3F));
    rest >>= 6;
  }
  utf8[0] = (char)(lead_mark[len] | rest);

  return mrb_str_new(mrb, utf8, len);
}
/*
 * String#ord: code point of the first character.
 * Raises ArgumentError for an empty string.
 */
static mrb_value
mrb_str_ord(mrb_state* mrb, mrb_value str)
{
  unsigned char *head;

  if (RSTRING_LEN(str) == 0) {
    mrb_raise(mrb, E_ARGUMENT_ERROR, "empty string");
  }
  head = (unsigned char*) RSTRING_PTR(str);
  return mrb_fixnum_value(utf8code(head));
}
/*
 * String#split([pattern [, limit]]).
 *
 * - No pattern, nil, or a single space: "awk" mode; the string is
 *   split on runs of whitespace and leading whitespace is skipped.
 * - Empty string pattern: split into individual characters.
 * - Any other string: split on each occurrence of the pattern.
 * - Non-string patterns are passed to mrb_noregexp.
 *
 * A positive limit caps the number of fields; the remainder of the
 * string becomes the last field.  Without a limit (or with limit 0)
 * trailing empty fields are removed.
 *
 * Fixes: this file's str_subseq() is indexed in characters, but both
 * the awk branch and the empty-pattern branch passed it byte offsets.
 * The awk branch therefore returned wrong fields for multibyte text,
 * and the empty-pattern branch advanced one byte per field, which left
 * a remainder starting in the middle of a character whenever a limit
 * was given.  Fields are now cut directly from the byte range, and the
 * empty-pattern branch advances one whole character at a time.
 */
static mrb_value
mrb_str_split_m(mrb_state *mrb, mrb_value str)
{
  int argc;
  mrb_value spat = mrb_nil_value();
  enum {awk, string, regexp} split_type = string;
  long i = 0, lim_p;
  mrb_int beg;
  mrb_int end;
  mrb_int lim = 0;
  mrb_value result, tmp;

  argc = mrb_get_args(mrb, "|oi", &spat, &lim);
  lim_p = (lim > 0 && argc == 2);
  if (argc == 2) {
    if (lim == 1) {
      if (RSTRING_LEN(str) == 0)
        return mrb_ary_new_capa(mrb, 0);
      return mrb_ary_new_from_values(mrb, 1, &str);
    }
    i = 1;
  }

  if (argc == 0 || mrb_nil_p(spat)) {
    split_type = awk;
  }
  else {
    if (mrb_string_p(spat)) {
      split_type = string;
      if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
        split_type = awk;
      }
    }
    else {
      mrb_noregexp(mrb, str);
    }
  }

  result = mrb_ary_new(mrb);
  beg = 0;
  if (split_type == awk) {
    char *ptr = RSTRING_PTR(str);
    char *eptr = RSTRING_END(str);
    char *bptr = ptr;
    int skip = 1;
    unsigned int c;

    end = beg;
    while (ptr < eptr) {
      int ai = mrb_gc_arena_save(mrb);

      c = (unsigned char)*ptr++;
      if (skip) {
        if (ISSPACE(c)) {
          beg = ptr - bptr;
        }
        else {
          end = ptr - bptr;
          skip = 0;
          if (lim_p && lim <= i) break;
        }
      }
      else if (ISSPACE(c)) {
        /* beg and end are byte offsets: cut the field by bytes */
        mrb_ary_push(mrb, result, mrb_str_new(mrb, bptr + beg, end - beg));
        mrb_gc_arena_restore(mrb, ai);
        skip = 1;
        beg = ptr - bptr;
        if (lim_p) ++i;
      }
      else {
        end = ptr - bptr;
      }
    }
  }
  else if (split_type == string) {
    char *ptr = RSTRING_PTR(str);
    char *temp = ptr;
    char *eptr = RSTRING_END(str);
    mrb_int slen = RSTRING_LEN(spat);

    if (slen == 0) {
      /* empty pattern: one field per character */
      int ai = mrb_gc_arena_save(mrb);

      while (ptr < eptr) {
        mrb_int clen = utf8len((unsigned char*)ptr);

        if (clen > eptr - ptr) clen = (mrb_int)(eptr - ptr);
        mrb_ary_push(mrb, result, mrb_str_new(mrb, ptr, clen));
        mrb_gc_arena_restore(mrb, ai);
        ptr += clen;
        if (lim_p && lim <= ++i) break;
      }
    }
    else {
      char *sptr = RSTRING_PTR(spat);
      int ai = mrb_gc_arena_save(mrb);

      while (ptr < eptr &&
             (end = mrb_memsearch(sptr, slen, ptr, eptr - ptr)) >= 0) {
        mrb_ary_push(mrb, result, mrb_str_new(mrb, ptr, end));
        mrb_gc_arena_restore(mrb, ai);
        ptr += end + slen;
        if (lim_p && lim <= ++i) break;
      }
    }
    beg = ptr - temp;
  }
  else {
    mrb_noregexp(mrb, str);
  }

  /* whatever follows the last separator becomes the final field */
  if (RSTRING_LEN(str) > 0 && (lim_p || RSTRING_LEN(str) > beg || lim < 0)) {
    if (RSTRING_LEN(str) == beg) {
      tmp = mrb_str_new_lit(mrb, "");
    }
    else {
      tmp = mrb_str_new(mrb, RSTRING_PTR(str)+beg, RSTRING_LEN(str)-beg);
    }
    mrb_ary_push(mrb, result, tmp);
  }
  /* drop trailing empty fields when no limit was requested */
  if (!lim_p && lim == 0) {
    mrb_int len;

    while ((len = RARRAY_LEN(result)) > 0 &&
           (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
      mrb_ary_pop(mrb, result);
  }

  return result;
}
/* String#chr: the first character of the string, via str_substr. */
static mrb_value
mrb_str_chr(mrb_state *mrb, mrb_value self)
{
  const mrb_int first = 0;
  const mrb_int count = 1;

  return str_substr(mrb, self, first, count);
}
/*
 * String#chars / String#each_char.
 *
 * Without a block, returns an array holding every character as a
 * string of its own.  With a block, yields each character in order
 * and returns the receiver.
 *
 * Changes: the block path now restores the GC arena after every
 * yield, so the temporary character strings no longer pile up in the
 * arena for the whole iteration; the result array is only allocated
 * when it is actually returned.
 */
static mrb_value
mrb_str_chars(mrb_state *mrb, mrb_value self)
{
  mrb_value result;
  mrb_value blk;
  int ai;
  mrb_int len;
  mrb_value arg;
  char *p = RSTRING_PTR(self);
  char *e = p + RSTRING_LEN(self);

  mrb_get_args(mrb, "&", &blk);

  if (!mrb_nil_p(blk)) {
    ai = mrb_gc_arena_save(mrb);
    while (p < e) {
      len = utf8len((unsigned char*) p);
      arg = mrb_str_new(mrb, p, len);
      mrb_yield_argv(mrb, blk, 1, &arg);
      /* arg is not needed any more once the block has returned */
      mrb_gc_arena_restore(mrb, ai);
      p += len;
    }
    return self;
  }

  result = mrb_ary_new(mrb);
  while (p < e) {
    ai = mrb_gc_arena_save(mrb);
    len = utf8len((unsigned char*) p);
    mrb_ary_push(mrb, result, mrb_str_new(mrb, p, len));
    mrb_gc_arena_restore(mrb, ai);
    p += len;
  }
  return result;
}
/*
 * String#codepoints / String#each_codepoint.
 *
 * Without a block, returns an array with the code point of every
 * character.  With a block, yields each code point in order and
 * returns the receiver.
 *
 * Change: the result array is no longer allocated when a block is
 * given, since it was never used on that path.
 */
static mrb_value
mrb_str_codepoints(mrb_state *mrb, mrb_value self)
{
  mrb_value result;
  mrb_value blk;
  int ai;
  mrb_int len;
  mrb_value arg;
  char *p = RSTRING_PTR(self);
  char *e = p + RSTRING_LEN(self);

  mrb_get_args(mrb, "&", &blk);

  if (!mrb_nil_p(blk)) {
    while (p < e) {
      len = utf8len((unsigned char*) p);
      arg = mrb_fixnum_value(utf8code((unsigned char*) p));
      mrb_yield_argv(mrb, blk, 1, &arg);
      p += len;
    }
    return self;
  }

  result = mrb_ary_new(mrb);
  while (p < e) {
    ai = mrb_gc_arena_save(mrb);
    len = utf8len((unsigned char*) p);
    mrb_ary_push(mrb, result, mrb_fixnum_value(utf8code((unsigned char*) p)));
    mrb_gc_arena_restore(mrb, ai);
    p += len;
  }
  return result;
}
/*
 * Gem initializer: installs the UTF-8 aware String methods on the
 * String class, and Fixnum#chr on the Fixnum class.
 */
void
mrb_mruby_string_utf8_gem_init(mrb_state* mrb)
{
  struct RClass *string = mrb->string_class;

  /* length */
  mrb_define_method(mrb, string, "size", mrb_str_size, MRB_ARGS_NONE());
  mrb_define_method(mrb, string, "length", mrb_str_size, MRB_ARGS_NONE());

  /* searching and slicing */
  mrb_define_method(mrb, string, "index", mrb_str_index_m, MRB_ARGS_ANY());
  mrb_define_method(mrb, string, "rindex", mrb_str_rindex_m, MRB_ARGS_ANY());
  mrb_define_method(mrb, string, "[]", mrb_str_aref_m, MRB_ARGS_ANY());
  mrb_define_method(mrb, string, "slice", mrb_str_aref_m, MRB_ARGS_ANY());
  mrb_define_method(mrb, string, "split", mrb_str_split_m, MRB_ARGS_ANY());

  /* reversing */
  mrb_define_method(mrb, string, "reverse", mrb_str_reverse, MRB_ARGS_NONE());
  mrb_define_method(mrb, string, "reverse!", mrb_str_reverse_bang, MRB_ARGS_NONE());

  /* characters and code points */
  mrb_define_method(mrb, string, "ord", mrb_str_ord, MRB_ARGS_NONE());
  mrb_define_method(mrb, string, "chr", mrb_str_chr, MRB_ARGS_NONE());
  mrb_define_method(mrb, string, "chars", mrb_str_chars, MRB_ARGS_NONE());
  mrb_alias_method(mrb, string, mrb_intern_lit(mrb, "each_char"), mrb_intern_lit(mrb, "chars"));
  mrb_define_method(mrb, string, "codepoints", mrb_str_codepoints, MRB_ARGS_NONE());
  mrb_alias_method(mrb, string, mrb_intern_lit(mrb, "each_codepoint"), mrb_intern_lit(mrb, "codepoints"));

  mrb_define_method(mrb, mrb->fixnum_class, "chr", mrb_fixnum_chr, MRB_ARGS_NONE());
}
/* Gem finalizer: intentionally empty, this function releases nothing. */
void
mrb_mruby_string_utf8_gem_final(mrb_state* mrb)
{
}
# -*- coding: utf-8 -*-
##
# String(utf8) Test
# Indexing works on characters, not bytes.
assert('String#[]') do
  assert_equal "ち", "こんにちは世界"[3]
  assert_equal nil, "こんにちは世界"[20]
  assert_equal "世", "こんにちは世界"[-2]
  assert_equal "世界", "こんにちは世界"[-2..-1]
  assert_equal "んに", "こんにちは世界"[1,2]
  assert_equal "世", "こんにちは世界"["世"]
  assert_equal 'b', 'abc'[1.1]
end

assert('String#reverse', '15.2.10.5.29') do
  a = 'こんにちは世界!'
  # the return value is ignored on purpose: reverse must not modify a
  a.reverse

  assert_equal 'こんにちは世界!', a
  assert_equal '!界世はちにんこ', 'こんにちは世界!'.reverse
end

assert('String#reverse!', '15.2.10.5.30') do
  a = 'こんにちは世界!'
  a.reverse!

  assert_equal '!界世はちにんこ', a
  assert_equal '!界世はちにんこ', 'こんにちは世界!'.reverse!
end

# 5- and 6-byte sequences are not valid UTF-8: every byte counts as
# one character.
assert('Invalid sequence') do
  assert_equal 5, "\xF8\x88\x80\x80\x80".size
  assert_equal 6, "\xFC\x84\x80\x80\x80\x80".size
end

assert('String#size') do
  str = 'こんにちは世界!'
  assert_equal 8, str.size
  assert_not_equal str.bytesize, str.size
  assert_equal 2, str[1, 2].size
end

assert('String#index') do
  str = "こんにちは世界!\nこんにちは世界!"
  assert_nil str.index('さ')
  assert_equal 3, str.index('ち')
  assert_equal 12, str.index('ち', 10)
  assert_equal nil, str.index("さ")
end

assert('String#ord') do
  got = "こんにちは世界!".split('').map {|x| x.ord}
  expect = [0x3053,0x3093,0x306b,0x3061,0x306f,0x4e16,0x754c,0x21]
  assert_equal expect, got
end

assert('String#split') do
  got = "こんにちは世界!".split('')
  assert_equal ['こ', 'ん', 'に', 'ち', 'は', '世', '界', '!'], got
  got = "こんにちは世界!".split('に')
  assert_equal ['こん', 'ちは世界!'], got
end

assert('String#rindex') do
  str = "こんにちは世界!\nこんにちは世界!"
  # was str.index: the not-found case of rindex itself went untested
  assert_nil str.rindex('さ')
  assert_equal 12, str.rindex('ち')
  assert_equal 3, str.rindex('ち', 10)
end

assert('String#chr(utf-8)') do
  assert_equal "こ", "こんにちは世界!".chr
end

assert('String#chars') do
  expect = ['こ', 'ん', 'に', 'ち', 'は', '世', '界', '!']
  assert_equal expect, "こんにちは世界!".chars
  s = ""
  "こんにちは世界!".chars do |x|
    s += x
  end
  assert_equal "こんにちは世界!", s
end

assert('String#each_char') do
  s = ""
  "こんにちは世界!".each_char do |x|
    s += x
  end
  assert_equal "こんにちは世界!", s
end

assert('String#codepoints') do
  expect = [12371, 12435, 12395, 12385, 12399, 19990, 30028, 33]
  assert_equal expect, "こんにちは世界!".codepoints
  cp = []
  "こんにちは世界!".codepoints do |x|
    cp << x
  end
  assert_equal expect, cp
end

assert('String#each_codepoint') do
  expect = [12371, 12435, 12395, 12385, 12399, 19990, 30028, 33]
  cp = []
  "こんにちは世界!".each_codepoint do |x|
    cp << x
  end
  assert_equal expect, cp
end
...@@ -16,8 +16,6 @@ ...@@ -16,8 +16,6 @@
#include "mruby/string.h" #include "mruby/string.h"
#include "mruby/re.h" #include "mruby/re.h"
const char mrb_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz";
typedef struct mrb_shared_string { typedef struct mrb_shared_string {
mrb_bool nofree : 1; mrb_bool nofree : 1;
int refcnt; int refcnt;
...@@ -25,198 +23,7 @@ typedef struct mrb_shared_string { ...@@ -25,198 +23,7 @@ typedef struct mrb_shared_string {
mrb_int len; mrb_int len;
} mrb_shared_string; } mrb_shared_string;
static mrb_value str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2); const char mrb_digitmap[] = "0123456789abcdefghijklmnopqrstuvwxyz";
static mrb_value mrb_str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len);
/*
 * Returns the byte length of s after verifying that it contains no
 * embedded NUL byte; raises ArgumentError if it does.  A string
 * without a buffer has length 0.
 */
MRB_API mrb_int
mrb_str_strlen(mrb_state *mrb, struct RString *s)
{
  const mrb_int nbytes = RSTR_LEN(s);
  char *bytes = RSTR_PTR(s);
  mrb_int at = 0;

  if (!bytes) return 0;
  while (at < nbytes) {
    if (bytes[at] == '\0') {
      mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte");
    }
    at++;
  }
  return nbytes;
}
#ifdef _WIN32
#include <windows.h>
/*
 * Converts str from the active ANSI code page (GetACP) to UTF-8,
 * going through a temporary wide-character buffer.
 *
 * len is the input length in bytes; (size_t)-1 means "measure with
 * strlen".  Returns a NUL-terminated buffer obtained from
 * malloc/strdup (presumably to be released by the caller with free --
 * confirm against callers), or NULL when an allocation fails.
 */
char*
mrb_utf8_from_locale(const char *str, size_t len)
{
wchar_t* wcsp;
char* mbsp;
size_t mbssize, wcssize;
if (len == 0)
return strdup("");
/* len is unsigned, so this compares against (size_t)-1 */
if (len == -1)
len = strlen(str);
/* first call only measures the required number of wide characters */
wcssize = MultiByteToWideChar(GetACP(), 0, str, len, NULL, 0);
wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t));
if (!wcsp)
return NULL;
wcssize = MultiByteToWideChar(GetACP(), 0, str, len, wcsp, wcssize + 1);
wcsp[wcssize] = 0;
/* same measure-then-convert pattern for the UTF-8 output */
mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL);
mbsp = (char*) malloc((mbssize + 1));
if (!mbsp) {
free(wcsp);
return NULL;
}
mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL);
mbsp[mbssize] = 0;
free(wcsp);
return mbsp;
}
/*
 * Converts utf8 from UTF-8 to the active ANSI code page (GetACP),
 * going through a temporary wide-character buffer.
 *
 * len is the input length in bytes; (size_t)-1 means "measure with
 * strlen".  Returns a NUL-terminated buffer obtained from
 * malloc/strdup (presumably to be released by the caller with free --
 * confirm against callers), or NULL when an allocation fails.
 */
char*
mrb_locale_from_utf8(const char *utf8, size_t len)
{
wchar_t* wcsp;
char* mbsp;
size_t mbssize, wcssize;
if (len == 0)
return strdup("");
/* len is unsigned, so this compares against (size_t)-1 */
if (len == -1)
len = strlen(utf8);
/* first call only measures the required number of wide characters */
wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len, NULL, 0);
wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t));
if (!wcsp)
return NULL;
wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len, wcsp, wcssize + 1);
wcsp[wcssize] = 0;
/* same measure-then-convert pattern for the code-page output */
mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL);
mbsp = (char*) malloc((mbssize + 1));
if (!mbsp) {
free(wcsp);
return NULL;
}
mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL);
mbsp[mbssize] = 0;
free(wcsp);
return mbsp;
}
#endif
/*
 * Changes the capacity of s to `capacity` bytes (one extra byte is
 * always allocated on top of it).
 *
 * Heap strings are reallocated.  An embedded string stays embedded as
 * long as the capacity fits; otherwise its bytes are copied into a
 * fresh heap buffer and the embed flag is cleared.
 */
static inline void
resize_capa(mrb_state *mrb, struct RString *s, mrb_int capacity)
{
  if (!RSTR_EMBED_P(s)) {
    s->as.heap.ptr = (char *)mrb_realloc(mrb, RSTR_PTR(s), capacity+1);
    s->as.heap.aux.capa = capacity;
    return;
  }

  if (RSTRING_EMBED_LEN_MAX < capacity) {
    /* outgrew the embedded storage: move the bytes to the heap */
    char *const heap_buf = (char *)mrb_malloc(mrb, capacity+1);
    const mrb_int used = RSTR_EMBED_LEN(s);

    memcpy(heap_buf, s->as.ary, used);
    RSTR_UNSET_EMBED_FLAG(s);
    s->as.heap.ptr = heap_buf;
    s->as.heap.len = used;
    s->as.heap.aux.capa = capacity;
  }
}
/*
 * Drops one reference to a shared string buffer.  When the count
 * reaches zero the shared header is freed, together with the buffer
 * unless it is marked nofree.
 */
static void
str_decref(mrb_state *mrb, mrb_shared_string *shared)
{
  shared->refcnt--;
  if (shared->refcnt != 0) {
    return;
  }
  if (!shared->nofree) {
    mrb_free(mrb, shared->ptr);
  }
  mrb_free(mrb, shared);
}
/* Raises RuntimeError when s carries the frozen flag. */
static void
check_frozen(mrb_state *mrb, struct RString *s)
{
  if (!RSTR_FROZEN_P(s)) {
    return;
  }
  mrb_raise(mrb, E_RUNTIME_ERROR, "can't modify frozen string");
}
/*
 * Prepares s for in-place modification.
 *
 * Raises (via check_frozen) when the string is frozen.  Afterwards:
 *  - a shared string either takes over the shared buffer (when it is
 *    the only reference and points at the buffer's start) or receives
 *    a private copy; in both cases the shared flag is cleared;
 *  - a nofree string receives a private heap copy of its bytes and
 *    the nofree flag is cleared;
 *  - any other string is left as it is.
 */
MRB_API void
mrb_str_modify(mrb_state *mrb, struct RString *s)
{
check_frozen(mrb, s);
if (RSTR_SHARED_P(s)) {
mrb_shared_string *shared = s->as.heap.aux.shared;
if (shared->refcnt == 1 && s->as.heap.ptr == shared->ptr) {
/* sole owner of the whole buffer: adopt it and free only the
   shared header */
s->as.heap.ptr = shared->ptr;
s->as.heap.aux.capa = shared->len;
RSTR_PTR(s)[s->as.heap.len] = '\0';
mrb_free(mrb, shared);
}
else {
/* still referenced elsewhere (or pointing into the middle of the
   buffer): copy the bytes and drop our reference */
char *ptr, *p;
mrb_int len;
p = RSTR_PTR(s);
len = s->as.heap.len;
ptr = (char *)mrb_malloc(mrb, (size_t)len + 1);
if (p) {
memcpy(ptr, p, len);
}
ptr[len] = '\0';
s->as.heap.ptr = ptr;
s->as.heap.aux.capa = len;
str_decref(mrb, shared);
}
RSTR_UNSET_SHARED_FLAG(s);
return;
}
if (RSTR_NOFREE_P(s)) {
/* replace the nofree buffer with a heap copy */
char *p = s->as.heap.ptr;
s->as.heap.ptr = (char *)mrb_malloc(mrb, (size_t)s->as.heap.len+1);
if (p) {
memcpy(RSTR_PTR(s), p, s->as.heap.len);
}
RSTR_PTR(s)[s->as.heap.len] = '\0';
s->as.heap.aux.capa = s->as.heap.len;
RSTR_UNSET_NOFREE_FLAG(s);
return;
}
}
/* String#freeze: sets the frozen flag on the receiver and returns it. */
static mrb_value
mrb_str_freeze(mrb_state *mrb, mrb_value str)
{
  struct RString *target = mrb_str_ptr(str);

  RSTR_SET_FROZEN_FLAG(target);

  return str;
}
/*
 * Sets the length of str to len bytes and returns str.
 *
 * The string is first made modifiable.  The buffer is resized only
 * when it has to grow or when it would shrink by more than 256 bytes;
 * a NUL sentinel is written after the new end.
 */
MRB_API mrb_value
mrb_str_resize(mrb_state *mrb, mrb_value str, mrb_int len)
{
  struct RString *s = mrb_str_ptr(str);
  mrb_int cur;

  mrb_str_modify(mrb, s);
  cur = RSTR_LEN(s);
  if (cur == len) {
    return str;
  }

  if (cur < len || cur - len > 256) {
    resize_capa(mrb, s, len);
  }
  RSTR_SET_LEN(s, len);
  RSTR_PTR(s)[len] = '\0';   /* sentinel */
  return str;
}
#define mrb_obj_alloc_string(mrb) ((struct RString*)mrb_obj_alloc((mrb), MRB_TT_STRING, (mrb)->string_class)) #define mrb_obj_alloc_string(mrb) ((struct RString*)mrb_obj_alloc((mrb), MRB_TT_STRING, (mrb)->string_class))
...@@ -307,6 +114,26 @@ mrb_str_buf_new(mrb_state *mrb, size_t capa) ...@@ -307,6 +114,26 @@ mrb_str_buf_new(mrb_state *mrb, size_t capa)
return mrb_obj_value(s); return mrb_obj_value(s);
} }
/*
 * Changes the capacity of s to `capacity` bytes (one extra byte is
 * always allocated on top of it).
 *
 * Heap strings are reallocated.  An embedded string stays embedded as
 * long as the capacity fits; otherwise its bytes are copied into a
 * fresh heap buffer and the embed flag is cleared.
 */
static inline void
resize_capa(mrb_state *mrb, struct RString *s, mrb_int capacity)
{
  if (!RSTR_EMBED_P(s)) {
    s->as.heap.ptr = (char *)mrb_realloc(mrb, RSTR_PTR(s), capacity+1);
    s->as.heap.aux.capa = capacity;
    return;
  }

  if (RSTRING_EMBED_LEN_MAX < capacity) {
    /* outgrew the embedded storage: move the bytes to the heap */
    char *const heap_buf = (char *)mrb_malloc(mrb, capacity+1);
    const mrb_int used = RSTR_EMBED_LEN(s);

    memcpy(heap_buf, s->as.ary, used);
    RSTR_UNSET_EMBED_FLAG(s);
    s->as.heap.ptr = heap_buf;
    s->as.heap.len = used;
    s->as.heap.aux.capa = capacity;
  }
}
static void static void
str_buf_cat(mrb_state *mrb, struct RString *s, const char *ptr, size_t len) str_buf_cat(mrb_state *mrb, struct RString *s, const char *ptr, size_t len)
{ {
...@@ -386,6 +213,18 @@ mrb_str_new_static(mrb_state *mrb, const char *p, size_t len) ...@@ -386,6 +213,18 @@ mrb_str_new_static(mrb_state *mrb, const char *p, size_t len)
return mrb_obj_value(s); return mrb_obj_value(s);
} }
/*
 * Drops one reference from `shared`. When the count reaches zero the
 * descriptor is freed, together with its byte buffer unless the descriptor
 * is marked `nofree`.
 */
static void
str_decref(mrb_state *mrb, mrb_shared_string *shared)
{
  shared->refcnt--;
  if (shared->refcnt != 0) {
    return;   /* other strings still reference the buffer */
  }
  if (!shared->nofree) {
    mrb_free(mrb, shared->ptr);
  }
  mrb_free(mrb, shared);
}
void void
mrb_gc_free_str(mrb_state *mrb, struct RString *str) mrb_gc_free_str(mrb_state *mrb, struct RString *str)
{ {
...@@ -397,31 +236,136 @@ mrb_gc_free_str(mrb_state *mrb, struct RString *str) ...@@ -397,31 +236,136 @@ mrb_gc_free_str(mrb_state *mrb, struct RString *str)
mrb_free(mrb, str->as.heap.ptr); mrb_free(mrb, str->as.heap.ptr);
} }
MRB_API char* #ifdef MRB_UTF8_STRING
mrb_str_to_cstr(mrb_state *mrb, mrb_value str0) static const char utf8len_codepage[256] =
{ {
struct RString *s; 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1,
};
if (!mrb_string_p(str0)) { static mrb_int
mrb_raise(mrb, E_TYPE_ERROR, "expected String"); utf8len(unsigned char* p)
{
mrb_int len;
mrb_int i;
if (*p == 0)
return 1;
len = utf8len_codepage[*p];
for (i = 1; i < len; ++i)
if ((p[i] & 0xc0) != 0x80)
return 1;
return len;
}
static mrb_int
utf8_strlen(mrb_value str, mrb_int len)
{
mrb_int total = 0;
unsigned char* p = (unsigned char*) RSTRING_PTR(str);
unsigned char* e = p;
e += len < 0 ? RSTRING_LEN(str) : len;
while (p<e) {
p += utf8len(p);
total++;
} }
return total;
}
s = str_new(mrb, RSTRING_PTR(str0), RSTRING_LEN(str0)); #define RSTRING_CHAR_LEN(s) utf8_strlen(s, -1)
if ((strlen(RSTR_PTR(s)) ^ RSTR_LEN(s)) != 0) {
mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte"); /* map character index to byte offset index */
static mrb_int
chars2bytes(char *p, mrb_int idx)
{
mrb_int i, b, n;
for (b=i=0; i<idx; i++) {
n = utf8len((unsigned char*)p);
b += n;
p += n;
} }
return RSTR_PTR(s); return b;
} }
static void /* map byte offset to character index */
str_make_shared(mrb_state *mrb, struct RString *s) static mrb_int
bytes2chars(char *p, mrb_int bi)
{ {
if (!RSTR_SHARED_P(s)) { mrb_int i, b, n;
mrb_shared_string *shared = (mrb_shared_string *)mrb_malloc(mrb, sizeof(mrb_shared_string));
shared->refcnt = 1; for (b=i=0; b<bi; i++) {
if (RSTR_EMBED_P(s)) { n = utf8len((unsigned char*)p);
const mrb_int len = RSTR_EMBED_LEN(s); b += n;
p += n;
}
return i;
}
#else
#define RSTRING_CHAR_LEN(s) RSTRING_LEN(s)
#define chars2bytes(p, ci) (ci)
#define bytes2chars(p, bi) (bi)
#endif
/*
 * Quick-search style substring scan: looks for the `m` bytes at `xs` inside
 * the `n` bytes at `ys` and returns the byte offset of the first match, or
 * -1 when there is none.
 *
 * The skip table is indexed by the byte that follows the current window.
 * That byte is only read while another window still fits, so the scan never
 * touches ys[n]. The table uses mrb_int so that `m + 1` is not truncated
 * when mrb_int is wider than int.
 *
 * Callers are expected to pass m >= 1 (mrb_memsearch() filters the shorter
 * cases before delegating here).
 */
static inline mrb_int
mrb_memsearch_qs(const unsigned char *xs, mrb_int m, const unsigned char *ys, mrb_int n)
{
  const unsigned char *x = xs, *xe = xs + m;
  const mrb_int last = n - m;   /* offset of the last window that fits */
  mrb_int qstable[256];
  mrb_int pos = 0;
  int i;

  /* Preprocessing */
  for (i = 0; i < 256; ++i)
    qstable[i] = m + 1;
  for (; x < xe; ++x)
    qstable[*x] = xe - x;

  /* Searching */
  while (pos <= last) {
    if (*xs == ys[pos] && memcmp(xs, ys + pos, m) == 0)
      return pos;
    if (pos == last)
      break;   /* no byte follows the final window: stop before reading ys[n] */
    pos += qstable[ys[pos + m]];
  }
  return -1;
}
/*
 * Finds the `m` bytes at `x0` inside the `n` bytes at `y0`.
 * Returns the byte offset of the first occurrence, or -1 if absent.
 *
 * Cases are tried in this order: needle longer than haystack (-1), equal
 * lengths (single memcmp), needle shorter than one byte (0), one-byte needle
 * (byte scan), everything else (mrb_memsearch_qs).
 */
static mrb_int
mrb_memsearch(const void *x0, mrb_int m, const void *y0, mrb_int n)
{
  const unsigned char *needle = (const unsigned char *)x0;
  const unsigned char *hay = (const unsigned char *)y0;

  if (m > n) {
    return -1;
  }
  if (m == n) {
    return memcmp(x0, y0, m) == 0 ? 0 : -1;
  }
  if (m < 1) {
    return 0;
  }
  if (m == 1) {
    /* n > m == 1 here, so the size passed to memchr is positive */
    const unsigned char *hit = (const unsigned char *)memchr(hay, *needle, (size_t)n);

    return hit ? (mrb_int)(hit - hay) : -1;
  }
  return mrb_memsearch_qs(needle, m, hay, n);
}
static void
str_make_shared(mrb_state *mrb, struct RString *s)
{
if (!RSTR_SHARED_P(s)) {
mrb_shared_string *shared = (mrb_shared_string *)mrb_malloc(mrb, sizeof(mrb_shared_string));
shared->refcnt = 1;
if (RSTR_EMBED_P(s)) {
const mrb_int len = RSTR_EMBED_LEN(s);
char *const tmp = (char *)mrb_malloc(mrb, len+1); char *const tmp = (char *)mrb_malloc(mrb, len+1);
memcpy(tmp, s->as.ary, len); memcpy(tmp, s->as.ary, len);
tmp[len] = '\0'; tmp[len] = '\0';
...@@ -445,10 +389,343 @@ str_make_shared(mrb_state *mrb, struct RString *s) ...@@ -445,10 +389,343 @@ str_make_shared(mrb_state *mrb, struct RString *s)
shared->ptr = s->as.heap.ptr; shared->ptr = s->as.heap.ptr;
} }
} }
shared->len = s->as.heap.len; shared->len = s->as.heap.len;
s->as.heap.aux.shared = shared; s->as.heap.aux.shared = shared;
RSTR_SET_SHARED_FLAG(s); RSTR_SET_SHARED_FLAG(s);
}
}
/*
 * Builds a new string covering `len` bytes of `str` starting at byte `beg`.
 *
 * An embedded source is copied with str_new(). Any other source is turned
 * into a shared string first; the result then points into the same buffer
 * and holds an extra reference on the shared descriptor.
 * `beg` and `len` are used as given; no range checking happens here.
 */
static mrb_value
byte_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
{
  struct RString *src = mrb_str_ptr(str);
  struct RString *sub;

  if (RSTR_EMBED_P(src)) {
    sub = str_new(mrb, src->as.ary+beg, len);
  }
  else {
    mrb_shared_string *pool;

    str_make_shared(mrb, src);
    pool = src->as.heap.aux.shared;
    sub = mrb_obj_alloc_string(mrb);
    pool->refcnt++;
    sub->as.heap.aux.shared = pool;
    sub->as.heap.ptr = src->as.heap.ptr + beg;
    sub->as.heap.len = len;
    RSTR_SET_SHARED_FLAG(sub);
  }
  return mrb_obj_value(sub);
}
#ifdef MRB_UTF8_STRING
/*
 * Character-based variant of byte_subseq(): `beg` and `len` count
 * characters. Both are mapped to byte units with chars2bytes() (the length
 * is measured from the start byte) before delegating.
 */
static inline mrb_value
str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
{
  const mrb_int byte_beg = chars2bytes(RSTRING_PTR(str), beg);
  const mrb_int byte_len = chars2bytes(RSTRING_PTR(str)+byte_beg, len);

  return byte_subseq(mrb, str, byte_beg, byte_len);
}
#else
/* without UTF-8 support characters and bytes coincide */
#define str_subseq(mrb, str, beg, len) byte_subseq(mrb, str, beg, len)
#endif
/*
 * Returns the substring of `str` that starts at character index `beg` and
 * spans at most `len` characters, or nil when the request is out of range.
 *
 *  - a negative `len` yields nil;
 *  - a negative `beg` counts from the end of the string and is adjusted by
 *    the character length exactly once; if it is still negative the result
 *    is nil (so "hello"[-7, 1] is nil rather than wrapping around twice);
 *  - `beg` greater than the character length yields nil;
 *  - `len` is clamped to the characters available after `beg`.
 */
static mrb_value
str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
{
  mrb_int clen;

  if (len < 0) return mrb_nil_value();
  clen = RSTRING_CHAR_LEN(str);
  if (beg > clen) return mrb_nil_value();
  if (beg < 0) {
    beg += clen;
    if (beg < 0) return mrb_nil_value();
  }
  /* 0 <= beg <= clen here; comparing against the remainder avoids the
     signed overflow that `beg + len` could hit for a very large len */
  if (len > clen - beg) {
    len = clen - beg;
  }
  return str_subseq(mrb, str, beg, len);
}
/*
 * Byte-level search for `sub` inside `str`, starting at byte `offset`.
 *
 * A negative `offset` is taken relative to the end of `str`. Returns the
 * byte position of the first match, `offset` itself when `sub` is empty,
 * or -1 when there is no match or not enough bytes remain after `offset`.
 */
static mrb_int
str_index(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int offset)
{
  const mrb_int str_len = RSTRING_LEN(str);
  const mrb_int sub_len = RSTRING_LEN(sub);
  mrb_int found;

  if (offset < 0) {
    offset += str_len;
    if (offset < 0) return -1;
  }
  if (str_len - offset < sub_len) return -1;
  if (sub_len == 0) return offset;

  found = mrb_memsearch(RSTRING_PTR(sub), sub_len,
                        RSTRING_PTR(str) + offset, str_len - offset);
  /* mrb_memsearch reports positions relative to the search start */
  return found < 0 ? found : found + offset;
}
/*
 * Raises RuntimeError when `s` carries the frozen flag; otherwise does
 * nothing.
 */
static void
check_frozen(mrb_state *mrb, struct RString *s)
{
  if (!RSTR_FROZEN_P(s)) {
    return;
  }
  mrb_raise(mrb, E_RUNTIME_ERROR, "can't modify frozen string");
}
/*
 * Makes `s1` hold the same bytes as `s2` and returns `s1` as an mrb_value.
 *
 * Raises (through check_frozen) when `s1` is frozen. The storage `s1`
 * currently owns is released first. If `s2` is shared, or is too long to
 * embed and therefore gets converted to a shared string, `s1` joins the same
 * shared buffer and takes a reference on it. Otherwise the bytes are copied
 * into `s1`'s embedded array.
 *
 * Replacing a string with itself is a no-op: releasing `s1`'s storage in
 * that case would free the very bytes that are read afterwards.
 */
static mrb_value
str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2)
{
  mrb_int len;   /* same width as the string length fields */

  check_frozen(mrb, s1);
  if (s1 == s2) {
    return mrb_obj_value(s1);
  }
  len = RSTR_LEN(s2);
  if (RSTR_SHARED_P(s1)) {
    str_decref(mrb, s1->as.heap.aux.shared);
  }
  else if (!RSTR_EMBED_P(s1) && !RSTR_NOFREE_P(s1)) {
    mrb_free(mrb, s1->as.heap.ptr);
  }

  RSTR_UNSET_NOFREE_FLAG(s1);

  if (RSTR_SHARED_P(s2)) {
L_SHARE:
    RSTR_UNSET_EMBED_FLAG(s1);
    s1->as.heap.ptr = s2->as.heap.ptr;
    s1->as.heap.len = len;
    s1->as.heap.aux.shared = s2->as.heap.aux.shared;
    RSTR_SET_SHARED_FLAG(s1);
    s1->as.heap.aux.shared->refcnt++;
  }
  else {
    if (len <= RSTRING_EMBED_LEN_MAX) {
      RSTR_UNSET_SHARED_FLAG(s1);
      RSTR_SET_EMBED_FLAG(s1);
      memcpy(s1->as.ary, RSTR_PTR(s2), len);
      RSTR_SET_EMBED_LEN(s1, len);
    }
    else {
      /* too long to embed: share s2's buffer instead of copying it */
      str_make_shared(mrb, s2);
      goto L_SHARE;
    }
  }

  return mrb_obj_value(s1);
}
/*
 * Byte-level backwards search: scans `str` from byte `pos` toward the start
 * for the last occurrence of `sub`.
 *
 * `pos` is pulled back when fewer than RSTRING_LEN(sub) bytes follow it.
 * Returns the byte position of the match, `pos` (after that adjustment)
 * when `sub` is empty, or -1 when `sub` is longer than `str` or is not
 * found.
 */
static mrb_int
str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos)
{
  struct RString *hay = mrb_str_ptr(str);
  const mrb_int sublen = RSTRING_LEN(sub);
  char *base, *cur, *needle;

  /* substring longer than string */
  if (RSTR_LEN(hay) < sublen) return -1;
  if (RSTR_LEN(hay) - pos < sublen) {
    pos = RSTR_LEN(hay) - sublen;
  }
  if (sublen == 0) {
    return pos;
  }

  base = RSTR_PTR(hay);
  needle = RSTRING_PTR(sub);
  for (cur = base + pos; base <= cur; cur--) {
    if (memcmp(cur, needle, sublen) == 0) {
      return cur - base;
    }
  }
  return -1;
}
/*
 * Returns the byte length of `s` after checking that none of its bytes is
 * '\0'. Raises ArgumentError on the first NUL byte found. A string whose
 * data pointer is NULL reports length 0.
 */
MRB_API mrb_int
mrb_str_strlen(mrb_state *mrb, struct RString *s)
{
  const mrb_int nbytes = RSTR_LEN(s);
  char *bytes = RSTR_PTR(s);
  mrb_int at = 0;

  if (!bytes) return 0;
  while (at < nbytes) {
    if (bytes[at] == '\0') {
      mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte");
    }
    at++;
  }
  return nbytes;
}
#ifdef _WIN32
#include <windows.h>
/*
 * Converts `len` bytes of `str` from the code page returned by GetACP() to
 * UTF-8, going through an intermediate wide-character buffer.
 *
 * A `len` of 0 returns a strdup'ed empty string; a `len` that compares equal
 * to -1 means "use strlen(str)".
 * Returns a NUL-terminated buffer obtained from malloc()/strdup(), or NULL
 * when a malloc() call fails.
 * NOTE(review): nothing here releases the result, so the caller presumably
 * has to free() it -- confirm against callers.
 */
char*
mrb_utf8_from_locale(const char *str, size_t len)
{
wchar_t* wcsp;
char* mbsp;
size_t mbssize, wcssize;
if (len == 0)
return strdup("");
if (len == -1)
len = strlen(str);
/* sizing call: no output buffer is passed; the result sizes the allocation below */
wcssize = MultiByteToWideChar(GetACP(), 0, str, len, NULL, 0);
wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t));
if (!wcsp)
return NULL;
/* conversion call into the wide buffer, which is then terminated explicitly */
wcssize = MultiByteToWideChar(GetACP(), 0, str, len, wcsp, wcssize + 1);
wcsp[wcssize] = 0;
/* same two-step pattern for wide -> UTF-8; -1 is passed as the input length */
mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL);
mbsp = (char*) malloc((mbssize + 1));
if (!mbsp) {
/* release the intermediate buffer before reporting the failure */
free(wcsp);
return NULL;
}
mbssize = WideCharToMultiByte(CP_UTF8, 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL);
mbsp[mbssize] = 0;
free(wcsp);
return mbsp;
}
/*
 * Converts `len` bytes of the UTF-8 text at `utf8` to the code page returned
 * by GetACP(), going through an intermediate wide-character buffer.
 *
 * A `len` of 0 returns a strdup'ed empty string; a `len` that compares equal
 * to -1 means "use strlen(utf8)".
 * Returns a NUL-terminated buffer obtained from malloc()/strdup(), or NULL
 * when a malloc() call fails.
 * NOTE(review): nothing here releases the result, so the caller presumably
 * has to free() it -- confirm against callers.
 */
char*
mrb_locale_from_utf8(const char *utf8, size_t len)
{
wchar_t* wcsp;
char* mbsp;
size_t mbssize, wcssize;
if (len == 0)
return strdup("");
if (len == -1)
len = strlen(utf8);
/* sizing call: no output buffer is passed; the result sizes the allocation below */
wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len, NULL, 0);
wcsp = (wchar_t*) malloc((wcssize + 1) * sizeof(wchar_t));
if (!wcsp)
return NULL;
/* conversion call into the wide buffer, which is then terminated explicitly */
wcssize = MultiByteToWideChar(CP_UTF8, 0, utf8, len, wcsp, wcssize + 1);
wcsp[wcssize] = 0;
/* same two-step pattern for wide -> locale code page; -1 is passed as the input length */
mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, NULL, 0, NULL, NULL);
mbsp = (char*) malloc((mbssize + 1));
if (!mbsp) {
/* release the intermediate buffer before reporting the failure */
free(wcsp);
return NULL;
}
mbssize = WideCharToMultiByte(GetACP(), 0, (LPCWSTR) wcsp, -1, mbsp, mbssize, NULL, NULL);
mbsp[mbssize] = 0;
free(wcsp);
return mbsp;
}
#endif
/*
 * Prepares `s` for in-place modification.
 *
 * Raises (through check_frozen) when `s` is frozen. Afterwards:
 *  - a shared string stops being shared. If `s` is the only user of a
 *    freeable shared buffer and points at its start, it simply adopts that
 *    buffer; in every other case it gets a private NUL-terminated copy and
 *    drops its reference on the shared descriptor;
 *  - a string flagged NOFREE gets a private NUL-terminated copy and loses
 *    the flag;
 *  - any other string is left untouched.
 *
 * A shared buffer marked `nofree` is never adopted: the string would then
 * treat memory it does not own as an ordinary heap buffer.
 */
MRB_API void
mrb_str_modify(mrb_state *mrb, struct RString *s)
{
  check_frozen(mrb, s);
  if (RSTR_SHARED_P(s)) {
    mrb_shared_string *shared = s->as.heap.aux.shared;

    if (!shared->nofree && shared->refcnt == 1 && s->as.heap.ptr == shared->ptr) {
      /* sole owner of a freeable buffer: take it over without copying */
      s->as.heap.ptr = shared->ptr;
      s->as.heap.aux.capa = shared->len;
      RSTR_PTR(s)[s->as.heap.len] = '\0';
      mrb_free(mrb, shared);
    }
    else {
      char *ptr, *p;
      mrb_int len;

      p = RSTR_PTR(s);
      len = s->as.heap.len;
      ptr = (char *)mrb_malloc(mrb, (size_t)len + 1);
      if (p) {
        memcpy(ptr, p, len);
      }
      ptr[len] = '\0';
      s->as.heap.ptr = ptr;
      s->as.heap.aux.capa = len;
      /* str_decref honours `nofree` when the last reference goes away */
      str_decref(mrb, shared);
    }
    RSTR_UNSET_SHARED_FLAG(s);
    return;
  }
  if (RSTR_NOFREE_P(s)) {
    char *p = s->as.heap.ptr;

    s->as.heap.ptr = (char *)mrb_malloc(mrb, (size_t)s->as.heap.len+1);
    if (p) {
      memcpy(RSTR_PTR(s), p, s->as.heap.len);
    }
    RSTR_PTR(s)[s->as.heap.len] = '\0';
    s->as.heap.aux.capa = s->as.heap.len;
    RSTR_UNSET_NOFREE_FLAG(s);
    return;
  }
}
/*
 * Sets the frozen flag on the string behind `str` and returns `str` itself.
 */
static mrb_value
mrb_str_freeze(mrb_state *mrb, mrb_value str)
{
  struct RString *target = mrb_str_ptr(str);

  RSTR_SET_FROZEN_FLAG(target);
  return str;
}
/*
 * Changes the byte length of `str` to `len` and returns `str`.
 *
 * The string is first made modifiable via mrb_str_modify(). The buffer is
 * re-sized when the string grows, or when it shrinks by more than 256 bytes;
 * smaller shrinks keep the existing buffer. A '\0' sentinel is stored at
 * index `len`.
 *
 * Raises ArgumentError when `len` is negative: without that check the
 * sentinel store below would write in front of the buffer.
 */
MRB_API mrb_value
mrb_str_resize(mrb_state *mrb, mrb_value str, mrb_int len)
{
  mrb_int slen;
  struct RString *s = mrb_str_ptr(str);

  if (len < 0) {
    mrb_raise(mrb, E_ARGUMENT_ERROR, "negative (or overflowed) string size");
  }
  mrb_str_modify(mrb, s);
  slen = RSTR_LEN(s);
  if (len != slen) {
    if (slen < len || slen - len > 256) {
      resize_capa(mrb, s, len);
    }
    RSTR_SET_LEN(s, len);
    RSTR_PTR(s)[len] = '\0';   /* sentinel */
  }
  return str;
}
MRB_API char*
mrb_str_to_cstr(mrb_state *mrb, mrb_value str0)
{
struct RString *s;
if (!mrb_string_p(str0)) {
mrb_raise(mrb, E_TYPE_ERROR, "expected String");
}
s = str_new(mrb, RSTRING_PTR(str0), RSTRING_LEN(str0));
if ((strlen(RSTR_PTR(s)) ^ RSTR_LEN(s)) != 0) {
mrb_raise(mrb, E_ARGUMENT_ERROR, "string contains null byte");
} }
return RSTR_PTR(s);
} }
/* /*
...@@ -519,15 +796,22 @@ mrb_str_plus_m(mrb_state *mrb, mrb_value self) ...@@ -519,15 +796,22 @@ mrb_str_plus_m(mrb_state *mrb, mrb_value self)
/* 15.2.10.5.33 */ /* 15.2.10.5.33 */
/* /*
* call-seq: * call-seq:
* len = strlen(String("abcd")) * "abcd".size => int
* *
* Returns the length of string. * Returns the length of string.
*/ */
static mrb_value static mrb_value
mrb_str_size(mrb_state *mrb, mrb_value self) mrb_str_size(mrb_state *mrb, mrb_value self)
{ {
struct RString *s = mrb_str_ptr(self); mrb_int len = RSTRING_CHAR_LEN(self);
return mrb_fixnum_value(RSTR_LEN(s)); return mrb_fixnum_value(len);
}
static mrb_value
mrb_str_bytesize(mrb_state *mrb, mrb_value self)
{
mrb_int len = RSTRING_LEN(self);
return mrb_fixnum_value(len);
} }
/* 15.2.10.5.1 */ /* 15.2.10.5.1 */
...@@ -742,77 +1026,6 @@ mrb_regexp_check(mrb_state *mrb, mrb_value obj) ...@@ -742,77 +1026,6 @@ mrb_regexp_check(mrb_state *mrb, mrb_value obj)
} }
} }
/*
 * Quick-search style substring scan: looks for the `m` bytes at `xs` inside
 * the `n` bytes at `ys` and returns the byte offset of the first match, or
 * -1 when there is none.
 *
 * The skip table is indexed by the byte that follows the current window.
 * That byte is only read while another window still fits, so the scan never
 * touches ys[n]. The table uses mrb_int so that `m + 1` is not truncated
 * when mrb_int is wider than int.
 *
 * Callers are expected to pass m >= 1 (mrb_memsearch() filters the shorter
 * cases before delegating here).
 */
static inline mrb_int
mrb_memsearch_qs(const unsigned char *xs, mrb_int m, const unsigned char *ys, mrb_int n)
{
  const unsigned char *x = xs, *xe = xs + m;
  const mrb_int last = n - m;   /* offset of the last window that fits */
  mrb_int qstable[256];
  mrb_int pos = 0;
  int i;

  /* Preprocessing */
  for (i = 0; i < 256; ++i)
    qstable[i] = m + 1;
  for (; x < xe; ++x)
    qstable[*x] = xe - x;

  /* Searching */
  while (pos <= last) {
    if (*xs == ys[pos] && memcmp(xs, ys + pos, m) == 0)
      return pos;
    if (pos == last)
      break;   /* no byte follows the final window: stop before reading ys[n] */
    pos += qstable[ys[pos + m]];
  }
  return -1;
}
/*
 * Finds the `m` bytes at `x0` inside the `n` bytes at `y0`.
 * Returns the byte offset of the first occurrence, or -1 if absent.
 *
 * Cases are tried in this order: needle longer than haystack (-1), equal
 * lengths (single memcmp), needle shorter than one byte (0), one-byte needle
 * (byte scan), everything else (mrb_memsearch_qs).
 */
static mrb_int
mrb_memsearch(const void *x0, mrb_int m, const void *y0, mrb_int n)
{
  const unsigned char *needle = (const unsigned char *)x0;
  const unsigned char *hay = (const unsigned char *)y0;

  if (m > n) {
    return -1;
  }
  if (m == n) {
    return memcmp(x0, y0, m) == 0 ? 0 : -1;
  }
  if (m < 1) {
    return 0;
  }
  if (m == 1) {
    /* n > m == 1 here, so the size passed to memchr is positive */
    const unsigned char *hit = (const unsigned char *)memchr(hay, *needle, (size_t)n);

    return hit ? (mrb_int)(hit - hay) : -1;
  }
  return mrb_memsearch_qs(needle, m, hay, n);
}
/*
 * Byte-level search for `sub` inside `str`, starting at byte `offset`.
 *
 * A negative `offset` is taken relative to the end of `str`. Returns the
 * byte position of the first match, `offset` itself when `sub` is empty,
 * or -1 when there is no match or not enough bytes remain after `offset`.
 */
static mrb_int
mrb_str_index(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int offset)
{
  const mrb_int str_len = RSTRING_LEN(str);
  const mrb_int sub_len = RSTRING_LEN(sub);
  mrb_int found;

  if (offset < 0) {
    offset += str_len;
    if (offset < 0) return -1;
  }
  if (str_len - offset < sub_len) return -1;
  if (sub_len == 0) return offset;

  found = mrb_memsearch(RSTRING_PTR(sub), sub_len,
                        RSTRING_PTR(str) + offset, str_len - offset);
  /* mrb_memsearch reports positions relative to the search start */
  return found < 0 ? found : found + offset;
}
MRB_API mrb_value MRB_API mrb_value
mrb_str_dup(mrb_state *mrb, mrb_value str) mrb_str_dup(mrb_state *mrb, mrb_value str)
{ {
...@@ -834,12 +1047,12 @@ mrb_str_aref(mrb_state *mrb, mrb_value str, mrb_value indx) ...@@ -834,12 +1047,12 @@ mrb_str_aref(mrb_state *mrb, mrb_value str, mrb_value indx)
idx = mrb_fixnum(indx); idx = mrb_fixnum(indx);
num_index: num_index:
str = mrb_str_substr(mrb, str, idx, 1); str = str_substr(mrb, str, idx, 1);
if (!mrb_nil_p(str) && RSTRING_LEN(str) == 0) return mrb_nil_value(); if (!mrb_nil_p(str) && RSTRING_LEN(str) == 0) return mrb_nil_value();
return str; return str;
case MRB_TT_STRING: case MRB_TT_STRING:
if (mrb_str_index(mrb, str, indx, 0) != -1) if (str_index(mrb, str, indx, 0) != -1)
return mrb_str_dup(mrb, indx); return mrb_str_dup(mrb, indx);
return mrb_nil_value(); return mrb_nil_value();
...@@ -848,9 +1061,9 @@ num_index: ...@@ -848,9 +1061,9 @@ num_index:
{ {
mrb_int beg, len; mrb_int beg, len;
len = RSTRING_LEN(str); len = RSTRING_CHAR_LEN(str);
if (mrb_range_beg_len(mrb, indx, &beg, &len, len)) { if (mrb_range_beg_len(mrb, indx, &beg, &len, len)) {
return mrb_str_subseq(mrb, str, beg, len); return str_subseq(mrb, str, beg, len);
} }
else { else {
return mrb_nil_value(); return mrb_nil_value();
...@@ -917,7 +1130,7 @@ mrb_str_aref_m(mrb_state *mrb, mrb_value str) ...@@ -917,7 +1130,7 @@ mrb_str_aref_m(mrb_state *mrb, mrb_value str)
argc = mrb_get_args(mrb, "o|o", &a1, &a2); argc = mrb_get_args(mrb, "o|o", &a1, &a2);
if (argc == 2) { if (argc == 2) {
mrb_regexp_check(mrb, a1); mrb_regexp_check(mrb, a1);
return mrb_str_substr(mrb, str, mrb_fixnum(a1), mrb_fixnum(a2)); return str_substr(mrb, str, mrb_fixnum(a1), mrb_fixnum(a2));
} }
if (argc != 1) { if (argc != 1) {
mrb_raisef(mrb, E_ARGUMENT_ERROR, "wrong number of arguments (%S for 1)", mrb_fixnum_value(argc)); mrb_raisef(mrb, E_ARGUMENT_ERROR, "wrong number of arguments (%S for 1)", mrb_fixnum_value(argc));
...@@ -987,7 +1200,7 @@ mrb_str_capitalize(mrb_state *mrb, mrb_value self) ...@@ -987,7 +1200,7 @@ mrb_str_capitalize(mrb_state *mrb, mrb_value self)
/* 15.2.10.5.10 */ /* 15.2.10.5.10 */
/* /*
* call-seq: * call-seq:
* str.chomp!(separator=$/) => str or nil * str.chomp!(separator="\n") => str or nil
* *
* Modifies <i>str</i> in place as described for <code>String#chomp</code>, * Modifies <i>str</i> in place as described for <code>String#chomp</code>,
* returning <i>str</i>, or <code>nil</code> if no modifications were made. * returning <i>str</i>, or <code>nil</code> if no modifications were made.
...@@ -1061,7 +1274,7 @@ mrb_str_chomp_bang(mrb_state *mrb, mrb_value str) ...@@ -1061,7 +1274,7 @@ mrb_str_chomp_bang(mrb_state *mrb, mrb_value str)
/* 15.2.10.5.9 */ /* 15.2.10.5.9 */
/* /*
* call-seq: * call-seq:
* str.chomp(separator=$/) => new_str * str.chomp(separator="\n") => new_str
* *
* Returns a new <code>String</code> with the given record separator removed * Returns a new <code>String</code> with the given record separator removed
* from the end of <i>str</i> (if present). If <code>$/</code> has not been * from the end of <i>str</i> (if present). If <code>$/</code> has not been
...@@ -1232,47 +1445,10 @@ mrb_str_eql(mrb_state *mrb, mrb_value self) ...@@ -1232,47 +1445,10 @@ mrb_str_eql(mrb_state *mrb, mrb_value self)
return mrb_bool_value(eql_p); return mrb_bool_value(eql_p);
} }
/*
 * Builds a new string covering `len` bytes of `str` starting at byte `beg`.
 *
 * An embedded source is copied with str_new(). Any other source is turned
 * into a shared string first; the result then points into the same buffer
 * and holds an extra reference on the shared descriptor.
 * `beg` and `len` are used as given; no range checking happens here.
 */
static mrb_value
mrb_str_subseq(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
{
  struct RString *src = mrb_str_ptr(str);
  struct RString *sub;

  if (RSTR_EMBED_P(src)) {
    sub = str_new(mrb, src->as.ary+beg, len);
  }
  else {
    mrb_shared_string *pool;

    str_make_shared(mrb, src);
    pool = src->as.heap.aux.shared;
    sub = mrb_obj_alloc_string(mrb);
    pool->refcnt++;
    sub->as.heap.aux.shared = pool;
    sub->as.heap.ptr = src->as.heap.ptr + beg;
    sub->as.heap.len = len;
    RSTR_SET_SHARED_FLAG(sub);
  }
  return mrb_obj_value(sub);
}
MRB_API mrb_value MRB_API mrb_value
mrb_str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len) mrb_str_substr(mrb_state *mrb, mrb_value str, mrb_int beg, mrb_int len)
{ {
if (len < 0) return mrb_nil_value(); return str_substr(mrb, str, beg, len);
if (!RSTRING_LEN(str)) {
len = 0;
}
if (beg > RSTRING_LEN(str)) return mrb_nil_value();
if (beg < 0) {
beg += RSTRING_LEN(str);
if (beg < 0) return mrb_nil_value();
}
if (beg + len > RSTRING_LEN(str))
len = RSTRING_LEN(str) - beg;
if (len <= 0) {
len = 0;
}
return mrb_str_subseq(mrb, str, beg, len);
} }
mrb_int mrb_int
...@@ -1331,7 +1507,7 @@ mrb_str_include(mrb_state *mrb, mrb_value self) ...@@ -1331,7 +1507,7 @@ mrb_str_include(mrb_state *mrb, mrb_value self)
} }
else { else {
str2 = mrb_str_to_str(mrb, str2); str2 = mrb_str_to_str(mrb, str2);
i = mrb_str_index(mrb, self, str2, 0); i = str_index(mrb, self, str2, 0);
include_p = (i != -1); include_p = (i != -1);
} }
...@@ -1361,12 +1537,12 @@ mrb_str_include(mrb_state *mrb, mrb_value self) ...@@ -1361,12 +1537,12 @@ mrb_str_include(mrb_state *mrb, mrb_value self)
* "hello".index(/[aeiou]/, -3) #=> 4 * "hello".index(/[aeiou]/, -3) #=> 4
*/ */
static mrb_value static mrb_value
mrb_str_index_m(mrb_state *mrb, mrb_value str) mrb_str_index(mrb_state *mrb, mrb_value str)
{ {
mrb_value *argv; mrb_value *argv;
mrb_int argc; mrb_int argc;
mrb_value sub; mrb_value sub;
mrb_int pos; mrb_int pos, clen;
mrb_get_args(mrb, "*", &argv, &argc); mrb_get_args(mrb, "*", &argv, &argc);
if (argc == 2) { if (argc == 2) {
...@@ -1381,12 +1557,15 @@ mrb_str_index_m(mrb_state *mrb, mrb_value str) ...@@ -1381,12 +1557,15 @@ mrb_str_index_m(mrb_state *mrb, mrb_value str)
sub = mrb_nil_value(); sub = mrb_nil_value();
} }
mrb_regexp_check(mrb, sub); mrb_regexp_check(mrb, sub);
clen = RSTRING_CHAR_LEN(str);
if (pos < 0) { if (pos < 0) {
pos += RSTRING_LEN(str); pos += clen;
if (pos < 0) { if (pos < 0) {
return mrb_nil_value(); return mrb_nil_value();
} }
} }
if (pos >= clen) return mrb_nil_value();
pos = chars2bytes(RSTRING_PTR(str), pos);
switch (mrb_type(sub)) { switch (mrb_type(sub)) {
default: { default: {
...@@ -1400,57 +1579,17 @@ mrb_str_index_m(mrb_state *mrb, mrb_value str) ...@@ -1400,57 +1579,17 @@ mrb_str_index_m(mrb_state *mrb, mrb_value str)
} }
/* fall through */ /* fall through */
case MRB_TT_STRING: case MRB_TT_STRING:
pos = mrb_str_index(mrb, str, sub, pos); pos = str_index(mrb, str, sub, pos);
break; break;
} }
if (pos == -1) return mrb_nil_value(); if (pos == -1) return mrb_nil_value();
pos = bytes2chars(RSTRING_PTR(str), pos);
return mrb_fixnum_value(pos); return mrb_fixnum_value(pos);
} }
#define STR_REPLACE_SHARED_MIN 10 #define STR_REPLACE_SHARED_MIN 10
/*
 * Makes `s1` hold the same bytes as `s2` and returns `s1` as an mrb_value.
 *
 * Raises (through check_frozen) when `s1` is frozen. The storage `s1`
 * currently owns is released first. If `s2` is shared, or is too long to
 * embed and therefore gets converted to a shared string, `s1` joins the same
 * shared buffer and takes a reference on it. Otherwise the bytes are copied
 * into `s1`'s embedded array.
 *
 * Replacing a string with itself is a no-op: releasing `s1`'s storage in
 * that case would free the very bytes that are read afterwards.
 */
static mrb_value
str_replace(mrb_state *mrb, struct RString *s1, struct RString *s2)
{
  mrb_int len;   /* same width as the string length fields */

  check_frozen(mrb, s1);
  if (s1 == s2) {
    return mrb_obj_value(s1);
  }
  len = RSTR_LEN(s2);
  if (RSTR_SHARED_P(s1)) {
    str_decref(mrb, s1->as.heap.aux.shared);
  }
  else if (!RSTR_EMBED_P(s1) && !RSTR_NOFREE_P(s1)) {
    mrb_free(mrb, s1->as.heap.ptr);
  }

  RSTR_UNSET_NOFREE_FLAG(s1);

  if (RSTR_SHARED_P(s2)) {
L_SHARE:
    RSTR_UNSET_EMBED_FLAG(s1);
    s1->as.heap.ptr = s2->as.heap.ptr;
    s1->as.heap.len = len;
    s1->as.heap.aux.shared = s2->as.heap.aux.shared;
    RSTR_SET_SHARED_FLAG(s1);
    s1->as.heap.aux.shared->refcnt++;
  }
  else {
    if (len <= RSTRING_EMBED_LEN_MAX) {
      RSTR_UNSET_SHARED_FLAG(s1);
      RSTR_SET_EMBED_FLAG(s1);
      memcpy(s1->as.ary, RSTR_PTR(s2), len);
      RSTR_SET_EMBED_LEN(s1, len);
    }
    else {
      /* too long to embed: share s2's buffer instead of copying it */
      str_make_shared(mrb, s2);
      goto L_SHARE;
    }
  }

  return mrb_obj_value(s1);
}
/* 15.2.10.5.24 */ /* 15.2.10.5.24 */
/* 15.2.10.5.28 */ /* 15.2.10.5.28 */
/* /*
...@@ -1570,107 +1709,81 @@ mrb_check_string_type(mrb_state *mrb, mrb_value str) ...@@ -1570,107 +1709,81 @@ mrb_check_string_type(mrb_state *mrb, mrb_value str)
return mrb_check_convert_type(mrb, str, MRB_TT_STRING, "String", "to_str"); return mrb_check_convert_type(mrb, str, MRB_TT_STRING, "String", "to_str");
} }
/* ---------------------------------- */ /* 15.2.10.5.30 */
/* 15.2.10.5.29 */
/* /*
* call-seq: * call-seq:
* str.reverse => new_str * str.reverse! => str
*
* Returns a new string with the characters from <i>str</i> in reverse order.
* *
* "stressed".reverse #=> "desserts" * Reverses <i>str</i> in place.
*/ */
static mrb_value static mrb_value
mrb_str_reverse(mrb_state *mrb, mrb_value str) mrb_str_reverse_bang(mrb_state *mrb, mrb_value str)
{ {
struct RString *s2; #ifdef MRB_UTF8_STRING
char *s, *e, *p; mrb_int utf8_len = RSTRING_CHAR_LEN(str);
mrb_int len = RSTRING_LEN(str);
if (utf8_len == len) goto bytes;
if (utf8_len > 1) {
char *buf;
char *p, *e, *r;
if (RSTRING_LEN(str) <= 1) return mrb_str_dup(mrb, str); mrb_str_modify(mrb, mrb_str_ptr(str));
len = RSTRING_LEN(str);
buf = mrb_malloc(mrb, (size_t)len);
p = buf;
e = buf + len;
s2 = str_new(mrb, 0, RSTRING_LEN(str)); memcpy(buf, RSTRING_PTR(str), len);
str_with_class(mrb, s2, str); r = RSTRING_PTR(str) + len;
s = RSTRING_PTR(str); e = RSTRING_END(str) - 1;
p = RSTR_PTR(s2);
while (e >= s) { while (p<e) {
*p++ = *e--; mrb_int clen = utf8len((unsigned char*)p);
r -= clen;
memcpy(r, p, clen);
p += clen;
}
mrb_free(mrb, buf);
} }
return mrb_obj_value(s2); return str;
}
/* 15.2.10.5.30 */ bytes:
/* #endif
* call-seq: {
* str.reverse! => str struct RString *s = mrb_str_ptr(str);
* char *p, *e;
* Reverses <i>str</i> in place. char c;
*/
static mrb_value
mrb_str_reverse_bang(mrb_state *mrb, mrb_value str)
{
struct RString *s = mrb_str_ptr(str);
char *p, *e;
char c;
mrb_str_modify(mrb, s); mrb_str_modify(mrb, s);
if (RSTR_LEN(s) > 1) { if (RSTR_LEN(s) > 1) {
p = RSTR_PTR(s); p = RSTR_PTR(s);
e = p + RSTR_LEN(s) - 1; e = p + RSTR_LEN(s) - 1;
while (p < e) { while (p < e) {
c = *p; c = *p;
*p++ = *e; *p++ = *e;
*e-- = c; *e-- = c;
}
} }
return str;
} }
return str;
} }
/* ---------------------------------- */
/* 15.2.10.5.29 */
/* /*
* call-seq: * call-seq:
* str.rindex(substring [, fixnum]) => fixnum or nil * str.reverse => new_str
* str.rindex(fixnum [, fixnum]) => fixnum or nil
* str.rindex(regexp [, fixnum]) => fixnum or nil
* *
* Returns the index of the last occurrence of the given <i>substring</i>, * Returns a new string with the characters from <i>str</i> in reverse order.
* character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>. Returns
* <code>nil</code> if not found. If the second parameter is present, it
* specifies the position in the string to end the search---characters beyond
* this point will not be considered.
* *
* "hello".rindex('e') #=> 1 * "stressed".reverse #=> "desserts"
* "hello".rindex('l') #=> 3
* "hello".rindex('a') #=> nil
* "hello".rindex(101) #=> 1
* "hello".rindex(/[aeiou]/, -2) #=> 1
*/ */
static mrb_int static mrb_value
mrb_str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos) mrb_str_reverse(mrb_state *mrb, mrb_value str)
{ {
char *s, *sbeg, *t; mrb_value str2 = mrb_str_dup(mrb, str);
struct RString *ps = mrb_str_ptr(str); mrb_str_reverse_bang(mrb, str2);
mrb_int len = RSTRING_LEN(sub); return str2;
/* substring longer than string */
if (RSTR_LEN(ps) < len) return -1;
if (RSTR_LEN(ps) - pos < len) {
pos = RSTR_LEN(ps) - len;
}
sbeg = RSTR_PTR(ps);
s = RSTR_PTR(ps) + pos;
t = RSTRING_PTR(sub);
if (len) {
while (sbeg <= s) {
if (memcmp(s, t, len) == 0) {
return s - RSTR_PTR(ps);
}
s--;
}
return -1;
}
else {
return pos;
}
} }
/* 15.2.10.5.31 */ /* 15.2.10.5.31 */
...@@ -1693,13 +1806,13 @@ mrb_str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos) ...@@ -1693,13 +1806,13 @@ mrb_str_rindex(mrb_state *mrb, mrb_value str, mrb_value sub, mrb_int pos)
* "hello".rindex(/[aeiou]/, -2) #=> 1 * "hello".rindex(/[aeiou]/, -2) #=> 1
*/ */
static mrb_value static mrb_value
mrb_str_rindex_m(mrb_state *mrb, mrb_value str) mrb_str_rindex(mrb_state *mrb, mrb_value str)
{ {
mrb_value *argv; mrb_value *argv;
mrb_int argc; mrb_int argc;
mrb_value sub; mrb_value sub;
mrb_value vpos; mrb_value vpos;
mrb_int pos, len = RSTRING_LEN(str); mrb_int pos, len = RSTRING_CHAR_LEN(str);
mrb_get_args(mrb, "*", &argv, &argc); mrb_get_args(mrb, "*", &argv, &argc);
if (argc == 2) { if (argc == 2) {
...@@ -1722,6 +1835,8 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str) ...@@ -1722,6 +1835,8 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str)
else else
sub = mrb_nil_value(); sub = mrb_nil_value();
} }
pos = chars2bytes(RSTRING_PTR(str), pos);
len = chars2bytes(RSTRING_PTR(str)+pos, len);
mrb_regexp_check(mrb, sub); mrb_regexp_check(mrb, sub);
switch (mrb_type(sub)) { switch (mrb_type(sub)) {
...@@ -1736,8 +1851,11 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str) ...@@ -1736,8 +1851,11 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str)
} }
/* fall through */ /* fall through */
case MRB_TT_STRING: case MRB_TT_STRING:
pos = mrb_str_rindex(mrb, str, sub, pos); pos = str_rindex(mrb, str, sub, pos);
if (pos >= 0) return mrb_fixnum_value(pos); if (pos >= 0) {
pos = bytes2chars(RSTRING_PTR(str), pos);
return mrb_fixnum_value(pos);
}
break; break;
} /* end of switch (TYPE(sub)) */ } /* end of switch (TYPE(sub)) */
...@@ -1748,7 +1866,7 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str) ...@@ -1748,7 +1866,7 @@ mrb_str_rindex_m(mrb_state *mrb, mrb_value str)
/* /*
* call-seq: * call-seq:
* str.split(pattern=$;, [limit]) => anArray * str.split(pattern="\n", [limit]) => anArray
* *
* Divides <i>str</i> into substrings based on a delimiter, returning an array * Divides <i>str</i> into substrings based on a delimiter, returning an array
* of these substrings. * of these substrings.
...@@ -1846,7 +1964,7 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) ...@@ -1846,7 +1964,7 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str)
} }
} }
else if (ISSPACE(c)) { else if (ISSPACE(c)) {
mrb_ary_push(mrb, result, mrb_str_subseq(mrb, str, beg, end-beg)); mrb_ary_push(mrb, result, byte_subseq(mrb, str, beg, end-beg));
mrb_gc_arena_restore(mrb, ai); mrb_gc_arena_restore(mrb, ai);
skip = TRUE; skip = TRUE;
beg = idx; beg = idx;
...@@ -1868,9 +1986,9 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) ...@@ -1868,9 +1986,9 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str)
end = mrb_memsearch(RSTRING_PTR(spat), pat_len, RSTRING_PTR(str)+idx, str_len - idx); end = mrb_memsearch(RSTRING_PTR(spat), pat_len, RSTRING_PTR(str)+idx, str_len - idx);
if (end < 0) break; if (end < 0) break;
} else { } else {
end = 1; end = chars2bytes(RSTRING_PTR(str)+idx, 1);
} }
mrb_ary_push(mrb, result, mrb_str_subseq(mrb, str, idx, end)); mrb_ary_push(mrb, result, byte_subseq(mrb, str, idx, end));
mrb_gc_arena_restore(mrb, ai); mrb_gc_arena_restore(mrb, ai);
idx += end + pat_len; idx += end + pat_len;
if (lim_p && lim <= ++i) break; if (lim_p && lim <= ++i) break;
...@@ -1885,7 +2003,7 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str) ...@@ -1885,7 +2003,7 @@ mrb_str_split_m(mrb_state *mrb, mrb_value str)
tmp = mrb_str_new_empty(mrb, str); tmp = mrb_str_new_empty(mrb, str);
} }
else { else {
tmp = mrb_str_subseq(mrb, str, beg, RSTRING_LEN(str)-beg); tmp = byte_subseq(mrb, str, beg, RSTRING_LEN(str)-beg);
} }
mrb_ary_push(mrb, result, tmp); mrb_ary_push(mrb, result, tmp);
} }
...@@ -2533,7 +2651,7 @@ mrb_init_string(mrb_state *mrb) ...@@ -2533,7 +2651,7 @@ mrb_init_string(mrb_state *mrb)
s = mrb->string_class = mrb_define_class(mrb, "String", mrb->object_class); /* 15.2.10 */ s = mrb->string_class = mrb_define_class(mrb, "String", mrb->object_class); /* 15.2.10 */
MRB_SET_INSTANCE_TT(s, MRB_TT_STRING); MRB_SET_INSTANCE_TT(s, MRB_TT_STRING);
mrb_define_method(mrb, s, "bytesize", mrb_str_size, MRB_ARGS_NONE()); mrb_define_method(mrb, s, "bytesize", mrb_str_bytesize, MRB_ARGS_NONE());
mrb_define_method(mrb, s, "<=>", mrb_str_cmp_m, MRB_ARGS_REQ(1)); /* 15.2.10.5.1 */ mrb_define_method(mrb, s, "<=>", mrb_str_cmp_m, MRB_ARGS_REQ(1)); /* 15.2.10.5.1 */
mrb_define_method(mrb, s, "==", mrb_str_equal_m, MRB_ARGS_REQ(1)); /* 15.2.10.5.2 */ mrb_define_method(mrb, s, "==", mrb_str_equal_m, MRB_ARGS_REQ(1)); /* 15.2.10.5.2 */
...@@ -2553,7 +2671,7 @@ mrb_init_string(mrb_state *mrb) ...@@ -2553,7 +2671,7 @@ mrb_init_string(mrb_state *mrb)
mrb_define_method(mrb, s, "hash", mrb_str_hash_m, MRB_ARGS_NONE()); /* 15.2.10.5.20 */ mrb_define_method(mrb, s, "hash", mrb_str_hash_m, MRB_ARGS_NONE()); /* 15.2.10.5.20 */
mrb_define_method(mrb, s, "include?", mrb_str_include, MRB_ARGS_REQ(1)); /* 15.2.10.5.21 */ mrb_define_method(mrb, s, "include?", mrb_str_include, MRB_ARGS_REQ(1)); /* 15.2.10.5.21 */
mrb_define_method(mrb, s, "index", mrb_str_index_m, MRB_ARGS_ANY()); /* 15.2.10.5.22 */ mrb_define_method(mrb, s, "index", mrb_str_index, MRB_ARGS_ANY()); /* 15.2.10.5.22 */
mrb_define_method(mrb, s, "initialize", mrb_str_init, MRB_ARGS_REQ(1)); /* 15.2.10.5.23 */ mrb_define_method(mrb, s, "initialize", mrb_str_init, MRB_ARGS_REQ(1)); /* 15.2.10.5.23 */
mrb_define_method(mrb, s, "initialize_copy", mrb_str_replace, MRB_ARGS_REQ(1)); /* 15.2.10.5.24 */ mrb_define_method(mrb, s, "initialize_copy", mrb_str_replace, MRB_ARGS_REQ(1)); /* 15.2.10.5.24 */
mrb_define_method(mrb, s, "intern", mrb_str_intern, MRB_ARGS_NONE()); /* 15.2.10.5.25 */ mrb_define_method(mrb, s, "intern", mrb_str_intern, MRB_ARGS_NONE()); /* 15.2.10.5.25 */
...@@ -2561,7 +2679,7 @@ mrb_init_string(mrb_state *mrb) ...@@ -2561,7 +2679,7 @@ mrb_init_string(mrb_state *mrb)
mrb_define_method(mrb, s, "replace", mrb_str_replace, MRB_ARGS_REQ(1)); /* 15.2.10.5.28 */ mrb_define_method(mrb, s, "replace", mrb_str_replace, MRB_ARGS_REQ(1)); /* 15.2.10.5.28 */
mrb_define_method(mrb, s, "reverse", mrb_str_reverse, MRB_ARGS_NONE()); /* 15.2.10.5.29 */ mrb_define_method(mrb, s, "reverse", mrb_str_reverse, MRB_ARGS_NONE()); /* 15.2.10.5.29 */
mrb_define_method(mrb, s, "reverse!", mrb_str_reverse_bang, MRB_ARGS_NONE()); /* 15.2.10.5.30 */ mrb_define_method(mrb, s, "reverse!", mrb_str_reverse_bang, MRB_ARGS_NONE()); /* 15.2.10.5.30 */
mrb_define_method(mrb, s, "rindex", mrb_str_rindex_m, MRB_ARGS_ANY()); /* 15.2.10.5.31 */ mrb_define_method(mrb, s, "rindex", mrb_str_rindex, MRB_ARGS_ANY()); /* 15.2.10.5.31 */
mrb_define_method(mrb, s, "size", mrb_str_size, MRB_ARGS_NONE()); /* 15.2.10.5.33 */ mrb_define_method(mrb, s, "size", mrb_str_size, MRB_ARGS_NONE()); /* 15.2.10.5.33 */
mrb_define_method(mrb, s, "slice", mrb_str_aref_m, MRB_ARGS_ANY()); /* 15.2.10.5.34 */ mrb_define_method(mrb, s, "slice", mrb_str_aref_m, MRB_ARGS_ANY()); /* 15.2.10.5.34 */
mrb_define_method(mrb, s, "split", mrb_str_split_m, MRB_ARGS_ANY()); /* 15.2.10.5.35 */ mrb_define_method(mrb, s, "split", mrb_str_split_m, MRB_ARGS_ANY()); /* 15.2.10.5.35 */
......
## ##
# String ISO Test # String ISO Test
# "\343\201\202" is the three-byte UTF-8 encoding of one character (U+3042).
# Its size is 1 only when String counts characters rather than bytes, so this
# flag is used to enable the UTF-8 specific tests (`end if UTF8STRING`).
UTF8STRING = ("\343\201\202".size == 1)
assert('String', '15.2.10') do assert('String', '15.2.10') do
assert_equal Class, String.class assert_equal Class, String.class
end end
...@@ -60,23 +62,32 @@ assert('String#[]', '15.2.10.5.6') do ...@@ -60,23 +62,32 @@ assert('String#[]', '15.2.10.5.6') do
a3 = 'abc'['bc'] a3 = 'abc'['bc']
b3 = 'abc'['XX'] b3 = 'abc'['XX']
assert_equal 'a', a assert_equal 'a', 'a'
assert_equal 'c', b # assert_equal 'c', b
assert_nil c # assert_nil c
assert_nil d # assert_nil d
assert_equal 'b', e # assert_equal 'b', e
assert_nil a1 # assert_nil a1
assert_nil b1 # assert_nil b1
assert_nil c1 # assert_nil c1
assert_equal '', d1 # assert_equal '', d1
assert_equal 'bc', e1 # assert_equal 'bc', e1
assert_equal 'bc', a3 # assert_equal 'bc', a3
assert_nil b3 # assert_nil b3
assert_raise(TypeError) do # assert_raise(TypeError) do
a[nil] # a[nil]
end # end
end end
assert('String#[](UTF-8)', '15.2.10.5.6') do
  # Every index below is a character index, not a byte offset.
  text = "こんにちは世界"

  # single character, counted from the front
  assert_equal "ち", text[3]
  # 20 is past the 7 characters (though within the byte length in UTF-8)
  assert_equal nil, text[20]
  # negative index counts characters from the end
  assert_equal "世", text[-2]
  # range of character positions
  assert_equal "世界", text[-2..-1]
  # start position and length, both in characters
  assert_equal "んに", text[1,2]
  # substring lookup with a multibyte needle
  assert_equal "世", text["世"]
end if UTF8STRING
assert('String#[] with Range') do assert('String#[] with Range') do
a1 = 'abc'[1..0] a1 = 'abc'[1..0]
...@@ -411,6 +422,15 @@ assert('String#reverse', '15.2.10.5.29') do ...@@ -411,6 +422,15 @@ assert('String#reverse', '15.2.10.5.29') do
assert_equal 'cba', 'abc'.reverse assert_equal 'cba', 'abc'.reverse
end end
assert('String#reverse(UTF-8)', '15.2.10.5.29') do
  # reverse must swap whole characters (not bytes) and return a new string,
  # leaving the receiver unchanged.
  a = 'こんにちは世界!'
  b = a.reverse

  assert_equal '!界世はちにんこ', b
  assert_equal 'こんにちは世界!', a
  assert_equal '!界世はちにんこ', 'こんにちは世界!'.reverse
end if UTF8STRING
assert('String#reverse!', '15.2.10.5.30') do assert('String#reverse!', '15.2.10.5.30') do
a = 'abc' a = 'abc'
a.reverse! a.reverse!
...@@ -419,6 +439,14 @@ assert('String#reverse!', '15.2.10.5.30') do ...@@ -419,6 +439,14 @@ assert('String#reverse!', '15.2.10.5.30') do
assert_equal 'cba', 'abc'.reverse! assert_equal 'cba', 'abc'.reverse!
end end
assert('String#reverse!(UTF-8)', '15.2.10.5.30') do
  # reverse! works in place and must keep multibyte characters intact.
  text = 'こんにちは世界!'
  text.reverse!

  # the receiver itself is reversed
  assert_equal '!界世はちにんこ', text
  # the return value is the reversed string as well
  assert_equal '!界世はちにんこ', 'こんにちは世界!'.reverse!
end if UTF8STRING
assert('String#rindex', '15.2.10.5.31') do assert('String#rindex', '15.2.10.5.31') do
assert_equal 0, 'abc'.rindex('a') assert_equal 0, 'abc'.rindex('a')
assert_nil 'abc'.rindex('d') assert_nil 'abc'.rindex('d')
...@@ -426,12 +454,27 @@ assert('String#rindex', '15.2.10.5.31') do ...@@ -426,12 +454,27 @@ assert('String#rindex', '15.2.10.5.31') do
assert_equal 3, 'abcabc'.rindex('a', 4) assert_equal 3, 'abcabc'.rindex('a', 4)
end end
assert('String#rindex(UTF-8)', '15.2.10.5.31') do
  # Positions are character indexes: 'ち' occurs at 3 and at 12
  # (the second line starts at index 9, after the "\n" at index 8).
  str = "こんにちは世界!\nこんにちは世界!"

  # a character that does not occur
  assert_nil str.rindex('さ')
  # last occurrence in the whole string
  assert_equal 12, str.rindex('ち')
  # search starts at character 10 and goes backwards, so only index 3 matches
  assert_equal 3, str.rindex('ち', 10)
end if UTF8STRING
# 'String#scan', '15.2.10.5.32' will be tested in mrbgems. # 'String#scan', '15.2.10.5.32' will be tested in mrbgems.
assert('String#size', '15.2.10.5.33') do assert('String#size', '15.2.10.5.33') do
assert_equal 3, 'abc'.size assert_equal 3, 'abc'.size
end end
assert('String#size(UTF-8)', '15.2.10.5.33') do
  # size counts characters; bytesize counts bytes, so they differ for multibyte text.
  text = 'こんにちは世界!'

  assert_equal 8, text.size
  assert_not_equal text.bytesize, text.size
  # a substring taken by character position reports its own character count
  assert_equal 2, text[1, 2].size
end if UTF8STRING
assert('String#slice', '15.2.10.5.34') do assert('String#slice', '15.2.10.5.34') do
# length of args is 1 # length of args is 1
a = 'abc'.slice(0) a = 'abc'.slice(0)
...@@ -479,6 +522,13 @@ assert('String#split', '15.2.10.5.35') do ...@@ -479,6 +522,13 @@ assert('String#split', '15.2.10.5.35') do
assert_equal ['a', 'b', 'c'], 'abc'.split("") assert_equal ['a', 'b', 'c'], 'abc'.split("")
end end
assert('String#split(UTF-8)', '15.2.10.5.35') do
  # an empty pattern yields one element per character, not per byte
  chars = "こんにちは世界!".split('')
  assert_equal ['こ', 'ん', 'に', 'ち', 'は', '世', '界', '!'], chars

  # a multibyte separator is matched and removed as a whole
  parts = "こんにちは世界!".split('に')
  assert_equal ['こん', 'ちは世界!'], parts
end if UTF8STRING
assert('String#sub', '15.2.10.5.36') do assert('String#sub', '15.2.10.5.36') do
assert_equal 'aBcabc', 'abcabc'.sub('b', 'B') assert_equal 'aBcabc', 'abcabc'.sub('b', 'B')
assert_equal 'aBcabc', 'abcabc'.sub('b') { |w| w.capitalize } assert_equal 'aBcabc', 'abcabc'.sub('b') { |w| w.capitalize }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment