Commit 4918e5ec authored by Yukihiro "Matz" Matsumoto's avatar Yukihiro "Matz" Matsumoto

Merge pull request #2075 from mattn/utf8-string-ord

Implement String#ord, String#split for mruby-string-utf8
parents 7c1b3f8a d72b8c20
#include "mruby.h"
#include "mruby/array.h"
#include "mruby/string.h"
#include "mruby/range.h"
#include "mruby/re.h"
......@@ -23,6 +24,75 @@ static const char utf8len_codepage[256] =
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1,
};
static char utf8len_codepage_zero[256] =
{
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,
};
static const char isspacetable[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
static mrb_int
utf8code(unsigned char* p)
{
mrb_int len;
if (p[0] < 0x80)
return p[0];
len = utf8len_codepage_zero[p[0]];
if (len > 1 && (p[1] & 0xc0) == 0x80) {
if (len == 2)
return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
if ((p[2] & 0xc0) == 0x80) {
if (len == 3)
return ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
+ (p[2] & 0x3f);
if ((p[3] & 0xc0) == 0x80) {
if (len == 4)
return ((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
+ ((p[2] & 0x3f) << 6) + (p[3] & 0x3f);
if ((p[4] & 0xc0) == 0x80) {
if (len == 5)
return ((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18)
+ ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6)
+ (p[4] & 0x3f);
if ((p[5] & 0xc0) == 0x80 && len == 6)
return ((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24)
+ ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12)
+ ((p[4] & 0x3f) << 6) + (p[5] & 0x3f);
}
}
}
}
return p[0];
}
static mrb_value mrb_fixnum_chr(mrb_state*, mrb_value);
static mrb_int
......@@ -463,6 +533,139 @@ mrb_fixnum_chr(mrb_state *mrb, mrb_value num)
return mrb_str_new(mrb, utf8, len);
}
static mrb_value
mrb_str_ord(mrb_state* mrb, mrb_value str)
{
mrb_int len = RSTRING_LEN(str);
if (len == 0) mrb_raise(mrb, E_ARGUMENT_ERROR, "empty string");
return mrb_fixnum_value(utf8code((unsigned char*) RSTRING_PTR(str)));
}
static mrb_value
mrb_str_split_m(mrb_state *mrb, mrb_value str)
{
int argc;
mrb_value spat = mrb_nil_value();
enum {awk, string, regexp} split_type = string;
long i = 0, lim_p;
mrb_int beg;
mrb_int end;
mrb_int lim = 0;
mrb_value result, tmp;
argc = mrb_get_args(mrb, "|oi", &spat, &lim);
lim_p = (lim > 0 && argc == 2);
if (argc == 2) {
if (lim == 1) {
if (RSTRING_LEN(str) == 0)
return mrb_ary_new_capa(mrb, 0);
return mrb_ary_new_from_values(mrb, 1, &str);
}
i = 1;
}
if (argc == 0 || mrb_nil_p(spat)) {
split_type = awk;
}
else {
if (mrb_string_p(spat)) {
split_type = string;
if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
split_type = awk;
}
}
else {
noregexp(mrb, str);
}
}
result = mrb_ary_new(mrb);
beg = 0;
if (split_type == awk) {
char *ptr = RSTRING_PTR(str);
char *eptr = RSTRING_END(str);
char *bptr = ptr;
int skip = 1;
unsigned int c;
end = beg;
while (ptr < eptr) {
int ai = mrb_gc_arena_save(mrb);
c = (unsigned char)*ptr++;
if (skip) {
if (ascii_isspace(c)) {
beg = ptr - bptr;
}
else {
end = ptr - bptr;
skip = 0;
if (lim_p && lim <= i) break;
}
}
else if (ascii_isspace(c)) {
mrb_ary_push(mrb, result, str_subseq(mrb, str, beg, end-beg));
mrb_gc_arena_restore(mrb, ai);
skip = 1;
beg = ptr - bptr;
if (lim_p) ++i;
}
else {
end = ptr - bptr;
}
}
}
else if (split_type == string) {
char *ptr = RSTRING_PTR(str); // s->as.ary
char *temp = ptr;
char *eptr = RSTRING_END(str);
mrb_int slen = RSTRING_LEN(spat);
if (slen == 0) {
int ai = mrb_gc_arena_save(mrb);
while (ptr < eptr) {
mrb_ary_push(mrb, result, str_subseq(mrb, str, ptr-temp, 1));
mrb_gc_arena_restore(mrb, ai);
ptr++;
if (lim_p && lim <= ++i) break;
}
}
else {
char *sptr = RSTRING_PTR(spat);
int ai = mrb_gc_arena_save(mrb);
while (ptr < eptr &&
(end = mrb_memsearch(sptr, slen, ptr, eptr - ptr)) >= 0) {
mrb_ary_push(mrb, result, str_subseq(mrb, str, ptr - temp, end));
mrb_gc_arena_restore(mrb, ai);
ptr += end + slen;
if (lim_p && lim <= ++i) break;
}
}
beg = ptr - temp;
}
else {
noregexp(mrb, str);
}
if (RSTRING_LEN(str) > 0 && (lim_p || RSTRING_LEN(str) > beg || lim < 0)) {
if (RSTRING_LEN(str) == beg) {
tmp = mrb_str_new_lit(mrb, "");
}
else {
tmp = str_subseq(mrb, str, beg, RSTRING_LEN(str)-beg);
}
mrb_ary_push(mrb, result, tmp);
}
if (!lim_p && lim == 0) {
mrb_int len;
while ((len = RARRAY_LEN(result)) > 0 &&
(tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
mrb_ary_pop(mrb, result);
}
return result;
}
void
mrb_mruby_string_utf8_gem_init(mrb_state* mrb)
{
......@@ -472,7 +675,9 @@ mrb_mruby_string_utf8_gem_init(mrb_state* mrb)
mrb_define_method(mrb, s, "length", mrb_str_size, MRB_ARGS_NONE());
mrb_define_method(mrb, s, "index", mrb_str_index_m, MRB_ARGS_ANY());
mrb_define_method(mrb, s, "[]", mrb_str_aref_m, MRB_ARGS_ANY());
mrb_define_method(mrb, s, "ord", mrb_str_ord, MRB_ARGS_NONE());
mrb_define_method(mrb, s, "slice", mrb_str_aref_m, MRB_ARGS_ANY());
mrb_define_method(mrb, s, "split", mrb_str_split_m, MRB_ARGS_ANY());
mrb_define_method(mrb, s, "reverse", mrb_str_reverse, MRB_ARGS_NONE());
mrb_define_method(mrb, s, "reverse!", mrb_str_reverse_bang, MRB_ARGS_NONE());
mrb_define_method(mrb, s, "rindex", mrb_str_rindex_m, MRB_ARGS_ANY());
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment