Commit 383a9c79, authored by Tomoyuki Sahara, committed by GitHub

Merge pull request #13 from ksss/unpack-utf8

Support unpack template "U"
parents 7e014efe 0dbd1d59
......@@ -38,6 +38,7 @@ There is no dependency on other mrbgems.
- q : 64-bit signed, native endian (`int64_t`)
- S : 16-bit unsigned, native endian (`uint16_t`)
- s : 16-bit signed, native endian (`int16_t`)
- U : UTF-8 character
- V : 32-bit unsigned, VAX (little-endian) byte order
- v : 16-bit unsigned, VAX (little-endian) byte order
- x : null byte
......
......@@ -77,7 +77,7 @@ check_little_endian(void)
static unsigned int
hex2int(unsigned char ch)
{
if (ch >= '0' && ch <= '9')
if (ch >= '0' && ch <= '9')
return ch - '0';
else if (ch >= 'A' && ch <= 'F')
return 10 + (ch - 'A');
......@@ -414,8 +414,12 @@ pack_utf8(mrb_state *mrb, mrb_value o, mrb_value str, mrb_int sidx, long count,
{
char utf8[4];
int len;
unsigned long c = mrb_fixnum(o);
unsigned long c = 0;
if (mrb_float_p(o)) {
goto range_error;
}
c = mrb_fixnum(o);
/* Unicode character */
/* from mruby-compiler gem */
......@@ -434,20 +438,98 @@ pack_utf8(mrb_state *mrb, mrb_value o, mrb_value str, mrb_int sidx, long count,
utf8[2] = (char)(0x80 | ( c & 0x3F));
len = 3;
}
else {
else if (c < 0x200000) {
utf8[0] = (char)(0xF0 | (c >> 18) );
utf8[1] = (char)(0x80 | ((c >> 12) & 0x3F));
utf8[2] = (char)(0x80 | ((c >> 6) & 0x3F));
utf8[3] = (char)(0x80 | ( c & 0x3F));
len = 4;
}
else {
range_error:
mrb_raise(mrb, E_RANGE_ERROR, "pack(U): value out of range");
}
str = str_len_ensure(mrb, str, sidx + len);
memcpy(RSTRING_PTR(str) + sidx, utf8, len);
return len;
}
/*
 * Lower bounds used to detect over-long ("redundant") encodings.
 * Entry [k] is the smallest value that may legitimately be stored in a
 * sequence of k + 1 bytes; a decoded value below that bound could have
 * been written with fewer bytes.
 */
static const unsigned long utf8_limits[] = {
  0UL,        /* 1 byte:  0x0        */
  1UL << 7,   /* 2 bytes: 0x80       */
  1UL << 11,  /* 3 bytes: 0x800      */
  1UL << 16,  /* 4 bytes: 0x10000    */
  1UL << 21,  /* 5 bytes: 0x200000   */
  1UL << 26,  /* 6 bytes: 0x4000000  */
  1UL << 31,  /* 7 bytes: 0x80000000 */
};
/*
 * Decode one UTF-8 style sequence starting at p.
 *
 * On entry *lenp holds the number of bytes available at p; on return it
 * holds the number of bytes the decoded sequence occupies.  Sequences of
 * up to six bytes are accepted.
 *
 * Raises ArgumentError when the lead byte is not a valid sequence start,
 * when the input is shorter than the lead byte announces, when a
 * continuation byte is not of the form 10xxxxxx, or when the value is
 * encoded in more bytes than necessary.
 *
 * NOTE(review): the code relies on mrb_raise()/mrb_raisef() not returning
 * to the caller -- confirm against the mruby headers.
 */
static unsigned long
utf8_to_uv(mrb_state *mrb, const char *p, long *lenp)
{
  int byte = *p++ & 0xff;
  unsigned long uv = byte;
  long seqlen;
  long remaining;

  /* 0xxxxxxx: a single byte stands for itself. */
  if (byte < 0x80) {
    *lenp = 1;
    return uv;
  }
  /* 10xxxxxx: a continuation byte cannot start a sequence. */
  if (byte < 0xc0) {
    *lenp = 1;
    mrb_raise(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character");
  }

  /* The run of leading 1 bits gives the sequence length; keep the payload bits. */
  if ((byte & 0xe0) == 0xc0) {
    seqlen = 2;
    uv &= 0x1f;
  }
  else if ((byte & 0xf0) == 0xe0) {
    seqlen = 3;
    uv &= 0x0f;
  }
  else if ((byte & 0xf8) == 0xf0) {
    seqlen = 4;
    uv &= 0x07;
  }
  else if ((byte & 0xfc) == 0xf8) {
    seqlen = 5;
    uv &= 0x03;
  }
  else if ((byte & 0xfe) == 0xfc) {
    seqlen = 6;
    uv &= 0x01;
  }
  else {
    /* 0xfe and 0xff never begin a sequence. */
    *lenp = 1;
    mrb_raise(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character");
  }

  if (seqlen > *lenp) {
    mrb_raisef(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character (expected %S bytes, given %S bytes)",
               mrb_fixnum_value(seqlen), mrb_fixnum_value(*lenp));
  }
  *lenp = seqlen;

  /* Fold in the continuation bytes, six payload bits apiece. */
  for (remaining = seqlen - 1; remaining > 0; remaining--) {
    byte = *p++ & 0xff;
    if ((byte & 0xc0) != 0x80) {
      /* Report only the bytes that came before the offending one. */
      *lenp -= remaining;
      mrb_raisef(mrb, E_ARGUMENT_ERROR, "malformed UTF-8 character");
    }
    else {
      uv = (uv << 6) | (unsigned long)(byte & 0x3f);
    }
  }

  /* A value below the minimum for this length was encoded too long. */
  if (uv < utf8_limits[*lenp - 1]) {
    mrb_raisef(mrb, E_ARGUMENT_ERROR, "redundant UTF-8 sequence");
  }
  return uv;
}
/*
 * Unpack handler for the "U" directive: decode one character from src
 * and append its value to ary as a fixnum.
 *
 * Returns the number of source bytes the caller should advance by.  When
 * no input is left nothing is pushed and 1 is returned.  The flags
 * argument is not used here.
 */
static int
unpack_utf8(mrb_state *mrb, const unsigned char * src, int srclen, mrb_value ary, unsigned int flags)
{
  long consumed;
  unsigned long codepoint;

  if (!srclen) {
    return 1;
  }

  /* utf8_to_uv() takes the available length in and hands the used length back. */
  consumed = srclen;
  codepoint = utf8_to_uv(mrb, (const char *)src, &consumed);
  mrb_ary_push(mrb, ary, mrb_fixnum_value((mrb_int)codepoint));

  return (int)consumed;
}
static int
pack_a(mrb_state *mrb, mrb_value src, mrb_value dst, mrb_int didx, long count, unsigned int flags)
{
......@@ -482,7 +564,7 @@ pack_a(mrb_state *mrb, mrb_value src, mrb_value dst, mrb_int didx, long count, u
while (padlen-- > 0) {
*dptr++ = pad;
}
return dptr - dptr0;
}
......@@ -541,7 +623,7 @@ pack_h(mrb_state *mrb, mrb_value src, mrb_value dst, mrb_int didx, long count, u
} else if (slen > count) {
slen = count;
}
dst = str_len_ensure(mrb, dst, didx + count);
dptr = RSTRING_PTR(dst) + didx;
......@@ -1147,6 +1229,11 @@ mrb_pack_unpack(mrb_state *mrb, mrb_value str)
case PACK_DIR_DOUBLE:
srcidx += unpack_double(mrb, sptr, srclen - srcidx, result, flags);
break;
case PACK_DIR_UTF8:
srcidx += unpack_utf8(mrb, sptr, srclen - srcidx, result, flags);
break;
default:
mrb_raise(mrb, E_RUNTIME_ERROR, "mruby-pack's bug");
}
if (count > 0) {
count--;
......
......@@ -145,3 +145,21 @@ assert 'pack/unpack "I"' do
end
assert_pack 'I', str, [12345]
end
# Exercises the "U" (UTF-8 character) directive in both directions.
assert 'pack/unpack "U"' do
  greeting = "こんにちは世界"
  codepoints = [12371, 12435, 12395, 12385, 12399, 19990, 30028]

  # String -> code points
  assert_equal [], "".unpack("U")
  assert_equal [], "".unpack("U*")
  assert_equal [65, 66], "ABC".unpack("U2")
  assert_equal codepoints, greeting.unpack("U*")

  # Code points -> string
  assert_equal "", [].pack("U")
  assert_equal "", [].pack("U*")
  assert_equal "AB", [65, 66, 67].pack("U2")
  assert_equal greeting, codepoints.pack("U*")
  assert_equal "\000", [0].pack("U")

  # Values that cannot be encoded must be rejected.
  [-0x40000000, -1, 0x40000000].each do |value|
    assert_raise(RangeError) { [value].pack("U") }
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment