Detect invalid first byte of UTF-8 char; fix #5269

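The first byte of a UTF-8 character must not be in the range `80..c1` (hex): `80..bf` are continuation bytes, and `c0`/`c1` could only begin overlong two-byte encodings.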

Detect invalid first byte of UTF-8 char; fix #5269
The first byte of a UTF-8 character must not be in the range `80..c1` (hex).
f81591ce · Yukihiro "Matz" Matsumoto · 62e52473 · f81591ce
Unverified Commit f81591ce authored Jan 09, 2021 by Yukihiro "Matz" Matsumoto
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 5 deletions

src/string.c src/string.c +7 -5

No files found.
--- a/src/string.c
+++ b/src/string.c
@@ -284,10 +284,12 @@ static const char utf8len_codepage[256] =
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,1,1,1,1,1,1,1,1,1,1,1,
 };

+#define utf8_islead(c) ((unsigned char)((c)&0xc0) != 0x80)
+
 mrb_int
 mrb_utf8len(const char* p, const char* e)
 {
@@ -299,7 +301,7 @@ mrb_utf8len(const char* p, const char* e)
  if (len == 1) return 1;
  if (len > e - p) return 1;
  for (i = 1; i < len; ++i)
-    if ((p[i] & 0xc0) != 0x80)
+    if (utf8_islead(p[i]))
      return 1;
  return len;
 }
@@ -307,15 +309,15 @@ mrb_utf8len(const char* p, const char* e)
 mrb_int
 mrb_utf8_strlen(const char *str, mrb_int byte_len)
 {
-  mrb_int total = 0;
+  mrb_int len = 0;
  const char *p = str;
  const char *e = p + byte_len;

  while (p < e) {
    p += mrb_utf8len(p, e);
-    total++;
+    len++;
  }
-  return total;
+  return len;
 }

 static mrb_int