commit 3fa532f2a7325f94a5798f86d77e8964df6e3434
parent 6e32c8bc706b32b633157d46f2ccd4feb9d37b6f
Author: Tomas Hlavaty <tom@logand.com>
Date:   Sun, 20 Sep 2015 18:15:40 +0200
utf8 codepoint reader added
Diffstat:
| M | rw.lisp |  |  | 53 | +++++++++++++++++++++++++++++++++++++++++++++++++++++ | 
1 file changed, 53 insertions(+), 0 deletions(-)
diff --git a/rw.lisp b/rw.lisp
@@ -44,6 +44,7 @@
            :next-u32be
            :next-u32le
            :next-u8
+           :next-utf8
            :next-z0
            :peek
            :peek-reader
@@ -63,6 +64,7 @@
            :u32
            :u32be
            :u32le
+           :utf8-reader
            :wrap-reader
            :wrap-writer
            :write-octets
@@ -334,6 +336,57 @@
     (when x
       (parse-integer (coerce x 'string) :radix radix))))
 
+(defun next-utf8 (reader)
+  (let ((i1 (rw:next reader)) i2 i3 i4 o2 o3 o4)
+    (macrolet ((wrong ()
+                 `(error "wrong UTF-8 sequence ~x ~x ~x ~x" i1 i2 i3 i4))
+               (tail (i o)
+                 `(progn
+                    (setq ,i (rw:next reader))
+                    (unless (and (typep ,i '(unsigned-byte 8))
+                                 (= #x80 (logand #b11000000 ,i)))
+                      (wrong))
+                    (setq ,o (logand #b00111111 ,i)))))
+      (cond
+        ((not i1) nil)
+        ((not (typep i1 '(unsigned-byte 8)))
+         (wrong))
+        ((<= #b00000000 i1 #b01111111) ;; one
+         i1)
+        ((<= #b11000000 i1 #b11011111) ;; two
+         (tail i2 o2)
+         (let ((z (logior (ash (logand #x1f i1) 6) o2)))
+           (unless (<= #x000080 z #x0007ff)
+             (wrong))
+           z))
+        ((<= #b11100000 i1 #b11101111) ;; three
+         (tail i2 o2)
+         (tail i3 o3)
+         (let ((z (logior (ash (logand #x0f i1) 12) (ash o2 6) o3)))
+           (unless (or (<= #x000800 z #x00d7ff)
+                       (<= #x00e000 z #x00ffff))
+             (wrong))
+           z))
+        ((<= #b11110000 i1 #b11110111) ;; four
+         (tail i2 o2)
+         (tail i3 o3)
+         (tail i4 o4)
+         (let ((z (logior (ash (logand #x07 i1) 18) (ash o2 12) (ash o3 6) o4)))
+           (unless (<= #x010000 z #x10ffff)
+             (wrong))
+           z))
+        (t (wrong))))))
+
+(defun utf8-reader (octet-reader)
+  (lambda ()
+    (next-utf8 octet-reader)))
+
+;;(rw:till (rw:peek-reader (utf8-reader (rw:reader #(#x24)))))
+;;(rw:till (rw:peek-reader (utf8-reader (rw:reader #(#xc2 #xa2)))))
+;;(rw:till (rw:peek-reader (utf8-reader (rw:reader #(#xe2 #x82 #xac)))))
+;;(rw:till (rw:peek-reader (utf8-reader (rw:reader #(#xf0 #x90 #x8d #x88)))))
+;;(rw:till (rw:peek-reader (utf8-reader (rw:reader #(#xc0 #x80))))) ;; overlong
+
 (defun bit-reader (octet-reader)
   (let (octet bit)
     (lambda ()