{-
  Ambidexter is Copyright (c) Prescott K. Turner, 2005. All rights reserved.
  It is distributed as free software under the license in the file "License",
  which is included in the distribution.
-}
module UTF8Input (readUTF8File)
where

import Data.Word
import System.IO
import Data.Array.Storable
import Data.Array.MArray

readUTF8File :: FilePath -> IO String
readUTF8File fname = do
    handle <- openBinaryFile fname ReadMode
    bytes <- readRemainder handle
    return (from_utf8 bytes)

readRemainder :: Handle -> IO [Word8]
readRemainder handle = do
    let chunk_size = 2000
    arr <- newListArray (0, chunk_size) (replicate chunk_size 0) :: IO (StorableArray Int Word8)
    count <- withStorableArray arr (\buf -> hGetBuf handle buf chunk_size)
    if count > 0 then do
	arr_elems <- getElems arr
	let chunk = take count arr_elems
	remainder <- readRemainder handle
	return $ chunk ++ remainder
     else
	return []

{-
  0xxxxxxx                                     000000000xxxxxxx
  110yyyyy 10xxxxxx                            00000yyyyyxxxxxx
  1110zzzz 10yyyyyy 10xxxxxx                   zzzzyyyyyyxxxxxx
-}

from_utf8 :: [Word8] -> String
from_utf8 a = case a of
	(b0:bytes) | b0 < b10000000 -> toEnum (fromEnum b0) : from_utf8 bytes
		   | b0 < b11000000 -> cerr : from_utf8 bytes
		   | b0 < b11100000 -> continue' 1 (b0 - b11000000) bytes
		   | b0 < b11110000 -> continue' 2 (b0 - b11100000) bytes
		   | b0 < b11111000 -> continue' 3 (b0 - b11110000) bytes
		   | b0 < b11111100 -> continue' 4 (b0 - b11111000) bytes
		   | otherwise -> continue' 5 (b0 - b11111100) bytes
	[] -> []
    where
	continue :: Int -> Integer -> [Word8] -> String
	continue need_bytes val input = if need_bytes == 0
		then val' `seq` val' : from_utf8 input
		else case input of
		   (b0:bytes) | is_initial b0 -> cerr : from_utf8 input
			      | otherwise -> continue (need_bytes-1)
						      (val*64+ toInteger (b0-b10000000))
						      bytes
		   [] -> cerr : from_utf8 [] -- premature end of input
	    where
		val' = toEnum (fromEnum val)
	continue' nb byte bytes = continue nb (toInteger byte) bytes

	is_initial b = b < b10000000 || b11000000 <= b
	cerr = '?'

	b10000000 = 128 :: Word8
	b11000000 = 192 :: Word8
	b11100000 = 224 :: Word8
	b11110000 = 240 :: Word8
	b11111000 = 248 :: Word8
	b11111100 = 252 :: Word8
