// singrdk/base/Kernel/Singularity/Xml/XmlReader.cs

////////////////////////////////////////////////////////////////////////////////
//
//  Microsoft Research Singularity
//
//  Copyright (c) Microsoft Corporation.  All rights reserved.
//
//  Note:   This is a very limited Xml Reader, carved out of the
//          Libraries\Xml implementation.  At the end exists a
//          very basic stream class, which makes it possible to avoid linking
//          against System.IO.
//
//  Note:   All kernel XML streams must be encoded as UTF-8.
//          UTF-8 is a way of writing Unicode characters with variable numbers
//          of bytes per character, optimized for the 128 ASCII
//          characters.  It's an efficient way of encoding US English in an
//          internationalizable way.
//          The UTF-8 byte order mark is simply the Unicode byte order mark
//          (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF).  The byte order mark
//          is used mostly to distinguish UTF-8 text from other encodings, and
//          doesn't switch the byte orderings.
//
//            bytes   bits    UTF-8 representation
//            -----   ----    -----------------------------------
//            1        7      0vvvvvvv
//            2       11      110vvvvv 10vvvvvv
//            3       16      1110vvvv 10vvvvvv 10vvvvvv
//            4       21      11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
//            -----   ----    -----------------------------------
//
//          Surrogate:
//            Real Unicode value = (HighSurrogate - 0xD800) * 0x400 +
//                                 (LowSurrogate - 0xDC00) + 0x10000
//
//  BUG:    The parser cannot handle attribute values in single quotes.
//          XML itself permits either quote style, so this is a limitation of
//          this reader: always write attribute values in double quotes.
//
//          Use this file for XML processing inside the Kernel
///////////////////////////////////////////////////////////////////////////////

namespace Microsoft.Singularity.Xml
{
    using System;
    using System.Collections;
    using Singularity.Io;
    using System.Text;

    /// <summary>
    /// This is a simple XML Parser that does no validation.
    /// All it does is parse the syntax of XML.
    /// Note - since IoMemory is not CLSCompliant, neither is this!
    /// </summary>
    [CLSCompliant(false)]
    public class XmlReader
    {
        // Current state of the tokenizer-driven state machine in Parse().
        private States state;
        // Line counter, starting at 1; ReadCharacter() increments it for every
        // '\n' it pulls from the stream.  Used in error reports.
        private int lineNumber;
        // Input source.  Parse() sets this to null once it has finished.
        private KernelMemoryStream stream;
        // Set by Parse() when the input starts with the UTF-8 byte order mark;
        // enables multi-byte decoding in ReadCharacter().
        private bool isUtf8;
        // Synthetic root node ("::xml") returned by Parse(); top-level elements
        // are added to it as children.
        private XmlNode doc;
        // Elements that have been opened but not yet closed.
        private XmlNode[] stack;
        // Number of live entries in 'stack'; the top entry is stack[stackTop - 1].
        private int stackTop;
        // Initial capacity of 'stack'; Push() doubles the array when it is full.
        private const int defaultStackDepth = 16;

        /// <summary>
        /// Common initialization shared by the public constructors: creates the
        /// synthetic "::xml" root, an empty element stack, and puts the state
        /// machine at its starting point on line 1.
        /// </summary>
        private XmlReader()
        {
            this.doc = new XmlNode("::xml");
            this.stack = new XmlNode[defaultStackDepth];
            this.stackTop = 0;
            this.lineNumber = 1;
            this.state = States.START;
        }

        /// <summary>
        /// Creates a reader over the contents of an IoMemory region.
        /// </summary>
        /// <param name="mem">The memory region holding the XML text.</param>
        public XmlReader(IoMemory mem)
            : this()
        {
            this.stream = new KernelIoMemoryStream(mem);
        }

        /// <summary>
        /// Creates a reader over the contents of a byte array.
        /// </summary>
        /// <param name="buffer">The bytes holding the XML text.</param>
        public XmlReader(byte[] buffer)
            : this()
        {
            this.stream = new KernelByteMemoryStream(buffer);
        }

        // Classification that ReadToken() assigns to each token it returns.
        private enum TokenType
        {
            // A run of name characters beginning with an ASCII letter.
            NAME,
            // A double-quoted string; the token text excludes the quotes.
            STRING_LITERAL,
            // One of the single characters < > / = ? ! -
            OPERATOR,
            // Any other single character, or no token at all (end of input).
            NONE
        }

        // States of the parsing state machine in Parse().
        private enum States
        {
            // Between tags: expecting '<' or free-standing text.
            START,
            // Just saw '<': expecting an element name, '/', or a skipped-element marker.
            NEED_ELEMENT_NAME,
            // Inside an opening tag: expecting an attribute name, '>' or '/'.
            NEED_ATTRIBUTE_NAME,
            // Just saw an attribute name: expecting '='.
            NEED_EQUALS_SIGN,
            // Just saw '=': expecting a quoted attribute value.
            NEED_ATTRIBUTE_VALUE,
            // Just saw '/' inside an opening tag: expecting the '>' of a self-closing element.
            NEED_CLOSURE_BRACKET,
            // Just saw "</": expecting the name of the element being closed.
            NEED_CLOSURE_NAME,
            // Just saw the name in a closing tag: expecting '>'.
            NEED_FINAL_BRACKET,
            // Inside an element that is skipped rather than parsed (for example
            // one starting with "<?" or "<!"); input is discarded up to '>'.
            QUESTION_MARK_ELEMENT,
            // In the middle of free-standing text: collecting characters up to the next '<'.
            GET_STRINGS
        }

        //////////////////////////////////////////////////// Stack Operations.
        //
        /// <summary>
        /// Pushes an element onto the open-element stack, doubling the backing
        /// array first if every slot is already in use.
        /// </summary>
        /// <param name="node">The element that has just been opened.</param>
        private void Push(XmlNode node)
        {
            if (stackTop >= stack.Length) {
                XmlNode[] grown = new XmlNode[stack.Length * 2];
                int i = 0;
                while (i < stackTop) {
                    grown[i] = stack[i];
                    i++;
                }
                stack = grown;
            }

            stack[stackTop] = node;
            stackTop++;
        }

        /// <summary>
        /// Removes and returns the most recently opened element.
        /// </summary>
        /// <returns>The element that was on top of the stack.</returns>
        /// <exception cref="XmlException">
        /// Thrown when the stack is empty, i.e. a closing tag appeared with no
        /// element open.
        /// </exception>
        private XmlNode Pop()
        {
            if (stackTop == 0) {
                throw new XmlException(lineNumber, "Line " + lineNumber + ": Closing tag has no matching opening tag");
            }

            // Decrement first so that we clear the slot we are actually vacating.
            // (Clearing stack[stackTop] and then decrementing left the popped
            // reference alive and indexed past the end of a full array.)
            stackTop--;
            XmlNode node = stack[stackTop];
            stack[stackTop] = null; // encourage GC sooner.
            return node;
        }

        /// <summary>
        /// Returns the most recently opened element without removing it.
        /// Callers must make sure the stack is not empty (stackTop != 0).
        /// </summary>
        private XmlNode Peek()
        {
            int topIndex = stackTop - 1;
            return stack[topIndex];
        }

        //////////////////////////////////////////////////////////////////////
        //
        /// <summary>
        /// Parses the whole input and returns the synthetic "::xml" root node
        /// whose children are the top-level elements of the document.
        /// The stream is released when parsing completes, so a later call simply
        /// returns the tree that was already built.
        /// </summary>
        /// <returns>The document root node.</returns>
        /// <exception cref="XmlException">
        /// Thrown when the input does not follow the XML subset this reader
        /// understands.
        /// </exception>
        //
        // Known limitations (unchanged): skipped elements (processing
        // instructions, declarations, comments) end at the first '>' character,
        // and whitespace between words of free-standing text is not preserved.
        public XmlNode Parse()
        {
            if (stream == null) {
                return doc;
            }

            TokenType type;
            string token = null;
            XmlNode curNode = null;
            string curAttributeName = null;

            // Check for a UTF-8 preamble.
            isUtf8 = false;
            if (PeekCharacter() == 0xef) {
                if (ReadCharacter() == 0xef &&
                    ReadCharacter() == 0xbb &&
                    ReadCharacter() == 0xbf) {
                    isUtf8 = true;
                }
            }

            while (ReadToken(out token, out type)) {
                switch (state) {
                    case States.START:
                        if (type == TokenType.OPERATOR && token.Equals("<")) {
                            state = States.NEED_ELEMENT_NAME;
                        }
                        else {
                            // All other text is interpreted as freestanding text.
                            // It had better occur within a tag!
                            if (stackTop == 0) {
                                throw new XmlException(lineNumber, "Line " + lineNumber + ": Text must be inside a tag");
                            }

                            XmlNode lastNode = Peek();
                            lastNode.AddText(token);

                            // Next state depends on whether we're at the end of
                            // the text-within-a-tag already or not.
                            if (PeekCharacter() == '<') {
                                // Looks like we're already at the end
                                state = States.START;
                            }
                            else {
                                state = States.GET_STRINGS;
                            }
                        }
                        break;

                    case States.NEED_ELEMENT_NAME:
                        if (type == TokenType.NAME) {
                            state = States.NEED_ATTRIBUTE_NAME;
                            curNode = new XmlNode(token);
                        }
                        else if (type == TokenType.OPERATOR && token.Equals("/")) {
                            state = States.NEED_CLOSURE_NAME;
                        }
                        else if (type == TokenType.OPERATOR &&
                                 (token.Equals("?") ||
                                  token.Equals("!") ||
                                  token.Equals("-"))) {
                            // Note: the token must be compared with the string "-";
                            // comparing a string with the char '-' is always false.
                            // then go to the question mark state and ignore this element
                            state = States.QUESTION_MARK_ELEMENT;
                        }
                        else {
                            throw new XmlException(lineNumber, "Line " + lineNumber + ": Can only begin a tag with / or a name");
                        }
                        break;

                    case States.NEED_ATTRIBUTE_NAME:
                        if (type == TokenType.NAME) {
                            state = States.NEED_EQUALS_SIGN;
                            curAttributeName = token;
                        }
                        else if (type == TokenType.OPERATOR && token.Equals(">")) {
                            // The element stays open: attach it to its parent and
                            // keep it on the stack so the closing tag can be checked.
                            state = States.START;
                            AttachToCurrentParent(curNode);
                            Push(curNode);
                            curNode = null;
                        }
                        else if (type == TokenType.OPERATOR && token.Equals("/")) {
                            // this node is almost complete
                            state = States.NEED_CLOSURE_BRACKET;
                        }
                        else {
                            throw new XmlException(lineNumber, "Line " + lineNumber + ": Must have either attributes, '/>', or '>' after the name of an element");
                        }
                        break;

                    case States.NEED_EQUALS_SIGN:
                        if (type == TokenType.OPERATOR && token.Equals("=")) {
                            state = States.NEED_ATTRIBUTE_VALUE;
                        }
                        else {
                            throw new XmlException(lineNumber, "Line " + lineNumber + ": Need an '=' after an attribute name");
                        }
                        break;

                    case States.NEED_ATTRIBUTE_VALUE:
                        if (type == TokenType.STRING_LITERAL) {
                            state = States.NEED_ATTRIBUTE_NAME;
                            string unescaped_attribute_value = ExpandEntityReferences(token);
                            curNode.AddAttribute(curAttributeName, unescaped_attribute_value);
                            curAttributeName = null;
                        }
                        else {
                            throw new XmlException(lineNumber, "Line " + lineNumber + ": Must have an attribute value after the '=' in an XML node");
                        }
                        break;

                    case States.NEED_CLOSURE_BRACKET:
                        if (type == TokenType.OPERATOR && token.Equals(">")) {
                            // Self-closing element: attach it, but never push it,
                            // since there is no closing tag to match later.
                            state = States.START;
                            AttachToCurrentParent(curNode);
                        }
                        else {
                            throw new XmlException(lineNumber, "Line " + lineNumber + ": Must have a '>' after a closing '/' in an XML node");
                        }
                        break;

                    case States.NEED_CLOSURE_NAME:
                        if (type == TokenType.NAME) {
                            // pop the last XmlNode and make sure that this name matches.
                            // Otherwise we don't have balanced open and close tags
                            state = States.NEED_FINAL_BRACKET;
                            if (stackTop == 0) {
                                // Report a closing tag with nothing open as a parse
                                // error instead of indexing below the stack.
                                throw new XmlException(lineNumber, "Line " + lineNumber + ": Closing tag " + token + " has no matching opening tag");
                            }
                            XmlNode xmln = Pop();
                            if (!token.Equals(xmln.Name)) {
                                throw new XmlException(lineNumber, "Line " + lineNumber + ": " + token + " does not match " + xmln.Name);
                            }
                        }
                        else {
                            throw new XmlException(lineNumber, "Line " + lineNumber + ": Must have a name after an opening </");
                        }
                        break;

                    case States.NEED_FINAL_BRACKET:
                        if (type == TokenType.OPERATOR && token.Equals(">")) {
                            state = States.START;
                        }
                        else {
                            throw new XmlException(lineNumber, "Line " + lineNumber + ": Must have a > after a closure tag's name");
                        }
                        break;

                    case States.QUESTION_MARK_ELEMENT:
                        // The token just read belongs to the skipped element.  If it
                        // was the closing '>' we are already done; otherwise discard
                        // raw characters through the next '>'.
                        if (!(type == TokenType.OPERATOR && token.Equals(">"))) {
                            int skipped;
                            do {
                                skipped = ReadCharacter();
                                if (skipped == -1) {
                                    // ReadCharacter keeps returning -1 at end of
                                    // input, so without this check we would spin.
                                    throw new XmlException(lineNumber, "Line " + lineNumber + ": Reached end of input inside an element that was being skipped");
                                }
                            } while (skipped != '>');
                        }
                        state = States.START;
                        break;

                    case States.GET_STRINGS:
                        if (type == TokenType.OPERATOR && token.Equals("<")) {
                            // ReadToken skips whitespace, so text followed by
                            // whitespace and then a tag (e.g. "text </a>") lands
                            // here.  The text run is over; this '<' opens a tag.
                            state = States.NEED_ELEMENT_NAME;
                        }
                        else {
                            // Collect the rest of the text run, up to the next '<'
                            // or the end of input.
                            StringBuilder sb = new StringBuilder();
                            XmlNode prevNode = Peek();

                            sb.Append(token);
                            while (PeekCharacter() != -1 && PeekCharacter() != '<') {
                                sb.Append((char)ReadCharacter());
                            }

                            prevNode.AddText(sb.ToString());
                            state = States.START;
                        }
                        break;
                }
            }
            stream.Close();
            stream = null;
            return doc;
        }

        // Adds a completed or newly opened element to the innermost open
        // element, or to the document root when no element is open.
        private void AttachToCurrentParent(XmlNode node)
        {
            if (stackTop != 0) {
                Peek().AddChild(node);
            }
            else {
                doc.AddChild(node);
            }
        }

        /// <summary>
        /// This method expands XML entity references found in an input string.
        /// The five predefined XML entities are recognized: &amp;amp;, &amp;lt;,
        /// &amp;gt;, &amp;quot; and &amp;apos;.  Numeric character references
        /// are not supported.
        /// If an invalid entity reference is encountered, this method will throw
        /// XmlException.
        /// </summary>
        /// <param name="input">The string to search for entity references.</param>
        /// <returns>The expanded string.</returns>
        private string ExpandEntityReferences(string input)
        {
            // In most cases, there are no entity references.  Check for that case now.
            // If we do find an entity reference, then the work isn't wasted.
            int start = input.IndexOf('&');
            if (start == -1) {
                return input;
            }

            StringBuilder buffer = new StringBuilder();
            buffer.Append(input, 0, start);
            start++;

            for (; ; ) {
                // At this point, 'start' is the index just past an '&', i.e. the
                // first character of an entity name.  Locate the end of the name.
                int end = input.IndexOf(';', start);
                if (end == -1) {
                    throw new XmlException(lineNumber, "An invalid entity reference was found.  '&' is present, but there is no matching ';'.");
                }

                int name_length = end - start;
                if (name_length == 0) {
                    throw new XmlException(lineNumber, "An invalid entity reference was found.  '&;' is not a valid entity reference.");
                }

                string entity_name = input.Substring(start, name_length);
                string value;

                switch (entity_name) {
                    case "amp": value = "&"; break;
                    case "lt": value = "<"; break;
                    case "gt": value = ">"; break;
                    case "quot": value = "\""; break;
                    case "apos": value = "'"; break;
                    default:
                        throw new XmlException(lineNumber, String.Format("An invalid entity reference was found.  The entity '&{0};' is not recognized.", entity_name));
                }

                buffer.Append(value);

                // Are there any more entity references in this string?
                int next = input.IndexOf('&', end + 1);
                if (next == -1) {
                    // There are no more entity references in the string.
                    // Append the rest of the string.
                    buffer.Append(input, end + 1, input.Length - end - 1);
                    return buffer.ToString();
                }

                // Yes, there are more references.  Copy the literal text between
                // this reference and the next, then loop with 'start' again
                // pointing just past the '&'.
                buffer.Append(input, end + 1, next - (end + 1));
                start = next + 1; // skip over next '&'
            }
        }

        //////////////////////////////////////////////////////////////////////
        //
        private int undoChar;

        /// <summary>
        /// Pushes one character back so that the next PeekCharacter() or
        /// ReadCharacter() call returns it.  Only a single character of pushback
        /// is held; '\0' marks the slot as empty.
        /// </summary>
        /// <param name="c">The character to hand back.</param>
        private void UndoRead(int c)
        {
            this.undoChar = c;
        }

        /// <summary>
        /// Returns the next character without consuming it.  The character is
        /// held in the one-slot pushback buffer ('\0' meaning "empty") until
        /// ReadCharacter() takes it.
        /// </summary>
        /// <returns>The next character, or -1 at end of input.</returns>
        private int PeekCharacter()
        {
            if (undoChar != '\0') {
                // Something is already buffered; hand it out again.
                return undoChar;
            }

            undoChar = ReadCharacter();
            return undoChar;
        }

        /// <summary>
        /// Consumes and returns the next character.  A character held in the
        /// pushback slot is returned first; otherwise a byte is read from the
        /// stream and, in UTF-8 mode, combined with its continuation bytes into
        /// a single code point.
        /// </summary>
        /// <returns>The character read, or -1 at end of input.</returns>
        /// <exception cref="XmlException">
        /// Thrown in UTF-8 mode when the byte sequence is not well formed.
        /// </exception>
        private int ReadCharacter()
        {
            int val;
            if (undoChar != '\0') {
                val = undoChar;
                undoChar = '\0';
                return val;
            }

            val = stream.Read();
            if (val == '\n') {
                lineNumber++;
            }

            // Plain bytes, ASCII, and the -1 end-of-input marker need no decoding.
            if (!isUtf8 || val < 0x80) {
                return val;
            }

            //  Shared UTF-8 Decoding:
            //      bytes   bits    UTF-8 representation
            //      -----   ----    -----------------------------------
            //      1        7      0vvvvvvv
            //      2       11      110vvvvv 10vvvvvv
            //      3       16      1110vvvv 10vvvvvv 10vvvvvv
            //      4       21      11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
            //      -----   ----    -----------------------------------
            //
            // 'bytes' is the number of continuation bytes still to be read.
            int bytes;
            if (val < 0xc0) {
                // 10vvvvvv is a continuation byte and cannot start a sequence.
                throw new XmlException(lineNumber, "Buffer_InvalidEncoding");
            }
            else if (val <= 0xdf) {
                bytes = 1;
                val &= 0x1f;
            }
            else if (val <= 0xef) {
                bytes = 2;
                val &= 0x0f;
            }
            else if (val <= 0xf7) {
                bytes = 3;
                val &= 0x07;
            }
            else {
                // 0xf8..0xff never appear in UTF-8.
                throw new XmlException(lineNumber, "Buffer_InvalidEncoding");
            }

            while (bytes-- > 0) {
                int next = stream.Read();
                // Also catches end of input (-1) in the middle of a sequence.
                if (next < 0x80 || next > 0xbf) {
                    throw new XmlException(lineNumber, "Buffer_InvalidEncoding");
                }
                val = val << 6 | (next & 0x3f);
            }
            return val;
        }

        //////////////////////////////////////////////////////////////////////
        //
        /// <summary>
        /// Reads the next token.  The first character decides the token kind:
        /// an ASCII letter starts a NAME, a double quote starts a
        /// STRING_LITERAL, one of the characters &lt; &gt; / = ? ! - is an OPERATOR, and
        /// whitespace is skipped.  Any other character is returned on its own
        /// as a NONE token rather than being rejected.
        /// </summary>
        /// <param name="token">Receives the token text ("" at end of input).</param>
        /// <param name="type">Receives the token kind.</param>
        /// <returns>False when the end of input was reached; true otherwise.</returns>
        private bool ReadToken(out string token, out TokenType type)
        {
            int peeked = PeekCharacter();
            if (peeked == -1) {
                // this is the end of the file
                type = TokenType.NONE;
                token = "";
                return false;
            }

            char lead = unchecked((char)peeked);

            if (IsTokenWhitespace(lead)) {
                // Throw away the whole run of whitespace, then tokenize
                // whatever follows it.
                do {
                    ReadCharacter();
                    // Whitespace that runs up to end of input yields no token.
                    if (PeekCharacter() == -1) {
                        type = TokenType.NONE;
                        token = "";
                        return false;
                    }
                    lead = (char)PeekCharacter();
                } while (IsTokenWhitespace(lead));

                return ReadToken(out token, out type);
            }

            if (lead == '"') {
                token = ReadString();
                type = TokenType.STRING_LITERAL;
                return true;
            }

            bool isLetter = (lead >= 'a' && lead <= 'z') ||
                            (lead >= 'A' && lead <= 'Z');
            if (isLetter) {
                token = ReadName();
                type = TokenType.NAME;
                return true;
            }

            // Everything else is a single-character token.
            ReadCharacter();
            token = lead.ToString();
            switch (lead) {
                case '<':
                case '>':
                case '/':
                case '=':
                case '?':
                case '!':
                case '-':
                    type = TokenType.OPERATOR;
                    break;

                default:
                    type = TokenType.NONE;
                    break;
            }
            return true;
        }

        // True for the characters the tokenizer skips between tokens:
        // space, tab, line feed and carriage return (13).
        private static bool IsTokenWhitespace(char c)
        {
            return c == ' ' || c == '\n' || (int)c == 13 || c == '\t';
        }

        /// <summary>
        /// Consumes characters for as long as they belong to the set
        /// [a-zA-Z0-9_-.:] and returns them as one string.  The first character
        /// that does not belong is left unread.
        /// </summary>
        private string ReadName()
        {
            StringBuilder name = new StringBuilder();

            for (; ; ) {
                char c = (char)PeekCharacter();

                bool isLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
                bool isDigit = c >= '0' && c <= '9';
                bool isNamePunctuation = c == '-' || c == '_' || c == '.' || c == ':';

                if (!(isLetter || isDigit || isNamePunctuation)) {
                    break;
                }

                name.Append((char)ReadCharacter());
            }

            return name.ToString();
        }

        /// <summary>
        /// Reads a double-quoted string literal and returns its contents
        /// without the surrounding quotes.
        /// </summary>
        /// <returns>The text between the opening and closing quotes.</returns>
        /// <exception cref="XmlException">
        /// Thrown when the input is not positioned at a '"', or when the end of
        /// input is reached before the closing '"'.
        /// </exception>
        //
        // Every character up to the next '"' is copied literally.  Backslash has
        // no special meaning: the escape handling this method used to carry
        // never took effect (a backslash was always emitted as itself and a
        // following '"' always ended the literal), so it has been removed
        // without changing the result for any input.
        private string ReadString()
        {
            StringBuilder sb = new StringBuilder();

            int read_or_eof = PeekCharacter();
            if (read_or_eof == -1) {
                throw new XmlException(lineNumber, "Line " + lineNumber + ": Cannot start a string literal at EOF.");
            }
            char read = (char)read_or_eof;
            if (read != '"') {
                throw new XmlException(lineNumber, "Line " + lineNumber + ": Cannot start a string literal with " + (char)read + ".  You must use a '\"'");
            }

            // drop the '"' on the floor
            ReadCharacter();

            for (; ; ) {
                int next = ReadCharacter();
                if (next == -1) {
                    // ReadCharacter keeps returning -1 at end of input; without
                    // this check an unterminated literal would never finish.
                    throw new XmlException(lineNumber, "Line " + lineNumber + ": Reached end of input inside a string literal; the closing '\"' is missing");
                }
                if (next == '"') {
                    break;
                }
                sb.Append((char)next);
            }

            return sb.ToString();
        }
    }

    /// <summary>
    /// Minimal read-only stream abstraction consumed by XmlReader, so that the
    /// reader does not have to link against System.IO.
    /// </summary>
    abstract class KernelMemoryStream
    {
        /// <summary>
        /// Closes the stream.  This base implementation does nothing.
        /// </summary>
        public void Close()
        {
        }

        /// <summary>
        /// Reads the next value from the stream.  Both implementations in this
        /// file return -1 once their buffer is exhausted.
        /// </summary>
        public abstract int Read();
    }

    /// <summary>
    /// Presents an IoMemory region to XmlReader as a forward-only stream,
    /// handing out one element per Read() call.
    /// </summary>
    class KernelIoMemoryStream : KernelMemoryStream
    {
        private IoMemory buffer;   // region being read, indexed by 'position'
        private int      position; // index of the next element to return
        private int      size;     // buffer.Length captured at construction

        /// <summary>
        /// Creates a stream positioned at the start of the given region.
        /// </summary>
        public KernelIoMemoryStream(IoMemory buffer)
        {
            this.buffer = buffer;
            this.size = buffer.Length;
            this.position = 0;
        }

        /// <summary>
        /// Returns the next element of the region and advances, or returns -1
        /// once the whole region has been consumed.
        /// </summary>
        public override int Read()
        {
            if (position < size) {
                return buffer[position++];
            }

            return -1;
        }
    }

    /// <summary>
    /// Presents a byte array to XmlReader as a forward-only stream, handing
    /// out one byte per Read() call.
    /// </summary>
    class KernelByteMemoryStream : KernelMemoryStream
    {
        private byte[]  buffer;   // bytes being read, indexed by 'position'
        private int     position; // index of the next byte to return
        private int     size;     // buffer.Length captured at construction

        /// <summary>
        /// Creates a stream positioned at the start of the given array.
        /// </summary>
        public KernelByteMemoryStream(byte[] buffer)
        {
            this.buffer = buffer;
            this.size = buffer.Length;
            this.position = 0;
        }

        /// <summary>
        /// Returns the next byte and advances, or returns -1 once the whole
        /// array has been consumed.
        /// </summary>
        public override int Read()
        {
            if (position != size) {
                return buffer[position++];
            }

            return -1;
        }
    }
}