////////////////////////////////////////////////////////////////////////////////
//
//  Microsoft Research Singularity
//
//  Copyright (c) Microsoft Corporation.  All rights reserved.
//
//  Note:   This is a very limited Xml Reader, carved out of the
//          Libraries\Xml implementation.  At the end of the file is a
//          very basic stream class, which makes it possible to avoid linking
//          against System.IO.
//
//  Note:   All kernel Xml streams must be encoded as UTF-8.
//          UTF-8 is a way of writing Unicode characters with variable numbers
//          of bytes per character, optimized for the lower 127 ASCII
//          characters.  It's an efficient way of encoding US English in an
//          internationalizable way.
//          The UTF-8 byte order mark is simply the Unicode byte order mark
//          (0xFEFF) written in UTF-8 (0xEF 0xBB 0xBF).  The byte order mark
//          is used mostly to distinguish UTF-8 text from other encodings, and
//          doesn't switch the byte orderings.
//
//          bytes   bits    UTF-8 representation
//          -----   ----    -----------------------------------
//          1        7      0vvvvvvv
//          2       11      110vvvvv 10vvvvvv
//          3       16      1110vvvv 10vvvvvv 10vvvvvv
//          4       21      11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
//          -----   ----    -----------------------------------
//
//          Surrogate:
//          Real Unicode value = (HighSurrogate - 0xD800) * 0x400 +
//                               (LowSurrogate - 0xDC00) + 0x10000
//
//  Limitation: attribute values must be delimited by double quotes;
//          single-quoted attribute values are not recognized by this reader.
//
//  Use this file for XML processing inside the Kernel
///////////////////////////////////////////////////////////////////////////////

namespace Microsoft.Singularity.Xml
{
    using System;
    using System.Collections;
    using Singularity.Io;
    using System.Text;

    /// <summary>
    /// This is a simple XML parser that does no validation.
    /// All it does is parse the syntax of XML.
    /// Note - since IoMemory is not CLSCompliant, neither is this!
    /// </summary>
    [CLSCompliant(false)]
    public class XmlReader
    {
        private States state;
        private int lineNumber;
        private KernelMemoryStream stream;
        private bool isUtf8;
        private XmlNode doc;

        // Stack of elements that have been opened but not yet closed.
        private XmlNode[] stack;
        private int stackTop;
        private const int defaultStackDepth = 16;

        private XmlReader()
        {
            state = States.START;
            lineNumber = 1;
            stack = new XmlNode[defaultStackDepth];
            stackTop = 0;
            doc = new XmlNode("::xml");
        }

        /// <summary>Creates a reader over the bytes of an IoMemory region.</summary>
        public XmlReader(IoMemory mem)
            : this()
        {
            stream = new KernelIoMemoryStream(mem);
        }

        /// <summary>Creates a reader over a byte array.</summary>
        public XmlReader(byte[] buffer)
            : this()
        {
            stream = new KernelByteMemoryStream(buffer);
        }

        private enum TokenType
        {
            NAME,
            STRING_LITERAL,
            OPERATOR,
            NONE
        }

        private enum States
        {
            START,
            NEED_ELEMENT_NAME,
            NEED_ATTRIBUTE_NAME,
            NEED_EQUALS_SIGN,
            NEED_ATTRIBUTE_VALUE,
            NEED_CLOSURE_BRACKET,
            NEED_CLOSURE_NAME,
            NEED_FINAL_BRACKET,
            QUESTION_MARK_ELEMENT,
            GET_STRINGS
        }

        //////////////////////////////////////////////////// Stack Operations.
        //

        // Pushes an open element, doubling the backing array when it is full.
        private void Push(XmlNode node)
        {
            if (stackTop >= stack.Length) {
                XmlNode[] dest = new XmlNode[stack.Length * 2];
                for (int i = 0; i < stackTop; i++) {
                    dest[i] = stack[i];
                }
                stack = dest;
            }
            stack[stackTop++] = node;
        }

        // Removes and returns the innermost open element.
        // The caller must ensure the stack is not empty.
        private XmlNode Pop()
        {
            XmlNode node = stack[stackTop - 1];
            // Pre-decrement so that the slot being vacated is the one cleared
            // (encourages GC sooner).  A post-decrement here would clear the
            // slot *above* the top and index past the array when it is full.
            stack[--stackTop] = null;
            return node;
        }

        // Returns the innermost open element without removing it.
        // The caller must ensure the stack is not empty.
        private XmlNode Peek()
        {
            return stack[stackTop - 1];
        }

        // Adds a finished or newly opened element to the innermost open
        // element, or to the document root when no element is open.
        private void AttachToParent(XmlNode node)
        {
            if (stackTop != 0) {
                Peek().AddChild(node);
            }
            else {
                doc.AddChild(node);
            }
        }

        //////////////////////////////////////////////////////////////////////
        //

        /// <summary>
        /// Parses the whole input stream and returns the document root node
        /// (named "::xml") whose children are the top-level elements.
        /// The stream is closed and released afterwards, so calling Parse
        /// again simply returns the already-built document.
        /// </summary>
        /// <returns>The root node of the parsed document.</returns>
        /// <exception cref="XmlException">
        /// Thrown when the input violates the subset of XML syntax that this
        /// reader understands.
        /// </exception>
        public XmlNode Parse()
        {
            if (stream == null) {
                return doc;
            }

            TokenType type;
            string token = null;
            XmlNode curNode = null;
            string curAttributeName = null;

            // Check for a UTF-8 preamble.  isUtf8 is still false here, so the
            // three preamble bytes are read raw rather than decoded.
            isUtf8 = false;
            if (PeekCharacter() == 0xef) {
                if (ReadCharacter() == 0xef &&
                    ReadCharacter() == 0xbb &&
                    ReadCharacter() == 0xbf) {
                    isUtf8 = true;
                }
            }

            while (ReadToken(out token, out type)) {
                switch (state) {
                    case States.START:
                        if (type == TokenType.OPERATOR && token.Equals("<")) {
                            state = States.NEED_ELEMENT_NAME;
                        }
                        else {
                            // All other text is interpreted as freestanding text.
                            // It had better occur within a tag!
                            if (stackTop == 0) {
                                throw new XmlException(lineNumber, "Line " + lineNumber + ": Text must be inside a tag");
                            }

                            XmlNode lastNode = Peek();
                            lastNode.AddText(token);

                            // Next state depends on whether we're at the end of
                            // the text-within-a-tag already or not.
                            if (PeekCharacter() == '<') {
                                // Looks like we're already at the end
                                state = States.START;
                            }
                            else {
                                state = States.GET_STRINGS;
                            }
                        }
                        break;

                    case States.NEED_ELEMENT_NAME:
                        if (type == TokenType.NAME) {
                            state = States.NEED_ATTRIBUTE_NAME;
                            curNode = new XmlNode(token);
                        }
                        else if (type == TokenType.OPERATOR && token.Equals("/")) {
                            state = States.NEED_CLOSURE_NAME;
                        }
                        else if (type == TokenType.OPERATOR &&
                                 (token.Equals("?") || token.Equals("!") || token.Equals("-"))) {
                            // then go to the question mark state and ignore this element
                            state = States.QUESTION_MARK_ELEMENT;
                        }
                        else {
                            throw new XmlException(lineNumber, "Line " + lineNumber + ": Can only begin a tag with / or a name");
                        }
                        break;

                    case States.NEED_ATTRIBUTE_NAME:
                        if (type == TokenType.NAME) {
                            state = States.NEED_EQUALS_SIGN;
                            curAttributeName = token;
                        }
                        else if (type == TokenType.OPERATOR && token.Equals(">")) {
                            // hold onto this node so we can check it later
                            state = States.START;
                            AttachToParent(curNode);
                            Push(curNode);
                            curNode = null;
                        }
                        else if (type == TokenType.OPERATOR && token.Equals("/")) {
                            // this node is almost complete
                            state = States.NEED_CLOSURE_BRACKET;
                        }
                        else {
                            throw new XmlException(lineNumber, "Line " + lineNumber + ": Must have either attributes, '/>', or '>' after the name of an element");
                        }
                        break;

                    case States.NEED_EQUALS_SIGN:
                        if (type == TokenType.OPERATOR && token.Equals("=")) {
                            state = States.NEED_ATTRIBUTE_VALUE;
                        }
                        else {
                            throw new XmlException(lineNumber, "Line " + lineNumber + ": Need an '=' after an attribute name");
                        }
                        break;

                    case States.NEED_ATTRIBUTE_VALUE:
                        if (type == TokenType.STRING_LITERAL) {
                            state = States.NEED_ATTRIBUTE_NAME;
                            string unescaped_attribute_value = ExpandEntityReferences(token);
                            curNode.AddAttribute(curAttributeName, unescaped_attribute_value);
                            curAttributeName = null;
                        }
                        else {
                            throw new XmlException(lineNumber, "Line " + lineNumber + ": Must have an attribute value after the '=' in an XML node");
                        }
                        break;

                    case States.NEED_CLOSURE_BRACKET:
                        if (type == TokenType.OPERATOR && token.Equals(">")) {
                            // this node is done, and we don't have to check it
                            state = States.START;
                            AttachToParent(curNode);
                        }
                        else {
                            throw new XmlException(lineNumber, "Line " + lineNumber + ": Must have a '>' after a closing '/' in an XML node");
                        }
                        break;

                    case States.NEED_CLOSURE_NAME:
                        if (type == TokenType.NAME) {
                            // pop the last XmlNode and make sure that this name matches.
                            // Otherwise we don't have balanced open and close tags
                            state = States.NEED_FINAL_BRACKET;
                            if (stackTop == 0) {
                                // Report a syntax error instead of indexing
                                // below the bottom of the element stack.
                                throw new XmlException(lineNumber, "Line " + lineNumber + ": Closing tag " + token + " has no matching opening tag");
                            }
                            XmlNode xmln = Pop();
                            if (!token.Equals(xmln.Name)) {
                                throw new XmlException(lineNumber, "Line " + lineNumber + ": " + token + " does not match " + xmln.Name);
                            }
                        }
                        else {
                            throw new XmlException(lineNumber, "Line " + lineNumber + ": Must have a name after an opening />");
                        }
                        break;

                    case States.NEED_FINAL_BRACKET:
                        if (type == TokenType.OPERATOR && token.Equals(">")) {
                            state = States.START;
                        }
                        else {
                            throw new XmlException(lineNumber, "Line " + lineNumber + ": Must have a > after a closure tag's name");
                        }
                        break;

                    case States.QUESTION_MARK_ELEMENT:
                        // Ignore everything up to and including the next '>'.
                        // ReadToken has already consumed one token; if that
                        // token was the '>' itself (e.g. "<?>") we are done and
                        // must not skip ahead to some later '>'.
                        if (!(type == TokenType.OPERATOR && token.Equals(">"))) {
                            SkipPastClosingBracket();
                        }
                        state = States.START;
                        break;

                    case States.GET_STRINGS:
                        {
                            // stay in this state until you see a '<'
                            StringBuilder sb = new StringBuilder();
                            XmlNode prevNode = Peek();
                            if (type == TokenType.OPERATOR && token.Equals("<")) {
                                throw new XmlException(lineNumber, "Unexpected tag beginning while in text state");
                            }
                            sb.Append(token);
                            while (PeekCharacter() != -1 && PeekCharacter() != '<') {
                                sb.Append((char)ReadCharacter());
                            }
                            prevNode.AddText(sb.ToString());
                            state = States.START;
                        }
                        break;
                }
            }

            stream.Close();
            stream = null;
            return doc;
        }

        // Consumes characters up to and including the next '>'.  Throws
        // instead of spinning forever when the input ends first
        // (ReadCharacter keeps returning -1 at end of input).
        private void SkipPastClosingBracket()
        {
            for (;;) {
                int c = ReadCharacter();
                if (c == '>') {
                    return;
                }
                if (c == -1) {
                    throw new XmlException(lineNumber, "Line " + lineNumber + ": Unexpected end of input while looking for '>'");
                }
            }
        }

        /// <summary>
        /// This method expands XML entity references found in an input string.
        /// The predefined entities amp, lt, gt, quot and apos are recognized.
        /// If an invalid entity reference is encountered, this method will throw
        /// XmlException.
        /// </summary>
        /// <param name="input">The string to search for entity references.</param>
        /// <returns>The expanded string.</returns>
        private string ExpandEntityReferences(string input)
        {
            // In most cases, there are no entity references.  Check for that case now.
            // If we do find an entity reference, then the work isn't wasted.
            int start = input.IndexOf('&');
            if (start == -1) {
                return input;
            }

            StringBuilder buffer = new StringBuilder();
            buffer.Append(input, 0, start);
            start++;

            for (;;) {
                // At this point, 'start' points to a named XML entity.
                // locate the entity name.
                int end = input.IndexOf(';', start);
                if (end == -1) {
                    throw new XmlException(lineNumber, "An invalid entity reference was found.  '&' is present, but there is no matching ';'.");
                }

                int name_length = end - start;
                if (name_length == 0) {
                    throw new XmlException(lineNumber, "An invalid entity reference was found.  '&;' is not a valid entity reference.");
                }

                string entity_name = input.Substring(start, name_length);
                string value;
                switch (entity_name) {
                    case "amp":     value = "&";    break;
                    case "lt":      value = "<";    break;
                    case "gt":      value = ">";    break;
                    case "quot":    value = "\"";   break;
                    case "apos":    value = "'";    break;
                    default:
                        throw new XmlException(lineNumber, String.Format("An invalid entity reference was found.  The entity '&{0};' is not recognized.", entity_name));
                }

                buffer.Append(value);

                // Are there any more entity references in this string?
                int next = input.IndexOf('&', end + 1);
                if (next == -1) {
                    // There are no more entity references in the string.
                    // Append the rest of the string.
                    buffer.Append(input, end + 1, input.Length - end - 1);
                    return buffer.ToString();
                }

                // Yes, there are more references.  Keep looping, preserving
                // the same meaning of 'start' as when we entered this loop.
                buffer.Append(input, end + 1, next - (end + 1));
                start = next + 1; // skip over next '&'
            }
        }

        //////////////////////////////////////////////////////////////////////
        //

        // One character of look-ahead.  The value '\0' means "nothing
        // buffered", so a NUL character cannot itself be held here.
        private int undoChar;

        private void UndoRead(int c)
        {
            undoChar = c;
        }

        // Returns the next character (or -1 at end of input) without
        // consuming it.
        private int PeekCharacter()
        {
            if (undoChar == '\0') {
                undoChar = ReadCharacter();
            }
            return undoChar;
        }

        // Returns the next character, or -1 at end of input.  Once a UTF-8
        // preamble has been seen (isUtf8), multi-byte sequences are decoded
        // into a single code point.
        private int ReadCharacter()
        {
            int val;

            if (undoChar != '\0') {
                val = undoChar;
                undoChar = '\0';
                return val;
            }

            val = stream.Read();
            if (val == '\n') {
                lineNumber++;
            }
            if (!isUtf8 || val < 0x80) {
                // Raw byte, plain ASCII, or end of input (-1).
                return val;
            }

            // Shared UTF-8 Decoding:
            //  bytes   bits    UTF-8 representation
            //  -----   ----    -----------------------------------
            //  1        7      0vvvvvvv
            //  2       11      110vvvvv 10vvvvvv
            //  3       16      1110vvvv 10vvvvvv 10vvvvvv
            //  4       21      11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
            //  -----   ----    -----------------------------------
            //
            // 'bytes' is the number of continuation bytes still to read.
            int bytes = 0;
            if (val <= 0xdf) {
                bytes = 1;
                val &= 0x1f;
            }
            else if (val <= 0xef) {
                bytes = 2;
                val &= 0x0f;
            }
            else if (val <= 0xf7) {
                bytes = 3;
                val &= 0x07;
            }

            while (bytes-- > 0) {
                int next = stream.Read();
                // Also rejects a truncated sequence: Read() yields -1 at the end.
                if (next < 0x80 || next > 0xbf) {
                    throw new XmlException(lineNumber, "Buffer_InvalidEncoding");
                }
                val = val << 6 | (next & 0x3f);
            }
            return val;
        }

        //////////////////////////////////////////////////////////////////////
        //

        // True for the characters that ReadToken silently discards between
        // tokens: space, line feed, carriage return and tab.
        private static bool IsWhitespace(char c)
        {
            return c == ' ' || c == '\n' || (int)c == 13 || c == '\t';
        }

        // Reads the next token.  Returns false (with an empty token of type
        // NONE) when the end of the input has been reached.
        private bool ReadToken(out string token, out TokenType type)
        {
            // fortunately for us, the first character tells us exactly which
            // kind of token this is.  [a-zA-Z] means it's a NAME, ["] means
            // it's a STRING_LITERAL and one of [<>/=?!-] means it's an operator
            int nextVal = PeekCharacter();
            if (nextVal == -1) {
                // this is the end of the file
                token = "";
                type = TokenType.NONE;
                return false;
            }

            char firstChar = unchecked((char)nextVal);

            // if it's in [a-zA-Z]
            if (('a' <= firstChar && firstChar <= 'z') ||
                ('A' <= firstChar && firstChar <= 'Z')) {
                token = ReadName();
                type = TokenType.NAME;
                return true;
            }
            else if (firstChar == '"') {
                token = ReadString();
                type = TokenType.STRING_LITERAL;
                return true;
            }
            else if (firstChar == '<' || firstChar == '>' || firstChar == '/' ||
                     firstChar == '=' || firstChar == '?' || firstChar == '!' ||
                     firstChar == '-') {
                ReadCharacter();
                token = firstChar.ToString();
                type = TokenType.OPERATOR;
                return true;
            }
            else if (IsWhitespace(firstChar)) {
                // throw away all whitespace
                while (IsWhitespace(firstChar)) {
                    ReadCharacter();
                    // patch to avoid problems with \n before EOF
                    if (PeekCharacter() == -1) {
                        type = TokenType.NONE;
                        token = "";
                        return false;
                    }
                    firstChar = (char)PeekCharacter();
                }
                return ReadToken(out token, out type);
            }
            else {
                // Anything else is handed back as a one-character token of
                // type NONE rather than being rejected.
                ReadCharacter();
                type = TokenType.NONE;
                token = firstChar.ToString();
                return true;
            }
        }

        // Reads a run of name characters.
        private string ReadName()
        {
            StringBuilder sb = new StringBuilder();
            // add characters to our string until we see something that's not in [a-zA-Z0-9_-.:]
            char read = (char)PeekCharacter();
            while (('a' <= read && read <= 'z') ||
                   ('A' <= read && read <= 'Z') ||
                   ('0' <= read && read <= '9') ||
                   read == '-' || read == '_' || read == '.' || read == ':') {
                sb.Append((char)ReadCharacter());
                read = (char)PeekCharacter();
            }
            return sb.ToString();
        }

        // Reads a double-quoted string literal and returns its contents
        // without the surrounding quotes.  Every character up to the closing
        // '"' is taken literally; in particular a backslash is an ordinary
        // character, not an escape.  Throws XmlException if the input ends
        // before the closing quote instead of looping forever.
        private string ReadString()
        {
            int read_or_eof = PeekCharacter();
            if (read_or_eof == -1) {
                throw new XmlException(lineNumber, "Line " + lineNumber + ": Cannot start a string literal at EOF.");
            }

            char read = (char)read_or_eof;
            if (read != '"') {
                throw new XmlException(lineNumber, "Line " + lineNumber + ": Cannot start a string literal with " + (char)read + ".  You must use a '\"'");
            }

            // drop the '"' on the floor
            ReadCharacter();

            StringBuilder sb = new StringBuilder();
            for (;;) {
                int c = ReadCharacter();
                if (c == -1) {
                    throw new XmlException(lineNumber, "Line " + lineNumber + ": Unterminated string literal.");
                }
                if (c == '"') {
                    break;
                }
                sb.Append((char)c);
            }
            return sb.ToString();
        }
    }

    /// <summary>
    /// Minimal forward-only byte stream used by the XmlReader so that it
    /// does not need System.IO.
    /// </summary>
    abstract class KernelMemoryStream
    {
        // Nothing to release; present so callers can treat this like a stream.
        public void Close()
        {
        }

        // Returns the next byte, or -1 once the end has been reached.
        public abstract int Read();
    }

    /// <summary>
    /// Makes an IoMemory region look like a stream to the XmlReader by
    /// giving it Read() and Close() methods.
    /// </summary>
    class KernelIoMemoryStream : KernelMemoryStream
    {
        private IoMemory buffer;    // can access buffer[???] as needed
        private int position;
        private int size;

        public KernelIoMemoryStream(IoMemory buffer)
        {
            this.buffer = buffer;
            position = 0;
            size = buffer.Length;
        }

        public override int Read()
        {
            if (position >= size) {
                return -1;
            }
            return buffer[position++];
        }
    }

    /// <summary>
    /// Makes a byte array look like a stream to the XmlReader by giving it
    /// Read() and Close() methods.
    /// </summary>
    class KernelByteMemoryStream : KernelMemoryStream
    {
        private byte[] buffer;      // can access buffer[???] as needed
        private int position;
        private int size;

        public KernelByteMemoryStream(byte[] buffer)
        {
            this.buffer = buffer;
            position = 0;
            size = buffer.Length;
        }

        public override int Read()
        {
            // Same end-of-data test as KernelIoMemoryStream.
            if (position >= size) {
                return -1;
            }
            return buffer[position++];
        }
    }
}