////////////////////////////////////////////////////////////////////////////////
//
// Microsoft Research Singularity
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// Note: This is a very limited Xml Reader.
//
// BUG: The parser cannot handle attributes in single quotes
// Actually, that's a feature -- XML requires the quotes.
//
namespace Microsoft.Singularity.Xml
|
|
{
|
|
using System;
|
|
using System.IO;
|
|
using System.Collections;
|
|
using System.Text;
|
|
|
|
/// <summary>
/// This is a simple XML parser that does no validation. All it does is parse the syntax of XML.
/// </summary>
|
|
public class XmlReader
|
|
{
|
|
// Current position of the tag-parsing state machine driven by ParseHelper.
private States state;

// Line of the input currently being read; starts at 1 and is bumped on every
// '\n' the reader consumes. Used as the "Line N:" prefix of error messages.
private int lineNumber;

/// <summary>
/// Creates a reader that is ready to parse: the state machine is in its
/// START state and the line counter is at line 1.
/// </summary>
public XmlReader()
{
    this.lineNumber = 1;
    this.state = States.START;
}
|
|
|
|
/// <summary>
/// Lexical category that ReadToken assigns to each token it produces.
/// </summary>
private enum TokenType
{
    // A run of name characters that starts with an ASCII letter.
    NAME = 0,

    // The contents of a double-quoted literal (quotes removed).
    STRING_LITERAL = 1,

    // A single punctuation character: one of  < > / = ? ! -
    OPERATOR = 2,

    // Any other single character; also reported at end of input.
    NONE = 3
}
|
|
|
|
/// <summary>
/// States of the parser's state machine. Each value names what the parser
/// expects to see next.
/// </summary>
private enum States
{
    // Outside of any tag: expecting '<' or free-standing text.
    START = 0,

    // Just consumed '<': expecting an element name, '/', '?' or '!'.
    NEED_ELEMENT_NAME = 1,

    // Inside an opening tag: expecting an attribute name, '>' or '/'.
    NEED_ATTRIBUTE_NAME = 2,

    // Just consumed an attribute name: expecting '='.
    NEED_EQUALS_SIGN = 3,

    // Just consumed '=': expecting a quoted attribute value.
    NEED_ATTRIBUTE_VALUE = 4,

    // Just consumed the '/' of a self-closing tag: expecting '>'.
    NEED_CLOSURE_BRACKET = 5,

    // Just consumed the '/' that follows '<': expecting the closing tag's name.
    NEED_CLOSURE_NAME = 6,

    // Just consumed a closing tag's name: expecting '>'.
    NEED_FINAL_BRACKET = 7,

    // Inside an ignored element (started with '?', '!'): skipping to '>'.
    QUESTION_MARK_ELEMENT = 8,

    // In the middle of a run of text: collecting characters up to the next '<'.
    GET_STRINGS = 9
}
|
|
|
|
/// <summary>
/// Parses an XML document that is already held in memory.
/// </summary>
/// <param name="xmlBytes">The encoded text of the document. It is decoded
/// by a StreamReader created with its default settings.</param>
/// <returns>The list of nodes produced by ParseHelper.</returns>
public ArrayList Parse(byte[] xmlBytes)
{
    StreamReader sr = new StreamReader(new MemoryStream(xmlBytes));
    try {
        return ParseHelper(sr);
    }
    finally {
        // Release the reader (and the stream it wraps) even when parsing
        // throws. Closing a reader that is already closed is harmless.
        sr.Close();
    }
}
|
|
|
|
/// <summary>
/// Parses the XML document stored in a file.
/// </summary>
/// <param name="filePath">Path of the file to read.</param>
/// <returns>The list of nodes produced by ParseHelper.</returns>
public ArrayList Parse(string filePath)
{
    StreamReader sr = new StreamReader(filePath);
    try {
        return ParseHelper(sr);
    }
    finally {
        // Without this the file handle stays open whenever the document is
        // malformed and parsing throws. Closing twice is harmless.
        sr.Close();
    }
}
|
|
|
|
|
|
/// <summary>
/// Drives the tokenizer and the parser state machine over the whole input
/// and collects the elements that were found.
/// </summary>
/// <param name="sr">Source of the XML text. It is closed before this method
/// returns, whether parsing succeeds or throws.</param>
/// <returns>
/// Every element whose opening tag was read, in the order the tags were
/// completed. Nested elements appear in this list as well as being attached
/// to their parent with AddChild.
/// </returns>
/// <exception cref="XmlException">The input does not follow the subset of
/// XML syntax this reader understands.</exception>
private ArrayList ParseHelper(TextReader sr)
{
    // Start every parse from a clean slate so that one reader instance can be
    // reused, including after an earlier parse failed part-way through.
    state = States.START;
    lineNumber = 1;

    TokenType type;
    string token = null;
    ArrayList xmlNodes = new ArrayList();
    XmlNode curNode = null;             // element whose opening tag is being read
    Stack st = new Stack();             // elements that are open but not yet closed
    string curAttributeName = null;

    try {
        while (ReadToken(sr, out token, out type)) {
            bool isOperator = (type == TokenType.OPERATOR);

            switch (state) {
                case States.START:
                    if (isOperator && token.Equals("<")) {
                        state = States.NEED_ELEMENT_NAME;
                    }
                    else {
                        // All other text is interpreted as freestanding text.
                        // It had better occur within a tag!
                        if (st.Count == 0) {
                            throw new XmlException("Line " + lineNumber + ": Text must be inside a tag");
                        }

                        XmlNode lastNode = (XmlNode)st.Peek();
                        lastNode.AddText(token);

                        // If a tag starts right away the text run is over;
                        // otherwise keep collecting text.
                        state = (sr.Peek() == '<') ? States.START : States.GET_STRINGS;
                    }
                    break;

                case States.NEED_ELEMENT_NAME:
                    if (type == TokenType.NAME) {
                        state = States.NEED_ATTRIBUTE_NAME;
                        // Second argument: number of elements currently open.
                        curNode = new XmlNode(token, st.Count);
                    }
                    else if (isOperator && token.Equals("/")) {
                        state = States.NEED_CLOSURE_NAME;
                    }
                    else if (isOperator &&
                             (token.Equals("?") || token.Equals("!") || token.Equals("-"))) {
                        // Processing instruction, declaration or comment:
                        // ignore the whole element. (The '-' test used to
                        // compare against a char and could never be true.)
                        state = States.QUESTION_MARK_ELEMENT;
                    }
                    else {
                        throw new XmlException("Line " + lineNumber + ": Can only begin a tag with / or a name");
                    }
                    break;

                case States.NEED_ATTRIBUTE_NAME:
                    if (type == TokenType.NAME) {
                        state = States.NEED_EQUALS_SIGN;
                        curAttributeName = token;
                    }
                    else if (isOperator && token.Equals(">")) {
                        // Opening tag complete: the element stays open until
                        // its closing tag, so remember it on the stack.
                        state = States.START;
                        RecordNode(st, xmlNodes, curNode);
                        st.Push(curNode);
                        curNode = null;
                    }
                    else if (isOperator && token.Equals("/")) {
                        // this node is almost complete
                        state = States.NEED_CLOSURE_BRACKET;
                    }
                    else {
                        throw new XmlException("Line " + lineNumber + ": Must have either attributes, '/>', or '>' after the name of an element");
                    }
                    break;

                case States.NEED_EQUALS_SIGN:
                    if (isOperator && token.Equals("=")) {
                        state = States.NEED_ATTRIBUTE_VALUE;
                    }
                    else {
                        throw new XmlException("Line " + lineNumber + ": Need an '=' after an attribute name");
                    }
                    break;

                case States.NEED_ATTRIBUTE_VALUE:
                    if (type == TokenType.STRING_LITERAL) {
                        state = States.NEED_ATTRIBUTE_NAME;
                        curNode[curAttributeName] = ExpandEntityReferences(token);
                        curAttributeName = null;
                    }
                    else {
                        throw new XmlException("Line " + lineNumber + ": Must have an attribute value after the '=' in an XML node");
                    }
                    break;

                case States.NEED_CLOSURE_BRACKET:
                    if (isOperator && token.Equals(">")) {
                        // Self-closing element: it is complete, so it is
                        // recorded but never pushed on the stack.
                        state = States.START;
                        RecordNode(st, xmlNodes, curNode);
                        curNode = null;
                    }
                    else {
                        throw new XmlException("Line " + lineNumber + ": Must have a '>' after a closing '/' in an XML node");
                    }
                    break;

                case States.NEED_CLOSURE_NAME:
                    if (type == TokenType.NAME) {
                        // A closing tag with nothing open used to surface as
                        // an InvalidOperationException from Stack.Pop.
                        if (st.Count == 0) {
                            throw new XmlException("Line " + lineNumber + ": Closing tag " + token + " has no matching opening tag");
                        }

                        // The innermost open element must carry the same
                        // name, otherwise the tags are not balanced.
                        state = States.NEED_FINAL_BRACKET;
                        XmlNode xmln = (XmlNode)st.Pop();
                        if (!token.Equals(xmln.Name)) {
                            throw new XmlException("Line " + lineNumber + ": " + token + " does not match " + xmln.Name);
                        }
                    }
                    else {
                        throw new XmlException("Line " + lineNumber + ": Must have a name after an opening </");
                    }
                    break;

                case States.NEED_FINAL_BRACKET:
                    if (isOperator && token.Equals(">")) {
                        state = States.START;
                    }
                    else {
                        throw new XmlException("Line " + lineNumber + ": Must have a > after a closure tag's name");
                    }
                    break;

                case States.QUESTION_MARK_ELEMENT:
                    // The token just read belongs to the ignored element. If
                    // it is the terminating '>' we are done; otherwise throw
                    // away everything up to and including the next '>'.
                    if (!(isOperator && token.Equals(">"))) {
                        int skipped;
                        do {
                            skipped = sr.Read();
                            if (skipped == -1) {
                                // Previously this loop never terminated at
                                // end of input.
                                throw new XmlException("Line " + lineNumber + ": Unexpected end of input inside an ignored element");
                            }
                            if (skipped == '\n') {
                                lineNumber++;
                            }
                        } while (skipped != '>');
                    }
                    state = States.START;
                    break;

                case States.GET_STRINGS:
                    if (isOperator && token.Equals("<")) {
                        // Only whitespace separated the text from the next
                        // tag; ReadToken skipped it and handed us the '<'.
                        // Treat it as the start of that tag.
                        state = States.NEED_ELEMENT_NAME;
                    }
                    else {
                        // Collect raw characters until the next '<' or the
                        // end of the input. GET_STRINGS is only entered from
                        // START after the stack was checked to be non-empty.
                        XmlNode prevNode = (XmlNode)st.Peek();
                        StringBuilder sb = new StringBuilder();
                        sb.Append(token);

                        for (int next = sr.Peek(); next != -1 && next != '<'; next = sr.Peek()) {
                            sr.Read();
                            if (next == '\n') {
                                lineNumber++;
                            }
                            sb.Append((char)next);
                        }

                        prevNode.AddText(sb.ToString());
                        state = States.START;
                    }
                    break;
            }
        }
    }
    finally {
        sr.Close();
    }

    // NOTE: elements still open at the end of the input are not reported.
    return xmlNodes;
}

// Attaches a finished opening tag's node to the innermost open element (if
// there is one) and appends it to the flat list of all nodes.
private static void RecordNode(Stack openElements, ArrayList allNodes, XmlNode node)
{
    if (openElements.Count != 0) {
        XmlNode parent = (XmlNode)openElements.Peek();
        parent.AddChild(node);
    }
    allNodes.Add(node);
}
|
|
|
|
/// <summary>
/// Expands the predefined XML entity references (amp, lt, gt, quot and apos)
/// found in an input string. If an invalid or unknown entity reference is
/// encountered, this method will throw XmlException.
/// </summary>
/// <param name="input">The string to search for entity references.</param>
/// <returns>The expanded string; the input itself when it contains no
/// ampersand.</returns>
private static string ExpandEntityReferences(string input)
{
    // In most cases, there are no entity references. Check for that case now.
    int ampersand = input.IndexOf('&');
    if (ampersand == -1) {
        return input;
    }

    StringBuilder buffer = new StringBuilder(input.Length);
    int copiedUpTo = 0;     // index of the first character not yet emitted

    while (ampersand != -1) {
        // Emit the literal text that precedes this reference.
        buffer.Append(input, copiedUpTo, ampersand - copiedUpTo);

        int nameStart = ampersand + 1;
        int end = input.IndexOf(';', nameStart);
        if (end == -1) {
            throw new XmlException("An invalid entity reference was found.  '&' is present, but there is no matching ';'.");
        }
        if (end == nameStart) {
            throw new XmlException("An invalid entity reference was found.  '&;' is not a valid entity reference.");
        }

        string entity_name = input.Substring(nameStart, end - nameStart);
        switch (entity_name) {
            case "amp":  buffer.Append('&');  break;
            case "lt":   buffer.Append('<');  break;
            case "gt":   buffer.Append('>');  break;
            case "quot": buffer.Append('"');  break;
            // 'apos' is one of the five entities predefined by XML; it used
            // to be rejected as unrecognized.
            case "apos": buffer.Append('\''); break;
            default:
                throw new XmlException(String.Format("An invalid entity reference was found.  The entity '&{0};' is not recognized.", entity_name));
        }

        // Continue after the ';' that closed this reference.
        copiedUpTo = end + 1;
        ampersand = input.IndexOf('&', copiedUpTo);
    }

    // Whatever follows the last reference is plain text.
    buffer.Append(input, copiedUpTo, input.Length - copiedUpTo);
    return buffer.ToString();
}
|
|
|
|
/// <summary>
/// Consumes one character from the reader, keeping the line counter up to
/// date.
/// </summary>
/// <param name="sr">The reader to consume from.</param>
/// <returns>The character that was read.</returns>
/// <exception cref="XmlException">The reader is at the end of its input.</exception>
private char ReadCharacter(TextReader sr)
{
    int value = sr.Read();
    if (value == -1) {
        // Read() signals end of input with -1. Casting that to char used to
        // yield '\uffff', which a caller scanning for a delimiter would
        // never match, so it looped forever.
        throw new XmlException("Line " + lineNumber + ": Unexpected end of input");
    }

    if (value == '\n') {
        lineNumber++;
    }
    return (char)value;
}
|
|
|
|
/// <summary>
/// Reads the next token from the input, skipping any leading whitespace.
/// The first character alone decides the kind of token: an ASCII letter
/// starts a NAME, a double quote starts a STRING_LITERAL, one of
/// the characters  &lt; &gt; / = ? ! -  is an OPERATOR, and any other
/// non-whitespace character is returned on its own with type NONE.
/// </summary>
/// <param name="sr">The reader to take characters from.</param>
/// <param name="token">Receives the token text; the empty string at end of input.</param>
/// <param name="type">Receives the token's category; NONE at end of input.</param>
/// <returns>false when the input is exhausted, true otherwise.</returns>
private bool ReadToken(TextReader sr, out string token, out TokenType type)
{
    for (;;)
    {
        int peeked = sr.Peek();
        if (peeked == -1)
        {
            // this is the end of the file
            type = TokenType.NONE;
            token = "";
            return false;
        }

        char head = unchecked((char)peeked);

        bool isLower = head >= 'a' && head <= 'z';
        bool isUpper = head >= 'A' && head <= 'Z';
        if (isLower || isUpper)
        {
            token = ReadName(sr);
            type = TokenType.NAME;
            return true;
        }

        switch (head)
        {
            case '"':
                token = ReadString(sr);
                type = TokenType.STRING_LITERAL;
                return true;

            case '<':
            case '>':
            case '/':
            case '=':
            case '?':
            case '!':
            case '-':
                sr.Read();
                token = head.ToString();
                type = TokenType.OPERATOR;
                return true;

            case ' ':
            case '\n':
            case '\r':
            case '\t':
                // Swallow the whole run of whitespace, counting the lines
                // it spans, then go around again for the real token.
                do
                {
                    if ((char)sr.Read() == '\n')
                    {
                        lineNumber++;
                    }
                    head = (char)sr.Peek();
                }
                while (head == ' ' || head == '\n' || head == '\r' || head == '\t');
                continue;

            default:
                // Not a character that can start a token: hand it back as a
                // one-character token of type NONE rather than failing.
                sr.Read();
                type = TokenType.NONE;
                token = head.ToString();
                return true;
        }
    }
}
|
|
|
|
/// <summary>
/// Consumes the longest run of name characters at the current position.
/// Name characters are ASCII letters, ASCII digits and the four
/// punctuation characters '-', '_', '.' and ':'.
/// </summary>
/// <param name="sr">The reader to consume from.</param>
/// <returns>The characters consumed; empty if the next character is not a
/// name character.</returns>
private string ReadName(TextReader sr)
{
    StringBuilder name = new StringBuilder();

    for (;;)
    {
        char candidate = (char)sr.Peek();

        bool isLetter = (candidate >= 'a' && candidate <= 'z') ||
                        (candidate >= 'A' && candidate <= 'Z');
        bool isDigit = candidate >= '0' && candidate <= '9';
        bool isNamePunctuation = candidate == '-' || candidate == '_' ||
                                 candidate == '.' || candidate == ':';

        if (!isLetter && !isDigit && !isNamePunctuation)
        {
            // The first character that cannot be part of a name is left
            // unread for the next token.
            break;
        }

        name.Append((char)sr.Read());
    }

    return name.ToString();
}
|
|
|
|
/// <summary>
/// Reads a double-quoted literal and returns its contents without the
/// surrounding quotes. Every character up to the next double quote is taken
/// literally; a backslash has no special meaning.
/// </summary>
/// <param name="sr">The reader, positioned on the opening double quote.</param>
/// <returns>The text between the quotes.</returns>
/// <exception cref="XmlException">The next character is not a double quote,
/// or the input ends before the closing quote.</exception>
private string ReadString(TextReader sr)
{
    int current = sr.Peek();
    if (current != '"') {
        throw new XmlException("Line " + lineNumber + ": Cannot start a string literal with " + unchecked((char)current) + ". You must use a '\"'");
    }

    // drop the '"' on the floor
    sr.Read();

    // The earlier implementation carried backslash-escape logic, but it
    // re-examined the backslash itself instead of the character after it,
    // so the net effect was always a literal copy up to the first '"'.
    // That effective behavior is kept here.
    StringBuilder sb = new StringBuilder();
    for (;;) {
        current = sr.Read();
        if (current == -1) {
            // Previously an unterminated literal looped forever, growing
            // the buffer without bound.
            throw new XmlException("Line " + lineNumber + ": Unterminated string literal");
        }
        if (current == '"') {
            break;
        }
        if (current == '\n') {
            // Keep error messages accurate for multi-line literals.
            lineNumber++;
        }
        sb.Append((char)current);
    }

    return sb.ToString();
}
|
|
}
|
|
}
|