// Description: Html Agility Pack - HTML Parsers, selectors, traversers, manipulators.
// Website & Documentation: http://html-agility-pack.net
// Forum & Issues: https://github.com/zzzprojects/html-agility-pack
// License: https://github.com/zzzprojects/html-agility-pack/blob/master/LICENSE
// More projects: http://www.zzzprojects.com/
// Copyright © ZZZ Projects Inc. 2014 - 2017. All rights reserved.

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;

namespace HtmlAgilityPack
{

/// <summary>
/// Represents a complete HTML document.
/// </summary>
public partial class HtmlDocument
{
    #region Manager

    // Backing field for DisableBehaviorTagP; starts out as true.
    internal static bool _disableBehaviorTagP = true;

/// <summary>True to disable, false to enable the behavior tag p.</summary>
/// <remarks>
/// Setting the value to true removes the "p" entry from <c>HtmlNode.ElementsFlags</c> (when present);
/// setting it to false adds a "p" entry flagged Empty | Closed (when absent).
/// </remarks>
public static bool DisableBehaviorTagP
{
    get { return _disableBehaviorTagP; }
    set
    {
        bool paragraphIsRegistered = HtmlNode.ElementsFlags.ContainsKey("p");

        if (value && paragraphIsRegistered)
        {
            HtmlNode.ElementsFlags.Remove("p");
        }
        else if (!value && !paragraphIsRegistered)
        {
            HtmlNode.ElementsFlags.Add("p", HtmlElementFlag.Empty | HtmlElementFlag.Closed);
        }

        _disableBehaviorTagP = value;
    }
}

/// <summary>Default builder to use in the HtmlDocument constructor.</summary>
/// <remarks>
/// When non-null, the parameterless constructor invokes this delegate with the instance being created,
/// which is why the delegate takes an <see cref="HtmlDocument"/> argument.
/// </remarks>
public static Action<HtmlDocument> DefaultBuilder { get; set; }

// NOTE(review): declared as a parameterless Action here. If the invocation site passes the
// document, the generic argument was probably lost in extraction — confirm against the caller.
/// <summary>Action to execute before the Parse is executed.</summary>
public Action ParseExecuting { get; set; }

#endregion

#region Fields

/// <summary>
/// Defines the max level we would go deep into the html document
/// </summary>
private static int _maxDepthLevel = int.MaxValue;

private int _c;
#pragma warning disable CS0618 // Type or member is obsolete
private Crc32 _crc32;
#pragma warning restore CS0618 // Type or member is obsolete
private HtmlAttribute _currentattribute;
private HtmlNode _currentnode;
private Encoding _declaredencoding;
private HtmlNode _documentnode;
private bool _fullcomment;
private int _index;

// NOTE(review): the Dictionary/List declarations in this group carry no generic type arguments,
// which does not compile with System.Collections.Generic alone; they look stripped by extraction.
// GetElementbyId reads Nodesid with a string key and returns an HtmlNode, so that one is
// presumably Dictionary<string, HtmlNode>; the others must be confirmed against their usages.
internal Dictionary Lastnodes = new Dictionary();
private HtmlNode _lastparentnode;
private int _line;
private int _lineposition, _maxlineposition;
internal Dictionary Nodesid;
private ParseState _oldstate;
private bool _onlyDetectEncoding;
internal Dictionary Openednodes;
private List _parseerrors = new List();
private string _remainder;
private int _remainderOffset;
private ParseState _state;
private Encoding _streamencoding;
private bool _useHtmlEncodingForStream;

/// <summary>The HtmlDocument Text. Careful if you modify it.</summary>
public string Text;

/// <summary>True to stay backward compatible with previous version of HAP. This option does not guarantee 100% compatibility.</summary>
public bool BackwardCompatibility = true;

/// <summary>
/// Adds Debugging attributes to node. Default is false.
/// </summary>
public bool OptionAddDebuggingAttributes;

/// <summary>
/// Defines if closing for non closed nodes must be done at the end or directly in the document.
/// Setting this to true can actually change how browsers render the page. Default is false.
/// </summary>
public bool OptionAutoCloseOnEnd; // close errors at the end

/// <summary>
/// Defines if non closed nodes will be checked at the end of parsing. Default is true.
/// </summary>
public bool OptionCheckSyntax = true;

/// <summary>
/// Defines if a checksum must be computed for the document while parsing. Default is false.
/// </summary>
public bool OptionComputeChecksum;

/// <summary>
/// Defines if SelectNodes method will return null or empty collection when no node matched the XPath expression.
/// Setting this to true will return empty collection and false will return null. Default is false.
/// </summary>
public bool OptionEmptyCollection = false;

/// <summary>True to disable, false to enable the server side code.</summary>
public bool DisableServerSideCode = false;

/// <summary>
/// Defines the default stream encoding to use. Default is System.Text.Encoding.Default.
/// </summary>
/// <remarks>
/// The constructor assigns Encoding.UTF8 instead when compiled for SILVERLIGHT, METRO,
/// NETSTANDARD1_3 or NETSTANDARD1_6.
/// </remarks>
public Encoding OptionDefaultStreamEncoding;

/// <summary>
/// Force to take the original comment instead of creating it
/// </summary>
public bool OptionXmlForceOriginalComment;

/// <summary>
/// Defines if source text must be extracted while parsing errors.
/// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
/// Default is false.
/// </summary>
public bool OptionExtractErrorSourceText; // turning this on can dramatically slow performance if a lot of errors are detected

/// <summary>
/// Defines the maximum length of source text or parse errors. Default is 100.
/// </summary>
public int OptionExtractErrorSourceTextMaxLength = 100;

/// <summary>
/// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
/// </summary>
public bool OptionFixNestedTags; // fix li, tr, th, td tags

/// <summary>
/// Defines if output must conform to XML, instead of HTML. Default is false.
/// </summary>
public bool OptionOutputAsXml;

/// <summary>
/// If used together with XML output and enabled, Xml namespaces in element names are preserved. Default is false.
/// </summary>
// NOTE(review): the original cross-reference was lost; it presumably pointed at OptionOutputAsXml — confirm.
public bool OptionPreserveXmlNamespaces;

/// <summary>
/// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
/// </summary>
public bool OptionOutputOptimizeAttributeValues;

/// <summary>Defines the global attribute value quote. When specified, it will always win.</summary>
public AttributeValueQuote? GlobalAttributeValueQuote;

/// <summary>
/// Defines if name must be output with its original case. Useful for asp.net tags and attributes. Default is false.
/// </summary>
public bool OptionOutputOriginalCase;

/// <summary>
/// Defines if name must be output in uppercase. Default is false.
/// </summary>
public bool OptionOutputUpperCase;

/// <summary>
/// Defines if declared encoding must be read from the document.
/// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
/// Default is true.
/// </summary>
public bool OptionReadEncoding = true;

/// <summary>
/// Defines the name of a node that will throw the StopperNodeException when found as an end node. Default is null.
/// </summary>
public string OptionStopperNodeName;

/// <summary>
/// Defines if the 'id' attribute must be specifically used. Default is true.
/// </summary>
/// <remarks>
/// GetElementbyId throws when the id lookup table is null; DetectEncoding(TextReader) leaves
/// that table null whenever this option is false.
/// </remarks>
public bool OptionUseIdAttribute = true;

/// <summary>
/// Defines if empty nodes must be written as closed during output. Default is false.
/// </summary>
public bool OptionWriteEmptyNodes;

/// <summary>
/// The max number of nested child nodes.
/// Added to prevent stackoverflow problem when a page has tens of thousands of opening html tags with no closing tags
/// </summary>
public int OptionMaxNestedChildNodes = 0;

#endregion

#region Static Members

// Shared exception messages.
internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";

internal static readonly string HtmlExceptionUseIdAttributeFalse = "You need to set UseIdAttribute property to true to enable this feature";

internal static readonly string HtmlExceptionClassDoesNotExist = "Class name doesn't exist";

internal static readonly string HtmlExceptionClassExists = "Class name already exists";

// Maps a tag name (li, tr, th, td) to an array of container tag names.
// The generic arguments are fixed by the initializer: string keys, string[] values.
// NOTE(review): presumably consulted when OptionFixNestedTags is enabled — confirm against the parser.
internal static readonly Dictionary<string, string[]> HtmlResetters = new Dictionary<string, string[]>()
{
    {"li", new[] {"ul", "ol"}},
    {"tr", new[] {"table"}},
    {"th", new[] {"tr", "table"}},
    {"td", new[] {"tr", "table"}},
};

#endregion

#region Constructors

/// <summary>
/// Creates an instance of an HTML document.
/// </summary>
public HtmlDocument()
{
    // Give the globally configured builder (if any) a chance to customize this instance first.
    DefaultBuilder?.Invoke(this);

    _documentnode = CreateNode(HtmlNodeType.Document, 0);

    // Assigned after the builder ran, so a value the builder stored in this option is overwritten.
#if SILVERLIGHT || METRO || NETSTANDARD1_3 || NETSTANDARD1_6
    OptionDefaultStreamEncoding = Encoding.UTF8;
#else
    OptionDefaultStreamEncoding = Encoding.Default;
#endif
}

#endregion

#region Properties

/// <summary>Gets the parsed text.</summary>
/// <value>The parsed text.</value>
public string ParsedText => Text;

/// <summary>
/// Defines the max level we would go deep into the html document. If this depth level is exceeded, an exception is
/// thrown.
/// </summary>
public static int MaxDepthLevel
{
    get => _maxDepthLevel;
    set => _maxDepthLevel = value;
}

/// <summary>
/// Gets the document CRC32 checksum if OptionComputeChecksum was set to true before parsing, 0 otherwise.
/// </summary>
[Obsolete]
public int CheckSum => _crc32 == null ? 0 : (int) _crc32.CheckSum;

/// <summary>
/// Gets the document's declared encoding.
/// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node (pre-HTML5) or the meta charset="XXXXX" html node (HTML5).
/// </summary>
public Encoding DeclaredEncoding => _declaredencoding;

/// <summary>
/// Gets the root node of the document.
/// </summary>
public HtmlNode DocumentNode => _documentnode;

/// <summary>
/// Gets the document's output encoding.
/// </summary>
public Encoding Encoding => GetOutEncoding();

// NOTE(review): the IEnumerable return type carries no generic argument here; it looks stripped
// by extraction — confirm the element type against the declaration of _parseerrors.
/// <summary>
/// Gets a list of parse errors found in the document.
/// </summary>
public IEnumerable ParseErrors => _parseerrors;

/// <summary>
/// Gets the remaining text.
/// Will always be null if OptionStopperNodeName is null.
/// </summary>
public string Remainder => _remainder;

/// <summary>
/// Gets the offset of Remainder in the original Html text.
/// If OptionStopperNodeName is null, this will return the length of the original Html text.
/// </summary>
public int RemainderOffset => _remainderOffset;

/// <summary>
/// Gets the document's stream encoding.
/// </summary>
public Encoding StreamEncoding => _streamencoding;

#endregion

#region Public Methods

/// <summary>
/// Gets a valid XML name.
/// </summary>
/// <param name="name">Any text.</param>
/// <returns>A string that is a valid XML name.</returns>
public static string GetXmlName(string name)
{
    return GetXmlName(name, false, false);
}

#if !METRO
/// <summary>
/// Marks every attribute of every node selected by the XPath "//" + <paramref name="tagName"/>
/// to use its original name (sets <c>UseOriginalName</c> to true).
/// </summary>
/// <param name="tagName">The tag name appended to "//" to build the XPath query.</param>
public void UseAttributeOriginalName(string tagName)
{
    // SelectNodes returns null instead of an empty collection when nothing matches and
    // OptionEmptyCollection is false (the default), so guard before enumerating.
    var matchingNodes = this.DocumentNode.SelectNodes("//" + tagName);
    if (matchingNodes == null)
    {
        return;
    }

    foreach (var node in matchingNodes)
    {
        foreach (var attribute in node.Attributes)
        {
            attribute.UseOriginalName = true;
        }
    }
}
#endif

/// <summary>
/// Converts arbitrary text into a name restricted to ASCII letters, digits, '_', '-' and '.'
/// (plus ':' when allowed). Every other character is replaced by the lowercase hex of its
/// UTF-8 bytes followed by '_'; if any replacement happened the result is prefixed with '_'.
/// </summary>
/// <param name="name">Any text. A null value results in a NullReferenceException.</param>
/// <param name="isAttribute">True to keep ':' characters as-is.</param>
/// <param name="preserveXmlNamespaces">True to keep ':' characters as-is.</param>
/// <returns>The converted name; identical to <paramref name="name"/> when every character was allowed.</returns>
public static string GetXmlName(string name, bool isAttribute, bool preserveXmlNamespaces)
{
    // A StringBuilder avoids the quadratic cost of repeated string concatenation.
    StringBuilder xmlname = new StringBuilder(name.Length);
    bool nameisok = true;
    bool colonAllowed = isAttribute || preserveXmlNamespaces;
    char[] single = new char[1]; // reused buffer for encoding one character at a time

    for (int i = 0; i < name.Length; i++)
    {
        char c = name[i];

        // names are lcase
        // note: we are very limited here, too much?
        bool allowed =
            ((c >= 'a') && (c <= 'z')) ||
            ((c >= 'A') && (c <= 'Z')) ||
            ((c >= '0') && (c <= '9')) ||
            (colonAllowed && c == ':') ||
            (c == '_') || (c == '-') || (c == '.');

        if (allowed)
        {
            xmlname.Append(c);
            continue;
        }

        nameisok = false;

        // NOTE(review): encoding is done per UTF-16 code unit, so a surrogate pair is not
        // encoded as one code point. Kept as-is to preserve existing output.
        single[0] = c;
        byte[] bytes = Encoding.UTF8.GetBytes(single);
        for (int j = 0; j < bytes.Length; j++)
        {
            xmlname.Append(bytes[j].ToString("x2"));
        }

        xmlname.Append('_');
    }

    if (nameisok)
    {
        return xmlname.ToString();
    }

    return "_" + xmlname.ToString();
}

/// <summary>
/// Applies HTML encoding to a specified string.
/// </summary>
/// <param name="html">The input string to encode. May not be null.</param>
/// <returns>The encoded string.</returns>
public static string HtmlEncode(string html)
{
    return HtmlEncodeWithCompatibility(html, true);
}

/// <summary>
/// Encodes '&amp;', '&lt;', '&gt;' and '"' as HTML entities without double-encoding ampersands
/// that already start a recognized entity.
/// </summary>
/// <param name="html">The input string to encode. May not be null.</param>
/// <param name="backwardCompatibility">
/// True to recognize only amp, lt, gt and quot as already-encoded entities;
/// false to additionally recognize nbsp and reg.
/// </param>
/// <returns>The encoded string.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
internal static string HtmlEncodeWithCompatibility(string html, bool backwardCompatibility = true)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // replace & by &amp; but only once! The negative lookahead skips ampersands that
    // already begin one of the recognized entities.
    string bareAmpersand = backwardCompatibility
        ? "&(?!(amp;)|(lt;)|(gt;)|(quot;))"
        : "&(?!(amp;)|(lt;)|(gt;)|(quot;)|(nbsp;)|(reg;))";

    // The static Regex.Replace overload reuses the framework's pattern cache instead of
    // constructing a new Regex instance on every call.
    return Regex.Replace(html, bareAmpersand, "&amp;", RegexOptions.IgnoreCase)
        .Replace("<", "&lt;")
        .Replace(">", "&gt;")
        .Replace("\"", "&quot;");
}

/// <summary>
/// Determines if the specified character is considered as a whitespace character.
/// </summary>
/// <param name="c">The character to check.</param>
/// <returns>true if the specified character is considered as a whitespace character.</returns>
public static bool IsWhiteSpace(int c)
{
    switch (c)
    {
        case 9:  // horizontal tab
        case 10: // line feed
        case 13: // carriage return
        case 32: // space
            return true;
        default:
            return false;
    }
}

/// <summary>
/// Creates an HTML attribute with the specified name.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <returns>The new HTML attribute.</returns>
/// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
public HtmlAttribute CreateAttribute(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException(nameof(name));
    }

    HtmlAttribute attribute = CreateAttribute();
    attribute.Name = name;
    return attribute;
}

/// <summary>
/// Creates an HTML attribute with the specified name and value.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <param name="value">The value of the attribute.</param>
/// <returns>The new HTML attribute.</returns>
/// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
public HtmlAttribute CreateAttribute(string name, string value)
{
    if (name == null)
    {
        throw new ArgumentNullException(nameof(name));
    }

    HtmlAttribute attribute = CreateAttribute(name);
    attribute.Value = value;
    return attribute;
}

/// <summary>
/// Creates an HTML comment node.
/// </summary>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment()
{
    HtmlNode created = CreateNode(HtmlNodeType.Comment);
    return (HtmlCommentNode) created;
}

/// <summary>
/// Creates an HTML comment node with the specified comment text.
/// </summary>
/// <param name="comment">The comment text. May not be null.</param>
/// <returns>The new HTML comment node.</returns>
/// <exception cref="ArgumentNullException"><paramref name="comment"/> is null.</exception>
public HtmlCommentNode CreateComment(string comment)
{
    if (comment == null)
    {
        throw new ArgumentNullException(nameof(comment));
    }

    HtmlCommentNode commentNode = CreateComment();
    commentNode.Comment = comment;
    return commentNode;
}

/// <summary>
/// Creates an HTML element node with the specified name.
/// </summary>
/// <param name="name">The qualified name of the element. May not be null.</param>
/// <returns>The new HTML node.</returns>
/// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
public HtmlNode CreateElement(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException(nameof(name));
    }

    HtmlNode element = CreateNode(HtmlNodeType.Element);
    element.Name = name;
    return element;
}

/// <summary>
/// Creates an HTML text node.
/// </summary>
/// <returns>The new HTML text node.</returns>
public HtmlTextNode CreateTextNode()
{
    HtmlNode created = CreateNode(HtmlNodeType.Text);
    return (HtmlTextNode) created;
}

/// <summary>
/// Creates an HTML text node with the specified text.
/// </summary>
/// <param name="text">The text of the node. May not be null.</param>
/// <returns>The new HTML text node.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
public HtmlTextNode CreateTextNode(string text)
{
    if (text == null)
    {
        throw new ArgumentNullException(nameof(text));
    }

    HtmlTextNode textNode = CreateTextNode();
    textNode.Text = text;
    return textNode;
}

/// <summary>
/// Detects the encoding of an HTML stream without checking the HTML content
/// (forwards to the two-argument overload with checkHtml set to false).
/// </summary>
/// <param name="stream">The input stream. May not be null.</param>
/// <returns>The detected encoding.</returns>
public Encoding DetectEncoding(Stream stream) => DetectEncoding(stream, false);

/// <summary>
/// Detects the encoding of an HTML stream.
/// </summary>
/// <param name="stream">The input stream. May not be null.</param>
/// <param name="checkHtml">The html is checked.</param>
/// <returns>The detected encoding.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
public Encoding DetectEncoding(Stream stream, bool checkHtml)
{
    // Validate before touching instance state, so a rejected call does not change how a
    // later DetectEncoding(TextReader) call behaves.
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    _useHtmlEncodingForStream = checkHtml;

    // The reader is not disposed here: disposing a StreamReader also closes the underlying
    // stream, which belongs to the caller.
    return DetectEncoding(new StreamReader(stream));
}

/// <summary>
/// Detects the encoding of an HTML text provided on a TextReader.
/// </summary>
/// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
/// <returns>The detected encoding.</returns>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
public Encoding DetectEncoding(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    // Set here and not reset anywhere in this method.
    // NOTE(review): presumably read by Parse() to stop once an encoding is found — confirm.
    _onlyDetectEncoding = true;

    // NOTE(review): both Dictionary constructions below carry no generic type arguments; they
    // look stripped by extraction and must match the field declarations.
    if (OptionCheckSyntax)
    {
        Openednodes = new Dictionary();
    }
    else
    {
        Openednodes = null;
    }

    if (OptionUseIdAttribute)
    {
        // Ids are looked up case-insensitively.
        Nodesid = new Dictionary(StringComparer.OrdinalIgnoreCase);
    }
    else
    {
        Nodesid = null;
    }

    // Fast path: for a StreamReader, unless HTML checking was requested, consume the text and
    // report the encoding the reader itself ended up with.
    StreamReader sr = reader as StreamReader;
    if (sr != null && !_useHtmlEncodingForStream)
    {
        Text = sr.ReadToEnd();
        _streamencoding = sr.CurrentEncoding;
        return _streamencoding;
    }

    // Otherwise reset any previously detected encodings and parse the text.
    _streamencoding = null;
    _declaredencoding = null;

    Text = reader.ReadToEnd();
    _documentnode = CreateNode(HtmlNodeType.Document, 0);

    // this is almost a hack, but it allows us not to muck with the original parsing code
    try
    {
        Parse();
    }
    catch (EncodingFoundException ex)
    {
        // Parse() signals a found encoding by throwing; the exception carries the result.
        return ex.Encoding;
    }

    // No EncodingFoundException was thrown: return whatever _streamencoding holds after Parse().
    return _streamencoding;
}

/// <summary>
/// Detects the encoding of an HTML text.
/// </summary>
/// <param name="html">The input html text. May not be null.</param>
/// <returns>The detected encoding.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public Encoding DetectEncodingHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException(nameof(html));
    }

    // Wrap the text in a reader and reuse the TextReader-based detection.
    using (StringReader htmlReader = new StringReader(html))
    {
        return DetectEncoding(htmlReader);
    }
}

/// <summary>
/// Gets the HTML node with the specified 'id' attribute value.
/// </summary>
/// <param name="id">The attribute id to match. May not be null.</param>
/// <returns>The HTML node with the matching id or null if not found.</returns>
/// <exception cref="ArgumentNullException"><paramref name="id"/> is null.</exception>
/// <exception cref="Exception">The id lookup table is null (see HtmlExceptionUseIdAttributeFalse).</exception>
public HtmlNode GetElementbyId(string id)
{
    if (id == null)
    {
        throw new ArgumentNullException("id");
    }

    if (Nodesid == null)
    {
        // Exception type kept as-is so existing callers' catch blocks keep working.
        throw new Exception(HtmlExceptionUseIdAttributeFalse);
    }

    // Single lookup instead of ContainsKey followed by the indexer.
    HtmlNode match;
    return Nodesid.TryGetValue(id, out match) ? match : null;
}