// "Therefore those skilled at the unorthodox
// are infinite as heaven and earth,
// inexhaustible as the great rivers.
// When they come to an end,
// they begin again,
// like the days and months;
// they die and are reborn,
// like the four seasons."
//
// - Sun Tzu,
// "The Art of War"

using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace HtmlRenderer.Parse
{
    /// <summary>
    /// Collection of regular expression patterns used when parsing CSS and HTML,
    /// plus helpers that run them through a shared cache of <see cref="Regex"/> instances.
    /// </summary>
    internal static class RegexParserHelper
    {
        #region Fields and Consts

        /// <summary>
        /// Extracts CSS style comments; e.g. /* comment */
        /// </summary>
        /// <remarks>
        /// The body is matched lazily with <c>[\s\S]</c> so that comments containing
        /// '*' or '/' (URLs, decorative asterisks) and line breaks are matched as well,
        /// regardless of the <see cref="RegexOptions"/> the pattern is compiled with.
        /// </remarks>
        public const string CssComments = @"/\*[\s\S]*?\*/";

        /// <summary>
        /// Extracts the media types from a media at-rule; e.g. @media print, 3d, screen {
        /// </summary>
        public const string CssMediaTypes = @"@media[^\{\}]*\{";

        /// <summary>
        /// Extracts defined blocks in CSS.
        /// WARNING: Blocks will include blocks inside at-rules.
        /// </summary>
        public const string CssBlocks = @"[^\{\}]*\{[^\{\}]*\}";

        /// <summary>
        /// Extracts a number; e.g. 5, 6, 7.5, 0.9
        /// </summary>
        /// <remarks>
        /// Wrapped in a group (not literal braces) so that the alternation stays contained
        /// when this pattern is concatenated into larger ones such as <see cref="CssLineHeight"/>.
        /// </remarks>
        public const string CssNumber = @"([0-9]+|[0-9]*\.[0-9]+)";

        /// <summary>
        /// Extracts css percentages from the string; e.g. 100% .5% 5.4%
        /// </summary>
        public const string CssPercentage = @"([0-9]+|[0-9]*\.[0-9]+)\%"; //TODO: Check if works fine

        /// <summary>
        /// Extracts CSS lengths; e.g. 9px 3pt .89em
        /// </summary>
        public const string CssLength = @"([0-9]+|[0-9]*\.[0-9]+)(em|ex|px|in|cm|mm|pt|pc)";

        /// <summary>
        /// Extracts CSS colors; e.g. black white #fff #fe98cd rgb(5,5,5) rgb(45%, 0, 0)
        /// </summary>
        public const string CssColors = @"(#\S{6}|#\S{3}|rgb\(\s*[0-9]{1,3}\%?\s*\,\s*[0-9]{1,3}\%?\s*\,\s*[0-9]{1,3}\%?\s*\)|maroon|red|orange|yellow|olive|purple|fuchsia|white|lime|green|navy|blue|aqua|teal|black|silver|gray)";

        /// <summary>
        /// Extracts line-height values (normal, numbers, lengths, percentages)
        /// </summary>
        public const string CssLineHeight = "(normal|" + CssNumber + "|" + CssLength + "|" + CssPercentage + ")";

        /// <summary>
        /// Extracts CSS border styles; e.g. solid none dotted
        /// </summary>
        public const string CssBorderStyle = @"(none|hidden|dotted|dashed|solid|double|groove|ridge|inset|outset)";

        /// <summary>
        /// Extracts CSS border width; e.g. 1px thin 3em
        /// </summary>
        public const string CssBorderWidth = "(" + CssLength + "|thin|medium|thick)";

        /// <summary>
        /// Extracts font-family values: a quoted or unquoted family name optionally
        /// followed by further comma separated names.
        /// </summary>
        public const string CssFontFamily = "(\"[^\"]*\"|'[^']*'|\\S+\\s*)(\\s*\\,\\s*(\"[^\"]*\"|'[^']*'|\\S+))*";

        /// <summary>
        /// Extracts CSS font-styles; e.g. normal italic oblique
        /// </summary>
        public const string CssFontStyle = "(normal|italic|oblique)";

        /// <summary>
        /// Extracts CSS font-variant values; e.g. normal, small-caps
        /// </summary>
        public const string CssFontVariant = "(normal|small-caps)";

        /// <summary>
        /// Extracts font-weight values; e.g. normal, bold, bolder...
        /// </summary>
        public const string CssFontWeight = "(normal|bold|bolder|lighter|100|200|300|400|500|600|700|800|900)";

        /// <summary>
        /// Extracts font sizes: xx-small, larger, small, 34pt, 30%, 2em
        /// </summary>
        public const string CssFontSize = "(" + CssLength + "|" + CssPercentage + "|xx-small|x-small|small|medium|large|x-large|xx-large|larger|smaller)";

        /// <summary>
        /// Gets the font-size[/line-height]? on the font shorthand property.
        /// Check http://www.w3.org/TR/CSS21/fonts.html#font-shorthand
        /// </summary>
        public const string CssFontSizeAndLineHeight = CssFontSize + @"(\/" + CssLineHeight + @")?(\s|$)";

        /// <summary>
        /// Extracts HTML tags
        /// </summary>
        public const string HtmlTag = @"<[^<>]*>";

        /// <summary>
        /// Extracts attributes from a HTML tag; e.g. att=value, att="value"
        /// </summary>
        /// <remarks>
        /// The attribute name and value are captured as groups 1 and 2, which are also
        /// exposed under the names "name" and "value".
        /// NOTE(review): the group names are reconstructed (the bare "(?" form is not a
        /// valid grouping construct) - confirm they match what the callers look up.
        /// </remarks>
        public const string HmlTagAttributes = "(?<name>\\b\\w+\\b)\\s*=\\s*(?<value>\"[^\"]*\"|'[^']*'|[^\"'<>\\s]+)";

        /// <summary>
        /// The regexes cache that is used by the parser so not to create a regex each time;
        /// keyed by the pattern string.
        /// NOTE(review): access is not synchronized - presumably parsing happens on a single
        /// thread; confirm before using the helpers concurrently.
        /// </summary>
        private static readonly Dictionary<string, Regex> _regexes = new Dictionary<string, Regex>();

        #endregion


        /// <summary>
        /// Get CSS at rule from the given stylesheet.
        /// </summary>
        /// <param name="stylesheet">the stylesheet data to retrieve the rule from</param>
        /// <param name="startIdx">
        /// the index to start the search for the rule; on return it is the index of the closing
        /// brace of the found rule, the index of the '@' if the rule is incomplete, or -1 if
        /// there is no '@' at or after the given index
        /// </param>
        /// <returns>
        /// the found at rule, from its '@' up to and including its matching closing brace,
        /// or null if none exists or its braces are not balanced
        /// </returns>
        public static string GetCssAtRules(string stylesheet, ref int startIdx)
        {
            if (string.IsNullOrEmpty(stylesheet))
            {
                startIdx = -1;
                return null;
            }

            startIdx = stylesheet.IndexOf('@', startIdx);
            if (startIdx < 0)
            {
                return null;
            }

            int endIdx = stylesheet.IndexOf('{', startIdx);
            if (endIdx < 0)
            {
                return null;
            }

            // Walk forward from the opening brace tracking the nesting depth. The bounds check
            // runs after the increment so the indexer is never asked for stylesheet[Length]
            // when the closing brace is missing.
            int depth = 1;
            while (depth > 0 && ++endIdx < stylesheet.Length)
            {
                char c = stylesheet[endIdx];
                if (c == '{')
                {
                    depth++;
                }
                else if (c == '}')
                {
                    depth--;
                }
            }

            if (depth != 0)
            {
                // ran off the end of the stylesheet: the rule is not terminated
                return null;
            }

            string atrule = stylesheet.Substring(startIdx, endIdx - startIdx + 1);
            startIdx = endIdx;
            return atrule;
        }

        /// <summary>
        /// Extracts matches from the specified source
        /// </summary>
        /// <param name="regex">Regular expression to extract matches</param>
        /// <param name="source">Source to extract matches</param>
        /// <returns>Collection of matches</returns>
        public static MatchCollection Match(string regex, string source)
        {
            return GetRegex(regex).Matches(source);
        }

        /// <summary>
        /// Searches the specified regex on the source
        /// </summary>
        /// <param name="regex">Regular expression to search for</param>
        /// <param name="source">Source to search in</param>
        /// <returns>the text of the first match or null if there is no match</returns>
        public static string Search(string regex, string source)
        {
            int position;
            return Search(regex, source, out position);
        }

        /// <summary>
        /// Searches the specified regex on the source
        /// </summary>
        /// <param name="regex">Regular expression to search for</param>
        /// <param name="source">Source to search in</param>
        /// <param name="position">the index of the first match in the source, -1 if there is no match</param>
        /// <returns>the text of the first match or null if there is no match</returns>
        public static string Search(string regex, string source, out int position)
        {
            // Only the first match is needed, so ask for it directly instead of
            // materializing the whole match collection just to read its first item.
            var first = GetRegex(regex).Match(source);
            if (first.Success)
            {
                position = first.Index;
                return first.Value;
            }

            position = -1;
            return null;
        }

        /// <summary>
        /// Get regex instance for the given regex string; the instance is created on first use
        /// (case-insensitive, single-line mode) and reused from the cache afterwards.
        /// </summary>
        /// <param name="regex">the regex string to use</param>
        /// <returns>the regex instance</returns>
        private static Regex GetRegex(string regex)
        {
            Regex r;
            if (!_regexes.TryGetValue(regex, out r))
            {
                r = new Regex(regex, RegexOptions.IgnoreCase | RegexOptions.Singleline);
                _regexes[regex] = r;
            }
            return r;
        }
    }
}