From 48d610720f11f1ea0f029298c3eeb7f8dc0e347d Mon Sep 17 00:00:00 2001
From: Simon
Date: Mon, 27 May 2024 12:40:35 +0200
Subject: [PATCH] Replace custom css parser with crass

---
 Gemfile.lock                            |   2 +
 css_parser.gemspec                      |   3 +-
 lib/css_parser.rb                       |   1 +
 lib/css_parser/parser.rb                | 323 ++++++++++--------------
 lib/css_parser/regexps.rb               |  11 -
 lib/css_parser/rule_set.rb              |   5 +
 lib/css_parser/rule_set/declarations.rb |   4 +-
 test/test_css_parser_offset_capture.rb  |  41 ++-
 8 files changed, 169 insertions(+), 221 deletions(-)

diff --git a/Gemfile.lock b/Gemfile.lock
index 2de758d..e53e5bf 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -3,6 +3,7 @@ PATH
   specs:
     css_parser (1.18.0)
       addressable
+      crass (~> 1.0)

 GEM
   remote: https://rubygems.org/
@@ -12,6 +13,7 @@ GEM
     ast (2.4.2)
     benchmark-ips (2.13.0)
     bump (0.10.0)
+    crass (1.0.6)
     debug (1.9.2)
       irb (~> 1.10)
       reline (>= 0.3.8)
diff --git a/css_parser.gemspec b/css_parser.gemspec
index d8cf155..d02094d 100644
--- a/css_parser.gemspec
+++ b/css_parser.gemspec
@@ -18,5 +18,6 @@ Gem::Specification.new name, CssParser::VERSION do |s|
   s.metadata['bug_tracker_uri'] = 'https://github.com/premailer/css_parser/issues'
   s.metadata['rubygems_mfa_required'] = 'true'

-  s.add_runtime_dependency 'addressable'
+  s.add_dependency 'addressable'
+  s.add_dependency 'crass', '~> 1.0'
 end
diff --git a/lib/css_parser.rb b/lib/css_parser.rb
index 37426c2..8d71587 100644
--- a/lib/css_parser.rb
+++ b/lib/css_parser.rb
@@ -7,6 +7,7 @@
 require 'zlib'
 require 'stringio'
 require 'iconv' unless String.method_defined?(:encode)
+require 'crass'

 require 'css_parser/version'
 require 'css_parser/rule_set'
diff --git a/lib/css_parser/parser.rb b/lib/css_parser/parser.rb
index 4e42149..1b8f5f8 100644
--- a/lib/css_parser/parser.rb
+++ b/lib/css_parser/parser.rb
@@ -17,12 +17,6 @@ class CircularReferenceError < StandardError; end
   # [io_exceptions] Throw an exception if
a link can not be found. Boolean, default is true.
   class Parser
     USER_AGENT = "Ruby CSS Parser/#{CssParser::VERSION} (https://github.com/premailer/css_parser)".freeze
-    STRIP_CSS_COMMENTS_RX = %r{/\*.*?\*/}m.freeze
-    STRIP_HTML_COMMENTS_RX = /<!--|-->/m.freeze
-
-    # Initial parsing
-    RE_AT_IMPORT_RULE = /@import\s*(?:url\s*)?(?:\()?(?:\s*)["']?([^'"\s)]*)["']?\)?([\w\s,^\]()]*)\)?[;\n]?/.freeze
-
     MAX_REDIRECTS = 3

     # Array of CSS files that have been loaded.
@@ -122,46 +116,111 @@ def add_block!(block, options = {})
       options[:media_types] = [options[:media_types]].flatten.collect { |mt| CssParser.sanitize_media_query(mt) }
       options[:only_media_types] = [options[:only_media_types]].flatten.collect { |mt| CssParser.sanitize_media_query(mt) }

-      block = cleanup_block(block, options)
+      # An @media rule with an empty prelude applies to all media.
+      current_media_queries = options[:media_types].empty? ? [:all] : options[:media_types]

-      if options[:base_uri] and @options[:absolute_paths]
+      # TODO: Would be nice to skip this step too
+      # (token arrays re-entering for an @media body come from text that was already converted)
+      if block.is_a?(String) && options[:base_uri] && @options[:absolute_paths]
         block = CssParser.convert_uris(block, options[:base_uri])
       end

-      # Load @imported CSS
-      if @options[:import]
-        block.scan(RE_AT_IMPORT_RULE).each do |import_rule|
-          media_types = []
-          if (media_string = import_rule[-1])
-            media_string.split(',').each do |t|
-              media_types << CssParser.sanitize_media_query(t) unless t.empty?
+      # Start..end positions covered by +nodes+. Tokens carry :pos themselves; parsed nodes
+      # (properties, nested at-rules) only expose it through their :tokens.
+      offset_of = lambda do |nodes|
+        positions = nodes.filter_map { |item| item[:pos] || item.dig(:tokens, -1, :pos) }
+        positions.first..positions.last
+      end
+
+      # Walk the top-level nodes of the stylesheet (or of an @media body).
+      Crass.parse(block).each do |node|
+        case node
+        in node: :style_rule
+          declarations = create_declaration_from_properties(node[:children])
+
+          add_rule_options = {
+            selectors: node[:selector][:value],
+            block: declarations,
+            media_types: current_media_queries
+          }
+          if options[:capture_offsets]
+            add_rule_options.merge!(
+              filename: options[:filename],
+              offset: offset_of.call(node[:selector][:tokens] + node[:children])
+            )
+          end
+
+          add_rule!(**add_rule_options)
+        in node: :at_rule, name: /\Amedia\z/i
+          # The body is passed on as tokens rather than text so that offsets stay relative to the file.
+          new_media_queries = split_media_query_by_or_condition(node[:prelude])
+          add_block!(node[:block] || [], options.merge(media_types: new_media_queries))
+
+        in node: :at_rule, name: /\A(?:page|font-face)\z/i => at_rule_name
+          # Stored like a rule set whose "selector" is the at-rule header, e.g. "@page :first".
+          declarations = create_declaration_from_properties(Crass.parse_properties(node[:block]))
+          add_rule_options = {
+            selectors: "@#{at_rule_name.downcase}#{Crass::Parser.stringify(node[:prelude])}",
+            block: declarations,
+            media_types: current_media_queries
+          }
+          if options[:capture_offsets]
+            add_rule_options.merge!(
+              filename: options[:filename],
+              offset: offset_of.call(node[:tokens])
+            )
+          end
+          add_rule!(**add_rule_options)
+
+        in node: :at_rule, name: /\Aimport\z/i
+          next unless @options[:import]
+
+          import_location = nil
+          import_options = options.slice(:capture_offsets, :base_uri, :base_dir)
+
+          # The first string or url() names the stylesheet to load.
+          prelude = node[:prelude].each
+          loop do
+            case (token = prelude.next)
+            in node: :whitespace # nothing
+            in node: :string | :url # "file.css", or an unquoted url(file.css)
+              import_location = token[:value]
+              break
+            in node: :function, name: /\Aurl\z/i # url("file.css")
+              import_location = token[:value].find { |arg| arg[:node] == :string }&.fetch(:value)
+              break
+            else
+              break # anything else cannot name a stylesheet
             end
-          else
-            media_types = [:all]
           end

-          next unless options[:only_media_types].include?(:all) or media_types.empty? or !(media_types & options[:only_media_types]).empty?
+          next if import_location.to_s.empty? # e.g. "@import;"

-          import_path = import_rule[0].to_s.gsub(/['"]*/, '').strip
+          # Whatever follows the location is the import's media query list.
+          media_query_section = []
+          loop { media_query_section << prelude.next }
+          import_options[:media_types] = split_media_query_by_or_condition(media_query_section)
+          if import_options[:media_types].empty?
+            import_options[:media_types] = [:all]
+          end

-          import_options = {media_types: media_types}
-          import_options[:capture_offsets] = true if options[:capture_offsets]
+          unless options[:only_media_types].include?(:all) or !(import_options[:media_types] & options[:only_media_types]).empty?
+            next
+          end

           if options[:base_uri]
-            import_uri = Addressable::URI.parse(options[:base_uri].to_s) + Addressable::URI.parse(import_path)
-            import_options[:base_uri] = options[:base_uri]
-            load_uri!(import_uri, import_options)
+            load_uri!(
+              Addressable::URI.parse(options[:base_uri].to_s) + Addressable::URI.parse(import_location),
+              import_options
+            )
           elsif options[:base_dir]
-            import_options[:base_dir] = options[:base_dir]
-            load_file!(import_path, import_options)
+            load_file!(import_location, import_options)
           end
+        else
+          # Whitespace, parse errors and at-rules that map to no rule set (@charset, @keyframes,
+          # @supports, ...) are skipped; without this branch they raise NoMatchingPatternError.
         end
       end
-
-      # Remove @import declarations
-      block = ignore_pattern(block, RE_AT_IMPORT_RULE, options)
-
-      parse_block_into_rule_sets!(block, options)
     end

     # Add a CSS rule by setting the +selectors+, +declarations+
@@ -342,140 +401,6 @@ def compact!
# :nodoc:
       []
     end

-    def parse_block_into_rule_sets!(block, options = {}) # :nodoc:
-      current_media_queries = [:all]
-      if options[:media_types]
-        current_media_queries = options[:media_types].flatten.collect { |mt| CssParser.sanitize_media_query(mt) }
-      end
-
-      in_declarations = 0
-      block_depth = 0
-
-      in_charset = false # @charset is ignored for now
-      in_string = false
-      in_at_media_rule = false
-      in_media_block = false
-
-      current_selectors = String.new
-      current_media_query = String.new
-      current_declarations = String.new
-
-      # once we are in a rule, we will use this to store where we started if we are capturing offsets
-      rule_start = nil
-      offset = nil
-
-      block.scan(/\s+|\\{2,}|\\?[{}\s"]|[()]|.[^\s"{}()\\]*/) do |token|
-        # save the regex offset so that we know where in the file we are
-        offset = Regexp.last_match.offset(0) if options[:capture_offsets]
-
-        if token.start_with?('"') # found un-escaped double quote
-          in_string = !in_string
-        end
-
-        if in_declarations > 0
-          # too deep, malformed declaration block
-          if in_declarations > 1
-            in_declarations -= 1 if token.include?('}')
-            next
-          end
-
-          if !in_string && token.include?('{')
-            in_declarations += 1
-            next
-          end
-
-          current_declarations << token
-
-          if !in_string && token.include?('}')
-            current_declarations.gsub!(/\}\s*$/, '')
-
-            in_declarations -= 1
-            current_declarations.strip!
-
-            unless current_declarations.empty?
-              add_rule_options = {
-                selectors: current_selectors, block: current_declarations,
-                media_types: current_media_queries
-              }
-              if options[:capture_offsets]
-                add_rule_options.merge!(filename: options[:filename], offset: rule_start..offset.last)
-              end
-              add_rule!(**add_rule_options)
-            end
-
-            current_selectors = String.new
-            current_declarations = String.new
-
-            # restart our search for selectors and declarations
-            rule_start = nil if options[:capture_offsets]
-          end
-        elsif token =~ /@media/i
-          # found '@media', reset current media_types
-          in_at_media_rule = true
-          current_media_queries = []
-        elsif in_at_media_rule
-          if token.include?('{')
-            block_depth += 1
-            in_at_media_rule = false
-            in_media_block = true
-            current_media_queries << CssParser.sanitize_media_query(current_media_query)
-            current_media_query = String.new
-          elsif token.include?(',')
-            # new media query begins
-            token.tr!(',', ' ')
-            token.strip!
-            current_media_query << token << ' '
-            current_media_queries << CssParser.sanitize_media_query(current_media_query)
-            current_media_query = String.new
-          else
-            token.strip!
-            # special-case the ( and ) tokens to remove inner-whitespace
-            # (eg we'd prefer '(width: 500px)' to '( width: 500px )' )
-            case token
-            when '('
-              current_media_query << token
-            when ')'
-              current_media_query.sub!(/ ?$/, token)
-            else
-              current_media_query << token << ' '
-            end
-          end
-        elsif in_charset or token =~ /@charset/i
-          # iterate until we are out of the charset declaration
-          in_charset = !token.include?(';')
-        elsif !in_string && token.include?('}')
-          block_depth -= 1
-
-          # reset the current media query scope
-          if in_media_block
-            current_media_queries = [:all]
-            in_media_block = false
-          end
-        elsif !in_string && token.include?('{')
-          current_selectors.strip!
-          in_declarations += 1
-        else
-          # if we are in a selector, add the token to the current selectors
-          current_selectors << token
-
-          # mark this as the beginning of the selector unless we have already marked it
-          rule_start = offset.first if options[:capture_offsets] && rule_start.nil? && token =~ /^[^\s]+$/
-        end
-      end
-
-      # check for unclosed braces
-      return unless in_declarations > 0
-
-      add_rule_options = {
-        selectors: current_selectors, block: current_declarations,
-        media_types: current_media_queries
-      }
-      if options[:capture_offsets]
-        add_rule_options.merge!(filename: options[:filename], offset: rule_start..offset.last)
-      end
-      add_rule!(**add_rule_options)
-    end
-
     # Load a remote CSS file.
     #
     # You can also pass in file://test.css
@@ -565,35 +490,6 @@ def circular_reference_check(path)
       end
     end

-    # Remove a pattern from a given string
-    #
-    # Returns a string.
-    def ignore_pattern(css, regex, options)
-      # if we are capturing file offsets, replace the characters with spaces to retail the original positions
-      return css.gsub(regex) { |m| ' ' * m.length } if options[:capture_offsets]
-
-      # otherwise just strip it out
-      css.gsub(regex, '')
-    end
-
-    # Strip comments and clean up blank lines from a block of CSS.
-    #
-    # Returns a string.
-    def cleanup_block(block, options = {}) # :nodoc:
-      # Strip CSS comments
-      utf8_block = block.encode('UTF-8', 'UTF-8', invalid: :replace, undef: :replace, replace: ' ')
-      utf8_block = ignore_pattern(utf8_block, STRIP_CSS_COMMENTS_RX, options)
-
-      # Strip HTML comments - they shouldn't really be in here but
-      # some people are just crazy...
-      utf8_block = ignore_pattern(utf8_block, STRIP_HTML_COMMENTS_RX, options)
-
-      # Strip lines containing just whitespace
-      utf8_block.gsub!(/^\s+$/, '') unless options[:capture_offsets]
-
-      utf8_block
-    end
-
     # Download a file into a string.
     #
     # Returns the file's data and character set in an array.
@@ -684,6 +580,45 @@ def read_remote_file(uri) # :nodoc:

     private

+    # Splits the tokens of a media query list into its alternatives. A comma and the +or+
+    # keyword (in any letter case) both separate alternatives, see
+    # https://developer.mozilla.org/en-US/docs/Web/CSS/@media#logical_operators
+    # Returns an Array of Symbols, one per non-empty alternative.
+    def split_media_query_by_or_condition(media_query_selector)
+      alternatives = media_query_selector.each_with_object([[]]) do |token, groups|
+        case token
+        in {node: :comma} | {node: :ident, value: /\Aor\z/i}
+          groups << []
+        else
+          groups.last << token
+        end
+      end
+
+      alternatives
+        .map { |tokens| Crass::Parser.stringify(tokens).strip }
+        .reject(&:empty?)
+        .map(&:to_sym)
+    end
+
+    # Builds a RuleSet::Declarations from the nodes Crass yields for a declaration block.
+    # Only +:property+ nodes with a non-empty value are recorded; everything else is skipped.
+    def create_declaration_from_properties(properties)
+      declarations = RuleSet::Declarations.new
+
+      properties.each do |child|
+        case child
+        in node: :property, value: '' # nothing, happens for { color:green; color: }
+        in node: :property
+          value = RuleSet::Declarations::Value.new(child[:value], important: child[:important])
+          declarations.add_declaration!(child[:name], value)
+        else
+          next # whitespace, semicolons, parse errors, nested at-rules: nothing to record
+        end
+      end
+
+      declarations
+    end
+
     # Save a folded declaration block to the internal cache.
     def save_folded_declaration(block_hash, folded_declaration) # :nodoc:
       @folded_declaration_cache[block_hash] = folded_declaration
diff --git a/lib/css_parser/regexps.rb b/lib/css_parser/regexps.rb
index cf83b2d..6e11cb5 100644
--- a/lib/css_parser/regexps.rb
+++ b/lib/css_parser/regexps.rb
@@ -25,14 +25,6 @@ def self.regex_possible_values(*values)
   URI_RX_OR_NONE = Regexp.union(URI_RX, /none/i)
   RE_GRADIENT = /[-a-z]*gradient\([-a-z0-9 .,#%()]*\)/im.freeze

-  # Initial parsing
-  RE_AT_IMPORT_RULE = /@import\s+(url\()?["']?(.[^'"\s]*)["']?\)?([\w\s,^\])]*)\)?;?/.freeze
-
-  #--
-  # RE_AT_MEDIA_RULE = Regexp.new('(\"(.[^\n\r\f\\"]*|\\\\' + RE_NL.to_s + '|' + RE_ESCAPE.to_s + ')*\")')
-
-  # RE_AT_IMPORT_RULE = Regexp.new('@import[\s]*(' + RE_STRING.to_s + ')([\w\s\,]*)[;]?', Regexp::IGNORECASE) -- should handle url() even though it is not allowed
-  #++
   IMPORTANT_IN_PROPERTY_RX = /\s*!important\b\s*/i.freeze

   RE_INSIDE_OUTSIDE = regex_possible_values 'inside', 'outside'
@@ -46,9 +38,6 @@ def self.regex_possible_values(*values)
   )
   RE_IMAGE = Regexp.union(CssParser::URI_RX, CssParser::RE_GRADIENT, /none/i)

-  STRIP_CSS_COMMENTS_RX = %r{/\*.*?\*/}m.freeze
-  STRIP_HTML_COMMENTS_RX = /<!--|-->/m.freeze
-
   # Special units
   BOX_MODEL_UNITS_RX = /(auto|inherit|0|(-*([0-9]+|[0-9]*\.[0-9]+)(rem|vw|vh|vm|vmin|vmax|e[mx]+|px|[cm]+m|p[tc+]|in|%)))([\s;]|\Z)/imx.freeze
   RE_LENGTH_OR_PERCENTAGE = Regexp.new('([\-]*(([0-9]*\.[0-9]+)|[0-9]+)(e[mx]+|px|[cm]+m|p[tc+]|in|\%))', Regexp::IGNORECASE)
diff --git a/lib/css_parser/rule_set.rb b/lib/css_parser/rule_set.rb
index 7668236..a245c8d 100644
--- a/lib/css_parser/rule_set.rb
+++ b/lib/css_parser/rule_set.rb
@@ -447,6 +447,11 @@ def compute_dimensions_shorthand(values)
     end

     def parse_declarations!(block) # :nodoc:
+      if block.is_a?
Declarations
+        self.declarations = block
+        return
+      end
+
       self.declarations = Declarations.new

       return unless block
diff --git a/lib/css_parser/rule_set/declarations.rb b/lib/css_parser/rule_set/declarations.rb
index 9493f68..bffddc2 100644
--- a/lib/css_parser/rule_set/declarations.rb
+++ b/lib/css_parser/rule_set/declarations.rb
@@ -34,7 +34,7 @@ def ==(other)

       extend Forwardable

-      def_delegators :declarations, :each, :each_value
+      def_delegators :declarations, :each, :each_key, :each_value

       def initialize(declarations = {})
         self.declarations = {}
@@ -155,7 +155,7 @@ def replace_declaration!(replacing_property, replacements, preserve_importance:
         property_index = propperties.index(replacing_property)
         property_with_higher_precidence = propperties[(property_index + 1)..].to_set

-        replacement_declarations.each do |property, _value|
+        replacement_declarations.each_key do |property|
           if property_with_higher_precidence.member?(property)
             replacement_declarations.delete(property)
           else
diff --git a/test/test_css_parser_offset_capture.rb b/test/test_css_parser_offset_capture.rb
index a29e181..c949f59 100644
--- a/test/test_css_parser_offset_capture.rb
+++ b/test/test_css_parser_offset_capture.rb
@@ -18,30 +18,31 @@ def test_capturing_offsets_for_local_file

     # check that we found the body rule where we expected
     assert_equal 0, rules[0].offset.first
-    assert_equal 43, rules[0].offset.last
+    assert_equal 41, rules[0].offset.last
     assert_equal file_name, rules[0].filename

     # and the p rule
     assert_equal 45, rules[1].offset.first
-    assert_equal 63, rules[1].offset.last
+    assert_equal 61, rules[1].offset.last
     assert_equal file_name, rules[1].filename
   end

   # http://github.com/premailer/css_parser/issues#issue/4
   def test_capturing_offsets_from_remote_file
     # TODO: test SSL locally
+    # TODO: cache request to make test not require internet (and so much faster)
     @cp.load_uri!("https://dialect.ca/inc/screen.css", capture_offsets: true)

     # there are a lot of rules in this file, but check some rule offsets
     rules = @cp.find_rule_sets(['#container', '#name_case_converter textarea'])
     assert_equal 2, rules.count

-    assert_equal 2172, rules.first.offset.first
-    assert_equal 2227, rules.first.offset.last
+    assert_equal 2_172, rules.first.offset.first
+    assert_equal 2_225, rules.first.offset.last
     assert_equal 'https://dialect.ca/inc/screen.css', rules.first.filename

     assert_equal 10_703, rules.last.offset.first
-    assert_equal 10_752, rules.last.offset.last
+    assert_equal 10_750, rules.last.offset.last
     assert_equal 'https://dialect.ca/inc/screen.css', rules.last.filename
   end

@@ -58,22 +59,36 @@ def test_capturing_offsets_from_string
     assert_equal 4, rules.count

     assert_equal 6, rules[0].offset.first
-    assert_equal 27, rules[0].offset.last
+    assert_equal 25, rules[0].offset.last
     assert_equal 'index.html', rules[0].filename

     assert_equal 34, rules[1].offset.first
-    assert_equal 53, rules[1].offset.last
+    assert_equal 51, rules[1].offset.last
     assert_equal 'index.html', rules[1].filename

     assert_equal 60, rules[2].offset.first
-    assert_equal 102, rules[2].offset.last
+    assert_equal 100, rules[2].offset.last
     assert_equal 'index.html', rules[2].filename

     assert_equal 109, rules[3].offset.first
-    assert_equal 133, rules[3].offset.last
+    assert_equal 131, rules[3].offset.last
     assert_equal 'index.html', rules[3].filename
   end

+  def test_capturing_offsets_from_string_without_closing_bracket
+    css = <<-CSS
+      body { margin: 0px;
+    CSS
+    @cp.load_string!(css, capture_offsets: true, filename: 'index.html')
+
+    rules = @cp.find_rule_sets(['body', 'p', '#content', '.content'])
+    assert_equal 1, rules.count
+
+    assert_equal 6, rules[0].offset.first
+    assert_equal 25, rules[0].offset.last
+    assert_equal 'index.html', rules[0].filename
+  end
+
   def test_capturing_offsets_with_imports
     base_dir = Pathname.new(__dir__).join('fixtures')
     @cp.load_file!('import1.css', base_dir: base_dir, capture_offsets: true)
@@ -83,25 +98,25 @@ def test_capturing_offsets_with_imports
     # check that we found the div rule where we expected in the primary file
     assert_equal 'div', rules[0].selectors.join
     assert_equal 31, rules[0].offset.first
-    assert_equal 51, rules[0].offset.last
+    assert_equal 49, rules[0].offset.last
     assert_equal base_dir.join('import1.css').to_s, rules[0].filename

     # check that the a rule in the first import is where we expect
     assert_equal 'a', rules[1].selectors.join
     assert_equal 26, rules[1].offset.first
-    assert_equal 54, rules[1].offset.last
+    assert_equal 52, rules[1].offset.last
     assert_equal base_dir.join('subdir/import2.css').to_s, rules[1].filename

     # and the body rule in the second import
     assert_equal 'body', rules[2].selectors.join
     assert_equal 0, rules[2].offset.first
-    assert_equal 43, rules[2].offset.last
+    assert_equal 41, rules[2].offset.last
     assert_equal base_dir.join('simple.css').to_s, rules[2].filename

     # as well as the p rule in the second import
     assert_equal 'p', rules[3].selectors.join
     assert_equal 45, rules[3].offset.first
-    assert_equal 63, rules[3].offset.last
+    assert_equal 61, rules[3].offset.last
     assert_equal base_dir.join('simple.css').to_s, rules[3].filename
   end
 end