diff --git a/Gemfile b/Gemfile index 00ad8ed..ee7d012 100644 --- a/Gemfile +++ b/Gemfile @@ -3,6 +3,7 @@ source 'https://rubygems.org' # Specify your gem's dependencies in js_regex.gemspec gemspec +gem 'colorize' gem 'debug' gem 'gouteur', '~> 1.0' gem 'mini_racer', '~> 0.16' diff --git a/README.md b/README.md index 047dc6e..53d3943 100644 --- a/README.md +++ b/README.md @@ -22,11 +22,14 @@ Add it to your gemfile or run In Ruby: ```ruby -require 'js_regex' +require 'lang_regex' ruby_hex_regex = /0x\h+/i -js_regex = JsRegex.new(ruby_hex_regex) +# To JS +js_regex = LangRegex::JsRegex.new(ruby_hex_regex) +# To PHP +php_regex = LangRegex::PhpRegex.new(ruby_hex_regex) js_regex.warnings # => [] js_regex.source # => '0x[0-9A-F]+' @@ -59,7 +62,7 @@ var regExp = new RegExp(jsonObj.source, jsonObj.options); You might have noticed the empty `warnings` array in the example above: ```ruby -js_regex = JsRegex.new(ruby_hex_regex) +js_regex = LangRegex::JsRegex.new(ruby_hex_regex) js_regex.warnings # => [] ``` @@ -68,18 +71,18 @@ If this array isn't empty, that means that your Ruby regex contained some stuff ```ruby advanced_ruby_regex = /(? ["Dropped unsupported negative lookbehind '(? 'buzz' ``` -There is also a strict initializer, `JsRegex::new!`, which raises a `JsRegex::Error` if there are incompatibilites. This is particularly useful if you use JsRegex to convert regex-like strings, e.g. strings entered by users, as a `JsRegex::Error` might also occur if the given regex is invalid: +There is also a strict initializer, `LangRegex::new!`, which raises a `LangRegex::Error` if there are incompatibilites. This is particularly useful if you use JsRegex to convert regex-like strings, e.g. strings entered by users, as a `LangRegex::Error` might also occur if the given regex is invalid: ```ruby begin user_input = '(' - JsRegex.new(user_input) -rescue JsRegex::Error => e + LangRegex::JsRegex.new(user_input) +rescue LangRegex::Error => e e.message # => "Premature end of pattern (missing group closing parenthesis)" end ``` @@ -89,7 +92,7 @@ end An `options:` argument lets you append options (a.k.a. "flags") to the output: ```ruby -JsRegex.new(/x/i, options: 'g').to_h +LangRegex::JsRegex.new(/x/i, options: 'g').to_h # => { source: 'x', options: 'gi' } ``` @@ -101,13 +104,13 @@ A `target:` argument can be given to target more recent versions of JS and unloc ```ruby # ES2015 and greater use the u-flag to avoid lengthy escape sequences -JsRegex.new(/😋/, target: 'ES2009').to_s # => "/(?:\\uD83D\\uDE0B)/" -JsRegex.new(/😋/, target: 'ES2015').to_s # => "/😋/u" -JsRegex.new(/😋/, target: 'ES2018').to_s # => "/😋/u" +LangRegex::JsRegex.new(/😋/, target: 'ES2009').to_s # => "/(?:\\uD83D\\uDE0B)/" +LangRegex::JsRegex.new(/😋/, target: 'ES2015').to_s # => "/😋/u" +LangRegex::JsRegex.new(/😋/, target: 'ES2018').to_s # => "/😋/u" # ES2018 adds support for lookbehinds, properties etc. -JsRegex.new(/foo\K\p{ascii}/, target: 'ES2015').to_s # => "/foo[\x00-\x7f]/" -JsRegex.new(/foo\K\p{ascii}/, target: 'ES2018').to_s # => "/(?<=foo)\p{ASCII}/" +LangRegex::JsRegex.new(/foo\K\p{ascii}/, target: 'ES2015').to_s # => "/foo[\x00-\x7f]/" +LangRegex::JsRegex.new(/foo\K\p{ascii}/, target: 'ES2018').to_s # => "/(?<=foo)\p{ASCII}/" ``` diff --git a/bin/console b/bin/console index dc83387..1b76a6e 100755 --- a/bin/console +++ b/bin/console @@ -1,7 +1,7 @@ #!/usr/bin/env ruby require 'bundler/setup' -require 'js_regex' +require 'lang_regex' RP = Regexp::Parser RS = Regexp::Scanner diff --git a/bin/lang_regex b/bin/lang_regex new file mode 100755 index 0000000..419e0e2 --- /dev/null +++ b/bin/lang_regex @@ -0,0 +1,64 @@ +#!/usr/bin/env ruby + +require 'bundler/setup' +require 'colorize' +require 'optparse' + +require 'lang_regex' + +def display_conversion(lang, regex) + puts "------------------- #{lang}".green + if LangRegex::Target::JS.include? lang.to_s + puts LangRegex::JsRegex.new(Regexp.new(regex), target: lang) + else + puts Object.const_get("LangRegex::#{lang}Regex").new(Regexp.new(regex)) + end +end + +def display_all_conversions(langs, regex) + langs.each do |lang| + display_conversion lang, regex + end +end + +langs = [] + +option_parser = OptionParser.new do |opts| + opts.banner = 'Usage: lang_regex [options] [single regex]' + + # Output languages. + opts.separator '' + opts.separator 'Output languages:' + opts.on('-j', '--java', 'convert to Java regex') do + langs << :Java + end + opts.separator '' + opts.on('-J', '--es09', 'convert to ES09 regex') do + langs << :ES2009 + end + opts.on('--es15', 'convert to ES15 regex') do + langs << :ES2015 + end + opts.on('--es18', 'convert to ES18 regex') do + langs << :ES2018 + end + opts.separator '' + opts.on('-p', '--php', 'convert to Php regex') do + langs << :Php + end + opts.separator '' + opts.on('-y', '--python', 'convert to Python regex') do + langs << :Python + end +end + +option_parser.parse! + +if ARGV.empty? + require 'readline' + while (buf = Readline.readline('❯❯❯ '.red, true)) + display_all_conversions langs, buf + end +else + display_all_conversions langs, ARGV.last +end diff --git a/js_regex.gemspec b/lang_regex.gemspec similarity index 96% rename from js_regex.gemspec rename to lang_regex.gemspec index 6954160..87bf3b9 100644 --- a/js_regex.gemspec +++ b/lang_regex.gemspec @@ -5,7 +5,7 @@ require File.join(dir, 'lib', 'js_regex', 'version') Gem::Specification.new do |s| s.platform = Gem::Platform::RUBY s.name = 'js_regex' - s.version = JsRegex::VERSION + s.version = LangRegex::VERSION s.license = 'MIT' s.summary = 'Converts Ruby regexes to JavaScript regexes.' diff --git a/lib/js_regex.rb b/lib/js_regex.rb deleted file mode 100644 index d6f16aa..0000000 --- a/lib/js_regex.rb +++ /dev/null @@ -1,46 +0,0 @@ -# JsRegex converts ::Regexp instances to JavaScript. -# -# Usage: -# -# js_regex = JsRegex.new(my_ruby_regex) -# js_regex.to_h # for use in 'new RegExp()' -# js_regex.to_s # for direct injection into JavaScript -# -class JsRegex - require_relative File.join('js_regex', 'conversion') - require_relative File.join('js_regex', 'error') - require_relative File.join('js_regex', 'version') - require 'json' - - attr_reader :source, :options, :warnings, :target - - def initialize(ruby_regex, **kwargs) - @source, @options, @warnings, @target = Conversion.of(ruby_regex, **kwargs) - end - - def to_h - { source: source, options: options } - end - - def to_json(options = {}) - to_h.to_json(options) - end - - def to_s - "/#{source.empty? ? '(?:)' : source}/#{options}" - end - - # @raise JsRegex::ConversionError - def self.new!(ruby_regex, **kwargs) - new(ruby_regex, fail_fast: true, **kwargs) - end - - def self.compatible?(ruby_regex, **kwargs) - new!(ruby_regex, **kwargs) - true - rescue ConversionError - false - end - - ConversionError = Class.new(StandardError).send(:include, JsRegex::Error) -end diff --git a/lib/js_regex/conversion.rb b/lib/js_regex/conversion.rb index 7ee3b6d..3926356 100644 --- a/lib/js_regex/conversion.rb +++ b/lib/js_regex/conversion.rb @@ -1,4 +1,4 @@ -class JsRegex +module LangRegex # # This class acts as a facade, passing a Regexp to the Converters. # @@ -6,34 +6,34 @@ class JsRegex # class Conversion require 'regexp_parser' + require_relative 'target' require_relative 'converter' require_relative 'error' require_relative 'node' require_relative 'second_pass' - require_relative 'target' class << self - def of(input, options: nil, target: Target::ES2009, fail_fast: false) + def of(input, converter, options: nil, target: Target::ES2009, fail_fast: false) target = Target.cast(target) - source, warnings, extra_opts = convert_source(input, target, fail_fast) + source, warnings, extra_opts = convert_source(input, converter, target, fail_fast) options_string = convert_options(input, options, extra_opts) [source, options_string, warnings, target] end private - def convert_source(input, target, fail_fast) + def convert_source(input, converter, target, fail_fast) tree = Regexp::Parser.parse(input) context = Converter::Context.new( case_insensitive_root: tree.i?, target: target, fail_fast: fail_fast, ) - converted_tree = Converter.convert(tree, context) + converted_tree = converter.convert(tree, context) final_tree = SecondPass.call(converted_tree) [final_tree.to_s, context.warnings, context.required_options] rescue Regexp::Parser::Error => e - raise e.extend(JsRegex::Error) + raise e.extend(Error) end def convert_options(input, custom_options, required_options) diff --git a/lib/js_regex/converter.rb b/lib/js_regex/converter.rb index 7010683..aa5986b 100644 --- a/lib/js_regex/converter.rb +++ b/lib/js_regex/converter.rb @@ -1,36 +1,25 @@ -class JsRegex +module LangRegex module Converter Dir[File.join(__dir__, 'converter', '*.rb')].sort.each do |file| require file end - MAP = Hash.new(UnsupportedTokenConverter).merge( - anchor: AnchorConverter, - assertion: AssertionConverter, - backref: BackreferenceConverter, - conditional: ConditionalConverter, - escape: EscapeConverter, - expression: SubexpressionConverter, - free_space: FreespaceConverter, - group: GroupConverter, - keep: KeepConverter, - literal: LiteralConverter, - meta: MetaConverter, - nonproperty: PropertyConverter, - property: PropertyConverter, - set: SetConverter, - type: TypeConverter - ).freeze + class Converter + def initialize(converters_map) + @converters_map = converters_map + @converters_map.default ||= UnsupportedTokenConverter + end - class << self def convert(exp, context = nil) self.for(exp).convert(exp, context || Context.new) end def for(expression) - MAP[expression.type].new + @converters_map[expression.type].new(self) end + end + class << self # Legacy method. Remove in v4.0.0. def surrogate_pair_limit=(_arg) warn '#surrogate_pair_limit= is deprecated and has no effect anymore.' diff --git a/lib/js_regex/converter/anchor_converter.rb b/lib/js_regex/converter/anchor_converter.rb index 4906b59..9404687 100644 --- a/lib/js_regex/converter/anchor_converter.rb +++ b/lib/js_regex/converter/anchor_converter.rb @@ -1,46 +1,45 @@ require_relative 'base' -class JsRegex +module LangRegex module Converter # # Template class implementation. # - class AnchorConverter < JsRegex::Converter::Base + class AnchorConverter < Base private def convert_data case subtype - when :bol, :bos then '^' - when :eol, :eos then '$' - when :eos_ob_eol then '(?=\n?$)' - when :word_boundary then convert_boundary + when :bol, :eol then pass_through + when :bos then convert_beginning_of_string + when :eos then convert_end_of_string + when :eos_ob_eol then convert_end_of_string_with_new_line + when :word_boundary then convert_boundary when :nonword_boundary then convert_nonboundary else warn_of_unsupported_feature end end + def convert_beginning_of_string + '(?= Target::ES2015 + ES_TARGETS[1..].include?(target) end def es_2018_or_higher? - target >= Target::ES2018 + ES_TARGETS[2..].include?(target) end # these methods allow appending options to the final Conversion output @@ -99,13 +102,13 @@ def original_capturing_group_count attr_accessor :added_capturing_groups_after_group, :recursions_per_expression, - :required_options_hash, - :target + :required_options_hash attr_writer :capturing_group_count, :case_insensitive_root, :fail_fast, :in_atomic_group, + :target, :warnings def total_added_capturing_groups diff --git a/lib/js_regex/converter/escape_converter.rb b/lib/js_regex/converter/escape_converter.rb index 8f9b677..60f4694 100644 --- a/lib/js_regex/converter/escape_converter.rb +++ b/lib/js_regex/converter/escape_converter.rb @@ -1,12 +1,12 @@ require_relative 'base' require_relative 'literal_converter' -class JsRegex +module LangRegex module Converter # # Template class implementation. # - class EscapeConverter < JsRegex::Converter::Base + class EscapeConverter < Base ESCAPES_SHARED_BY_RUBY_AND_JS = %i[ alternation backslash diff --git a/lib/js_regex/converter/freespace_converter.rb b/lib/js_regex/converter/freespace_converter.rb index b4bb086..de325a0 100644 --- a/lib/js_regex/converter/freespace_converter.rb +++ b/lib/js_regex/converter/freespace_converter.rb @@ -1,11 +1,11 @@ require_relative 'base' -class JsRegex +module LangRegex module Converter # # Template class implementation. # - class FreespaceConverter < JsRegex::Converter::Base + class FreespaceConverter < Base private def convert_data diff --git a/lib/js_regex/converter/group_converter.rb b/lib/js_regex/converter/group_converter.rb index 25ad40a..6cdf425 100644 --- a/lib/js_regex/converter/group_converter.rb +++ b/lib/js_regex/converter/group_converter.rb @@ -1,11 +1,11 @@ require_relative 'base' -class JsRegex +module LangRegex module Converter # # Template class implementation. # - class GroupConverter < JsRegex::Converter::Base + class GroupConverter < Base private def convert_data diff --git a/lib/js_regex/converter/keep_converter.rb b/lib/js_regex/converter/keep_converter.rb index 9481232..0f115bc 100644 --- a/lib/js_regex/converter/keep_converter.rb +++ b/lib/js_regex/converter/keep_converter.rb @@ -1,11 +1,11 @@ require_relative 'base' -class JsRegex +module LangRegex module Converter # # Template class implementation. # - class KeepConverter < JsRegex::Converter::Base + class KeepConverter < Base private def convert_data diff --git a/lib/js_regex/converter/literal_converter.rb b/lib/js_regex/converter/literal_converter.rb index 9ef1386..18dafff 100644 --- a/lib/js_regex/converter/literal_converter.rb +++ b/lib/js_regex/converter/literal_converter.rb @@ -1,43 +1,60 @@ require_relative 'base' -class JsRegex +module LangRegex module Converter # # Template class implementation. # - class LiteralConverter < JsRegex::Converter::Base + class LiteralConverter < Base ASTRAL_PLANE_CODEPOINT_PATTERN = /[\u{10000}-\u{10FFFF}]/ + + ESCAPES = Hash.new { |h, k| raise KeyError, "#{h}[#{k.inspect}]" } + .merge("\f\n\r\t\v".chars.to_h { |c| [c, Regexp.escape(c)] }) + .merge('/' => '\\/') LITERAL_REQUIRING_ESCAPE_PATTERN = /[\/\f\n\r\t\v]/ class << self + def add_escapes(to_escape) + [ + ESCAPES.dup.merge(to_escape.chars.to_h { |c| [c, "\\#{c}"] }), + /#{LITERAL_REQUIRING_ESCAPE_PATTERN.source}|[#{to_escape}]/ + ] + end + def convert_data(data, context) if !context.u? && data =~ ASTRAL_PLANE_CODEPOINT_PATTERN if context.enable_u_option - escape_incompatible_bmp_literals(data) + escape_incompatible_bmp_literals(data, context.target) else - convert_astral_data(data) + convert_astral_data(data, context.target) end else - escape_incompatible_bmp_literals(data) + escape_incompatible_bmp_literals(data, context.target) end end - def convert_astral_data(data) + def convert_astral_data(data, target) data.each_char.each_with_object(Node.new) do |char, node| if char.ord > 0xFFFF node << surrogate_substitution_for(char) else - node << escape_incompatible_bmp_literals(char) + node << escape_incompatible_bmp_literals(char, target) end end end - ESCAPES = Hash.new { |h, k| raise KeyError, "#{h}[#{k.inspect}]" } - .merge("\f\n\r\t\v".chars.to_h { |c| [c, Regexp.escape(c)] }) - .merge('/' => '\\/') - - def escape_incompatible_bmp_literals(data) - data.gsub(LITERAL_REQUIRING_ESCAPE_PATTERN, ESCAPES) + def escape_incompatible_bmp_literals(data, target) + class_name = Target.class_name(target) + literal_requiring_escape_pattern, escapes = + if ::LangRegex::Converter.const_defined?("#{class_name}::LiteralConverter") + literal_converter = ::LangRegex::Converter.const_get("#{class_name}::LiteralConverter") + if literal_converter.const_defined?('LITERAL_REQUIRING_ESCAPE_PATTERN') \ + && literal_converter.const_defined?('ESCAPES') + [literal_converter::LITERAL_REQUIRING_ESCAPE_PATTERN, literal_converter::ESCAPES] + end + end \ + || [self::LITERAL_REQUIRING_ESCAPE_PATTERN, self::ESCAPES] + data.gsub(literal_requiring_escape_pattern, escapes) end private diff --git a/lib/js_regex/converter/meta_converter.rb b/lib/js_regex/converter/meta_converter.rb index e55869b..3eb53f4 100644 --- a/lib/js_regex/converter/meta_converter.rb +++ b/lib/js_regex/converter/meta_converter.rb @@ -1,11 +1,11 @@ require_relative 'base' -class JsRegex +module LangRegex module Converter # # Template class implementation. # - class MetaConverter < JsRegex::Converter::Base + class MetaConverter < Base DOT_EXPANSION = '(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\n\uD800-\uDFFF])' ML_DOT_EXPANSION = '(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\uD800-\uDFFF])' # Possible improvements for dot conversion: diff --git a/lib/js_regex/converter/property_converter.rb b/lib/js_regex/converter/property_converter.rb index 4aa819a..580a97d 100644 --- a/lib/js_regex/converter/property_converter.rb +++ b/lib/js_regex/converter/property_converter.rb @@ -1,7 +1,7 @@ require_relative 'base' require 'character_set' -class JsRegex +module LangRegex module Converter # # Template class implementation. @@ -9,7 +9,7 @@ module Converter # Uses the `character_set` and `regexp_property_values` gems to get the # codepoints matched by the property and build a set string from them. # - class PropertyConverter < JsRegex::Converter::Base + class PropertyConverter < Base # A map of normalized Ruby property names to names supported by ES2018+. def self.map @map ||= File.read("#{__dir__}/property_map.csv").scan(/(.+),(.+)/).to_h diff --git a/lib/js_regex/converter/set_converter.rb b/lib/js_regex/converter/set_converter.rb index 88dfebd..e4ae57d 100644 --- a/lib/js_regex/converter/set_converter.rb +++ b/lib/js_regex/converter/set_converter.rb @@ -3,7 +3,7 @@ require_relative 'type_converter' require 'character_set' -class JsRegex +module LangRegex module Converter # # Template class implementation. @@ -13,7 +13,7 @@ module Converter # children, it uses the `character_set` gem to establish the codepoints # matched by the whole set and build a completely new set string. # - class SetConverter < JsRegex::Converter::Base + class SetConverter < Base private def convert_data @@ -45,7 +45,7 @@ def simple_convert_child(exp) exp.text =~ LiteralConverter::ASTRAL_PLANE_CODEPOINT_PATTERN && !context.enable_u_option - LiteralConverter.escape_incompatible_bmp_literals(exp.text) + LiteralConverter.escape_incompatible_bmp_literals(exp.text, context.target) when :set # full conversion is needed for nested sets and intersections exp.token.equal?(:range) && exp.expressions.map do |op| @@ -59,10 +59,10 @@ def simple_convert_child(exp) case exp.token when *CONVERTIBLE_ESCAPE_TOKENS - EscapeConverter.new.convert(exp, context) + EscapeConverter.new(@converter).convert(exp, context) when :literal exp.char.ord <= 0xFFFF && - LiteralConverter.escape_incompatible_bmp_literals(exp.char) + LiteralConverter.escape_incompatible_bmp_literals(exp.char, context.target) end end end diff --git a/lib/js_regex/converter/subexpression_converter.rb b/lib/js_regex/converter/subexpression_converter.rb index e253861..d2c19dd 100644 --- a/lib/js_regex/converter/subexpression_converter.rb +++ b/lib/js_regex/converter/subexpression_converter.rb @@ -1,11 +1,11 @@ require_relative 'base' -class JsRegex +module LangRegex module Converter # # Template class implementation. # - class SubexpressionConverter < JsRegex::Converter::Base + class SubexpressionConverter < Base private def convert_data diff --git a/lib/js_regex/converter/type_converter.rb b/lib/js_regex/converter/type_converter.rb index 7230f0e..f90f953 100644 --- a/lib/js_regex/converter/type_converter.rb +++ b/lib/js_regex/converter/type_converter.rb @@ -1,18 +1,15 @@ require_relative 'base' -class JsRegex +module LangRegex module Converter # # Template class implementation. # - class TypeConverter < JsRegex::Converter::Base + class TypeConverter < Base HEX_EXPANSION = '[0-9A-Fa-f]' NONHEX_EXPANSION = '[^0-9A-Fa-f]' I_MODE_HEX_EXPANSION = '[0-9A-F]' I_MODE_NONHEX_EXPANSION = '[^0-9A-F]' - ES2018_HEX_EXPANSION = '\p{AHex}' - ES2018_NONHEX_EXPANSION = '\P{AHex}' - ES2018_XGRAPHEME_EXPANSION = '[\P{M}\P{Lm}](?:(?:[\u035C\u0361]\P{M}\p{M}*)|\u200d|\p{M}|\p{Lm}|\p{Emoji_Modifier})*' LINEBREAK_EXPANSION = '(?:\r\n|[\n\v\f\r\u0085\u2028\u2029])' def self.directly_compatible?(expression, _context = nil) @@ -44,9 +41,7 @@ def convert_data end def hex_expansion - if context.es_2018_or_higher? && context.enable_u_option - ES2018_HEX_EXPANSION - elsif context.case_insensitive_root + if context.case_insensitive_root I_MODE_HEX_EXPANSION else HEX_EXPANSION @@ -54,9 +49,7 @@ def hex_expansion end def nonhex_expansion - if context.es_2018_or_higher? && context.enable_u_option - ES2018_NONHEX_EXPANSION - elsif context.case_insensitive_root + if context.case_insensitive_root I_MODE_NONHEX_EXPANSION else NONHEX_EXPANSION @@ -78,11 +71,7 @@ def character_set end def xgrapheme - if context.es_2018_or_higher? && context.enable_u_option - ES2018_XGRAPHEME_EXPANSION - else - warn_of_unsupported_feature - end + warn_of_unsupported_feature end end end diff --git a/lib/js_regex/converter/unsupported_token_converter.rb b/lib/js_regex/converter/unsupported_token_converter.rb index 4ee4de7..b611dd2 100644 --- a/lib/js_regex/converter/unsupported_token_converter.rb +++ b/lib/js_regex/converter/unsupported_token_converter.rb @@ -1,11 +1,11 @@ require_relative 'base' -class JsRegex +module LangRegex module Converter # # Template class implementation. # - class UnsupportedTokenConverter < JsRegex::Converter::Base + class UnsupportedTokenConverter < Base private def convert_data diff --git a/lib/js_regex/error.rb b/lib/js_regex/error.rb index c3a41e5..ec7898a 100644 --- a/lib/js_regex/error.rb +++ b/lib/js_regex/error.rb @@ -1,5 +1,5 @@ -class JsRegex +module LangRegex # This is mixed into errors, e.g. those thrown by the parser, - # allowing to `rescue JsRegex::Error`. + # allowing to `rescue LangRegex::Error`. module Error; end end diff --git a/lib/js_regex/langs/java/converter/anchor_converter.rb b/lib/js_regex/langs/java/converter/anchor_converter.rb new file mode 100644 index 0000000..86e75d1 --- /dev/null +++ b/lib/js_regex/langs/java/converter/anchor_converter.rb @@ -0,0 +1,38 @@ +module LangRegex + module Converter + module Java + # + # Template class implementation. + # + class AnchorConverter < ::LangRegex::Converter::AnchorConverter + private + + def convert_beginning_of_string + '\\\\A' + end + + # Despite \b and \B existing in Java regexes, they do not behave the + # same way for some utf-8 characters. + # This is an approximation to the word boundary behavior in Ruby, c.f. + # https://github.com/ruby/ruby/blob/08476c45/tool/enc-unicode.rb#L130 + W = '\\\\d\\\\p{L}\\\\p{M}\\\\p{Pc}' + + def convert_boundary + "(?:(?<=[#{W}])(?![#{W}])|(?\-=!') + end + end + end +end diff --git a/lib/js_regex/langs/java/converter/type_converter.rb b/lib/js_regex/langs/java/converter/type_converter.rb new file mode 100644 index 0000000..3611012 --- /dev/null +++ b/lib/js_regex/langs/java/converter/type_converter.rb @@ -0,0 +1,19 @@ +module LangRegex + module Converter + module Java + # + # Template class implementation. + # + class TypeConverter < ::LangRegex::Converter::TypeConverter + def convert_data + case subtype + when :digit, :space, :word, :nondigit, :nonspace, :nonword + Regexp.escape(expression) + else + super + end + end + end + end + end +end diff --git a/lib/js_regex/langs/java/java_regex.rb b/lib/js_regex/langs/java/java_regex.rb new file mode 100644 index 0000000..931545a --- /dev/null +++ b/lib/js_regex/langs/java/java_regex.rb @@ -0,0 +1,26 @@ +module LangRegex + class JavaRegex < LangRegex + Dir[File.join(__dir__, 'converter', '*.rb')].sort.each do |file| + require file + end + + def initialize(ruby_regex, **kwargs) + super(ruby_regex, self.class.java_converter, target: Target::JAVA, **kwargs) + end + + def self.java_converter + Converter::Converter.new( + { + anchor: Converter::Java::AnchorConverter, + expression: Converter::SubexpressionConverter, + literal: Converter::LiteralConverter, + type: Converter::Java::TypeConverter + } + ) + end + + def to_s + (options.empty? ? '' : "(?#{options})") << source + end + end +end diff --git a/lib/js_regex/langs/js/converter/anchor_converter.rb b/lib/js_regex/langs/js/converter/anchor_converter.rb new file mode 100644 index 0000000..4f56766 --- /dev/null +++ b/lib/js_regex/langs/js/converter/anchor_converter.rb @@ -0,0 +1,32 @@ +module LangRegex + module Converter + module Js + # + # Template class implementation. + # + class AnchorConverter < ::LangRegex::Converter::AnchorConverter + private + + # This is an approximation to the word boundary behavior in Ruby, c.f. + # https://github.com/ruby/ruby/blob/08476c45/tool/enc-unicode.rb#L130 + W = '\d\p{L}\p{M}\p{Pc}' + + def convert_boundary + if context.es_2018_or_higher? && context.enable_u_option + "(?:(?<=[#{W}])(?![#{W}])|(? in Java', targets: [JAVA] do + expect(//).to\ + become(/\>/).and keep_matching('>') + end + it 'does not add escapes to \\n' do expect(/\\n/).to stay_the_same.and keep_matching('\\n', with_results: %w[\\n]) end diff --git a/spec/lib/js_regex/converter/meta_converter_spec.rb b/spec/lib/js_regex/converter/meta_converter_spec.rb index 86d102d..514ee65 100644 --- a/spec/lib/js_regex/converter/meta_converter_spec.rb +++ b/spec/lib/js_regex/converter/meta_converter_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe JsRegex::Converter::MetaConverter do +describe LangRegex::Converter::MetaConverter do it 'replaces the dot meta char so that it keeps matching astral stuff, too' do expect(/a.a/).to\ become('a(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\n\uD800-\uDFFF])a') diff --git a/spec/lib/js_regex/converter/property_converter_spec.rb b/spec/lib/js_regex/converter/property_converter_spec.rb index 2d186dd..8b829b3 100644 --- a/spec/lib/js_regex/converter/property_converter_spec.rb +++ b/spec/lib/js_regex/converter/property_converter_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe JsRegex::Converter::PropertyConverter do +describe LangRegex::Converter::PropertyConverter do it 'substitutes the \p{...} property style', targets: [ES2009, ES2015] do expect(/\p{ascii}/).to\ become(/[\x00-\x7F]/).and keep_matching('añB', with_results: %w[a B]) @@ -50,12 +50,12 @@ end it 'uses case-insensitive substitutions if needed' do - result = JsRegex.new(/1(?i:\p{lower})2/) + result = LangRegex::JsRegex.new(/1(?i:\p{lower})2/) expect(result.source).to include 'A-Z' end it 'does not use case-insensitive substitutions if everything is i anyway' do - result = JsRegex.new(/1\p{lower}2/i) + result = LangRegex::JsRegex.new(/1\p{lower}2/i) expect(result.source).not_to include 'A-Z' expect(result.warnings).to be_empty end diff --git a/spec/lib/js_regex/converter/set_converter_spec.rb b/spec/lib/js_regex/converter/set_converter_spec.rb index 65cfcfc..cc13728 100644 --- a/spec/lib/js_regex/converter/set_converter_spec.rb +++ b/spec/lib/js_regex/converter/set_converter_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe JsRegex::Converter::SetConverter do +describe LangRegex::Converter::SetConverter do it 'preserves hex escape members' do expect(/[\x41]/).to stay_the_same.and keep_matching('ABC', with_results: %w[A]) end diff --git a/spec/lib/js_regex/converter/type_converter_spec.rb b/spec/lib/js_regex/converter/type_converter_spec.rb index dcfcce5..0c8af73 100644 --- a/spec/lib/js_regex/converter/type_converter_spec.rb +++ b/spec/lib/js_regex/converter/type_converter_spec.rb @@ -1,6 +1,10 @@ require 'spec_helper' -describe JsRegex::Converter::TypeConverter do +describe LangRegex::Converter::TypeConverter do + it 'escapes the supported sequences in Java', targets: [JAVA] do + expect(/\d\D\s\S\w\W/).to become(/\\d\\D\\s\\S\\w\\W/) + end + it 'preserves all types supported by JS in regular mode' do expect(/\d\D\s\S\w\W/).to stay_the_same end @@ -28,12 +32,12 @@ .to keep_matching(" ").and keep_not_matching(' ') end - it 'substitutes the hex type "\h" with an equivalent set', targets: [ES2009, ES2015] do + it 'substitutes the hex type "\h" with an equivalent set', targets: [ES2009, ES2015, JAVA, PHP, PYTHON] do expect(/\h+/).to\ become(/[0-9A-Fa-f]+/).and keep_matching('f').and keep_not_matching('x') end - it 'substitutes the hex type "\h" with an equivalent set in i-mode', targets: [ES2009, ES2015] do + it 'substitutes the hex type "\h" with an equivalent set in i-mode', targets: [ES2009, ES2015, JAVA, PHP, PYTHON] do expect(/\h+/i).to\ become(/[0-9A-F]+/i).and keep_matching('f').and keep_not_matching('x') end @@ -43,12 +47,12 @@ become(/\p{AHex}+/).and keep_matching('f').and keep_not_matching('x') end - it 'substitutes the nonhex type "\H" with an equivalent set', targets: [ES2009, ES2015] do + it 'substitutes the nonhex type "\H" with an equivalent set', targets: [ES2009, ES2015, JAVA, PHP, PYTHON] do expect(/\H+/).to\ become(/[^0-9A-Fa-f]+/).and keep_matching('x').and keep_not_matching('f', 'F') end - it 'substitutes the nonhex type "\H" with an equivalent set in i-mode', targets: [ES2009, ES2015] do + it 'substitutes the nonhex type "\H" with an equivalent set in i-mode', targets: [ES2009, ES2015, JAVA, PHP, PYTHON] do expect(/\H+/i).to\ become(/[^0-9A-F]+/i).and keep_matching('x').and keep_not_matching('f', 'F') end @@ -64,7 +68,7 @@ .and keep_matching("_\n_\r\n_", with_results: %W[\n \r\n]) end - it 'drops the extended grapheme type "\X" with warning', targets: [ES2009, ES2015] do + it 'drops the extended grapheme type "\X" with warning', targets: [ES2009, ES2015, JAVA, PHP, PYTHON] do expect(/a\Xb/).to\ become(/ab/) .with_warning("Dropped unsupported xgrapheme type '\\X' at index 1") diff --git a/spec/lib/js_regex/converter/unsupported_token_converter_spec.rb b/spec/lib/js_regex/converter/unsupported_token_converter_spec.rb index 234afe3..35c8443 100644 --- a/spec/lib/js_regex/converter/unsupported_token_converter_spec.rb +++ b/spec/lib/js_regex/converter/unsupported_token_converter_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe JsRegex::Converter::UnsupportedTokenConverter do +describe LangRegex::Converter::UnsupportedTokenConverter do it 'drops tokens of unknown classes with warning' do expect([:unknown_class, :some_subtype]).to be_dropped_with_warning end diff --git a/spec/lib/js_regex/second_pass_spec.rb b/spec/lib/js_regex/second_pass_spec.rb index 49444fa..17cb820 100644 --- a/spec/lib/js_regex/second_pass_spec.rb +++ b/spec/lib/js_regex/second_pass_spec.rb @@ -1,6 +1,6 @@ require 'spec_helper' -describe JsRegex::SecondPass do +describe LangRegex::SecondPass do describe '::alternate_conditional_permutations' do it 'replaces one-branch conditionals with equivalent alternations' do expect(/-(<)?a(?(1)>)-/).to\ diff --git a/spec/lib/js_regex/target_spec.rb b/spec/lib/js_regex/target_spec.rb index 78e1086..3bc43cb 100644 --- a/spec/lib/js_regex/target_spec.rb +++ b/spec/lib/js_regex/target_spec.rb @@ -1,7 +1,7 @@ require 'spec_helper' -describe JsRegex::Target do - subject { JsRegex::Target } +describe LangRegex::Target do + subject { LangRegex::Target } it 'can ::cast input to a supported target' do # default value diff --git a/spec/lib/js_regex_spec.rb b/spec/lib/js_regex_spec.rb index 45e5fe1..140cf1e 100644 --- a/spec/lib/js_regex_spec.rb +++ b/spec/lib/js_regex_spec.rb @@ -1,12 +1,12 @@ require 'spec_helper' -describe JsRegex do +describe LangRegex do it 'has a semantic version number' do - expect(JsRegex::VERSION).to match(/\A\d+\.\d+\.\d+\z/) + expect(LangRegex::VERSION).to match(/\A\d+\.\d+\.\d+\z/) end describe '#to_h' do - let(:return_value) { JsRegex.new(//).to_h } + let(:return_value) { LangRegex::JsRegex.new(//).to_h } it 'returns a Hash' do expect(return_value).to be_instance_of(Hash) @@ -22,14 +22,14 @@ end describe '#to_json' do - let(:return_value) { JsRegex.new(//).to_json } + let(:return_value) { LangRegex::JsRegex.new(//).to_json } it 'returns a String' do expect(return_value).to be_instance_of(String) end it 'encodes the result of #to_h' do - js_regex = JsRegex.new(/[a-z]+/) + js_regex = LangRegex::JsRegex.new(/[a-z]+/) json = js_regex.to_json decoded_json = JSON.parse(json, symbolize_names: true) expect(decoded_json).to eq(js_regex.to_h) @@ -37,53 +37,53 @@ it 'passes on the options parameter, defaulting to {}' do expect_any_instance_of(Hash).to receive(:to_json).with(foo: :bar) - JsRegex.new(//).to_json(foo: :bar) + LangRegex::JsRegex.new(//).to_json(foo: :bar) end it 'passes on an empty hash as options parameter by default' do expect_any_instance_of(Hash).to receive(:to_json).with({}) - JsRegex.new(//).to_json + LangRegex::JsRegex.new(//).to_json end end describe '#to_s' do it 'returns a String' do - expect(JsRegex.new(/foo/).to_s).to eq '/foo/' + expect(LangRegex::JsRegex.new(/foo/).to_s).to eq '/foo/' end it 'includes options' do - expect(JsRegex.new(/foo/i).to_s).to eq '/foo/i' + expect(LangRegex::JsRegex.new(/foo/i).to_s).to eq '/foo/i' end it 'returns /(?:)/ if the source is empty, as `//` is illegal in JS' do - expect(JsRegex.new(//).to_s).to eq '/(?:)/' + expect(LangRegex::JsRegex.new(//).to_s).to eq '/(?:)/' end end describe '#warnings' do it 'returns an Array' do - expect(JsRegex.new(//).warnings).to be_instance_of(Array) + expect(LangRegex::JsRegex.new(//).warnings).to be_instance_of(Array) end end describe '#target' do it 'is the given target' do - expect(JsRegex.new(//, target: ES2018).target).to eq 'ES2018' + expect(LangRegex::JsRegex.new(//, target: ES2018).target).to eq 'ES2018' end it 'is ES2009 by default' do - expect(JsRegex.new(//).target).to eq 'ES2009' + expect(LangRegex::JsRegex.new(//).target).to eq 'ES2009' end end describe '::new!' do - it 'returns a JsRegex' do - expect(JsRegex.new!(//)).to be_a(JsRegex) + it 'returns a LangRegex' do + expect(LangRegex::JsRegex.new!(//)).to be_a(LangRegex::LangRegex) end it 'raises if there are incompatibility warnings' do - expect { JsRegex.new!(/\G/) }.to raise_error( - JsRegex::ConversionError, + expect { LangRegex::JsRegex.new!(/\G/) }.to raise_error( + LangRegex::ConversionError, "unsupported match start anchor '\\G' at index 0" ) end @@ -91,11 +91,11 @@ describe '::compatible?' do it 'returns true for supported regexps' do - expect(JsRegex.compatible?(//)).to eq true + expect(LangRegex::JsRegex.compatible?(//)).to eq true end it 'raises if there are incompatibility warnings' do - expect(JsRegex.compatible?(/\G/)).to eq false + expect(LangRegex::JsRegex.compatible?(/\G/)).to eq false end end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 557c1f5..16a164c 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -5,9 +5,9 @@ SimpleCov.start end -require 'js_regex' +require 'lang_regex' -include JsRegex::Target +include LangRegex::Target RSpec.configure do |config| config.mock_with(:rspec) { |mocks| mocks.verify_partial_doubles = true } @@ -36,18 +36,22 @@ $js_regex_cache = {} # Returns a Hash of all targetted ECMAScript versions, mapped to their -# JsRegex output. Use `targets:` RSpec metadata to test specific version(s). -# This method should be used for all JsRegex initializations in syntax specs. +# LangRegex output. Use `targets:` RSpec metadata to test specific version(s). +# This method should be used for all LangRegex initializations in syntax specs. def conversions(rb_regex) active_targets.to_h do |target| output = $js_regex_cache[[rb_regex, target]] ||= - JsRegex.new(rb_regex, target: target) + if LangRegex::Target::JS.include?(target) + LangRegex::JsRegex.new(rb_regex, target: target) + else + Object.const_get("LangRegex::#{target.capitalize}Regex").new(rb_regex, target: target) + end [target, output] end end def active_targets - Thread.current[:js_regex_custom_targets] || JsRegex::Target::SUPPORTED + Thread.current[:js_regex_custom_targets] || LangRegex::Target::FULLY_SUPPORTED end def use_custom_targets(arg) @@ -66,7 +70,7 @@ def use_custom_targets(arg) def error_for_source(js_regex, expected) if js_regex.source != expected "expected #{expected}, got #{js_regex.source}" - elsif !to_s_like_json(js_regex) + elsif LangRegex::Target::JS.include?(@target) && !to_s_like_json(js_regex) '#to_s/#to_json sanity check failed' end end @@ -126,16 +130,25 @@ def error_for_options(js_regex, expected) test_strings.each do |string| if with_results rb_matches = string.scan(rb_regex).flatten - js_matches = matches_in_js(js_regex, string) rb_matches == with_results || @msg = "rb matched #{rb_matches}" - js_matches == with_results || @msg = "js matched #{js_matches}" + if LangRegex::Target::JS.include?(target) + js_matches = matches_in_js(js_regex, string) + js_matches == with_results || @msg = "js matched #{js_matches}" + else + lang_matches = send("matches_in_#{target.downcase}", js_regex, string) + lang_matches == with_results || @msg = "#{target} matched #{lang_matches}" + end else # Due to JS' different splitting of group match data, some return values # are not completely identical between Ruby and JS matching calls. # In that case, don't specify expected results and just check that # a valid string does produce a match. - rb_regex =~ string || @msg = "rb did not match `#{string}`" - test_in_js(js_regex, string) || @msg = "js did not match `#{string}`" + rb_regex =~ string || @msg = "rb did not match `#{string}`" + if LangRegex::Target::JS.include?(target) + test_in_js(js_regex, string) || @msg = "js did not match `#{string}`" + else + send("test_in_#{target.downcase}", js_regex, string) || @msg = "#{target} did not match `#{string}`" + end end end @@ -151,8 +164,12 @@ def error_for_options(js_regex, expected) conversions(rb_regex).all? do |target, js_regex| @target = target test_strings.each do |string| - rb_regex =~ string && @msg = "rb did match `#{string}`" - test_in_js(js_regex, string) && @msg = "js did match `#{string}`" + rb_regex =~ string && @msg = "rb did match `#{string}`" + if LangRegex::Target::JS.include?(target) + test_in_js(js_regex, string) && @msg = "js did match `#{string}`" + else + send("test_in_#{target.downcase}", js_regex, string) && @msg = "#{target} did match `#{string}`" + end end @msg.nil? @@ -169,7 +186,7 @@ def error_for_options(js_regex, expected) allow_any_instance_of(Regexp::Expression::Root).to receive(:map).and_yield(exp) active_targets.all? do |target| @target = target - result = JsRegex.new(/dummy/, target: target) + result = LangRegex::JsRegex.new(/dummy/, target: target) source = result.source source.empty? || @msg = "expected empty source, got `#{source}`" result.warnings.count > 0 || @msg = 'did not warn' @@ -203,6 +220,81 @@ def test_in_js(js_regex, string) eval_js("#{js_regex}.test('#{js_escape(string)}');") end +require 'fileutils' +require 'open3' + +FileUtils.mkdir_p('tmp/') + +def matches_in_java(java_regex, string) + java_code = <<~JAVA + import java.util.regex.*; + import java.util.ArrayList; + import java.util.List; + + public class TestRegex { + public static void main(String[] args) { + Pattern pattern = Pattern.compile("#{java_regex}"); + Matcher matcher = pattern.matcher("#{js_escape(string)}"); + + while (matcher.find()) { + System.out.println(matcher.group()); + } + } + } + JAVA + File.write('tmp/test.java', java_code) + out, = Open3.capture3('java', 'tmp/test.java') + out.split("\n") +end + +def test_in_java(java_regex, string) + java_code = <<~JAVA + import java.util.regex.*; + public class TestRegex { + public static void main(String[] args) { + Pattern pattern = Pattern.compile("#{java_regex}"); + Matcher matcher = pattern.matcher("#{js_escape(string)}"); + System.exit(matcher.find() ? 1 : 0); + } + } + JAVA + File.write('tmp/test.java', java_code) + _, _, status = Open3.capture3('java', 'tmp/test.java') + !status.success? +end + +def matches_in_php(php_regex, string) + out, = Open3.capture3( + 'php', + '-r', + "preg_match_all('#{php_regex}', '#{js_escape(string)}', $matches); echo implode(PHP_EOL, $matches[0]);" + ) + out.split("\n") +end + +def test_in_php(php_regex, string) + _, _, status = Open3.capture3('php', '-r', "exit(preg_match(\"#{php_regex}\", \"#{js_escape(string)}\"));") + !status.success? +end + +def matches_in_python(python_regex, string) + out, = Open3.capture3( + 'python', + '-c', + "import re; print('\\n'.join(re.findall(r'#{python_regex}', '#{js_escape(string)}')));" + ) + out.split("\n") +end + +def test_in_python(python_regex, string) + _, _, status = Open3.capture3( + 'python', + '-c', + "import re; exit(bool(re.search(r'#{python_regex}', '#{js_escape(string)}')));" + ) + !status.success? +end + def to_s_like_json(js_regex) json_string = js_escape(js_regex.to_json) eval_js <<-JS @@ -219,6 +311,7 @@ def js_escape(string) string .gsub('\\') { '\\\\' } # this actually replaces one backslash with two .gsub("'") { "\\'" } # http://stackoverflow.com/revisions/12701027/2 + .gsub('"') { '\\"' } .gsub("\n", '\\n') .gsub("\r", '\\r') end