Question

JS Regex 抛出“最大调用堆栈大小超出”错误

2021-07-15

4258

javascript node.js regex

我有一个很长的段落，需要按单词边界匹配其中的短语，并把匹配到的内容替换为所需的值。

需要替换的短语有很多种不同的模式，因此我写了很多行 new RegExp，逐条进行替换。

// Question's original approach: one chained .replace() call per phrase, each
// building its own case-insensitive, global, word-bounded RegExp.
var paragraph = "german gateway is located at ... Leonardo DA VINci and other some word superman";

// NOTE(review): the value produced by this chain is never assigned, so the
// console.log below would print the unmodified `paragraph` - confirm intent.
paragraph
    .replace( new RegExp("\\b"+ "german gateway" +"\\b", "ig"), "German gateway")
    .replace( new RegExp("\\b"+ "Leonardo DA vinci" +"\\b", "ig"), "Leonardo da Vinci")
    .replace( new RegExp("\\b"+ "some word" +"\\b", "ig"), "some other word")
    .replace( new RegExp
    // The snippet is truncated here: the chain continues for at least a few thousand rows.

console.log(paragraph);

//示例输出

German gateway is located at ... Leonardo da Vinci and other some other word superman

但是链式调用的 new RegExp 太多时，JS 运行会出错：

Uncaught RangeError: Maximum call stack size exceeded

有没有办法避免大量调用 new RegExp，同时保留我想要的正则表达式规则？

Answer 1

使用一个对象保存所有替换规则，并遍历它的属性。如果需要保证替换顺序，可以改用对象数组，每条替换规则对应一个对象。

const paragraph = "german gateway is located at ... Leonardo DA VINci and other some word superman";

// Search phrase -> replacement text. Entries are applied in insertion order.
const replacements = {
  "german gateway": "German gateway",
  "Leonardo DA vinci": "Leonardo da Vinci",
  "some word": "some other word"
  /* ... */
};

/**
 * Escapes every regular-expression metacharacter in `text` so the result
 * matches `text` literally when embedded in a RegExp source string.
 *
 * @param {string} text - Literal text to embed in a pattern.
 * @returns {string} `text` with each metacharacter prefixed by a backslash.
 */
const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

/**
 * Applies every phrase -> replacement pair of `replacementMap` to `text`,
 * one pair after another, matching case-insensitively between word
 * boundaries.
 *
 * @param {string} text - Text to rewrite.
 * @param {Object<string, string>} replacementMap - Literal phrases mapped to
 *   their replacement text.
 * @returns {string} The rewritten text.
 */
const applyReplacements = (text, replacementMap) =>
  Object.entries(replacementMap).reduce((current, [phrase, replacement]) => {
    // The phrase is data, not a pattern: escape it so characters such as "."
    // or "+" cannot change what gets matched.
    const rx = new RegExp("\\b" + escapeRegExp(phrase) + "\\b", "ig");
    // A function replacement inserts the text verbatim; a plain string would
    // have sequences such as "$&" or "$1" expanded by String#replace.
    return current.replace(rx, () => replacement);
  }, text);

const result = applyReplacements(paragraph, replacements);

console.log(result);

另外，一定要对传给 RegExp() 构造函数的字符串中的可变部分进行转义。

Answer 2

您可以使用一个“关联”数组（对象），以要搜索的短语为键、以替换文本为值；根据这些键构建一个正则表达式 trie，然后只调用一次 .replace：用这一个正则表达式查找所有短语，并通过作为替换参数传入的回调函数把每个匹配替换为对应的值。

首先运行 npm install regex-trie，然后使用以下代码：

// Text whose phrases are to be normalised.
let paragraph = "german gateway is located at ... Leonardo DA VINci and other some word superman";

// Search phrase -> replacement text.
const phrases = {
  "german gateway": "German gateway",
  "Leonardo DA vinci": "Leonardo da Vinci",
  "some word": "some other word"
};

// Feed every search phrase into one trie and turn the trie into a single
// case-insensitive, global pattern wrapped in word boundaries.
let RegexTrie = require('regex-trie');
let trie = new RegexTrie();
trie.add(Object.keys(phrases));
const triePattern = trie.toRegExp().source;
const regex = new RegExp(`\\b${triePattern}\\b`, 'ig');

/**
 * Looks up `prop` among the own enumerable keys of `obj`, ignoring letter
 * case, and returns the value stored under the first key that matches.
 *
 * Keys are read through Object.keys, so the lookup also works for
 * dictionaries created with Object.create(null) or objects that shadow
 * `hasOwnProperty`.
 *
 * @param {string} prop - Key to look for; compared case-insensitively.
 * @param {Object} obj - Dictionary to search.
 * @returns {*} The matching value, or undefined when no key matches.
 */
const getValue = function (prop, obj) {
  const wanted = prop.toLowerCase();
  const matchingKey = Object.keys(obj).find((key) => key.toLowerCase() === wanted);
  return matchingKey === undefined ? undefined : obj[matchingKey];
};
// One pass over the text: each phrase found by `regex` is swapped for the
// value registered for it in `phrases`.
paragraph = paragraph.replace(regex, function (matched) {
  return getValue(matched, phrases);
});
console.log(paragraph);
// => German gateway is located at ... Leonardo da Vinci and other some other word superman

以下是在 browserify 的帮助下捆绑的 JavaScript 代码片段：

// Bundle prelude (minified). `r(e, n, t)` receives the module table `e`
// (id -> [factory, dependency-name map]), the cache `n` and the list of entry
// ids `t`. The inner loader `o(i, f)` returns `n[i].exports`; on first use it
// stores `{exports: {}}` in the cache and calls the factory with a local
// `require` that translates names through the module's dependency map. For an
// id missing from `e` it falls back to an outer `require` when one exists and
// otherwise throws an Error whose `code` is "MODULE_NOT_FOUND". Every id in
// `t` is loaded immediately. The rest of this line opens the module table and
// the factory of module 1.
(function(){function r(e,n,t){function o(i,f){if(!n[i]){if(!e[i]){var c="function"==typeof require&&require;if(!f&&c)return c(i,!0);if(u)return u(i,!0);var a=new Error("Cannot find module '"+i+"'");throw a.code="MODULE_NOT_FOUND",a}var p=n[i]={exports:{}};e[i][0].call(p.exports,function(r){var n=e[i][1][r];return o(n||r)},p,p.exports,r,e,n,t)}return n[i].exports}for(var u="function"==typeof require&&require,i=0;i<t.length;i++)o(t[i]);return o}return r})()({1:[function(require,module,exports){
// Text whose phrases are to be normalised.
let paragraph = "german gateway is located at ... Leonardo DA VINci and other some word superman";

// Search phrase -> replacement text.
const phrases = {
  "german gateway": "German gateway",
  "Leonardo DA vinci": "Leonardo da Vinci",
  "some word": "some other word"
};

// Feed every search phrase into one trie and turn the trie into a single
// case-insensitive, global pattern wrapped in word boundaries.
let RegexTrie = require('regex-trie');
let trie = new RegexTrie();
trie.add(Object.keys(phrases));
const triePattern = trie.toRegExp().source;
const regex = new RegExp(`\\b${triePattern}\\b`, 'ig');

/**
 * Looks up `prop` among the own enumerable keys of `obj`, ignoring letter
 * case, and returns the value stored under the first key that matches.
 *
 * Keys are read through Object.keys, so the lookup also works for
 * dictionaries created with Object.create(null) or objects that shadow
 * `hasOwnProperty`.
 *
 * @param {string} prop - Key to look for; compared case-insensitively.
 * @param {Object} obj - Dictionary to search.
 * @returns {*} The matching value, or undefined when no key matches.
 */
const getValue = function (prop, obj) {
  const wanted = prop.toLowerCase();
  const matchingKey = Object.keys(obj).find((key) => key.toLowerCase() === wanted);
  return matchingKey === undefined ? undefined : obj[matchingKey];
};
// One pass over the text: each phrase found by `regex` is swapped for the
// value registered for it in `phrases`.
paragraph = paragraph.replace(regex, function (matched) {
  return getValue(matched, phrases);
});
console.log(paragraph);
},{"regex-trie":3}],2:[function(require,module,exports){
(function (global){(function (){
/*! http://mths.be/jsesc v0.5.0 by @mathias */
;(function(root) {

    // `exports` itself when a variable of that name exists and holds an
    // object; otherwise a falsy value.
    var freeExports = typeof exports == 'object' && exports;

    // `module` itself when it exists, holds an object and its `exports`
    // loosely equals `freeExports`; otherwise a falsy value.
    var freeModule = typeof module == 'object' && module &&
        module.exports == freeExports && module;

    // Detect free variable `global`, from Node.js or Browserified code.
    // It replaces `root` only when it refers to itself through `.global` or
    // `.window`.
    var freeGlobal = typeof global == 'object' && global;
    if (freeGlobal.global === freeGlobal || freeGlobal.window === freeGlobal) {
        root = freeGlobal;
    }

    /*--------------------------------------------------------------------------*/

    var object = {};
    var hasOwnProperty = object.hasOwnProperty;

    // Calls `callback(key, value)` once for every enumerable property that
    // `object` owns itself; inherited properties are skipped.
    var forOwn = function(object, callback) {
        for (var key in object) {
            if (!hasOwnProperty.call(object, key)) {
                continue;
            }
            callback(key, object[key]);
        }
    };

    // Copies the own enumerable properties of `source` onto `destination`
    // and returns `destination`. A falsy `source` leaves it untouched.
    var extend = function(destination, source) {
        if (source) {
            forOwn(source, function(key, value) {
                destination[key] = value;
            });
        }
        return destination;
    };

    // Calls `callback(item)` for each index below the length the array had
    // when iteration started.
    var forEach = function(array, callback) {
        for (var index = 0, length = array.length; index < length; index++) {
            callback(array[index]);
        }
    };

    var toString = object.toString;

    // The `[object X]` tag that Object#toString reports for `value`.
    var tagOf = function(value) {
        return toString.call(value);
    };

    var isArray = function(value) {
        return tagOf(value) == '[object Array]';
    };

    // Deliberately simple: only the `[object Object]` tag is accepted.
    var isObject = function(value) {
        return tagOf(value) == '[object Object]';
    };

    // True for string primitives and for String wrapper objects.
    var isString = function(value) {
        if (typeof value == 'string') {
            return true;
        }
        return tagOf(value) == '[object String]';
    };

    // `typeof` alone is not trusted: Chrome 1-12 reported 'object' for regex
    // literals, and IE 6-8 reported 'object' for host functions like `alert`.
    var isFunction = function(value) {
        if (typeof value == 'function') {
            return true;
        }
        return tagOf(value) == '[object Function]';
    };

    /*--------------------------------------------------------------------------*/

    // http://mathiasbynens.be/notes/javascript-escapes#single
    // Maps each character that has a single-character escape to the escape
    // sequence (backslash + character) written to the output.
    var singleEscapes = {
        '"': '\\"',
        '\'': '\\\'',
        '\\': '\\\\',
        '\b': '\\b',
        '\f': '\\f',
        '\n': '\\n',
        '\r': '\\r',
        '\t': '\\t'
        // `\v` is omitted intentionally, because in IE < 9, '\v' == 'v'.
        // '\v': '\\x0B'
    };
    // Matches exactly the characters that are keys of `singleEscapes`
    // (inside a character class `\b` means backspace).
    var regexSingleEscape = /["'\\\b\f\n\r\t]/;

    // Matches one ASCII digit.
    var regexDigit = /[0-9]/;
    // Matches the printable ASCII characters (space through `~`) other than
    // `"`, `'` and `\`.
    var regexWhitelist = /[ !#-&\(-\[\]-~]/;

    // Serialises `argument` as escaped text. Strings are escaped one UTF-16
    // code unit at a time; arrays and plain objects are serialised
    // recursively; any other value is stringified as-is.
    //
    // Recognised `options` (all optional):
    //   escapeEverything - also escape printable ASCII characters
    //   quotes           - 'single' (default) or 'double'; other values are
    //                      treated as 'single'
    //   wrap             - surround string output with the quote character
    //   es6              - emit `\u{...}` for valid surrogate pairs
    //   json             - switches the defaults to double quotes + wrap,
    //                      calls `toJSON()` when present and uses `\uXXXX`
    //                      escapes only
    //   compact          - when falsy, arrays/objects are spread over lines
    //   indent           - indentation unit used when not compact
    var jsesc = function(argument, options) {
        var json = options && options.json;

        // Defaults first; whatever the caller passed is copied on top. JSON
        // mode starts from double quotes and wrapped output.
        var defaults = {
            'escapeEverything': false,
            'quotes': json ? 'double' : 'single',
            'wrap': json ? true : false,
            'es6': false,
            'json': false,
            'compact': true,
            'indent': '\t',
            '__indent__': ''
        };
        options = extend(defaults, options);
        if (!(options.quotes == 'single' || options.quotes == 'double')) {
            options.quotes = 'single';
        }

        var quote = '\'';
        if (options.quotes == 'double') {
            quote = '"';
        }
        var compact = options.compact;
        var newLine = compact ? '' : '\n';
        var indent = options.indent;
        var oldIndent;
        var result;

        if (json && argument && isFunction(argument.toJSON)) {
            argument = argument.toJSON();
        }

        if (!isString(argument)) {
            var isList = isArray(argument);

            if (!isList && !isObject(argument)) {
                // Neither a string, an array nor a plain object.
                if (json) {
                    // For some values (e.g. `undefined`, `function` objects),
                    // `JSON.stringify(value)` returns `undefined` (which is
                    // not valid JSON) instead of `'null'`.
                    return JSON.stringify(argument) || 'null';
                }
                return String(argument);
            }

            // Arrays and objects share their bookkeeping: nested strings get
            // wrapped, and the indentation grows by one unit per level.
            result = [];
            options.wrap = true;
            oldIndent = options.__indent__;
            indent += oldIndent;
            options.__indent__ = indent;

            var itemPrefix = compact ? '' : indent;
            if (isList) {
                forEach(argument, function(value) {
                    result.push(itemPrefix + jsesc(value, options));
                });
            } else {
                var keySeparator = compact ? ':' : ': ';
                forOwn(argument, function(key, value) {
                    result.push(
                        itemPrefix + jsesc(key, options) + keySeparator +
                        jsesc(value, options)
                    );
                });
            }

            if (result.length === 0) {
                return isList ? '[]' : '{}';
            }
            return (isList ? '[' : '{') + newLine +
                result.join(',' + newLine) + newLine +
                (compact ? '' : oldIndent) + (isList ? ']' : '}');
        }

        // From here on `argument` is a string: escape it unit by unit.
        var text = argument;
        var total = text.length;
        var highUnit;
        var lowUnit;
        var codePoint;
        result = '';
        for (var position = 0; position < total; position++) {
            var character = text.charAt(position);

            if (options.es6) {
                highUnit = text.charCodeAt(position);
                // A high surrogate followed by at least one more code unit.
                var mayStartPair = highUnit >= 0xD800 && highUnit <= 0xDBFF &&
                    total > position + 1;
                if (mayStartPair) {
                    lowUnit = text.charCodeAt(position + 1);
                    if (lowUnit >= 0xDC00 && lowUnit <= 0xDFFF) {
                        // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
                        codePoint = (highUnit - 0xD800) * 0x400 + lowUnit - 0xDC00 + 0x10000;
                        result += '\\u{' + codePoint.toString(16).toUpperCase() + '}';
                        // The low surrogate has been consumed as well.
                        position++;
                        continue;
                    }
                }
            }

            if (!options.escapeEverything) {
                if (regexWhitelist.test(character)) {
                    // Printable ASCII other than `"`, `'` or `\`: keep as-is.
                    result += character;
                    continue;
                }
                // A quote character only needs escaping when it is the same
                // one that delimits the output.
                if (character == '"') {
                    if (quote == character) {
                        result += '\\"';
                    } else {
                        result += character;
                    }
                    continue;
                }
                if (character == '\'') {
                    if (quote == character) {
                        result += '\\\'';
                    } else {
                        result += character;
                    }
                    continue;
                }
            }

            // `\0` is only used outside JSON mode and when no digit follows.
            var nextIsDigit = function() {
                return regexDigit.test(text.charAt(position + 1));
            };
            if (character == '\0' && !json && !nextIsDigit()) {
                result += '\\0';
                continue;
            }

            if (regexSingleEscape.test(character)) {
                // The regex only matches keys of `singleEscapes`, so no
                // `hasOwnProperty` check is needed.
                result += singleEscapes[character];
                continue;
            }

            // Everything else becomes `\xHH`, or `\uHHHH` when the code unit
            // needs more than two hex digits or JSON output was requested.
            var hexadecimal = character.charCodeAt(0).toString(16).toUpperCase();
            var longhand = hexadecimal.length > 2 || json;
            if (longhand) {
                result += '\\' + 'u' + ('0000' + hexadecimal).slice(-4);
            } else {
                result += '\\' + 'x' + ('0000' + hexadecimal).slice(-2);
            }
        }

        return options.wrap ? quote + result + quote : result;
    };

    // Version string exposed on the function itself.
    jsesc.version = '0.5.0';

    /*--------------------------------------------------------------------------*/

    // Publish `jsesc` through whichever mechanism is available.
    // Some AMD build optimizers, like r.js, check for specific condition patterns
    // like the following, so the shape of this test is kept as written:
    if (
        typeof define == 'function' &&
        typeof define.amd == 'object' &&
        define.amd
    ) {
        // AMD loader present: register as an anonymous module.
        define(function() {
            return jsesc;
        });
    }   else if (freeExports && !freeExports.nodeType) {
        // An `exports` object exists and has no truthy `nodeType` property.
        if (freeModule) { // in Node.js or RingoJS v0.8.0+
            freeModule.exports = jsesc;
        } else { // in Narwhal or RingoJS v0.7.0-
            freeExports.jsesc = jsesc;
        }
    } else { // in Rhino or a web browser
        root.jsesc = jsesc;
    }

}(this));

}).call(this)}).call(this,typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : typeof window !== "undefined" ? window : {})
},{}],3:[function(require,module,exports){
var jsesc = require('jsesc');

/**
 * @module regex-trie
 */
var RegexTrie = (function () {

    "use strict";

    /**
     * The `RegexTrie` class builds a regular expression from a set of phrases
     * added to it. It produces a non-optimised `RegExp` that matches the
     * phrases literally: regular-expression metacharacters are
     * backslash-escaped and characters outside printable ASCII are written
     * as escape sequences.
     *
     * Calling `RegexTrie()` without `new` also returns a fresh instance.
     *
     * @class RegexTrie
     * @constructor
     */
    var RegexTrie = function () {

        if ( ! (this instanceof RegexTrie) ) {
            return new RegexTrie();
        }

        // Number of distinct phrases stored so far.
        this._num_phrases_in_trie = 0;
        // Nested objects keyed by single UTF-16 code units; a node that
        // completes a phrase carries `end: true`.
        this._trie = {};

        return this;
    };

    /**
     * Adds a phrase to the trie. Arrays are walked recursively and every
     * element is added. Numbers (other than NaN) are converted to strings.
     * Anything that is not a non-empty string after that conversion is
     * silently ignored, as are phrases that were already added.
     *
     * @method add()
     * @param phrase_to_add {array|string|number}
     * @chainable
     */
    RegexTrie.prototype.add = function (phrase_to_add) {

        if ( phrase_to_add instanceof Array ) {
            phrase_to_add.forEach(this.add, this);
            // The array itself is never a phrase; nothing more to do.
            return this;
        }

        phrase_to_add = this._coerce_to_string(phrase_to_add);

        if ( ! this._is_phrase_valid(phrase_to_add) ) {
            return this;
        }

        // Has this phrase already been added?
        if ( this.contains(phrase_to_add) ) {
            return this;
        }

        // Walk down the trie one code unit at a time, creating missing nodes.
        var node = this._trie;

        phrase_to_add.split('').forEach( function (chr) {

            if ( ! (chr in node) ) {
                node[chr] = {};
            }

            node = node[chr];
        });

        // Set the end marker (so we know this was a complete word)
        node.end = true;
        this._num_phrases_in_trie++;

        return this;
    };

    /**
     * Builds a `RegExp` (without flags) from the pattern returned by
     * `toString()`.
     *
     * @method toRegExp()
     * @return {RegExp|undefined} `undefined` when no phrase has been added.
     */
    RegexTrie.prototype.toRegExp = function () {

        if ( this._num_phrases_in_trie === 0 ) return;

        var result = this.toString();
        return new RegExp(result);
    };

    /**
     * Serialises the trie into regular-expression source text.
     *
     * @method toString()
     * @return {string|undefined} `undefined` when no phrase has been added.
     */
    RegexTrie.prototype.toString = function () {

        if ( this._num_phrases_in_trie === 0 ) return;

        // Depth-first walk: each child contributes its escaped character
        // followed by the pattern of its own subtree.
        var _walk_trie = function (trie, this_arg) {

            var keys   = Object.keys(trie),
            alt_group  = [],
            char_class = [],
            end        = false; // marks the end of a phrase

            keys.forEach( function (key) {

                var walk_result;

                if ( key === 'end' ) {
                    end = true;
                    return;
                }

                walk_result =
                    this._quotemeta(key) + _walk_trie(trie[key], this_arg);

                // A node with more than one key (the `end` marker counts)
                // collects into the alternative group; a node with a single
                // key collects into the char class group.
                ( keys.length > 1 ? alt_group : char_class ).push(walk_result);
            }, this_arg);

            return this_arg._to_regex(alt_group, char_class, end);
        };

        var result = _walk_trie(this._trie, this);
        return result;
    };

    /**
     * Combines the patterns collected for one trie node.
     *
     * @method _to_regex()
     * @private
     * @param alt_group {array} child patterns of a node with several keys
     * @param char_class {array} child pattern of a node with one key
     * @param end {boolean} whether a phrase ends at this node, which makes
     *     everything that follows optional
     * @return {string} regular-expression source for the node's subtree
     */
    RegexTrie.prototype._to_regex = function (alt_group, char_class, end) {

        var group_has_one_element = function (el) {
                return el.length === 1;
            },
            result = "";

        // Once we've finished walking through the tree we need to build
        // the regex match groups...
        if ( alt_group.length > 0 ) {

            if ( alt_group.length === 1 ) {
                // Individual elements are merged with the current result.
                result += alt_group[0];
            }
            else if ( alt_group.every(group_has_one_element) ) {
                // When every alternative is a single character they are
                // flattened in to a character class.
                result += ( '[' + alt_group.join('') + ']' );
            }
            else {
                // Finally, build a non-capturing alternative group.
                result += ( '(?:' + alt_group.join('|') + ')' );
            }
        }
        else if ( char_class.length > 0 ) {
            result += char_class[0];
        }

        if ( end && result ) {

            if ( result.length === 1 ) {
                result += '?';
            }
            else {
                result = '(?:' + result + ')?';
            }
        }

        return result;
    };

    /**
     * Tells whether exactly this phrase has been added. Only non-empty
     * strings can match; any other value yields `false`.
     *
     * @method contains()
     * @param phrase_to_fetch {string}
     * @return {boolean}
     */
    RegexTrie.prototype.contains = function (phrase_to_fetch) {

        if ( ! this._is_phrase_valid(phrase_to_fetch) ) {
            return false;
        }

        var node  = this._trie,
            chars = phrase_to_fetch.split(''),
            i;

        for ( i = 0; i < chars.length; i++ ) {

            node = node[chars[i]];

            // A missing branch means the phrase was never added.
            if ( node === undefined ) {
                return false;
            }
        }

        return ( Object.prototype.hasOwnProperty.call(node, 'end') &&
                 node.end === true );
    };

    /**
     * Converts numbers (other than NaN) to strings; every other value is
     * returned unchanged.
     *
     * @method _coerce_to_string()
     * @private
     */
    RegexTrie.prototype._coerce_to_string = function (phrase) {

        if ( typeof phrase === 'number' && ! isNaN(phrase) ) {
            phrase = phrase.toString();
        }

        return phrase;
    };

    /**
     * A phrase is valid when it is a non-empty string.
     *
     * @method _is_phrase_valid()
     * @private
     * @return {boolean}
     */
    RegexTrie.prototype._is_phrase_valid = function (phrase) {
        return ( typeof phrase === 'string' && phrase.length > 0 );
    };

    /**
     * Escapes `phrase` so that it matches literally inside a regular
     * expression. Invalid phrases are returned unchanged.
     *
     * @method _quotemeta()
     * @private
     * @param phrase {string}
     * @return {string}
     */
    RegexTrie.prototype._quotemeta = function (phrase) {

        if ( ! this._is_phrase_valid(phrase) ) {
            return phrase;
        }

        return phrase
            // Printable metacharacters get a backslash in front. Tab, line
            // feed, form feed and carriage return are NOT handled here: the
            // next step turns them into `\t`, `\n`, `\f`, `\r`, and an extra
            // backslash in front would change that into an escaped backslash
            // followed by a plain letter.
            .replace(/([\\\$\(\)\*\+\-\.\?\[\]\^\{\|\}])/g, '\\$1')
            // Everything outside printable ASCII becomes an escape sequence.
            // Backspace is special-cased because jsesc writes it as `\b`,
            // which a regular expression reads as a word boundary.
            .replace(/[^\x20-\x7E]/g, function (chr) {
                return chr === '\b' ? '\\x08' : jsesc(chr);
            });
    };

    return RegexTrie;
})();

module.exports = RegexTrie;

},{"jsesc":2}]},{},[1]);