Chiu's comment is right: 'aaáaa'.match(/.+?/g)
yelds quite counter-intuitive [ "aa", "á", "aa" ]
, because "word character" (w
) in JavaScript regular expressions is just a shorthand for [A-Za-z0-9_]
('case-insensitive-alpha-numeric-and-underscore'), so word boundary (
) matches any place between chunk of alpha-numerics and any other character. This makes extracting "Unicode words" quite hard.
For non-unicase writing systems it is possible to identify "word character" by its dual nature: ch.toUpperCase() != ch.toLowerCase()
, so your altered snippet could look like this:
var bannedWords = ["bad", "mad", "testing", "b??", "ba?"];
var bannedWordsRegex = new RegExp('-' + bannedWords.join("-|-") + '-', 'i');
$(function() {
$("input").on("input", function() {
var invalid = bannedWordsRegex.test(dashPaddedWords(this.value));
$('#log').html(invalid ? 'bad' : 'good');
});
$("input").trigger("input").focus();
function dashPaddedWords(str) {
return '-' + str.replace(/./g, wordCharOrDash) + '-';
};
function wordCharOrDash(ch) {
return isWordChar(ch) ? ch : '-'
};
function isWordChar(ch) {
return ch.toUpperCase() != ch.toLowerCase();
};
});
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.1/jquery.min.js"></script>
<input type='text' name='word_to_check' value="ba">
<p id="log"></p>
与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…