本文整理汇总了Java中org.apache.lucene.analysis.CharFilter类的典型用法代码示例。如果您正苦于以下问题:Java CharFilter类的具体用法?Java CharFilter怎么用?Java CharFilter使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
CharFilter类属于org.apache.lucene.analysis包,下文一共展示了CharFilter类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: testNormalizerCharFilter
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Streams a query through {@code OpenKoreanTextNormalizer} in chunks of at most
 * 10 chars and asserts that the concatenated output equals the expected string.
 */
@Test
public void testNormalizerCharFilter() throws Exception {
    final String rawText = "한국어를 처리하는 예시입니닼ㅋ. 오픈코리안텍스틓ㅎㅎㅎㅎㅎㅎㅎ";
    final String normalizedText = "한국어를 처리하는 예시입니다ㅋ. 오픈코리안텍스트ㅎㅎㅎ";

    CharFilter normalizingReader = new OpenKoreanTextNormalizer(new StringReader(rawText));

    // A deliberately small buffer so the filter is drained over several read() calls.
    char[] chunk = new char[10];
    StringBuilder collected = new StringBuilder();
    for (int count = normalizingReader.read(chunk); count != -1; count = normalizingReader.read(chunk)) {
        collected.append(chunk, 0, count);
    }

    Assert.assertEquals(normalizedText, collected.toString());
}
开发者ID:open-korean-text,项目名称:elasticsearch-analysis-openkoreantext,代码行数:19,代码来源:OpenKoreanTextNormalizerTest.java
示例2: testDefaultSetting
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Verifies the {@code icu_normalizer} char filter when only its type is configured.
 *
 * <p>The reference output is produced by an {@code nfkc_cf} / COMPOSE {@code Normalizer2}.
 * The filter is drained in chunks of at most 10 chars; after every chunk, the text read so
 * far must equal the normalization of the input prefix that ends at the corrected offset.
 */
public void testDefaultSetting() throws Exception {
    Settings settings = Settings.builder()
        .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
        .build();
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());
    CharFilterFactory charFilterFactory = analysis.charFilter.get("myNormalizerChar");

    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    String expectedOutput = normalizer.normalize(input);
    CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input));

    char[] tempBuff = new char[10];
    StringBuilder output = new StringBuilder();
    while (true) {
        int length = inputReader.read(tempBuff);
        if (length == -1) {
            break;
        }
        output.append(tempBuff, 0, length);
        // Expected value goes first so a failure message labels the two sides correctly.
        String expectedPrefix =
            normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length())));
        assertEquals(expectedPrefix, output.toString());
    }
    assertEquals(expectedOutput, output.toString());
}
开发者ID:justor,项目名称:elasticsearch_my,代码行数:22,代码来源:SimpleIcuNormalizerCharFilterTests.java
示例3: testNameAndModeSetting
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Verifies the {@code icu_normalizer} char filter when {@code name} is set to {@code nfkc}
 * and {@code mode} to {@code decompose}.
 *
 * <p>The reference output is produced by an {@code nfkc} / DECOMPOSE {@code Normalizer2}.
 * The filter is drained in chunks of at most 10 chars; after every chunk, the text read so
 * far must equal the normalization of the input prefix that ends at the corrected offset.
 */
public void testNameAndModeSetting() throws Exception {
    Settings settings = Settings.builder()
        .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
        .put("index.analysis.char_filter.myNormalizerChar.name", "nfkc")
        .put("index.analysis.char_filter.myNormalizerChar.mode", "decompose")
        .build();
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());
    CharFilterFactory charFilterFactory = analysis.charFilter.get("myNormalizerChar");

    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
    String expectedOutput = normalizer.normalize(input);
    CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input));

    char[] tempBuff = new char[10];
    StringBuilder output = new StringBuilder();
    while (true) {
        int length = inputReader.read(tempBuff);
        if (length == -1) {
            break;
        }
        output.append(tempBuff, 0, length);
        // Expected value goes first so a failure message labels the two sides correctly.
        String expectedPrefix =
            normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length())));
        assertEquals(expectedPrefix, output.toString());
    }
    assertEquals(expectedOutput, output.toString());
}
开发者ID:justor,项目名称:elasticsearch_my,代码行数:24,代码来源:SimpleIcuNormalizerCharFilterTests.java
示例4: testNormalization
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Checks {@code ICUNormalizer2CharFilter} against direct use of the same
 * {@code nfkc_cf} / COMPOSE {@code Normalizer2}.
 *
 * <p>The filter is drained in chunks of at most 10 chars; after every chunk, the text read
 * so far must equal the normalization of the input prefix that ends at the corrected
 * offset, and the complete output must equal the normalization of the whole input.
 *
 * @throws IOException if reading from the filter fails
 */
public void testNormalization() throws IOException {
    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    String expectedOutput = normalizer.normalize(input);
    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer);

    char[] tempBuff = new char[10];
    StringBuilder output = new StringBuilder();
    while (true) {
        int length = reader.read(tempBuff);
        if (length == -1) {
            break;
        }
        output.append(tempBuff, 0, length);
        // Expected value goes first so a failure message labels the two sides correctly.
        String expectedPrefix =
            normalizer.normalize(input.substring(0, reader.correctOffset(output.length())));
        assertEquals(expectedPrefix, output.toString());
    }
    assertEquals(expectedOutput, output.toString());
}
开发者ID:europeana,项目名称:search,代码行数:20,代码来源:TestICUNormalizer2CharFilter.java
示例5: newCharFilterChain
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Builds a chain of zero to two randomly chosen char filters on top of {@code reader}.
 *
 * <p>For each position in the chain, a constructor is drawn from {@code charfilters} and
 * arguments are generated for it. Combinations rejected by {@code broken(...)} and
 * attempts where {@code createComponent(...)} yields {@code null} are retried with a new
 * random draw.
 *
 * @param random source of randomness for the chain length, constructors and arguments
 * @param reader the innermost reader the chain wraps
 * @return a spec holding the outermost reader and the description accumulated while building
 */
private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
    final CharFilterSpec spec = new CharFilterSpec();
    spec.reader = reader;
    final StringBuilder description = new StringBuilder();

    final int chainLength = random.nextInt(3);
    for (int position = 0; position < chainLength; position++) {
        Reader wrapped = null;
        do {
            final Constructor<? extends CharFilter> candidate =
                charfilters.get(random.nextInt(charfilters.size()));
            final Object[] ctorArgs = newCharFilterArgs(random, spec.reader, candidate.getParameterTypes());
            if (broken(candidate, ctorArgs)) {
                // wrapped is still null here, so the loop condition triggers another draw.
                continue;
            }
            wrapped = createComponent(candidate, ctorArgs, description);
        } while (wrapped == null);
        spec.reader = wrapped;
    }

    spec.toString = description.toString();
    return spec;
}
开发者ID:europeana,项目名称:search,代码行数:23,代码来源:TestRandomChains.java
示例6: testIterationMarksWithJapaneseTokenizer
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Feeds text through an iteration mark char filter factory initialized with no arguments,
 * tokenizes the result with a Japanese tokenizer initialized with no arguments, and
 * checks the produced tokens.
 *
 * @throws IOException if the analysis chain fails while reading
 */
public void testIterationMarksWithJapaneseTokenizer() throws IOException {
    JapaneseTokenizerFactory tokenizers = new JapaneseTokenizerFactory();
    tokenizers.init(Collections.<String, String>emptyMap());
    tokenizers.inform(new StringMockResourceLoader(""));

    JapaneseIterationMarkCharFilterFactory markFilters = new JapaneseIterationMarkCharFilterFactory();
    markFilters.init(Collections.<String, String>emptyMap());

    final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
    CharFilter filtered = markFilters.create(new StringReader(text));
    TokenStream tokens = tokenizers.create(filtered);

    final String[] expectedTokens = {"時時", "馬鹿馬鹿しい", "ところどころ", "ミ", "スズ"};
    assertTokenStreamContents(tokens, expectedTokens);
}
开发者ID:pkarmstr,项目名称:NYBC,代码行数:17,代码来源:TestJapaneseIterationMarkCharFilterFactory.java
示例7: testKanjiOnlyIterationMarksWithJapaneseTokenizer
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Same pipeline as the default iteration mark test, but with the filter factory
 * initialized with {@code normalizeKanji=true} and {@code normalizeKana=false}.
 *
 * @throws IOException if the analysis chain fails while reading
 */
public void testKanjiOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
    JapaneseTokenizerFactory tokenizers = new JapaneseTokenizerFactory();
    tokenizers.init(Collections.<String, String>emptyMap());
    tokenizers.inform(new StringMockResourceLoader(""));

    JapaneseIterationMarkCharFilterFactory markFilters = new JapaneseIterationMarkCharFilterFactory();
    Map<String, String> markOptions = new HashMap<String, String>();
    markOptions.put("normalizeKanji", "true");
    markOptions.put("normalizeKana", "false");
    markFilters.init(markOptions);

    final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
    CharFilter filtered = markFilters.create(new StringReader(text));
    TokenStream tokens = tokenizers.create(filtered);

    final String[] expectedTokens = {"時時", "馬鹿馬鹿しい", "ところ", "ゞ", "ゝ", "ゝ", "ミス", "ヾ"};
    assertTokenStreamContents(tokens, expectedTokens);
}
开发者ID:pkarmstr,项目名称:NYBC,代码行数:19,代码来源:TestJapaneseIterationMarkCharFilterFactory.java
示例8: testKanaOnlyIterationMarksWithJapaneseTokenizer
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Same pipeline as the default iteration mark test, but with the filter factory
 * initialized with {@code normalizeKanji=false} and {@code normalizeKana=true}.
 *
 * @throws IOException if the analysis chain fails while reading
 */
public void testKanaOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
    JapaneseTokenizerFactory tokenizers = new JapaneseTokenizerFactory();
    tokenizers.init(Collections.<String, String>emptyMap());
    tokenizers.inform(new StringMockResourceLoader(""));

    JapaneseIterationMarkCharFilterFactory markFilters = new JapaneseIterationMarkCharFilterFactory();
    Map<String, String> markOptions = new HashMap<String, String>();
    markOptions.put("normalizeKanji", "false");
    markOptions.put("normalizeKana", "true");
    markFilters.init(markOptions);

    final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
    CharFilter filtered = markFilters.create(new StringReader(text));
    TokenStream tokens = tokenizers.create(filtered);

    final String[] expectedTokens = {"時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ"};
    assertTokenStreamContents(tokens, expectedTokens);
}
开发者ID:pkarmstr,项目名称:NYBC,代码行数:19,代码来源:TestJapaneseIterationMarkCharFilterFactory.java
示例9: testIterationMarksWithKeywordTokenizer
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Runs the iteration mark char filter (factory built from an empty argument map) in front
 * of a {@code MockTokenizer} in KEYWORD mode and expects the whole filtered text back as a
 * single token.
 *
 * @throws IOException if the analysis chain fails while reading
 */
public void testIterationMarksWithKeywordTokenizer() throws IOException {
    final String source = "時々馬鹿々々しいところゞゝゝミスヾ";

    JapaneseIterationMarkCharFilterFactory markFilters =
        new JapaneseIterationMarkCharFilterFactory(new HashMap<String,String>());
    CharFilter filtered = markFilters.create(new StringReader(source));
    TokenStream tokens = new MockTokenizer(filtered, MockTokenizer.KEYWORD, false);

    final String[] expectedTokens = {"時時馬鹿馬鹿しいところどころミスズ"};
    assertTokenStreamContents(tokens, expectedTokens);
}
开发者ID:europeana,项目名称:search,代码行数:8,代码来源:TestJapaneseIterationMarkCharFilterFactory.java
示例10: testIterationMarksWithJapaneseTokenizer
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Feeds text through an iteration mark char filter whose factory is built from an empty
 * argument map, tokenizes the result with a Japanese tokenizer whose factory is also
 * built from an empty argument map, and checks the produced tokens.
 *
 * @throws IOException if the analysis chain fails while reading
 */
public void testIterationMarksWithJapaneseTokenizer() throws IOException {
    JapaneseTokenizerFactory tokenizers = new JapaneseTokenizerFactory(new HashMap<String,String>());
    tokenizers.inform(new StringMockResourceLoader(""));

    JapaneseIterationMarkCharFilterFactory markFilters =
        new JapaneseIterationMarkCharFilterFactory(new HashMap<String,String>());

    final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
    CharFilter filtered = markFilters.create(new StringReader(text));
    TokenStream tokens = tokenizers.create(newAttributeFactory(), filtered);

    final String[] expectedTokens = {"時時", "馬鹿馬鹿しい", "ところどころ", "ミ", "スズ"};
    assertTokenStreamContents(tokens, expectedTokens);
}
开发者ID:europeana,项目名称:search,代码行数:12,代码来源:TestJapaneseIterationMarkCharFilterFactory.java
示例11: testKanjiOnlyIterationMarksWithJapaneseTokenizer
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Same pipeline as the default iteration mark test, but with the filter factory
 * constructed with {@code normalizeKanji=true} and {@code normalizeKana=false}.
 *
 * @throws IOException if the analysis chain fails while reading
 */
public void testKanjiOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
    JapaneseTokenizerFactory tokenizers = new JapaneseTokenizerFactory(new HashMap<String,String>());
    tokenizers.inform(new StringMockResourceLoader(""));

    Map<String, String> markOptions = new HashMap<>();
    markOptions.put("normalizeKanji", "true");
    markOptions.put("normalizeKana", "false");
    JapaneseIterationMarkCharFilterFactory markFilters = new JapaneseIterationMarkCharFilterFactory(markOptions);

    final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
    CharFilter filtered = markFilters.create(new StringReader(text));
    TokenStream tokens = tokenizers.create(newAttributeFactory(), filtered);

    final String[] expectedTokens = {"時時", "馬鹿馬鹿しい", "ところ", "ゞ", "ゝ", "ゝ", "ミス", "ヾ"};
    assertTokenStreamContents(tokens, expectedTokens);
}
开发者ID:europeana,项目名称:search,代码行数:16,代码来源:TestJapaneseIterationMarkCharFilterFactory.java
示例12: testKanaOnlyIterationMarksWithJapaneseTokenizer
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Same pipeline as the default iteration mark test, but with the filter factory
 * constructed with {@code normalizeKanji=false} and {@code normalizeKana=true}.
 *
 * @throws IOException if the analysis chain fails while reading
 */
public void testKanaOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
    JapaneseTokenizerFactory tokenizers = new JapaneseTokenizerFactory(new HashMap<String,String>());
    tokenizers.inform(new StringMockResourceLoader(""));

    Map<String, String> markOptions = new HashMap<>();
    markOptions.put("normalizeKanji", "false");
    markOptions.put("normalizeKana", "true");
    JapaneseIterationMarkCharFilterFactory markFilters = new JapaneseIterationMarkCharFilterFactory(markOptions);

    final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
    CharFilter filtered = markFilters.create(new StringReader(text));
    TokenStream tokens = tokenizers.create(newAttributeFactory(), filtered);

    final String[] expectedTokens = {"時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ"};
    assertTokenStreamContents(tokens, expectedTokens);
}
开发者ID:europeana,项目名称:search,代码行数:16,代码来源:TestJapaneseIterationMarkCharFilterFactory.java
示例13: testKanjiOnly
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Tests the filter with only kanji repetition marks enabled: the kanji flag is
 * {@code true} and the kana flag is {@code false}.
 *
 * @throws IOException if reading from the filter fails
 */
public void testKanjiOnly() throws IOException {
    final String text = "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。";
    final boolean kanji = true;
    final boolean kana = false;

    CharFilter marks = new JapaneseIterationMarkCharFilter(new StringReader(text), kanji, kana);

    assertCharFilterEquals(marks, "時時、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
}
开发者ID:europeana,项目名称:search,代码行数:10,代码来源:TestJapaneseIterationMarkCharFilter.java
示例14: testKanaOnly
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Tests the filter with only kana repetition marks enabled: the kanji flag is
 * {@code false} and the kana flag is {@code true}.
 *
 * @throws IOException if reading from the filter fails
 */
public void testKanaOnly() throws IOException {
    final String text = "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。";
    final boolean kanji = false;
    final boolean kana = true;

    CharFilter marks = new JapaneseIterationMarkCharFilter(new StringReader(text), kanji, kana);

    assertCharFilterEquals(marks, "時々、おおのさんと一緒にお寿司が食べたいです。abcところどころ。");
}
开发者ID:europeana,项目名称:search,代码行数:10,代码来源:TestJapaneseIterationMarkCharFilter.java
示例15: testNone
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Tests the filter with both the kanji and kana flags set to {@code false}; the expected
 * output is the input text unchanged.
 *
 * @throws IOException if reading from the filter fails
 */
public void testNone() throws IOException {
    final String text = "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。";
    final boolean kanji = false;
    final boolean kana = false;

    CharFilter marks = new JapaneseIterationMarkCharFilter(new StringReader(text), kanji, kana);

    assertCharFilterEquals(marks, "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
}
开发者ID:europeana,项目名称:search,代码行数:10,代码来源:TestJapaneseIterationMarkCharFilter.java
示例16: testTokenStream
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Tokenizes {@code nfkc} / COMPOSE normalized text on whitespace and checks the terms,
 * their start and end offsets, and the final offset, which must equal the input length.
 *
 * @throws IOException if the analysis chain fails while reading
 */
public void testTokenStream() throws IOException {
    // Input tokens: '℃', '№', '㈱', '㌘', 'サ'+'<<', 'ソ'+'<<', '㌰'+'<<'
    final String input = "℃ № ㈱ ㌘ ザ ゾ ㌰゙";
    final Normalizer2 nfkc = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE);

    CharFilter normalized = new ICUNormalizer2CharFilter(new StringReader(input), nfkc);
    Tokenizer tokens = new MockTokenizer(normalized, MockTokenizer.WHITESPACE, false);

    final String[] expectedTerms = {"°C", "No", "(株)", "グラム", "ザ", "ゾ", "ピゴ"};
    final int[] expectedStarts = {0, 2, 4, 6, 8, 11, 14};
    final int[] expectedEnds = {1, 3, 5, 7, 10, 13, 16};
    assertTokenStreamContents(tokens, expectedTerms, expectedStarts, expectedEnds, input.length());
}
开发者ID:europeana,项目名称:search,代码行数:16,代码来源:TestICUNormalizer2CharFilter.java
示例17: testTokenStream2
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Splits {@code nfkc_cf} / COMPOSE normalized text into 1-grams and checks the terms,
 * their start and end offsets, and the final offset, which must equal the input length.
 *
 * @throws IOException if the analysis chain fails while reading
 */
public void testTokenStream2() throws IOException {
    // Input, character by character: '㌰', '<<', '5', '℃', '№', '㈱', '㌘', 'サ', '<<', 'ソ', '<<'
    final String input = "㌰゙5℃№㈱㌘ザゾ";
    final Normalizer2 nfkcCasefold = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);

    CharFilter normalized = new ICUNormalizer2CharFilter(new StringReader(input), nfkcCasefold);
    Tokenizer tokens = new NGramTokenizer(newAttributeFactory(), normalized, 1, 1);

    final String[] expectedTerms =
        {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"};
    final int[] expectedStarts = {0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9};
    final int[] expectedEnds = {1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11};
    assertTokenStreamContents(tokens, expectedTerms, expectedStarts, expectedEnds, input.length());
}
开发者ID:europeana,项目名称:search,代码行数:17,代码来源:TestICUNormalizer2CharFilter.java
示例18: testMassiveLigature
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Normalizes the single character U+FDFA with {@code nfkc_cf} / COMPOSE, tokenizes the
 * result on whitespace, and expects four terms whose offsets all fall within the single
 * input character.
 *
 * @throws IOException if the analysis chain fails while reading
 */
public void testMassiveLigature() throws IOException {
    final String input = "\uFDFA";
    final Normalizer2 nfkcCasefold = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);

    CharFilter normalized = new ICUNormalizer2CharFilter(new StringReader(input), nfkcCasefold);
    Tokenizer tokens = new MockTokenizer(normalized, MockTokenizer.WHITESPACE, false);

    final String[] expectedTerms = {"صلى", "الله", "عليه", "وسلم"};
    final int[] expectedStarts = {0, 0, 0, 0};
    final int[] expectedEnds = {0, 0, 0, 1};
    assertTokenStreamContents(tokens, expectedTerms, expectedStarts, expectedEnds, input.length());
}
开发者ID:europeana,项目名称:search,代码行数:16,代码来源:TestICUNormalizer2CharFilter.java
示例19: checkOutput
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Runs {@code PatternReplaceCharFilter} over {@code input} and checks both the filtered
 * text and the offset mapping back into the input.
 *
 * <p>The "index-matched" string has one char per output position: the input char found at
 * that position's corrected offset, or {@code "-"} when the corrected offset is negative.
 * On any mismatch a diagnostic dump is printed before the assertions fail.
 *
 * @param input text fed to the filter
 * @param pattern regular expression passed through the {@code pattern(...)} helper
 * @param replacement replacement string handed to the filter
 * @param expectedOutput expected filtered text
 * @param expectedIndexMatchedOutput expected index-matched string as described above
 * @throws IOException if reading from the filter fails
 */
private void checkOutput(String input, String pattern, String replacement,
    String expectedOutput, String expectedIndexMatchedOutput) throws IOException {
    CharFilter cs = new PatternReplaceCharFilter(pattern(pattern), replacement,
        new StringReader(input));

    // Reader.read() signals end of stream with -1 only; 0 is a valid char (NUL) and
    // must not terminate the loop.
    StringBuilder output = new StringBuilder();
    for (int chr = cs.read(); chr != -1; chr = cs.read()) {
        output.append((char) chr);
    }

    StringBuilder indexMatched = new StringBuilder();
    for (int i = 0; i < output.length(); i++) {
        int corrected = cs.correctOffset(i);
        if (corrected < 0) {
            indexMatched.append("-");
        } else {
            indexMatched.append(input.charAt(corrected));
        }
    }

    boolean outputGood = expectedOutput.equals(output.toString());
    boolean indexMatchedGood = expectedIndexMatchedOutput.equals(indexMatched.toString());

    if (!outputGood || !indexMatchedGood) {
        System.out.println("Pattern : " + pattern);
        System.out.println("Replac. : " + replacement);
        System.out.println("Input : " + input);
        System.out.println("Output : " + output);
        System.out.println("Expected: " + expectedOutput);
        System.out.println("Output/i: " + indexMatched);
        System.out.println("Expected: " + expectedIndexMatchedOutput);
        System.out.println();
    }

    assertTrue("Output doesn't match.", outputGood);
    assertTrue("Index-matched output doesn't match.", indexMatchedGood);
}
开发者ID:europeana,项目名称:search,代码行数:33,代码来源:TestPatternReplaceCharFilter.java
示例20: testNothingChange
import org.apache.lucene.analysis.CharFilter; //导入依赖的package包/类
/**
 * Applies a pattern replacement whose pattern, {@code (aa)\s+(bb)\s+(cc)}, does not occur
 * in the text, then tokenizes on whitespace and checks the terms, their offsets, and the
 * final offset, which must equal the text length.
 *
 * @throws IOException if the analysis chain fails while reading
 */
public void testNothingChange() throws IOException {
    final String text = "this is test.";

    CharFilter replaced = new PatternReplaceCharFilter(
        pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3", new StringReader(text));
    TokenStream tokens = new MockTokenizer(replaced, MockTokenizer.WHITESPACE, false);

    final String[] expectedTerms = {"this", "is", "test."};
    final int[] expectedStarts = {0, 5, 8};
    final int[] expectedEnds = {4, 7, 13};
    assertTokenStreamContents(tokens, expectedTerms, expectedStarts, expectedEnds, text.length());
}
开发者ID:europeana,项目名称:search,代码行数:12,代码来源:TestPatternReplaceCharFilter.java
注:本文中的org.apache.lucene.analysis.CharFilter类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论