测试类
package LuceneUtil;
import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
//自定义过滤分词器
public class MyStopAnalyzer extends Analyzer {
private Set stops;
public MyStopAnalyzer(String [] sws)//形参为 字符串数组
{
//会自动将字符串数组转换为Set
stops=StopFilter.makeStopSet(Version.LUCENE_35, sws,true);
//将原有的停用词加入到现在的停用词中
stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
public MyStopAnalyzer()
{
stops=StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}
public TokenStream tokenStream(String FileName,Reader reader)
{
return new StopFilter(Version.LUCENE_35,
new LowerCaseFilter(Version.LUCENE_35,
new LetterTokenizer(Version.LUCENE_35, reader)), stops);
}
}
package LuceneTest;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
import LuceneUtil.AnalyzerUtils;
import LuceneUtil.MyStopAnalyzer;
public class TestAnalyzer {
static ArrayList<String> list=null;
/**
 * Program entry point: runs the stop-word analyzer comparison in {@code test01()}.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared by the signature; the call made here does not declare it
 */
public static void main(String[] args) throws IOException {
    // Other experiments, currently disabled: addNewWord( "烟台大学 ") and test().
    TestAnalyzer.test01();
}
/** Tokenizes a fixed Chinese sentence with an MMSegAnalyzer and prints the resulting tokens. */
public static void test() {
    final Analyzer mmseg = new MMSegAnalyzer();
    final String sample = "我是一名大学生,我来自菏*,我现在烟台大学。";
    AnalyzerUtils.displayToken(sample, mmseg);
}
/**
 * Prints the tokens of one English sample sentence twice: first through the
 * custom MyStopAnalyzer configured with extra stop words, then through
 * Lucene's built-in StopAnalyzer, so the two outputs can be compared.
 */
public static void test01() {
    // Extra words the custom analyzer is asked to filter out.
    final String[] extraStopWords = {"you", "meet", "***"};
    final Analyzer custom = new MyStopAnalyzer(extraStopWords);
    // Lucene's stock StopAnalyzer, used as the baseline.
    final Analyzer builtin = new StopAnalyzer(Version.LUCENE_35);

    final String sample = " i say :how are You,nice to meet you. ***";
    for (Analyzer analyzer : new Analyzer[] {custom, builtin}) {
        AnalyzerUtils.displayToken(sample, analyzer);
    }
}
package LuceneUtil;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.omg.CORBA.portable.Streamable;
/**
 * Helper for inspecting what an {@link Analyzer} produces.
 */
public class AnalyzerUtils {

    /**
     * Runs {@code str} through the analyzer and prints every token to standard
     * output as {@code [token]}, all on one line, followed by a line break.
     *
     * <p>Follows the TokenStream consumer workflow (reset, incrementToken
     * until false, end, close) so the stream and its underlying reader are
     * always released, even if tokenization fails.
     *
     * @param str text to analyze
     * @param a   analyzer used to tokenize the text
     */
    public static void displayToken(String str, Analyzer a) {
        TokenStream ts = null;
        try {
            ts = a.tokenStream("cotents", new StringReader(str));
            // The attribute is registered on the stream and is updated in place
            // each time incrementToken() advances to the next token.
            CharTermAttribute cta = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.print("[" + cta + "]");
            }
            ts.end();
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (ts != null) {
                try {
                    ts.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
/*public static void addNewWord(String newWord) throws IOException{BufferedWriter bw=new BufferedWriter(new FileWriter("G:\\mmseg\\data\\words-my.dic"));ArrayList<String> list=new ArrayList<String>();list.add(newWord);Iterator<String> iterator=list.iterator();while
(iterator.hasNext()){bw.write(iterator.next());bw.flush();bw.newLine();}bw.close();System.out.println("添加成功");}*/}
测试结果如下:
可见，我想和谐掉的那几个词已经被和谐。
第一行为执行和谐后的结果，
第二行为未被和谐的结果。
[i][say][how][nice]
[i][say][how][you][nice][meet][you][***]
不足：目前还不能对单个汉语词语进行和谐，汉语只能屏蔽一整句话，而英语则可以屏蔽单个单词。
|
请发表评论