This article collects typical usage examples of the Java class org.ansj.splitWord.analysis.ToAnalysis. If you have been wondering what ToAnalysis does, or how to use it, the curated examples below should help.
The ToAnalysis class belongs to the org.ansj.splitWord.analysis package. Twenty code examples are listed below, sorted by popularity by default. You can upvote the ones you like or find useful; your feedback helps the system recommend better Java examples.
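Before diving into the project snippets, here is a minimal, self-contained sketch of the most common call (assuming the ansj_seg library is on the classpath; depending on the ansj version, parse() returns either a Result or a List<Term> directly):

import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.ToAnalysis;

public class ToAnalysisQuickStart {
    public static void main(String[] args) {
        // precise segmentation ("精准分词") of a Chinese sentence
        Result result = ToAnalysis.parse("欢迎使用ansj分词");
        for (Term term : result.getTerms()) {
            // print each token together with its part-of-speech tag
            System.out.println(term.getName() + "\t" + term.getNatureStr());
        }
    }
}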
Example 1: main
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
public static void main(String[] args) throws IOException {
    List<Term> parse = ToAnalysis.parse("中华人民 共和国 成立了 ");
    System.out.println(parse);
    List<Term> parse1 = IndexAnalysis.parse("你吃过饭了没有!!!!!吃过无妨论文");
    //System.out.println(parse1);
    String text11 = "ZW321282050000000325";
    Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text11), 0, true);
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
        System.out.print(termAtt.toString() + " ");
        //System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
        //System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
    }
    tokenizer.close();
}
Author: dimensoft, Project: improved-journey, Lines: 27, Source: TestAnsj.java
Example 2: checkTextContent
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
public int checkTextContent(int userId, String content) throws IOException {
    // load the sensitive-word list, one word per line
    HashSet<String> sensitiveWords = new HashSet<String>();
    InputStream fis = new FileInputStream(source);
    InputStreamReader isr = new InputStreamReader(fis, Charset.forName("UTF-8"));
    BufferedReader br = new BufferedReader(isr);
    String line;
    while ((line = br.readLine()) != null)
        sensitiveWords.add(line.substring(0, line.length() - 1)); // each line carries one trailing character, stripped here
    br.close();
    // strip all HTML from the content, segment it, and scan for sensitive words
    Result result = ToAnalysis.parse(Jsoup.clean(content, Whitelist.none()));
    List<Term> termList = result.getTerms();
    for (Term term : termList) {
        if (sensitiveWords.contains(term.getName()))
            return 0; // sensitive word found
    }
    return 1; // content is clean
}
Author: qinjr, Project: TeamNote, Lines: 19, Source: QualityUtilImpl.java
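Re-reading the dictionary file on every call makes each check pay file I/O. A common refinement is to load the word set once and reuse it across calls; a minimal sketch (the class and method names below are illustrative, not taken from the project above):

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;

public class SensitiveWordCache {
    private static volatile Set<String> words;

    // load the dictionary on first use, then serve the cached set
    public static Set<String> get(String path) throws IOException {
        if (words == null) {
            synchronized (SensitiveWordCache.class) {
                if (words == null) {
                    Set<String> set = new HashSet<>();
                    try (BufferedReader br = Files.newBufferedReader(Paths.get(path), StandardCharsets.UTF_8)) {
                        String line;
                        while ((line = br.readLine()) != null) {
                            set.add(line.trim());
                        }
                    }
                    words = set;
                }
            }
        }
        return words;
    }
}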
Example 3: doPost
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
public void doPost(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    request.setCharacterEncoding("UTF-8");
    response.setCharacterEncoding("UTF-8");
    // required parameter
    String text = request.getParameter("text");
    List<Term> terms = ToAnalysis.parse(text);
    PrintWriter out = response.getWriter();
    out.print("分词结果为:\n" + terms);
    out.flush();
    out.close();
}
Author: landriesnidis, Project: NSIITA-SemanticMatching, Lines: 17, Source: Ansj_seg.java
Example 4: tokenizeDocxFile
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
private static void tokenizeDocxFile(String filePath) {
    File file = new File(filePath);
    DocumentInfo doc = DocumentParser.parseFileToDocumentInfo(file);
    if (doc instanceof WordDocumentInfo) {
        String content = ((WordDocumentInfo) doc).getDocStr();
        Result terms = ToAnalysis.parse(content);
        for (int i = 0; i < terms.size(); i++) {
            String words = terms.get(i).getName();
            boolean filtered = false;
            // drop stop words
            for (String stopToken : stopTokens)
                if (words.equals(stopToken)) { filtered = true; break; }
            // drop tokens that start with an ASCII letter or digit
            char firstLetter = words.charAt(0);
            if ((firstLetter >= 'A' && firstLetter <= 'Z') ||
                (firstLetter >= 'a' && firstLetter <= 'z') ||
                (firstLetter >= '0' && firstLetter <= '9'))
                filtered = true;
            if (filtered) continue;
            wordsCN.add(words);
        }
    } else {
        System.out.println("Not a docx file");
    }
}
Author: linzeqipku, Project: SnowGraph, Lines: 23, Source: WordSegmenter.java
Example 5: findEntities
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
@Override
public Entities findEntities(String sentence, boolean allowDuplicated) {
    Entities entities = new Entities(allowDuplicated);
    Result result = ToAnalysis.parse(sentence);
    for (Term term : result.getTerms()) {
        if (term.getName().length() < 2) {
            continue; // skip single-character terms
        }
        if (term.getNatureStr().startsWith("nr")) {        // person name
            entities.addPerson(term.getName());
        } else if (term.getNatureStr().startsWith("nt")) { // organization name
            entities.addOrganization(term.getName());
        } else if (term.getNatureStr().startsWith("ns")) { // place name
            if (term.getName().endsWith("大学") || term.getName().endsWith("学院")) {
                // universities and colleges count as organizations rather than places
                entities.addOrganization(term.getName());
            } else {
                entities.addSpace(term.getName());
            }
        }
    }
    return entities;
}
Author: iamxiatian, Project: wikit, Lines: 24, Source: AnsjSegment.java
Example 6: main
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
public static void main(String[] args) throws IOException {
    List<Term> parse = ToAnalysis.parse("天天向上,媒体打打。《回家真好》");
    System.out.println(parse);
    Tokenizer tokenizer = new AnsjTokenizer(new StringReader("天天向上,媒体打打。《回家真好》"), 0, true);
    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute positionIncrementAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);
    while (tokenizer.incrementToken()) {
        System.out.print(termAtt.toString());
        System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
        System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
    }
    tokenizer.close();
}
Author: lgnlgn, Project: ansj4solr, Lines: 21, Source: TestAnsj.java
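Note that, unlike Example 1, this snippet never calls tokenizer.reset(); recent Lucene versions require reset() before the first incrementToken(), so this code likely targets an older Lucene release where the missing call was tolerated.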
Example 7: testAnsjAnalyzer
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
@Test
public void testAnsjAnalyzer() throws IOException {
    String input = "我在首都机场虹桥路滑旱冰!玩的很Happy,很Hi!";
    System.out.print("Ansj索引时采用面向索引的分词:" + input);
    displayTokensWithFullDetails(new AnsjAnalyzer(Version.LUCENE_36), input);
    System.out.print("索引分词时过滤停用词");
    displayTokensWithFullDetails(new AnsjAnalyzer(Version.LUCENE_36, null, stopwords), input);
    System.out.println("------------------------------------------------------------");
    System.out.print("Ansj查询时采用精准分词:" + input);
    displayTokensWithFullDetails(new AnsjAnalyzer(Version.LUCENE_36, ToAnalysis.class), input);
    System.out.print("查询分词时过滤停用词");
    displayTokensWithFullDetails(new AnsjAnalyzer(Version.LUCENE_36, ToAnalysis.class, stopwords), input);
}
Author: flash0729, Project: ansj-seg-for-lucene3, Lines: 17, Source: AnsjAnalysisTest.java
Example 8: demo
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
public static ArrayList<String> demo(String strToParse) {
    String str = strToParse;
    //"我年纪还轻,阅历不深的时候,我父亲教导过我一句话,我至今还念念不忘。 \n" +
    //"“每逢你想要批评任何人的时候,”他对我说,“你就记住,这个世界上所有的人,并不是个个都有过你拥有的那些优越的条件。”";
    ArrayList<String> ret = new ArrayList<>();
    Result terms = ToAnalysis.parse(str);
    for (int i = 0; i < terms.size(); i++) {
        String words = terms.get(i).getName();          // the token text
        String nominal = terms.get(i).getNatureStr();   // its part-of-speech tag
        ret.add(words);
        //System.out.print(words + "\t" + nominal + "\n");
    }
    return ret;
}
Author: linzeqipku, Project: SnowGraph, Lines: 15, Source: WordSegmenter.java
Example 9: Seg
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
/**
 * Segment a sentence.
 * @param sentence the sentence to segment
 * @return the segmentation result
 */
public static List<Term> Seg(String sentence) {
    FilterRecognition filter = new FilterRecognition();
    // filter out punctuation
    filter.insertStopWord(",", " ", ".", ",", "。", ":", ":", "'", "‘", "’", " ", "“", "”", "《", "》", "[", "]", "-");
    return ToAnalysis.parse(sentence).recognition(filter).getTerms();
}
Author: jsksxs360, Project: Word2Vec, Lines: 12, Source: Segment.java
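A possible call site, assuming the enclosing class is named Segment as the source file name suggests:

List<Term> terms = Segment.Seg("今天天气不错,我们去公园散步吧。");
for (Term t : terms) {
    System.out.println(t.getName() + "\t" + t.getNatureStr());
}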
Example 10: realySplit
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
private ArrayList<String> realySplit(String strbuf) {
    List<Term> parse = ToAnalysis.parse(strbuf);
    ArrayList<String> words = new ArrayList<>();
    for (Term term : parse) {
        // keep only terms that contain Chinese characters
        if (StringUtils.RegexUtils.hasChinese(term.toString())) {
            words.add(term.getName());
        }
    }
    return words;
}
Author: William-Hai, Project: CorpusSpider, Lines: 12, Source: AnjsSplitWordsUtils.java
Example 11: containKeyword
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
public static boolean containKeyword(String text, String keyword) throws Exception {
    boolean flag = false;
    // segment the keyword and check whether the text contains any of its tokens
    List<Term> tokens = ToAnalysis.parse(keyword);
    for (Term t : tokens) {
        String token = t.getName();
        if (text.contains(token)) {
            flag = true;
            break;
        }
    }
    return flag;
}
Author: YufangWoo, Project: news-crawler, Lines: 13, Source: HtmlContentExtractor.java
Example 12: segment
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
@Override
public List<String> segment(String sentence) throws SegmentException {
    List<String> results = new ArrayList<String>();
    Result result = ToAnalysis.parse(sentence);
    for (Term term : result.getTerms()) {
        results.add(term.getName());
    }
    return results;
}
Author: iamxiatian, Project: wikit, Lines: 10, Source: AnsjSegment.java
Example 13: tag
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
@Override
public List<SegWord> tag(String sentence) throws SegmentException {
    Result result = ToAnalysis.parse(sentence);
    List<SegWord> results = new ArrayList<SegWord>();
    for (Term term : result.getTerms()) {
        results.add(new SegWord(term.getName(), term.getNatureStr()));
    }
    return results;
}
Author: iamxiatian, Project: wikit, Lines: 12, Source: AnsjSegment.java
Example 14: tokenizeTerm
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
@Override
public HashMap<String, TermScore> tokenizeTerm(String input_str) {
    //long startt = System.nanoTime();
    tokens = ToAnalysis.parse(input_str);
    token_iterator = tokens.listIterator();
    HashMap<String, TermScore> hash = new HashMap<String, TermScore>();
    while (token_iterator.hasNext()) {
        Term term = token_iterator.next();
        // count terms of length >= 2: the first occurrence scores 0, each repeat adds 1
        if (term.getName().length() >= 2) {
            if (hash.get(term.getName()) == null) {
                hash.put(term.getName(), new TermScore(term.getName(), 0));
            } else {
                TermScore exist_term = hash.get(term.getName());
                int new_score = exist_term.getScore() + 1;
                exist_term.setScore(new_score);
                hash.put(term.getName(), exist_term);
            }
        }
    }
    //long endd = System.nanoTime();
    //System.out.println("Tokenization costs: " + (endd - startt) + " ns");
    return hash;
}
Author: LunarBaseEngin, Project: LunarBase, Lines: 30, Source: TokenizerForSearchEngine.java
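Since Java 8, the same per-term counting can be written more compactly with Map.merge. A sketch reusing the names from the method above, under the assumption that a plain occurrence count is wanted (note it counts the first occurrence as 1, whereas the method above starts at 0):

Map<String, Integer> counts = new HashMap<>();
for (Term term : ToAnalysis.parse(input_str).getTerms()) {
    String name = term.getName();
    if (name.length() >= 2) {
        counts.merge(name, 1, Integer::sum); // insert 1, or add 1 to the existing count
    }
}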
Example 15: guessNature
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
/**
 * Guess the part of speech of a word from suffix rules.
 *
 * @param word the word to classify
 * @return the guessed TermNatures
 */
public static TermNatures guessNature(String word) {
    String nature = null;
    SmartForest<String[]> smartForest = SUFFIX_FOREST;
    int len = 0;
    // walk the suffix trie backwards from the last character
    for (int i = word.length() - 1; i >= 0; i--) {
        smartForest = smartForest.get(word.charAt(i));
        if (smartForest == null) {
            break;
        }
        len++;
        if (smartForest.getStatus() == 2) {
            nature = smartForest.getParam()[0]; // partial match: remember the nature and keep walking
        } else if (smartForest.getStatus() == 3) {
            nature = smartForest.getParam()[0]; // terminal match: stop
            break;
        }
    }
    if ("nt".equals(nature) && (len > 1 || word.length() > 3)) {
        return TermNatures.NT; // organization name
    } else if ("ns".equals(nature)) {
        return TermNatures.NS; // place name
    } else if (word.length() < 5) {
        // short words: segment and look for a person-name tag
        Result parse = ToAnalysis.parse(word);
        for (Term term : parse.getTerms()) {
            if ("nr".equals(term.getNatureStr())) {
                return TermNatures.NR;
            }
        }
    } else if (ForeignPersonRecognition.isFName(word)) {
        return TermNatures.NRF; // foreign person name
    }
    return TermNatures.NW; // default: new word
}
Author: deeplearning4j, Project: deeplearning4j, Lines: 42, Source: NatureRecognition.java
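A possible call, assuming the enclosing class is NatureRecognition as the source file name suggests:

TermNatures guessed = NatureRecognition.guessNature("某某科技公司");
System.out.println(guessed); // suffix-based guess for an out-of-vocabulary word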
Example 16: createComponents
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
@Override
protected TokenStreamComponents createComponents(String fieldName,
        Reader reader) {
    final Tokenizer source = new AnsjTokenizer(reader, new ToAnalysis(reader));
    TokenStreamComponents result;
    if (stopwords.isEmpty()) {
        result = new TokenStreamComponents(source);
    } else {
        result = new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
    }
    return result;
}
Author: flash0729, Project: ansj-seg-for-lucene3, Lines: 15, Source: AnsjSearchAnalyzer.java
Example 17: contextInitialized
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
public void contextInitialized(ServletContextEvent arg0) {
    // initialization starts
    System.out.println("\nChatBots初始化加载中...\n");
    // load the command phrase set
    System.out.println("正在导入短语数据...");
    Phrases_Config.Init();
    System.out.println("短语导入完毕!\n");
    // load the lexicon
    System.out.println("正在导入词库数据...");
    Lexicon_Config.Init();
    System.out.println("词库导入完毕!\n");
    // load the language pack
    System.out.println("正在导入语言包...");
    Language_Config.Init();
    System.out.println("语言包导入完毕!\n");
    // warm up the segmenter with a first call to ToAnalysis.parse
    System.out.println("正在初始化分词系统...");
    String str = "初始化ANSJ分词系统,当前使用分词模式:精准分词";
    ToAnalysis.parse(str);
    System.out.println("分词系统初始化完成!\n");
    // start the socket server
    System.out.println("正在启动Socket服务...");
    new Thread(new Runnable() {
        @Override
        public void run() {
            new SocketServer(9876).start();
        }
    }).start();
    // start the socket client
    System.out.println("正在启动Socket客户端...");
    new Thread(new Runnable() {
        @Override
        public void run() {
            new SocketClient();
        }
    }).start();
}
Author: landriesnidis, Project: NSIITA-SemanticMatching, Lines: 47, Source: Init.java
Example 18: main
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
public static void main(String[] args) {
    // add a new user word (in a dictionary file, fields are separated by '\t')
    UserDefineLibrary.insertWord("ansj中文分词", "教师.list", 1000);
    List<Term> terms = ToAnalysis.parse("我觉得Ansj中文分词是一个不错的系统!我是王婆!");
    System.out.println("增加新词例子:" + terms);
    // remove a word; only entries in the user-defined dictionary can be removed
    UserDefineLibrary.removeWord("ansj中文分词");
    terms = ToAnalysis.parse("我觉得ansj中文分词是一个不错的系统!我是王婆!");
    System.out.println("删除用户自定义词典例子:" + terms);
}
Author: landriesnidis, Project: NSIITA-SemanticMatching, Lines: 11, Source: Demo.java
Example 19: loadExistingFileFromLexicon
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
/**
 * Load an existing file into the lexicon.
 * @param fileName
 * @return true if the lexicon was loaded (or had already been loaded)
 */
public static boolean loadExistingFileFromLexicon(String fileName) {
    // skip lexicons that have already been loaded
    if (Lexicons.contains(fileName)) {
        return true;
    }
    // directory path and file name
    String path = Library_Config.Lexicon_Path;
    // read the file contents
    String[] lines = new RWfile_Util(path).ReadFile(fileName);
    if (lines[0] == null) {
        System.out.println("词库 " + fileName + " 导入失败!");
        return false;
    }
    // import the dictionary entries
    for (int i = 0; i < lines.length; i++) {
        try {
            // decide whether the new word is a basic word (i.e. segmentation yields a single element)
            String[] line_split = lines[i].split("\\|");
            int weight;
            String word;
            // a complete entry consists of two parts: word|weight;
            // if the weight is missing, fall back to the configured default
            if (line_split.length >= 2) {
                word = line_split[0];
                weight = Integer.parseInt(line_split[1]);
            } else {
                word = lines[i];
                weight = Library_Config.LexiconConfiguratioFile_DefaultWeights;
            }
            List<Term> parse = ToAnalysis.parse(word);
            if (parse.size() > 1) {
                // compound new words go into the segmenter
                UserDefineLibrary.insertWord(word, fileName, weight);
            } else {
                // single words go into the custom word table
                CustomWords.put(word, fileName);
            }
        } catch (Exception ex) {
            ex.printStackTrace();
            System.out.println("警告:词库 " + fileName + " 第 " + (i+1) + " 行数据错误!");
        }
    }
    // record that this lexicon has been loaded
    Lexicons.add(fileName);
    return true;
}
Author: landriesnidis, Project: NSIITA-SemanticMatching, Lines: 63, Source: Lexicon_Config.java
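For reference, a lexicon file in the format this loader expects might look as follows (illustrative entries; the weight after '|' is optional and falls back to the configured default):

自然语言处理|1000
中文分词|800
词向量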
Example 20: tokenize
import org.ansj.splitWord.analysis.ToAnalysis; // import the required package/class
public void tokenize(String input_str) {
    tokens = ToAnalysis.parse(input_str);
    token_iterator = tokens.listIterator();
}
Author: LunarBaseEngin, Project: LunarBase, Lines: 6, Source: TokenizerForSearchEngine.java
Note: the org.ansj.splitWord.analysis.ToAnalysis examples in this article were collected from open-source projects hosted on GitHub, MSDocs, and similar platforms. The snippets were selected from projects contributed by their original authors, who retain the copyright; consult each project's license before redistributing or reusing the code. Do not republish without permission.