• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

Java ToAnalysis类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Java中org.ansj.splitWord.analysis.ToAnalysis的典型用法代码示例。如果您正苦于以下问题:Java ToAnalysis类的具体用法?Java ToAnalysis怎么用?Java ToAnalysis使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。



ToAnalysis类属于org.ansj.splitWord.analysis包,在下文中一共展示了ToAnalysis类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。

示例1: main

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
public static void main(String[] args) throws IOException {
	// Exact-mode segmentation demo: split a sample sentence into terms and print them.
	List<Term> parse = ToAnalysis.parse("中华人民 共和国 成立了 ");
	System.out.println(parse);

	// Index-oriented segmentation of a second sample; result is currently unused
	// (the print below is intentionally left commented out).
	List<Term> parse1 = IndexAnalysis.parse("你吃过饭了没有!!!!!吃过无妨论文");
	//System.out.println(parse1);

	String text11 = "ZW321282050000000325";

	// Tokenize via the Lucene wrapper and print each emitted term.
	Tokenizer tokenizer = new AnsjTokenizer(new StringReader(text11), 0, true);
	CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
	OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
	PositionIncrementAttribute positionIncrementAtt =
			tokenizer.addAttribute(PositionIncrementAttribute.class);

	tokenizer.reset();
	while (tokenizer.incrementToken()) {
		// termAtt.toString() already yields a String; the original wrapped it
		// in a redundant new String(...).
		System.out.print(termAtt.toString() + " ");
		//System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
		//System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
	}
	tokenizer.close();
}
 
开发者ID:dimensoft,项目名称:improved-journey,代码行数:27,代码来源:TestAnsj.java


示例2: checkTextContent

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Checks cleaned text content against a sensitive-word list.
 *
 * @param userId  id of the posting user (not used in the check itself)
 * @param content raw (possibly HTML) content to inspect
 * @return 0 when any segmented term matches a sensitive word, 1 otherwise
 * @throws IOException if the sensitive-word file cannot be read
 */
public int checkTextContent(int userId, String content) throws IOException {
    HashSet<String> sensitiveWords = new HashSet<String>();
    // try-with-resources: the original leaked the stream/reader chain
    // (never closed, even on exception).
    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(source), Charset.forName("UTF-8")))) {
        String line;
        while ((line = br.readLine()) != null) {
            // Guard: substring(0, length-1) threw on empty lines in the original.
            // NOTE(review): dropping the last character looks like it strips a
            // trailing delimiter in the word file — confirm the file format.
            if (!line.isEmpty())
                sensitiveWords.add(line.substring(0, line.length() - 1));
        }
    }

    // Strip all HTML, segment, and scan for any sensitive term.
    Result result = ToAnalysis.parse(Jsoup.clean(content, Whitelist.none()));
    List<Term> termList = result.getTerms();
    for (Term term : termList) {
        if (sensitiveWords.contains(term.getName()))
            return 0;
    }
    return 1;
}
 
开发者ID:qinjr,项目名称:TeamNote,代码行数:19,代码来源:QualityUtilImpl.java


示例3: doPost

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
public void doPost(HttpServletRequest request, HttpServletResponse response) 
		throws ServletException, IOException {

	// Both request and response are UTF-8 encoded.
	request.setCharacterEncoding("UTF-8");
	response.setCharacterEncoding("UTF-8");

	// Required parameter: the raw text to segment.
	String text = request.getParameter("text");

	// Run exact-mode segmentation and write the terms back to the client.
	List<Term> terms = ToAnalysis.parse(text);

	PrintWriter out = response.getWriter();
	out.print("分词结果为:\n" + terms);
	out.flush();
	out.close();
}
 
开发者ID:landriesnidis,项目名称:NSIITA-SemanticMatching,代码行数:17,代码来源:Ansj_seg.java


示例4: tokenizeDocxFile

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Parses a .docx file, segments its text, and collects Chinese words into wordsCN.
 * Tokens matching a stop word, or starting with an ASCII letter/digit, are skipped.
 *
 * @param filePath path of the document to tokenize
 */
private static void tokenizeDocxFile(String filePath) {
    File file = new File(filePath);
    DocumentInfo doc = DocumentParser.parseFileToDocumentInfo(file);
    if (doc instanceof WordDocumentInfo) {
        String content = ((WordDocumentInfo) doc).getDocStr();
        Result terms = ToAnalysis.parse(content);
        for (int i = 0; i < terms.size(); i++) {
            String words = terms.get(i).getName();
            // Guard: charAt(0) below threw on an empty token in the original.
            if (words.isEmpty()) continue;
            boolean filtered = false;
            for (String stopToken : stopTokens) {
                if (words.equals(stopToken)) { filtered = true; break; }
            }
            // Drop tokens that start with an ASCII letter or digit
            // (keeps only Chinese-leading words).
            char firstLetter = words.charAt(0);
            if ((firstLetter >= 'A' && firstLetter <= 'Z') ||
                    (firstLetter >= 'a' && firstLetter <= 'z') ||
                    (firstLetter >= '0' && firstLetter <= '9'))
                filtered = true;
            if (filtered) continue;
            wordsCN.add(words);
        }
    }
    else System.out.println("Not a docx file");
}
 
开发者ID:linzeqipku,项目名称:SnowGraph,代码行数:23,代码来源:WordSegmenter.java


示例5: findEntities

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
@Override
public Entities findEntities(String sentence, boolean allowDuplicated) {
    Entities entities = new Entities(allowDuplicated);

    // Walk every term from exact-mode segmentation and bucket named
    // entities by their part-of-speech (nature) prefix.
    for (Term term : ToAnalysis.parse(sentence).getTerms()) {
        String name = term.getName();
        if (name.length() < 2) {
            continue; // single characters are too ambiguous to treat as entities
        }
        String nature = term.getNatureStr();
        if (nature.startsWith("nr")) {
            entities.addPerson(name);
        } else if (nature.startsWith("nt")) {
            entities.addOrganization(name);
        } else if (nature.startsWith("ns")) {
            // Place names ending in 大学/学院 are treated as organizations.
            if (name.endsWith("大学") || name.endsWith("学院")) {
                entities.addOrganization(name);
            } else {
                entities.addSpace(name);
            }
        }
    }
    return entities;
}
 
开发者ID:iamxiatian,项目名称:wikit,代码行数:24,代码来源:AnsjSegment.java


示例6: main

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
public static void main(String[] args) throws IOException {
	// Exact-mode segmentation demo.
	List<Term> parse = ToAnalysis.parse("天天向上,媒体打打。《回家真好》");
	System.out.println(parse);

	// Tokenize the same text through the Lucene wrapper, printing
	// term text, offsets, and position increments.
	Tokenizer tokenizer = new AnsjTokenizer(new StringReader("天天向上,媒体打打。《回家真好》"), 0, true);
	CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
	OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
	PositionIncrementAttribute positionIncrementAtt =
			tokenizer.addAttribute(PositionIncrementAttribute.class);

	// NOTE(review): Lucene 4+ requires tokenizer.reset() before incrementToken();
	// the sibling demo in this collection calls reset() — confirm the targeted
	// Lucene version before adding it here.
	while (tokenizer.incrementToken()) {
		// toString() already returns a String; the original's new String(...) was redundant.
		System.out.print(termAtt.toString());
		System.out.print(offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "-");
		System.out.print(positionIncrementAtt.getPositionIncrement() + "/");
	}
	tokenizer.close();
}
 
开发者ID:lgnlgn,项目名称:ansj4solr,代码行数:21,代码来源:TestAnsj.java


示例7: testAnsjAnalyzer

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
@Test
public void testAnsjAnalyzer() throws IOException {
	final String input = "我在首都机场虹桥路滑旱冰!玩的很Happy,很Hi!";
	final Version version = Version.LUCENE_36;

	// Index-time analysis (index-oriented segmentation), with and without stop words.
	System.out.print("Ansj索引时采用面向索引的分词:" + input);
	displayTokensWithFullDetails(new AnsjAnalyzer(version), input);
	System.out.print("索引分词时过滤停用词");
	displayTokensWithFullDetails(new AnsjAnalyzer(version, null, stopwords), input);

	System.out.println("------------------------------------------------------------");

	// Query-time analysis (exact segmentation via ToAnalysis), with and without stop words.
	System.out.print("Ansj查询时采用精准分词:" + input);
	displayTokensWithFullDetails(new AnsjAnalyzer(version, ToAnalysis.class), input);
	System.out.print("查询分词时过滤停用词");
	displayTokensWithFullDetails(new AnsjAnalyzer(version, ToAnalysis.class, stopwords), input);
}
 
开发者ID:flash0729,项目名称:ansj-seg-for-lucene3,代码行数:17,代码来源:AnsjAnalysisTest.java


示例8: demo

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Segments the given string and returns the word text of every term, in order.
 *
 * @param strToParse text to segment
 * @return list of segmented words
 */
public static ArrayList<String> demo(String strToParse) {
    ArrayList<String> ret = new ArrayList<>();
    Result terms = ToAnalysis.parse(strToParse);
    for (int i = 0; i < terms.size(); i++) {
        // The original also fetched getNatureStr() (the POS tag) into an unused
        // local; that dead code has been removed.
        ret.add(terms.get(i).getName());
    }
    return ret;
}
 
开发者ID:linzeqipku,项目名称:SnowGraph,代码行数:15,代码来源:WordSegmenter.java


示例9: Seg

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Segments a sentence with exact-mode analysis, filtering out punctuation.
 *
 * @param sentence the sentence to segment
 * @return the segmentation result as a list of terms
 */
public static List<Term> Seg(String sentence) {
    // Recognition filter that discards common punctuation tokens.
    FilterRecognition filter = new FilterRecognition();
    filter.insertStopWord(",", " ", ".", ",", "。", ":", ":", "'", "‘", "’", " ", "“", "”", "《", "》", "[", "]", "-");
    Result filtered = ToAnalysis.parse(sentence).recognition(filter);
    return filtered.getTerms();
}
 
开发者ID:jsksxs360,项目名称:Word2Vec,代码行数:12,代码来源:Segment.java


示例10: realySplit

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
private ArrayList<String> realySplit(String strbuf) {
    // Segment the buffer and keep only tokens containing Chinese characters.
    ArrayList<String> chineseWords = new ArrayList<>();
    for (Term term : ToAnalysis.parse(strbuf)) {
        if (StringUtils.RegexUtils.hasChinese(term.toString())) {
            chineseWords.add(term.getName());
        }
    }
    return chineseWords;
}
 
开发者ID:William-Hai,项目名称:CorpusSpider,代码行数:12,代码来源:AnjsSplitWordsUtils.java


示例11: containKeyword

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Returns true when the text contains any token of the segmented keyword.
 */
public static boolean containKeyword(String text, String keyword) throws Exception {
	// Segment the keyword; a match on any of its tokens is enough.
	for (Term t : ToAnalysis.parse(keyword)) {
		if (text.contains(t.getName())) {
			return true;
		}
	}
	return false;
}
 
开发者ID:YufangWoo,项目名称:news-crawler,代码行数:13,代码来源:HtmlContentExtractor.java


示例12: segment

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
@Override
public List<String> segment(String sentence) throws SegmentException {
    // Exact-mode segmentation; collect each term's surface form in order.
    List<String> names = new ArrayList<String>();
    for (Term t : ToAnalysis.parse(sentence).getTerms()) {
        names.add(t.getName());
    }
    return names;
}
 
开发者ID:iamxiatian,项目名称:wikit,代码行数:10,代码来源:AnsjSegment.java


示例13: tag

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
@Override
public List<SegWord> tag(String sentence) throws SegmentException {
    // Pair each segmented word with its part-of-speech (nature) tag.
    List<SegWord> tagged = new ArrayList<SegWord>();
    for (Term t : ToAnalysis.parse(sentence).getTerms()) {
        tagged.add(new SegWord(t.getName(), t.getNatureStr()));
    }
    return tagged;
}
 
开发者ID:iamxiatian,项目名称:wikit,代码行数:12,代码来源:AnsjSegment.java


示例14: tokenizeTerm

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
@Override
public HashMap<String, TermScore> tokenizeTerm(String input_str) {
	// Segment the input, cache the result on the instance, and accumulate a
	// per-term score for every distinct token of length >= 2.
	tokens = ToAnalysis.parse(input_str);
	token_iterator = tokens.listIterator();

	HashMap<String, TermScore> hash = new HashMap<String, TermScore>();
	while (token_iterator.hasNext()) {
		String name = token_iterator.next().getName();
		if (name.length() < 2) {
			continue; // skip single-character tokens
		}
		TermScore existing = hash.get(name);
		if (existing == null) {
			// First sighting starts at score 0; repeats increment it below.
			hash.put(name, new TermScore(name, 0));
		} else {
			// Bump in place — the entry is already stored, so no re-put is needed.
			existing.setScore(existing.getScore() + 1);
		}
	}
	return hash;
}
 
开发者ID:LunarBaseEngin,项目名称:LunarBase,代码行数:30,代码来源:TokenizerForSearchEngine.java


示例15: guessNature

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Guesses the part-of-speech nature of a word using suffix rules.
 *
 * Walks SUFFIX_FOREST (a suffix trie) backwards from the last character of
 * {@code word}, remembering the nature attached to the longest matched suffix.
 * Trie node status 2 means "match, may extend further"; status 3 means
 * "terminal match, stop".
 *
 * @param word the word whose nature is to be guessed
 * @return TermNatures.NT / NS / NR / NRF when a rule fires, otherwise NW (new word)
 */
public static TermNatures guessNature(String word) {
    String nature = null;
    SmartForest<String[]> smartForest = SUFFIX_FOREST;
    int len = 0;
    // Traverse the trie from the end of the word toward the front.
    for (int i = word.length() - 1; i >= 0; i--) {
        smartForest = smartForest.get(word.charAt(i));
        if (smartForest == null) {
            break;
        }
        len++;
        if (smartForest.getStatus() == 2) {
            // Matched suffix that may extend further; remember its nature.
            nature = smartForest.getParam()[0];
        } else if (smartForest.getStatus() == 3) {
            // Terminal suffix match; stop searching.
            nature = smartForest.getParam()[0];
            break;
        }
    }

    if ("nt".equals(nature) && (len > 1 || word.length() > 3)) {
        // "nt" = organization name; require a multi-char suffix or a longer word.
        return TermNatures.NT;
    } else if ("ns".equals(nature)) {
        // "ns" = place name.
        return TermNatures.NS;
    } else if (word.length() < 5) {
        // Short unknown word: re-segment and see whether any piece tags as a
        // person name ("nr").
        Result parse = ToAnalysis.parse(word);
        for (Term term : parse.getTerms()) {
            if ("nr".equals(term.getNatureStr())) {
                return TermNatures.NR;
            }
        }
    } else if (ForeignPersonRecognition.isFName(word)) {
        // Long word matching the foreign-person-name recognizer.
        return TermNatures.NRF;
    }

    // Fallback: treat as a new/unknown word.
    return TermNatures.NW;
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:42,代码来源:NatureRecognition.java


示例16: createComponents

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
@Override
protected TokenStreamComponents createComponents(String fieldName,
		Reader reader) {
	// Source tokenizer backed by ansj's exact-mode (ToAnalysis) segmentation.
	final Tokenizer source = new AnsjTokenizer(reader, new ToAnalysis(reader));

	// Only wrap with a StopFilter when a stop-word set was configured.
	if (stopwords.isEmpty()) {
		return new TokenStreamComponents(source);
	}
	return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
}
 
开发者ID:flash0729,项目名称:ansj-seg-for-lucene3,代码行数:15,代码来源:AnsjSearchAnalyzer.java


示例17: contextInitialized

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
public void contextInitialized(ServletContextEvent arg0) {

	// Startup banner.
	System.out.println("\nChatBots初始化加载中...\n");

	// Load the command phrase set.
	System.out.println("正在导入短语数据...");
	Phrases_Config.Init();
	System.out.println("短语导入完毕!\n");

	// Load the lexicon.
	System.out.println("正在导入词库数据...");
	Lexicon_Config.Init();
	System.out.println("词库导入完毕!\n");

	// Load the language pack.
	System.out.println("正在导入语言包...");
	Language_Config.Init();
	System.out.println("语言包导入完毕!\n");

	// Warm up the ansj segmenter with a single parse call so the first
	// real request does not pay the dictionary-loading cost.
	System.out.println("正在初始化分词系统...");
	ToAnalysis.parse("初始化ANSJ分词系统,当前使用分词模式:精准分词");
	System.out.println("分词系统初始化完成!\n");

	// Start the socket server on its own thread.
	System.out.println("正在启动Socket服务...");
	new Thread(new Runnable() {
		@Override
		public void run() {
			new SocketServer(9876).start();
		}
	}).start();

	// Start the socket client on its own thread.
	System.out.println("正在启动Socket客户端...");
	new Thread(new Runnable() {
		@Override
		public void run() {
			new SocketClient();
		}
	}).start();
}
 
开发者ID:landriesnidis,项目名称:NSIITA-SemanticMatching,代码行数:47,代码来源:Init.java


示例18: main

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
public static void main(String[] args) {
	// Insert a custom word (word, nature/source tag, frequency) into the user dictionary.
	UserDefineLibrary.insertWord("ansj中文分词", "教师.list", 1000);
	List<Term> terms = ToAnalysis.parse("我觉得Ansj中文分词是一个不错的系统!我是王婆!");
	System.out.println("增加新词例子:" + terms);

	// Remove it again — only user-defined dictionary entries can be removed.
	UserDefineLibrary.removeWord("ansj中文分词");
	terms = ToAnalysis.parse("我觉得ansj中文分词是一个不错的系统!我是王婆!");
	System.out.println("删除用户自定义词典例子:" + terms);
}
 
开发者ID:landriesnidis,项目名称:NSIITA-SemanticMatching,代码行数:11,代码来源:Demo.java


示例19: loadExistingFileFromLexicon

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
/**
 * Loads an already-existing file into the lexicon.
 *
 * Each line of the file is expected to be "word|weight"; when the weight is
 * missing, a configured default weight is used. Multi-term words are inserted
 * into the segmenter's user dictionary; single-term words go into the custom
 * word table. Files are only loaded once.
 *
 * @param fileName name of the lexicon file to load
 * @return true when the file was loaded (or already loaded), false on read failure
 */
public static boolean loadExistingFileFromLexicon(String fileName){
	
	// Skip files whose lexicon has already been loaded.
	if(Lexicons.contains(fileName)){
		return true;
	}
	
	// Directory that holds the lexicon files.
	String Path = Library_Config.Lexicon_Path;
			
	// Read the file contents; a null first line signals a read failure.
	String[] lines = new RWfile_Util(Path).ReadFile(fileName);
	if(lines[0] == null){
		System.out.println("词库 " + fileName + " 导入失败!");
		return false;
	}
	
	// Import every dictionary entry.
	for(int i=0;i<lines.length;i++){
		try{
			
			// Split the line on '|' to separate word and weight.
			String[] line_split = lines[i].split("\\|");
			int weight;
			String word;
			
			// A complete entry is "word|weight"; when the weight part is
			// missing, fall back to the configured default weight.
			if(line_split.length >= 2){
				word = line_split[0];
				weight = Integer.parseInt(line_split[1]);
			}else{
				word = lines[i];
				weight = Library_Config.LexiconConfiguratioFile_DefaultWeights;
			}

			// If segmentation splits the word into several terms it is a
			// compound new word for the segmenter; otherwise store it as a
			// custom word entry.
			List<Term> parse = ToAnalysis.parse(word);
			if(parse.size() > 1){
				UserDefineLibrary.insertWord(word, fileName, weight);
			}else{
				CustomWords.put(word, fileName);
			}
			
		}catch(Exception ex){
			// Report the bad line (1-based) but keep importing the rest.
			ex.printStackTrace();
			System.out.println("警告:词库 " + fileName + " 第 " + (i+1) + " 行数据错误!");
		}
	}
	
	// Record that this lexicon has been loaded.
	Lexicons.add(fileName);
	return true;
}
 
开发者ID:landriesnidis,项目名称:NSIITA-SemanticMatching,代码行数:63,代码来源:Lexicon_Config.java


示例20: tokenize

import org.ansj.splitWord.analysis.ToAnalysis; //导入依赖的package包/类
public void tokenize(String input_str) {
	// Cache the parse result on the instance and position an iterator at its start.
	this.tokens = ToAnalysis.parse(input_str);
	this.token_iterator = tokens.listIterator();
}
 
开发者ID:LunarBaseEngin,项目名称:LunarBase,代码行数:6,代码来源:TokenizerForSearchEngine.java



注:本文中的org.ansj.splitWord.analysis.ToAnalysis类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Java TypeMismatchException类代码示例发布时间:2022-05-21
下一篇:
Java MushroomCow类代码示例发布时间:2022-05-21
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap