• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

Java TextDocument类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Java中de.l3s.boilerpipe.document.TextDocument的典型用法代码示例。如果您正苦于以下问题:Java TextDocument类的具体用法?Java TextDocument怎么用?Java TextDocument使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。



TextDocument类属于de.l3s.boilerpipe.document包,在下文中一共展示了TextDocument类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。

示例1: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Marks short text blocks that match one of {@code PATTERNS_SHORT} as content
 * and tags them with {@code DefaultLabels.ARTICLE_METADATA}.
 *
 * <p>Blocks with more than 10 words are left untouched.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block matched a pattern
 * @throws BoilerpipeProcessingException declared by the signature
 */
public boolean process(TextDocument doc)
		throws BoilerpipeProcessingException {
	boolean anyMatch = false;
	for (TextBlock block : doc.getTextBlocks()) {
		if (block.getNumWords() <= 10) {
			final String blockText = block.getText();
			for (Pattern pattern : PATTERNS_SHORT) {
				if (!pattern.matcher(blockText).find()) {
					continue;
				}
				anyMatch = true;
				block.setIsContent(true);
				block.addLabel(DefaultLabels.ARTICLE_METADATA);
			}
		}
	}
	return anyMatch;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:19,代码来源:ArticleMetadataFilter.java


示例2: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Walks the text blocks from last to first and turns trailing content blocks
 * labelled {@code DefaultLabels.HEADING} into non-content.
 *
 * <p>Non-content blocks are skipped; the scan stops at the first content block
 * that does not carry the heading label.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was switched to non-content
 * @throws BoilerpipeProcessingException declared by the signature
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    final List<TextBlock> blocks = doc.getTextBlocks();
    final ListIterator<TextBlock> reverse = blocks.listIterator(blocks.size());
    boolean demoted = false;

    while (reverse.hasPrevious()) {
        final TextBlock block = reverse.previous();
        if (!block.isContent()) {
            continue;
        }
        if (!block.hasLabel(DefaultLabels.HEADING)) {
            // A content block that is not a heading ends the trailing run.
            break;
        }
        block.setIsContent(false);
        demoted = true;
    }

    return demoted;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:21,代码来源:TrailingHeadlineToBoilerplateFilter.java


示例3: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Merges each text block into its predecessor when {@code equalLabels} reports
 * that both carry the same labels, removing the merged block from the list.
 *
 * @param doc the document whose text blocks are fused in place
 * @return {@code true} if at least one block was merged; {@code false} also
 *         when the document has fewer than two blocks
 * @throws BoilerpipeProcessingException declared by the signature
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    final List<TextBlock> blocks = doc.getTextBlocks();
    if (blocks.size() < 2) {
        return false;
    }

    boolean merged = false;
    TextBlock anchor = blocks.get(0);
    final Iterator<TextBlock> rest = blocks.listIterator(1);
    while (rest.hasNext()) {
        final TextBlock candidate = rest.next();
        if (!equalLabels(anchor.getLabels(), candidate.getLabels())) {
            // Labels differ: the candidate becomes the new merge target.
            anchor = candidate;
            continue;
        }
        anchor.mergeNext(candidate);
        rest.remove();
        merged = true;
    }

    return merged;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:27,代码来源:LabelFusion.java


示例4: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Merges each text block into its predecessor when both report exactly the
 * same text density, removing the merged block from the list.
 *
 * @param doc the document whose text blocks are fused in place
 * @return {@code true} if at least one block was merged; {@code false} also
 *         when the document has fewer than two blocks
 * @throws BoilerpipeProcessingException declared by the signature
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    final List<TextBlock> blocks = doc.getTextBlocks();
    if (blocks.size() < 2) {
        return false;
    }

    boolean merged = false;
    TextBlock current = blocks.get(0);
    final Iterator<TextBlock> rest = blocks.listIterator(1);
    while (rest.hasNext()) {
        final TextBlock following = rest.next();
        if (current.getTextDensity() != following.getTextDensity()) {
            // Densities differ: continue comparing from the following block.
            current = following;
            continue;
        }
        current.mergeNext(following);
        rest.remove();
        merged = true;
    }

    return merged;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:27,代码来源:SimpleBlockFusionProcessor.java


示例5: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Promotes every non-content block that carries at least one of the
 * configured {@code labels} to content.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was switched to content
 * @throws BoilerpipeProcessingException declared by the signature
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {

    boolean promoted = false;

    for (TextBlock block : doc.getTextBlocks()) {
        if (block.isContent()) {
            continue;
        }
        for (String label : labels) {
            if (block.hasLabel(label)) {
                block.setIsContent(true);
                promoted = true;
                // One matching label is enough; move on to the next block.
                break;
            }
        }
    }

    return promoted;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:20,代码来源:LabelToContentFilter.java


示例6: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Demotes every content block whose word count is below {@code minWords}
 * to non-content.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was switched to non-content
 * @throws BoilerpipeProcessingException declared by the signature
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {

    boolean demoted = false;

    for (TextBlock block : doc.getTextBlocks()) {
        if (block.isContent() && block.getNumWords() < minWords) {
            block.setIsContent(false);
            demoted = true;
        }
    }

    return demoted;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:20,代码来源:MinWordsFilter.java


示例7: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Demotes every content block that carries at least one of the configured
 * {@code labels} to non-content.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was switched to non-content
 * @throws BoilerpipeProcessingException declared by the signature
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {

    boolean demoted = false;

    for (TextBlock block : doc.getTextBlocks()) {
        if (!block.isContent()) {
            continue;
        }
        for (String label : labels) {
            if (block.hasLabel(label)) {
                block.setIsContent(false);
                demoted = true;
                // One matching label is enough; move on to the next block.
                break;
            }
        }
    }

    return demoted;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:20,代码来源:LabelToBoilerplateFilter.java


示例8: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Removes every non-content block from the document, except blocks that carry
 * the configured {@code labelToKeep}.
 *
 * <p>When {@code labelToKeep} is {@code null}, all non-content blocks are
 * removed. The previous implementation tested the hard-coded
 * {@code DefaultLabels.TITLE} instead of {@code labelToKeep}, so any other
 * configured label was silently ignored; behaviour is unchanged when
 * {@code labelToKeep} is the title label.
 *
 * @param doc the document whose text block list is pruned in place
 * @return {@code true} if at least one block was removed
 * @throws BoilerpipeProcessingException declared by the signature
 */
public boolean process(TextDocument doc)
		throws BoilerpipeProcessingException {
	List<TextBlock> textBlocks = doc.getTextBlocks();
	boolean hasChanges = false;

	for (Iterator<TextBlock> it = textBlocks.iterator(); it.hasNext();) {
		TextBlock tb = it.next();
		if (tb.isContent()) {
			continue;
		}
		// Keep boilerplate blocks that carry the explicitly protected label.
		if (labelToKeep != null && tb.hasLabel(labelToKeep)) {
			continue;
		}
		it.remove();
		hasChanges = true;
	}

	return hasChanges;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:18,代码来源:BoilerplateBlockFilter.java


示例9: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Demotes every content block for which {@code getNumFullTextWords} returns
 * fewer than {@code minWords} to non-content.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was switched to non-content
 * @throws BoilerpipeProcessingException declared by the signature
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {

    boolean demoted = false;

    for (TextBlock block : doc.getTextBlocks()) {
        if (block.isContent() && getNumFullTextWords(block) < minWords) {
            block.setIsContent(false);
            demoted = true;
        }
    }

    return demoted;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:20,代码来源:MinFulltextWordsFilter.java


示例10: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Marks every block from the detected end of text onwards as non-content.
 *
 * <p>The end of text is the first block labelled
 * {@code DefaultLabels.INDICATES_END_OF_TEXT} that is reached once the running
 * total of full-text words in content blocks (including that block, if it is
 * content) is at least {@code minNumWords}.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if an end of text was found, i.e. at least one block
 *         was set to non-content
 * @throws BoilerpipeProcessingException declared by the signature
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean changes = false;
    boolean pastEnd = false;
    int contentWords = 0;

    for (TextBlock block : doc.getTextBlocks()) {
        final boolean endMarker = block
                .hasLabel(DefaultLabels.INDICATES_END_OF_TEXT);
        if (block.isContent()) {
            contentWords += getNumFullTextWords(block);
        }
        // Once set, the flag stays set for all remaining blocks.
        pastEnd = pastEnd || (endMarker && contentWords >= minNumWords);
        if (pastEnd) {
            block.setIsContent(false);
            changes = true;
        }
    }

    return changes;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:26,代码来源:IgnoreBlocksAfterContentFilter.java


示例11: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Runs the article extraction pipeline on the document, applying each filter
 * in a fixed order.
 *
 * <p>Every stage is always executed; the results are combined with the
 * non-short-circuit {@code |=} so that an earlier {@code true} never skips a
 * later stage.
 *
 * @param doc the document to process in place
 * @return {@code true} if any stage reported a change
 * @throws BoilerpipeProcessingException if one of the stages throws it
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean changed = TerminatingBlocksFinder.INSTANCE.process(doc);
    changed |= new DocumentTitleMatchClassifier(doc.getTitle()).process(doc);
    changed |= NumWordsRulesClassifier.INSTANCE.process(doc);
    changed |= IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.process(doc);
    changed |= TrailingHeadlineToBoilerplateFilter.INSTANCE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1.process(doc);
    changed |= BoilerplateBlockFilter.INSTANCE_KEEP_TITLE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc);
    changed |= KeepLargestBlockFilter.INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc);
    changed |= ExpandTitleToContentFilter.INSTANCE.process(doc);
    changed |= LargeBlockSameTagLevelToContentFilter.INSTANCE.process(doc);
    changed |= ListAtEndFilter.INSTANCE.process(doc);
    return changed;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:19,代码来源:ArticleExtractor.java


示例12: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Returns the article from a document with its basic HTML structure.
 *
 * <p>The document is parsed into a {@link TextDocument}, run through the given
 * extractor, highlighted with an extracting {@code HTMLHighlighter} that
 * outputs the highlighted part only, and finally passed through
 * {@code removeNotAllowedTags}.
 *
 * @param htmlDoc the HTML document to extract the article from
 * @param docUri the URI of the document, used for resolving the relative
 *        anchors in the document to absolute anchors
 * @param extractor the extractor applied to the parsed text document
 * @return the cleaned article HTML, or {@code null} if parsing, extraction or
 *         highlighting throws any exception
 */
public String process(HTMLDocument htmlDoc, URI docUri, final BoilerpipeExtractor extractor) {

    final HTMLHighlighter highlighter = HTMLHighlighter.newExtractingInstance();
    highlighter.setOutputHighlightOnly(true);

    final String highlighted;
    try {
        final TextDocument textDoc =
                new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
        extractor.process(textDoc);
        // A second input source is requested for the highlighting pass.
        final InputSource source = htmlDoc.toInputSource();
        highlighted = highlighter.process(textDoc, source);
    } catch (Exception ex) {
        // Best effort: any failure is reported to the caller as null.
        return null;
    }

    return removeNotAllowedTags(highlighted, docUri);
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:28,代码来源:HtmlArticleExtractor.java


示例13: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Parses the media (picture, video) out of the given document.
 *
 * @param doc document to parse the media out of
 * @param extractor extractor to use
 * @return the media list produced by the {@code process(TextDocument, InputSource)}
 *         overload, or {@code null} if parsing or extraction throws any exception
 */
public List<Media> process(String doc, final BoilerpipeExtractor extractor) {
	final HTMLDocument htmlDoc = new HTMLDocument(doc);

	try {
		final TextDocument textDoc =
				new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
		extractor.process(textDoc);
		// A second input source is requested for the media pass.
		final InputSource source = htmlDoc.toInputSource();
		return process(textDoc, source);
	} catch (Exception e) {
		// Best effort: any failure is reported to the caller as null.
		return null;
	}
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:23,代码来源:MediaExtractor.java


示例14: extractHtml

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Parses HTML from the given stream and stores its title and text on
 * {@code doc}; when {@code boilingEnabled} is set, also stores the content
 * produced by {@code DefaultExtractor}.
 *
 * <p>Input is read through a {@code BoundedInputStream} constructed with
 * {@code maxDocSize}. Every U+FFFF character is stripped from the stored text.
 *
 * @param record the stream to read the HTML from
 * @param doc the target that receives title, text and (optionally) boiled text
 * @throws TextExtractionException wrapping any SAX, boilerpipe processing,
 *         illegal-argument or array-index failure raised while extracting
 */
private void extractHtml(InputStream record, Document doc) throws TextExtractionException {
    try {
        BoundedInputStream bounded = new BoundedInputStream(record, maxDocSize);
        InputSource source = new InputSource(bounded);
        TextDocument parsed = new BoilerpipeSAXInput(source).getTextDocument();

        doc.setTitle(parsed.getTitle());
        String fullText = parsed.getText(true, true);
        doc.setText(fullText.replace("\uFFFF", ""));

        if (!boilingEnabled) {
            return;
        }
        DefaultExtractor.INSTANCE.process(parsed);
        String boiled = parsed.getContent();
        doc.setBoiled(boiled.replace("\uFFFF", ""));
    } catch (SAXException | BoilerpipeProcessingException | IllegalArgumentException | ArrayIndexOutOfBoundsException e) {
        throw new TextExtractionException(e);
    }
}
 
开发者ID:nla,项目名称:bamboo,代码行数:15,代码来源:TextExtractor.java


示例15: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher} and processes the
 * retrieved HTML using the specified {@link BoilerpipeExtractor}.
 *
 * <p>The resulting {@link TextDocument} and a fresh input source of the
 * original HTML document are handed to the
 * {@code process(TextDocument, InputSource)} overload.
 *
 * @param url the URL to fetch
 * @param extractor the extractor applied to the parsed document
 * @return A List of enclosed links
 * @throws IOException declared by the signature (presumably raised while
 *         fetching; confirm against {@code HTMLFetcher})
 * @throws BoilerpipeProcessingException declared by the signature
 * @throws SAXException declared by the signature
 */
public List<String> process(final URL url, final BoilerpipeExtractor extractor)
throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);

    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);

    // A second input source is requested for the link pass.
    final InputSource source = fetched.toInputSource();
    return process(textDoc, source);
}
 
开发者ID:asimihsan,项目名称:handytrowel,代码行数:22,代码来源:LinkExtractor.java


示例16: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Copies the labels of each block, prefixed with {@code labelPrefix}, onto the
 * block directly below it, walking the block list from the end towards the
 * start.
 *
 * <p>NOTE(review): the walk stops before the first block, so the first block's
 * labels are never copied to the second block, and a two-block document is
 * left untouched. This mirrors the original loop bounds; confirm it is
 * intended.
 *
 * @param doc the document whose text blocks are labelled in place
 * @return {@code true} if at least one visited block had a non-empty label set
 * @throws BoilerpipeProcessingException declared by the signature
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    final List<TextBlock> blocks = doc.getTextBlocks();
    final int total = blocks.size();
    if (total < 2) {
        return false;
    }

    boolean changes = false;
    final ListIterator<TextBlock> cursor = blocks.listIterator(total);
    TextBlock lower = cursor.previous();

    // Visits the blocks at indices total-2 down to 1, each paired with the block below it.
    for (int pending = total - 2; pending > 0; pending--) {
        final TextBlock upper = cursor.previous();

        final Set<String> upperLabels = upper.getLabels();
        if (upperLabels != null && !upperLabels.isEmpty()) {
            for (String label : upperLabels) {
                lower.addLabel(labelPrefix+label);
            }
            changes = true;
        }
        lower = upper;
    }

    return changes;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:36,代码来源:AddPrecedingLabelsFilter.java


示例17: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Labels the first text block whose normalized text is contained in
 * {@code potentialTitles} with {@code DefaultLabels.TITLE}.
 *
 * <p>Normalization replaces non-breaking spaces with plain spaces, drops
 * apostrophes, trims and lower-cases the text. If that form is not found, a
 * second form with everything matched by {@code PAT_REMOVE_CHARACTERS}
 * removed (and trimmed again) is tried.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if a block was labelled as title; {@code false} if none
 *         matched or {@code potentialTitles} is {@code null}
 * @throws BoilerpipeProcessingException declared by the signature
 */
public boolean process(TextDocument doc)
		throws BoilerpipeProcessingException {
	if (potentialTitles == null) {
		return false;
	}

	for (final TextBlock block : doc.getTextBlocks()) {
		final String normalized = block.getText()
				.replace('\u00a0', ' ')
				.replace("'", "")
				.trim()
				.toLowerCase();

		boolean isTitle = potentialTitles.contains(normalized);
		if (!isTitle) {
			// Second attempt with the removable characters stripped out.
			final String stripped =
					PAT_REMOVE_CHARACTERS.matcher(normalized).replaceAll("").trim();
			isTitle = potentialTitles.contains(stripped);
		}

		if (isTitle) {
			block.addLabel(DefaultLabels.TITLE);
			// Only the first matching block is labelled.
			return true;
		}
	}
	return false;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:31,代码来源:DocumentTitleMatchClassifier.java


示例18: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Promotes list-item blocks that follow a very-likely-content block to content.
 *
 * <p>A content block labelled {@code DefaultLabels.VERY_LIKELY_CONTENT} sets
 * the reference tag level. Any other block is promoted when its tag level is
 * greater than the reference, it is labelled both
 * {@code DefaultLabels.MIGHT_BE_CONTENT} and {@code DefaultLabels.LI}, and its
 * link density is 0. Any other block that does not qualify resets the
 * reference to {@code Integer.MAX_VALUE}.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was switched to content
 * @throws BoilerpipeProcessingException declared by the signature
 */
public boolean process(final TextDocument doc)
		throws BoilerpipeProcessingException {

	boolean promoted = false;
	int referenceLevel = Integer.MAX_VALUE;

	for (TextBlock block : doc.getTextBlocks()) {
		if (block.isContent()
				&& block.hasLabel(DefaultLabels.VERY_LIKELY_CONTENT)) {
			referenceLevel = block.getTagLevel();
			continue;
		}

		final boolean nestedListItem = block.getTagLevel() > referenceLevel
				&& block.hasLabel(DefaultLabels.MIGHT_BE_CONTENT)
				&& block.hasLabel(DefaultLabels.LI)
				&& block.getLinkDensity() == 0;

		if (nestedListItem) {
			block.setIsContent(true);
			promoted = true;
		} else {
			// The run of list items is broken; forget the reference level.
			referenceLevel = Integer.MAX_VALUE;
		}
	}

	return promoted;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:28,代码来源:ListAtEndFilter.java


示例19: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Fuses blocks into a preceding content block.
 *
 * <p>A block is merged into the block before it (and removed from the list)
 * when the preceding block is content, the block's link density is below 0.56
 * and the block is not labelled {@code DefaultLabels.STRICTLY_NOT_CONTENT}.
 * Passes over the list are repeated until a pass merges nothing.
 *
 * <p>Two defects of the previous version are fixed here:
 * <ul>
 * <li>The "previous block" is now reset to the first block at the start of
 * every pass. Before, it kept pointing at whatever block the prior pass ended
 * on, so a later pass could merge a block into a stale block, or into itself,
 * and then remove it from the list.</li>
 * <li>The return value now reports whether any merge happened instead of
 * always being {@code true}.</li>
 * </ul>
 *
 * @param doc the document whose text blocks are fused in place
 * @return {@code true} if at least one block was merged; {@code false} also
 *         when the document has fewer than two blocks
 * @throws BoilerpipeProcessingException declared by the signature
 */
public boolean process(TextDocument doc)
		throws BoilerpipeProcessingException {
	List<TextBlock> textBlocks = doc.getTextBlocks();
	if (textBlocks.size() < 2) {
		return false;
	}

	boolean anyChanges = false;
	boolean changes;
	do {
		changes = false;
		// Index 0 is never removed, so it is always a valid starting point.
		TextBlock prevBlock = textBlocks.get(0);
		for (ListIterator<TextBlock> it = textBlocks.listIterator(1); it
				.hasNext();) {
			TextBlock block = it.next();

			if (prevBlock.isContent()
					&& block.getLinkDensity() < 0.56
					&& !block.hasLabel(DefaultLabels.STRICTLY_NOT_CONTENT)) {

				prevBlock.mergeNext(block);
				it.remove();
				changes = true;
			} else {
				prevBlock = block;
			}
		}
		anyChanges |= changes;
	} while (changes);

	return anyChanges;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:32,代码来源:ContentFusion.java


示例20: process

import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Promotes a non-content block to content when the blocks immediately before
 * and after it are both content and {@code cond} accepts it.
 *
 * <p>The list is examined in steps of two: the triples checked are the blocks
 * at indices (0, 1, 2), (2, 3, 4), and so on, so only blocks at odd indices
 * are candidates for promotion.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was switched to content;
 *         {@code false} also when the document has fewer than three blocks
 * @throws BoilerpipeProcessingException declared by the signature
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {

    final List<TextBlock> blocks = doc.getTextBlocks();
    if (blocks.size() < 3) {
        return false;
    }

    TextBlock before = blocks.get(0);
    TextBlock middle = blocks.get(1);
    boolean promoted = false;

    final Iterator<TextBlock> rest = blocks.listIterator(2);
    while (rest.hasNext()) {
        final TextBlock after = rest.next();

        final boolean surrounded =
                !middle.isContent() && before.isContent() && after.isContent();
        if (surrounded && cond.meetsCondition(middle)) {
            middle.setIsContent(true);
            promoted = true;
        }

        // The trailing block of this triple becomes the leading block of the next.
        before = after;
        if (rest.hasNext()) {
            middle = rest.next();
        }
    }

    return promoted;
}
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:29,代码来源:SurroundingToContentFilter.java



注:本文中的de.l3s.boilerpipe.document.TextDocument类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Java LocatorEx类代码示例发布时间:2022-05-23
下一篇:
Java PacketTags类代码示例发布时间:2022-05-23
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap