本文整理汇总了Java中de.l3s.boilerpipe.document.TextDocument类的典型用法代码示例。如果您正苦于以下问题:Java TextDocument类的具体用法?Java TextDocument怎么用?Java TextDocument使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
TextDocument类属于de.l3s.boilerpipe.document包,在下文中一共展示了TextDocument类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Marks short blocks whose text matches one of {@code PATTERNS_SHORT} as content
 * and labels them {@code DefaultLabels.ARTICLE_METADATA}.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one pattern matched a block
 * @throws BoilerpipeProcessingException declared by the signature; not thrown directly here
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean modified = false;
    for (TextBlock block : doc.getTextBlocks()) {
        // Only blocks of at most 10 words are candidates.
        if (block.getNumWords() <= 10) {
            final String blockText = block.getText();
            for (Pattern pattern : PATTERNS_SHORT) {
                if (!pattern.matcher(blockText).find()) {
                    continue;
                }
                modified = true;
                block.setIsContent(true);
                block.addLabel(DefaultLabels.ARTICLE_METADATA);
            }
        }
    }
    return modified;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:19,代码来源:ArticleMetadataFilter.java
示例2: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Walks the blocks from last to first and flips content blocks labelled
 * {@code DefaultLabels.HEADING} to non-content, stopping at the first content
 * block that does not carry that label.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was switched to non-content
 * @throws BoilerpipeProcessingException declared by the signature; not thrown directly here
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    final List<TextBlock> blocks = doc.getTextBlocks();
    final ListIterator<TextBlock> cursor = blocks.listIterator(blocks.size());
    boolean modified = false;
    while (cursor.hasPrevious()) {
        final TextBlock block = cursor.previous();
        if (!block.isContent()) {
            // Non-content blocks are skipped without ending the scan.
            continue;
        }
        if (!block.hasLabel(DefaultLabels.HEADING)) {
            break;
        }
        block.setIsContent(false);
        modified = true;
    }
    return modified;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:21,代码来源:TrailingHeadlineToBoilerplateFilter.java
示例3: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Fuses each block into its predecessor when {@code equalLabels} reports that
 * both carry equal label sets; the fused block is removed from the list.
 *
 * @param doc the document whose text blocks are fused in place
 * @return {@code true} if at least one block was merged and removed
 * @throws BoilerpipeProcessingException declared by the signature; not thrown directly here
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    final List<TextBlock> blocks = doc.getTextBlocks();
    if (blocks.size() < 2) {
        return false;
    }
    boolean merged = false;
    TextBlock anchor = blocks.get(0);
    final Iterator<TextBlock> cursor = blocks.listIterator(1);
    while (cursor.hasNext()) {
        final TextBlock candidate = cursor.next();
        if (!equalLabels(anchor.getLabels(), candidate.getLabels())) {
            // Labels differ: the candidate becomes the new merge target.
            anchor = candidate;
            continue;
        }
        anchor.mergeNext(candidate);
        cursor.remove();
        merged = true;
    }
    return merged;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:27,代码来源:LabelFusion.java
示例4: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Fuses each block into its predecessor when both report exactly the same
 * text density; the fused block is removed from the list.
 *
 * @param doc the document whose text blocks are fused in place
 * @return {@code true} if at least one block was merged and removed
 * @throws BoilerpipeProcessingException declared by the signature; not thrown directly here
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    final List<TextBlock> blocks = doc.getTextBlocks();
    if (blocks.size() < 2) {
        return false;
    }
    boolean merged = false;
    TextBlock anchor = blocks.get(0);
    final Iterator<TextBlock> cursor = blocks.listIterator(1);
    while (cursor.hasNext()) {
        final TextBlock candidate = cursor.next();
        // Exact equality comparison of the two density values, as in the original.
        if (anchor.getTextDensity() == candidate.getTextDensity()) {
            anchor.mergeNext(candidate);
            cursor.remove();
            merged = true;
        } else {
            anchor = candidate;
        }
    }
    return merged;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:27,代码来源:SimpleBlockFusionProcessor.java
示例5: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Promotes every non-content block that carries at least one of the configured
 * {@code labels} to content.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was switched to content
 * @throws BoilerpipeProcessingException declared by the signature; not thrown directly here
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean modified = false;
    for (TextBlock block : doc.getTextBlocks()) {
        if (block.isContent()) {
            continue;
        }
        for (String candidateLabel : labels) {
            if (block.hasLabel(candidateLabel)) {
                block.setIsContent(true);
                modified = true;
                // One matching label is enough; move on to the next block.
                break;
            }
        }
    }
    return modified;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:20,代码来源:LabelToContentFilter.java
示例6: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Demotes content blocks with fewer than {@code minWords} words to non-content.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was switched to non-content
 * @throws BoilerpipeProcessingException declared by the signature; not thrown directly here
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean modified = false;
    for (TextBlock block : doc.getTextBlocks()) {
        if (block.isContent() && block.getNumWords() < minWords) {
            block.setIsContent(false);
            modified = true;
        }
    }
    return modified;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:20,代码来源:MinWordsFilter.java
示例7: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Demotes every content block that carries at least one of the configured
 * {@code labels} to non-content.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was switched to non-content
 * @throws BoilerpipeProcessingException declared by the signature; not thrown directly here
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean modified = false;
    for (TextBlock block : doc.getTextBlocks()) {
        if (!block.isContent()) {
            continue;
        }
        for (String candidateLabel : labels) {
            if (block.hasLabel(candidateLabel)) {
                block.setIsContent(false);
                modified = true;
                // One matching label is enough; move on to the next block.
                break;
            }
        }
    }
    return modified;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:20,代码来源:LabelToBoilerplateFilter.java
示例8: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Removes non-content blocks from the document, sparing blocks that carry
 * {@code labelToKeep} when one is configured.
 *
 * <p>The previous version only checked whether {@code labelToKeep} was non-null
 * and then always tested for {@code DefaultLabels.TITLE}, so any other configured
 * label was ignored. The configured label is now the one that is tested.
 * NOTE(review): this assumes {@code labelToKeep} is a {@code String} label, as
 * accepted by {@code TextBlock.hasLabel}; confirm against the field declaration.
 *
 * @param doc the document whose text blocks are filtered in place
 * @return {@code true} if at least one block was removed
 * @throws BoilerpipeProcessingException declared by the signature; not thrown directly here
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    List<TextBlock> textBlocks = doc.getTextBlocks();
    boolean hasChanges = false;
    for (Iterator<TextBlock> it = textBlocks.iterator(); it.hasNext();) {
        TextBlock tb = it.next();
        if (tb.isContent()) {
            continue;
        }
        // With no label configured every non-content block goes; otherwise
        // only those that do not carry the configured label.
        if (labelToKeep == null || !tb.hasLabel(labelToKeep)) {
            it.remove();
            hasChanges = true;
        }
    }
    return hasChanges;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:18,代码来源:BoilerplateBlockFilter.java
示例9: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Demotes content blocks for which {@code getNumFullTextWords} returns fewer
 * than {@code minWords} to non-content.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was switched to non-content
 * @throws BoilerpipeProcessingException declared by the signature; not thrown directly here
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean modified = false;
    for (TextBlock block : doc.getTextBlocks()) {
        if (block.isContent() && getNumFullTextWords(block) < minWords) {
            block.setIsContent(false);
            modified = true;
        }
    }
    return modified;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:20,代码来源:MinFulltextWordsFilter.java
示例10: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Accumulates the full-text word count of content blocks and, once a block
 * labelled {@code DefaultLabels.INDICATES_END_OF_TEXT} is reached with at least
 * {@code minNumWords} words accumulated, marks that block and every following
 * block as non-content.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if the end-of-text condition was met for some block
 * @throws BoilerpipeProcessingException declared by the signature; not thrown directly here
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean modified = false;
    boolean pastEnd = false;
    int wordsSoFar = 0;
    for (TextBlock current : doc.getTextBlocks()) {
        final boolean marksEnd = current.hasLabel(DefaultLabels.INDICATES_END_OF_TEXT);
        if (current.isContent()) {
            wordsSoFar += getNumFullTextWords(current);
        }
        // The current block's own words count towards the threshold.
        if (marksEnd && wordsSoFar >= minNumWords) {
            pastEnd = true;
        }
        if (pastEnd) {
            modified = true;
            current.setIsContent(false);
        }
    }
    return modified;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:26,代码来源:IgnoreBlocksAfterContentFilter.java
示例11: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Runs the fixed chain of filters over the document, in order, and reports
 * whether any of them returned {@code true}.
 *
 * <p>Every filter is always executed: results are combined with the
 * non-short-circuit {@code |=} so a {@code true} from an earlier step never
 * skips a later one.
 *
 * @param doc the document passed to each filter in turn
 * @return {@code true} if at least one filter returned {@code true}
 * @throws BoilerpipeProcessingException if one of the filters throws it
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean changed = TerminatingBlocksFinder.INSTANCE.process(doc);
    changed |= new DocumentTitleMatchClassifier(doc.getTitle()).process(doc);
    changed |= NumWordsRulesClassifier.INSTANCE.process(doc);
    changed |= IgnoreBlocksAfterContentFilter.DEFAULT_INSTANCE.process(doc);
    changed |= TrailingHeadlineToBoilerplateFilter.INSTANCE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1.process(doc);
    changed |= BoilerplateBlockFilter.INSTANCE_KEEP_TITLE.process(doc);
    changed |= BlockProximityFusion.MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL.process(doc);
    changed |= KeepLargestBlockFilter.INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS.process(doc);
    changed |= ExpandTitleToContentFilter.INSTANCE.process(doc);
    changed |= LargeBlockSameTagLevelToContentFilter.INSTANCE.process(doc);
    changed |= ListAtEndFilter.INSTANCE.process(doc);
    return changed;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:19,代码来源:ArticleExtractor.java
示例12: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Parses the HTML document, runs the extractor over it, renders the result with
 * an extracting {@code HTMLHighlighter} (highlight-only output) and passes that
 * markup through {@code removeNotAllowedTags}.
 *
 * @param htmlDoc   the HTML document to process; its input source is read twice
 * @param docUri    URI of the document, handed to {@code removeNotAllowedTags}
 *                  (originally documented as the base for resolving relative anchors)
 * @param extractor the extractor applied to the parsed text document
 * @return the result of {@code removeNotAllowedTags}, or {@code null} if any
 *         exception is raised while parsing, extracting or highlighting
 */
public String process(HTMLDocument htmlDoc, URI docUri, final BoilerpipeExtractor extractor) {
    final HTMLHighlighter highlighter = HTMLHighlighter.newExtractingInstance();
    highlighter.setOutputHighlightOnly(true);
    String highlighted;
    try {
        final TextDocument textDoc =
                new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
        extractor.process(textDoc);
        // A second input source is requested for the highlighting pass.
        final InputSource source = htmlDoc.toInputSource();
        highlighted = highlighter.process(textDoc, source);
    } catch (Exception ex) {
        // Any failure is reported to the caller as null.
        return null;
    }
    return removeNotAllowedTags(highlighted, docUri);
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:28,代码来源:HtmlArticleExtractor.java
示例13: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Parses the given HTML string, runs the extractor over it and delegates to
 * {@code process(TextDocument, InputSource)} to collect the media.
 *
 * @param doc       the HTML markup to parse the media out of
 * @param extractor the extractor applied to the parsed text document
 * @return the list produced by {@code process(TextDocument, InputSource)}, or
 *         {@code null} if any exception is raised while parsing or extracting
 */
public List<Media> process(String doc, final BoilerpipeExtractor extractor) {
    final HTMLDocument htmlDoc = new HTMLDocument(doc);
    try {
        final TextDocument textDoc =
                new BoilerpipeSAXInput(htmlDoc.toInputSource()).getTextDocument();
        extractor.process(textDoc);
        // A second input source is requested for the media pass.
        final InputSource source = htmlDoc.toInputSource();
        return process(textDoc, source);
    } catch (Exception e) {
        // Any failure is reported to the caller as null.
        return null;
    }
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:23,代码来源:MediaExtractor.java
示例14: extractHtml
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Parses HTML from the record stream (bounded by {@code maxDocSize}) and copies
 * title and text into {@code doc}; when {@code boilingEnabled} is set, also runs
 * {@code DefaultExtractor} and stores the extracted content.
 *
 * <p>Occurrences of U+FFFF are stripped from both the text and the extracted content.
 *
 * @param record the raw HTML input
 * @param doc    the target that receives title, text and (optionally) extracted content
 * @throws TextExtractionException wrapping any SAX, boilerpipe, illegal-argument or
 *         array-index failure raised during parsing or extraction
 */
private void extractHtml(InputStream record, Document doc) throws TextExtractionException {
    try {
        final InputSource source = new InputSource(new BoundedInputStream(record, maxDocSize));
        final TextDocument parsed = new BoilerpipeSAXInput(source).getTextDocument();
        doc.setTitle(parsed.getTitle());
        // The full text is captured before the extractor processes the document.
        final String fullText = parsed.getText(true, true);
        doc.setText(fullText.replace("\uFFFF", ""));
        if (!boilingEnabled) {
            return;
        }
        DefaultExtractor.INSTANCE.process(parsed);
        final String extracted = parsed.getContent();
        doc.setBoiled(extracted.replace("\uFFFF", ""));
    } catch (SAXException | BoilerpipeProcessingException | IllegalArgumentException | ArrayIndexOutOfBoundsException e) {
        throw new TextExtractionException(e);
    }
}
开发者ID:nla,项目名称:bamboo,代码行数:15,代码来源:TextExtractor.java
示例15: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Fetches the given {@link URL} using {@link HTMLFetcher}, processes the
 * retrieved HTML with the specified {@link BoilerpipeExtractor}, and delegates
 * to {@code process(TextDocument, InputSource)}.
 *
 * @param url       the address of the HTML document to fetch
 * @param extractor the extractor applied to the parsed {@link TextDocument}
 * @return the list returned by {@code process(TextDocument, InputSource)}
 *         (originally documented as the enclosed links)
 * @throws IOException declared by the signature
 * @throws BoilerpipeProcessingException declared by the signature
 * @throws SAXException declared by the signature
 */
public List<String> process(final URL url, final BoilerpipeExtractor extractor)
        throws IOException, BoilerpipeProcessingException, SAXException {
    final HTMLDocument fetched = HTMLFetcher.fetch(url);
    final BoilerpipeSAXInput saxInput = new BoilerpipeSAXInput(fetched.toInputSource());
    final TextDocument textDoc = saxInput.getTextDocument();
    extractor.process(textDoc);
    // A second input source is requested for the link pass.
    final InputSource source = fetched.toInputSource();
    return process(textDoc, source);
}
开发者ID:asimihsan,项目名称:handytrowel,代码行数:22,代码来源:LinkExtractor.java
示例16: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Walks the text blocks from last to first and copies each visited block's
 * labels, prefixed with {@code labelPrefix}, onto the block directly below it.
 *
 * @param doc the document whose text blocks receive the additional labels
 * @return {@code true} if some visited block had a non-null, non-empty label set
 * @throws BoilerpipeProcessingException declared by the signature; not thrown directly here
 */
public boolean process(TextDocument doc)
throws BoilerpipeProcessingException {
List<TextBlock> textBlocks = doc.getTextBlocks();
// Fewer than two blocks: there is no block pair to work on.
if (textBlocks.size() < 2) {
return false;
}
boolean changes = false;
// Countdown that ends the loop; decremented once per iteration before any block is read.
int remaining = textBlocks.size();
TextBlock blockBelow = null;
TextBlock block;
// Iterate backwards, starting behind the last element.
for (ListIterator<TextBlock> it = textBlocks.listIterator(textBlocks.size()); it
.hasPrevious();) {
// NOTE(review): with size n this allows only n-1 calls to previous(), so the
// block at index 0 is never read and its labels are never copied to index 1
// (for n == 2 nothing is copied at all). Confirm whether this is intended.
if(--remaining <= 0) {
break;
}
// First iteration: the last block only serves as the initial target.
if(blockBelow == null) {
blockBelow = it.previous();
continue;
}
block = it.previous();
Set<String> labels = block.getLabels();
if(labels != null && !labels.isEmpty()) {
for(String l : labels) {
blockBelow.addLabel(labelPrefix+l);
}
changes = true;
}
// The block just read becomes the target for the next (earlier) block.
blockBelow = block;
}
return changes;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:36,代码来源:AddPrecedingLabelsFilter.java
示例17: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Labels the first block whose normalized text is contained in
 * {@code potentialTitles} with {@code DefaultLabels.TITLE}.
 *
 * <p>Normalization replaces non-breaking spaces with plain spaces, drops
 * apostrophes, trims and lower-cases the text. If that form is not found, a
 * second form with {@code PAT_REMOVE_CHARACTERS} matches removed (and trimmed
 * again) is tried.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if a block was labelled; {@code false} if
 *         {@code potentialTitles} is {@code null} or nothing matched
 * @throws BoilerpipeProcessingException declared by the signature; not thrown directly here
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    if (potentialTitles == null) {
        return false;
    }
    for (final TextBlock tb : doc.getTextBlocks()) {
        final String normalized = tb.getText()
                .replace('\u00a0', ' ')
                .replace("'", "")
                .trim()
                .toLowerCase();
        // The stripped variant is only computed when the plain form does not match.
        final boolean matches = potentialTitles.contains(normalized)
                || potentialTitles.contains(
                        PAT_REMOVE_CHARACTERS.matcher(normalized).replaceAll("").trim());
        if (matches) {
            tb.addLabel(DefaultLabels.TITLE);
            // Only the first matching block is labelled.
            return true;
        }
    }
    return false;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:31,代码来源:DocumentTitleMatchClassifier.java
示例18: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Promotes list-item blocks that follow a very-likely-content block to content.
 *
 * <p>A content block labelled {@code VERY_LIKELY_CONTENT} records its tag level.
 * Subsequent blocks with a deeper tag level that are labelled both
 * {@code MIGHT_BE_CONTENT} and {@code LI} and have a link density of exactly 0
 * are marked as content; any other block resets the recorded level.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was switched to content
 * @throws BoilerpipeProcessingException declared by the signature; not thrown directly here
 */
public boolean process(final TextDocument doc)
        throws BoilerpipeProcessingException {
    boolean modified = false;
    // Integer.MAX_VALUE means "no reference block": no tag level can exceed it.
    int referenceLevel = Integer.MAX_VALUE;
    for (TextBlock block : doc.getTextBlocks()) {
        if (block.isContent()
                && block.hasLabel(DefaultLabels.VERY_LIKELY_CONTENT)) {
            referenceLevel = block.getTagLevel();
        } else if (block.getTagLevel() > referenceLevel
                && block.hasLabel(DefaultLabels.MIGHT_BE_CONTENT)
                && block.hasLabel(DefaultLabels.LI)
                && block.getLinkDensity() == 0) {
            block.setIsContent(true);
            modified = true;
        } else {
            referenceLevel = Integer.MAX_VALUE;
        }
    }
    return modified;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:28,代码来源:ListAtEndFilter.java
示例19: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Repeatedly merges a block into its predecessor when the predecessor is
 * content, the block's link density is below 0.56 and the block is not labelled
 * {@code DefaultLabels.STRICTLY_NOT_CONTENT}; passes repeat until one of them
 * merges nothing.
 *
 * <p>Two defects of the previous version are fixed:
 * <ul>
 *   <li>{@code prevBlock} is now reset to the first block at the start of every
 *       pass. Before, a later pass started with the block left over from the
 *       previous pass while iterating again from index 1, so a block could be
 *       compared with (and merged into) itself or a non-adjacent block and then
 *       removed from the list.</li>
 *   <li>The method now returns whether any merge happened instead of always
 *       returning {@code true} for documents with two or more blocks.</li>
 * </ul>
 *
 * @param doc the document whose text blocks are fused in place
 * @return {@code true} if at least one block was merged and removed
 * @throws BoilerpipeProcessingException declared by the signature; not thrown directly here
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    List<TextBlock> textBlocks = doc.getTextBlocks();
    if (textBlocks.size() < 2) {
        return false;
    }
    boolean anyChanges = false;
    boolean changes;
    do {
        changes = false;
        // Index 0 is never removed (iteration starts at 1), so this is always valid.
        TextBlock prevBlock = textBlocks.get(0);
        for (ListIterator<TextBlock> it = textBlocks.listIterator(1); it
                .hasNext();) {
            TextBlock block = it.next();
            if (prevBlock.isContent()
                    && block.getLinkDensity() < 0.56
                    && !block.hasLabel(DefaultLabels.STRICTLY_NOT_CONTENT)) {
                prevBlock.mergeNext(block);
                it.remove();
                changes = true;
            } else {
                prevBlock = block;
            }
        }
        anyChanges |= changes;
    } while (changes);
    return anyChanges;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:32,代码来源:ContentFusion.java
示例20: process
import de.l3s.boilerpipe.document.TextDocument; //导入依赖的package包/类
/**
 * Promotes a non-content block to content when the blocks before and after it
 * are content and {@code cond.meetsCondition} accepts it.
 *
 * <p>The iterator advances two blocks per round, so after the first triple
 * (indices 0, 1, 2) the next one examined is (2, 3, 4), and so on.
 *
 * @param doc the document whose text blocks are inspected
 * @return {@code true} if at least one block was switched to content
 * @throws BoilerpipeProcessingException declared by the signature; not thrown directly here
 */
public boolean process(TextDocument doc)
        throws BoilerpipeProcessingException {
    final List<TextBlock> blocks = doc.getTextBlocks();
    if (blocks.size() < 3) {
        return false;
    }
    boolean modified = false;
    TextBlock before = blocks.get(0);
    TextBlock middle = blocks.get(1);
    final Iterator<TextBlock> cursor = blocks.listIterator(2);
    while (cursor.hasNext()) {
        final TextBlock after = cursor.next();
        final boolean promote = !middle.isContent()
                && before.isContent()
                && after.isContent()
                && cond.meetsCondition(middle);
        if (promote) {
            middle.setIsContent(true);
            modified = true;
        }
        before = after;
        // Fetch the next middle block if there is one; otherwise the loop ends.
        if (cursor.hasNext()) {
            middle = cursor.next();
        }
    }
    return modified;
}
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:29,代码来源:SurroundingToContentFilter.java
注:本文中的de.l3s.boilerpipe.document.TextDocument类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论