This article collects typical usage examples of the Java class crawlercommons.robots.BaseRobotRules. If you have been wondering what BaseRobotRules is for, how to use it, or what real-world code that uses it looks like, the curated examples below should help.
The BaseRobotRules class belongs to the crawlercommons.robots package. A total of 20 code examples of the class are shown below, sorted by popularity by default.
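Before diving into the project-specific examples, here is a minimal, self-contained sketch of the typical workflow around BaseRobotRules: parse a robots.txt payload with crawler-commons' SimpleRobotRulesParser and query the resulting rules. The class and method names come from crawler-commons itself (the String-based parseContent signature is the same one used in Example 3 below; newer releases accept a collection of agent names instead), while the class name RobotRulesSketch, the URL, the agent name, and the robots.txt content are placeholder assumptions for illustration only.

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotRulesSketch {
    public static void main(String[] args) {
        // Placeholder robots.txt content; a real crawler would fetch this from the target site.
        byte[] robotsTxt = ("User-agent: *\n"
                + "Disallow: /private/\n"
                + "Crawl-delay: 5\n"
                + "Sitemap: http://www.example.com/sitemap.xml\n")
                .getBytes(StandardCharsets.UTF_8);

        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
        BaseRobotRules rules = parser.parseContent(
                "http://www.example.com/robots.txt", // URL the content was fetched from
                robotsTxt,
                "text/plain",                        // content type of the fetched file
                "my-crawler");                       // agent name matched against User-agent lines

        System.out.println(rules.isAllowed("http://www.example.com/index.html")); // true
        System.out.println(rules.isAllowed("http://www.example.com/private/a"));  // false
        System.out.println(rules.getCrawlDelay());  // crawl delay as parsed from robots.txt
        System.out.println(rules.getSitemaps());    // [http://www.example.com/sitemap.xml]
    }
}

The examples that follow show the same pattern embedded in real crawlers: obtaining a BaseRobotRules instance (from a parser or a cache) and calling isAllowed, getCrawlDelay, or getSitemaps before fetching.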
Example 1: main
import crawlercommons.robots.BaseRobotRules; // import the required package/class
public static void main(String args[]) throws Exception {
    HttpProtocol protocol = new HttpProtocol();
    String url = args[0];
    Config conf = ConfUtils.loadConf(args[1]);
    protocol.configure(conf);
    if (!protocol.skipRobots) {
        BaseRobotRules rules = protocol.getRobotRules(url);
        System.out.println("is allowed : " + rules.isAllowed(url));
    }
    Metadata md = new Metadata();
    ProtocolResponse response = protocol.getProtocolOutput(url, md);
    System.out.println(url);
    System.out.println(response.getMetadata());
    System.out.println(response.getStatusCode());
    System.out.println(response.getContent().length);
}
Developer: zaizi, Project: alfresco-apache-storm-demo, Lines: 21, Source file: HttpProtocol.java
Example 2: filter
import crawlercommons.robots.BaseRobotRules; // import the required package/class
@Override
public String filter(URL sourceUrl, Metadata sourceMetadata,
        String urlToFilter) {
    URL target;
    try {
        target = new URL(urlToFilter);
    } catch (MalformedURLException e) {
        return null;
    }
    // check whether the source and target have the same hostname;
    // if not, skip the robots.txt check and let the URL through
    if (limitToSameHost) {
        if (!target.getHost().equalsIgnoreCase(sourceUrl.getHost())) {
            return urlToFilter;
        }
    }
    BaseRobotRules rules = robots.getRobotRulesSet(
            factory.getProtocol(target), urlToFilter);
    if (!rules.isAllowed(urlToFilter)) {
        return null;
    }
    return urlToFilter;
}
Developer: DigitalPebble, Project: storm-crawler, Lines: 25, Source file: RobotsFilter.java
Example 3: parseRobotsTxt
import crawlercommons.robots.BaseRobotRules; // import the required package/class
public static void parseRobotsTxt(String userAgent, String robotsUrl, String robotsTxt, HtmlAnalysisResult result) {
    result.setRobotsTxt(robotsTxt);
    SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
    BaseRobotRules robotRules = robotsParser.parseContent(robotsUrl, robotsTxt.getBytes(), null, userAgent);
    result.setRobotsAllowedAll(robotRules.isAllowAll());
    result.setRobotsAllowedNone(robotRules.isAllowNone());
    result.setRobotsAllowedHome(robotRules.isAllowed("/"));
    result.setRobotsSitemaps(robotRules.getSitemaps());
    result.setRobotsCrawlDelay(robotRules.getCrawlDelay());
}
Developer: tokenmill, Project: crawling-framework, Lines: 11, Source file: PageAnalyzer.java
Example 4: getRules
import crawlercommons.robots.BaseRobotRules; // import the required package/class
protected BaseRobotRules getRules(URI uri) {
    try {
        return RobotUtils.getRobotRules(fetcher, parser,
                new URL(uri.getScheme(), uri.getHost(), uri.getPort(), ROBOTS_FILE_NAME));
    } catch (MalformedURLException e) {
        LOGGER.error("URL of robots.txt file is malformed. Returning rules for HTTP 400.");
        return parser.failedFetch(400);
    }
}
Developer: dice-group, Project: Squirrel, Lines: 10, Source file: RobotsManagerImpl.java
Example 5: getMinWaitingTime
import crawlercommons.robots.BaseRobotRules; // import the required package/class
@Override
public long getMinWaitingTime(URI uri) {
    BaseRobotRules rules = getRules(uri);
    // getCrawlDelay() returns a negative value when robots.txt defines no Crawl-delay,
    // so a non-positive delay is treated as "no waiting time required"
    long delay = rules.getCrawlDelay();
    if (delay <= 0) {
        return 0;
    } else {
        return delay;
    }
}
Developer: dice-group, Project: Squirrel, Lines: 11, Source file: RobotsManagerImpl.java
Example 6: getRobotRulesSet
import crawlercommons.robots.BaseRobotRules; // import the required package/class
public BaseRobotRules getRobotRulesSet(Protocol protocol, Text url) {
    URL u = null;
    try {
        u = new URL(url.toString());
    } catch (Exception e) {
        return EMPTY_RULES;
    }
    return getRobotRulesSet(protocol, u);
}
Developer: jorcox, Project: GeoCrawler, Lines: 10, Source file: RobotRulesParser.java
Example 7: getRobotRulesSet
import crawlercommons.robots.BaseRobotRules; // import the required package/class
public BaseRobotRules getRobotRulesSet(Protocol protocol, String url) {
    URL u = null;
    try {
        u = new URL(url);
    } catch (Exception e) {
        return EMPTY_RULES;
    }
    return getRobotRulesSet(protocol, u);
}
Developer: zaizi, Project: alfresco-apache-storm-demo, Lines: 10, Source file: RobotRulesParser.java
Example 8: isDisallowedByRobots
import crawlercommons.robots.BaseRobotRules; // import the required package/class
public boolean isDisallowedByRobots(LinkRelevance link) {
    String hostname = link.getURL().getHost();
    BaseRobotRules rules = robotRulesMap.get(hostname);
    return rules != null && !rules.isAllowed(link.getURL().toString());
}
Developer: ViDA-NYU, Project: ache, Lines: 6, Source file: Frontier.java
Example 9: processRobot
import crawlercommons.robots.BaseRobotRules; // import the required package/class
private void processRobot(LinkRelevance link, FetchedResult response, boolean fetchFailed) {
    BaseRobotRules robotRules;
    if (fetchFailed || response == null) {
        robotRules = parser.failedFetch(HttpStatus.SC_GONE);
    } else {
        String contentType = response.getContentType();
        boolean isPlainText = (contentType != null) && (contentType.startsWith("text/plain"));
        if ((response.getNumRedirects() > 0) && !isPlainText) {
            robotRules = parser.failedFetch(HttpStatus.SC_GONE);
        } else {
            robotRules = parser.parseContent(
                response.getFetchedUrl(),
                response.getContent(),
                response.getContentType(),
                userAgentName
            );
        }
    }
    try {
        RobotsData robotsData = new RobotsData(link, robotRules);
        linkStorage.insert(robotsData);
    } catch (Exception e) {
        logger.error("Failed to insert robots.txt data into link storage.", e);
    }
}
Developer: ViDA-NYU, Project: ache, Lines: 30, Source file: RobotsTxtHandler.java
Example 10: failedFetch
import crawlercommons.robots.BaseRobotRules; // import the required package/class
@Override
public BaseRobotRules failedFetch(int httpStatusCode) {
    ExtendedRobotRules result;
    if ((httpStatusCode >= 200) && (httpStatusCode < 300)) {
        throw new IllegalStateException("Can't use status code constructor with 2xx response");
    } else if ((httpStatusCode >= 300) && (httpStatusCode < 400)) {
        // Should only happen if we're getting endless redirects (more than our
        // follow limit), so treat it as a temporary failure.
        result = new ExtendedRobotRules(RobotRulesMode.ALLOW_NONE);
        result.setDeferVisits(true);
    } else if ((httpStatusCode >= 400) && (httpStatusCode < 500)) {
        // Some sites return 410 (gone) instead of 404 (not found), so treat them the same.
        // In fact, treat all 4xx responses (including forbidden) as "no robots.txt",
        // as that's what Google and other search engines do.
        result = new ExtendedRobotRules(RobotRulesMode.ALLOW_ALL);
    } else {
        // Treat all other status codes as a temporary failure.
        result = new ExtendedRobotRules(RobotRulesMode.ALLOW_NONE);
        result.setDeferVisits(true);
    }
    return result;
}
Developer: Treydone, Project: mandrel, Lines: 28, Source file: ExtendedRobotRulesParser.java
Example 11: getRobotRulesSet
import crawlercommons.robots.BaseRobotRules; // import the required package/class
public BaseRobotRules getRobotRulesSet(Protocol protocol, String url) {
    URL u;
    try {
        u = new URL(url);
    } catch (Exception e) {
        return EMPTY_RULES;
    }
    return getRobotRulesSet(protocol, u);
}
Developer: DigitalPebble, Project: storm-crawler, Lines: 10, Source file: RobotRulesParser.java
Example 12: isUriCrawlable
import crawlercommons.robots.BaseRobotRules; // import the required package/class
@Override
public boolean isUriCrawlable(URI uri) {
    BaseRobotRules rules = getRules(uri);
    return rules.isAllowed(uri.toString());
}
Developer: dice-group, Project: Squirrel, Lines: 6, Source file: RobotsManagerImpl.java
Example 13: getRobotRules
import crawlercommons.robots.BaseRobotRules; // import the required package/class
/**
 * Get the robots rules for a given URL.
 */
public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
    return robots.getRobotRulesSet(this, url);
}
Developer: jorcox, Project: GeoCrawler, Lines: 7, Source file: Ftp.java
Example 14: getRobotRulesSet
import crawlercommons.robots.BaseRobotRules; // import the required package/class
/**
 * For hosts whose robots rules have not been cached yet, this sends an FTP
 * request to the host of the given {@link URL}, fetches the robots.txt file,
 * parses the rules, and caches the resulting rules object to avoid repeating
 * the work in the future.
 *
 * @param ftp
 *          The {@link Protocol} object
 * @param url
 *          URL
 *
 * @return robotRules A {@link BaseRobotRules} object for the rules
 */
public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {
    String protocol = url.getProtocol().toLowerCase(); // normalize to lower case
    String host = url.getHost().toLowerCase(); // normalize to lower case

    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
        LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
    }

    BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);
    if (robotRules != null) {
        return robotRules; // cached rule
    } else if (LOG.isTraceEnabled()) {
        LOG.trace("cache miss " + url);
    }

    boolean cacheRule = true;

    if (isWhiteListed(url)) {
        // check in advance whether a host is whitelisted
        // (we do not need to fetch robots.txt)
        robotRules = EMPTY_RULES;
        LOG.info("Whitelisted host found for: {}", url);
        LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
    } else {
        try {
            Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
            ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl,
                    new CrawlDatum());
            ProtocolStatus status = output.getStatus();

            if (status.getCode() == ProtocolStatus.SUCCESS) {
                robotRules = parseRules(url.toString(), output.getContent()
                        .getContent(), CONTENT_TYPE, agentNames);
            } else {
                robotRules = EMPTY_RULES; // use default rules
            }
        } catch (Throwable t) {
            if (LOG.isInfoEnabled()) {
                LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
            }
            cacheRule = false; // try again later to fetch robots.txt
            robotRules = EMPTY_RULES;
        }
    }

    if (cacheRule)
        CACHE.put(protocol + ":" + host, robotRules); // cache rules for host

    return robotRules;
}
Developer: jorcox, Project: GeoCrawler, Lines: 69, Source file: FtpRobotRulesParser.java
Example 15: getRobotRules
import crawlercommons.robots.BaseRobotRules; // import the required package/class
public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
    return robots.getRobotRulesSet(this, url);
}
Developer: jorcox, Project: GeoCrawler, Lines: 4, Source file: HttpBase.java
Example 16: getRobotRules
import crawlercommons.robots.BaseRobotRules; // import the required package/class
@Override
public BaseRobotRules getRobotRules(String url) {
    if (this.skipRobots)
        return RobotRulesParser.EMPTY_RULES;
    return robots.getRobotRulesSet(this, url);
}
Developer: zaizi, Project: alfresco-apache-storm-demo, Lines: 7, Source file: HttpProtocol.java
Example 17: getRobotRulesSet
import crawlercommons.robots.BaseRobotRules; // import the required package/class
/**
 * Get the rules from robots.txt which apply to the given {@code url}.
 * Robot rules are cached for a unique combination of host, protocol, and
 * port. If no rules are found in the cache, an HTTP request is sent to fetch
 * {@code protocol://host:port/robots.txt}. The robots.txt is then parsed and
 * the rules are cached to avoid re-fetching and re-parsing it again.
 *
 * @param http
 *          The {@link Protocol} object
 * @param url
 *          URL robots.txt applies to
 *
 * @return {@link BaseRobotRules} holding the rules from robots.txt
 */
@Override
public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {
    String cacheKey = getCacheKey(url);
    BaseRobotRules robotRules = CACHE.get(cacheKey);

    boolean cacheRule = true;

    if (robotRules == null) { // cache miss
        URL redir = null;
        LOG.trace("cache miss {}", url);
        try {
            ProtocolResponse response = http.getProtocolOutput(new URL(url,
                    "/robots.txt").toString(), Metadata.empty);
            // try one level of redirection ?
            if (response.getStatusCode() == 301
                    || response.getStatusCode() == 302) {
                String redirection = response.getMetadata().getFirstValue(
                        HttpHeaders.LOCATION);
                if (StringUtils.isNotBlank(redirection)) {
                    if (!redirection.startsWith("http")) {
                        // RFC says it should be absolute, but apparently it
                        // isn't
                        redir = new URL(url, redirection);
                    } else {
                        redir = new URL(redirection);
                    }
                    response = http.getProtocolOutput(redir.toString(),
                            Metadata.empty);
                }
            }

            if (response.getStatusCode() == 200) { // found rules: parse them
                String ct = response.getMetadata().getFirstValue(
                        HttpHeaders.CONTENT_TYPE);
                robotRules = parseRules(url.toString(),
                        response.getContent(), ct, agentNames);
            } else if ((response.getStatusCode() == 403)
                    && (!allowForbidden))
                robotRules = FORBID_ALL_RULES; // use forbid all
            else if (response.getStatusCode() >= 500) {
                cacheRule = false;
                robotRules = EMPTY_RULES;
            } else
                robotRules = EMPTY_RULES; // use default rules
        } catch (Throwable t) {
            LOG.info("Couldn't get robots.txt for {} : {}", url,
                    t.toString());
            cacheRule = false;
            robotRules = EMPTY_RULES;
        }

        if (cacheRule) {
            CACHE.put(cacheKey, robotRules); // cache rules for host
            if (redir != null
                    && !redir.getHost().equalsIgnoreCase(url.getHost())) {
                // cache also for the redirected host
                CACHE.put(getCacheKey(redir), robotRules);
            }
        }
    }

    return robotRules;
}
Developer: zaizi, Project: alfresco-apache-storm-demo, Lines: 80, Source file: HttpRobotRulesParser.java
Example 18: Frontier
import crawlercommons.robots.BaseRobotRules; // import the required package/class
public Frontier(String directory, int maxCacheUrlsSize, DB persistentHashtableBackend) {
    this.urlRelevance = new PersistentHashtable<>(directory, maxCacheUrlsSize,
            LinkRelevance.class, persistentHashtableBackend);
    this.robotRulesMap = new PersistentHashtable<>(directory + "_robots", maxCacheUrlsSize,
            BaseRobotRules.class, persistentHashtableBackend);
}
Developer: ViDA-NYU, Project: ache, Lines: 7, Source file: Frontier.java
Example 19: RobotsData
import crawlercommons.robots.BaseRobotRules; // import the required package/class
public RobotsData(LinkRelevance link, BaseRobotRules robotRules) {
    this.link = link;
    this.robotRules = robotRules;
}
Developer: ViDA-NYU, Project: ache, Lines: 5, Source file: RobotsTxtHandler.java
Example 20: getRobotRules
import crawlercommons.robots.BaseRobotRules; // import the required package/class
@Override
public BaseRobotRules getRobotRules(String url) {
    return RobotRulesParser.EMPTY_RULES;
}
Developer: DigitalPebble, Project: storm-crawler, Lines: 5, Source file: FileProtocol.java
Note: The crawlercommons.robots.BaseRobotRules examples in this article were collected from GitHub, MSDocs, and other source-code and documentation hosting platforms. The snippets come from open-source projects contributed by their respective developers; copyright remains with the original authors, and redistribution or use should follow the corresponding project's license. Please do not repost without permission.