本文整理汇总了Java中us.codecraft.webmagic.pipeline.Pipeline类的典型用法代码示例。如果您正苦于以下问题:Java Pipeline类的具体用法?Java Pipeline怎么用?Java Pipeline使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
Pipeline类属于us.codecraft.webmagic.pipeline包,在下文中一共展示了Pipeline类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Java代码示例。
示例1: scratch
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Builds a WebMagic spider from the configured start URL and target URL pattern,
 * attaches a pipeline that persists each crawled page, and runs it with
 * {@code thread(5)}.
 *
 * <p>For every result the pipeline looks up the {@code IPersist} registered for the
 * configured host and category. When none is registered it logs a warning and skips
 * the page; otherwise the {@code "html"} result field is parsed with Jsoup and passed
 * to the persister together with the request URL and the configuration.
 */
public void scratch() {
    Pipeline persistingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            IPersist persist = PersistManager.getInstance().getPersist(config.getHost(), config.getCategory());
            if (persist == null) {
                logger.warn("persistNotExists: host={}, category={}", config.getHost(), config.getCategory());
                return;
            }
            // Parse only once a persister is known to exist, so pages that would be
            // dropped anyway do not pay for building a DOM.
            String html = resultItems.get("html");
            Document doc = Jsoup.parse(html);
            persist.persist(resultItems.getRequest().getUrl(), config, doc);
        }
    };
    us.codecraft.webmagic.Spider
            .create(new SimplePageProcessor(config.getStartUrl(), config.getTargetUrlPattern()))
            .addPipeline(persistingPipeline)
            .thread(5)
            .run();
}
开发者ID:wangdamu,项目名称:SpiderApplication,代码行数:17,代码来源:Spider.java
示例2: processRequest
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Downloads one request and passes the resulting page through login processing,
 * {@code extractAndAddRequests} and every registered pipeline.
 *
 * <p>Two cases return early: a {@code null} download result (sleep, then
 * {@code onError(request)}) and a page whose {@code isNeedCycleRetry()} is true
 * ({@code extractAndAddRequests(page, true)}, then sleep for the retry interval).
 *
 * @param request the request to download and process
 */
protected void processRequest(Request request) {
    Page fetched = downloader.download(request, this);
    if (fetched == null) {
        // Nothing came back: pause first, then report the failure.
        sleep(site.getSleepTime());
        onError(request);
        return;
    }
    if (fetched.isNeedCycleRetry()) {
        // for cycle retry
        extractAndAddRequests(fetched, true);
        sleep(site.getRetrySleepTime());
        return;
    }

    pageLoginProcessor.process(fetched);
    extractAndAddRequests(fetched, spawnUrl);

    boolean skipPipelines = fetched.getResultItems().isSkip();
    if (!skipPipelines) {
        for (Pipeline each : pipelines) {
            each.process(fetched.getResultItems(), this);
        }
    }

    // for proxy status management
    request.putExtra(Request.STATUS_CODE, fetched.getStatusCode());
    sleep(site.getSleepTime());
}
开发者ID:hexiaohong-code,项目名称:LoginCrawler,代码行数:25,代码来源:SpiderLogin.java
示例3: testStartAndStop
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Starts a spider configured with {@code thread(1)}, stops it, and starts it again,
 * sleeping ten seconds after each of the three transitions.
 *
 * @throws InterruptedException if the test thread is interrupted while sleeping
 */
@Ignore("long time")
@Test
public void testStartAndStop() throws InterruptedException {
    final long waitMillis = 10000;
    Pipeline printingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            System.out.println(1);
        }
    };
    Spider spider = Spider.create(new SimplePageProcessor("http://www.oschina.net/*"))
            .addPipeline(printingPipeline)
            .thread(1)
            .addUrl("http://www.oschina.net/");

    spider.start();
    Thread.sleep(waitMillis);
    spider.stop();
    Thread.sleep(waitMillis);
    spider.start();
    Thread.sleep(waitMillis);
}
开发者ID:code4craft,项目名称:webmagic,代码行数:17,代码来源:SpiderTest.java
示例4: startSpider
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Builds a script-driven spider from the command-line parameters and runs it on the
 * supplied URLs.
 *
 * <p>The default pipelines are cleared and replaced with one that does nothing. If no
 * URL was supplied, a usage message is printed and the JVM exits with status
 * {@code -1}.
 *
 * @param params parsed command-line parameters (language, script file, thread count,
 *               sleep time, URLs)
 */
private static void startSpider(Params params) {
    ScriptProcessor pageProcessor = ScriptProcessorBuilder.custom()
            .language(params.getLanguage())
            .scriptFromFile(params.getScriptFileName())
            .thread(params.getThread())
            .build();
    pageProcessor.getSite().setSleepTime(params.getSleepTime());
    pageProcessor.getSite().setRetryTimes(3);
    pageProcessor.getSite().setAcceptStatCode(WMCollections.<Integer>newHashSet(200, 404, 403, 500, 502));

    // Pipeline that deliberately ignores every result.
    Pipeline discardingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
        }
    };
    Spider spider = Spider.create(pageProcessor).thread(params.getThread());
    spider.clearPipeline().addPipeline(discardingPipeline);

    boolean noUrls = params.getUrls() == null || params.getUrls().isEmpty();
    if (noUrls) {
        System.err.println("Need at least one argument");
        System.out.println("Usage: java -jar webmagic.jar [-l language] -f script file [-t threadnum] [-s sleep time] url1 [url2 url3]");
        System.exit(-1);
    }
    for (String startUrl : params.getUrls()) {
        spider.addUrl(startUrl);
    }
    spider.run();
}
开发者ID:code4craft,项目名称:webmagic,代码行数:24,代码来源:ScriptConsole.java
示例5: close
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Tears down this spider's components: passes the downloader, the login page
 * processor, the scheduler and every pipeline to {@code destroyEach}, then shuts
 * down the thread pool.
 *
 * <p>The pool is shut down in a {@code finally} block so that an exception thrown
 * while destroying a component cannot leave it running. Such an exception still
 * propagates to the caller.
 */
public void close() {
    try {
        destroyEach(downloader);
        destroyEach(pageLoginProcessor);
        destroyEach(scheduler);
        for (Pipeline pipeline : pipelines) {
            destroyEach(pipeline);
        }
    } finally {
        // Must run even when a component fails to close; otherwise the pool would
        // never be shut down.
        threadPool.shutdown();
    }
}
开发者ID:hexiaohong-code,项目名称:LoginCrawler,代码行数:10,代码来源:SpiderLogin.java
示例6: destroy
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Passes the downloader, the page processor and each registered pipeline, in that
 * order, to {@code destroyEach}.
 */
protected void destroy() {
    destroyEach(downloader);
    destroyEach(pageProcessor);
    for (Pipeline registered : pipelines) {
        destroyEach(registered);
    }
}
开发者ID:yuany,项目名称:en-webmagic,代码行数:8,代码来源:Spider.java
示例7: processRequest
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Downloads one request, runs the page processor on the result, calls
 * {@code addRequest(page)} and passes the result items to every pipeline unless
 * they are marked as skipped.
 *
 * <p>A {@code null} download result only triggers the configured sleep. The same
 * sleep is also applied after a page has been processed.
 *
 * @param request the request to download and process
 */
protected void processRequest(Request request) {
    Page fetched = downloader.download(request, this);
    if (fetched == null) {
        sleep(site.getSleepTime());
        return;
    }

    pageProcessor.process(fetched);
    addRequest(fetched);

    boolean skipPipelines = fetched.getResultItems().isSkip();
    if (!skipPipelines) {
        for (Pipeline each : pipelines) {
            each.process(fetched.getResultItems(), this);
        }
    }
    sleep(site.getSleepTime());
}
开发者ID:yuany,项目名称:en-webmagic,代码行数:16,代码来源:Spider.java
示例8: close
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Tears down this spider's components: passes the downloader, the page processor,
 * the scheduler and every pipeline to {@code destroyEach}, then shuts down the
 * thread pool.
 *
 * <p>The pool is shut down in a {@code finally} block so that an exception thrown
 * while destroying a component cannot leave it running. Such an exception still
 * propagates to the caller.
 */
public void close() {
    try {
        destroyEach(downloader);
        destroyEach(pageProcessor);
        destroyEach(scheduler);
        for (Pipeline pipeline : pipelines) {
            destroyEach(pipeline);
        }
    } finally {
        // Must run even when a component fails to close; otherwise the pool would
        // never be shut down.
        threadPool.shutdown();
    }
}
开发者ID:code4craft,项目名称:webmagic,代码行数:10,代码来源:Spider.java
示例9: onDownloadSuccess
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Handles a downloaded page. If the page's status code is in the site's accepted
 * set, the page is processed, {@code extractAndAddRequests} is called, and the
 * result items go to every pipeline unless marked as skipped. Otherwise the status
 * code is logged. In both cases the method then sleeps for the site's sleep time.
 *
 * @param request the request that produced the page
 * @param page    the downloaded page
 */
private void onDownloadSuccess(Request request, Page page) {
    boolean statusAccepted = site.getAcceptStatCode().contains(page.getStatusCode());
    if (!statusAccepted) {
        logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
    } else {
        pageProcessor.process(page);
        extractAndAddRequests(page, spawnUrl);
        boolean skipPipelines = page.getResultItems().isSkip();
        if (!skipPipelines) {
            for (Pipeline each : pipelines) {
                each.process(page.getResultItems(), this);
            }
        }
    }
    sleep(site.getSleepTime());
}
开发者ID:code4craft,项目名称:webmagic,代码行数:16,代码来源:Spider.java
示例10: test_github
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Runs {@code GithubRepoPageProcessor} against a mocked download of the webmagic
 * repository page and asserts the extracted, trimmed {@code name} and
 * {@code author} fields.
 *
 * @throws Exception if the spider run fails
 */
@Test
public void test_github() throws Exception {
    Pipeline verifyingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String name = resultItems.get("name");
            String author = resultItems.get("author");
            assertThat(name.trim()).isEqualTo("webmagic");
            assertThat(author.trim()).isEqualTo("code4craft");
        }
    };
    Spider.create(new GithubRepoPageProcessor())
            .addPipeline(verifyingPipeline)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
开发者ID:code4craft,项目名称:webmagic,代码行数:11,代码来源:GithubRepoPageProcessorTest.java
示例11: test
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Runs {@code GithubRepoProcessor} through an {@code OOSpider} against a mocked
 * download of the webmagic repository page and asserts the extracted, trimmed
 * {@code star} and {@code fork} values.
 */
@Test
public void test() {
    Pipeline verifyingPipeline = new Pipeline() {
        @Override
        public void process(ResultItems resultItems, Task task) {
            String star = resultItems.get("star");
            String fork = resultItems.get("fork");
            Assert.assertEquals("78", star.trim());
            Assert.assertEquals("65", fork.trim());
        }
    };
    OOSpider.create(new GithubRepoProcessor())
            .addPipeline(verifyingPipeline)
            .setDownloader(new MockGithubDownloader())
            .test("https://github.com/code4craft/webmagic");
}
开发者ID:code4craft,项目名称:webmagic,代码行数:11,代码来源:GithubRepoProcessor.java
示例12: getPipelineList
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Returns the pipeline list held by this object.
 *
 * @return the stored list itself, not a copy
 */
public List<Pipeline> getPipelineList() {
    return this.pipelineList;
}
开发者ID:bruceq,项目名称:Gather-Platform,代码行数:4,代码来源:CommonSpider.java
示例13: setPipelineList
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Replaces the pipeline list held by this spider with the given list. The reference
 * is stored as-is; no copy is made.
 *
 * @param pipelineList the list to store
 * @return this spider, so calls can be chained
 */
public CommonSpider setPipelineList(List<Pipeline> pipelineList) {
    this.pipelineList = pipelineList;
    return this;
}
开发者ID:bruceq,项目名称:Gather-Platform,代码行数:5,代码来源:CommonSpider.java
示例14: getPipelines
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Returns the pipeline list held by this object.
 *
 * @return the stored list itself, not a copy
 */
public List<Pipeline> getPipelines() {
    return this.pipelines;
}
开发者ID:TransientBuckwheat,项目名称:nest-spider,代码行数:4,代码来源:CommonSpider.java
示例15: setPipelines
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Replaces the pipeline list held by this spider with the given list. The reference
 * is stored as-is; no copy is made.
 *
 * @param pipelines the list to store
 * @return this spider, so calls can be chained
 */
public CommonSpider setPipelines(List<Pipeline> pipelines) {
    this.pipelines = pipelines;
    return this;
}
开发者ID:TransientBuckwheat,项目名称:nest-spider,代码行数:5,代码来源:CommonSpider.java
示例16: clearPipeline
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Replaces the current pipeline list with a new, empty {@code ArrayList}.
 *
 * @return this spider, so calls can be chained
 */
public Spider clearPipeline() {
    this.pipelines = new ArrayList<Pipeline>();
    return this;
}
开发者ID:yuany,项目名称:en-webmagic,代码行数:5,代码来源:Spider.java
示例17: addPipeline
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Appends a pipeline to this spider's pipeline list.
 *
 * <p>{@code checkIfRunning()} is invoked before the list is modified.
 * NOTE(review): presumably it rejects changes while the spider is running;
 * confirm against its definition.
 *
 * @param pipeline the pipeline to append
 * @return this spider, so calls can be chained
 * @see Pipeline
 * @since 0.2.1
 */
public SpiderLogin addPipeline(Pipeline pipeline) {
    checkIfRunning();
    pipelines.add(pipeline);
    return this;
}
开发者ID:hexiaohong-code,项目名称:LoginCrawler,代码行数:14,代码来源:SpiderLogin.java
示例18: setPipelines
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Replaces this spider's pipeline list with the given list. The reference is stored
 * as-is; no copy is made.
 *
 * <p>{@code checkIfRunning()} is invoked before the field is reassigned.
 * NOTE(review): presumably it rejects changes while the spider is running;
 * confirm against its definition.
 *
 * @param pipelines the list of pipelines to use from now on
 * @return this spider, so calls can be chained
 * @see Pipeline
 * @since 0.4.1
 */
public SpiderLogin setPipelines(List<Pipeline> pipelines) {
    checkIfRunning();
    this.pipelines = pipelines;
    return this;
}
开发者ID:hexiaohong-code,项目名称:LoginCrawler,代码行数:14,代码来源:SpiderLogin.java
示例19: clearPipeline
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Discards all registered pipelines by replacing the pipeline list with a new,
 * empty {@code ArrayList}.
 *
 * @return this spider, so calls can be chained
 */
public SpiderLogin clearPipeline() {
    this.pipelines = new ArrayList<Pipeline>();
    return this;
}
开发者ID:hexiaohong-code,项目名称:LoginCrawler,代码行数:10,代码来源:SpiderLogin.java
示例20: addPipeline
import us.codecraft.webmagic.pipeline.Pipeline; //导入依赖的package包/类
/**
 * Appends a pipeline to this spider's pipeline list.
 *
 * <p>{@code checkIfRunning()} is invoked before the list is modified.
 * NOTE(review): presumably it rejects changes while the spider is running;
 * confirm against its definition.
 *
 * @param pipeline the pipeline to append
 * @return this spider, so calls can be chained
 * @see Pipeline
 * @since 0.2.1
 */
public Spider addPipeline(Pipeline pipeline) {
    checkIfRunning();
    pipelines.add(pipeline);
    return this;
}
开发者ID:code4craft,项目名称:webmagic,代码行数:14,代码来源:Spider.java
注:本文中的us.codecraft.webmagic.pipeline.Pipeline类示例整理自GitHub/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论