本文整理汇总了C#中Abot.Poco.PageToCrawl类的典型用法代码示例。如果您正苦于以下问题:C# PageToCrawl类的具体用法?C# PageToCrawl怎么用?C# PageToCrawl使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
PageToCrawl类属于Abot.Poco命名空间,在下文中一共展示了PageToCrawl类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C#代码示例。
示例1: ShouldCrawlPage
/// <summary>
/// Decides whether <paramref name="pageToCrawl"/> should be crawled, based on the
/// limits configured on <paramref name="crawlContext"/> (depth, scheme, total and
/// per-domain page counts, external-link policy).
/// </summary>
/// <returns>A CrawlDecision with Allow=false and a Reason string when any limit is hit.</returns>
public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl == null)
        return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    // StartsWith("http") covers both http and https schemes.
    if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
        return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

    // FIX: only enforce MaxPagesToCrawl when it is a positive limit; a value of 0
    // means "unlimited". Previously a 0 config would disallow every page. This also
    // matches the guard style used for MaxPagesToCrawlPerDomain below.
    if (crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
        crawlContext.CrawledCount + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
    {
        return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
    }

    // Per-domain limit, keyed by Uri.Authority (host[:port]).
    int pagesCrawledInThisDomain = 0;
    if (crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
        crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
        pagesCrawledInThisDomain > 0)
    {
        if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
            return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
    }

    if (!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    return new CrawlDecision { Allow = true };
}
开发者ID:vinchu,项目名称:abot,代码行数:33,代码来源:CrawlDecisionMaker.cs
示例2: PageCrawlStartingArgs
/// <summary>
/// Event args raised just before a page is crawled.
/// </summary>
/// <param name="crawlContext">Context of the current crawl; passed to the base args.</param>
/// <param name="pageToCrawl">The page about to be crawled; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="pageToCrawl"/> is null.</exception>
public PageCrawlStartingArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl)
    : base(crawlContext)
{
    // FIX: nameof is refactor-safe, unlike the previous "pageToCrawl" magic string.
    if (pageToCrawl == null)
        throw new ArgumentNullException(nameof(pageToCrawl));

    PageToCrawl = pageToCrawl;
}
开发者ID:CocoaLab,项目名称:abot,代码行数:8,代码来源:PageCrawlStartingArgs.cs
示例3: PageCrawlDisallowedArgs
/// <summary>
/// Event args raised when crawling a page was disallowed, carrying the reason.
/// </summary>
/// <param name="disallowedReason">Human-readable reason the page was skipped; must be non-blank.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="disallowedReason"/> is null, empty, or whitespace.</exception>
public PageCrawlDisallowedArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl, string disallowedReason)
    : base(crawlContext, pageToCrawl)
{
    // FIX: nameof is refactor-safe, unlike the previous magic string. The exception
    // type is kept as ArgumentNullException for backward compatibility with callers,
    // even though a whitespace-only value would arguably warrant ArgumentException.
    if (string.IsNullOrWhiteSpace(disallowedReason))
        throw new ArgumentNullException(nameof(disallowedReason));

    DisallowedReason = disallowedReason;
}
开发者ID:haigneyc,项目名称:abot,代码行数:8,代码来源:PageCrawlDisallowedArgs.cs
示例4: SetUp
// Builds a fresh scheduler with mocked repositories before each test.
public void SetUp()
{
    _page = new PageToCrawl { Uri = new Uri("http://a.com/") };
    _pages = new List<PageToCrawl>
    {
        new PageToCrawl { Uri = new Uri("http://a.com/") },
        new PageToCrawl { Uri = new Uri("http://b.com/") }
    };

    _fakeCrawledUrlRepo = new Mock<ICrawledUrlRepository>();
    _fakePagesToCrawlRepo = new Mock<IPagesToCrawlRepository>();

    // allowUriRecrawling: false — the scheduler de-duplicates URLs.
    _unitUnderTest = new Scheduler(false, _fakeCrawledUrlRepo.Object, _fakePagesToCrawlRepo.Object);
}
开发者ID:CocoaLab,项目名称:abot,代码行数:9,代码来源:SchedulerTest.cs
示例5: Constructor_ValidUri_CreatesInstance
// Verifies the Uri constructor leaves all other properties at their defaults.
public void Constructor_ValidUri_CreatesInstance()
{
    var page = new PageToCrawl(new Uri("http://a.com/"));

    Assert.IsFalse(page.IsRetry);
    Assert.IsFalse(page.IsRoot);
    Assert.IsFalse(page.IsInternal);
    Assert.IsNull(page.ParentUri);
    Assert.AreEqual("http://a.com/", page.Uri.AbsoluteUri);
    Assert.AreEqual(0, page.CrawlDepth);
}
开发者ID:justinverhoef,项目名称:abot,代码行数:10,代码来源:PageToCrawlTest.cs
示例6: ConvertToPageToCrawl
/// <summary>
/// Builds a PageToCrawl from a stored LinkToCrawl record, stamping the page
/// with the link's session id and the given crawler id.
/// </summary>
public virtual PageToCrawl ConvertToPageToCrawl(LinkToCrawl link, int crawlerId)
{
    var result = new PageToCrawl(new Uri(link.TargetUrl))
    {
        ParentUri = new Uri(link.SourceUrl),
        CrawlDepth = link.CrawlDepth,
        IsInternal = link.IsInternal,
        IsRoot = link.IsRoot
    };

    // PageBag is a dynamic bag, so these members are attached after construction.
    result.PageBag.SessionId = link.SessionId;
    result.PageBag.CrawlerId = crawlerId;

    return result;
}
开发者ID:BgRva,项目名称:ThrongBot,代码行数:11,代码来源:ModelFactory.cs
示例7: Constructor_CreatesInstance
// Verifies the parameterless constructor leaves every property at its default.
public void Constructor_CreatesInstance()
{
    var page = new PageToCrawl();

    Assert.IsFalse(page.IsRetry);
    Assert.IsFalse(page.IsRoot);
    Assert.IsFalse(page.IsInternal);
    Assert.IsNull(page.ParentUri);
    Assert.IsNull(page.Uri);
    Assert.AreEqual(0, page.CrawlDepth);
    // Unlike the Uri constructor, no PageBag is created here.
    Assert.IsNull(page.PageBag);
}
开发者ID:haigneyc,项目名称:abot,代码行数:11,代码来源:PageToCrawlTest.cs
示例8: ConvertToLinkToCrawl
/// <summary>
/// Maps a PageToCrawl back to a LinkToCrawl record for the given session.
/// Note: dereferences page.ParentUri, so the page must have a parent.
/// </summary>
public virtual LinkToCrawl ConvertToLinkToCrawl(PageToCrawl page, int sessionId)
{
    return new LinkToCrawl
    {
        SessionId = sessionId,
        SourceUrl = page.ParentUri.AbsoluteUri,
        TargetUrl = page.Uri.AbsoluteUri,
        TargetBaseDomain = page.Uri.GetBaseDomain(),
        CrawlDepth = page.CrawlDepth,
        IsRoot = page.IsRoot,
        IsInternal = page.IsInternal
    };
}
开发者ID:BgRva,项目名称:ThrongBot,代码行数:12,代码来源:ModelFactory.cs
示例9: Add
/// <summary>
/// Atomically records the page's URL as seen and pushes the serialized page onto
/// the to-crawl list, but only if the URL has not been stored before
/// (guarded by a KeyNotExists condition on the Redis transaction).
/// </summary>
public static void Add(SchedulerState state, PageToCrawl page)
{
    string serializedPage = JsonConvert.SerializeObject(page);
    string absoluteUrl = page.Uri.AbsoluteUri;

    var transaction = CreateTransaction(state);
    string crawledKey = CrawledPageKey(state.SiteName, absoluteUrl);
    string toCrawlKey = PageToCrawlKey(state.SiteName);

    // The whole transaction is a no-op when the URL was already recorded.
    transaction.AddCondition(Condition.KeyNotExists(crawledKey));
    transaction.StringSetAsync(crawledKey, "");
    transaction.ListLeftPushAsync(toCrawlKey, serializedPage);
    transaction.ExecuteAsync().Wait();
}
开发者ID:mng-au,项目名称:Abot.Redis.Scheduler,代码行数:12,代码来源:SchedulerFunc.cs
示例10: PageBag
// PageBag is a dynamic property bag: arbitrary members (including collections)
// can be attached and read back.
public void PageBag()
{
    var page = new PageToCrawl(new Uri("http://a.com/"));

    page.PageBag.SomeVal = "someval";
    page.PageBag.SomeQueue = new Queue<string>();
    page.PageBag.SomeQueue.Enqueue("aaa");
    page.PageBag.SomeQueue.Enqueue("bbb");

    Assert.IsNotNull(page.PageBag);
    Assert.AreEqual("someval", page.PageBag.SomeVal);
    // Queue preserves insertion order.
    Assert.AreEqual("aaa", page.PageBag.SomeQueue.Dequeue());
    Assert.AreEqual("bbb", page.PageBag.SomeQueue.Dequeue());
}
开发者ID:CocoaLab,项目名称:abot,代码行数:13,代码来源:PageToCrawlTest.cs
示例11: Add
/// <summary>
/// Schedules a page for crawling. Recrawls and retries bypass the de-duplication
/// check; otherwise the page is only queued if its URL has not been seen before.
/// </summary>
public void Add(PageToCrawl page)
{
    if (page == null)
        throw new ArgumentNullException("page");

    if (_allowUriRecrawling || page.IsRetry)
    {
        _pagesToCrawlRepo.Add(page);
        return;
    }

    // AddIfNew returns true only the first time this URI is recorded.
    if (_crawledUrlRepo.AddIfNew(page.Uri))
        _pagesToCrawlRepo.Add(page);
}
开发者ID:CocoaLab,项目名称:abot,代码行数:15,代码来源:Scheduler.cs
示例12: ShouldCrawlPage
/// <summary>
/// Applies robots.txt rules on top of the base crawl decision. Returns false when
/// the page is disallowed by robots.txt, unless the
/// IsIgnoreRobotsDotTextIfRootDisallowedEnabled override applies to a root page,
/// in which case robots.txt is discarded for the rest of this site.
/// </summary>
protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
// Default to allowed when no robots.txt was loaded for this site.
bool allowedByRobots = true;
if (_robotsDotText != null)
allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
//https://github.com/sjdirect/abot/issues/96 Handle scenario where the root is allowed but all the paths below are disallowed like "disallow: /*"
var allPathsBelowRootAllowedByRobots = false;
if (_robotsDotText != null && pageToCrawl.IsRoot && allowedByRobots)
{
// Probe an arbitrary child path ("…/aaaaa") to detect a blanket disallow below the root.
var anyPathOffRoot = pageToCrawl.Uri.AbsoluteUri.EndsWith("/") ? pageToCrawl.Uri.AbsoluteUri + "aaaaa": pageToCrawl.Uri.AbsoluteUri + "/aaaaa";
allPathsBelowRootAllowedByRobots = _robotsDotText.IsUrlAllowed(anyPathOffRoot, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
}
// Root-page override: either the root itself or everything below it is disallowed,
// and the config says to ignore robots.txt in that case.
if (_crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled && pageToCrawl.IsRoot)
{
if (!allowedByRobots)
{
string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
_logger.DebugFormat(message);
// Dropping _robotsDotText disables robots.txt checks for every later page on this site.
allowedByRobots = true;
_robotsDotText = null;
}
else if (!allPathsBelowRootAllowedByRobots)
{
string message = string.Format("All Pages below [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
_logger.DebugFormat(message);
allowedByRobots = true;
_robotsDotText = null;
}
}
else if (!allowedByRobots)
{
string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
_logger.DebugFormat(message);
// Both the async and sync disallowed events are raised for subscribers.
FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
FirePageCrawlDisallowedEvent(pageToCrawl, message);
return false;
}
// Robots.txt allowed it (or was overridden); defer to the base crawler's decision.
return allowedByRobots && base.ShouldCrawlPage(pageToCrawl);
}
开发者ID:sharpcoder7,项目名称:abot,代码行数:46,代码来源:PoliteWebCrawler.cs
示例13: Add
/// <summary>
/// If this method is called, then it assumes some pre-logic for links to avoid has already
/// been applied and that the <paramref name="page"/> should be stored for future crawling.
/// </summary>
/// <param name="page"></param>
public void Add(PageToCrawl page)
{
if (page == null)
throw new ArgumentNullException("page");
//_logger.DebugFormat("Add(page): Target: {0}, Source: {1}, Root: {2}",
// page.Uri.AbsoluteUri,
// page.ParentUri.AbsoluteUri,
// page.IsRoot);
page.PageBag.SessionId = SessionId;
page.PageBag.CrawlerId = CrawlerId;
using (var factory = _provider.GetInstanceOf<IModelFactory>())
{
var link = factory.ConvertToLinkToCrawl(page, SessionId);
AddLinkToCrawl(link);
}
}
开发者ID:BgRva,项目名称:ThrongBot,代码行数:23,代码来源:MyScheduler.cs
示例14: Add
/// <summary>
/// Schedules the param to be crawled in a FIFO fashion
/// </summary>
public void Add(PageToCrawl page)
{
    if (page == null)
        throw new ArgumentNullException("page");

    // When recrawling is allowed we enqueue unconditionally (and skip the TryAdd,
    // thanks to short-circuiting); otherwise only URLs not already scheduled or
    // crawled make it into the queue.
    bool shouldEnqueue = _allowUriRecrawling
        || _scheduledOrCrawled.TryAdd(page.Uri.AbsoluteUri, null);

    if (shouldEnqueue)
        _pagesToCrawl.Enqueue(page);
}
开发者ID:haigneyc,项目名称:abot,代码行数:22,代码来源:FifoScheduler.cs
示例15: ShouldCrawlPage
/// <summary>
/// Decides whether <paramref name="pageToCrawl"/> should be crawled based on the
/// configured limits: redirect hops, crawl depth, URI scheme, total and per-domain
/// page counts, and the external-link policy. Guard order determines which Reason
/// is reported when several limits apply.
/// </summary>
public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
if(pageToCrawl == null)
return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };
if (crawlContext == null)
return new CrawlDecision { Allow = false, Reason = "Null crawl context" };
// Stop following a redirect chain once the configured hop limit is exceeded.
if (pageToCrawl.RedirectedFrom != null && pageToCrawl.RedirectPosition > crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects)
return new CrawlDecision { Allow = false, Reason = string.Format("HttpRequestMaxAutoRedirects limit of [{0}] has been reached", crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects) };
if(pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };
// StartsWith("http") covers both http and https schemes.
if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };
//TODO Do we want to ignore redirect chains (ie.. do not treat them as seperate page crawls)?
// Retries do not count against MaxPagesToCrawl; a limit of 0 or less means unlimited.
// Scheduled-but-not-yet-crawled pages are counted too, so the limit is not overshot.
if (!pageToCrawl.IsRetry &&
crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
crawlContext.CrawledCount + crawlContext.Scheduler.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
{
return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
}
// Per-domain limit, keyed by Uri.Authority (host[:port]).
int pagesCrawledInThisDomain = 0;
if (!pageToCrawl.IsRetry &&
crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
pagesCrawledInThisDomain > 0)
{
if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
}
if(!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
return new CrawlDecision { Allow = false, Reason = "Link is external" };
return new CrawlDecision { Allow = true };
}
开发者ID:sharpcoder7,项目名称:abot,代码行数:40,代码来源:CrawlDecisionMaker.cs
示例16: GetNext_MultiplePages_ReturnsInFifoOrder
// Pages must come back in the exact order they were added, and GetNext must
// return null once the queue is drained.
public void GetNext_MultiplePages_ReturnsInFifoOrder()
{
    var third = new PageToCrawl(new Uri("http://abc/"));
    var fourth = new PageToCrawl(new Uri("http://abcd/"));

    _unitUnderTest.Add(_page1);
    _unitUnderTest.Add(_page2);
    _unitUnderTest.Add(third);
    _unitUnderTest.Add(fourth);

    var first = _unitUnderTest.GetNext();
    var second = _unitUnderTest.GetNext();
    var thirdOut = _unitUnderTest.GetNext();
    var fourthOut = _unitUnderTest.GetNext();
    var pastEnd = _unitUnderTest.GetNext(); // queue is empty at this point

    // Same instances, same order.
    Assert.AreSame(_page1, first);
    Assert.AreSame(_page2, second);
    Assert.AreSame(third, thirdOut);
    Assert.AreSame(fourth, fourthOut);
    Assert.IsNull(pastEnd);
}
开发者ID:CocoaLab,项目名称:abot,代码行数:22,代码来源:PagesToCrawlRepositoryTest.cs
示例17: ShouldCrawlPage
/// <summary>
/// Applies robots.txt rules before delegating to the base crawl decision.
/// A disallowed root page can override robots.txt entirely when
/// IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set.
/// </summary>
protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
// Default to allowed when no robots.txt was loaded for this site.
var allowedByRobots = true;
if (_robotsDotText != null)
allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
// Root-page override: discard robots.txt for the rest of this site.
if (!allowedByRobots && pageToCrawl.IsRoot && _crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled)
{
_logger.DebugFormat("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
allowedByRobots = true;
// Dropping _robotsDotText disables robots.txt checks for every later page on this site.
_robotsDotText = null;
}
else if (!allowedByRobots)
{
_logger.DebugFormat("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
var message = $"Page [{pageToCrawl.Uri.AbsoluteUri}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.";
// Both the async and sync disallowed events are raised for subscribers.
FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
FirePageCrawlDisallowedEvent(pageToCrawl, message);
return false;
}
return base.ShouldCrawlPage(pageToCrawl);
}
开发者ID:mng-au,项目名称:Abot.MultiProxyPoliteWebCrawler,代码行数:24,代码来源:MultiProxyPoliteWebCrawler.cs
示例18: FirePageCrawlStartingEventAsync
/// <summary>
/// Raises the PageCrawlStartingAsync event, invoking each subscriber's delegate
/// asynchronously via BeginInvoke so slow handlers do not block the crawl.
/// </summary>
protected virtual void FirePageCrawlStartingEventAsync(PageToCrawl pageToCrawl)
{
// Copy to a local so a subscriber added/removed between the null check and
// invocation cannot cause a NullReferenceException.
EventHandler<PageCrawlStartingArgs> threadSafeEvent = PageCrawlStartingAsync;
if (threadSafeEvent != null)
{
//Fire each subscribers delegate async
// NOTE(review): Delegate.BeginInvoke is only supported on .NET Framework, not
// .NET Core/.NET 5+ — confirm the target framework before porting.
foreach (EventHandler<PageCrawlStartingArgs> del in threadSafeEvent.GetInvocationList())
{
del.BeginInvoke(this, new PageCrawlStartingArgs(_crawlContext, pageToCrawl), null, null);
}
}
}
开发者ID:justinverhoef,项目名称:abot,代码行数:12,代码来源:WebCrawler.cs
示例19: GetNext
// The scheduler must hand pages back in insertion (FIFO) order and keep an
// accurate Count as pages are added and removed.
public void GetNext()
{
    Assert.AreEqual(0, _unitUnderTest.Count);

    var expectedOrder = new[]
    {
        new PageToCrawl(new Uri("http://a.com/1")),
        new PageToCrawl(new Uri("http://a.com/2")),
        new PageToCrawl(new Uri("http://a.com/3"))
    };

    foreach (var page in expectedOrder)
        _unitUnderTest.Add(page);
    Assert.AreEqual(3, _unitUnderTest.Count);

    // Drain the scheduler; each page comes back in the order it was added.
    foreach (var page in expectedOrder)
        Assert.AreEqual(page.Uri, _unitUnderTest.GetNext().Uri);

    Assert.AreEqual(0, _unitUnderTest.Count);
}
开发者ID:haigneyc,项目名称:abot,代码行数:18,代码来源:FifoSchedulerTest.cs
示例20: GetNext
/// <summary>
/// Produces the next page to crawl by generating a random us.ebid.net listing id
/// in [minRandom, maxRandom) and retrying until the URL has not been crawled yet.
/// Decrements the remaining-page counter as a side effect.
/// </summary>
/// <returns>A depth-1, internal, non-root page under http://us.ebid.net/.</returns>
public PageToCrawl GetNext()
{
    // FIX: the original duplicated the random-id/URL construction before and
    // inside the while loop; a do-while expresses the same retry loop once.
    Uri candidate;
    do
    {
        int listingId = (int)((rand1.NextDouble() * (maxRandom - minRandom)) + minRandom);
        candidate = new Uri("http://us.ebid.net/for-sale/a-" + listingId.ToString() + ".htm");
    } while (_crawledUrlRepo.Contains(candidate));

    count--;

    return new PageToCrawl(candidate)
    {
        ParentUri = new Uri("http://us.ebid.net/"),
        CrawlDepth = 1,
        IsInternal = true,
        IsRoot = false
    };
}
开发者ID:abbas-oveissi,项目名称:abot,代码行数:18,代码来源:MadBidScheduler.cs
注:本文中的Abot.Poco.PageToCrawl类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论