• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

C# Poco.PageToCrawl类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了C#中Abot.Poco.PageToCrawl的典型用法代码示例。如果您正苦于以下问题:C# PageToCrawl类的具体用法?C# PageToCrawl怎么用?C# PageToCrawl使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。



PageToCrawl类属于Abot.Poco命名空间,在下文中一共展示了PageToCrawl类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C#代码示例。

示例1: ShouldCrawlPage

        /// <summary>
        /// Decides whether the given page should be crawled based on the crawl
        /// configuration: depth, URI scheme, total page-count limit, per-domain
        /// page-count limit, and internal/external status.
        /// </summary>
        /// <param name="pageToCrawl">The candidate page.</param>
        /// <param name="crawlContext">Current crawl state and configuration.</param>
        /// <returns>A CrawlDecision whose Reason explains any refusal.</returns>
        public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if (pageToCrawl == null)
                return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
                return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

            // Rejects non-http(s) schemes (mailto:, ftp:, javascript:, ...).
            if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
                return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

            // FIX: guard with "> 0". The newer overload of this method elsewhere in
            // this file treats MaxPagesToCrawl == 0 as "no limit"; without this guard
            // CrawledCount + 1 > 0 is always true and every page would be refused.
            if (crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
                crawlContext.CrawledCount + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
            {
                return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
            }

            // Per-domain limit, keyed by URI authority (host[:port]); 0 disables it.
            int pagesCrawledInThisDomain = 0;
            if (crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
                crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
                pagesCrawledInThisDomain > 0)
            {
                if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
                    return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
            }

            if (!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
                return new CrawlDecision { Allow = false, Reason = "Link is external" };

            return new CrawlDecision { Allow = true };
        }
开发者ID:vinchu,项目名称:abot,代码行数:33,代码来源:CrawlDecisionMaker.cs


示例2: PageCrawlStartingArgs

        /// <summary>
        /// Event arguments raised just before a page is crawled.
        /// </summary>
        /// <param name="crawlContext">The crawl context passed to the base args.</param>
        /// <param name="pageToCrawl">The page about to be crawled; must not be null.</param>
        public PageCrawlStartingArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl)
            : base(crawlContext)
        {
            // A "page crawl starting" event without a page is meaningless.
            if (pageToCrawl == null)
            {
                throw new ArgumentNullException("pageToCrawl");
            }

            PageToCrawl = pageToCrawl;
        }
开发者ID:CocoaLab,项目名称:abot,代码行数:8,代码来源:PageCrawlStartingArgs.cs


示例3: PageCrawlDisallowedArgs

        /// <summary>
        /// Event arguments raised when a page is refused (not crawled).
        /// </summary>
        /// <param name="crawlContext">The crawl context passed to the base args.</param>
        /// <param name="pageToCrawl">The refused page, passed to the base args.</param>
        /// <param name="disallowedReason">Why the page was refused; must be non-blank.</param>
        public PageCrawlDisallowedArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl, string disallowedReason)
            : base(crawlContext, pageToCrawl)
        {
            // NOTE(review): an empty/whitespace reason throws ArgumentNullException
            // rather than ArgumentException; kept as-is since callers may catch it.
            if (string.IsNullOrWhiteSpace(disallowedReason))
            {
                throw new ArgumentNullException("disallowedReason");
            }

            DisallowedReason = disallowedReason;
        }
开发者ID:haigneyc,项目名称:abot,代码行数:8,代码来源:PageCrawlDisallowedArgs.cs


示例4: SetUp

        /// <summary>
        /// Builds a fresh Scheduler (recrawling disabled) backed by mocked
        /// repositories, plus sample pages, before each test runs.
        /// </summary>
        public void SetUp()
        {
            _fakeCrawledUrlRepo = new Mock<ICrawledUrlRepository>();
            _fakePagesToCrawlRepo = new Mock<IPagesToCrawlRepository>();

            _page = new PageToCrawl { Uri = new Uri("http://a.com/") };
            _pages = new List<PageToCrawl>
            {
                new PageToCrawl { Uri = new Uri("http://a.com/") },
                new PageToCrawl { Uri = new Uri("http://b.com/") }
            };

            _unitUnderTest = new Scheduler(false, _fakeCrawledUrlRepo.Object, _fakePagesToCrawlRepo.Object);
        }
开发者ID:CocoaLab,项目名称:abot,代码行数:9,代码来源:SchedulerTest.cs


示例5: Constructor_ValidUri_CreatesInstance

 /// <summary>
 /// The Uri constructor must store the Uri and leave every other
 /// property at its default (false / null / zero).
 /// </summary>
 public void Constructor_ValidUri_CreatesInstance()
 {
     var page = new PageToCrawl(new Uri("http://a.com/"));

     Assert.IsFalse(page.IsRetry);
     Assert.IsFalse(page.IsRoot);
     Assert.IsFalse(page.IsInternal);
     Assert.IsNull(page.ParentUri);
     Assert.AreEqual("http://a.com/", page.Uri.AbsoluteUri);
     Assert.AreEqual(0, page.CrawlDepth);
 }
开发者ID:justinverhoef,项目名称:abot,代码行数:10,代码来源:PageToCrawlTest.cs


示例6: ConvertToPageToCrawl

 /// <summary>
 /// Maps a stored LinkToCrawl onto a new PageToCrawl, copying depth,
 /// parentage and internal/root flags, and stamping the page bag with
 /// the link's session id and the given crawler id.
 /// </summary>
 /// <param name="link">The source link record.</param>
 /// <param name="crawlerId">Identifier of the crawler that will process the page.</param>
 /// <returns>A populated PageToCrawl targeting link.TargetUrl.</returns>
 public virtual PageToCrawl ConvertToPageToCrawl(LinkToCrawl link, int crawlerId)
 {
     var result = new PageToCrawl(new Uri(link.TargetUrl))
     {
         ParentUri = new Uri(link.SourceUrl),
         CrawlDepth = link.CrawlDepth,
         IsInternal = link.IsInternal,
         IsRoot = link.IsRoot
     };

     // PageBag is a dynamic bag, so its members cannot go in the initializer.
     result.PageBag.SessionId = link.SessionId;
     result.PageBag.CrawlerId = crawlerId;

     return result;
 }
开发者ID:BgRva,项目名称:ThrongBot,代码行数:11,代码来源:ModelFactory.cs


示例7: Constructor_CreatesInstance

 /// <summary>
 /// The parameterless constructor must leave every property at its
 /// default — including a null Uri and a null PageBag.
 /// </summary>
 public void Constructor_CreatesInstance()
 {
     var page = new PageToCrawl();

     Assert.IsFalse(page.IsRetry);
     Assert.IsFalse(page.IsRoot);
     Assert.IsFalse(page.IsInternal);
     Assert.IsNull(page.ParentUri);
     Assert.IsNull(page.Uri);
     Assert.AreEqual(0, page.CrawlDepth);
     Assert.IsNull(page.PageBag);
 }
开发者ID:haigneyc,项目名称:abot,代码行数:11,代码来源:PageToCrawlTest.cs


示例8: ConvertToLinkToCrawl

 /// <summary>
 /// Maps a PageToCrawl back onto a LinkToCrawl record for persistence,
 /// copying URLs, depth and the internal/root flags.
 /// </summary>
 /// <param name="page">The page to convert.</param>
 /// <param name="sessionId">Crawl session the link belongs to.</param>
 /// <returns>A populated LinkToCrawl.</returns>
 public virtual LinkToCrawl ConvertToLinkToCrawl(PageToCrawl page, int sessionId)
 {
     // NOTE(review): page.ParentUri is dereferenced unconditionally — assumes
     // callers never pass a page without a parent; confirm for root pages.
     return new LinkToCrawl
     {
         SessionId = sessionId,
         SourceUrl = page.ParentUri.AbsoluteUri,
         TargetUrl = page.Uri.AbsoluteUri,
         TargetBaseDomain = page.Uri.GetBaseDomain(),
         CrawlDepth = page.CrawlDepth,
         IsRoot = page.IsRoot,
         IsInternal = page.IsInternal
     };
 }
开发者ID:BgRva,项目名称:ThrongBot,代码行数:12,代码来源:ModelFactory.cs


示例9: Add

		/// <summary>
		/// Queues <paramref name="page"/> for crawling unless its URL has already
		/// been recorded for this site. The "seen" marker and the queue push run
		/// in one Redis transaction guarded by a key-must-not-exist condition,
		/// so duplicate URLs are rejected atomically.
		/// </summary>
		public static void Add(SchedulerState state, PageToCrawl page)
		{
			var serializedPage = JsonConvert.SerializeObject(page);
			var absoluteUrl = page.Uri.AbsoluteUri;

			var transaction = CreateTransaction(state);
			var crawledKey = CrawledPageKey(state.SiteName, absoluteUrl);
			var queueKey = PageToCrawlKey(state.SiteName);

			// The whole transaction becomes a no-op if the URL was seen before.
			transaction.AddCondition(Condition.KeyNotExists(crawledKey));
			transaction.StringSetAsync(crawledKey, "");
			transaction.ListLeftPushAsync(queueKey, serializedPage);
			transaction.ExecuteAsync().Wait();
		}
开发者ID:mng-au,项目名称:Abot.Redis.Scheduler,代码行数:12,代码来源:SchedulerFunc.cs


示例10: PageBag

        /// <summary>
        /// PageBag behaves as a dynamic property bag: arbitrary members
        /// (values and objects) can be set and read back.
        /// </summary>
        public void PageBag()
        {
            var page = new PageToCrawl(new Uri("http://a.com/"));

            page.PageBag.SomeVal = "someval";
            page.PageBag.SomeQueue = new Queue<string>();
            page.PageBag.SomeQueue.Enqueue("aaa");
            page.PageBag.SomeQueue.Enqueue("bbb");

            Assert.IsNotNull(page.PageBag);
            Assert.AreEqual("someval", page.PageBag.SomeVal);
            Assert.AreEqual("aaa", page.PageBag.SomeQueue.Dequeue());
            Assert.AreEqual("bbb", page.PageBag.SomeQueue.Dequeue());
        }
开发者ID:CocoaLab,项目名称:abot,代码行数:13,代码来源:PageToCrawlTest.cs


示例11: Add

        /// <summary>
        /// Schedules <paramref name="page"/> for crawling. Unless URI recrawling
        /// is enabled or the page is a retry, the URI is de-duplicated against
        /// the crawled-URL repository first.
        /// </summary>
        /// <param name="page">The page to schedule; must not be null.</param>
        public void Add(PageToCrawl page)
        {
            if (page == null)
                throw new ArgumentNullException("page");

            // Recrawl mode and retries bypass the duplicate check entirely;
            // short-circuit keeps AddIfNew from being called in those cases.
            bool bypassDuplicateCheck = _allowUriRecrawling || page.IsRetry;
            if (bypassDuplicateCheck || _crawledUrlRepo.AddIfNew(page.Uri))
            {
                _pagesToCrawlRepo.Add(page);
            }
        }
开发者ID:CocoaLab,项目名称:abot,代码行数:15,代码来源:Scheduler.cs


示例12: ShouldCrawlPage

        /// <summary>
        /// Extends the base crawl decision with robots.txt enforcement, including
        /// the IsIgnoreRobotsDotTextIfRootDisallowedEnabled override that discards
        /// the cached robots.txt when it blocks the root (or everything below it).
        /// </summary>
        /// <param name="pageToCrawl">The candidate page.</param>
        /// <returns>True when the page may be crawled.</returns>
        protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
        {
            // No robots.txt loaded for this site means everything is allowed.
            bool allowedByRobots = true;
            if (_robotsDotText != null)
                allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);


            //https://github.com/sjdirect/abot/issues/96 Handle scenario where the root is allowed but all the paths below are disallowed like "disallow: /*"
            var allPathsBelowRootAllowedByRobots = false;
            if (_robotsDotText != null && pageToCrawl.IsRoot && allowedByRobots)
            {
                // Probe a made-up path under the root; if that probe is disallowed,
                // treat everything below the root as disallowed.
                var anyPathOffRoot = pageToCrawl.Uri.AbsoluteUri.EndsWith("/") ? pageToCrawl.Uri.AbsoluteUri + "aaaaa": pageToCrawl.Uri.AbsoluteUri + "/aaaaa";
                allPathsBelowRootAllowedByRobots = _robotsDotText.IsUrlAllowed(anyPathOffRoot, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            }

            if (_crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled && pageToCrawl.IsRoot)    
            {
                if (!allowedByRobots)
                {
                    string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
                    _logger.DebugFormat(message);
                    allowedByRobots = true;
                    // Clearing the cached robots.txt disables all further robots
                    // checks for this site's crawl.
                    _robotsDotText = null;
                }
                else if (!allPathsBelowRootAllowedByRobots)
                {
                    string message = string.Format("All Pages below [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
                    _logger.DebugFormat(message);
                    allowedByRobots = true;
                    _robotsDotText = null;
                }

            }
            else if (!allowedByRobots)
            {
                string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
                _logger.DebugFormat(message);

                // Notify subscribers that this page was filtered out by robots.txt.
                FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
                FirePageCrawlDisallowedEvent(pageToCrawl, message);

                return false;
            }

            // Still subject to the base (non-robots) crawl decision logic.
            return allowedByRobots && base.ShouldCrawlPage(pageToCrawl);
        }
开发者ID:sharpcoder7,项目名称:abot,代码行数:46,代码来源:PoliteWebCrawler.cs


示例13: Add

        /// <summary>
        /// Stores <paramref name="page"/> for future crawling. Assumes any
        /// pre-filtering of links to avoid has already been applied by the caller.
        /// The page bag is stamped with the current session and crawler ids, then
        /// the page is converted to a link record and persisted.
        /// </summary>
        /// <param name="page">The page to persist; must not be null.</param>
        public void Add(PageToCrawl page)
        {
            if (page == null)
                throw new ArgumentNullException("page");

            // Stamp the page with the current crawl session / crawler identity.
            page.PageBag.SessionId = SessionId;
            page.PageBag.CrawlerId = CrawlerId;

            using (var factory = _provider.GetInstanceOf<IModelFactory>())
            {
                AddLinkToCrawl(factory.ConvertToLinkToCrawl(page, SessionId));
            }
        }
开发者ID:BgRva,项目名称:ThrongBot,代码行数:23,代码来源:MyScheduler.cs


示例14: Add

        /// <summary>
        /// Schedules the param to be crawled in a FIFO fashion. Unless URI
        /// recrawling is enabled, a URI that was already scheduled or crawled
        /// is silently dropped.
        /// </summary>
        /// <param name="page">The page to enqueue; must not be null.</param>
        public void Add(PageToCrawl page)
        {
            if (page == null)
                throw new ArgumentNullException("page");

            // TryAdd is only consulted when recrawling is off; a failed add
            // means the URI was seen before, so the duplicate is dropped.
            if (!_allowUriRecrawling && !_scheduledOrCrawled.TryAdd(page.Uri.AbsoluteUri, null))
            {
                return;
            }

            _pagesToCrawl.Enqueue(page);
        }
开发者ID:haigneyc,项目名称:abot,代码行数:22,代码来源:FifoScheduler.cs


示例15: ShouldCrawlPage

        /// <summary>
        /// Decides whether the given page should be crawled based on the crawl
        /// configuration: redirect-chain length, depth, URI scheme, total and
        /// per-domain page-count limits, and internal/external status.
        /// </summary>
        /// <param name="pageToCrawl">The candidate page.</param>
        /// <param name="crawlContext">Current crawl state and configuration.</param>
        /// <returns>A CrawlDecision whose Reason explains any refusal.</returns>
        public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if(pageToCrawl == null)
                return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            // Stop following a redirect chain once it exceeds the configured maximum.
            if (pageToCrawl.RedirectedFrom != null && pageToCrawl.RedirectPosition > crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects)
                return new CrawlDecision { Allow = false, Reason = string.Format("HttpRequestMaxAutoRedirects limit of [{0}] has been reached", crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects) };

            if(pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
                return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

            // Rejects non-http(s) schemes (mailto:, ftp:, javascript:, ...).
            if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
                return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

            //TODO Do we want to ignore redirect chains (ie.. do not treat them as seperate page crawls)?
            // MaxPagesToCrawl == 0 disables the limit; retries are exempt, and pages
            // still waiting in the scheduler count toward the limit as well.
            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
                crawlContext.CrawledCount + crawlContext.Scheduler.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
            {
                return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
            }

            // Per-domain limit, keyed by URI authority (host[:port]); 0 disables it.
            int pagesCrawledInThisDomain = 0;
            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
                crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
                pagesCrawledInThisDomain > 0)
            {
                if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
                    return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
            }

            if(!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
                return new CrawlDecision { Allow = false, Reason = "Link is external" };

            return new CrawlDecision { Allow = true };
        }
开发者ID:sharpcoder7,项目名称:abot,代码行数:40,代码来源:CrawlDecisionMaker.cs


示例16: GetNext_MultiplePages_ReturnsInFifoOrder

        /// <summary>
        /// Pages must come back from GetNext in insertion (FIFO) order, and a
        /// read past the end must return null.
        /// </summary>
        public void GetNext_MultiplePages_ReturnsInFifoOrder()
        {
            var third = new PageToCrawl(new Uri("http://abc/"));
            var fourth = new PageToCrawl(new Uri("http://abcd/"));

            foreach (var page in new[] { _page1, _page2, third, fourth })
            {
                _unitUnderTest.Add(page);
            }

            Assert.AreSame(_page1, _unitUnderTest.GetNext());
            Assert.AreSame(_page2, _unitUnderTest.GetNext());
            Assert.AreSame(third, _unitUnderTest.GetNext());
            Assert.AreSame(fourth, _unitUnderTest.GetNext());
            Assert.IsNull(_unitUnderTest.GetNext()); // repository drained
        }
开发者ID:CocoaLab,项目名称:abot,代码行数:22,代码来源:PagesToCrawlRepositoryTest.cs


示例17: ShouldCrawlPage

		/// <summary>
		/// Applies robots.txt rules before delegating to the base crawl decision.
		/// When the root itself is disallowed and
		/// IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set, the cached
		/// robots.txt is discarded so it no longer blocks this site.
		/// </summary>
		/// <param name="pageToCrawl">The candidate page.</param>
		/// <returns>True when the page may be crawled.</returns>
		protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
		{
			// No robots.txt loaded for this site means everything is allowed.
			var allowedByRobots = true;
			if (_robotsDotText != null)
				allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);

			if (!allowedByRobots && pageToCrawl.IsRoot && _crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled)
			{
				_logger.DebugFormat("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
				allowedByRobots = true;
				// Dropping the cached robots.txt disables all further robots
				// checks for this site's crawl.
				_robotsDotText = null;
			}
			else if (!allowedByRobots)
			{				
				_logger.DebugFormat("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
				var message = $"Page [{pageToCrawl.Uri.AbsoluteUri}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.";
				// Notify subscribers that this page was filtered out by robots.txt.
				FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
				FirePageCrawlDisallowedEvent(pageToCrawl, message);

				return false;
			}

			// Still subject to the base (non-robots) crawl decision logic.
			return base.ShouldCrawlPage(pageToCrawl);
		}
开发者ID:mng-au,项目名称:Abot.MultiProxyPoliteWebCrawler,代码行数:24,代码来源:MultiProxyPoliteWebCrawler.cs


示例18: FirePageCrawlStartingEventAsync

 /// <summary>
 /// Raises the PageCrawlStartingAsync event, invoking each subscriber's
 /// handler on its own asynchronous call rather than sequentially.
 /// </summary>
 /// <param name="pageToCrawl">The page about to be crawled, passed in the event args.</param>
 protected virtual void FirePageCrawlStartingEventAsync(PageToCrawl pageToCrawl)
 {
     // Copy to a local so a concurrent unsubscribe cannot null the event
     // between the null check and the invocation below.
     EventHandler<PageCrawlStartingArgs> threadSafeEvent = PageCrawlStartingAsync;
     if (threadSafeEvent != null)
     {
         //Fire each subscribers delegate async
         foreach (EventHandler<PageCrawlStartingArgs> del in threadSafeEvent.GetInvocationList())
         {
             // NOTE(review): fire-and-forget — no EndInvoke is ever called, so
             // handler exceptions are unobserved; delegate BeginInvoke is also
             // unsupported on .NET Core — confirm this targets .NET Framework.
             del.BeginInvoke(this, new PageCrawlStartingArgs(_crawlContext, pageToCrawl), null, null);
         }
     }
 }
开发者ID:justinverhoef,项目名称:abot,代码行数:12,代码来源:WebCrawler.cs


示例19: GetNext

        /// <summary>
        /// The scheduler's Count must track additions, and GetNext must return
        /// pages in FIFO order, draining Count back to zero.
        /// </summary>
        public void GetNext()
        {
            Assert.AreEqual(0, _unitUnderTest.Count);

            var pages = new[]
            {
                new PageToCrawl(new Uri("http://a.com/1")),
                new PageToCrawl(new Uri("http://a.com/2")),
                new PageToCrawl(new Uri("http://a.com/3"))
            };

            foreach (var page in pages)
            {
                _unitUnderTest.Add(page);
            }

            Assert.AreEqual(3, _unitUnderTest.Count);
            foreach (var page in pages)
            {
                Assert.AreEqual(page.Uri, _unitUnderTest.GetNext().Uri);
            }
            Assert.AreEqual(0, _unitUnderTest.Count);
        }
开发者ID:haigneyc,项目名称:abot,代码行数:18,代码来源:FifoSchedulerTest.cs


示例20: GetNext

 /// <summary>
 /// Produces the next page to crawl by generating random eBid listing
 /// URLs until one is found that the crawled-URL repository has not seen.
 /// Decrements the remaining-page counter as a side effect.
 /// </summary>
 /// <returns>A depth-1, internal, non-root page under http://us.ebid.net/.</returns>
 public PageToCrawl GetNext()
 {
     Uri candidate;
     do
     {
         // Pick a random listing id in [minRandom, maxRandom).
         int listingId = (int)((rand1.NextDouble() * (maxRandom - minRandom)) + minRandom);
         candidate = new Uri("http://us.ebid.net/for-sale/a-" + listingId.ToString() + ".htm");
     }
     while (_crawledUrlRepo.Contains(candidate)); // retry until unseen

     count--;

     var page = new PageToCrawl(candidate);
     page.ParentUri = new Uri("http://us.ebid.net/");
     page.CrawlDepth = 1;
     page.IsInternal = true;
     page.IsRoot = false;
     return page;
 }
开发者ID:abbas-oveissi,项目名称:abot,代码行数:18,代码来源:MadBidScheduler.cs



注:本文中的Abot.Poco.PageToCrawl类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
C# AcademyPopcorn.CollisionData类代码示例发布时间:2022-05-24
下一篇:
C# Poco.CrawledPage类代码示例发布时间:2022-05-24
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap