• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

C# Poco.CrawledPage类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了C#中Abot.Poco.CrawledPage的典型用法代码示例。如果您正苦于以下问题:C# CrawledPage类的具体用法?C# CrawledPage怎么用?C# CrawledPage使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。



CrawledPage类属于Abot.Poco命名空间,在下文中一共展示了CrawledPage类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C#代码示例。

示例1: Crawl_CallsDependencies

        public void Crawl_CallsDependencies()
        {
            // Arrange: a root page that links to two child pages.
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List<Uri> links = new List<Uri>{uri1, uri2};

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
            // Only the home page yields links; the child pages yield none.
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision{Allow = true});
            // NOTE(review): the original had a second, home-page-specific ShouldCrawlPageLinks setup
            // returning the same Allow=true decision; it was redundant (the It.IsAny setup below
            // already covers the home page with an identical result) and has been removed.
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

            // Act
            _unitUnderTest.Crawl(_rootUri);

            // Assert: each dependency was invoked once per crawled page (3 pages total).
            _fakeHttpRequester.Verify(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>()), Times.Once());
            _fakeHttpRequester.Verify(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>()), Times.Once());
            _fakeHttpRequester.Verify(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>()), Times.Once());
            _fakeHyperLinkParser.Verify(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri)), Times.Exactly(1));
            _fakeHyperLinkParser.Verify(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == uri1)), Times.Exactly(1));
            _fakeHyperLinkParser.Verify(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == uri2)), Times.Exactly(1));
            _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(3));
            _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>()), Times.Exactly(3));
        }
开发者ID:paullou,项目名称:abot,代码行数:30,代码来源:WebCrawlerTest.cs


示例2: HtmlDocument_RawContentIsNull_HtmlDocumentIsNotNull

        public void HtmlDocument_RawContentIsNull_HtmlDocumentIsNotNull()
        {
            // A page with no raw content must still expose a usable (empty) HtmlDocument.
            CrawledPage page = new CrawledPage(new Uri("http://a.com/")) { RawContent = null };

            var document = page.HtmlDocument;

            Assert.IsNotNull(document);
            Assert.AreEqual("", document.DocumentNode.InnerText);
        }
开发者ID:justinverhoef,项目名称:abot,代码行数:7,代码来源:CrawledPageTest.cs


示例3: CsQuery_EncodingChangedTwice_IsLoaded

        public void CsQuery_EncodingChangedTwice_IsLoaded()
        {
            // Markup declaring two conflicting charsets must still parse into a CsQuery document
            // containing all four top-level nodes.
            CrawledPage page = new CrawledPage(new Uri("http://a.com/"))
            {
                RawContent = @"<div>hehe</div><meta http-equiv=""Content-Type"" content=""text/html; charset=iso-8859-1""><meta http-equiv=""content-type"" content=""text/html; charset=utf-8"" /><div>hi</div>"
            };

            var document = page.CsQueryDocument;

            Assert.IsNotNull(document);
            Assert.AreEqual(4, document.Length);
        }
开发者ID:justinverhoef,项目名称:abot,代码行数:7,代码来源:CrawledPageTest.cs


示例4: Constructor_ValidArg_SetsPublicProperty

        public void Constructor_ValidArg_SetsPublicProperty()
        {
            // The completed-args must expose the exact page instance it was constructed with.
            CrawledPage crawledPage = new CrawledPage(new Uri("http://aaa.com/"));

            var completedArgs = new PageCrawlCompletedArgs(new CrawlContext(), crawledPage);

            Assert.AreSame(crawledPage, completedArgs.CrawledPage);
        }
开发者ID:CocoaLab,项目名称:abot,代码行数:7,代码来源:PageCrawlCompletedArgsTest.cs


示例5: Crawl_MinCrawlDelayDelayZero_DomainRateLimiterNotCalled

        public void Crawl_MinCrawlDelayDelayZero_DomainRateLimiterNotCalled()
        {
            // Arrange: a root page linking to two child pages, with no crawl delay configured.
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent { Text = "content here" }
            };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
            // Only the home page yields links.
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(new List<Uri> { uri1, uri2 });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            // Act
            _unitUnderTest.Crawl(_rootUri);

            // Assert: with a zero minimum crawl delay the domain rate limiter must never be consulted.
            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Never());
        }
开发者ID:sharpcoder7,项目名称:abot,代码行数:30,代码来源:PoliteWebCrawlerTest.cs


示例6: Uri

        // Verifies that when robots.txt is respected and its crawl-delay exceeds the configured
        // maximum, the delay is clamped to the configured maximum (here 2s -> 2000ms) and the
        // domain rate limiter is both registered for the domain and invoked for every request.
        public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_CrawlDelayAboveMinDomainCrawlDelay_CallsDomainRateLimiter()
        {
            // Arrange: a root page linking to two child pages.
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List<Uri> links = new List<Uri> { uri1, uri2 };

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

            _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(3);// robots.txt crawl-delay (3s) is MORE than the configured max, so it should be clamped
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(true);
            _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);

            _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;// respecting robots.txt means the IDomainRateLimiter is expected to be used
            _dummyConfiguration.MaxRobotsDotTextCrawlDelayInSeconds = 2; // less than the robots.txt crawl-delay, so this value should win
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            // Act
            _unitUnderTest.Crawl(_rootUri);

            // Assert: all setups were exercised, the domain was registered with the clamped
            // 2000ms delay, and the limiter ran once per request (3 pages).
            _fakeHttpRequester.VerifyAll();
            _fakeHyperLinkParser.VerifyAll();
            _fakeRobotsDotText.VerifyAll();
            _fakeRobotsDotTextFinder.VerifyAll();
            _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), 2000), Times.Exactly(1));
            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3));// a crawl delay above zero means the IDomainRateLimiter must be called per request
        }
开发者ID:justinverhoef,项目名称:abot,代码行数:35,代码来源:PoliteWebCrawlerTest.cs


示例7: Constructor_ValidArg_SetsPublicProperty

        public void Constructor_ValidArg_SetsPublicProperty()
        {
            // The starting-args must hand back the same page instance it was given.
            PageToCrawl pageToCrawl = new CrawledPage(new Uri("http://aaa.com/"));

            var startingArgs = new PageCrawlStartingArgs(new CrawlContext(), pageToCrawl);

            Assert.AreSame(pageToCrawl, startingArgs.PageToCrawl);
        }
开发者ID:CocoaLab,项目名称:abot,代码行数:7,代码来源:PageCrawlStartingArgsTest.cs


示例8: CsQueryDocument_ToManyNestedTagsInSource2_DoesNotCauseStackOverflowException

        public void CsQueryDocument_ToManyNestedTagsInSource2_DoesNotCauseStackOverflowException()
        {
            // Regression fixture: deeply nested markup must parse without a stack overflow.
            CrawledPage page = new CrawledPage(new Uri("http://a.com/")) { RawContent = GetFileContent("HtmlAgilityPackStackOverflow2.html") };

            var document = page.CsQueryDocument;

            Assert.IsNotNull(document);
            Assert.IsTrue(document.ToString().Length > 1);
        }
开发者ID:justinverhoef,项目名称:abot,代码行数:7,代码来源:CrawledPageTest.cs


示例9: Constructor_ValidArg_SetsPublicProperty

        public void Constructor_ValidArg_SetsPublicProperty()
        {
            // CrawlArgs must expose the exact context it was constructed with.
            // (An unused CrawledPage local was removed — CrawlArgs only takes a context.)
            CrawlContext context = new CrawlContext();

            CrawlArgs args = new CrawlArgs(context);

            Assert.AreSame(context, args.CrawlContext);
        }
开发者ID:CocoaLab,项目名称:abot,代码行数:8,代码来源:CrawlArgsTest.cs


示例10: PageLinksCrawlDisallowedArgs

        /// <summary>
        /// Creates event args describing why a page's links were not crawled.
        /// </summary>
        /// <param name="crawlContext">Context of the current crawl; validated by the base class.</param>
        /// <param name="crawledPage">The page whose links were disallowed; validated by the base class.</param>
        /// <param name="disallowedReason">Human-readable reason; must not be null, empty or whitespace.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="disallowedReason"/> is null, empty or whitespace.</exception>
        public PageLinksCrawlDisallowedArgs(CrawlContext crawlContext, CrawledPage crawledPage, string disallowedReason)
            : base(crawlContext, crawledPage)
        {
            // NOTE(review): throwing ArgumentNullException for a non-null (empty/whitespace) value is
            // unconventional — ArgumentException would be more precise — but the type is kept to avoid
            // breaking callers that catch ArgumentNullException.
            if (string.IsNullOrWhiteSpace(disallowedReason))
                throw new ArgumentNullException("disallowedReason");

            DisallowedReason = disallowedReason;
        }
开发者ID:krishnakanthms,项目名称:abot,代码行数:8,代码来源:PageLinksCrawlDisallowedArgs.cs


示例11: PageCrawlCompletedArgs

        /// <summary>
        /// Creates event args for a page whose crawl has completed.
        /// </summary>
        /// <param name="crawlContext">Context of the current crawl; validated by the base class.</param>
        /// <param name="crawledPage">The page that finished crawling; must not be null.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="crawledPage"/> is null.</exception>
        public PageCrawlCompletedArgs(CrawlContext crawlContext, CrawledPage crawledPage)
            : base(crawlContext)
        {
            if (crawledPage == null)
                throw new ArgumentNullException("crawledPage");

            CrawledPage = crawledPage;
        }
开发者ID:krishnakanthms,项目名称:abot,代码行数:8,代码来源:PageCrawlCompletedArgs.cs


示例12: GetMetaRobotsValue

        // Reads the content of the page's <meta name="robots"> tag; returns null when absent.
        protected override string GetMetaRobotsValue(CrawledPage crawledPage)
        {
            // XPath translate() lower-cases the name attribute so the lookup is case-insensitive.
            HtmlNode metaNode = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");
            if (metaNode == null)
                return null;

            return metaNode.GetAttributeValue("content", "");
        }
开发者ID:CocoaLab,项目名称:abot,代码行数:9,代码来源:HapHyperLinkParser.cs


示例13: GetHrefValues

        // Collects the href attribute of every anchor and image-map area on the page,
        // skipping elements whose href is missing or blank.
        protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
        {
            return crawledPage.CsQueryDocument.Select("a, area")
                .Elements
                .Select(element => element.GetAttribute("href"))
                .Where(href => !string.IsNullOrWhiteSpace(href));
        }
开发者ID:krishnakanthms,项目名称:abot,代码行数:9,代码来源:CsQueryHyperLinkParser.cs


示例14: HasRobotsNoFollow

        /// <summary>
        /// Determines whether the page carries a robots meta tag containing a "nofollow"
        /// directive. Only consulted when respecting meta robots nofollow is enabled.
        /// </summary>
        private bool HasRobotsNoFollow(CrawledPage crawledPage)
        {
            // Skip the DOM query entirely when the feature is disabled.
            if (!_isRespectMetaRobotsNoFollowEnabled)
                return false;

            // Select meta tags that have a name attribute, keep those named "robots", and
            // read the content attribute of the first match (null when there is none).
            string robotsMeta = crawledPage.CsQueryDocument["meta[name]"].Filter(d => d.Name.ToLowerInvariant() == "robots").Attr("content");

            // ToLowerInvariant (not culture-sensitive ToLower) for consistency with the name
            // comparison above and to avoid surprises under non-invariant cultures.
            return robotsMeta != null && robotsMeta.ToLowerInvariant().Contains("nofollow");
        }
开发者ID:vinchu,项目名称:abot,代码行数:9,代码来源:CsQueryHyperLinkParser.cs


示例15: count_pages_containing_specific_keywords

        public void count_pages_containing_specific_keywords()
        {
            // A crawled URL containing a tracked keyword must increment the counter.
            ICrawlingStats stats = new XingCrawlingStats(new[] { "jobdetail" }, _crawlingFilterDetail);
            var matchingPage = new CrawledPage(new Uri("http://a.com/jobdetail"));

            stats.ProcessCrawledPage(matchingPage);

            Assert.AreEqual(1, stats.CountOfCrawledPagesContainingSpecificKeyword);
        }
开发者ID:tekavec,项目名称:walter,代码行数:9,代码来源:XingCrawlingStatsShould.cs


示例16: ignore_duplicated_pages

        public void ignore_duplicated_pages()
        {
            // Processing the same page twice must only be counted once.
            ICrawlingStats stats = new XingCrawlingStats(new[] { "jobdetail" }, _crawlingFilterDetail);
            var duplicatePage = new CrawledPage(new Uri("https://www.xn--jobbrse-d1a.com/jobdetail/?rid=101496772&qid=36120&fid=97&_uri=am9idGl0bGU9TWFya2V0aW5nJnJhZGl1cz0xMCZjb3VudHJ5PSZjYXRlZ29yeT0mYWdlbmN5PTAmY2FyZWVyPSZwYXJ0dGltZT0wJnNvcnQ9ZGF0ZSZwYWdlPTEmcnBwPTEwJmRhdGU9JnFkYXRlPTIwMTYtMDItMjImam9iaWQ9MSZ0b3RhbD0yNzI1Mw=="));

            stats.ProcessCrawledPage(duplicatePage);
            stats.ProcessCrawledPage(duplicatePage);

            Assert.AreEqual(1, stats.CountOfCrawledPagesContainingSpecificKeyword);
        }
开发者ID:tekavec,项目名称:walter,代码行数:10,代码来源:XingCrawlingStatsShould.cs


示例17: GenericIndexer

        // Builds an indexer for a crawled product page, inferring the item's gender from
        // retailer-specific URL segments.
        public GenericIndexer(CrawledPage pageToIndex)
        {
            this.pageToIndex = pageToIndex;

            // Female markers observed per retailer: "women" (uniqlo), "woman" (zara),
            // "ladies" (h&m); anything else defaults to "male".
            string url = pageToIndex.Uri.AbsoluteUri;
            if (url.Contains("women") || url.Contains("woman") || url.Contains("ladies"))
                itemGender = "female";
            else
                itemGender = "male";
        }
开发者ID:marakas,项目名称:scraper,代码行数:10,代码来源:GenericIndexer.cs


示例18: SearchForSpecificAttributeValue

 /// <summary>
 /// Returns true when the page contains the configured element (looked up by id) whose
 /// configured attribute value contains the configured substring, case-insensitively.
 /// </summary>
 private bool SearchForSpecificAttributeValue(CrawledPage crawledPage)
 {
     var dom = crawledPage.CsQueryDocument;
     var elementById = dom.Document.GetElementById(_crawlingFilterDetail.ElementId);
     if (elementById == null)
         return false;

     var attribute = elementById.GetAttribute(_crawlingFilterDetail.AttributeName);
     // ToLowerInvariant (not culture-sensitive ToLower) so the containment check is not
     // affected by the current culture (e.g. Turkish dotted/dotless 'i').
     return !string.IsNullOrEmpty(attribute)
         && attribute.ToLowerInvariant().Contains(_crawlingFilterDetail.AttributeContains.ToLowerInvariant());
 }
开发者ID:tekavec,项目名称:walter,代码行数:11,代码来源:XingCrawlingStats.cs


示例19: GetBaseHrefValue

        // Reads the trimmed href attribute of the page's <base> tag; empty string when the
        // tag is absent or carries no href.
        protected override string GetBaseHrefValue(CrawledPage crawledPage)
        {
            HtmlNode baseNode = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//base");
            if (baseNode == null)
                return "";

            return baseNode.GetAttributeValue("href", "").Trim();
        }
开发者ID:Resultly,项目名称:abot,代码行数:11,代码来源:HapHyperLinkParser.cs


示例20: CreateCrawledLink

 /// <summary>
 /// Maps a crawled page to a CrawledLink persistence record.
 /// </summary>
 /// <param name="page">The crawled page to map; its PageBag is expected to carry SessionId and CrawlerId.</param>
 /// <param name="sessionId">NOTE(review): unused — the value is read from page.PageBag.SessionId instead; confirm which source is authoritative.</param>
 /// <param name="crawlerId">NOTE(review): unused — see sessionId.</param>
 public virtual CrawledLink CreateCrawledLink(CrawledPage page, int sessionId, int crawlerId)
 {
     var link = new CrawledLink();
     link.SessionId = page.PageBag.SessionId;
     link.CrawlerId = page.PageBag.CrawlerId;
     link.SourceUrl = page.ParentUri.AbsoluteUri; // the page that linked here
     link.TargetUrl = page.Uri.AbsoluteUri; // what was crawled
     // NOTE(review): assumes HttpWebResponse (and ParentUri above) are non-null — this will
     // throw for pages that never received a response; verify callers guarantee this.
     link.StatusCode = page.HttpWebResponse.StatusCode;
     link.IsRoot = page.IsRoot;
     link.CrawlDepth = page.CrawlDepth;
     return link;
 }
开发者ID:BgRva,项目名称:ThrongBot,代码行数:12,代码来源:ModelFactory.cs



注:本文中的Abot.Poco.CrawledPage类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
C# Poco.PageToCrawl类代码示例发布时间:2022-05-24
下一篇:
C# TestTools.XmlSampleGenerator类代码示例发布时间:2022-05-24
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap