本文整理汇总了C#中Abot.Poco.CrawledPage类的典型用法代码示例。如果您正苦于以下问题:C# CrawledPage类的具体用法?C# CrawledPage怎么用?C# CrawledPage使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
CrawledPage类属于Abot.Poco命名空间,在下文中一共展示了CrawledPage类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C#代码示例。
示例1: Crawl_CallsDependencies
// Verifies the crawler touches each dependency the expected number of times:
// one HTTP request per page, one link extraction per crawled page, and one
// crawl/links decision per page.
public void Crawl_CallsDependencies()
{
    var childUri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    var childUri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    var homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
    var childPage1 = new CrawledPage(childUri1);
    var childPage2 = new CrawledPage(childUri2);
    var homePageLinks = new List<Uri> { childUri1, childUri2 };

    // Arrange: each request returns its fake page; only the home page yields links.
    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(childUri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(childPage1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(childUri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(childPage2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(homePageLinks);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    // Act
    _unitUnderTest.Crawl(_rootUri);

    // Assert: 3 pages total (root + 2 children), each requested and parsed once.
    _fakeHttpRequester.Verify(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>()), Times.Once());
    _fakeHttpRequester.Verify(f => f.MakeRequest(childUri1, It.IsAny<Func<CrawledPage, CrawlDecision>>()), Times.Once());
    _fakeHttpRequester.Verify(f => f.MakeRequest(childUri2, It.IsAny<Func<CrawledPage, CrawlDecision>>()), Times.Once());
    _fakeHyperLinkParser.Verify(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri)), Times.Exactly(1));
    _fakeHyperLinkParser.Verify(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == childUri1)), Times.Exactly(1));
    _fakeHyperLinkParser.Verify(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == childUri2)), Times.Exactly(1));
    _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(3));
    _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>()), Times.Exactly(3));
}
开发者ID:paullou,项目名称:abot,代码行数:30,代码来源:WebCrawlerTest.cs
示例2: HtmlDocument_RawContentIsNull_HtmlDocumentIsNotNull
// A null RawContent must still produce a usable (empty) HtmlAgilityPack document.
public void HtmlDocument_RawContentIsNull_HtmlDocumentIsNotNull()
{
    var unitUnderTest = new CrawledPage(new Uri("http://a.com/")) { RawContent = null };

    Assert.IsNotNull(unitUnderTest.HtmlDocument);
    Assert.AreEqual("", unitUnderTest.HtmlDocument.DocumentNode.InnerText);
}
开发者ID:justinverhoef,项目名称:abot,代码行数:7,代码来源:CrawledPageTest.cs
示例3: CsQuery_EncodingChangedTwice_IsLoaded
// Content that declares two conflicting charsets (iso-8859-1 then utf-8) must
// still load into a CsQuery document with all four top-level nodes intact.
public void CsQuery_EncodingChangedTwice_IsLoaded()
{
    var unitUnderTest = new CrawledPage(new Uri("http://a.com/"))
    {
        RawContent = @"<div>hehe</div><meta http-equiv=""Content-Type"" content=""text/html; charset=iso-8859-1""><meta http-equiv=""content-type"" content=""text/html; charset=utf-8"" /><div>hi</div>"
    };

    Assert.IsNotNull(unitUnderTest.CsQueryDocument);
    Assert.AreEqual(4, unitUnderTest.CsQueryDocument.Length);
}
开发者ID:justinverhoef,项目名称:abot,代码行数:7,代码来源:CrawledPageTest.cs
示例4: Constructor_ValidArg_SetsPublicProperty
// The constructor must expose the exact CrawledPage instance it was given.
public void Constructor_ValidArg_SetsPublicProperty()
{
    var crawledPage = new CrawledPage(new Uri("http://aaa.com/"));

    var uut = new PageCrawlCompletedArgs(new CrawlContext(), crawledPage);

    Assert.AreSame(crawledPage, uut.CrawledPage);
}
开发者ID:CocoaLab,项目名称:abot,代码行数:7,代码来源:PageCrawlCompletedArgsTest.cs
示例5: Crawl_MinCrawlDelayDelayZero_DomainRateLimiterNotCalled
// With no minimum crawl delay configured, the domain rate limiter must never fire.
public void Crawl_MinCrawlDelayDelayZero_DomainRateLimiterNotCalled()
{
    var childUri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    var childUri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    var homePage = new CrawledPage(_rootUri)
    {
        Content = new PageContent { Text = "content here" }
    };
    var childPage1 = new CrawledPage(childUri1);
    var childPage2 = new CrawledPage(childUri2);
    var homePageLinks = new List<Uri> { childUri1, childUri2 };

    // Arrange mocks: all pages fetchable, all decisions allow crawling.
    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(childUri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(childPage1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(childUri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(childPage2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(homePageLinks);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Never());
}
开发者ID:sharpcoder7,项目名称:abot,代码行数:30,代码来源:PoliteWebCrawlerTest.cs
示例6: Uri
// When robots.txt is respected and its crawl-delay (3s) exceeds the configured
// maximum (2s), the limiter must be registered with the capped value (2000 ms)
// and rate-limit every one of the three page fetches.
public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_CrawlDelayAboveMinDomainCrawlDelay_CallsDomainRateLimiter()
{
    var childUri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    var childUri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    var homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
    var childPage1 = new CrawledPage(childUri1);
    var childPage2 = new CrawledPage(childUri2);
    var homePageLinks = new List<Uri> { childUri1, childUri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(childUri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(childPage1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(childUri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(childPage2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(homePageLinks);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    // robots.txt reports a 3-second delay, larger than the configured cap below.
    _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(3);
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(true);
    _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);

    // Respecting robots.txt is what routes requests through the rate limiter.
    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
    // Cap is below the robots.txt delay, so the cap (2s) should win.
    _dummyConfiguration.MaxRobotsDotTextCrawlDelayInSeconds = 2;

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    _fakeHttpRequester.VerifyAll();
    _fakeHyperLinkParser.VerifyAll();
    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    // Domain registered with the capped delay in milliseconds...
    _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), 2000), Times.Exactly(1));
    // ...and every fetch (root + 2 children) passed through the limiter.
    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3));
}
开发者ID:justinverhoef,项目名称:abot,代码行数:35,代码来源:PoliteWebCrawlerTest.cs
示例7: Constructor_ValidArg_SetsPublicProperty
// The constructor must expose the exact PageToCrawl instance it was given.
public void Constructor_ValidArg_SetsPublicProperty()
{
    PageToCrawl pageToCrawl = new CrawledPage(new Uri("http://aaa.com/"));

    var args = new PageCrawlStartingArgs(new CrawlContext(), pageToCrawl);

    Assert.AreSame(pageToCrawl, args.PageToCrawl);
}
开发者ID:CocoaLab,项目名称:abot,代码行数:7,代码来源:PageCrawlStartingArgsTest.cs
示例8: CsQueryDocument_ToManyNestedTagsInSource2_DoesNotCauseStackOverflowException
// Deeply nested markup that once blew the HtmlAgilityPack stack must parse
// cleanly through CsQuery and produce non-trivial output.
public void CsQueryDocument_ToManyNestedTagsInSource2_DoesNotCauseStackOverflowException()
{
    var unitUnderTest = new CrawledPage(new Uri("http://a.com/"))
    {
        RawContent = GetFileContent("HtmlAgilityPackStackOverflow2.html")
    };

    Assert.IsNotNull(unitUnderTest.CsQueryDocument);
    Assert.IsTrue(unitUnderTest.CsQueryDocument.ToString().Length > 1);
}
开发者ID:justinverhoef,项目名称:abot,代码行数:7,代码来源:CrawledPageTest.cs
示例9: Constructor_ValidArg_SetsPublicProperty
// The constructor must expose the exact CrawlContext instance it was given.
public void Constructor_ValidArg_SetsPublicProperty()
{
    // Note: the original test also constructed an unused CrawledPage; that dead
    // local has been removed since CrawlArgs takes only a context.
    CrawlContext context = new CrawlContext();

    CrawlArgs args = new CrawlArgs(context);

    Assert.AreSame(context, args.CrawlContext);
}
开发者ID:CocoaLab,项目名称:abot,代码行数:8,代码来源:CrawlArgsTest.cs
示例10: PageLinksCrawlDisallowedArgs
/// <summary>
/// Event args raised when crawling the links of a page is disallowed.
/// </summary>
/// <param name="crawlContext">Context of the crawl in progress.</param>
/// <param name="crawledPage">The page whose links will not be crawled.</param>
/// <param name="disallowedReason">Human-readable reason the links were skipped; required.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="disallowedReason"/> is null, empty, or whitespace.</exception>
public PageLinksCrawlDisallowedArgs(CrawlContext crawlContext, CrawledPage crawledPage, string disallowedReason)
    : base(crawlContext, crawledPage)
{
    // nameof keeps the parameter name refactor-safe (same runtime string as before).
    if (string.IsNullOrWhiteSpace(disallowedReason))
        throw new ArgumentNullException(nameof(disallowedReason));

    DisallowedReason = disallowedReason;
}
开发者ID:krishnakanthms,项目名称:abot,代码行数:8,代码来源:PageLinksCrawlDisallowedArgs.cs
示例11: PageCrawlCompletedArgs
/// <summary>
/// Event args raised when a page has finished being crawled.
/// </summary>
/// <param name="crawlContext">Context of the crawl in progress.</param>
/// <param name="crawledPage">The page that was crawled; required.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="crawledPage"/> is null.</exception>
public PageCrawlCompletedArgs(CrawlContext crawlContext, CrawledPage crawledPage)
    : base(crawlContext)
{
    // nameof keeps the parameter name refactor-safe (same runtime string as before).
    if (crawledPage == null)
        throw new ArgumentNullException(nameof(crawledPage));

    CrawledPage = crawledPage;
}
开发者ID:krishnakanthms,项目名称:abot,代码行数:8,代码来源:PageCrawlCompletedArgs.cs
示例12: GetMetaRobotsValue
// Returns the content attribute of the page's <meta name="robots"> tag, or
// null when no such tag exists. The xpath translate() makes the name match
// case-insensitive.
protected override string GetMetaRobotsValue(CrawledPage crawledPage)
{
    HtmlNode robotsNode = crawledPage.HtmlDocument.DocumentNode
        .SelectSingleNode("//meta[translate(@name,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz')='robots']");

    return robotsNode == null ? null : robotsNode.GetAttributeValue("content", "");
}
开发者ID:CocoaLab,项目名称:abot,代码行数:9,代码来源:HapHyperLinkParser.cs
示例13: GetHrefValues
// Extracts the href of every anchor and image-map area on the page,
// discarding null/blank values.
protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
{
    return crawledPage.CsQueryDocument.Select("a, area")
        .Elements
        .Select(element => element.GetAttribute("href"))
        .Where(href => !string.IsNullOrWhiteSpace(href));
}
开发者ID:krishnakanthms,项目名称:abot,代码行数:9,代码来源:CsQueryHyperLinkParser.cs
示例14: HasRobotsNoFollow
// True when meta-robots handling is enabled and the page's robots meta tag
// contains a "nofollow" directive.
private bool HasRobotsNoFollow(CrawledPage crawledPage)
{
    // Skip the DOM query entirely when the feature is disabled.
    if (!_isRespectMetaRobotsNoFollowEnabled)
        return false;

    string robotsMeta = crawledPage.CsQueryDocument["meta[name]"]
        .Filter(d => d.Name.ToLowerInvariant() == "robots")
        .Attr("content");

    return robotsMeta != null && robotsMeta.ToLower().Contains("nofollow");
}
开发者ID:vinchu,项目名称:abot,代码行数:9,代码来源:CsQueryHyperLinkParser.cs
示例15: count_pages_containing_specific_keywords
// A crawled url containing a tracked keyword ("jobdetail") must be counted.
public void count_pages_containing_specific_keywords()
{
    ICrawlingStats crawlingStats = new XingCrawlingStats(new[] { "jobdetail" }, _crawlingFilterDetail);
    var matchingPage = new CrawledPage(new Uri("http://a.com/jobdetail"));

    crawlingStats.ProcessCrawledPage(matchingPage);

    Assert.AreEqual(1, crawlingStats.CountOfCrawledPagesContainingSpecificKeyword);
}
开发者ID:tekavec,项目名称:walter,代码行数:9,代码来源:XingCrawlingStatsShould.cs
示例16: ignore_duplicated_pages
// Processing the same page twice must only be counted once.
public void ignore_duplicated_pages()
{
    ICrawlingStats crawlingStats = new XingCrawlingStats(new[] { "jobdetail" }, _crawlingFilterDetail);
    var duplicatePage = new CrawledPage(new Uri("https://www.xn--jobbrse-d1a.com/jobdetail/?rid=101496772&qid=36120&fid=97&_uri=am9idGl0bGU9TWFya2V0aW5nJnJhZGl1cz0xMCZjb3VudHJ5PSZjYXRlZ29yeT0mYWdlbmN5PTAmY2FyZWVyPSZwYXJ0dGltZT0wJnNvcnQ9ZGF0ZSZwYWdlPTEmcnBwPTEwJmRhdGU9JnFkYXRlPTIwMTYtMDItMjImam9iaWQ9MSZ0b3RhbD0yNzI1Mw=="));

    crawlingStats.ProcessCrawledPage(duplicatePage);
    crawlingStats.ProcessCrawledPage(duplicatePage);

    Assert.AreEqual(1, crawlingStats.CountOfCrawledPagesContainingSpecificKeyword);
}
开发者ID:tekavec,项目名称:walter,代码行数:10,代码来源:XingCrawlingStatsShould.cs
示例17: GenericIndexer
// Builds an indexer for a crawled product page, inferring the item's gender
// from marker words the major retailers put in their urls.
public GenericIndexer(CrawledPage pageToIndex)
{
    this.pageToIndex = pageToIndex;

    string url = pageToIndex.Uri.AbsoluteUri;
    bool looksFemale =
        url.Contains("women") ||   // uniqlo
        url.Contains("woman") ||   // zara
        url.Contains("ladies");    // h&m

    itemGender = looksFemale ? "female" : "male";
}
开发者ID:marakas,项目名称:scraper,代码行数:10,代码来源:GenericIndexer.cs
示例18: SearchForSpecificAttributeValue
/// <summary>
/// Returns true when the page contains the configured element (by id) and that
/// element's configured attribute contains the configured substring
/// (case-insensitive).
/// </summary>
/// <param name="crawledPage">Page whose CsQuery DOM is inspected.</param>
private bool SearchForSpecificAttributeValue(CrawledPage crawledPage)
{
    var dom = crawledPage.CsQueryDocument;
    var elementById = dom.Document.GetElementById(_crawlingFilterDetail.ElementId);
    if (elementById == null)
        return false;

    var attribute = elementById.GetAttribute(_crawlingFilterDetail.AttributeName);

    // Ordinal case-insensitive search instead of ToLower().Contains(): avoids
    // culture-sensitive lowercasing surprises (e.g. the Turkish 'I') and an
    // extra string allocation (CA1304/CA1310).
    return !string.IsNullOrEmpty(attribute)
        && attribute.IndexOf(_crawlingFilterDetail.AttributeContains, StringComparison.OrdinalIgnoreCase) >= 0;
}
开发者ID:tekavec,项目名称:walter,代码行数:11,代码来源:XingCrawlingStats.cs
示例19: GetBaseHrefValue
// Returns the trimmed href of the page's <base> element, or "" when the page
// has no <base> tag or the tag has no href.
protected override string GetBaseHrefValue(CrawledPage crawledPage)
{
    HtmlNode baseNode = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//base");

    return baseNode == null
        ? ""
        : baseNode.GetAttributeValue("href", "").Trim();
}
开发者ID:Resultly,项目名称:abot,代码行数:11,代码来源:HapHyperLinkParser.cs
示例20: CreateCrawledLink
// Maps a crawled page onto a CrawledLink record for persistence. Note that the
// session/crawler ids are taken from the page's PageBag, not from the
// sessionId/crawlerId parameters (kept for interface compatibility).
public virtual CrawledLink CreateCrawledLink(CrawledPage page, int sessionId, int crawlerId)
{
    return new CrawledLink
    {
        SessionId = page.PageBag.SessionId,
        CrawlerId = page.PageBag.CrawlerId,
        SourceUrl = page.ParentUri.AbsoluteUri,
        TargetUrl = page.Uri.AbsoluteUri,   // the url that was actually crawled
        StatusCode = page.HttpWebResponse.StatusCode,
        IsRoot = page.IsRoot,
        CrawlDepth = page.CrawlDepth,
    };
}
开发者ID:BgRva,项目名称:ThrongBot,代码行数:12,代码来源:ModelFactory.cs
注:本文中的Abot.Poco.CrawledPage类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论