本文整理汇总了Scala中org.jsoup.nodes.Document类的典型用法代码示例。如果您正苦于以下问题:Scala Document类的具体用法?Scala Document怎么用?Scala Document使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Document类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Scala代码示例。
示例1: NoticeServiceObjects
//设置package包名称以及导入依赖的类
package com.zhranklin.homepage.notice
import org.json4s._
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
/**
 * Holds the notice-service definitions and the combined `serviceList`.
 *
 * NOTE(review): the runs of `?` inside the string literals below look like non-ASCII text
 * that was lost to an encoding error. They are kept byte-for-byte here because the original
 * characters cannot be recovered from this file; restore them from the upstream source.
 */
object NoticeServiceObjects {

  /**
   * Service whose content extractor, date extractor, URL pattern and index template are
   * all supplied through the single `initVal` tuple.
   */
  trait ServiceBase extends IndexService with FunNoticeFetcher with SelectorUrlService {
    val initVal: ((Document) => String, (Document) => String, String, String)
    // Lazy so that the tuple is only destructured after the implementing class has set initVal.
    lazy val (getContent, getDateStr, urlPattern, template) = initVal
  }

  /**
   * Service for law.scu.edu.cn: the list endpoint returns JSON whose `data` array
   * carries the `id` and `subject` of each article.
   *
   * @param title  appended to the service name
   * @param listId value placed in the `nextcid` query parameter of article URLs
   */
  class LawService(title: String, listId: String) extends NoticeService(s"??? - $title") with UrlService with IndexService with FunNoticeFetcher {
    val getContent = contentF("div.text")
    val getDateStr = dateF("span:contains(????)")
    val template = "http://law.scu.edu.cn/xjax?arg=8573&arg=<index>&arg=20&arg=list&clazz=PortalArticleAction&method=list"

    /** Builds the detail-page URL for the article with the given id. */
    def getUrl(id: String) = s"http://law.scu.edu.cn/detail.jsp?portalId=725&cid=8385&nextcid=$listId&aid=$id"

    /** Fetches the JSON list at `url` and maps every entry of `data` to a [[NoticeEntry]]. */
    override def noticeUrlsFromUrl(url: String): Iterable[NoticeEntry] = {
      val jsonStr = Jsoup.connect(url).execute().body()
      val json = jackson.parseJson(jsonStr)
      json.\("data").asInstanceOf[JArray].arr.map(
        jo => NoticeEntry(getUrl(jo.\("id").values.toString), Some(jo.\("subject").values.toString)))
    }
  }

  /** All configured services: (name -> index template) pairs first, then the hand-written ones. */
  val serviceList = List(
    "???? - ???? - test" ->
      "http://www.sculj.cn/Special_News.asp?SpecialID=40&SpecialName=%D1%A7%D4%BA%B6%AF%CC%AC&page=<index>",
    "???? - ???? - test" -> "http://sesu.scu.edu.cn/news/list_1_<index>.html",
    "???? - ????" -> "http://sesu.scu.edu.cn/gonggao/list_2_<index>.html",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xsky/xskb/H951901index_<index>.htm",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xytz/H9502index_<index>.htm",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xyxw/H9501index_<index>.htm",
    "????? - ??? - test" -> "http://cs.scu.edu.cn/cs/fwzy/ftl/H951204index_<index>.htm",
    "???? - test" -> "http://news.scu.edu.cn/news2012/cdzx/I0201index_<index>.htm",
    "???? - ????" -> "http://math.scu.edu.cn/news.asp?PAGE=<index>",
    "?????? - ????" -> "http://seei.scu.edu.cn/student,p<index>,index.jsp",
    "????? - ????" -> "http://flc2.scu.edu.cn/foreign/a/xueyuangonggao/list_27_<index>.html"
  ).map { tp =>
    new NoticeService(tp._1) with UniversalUrlService with UniversalNoticeFetcher with IndexService {
      val template = tp._2
    }
  } ++ List(
    new NoticeService("??? - ??") with ServiceBase {
      val initVal =(selectorF("input[name=news.content]")(_.first.attr("value")), dateF("table[width=900] td:contains(????)"),
        "newsShow.*", "http://jwc.scu.edu.cn/jwc/moreNotice.action?url=moreNotice.action&type=2&keyWord=&pager.pageNow=<index>")},
    new LawService("????", "8572"),
    new LawService("????", "8573")
  )
}
开发者ID:zhranklin,项目名称:Private_Blog,代码行数:54,代码来源:NoticeServiceObjects.scala
示例2: IsapReader
//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma.readers
import org.slf4j.LoggerFactory
import org.springframework.batch.item.ItemReader
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import com.gargoylesoftware.htmlunit.WebClient
import pl.mojepanstwo.sap.toakoma._
/** URL constants for the ISAP site queried by the reader. */
object IsapReader {
  /** Scheme and host of the ISAP site. */
  val BASE_URL = "http://isap.sejm.gov.pl"

  /** Details-servlet prefix; the document id is appended directly after `id=`. */
  val URL = s"$BASE_URL/DetailsServlet?id="
}
/**
 * Item reader that yields the ISAP details page for `id` exactly once.
 *
 * The first call to `read` downloads and returns the page; every later call returns
 * `null`.
 *
 * @param id document id appended to `IsapReader.URL`
 */
class IsapReader(val id: String) extends ItemReader[Document] {
  val logger = LoggerFactory.getLogger(this.getClass())

  /** Set to true once the single document has been requested. */
  var last = false

  /**
   * @return the downloaded page on the first call, `null` afterwards
   * @throws NoSuchDocumentException when the page body contains the site's
   *                                 "no such act" message
   */
  def read : Document = {
    logger.trace("read")
    if (last) {
      null
    } else {
      // Flip the flag before fetching, so a failed download is not retried on the next call.
      last = true
      val document = Jsoup.connect(IsapReader.URL + id).get
      val notFound = document.body.text.contains("Brak aktu prawnego o podanym adresie publikacyjnym !")
      if (notFound) throw new NoSuchDocumentException
      document
    }
  }
}
开发者ID:PrawoPolskie,项目名称:toakoma,代码行数:36,代码来源:IsapReader.scala
示例3: get
//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma.services
import java.net.URL
import java.io.File
import org.apache.commons.io.FileUtils
import org.jsoup.nodes.Document
import com.gargoylesoftware.htmlunit._
import org.jsoup.Jsoup
/** Abstraction over fetching an HTML document and obtaining a file for a URL. */
trait Scraper {

  /**
   * @param url address identifying the page to load
   * @return the page parsed as a jsoup document
   */
  def get(url: String): Document

  /**
   * @param fileUrl  address identifying the file to obtain
   * @param filePath path the file is written to
   * @return a path string for the written file
   */
  def dowloadFile(fileUrl: String, filePath: String): String
}
/** [[Scraper]] that loads pages through an HtmlUnit `WebClient` and copies files straight from their URL. */
class DefaultScraperService extends Scraper {
  val webClient = new WebClient

  /**
   * Loads `url` with the web client and parses the response body with jsoup.
   * A refresh handler is installed on every call; it loads the refresh target with the same client.
   */
  def get(url: String) : Document = {
    val followRefresh = new RefreshHandler {
      override def handleRefresh(page: Page, url: URL, i: Int): Unit = webClient.getPage(url)
    }
    webClient.setRefreshHandler(followRefresh)
    val fetched: Page = webClient.getPage(url)
    val html = fetched.getWebResponse.getContentAsString
    Jsoup.parse(html)
  }

  /**
   * Copies the resource at `fileUrl` into the file at `filePath`.
   *
   * @return absolute path of the written file
   */
  def dowloadFile(fileUrl:String, filePath:String) : String = {
    val source = new URL(fileUrl)
    val target = new File(filePath)
    FileUtils.copyURLToFile(source, target)
    target.getAbsolutePath()
  }
}
开发者ID:PrawoPolskie,项目名称:toakoma,代码行数:35,代码来源:Scraper.scala
示例4: ResourceScraperService
//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma
import pl.mojepanstwo.sap.toakoma.services.Scraper
import org.jsoup.nodes.Document
import org.jsoup.Jsoup
import scala.io.Source
import java.io.File
import java.nio.file.Files
import org.apache.commons.io.IOUtils
import java.io.FileOutputStream
/**
 * [[Scraper]] that serves pages and files from classpath resources under `isap/`
 * instead of the network.
 *
 * Both methods extract `id` and `type` from the URL and map them to
 * `isap/<id>/<type>.html` or `/isap/<id>/<type>.pdf`.
 */
class ResourceScraperService extends Scraper {

  // Shared by both methods; captures the `id` and the numeric `type` query values.
  private val IsapUrl = ".*id=(.*)&type=([0-9]+).*".r

  /**
   * Parses the HTML resource that corresponds to `url`.
   *
   * @throws scala.MatchError when `url` does not contain `id=...&type=<digits>`
   */
  def get(url: String) : Document = {
    val IsapUrl(id, docType) = url
    val source = Source.fromResource("isap/" + id + "/" + docType + ".html")
    // Close the underlying stream even if reading or parsing fails.
    try Jsoup.parse(source.mkString)
    finally source.close()
  }

  /**
   * Copies the PDF resource that corresponds to `fileUrl` into `filePath`.
   *
   * @return absolute path of the written file
   * @throws scala.MatchError               when `fileUrl` does not contain `id=...&type=<digits>`
   * @throws java.io.FileNotFoundException  when the classpath resource does not exist
   */
  def dowloadFile(fileUrl:String, filePath:String) : String = {
    val IsapUrl(id, docType) = fileUrl
    val resourcePath = "/isap/" + id + "/" + docType + ".pdf"
    val src = getClass.getResourceAsStream(resourcePath)
    // getResourceAsStream signals a missing resource with null; fail before creating the output file.
    if (src == null) {
      throw new java.io.FileNotFoundException("Classpath resource not found: " + resourcePath)
    }
    try {
      val dest = new File(filePath)
      val out = new FileOutputStream(dest)
      try IOUtils.copy(src, out)
      finally out.close()
      dest.getAbsolutePath
    } finally {
      src.close()
    }
  }
}
开发者ID:PrawoPolskie,项目名称:toakoma,代码行数:33,代码来源:ResourceScraperService.scala
示例5: GgleLoginTest
//设置package包名称以及导入依赖的类
package com.szadowsz.tarbh.ggle
import com.szadowsz.common.net.Uri
import com.szadowsz.maeve.core.MaeveDriver
import com.szadowsz.maeve.core.browser.MaeveConf
import com.szadowsz.maeve.core.instruction.MaeveInstruction
import com.szadowsz.maeve.core.instruction.extractor.JsoupExtractor
import com.szadowsz.maeve.core.instruction.target.single.SingleTarget
import com.szadowsz.maeve.gglegrp.actions.GgleExecutor
import org.jsoup.nodes.Document
/**
 * Manual entry point that runs a single instruction against the group URL
 * with a [[GgleExecutor]] and an extractor that does nothing.
 *
 * The link, credentials and group name are blank in this file.
 */
object GgleLoginTest {
  private val link = ""
  private val username: String = ""
  private val passwd: String = ""
  private val urlOfGrp = Uri(link)
  private val groupName: String = ""

  /** Driver configuration: JavaScript on, the given HTTP proxy settings, script errors not thrown. */
  private def buildConfig(): MaeveConf =
    MaeveConf()
      .setJavaScriptEnabled(true)
      .setHTTPProxy("", 0, Nil)
      .setThrowExceptionOnScriptError(false)

  def main(args: Array[String]): Unit = {
    // Extractor that ignores every page and stops immediately.
    class TestExtractor extends JsoupExtractor {
      override def extract(queryUrl: Uri, returnedUrl: Uri, inst: MaeveInstruction[_], page: Document): Unit = {}
      override def shouldContinue(): Boolean = false
    }

    System.setProperty("webdriver.chrome.driver", ".\\chromedriver_win32\\chromedriver.exe")

    val driverConf = buildConfig()
    val driver = new MaeveDriver(driverConf)
    val target = SingleTarget(urlOfGrp)
    val extractor = new TestExtractor()
    val loginActions = new GgleExecutor(username, passwd)
    val instruction =
      MaeveInstruction(groupName, target, loginActions, extractor, "./data/grp/", false, false, false, MaeveConf().setNoProxy())

    driver.feedInstruction(instruction)
    driver.scrapeUsingCurrInstruction()
  }
}
开发者ID:zakski,项目名称:project-disco,代码行数:48,代码来源:GgleLoginTest.scala
示例6: compile
//设置package包名称以及导入依赖的类
package indi.lewis.spider.html
import java.util
import com.google.gson._
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
/** A named node of the model that can be compiled against an HTML document into JSON. */
private[html] trait ElementType {
  /** Name of this node; `ModelParent.compile` uses it as the JSON property key. */
  var elementName: String = _

  /** Produces the JSON for this node from an already parsed document. */
  def compile(doc: Document): JsonElement

  /** Parses `doc` as HTML with jsoup and compiles the resulting document. */
  def compile(doc: String): JsonElement = {
    val parsed = Jsoup.parse(doc)
    compile(parsed)
  }
}
/**
 * Object node: compiles to a JSON object with one property per entry of `properties`.
 * The name defaults to "root" unless the one-argument constructor is used.
 */
private[html] case class ModelParent() extends ElementType {
  elementName = "root"

  /** Creates a parent with an explicit name instead of "root". */
  def this(elementName: String) = {
    this()
    this.elementName = elementName
  }

  /** Child nodes, compiled in insertion order. */
  val properties: java.util.ArrayList[ElementType] = new util.ArrayList[ElementType]()

  /** Compiles each child against `doc` and stores the result under the child's name. */
  override def compile(doc: Document): JsonElement = {
    val compiled = new JsonObject
    (0 until properties.size()).foreach { idx =>
      val child = properties.get(idx)
      compiled.add(child.elementName, child.compile(doc))
    }
    compiled
  }
}
/**
 * Node whose actual content is chosen per document: `f` maps the document to
 * another [[ElementType]], which is then compiled against the same document.
 *
 * @param elName name assigned to `elementName`
 * @param f      selects the node to compile for a given document
 */
private[html] case class ModelElement(elName: String, f: (Document) => ElementType) extends ElementType {
  elementName = elName

  override def compile(doc: Document): JsonElement = {
    val delegate = f(doc)
    delegate.compile(doc)
  }
}
/**
 * Array node: compiles to a JSON array holding the compiled form of each entry of `array`.
 *
 * @param elName name assigned to `elementName`
 */
private[html] case class ModelArray(elName: String) extends ElementType {
  /** Entries of the array, compiled in insertion order. */
  val array: java.util.ArrayList[ElementType] = new util.ArrayList[ElementType]()
  elementName = elName

  override def compile(doc: Document): JsonElement = {
    val compiled = new JsonArray
    (0 until array.size()).foreach { idx =>
      val entry = array.get(idx)
      compiled.add(entry.compile(doc))
    }
    compiled
  }
}
/**
 * Constant node: ignores the document and compiles to the string form of `value`,
 * or to JSON null when `value` is null.
 *
 * @param elName name assigned to `elementName`
 * @param value  constant to emit; may be null
 */
private[html] case class ModelConstant(elName: String, value: Object) extends ElementType {
  elementName = elName

  override def compile(doc: Document): JsonElement = value match {
    case null    => JsonNull.INSTANCE
    case present => new JsonPrimitive(present.toString)
  }
}
开发者ID:TokisakiFun,项目名称:Katipo,代码行数:56,代码来源:ElementType.scala
示例7: LaporBot
//设置package包名称以及导入依赖的类
package io.github.asepsaep.laporcrawler.bot
import scala.collection.JavaConverters._
import org.jsoup.nodes.Document
import org.jsoup.Jsoup
import io.github.asepsaep.laporcrawler.model.Ticket
/**
 * Downloads and parses the complaint page `http://36.66.86.72/pengaduan/<ticketId>`.
 *
 * @param ticketId numeric id appended to the page URL and stored in the resulting ticket
 */
case class LaporBot(ticketId: Int) {
  // Accumulates the fields found while parsing; updated through `copy`.
  private var ticket = new Ticket()
  private val url = "http://36.66.86.72/pengaduan/" + ticketId

  //  System.setProperty("socksProxyHost", "127.0.0.1")
  //  System.setProperty("socksProxyPort", "10001")
  //  System.setProperty("socksProxyVersion", "5")

  /**
   * Fetches the page (30 s timeout) and parses it.
   *
   * @return the parsed ticket, or `None` when the page contains an element with class `no-data`
   */
  def crawl(): Option[Ticket] = {
    val doc = Jsoup.connect(url).timeout(30000).get()
    if (doc.getElementsByClass("no-data").isEmpty) Option(parse(doc)) else None
  }

  /**
   * Extracts title, content, the labelled detail fields and the dispatch target from `doc`.
   *
   * The page is expected to contain `#row_Subject`, `#row_content`, a `.feedback-details`
   * element and an `.administrator .comment-content` element; a missing one results in a
   * `NullPointerException`.
   */
  def parse(doc: Document): Ticket = {
    val id = ticketId
    val title = doc.getElementById("row_Subject").text
    val rawContent = doc.getElementById("row_content").text
    // Everything before the first ", " is dropped when such a separator exists.
    val splitContent = rawContent.split(", ")
    val content = if (splitContent.length > 1) splitContent.tail.mkString(", ") else rawContent
    ticket = ticket.copy(id = id, title = title, content = content)

    val details = doc.getElementsByClass("feedback-details").first.getElementsByTag("p").asScala
    for (p <- details) {
      val span = p.getElementsByTag("span")
      // A paragraph without any <span> carries no label/value pair; skip it instead of
      // dereferencing the null that `first` returns for an empty selection.
      if (!span.isEmpty) {
        // The first span holds the label, the last span holds the value.
        span.first.text match {
          case "USER:" => ticket = ticket.copy(user = Some(span.last.text))
          case "PLATFORM:" => ticket = ticket.copy(platform = Some(span.last.text))
          case "TANGGAL:" => ticket = ticket.copy(date = Some(span.last.text))
          case "KATEGORI:" => ticket = ticket.copy(category = Some(span.last.text))
          case "AREA:" => ticket = ticket.copy(area = Some(span.last.text))
          case "STATUS:" => ticket = ticket.copy(status = Some(span.last.text))
          case _ => ()
        }
      }
    }

    val dispatchedTo = doc.select(".administrator .comment-content").first.getElementsByTag("p").first.getElementsByTag("span").first.getElementsByTag("b").first.text
    ticket = ticket.copy(dispatchedTo = Some(dispatchedTo))
    ticket
  }
}
开发者ID:asepsaep,项目名称:lapor-crawler,代码行数:54,代码来源:LaporBot.scala
示例8: checkElementAndConvert
//设置package包名称以及导入依赖的类
package haishu.crawler.selector
import org.jsoup.nodes.{Document, Element}
/**
 * Returns `element` itself when it already is a [[Document]]; otherwise wraps a clone
 * of it in a new document.
 *
 * The new document takes the base URI of the element's owner document, or an empty
 * base URI when the element is not attached to any document.
 */
private def checkElementAndConvert(element: Element): Element = element match {
  case d: Document => d
  case _ =>
    // ownerDocument() returns null for a detached element; fall back to an empty base URI.
    val baseUri = Option(element.ownerDocument()).map(_.baseUri()).getOrElse("")
    val root = new Document(baseUri)
    // Clone so the element is not removed from its current parent.
    root.appendChild(element.clone())
    root
}
/** Applies the CSS selector built by `Selectors.css(selector)` through `selectElements`. */
override def css(selector: String): Selectable =
  selectElements(Selectors.css(selector))
/** Applies the selector built by `Selectors.css(selector, attrName)` through `selectElements`. */
override def css(selector: String, attrName: String): Selectable =
  selectElements(Selectors.css(selector, attrName))
}
开发者ID:hualongdata,项目名称:hl-crawler,代码行数:24,代码来源:HtmlNode.scala
示例9: StyleguideSpider
//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt
import org.jsoup.nodes.{ Element, Document }
import scala.concurrent.Future
import com.themillhousegroup.scoup.{ ScoupImplicits, Scoup }
import scala.concurrent.ExecutionContext.Implicits.global
import java.net.URL
/**
 * Crawls a site starting from one URL, following only links whose `href` starts with "/".
 *
 * The set of visited URLs is threaded through the whole crawl, so every URL is fetched
 * at most once and each page contributes a single [[Document]] to the result.
 */
object StyleguideSpider extends ScoupImplicits {

  /**
   * @param url          page to start from
   * @param thisPageOnly when true, only `url` itself is fetched
   * @return the documents of all pages that were fetched
   */
  def visit(url: URL, thisPageOnly: Boolean = false): Future[Set[Document]] = {
    // The start page counts as visited, otherwise any link back to it would fetch it again.
    visitLink(url, Set(url), thisPageOnly).map(_._1)
  }

  /**
   * Fetches `url` and, unless `thisPageOnly` is set, the local pages reachable from it.
   *
   * @return the fetched documents together with the updated set of visited URLs
   */
  private def visitLink(url: URL, alreadyVisited: Set[URL], thisPageOnly: Boolean): Future[(Set[Document], Set[URL])] = {
    Scoup.parse(url.toString).flatMap { doc =>
      if (thisPageOnly) {
        Future.successful((Set(doc), alreadyVisited))
      } else {
        visitLinks(url, doc, alreadyVisited)
      }
    }
  }

  /**
   * Visits, one after another, every local link of `doc` that has not been visited yet.
   * The visited set returned by one link is passed on to the next one.
   */
  private def visitLinks(url: URL, doc: Document, alreadyVisited: Set[URL]): Future[(Set[Document], Set[URL])] = {
    val links = doc.select("a").filter(isLocal).map(_.attr("href"))
    val start: Future[(Set[Document], Set[URL])] = Future.successful((Set(doc), alreadyVisited))
    links.map(createFullLocalUrl(url)).foldLeft(start) { (acc, link) =>
      acc.flatMap {
        case (existingDocs, seen) =>
          if (seen.contains(link)) {
            // Already fetched earlier in the crawl (or a repeated link on this page).
            Future.successful((existingDocs, seen))
          } else {
            visitLink(link, seen + link, false).map {
              case (newDocs, seenAfter) => (existingDocs ++ newDocs, seenAfter)
            }
          }
      }
    }
  }

  /** A link is local when its `href` starts with "/". */
  private def isLocal(link: Element): Boolean = {
    val href = link.attr("href")
    href.startsWith("/")
  }

  /** Resolves `link` against `base`. */
  def createFullLocalUrl(base: URL)(link: String): URL = {
    (new java.net.URL(base, link))
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:47,代码来源:StyleguideSpider.scala
示例10: StylesheetFinder
//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt
import com.themillhousegroup.scoup.{ Scoup, ScoupImplicits }
import org.jsoup.nodes.Document
import scala.concurrent.Future
import scala.concurrent.ExecutionContext.Implicits.global
/** Finds the stylesheets referenced from the `<head>` of a document. */
object StylesheetFinder extends ScoupImplicits {

  /** @return the `href` of every `<link>` in the head whose `rel` is exactly "stylesheet" */
  def allStylesheetUrls(doc: Document): Seq[String] = {
    doc.head.select("link").filter { elem =>
      elem.attr("rel") == "stylesheet"
    }.map { elem =>
      elem.attr("href")
    }.toSeq
  }

  /**
   * @return the stylesheet URLs that point at the same site: root-relative ("/a.css") and
   *         path-relative ("css/a.css") references. Protocol-relative ("//host/a.css") and
   *         absolute http/https URLs are excluded.
   */
  def localStylesheetUrls(doc: Document): Seq[String] = {
    allStylesheetUrls(doc).filter { url =>
      // A double slash means protocol-relative, i.e. another host. This has to be excluded
      // explicitly, because such a URL does not start with a protocol specifier either.
      val protocolRelative = url.startsWith("//")
      val hasProtocol = url.startsWith("http:") || url.startsWith("https://")
      !protocolRelative && !hasProtocol
    }
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:29,代码来源:StylesheetFinder.scala
示例11: checkSelector
//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks
import com.themillhousegroup.witchhunt.{ExcessiveSpecificityViolation, RuleEnumerator, Violation, ViolationType}
import org.jsoup.nodes.Document
import com.helger.css.decl.CSSDeclaration
/** Contract for a single stylesheet check, plus a helper for building its result. */
trait WitchhuntViolationCheck {

  /**
   * Checks one selector. All parameters are implicit so that they are passed on to
   * `buildViolation` without being repeated.
   *
   * @return the violation found for the selector, if any
   */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation]

  /**
   * Builds a [[Violation]] from the implicit context and wraps it in `Some`.
   *
   * @param vt             kind of violation
   * @param thresholdValue limit stored in the violation, if any
   * @param violationValue value stored next to the limit, if any
   */
  protected def buildViolation[VT <: ViolationType](vt: VT,
    thresholdValue: Option[Int] = None,
    violationValue: Option[Int] = None)(implicit ruleSet: RuleEnumerator,
    selector: String,
    lineNumber: Int,
    applicablePages: Set[Document]): Option[Violation] = {
    val violation = Violation(
      ruleSet.sourceName,
      ruleSet.sourceUrl,
      lineNumber,
      selector,
      applicablePages.map(_.location),
      vt,
      thresholdValue,
      violationValue
    )
    Some(violation)
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:30,代码来源:WitchhuntViolationCheck.scala
示例12: ExcessiveSpecificityCheck
//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks
import com.themillhousegroup.scoup.ScoupImplicits
import com.themillhousegroup.witchhunt._
import org.jsoup.nodes.Document
import com.helger.css.decl.CSSDeclaration
/** Flags selectors whose specificity is greater than `options.specificityLimit`. */
class ExcessiveSpecificityCheck(options: WitchhuntOptions) extends WitchhuntViolationCheck with ScoupImplicits {

  /**
   * @return an `ExcessiveSpecificityViolation` carrying the limit and the measured value
   *         when the selector exceeds the limit, otherwise `None`
   */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation] = {
    val specificity = Specificity.calculateSingle(selector)
    val withinLimit = specificity.asInt <= options.specificityLimit
    if (withinLimit) None
    else buildViolation(
      ExcessiveSpecificityViolation,
      Some(options.specificityLimit),
      Some(specificity.asInt)
    )
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:26,代码来源:ExcessiveSpecificityCheck.scala
示例13: ExcessiveColorsCheck
//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks
import com.themillhousegroup.scoup.ScoupImplicits
import com.themillhousegroup.witchhunt._
import org.jsoup.nodes.Document
import com.helger.css.decl.{ CSSExpression, CSSDeclaration }
/**
 * Flags the point at which the number of distinct `color` expressions seen by this
 * instance exceeds `options.colorLimit`. The instance is stateful: colors accumulate
 * across calls to `checkSelector`.
 */
class ExcessiveColorsCheck(options: WitchhuntOptions) extends WitchhuntViolationCheck with ScoupImplicits {
  val CSS_COLOR_PROP = "color"

  /** Every color expression collected so far. */
  val knownColors = scala.collection.mutable.Set[CSSExpression]()

  /**
   * Records the `color` expressions of the given declarations.
   *
   * @return an `ExcessiveColorsViolation` when the accumulated count is above the limit
   */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation] = {
    knownColors ++= declarationsWithin.collect {
      case declaration if CSS_COLOR_PROP == declaration.getProperty => declaration.getExpression
    }
    if (knownColors.size <= options.colorLimit) None
    else buildViolation(ExcessiveColorsViolation, Some(options.colorLimit), Some(knownColors.size))
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:27,代码来源:ExcessiveColorsCheck.scala
示例14: UnusedSelectorCheck
//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks
import com.themillhousegroup.scoup.ScoupImplicits
import com.themillhousegroup.witchhunt.{ RuleEnumerator, UnusedSelectorViolation, Violation, ViolationType }
import org.jsoup.nodes.Document
import scala._
import scala.Some
import com.themillhousegroup.witchhunt.Violation
import com.helger.css.decl.CSSDeclaration
/** Flags selectors that match no element in any of the supplied pages. */
object UnusedSelectorCheck extends WitchhuntViolationCheck with ScoupImplicits {

  /**
   * @return an `UnusedSelectorViolation` when no page contains a match, otherwise `None`
   */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation] = {
    // `exists` stops at the first page that contains a matching element.
    val selectorIsUsed = applicablePages.exists { stylePage =>
      stylePage.select(selector).nonEmpty
    }
    if (selectorIsUsed) None else buildViolation(UnusedSelectorViolation)
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:23,代码来源:UnusedSelectorCheck.scala
示例15: Article
//设置package包名称以及导入依赖的类
package gander
import gander.images.Image
import gander.opengraph.OpenGraphData
import org.joda.time.DateTime
import org.jsoup.nodes.{Document, Element}
/**
 * Immutable record describing one article.
 *
 * NOTE(review): the meaning of each field is presumed from its name only; confirm against
 * the code that populates this class before relying on it.
 */
final case class Article(
  title: String,
  cleanedArticleText: Option[String],
  metaDescription: String,
  metaKeywords: String,
  canonicalLink: String,
  domain: String,
  topNode: Option[Element],
  topImage: Option[Image],
  tags: Set[String],
  movies: List[Element],
  finalUrl: String,
  linkHash: String,
  rawHtml: String,
  doc: Document,
  rawDoc: Document,
  publishDate: Option[DateTime],
  additionalData: Map[String, String],
  openGraphData: OpenGraphData
)
开发者ID:lloydmeta,项目名称:gander,代码行数:27,代码来源:Article.scala
示例16: WebsiteUpdateActor
//设置package包名称以及导入依赖的类
package services.schedulers
import javax.inject.Inject
import akka.actor.{Actor, Props}
import net.ruippeixotog.scalascraper.browser.JsoupBrowser.JsoupDocument
import org.jsoup.nodes.Document
import play.api.libs.concurrent.Execution.Implicits.defaultContext
import play.api.libs.ws.WSClient
import services.WebsiteParser.HTMLParser
import services.schedulers.WebsiteUpdateActor.update
/**
 * Actor that, for every `update(url, parseFormat)` message, downloads `url` and passes
 * the parsed page to `HTMLParser.tryParse`.
 */
class WebsiteUpdateActor @Inject()(WSClient: WSClient, HTMLParser: HTMLParser) extends Actor {
  override def receive: Receive = {
    case update(url: String, parseFormat: String) =>
      // NOTE(review): the original `yield` body was cut off after `elements.`, so what should
      // happen with the parsed elements is unknown; the resulting value is currently discarded.
      // NOTE(review): this for-comprehension only type-checks if `tryParse` returns a Future — confirm.
      for {
        doc <- WSClient.url(url).get()
        // Parse the response body as HTML with `url` as the base URI. `Document.createShell`
        // expects a base URI and returns an empty document, so it cannot be used to load the body.
        elements <- HTMLParser.tryParse(JsoupDocument(org.jsoup.Jsoup.parse(doc.body, url)), parseFormat)
      } yield elements
  }
}
/** Message type and `Props` factory for the actor of the same name. */
object WebsiteUpdateActor {

  /** Request to download `url` and parse it according to `parseFormat`. */
  case class update(url: String, parseFormat: String)

  // NOTE(review): the actor class declares injected constructor parameters, while this
  // Props is created without arguments — confirm how the actor is actually instantiated.
  def props: Props = Props[WebsiteUpdateActor]
}
开发者ID:orkunkl,项目名称:AnnouncementAppServer,代码行数:31,代码来源:WebsiteUpdateActor.scala
示例17: UboatNetClassFilter
//设置package包名称以及导入依赖的类
package com.szadowsz.morrigan.ships.uboat.filter
import com.szadowsz.common.net.Uri
import com.szadowsz.maeve.core.instruction.MaeveInstruction
import com.szadowsz.maeve.core.instruction.extractor.JsoupExtractor
import org.jsoup.nodes.Document
import scala.collection.JavaConverters._
/**
 * Extractor for a ship-class page: each call to `extract` appends one row to `rows`
 * holding the ship links, class header, ship counts, technical data and the page URL.
 */
class UboatNetClassFilter extends JsoupExtractor {
  private val classPath: String = "body div[id=content] h1[class=warship_header]"
  private val shipPath: String = classPath + " + p"
  private val techPath : String = "body div[id=content] > div:contains(Technical information) > table[align=center][class=table_subtle] > tbody > tr"
  private val shipNamesPath: String = "body div[id=content] h3 + table"
  private val shipRegex = "[0-9]+".r

  /** One entry per extracted page, in extraction order. */
  var rows : List[List[Any]] = List()

  /**
   * @throws IllegalStateException when the count read from the page text is greater than
   *                               the number of ship links found
   */
  override def extract(queryUrl : Uri, returnedUrl : Uri, inst : MaeveInstruction[_], page: Document): Unit = {
    val className = page.select(classPath).asScala.head.text()
    // First number in the paragraph after the header; -1 when the paragraph has no number.
    val summaryText = page.select(shipPath).asScala.head.text()
    val declaredCount = shipRegex.findFirstIn(summaryText).fold(-1)(_.toInt)
    // The ship links are taken from the second table matched by shipNamesPath.
    val namedShips = page.select(shipNamesPath).get(1).select("td a[href*=/ship/]").asScala.map { anchor =>
      (anchor.attr("href"), anchor.text())
    }
    if (declaredCount > namedShips.length) throw new IllegalStateException()

    // Technical table: first cell is the label, second cell the value.
    val specs = page.select(techPath).asScala.map { row =>
      (row.child(0).text(), row.child(1).text())
    }.toMap

    val record = List(
      namedShips.toList,
      className,
      declaredCount,
      namedShips.length,
      specs("Type"),
      specs("Displacement"),
      specs("Length"),
      specs("Complement"),
      specs("Armament"),
      specs.getOrElse("Max speed",""),
      specs.getOrElse("Engines",""),
      specs.getOrElse("Power",""),
      specs("Notes on class"),
      returnedUrl.toString
    )
    rows = rows :+ record
  }

  override def shouldContinue(): Boolean = false
}
开发者ID:zakski,项目名称:project-morrigan,代码行数:51,代码来源:UboatNetClassFilter.scala
示例18: UsNavyMultiPageFilter
//设置package包名称以及导入依赖的类
package com.szadowsz.morrigan.ships.wiki.us
import com.szadowsz.common.net.Uri
import com.szadowsz.maeve.core.instruction.MaeveInstruction
import com.szadowsz.maeve.core.instruction.extractor.JsoupExtractor
import org.jsoup.nodes.Document
import scala.collection.JavaConverters._
/**
 * Extractor for wiki pages that are either a single ship article (with an infobox) or a
 * list of ships sharing a name. Each call to `extract` appends one list of
 * (link, redirected, hasInfobox, exists, description) tuples to `ships`.
 */
class UsNavyMultiPageFilter extends JsoupExtractor {
  private val REDIRECT_PATH: String = "body div#content div#contentSub span.mw-redirectedfrom"
  private val INFOBOX_PATH: String = "body div#content table.infobox"
  private val SHIP_INSTANCE_PATH: String = "body div.mw-content-ltr > ul > li , body div.mw-content-ltr > dl > dd"

  var ships : List[List[(Option[String], Boolean, Boolean, Boolean, String)]] = List()

  private val prefixes = List("USS","USNS")

  override def extract(queryUrl: Uri, returnedUrl: Uri, inst: MaeveInstruction[_], page: Document): Unit = {
    // True when the `Title` attribute of the anchor contains one of the ship prefixes.
    def namesShip(anchor: org.jsoup.nodes.Element): Boolean = {
      val title = anchor.attr("Title")
      prefixes.exists(prefix => title.contains(prefix))
    }

    val isRedirected = page.select(REDIRECT_PATH).asScala.nonEmpty
    val hasInfo = page.select(INFOBOX_PATH).asScala.length == 1
    // List entries are only looked at when the page has no single infobox.
    val instances = if (hasInfo) List() else page.select(SHIP_INSTANCE_PATH).asScala.toList

    val curr = if (instances.nonEmpty) {
      instances.filter(entry => entry.select("a").asScala.exists(namesShip)).map { entry =>
        // Exception in thread "main" java.net.URISyntaxException: Relative path in absolute URI: http://en.wikipedia.orghttps://donate.wikimedia.org/wiki/Special:FundraiserRedirector%3Futm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
        val nurl = entry.select("a").asScala.find(namesShip).map(anchor => anchor.attr("href"))
        val desc = entry.text()
        // A link carrying a `redlink` query key is reported as not existing.
        val exists = nurl.exists(href => !Uri(href).containsQueryKey("redlink"))
        (nurl, true, true, exists, desc)
      }
    } else {
      List((Some(queryUrl.path), isRedirected, hasInfo, true, ""))
    }
    ships = ships :+ curr
  }

  override def shouldContinue(): Boolean = false
}
开发者ID:zakski,项目名称:project-morrigan,代码行数:40,代码来源:UsNavyMultiPageFilter.scala
示例19: RoyalNavyMultiPageFilter
//设置package包名称以及导入依赖的类
package com.szadowsz.morrigan.ships.wiki.uk.scrape
import com.szadowsz.common.net.Uri
import com.szadowsz.maeve.core.instruction.MaeveInstruction
import com.szadowsz.maeve.core.instruction.extractor.JsoupExtractor
import org.jsoup.nodes.Document
import scala.collection.JavaConverters._
/**
 * Extractor for wiki pages that are either a single ship article (with an infobox) or a
 * list of ships sharing a name. Each call to `extract` appends one list of
 * (link, redirected, hasInfobox, exists, description) tuples to `ships`.
 */
class RoyalNavyMultiPageFilter extends JsoupExtractor {
  private val REDIRECT_PATH: String = "body div#content div#contentSub span.mw-redirectedfrom"
  private val INFOBOX_PATH: String = "body div#content table.infobox"
  private val SHIP_INSTANCE_PATH: String = "body div.mw-content-ltr > ul > li , body div.mw-content-ltr > dl > dd"

  var ships : List[List[(Option[String], Boolean, Boolean, Boolean, String)]] = List()

  override def extract(queryUrl: Uri, returnedUrl: Uri, inst: MaeveInstruction[_], page: Document): Unit = {
    // True when the `Title` attribute of the anchor contains "HMS".
    def namesShip(anchor: org.jsoup.nodes.Element): Boolean =
      anchor.attr("Title").contains("HMS")

    val isRedirected = page.select(REDIRECT_PATH).asScala.nonEmpty
    val hasInfo = page.select(INFOBOX_PATH).asScala.length == 1
    // List entries are only looked at when the page has no single infobox.
    val instances = if (hasInfo) List() else page.select(SHIP_INSTANCE_PATH).asScala.toList

    val curr = if (instances.nonEmpty) {
      instances.filter(entry => entry.select("a").asScala.exists(namesShip)).map { entry =>
        val nurl = entry.select("a").asScala.find(namesShip).map(anchor => anchor.attr("href"))
        val desc = entry.text()
        // A link carrying a `redlink` query key is reported as not existing.
        val exists = nurl.exists(href => !Uri(href).containsQueryKey("redlink"))
        (nurl, true, true, exists, desc)
      }
    } else {
      List((Some(queryUrl.path), isRedirected, hasInfo, true, ""))
    }
    ships = ships :+ curr
  }

  override def shouldContinue(): Boolean = false
}
开发者ID:zakski,项目名称:project-morrigan,代码行数:38,代码来源:RoyalNavyMultiPageFilter.scala
示例20: HtmlLifter
//设置package包名称以及导入依赖的类
package com.twitter.diffy.lifter
import org.jsoup.Jsoup
import org.jsoup.nodes.{Document, Element}
import org.jsoup.select.Elements
import scala.collection.JavaConversions._
/** Converts jsoup nodes into nested [[FieldMap]] structures. */
object HtmlLifter {

  /**
   * Lifts a node recursively.
   *
   * A [[Document]] becomes a map with `head` and `body`; any other element becomes a map
   * with its `tag`, own `text`, `attributes` and lifted `children`. The document case is
   * matched first because a document is itself an element.
   */
  def lift(node: Element): FieldMap[Any] = node match {
    case document: Document =>
      FieldMap(
        Map(
          "head" -> lift(document.head),
          "body" -> lift(document.body)
        )
      )
    case element: Element =>
      val children: Elements = element.children
      // Relies on the implicit Java-to-Scala collection conversions imported by the file.
      val attributes =
        FieldMap[String](
          element.attributes.asList.map(attribute => attribute.getKey -> attribute.getValue).toMap
        )
      FieldMap(
        Map(
          "tag" -> element.tagName,
          "text" -> element.ownText,
          "attributes" -> attributes,
          "children" -> children.map(child => lift(child))
        )
      )
  }

  /** Parses an HTML string into a jsoup document. */
  def decode(html: String): Document = Jsoup.parse(html)
}
开发者ID:sachinmanchanda,项目名称:diffy_unicast,代码行数:40,代码来源:HtmlLifter.scala
注:本文中的org.jsoup.nodes.Document类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论