Scala Document类代码示例

OStack程序员社区-中国程序员成长平台 › 门户 › 编程 › Scala › Scala教程

原作者: [db:作者] 来自: [db:来源] 收藏邀请

本文整理汇总了Scala中org.jsoup.nodes.Document类的典型用法代码示例。如果您正苦于以下问题：Scala Document类的具体用法？Scala Document怎么用？Scala Document使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。

在下文中一共展示了Document类的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Scala代码示例。

示例1: NoticeServiceObjects

//设置package包名称以及导入依赖的类
package com.zhranklin.homepage.notice

import org.json4s._
import org.jsoup.Jsoup
import org.jsoup.nodes.Document

/**
 * Declares the notice services of this package and collects them in `serviceList`.
 *
 * NOTE(review): the runs of `?` inside the string literals below appear to be non-ASCII
 * text lost in an encoding round-trip. The literals are left as found because the original
 * text cannot be recovered from this file; only the arrow operators were restored.
 */
object NoticeServiceObjects {

  /**
   * Service whose content extractor, date extractor, URL pattern and URL template are
   * supplied together as one tuple (`initVal`).
   */
  trait ServiceBase extends IndexService with FunNoticeFetcher with SelectorUrlService {
    val initVal: (Document => String, Document => String, String, String)
    // lazy: the tuple is destructured on first access, not while the trait is being initialised
    lazy val (getContent, getDateStr, urlPattern, template) = initVal
  }

  /**
   * Service for law.scu.edu.cn: the list URL returns JSON, and every entry of its `data`
   * array is turned into a [[NoticeEntry]] pointing at the matching detail page.
   *
   * @param title  suffix appended to the service name
   * @param listId value sent as `nextcid` in the detail-page URL
   */
  class LawService(title: String, listId: String) extends NoticeService(s"??? - $title") with UrlService with IndexService with FunNoticeFetcher {
    val getContent = contentF("div.text")
    val getDateStr = dateF("span:contains(????)")
    val template = "http://law.scu.edu.cn/xjax?arg=8573&arg=<index>&arg=20&arg=list&clazz=PortalArticleAction&method=list"

    /** Detail-page URL for the article with the given id. */
    def getUrl(id: String) = s"http://law.scu.edu.cn/detail.jsp?portalId=725&cid=8385&nextcid=$listId&aid=$id"

    override def noticeUrlsFromUrl(url: String): Iterable[NoticeEntry] = {
      val jsonStr = Jsoup.connect(url).execute().body()
      val json = jackson.parseJson(jsonStr)
      // the cast fails with ClassCastException when `data` is not a JSON array
      json.\("data").asInstanceOf[JArray].arr.map(
        jo => NoticeEntry(getUrl(jo.\("id").values.toString), Some(jo.\("subject").values.toString)))
    }
  }

  /** All services: (name, URL template) pairs first, then the hand-configured ones. */
  val serviceList = List(
    "???? - ???? - test" ->
      "http://www.sculj.cn/Special_News.asp?SpecialID=40&SpecialName=%D1%A7%D4%BA%B6%AF%CC%AC&page=<index>",
    "???? - ???? - test" -> "http://sesu.scu.edu.cn/news/list_1_<index>.html",
    "???? - ????" -> "http://sesu.scu.edu.cn/gonggao/list_2_<index>.html",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xsky/xskb/H951901index_<index>.htm",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xytz/H9502index_<index>.htm",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xyxw/H9501index_<index>.htm",
    "????? - ??? - test" -> "http://cs.scu.edu.cn/cs/fwzy/ftl/H951204index_<index>.htm",
    "???? - test" -> "http://news.scu.edu.cn/news2012/cdzx/I0201index_<index>.htm",
    "???? - ????" -> "http://math.scu.edu.cn/news.asp?PAGE=<index>",
    "?????? - ????" -> "http://seei.scu.edu.cn/student,p<index>,index.jsp",
    "????? - ????" -> "http://flc2.scu.edu.cn/foreign/a/xueyuangonggao/list_27_<index>.html"
  ).map { tp =>
    new NoticeService(tp._1) with UniversalUrlService with UniversalNoticeFetcher with IndexService {
      val template = tp._2
    }
  } ++ List(
    new NoticeService("??? - ??") with ServiceBase {
      val initVal = (selectorF("input[name=news.content]")(_.first.attr("value")), dateF("table[width=900] td:contains(????)"),
        "newsShow.*", "http://jwc.scu.edu.cn/jwc/moreNotice.action?url=moreNotice.action&type=2&keyWord=&pager.pageNow=<index>")},
    new LawService("????", "8572"),
    new LawService("????", "8573")
  )
}

开发者ID:zhranklin，项目名称:Private_Blog，代码行数:54，代码来源:NoticeServiceObjects.scala

示例2: IsapReader

//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma.readers

import org.slf4j.LoggerFactory
import org.springframework.batch.item.ItemReader

import org.jsoup.Jsoup

import org.jsoup.nodes.Document
import com.gargoylesoftware.htmlunit.WebClient
import pl.mojepanstwo.sap.toakoma._

/** URL constants used to address a document on isap.sejm.gov.pl. */
object IsapReader {
  /** Scheme and host of the site. */
  val BASE_URL = "http://isap.sejm.gov.pl"

  /** Prefix of the details page; the document id is appended to it. */
  val URL = s"$BASE_URL/DetailsServlet?id="
}

/**
 * Reader that yields the details page of one document exactly once.
 *
 * The first call to `read` fetches and returns the page; every later call returns `null`.
 *
 * @param id document id appended to `IsapReader.URL`
 */
class IsapReader(val id: String) extends ItemReader[Document] {

  val logger = LoggerFactory.getLogger(this.getClass())

  /** Set once the single page has been requested. */
  var last = false

  /**
   * @return the fetched page on the first call, `null` afterwards
   * @throws NoSuchDocumentException when the page carries the site's "no such act" message
   */
  def read : Document = {
    logger.trace("read")

    if (last) {
      null
    } else {
      // flagged before the request, so a failed fetch is not retried by a later call
      last = true
      val page = Jsoup.connect(IsapReader.URL + id).get
      val missing = page.body.text.contains("Brak aktu prawnego o podanym adresie publikacyjnym !")
      if (missing) throw new NoSuchDocumentException
      page
    }
  }
}

开发者ID:PrawoPolskie，项目名称:toakoma，代码行数:36，代码来源:IsapReader.scala

示例3: get

//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma.services

import java.net.URL
import java.io.File
import org.apache.commons.io.FileUtils
import org.jsoup.nodes.Document
import com.gargoylesoftware.htmlunit._
import org.jsoup.Jsoup

/** Abstraction over how pages and files are obtained. */
trait Scraper {

  /** Returns the page at `url` as a parsed document. */
  def get(url: String): Document

  /**
   * Stores the file referenced by `fileUrl` at `filePath`.
   *
   * @return a path string produced by the implementation
   */
  def dowloadFile(fileUrl: String, filePath: String): String
}

/** [[Scraper]] that loads pages through an HtmlUnit `WebClient` and files through commons-io. */
class DefaultScraperService extends Scraper {

  val webClient = new WebClient

  /** Loads `url` with the web client and parses the response content with jsoup. */
  def get(url: String): Document = {
    // a refresh request is answered by loading the refresh target right away
    val followRefresh = new RefreshHandler {
      override def handleRefresh(page: Page, url: URL, i: Int): Unit = webClient.getPage(url)
    }
    webClient.setRefreshHandler(followRefresh)

    val fetched: Page = webClient.getPage(url)
    val html = fetched.getWebResponse.getContentAsString
    Jsoup.parse(html)
  }

  /**
   * Copies the content behind `fileUrl` into the file at `filePath`.
   *
   * @return absolute path of the written file
   */
  def dowloadFile(fileUrl: String, filePath: String): String = {
    val source = new URL(fileUrl)
    val target = new File(filePath)
    FileUtils.copyURLToFile(source, target)
    target.getAbsolutePath()
  }

}

开发者ID:PrawoPolskie，项目名称:toakoma，代码行数:35，代码来源:Scraper.scala

示例4: ResourceScraperService

//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma

import pl.mojepanstwo.sap.toakoma.services.Scraper
import org.jsoup.nodes.Document
import org.jsoup.Jsoup
import scala.io.Source
import java.io.File
import java.nio.file.Files
import org.apache.commons.io.IOUtils
import java.io.FileOutputStream

/**
 * [[Scraper]] that answers from classpath resources under `isap/<id>/<type>.*` instead of
 * the network. The id and type are taken from the `id=` and `type=` parts of the URL.
 */
class ResourceScraperService extends Scraper {

  // Captures the `id` value and the numeric `type` value of a request URL.
  private val resourceUrl = ".*id=(.*)&type=([0-9]+).*".r

  /**
   * Parses the HTML resource `isap/<id>/<type>.html`.
   *
   * @throws scala.MatchError when `url` does not contain `id=...&type=<digits>`
   */
  def get(url: String) : Document = {
    val resourceUrl(id, docType) = url
    val html = Source.fromResource("isap/" + id + "/" + docType + ".html")
    // close the underlying stream even when reading or parsing fails
    try {
      Jsoup.parse(html.mkString)
    } finally {
      html.close()
    }
  }

  /**
   * Copies the PDF resource `/isap/<id>/<type>.pdf` to `filePath`.
   *
   * @return absolute path of the written file
   * @throws scala.MatchError when `fileUrl` does not contain `id=...&type=<digits>`
   * @throws java.io.FileNotFoundException when the resource is not on the classpath
   */
  def dowloadFile(fileUrl:String, filePath:String) : String = {
    val resourceUrl(id, docType) = fileUrl
    val resourcePath = "/isap/" + id + "/" + docType + ".pdf"

    // getResourceAsStream returns null for a missing resource; fail before touching filePath
    val src = getClass.getResourceAsStream(resourcePath)
    if (src == null) throw new java.io.FileNotFoundException("classpath resource not found: " + resourcePath)

    try {
      val dest = new File(filePath)
      val out = new FileOutputStream(dest)
      try {
        IOUtils.copy(src, out)
      } finally {
        out.close()
      }
      dest.getAbsolutePath
    } finally {
      src.close()
    }
  }

}

开发者ID:PrawoPolskie，项目名称:toakoma，代码行数:33，代码来源:ResourceScraperService.scala

示例5: GgleLoginTest

//设置package包名称以及导入依赖的类
package com.szadowsz.tarbh.ggle

import com.szadowsz.common.net.Uri
import com.szadowsz.maeve.core.MaeveDriver
import com.szadowsz.maeve.core.browser.MaeveConf
import com.szadowsz.maeve.core.instruction.MaeveInstruction
import com.szadowsz.maeve.core.instruction.extractor.JsoupExtractor
import com.szadowsz.maeve.core.instruction.target.single.SingleTarget
import com.szadowsz.maeve.gglegrp.actions.GgleExecutor
import org.jsoup.nodes.Document


/**
 * Manual entry point that feeds one login instruction to a [[MaeveDriver]] and runs it.
 * Link, credentials and group name are blank here and have to be filled in before use.
 */
object GgleLoginTest {
  private val link = ""
  private val username: String = ""
  private val passwd: String = ""
  private val urlOfGrp = Uri(link)
  private val groupName: String = ""

  /** Driver configuration: JavaScript on, the proxy values below, script errors not thrown. */
  private def buildConfig(): MaeveConf =
    MaeveConf()
      .setJavaScriptEnabled(true)
      .setHTTPProxy("", 0, Nil)
      .setThrowExceptionOnScriptError(false)

  /** Extractor that ignores the page and never asks to continue. */
  private class TestExtractor extends JsoupExtractor {
    override def extract(queryUrl: Uri, returnedUrl: Uri, inst: MaeveInstruction[_], page: Document): Unit = {}
    override def shouldContinue(): Boolean = false
  }

  def main(args: Array[String]): Unit = {
    System.setProperty("webdriver.chrome.driver", ".\\chromedriver_win32\\chromedriver.exe")

    val driver = new MaeveDriver(buildConfig())

    val target = SingleTarget(urlOfGrp)
    val extractor = new TestExtractor()
    val login = new GgleExecutor(username, passwd)
    val instruction =
      MaeveInstruction(groupName, target, login, extractor, "./data/grp/", false, false, false, MaeveConf().setNoProxy())

    driver.feedInstruction(instruction)
    driver.scrapeUsingCurrInstruction()
  }
}

开发者ID:zakski，项目名称:project-disco，代码行数:48，代码来源:GgleLoginTest.scala

示例6: compile

//设置package包名称以及导入依赖的类
package indi.lewis.spider.html

import java.util

import com.google.gson._
import org.jsoup.Jsoup
import org.jsoup.nodes.Document


/** A named node that can turn a document into a Gson `JsonElement`. */
private[html] trait ElementType {

  /** Name of this node; uninitialised (`null`) until an implementation assigns it. */
  var elementName: String = _

  /** Produces the JSON for this node from an already parsed document. */
  def compile(doc: Document): JsonElement

  /** Parses `doc` as HTML with jsoup, then delegates to the `Document` overload. */
  def compile(doc: String): JsonElement = {
    val parsed = Jsoup.parse(doc)
    compile(parsed)
  }
}

/**
 * Object node: compiles to a `JsonObject` holding one member per entry of `properties`,
 * keyed by that entry's `elementName`. The default name is "root".
 */
private[html] case class ModelParent() extends ElementType {
  elementName = "root"

  /** Creates a parent with the given name instead of "root". */
  def this(elementName: String) = {
    this()
    this.elementName = elementName
  }

  /** Child nodes, compiled in list order. */
  val properties: java.util.ArrayList[ElementType] = new util.ArrayList[ElementType]()

  override def compile(doc: Document): JsonElement = {
    val result = new JsonObject
    // the size is read once, before the first child is compiled
    (0 until properties.size()).foreach { index =>
      val child = properties.get(index)
      result.add(child.elementName, child.compile(doc))
    }
    result
  }
}

/**
 * Node whose actual shape is chosen per document: `f` picks an [[ElementType]] for the
 * document and that element is then compiled against the same document.
 */
private[html] case class ModelElement(elName: String, f: Document => ElementType) extends ElementType {
  elementName = elName

  override def compile(doc: Document): JsonElement = {
    val chosen = f(doc)
    chosen.compile(doc)
  }
}

/** Array node: compiles every entry of `array` and collects the results in a `JsonArray`. */
private[html] case class ModelArray(elName: String) extends ElementType {

  /** Items, compiled in list order. */
  val array: java.util.ArrayList[ElementType] = new util.ArrayList[ElementType]()
  elementName = elName

  override def compile(doc: Document): JsonElement = {
    val result = new JsonArray
    // the size is read once, before the first item is compiled
    (0 until array.size()).foreach { index =>
      val item = array.get(index)
      result.add(item.compile(doc))
    }
    result
  }
}

/**
 * Constant node: ignores the document and yields `value.toString` as a JSON string,
 * or JSON null when `value` is `null`.
 */
private[html] case class ModelConstant(elName: String, value: Object) extends ElementType {
  elementName = elName

  override def compile(doc: Document): JsonElement =
    value match {
      case null    => JsonNull.INSTANCE
      case present => new JsonPrimitive(present.toString)
    }
}

开发者ID:TokisakiFun，项目名称:Katipo，代码行数:56，代码来源:ElementType.scala

示例7: LaporBot

//设置package包名称以及导入依赖的类
package io.github.asepsaep.laporcrawler.bot

import scala.collection.JavaConverters._

import org.jsoup.nodes.Document
import org.jsoup.Jsoup

import io.github.asepsaep.laporcrawler.model.Ticket

/**
 * Fetches one complaint page and turns it into a [[Ticket]].
 *
 * @param ticketId id appended to the complaint URL and stored as the ticket id
 */
case class LaporBot(ticketId: Int) {

  // Working copy that parse() fills in step by step. It is instance state, so values set by
  // an earlier parse() call remain unless the next page overwrites them.
  private var ticket = new Ticket()
  private val url = "http://36.66.86.72/pengaduan/" + ticketId

  //  System.setProperty("socksProxyHost", "127.0.0.1")
  //  System.setProperty("socksProxyPort", "10001")
  //  System.setProperty("socksProxyVersion", "5")

  /**
   * Downloads the page (30 s timeout) and parses it.
   *
   * @return the parsed ticket, or `None` when the page contains an element of class `no-data`
   */
  def crawl(): Option[Ticket] = {
    val doc = Jsoup.connect(url).timeout(30000).get()
    if (doc.getElementsByClass("no-data").isEmpty) Option(parse(doc)) else None
  }

  /**
   * Reads title, content, the labelled detail fields and the dispatch target from `doc`.
   *
   * Element lookups are not null-checked: a page lacking one of the expected elements
   * fails with a `NullPointerException`.
   */
  def parse(doc: Document): Ticket = {
    val title = doc.getElementById("row_Subject").text
    val rawContent = doc.getElementById("row_content").text
    val splitContent = rawContent.split(", ")
    // when the text contains ", ", everything before the first occurrence is dropped
    val content = if (splitContent.length > 1) splitContent.tail.mkString(", ") else rawContent

    ticket = ticket.copy(id = ticketId, title = title, content = content)

    val details = doc.getElementsByClass("feedback-details").first.getElementsByTag("p").asScala
    for (p <- details) {
      // first span holds the label, last span holds the value
      val span = p.getElementsByTag("span")
      span.first.text match {
        case "USER:"     => ticket = ticket.copy(user = Some(span.last.text))
        case "PLATFORM:" => ticket = ticket.copy(platform = Some(span.last.text))
        case "TANGGAL:"  => ticket = ticket.copy(date = Some(span.last.text))
        case "KATEGORI:" => ticket = ticket.copy(category = Some(span.last.text))
        case "AREA:"     => ticket = ticket.copy(area = Some(span.last.text))
        case "STATUS:"   => ticket = ticket.copy(status = Some(span.last.text))
        case _           => ()
      }
    }

    val dispatchedTo = doc.select(".administrator .comment-content").first
      .getElementsByTag("p").first
      .getElementsByTag("span").first
      .getElementsByTag("b").first.text
    ticket = ticket.copy(dispatchedTo = Some(dispatchedTo))

    ticket
  }

}

开发者ID:asepsaep，项目名称:lapor-crawler，代码行数:54，代码来源:LaporBot.scala

示例8: checkElementAndConvert

//设置package包名称以及导入依赖的类
package haishu.crawler.selector

import org.jsoup.nodes.{Document, Element}


  /**
   * Returns `element` itself when it already is a `Document`; otherwise returns a new
   * `Document` (same base URI as the element's owner document) containing a clone of it.
   */
  private def checkElementAndConvert(element: Element): Element = element match {
    case alreadyDocument: Document => alreadyDocument
    case other =>
      val wrapper = new Document(other.ownerDocument().baseUri())
      // a clone is attached, so `other` keeps its place in its own tree
      wrapper.appendChild(other.clone())
      wrapper
  }

  /** Selects with the CSS selector built from `selector`. */
  override def css(selector: String): Selectable =
    selectElements(Selectors.css(selector))

  /** Selects with the CSS selector built from `selector` and `attrName`. */
  override def css(selector: String, attrName: String): Selectable =
    selectElements(Selectors.css(selector, attrName))
}

开发者ID:hualongdata，项目名称:hl-crawler，代码行数:24，代码来源:HtmlNode.scala

示例9: StyleguideSpider

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt

import org.jsoup.nodes.{ Element, Document }
import scala.concurrent.Future
import com.themillhousegroup.scoup.{ ScoupImplicits, Scoup }
import scala.concurrent.ExecutionContext.Implicits.global
import java.net.URL


/** Crawls a page and the local pages it links to, collecting the parsed documents. */
object StyleguideSpider extends ScoupImplicits {

  /**
   * Parses `url` and, unless `thisPageOnly` is set, follows its local links recursively.
   *
   * @param url          page to start from
   * @param thisPageOnly when true only the start page is returned
   * @return the documents that were visited
   */
  def visit(url: URL, thisPageOnly: Boolean = false): Future[Set[Document]] = {
    // The start page counts as visited from the beginning; otherwise a link back to it
    // from the start page itself would make the crawl fetch and walk it a second time.
    visitLink(url, Set(url), thisPageOnly)
  }

  private def visitLink(url: URL, alreadyVisited: Set[URL], thisPageOnly: Boolean): Future[Set[Document]] = {
    Scoup.parse(url.toString).flatMap { doc =>
      if (thisPageOnly) {
        Future.successful(Set(doc))
      } else {
        visitLinks(url, doc, alreadyVisited)
      }
    }
  }

  // Visits the unvisited local links of `doc` one after another and merges the results
  // with `doc` itself. The visited set grows along one path only; sibling links do not
  // see each other's visits.
  private def visitLinks(url: URL, doc: Document, alreadyVisited: Set[URL]) = {
    val links = doc.select("a").filter(isLocal).map(_.attr("href"))
    val unvisited = links.map(createFullLocalUrl(url)).filter(!alreadyVisited.contains(_))
    unvisited.foldLeft(Future.successful(Set(doc))) {
      case (acc, link) =>
        for {
          existingDocs <- acc
          newDocs <- visitLink(link, alreadyVisited + link, false)
        } yield (existingDocs ++ newDocs)
    }
  }

  // A link is local when its href starts with "/".
  private def isLocal(link: Element): Boolean =
    link.attr("href").startsWith("/")

  /** Resolves `link` against `base`. */
  def createFullLocalUrl(base: URL)(link: String): URL = {
    new java.net.URL(base, link)
  }
}

开发者ID:themillhousegroup，项目名称:witchhunt，代码行数:47，代码来源:StyleguideSpider.scala

示例10: StylesheetFinder

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt

import com.themillhousegroup.scoup.{ Scoup, ScoupImplicits }
import org.jsoup.nodes.Document

import scala.concurrent.Future
import scala.concurrent.ExecutionContext.Implicits.global


/** Finds the stylesheet links declared in a document's head. */
object StylesheetFinder extends ScoupImplicits {

  /** The `href` of every `link` element in the head whose `rel` is exactly "stylesheet". */
  def allStylesheetUrls(doc: Document): Seq[String] = {
    doc.head.select("link").filter { elem =>
      elem.attr("rel") == "stylesheet"
    }.map { elem =>
      elem.attr("href")
    }.toSeq
  }

  /**
   * The stylesheet URLs that point at the document's own host: everything except
   * protocol-relative (`//host/...`) and `http:` / `https:` URLs.
   *
   * The previous condition let protocol-relative URLs through, because its second
   * alternative ("no http/https prefix") is also true for them.
   */
  def localStylesheetUrls(doc: Document): Seq[String] = {
    allStylesheetUrls(doc).filterNot { url =>
      // a double slash means protocol-relative, i.e. another host
      url.startsWith("//") ||
        // an explicit protocol specifier
        url.startsWith("http:") || url.startsWith("https:")
    }
  }
}

开发者ID:themillhousegroup，项目名称:witchhunt，代码行数:29，代码来源:StylesheetFinder.scala

示例11: checkSelector

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks

import com.themillhousegroup.witchhunt.{ExcessiveSpecificityViolation, RuleEnumerator, Violation, ViolationType}
import org.jsoup.nodes.Document
import com.helger.css.decl.CSSDeclaration

/** A single check that is run against one CSS selector. */
trait WitchhuntViolationCheck {

  /** Returns a violation for `selector`, or `None` when the check passes. */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation]

  /**
   * Builds a [[Violation]] of type `vt` from the implicit rule set, selector, line number
   * and pages; each page contributes its `location`.
   *
   * @return always a `Some`
   */
  protected def buildViolation[VT <: ViolationType](
      vt: VT,
      thresholdValue: Option[Int] = None,
      violationValue: Option[Int] = None)(
      implicit ruleSet: RuleEnumerator,
      selector: String,
      lineNumber: Int,
      applicablePages: Set[Document]): Option[Violation] = {
    val violation = Violation(
      ruleSet.sourceName,
      ruleSet.sourceUrl,
      lineNumber,
      selector,
      applicablePages.map(_.location),
      vt,
      thresholdValue,
      violationValue
    )
    Some(violation)
  }
}

开发者ID:themillhousegroup，项目名称:witchhunt，代码行数:30，代码来源:WitchhuntViolationCheck.scala

示例12: ExcessiveSpecificityCheck

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks

import com.themillhousegroup.scoup.ScoupImplicits
import com.themillhousegroup.witchhunt._
import org.jsoup.nodes.Document
import com.helger.css.decl.CSSDeclaration

/** Flags selectors whose specificity is above `options.specificityLimit`. */
class ExcessiveSpecificityCheck(options: WitchhuntOptions) extends WitchhuntViolationCheck with ScoupImplicits {

  /**
   * @return an `ExcessiveSpecificityViolation` carrying the limit and the measured value
   *         when the selector's specificity exceeds the limit, otherwise `None`
   */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation] = {
    val measured = Specificity.calculateSingle(selector).asInt
    val limit = options.specificityLimit

    if (measured <= limit) None
    else buildViolation(ExcessiveSpecificityViolation, Some(limit), Some(measured))
  }
}

开发者ID:themillhousegroup，项目名称:witchhunt，代码行数:26，代码来源:ExcessiveSpecificityCheck.scala

示例13: ExcessiveColorsCheck

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks

import com.themillhousegroup.scoup.ScoupImplicits
import com.themillhousegroup.witchhunt._
import org.jsoup.nodes.Document
import com.helger.css.decl.{ CSSExpression, CSSDeclaration }

/**
 * Flags the point at which the number of distinct `color` expressions seen so far goes
 * above `options.colorLimit`. The expressions are remembered across calls.
 */
class ExcessiveColorsCheck(options: WitchhuntOptions) extends WitchhuntViolationCheck with ScoupImplicits {

  val CSS_COLOR_PROP = "color"

  /** Every color expression encountered by this instance so far. */
  val knownColors = scala.collection.mutable.Set[CSSExpression]()

  /**
   * Records the color expressions of `declarationsWithin`, then compares the running total
   * with the limit.
   *
   * @return an `ExcessiveColorsViolation` carrying the limit and the total when the total
   *         exceeds the limit, otherwise `None`
   */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation] = {
    val colorsHere = declarationsWithin.collect {
      case declaration if CSS_COLOR_PROP == declaration.getProperty => declaration.getExpression
    }
    knownColors ++= colorsHere

    val total = knownColors.size
    if (total <= options.colorLimit) None
    else buildViolation(ExcessiveColorsViolation, Some(options.colorLimit), Some(total))
  }
}

开发者ID:themillhousegroup，项目名称:witchhunt，代码行数:27，代码来源:ExcessiveColorsCheck.scala

示例14: UnusedSelectorCheck

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks

import com.themillhousegroup.scoup.ScoupImplicits
import com.themillhousegroup.witchhunt.{ RuleEnumerator, UnusedSelectorViolation, Violation, ViolationType }
import org.jsoup.nodes.Document
import scala._
import scala.Some
import com.themillhousegroup.witchhunt.Violation
import com.helger.css.decl.CSSDeclaration

/** Flags selectors that match nothing on any of the supplied pages. */
object UnusedSelectorCheck extends WitchhuntViolationCheck with ScoupImplicits {

  /**
   * @return `None` as soon as one page has an element matching `selector`; an
   *         `UnusedSelectorViolation` when no page does
   */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation] = {
    // exists stops at the first page with a match
    val matchedSomewhere = applicablePages.exists(page => page.select(selector).nonEmpty)
    if (matchedSomewhere) None else buildViolation(UnusedSelectorViolation)
  }
}

开发者ID:themillhousegroup，项目名称:witchhunt，代码行数:23，代码来源:UnusedSelectorCheck.scala

示例15: Article

//设置package包名称以及导入依赖的类
package gander

import gander.images.Image
import gander.opengraph.OpenGraphData
import org.joda.time.DateTime
import org.jsoup.nodes.{Document, Element}


/**
 * Immutable record describing one article: its textual fields, link data, optional top
 * node / top image / publish date, the parsed documents (`doc`, `rawDoc`) and the raw HTML.
 */
final case class Article(
  title: String,
  cleanedArticleText: Option[String],
  metaDescription: String,
  metaKeywords: String,
  canonicalLink: String,
  domain: String,
  topNode: Option[Element],
  topImage: Option[Image],
  tags: Set[String],
  movies: List[Element],
  finalUrl: String,
  linkHash: String,
  rawHtml: String,
  doc: Document,
  rawDoc: Document,
  publishDate: Option[DateTime],
  additionalData: Map[String, String],
  openGraphData: OpenGraphData
)

开发者ID:lloydmeta，项目名称:gander，代码行数:27，代码来源:Article.scala

示例16: WebsiteUpdateActor

//设置package包名称以及导入依赖的类
package services.schedulers

import javax.inject.Inject

import akka.actor.{Actor, Props}
import net.ruippeixotog.scalascraper.browser.JsoupBrowser.JsoupDocument
import org.jsoup.nodes.Document
import play.api.libs.concurrent.Execution.Implicits.defaultContext
import play.api.libs.ws.WSClient
import services.WebsiteParser.HTMLParser
import services.schedulers.WebsiteUpdateActor.update


/**
 * Actor that, on an `update` message, downloads the given URL and hands the parsed page to
 * the HTML parser together with the requested parse format.
 */
class WebsiteUpdateActor @Inject()(WSClient: WSClient, HTMLParser: HTMLParser) extends Actor {
  override def receive: Receive = {
    case update(url: String, parseFormat: String) =>
      for {
        response <- WSClient.url(url).get()
        // Parse the response body as HTML. The earlier `Document.createShell(body)` treated
        // the body as a base URI and always produced an empty document.
        elements <- HTMLParser.tryParse(JsoupDocument(org.jsoup.Jsoup.parse(response.body)), parseFormat)
      } yield {
        // NOTE(review): the original text broke off after `elements.`; what should be done
        // with the parsed elements is not visible here, so they are only yielded. The
        // resulting future is discarded by `receive`, including any failure it carries.
        elements
      }
  }
}
/** Message type and `Props` factory for [[WebsiteUpdateActor]]. */
object WebsiteUpdateActor {

  /** Request to fetch `url` and parse it according to `parseFormat`. */
  case class update(url: String, parseFormat: String)

  /** `Props` created from the actor class alone, without constructor arguments. */
  def props: Props = Props[WebsiteUpdateActor]
}

开发者ID:orkunkl，项目名称:AnnouncementAppServer，代码行数:31，代码来源:WebsiteUpdateActor.scala

示例17: UboatNetClassFilter

//设置package包名称以及导入依赖的类
package com.szadowsz.morrigan.ships.uboat.filter

import com.szadowsz.common.net.Uri
import com.szadowsz.maeve.core.instruction.MaeveInstruction
import com.szadowsz.maeve.core.instruction.extractor.JsoupExtractor
import org.jsoup.nodes.Document

import scala.collection.JavaConverters._


/**
 * Extractor for a ship-class page: reads the class header, the ship count, the ship links
 * and the "Technical information" table, and appends them as one row to `rows`.
 */
class UboatNetClassFilter extends JsoupExtractor {
  private val classPath: String = "body div[id=content] h1[class=warship_header]"
  private val shipPath: String = classPath + " + p"
  private val techPath : String = "body div[id=content] > div:contains(Technical information) > table[align=center][class=table_subtle] > tbody > tr"
  private val shipNamesPath: String = "body div[id=content] h3 + table"

  private val shipRegex = "[0-9]+".r

  /** One entry per extracted page, in extraction order. */
  var rows : List[List[Any]] = List()

  /**
   * Appends one row for `page`.
   *
   * Fails when the page lacks the header, the paragraph after it, a second `h3 + table`
   * match or one of the technical rows looked up without a default.
   *
   * @throws IllegalStateException when the count read from the page is larger than the
   *                               number of ship links found
   */
  override def extract(queryUrl : Uri, returnedUrl : Uri, inst : MaeveInstruction[_], page: Document): Unit = {
    val className = page.select(classPath).asScala.head.text()

    // first run of digits in the paragraph after the header; -1 when there is none
    val countText = page.select(shipPath).asScala.head.text()
    val declaredCount = shipRegex.findFirstIn(countText).fold(-1)(_.toInt)

    // (href, link text) pairs taken from the second matching table
    val shipLinks = page.select(shipNamesPath).get(1).select("td a[href*=/ship/]").asScala
      .map(anchor => (anchor.attr("href"), anchor.text()))

    if (declaredCount > shipLinks.length) throw new IllegalStateException()

    // first cell -> second cell of every technical row
    val tech = page.select(techPath).asScala
      .map(row => (row.child(0).text(), row.child(1).text()))
      .toMap

    val record = List(
      shipLinks.toList,
      className,
      declaredCount,
      shipLinks.length,
      tech("Type"),
      tech("Displacement"),
      tech("Length"),
      tech("Complement"),
      tech("Armament"),
      tech.getOrElse("Max speed",""),
      tech.getOrElse("Engines",""),
      tech.getOrElse("Power",""),
      tech("Notes on class"),
      returnedUrl.toString
    )
    rows = rows :+ record
  }

  override def shouldContinue(): Boolean = false
}

开发者ID:zakski，项目名称:project-morrigan，代码行数:51，代码来源:UboatNetClassFilter.scala

示例18: UsNavyMultiPageFilter

//设置package包名称以及导入依赖的类
package com.szadowsz.morrigan.ships.wiki.us

import com.szadowsz.common.net.Uri
import com.szadowsz.maeve.core.instruction.MaeveInstruction
import com.szadowsz.maeve.core.instruction.extractor.JsoupExtractor
import org.jsoup.nodes.Document

import scala.collection.JavaConverters._


/**
 * Extractor for ship pages that may be disambiguation lists. For a page without exactly one
 * infobox, every list item holding a link whose title contains "USS" or "USNS" becomes one
 * entry; otherwise the page itself becomes the single entry.
 */
class UsNavyMultiPageFilter extends JsoupExtractor {
  private val REDIRECT_PATH: String = "body div#content div#contentSub span.mw-redirectedfrom"
  private val INFOBOX_PATH: String = "body div#content table.infobox"
  private val SHIP_INSTANCE_PATH: String = "body div.mw-content-ltr > ul > li , body div.mw-content-ltr > dl > dd"

  /** One list per extracted page: (link, redirected, hasInfobox, exists, description). */
  var ships : List[List[(Option[String], Boolean, Boolean, Boolean, String)]] = List()
  private val prefixes = List("USS","USNS")

  // True when the anchor's title attribute contains one of the ship prefixes. The key is
  // written in lower case: the HTML parser stores attribute names lower-cased and newer
  // jsoup versions look keys up case-sensitively, so "Title" would never match there.
  private def isShipLink(anchor: org.jsoup.nodes.Element): Boolean = {
    val title = anchor.attr("title")
    prefixes.exists(prefix => title.contains(prefix))
  }

  override def extract(queryUrl: Uri, returnedUrl: Uri, inst: MaeveInstruction[_], page: Document): Unit = {
    val isRedirected = page.select(REDIRECT_PATH).asScala.nonEmpty
    val hasInfo = page.select(INFOBOX_PATH).asScala.length == 1
    val instances = if (!hasInfo) page.select(SHIP_INSTANCE_PATH).asScala.toList else List()

    val curr = if (instances.isEmpty) {
      List((Some(queryUrl.path), isRedirected, hasInfo, true, ""))
    } else {
      // one pass: items without a ship link are dropped, the others use their first ship link
      instances.flatMap { ins =>
        ins.select("a").asScala.find(isShipLink).map { anchor =>
          // Failure recorded by the original author for an absolute href:
          // Exception in thread "main" java.net.URISyntaxException: Relative path in absolute URI: http://en.wikipedia.orghttps://donate.wikimedia.org/wiki/Special:FundraiserRedirector%3Futm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
          val nurl: Option[String] = Some(anchor.attr("href"))
          val desc = ins.text()
          // a link carrying a "redlink" query key is recorded as not existing
          val exists = nurl.exists(!Uri(_).containsQueryKey("redlink"))
          (nurl, true, true, exists, desc)
        }
      }
    }
    ships = ships :+ curr
  }

  override def shouldContinue(): Boolean = false
}

开发者ID:zakski，项目名称:project-morrigan，代码行数:40，代码来源:UsNavyMultiPageFilter.scala

示例19: RoyalNavyMultiPageFilter

//设置package包名称以及导入依赖的类
package com.szadowsz.morrigan.ships.wiki.uk.scrape

import com.szadowsz.common.net.Uri
import com.szadowsz.maeve.core.instruction.MaeveInstruction
import com.szadowsz.maeve.core.instruction.extractor.JsoupExtractor
import org.jsoup.nodes.Document

import scala.collection.JavaConverters._


/**
 * Extractor for ship pages that may be disambiguation lists. For a page without exactly one
 * infobox, every list item holding a link whose title contains "HMS" becomes one entry;
 * otherwise the page itself becomes the single entry.
 */
class RoyalNavyMultiPageFilter extends JsoupExtractor {
  private val REDIRECT_PATH: String = "body div#content div#contentSub span.mw-redirectedfrom"
  private val INFOBOX_PATH: String = "body div#content table.infobox"
  private val SHIP_INSTANCE_PATH: String = "body div.mw-content-ltr > ul > li , body div.mw-content-ltr > dl > dd"

  /** One list per extracted page: (link, redirected, hasInfobox, exists, description). */
  var ships : List[List[(Option[String], Boolean, Boolean, Boolean, String)]] = List()

  // True when the anchor's title attribute contains "HMS". The key is written in lower
  // case: the HTML parser stores attribute names lower-cased and newer jsoup versions look
  // keys up case-sensitively, so "Title" would never match there.
  private def isShipLink(anchor: org.jsoup.nodes.Element): Boolean =
    anchor.attr("title").contains("HMS")

  override def extract(queryUrl: Uri, returnedUrl: Uri, inst: MaeveInstruction[_], page: Document): Unit = {
    val isRedirected = page.select(REDIRECT_PATH).asScala.nonEmpty
    val hasInfo = page.select(INFOBOX_PATH).asScala.length == 1
    val instances = if (!hasInfo) page.select(SHIP_INSTANCE_PATH).asScala.toList else List()

    val curr = if (instances.isEmpty) {
      List((Some(queryUrl.path), isRedirected, hasInfo, true, ""))
    } else {
      // one pass: items without a ship link are dropped, the others use their first ship link
      instances.flatMap { ins =>
        ins.select("a").asScala.find(isShipLink).map { anchor =>
          val nurl: Option[String] = Some(anchor.attr("href"))
          val desc = ins.text()
          // a link carrying a "redlink" query key is recorded as not existing
          val exists = nurl.exists(!Uri(_).containsQueryKey("redlink"))
          (nurl, true, true, exists, desc)
        }
      }
    }
    ships = ships :+ curr
  }

  override def shouldContinue(): Boolean = false
}

开发者ID:zakski，项目名称:project-morrigan，代码行数:38，代码来源:RoyalNavyMultiPageFilter.scala

示例20: HtmlLifter

//设置package包名称以及导入依赖的类
package com.twitter.diffy.lifter

import org.jsoup.Jsoup
import org.jsoup.nodes.{Document, Element}
import org.jsoup.select.Elements

import scala.collection.JavaConversions._

/** Converts jsoup nodes into nested [[FieldMap]]s. */
object HtmlLifter {

  /**
   * Lifts a node into a field map.
   *
   * A `Document` becomes `head` / `body` entries. Any other element becomes `tag`, `text`
   * (own text only), `attributes` (key -> value) and `children` (each child lifted the
   * same way).
   */
  def lift(node: Element): FieldMap[Any] = node match {
    case document: Document =>
      FieldMap(
        Map(
          "head" -> lift(document.head),
          "body" -> lift(document.body)
        )
      )

    case element: Element =>
      val children: Elements = element.children
      val attributePairs = element.attributes.asList.map { attribute =>
        attribute.getKey -> attribute.getValue
      }
      val attributes = FieldMap[String](attributePairs.toMap)

      FieldMap(
        Map(
          "tag"         -> element.tagName,
          "text"        -> element.ownText,
          "attributes"  -> attributes,
          "children"    -> children.map(child => lift(child))
        )
      )
  }

  /** Parses an HTML string with jsoup. */
  def decode(html: String): Document = Jsoup.parse(html)
}

开发者ID:sachinmanchanda，项目名称:diffy_unicast，代码行数:40，代码来源:HtmlLifter.scala

注：本文中的org.jsoup.nodes.Document类示例整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。