• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

Scala Document类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了 Scala 中 org.jsoup.nodes.Document 类的典型用法代码示例。如果您正在为以下问题困扰：Scala Document 类的具体用法是什么？Scala Document 怎么用？有哪些 Scala Document 的使用示例？那么这里精选的代码示例或许可以为您提供帮助。



下文一共展示了 Document 类的 20 个代码示例，这些示例默认按受欢迎程度排序。您可以为喜欢或者觉得有用的代码点赞，您的评价将有助于我们的系统推荐出更好的 Scala 代码示例。

示例1: NoticeServiceObjects

//设置package包名称以及导入依赖的类
package com.zhranklin.homepage.notice

import org.json4s._
import org.jsoup.Jsoup
import org.jsoup.nodes.Document

/** Notice-service definitions: a tuple-configured base trait, a JSON-backed law-school
  * service, and the list of configured services.
  *
  * NOTE(review): the runs of `?` inside the string literals below look like non-ASCII text
  * that was lost to an encoding problem. They are kept byte-for-byte here and should be
  * restored from the upstream source.
  */
object NoticeServiceObjects {

  /** Service whose four settings are supplied together as one tuple:
    * (content extractor, date extractor, URL pattern, index URL template).
    */
  trait ServiceBase extends IndexService with FunNoticeFetcher with SelectorUrlService {
    val initVal: (Document => String, Document => String, String, String)
    // Presumably lazy so that `initVal` (defined by the concrete subclass) is initialised
    // before it is destructured.
    lazy val (getContent, getDateStr, urlPattern, template) = initVal
  }

  /** Notice service that reads its notice list from a JSON endpoint instead of an HTML index.
    *
    * @param title  suffix of the service name passed to `NoticeService`
    * @param listId value substituted for `nextcid` in the detail URL
    */
  class LawService(title: String, listId: String) extends NoticeService(s"??? - $title") with UrlService with IndexService with FunNoticeFetcher {
    val getContent = contentF("div.text")
    val getDateStr = dateF("span:contains(????)")
    val template = "http://law.scu.edu.cn/xjax?arg=8573&arg=<index>&arg=20&arg=list&clazz=PortalArticleAction&method=list"

    /** Builds the detail-page URL for the article with the given id. */
    def getUrl(id: String) = s"http://law.scu.edu.cn/detail.jsp?portalId=725&cid=8385&nextcid=$listId&aid=$id"

    /** Downloads `url`, parses the body as JSON and maps every element of its `data` array
      * to a [[NoticeEntry]] built from the element's `id` and `subject` fields.
      */
    override def noticeUrlsFromUrl(url: String): Iterable[NoticeEntry] = {
      val jsonStr = Jsoup.connect(url).execute().body()
      val json = jackson.parseJson(jsonStr)
      // The cast fails with a ClassCastException when `data` is missing or is not an array.
      json.\("data").asInstanceOf[JArray].arr.map(
        jo => NoticeEntry(getUrl(jo.\("id").values.toString), Some(jo.\("subject").values.toString)))
    }
  }

  /** All configured services: generic ones built from (name -> index URL template) pairs,
    * followed by the hand-configured ones.
    */
  val serviceList = List(
    "???? - ???? - test" ->
      "http://www.sculj.cn/Special_News.asp?SpecialID=40&SpecialName=%D1%A7%D4%BA%B6%AF%CC%AC&page=<index>",
    "???? - ???? - test" -> "http://sesu.scu.edu.cn/news/list_1_<index>.html",
    "???? - ????" -> "http://sesu.scu.edu.cn/gonggao/list_2_<index>.html",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xsky/xskb/H951901index_<index>.htm",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xytz/H9502index_<index>.htm",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xyxw/H9501index_<index>.htm",
    "????? - ??? - test" -> "http://cs.scu.edu.cn/cs/fwzy/ftl/H951204index_<index>.htm",
    "???? - test" -> "http://news.scu.edu.cn/news2012/cdzx/I0201index_<index>.htm",
    "???? - ????" -> "http://math.scu.edu.cn/news.asp?PAGE=<index>",
    "?????? - ????" -> "http://seei.scu.edu.cn/student,p<index>,index.jsp",
    "????? - ????" -> "http://flc2.scu.edu.cn/foreign/a/xueyuangonggao/list_27_<index>.html"
  ).map { tp =>
    new NoticeService(tp._1) with UniversalUrlService with UniversalNoticeFetcher with IndexService {
      val template = tp._2
    }
  } ++ List(
    new NoticeService("??? - ??") with ServiceBase {
      val initVal = (selectorF("input[name=news.content]")(_.first.attr("value")), dateF("table[width=900] td:contains(????)"),
        "newsShow.*", "http://jwc.scu.edu.cn/jwc/moreNotice.action?url=moreNotice.action&type=2&keyWord=&pager.pageNow=<index>")},
    new LawService("????", "8572"),
    new LawService("????", "8573")
  )
}
开发者ID:zhranklin,项目名称:Private_Blog,代码行数:54,代码来源:NoticeServiceObjects.scala


示例2: IsapReader

//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma.readers

import org.slf4j.LoggerFactory
import org.springframework.batch.item.ItemReader

import org.jsoup.Jsoup

import org.jsoup.nodes.Document
import com.gargoylesoftware.htmlunit.WebClient
import pl.mojepanstwo.sap.toakoma._

/** URL constants used to address ISAP detail pages. */
object IsapReader {
  // Base address every ISAP URL is built from.
  val BASE_URL = "http://isap.sejm.gov.pl"
  // Detail-servlet prefix, ending in `id=` so an identifier can be appended directly.
  val URL      = s"$BASE_URL/DetailsServlet?id="
}

/** Single-shot `ItemReader` that downloads the ISAP details page for `id`.
  *
  * The first call to `read` fetches and returns the page; every later call returns `null`.
  */
class IsapReader(val id: String) extends ItemReader[Document] {

  val logger = LoggerFactory.getLogger(getClass)

  // Set on the first read so that the page is fetched only once.
  var last = false

  /** Fetches the details page once.
    *
    * @return the fetched document on the first call, `null` on every later call
    * @throws NoSuchDocumentException when the page body contains the "no such act" message
    */
  def read : Document = {
    logger.trace("read")

    if (last) null
    else {
      last = true
      val page = Jsoup.connect(IsapReader.URL + id).get
      val missing = page.body.text.contains("Brak aktu prawnego o podanym adresie publikacyjnym !")
      if (missing) throw new NoSuchDocumentException
      page
    }
  }
}
开发者ID:PrawoPolskie,项目名称:toakoma,代码行数:36,代码来源:IsapReader.scala


示例3: get

//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma.services

import java.net.URL
import java.io.File
import org.apache.commons.io.FileUtils
import org.jsoup.nodes.Document
import com.gargoylesoftware.htmlunit._
import org.jsoup.Jsoup

/** Abstraction over fetching HTML documents and files addressed by URL. */
trait Scraper {
  /** Returns the jsoup document obtained for `url`. */
  def get(url: String) : Document
  /** Stores the file addressed by `fileUrl` at `filePath` and returns a path as a string
    * (the implementation in this file returns the absolute path of the written file).
    * The misspelt name is part of the interface and is kept as is.
    */
  def dowloadFile(fileUrl:String, filePath:String) : String
}

/** [[Scraper]] backed by an HtmlUnit `WebClient` for pages and commons-io for file downloads. */
class DefaultScraperService extends Scraper {

  val webClient = new WebClient

  /** Loads `url` through the web client and parses the response text with jsoup. */
  def get(url: String) : Document = {
    // On a refresh request, load the refresh target with the same client.
    val followRefresh = new RefreshHandler {
      override def handleRefresh(page: Page, url: URL, i: Int): Unit = webClient.getPage(url)
    }
    webClient.setRefreshHandler(followRefresh)

    val fetched: Page = webClient.getPage(url)
    val html = fetched.getWebResponse.getContentAsString
    Jsoup.parse(html)
  }

  /** Copies the content at `fileUrl` into the file at `filePath`.
    *
    * @return the absolute path of the written file
    */
  def dowloadFile(fileUrl:String, filePath:String) : String = {
    val source = new URL(fileUrl)
    val target = new File(filePath)
    FileUtils.copyURLToFile(source, target)
    target.getAbsolutePath
  }

}
开发者ID:PrawoPolskie,项目名称:toakoma,代码行数:35,代码来源:Scraper.scala


示例4: ResourceScraperService

//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma

import pl.mojepanstwo.sap.toakoma.services.Scraper
import org.jsoup.nodes.Document
import org.jsoup.Jsoup
import scala.io.Source
import java.io.File
import java.nio.file.Files
import org.apache.commons.io.IOUtils
import java.io.FileOutputStream

/** [[Scraper]] that answers from classpath resources under `isap/` instead of the network.
  *
  * Both methods expect a URL containing `id=<id>&type=<digits>`; a URL that does not match
  * fails with a `MatchError`.
  */
class ResourceScraperService extends Scraper {

  // Captures the `id` and the numeric `type` query parameters of a request URL.
  private val resourcePattern = ".*id=(.*)&type=([0-9]+).*".r

  /** Parses the resource `isap/<id>/<type>.html` selected by `url`. */
  def get(url: String) : Document = {
    val resourcePattern(id, docType) = url
    val html = Source.fromResource("isap/" + id + "/" + docType + ".html")
    // Close the source even when reading or parsing fails.
    try Jsoup.parse(html.mkString)
    finally html.close()
  }

  /** Copies the resource `/isap/<id>/<type>.pdf` selected by `fileUrl` to `filePath`.
    *
    * @return the absolute path of the written file
    */
  def dowloadFile(fileUrl:String, filePath:String) : String = {
    val resourcePattern(id, docType) = fileUrl
    val src = getClass.getResourceAsStream("/isap/" + id + "/" + docType + ".pdf")
    val dest = new File(filePath)
    try {
      val out = new FileOutputStream(dest)
      // Release the output stream even when the copy throws.
      try IOUtils.copy(src, out)
      finally out.close()
    } finally {
      // `getResourceAsStream` returns null for a missing resource.
      if (src != null) src.close()
    }
    dest.getAbsolutePath
  }

}
开发者ID:PrawoPolskie,项目名称:toakoma,代码行数:33,代码来源:ResourceScraperService.scala


示例5: GgleLoginTest

//设置package包名称以及导入依赖的类
package com.szadowsz.tarbh.ggle

import com.szadowsz.common.net.Uri
import com.szadowsz.maeve.core.MaeveDriver
import com.szadowsz.maeve.core.browser.MaeveConf
import com.szadowsz.maeve.core.instruction.MaeveInstruction
import com.szadowsz.maeve.core.instruction.extractor.JsoupExtractor
import com.szadowsz.maeve.core.instruction.target.single.SingleTarget
import com.szadowsz.maeve.gglegrp.actions.GgleExecutor
import org.jsoup.nodes.Document


/** Manual entry point: builds a `MaeveDriver`, feeds it a single instruction that uses a
  * `GgleExecutor`, and runs it. The link, credentials and group name are blank placeholders.
  */
object GgleLoginTest {
  private val link = ""
  private val username: String = ""
  private val passwd: String = ""
  private val urlOfGrp = Uri(link)
  private val groupName: String = ""

  /** Driver configuration: JavaScript on, an empty HTTP proxy, script errors not thrown. */
  private def buildConfig(): MaeveConf =
    MaeveConf().setJavaScriptEnabled(true).setHTTPProxy("", 0, Nil).setThrowExceptionOnScriptError(false)

  /** Configures the Chrome driver location, then feeds and runs one instruction. */
  def main(args: Array[String]): Unit = {
    System.setProperty("webdriver.chrome.driver", ".\\chromedriver_win32\\chromedriver.exe")
    val driver = new MaeveDriver(buildConfig())

    val target = SingleTarget(urlOfGrp)

    // Extractor that does nothing with a page and always answers `false` to `shouldContinue`.
    class TestExtractor extends JsoupExtractor {
      override def shouldContinue(): Boolean = false
      override def extract(queryUrl: Uri, returnedUrl: Uri, inst: MaeveInstruction[_], page: Document): Unit = ()
    }
    val extractor = new TestExtractor()

    val executor = new GgleExecutor(username, passwd)
    val instruction =
      MaeveInstruction(groupName, target, executor, extractor, "./data/grp/", false, false, false, MaeveConf().setNoProxy())

    driver.feedInstruction(instruction)
    driver.scrapeUsingCurrInstruction()
  }
}
开发者ID:zakski,项目名称:project-disco,代码行数:48,代码来源:GgleLoginTest.scala


示例6: compile

//设置package包名称以及导入依赖的类
package indi.lewis.spider.html

import java.util

import com.google.gson._
import org.jsoup.Jsoup
import org.jsoup.nodes.Document


/** Node of an extraction model that can be compiled against a document into gson JSON. */
private[html] trait ElementType {
  /** Name of this element; mutable and `null` until assigned. */
  var elementName: String = _

  /** Builds the JSON for this element from an already parsed document. */
  def compile(doc: Document): JsonElement

  /** Parses `doc` as HTML with jsoup, then delegates to the `Document` overload. */
  def compile(doc: String): JsonElement = {
    val parsed = Jsoup.parse(doc)
    compile(parsed)
  }
}

/** Object node: compiles to a JSON object with one member per registered property.
  * The element name defaults to "root".
  */
private[html] case class ModelParent() extends ElementType {
  elementName = "root"

  /** Creates a parent with the given element name instead of "root". */
  def this(elementName: String) = {
    this()
    this.elementName = elementName
  }

  /** Child elements, compiled in insertion order. */
  val properties: java.util.ArrayList[ElementType] = new util.ArrayList[ElementType]()

  /** Compiles every property against `doc` and stores it under the property's element name. */
  override def compile(doc: Document): JsonElement = {
    val json = new JsonObject
    val children = properties.iterator()
    while (children.hasNext) {
      val child = children.next()
      json.add(child.elementName, child.compile(doc))
    }
    json
  }
}

/** Element whose concrete model is chosen per document by `f`. */
private[html] case class ModelElement(elName: String, f: Document => ElementType) extends ElementType {
  elementName = elName

  /** Resolves the model for `doc` through `f` and compiles it against the same document. */
  override def compile(doc: Document): JsonElement = {
    val resolved = f(doc)
    resolved.compile(doc)
  }
}

/** Array node: compiles to a JSON array with one entry per registered element. */
private[html] case class ModelArray(elName: String) extends ElementType {
  /** Elements of the array, compiled in insertion order. */
  val array: java.util.ArrayList[ElementType] = new util.ArrayList[ElementType]()
  elementName = elName

  /** Compiles every element against `doc` and appends the results in order. */
  override def compile(doc: Document): JsonElement = {
    val json = new JsonArray
    val items = array.iterator()
    while (items.hasNext) {
      json.add(items.next().compile(doc))
    }
    json
  }
}

/** Leaf node holding a fixed value that does not depend on the document. */
private[html] case class ModelConstant(elName: String, value: Object) extends ElementType {
  elementName = elName

  /** Returns the value's `toString` as a JSON string, or JSON null when the value is `null`. */
  override def compile(doc: Document): JsonElement = value match {
    case null    => JsonNull.INSTANCE
    case present => new JsonPrimitive(present.toString)
  }
}
开发者ID:TokisakiFun,项目名称:Katipo,代码行数:56,代码来源:ElementType.scala


示例7: LaporBot

//设置package包名称以及导入依赖的类
package io.github.asepsaep.laporcrawler.bot

import scala.collection.JavaConverters._

import org.jsoup.nodes.Document
import org.jsoup.Jsoup

import io.github.asepsaep.laporcrawler.model.Ticket

/** Fetches and parses the ticket page `http://36.66.86.72/pengaduan/<ticketId>`. */
case class LaporBot(ticketId: Int) {

  private val url = "http://36.66.86.72/pengaduan/" + ticketId

  //  System.setProperty("socksProxyHost", "127.0.0.1")
  //  System.setProperty("socksProxyPort", "10001")
  //  System.setProperty("socksProxyVersion", "5")

  /** Downloads the ticket page (30 s timeout).
    *
    * @return the parsed ticket, or `None` when the page contains a `no-data` element
    */
  def crawl(): Option[Ticket] = {
    val doc = Jsoup.connect(url).timeout(30000).get()
    if (doc.getElementsByClass("no-data").isEmpty) Option(parse(doc)) else None
  }

  /** Builds a [[Ticket]] from an already downloaded ticket page.
    *
    * The ticket is assembled from a fresh value on every call, so nothing carries over
    * from a previously parsed page.
    */
  def parse(doc: Document): Ticket = {
    val title = doc.getElementById("row_Subject").text
    val rawContent = doc.getElementById("row_content").text
    // Everything after the first ", " is the content; without a separator keep the whole text.
    val splitContent = rawContent.split(", ")
    val content = if (splitContent.length > 1) splitContent.tail.mkString(", ") else rawContent

    val base = new Ticket().copy(id = ticketId, title = title, content = content)

    val details = doc.getElementsByClass("feedback-details").first.getElementsByTag("p").asScala
    val withDetails = details.foldLeft(base) { (ticket, p) =>
      val span = p.getElementsByTag("span")
      // `Elements.first` returns null for an empty selection, so skip paragraphs without spans.
      if (span.isEmpty) ticket
      else span.first.text match {
        case "USER:"     => ticket.copy(user = Some(span.last.text))
        case "PLATFORM:" => ticket.copy(platform = Some(span.last.text))
        case "TANGGAL:"  => ticket.copy(date = Some(span.last.text))
        case "KATEGORI:" => ticket.copy(category = Some(span.last.text))
        case "AREA:"     => ticket.copy(area = Some(span.last.text))
        case "STATUS:"   => ticket.copy(status = Some(span.last.text))
        case _           => ticket
      }
    }

    val dispatchedTo = doc.select(".administrator .comment-content").first.getElementsByTag("p").first.getElementsByTag("span").first.getElementsByTag("b").first.text
    withDetails.copy(dispatchedTo = Some(dispatchedTo))
  }

}
开发者ID:asepsaep,项目名称:lapor-crawler,代码行数:54,代码来源:LaporBot.scala


示例8: checkElementAndConvert

//设置package包名称以及导入依赖的类
package haishu.crawler.selector

import org.jsoup.nodes.{Document, Element}


  /** Returns `element` unchanged when it already is a document; otherwise returns a new
    * document (same base URI as the element's owner document) holding a clone of it.
    */
  private def checkElementAndConvert(element: Element): Element = element match {
    case document: Document => document
    case other =>
      // `appendChild` returns the element it was called on, i.e. the new wrapper document.
      new Document(other.ownerDocument().baseUri()).appendChild(other.clone())
  }

  /** Selects with a CSS selector built from `selector`. */
  override def css(selector: String): Selectable =
    selectElements(Selectors.css(selector))

  /** Selects with a CSS selector built from `selector` and `attrName`. */
  override def css(selector: String, attrName: String): Selectable =
    selectElements(Selectors.css(selector, attrName))
} 
开发者ID:hualongdata,项目名称:hl-crawler,代码行数:24,代码来源:HtmlNode.scala


示例9: StyleguideSpider

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt

import org.jsoup.nodes.{ Element, Document }
import scala.concurrent.Future
import com.themillhousegroup.scoup.{ ScoupImplicits, Scoup }
import scala.concurrent.ExecutionContext.Implicits.global
import java.net.URL


/** Collects the documents reachable from a start URL by following site-local links. */
object StyleguideSpider extends ScoupImplicits {

  /** Loads `url` and, unless `thisPageOnly` is set, the pages it links to locally. */
  def visit(url: URL, thisPageOnly: Boolean = false): Future[Set[Document]] =
    visitLink(url, Set.empty, thisPageOnly)

  /** Parses one page, then either stops there or continues with its local links. */
  private def visitLink(url: URL, alreadyVisited: Set[URL], thisPageOnly: Boolean): Future[Set[Document]] =
    Scoup.parse(url.toString).flatMap { doc =>
      if (!thisPageOnly) visitLinks(url, doc, alreadyVisited)
      else Future.successful(Set(doc))
    }

  /** Visits, one after another, every local link of `doc` that is not in `alreadyVisited`,
    * accumulating the resulting documents together with `doc` itself.
    *
    * NOTE(review): `alreadyVisited` only grows along a single link chain; it does not
    * include `url` itself or sibling links, so a page may be fetched more than once.
    */
  private def visitLinks(url: URL, doc: Document, alreadyVisited: Set[URL]) = {
    val hrefs = doc.select("a").filter(isLocal).map(_.attr("href"))
    val unvisited = hrefs.map(createFullLocalUrl(url)).filterNot(alreadyVisited.contains)
    val seed = Future.successful(Set(doc))
    unvisited.foldLeft(seed) { (collected, link) =>
      collected.flatMap { existingDocs =>
        visitLink(link, alreadyVisited + link, false).map(newDocs => existingDocs ++ newDocs)
      }
    }
  }

  /** A link is local when its `href` starts with "/". */
  private def isLocal(link: Element): Boolean =
    link.attr("href").startsWith("/")

  /** Resolves `link` against `base`. */
  def createFullLocalUrl(base: URL)(link: String): URL =
    new java.net.URL(base, link)
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:47,代码来源:StyleguideSpider.scala


示例10: StylesheetFinder

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt

import com.themillhousegroup.scoup.{ Scoup, ScoupImplicits }
import org.jsoup.nodes.Document

import scala.concurrent.Future
import scala.concurrent.ExecutionContext.Implicits.global


/** Finds the stylesheet URLs referenced from a document's head. */
object StylesheetFinder extends ScoupImplicits {

  /** Returns the `href` of every `<link rel="stylesheet">` in the head of `doc`. */
  def allStylesheetUrls(doc: Document): Seq[String] = {
    doc.head.select("link").filter { elem =>
      elem.attr("rel") == "stylesheet"
    }.map { elem =>
      elem.attr("href")
    }.toSeq
  }

  /** Returns only the stylesheet URLs that point at the document's own site. */
  def localStylesheetUrls(doc: Document): Seq[String] =
    allStylesheetUrls(doc).filter(isLocalUrl)

  /** A URL is local when it is neither protocol-relative (`//host/...`) nor carries an
    * explicit `http:` / `https:` scheme. Single-slash and relative paths are both local.
    */
  private def isLocalUrl(url: String): Boolean = {
    val protocolRelative = url.startsWith("//")
    val absolute = url.startsWith("http:") || url.startsWith("https:")
    !protocolRelative && !absolute
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:29,代码来源:StylesheetFinder.scala


示例11: checkSelector

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks

import com.themillhousegroup.witchhunt.{ExcessiveSpecificityViolation, RuleEnumerator, Violation, ViolationType}
import org.jsoup.nodes.Document
import com.helger.css.decl.CSSDeclaration

/** Contract for a single style-rule check, plus a helper that assembles a [[Violation]]. */
trait WitchhuntViolationCheck {
  /** Checks one selector; returns a violation when the check fails and `None` otherwise. */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation]

  /** Builds a violation of type `vt` from the implicit check context.
    *
    * @param vt             the kind of violation
    * @param thresholdValue optional configured limit
    * @param violationValue optional offending value
    * @return always a `Some`
    */
  protected def buildViolation[VT <: ViolationType](vt: VT,
                                                    thresholdValue:Option[Int] = None,
                                                    violationValue:Option[Int] = None)(implicit ruleSet: RuleEnumerator,
                                                                                       selector: String,
                                                                                       lineNumber: Int,
                                                                                       applicablePages: Set[Document]):Option[Violation] = {
    val pageLocations = applicablePages.map(_.location)
    Some(
      Violation(ruleSet.sourceName, ruleSet.sourceUrl, lineNumber, selector, pageLocations, vt, thresholdValue, violationValue)
    )
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:30,代码来源:WitchhuntViolationCheck.scala


示例12: ExcessiveSpecificityCheck

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks

import com.themillhousegroup.scoup.ScoupImplicits
import com.themillhousegroup.witchhunt._
import org.jsoup.nodes.Document
import com.helger.css.decl.CSSDeclaration

/** Flags selectors whose specificity exceeds `options.specificityLimit`. */
class ExcessiveSpecificityCheck(options: WitchhuntOptions) extends WitchhuntViolationCheck with ScoupImplicits {

  /** Returns a violation when the selector's specificity is above the configured limit. */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation] = {
    val specificity = Specificity.calculateSingle(selector)
    val withinLimit = specificity.asInt <= options.specificityLimit

    if (withinLimit) None
    else buildViolation(ExcessiveSpecificityViolation, Some(options.specificityLimit), Some(specificity.asInt))
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:26,代码来源:ExcessiveSpecificityCheck.scala


示例13: ExcessiveColorsCheck

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks

import com.themillhousegroup.scoup.ScoupImplicits
import com.themillhousegroup.witchhunt._
import org.jsoup.nodes.Document
import com.helger.css.decl.{ CSSExpression, CSSDeclaration }

/** Flags rule sets once the number of distinct `color` expressions seen exceeds
  * `options.colorLimit`. Expressions accumulate across calls on the same instance.
  */
class ExcessiveColorsCheck(options: WitchhuntOptions) extends WitchhuntViolationCheck with ScoupImplicits {

  val CSS_COLOR_PROP = "color"

  /** Every distinct color expression seen so far by this instance. */
  val knownColors = scala.collection.mutable.Set[CSSExpression]()

  /** Records the selector's color expressions, then returns a violation when the running
    * total of distinct colors is above the configured limit.
    */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation] = {
    val colorExpressions = declarationsWithin.collect {
      case declaration if CSS_COLOR_PROP == declaration.getProperty => declaration.getExpression
    }
    knownColors ++= colorExpressions

    val colorCount = knownColors.size
    if (colorCount <= options.colorLimit) None
    else buildViolation(ExcessiveColorsViolation, Some(options.colorLimit), Some(colorCount))
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:27,代码来源:ExcessiveColorsCheck.scala


示例14: UnusedSelectorCheck

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks

import com.themillhousegroup.scoup.ScoupImplicits
import com.themillhousegroup.witchhunt.{ RuleEnumerator, UnusedSelectorViolation, Violation, ViolationType }
import org.jsoup.nodes.Document
import scala._
import scala.Some
import com.themillhousegroup.witchhunt.Violation
import com.helger.css.decl.CSSDeclaration

/** Flags selectors that match no element in any of the supplied pages. */
object UnusedSelectorCheck extends WitchhuntViolationCheck with ScoupImplicits {

  /** Returns a violation only when no page contains an element matching `selector`. */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation] = {
    // `exists` stops at the first page that contains a match.
    val usedSomewhere = applicablePages.exists { stylePage =>
      stylePage.select(selector).nonEmpty
    }
    if (usedSomewhere) None else buildViolation(UnusedSelectorViolation)
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:23,代码来源:UnusedSelectorCheck.scala


示例15: Article

//设置package包名称以及导入依赖的类
package gander

import gander.images.Image
import gander.opengraph.OpenGraphData
import org.joda.time.DateTime
import org.jsoup.nodes.{Document, Element}


/** Immutable record holding the data gathered for one article.
  *
  * It carries the title, text, meta fields, links, two jsoup documents (`doc` and
  * `rawDoc`), the raw HTML string, and optional values (`cleanedArticleText`, `topNode`,
  * `topImage`, `publishDate`) that are `None` when absent.
  *
  * NOTE(review): the precise meaning of each field (for example how `doc` differs from
  * `rawDoc`) is not established in this file; confirm it against the code that builds it.
  */
final case class Article(title: String,
                         cleanedArticleText: Option[String],
                         metaDescription: String,
                         metaKeywords: String,
                         canonicalLink: String,
                         domain: String,
                         topNode: Option[Element],
                         topImage: Option[Image],
                         tags: Set[String],
                         movies: List[Element],
                         finalUrl: String,
                         linkHash: String,
                         rawHtml: String,
                         doc: Document,
                         rawDoc: Document,
                         publishDate: Option[DateTime],
                         additionalData: Map[String, String],
                         openGraphData: OpenGraphData) 
开发者ID:lloydmeta,项目名称:gander,代码行数:27,代码来源:Article.scala


示例16: WebsiteUpdateActor

//设置package包名称以及导入依赖的类
package services.schedulers

import javax.inject.Inject

import akka.actor.{Actor, Props}
import net.ruippeixotog.scalascraper.browser.JsoupBrowser.JsoupDocument
import org.jsoup.nodes.Document
import play.api.libs.concurrent.Execution.Implicits.defaultContext
import play.api.libs.ws.WSClient
import services.WebsiteParser.HTMLParser
import services.schedulers.WebsiteUpdateActor.update


/** Actor that, for each `update` message, downloads the page at `url` and passes the
  * parsed HTML to the injected parser together with `parseFormat`.
  */
class WebsiteUpdateActor @Inject()(WSClient: WSClient, HTMLParser: HTMLParser) extends Actor {
  override def receive: Receive = {
    case update(url: String, parseFormat: String) => {
      // NOTE(review): the resulting future is not used or sent anywhere yet; the original
      // `yield` body was left unfinished (`elements.`), so the elements are yielded as is.
      for {
        doc <- WSClient.url(url).get()
        // Parse the response body as HTML. `Document.createShell` takes a base URI, so
        // passing the body to it produced an empty document.
        elements <- HTMLParser.tryParse(JsoupDocument(org.jsoup.Jsoup.parse(doc.body)), parseFormat)
      } yield elements
    }
  }
}
/** Companion holding the `Props` factory and the message handled by the actor. */
object WebsiteUpdateActor {
  // NOTE(review): the actor class takes constructor arguments while these Props pass none;
  // confirm how instances are actually created.
  def props: Props = Props[WebsiteUpdateActor]

  /** Request to fetch `url` and parse it according to `parseFormat`. */
  case class update(url: String, parseFormat: String)
}
开发者ID:orkunkl,项目名称:AnnouncementAppServer,代码行数:31,代码来源:WebsiteUpdateActor.scala


示例17: UboatNetClassFilter

//设置package包名称以及导入依赖的类
package com.szadowsz.morrigan.ships.uboat.filter

import com.szadowsz.common.net.Uri
import com.szadowsz.maeve.core.instruction.MaeveInstruction
import com.szadowsz.maeve.core.instruction.extractor.JsoupExtractor
import org.jsoup.nodes.Document

import scala.collection.JavaConverters._


/** Extractor that turns one class page into a row of values appended to `rows`. */
class UboatNetClassFilter extends JsoupExtractor {
  private val classPath: String = "body div[id=content] h1[class=warship_header]"
  private val shipPath: String = classPath + " + p"
  private val techPath : String = "body div[id=content] > div:contains(Technical information) > table[align=center][class=table_subtle] > tbody > tr"
  private val shipNamesPath: String = "body div[id=content] h3 + table"

  // First run of digits in the paragraph that follows the class header.
  private val shipRegex = "[0-9]+".r

  /** One entry per extracted page, in extraction order. */
  var rows : List[List[Any]] = List()

  /** Extracts the class header, ship count, ship links and technical table from `page`
    * and appends them as one row.
    *
    * Fails with a `NoSuchElementException` / `IndexOutOfBoundsException` when an expected
    * element or a mandatory technical-table key is missing.
    *
    * @throws IllegalStateException when the page states more ships than it links to
    */
  override def extract(queryUrl : Uri, returnedUrl : Uri, inst : MaeveInstruction[_], page: Document): Unit = {
    val classHeader = page.select(classPath).asScala.head.text()
    // -1 when the paragraph contains no number.
    val shipCount =  shipRegex.findFirstIn(page.select(shipPath).asScala.head.text()).map(c => c.toInt).getOrElse(-1)

    // (href, link text) of every ship link in the second table matched by `shipNamesPath`.
    val shipNames = page.select(shipNamesPath).get(1).select("td a[href*=/ship/]").asScala.map(e => (e.attr("href"),e.text()))

    if (shipCount > shipNames.length) {
      throw new IllegalStateException(
        s"Page states $shipCount ships but only ${shipNames.length} ship links were found: $returnedUrl")
    }

    // First cell -> second cell of every technical-information table row.
    val techInfo = page.select(techPath).asScala.map(tr => (tr.child(0).text(), tr.child(1).text())).toMap
    val classInfo = List(
      shipNames.toList,
      classHeader,
      shipCount,
      shipNames.length,
      techInfo("Type"),
      techInfo("Displacement"),
      techInfo("Length"),
      techInfo("Complement"),
      techInfo("Armament"),
      techInfo.getOrElse("Max speed",""),
      techInfo.getOrElse("Engines",""),
      techInfo.getOrElse("Power",""),
      techInfo("Notes on class"),
      returnedUrl.toString
    )
    rows = rows :+ classInfo
  }

  override def shouldContinue(): Boolean = false
}
开发者ID:zakski,项目名称:project-morrigan,代码行数:51,代码来源:UboatNetClassFilter.scala


示例18: UsNavyMultiPageFilter

//设置package包名称以及导入依赖的类
package com.szadowsz.morrigan.ships.wiki.us

import com.szadowsz.common.net.Uri
import com.szadowsz.maeve.core.instruction.MaeveInstruction
import com.szadowsz.maeve.core.instruction.extractor.JsoupExtractor
import org.jsoup.nodes.Document

import scala.collection.JavaConverters._


/** Extractor that records, per page, either the page itself or the ship links listed on it.
  *
  * Each entry is (url, redirected, has infobox, exists, description).
  */
class UsNavyMultiPageFilter extends JsoupExtractor {
  private val REDIRECT_PATH: String = "body div#content div#contentSub span.mw-redirectedfrom"
  private val INFOBOX_PATH: String = "body div#content table.infobox"
  private val SHIP_INSTANCE_PATH: String = "body div.mw-content-ltr > ul > li , body div.mw-content-ltr > dl > dd"

  /** One list of entries per extracted page, in extraction order. */
  var ships : List[List[(Option[String], Boolean, Boolean, Boolean, String)]] = List()
  private val prefixes = List("USS","USNS")

  // True when the anchor's "Title" attribute contains one of the ship prefixes.
  private def isShipLink(anchor: org.jsoup.nodes.Element): Boolean =
    prefixes.exists(prefix => anchor.attr("Title").contains(prefix))

  /** Appends the entries derived from `page` to `ships`.
    *
    * A page with exactly one infobox, or with no list items, yields a single entry for the
    * page itself. Otherwise every list item that has a ship link yields one entry built
    * from the first such link.
    */
  override def extract(queryUrl: Uri, returnedUrl: Uri, inst: MaeveInstruction[_], page: Document): Unit = {
    val isRedirected = page.select(REDIRECT_PATH).asScala.nonEmpty
    val hasInfo = page.select(INFOBOX_PATH).asScala.length == 1
    val instances = if (hasInfo) List() else page.select(SHIP_INSTANCE_PATH).asScala.toList

    val curr = instances match {
      case Nil =>
        List((Some(queryUrl.path), isRedirected, hasInfo, true, ""))
      case listed =>
        listed.flatMap { ins =>
          // Exception in thread "main" java.net.URISyntaxException: Relative path in absolute URI: http://en.wikipedia.orghttps://donate.wikimedia.org/wiki/Special:FundraiserRedirector%3Futm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
          ins.select("a").asScala.find(isShipLink).map { anchor =>
            val nurl: Option[String] = Some(anchor.attr("href"))
            val desc = ins.text()
            // A link carrying a `redlink` query key is recorded as not existing.
            val exists = nurl.exists(!Uri(_).containsQueryKey("redlink"))
            (nurl, true, true, exists, desc)
          }.toList
        }
    }
    ships = ships :+ curr
  }

  override def shouldContinue(): Boolean = false
}
开发者ID:zakski,项目名称:project-morrigan,代码行数:40,代码来源:UsNavyMultiPageFilter.scala


示例19: RoyalNavyMultiPageFilter

//设置package包名称以及导入依赖的类
package com.szadowsz.morrigan.ships.wiki.uk.scrape

import com.szadowsz.common.net.Uri
import com.szadowsz.maeve.core.instruction.MaeveInstruction
import com.szadowsz.maeve.core.instruction.extractor.JsoupExtractor
import org.jsoup.nodes.Document

import scala.collection.JavaConverters._


/** Extractor that records, per page, either the page itself or the "HMS" links listed on it.
  *
  * Each entry is (url, redirected, has infobox, exists, description).
  */
class RoyalNavyMultiPageFilter extends JsoupExtractor {
  private val REDIRECT_PATH: String = "body div#content div#contentSub span.mw-redirectedfrom"
  private val INFOBOX_PATH: String = "body div#content table.infobox"
  private val SHIP_INSTANCE_PATH: String = "body div.mw-content-ltr > ul > li , body div.mw-content-ltr > dl > dd"

  /** One list of entries per extracted page, in extraction order. */
  var ships : List[List[(Option[String], Boolean, Boolean, Boolean, String)]] = List()

  // True when the anchor's "Title" attribute contains "HMS".
  private def isShipLink(anchor: org.jsoup.nodes.Element): Boolean =
    anchor.attr("Title").contains("HMS")

  /** Appends the entries derived from `page` to `ships`.
    *
    * A page with exactly one infobox, or with no list items, yields a single entry for the
    * page itself. Otherwise every list item that has an "HMS" link yields one entry built
    * from the first such link.
    */
  override def extract(queryUrl: Uri, returnedUrl: Uri, inst: MaeveInstruction[_], page: Document): Unit = {
    val isRedirected = page.select(REDIRECT_PATH).asScala.nonEmpty
    val hasInfo = page.select(INFOBOX_PATH).asScala.length == 1
    val instances = if (hasInfo) List() else page.select(SHIP_INSTANCE_PATH).asScala.toList

    val curr = instances match {
      case Nil =>
        List((Some(queryUrl.path), isRedirected, hasInfo, true, ""))
      case listed =>
        listed.flatMap { ins =>
          ins.select("a").asScala.find(isShipLink).map { anchor =>
            val nurl: Option[String] = Some(anchor.attr("href"))
            val desc = ins.text()
            // A link carrying a `redlink` query key is recorded as not existing.
            val exists = nurl.exists(!Uri(_).containsQueryKey("redlink"))
            (nurl, true, true, exists, desc)
          }.toList
        }
    }
    ships = ships :+ curr
  }

  override def shouldContinue(): Boolean = false
}
开发者ID:zakski,项目名称:project-morrigan,代码行数:38,代码来源:RoyalNavyMultiPageFilter.scala


示例20: HtmlLifter

//设置package包名称以及导入依赖的类
package com.twitter.diffy.lifter

import org.jsoup.Jsoup
import org.jsoup.nodes.{Document, Element}
import org.jsoup.select.Elements

import scala.collection.JavaConversions._

/** Converts jsoup HTML trees into nested [[FieldMap]] structures. */
object HtmlLifter {
  // Explicit `.asScala` decorators, so the conversions below do not depend on the
  // deprecated implicit `JavaConversions`.
  import scala.collection.JavaConverters._

  /** Lifts an element into a field map.
    *
    * A document becomes `head` / `body` entries. Any other element becomes `tag`, `text`
    * (own text only), `attributes` and recursively lifted `children`.
    */
  def lift(node: Element): FieldMap[Any] = node match {
    case doc: Document =>
      FieldMap(
        Map(
          "head" -> lift(doc.head),
          "body" -> lift(doc.body)
        )
      )
    case element: Element =>
      val children: Elements = element.children
      val attributes =
        FieldMap[String](
          element.attributes.asList.asScala.map { attribute =>
            attribute.getKey -> attribute.getValue
          }.toMap
        )

      FieldMap(
        Map(
          "tag"         -> element.tagName,
          "text"        -> element.ownText,
          "attributes"  -> attributes,
          "children"    -> children.asScala.map(child => lift(child))
        )
      )
  }

  /** Parses an HTML string into a jsoup document. */
  def decode(html: String): Document = Jsoup.parse(html)
}
开发者ID:sachinmanchanda,项目名称:diffy_unicast,代码行数:40,代码来源:HtmlLifter.scala



注:本文中的org.jsoup.nodes.Document类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Scala SessionId类代码示例发布时间:2022-05-23
下一篇:
Scala Http类代码示例发布时间:2022-05-23
热门推荐
热门话题
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap