本文整理汇总了Scala中org.jsoup.Jsoup类的典型用法代码示例。如果您正苦于以下问题:Scala Jsoup类的具体用法?Scala Jsoup怎么用?Scala Jsoup使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Jsoup类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Scala代码示例。
示例1: HtmlConcatCompiler
//设置package包名称以及导入依赖的类
package com.karasiq.scalajsbundler.compilers
import com.karasiq.scalajsbundler.ScalaJSBundler.PageTypedContent
import org.jsoup.Jsoup
import org.jsoup.nodes.Element
import scala.collection.JavaConversions._
/** Asset compiler that merges several HTML documents into a single page. */
object HtmlConcatCompiler extends AssetCompiler {

  /** Adds in-place merging of another element's attributes and inner HTML. */
  private implicit class ElementOps(val e: Element) extends AnyVal {
    /**
      * Merges `src` into this element: `class` values are joined with a space,
      * `style` values with a semicolon, every other attribute of `src`
      * overwrites the current value, and the inner HTML of `src` is appended.
      */
    def concatWith(src: Element): Unit = {
      // Joins two attribute values. No delimiter is inserted when the current
      // value is empty (attribute absent) or already ends with the delimiter.
      @inline
      def delimit(delimiter: String, s1: String, s2: String): String = {
        if (s1.isEmpty || s1.endsWith(delimiter)) s1 + s2
        else s1 + delimiter + s2
      }

      src.attributes().foreach {
        case a if a.getKey == "class" =>
          e.attr(a.getKey, delimit(" ", e.attr(a.getKey), a.getValue))
        case a if a.getKey == "style" =>
          e.attr(a.getKey, delimit(";", e.attr(a.getKey), a.getValue))
        case a => // Replaces attribute value
          e.attr(a.getKey, a.getValue)
      }
      e.append(src.html())
    }
  }

  /**
    * Parses every string as an HTML document and folds the head and body of
    * each following document into the first one.
    *
    * @param htmlList documents to merge; `head` is called on it, so an empty
    *                 sequence throws
    * @return outer HTML of the merged document
    */
  def concat(htmlList: Seq[String]): String = {
    val result = Jsoup.parse(htmlList.head)
    htmlList.tail.foreach { h =>
      val html = Jsoup.parse(h)
      result.head().concatWith(html.head())
      result.body().concatWith(html.body())
    }
    result.outerHtml()
  }

  /** Concatenates the string form of every asset in `contents`. */
  override def compile(contents: Seq[PageTypedContent]): String = {
    concat(contents.map(_.asset.asString))
  }
}
开发者ID:Karasiq,项目名称:sbt-scalajs-bundler,代码行数:46,代码来源:HtmlConcatCompiler.scala
示例2: ScheduleDownloadActor
//设置package包名称以及导入依赖的类
package logic.actors.schedule
import java.nio.charset.StandardCharsets
import javax.inject._
import akka.actor.{Actor, ActorRef}
import helpers.SpiritHelper
import logic.actors.schedule.ScheduleDownloadActor.DownloadSchedule
import logic.actors.schedule.ScheduleParseActor._
import org.fhs.spirit.scheduleparser.enumerations.EScheduleKind
import org.jsoup.Jsoup
import play.api.libs.ws.WSClient
import scala.collection.JavaConversions._
import scala.concurrent.Await
import scala.concurrent.duration._
/**
  * Actor that, on `DownloadSchedule`, downloads the regular and the block
  * schedule pages and sends them to the parse actor as one `ParseSchedule`.
  */
@Singleton
class ScheduleDownloadActor @Inject()(ws: WSClient, @Named("parseActor") parseActor: ActorRef) extends Actor with SpiritHelper {

  override def receive: Receive = {
    case DownloadSchedule =>
      val baseUrl = configuration.underlying.getString("schedule.baseUrl")

      // Blocks for up to ten seconds on the request. A 404 yields None, any
      // other status yields the body decoded as ISO-8859-1.
      def fetch(page: String): Option[String] = {
        val response = Await.result(ws.url(baseUrl + page).get(), 10.seconds)
        if (response.status == 404) None
        else Some(response.bodyAsBytes.decodeString(StandardCharsets.ISO_8859_1.toString))
      }

      // One page per course, named s_<course>.html.
      val lectureResults = uncachedCourseNames
        .map(courseName => fetch("s_" + courseName + ".html").map(body => (body, courseName)))
        .collect { case Some((body, courseName)) =>
          (EScheduleKind.REGULAR, (Jsoup.parse(body).toString, courseName))
        }

      // Block schedules are discovered through the links on bindex.html.
      val blockBaseResult = Await.result(ws.url(baseUrl + "bindex.html").get(), 10.seconds)
      val bindex = Jsoup.parse(blockBaseResult.bodyAsBytes.decodeString(StandardCharsets.ISO_8859_1.toString))
      val blockRefs = bindex.select("a").map(_.attr("href")).toSet

      val blockResult = blockRefs
        .map(block => fetch(block).map(body => (body, block)))
        .collect { case Some((body, block)) =>
          (EScheduleKind.BLOCK, (Jsoup.parse(body).toString, block))
        }

      parseActor ! ParseSchedule(lectureResults ++ blockResult)
  }
}
开发者ID:P1tt187,项目名称:spirit-play,代码行数:56,代码来源:ScheduleDownloadActor.scala
示例3: MainPageResponseParser
//设置package包名称以及导入依赖的类
package bridgeapp.crawler.parsers
import java.net.URL
import akka.actor.{Props, ActorSystem, Actor, ActorRef}
import bridgeapp.crawler.Config
import bridgeapp.crawler.execution.{Response, ResponseParser}
import bridgeapp.crawler.storage.{DiskForumsStorage, ForumsStorage}
import com.typesafe.scalalogging.LazyLogging
import org.jsoup.Jsoup
import scala.collection.JavaConverters._
/** `ResponseParser` that hands every response over to the given parser actor. */
class MainPageResponseParser(parser: ActorRef) extends ResponseParser {
  // Fire-and-forget: the response is sent to the actor as a plain message.
  override def ->(response: Response): Unit = {
    parser ! response
  }
}
object MainPageResponseParser {
  /**
    * Starts a `MainPageParser` actor in the implicit actor system and wraps
    * its reference in a `MainPageResponseParser`.
    */
  def apply()(implicit actorSystem: ActorSystem): MainPageResponseParser = {
    // The storage is created inside the Props creator, as in every actor start.
    val parserProps = Props(new MainPageParser(ForumsStorage()))
    new MainPageResponseParser(actorSystem.actorOf(parserProps))
  }
}
/**
  * Actor that extracts forum ids from the `viewforum.php` links of a page and
  * writes them to the forums storage.
  */
class MainPageParser(forumsListStorage: ForumsStorage) extends Actor with LazyLogging {

  /**
    * Reads the numeric `f` query parameter of `href`.
    *
    * Returns 0 when the URL is malformed, has no query, has no `f` parameter,
    * or the parameter is not an integer. Query parts without a value are
    * ignored instead of raising an ArrayIndexOutOfBoundsException.
    */
  private def forumIdOf(href: String): Int = {
    val query = scala.util.Try(new URL(href).getQuery).toOption.flatMap(Option(_)).getOrElse("")
    val params = query.split("&").iterator
      .map(_.split("="))
      .collect { case Array(key, value, _*) => key -> value }
      .toMap
    params.get("f").flatMap(raw => scala.util.Try(raw.toInt).toOption).getOrElse(0)
  }

  override def receive: Receive = {
    case response: Response =>
      val charset = response.charset.getOrElse("utf-8")
      val body = new String(response.body, charset)
      // The response URI is the base used to resolve "abs:href" below.
      val document = Jsoup.parse(body, response.uri.toString)
      val forumLink = document.select("[href^=viewforum.php]").asScala.toArray
      logger.trace(s" Total url: ${forumLink.length}")
      val forumsIds: Array[Int] = forumLink.map(link => forumIdOf(link.attr("abs:href")))
      logger.trace(s" Extracted forums ids: ${forumsIds.length}")
      forumsListStorage.write(forumsIds, Config.forumsStorageURI)(context.dispatcher)
  }
}
开发者ID:bridge-app,项目名称:crawler,代码行数:54,代码来源:MainPageParser.scala
示例4: MALImage
//设置package包名称以及导入依赖的类
package me.abarrow.ScalaSubNet.mal
import java.io.File
import java.io.FileOutputStream
import java.net.URL
import org.jsoup.Jsoup
import org.jsoup.parser.Parser
import java.nio.channels.Channels
object MALImage {
  /**
    * Downloads the main image of an anime page and stores it in `imagePath`.
    *
    * @param animeID   id appended to `MALURLs.MAL_ANIME_PAGE_PREFIX`
    * @param imagePath file the image bytes are written to
    * @return `false` when the page has no `img.ac` element, `true` once the
    *         image has been written
    */
  def saveMainImage(animeID:Int, imagePath:File):Boolean = {
    val doc = Jsoup.parse(new URL(MALURLs.MAL_ANIME_PAGE_PREFIX + animeID.toString()), 60000)
    // first() returns null when nothing matches the selector.
    Option(doc.select("img.ac").first()) match {
      case None => false
      case Some(mainImage) =>
        val rbc = Channels.newChannel(new URL(mainImage.attr("src")).openStream())
        // Nested try/finally: the channel is closed even when the output
        // file cannot be opened or closing the file stream fails.
        try {
          val fos = new FileOutputStream(imagePath)
          try {
            fos.getChannel().transferFrom(rbc, 0, Long.MaxValue)
          } finally {
            fos.close()
          }
        } finally {
          rbc.close()
        }
        true
    }
  }
}
开发者ID:Abarrowman,项目名称:ScalaSubNet,代码行数:29,代码来源:MALImage.scala
示例5: MALList
//设置package包名称以及导入依赖的类
package me.abarrow.ScalaSubNet.mal
import org.jsoup.Jsoup
import org.jsoup.parser.Parser
import collection.JavaConverters._
/** Holder for the anime entries of one list. */
class MALList(val entries: Array[MALEntry])
object MALList {
  private val MAL_LIST_SUFFIX = "&status=all&type=anime"
  private val xmlParser = Parser.xmlParser()

  /**
    * Fetches the list of `userId` as XML and converts every `anime` element
    * into a `MALEntry` (id, title, score, status).
    */
  def getListByUser(userId:String):MALList = {
    val requestUrl = MALURLs.MAL_LIST_PREFIX + userId + MAL_LIST_SUFFIX
    val listXML = Jsoup.connect(requestUrl).parser(xmlParser).get()
    val entries = for (anime <- listXML.getElementsByTag("anime").asScala) yield {
      // Inner HTML of the first descendant with the given tag name.
      def field(tag: String): String = anime.getElementsByTag(tag).first().html()
      new MALEntry(
        field("series_animedb_id").toInt,
        field("series_title"),
        field("my_score").toInt,
        field("my_status").toInt)
    }
    new MALList(entries.toArray)
  }
}
开发者ID:Abarrowman,项目名称:ScalaSubNet,代码行数:27,代码来源:MALList.scala
示例6: NoticeServiceObjects
//设置package包名称以及导入依赖的类
package com.zhranklin.homepage.notice
import org.json4s._
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
/**
  * Registry of the notice services.
  *
  * NOTE(review): the runs of `?` inside the service names and selectors below
  * are non-ASCII text that was lost to an encoding error before this copy was
  * made; they are kept byte-for-byte and should be restored from the upstream
  * project.
  */
object NoticeServiceObjects {
  /**
    * Service whose content extractor, date extractor, URL pattern and index
    * template are supplied together as one tuple.
    */
  trait ServiceBase extends IndexService with FunNoticeFetcher with SelectorUrlService {
    val initVal: ((Document) => String, (Document) => String, String, String)
    // Lazy, so the tuple is only read on first use of one of the four members.
    lazy val (getContent, getDateStr, urlPattern, template) = initVal
  }

  /** Notice service for law.scu.edu.cn, whose index pages are JSON documents. */
  class LawService(title: String, listId: String) extends NoticeService(s"??? - $title") with UrlService with IndexService with FunNoticeFetcher {
    val getContent = contentF("div.text")
    val getDateStr = dateF("span:contains(????)")
    val template = "http://law.scu.edu.cn/xjax?arg=8573&arg=<index>&arg=20&arg=list&clazz=PortalArticleAction&method=list"

    def getUrl(id: String) = s"http://law.scu.edu.cn/detail.jsp?portalId=725&cid=8385&nextcid=$listId&aid=$id"

    // Every element of the "data" array becomes an entry built from its "id" and "subject".
    override def noticeUrlsFromUrl(url: String): Iterable[NoticeEntry] = {
      val jsonStr = Jsoup.connect(url).execute().body()
      val json = jackson.parseJson(jsonStr)
      json.\("data").asInstanceOf[JArray].arr.map(
        jo => NoticeEntry(getUrl(jo.\("id").values.toString), Some(jo.\("subject").values.toString)))
    }
  }

  // Services that only need a name and an index template, followed by the
  // services that need custom extraction.
  val serviceList = List(
    "???? - ???? - test" ->
      "http://www.sculj.cn/Special_News.asp?SpecialID=40&SpecialName=%D1%A7%D4%BA%B6%AF%CC%AC&page=<index>",
    "???? - ???? - test" -> "http://sesu.scu.edu.cn/news/list_1_<index>.html",
    "???? - ????" -> "http://sesu.scu.edu.cn/gonggao/list_2_<index>.html",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xsky/xskb/H951901index_<index>.htm",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xytz/H9502index_<index>.htm",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xyxw/H9501index_<index>.htm",
    "????? - ??? - test" -> "http://cs.scu.edu.cn/cs/fwzy/ftl/H951204index_<index>.htm",
    "???? - test" -> "http://news.scu.edu.cn/news2012/cdzx/I0201index_<index>.htm",
    "???? - ????" -> "http://math.scu.edu.cn/news.asp?PAGE=<index>",
    "?????? - ????" -> "http://seei.scu.edu.cn/student,p<index>,index.jsp",
    "????? - ????" -> "http://flc2.scu.edu.cn/foreign/a/xueyuangonggao/list_27_<index>.html"
  ).map { tp =>
    new NoticeService(tp._1) with UniversalUrlService with UniversalNoticeFetcher with IndexService {
      val template = tp._2
    }
  } ++ List(
    new NoticeService("??? - ??") with ServiceBase {
      val initVal =(selectorF("input[name=news.content]")(_.first.attr("value")), dateF("table[width=900] td:contains(????)"),
        "newsShow.*", "http://jwc.scu.edu.cn/jwc/moreNotice.action?url=moreNotice.action&type=2&keyWord=&pager.pageNow=<index>")},
    new LawService("????", "8572"),
    new LawService("????", "8573")
  )
}
开发者ID:zhranklin,项目名称:Private_Blog,代码行数:54,代码来源:NoticeServiceObjects.scala
示例7: IsapReader
//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma.readers
import org.slf4j.LoggerFactory
import org.springframework.batch.item.ItemReader
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import com.gargoylesoftware.htmlunit.WebClient
import pl.mojepanstwo.sap.toakoma._
/** Addresses of the ISAP service. */
object IsapReader {
  val BASE_URL = "http://isap.sejm.gov.pl"
  // Prefix of the details page; the document id is appended by the reader.
  val URL = s"$BASE_URL/DetailsServlet?id="
}
/** Item reader that yields the ISAP details page of `id` exactly once. */
class IsapReader(val id: String) extends ItemReader[Document] {
  val logger = LoggerFactory.getLogger(this.getClass())
  // Set on the first read; every later read returns null.
  var last = false

  /**
    * Fetches the details page on the first call and returns null afterwards.
    * Throws `NoSuchDocumentException` when the page body contains the
    * "no such act" message.
    */
  def read : Document = {
    logger.trace("read")
    if (last) {
      null
    } else {
      last = true
      val document = Jsoup.connect(IsapReader.URL + id).get
      val missing = document.body.text.contains("Brak aktu prawnego o podanym adresie publikacyjnym !")
      if (missing) throw new NoSuchDocumentException
      document
    }
  }
}
开发者ID:PrawoPolskie,项目名称:toakoma,代码行数:36,代码来源:IsapReader.scala
示例8: get
//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma.services
import java.net.URL
import java.io.File
import org.apache.commons.io.FileUtils
import org.jsoup.nodes.Document
import com.gargoylesoftware.htmlunit._
import org.jsoup.Jsoup
/** Source of documents and files addressed by URL. */
trait Scraper {
  /** Returns the document found at `url`. */
  def get(url: String): Document

  /** Stores the file found at `fileUrl` under `filePath` and returns a path to it. */
  def dowloadFile(fileUrl: String, filePath: String): String
}
/** `Scraper` backed by an HtmlUnit `WebClient` and Commons IO. */
class DefaultScraperService extends Scraper {
  val webClient = new WebClient

  /** Loads `url` with the web client and parses the response text with Jsoup. */
  def get(url: String) : Document = {
    // Refreshes are handled by loading the refresh target with the same client.
    val followRefresh = new RefreshHandler {
      override def handleRefresh(page: Page, url: URL, i: Int): Unit = webClient.getPage(url)
    }
    webClient.setRefreshHandler(followRefresh)
    val fetched: Page = webClient.getPage(url)
    val markup = fetched.getWebResponse.getContentAsString
    Jsoup.parse(markup)
  }

  /** Copies the content of `fileUrl` to `filePath` and returns its absolute path. */
  def dowloadFile(fileUrl:String, filePath:String) : String = {
    val source = new URL(fileUrl)
    val target = new File(filePath)
    FileUtils.copyURLToFile(source, target)
    target.getAbsolutePath()
  }
}
开发者ID:PrawoPolskie,项目名称:toakoma,代码行数:35,代码来源:Scraper.scala
示例9: ResourceScraperService
//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma
import pl.mojepanstwo.sap.toakoma.services.Scraper
import org.jsoup.nodes.Document
import org.jsoup.Jsoup
import scala.io.Source
import java.io.File
import java.nio.file.Files
import org.apache.commons.io.IOUtils
import java.io.FileOutputStream
/**
  * `Scraper` that serves documents and files from classpath resources under
  * `isap/<id>/<type>` instead of the network.
  */
class ResourceScraperService extends Scraper {
  // Captures the `id` and the numeric `type` query parameters of a URL.
  private val resourceUrl = ".*id=(.*)&type=([0-9]+).*".r

  /**
    * Parses the resource `isap/<id>/<type>.html` named by the URL's `id` and
    * `type` parameters. A URL without both parameters raises a MatchError.
    */
  def get(url: String) : Document = {
    val resourceUrl(id, docType) = url
    val source = Source.fromResource("isap/" + id + "/" + docType + ".html")
    // The source is closed once its content has been read.
    try Jsoup.parse(source.mkString)
    finally source.close()
  }

  /**
    * Copies the resource `/isap/<id>/<type>.pdf` to `filePath` and returns
    * the absolute path of the written file.
    *
    * Throws `java.io.FileNotFoundException` when the resource does not exist;
    * in that case no output file is created.
    */
  def dowloadFile(fileUrl:String, filePath:String) : String = {
    val resourceUrl(id, docType) = fileUrl
    val resourcePath = "/isap/" + id + "/" + docType + ".pdf"
    // getResourceAsStream returns null for a missing resource.
    val src = Option(getClass.getResourceAsStream(resourcePath)).getOrElse(
      throw new java.io.FileNotFoundException(s"Classpath resource not found: $resourcePath"))
    try {
      val dest = new File(filePath)
      val out = new FileOutputStream(dest)
      try {
        IOUtils.copy(src, out)
      } finally {
        out.close()
      }
      dest.getAbsolutePath
    } finally {
      src.close()
    }
  }
}
开发者ID:PrawoPolskie,项目名称:toakoma,代码行数:33,代码来源:ResourceScraperService.scala
示例10: first
//设置package包名称以及导入依赖的类
package com.zhranklin.notice.service
import java.util.Date
import org.jsoup.Jsoup
import scala.collection.JavaConverters._
import scala.util._
/**
  * Generates index page URLs by substituting successive index values for the
  * `<index>` placeholder of `template`.
  */
trait IndexService {
  /** URL template containing the literal placeholder `<index>`. */
  val template: String

  /** URL of the first index page. */
  def first: String = rawIndices.head

  /** Value substituted into the first index URL. */
  protected def firstIndex: Int = 1

  /** Endless stream of consecutive integers starting at `i`. */
  protected def valueStream(i: Int): Stream[Int] = i #:: valueStream(i + 1)

  /** Values substituted into the template, in order. */
  protected def indexNums: Iterable[Any] = valueStream(firstIndex)

  // Literal replacement: `replaceAll` would treat `$` and `\` in the value as
  // group references / escapes and fail or corrupt the URL.
  protected def interpolate(value: Any): String = template.replace("<index>", value.toString)

  /** One URL per index value. */
  def rawIndices: Iterable[String] = indexNums map interpolate

  /** `first` followed by every raw index URL after the first one. */
  def indexUrls: Iterable[String] = Stream(first) ++ rawIndices.drop(1)
}
/** A fetched notice: where it came from, its title, its HTML and its date. */
case class Notice(url: String, title: String, html: String, date: Date) {
  // Parses `html`, removes each listed attribute from every element that
  // carries it (in the given order) and returns the resulting markup.
  private def without(attributes: String*): String = {
    val doc = Jsoup.parse(html)
    for {
      attribute <- attributes
      element <- doc.select(s"*[$attribute]").asScala
    } element.removeAttr(attribute)
    doc.toString
  }

  /** The HTML with every `width` and `height` attribute removed. */
  def widthlessHtml = without("width", "height")

  /** The HTML with every `width`, `height` and `style` attribute removed. */
  def stylelessHtml = without("width", "height", "style")

  /** The `src` value of every `img` element that has one. */
  def imgs = for (img <- Jsoup.parse(html).select("img[src]").asScala) yield img.attr("src")
}
/** URL of a notice plus an optional title; the title defaults to `None`. */
case class NoticeEntry(url: String, title: Option[String] = None)
/**
  * Base of all notice services: walks the index URLs, collects notice entries
  * and fetches the notices, keeping every failure as a value.
  */
abstract class NoticeService(val source: String) extends UrlService with IndexService with NoticeFetcher {
  /**
    * Entries of all index pages in order. An index page that fails to load
    * contributes a single `Failure` instead of its entries.
    */
  def getUrls: Iterable[Try[NoticeEntry]] = indexUrls.map(i => Try(noticeUrlsFromUrl(i))).flatMap {
    case Success(urls) => urls map Success.apply
    case Failure(t) => Iterable(Failure(t))
  }

  /** The fetched notice of every entry; fetch errors are kept as `Failure`. */
  def notices: Iterable[Try[Notice]] = getUrls.map(_.flatMap(u => Try(fetch(u))))

  /**
    * Fetches `limit` notices starting at `offset`.
    *
    * @return the notices fetched successfully and the errors of the others;
    *         one error per exception class is logged
    */
  def noticesWithErr(limit: Int, offset: Int): (List[Notice], List[Throwable]) = {
    val fetched = notices.slice(offset, offset + limit).toList
    // Pattern matching replaces the unchecked asInstanceOf casts.
    val successes = fetched.collect { case Success(notice) => notice }
    val failures = fetched.collect { case Failure(error) => error }
    failures.groupBy(_.getClass.getSimpleName).map(_._2.head).foreach(t => log.i(s"error when fetching news", t))
    (successes, failures)
  }
}
开发者ID:zhranklin,项目名称:notice_crawler,代码行数:53,代码来源:NoticeService.scala
示例11: SearchControllerTest
//设置package包名称以及导入依赖的类
package controllers
import model.{Runway, Airport, Country, SearchResult}
import org.jsoup.Jsoup
import org.scalatest.concurrent.ScalaFutures
import org.scalatest.mock.MockitoSugar
import org.scalatest.{Matchers, FunSpec}
import play.api.test.FakeRequest
import services.SearchService
import org.mockito.Mockito._
import scala.concurrent.Future
import scala.concurrent.ExecutionContext.Implicits.global
/** Spec for the country search action of `SearchController`. */
class SearchControllerTest extends FunSpec with Matchers with MockitoSugar with ScalaFutures{

  describe("Search Controller"){
    it("should generate search results page for given search term"){
      new Setup {
        when(mockSearchService.searchCountriesByNameOrCountryCode("aus"))
          .thenReturn(Future(expectedSearchResult))

        val result = searchController.searchByCountry("aus")(FakeRequest()).futureValue

        result.header.status should be(200)
        expectedFirstRow should be("Australia AUS Melbourne Airport small CONCRETE 1")
      }
    }
  }

  /** Controller wired to a mocked search service, plus the expected data. */
  trait Setup{
    val mockSearchService = mock[SearchService]
    val searchController = new SearchController(mockSearchService)
    val expectedSearchResult: Vector[SearchResult] = Vector(
      SearchResult(Country("Australia","AUS"),Airport("Melbourne Airport","small"),Runway("CONCRETE",1)))
    // Text of the cells in the first body row of the rendered results view.
    val expectedFirstRow = {
      val rendered = views.html.search_results(expectedSearchResult.toList).body
      Jsoup.parse(rendered).select("table > tbody > tr:nth-child(1) td").text()
    }
  }
}
开发者ID:atiqsayyed,项目名称:airport,代码行数:39,代码来源:SearchControllerTest.scala
示例12: ReportsControllerTest
//设置package包名称以及导入依赖的类
package controllers
import model._
import org.jsoup.Jsoup
import org.scalatest.concurrent.ScalaFutures
import org.scalatest.mock.MockitoSugar
import org.scalatest.{FunSpec, Matchers}
import play.api.test.FakeRequest
import services.ReportService
import org.mockito.Mockito._
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.Future
/** Spec for the "countries with most airports" report of `ReportsController`. */
class ReportsControllerTest extends FunSpec with Matchers with MockitoSugar with ScalaFutures{

  describe("Reports Controller"){
    it("should display country name and count of airports in country"){
      new Setup {
        val result = reportController.getCountriesWithHighestNoOfAirports(FakeRequest()).futureValue
        result.header.status should be(200)

        // Text of the cells in the first body row of the rendered report view.
        val rendered = views.html.report("Some Title",expectedSearchResult.toList).body
        val expectedFirstRow = Jsoup.parse(rendered).select("table > tbody > tr:nth-child(1) td").text()
        expectedFirstRow should be("Australia 100")
      }
    }
  }

  /** Controller wired to a report service mock that returns one country report. */
  trait Setup{
    val mockReportService = mock[ReportService]
    val reportController = new ReportsController(mockReportService)
    val expectedSearchResult: Vector[CountryReport] = Vector(CountryReport(Country("Australia","AUS"),100))
    when(mockReportService.findCountriesWithHighestNoOfAirports)
      .thenReturn(Future(expectedSearchResult))
  }
}
开发者ID:atiqsayyed,项目名称:airport,代码行数:38,代码来源:ReportsControllerTest.scala
示例13: Crawler
//设置package包名称以及导入依赖的类
package pl.krix.scalacrawl
import java.net.URI
import org.jsoup.Jsoup
import scala.collection.JavaConversions._
/** Minimal recursive same-domain crawler that prints every link it follows. */
object Crawler {
  /**
    * Host of `URL` without a leading "www.".
    *
    * @return `None` when the URI has no host or is not a valid URI, so one
    *         malformed link on a page no longer aborts the whole crawl
    */
  def getDomain(URL: String): Option[String] = {
    try Option(new URI(URL).getHost).map(_.stripPrefix("www."))
    catch { case _: java.net.URISyntaxException => None }
  }

  /**
    * Fetches `URL`, prints every unvisited link to the same domain and
    * crawls it recursively.
    *
    * NOTE(review): `visited` only grows along the current path, so a page
    * reachable through several paths is fetched once per path.
    *
    * @param URL      page to fetch
    * @param visited  URLs that must not be followed again
    * @param interval milliseconds to sleep before the request
    */
  def crawl(URL: String, visited: Set[String], interval: Int): Unit = {
    Thread.sleep(interval)                        // throttle before each request
    Jsoup.connect(URL)
      .get()
      .select("a[href]")
      .map(_.attr("abs:href"))                    // absolute form of each link
      .filter(_.nonEmpty)
      .filter(getDomain(_) == getDomain(URL))     // same domain only
      .filter(!visited.contains(_))
      .foreach { link: String =>
        println(URL + " --> " + link)
        crawl(link, visited + URL, interval)
      }
  }

  /** Prints the command line usage. */
  def printHelp(): Unit = {
    println("USAGE: sbt \"run [URL] [TIME INTERVAL BETWEEN REQUESTS]\"")
  }

  /**
    * Entry point: `args(0)` is the start URL, `args(1)` the interval between
    * requests in seconds. Prints the usage when an argument is missing or the
    * interval is not an integer.
    */
  def main(args: Array[String]): Unit = {
    val seconds =
      if (args.length < 2) None
      else scala.util.Try(args(1).toInt).toOption
    seconds match {
      case Some(secs) => crawl(args(0), Set[String](args(0)), secs * 1000)
      case None => printHelp()
    }
  }
}
开发者ID:krix38,项目名称:scalaCrawl,代码行数:46,代码来源:Crawler.scala
示例14: LinkExtractor
//设置package包名称以及导入依赖的类
package wipro.crawler.util
import org.jsoup.Jsoup
import scala.collection.JavaConverters._
/** Depth-limited crawler that records every URL it has visited. */
class LinkExtractor {
  // URLs visited so far, most recent first.
  var crawledLinks : List[String] = List.empty[String]

  /**
    * Distinct `href` values of all anchors on the page at `url`.
    * The values are returned as written in the page (raw `href`), and the
    * connection uses `timeout(0)`.
    */
  def getAllPageLinks(url : String) = {
    val anchors = Jsoup.connect(url).timeout(0).get().select("a[href]")
    anchors.iterator().asScala.map(_.attr("href")).toSeq.distinct
  }

  /** Keeps the non-null, non-empty links that contain `baseUrl`. */
  def filterLinks(links : Seq[String],baseUrl : String) = {
    links.filter(link => link != null && link.length > 0 && link.contains(baseUrl))
  }

  /**
    * Records `url` and, when it contains `baseUrl`, follows the links of its
    * page one level deeper. Does nothing for a URL that was already recorded
    * or once `depth` has reached `maxDepth`.
    */
  def crawlDomainLinks(url : String,depth : Int,maxDepth : Int,baseUrl : String) : Unit = {
    val unseen = !crawledLinks.contains(url)
    if (unseen && depth < maxDepth) {
      crawledLinks ::= url
      if (url.contains(baseUrl)) {
        getAllPageLinks(url).foreach(link => crawlDomainLinks(link, depth + 1, maxDepth, baseUrl))
      }
    }
  }
}
开发者ID:adityahalabe,项目名称:webCrawler,代码行数:33,代码来源:LinkExtractor.scala
示例15: CScraper
//设置package包名称以及导入依赖的类
import com.mashape.unirest.http.Unirest
import org.jsoup.Jsoup
import org.jsoup.nodes.Element
import purecsv.safe._
// NOTE(review): this excerpt starts inside a definition whose header is not
// shown; `pagesize` and `year` are defined outside the visible code.
// Requests the overall ranking page and keeps the response body as a string.
val results = Unirest.post("http://nturanking.lis.ntu.edu.tw/DataPage/OverallRanking.aspx")
.queryString("pagesize", pagesize)
.queryString("y", year)
.asString.getBody
val jsoup = Jsoup.parse(results)
// All rows of the grid table except the first one
// (NOTE(review): presumably the header row — confirm).
val jsoupResults = jsoup.body.select("#MainContain_GridView1 > tbody").select("tr").toArray.tail
// Pairs every row with a 1-based position and takes the inner HTML of the
// first child of the row's child elements at index 1, 2 and 3.
val csvResults = (0 until jsoupResults.size)
.map(idx => (idx, jsoupResults(idx))).map(_.asInstanceOf[(Int, Element)])
.map(t =>
(t._1 + 1, t._2.child(1).child(0).html, t._2.child(2).child(0).html, t._2.child(3).child(0).html))
// One comma-separated line per row.
println(csvResults.map(t => s"${t._1},${t._2},${t._3},${t._4}").mkString("\n"))
}
}
开发者ID:sguzman,项目名称:UniversityRankingWebScraper,代码行数:22,代码来源:CScraper.scala
示例16: DataSource
//设置package包名称以及导入依赖的类
package com.debasish.nlp.dataSources
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.functions._
import org.jsoup.Jsoup
/** Loads the labelled training reviews as a cleaned DataFrame. */
object DataSource {
  /**
    * Reads the classpath resource `/labeledTrainData.tsv` as a tab-separated
    * file with header and returns it with cleaned `review` and `id` columns.
    *
    * Throws `java.io.FileNotFoundException` when the resource is missing
    * (previously an uninformative NullPointerException).
    */
  def apply(sc: SparkContext): DataFrame = {
    val sqlContext = new SQLContext(sc)
    val resource = "/labeledTrainData.tsv"
    // getResource returns null for a missing resource.
    val filePath = Option(getClass.getResource(resource))
      .getOrElse(throw new java.io.FileNotFoundException(s"Classpath resource not found: $resource"))
      .toString

    val df = sqlContext.read
      .format("com.databricks.spark.csv")
      .option("delimiter", "\t")
      .option("quote", "^")
      .option("header", "true")
      .option("inferSchema", "true")
      .load(filePath)

    // Visible text of the review markup without double quotes and backslashes.
    // Null cells are passed through instead of failing inside the UDF.
    val cleanReview = udf((cell: String) =>
      Option(cell)
        .map(text => Jsoup.parse(text).body.text.replace("\"", "").replace("\\", ""))
        .orNull)

    // The id without double quotes; null cells are passed through.
    val cleanId = udf((cell: String) => Option(cell).map(_.replace("\"", "")).orNull)

    df.withColumn("review", cleanReview(col("review"))).withColumn("id", cleanId(col("id")))
  }
}
开发者ID:DEK11,项目名称:MoreNLP,代码行数:39,代码来源:DataSource.scala
示例17: compile
//设置package包名称以及导入依赖的类
package indi.lewis.spider.html
import java.util
import com.google.gson._
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
/** A named node of the model that can be compiled to JSON from a document. */
private[html] trait ElementType {
  /** Name under which the compiled value is stored by its parent. */
  var elementName: String = _

  /** Compiles this node against an already parsed document. */
  def compile(doc: Document): JsonElement

  /** Parses `doc` as HTML and compiles this node against the result. */
  def compile(doc: String): JsonElement = {
    val parsed = Jsoup.parse(doc)
    compile(parsed)
  }
}
/** Object node: compiles to a JSON object with one member per property. */
private[html] case class ModelParent() extends ElementType {
  elementName = "root"

  /** Creates a parent named `elementName` instead of the default "root". */
  def this(elementName: String) = {
    this()
    this.elementName = elementName
  }

  val properties: java.util.ArrayList[ElementType] = new util.ArrayList[ElementType]()

  /** Compiles every property and stores the result under the property's name. */
  override def compile(doc: Document): JsonElement = {
    val json = new JsonObject
    (0 until properties.size()).foreach { index =>
      val property = properties.get(index)
      json.add(property.elementName, property.compile(doc))
    }
    json
  }
}
/** Node whose value is produced by the element type that `f` derives from the document. */
private[html] case class ModelElement(val elName: String, val f: (Document) => ElementType) extends ElementType {
  elementName = elName

  /** Applies `f` to the document and compiles the element type it returns. */
  override def compile(doc: Document): JsonElement = {
    val resolved = f(doc)
    resolved.compile(doc)
  }
}
/** Array node: compiles to a JSON array with one item per registered element. */
private[html] case class ModelArray(val elName: String) extends ElementType {
  val array: java.util.ArrayList[ElementType] = new util.ArrayList[ElementType]()
  elementName = elName

  /** Compiles the elements in list order and appends each result. */
  override def compile(doc: Document): JsonElement = {
    val items = new JsonArray
    (0 until array.size()).foreach { index =>
      items.add(array.get(index).compile(doc))
    }
    items
  }
}
/** Constant node: compiles to the string form of `value`, or JSON null. */
private[html] case class ModelConstant(val elName: String, val value: Object) extends ElementType {
  elementName = elName

  /** The document is ignored; a null `value` becomes `JsonNull.INSTANCE`. */
  override def compile(doc: Document): JsonElement = value match {
    case null => JsonNull.INSTANCE
    case present => new JsonPrimitive(present.toString)
  }
}
开发者ID:TokisakiFun,项目名称:Katipo,代码行数:56,代码来源:ElementType.scala
示例18: Convert
//设置package包名称以及导入依赖的类
package com.javaconverter.model
import org.jsoup.Jsoup
import org.jsoup.select.NodeVisitor
import org.jsoup.nodes.Node
import org.jsoup.nodes.TextNode
import org.jsoup.nodes.Comment
import scala.collection.JavaConversions._
import org.jsoup.nodes.Element
import org.jsoup.nodes.DataNode
/** Converts an HTML string into javatags builder source or into escaped HTML. */
class Convert(html: String) {
  // Indentation: one space per nesting level.
  private def pad(width: Int): String = " " * width

  // Opening fragment of a node on its own line, indented by its depth.
  // Elements open a call "name(", the other node kinds open a text("...") call.
  private def render(node: Node, depth: Int): String = {
    val opening = node match {
      case element: Element => element.nodeName + "("
      case text: TextNode => s"""text("${text.text()}""""
      case comment: Comment => s"""text("<!--${comment.getData}-->""""
      case data: DataNode => s"""text("${data.getWholeData}""""
    }
    "\n" + pad(depth) + opening
  }

  /**
    * Renders the first child element of the parsed document as nested
    * javatags calls, one node per line.
    */
  def toJavaTags(): String = {
    val doc = Jsoup.parse(html)
    val result = new StringBuilder
    doc.child(0).traverse(new NodeVisitor() {
      // Entering a node: its opening fragment, then its attributes.
      override def head(node: Node, depth: Int): Unit = {
        result ++= render(node, depth)
        val attributes = node.attributes().asList().map { attr => s""""${attr.getKey} -> ${attr.getValue}"""" }
        if (!attributes.isEmpty && node.isInstanceOf[Element]) {
          result ++= s"""attr(${attributes.mkString(",")})"""
          if (!node.childNodes().isEmpty()) {
            result ++= ","
          }
        }
      }

      // Leaving a node: close the call and separate it from a following sibling.
      override def tail(node: Node, depth: Int): Unit = {
        if (node.childNodes().isEmpty()) {
          result ++= ")"
        } else {
          result ++= "\n" + pad(depth) + ")"
        }
        if (node.nextSibling() != null) {
          result ++= ","
        }
      }
    })
    result.toString
  }

  /**
    * The normalised HTML with `<` and `>` escaped as entities so it can be
    * shown as text. The replacements had lost their `&lt;` / `&gt;` entities,
    * which made this method return the markup unescaped.
    */
  def toHtmlFormat(): String = {
    Jsoup.parse(html).toString()
      .replace("<", "&lt;")
      .replace(">", "&gt;")
  }
}
开发者ID:manlioGit,项目名称:javatagsconverter,代码行数:62,代码来源:Convert.scala
示例19: LaporBot
//设置package包名称以及导入依赖的类
package io.github.asepsaep.laporcrawler.bot
import scala.collection.JavaConverters._
import org.jsoup.nodes.Document
import org.jsoup.Jsoup
import io.github.asepsaep.laporcrawler.model.Ticket
/** Fetches one complaint page by ticket id and turns it into a `Ticket`. */
case class LaporBot(ticketId: Int) {
  private val url = "http://36.66.86.72/pengaduan/" + ticketId

  // System.setProperty("socksProxyHost", "127.0.0.1")
  // System.setProperty("socksProxyPort", "10001")
  // System.setProperty("socksProxyVersion", "5")

  /**
    * Downloads the ticket page (30 s timeout).
    *
    * @return the parsed ticket, or `None` when the page contains an element
    *         with class `no-data`
    */
  def crawl(): Option[Ticket] = {
    val doc = Jsoup.connect(url).timeout(30000).get()
    if (doc.getElementsByClass("no-data").isEmpty) Option(parse(doc)) else None
  }

  /**
    * Builds a ticket from the page. Every call starts from a fresh `Ticket`,
    * so fields of a previously parsed page can no longer leak into the result.
    */
  def parse(doc: Document): Ticket = {
    val title = doc.getElementById("row_Subject").text
    val rawContent = doc.getElementById("row_content").text
    // When the text contains ", ", everything before the first one is dropped.
    val splitContent = rawContent.split(", ")
    val content = if (splitContent.length > 1) splitContent.tail.mkString(", ") else rawContent
    val base = new Ticket().copy(id = ticketId, title = title, content = content)

    // Each detail paragraph holds a label span followed by a value span.
    val details = doc.getElementsByClass("feedback-details").first.getElementsByTag("p").asScala
    val detailed = details.foldLeft(base) { (ticket, p) =>
      val span = p.getElementsByTag("span")
      span.first.text match {
        case "USER:" => ticket.copy(user = Some(span.last.text))
        case "PLATFORM:" => ticket.copy(platform = Some(span.last.text))
        case "TANGGAL:" => ticket.copy(date = Some(span.last.text))
        case "KATEGORI:" => ticket.copy(category = Some(span.last.text))
        case "AREA:" => ticket.copy(area = Some(span.last.text))
        case "STATUS:" => ticket.copy(status = Some(span.last.text))
        case _ => ticket
      }
    }

    val dispatchedTo = doc.select(".administrator .comment-content").first.getElementsByTag("p").first.getElementsByTag("span").first.getElementsByTag("b").first.text
    detailed.copy(dispatchedTo = Some(dispatchedTo))
  }
}
开发者ID:asepsaep,项目名称:lapor-crawler,代码行数:54,代码来源:LaporBot.scala
示例20: Link
//设置package包名称以及导入依赖的类
package utils
import java.net.{MalformedURLException, URL}
import models.{MessageButton, Photo, Message}
import org.jsoup.Jsoup
import org.jsoup.nodes.Element
import org.jsoup.select.Elements
//import org.jsoup.nodes.Element
import scala.collection.JavaConversions._
import scala.util.control.Exception._
/** A hyperlink with its title, target, image source and description. */
sealed case class Link(title: String, href: String, imageSrc:String, desc:String) {
  /** All four fields as "name : value" pairs separated by ", ". */
  override def toString(): String = {
    val fields = Seq(s"title : $title", s"href : $href", s"imageSrc : $imageSrc", s"desc : $desc")
    fields.mkString(", ")
  }

  /** Message showing the title and description, a 300x250 photo and a button to `href`. */
  def toMessage:Message = {
    val photo = Photo(imageSrc,300,250)
    val button = MessageButton("?????",href)
    Message(s"[$title]\n$desc", Option(photo), Option(button))
  }
}
/** Small helpers around Jsoup documents. */
object HtmlParser {
  type JDoc = org.jsoup.nodes.Document

  /** Downloads and parses the page at `url`. */
  def get(url: String): JDoc = Jsoup.connect(url).get()

  /** Text of the `title` elements. */
  def titleText(doc: JDoc): String = doc.select("title").text

  /** Text of the `body` elements. */
  def bodyText(doc: JDoc): String = doc.select("body").text

  /**
    * One `Link` per anchor whose `href` contains `containStr`, built from the
    * anchor text, its `href`, the `src` of an image inside it and the text of
    * descendants whose class contains "desc".
    */
  def linkSequence(doc: JDoc, containStr : String): Seq[Link] = {
    val anchors = doc.select(s"a[href*=$containStr]").iterator.toList
    for (anchor <- anchors) yield Link(
      anchor.text,
      anchor.attr("href"),
      anchor.select("img[src]").attr("src"),
      anchor.select("[class*=desc]").text)
  }

  /** The URL in its `java.net.URL` string form, or `None` when it is malformed. */
  def safeURL(url: String): Option[String] =
    catching(classOf[MalformedURLException]).opt(new URL(url)).map(_.toString)
}
开发者ID:suya55,项目名称:kakaoYellowIdBot,代码行数:48,代码来源:HtmlParser.scala
注:本文中的org.jsoup.Jsoup类示例整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论