• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

ruippeixotog / scala-scraper / 11309684064

12 Oct 2024 11:53PM UTC coverage: 86.667%. Remained the same
11309684064

push

web-flow
Update sbt-pgp to 2.3.0 (#517)

299 of 345 relevant lines covered (86.67%)

2.01 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

85.71
/core/src/main/scala/net/ruippeixotog/scalascraper/browser/JsoupBrowser.scala
1
package net.ruippeixotog.scalascraper.browser
2

3
import java.io.{File, InputStream}
4
import java.net.{InetSocketAddress, Proxy => JavaProxy}
5

6
import scala.collection.JavaConverters._
7
import scala.collection.mutable
8

9
import org.jsoup.Connection.Method._
10
import org.jsoup.Connection.Response
11
import org.jsoup.{Connection, Jsoup}
12

13
import net.ruippeixotog.scalascraper.browser.JsoupBrowser._
14
import net.ruippeixotog.scalascraper.model._
15
import net.ruippeixotog.scalascraper.util._
16

17
/** A [[Browser]] implementation based on [[http://jsoup.org jsoup]], a Java HTML parser library. `JsoupBrowser`
18
  * provides powerful and efficient document querying, but it doesn't run JavaScript in the pages. As such, it is
19
  * limited to working strictly with the HTML sent in the page source.
20
  *
21
  * Currently, `JsoupBrowser` does not keep separate cookie stores for different domains and paths. In each request all
22
  * cookies set previously will be sent, regardless of the domain they were set on. If you do requests to different
23
  * domains and do not want this behavior, use different `JsoupBrowser` instances.
24
  *
25
  * As the documents parsed by `JsoupBrowser` instances are not changed after loading, `Document` and `Element`
26
  * instances obtained from them are guaranteed to be immutable.
27
  *
28
  * @param userAgent
29
  *   the user agent with which requests should be made
30
  * @param proxy
31
  *   an optional proxy configuration to use
32
  */
33
class JsoupBrowser(val userAgent: String = "jsoup/1.8", val proxy: JavaProxy = null) extends Browser {
  type DocumentType = JsoupDocument

  // Single cookie store for the whole instance: entries are keyed by cookie name only, never by domain or path.
  private[this] val cookieMap = mutable.Map.empty[String, String]

  /** Requests `url` with an HTTP GET and returns the document produced by the request pipeline. */
  def get(url: String): JsoupDocument =
    executePipeline(Jsoup.connect(url).method(GET).proxy(proxy))

  /** Requests `url` with an HTTP POST, sending `form` as the request data, and returns the resulting document. */
  def post(url: String, form: Map[String, String]): JsoupDocument =
    executePipeline(Jsoup.connect(url).method(POST).proxy(proxy).data(form.asJava))

  /** Parses the contents of `file`, decoded with `charset`. */
  def parseFile(file: File, charset: String): JsoupDocument =
    JsoupDocument(Jsoup.parse(file, charset))

  /** Parses an in-memory HTML string. */
  def parseString(html: String): JsoupDocument =
    JsoupDocument(Jsoup.parse(html))

  /** Parses HTML read from `inputStream`, decoded with `charset` and using an empty base URI.
    *
    * The stream is handed to the `using` helper, which presumably closes it once parsing is done (the helper is
    * defined outside this file; confirm there).
    */
  def parseInputStream(inputStream: InputStream, charset: String): JsoupDocument =
    using(inputStream) { _ => JsoupDocument(Jsoup.parse(inputStream, charset, "")) }

  /** Returns an immutable snapshot of every cookie held by this browser. The `url` argument is not used, as the
    * cookie store is shared across all URLs.
    */
  def cookies(url: String) = cookieMap.toMap

  /** Stores (or overwrites) the cookie `key` with `value`. The `url` argument is not used. */
  def setCookie(url: String, key: String, value: String) = {
    cookieMap += key -> value
  }

  /** Stores (or overwrites) every cookie in `m`. The `url` argument is not used. */
  def setCookies(url: String, m: Map[String, String]) = {
    cookieMap ++= m
  }

  /** Removes every cookie held by this browser. */
  def clearCookies() = cookieMap.clear()

  /** Creates a new browser with the same user agent that sends its requests through `proxy`.
    *
    * The returned instance has its own, initially empty, cookie store; cookies held by this browser are not copied.
    */
  def withProxy(proxy: Proxy): JsoupBrowser = {
    val newJavaProxy = new JavaProxy(
      // Anything that is not explicitly SOCKS is treated as an HTTP proxy.
      if (proxy.proxyType == Proxy.SOCKS) JavaProxy.Type.SOCKS else JavaProxy.Type.HTTP,
      new InetSocketAddress(proxy.host, proxy.port)
    )
    new JsoupBrowser(userAgent, newJavaProxy)
  }

  /** Extension point for customizing each request. It is applied after [[defaultRequestSettings]] and before the
    * request is executed; the default implementation returns `conn` untouched.
    */
  def requestSettings(conn: Connection): Connection = conn

  /** Applies the settings every request starts with: the stored cookies, the configured user agent, HTML/XML
    * `Accept` headers, a UTF-8 `Accept-Charset`, a timeout of 15000 (milliseconds, per the jsoup `Connection` API)
    * and a max body size of 0 (which jsoup documents as "unlimited").
    */
  protected[this] def defaultRequestSettings(conn: Connection): Connection =
    conn
      .cookies(cookieMap.asJava)
      .userAgent(userAgent)
      .header("Accept", "text/html,application/xhtml+xml,application/xml")
      .header("Accept-Charset", "utf-8")
      .timeout(15000)
      .maxBodySize(0)

  /** Executes the prepared request and returns the raw response. */
  protected[this] def executeRequest(conn: Connection): Response =
    conn.execute()

  /** Turns a response into a document.
    *
    * Cookies set by the response are merged into the cookie store first. If the response carries a `Location`
    * header, that location is then fetched with a fresh GET through the whole pipeline and the body of `res` is
    * never parsed; otherwise the body of `res` is parsed and returned.
    *
    * The `Location` value is resolved against the URL of `res` before being requested, because the header is
    * allowed to contain a relative reference, which cannot be opened on its own.
    */
  protected[this] def processResponse(res: Connection.Response): JsoupDocument = {
    cookieMap ++= res.cookies.asScala
    if (res.hasHeader("Location")) get(resolveLocation(res, res.header("Location")))
    else JsoupDocument(res.parse)
  }

  /** Resolves a `Location` header value against the URL of the response that carried it.
    *
    * Absolute locations are returned as they are. If resolution fails for any reason (for instance, the response
    * has no URL to resolve against), the raw header value is returned so that behavior degrades to a plain
    * `get(location)`.
    */
  private[this] def resolveLocation(res: Connection.Response, location: String): String =
    scala.util.Try(new java.net.URL(res.url, location).toExternalForm).getOrElse(location)

  // The request pipeline: default settings -> user settings -> execution -> response processing.
  private[this] val executePipeline: Connection => JsoupDocument =
    (defaultRequestSettings _)
      .andThen(requestSettings)
      .andThen(executeRequest)
      .andThen(processResponse)
}
99

100
object JsoupBrowser {

  /** Creates a `JsoupBrowser` with the default settings, exposed through the generic `Browser` interface. */
  def apply(): Browser = new JsoupBrowser()

  /** Creates a `JsoupBrowser` with the default settings, keeping the concrete `JsoupBrowser` type. */
  def typed(): JsoupBrowser = new JsoupBrowser()

  /** An `Element` implementation that delegates to a jsoup element. */
  case class JsoupElement(underlying: org.jsoup.nodes.Element) extends Element {
    type ThisType = JsoupElement

    def tagName = underlying.tagName

    // jsoup may hand back `null` for the parent, hence the `Option` wrapper.
    def parent = Option(underlying.parent).map(JsoupElement(_))

    def children =
      for (child <- underlying.children.asScala) yield JsoupElement(child)

    def siblings =
      for (sibling <- underlying.siblingElements.asScala) yield JsoupElement(sibling)

    // Nodes that `JsoupNode` does not model are dropped from the result.
    def childNodes = underlying.childNodes.asScala.flatMap(n => JsoupNode(n))

    // Nodes that `JsoupNode` does not model are dropped from the result.
    def siblingNodes = underlying.siblingNodes.asScala.flatMap(n => JsoupNode(n))

    def attrs =
      (for (entry <- underlying.attributes.asScala) yield entry.getKey -> entry.getValue).toMap

    def hasAttr(name: String) = underlying.hasAttr(name)

    /** Returns the value of attribute `name`, throwing a `NoSuchElementException` when the element lacks it. */
    def attr(name: String) = {
      if (!underlying.hasAttr(name)) throw new NoSuchElementException
      underlying.attr(name)
    }

    def text = underlying.text

    def ownText = underlying.ownText

    def innerHtml = underlying.html

    def outerHtml = underlying.outerHtml

    // Runs the CSS query against the wrapped element and wraps every match.
    private[this] def selectUnderlying(cssQuery: String): Iterator[JsoupElement] =
      underlying.select(cssQuery).iterator.asScala.map(JsoupElement(_))

    def select(cssQuery: String) = ElementQuery(cssQuery, this, selectUnderlying)
  }

  /** Conversion from jsoup nodes to the scraper's node model. */
  object JsoupNode {

    /** Wraps elements and text nodes; any other kind of node (and `null`) yields `None`. */
    def apply(underlying: org.jsoup.nodes.Node): Option[Node] =
      Option(underlying).collect {
        case element: org.jsoup.nodes.Element => ElementNode(JsoupElement(element))
        case text: org.jsoup.nodes.TextNode => TextNode(text.text)
      }
  }

  /** A `Document` implementation that delegates to a jsoup document. */
  case class JsoupDocument(underlying: org.jsoup.nodes.Document) extends Document {
    type ElementType = JsoupElement

    def location = underlying.location()

    // The root is the first `html` element found in the document.
    def root = {
      val htmlElement = underlying.getElementsByTag("html").first
      JsoupElement(htmlElement)
    }

    override def title = underlying.title

    override def head = JsoupElement(underlying.head)

    override def body = JsoupElement(underlying.body)

    def toHtml = underlying.outerHtml
  }

}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc