elixir-crawly / crawly / build da28628b-fefb-408c-b84b-ad2746345eb6 (pending completion)

Pull Request #251: Refactor Jobs History
Commit "Create jobs history" by oltarasenko, built on circle-ci

21 of 21 new or added lines in 3 files covered (100.0%)
588 of 732 relevant lines covered (80.33%)
121.41 hits per line

Source File: /lib/crawly/api.ex (56.92% covered)

defmodule Crawly.API.Router do
  @moduledoc """
  Crawly HTTP API. Allows scheduling/stopping spiders and getting
  stats of all running spiders.
  """

  require Logger

  use Plug.Router

  @spider_validation_schema %{
    "type" => "object",
    "additionalProperties" => false,
    "required" => ["name", "links_to_follow", "fields", "start_urls"],
    "properties" => %{
      "name" => %{"type" => "string"},
      "base_url" => %{"type" => "string", "format" => "uri"},
      "start_urls" => %{
        "type" => "array",
        "items" => %{"type" => "string", "format" => "uri"}
      },
      "links_to_follow" => %{
        "type" => "array",
        "items" => %{
          "type" => "object",
          "additionalProperties" => false,
          "properties" => %{
            "selector" => %{"type" => "string"},
            "attribute" => %{"type" => "string"}
          }
        }
      },
      "fields" => %{
        "type" => "array",
        "items" => %{
          "type" => "object",
          "additionalProperties" => false,
          "properties" => %{
            "name" => %{"type" => "string"},
            "selector" => %{"type" => "string"}
          }
        }
      }
    }
  }
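
  # Illustrative sketch (values are hypothetical): a minimal spider
  # definition that satisfies the schema above once its YAML form is
  # parsed into a map:
  #
  #   %{
  #     "name" => "BooksSpider",
  #     "base_url" => "https://example.com",
  #     "start_urls" => ["https://example.com/catalog"],
  #     "links_to_follow" => [%{"selector" => "a", "attribute" => "href"}],
  #     "fields" => [%{"name" => "title", "selector" => "h1"}]
  #   }
  #
  # ExJsonSchema.Validator.validate(@spider_validation_schema, map) then
  # returns :ok, exactly as used in validate_new_spider_request/1 below.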

  plug(Plug.Parsers, parsers: [:urlencoded, :multipart])
  plug(:match)
  plug(:dispatch)

  # Simple UI for crawly management
  get "/" do
    running_spiders = Crawly.Engine.running_spiders()

    spiders_list =
      Enum.map(
        Crawly.list_spiders(),
        fn spider ->
          {crawl_id, state} =
            case Map.get(running_spiders, spider) do
              {_pid, crawl_id} -> {crawl_id, :running}
              nil -> {nil, :idle}
            end

          spider_name =
            spider
            |> Atom.to_string()
            |> String.replace_leading("Elixir.", "")

          {scraped, scheduled} =
            case state == :running do
              false ->
                {" - ", " -  "}

              true ->
                {:stored_items, num} = Crawly.DataStorage.stats(spider)

                {:stored_requests, scheduled} =
                  Crawly.RequestsStorage.stats(spider)

                {num, scheduled}
            end

          editable? =
            case Crawly.SimpleStorage.get(:spiders, spider_name) do
              {:error, :not_found} -> false
              {:ok, _value} -> true
              _ -> false
            end

          %{
            name: spider_name,
            crawl_id: crawl_id,
            scheduled: scheduled,
            scraped: scraped,
            state: state,
            editable?: editable?
          }
        end
      )

    jobs_log =
      Crawly.Models.Job
      |> Crawly.SimpleStorage.list()
      |> Enum.map(fn crawl_id ->
        {:ok, crawl_info} =
          Crawly.SimpleStorage.get(Crawly.Models.Job, crawl_id)

        crawl_info
      end)
      |> Enum.sort(fn a, b ->
        case DateTime.compare(a.start, b.start) do
          :gt -> true
          _ -> false
        end
      end)

    response =
      render_template("list.html.eex",
        spiders_list: spiders_list,
        jobs_log: jobs_log
      )

    send_resp(conn, 200, response)
  end

  get "/new" do
    spider_name = Map.get(conn.query_params, "spider_name", "")

    spider_data =
      case spider_name do
        "" ->
          {:ok, ""}

        name ->
          Crawly.SimpleStorage.get(:spiders, name)
      end

    case spider_data do
      {:error, :not_found} ->
        send_resp(conn, 404, "Page not found")

      {:ok, value} ->
        response =
          render_template("new.html.eex",
            data: %{
              "errors" => "",
              "spider" => value,
              "spider_name" => spider_name
            }
          )

        send_resp(conn, 200, response)
    end
  end

  post "/new" do
    name_from_query_params = Map.get(conn.query_params, "spider_name", "")
    spider_yml = Map.get(conn.body_params, "spider")

    # Validate incoming data with the JSON schema
    validation_result =
      case validate_new_spider_request(spider_yml) do
        {:error, errors} ->
          {:error, "#{inspect(errors)}"}

        %{"name" => spider_name} = yml ->
          # Check if the spider is already registered, but allow editing spiders
          case {is_spider_registered(spider_name),
                spider_name == name_from_query_params} do
            {true, false} ->
              {:error,
               "Spider with this name already exists. Try editing it instead of overriding"}

            _ ->
              {:ok, yml}
          end
      end

    case validation_result do
      {:ok, %{"name" => spider_name} = _parsed_yml} ->
        :ok = Crawly.SimpleStorage.put(:spiders, spider_name, spider_yml)

        # Now we can finally load the spider
        Crawly.Utils.load_yml_spider(spider_yml)

        # Now we can redirect to the homepage
        conn
        |> put_resp_header("location", "/")
        |> send_resp(conn.status || 302, "Redirect")

      {:error, errors} ->
        # Show errors and spider
        data = %{"errors" => errors, "spider" => spider_yml}
        response = render_template("new.html.eex", data: data)
        send_resp(conn, 400, response)
    end
  end
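
  # Illustrative sketch (hypothetical values): the "spider" field posted
  # to this endpoint is a YAML document along these lines, which
  # validate_new_spider_request/1 parses with YamlElixir and checks
  # against @spider_validation_schema:
  #
  #   name: BooksSpider
  #   base_url: https://example.com
  #   start_urls:
  #     - https://example.com/catalog
  #   links_to_follow:
  #     - selector: a
  #       attribute: href
  #   fields:
  #     - name: title
  #       selector: h1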

  delete "/spider/:spider_name" do
    Crawly.SimpleStorage.delete(:spiders, spider_name)

    conn
    |> put_resp_header("location", "/")
    |> send_resp(conn.status || 302, "Redirect")
  end

  get "/spiders" do
    msg =
      case Crawly.Engine.running_spiders() do
        spiders when map_size(spiders) == 0 ->
          "No spiders are currently running"

        spiders ->
          "Following spiders are running: #{inspect(spiders)}"
      end

    send_resp(conn, 200, msg)
  end

  get "/spiders/:spider_name/logs/:crawl_id" do
    spider_name = String.to_existing_atom(spider_name)
    log_file_path = Crawly.Utils.spider_log_path(spider_name, crawl_id)

    case File.exists?(log_file_path) do
      true -> Plug.Conn.send_file(conn, 200, log_file_path)
      false -> send_resp(conn, 404, "Oops! Page not found!")
    end
  end

  get "/spiders/:spider_name/items/:crawl_id" do
    folder =
      Application.get_env(:crawly, :pipelines, [])
      |> Keyword.get(Crawly.Pipelines.WriteToFile, [])
      |> Keyword.get(:folder, "")

    file_paths =
      case File.ls(folder) do
        {:ok, list} ->
          Enum.filter(list, fn path -> String.contains?(path, crawl_id) end)

        {:error, _} ->
          []
      end

    case file_paths do
      [] ->
        send_resp(conn, 404, "Oops! Page not found!")

      [file_path] ->
        full_path = Path.join([folder, file_path])
        Plug.Conn.send_file(conn, 200, full_path)

      other ->
        Logger.error("Could not get correct items file: #{inspect(other)}")
        send_resp(conn, 500, "Unexpected error")
    end
  end
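
  # Note: this route can only locate item files when the WriteToFile
  # pipeline is configured with a folder, e.g. in config.exs (the folder
  # value below is hypothetical):
  #
  #   config :crawly,
  #     pipelines: [
  #       {Crawly.Pipelines.WriteToFile, folder: "/tmp/crawly"}
  #     ]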

  get "/spiders/:spider_name/requests" do
    spider_name = String.to_atom("Elixir.#{spider_name}")

    result =
      case Crawly.RequestsStorage.requests(spider_name) do
        {:requests, result} ->
          Enum.map(result, fn req ->
            %{url: req.url, headers: inspect(req.headers)}
          end)

        {:error, _} ->
          []
      end

    response =
      render_template("requests_list.html.eex",
        requests: result,
        spider_name: spider_name
      )

    send_resp(conn, 200, response)
  end

  get "/spiders/:spider_name/schedule" do
    spider_name = String.to_atom("Elixir.#{spider_name}")
    result = Crawly.Engine.start_spider(spider_name)

    msg =
      case result do
        {:error, :spider_already_started} -> "Already started"
        {:error, _} -> "Can't load the spider"
        :ok -> "Started!"
      end

    send_resp(conn, 200, msg)
  end

  get "/spiders/:spider_name/stop" do
    spider_name = String.to_atom("Elixir.#{spider_name}")
    result = Crawly.Engine.stop_spider(spider_name, :manual_stop)

    msg =
      case result do
        {:error, :spider_not_found} -> "Not found"
        {:error, :spider_not_running} -> "Spider is not running"
        :ok -> "Stopped!"
      end

    send_resp(conn, 200, msg)
  end

  get "/spiders/:spider_name/scheduled-requests" do
    spider_name = String.to_atom("Elixir.#{spider_name}")
    result = Crawly.RequestsStorage.stats(spider_name)

    msg =
      case result do
        {:error, :storage_worker_not_running} -> "Spider is not running"
        _ -> "#{inspect(result)}"
      end

    send_resp(conn, 200, msg)
  end

  get "/spiders/:spider_name/scraped-items" do
    spider_name = String.to_existing_atom("Elixir.#{spider_name}")
    result = Crawly.DataStorage.stats(spider_name)

    msg =
      case result do
        {:error, _} -> "Spider is not running"
        _ -> "#{inspect(result)}"
      end

    send_resp(conn, 200, msg)
  end

  get "/load-spiders" do
    loaded_spiders =
      case Crawly.load_spiders() do
        {:ok, spiders} -> spiders
        {:error, :no_spiders_dir} -> []
      end

    send_resp(
      conn,
      200,
      "Loaded following spiders from $SPIDERS_DIR: #{inspect(loaded_spiders)}"
    )
  end

  match _ do
    send_resp(conn, 404, "Oops! Page not found!")
  end

  defp validate_new_spider_request(maybe_yml) do
    with {:ok, yml} <- YamlElixir.read_from_string(maybe_yml),
         :ok <- ExJsonSchema.Validator.validate(@spider_validation_schema, yml) do
      yml
    else
      {:error, _err} = err -> err
    end
  end

  defp is_spider_registered(name) do
    module_name_str = "Elixir." <> name
    module_name = String.to_atom(module_name_str)
    Enum.member?(Crawly.Utils.list_spiders(), module_name)
  end

  defp render_template(template_name, assigns) do
    base_dir = :code.priv_dir(:crawly)
    template = Path.join(base_dir, template_name)
    rendered_template = EEx.eval_file(template, assigns)

    base_template = Path.join(base_dir, "index.html.eex")
    EEx.eval_file(base_template, rendered_template: rendered_template)
  end
end
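
For reference, a minimal sketch of driving this router from an ExUnit test, assuming the standard Plug.Test helpers (the spider name below is hypothetical; only routes defined above are used):

  import Plug.Test

  opts = Crawly.API.Router.init([])

  # List currently running spiders; conn.status is 200 and
  # conn.resp_body names any running spiders
  conn = conn(:get, "/spiders") |> Crawly.API.Router.call(opts)

  # Schedule a spider, then stop it
  conn(:get, "/spiders/BooksSpider/schedule") |> Crawly.API.Router.call(opts)
  conn(:get, "/spiders/BooksSpider/stop") |> Crawly.API.Router.call(opts)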