From 64baf33e1d9cf49b32742d41e4db69ca871192ef Mon Sep 17 00:00:00 2001 From: Chris Watson Date: Sun, 18 Aug 2019 22:22:53 -0700 Subject: [PATCH] Kind of almost working --- shard.yml | 3 +++ src/arachnid.cr | 24 +++++++++++++++++ src/arachnid/agent.cr | 16 +++++++++-- src/arachnid/http_client.cr | 36 ++++++++++++++++++++++--- src/arachnid/http_client/default.cr | 39 --------------------------- src/arachnid/http_client/webdriver.cr | 11 -------- src/arachnid/session_cache.cr | 13 ++++----- 7 files changed, 81 insertions(+), 61 deletions(-) delete mode 100644 src/arachnid/http_client/default.cr delete mode 100644 src/arachnid/http_client/webdriver.cr diff --git a/shard.yml b/shard.yml index 6ae862d..fb0f114 100644 --- a/shard.yml +++ b/shard.yml @@ -15,6 +15,9 @@ dependencies: version: 0.7.0 strange: github: hydecr/strange + marionette: + github: watzon/marionette + branch: master targets: arachnid: diff --git a/src/arachnid.cr b/src/arachnid.cr index b646438..a0ee63c 100644 --- a/src/arachnid.cr +++ b/src/arachnid.cr @@ -59,3 +59,27 @@ module Arachnid end end end + +require "json" + +# Let's build a sitemap of crystal-lang.org +# Links will be a hash of url to resource title +links = {} of String => String + +marionette = Marionette.launch(headless: false, extended: true) + +# Visit a particular host, in this case `crystal-lang.org`. This will +# not match on subdomains. +Arachnid.host("https://watzon.tech", browser: marionette) do |spider| + spider.every_html_page do |page| + puts "Visiting #{page.url.to_s}" + + # Ignore redirects for our sitemap + unless page.redirect? + # Add the url of every visited page to our sitemap + links[page.url.to_s] = page.title.to_s.strip + end + end +end + +File.write("neuralegion-sitemap.json", links.to_pretty_json) diff --git a/src/arachnid/agent.cr b/src/arachnid/agent.cr index 761a9e7..6a6cc6a 100644 --- a/src/arachnid/agent.cr +++ b/src/arachnid/agent.cr @@ -1,3 +1,5 @@ +require "marionette" + require "./agent/sanitizers" require "./agent/filters" require "./agent/events" @@ -17,6 +19,11 @@ module Arachnid # Set to limit to a single host. property host : String? + # Make all requests with a `Marionette::Browser` instance. + # Will be slower than normal, but will also allow the + # rendering of JavaScript. + getter browser : Marionette::Browser? + # User agent to use. property user_agent : String @@ -64,8 +71,8 @@ module Arachnid # Creates a new `Agent` object. def initialize( - http_client = nil, host : String? = nil, + browser : Marionette::Browser? = nil, read_timeout : Int32? = nil, connect_timeout : Int32? = nil, max_redirects : Int32? = nil, @@ -84,6 +91,7 @@ module Arachnid filter_options = nil ) @host = host + @browser = browser @host_header = host_header @host_headers = host_headers || {} of (Regex | String) => String @@ -110,8 +118,12 @@ module Arachnid @levels = {} of URI => Int32 @max_depth = max_depth + if browser && !browser.proxy + raise "Can't use marionette without a proxy. Make sure the extended option is true." + end + @sessions = SessionCache.new( - http_client || HTTPClient::Default, + browser, read_timeout, connect_timeout, max_redirects diff --git a/src/arachnid/http_client.cr b/src/arachnid/http_client.cr index 1f86d3c..ff3e7a8 100644 --- a/src/arachnid/http_client.cr +++ b/src/arachnid/http_client.cr @@ -1,7 +1,9 @@ -require "./http_client/**" +require "uri" module Arachnid - abstract class HTTPClient + class HTTPClient + + property browser : Marionette::Browser? property endpoint : URI? @@ -14,17 +16,32 @@ module Arachnid property headers : Hash(String, String) def initialize( + browser : Marionette::Browser? = nil, endpoint : URI? = nil, read_timeout : Int32? = nil, connect_timeout : Int32? = nil, max_redirects : Int32? = nil, headers : Hash(String, String)? = nil ) + @browser = browser @endpoint = endpoint @read_timeout = read_timeout || Arachnid.read_timeout @connect_timeout = connect_timeout || Arachnid.connect_timeout @max_redirects = max_redirects || Arachnid.max_redirects @headers = headers || {} of String => String + + @client = Halite::Client.new( + endpoint: @endpoint.to_s, + timeout: Halite::Timeout.new( + connect: @connect_timeout, + read: @read_timeout + ), + follow: Halite::Follow.new( + hops: @max_redirects, + strict: false + ), + headers: headers, + ) end {% for method in [:get, :post, :put, :patch, :delete] %} @@ -37,7 +54,20 @@ module Arachnid end {% end %} - abstract def request(method, path, options) + getter client : Halite::Client + + def request(method, path, options) + if browser = @browser + url = File.join(@endpoint.to_s, path) + headers = options[:headers]? ? options[:headers].each_with_object(HTTP::Headers.new) { |(k, v), h| h.add(k,v) } : nil + body = options[:body]? + res = browser.proxy.not_nil!.exec(method, url, headers, body) + Halite::Response.new(URI.parse(url), res) + else + options = Halite::Options.new(**options) + @client.request(method.to_s, path.to_s, options) + end + end def request(method, path, **options) request(method, path, options) diff --git a/src/arachnid/http_client/default.cr b/src/arachnid/http_client/default.cr deleted file mode 100644 index ab15dd8..0000000 --- a/src/arachnid/http_client/default.cr +++ /dev/null @@ -1,39 +0,0 @@ -require "halite" - -module Arachnid - abstract class HTTPClient - class Default < HTTPClient - - getter client : Halite::Client - - def initialize( - endpoint : URI? = nil, - read_timeout : Int32? = nil, - connect_timeout : Int32? = nil, - max_redirects : Int32? = nil, - headers : Hash(String, String)? = nil - ) - super(endpoint, read_timeout, connect_timeout, max_redirects, headers) - - @client = Halite::Client.new( - endpoint: @endpoint.to_s, - timeout: Halite::Timeout.new( - connect: @connect_timeout, - read: @read_timeout - ), - follow: Halite::Follow.new( - hops: @max_redirects, - strict: false - ), - headers: headers, - ) - end - - def request(method, path, options) - options = Halite::Options.new(**options) - @client.request(method.to_s, path.to_s, options) - end - - end - end -end diff --git a/src/arachnid/http_client/webdriver.cr b/src/arachnid/http_client/webdriver.cr deleted file mode 100644 index a250e8b..0000000 --- a/src/arachnid/http_client/webdriver.cr +++ /dev/null @@ -1,11 +0,0 @@ -module Arachnid - abstract class HTTPClient - class Webdriver < HTTPClient - - def request(method, path, options) - raise "Not implemented yet" - end - - end - end -end diff --git a/src/arachnid/session_cache.cr b/src/arachnid/session_cache.cr index e81fe23..1b87b8a 100644 --- a/src/arachnid/session_cache.cr +++ b/src/arachnid/session_cache.cr @@ -5,8 +5,8 @@ module Arachnid # Stores active HTTP Sessions organized by scheme, host-name and port. class SessionCache - # The HTTPClient class to use for requests - property client : HTTPClient.class + # `Marionette::Browser` instance to proxy requests through. + getter browser : Marionette::Browser? # Optional read timeout. property read_timeout : Int32 @@ -21,12 +21,12 @@ module Arachnid # Create a new session cache def initialize( - client : HTTPClient?.class = nil, + browser : Marionette::Browser? = nil, read_timeout : Int32? = nil, connect_timeout : Int32? = nil, max_redirects : Int32? = nil ) - @client = client || HTTPClient::Default + @browser = browser @read_timeout = read_timeout || Arachnid.read_timeout @connect_timeout = connect_timeout || Arachnid.connect_timeout @max_redirects = max_redirects || Arachnid.max_redirects @@ -59,12 +59,13 @@ module Arachnid endpoint.path = "" unless @sessions.has_key?(key) - session = @client.new( + session = HTTPClient.new( + browser: browser, endpoint: endpoint, read_timeout: @read_timeout, connect_timeout: @connect_timeout, max_redirects: @max_redirects, - headers: headers, + # headers: headers, ) @sessions[key] = session