Kind of almost working

This commit is contained in:
Chris Watson 2019-08-18 22:22:53 -07:00
parent 02bed53703
commit 64baf33e1d
7 changed files with 81 additions and 61 deletions

View File

@ -15,6 +15,9 @@ dependencies:
version: 0.7.0 version: 0.7.0
strange: strange:
github: hydecr/strange github: hydecr/strange
marionette:
github: watzon/marionette
branch: master
targets: targets:
arachnid: arachnid:

View File

@ -59,3 +59,27 @@ module Arachnid
end end
end end
end end
require "json"
# Build a sitemap for the host crawled below.
# `links` maps each visited URL to the title of the page found there.
links = {} of String => String

# Launch the browser that is handed to the spider through `browser:`.
# NOTE(review): `extended: true` is presumably what provides the proxy the
# agent insists on when a browser is supplied -- confirm against Marionette.
marionette = Marionette.launch(headless: false, extended: true)

# Crawl `https://watzon.tech`, recording every HTML page that gets visited.
Arachnid.host("https://watzon.tech", browser: marionette) do |spider|
  spider.every_html_page do |page|
    url = page.url.to_s
    puts "Visiting #{url}"

    # Redirects are left out of the sitemap.
    next if page.redirect?

    # Remember the page under its URL, with surrounding whitespace
    # stripped from the title.
    links[url] = page.title.to_s.strip
  end
end

# Dump the collected links as pretty-printed JSON.
File.write("neuralegion-sitemap.json", links.to_pretty_json)

View File

@ -1,3 +1,5 @@
require "marionette"
require "./agent/sanitizers" require "./agent/sanitizers"
require "./agent/filters" require "./agent/filters"
require "./agent/events" require "./agent/events"
@ -17,6 +19,11 @@ module Arachnid
# Set to limit to a single host. # Set to limit to a single host.
property host : String? property host : String?
# Make all requests with a `Marionette::Browser` instance.
# Will be slower than normal, but will also allow the
# rendering of JavaScript.
getter browser : Marionette::Browser?
# User agent to use. # User agent to use.
property user_agent : String property user_agent : String
@ -64,8 +71,8 @@ module Arachnid
# Creates a new `Agent` object. # Creates a new `Agent` object.
def initialize( def initialize(
http_client = nil,
host : String? = nil, host : String? = nil,
browser : Marionette::Browser? = nil,
read_timeout : Int32? = nil, read_timeout : Int32? = nil,
connect_timeout : Int32? = nil, connect_timeout : Int32? = nil,
max_redirects : Int32? = nil, max_redirects : Int32? = nil,
@ -84,6 +91,7 @@ module Arachnid
filter_options = nil filter_options = nil
) )
@host = host @host = host
@browser = browser
@host_header = host_header @host_header = host_header
@host_headers = host_headers || {} of (Regex | String) => String @host_headers = host_headers || {} of (Regex | String) => String
@ -110,8 +118,12 @@ module Arachnid
@levels = {} of URI => Int32 @levels = {} of URI => Int32
@max_depth = max_depth @max_depth = max_depth
if browser && !browser.proxy
raise "Can't use marionette without a proxy. Make sure the extended option is true."
end
@sessions = SessionCache.new( @sessions = SessionCache.new(
http_client || HTTPClient::Default, browser,
read_timeout, read_timeout,
connect_timeout, connect_timeout,
max_redirects max_redirects

View File

@ -1,7 +1,9 @@
require "./http_client/**" require "uri"
module Arachnid module Arachnid
abstract class HTTPClient class HTTPClient
property browser : Marionette::Browser?
property endpoint : URI? property endpoint : URI?
@ -14,17 +16,32 @@ module Arachnid
property headers : Hash(String, String) property headers : Hash(String, String)
def initialize( def initialize(
browser : Marionette::Browser? = nil,
endpoint : URI? = nil, endpoint : URI? = nil,
read_timeout : Int32? = nil, read_timeout : Int32? = nil,
connect_timeout : Int32? = nil, connect_timeout : Int32? = nil,
max_redirects : Int32? = nil, max_redirects : Int32? = nil,
headers : Hash(String, String)? = nil headers : Hash(String, String)? = nil
) )
@browser = browser
@endpoint = endpoint @endpoint = endpoint
@read_timeout = read_timeout || Arachnid.read_timeout @read_timeout = read_timeout || Arachnid.read_timeout
@connect_timeout = connect_timeout || Arachnid.connect_timeout @connect_timeout = connect_timeout || Arachnid.connect_timeout
@max_redirects = max_redirects || Arachnid.max_redirects @max_redirects = max_redirects || Arachnid.max_redirects
@headers = headers || {} of String => String @headers = headers || {} of String => String
@client = Halite::Client.new(
endpoint: @endpoint.to_s,
timeout: Halite::Timeout.new(
connect: @connect_timeout,
read: @read_timeout
),
follow: Halite::Follow.new(
hops: @max_redirects,
strict: false
),
headers: headers,
)
end end
{% for method in [:get, :post, :put, :patch, :delete] %} {% for method in [:get, :post, :put, :patch, :delete] %}
@ -37,7 +54,20 @@ module Arachnid
end end
{% end %} {% end %}
abstract def request(method, path, options) getter client : Halite::Client
# Performs a `method` request for `path`.
#
# When a browser is set, the request is executed through the browser's
# proxy and the result is wrapped in a `Halite::Response`. Otherwise the
# request is delegated to the Halite client.
#
# `options` is read for `:headers` and `:body` on the browser path and is
# splatted into `Halite::Options` on the Halite path.
def request(method, path, options)
  if browser = @browser
    # Absolute URL: the session endpoint joined with the requested path.
    target = File.join(@endpoint.to_s, path)

    # Copy caller-supplied headers into an `HTTP::Headers`; nil when the
    # options carry no headers.
    proxy_headers =
      if options[:headers]?
        options[:headers].each_with_object(HTTP::Headers.new) { |(name, value), acc| acc.add(name, value) }
      else
        nil
      end
    payload = options[:body]?

    # `not_nil!` raises if the browser has no proxy.
    proxied = browser.proxy.not_nil!.exec(method, target, proxy_headers, payload)
    Halite::Response.new(URI.parse(target), proxied)
  else
    halite_options = Halite::Options.new(**options)
    @client.request(method.to_s, path.to_s, halite_options)
  end
end
def request(method, path, **options) def request(method, path, **options)
request(method, path, options) request(method, path, options)

View File

@ -1,39 +0,0 @@
require "halite"
module Arachnid
  abstract class HTTPClient
    # `HTTPClient` implementation that performs every request through a
    # `Halite::Client`.
    class Default < HTTPClient
      # The underlying Halite client.
      getter client : Halite::Client

      # Creates the client.
      #
      # All arguments are forwarded to the parent initializer; the Halite
      # client is then configured from the resulting endpoint, timeouts
      # and redirect limit, plus the `headers` argument as given.
      def initialize(
        endpoint : URI? = nil,
        read_timeout : Int32? = nil,
        connect_timeout : Int32? = nil,
        max_redirects : Int32? = nil,
        headers : Hash(String, String)? = nil
      )
        super(endpoint, read_timeout, connect_timeout, max_redirects, headers)

        timeout = Halite::Timeout.new(
          connect: @connect_timeout,
          read: @read_timeout
        )
        # Redirects are followed non-strictly, up to `@max_redirects` hops.
        follow = Halite::Follow.new(
          hops: @max_redirects,
          strict: false
        )

        @client = Halite::Client.new(
          endpoint: @endpoint.to_s,
          timeout: timeout,
          follow: follow,
          headers: headers,
        )
      end

      # Sends a `method` request for `path` through the Halite client,
      # converting `options` into `Halite::Options` first.
      def request(method, path, options)
        halite_options = Halite::Options.new(**options)
        @client.request(method.to_s, path.to_s, halite_options)
      end
    end
  end
end

View File

@ -1,11 +0,0 @@
module Arachnid
  abstract class HTTPClient
    # Placeholder `HTTPClient` for webdriver-backed requests.
    class Webdriver < HTTPClient
      # Always raises: requests through a webdriver are not implemented.
      def request(method, path, options)
        raise "Not implemented yet"
      end
    end
  end
end

View File

@ -5,8 +5,8 @@ module Arachnid
# Stores active HTTP Sessions organized by scheme, host-name and port. # Stores active HTTP Sessions organized by scheme, host-name and port.
class SessionCache class SessionCache
# The HTTPClient class to use for requests # `Marionette::Browser` instance to proxy requests through.
property client : HTTPClient.class getter browser : Marionette::Browser?
# Optional read timeout. # Optional read timeout.
property read_timeout : Int32 property read_timeout : Int32
@ -21,12 +21,12 @@ module Arachnid
# Create a new session cache # Create a new session cache
def initialize( def initialize(
client : HTTPClient?.class = nil, browser : Marionette::Browser? = nil,
read_timeout : Int32? = nil, read_timeout : Int32? = nil,
connect_timeout : Int32? = nil, connect_timeout : Int32? = nil,
max_redirects : Int32? = nil max_redirects : Int32? = nil
) )
@client = client || HTTPClient::Default @browser = browser
@read_timeout = read_timeout || Arachnid.read_timeout @read_timeout = read_timeout || Arachnid.read_timeout
@connect_timeout = connect_timeout || Arachnid.connect_timeout @connect_timeout = connect_timeout || Arachnid.connect_timeout
@max_redirects = max_redirects || Arachnid.max_redirects @max_redirects = max_redirects || Arachnid.max_redirects
@ -59,12 +59,13 @@ module Arachnid
endpoint.path = "" endpoint.path = ""
unless @sessions.has_key?(key) unless @sessions.has_key?(key)
session = @client.new( session = HTTPClient.new(
browser: browser,
endpoint: endpoint, endpoint: endpoint,
read_timeout: @read_timeout, read_timeout: @read_timeout,
connect_timeout: @connect_timeout, connect_timeout: @connect_timeout,
max_redirects: @max_redirects, max_redirects: @max_redirects,
headers: headers, # headers: headers,
) )
@sessions[key] = session @sessions[key] = session