Kind of almost working
This commit is contained in:
parent
02bed53703
commit
64baf33e1d
|
@ -15,6 +15,9 @@ dependencies:
|
|||
version: 0.7.0
|
||||
strange:
|
||||
github: hydecr/strange
|
||||
marionette:
|
||||
github: watzon/marionette
|
||||
branch: master
|
||||
|
||||
targets:
|
||||
arachnid:
|
||||
|
|
|
@ -59,3 +59,27 @@ module Arachnid
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
require "json"
|
||||
|
||||
# Let's build a sitemap of watzon.tech
|
||||
# Links will be a hash mapping each page URL to its page title
|
||||
links = {} of String => String
|
||||
|
||||
marionette = Marionette.launch(headless: false, extended: true)
|
||||
|
||||
# Visit a particular host, in this case `watzon.tech`. This will
|
||||
# not match on subdomains.
|
||||
Arachnid.host("https://watzon.tech", browser: marionette) do |spider|
|
||||
spider.every_html_page do |page|
|
||||
puts "Visiting #{page.url.to_s}"
|
||||
|
||||
# Ignore redirects for our sitemap
|
||||
unless page.redirect?
|
||||
# Add the url of every visited page to our sitemap
|
||||
links[page.url.to_s] = page.title.to_s.strip
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
File.write("neuralegion-sitemap.json", links.to_pretty_json)
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
require "marionette"
|
||||
|
||||
require "./agent/sanitizers"
|
||||
require "./agent/filters"
|
||||
require "./agent/events"
|
||||
|
@ -17,6 +19,11 @@ module Arachnid
|
|||
# Set to limit to a single host.
|
||||
property host : String?
|
||||
|
||||
# Make all requests with a `Marionette::Browser` instance.
|
||||
# Will be slower than normal, but will also allow the
|
||||
# rendering of JavaScript.
|
||||
getter browser : Marionette::Browser?
|
||||
|
||||
# User agent to use.
|
||||
property user_agent : String
|
||||
|
||||
|
@ -64,8 +71,8 @@ module Arachnid
|
|||
|
||||
# Creates a new `Agent` object.
|
||||
def initialize(
|
||||
http_client = nil,
|
||||
host : String? = nil,
|
||||
browser : Marionette::Browser? = nil,
|
||||
read_timeout : Int32? = nil,
|
||||
connect_timeout : Int32? = nil,
|
||||
max_redirects : Int32? = nil,
|
||||
|
@ -84,6 +91,7 @@ module Arachnid
|
|||
filter_options = nil
|
||||
)
|
||||
@host = host
|
||||
@browser = browser
|
||||
|
||||
@host_header = host_header
|
||||
@host_headers = host_headers || {} of (Regex | String) => String
|
||||
|
@ -110,8 +118,12 @@ module Arachnid
|
|||
@levels = {} of URI => Int32
|
||||
@max_depth = max_depth
|
||||
|
||||
if browser && !browser.proxy
|
||||
raise "Can't use marionette without a proxy. Make sure the extended option is true."
|
||||
end
|
||||
|
||||
@sessions = SessionCache.new(
|
||||
http_client || HTTPClient::Default,
|
||||
browser,
|
||||
read_timeout,
|
||||
connect_timeout,
|
||||
max_redirects
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
require "./http_client/**"
|
||||
require "uri"
|
||||
|
||||
module Arachnid
|
||||
abstract class HTTPClient
|
||||
class HTTPClient
|
||||
|
||||
property browser : Marionette::Browser?
|
||||
|
||||
property endpoint : URI?
|
||||
|
||||
|
@ -14,17 +16,32 @@ module Arachnid
|
|||
property headers : Hash(String, String)
|
||||
|
||||
def initialize(
|
||||
browser : Marionette::Browser? = nil,
|
||||
endpoint : URI? = nil,
|
||||
read_timeout : Int32? = nil,
|
||||
connect_timeout : Int32? = nil,
|
||||
max_redirects : Int32? = nil,
|
||||
headers : Hash(String, String)? = nil
|
||||
)
|
||||
@browser = browser
|
||||
@endpoint = endpoint
|
||||
@read_timeout = read_timeout || Arachnid.read_timeout
|
||||
@connect_timeout = connect_timeout || Arachnid.connect_timeout
|
||||
@max_redirects = max_redirects || Arachnid.max_redirects
|
||||
@headers = headers || {} of String => String
|
||||
|
||||
@client = Halite::Client.new(
|
||||
endpoint: @endpoint.to_s,
|
||||
timeout: Halite::Timeout.new(
|
||||
connect: @connect_timeout,
|
||||
read: @read_timeout
|
||||
),
|
||||
follow: Halite::Follow.new(
|
||||
hops: @max_redirects,
|
||||
strict: false
|
||||
),
|
||||
headers: headers,
|
||||
)
|
||||
end
|
||||
|
||||
{% for method in [:get, :post, :put, :patch, :delete] %}
|
||||
|
@ -37,7 +54,20 @@ module Arachnid
|
|||
end
|
||||
{% end %}
|
||||
|
||||
abstract def request(method, path, options)
|
||||
getter client : Halite::Client
|
||||
|
||||
def request(method, path, options)
|
||||
if browser = @browser
|
||||
url = File.join(@endpoint.to_s, path)
|
||||
headers = options[:headers]? ? options[:headers].each_with_object(HTTP::Headers.new) { |(k, v), h| h.add(k,v) } : nil
|
||||
body = options[:body]?
|
||||
res = browser.proxy.not_nil!.exec(method, url, headers, body)
|
||||
Halite::Response.new(URI.parse(url), res)
|
||||
else
|
||||
options = Halite::Options.new(**options)
|
||||
@client.request(method.to_s, path.to_s, options)
|
||||
end
|
||||
end
|
||||
|
||||
def request(method, path, **options)
|
||||
request(method, path, options)
|
||||
|
|
|
@ -1,39 +0,0 @@
|
|||
require "halite"
|
||||
|
||||
module Arachnid
|
||||
abstract class HTTPClient
|
||||
class Default < HTTPClient
|
||||
|
||||
getter client : Halite::Client
|
||||
|
||||
def initialize(
|
||||
endpoint : URI? = nil,
|
||||
read_timeout : Int32? = nil,
|
||||
connect_timeout : Int32? = nil,
|
||||
max_redirects : Int32? = nil,
|
||||
headers : Hash(String, String)? = nil
|
||||
)
|
||||
super(endpoint, read_timeout, connect_timeout, max_redirects, headers)
|
||||
|
||||
@client = Halite::Client.new(
|
||||
endpoint: @endpoint.to_s,
|
||||
timeout: Halite::Timeout.new(
|
||||
connect: @connect_timeout,
|
||||
read: @read_timeout
|
||||
),
|
||||
follow: Halite::Follow.new(
|
||||
hops: @max_redirects,
|
||||
strict: false
|
||||
),
|
||||
headers: headers,
|
||||
)
|
||||
end
|
||||
|
||||
def request(method, path, options)
|
||||
options = Halite::Options.new(**options)
|
||||
@client.request(method.to_s, path.to_s, options)
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
|
@ -1,11 +0,0 @@
|
|||
module Arachnid
|
||||
abstract class HTTPClient
|
||||
class Webdriver < HTTPClient
|
||||
|
||||
def request(method, path, options)
|
||||
raise "Not implemented yet"
|
||||
end
|
||||
|
||||
end
|
||||
end
|
||||
end
|
|
@ -5,8 +5,8 @@ module Arachnid
|
|||
# Stores active HTTP Sessions organized by scheme, host-name and port.
|
||||
class SessionCache
|
||||
|
||||
# The HTTPClient class to use for requests
|
||||
property client : HTTPClient.class
|
||||
# `Marionette::Browser` instance to proxy requests through.
|
||||
getter browser : Marionette::Browser?
|
||||
|
||||
# Optional read timeout.
|
||||
property read_timeout : Int32
|
||||
|
@ -21,12 +21,12 @@ module Arachnid
|
|||
|
||||
# Create a new session cache
|
||||
def initialize(
|
||||
client : HTTPClient?.class = nil,
|
||||
browser : Marionette::Browser? = nil,
|
||||
read_timeout : Int32? = nil,
|
||||
connect_timeout : Int32? = nil,
|
||||
max_redirects : Int32? = nil
|
||||
)
|
||||
@client = client || HTTPClient::Default
|
||||
@browser = browser
|
||||
@read_timeout = read_timeout || Arachnid.read_timeout
|
||||
@connect_timeout = connect_timeout || Arachnid.connect_timeout
|
||||
@max_redirects = max_redirects || Arachnid.max_redirects
|
||||
|
@ -59,12 +59,13 @@ module Arachnid
|
|||
endpoint.path = ""
|
||||
|
||||
unless @sessions.has_key?(key)
|
||||
session = @client.new(
|
||||
session = HTTPClient.new(
|
||||
browser: browser,
|
||||
endpoint: endpoint,
|
||||
read_timeout: @read_timeout,
|
||||
connect_timeout: @connect_timeout,
|
||||
max_redirects: @max_redirects,
|
||||
headers: headers,
|
||||
# headers: headers,
|
||||
)
|
||||
|
||||
@sessions[key] = session
|
||||
|
|
Loading…
Reference in New Issue