Kind of almost working
This commit is contained in:
parent
02bed53703
commit
64baf33e1d
|
@ -15,6 +15,9 @@ dependencies:
|
||||||
version: 0.7.0
|
version: 0.7.0
|
||||||
strange:
|
strange:
|
||||||
github: hydecr/strange
|
github: hydecr/strange
|
||||||
|
marionette:
|
||||||
|
github: watzon/marionette
|
||||||
|
branch: master
|
||||||
|
|
||||||
targets:
|
targets:
|
||||||
arachnid:
|
arachnid:
|
||||||
|
|
|
@ -59,3 +59,27 @@ module Arachnid
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
require "json"
|
||||||
|
|
||||||
|
# Let's build a sitemap of watzon.tech
|
||||||
|
# Links will be a hash of url to resource title
|
||||||
|
links = {} of String => String
|
||||||
|
|
||||||
|
marionette = Marionette.launch(headless: false, extended: true)
|
||||||
|
|
||||||
|
# Visit a particular host, in this case `watzon.tech`. This will
|
||||||
|
# not match on subdomains.
|
||||||
|
Arachnid.host("https://watzon.tech", browser: marionette) do |spider|
|
||||||
|
spider.every_html_page do |page|
|
||||||
|
puts "Visiting #{page.url.to_s}"
|
||||||
|
|
||||||
|
# Ignore redirects for our sitemap
|
||||||
|
unless page.redirect?
|
||||||
|
# Add the url of every visited page to our sitemap
|
||||||
|
links[page.url.to_s] = page.title.to_s.strip
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
File.write("neuralegion-sitemap.json", links.to_pretty_json)
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
require "marionette"
|
||||||
|
|
||||||
require "./agent/sanitizers"
|
require "./agent/sanitizers"
|
||||||
require "./agent/filters"
|
require "./agent/filters"
|
||||||
require "./agent/events"
|
require "./agent/events"
|
||||||
|
@ -17,6 +19,11 @@ module Arachnid
|
||||||
# Set to limit to a single host.
|
# Set to limit to a single host.
|
||||||
property host : String?
|
property host : String?
|
||||||
|
|
||||||
|
# Make all requests with a `Marionette::Browser` instance.
|
||||||
|
# Will be slower than normal, but will also allow the
|
||||||
|
# rendering of JavaScript.
|
||||||
|
getter browser : Marionette::Browser?
|
||||||
|
|
||||||
# User agent to use.
|
# User agent to use.
|
||||||
property user_agent : String
|
property user_agent : String
|
||||||
|
|
||||||
|
@ -64,8 +71,8 @@ module Arachnid
|
||||||
|
|
||||||
# Creates a new `Agent` object.
|
# Creates a new `Agent` object.
|
||||||
def initialize(
|
def initialize(
|
||||||
http_client = nil,
|
|
||||||
host : String? = nil,
|
host : String? = nil,
|
||||||
|
browser : Marionette::Browser? = nil,
|
||||||
read_timeout : Int32? = nil,
|
read_timeout : Int32? = nil,
|
||||||
connect_timeout : Int32? = nil,
|
connect_timeout : Int32? = nil,
|
||||||
max_redirects : Int32? = nil,
|
max_redirects : Int32? = nil,
|
||||||
|
@ -84,6 +91,7 @@ module Arachnid
|
||||||
filter_options = nil
|
filter_options = nil
|
||||||
)
|
)
|
||||||
@host = host
|
@host = host
|
||||||
|
@browser = browser
|
||||||
|
|
||||||
@host_header = host_header
|
@host_header = host_header
|
||||||
@host_headers = host_headers || {} of (Regex | String) => String
|
@host_headers = host_headers || {} of (Regex | String) => String
|
||||||
|
@ -110,8 +118,12 @@ module Arachnid
|
||||||
@levels = {} of URI => Int32
|
@levels = {} of URI => Int32
|
||||||
@max_depth = max_depth
|
@max_depth = max_depth
|
||||||
|
|
||||||
|
if browser && !browser.proxy
|
||||||
|
raise "Can't use marionette without a proxy. Make sure the extended option is true."
|
||||||
|
end
|
||||||
|
|
||||||
@sessions = SessionCache.new(
|
@sessions = SessionCache.new(
|
||||||
http_client || HTTPClient::Default,
|
browser,
|
||||||
read_timeout,
|
read_timeout,
|
||||||
connect_timeout,
|
connect_timeout,
|
||||||
max_redirects
|
max_redirects
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
require "./http_client/**"
|
require "uri"
|
||||||
|
|
||||||
module Arachnid
|
module Arachnid
|
||||||
abstract class HTTPClient
|
class HTTPClient
|
||||||
|
|
||||||
|
property browser : Marionette::Browser?
|
||||||
|
|
||||||
property endpoint : URI?
|
property endpoint : URI?
|
||||||
|
|
||||||
|
@ -14,17 +16,32 @@ module Arachnid
|
||||||
property headers : Hash(String, String)
|
property headers : Hash(String, String)
|
||||||
|
|
||||||
def initialize(
|
def initialize(
|
||||||
|
browser : Marionette::Browser? = nil,
|
||||||
endpoint : URI? = nil,
|
endpoint : URI? = nil,
|
||||||
read_timeout : Int32? = nil,
|
read_timeout : Int32? = nil,
|
||||||
connect_timeout : Int32? = nil,
|
connect_timeout : Int32? = nil,
|
||||||
max_redirects : Int32? = nil,
|
max_redirects : Int32? = nil,
|
||||||
headers : Hash(String, String)? = nil
|
headers : Hash(String, String)? = nil
|
||||||
)
|
)
|
||||||
|
@browser = browser
|
||||||
@endpoint = endpoint
|
@endpoint = endpoint
|
||||||
@read_timeout = read_timeout || Arachnid.read_timeout
|
@read_timeout = read_timeout || Arachnid.read_timeout
|
||||||
@connect_timeout = connect_timeout || Arachnid.connect_timeout
|
@connect_timeout = connect_timeout || Arachnid.connect_timeout
|
||||||
@max_redirects = max_redirects || Arachnid.max_redirects
|
@max_redirects = max_redirects || Arachnid.max_redirects
|
||||||
@headers = headers || {} of String => String
|
@headers = headers || {} of String => String
|
||||||
|
|
||||||
|
@client = Halite::Client.new(
|
||||||
|
endpoint: @endpoint.to_s,
|
||||||
|
timeout: Halite::Timeout.new(
|
||||||
|
connect: @connect_timeout,
|
||||||
|
read: @read_timeout
|
||||||
|
),
|
||||||
|
follow: Halite::Follow.new(
|
||||||
|
hops: @max_redirects,
|
||||||
|
strict: false
|
||||||
|
),
|
||||||
|
headers: headers,
|
||||||
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
{% for method in [:get, :post, :put, :patch, :delete] %}
|
{% for method in [:get, :post, :put, :patch, :delete] %}
|
||||||
|
@ -37,7 +54,20 @@ module Arachnid
|
||||||
end
|
end
|
||||||
{% end %}
|
{% end %}
|
||||||
|
|
||||||
abstract def request(method, path, options)
|
getter client : Halite::Client
|
||||||
|
|
||||||
|
def request(method, path, options)
|
||||||
|
if browser = @browser
|
||||||
|
url = File.join(@endpoint.to_s, path)
|
||||||
|
headers = options[:headers]? ? options[:headers].each_with_object(HTTP::Headers.new) { |(k, v), h| h.add(k,v) } : nil
|
||||||
|
body = options[:body]?
|
||||||
|
res = browser.proxy.not_nil!.exec(method, url, headers, body)
|
||||||
|
Halite::Response.new(URI.parse(url), res)
|
||||||
|
else
|
||||||
|
options = Halite::Options.new(**options)
|
||||||
|
@client.request(method.to_s, path.to_s, options)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
def request(method, path, **options)
|
def request(method, path, **options)
|
||||||
request(method, path, options)
|
request(method, path, options)
|
||||||
|
|
|
@ -1,39 +0,0 @@
|
||||||
require "halite"
|
|
||||||
|
|
||||||
module Arachnid
|
|
||||||
abstract class HTTPClient
|
|
||||||
class Default < HTTPClient
|
|
||||||
|
|
||||||
getter client : Halite::Client
|
|
||||||
|
|
||||||
def initialize(
|
|
||||||
endpoint : URI? = nil,
|
|
||||||
read_timeout : Int32? = nil,
|
|
||||||
connect_timeout : Int32? = nil,
|
|
||||||
max_redirects : Int32? = nil,
|
|
||||||
headers : Hash(String, String)? = nil
|
|
||||||
)
|
|
||||||
super(endpoint, read_timeout, connect_timeout, max_redirects, headers)
|
|
||||||
|
|
||||||
@client = Halite::Client.new(
|
|
||||||
endpoint: @endpoint.to_s,
|
|
||||||
timeout: Halite::Timeout.new(
|
|
||||||
connect: @connect_timeout,
|
|
||||||
read: @read_timeout
|
|
||||||
),
|
|
||||||
follow: Halite::Follow.new(
|
|
||||||
hops: @max_redirects,
|
|
||||||
strict: false
|
|
||||||
),
|
|
||||||
headers: headers,
|
|
||||||
)
|
|
||||||
end
|
|
||||||
|
|
||||||
def request(method, path, options)
|
|
||||||
options = Halite::Options.new(**options)
|
|
||||||
@client.request(method.to_s, path.to_s, options)
|
|
||||||
end
|
|
||||||
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
|
@ -1,11 +0,0 @@
|
||||||
module Arachnid
|
|
||||||
abstract class HTTPClient
|
|
||||||
class Webdriver < HTTPClient
|
|
||||||
|
|
||||||
def request(method, path, options)
|
|
||||||
raise "Not implemented yet"
|
|
||||||
end
|
|
||||||
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
|
@ -5,8 +5,8 @@ module Arachnid
|
||||||
# Stores active HTTP Sessions organized by scheme, host-name and port.
|
# Stores active HTTP Sessions organized by scheme, host-name and port.
|
||||||
class SessionCache
|
class SessionCache
|
||||||
|
|
||||||
# The HTTPClient class to use for requests
|
# `Marionette::Browser` instance to proxy requests through.
|
||||||
property client : HTTPClient.class
|
getter browser : Marionette::Browser?
|
||||||
|
|
||||||
# Optional read timeout.
|
# Optional read timeout.
|
||||||
property read_timeout : Int32
|
property read_timeout : Int32
|
||||||
|
@ -21,12 +21,12 @@ module Arachnid
|
||||||
|
|
||||||
# Create a new session cache
|
# Create a new session cache
|
||||||
def initialize(
|
def initialize(
|
||||||
client : HTTPClient?.class = nil,
|
browser : Marionette::Browser? = nil,
|
||||||
read_timeout : Int32? = nil,
|
read_timeout : Int32? = nil,
|
||||||
connect_timeout : Int32? = nil,
|
connect_timeout : Int32? = nil,
|
||||||
max_redirects : Int32? = nil
|
max_redirects : Int32? = nil
|
||||||
)
|
)
|
||||||
@client = client || HTTPClient::Default
|
@browser = browser
|
||||||
@read_timeout = read_timeout || Arachnid.read_timeout
|
@read_timeout = read_timeout || Arachnid.read_timeout
|
||||||
@connect_timeout = connect_timeout || Arachnid.connect_timeout
|
@connect_timeout = connect_timeout || Arachnid.connect_timeout
|
||||||
@max_redirects = max_redirects || Arachnid.max_redirects
|
@max_redirects = max_redirects || Arachnid.max_redirects
|
||||||
|
@ -59,12 +59,13 @@ module Arachnid
|
||||||
endpoint.path = ""
|
endpoint.path = ""
|
||||||
|
|
||||||
unless @sessions.has_key?(key)
|
unless @sessions.has_key?(key)
|
||||||
session = @client.new(
|
session = HTTPClient.new(
|
||||||
|
browser: browser,
|
||||||
endpoint: endpoint,
|
endpoint: endpoint,
|
||||||
read_timeout: @read_timeout,
|
read_timeout: @read_timeout,
|
||||||
connect_timeout: @connect_timeout,
|
connect_timeout: @connect_timeout,
|
||||||
max_redirects: @max_redirects,
|
max_redirects: @max_redirects,
|
||||||
headers: headers,
|
# headers: headers,
|
||||||
)
|
)
|
||||||
|
|
||||||
@sessions[key] = session
|
@sessions[key] = session
|
||||||
|
|
Loading…
Reference in New Issue