Update session cache and remove dnt options
This commit is contained in:
parent
6e68789feb
commit
448fe8fddb
|
@ -184,7 +184,6 @@ Arachnid has a ton of configuration options which can be passed to the methods l
|
||||||
- **read_timeout** - Read timeout
|
- **read_timeout** - Read timeout
|
||||||
- **connect_timeout** - Connect timeout
|
- **connect_timeout** - Connect timeout
|
||||||
- **max_redirects** - Maximum amount of redirects to follow
|
- **max_redirects** - Maximum amount of redirects to follow
|
||||||
- **do_not_track** - Sets the DNT header
|
|
||||||
- **default_headers** - Default HTTP headers to use for all hosts
|
- **default_headers** - Default HTTP headers to use for all hosts
|
||||||
- **host_header** - HTTP host header to use
|
- **host_header** - HTTP host header to use
|
||||||
- **host_headers** - HTTP headers to use for specific hosts
|
- **host_headers** - HTTP headers to use for specific hosts
|
||||||
|
@ -199,7 +198,6 @@ Arachnid has a ton of configuration options which can be passed to the methods l
|
||||||
|
|
||||||
There are also a few class properties on `Arachnid` itself which are used as the defaults, unless overridden.
|
There are also a few class properties on `Arachnid` itself which are used as the defaults, unless overridden.
|
||||||
|
|
||||||
- **do_not_track**
|
|
||||||
- **max_redirects**
|
- **max_redirects**
|
||||||
- **connect_timeout**
|
- **connect_timeout**
|
||||||
- **read_timeout**
|
- **read_timeout**
|
||||||
|
|
|
@ -13,9 +13,6 @@ module Arachnid
|
||||||
# Specifies whether robots.txt should be honored globally
|
# Specifies whether robots.txt should be honored globally
|
||||||
class_property? robots : Bool = false
|
class_property? robots : Bool = false
|
||||||
|
|
||||||
# Should we set the DNT (Do Not Track) header?
|
|
||||||
class_property? do_not_track : Bool = false
|
|
||||||
|
|
||||||
# Maximum amount of redirects to follow
|
# Maximum amount of redirects to follow
|
||||||
class_property max_redirects : Int32 = 5
|
class_property max_redirects : Int32 = 5
|
||||||
|
|
||||||
|
@ -62,3 +59,28 @@ module Arachnid
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
require "json"
|
||||||
|
|
||||||
|
# Let's build a sitemap of crystal-lang.org
|
||||||
|
# Links will be a hash of url to resource title
|
||||||
|
links = {} of String => String
|
||||||
|
|
||||||
|
# Visit a particular host, in this case `crystal-lang.org`. This will
|
||||||
|
# not match on subdomains.
|
||||||
|
Arachnid.host("https://crystal-lang.org") do |spider|
|
||||||
|
# Ignore the API section. It's a little big.
|
||||||
|
spider.ignore_urls_like(/\/(api)\//)
|
||||||
|
|
||||||
|
spider.every_html_page do |page|
|
||||||
|
puts "Visiting #{page.url.to_s}"
|
||||||
|
|
||||||
|
# Ignore redirects for our sitemap
|
||||||
|
unless page.redirect?
|
||||||
|
# Add the url of every visited page to our sitemap
|
||||||
|
links[page.url.to_s] = page.title.to_s.strip
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
File.write("crystal-lang.org-sitemap.json", links.to_pretty_json)
|
||||||
|
|
|
@ -64,11 +64,11 @@ module Arachnid
|
||||||
|
|
||||||
# Creates a new `Agent` object.
|
# Creates a new `Agent` object.
|
||||||
def initialize(
|
def initialize(
|
||||||
|
http_client = nil,
|
||||||
host : String? = nil,
|
host : String? = nil,
|
||||||
read_timeout : Int32? = nil,
|
read_timeout : Int32? = nil,
|
||||||
connect_timeout : Int32? = nil,
|
connect_timeout : Int32? = nil,
|
||||||
max_redirects : Int32? = nil,
|
max_redirects : Int32? = nil,
|
||||||
do_not_track : Bool? = nil,
|
|
||||||
default_headers : Hash(String, String)? = nil,
|
default_headers : Hash(String, String)? = nil,
|
||||||
host_header : String? = nil,
|
host_header : String? = nil,
|
||||||
host_headers : Hash(String | Regex, String)? = nil,
|
host_headers : Hash(String | Regex, String)? = nil,
|
||||||
|
@ -111,10 +111,10 @@ module Arachnid
|
||||||
@max_depth = max_depth
|
@max_depth = max_depth
|
||||||
|
|
||||||
@sessions = SessionCache.new(
|
@sessions = SessionCache.new(
|
||||||
|
http_client || HTTPClient::Default,
|
||||||
read_timeout,
|
read_timeout,
|
||||||
connect_timeout,
|
connect_timeout,
|
||||||
max_redirects,
|
max_redirects
|
||||||
do_not_track
|
|
||||||
)
|
)
|
||||||
|
|
||||||
@cookies = CookieJar.new
|
@cookies = CookieJar.new
|
||||||
|
|
|
@ -5,6 +5,9 @@ module Arachnid
|
||||||
# Stores active HTTP Sessions organized by scheme, host-name and port.
|
# Stores active HTTP Sessions organized by scheme, host-name and port.
|
||||||
class SessionCache
|
class SessionCache
|
||||||
|
|
||||||
|
# The HTTPClient class to use for requests
|
||||||
|
property client : HTTPClient.class
|
||||||
|
|
||||||
# Optional read timeout.
|
# Optional read timeout.
|
||||||
property read_timeout : Int32
|
property read_timeout : Int32
|
||||||
|
|
||||||
|
@ -14,24 +17,19 @@ module Arachnid
|
||||||
# Max redirects to follow.
|
# Max redirects to follow.
|
||||||
property max_redirects : Int32?
|
property max_redirects : Int32?
|
||||||
|
|
||||||
# Should we set a DNT (Do Not Track) header?
|
|
||||||
property? do_not_track : Bool
|
|
||||||
|
|
||||||
@sessions = {} of Tuple(String?, String?, Int32?) => HTTPClient
|
@sessions = {} of Tuple(String?, String?, Int32?) => HTTPClient
|
||||||
|
|
||||||
# Create a new session cache
|
# Create a new session cache
|
||||||
def initialize(
|
def initialize(
|
||||||
client,
|
client : HTTPClient?.class = nil,
|
||||||
read_timeout : Int32? = nil,
|
read_timeout : Int32? = nil,
|
||||||
connect_timeout : Int32? = nil,
|
connect_timeout : Int32? = nil,
|
||||||
max_redirects : Int32? = nil,
|
max_redirects : Int32? = nil
|
||||||
do_not_track : Bool? = nil
|
|
||||||
)
|
)
|
||||||
@client = client || HTTPClient::Default
|
@client = client || HTTPClient::Default
|
||||||
@read_timeout = read_timeout || Arachnid.read_timeout
|
@read_timeout = read_timeout || Arachnid.read_timeout
|
||||||
@connect_timeout = connect_timeout || Arachnid.connect_timeout
|
@connect_timeout = connect_timeout || Arachnid.connect_timeout
|
||||||
@max_redirects = max_redirects || Arachnid.max_redirects
|
@max_redirects = max_redirects || Arachnid.max_redirects
|
||||||
@do_not_track = do_not_track || Arachnid.do_not_track?
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# Determines if there is an active session for the given URL
|
# Determines if there is an active session for the given URL
|
||||||
|
@ -60,11 +58,6 @@ module Arachnid
|
||||||
endpoint.fragment = nil
|
endpoint.fragment = nil
|
||||||
endpoint.path = ""
|
endpoint.path = ""
|
||||||
|
|
||||||
# Set headers
|
|
||||||
headers = {
|
|
||||||
"DNT" => @do_not_track ? "1" : "0"
|
|
||||||
}
|
|
||||||
|
|
||||||
unless @sessions.has_key?(key)
|
unless @sessions.has_key?(key)
|
||||||
session = @client.new(
|
session = @client.new(
|
||||||
endpoint: endpoint,
|
endpoint: endpoint,
|
||||||
|
|
Loading…
Reference in New Issue