Update session cache and remove dnt options
This commit is contained in:
parent
6e68789feb
commit
448fe8fddb
|
@ -184,7 +184,6 @@ Arachnid has a ton of configuration options which can be passed to the methods l
|
|||
- **read_timeout** - Read timeout
|
||||
- **connect_timeout** - Connect timeout
|
||||
- **max_redirects** - Maximum amount of redirects to follow
|
||||
- **do_not_track** - Sets the DNT header
|
||||
- **default_headers** - Default HTTP headers to use for all hosts
|
||||
- **host_header** - HTTP host header to use
|
||||
- **host_headers** - HTTP headers to use for specific hosts
|
||||
|
@ -199,7 +198,6 @@ Arachnid has a ton of configuration options which can be passed to the methods l
|
|||
|
||||
There are also a few class properties on `Arachnid` itself which are used as the defaults unless overridden.
|
||||
|
||||
- **do_not_track**
|
||||
- **max_redirects**
|
||||
- **connect_timeout**
|
||||
- **read_timeout**
|
||||
|
|
|
@ -13,9 +13,6 @@ module Arachnid
|
|||
# Specifies whether robots.txt should be honored globally
|
||||
class_property? robots : Bool = false
|
||||
|
||||
# Should we set the DNT (Do Not Track) header?
|
||||
class_property? do_not_track : Bool = false
|
||||
|
||||
# Maximum amount of redirects to follow
|
||||
class_property max_redirects : Int32 = 5
|
||||
|
||||
|
@ -62,3 +59,28 @@ module Arachnid
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
require "json"
|
||||
|
||||
# Let's build a sitemap of crystal-lang.org
|
||||
# Links will be a hash of url to resource title
|
||||
links = {} of String => String
|
||||
|
||||
# Visit a particular host, in this case `crystal-lang.org`. This will
|
||||
# not match on subdomains.
|
||||
Arachnid.host("https://crystal-lang.org") do |spider|
|
||||
# Ignore the API section. It's a little big.
|
||||
spider.ignore_urls_like(/\/(api)\//)
|
||||
|
||||
spider.every_html_page do |page|
|
||||
puts "Visiting #{page.url.to_s}"
|
||||
|
||||
# Ignore redirects for our sitemap
|
||||
unless page.redirect?
|
||||
# Add the url of every visited page to our sitemap
|
||||
links[page.url.to_s] = page.title.to_s.strip
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
File.write("crystal-lang.org-sitemap.json", links.to_pretty_json)
|
||||
|
|
|
@ -64,11 +64,11 @@ module Arachnid
|
|||
|
||||
# Creates a new `Agent` object.
|
||||
def initialize(
|
||||
http_client = nil,
|
||||
host : String? = nil,
|
||||
read_timeout : Int32? = nil,
|
||||
connect_timeout : Int32? = nil,
|
||||
max_redirects : Int32? = nil,
|
||||
do_not_track : Bool? = nil,
|
||||
default_headers : Hash(String, String)? = nil,
|
||||
host_header : String? = nil,
|
||||
host_headers : Hash(String | Regex, String)? = nil,
|
||||
|
@ -111,10 +111,10 @@ module Arachnid
|
|||
@max_depth = max_depth
|
||||
|
||||
@sessions = SessionCache.new(
|
||||
http_client || HTTPClient::Default,
|
||||
read_timeout,
|
||||
connect_timeout,
|
||||
max_redirects,
|
||||
do_not_track
|
||||
max_redirects
|
||||
)
|
||||
|
||||
@cookies = CookieJar.new
|
||||
|
|
|
@ -5,6 +5,9 @@ module Arachnid
|
|||
# Stores active HTTP Sessions organized by scheme, host-name and port.
|
||||
class SessionCache
|
||||
|
||||
# The HTTPClient class to use for requests
|
||||
property client : HTTPClient.class
|
||||
|
||||
# Optional read timeout.
|
||||
property read_timeout : Int32
|
||||
|
||||
|
@ -14,24 +17,19 @@ module Arachnid
|
|||
# Max redirects to follow.
|
||||
property max_redirects : Int32?
|
||||
|
||||
# Should we set a DNT (Do Not Track) header?
|
||||
property? do_not_track : Bool
|
||||
|
||||
@sessions = {} of Tuple(String?, String?, Int32?) => HTTPClient
|
||||
|
||||
# Create a new session cache
|
||||
def initialize(
|
||||
client,
|
||||
client : HTTPClient?.class = nil,
|
||||
read_timeout : Int32? = nil,
|
||||
connect_timeout : Int32? = nil,
|
||||
max_redirects : Int32? = nil,
|
||||
do_not_track : Bool? = nil
|
||||
max_redirects : Int32? = nil
|
||||
)
|
||||
@client = client || HTTPClient::Default
|
||||
@read_timeout = read_timeout || Arachnid.read_timeout
|
||||
@connect_timeout = connect_timeout || Arachnid.connect_timeout
|
||||
@max_redirects = max_redirects || Arachnid.max_redirects
|
||||
@do_not_track = do_not_track || Arachnid.do_not_track?
|
||||
end
|
||||
|
||||
# Determines if there is an active session for the given URL
|
||||
|
@ -60,11 +58,6 @@ module Arachnid
|
|||
endpoint.fragment = nil
|
||||
endpoint.path = ""
|
||||
|
||||
# Set headers
|
||||
headers = {
|
||||
"DNT" => @do_not_track ? "1" : "0"
|
||||
}
|
||||
|
||||
unless @sessions.has_key?(key)
|
||||
session = @client.new(
|
||||
endpoint: endpoint,
|
||||
|
|
Loading…
Reference in New Issue