Update session cache and remove dnt options

This commit is contained in:
Chris 2019-07-10 17:19:04 -07:00
parent 6e68789feb
commit 448fe8fddb
No known key found for this signature in database
GPG Key ID: 37DAEF5F446370A4
4 changed files with 33 additions and 20 deletions

View File

@ -184,7 +184,6 @@ Arachnid has a ton of configuration options which can be passed to the methods l
- **read_timeout** - Read timeout - **read_timeout** - Read timeout
- **connect_timeout** - Connect timeout - **connect_timeout** - Connect timeout
- **max_redirects** - Maximum amount of redirects to follow - **max_redirects** - Maximum amount of redirects to follow
- **do_not_track** - Sets the DNT header
- **default_headers** - Default HTTP headers to use for all hosts - **default_headers** - Default HTTP headers to use for all hosts
- **host_header** - HTTP host header to use - **host_header** - HTTP host header to use
- **host_headers** - HTTP headers to use for specific hosts - **host_headers** - HTTP headers to use for specific hosts
@ -199,7 +198,6 @@ Arachnid has a ton of configuration options which can be passed to the methods l
There are also a few class properties on `Arachnid` itself which are used as the defaults, unless overridden. There are also a few class properties on `Arachnid` itself which are used as the defaults, unless overridden.
- **do_not_track**
- **max_redirects** - **max_redirects**
- **connect_timeout** - **connect_timeout**
- **read_timeout** - **read_timeout**

View File

@ -13,9 +13,6 @@ module Arachnid
# Specifies whether robots.txt should be honored globally # Specifies whether robots.txt should be honored globally
class_property? robots : Bool = false class_property? robots : Bool = false
# Should we set the DNT (Do Not Track) header?
class_property? do_not_track : Bool = false
# Maximum amount of redirects to follow # Maximum amount of redirects to follow
class_property max_redirects : Int32 = 5 class_property max_redirects : Int32 = 5
@ -62,3 +59,28 @@ module Arachnid
end end
end end
end end
require "json"
# Build a sitemap of crystal-lang.org and save it as JSON.
# `sitemap` maps each visited URL to the title of the resource found there.
sitemap = {} of String => String

# Crawl a single host, `crystal-lang.org` in this case. Subdomains
# are not matched.
Arachnid.host("https://crystal-lang.org") do |crawler|
  # Skip the API section; it's a little big.
  crawler.ignore_urls_like(/\/(api)\//)

  crawler.every_html_page do |html_page|
    address = html_page.url.to_s
    puts "Visiting #{address}"

    # Redirects are left out of the sitemap; every other visited page
    # is recorded under its URL with its whitespace-stripped title.
    unless html_page.redirect?
      heading = html_page.title.to_s.strip
      sitemap[address] = heading
    end
  end
end

# Write the collected links out as pretty-printed JSON.
pretty = sitemap.to_pretty_json
File.write("crystal-lang.org-sitemap.json", pretty)

View File

@ -64,11 +64,11 @@ module Arachnid
# Creates a new `Agent` object. # Creates a new `Agent` object.
def initialize( def initialize(
http_client = nil,
host : String? = nil, host : String? = nil,
read_timeout : Int32? = nil, read_timeout : Int32? = nil,
connect_timeout : Int32? = nil, connect_timeout : Int32? = nil,
max_redirects : Int32? = nil, max_redirects : Int32? = nil,
do_not_track : Bool? = nil,
default_headers : Hash(String, String)? = nil, default_headers : Hash(String, String)? = nil,
host_header : String? = nil, host_header : String? = nil,
host_headers : Hash(String | Regex, String)? = nil, host_headers : Hash(String | Regex, String)? = nil,
@ -111,10 +111,10 @@ module Arachnid
@max_depth = max_depth @max_depth = max_depth
@sessions = SessionCache.new( @sessions = SessionCache.new(
http_client || HTTPClient::Default,
read_timeout, read_timeout,
connect_timeout, connect_timeout,
max_redirects, max_redirects
do_not_track
) )
@cookies = CookieJar.new @cookies = CookieJar.new

View File

@ -5,6 +5,9 @@ module Arachnid
# Stores active HTTP Sessions organized by scheme, host-name and port. # Stores active HTTP Sessions organized by scheme, host-name and port.
class SessionCache class SessionCache
# The HTTPClient class to use for requests
property client : HTTPClient.class
# Optional read timeout. # Optional read timeout.
property read_timeout : Int32 property read_timeout : Int32
@ -14,24 +17,19 @@ module Arachnid
# Max redirects to follow. # Max redirects to follow.
property max_redirects : Int32? property max_redirects : Int32?
# Should we set a DNT (Do Not Track) header?
property? do_not_track : Bool
@sessions = {} of Tuple(String?, String?, Int32?) => HTTPClient @sessions = {} of Tuple(String?, String?, Int32?) => HTTPClient
# Create a new session cache # Create a new session cache
def initialize( def initialize(
client, client : HTTPClient?.class = nil,
read_timeout : Int32? = nil, read_timeout : Int32? = nil,
connect_timeout : Int32? = nil, connect_timeout : Int32? = nil,
max_redirects : Int32? = nil, max_redirects : Int32? = nil
do_not_track : Bool? = nil
) )
@client = client || HTTPClient::Default @client = client || HTTPClient::Default
@read_timeout = read_timeout || Arachnid.read_timeout @read_timeout = read_timeout || Arachnid.read_timeout
@connect_timeout = connect_timeout || Arachnid.connect_timeout @connect_timeout = connect_timeout || Arachnid.connect_timeout
@max_redirects = max_redirects || Arachnid.max_redirects @max_redirects = max_redirects || Arachnid.max_redirects
@do_not_track = do_not_track || Arachnid.do_not_track?
end end
# Determines if there is an active session for the given URL # Determines if there is an active session for the given URL
@ -60,11 +58,6 @@ module Arachnid
endpoint.fragment = nil endpoint.fragment = nil
endpoint.path = "" endpoint.path = ""
# Set headers
headers = {
"DNT" => @do_not_track ? "1" : "0"
}
unless @sessions.has_key?(key) unless @sessions.has_key?(key)
session = @client.new( session = @client.new(
endpoint: endpoint, endpoint: endpoint,