From 448fe8fddbbb030603a8e7ac87be4d3454c6800b Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 10 Jul 2019 17:19:04 -0700 Subject: [PATCH] Update session cache and remove dnt options --- README.md | 2 -- src/arachnid.cr | 28 +++++++++++++++++++++++++--- src/arachnid/agent.cr | 6 +++--- src/arachnid/session_cache.cr | 17 +++++------------ 4 files changed, 33 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index d80ba45..b602ecd 100644 --- a/README.md +++ b/README.md @@ -184,7 +184,6 @@ Arachnid has a ton of configration options which can be passed to the mehthods l - **read_timeout** - Read timeout - **connect_timeout** - Connect timeout - **max_redirects** - Maximum amount of redirects to follow -- **do_not_track** - Sets the DNT header - **default_headers** - Default HTTP headers to use for all hosts - **host_header** - HTTP host header to use - **host_headers** - HTTP headers to use for specific hosts @@ -199,7 +198,6 @@ Arachnid has a ton of configration options which can be passed to the mehthods l There are also a few class properties on `Arachnid` itself which are used as the defaults, unless overrided. -- **do_not_track** - **max_redirects** - **connect_timeout** - **read_timeout** diff --git a/src/arachnid.cr b/src/arachnid.cr index 69bf7e9..6789c59 100644 --- a/src/arachnid.cr +++ b/src/arachnid.cr @@ -13,9 +13,6 @@ module Arachnid # Specifies whether robots.txt should be honored globally class_property? robots : Bool = false - # Should we set the DNT (Do Not Track) header? - class_property? do_not_track : Bool = false - # Maximum amount of redirects to follow class_property max_redirects : Int32 = 5 @@ -62,3 +59,28 @@ module Arachnid end end end + +require "json" + +# Let's build a sitemap of crystal-lang.org +# Links will be a hash of url to resource title +links = {} of String => String + +# Visit a particular host, in this case `crystal-lang.org`. This will +# not match on subdomains. 
+Arachnid.host("https://crystal-lang.org") do |spider| + # Ignore the API section. It's a little big. + spider.ignore_urls_like(/\/(api)\//) + + spider.every_html_page do |page| + puts "Visiting #{page.url.to_s}" + + # Ignore redirects for our sitemap + unless page.redirect? + # Add the url of every visited page to our sitemap + links[page.url.to_s] = page.title.to_s.strip + end + end +end + +File.write("crystal-lang.org-sitemap.json", links.to_pretty_json) diff --git a/src/arachnid/agent.cr b/src/arachnid/agent.cr index b2d4e71..761a9e7 100644 --- a/src/arachnid/agent.cr +++ b/src/arachnid/agent.cr @@ -64,11 +64,11 @@ module Arachnid # Creates a new `Agent` object. def initialize( + http_client = nil, host : String? = nil, read_timeout : Int32? = nil, connect_timeout : Int32? = nil, max_redirects : Int32? = nil, - do_not_track : Bool? = nil, default_headers : Hash(String, String)? = nil, host_header : String? = nil, host_headers : Hash(String | Regex, String)? = nil, @@ -111,10 +111,10 @@ module Arachnid @max_depth = max_depth @sessions = SessionCache.new( + http_client || HTTPClient::Default, read_timeout, connect_timeout, - max_redirects, - do_not_track + max_redirects ) @cookies = CookieJar.new diff --git a/src/arachnid/session_cache.cr b/src/arachnid/session_cache.cr index ff2aac1..e81fe23 100644 --- a/src/arachnid/session_cache.cr +++ b/src/arachnid/session_cache.cr @@ -5,6 +5,9 @@ module Arachnid # Stores active HTTP Sessions organized by scheme, host-name and port. class SessionCache + # The HTTPClient class to use for requests + property client : HTTPClient.class + # Optional read timeout. property read_timeout : Int32 @@ -14,24 +17,19 @@ module Arachnid # Max redirects to follow. property max_redirects : Int32? - # Should we set a DNT (Do Not Track) header? - property? do_not_track : Bool - @sessions = {} of Tuple(String?, String?, Int32?) 
=> HTTPClient # Create a new session cache def initialize( - client, + client : HTTPClient?.class = nil, read_timeout : Int32? = nil, connect_timeout : Int32? = nil, - max_redirects : Int32? = nil, - do_not_track : Bool? = nil + max_redirects : Int32? = nil ) @client = client || HTTPClient::Default @read_timeout = read_timeout || Arachnid.read_timeout @connect_timeout = connect_timeout || Arachnid.connect_timeout @max_redirects = max_redirects || Arachnid.max_redirects - @do_not_track = do_not_track || Arachnid.do_not_track? end # Determines if there is an active session for the given URL @@ -60,11 +58,6 @@ module Arachnid endpoint.fragment = nil endpoint.path = "" - # Set headers - headers = { - "DNT" => @do_not_track ? "1" : "0" - } - unless @sessions.has_key?(key) session = @client.new( endpoint: endpoint,