Initial commit

Chris Watson 2019-06-26 02:45:03 -07:00
commit 9b82f6b48a
No known key found for this signature in database
GPG Key ID: 37DAEF5F446370A4
30 changed files with 2895 additions and 0 deletions

9
.editorconfig Normal file

@ -0,0 +1,9 @@
root = true
[*.cr]
charset = utf-8
end_of_line = lf
insert_final_newline = true
indent_style = space
indent_size = 2
trim_trailing_whitespace = true

9
.gitignore vendored Normal file

@ -0,0 +1,9 @@
/docs/
/lib/
/bin/
/.shards/
*.dwarf
# Libraries don't need dependency lock
# Dependencies will be locked in applications that use them
/shard.lock

6
.travis.yml Normal file

@ -0,0 +1,6 @@
language: crystal
# Uncomment the following if you'd like Travis to run specs and check code formatting
# script:
# - crystal spec
# - crystal tool format --check

21
LICENSE Normal file

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2019 Chris Watson
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

95
README.md Normal file

@ -0,0 +1,95 @@
# Arachnid
Arachnid is a fast and powerful web scraping framework for Crystal. It provides an easy-to-use DSL for scraping web pages and processing everything you might come across.
## Installation
1. Add the dependency to your `shard.yml`:
```yaml
dependencies:
arachnid:
github: watzon/arachnid
```
2. Run `shards install`
## Usage
Arachnid provides an easy-to-use, powerful DSL for scraping websites.
```crystal
require "arachnid"
require "json"
# Let's build a sitemap of crystal-lang.org
# Links will be a hash of url to page title
links = {} of String => String
# Visit a particular host, in this case `crystal-lang.org`. This will
# not match on subdomains.
Arachnid.host("https://crystal-lang.org") do |spider|
# Ignore the API section. It's a little big.
spider.ignore_urls_like(/.*\/api.*/)
spider.every_page do |page|
puts "Visiting #{page.url.to_s}"
# Ignore redirects for our sitemap
unless page.redirect?
# Add the url of every visited page to our sitemap
links[page.url.to_s] = page.title.to_s.strip
end
end
end
File.write("crystal-lang.org-sitemap.json", links.to_pretty_json)
```
Want to scan external links as well?
```crystal
# To make things interesting, this time let's download
# every image we find.
Arachnid.start_at("https://crystal-lang.org") do |spider|
# Set a base path to store all the images at
base_image_dir = File.expand_path("~/Pictures/arachnid")
Dir.mkdir_p(base_image_dir)
spider.every_page do |page|
puts "Scanning #{page.url.to_s}"
if page.image?
# Since we're going to be saving a lot of images
# let's spawn a new fiber for each one. This
# makes things so much faster.
spawn do
# Output directory for images for this host
directory = File.join(base_image_dir, page.url.host.to_s)
Dir.mkdir_p(directory)
# The name of the image
filename = File.basename(page.url.path)
# Save the image using the body of the page
puts "Saving #{filename} to #{directory}"
File.write(File.join(directory, filename), page.body)
end
end
end
end
```
More documentation will be coming soon!
## Contributing
1. Fork it (<https://github.com/watzon/arachnid/fork>)
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Commit your changes (`git commit -am 'Add some feature'`)
4. Push to the branch (`git push origin my-new-feature`)
5. Create a new Pull Request
## Contributors
- [Chris Watson](https://github.com/watzon) - creator and maintainer

17
shard.yml Normal file

@ -0,0 +1,17 @@
name: arachnid
version: 0.1.0
authors:
- Chris Watson <chris@watzon.me>
dependencies:
halite:
github: icyleaf/halite
version: ~> 0.10.1
crystagiri:
github: madeindjs/crystagiri
branch: master
crystal: 0.29.0
license: MIT

9
spec/crepe_spec.cr Normal file

@ -0,0 +1,9 @@
require "./spec_helper"
describe Arachnid do
# TODO: Write tests
it "works" do
false.should eq(true)
end
end

2
spec/spec_helper.cr Normal file

@ -0,0 +1,2 @@
require "spec"
require "../src/arachnid"

32
src/arachnid.cr Normal file

@ -0,0 +1,32 @@
require "./arachnid/version"
require "./arachnid/arachnid"
# To make things interesting, this time let's download
# every image we find.
Arachnid.start_at("https://crystal-lang.org") do |spider|
# Set a base path to store all the images at
base_image_dir = File.expand_path("~/Pictures/arachnid")
Dir.mkdir_p(base_image_dir)
spider.every_page do |page|
puts "Scanning #{page.url.to_s}"
if page.image?
# Since we're going to be saving a lot of images
# let's spawn a new fiber for each one. This
# makes things so much faster.
spawn do
# Output directory for images for this host
directory = File.join(base_image_dir, page.url.host.to_s)
Dir.mkdir_p(directory)
# The name of the image
filename = File.basename(page.url.path)
# Save the image using the body of the page
puts "Saving #{filename} to #{directory}"
File.write(File.join(directory, filename), page.body)
end
end
end
end

543
src/arachnid/agent.cr Normal file

@ -0,0 +1,543 @@
require "./agent/sanitizers"
require "./agent/filters"
require "./agent/events"
require "./agent/actions"
require "./agent/robots"
require "./page"
require "./session_cache"
require "./cookie_jar"
require "./auth_store"
module Arachnid
class Agent
getter? running : Bool
# Set to limit to a single host.
property host : String?
# User agent to use.
property user_agent : String
# HTTP Host Header to use.
property host_header : String?
# HTTP Host Headers to use for specific hosts.
property host_headers : Hash(String | Regex, String)
# HTTP Headers to use for every request.
property default_headers : Hash(String, String)
# HTTP Authentication credentials.
property authorized : AuthStore
# Referer to use.
property referer : String?
# Delay in between fetching pages.
property fetch_delay : Time::Span | Int32
# History containing visited URLs.
getter history : Set(URI)
# List of unreachable URIs.
getter failures : Set(URI)
# Queue of URLs to visit.
getter queue : Array(URI)
# The session cache.
property sessions : SessionCache
# Cached cookies.
property cookies : CookieJar
# Maximum number of pages to visit.
property limit : Int32?
# Maximum depth.
property max_depth : Int32?
# The visited URLs and their depth within a site.
property levels : Hash(URI, Int32)
# Creates a new `Agent` object.
def initialize(
host : String? = nil,
read_timeout : Int32? = nil,
connect_timeout : Int32? = nil,
follow_redirects : Bool? = nil,
max_redirects : Int32? = nil,
do_not_track : Bool? = nil,
default_headers : Hash(String, String)? = nil,
host_header : String? = nil,
host_headers : Hash(String | Regex, String)? = nil,
user_agent : String? = nil,
referer : String? = nil,
fetch_delay : (Int32 | Time::Span)? = nil,
queue : Set(URI)? = nil,
history : Set(URI)? = nil,
limit : Int32? = nil,
max_depth : Int32? = nil,
robots : Bool? = nil,
filter_options = nil
)
@host = host
@host_header = host_header
@host_headers = host_headers || {} of (Regex | String) => String
@default_headers = default_headers || {} of String => String
@user_agent = user_agent || Arachnid.user_agent
@referer = referer
@running = false
@fetch_delay = fetch_delay || 0
@history = history || Set(URI).new
@failures = Set(URI).new
@queue = queue || [] of URI
@limit = limit
@levels = {} of URI => Int32
@max_depth = max_depth
@sessions = SessionCache.new(
read_timeout,
connect_timeout,
follow_redirects,
max_redirects,
do_not_track
)
@cookies = CookieJar.new
@authorized = AuthStore.new
if filter_options
initialize_filters(**filter_options)
else
initialize_filters
end
initialize_robots if robots || Arachnid.robots?
end
# Create a new scoped `Agent` in a block.
def self.new(**options, &block : Agent ->)
_new = new(**options)
with _new yield _new
_new
end
# Creates a new `Agent` and begins spidering at the given URL.
def self.start_at(url, **options, &block : Agent ->)
agent = new(**options, &block)
agent.start_at(url, force: true)
end
# Creates a new `Agent` and spiders the web site located
# at the given URL.
def self.site(url, **options, &block : Agent ->)
url = url.is_a?(URI) ? url : URI.parse(url)
url_regex = Regex.new(Regex.escape(url.host.to_s))
agent = new(**options, &block)
agent.visit_hosts_like(url_regex)
agent.start_at(url, force: true)
end
# Creates a new `Agent` and spiders the given host.
def self.host(url, **options, &block : Agent ->)
url = url.is_a?(URI) ? url : URI.parse(url)
options = options.merge(host: url.host)
agent = new(**options, &block)
agent.start_at(url, force: true)
end
# Clears the history of the `Agent`.
def clear
@queue.clear
@history.clear
@failures.clear
self
end
# Start spidering at a given URL.
# def start_at(url, &block : Page ->)
# enqueue(url)
# run(&block)
# end
# Start spidering at a given URL.
def start_at(url, force = false)
enqueue(url, force: force)
return run
end
# Start spidering until the queue becomes empty or the
# agent is paused.
# def run(&block : Page ->)
# @running = true
# until @queue.empty? || paused? || limit_reached?
# begin
# visit_page(dequeue, &block)
# rescue Actions::Paused
# return self
# rescue Actions::Action
# end
# end
# @running = false
# @sessions.clear
# self
# end
# Start spidering until the queue becomes empty or the
# agent is paused.
def run
@running = true
until @queue.empty? || paused? || limit_reached? || !running?
begin
visit_page(dequeue)
rescue Actions::Paused
return self
rescue Actions::Action
end
end
@running = false
@sessions.clear
self
end
# Sets the history of URLs that were previously visited.
def history=(new_history)
@history.clear
new_history.each do |url|
@history << (url.is_a?(URI) ? url : URI.parse(url))
end
@history
end
# Specifies the links which have been visited.
def visited_links
@history.map(&.to_s)
end
# Specifies the hosts which have been visited.
def visited_hosts
history.map(&.host)
end
# Determines whether a URL was visited or not.
def visited?(url)
url = url.is_a?(URI) ? url : URI.parse(url)
@history.includes?(url)
end
# Sets the list of failed URLs.
def failures=(new_failures)
@failures.clear
new_failures.each do |url|
@failures << (url.is_a?(URI) ? url : URI.parse(url))
end
@failures
end
# Determines whether a given URL could not be visited.
def failed?(url)
url = url.is_a?(URI) ? url : URI.parse(url)
@failures.includes?(url)
end
# Sets the queue of URLs to visit.
# Sets the list of failed URLs.
def queue=(new_queue)
@queue.clear
new_queue.each do |url|
@queue << (url.is_a?(URI) ? url : URI.parse(url))
end
@queue
end
# Determines whether the given URL has been queued for visiting.
def queued?(url)
url = url.is_a?(URI) ? url : URI.parse(url)
@queue.includes?(url)
end
# Enqueues a given URL for visiting, only if it passes all
# of the agent's rules for visiting a given URL.
def enqueue(url, level = 0, force = false)
url = sanitize_url(url)
if (!queued?(url) && visit?(url)) || force
link = url.to_s
return if url.host.to_s.empty?
begin
@every_url_blocks.each { |url_block| url_block.call(url) }
@every_url_like_blocks.each do |pattern, url_blocks|
match = case pattern
when Regex
link =~ pattern
else
(pattern == link) || (pattern == url)
end
if match
url_blocks.each { |url_block| url_block.call(url) }
end
end
rescue action : Actions::Paused
raise(action)
rescue Actions::SkipLink
return false
rescue Actions::Action
end
@queue << url
@levels[url] = level
true
end
end
# Gets and creates a new `Page` object from a given URL,
# yielding the newly created page.
def get_page(url, &block)
url = url.is_a?(URI) ? url : URI.parse(url)
prepare_request(url) do |session, path, handlers|
new_page = Page.new(url, session.get(path, headers: handlers))
# save any new cookies
@cookies.from_page(new_page)
yield new_page
return new_page
end
end
# Gets and creates a new `Page` object from a given URL.
def get_page(url)
url = url.is_a?(URI) ? url : URI.parse(url)
prepare_request(url) do |session, path, handlers|
new_page = Page.new(url, session.get(path, handlers))
# save any new cookies
@cookies.from_page(new_page)
return new_page
end
end
# Posts supplied form data and creates a new Page from a given URL,
# yielding the newly created page.
def post_page(url, post_data = "", &block)
url = url.is_a?(URI) ? url : URI.parse(url)
prepare_request(url) do |session, path, handlers|
new_page = Page.new(url, session.post(path, post_data, handlers))
# save any new cookies
@cookies.from_page(new_page)
yield new_page
return new_page
end
end
# Posts supplied form data and creates a new Page from a given URL.
def post_page(url, post_data = "")
url = url.is_a?(URI) ? url : URI.parse(url)
prepare_request(url) do |session, path, handlers|
new_page = Page.new(url, session.post(path, post_data, handlers))
# save any new cookies
@cookies.from_page(new_page)
return new_page
end
end
# Visits a given URL and enqueues the links recovered
# from the page to be visited later.
# def visit_page(url, &block : Page ->)
# url = sanitize_url(url)
# get_page(url) do |page|
# @history << page.url
# begin
# @every_page_blocks.each { |page_block| page_block.call(page) }
# yield page
# rescue action : Actions::Paused
# raise(action)
# rescue Actions::SkipPage
# return Nil
# rescue Actions::Action
# end
# page.each_url do |next_url|
# begin
# @every_link_blocks.each do |link_block|
# link_block.call(page.url, next_url)
# end
# rescue action : Actions::Paused
# raise(action)
# rescue Actions::SkipLink
# next
# rescue Actions::Action
# end
# if @max_depth.nil? || @max_depth.not_nil! > (@levels[url]? || 0)
# @levels[url] ||= 0
# enqueue(next_url, @levels[url] + 1)
# end
# end
# end
# end
# Visits a given URL and enqueues the links recovered
# from the page to be visited later.
def visit_page(url)
url = sanitize_url(url)
get_page(url) do |page|
@history << page.url
begin
@every_page_blocks.each { |page_block| page_block.call(page) }
rescue action : Actions::Paused
raise(action)
rescue Actions::SkipPage
return nil
rescue Actions::Action
end
page.each_url do |next_url|
begin
@every_link_blocks.each do |link_block|
link_block.call(page.url, next_url)
end
rescue action : Actions::Paused
raise(action)
rescue Actions::SkipLink
next
rescue Actions::Action
end
if @max_depth.nil? || @max_depth.not_nil! > (@levels[url]? || 0)
@levels[url] ||= 0
enqueue(next_url, @levels[url] + 1)
end
end
end
end
# Converts the agent into a hash.
def to_h
{"history" => @history, "queue" => @queue}
end
# Prepares request headers for a given URL.
protected def prepare_request_headers(url)
# set any additional HTTP headers
headers = @default_headers.dup
unless @host_headers.empty?
@host_headers.each do |name, header|
if url.host =~ name
headers["Host"] = header
break
end
end
end
headers["Host"] ||= @host_header.to_s if @host_header
headers["User-Agent"] ||= @user_agent.to_s
headers["Referer"] ||= @referer.to_s if @referer
if authorization = @authorized.for_url(url.host.to_s)
headers["Authorization"] = "Basic #{authorization}"
end
if header_cookies = @cookies.for_host(url.host.to_s)
headers["Cookie"] = header_cookies.to_cookie_header
end
headers
end
# Normalizes the request path and grabs a session to handle
# page get and post requests.
def prepare_request(url, &block)
path = if url.path.empty?
"/"
else
url.path
end
# append the URL query to the path
path += "?#{url.query}" if url.query
headers = prepare_request_headers(url)
begin
sleep(@fetch_delay) if @fetch_delay.to_i > 0
yield @sessions[url], path, headers
rescue Halite::Exception::Error | IO::Error | Socket::Error | OpenSSL::SSL::Error
@sessions.kill!(url)
return nil
end
end
# Dequeues a URL that will later be visited.
def dequeue
@queue.shift
end
# Determines if the maximum limit has been reached.
def limit_reached?
if limit = @limit
return @history.size >= limit
end
false
end
# Determines if a given URL should be visited.
def visit?(url)
# puts [url.to_s, visited?(url), visit_scheme?(url.scheme.to_s), visit_host?(url.host.to_s), visit_port?(url.port || -1), visit_link?(url.to_s), visit_url?(url), visit_ext?(url.path)]
!visited?(url) &&
visit_scheme?(url.scheme.to_s) &&
visit_host?(url.host.to_s) &&
visit_port?(url.port || -1) &&
visit_link?(url.to_s) &&
visit_url?(url) &&
visit_ext?(url.path)
# robot_allowed?(url.to_s)
end
# Adds a given URL to the failures list.
def failed(url)
@failures << url
@every_failed_url_blocks.each { |fail_block| fail_block.call(url) }
true
end
end
end

53
src/arachnid/agent/actions.cr Normal file

@ -0,0 +1,53 @@
module Arachnid
class Agent
module Actions
# A Runtime Error
class RuntimeError < Exception; end
# The base `Actions` exception class
class Action < RuntimeError; end
# Exception used to pause a running `Agent`
class Paused < Action; end
# Exception which causes a running `Agent` to skip a link.
class SkipLink < Action; end
# Exception which causes a running `Agent` to skip a page.
class SkipPage < Action; end
end
# Continue spidering
def continue!(&block)
@paused = false
run(&block)
end
# Sets the pause state of the agent.
def pause=(state)
@paused = state
end
# Pauses the agent, causing spidering to temporarily stop.
def pause!
@paused = true
raise Actions::Paused.new
end
# Determines whether the agent is paused.
def paused?
@paused == true
end
# Causes the agent to skip the link being enqueued.
def skip_link!
raise Actions::SkipLink.new
end
# Causes the agent to skip the page being visited.
def skip_page!
raise Actions::SkipPage.new
end
end
end

248
src/arachnid/agent/events.cr Normal file

@ -0,0 +1,248 @@
require "../page"
module Arachnid
class Agent
@every_url_blocks = [] of Proc(URI, Nil)
@every_failed_url_blocks = [] of Proc(URI, Nil)
@every_url_like_blocks = Hash(String | Regex, Array(Proc(URI, Nil))).new do |hash, key|
hash[key] = [] of Proc(URI, Nil)
end
@every_page_blocks = [] of Proc(Page, Nil)
@every_link_blocks = [] of Proc(URI, URI, Nil)
# Pass each URL from each page visited to the given block.
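# e.g. spider.every_url { |url| puts url }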
def every_url(&block : URI ->)
@every_url_blocks << block
self
end
# Pass each URL that could not be requested to the given block.
def every_failed_url(&block : URI ->)
@every_failed_url_blocks << block
self
end
# Pass every URL that the agent visits, and matches a given pattern,
# to a given block.
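# e.g. spider.every_url_like(/\.pdf$/i) { |url| puts "Found PDF: #{url}" }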
def every_url_like(pattern, &block : URI ->)
@every_url_like_blocks[pattern] << block
self
end
# See `#every_url_like`
def urls_like(pattern, &block : URI ->)
every_url_like(pattern, &block)
end
# Pass the headers from every response the agent receives to a given
# block.
def all_headers(&block)
headers = [] of HTTP::Headers
every_page { |page| headers << page.headers }
headers.each { |header| yield header }
end
# Pass every page that the agent visits to a given block.
def every_page(&block : Page ->)
@every_page_blocks << block
self
end
# Pass every OK page that the agent visits to a given block.
def every_ok_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.ok? }
pages.each { |page| yield page }
end
# Pass every Redirect page that the agent visits to a given block.
def every_redirect_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.redirect? }
pages.each { |page| yield page }
end
# Pass every Timeout page that the agent visits to a given block.
def every_timedout_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.timeout? }
pages.each { |page| yield page }
end
# Pass every Bad Request page that the agent visits to a given block.
def every_bad_request_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.bad_request? }
pages.each { |page| yield page }
end
# Pass every Unauthorized page that the agent visits to a given block.
def every_unauthorized_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.unauthorized? }
pages.each { |page| yield page }
end
# Pass every Forbidden page that the agent visits to a given block.
def every_forbidden_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.forbidden? }
pages.each { |page| yield page }
end
# Pass every Missing page that the agent visits to a given block.
def every_missing_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.missing? }
pages.each { |page| yield page }
end
# Pass every Internal Server Error page that the agent visits to a
# given block.
def every_internal_server_error_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.had_internal_server_error? }
pages.each { |page| yield page }
end
# Pass every Plain Text page that the agent visits to a given block.
def every_txt_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.txt? }
pages.each { |page| yield page }
end
# Pass every HTML page that the agent visits to a given block.
def every_html_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.html? }
pages.each { |page| yield page }
end
# Pass every XML page that the agent visits to a given block.
def every_xml_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.xml? }
pages.each { |page| yield page }
end
# Pass every XML Stylesheet (XSL) page that the agent visits to a
# given block.
def every_xsl_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.xsl? }
pages.each { |page| yield page }
end
# Pass every HTML or XML document that the agent parses to a given
# block.
def every_doc(&block : Document::HTML | XML::Node ->)
docs = [] of Document::HTML | XML::Node
every_page { |page| docs << page.doc.not_nil! if page.doc }
docs.each { |doc| yield doc }
end
# Pass every HTML document that the agent parses to a given block.
def every_html_doc(&block : Document::HTML | XML::Node ->)
docs = [] of Document::HTML
every_page { |page| docs << page.doc.not_nil! if page.html? }
docs.each { |doc| yield doc }
end
# Pass every XML document that the agent parses to a given block.
def every_xml_doc(&block : XML::Node ->)
docs = [] of XML::Node
every_page { |page| docs << page.doc.not_nil! if page.xml? }
docs.each { |doc| yield doc }
end
# Pass every XML Stylesheet (XSL) that the agent parses to a given
# block.
def every_xsl_doc(&block : XML::Node ->)
docs = [] of XML::Node
every_page { |page| docs << page.doc.not_nil! if page.xsl? }
docs.each { |doc| yield doc }
end
# Pass every RSS document that the agent parses to a given block.
def every_rss_doc(&block : XML::Node ->)
docs = [] of XML::Node
every_page { |page| docs << page.doc.not_nil! if page.rss? }
docs.each { |doc| yield doc }
end
# Pass every Atom document that the agent parses to a given block.
def every_atom_doc(&block : XML::Node ->)
docs = [] of XML::Node
every_page { |page| docs << page.doc.not_nil! if page.atom? }
docs.each { |doc| yield doc }
end
# Pass every JavaScript page that the agent visits to a given block.
def every_javascript_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.javascript? }
pages.each { |page| yield page }
end
# Pass every CSS page that the agent visits to a given block.
def every_css_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.css? }
pages.each { |page| yield page }
end
# Pass every RSS feed that the agent visits to a given block.
def every_rss_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.rss? }
pages.each { |page| yield page }
end
# Pass every Atom feed that the agent visits to a given block.
def every_atom_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.atom? }
pages.each { |page| yield page }
end
# Pass every MS Word page that the agent visits to a given block.
def every_ms_word_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.ms_word? }
pages.each { |page| yield page }
end
# Pass every PDF page that the agent visits to a given block.
def every_pdf_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.pdf? }
pages.each { |page| yield page }
end
# Pass every ZIP page that the agent visits to a given block.
def every_zip_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.zip? }
pages.each { |page| yield page }
end
# Passes every image URI to the given blocks.
def every_image(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.image? }
pages.each { |page| yield page }
end
# Passes every origin and destination URI of each link to a given
# block.
def every_link(&block : URI, URI ->)
@every_link_blocks << block
self
end
end
end

256
src/arachnid/agent/filters.cr Normal file

@ -0,0 +1,256 @@
require "../rules"
module Arachnid
class Agent
# List of acceptable URL schemes to follow
getter schemes : Array(String) = [] of String
@host_rules = Rules(String).new
@port_rules = Rules(Int32).new
@link_rules = Rules(String).new
@url_rules = Rules(URI).new
@ext_rules = Rules(String).new
# Sets the list of acceptable URL schemes to visit.
def schemes=(new_schemes)
@schemes = new_schemes.map(&.to_s)
end
# Specifies the patterns that match host-names to visit.
def visit_hosts
@host_rules.accept
end
# Adds a given pattern to the `#visit_hosts`.
def visit_hosts_like(pattern)
visit_hosts << pattern
self
end
def visit_hosts_like(&block)
visit_hosts << block
self
end
# Specifies the patterns that match host-names to not visit.
def ignore_hosts
@host_rules.reject
end
# Adds a given pattern to the `#ignore_hosts`.
def ignore_hosts_like(pattern)
ignore_hosts << pattern
self
end
def ignore_hosts_like(&block)
ignore_hosts << block
self
end
# Specifies the patterns that match the ports to visit.
def visit_ports
@port_rules.accept
end
# Adds a given pattern to the `#visit_ports`.
def visit_ports_like(pattern)
visit_ports << pattern
self
end
def visit_ports_like(&block : Int32 -> Bool)
visit_ports << block
self
end
# Specifies the patterns that match ports to not visit.
def ignore_ports
@port_rules.reject
end
# Adds a given pattern to the `#ignore_ports`.
def ignore_ports_like(pattern)
ignore_ports << pattern
self
end
def ignore_ports_like(&block : Int32 -> Bool)
ignore_ports << block
self
end
# Specifies the patterns that match the links to visit.
def visit_links
@link_rules.accept
end
# Adds a given pattern to the `#visit_links`
def visit_links_like(pattern)
visit_links << pattern
self
end
def visit_links_like(&block : String -> Bool)
visit_links << block
self
end
# Specifies the patterns that match links to not visit.
def ignore_links
@link_rules.reject
end
# Adds a given pattern to the `#ignore_links`.
def ignore_links_like(pattern)
ignore_links << pattern
self
end
def ignore_links_like(&block : String -> Bool)
ignore_links << block
self
end
# Specifies the patterns that match the URLs to visit.
def visit_urls
@url_rules.accept
end
# Adds a given pattern to the `#visit_urls`
def visit_urls_like(&block : URI -> Bool)
visit_urls << block
self
end
def visit_urls_like(pattern)
visit_urls << pattern
self
end
# Specifies the patterns that match URLs to not visit.
def ignore_urls
@url_rules.reject
end
# Adds a given pattern to the `#ignore_urls`.
def ignore_urls_like(&block : URI -> Bool)
ignore_urls << block
self
end
def ignore_urls_like(pattern)
ignore_urls << pattern
self
end
# Specifies the patterns that match the URI path extensions to visit.
def visit_exts
@ext_rules.accept
end
# Adds a given pattern to the `#visit_exts`.
def visit_exts_like(&block : String -> Bool)
visit_exts << block
self
end
def visit_exts_like(pattern)
visit_exts << pattern
self
end
# Specifies the patterns that match URI path extensions to not visit.
def ignore_exts
@ext_rules.reject
end
# Adds a given pattern to the `#ignore_exts`.
def ignore_exts_like(&block : String -> Bool)
ignore_exts << block
self
end
def ignore_exts_like(pattern)
ignore_exts << pattern
self
end
# Initializes filtering rules.
protected def initialize_filters(
schemes = nil,
hosts = nil,
ignore_hosts = nil,
ports = nil,
ignore_ports = nil,
links = nil,
ignore_links = nil,
urls = nil,
ignore_urls = nil,
exts = nil,
ignore_exts = nil
)
if schemes
self.schemes = schemes
else
@schemes << "http"
@schemes << "https"
end
@host_rules.accept = hosts
@host_rules.reject = ignore_hosts
@port_rules.accept = ports
@port_rules.reject = ignore_ports
@link_rules.accept = links
@link_rules.reject = ignore_links
@url_rules.accept = urls
@url_rules.reject = ignore_urls
@ext_rules.accept = exts
@ext_rules.reject = ignore_exts
if host
visit_hosts_like(host.to_s)
end
end
# Determines if a given URI scheme should be visited.
protected def visit_scheme?(scheme)
if scheme
@schemes.includes?(scheme)
else
true
end
end
# Determines if a given host-name should be visited.
protected def visit_host?(host)
@host_rules.accept?(host)
end
# Determines if a given port should be visited.
protected def visit_port?(port)
@port_rules.accept?(port)
end
# Determines if a given link should be visited.
protected def visit_link?(link)
@link_rules.accept?(link)
end
# Determines if a given URL should be visited.
protected def visit_url?(link)
@url_rules.accept?(link)
end
# Determines if a given URI path extension should be visited.
protected def visit_ext?(path)
ext = File.extname(path)
@ext_rules.accept?(ext)
end
end
end

20
src/arachnid/agent/robots.cr Normal file

@ -0,0 +1,20 @@
require "../robots"
module Arachnid
class Agent
@robots : Arachnid::Robots? = nil
# Initializes the robots filter.
def initialize_robots
# @robots = Arachnid::Robots.new(@user_agent)
end
# Determines whether a URL is allowed by the robot policy.
def robot_allowed?(url)
if robots = @robots
return robots.allowed?(url)
end
true
end
end
end

21
src/arachnid/agent/sanitizers.cr Normal file

@ -0,0 +1,21 @@
module Arachnid
class Agent
# Specifies whether the Agent will strip URI fragments
property? strip_fragments : Bool = true
# Specifies whether the Agent will strip URI queries
property? strip_query : Bool = false
# Sanitizes a URL based on filtering options
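# With the default options, "https://example.com/docs#intro" becomes
# "https://example.com/docs" (the fragment is stripped, the query is kept).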
def sanitize_url(url)
# normalize the url
url = URI.parse(url) unless url.is_a?(URI)
url.path = "" if url.path == "/"
url.fragment = nil if @strip_fragments
url.query = nil if @strip_query
url
end
end
end

39
src/arachnid/arachnid.cr Normal file

@ -0,0 +1,39 @@
require "./page"
require "./agent"
module Arachnid
extend self
# Specifies whether robots.txt should be honored globally
class_property? robots : Bool = false
# Should we set the DNT (Do Not Track) header?
class_property? do_not_track : Bool = false
# Maximum amount of redirects to follow
class_property max_redirects : Int32 = 0
# Connect timeout.
class_property connect_timeout : Int32 = 10
# Read timeout.
class_property read_timeout : Int32 = 10
# The User-Agent string used by all Agent objects by default.
class_property user_agent : String = "Arachnid #{Arachnid::VERSION}"
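# These defaults can be overridden globally before spidering, e.g.:
#   Arachnid.user_agent = "My Crawler/1.0"
#   Arachnid.read_timeout = 30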
# See `Agent.start_at`
def start_at(url, **options, &block : Agent ->)
Agent.start_at(url, **options, &block)
end
# See `Agent.host`
def host(name, **options, &block : Agent ->)
Agent.host(name, **options, &block)
end
# See `Agent.site`
def site(url, **options, &block : Agent ->)
Agent.site(url, **options, &block)
end
end

4
src/arachnid/auth_credential.cr Normal file

@ -0,0 +1,4 @@
module Arachnid
# Represents HTTP Authentication credentials for a website.
record AuthCredential, username : String, password : String
end

83
src/arachnid/auth_store.cr Normal file

@ -0,0 +1,83 @@
require "base64"
require "./extensions/uri"
require "./auth_credential"
require "./page"
module Arachnid
class AuthStore
@credentials = {} of Tuple(String?, String?, Int32?) => Hash(Array(String), AuthCredential)
# Given a URL, return the most specific matching auth credential.
def [](url)
# normalize the url
url = URI.parse(url) unless url.is_a?(URI)
key = key_for(url)
paths = @credentials[key]?
return nil unless paths
# longest path first
ordered_paths = paths.keys.sort_by { |path_key| -path_key.size }
# directories of the path
path_dirs = URI.expand_path(url.path).split('/').reject(&.empty?)
ordered_paths.each do |path|
return paths[path] if path_dirs[0, path.size] == path
end
nil
end
# Add an auth credential to the store for the supplied base URL.
def []=(url, auth)
# normalize the url
url = URI.parse(url) unless url.is_a?(URI)
# normalize the url path and split it
paths = URI.expand_path(url.path).split('/').reject(&.empty?)
key = key_for(url)
@credentials[key] ||= {} of Array(String) => AuthCredential
@credentials[key][paths] = auth
auth
end
# Convenience method to add username and password credentials
# for a named URL.
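# e.g. auth_store.add("https://example.com/members/", "username", "password")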
def add(url, username, password)
self[url] = AuthCredential.new(username: username, password: password)
end
# Returns the base64 encoded authorization string for the URL
# or `nil` if no authorization exists.
def for_url(url)
if auth = self[url]
# Basic auth expects "username:password", without newlines in the encoding
Base64.strict_encode("#{auth.username}:#{auth.password}")
end
end
# Clear the contents of the auth store.
def clear!
@credentials.clear
self
end
# Size of the current auth store (number of URL paths stored)
def size
@credentials.values.reduce(0) { |acc, paths| acc + paths.size }
end
# Inspect the auth store
def inspect
"<#{self.class}: #{@credentials.inspect}>"
end
# Creates an auth key based on the URL
private def key_for(url)
{url.scheme, url.host, url.port}
end
end
end

118
src/arachnid/cookie_jar.cr Normal file

@ -0,0 +1,118 @@
module Arachnid
class CookieJar
include Enumerable(HTTP::Cookies)
@params : Hash(String, HTTP::Cookies)
@cookies : HTTP::Cookies
@dirty : Set(String)
# Creates a new `CookieJar`
def initialize
@params = {} of String => HTTP::Cookies
@cookies = HTTP::Cookies.new
@dirty = Set(String).new
end
# Iterates over the host-name and cookie value pairs in the jar.
def each(&block)
@params.each do |kp|
yield kp
end
end
# Returns all relevant cookies in a single string for the named
# host or domain.
def [](host : String)
@params[host]? || HTTP::Cookies.new
end
# Add a cookie to the jar for a particular domain.
def []=(host : String, cookies : HTTP::Cookies)
@params[host] ||= HTTP::Cookies.new
cookies.each do |cookie|
if @params[host][cookie.name]? != cookie.value
cookies.each do |c|
@params[host] << c
end
@dirty.add(host)
break
end
end
cookies
end
# Retrieve cookies for a domain from the response.
def from_page(page)
cookies = page.cookies
unless cookies.empty?
self[page.url.host.to_s] = cookies
return true
end
false
end
# Returns the pre-encoded Cookie for a given host.
def for_host(host)
if @dirty.includes?(host)
values = [] of String
cookies_for_host(host).each do |cookie|
values << cookie.to_cookie_header
end
@cookies[host] = values.join("; ")
@dirty.delete(host)
end
@cookies[host]?
end
# Returns raw cookie value pairs for a given host. Includes cookies
# set on parent domains.
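# e.g. cookies set for "example.com" are also returned for "app.example.com".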
def cookies_for_host(host)
host_cookies = @params[host]? || HTTP::Cookies.new
subdomains = host.split('.')
while subdomains.size > 2
subdomains.shift
if parent_cookies = @params[subdomains.join('.')]?
parent_cookies.each do |cookie|
# copy in the parent cookies, only if they haven't been
# overridden yet.
unless host_cookies.has_key?(cookie.name)
host_cookies[cookie.name] = cookie.value
end
end
end
end
host_cookies
end
# Clear out the jar, removing all stored cookies.
def clear!
@params.clear
@cookies.clear
@dirty.clear
self
end
# Size of the cookie jar.
def size
@params.size
end
# Inspects the cookie jar.
def inspect
"#<#{self.class}: #{@params.inspect}>"
end
end
end

196
src/arachnid/document/html.cr Normal file

@ -0,0 +1,196 @@
require "xml"
module Arachnid
module Document
struct HTML
@content : String
@document : XML::Node
@ids : Hash(String, XML::Node)
@tags : Hash(String, Array(Tag))
@classes : Hash(String, Array(XML::Node))
forward_missing_to @document
def initialize(@content : String)
@document = XML.parse_html(@content)
@ids = {} of String => XML::Node
@tags = {} of String => Array(Tag)
@classes = {} of String => Array(XML::Node)
visit @document
end
def self.parse(content : String)
new(content)
end
# Transform the css query into an xpath query
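# e.g. "#main .post > a" becomes //*[@id="main"]//*[@class="post"]/a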
def self.css_query_to_xpath(query : String) : String
query = "//#{query}"
# Convert '#id_name' as '[@id="id_name"]'
query = query.gsub /\#([A-z0-9]+-*_*)+/ { |m| "*[@id=\"%s\"]" % m.delete('#') }
# Convert '.classname' as '[@class="classname"]'
query = query.gsub /\.([A-z0-9]+-*_*)+/ { |m| "[@class=\"%s\"]" % m.delete('.') }
# Convert ' > ' as '/'
query = query.gsub /\s*>\s*/ { |m| "/" }
# Convert ' ' as '//'
query = query.gsub " ", "//"
# a leading '*' when xpath does not include node name
query = query.gsub /\/\[/ { |m| "/*[" }
return query
end
# Find first tag by tag name and return
# `HTML::Tag` if found or `nil` if not found
def at_tag(tag_name : String) : Tag | Nil
if tags = @tags[tag_name]?
tags.each do |tag|
return tag
end
end
return nil
end
# Find all nodes by tag name and yield
# `HTML::Tag` if found
def where_tag(tag_name : String, &block) : Array(Tag)
arr = [] of Tag
if tags = @tags[tag_name]?
tags.each do |tag|
yield tag
arr << tag
end
end
return arr
end
# Find all nodes by classname and yield
# `HTML::Tag` if found
def where_class(class_name : String, &block) : Array(Tag)
arr = [] of Tag
if klasses = @classes[class_name]?
klasses.each do |node|
klass = Tag.new(node)
yield klass
arr << klass
end
end
return arr
end
# Find a node by its id and return a
# `HTML::Tag` found or `nil` if not found
def at_id(id_name : String) : Tag | Nil
if node = @ids[id_name]?
return Tag.new(node)
end
end
# Find all nodes corresponding to the css query and yield
# each `HTML::Tag` found
def css(query : String) : Array(Tag)
query = HTML.css_query_to_xpath(query)
return @document.xpath_nodes(query).map { |node|
tag = Tag.new(node)
yield tag
tag
}
end
# Find first node corresponding to the css query and return
# `HTML::Tag` if found or `nil` if not found
def at_css(query : String)
css(query) { |tag| return tag }
return nil
end
private def add_id(id : String, node : XML::Node)
@ids[id] = node
end
private def add_node(node : XML::Node)
if @tags[node.name]? == nil
@tags[node.name] = [] of Tag
end
@tags[node.name] << Tag.new(node)
end
private def add_class(klass : String, node : XML::Node)
if @classes[klass]? == nil
@classes[klass] = [] of XML::Node
end
@classes[klass] << node
end
# Depth-first visit. Given a node, extract metadata from
# node (if exists), then visit each child.
private def visit(node : XML::Node)
# We only extract metadata from HTML nodes
if node.element?
add_node node
if to = node["id"]?
add_id to, node
end
if classes = node["class"]?
classes.split(' ') { |to| add_class to, node }
end
end
# visit each child
node.children.each do |child|
visit child
end
end
# Represents an HTML Tag
struct Tag
getter node : XML::Node
forward_missing_to @node
def initialize(@node : XML::Node)
end
def classname : String | Nil
return @node["class"]? ? @node["class"] : nil
end
def tagname : String
return @node.name
end
def content : String
return @node.text != nil ? @node.text.as(String) : "".as(String)
end
def parent : Tag | Nil
if parent = @node.parent
return Tag.new parent
end
nil
end
def children : Array(Tag)
children = [] of Tag
@node.children.each do |node|
if node.element?
children << Tag.new node
end
end
children
end
def has_class?(klass : String) : Bool
if classes = classname
return classes.includes?(klass)
end
false
end
end
end
end
end

175
src/arachnid/extensions/uri.cr Normal file

@ -0,0 +1,175 @@
require "uri"
require "string_scanner"
class URI
#
# Expands a URI decoded path, into a proper absolute path.
#
# @param [String] path
# The path from a URI.
#
# @return [String]
# The expanded path.
#
# @example
# URI.expand_path("./path")
# # => "path"
#
# @example
# URI.expand_path("test/../path")
# # => "path"
#
# @example
# URI.expand_path("/test/path/")
# # => "/test/path/"
#
# @example
# URI.expand_path("/test/../path")
# # => "/path"
#
def self.expand_path(path)
if path.starts_with?("/")
leading_slash, path = path[0, 1], path[1..-1]
else
leading_slash = ""
end
if path.ends_with?("/")
trailing_slash, path = path[-1, 1], path[0..-2]
else
trailing_slash = ""
end
scanner = StringScanner.new(path)
stack = [] of String
until scanner.eos?
if (dir = scanner.scan(/[^\/]+/))
case dir
when ".." then stack.pop
when "." then false
else stack.push(dir)
end
else
scanner.skip(/\/+/)
end
break if stack.empty?
end
unless stack.empty?
"#{leading_slash}#{stack.join("/")}#{trailing_slash}"
else
""
end
end
def split_path(path)
path.split("/")
end
def merge_path(base, rel)
# RFC2396, Section 5.2, 5)
# RFC2396, Section 5.2, 6)
base_path = split_path(base)
rel_path = split_path(rel)
# RFC2396, Section 5.2, 6), a)
base_path << "" if base_path.last == ".."
while i = base_path.index("..")
# remove the ".." and the segment before it
base_path.delete_at(i - 1, 2)
end
if (first = rel_path.first) && first.empty?
base_path.clear
rel_path.shift
end
# RFC2396, Section 5.2, 6), c)
# RFC2396, Section 5.2, 6), d)
rel_path.push("") if rel_path.last == "." || rel_path.last == ".."
rel_path.delete(".")
# RFC2396, Section 5.2, 6), e)
tmp = [] of String
rel_path.each do |x|
if x == ".." &&
!(tmp.empty? || tmp.last == "..")
tmp.pop
else
tmp << x
end
end
add_trailer_slash = !tmp.empty?
if base_path.empty?
base_path = [""] # keep '/' for root directory
elsif add_trailer_slash
base_path.pop
end
while x = tmp.shift?
if x == ".."
# RFC2396, Section 4
# a .. or . in an absolute path has no special meaning
base_path.pop if base_path.size > 1
else
# if x == ".."
# valid absolute (but abnormal) path "/../..."
# else
# valid absolute path
# end
base_path << x
tmp.each {|t| base_path << t}
add_trailer_slash = false
break
end
end
base_path.push("") if add_trailer_slash
return base_path.join('/')
end
def merge(oth)
oth = URI.parse(oth) unless oth.is_a?(URI)
if oth.absolute?
# raise BadURIError, "both URI are absolute" if absolute?
# hmm... should return oth for usability?
return oth
end
unless self.absolute?
raise URI::Error.new("both URIs are relative")
end
base = self.dup
authority = oth.userinfo || oth.host || oth.port
# RFC2396, Section 5.2, 2)
if (oth.path.nil? || oth.path.empty?) && !authority && !oth.query
base.fragment=(oth.fragment) if oth.fragment
return base
end
base.query = nil
base.fragment=(nil)
# RFC2396, Section 5.2, 4)
if !authority
base.path = merge_path(base.path, oth.path) if base.path && oth.path
else
# RFC2396, Section 5.2, 4)
base.path = oth.path if oth.path
end
# RFC2396, Section 5.2, 7)
base.user = oth.userinfo if oth.userinfo
base.host = oth.host if oth.host
base.port = oth.port if oth.port
base.query = oth.query if oth.query
base.fragment=(oth.fragment) if oth.fragment
return base
end
end

97
src/arachnid/page.cr Normal file

@ -0,0 +1,97 @@
require "uri"
require "halite"
require "./page/content_types"
require "./page/cookies"
require "./page/html"
require "./page/status_codes"
require "./document/html"
module Arachnid
# Represents a page requested from a website
class Page
include Page::ContentTypes
include Page::Cookies
include Page::HTML
include Page::StatusCodes
# URL of the page
getter url : URI
# HTTP response
getter response : Halite::Response
# Headers returned with the body
getter headers : HTTP::Headers
@doc : (Document::HTML | XML::Node)?
delegate xpath, xpath_node, xpath_nodes, xpath_bool, xpath_float, xpath_string,
root, at_tag, where_tag, where_class, at_id, css, at_css, to: @doc
forward_missing_to @headers
# Creates a new `Page` object.
def initialize(url : URI, response : Halite::Response)
@url = url
@response = response
@headers = response.headers
end
# The body of the response
def body
@response.body || ""
end
# Returns a parsed document for HTML, XML, RSS, and Atom pages.
def doc
unless body.empty?
doc_class = if html?
Document::HTML
elsif rss? || atom? || xml? || xsl?
XML
end
if doc_class
begin
@doc ||= doc_class.parse(body)
rescue
end
end
end
end
# Searches the document for XPath or CSS paths
def search(path)
if document = doc
document.xpath_nodes(path)
else
[] of XML::Node
end
end
# Searches for the first occurrence of an XPath or CSS path
def at(path)
if document = doc
document.xpath_node(path)
end
end
def /(path)
search(path)
end
def %(path)
at(path)
end
def size
@response.body.bytesize
end
def to_s
body
end
end
end

162
src/arachnid/page/content_types.cr Normal file

@ -0,0 +1,162 @@
module Arachnid
class Page
module ContentTypes
# The Content-Type of the page.
def content_type
@response.content_type || ""
end
# The content types of the page.
def content_types
types = @response.headers.get?("content-type") || [] of String
end
# The charset included in the Content-Type.
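# e.g. returns "utf-8" for "Content-Type: text/html; charset=utf-8".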
def content_charset
content_types.each do |value|
if value.includes?(";")
value.split(";").each do |param|
param = param.strip
if param.starts_with?("charset=")
return param.split("=", 2).last
end
end
end
end
return nil
end
# Determines if any of the content-types of the page include a given
# type.
def is_content_type?(type : String | Regex)
content_types.any? do |value|
value = value.split(";", 2).first
if type.is_a?(Regex)
value =~ type
else
value == type
end
end
end
# Determines if the page is plain-text.
def plain_text?
is_content_type?("text/plain")
end
# ditto
def text?
plain_text?
end
# Determines if the page is a Directory Listing.
def directory?
is_content_type?("text/directory")
end
# Determines if the page is HTML document.
def html?
is_content_type?("text/html")
end
# Determines if the page is XML document.
def xml?
is_content_type?(/(text|application)\/xml/)
end
# Determines if the page is XML Stylesheet (XSL).
def xsl?
is_content_type?("text/xsl")
end
# Determines if the page is JavaScript.
def javascript?
is_content_type?(/(text|application)\/javascript/)
end
# Determines if the page is JSON.
def json?
is_content_type?("application/json")
end
# Determines if the page is a CSS stylesheet.
def css?
is_content_type?("text/css")
end
# Determines if the page is a RSS feed.
def rss?
is_content_type?(/application\/(rss\+xml|rdf\+xml)/)
end
# Determines if the page is an Atom feed.
def atom?
is_content_type?("application/atom+xml")
end
# Determines if the page is a MS Word document.
def ms_word?
is_content_type?("application/msword")
end
# Determines if the page is a PDF document.
def pdf?
is_content_type?("application/pdf")
end
# Determines if the page is a ZIP archive.
def zip?
is_content_type?("application/zip")
end
# Determine if the page is an image.
def image?
is_content_type?(/image\//)
end
def png?
is_content_type?("image/png")
end
def gif?
is_content_type?("image/gif")
end
def jpg?
is_content_type?(/image\/(jpg|jpeg)/)
end
def svg?
is_content_type?(/image\/svg(\+xml)?/)
end
def video?
is_content_type?(/video\/.*/)
end
def mp4?
is_content_type?("video/mp4")
end
def avi?
is_content_type?("video/x-msvideo")
end
def wmv?
is_content_type?("video/x-ms-wmv")
end
def quicktime?
is_content_type?("video/quicktime")
end
def flash?
is_content_type?("video/flash") ||
is_content_type?("application/x-shockwave-flash")
end
end
end
end

18
src/arachnid/page/cookies.cr Normal file

@ -0,0 +1,18 @@
module Arachnid
class Page
module Cookies
# Reserved names used within Cookie strings
RESERVED_COOKIE_NAMES = Regex.new("^(?:Path|Expires|Domain|Secure|HTTPOnly)$", :ignore_case)
# The raw Cookie String sent along with the page.
def cookie
@response.headers["Set-Cookie"]? || ""
end
# The Cookie values sent along with the page.
def cookies
@response.cookies
end
end
end
end

204
src/arachnid/page/html.cr Normal file

@ -0,0 +1,204 @@
require "../extensions/uri"
module Arachnid
class Page
# TODO: Create enumerable methods for the methods that take a block
module HTML
# include Enumerable
# The title of the HTML page.
def title
if (node = at("//title"))
node.inner_text
end
end
# Enumerates over the meta-redirect links in the page.
def each_meta_redirect(&block : URI ->)
if (html? && doc)
search("//meta[@http-equiv and @content]").each do |node|
if node["http-equiv"] =~ /refresh/i
content = node["content"]
if (redirect = content.match(/url=(\S+)$/))
yield URI.parse(redirect[1])
end
end
end
end
end
# Returns a boolean indicating whether or not page-level meta
# redirects are present in this page.
def meta_redirect?
!meta_redirects.empty?
end
# The meta-redirect links of the page.
def meta_redirects
redirects = [] of URI
each_meta_redirect { |r| redirects << r }
redirects
end
# Enumerates over every HTTP or meta-redirect link in the page.
def each_redirect(&block : URI ->)
if (locations = @response.headers.get?("Location"))
# Location headers override any meta-refresh redirects in the HTML
locations.each { |l| yield URI.parse(l) }
else
# check page-level meta redirects if there isn't a location header
each_meta_redirect(&block)
end
end
# URLs that this document redirects to.
def redirects_to
redirects = [] of URI
each_redirect { |r| redirects << r }
redirects
end
# Enumerates over every `mailto:` link in the page.
def each_mailto(&block)
if (html? && doc)
doc.xpath_nodes("//a[starts-with(@href,'mailto:')]").each do |a|
yield a["href"][7..-1]
end
end
end
# `mailto:` links in the page.
def mailtos
mailtos = [] of String
each_mailto { |m| mailtos << m }
mailtos
end
# Enumerates over every link in the page.
def each_link(&block : URI ->)
each_redirect(&block) if redirect?
each_image(&block)
each_script(&block)
each_resource(&block)
if html? && (d = doc)
d.xpath_nodes("//a[@href]").each do |a|
link = to_absolute(a["href"])
yield link if link
end
d.xpath_nodes("//frame[@src]").each do |iframe|
link = to_absolute(iframe["src"])
yield link if link
end
d.xpath_nodes("//iframe[@src]").each do |iframe|
link = to_absolute(iframe["src"])
yield link if link
end
end
end
def each_script(&block : URI ->)
if html? && (d = doc)
d.xpath_nodes("//script[@src]").each do |script|
url = to_absolute(script["src"])
yield url if url
end
end
end
def each_resource(&block : URI ->)
if html? && (d = doc)
d.xpath_nodes("//link[@href]").each do |link|
yield URI.parse(link["href"])
end
end
end
def each_image(&block : URI ->)
if html? && (d = doc)
d.xpath_nodes("//img[@src]").each do |img|
url = to_absolute(img["src"])
yield url if url
end
d.xpath_nodes("//img[@srcset]").each do |set|
sources = set["srcset"].split(" ").map_with_index { |e, i| (i.zero? || i.even?) ? e : nil }.compact
sources.each do |source|
url = to_absolute(source)
yield url if url
end
end
end
end
def each_video(&block : URI ->)
if html? && (d = doc)
d.xpath_nodes("//video[@src]").each do |video|
url = to_absolute(video["src"])
yield url if url
end
d.xpath_nodes("//video/source[@src]").each do |source|
url = to_absolute(source["src"])
yield url if url
end
end
end
# The links from within the page.
def links
links = [] of URI
each_link { |link| links << link }
links
end
# Enumerates over every URL in the page.
def each_url(&block : URI ->)
each_link do |link|
if (url = to_absolute(link))
yield url
end
end
end
# ditto
def each(&block)
each_url { |url| yield url }
end
# Absolute URIs from within the page.
def urls
urls = [] of URI
each_url { |url| urls << url }
urls
end
# Normalizes and expands a given link into a proper URI.
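# e.g. "/about" on a page at "https://example.com/blog/post" resolves to
# "https://example.com/about".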
def to_absolute(link)
link = link.is_a?(URI) ? link : URI.parse(link)
new_url = begin
url.merge(link)
rescue Exception
return
end
if (!new_url.opaque?) && (path = new_url.path)
# ensure that paths begin with a leading '/' for URI::FTP
if (new_url.scheme == "ftp" && !path.starts_with?("/"))
path.insert(0, "/")
end
# make sure the path does not contain any .. or . directories,
# since URI::Generic#merge cannot normalize paths such as
# "/stuff/../"
new_url.path = URI.expand_path(path)
end
return new_url
end
end
end
end

59
src/arachnid/page/status_codes.cr Normal file

@ -0,0 +1,59 @@
module Arachnid
class Page
module StatusCodes
# The response code from the page.
def code
@response.status_code.to_i
end
# Determines if the response code is `200`.
def ok?
code == 200
end
# Determines if the response code is `308`.
def timedout?
code == 308
end
# Determines if the response code is `400`.
def bad_request?
code == 400
end
# Determines if the response code is `401`.
def unauthorized?
code == 401
end
# Determines if the response code is `403`.
def forbidden?
code == 403
end
# Determines if the response code is `404`.
def missing?
code == 404
end
# Determines if the response code is `500`.
def had_internal_server_error?
code == 500
end
# Determines if the response code is `300`, `301`, `302`, `303`
# or `307`. Also checks for "soft" redirects added at the page
# level by a meta refresh tag.
def redirect?
case code
when 300..303, 307
true
when 200
meta_redirect?
else
false
end
end
end
end
end

231
src/arachnid/robots.cr Normal file

@ -0,0 +1,231 @@
require "uri"
module Arachnid
# Parses robots.txt files for the perusal of a single user-agent.
#
# The behaviour implemented is guided by the following sources, though
# as there is no widely accepted standard, it may differ from other implementations.
# If you consider its behaviour to be in error, please contact the author.
#
# http://www.robotstxt.org/orig.html
# - the original, now imprecise and outdated version
# http://www.robotstxt.org/norobots-rfc.txt
# - a much more precise, outdated version
# http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449&from=35237
# - a few hints at modern protocol extensions.
#
# This parser only considers lines starting with (case-insensitively:)
# Useragent: User-agent: Allow: Disallow: Sitemap:
#
# The file is divided into sections, each of which contains one or more User-agent:
# lines, followed by one or more Allow: or Disallow: rules.
#
# The first section that contains a User-agent: line that matches the robot's
# user-agent is the only section relevant to that robot. The sections are checked
# in the same order as they appear in the file.
#
# (The * character is taken to mean "any number of any characters" during matching of
# user-agents)
#
# Within that section, the first Allow: or Disallow: rule that matches the expression
# is taken as authoritative. If no rule in a section matches, the access is Allowed.
#
# (The order of matching is as in the RFC, Google matches all Allows and then all Disallows,
# while Bing matches the most specific rule, I'm sure there are other interpretations)
#
# When matching urls, all % encodings are normalised (except for /?=& which have meaning)
# and "*"s match any number of any character.
#
# If a pattern ends with a $, then the pattern must match the entire path, or the entire
# path with query string.
#
# TODO: Rework to allow for multiple Robots
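# Example usage with a hypothetical robots.txt body:
#
#   body = "User-agent: *\nDisallow: /secret/"
#   robots = Arachnid::Robots.new(body, "Arachnid")
#   robots.allowed?("https://example.com/secret/page") # => false
#   robots.allowed?("https://example.com/")            # => true
#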
class Robots
alias Rule = Tuple(String, Bool)
alias RuleSet = Tuple(String, Array(Rule))
getter body : String
getter user_agent : String
getter rules : Array(Tuple(String, Array(Rule)))
getter sitemaps : Array(String)
def initialize(@body : String, @user_agent : String)
@sitemaps = [] of String
@rules = [] of RuleSet
parse(@body)
end
# Given a URI object, or a string representing one, determine whether this
# robots.txt would allow access to the path.
def allowed?(uri)
uri = URI.parse(uri)
path = (uri.path || "/") + (uri.query ? "?" + uri.query.to_s : "")
path_allowed?(@user_agent, path)
end
# Check whether the relative path (a string of the url's path and query
# string) is allowed by the rules we have for the given user_agent.
#
private def path_allowed?(user_agent, path)
@rules.each do |(ua_glob, path_globs)|
if match_ua_glob user_agent, ua_glob
path_globs.each do |(path_glob, allowed)|
return allowed if match_path_glob path, path_glob
end
return true
end
end
true
end
# This does a case-insensitive substring match such that if the user agent
# is contained within the glob, or vice-versa, we will match.
#
# According to the standard, *s shouldn't appear in the user-agent field
# except in the case of "*" meaning all user agents. Google however imply
# that the * will work, at least at the end of a string.
#
# For consistency, and because it seems expected behaviour, and because
# a glob * will match a literal * we use glob matching not string matching.
#
# The standard also advocates a substring match of the robot's user-agent
# within the user-agent field. From observation, it seems much more likely
# that the match will be the other way about, though we check for both.
#
private def match_ua_glob(user_agent, glob)
glob =~ Regex.new(Regex.escape(user_agent), Regex::Options::IGNORE_CASE) ||
user_agent =~ Regex.new(reify(glob), Regex::Options::IGNORE_CASE)
end
# This does case-sensitive prefix matching, such that if the path starts
# with the glob, we will match.
#
# According to the standard, that's it. However, it seems reasonably common
# for asterisks to be interpreted as though they were globs.
#
# Additionally, some search engines, like Google, will treat a trailing $
# sign as forcing the glob to match the entire path - whether including
# or excluding the query string is not clear, so we check both.
#
# (i.e. it seems likely that a site owner who writes Disallow: *.pdf$ also expects
# requests to *.pdf?i_can_haz_pdf to be disallowed, a reading a malicious robot
# might otherwise exploit.)
#
# With URLs there is the additional complication that %-encoding can give
# multiple representations for identical URLs; this is handled by
# normalize_percent_encoding.
#
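# For illustration (paths made up):
#
# ```
# match_path_glob("/secret/page", "/secret/")      # matches (prefix)
# match_path_glob("/docs/file.pdf", "*.pdf$")      # matches
# match_path_glob("/docs/file.pdf?dl=1", "*.pdf$") # matches ($ also allows a query string)
# match_path_glob("/docs/file.pdfx", "*.pdf$")     # does not match
# ```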
private def match_path_glob(path, glob)
if glob =~ /\$$/
end_marker = "(?:\\?|$)"
glob = glob.gsub /\$$/, ""
else
end_marker = ""
end
glob = normalize_percent_encoding(glob)
path = normalize_percent_encoding(path)
path =~ Regex.new("^" + reify(glob) + end_marker)
rescue e
false
end
# As a general rule, we want to ignore different representations of the
# same URL. Naively we could just unescape, or escape, everything; however,
# the standard implies that a / is an HTTP path separator, while a %2F is an
# encoded / that does not act as a path separator. Similar issues arise with ?, &
# and =, though all other characters are fine. (While : also has a special
# meaning in HTTP, most implementations ignore this in the path)
#
# It's also worth noting that %-encoding is case-insensitive, so we
# explicitly upcase the few that we want to keep.
#
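# For illustration (path made up):
#
# ```
# normalize_percent_encoding("/a%2Fb/%63") # => "/a%2Fb/c" (%2F kept, %63 decoded to "c")
# ```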
private def normalize_percent_encoding(path)
# First double-escape any characters we don't want to unescape
# & / = ?
path = path.gsub(/%(26|2F|3D|3F)/i) do |_, match|
"%25#{match[1].upcase}"
end
URI.unescape(path)
end
# Convert the asterisks in a glob into (.*)s for regular expressions,
# and at the same time, escape any other characters that would have
# a significance in a regex.
#
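# For illustration:
#
# ```
# reify("*.pdf") # => ".*\\.pdf"
# ```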
private def reify(glob)
glob.split("*").map { |part| Regex.escape(part) }.join(".*")
end
# Convert the @body into a set of @rules so that the matching logic
# becomes simpler.
#
# @rules is an array of pairs. The first in the pair is the glob for the
# user-agent and the second another array of pairs. The first of the new
# pair is a glob for the path, and the second whether it appears in an
# Allow: or a Disallow: rule.
#
# For example:
#
# User-agent: *
# Disallow: /secret/
# Allow: / # allow everything...
#
# Would be parsed so that:
#
# @rules = [{"*", [{"/secret/", false}, {"/", true}]}]
#
# The order of the arrays is maintained so that the first match in the file
# is obeyed as indicated by the pseudo-RFC on http://robotstxt.org/. There
# are alternative interpretations: some match by specificity of glob, and
# some check Allow lines for any match before Disallow lines. All are
# justifiable, but we could only pick one.
#
# Note that a blank Disallow: should be treated as an Allow: * and multiple
# user-agents may share the same set of rules.
#
private def parse(body)
parser_mode = :begin
body.split(/[\r\n]+/).each do |line|
# Skip blank lines and lines without a ":" (e.g. bare comments)
parts = line.delete('\0').split(":", 2).map(&.strip)
next unless parts.size == 2
prefix, value = parts
value = value.sub(/\s+#.*/, "")
if prefix && value
case prefix.downcase
when /^user-?agent$/
if parser_mode == :user_agent
@rules << {value, rules.last[1]}
else
parser_mode = :user_agent
@rules << {value, [] of Rule}
end
when "disallow"
parser_mode = :rules
@rules << {"*", [] of Rule} if @rules.empty?
if value == ""
@rules.last[1] << {"*", true}
else
@rules.last[1] << {value, false}
end
when "allow"
parser_mode = :rules
@rules << {"*", [] of Rule} if @rules.empty?
@rules.last[1] << {value, true}
when "sitemap"
@sitemaps << value
else
# Ignore comments, Crawl-delay: and badly formed lines.
end
end
end
end
end
end

53
src/arachnid/rules.cr Normal file
View File

@ -0,0 +1,53 @@
module Arachnid
# The `Rules` class represents collections of acceptance and rejection
# rules, which are used to filter data.
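#
# A minimal sketch of typical use (the URL strings are made up for
# illustration):
#
# ```
# reject = [/\/api\//, "https://example.com/private"] of Proc(String | Nil, Bool) | String | Regex
# rules = Rules(String).new(reject: reject)
# rules.accept?("https://example.com/blog/post")  # => true
# rules.reject?("https://example.com/api/v1/foo") # => true
# ```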
class Rules(T)
# Accept rules
getter accept : Array(Proc(T | Nil, Bool) | T | Regex | String)
# Reject rules
getter reject : Array(Proc(T | Nil, Bool) | T | Regex | String)
# Creates a new `Rules` object.
def initialize(accept : Array(Proc(T | Nil, Bool) | T | Regex | String)? = nil, reject : Array(Proc(T | Nil, Bool) | T | Regex | String)? = nil)
@accept = accept ? accept : [] of Proc(T | Nil, Bool) | T | Regex | String
@reject = reject ? reject : [] of Proc(T | Nil, Bool) | T | Regex | String
end
# Determines whether the data should be accepted or rejected.
def accept?(data : T)
return true if accept.empty? && reject.empty?
if @accept.empty?
!@reject.any? { |rule| test_data(data, rule) }
else
@accept.any? { |rule| test_data(data, rule) }
end
end
def accept=(value)
@accept = value || [] of Proc(T | Nil, Bool) | T | Regex | String
end
# Determines whether the data should be rejected or accepted.
def reject?(data : T)
!accept?(data)
end
def reject=(value)
@reject = value || [] of Proc(T | Nil, Bool) | T | Regex | String
end
# Tests the given data against a pattern.
private def test_data(data : T, rule)
case rule
when Proc
rule.call(data) == true
when Regex
!((data.to_s =~ rule).nil?)
else
data == rule
end
end
end
end

112
src/arachnid/session_cache.cr Normal file
View File

@ -0,0 +1,112 @@
require "uri"
require "halite"
module Arachnid
# Stores active HTTP sessions organized by scheme, host name, and port.
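#
# A rough usage sketch (URLs made up for illustration; the timeouts are just
# example values):
#
# ```
# cache = SessionCache.new(read_timeout: 30, connect_timeout: 10)
# client = cache["https://example.com/some/page"] # Halite::Client scoped to https://example.com
# cache.active?("https://example.com/elsewhere")  # => true (same scheme, host and port)
# cache.kill!("https://example.com")              # drop the cached session
# ```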
class SessionCache
# Optional read timeout.
property read_timeout : Int32
# Optional connect timeout.
property connect_timeout : Int32
# Max redirects to follow.
property max_redirects : Int32?
# Should we set a DNT (Do Not Track) header?
property? do_not_track : Bool
@sessions = {} of Tuple(String?, String?, Int32?) => Halite::Client
# Create a new session cache
def initialize(
read_timeout : Int32? = nil,
connect_timeout : Int32? = nil,
follow_redirects : Bool? = nil,
max_redirects : Int32? = nil,
do_not_track : Bool? = nil
)
@read_timeout = read_timeout || Arachnid.read_timeout
@connect_timeout = connect_timeout || Arachnid.connect_timeout
@max_redirects = max_redirects || Arachnid.max_redirects
@do_not_track = do_not_track || Arachnid.do_not_track?
end
# Determines if there is an active session for the given URL
def active?(url)
# normalize the url
url = URI.parse(url) unless url.is_a?(URI)
# session key
key = key_for(url)
@sessions.has_key?(key)
end
# Provides an active session for a given URL.
def [](url)
# normalize the url
url = URI.parse(url) unless url.is_a?(URI)
# session key
key = key_for(url)
# normalize the endpoint
endpoint = url.dup
endpoint.scheme ||= "http"
endpoint.query = nil
endpoint.fragment = nil
endpoint.path = ""
# Set headers
headers = {
"DNT" => @do_not_track ? 1 : 0
}
unless @sessions.has_key?(key)
session = Halite::Client.new(
endpoint: endpoint,
timeout: Halite::Timeout.new(
connect: @connect_timeout,
read: @read_timeout
),
follow: Halite::Follow.new(
hops: @max_redirects,
strict: false
),
headers: headers,
)
# session = session.logging(skip_request_body: true, skip_response_body: true)
@sessions[key] = session
end
@sessions[key]
end
# Destroys an HTTP session for the given scheme, host, and port.
def kill!(url)
# normalize the url
url = URI.parse(url) unless url.is_a?(URI)
# session key
key = key_for(url)
@sessions.delete(key)
end
# Clears the session cache
def clear
@sessions.clear
end
# Creates a session key based on the URL
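#
# For example (URL purely illustrative):
#
# ```
# key_for(URI.parse("https://example.com:8080/path")) # => {"https", "example.com", 8080}
# ```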
private def key_for(url)
{url.scheme, url.host, url.port}
end
end
end

3
src/arachnid/version.cr Normal file
View File

@ -0,0 +1,3 @@
module Arachnid
VERSION = "0.1.0"
end