Initial commit
This commit is contained in:
commit
9b82f6b48a
|
@ -0,0 +1,9 @@
|
||||||
|
root = true
|
||||||
|
|
||||||
|
[*.cr]
|
||||||
|
charset = utf-8
|
||||||
|
end_of_line = lf
|
||||||
|
insert_final_newline = true
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 2
|
||||||
|
trim_trailing_whitespace = true
|
|
@ -0,0 +1,9 @@
|
||||||
|
/docs/
|
||||||
|
/lib/
|
||||||
|
/bin/
|
||||||
|
/.shards/
|
||||||
|
*.dwarf
|
||||||
|
|
||||||
|
# Libraries don't need dependency lock
|
||||||
|
# Dependencies will be locked in applications that use them
|
||||||
|
/shard.lock
|
|
@ -0,0 +1,6 @@
|
||||||
|
language: crystal
|
||||||
|
|
||||||
|
# Uncomment the following if you'd like Travis to run specs and check code formatting
|
||||||
|
# script:
|
||||||
|
# - crystal spec
|
||||||
|
# - crystal tool format --check
|
|
@ -0,0 +1,21 @@
|
||||||
|
The MIT License (MIT)
|
||||||
|
|
||||||
|
Copyright (c) 2019 Chris Watson
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
|
@ -0,0 +1,95 @@
|
||||||
|
# Arachnid
|
||||||
|
|
||||||
|
Arachnid is a fast and powerful web scraping framework for Crystal. It provides an easy to use DSL for scraping webpages and processing all of the things you might come across.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
1. Add the dependency to your `shard.yml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
dependencies:
|
||||||
|
arachnid:
|
||||||
|
github: watzon/arachnid
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Run `shards install`
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
Arachnid provides an easy to use, powerful DSL for scraping websites.
|
||||||
|
|
||||||
|
```crystal
|
||||||
|
require "arachnid"
|
||||||
|
require "json"
|
||||||
|
|
||||||
|
# Let's build a sitemap of crystal-lang.org
|
||||||
|
# Links will be a hash of url to page title
|
||||||
|
links = {} of String => String
|
||||||
|
|
||||||
|
# Visit a particular host, in this case `crystal-lang.org`. This will
|
||||||
|
# not match on subdomains.
|
||||||
|
Arachnid.host("https://crystal-lang.org") do |spider|
|
||||||
|
# Ignore the API section. It's a little big.
|
||||||
|
spider.ignore_urls_like(/.*\/api.*/)
|
||||||
|
|
||||||
|
spider.every_page do |page|
|
||||||
|
puts "Visiting #{page.url.to_s}"
|
||||||
|
|
||||||
|
# Ignore redirects for our sitemap
|
||||||
|
unless page.redirect?
|
||||||
|
# Add the url of every visited page to our sitemap
|
||||||
|
links[page.url.to_s] = page.title.to_s.strip
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
File.write("crystal-lang.org-sitemap.json", links.to_pretty_json)
|
||||||
|
```
|
||||||
|
|
||||||
|
Want to scan external links as well?
|
||||||
|
|
||||||
|
```crystal
|
||||||
|
# To make things interesting, this time let's download
|
||||||
|
# every image we find.
|
||||||
|
Arachnid.start_at("https://crystal-lang.org") do |spider|
|
||||||
|
# Set a base path to store all the images at
|
||||||
|
base_image_dir = File.expand_path("~/Pictures/arachnid")
|
||||||
|
Dir.mkdir_p(base_image_dir)
|
||||||
|
|
||||||
|
spider.every_page do |page|
|
||||||
|
puts "Scanning #{page.url.to_s}"
|
||||||
|
|
||||||
|
if page.image?
|
||||||
|
# Since we're going to be saving a lot of images
|
||||||
|
# let's spawn a new fiber for each one. This
|
||||||
|
# makes things so much faster.
|
||||||
|
spawn do
|
||||||
|
# Output directory for images for this host
|
||||||
|
directory = File.join(base_image_dir, page.url.host.to_s)
|
||||||
|
Dir.mkdir_p(directory)
|
||||||
|
|
||||||
|
# The name of the image
|
||||||
|
filename = File.basename(page.url.path)
|
||||||
|
|
||||||
|
# Save the image using the body of the page
|
||||||
|
puts "Saving #{filename} to #{directory}"
|
||||||
|
File.write(File.join(directory, filename), page.body)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
More documentation will be coming soon!
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
1. Fork it (<https://github.com/watzon/arachnid/fork>)
|
||||||
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
||||||
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
||||||
|
4. Push to the branch (`git push origin my-new-feature`)
|
||||||
|
5. Create a new Pull Request
|
||||||
|
|
||||||
|
## Contributors
|
||||||
|
|
||||||
|
- [Chris Watson](https://github.com/watzon) - creator and maintainer
|
|
@ -0,0 +1,17 @@
|
||||||
|
name: arachnid
|
||||||
|
version: 0.1.0
|
||||||
|
|
||||||
|
authors:
|
||||||
|
- Chris Watson <chris@watzon.me>
|
||||||
|
|
||||||
|
dependencies:
|
||||||
|
halite:
|
||||||
|
github: icyleaf/halite
|
||||||
|
version: ~> 0.10.1
|
||||||
|
crystagiri:
|
||||||
|
github: madeindjs/crystagiri
|
||||||
|
branch: master
|
||||||
|
|
||||||
|
crystal: 0.29.0
|
||||||
|
|
||||||
|
license: MIT
|
|
@ -0,0 +1,9 @@
|
||||||
|
require "./spec_helper"
|
||||||
|
|
||||||
|
describe Arachnid do
|
||||||
|
# TODO: Write tests
|
||||||
|
|
||||||
|
it "works" do
|
||||||
|
false.should eq(true)
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,2 @@
|
||||||
|
require "spec"
|
||||||
|
require "../src/arachnid"
|
|
@ -0,0 +1,32 @@
|
||||||
|
require "./arachnid/version"
|
||||||
|
require "./arachnid/arachnid"
|
||||||
|
|
||||||
|
# To make things interesting, this time let's download
|
||||||
|
# every image we find.
|
||||||
|
Arachnid.start_at("https://crystal-lang.org") do |spider|
|
||||||
|
# Set a base path to store all the images at
|
||||||
|
base_image_dir = File.expand_path("~/Pictures/arachnid")
|
||||||
|
Dir.mkdir_p(base_image_dir)
|
||||||
|
|
||||||
|
spider.every_page do |page|
|
||||||
|
puts "Scanning #{page.url.to_s}"
|
||||||
|
|
||||||
|
if page.image?
|
||||||
|
# Since we're going to be saving a lot of images
|
||||||
|
# let's spawn a new fiber for each one. This
|
||||||
|
# makes things so much faster.
|
||||||
|
spawn do
|
||||||
|
# Output directory for images for this host
|
||||||
|
directory = File.join(base_image_dir, page.url.host.to_s)
|
||||||
|
Dir.mkdir_p(directory)
|
||||||
|
|
||||||
|
# The name of the image
|
||||||
|
filename = File.basename(page.url.path)
|
||||||
|
|
||||||
|
# Save the image using the body of the page
|
||||||
|
puts "Saving #{filename} to #{directory}"
|
||||||
|
File.write(File.join(directory, filename), page.body)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,543 @@
|
||||||
|
require "./agent/sanitizers"
|
||||||
|
require "./agent/filters"
|
||||||
|
require "./agent/events"
|
||||||
|
require "./agent/actions"
|
||||||
|
require "./agent/robots"
|
||||||
|
require "./page"
|
||||||
|
require "./session_cache"
|
||||||
|
require "./cookie_jar"
|
||||||
|
require "./auth_store"
|
||||||
|
|
||||||
|
module Arachnid
|
||||||
|
class Agent
|
||||||
|
|
||||||
|
getter? running : Bool
|
||||||
|
|
||||||
|
# Set to limit to a single host.
|
||||||
|
property host : String?
|
||||||
|
|
||||||
|
# User agent to use.
|
||||||
|
property user_agent : String
|
||||||
|
|
||||||
|
# HTTP Host Header to use.
|
||||||
|
property host_header : String?
|
||||||
|
|
||||||
|
# HTTP Host Headers to use for specific hosts.
|
||||||
|
property host_headers : Hash(String | Regex, String)
|
||||||
|
|
||||||
|
# HTTP Headers to use for every request.
|
||||||
|
property default_headers : Hash(String, String)
|
||||||
|
|
||||||
|
# HTTP Authentication credentials.
|
||||||
|
property authorized : AuthStore
|
||||||
|
|
||||||
|
# Referer to use.
|
||||||
|
property referer : String?
|
||||||
|
|
||||||
|
# Delay in between fetching pages.
|
||||||
|
property fetch_delay : Time::Span | Int32
|
||||||
|
|
||||||
|
# History containing visited URLs.
|
||||||
|
getter history : Set(URI)
|
||||||
|
|
||||||
|
# List of unreachable URIs.
|
||||||
|
getter failures : Set(URI)
|
||||||
|
|
||||||
|
# Queue of URLs to visit.
|
||||||
|
getter queue : Array(URI)
|
||||||
|
|
||||||
|
# The session cache.
|
||||||
|
property sessions : SessionCache
|
||||||
|
|
||||||
|
# Cached cookies.
|
||||||
|
property cookies : CookieJar
|
||||||
|
|
||||||
|
# Maximum number of pages to visit.
|
||||||
|
property limit : Int32?
|
||||||
|
|
||||||
|
# Maximum depth.
|
||||||
|
property max_depth : Int32?
|
||||||
|
|
||||||
|
# The visited URLs and their depth within a site.
|
||||||
|
property levels : Hash(URI, Int32)
|
||||||
|
|
||||||
|
# Creates a new `Agent` object.
|
||||||
|
def initialize(
|
||||||
|
host : String? = nil,
|
||||||
|
read_timeout : Int32? = nil,
|
||||||
|
connect_timeout : Int32? = nil,
|
||||||
|
follow_redirects : Bool? = nil,
|
||||||
|
max_redirects : Int32? = nil,
|
||||||
|
do_not_track : Bool? = nil,
|
||||||
|
default_headers : Hash(String, String)? = nil,
|
||||||
|
host_header : String? = nil,
|
||||||
|
host_headers : Hash(String | Regex, String)? = nil,
|
||||||
|
user_agent : String? = nil,
|
||||||
|
referer : String? = nil,
|
||||||
|
fetch_delay : (Int32 | Time::Span)? = nil,
|
||||||
|
queue : Set(URI)? = nil,
|
||||||
|
history : Set(URI)? = nil,
|
||||||
|
limit : Int32? = nil,
|
||||||
|
max_depth : Int32? = nil,
|
||||||
|
robots : Bool? = nil,
|
||||||
|
filter_options = nil
|
||||||
|
)
|
||||||
|
@host = host
|
||||||
|
|
||||||
|
@host_header = host_header
|
||||||
|
@host_headers = host_headers || {} of (Regex | String) => String
|
||||||
|
@default_headers = default_headers || {} of String => String
|
||||||
|
|
||||||
|
@user_agent = user_agent || Arachnid.user_agent
|
||||||
|
@referer = referer
|
||||||
|
|
||||||
|
@running = false
|
||||||
|
@fetch_delay = fetch_delay || 0
|
||||||
|
@history = history || Set(URI).new
|
||||||
|
@failures = Set(URI).new
|
||||||
|
@queue = queue || [] of URI
|
||||||
|
|
||||||
|
@limit = limit
|
||||||
|
@levels = {} of URI => Int32
|
||||||
|
@max_depth = max_depth
|
||||||
|
|
||||||
|
@sessions = SessionCache.new(
|
||||||
|
read_timeout,
|
||||||
|
connect_timeout,
|
||||||
|
follow_redirects,
|
||||||
|
max_redirects,
|
||||||
|
do_not_track
|
||||||
|
)
|
||||||
|
|
||||||
|
@cookies = CookieJar.new
|
||||||
|
@authorized = AuthStore.new
|
||||||
|
|
||||||
|
if filter_options
|
||||||
|
initialize_filters(**filter_options)
|
||||||
|
else
|
||||||
|
initialize_filters
|
||||||
|
end
|
||||||
|
|
||||||
|
initialize_robots if robots || Arachnid.robots?
|
||||||
|
end
|
||||||
|
|
||||||
|
# Create a new scoped `Agent` in a block.
|
||||||
|
def self.new(**options, &block : Agent ->)
|
||||||
|
_new = new(**options)
|
||||||
|
with _new yield _new
|
||||||
|
_new
|
||||||
|
end
|
||||||
|
|
||||||
|
# Creates a new `Agent` and begins spidering at the given URL.
|
||||||
|
def self.start_at(url, **options, &block : Agent ->)
|
||||||
|
agent = new(**options, &block)
|
||||||
|
agent.start_at(url, force: true)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Creates a new `Agent` and spiders the web site located
|
||||||
|
# at the given URL.
|
||||||
|
def self.site(url, **options, &block : Agent ->)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
url_regex = Regex.new(Regex.escape(url.host.to_s))
|
||||||
|
|
||||||
|
agent = new(**options, &block)
|
||||||
|
agent.visit_hosts_like(url_regex)
|
||||||
|
|
||||||
|
agent.start_at(url, force: true)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Creates a new `Agent` and spiders the given host.
|
||||||
|
def self.host(url, **options, &block : Agent ->)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
|
||||||
|
options = options.merge(host: url.host)
|
||||||
|
agent = new(**options, &block)
|
||||||
|
|
||||||
|
agent.start_at(url, force: true)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Clears the history of the `Agent`.
|
||||||
|
def clear
|
||||||
|
@queue.clear
|
||||||
|
@history.clear
|
||||||
|
@failures.clear
|
||||||
|
self
|
||||||
|
end
|
||||||
|
|
||||||
|
# Start spidering at a given URL.
|
||||||
|
# def start_at(url, &block : Page ->)
|
||||||
|
# enqueue(url)
|
||||||
|
# run(&block)
|
||||||
|
# end
|
||||||
|
|
||||||
|
# Start spidering at a given URL.
|
||||||
|
def start_at(url, force = false)
|
||||||
|
enqueue(url, force: force)
|
||||||
|
return run
|
||||||
|
end
|
||||||
|
|
||||||
|
# Start spidering until the queue becomes empty or the
|
||||||
|
# agent is paused.
|
||||||
|
# def run(&block : Page ->)
|
||||||
|
# @running = true
|
||||||
|
|
||||||
|
# until @queue.empty? || paused? || limit_reached?
|
||||||
|
# begin
|
||||||
|
# visit_page(dequeue, &block)
|
||||||
|
# rescue Actions::Paused
|
||||||
|
# return self
|
||||||
|
# rescue Actions::Action
|
||||||
|
# end
|
||||||
|
# end
|
||||||
|
|
||||||
|
# @running = false
|
||||||
|
# @sessions.clear
|
||||||
|
# self
|
||||||
|
# end
|
||||||
|
|
||||||
|
# Start spidering until the queue becomes empty or the
|
||||||
|
# agent is paused.
|
||||||
|
def run
|
||||||
|
@running = true
|
||||||
|
|
||||||
|
until @queue.empty? || paused? || limit_reached? || !running?
|
||||||
|
begin
|
||||||
|
visit_page(dequeue)
|
||||||
|
rescue Actions::Paused
|
||||||
|
return self
|
||||||
|
rescue Actions::Action
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
@running = false
|
||||||
|
@sessions.clear
|
||||||
|
self
|
||||||
|
end
|
||||||
|
|
||||||
|
# Sets the history of URLs that were previously visited.
|
||||||
|
# Sets the history of URLs that were previously visited.
#
# Accepts an enumerable of `URI` or `String` items; strings are parsed
# into `URI`s. The existing history is replaced entirely.
#
# Returns the updated history set.
def history=(new_history)
  @history.clear

  new_history.each do |url|
    # Parenthesized: `<<` binds tighter than the ternary, so without
    # parens this pushed the Bool result of `is_a?` instead of the URI.
    @history << (url.is_a?(URI) ? url : URI.parse(url))
  end

  @history
end
|
||||||
|
|
||||||
|
# Specifies the links which have been visited.
|
||||||
|
def visited_links
|
||||||
|
@history.map(&.to_s)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Specifies the hosts which have been visited.
|
||||||
|
def visited_hosts
|
||||||
|
history.map(&.host)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Determines whether a URL was visited or not.
|
||||||
|
def visited?(url)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
@history.includes?(url)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Sets the list of failed URLs.
|
||||||
|
# Sets the list of URLs that could not be visited.
#
# Accepts an enumerable of `URI` or `String` items; strings are parsed
# into `URI`s. The existing failures set is replaced entirely.
#
# Returns the updated failures set.
def failures=(new_failures)
  @failures.clear

  new_failures.each do |url|
    # Parenthesized: `<<` binds tighter than the ternary, so without
    # parens this pushed the Bool result of `is_a?` instead of the URI.
    @failures << (url.is_a?(URI) ? url : URI.parse(url))
  end

  @failures
end
|
||||||
|
|
||||||
|
# Determines whether a given URL could not be visited.
|
||||||
|
def failed?(url)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
@failures.includes?(url)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Sets the queue of URLs to visit.
|
||||||
|
# Sets the list of failed URLs.
|
||||||
|
# Sets the queue of URLs to visit.
#
# Accepts an enumerable of `URI` or `String` items; strings are parsed
# into `URI`s. The existing queue is replaced entirely.
#
# Returns the updated queue.
def queue=(new_queue)
  @queue.clear

  new_queue.each do |url|
    # Parenthesized: `<<` binds tighter than the ternary, so without
    # parens this pushed the Bool result of `is_a?` instead of the URI.
    @queue << (url.is_a?(URI) ? url : URI.parse(url))
  end

  @queue
end
|
||||||
|
|
||||||
|
# Determines whether the given URL has been queued for visiting.
|
||||||
|
def queued?(url)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
@queue.includes?(url)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Enqueues a given URL for visiting, only if it passes all
|
||||||
|
# of the agent's rules for visiting a given URL.
|
||||||
|
def enqueue(url, level = 0, force = false)
|
||||||
|
url = sanitize_url(url)
|
||||||
|
|
||||||
|
if (!queued?(url) && visit?(url)) || force
|
||||||
|
link = url.to_s
|
||||||
|
|
||||||
|
return if url.host.to_s.empty?
|
||||||
|
|
||||||
|
begin
|
||||||
|
@every_url_blocks.each { |url_block| url_block.call(url) }
|
||||||
|
|
||||||
|
@every_url_like_blocks.each do |pattern, url_blocks|
|
||||||
|
match = case pattern
|
||||||
|
when Regex
|
||||||
|
link =~ pattern
|
||||||
|
else
|
||||||
|
(pattern == link) || (pattern == url)
|
||||||
|
end
|
||||||
|
|
||||||
|
if match
|
||||||
|
url_blocks.each { |url_block| url_block.call(url) }
|
||||||
|
end
|
||||||
|
end
|
||||||
|
rescue action : Actions::Paused
|
||||||
|
raise(action)
|
||||||
|
rescue Actions::SkipLink
|
||||||
|
return false
|
||||||
|
rescue Actions::Action
|
||||||
|
end
|
||||||
|
|
||||||
|
@queue << url
|
||||||
|
@levels[url] = level
|
||||||
|
true
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Gets and creates a new `Page` object from a given URL,
|
||||||
|
# yielding the newly created page.
|
||||||
|
def get_page(url, &block)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
|
||||||
|
prepare_request(url) do |session, path, handlers|
|
||||||
|
new_page = Page.new(url, session.get(path, headers: handlers))
|
||||||
|
|
||||||
|
# save any new cookies
|
||||||
|
@cookies.from_page(new_page)
|
||||||
|
|
||||||
|
yield new_page
|
||||||
|
return new_page
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Gets and creates a new `Page` object from a given URL.
|
||||||
|
def get_page(url)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
|
||||||
|
prepare_request(url) do |session, path, handlers|
|
||||||
|
new_page = Page.new(url, session.get(path, handlers))
|
||||||
|
|
||||||
|
# save any new cookies
|
||||||
|
@cookies.from_page(new_page)
|
||||||
|
|
||||||
|
return new_page
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Posts supplied form data and creates a new Page from a given URL,
|
||||||
|
# yielding the newly created page.
|
||||||
|
def post_page(url, post_data = "", &block)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
|
||||||
|
prepare_request(url) do |session, path, handlers|
|
||||||
|
new_page = Page.new(url, session.post(path, post_data, handlers))
|
||||||
|
|
||||||
|
# save any new cookies
|
||||||
|
@cookies.from_page(new_page)
|
||||||
|
|
||||||
|
yield new_page
|
||||||
|
return new_page
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Posts supplied form data and creates a new Page from a given URL.
|
||||||
|
def post_page(url, post_data = "")
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
|
||||||
|
prepare_request(url) do |session, path, handlers|
|
||||||
|
new_page = Page.new(url, session.post(path, post_data, handlers))
|
||||||
|
|
||||||
|
# save any new cookies
|
||||||
|
@cookies.from_page(new_page)
|
||||||
|
|
||||||
|
return new_page
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Visits a given URL and enqueues the links recovered
|
||||||
|
# from the page to be visited later.
|
||||||
|
# def visit_page(url, &block : Page ->)
|
||||||
|
# url = sanitize_url(url)
|
||||||
|
|
||||||
|
# get_page(url) do |page|
|
||||||
|
# @history << page.url
|
||||||
|
|
||||||
|
# begin
|
||||||
|
# @every_page_blocks.each { |page_block| page_block.call(page) }
|
||||||
|
# yield page
|
||||||
|
# rescue action : Actions::Paused
|
||||||
|
# raise(action)
|
||||||
|
# rescue Actions::SkipPage
|
||||||
|
# return Nil
|
||||||
|
# rescue Actions::Action
|
||||||
|
# end
|
||||||
|
|
||||||
|
# page.each_url do |next_url|
|
||||||
|
# begin
|
||||||
|
# @every_link_blocks.each do |link_block|
|
||||||
|
# link_block.call(page.url, next_url)
|
||||||
|
# end
|
||||||
|
# rescue action : Actions::Paused
|
||||||
|
# raise(action)
|
||||||
|
# rescue Actions::SkipLink
|
||||||
|
# next
|
||||||
|
# rescue Actions::Action
|
||||||
|
# end
|
||||||
|
|
||||||
|
# if @max_depth.nil? || @max_depth.not_nil! > (@levels[url]? || 0)
|
||||||
|
# @levels[url] ||= 0
|
||||||
|
# enqueue(next_url, @levels[url] + 1)
|
||||||
|
# end
|
||||||
|
# end
|
||||||
|
# end
|
||||||
|
# end
|
||||||
|
|
||||||
|
# Visits a given URL and enqueues the links recovered
|
||||||
|
# from the page to be visited later.
|
||||||
|
def visit_page(url)
|
||||||
|
url = sanitize_url(url)
|
||||||
|
|
||||||
|
get_page(url) do |page|
|
||||||
|
@history << page.url
|
||||||
|
|
||||||
|
begin
|
||||||
|
@every_page_blocks.each { |page_block| page_block.call(page) }
|
||||||
|
rescue action : Actions::Paused
|
||||||
|
raise(action)
|
||||||
|
rescue Actions::SkipPage
|
||||||
|
return nil
|
||||||
|
rescue Actions::Action
|
||||||
|
end
|
||||||
|
|
||||||
|
page.each_url do |next_url|
|
||||||
|
begin
|
||||||
|
@every_link_blocks.each do |link_block|
|
||||||
|
link_block.call(page.url, next_url)
|
||||||
|
end
|
||||||
|
rescue action : Actions::Paused
|
||||||
|
raise(action)
|
||||||
|
rescue Actions::SkipLink
|
||||||
|
next
|
||||||
|
rescue Actions::Action
|
||||||
|
end
|
||||||
|
|
||||||
|
if @max_depth.nil? || @max_depth.not_nil! > (@levels[url]? || 0)
|
||||||
|
@levels[url] ||= 0
|
||||||
|
enqueue(next_url, @levels[url] + 1)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Converts the agent into a hash.
|
||||||
|
def to_h
|
||||||
|
{"history" => @history, "queue" => @queue}
|
||||||
|
end
|
||||||
|
|
||||||
|
# Prepares request headers for a given URL.
|
||||||
|
protected def prepare_request_headers(url)
|
||||||
|
# set any additional HTTP headers
|
||||||
|
headers = @default_headers.dup
|
||||||
|
|
||||||
|
unless @host_headers.empty?
|
||||||
|
@host_headers.each do |name, header|
|
||||||
|
if url.host =~ name
|
||||||
|
headers["Host"] = header
|
||||||
|
break
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
headers["Host"] ||= @host_header.to_s if @host_header
|
||||||
|
headers["User-Agent"] ||= @user_agent.to_s
|
||||||
|
headers["Referer"] ||= @referer.to_s if @referer
|
||||||
|
|
||||||
|
if authorization = @authorized.for_url(url.host.to_s)
|
||||||
|
headers["Authorization"] = "Basic #{authorization}"
|
||||||
|
end
|
||||||
|
|
||||||
|
if header_cookies = @cookies.for_host(url.host.to_s)
|
||||||
|
headers["Cookie"] = header_cookies.to_cookie_header
|
||||||
|
end
|
||||||
|
|
||||||
|
headers
|
||||||
|
end
|
||||||
|
|
||||||
|
# Normalizes the request path and grabs a session to handle
|
||||||
|
# page get and post requests.
|
||||||
|
def prepare_request(url, &block)
|
||||||
|
path = if url.path.empty?
|
||||||
|
"/"
|
||||||
|
else
|
||||||
|
url.path
|
||||||
|
end
|
||||||
|
|
||||||
|
# append the URL query to the path
|
||||||
|
path += "?#{url.query}" if url.query
|
||||||
|
|
||||||
|
headers = prepare_request_headers(url)
|
||||||
|
|
||||||
|
begin
|
||||||
|
sleep(@fetch_delay) if @fetch_delay.to_i > 0
|
||||||
|
|
||||||
|
yield @sessions[url], path, headers
|
||||||
|
rescue Halite::Exception::Error | IO::Error | Socket::Error | OpenSSL::SSL::Error
|
||||||
|
@sessions.kill!(url)
|
||||||
|
return nil
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Dequeues a URL that will later be visited.
|
||||||
|
def dequeue
|
||||||
|
@queue.shift
|
||||||
|
end
|
||||||
|
|
||||||
|
# Determines if the maximum limit has been reached.
|
||||||
|
def limit_reached?
|
||||||
|
if limit = @limit
|
||||||
|
return @history.size >= limit
|
||||||
|
end
|
||||||
|
false
|
||||||
|
end
|
||||||
|
|
||||||
|
# Determines if a given URL should be visited.
|
||||||
|
def visit?(url)
|
||||||
|
# puts [url.to_s, visited?(url), visit_scheme?(url.scheme.to_s), visit_host?(url.host.to_s), visit_port?(url.port || -1), visit_link?(url.to_s), visit_url?(url), visit_ext?(url.path)]
|
||||||
|
!visited?(url) &&
|
||||||
|
visit_scheme?(url.scheme.to_s) &&
|
||||||
|
visit_host?(url.host.to_s) &&
|
||||||
|
visit_port?(url.port || -1) &&
|
||||||
|
visit_link?(url.to_s) &&
|
||||||
|
visit_url?(url) &&
|
||||||
|
visit_ext?(url.path)
|
||||||
|
# robot_allowed?(url.to_s)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Adds a given URL to the failures list.
|
||||||
|
def failed(url)
|
||||||
|
@failures << url
|
||||||
|
@every_failed_url_blocks.each { |fail_block| fail_block.call(url) }
|
||||||
|
true
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,53 @@
|
||||||
|
module Arachnid
|
||||||
|
class Agent
|
||||||
|
module Actions
|
||||||
|
|
||||||
|
# A Runtime Error
|
||||||
|
class RuntimeError < Exception; end
|
||||||
|
|
||||||
|
# The base `Actions` exceptions class
|
||||||
|
class Action < RuntimeError; end
|
||||||
|
|
||||||
|
# Exception used to pause a running `Agent`
|
||||||
|
class Paused < Action; end
|
||||||
|
|
||||||
|
# Exception which causes a running `Agent` to skip a link.
|
||||||
|
class SkipLink < Action; end
|
||||||
|
|
||||||
|
# Exception which causes a running `Agent` to skip a page.
|
||||||
|
class SkipPage < Action; end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Continue spidering
|
||||||
|
def continue!(&block)
|
||||||
|
@paused = false
|
||||||
|
run(&block)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Sets the pause state of the agent.
|
||||||
|
def pause=(state)
|
||||||
|
@paused = state
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pauses the agent, causing spidering to temporarily stop.
|
||||||
|
def pause!
|
||||||
|
@paused = true
|
||||||
|
raise Actions::Paused.new
|
||||||
|
end
|
||||||
|
|
||||||
|
# Determines whether the agent is paused.
|
||||||
|
def paused?
|
||||||
|
@paused == true
|
||||||
|
end
|
||||||
|
|
||||||
|
# Causes the agent to skip the link being enqueued.
|
||||||
|
def skip_link!
|
||||||
|
raise Actions::SkipLink.new
|
||||||
|
end
|
||||||
|
|
||||||
|
# Causes the agent to skip the page being visited.
|
||||||
|
# Causes the agent to skip the page being visited.
#
# Raises an *instance* of `Actions::SkipPage` — raising the bare class
# is not a valid `raise` target in Crystal; the sibling `skip_link!`
# already uses `.new`, so this also restores consistency.
def skip_page!
  raise Actions::SkipPage.new
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,248 @@
|
||||||
|
require "../page"
|
||||||
|
|
||||||
|
module Arachnid
|
||||||
|
class Agent
|
||||||
|
@every_url_blocks = [] of Proc(URI, Nil)
|
||||||
|
|
||||||
|
@every_failed_url_blocks = [] of Proc(URI, Nil)
|
||||||
|
|
||||||
|
@every_url_like_blocks = Hash(String | Regex, Array(Proc(URI, Nil))).new do |hash, key|
|
||||||
|
hash[key] = [] of Proc(URI, Nil)
|
||||||
|
end
|
||||||
|
|
||||||
|
@every_page_blocks = [] of Proc(Page, Nil)
|
||||||
|
|
||||||
|
@every_link_blocks = [] of Proc(URI, URI, Nil)
|
||||||
|
|
||||||
|
# Pass each URL from each page visited to the given block.
|
||||||
|
def every_url(&block : URI ->)
|
||||||
|
@every_url_blocks << block
|
||||||
|
self
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass each URL that could not be requested to the given block.
|
||||||
|
def every_failed_url(&block : URI ->)
|
||||||
|
@every_failed_url_blocks << block
|
||||||
|
self
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every URL that the agent visits, and matches a given pattern,
|
||||||
|
# to a given block.
|
||||||
|
def every_url_like(pattern, &block : URI ->)
|
||||||
|
@every_url_like_blocks[pattern] << block
|
||||||
|
self
|
||||||
|
end
|
||||||
|
|
||||||
|
# See `#every_url_like`
|
||||||
|
def urls_like(pattern, &block : URI ->)
|
||||||
|
every_url_like(pattern, &block)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass the headers from every response the agent receives to a given
|
||||||
|
# block.
|
||||||
|
def all_headers(&block)
|
||||||
|
headers = [] of HTTP::Headers
|
||||||
|
every_page { |page| headers << page.headers }
|
||||||
|
headers.each { |header| yield headers }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every page that the agent visits to a given block.
|
||||||
|
def every_page(&block : Page ->)
|
||||||
|
@every_page_blocks << block
|
||||||
|
self
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every OK page that the agent visits to a given block.
|
||||||
|
def every_ok_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.ok? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Redirect page that the agent visits to a given block.
|
||||||
|
def every_redirect_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.redirect? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Timeout page that the agent visits to a given block.
|
||||||
|
def every_timedout_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.timeout? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Bad Request page that the agent visits to a given block.
|
||||||
|
def every_bad_request_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.bad_request? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Unauthorized page that the agent visits to a given block.
|
||||||
|
def every_unauthorized_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.unauthorized? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Forbidden page that the agent visits to a given block.
|
||||||
|
def every_forbidden_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.forbidden? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Missing page that the agent visits to a given block.
|
||||||
|
def every_missing_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.missing? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Internal Server Error page that the agent visits to a
|
||||||
|
# given block.
|
||||||
|
def every_internal_server_error_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.had_internal_server_error? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Plain Text page that the agent visits to a given block.
|
||||||
|
def every_txt_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.txt? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every HTML page that the agent visits to a given block.
|
||||||
|
def every_html_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.html? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every XML page that the agent visits to a given block.
|
||||||
|
def every_xml_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.xml? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every XML Stylesheet (XSL) page that the agent visits to a
|
||||||
|
# given block.
|
||||||
|
def every_xsl_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.xsl? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every HTML or XML document that the agent parses to a given
# block.
#
# BUG FIX: the element type was written `Document::HTML || XML::Node`,
# which is a boolean-or expression, not a type union; the correct
# union type is `Document::HTML | XML::Node` (matching the block's
# own signature).
def every_doc(&block : Document::HTML | XML::Node ->)
  docs = [] of Document::HTML | XML::Node
  every_page do |page|
    # Only collect pages that actually produced a parsed document.
    if doc = page.doc
      docs << doc
    end
  end
  docs.each { |doc| yield doc }
end
|
||||||
|
|
||||||
|
# Pass every HTML document that the agent parses to a given block.
#
# BUG FIX: the accumulator was `[] of Document::HTML` while
# `page.doc` returns `(Document::HTML | XML::Node)?` — the push could
# not type-check, and `not_nil!` would raise for an HTML page whose
# body was empty. The array now matches the block signature and nil
# documents are skipped instead of crashing.
def every_html_doc(&block : Document::HTML | XML::Node ->)
  docs = [] of Document::HTML | XML::Node
  every_page do |page|
    if (doc = page.doc) && page.html?
      docs << doc
    end
  end
  docs.each { |doc| yield doc }
end
|
||||||
|
|
||||||
|
# Pass every XML document that the agent parses to a given block.
def every_xml_doc(&block : XML::Node ->)
  documents = [] of XML::Node
  every_page do |page|
    documents << page.doc.not_nil! if page.xml?
  end
  documents.each { |doc| yield doc }
end

# Pass every XML Stylesheet (XSL) that the agent parses to a given
# block.
def every_xsl_doc(&block : XML::Node ->)
  documents = [] of XML::Node
  every_page do |page|
    documents << page.doc.not_nil! if page.xsl?
  end
  documents.each { |doc| yield doc }
end

# Pass every RSS document that the agent parses to a given block.
def every_rss_doc(&block : XML::Node ->)
  documents = [] of XML::Node
  every_page do |page|
    documents << page.doc.not_nil! if page.rss?
  end
  documents.each { |doc| yield doc }
end

# Pass every Atom document that the agent parses to a given block.
def every_atom_doc(&block : XML::Node ->)
  documents = [] of XML::Node
  every_page do |page|
    documents << page.doc.not_nil! if page.atom?
  end
  documents.each { |doc| yield doc }
end
|
||||||
|
|
||||||
|
# Pass every JavaScript page that the agent visits to a given block.
def every_javascript_page(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.javascript? }
  matches.each { |page| yield page }
end

# Pass every CSS page that the agent visits to a given block.
def every_css_page(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.css? }
  matches.each { |page| yield page }
end

# Pass every RSS feed that the agent visits to a given block.
def every_rss_page(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.rss? }
  matches.each { |page| yield page }
end

# Pass every Atom feed that the agent visits to a given block.
def every_atom_page(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.atom? }
  matches.each { |page| yield page }
end

# Pass every MS Word page that the agent visits to a given block.
def every_ms_word_page(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.ms_word? }
  matches.each { |page| yield page }
end

# Pass every PDF page that the agent visits to a given block.
def every_pdf_page(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.pdf? }
  matches.each { |page| yield page }
end

# Pass every ZIP page that the agent visits to a given block.
def every_zip_page(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.zip? }
  matches.each { |page| yield page }
end

# Passes every image page that the agent visits to the given block.
def every_image(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.image? }
  matches.each { |page| yield page }
end
|
||||||
|
|
||||||
|
# Passes every origin and destination URI of each link to a given
# block.
def every_link(&block : URI, URI ->)
  @every_link_blocks.push(block)
  self
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,256 @@
|
||||||
|
require "../rules"

module Arachnid
  class Agent
    # List of acceptable URL schemes to follow
    getter schemes : Array(String) = [] of String

    @host_rules = Rules(String).new
    @port_rules = Rules(Int32).new
    @link_rules = Rules(String).new
    @url_rules = Rules(URI).new
    @ext_rules = Rules(String).new

    # Sets the list of acceptable URL schemes to visit.
    def schemes=(new_schemes)
      @schemes = new_schemes.map(&.to_s)
    end

    # Specifies the patterns that match host-names to visit.
    def visit_hosts
      @host_rules.accept
    end

    # Adds a given pattern to the `#visit_hosts`.
    def visit_hosts_like(pattern)
      @host_rules.accept << pattern
      self
    end

    def visit_hosts_like(&block)
      @host_rules.accept << block
      self
    end

    # Specifies the patterns that match host-names to not visit.
    def ignore_hosts
      @host_rules.reject
    end

    # Adds a given pattern to the `#ignore_hosts`.
    def ignore_hosts_like(pattern)
      @host_rules.reject << pattern
      self
    end

    def ignore_hosts_like(&block)
      @host_rules.reject << block
      self
    end

    # Specifies the patterns that match the ports to visit.
    def visit_ports
      @port_rules.accept
    end

    # Adds a given pattern to the `#visit_ports`.
    def visit_ports_like(pattern)
      @port_rules.accept << pattern
      self
    end

    def visit_ports_like(&block : Int32 -> Bool)
      @port_rules.accept << block
      self
    end

    # Specifies the patterns that match ports to not visit.
    def ignore_ports
      @port_rules.reject
    end

    # Adds a given pattern to the `#ignore_ports`.
    def ignore_ports_like(pattern)
      @port_rules.reject << pattern
      self
    end

    def ignore_ports_like(&block : Int32 -> Bool)
      @port_rules.reject << block
      self
    end

    # Specifies the patterns that match the links to visit.
    def visit_links
      @link_rules.accept
    end

    # Adds a given pattern to the `#visit_links`
    def visit_links_like(pattern)
      @link_rules.accept << pattern
      self
    end

    def visit_links_like(&block : String -> Bool)
      @link_rules.accept << block
      self
    end

    # Specifies the patterns that match links to not visit.
    def ignore_links
      @link_rules.reject
    end

    # Adds a given pattern to the `#ignore_links`.
    def ignore_links_like(pattern)
      @link_rules.reject << pattern
      self
    end

    def ignore_links_like(&block : String -> Bool)
      @link_rules.reject << block
      self
    end

    # Specifies the patterns that match the URLs to visit.
    def visit_urls
      @url_rules.accept
    end

    # Adds a given pattern to the `#visit_urls`
    def visit_urls_like(&block : URI -> Bool)
      @url_rules.accept << block
      self
    end

    def visit_urls_like(pattern)
      @url_rules.accept << pattern
      self
    end

    # Specifies the patterns that match URLs to not visit.
    def ignore_urls
      @url_rules.reject
    end

    # Adds a given pattern to the `#ignore_urls`.
    def ignore_urls_like(&block : URI -> Bool)
      @url_rules.reject << block
      self
    end

    def ignore_urls_like(pattern)
      @url_rules.reject << pattern
      self
    end

    # Specifies the patterns that match the URI path extensions to visit.
    def visit_exts
      @ext_rules.accept
    end

    # Adds a given pattern to the `#visit_exts`.
    def visit_exts_like(&block : String -> Bool)
      @ext_rules.accept << block
      self
    end

    def visit_exts_like(pattern)
      @ext_rules.accept << pattern
      self
    end

    # Specifies the patterns that match URI path extensions to not visit.
    def ignore_exts
      @ext_rules.reject
    end

    # Adds a given pattern to the `#ignore_exts`.
    def ignore_exts_like(&block : String -> Bool)
      @ext_rules.reject << block
      self
    end

    def ignore_exts_like(pattern)
      @ext_rules.reject << pattern
      self
    end
|
||||||
|
|
||||||
|
# Initializes filtering rules from the options given to the agent.
#
# BUG FIX: the method body referenced a local `host` that was never
# declared (the parameter list only had `hosts`), which cannot
# compile. `host` is now an optional trailing parameter; when given,
# crawling is restricted to that single host. Appending it at the
# end keeps positional callers working.
protected def initialize_filters(
  schemes = nil,
  hosts = nil,
  ignore_hosts = nil,
  ports = nil,
  ignore_ports = nil,
  links = nil,
  ignore_links = nil,
  urls = nil,
  ignore_urls = nil,
  exts = nil,
  ignore_exts = nil,
  host = nil
)
  if schemes
    self.schemes = schemes
  else
    # Default to the standard web schemes.
    @schemes << "http"
    @schemes << "https"
  end

  @host_rules.accept = hosts
  @host_rules.reject = ignore_hosts

  @port_rules.accept = ports
  @port_rules.reject = ignore_ports

  @link_rules.accept = links
  @link_rules.reject = ignore_links

  @url_rules.accept = urls
  @url_rules.reject = ignore_urls

  @ext_rules.accept = exts
  @ext_rules.reject = ignore_exts

  # A single `host` option restricts the crawl to that host.
  visit_hosts_like(host.to_s) if host
end
|
||||||
|
|
||||||
|
# Determines if a given URI scheme should be visited.
# A nil scheme is always allowed.
protected def visit_scheme?(scheme)
  scheme ? @schemes.includes?(scheme) : true
end

# Determines if a given host-name should be visited.
protected def visit_host?(host)
  @host_rules.accept?(host)
end

# Determines if a given port should be visited.
protected def visit_port?(port)
  @port_rules.accept?(port)
end

# Determines if a given link should be visited.
protected def visit_link?(link)
  @link_rules.accept?(link)
end

# Determines if a given URL should be visited.
protected def visit_url?(link)
  @url_rules.accept?(link)
end

# Determines if a given URI path extension should be visited.
protected def visit_ext?(path)
  @ext_rules.accept?(File.extname(path))
end
end
end
|
|
@ -0,0 +1,20 @@
|
||||||
|
require "../robots"

module Arachnid
  class Agent
    @robots : Arachnid::Robots? = nil

    # Initializes the robots filter.
    def initialize_robots
      # @robots = Arachnid::Robots.new(@user_agent)
    end

    # Determines whether a URL is allowed by the robot policy.
    # Defaults to `true` when no robots filter is configured.
    def robot_allowed?(url)
      robots = @robots
      robots ? robots.allowed?(url) : true
    end
  end
end
|
|
@ -0,0 +1,21 @@
|
||||||
|
module Arachnid
  class Agent
    # Specifies whether the Agent will strip URI fragments
    property? strip_fragments : Bool = true

    # Specifies whether the Agent will strip URI queries
    property? strip_query : Bool = false

    # Sanitizes a URL based on the filtering options above.
    # Accepts either a `URI` or a `String` and returns a `URI`.
    def sanitize_url(url)
      uri = url.is_a?(URI) ? url : URI.parse(url)

      # Normalize a bare "/" path to the empty path.
      uri.path = "" if uri.path == "/"
      uri.fragment = nil if strip_fragments?
      uri.query = nil if strip_query?

      uri
    end
  end
end
|
|
@ -0,0 +1,39 @@
|
||||||
|
require "./page"
require "./agent"

module Arachnid
  extend self

  # Specifies whether robots.txt should be honored globally
  class_property? robots : Bool = false

  # Should we set the DNT (Do Not Track) header?
  class_property? do_not_track : Bool = false

  # Maximum amount of redirects to follow
  class_property max_redirects : Int32 = 0

  # Connect timeout.
  class_property connect_timeout : Int32 = 10

  # Read timeout.
  class_property read_timeout : Int32 = 10

  # The User-Agent string used by all Agent objects by default.
  class_property user_agent : String = "Arachnid #{Arachnid::VERSION}"

  # Convenience wrapper; see `Agent.start_at`.
  def start_at(url, **options, &block : Agent ->)
    Agent.start_at(url, **options, &block)
  end

  # Convenience wrapper; see `Agent.host`.
  def host(name, **options, &block : Agent ->)
    Agent.host(name, **options, &block)
  end

  # Convenience wrapper; see `Agent.site`.
  def site(url, **options, &block : Agent ->)
    Agent.site(url, **options, &block)
  end
end
|
|
@ -0,0 +1,4 @@
|
||||||
|
module Arachnid
  # Represents HTTP Authentication credentials (username/password
  # pair) for a website.
  record AuthCredential, username : String, password : String
end
|
|
@ -0,0 +1,83 @@
|
||||||
|
require "base64"
require "./extensions/uri"
require "./auth_credential"
require "./page"

module Arachnid
  # Stores `AuthCredential`s keyed by {scheme, host, port} and path,
  # returning the most specific credential for a given URL.
  class AuthStore
    @credentials = {} of Tuple(String?, String?, Int32?) => Hash(Array(String), AuthCredential)

    # Given a URL, return the most specific matching auth credential,
    # or `nil` when none is stored for that origin/path.
    def [](url)
      # normalize the url
      url = URI.parse(url) unless url.is_a?(URI)

      key = key_for(url)
      paths = @credentials[key]?

      return nil unless paths

      # Longest (most specific) path first.
      # BUG FIX: `sort` with a one-argument block is not a valid
      # two-argument comparator; `sort_by` is what was intended.
      ordered_paths = paths.keys.sort_by { |path_key| -path_key.size }

      # directories of the path
      path_dirs = URI.expand_path(url.path).split('/').reject(&.empty?)

      ordered_paths.each do |path|
        return paths[path] if path_dirs[0, path.size] == path
      end

      nil
    end

    # Add an auth credential to the store for the supplied base URL.
    def []=(url, auth)
      # normalize the url
      url = URI.parse(url) unless url.is_a?(URI)

      # normalize the url path and split it
      paths = URI.expand_path(url.path).split('/').reject(&.empty?)

      key = key_for(url)

      @credentials[key] ||= {} of Array(String) => AuthCredential
      @credentials[key][paths] = auth
      auth
    end

    # Convenience method to add username and password credentials
    # for a named URL.
    def add(url, username, password)
      self[url] = AuthCredential.new(username: username, password: password)
    end

    # Returns the base64 encoded authorization string for the URL
    # or `nil` if no authorization exists.
    def for_url(url)
      if auth = self[url]
        # BUG FIX: HTTP Basic credentials are "username:password" —
        # the colon separator was missing. `strict_encode` is used
        # because `encode` appends a newline, which is invalid inside
        # an Authorization header.
        Base64.strict_encode("#{auth.username}:#{auth.password}")
      end
    end

    # Clear the contents of the auth store.
    def clear!
      # BUG FIX: Crystal's Hash exposes `clear`, not `clear!`.
      @credentials.clear
      self
    end

    # Size of the current auth store (number of URL paths stored)
    def size
      @credentials.values.sum(&.size)
    end

    # Inspect the auth store
    def inspect
      "<#{self.class}: #{@credentials.inspect}>"
    end

    # Creates an auth key ({scheme, host, port}) based on the URL.
    private def key_for(url)
      {url.scheme, url.host, url.port}
    end
  end
end
|
|
@ -0,0 +1,118 @@
|
||||||
|
module Arachnid
  # Stores per-host cookies collected while crawling.
  class CookieJar
    include Enumerable(HTTP::Cookies)

    @params : Hash(String, HTTP::Cookies)
    @cookies : HTTP::Cookies
    @dirty : Set(String)

    # Creates a new, empty `CookieJar`.
    def initialize
      @params = {} of String => HTTP::Cookies
      @cookies = HTTP::Cookies.new
      @dirty = Set(String).new
    end

    # Iterates over the host-name and cookie value pairs in the jar.
    def each(&block)
      @params.each { |pair| yield pair }
    end

    # Returns all relevant cookies for the named host or domain
    # (an empty collection when none are stored).
    def [](host : String)
      @params[host]? || HTTP::Cookies.new
    end

    # Add cookies to the jar for a particular domain. If any incoming
    # cookie differs from what is stored, all of them are copied in
    # and the host is marked dirty for re-encoding.
    def []=(host : String, cookies : HTTP::Cookies)
      @params[host] ||= HTTP::Cookies.new

      cookies.each do |cookie|
        next unless @params[host][cookie.name]? != cookie.value

        cookies.each { |c| @params[host] << c }
        @dirty.add(host)
        break
      end

      cookies
    end

    # Retrieve cookies for a domain from the response.
    # Returns `true` when any cookies were stored.
    def from_page(page)
      fresh = page.cookies
      return false if fresh.empty?

      self[page.url.host.to_s] = fresh
      true
    end

    # Returns the pre-encoded Cookie header value for a given host,
    # re-encoding it only when the host's cookies changed.
    def for_host(host)
      if @dirty.includes?(host)
        header_values = [] of String
        cookies_for_host(host).each do |cookie|
          header_values << cookie.to_cookie_header
        end

        @cookies[host] = header_values.join("; ")
        @dirty.delete(host)
      end

      @cookies[host]?
    end

    # Returns raw cookie value pairs for a given host. Includes cookies
    # set on parent domains.
    def cookies_for_host(host)
      host_cookies = @params[host]? || HTTP::Cookies.new
      domains = host.split('.')

      while domains.size > 2
        domains.shift

        if parent_cookies = @params[domains.join('.')]?
          parent_cookies.each do |cookie|
            # Copy in the parent cookies, only if they haven't been
            # overridden yet.
            unless host_cookies.has_key?(cookie.name)
              host_cookies[cookie.name] = cookie.value
            end
          end
        end
      end

      host_cookies
    end

    # Clear out the jar, removing all stored cookies.
    def clear!
      @params.clear
      @cookies.clear
      @dirty.clear
      self
    end

    # Size of the cookie jar (number of hosts with stored cookies).
    def size
      @params.size
    end

    # Inspects the cookie jar.
    def inspect
      "#<#{self.class}: #{@params.inspect}>"
    end
  end
end
|
|
@ -0,0 +1,196 @@
|
||||||
|
require "xml"
|
||||||
|
|
||||||
|
module Arachnid
|
||||||
|
module Document
|
||||||
|
struct HTML
|
||||||
|
# Raw HTML source this document was parsed from.
@content : String

# Parsed DOM root.
@document : XML::Node

# Index of id attribute -> node.
@ids : Hash(String, XML::Node)

# Index of element name -> tags.
@tags : Hash(String, Array(Tag))

# Index of class name -> nodes.
@classes : Hash(String, Array(XML::Node))

forward_missing_to @document

def initialize(@content : String)
  @document = XML.parse_html(@content)

  @ids = {} of String => XML::Node
  @tags = {} of String => Array(Tag)
  @classes = {} of String => Array(XML::Node)

  # Walk the tree once up front, indexing ids, tags and classes.
  visit @document
end
|
||||||
|
|
||||||
|
# Parses the given HTML content into a new document.
def self.parse(content : String)
  new(content)
end

# Transform the css query into an xpath query.
def self.css_query_to_xpath(query : String) : String
  query = "//#{query}"
  # Convert '#id_name' into '*[@id="id_name"]'.
  # BUG FIX: `[A-z]` is a classic character-class bug — it also
  # matches `[`, `\`, `]`, `^`, `_` and the backtick, which sit
  # between 'Z' and 'a' in ASCII. An explicit identifier class is
  # used instead.
  query = query.gsub(/\#([A-Za-z0-9_-]+)/) { |m| "*[@id=\"%s\"]" % m.delete('#') }
  # Convert '.classname' into '[@class="classname"]'.
  query = query.gsub(/\.([A-Za-z0-9_-]+)/) { |m| "[@class=\"%s\"]" % m.delete('.') }
  # Convert ' > ' into '/'.
  query = query.gsub(/\s*>\s*/) { "/" }
  # Convert remaining spaces (descendant combinator) into '//'.
  query = query.gsub(" ", "//")
  # Insert a leading '*' when an xpath step has no node name.
  query = query.gsub(/\/\[/) { "/*[" }
  return query
end
|
||||||
|
|
||||||
|
# Find first tag by tag name and return
# `HTML::Tag` if found or `nil` if not found
def at_tag(tag_name : String) : Tag | Nil
  @tags[tag_name]?.try(&.first?)
end

# Find all nodes by tag name and yield
# `HTML::Tag` if found
def where_tag(tag_name : String, &block) : Array(Tag)
  found = [] of Tag
  @tags[tag_name]?.try &.each do |tag|
    yield tag
    found << tag
  end
  found
end

# Find all nodes by classname and yield
# `HTML::Tag` for each one found
def where_class(class_name : String, &block) : Array(Tag)
  found = [] of Tag
  @classes[class_name]?.try &.each do |node|
    tag = Tag.new(node)
    yield tag
    found << tag
  end
  found
end

# Find a node by its id and return a
# `HTML::Tag` if found or `nil` if not found
def at_id(id_name : String) : Tag | Nil
  @ids[id_name]?.try { |node| Tag.new(node) }
end
|
||||||
|
|
||||||
|
# Find all nodes corresponding to the css query and yield each
# wrapped `HTML::Tag`; returns the collected tags.
def css(query : String) : Array(Tag)
  # BUG FIX: the receiver was `@nodes`, which does not exist — the
  # parsed document lives in `@document`. Additionally,
  # `css_query_to_xpath` already prefixes the query with "//", so
  # prefixing again here would produce "////..." and break the
  # XPath expression.
  xpath = HTML.css_query_to_xpath(query)
  @document.xpath_nodes(xpath).map do |node|
    tag = Tag.new(node)
    yield tag
    tag
  end
end
|
||||||
|
|
||||||
|
# Find first node corresponding to the css query and return its
# `HTML::Tag`, or `nil` if nothing matched.
def at_css(query : String)
  css(query) { |tag| return tag }
  nil
end
|
||||||
|
|
||||||
|
# Index a node by its id attribute.
private def add_id(id : String, node : XML::Node)
  @ids[id] = node
end

# Index a node under its element name.
private def add_node(node : XML::Node)
  list = @tags[node.name]? || (@tags[node.name] = [] of Tag)
  list << Tag.new(node)
end

# Index a node under one of its class names.
private def add_class(klass : String, node : XML::Node)
  list = @classes[klass]? || (@classes[klass] = [] of XML::Node)
  list << node
end

# Depth-first visit. Given a node, extract metadata from
# node (if exists), then visit each child.
private def visit(node : XML::Node)
  # We only extract metadata from HTML element nodes.
  if node.element?
    add_node node
    if id = node["id"]?
      add_id id, node
    end
    if classes = node["class"]?
      classes.split(' ') { |klass| add_class klass, node }
    end
  end

  # visit each child
  node.children.each { |child| visit child }
end
|
||||||
|
|
||||||
|
# Represents an HTML Tag
struct Tag
  getter node : XML::Node

  forward_missing_to @node

  def initialize(@node : XML::Node)
  end

  # The tag's `class` attribute, or `nil` when absent.
  def classname : String | Nil
    @node["class"]?
  end

  # The element name of this tag.
  def tagname : String
    @node.name
  end

  # The tag's text content (empty string when there is none).
  def content : String
    @node.text || ""
  end

  # The parent tag, or `nil` at the document root.
  def parent : Tag | Nil
    if parent_node = @node.parent
      Tag.new(parent_node)
    end
  end

  # All element children wrapped as `Tag`s.
  def children : Array(Tag)
    @node.children.compact_map do |child|
      Tag.new(child) if child.element?
    end
  end

  # Whether this tag's class attribute includes *klass*.
  def has_class?(klass : String) : Bool
    classname.try(&.includes?(klass)) || false
  end
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,175 @@
|
||||||
|
require "uri"
|
||||||
|
require "string_scanner"
|
||||||
|
|
||||||
|
class URI
|
||||||
|
# Expands a URI decoded path into a proper absolute path, resolving
# `.` and `..` segments while preserving leading/trailing slashes.
#
# ```
# URI.expand_path("./path")        # => "path"
# URI.expand_path("test/../path")  # => "path"
# URI.expand_path("/test/path/")   # => "/test/path/"
# URI.expand_path("/test/../path") # => "/path"
# ```
#
# BUG FIXES: the loop previously ended with `break if stack.empty?`,
# which aborted processing whenever the *first* segment was `.` or
# `..` — contradicting the documented examples above (e.g.
# "./path" returned "" instead of "path"). Also, a bare `stack.pop`
# raises on an empty array in Crystal; `pop?` is used so extra
# `..` segments above the root are ignored.
def self.expand_path(path)
  if path.starts_with?("/")
    leading_slash = "/"
    path = path[1..-1]
  else
    leading_slash = ""
  end

  if path.ends_with?("/")
    trailing_slash = "/"
    path = path[0..-2]
  else
    trailing_slash = ""
  end

  scanner = StringScanner.new(path)
  stack = [] of String

  until scanner.eos?
    if dir = scanner.scan(/[^\/]+/)
      case dir
      when ".." then stack.pop?
      when "."  then nil # current directory: no-op
      else           stack.push(dir)
      end
    else
      scanner.skip(/\/+/)
    end
  end

  if stack.empty?
    ""
  else
    "#{leading_slash}#{stack.join("/")}#{trailing_slash}"
  end
end
|
||||||
|
|
||||||
|
# Splits a URI path into its slash-separated segments.
def split_path(path)
  path.split("/")
end
|
||||||
|
|
||||||
|
# Merges a base path with a relative path per RFC 2396, Section 5.2
# (mirroring Ruby's `URI::Generic#merge_path`).
#
# BUG FIXES applied relative to the previous version:
# * `base_path = base_path[i - 1, 2]` replaced the whole array with
#   the two-element slice; the intent (as in Ruby's URI) is to
#   *remove* those two elements — now `delete_at(i - 1, 2)`.
# * `'.'` is a Char and can never equal a String element of the
#   split path; the comparison and the `delete` now use `"."`.
# * `tmp.shift` raises on an empty array in Crystal; `shift?`
#   terminates the loop at nil instead.
def merge_path(base, rel)
  # RFC2396, Section 5.2, 5) / 6)
  base_path = split_path(base)
  rel_path = split_path(rel)

  # RFC2396, Section 5.2, 6), a)
  base_path << "" if base_path.last? == ".."
  while i = base_path.index("..")
    base_path.delete_at(i - 1, 2)
  end

  # An absolute relative path replaces the base path entirely.
  if (first = rel_path.first?) && first.empty?
    base_path.clear
    rel_path.shift
  end

  # RFC2396, Section 5.2, 6), c) and d)
  rel_path.push("") if rel_path.last? == "." || rel_path.last? == ".."
  rel_path.delete(".")

  # RFC2396, Section 5.2, 6), e)
  tmp = [] of String
  rel_path.each do |x|
    if x == ".." && !(tmp.empty? || tmp.last == "..")
      tmp.pop
    else
      tmp << x
    end
  end

  add_trailer_slash = !tmp.empty?
  if base_path.empty?
    base_path = [""] # keep '/' for root directory
  elsif add_trailer_slash
    base_path.pop
  end

  while x = tmp.shift?
    if x == ".."
      # RFC2396, Section 4: a .. or . in an absolute path has no
      # special meaning.
      base_path.pop if base_path.size > 1
    else
      # Remaining segments are appended verbatim.
      base_path << x
      tmp.each { |t| base_path << t }
      add_trailer_slash = false
      break
    end
  end
  base_path.push("") if add_trailer_slash

  base_path.join('/')
end
|
||||||
|
|
||||||
|
# Merges this (absolute) URI with *oth* per RFC 2396, Section 5.2,
# returning the resulting URI. *oth* may be a `URI` or a `String`.
#
# Raises `URI::Error` when both URIs are relative.
def merge(oth)
  oth = URI.parse(oth) unless oth.is_a?(URI)

  if oth.absolute?
    # raise BadURIError, "both URI are absolute" if absolute?
    # hmm... should return oth for usability?
    return oth
  end

  unless self.absolute?
    # BUG FIX: error message typo — "othative" -> "relative".
    raise URI::Error.new("both URI are relative")
  end

  base = self.dup

  authority = oth.userinfo || oth.host || oth.port

  # RFC2396, Section 5.2, 2)
  if (oth.path.nil? || oth.path.empty?) && !authority && !oth.query
    base.fragment = oth.fragment if oth.fragment
    return base
  end

  base.query = nil
  base.fragment = nil

  # RFC2396, Section 5.2, 4)
  if !authority
    base.path = merge_path(base.path, oth.path) if base.path && oth.path
  else
    # RFC2396, Section 5.2, 4)
    base.path = oth.path if oth.path
  end

  # RFC2396, Section 5.2, 7)
  base.user = oth.userinfo if oth.userinfo
  base.host = oth.host if oth.host
  base.port = oth.port if oth.port
  base.query = oth.query if oth.query
  base.fragment = oth.fragment if oth.fragment

  base
end
end
|
|
@ -0,0 +1,97 @@
|
||||||
|
require "uri"
|
||||||
|
require "halite"
|
||||||
|
|
||||||
|
require "./page/content_types"
|
||||||
|
require "./page/cookies"
|
||||||
|
require "./page/html"
|
||||||
|
require "./page/status_codes"
|
||||||
|
|
||||||
|
require "./document/html"
|
||||||
|
|
||||||
|
module Arachnid
  # Represents a page requested from a website: the request URL, the
  # `Halite::Response`, and (via the included modules) helpers for
  # content types, cookies, HTML links, and status codes.
  class Page
    include Page::ContentTypes
    include Page::Cookies
    include Page::HTML
    include Page::StatusCodes

    # URL of the page
    getter url : URI

    # HTTP response
    getter response : Halite::Response

    # Headers returned with the body
    getter headers : HTTP::Headers

    # Lazily-populated parsed document: `Document::HTML` for HTML
    # pages, `XML::Node` for XML-ish pages, `nil` until `#doc` has
    # successfully parsed the body.
    @doc : (Document::HTML | XML::Node)?

    # Expose the parsed document's query methods directly on the page.
    delegate xpath, xpath_node, xpath_nodes, xpath_bool, xpath_float, xpath_string,
      root, at_tag, where_tag, where_class, at_id, css, at_css, to: @doc

    # Unknown methods fall through to the response headers,
    # e.g. `page["Content-Length"]`.
    forward_missing_to @headers

    # Creates a new `Page` object.
    def initialize(url : URI, response : Halite::Response)
      @url = url
      @response = response
      @headers = response.headers
    end

    # The body of the response ("" when the body is empty/absent).
    def body
      @response.body || ""
    end

    # Returns a parsed document for HTML, XML, RSS, and Atom pages.
    # Returns `nil` for empty bodies, unrecognized content types, or
    # parse failures (parse errors are swallowed). Memoized in `@doc`.
    def doc
      unless body.empty?
        # Pick a parser class based on the Content-Type predicates.
        doc_class = if html?
                      Document::HTML
                    elsif rss? || atom? || xml? || xsl?
                      XML
                    end

        if doc_class
          begin
            @doc ||= doc_class.parse(body)
          rescue
            # Intentionally swallowed: an unparseable body yields nil.
          end
        end
      end
    end

    # Searches the document for XPath or CSS paths; returns an empty
    # node array when the page has no parseable document.
    def search(path)
      if document = doc
        document.xpath_nodes(path)
      else
        [] of XML::Node
      end
    end

    # Searches for the first occurrence of an XPath or CSS path,
    # or `nil` when the page has no parseable document.
    def at(path)
      if document = doc
        document.xpath_node(path)
      end
    end

    # Alias for `#search`.
    def /(path)
      search(path)
    end

    # Alias for `#at`.
    def %(path)
      at(path)
    end

    # Size of the response body in bytes.
    def size
      @response.body.bytesize
    end

    # NOTE(review): this overrides the zero-arg `to_s`, not
    # `to_s(io : IO)`; string interpolation builds via `to_s(io)` and
    # may not pick this up — confirm intended.
    def to_s
      body
    end
  end
end
|
|
@ -0,0 +1,162 @@
|
||||||
|
module Arachnid
  class Page
    # Content-Type inspection helpers mixed into `Page`.
    # All predicates match against the `Content-Type` header values,
    # ignoring any `;`-delimited parameters (charset, etc.).
    module ContentTypes
      # The Content-Type of the page ("" when the header is absent).
      def content_type
        @response.content_type || ""
      end

      # All `Content-Type` header values of the page.
      def content_types
        # Previously assigned to an unused local; return directly.
        @response.headers.get?("content-type") || [] of String
      end

      # The charset included in the Content-Type, or `nil` when no
      # content-type value carries a `charset=` parameter.
      def content_charset
        content_types.each do |value|
          if value.includes?(";")
            value.split(";").each do |param|
              # Crystal strings are immutable — there is no `String#strip!`
              # (the original called it); rebind the stripped copy instead.
              param = param.strip

              if param.starts_with?("charset=")
                return param.split("=", 2).last
              end
            end
          end
        end

        nil
      end

      # Determines if any of the content-types of the page include a given
      # type (exact string equality, or regex match when given a `Regex`).
      def is_content_type?(type : String | Regex)
        content_types.any? do |value|
          # Drop parameters such as "; charset=utf-8" before comparing.
          value = value.split(";", 2).first

          if type.is_a?(Regex)
            value =~ type
          else
            value == type
          end
        end
      end

      # Determines if the page is plain-text.
      def plain_text?
        is_content_type?("text/plain")
      end

      # ditto
      def text?
        plain_text?
      end

      # Determines if the page is a Directory Listing.
      def directory?
        is_content_type?("text/directory")
      end

      # Determines if the page is an HTML document.
      def html?
        is_content_type?("text/html")
      end

      # Determines if the page is an XML document.
      def xml?
        is_content_type?(/(text|application)\/xml/)
      end

      # Determines if the page is an XML Stylesheet (XSL).
      def xsl?
        is_content_type?("text/xsl")
      end

      # Determines if the page is JavaScript.
      def javascript?
        is_content_type?(/(text|application)\/javascript/)
      end

      # Determines if the page is JSON.
      def json?
        is_content_type?("application/json")
      end

      # Determines if the page is a CSS stylesheet.
      def css?
        is_content_type?("text/css")
      end

      # Determines if the page is an RSS feed.
      def rss?
        is_content_type?(/application\/(rss\+xml|rdf\+xml)/)
      end

      # Determines if the page is an Atom feed.
      def atom?
        is_content_type?("application/atom+xml")
      end

      # Determines if the page is an MS Word document.
      def ms_word?
        is_content_type?("application/msword")
      end

      # Determines if the page is a PDF document.
      def pdf?
        is_content_type?("application/pdf")
      end

      # Determines if the page is a ZIP archive.
      def zip?
        is_content_type?("application/zip")
      end

      # Determines if the page is an image.
      def image?
        is_content_type?(/image\//)
      end

      # Determines if the page is a PNG image.
      def png?
        is_content_type?("image/png")
      end

      # Determines if the page is a GIF image.
      def gif?
        is_content_type?("image/gif")
      end

      # Determines if the page is a JPEG image.
      def jpg?
        is_content_type?(/image\/(jpg|jpeg)/)
      end

      # Determines if the page is an SVG image.
      def svg?
        is_content_type?(/image\/svg(\+xml)?/)
      end

      # Determines if the page is a video.
      def video?
        is_content_type?(/video\/.*/)
      end

      # Determines if the page is an MP4 video.
      def mp4?
        is_content_type?("video/mp4")
      end

      # Determines if the page is an AVI video.
      def avi?
        is_content_type?("video/x-msvideo")
      end

      # Determines if the page is a WMV video.
      def wmv?
        is_content_type?("video/x-ms-wmv")
      end

      # Determines if the page is a QuickTime video.
      def quicktime?
        is_content_type?("video/quicktime")
      end

      # Determines if the page is Flash content.
      def flash?
        is_content_type?("video/flash") ||
          is_content_type?("application/x-shockwave-flash")
      end
    end
  end
end
|
|
@ -0,0 +1,18 @@
|
||||||
|
module Arachnid
  class Page
    # Access to the cookies delivered along with a page.
    module Cookies
      # Reserved names used within Cookie strings
      RESERVED_COOKIE_NAMES = Regex.new("^(?:Path|Expires|Domain|Secure|HTTPOnly)$", :ignore_case)

      # The raw `Set-Cookie` header string sent along with the page,
      # or "" when the response carried no such header.
      def cookie
        @response.headers.fetch("Set-Cookie", "")
      end

      # The parsed cookie values sent along with the page.
      def cookies
        @response.cookies
      end
    end
  end
end
|
|
@ -0,0 +1,204 @@
|
||||||
|
require "../extensions/uri"
|
||||||
|
|
||||||
|
module Arachnid
  class Page
    # Link- and URL-extraction helpers for HTML pages.
    #
    # TODO: Create enumerable methods for the methods that take a block
    module HTML
      # include Enumerable

      # The title of the HTML page, or `nil` when it has none.
      def title
        if (node = at("//title"))
          node.inner_text
        end
      end

      # Enumerates over the meta-redirect links in the page.
      def each_meta_redirect(&block : URI ->)
        if html? && doc
          search("//meta[@http-equiv and @content]").each do |node|
            if node["http-equiv"] =~ /refresh/i
              content = node["content"]

              if (redirect = content.match(/url=(\S+)$/))
                yield URI.parse(redirect[1])
              end
            end
          end
        end
      end

      # Returns a boolean indicating whether or not page-level meta
      # redirects are present in this page.
      def meta_redirect?
        !meta_redirects.empty?
      end

      # The meta-redirect links of the page.
      def meta_redirects
        redirects = [] of URI
        each_meta_redirect { |r| redirects << r }
        redirects
      end

      # Enumerates over every HTTP or meta-redirect link in the page.
      def each_redirect(&block : URI ->)
        if (locations = @response.headers.get?("Location"))
          # Location headers override any meta-refresh redirects in the
          # HTML. (The original parsed these URIs but never yielded them.)
          locations.each { |l| block.call URI.parse(l) }
        else
          # check page-level meta redirects if there isn't a location header
          each_meta_redirect(&block)
        end
      end

      # URLs that this document redirects to.
      def redirects_to
        # `each_redirect` requires a block, so collect explicitly
        # (the original called `each_redirect.to_a`).
        redirects = [] of URI
        each_redirect { |r| redirects << r }
        redirects
      end

      # Enumerates over every `mailto:` address in the page
      # (yields the address with the "mailto:" prefix stripped).
      def each_mailto(&block)
        # Bind `doc` to a local so the nilable type is narrowed.
        if html? && (d = doc)
          d.xpath_nodes("//a[starts-with(@href,'mailto:')]").each do |a|
            yield a["href"][7..-1]
          end
        end
      end

      # `mailto:` addresses in the page.
      def mailtos
        # Collect via block; `each_mailto.to_a` was not callable.
        result = [] of String
        each_mailto { |m| result << m }
        result
      end

      # Enumerates over every link in the page: redirects, images,
      # scripts, linked resources, anchors, frames and iframes.
      def each_link(&block : URI ->)
        each_redirect(&block) if redirect?

        each_image(&block)

        each_script(&block)

        each_resource(&block)

        if html? && (d = doc)
          d.xpath_nodes("//a[@href]").each do |a|
            link = to_absolute(a["href"])
            block.call link if link
          end

          d.xpath_nodes("//frame[@src]").each do |frame|
            link = to_absolute(frame["src"])
            block.call link if link
          end

          d.xpath_nodes("//iframe[@src]").each do |iframe|
            link = to_absolute(iframe["src"])
            block.call link if link
          end
        end
      end

      # Enumerates over every external script URL in the page.
      def each_script(&block : URI ->)
        if html? && (d = doc)
          d.xpath_nodes("//script[@src]").each do |script|
            url = to_absolute(script["src"])
            yield url if url
          end
        end
      end

      # Enumerates over every linked resource (`<link href=...>`).
      #
      # NOTE(review): unlike the other enumerators this yields the raw
      # parsed href without `to_absolute` — confirm intended.
      def each_resource(&block : URI ->)
        if html? && (d = doc)
          d.xpath_nodes("//link[@href]").each do |link|
            yield URI.parse(link["href"])
          end
        end
      end

      # Enumerates over every image URL in the page (src and srcset).
      def each_image(&block : URI ->)
        if html? && (d = doc)
          d.xpath_nodes("//img[@src]").each do |img|
            url = to_absolute(img["src"])
            yield url if url
          end

          d.xpath_nodes("//img[@srcset]").each do |set|
            # srcset entries alternate "url descriptor"; keep the URLs
            # (even-indexed space-separated tokens).
            sources = set["srcset"].split(" ").map_with_index { |e, i| (i.zero? || i.even?) ? e : nil }.compact
            sources.each do |source|
              url = to_absolute(source)
              yield url if url
            end
          end
        end
      end

      # Enumerates over every video URL in the page (src and <source>).
      def each_video(&block : URI ->)
        if html? && (d = doc)
          d.xpath_nodes("//video[@src]").each do |video|
            url = to_absolute(video["src"])
            yield url if url
          end

          d.xpath_nodes("//video/source[@src]").each do |source|
            url = to_absolute(source["src"])
            yield url if url
          end
        end
      end

      # The links from within the page.
      def links
        links = [] of URI
        each_link { |link| links << link }
        links
      end

      # Enumerates over every absolute URL in the page.
      def each_url(&block : URI ->)
        # The original passed `&block` AND a literal block to
        # `each_link`, which is invalid; just pass the literal block.
        each_link do |link|
          if (u = to_absolute(link))
            yield u
          end
        end
      end

      # ditto
      def each(&block)
        each_url { |url| yield url }
      end

      # Absolute URIs from within the page.
      def urls
        urls = [] of URI
        # The original pushed an undefined `link` here.
        each_url { |url| urls << url }
        urls
      end

      # Normalizes and expands a given link into a proper absolute URI
      # based on this page's URL. Returns `nil` when merging fails.
      def to_absolute(link)
        link = link.is_a?(URI) ? link : URI.parse(link)

        new_url = begin
          url.merge(link)
        rescue Exception
          return
        end

        if (!new_url.opaque?) && (path = new_url.path)
          # ensure that paths begin with a leading '/' for URI::FTP
          if (new_url.scheme == "ftp" && !path.starts_with?("/"))
            # String#insert returns a new string (strings are
            # immutable); the original discarded the result.
            path = path.insert(0, "/")
          end

          # make sure the path does not contain any .. or . directories,
          # since URI::Generic#merge cannot normalize paths such as
          # "/stuff/../"
          new_url.path = URI.expand_path(path)
        end

        new_url
      end
    end
  end
end
|
|
@ -0,0 +1,59 @@
|
||||||
|
module Arachnid
  class Page
    # Predicates over the HTTP status code of a page's response.
    module StatusCodes
      # The numeric response code of the page.
      def code
        @response.status_code.to_i
      end

      # Whether the response code is `200`.
      def ok?
        code == 200
      end

      # Whether the response code is `308`.
      #
      # NOTE(review): 308 is "Permanent Redirect", which is an odd code
      # for a timeout predicate — confirm intended.
      def timedout?
        code == 308
      end

      # Whether the response code is `400`.
      def bad_request?
        code == 400
      end

      # Whether the response code is `401`.
      def unauthorized?
        code == 401
      end

      # Whether the response code is `403`.
      def forbidden?
        code == 403
      end

      # Whether the response code is `404`.
      def missing?
        code == 404
      end

      # Whether the response code is `500`.
      def had_internal_server_error?
        code == 500
      end

      # Whether the response is a redirect: status `300`..`303` or
      # `307`, or a `200` page carrying a meta-refresh "soft" redirect.
      def redirect?
        return true if (300..303).includes?(code) || code == 307
        code == 200 && meta_redirect?
      end
    end
  end
end
|
|
@ -0,0 +1,231 @@
|
||||||
|
require "uri"
|
||||||
|
|
||||||
|
module Arachnid
  # Parses robots.txt files for the perusal of a single user-agent.
  #
  # The behaviour implemented is guided by the following sources, though
  # as there is no widely accepted standard, it may differ from other
  # implementations:
  #
  # http://www.robotstxt.org/orig.html
  # - the original, now imprecise and outdated version
  # http://www.robotstxt.org/norobots-rfc.txt
  # - a much more precise, outdated version
  # http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449&from=35237
  # - a few hints at modern protocol extensions.
  #
  # This parser only considers lines starting with (case-insensitively:)
  #   Useragent: User-agent: Allow: Disallow: Sitemap:
  #
  # The file is divided into sections of one or more User-agent: lines
  # followed by one or more Allow: or Disallow: rules. The first section
  # whose User-agent: glob matches the robot's user-agent is authoritative;
  # within it, the first Allow:/Disallow: rule that matches the path wins,
  # and no match means the access is allowed. "*" in globs matches any
  # number of any characters; a trailing "$" anchors the glob to the whole
  # path (with or without query string). %-encodings are normalised except
  # for /?=& which carry meaning.
  #
  # TODO: Rework to allow for multiple Robots
  class Robots
    # A single path rule: {glob, allowed?}.
    alias Rule = Tuple(String, Bool)
    # A user-agent glob with its ordered path rules.
    alias RuleSet = Tuple(String, Array(Rule))

    getter body : String

    getter user_agent : String

    getter rules : Array(Tuple(String, Array(Rule)))

    getter sitemaps : Array(String)

    def initialize(@body : String, @user_agent : String)
      @sitemaps = [] of String
      @rules = [] of RuleSet
      parse(@body)
    end

    # Given a URI object, or a string representing one, determine whether
    # this robots.txt would allow access to the path.
    def allowed?(uri)
      # Only parse strings; a URI argument is used as-is
      # (the original unconditionally re-parsed).
      uri = URI.parse(uri) unless uri.is_a?(URI)
      path = (uri.path || "/") + (uri.query ? "?" + uri.query.to_s : "")
      path_allowed?(@user_agent, path)
    end

    # Check whether the relative path (a string of the url's path and query
    # string) is allowed by the rules we have for the given user_agent.
    #
    private def path_allowed?(user_agent, path)
      @rules.each do |(ua_glob, path_globs)|
        if match_ua_glob user_agent, ua_glob
          path_globs.each do |(path_glob, allowed)|
            return allowed if match_path_glob path, path_glob
          end
          # A matching section with no matching rule allows access.
          return true
        end
      end
      # No matching section at all: allowed.
      true
    end

    # This does a case-insensitive substring match such that if the user
    # agent is contained within the glob, or vice-versa, we will match.
    #
    # According to the standard, *s shouldn't appear in the user-agent
    # field except in the case of "*" meaning all user agents. Google
    # however imply that the * will work, at least at the end of a string.
    #
    # For consistency, and because it seems expected behaviour, and because
    # a glob * will match a literal * we use glob matching not string
    # matching.
    #
    # The standard also advocates a substring match of the robot's
    # user-agent within the user-agent field. From observation, it seems
    # much more likely that the match will be the other way about, though
    # we check for both.
    #
    private def match_ua_glob(user_agent, glob)
      glob =~ Regex.new(Regex.escape(user_agent), Regex::Options::IGNORE_CASE) ||
        user_agent =~ Regex.new(reify(glob), Regex::Options::IGNORE_CASE)
    end

    # This does case-sensitive prefix matching, such that if the path
    # starts with the glob, we will match.
    #
    # Asterisks are treated as globs, and a trailing $ forces the glob to
    # match the entire path (with or without the query string).
    #
    # %-encoding variations are handled by normalize_percent_encoding.
    # Returns falsy (via the rescue) when the glob builds an invalid regex.
    #
    private def match_path_glob(path, glob)
      if glob =~ /\$$/
        # "\\?" keeps a literal backslash-? in the regex source; the
        # original "\?" is an invalid string escape in Crystal.
        end_marker = "(?:\\?|$)"
        glob = glob.gsub /\$$/, ""
      else
        end_marker = ""
      end

      glob = normalize_percent_encoding(glob)
      path = normalize_percent_encoding(path)

      path =~ Regex.new("^" + reify(glob) + end_marker)
    rescue e
      false
    end

    # As a general rule, we want to ignore different representations of
    # the same URL. Naively we could just unescape, or escape, everything,
    # however the standard implies that a / is a HTTP path separator,
    # while a %2F is an encoded / that does not act as a path separator.
    # Similar issues with ?, & and =, though all other characters are
    # fine. (While : also has a special meaning in HTTP, most
    # implementations ignore this in the path.)
    #
    # %-encoding is case-insensitive, so we explicitly upcase the few
    # that we want to keep.
    #
    private def normalize_percent_encoding(path)
      # First double-escape any characters we don't want to unescape:
      # & / = ?
      path = path.gsub(/%(26|2F|3D|3F)/i) do |code|
        "%25#{code.upcase}"
      end

      # NOTE(review): URI.unescape is deprecated in newer Crystal in
      # favour of URI.decode — update when bumping the Crystal version.
      URI.unescape(path)
    end

    # Convert the asterisks in a glob into (.*)s for regular expressions,
    # and at the same time, escape any other characters that would have
    # a significance in a regex.
    #
    private def reify(glob)
      glob.split("*").map { |part| Regex.escape(part) }.join(".*")
    end

    # Convert the @body into a set of @rules so that our parsing mechanism
    # becomes easier.
    #
    # @rules is an array of pairs. The first in the pair is the glob for
    # the user-agent and the second another array of pairs. The first of
    # the new pair is a glob for the path, and the second whether it
    # appears in an Allow: or a Disallow: rule.
    #
    # For example:
    #
    #   User-agent: *
    #   Disallow: /secret/
    #   Allow: /     # allow everything...
    #
    # parses to:
    #
    #   @rules = [{"*", [{"/secret/", false}, {"/", true}]}]
    #
    # Rule order from the file is preserved. A blank Disallow: is treated
    # as an Allow: * and consecutive User-agent lines share one rule set.
    #
    private def parse(body)
      # Parser state must persist across lines so that consecutive
      # User-agent lines can share a rule set (the original reset it
      # inside the loop, making the :user_agent branch unreachable).
      parser_mode = :begin

      body.split(/[\r\n]+/).each do |line|
        parts = line.delete("\000").split(":", 2).map(&.strip)
        # Skip blank or malformed lines; unguarded destructuring of a
        # one-element array would raise.
        next unless parts.size == 2
        prefix, value = parts
        # Strip trailing comments.
        value = value.sub /\s+#.*/, ""

        case prefix.downcase
        when /^user-?agent$/
          if parser_mode == :user_agent
            # Consecutive User-agent lines share the same rule set.
            @rules << {value, rules.last[1]}
          else
            parser_mode = :user_agent
            @rules << {value, [] of Rule}
          end
        when "disallow"
          parser_mode = :rules
          @rules << {"*", [] of Rule} if @rules.empty?

          if value == ""
            # A blank Disallow: means "allow everything".
            @rules.last[1] << {"*", true}
          else
            @rules.last[1] << {value, false}
          end
        when "allow"
          parser_mode = :rules
          @rules << {"*", [] of Rule} if @rules.empty?
          @rules.last[1] << {value, true}
        when "sitemap"
          @sitemaps << value
        else
          # Ignore comments, Crawl-delay: and badly formed lines.
        end
      end
    end
  end
end
|
|
@ -0,0 +1,53 @@
|
||||||
|
module Arachnid
  # The `Rules` class represents collections of acceptance and rejection
  # rules, which are used to filter data.
  class Rules(T)
    # Accept rules
    getter accept : Array(Proc(T | Nil, Bool) | T | Regex | String)

    # Reject rules
    getter reject : Array(Proc(T | Nil, Bool) | T | Regex | String)

    # Creates a new `Rules` object; `nil` rule lists become empty arrays.
    def initialize(accept : Array(Proc(T | Nil, Bool) | T | Regex | String)? = nil, reject : Array(Proc(T | Nil, Bool) | T | Regex | String)? = nil)
      @accept = accept || ([] of Proc(T | Nil, Bool) | T | Regex | String)
      @reject = reject || ([] of Proc(T | Nil, Bool) | T | Regex | String)
    end

    # Determines whether the data should be accepted.
    #
    # With no rules at all, everything is accepted. When accept rules
    # exist they take precedence: data must match one of them. Otherwise
    # data is accepted unless a reject rule matches.
    def accept?(data : T)
      return true if @accept.empty? && @reject.empty?
      return @accept.any? { |rule| test_data(data, rule) } unless @accept.empty?
      @reject.none? { |rule| test_data(data, rule) }
    end

    # Replaces the accept rules (`nil` clears them).
    def accept=(value)
      @accept = value || ([] of Proc(T | Nil, Bool) | T | Regex | String)
    end

    # Determines whether the data should be rejected.
    def reject?(data : T)
      !accept?(data)
    end

    # Replaces the reject rules (`nil` clears them).
    def reject=(value)
      @reject = value || ([] of Proc(T | Nil, Bool) | T | Regex | String)
    end

    # Tests the given data against a single rule: a predicate proc,
    # a regex over the data's string form, or an equality check.
    private def test_data(data : T, rule)
      case rule
      when Proc
        rule.call(data) == true
      when Regex
        data.to_s.matches?(rule)
      else
        data == rule
      end
    end
  end
end
|
|
@ -0,0 +1,112 @@
|
||||||
|
require "uri"
|
||||||
|
require "halite"
|
||||||
|
|
||||||
|
module Arachnid
  # Stores active HTTP sessions (Halite clients) organized by scheme,
  # host-name and port.
  class SessionCache
    # Optional read timeout.
    property read_timeout : Int32

    # Optional connect timeout.
    property connect_timeout : Int32

    # Max redirects to follow.
    property max_redirects : Int32?

    # Should we set a DNT (Do Not Track) header?
    property? do_not_track : Bool

    # Active sessions, keyed by {scheme, host, port}.
    @sessions = {} of Tuple(String?, String?, Int32?) => Halite::Client

    # Create a new session cache. `nil` options fall back to the
    # module-level defaults on `Arachnid`.
    #
    # NOTE(review): `follow_redirects` is accepted but currently unused;
    # kept for interface compatibility.
    def initialize(
      read_timeout : Int32? = nil,
      connect_timeout : Int32? = nil,
      follow_redirects : Bool? = nil,
      max_redirects : Int32? = nil,
      do_not_track : Bool? = nil
    )
      @read_timeout = read_timeout || Arachnid.read_timeout
      @connect_timeout = connect_timeout || Arachnid.connect_timeout
      @max_redirects = max_redirects || Arachnid.max_redirects
      # Explicit nil-check so a caller passing `false` is not silently
      # overridden by the global default (as `||` would do).
      @do_not_track = do_not_track.nil? ? Arachnid.do_not_track? : do_not_track
    end

    # Determines if there is an active session for the given URL.
    def active?(url)
      # normalize the url
      url = URI.parse(url) unless url.is_a?(URI)

      @sessions.has_key?(key_for(url))
    end

    # Provides an active session for a given URL, creating one on the
    # first request to a given {scheme, host, port}.
    def [](url)
      # normalize the url
      url = URI.parse(url) unless url.is_a?(URI)

      # session key
      key = key_for(url)

      unless @sessions.has_key?(key)
        # normalize the endpoint (only needed on a cache miss)
        endpoint = url.dup
        endpoint.scheme ||= "http"
        endpoint.query = nil
        endpoint.fragment = nil
        endpoint.path = ""

        # DNT header: "1"/"0" — header values are strings on the wire.
        headers = {
          "DNT" => @do_not_track ? "1" : "0",
        }

        session = Halite::Client.new(
          endpoint: endpoint,
          timeout: Halite::Timeout.new(
            connect: @connect_timeout,
            read: @read_timeout
          ),
          follow: Halite::Follow.new(
            hops: @max_redirects,
            strict: false
          ),
          headers: headers,
        )

        # session = session.logging(skip_request_body: true, skip_response_body: true)

        @sessions[key] = session
      end

      @sessions[key]
    end

    # Destroys the HTTP session (if any) for the given URL's scheme,
    # host, and port.
    def kill!(url)
      # normalize the url
      url = URI.parse(url) unless url.is_a?(URI)

      # Hash#delete is a no-op for absent keys; the original looked the
      # key up with `@sessions[key]`, which raises KeyError on a miss.
      @sessions.delete(key_for(url))
    end

    # Clears the session cache.
    def clear
      @sessions.clear
    end

    # Creates a session key based on the URL.
    private def key_for(url)
      {url.scheme, url.host, url.port}
    end
  end
end
|
|
@ -0,0 +1,3 @@
|
||||||
|
module Arachnid
  # Library version string. Keep in sync with shard.yml.
  VERSION = "0.1.0"
end
|
Loading…
Reference in New Issue