From 9b82f6b48a3f05586d7e148af448c8e9b377d003 Mon Sep 17 00:00:00 2001 From: Chris Watson Date: Wed, 26 Jun 2019 02:45:03 -0700 Subject: [PATCH] Initial commit --- .editorconfig | 9 + .gitignore | 9 + .travis.yml | 6 + LICENSE | 21 ++ README.md | 95 +++++ shard.yml | 17 + spec/crepe_spec.cr | 9 + spec/spec_helper.cr | 2 + src/arachnid.cr | 32 ++ src/arachnid/agent.cr | 543 +++++++++++++++++++++++++++++ src/arachnid/agent/actions.cr | 53 +++ src/arachnid/agent/events.cr | 248 +++++++++++++ src/arachnid/agent/filters.cr | 256 ++++++++++++++ src/arachnid/agent/robots.cr | 20 ++ src/arachnid/agent/sanitizers.cr | 21 ++ src/arachnid/arachnid.cr | 39 +++ src/arachnid/auth_credential.cr | 4 + src/arachnid/auth_store.cr | 83 +++++ src/arachnid/cookie_jar.cr | 118 +++++++ src/arachnid/document/html.cr | 196 +++++++++++ src/arachnid/extensions/uri.cr | 175 ++++++++++ src/arachnid/page.cr | 97 ++++++ src/arachnid/page/content_types.cr | 162 +++++++++ src/arachnid/page/cookies.cr | 18 + src/arachnid/page/html.cr | 204 +++++++++++ src/arachnid/page/status_codes.cr | 59 ++++ src/arachnid/robots.cr | 231 ++++++++++++ src/arachnid/rules.cr | 53 +++ src/arachnid/session_cache.cr | 112 ++++++ src/arachnid/version.cr | 3 + 30 files changed, 2895 insertions(+) create mode 100644 .editorconfig create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 LICENSE create mode 100644 README.md create mode 100644 shard.yml create mode 100644 spec/crepe_spec.cr create mode 100644 spec/spec_helper.cr create mode 100644 src/arachnid.cr create mode 100644 src/arachnid/agent.cr create mode 100644 src/arachnid/agent/actions.cr create mode 100644 src/arachnid/agent/events.cr create mode 100644 src/arachnid/agent/filters.cr create mode 100644 src/arachnid/agent/robots.cr create mode 100644 src/arachnid/agent/sanitizers.cr create mode 100644 src/arachnid/arachnid.cr create mode 100644 src/arachnid/auth_credential.cr create mode 100644 src/arachnid/auth_store.cr create mode 100644 src/arachnid/cookie_jar.cr create mode 100644 src/arachnid/document/html.cr create mode 100644 src/arachnid/extensions/uri.cr create mode 100644 src/arachnid/page.cr create mode 100644 src/arachnid/page/content_types.cr create mode 100644 src/arachnid/page/cookies.cr create mode 100644 src/arachnid/page/html.cr create mode 100644 src/arachnid/page/status_codes.cr create mode 100644 src/arachnid/robots.cr create mode 100644 src/arachnid/rules.cr create mode 100644 src/arachnid/session_cache.cr create mode 100644 src/arachnid/version.cr diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..163eb75 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,9 @@ +root = true + +[*.cr] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +indent_style = space +indent_size = 2 +trim_trailing_whitespace = true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0bbd4a9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +/docs/ +/lib/ +/bin/ +/.shards/ +*.dwarf + +# Libraries don't need dependency lock +# Dependencies will be locked in applications that use them +/shard.lock diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..765f0e9 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,6 @@ +language: crystal + +# Uncomment the following if you'd like Travis to run specs and check code formatting +# script: +# - crystal spec +# - crystal tool format --check diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..21f35ce --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT 
License (MIT) + +Copyright (c) 2019 Chris Watson + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..193f113 --- /dev/null +++ b/README.md @@ -0,0 +1,95 @@ +# Arachnid + +Arachnid is a fast and powerful web scraping framework for Crystal. It provides an easy to use DSL for scraping webpages and processing all of the things you might come across. + +## Installation + +1. Add the dependency to your `shard.yml`: + + ```yaml + dependencies: + arachnid: + github: watzon/arachnid + ``` + +2. Run `shards install` + +## Usage + +Arachnid provides an easy to use, powerful DSL for scraping websites. + +```crystal +require "arachnid" +require "json" + +# Let's build a sitemap of crystal-lang.org +# Links will be a hash of url to page title +links = {} of String => String + +# Visit a particular host, in this case `crystal-lang.org`. This will +# not match on subdomains. +Arachnid.host("https://crystal-lang.org") do |spider| + # Ignore the API secion. It's a little big. + spider.ignore_urls_like(/.*\/api.*/) + + spider.every_page do |page| + puts "Visiting #{page.url.to_s}" + + # Ignore redirects for our sitemap + unless page.redirect? + # Add the url of every visited page to our sitemap + links[page.url.to_s] = page.title.to_s.strip + end + end +end + +File.write("crystal-lang.org-sitemap.json", links.to_pretty_json) +``` + +Want to scan external links as well? + +```crystal +# To make things interesting, this time let's download +# every image we find. +Arachnid.start_at("https://crystal-lang.org") do |spider| + # Set a base path to store all the images at + base_image_dir = File.expand_path("~/Pictures/arachnid") + Dir.mkdir_p(base_image_dir) + + spider.every_page do |page| + puts "Scanning #{page.url.to_s}" + + if page.image? + # Since we're going to be saving a lot of images + # let's spawn a new fiber for each one. This + # makes things so much faster. + spawn do + # Output directory for images for this host + directory = File.join(base_image_dir, page.url.host.to_s) + Dir.mkdir_p(directory) + + # The name of the image + filename = File.basename(page.url.path) + + # Save the image using the body of the page + puts "Saving #{filename} to #{directory}" + File.write(File.join(directory, filename), page.body) + end + end + end +end +``` + +More documentation will be coming soon! + +## Contributing + +1. Fork it () +2. Create your feature branch (`git checkout -b my-new-feature`) +3. Commit your changes (`git commit -am 'Add some feature'`) +4. 
Push to the branch (`git push origin my-new-feature`) +5. Create a new Pull Request + +## Contributors + +- [Chris Watson](https://github.com/watzon) - creator and maintainer diff --git a/shard.yml b/shard.yml new file mode 100644 index 0000000..d15e104 --- /dev/null +++ b/shard.yml @@ -0,0 +1,17 @@ +name: arachnid +version: 0.1.0 + +authors: + - Chris Watson + +dependencies: + halite: + github: icyleaf/halite + version: ~> 0.10.1 + crystagiri: + github: madeindjs/crystagiri + branch: master + +crystal: 0.29.0 + +license: MIT diff --git a/spec/crepe_spec.cr b/spec/crepe_spec.cr new file mode 100644 index 0000000..7f2574a --- /dev/null +++ b/spec/crepe_spec.cr @@ -0,0 +1,9 @@ +require "./spec_helper" + +describe Arachnid do + # TODO: Write tests + + it "works" do + false.should eq(true) + end +end diff --git a/spec/spec_helper.cr b/spec/spec_helper.cr new file mode 100644 index 0000000..3421525 --- /dev/null +++ b/spec/spec_helper.cr @@ -0,0 +1,2 @@ +require "spec" +require "../src/arachnid" diff --git a/src/arachnid.cr b/src/arachnid.cr new file mode 100644 index 0000000..39fc76f --- /dev/null +++ b/src/arachnid.cr @@ -0,0 +1,32 @@ +require "./arachnid/version" +require "./arachnid/arachnid" + +# To make things interesting, this time let's download +# every image we find. +Arachnid.start_at("https://crystal-lang.org") do |spider| + # Set a base path to store all the images at + base_image_dir = File.expand_path("~/Pictures/arachnid") + Dir.mkdir_p(base_image_dir) + + spider.every_page do |page| + puts "Scanning #{page.url.to_s}" + + if page.image? + # Since we're going to be saving a lot of images + # let's spawn a new fiber for each one. This + # makes things so much faster. + spawn do + # Output directory for images for this host + directory = File.join(base_image_dir, page.url.host.to_s) + Dir.mkdir_p(directory) + + # The name of the image + filename = File.basename(page.url.path) + + # Save the image using the body of the page + puts "Saving #{filename} to #{directory}" + File.write(File.join(directory, filename), page.body) + end + end + end +end diff --git a/src/arachnid/agent.cr b/src/arachnid/agent.cr new file mode 100644 index 0000000..c3c8736 --- /dev/null +++ b/src/arachnid/agent.cr @@ -0,0 +1,543 @@ +require "./agent/sanitizers" +require "./agent/filters" +require "./agent/events" +require "./agent/actions" +require "./agent/robots" +require "./page" +require "./session_cache" +require "./cookie_jar" +require "./auth_store" + +module Arachnid + class Agent + + getter? running : Bool + + # Set to limit to a single host. + property host : String? + + # User agent to use. + property user_agent : String + + # HTTP Hoes Header to use. + property host_header : String? + + # HTTP Host Headers to use for specific hosts. + property host_headers : Hash(String | Regex, String) + + # HTTP Headers to use for every request. + property default_headers : Hash(String, String) + + # HTTP Authentication credentials. + property authorized : AuthStore + + # Referer to use. + property referer : String? + + # Delay in between fetching pages. + property fetch_delay : Time::Span | Int32 + + # History containing visited URLs. + getter history : Set(URI) + + # List of unreachable URIs. + getter failures : Set(URI) + + # Queue of URLs to visit. + getter queue : Array(URI) + + # The session cache. + property sessions : SessionCache + + # Cached cookies. + property cookies : CookieJar + + # Maximum number of pages to visit. + property limit : Int32? + + # Maximum depth. + property max_depth : Int32? 
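+
+    # A minimal usage sketch combining the two limits above
+    # (`example.com` is a placeholder host):
+    #
+    #     agent = Arachnid::Agent.new(limit: 100, max_depth: 3)
+    #     agent.start_at("https://example.com")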
+ + # The visited URLs and their depth within a site. + property levels : Hash(URI, Int32) + + # Creates a new `Agent` object. + def initialize( + host : String? = nil, + read_timeout : Int32? = nil, + connect_timeout : Int32? = nil, + follow_redirects : Bool? = nil, + max_redirects : Int32? = nil, + do_not_track : Bool? = nil, + default_headers : Hash(String, String)? = nil, + host_header : String? = nil, + host_headers : Hash(String | Regex, String)? = nil, + user_agent : String? = nil, + referer : String? = nil, + fetch_delay : (Int32 | Time::Span)? = nil, + queue : Set(URI)? = nil, + history : Set(URI)? = nil, + limit : Int32? = nil, + max_depth : Int32? = nil, + robots : Bool? = nil, + filter_options = nil + ) + @host = host + + @host_header = host_header + @host_headers = host_headers || {} of (Regex | String) => String + @default_headers = default_headers || {} of String => String + + @user_agent = user_agent || Arachnid.user_agent + @referer = referer + + @running = false + @fetch_delay = fetch_delay || 0 + @history = history || Set(URI).new + @failures = Set(URI).new + @queue = queue || [] of URI + + @limit = limit + @levels = {} of URI => Int32 + @max_depth = max_depth + + @sessions = SessionCache.new( + read_timeout, + connect_timeout, + follow_redirects, + max_redirects, + do_not_track + ) + + @cookies = CookieJar.new + @authorized = AuthStore.new + + if filter_options + initialize_filters(**filter_options) + else + initialize_filters + end + + initialize_robots if robots || Arachnid.robots? + end + + # Create a new scoped `Agent` in a block. + def self.new(**options, &block : Agent ->) + _new = new(**options) + with _new yield _new + _new + end + + # Creates a new `Agent` and begins spidering at the given URL. + def self.start_at(url, **options, &block : Agent ->) + agent = new(**options, &block) + agent.start_at(url, force: true) + end + + # Creates a new `Agent` and spiders the web site located + # at the given URL. + def self.site(url, **options, &block : Agent ->) + url = url.is_a?(URI) ? url : URI.parse(url) + url_regex = Regex.new(Regex.escape(url.host.to_s)) + + agent = new(**options, &block) + agent.visit_hosts_like(url_regex) + + agent.start_at(url, force: true) + end + + # Creates a new `Agent` and spiders the given host. + def self.host(url, **options, &block : Agent ->) + url = url.is_a?(URI) ? url : URI.parse(url) + + options = options.merge(host: url.host) + agent = new(**options, &block) + + agent.start_at(url, force: true) + end + + # Clears the history of the `Agent`. + def clear + @queue.clear + @history.clear + @failures.clear + self + end + + # Start spidering at a given URL. + # def start_at(url, &block : Page ->) + # enqueue(url) + # run(&block) + # end + + # Start spidering at a given URL. + def start_at(url, force = false) + enqueue(url, force: force) + return run + end + + # Start spidering until the queue becomes empty or the + # agent is paused. + # def run(&block : Page ->) + # @running = true + + # until @queue.empty? || paused? || limit_reached? + # begin + # visit_page(dequeue, &block) + # rescue Actions::Paused + # return self + # rescue Actions::Action + # end + # end + + # @running = false + # @sessions.clear + # self + # end + + # Start spidering until the queue becomes empty or the + # agent is paused. + def run + @running = true + + until @queue.empty? || paused? || limit_reached? || !running? 
+ begin + visit_page(dequeue) + rescue Actions::Paused + return self + rescue Actions::Action + end + end + + @running = false + @sessions.clear + self + end + + # Sets the history of URLs that were previously visited. + def history=(new_history) + @history.clear + + new_history.each do |url| + @history << url.is_a?(URI) ? url : URI.parse(url) + end + + @history + end + + # Specifies the links which have been visited. + def visited_links + @history.map(&.to_s) + end + + # Specifies the hosts which have been visited. + def visited_hosts + history.map(&.host) + end + + # Determines whether a URL was visited or not. + def visited?(url) + url = url.is_a?(URI) ? url : URI.parse(url) + @history.includes?(url) + end + + # Sets the list of failed URLs. + def failures=(new_failures) + @failures.clear + + new_failures.each do |url| + @failures << url.is_a?(URI) ? url : URI.parse(url) + end + + @failures + end + + # Determines whether a given URL could not be visited. + def failed?(url) + url = url.is_a?(URI) ? url : URI.parse(url) + @failures.includes?(url) + end + + # Sets the queue of URLs to visit. + # Sets the list of failed URLs. + def queue=(new_queue) + @queue.clear + + new_queue.each do |url| + @queue << url.is_a?(URI) ? url : URI.parse(url) + end + + @queue + end + + # Determines whether the given URL has been queued for visiting. + def queued?(url) + url = url.is_a?(URI) ? url : URI.parse(url) + @queue.includes?(url) + end + + # Enqueues a given URL for visiting, only if it passes all + # of the agent's rules for visiting a given URL. + def enqueue(url, level = 0, force = false) + url = sanitize_url(url) + + if (!queued?(url) && visit?(url)) || force + link = url.to_s + + return if url.host.to_s.empty? + + begin + @every_url_blocks.each { |url_block| url_block.call(url) } + + @every_url_like_blocks.each do |pattern, url_blocks| + match = case pattern + when Regex + link =~ pattern + else + (pattern == link) || (pattern == url) + end + + if match + url_blocks.each { |url_block| url_block.call(url) } + end + end + rescue action : Actions::Paused + raise(action) + rescue Actions::SkipLink + return false + rescue Actions::Action + end + + @queue << url + @levels[url] = level + true + end + end + + # Gets and creates a new `Page` object from a given URL, + # yielding the newly created page. + def get_page(url, &block) + url = url.is_a?(URI) ? url : URI.parse(url) + + prepare_request(url) do |session, path, handlers| + new_page = Page.new(url, session.get(path, headers: handlers)) + + # save any new cookies + @cookies.from_page(new_page) + + yield new_page + return new_page + end + end + + # Gets and creates a new `Page` object from a given URL. + def get_page(url) + url = url.is_a?(URI) ? url : URI.parse(url) + + prepare_request(url) do |session, path, handlers| + new_page = Page.new(url, session.get(path, handlers)) + + # save any new cookies + @cookies.from_page(new_page) + + return new_page + end + end + + # Posts supplied form data and creates a new Page from a given URL, + # yielding the newly created page. + def post_page(url, post_data = "", &block) + url = url.is_a?(URI) ? url : URI.parse(url) + + prepare_request(url) do |session, path, handlers| + new_page = Page.new(url, session.post(path, post_data, handlers)) + + # save any new cookies + @cookies.from_page(new_page) + + yield new_page + return new_page + end + end + + # Posts supplied form data and creates a new Page from a given URL. + def post_page(url, post_data = "") + url = url.is_a?(URI) ? 
url : URI.parse(url) + + prepare_request(url) do |session, path, handlers| + new_page = Page.new(url, session.post(path, post_data, handlers)) + + # save any new cookies + @cookies.from_page(new_page) + + return new_page + end + end + + # Visits a given URL and enqueues the links recovered + # from the page to be visited later. + # def visit_page(url, &block : Page ->) + # url = sanitize_url(url) + + # get_page(url) do |page| + # @history << page.url + + # begin + # @every_page_blocks.each { |page_block| page_block.call(page) } + # yield page + # rescue action : Actions::Paused + # raise(action) + # rescue Actions::SkipPage + # return Nil + # rescue Actions::Action + # end + + # page.each_url do |next_url| + # begin + # @every_link_blocks.each do |link_block| + # link_block.call(page.url, next_url) + # end + # rescue action : Actions::Paused + # raise(action) + # rescue Actions::SkipLink + # next + # rescue Actions::Action + # end + + # if @max_depth.nil? || @max_depth.not_nil! > (@levels[url]? || 0) + # @levels[url] ||= 0 + # enqueue(next_url, @levels[url] + 1) + # end + # end + # end + # end + + # Visits a given URL and enqueues the links recovered + # from the page to be visited later. + def visit_page(url) + url = sanitize_url(url) + + get_page(url) do |page| + @history << page.url + + begin + @every_page_blocks.each { |page_block| page_block.call(page) } + rescue action : Actions::Paused + raise(action) + rescue Actions::SkipPage + return nil + rescue Actions::Action + end + + page.each_url do |next_url| + begin + @every_link_blocks.each do |link_block| + link_block.call(page.url, next_url) + end + rescue action : Actions::Paused + raise(action) + rescue Actions::SkipLink + next + rescue Actions::Action + end + + if @max_depth.nil? || @max_depth.not_nil! > (@levels[url]? || 0) + @levels[url] ||= 0 + enqueue(next_url, @levels[url] + 1) + end + end + end + end + + # Converts the agent into a hash. + def to_h + {"history" => @history, "queue" => @queue} + end + + # Prepares request headers for a given URL. + protected def prepare_request_headers(url) + # set any additional HTTP headers + headers = @default_headers.dup + + unless @host_headers.empty? + @host_headers.each do |name, header| + if url.host =~ name + headers["Host"] = header + break + end + end + end + + headers["Host"] ||= @host_header.to_s if @host_header + headers["User-Agent"] ||= @user_agent.to_s + headers["Referer"] ||= @referer.to_s if @referer + + if authorization = @authorized.for_url(url.host.to_s) + headers["Authorization"] = "Basic #{authorization}" + end + + if header_cookies = @cookies.for_host(url.host.to_s) + headers["Cookie"] = header_cookies.to_cookie_header + end + + headers + end + + # Normalizes the request path and grabs a session to handle + # page get and post requests. + def prepare_request(url, &block) + path = if url.path.empty? + "/" + else + url.path + end + + # append the URL query to the path + path += "?#{url.query}" if url.query + + headers = prepare_request_headers(url) + + begin + sleep(@fetch_delay) if @fetch_delay.to_i > 0 + + yield @sessions[url], path, headers + rescue Halite::Exception::Error | IO::Error | Socket::Error | OpenSSL::SSL::Error + @sessions.kill!(url) + return nil + end + end + + # Dequeues a URL that will later be visited. + def dequeue + @queue.shift + end + + # Determines if the maximum limit has been reached. + def limit_reached? + if limit = @limit + return @history.size >= limit + end + false + end + + # Determines if a given URL should be visited. 
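+    # A URL is only visited if it has not been seen before and it passes
+    # the scheme, host, port, link, URL, and path-extension filter rules.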
+ def visit?(url) + # puts [url.to_s, visited?(url), visit_scheme?(url.scheme.to_s), visit_host?(url.host.to_s), visit_port?(url.port || -1), visit_link?(url.to_s), visit_url?(url), visit_ext?(url.path)] + !visited?(url) && + visit_scheme?(url.scheme.to_s) && + visit_host?(url.host.to_s) && + visit_port?(url.port || -1) && + visit_link?(url.to_s) && + visit_url?(url) && + visit_ext?(url.path) + # robot_allowed?(url.to_s) + end + + # Adds a given URL to the failures list. + def failed(url) + @failures << url + @every_failed_url_blocks.each { |fail_block| fail_block.call(url) } + true + end + end +end diff --git a/src/arachnid/agent/actions.cr b/src/arachnid/agent/actions.cr new file mode 100644 index 0000000..0e03020 --- /dev/null +++ b/src/arachnid/agent/actions.cr @@ -0,0 +1,53 @@ +module Arachnid + class Agent + module Actions + + # A Runtime Error + class RuntimeError < Exception; end + + # The base `Actions` exceptions class + class Action < RuntimeError; end + + # Exception used to pause a running `Agent` + class Paused < Action; end + + # Exception which causes a running `Agent` to skip a link. + class SkipLink < Action; end + + # Exception which caises a running `Agent` to skip a page. + class SkipPage < Action; end + end + + # Continue spidering + def continue!(&block) + @paused = false + run(&block) + end + + # Sets the pause state of the agent. + def pause=(state) + @paused = state + end + + # Pauses the agent, causing spidering to temporarily stop. + def pause! + @paused = true + raise Actions::Paused.new + end + + # Determines whether the agent is paused. + def paused? + @paused == true + end + + # Causes the agent to skip the link being enqueued. + def skip_link! + raise Actions::SkipLink.new + end + + # Causes the agent to skip the page being visited. + def skip_page! + raise Actions::SkipPage + end + end +end diff --git a/src/arachnid/agent/events.cr b/src/arachnid/agent/events.cr new file mode 100644 index 0000000..4d73a0e --- /dev/null +++ b/src/arachnid/agent/events.cr @@ -0,0 +1,248 @@ +require "../page" + +module Arachnid + class Agent + @every_url_blocks = [] of Proc(URI, Nil) + + @every_failed_url_blocks = [] of Proc(URI, Nil) + + @every_url_like_blocks = Hash(String | Regex, Array(Proc(URI, Nil))).new do |hash, key| + hash[key] = [] of Proc(URI, Nil) + end + + @every_page_blocks = [] of Proc(Page, Nil) + + @every_link_blocks = [] of Proc(URI, URI, Nil) + + # Pass each URL from each page visited to the given block. + def every_url(&block : URI ->) + @every_url_blocks << block + self + end + + # Pass each URL that could not be requested to the given block. + def every_failed_url(&block : URI ->) + @every_failed_url_blocks << block + self + end + + # Pass every URL that the agent visits, and matches a given pattern, + # to a given block. + def every_url_like(pattern, &block : URI ->) + @every_url_like_blocks[pattern] << block + self + end + + # Ssee `#every_url_like` + def urls_like(pattern, &block : URI ->) + every_url_like(pattern, &block) + end + + # Pass the headers from every response the agent receives to a given + # block. + def all_headers(&block) + headers = [] of HTTP::Headers + every_page { |page| headers << page.headers } + headers.each { |header| yield headers } + end + + # Pass every page that the agent visits to a given block. + def every_page(&block : Page ->) + @every_page_blocks << block + self + end + + # Pass every OK page that the agent visits to a given block. 
+ def every_ok_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.ok? } + pages.each { |page| yield page } + end + + # Pass every Redirect page that the agent visits to a given block. + def every_redirect_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.redirect? } + pages.each { |page| yield page } + end + + # Pass every Timeout page that the agent visits to a given block. + def every_timedout_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.timeout? } + pages.each { |page| yield page } + end + + # Pass every Bad Request page that the agent visits to a given block. + def every_bad_request_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.bad_request? } + pages.each { |page| yield page } + end + + # Pass every Unauthorized page that the agent visits to a given block. + def every_unauthorized_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.unauthorized? } + pages.each { |page| yield page } + end + + # Pass every Forbidden page that the agent visits to a given block. + def every_forbidden_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.forbidden? } + pages.each { |page| yield page } + end + + # Pass every Missing page that the agent visits to a given block. + def every_missing_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.missing? } + pages.each { |page| yield page } + end + + # Pass every Internal Server Error page that the agent visits to a + # given block. + def every_internal_server_error_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.had_internal_server_error? } + pages.each { |page| yield page } + end + + # Pass every Plain Text page that the agent visits to a given block. + def every_txt_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.txt? } + pages.each { |page| yield page } + end + + # Pass every HTML page that the agent visits to a given block. + def every_html_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.html? } + pages.each { |page| yield page } + end + + # Pass every XML page that the agent visits to a given block. + def every_xml_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.xml? } + pages.each { |page| yield page } + end + + # Pass every XML Stylesheet (XSL) page that the agent visits to a + # given block. + def every_xsl_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.xsl? } + pages.each { |page| yield page } + end + + # Pass every HTML or XML document that the agent parses to a given + # block. + def every_doc(&block : Document::HTML | XML::Node ->) + docs = [] of Document::HTML || XML::Node + every_page { |page| docs << page.doc.not_nil! if page.doc } + docs.each { |doc| yield doc } + end + + # Pass every HTML document that the agent parses to a given block. + def every_html_doc(&block : Document::HTML | XML::Node ->) + docs = [] of Document::HTML + every_page { |page| docs << page.doc.not_nil! if page.html? } + docs.each { |doc| yield doc } + end + + # Pass every XML document that the agent parses to a given block. + def every_xml_doc(&block : XML::Node ->) + docs = [] of XML::Node + every_page { |page| docs << page.doc.not_nil! if page.xml? 
} + docs.each { |doc| yield doc } + end + + # Pass every XML Stylesheet (XSL) that the agent parses to a given + # block. + def every_xsl_doc(&block : XML::Node ->) + docs = [] of XML::Node + every_page { |page| docs << page.doc.not_nil! if page.xsl? } + docs.each { |doc| yield doc } + end + + # Pass every RSS document that the agent parses to a given block. + def every_rss_doc(&block : XML::Node ->) + docs = [] of XML::Node + every_page { |page| docs << page.doc.not_nil! if page.rss? } + docs.each { |doc| yield doc } + end + + # Pass every Atom document that the agent parses to a given block. + def every_atom_doc(&block : XML::Node ->) + docs = [] of XML::Node + every_page { |page| docs << page.doc.not_nil! if page.atom? } + docs.each { |doc| yield doc } + end + + # Pass every JavaScript page that the agent visits to a given block. + def every_javascript_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.javascript? } + pages.each { |page| yield page } + end + + # Pass every CSS page that the agent visits to a given block. + def every_css_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.css? } + pages.each { |page| yield page } + end + + # Pass every RSS feed that the agent visits to a given block. + def every_rss_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.rss? } + pages.each { |page| yield page } + end + + # Pass every Atom feed that the agent visits to a given block. + def every_atom_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.atom? } + pages.each { |page| yield page } + end + + # Pass every MS Word page that the agent visits to a given block. + def every_ms_word_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.ms_word? } + pages.each { |page| yield page } + end + + # Pass every PDF page that the agent visits to a given block. + def every_pdf_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.pdf? } + pages.each { |page| yield page } + end + + # Pass every ZIP page that the agent visits to a given block. + def every_zip_page(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.zip? } + pages.each { |page| yield page } + end + + # Passes every image URI to the given blocks. + def every_image(&block : Page ->) + pages = [] of Page + every_page { |page| (pages << page) if page.image? } + pages.each { |page| yield page } + end + + # Passes every origin and destination URI of each link to a given + # block. + def every_link(&block : URI, URI ->) + @every_link_blocks << block + self + end + end +end diff --git a/src/arachnid/agent/filters.cr b/src/arachnid/agent/filters.cr new file mode 100644 index 0000000..a1b69b3 --- /dev/null +++ b/src/arachnid/agent/filters.cr @@ -0,0 +1,256 @@ +require "../rules" + +module Arachnid + class Agent + # List of acceptable URL schemes to follow + getter schemes : Array(String) = [] of String + + @host_rules = Rules(String).new + @port_rules = Rules(Int32).new + @link_rules = Rules(String).new + @url_rules = Rules(URI).new + @ext_rules = Rules(String).new + + # Sets the list of acceptable URL schemes to visit. + def schemes=(new_schemes) + @schemes = new_schemes.map(&.to_s) + end + + # Specifies the patterns that match host-names to visit. + def visit_hosts + @host_rules.accept + end + + # Adds a given pattern to the `#visit_hosts`. 
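+    #
+    # For example (illustrative pattern only):
+    #
+    #     agent.visit_hosts_like(/\.example\.com$/)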
+ def visit_hosts_like(pattern) + visit_hosts << pattern + self + end + + def visit_hosts_like(&block) + visit_hosts << block + self + end + + # Specifies the patterns that match host-names to not visit. + def ignore_hosts + @host_rules.reject + end + + # Adds a given pattern to the `#ignore_hosts`. + def ignore_hosts_like(pattern) + ignore_hosts << pattern + self + end + + def ignore_hosts_like(&block) + ignore_hosts << block + self + end + + # Specifies the patterns that match the ports to visit. + def visit_ports + @port_rules.accept + end + + # Adds a given pattern to the `#visit_ports`. + def visit_ports_like(pattern) + visit_ports << pattern + self + end + + def visit_ports_like(&block : Int32 -> Bool) + visit_ports << block + self + end + + # Specifies the patterns that match ports to not visit. + def ignore_ports + @port_rules.reject + end + + # Adds a given pattern to the `#ignore_ports`. + def ignore_ports_like(pattern) + ignore_ports << pattern + self + end + + def ignore_ports_like(&block : Int32 -> Bool) + ignore_ports << block + self + end + + # Specifies the patterns that match the links to visit. + def visit_links + @link_rules.accept + end + + # Adds a given pattern to the `#visit_links` + def visit_links_like(pattern) + visit_links << pattern + self + end + + def visit_links_like(&block : String -> Bool) + visit_links << block + self + end + + # Specifies the patterns that match links to not visit. + def ignore_links + @link_rules.reject + end + + # Adds a given pattern to the `#ignore_links`. + def ignore_links_like(pattern) + ignore_links << pattern + self + end + + def ignore_links_like(&block : String -> Bool) + ignore_links << block + self + end + + # Specifies the patterns that match the URLs to visit. + def visit_urls + @url_rules.accept + end + + # Adds a given pattern to the `#visit_urls` + def visit_urls_like(&block : URI -> Bool) + visit_urls << block + self + end + + def visit_urls_like(pattern) + visit_urls << pattern + self + end + + # Specifies the patterns that match URLs to not visit. + def ignore_urls + @url_rules.reject + end + + # Adds a given pattern to the `#ignore_urls`. + def ignore_urls_like(&block : URI -> Bool) + ignore_urls << block + self + end + + def ignore_urls_like(pattern) + ignore_urls << pattern + self + end + + # Specifies the patterns that match the URI path extensions to visit. + def visit_exts + @ext_rules.accept + end + + # Adds a given pattern to the `#visit_exts`. + def visit_exts_like(&block : String -> Bool) + visit_exts << block + self + end + + def visit_exts_like(pattern) + visit_exts << pattern + self + end + + # Specifies the patterns that match URI path extensions to not visit. + def ignore_exts + @ext_rules.reject + end + + # Adds a given pattern to the `#ignore_exts`. + def ignore_exts_like(&block : String -> Bool) + ignore_exts << block + self + end + + def ignore_exts_like(pattern) + ignore_exts << pattern + self + end + + # Initializes filtering rules. 
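+    #
+    # Receives the `filter_options` passed to `Agent#initialize`, if any.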
+ protected def initialize_filters( + schemes = nil, + hosts = nil, + ignore_hosts = nil, + ports = nil, + ignore_ports = nil, + links = nil, + ignore_links = nil, + urls = nil, + ignore_urls = nil, + exts = nil, + ignore_exts = nil + ) + + if schemes + self.schemes = schemes + else + @schemes << "http" + @schemes << "https" + end + + @host_rules.accept = hosts + @host_rules.reject = ignore_hosts + + @port_rules.accept = ports + @port_rules.reject = ignore_ports + + @link_rules.accept = links + @link_rules.reject = ignore_links + + @url_rules.accept = urls + @url_rules.reject = ignore_urls + + @ext_rules.accept = exts + @ext_rules.reject = ignore_exts + + if host + visit_hosts_like(host.to_s) + end + end + + # Determines if a given URI scheme should be visited. + protected def visit_scheme?(scheme) + if scheme + @schemes.includes?(scheme) + else + true + end + end + + # Determines if a given host-name should be visited. + protected def visit_host?(host) + @host_rules.accept?(host) + end + + # Determines if a given port should be visited. + protected def visit_port?(port) + @port_rules.accept?(port) + end + + # Determines if a given link should be visited. + protected def visit_link?(link) + @link_rules.accept?(link) + end + + # Determines if a given URL should be visited. + protected def visit_url?(link) + @url_rules.accept?(link) + end + + # Determines if a given URI path extension should be visited. + protected def visit_ext?(path) + ext = File.extname(path) + @ext_rules.accept?(ext) + end + end +end diff --git a/src/arachnid/agent/robots.cr b/src/arachnid/agent/robots.cr new file mode 100644 index 0000000..ed99613 --- /dev/null +++ b/src/arachnid/agent/robots.cr @@ -0,0 +1,20 @@ +require "../robots" + +module Arachnid + class Agent + @robots : Arachnid::Robots? = nil + + # Initializes the robots filter. + def initialize_robots + # @robots = Arachnid::Robots.new(@user_agent) + end + + # Determines whether a URL is allowed by the robot policy. + def robot_allowed?(url) + if robots = @robots + return robots.allowed?(url) + end + true + end + end +end diff --git a/src/arachnid/agent/sanitizers.cr b/src/arachnid/agent/sanitizers.cr new file mode 100644 index 0000000..894c262 --- /dev/null +++ b/src/arachnid/agent/sanitizers.cr @@ -0,0 +1,21 @@ +module Arachnid + class Agent + # Specifies whether the Agent will strip URI fragments + property? strip_fragments : Bool = true + + # Specifies whether the Agent will strip URI queries + property? strip_query : Bool = false + + # Sanitizes a URL based on filtering options + def sanitize_url(url) + # normalize the url + url = URI.parse(url) unless url.is_a?(URI) + + url.path = "" if url.path == "/" + url.fragment = nil if @strip_fragments + url.query = nil if @strip_query + + url + end + end +end diff --git a/src/arachnid/arachnid.cr b/src/arachnid/arachnid.cr new file mode 100644 index 0000000..a3d4008 --- /dev/null +++ b/src/arachnid/arachnid.cr @@ -0,0 +1,39 @@ +require "./page" +require "./agent" + +module Arachnid + extend self + + # Specifies whether robots.txt should be honored globally + class_property? robots : Bool = false + + # Should we set the DNT (Do Not Track) header? + class_property? do_not_track : Bool = false + + # Maximum amount of redirects to follow + class_property max_redirects : Int32 = 0 + + # Connect timeout. + class_property connect_timeout : Int32 = 10 + + # Read timeout. + class_property read_timeout : Int32 = 10 + + # The User-Agent string used by all Agent objects by default. 
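+  #
+  # It can be overridden globally, e.g. `Arachnid.user_agent = "MyCrawler/1.0"`
+  # (the value shown is only an example).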
+ class_property user_agent : String = "Arachnid #{Arachnid::VERSION}" + + # See `Agent.start_at` + def start_at(url, **options, &block : Agent ->) + Agent.start_at(url, **options, &block) + end + + # See `Agent.host` + def host(name, **options, &block : Agent ->) + Agent.host(name, **options, &block) + end + + # See `Agent.site` + def site(url, **options, &block : Agent ->) + Agent.site(url, **options, &block) + end +end diff --git a/src/arachnid/auth_credential.cr b/src/arachnid/auth_credential.cr new file mode 100644 index 0000000..53d81ea --- /dev/null +++ b/src/arachnid/auth_credential.cr @@ -0,0 +1,4 @@ +module Arachnid + # Represents HTTP Authentication credentials for a website. + record AuthCredential, username : String, password : String +end diff --git a/src/arachnid/auth_store.cr b/src/arachnid/auth_store.cr new file mode 100644 index 0000000..223e6b3 --- /dev/null +++ b/src/arachnid/auth_store.cr @@ -0,0 +1,83 @@ +require "base64" +require "./extensions/uri" +require "./auth_credential" +require "./page" + +module Arachnid + class AuthStore + @credentials = {} of Tuple(String?, String?, Int32?) => Hash(Array(String), AuthCredential) + + # Given a URL, return the most specific matching auth credential. + def [](url) + # normalize the url + url = URI.parse(url) unless url.is_a?(URI) + + key = key_for(url) + paths = @credentials[key]? + + return nil unless paths + + # longest path first + ordered_paths = paths.keys.sort { |path_key| -path_key.size } + + # directories of the path + path_dirs = URI.expand_path(url.path).split('/').reject(&.empty?) + + ordered_paths.each do |path| + return paths[path] if path_dirs[0, path.size] == path + end + + nil + end + + # Add an auth credential to the store for the supplied base URL. + def []=(url, auth) + # normalize the url + url = URI.parse(url) unless url.is_a?(URI) + + # normalize the url path and split it + paths = URI.expand_path(url.path).split('/').reject(&.empty?) + + key = key_for(url) + + @credentials[key] ||= {} of Array(String) => AuthCredential + @credentials[key][paths] = auth + auth + end + + # Convenience method to add username and password credentials + # for a named URL. + def add(url, username, password) + self[url] = AuthCredential.new(username: username, password: password) + end + + # Returns the base64 encoded authorization string for the URL + # or `nil` if no authorization exists. + def for_url(url) + if auth = self[url] + Base64.encode("#{auth.username}#{auth.password}") + end + end + + # Clear the contents of the auth store. + def clear! + @credentials.clear! + self + end + + # Size of the current auth store (number of URL paths stored) + def size + @credentials.values.reduce(0) { |acc, paths| acc + paths.size } + end + + # Inspect the auth store + def inspect + "<#{self.class}: #{@credentials.inspect}>" + end + + # Creates a auth key based on the URL + private def key_for(url) + {url.scheme, url.host, url.port} + end + end +end diff --git a/src/arachnid/cookie_jar.cr b/src/arachnid/cookie_jar.cr new file mode 100644 index 0000000..1eb2635 --- /dev/null +++ b/src/arachnid/cookie_jar.cr @@ -0,0 +1,118 @@ +module Arachnid + class CookieJar + include Enumerable(HTTP::Cookies) + + @params : Hash(String, HTTP::Cookies) + + @cookies : HTTP::Cookies + + @dirty : Set(String) + + # Creates a new `CookieJar` + def initialize + @params = {} of String => HTTP::Cookies + @cookies = HTTP::Cookies.new + @dirty = Set(String).new + end + + # Iterates over the host-name and cookie value pairs in the jar. 
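+    #
+    # Each yielded element is a `{host, HTTP::Cookies}` tuple, e.g.:
+    #
+    #     jar.each do |host, cookies|
+    #       puts "#{host}: #{cookies.size} cookie(s)"
+    #     end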
+ def each(&block) + @params.each do |kp| + yield kp + end + end + + # Returns all relevant cookies in a single string for the named + # host or domain. + def [](host : String) + @params[host]? || HTTP::Cookies.new + end + + # Add a cookie to the jar for a particular domain. + def []=(host : String, cookies : HTTP::Cookies) + @params[host] ||= HTTP::Cookies.new + + cookies.each do |cookie| + if @params[host][cookie.name]? != cookie.value + cookies.each do |c| + @params[host] << c + end + @dirty.add(host) + + break + end + end + + cookies + end + + # Retrieve cookies for a domain from the response. + def from_page(page) + cookies = page.cookies + + unless cookies.empty? + self[page.url.host.to_s] = cookies + return true + end + + false + end + + # Returns the pre-encoded Cookie for a given host. + def for_host(host) + if @dirty.includes?(host) + values = [] of String + + cookies_for_host(host).each do |cookie| + values << cookie.to_cookie_header + end + + @cookies[host] = values.join("; ") + @dirty.delete(host) + end + + @cookies[host]? + end + + # Returns raw cookie value pairs for a given host. Includes cookies + # set on parent domains. + def cookies_for_host(host) + host_cookies = @params[host]? || HTTP::Cookies.new + subdomains = host.split('.') + + while subdomains.size > 2 + subdomains.shift + + if parent_cookies = @params[subdomains.join('.')]? + parent_cookies.each do |cookie| + # copy in the parent cookies, only if they haven't been + # overridden yet. + unless host_cookies.has_key?(cookie.name) + host_cookies[cookie.name] = cookie.value + end + end + end + end + + host_cookies + end + + # Clear out the jar, removing all stored cookies. + def clear! + @params.clear + @cookies.clear + @dirty.clear + self + end + + # Size of the cookie jar. + def size + @params.size + end + + # Inspects the cookie jar. + def inspect + "#<#{self.class}: #{@params.inspect}>" + end + end +end diff --git a/src/arachnid/document/html.cr b/src/arachnid/document/html.cr new file mode 100644 index 0000000..75831d4 --- /dev/null +++ b/src/arachnid/document/html.cr @@ -0,0 +1,196 @@ +require "xml" + +module Arachnid + module Document + struct HTML + @content : String + + @document : XML::Node + + @ids : Hash(String, XML::Node) + + @tags : Hash(String, Array(Tag)) + + @classes : Hash(String, Array(XML::Node)) + + forward_missing_to @document + + def initialize(@content : String) + @document = XML.parse_html(@content) + + @ids = {} of String => XML::Node + @tags = {} of String => Array(Tag) + @classes = {} of String => Array(XML::Node) + + visit @document + end + + def self.parse(content : String) + new(content) + end + + # Transform the css query into an xpath query + def self.css_query_to_xpath(query : String) : String + query = "//#{query}" + # Convert '#id_name' as '[@id="id_name"]' + query = query.gsub /\#([A-z0-9]+-*_*)+/ { |m| "*[@id=\"%s\"]" % m.delete('#') } + # Convert '.classname' as '[@class="classname"]' + query = query.gsub /\.([A-z0-9]+-*_*)+/ { |m| "[@class=\"%s\"]" % m.delete('.') } + # Convert ' > ' as '/' + query = query.gsub /\s*>\s*/ { |m| "/" } + # Convert ' ' as '//' + query = query.gsub " ", "//" + # a leading '*' when xpath does not include node name + query = query.gsub /\/\[/ { |m| "/*[" } + return query + end + + # Find first tag by tag name and return + # `HTML::Tag` if found or `nil` if not found + def at_tag(tag_name : String) : Tag | Nil + if tags = @tags[tag_name]? 
+ tags.each do |tag| + return tag + end + end + return nil + end + + # Find all nodes by tag name and yield + # `HTML::Tag` if found + def where_tag(tag_name : String, &block) : Array(Tag) + arr = [] of Tag + if tags = @tags[tag_name]? + tags.each do |tag| + yield tag + arr << tag + end + end + return arr + end + + # Find all nodes by classname and yield + # `HTML::Tag` founded + def where_class(class_name : String, &block) : Array(Tag) + arr = [] of Tag + if klasses = @classes[class_name]? + klasses.each do |node| + klass = Tag.new(node) + yield klass + arr << klass + end + end + return arr + end + + # Find a node by its id and return a + # `HTML::Tag` found or `nil` if not found + def at_id(id_name : String) : Tag | Nil + if node = @ids[id_name]? + return Tag.new(node) + end + end + + # Find all nodes corresponding to the css query and yield + # `HTML::Tag` found or `nil` if not found + def css(query : String) : Array(Tag) + query = HTML.css_query_to_xpath(query) + return @nodes.xpath_nodes("//#{query}").map { |node| + tag = Tag.new(node) + yield tag + tag + } + end + + # Find first node corresponding to the css query and return + # `HTML::Tag` if found or `nil` if not found + def at_css(query : String) + css(query) { |tag| return tag } + return nil + end + + private def add_id(id : String, node : XML::Node) + @ids[id] = node + end + + private def add_node(node : XML::Node) + if @tags[node.name]? == nil + @tags[node.name] = [] of Tag + end + @tags[node.name] << Tag.new(node) + end + + private def add_class(klass : String, node : XML::Node) + if @classes[klass]? == nil + @classes[klass] = [] of XML::Node + end + @classes[klass] << node + end + + # Depth-first visit. Given a node, extract metadata from + # node (if exists), then visit each child. + private def visit(node : XML::Node) + # We only extract metadata from HTML nodes + if node.element? + add_node node + if to = node["id"]? + add_id to, node + end + if classes = node["class"]? + classes.split(' ') { |to| add_class to, node } + end + end + # visit each child + node.children.each do |child| + visit child + end + end + + # Represents an HTML Tag + struct Tag + getter node : XML::Node + + forward_missing_to @node + + def initialize(@node : XML::Node) + end + + def classname : String | Nil + return @node["class"]? ? @node["class"] : nil + end + + def tagname : String + return @node.name + end + + def content : String + return @node.text != nil ? @node.text.as(String) : "".as(String) + end + + def parent : Tag | Nil + if parent = @node.parent + return Tag.new parent + end + nil + end + + def children : Array(Tag) + children = [] of Tag + @node.children.each do |node| + if node.element? + children << Tag.new node + end + end + children + end + + def has_class?(klass : String) : Bool + if classes = classname + return classes.includes?(klass) + end + false + end + end + end + end +end diff --git a/src/arachnid/extensions/uri.cr b/src/arachnid/extensions/uri.cr new file mode 100644 index 0000000..0153a72 --- /dev/null +++ b/src/arachnid/extensions/uri.cr @@ -0,0 +1,175 @@ +require "uri" +require "string_scanner" + +class URI + # + # Expands a URI decoded path, into a proper absolute path. + # + # @param [String] path + # The path from a URI. + # + # @return [String] + # The expanded path. 
+ # + # @example + # URI.expand_path("./path") + # # => "path" + # + # @example + # URI.expand_path("test/../path") + # # => "path" + # + # @example + # URI.expand_path("/test/path/") + # # => "/test/path/" + # + # @example + # URI.expand_path("/test/../path") + # # => "/path" + # + def self.expand_path(path) + if path.starts_with?("/") + leading_slash, path = path[0, 1], path[1..-1] + else + leading_slash = "" + end + + if path.ends_with?("/") + trailing_slash, path = path[-1, 1], path[0..-2] + else + trailing_slash = "" + end + + scanner = StringScanner.new(path) + stack = [] of String + + until scanner.eos? + if (dir = scanner.scan(/[^\/]+/)) + case dir + when ".." then stack.pop + when "." then false + else stack.push(dir) + end + else + scanner.skip(/\/+/) + end + break if stack.empty? + end + + unless stack.empty? + "#{leading_slash}#{stack.join("/")}#{trailing_slash}" + else + "" + end + end + + def split_path(path) + path.split("/") + end + + def merge_path(base, rel) + + # RFC2396, Section 5.2, 5) + # RFC2396, Section 5.2, 6) + base_path = split_path(base) + rel_path = split_path(rel) + + # RFC2396, Section 5.2, 6), a) + base_path << "" if base_path.last == ".." + while i = base_path.index("..") + base_path = base_path[i - 1, 2] + end + + if (first = rel_path.first) && first.empty? + base_path.clear + rel_path.shift + end + + # RFC2396, Section 5.2, 6), c) + # RFC2396, Section 5.2, 6), d) + rel_path.push("") if rel_path.last == '.' || rel_path.last == ".." + rel_path.delete('.') + + # RFC2396, Section 5.2, 6), e) + tmp = [] of String + rel_path.each do |x| + if x == ".." && + !(tmp.empty? || tmp.last == "..") + tmp.pop + else + tmp << x + end + end + + add_trailer_slash = !tmp.empty? + if base_path.empty? + base_path = [""] # keep '/' for root directory + elsif add_trailer_slash + base_path.pop + end + while x = tmp.shift + if x == ".." + # RFC2396, Section 4 + # a .. or . in an absolute path has no special meaning + base_path.pop if base_path.size > 1 + else + # if x == ".." + # valid absolute (but abnormal) path "/../..." + # else + # valid absolute path + # end + base_path << x + tmp.each {|t| base_path << t} + add_trailer_slash = false + break + end + end + base_path.push("") if add_trailer_slash + + return base_path.join('/') + end + + def merge(oth) + oth = URI.parse(oth) unless oth.is_a?(URI) + + if oth.absolute? + # raise BadURIError, "both URI are absolute" if absolute? + # hmm... should return oth for usability? + return oth + end + + unless self.absolute? + raise URI::Error.new("both URI are othative") + end + + base = self.dup + + authority = oth.userinfo || oth.host || oth.port + + # RFC2396, Section 5.2, 2) + if (oth.path.nil? || oth.path.empty?) 
&& !authority && !oth.query + base.fragment=(oth.fragment) if oth.fragment + return base + end + + base.query = nil + base.fragment=(nil) + + # RFC2396, Section 5.2, 4) + if !authority + base.path = merge_path(base.path, oth.path) if base.path && oth.path + else + # RFC2396, Section 5.2, 4) + base.path = oth.path if oth.path + end + + # RFC2396, Section 5.2, 7) + base.user = oth.userinfo if oth.userinfo + base.host = oth.host if oth.host + base.port = oth.port if oth.port + base.query = oth.query if oth.query + base.fragment=(oth.fragment) if oth.fragment + + return base + end +end diff --git a/src/arachnid/page.cr b/src/arachnid/page.cr new file mode 100644 index 0000000..47875f9 --- /dev/null +++ b/src/arachnid/page.cr @@ -0,0 +1,97 @@ +require "uri" +require "halite" + +require "./page/content_types" +require "./page/cookies" +require "./page/html" +require "./page/status_codes" + +require "./document/html" + +module Arachnid + # Represents a page requested from a website + class Page + include Page::ContentTypes + include Page::Cookies + include Page::HTML + include Page::StatusCodes + + # URL of the page + getter url : URI + + # HTTP response + getter response : Halite::Response + + # Headers returned with the body + getter headers : HTTP::Headers + + @doc : (Document::HTML | XML::Node)? + + delegate xpath, xpath_node, xpath_nodes, xpath_bool, xpath_float, xpath_string, + root, at_tag, where_tag, where_class, at_id, css, at_css, to: @doc + + forward_missing_to @headers + + # Creates a new `Page` object. + def initialize(url : URI, response : Halite::Response) + @url = url + @response = response + @headers = response.headers + end + + # The body of the response + def body + @response.body || "" + end + + # Returns a parsed document for HTML, XML, RSS, and Atom pages. + def doc + unless body.empty? + doc_class = if html? + Document::HTML + elsif rss? || atom? || xml? || xsl? + XML + end + + if doc_class + begin + @doc ||= doc_class.parse(body) + rescue + end + end + end + end + + # Searches the document for XPath or CSS paths + def search(path) + if document = doc + document.xpath_nodes(path) + else + [] of XML::Node + end + end + + # Searches for the first occurrence of an XPath or CSS path + def at(path) + if document = doc + document.xpath_node(path) + end + end + + def /(path) + search(path) + end + + def %(path) + at(path) + end + + def size + @response.body.bytesize + end + + def to_s + body + end + end +end diff --git a/src/arachnid/page/content_types.cr b/src/arachnid/page/content_types.cr new file mode 100644 index 0000000..958828c --- /dev/null +++ b/src/arachnid/page/content_types.cr @@ -0,0 +1,162 @@ +module Arachnid + class Page + module ContentTypes + # The Content-Type of the page. + def content_type + @response.content_type || "" + end + + # The content types of the page. + def content_types + types = @response.headers.get?("content-type") || [] of String + end + + # The charset included in the Content-Type. + def content_charset + content_types.each do |value| + if value.includes?(";") + value.split(";").each do |param| + param.strip! + + if param.starts_with?("charset=") + return param.split("=", 2).last + end + end + end + end + + return nil + end + + # Determines if any of the content-types of the page include a given + # type. + def is_content_type?(type : String | Regex) + content_types.any? do |value| + value = value.split(";", 2).first + + if type.is_a?(Regex) + value =~ type + else + value == type + end + end + end + + # Determines if the page is plain-text. 
+ def plain_text? + is_content_type?("text/plain") + end + + # ditto + def text? + plain_text? + end + + # Determines if the page is a Directory Listing. + def directory? + is_content_type?("text/directory") + end + + # Determines if the page is HTML document. + def html? + is_content_type?("text/html") + end + + # Determines if the page is XML document. + def xml? + is_content_type?(/(text|application)\/xml/) + end + + # Determines if the page is XML Stylesheet (XSL). + def xsl? + is_content_type?("text/xsl") + end + + # Determines if the page is JavaScript. + def javascript? + is_content_type?(/(text|application)\/javascript/) + end + + # Determines if the page is JSON. + def json? + is_content_type?("application/json") + end + + # Determines if the page is a CSS stylesheet. + def css? + is_content_type?("text/css") + end + + # Determines if the page is a RSS feed. + def rss? + is_content_type?(/application\/(rss\+xml|rdf\+xml)/) + end + + # Determines if the page is an Atom feed. + def atom? + is_content_type?("application/atom+xml") + end + + # Determines if the page is a MS Word document. + def ms_word? + is_content_type?("application/msword") + end + + # Determines if the page is a PDF document. + def pdf? + is_content_type?("application/pdf") + end + + # Determines if the page is a ZIP archive. + def zip? + is_content_type?("application/zip") + end + + # Determine if the page is an image. + def image? + is_content_type?(/image\//) + end + + def png? + is_content_type?("image/png") + end + + def gif? + is_content_type?("image/gif") + end + + def jpg? + is_content_type?(/image\/(jpg|jpeg)/) + end + + def svg? + is_content_type?(/image\/svg(\+xml)?/) + end + + def video? + is_content_type?(/video\/.*/) + end + + def mp4? + is_content_type?("video/mp4") + end + + def avi? + is_content_type?("video/x-msvideo") + end + + def wmv? + is_content_type?("video/x-ms-wmv") + end + + def quicktime? + is_content_type?("video/quicktime") + end + + def flash? + is_content_type?("video/flash") || + is_content_type?("application/x-shockwave-flash") + end + end + end +end diff --git a/src/arachnid/page/cookies.cr b/src/arachnid/page/cookies.cr new file mode 100644 index 0000000..5af47e9 --- /dev/null +++ b/src/arachnid/page/cookies.cr @@ -0,0 +1,18 @@ +module Arachnid + class Page + module Cookies + # Reserved names used within Cookie strings + RESERVED_COOKIE_NAMES = Regex.new("^(?:Path|Expires|Domain|Secure|HTTPOnly)$", :ignore_case) + + # The raw Cookie String sent along with the page. + def cookie + @response.headers["Set-Cookie"]? || "" + end + + # The Cookie values sent along with the page. + def cookies + @response.cookies + end + end + end +end diff --git a/src/arachnid/page/html.cr b/src/arachnid/page/html.cr new file mode 100644 index 0000000..a279c6d --- /dev/null +++ b/src/arachnid/page/html.cr @@ -0,0 +1,204 @@ +require "../extensions/uri" + +module Arachnid + class Page + # TODO: Create enumerable methods for the methods that take a block + module HTML + # include Enumerable + + # The title of the HTML page. + def title + if (node = at("//title")) + node.inner_text + end + end + + # Enumerates over the meta-redirect links in the page. + def each_meta_redirect(&block : URI ->) + if (html? 
&& doc)
+          search("//meta[@http-equiv and @content]").each do |node|
+            if node["http-equiv"] =~ /refresh/i
+              content = node["content"]
+
+              if (redirect = content.match(/url=(\S+)$/))
+                yield URI.parse(redirect[1])
+              end
+            end
+          end
+        end
+      end
+
+      # Returns a boolean indicating whether or not page-level meta
+      # redirects are present in this page.
+      def meta_redirect?
+        !meta_redirects.empty?
+      end
+
+      # The meta-redirect links of the page.
+      def meta_redirects
+        redirects = [] of URI
+        each_meta_redirect { |r| redirects << r }
+        redirects
+      end
+
+      # Enumerates over every HTTP or meta-redirect link in the page.
+      def each_redirect(&block : URI ->)
+        if (locations = @response.headers.get?("Location"))
+          # Location headers override any meta-refresh redirects in the HTML
+          locations.each { |l| block.call(URI.parse(l)) }
+        else
+          # check page-level meta redirects if there isn't a location header
+          each_meta_redirect(&block)
+        end
+      end
+
+      # URLs that this document redirects to.
+      def redirects_to
+        redirects = [] of URI
+        each_redirect { |r| redirects << r }
+        redirects
+      end
+
+      # Enumerates over every `mailto:` link in the page.
+      def each_mailto(&block)
+        if html? && (d = doc)
+          d.xpath_nodes("//a[starts-with(@href,'mailto:')]").each do |a|
+            yield a["href"][7..-1]
+          end
+        end
+      end
+
+      # `mailto:` links in the page.
+      def mailtos
+        mailtos = [] of String
+        each_mailto { |m| mailtos << m }
+        mailtos
+      end
+
+      # Enumerates over every link in the page.
+      def each_link(&block : URI ->)
+        each_redirect(&block) if redirect?
+
+        each_image(&block)
+
+        each_script(&block)
+
+        each_resource(&block)
+
+        if html? && (d = doc)
+          d.xpath_nodes("//a[@href]").each do |a|
+            link = to_absolute(a["href"])
+            yield link if link
+          end
+
+          d.xpath_nodes("//frame[@src]").each do |frame|
+            link = to_absolute(frame["src"])
+            yield link if link
+          end
+
+          d.xpath_nodes("//iframe[@src]").each do |iframe|
+            link = to_absolute(iframe["src"])
+            yield link if link
+          end
+        end
+      end
+
+      def each_script(&block : URI ->)
+        if html? && (d = doc)
+          d.xpath_nodes("//script[@src]").each do |script|
+            url = to_absolute(script["src"])
+            yield url if url
+          end
+        end
+      end
+
+      def each_resource(&block : URI ->)
+        if html? && (d = doc)
+          d.xpath_nodes("//link[@href]").each do |link|
+            yield URI.parse(link["href"])
+          end
+        end
+      end
+
+      def each_image(&block : URI ->)
+        if html? && (d = doc)
+          d.xpath_nodes("//img[@src]").each do |img|
+            url = to_absolute(img["src"])
+            yield url if url
+          end
+
+          d.xpath_nodes("//img[@srcset]").each do |set|
+            sources = set["srcset"].split(" ").map_with_index { |e, i| i.even? ? e : nil }.compact
+            sources.each do |source|
+              url = to_absolute(source)
+              yield url if url
+            end
+          end
+        end
+      end
+
+      def each_video(&block : URI ->)
+        if html? && (d = doc)
+          d.xpath_nodes("//video[@src]").each do |video|
+            url = to_absolute(video["src"])
+            yield url if url
+          end
+
+          d.xpath_nodes("//video/source[@src]").each do |source|
+            url = to_absolute(source["src"])
+            yield url if url
+          end
+        end
+      end
+
+      # The links from within the page.
+      def links
+        links = [] of URI
+        each_link { |link| links << link }
+        links
+      end
+
+      # Enumerates over every URL in the page.
+      def each_url(&block : URI ->)
+        each_link do |link|
+          if (url = to_absolute(link))
+            yield url
+          end
+        end
+      end
+
+      # ditto
+      def each(&block)
+        each_url { |url| yield url }
+      end
+
+      # Absolute URIs from within the page.
+      def urls
+        urls = [] of URI
+        each_url { |url| urls << url }
+        urls
+      end
+
+      # Normalizes and expands a given link into a proper URI.
+      def to_absolute(link)
+        link = link.is_a?(URI) ?
link : URI.parse(link) + + new_url = begin + url.merge(link) + rescue Exception + return + end + + if (!new_url.opaque?) && (path = new_url.path) + # ensure that paths begin with a leading '/' for URI::FTP + if (new_url.scheme == "ftp" && !path.starts_with?("/")) + path.insert(0, "/") + end + + # make sure the path does not contain any .. or . directories, + # since URI::Generic#merge cannot normalize paths such as + # "/stuff/../" + new_url.path = URI.expand_path(path) + end + + return new_url + end + end + end +end diff --git a/src/arachnid/page/status_codes.cr b/src/arachnid/page/status_codes.cr new file mode 100644 index 0000000..84c3ca5 --- /dev/null +++ b/src/arachnid/page/status_codes.cr @@ -0,0 +1,59 @@ +module Arachnid + class Page + module StatusCodes + # The response code from the page. + def code + @response.status_code.to_i + end + + # Determines if the response code is `200`. + def ok? + code == 200 + end + + # Determines if the response code is `308`. + def timedout? + code == 308 + end + + # Determines if the response code is `400`. + def bad_request? + code == 400 + end + + # Determines if the response code is `401`. + def unauthorized? + code == 401 + end + + # Determines if the response code is `403`. + def forbidden? + code == 403 + end + + # Determines if the response code is `404`. + def missing? + code == 404 + end + + # Determines if the response code is `500`. + def had_internal_server_error? + code == 500 + end + + # Determines if the response code is `300`, `301`, `302`, `303` + # or `307`. Also checks for "soft" redirects added at the page + # level by a meta refresh tag. + def redirect? + case code + when 300..303, 307 + true + when 200 + meta_redirect? + else + false + end + end + end + end +end diff --git a/src/arachnid/robots.cr b/src/arachnid/robots.cr new file mode 100644 index 0000000..ec476aa --- /dev/null +++ b/src/arachnid/robots.cr @@ -0,0 +1,231 @@ +require "uri" + +module Arachnid + # Parses robots.txt files for the perusal of a single user-agent. + # + # The behaviour implemented is guided by the following sources, though + # as there is no widely accepted standard, it may differ from other implementations. + # If you consider its behaviour to be in error, please contact the author. + # + # http://www.robotstxt.org/orig.html + # - the original, now imprecise and outdated version + # http://www.robotstxt.org/norobots-rfc.txt + # - a much more precise, outdated version + # http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449&from=35237 + # - a few hints at modern protocol extensions. + # + # This parser only considers lines starting with (case-insensitively:) + # Useragent: User-agent: Allow: Disallow: Sitemap: + # + # The file is divided into sections, each of which contains one or more User-agent: + # lines, followed by one or more Allow: or Disallow: rules. + # + # The first section that contains a User-agent: line that matches the robot's + # user-agent, is the only section that relevent to that robot. The sections are checked + # in the same order as they appear in the file. + # + # (The * character is taken to mean "any number of any characters" during matching of + # user-agents) + # + # Within that section, the first Allow: or Disallow: rule that matches the expression + # is taken as authoritative. If no rule in a section matches, the access is Allowed. 
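+  #
+  # For illustration, with the following (hypothetical) robots.txt a request
+  # for "/secret/page.html" is disallowed while "/index.html" is allowed:
+  #
+  #     User-agent: *
+  #     Disallow: /secret/
+  #     Allow: /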
+ # + # (The order of matching is as in the RFC, Google matches all Allows and then all Disallows, + # while Bing matches the most specific rule, I'm sure there are other interpretations) + # + # When matching urls, all % encodings are normalised (except for /?=& which have meaning) + # and "*"s match any number of any character. + # + # If a pattern ends with a $, then the pattern must match the entire path, or the entire + # path with query string. + # + # TODO: Rework to allow for multiple Robots + class Robots + alias Rule = Tuple(String, Bool) + alias RuleSet = Tuple(String, Array(Rule)) + + getter body : String + + getter user_agent : String + + getter rules : Array(Tuple(String, Array(Rule))) + + getter sitemaps : Array(String) + + def initialize(@body : String, @user_agent : String) + @sitemaps = [] of String + @rules = [] of RuleSet + parse(@body) + end + + # Given a URI object, or a string representing one, determine whether this + # robots.txt would allow access to the path. + def allowed?(uri) + uri = URI.parse(uri) + path = (uri.path || "/") + (uri.query ? "?" + uri.query.to_s : "") + path_allowed?(@user_agent, path) + end + + # Check whether the relative path (a string of the url's path and query + # string) is allowed by the rules we have for the given user_agent. + # + private def path_allowed?(user_agent, path) + @rules.each do |(ua_glob, path_globs)| + if match_ua_glob user_agent, ua_glob + path_globs.each do |(path_glob, allowed)| + return allowed if match_path_glob path, path_glob + end + return true + end + end + true + end + + # This does a case-insensitive substring match such that if the user agent + # is contained within the glob, or vice-versa, we will match. + # + # According to the standard, *s shouldn't appear in the user-agent field + # except in the case of "*" meaning all user agents. Google however imply + # that the * will work, at least at the end of a string. + # + # For consistency, and because it seems expected behaviour, and because + # a glob * will match a literal * we use glob matching not string matching. + # + # The standard also advocates a substring match of the robot's user-agent + # within the user-agent field. From observation, it seems much more likely + # that the match will be the other way about, though we check for both. + # + private def match_ua_glob(user_agent, glob) + glob =~ Regex.new(Regex.escape(user_agent), Regex::Options::IGNORE_CASE) || + user_agent =~ Regex.new(reify(glob), Regex::Options::IGNORE_CASE) + end + + # This does case-sensitive prefix matching, such that if the path starts + # with the glob, we will match. + # + # According to the standard, that's it. However, it seems reasonably common + # for asterkisks to be interpreted as though they were globs. + # + # Additionally, some search engines, like Google, will treat a trailing $ + # sign as forcing the glob to match the entire path - whether including + # or excluding the query string is not clear, so we check both. + # + # (i.e. it seems likely that a site owner who has Disallow: *.pdf$ expects + # to disallow requests to *.pdf?i_can_haz_pdf, which the robot could, if + # it were feeling malicious, construe.) + # + # With URLs there is the additional complication that %-encoding can give + # multiple representations for identical URLs, this is handled by + # normalize_percent_encoding. 
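+    #
+    # As a hypothetical example, the glob "/*.pdf$" is reified to the regex
+    # /^\/.*\.pdf(?:\?|$)/, which matches "/docs/report.pdf" and
+    # "/report.pdf?download=1", but not "/report.pdfs/index.html".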
+ # + private def match_path_glob(path, glob) + if glob =~ /\$$/ + end_marker = "(?:\?|$)" + glob = glob.gsub /\$$/, "" + else + end_marker = "" + end + + glob = normalize_percent_encoding(glob) + path = normalize_percent_encoding(path) + + path =~ Regex.new("^" + reify(glob) + end_marker) + + rescue e + false + end + + # As a general rule, we want to ignore different representations of the + # same URL. Naively we could just unescape, or escape, everything, however + # the standard implies that a / is a HTTP path separator, while a %2F is an + # encoded / that does not act as a path separator. Similar issues with ?, & + # and =, though all other characters are fine. (While : also has a special + # meaning in HTTP, most implementations ignore this in the path) + # + # It's also worth noting that %-encoding is case-insensitive, so we + # explicitly upcase the few that we want to keep. + # + private def normalize_percent_encoding(path) + # First double-escape any characters we don't want to unescape + # & / = ? + path = path.gsub(/%(26|2F|3D|3F)/i) do |code| + "%25#{code.upcase}" + end + + URI.unescape(path) + end + + # Convert the asterisks in a glob into (.*)s for regular expressions, + # and at the same time, escape any other characters that would have + # a significance in a regex. + # + private def reify(glob) + glob.split("*").map { |part| Regex.escape(part) }.join(".*") + end + + # Convert the @body into a set of @rules so that our parsing mechanism + # becomes easier. + # + # @rules is an array of pairs. The first in the pair is the glob for the + # user-agent and the second another array of pairs. The first of the new + # pair is a glob for the path, and the second whether it appears in an + # Allow: or a Disallow: rule. + # + # For example: + # + # User-agent: * + # Disallow: /secret/ + # Allow: / # allow everything... + # + # Would be parsed so that: + # + # @rules = [["*", [ ["/secret/", false], ["/", true] ]]] + # + # + # The order of the arrays is maintained so that the first match in the file + # is obeyed as indicated by the pseudo-RFC on http://robotstxt.org/. There + # are alternative interpretations, some parse by speicifity of glob, and + # some check Allow lines for any match before Disallow lines. All are + # justifiable, but we could only pick one. + # + # Note that a blank Disallow: should be treated as an Allow: * and multiple + # user-agents may share the same set of rules. + # + private def parse(body) + body.split(/[\r\n]+/).each do |line| + prefix, value = line.delete("\000").split(":", 2).map(&.strip) + value = value.sub /\s+#.*/, "" if value + parser_mode = :begin + + if prefix && value + case prefix.downcase + when /^user-?agent$/ + if parser_mode == :user_agent + @rules << {value, rules.last[1]} + else + parser_mode = :user_agent + @rules << {value, [] of Rule} + end + when "disallow" + parser_mode = :rules + @rules << {"*", [] of Rule} if @rules.empty? + + if value == "" + @rules.last[1] << {"*", true} + else + @rules.last[1] << {value, false} + end + when "allow" + parser_mode = :rules + @rules << {"*", [] of Rule} if @rules.empty? + @rules.last[1] << {value, true} + when "sitemap" + @sitemaps << value + else + # Ignore comments, Crawl-delay: and badly formed lines. 
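+            # (e.g. "Crawl-delay: 10" or "Host: example.com" fall through to here)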
+ end + end + end + end + end +end diff --git a/src/arachnid/rules.cr b/src/arachnid/rules.cr new file mode 100644 index 0000000..0d0d042 --- /dev/null +++ b/src/arachnid/rules.cr @@ -0,0 +1,53 @@ +module Arachnid + # The `Rules` class represents collections of acceptance and rejection + # rules, which are used to filter data. + class Rules(T) + # Accept rules + getter accept : Array(Proc(T | Nil, Bool) | T | Regex | String) + + # Reject rules + getter reject : Array(Proc(T | Nil, Bool) | T | Regex | String) + + # Creates a new `Rules` object. + def initialize(accept : Array(Proc(T | Nil, Bool) | T | Regex | String)? = nil, reject : Array(Proc(T | Nil, Bool) | T | Regex | String)? = nil) + @accept = accept ? accept : [] of Proc(T | Nil, Bool) | T | Regex | String + @reject = reject ? reject : [] of Proc(T | Nil, Bool) | T | Regex | String + end + + # Determines whether the data should be accepted or rejected. + def accept?(data : T) + return true if accept.empty? && reject.empty? + + unless @accept.empty? + @accept.any? { |rule| test_data(data, rule) } + else + !@reject.any? { |rule| test_data(data, rule) } + end + end + + def accept=(value) + @accept = value || [] of Proc(T | Nil, Bool) | T | Regex | String + end + + # Determines whether the data should be rejected or accepted. + def reject?(data : T) + !accept?(data) + end + + def reject=(value) + @reject = value || [] of Proc(T | Nil, Bool) | T | Regex | String + end + + # Tests the given data against a pattern. + private def test_data(data : T, rule) + case rule + when Proc + rule.call(data) == true + when Regex + !((data.to_s =~ rule).nil?) + else + data == rule + end + end + end +end diff --git a/src/arachnid/session_cache.cr b/src/arachnid/session_cache.cr new file mode 100644 index 0000000..1734736 --- /dev/null +++ b/src/arachnid/session_cache.cr @@ -0,0 +1,112 @@ +require "uri" +require "halite" + +module Arachnid + # Stores active HTTP Sessions organized by scheme, host-name and port. + class SessionCache + + # Optional read timeout. + property read_timeout : Int32 + + # Optional connect timeout. + property connect_timeout : Int32 + + # Max redirects to follow. + property max_redirects : Int32? + + # Should we set a DNT (Do Not Track) header? + property? do_not_track : Bool + + @sessions = {} of Tuple(String?, String?, Int32?) => Halite::Client + + # Create a new session cache + def initialize( + read_timeout : Int32? = nil, + connect_timeout : Int32? = nil, + follow_redirects : Bool? = nil, + max_redirects : Int32? = nil, + do_not_track : Bool? = nil + ) + @read_timeout = read_timeout || Arachnid.read_timeout + @connect_timeout = connect_timeout || Arachnid.connect_timeout + @max_redirects = max_redirects || Arachnid.max_redirects + @do_not_track = do_not_track || Arachnid.do_not_track? + end + + # Determines if there is an active session for the given URL + def active?(url) + # normalize the url + url = URI.parse(url) unless url.is_a?(URI) + + # session key + key = key_for(url) + + @sessions.has_key?(key) + end + + # Provides an active session for a given URL. + def [](url) + # normalize the url + url = URI.parse(url) unless url.is_a?(URI) + + # session key + key = key_for(url) + + # normalize the endpoint + endpoint = url.dup + endpoint.scheme ||= "http" + endpoint.query = nil + endpoint.fragment = nil + endpoint.path = "" + + # Set headers + headers = { + "DNT" => @do_not_track ? 
1 : 0 + } + + unless @sessions.has_key?(key) + session = Halite::Client.new( + endpoint: endpoint, + timeout: Halite::Timeout.new( + connect: @connect_timeout, + read: @read_timeout + ), + follow: Halite::Follow.new( + hops: @max_redirects, + strict: false + ), + headers: headers, + ) + + # session = session.logging(skip_request_body: true, skip_response_body: true) + + @sessions[key] = session + end + + @sessions[key] + end + + # Destroys an HTTP session for the given scheme, host, and port. + def kill!(url) + # normalize the url + url = URI.parse(url) unless url.is_a?(URI) + + # session key + key = key_for(url) + + if sess = @sessions[key] + @sessions.delete(key) + end + end + + # Clears the session cache + def clear + @sessions.clear + end + + # Creates a session key based on the URL + private def key_for(url) + {url.scheme, url.host, url.port} + end + end +end diff --git a/src/arachnid/version.cr b/src/arachnid/version.cr new file mode 100644 index 0000000..385046e --- /dev/null +++ b/src/arachnid/version.cr @@ -0,0 +1,3 @@ +module Arachnid + VERSION = "0.1.0" +end
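
A quick usage sketch for the `Rules` and `SessionCache` classes introduced above. This is hypothetical and not part of the patch; the URLs and timeout values are examples only, and it assumes the classes compile as defined in the diff.

```crystal
require "arachnid"

# Accept-list semantics: once any accept rule is present, only matching
# data is accepted and the reject list is ignored.
rules = Arachnid::Rules(String).new
rules.accept << /crystal/

rules.accept?("https://crystal-lang.org/docs") # => true
rules.accept?("https://example.com")           # => false
rules.reject?("https://example.com")           # => true

# Sessions are cached per {scheme, host, port}, so repeated requests to the
# same origin reuse a single Halite::Client.
sessions = Arachnid::SessionCache.new(read_timeout: 5, connect_timeout: 5)
sessions["https://crystal-lang.org/docs"]    # builds the client for this origin
sessions.active?("https://crystal-lang.org") # => true
sessions.kill!("https://crystal-lang.org")   # drops the cached client
```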