Updated rules and events

Chris Watson 2019-06-26 18:50:41 -07:00
parent e11abaaf98
commit 42b4c9c739
No known key found for this signature in database
GPG Key ID: 37DAEF5F446370A4
4 changed files with 41 additions and 277 deletions

@@ -41,9 +41,9 @@ module Arachnid
# Pass the headers from every response the agent receives to a given
# block.
def all_headers(&block)
headers = [] of HTTP::Headers
every_resource { |resource| headers << resource.headers }
headers.each { |header| yield headers }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource.headers)
}
end
# Pass every resource that the agent visits to a given block.
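For context, the reworked event methods push the given block onto @every_resource_blocks instead of building a local array at registration time, so each block now fires once per fetched resource. A minimal usage sketch, assuming spider is the agent object that exposes these methods (its construction is not shown in this diff):

# Illustrative usage only; the receiver and its constructor are assumed.
spider.all_headers do |headers|
  # Called once for every response the agent receives.
  puts headers["Content-Type"]?
end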
@@ -54,66 +54,66 @@ module Arachnid
# Pass every OK resource that the agent visits to a given block.
def every_ok_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.ok? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.ok?
}
end
# Pass every Redirect resource that the agent visits to a given block.
def every_redirect_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.redirect? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.redirect?
}
end
# Pass every Timeout resource that the agent visits to a given block.
def every_timedout_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.timeout? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.timeout?
}
end
# Pass every Bad Request resource that the agent visits to a given block.
def every_bad_request_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.bad_request? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.bad_request?
}
end
# Pass every Unauthorized resource that the agent visits to a given block.
def every_unauthorized_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.unauthorized? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.unauthorized?
}
end
# Pass every Forbidden resource that the agent visits to a given block.
def every_forbidden_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.forbidden? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.forbidden?
}
end
# Pass every Missing resource that the agent visits to a given block.
def every_missing_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.missing? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.missing?
}
end
# Pass every Internal Server Error resource that the agent visits to a
# given block.
def every_internal_server_error_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.had_internal_server_error? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.had_internal_server_error?
}
end
# Pass every Plain Text resource that the agent visits to a given block.
def every_txt_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.txt? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.txt?
}
end
# Pass every HTML resource that the agent visits to a given block.
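The dispatch side of @every_resource_blocks is not shown in this diff. A minimal sketch of how the agent might invoke the registered procs, assuming it keeps them in an array and runs each one for every resource it fetches (the method name is illustrative):

# Sketch only; the real agent's internals may differ.
private def run_resource_callbacks(resource : Resource)
  @every_resource_blocks.each do |callback|
    callback.call(resource)
  end
end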

@@ -1,8 +1,6 @@
require "../robots"
module Arachnid
class Agent
@robots : Arachnid::Robots? = nil
# @robots : Arachnid::Robots? = nil
# Initializes the robots filter.
def initialize_robots
@@ -10,11 +8,11 @@ module Arachnid
end
# Determines whether a URL is allowed by the robot policy.
def robot_allowed?(url)
if robots = @robots
return robots.allowed?(url)
end
true
end
# def robot_allowed?(url)
# if robots = @robots
# return robots.allowed?(url)
# end
# true
# end
end
end

@@ -1,231 +0,0 @@
require "uri"
module Arachnid
# Parses robots.txt files for the perusal of a single user-agent.
#
# The behaviour implemented is guided by the following sources, though
# as there is no widely accepted standard, it may differ from other implementations.
# If you consider its behaviour to be in error, please contact the author.
#
# http://www.robotstxt.org/orig.html
# - the original, now imprecise and outdated version
# http://www.robotstxt.org/norobots-rfc.txt
# - a much more precise, outdated version
# http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449&from=35237
# - a few hints at modern protocol extensions.
#
# This parser only considers lines starting with (case-insensitively:)
# Useragent: User-agent: Allow: Disallow: Sitemap:
#
# The file is divided into sections, each of which contains one or more User-agent:
# lines, followed by one or more Allow: or Disallow: rules.
#
# The first section that contains a User-agent: line that matches the robot's
# user-agent is the only section relevant to that robot. The sections are checked
# in the same order as they appear in the file.
#
# (The * character is taken to mean "any number of any characters" during matching of
# user-agents)
#
# Within that section, the first Allow: or Disallow: rule that matches the expression
# is taken as authoritative. If no rule in a section matches, the access is Allowed.
#
# (The order of matching is as in the RFC; Google matches all Allows and then all Disallows,
# while Bing matches the most specific rule; I'm sure there are other interpretations)
#
# When matching urls, all % encodings are normalised (except for /?=& which have meaning)
# and "*"s match any number of any character.
#
# If a pattern ends with a $, then the pattern must match the entire path, or the entire
# path with query string.
#
# TODO: Rework to allow for multiple Robots
class Robots
alias Rule = Tuple(String, Bool)
alias RuleSet = Tuple(String, Array(Rule))
getter body : String
getter user_agent : String
getter rules : Array(Tuple(String, Array(Rule)))
getter sitemaps : Array(String)
def initialize(@body : String, @user_agent : String)
@sitemaps = [] of String
@rules = [] of RuleSet
parse(@body)
end
# Given a URI object, or a string representing one, determine whether this
# robots.txt would allow access to the path.
def allowed?(uri)
uri = URI.parse(uri)
path = (uri.path || "/") + (uri.query ? "?" + uri.query.to_s : "")
path_allowed?(@user_agent, path)
end
# Check whether the relative path (a string of the url's path and query
# string) is allowed by the rules we have for the given user_agent.
#
private def path_allowed?(user_agent, path)
@rules.each do |(ua_glob, path_globs)|
if match_ua_glob user_agent, ua_glob
path_globs.each do |(path_glob, allowed)|
return allowed if match_path_glob path, path_glob
end
return true
end
end
true
end
# This does a case-insensitive substring match such that if the user agent
# is contained within the glob, or vice-versa, we will match.
#
# According to the standard, *s shouldn't appear in the user-agent field
# except in the case of "*" meaning all user agents. Google however imply
# that the * will work, at least at the end of a string.
#
# For consistency, and because it seems expected behaviour, and because
# a glob * will match a literal * we use glob matching not string matching.
#
# The standard also advocates a substring match of the robot's user-agent
# within the user-agent field. From observation, it seems much more likely
# that the match will be the other way about, though we check for both.
#
private def match_ua_glob(user_agent, glob)
glob =~ Regex.new(Regex.escape(user_agent), Regex::Options::IGNORE_CASE) ||
user_agent =~ Regex.new(reify(glob), Regex::Options::IGNORE_CASE)
end
# This does case-sensitive prefix matching, such that if the path starts
# with the glob, we will match.
#
# According to the standard, that's it. However, it seems reasonably common
# for asterisks to be interpreted as though they were globs.
#
# Additionally, some search engines, like Google, will treat a trailing $
# sign as forcing the glob to match the entire path - whether including
# or excluding the query string is not clear, so we check both.
#
# (i.e. it seems likely that a site owner who has Disallow: *.pdf$ expects
# to disallow requests to *.pdf?i_can_haz_pdf, which the robot could, if
# it were feeling malicious, construe.)
#
# With URLs there is the additional complication that %-encoding can give
# multiple representations of identical URLs; this is handled by
# normalize_percent_encoding.
#
private def match_path_glob(path, glob)
if glob =~ /\$$/
end_marker = "(?:\?|$)"
glob = glob.gsub /\$$/, ""
else
end_marker = ""
end
glob = normalize_percent_encoding(glob)
path = normalize_percent_encoding(path)
path =~ Regex.new("^" + reify(glob) + end_marker)
rescue e
false
end
# As a general rule, we want to ignore different representations of the
# same URL. Naively we could just unescape, or escape, everything, however
# the standard implies that a / is an HTTP path separator, while a %2F is an
# encoded / that does not act as a path separator. Similar issues with ?, &
# and =, though all other characters are fine. (While : also has a special
# meaning in HTTP, most implementations ignore this in the path)
#
# It's also worth noting that %-encoding is case-insensitive, so we
# explicitly upcase the few that we want to keep.
#
private def normalize_percent_encoding(path)
# First double-escape any characters we don't want to unescape
# & / = ?
path = path.gsub(/%(26|2F|3D|3F)/i) do |code|
"%25#{code.upcase}"
end
URI.unescape(path)
end
# Convert the asterisks in a glob into (.*)s for regular expressions,
# and at the same time, escape any other characters that would have
# a significance in a regex.
#
private def reify(glob)
glob.split("*").map { |part| Regex.escape(part) }.join(".*")
end
# Convert the @body into a set of @rules so that our parsing mechanism
# becomes easier.
#
# @rules is an array of pairs. The first in the pair is the glob for the
# user-agent and the second another array of pairs. The first of the new
# pair is a glob for the path, and the second whether it appears in an
# Allow: or a Disallow: rule.
#
# For example:
#
# User-agent: *
# Disallow: /secret/
# Allow: / # allow everything...
#
# Would be parsed so that:
#
# @rules = [["*", [ ["/secret/", false], ["/", true] ]]]
#
#
# The order of the arrays is maintained so that the first match in the file
# is obeyed as indicated by the pseudo-RFC on http://robotstxt.org/. There
# are alternative interpretations: some parse by specificity of glob, and
# some check Allow lines for any match before Disallow lines. All are
# justifiable, but we could only pick one.
#
# Note that a blank Disallow: should be treated as an Allow: * and multiple
# user-agents may share the same set of rules.
#
private def parse(body)
body.split(/[\r\n]+/).each do |line|
prefix, value = line.delete("\000").split(":", 2).map(&.strip)
value = value.sub /\s+#.*/, "" if value
parser_mode = :begin
if prefix && value
case prefix.downcase
when /^user-?agent$/
if parser_mode == :user_agent
@rules << {value, rules.last[1]}
else
parser_mode = :user_agent
@rules << {value, [] of Rule}
end
when "disallow"
parser_mode = :rules
@rules << {"*", [] of Rule} if @rules.empty?
if value == ""
@rules.last[1] << {"*", true}
else
@rules.last[1] << {value, false}
end
when "allow"
parser_mode = :rules
@rules << {"*", [] of Rule} if @rules.empty?
@rules.last[1] << {value, true}
when "sitemap"
@sitemaps << value
else
# Ignore comments, Crawl-delay: and badly formed lines.
end
end
end
end
end
end
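Since this commit deletes the Robots parser outright, a short usage sketch of the removed class, based on its own constructor and allowed? method, illustrates the old behaviour; the robots.txt body and URLs are made up for the example:

body = "User-agent: *\nDisallow: /secret/\nDisallow: /*.pdf$\nAllow: /"
robots = Arachnid::Robots.new(body, "Arachnid")

robots.allowed?("https://example.com/secret/page")       # => false (first matching rule wins)
robots.allowed?("https://example.com/report.pdf?inline") # => false (a trailing $ also matches before the query string)
robots.allowed?("https://example.com/index.html")        # => true  (falls through to Allow: /)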

@@ -16,13 +16,10 @@ module Arachnid
# Determines whether the data should be accepted or rejected.
def accept?(data : T)
return true if accept.empty? && reject.empty?
unless @accept.empty?
@accept.any? { |rule| test_data(data, rule) }
else
!@reject.any? { |rule| test_data(data, rule) }
end
result = true
result = @accept.any? { |rule| test_data(data, rule) } unless @accept.empty?
result = !@reject.any? { |rule| test_data(data, rule) } unless @reject.empty? || result == false
result
end
def accept=(value)
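The rewritten accept? reduces to: with no rules everything passes; when accept rules exist the data must match at least one of them; when reject rules exist it must match none. A standalone restatement using Regex rules (the real filter's generic rule type and test_data are not shown in this hunk):

def accepts?(value : String, accept_rules : Array(Regex), reject_rules : Array(Regex)) : Bool
  return true if accept_rules.empty? && reject_rules.empty?
  result = true
  result = accept_rules.any? { |rule| value =~ rule } unless accept_rules.empty?
  result = !reject_rules.any? { |rule| value =~ rule } unless reject_rules.empty? || result == false
  result
end

accepts?("https://example.com/a.html", [/example\.com/], [/\.pdf$/]) # => true
accepts?("https://example.com/a.pdf", [/example\.com/], [/\.pdf$/])  # => false (vetoed by a reject rule)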