Updated rules and events

Chris Watson 2019-06-26 18:50:41 -07:00
parent e11abaaf98
commit 42b4c9c739
No known key found for this signature in database
GPG Key ID: 37DAEF5F446370A4
4 changed files with 41 additions and 277 deletions

@@ -41,9 +41,9 @@ module Arachnid
# Pass the headers from every response the agent receives to a given
# block.
def all_headers(&block)
headers = [] of HTTP::Headers
every_resource { |resource| headers << resource.headers }
headers.each { |header| yield headers }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource.headers)
}
end
# Pass every resource that the agent visits to a given block.
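For context, the reworked event methods push the given block onto @every_resource_blocks instead of building a local array at registration time, so each block now fires once per fetched resource. A minimal usage sketch, assuming spider is the agent object that exposes these methods (its construction is not shown in this diff):

# Illustrative usage only; the receiver and its constructor are assumed.
spider.all_headers do |headers|
  # Called once for every response the agent receives.
  puts headers["Content-Type"]?
end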
@@ -54,66 +54,66 @@ module Arachnid
# Pass every OK resource that the agent visits to a given block.
def every_ok_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.ok? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.ok?
}
end
# Pass every Redirect resource that the agent visits to a given block.
def every_redirect_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.redirect? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.redirect?
}
end
# Pass every Timeout resource that the agent visits to a given block.
def every_timedout_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.timeout? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.timeout?
}
end
# Pass every Bad Request resource that the agent visits to a given block.
def every_bad_request_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.bad_request? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.bad_request?
}
end
# Pass every Unauthorized resource that the agent visits to a given block.
def every_unauthorized_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.unauthorized? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.unauthorized?
}
end
# Pass every Forbidden resource that the agent visits to a given block.
def every_forbidden_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.forbidden? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.forbidden?
}
end
# Pass every Missing resource that the agent visits to a given block.
def every_missing_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.missing? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.missing?
}
end
# Pass every Internal Server Error resource that the agent visits to a
# given block.
def every_internal_server_error_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.had_internal_server_error? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.had_internal_server_error?
}
end
# Pass every Plain Text resource that the agent visits to a given block.
def every_txt_page(&block : Resource ->)
resources = [] of Resource
every_resource { |resource| (resources << resource) if resource.txt? }
resources.each { |resource| yield resource }
@every_resource_blocks << ->(resource : Resource) {
block.call(resource) if resource.txt?
}
end
# Pass every HTML resource that the agent visits to a given block.
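The dispatch side of @every_resource_blocks is not shown in this diff. A minimal sketch of how the agent might invoke the registered procs, assuming it keeps them in an array and runs each one for every resource it fetches (the method name is illustrative):

# Sketch only; the real agent's internals may differ.
private def run_resource_callbacks(resource : Resource)
  @every_resource_blocks.each do |callback|
    callback.call(resource)
  end
end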

@@ -1,8 +1,6 @@
require "../robots"
module Arachnid
class Agent
@robots : Arachnid::Robots? = nil
# @robots : Arachnid::Robots? = nil
# Initializes the robots filter.
def initialize_robots
@@ -10,11 +8,11 @@ module Arachnid
end
# Determines whether a URL is allowed by the robot policy.
def robot_allowed?(url)
if robots = @robots
return robots.allowed?(url)
end
true
end
# def robot_allowed?(url)
# if robots = @robots
# return robots.allowed?(url)
# end
# true
# end
end
end

@@ -1,231 +0,0 @@
require "uri"
module Arachnid
# Parses robots.txt files for the perusal of a single user-agent.
#
# The behaviour implemented is guided by the following sources, though
# as there is no widely accepted standard, it may differ from other implementations.
# If you consider its behaviour to be in error, please contact the author.
#
# http://www.robotstxt.org/orig.html
# - the original, now imprecise and outdated version
# http://www.robotstxt.org/norobots-rfc.txt
# - a much more precise, outdated version
# http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449&from=35237
# - a few hints at modern protocol extensions.
#
# This parser only considers lines starting with (case-insensitively:)
# Useragent: User-agent: Allow: Disallow: Sitemap:
#
# The file is divided into sections, each of which contains one or more User-agent:
# lines, followed by one or more Allow: or Disallow: rules.
#
# The first section that contains a User-agent: line that matches the robot's
# user-agent is the only section relevant to that robot. The sections are checked
# in the same order as they appear in the file.
#
# (The * character is taken to mean "any number of any characters" during matching of
# user-agents)
#
# Within that section, the first Allow: or Disallow: rule that matches the expression
# is taken as authoritative. If no rule in a section matches, the access is Allowed.
#
# (The order of matching is as in the RFC; Google matches all Allows and then all Disallows,
# while Bing matches the most specific rule; I'm sure there are other interpretations)
#
# When matching urls, all % encodings are normalised (except for /?=& which have meaning)
# and "*"s match any number of any character.
#
# If a pattern ends with a $, then the pattern must match the entire path, or the entire
# path with query string.
#
# TODO: Rework to allow for multiple Robots
class Robots
alias Rule = Tuple(String, Bool)
alias RuleSet = Tuple(String, Array(Rule))
getter body : String
getter user_agent : String
getter rules : Array(Tuple(String, Array(Rule)))
getter sitemaps : Array(String)
def initialize(@body : String, @user_agent : String)
@sitemaps = [] of String
@rules = [] of RuleSet
parse(@body)
end
# Given a URI object, or a string representing one, determine whether this
# robots.txt would allow access to the path.
def allowed?(uri)
uri = URI.parse(uri)
path = (uri.path || "/") + (uri.query ? "?" + uri.query.to_s : "")
path_allowed?(@user_agent, path)
end
# Check whether the relative path (a string of the url's path and query
# string) is allowed by the rules we have for the given user_agent.
#
private def path_allowed?(user_agent, path)
@rules.each do |(ua_glob, path_globs)|
if match_ua_glob user_agent, ua_glob
path_globs.each do |(path_glob, allowed)|
return allowed if match_path_glob path, path_glob
end
return true
end
end
true
end
# This does a case-insensitive substring match such that if the user agent
# is contained within the glob, or vice-versa, we will match.
#
# According to the standard, *s shouldn't appear in the user-agent field
# except in the case of "*" meaning all user agents. Google however imply
# that the * will work, at least at the end of a string.
#
# For consistency, and because it seems expected behaviour, and because
# a glob * will match a literal * we use glob matching not string matching.
#
# The standard also advocates a substring match of the robot's user-agent
# within the user-agent field. From observation, it seems much more likely
# that the match will be the other way about, though we check for both.
#
private def match_ua_glob(user_agent, glob)
glob =~ Regex.new(Regex.escape(user_agent), Regex::Options::IGNORE_CASE) ||
user_agent =~ Regex.new(reify(glob), Regex::Options::IGNORE_CASE)
end
# This does case-sensitive prefix matching, such that if the path starts
# with the glob, we will match.
#
# According to the standard, that's it. However, it seems reasonably common
# for asterisks to be interpreted as though they were globs.
#
# Additionally, some search engines, like Google, will treat a trailing $
# sign as forcing the glob to match the entire path - whether including
# or excluding the query string is not clear, so we check both.
#
# (i.e. it seems likely that a site owner who has Disallow: *.pdf$ expects
# to disallow requests to *.pdf?i_can_haz_pdf, which the robot could, if
# it were feeling malicious, construe.)
#
# With URLs there is the additional complication that %-encoding can give
# multiple representations of identical URLs; this is handled by
# normalize_percent_encoding.
#
private def match_path_glob(path, glob)
if glob =~ /\$$/
end_marker = "(?:\?|$)"
glob = glob.gsub /\$$/, ""
else
end_marker = ""
end
glob = normalize_percent_encoding(glob)
path = normalize_percent_encoding(path)
path =~ Regex.new("^" + reify(glob) + end_marker)
rescue e
false
end
# As a general rule, we want to ignore different representations of the
# same URL. Naively we could just unescape, or escape, everything, however
# the standard implies that a / is an HTTP path separator, while a %2F is an
# encoded / that does not act as a path separator. Similar issues with ?, &
# and =, though all other characters are fine. (While : also has a special
# meaning in HTTP, most implementations ignore this in the path)
#
# It's also worth noting that %-encoding is case-insensitive, so we
# explicitly upcase the few that we want to keep.
#
private def normalize_percent_encoding(path)
# First double-escape any characters we don't want to unescape
# & / = ?
path = path.gsub(/%(26|2F|3D|3F)/i) do |code|
"%25#{code.upcase}"
end
URI.unescape(path)
end
# Convert the asterisks in a glob into (.*)s for regular expressions,
# and at the same time, escape any other characters that would have
# a significance in a regex.
#
private def reify(glob)
glob.split("*").map { |part| Regex.escape(part) }.join(".*")
end
# Convert the @body into a set of @rules so that our parsing mechanism
# becomes easier.
#
# @rules is an array of pairs. The first in the pair is the glob for the
# user-agent and the second another array of pairs. The first of the new
# pair is a glob for the path, and the second whether it appears in an
# Allow: or a Disallow: rule.
#
# For example:
#
# User-agent: *
# Disallow: /secret/
# Allow: / # allow everything...
#
# Would be parsed so that:
#
# @rules = [["*", [ ["/secret/", false], ["/", true] ]]]
#
#
# The order of the arrays is maintained so that the first match in the file
# is obeyed as indicated by the pseudo-RFC on http://robotstxt.org/. There
# are alternative interpretations: some parse by specificity of glob, and
# some check Allow lines for any match before Disallow lines. All are
# justifiable, but we could only pick one.
#
# Note that a blank Disallow: should be treated as an Allow: * and multiple
# user-agents may share the same set of rules.
#
private def parse(body)
body.split(/[\r\n]+/).each do |line|
prefix, value = line.delete("\000").split(":", 2).map(&.strip)
value = value.sub /\s+#.*/, "" if value
parser_mode = :begin
if prefix && value
case prefix.downcase
when /^user-?agent$/
if parser_mode == :user_agent
@rules << {value, rules.last[1]}
else
parser_mode = :user_agent
@rules << {value, [] of Rule}
end
when "disallow"
parser_mode = :rules
@rules << {"*", [] of Rule} if @rules.empty?
if value == ""
@rules.last[1] << {"*", true}
else
@rules.last[1] << {value, false}
end
when "allow"
parser_mode = :rules
@rules << {"*", [] of Rule} if @rules.empty?
@rules.last[1] << {value, true}
when "sitemap"
@sitemaps << value
else
# Ignore comments, Crawl-delay: and badly formed lines.
end
end
end
end
end
end
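Since this commit deletes the Robots parser outright, a short usage sketch of the removed class, based on its own constructor and allowed? method, illustrates the old behaviour; the robots.txt body and URLs are made up for the example:

body = "User-agent: *\nDisallow: /secret/\nDisallow: /*.pdf$\nAllow: /"
robots = Arachnid::Robots.new(body, "Arachnid")

robots.allowed?("https://example.com/secret/page")       # => false (first matching rule wins)
robots.allowed?("https://example.com/report.pdf?inline") # => false (a trailing $ also matches before the query string)
robots.allowed?("https://example.com/index.html")        # => true  (falls through to Allow: /)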

@@ -16,13 +16,10 @@ module Arachnid
# Determines whether the data should be accepted or rejected.
def accept?(data : T)
return true if accept.empty? && reject.empty?
unless @accept.empty?
@accept.any? { |rule| test_data(data, rule) }
else
!@reject.any? { |rule| test_data(data, rule) }
end
result = true
result = @accept.any? { |rule| test_data(data, rule) } unless @accept.empty?
result = !@reject.any? { |rule| test_data(data, rule) } unless @reject.empty? || result == false
result
end
def accept=(value)
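The rewritten accept? reduces to: with no rules everything passes; when accept rules exist the data must match at least one of them; when reject rules exist it must match none. A standalone restatement using Regex rules (the real filter's generic rule type and test_data are not shown in this hunk):

def accepts?(value : String, accept_rules : Array(Regex), reject_rules : Array(Regex)) : Bool
  return true if accept_rules.empty? && reject_rules.empty?
  result = true
  result = accept_rules.any? { |rule| value =~ rule } unless accept_rules.empty?
  result = !reject_rules.any? { |rule| value =~ rule } unless reject_rules.empty? || result == false
  result
end

accepts?("https://example.com/a.html", [/example\.com/], [/\.pdf$/]) # => true
accepts?("https://example.com/a.pdf", [/example\.com/], [/\.pdf$/])  # => false (vetoed by a reject rule)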