Initial commit

Chris Watson 2019-06-26 02:45:03 -07:00
commit 9b82f6b48a
No known key found for this signature in database
GPG Key ID: 37DAEF5F446370A4
30 changed files with 2895 additions and 0 deletions

9
.editorconfig Normal file

@ -0,0 +1,9 @@
root = true
[*.cr]
charset = utf-8
end_of_line = lf
insert_final_newline = true
indent_style = space
indent_size = 2
trim_trailing_whitespace = true

9
.gitignore vendored Normal file

@ -0,0 +1,9 @@
/docs/
/lib/
/bin/
/.shards/
*.dwarf
# Libraries don't need dependency lock
# Dependencies will be locked in applications that use them
/shard.lock

6
.travis.yml Normal file

@ -0,0 +1,6 @@
language: crystal
# Uncomment the following if you'd like Travis to run specs and check code formatting
# script:
# - crystal spec
# - crystal tool format --check

21
LICENSE Normal file

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2019 Chris Watson
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

95
README.md Normal file

@ -0,0 +1,95 @@
# Arachnid
Arachnid is a fast and powerful web scraping framework for Crystal. It provides an easy-to-use DSL for scraping web pages and processing everything you might come across.
## Installation
1. Add the dependency to your `shard.yml`:
```yaml
dependencies:
arachnid:
github: watzon/arachnid
```
2. Run `shards install`
## Usage
Arachnid provides an easy-to-use, powerful DSL for scraping websites.
```crystal
require "arachnid"
require "json"
# Let's build a sitemap of crystal-lang.org
# Links will be a hash of url to page title
links = {} of String => String
# Visit a particular host, in this case `crystal-lang.org`. This will
# not match on subdomains.
Arachnid.host("https://crystal-lang.org") do |spider|
# Ignore the API section. It's a little big.
spider.ignore_urls_like(/.*\/api.*/)
spider.every_page do |page|
puts "Visiting #{page.url.to_s}"
# Ignore redirects for our sitemap
unless page.redirect?
# Add the url of every visited page to our sitemap
links[page.url.to_s] = page.title.to_s.strip
end
end
end
File.write("crystal-lang.org-sitemap.json", links.to_pretty_json)
```
Want to scan external links as well?
```crystal
# To make things interesting, this time let's download
# every image we find.
Arachnid.start_at("https://crystal-lang.org") do |spider|
# Set a base path to store all the images at
base_image_dir = File.expand_path("~/Pictures/arachnid")
Dir.mkdir_p(base_image_dir)
spider.every_page do |page|
puts "Scanning #{page.url.to_s}"
if page.image?
# Since we're going to be saving a lot of images
# let's spawn a new fiber for each one. This
# makes things so much faster.
spawn do
# Output directory for images for this host
directory = File.join(base_image_dir, page.url.host.to_s)
Dir.mkdir_p(directory)
# The name of the image
filename = File.basename(page.url.path)
# Save the image using the body of the page
puts "Saving #{filename} to #{directory}"
File.write(File.join(directory, filename), page.body)
end
end
end
end
```
More documentation will be coming soon!
## Contributing
1. Fork it (<https://github.com/watzon/arachnid/fork>)
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Commit your changes (`git commit -am 'Add some feature'`)
4. Push to the branch (`git push origin my-new-feature`)
5. Create a new Pull Request
## Contributors
- [Chris Watson](https://github.com/watzon) - creator and maintainer

17
shard.yml Normal file

@ -0,0 +1,17 @@
name: arachnid
version: 0.1.0
authors:
- Chris Watson <chris@watzon.me>
dependencies:
halite:
github: icyleaf/halite
version: ~> 0.10.1
crystagiri:
github: madeindjs/crystagiri
branch: master
crystal: 0.29.0
license: MIT

9
spec/crepe_spec.cr Normal file

@ -0,0 +1,9 @@
require "./spec_helper"
describe Arachnid do
# TODO: Write tests
it "works" do
false.should eq(true)
end
end

2
spec/spec_helper.cr Normal file

@ -0,0 +1,2 @@
require "spec"
require "../src/arachnid"

32
src/arachnid.cr Normal file

@ -0,0 +1,32 @@
require "./arachnid/version"
require "./arachnid/arachnid"
# To make things interesting, this time let's download
# every image we find.
Arachnid.start_at("https://crystal-lang.org") do |spider|
# Set a base path to store all the images at
base_image_dir = File.expand_path("~/Pictures/arachnid")
Dir.mkdir_p(base_image_dir)
spider.every_page do |page|
puts "Scanning #{page.url.to_s}"
if page.image?
# Since we're going to be saving a lot of images
# let's spawn a new fiber for each one. This
# makes things so much faster.
spawn do
# Output directory for images for this host
directory = File.join(base_image_dir, page.url.host.to_s)
Dir.mkdir_p(directory)
# The name of the image
filename = File.basename(page.url.path)
# Save the image using the body of the page
puts "Saving #{filename} to #{directory}"
File.write(File.join(directory, filename), page.body)
end
end
end
end

543
src/arachnid/agent.cr Normal file

@ -0,0 +1,543 @@
require "./agent/sanitizers"
require "./agent/filters"
require "./agent/events"
require "./agent/actions"
require "./agent/robots"
require "./page"
require "./session_cache"
require "./cookie_jar"
require "./auth_store"
module Arachnid
class Agent
getter? running : Bool
# Set to limit to a single host.
property host : String?
# User agent to use.
property user_agent : String
# HTTP Host Header to use.
property host_header : String?
# HTTP Host Headers to use for specific hosts.
property host_headers : Hash(String | Regex, String)
# HTTP Headers to use for every request.
property default_headers : Hash(String, String)
# HTTP Authentication credentials.
property authorized : AuthStore
# Referer to use.
property referer : String?
# Delay in between fetching pages.
property fetch_delay : Time::Span | Int32
# History containing visited URLs.
getter history : Set(URI)
# List of unreachable URIs.
getter failures : Set(URI)
# Queue of URLs to visit.
getter queue : Array(URI)
# The session cache.
property sessions : SessionCache
# Cached cookies.
property cookies : CookieJar
# Maximum number of pages to visit.
property limit : Int32?
# Maximum depth.
property max_depth : Int32?
# The visited URLs and their depth within a site.
property levels : Hash(URI, Int32)
# Creates a new `Agent` object.
def initialize(
host : String? = nil,
read_timeout : Int32? = nil,
connect_timeout : Int32? = nil,
follow_redirects : Bool? = nil,
max_redirects : Int32? = nil,
do_not_track : Bool? = nil,
default_headers : Hash(String, String)? = nil,
host_header : String? = nil,
host_headers : Hash(String | Regex, String)? = nil,
user_agent : String? = nil,
referer : String? = nil,
fetch_delay : (Int32 | Time::Span)? = nil,
queue : Set(URI)? = nil,
history : Set(URI)? = nil,
limit : Int32? = nil,
max_depth : Int32? = nil,
robots : Bool? = nil,
filter_options = nil
)
@host = host
@host_header = host_header
@host_headers = host_headers || {} of (Regex | String) => String
@default_headers = default_headers || {} of String => String
@user_agent = user_agent || Arachnid.user_agent
@referer = referer
@running = false
@fetch_delay = fetch_delay || 0
@history = history || Set(URI).new
@failures = Set(URI).new
@queue = queue || [] of URI
@limit = limit
@levels = {} of URI => Int32
@max_depth = max_depth
@sessions = SessionCache.new(
read_timeout,
connect_timeout,
follow_redirects,
max_redirects,
do_not_track
)
@cookies = CookieJar.new
@authorized = AuthStore.new
if filter_options
initialize_filters(**filter_options)
else
initialize_filters
end
initialize_robots if robots || Arachnid.robots?
end
# Create a new scoped `Agent` in a block.
def self.new(**options, &block : Agent ->)
_new = new(**options)
with _new yield _new
_new
end
# Creates a new `Agent` and begins spidering at the given URL.
def self.start_at(url, **options, &block : Agent ->)
agent = new(**options, &block)
agent.start_at(url, force: true)
end
# Creates a new `Agent` and spiders the web site located
# at the given URL.
def self.site(url, **options, &block : Agent ->)
url = url.is_a?(URI) ? url : URI.parse(url)
url_regex = Regex.new(Regex.escape(url.host.to_s))
agent = new(**options, &block)
agent.visit_hosts_like(url_regex)
agent.start_at(url, force: true)
end
# Creates a new `Agent` and spiders the given host.
def self.host(url, **options, &block : Agent ->)
url = url.is_a?(URI) ? url : URI.parse(url)
options = options.merge(host: url.host)
agent = new(**options, &block)
agent.start_at(url, force: true)
end
# Clears the history of the `Agent`.
def clear
@queue.clear
@history.clear
@failures.clear
self
end
# Start spidering at a given URL.
# def start_at(url, &block : Page ->)
# enqueue(url)
# run(&block)
# end
# Start spidering at a given URL.
def start_at(url, force = false)
enqueue(url, force: force)
return run
end
# Start spidering until the queue becomes empty or the
# agent is paused.
# def run(&block : Page ->)
# @running = true
# until @queue.empty? || paused? || limit_reached?
# begin
# visit_page(dequeue, &block)
# rescue Actions::Paused
# return self
# rescue Actions::Action
# end
# end
# @running = false
# @sessions.clear
# self
# end
# Start spidering until the queue becomes empty or the
# agent is paused.
def run
@running = true
until @queue.empty? || paused? || limit_reached? || !running?
begin
visit_page(dequeue)
rescue Actions::Paused
return self
rescue Actions::Action
end
end
@running = false
@sessions.clear
self
end
# Sets the history of URLs that were previously visited.
def history=(new_history)
@history.clear
new_history.each do |url|
@history << (url.is_a?(URI) ? url : URI.parse(url))
end
@history
end
# Specifies the links which have been visited.
def visited_links
@history.map(&.to_s)
end
# Specifies the hosts which have been visited.
def visited_hosts
history.map(&.host)
end
# Determines whether a URL was visited or not.
def visited?(url)
url = url.is_a?(URI) ? url : URI.parse(url)
@history.includes?(url)
end
# Sets the list of failed URLs.
def failures=(new_failures)
@failures.clear
new_failures.each do |url|
@failures << (url.is_a?(URI) ? url : URI.parse(url))
end
@failures
end
# Determines whether a given URL could not be visited.
def failed?(url)
url = url.is_a?(URI) ? url : URI.parse(url)
@failures.includes?(url)
end
# Sets the queue of URLs to visit.
# Sets the list of failed URLs.
def queue=(new_queue)
@queue.clear
new_queue.each do |url|
@queue << (url.is_a?(URI) ? url : URI.parse(url))
end
@queue
end
# Determines whether the given URL has been queued for visiting.
def queued?(url)
url = url.is_a?(URI) ? url : URI.parse(url)
@queue.includes?(url)
end
# Enqueues a given URL for visiting, only if it passes all
# of the agent's rules for visiting a given URL.
def enqueue(url, level = 0, force = false)
url = sanitize_url(url)
if (!queued?(url) && visit?(url)) || force
link = url.to_s
return if url.host.to_s.empty?
begin
@every_url_blocks.each { |url_block| url_block.call(url) }
@every_url_like_blocks.each do |pattern, url_blocks|
match = case pattern
when Regex
link =~ pattern
else
(pattern == link) || (pattern == url)
end
if match
url_blocks.each { |url_block| url_block.call(url) }
end
end
rescue action : Actions::Paused
raise(action)
rescue Actions::SkipLink
return false
rescue Actions::Action
end
@queue << url
@levels[url] = level
true
end
end
# Gets and creates a new `Page` object from a given URL,
# yielding the newly created page.
def get_page(url, &block)
url = url.is_a?(URI) ? url : URI.parse(url)
prepare_request(url) do |session, path, handlers|
new_page = Page.new(url, session.get(path, headers: handlers))
# save any new cookies
@cookies.from_page(new_page)
yield new_page
return new_page
end
end
# Gets and creates a new `Page` object from a given URL.
def get_page(url)
url = url.is_a?(URI) ? url : URI.parse(url)
prepare_request(url) do |session, path, handlers|
new_page = Page.new(url, session.get(path, handlers))
# save any new cookies
@cookies.from_page(new_page)
return new_page
end
end
# Posts supplied form data and creates a new Page from a given URL,
# yielding the newly created page.
def post_page(url, post_data = "", &block)
url = url.is_a?(URI) ? url : URI.parse(url)
prepare_request(url) do |session, path, handlers|
new_page = Page.new(url, session.post(path, post_data, handlers))
# save any new cookies
@cookies.from_page(new_page)
yield new_page
return new_page
end
end
# Posts supplied form data and creates a new Page from a given URL.
def post_page(url, post_data = "")
url = url.is_a?(URI) ? url : URI.parse(url)
prepare_request(url) do |session, path, handlers|
new_page = Page.new(url, session.post(path, post_data, handlers))
# save any new cookies
@cookies.from_page(new_page)
return new_page
end
end
# Visits a given URL and enqueues the links recovered
# from the page to be visited later.
# def visit_page(url, &block : Page ->)
# url = sanitize_url(url)
# get_page(url) do |page|
# @history << page.url
# begin
# @every_page_blocks.each { |page_block| page_block.call(page) }
# yield page
# rescue action : Actions::Paused
# raise(action)
# rescue Actions::SkipPage
# return Nil
# rescue Actions::Action
# end
# page.each_url do |next_url|
# begin
# @every_link_blocks.each do |link_block|
# link_block.call(page.url, next_url)
# end
# rescue action : Actions::Paused
# raise(action)
# rescue Actions::SkipLink
# next
# rescue Actions::Action
# end
# if @max_depth.nil? || @max_depth.not_nil! > (@levels[url]? || 0)
# @levels[url] ||= 0
# enqueue(next_url, @levels[url] + 1)
# end
# end
# end
# end
# Visits a given URL and enqueues the links recovered
# from the page to be visited later.
def visit_page(url)
url = sanitize_url(url)
get_page(url) do |page|
@history << page.url
begin
@every_page_blocks.each { |page_block| page_block.call(page) }
rescue action : Actions::Paused
raise(action)
rescue Actions::SkipPage
return nil
rescue Actions::Action
end
page.each_url do |next_url|
begin
@every_link_blocks.each do |link_block|
link_block.call(page.url, next_url)
end
rescue action : Actions::Paused
raise(action)
rescue Actions::SkipLink
next
rescue Actions::Action
end
if @max_depth.nil? || @max_depth.not_nil! > (@levels[url]? || 0)
@levels[url] ||= 0
enqueue(next_url, @levels[url] + 1)
end
end
end
end
# Converts the agent into a hash.
def to_h
{"history" => @history, "queue" => @queue}
end
# Prepares request headers for a given URL.
protected def prepare_request_headers(url)
# set any additional HTTP headers
headers = @default_headers.dup
unless @host_headers.empty?
@host_headers.each do |name, header|
if url.host =~ name
headers["Host"] = header
break
end
end
end
headers["Host"] ||= @host_header.to_s if @host_header
headers["User-Agent"] ||= @user_agent.to_s
headers["Referer"] ||= @referer.to_s if @referer
if authorization = @authorized.for_url(url.host.to_s)
headers["Authorization"] = "Basic #{authorization}"
end
if header_cookies = @cookies.for_host(url.host.to_s)
headers["Cookie"] = header_cookies.to_cookie_header
end
headers
end
# Normalizes the request path and grabs a session to handle
# page get and post requests.
def prepare_request(url, &block)
path = if url.path.empty?
"/"
else
url.path
end
# append the URL query to the path
path += "?#{url.query}" if url.query
headers = prepare_request_headers(url)
begin
sleep(@fetch_delay) if @fetch_delay.to_i > 0
yield @sessions[url], path, headers
rescue Halite::Exception::Error | IO::Error | Socket::Error | OpenSSL::SSL::Error
@sessions.kill!(url)
return nil
end
end
# Dequeues a URL that will later be visited.
def dequeue
@queue.shift
end
# Determines if the maximum limit has been reached.
def limit_reached?
if limit = @limit
return @history.size >= limit
end
false
end
# Determines if a given URL should be visited.
def visit?(url)
# puts [url.to_s, visited?(url), visit_scheme?(url.scheme.to_s), visit_host?(url.host.to_s), visit_port?(url.port || -1), visit_link?(url.to_s), visit_url?(url), visit_ext?(url.path)]
!visited?(url) &&
visit_scheme?(url.scheme.to_s) &&
visit_host?(url.host.to_s) &&
visit_port?(url.port || -1) &&
visit_link?(url.to_s) &&
visit_url?(url) &&
visit_ext?(url.path)
# robot_allowed?(url.to_s)
end
# Adds a given URL to the failures list.
def failed(url)
@failures << url
@every_failed_url_blocks.each { |fail_block| fail_block.call(url) }
true
end
end
end

53
src/arachnid/agent/actions.cr Normal file

@ -0,0 +1,53 @@
module Arachnid
class Agent
module Actions
# A Runtime Error
class RuntimeError < Exception; end
# The base `Actions` exception class
class Action < RuntimeError; end
# Exception used to pause a running `Agent`
class Paused < Action; end
# Exception which causes a running `Agent` to skip a link.
class SkipLink < Action; end
# Exception which causes a running `Agent` to skip a page.
class SkipPage < Action; end
end
# Continue spidering
def continue!(&block)
@paused = false
run(&block)
end
# Sets the pause state of the agent.
def pause=(state)
@paused = state
end
# Pauses the agent, causing spidering to temporarily stop.
def pause!
@paused = true
raise Actions::Paused.new
end
# Determines whether the agent is paused.
def paused?
@paused == true
end
# Causes the agent to skip the link being enqueued.
def skip_link!
raise Actions::SkipLink.new
end
# Causes the agent to skip the page being visited.
def skip_page!
raise Actions::SkipPage.new
end
end
end

248
src/arachnid/agent/events.cr Normal file

@ -0,0 +1,248 @@
require "../page"
module Arachnid
class Agent
@every_url_blocks = [] of Proc(URI, Nil)
@every_failed_url_blocks = [] of Proc(URI, Nil)
@every_url_like_blocks = Hash(String | Regex, Array(Proc(URI, Nil))).new do |hash, key|
hash[key] = [] of Proc(URI, Nil)
end
@every_page_blocks = [] of Proc(Page, Nil)
@every_link_blocks = [] of Proc(URI, URI, Nil)
# Pass each URL from each page visited to the given block.
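# e.g. spider.every_url { |url| puts url }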
def every_url(&block : URI ->)
@every_url_blocks << block
self
end
# Pass each URL that could not be requested to the given block.
def every_failed_url(&block : URI ->)
@every_failed_url_blocks << block
self
end
# Pass every URL that the agent visits, and matches a given pattern,
# to a given block.
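# e.g. spider.every_url_like(/\.pdf$/i) { |url| puts "Found PDF: #{url}" }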
def every_url_like(pattern, &block : URI ->)
@every_url_like_blocks[pattern] << block
self
end
# See `#every_url_like`
def urls_like(pattern, &block : URI ->)
every_url_like(pattern, &block)
end
# Pass the headers from every response the agent receives to a given
# block.
def all_headers(&block)
headers = [] of HTTP::Headers
every_page { |page| headers << page.headers }
headers.each { |header| yield header }
end
# Pass every page that the agent visits to a given block.
def every_page(&block : Page ->)
@every_page_blocks << block
self
end
# Pass every OK page that the agent visits to a given block.
def every_ok_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.ok? }
pages.each { |page| yield page }
end
# Pass every Redirect page that the agent visits to a given block.
def every_redirect_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.redirect? }
pages.each { |page| yield page }
end
# Pass every Timeout page that the agent visits to a given block.
def every_timedout_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.timeout? }
pages.each { |page| yield page }
end
# Pass every Bad Request page that the agent visits to a given block.
def every_bad_request_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.bad_request? }
pages.each { |page| yield page }
end
# Pass every Unauthorized page that the agent visits to a given block.
def every_unauthorized_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.unauthorized? }
pages.each { |page| yield page }
end
# Pass every Forbidden page that the agent visits to a given block.
def every_forbidden_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.forbidden? }
pages.each { |page| yield page }
end
# Pass every Missing page that the agent visits to a given block.
def every_missing_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.missing? }
pages.each { |page| yield page }
end
# Pass every Internal Server Error page that the agent visits to a
# given block.
def every_internal_server_error_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.had_internal_server_error? }
pages.each { |page| yield page }
end
# Pass every Plain Text page that the agent visits to a given block.
def every_txt_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.txt? }
pages.each { |page| yield page }
end
# Pass every HTML page that the agent visits to a given block.
def every_html_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.html? }
pages.each { |page| yield page }
end
# Pass every XML page that the agent visits to a given block.
def every_xml_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.xml? }
pages.each { |page| yield page }
end
# Pass every XML Stylesheet (XSL) page that the agent visits to a
# given block.
def every_xsl_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.xsl? }
pages.each { |page| yield page }
end
# Pass every HTML or XML document that the agent parses to a given
# block.
def every_doc(&block : Document::HTML | XML::Node ->)
docs = [] of Document::HTML | XML::Node
every_page { |page| docs << page.doc.not_nil! if page.doc }
docs.each { |doc| yield doc }
end
# Pass every HTML document that the agent parses to a given block.
def every_html_doc(&block : Document::HTML | XML::Node ->)
docs = [] of Document::HTML
every_page { |page| docs << page.doc.not_nil! if page.html? }
docs.each { |doc| yield doc }
end
# Pass every XML document that the agent parses to a given block.
def every_xml_doc(&block : XML::Node ->)
docs = [] of XML::Node
every_page { |page| docs << page.doc.not_nil! if page.xml? }
docs.each { |doc| yield doc }
end
# Pass every XML Stylesheet (XSL) that the agent parses to a given
# block.
def every_xsl_doc(&block : XML::Node ->)
docs = [] of XML::Node
every_page { |page| docs << page.doc.not_nil! if page.xsl? }
docs.each { |doc| yield doc }
end
# Pass every RSS document that the agent parses to a given block.
def every_rss_doc(&block : XML::Node ->)
docs = [] of XML::Node
every_page { |page| docs << page.doc.not_nil! if page.rss? }
docs.each { |doc| yield doc }
end
# Pass every Atom document that the agent parses to a given block.
def every_atom_doc(&block : XML::Node ->)
docs = [] of XML::Node
every_page { |page| docs << page.doc.not_nil! if page.atom? }
docs.each { |doc| yield doc }
end
# Pass every JavaScript page that the agent visits to a given block.
def every_javascript_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.javascript? }
pages.each { |page| yield page }
end
# Pass every CSS page that the agent visits to a given block.
def every_css_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.css? }
pages.each { |page| yield page }
end
# Pass every RSS feed that the agent visits to a given block.
def every_rss_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.rss? }
pages.each { |page| yield page }
end
# Pass every Atom feed that the agent visits to a given block.
def every_atom_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.atom? }
pages.each { |page| yield page }
end
# Pass every MS Word page that the agent visits to a given block.
def every_ms_word_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.ms_word? }
pages.each { |page| yield page }
end
# Pass every PDF page that the agent visits to a given block.
def every_pdf_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.pdf? }
pages.each { |page| yield page }
end
# Pass every ZIP page that the agent visits to a given block.
def every_zip_page(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.zip? }
pages.each { |page| yield page }
end
# Passes every image URI to the given blocks.
def every_image(&block : Page ->)
pages = [] of Page
every_page { |page| (pages << page) if page.image? }
pages.each { |page| yield page }
end
# Passes every origin and destination URI of each link to a given
# block.
def every_link(&block : URI, URI ->)
@every_link_blocks << block
self
end
end
end

256
src/arachnid/agent/filters.cr Normal file

@ -0,0 +1,256 @@
require "../rules"
module Arachnid
class Agent
# List of acceptable URL schemes to follow
getter schemes : Array(String) = [] of String
@host_rules = Rules(String).new
@port_rules = Rules(Int32).new
@link_rules = Rules(String).new
@url_rules = Rules(URI).new
@ext_rules = Rules(String).new
# Sets the list of acceptable URL schemes to visit.
def schemes=(new_schemes)
@schemes = new_schemes.map(&.to_s)
end
# Specifies the patterns that match host-names to visit.
def visit_hosts
@host_rules.accept
end
# Adds a given pattern to the `#visit_hosts`.
def visit_hosts_like(pattern)
visit_hosts << pattern
self
end
def visit_hosts_like(&block)
visit_hosts << block
self
end
# Specifies the patterns that match host-names to not visit.
def ignore_hosts
@host_rules.reject
end
# Adds a given pattern to the `#ignore_hosts`.
def ignore_hosts_like(pattern)
ignore_hosts << pattern
self
end
def ignore_hosts_like(&block)
ignore_hosts << block
self
end
# Specifies the patterns that match the ports to visit.
def visit_ports
@port_rules.accept
end
# Adds a given pattern to the `#visit_ports`.
def visit_ports_like(pattern)
visit_ports << pattern
self
end
def visit_ports_like(&block : Int32 -> Bool)
visit_ports << block
self
end
# Specifies the patterns that match ports to not visit.
def ignore_ports
@port_rules.reject
end
# Adds a given pattern to the `#ignore_ports`.
def ignore_ports_like(pattern)
ignore_ports << pattern
self
end
def ignore_ports_like(&block : Int32 -> Bool)
ignore_ports << block
self
end
# Specifies the patterns that match the links to visit.
def visit_links
@link_rules.accept
end
# Adds a given pattern to the `#visit_links`
def visit_links_like(pattern)
visit_links << pattern
self
end
def visit_links_like(&block : String -> Bool)
visit_links << block
self
end
# Specifies the patterns that match links to not visit.
def ignore_links
@link_rules.reject
end
# Adds a given pattern to the `#ignore_links`.
def ignore_links_like(pattern)
ignore_links << pattern
self
end
def ignore_links_like(&block : String -> Bool)
ignore_links << block
self
end
# Specifies the patterns that match the URLs to visit.
def visit_urls
@url_rules.accept
end
# Adds a given pattern to the `#visit_urls`
def visit_urls_like(&block : URI -> Bool)
visit_urls << block
self
end
def visit_urls_like(pattern)
visit_urls << pattern
self
end
# Specifies the patterns that match URLs to not visit.
def ignore_urls
@url_rules.reject
end
# Adds a given pattern to the `#ignore_urls`.
def ignore_urls_like(&block : URI -> Bool)
ignore_urls << block
self
end
def ignore_urls_like(pattern)
ignore_urls << pattern
self
end
# Specifies the patterns that match the URI path extensions to visit.
def visit_exts
@ext_rules.accept
end
# Adds a given pattern to the `#visit_exts`.
def visit_exts_like(&block : String -> Bool)
visit_exts << block
self
end
def visit_exts_like(pattern)
visit_exts << pattern
self
end
# Specifies the patterns that match URI path extensions to not visit.
def ignore_exts
@ext_rules.reject
end
# Adds a given pattern to the `#ignore_exts`.
def ignore_exts_like(&block : String -> Bool)
ignore_exts << block
self
end
def ignore_exts_like(pattern)
ignore_exts << pattern
self
end
# Initializes filtering rules.
protected def initialize_filters(
schemes = nil,
hosts = nil,
ignore_hosts = nil,
ports = nil,
ignore_ports = nil,
links = nil,
ignore_links = nil,
urls = nil,
ignore_urls = nil,
exts = nil,
ignore_exts = nil
)
if schemes
self.schemes = schemes
else
@schemes << "http"
@schemes << "https"
end
@host_rules.accept = hosts
@host_rules.reject = ignore_hosts
@port_rules.accept = ports
@port_rules.reject = ignore_ports
@link_rules.accept = links
@link_rules.reject = ignore_links
@url_rules.accept = urls
@url_rules.reject = ignore_urls
@ext_rules.accept = exts
@ext_rules.reject = ignore_exts
if host
visit_hosts_like(host.to_s)
end
end
# Determines if a given URI scheme should be visited.
protected def visit_scheme?(scheme)
if scheme
@schemes.includes?(scheme)
else
true
end
end
# Determines if a given host-name should be visited.
protected def visit_host?(host)
@host_rules.accept?(host)
end
# Determines if a given port should be visited.
protected def visit_port?(port)
@port_rules.accept?(port)
end
# Determines if a given link should be visited.
protected def visit_link?(link)
@link_rules.accept?(link)
end
# Determines if a given URL should be visited.
protected def visit_url?(link)
@url_rules.accept?(link)
end
# Determines if a given URI path extension should be visited.
protected def visit_ext?(path)
ext = File.extname(path)
@ext_rules.accept?(ext)
end
end
end

20
src/arachnid/agent/robots.cr Normal file

@ -0,0 +1,20 @@
require "../robots"
module Arachnid
class Agent
@robots : Arachnid::Robots? = nil
# Initializes the robots filter.
def initialize_robots
# @robots = Arachnid::Robots.new(@user_agent)
end
# Determines whether a URL is allowed by the robot policy.
def robot_allowed?(url)
if robots = @robots
return robots.allowed?(url)
end
true
end
end
end

21
src/arachnid/agent/sanitizers.cr Normal file

@ -0,0 +1,21 @@
module Arachnid
class Agent
# Specifies whether the Agent will strip URI fragments
property? strip_fragments : Bool = true
# Specifies whether the Agent will strip URI queries
property? strip_query : Bool = false
# Sanitizes a URL based on filtering options
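# With the default options, "https://example.com/docs#intro" becomes
# "https://example.com/docs" (the fragment is stripped, the query is kept).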
def sanitize_url(url)
# normalize the url
url = URI.parse(url) unless url.is_a?(URI)
url.path = "" if url.path == "/"
url.fragment = nil if @strip_fragments
url.query = nil if @strip_query
url
end
end
end

39
src/arachnid/arachnid.cr Normal file

@ -0,0 +1,39 @@
require "./page"
require "./agent"
module Arachnid
extend self
# Specifies whether robots.txt should be honored globally
class_property? robots : Bool = false
# Should we set the DNT (Do Not Track) header?
class_property? do_not_track : Bool = false
# Maximum amount of redirects to follow
class_property max_redirects : Int32 = 0
# Connect timeout.
class_property connect_timeout : Int32 = 10
# Read timeout.
class_property read_timeout : Int32 = 10
# The User-Agent string used by all Agent objects by default.
class_property user_agent : String = "Arachnid #{Arachnid::VERSION}"
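# These defaults can be overridden globally before spidering, e.g.:
#   Arachnid.user_agent = "My Crawler/1.0"
#   Arachnid.read_timeout = 30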
# See `Agent.start_at`
def start_at(url, **options, &block : Agent ->)
Agent.start_at(url, **options, &block)
end
# See `Agent.host`
def host(name, **options, &block : Agent ->)
Agent.host(name, **options, &block)
end
# See `Agent.site`
def site(url, **options, &block : Agent ->)
Agent.site(url, **options, &block)
end
end

4
src/arachnid/auth_credential.cr Normal file

@ -0,0 +1,4 @@
module Arachnid
# Represents HTTP Authentication credentials for a website.
record AuthCredential, username : String, password : String
end

83
src/arachnid/auth_store.cr Normal file

@ -0,0 +1,83 @@
require "base64"
require "./extensions/uri"
require "./auth_credential"
require "./page"
module Arachnid
class AuthStore
@credentials = {} of Tuple(String?, String?, Int32?) => Hash(Array(String), AuthCredential)
# Given a URL, return the most specific matching auth credential.
def [](url)
# normalize the url
url = URI.parse(url) unless url.is_a?(URI)
key = key_for(url)
paths = @credentials[key]?
return nil unless paths
# longest path first
ordered_paths = paths.keys.sort_by { |path_key| -path_key.size }
# directories of the path
path_dirs = URI.expand_path(url.path).split('/').reject(&.empty?)
ordered_paths.each do |path|
return paths[path] if path_dirs[0, path.size] == path
end
nil
end
# Add an auth credential to the store for the supplied base URL.
def []=(url, auth)
# normalize the url
url = URI.parse(url) unless url.is_a?(URI)
# normalize the url path and split it
paths = URI.expand_path(url.path).split('/').reject(&.empty?)
key = key_for(url)
@credentials[key] ||= {} of Array(String) => AuthCredential
@credentials[key][paths] = auth
auth
end
# Convenience method to add username and password credentials
# for a named URL.
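# e.g. auth_store.add("https://example.com/members/", "username", "password")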
def add(url, username, password)
self[url] = AuthCredential.new(username: username, password: password)
end
# Returns the base64 encoded authorization string for the URL
# or `nil` if no authorization exists.
def for_url(url)
if auth = self[url]
# Basic auth expects "username:password", without newlines in the encoding
Base64.strict_encode("#{auth.username}:#{auth.password}")
end
end
# Clear the contents of the auth store.
def clear!
@credentials.clear
self
end
# Size of the current auth store (number of URL paths stored)
def size
@credentials.values.reduce(0) { |acc, paths| acc + paths.size }
end
# Inspect the auth store
def inspect
"<#{self.class}: #{@credentials.inspect}>"
end
# Creates an auth key based on the URL
private def key_for(url)
{url.scheme, url.host, url.port}
end
end
end

118
src/arachnid/cookie_jar.cr Normal file

@ -0,0 +1,118 @@
module Arachnid
class CookieJar
include Enumerable(HTTP::Cookies)
@params : Hash(String, HTTP::Cookies)
@cookies : HTTP::Cookies
@dirty : Set(String)
# Creates a new `CookieJar`
def initialize
@params = {} of String => HTTP::Cookies
@cookies = HTTP::Cookies.new
@dirty = Set(String).new
end
# Iterates over the host-name and cookie value pairs in the jar.
def each(&block)
@params.each do |kp|
yield kp
end
end
# Returns all relevant cookies in a single string for the named
# host or domain.
def [](host : String)
@params[host]? || HTTP::Cookies.new
end
# Add a cookie to the jar for a particular domain.
def []=(host : String, cookies : HTTP::Cookies)
@params[host] ||= HTTP::Cookies.new
cookies.each do |cookie|
if @params[host][cookie.name]? != cookie.value
cookies.each do |c|
@params[host] << c
end
@dirty.add(host)
break
end
end
cookies
end
# Retrieve cookies for a domain from the response.
def from_page(page)
cookies = page.cookies
unless cookies.empty?
self[page.url.host.to_s] = cookies
return true
end
false
end
# Returns the pre-encoded Cookie for a given host.
def for_host(host)
if @dirty.includes?(host)
values = [] of String
cookies_for_host(host).each do |cookie|
values << cookie.to_cookie_header
end
@cookies[host] = values.join("; ")
@dirty.delete(host)
end
@cookies[host]?
end
# Returns raw cookie value pairs for a given host. Includes cookies
# set on parent domains.
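# e.g. cookies set for "example.com" are also returned for "app.example.com".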
def cookies_for_host(host)
host_cookies = @params[host]? || HTTP::Cookies.new
subdomains = host.split('.')
while subdomains.size > 2
subdomains.shift
if parent_cookies = @params[subdomains.join('.')]?
parent_cookies.each do |cookie|
# copy in the parent cookies, only if they haven't been
# overridden yet.
unless host_cookies.has_key?(cookie.name)
host_cookies[cookie.name] = cookie.value
end
end
end
end
host_cookies
end
# Clear out the jar, removing all stored cookies.
def clear!
@params.clear
@cookies.clear
@dirty.clear
self
end
# Size of the cookie jar.
def size
@params.size
end
# Inspects the cookie jar.
def inspect
"#<#{self.class}: #{@params.inspect}>"
end
end
end

196
src/arachnid/document/html.cr Normal file

@ -0,0 +1,196 @@
require "xml"
module Arachnid
module Document
struct HTML
@content : String
@document : XML::Node
@ids : Hash(String, XML::Node)
@tags : Hash(String, Array(Tag))
@classes : Hash(String, Array(XML::Node))
forward_missing_to @document
def initialize(@content : String)
@document = XML.parse_html(@content)
@ids = {} of String => XML::Node
@tags = {} of String => Array(Tag)
@classes = {} of String => Array(XML::Node)
visit @document
end
def self.parse(content : String)
new(content)
end
# Transform the css query into an xpath query
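# e.g. "#main .post > a" becomes //*[@id="main"]//*[@class="post"]/a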
def self.css_query_to_xpath(query : String) : String
query = "//#{query}"
# Convert '#id_name' as '[@id="id_name"]'
query = query.gsub /\#([A-z0-9]+-*_*)+/ { |m| "*[@id=\"%s\"]" % m.delete('#') }
# Convert '.classname' as '[@class="classname"]'
query = query.gsub /\.([A-z0-9]+-*_*)+/ { |m| "[@class=\"%s\"]" % m.delete('.') }
# Convert ' > ' as '/'
query = query.gsub /\s*>\s*/ { |m| "/" }
# Convert ' ' as '//'
query = query.gsub " ", "//"
# a leading '*' when xpath does not include node name
query = query.gsub /\/\[/ { |m| "/*[" }
return query
end
# Find first tag by tag name and return
# `HTML::Tag` if found or `nil` if not found
def at_tag(tag_name : String) : Tag | Nil
if tags = @tags[tag_name]?
tags.each do |tag|
return tag
end
end
return nil
end
# Find all nodes by tag name and yield
# `HTML::Tag` if found
def where_tag(tag_name : String, &block) : Array(Tag)
arr = [] of Tag
if tags = @tags[tag_name]?
tags.each do |tag|
yield tag
arr << tag
end
end
return arr
end
# Find all nodes by classname and yield
# `HTML::Tag` if found
def where_class(class_name : String, &block) : Array(Tag)
arr = [] of Tag
if klasses = @classes[class_name]?
klasses.each do |node|
klass = Tag.new(node)
yield klass
arr << klass
end
end
return arr
end
# Find a node by its id and return a
# `HTML::Tag` found or `nil` if not found
def at_id(id_name : String) : Tag | Nil
if node = @ids[id_name]?
return Tag.new(node)
end
end
# Find all nodes corresponding to the css query and yield
# each `HTML::Tag` found
def css(query : String) : Array(Tag)
query = HTML.css_query_to_xpath(query)
return @document.xpath_nodes(query).map { |node|
tag = Tag.new(node)
yield tag
tag
}
end
# Find first node corresponding to the css query and return
# `HTML::Tag` if found or `nil` if not found
def at_css(query : String)
css(query) { |tag| return tag }
return nil
end
private def add_id(id : String, node : XML::Node)
@ids[id] = node
end
private def add_node(node : XML::Node)
if @tags[node.name]? == nil
@tags[node.name] = [] of Tag
end
@tags[node.name] << Tag.new(node)
end
private def add_class(klass : String, node : XML::Node)
if @classes[klass]? == nil
@classes[klass] = [] of XML::Node
end
@classes[klass] << node
end
# Depth-first visit. Given a node, extract metadata from
# node (if exists), then visit each child.
private def visit(node : XML::Node)
# We only extract metadata from HTML nodes
if node.element?
add_node node
if to = node["id"]?
add_id to, node
end
if classes = node["class"]?
classes.split(' ') { |to| add_class to, node }
end
end
# visit each child
node.children.each do |child|
visit child
end
end
# Represents an HTML Tag
struct Tag
getter node : XML::Node
forward_missing_to @node
def initialize(@node : XML::Node)
end
def classname : String | Nil
return @node["class"]? ? @node["class"] : nil
end
def tagname : String
return @node.name
end
def content : String
return @node.text != nil ? @node.text.as(String) : "".as(String)
end
def parent : Tag | Nil
if parent = @node.parent
return Tag.new parent
end
nil
end
def children : Array(Tag)
children = [] of Tag
@node.children.each do |node|
if node.element?
children << Tag.new node
end
end
children
end
def has_class?(klass : String) : Bool
if classes = classname
return classes.includes?(klass)
end
false
end
end
end
end
end

175
src/arachnid/extensions/uri.cr Normal file

@ -0,0 +1,175 @@
require "uri"
require "string_scanner"
class URI
#
# Expands a URI decoded path, into a proper absolute path.
#
# @param [String] path
# The path from a URI.
#
# @return [String]
# The expanded path.
#
# @example
# URI.expand_path("./path")
# # => "path"
#
# @example
# URI.expand_path("test/../path")
# # => "path"
#
# @example
# URI.expand_path("/test/path/")
# # => "/test/path/"
#
# @example
# URI.expand_path("/test/../path")
# # => "/path"
#
def self.expand_path(path)
if path.starts_with?("/")
leading_slash, path = path[0, 1], path[1..-1]
else
leading_slash = ""
end
if path.ends_with?("/")
trailing_slash, path = path[-1, 1], path[0..-2]
else
trailing_slash = ""
end
scanner = StringScanner.new(path)
stack = [] of String
until scanner.eos?
if (dir = scanner.scan(/[^\/]+/))
case dir
when ".." then stack.pop
when "." then false
else stack.push(dir)
end
else
scanner.skip(/\/+/)
end
break if stack.empty?
end
unless stack.empty?
"#{leading_slash}#{stack.join("/")}#{trailing_slash}"
else
""
end
end
def split_path(path)
path.split("/")
end
def merge_path(base, rel)
# RFC2396, Section 5.2, 5)
# RFC2396, Section 5.2, 6)
base_path = split_path(base)
rel_path = split_path(rel)
# RFC2396, Section 5.2, 6), a)
base_path << "" if base_path.last == ".."
while i = base_path.index("..")
# remove the ".." and the segment before it
base_path.delete_at(i - 1, 2)
end
if (first = rel_path.first) && first.empty?
base_path.clear
rel_path.shift
end
# RFC2396, Section 5.2, 6), c)
# RFC2396, Section 5.2, 6), d)
rel_path.push("") if rel_path.last == "." || rel_path.last == ".."
rel_path.delete(".")
# RFC2396, Section 5.2, 6), e)
tmp = [] of String
rel_path.each do |x|
if x == ".." &&
!(tmp.empty? || tmp.last == "..")
tmp.pop
else
tmp << x
end
end
add_trailer_slash = !tmp.empty?
if base_path.empty?
base_path = [""] # keep '/' for root directory
elsif add_trailer_slash
base_path.pop
end
while x = tmp.shift?
if x == ".."
# RFC2396, Section 4
# a .. or . in an absolute path has no special meaning
base_path.pop if base_path.size > 1
else
# if x == ".."
# valid absolute (but abnormal) path "/../..."
# else
# valid absolute path
# end
base_path << x
tmp.each {|t| base_path << t}
add_trailer_slash = false
break
end
end
base_path.push("") if add_trailer_slash
return base_path.join('/')
end
def merge(oth)
oth = URI.parse(oth) unless oth.is_a?(URI)
if oth.absolute?
# raise BadURIError, "both URI are absolute" if absolute?
# hmm... should return oth for usability?
return oth
end
unless self.absolute?
raise URI::Error.new("both URIs are relative")
end
base = self.dup
authority = oth.userinfo || oth.host || oth.port
# RFC2396, Section 5.2, 2)
if (oth.path.nil? || oth.path.empty?) && !authority && !oth.query
base.fragment=(oth.fragment) if oth.fragment
return base
end
base.query = nil
base.fragment=(nil)
# RFC2396, Section 5.2, 4)
if !authority
base.path = merge_path(base.path, oth.path) if base.path && oth.path
else
# RFC2396, Section 5.2, 4)
base.path = oth.path if oth.path
end
# RFC2396, Section 5.2, 7)
base.user = oth.userinfo if oth.userinfo
base.host = oth.host if oth.host
base.port = oth.port if oth.port
base.query = oth.query if oth.query
base.fragment=(oth.fragment) if oth.fragment
return base
end
end

97
src/arachnid/page.cr Normal file

@ -0,0 +1,97 @@
require "uri"
require "halite"
require "./page/content_types"
require "./page/cookies"
require "./page/html"
require "./page/status_codes"
require "./document/html"
module Arachnid
# Represents a page requested from a website
class Page
include Page::ContentTypes
include Page::Cookies
include Page::HTML
include Page::StatusCodes
# URL of the page
getter url : URI
# HTTP response
getter response : Halite::Response
# Headers returned with the body
getter headers : HTTP::Headers
@doc : (Document::HTML | XML::Node)?
delegate xpath, xpath_node, xpath_nodes, xpath_bool, xpath_float, xpath_string,
root, at_tag, where_tag, where_class, at_id, css, at_css, to: @doc
forward_missing_to @headers
# Creates a new `Page` object.
def initialize(url : URI, response : Halite::Response)
@url = url
@response = response
@headers = response.headers
end
# The body of the response
def body
@response.body || ""
end
# Returns a parsed document for HTML, XML, RSS, and Atom pages.
def doc
unless body.empty?
doc_class = if html?
Document::HTML
elsif rss? || atom? || xml? || xsl?
XML
end
if doc_class
begin
@doc ||= doc_class.parse(body)
rescue
end
end
end
end
# Searches the document for XPath or CSS paths
def search(path)
if document = doc
document.xpath_nodes(path)
else
[] of XML::Node
end
end
# Searches for the first occurrence of an XPath or CSS path
def at(path)
if document = doc
document.xpath_node(path)
end
end
def /(path)
search(path)
end
def %(path)
at(path)
end
def size
@response.body.bytesize
end
def to_s
body
end
end
end

162
src/arachnid/page/content_types.cr Normal file

@ -0,0 +1,162 @@
module Arachnid
class Page
module ContentTypes
# The Content-Type of the page.
def content_type
@response.content_type || ""
end
# The content types of the page.
def content_types
types = @response.headers.get?("content-type") || [] of String
end
# The charset included in the Content-Type.
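# e.g. returns "utf-8" for "Content-Type: text/html; charset=utf-8".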
def content_charset
content_types.each do |value|
if value.includes?(";")
value.split(";").each do |param|
param = param.strip
if param.starts_with?("charset=")
return param.split("=", 2).last
end
end
end
end
return nil
end
# Determines if any of the content-types of the page include a given
# type.
def is_content_type?(type : String | Regex)
content_types.any? do |value|
value = value.split(";", 2).first
if type.is_a?(Regex)
value =~ type
else
value == type
end
end
end
# Determines if the page is plain-text.
def plain_text?
is_content_type?("text/plain")
end
# ditto
def text?
plain_text?
end
# Determines if the page is a Directory Listing.
def directory?
is_content_type?("text/directory")
end
# Determines if the page is HTML document.
def html?
is_content_type?("text/html")
end
# Determines if the page is XML document.
def xml?
is_content_type?(/(text|application)\/xml/)
end
# Determines if the page is XML Stylesheet (XSL).
def xsl?
is_content_type?("text/xsl")
end
# Determines if the page is JavaScript.
def javascript?
is_content_type?(/(text|application)\/javascript/)
end
# Determines if the page is JSON.
def json?
is_content_type?("application/json")
end
# Determines if the page is a CSS stylesheet.
def css?
is_content_type?("text/css")
end
# Determines if the page is a RSS feed.
def rss?
is_content_type?(/application\/(rss\+xml|rdf\+xml)/)
end
# Determines if the page is an Atom feed.
def atom?
is_content_type?("application/atom+xml")
end
# Determines if the page is a MS Word document.
def ms_word?
is_content_type?("application/msword")
end
# Determines if the page is a PDF document.
def pdf?
is_content_type?("application/pdf")
end
# Determines if the page is a ZIP archive.
def zip?
is_content_type?("application/zip")
end
# Determine if the page is an image.
def image?
is_content_type?(/image\//)
end
def png?
is_content_type?("image/png")
end
def gif?
is_content_type?("image/gif")
end
def jpg?
is_content_type?(/image\/(jpg|jpeg)/)
end
def svg?
is_content_type?(/image\/svg(\+xml)?/)
end
def video?
is_content_type?(/video\/.*/)
end
def mp4?
is_content_type?("video/mp4")
end
def avi?
is_content_type?("video/x-msvideo")
end
def wmv?
is_content_type?("video/x-ms-wmv")
end
def quicktime?
is_content_type?("video/quicktime")
end
def flash?
is_content_type?("video/flash") ||
is_content_type?("application/x-shockwave-flash")
end
end
end
end

18
src/arachnid/page/cookies.cr Normal file

@ -0,0 +1,18 @@
module Arachnid
class Page
module Cookies
# Reserved names used within Cookie strings
RESERVED_COOKIE_NAMES = Regex.new("^(?:Path|Expires|Domain|Secure|HTTPOnly)$", :ignore_case)
# The raw Cookie String sent along with the page.
def cookie
@response.headers["Set-Cookie"]? || ""
end
# The Cookie values sent along with the page.
def cookies
@response.cookies
end
end
end
end

204
src/arachnid/page/html.cr Normal file

@ -0,0 +1,204 @@
require "../extensions/uri"
module Arachnid
class Page
# TODO: Create enumerable methods for the methods that take a block
module HTML
# include Enumerable
# The title of the HTML page.
def title
if (node = at("//title"))
node.inner_text
end
end
# Enumerates over the meta-redirect links in the page.
def each_meta_redirect(&block : URI ->)
if (html? && doc)
search("//meta[@http-equiv and @content]").each do |node|
if node["http-equiv"] =~ /refresh/i
content = node["content"]
if (redirect = content.match(/url=(\S+)$/))
yield URI.parse(redirect[1])
end
end
end
end
end
# Returns a boolean indicating whether or not page-level meta
# redirects are present in this page.
def meta_redirect?
!meta_redirects.empty?
end
# The meta-redirect links of the page.
def meta_redirects
redirects = [] of URI
each_meta_redirect { |r| redirects << r }
redirects
end
# Enumerates over every HTTP or meta-redirect link in the page.
def each_redirect(&block : URI ->)
if (locations = @response.headers.get?("Location"))
# Location headers override any meta-refresh redirects in the HTML
locations.each { |l| yield URI.parse(l) }
else
# check page-level meta redirects if there isn't a location header
each_meta_redirect(&block)
end
end
# URLs that this document redirects to.
def redirects_to
redirects = [] of URI
each_redirect { |r| redirects << r }
redirects
end
# Enumerates over every `mailto:` link in the page.
def each_mailto(&block)
if (html? && doc)
doc.xpath_nodes("//a[starts-with(@href,'mailto:')]").each do |a|
yield a["href"][7..-1]
end
end
end
# `mailto:` links in the page.
def mailtos
mailtos = [] of String
each_mailto { |m| mailtos << m }
mailtos
end
# Enumerates over every link in the page.
def each_link(&block : URI ->)
each_redirect(&block) if redirect?
each_image(&block)
each_script(&block)
each_resource(&block)
if html? && (d = doc)
d.xpath_nodes("//a[@href]").each do |a|
link = to_absolute(a["href"])
yield link if link
end
d.xpath_nodes("//frame[@src]").each do |iframe|
link = to_absolute(iframe["src"])
yield link if link
end
d.xpath_nodes("//iframe[@src]").each do |iframe|
link = to_absolute(iframe["src"])
yield link if link
end
end
end
def each_script(&block : URI ->)
if html? && (d = doc)
d.xpath_nodes("//script[@src]").each do |script|
url = to_absolute(script["src"])
yield url if url
end
end
end
def each_resource(&block : URI ->)
if html? && (d = doc)
d.xpath_nodes("//link[@href]").each do |link|
yield URI.parse(link["href"])
end
end
end
def each_image(&block : URI ->)
if html? && (d = doc)
d.xpath_nodes("//img[@src]").each do |img|
url = to_absolute(img["src"])
yield url if url
end
d.xpath_nodes("//img[@srcset]").each do |set|
sources = set["srcset"].split(" ").map_with_index { |e, i| (i.zero? || i.even?) ? e : nil }.compact
sources.each do |source|
url = to_absolute(source)
yield url if url
end
end
end
end
def each_video(&block : URI ->)
if html? && (d = doc)
d.xpath_nodes("//video[@src]").each do |video|
url = to_absolute(video["src"])
yield url if url
end
d.xpath_nodes("//video/source[@src]").each do |source|
url = to_absolute(source["src"])
yield url if url
end
end
end
# The links from within the page.
def links
links = [] of URI
each_link { |link| links << link }
links
end
# Enumerates over every URL in the page.
def each_url(&block : URI ->)
each_link do |link|
if (url = to_absolute(link))
yield url
end
end
end
# ditto
def each(&block)
each_url { |url| yield url }
end
# Absolute URIs from within the page.
def urls
urls = [] of URI
each_url { |url| urls << url }
urls
end
# Normalizes and expands a given link into a proper URI.
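# e.g. "/about" on a page at "https://example.com/blog/post" resolves to
# "https://example.com/about".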
def to_absolute(link)
link = link.is_a?(URI) ? link : URI.parse(link)
new_url = begin
url.merge(link)
rescue Exception
return
end
if (!new_url.opaque?) && (path = new_url.path)
# ensure that paths begin with a leading '/' for URI::FTP
if (new_url.scheme == "ftp" && !path.starts_with?("/"))
path.insert(0, "/")
end
# make sure the path does not contain any .. or . directories,
# since URI::Generic#merge cannot normalize paths such as
# "/stuff/../"
new_url.path = URI.expand_path(path)
end
return new_url
end
end
end
end

59
src/arachnid/page/status_codes.cr Normal file

@ -0,0 +1,59 @@
module Arachnid
class Page
module StatusCodes
# The response code from the page.
def code
@response.status_code.to_i
end
# Determines if the response code is `200`.
def ok?
code == 200
end
# Determines if the response code is `308`.
def timedout?
code == 308
end
# Determines if the response code is `400`.
def bad_request?
code == 400
end
# Determines if the response code is `401`.
def unauthorized?
code == 401
end
# Determines if the response code is `403`.
def forbidden?
code == 403
end
# Determines if the response code is `404`.
def missing?
code == 404
end
# Determines if the response code is `500`.
def had_internal_server_error?
code == 500
end
# Determines if the response code is `300`, `301`, `302`, `303`
# or `307`. Also checks for "soft" redirects added at the page
# level by a meta refresh tag.
def redirect?
case code
when 300..303, 307
true
when 200
meta_redirect?
else
false
end
end
end
end
end

231
src/arachnid/robots.cr Normal file

@ -0,0 +1,231 @@
require "uri"
module Arachnid
# Parses robots.txt files for the perusal of a single user-agent.
#
# The behaviour implemented is guided by the following sources, though
# as there is no widely accepted standard, it may differ from other implementations.
# If you consider its behaviour to be in error, please contact the author.
#
# http://www.robotstxt.org/orig.html
# - the original, now imprecise and outdated version
# http://www.robotstxt.org/norobots-rfc.txt
# - a much more precise, outdated version
# http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449&from=35237
# - a few hints at modern protocol extensions.
#
# This parser only considers lines starting with (case-insensitively:)
# Useragent: User-agent: Allow: Disallow: Sitemap:
#
# The file is divided into sections, each of which contains one or more User-agent:
# lines, followed by one or more Allow: or Disallow: rules.
#
# The first section that contains a User-agent: line that matches the robot's
# user-agent is the only section relevant to that robot. The sections are checked
# in the same order as they appear in the file.
#
# (The * character is taken to mean "any number of any characters" during matching of
# user-agents)
#
# Within that section, the first Allow: or Disallow: rule that matches the expression
# is taken as authoritative. If no rule in a section matches, the access is Allowed.
#
# (The order of matching is as in the RFC, Google matches all Allows and then all Disallows,
# while Bing matches the most specific rule, I'm sure there are other interpretations)
#
# When matching urls, all % encodings are normalised (except for /?=& which have meaning)
# and "*"s match any number of any character.
#
# If a pattern ends with a $, then the pattern must match the entire path, or the entire
# path with query string.
#
# TODO: Rework to allow for multiple Robots
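# Example usage with a hypothetical robots.txt body:
#
#   body = "User-agent: *\nDisallow: /secret/"
#   robots = Arachnid::Robots.new(body, "Arachnid")
#   robots.allowed?("https://example.com/secret/page") # => false
#   robots.allowed?("https://example.com/")            # => true
#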
class Robots
alias Rule = Tuple(String, Bool)
alias RuleSet = Tuple(String, Array(Rule))
getter body : String
getter user_agent : String
getter rules : Array(Tuple(String, Array(Rule)))
getter sitemaps : Array(String)
def initialize(@body : String, @user_agent : String)
@sitemaps = [] of String
@rules = [] of RuleSet
parse(@body)
end
# Given a URI object, or a string representing one, determine whether this
# robots.txt would allow access to the path.
def allowed?(uri)
uri = URI.parse(uri)
path = (uri.path || "/") + (uri.query ? "?" + uri.query.to_s : "")
path_allowed?(@user_agent, path)
end
# Check whether the relative path (a string of the url's path and query
# string) is allowed by the rules we have for the given user_agent.
#
private def path_allowed?(user_agent, path)
@rules.each do |(ua_glob, path_globs)|
if match_ua_glob user_agent, ua_glob
path_globs.each do |(path_glob, allowed)|
return allowed if match_path_glob path, path_glob
end
return true
end
end
true
end
# This does a case-insensitive substring match such that if the user agent
# is contained within the glob, or vice-versa, we will match.
#
# According to the standard, *s shouldn't appear in the user-agent field
# except in the case of "*" meaning all user agents. Google however imply
# that the * will work, at least at the end of a string.
#
# For consistency, and because it seems expected behaviour, and because
# a glob * will match a literal * we use glob matching not string matching.
#
# The standard also advocates a substring match of the robot's user-agent
# within the user-agent field. From observation, it seems much more likely
# that the match will be the other way about, though we check for both.
#
private def match_ua_glob(user_agent, glob)
glob =~ Regex.new(Regex.escape(user_agent), Regex::Options::IGNORE_CASE) ||
user_agent =~ Regex.new(reify(glob), Regex::Options::IGNORE_CASE)
end
# This does case-sensitive prefix matching, such that if the path starts
# with the glob, we will match.
#
# According to the standard, that's it. However, it seems reasonably common
# for asterisks to be interpreted as though they were globs.
#
# Additionally, some search engines, like Google, will treat a trailing $
# sign as forcing the glob to match the entire path - whether including
# or excluding the query string is not clear, so we check both.
#
# (i.e. it seems likely that a site owner who writes Disallow: *.pdf$ also expects
# requests to *.pdf?i_can_haz_pdf to be disallowed, a reading a malicious robot
# might otherwise exploit.)
#
# With URLs there is the additional complication that %-encoding can give
# multiple representations for identical URLs; this is handled by
# normalize_percent_encoding.
#
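# For illustration (paths made up):
#
# ```
# match_path_glob("/secret/page", "/secret/")      # matches (prefix)
# match_path_glob("/docs/file.pdf", "*.pdf$")      # matches
# match_path_glob("/docs/file.pdf?dl=1", "*.pdf$") # matches ($ also allows a query string)
# match_path_glob("/docs/file.pdfx", "*.pdf$")     # does not match
# ```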
private def match_path_glob(path, glob)
if glob =~ /\$$/
end_marker = "(?:\\?|$)"
glob = glob.gsub /\$$/, ""
else
end_marker = ""
end
glob = normalize_percent_encoding(glob)
path = normalize_percent_encoding(path)
path =~ Regex.new("^" + reify(glob) + end_marker)
rescue e
false
end
# As a general rule, we want to ignore different representations of the
# same URL. Naively we could just unescape, or escape, everything; however,
# the standard implies that a / is an HTTP path separator, while a %2F is an
# encoded / that does not act as a path separator. Similar issues arise with ?, &
# and =, though all other characters are fine. (While : also has a special
# meaning in HTTP, most implementations ignore this in the path)
#
# It's also worth noting that %-encoding is case-insensitive, so we
# explicitly upcase the few that we want to keep.
#
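# For illustration (path made up):
#
# ```
# normalize_percent_encoding("/a%2Fb/%63") # => "/a%2Fb/c" (%2F kept, %63 decoded to "c")
# ```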
private def normalize_percent_encoding(path)
# First double-escape any characters we don't want to unescape
# & / = ?
path = path.gsub(/%(26|2F|3D|3F)/i) do |_, match|
"%25#{match[1].upcase}"
end
URI.unescape(path)
end
# Convert the asterisks in a glob into (.*)s for regular expressions,
# and at the same time, escape any other characters that would have
# a significance in a regex.
#
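# For illustration:
#
# ```
# reify("*.pdf") # => ".*\\.pdf"
# ```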
private def reify(glob)
glob.split("*").map { |part| Regex.escape(part) }.join(".*")
end
# Convert the @body into a set of @rules so that the matching logic
# becomes simpler.
#
# @rules is an array of pairs. The first in the pair is the glob for the
# user-agent and the second another array of pairs. The first of the new
# pair is a glob for the path, and the second whether it appears in an
# Allow: or a Disallow: rule.
#
# For example:
#
# User-agent: *
# Disallow: /secret/
# Allow: / # allow everything...
#
# Would be parsed so that:
#
# @rules = [{"*", [{"/secret/", false}, {"/", true}]}]
#
# The order of the arrays is maintained so that the first match in the file
# is obeyed as indicated by the pseudo-RFC on http://robotstxt.org/. There
# are alternative interpretations: some match by specificity of glob, and
# some check Allow lines for any match before Disallow lines. All are
# justifiable, but we could only pick one.
#
# Note that a blank Disallow: should be treated as an Allow: * and multiple
# user-agents may share the same set of rules.
#
private def parse(body)
parser_mode = :begin
body.split(/[\r\n]+/).each do |line|
# Skip blank lines and lines without a ":" (e.g. bare comments)
parts = line.delete('\0').split(":", 2).map(&.strip)
next unless parts.size == 2
prefix, value = parts
value = value.sub(/\s+#.*/, "")
if prefix && value
case prefix.downcase
when /^user-?agent$/
if parser_mode == :user_agent
@rules << {value, rules.last[1]}
else
parser_mode = :user_agent
@rules << {value, [] of Rule}
end
when "disallow"
parser_mode = :rules
@rules << {"*", [] of Rule} if @rules.empty?
if value == ""
@rules.last[1] << {"*", true}
else
@rules.last[1] << {value, false}
end
when "allow"
parser_mode = :rules
@rules << {"*", [] of Rule} if @rules.empty?
@rules.last[1] << {value, true}
when "sitemap"
@sitemaps << value
else
# Ignore comments, Crawl-delay: and badly formed lines.
end
end
end
end
end
end

53
src/arachnid/rules.cr Normal file
View File

@ -0,0 +1,53 @@
module Arachnid
# The `Rules` class represents collections of acceptance and rejection
# rules, which are used to filter data.
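#
# A minimal sketch of typical use (the URL strings are made up for
# illustration):
#
# ```
# reject = [/\/api\//, "https://example.com/private"] of Proc(String | Nil, Bool) | String | Regex
# rules = Rules(String).new(reject: reject)
# rules.accept?("https://example.com/blog/post")  # => true
# rules.reject?("https://example.com/api/v1/foo") # => true
# ```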
class Rules(T)
# Accept rules
getter accept : Array(Proc(T | Nil, Bool) | T | Regex | String)
# Reject rules
getter reject : Array(Proc(T | Nil, Bool) | T | Regex | String)
# Creates a new `Rules` object.
def initialize(accept : Array(Proc(T | Nil, Bool) | T | Regex | String)? = nil, reject : Array(Proc(T | Nil, Bool) | T | Regex | String)? = nil)
@accept = accept ? accept : [] of Proc(T | Nil, Bool) | T | Regex | String
@reject = reject ? reject : [] of Proc(T | Nil, Bool) | T | Regex | String
end
# Determines whether the data should be accepted or rejected.
def accept?(data : T)
return true if accept.empty? && reject.empty?
if @accept.empty?
!@reject.any? { |rule| test_data(data, rule) }
else
@accept.any? { |rule| test_data(data, rule) }
end
end
def accept=(value)
@accept = value || [] of Proc(T | Nil, Bool) | T | Regex | String
end
# Determines whether the data should be rejected or accepted.
def reject?(data : T)
!accept?(data)
end
def reject=(value)
@reject = value || [] of Proc(T | Nil, Bool) | T | Regex | String
end
# Tests the given data against a pattern.
private def test_data(data : T, rule)
case rule
when Proc
rule.call(data) == true
when Regex
!((data.to_s =~ rule).nil?)
else
data == rule
end
end
end
end

112
src/arachnid/session_cache.cr Normal file
View File

@ -0,0 +1,112 @@
require "uri"
require "halite"
module Arachnid
# Stores active HTTP sessions organized by scheme, host name, and port.
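#
# A rough usage sketch (URLs made up for illustration; the timeouts are just
# example values):
#
# ```
# cache = SessionCache.new(read_timeout: 30, connect_timeout: 10)
# client = cache["https://example.com/some/page"] # Halite::Client scoped to https://example.com
# cache.active?("https://example.com/elsewhere")  # => true (same scheme, host and port)
# cache.kill!("https://example.com")              # drop the cached session
# ```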
class SessionCache
# Optional read timeout.
property read_timeout : Int32
# Optional connect timeout.
property connect_timeout : Int32
# Max redirects to follow.
property max_redirects : Int32?
# Should we set a DNT (Do Not Track) header?
property? do_not_track : Bool
@sessions = {} of Tuple(String?, String?, Int32?) => Halite::Client
# Create a new session cache
def initialize(
read_timeout : Int32? = nil,
connect_timeout : Int32? = nil,
follow_redirects : Bool? = nil,
max_redirects : Int32? = nil,
do_not_track : Bool? = nil
)
@read_timeout = read_timeout || Arachnid.read_timeout
@connect_timeout = connect_timeout || Arachnid.connect_timeout
@max_redirects = max_redirects || Arachnid.max_redirects
@do_not_track = do_not_track || Arachnid.do_not_track?
end
# Determines if there is an active session for the given URL
def active?(url)
# normalize the url
url = URI.parse(url) unless url.is_a?(URI)
# session key
key = key_for(url)
@sessions.has_key?(key)
end
# Provides an active session for a given URL.
def [](url)
# normalize the url
url = URI.parse(url) unless url.is_a?(URI)
# session key
key = key_for(url)
# normalize the endpoint
endpoint = url.dup
endpoint.scheme ||= "http"
endpoint.query = nil
endpoint.fragment = nil
endpoint.path = ""
# Set headers
headers = {
"DNT" => @do_not_track ? 1 : 0
}
unless @sessions.has_key?(key)
session = Halite::Client.new(
endpoint: endpoint,
timeout: Halite::Timeout.new(
connect: @connect_timeout,
read: @read_timeout
),
follow: Halite::Follow.new(
hops: @max_redirects,
strict: false
),
headers: headers,
)
# session = session.logging(skip_request_body: true, skip_response_body: true)
@sessions[key] = session
end
@sessions[key]
end
# Destroys an HTTP session for the given scheme, host, and port.
def kill!(url)
# normalize the url
url = URI.parse(url) unless url.is_a?(URI)
# session key
key = key_for(url)
@sessions.delete(key)
end
# Clears the session cache
def clear
@sessions.clear
end
# Creates a session key based on the URL
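#
# For example (URL purely illustrative):
#
# ```
# key_for(URI.parse("https://example.com:8080/path")) # => {"https", "example.com", 8080}
# ```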
private def key_for(url)
{url.scheme, url.host, url.port}
end
end
end

3
src/arachnid/version.cr Normal file
View File

@ -0,0 +1,3 @@
module Arachnid
VERSION = "0.1.0"
end