Initial commit
This commit is contained in:
commit
9b82f6b48a
|
@ -0,0 +1,9 @@
|
||||||
|
root = true
|
||||||
|
|
||||||
|
[*.cr]
|
||||||
|
charset = utf-8
|
||||||
|
end_of_line = lf
|
||||||
|
insert_final_newline = true
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 2
|
||||||
|
trim_trailing_whitespace = true
|
|
@ -0,0 +1,9 @@
|
||||||
|
/docs/
|
||||||
|
/lib/
|
||||||
|
/bin/
|
||||||
|
/.shards/
|
||||||
|
*.dwarf
|
||||||
|
|
||||||
|
# Libraries don't need dependency lock
|
||||||
|
# Dependencies will be locked in applications that use them
|
||||||
|
/shard.lock
|
|
@ -0,0 +1,6 @@
|
||||||
|
language: crystal
|
||||||
|
|
||||||
|
# Uncomment the following if you'd like Travis to run specs and check code formatting
|
||||||
|
# script:
|
||||||
|
# - crystal spec
|
||||||
|
# - crystal tool format --check
|
|
@ -0,0 +1,21 @@
|
||||||
|
The MIT License (MIT)
|
||||||
|
|
||||||
|
Copyright (c) 2019 Chris Watson
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
|
@ -0,0 +1,95 @@
|
||||||
|
# Arachnid
|
||||||
|
|
||||||
|
Arachnid is a fast and powerful web scraping framework for Crystal. It provides an easy to use DSL for scraping webpages and processing all of the things you might come across.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
1. Add the dependency to your `shard.yml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
dependencies:
|
||||||
|
arachnid:
|
||||||
|
github: watzon/arachnid
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Run `shards install`
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
Arachnid provides an easy to use, powerful DSL for scraping websites.
|
||||||
|
|
||||||
|
```crystal
|
||||||
|
require "arachnid"
|
||||||
|
require "json"
|
||||||
|
|
||||||
|
# Let's build a sitemap of crystal-lang.org
|
||||||
|
# Links will be a hash of url to page title
|
||||||
|
links = {} of String => String
|
||||||
|
|
||||||
|
# Visit a particular host, in this case `crystal-lang.org`. This will
|
||||||
|
# not match on subdomains.
|
||||||
|
Arachnid.host("https://crystal-lang.org") do |spider|
|
||||||
|
# Ignore the API section. It's a little big.
|
||||||
|
spider.ignore_urls_like(/.*\/api.*/)
|
||||||
|
|
||||||
|
spider.every_page do |page|
|
||||||
|
puts "Visiting #{page.url.to_s}"
|
||||||
|
|
||||||
|
# Ignore redirects for our sitemap
|
||||||
|
unless page.redirect?
|
||||||
|
# Add the url of every visited page to our sitemap
|
||||||
|
links[page.url.to_s] = page.title.to_s.strip
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
File.write("crystal-lang.org-sitemap.json", links.to_pretty_json)
|
||||||
|
```
|
||||||
|
|
||||||
|
Want to scan external links as well?
|
||||||
|
|
||||||
|
```crystal
|
||||||
|
# To make things interesting, this time let's download
|
||||||
|
# every image we find.
|
||||||
|
Arachnid.start_at("https://crystal-lang.org") do |spider|
|
||||||
|
# Set a base path to store all the images at
|
||||||
|
base_image_dir = File.expand_path("~/Pictures/arachnid")
|
||||||
|
Dir.mkdir_p(base_image_dir)
|
||||||
|
|
||||||
|
spider.every_page do |page|
|
||||||
|
puts "Scanning #{page.url.to_s}"
|
||||||
|
|
||||||
|
if page.image?
|
||||||
|
# Since we're going to be saving a lot of images
|
||||||
|
# let's spawn a new fiber for each one. This
|
||||||
|
# makes things so much faster.
|
||||||
|
spawn do
|
||||||
|
# Output directory for images for this host
|
||||||
|
directory = File.join(base_image_dir, page.url.host.to_s)
|
||||||
|
Dir.mkdir_p(directory)
|
||||||
|
|
||||||
|
# The name of the image
|
||||||
|
filename = File.basename(page.url.path)
|
||||||
|
|
||||||
|
# Save the image using the body of the page
|
||||||
|
puts "Saving #{filename} to #{directory}"
|
||||||
|
File.write(File.join(directory, filename), page.body)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
```
|
||||||
|
|
||||||
|
More documentation will be coming soon!
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
1. Fork it (<https://github.com/watzon/arachnid/fork>)
|
||||||
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
||||||
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
||||||
|
4. Push to the branch (`git push origin my-new-feature`)
|
||||||
|
5. Create a new Pull Request
|
||||||
|
|
||||||
|
## Contributors
|
||||||
|
|
||||||
|
- [Chris Watson](https://github.com/watzon) - creator and maintainer
|
|
@ -0,0 +1,17 @@
|
||||||
|
name: arachnid
|
||||||
|
version: 0.1.0
|
||||||
|
|
||||||
|
authors:
|
||||||
|
- Chris Watson <chris@watzon.me>
|
||||||
|
|
||||||
|
dependencies:
|
||||||
|
halite:
|
||||||
|
github: icyleaf/halite
|
||||||
|
version: ~> 0.10.1
|
||||||
|
crystagiri:
|
||||||
|
github: madeindjs/crystagiri
|
||||||
|
branch: master
|
||||||
|
|
||||||
|
crystal: 0.29.0
|
||||||
|
|
||||||
|
license: MIT
|
|
@ -0,0 +1,9 @@
|
||||||
|
require "./spec_helper"
|
||||||
|
|
||||||
|
describe Arachnid do
|
||||||
|
# TODO: Write tests
|
||||||
|
|
||||||
|
it "works" do
|
||||||
|
false.should eq(true)
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,2 @@
|
||||||
|
require "spec"
|
||||||
|
require "../src/arachnid"
|
|
@ -0,0 +1,32 @@
|
||||||
|
require "./arachnid/version"
|
||||||
|
require "./arachnid/arachnid"
|
||||||
|
|
||||||
|
# To make things interesting, this time let's download
|
||||||
|
# every image we find.
|
||||||
|
Arachnid.start_at("https://crystal-lang.org") do |spider|
|
||||||
|
# Set a base path to store all the images at
|
||||||
|
base_image_dir = File.expand_path("~/Pictures/arachnid")
|
||||||
|
Dir.mkdir_p(base_image_dir)
|
||||||
|
|
||||||
|
spider.every_page do |page|
|
||||||
|
puts "Scanning #{page.url.to_s}"
|
||||||
|
|
||||||
|
if page.image?
|
||||||
|
# Since we're going to be saving a lot of images
|
||||||
|
# let's spawn a new fiber for each one. This
|
||||||
|
# makes things so much faster.
|
||||||
|
spawn do
|
||||||
|
# Output directory for images for this host
|
||||||
|
directory = File.join(base_image_dir, page.url.host.to_s)
|
||||||
|
Dir.mkdir_p(directory)
|
||||||
|
|
||||||
|
# The name of the image
|
||||||
|
filename = File.basename(page.url.path)
|
||||||
|
|
||||||
|
# Save the image using the body of the page
|
||||||
|
puts "Saving #{filename} to #{directory}"
|
||||||
|
File.write(File.join(directory, filename), page.body)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,543 @@
|
||||||
|
require "./agent/sanitizers"
|
||||||
|
require "./agent/filters"
|
||||||
|
require "./agent/events"
|
||||||
|
require "./agent/actions"
|
||||||
|
require "./agent/robots"
|
||||||
|
require "./page"
|
||||||
|
require "./session_cache"
|
||||||
|
require "./cookie_jar"
|
||||||
|
require "./auth_store"
|
||||||
|
|
||||||
|
module Arachnid
|
||||||
|
class Agent
|
||||||
|
|
||||||
|
getter? running : Bool
|
||||||
|
|
||||||
|
# Set to limit to a single host.
|
||||||
|
property host : String?
|
||||||
|
|
||||||
|
# User agent to use.
|
||||||
|
property user_agent : String
|
||||||
|
|
||||||
|
# HTTP Host Header to use.
|
||||||
|
property host_header : String?
|
||||||
|
|
||||||
|
# HTTP Host Headers to use for specific hosts.
|
||||||
|
property host_headers : Hash(String | Regex, String)
|
||||||
|
|
||||||
|
# HTTP Headers to use for every request.
|
||||||
|
property default_headers : Hash(String, String)
|
||||||
|
|
||||||
|
# HTTP Authentication credentials.
|
||||||
|
property authorized : AuthStore
|
||||||
|
|
||||||
|
# Referer to use.
|
||||||
|
property referer : String?
|
||||||
|
|
||||||
|
# Delay in between fetching pages.
|
||||||
|
property fetch_delay : Time::Span | Int32
|
||||||
|
|
||||||
|
# History containing visited URLs.
|
||||||
|
getter history : Set(URI)
|
||||||
|
|
||||||
|
# List of unreachable URIs.
|
||||||
|
getter failures : Set(URI)
|
||||||
|
|
||||||
|
# Queue of URLs to visit.
|
||||||
|
getter queue : Array(URI)
|
||||||
|
|
||||||
|
# The session cache.
|
||||||
|
property sessions : SessionCache
|
||||||
|
|
||||||
|
# Cached cookies.
|
||||||
|
property cookies : CookieJar
|
||||||
|
|
||||||
|
# Maximum number of pages to visit.
|
||||||
|
property limit : Int32?
|
||||||
|
|
||||||
|
# Maximum depth.
|
||||||
|
property max_depth : Int32?
|
||||||
|
|
||||||
|
# The visited URLs and their depth within a site.
|
||||||
|
property levels : Hash(URI, Int32)
|
||||||
|
|
||||||
|
# Creates a new `Agent` object.
|
||||||
|
def initialize(
|
||||||
|
host : String? = nil,
|
||||||
|
read_timeout : Int32? = nil,
|
||||||
|
connect_timeout : Int32? = nil,
|
||||||
|
follow_redirects : Bool? = nil,
|
||||||
|
max_redirects : Int32? = nil,
|
||||||
|
do_not_track : Bool? = nil,
|
||||||
|
default_headers : Hash(String, String)? = nil,
|
||||||
|
host_header : String? = nil,
|
||||||
|
host_headers : Hash(String | Regex, String)? = nil,
|
||||||
|
user_agent : String? = nil,
|
||||||
|
referer : String? = nil,
|
||||||
|
fetch_delay : (Int32 | Time::Span)? = nil,
|
||||||
|
queue : Set(URI)? = nil,
|
||||||
|
history : Set(URI)? = nil,
|
||||||
|
limit : Int32? = nil,
|
||||||
|
max_depth : Int32? = nil,
|
||||||
|
robots : Bool? = nil,
|
||||||
|
filter_options = nil
|
||||||
|
)
|
||||||
|
@host = host
|
||||||
|
|
||||||
|
@host_header = host_header
|
||||||
|
@host_headers = host_headers || {} of (Regex | String) => String
|
||||||
|
@default_headers = default_headers || {} of String => String
|
||||||
|
|
||||||
|
@user_agent = user_agent || Arachnid.user_agent
|
||||||
|
@referer = referer
|
||||||
|
|
||||||
|
@running = false
|
||||||
|
@fetch_delay = fetch_delay || 0
|
||||||
|
@history = history || Set(URI).new
|
||||||
|
@failures = Set(URI).new
|
||||||
|
@queue = queue || [] of URI
|
||||||
|
|
||||||
|
@limit = limit
|
||||||
|
@levels = {} of URI => Int32
|
||||||
|
@max_depth = max_depth
|
||||||
|
|
||||||
|
@sessions = SessionCache.new(
|
||||||
|
read_timeout,
|
||||||
|
connect_timeout,
|
||||||
|
follow_redirects,
|
||||||
|
max_redirects,
|
||||||
|
do_not_track
|
||||||
|
)
|
||||||
|
|
||||||
|
@cookies = CookieJar.new
|
||||||
|
@authorized = AuthStore.new
|
||||||
|
|
||||||
|
if filter_options
|
||||||
|
initialize_filters(**filter_options)
|
||||||
|
else
|
||||||
|
initialize_filters
|
||||||
|
end
|
||||||
|
|
||||||
|
initialize_robots if robots || Arachnid.robots?
|
||||||
|
end
|
||||||
|
|
||||||
|
# Create a new scoped `Agent` in a block.
|
||||||
|
def self.new(**options, &block : Agent ->)
|
||||||
|
_new = new(**options)
|
||||||
|
with _new yield _new
|
||||||
|
_new
|
||||||
|
end
|
||||||
|
|
||||||
|
# Creates a new `Agent` and begins spidering at the given URL.
|
||||||
|
def self.start_at(url, **options, &block : Agent ->)
|
||||||
|
agent = new(**options, &block)
|
||||||
|
agent.start_at(url, force: true)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Creates a new `Agent` and spiders the web site located
|
||||||
|
# at the given URL.
|
||||||
|
def self.site(url, **options, &block : Agent ->)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
url_regex = Regex.new(Regex.escape(url.host.to_s))
|
||||||
|
|
||||||
|
agent = new(**options, &block)
|
||||||
|
agent.visit_hosts_like(url_regex)
|
||||||
|
|
||||||
|
agent.start_at(url, force: true)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Creates a new `Agent` and spiders the given host.
|
||||||
|
def self.host(url, **options, &block : Agent ->)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
|
||||||
|
options = options.merge(host: url.host)
|
||||||
|
agent = new(**options, &block)
|
||||||
|
|
||||||
|
agent.start_at(url, force: true)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Clears the history of the `Agent`.
|
||||||
|
def clear
|
||||||
|
@queue.clear
|
||||||
|
@history.clear
|
||||||
|
@failures.clear
|
||||||
|
self
|
||||||
|
end
|
||||||
|
|
||||||
|
# Start spidering at a given URL.
|
||||||
|
# def start_at(url, &block : Page ->)
|
||||||
|
# enqueue(url)
|
||||||
|
# run(&block)
|
||||||
|
# end
|
||||||
|
|
||||||
|
# Start spidering at a given URL.
|
||||||
|
def start_at(url, force = false)
|
||||||
|
enqueue(url, force: force)
|
||||||
|
return run
|
||||||
|
end
|
||||||
|
|
||||||
|
# Start spidering until the queue becomes empty or the
|
||||||
|
# agent is paused.
|
||||||
|
# def run(&block : Page ->)
|
||||||
|
# @running = true
|
||||||
|
|
||||||
|
# until @queue.empty? || paused? || limit_reached?
|
||||||
|
# begin
|
||||||
|
# visit_page(dequeue, &block)
|
||||||
|
# rescue Actions::Paused
|
||||||
|
# return self
|
||||||
|
# rescue Actions::Action
|
||||||
|
# end
|
||||||
|
# end
|
||||||
|
|
||||||
|
# @running = false
|
||||||
|
# @sessions.clear
|
||||||
|
# self
|
||||||
|
# end
|
||||||
|
|
||||||
|
# Start spidering until the queue becomes empty or the
|
||||||
|
# agent is paused.
|
||||||
|
def run
|
||||||
|
@running = true
|
||||||
|
|
||||||
|
until @queue.empty? || paused? || limit_reached? || !running?
|
||||||
|
begin
|
||||||
|
visit_page(dequeue)
|
||||||
|
rescue Actions::Paused
|
||||||
|
return self
|
||||||
|
rescue Actions::Action
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
@running = false
|
||||||
|
@sessions.clear
|
||||||
|
self
|
||||||
|
end
|
||||||
|
|
||||||
|
# Sets the history of URLs that were previously visited.
|
||||||
|
# Sets the history of URLs that were previously visited.
#
# Accepts an enumerable of `URI` or `String` items; strings are parsed
# into `URI`s. The existing history is replaced entirely.
#
# Returns the updated history set.
def history=(new_history)
  @history.clear

  new_history.each do |url|
    # Parenthesized: `<<` binds tighter than the ternary, so without
    # parens this pushed the Bool result of `is_a?` instead of the URI.
    @history << (url.is_a?(URI) ? url : URI.parse(url))
  end

  @history
end
|
||||||
|
|
||||||
|
# Specifies the links which have been visited.
|
||||||
|
def visited_links
|
||||||
|
@history.map(&.to_s)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Specifies the hosts which have been visited.
|
||||||
|
def visited_hosts
|
||||||
|
history.map(&.host)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Determines whether a URL was visited or not.
|
||||||
|
def visited?(url)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
@history.includes?(url)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Sets the list of failed URLs.
|
||||||
|
# Sets the list of URLs that could not be visited.
#
# Accepts an enumerable of `URI` or `String` items; strings are parsed
# into `URI`s. The existing failures set is replaced entirely.
#
# Returns the updated failures set.
def failures=(new_failures)
  @failures.clear

  new_failures.each do |url|
    # Parenthesized: `<<` binds tighter than the ternary, so without
    # parens this pushed the Bool result of `is_a?` instead of the URI.
    @failures << (url.is_a?(URI) ? url : URI.parse(url))
  end

  @failures
end
|
||||||
|
|
||||||
|
# Determines whether a given URL could not be visited.
|
||||||
|
def failed?(url)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
@failures.includes?(url)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Sets the queue of URLs to visit.
|
||||||
|
# Sets the list of failed URLs.
|
||||||
|
# Sets the queue of URLs to visit.
#
# Accepts an enumerable of `URI` or `String` items; strings are parsed
# into `URI`s. The existing queue is replaced entirely.
#
# Returns the updated queue.
def queue=(new_queue)
  @queue.clear

  new_queue.each do |url|
    # Parenthesized: `<<` binds tighter than the ternary, so without
    # parens this pushed the Bool result of `is_a?` instead of the URI.
    @queue << (url.is_a?(URI) ? url : URI.parse(url))
  end

  @queue
end
|
||||||
|
|
||||||
|
# Determines whether the given URL has been queued for visiting.
|
||||||
|
def queued?(url)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
@queue.includes?(url)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Enqueues a given URL for visiting, only if it passes all
|
||||||
|
# of the agent's rules for visiting a given URL.
|
||||||
|
def enqueue(url, level = 0, force = false)
|
||||||
|
url = sanitize_url(url)
|
||||||
|
|
||||||
|
if (!queued?(url) && visit?(url)) || force
|
||||||
|
link = url.to_s
|
||||||
|
|
||||||
|
return if url.host.to_s.empty?
|
||||||
|
|
||||||
|
begin
|
||||||
|
@every_url_blocks.each { |url_block| url_block.call(url) }
|
||||||
|
|
||||||
|
@every_url_like_blocks.each do |pattern, url_blocks|
|
||||||
|
match = case pattern
|
||||||
|
when Regex
|
||||||
|
link =~ pattern
|
||||||
|
else
|
||||||
|
(pattern == link) || (pattern == url)
|
||||||
|
end
|
||||||
|
|
||||||
|
if match
|
||||||
|
url_blocks.each { |url_block| url_block.call(url) }
|
||||||
|
end
|
||||||
|
end
|
||||||
|
rescue action : Actions::Paused
|
||||||
|
raise(action)
|
||||||
|
rescue Actions::SkipLink
|
||||||
|
return false
|
||||||
|
rescue Actions::Action
|
||||||
|
end
|
||||||
|
|
||||||
|
@queue << url
|
||||||
|
@levels[url] = level
|
||||||
|
true
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Gets and creates a new `Page` object from a given URL,
|
||||||
|
# yielding the newly created page.
|
||||||
|
def get_page(url, &block)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
|
||||||
|
prepare_request(url) do |session, path, handlers|
|
||||||
|
new_page = Page.new(url, session.get(path, headers: handlers))
|
||||||
|
|
||||||
|
# save any new cookies
|
||||||
|
@cookies.from_page(new_page)
|
||||||
|
|
||||||
|
yield new_page
|
||||||
|
return new_page
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Gets and creates a new `Page` object from a given URL.
|
||||||
|
def get_page(url)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
|
||||||
|
prepare_request(url) do |session, path, handlers|
|
||||||
|
new_page = Page.new(url, session.get(path, handlers))
|
||||||
|
|
||||||
|
# save any new cookies
|
||||||
|
@cookies.from_page(new_page)
|
||||||
|
|
||||||
|
return new_page
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Posts supplied form data and creates a new Page from a given URL,
|
||||||
|
# yielding the newly created page.
|
||||||
|
def post_page(url, post_data = "", &block)
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
|
||||||
|
prepare_request(url) do |session, path, handlers|
|
||||||
|
new_page = Page.new(url, session.post(path, post_data, handlers))
|
||||||
|
|
||||||
|
# save any new cookies
|
||||||
|
@cookies.from_page(new_page)
|
||||||
|
|
||||||
|
yield new_page
|
||||||
|
return new_page
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Posts supplied form data and creates a new Page from a given URL.
|
||||||
|
def post_page(url, post_data = "")
|
||||||
|
url = url.is_a?(URI) ? url : URI.parse(url)
|
||||||
|
|
||||||
|
prepare_request(url) do |session, path, handlers|
|
||||||
|
new_page = Page.new(url, session.post(path, post_data, handlers))
|
||||||
|
|
||||||
|
# save any new cookies
|
||||||
|
@cookies.from_page(new_page)
|
||||||
|
|
||||||
|
return new_page
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Visits a given URL and enqueues the links recovered
|
||||||
|
# from the page to be visited later.
|
||||||
|
# def visit_page(url, &block : Page ->)
|
||||||
|
# url = sanitize_url(url)
|
||||||
|
|
||||||
|
# get_page(url) do |page|
|
||||||
|
# @history << page.url
|
||||||
|
|
||||||
|
# begin
|
||||||
|
# @every_page_blocks.each { |page_block| page_block.call(page) }
|
||||||
|
# yield page
|
||||||
|
# rescue action : Actions::Paused
|
||||||
|
# raise(action)
|
||||||
|
# rescue Actions::SkipPage
|
||||||
|
# return Nil
|
||||||
|
# rescue Actions::Action
|
||||||
|
# end
|
||||||
|
|
||||||
|
# page.each_url do |next_url|
|
||||||
|
# begin
|
||||||
|
# @every_link_blocks.each do |link_block|
|
||||||
|
# link_block.call(page.url, next_url)
|
||||||
|
# end
|
||||||
|
# rescue action : Actions::Paused
|
||||||
|
# raise(action)
|
||||||
|
# rescue Actions::SkipLink
|
||||||
|
# next
|
||||||
|
# rescue Actions::Action
|
||||||
|
# end
|
||||||
|
|
||||||
|
# if @max_depth.nil? || @max_depth.not_nil! > (@levels[url]? || 0)
|
||||||
|
# @levels[url] ||= 0
|
||||||
|
# enqueue(next_url, @levels[url] + 1)
|
||||||
|
# end
|
||||||
|
# end
|
||||||
|
# end
|
||||||
|
# end
|
||||||
|
|
||||||
|
# Visits a given URL and enqueues the links recovered
|
||||||
|
# from the page to be visited later.
|
||||||
|
def visit_page(url)
|
||||||
|
url = sanitize_url(url)
|
||||||
|
|
||||||
|
get_page(url) do |page|
|
||||||
|
@history << page.url
|
||||||
|
|
||||||
|
begin
|
||||||
|
@every_page_blocks.each { |page_block| page_block.call(page) }
|
||||||
|
rescue action : Actions::Paused
|
||||||
|
raise(action)
|
||||||
|
rescue Actions::SkipPage
|
||||||
|
return nil
|
||||||
|
rescue Actions::Action
|
||||||
|
end
|
||||||
|
|
||||||
|
page.each_url do |next_url|
|
||||||
|
begin
|
||||||
|
@every_link_blocks.each do |link_block|
|
||||||
|
link_block.call(page.url, next_url)
|
||||||
|
end
|
||||||
|
rescue action : Actions::Paused
|
||||||
|
raise(action)
|
||||||
|
rescue Actions::SkipLink
|
||||||
|
next
|
||||||
|
rescue Actions::Action
|
||||||
|
end
|
||||||
|
|
||||||
|
if @max_depth.nil? || @max_depth.not_nil! > (@levels[url]? || 0)
|
||||||
|
@levels[url] ||= 0
|
||||||
|
enqueue(next_url, @levels[url] + 1)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Converts the agent into a hash.
|
||||||
|
def to_h
|
||||||
|
{"history" => @history, "queue" => @queue}
|
||||||
|
end
|
||||||
|
|
||||||
|
# Prepares request headers for a given URL.
|
||||||
|
protected def prepare_request_headers(url)
|
||||||
|
# set any additional HTTP headers
|
||||||
|
headers = @default_headers.dup
|
||||||
|
|
||||||
|
unless @host_headers.empty?
|
||||||
|
@host_headers.each do |name, header|
|
||||||
|
if url.host =~ name
|
||||||
|
headers["Host"] = header
|
||||||
|
break
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
headers["Host"] ||= @host_header.to_s if @host_header
|
||||||
|
headers["User-Agent"] ||= @user_agent.to_s
|
||||||
|
headers["Referer"] ||= @referer.to_s if @referer
|
||||||
|
|
||||||
|
if authorization = @authorized.for_url(url.host.to_s)
|
||||||
|
headers["Authorization"] = "Basic #{authorization}"
|
||||||
|
end
|
||||||
|
|
||||||
|
if header_cookies = @cookies.for_host(url.host.to_s)
|
||||||
|
headers["Cookie"] = header_cookies.to_cookie_header
|
||||||
|
end
|
||||||
|
|
||||||
|
headers
|
||||||
|
end
|
||||||
|
|
||||||
|
# Normalizes the request path and grabs a session to handle
|
||||||
|
# page get and post requests.
|
||||||
|
def prepare_request(url, &block)
|
||||||
|
path = if url.path.empty?
|
||||||
|
"/"
|
||||||
|
else
|
||||||
|
url.path
|
||||||
|
end
|
||||||
|
|
||||||
|
# append the URL query to the path
|
||||||
|
path += "?#{url.query}" if url.query
|
||||||
|
|
||||||
|
headers = prepare_request_headers(url)
|
||||||
|
|
||||||
|
begin
|
||||||
|
sleep(@fetch_delay) if @fetch_delay.to_i > 0
|
||||||
|
|
||||||
|
yield @sessions[url], path, headers
|
||||||
|
rescue Halite::Exception::Error | IO::Error | Socket::Error | OpenSSL::SSL::Error
|
||||||
|
@sessions.kill!(url)
|
||||||
|
return nil
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Dequeues a URL that will later be visited.
|
||||||
|
def dequeue
|
||||||
|
@queue.shift
|
||||||
|
end
|
||||||
|
|
||||||
|
# Determines if the maximum limit has been reached.
|
||||||
|
def limit_reached?
|
||||||
|
if limit = @limit
|
||||||
|
return @history.size >= limit
|
||||||
|
end
|
||||||
|
false
|
||||||
|
end
|
||||||
|
|
||||||
|
# Determines if a given URL should be visited.
|
||||||
|
def visit?(url)
|
||||||
|
# puts [url.to_s, visited?(url), visit_scheme?(url.scheme.to_s), visit_host?(url.host.to_s), visit_port?(url.port || -1), visit_link?(url.to_s), visit_url?(url), visit_ext?(url.path)]
|
||||||
|
!visited?(url) &&
|
||||||
|
visit_scheme?(url.scheme.to_s) &&
|
||||||
|
visit_host?(url.host.to_s) &&
|
||||||
|
visit_port?(url.port || -1) &&
|
||||||
|
visit_link?(url.to_s) &&
|
||||||
|
visit_url?(url) &&
|
||||||
|
visit_ext?(url.path)
|
||||||
|
# robot_allowed?(url.to_s)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Adds a given URL to the failures list.
|
||||||
|
def failed(url)
|
||||||
|
@failures << url
|
||||||
|
@every_failed_url_blocks.each { |fail_block| fail_block.call(url) }
|
||||||
|
true
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,53 @@
|
||||||
|
module Arachnid
|
||||||
|
class Agent
|
||||||
|
module Actions
|
||||||
|
|
||||||
|
# A Runtime Error
|
||||||
|
class RuntimeError < Exception; end
|
||||||
|
|
||||||
|
# The base `Actions` exceptions class
|
||||||
|
class Action < RuntimeError; end
|
||||||
|
|
||||||
|
# Exception used to pause a running `Agent`
|
||||||
|
class Paused < Action; end
|
||||||
|
|
||||||
|
# Exception which causes a running `Agent` to skip a link.
|
||||||
|
class SkipLink < Action; end
|
||||||
|
|
||||||
|
# Exception which causes a running `Agent` to skip a page.
|
||||||
|
class SkipPage < Action; end
|
||||||
|
end
|
||||||
|
|
||||||
|
# Continue spidering
|
||||||
|
def continue!(&block)
|
||||||
|
@paused = false
|
||||||
|
run(&block)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Sets the pause state of the agent.
|
||||||
|
def pause=(state)
|
||||||
|
@paused = state
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pauses the agent, causing spidering to temporarily stop.
|
||||||
|
def pause!
|
||||||
|
@paused = true
|
||||||
|
raise Actions::Paused.new
|
||||||
|
end
|
||||||
|
|
||||||
|
# Determines whether the agent is paused.
|
||||||
|
def paused?
|
||||||
|
@paused == true
|
||||||
|
end
|
||||||
|
|
||||||
|
# Causes the agent to skip the link being enqueued.
|
||||||
|
def skip_link!
|
||||||
|
raise Actions::SkipLink.new
|
||||||
|
end
|
||||||
|
|
||||||
|
# Causes the agent to skip the page being visited.
|
||||||
|
# Causes the agent to skip the page being visited.
#
# Raises an *instance* of `Actions::SkipPage` — raising the bare class
# is not a valid `raise` target in Crystal; the sibling `skip_link!`
# already uses `.new`, so this also restores consistency.
def skip_page!
  raise Actions::SkipPage.new
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,248 @@
|
||||||
|
require "../page"
|
||||||
|
|
||||||
|
module Arachnid
|
||||||
|
class Agent
|
||||||
|
@every_url_blocks = [] of Proc(URI, Nil)
|
||||||
|
|
||||||
|
@every_failed_url_blocks = [] of Proc(URI, Nil)
|
||||||
|
|
||||||
|
@every_url_like_blocks = Hash(String | Regex, Array(Proc(URI, Nil))).new do |hash, key|
|
||||||
|
hash[key] = [] of Proc(URI, Nil)
|
||||||
|
end
|
||||||
|
|
||||||
|
@every_page_blocks = [] of Proc(Page, Nil)
|
||||||
|
|
||||||
|
@every_link_blocks = [] of Proc(URI, URI, Nil)
|
||||||
|
|
||||||
|
# Pass each URL from each page visited to the given block.
|
||||||
|
def every_url(&block : URI ->)
|
||||||
|
@every_url_blocks << block
|
||||||
|
self
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass each URL that could not be requested to the given block.
|
||||||
|
def every_failed_url(&block : URI ->)
|
||||||
|
@every_failed_url_blocks << block
|
||||||
|
self
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every URL that the agent visits, and matches a given pattern,
|
||||||
|
# to a given block.
|
||||||
|
def every_url_like(pattern, &block : URI ->)
|
||||||
|
@every_url_like_blocks[pattern] << block
|
||||||
|
self
|
||||||
|
end
|
||||||
|
|
||||||
|
# See `#every_url_like`
|
||||||
|
def urls_like(pattern, &block : URI ->)
|
||||||
|
every_url_like(pattern, &block)
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass the headers from every response the agent receives to a given
|
||||||
|
# block.
|
||||||
|
def all_headers(&block)
|
||||||
|
headers = [] of HTTP::Headers
|
||||||
|
every_page { |page| headers << page.headers }
|
||||||
|
headers.each { |header| yield headers }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every page that the agent visits to a given block.
|
||||||
|
def every_page(&block : Page ->)
|
||||||
|
@every_page_blocks << block
|
||||||
|
self
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every OK page that the agent visits to a given block.
|
||||||
|
def every_ok_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.ok? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Redirect page that the agent visits to a given block.
|
||||||
|
def every_redirect_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.redirect? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Timeout page that the agent visits to a given block.
|
||||||
|
def every_timedout_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.timeout? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Bad Request page that the agent visits to a given block.
|
||||||
|
def every_bad_request_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.bad_request? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Unauthorized page that the agent visits to a given block.
|
||||||
|
def every_unauthorized_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.unauthorized? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Forbidden page that the agent visits to a given block.
|
||||||
|
def every_forbidden_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.forbidden? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Missing page that the agent visits to a given block.
|
||||||
|
def every_missing_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.missing? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Internal Server Error page that the agent visits to a
|
||||||
|
# given block.
|
||||||
|
def every_internal_server_error_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.had_internal_server_error? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every Plain Text page that the agent visits to a given block.
|
||||||
|
def every_txt_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.txt? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every HTML page that the agent visits to a given block.
|
||||||
|
def every_html_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.html? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every XML page that the agent visits to a given block.
|
||||||
|
def every_xml_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.xml? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every XML Stylesheet (XSL) page that the agent visits to a
|
||||||
|
# given block.
|
||||||
|
def every_xsl_page(&block : Page ->)
|
||||||
|
pages = [] of Page
|
||||||
|
every_page { |page| (pages << page) if page.xsl? }
|
||||||
|
pages.each { |page| yield page }
|
||||||
|
end
|
||||||
|
|
||||||
|
# Pass every HTML or XML document that the agent parses to a given
# block.
#
# BUG FIX: the element type was written `Document::HTML || XML::Node`,
# which is a boolean-or expression, not a type union; the correct
# union type is `Document::HTML | XML::Node` (matching the block's
# own signature).
def every_doc(&block : Document::HTML | XML::Node ->)
  docs = [] of Document::HTML | XML::Node
  every_page do |page|
    # Only collect pages that actually produced a parsed document.
    if doc = page.doc
      docs << doc
    end
  end
  docs.each { |doc| yield doc }
end
|
||||||
|
|
||||||
|
# Pass every HTML document that the agent parses to a given block.
#
# BUG FIX: the accumulator was `[] of Document::HTML` while
# `page.doc` returns `(Document::HTML | XML::Node)?` — the push could
# not type-check, and `not_nil!` would raise for an HTML page whose
# body was empty. The array now matches the block signature and nil
# documents are skipped instead of crashing.
def every_html_doc(&block : Document::HTML | XML::Node ->)
  docs = [] of Document::HTML | XML::Node
  every_page do |page|
    if (doc = page.doc) && page.html?
      docs << doc
    end
  end
  docs.each { |doc| yield doc }
end
|
||||||
|
|
||||||
|
# Pass every XML document that the agent parses to a given block.
def every_xml_doc(&block : XML::Node ->)
  documents = [] of XML::Node
  every_page do |page|
    documents << page.doc.not_nil! if page.xml?
  end
  documents.each { |doc| yield doc }
end

# Pass every XML Stylesheet (XSL) that the agent parses to a given
# block.
def every_xsl_doc(&block : XML::Node ->)
  documents = [] of XML::Node
  every_page do |page|
    documents << page.doc.not_nil! if page.xsl?
  end
  documents.each { |doc| yield doc }
end

# Pass every RSS document that the agent parses to a given block.
def every_rss_doc(&block : XML::Node ->)
  documents = [] of XML::Node
  every_page do |page|
    documents << page.doc.not_nil! if page.rss?
  end
  documents.each { |doc| yield doc }
end

# Pass every Atom document that the agent parses to a given block.
def every_atom_doc(&block : XML::Node ->)
  documents = [] of XML::Node
  every_page do |page|
    documents << page.doc.not_nil! if page.atom?
  end
  documents.each { |doc| yield doc }
end
|
||||||
|
|
||||||
|
# Pass every JavaScript page that the agent visits to a given block.
def every_javascript_page(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.javascript? }
  matches.each { |page| yield page }
end

# Pass every CSS page that the agent visits to a given block.
def every_css_page(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.css? }
  matches.each { |page| yield page }
end

# Pass every RSS feed that the agent visits to a given block.
def every_rss_page(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.rss? }
  matches.each { |page| yield page }
end

# Pass every Atom feed that the agent visits to a given block.
def every_atom_page(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.atom? }
  matches.each { |page| yield page }
end

# Pass every MS Word page that the agent visits to a given block.
def every_ms_word_page(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.ms_word? }
  matches.each { |page| yield page }
end

# Pass every PDF page that the agent visits to a given block.
def every_pdf_page(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.pdf? }
  matches.each { |page| yield page }
end

# Pass every ZIP page that the agent visits to a given block.
def every_zip_page(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.zip? }
  matches.each { |page| yield page }
end

# Passes every image page that the agent visits to the given block.
def every_image(&block : Page ->)
  matches = [] of Page
  every_page { |page| matches << page if page.image? }
  matches.each { |page| yield page }
end
|
||||||
|
|
||||||
|
# Passes every origin and destination URI of each link to a given
# block.
def every_link(&block : URI, URI ->)
  @every_link_blocks.push(block)
  self
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,256 @@
|
||||||
|
require "../rules"

module Arachnid
  class Agent
    # List of acceptable URL schemes to follow
    getter schemes : Array(String) = [] of String

    @host_rules = Rules(String).new
    @port_rules = Rules(Int32).new
    @link_rules = Rules(String).new
    @url_rules = Rules(URI).new
    @ext_rules = Rules(String).new

    # Sets the list of acceptable URL schemes to visit.
    def schemes=(new_schemes)
      @schemes = new_schemes.map(&.to_s)
    end

    # Specifies the patterns that match host-names to visit.
    def visit_hosts
      @host_rules.accept
    end

    # Adds a given pattern to the `#visit_hosts`.
    def visit_hosts_like(pattern)
      @host_rules.accept << pattern
      self
    end

    def visit_hosts_like(&block)
      @host_rules.accept << block
      self
    end

    # Specifies the patterns that match host-names to not visit.
    def ignore_hosts
      @host_rules.reject
    end

    # Adds a given pattern to the `#ignore_hosts`.
    def ignore_hosts_like(pattern)
      @host_rules.reject << pattern
      self
    end

    def ignore_hosts_like(&block)
      @host_rules.reject << block
      self
    end

    # Specifies the patterns that match the ports to visit.
    def visit_ports
      @port_rules.accept
    end

    # Adds a given pattern to the `#visit_ports`.
    def visit_ports_like(pattern)
      @port_rules.accept << pattern
      self
    end

    def visit_ports_like(&block : Int32 -> Bool)
      @port_rules.accept << block
      self
    end

    # Specifies the patterns that match ports to not visit.
    def ignore_ports
      @port_rules.reject
    end

    # Adds a given pattern to the `#ignore_ports`.
    def ignore_ports_like(pattern)
      @port_rules.reject << pattern
      self
    end

    def ignore_ports_like(&block : Int32 -> Bool)
      @port_rules.reject << block
      self
    end

    # Specifies the patterns that match the links to visit.
    def visit_links
      @link_rules.accept
    end

    # Adds a given pattern to the `#visit_links`
    def visit_links_like(pattern)
      @link_rules.accept << pattern
      self
    end

    def visit_links_like(&block : String -> Bool)
      @link_rules.accept << block
      self
    end

    # Specifies the patterns that match links to not visit.
    def ignore_links
      @link_rules.reject
    end

    # Adds a given pattern to the `#ignore_links`.
    def ignore_links_like(pattern)
      @link_rules.reject << pattern
      self
    end

    def ignore_links_like(&block : String -> Bool)
      @link_rules.reject << block
      self
    end

    # Specifies the patterns that match the URLs to visit.
    def visit_urls
      @url_rules.accept
    end

    # Adds a given pattern to the `#visit_urls`
    def visit_urls_like(&block : URI -> Bool)
      @url_rules.accept << block
      self
    end

    def visit_urls_like(pattern)
      @url_rules.accept << pattern
      self
    end

    # Specifies the patterns that match URLs to not visit.
    def ignore_urls
      @url_rules.reject
    end

    # Adds a given pattern to the `#ignore_urls`.
    def ignore_urls_like(&block : URI -> Bool)
      @url_rules.reject << block
      self
    end

    def ignore_urls_like(pattern)
      @url_rules.reject << pattern
      self
    end

    # Specifies the patterns that match the URI path extensions to visit.
    def visit_exts
      @ext_rules.accept
    end

    # Adds a given pattern to the `#visit_exts`.
    def visit_exts_like(&block : String -> Bool)
      @ext_rules.accept << block
      self
    end

    def visit_exts_like(pattern)
      @ext_rules.accept << pattern
      self
    end

    # Specifies the patterns that match URI path extensions to not visit.
    def ignore_exts
      @ext_rules.reject
    end

    # Adds a given pattern to the `#ignore_exts`.
    def ignore_exts_like(&block : String -> Bool)
      @ext_rules.reject << block
      self
    end

    def ignore_exts_like(pattern)
      @ext_rules.reject << pattern
      self
    end
|
||||||
|
|
||||||
|
# Initializes filtering rules from the options given to the agent.
#
# BUG FIX: the method body referenced a local `host` that was never
# declared (the parameter list only had `hosts`), which cannot
# compile. `host` is now an optional trailing parameter; when given,
# crawling is restricted to that single host. Appending it at the
# end keeps positional callers working.
protected def initialize_filters(
  schemes = nil,
  hosts = nil,
  ignore_hosts = nil,
  ports = nil,
  ignore_ports = nil,
  links = nil,
  ignore_links = nil,
  urls = nil,
  ignore_urls = nil,
  exts = nil,
  ignore_exts = nil,
  host = nil
)
  if schemes
    self.schemes = schemes
  else
    # Default to the standard web schemes.
    @schemes << "http"
    @schemes << "https"
  end

  @host_rules.accept = hosts
  @host_rules.reject = ignore_hosts

  @port_rules.accept = ports
  @port_rules.reject = ignore_ports

  @link_rules.accept = links
  @link_rules.reject = ignore_links

  @url_rules.accept = urls
  @url_rules.reject = ignore_urls

  @ext_rules.accept = exts
  @ext_rules.reject = ignore_exts

  # A single `host` option restricts the crawl to that host.
  visit_hosts_like(host.to_s) if host
end
|
||||||
|
|
||||||
|
# Determines if a given URI scheme should be visited.
# A nil scheme is always allowed.
protected def visit_scheme?(scheme)
  scheme ? @schemes.includes?(scheme) : true
end

# Determines if a given host-name should be visited.
protected def visit_host?(host)
  @host_rules.accept?(host)
end

# Determines if a given port should be visited.
protected def visit_port?(port)
  @port_rules.accept?(port)
end

# Determines if a given link should be visited.
protected def visit_link?(link)
  @link_rules.accept?(link)
end

# Determines if a given URL should be visited.
protected def visit_url?(link)
  @url_rules.accept?(link)
end

# Determines if a given URI path extension should be visited.
protected def visit_ext?(path)
  @ext_rules.accept?(File.extname(path))
end
end
end
|
|
@ -0,0 +1,20 @@
|
||||||
|
require "../robots"

module Arachnid
  class Agent
    @robots : Arachnid::Robots? = nil

    # Initializes the robots filter.
    def initialize_robots
      # @robots = Arachnid::Robots.new(@user_agent)
    end

    # Determines whether a URL is allowed by the robot policy.
    # Defaults to `true` when no robots filter is configured.
    def robot_allowed?(url)
      robots = @robots
      robots ? robots.allowed?(url) : true
    end
  end
end
|
|
@ -0,0 +1,21 @@
|
||||||
|
module Arachnid
  class Agent
    # Specifies whether the Agent will strip URI fragments
    property? strip_fragments : Bool = true

    # Specifies whether the Agent will strip URI queries
    property? strip_query : Bool = false

    # Sanitizes a URL based on the filtering options above.
    # Accepts either a `URI` or a `String` and returns a `URI`.
    def sanitize_url(url)
      uri = url.is_a?(URI) ? url : URI.parse(url)

      # Normalize a bare "/" path to the empty path.
      uri.path = "" if uri.path == "/"
      uri.fragment = nil if strip_fragments?
      uri.query = nil if strip_query?

      uri
    end
  end
end
|
|
@ -0,0 +1,39 @@
|
||||||
|
require "./page"
require "./agent"

module Arachnid
  extend self

  # Specifies whether robots.txt should be honored globally
  class_property? robots : Bool = false

  # Should we set the DNT (Do Not Track) header?
  class_property? do_not_track : Bool = false

  # Maximum amount of redirects to follow
  class_property max_redirects : Int32 = 0

  # Connect timeout.
  class_property connect_timeout : Int32 = 10

  # Read timeout.
  class_property read_timeout : Int32 = 10

  # The User-Agent string used by all Agent objects by default.
  class_property user_agent : String = "Arachnid #{Arachnid::VERSION}"

  # Convenience wrapper; see `Agent.start_at`.
  def start_at(url, **options, &block : Agent ->)
    Agent.start_at(url, **options, &block)
  end

  # Convenience wrapper; see `Agent.host`.
  def host(name, **options, &block : Agent ->)
    Agent.host(name, **options, &block)
  end

  # Convenience wrapper; see `Agent.site`.
  def site(url, **options, &block : Agent ->)
    Agent.site(url, **options, &block)
  end
end
|
|
@ -0,0 +1,4 @@
|
||||||
|
module Arachnid
  # Represents HTTP Authentication credentials (username/password
  # pair) for a website.
  record AuthCredential, username : String, password : String
end
|
|
@ -0,0 +1,83 @@
|
||||||
|
require "base64"
require "./extensions/uri"
require "./auth_credential"
require "./page"

module Arachnid
  # Stores `AuthCredential`s keyed by {scheme, host, port} and path,
  # returning the most specific credential for a given URL.
  class AuthStore
    @credentials = {} of Tuple(String?, String?, Int32?) => Hash(Array(String), AuthCredential)

    # Given a URL, return the most specific matching auth credential,
    # or `nil` when none is stored for that origin/path.
    def [](url)
      # normalize the url
      url = URI.parse(url) unless url.is_a?(URI)

      key = key_for(url)
      paths = @credentials[key]?

      return nil unless paths

      # Longest (most specific) path first.
      # BUG FIX: `sort` with a one-argument block is not a valid
      # two-argument comparator; `sort_by` is what was intended.
      ordered_paths = paths.keys.sort_by { |path_key| -path_key.size }

      # directories of the path
      path_dirs = URI.expand_path(url.path).split('/').reject(&.empty?)

      ordered_paths.each do |path|
        return paths[path] if path_dirs[0, path.size] == path
      end

      nil
    end

    # Add an auth credential to the store for the supplied base URL.
    def []=(url, auth)
      # normalize the url
      url = URI.parse(url) unless url.is_a?(URI)

      # normalize the url path and split it
      paths = URI.expand_path(url.path).split('/').reject(&.empty?)

      key = key_for(url)

      @credentials[key] ||= {} of Array(String) => AuthCredential
      @credentials[key][paths] = auth
      auth
    end

    # Convenience method to add username and password credentials
    # for a named URL.
    def add(url, username, password)
      self[url] = AuthCredential.new(username: username, password: password)
    end

    # Returns the base64 encoded authorization string for the URL
    # or `nil` if no authorization exists.
    def for_url(url)
      if auth = self[url]
        # BUG FIX: HTTP Basic credentials are "username:password" —
        # the colon separator was missing. `strict_encode` is used
        # because `encode` appends a newline, which is invalid inside
        # an Authorization header.
        Base64.strict_encode("#{auth.username}:#{auth.password}")
      end
    end

    # Clear the contents of the auth store.
    def clear!
      # BUG FIX: Crystal's Hash exposes `clear`, not `clear!`.
      @credentials.clear
      self
    end

    # Size of the current auth store (number of URL paths stored)
    def size
      @credentials.values.sum(&.size)
    end

    # Inspect the auth store
    def inspect
      "<#{self.class}: #{@credentials.inspect}>"
    end

    # Creates an auth key ({scheme, host, port}) based on the URL.
    private def key_for(url)
      {url.scheme, url.host, url.port}
    end
  end
end
|
|
@ -0,0 +1,118 @@
|
||||||
|
module Arachnid
  # Stores per-host cookies collected while crawling.
  class CookieJar
    include Enumerable(HTTP::Cookies)

    @params : Hash(String, HTTP::Cookies)
    @cookies : HTTP::Cookies
    @dirty : Set(String)

    # Creates a new, empty `CookieJar`.
    def initialize
      @params = {} of String => HTTP::Cookies
      @cookies = HTTP::Cookies.new
      @dirty = Set(String).new
    end

    # Iterates over the host-name and cookie value pairs in the jar.
    def each(&block)
      @params.each { |pair| yield pair }
    end

    # Returns all relevant cookies for the named host or domain
    # (an empty collection when none are stored).
    def [](host : String)
      @params[host]? || HTTP::Cookies.new
    end

    # Add cookies to the jar for a particular domain. If any incoming
    # cookie differs from what is stored, all of them are copied in
    # and the host is marked dirty for re-encoding.
    def []=(host : String, cookies : HTTP::Cookies)
      @params[host] ||= HTTP::Cookies.new

      cookies.each do |cookie|
        next unless @params[host][cookie.name]? != cookie.value

        cookies.each { |c| @params[host] << c }
        @dirty.add(host)
        break
      end

      cookies
    end

    # Retrieve cookies for a domain from the response.
    # Returns `true` when any cookies were stored.
    def from_page(page)
      fresh = page.cookies
      return false if fresh.empty?

      self[page.url.host.to_s] = fresh
      true
    end

    # Returns the pre-encoded Cookie header value for a given host,
    # re-encoding it only when the host's cookies changed.
    def for_host(host)
      if @dirty.includes?(host)
        header_values = [] of String
        cookies_for_host(host).each do |cookie|
          header_values << cookie.to_cookie_header
        end

        @cookies[host] = header_values.join("; ")
        @dirty.delete(host)
      end

      @cookies[host]?
    end

    # Returns raw cookie value pairs for a given host. Includes cookies
    # set on parent domains.
    def cookies_for_host(host)
      host_cookies = @params[host]? || HTTP::Cookies.new
      domains = host.split('.')

      while domains.size > 2
        domains.shift

        if parent_cookies = @params[domains.join('.')]?
          parent_cookies.each do |cookie|
            # Copy in the parent cookies, only if they haven't been
            # overridden yet.
            unless host_cookies.has_key?(cookie.name)
              host_cookies[cookie.name] = cookie.value
            end
          end
        end
      end

      host_cookies
    end

    # Clear out the jar, removing all stored cookies.
    def clear!
      @params.clear
      @cookies.clear
      @dirty.clear
      self
    end

    # Size of the cookie jar (number of hosts with stored cookies).
    def size
      @params.size
    end

    # Inspects the cookie jar.
    def inspect
      "#<#{self.class}: #{@params.inspect}>"
    end
  end
end
|
|
@ -0,0 +1,196 @@
|
||||||
|
require "xml"
|
||||||
|
|
||||||
|
module Arachnid
|
||||||
|
module Document
|
||||||
|
struct HTML
|
||||||
|
# Raw HTML source this document was parsed from.
@content : String

# Parsed DOM root.
@document : XML::Node

# Index of id attribute -> node.
@ids : Hash(String, XML::Node)

# Index of element name -> tags.
@tags : Hash(String, Array(Tag))

# Index of class name -> nodes.
@classes : Hash(String, Array(XML::Node))

forward_missing_to @document

def initialize(@content : String)
  @document = XML.parse_html(@content)

  @ids = {} of String => XML::Node
  @tags = {} of String => Array(Tag)
  @classes = {} of String => Array(XML::Node)

  # Walk the tree once up front, indexing ids, tags and classes.
  visit @document
end
|
||||||
|
|
||||||
|
# Parses the given HTML content into a new document.
def self.parse(content : String)
  new(content)
end

# Transform the css query into an xpath query.
def self.css_query_to_xpath(query : String) : String
  query = "//#{query}"
  # Convert '#id_name' into '*[@id="id_name"]'.
  # BUG FIX: `[A-z]` is a classic character-class bug — it also
  # matches `[`, `\`, `]`, `^`, `_` and the backtick, which sit
  # between 'Z' and 'a' in ASCII. An explicit identifier class is
  # used instead.
  query = query.gsub(/\#([A-Za-z0-9_-]+)/) { |m| "*[@id=\"%s\"]" % m.delete('#') }
  # Convert '.classname' into '[@class="classname"]'.
  query = query.gsub(/\.([A-Za-z0-9_-]+)/) { |m| "[@class=\"%s\"]" % m.delete('.') }
  # Convert ' > ' into '/'.
  query = query.gsub(/\s*>\s*/) { "/" }
  # Convert remaining spaces (descendant combinator) into '//'.
  query = query.gsub(" ", "//")
  # Insert a leading '*' when an xpath step has no node name.
  query = query.gsub(/\/\[/) { "/*[" }
  return query
end
|
||||||
|
|
||||||
|
# Find first tag by tag name and return
# `HTML::Tag` if found or `nil` if not found
def at_tag(tag_name : String) : Tag | Nil
  @tags[tag_name]?.try(&.first?)
end

# Find all nodes by tag name and yield
# `HTML::Tag` if found
def where_tag(tag_name : String, &block) : Array(Tag)
  found = [] of Tag
  @tags[tag_name]?.try &.each do |tag|
    yield tag
    found << tag
  end
  found
end

# Find all nodes by classname and yield
# `HTML::Tag` for each one found
def where_class(class_name : String, &block) : Array(Tag)
  found = [] of Tag
  @classes[class_name]?.try &.each do |node|
    tag = Tag.new(node)
    yield tag
    found << tag
  end
  found
end

# Find a node by its id and return a
# `HTML::Tag` if found or `nil` if not found
def at_id(id_name : String) : Tag | Nil
  @ids[id_name]?.try { |node| Tag.new(node) }
end
|
||||||
|
|
||||||
|
# Find all nodes corresponding to the css query and yield each
# wrapped `HTML::Tag`; returns the collected tags.
def css(query : String) : Array(Tag)
  # BUG FIX: the receiver was `@nodes`, which does not exist — the
  # parsed document lives in `@document`. Additionally,
  # `css_query_to_xpath` already prefixes the query with "//", so
  # prefixing again here would produce "////..." and break the
  # XPath expression.
  xpath = HTML.css_query_to_xpath(query)
  @document.xpath_nodes(xpath).map do |node|
    tag = Tag.new(node)
    yield tag
    tag
  end
end
|
||||||
|
|
||||||
|
# Find first node corresponding to the css query and return its
# `HTML::Tag`, or `nil` if nothing matched.
def at_css(query : String)
  css(query) { |tag| return tag }
  nil
end
|
||||||
|
|
||||||
|
# Index a node by its id attribute.
private def add_id(id : String, node : XML::Node)
  @ids[id] = node
end

# Index a node under its element name.
private def add_node(node : XML::Node)
  list = @tags[node.name]? || (@tags[node.name] = [] of Tag)
  list << Tag.new(node)
end

# Index a node under one of its class names.
private def add_class(klass : String, node : XML::Node)
  list = @classes[klass]? || (@classes[klass] = [] of XML::Node)
  list << node
end

# Depth-first visit. Given a node, extract metadata from
# node (if exists), then visit each child.
private def visit(node : XML::Node)
  # We only extract metadata from HTML element nodes.
  if node.element?
    add_node node
    if id = node["id"]?
      add_id id, node
    end
    if classes = node["class"]?
      classes.split(' ') { |klass| add_class klass, node }
    end
  end

  # visit each child
  node.children.each { |child| visit child }
end
|
||||||
|
|
||||||
|
# Represents an HTML Tag
struct Tag
  getter node : XML::Node

  forward_missing_to @node

  def initialize(@node : XML::Node)
  end

  # The tag's `class` attribute, or `nil` when absent.
  def classname : String | Nil
    @node["class"]?
  end

  # The element name of this tag.
  def tagname : String
    @node.name
  end

  # The tag's text content (empty string when there is none).
  def content : String
    @node.text || ""
  end

  # The parent tag, or `nil` at the document root.
  def parent : Tag | Nil
    if parent_node = @node.parent
      Tag.new(parent_node)
    end
  end

  # All element children wrapped as `Tag`s.
  def children : Array(Tag)
    @node.children.compact_map do |child|
      Tag.new(child) if child.element?
    end
  end

  # Whether this tag's class attribute includes *klass*.
  def has_class?(klass : String) : Bool
    classname.try(&.includes?(klass)) || false
  end
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
|
@ -0,0 +1,175 @@
|
||||||
|
require "uri"
|
||||||
|
require "string_scanner"
|
||||||
|
|
||||||
|
class URI
|
||||||
|
# Expands a URI decoded path into a proper absolute path, resolving
# `.` and `..` segments while preserving leading/trailing slashes.
#
# ```
# URI.expand_path("./path")        # => "path"
# URI.expand_path("test/../path")  # => "path"
# URI.expand_path("/test/path/")   # => "/test/path/"
# URI.expand_path("/test/../path") # => "/path"
# ```
#
# BUG FIXES: the loop previously ended with `break if stack.empty?`,
# which aborted processing whenever the *first* segment was `.` or
# `..` — contradicting the documented examples above (e.g.
# "./path" returned "" instead of "path"). Also, a bare `stack.pop`
# raises on an empty array in Crystal; `pop?` is used so extra
# `..` segments above the root are ignored.
def self.expand_path(path)
  if path.starts_with?("/")
    leading_slash = "/"
    path = path[1..-1]
  else
    leading_slash = ""
  end

  if path.ends_with?("/")
    trailing_slash = "/"
    path = path[0..-2]
  else
    trailing_slash = ""
  end

  scanner = StringScanner.new(path)
  stack = [] of String

  until scanner.eos?
    if dir = scanner.scan(/[^\/]+/)
      case dir
      when ".." then stack.pop?
      when "."  then nil # current directory: no-op
      else           stack.push(dir)
      end
    else
      scanner.skip(/\/+/)
    end
  end

  if stack.empty?
    ""
  else
    "#{leading_slash}#{stack.join("/")}#{trailing_slash}"
  end
end
|
||||||
|
|
||||||
|
# Splits a URI path into its slash-separated segments.
def split_path(path)
  path.split("/")
end
|
||||||
|
|
||||||
|
# Merges a base path with a relative path per RFC 2396, Section 5.2
# (mirroring Ruby's `URI::Generic#merge_path`).
#
# BUG FIXES applied relative to the previous version:
# * `base_path = base_path[i - 1, 2]` replaced the whole array with
#   the two-element slice; the intent (as in Ruby's URI) is to
#   *remove* those two elements — now `delete_at(i - 1, 2)`.
# * `'.'` is a Char and can never equal a String element of the
#   split path; the comparison and the `delete` now use `"."`.
# * `tmp.shift` raises on an empty array in Crystal; `shift?`
#   terminates the loop at nil instead.
def merge_path(base, rel)
  # RFC2396, Section 5.2, 5) / 6)
  base_path = split_path(base)
  rel_path = split_path(rel)

  # RFC2396, Section 5.2, 6), a)
  base_path << "" if base_path.last? == ".."
  while i = base_path.index("..")
    base_path.delete_at(i - 1, 2)
  end

  # An absolute relative path replaces the base path entirely.
  if (first = rel_path.first?) && first.empty?
    base_path.clear
    rel_path.shift
  end

  # RFC2396, Section 5.2, 6), c) and d)
  rel_path.push("") if rel_path.last? == "." || rel_path.last? == ".."
  rel_path.delete(".")

  # RFC2396, Section 5.2, 6), e)
  tmp = [] of String
  rel_path.each do |x|
    if x == ".." && !(tmp.empty? || tmp.last == "..")
      tmp.pop
    else
      tmp << x
    end
  end

  add_trailer_slash = !tmp.empty?
  if base_path.empty?
    base_path = [""] # keep '/' for root directory
  elsif add_trailer_slash
    base_path.pop
  end

  while x = tmp.shift?
    if x == ".."
      # RFC2396, Section 4: a .. or . in an absolute path has no
      # special meaning.
      base_path.pop if base_path.size > 1
    else
      # Remaining segments are appended verbatim.
      base_path << x
      tmp.each { |t| base_path << t }
      add_trailer_slash = false
      break
    end
  end
  base_path.push("") if add_trailer_slash

  base_path.join('/')
end
|
||||||
|
|
||||||
|
# Merges this (absolute) URI with *oth* per RFC 2396, Section 5.2,
# returning the resulting URI. *oth* may be a `URI` or a `String`.
#
# Raises `URI::Error` when both URIs are relative.
def merge(oth)
  oth = URI.parse(oth) unless oth.is_a?(URI)

  if oth.absolute?
    # raise BadURIError, "both URI are absolute" if absolute?
    # hmm... should return oth for usability?
    return oth
  end

  unless self.absolute?
    # BUG FIX: error message typo — "othative" -> "relative".
    raise URI::Error.new("both URI are relative")
  end

  base = self.dup

  authority = oth.userinfo || oth.host || oth.port

  # RFC2396, Section 5.2, 2)
  if (oth.path.nil? || oth.path.empty?) && !authority && !oth.query
    base.fragment = oth.fragment if oth.fragment
    return base
  end

  base.query = nil
  base.fragment = nil

  # RFC2396, Section 5.2, 4)
  if !authority
    base.path = merge_path(base.path, oth.path) if base.path && oth.path
  else
    # RFC2396, Section 5.2, 4)
    base.path = oth.path if oth.path
  end

  # RFC2396, Section 5.2, 7)
  base.user = oth.userinfo if oth.userinfo
  base.host = oth.host if oth.host
  base.port = oth.port if oth.port
  base.query = oth.query if oth.query
  base.fragment = oth.fragment if oth.fragment

  base
end
end
|
|
@ -0,0 +1,97 @@
|
||||||
|
require "uri"
|
||||||
|
require "halite"
|
||||||
|
|
||||||
|
require "./page/content_types"
|
||||||
|
require "./page/cookies"
|
||||||
|
require "./page/html"
|
||||||
|
require "./page/status_codes"
|
||||||
|
|
||||||
|
require "./document/html"
|
||||||
|
|
||||||
|
module Arachnid
  # Represents a page requested from a website: the request URL, the
  # `Halite::Response`, and (via the included modules) helpers for
  # content types, cookies, HTML links, and status codes.
  class Page
    include Page::ContentTypes
    include Page::Cookies
    include Page::HTML
    include Page::StatusCodes

    # URL of the page
    getter url : URI

    # HTTP response
    getter response : Halite::Response

    # Headers returned with the body
    getter headers : HTTP::Headers

    # Lazily-populated parsed document: `Document::HTML` for HTML
    # pages, `XML::Node` for XML-ish pages, `nil` until `#doc` has
    # successfully parsed the body.
    @doc : (Document::HTML | XML::Node)?

    # Expose the parsed document's query methods directly on the page.
    delegate xpath, xpath_node, xpath_nodes, xpath_bool, xpath_float, xpath_string,
      root, at_tag, where_tag, where_class, at_id, css, at_css, to: @doc

    # Unknown methods fall through to the response headers,
    # e.g. `page["Content-Length"]`.
    forward_missing_to @headers

    # Creates a new `Page` object.
    def initialize(url : URI, response : Halite::Response)
      @url = url
      @response = response
      @headers = response.headers
    end

    # The body of the response ("" when the body is empty/absent).
    def body
      @response.body || ""
    end

    # Returns a parsed document for HTML, XML, RSS, and Atom pages.
    # Returns `nil` for empty bodies, unrecognized content types, or
    # parse failures (parse errors are swallowed). Memoized in `@doc`.
    def doc
      unless body.empty?
        # Pick a parser class based on the Content-Type predicates.
        doc_class = if html?
                      Document::HTML
                    elsif rss? || atom? || xml? || xsl?
                      XML
                    end

        if doc_class
          begin
            @doc ||= doc_class.parse(body)
          rescue
            # Intentionally swallowed: an unparseable body yields nil.
          end
        end
      end
    end

    # Searches the document for XPath or CSS paths; returns an empty
    # node array when the page has no parseable document.
    def search(path)
      if document = doc
        document.xpath_nodes(path)
      else
        [] of XML::Node
      end
    end

    # Searches for the first occurrence of an XPath or CSS path,
    # or `nil` when the page has no parseable document.
    def at(path)
      if document = doc
        document.xpath_node(path)
      end
    end

    # Alias for `#search`.
    def /(path)
      search(path)
    end

    # Alias for `#at`.
    def %(path)
      at(path)
    end

    # Size of the response body in bytes.
    def size
      @response.body.bytesize
    end

    # NOTE(review): this overrides the zero-arg `to_s`, not
    # `to_s(io : IO)`; string interpolation builds via `to_s(io)` and
    # may not pick this up — confirm intended.
    def to_s
      body
    end
  end
end
|
|
@ -0,0 +1,162 @@
|
||||||
|
module Arachnid
  class Page
    # Content-Type inspection helpers mixed into `Page`.
    # All predicates match against the `Content-Type` header values,
    # ignoring any `;`-delimited parameters (charset, etc.).
    module ContentTypes
      # The Content-Type of the page ("" when the header is absent).
      def content_type
        @response.content_type || ""
      end

      # All `Content-Type` header values of the page.
      def content_types
        # Previously assigned to an unused local; return directly.
        @response.headers.get?("content-type") || [] of String
      end

      # The charset included in the Content-Type, or `nil` when no
      # content-type value carries a `charset=` parameter.
      def content_charset
        content_types.each do |value|
          if value.includes?(";")
            value.split(";").each do |param|
              # Crystal strings are immutable — there is no `String#strip!`
              # (the original called it); rebind the stripped copy instead.
              param = param.strip

              if param.starts_with?("charset=")
                return param.split("=", 2).last
              end
            end
          end
        end

        nil
      end

      # Determines if any of the content-types of the page include a given
      # type (exact string equality, or regex match when given a `Regex`).
      def is_content_type?(type : String | Regex)
        content_types.any? do |value|
          # Drop parameters such as "; charset=utf-8" before comparing.
          value = value.split(";", 2).first

          if type.is_a?(Regex)
            value =~ type
          else
            value == type
          end
        end
      end

      # Determines if the page is plain-text.
      def plain_text?
        is_content_type?("text/plain")
      end

      # ditto
      def text?
        plain_text?
      end

      # Determines if the page is a Directory Listing.
      def directory?
        is_content_type?("text/directory")
      end

      # Determines if the page is an HTML document.
      def html?
        is_content_type?("text/html")
      end

      # Determines if the page is an XML document.
      def xml?
        is_content_type?(/(text|application)\/xml/)
      end

      # Determines if the page is an XML Stylesheet (XSL).
      def xsl?
        is_content_type?("text/xsl")
      end

      # Determines if the page is JavaScript.
      def javascript?
        is_content_type?(/(text|application)\/javascript/)
      end

      # Determines if the page is JSON.
      def json?
        is_content_type?("application/json")
      end

      # Determines if the page is a CSS stylesheet.
      def css?
        is_content_type?("text/css")
      end

      # Determines if the page is an RSS feed.
      def rss?
        is_content_type?(/application\/(rss\+xml|rdf\+xml)/)
      end

      # Determines if the page is an Atom feed.
      def atom?
        is_content_type?("application/atom+xml")
      end

      # Determines if the page is an MS Word document.
      def ms_word?
        is_content_type?("application/msword")
      end

      # Determines if the page is a PDF document.
      def pdf?
        is_content_type?("application/pdf")
      end

      # Determines if the page is a ZIP archive.
      def zip?
        is_content_type?("application/zip")
      end

      # Determines if the page is an image.
      def image?
        is_content_type?(/image\//)
      end

      # Determines if the page is a PNG image.
      def png?
        is_content_type?("image/png")
      end

      # Determines if the page is a GIF image.
      def gif?
        is_content_type?("image/gif")
      end

      # Determines if the page is a JPEG image.
      def jpg?
        is_content_type?(/image\/(jpg|jpeg)/)
      end

      # Determines if the page is an SVG image.
      def svg?
        is_content_type?(/image\/svg(\+xml)?/)
      end

      # Determines if the page is a video.
      def video?
        is_content_type?(/video\/.*/)
      end

      # Determines if the page is an MP4 video.
      def mp4?
        is_content_type?("video/mp4")
      end

      # Determines if the page is an AVI video.
      def avi?
        is_content_type?("video/x-msvideo")
      end

      # Determines if the page is a WMV video.
      def wmv?
        is_content_type?("video/x-ms-wmv")
      end

      # Determines if the page is a QuickTime video.
      def quicktime?
        is_content_type?("video/quicktime")
      end

      # Determines if the page is Flash content.
      def flash?
        is_content_type?("video/flash") ||
          is_content_type?("application/x-shockwave-flash")
      end
    end
  end
end
|
|
@ -0,0 +1,18 @@
|
||||||
|
module Arachnid
  class Page
    # Access to the cookies delivered along with a page.
    module Cookies
      # Reserved names used within Cookie strings
      RESERVED_COOKIE_NAMES = Regex.new("^(?:Path|Expires|Domain|Secure|HTTPOnly)$", :ignore_case)

      # The raw `Set-Cookie` header string sent along with the page,
      # or "" when the response carried no such header.
      def cookie
        @response.headers.fetch("Set-Cookie", "")
      end

      # The parsed cookie values sent along with the page.
      def cookies
        @response.cookies
      end
    end
  end
end
|
|
@ -0,0 +1,204 @@
|
||||||
|
require "../extensions/uri"
|
||||||
|
|
||||||
|
module Arachnid
  class Page
    # Link- and URL-extraction helpers for HTML pages.
    #
    # TODO: Create enumerable methods for the methods that take a block
    module HTML
      # include Enumerable

      # The title of the HTML page, or `nil` when it has none.
      def title
        if (node = at("//title"))
          node.inner_text
        end
      end

      # Enumerates over the meta-redirect links in the page.
      def each_meta_redirect(&block : URI ->)
        if html? && doc
          search("//meta[@http-equiv and @content]").each do |node|
            if node["http-equiv"] =~ /refresh/i
              content = node["content"]

              if (redirect = content.match(/url=(\S+)$/))
                yield URI.parse(redirect[1])
              end
            end
          end
        end
      end

      # Returns a boolean indicating whether or not page-level meta
      # redirects are present in this page.
      def meta_redirect?
        !meta_redirects.empty?
      end

      # The meta-redirect links of the page.
      def meta_redirects
        redirects = [] of URI
        each_meta_redirect { |r| redirects << r }
        redirects
      end

      # Enumerates over every HTTP or meta-redirect link in the page.
      def each_redirect(&block : URI ->)
        if (locations = @response.headers.get?("Location"))
          # Location headers override any meta-refresh redirects in the
          # HTML. (The original parsed these URIs but never yielded them.)
          locations.each { |l| block.call URI.parse(l) }
        else
          # check page-level meta redirects if there isn't a location header
          each_meta_redirect(&block)
        end
      end

      # URLs that this document redirects to.
      def redirects_to
        # `each_redirect` requires a block, so collect explicitly
        # (the original called `each_redirect.to_a`).
        redirects = [] of URI
        each_redirect { |r| redirects << r }
        redirects
      end

      # Enumerates over every `mailto:` address in the page
      # (yields the address with the "mailto:" prefix stripped).
      def each_mailto(&block)
        # Bind `doc` to a local so the nilable type is narrowed.
        if html? && (d = doc)
          d.xpath_nodes("//a[starts-with(@href,'mailto:')]").each do |a|
            yield a["href"][7..-1]
          end
        end
      end

      # `mailto:` addresses in the page.
      def mailtos
        # Collect via block; `each_mailto.to_a` was not callable.
        result = [] of String
        each_mailto { |m| result << m }
        result
      end

      # Enumerates over every link in the page: redirects, images,
      # scripts, linked resources, anchors, frames and iframes.
      def each_link(&block : URI ->)
        each_redirect(&block) if redirect?

        each_image(&block)

        each_script(&block)

        each_resource(&block)

        if html? && (d = doc)
          d.xpath_nodes("//a[@href]").each do |a|
            link = to_absolute(a["href"])
            block.call link if link
          end

          d.xpath_nodes("//frame[@src]").each do |frame|
            link = to_absolute(frame["src"])
            block.call link if link
          end

          d.xpath_nodes("//iframe[@src]").each do |iframe|
            link = to_absolute(iframe["src"])
            block.call link if link
          end
        end
      end

      # Enumerates over every external script URL in the page.
      def each_script(&block : URI ->)
        if html? && (d = doc)
          d.xpath_nodes("//script[@src]").each do |script|
            url = to_absolute(script["src"])
            yield url if url
          end
        end
      end

      # Enumerates over every linked resource (`<link href=...>`).
      #
      # NOTE(review): unlike the other enumerators this yields the raw
      # parsed href without `to_absolute` — confirm intended.
      def each_resource(&block : URI ->)
        if html? && (d = doc)
          d.xpath_nodes("//link[@href]").each do |link|
            yield URI.parse(link["href"])
          end
        end
      end

      # Enumerates over every image URL in the page (src and srcset).
      def each_image(&block : URI ->)
        if html? && (d = doc)
          d.xpath_nodes("//img[@src]").each do |img|
            url = to_absolute(img["src"])
            yield url if url
          end

          d.xpath_nodes("//img[@srcset]").each do |set|
            # srcset entries alternate "url descriptor"; keep the URLs
            # (even-indexed space-separated tokens).
            sources = set["srcset"].split(" ").map_with_index { |e, i| (i.zero? || i.even?) ? e : nil }.compact
            sources.each do |source|
              url = to_absolute(source)
              yield url if url
            end
          end
        end
      end

      # Enumerates over every video URL in the page (src and <source>).
      def each_video(&block : URI ->)
        if html? && (d = doc)
          d.xpath_nodes("//video[@src]").each do |video|
            url = to_absolute(video["src"])
            yield url if url
          end

          d.xpath_nodes("//video/source[@src]").each do |source|
            url = to_absolute(source["src"])
            yield url if url
          end
        end
      end

      # The links from within the page.
      def links
        links = [] of URI
        each_link { |link| links << link }
        links
      end

      # Enumerates over every absolute URL in the page.
      def each_url(&block : URI ->)
        # The original passed `&block` AND a literal block to
        # `each_link`, which is invalid; just pass the literal block.
        each_link do |link|
          if (u = to_absolute(link))
            yield u
          end
        end
      end

      # ditto
      def each(&block)
        each_url { |url| yield url }
      end

      # Absolute URIs from within the page.
      def urls
        urls = [] of URI
        # The original pushed an undefined `link` here.
        each_url { |url| urls << url }
        urls
      end

      # Normalizes and expands a given link into a proper absolute URI
      # based on this page's URL. Returns `nil` when merging fails.
      def to_absolute(link)
        link = link.is_a?(URI) ? link : URI.parse(link)

        new_url = begin
          url.merge(link)
        rescue Exception
          return
        end

        if (!new_url.opaque?) && (path = new_url.path)
          # ensure that paths begin with a leading '/' for URI::FTP
          if (new_url.scheme == "ftp" && !path.starts_with?("/"))
            # String#insert returns a new string (strings are
            # immutable); the original discarded the result.
            path = path.insert(0, "/")
          end

          # make sure the path does not contain any .. or . directories,
          # since URI::Generic#merge cannot normalize paths such as
          # "/stuff/../"
          new_url.path = URI.expand_path(path)
        end

        new_url
      end
    end
  end
end
|
|
@ -0,0 +1,59 @@
|
||||||
|
module Arachnid
  class Page
    # Predicates over the HTTP status code of a page's response.
    module StatusCodes
      # The numeric response code of the page.
      def code
        @response.status_code.to_i
      end

      # Whether the response code is `200`.
      def ok?
        code == 200
      end

      # Whether the response code is `308`.
      #
      # NOTE(review): 308 is "Permanent Redirect", which is an odd code
      # for a timeout predicate — confirm intended.
      def timedout?
        code == 308
      end

      # Whether the response code is `400`.
      def bad_request?
        code == 400
      end

      # Whether the response code is `401`.
      def unauthorized?
        code == 401
      end

      # Whether the response code is `403`.
      def forbidden?
        code == 403
      end

      # Whether the response code is `404`.
      def missing?
        code == 404
      end

      # Whether the response code is `500`.
      def had_internal_server_error?
        code == 500
      end

      # Whether the response is a redirect: status `300`..`303` or
      # `307`, or a `200` page carrying a meta-refresh "soft" redirect.
      def redirect?
        return true if (300..303).includes?(code) || code == 307
        code == 200 && meta_redirect?
      end
    end
  end
end
|
|
@ -0,0 +1,231 @@
|
||||||
|
require "uri"
|
||||||
|
|
||||||
|
module Arachnid
  # Parses robots.txt files for the perusal of a single user-agent.
  #
  # The behaviour implemented is guided by the following sources, though
  # as there is no widely accepted standard, it may differ from other
  # implementations:
  #
  # http://www.robotstxt.org/orig.html
  # - the original, now imprecise and outdated version
  # http://www.robotstxt.org/norobots-rfc.txt
  # - a much more precise, outdated version
  # http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449&from=35237
  # - a few hints at modern protocol extensions.
  #
  # This parser only considers lines starting with (case-insensitively:)
  #   Useragent: User-agent: Allow: Disallow: Sitemap:
  #
  # The file is divided into sections of one or more User-agent: lines
  # followed by one or more Allow: or Disallow: rules. The first section
  # whose User-agent: glob matches the robot's user-agent is authoritative;
  # within it, the first Allow:/Disallow: rule that matches the path wins,
  # and no match means the access is allowed. "*" in globs matches any
  # number of any characters; a trailing "$" anchors the glob to the whole
  # path (with or without query string). %-encodings are normalised except
  # for /?=& which carry meaning.
  #
  # TODO: Rework to allow for multiple Robots
  class Robots
    # A single path rule: {glob, allowed?}.
    alias Rule = Tuple(String, Bool)
    # A user-agent glob with its ordered path rules.
    alias RuleSet = Tuple(String, Array(Rule))

    getter body : String

    getter user_agent : String

    getter rules : Array(Tuple(String, Array(Rule)))

    getter sitemaps : Array(String)

    def initialize(@body : String, @user_agent : String)
      @sitemaps = [] of String
      @rules = [] of RuleSet
      parse(@body)
    end

    # Given a URI object, or a string representing one, determine whether
    # this robots.txt would allow access to the path.
    def allowed?(uri)
      # Only parse strings; a URI argument is used as-is
      # (the original unconditionally re-parsed).
      uri = URI.parse(uri) unless uri.is_a?(URI)
      path = (uri.path || "/") + (uri.query ? "?" + uri.query.to_s : "")
      path_allowed?(@user_agent, path)
    end

    # Check whether the relative path (a string of the url's path and query
    # string) is allowed by the rules we have for the given user_agent.
    #
    private def path_allowed?(user_agent, path)
      @rules.each do |(ua_glob, path_globs)|
        if match_ua_glob user_agent, ua_glob
          path_globs.each do |(path_glob, allowed)|
            return allowed if match_path_glob path, path_glob
          end
          # A matching section with no matching rule allows access.
          return true
        end
      end
      # No matching section at all: allowed.
      true
    end

    # This does a case-insensitive substring match such that if the user
    # agent is contained within the glob, or vice-versa, we will match.
    #
    # According to the standard, *s shouldn't appear in the user-agent
    # field except in the case of "*" meaning all user agents. Google
    # however imply that the * will work, at least at the end of a string.
    #
    # For consistency, and because it seems expected behaviour, and because
    # a glob * will match a literal * we use glob matching not string
    # matching.
    #
    # The standard also advocates a substring match of the robot's
    # user-agent within the user-agent field. From observation, it seems
    # much more likely that the match will be the other way about, though
    # we check for both.
    #
    private def match_ua_glob(user_agent, glob)
      glob =~ Regex.new(Regex.escape(user_agent), Regex::Options::IGNORE_CASE) ||
        user_agent =~ Regex.new(reify(glob), Regex::Options::IGNORE_CASE)
    end

    # This does case-sensitive prefix matching, such that if the path
    # starts with the glob, we will match.
    #
    # Asterisks are treated as globs, and a trailing $ forces the glob to
    # match the entire path (with or without the query string).
    #
    # %-encoding variations are handled by normalize_percent_encoding.
    # Returns falsy (via the rescue) when the glob builds an invalid regex.
    #
    private def match_path_glob(path, glob)
      if glob =~ /\$$/
        # "\\?" keeps a literal backslash-? in the regex source; the
        # original "\?" is an invalid string escape in Crystal.
        end_marker = "(?:\\?|$)"
        glob = glob.gsub /\$$/, ""
      else
        end_marker = ""
      end

      glob = normalize_percent_encoding(glob)
      path = normalize_percent_encoding(path)

      path =~ Regex.new("^" + reify(glob) + end_marker)
    rescue e
      false
    end

    # As a general rule, we want to ignore different representations of
    # the same URL. Naively we could just unescape, or escape, everything,
    # however the standard implies that a / is a HTTP path separator,
    # while a %2F is an encoded / that does not act as a path separator.
    # Similar issues with ?, & and =, though all other characters are
    # fine. (While : also has a special meaning in HTTP, most
    # implementations ignore this in the path.)
    #
    # %-encoding is case-insensitive, so we explicitly upcase the few
    # that we want to keep.
    #
    private def normalize_percent_encoding(path)
      # First double-escape any characters we don't want to unescape:
      # & / = ?
      path = path.gsub(/%(26|2F|3D|3F)/i) do |code|
        "%25#{code.upcase}"
      end

      # NOTE(review): URI.unescape is deprecated in newer Crystal in
      # favour of URI.decode — update when bumping the Crystal version.
      URI.unescape(path)
    end

    # Convert the asterisks in a glob into (.*)s for regular expressions,
    # and at the same time, escape any other characters that would have
    # a significance in a regex.
    #
    private def reify(glob)
      glob.split("*").map { |part| Regex.escape(part) }.join(".*")
    end

    # Convert the @body into a set of @rules so that our parsing mechanism
    # becomes easier.
    #
    # @rules is an array of pairs. The first in the pair is the glob for
    # the user-agent and the second another array of pairs. The first of
    # the new pair is a glob for the path, and the second whether it
    # appears in an Allow: or a Disallow: rule.
    #
    # For example:
    #
    #   User-agent: *
    #   Disallow: /secret/
    #   Allow: /     # allow everything...
    #
    # parses to:
    #
    #   @rules = [{"*", [{"/secret/", false}, {"/", true}]}]
    #
    # Rule order from the file is preserved. A blank Disallow: is treated
    # as an Allow: * and consecutive User-agent lines share one rule set.
    #
    private def parse(body)
      # Parser state must persist across lines so that consecutive
      # User-agent lines can share a rule set (the original reset it
      # inside the loop, making the :user_agent branch unreachable).
      parser_mode = :begin

      body.split(/[\r\n]+/).each do |line|
        parts = line.delete("\000").split(":", 2).map(&.strip)
        # Skip blank or malformed lines; unguarded destructuring of a
        # one-element array would raise.
        next unless parts.size == 2
        prefix, value = parts
        # Strip trailing comments.
        value = value.sub /\s+#.*/, ""

        case prefix.downcase
        when /^user-?agent$/
          if parser_mode == :user_agent
            # Consecutive User-agent lines share the same rule set.
            @rules << {value, rules.last[1]}
          else
            parser_mode = :user_agent
            @rules << {value, [] of Rule}
          end
        when "disallow"
          parser_mode = :rules
          @rules << {"*", [] of Rule} if @rules.empty?

          if value == ""
            # A blank Disallow: means "allow everything".
            @rules.last[1] << {"*", true}
          else
            @rules.last[1] << {value, false}
          end
        when "allow"
          parser_mode = :rules
          @rules << {"*", [] of Rule} if @rules.empty?
          @rules.last[1] << {value, true}
        when "sitemap"
          @sitemaps << value
        else
          # Ignore comments, Crawl-delay: and badly formed lines.
        end
      end
    end
  end
end
|
|
@ -0,0 +1,53 @@
|
||||||
|
module Arachnid
  # The `Rules` class represents collections of acceptance and rejection
  # rules, which are used to filter data.
  class Rules(T)
    # Accept rules
    getter accept : Array(Proc(T | Nil, Bool) | T | Regex | String)

    # Reject rules
    getter reject : Array(Proc(T | Nil, Bool) | T | Regex | String)

    # Creates a new `Rules` object; `nil` rule lists become empty arrays.
    def initialize(accept : Array(Proc(T | Nil, Bool) | T | Regex | String)? = nil, reject : Array(Proc(T | Nil, Bool) | T | Regex | String)? = nil)
      @accept = accept || ([] of Proc(T | Nil, Bool) | T | Regex | String)
      @reject = reject || ([] of Proc(T | Nil, Bool) | T | Regex | String)
    end

    # Determines whether the data should be accepted.
    #
    # With no rules at all, everything is accepted. When accept rules
    # exist they take precedence: data must match one of them. Otherwise
    # data is accepted unless a reject rule matches.
    def accept?(data : T)
      return true if @accept.empty? && @reject.empty?
      return @accept.any? { |rule| test_data(data, rule) } unless @accept.empty?
      @reject.none? { |rule| test_data(data, rule) }
    end

    # Replaces the accept rules (`nil` clears them).
    def accept=(value)
      @accept = value || ([] of Proc(T | Nil, Bool) | T | Regex | String)
    end

    # Determines whether the data should be rejected.
    def reject?(data : T)
      !accept?(data)
    end

    # Replaces the reject rules (`nil` clears them).
    def reject=(value)
      @reject = value || ([] of Proc(T | Nil, Bool) | T | Regex | String)
    end

    # Tests the given data against a single rule: a predicate proc,
    # a regex over the data's string form, or an equality check.
    private def test_data(data : T, rule)
      case rule
      when Proc
        rule.call(data) == true
      when Regex
        data.to_s.matches?(rule)
      else
        data == rule
      end
    end
  end
end
|
|
@ -0,0 +1,112 @@
|
||||||
|
require "uri"
|
||||||
|
require "halite"
|
||||||
|
|
||||||
|
module Arachnid
  # Stores active HTTP sessions (Halite clients) organized by scheme,
  # host-name and port.
  class SessionCache
    # Optional read timeout.
    property read_timeout : Int32

    # Optional connect timeout.
    property connect_timeout : Int32

    # Max redirects to follow.
    property max_redirects : Int32?

    # Should we set a DNT (Do Not Track) header?
    property? do_not_track : Bool

    # Active sessions, keyed by {scheme, host, port}.
    @sessions = {} of Tuple(String?, String?, Int32?) => Halite::Client

    # Create a new session cache. `nil` options fall back to the
    # module-level defaults on `Arachnid`.
    #
    # NOTE(review): `follow_redirects` is accepted but currently unused;
    # kept for interface compatibility.
    def initialize(
      read_timeout : Int32? = nil,
      connect_timeout : Int32? = nil,
      follow_redirects : Bool? = nil,
      max_redirects : Int32? = nil,
      do_not_track : Bool? = nil
    )
      @read_timeout = read_timeout || Arachnid.read_timeout
      @connect_timeout = connect_timeout || Arachnid.connect_timeout
      @max_redirects = max_redirects || Arachnid.max_redirects
      # Explicit nil-check so a caller passing `false` is not silently
      # overridden by the global default (as `||` would do).
      @do_not_track = do_not_track.nil? ? Arachnid.do_not_track? : do_not_track
    end

    # Determines if there is an active session for the given URL.
    def active?(url)
      # normalize the url
      url = URI.parse(url) unless url.is_a?(URI)

      @sessions.has_key?(key_for(url))
    end

    # Provides an active session for a given URL, creating one on the
    # first request to a given {scheme, host, port}.
    def [](url)
      # normalize the url
      url = URI.parse(url) unless url.is_a?(URI)

      # session key
      key = key_for(url)

      unless @sessions.has_key?(key)
        # normalize the endpoint (only needed on a cache miss)
        endpoint = url.dup
        endpoint.scheme ||= "http"
        endpoint.query = nil
        endpoint.fragment = nil
        endpoint.path = ""

        # DNT header: "1"/"0" — header values are strings on the wire.
        headers = {
          "DNT" => @do_not_track ? "1" : "0",
        }

        session = Halite::Client.new(
          endpoint: endpoint,
          timeout: Halite::Timeout.new(
            connect: @connect_timeout,
            read: @read_timeout
          ),
          follow: Halite::Follow.new(
            hops: @max_redirects,
            strict: false
          ),
          headers: headers,
        )

        # session = session.logging(skip_request_body: true, skip_response_body: true)

        @sessions[key] = session
      end

      @sessions[key]
    end

    # Destroys the HTTP session (if any) for the given URL's scheme,
    # host, and port.
    def kill!(url)
      # normalize the url
      url = URI.parse(url) unless url.is_a?(URI)

      # Hash#delete is a no-op for absent keys; the original looked the
      # key up with `@sessions[key]`, which raises KeyError on a miss.
      @sessions.delete(key_for(url))
    end

    # Clears the session cache.
    def clear
      @sessions.clear
    end

    # Creates a session key based on the URL.
    private def key_for(url)
      {url.scheme, url.host, url.port}
    end
  end
end
|
|
@ -0,0 +1,3 @@
|
||||||
|
module Arachnid
  # Library version string. Keep in sync with shard.yml.
  VERSION = "0.1.0"
end
|
Loading…
Reference in New Issue