Initial commit

commit 9b82f6b48a
.editorconfig
@@ -0,0 +1,9 @@
root = true

[*.cr]
charset = utf-8
end_of_line = lf
insert_final_newline = true
indent_style = space
indent_size = 2
trim_trailing_whitespace = true
.gitignore
@@ -0,0 +1,9 @@
/docs/
/lib/
/bin/
/.shards/
*.dwarf

# Libraries don't need dependency lock
# Dependencies will be locked in applications that use them
/shard.lock
.travis.yml
@@ -0,0 +1,6 @@
language: crystal

# Uncomment the following if you'd like Travis to run specs and check code formatting
# script:
#   - crystal spec
#   - crystal tool format --check
LICENSE
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2019 Chris Watson

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
README.md
@@ -0,0 +1,95 @@
# Arachnid

Arachnid is a fast and powerful web scraping framework for Crystal. It provides an easy-to-use DSL for scraping webpages and processing all of the things you might come across.

## Installation

1. Add the dependency to your `shard.yml`:

   ```yaml
   dependencies:
     arachnid:
       github: watzon/arachnid
   ```

2. Run `shards install`

## Usage

Arachnid provides an easy-to-use, powerful DSL for scraping websites.

```crystal
require "arachnid"
require "json"

# Let's build a sitemap of crystal-lang.org
# Links will be a hash of url to page title
links = {} of String => String

# Visit a particular host, in this case `crystal-lang.org`. This will
# not match on subdomains.
Arachnid.host("https://crystal-lang.org") do |spider|
  # Ignore the API section. It's a little big.
  spider.ignore_urls_like(/.*\/api.*/)

  spider.every_page do |page|
    puts "Visiting #{page.url.to_s}"

    # Ignore redirects for our sitemap
    unless page.redirect?
      # Add the url of every visited page to our sitemap
      links[page.url.to_s] = page.title.to_s.strip
    end
  end
end

File.write("crystal-lang.org-sitemap.json", links.to_pretty_json)
```

Want to scan external links as well?

```crystal
# To make things interesting, this time let's download
# every image we find.
Arachnid.start_at("https://crystal-lang.org") do |spider|
  # Set a base path to store all the images at
  base_image_dir = File.expand_path("~/Pictures/arachnid")
  Dir.mkdir_p(base_image_dir)

  spider.every_page do |page|
    puts "Scanning #{page.url.to_s}"

    if page.image?
      # Since we're going to be saving a lot of images
      # let's spawn a new fiber for each one. This
      # makes things so much faster.
      spawn do
        # Output directory for images for this host
        directory = File.join(base_image_dir, page.url.host.to_s)
        Dir.mkdir_p(directory)

        # The name of the image
        filename = File.basename(page.url.path)

        # Save the image using the body of the page
        puts "Saving #{filename} to #{directory}"
        File.write(File.join(directory, filename), page.body)
      end
    end
  end
end
```

More documentation will be coming soon!

## Contributing

1. Fork it (<https://github.com/watzon/arachnid/fork>)
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Commit your changes (`git commit -am 'Add some feature'`)
4. Push to the branch (`git push origin my-new-feature`)
5. Create a new Pull Request

## Contributors

- [Chris Watson](https://github.com/watzon) - creator and maintainer
shard.yml
@@ -0,0 +1,17 @@
name: arachnid
version: 0.1.0

authors:
  - Chris Watson <chris@watzon.me>

dependencies:
  halite:
    github: icyleaf/halite
    version: ~> 0.10.1
  crystagiri:
    github: madeindjs/crystagiri
    branch: master

crystal: 0.29.0

license: MIT
spec/arachnid_spec.cr
@@ -0,0 +1,9 @@
require "./spec_helper"

describe Arachnid do
  # TODO: Write tests

  it "works" do
    false.should eq(true)
  end
end
spec/spec_helper.cr
@@ -0,0 +1,2 @@
require "spec"
require "../src/arachnid"
src/arachnid.cr
@@ -0,0 +1,32 @@
require "./arachnid/version"
require "./arachnid/arachnid"

# To make things interesting, this time let's download
# every image we find.
Arachnid.start_at("https://crystal-lang.org") do |spider|
  # Set a base path to store all the images at
  base_image_dir = File.expand_path("~/Pictures/arachnid")
  Dir.mkdir_p(base_image_dir)

  spider.every_page do |page|
    puts "Scanning #{page.url.to_s}"

    if page.image?
      # Since we're going to be saving a lot of images
      # let's spawn a new fiber for each one. This
      # makes things so much faster.
      spawn do
        # Output directory for images for this host
        directory = File.join(base_image_dir, page.url.host.to_s)
        Dir.mkdir_p(directory)

        # The name of the image
        filename = File.basename(page.url.path)

        # Save the image using the body of the page
        puts "Saving #{filename} to #{directory}"
        File.write(File.join(directory, filename), page.body)
      end
    end
  end
end
src/arachnid/agent.cr
@@ -0,0 +1,543 @@
require "./agent/sanitizers"
require "./agent/filters"
require "./agent/events"
require "./agent/actions"
require "./agent/robots"
require "./page"
require "./session_cache"
require "./cookie_jar"
require "./auth_store"

module Arachnid
  class Agent

    getter? running : Bool

    # Set to limit to a single host.
    property host : String?

    # User agent to use.
    property user_agent : String

    # HTTP Host header to use.
    property host_header : String?

    # HTTP Host Headers to use for specific hosts.
    property host_headers : Hash(String | Regex, String)

    # HTTP Headers to use for every request.
    property default_headers : Hash(String, String)

    # HTTP Authentication credentials.
    property authorized : AuthStore

    # Referer to use.
    property referer : String?

    # Delay in between fetching pages.
    property fetch_delay : Time::Span | Int32

    # History containing visited URLs.
    getter history : Set(URI)

    # List of unreachable URIs.
    getter failures : Set(URI)

    # Queue of URLs to visit.
    getter queue : Array(URI)

    # The session cache.
    property sessions : SessionCache

    # Cached cookies.
    property cookies : CookieJar

    # Maximum number of pages to visit.
    property limit : Int32?

    # Maximum depth.
    property max_depth : Int32?

    # The visited URLs and their depth within a site.
    property levels : Hash(URI, Int32)

    # Creates a new `Agent` object.
    def initialize(
      host : String? = nil,
      read_timeout : Int32? = nil,
      connect_timeout : Int32? = nil,
      follow_redirects : Bool? = nil,
      max_redirects : Int32? = nil,
      do_not_track : Bool? = nil,
      default_headers : Hash(String, String)? = nil,
      host_header : String? = nil,
      host_headers : Hash(String | Regex, String)? = nil,
      user_agent : String? = nil,
      referer : String? = nil,
      fetch_delay : (Int32 | Time::Span)? = nil,
      queue : Array(URI)? = nil,
      history : Set(URI)? = nil,
      limit : Int32? = nil,
      max_depth : Int32? = nil,
      robots : Bool? = nil,
      filter_options = nil
    )
      @host = host

      @host_header = host_header
      @host_headers = host_headers || {} of (Regex | String) => String
      @default_headers = default_headers || {} of String => String

      @user_agent = user_agent || Arachnid.user_agent
      @referer = referer

      @running = false
      @fetch_delay = fetch_delay || 0
      @history = history || Set(URI).new
      @failures = Set(URI).new
      @queue = queue || [] of URI

      @limit = limit
      @levels = {} of URI => Int32
      @max_depth = max_depth

      @sessions = SessionCache.new(
        read_timeout,
        connect_timeout,
        follow_redirects,
        max_redirects,
        do_not_track
      )

      @cookies = CookieJar.new
      @authorized = AuthStore.new

      if filter_options
        initialize_filters(**filter_options)
      else
        initialize_filters
      end

      initialize_robots if robots || Arachnid.robots?
    end

    # Create a new scoped `Agent` in a block.
    def self.new(**options, &block : Agent ->)
      _new = new(**options)
      with _new yield _new
      _new
    end

    # Creates a new `Agent` and begins spidering at the given URL.
    def self.start_at(url, **options, &block : Agent ->)
      agent = new(**options, &block)
      agent.start_at(url, force: true)
    end

    # Creates a new `Agent` and spiders the web site located
    # at the given URL.
    def self.site(url, **options, &block : Agent ->)
      url = url.is_a?(URI) ? url : URI.parse(url)
      url_regex = Regex.new(Regex.escape(url.host.to_s))

      agent = new(**options, &block)
      agent.visit_hosts_like(url_regex)

      agent.start_at(url, force: true)
    end

    # Creates a new `Agent` and spiders the given host.
    def self.host(url, **options, &block : Agent ->)
      url = url.is_a?(URI) ? url : URI.parse(url)

      options = options.merge(host: url.host)
      agent = new(**options, &block)

      agent.start_at(url, force: true)
    end

    # Clears the history of the `Agent`.
    def clear
      @queue.clear
      @history.clear
      @failures.clear
      self
    end

    # Start spidering at a given URL.
    # def start_at(url, &block : Page ->)
    #   enqueue(url)
    #   run(&block)
    # end

    # Start spidering at a given URL.
    def start_at(url, force = false)
      enqueue(url, force: force)
      return run
    end

    # Start spidering until the queue becomes empty or the
    # agent is paused.
    # def run(&block : Page ->)
    #   @running = true

    #   until @queue.empty? || paused? || limit_reached?
    #     begin
    #       visit_page(dequeue, &block)
    #     rescue Actions::Paused
    #       return self
    #     rescue Actions::Action
    #     end
    #   end

    #   @running = false
    #   @sessions.clear
    #   self
    # end

    # Start spidering until the queue becomes empty or the
    # agent is paused.
    def run
      @running = true

      until @queue.empty? || paused? || limit_reached? || !running?
        begin
          visit_page(dequeue)
        rescue Actions::Paused
          return self
        rescue Actions::Action
        end
      end

      @running = false
      @sessions.clear
      self
    end

    # Sets the history of URLs that were previously visited.
    def history=(new_history)
      @history.clear

      new_history.each do |url|
        @history << (url.is_a?(URI) ? url : URI.parse(url))
      end

      @history
    end

    # Specifies the links which have been visited.
    def visited_links
      @history.map(&.to_s)
    end

    # Specifies the hosts which have been visited.
    def visited_hosts
      history.map(&.host)
    end

    # Determines whether a URL was visited or not.
    def visited?(url)
      url = url.is_a?(URI) ? url : URI.parse(url)
      @history.includes?(url)
    end

    # Sets the list of failed URLs.
    def failures=(new_failures)
      @failures.clear

      new_failures.each do |url|
        @failures << (url.is_a?(URI) ? url : URI.parse(url))
      end

      @failures
    end

    # Determines whether a given URL could not be visited.
    def failed?(url)
      url = url.is_a?(URI) ? url : URI.parse(url)
      @failures.includes?(url)
    end

    # Sets the queue of URLs to visit.
    def queue=(new_queue)
      @queue.clear

      new_queue.each do |url|
        @queue << (url.is_a?(URI) ? url : URI.parse(url))
      end

      @queue
    end

    # Determines whether the given URL has been queued for visiting.
    def queued?(url)
      url = url.is_a?(URI) ? url : URI.parse(url)
      @queue.includes?(url)
    end

    # Enqueues a given URL for visiting, only if it passes all
    # of the agent's rules for visiting a given URL.
    def enqueue(url, level = 0, force = false)
      url = sanitize_url(url)

      if (!queued?(url) && visit?(url)) || force
        link = url.to_s

        return if url.host.to_s.empty?

        begin
          @every_url_blocks.each { |url_block| url_block.call(url) }

          @every_url_like_blocks.each do |pattern, url_blocks|
            match = case pattern
                    when Regex
                      link =~ pattern
                    else
                      (pattern == link) || (pattern == url)
                    end

            if match
              url_blocks.each { |url_block| url_block.call(url) }
            end
          end
        rescue action : Actions::Paused
          raise(action)
        rescue Actions::SkipLink
          return false
        rescue Actions::Action
        end

        @queue << url
        @levels[url] = level
        true
      end
    end

    # Gets and creates a new `Page` object from a given URL,
    # yielding the newly created page.
    def get_page(url, &block)
      url = url.is_a?(URI) ? url : URI.parse(url)

      prepare_request(url) do |session, path, handlers|
        new_page = Page.new(url, session.get(path, headers: handlers))

        # save any new cookies
        @cookies.from_page(new_page)

        yield new_page
        return new_page
      end
    end

    # Gets and creates a new `Page` object from a given URL.
    def get_page(url)
      url = url.is_a?(URI) ? url : URI.parse(url)

      prepare_request(url) do |session, path, handlers|
        new_page = Page.new(url, session.get(path, headers: handlers))

        # save any new cookies
        @cookies.from_page(new_page)

        return new_page
      end
    end

    # Posts supplied form data and creates a new Page from a given URL,
    # yielding the newly created page.
    def post_page(url, post_data = "", &block)
      url = url.is_a?(URI) ? url : URI.parse(url)

      prepare_request(url) do |session, path, handlers|
        new_page = Page.new(url, session.post(path, post_data, handlers))

        # save any new cookies
        @cookies.from_page(new_page)

        yield new_page
        return new_page
      end
    end

    # Posts supplied form data and creates a new Page from a given URL.
    def post_page(url, post_data = "")
      url = url.is_a?(URI) ? url : URI.parse(url)

      prepare_request(url) do |session, path, handlers|
        new_page = Page.new(url, session.post(path, post_data, handlers))

        # save any new cookies
        @cookies.from_page(new_page)

        return new_page
      end
    end

    # Visits a given URL and enqueues the links recovered
    # from the page to be visited later.
    # def visit_page(url, &block : Page ->)
    #   url = sanitize_url(url)

    #   get_page(url) do |page|
    #     @history << page.url

    #     begin
    #       @every_page_blocks.each { |page_block| page_block.call(page) }
    #       yield page
    #     rescue action : Actions::Paused
    #       raise(action)
    #     rescue Actions::SkipPage
    #       return Nil
    #     rescue Actions::Action
    #     end

    #     page.each_url do |next_url|
    #       begin
    #         @every_link_blocks.each do |link_block|
    #           link_block.call(page.url, next_url)
    #         end
    #       rescue action : Actions::Paused
    #         raise(action)
    #       rescue Actions::SkipLink
    #         next
    #       rescue Actions::Action
    #       end

    #       if @max_depth.nil? || @max_depth.not_nil! > (@levels[url]? || 0)
    #         @levels[url] ||= 0
    #         enqueue(next_url, @levels[url] + 1)
    #       end
    #     end
    #   end
    # end

    # Visits a given URL and enqueues the links recovered
    # from the page to be visited later.
    def visit_page(url)
      url = sanitize_url(url)

      get_page(url) do |page|
        @history << page.url

        begin
          @every_page_blocks.each { |page_block| page_block.call(page) }
        rescue action : Actions::Paused
          raise(action)
        rescue Actions::SkipPage
          return nil
        rescue Actions::Action
        end

        page.each_url do |next_url|
          begin
            @every_link_blocks.each do |link_block|
              link_block.call(page.url, next_url)
            end
          rescue action : Actions::Paused
            raise(action)
          rescue Actions::SkipLink
            next
          rescue Actions::Action
          end

          if @max_depth.nil? || @max_depth.not_nil! > (@levels[url]? || 0)
            @levels[url] ||= 0
            enqueue(next_url, @levels[url] + 1)
          end
        end
      end
    end

    # Converts the agent into a hash.
    def to_h
      {"history" => @history, "queue" => @queue}
    end

    # Prepares request headers for a given URL.
    protected def prepare_request_headers(url)
      # set any additional HTTP headers
      headers = @default_headers.dup

      unless @host_headers.empty?
        @host_headers.each do |name, header|
          if name.is_a?(Regex) ? url.host.to_s =~ name : url.host == name
            headers["Host"] = header
            break
          end
        end
      end

      headers["Host"] ||= @host_header.to_s if @host_header
      headers["User-Agent"] ||= @user_agent.to_s
      headers["Referer"] ||= @referer.to_s if @referer

      if authorization = @authorized.for_url(url.host.to_s)
        headers["Authorization"] = "Basic #{authorization}"
      end

      if header_cookies = @cookies.for_host(url.host.to_s)
        headers["Cookie"] = header_cookies.to_cookie_header
      end

      headers
    end

    # Normalizes the request path and grabs a session to handle
    # page get and post requests.
    def prepare_request(url, &block)
      path = if url.path.empty?
               "/"
             else
               url.path
             end

      # append the URL query to the path
      path += "?#{url.query}" if url.query

      headers = prepare_request_headers(url)

      begin
        sleep(@fetch_delay) if @fetch_delay.to_i > 0

        yield @sessions[url], path, headers
      rescue ex : Halite::Exception::Error | IO::Error | Socket::Error | OpenSSL::SSL::Error
        @sessions.kill!(url)
        return nil
      end
    end

    # Dequeues a URL that will later be visited.
    def dequeue
      @queue.shift
    end

    # Determines if the maximum limit has been reached.
    def limit_reached?
      if limit = @limit
        return @history.size >= limit
      end
      false
    end

    # Determines if a given URL should be visited.
    def visit?(url)
      # puts [url.to_s, visited?(url), visit_scheme?(url.scheme.to_s), visit_host?(url.host.to_s), visit_port?(url.port || -1), visit_link?(url.to_s), visit_url?(url), visit_ext?(url.path)]
      !visited?(url) &&
        visit_scheme?(url.scheme.to_s) &&
        visit_host?(url.host.to_s) &&
        visit_port?(url.port || -1) &&
        visit_link?(url.to_s) &&
        visit_url?(url) &&
        visit_ext?(url.path)
      # robot_allowed?(url.to_s)
    end

    # Adds a given URL to the failures list.
    def failed(url)
      @failures << url
      @every_failed_url_blocks.each { |fail_block| fail_block.call(url) }
      true
    end
  end
end
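Taken together, `start_at`, `site`, and `host` are the three entry points built on `#initialize`. A minimal sketch of a bounded crawl using the options defined above (the URL and values are illustrative):

```crystal
require "arachnid"

# `limit: 100` ends the crawl via `limit_reached?` once 100 pages are in
# the history; `fetch_delay: 1` sleeps between requests in `prepare_request`.
Arachnid::Agent.start_at("https://crystal-lang.org", limit: 100, fetch_delay: 1) do |spider|
  spider.every_page do |page|
    puts page.url
  end
end
```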
src/arachnid/agent/actions.cr
@@ -0,0 +1,53 @@
module Arachnid
  class Agent
    module Actions

      # A Runtime Error
      class RuntimeError < Exception; end

      # The base `Actions` exceptions class
      class Action < RuntimeError; end

      # Exception used to pause a running `Agent`
      class Paused < Action; end

      # Exception which causes a running `Agent` to skip a link.
      class SkipLink < Action; end

      # Exception which causes a running `Agent` to skip a page.
      class SkipPage < Action; end
    end

    # Continue spidering
    def continue!(&block)
      @paused = false
      run(&block)
    end

    # Sets the pause state of the agent.
    def pause=(state)
      @paused = state
    end

    # Pauses the agent, causing spidering to temporarily stop.
    def pause!
      @paused = true
      raise Actions::Paused.new
    end

    # Determines whether the agent is paused.
    def paused?
      @paused == true
    end

    # Causes the agent to skip the link being enqueued.
    def skip_link!
      raise Actions::SkipLink.new
    end

    # Causes the agent to skip the page being visited.
    def skip_page!
      raise Actions::SkipPage.new
    end
  end
end
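Since the actions are ordinary exceptions, a callback can bail out of a single page without stopping the crawl: `Agent#run` rescues `Actions::Action`, and `Agent#visit_page` rescues `Actions::SkipPage`. A minimal sketch (the URL is illustrative):

```crystal
Arachnid.host("https://crystal-lang.org") do |spider|
  spider.every_page do |page|
    # Raises Actions::SkipPage; visit_page rescues it, so the remaining
    # callbacks and link extraction for this page are skipped.
    spider.skip_page! unless page.html?
  end
end
```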
src/arachnid/agent/events.cr
@@ -0,0 +1,248 @@
require "../page"

module Arachnid
  class Agent
    @every_url_blocks = [] of Proc(URI, Nil)

    @every_failed_url_blocks = [] of Proc(URI, Nil)

    @every_url_like_blocks = Hash(String | Regex, Array(Proc(URI, Nil))).new do |hash, key|
      hash[key] = [] of Proc(URI, Nil)
    end

    @every_page_blocks = [] of Proc(Page, Nil)

    @every_link_blocks = [] of Proc(URI, URI, Nil)

    # Pass each URL from each page visited to the given block.
    def every_url(&block : URI ->)
      @every_url_blocks << block
      self
    end

    # Pass each URL that could not be requested to the given block.
    def every_failed_url(&block : URI ->)
      @every_failed_url_blocks << block
      self
    end

    # Pass every URL that the agent visits and that matches a given
    # pattern to a given block.
    def every_url_like(pattern, &block : URI ->)
      @every_url_like_blocks[pattern] << block
      self
    end

    # See `#every_url_like`
    def urls_like(pattern, &block : URI ->)
      every_url_like(pattern, &block)
    end

    # Pass the headers from every response the agent receives to a given
    # block.
    def all_headers(&block)
      headers = [] of HTTP::Headers
      every_page { |page| headers << page.headers }
      headers.each { |header| yield header }
    end

    # Pass every page that the agent visits to a given block.
    def every_page(&block : Page ->)
      @every_page_blocks << block
      self
    end

    # Pass every OK page that the agent visits to a given block.
    def every_ok_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.ok? }
      pages.each { |page| yield page }
    end

    # Pass every Redirect page that the agent visits to a given block.
    def every_redirect_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.redirect? }
      pages.each { |page| yield page }
    end

    # Pass every Timeout page that the agent visits to a given block.
    def every_timedout_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.timeout? }
      pages.each { |page| yield page }
    end

    # Pass every Bad Request page that the agent visits to a given block.
    def every_bad_request_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.bad_request? }
      pages.each { |page| yield page }
    end

    # Pass every Unauthorized page that the agent visits to a given block.
    def every_unauthorized_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.unauthorized? }
      pages.each { |page| yield page }
    end

    # Pass every Forbidden page that the agent visits to a given block.
    def every_forbidden_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.forbidden? }
      pages.each { |page| yield page }
    end

    # Pass every Missing page that the agent visits to a given block.
    def every_missing_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.missing? }
      pages.each { |page| yield page }
    end

    # Pass every Internal Server Error page that the agent visits to a
    # given block.
    def every_internal_server_error_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.had_internal_server_error? }
      pages.each { |page| yield page }
    end

    # Pass every Plain Text page that the agent visits to a given block.
    def every_txt_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.txt? }
      pages.each { |page| yield page }
    end

    # Pass every HTML page that the agent visits to a given block.
    def every_html_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.html? }
      pages.each { |page| yield page }
    end

    # Pass every XML page that the agent visits to a given block.
    def every_xml_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.xml? }
      pages.each { |page| yield page }
    end

    # Pass every XML Stylesheet (XSL) page that the agent visits to a
    # given block.
    def every_xsl_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.xsl? }
      pages.each { |page| yield page }
    end

    # Pass every HTML or XML document that the agent parses to a given
    # block.
    def every_doc(&block : Document::HTML | XML::Node ->)
      docs = [] of Document::HTML | XML::Node
      every_page { |page| docs << page.doc.not_nil! if page.doc }
      docs.each { |doc| yield doc }
    end

    # Pass every HTML document that the agent parses to a given block.
    def every_html_doc(&block : Document::HTML | XML::Node ->)
      docs = [] of Document::HTML | XML::Node
      every_page { |page| docs << page.doc.not_nil! if page.html? }
      docs.each { |doc| yield doc }
    end

    # Pass every XML document that the agent parses to a given block.
    def every_xml_doc(&block : XML::Node ->)
      docs = [] of XML::Node
      every_page { |page| docs << page.doc.not_nil!.as(XML::Node) if page.xml? }
      docs.each { |doc| yield doc }
    end

    # Pass every XML Stylesheet (XSL) that the agent parses to a given
    # block.
    def every_xsl_doc(&block : XML::Node ->)
      docs = [] of XML::Node
      every_page { |page| docs << page.doc.not_nil!.as(XML::Node) if page.xsl? }
      docs.each { |doc| yield doc }
    end

    # Pass every RSS document that the agent parses to a given block.
    def every_rss_doc(&block : XML::Node ->)
      docs = [] of XML::Node
      every_page { |page| docs << page.doc.not_nil!.as(XML::Node) if page.rss? }
      docs.each { |doc| yield doc }
    end

    # Pass every Atom document that the agent parses to a given block.
    def every_atom_doc(&block : XML::Node ->)
      docs = [] of XML::Node
      every_page { |page| docs << page.doc.not_nil!.as(XML::Node) if page.atom? }
      docs.each { |doc| yield doc }
    end

    # Pass every JavaScript page that the agent visits to a given block.
    def every_javascript_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.javascript? }
      pages.each { |page| yield page }
    end

    # Pass every CSS page that the agent visits to a given block.
    def every_css_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.css? }
      pages.each { |page| yield page }
    end

    # Pass every RSS feed that the agent visits to a given block.
    def every_rss_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.rss? }
      pages.each { |page| yield page }
    end

    # Pass every Atom feed that the agent visits to a given block.
    def every_atom_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.atom? }
      pages.each { |page| yield page }
    end

    # Pass every MS Word page that the agent visits to a given block.
    def every_ms_word_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.ms_word? }
      pages.each { |page| yield page }
    end

    # Pass every PDF page that the agent visits to a given block.
    def every_pdf_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.pdf? }
      pages.each { |page| yield page }
    end

    # Pass every ZIP page that the agent visits to a given block.
    def every_zip_page(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.zip? }
      pages.each { |page| yield page }
    end

    # Passes every image page to the given block.
    def every_image(&block : Page ->)
      pages = [] of Page
      every_page { |page| (pages << page) if page.image? }
      pages.each { |page| yield page }
    end

    # Passes every origin and destination URI of each link to a given
    # block.
    def every_link(&block : URI, URI ->)
      @every_link_blocks << block
      self
    end
  end
end
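Several callbacks can be registered on one agent, and all of them fire as pages are visited. A small sketch wiring up two of the hooks above (the URL is illustrative):

```crystal
Arachnid.host("https://crystal-lang.org") do |spider|
  # Log the origin and destination of every discovered link.
  spider.every_link do |origin, dest|
    puts "#{origin} -> #{dest}"
  end

  # Record URLs that could not be fetched.
  spider.every_failed_url do |url|
    STDERR.puts "failed: #{url}"
  end
end
```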
src/arachnid/agent/filters.cr
@@ -0,0 +1,256 @@
require "../rules"

module Arachnid
  class Agent
    # List of acceptable URL schemes to follow
    getter schemes : Array(String) = [] of String

    @host_rules = Rules(String).new
    @port_rules = Rules(Int32).new
    @link_rules = Rules(String).new
    @url_rules = Rules(URI).new
    @ext_rules = Rules(String).new

    # Sets the list of acceptable URL schemes to visit.
    def schemes=(new_schemes)
      @schemes = new_schemes.map(&.to_s)
    end

    # Specifies the patterns that match host-names to visit.
    def visit_hosts
      @host_rules.accept
    end

    # Adds a given pattern to the `#visit_hosts`.
    def visit_hosts_like(pattern)
      visit_hosts << pattern
      self
    end

    def visit_hosts_like(&block)
      visit_hosts << block
      self
    end

    # Specifies the patterns that match host-names to not visit.
    def ignore_hosts
      @host_rules.reject
    end

    # Adds a given pattern to the `#ignore_hosts`.
    def ignore_hosts_like(pattern)
      ignore_hosts << pattern
      self
    end

    def ignore_hosts_like(&block)
      ignore_hosts << block
      self
    end

    # Specifies the patterns that match the ports to visit.
    def visit_ports
      @port_rules.accept
    end

    # Adds a given pattern to the `#visit_ports`.
    def visit_ports_like(pattern)
      visit_ports << pattern
      self
    end

    def visit_ports_like(&block : Int32 -> Bool)
      visit_ports << block
      self
    end

    # Specifies the patterns that match ports to not visit.
    def ignore_ports
      @port_rules.reject
    end

    # Adds a given pattern to the `#ignore_ports`.
    def ignore_ports_like(pattern)
      ignore_ports << pattern
      self
    end

    def ignore_ports_like(&block : Int32 -> Bool)
      ignore_ports << block
      self
    end

    # Specifies the patterns that match the links to visit.
    def visit_links
      @link_rules.accept
    end

    # Adds a given pattern to the `#visit_links`
    def visit_links_like(pattern)
      visit_links << pattern
      self
    end

    def visit_links_like(&block : String -> Bool)
      visit_links << block
      self
    end

    # Specifies the patterns that match links to not visit.
    def ignore_links
      @link_rules.reject
    end

    # Adds a given pattern to the `#ignore_links`.
    def ignore_links_like(pattern)
      ignore_links << pattern
      self
    end

    def ignore_links_like(&block : String -> Bool)
      ignore_links << block
      self
    end

    # Specifies the patterns that match the URLs to visit.
    def visit_urls
      @url_rules.accept
    end

    # Adds a given pattern to the `#visit_urls`
    def visit_urls_like(&block : URI -> Bool)
      visit_urls << block
      self
    end

    def visit_urls_like(pattern)
      visit_urls << pattern
      self
    end

    # Specifies the patterns that match URLs to not visit.
    def ignore_urls
      @url_rules.reject
    end

    # Adds a given pattern to the `#ignore_urls`.
    def ignore_urls_like(&block : URI -> Bool)
      ignore_urls << block
      self
    end

    def ignore_urls_like(pattern)
      ignore_urls << pattern
      self
    end

    # Specifies the patterns that match the URI path extensions to visit.
    def visit_exts
      @ext_rules.accept
    end

    # Adds a given pattern to the `#visit_exts`.
    def visit_exts_like(&block : String -> Bool)
      visit_exts << block
      self
    end

    def visit_exts_like(pattern)
      visit_exts << pattern
      self
    end

    # Specifies the patterns that match URI path extensions to not visit.
    def ignore_exts
      @ext_rules.reject
    end

    # Adds a given pattern to the `#ignore_exts`.
    def ignore_exts_like(&block : String -> Bool)
      ignore_exts << block
      self
    end

    def ignore_exts_like(pattern)
      ignore_exts << pattern
      self
    end

    # Initializes filtering rules.
    protected def initialize_filters(
      schemes = nil,
      hosts = nil,
      ignore_hosts = nil,
      ports = nil,
      ignore_ports = nil,
      links = nil,
      ignore_links = nil,
      urls = nil,
      ignore_urls = nil,
      exts = nil,
      ignore_exts = nil
    )
      if schemes
        self.schemes = schemes
      else
        @schemes << "http"
        @schemes << "https"
      end

      @host_rules.accept = hosts
      @host_rules.reject = ignore_hosts

      @port_rules.accept = ports
      @port_rules.reject = ignore_ports

      @link_rules.accept = links
      @link_rules.reject = ignore_links

      @url_rules.accept = urls
      @url_rules.reject = ignore_urls

      @ext_rules.accept = exts
      @ext_rules.reject = ignore_exts

      if host = @host
        visit_hosts_like(host.to_s)
      end
    end

    # Determines if a given URI scheme should be visited.
    protected def visit_scheme?(scheme)
      if scheme
        @schemes.includes?(scheme)
      else
        true
      end
    end

    # Determines if a given host-name should be visited.
    protected def visit_host?(host)
      @host_rules.accept?(host)
    end

    # Determines if a given port should be visited.
    protected def visit_port?(port)
      @port_rules.accept?(port)
    end

    # Determines if a given link should be visited.
    protected def visit_link?(link)
      @link_rules.accept?(link)
    end

    # Determines if a given URL should be visited.
    protected def visit_url?(link)
      @url_rules.accept?(link)
    end

    # Determines if a given URI path extension should be visited.
    protected def visit_ext?(path)
      ext = File.extname(path)
      @ext_rules.accept?(ext)
    end
  end
end
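The accept/reject pairs can be fed either patterns or predicate blocks, and `visit?` in `agent.cr` consults all of them before enqueueing. A sketch, assuming `Rules` (not shown in this commit) matches regexes against the tested value (the patterns are illustrative):

```crystal
Arachnid.host("https://crystal-lang.org") do |spider|
  # Skip static assets; visit_ext? tests File.extname(path), e.g. ".css".
  spider.ignore_exts_like(/\.(css|js|png|gif)$/)

  # Skip any link whose path mentions the API docs.
  spider.ignore_links_like(/\/api\//)
end
```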
src/arachnid/agent/robots.cr
@@ -0,0 +1,20 @@
require "../robots"

module Arachnid
  class Agent
    @robots : Arachnid::Robots? = nil

    # Initializes the robots filter.
    def initialize_robots
      # @robots = Arachnid::Robots.new(@user_agent)
    end

    # Determines whether a URL is allowed by the robot policy.
    def robot_allowed?(url)
      if robots = @robots
        return robots.allowed?(url)
      end
      true
    end
  end
end
src/arachnid/agent/sanitizers.cr
@@ -0,0 +1,21 @@
module Arachnid
  class Agent
    # Specifies whether the Agent will strip URI fragments
    property? strip_fragments : Bool = true

    # Specifies whether the Agent will strip URI queries
    property? strip_query : Bool = false

    # Sanitizes a URL based on filtering options
    def sanitize_url(url)
      # normalize the url
      url = URI.parse(url) unless url.is_a?(URI)

      url.path = "" if url.path == "/"
      url.fragment = nil if @strip_fragments
      url.query = nil if @strip_query

      url
    end
  end
end
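Because `enqueue` passes every URL through `sanitize_url`, these two flags decide whether fragments and queries count toward URL identity. A sketch of the effect (the URL is illustrative):

```crystal
agent = Arachnid::Agent.new

# Fragments are stripped by default.
agent.sanitize_url("https://example.com/a?q=1#top").to_s
# => "https://example.com/a?q=1"

agent.strip_query = true
agent.sanitize_url("https://example.com/a?q=1#top").to_s
# => "https://example.com/a"
```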
src/arachnid/arachnid.cr
@@ -0,0 +1,39 @@
require "./page"
require "./agent"

module Arachnid
  extend self

  # Specifies whether robots.txt should be honored globally
  class_property? robots : Bool = false

  # Should we set the DNT (Do Not Track) header?
  class_property? do_not_track : Bool = false

  # Maximum amount of redirects to follow
  class_property max_redirects : Int32 = 0

  # Connect timeout.
  class_property connect_timeout : Int32 = 10

  # Read timeout.
  class_property read_timeout : Int32 = 10

  # The User-Agent string used by all Agent objects by default.
  class_property user_agent : String = "Arachnid #{Arachnid::VERSION}"

  # See `Agent.start_at`
  def start_at(url, **options, &block : Agent ->)
    Agent.start_at(url, **options, &block)
  end

  # See `Agent.host`
  def host(name, **options, &block : Agent ->)
    Agent.host(name, **options, &block)
  end

  # See `Agent.site`
  def site(url, **options, &block : Agent ->)
    Agent.site(url, **options, &block)
  end
end
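These module-level properties are the global defaults: `Agent#initialize` falls back to `Arachnid.user_agent`, and `initialize_robots` is triggered by `Arachnid.robots?` when no per-agent flag is given. A sketch of overriding a few before crawling (values illustrative):

```crystal
Arachnid.user_agent = "MyCrawler/1.0 (+https://example.com/bot)"
Arachnid.read_timeout = 30
Arachnid.do_not_track = true
```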
src/arachnid/auth_credential.cr
@@ -0,0 +1,4 @@
module Arachnid
  # Represents HTTP Authentication credentials for a website.
  record AuthCredential, username : String, password : String
end
src/arachnid/auth_store.cr
@@ -0,0 +1,83 @@
require "base64"
require "./extensions/uri"
require "./auth_credential"
require "./page"

module Arachnid
  class AuthStore
    @credentials = {} of Tuple(String?, String?, Int32?) => Hash(Array(String), AuthCredential)

    # Given a URL, return the most specific matching auth credential.
    def [](url)
      # normalize the url
      url = URI.parse(url) unless url.is_a?(URI)

      key = key_for(url)
      paths = @credentials[key]?

      return nil unless paths

      # longest path first
      ordered_paths = paths.keys.sort_by { |path_key| -path_key.size }

      # directories of the path
      path_dirs = URI.expand_path(url.path).split('/').reject(&.empty?)

      ordered_paths.each do |path|
        return paths[path] if path_dirs[0, path.size] == path
      end

      nil
    end

    # Add an auth credential to the store for the supplied base URL.
    def []=(url, auth)
      # normalize the url
      url = URI.parse(url) unless url.is_a?(URI)

      # normalize the url path and split it
      paths = URI.expand_path(url.path).split('/').reject(&.empty?)

      key = key_for(url)

      @credentials[key] ||= {} of Array(String) => AuthCredential
      @credentials[key][paths] = auth
      auth
    end

    # Convenience method to add username and password credentials
    # for a named URL.
    def add(url, username, password)
      self[url] = AuthCredential.new(username: username, password: password)
    end

    # Returns the Base64-encoded authorization string for the URL
    # or `nil` if no authorization exists.
    def for_url(url)
      if auth = self[url]
        Base64.strict_encode("#{auth.username}:#{auth.password}")
      end
    end

    # Clear the contents of the auth store.
    def clear!
      @credentials.clear
      self
    end

    # Size of the current auth store (number of URL paths stored).
    def size
      @credentials.values.reduce(0) { |acc, paths| acc + paths.size }
    end

    # Inspect the auth store.
    def inspect
      "<#{self.class}: #{@credentials.inspect}>"
    end

    # Creates an auth key based on the URL.
    private def key_for(url)
      {url.scheme, url.host, url.port}
    end
  end
end
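Credentials are keyed by scheme, host, and port, then matched by the longest stored path prefix, so deeper paths win. A usage sketch (URL and credentials illustrative):

```crystal
store = Arachnid::AuthStore.new
store.add("https://example.com/members/", "user", "secret")

# The /members/ prefix matches, so this returns the Base64 of "user:secret".
store.for_url("https://example.com/members/profile")
```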
src/arachnid/cookie_jar.cr
@@ -0,0 +1,118 @@
module Arachnid
  class CookieJar
    include Enumerable(HTTP::Cookies)

    @params : Hash(String, HTTP::Cookies)

    @cookies : HTTP::Cookies

    @dirty : Set(String)

    # Creates a new `CookieJar`
    def initialize
      @params = {} of String => HTTP::Cookies
      @cookies = HTTP::Cookies.new
      @dirty = Set(String).new
    end

    # Iterates over the host-name and cookie value pairs in the jar.
    def each(&block)
      @params.each do |kp|
        yield kp
      end
    end

    # Returns all relevant cookies in a single string for the named
    # host or domain.
    def [](host : String)
      @params[host]? || HTTP::Cookies.new
    end

    # Add a cookie to the jar for a particular domain.
    def []=(host : String, cookies : HTTP::Cookies)
      @params[host] ||= HTTP::Cookies.new

      cookies.each do |cookie|
        if @params[host][cookie.name]?.try(&.value) != cookie.value
          cookies.each do |c|
            @params[host] << c
          end
          @dirty.add(host)

          break
        end
      end

      cookies
    end

    # Retrieve cookies for a domain from the response.
    def from_page(page)
      cookies = page.cookies

      unless cookies.empty?
        self[page.url.host.to_s] = cookies
        return true
      end

      false
    end

    # Returns the pre-encoded Cookie for a given host.
    def for_host(host)
      if @dirty.includes?(host)
        values = [] of String

        cookies_for_host(host).each do |cookie|
          values << cookie.to_cookie_header
        end

        @cookies[host] = values.join("; ")
        @dirty.delete(host)
      end

      @cookies[host]?
    end

    # Returns raw cookie value pairs for a given host. Includes cookies
    # set on parent domains.
    def cookies_for_host(host)
      host_cookies = @params[host]? || HTTP::Cookies.new
      subdomains = host.split('.')

      while subdomains.size > 2
        subdomains.shift

        if parent_cookies = @params[subdomains.join('.')]?
          parent_cookies.each do |cookie|
            # copy in the parent cookies, only if they haven't been
            # overridden yet.
            unless host_cookies.has_key?(cookie.name)
              host_cookies[cookie.name] = cookie.value
            end
          end
        end
      end

      host_cookies
    end

    # Clear out the jar, removing all stored cookies.
    def clear!
      @params.clear
      @cookies.clear
      @dirty.clear
      self
    end

    # Size of the cookie jar.
    def size
      @params.size
    end

    # Inspects the cookie jar.
    def inspect
      "#<#{self.class}: #{@params.inspect}>"
    end
  end
end
src/arachnid/document/html.cr
@@ -0,0 +1,196 @@
require "xml"

module Arachnid
  module Document
    struct HTML
      @content : String

      @document : XML::Node

      @ids : Hash(String, XML::Node)

      @tags : Hash(String, Array(Tag))

      @classes : Hash(String, Array(XML::Node))

      forward_missing_to @document

      def initialize(@content : String)
        @document = XML.parse_html(@content)

        @ids = {} of String => XML::Node
        @tags = {} of String => Array(Tag)
        @classes = {} of String => Array(XML::Node)

        visit @document
      end

      def self.parse(content : String)
        new(content)
      end

      # Transform the css query into an xpath query
      def self.css_query_to_xpath(query : String) : String
        query = "//#{query}"
        # Convert '#id_name' to '[@id="id_name"]'
        query = query.gsub /\#([A-z0-9]+-*_*)+/ { |m| "*[@id=\"%s\"]" % m.delete('#') }
        # Convert '.classname' to '[@class="classname"]'
        query = query.gsub /\.([A-z0-9]+-*_*)+/ { |m| "[@class=\"%s\"]" % m.delete('.') }
        # Convert ' > ' to '/'
        query = query.gsub /\s*>\s*/ { |m| "/" }
        # Convert ' ' to '//'
        query = query.gsub " ", "//"
        # a leading '*' when xpath does not include node name
        query = query.gsub /\/\[/ { |m| "/*[" }
        return query
      end

      # Find first tag by tag name and return
      # `HTML::Tag` if found or `nil` if not found
      def at_tag(tag_name : String) : Tag | Nil
        if tags = @tags[tag_name]?
          tags.each do |tag|
            return tag
          end
        end
        return nil
      end

      # Find all nodes by tag name and yield
      # each `HTML::Tag` found
      def where_tag(tag_name : String, &block) : Array(Tag)
        arr = [] of Tag
        if tags = @tags[tag_name]?
          tags.each do |tag|
            yield tag
            arr << tag
          end
        end
        return arr
      end

      # Find all nodes by class name and yield
      # each `HTML::Tag` found
      def where_class(class_name : String, &block) : Array(Tag)
        arr = [] of Tag
        if klasses = @classes[class_name]?
          klasses.each do |node|
            klass = Tag.new(node)
            yield klass
            arr << klass
          end
        end
        return arr
      end

      # Find a node by its id and return a
      # `HTML::Tag` if found or `nil` if not found
      def at_id(id_name : String) : Tag | Nil
        if node = @ids[id_name]?
          return Tag.new(node)
        end
      end

      # Find all nodes corresponding to the css query and yield
      # each `HTML::Tag` found
      def css(query : String) : Array(Tag)
        query = HTML.css_query_to_xpath(query)
        return @document.xpath_nodes(query).map { |node|
          tag = Tag.new(node)
          yield tag
          tag
        }
      end

      # Find first node corresponding to the css query and return
      # `HTML::Tag` if found or `nil` if not found
      def at_css(query : String)
        css(query) { |tag| return tag }
        return nil
      end

      private def add_id(id : String, node : XML::Node)
        @ids[id] = node
      end

      private def add_node(node : XML::Node)
        if @tags[node.name]? == nil
          @tags[node.name] = [] of Tag
        end
        @tags[node.name] << Tag.new(node)
      end

      private def add_class(klass : String, node : XML::Node)
        if @classes[klass]? == nil
          @classes[klass] = [] of XML::Node
        end
        @classes[klass] << node
      end

      # Depth-first visit. Given a node, extract metadata from
      # node (if exists), then visit each child.
      private def visit(node : XML::Node)
        # We only extract metadata from HTML nodes
        if node.element?
          add_node node
          if to = node["id"]?
            add_id to, node
          end
          if classes = node["class"]?
            classes.split(' ') { |to| add_class to, node }
          end
        end
        # visit each child
        node.children.each do |child|
          visit child
        end
      end

      # Represents an HTML Tag
      struct Tag
        getter node : XML::Node

        forward_missing_to @node

        def initialize(@node : XML::Node)
        end

        def classname : String | Nil
          return @node["class"]? ? @node["class"] : nil
        end

        def tagname : String
          return @node.name
        end

        def content : String
          return @node.text != nil ? @node.text.as(String) : "".as(String)
        end

        def parent : Tag | Nil
          if parent = @node.parent
            return Tag.new parent
          end
          nil
        end

        def children : Array(Tag)
          children = [] of Tag
          @node.children.each do |node|
            if node.element?
              children << Tag.new node
            end
          end
          children
        end

        def has_class?(klass : String) : Bool
          if classes = classname
            return classes.includes?(klass)
          end
          false
        end
      end
    end
  end
end
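`#initialize` walks the parsed tree once and indexes nodes by id, tag, and class, so the lookup helpers above are hash lookups rather than repeated traversals. A short sketch (the markup is illustrative):

```crystal
html = "<div id=\"main\"><p class=\"lead\">Hello</p><p>World</p></div>"

doc = Arachnid::Document::HTML.parse(html)
doc.at_id("main")                                  # Tag for the <div>
doc.where_tag("p") { |tag| puts tag.content }      # prints both paragraphs
doc.where_class("lead") { |tag| puts tag.content } # prints "Hello"
```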
src/arachnid/extensions/uri.cr
@@ -0,0 +1,175 @@
require "uri"
require "string_scanner"

class URI
  #
  # Expands a URI decoded path, into a proper absolute path.
  #
  # @param [String] path
  #   The path from a URI.
  #
  # @return [String]
  #   The expanded path.
  #
  # @example
  #   URI.expand_path("./path")
  #   # => "path"
  #
  # @example
  #   URI.expand_path("test/../path")
  #   # => "path"
  #
  # @example
  #   URI.expand_path("/test/path/")
  #   # => "/test/path/"
  #
  # @example
  #   URI.expand_path("/test/../path")
  #   # => "/path"
  #
  def self.expand_path(path)
    if path.starts_with?("/")
      leading_slash, path = path[0, 1], path[1..-1]
    else
      leading_slash = ""
    end

    if path.ends_with?("/")
      trailing_slash, path = path[-1, 1], path[0..-2]
    else
      trailing_slash = ""
    end

    scanner = StringScanner.new(path)
    stack = [] of String

    until scanner.eos?
      if (dir = scanner.scan(/[^\/]+/))
        case dir
        when ".." then stack.pop?
        when "."  then false
        else           stack.push(dir)
        end
      else
        scanner.skip(/\/+/)
      end
    end

    unless stack.empty?
      "#{leading_slash}#{stack.join("/")}#{trailing_slash}"
    else
      ""
    end
  end

  def split_path(path)
    path.split("/")
  end

  def merge_path(base, rel)
    # RFC2396, Section 5.2, 5)
    # RFC2396, Section 5.2, 6)
    base_path = split_path(base)
    rel_path = split_path(rel)

    # RFC2396, Section 5.2, 6), a)
    base_path << "" if base_path.last == ".."
    while i = base_path.index("..")
      base_path.delete_at(i - 1, 2)
    end

    if (first = rel_path.first) && first.empty?
      base_path.clear
      rel_path.shift
    end

    # RFC2396, Section 5.2, 6), c)
    # RFC2396, Section 5.2, 6), d)
    rel_path.push("") if rel_path.last == "." || rel_path.last == ".."
    rel_path.delete(".")

    # RFC2396, Section 5.2, 6), e)
    tmp = [] of String
    rel_path.each do |x|
      if x == ".." &&
         !(tmp.empty? || tmp.last == "..")
        tmp.pop
      else
        tmp << x
      end
    end

    add_trailer_slash = !tmp.empty?
    if base_path.empty?
      base_path = [""] # keep '/' for root directory
    elsif add_trailer_slash
      base_path.pop
    end
    while x = tmp.shift?
      if x == ".."
        # RFC2396, Section 4
        # a .. or . in an absolute path has no special meaning
        base_path.pop if base_path.size > 1
      else
        # if x == ".."
        #   valid absolute (but abnormal) path "/../..."
        # else
        #   valid absolute path
        # end
        base_path << x
        tmp.each { |t| base_path << t }
        add_trailer_slash = false
        break
      end
    end
    base_path.push("") if add_trailer_slash

    return base_path.join('/')
  end

  def merge(oth)
    oth = URI.parse(oth) unless oth.is_a?(URI)

    if oth.absolute?
      # raise BadURIError, "both URI are absolute" if absolute?
      # hmm... should return oth for usability?
      return oth
    end

    unless self.absolute?
      raise URI::Error.new("both URIs are relative")
    end

    base = self.dup

    authority = oth.userinfo || oth.host || oth.port

    # RFC2396, Section 5.2, 2)
    if (oth.path.nil? || oth.path.empty?) && !authority && !oth.query
      base.fragment = oth.fragment if oth.fragment
      return base
    end

    base.query = nil
    base.fragment = nil

    # RFC2396, Section 5.2, 4)
    if !authority
      base.path = merge_path(base.path, oth.path) if base.path && oth.path
    else
      # RFC2396, Section 5.2, 4)
      base.path = oth.path if oth.path
    end

    # RFC2396, Section 5.2, 7)
    base.user = oth.userinfo if oth.userinfo
    base.host = oth.host if oth.host
    base.port = oth.port if oth.port
    base.query = oth.query if oth.query
    base.fragment = oth.fragment if oth.fragment

    return base
  end
end
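With the monkey-patched `#merge`, a relative reference resolves against an absolute base roughly per RFC 2396. A sketch (URLs illustrative):

```crystal
require "arachnid" # pulls in the URI extension

base = URI.parse("https://example.com/docs/guide/index.html")
base.merge("../images/logo.png").to_s
# => "https://example.com/docs/images/logo.png"
base.merge("#install").to_s
# => "https://example.com/docs/guide/index.html#install"
```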
@ -0,0 +1,97 @@
|
|||
require "uri"
|
||||
require "halite"
|
||||
|
||||
require "./page/content_types"
|
||||
require "./page/cookies"
|
||||
require "./page/html"
|
||||
require "./page/status_codes"
|
||||
|
||||
require "./document/html"
|
||||
|
||||
module Arachnid
|
||||
# Represents a page requested from a website
|
||||
class Page
|
||||
include Page::ContentTypes
|
||||
include Page::Cookies
|
||||
include Page::HTML
|
||||
include Page::StatusCodes
|
||||
|
||||
# URL of the page
|
||||
getter url : URI
|
||||
|
||||
# HTTP response
|
||||
getter response : Halite::Response
|
||||
|
||||
# Headers returned with the body
|
||||
getter headers : HTTP::Headers
|
||||
|
||||
@doc : (Document::HTML | XML::Node)?
|
||||
|
||||
delegate xpath, xpath_node, xpath_nodes, xpath_bool, xpath_float, xpath_string,
|
||||
root, at_tag, where_tag, where_class, at_id, css, at_css, to: @doc
|
||||
|
||||
forward_missing_to @headers
|
||||
|
||||
# Creates a new `Page` object.
|
||||
def initialize(url : URI, response : Halite::Response)
|
||||
@url = url
|
||||
@response = response
|
||||
@headers = response.headers
|
||||
end
|
||||
|
||||
# The body of the response
|
||||
def body
|
||||
@response.body || ""
|
||||
end
|
||||
|
||||
# Returns a parsed document for HTML, XML, RSS, and Atom pages.
|
||||
def doc
|
||||
unless body.empty?
|
||||
doc_class = if html?
|
||||
Document::HTML
|
||||
elsif rss? || atom? || xml? || xsl?
|
||||
XML
|
||||
end
|
||||
|
||||
if doc_class
|
||||
begin
|
||||
@doc ||= doc_class.parse(body)
|
||||
rescue
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Searches the document for XPath or CSS paths
|
||||
def search(path)
|
||||
if document = doc
|
||||
document.xpath_nodes(path)
|
||||
else
|
||||
[] of XML::Node
|
||||
end
|
||||
end
|
||||
|
||||
# Searches for the first occurrence of an XPath or CSS path
|
||||
def at(path)
|
||||
if document = doc
|
||||
document.xpath_node(path)
|
||||
end
|
||||
end
|
||||
|
||||
def /(path)
|
||||
search(path)
|
||||
end
|
||||
|
||||
def %(path)
|
||||
at(path)
|
||||
end
|
||||
|
||||
def size
|
||||
@response.body.bytesize
|
||||
end
|
||||
|
||||
def to_s(io : IO)
|
||||
io << body
|
||||
end
|
||||
end
|
||||
end
|
|
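A minimal usage sketch for `Page` (the URL is illustrative, and the request naturally needs network access):

```crystal
require "halite"

url = URI.parse("https://crystal-lang.org/")
page = Arachnid::Page.new(url, Halite.get(url.to_s))

puts page.content_type # e.g. "text/html; charset=utf-8"

# XPath helpers delegate to the lazily parsed document
page.search("//a/@href").each { |attr| puts attr.content }

if title = page % "//title" # shorthand for page.at("//title")
  puts title.content
end
```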
@ -0,0 +1,162 @@
|
|||
module Arachnid
|
||||
class Page
|
||||
module ContentTypes
|
||||
# The Content-Type of the page.
|
||||
def content_type
|
||||
@response.content_type || ""
|
||||
end
|
||||
|
||||
# The content types of the page.
|
||||
def content_types
|
||||
@response.headers.get?("content-type") || [] of String
|
||||
end
|
||||
|
||||
# The charset included in the Content-Type.
|
||||
def content_charset
|
||||
content_types.each do |value|
|
||||
if value.includes?(";")
|
||||
value.split(";").each do |param|
|
||||
param = param.strip
|
||||
|
||||
if param.starts_with?("charset=")
|
||||
return param.split("=", 2).last
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
return nil
|
||||
end
|
||||
|
||||
# Determines if any of the content-types of the page include a given
|
||||
# type.
|
||||
def is_content_type?(type : String | Regex)
|
||||
content_types.any? do |value|
|
||||
value = value.split(";", 2).first
|
||||
|
||||
if type.is_a?(Regex)
|
||||
value =~ type
|
||||
else
|
||||
value == type
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Determines if the page is plain-text.
|
||||
def plain_text?
|
||||
is_content_type?("text/plain")
|
||||
end
|
||||
|
||||
# ditto
|
||||
def text?
|
||||
plain_text?
|
||||
end
|
||||
|
||||
# Determines if the page is a Directory Listing.
|
||||
def directory?
|
||||
is_content_type?("text/directory")
|
||||
end
|
||||
|
||||
# Determines if the page is an HTML document.
|
||||
def html?
|
||||
is_content_type?("text/html")
|
||||
end
|
||||
|
||||
# Determines if the page is an XML document.
|
||||
def xml?
|
||||
is_content_type?(/(text|application)\/xml/)
|
||||
end
|
||||
|
||||
# Determines if the page is an XML stylesheet (XSL).
|
||||
def xsl?
|
||||
is_content_type?("text/xsl")
|
||||
end
|
||||
|
||||
# Determines if the page is JavaScript.
|
||||
def javascript?
|
||||
is_content_type?(/(text|application)\/javascript/)
|
||||
end
|
||||
|
||||
# Determines if the page is JSON.
|
||||
def json?
|
||||
is_content_type?("application/json")
|
||||
end
|
||||
|
||||
# Determines if the page is a CSS stylesheet.
|
||||
def css?
|
||||
is_content_type?("text/css")
|
||||
end
|
||||
|
||||
# Determines if the page is an RSS feed.
|
||||
def rss?
|
||||
is_content_type?(/application\/(rss\+xml|rdf\+xml)/)
|
||||
end
|
||||
|
||||
# Determines if the page is an Atom feed.
|
||||
def atom?
|
||||
is_content_type?("application/atom+xml")
|
||||
end
|
||||
|
||||
# Determines if the page is a MS Word document.
|
||||
def ms_word?
|
||||
is_content_type?("application/msword")
|
||||
end
|
||||
|
||||
# Determines if the page is a PDF document.
|
||||
def pdf?
|
||||
is_content_type?("application/pdf")
|
||||
end
|
||||
|
||||
# Determines if the page is a ZIP archive.
|
||||
def zip?
|
||||
is_content_type?("application/zip")
|
||||
end
|
||||
|
||||
# Determines if the page is an image.
|
||||
def image?
|
||||
is_content_type?(/image\//)
|
||||
end
|
||||
|
||||
def png?
|
||||
is_content_type?("image/png")
|
||||
end
|
||||
|
||||
def gif?
|
||||
is_content_type?("image/gif")
|
||||
end
|
||||
|
||||
def jpg?
|
||||
is_content_type?(/image\/(jpg|jpeg)/)
|
||||
end
|
||||
|
||||
def svg?
|
||||
is_content_type?(/image\/svg(\+xml)?/)
|
||||
end
|
||||
|
||||
def video?
|
||||
is_content_type?(/video\/.*/)
|
||||
end
|
||||
|
||||
def mp4?
|
||||
is_content_type?("video/mp4")
|
||||
end
|
||||
|
||||
def avi?
|
||||
is_content_type?("video/x-msvideo")
|
||||
end
|
||||
|
||||
def wmv?
|
||||
is_content_type?("video/x-ms-wmv")
|
||||
end
|
||||
|
||||
def quicktime?
|
||||
is_content_type?("video/quicktime")
|
||||
end
|
||||
|
||||
def flash?
|
||||
is_content_type?("video/flash") ||
|
||||
is_content_type?("application/x-shockwave-flash")
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
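For reference, a sketch of how these predicates behave on a fetched `page` like the one above (header value hypothetical):

```crystal
# Given "Content-Type: text/html; charset=utf-8"
page.html?                        # => true
page.content_charset              # => "utf-8"
page.is_content_type?(/text\/.*/) # => true
page.pdf?                         # => false
```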
@ -0,0 +1,18 @@
|
|||
module Arachnid
|
||||
class Page
|
||||
module Cookies
|
||||
# Reserved names used within Cookie strings
|
||||
RESERVED_COOKIE_NAMES = Regex.new("^(?:Path|Expires|Domain|Secure|HTTPOnly)$", :ignore_case)
|
||||
|
||||
# The raw Cookie String sent along with the page.
|
||||
def cookie
|
||||
@response.headers["Set-Cookie"]? || ""
|
||||
end
|
||||
|
||||
# The Cookie values sent along with the page.
|
||||
def cookies
|
||||
@response.cookies
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
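A short sketch of reading cookies off a fetched `page` (names as above):

```crystal
page.cookie # the raw Set-Cookie header, or "" when absent

page.cookies.each do |cookie|
  puts "#{cookie.name}=#{cookie.value}"
end
```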
@ -0,0 +1,204 @@
|
|||
require "../extensions/uri"
|
||||
|
||||
module Arachnid
|
||||
class Page
|
||||
# TODO: Create enumerable methods for the methods that take a block
|
||||
module HTML
|
||||
# include Enumerable
|
||||
|
||||
# The title of the HTML page.
|
||||
def title
|
||||
if (node = at("//title"))
|
||||
node.inner_text
|
||||
end
|
||||
end
|
||||
|
||||
# Enumerates over the meta-redirect links in the page.
|
||||
def each_meta_redirect(&block : URI ->)
|
||||
if (html? && doc)
|
||||
search("//meta[@http-equiv and @content]").each do |node|
|
||||
if node["http-equiv"] =~ /refresh/i
|
||||
content = node["content"]
|
||||
|
||||
if (redirect = content.match(/url=(\S+)$/))
|
||||
yield URI.parse(redirect[1])
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Returns a boolean indicating whether or not page-level meta
|
||||
# redirects are present in this page.
|
||||
def meta_redirect?
|
||||
!meta_redirects.empty?
|
||||
end
|
||||
|
||||
# The meta-redirect links of the page.
|
||||
def meta_redirects
|
||||
redirects = [] of URI
|
||||
each_meta_redirect { |r| redirects << r }
|
||||
redirects
|
||||
end
|
||||
|
||||
# Enumerates over every HTTP or meta-redirect link in the page.
|
||||
def each_redirect(&block : URI ->)
|
||||
if (locations = @response.headers.get?("Location"))
|
||||
# Location headers override any meta-refresh redirects in the HTML
|
||||
locations.each { |l| yield URI.parse(l) }
|
||||
else
|
||||
# check page-level meta redirects if there isn't a location header
|
||||
each_meta_redirect(&block)
|
||||
end
|
||||
end
|
||||
|
||||
# URLs that this document redirects to.
|
||||
def redirects_to
|
||||
redirects = [] of URI
each_redirect { |r| redirects << r }
redirects
|
||||
end
|
||||
|
||||
# Enumerates over every `mailto:` link in the page.
|
||||
def each_mailto(&block)
|
||||
if html? && (d = doc)
|
||||
doc.xpath_nodes("//a[starts-with(@href,'mailto:')]").each do |a|
|
||||
yield a["href"][7..-1]
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# `mailto:` links in the page.
|
||||
def mailtos
|
||||
mailtos = [] of String
each_mailto { |m| mailtos << m }
mailtos
|
||||
end
|
||||
|
||||
# Enumerates over every link in the page.
|
||||
def each_link(&block : URI ->)
|
||||
each_redirect(&block) if redirect?
|
||||
|
||||
each_image(&block)
|
||||
|
||||
each_script(&block)
|
||||
|
||||
each_resource(&block)
|
||||
|
||||
if html? && (d = doc)
|
||||
d.xpath_nodes("//a[@href]").each do |a|
|
||||
link = to_absolute(a["href"])
|
||||
yield link if link
|
||||
end
|
||||
|
||||
d.xpath_nodes("//frame[@src]").each do |iframe|
|
||||
link = to_absolute(iframe["src"])
|
||||
yield link if link
|
||||
end
|
||||
|
||||
d.xpath_nodes("//iframe[@src]").each do |iframe|
|
||||
link = to_absolute(iframe["src"])
|
||||
yield link if link
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def each_script(&block : URI ->)
|
||||
if html? && (d = doc)
|
||||
d.xpath_nodes("//script[@src]").each do |script|
|
||||
url = to_absolute(script["src"])
|
||||
yield url if url
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def each_resource(&block : URI ->)
|
||||
if html? && (d = doc)
|
||||
d.xpath_nodes("//link[@href]").each do |link|
|
||||
yield URI.parse(link["href"])
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def each_image(&block : URI ->)
|
||||
if html? && (d = doc)
|
||||
d.xpath_nodes("//img[@src]").each do |img|
|
||||
url = to_absolute(img["src"])
|
||||
yield url if url
|
||||
end
|
||||
|
||||
d.xpath_nodes("//img[@srcset]").each do |set|
|
||||
sources = set["srcset"].split(" ").map_with_index { |e, i| (i.zero? || i.even?) ? e : nil }.compact
|
||||
sources.each do |source|
|
||||
url = to_absolute(source)
|
||||
yield url if url
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def each_video(&block : URI ->)
|
||||
if html? && (d = doc)
|
||||
d.xpath_nodes("//video[@src]").each do |video|
|
||||
url = to_absolute(video["src"])
|
||||
yield url if url
|
||||
end
|
||||
|
||||
d.xpath_nodes("//video/source[@src]").each do |source|
|
||||
url = to_absolute(source["src"])
|
||||
yield url if url
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# The links from within the page.
|
||||
def links
|
||||
links = [] of URI
|
||||
each_link { |link| links << link }
|
||||
links
|
||||
end
|
||||
|
||||
# Enumerates over every URL in the page.
|
||||
def each_url(&block : URI ->)
|
||||
each_link(&block) do |link|
|
||||
if (url = to_absolute(link))
|
||||
yield url
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# ditto
|
||||
def each(&block)
|
||||
each_url { |url| yield url }
|
||||
end
|
||||
|
||||
# Absolute URIs from within the page.
|
||||
def urls
|
||||
urls = [] of URI
|
||||
each_url { |url| urls << url }
|
||||
urls
|
||||
end
|
||||
|
||||
# Normalizes and expands a given link into a proper URI.
|
||||
def to_absolute(link)
|
||||
link = link.is_a?(URI) ? link : URI.parse(link)
|
||||
|
||||
new_url = begin
|
||||
url.merge(link)
|
||||
rescue Exception
|
||||
return
|
||||
end
|
||||
|
||||
if (!new_url.opaque?) && (path = new_url.path)
|
||||
# ensure that FTP paths begin with a leading '/'
|
||||
if (new_url.scheme == "ftp" && !path.starts_with?("/"))
|
||||
path.insert(0, "/")
|
||||
end
|
||||
|
||||
# make sure the path does not contain any .. or . directories,
|
||||
# since URI#merge cannot normalize paths such as
|
||||
# "/stuff/../"
|
||||
new_url.path = URI.expand_path(path)
|
||||
end
|
||||
|
||||
return new_url
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
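Putting the enumerators together, a sketch of extracting and filtering links from a fetched `page`:

```crystal
# Anchors, (i)frames, images, scripts and <link> resources, absolutized
page.each_link do |uri|
  puts uri
end

# Or collect eagerly and filter to the current host
internal = page.links.select { |link| link.host == page.url.host }
```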
@ -0,0 +1,59 @@
|
|||
module Arachnid
|
||||
class Page
|
||||
module StatusCodes
|
||||
# The response code from the page.
|
||||
def code
|
||||
@response.status_code.to_i
|
||||
end
|
||||
|
||||
# Determines if the response code is `200`.
|
||||
def ok?
|
||||
code == 200
|
||||
end
|
||||
|
||||
# Determines if the response code is `308`.
|
||||
def timedout?
|
||||
code == 308
|
||||
end
|
||||
|
||||
# Determines if the response code is `400`.
|
||||
def bad_request?
|
||||
code == 400
|
||||
end
|
||||
|
||||
# Determines if the response code is `401`.
|
||||
def unauthorized?
|
||||
code == 401
|
||||
end
|
||||
|
||||
# Determines if the response code is `403`.
|
||||
def forbidden?
|
||||
code == 403
|
||||
end
|
||||
|
||||
# Determines if the response code is `404`.
|
||||
def missing?
|
||||
code == 404
|
||||
end
|
||||
|
||||
# Determines if the response code is `500`.
|
||||
def had_internal_server_error?
|
||||
code == 500
|
||||
end
|
||||
|
||||
# Determines if the response code is `300`, `301`, `302`, `303`
|
||||
# or `307`. Also checks for "soft" redirects added at the page
|
||||
# level by a meta refresh tag.
|
||||
def redirect?
|
||||
case code
|
||||
when 300..303, 307
|
||||
true
|
||||
when 200
|
||||
meta_redirect?
|
||||
else
|
||||
false
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
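A sketch of branching on these predicates while crawling:

```crystal
if page.redirect?
  puts "#{page.url} redirected (#{page.code})"
elsif page.missing?
  puts "#{page.url} returned a 404"
elsif page.ok?
  puts page.title
end
```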
@ -0,0 +1,231 @@
|
|||
require "uri"
|
||||
|
||||
module Arachnid
|
||||
# Parses robots.txt files for the perusal of a single user-agent.
|
||||
#
|
||||
# The behaviour implemented is guided by the following sources, though
|
||||
# as there is no widely accepted standard, it may differ from other implementations.
|
||||
# If you consider its behaviour to be in error, please contact the author.
|
||||
#
|
||||
# http://www.robotstxt.org/orig.html
|
||||
# - the original, now imprecise and outdated version
|
||||
# http://www.robotstxt.org/norobots-rfc.txt
|
||||
# - a much more precise, but still outdated, version
|
||||
# http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449&from=35237
|
||||
# - a few hints at modern protocol extensions.
|
||||
#
|
||||
# This parser only considers lines starting with (case-insensitively:)
|
||||
# Useragent: User-agent: Allow: Disallow: Sitemap:
|
||||
#
|
||||
# The file is divided into sections, each of which contains one or more User-agent:
|
||||
# lines, followed by one or more Allow: or Disallow: rules.
|
||||
#
|
||||
# The first section that contains a User-agent: line that matches the robot's
|
||||
# user-agent is the only section relevant to that robot. The sections are checked
|
||||
# in the same order as they appear in the file.
|
||||
#
|
||||
# (The * character is taken to mean "any number of any characters" during matching of
|
||||
# user-agents)
|
||||
#
|
||||
# Within that section, the first Allow: or Disallow: rule that matches the expression
|
||||
# is taken as authoritative. If no rule in a section matches, the access is Allowed.
|
||||
#
|
||||
# (The order of matching is as in the RFC, Google matches all Allows and then all Disallows,
|
||||
# while Bing matches the most specific rule; I'm sure there are other interpretations)
|
||||
#
|
||||
# When matching urls, all % encodings are normalised (except for /?=& which have meaning)
|
||||
# and "*"s match any number of any character.
|
||||
#
|
||||
# If a pattern ends with a $, then the pattern must match the entire path, or the entire
|
||||
# path with query string.
|
||||
#
|
||||
# TODO: Rework to allow for multiple Robots
|
||||
class Robots
|
||||
alias Rule = Tuple(String, Bool)
|
||||
alias RuleSet = Tuple(String, Array(Rule))
|
||||
|
||||
getter body : String
|
||||
|
||||
getter user_agent : String
|
||||
|
||||
getter rules : Array(RuleSet)
|
||||
|
||||
getter sitemaps : Array(String)
|
||||
|
||||
def initialize(@body : String, @user_agent : String)
|
||||
@sitemaps = [] of String
|
||||
@rules = [] of RuleSet
|
||||
parse(@body)
|
||||
end
|
||||
|
||||
# Given a URI object, or a string representing one, determine whether this
|
||||
# robots.txt would allow access to the path.
|
||||
def allowed?(uri)
|
||||
uri = URI.parse(uri) unless uri.is_a?(URI)
|
||||
path = (uri.path || "/") + (uri.query ? "?" + uri.query.to_s : "")
|
||||
path_allowed?(@user_agent, path)
|
||||
end
|
||||
|
||||
# Check whether the relative path (a string of the url's path and query
|
||||
# string) is allowed by the rules we have for the given user_agent.
|
||||
#
|
||||
private def path_allowed?(user_agent, path)
|
||||
@rules.each do |(ua_glob, path_globs)|
|
||||
if match_ua_glob user_agent, ua_glob
|
||||
path_globs.each do |(path_glob, allowed)|
|
||||
return allowed if match_path_glob path, path_glob
|
||||
end
|
||||
return true
|
||||
end
|
||||
end
|
||||
true
|
||||
end
|
||||
|
||||
# This does a case-insensitive substring match such that if the user agent
|
||||
# is contained within the glob, or vice-versa, we will match.
|
||||
#
|
||||
# According to the standard, *s shouldn't appear in the user-agent field
|
||||
# except in the case of "*" meaning all user agents. Google however imply
|
||||
# that the * will work, at least at the end of a string.
|
||||
#
|
||||
# For consistency, and because it seems expected behaviour, and because
|
||||
# a glob * will match a literal * we use glob matching not string matching.
|
||||
#
|
||||
# The standard also advocates a substring match of the robot's user-agent
|
||||
# within the user-agent field. From observation, it seems much more likely
|
||||
# that the match will be the other way about, though we check for both.
|
||||
#
|
||||
private def match_ua_glob(user_agent, glob)
|
||||
glob =~ Regex.new(Regex.escape(user_agent), Regex::Options::IGNORE_CASE) ||
|
||||
user_agent =~ Regex.new(reify(glob), Regex::Options::IGNORE_CASE)
|
||||
end
|
||||
|
||||
# This does case-sensitive prefix matching, such that if the path starts
|
||||
# with the glob, we will match.
|
||||
#
|
||||
# According to the standard, that's it. However, it seems reasonably common
|
||||
# for asterisks to be interpreted as though they were globs.
|
||||
#
|
||||
# Additionally, some search engines, like Google, will treat a trailing $
|
||||
# sign as forcing the glob to match the entire path - whether including
|
||||
# or excluding the query string is not clear, so we check both.
|
||||
#
|
||||
# (i.e. it seems likely that a site owner who has Disallow: *.pdf$ expects
|
||||
# to disallow requests to *.pdf?i_can_haz_pdf, which the robot could, if
|
||||
# it were feeling malicious, construe.)
|
||||
#
|
||||
# With URLs there is the additional complication that %-encoding can give
|
||||
# multiple representations for identical URLs, this is handled by
|
||||
# normalize_percent_encoding.
|
||||
#
|
||||
private def match_path_glob(path, glob)
|
||||
if glob =~ /\$$/
|
||||
end_marker = "(?:\?|$)"
|
||||
glob = glob.gsub(/\$$/, "")
|
||||
else
|
||||
end_marker = ""
|
||||
end
|
||||
|
||||
glob = normalize_percent_encoding(glob)
|
||||
path = normalize_percent_encoding(path)
|
||||
|
||||
path =~ Regex.new("^" + reify(glob) + end_marker)
|
||||
|
||||
rescue e
|
||||
false
|
||||
end
|
||||
|
||||
# As a general rule, we want to ignore different representations of the
|
||||
# same URL. Naively we could just unescape, or escape, everything, however
|
||||
# the standard implies that a / is a HTTP path separator, while a %2F is an
|
||||
# encoded / that does not act as a path separator. Similar issues with ?, &
|
||||
# and =, though all other characters are fine. (While : also has a special
|
||||
# meaning in HTTP, most implementations ignore this in the path)
|
||||
#
|
||||
# It's also worth noting that %-encoding is case-insensitive, so we
|
||||
# explicitly upcase the few that we want to keep.
|
||||
#
|
||||
private def normalize_percent_encoding(path)
|
||||
# First double-escape any characters we don't want to unescape
|
||||
# & / = ?
|
||||
path = path.gsub(/%(26|2F|3D|3F)/i) do |code|
|
||||
"%25#{code.upcase}"
|
||||
end
|
||||
|
||||
URI.unescape(path)
|
||||
end
|
||||
|
||||
# Convert the asterisks in a glob into (.*)s for regular expressions,
|
||||
# and at the same time, escape any other characters that would have
|
||||
# a significance in a regex.
|
||||
#
|
||||
private def reify(glob)
|
||||
glob.split("*").map { |part| Regex.escape(part) }.join(".*")
|
||||
end
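# For example, reify("/private/*.pdf") produces "/private/.*\.pdf".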
|
||||
|
||||
# Convert the @body into a set of @rules so that our parsing mechanism
|
||||
# becomes easier.
|
||||
#
|
||||
# @rules is an array of pairs. The first in the pair is the glob for the
|
||||
# user-agent and the second another array of pairs. The first of the new
|
||||
# pair is a glob for the path, and the second whether it appears in an
|
||||
# Allow: or a Disallow: rule.
|
||||
#
|
||||
# For example:
|
||||
#
|
||||
# User-agent: *
|
||||
# Disallow: /secret/
|
||||
# Allow: / # allow everything...
|
||||
#
|
||||
# Would be parsed so that:
|
||||
#
|
||||
# @rules = [["*", [ ["/secret/", false], ["/", true] ]]]
|
||||
#
|
||||
#
|
||||
# The order of the arrays is maintained so that the first match in the file
|
||||
# is obeyed as indicated by the pseudo-RFC on http://robotstxt.org/. There
|
||||
# are alternative interpretations: some parse by specificity of glob, and
|
||||
# some check Allow lines for any match before Disallow lines. All are
|
||||
# justifiable, but we could only pick one.
|
||||
#
|
||||
# Note that a blank Disallow: should be treated as an Allow: * and multiple
|
||||
# user-agents may share the same set of rules.
|
||||
#
|
||||
private def parse(body)
|
||||
parser_mode = :begin # tracks whether we are inside a block of User-agent lines

body.split(/[\r\n]+/).each do |line|
|
||||
parts = line.delete('\0').split(":", 2).map(&.strip)
next unless parts.size == 2 # skip blank, comment-only and malformed lines
prefix, value = parts
|
||||
value = value.sub(/\s+#.*/, "")
|
||||
|
||||
|
||||
if prefix && value
|
||||
case prefix.downcase
|
||||
when /^user-?agent$/
|
||||
if parser_mode == :user_agent
|
||||
@rules << {value, rules.last[1]}
|
||||
else
|
||||
parser_mode = :user_agent
|
||||
@rules << {value, [] of Rule}
|
||||
end
|
||||
when "disallow"
|
||||
parser_mode = :rules
|
||||
@rules << {"*", [] of Rule} if @rules.empty?
|
||||
|
||||
if value == ""
|
||||
@rules.last[1] << {"*", true}
|
||||
else
|
||||
@rules.last[1] << {value, false}
|
||||
end
|
||||
when "allow"
|
||||
parser_mode = :rules
|
||||
@rules << {"*", [] of Rule} if @rules.empty?
|
||||
@rules.last[1] << {value, true}
|
||||
when "sitemap"
|
||||
@sitemaps << value
|
||||
else
|
||||
# Ignore comments, Crawl-delay: and badly formed lines.
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
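A usage sketch mirroring the example in the comment above (body and URLs hypothetical):

```crystal
body = <<-ROBOTS
  User-agent: *
  Disallow: /secret/
  Allow: /
  Sitemap: https://example.com/sitemap.xml
  ROBOTS

robots = Arachnid::Robots.new(body, "Arachnid")

robots.allowed?("https://example.com/")         # => true
robots.allowed?("https://example.com/secret/x") # => false
robots.sitemaps                                 # => ["https://example.com/sitemap.xml"]
```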
@ -0,0 +1,53 @@
|
|||
module Arachnid
|
||||
# The `Rules` class represents collections of acceptance and rejection
|
||||
# rules, which are used to filter data.
|
||||
class Rules(T)
|
||||
# Accept rules
|
||||
getter accept : Array(Proc(T | Nil, Bool) | T | Regex | String)
|
||||
|
||||
# Reject rules
|
||||
getter reject : Array(Proc(T | Nil, Bool) | T | Regex | String)
|
||||
|
||||
# Creates a new `Rules` object.
|
||||
def initialize(accept : Array(Proc(T | Nil, Bool) | T | Regex | String)? = nil, reject : Array(Proc(T | Nil, Bool) | T | Regex | String)? = nil)
|
||||
@accept = accept ? accept : [] of Proc(T | Nil, Bool) | T | Regex | String
|
||||
@reject = reject ? reject : [] of Proc(T | Nil, Bool) | T | Regex | String
|
||||
end
|
||||
|
||||
# Determines whether the data should be accepted or rejected.
|
||||
def accept?(data : T)
|
||||
return true if accept.empty? && reject.empty?
|
||||
|
||||
if @accept.empty?
|
||||
!@reject.any? { |rule| test_data(data, rule) }
|
||||
else
|
||||
@accept.any? { |rule| test_data(data, rule) }
|
||||
end
|
||||
end
|
||||
|
||||
def accept=(value)
|
||||
@accept = value || [] of Proc(T | Nil, Bool) | T | Regex | String
|
||||
end
|
||||
|
||||
# Determines whether the data should be rejected or accepted.
|
||||
def reject?(data : T)
|
||||
!accept?(data)
|
||||
end
|
||||
|
||||
def reject=(value)
|
||||
@reject = value || [] of Proc(T | Nil, Bool) | T | Regex | String
|
||||
end
|
||||
|
||||
# Tests the given data against a pattern.
|
||||
private def test_data(data : T, rule)
|
||||
case rule
|
||||
when Proc
|
||||
rule.call(data) == true
|
||||
when Regex
|
||||
!((data.to_s =~ rule).nil?)
|
||||
else
|
||||
data == rule
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
|
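A sketch of the accept/reject semantics, using hostnames as the filtered type:

```crystal
rules = Arachnid::Rules(String).new

rules.accept << "crystal-lang.org"
rules.accept << /.+\.crystal-lang\.org/

rules.accept?("crystal-lang.org")       # => true (exact rule)
rules.accept?("forum.crystal-lang.org") # => true (regex rule)
rules.reject?("example.com")            # => true (no accept rule matched)
```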
@ -0,0 +1,112 @@
|
|||
require "uri"
|
||||
require "halite"
|
||||
|
||||
module Arachnid
|
||||
# Stores active HTTP Sessions organized by scheme, host-name and port.
|
||||
class SessionCache
|
||||
|
||||
# Optional read timeout.
|
||||
property read_timeout : Int32
|
||||
|
||||
# Optional connect timeout.
|
||||
property connect_timeout : Int32
|
||||
|
||||
# Max redirects to follow.
|
||||
property max_redirects : Int32?
|
||||
|
||||
# Should we set a DNT (Do Not Track) header?
|
||||
property? do_not_track : Bool
|
||||
|
||||
@sessions = {} of Tuple(String?, String?, Int32?) => Halite::Client
|
||||
|
||||
# Create a new session cache
|
||||
def initialize(
|
||||
read_timeout : Int32? = nil,
|
||||
connect_timeout : Int32? = nil,
|
||||
follow_redirects : Bool? = nil,
|
||||
max_redirects : Int32? = nil,
|
||||
do_not_track : Bool? = nil
|
||||
)
|
||||
@read_timeout = read_timeout || Arachnid.read_timeout
|
||||
@connect_timeout = connect_timeout || Arachnid.connect_timeout
|
||||
@max_redirects = max_redirects || Arachnid.max_redirects
|
||||
@do_not_track = do_not_track || Arachnid.do_not_track?
|
||||
end
|
||||
|
||||
# Determines if there is an active session for the given URL
|
||||
def active?(url)
|
||||
# normalize the url
|
||||
url = URI.parse(url) unless url.is_a?(URI)
|
||||
|
||||
# session key
|
||||
key = key_for(url)
|
||||
|
||||
@sessions.has_key?(key)
|
||||
end
|
||||
|
||||
# Provides an active session for a given URL.
|
||||
def [](url)
|
||||
# normalize the url
|
||||
url = URI.parse(url) unless url.is_a?(URI)
|
||||
|
||||
# session key
|
||||
key = key_for(url)
|
||||
|
||||
# normalize the endpoint
|
||||
endpoint = url.dup
|
||||
endpoint.scheme ||= "http"
|
||||
endpoint.query = nil
|
||||
endpoint.fragment = nil
|
||||
endpoint.path = ""
|
||||
|
||||
# Set headers
|
||||
headers = {
|
||||
"DNT" => @do_not_track ? 1 : 0
|
||||
}
|
||||
|
||||
unless @sessions.has_key?(key)
|
||||
session = Halite::Client.new(
|
||||
endpoint: endpoint,
|
||||
timeout: Halite::Timeout.new(
|
||||
connect: @connect_timeout,
|
||||
read: @read_timeout
|
||||
),
|
||||
follow: Halite::Follow.new(
|
||||
hops: @max_redirects,
|
||||
strict: false
|
||||
),
|
||||
headers: headers,
|
||||
)
|
||||
|
||||
# session = session.logging(skip_request_body: true, skip_response_body: true)
|
||||
|
||||
@sessions[key] = session
|
||||
end
|
||||
|
||||
@sessions[key]
|
||||
end
|
||||
|
||||
# Destroys an HTTP session for the given scheme, host, and port.
|
||||
def kill!(url)
|
||||
# normalize the url
|
||||
url = URI.parse(url) unless url.is_a?(URI)
|
||||
|
||||
# session key
|
||||
key = key_for(url)
|
||||
|
||||
if @sessions.has_key?(key)
|
||||
@sessions.delete(key)
|
||||
end
|
||||
end
|
||||
|
||||
# Clears the session cache
|
||||
def clear
|
||||
@sessions.clear
|
||||
end
|
||||
|
||||
# Creates a session key based on the URL
|
||||
private def key_for(url)
|
||||
{url.scheme, url.host, url.port}
|
||||
end
|
||||
end
|
||||
end
|
|
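A sketch of the cache in use; the options are passed explicitly here so the module-level `Arachnid.*` defaults (defined elsewhere in the shard) are not required:

```crystal
cache = Arachnid::SessionCache.new(
  read_timeout: 10,
  connect_timeout: 5,
  max_redirects: 3,
  do_not_track: true
)

client = cache["https://crystal-lang.org/docs"] # builds and caches a Halite client
cache.active?("https://crystal-lang.org/")      # => true, same {scheme, host, port}
cache.kill!("https://crystal-lang.org/")        # drops that client
cache.clear                                     # drops them all
```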
@ -0,0 +1,3 @@
|
|||
module Arachnid
|
||||
VERSION = "0.1.0"
|
||||
end
|