# Arachnid

*Commit `7fa5205096` (parent `42b4c9c739`): updated readme and other things.*
Arachnid is a fast and powerful web scraping framework for Crystal. It provides an easy-to-use DSL for scraping webpages and processing all of the things you might come across.
- [Arachnid](#Arachnid)
  - [Installation](#Installation)
  - [Examples](#Examples)
  - [Usage](#Usage)
    - [Configuration](#Configuration)
    - [Crawling](#Crawling)
      - [Arachnid#start_at(url, **options, &block : Agent ->)](#Arachnidstartaturl-options-block--Agent)
      - [Arachnid#site(url, **options, &block : Agent ->)](#Arachnidsiteurl-options-block--Agent)
      - [Arachnid#host(name, **options, &block : Agent ->)](#Arachnidhostname-options-block--Agent)
    - [Crawling Rules](#Crawling-Rules)
    - [Events](#Events)
      - [`every_url(&block : URI ->)`](#everyurlblock--URI)
      - [`every_failed_url(&block : URI ->)`](#everyfailedurlblock--URI)
      - [`every_url_like(pattern, &block : URI ->)`](#everyurllikepattern-block--URI)
      - [`urls_like(pattern, &block : URI ->)`](#urlslikepattern-block--URI)
      - [`all_headers(&block : HTTP::Headers)`](#allheadersblock--HTTPHeaders)
      - [`every_resource(&block : Resource ->)`](#everyresourceblock--Resource)
      - [`every_ok_page(&block : Resource ->)`](#everyokpageblock--Resource)
      - [`every_redirect_page(&block : Resource ->)`](#everyredirectpageblock--Resource)
      - [`every_timedout_page(&block : Resource ->)`](#everytimedoutpageblock--Resource)
      - [`every_bad_request_page(&block : Resource ->)`](#everybadrequestpageblock--Resource)
      - [`every_unauthorized_page(&block : Resource ->)`](#everyunauthorizedpageblock--Resource)
      - [`every_forbidden_page(&block : Resource ->)`](#everyforbiddenpageblock--Resource)
      - [`every_missing_page(&block : Resource ->)`](#everymissingpageblock--Resource)
      - [`every_internal_server_error_page(&block : Resource ->)`](#everyinternalservererrorpageblock--Resource)
      - [`every_txt_page(&block : Resource ->)`](#everytxtpageblock--Resource)
      - [`every_html_page(&block : Resource ->)`](#everyhtmlpageblock--Resource)
      - [`every_xml_page(&block : Resource ->)`](#everyxmlpageblock--Resource)
      - [`every_xsl_page(&block : Resource ->)`](#everyxslpageblock--Resource)
      - [`every_doc(&block : Document::HTML | XML::Node ->)`](#everydocblock--DocumentHTML--XMLNode)
      - [`every_html_doc(&block : Document::HTML | XML::Node ->)`](#everyhtmldocblock--DocumentHTML--XMLNode)
      - [`every_xml_doc(&block : XML::Node ->)`](#everyxmldocblock--XMLNode)
      - [`every_xsl_doc(&block : XML::Node ->)`](#everyxsldocblock--XMLNode)
      - [`every_rss_doc(&block : XML::Node ->)`](#everyrssdocblock--XMLNode)
      - [`every_atom_doc(&block : XML::Node ->)`](#everyatomdocblock--XMLNode)
      - [`every_javascript(&block : Resource ->)`](#everyjavascriptblock--Resource)
      - [`every_css(&block : Resource ->)`](#everycssblock--Resource)
      - [`every_rss(&block : Resource ->)`](#everyrssblock--Resource)
      - [`every_atom(&block : Resource ->)`](#everyatomblock--Resource)
      - [`every_ms_word(&block : Resource ->)`](#everymswordblock--Resource)
      - [`every_pdf(&block : Resource ->)`](#everypdfblock--Resource)
      - [`every_zip(&block : Resource ->)`](#everyzipblock--Resource)
      - [`every_image(&block : Resource ->)`](#everyimageblock--Resource)
      - [`every_content_type(content_type : String | Regex, &block : Resource ->)`](#everycontenttypecontenttype--String--Regex-block--Resource)
      - [`every_link(&block : URI, URI ->)`](#everylinkblock--URI-URI)
    - [Content Types](#Content-Types)
    - [Parsing HTML](#Parsing-HTML)
  - [Contributing](#Contributing)
  - [Contributors](#Contributors)
## Installation

1. Add the dependency to your `shard.yml`:

   ```yaml
   dependencies:
     arachnid:
       github: watzon/arachnid
       version: ~> 0.1.0
   ```

2. Run `shards install`
## Examples

Arachnid provides an easy-to-use, powerful DSL for scraping websites.

```crystal
Arachnid.start_at("https://crystal-lang.org") do |spider|
  # ... (example body elided in this diff view)
end
```

More documentation will be coming soon!
## Usage

### Configuration

Arachnid has a ton of configuration options which can be passed to the methods listed below in [Crawling](#crawling) and to the constructor for `Arachnid::Agent`. They are as follows:

- **read_timeout** - Read timeout
- **connect_timeout** - Connect timeout
- **max_redirects** - Maximum number of redirects to follow
- **do_not_track** - Sets the DNT header
- **default_headers** - Default HTTP headers to use for all hosts
- **host_header** - HTTP Host header to use
- **host_headers** - HTTP headers to use for specific hosts
- **user_agent** - Sets the user agent for the crawler
- **referer** - Referer to use
- **fetch_delay** - Delay between fetching resources
- **queue** - Preload the queue with URLs
- **history** - Links that should not be visited
- **limit** - Maximum number of resources to visit
- **max_depth** - Maximum crawl depth
- **filter_options** - Passed to [`initialize_filters`]()

There are also a few class properties on `Arachnid` itself which are used as the defaults unless overridden:

- **do_not_track**
- **max_redirects**
- **connect_timeout**
- **read_timeout**
- **user_agent**
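As a sketch of how these options might be wired up (the keyword-argument names mirror the option list above, but treat the exact signatures as assumptions rather than confirmed API):

```crystal
require "arachnid"

# Class-level defaults (from the property list above)
Arachnid.user_agent = "MyCrawler/1.0"
Arachnid.max_redirects = 3

# Per-agent options passed as named arguments to a crawl method
Arachnid.site("https://crystal-lang.org", max_depth: 2, fetch_delay: 1) do |spider|
  spider.every_url { |url| puts url }
end
```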
### Crawling

Arachnid provides three interfaces to use for crawling:

#### Arachnid#start_at(url, **options, &block : Agent ->)

`start_at` is what you want to use if you're going to be doing a full crawl of multiple sites. It doesn't filter any URLs by default and will scan every link it encounters.

#### Arachnid#site(url, **options, &block : Agent ->)

`site` constrains the crawl to a specific site. "Site" in this case is defined as all paths within a domain and its subdomains.

#### Arachnid#host(name, **options, &block : Agent ->)

`host` is similar to `site`, but stays within the domain, not crawling subdomains.

*Maybe `site` and `host` should be swapped? I don't know what is more intuitive.*
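The three entry points look like this in practice (a sketch based on the README's own `start_at` example; the `site` and `host` calls are assumed to be analogous):

```crystal
require "arachnid"

# Unrestricted crawl: follows every link it encounters
Arachnid.start_at("https://crystal-lang.org") do |spider|
  spider.every_url { |url| puts url }
end

# Constrained to crystal-lang.org and its subdomains
Arachnid.site("https://crystal-lang.org") do |spider|
  spider.every_url { |url| puts url }
end

# Constrained to exactly this host, no subdomains
Arachnid.host("crystal-lang.org") do |spider|
  spider.every_url { |url| puts url }
end
```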
### Crawling Rules

Arachnid has the concept of **filters** for the purpose of filtering URLs before visiting them. They are as follows:

- **schemes**
  - [visit_schemes_like(pattern : String | Regex)]()
  - [ignore_schemes_like(pattern : String | Regex)]()
- **hosts**
  - [visit_hosts_like(pattern : String | Regex)]()
  - [ignore_hosts_like(pattern : String | Regex)]()
- **ports**
  - [visit_ports_like(pattern : String | Regex)]()
  - [ignore_ports_like(pattern : String | Regex)]()
- **links**
  - [visit_links_like(pattern : String | Regex)]()
  - [ignore_links_like(pattern : String | Regex)]()
- **urls**
  - [visit_urls_like(pattern : String | Regex)]()
  - [ignore_urls_like(pattern : String | Regex)]()
- **exts**
  - [visit_exts_like(pattern : String | Regex)]()
  - [ignore_exts_like(pattern : String | Regex)]()

All of these methods can also take a block instead of a pattern, where the block returns true or false. The only difference between `links` and `urls` in this case is the block argument: `links` receives a `String` and `urls` a `URI`. Honestly I'll probably get rid of `links` soon and just make it `urls`.

`exts` looks at the extension, if it exists, and filters based on that.
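Using those filters, a crawl that skips images and only follows HTTPS links might look like this (the receiver for the filter methods and the block-form signature are assumptions based on the list above):

```crystal
require "arachnid"

Arachnid.site("https://crystal-lang.org") do |spider|
  # Pattern form: ignore URLs with common image extensions
  spider.ignore_exts_like(/^(jpg|jpeg|png|gif)$/)

  # Block form: return true to visit, false to skip
  spider.visit_urls_like { |uri| uri.scheme == "https" }

  spider.every_url { |url| puts url }
end
```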
### Events

Every crawled "page" is referred to as a resource, since sometimes it will be HTML/XML, sometimes JavaScript or CSS, and sometimes images, videos, zip files, etc. Every time a resource is scanned, one of several events is called. They are:
#### `every_url(&block : URI ->)`

Pass each URL from each resource visited to the given block.

#### `every_failed_url(&block : URI ->)`

Pass each URL that could not be requested to the given block.

#### `every_url_like(pattern, &block : URI ->)`

Pass every URL that the agent visits, and that matches a given pattern, to a given block.

#### `urls_like(pattern, &block : URI ->)`

Same as `every_url_like`.

#### `all_headers(&block : HTTP::Headers)`

Pass the headers from every response the agent receives to a given block.

#### `every_resource(&block : Resource ->)`

Pass every resource that the agent visits to a given block.

#### `every_ok_page(&block : Resource ->)`

Pass every OK resource that the agent visits to a given block.

#### `every_redirect_page(&block : Resource ->)`

Pass every Redirect resource that the agent visits to a given block.

#### `every_timedout_page(&block : Resource ->)`

Pass every Timeout resource that the agent visits to a given block.

#### `every_bad_request_page(&block : Resource ->)`

Pass every Bad Request resource that the agent visits to a given block.

#### `every_unauthorized_page(&block : Resource ->)`

Pass every Unauthorized resource that the agent visits to a given block.

#### `every_forbidden_page(&block : Resource ->)`

Pass every Forbidden resource that the agent visits to a given block.

#### `every_missing_page(&block : Resource ->)`

Pass every Missing resource that the agent visits to a given block.

#### `every_internal_server_error_page(&block : Resource ->)`

Pass every Internal Server Error resource that the agent visits to a given block.

#### `every_txt_page(&block : Resource ->)`

Pass every Plain Text resource that the agent visits to a given block.

#### `every_html_page(&block : Resource ->)`

Pass every HTML resource that the agent visits to a given block.

#### `every_xml_page(&block : Resource ->)`

Pass every XML resource that the agent visits to a given block.

#### `every_xsl_page(&block : Resource ->)`

Pass every XML Stylesheet (XSL) resource that the agent visits to a given block.

#### `every_doc(&block : Document::HTML | XML::Node ->)`

Pass every HTML or XML document that the agent parses to a given block.

#### `every_html_doc(&block : Document::HTML | XML::Node ->)`

Pass every HTML document that the agent parses to a given block.

#### `every_xml_doc(&block : XML::Node ->)`

Pass every XML document that the agent parses to a given block.

#### `every_xsl_doc(&block : XML::Node ->)`

Pass every XML Stylesheet (XSL) that the agent parses to a given block.

#### `every_rss_doc(&block : XML::Node ->)`

Pass every RSS document that the agent parses to a given block.

#### `every_atom_doc(&block : XML::Node ->)`

Pass every Atom document that the agent parses to a given block.

#### `every_javascript(&block : Resource ->)`

Pass every JavaScript resource that the agent visits to a given block.

#### `every_css(&block : Resource ->)`

Pass every CSS resource that the agent visits to a given block.

#### `every_rss(&block : Resource ->)`

Pass every RSS feed that the agent visits to a given block.

#### `every_atom(&block : Resource ->)`

Pass every Atom feed that the agent visits to a given block.

#### `every_ms_word(&block : Resource ->)`

Pass every MS Word resource that the agent visits to a given block.

#### `every_pdf(&block : Resource ->)`

Pass every PDF resource that the agent visits to a given block.

#### `every_zip(&block : Resource ->)`

Pass every ZIP resource that the agent visits to a given block.

#### `every_image(&block : Resource ->)`

Passes every image resource to the given block.

#### `every_content_type(content_type : String | Regex, &block : Resource ->)`

Passes every resource with a matching content type to the given block.

#### `every_link(&block : URI, URI ->)`

Passes every origin and destination URI of each link to a given block.
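Several of these events can be combined on one agent. A sketch using handler names from the list above (not tested against the library):

```crystal
require "arachnid"

Arachnid.site("https://crystal-lang.org") do |spider|
  # Fires for every successfully parsed HTML resource
  spider.every_html_page do |page|
    puts "Fetched an HTML resource"
  end

  # Fires for every URL that could not be requested
  spider.every_failed_url do |url|
    puts "Failed to fetch #{url}"
  end

  # Fires for every link, with its origin and destination URIs
  spider.every_link do |origin, destination|
    puts "#{origin} -> #{destination}"
  end
end
```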
### Content Types

Every resource has an associated content type, and the `Resource` class itself provides several easy methods to check it. You can find all of them [here]().
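For instance, `Resource#javascript?` (which appears in the source diff included in this commit) is one such predicate; the other content-type checks are assumed to follow the same pattern:

```crystal
require "arachnid"

Arachnid.site("https://crystal-lang.org") do |spider|
  spider.every_resource do |resource|
    # `javascript?` is visible in the source; other predicates assumed analogous
    puts "Got some JavaScript" if resource.javascript?
  end
end
```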
### Parsing HTML

Every HTML/XML resource has full access to the suite of methods provided by [Crystagiri](), allowing you to more easily search by CSS selector.

## Contributing
The commit also updates the Crystal source. The hunks below are reconstructed from the diff view (file names were not preserved; lines without a `-`/`+` marker are context, and two hunks drop a line that this view does not identify):

```diff
@@ -19,7 +19,7 @@ module Arachnid
   # User agent to use.
   property user_agent : String

-  # HTTP Hoes Header to use.
+  # HTTP Host Header to use.
   property host_header : String?

   # HTTP Host Headers to use for specific hosts.
@@ -66,7 +66,6 @@ module Arachnid
   host : String? = nil,
   read_timeout : Int32? = nil,
   connect_timeout : Int32? = nil,
   follow_redirects : Bool? = nil,
   max_redirects : Int32? = nil,
   do_not_track : Bool? = nil,
   default_headers : Hash(String, String)? = nil,
@@ -104,7 +103,6 @@ module Arachnid
   @sessions = SessionCache.new(
     read_timeout,
     connect_timeout,
     follow_redirects,
     max_redirects,
     do_not_track
   )
```

```diff
@@ -40,7 +40,7 @@ module Arachnid
   # Pass the headers from every response the agent receives to a given
   # block.
-  def all_headers(&block)
+  def all_headers(&block : HTTP::Headers)
     @every_resource_blocks << ->(resource : Resource) {
       block.call(resource.headers)
     }
@@ -182,7 +182,7 @@ module Arachnid
     }
   end

-  # Pass every JavaScript resource that the agent visits to a given blocevery_javascript_resource(&block : Resource ->)
+  # Pass every JavaScript resource that the agent visits to a given block
   def every_javascript(&block : Resource ->)
     @every_resource_blocks << ->(resource : Resource) {
       block.call(resource) if resource.javascript?
```

```diff
@@ -23,7 +23,6 @@ module Arachnid
   def initialize(
     read_timeout : Int32? = nil,
     connect_timeout : Int32? = nil,
     follow_redirects : Bool? = nil,
     max_redirects : Int32? = nil,
     do_not_track : Bool? = nil
   )
```