Added ignore option to cli

This commit is contained in:
Chris Watson 2019-07-01 09:55:07 -07:00
parent 19e022faae
commit 18eb31c0ce
No known key found for this signature in database
GPG Key ID: 37DAEF5F446370A4
3 changed files with 27 additions and 13 deletions

View File

@ -33,14 +33,15 @@ module Arachnid
arachnid summarize https://crystal-lang.org -c 404 500 arachnid summarize https://crystal-lang.org -c 404 500
USAGE USAGE
option "-l", "--ilinks", type: Bool, desc: "generate a map of pages to internal links" option "-l", "--ilinks", type: Bool, desc: "generate a map of pages to internal links"
option "-L", "--elinks", type: Bool, desc: "generate a map of pages to external links" option "-L", "--elinks", type: Bool, desc: "generate a map of pages to external links"
option "-c CODES", "--codes=CODES", type: Array(Int32), desc: "generate a map of status codes to pages \ option "-c CODES", "--codes=CODES", type: Array(Int32), desc: "generate a map of status codes to pages \
that responded with that code" that responded with that code"
option "-n", "--limit NUM", type: Int32, desc: "maximum number of pages to scan" option "-n", "--limit NUM", type: Int32, desc: "maximum number of pages to scan"
option "-f", "--fibers NUM", type: Int32, desc: "maximum amount of fibers to spin up", default: 10 option "-f", "--fibers NUM", type: Int32, desc: "maximum amount of fibers to spin up", default: 10
option "-o FILE", "--output=FILE", type: String, desc: "file to write the report to (if undefined \ option "-i", "--ignore PATTERNS", type: Array(String), desc: "url patterns to ignore (regex)"
output will be printed to STDOUT" option "-o FILE", "--output=FILE", type: String, desc: "file to write the report to (if undefined \
output will be printed to STDOUT"
run do |opts, args| run do |opts, args|
if args.empty? if args.empty?
@ -70,11 +71,12 @@ module Arachnid
USAGE USAGE
option "--xml", type: Bool, desc: "generate the sitemap in XML format" option "--xml", type: Bool, desc: "generate the sitemap in XML format"
option "--json", type: Bool, desc: "generate the sitemap in JSON format" option "--json", type: Bool, desc: "generate the sitemap in JSON format"
option "-o FILE", "--output=FILE", type: String, desc: "filename to write the report to. \ option "-o FILE", "--output=FILE", type: String, desc: "filename to write the report to. \
default is the hostname + .json or .xml" default is the hostname + .json or .xml"
option "-f", "--fibers NUM", type: Int32, desc: "maximum amount of fibers to spin up", default: 10 option "-f", "--fibers NUM", type: Int32, desc: "maximum amount of fibers to spin up", default: 10
option "-i", "--ignore PATTERNS", type: Array(String), desc: "url patterns to ignore (regex)"
run do |opts, args| run do |opts, args|
if args.size != 1 if args.size != 1

View File

@ -18,6 +18,12 @@ module Arachnid
spider = Arachnid::Agent.new(fibers: opts.fibers) spider = Arachnid::Agent.new(fibers: opts.fibers)
spider.visit_urls_like(Regex.new(Regex.escape(url.to_s))) spider.visit_urls_like(Regex.new(Regex.escape(url.to_s)))
# Compile each pattern given via -i/--ignore into a Regex and register it
# with the spider as a URL pattern to ignore. The previous debug `pp` of
# every pattern was removed so the command no longer writes noise to STDOUT.
opts.ignore.each do |pattern|
  spider.ignore_urls_like(Regex.new(pattern))
end
map = { map = {
domain: url.to_s, domain: url.to_s,
lastmod: { lastmod: {

View File

@ -15,6 +15,12 @@ module Arachnid
spider.visit_urls_like(Regex.new(url)) spider.visit_urls_like(Regex.new(url))
end end
# Compile each pattern given via -i/--ignore into a Regex and register it
# with the spider as a URL pattern to ignore. The previous debug `pp` of
# every pattern was removed: this command prints its report to STDOUT when
# no output file is given, so stray debug output would corrupt the report.
opts.ignore.each do |pattern|
  spider.ignore_urls_like(Regex.new(pattern))
end
pages = 0 pages = 0
internal_links = Hash(String, Array(String)).new internal_links = Hash(String, Array(String)).new
external_links = Hash(String, Array(String)).new external_links = Hash(String, Array(String)).new