From 18eb31c0ce4d8ebed6c4a2a8c0e192face2902cf Mon Sep 17 00:00:00 2001 From: Chris Watson Date: Mon, 1 Jul 2019 09:55:07 -0700 Subject: [PATCH] Added ignore option to cli Adds -i/--ignore PATTERNS to the summarize and sitemap commands. Each pattern is compiled with Regex.new and handed to the spider via ignore_urls_like before crawling starts. --- src/arachnid/cli.cr | 28 +++++++++++++++------------- src/arachnid/cli/sitemap.cr | 5 +++++ src/arachnid/cli/summarize.cr | 5 +++++ 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/arachnid/cli.cr b/src/arachnid/cli.cr index 8368ea9..cf2ea82 100644 --- a/src/arachnid/cli.cr +++ b/src/arachnid/cli.cr @@ -33,14 +33,15 @@ module Arachnid arachnid summarize https://crystal-lang.org -c 404 500 USAGE - option "-l", "--ilinks", type: Bool, desc: "generate a map of pages to internal links" - option "-L", "--elinks", type: Bool, desc: "generate a map of pages to external links" - option "-c CODES", "--codes=CODES", type: Array(Int32), desc: "generate a map of status codes to pages \ - that responded with that code" - option "-n", "--limit NUM", type: Int32, desc: "maximum number of pages to scan" - option "-f", "--fibers NUM", type: Int32, desc: "maximum amount of fibers to spin up", default: 10 - option "-o FILE", "--output=FILE", type: String, desc: "file to write the report to (if undefined \ - output will be printed to STDOUT" + option "-l", "--ilinks", type: Bool, desc: "generate a map of pages to internal links" + option "-L", "--elinks", type: Bool, desc: "generate a map of pages to external links" + option "-c CODES", "--codes=CODES", type: Array(Int32), desc: "generate a map of status codes to pages \ + that responded with that code" + option "-n", "--limit NUM", type: Int32, desc: "maximum number of pages to scan" + option "-f", "--fibers NUM", type: Int32, desc: "maximum amount of fibers to spin up", default: 10 + option "-i", "--ignore PATTERNS", type: Array(String), desc: "url patterns to ignore (regex)" + option "-o FILE", "--output=FILE", type: String, desc: "file to write the report to (if undefined \ + output will be printed to STDOUT)" run do |opts, args| if 
args.empty? @@ -70,11 +71,12 @@ module Arachnid USAGE - option "--xml", type: Bool, desc: "generate the sitemap in XML format" - option "--json", type: Bool, desc: "generate the sitemap in JSON format" - option "-o FILE", "--output=FILE", type: String, desc: "filename to write the report to. \ - default is the hostname + .json or .xml" - option "-f", "--fibers NUM", type: Int32, desc: "maximum amount of fibers to spin up", default: 10 + option "--xml", type: Bool, desc: "generate the sitemap in XML format" + option "--json", type: Bool, desc: "generate the sitemap in JSON format" + option "-o FILE", "--output=FILE", type: String, desc: "filename to write the report to. \ + default is the hostname + .json or .xml" + option "-f", "--fibers NUM", type: Int32, desc: "maximum amount of fibers to spin up", default: 10 + option "-i", "--ignore PATTERNS", type: Array(String), desc: "url patterns to ignore (regex)" run do |opts, args| if args.size != 1 diff --git a/src/arachnid/cli/sitemap.cr b/src/arachnid/cli/sitemap.cr index 07fabb4..55b8413 100644 --- a/src/arachnid/cli/sitemap.cr +++ b/src/arachnid/cli/sitemap.cr @@ -18,6 +18,11 @@ module Arachnid spider = Arachnid::Agent.new(fibers: opts.fibers) spider.visit_urls_like(Regex.new(Regex.escape(url.to_s))) + opts.ignore.each do |pattern| + pattern = Regex.new(pattern) + spider.ignore_urls_like(pattern) + end + map = { domain: url.to_s, lastmod: { diff --git a/src/arachnid/cli/summarize.cr b/src/arachnid/cli/summarize.cr index 9f719f0..790a39d 100644 --- a/src/arachnid/cli/summarize.cr +++ b/src/arachnid/cli/summarize.cr @@ -15,6 +15,11 @@ module Arachnid spider.visit_urls_like(Regex.new(url)) end + opts.ignore.each do |pattern| + pattern = Regex.new(pattern) + spider.ignore_urls_like(pattern) + end + pages = 0 internal_links = Hash(String, Array(String)).new external_links = Hash(String, Array(String)).new