Added an image downloader to the CLI

This commit is contained in:
Chris Watson 2019-07-01 13:37:34 -07:00
parent 11207b60f6
commit 642ff912b5
No known key found for this signature in database
GPG Key ID: 37DAEF5F446370A4
2 changed files with 121 additions and 2 deletions

View File

@ -14,7 +14,7 @@ module Arachnid
end end
sub "summarize" do sub "summarize" do
desc "Scan a site (or sites) and generate a JSON report" desc "scan a site (or sites) and generate a JSON report"
usage <<-USAGE usage <<-USAGE
arachnid summarize [sites] [options] arachnid summarize [sites] [options]
@ -80,7 +80,7 @@ module Arachnid
run do |opts, args| run do |opts, args|
if args.size != 1 if args.size != 1
raise "arachnid sitemap requires exactly one site to scan. you provided #{args.size}" raise "arachnid sitemap requires exactly one site to scan. you provided #{args.size}."
elsif !opts.json && !opts.xml elsif !opts.json && !opts.xml
raise "you must select either xml or json" raise "you must select either xml or json"
else else
@ -89,6 +89,66 @@ module Arachnid
end end
end end
end end
# `imgd` sub command: crawls exactly one site and downloads the images found.
sub "imgd" do
  desc "scan a site and download all the images found"
  usage <<-USAGE
  arachnid imgd [url] [options]
  Examples:
  # Download all images from crystal-lang.org and save them to ./images
  arachnid imgd https://crystal-lang.org -o ./images
  # Download all images between 5000 and 10000 bytes
  arachnid imgd https://crystal-lang.org -m5000 -x10000
  USAGE

  # Crawl tuning.
  option "-n", "--limit NUM",
    type: Int32,
    desc: "maximum number of pages to scan"
  option "-f", "--fibers NUM",
    type: Int32,
    desc: "maximum amount of fibers to spin up",
    default: 10
  option "-i", "--ignore PATTERNS",
    type: Array(String),
    desc: "url patterns to ignore (regex)"

  # Output location and size filters.
  option "-o DIR", "--outdir=DIR",
    type: String,
    desc: "directory to save images to",
    default: "./imgd-downloads"
  option "-m NUM", "--minsize=NUM",
    type: Int32,
    desc: "image minimum size (in bytes)"
  option "-x NUM", "--maxsize=NUM",
    type: Int32,
    desc: "image maximum size (in bytes)"

  run do |opts, args|
    # Reject anything other than a single site argument up front.
    unless args.size == 1
      raise "arachnid imgd requires exactly one site to scan. you provided #{args.size}."
    end

    downloader = Arachnid::Cli::ImageDownloader.new
    downloader.run(opts, args)
  end
end
# Custom help layout: usage, description, an aligned table of options and,
# when any exist, the list of sub commands.
help_template do |desc, usage, options, sub_commands|
  # Width of the widest "-x, --long" label, used to align the descriptions.
  label_width = options.map { |opt| opt[:names].join(", ").size }.max? || 0

  option_rows = options.map do |opt|
    label = opt[:names].join(", ").ljust(label_width + 5)
    default_note = opt[:default] ? " (default: #{opt[:default]})" : ""
    "#{label} - #{opt[:desc]}#{default_note}"
  end
  option_block = option_rows.join("\n ")
  command_block = sub_commands.map { |command| command[:help_line].strip }.join("\n ")

  base = <<-BASE_HELP
  #{usage}
  #{desc}
  options:
  #{option_block}
  BASE_HELP

  sub_help = <<-SUB_COMMAND_HELP
  sub commands:
  #{command_block}
  SUB_COMMAND_HELP

  # The sub command section is only appended when there is something to list.
  if sub_commands.empty?
    base
  else
    base + sub_help
  end
end
end end
end end
end end

View File

@ -0,0 +1,59 @@
require "./action"
require "termspinner"
require "mime"
module Arachnid
class Cli < Clim
class ImageDownloader < Cli::Action
# Crawls the site given as the single CLI argument and writes every image
# the spider reports into the output directory.
#
# opts - parsed CLI options; this method reads `outdir`, `limit`, `fibers`,
#        `ignore`, `minsize` and `maxsize`.
# args - positional arguments; `args[0]` is parsed as the start URL.
def run(opts, args)
  url = URI.parse(args[0])
  spinner = Spinner::Spinner.new("Wait...")
  count = 0

  # Resolve a relative output directory against the current working
  # directory. `__DIR__` is the directory of this source file, fixed at
  # compile time, which is not where the user runs the command.
  outdir = File.expand_path(opts.outdir)

  spider = Arachnid::Agent.new(limit: opts.limit, fibers: opts.fibers)

  # Escape the start URL so characters such as `.` and `?` are matched
  # literally instead of being interpreted as regex operators.
  spider.visit_urls_like(Regex.new(Regex.escape(url.to_s)))

  # Ignore patterns are user-supplied regexes and are compiled as given.
  opts.ignore.each do |pattern|
    spider.ignore_urls_like(Regex.new(pattern))
  end

  spider.every_image do |res|
    # Apply the optional size filters (compared against the body's byte size).
    size = res.body.bytesize
    if min = opts.minsize
      next if size < min
    end
    if max = opts.maxsize
      next if size > max
    end

    # TODO: support a user-supplied filename format (`opts.format`).
    name = format_filename(nil, res, count)
    outfile = File.join(outdir, name)
    count += 1

    # Write a fresh file. Appending (mode "a") would concatenate the bytes
    # of different images that share a name, or of repeated runs, into one
    # corrupt file.
    File.write(outfile, res.body.to_slice)
    spinner.message = "Saved #{outfile}"
  end

  # Create the target directory before the crawl starts producing files.
  Dir.mkdir_p(outdir)

  spinner.start("Crawling...")
  spider.start_at(url)
  spinner.stop("Finished! #{count} images saved to #{outdir}\n")
end
# Builds the file name (without any directory part) used to save an image.
#
# format - currently unused by this method.
# res    - the fetched image; `res.url.path` supplies the name and
#          `res.content_type` is consulted when the path has no extension.
# index  - number used to build a fallback name when the URL path does not
#          contain a usable base name.
#
# Returns the base name followed by its extension, e.g. "logo.png".
def format_filename(format, res, index)
  path = res.url.path
  ext = File.extname(path)
  basename = File.basename(path, ext)

  # If the ext is empty create one from the MIME type
  if ext.empty?
    ext = MIME.extensions(res.content_type).first? || ".unknown"
  end

  # An empty or "/" path has no base name to reuse; without a fallback the
  # result would be a bare extension such as ".png" or "/.png".
  if basename.empty? || basename == "/"
    basename = "image-#{index}"
  end

  basename + ext
end
end
end
end