diff --git a/src/arachnid/cli.cr b/src/arachnid/cli.cr
index cf2ea82..33d8d23 100644
--- a/src/arachnid/cli.cr
+++ b/src/arachnid/cli.cr
@@ -14,7 +14,7 @@ module Arachnid
       end
 
       sub "summarize" do
-        desc "Scan a site (or sites) and generate a JSON report"
+        desc "scan a site (or sites) and generate a JSON report"
         usage <<-USAGE
         arachnid summarize [sites] [options]
 
@@ -80,7 +80,7 @@ module Arachnid
 
         run do |opts, args|
           if args.size != 1
-            raise "arachnid sitemap requires exactly one site to scan. you provided #{args.size}"
+            raise "arachnid sitemap requires exactly one site to scan. you provided #{args.size}."
           elsif !opts.json && !opts.xml
             raise "you must select either xml or json"
           else
@@ -89,6 +89,66 @@ module Arachnid
           end
         end
       end
+
+      sub "imgd" do
+        desc "scan a site and download all the images found"
+        usage <<-USAGE
+        arachnid imgd [url] [options]
+
+        Examples:
+
+        # Download all images from crystal-lang.org and save them to ./images
+        arachnid imgd https://crystal-lang.org -o ./images
+
+        # Download all images between 5000 and 10000 bytes
+        arachnid imgd https://crystal-lang.org -m5000 -x10000
+        USAGE
+
+        option "-n", "--limit NUM", type: Int32, desc: "maximum number of pages to scan"
+        option "-f", "--fibers NUM", type: Int32, desc: "maximum amount of fibers to spin up", default: 10
+        option "-i", "--ignore PATTERNS", type: Array(String), desc: "url patterns to ignore (regex)"
+        option "-o DIR", "--outdir=DIR", type: String, desc: "directory to save images to", default: "./imgd-downloads"
+        option "-m NUM", "--minsize=NUM", type: Int32, desc: "image minimum size (in bytes)"
+        option "-x NUM", "--maxsize=NUM", type: Int32, desc: "image maximum size (in bytes)"
+
+        run do |opts, args|
+          if args.size != 1
+            raise "arachnid imgd requires exactly one site to scan. you provided #{args.size}."
+          else
+            img = Arachnid::Cli::ImageDownloader.new
+            img.run(opts, args)
+          end
+        end
+      end
+
+      help_template do |desc, usage, options, sub_commands|
+        longest_option = options.reduce(0) do |acc, opt|
+          option = opt[:names].join(", ")
+          option.size > acc ? option.size : acc
+        end
+
+        options_help_lines = options.map do |option|
+          option[:names].join(", ").ljust(longest_option + 5) + " - #{option[:desc]}" + ( option[:default] ? " (default: #{option[:default]})" : "" )
+        end
+
+        base = <<-BASE_HELP
+        #{usage}
+
+        #{desc}
+
+        options:
+         #{options_help_lines.join("\n ")}
+
+        BASE_HELP
+
+        sub = <<-SUB_COMMAND_HELP
+
+        sub commands:
+         #{sub_commands.map { |command| command[:help_line].strip }.join("\n ") }
+        SUB_COMMAND_HELP
+
+        sub_commands.empty? ? base : base + sub
+      end
     end
   end
 end
diff --git a/src/arachnid/cli/image_downloader.cr b/src/arachnid/cli/image_downloader.cr
new file mode 100644
index 0000000..64cdb27
--- /dev/null
+++ b/src/arachnid/cli/image_downloader.cr
@@ -0,0 +1,73 @@
+require "./action"
+require "termspinner"
+require "mime"
+
+module Arachnid
+  class Cli < Clim
+    # Implements the `imgd` sub command: crawls one site and saves the
+    # images the spider reports into an output directory.
+    class ImageDownloader < Cli::Action
+      # Crawls `args[0]` and writes every image that passes the optional
+      # `--minsize` / `--maxsize` filters into `opts.outdir`.
+      #
+      # `opts` are the parsed `imgd` options; `args` holds the single URL
+      # to scan (the sub command rejects any other argument count).
+      def run(opts, args)
+        url = URI.parse(args[0])
+        spinner = Spinner::Spinner.new("Wait...")
+
+        count = 0
+        # Resolve a relative outdir against the directory the command runs
+        # in; `__DIR__` is the directory of this source file instead.
+        outdir = File.expand_path(opts.outdir)
+
+        spider = Arachnid::Agent.new(limit: opts.limit, fibers: opts.fibers)
+        spider.visit_urls_like(Regex.new(url.to_s))
+
+        opts.ignore.each do |pattern|
+          spider.ignore_urls_like(Regex.new(pattern))
+        end
+
+        spider.every_image do |res|
+          size = res.body.bytesize
+          next if (min = opts.minsize) && size < min
+          next if (max = opts.maxsize) && size > max
+
+          # name = opts.format ? format_filename(opts.format, res.url.path) || res.url.path
+          name = format_filename(nil, res, count)
+          outfile = File.join(outdir, name)
+
+          count += 1
+          spinner.message = "Saved #{outfile}"
+          # Overwrite instead of appending, so a repeated name or a second
+          # run cannot concatenate two images into one corrupt file.
+          File.write(outfile, res.body.to_slice)
+        end
+
+        # Create the target directory
+        Dir.mkdir_p(outdir)
+
+        spinner.start("Crawling...")
+        spider.start_at(url)
+        spinner.stop("Finished! #{count} images saved to #{outdir}\n")
+      end
+
+      # Returns the file name to save `res` under: the base name of its URL
+      # path plus an extension. When the path has no extension, the first
+      # one registered for the response's MIME type is used, falling back to
+      # ".unknown". `format` and `index` are accepted but currently unused.
+      def format_filename(format, res, index)
+        path = res.url.path
+        ext = File.extname(path)
+        basename = File.basename(path, ext)
+
+        # If the ext is empty create one from the MIME type
+        if ext.empty?
+          ext = MIME.extensions(res.content_type).first? || ".unknown"
+        end
+
+        basename + ext
+      end
+    end
+  end
+end