Added an image downloader to the cli

2019-07-01 13:37:34 -07:00 · 2019-07-01 13:37:34 -07:00 · 642ff912b5
parent 11207b60f6
commit 642ff912b5
2 changed files with 121 additions and 2 deletions
--- a/src/arachnid/cli.cr
+++ b/src/arachnid/cli.cr
@ -14,7 +14,7 @@ module Arachnid
      end
      sub "summarize" do
-        desc "Scan a site (or sites) and generate a JSON report"
+        desc "scan a site (or sites) and generate a JSON report"
        usage <<-USAGE
        arachnid summarize [sites] [options]
@ -80,7 +80,7 @@ module Arachnid
        run do |opts, args|
          if args.size != 1
-            raise "arachnid sitemap requires exactly one site to scan. you provided #{args.size}"
+            raise "arachnid sitemap requires exactly one site to scan. you provided #{args.size}."
          elsif !opts.json && !opts.xml
            raise "you must select either xml or json"
          else
@ -89,6 +89,66 @@ module Arachnid
          end
        end
      end
      # `imgd` sub command: crawl a single site and download the images found on it.
      # The actual crawl is delegated to `Arachnid::Cli::ImageDownloader`.
      sub "imgd" do
        desc "scan a site and download all the images found"
        usage <<-USAGE
        arachnid imgd [url] [options]
          Examples:
            # Download all images from crystal-lang.org and save them to ./images
            arachnid imgd https://crystal-lang.org -o ./images
            # Download all images between 5000 and 10000 bytes
            arachnid imgd https://crystal-lang.org -m5000 -x10000
        USAGE
        # Crawl tuning.
        option "-n", "--limit NUM",         type: Int32,          desc: "maximum number of pages to scan"
        option "-f", "--fibers NUM",        type: Int32,          desc: "maximum amount of fibers to spin up", default: 10
        option "-i", "--ignore PATTERNS",   type: Array(String),  desc: "url patterns to ignore (regex)"
        # Output location and image size filters.
        option "-o DIR", "--outdir=DIR",    type: String,         desc: "directory to save images to",         default: "./imgd-downloads"
        option "-m NUM", "--minsize=NUM",   type: Int32,          desc: "image minimum size (in bytes)"
        option "-x NUM", "--maxsize=NUM",   type: Int32,          desc: "image maximum size (in bytes)"
        run do |opts, args|
          # Guard: the command takes exactly one positional argument (the site to crawl).
          unless args.size == 1
            raise "arachnid imgd requires exactly one site to scan. you provided #{args.size}."
          end

          Arachnid::Cli::ImageDownloader.new.run(opts, args)
        end
      end
      # Renders the help text for a command: usage, description and an aligned
      # option list, followed by the sub command list when there is one.
      help_template do |desc, usage, options, sub_commands|
        # Width of the widest "-x, --long" label, so descriptions line up in one column.
        label_width = options.max_of? { |opt| opt[:names].join(", ").size } || 0

        option_lines = options.map do |opt|
          label = opt[:names].join(", ").ljust(label_width + 5)
          default_note = opt[:default] ? " (default: #{opt[:default]})" : ""
          "#{label} - #{opt[:desc]}#{default_note}"
        end

        base = <<-BASE_HELP
          #{usage}
          #{desc}
          options:
            #{option_lines.join("\n    ")}
        BASE_HELP

        sub_command_lines = sub_commands.map { |command| command[:help_line].strip }.join("\n    ")
        sub_help = <<-SUB_COMMAND_HELP
          sub commands:
            #{sub_command_lines}
        SUB_COMMAND_HELP

        # Only append the sub command section when the command actually has sub commands.
        if sub_commands.empty?
          base
        else
          base + sub_help
        end
      end
    end
  end
 end
--- a/src/arachnid/cli/image_downloader.cr
+++ b/src/arachnid/cli/image_downloader.cr
@ -0,0 +1,59 @@
 require "./action"
 require "termspinner"
 require "mime"
 module Arachnid
  class Cli < Clim
    # Action behind `arachnid imgd`: crawls one site and saves every image
    # response that passes the optional size filters into an output directory.
    class ImageDownloader < Cli::Action
      # Crawls the URL given in `args[0]` and writes the images found to disk.
      #
      # Reads `opts.outdir`, `opts.limit`, `opts.fibers`, `opts.ignore`,
      # `opts.minsize` and `opts.maxsize`. Creates `opts.outdir` if missing.
      def run(opts, args)
        url = URI.parse(args[0])
        spinner = Spinner::Spinner.new("Wait...")
        count = 0

        # Resolve against the current working directory. `__DIR__` is the
        # directory of this source file, so using it as the base would put
        # downloads next to the source tree instead of where the user ran
        # the command.
        outdir = File.expand_path(opts.outdir)

        spider = Arachnid::Agent.new(limit: opts.limit, fibers: opts.fibers)

        # Escape the URL so characters such as "." and "?" are matched
        # literally instead of being interpreted as regex metacharacters.
        spider.visit_urls_like(Regex.new(Regex.escape(url.to_s)))

        # User-supplied ignore patterns are intentionally treated as regexes.
        opts.ignore.each do |pattern|
          spider.ignore_urls_like(Regex.new(pattern))
        end

        spider.every_image do |res|
          # Bind the nilable options to locals so the nil check narrows the type.
          size = res.body.bytesize
          minsize = opts.minsize
          maxsize = opts.maxsize
          next if minsize && size < minsize
          next if maxsize && size > maxsize

          # TODO: support a user-supplied file name format (e.g. `opts.format`).
          name = format_filename(nil, res, count)
          outfile = File.join(outdir, name)

          # Truncate rather than append: appending would concatenate several
          # images into one corrupt file on a re-run or a name collision.
          File.write(outfile, res.body.to_slice)

          # Only count and report the image once it has actually been written.
          count += 1
          spinner.message = "Saved #{outfile}"
        end

        # Create the target directory before the crawl starts producing files.
        Dir.mkdir_p(outdir)

        spinner.start("Crawling...")
        spider.start_at(url)
        spinner.stop("Finished! #{count} images saved to #{outdir}\n")
      end

      # Builds the file name (without directory) under which `res` is saved.
      #
      # The name is the last segment of the response URL's path. When that
      # segment has no extension, one is looked up from the response content
      # type via `MIME.extensions`, falling back to ".unknown". When the path
      # yields no usable base name (empty, or just the separator),
      # "image-<index>" is used instead.
      #
      # `format` is currently unused; it is kept for a future naming pattern.
      def format_filename(format, res, index)
        path = res.url.path
        ext = File.extname(path)
        basename = File.basename(path, ext)

        # A URL without a file-like last segment gives no usable name.
        basename = "image-#{index}" if basename.empty? || basename == "/"

        # If the ext is empty create one from the MIME type
        if ext.empty?
          ext = MIME.extensions(res.content_type).first? || ".unknown"
        end

        basename + ext
      end
    end
  end
 end