Added an image downloader to the cli
This commit is contained in:
parent
11207b60f6
commit
642ff912b5
|
@ -14,7 +14,7 @@ module Arachnid
|
|||
end
|
||||
|
||||
sub "summarize" do
|
||||
desc "Scan a site (or sites) and generate a JSON report"
|
||||
desc "scan a site (or sites) and generate a JSON report"
|
||||
usage <<-USAGE
|
||||
arachnid summarize [sites] [options]
|
||||
|
||||
|
@ -80,7 +80,7 @@ module Arachnid
|
|||
|
||||
run do |opts, args|
|
||||
if args.size != 1
|
||||
raise "arachnid sitemap requires exactly one site to scan. you provided #{args.size}"
|
||||
raise "arachnid sitemap requires exactly one site to scan. you provided #{args.size}."
|
||||
elsif !opts.json && !opts.xml
|
||||
raise "you must select either xml or json"
|
||||
else
|
||||
|
@ -89,6 +89,66 @@ module Arachnid
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
# `imgd` sub command: scans one site and downloads the images found there.
# The actual work is delegated to Arachnid::Cli::ImageDownloader#run.
sub "imgd" do
  desc "scan a site and download all the images found"
  usage <<-USAGE
  arachnid imgd [url] [options]

  Examples:

  # Download all images from crystal-lang.org and save them to ./images
  arachnid imgd https://crystal-lang.org -o ./images

  # Download all images between 5000 and 10000 bytes
  arachnid imgd https://crystal-lang.org -m5000 -x10000
  USAGE

  # Crawl settings
  option "-n", "--limit NUM", type: Int32, desc: "maximum number of pages to scan"
  option "-f", "--fibers NUM", type: Int32, desc: "maximum amount of fibers to spin up", default: 10
  option "-i", "--ignore PATTERNS", type: Array(String), desc: "url patterns to ignore (regex)"

  # Download settings
  option "-o DIR", "--outdir=DIR", type: String, desc: "directory to save images to", default: "./imgd-downloads"
  option "-m NUM", "--minsize=NUM", type: Int32, desc: "image minimum size (in bytes)"
  option "-x NUM", "--maxsize=NUM", type: Int32, desc: "image maximum size (in bytes)"

  run do |opts, args|
    # Exactly one start URL is accepted; anything else is a usage error.
    unless args.size == 1
      raise "arachnid imgd requires exactly one site to scan. you provided #{args.size}."
    end

    downloader = Arachnid::Cli::ImageDownloader.new
    downloader.run(opts, args)
  end
end
|
||||
|
||||
# Custom help layout: usage, description, an aligned options table and,
# when the command has any, the list of sub commands.
help_template do |desc, usage, options, sub_commands|
  # Width of the widest "names" column; 0 when there are no options.
  longest_option = options.max_of? { |opt| opt[:names].join(", ").size } || 0

  # One line per option: padded names, description, optional default.
  options_help_lines = options.map do |option|
    names = option[:names].join(", ").ljust(longest_option + 5)
    default = option[:default]
    default_note = default ? " (default: #{default})" : ""
    "#{names} - #{option[:desc]}#{default_note}"
  end

  base = <<-BASE_HELP
  #{usage}

  #{desc}

  options:
  #{options_help_lines.join("\n ")}

  BASE_HELP

  sub_help = <<-SUB_COMMAND_HELP

  sub commands:
  #{sub_commands.map { |command| command[:help_line].strip }.join("\n ") }
  SUB_COMMAND_HELP

  # Only append the sub command section when there is something to list.
  if sub_commands.empty?
    base
  else
    base + sub_help
  end
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
require "./action"
|
||||
require "termspinner"
|
||||
require "mime"
|
||||
|
||||
module Arachnid
|
||||
class Cli < Clim
|
||||
class ImageDownloader < Cli::Action
|
||||
|
||||
# Crawls the site given as `args[0]` and saves every image the spider
# reports into the output directory.
#
# Reads `opts.outdir`, `opts.limit`, `opts.fibers`, `opts.ignore`,
# `opts.minsize` and `opts.maxsize`. Images whose body is smaller than
# `minsize` or larger than `maxsize` (in bytes) are skipped.
def run(opts, args)
  url = URI.parse(args[0])
  spinner = Spinner::Spinner.new("Wait...")

  count = 0

  # Resolve a relative outdir against the current working directory.
  # `__DIR__` is the directory of this source file, which is not where
  # the user is running the command from.
  outdir = File.expand_path(opts.outdir)

  spider = Arachnid::Agent.new(limit: opts.limit, fibers: opts.fibers)

  # The start URL is literal text, not a pattern: escape it so that `.`,
  # `?`, `+` etc. in the URL are not interpreted as regex operators.
  spider.visit_urls_like(Regex.new(Regex.escape(url.to_s)))

  # Ignore patterns, in contrast, are user-supplied regexes.
  opts.ignore.each do |pattern|
    spider.ignore_urls_like(Regex.new(pattern))
  end

  spider.every_image do |res|
    size = res.body.bytesize

    # Assigning to a local narrows the nilable option without `not_nil!`.
    if min = opts.minsize
      next if size < min
    end
    if max = opts.maxsize
      next if size > max
    end

    # NOTE: a user-selectable file name format is not wired up yet, so
    # `nil` is passed as the format.
    name = format_filename(nil, res, count)
    outfile = File.join(outdir, name)

    # Default write mode truncates. Appending would concatenate image data
    # onto an existing file when a name repeats or the command is re-run.
    File.write(outfile, res.body.to_slice)

    # Count only after the write has succeeded.
    count += 1
    spinner.message = "Saved #{outfile}"
  end

  # Create the target directory before the crawl starts writing into it.
  Dir.mkdir_p(outdir)

  spinner.start("Crawling...")
  spider.start_at(url)
  spinner.stop("Finished! #{count} images saved to #{outdir}\n")
end
|
||||
|
||||
# Builds the file name an image is saved under.
#
# The name is the last segment of `res.url.path`. When that segment has no
# extension, one is looked up from `res.content_type` via `MIME.extensions`,
# falling back to ".unknown" when the lookup yields nothing.
#
# `format` and `index` are accepted but not used by this method.
def format_filename(format, res, index)
  path = res.url.path
  suffix = File.extname(path)
  stem = File.basename(path, suffix)

  # The URL already carries an extension: keep it as is.
  return stem + suffix unless suffix.empty?

  # No extension in the URL: derive one from the MIME type.
  stem + (MIME.extensions(res.content_type).first? || ".unknown")
end
|
||||
end
|
||||
end
|
||||
end
|
Loading…
Reference in New Issue