Added an image downloader to the CLI
This commit is contained in:
parent
11207b60f6
commit
642ff912b5
|
@ -14,7 +14,7 @@ module Arachnid
|
||||||
end
|
end
|
||||||
|
|
||||||
sub "summarize" do
|
sub "summarize" do
|
||||||
desc "Scan a site (or sites) and generate a JSON report"
|
desc "scan a site (or sites) and generate a JSON report"
|
||||||
usage <<-USAGE
|
usage <<-USAGE
|
||||||
arachnid summarize [sites] [options]
|
arachnid summarize [sites] [options]
|
||||||
|
|
||||||
|
@ -80,7 +80,7 @@ module Arachnid
|
||||||
|
|
||||||
run do |opts, args|
|
run do |opts, args|
|
||||||
if args.size != 1
|
if args.size != 1
|
||||||
raise "arachnid sitemap requires exactly one site to scan. you provided #{args.size}"
|
raise "arachnid sitemap requires exactly one site to scan. you provided #{args.size}."
|
||||||
elsif !opts.json && !opts.xml
|
elsif !opts.json && !opts.xml
|
||||||
raise "you must select either xml or json"
|
raise "you must select either xml or json"
|
||||||
else
|
else
|
||||||
|
@ -89,6 +89,66 @@ module Arachnid
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
sub "imgd" do
|
||||||
|
desc "scan a site and download all the images found"
|
||||||
|
usage <<-USAGE
|
||||||
|
arachnid imgd [url] [options]
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
# Download all images from crystal-lang.org and save them to ./images
|
||||||
|
arachnid imgd https://crystal-lang.org -o ./images
|
||||||
|
|
||||||
|
# Download all images between 5000 and 10000 bytes
|
||||||
|
arachnid imgd https://crystal-lang.org -m5000 -x10000
|
||||||
|
USAGE
|
||||||
|
|
||||||
|
option "-n", "--limit NUM", type: Int32, desc: "maximum number of pages to scan"
|
||||||
|
option "-f", "--fibers NUM", type: Int32, desc: "maximum amount of fibers to spin up", default: 10
|
||||||
|
option "-i", "--ignore PATTERNS", type: Array(String), desc: "url patterns to ignore (regex)"
|
||||||
|
option "-o DIR", "--outdir=DIR", type: String, desc: "directory to save images to", default: "./imgd-downloads"
|
||||||
|
option "-m NUM", "--minsize=NUM", type: Int32, desc: "image minimum size (in bytes)"
|
||||||
|
option "-x NUM", "--maxsize=NUM", type: Int32, desc: "image maximum size (in bytes)"
|
||||||
|
|
||||||
|
run do |opts, args|
|
||||||
|
if args.size != 1
|
||||||
|
raise "arachnid imgd requires exactly one site to scan. you provided #{args.size}."
|
||||||
|
else
|
||||||
|
img = Arachnid::Cli::ImageDownloader.new
|
||||||
|
img.run(opts, args)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
help_template do |desc, usage, options, sub_commands|
|
||||||
|
longest_option = options.reduce(0) do |acc, opt|
|
||||||
|
option = opt[:names].join(", ")
|
||||||
|
option.size > acc ? option.size : acc
|
||||||
|
end
|
||||||
|
|
||||||
|
options_help_lines = options.map do |option|
|
||||||
|
option[:names].join(", ").ljust(longest_option + 5) + " - #{option[:desc]}" + ( option[:default] ? " (default: #{option[:default]})" : "" )
|
||||||
|
end
|
||||||
|
|
||||||
|
base = <<-BASE_HELP
|
||||||
|
#{usage}
|
||||||
|
|
||||||
|
#{desc}
|
||||||
|
|
||||||
|
options:
|
||||||
|
#{options_help_lines.join("\n ")}
|
||||||
|
|
||||||
|
BASE_HELP
|
||||||
|
|
||||||
|
sub = <<-SUB_COMMAND_HELP
|
||||||
|
|
||||||
|
sub commands:
|
||||||
|
#{sub_commands.map { |command| command[:help_line].strip }.join("\n ") }
|
||||||
|
SUB_COMMAND_HELP
|
||||||
|
|
||||||
|
sub_commands.empty? ? base : base + sub
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
require "./action"
|
||||||
|
require "termspinner"
|
||||||
|
require "mime"
|
||||||
|
|
||||||
|
module Arachnid
|
||||||
|
class Cli < Clim
|
||||||
|
class ImageDownloader < Cli::Action
|
||||||
|
|
||||||
|
def run(opts, args)
|
||||||
|
url = URI.parse(args[0])
|
||||||
|
spinner = Spinner::Spinner.new("Wait...")
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
outdir = File.expand_path(opts.outdir, __DIR__)
|
||||||
|
|
||||||
|
spider = Arachnid::Agent.new(limit: opts.limit, fibers: opts.fibers)
|
||||||
|
spider.visit_urls_like(Regex.new(url.to_s))
|
||||||
|
|
||||||
|
opts.ignore.each do |pattern|
|
||||||
|
pattern = Regex.new(pattern)
|
||||||
|
spider.ignore_urls_like(pattern)
|
||||||
|
end
|
||||||
|
|
||||||
|
spider.every_image do |res|
|
||||||
|
next if opts.minsize && res.body.bytesize < opts.minsize.not_nil!
|
||||||
|
next if opts.maxsize && res.body.bytesize > opts.maxsize.not_nil!
|
||||||
|
# name = opts.format ? format_filename(opts.format, res.url.path) || res.url.path
|
||||||
|
name = format_filename(nil, res, count)
|
||||||
|
outfile = File.join(outdir, name)
|
||||||
|
|
||||||
|
count += 1
|
||||||
|
spinner.message = "Saved #{outfile}"
|
||||||
|
File.write(outfile, res.body.to_slice, mode: "a")
|
||||||
|
end
|
||||||
|
|
||||||
|
# Create the target directory
|
||||||
|
Dir.mkdir_p(outdir)
|
||||||
|
|
||||||
|
spinner.start("Crawling...")
|
||||||
|
spider.start_at(url)
|
||||||
|
spinner.stop("Finished! #{count} images saved to #{outdir}\n")
|
||||||
|
end
|
||||||
|
|
||||||
|
def format_filename(format, res, index)
|
||||||
|
filename = res.url.path
|
||||||
|
ext = File.extname(filename)
|
||||||
|
basename = File.basename(filename, ext)
|
||||||
|
|
||||||
|
# If the ext is empty create one from the MIME type
|
||||||
|
if ext.empty?
|
||||||
|
extensions = MIME.extensions(res.content_type)
|
||||||
|
ext = extensions.first? || ".unknown"
|
||||||
|
end
|
||||||
|
|
||||||
|
filename = basename + ext
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
Loading…
Reference in New Issue