#!/usr/bin/env bash
#
# This script uses bash-only syntax further down (the `function` keyword,
# `(( ))` arithmetic and $RANDOM), so it requests bash explicitly instead of
# whatever /bin/sh happens to be.

# Configuration
#	o Environment
# A non-interactive bash only expands aliases when expand_aliases is set; the
# rest of the script relies on the `echo -e` alias below for "\t" in messages.
shopt -s expand_aliases
# NOTE(review): presumably here to override an inherited alias such as
# `cp -i` -- confirm.
alias cp="cp"
alias echo="echo -e"
# Script name as shown in the usage text. $0 is quoted so that a script path
# containing spaces still yields the right name.
self=$(basename -- "$0")
#	o Arguments
# Domain whose archived copies are fetched (first positional argument).
target=$1
# 	o wget
# User-Agent string handed to wget via -U by the download loop.
ua="Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.3) Gecko/20070309 Firefox/2.0.0.3"
# Files and paths
# Everything lives under the directory the script was started from; resolve it
# once instead of spawning `pwd` for every path.
workDir=$(pwd)
searchFile="$workDir/search.tmp"
tmpSiteDir="$workDir/site.tmp"
tmpSiteFilesFile="$workDir/sitefiles.tmp"
targetDir="$workDir/$target"

# Functions
# Print the command-line synopsis on stdout. Reads the global $self for the
# script name.
# NOTE(review): the text advertises an optional YYYYMMDD argument, but the
# argument check in this file exits unless exactly one argument is given --
# confirm which of the two is intended.
usage() {
	printf '%s\n' \
		"usage: $self domain [YYYYMMDD]" \
		"   ex. $self cwd.ptbcanadian.com 20030814" \
		" note: domain should be an unaddorned domain or subdomain name" \
		" note: YYYYMMDD is the latest date to accept files (this helps when a domain is" \
		"       now owned by a squatter and its originaly content is unavailable)"
}

# Sanity
# Exactly one non-empty argument (the domain) is required. An empty domain
# would make $targetDir the working directory itself and end in a misleading
# "already exists" error, so it is rejected here with the usage text instead.
if [ $# -ne 1 ] || [ -z "$1" ]; then
	usage >&2
	exit 1
fi
# Refuse to run when the scratch directory or the output directory is already
# present. Diagnostics go to stderr; printf is used so that backslashes in a
# path are printed literally rather than interpreted by the `echo -e` alias.
for existingPath in "$tmpSiteDir" "$targetDir"; do
	if [ -e "$existingPath" ]; then
		printf 'Error: %s directory already exists.\n' "$existingPath" >&2
		exit 1
	fi
done

# Main
#
# Ask the Wayback Machine index for the archived URLs of $target, download
# each listed entry into a scratch directory, and merge the downloaded files
# into $targetDir.

# Remove the scratch files and the scratch download directory.
cleanup() {
	rm -f "$searchFile" "$tmpSiteFilesFile"
	rm -rf "$tmpSiteDir"
}
# Clean up on every exit path: the sanity check refuses to start while
# $tmpSiteDir exists, so an interrupted run must not leave it behind.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Split the index page into one tag per line (GNU sed expands "\n" in the
# replacement) and keep only the lines that point at an archived copy of the
# target.
wget "http://web.archive.org/web/*sa_sh_sr_1nr_20000/${target}" -O - \
	| sed -e "s/>/>\n/g" \
	| grep -ie "web.archive.org.*/[0-9]*/.*${target}" > "$searchFile"

# Each line of $searchFile is an HTML fragment; wget -F -i - parses it as HTML
# from stdin and follows the links it contains.
while IFS= read -r line; do
	# Pause of 1-5 seconds between index entries.
	outterwait=$(( (RANDOM % 5) + 1 ))
	# NOTE(review): RANDOM % 1 is always 0, so innerwait is always 1. Kept as
	# in the original -- confirm whether a wider range was intended.
	innerwait=$(( (RANDOM % 1) + 1 ))

	# Start every entry from an empty scratch directory. Abort if it cannot be
	# entered, otherwise wget would download into an unrelated directory.
	rm -rf "$tmpSiteDir"
	mkdir -p "$tmpSiteDir"
	cd "$tmpSiteDir" || {
		printf 'Error: cannot enter %s\n' "$tmpSiteDir" >&2
		exit 1
	}
	printf '%s\n' "$line"
	printf '\tinnerwait = %s\n' "$innerwait"

	if printf '%s\n' "$line" | grep -qi '/*hh_/'; then
		# The entry is itself a listing page ("hh_" URL): fetch it to stdout,
		# keep the links to archived copies of the target, and download those
		# together with their page requisites. The filter uses $target; the
		# original hard-coded one specific domain here, so this branch found
		# nothing for any other site.
		printf '%s\n' "$line" \
			| wget -Fi - -O - -U "$ua" \
			| sed -e "s/>/>\n/g" \
			| grep -ie "web.archive.org.*${target}" \
			| wget -Kkp -Fi - -w "$innerwait" -U "$ua"
	else
		# Plain entry: download the linked page and its requisites directly.
		printf '%s\n' "$line" | wget -Kkp -Fi - -U "$ua"
	fi

	# Descend web.archive.org/web/<digits...>/<http...>/<site>/ inside the
	# scratch directory; the files below that point are what gets copied.
	# NOTE(review): the globs are deliberately unquoted. If any of them matches
	# more than one directory the test/cd fails and nothing is copied for this
	# entry -- same as the original; confirm whether that is acceptable.
	if cd web.archive.org/web && [ -d [0-9]* ] && cd [0-9]* && cd http* && cd *; then
		find . -type f > "$tmpSiteFilesFile"
		mkdir -p "$targetDir"
		# read -r with an empty IFS and quoted expansions keep file names with
		# spaces or backslashes intact.
		while IFS= read -r file; do
			destDir="$targetDir/$(dirname "$file")"
			mkdir -p "$destDir/"
			# -a preserves attributes, -u only overwrites older copies.
			cp -fau "$file" "$destDir/"
		done < "$tmpSiteFilesFile"
	fi

	printf 'Sleeping %s\n' "$outterwait"
	sleep "$outterwait"
	printf '\n'
done < "$searchFile"

# The scratch files and directory are removed by the EXIT trap installed above.

