#!/bin/sh
# http://www.berklix.com/~jhs/bin/.sh/web_cp_0_inc
#	Called by 
#		~/bin/.sh/web_cp_local_daily
#		~/bin/.sh/web_cp_local_distfiles
#		~/bin/.sh/web_cp_local_monthly
#		~/bin/.sh/web_cp_local_netswitch
#		~/bin/.sh/web_cp_local_pcbsd
#		~/bin/.sh/web_cp_local_phk
#		~/bin/.sh/web_cp_local_test
#		~/bin/.sh/web_cp_local_weekly
#	Not called by 
#		~/bin/.sh/web_cp_remote
# Test to ensure cwd & parameters get passed in:
#	cd /var/tmp
#	pwd
#	InheritanceVarTest="word1 word2"
#	export InheritanceVarTest
#	echo -n "printenv: "
#	printenv InheritanceVarTest
#	echo -n "dollar:   "
#	echo $InheritanceVarTest
#	echo Leaving Include

# After a test with:
#	cp ~jhs/pu*l/index.html ~jhs-l/public_html/
#	$sl ; $ht $params -O $httree http://fire/~jhs-l/
# Lessons learnt:
#	If the source directory does not contain an index.html,
#		then lots of index*.html get created in the target.
#	If the source directory does contain an index.html,
#		it makes empty sub directories for things index.html
#		points at, eg cv/.
#		It removes files in the target directory that are not
#		referred to by index.html.
#			So if you had a target FreeBSD/distfiles/ directory &
#			someone put an index.html in the source,
#			httrack would delete 100G in the target.
#			So if the BBC podcasts index no longer listed a podcast,
#			it would disappear from the target even if still in source.
#	A guard sketch follows below.
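# A guard sketch (hypothetical; fetch is FreeBSD's fetch(1); URL taken from
# the test above): before mirroring into a precious target, check whether
# the source now serves an index.html, & warn, eg:
#	if fetch -o /dev/null -q http://fire/~jhs-l/index.html ; then
#		echo "source has an index.html: httrack may purge the target"
#	fi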

# /site/usr/local/www/Data/virtual/berklix.net/backup ->
#	/usr/local/www/backup
# Extra domains beyond web_cp_remote, called from /var/jhs/crontab just
# on user.js.berklix.net, not on www.berklix.org.
# See also:
#	http://svnweb.freebsd.org/ports/head/www/httrack/
#	http://www.httrack.com/
# Warning: Greedy !!
#	This used 120M of swap, & gate=park ran out; it also made
#	gate slow. As it also runs on an internal host, using gate
#	as a proxy, run it on the internal host, which has another
#	advantage: no need to keep both alternate gates up to date.
#	It (or perhaps something else) once locked up the internal
#	host, so best run it on a sacrificial host that can reboot ?
#	... concern over web_cp_remote !
#	A containment sketch follows below.
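# A containment sketch (assumes FreeBSD sh's ulimit builtin, -v in KBytes):
# cap virtual memory so a greedy run fails rather than exhausting swap:
#	ulimit -v 524288	# 512 MByte virtual memory cap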

# Warning:
#	If the shell is suspended, the running httrack gets dropped
#	into the background; doing that a few times means several
#	httrack processes running in parallel (lock sketch below).
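# A lock sketch (hypothetical lock path) to prevent parallel copies:
#	lock="/var/tmp/`basename $0`.lock"
#	if ! mkdir "$lock" 2>/dev/null ; then
#		echo "$0: already running ? lock exists: $lock" ; exit 1
#	fi
#	trap 'rmdir "$lock"' EXIT INT TERM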

pub="/pub"	# -> site/pub
cd $pub	|| ( echo "$0: Fatal Error: cd $pub failed" ; exit 1 )
echo "$0:"
echo "OK succeeded: cd $pub"
unset pub

if test $? -eq 0 ; then
	true
	# echo "$0 cd succeeded on `hostname -s` `date -u +%Y-%m-%dT%H:%M:%SZ`" | \
	#	mail -s "Cron: `hostname -s`:`basename $0`" jhs
else
	echo "$0: cd failed on `hostname -s` `date -u +%Y-%m-%dT%H:%M:%SZ`" | \
		mail -s "Cron: `hostname -s`:`basename $0`" jhs
	exit 1
fi
domain1=`hostname -s`
domain2=`hostname`
domain=`hostname | sed -e "s/${domain1}\.//"`
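# eg if hostname = fire.js.berklix.net (host name assumed from the test
#	above) then domain1=fire & domain=js.berklix.net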

ht="nice nice /usr/local/bin/httrack"
#	--verbose --debug-log
#		I suspect --verbose or --debug-log produces 1.2G of mail
#		that overflows host=fire.

sl="sleep 6"
# The sleep is so that if I hit ^C I don't have to repeat the key
#	strokes about 6 times to finally escape the shell.
#	(An alternative sketch below.)
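# An alternative sketch (an assumption, not used here): trap SIGINT so one
# ^C aborts the whole script rather than just the current command:
#	trap 'echo "$0: interrupted" ; exit 130' INT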

params=""			# httrack --help

# -w = --mirror
params="$params --mirror"
# re-applied 29 April 2012	after which it stopped refetching everything,
#	eg downloads.bbc.co.uk/podcasts/radio4/tip/

# -s0 = --robots=0
# params="--robots=0"			# used in web_cp_local_distfiles

# man httrack:
#	-AN,  --max-rate[=N]
#		maximum transfer rate in bytes/seconds (1000=1KB/s max)
# Timing test:
#	Over 40 minutes at holz, during working day, with a 1Mbit/s
#	DSL line, & proxy host connected to DSL modem at 10Mbit/sec,
#	& 100 Mbit/s to internal host, with httrack sucking BBC
#	podcasts into a new largely empty tree, initially with rdist
#	also sending from gate to 3 x berklix remote servers,
#	-A Test		du -s -k Result
#	 30000		15 K Byte/ sec = 120 K bit/sec
#	 60000		29 K Byte/ sec = 232 K bit/sec
#	120000		50 K Byte/ sec = 400 K bit/sec
#	240000		50 K Byte/ sec = 400 K bit/sec
#	480000		50 K Byte/ sec = 400 K bit/sec

# Original DSL was 768 K bit/s, automatically upgraded to 1 M bit/sec.
# 1000 K bit/sec = 125 K byte/sec raw, less with overhead.
# It seems something (BBC, TKom, DSL, Gate ?) limits at 400 K bit/sec.
# It seems the httrack parameter is not in scale with reality
# (worked example below).
# With old DSL I chose -A30000, to not saturate my link,
# & to preserve interactive response.

# My new DSL can theoretically do up to 16 Mbit/sec,
# but my cable to the splitter constrains that.
# A 10 Mbit/sec enet interface, if still in place, may also constrain it.
#	(host=mart has 2x100 enets. host=park to be checked)
# gate host CPU is slow.

# params="$params -A60000"	# 2011.06.22 # x 8 = 480,000 bits / sec
# params="$params -A30000"	# 2011.07.20 # x 8 = 240,000 bits / sec
# params="$params -A20000"	# 2012.04.17 # x 8 = 160,000 bits / sec
# params="$params -A10000"	# max 10 Kbyte/sec, gently on bsn
params="$params   -A300000"	# 2014.02.08 # x 8 = 2,400,000 bits / sec

# Observed: Maximum bandwidth limited to xxxxxxxx to avoid server overload
#	web_cp_local_distfiles	ftp2.de.freebsd.org		250000
#	web_cp_local_bsd	ftp-archive.freebsd.org		100000
#	web_cp_local_daily	www.bbc.co.uk			100000

# Remember that is per invocation of httrack,
# & as often host=blak is running web_cp_local_distfiles while
# host=fire is running web_cp_local_daily, 2 x -A30000 take so much
# bandwidth that browsers time out (arithmetic below).
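# Worked arithmetic: 2 x -A30000 = 60 K Byte/sec = 480 K bit/sec requested,
# already over the ~400 K bit/sec ceiling observed above, hence timeouts.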

# $sl ; $ht $params -A10000 -O berklix.com http://www.berklix.com
	# du -k = 456,654
# $sl ; $ht $params -A10000 -O berklix.org http://www.berklix.org
# $sl ; $ht $params -A10000 -O berklix.net http://www.berklix.net

# Man httrack: "-MN maximum overall size that can be uploaded/scanned
#		(--max-size[=N])"
# Allow more data as I can load my flat rate DSL.
#	www.uk.freebsd.org
#	More than 100,000,000 bytes have been transfered.. giving up - OK
#		  100,000,000
#params="$params -M2,000,000,000"
#params="$params  -M2000000000"
# Currently my distfiles has about 100 Gig.

# cygwin 2014-02: More than 2000000000 bytes have been transfered.. giving up

#params="$params  -M10000000000"

if [ "${domain1}" = "mart" ]; then	# Max bytes per job
	# echo "Proxy not needed on gateway, but you may need lots of swap"
elif [ "${domain1}" = "park" ]; then	# Max bytes per job
	# echo "Proxy not needed on gateway, but you may need lots of swap"
else
	if ( [ "$domain" == js.berklix.net ] ) ; then
		params="$params -P gate:80"
	# echo "Proxy is set to gate:80"
	fi
fi

# params="$params --quiet"

# --update		updates a mirror in the current folder
params="$params --update"	# To reduce traffic

# -%s = --updatehack
#	update hacks: various hacks to limit re-transfers when updating
#		(identical size, bogus response..) (--updatehack)
# I suspect this may not work ? (rough test sketch below)
params="$params --updatehack"

params="$params --verbose"	# for debug

if ( [ "$domain" != js.berklix.net ] && [ "$domain" != no.berklix.net ] ) ; then
	echo "$0: Error wrong domain"
	exit 1
	fi

# To Check Hrefs
#	Warning: produces over 600M, as it gets all of eg:
#		./_jhs/gea/ski
#		./jhs/gea/ski
#	& probably later
#		./gea/ski

#	--robots=0	# ignore robots.txt

#	-cN    number of multiple connections (*c8) (--sockets[=N])
params="$params --sockets=1"		# --sockets=3 in web_cp_local_distfiles
# params="$params --sockets=3"		# --sockets=3 in web_cp_local_distfiles
#	Only update 1 file at a time:
#		As on a home link I want to be able to intermittently
#		drop the link, & it is annoying when I have various .mp3
#		truncated after pulling in parallel, & xmms stops playing
#		half way. Better to just pull one mp3 at a time.
#	While it's running I see multiple .mp3.tmp files, but these
#	have older time stamps, so are presumably not an indication of
#	parallel fetches, but just leftovers from prior aborted runs
#	(a check sketch follows below), & perhaps only get deleted by
#	--purge-old=1 at the end of a successful run.
#	However I want to pull other multiple html & distfiles in parallel.
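# A read-only check sketch (age threshold is a guess) to spot stale .tmp
# leftovers from aborted runs, as opposed to live parallel fetches:
#	find . -name '*.tmp' -mmin +60 -print	# .tmp older than an hour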

paramsbbc="$params"	# subset of params used for bbc

# --- Below here params are not passed to ~/bin/.sh/web_cp_local_daily for bbc

# -X = --purge-old	# purge old files after update
params="$params -X"
# params="$params --purge-old"		Result:	Option -X needs to be followed by a parameter: -X <param>
# params="$params --purge-old=0"	Result:	No error
# params="$params --purge-old=1"	Result:	* invalid option 1
# I've seen web_cp_remote_daily removing lots of big bbc podcasts at end
# of run, logged as eg:
#	Info:   Purging bbc/downloads.bbc.co.uk/podcasts/radio4/material/material_20130704-1703c.mp3
# & man httrack shows:
#	-X     *purge old files after update (X0 keep delete) (--purge-old[=N])
# so web_cp_remote_daily needs to avoid -X.
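# (Note: paramsbbc above was captured before -X was appended,
#  so the bbc subset already avoids the purge.)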

params="$params --stay-on-same-domain"

# -a = --stay-on-same-address
params="$params --stay-on-same-address"

# params="$params --verbose"

# params="$params -Z -z"          # Add debug

# params="$params -%!"          # Tried 2012_04_28
        # Bypass built-in security limits aimed to avoid bandwidth
        # abuses (bandwidth, simultaneous connections)
        # (--disable-security-limits)

# params="$params --background-on-suspend"
# params="$params -y"

# To try later:
#       --keep-alive

# Other HTML site copying tools apart from httrack:
#	pavuk	< ernst
#	spider	< ernst
#	webcopy
#	wget	< gary
