Make backfill script safer for automated use

* Write output files to a temp dir out of the way, just in case a
  regular dumps worker is working on the same output jobs
* Check that there is no existing partial or full output file in place
  just before starting the maintenance script that would produce it
* Check that there is no existing file in the final location, before
  moving the finished temp file into that location (perhaps a regular
  worker produced it in the meantime)

With bad scheduling it is still possible for this script to work on the
same job as a regular dump worker, and for the regular worker to
generate checksums of its output file, which is then overwritten by this
script's output.  But these changes should minimize the possibility.

Bug: T252396
Change-Id: Ib966731c949f0d93bcea36a3e66dffd978a60ce5
diff --git a/xmldumps-backup/fixup_scripts/do_dumptextpass_jobs.sh b/xmldumps-backup/fixup_scripts/do_dumptextpass_jobs.sh
index 940ea0a..2888980 100644
--- a/xmldumps-backup/fixup_scripts/do_dumptextpass_jobs.sh
+++ b/xmldumps-backup/fixup_scripts/do_dumptextpass_jobs.sh
@@ -10,6 +10,9 @@
 # page-meta-history bz2 output files
 # optionally locks wiki for date during the run
 #
+# output files are written to the wiki's temp dir and only
+# moved into place once they are verified
+#
 # does NOT: update md5s, status, dumpruninfo, symlinks, etc.
 # does NOT: clean up old dumps, remove old files from run
 # this should be done by running a noop via the regular dumps system
@@ -183,7 +186,7 @@
 }
 
 setup_textpass_args() {
-    #/usr/bin/php7.2 /srv/mediawiki/multiversion/MWScript.php dumpTextPass.php --wiki=wikidatawiki --stub=gzip:/mnt/dumpsdata/xmldatadumps/temp/w/wikidatawiki/wikidatawiki-20190901-stub-meta-history1.xml-p82872p98330.gz --prefetch=7zip:/mnt/dumpsdata/xmldatadumps/public/wikidatawiki/20190801/wikidatawiki-20190801-pages-meta-history1.xml-p74429p85729.7z;/mnt/dumpsdata/xmldatadumps/public/wikidatawiki/20190801/wikidatawiki-20190801-pages-meta-history1.xml-p85730p103181.7z --report=1000 --spawn=/usr/bin/php7.2 --output=bzip2:/mnt/dumpsdata/xmldatadumps/public/wikidatawiki/20190901/wikidatawiki-20190901-pages-meta-history1.xml-p82872p98330.bz2.inprog --full
+    #/usr/bin/php7.2 /srv/mediawiki/multiversion/MWScript.php dumpTextPass.php --wiki=wikidatawiki --stub=gzip:/mnt/dumpsdata/xmldatadumps/temp/w/wikidatawiki/wikidatawiki-20190901-stub-meta-history1.xml-p82872p98330.gz --prefetch=7zip:/mnt/dumpsdata/xmldatadumps/public/wikidatawiki/20190801/wikidatawiki-20190801-pages-meta-history1.xml-p74429p85729.7z;/mnt/dumpsdata/xmldatadumps/public/wikidatawiki/20190801/wikidatawiki-20190801-pages-meta-history1.xml-p85730p103181.7z --report=1000 --spawn=/usr/bin/php7.2 --output=bzip2:/mnt/dumpsdata/xmldatadumps/temp/w/wikidatawiki/wikidatawiki-20190901-pages-meta-history1.xml-p82872p98330.bz2.inprog --full
 
     # sanity check of date
     result=`date -d "$DATE"`
@@ -204,7 +207,7 @@
     dumptextargs=( "${dumptextargs[@]}" "--wiki=${WIKI}" "--report=1000" "--spawn=$PHP" )
     dumptextargs=( "${dumptextargs[@]}" "--full" )
     dumptextargs=( "${dumptextargs[@]}" "--stub=gzip:${STUB}" )
-    dumptextargs=( "${dumptextargs[@]}" "--output=bzip2:${OUTDIR}/${OFILE}.inprog" )
+    dumptextargs=( "${dumptextargs[@]}" "--output=bzip2:${TEMPFILESDIR}/${OFILE}.inprog" )
     if [ -n "$PREFETCHES" ]; then
 	dumptextargs=( "${dumptextargs[@]}" "--prefetch=7zip:${PREFETCHES}" )
     fi
@@ -303,15 +306,27 @@
 	    echo "New batch..."
 	fi
 	for stub_doing in ${stubs_batch[@]}; do
+	    EXISTS=""
 	    get_stub_range $stub_doing
 	    outputfile="${WIKI}-${DATE}-pages-meta-history${PARTNUM}.xml-p${STUBSTART}p${STUBEND}.bz2"
 	    get_prefetches $STUBSTART $STUBEND || exit 1
 	    combine_prefetches
 	    setup_textpass_args "$stub_doing" "$outputfile" ${prefetches[@]}
-	    if [ -n "$DRYRUN" -o -n "$VERBOSE" ]; then
-		echo "$PHP ${dumptextargs[@]}"
+
+	    # skip if there is a partial or complete output file already there
+	    if [[ -f "${OUTDIR}/${outputfile}.inprog" ]] || [[ -f "${OUTDIR}/${outputfile}" ]]; then
+		if [ -n "$DRYRUN" -o -n "$VERBOSE" ]; then
+		    echo "Output file ${OUTDIR}/${outputfile} already in progress or complete, skipping"
+		fi
+		EXISTS="true"
 	    fi
-	    if [ -z "$DRYRUN" ]; then
+
+	    if [ -z "$EXISTS" ]; then
+		if [ -n "$DRYRUN" -o -n "$VERBOSE" ]; then
+		    echo "$PHP ${dumptextargs[@]}"
+		fi
+	    fi
+	    if [ -z "$DRYRUN" -a -z "$EXISTS" ]; then
 	        $PHP ${dumptextargs[@]} &
 	        wait_pids+=($!)
 		outfiles+=("$outputfile")
@@ -322,11 +337,18 @@
 	    wait $pid
 	    if [ $? -ne 0 ]; then
 		echo "failed to generate" ${outfiles[$i]} "with nonzero exit code"
-	    elif $( /usr/local/bin/checkforbz2footer ${OUTDIR}/${outfiles[$i]}.inprog ); then
-		mv ${OUTDIR}/${outfiles[$i]}.inprog ${OUTDIR}/${outfiles[$i]}
+	    elif $( /usr/local/bin/checkforbz2footer ${TEMPFILESDIR}/${outfiles[$i]}.inprog ); then
+		# Should we move over an existing file? No: we don't know what put it there, so manual
+		# intervention will be required in that case. This may not indicate an error, though.
+		if [ -f "${OUTDIR}/${outfiles[$i]}" ]; then
+		    echo "File ${OUTDIR}/${outfiles[$i]} already exists, not writing over it"
+		    mv ${TEMPFILESDIR}/${outfiles[$i]}.inprog ${TEMPFILESDIR}/${outfiles[$i]}
+		else
+		    mv ${TEMPFILESDIR}/${outfiles[$i]}.inprog ${OUTDIR}/${outfiles[$i]}
+		fi
 	    else
 		echo "renaming truncated ${outfiles[$i]}"
-		mv ${OUTDIR}/${outfiles[$i]}.inprog ${OUTDIR}/${outfiles[$i]}.truncated
+		mv ${TEMPFILESDIR}/${outfiles[$i]}.inprog ${TEMPFILESDIR}/${outfiles[$i]}.truncated
             fi
 	    ((i++))
 	done