#!/bin/sh -- # This comment tells perl not to loop!
eval 'exec /usr/imports/bin/perl5 -S $0 ${1+"$@"}'
    if 0;
################################################################
#
# retrieve_database strand-dbfile destination-dir (workdir)
#
# Inputs:
#
#   File <strand-dbfile> contains entries one per line:
#   n url1 url2, where n is the unique ID number of a
#   document pair, url1 is the URL of an English document,
#   and url2 is the URL that STRAND has identified as a
#   parallel translation.  Lines beginning with "#" are
#   skipped, as are lines that do not have this shape.
#
#   Directory <destination-dir>, which is assumed to already
#   exist, will contain files N.1 and N.2 containing the
#   downloaded parallel translations for pair number N.
#
#   Optional directory <workdir> defaults to "/tmp"; it will
#   contain temporary files.
#
# Output:
#
#   Files are created in <destination-dir> (see above),
#   and the database of successfully retrieved pairs is
#   printed on stdout.  Failures are reported to STDERR.
#
# Required:
#
#   The GNU Wget package must be installed and in the
#   execution path, or the user must replace variable
#   $wgetcmd, below, with some other command string
#   that takes a URL as an argument, retrieves the page,
#   and prints it to stdout.
#
################################################################

use strict;
use warnings;
use File::Copy qw(move);

# Wget command.  This is a shell command string; the (quoted) URL is
# appended to it and its stdout is redirected into a temporary file.
my $wgetcmd = "wget -q -O -";

# Handle command line arguments
my $argc = @ARGV;
if ($argc < 2 || $argc > 3) {
    die "Usage: $0 strand-dbfile destination-dir (workdir)\n";
}
my $dbfile  = $ARGV[0];
my $outdir  = $ARGV[1];
my $workdir = ($argc == 3 ? $ARGV[2] : "/tmp");
my $debug   = "off";

# Fail early instead of downloading every page and then being unable
# to store any of them.
die "Destination directory $outdir does not exist\n" unless -d $outdir;
die "Work directory $workdir does not exist\n"       unless -d $workdir;

open(my $in, '<', $dbfile) or die "Unable to read $dbfile: $!\n";

# Per-process scratch files, reused for every pair.
my $tempfile1 = "$workdir/$$.1";
my $tempfile2 = "$workdir/$$.2";

LINE:
while (my $line = <$in>) {
    next LINE if $line =~ /^#/;

    # Expect "<id> <url1> <url2>"; anything else is silently ignored.
    my ($id, $url1, $url2) = ($line =~ /^\s*(\d+)\s+(\S+)\s+(\S+)\s*$/)
        or next LINE;

    # Retrieve the two pages from the WWW
    fetch_url($url1, $tempfile1);
    fetch_url($url2, $tempfile2);

    # If one or both pages was unretrievable (missing or empty
    # download), report the error and drop whatever was fetched.
    if (!(-s $tempfile1) || !(-s $tempfile2)) {
        print STDERR "Failed: $id\t $url1 $url2\n";
        unlink($tempfile1, $tempfile2);
        next LINE;
    }

    # Otherwise move the files to the destination directory and report
    # the line -- but only if both moves really succeeded, so stdout
    # lists nothing that is not actually present in the destination.
    if (   move($tempfile1, "$outdir/$id.1")
        && move($tempfile2, "$outdir/$id.2"))
    {
        print $line;
    }
    else {
        print STDERR "Unable to move pair $id into $outdir: $!\n";
        print STDERR "Failed: $id\t $url1 $url2\n";
        unlink($tempfile1, $tempfile2);
    }
}

close($in);

# Download $url into file $dest by running $wgetcmd through the shell.
# Success is judged by the caller from the size of $dest, not from the
# command's exit status.
sub fetch_url {
    my ($url, $dest) = @_;

    print STDERR "Retrieving $url\n" if ($debug eq "on");
    system("$wgetcmd " . shell_quote($url) . " > " . shell_quote($dest));
    return;
}

# Return $text wrapped in single quotes for /bin/sh, with any embedded
# single quote rewritten as '\'' so the value cannot terminate the
# quoting and be interpreted as shell syntax.
sub shell_quote {
    my ($text) = @_;

    $text =~ s/'/'\\''/g;
    return "'$text'";
}