#!/usr/bin/env perl
##############################################################################
# RDF Grabber
#
# Downloads RDF files and copies them to a position in the local filesystem.
# Gets information on file URL and local destination from a tab separated
# input file, where each line is of the form: ^URL\tPath$
#
# NOTE - This script does not support path names that contain whitespace, even
#        escaped whitespace.
#
# $Header: /var/cvs/projects/websites/acherondevelopment/files/parse_rdf/rdfgrabber,v 1.1 2003/06/11 18:19:40 vrai Exp $
##############################################################################

use strict;
use warnings;

use File::Basename qw( basename );
use File::Copy     qw( move );

# Get the RDF information file
my $infoFilename = $ARGV[ 0 ]
    or die ( "Usage: rdfgrabber rdf_info_filename\n" );

# Parse the file
my %rdfs = ParseRDFInfoFile ( $infoFilename );

# Process the RDFs in the hash
ProcessRDFs ( \%rdfs );

exit ( 0 );

##############################################################################
# Function definitions

# Processes the RDFs: downloads each one to a temporary file and, only if the
# download succeeded, moves it to the given location and updates its
# modification time.
#
# A failed download or move is reported with a warning and the remaining
# entries are still processed; the existing destination file is left alone.
# Dies if a basename cannot be derived from a destination path, or if a stale
# temporary file cannot be removed.
#
# Arg0: Reference to the RDF hash, URL=>Path
sub ProcessRDFs
{
    my ( $rdfs ) = @_;

    foreach my $url ( keys ( %$rdfs ) )
    {
        # Get the path and derive the temporary file name from its basename
        my $path     = $rdfs->{ $url };
        my $baseName = basename ( $path );
        die ( "Cannot get basename of $path\n" )
            if ( !defined ( $baseName ) || $baseName eq '' || $baseName =~ m{/} );
        my $tmpPath = TrimString ( '/tmp/tmp_' . $baseName );

        # Remove any existing temporary file with this name. The -l test also
        # catches a dangling symlink, which -e (like stat) would not see.
        if ( -e $tmpPath || -l $tmpPath )
        {
            unlink ( $tmpPath )
                or die ( "Unable to remove temporary file $tmpPath - $!\n" );
        }

        # Download the RDF to a local file. The list form of system bypasses
        # the shell, so characters in the URL or path cannot be interpreted
        # as shell syntax. -q keeps wget silent, as the original captured and
        # discarded its output.
        my $status = system ( '/usr/bin/wget', '-q', '-O', $tmpPath, $url );

        # Only replace the destination when the download succeeded and
        # actually produced some content
        if ( $status != 0 || !-s $tmpPath )
        {
            warn ( "Unable to download $url - keeping existing $path\n" );
            unlink ( $tmpPath );
            next;
        }

        # Move the file to its new location and mark it as freshly updated
        if ( move ( $tmpPath, $path ) )
        {
            my $now = time ( );
            utime ( $now, $now, $path )
                or warn ( "Unable to update timestamp of $path - $!\n" );
        }
        else
        {
            warn ( "Unable to move $tmpPath to $path - $!\n" );
        }

        # Remove the temporary file (if it exists) - not too bothered if this
        # doesn't work
        unlink ( $tmpPath );
    }

    return;
}

# Parses the RDF info file in to a hash where each pair is: URL=>Path
#
# Lines that do not look like "http://...<TAB>path" are silently ignored, as
# are lines whose path is empty once whitespace has been removed. Dies if the
# file cannot be opened.
#
# Ret:  Hash of the form URL=>Path
# Arg0: File name of RDF Information File
sub ParseRDFInfoFile
{
    my ( $filename ) = @_;
    my %rdfs;

    # Open the file and read it in a line at a time
    open ( my $infoFile, '<', $filename )
        or die ( "Unable to open info file \"$filename\" - $!\n" );

    while ( my $line = <$infoFile> )
    {
        # Check that the line is valid
        next unless ( $line =~ /^http:\/\/[^\t]+[\t]+[^\t]+$/ );

        # Split the line in to its two components and add them to the hash
        my ( $url, $path ) = split ( /[\t]+/, $line, 2 );
        $url  = TrimString ( $url );
        $path = TrimString ( $path );

        # A path consisting solely of whitespace is unusable as a destination
        next if ( $url eq '' || $path eq '' );

        $rdfs{ $url } = $path;
    }

    close ( $infoFile );

    return %rdfs;
}

# Removes ALL whitespace (spaces, tabs, carriage returns and newlines) from a
# string, even escaped whitespace
#
# Ret:  String sans whitespace (empty string if the argument is undefined)
# Arg0: String to trim
sub TrimString
{
    my ( $string ) = @_;

    return '' unless ( defined ( $string ) );

    $string =~ s/[ \t\r\n]+//g;

    return $string;
}