#!/usr/local/bin/perl # Despaminator Spambot Vaccine, Version 1.1 # Kevin Q. Brown # Techie Nerd, Inc. # http://www.TechieNerd.com/ # Copyright (C) 2003 # You may distribute this script under the terms of either the # GNU General Public License or the Perl Artistic License. # This script is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # WWW Syntax: index.cgi?subdir=xxx&url=yyy # Command Line Syntax: index.cgi subdir=xxx url=yyy # The despaminator protects email addresses from harvesting by spammers. # It splits, permutes, and escapes the components of each email address, # using JavaScript to reconstruct the email addresses in web browsers. # It rewrites all local hyperlinks to ensure that subsequent pages # are protected, too. # It process three kinds of web pages: # (1) html and htm files # (2) txt files (which it embeds in HTML) # (3) all other files are passed straight through unchanged. # Assumptions: # (A) The web site has only static web pages. # (B) The web site's host supports user-defined Perl CGI scripts. # (Placing this script as "index.cgi" in the main directory # for the web site works best, but if one must place CGI scripts # in a cgi-bin directory, then simply create an index.html that # autoforwards to that cgi-bin script.) # (C) When the web site references a local directory, # the url ends in a "/". (If it does not, then this script # will assume that the reference is to a file rather than a # directory.) # (D) When a url ends in a "/", the directory it references will # have a file with name "$defaultname" (defined below). # (This script does not yet support a _list_ of file names # to search for when the url references a directory.) # (E) Hyperlinks occur only in the HTML tag attributes: # action, background, codebase, href, and src # In particular, Flash applets for Internet Explorer are not supported. # Installation # - Move the old web site to another directory # (You may also want to password protect those pages if they # are still accessible via WWW.) # - Change the first line of this script to point to the location of Perl # on the web server. # - Edit the "Customization Parameters" below. # - Place this script (index.cgi) in the previous main directory # for the web site and ensure that it is executable. # Attributes with url values, according to Appendix B of # "Web Design in a Nutshell" by Jennifer Niederst: # action, background, cite, classid, codebase, data, href, # longdesc, profile, src, usemap, and version # Also, the "value" attribute of the "param" tag contains a url # when referencing a Flash .swf file for Internet Explorer. # In other contexts, the "value" attribute contains non-URL values. # Note: "version" (for the HTML DTD version) is almost certainly an # absolute URL that does not need to be translated. # # This script will translate the urls for attributes: # action, background, codebase, href, and src # History: # Version 1.1: Allow only limited chars for file name and subdir (May 6, 2004) # URLs have several variations. Here are some examples: # http://www.abc.xyz/... (assumed to be a non-local URL) # abc.html # abc.htm # abc.txt # abc/def.html # abc/ (This requires finding the implied file after "/".) # abc (Assumed to be a binary file, since it has no extension.) # /abc.html # ../abc.html # ./abc.html # abc.html?xyz=123 (JavaScript _can_ decode this query string.) # abc.html#DEF # abc.html?xyz=123#DEF # #DEF (This requires filling in the current file name.) # /?xyz=123 (This form may not be supported in this script.) # These can be enclosed in either double or single quotes. use strict; use CGI qw/:cgi :form/; # customization parameters my $scriptname = "index.cgi"; # name of this script # actual location of source files for web site # (docroot should end in "/") my $docroot = "/usr/home/myuserid/mywebsite/"; # not directly web accessible my $defaultname = "index.html"; # for directories ending in "/" # Characters allowed in file names my $fnameokchars = q(\w\.\-); # "\w" matches alphanumeric and "_" chars my $subdirokchars = q(\/) . $fnameokchars; # Define mapping for MIME types. # For a more complete list, see the mime.types file for the web server. # This list is needed because all local hyperlinks must pass through # this script and the correct mime type must be written to the HTML # header for files that are copied straight through. my %MIMETYPE = ( asc => "text/plain", au => "audio/basic", bin => "application/octet-stream", bmp => "image/bmp", class => "application/octet-stream", css => "text/css", dll => "application/octet-stream", doc => "application/msword", eps => "application/postscript", exe => "application/octet-stream", gif => "image/gif", htm => "text/html", html => "text/html", js => "application/x-javascript", mov => "video/quicktime", mp3 => "audio/mpeg", jpeg => "image/jpeg", jpg => "image/jpeg", mpeg => "video/mpeg", mpg => "video/mpeg", pdf => "application/pdf", png => "image/png", ppt => "application/vnd.ms-powerpoint", ps => "application/postscript", qt => "video/quicktime", rtf => "text/rtf", sh => "application/x-sh", snd => "audio/basic", swf => "application/x-shockwave-flash", tar => "application/x-tar", tif => "image/tiff", tiff => "image/tiff", txt => "text/plain", wav => "audio/x-wav", xls => "application/vnd.ms-excel", xml => "text/xml", xsl => "text/xml", zip => "application/zip" ); # Get parameter(s) my $url = param('url'); my $subdir = param('subdir'); if ($url eq "" && $subdir eq "") { $url = $defaultname; } unless ($url =~ /^[$fnameokchars]*$/) { errmsg("Bad character in url."); exit; } unless (length($url)) { errmsg("No parameter 'url' given: $!"); exit; } unless ($subdir =~ /^[$subdirokchars]*$/) { errmsg("Bad character in subdir."); exit; } # subdir should be empty or should end with "/" $subdir .= "/" unless ($subdir eq "" || substr($subdir, -1) eq "/"); # Ensure that the url is for a file that the user is allowed to see. # Convention: Put the web site in another directory tree and do not # allow accesses outside that directory tree. All files within the # directory tree are OK for the user to see. # Thus, just ensure that the level never drops below zero. my $tsturl = $url; $tsturl =~ s/\.\.\///g; # remove all instances of "../" my $cnt = (length($url) - length($tsturl)) / 3; # number of instances of "../" # conservative restriction for relative URLs my $lvl = $subdir =~ tr[/][/]; # count number of "/" chars in $subdir if ($lvl < $cnt) { errmsg("Not allowed to traverse beyond web site root."); exit 1; } # conservative restriction for URLs relative to document root if ($cnt > 0 && substr($url, 0, 1) eq "/") { errmsg("Not allowed to traverse beyond web site root."); exit 1; } # parse URL and insert any pieces that are missing (ie. implied) $url = canonicalurl($subdir, "", $url); # Echo-print non-HTML / non-plain-text files my $mtype = getmimetype($url); if ($mtype ne "text/html" && $mtype ne "text/plain") { echofile($url); exit 0; } # initialization srand( time ^ ($$ + ($$ << 15)) ); my $txt = ""; # read old HTML or plain-text file # Note: This assumes that the file is small enough to fit into RAM. my $fname = $url; $fname =~ s/\?.*//; # remove any query string $fname =~ s/\#.*//; # remove any "#" string $fname =~ s/^\.+//; # eliminate leading periods unless ( open(FH, "$docroot$subdir$fname") ) { errmsg("Cannot read file $fname: $!\n"); exit; } my $txt = join('', ); # the entire file is one long text string close(FH); # plain text file if ($mtype eq "text/plain") { # convert special characters for HTML display $txt =~ s/&/\&/g; $txt =~ s//\>/g; # Beginning HTML wrapper for plain text file my $headwrap = "\n\n" . CGI->escapeHTML($url) . "\n\n\n
\n";
  $txt =~ s/^/$headwrap/;

  # convert URLs to working hyperlinks
  $txt =~ s/(http:[^>\s]+)/$1<\/a>/g;

  # convert email addresses to hyperlinks while confusing spambots
  $txt =~ s/\s*\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*/asencodehref($&)/ieg;

  # Add final HTML for wrapping a text file
  $txt .= "
\n\n\n"; } else # HTML file { # remove lines $txt =~ s/]*>//ig; # convert email addresses to use anti-spam JavaScript # Case: (action|href)="mailto:..." or href="mailto:...?xxx=yyy&abc=123" $txt =~ s/<\s*[^>]+\s+(action|href)\s*=\s*\"mailto:\s*\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*(\?[^"]+)?\"\s*>/asencode($&)/ieg; # Case: any other email address $txt =~ s/\s*\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*/asencode($&)/ieg; # Edit all local "action=", "background=", "codebase=", "href=", # and "src=" references to use this script. # (The "subdir" parameter tracks depth of subdirectories.) $txt =~ s/(action|background|codebase|href|src)\s*=[^>\s]+/insertscript($scriptname,$subdir,$url,$&)/ieg; } # insert decoding for anti-spam JavaScript $txt =~ s/<\/head>/writeasdecode($&)/ie; # output result print "Content-Type: text/html\n\n"; print $txt; exit 0; # anti-spam encode sub asencode { my $addrstr = shift; # split string into several pieces my @textarr = split( /([.\@:?=+-])/, $addrstr); # my $len = scalar(@textarr); my $maxix = $#textarr; # initialize permutation array my $ix = 0; my @fperm = (); my @rperm = (); for ($ix=0; $ix <= $maxix; ++$ix) { $fperm[$ix] = $rperm[$ix] = $ix; } # permute text array and maintain both forward and reverse permutation arrays for ($ix=0; $ix<$maxix; ++$ix) { my $rix = $ix + 1 + int(rand($maxix-$ix)); # print "swap $ix and $rix\n"; # debug swap($fperm[$ix], $fperm[$rix], \@rperm); swap($ix, $rix, \@fperm); swap($ix, $rix, \@textarr); } # accumulate permutation and text array pieces into two strings my $delim = "+"; my $permstr = ""; my $textstr = ""; for ($ix=0; $ix<=$maxix; ++$ix) { # Note: escape() leaves a-z, A-Z, '_', '-', and '.' unchanged. # All other characters get converted to '%xx' equivalents. $permstr .= CGI->escape($rperm[$ix]) . $delim; $textstr .= CGI->escape($textarr[$ix]) . $delim; } chop($permstr); chop($textstr); # return JavaScript to decode the permuted string return(""); } # insert anti-spam encoded email address into a "mailto:" hyperlink sub asencodehref { my $addrstr = shift; my $hrefstr = "$addrstr"; return ( asencode($hrefstr) ); } # swap two elements of an array that is passed by reference sub swap { my $ix1 = shift; my $ix2 = shift; my $arrref = shift; my $temp = $arrref->[$ix1]; $arrref->[$ix1] = $arrref->[$ix2]; $arrref->[$ix2] = $temp; } # output anti-spam decoding JavaScript # TODO: Return CRLF on DOS, LF on UNIX sub writeasdecode { my $mstr = shift; my $delim = "+"; my $str = qq/