search_firewall.cgi

#!/usr/bin/perl

# Intermediate Search, Version 1.1
#    Copyright 1997 by Fluid Dynamics <xav.com>
#    You are free to use the script, but please ask before you 
#    distribute it.
#
# For latest version and help files, visit:
#    http://www.xav.com/scripts/search/
# __________________________________________________________________


$basedir = '/usr/local/apache/htdocs/';
# The directory on disk that holds all your files.  Remember the trailing 
# slash: file paths are built by appending names directly to this value.

$baseurl = 'http://www.setgetweb.com/';
# The URL corresponding to the base directory.

@files = ('firewall/*.html',
          'firewall/*/*.html');


# These are the files that will be searched, given relative to $basedir.
# The asterisk is a shell wildcard: get_files hands each entry to `ls`
# after changing into $basedir, so it expands to every matching name.

$summary_file = '/tmp/summaries.txt';

# Make this file writable by the web server (e.g. chmod 777 summaries.txt)
# and hide it well!  return_html appends a summary of every search to it,
# so you'll know what people are really looking for when they come to
# your site.  We have placed our summary file in a non-web directory so
# others can't see it - you could put it in a hidden or secure directory.

$link_url = 'http://www.setgetweb.com/';
$link_title = 'Home';
# Enter the URL and title of your main web page.

$java_toys = 'off';
# When set to 'on', the search form page includes a small JavaScript
# snippet that puts the cursor in the search box.  (This is JavaScript,
# not Java.)  If your visitors can't handle it, better leave this 'off'.

$searchpict = 'http://www.setgetweb.com/images/search.gif';
$lawlogo = 'http://www.setgetweb.com/images/search.gif';
# The URLs of the two images shown at the top of the search form page.

$cgi_url = 'search_firewall.cgi';
# Change this to the full URL only if you rename this script.

# The @files array above holds info on all the directories and filetypes
# you'd like your visitors to search.  Visit the readme file for more
# customizing information.


# Options for Weighted Search:
#
# All occurrences of a search term count as one point.  The occurrence 
# of a term in the filename, title, META keywords, or META description 
# can have added weight (equivalent to a multiplier per hit).  Enter 
# the multipliers in the list below, in that order - the defaults are
# (2,2,4,2).  If this makes no sense to you, just ignore it and leave the
# defaults as they are - they work pretty well.  Note that this will give
# extra weight to those pages that have a properly formatted title and
# META tags, even if they contain the same basic information.

($name_x, $title_x, $keywords_x, $description_x) = (2,2,4,2);


# No further editing is necessary, but feel free to play around...
# Note that much of the code below is straight HTML, and very easy to 
# modify if you know a little about HTML programming.
# 
# __________________________________________________________________




# ---- Request handling ------------------------------------------------
#
# Form fields are collected from BOTH the POST body and the query string,
# because the forms this script prints submit with METHOD=POST while
# external links (and the original behaviour) use the query string.
# Query-string fields are parsed last, so they win when a name occurs in
# both places.

$buffer = '';
if (defined($ENV{'CONTENT_LENGTH'}) && ($ENV{'CONTENT_LENGTH'} =~ /^\d+$/)
    && ($ENV{'CONTENT_LENGTH'} > 0))
{
    read(STDIN,$buffer,$ENV{'CONTENT_LENGTH'});
}
$buffer = '' unless defined($buffer);

$query_string = defined($ENV{'QUERY_STRING'}) ? $ENV{'QUERY_STRING'} : '';

@pairs = (split(/&/,$buffer), split(/&/,$query_string));
foreach $pair (@pairs)
{
    # Limit the split to two fields so a value may itself contain "=".
    ($name,$value) = split(/=/,$pair,2);
    next unless (defined($name) && ($name ne ''));
    $value = '' unless defined($value);

    # URL-decode both the field name and its value (foreach aliases the
    # two variables, so they are modified in place).
    foreach $field ($name,$value)
    {
        $field =~ tr/+/ /;
        $field =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
    }
    $FORM{$name} = $value;
}


# ---- Dispatch --------------------------------------------------------

if (defined($FORM{'terms'}) && ($FORM{'terms'} ne ''))
{
    # A query was supplied: collect the files, search them, show results.
    get_files();
    search();
    return_html();
}
elsif (exists $FORM{'tips'})
{
    # The results page links to "$cgi_url?tips"; serve the search form
    # and tips page for those links.
    prompt();
}
else
{
    # A CGI response must begin with a header block; without it most web
    # servers answer with "500 Internal Server Error".
    print "Content-type: text/html\n\n";
    print "<p>Unable to parse variable terms";
    #&prompt;
}


sub prompt
{
# Print the complete search form / search tips page (HTTP header
# included) to standard output.
#
# Reads the configuration globals $lawlogo, $searchpict, $cgi_url,
# $java_toys, $link_url and $link_title.  Takes no arguments and returns
# nothing useful.
#
# The form submits with METHOD=GET so that the "terms" field arrives in
# the query string, which is where this script looks for it.
print "Content-type: text/html\n\n";
print <<EOM;
<HTML>
<HEAD>
<TITLE>Intermediate Search</TITLE>
<META NAME="description" CONTENT="Try out our internal search engine 
    for the fastest way to find what you're looking for!">


</head>

<body>

<center>
<IMG SRC="$lawlogo"></IMG>
<IMG SRC="$searchpict" height=75 width=75 ALT="Another Green World"></IMG>
<p>


<p><FORM METHOD=GET ACTION="$cgi_url" NAME="searchform">
<INPUT TYPE=TEXT NAME="terms" SIZE=30>
<INPUT TYPE=SUBMIT VALUE="Search!"><BR>

EOM

# Optional convenience: put the cursor in the search box on page load.
if ($java_toys eq 'on')
{
    print "<SCRIPT LANGUAGE=\"JavaScript\">\n";
    print "<!-- script hiding...\n";
    print "document.searchform.terms.focus();\n";
    print "// End hiding -->\n";
    print "</SCRIPT>\n";
}

print <<EOM;
</FORM>

<BR><H1>Search Tips</H1>
</center>
<font class=grey3>
<BLOCKQUOTE>

<p>The default value is or. Thus, a search for 
<i>lacobrts cmprts srgen</i> would return pages with at least one of the three terms.<P>

<p>The best way to search is to require that all the words be found. Do this by sticking a plus (+) sign in front of each word.  For example <i>+lacobrts +AP170</i> would only return documents that had both &quot;lacobrts&quot; and &quot;AP170&quot;.

<p>You can search for a phrase by putting quotes around your phrase, i.e., &quot;<i>Just what, exactly, is a lacobrts?</i>&quot;

<p>Each term may be preceded by the standard Boolean operators 
not, and, or or.

<p> If you search for "<i>dogs  not pizzas</i>", you'll find all documents containing the word <i>dogs</i> except those documents which also contain the word <i>pizzas</i>.

<p> If you type in "<i>hot</i>  and  <i>dog</i> and <i> pizzas</i>", you'll find only those documents which contain all three search terms. 

<p>Altavista's shorthand notation works too. A search on "<i>dogs 
-hot</i>" is equivalent to the first example, and "<i>+hot 
+dog +pizzas</i>" will return the same documents as the second.<P>

If a search term has at least one capital letter, like "<i>parIS</i>", 
the search will be case sensitive with respect to that word - that is, 
only documents containing "<i>parIS</i>" will be found. On the other 
hand, lowercase words like "<i>paris</i>" will generate hits 
from "<i>Paris</i>", "<i>PARIS</i>", or "<i>parIS</i>".<P>

To group a collection of words, use quotes.  For example, the query 
<i>"Zoltan Milosevic"</i> (quotes included) would not generate a hit 
from "Slobodan Milosevic met with Zoltan Smith". Without quotes, 
the sentence would count. Boolean operators can also 
act on quotations: a search on '<i>+the +kitten not "the 
kitten"</i>' would return only those documents where 
"<i>the</i>" and "<i>kitten</i>" appear separately.<P>

Intermediate Search finds words, not strings. A search for 
"<i>in</i>" would turn up only that word, not "<i>bin</i>",
"<i>inside</i>", or "<i>acquaintance</i>". To perform a 
string search, preface your term with the dollar sign - a 
query on "<i>\$in</i>" would find all words lists above. Note 
that more complex wildcard searches using the asterisk are 
<I>not</I> permitted. Including the asterisk in your query will 
return a list of all files, but that's its only function.<P>

These rules are based on <A HREF="http://www.altavista.digital.com">
Altavista's</A> query syntax; a look at their <A 
HREF="http://www.altavista.digital.com/cgi-bin/query?pg=h">
Search Tips</A> may prove useful. The original 
Simple Search was created by Matt Wright and can be found at <a 
href="http://www.worldwidemart.com/scripts/">Matt's Script 
Archive</a>. Like Matt's script, our version is freeware and can 
easily be set up on most websites.</BLOCKQUOTE>

<BR>

<center>

<A HREF="$link_url">$link_title</A>

<p><HR SIZE=1 NOSHADE WIDTH=50\%>

<p><font class=grey2><i>

Intermediate Search, Version 1.1 is Copyright 1997 (freeware) by
<A HREF="http://www.xav.com">Fluid Dynamics</A>.<BR>
Visit the <A HREF="http://www.xav.com/scripts/search">Search Page</A>
for help files and most recent version.
</font></i>

</center>
<p></p>

</BODY></HTML>
EOM
}




sub get_files
{
# Expand every entry of @files (relative to $basedir) and append each
# resulting text file to @FILES.
#
# Side effects: changes the current directory to $basedir, so the paths
# stored in @FILES are relative to it.  Calls bad_base (which reports the
# problem and exits) when $basedir is missing or cannot be entered.
#
# Expansion uses Perl's built-in glob/readdir rather than shelling out to
# `ls` and splitting its output on whitespace, so file names containing
# spaces are kept intact and no external process is started.
bad_base() unless (-e $basedir);
bad_base() unless chdir($basedir);

foreach my $pattern (@files)
{
    if (-d $pattern)
    {
        # A plain directory entry: take its visible (non-dot) contents.
        # The entry is expected to end in a slash, because names are
        # appended to it directly.
        opendir(my $dir_handle,$pattern) or next;
        my @entries = sort grep { !/^\./ } readdir($dir_handle);
        closedir($dir_handle);

        foreach my $entry (@entries)
        {
            my $path = "$pattern$entry";
            push(@FILES,$path) if (-T $path);
        }
    }
    else
    {
        # A wildcard pattern (or a single file name).  -T keeps only
        # readable text files, which also drops names that do not exist.
        foreach my $path (glob($pattern))
        {
            push(@FILES,$path) if (-T $path);
        }
    }
}
}

sub search
{
# Parse the visitor's query and scan every file in @FILES for it.
#
# Reads:   $FORM{'terms'}, @FILES, $baseurl and the weighting multipliers
#          $name_x, $title_x, $keywords_x, $description_x.
# Writes:  @optional, @required, @forbidden - the parsed terms.  Whole-word
#              terms are stored wrapped in a literal \W...\W marker and
#              string ("$term") searches are stored bare; return_html
#              relies on that format when it displays them.
#          %titles, %description - per-file title and summary text.
#          %HITS - maps "<zero-padded relevance><file>" to the file name.
#          $hitcount - incremented once per matching file.
#          $FORM{'terms'} is left holding the normalised query.
#
# Visitor-supplied text is always escaped with quotemeta before it is
# used in a pattern, so input such as "c++" or "(" is matched literally
# instead of being compiled as a regular expression.

# ---- 1. Normalise the query ------------------------------------------
my $query = $FORM{'terms'};
$query = '' unless defined($query);

# Convert multiple blank spaces to single spaces:
$query =~ s/\s+/ /g;
$query = " $query ";

# NOT becomes a minus sign, AND becomes a plus sign, OR is the default:
$query =~ s/ not / -/ig;
$query =~ s/ and / \+/ig;
$query =~ s/ or / /ig;

# An asterisk anywhere in the query disables matching altogether and
# lists every file instead:
$check = 'true' unless ($query =~ /\*/);

# ---- 2. Keep quoted phrases together ---------------------------------
# Pieces between double quotes alternate outside/inside, starting
# outside.  Blanks inside a quoted piece are masked with a placeholder so
# the phrase survives the split on whitespace below.
my $placeholder = '%%%==%%%';
my $in_quotes = 0;
my $masked = '';
foreach my $piece (split(/\"/,$query))
{
    my $chunk = $piece;
    $chunk =~ s/ /$placeholder/g if ($in_quotes);
    $masked .= $chunk;
    $in_quotes = !$in_quotes;
}
$FORM{'terms'} = $masked;

# ---- 3. Classify the terms -------------------------------------------
my %regex_for;     # stored term => compiled pattern used for matching
foreach my $token (split(/\s+/,$masked))
{
    # Skip null entries (first and last)
    next if ($token eq '');

    # Unmask grouped terms:
    my $text = $token;
    $text =~ s/\Q$placeholder\E/ /g;

    my $kind = 'optional';
    if    ($text =~ s/^\+//) { $kind = 'required'; }
    elsif ($text =~ s/^-//)  { $kind = 'forbidden'; }

    # A leading dollar sign requests a substring search rather than a
    # whole-word search.
    my $whole_word = ($text =~ s/^\$//) ? 0 : 1;

    # A bare operator ("+", "-", "$") carries nothing to search for.
    next if ($text eq '');

    my $stored = $whole_word ? '\W' . $text . '\W' : $text;
    $regex_for{$stored} = _term_regex($text,$whole_word);

    if ($kind eq 'required')
        {
        push(@required,$stored);
        $required_terms_present = "you bet";
        }
    elsif ($kind eq 'forbidden')
        {
        push(@forbidden,$stored);
        $forbidden_terms_present = "fraid so";
        }
    else
        {
        push(@optional,$stored);
        }
}

# ---- 4. Scan the files -----------------------------------------------
foreach my $path (@FILES)
{
# Files that cannot be read are skipped.
open(my $handle,'<',$path) or next;
my $raw = join(' ',<$handle>);
close($handle);
$raw =~ s/\n//g;

# $string is the text that gets searched: the document itself plus
# weighted copies of its title, description, keywords and URL.  Each copy
# is preceded by a blank so words at the seams do not run together.
my $string = $raw;

# Extract the title, if there is one.  The title text is already part of
# the document once, so only ($title_x - 1) extra copies are added:
if ($raw =~ /<title>(.*?)<\/title>/i)
{
    $titles{$path} = $1;
    $string .= _repeat_text($titles{$path},$title_x - 1);
}
$titles{$path} = $path unless $titles{$path};

# Extract the description, if there is one; otherwise use the first 25
# words of the visible text:
if ($raw =~ /<meta\s+name="description"\s+content="([^"]*)"/i)
{
    $description{$path} = $1;
    $string .= _repeat_text($description{$path},$description_x);
}
else
{
    my $visible = $raw;
    $visible =~ s/<title>.*?<\/title>//ig;
    $visible =~ s/<[^>]*>//g;
    my @words = grep { $_ ne '' } split(/\s+/,$visible);
    splice(@words,25) if (@words > 25);
    $description{$path} = join(' ',@words) . ' ...';
}

# Extract the keywords, if they exist:
if ($raw =~ /<meta\s+name="keywords"\s+content="([^"]*)"/i)
{
    my $keywords = $1;
    $string .= _repeat_text($keywords,$keywords_x);
}

# Weight the filename as needed:
$string .= _repeat_text("$baseurl$path",$name_x);

# Now that we're done with the special HTML tags, strip HTML tags
# from the text so that they aren't used in the search:
$string =~ s/<[^>]*>//g;

my $verdict = '';   # 'yes' = list the file, 'no' = exclude it
my $score = 0;      # total number of term occurrences found

if ($check)
{
    # Optional terms: any single match is enough to list the file.
    foreach my $term (@optional)
    {
        my $regex = $regex_for{$term};
        next unless defined($regex);
        my $hits = () = $string =~ /$regex/g;
        $verdict = 'yes' if ($hits);
        $score += $hits;
    }

    # Required terms: every one of them must match.
    foreach my $term (@required)
    {
        my $regex = $regex_for{$term};
        next unless defined($regex);
        my $hits = () = $string =~ /$regex/g;
        unless ($hits)
        {
            $verdict = 'no';
            last;
        }
        $verdict = 'yes';
        $score += $hits;
    }

    # Forbidden terms: a single match excludes the file.
    foreach my $term (@forbidden)
    {
        my $regex = $regex_for{$term};
        next unless defined($regex);
        if ($string =~ /$regex/)
        {
            $verdict = 'no';
            last;
        }
    }
}
else
{
    # Allow for wildcard-triggered listing:
    $verdict = 'yes';
}

# Format for relevance.  The number is zero-padded to a fixed width so
# that sorting the keys as plain strings also orders them numerically.
if ($verdict eq 'yes')
{
    my $rank = sprintf("%012.3f",$score/1000);
    $HITS{"$rank$path"} = $path;
    $hitcount++;
}

} # End loop through all files.
} # End search procedure.


sub _term_regex
{
# Compile one search term into a pattern.  The term is quoted so that it
# matches literally.  Whole-word terms may not touch a word character on
# either side (they may sit at the very start or end of the text).  The
# match is case-sensitive only when the term contains a capital letter.
my ($text,$whole_word) = @_;
my $pattern = quotemeta($text);
$pattern = '(?<!\w)' . $pattern . '(?!\w)' if ($whole_word);
return ($text =~ /[A-Z]/) ? qr/$pattern/ : qr/$pattern/i;
}


sub _repeat_text
{
# Return $times copies of $text, each preceded by a single blank.
my ($text,$times) = @_;
my $copies = '';
for (my $i = 0; $i < $times; $i++)
{
    $copies .= " $text";
}
return $copies;
}


      
sub return_html
{
# Log a summary of the search to $summary_file and print the results
# page (HTTP header included) to standard output.
#
# Reads:  $hitcount, %HITS, %titles, %description, @optional, @required,
#         @forbidden (as left by search), plus the configuration globals
#         $summary_file, $basedir, $baseurl, $cgi_url, $link_url and
#         $link_title.
#
# Search terms come straight from the visitor, so they are HTML-escaped
# before being written to the page or to the summary file.

# First we build a summary for the webmaster and the visitor:
my $docstring = "$hitcount Documents";
$docstring = "One Document" if ($hitcount == 1);
$docstring = "No Documents" unless ($hitcount);

my $summary = "<h3><I>Search Results: <TT>$docstring Found</TT></I></h3>\n";
$summary .= "<BLOCKQUOTE>\n<PRE>\n";
$summary .= "     Optional Terms:  " . _format_terms(@optional) if (@optional);
$summary .= "\n     Required Terms:  " . _format_terms(@required) if (@required);
$summary .= "\n    Forbidden Terms:  " . _format_terms(@forbidden) if (@forbidden);
$summary .= "\n</PRE></BLOCKQUOTE>\n";

# Logging is best effort: a summary file that cannot be opened must not
# stop the visitor from getting results, but it is reported on STDERR.
my $visitor = $ENV{'REMOTE_HOST'} || $ENV{'REMOTE_ADDR'} || 'unknown host';
if (open(my $log,'>>',$summary_file))
{
    print $log "Search by " . _escape_html($visitor) . ":<BR>\n";
    print $log $summary;
    close($log);
}
else
{
    warn "Cannot append to summary file $summary_file: $!\n";
}

# Now that the webmaster knows what's going on, we print the
# results for the visitor:


print "Content-type: text/html\n\n";

print <<EOM;

<HTML>

<HEAD>
<TITLE>Results of Your Search</TITLE>


</head>

<body>


$summary

<font class=grey3>

<DL>
EOM

if ($hitcount > 0)
{
# Most relevant first.  Keys start with the relevance number; compare
# that number numerically and fall back to the whole key for ties.
my @ranked = sort { (_hit_rank($b) <=> _hit_rank($a)) or ($b cmp $a) } keys %HITS;
foreach my $key (@ranked)
{
    my $file = $HITS{$key};
    my $size = -s "$basedir$file";
    $size = 0 unless defined($size);
    if ($size > 1500)
        {$size = int($size/1000) . " K";}
    else
        {$size = "$size bytes";}
    my $last = &Last_Modified("$basedir$file");
    print "<P><DT><font class=grey2><a href=\"$baseurl$file\">$titles{$file}</a></DT>\n";
    print "<DD><font class=grey2>$description{$file}<BR>\n";
    print "<CITE><font class=grey2><A HREF=\"$baseurl$file\">$baseurl$file</A><FONT SIZE=-1>";
    print " - $size - $last</FONT></CITE></DD></i>\n";
}
}
else
{
print <<EOM;
<BLOCKQUOTE>Unfortunately, we didn't find any documents which 
matched your search terms. You may want to visit our 
<A HREF="$cgi_url?tips">search tips</A> page to better refine your 
queries.</BLOCKQUOTE>
EOM
}

# The "New Search" form submits with METHOD=GET so that the "terms"
# field arrives in the query string, which is where this script looks
# for it.
print <<EOM;
</DL>

<CENTER>
<BR><BR><FORM METHOD=GET ACTION="$cgi_url">
<INPUT TYPE=TEXT NAME="terms" SIZE=40>
<INPUT TYPE=SUBMIT VALUE="New Search"></FORM>
</CENTER>

<BR><H5 ALIGN=CENTER>
<A HREF="$cgi_url?tips">Search Tips</A> - 
<A HREF="$link_url">$link_title</A>
<HR SIZE=1 NOSHADE WIDTH=50\%>
Intermediate Search, Version 1.1 is Copyright 1997 (freeware) by
<A HREF="http://www.xav.com">Fluid Dynamics</A>.<BR>
Visit the 
<A HREF="http://www.xav.com/scripts/search">Search Page</A>
for help files and most recent version.</H5></BODY></HTML>
EOM
}


sub _format_terms
{
# Turn stored search terms into a comma-separated, HTML-safe list.
# Whole-word terms are stored wrapped in a literal \W marker, which is
# removed for display; terms without the marker (string searches) are
# shown in italics.
my @shown;
foreach my $stored (@_)
{
    my $text = $stored;    # work on a copy: @_ aliases the caller's array
    my $string_search = ($text !~ /^\\W/);
    $text =~ s/\\W//g;
    $text = _escape_html($text);
    $text = "<I>$text</I>" if ($string_search);
    push(@shown,$text);
}
return join(', ',@shown);
}


sub _escape_html
{
# Replace the characters that are special in HTML with entities.
my $text = shift;
$text = '' unless defined($text);
$text =~ s/&/&amp;/g;
$text =~ s/</&lt;/g;
$text =~ s/>/&gt;/g;
$text =~ s/"/&quot;/g;
return $text;
}


sub _hit_rank
{
# Return the numeric relevance that prefixes a %HITS key (0 if absent).
my $key = shift;
return ($key =~ /^(\d+\.\d{3})/) ? $1 : 0;
}


sub Last_Modified
# This wonderful snippet was written by Jeff Carnahan of Terminal 
# Productions (www.terminalp.com)
#
# Takes a file name and returns its modification date in local time as
# "D Mon YYYY" (for example "3 Feb 2001").  Returns "unknown" when the
# file cannot be examined.
{
my $path = shift;

# Element 9 of stat's result is the modification time; the list is empty
# when stat fails, which leaves $modified undefined.
my $modified = (stat($path))[9];
return 'unknown' unless defined($modified);

my ($mday,$mon,$year) = (localtime($modified))[3,4,5];
my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

# localtime reports the year as an offset from 1900.
return "$mday $months[$mon] " . ($year + 1900);
}


sub bad_base
{
# Tell the browser that the configured $basedir could not be found, then
# end the script.  Never returns.
my @report = (
    "Content-type: text/html\n\n",
    "I tried to find the base directory you specified:\n",
    "<BLOCKQUOTE><PRE>$basedir</PRE></BLOCKQUOTE>\n",
    "But the system told me that it did not exist.\n"
);
print join('',@report);
exit;
}