search.cgi

#!/usr/bin/perl

# Intermediate Search, Version 1.1
#	Copyright 1997 by Fluid Dynamics <xav.com>
#	You are free to use the script, but please ask before you 
#	distribute it.
#
# For latest version and help files, visit:
#	http://www.xav.com/scripts/search/
# __________________________________________________________________


# The directory that holds all of your files.  Keep the trailing slash:
# the script builds file paths by appending a file name to this value.
$basedir = '/home/httpd/';

# The URL corresponding to the base directory (with its trailing slash).
$baseurl = 'http://www.lawsonhelp.com/';

# The files your visitors may search, given as patterns relative to
# $basedir.  The asterisk is a wildcard.  Visit the readme file for more
# customizing information.  Two alternative layouts are kept here for
# reference:

#@files = ('public/*.html',
#	  'public/*/*.html',
#	  'client/*.html',
#	  'client/*/*.html');

#@files = ('public/*/*.html',
#	  'client/*/*.html',
#	  'client/*/*.txt',
#	  'client/*/*.txt'
#	  );

@files = ('public/docu/*.html');

# The search log.  A summary of every search is appended to this file,
# so you'll know what people are really looking for when they come to
# your site.  The web server must be allowed to write to it.  Keep it
# outside the web tree (or in a hidden or secure directory) so that
# others can't read it.
$summary_file = '/tmp/summaries.txt';

# The URL and title of your main web page; both pages link back to it.
$link_url = 'http://www.lawsonhelp.com/';
$link_title = 'LawsonHelp.com';

# Set this to 'on' to add a small piece of JavaScript to the search form
# that puts the cursor into the search box.  If your visitors can't
# handle JavaScript, better leave it 'off'.
$java_toys = 'off';

# The URL of the picture shown on the search form.
$searchpict = 'http://www.lawsonhelp.com/public/images/logo.jpg';

# The address the search forms submit to.  Change this to the full URL
# only if you rename this script.
$cgi_url = 'search.cgi';


# Options for Weighted Search:
#
# All occurrences of a search term count as one point.  The occurrence 
# of a term in the filename, title, META keywords, or META description 
# can have added weight (equivalent to a multiplier per hit).  Enter 
# the multipliers in the list below, in the order filename, title, 
# keywords, description - the defaults are (2,2,4,2).  If this makes no 
# sense to you, just ignore it and leave the defaults as they are - they 
# work pretty well.  Note that this will give extra weight to those 
# pages that have a properly formatted title and META tags, even if 
# they contain the same basic information.

($name_x, $title_x, $keywords_x, $description_x) = (2,2,4,2);


# No further editing is necessary, but feel free to play around...
# Note that much of the code below is straight HTML, and very easy to 
# modify if you know a little about HTML programming.
# 
# __________________________________________________________________




# Collect the submitted form fields into %FORM.  Fields may arrive in the
# query string (GET links such as "search.cgi?terms=dogs") and/or in the
# request body (the POST forms printed by this script); when a field
# appears in both, the body wins.
$buffer = '';
if (defined($ENV{'CONTENT_LENGTH'}) && $ENV{'CONTENT_LENGTH'} =~ /^\d+$/
    && $ENV{'CONTENT_LENGTH'} > 0)
	{
	read(STDIN,$buffer,$ENV{'CONTENT_LENGTH'});
	}
$query_string = defined($ENV{'QUERY_STRING'}) ? $ENV{'QUERY_STRING'} : '';

@pairs = split(/&/,"$query_string&$buffer");
foreach my $pair (@pairs)
	{
	next if ($pair eq '');

	# Split on the first "=" only, so a value may itself contain "=":
	my ($name,$value) = split(/=/,$pair,2);
	$value = '' unless (defined($value));

	# Both the field name and its value are URL-encoded:
	foreach my $part ($name,$value)
		{
		$part =~ tr/+/ /;
		$part =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
		}
	$FORM{$name} = $value;
	}


# Run a search when the visitor typed something other than blanks
# (a query of "0" counts as a query); otherwise show the search form.
if (defined($FORM{'terms'}) && $FORM{'terms'} =~ /\S/)
	{
	get_files();
	search();
	return_html();
	}
else
	{
	prompt();
	}


# Prints the search form together with the search tips.
# Reads the settings $searchpict, $cgi_url, $java_toys, $link_url and
# $link_title; writes a complete HTML page, CGI header included, to
# standard output.
sub prompt
{
# Optional JavaScript that puts the cursor into the search box; it is
# only emitted when $java_toys is switched 'on'.
my $focus_script = '';
if ($java_toys eq 'on')
	{
	$focus_script = join("\n",
		'<SCRIPT LANGUAGE="JavaScript">',
		'<!-- script hiding...',
		'document.searchform.terms.focus();',
		'// End hiding -->',
		'</SCRIPT>',
		'');
	}

# The page is the CGI header, the form, the optional script and the tips:
print "Content-type: text/html\n\n", <<FORM_PART, $focus_script, <<TIPS_PART;
<HTML>
<HEAD>
<TITLE>Intermediate Search</TITLE>
<META NAME="description" CONTENT="Try out our internal search engine 
	for the fastest way to find what you're looking for!">
</HEAD>

<BODY BGCOLOR="#FFFFFF" TEXT="#000000" LINK="#CE0000" ALINK="#000000" VLINK="#880000">
<BR><BLOCKQUOTE>
<IMG SRC="$searchpict" ALIGN=RIGHT HSPACE=20 WIDTH=253 HEIGHT=144 
BORDER=1 ALT="Picture of Search Aircraft (trying to build a theme)">
<B>Enter a few keywords to search our site, or read our 
<A HREF="#tips">search tips</A> to make advanced queries.</B><BR>
<FORM METHOD=POST ACTION="$cgi_url" NAME="searchform">
<INPUT TYPE=TEXT NAME="terms" SIZE=30>
<INPUT TYPE=SUBMIT VALUE="Search!"><BR>
</BLOCKQUOTE>
FORM_PART
<A NAME="tips"></A>
</FORM>
<BR><H2><TT>Tips, tips and more tips!</TT></H2>
<BLOCKQUOTE>
Each term may be preceded by the standard Boolean operators 
<TT>not</TT>, <TT>and</TT>, or <TT>or</TT>. If you search for 
"<TT>dogs not pizzas</TT>", you'll find <I>all</I> documents 
containing the word "<TT>dogs</TT>" <I>except</I> those 
documents which <I>also</I> contain the word "<TT>pizzas</TT>". If 
you type in "<TT>and hot and dog and pizzas</TT>", you'll find 
<I>only</I> those documents which contain <I>all three</I> search 
terms. The default value is <TT>or</TT>. Thus, a search for 
"<TT>hot dog pizzas</TT>" would return pages <I>with at least 
one</I> of the three terms.<P>

Altavista's shorthand notation works too. A search on "<TT>dogs 
-hot</TT>" is equivalent to the first example, and "<TT>+hot 
+dog +pizzas</TT>" will return the same documents as the second.<P>

If a search term has at least one capital letter, like "<TT>parIS</TT>", 
the search will be case sensitive with respect to that word - that is, 
only documents containing "<TT>parIS</TT>" will be found. On the other 
hand, lowercase words like "<TT>paris</TT>" will generate hits 
from "<TT>Paris</TT>", "<TT>PARIS</TT>", or "<TT>parIS</TT>".<P>

To group a collection of words, use quotes.  For example, the query 
<TT>"Zoltan Milosevic"</TT> (quotes included) would not generate a hit 
from "Slobodan Milosevic met with Zoltan Smith". Without quotes, 
the sentence would count. Boolean operators can also 
act on quotations: a search on '<TT>+the +kitten not "the 
kitten"</TT>' would return only those documents where 
"<TT>the</TT>" and "<TT>kitten</TT>" appear separately.<P>

Intermediate Search finds words, not strings. A search for 
"<TT>in</TT>" would turn up only that word, not "<TT>bin</TT>",
"<TT>inside</TT>", or "<TT>acquaintance</TT>". To perform a 
string search, preface your term with the dollar sign - a 
query on "<TT>\$in</TT>" would find all words lists above. Note 
that more complex wildcard searches using the asterisk are 
<I>not</I> permitted. Including the asterisk in your query will 
return a list of all files, but that's its only function.<P>

These rules are based on <A HREF="http://www.altavista.digital.com">
Altavista's</A> query syntax; a look at their <A 
HREF="http://www.altavista.digital.com/cgi-bin/query?pg=h">
Search Tips</A> may prove useful. The original 
Simple Search was created by Matt Wright and can be found at <a 
href="http://www.worldwidemart.com/scripts/">Matt's Script 
Archive</a>. Like Matt's script, our version is freeware and can 
easily be set up on most websites.</BLOCKQUOTE>

<BR><H5 ALIGN=CENTER>
<A HREF="$link_url">$link_title</A>
<HR SIZE=1 NOSHADE WIDTH=50\%>
Intermediate Search, Version 1.1 is Copyright 1997 (freeware) by
<A HREF="http://www.xav.com">Fluid Dynamics</A>.<BR>
Visit the 
<A HREF="http://www.xav.com/scripts/search">Search Page</A>
for help files and most recent version.</H5></BODY></HTML>
TIPS_PART
}




# Fills the global list @FILES with every text file named by the
# patterns in @files.
#
# The patterns are taken relative to $basedir, which becomes the current
# directory.  A pattern that names a directory selects the text files
# directly inside it; any other pattern is expanded as a wildcard.  If
# $basedir is not a directory we can enter, bad_base() reports the
# problem and ends the script.
#
# Patterns are expanded with Perl's own glob() instead of an external
# "ls", so file names containing blanks stay in one piece.
sub get_files
{
@FILES = ();

bad_base() unless (-d $basedir && chdir($basedir));

foreach my $pattern (@files)
	{
	if (-d $pattern)
		{
		# Join directory and entry with exactly one slash, whether
		# or not the pattern was written with a trailing slash:
		(my $dir = $pattern) =~ s{/+$}{};

		my $dh;
		next unless (opendir($dh, $pattern));
		foreach my $entry (sort(readdir($dh)))
			{
			# Hidden entries (and "." / "..") are never listed:
			next if ($entry =~ /^\./);

			my $path = "$dir/$entry";
			push(@FILES, $path) if (-f $path && -T $path);
			}
		closedir($dh);
		}
	else
		{
		foreach my $path (glob($pattern))
			{
			push(@FILES, $path) if (-f $path && -T $path);
			}
		}
	}
}

# Parses the visitor's query in $FORM{'terms'} and tests every file in
# @FILES against it.
#
# Results are left in these globals for return_html():
#   @optional, @required, @forbidden
#              the parsed terms.  A word-search term is stored wrapped
#              in a literal \W...\W; a "$" string-search term is stored
#              bare.
#   %titles, %description
#              title and summary text of each file that was read.
#   %HITS      sort key (zero-padded relevance followed by the file
#              name) => file name, one entry per matching file.
#   $hitcount  number of matching files.
#
# Search terms are always matched as literal text: they are escaped
# before being compiled, so punctuation such as "c++" or "(" cannot
# break or alter the pattern.  A term is matched case-sensitively only
# when it contains a capital letter.
sub search
{
# Start from a clean slate so results never leak from an earlier call:
@optional = ();
@required = ();
@forbidden = ();
%HITS = ();
$hitcount = 0;

# Compiled patterns, kept in step with the three term lists:
my (@optional_re, @required_re, @forbidden_re);

# Convert multiple blank spaces to single spaces:
$FORM{'terms'} =~ s/\s+/ /g;
$FORM{'terms'} = " $FORM{'terms'} ";

# Convert NOT statements to minus signs:
$FORM{'terms'} =~ s/ not / -/ig;

# Convert AND statements to plus signs:
$FORM{'terms'} =~ s/ and / \+/ig;

# Strip OR statements (OR is the default):
$FORM{'terms'} =~ s/ or / /ig;

# An asterisk anywhere in the query switches term matching off; every
# file is then listed (see the end of the file loop).
$check = ($FORM{'terms'} =~ /\*/) ? '' : 'true';

# Correct for grouped entries: blanks inside a pair of quotes are
# masked with a placeholder so the phrase survives the split on blanks.
my $placeholder = '%%%==%%%';
my @segments = split(/\"/,$FORM{'terms'});
$FORM{'terms'} = "";
for (my $i = 0; $i < @segments; $i++)
	{
	# Odd-numbered segments are the ones between two quotes:
	$segments[$i] =~ s/ /$placeholder/g if ($i % 2);
	$FORM{'terms'} .= $segments[$i];
	}

foreach my $raw (split(/\s+/,$FORM{'terms'}))
	{
	# Skip null entries (first and last)
	next if ($raw eq '');

	# Unmask grouped terms:
	my $term = $raw;
	$term =~ s/\Q$placeholder\E/ /g;

	# A leading + or - decides which list the term belongs to:
	my $kind = 'optional';
	if ($term =~ s/^\+//)
		{$kind = 'required';}
	elsif ($term =~ s/^-//)
		{$kind = 'forbidden';}

	# A leading dollar sign asks for a string search, not a word search:
	my $whole_word = ($term =~ s/^\$//) ? 0 : 1;

	# A lone operator or dollar sign leaves nothing to search for:
	next if ($term eq '');

	my $stored = $whole_word ? '\W' . $term . '\W' : $term;
	my $regex = search_term_regex($term, $whole_word);

	if ($kind eq 'required')
		{
		push(@required,$stored);
		push(@required_re,$regex);
		}
	elsif ($kind eq 'forbidden')
		{
		push(@forbidden,$stored);
		push(@forbidden_re,$regex);
		}
	else
		{
		push(@optional,$stored);
		push(@optional_re,$regex);
		}
	}


foreach my $file (@FILES)
	{
	# Files that cannot be opened are left out of the results:
	my $string = search_load_document($file);
	next unless (defined($string));

	$include{$file} = '';
	$relevance{$file} = 0;

	if ($check)
		{
		# Optional terms: one hit is enough to include the file.
		foreach my $regex (@optional_re)
			{
			my $hits = () = $string =~ /$regex/g;
			next unless ($hits);
			$include{$file} = 'yes';
			$relevance{$file} += $hits;
			}

		# Required terms: every one of them must be present.
		foreach my $regex (@required_re)
			{
			my $hits = () = $string =~ /$regex/g;
			unless ($hits)
				{
				$include{$file} = 'no';
				last;
				}
			$include{$file} = 'yes';
			$relevance{$file} += $hits;
			}

		# Forbidden terms: a single hit excludes the file.
		foreach my $regex (@forbidden_re)
			{
			if ($string =~ /$regex/)
				{
				$include{$file} = 'no';
				last;
				}
			}
		}
	else
		{
		# Allow for wildcard-triggered listing:
		$include{$file} = 'yes';
		}

	# Format for relevance.  The number is zero-padded to a fixed
	# width so that sorting the keys as strings ranks them correctly:
	if ($include{$file} eq 'yes')
		{
		my $rank = sprintf("%012.3f",($relevance{$file}/1000));
		$HITS{"$rank$file"} = "$file";
		$hitcount++;
		}
	} # End loop through all files.
} # End search procedure.


# Compiles one search term into a regular expression.
#   $text       - the term as typed, operators already removed
#   $whole_word - true for a word search, false for a string search
# The text is escaped with quotemeta(), so it only ever matches itself.
# A word search must be surrounded by non-word characters.  The match
# ignores case unless the term contains a capital letter.
sub search_term_regex
{
my ($text, $whole_word) = @_;

my $pattern = quotemeta($text);
$pattern = '\W' . $pattern . '\W' if ($whole_word);

return ($text =~ /[A-Z]/) ? qr/$pattern/ : qr/$pattern/i;
}


# Reads one file and returns the text to run the search terms against,
# or nothing if the file cannot be opened.  Records the file's title in
# %titles and its summary in %description along the way.
#
# The returned text is the document with its HTML tags removed, followed
# by extra copies of the title, META description, META keywords and
# file URL according to the weights $title_x, $description_x,
# $keywords_x and $name_x.  It is padded with a blank at both ends so a
# word search also matches the very first and very last word.
sub search_load_document
{
my ($file) = @_;

my $fh;
return unless (open($fh, '<', $file));
my $string = join(' ', <$fh>);
close($fh);
$string =~ s/\n//g;

# Weighted extras are collected separately and added at the end, each
# one set off by a blank so neighbouring words do not run together.
my $bonus = '';

# Extract the title, if there is one.  The title itself is part of the
# document text, so only the additional copies are added here:
my $title = ($string =~ /<title>(.*?)<\/title>/i) ? $1 : '';
$bonus .= " $title" x ($title_x - 1) if ($title ne '' && $title_x > 1);
$titles{$file} = $title ? $title : $file;

# Extract the description, if there is one:
if ($string =~ /<meta\s+name="description"\s+content="([^"]*)"/i)
	{
	my $meta = $1;
	$description{$file} = $meta;
	$bonus .= " $meta" x $description_x if ($description_x > 0);
	}
else
	{
	# No META description: use the first 25 words of the page text.
	my $text = $string;
	$text =~ s/<title>.*?<\/title>//ig;
	$text =~ s/<[^>]*>//g;
	my @words = split(' ', $text);
	splice(@words, 25) if (@words > 25);
	$description{$file} = join(' ', @words) . " ...";
	}

# Extract the keywords, if they exist:
if ($string =~ /<meta\s+name="keywords"\s+content="([^"]*)"/i)
	{
	my $keywords = $1;
	$bonus .= " $keywords" x $keywords_x if ($keywords_x > 0);
	}

# Weight the filename as needed:
$bonus .= " $baseurl$file" x $name_x if ($name_x > 0);

# Now that we're done with the special HTML tags, strip HTML tags so
# that they aren't used in the search:
$string = " $string$bonus ";
$string =~ s/<[^>]*>//g;

return $string;
}


      
# Prints the result page for the search that search() has just run and
# appends a summary of the query to the log file $summary_file.
#
# Reads the globals @optional, @required, @forbidden, %HITS, $hitcount,
# %titles and %description, plus the settings $basedir, $baseurl,
# $cgi_url, $link_url and $link_title.
#
# Everything that originates from the visitor (the search terms and the
# host name) is HTML-escaped before it is written to the page or the log.
# Logging is best effort: if the log file cannot be opened, the visitor
# still gets the result page.
sub return_html
{
# First we build a summary for the webmaster and the visitor:
my $docstring = "$hitcount Documents";
$docstring = "One Document" if ($hitcount == 1);
$docstring = "No Documents" unless ($hitcount);

my $summary = "<H2><TT>Search Results: $docstring Found</TT></H2>\n";
$summary .= "<BLOCKQUOTE>\n<PRE>\n";
$summary .= "     Optional Terms:  " . return_html_terms(@optional) if (@optional);
$summary .= "\n     Required Terms:  " . return_html_terms(@required) if (@required);
$summary .= "\n    Forbidden Terms:  " . return_html_terms(@forbidden) if (@forbidden);
$summary .= "\n</PRE></BLOCKQUOTE>\n";

# REMOTE_HOST is often not provided by the server; fall back to the
# visitor's address in that case.
my $visitor = $ENV{'REMOTE_HOST'};
$visitor = $ENV{'REMOTE_ADDR'} unless (defined($visitor) && $visitor ne '');
$visitor = 'unknown host' unless (defined($visitor) && $visitor ne '');

my $log;
if (open($log, '>>', $summary_file))
	{
	print $log "Search by " . return_html_escape($visitor) . ":<BR>\n";
	print $log $summary;
	close($log);
	}

# Now that the webmaster knows what's going on, we print the 
# results for the visitor:

print "Content-type: text/html\n\n";
print <<EOM;
<HTML>
<HEAD><TITLE>Results of Your Search</TITLE></HEAD>
<BODY BGCOLOR="#FFFFFF" TEXT="#000000" LINK="#0000EE" VLINK="#551A8B" ALINK="#FF0000">
$summary
<DL>
EOM

if ($hitcount > 0)
{
# The keys start with the relevance figure, so sorting them in
# reverse puts the most relevant file first.
foreach my $key (reverse sort keys %HITS)
	{
	my $file = $HITS{$key};

	# Locate the file on disk; exactly one slash joins $basedir and
	# the file name, whether or not $basedir ends with one.
	my $path = $file;
	unless ($path =~ m{^/})
		{
		(my $base = $basedir) =~ s{/*$}{/};
		$path = "$base$file";
		}

	my $size = -s $path;
	$size = 0 unless (defined($size));
	if ($size > 1500)
		{$size = int($size/1000) . " K";}
	else
		{$size = "$size bytes";}
	my $last = Last_Modified($path);
	print "<P><DT><a href=\"$baseurl$file\"><STRONG>$titles{$file}</STRONG></a></DT>\n";
	print "<DD>$description{$file}<BR>\n";
	print "<CITE><A HREF=\"$baseurl$file\">$baseurl$file</A><FONT SIZE=-1>";
	print " - $size - $last</FONT></CITE></DD>\n";
	}
}
else
{
print <<EOM;
<BLOCKQUOTE><B>Unfortunately, we didn't find any documents which 
matched your search terms. You may want to visit our 
<A HREF="$cgi_url?tips">search tips</A> page to better refine your 
queries.</B></BLOCKQUOTE>
EOM
}
print <<EOM;
</DL>

<CENTER>
<BR><BR><FORM METHOD=POST ACTION="$cgi_url">
<INPUT TYPE=TEXT NAME="terms" SIZE=40>
<INPUT TYPE=SUBMIT VALUE="New Search"></FORM>
</CENTER>

<BR><H5 ALIGN=CENTER>
<A HREF="$cgi_url?tips">Search Tips</A> - 
<A HREF="$link_url">$link_title</A>
<HR SIZE=1 NOSHADE WIDTH=50\%>
Intermediate Search, Version 1.1 is Copyright 1997 (freeware) by
<A HREF="http://www.xav.com">Fluid Dynamics</A>.<BR>
Visit the 
<A HREF="http://www.xav.com/scripts/search">Search Page</A>
for help files and most recent version.</H5></BODY></HTML>
EOM
}


# Returns $text with the characters that are special in HTML replaced
# by character entities, so it is displayed literally.
sub return_html_escape
{
my ($text) = @_;
$text = '' unless (defined($text));
$text =~ s/&/&amp;/g;
$text =~ s/</&lt;/g;
$text =~ s/>/&gt;/g;
$text =~ s/"/&quot;/g;
return $text;
}


# Formats a list of parsed search terms for display, separated by
# commas.  A word-search term arrives wrapped in a literal \W...\W and
# is shown without the wrapper; any other term is a string search and
# is shown in italics.  The caller's list is not modified.
sub return_html_terms
{
my (@terms) = @_;
my @shown;
foreach my $term (@terms)
	{
	my $text = $term;
	if ($text =~ s/^\\W//)
		{
		$text =~ s/\\W$//;
		push(@shown, return_html_escape($text));
		}
	else
		{
		push(@shown, '<I>' . return_html_escape($text) . '</I>');
		}
	}
return join(', ', @shown);
}


# Returns the date a file was last modified, in local time, formatted
# like "15 Jun 2020".  Returns "date unknown" when the file cannot be
# examined.
#   $path - the name of the file to look at
# Based on a snippet written by Jeff Carnahan of Terminal Productions
# (www.terminalp.com).
sub Last_Modified
{
my ($path) = @_;

my $modified = (stat($path))[9];
return 'date unknown' unless (defined($modified));

my ($mday, $mon, $year) = (localtime($modified))[3, 4, 5];
my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

# localtime() counts years from 1900, so add that back for display:
return "$mday $months[$mon] " . ($year + 1900);
}


# Tells the visitor that the configured base directory ($basedir) could
# not be found, then ends the script.  Never returns to the caller.
sub bad_base
{
my @report = (
	"Content-type: text/html\n\n",
	"I tried to find the base directory you specified:\n",
	"<BLOCKQUOTE><PRE>$basedir</PRE></BLOCKQUOTE>\n",
	"But the system told me that it did not exist.\n",
);
print @report;
exit;
}