#!/usr/bin/perl
#
# search.cgi
#
# Intermediate Search, Version 1.1
# Copyright 1997 by Fluid Dynamics <xav.com>
# You are free to use the script, but please ask before you
# distribute it.
#
# For latest version and help files, visit:
# http://www.xav.com/scripts/search/
# __________________________________________________________________
#
# NOTE: the configuration values and the search results (@optional,
# @required, @forbidden, %HITS, %titles, %description, $hitcount) are
# package globals on purpose: return_html and bad_base, defined further
# down in this file, read them directly.

$basedir = '/home/httpd';
# The directory location of all your files.  A trailing slash is
# appended automatically (see below) if you leave it off.

$baseurl = 'http://www.lawsonhelp.com/';
# The URL corresponding to the base directory.

#@files = ('public/*.html',
#          'public/*/*.html',
#          'client/*.html',
#          'client/*/*.html');
#@files = ('public/*/*.html',
#          'client/*/*.html',
#          'client/*/*.txt',
#          'client/*/*.txt'
#         );
@files = ('public/docu/*.html');
# These are all the files that will be listed.  The asterisk is a
# wildcard - it will list all files and directories.

$summary_file = '/tmp/summaries.txt';
# Make this writable chmod(777 summaries.html) and hide it well!
# It holds the results of everybody's searches so you'll know what
# people are really looking for when they come to your site.  We
# have placed our summary file in a non-web directory so others
# can't see it - you could put it in a hidden or secure directory.

$link_url   = 'http://www.lawsonhelp.com/';
$link_title = 'LawsonHelp.com';
# Enter the URL and title of your main web page.

$java_toys = 'off';
# If your visitors can't handle Java, better set this to 'off'.

$searchpict = 'http://www.lawsonhelp.com/public/images/logo.jpg';
# The URL of the E3 picture.

$cgi_url = 'search.cgi';
# Change this to the full URL only if your rename this script.

# The @files array above holds info on all the directories and filetypes
# you'd like your visitors to search.  Visit the readme file for more
# customizing information.

# Options for Weighted Search:
#
# All occurrences of a search term count as one point.  The occurrence
# of a term in the filename, title, META keywords, or META description
# can have added weight (equivalent to a multiplier per hit).  Enter
# the multipliers in the array below - the defaults are (2,2,4,2).  If
# this makes no sense to you, just ignore it and leave the defaults as
# they are - they work pretty well.  Note that this will give extra
# weight to those pages that have a properly formatted title and META
# tags, even if they contain the same basic information.

($name_x, $title_x, $keywords_x, $description_x) = (2,2,4,2);

# No further editing is necessary, but feel free to play around...
# Note that much of the code below is straight HTML, and very easy to
# modify if you know a little about HTML programming.
#
# __________________________________________________________________

# The rest of the script builds paths as "$basedir$file", so make sure
# the separator is there even when the setting above omits it.
$basedir .= '/' unless ($basedir =~ m{/$});

%FORM = _parse_form();

# A query made only of whitespace is treated like no query at all.
if (defined $FORM{'terms'} && $FORM{'terms'} =~ /\S/) {
    get_files();
    search();
    return_html();
}
else {
    prompt();
}

# Decode the submitted form into a name => value list.
#
# Reads the POST body from STDIN when CONTENT_LENGTH announces one and
# falls back to QUERY_STRING otherwise (plain GET links such as
# "search.cgi?terms=foo").  Both names and values are URL-decoded.
#
# Returns: a flat list suitable for assignment to a hash.
sub _parse_form {
    my $buffer = '';
    my $length = $ENV{'CONTENT_LENGTH'};

    if (defined $length && $length =~ /^\d+$/ && $length > 0) {
        read(STDIN, $buffer, $length);
    }
    elsif (defined $ENV{'QUERY_STRING'}) {
        $buffer = $ENV{'QUERY_STRING'};
    }

    my %form;
    foreach my $pair (split(/&/, $buffer)) {
        # Limit of 2 keeps any literal "=" inside the value intact.
        my ($name, $value) = split(/=/, $pair, 2);
        next unless (defined $name && length $name);
        $value = '' unless defined $value;

        # $part aliases $name and $value in turn, so both get decoded.
        foreach my $part ($name, $value) {
            $part =~ tr/+/ /;
            $part =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
        }
        $form{$name} = $value;
    }
    return %form;
}

# Print the search form followed by the search tips.
#
# The FORM is now closed before its enclosing BLOCKQUOTE; the original
# markup closed them in the wrong order.
sub prompt {
    print "Content-type: text/html\n\n";
    print <<"EOM";
<HTML>
<HEAD>
<TITLE>Intermediate Search</TITLE>
<META NAME="description" CONTENT="Try out our internal search engine for the fastest way to find what you're looking for!">
</HEAD>
<BODY BGCOLOR="#FFFFFF" TEXT="#000000" LINK="#CE0000" ALINK="#000000" VLINK="#880000">
<BR><BLOCKQUOTE>
<IMG SRC="$searchpict" ALIGN=RIGHT HSPACE=20 WIDTH=253 HEIGHT=144 BORDER=1 ALT="Picture of Search Aircraft (trying to build a theme)">
<B>Enter a few keywords to search our site, or read our <A HREF="#tips">search tips</A> to make advanced queries.</B><BR>
<FORM METHOD=POST ACTION="$cgi_url" NAME="searchform">
<INPUT TYPE=TEXT NAME="terms" SIZE=30>
<INPUT TYPE=SUBMIT VALUE="Search!"><BR>
</FORM>
</BLOCKQUOTE>
EOM

    if ($java_toys eq 'on') {
        # Put the cursor in the search box for script-capable browsers.
        print "<SCRIPT LANGUAGE=\"JavaScript\">\n";
        print "<!-- script hiding...\n";
        print "document.searchform.terms.focus();\n";
        print "// End hiding -->\n";
        print "</SCRIPT>\n";
    }

    print <<"EOM";
<A NAME="tips"></A>
<BR><H2><TT>Tips, tips and more tips!</TT></H2>
<BLOCKQUOTE>
Each term may be preceded by the standard Boolean operators
<TT>not</TT>, <TT>and</TT>, or <TT>or</TT>. If you search for
"<TT>dogs not pizzas</TT>", you'll find <I>all</I> documents containing
the word "<TT>dogs</TT>" <I>except</I> those documents which <I>also</I>
contain the word "<TT>pizzas</TT>". If you type in
"<TT>and hot and dog and pizzas</TT>", you'll find <I>only</I> those
documents which contain <I>all three</I> search terms. The default value
is <TT>or</TT>. Thus, a search for "<TT>hot dog pizzas</TT>" would return
pages <I>with at least one</I> of the three terms.<P>

Altavista's shorthand notation works too. A search on
"<TT>dogs -hot</TT>" is equivalent to the first example, and
"<TT>+hot +dog +pizzas</TT>" will return the same documents as the
second.<P>

If a search term has at least one capital letter, like
"<TT>parIS</TT>", the search will be case sensitive with respect to that
word - that is, only documents containing "<TT>parIS</TT>" will be found.
On the other hand, lowercase words like "<TT>paris</TT>" will generate
hits from "<TT>Paris</TT>", "<TT>PARIS</TT>", or "<TT>parIS</TT>".<P>

To group a collection of words, use quotes. For example, the query
<TT>"Zoltan Milosevic"</TT> (quotes included) would not generate a hit
from "Slobodan Milosevic met with Zoltan Smith". Without quotes, the
sentence would count. Boolean operators can also act on quotations: a
search on '<TT>+the +kitten not "the kitten"</TT>' would return only
those documents where "<TT>the</TT>" and "<TT>kitten</TT>" appear
separately.<P>

Intermediate Search finds words, not strings. A search for
"<TT>in</TT>" would turn up only that word, not "<TT>bin</TT>",
"<TT>inside</TT>", or "<TT>acquaintance</TT>". To perform a string
search, preface your term with the dollar sign - a query on
"<TT>\$in</TT>" would find all words lists above. Note that more complex
wildcard searches using the asterisk are <I>not</I> permitted. Including
the asterisk in your query will return a list of all files, but that's
its only function.<P>

These rules are based on <A HREF="http://www.altavista.digital.com">
Altavista's</A> query syntax; a look at their
<A HREF="http://www.altavista.digital.com/cgi-bin/query?pg=h">
Search Tips</A> may prove useful. The original Simple Search was created
by Matt Wright and can be found at
<a href="http://www.worldwidemart.com/scripts/">Matt's Script Archive</a>.
Like Matt's script, our version is freeware and can easily be set up on
most websites.</BLOCKQUOTE>

<BR><H5 ALIGN=CENTER>
<A HREF="$link_url">$link_title</A>
<HR SIZE=1 NOSHADE WIDTH=50%>
Intermediate Search, Version 1.1 is Copyright 1997 (freeware) by
<A HREF="http://www.xav.com">Fluid Dynamics</A>.<BR>
Visit the <A HREF="http://www.xav.com/scripts/search">Search Page</A>
for help files and most recent version.</H5></BODY></HTML>
EOM
}

# Fill @FILES with every searchable text file named by @files.
#
# Paths are relative to $basedir, which becomes the working directory.
# An entry of @files that is itself a directory contributes the files
# directly inside it; any other entry is expanded as a glob pattern.
# Expansion uses Perl's own glob/readdir rather than shelling out to
# "ls", so file names containing spaces survive and nothing is passed
# to a shell.  A file matched by several patterns is listed only once.
sub get_files {
    bad_base() unless (-e $basedir);
    # Without a successful chdir we would search whatever directory the
    # web server happened to start us in.
    chdir($basedir) or bad_base();

    my %seen;
    foreach my $pattern (@files) {
        my @candidates;

        if (-d $pattern) {
            my $dir = $pattern;
            $dir .= '/' unless ($dir =~ m{/$});
            if (opendir(my $dh, $dir)) {
                # Skip dot files, just as a plain "ls" did.
                @candidates = map { "$dir$_" }
                              sort grep { !/^\./ } readdir($dh);
                closedir($dh);
            }
        }
        else {
            @candidates = glob($pattern);
        }

        foreach my $candidate (@candidates) {
            next if ($seen{$candidate}++);
            push(@FILES, $candidate) if (-f $candidate && -T $candidate);
        }
    }
}

# Build the compiled pattern for one search term.
#
#   $text       - the term as typed, without +, - or $ prefix
#   $whole_word - true to match whole words only, false for substrings
#
# The term is escaped with quotemeta, so characters such as "+", "(" or
# "*" typed by a visitor are matched literally and can never break or
# subvert the regex.  Words of a quoted phrase may be separated by any
# run of whitespace in the document.  A term with at least one capital
# letter is matched case-sensitively, anything else ignores case.
#
# Returns: a qr// object, or undef when the term contains no words.
sub _term_pattern {
    my ($text, $whole_word) = @_;

    my @words = split(' ', $text);
    return undef unless (@words);

    my $body = join('\s+', map { quotemeta($_) } @words);
    # Look-arounds instead of \W...\W: they also succeed at the very
    # start or end of the text and do not swallow the separator that
    # two adjacent hits share.
    $body = '(?<!\w)' . $body . '(?!\w)' if ($whole_word);

    return ($text =~ /[A-Z]/) ? qr/$body/ : qr/$body/i;
}

# Return " $text" repeated $copies times ('' when there is nothing to
# add).  The leading blank keeps repeated copies from fusing into one
# long word when they are appended to the searchable text.
sub _weighted_copies {
    my ($text, $copies) = @_;
    return '' unless (defined $text && length $text && $copies > 0);
    return " $text" x $copies;
}

# Parse the query in $FORM{'terms'}, scan every file in @FILES and
# record the matches.
#
# Results are left in package globals for return_html:
#   @optional, @required, @forbidden - the terms, stored as
#       '\W' . term . '\W' for whole-word terms and as the bare term
#       for "$term" substring searches
#   %titles, %description            - per-file title and summary text
#   %HITS                            - "<relevance><file>" => file
#   $hitcount                        - number of entries in %HITS
sub search {
    my $query = $FORM{'terms'};

    # Convert multiple blank spaces to single spaces:
    $query =~ s/\s+/ /g;
    $query = " $query ";

    # NOT becomes a minus sign, AND a plus sign; OR is the default and
    # is simply dropped:
    $query =~ s/ not / -/ig;
    $query =~ s/ and / +/ig;
    $query =~ s/ or / /ig;

    # An asterisk anywhere in the query means "list every file".
    my $list_everything = ($query =~ /\*/) ? 1 : 0;

    # Protect quoted phrases: pieces at odd positions lie between a pair
    # of quotes, and their blanks are masked so the split on whitespace
    # below keeps each phrase in one piece.
    my $placeholder = '%%%==%%%';
    my @pieces = split(/"/, $query);
    foreach my $position (0 .. $#pieces) {
        $pieces[$position] =~ s/ /$placeholder/g if ($position % 2);
    }
    $query = join('', @pieces);

    my (@optional_re, @required_re, @forbidden_re);
    foreach my $token (split(/\s+/, $query)) {
        # Skip null entries (first and last)
        next if ($token eq '');

        # Unmask grouped terms:
        $token =~ s/\Q$placeholder\E/ /g;

        my $bucket = 'optional';
        if ($token =~ s/^\+//) {
            $bucket = 'required';
        }
        elsif ($token =~ s/^-//) {
            $bucket = 'forbidden';
        }

        # A leading dollar sign asks for a substring search.
        my $whole_word = ($token =~ s/^\$//) ? 0 : 1;

        my $pattern = _term_pattern($token, $whole_word);
        next unless (defined $pattern);

        # Form in which return_html expects to find the term.
        my $stored = $whole_word ? '\W' . $token . '\W' : $token;

        if ($bucket eq 'required') {
            push(@required, $stored);
            push(@required_re, $pattern);
        }
        elsif ($bucket eq 'forbidden') {
            push(@forbidden, $stored);
            push(@forbidden_re, $pattern);
        }
        else {
            push(@optional, $stored);
            push(@optional_re, $pattern);
        }
    }

    foreach my $file (@FILES) {
        # Unreadable files are skipped rather than searched as empty.
        my $fh;
        next unless (open($fh, '<', $file));
        my $string = join(' ', <$fh>);
        close($fh);
        $string =~ s/\n//g;

        # Extra copies of the specially weighted parts; appended to the
        # searchable text once all the HTML has been examined.
        my $weighted = '';

        # Extract the title, if there is one.  The title itself stays in
        # the text, so only ($title_x - 1) additional copies are needed.
        if ($string =~ /<title>(.*?)<\/title>/i) {
            my $title = $1;
            $titles{$file} = $title;
            $weighted .= _weighted_copies($title, $title_x - 1);
        }
        $titles{$file} = $file
            unless (defined $titles{$file} && length $titles{$file});

        # Extract the description, if there is one; otherwise use the
        # first 25 words of the page text.
        if ($string =~ /<meta\s+name="description"\s+content="([^"]*)"/i) {
            my $summary = $1;
            $description{$file} = $summary;
            $weighted .= _weighted_copies($summary, $description_x);
        }
        else {
            my $text = $string;
            $text =~ s/<title>.*?<\/title>//ig;
            $text =~ s/<[^>]*>//g;
            my @words = split(' ', $text);
            $#words = 24 if (@words > 25);
            $description{$file} = join(' ', @words) . ' ...';
        }

        # Extract the keywords, if they exist:
        if ($string =~ /<meta\s+name="keywords"\s+content="([^"]*)"/i) {
            $weighted .= _weighted_copies($1, $keywords_x);
        }

        # Weight the filename as needed:
        $weighted .= _weighted_copies("$baseurl$file", $name_x);

        # Now that we're done with the special HTML tags, strip HTML tags
        # from the file so that they aren't used in the search:
        $string .= $weighted;
        $string =~ s/<[^>]*>//g;

        my $verdict = '';    # '', 'yes' or 'no'
        my $score   = 0;     # one point per occurrence of a term

        if ($list_everything) {
            # Allow for wildcard-triggered listing:
            $verdict = 'yes';
        }
        else {
            # Any optional term is enough to include the file.
            foreach my $pattern (@optional_re) {
                my $found = () = $string =~ /$pattern/g;
                next unless ($found);
                $verdict = 'yes';
                $score += $found;
            }

            # Every required term must be present.
            foreach my $pattern (@required_re) {
                my $found = () = $string =~ /$pattern/g;
                unless ($found) {
                    $verdict = 'no';
                    last;
                }
                $verdict = 'yes';
                $score += $found;
            }

            # A single forbidden term excludes the file.
            foreach my $pattern (@forbidden_re) {
                if ($string =~ /$pattern/) {
                    $verdict = 'no';
                    last;
                }
            }
        }

        # Format for relevance.  The number is zero-padded to a fixed
        # width so that sorting the keys as plain strings still ranks
        # 10.000 above 9.000.
        if ($verdict eq 'yes') {
            my $rank = sprintf("%012.3f", $score / 1000);
            $HITS{"$rank$file"} = $file;
            $hitcount++;
        }
    } # End loop through all files.
} # End search procedure.
use Fcntl qw(:flock);

# Escape the characters that are special in HTML so that text supplied
# by a visitor is displayed literally instead of being interpreted as
# markup.
sub _escape_html {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# Turn a list of stored search terms into a comma-separated HTML string.
#
# Whole-word terms are stored as '\W' . term . '\W'; terms without that
# wrapper are substring searches and are shown in italics.  The caller's
# list is left untouched.
sub _term_list {
    my @shown;
    foreach my $stored (@_) {
        my $text = $stored;
        my $whole_word = ($text =~ s/^\\W//) ? 1 : 0;
        $text =~ s/\\W$//;
        $text = _escape_html($text);
        $text = "<I>$text</I>" unless ($whole_word);
        push(@shown, $text);
    }
    return join(', ', @shown);
}

# Log the query to $summary_file and print the results page.
#
# Reads the package globals filled in by the search: $hitcount,
# @optional, @required, @forbidden, %HITS, %titles, %description, plus
# the configuration values $summary_file, $basedir, $baseurl, $cgi_url,
# $link_url and $link_title.
sub return_html {
    # First we build a summary for the webmaster and the visitor:
    my $docstring;
    if (!$hitcount) {
        $docstring = "No Documents";
    }
    elsif ($hitcount == 1) {
        $docstring = "One Document";
    }
    else {
        $docstring = "$hitcount Documents";
    }

    my $summary = "<H2><TT>Search Results: $docstring Found</TT></H2>\n";
    $summary .= "<BLOCKQUOTE>\n<PRE>\n";
    $summary .= " Optional Terms: " . _term_list(@optional) if (@optional);
    $summary .= "\n Required Terms: " . _term_list(@required) if (@required);
    $summary .= "\n Forbidden Terms: " . _term_list(@forbidden) if (@forbidden);
    $summary .= "\n</PRE></BLOCKQUOTE>\n";

    # Logging is best effort: a missing or read-only summary file must
    # not stop the visitor from getting results, but it is reported in
    # the server's error log.  Most servers leave REMOTE_HOST empty, so
    # fall back to the address.
    my $visitor = _escape_html($ENV{'REMOTE_HOST'} || $ENV{'REMOTE_ADDR'}
                               || 'unknown host');
    if (defined $summary_file && length $summary_file) {
        if (open(my $log, '>>', $summary_file)) {
            # Keep simultaneous searches from interleaving their entries.
            flock($log, LOCK_EX);
            print {$log} "Search by $visitor:<BR>\n";
            print {$log} $summary;
            close($log);
        }
        else {
            warn "search.cgi: cannot append to $summary_file: $!\n";
        }
    }

    # Now that the webmaster knows what's going on, we print the
    # results for the visitor:
    print "Content-type: text/html\n\n";
    print <<"EOM";
<HTML>
<HEAD><TITLE>Results of Your Search</TITLE></HEAD>
<BODY BGCOLOR="#FFFFFF" TEXT="#000000" LINK="#0000EE" VLINK="#551A8B" ALINK="#FF0000">
$summary
<DL>
EOM

    if ($hitcount > 0) {
        # Each key starts with the relevance written with exactly three
        # decimals, followed by the file name.  Compare that number
        # numerically (a plain string sort ranks "9.000" above
        # "10.000"); ties keep the old reverse-alphabetical order.
        my %rank_of;
        foreach my $key (keys %HITS) {
            $rank_of{$key} = ($key =~ /^(\d+\.\d{3})/) ? $1 : 0;
        }
        my @ordered = sort {
            $rank_of{$b} <=> $rank_of{$a} or $b cmp $a
        } keys %HITS;

        foreach my $key (@ordered) {
            my $file = $HITS{$key};
            # $basedir may or may not end in a slash.
            my $path = ($basedir =~ m{/$}) ? "$basedir$file"
                                           : "$basedir/$file";

            my $size = -s $path;
            if (!defined $size) {
                $size = "unknown size";
            }
            elsif ($size > 1500) {
                $size = int($size/1000) . " K";
            }
            else {
                $size = "$size bytes";
            }
            my $last = Last_Modified($path);

            print "<P><DT><a href=\"$baseurl$file\"><STRONG>$titles{$file}</STRONG></a></DT>\n";
            print "<DD>$description{$file}<BR>\n";
            print "<CITE><A HREF=\"$baseurl$file\">$baseurl$file</A><FONT SIZE=-1>";
            print " - $size - $last</FONT></CITE></DD>\n";
        }
    }
    else {
        print <<"EOM";
<BLOCKQUOTE><B>Unfortunately, we didn't find any documents which
matched your search terms. You may want to visit our
<A HREF="$cgi_url?tips">search tips</A> page to better refine your
queries.</B></BLOCKQUOTE>
EOM
    }

    print <<"EOM";
</DL>
<CENTER>
<BR><BR><FORM METHOD=POST ACTION="$cgi_url">
<INPUT TYPE=TEXT NAME="terms" SIZE=40>
<INPUT TYPE=SUBMIT VALUE="New Search"></FORM>
</CENTER>
<BR><H5 ALIGN=CENTER>
<A HREF="$cgi_url?tips">Search Tips</A> - <A HREF="$link_url">$link_title</A>
<HR SIZE=1 NOSHADE WIDTH=50%>
Intermediate Search, Version 1.1 is Copyright 1997 (freeware) by
<A HREF="http://www.xav.com">Fluid Dynamics</A>.<BR>
Visit the <A HREF="http://www.xav.com/scripts/search">Search Page</A>
for help files and most recent version.</H5></BODY></HTML>
EOM
}

# Return the modification date of a file as "D Mon YYYY".
#
#   $filename - path of the file to examine
#
# Returns "unknown date" when the file cannot be stat'ed.
#
# This wonderful snippet was written by Jeff Carnahan of Terminal
# Productions (www.terminalp.com)
sub Last_Modified {
    my $filename = shift;

    my $modified = (stat($filename))[9];
    return "unknown date" unless (defined $modified);

    my ($mday, $mon, $year) = (localtime($modified))[3, 4, 5];
    my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

    # localtime counts years from 1900.
    return sprintf("%d %s %d", $mday, $months[$mon], $year + 1900);
}

# Tell the visitor that the configured base directory is unusable and
# stop the script.
sub bad_base {
    my $shown = _escape_html($basedir);
    print "Content-type: text/html\n\n";
    print "I tried to find the base directory you specified:\n";
    print "<BLOCKQUOTE><PRE>$shown</PRE></BLOCKQUOTE>\n";
    print "But the system told me that it did not exist.\n";
    exit;
}