#!/usr/bin/perl
# search_jira20.cgi
#
# Intermediate Search, Version 1.1
# Copyright 1997 by Fluid Dynamics <xav.com>
# You are free to use the script, but please ask before you
# distribute it.
#
# For latest version and help files, visit:
# http://www.xav.com/scripts/search/
# __________________________________________________________________

use strict;
use warnings;

# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------

# The directory location of all your files. Remember the trailing
# slash.
our $basedir = '/usr/local/apache/htdocs/';

# The URL corresponding to the base directory.
our $baseurl = 'http://www.setgetweb.com/';

# These are all the files that will be searched, relative to $basedir.
# The asterisk is a wildcard. An entry that names a directory (with a
# trailing slash) lists every text file directly inside it.
our @files = ('tech/jira20/*.html');

# Log of everybody's searches, so you know what people are really
# looking for when they come to your site. It must be writable by the
# web server user; keep it outside the web tree so others can't see it.
our $summary_file = '/tmp/summaries.txt';

# The URL and title of your main web page.
our $link_url   = 'http://www.setgetweb.com/';
our $link_title = 'Home';

# If your visitors can't handle JavaScript, better leave this 'off'.
our $java_toys = 'off';

# The URLs of the pictures shown on the search prompt page.
our $searchpict = 'http://www.setgetweb.com/images/search.gif';
our $lawlogo    = 'http://www.setgetweb.com/images/search.gif';

# Change this to the full URL only if you rename this script.
our $cgi_url = 'search_jira20.cgi';

# Options for Weighted Search:
#
# Every occurrence of a search term counts as one point. Occurrences
# in the filename, title, META keywords, or META description can be
# given added weight (a multiplier per hit). The defaults are (2,2,4,2).
# Note that this gives extra weight to pages that have a properly
# formatted title and META tags.
our ($name_x, $title_x, $keywords_x, $description_x) = (2, 2, 4, 2);

# No further editing is necessary, but feel free to play around...
# __________________________________________________________________

# ------------------------------------------------------------------
# Shared state (filled in by the subs below)
# ------------------------------------------------------------------

our %FORM;           # decoded CGI parameters
our @FILES;          # candidate files, relative to $basedir
our %HITS;           # matching file => relevance score
our %titles;         # file => title shown in the result list
our %description;    # file => description shown in the result list
our @optional;       # parsed search terms (hashrefs, see make_term)
our @required;
our @forbidden;
our $hitcount = 0;   # number of entries in %HITS
our $check;          # false when the query contains '*' (list everything)

# Stand-in for blanks inside "quoted phrases" while the query is split
# on whitespace.
our $PLACEHOLDER = '%%%==%%%';

handle_request();

# Entry point: decode the request and dispatch to a search, the tips
# page, or the error message.
sub handle_request {
    %FORM = parse_form();

    if (defined $FORM{'terms'} && length $FORM{'terms'}) {
        get_files();
        search();
        return_html();
    }
    elsif (exists $FORM{'tips'}) {
        # The result page links to "$cgi_url?tips"; serve the tips page
        # for that request.
        prompt();
    }
    else {
        # A CGI response must start with a header, otherwise the web
        # server reports an internal error instead of this message.
        print "Content-type: text/html\n\n";
        print "<p>Unable to parse variable terms";
    }
    return 1;
}

# Decode the CGI parameters from the query string and, for POST
# requests, from the request body (the forms printed by this script use
# METHOD=POST). Returns a name => value list; body values override
# query-string values of the same name.
sub parse_form {
    my @sources;
    push @sources, $ENV{'QUERY_STRING'} if defined $ENV{'QUERY_STRING'};

    my $method = defined $ENV{'REQUEST_METHOD'} ? uc $ENV{'REQUEST_METHOD'} : '';
    my $length = $ENV{'CONTENT_LENGTH'};
    if ($method eq 'POST' && defined $length && $length =~ /^\d+$/ && $length > 0) {
        my $body = '';
        read(STDIN, $body, $length);
        push @sources, $body;
    }

    my %form;
    for my $source (@sources) {
        for my $pair (split /&/, $source) {
            my ($name, $value) = split /=/, $pair, 2;
            next unless defined $name && length $name;
            $value = '' unless defined $value;
            for ($name, $value) {
                tr/+/ /;
                s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
            }
            $form{$name} = $value;
        }
    }
    return %form;
}

# Replace the characters that are special in HTML so that text supplied
# by the visitor cannot inject markup into the page or the summary log.
sub escape_html {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# Print the search form together with the search tips.
sub prompt {
    print "Content-type: text/html\n\n";
    print <<EOM;
<HTML>
<HEAD>
<TITLE>Intermediate Search</TITLE>
<META NAME="description" CONTENT="Try out our internal search engine for the fastest way to find what you're looking for!">
</head>
<body>
<center>
<IMG SRC="$lawlogo"></IMG>
<IMG SRC="$searchpict" height=75 width=75 ALT="Another Green World"></IMG>
<p>
<p><FORM METHOD=POST ACTION="$cgi_url" NAME="searchform">
<INPUT TYPE=TEXT NAME="terms" SIZE=30>
<INPUT TYPE=SUBMIT VALUE="Search!"><BR>
EOM

    if ($java_toys eq 'on') {
        print "<SCRIPT LANGUAGE=\"JavaScript\">\n";
        print "<!-- script hiding...\n";
        print "document.searchform.terms.focus();\n";
        print "// End hiding -->\n";
        print "</SCRIPT>\n";
    }

    print <<EOM;
</FORM>
<BR><H1>Search Tips</H1>
</center>
<font class=grey3>
<BLOCKQUOTE>
<p>The default value is or. Thus, a search for <i>lacobrts cmprts srgen</i>
would return pages with at least one of the three terms.<P>
<p>The best way to search is to require that all the words be found. Do this
by sticking a plus (+) sign in front of each word. For example
<i>+lacobrts +AP170</i> would only return documents that had both
&quot;lacobrts&quot; and &quot;AP170&quot;.
<p>You can search for a phrase by putting quotes around your phrase, i.e.,
&quot;<i>Just what, exactly, is a lacobrts?</i>&quot;
<p>Each term may be preceded by the standard Boolean operators not, and, or or.
<p>
If you search for "<i>dogs not pizzas</i>", you'll find all documents
containing the word <i>dogs</i> except those documents which also contain the
word <i>pizzas</i>.
<p>
If you type in "<i>hot</i> and <i>dog</i> and <i> pizzas</i>", you'll find
only those documents which contain all three search terms.
<p>Altavista's shorthand notation works too. A search on "<i>dogs -hot</i>"
is equivalent to the first example, and "<i>+hot +dog +pizzas</i>" will
return the same documents as the second.<P>
If a search term has at least one capital letter, like "<i>parIS</i>", the
search will be case sensitive with respect to that word - that is, only
documents containing "<i>parIS</i>" will be found. On the other hand,
lowercase words like "<i>paris</i>" will generate hits from "<i>Paris</i>",
"<i>PARIS</i>", or "<i>parIS</i>".<P>
To group a collection of words, use quotes. For example, the query
<i>"Zoltan Milosevic"</i> (quotes included) would not generate a hit from
"Slobodan Milosevic met with Zoltan Smith". Without quotes, the sentence
would count. Boolean operators can also act on quotations: a search on
'<i>+the +kitten not "the kitten"</i>' would return only those documents
where "<i>the</i>" and "<i>kitten</i>" appear separately.<P>
Intermediate Search finds words, not strings. A search for "<i>in</i>" would
turn up only that word, not "<i>bin</i>", "<i>inside</i>", or
"<i>acquaintance</i>". To perform a string search, preface your term with the
dollar sign - a query on "<i>\$in</i>" would find all words listed above.
Note that more complex wildcard searches using the asterisk are <I>not</I>
permitted. Including the asterisk in your query will return a list of all
files, but that's its only function.<P>
These rules are based on <A HREF="http://www.altavista.digital.com">
Altavista's</A> query syntax; a look at their
<A HREF="http://www.altavista.digital.com/cgi-bin/query?pg=h">
Search Tips</A> may prove useful. The original Simple Search was created by
Matt Wright and can be found at
<a href="http://www.worldwidemart.com/scripts/">Matt's Script Archive</a>.
Like Matt's script, our version is freeware and can easily be set up on most
websites.</BLOCKQUOTE>
<BR>
<center>
<A HREF="$link_url">$link_title</A>
<p><HR SIZE=1 NOSHADE WIDTH=50%>
<p><font class=grey2><i>
Intermediate Search, Version 1.1 is Copyright 1997 (freeware) by
<A HREF="http://www.xav.com">Fluid Dynamics</A>.<BR>
Visit the <A HREF="http://www.xav.com/scripts/search">Search Page</A>
for help files and most recent version.
</font></i>
</center>
<p></p>
</BODY></HTML>
EOM
    return;
}

# Change into $basedir and fill @FILES with every text file selected by
# the patterns in @files. Reports the problem and exits (via bad_base)
# when the base directory is unusable, because every later path is
# relative to it.
sub get_files {
    bad_base() unless -d $basedir && chdir($basedir);

    @FILES = ();
    for my $pattern (@files) {
        if (-d $pattern) {
            # A directory entry: take the visible files directly inside
            # it. The entry is expected to end with a slash.
            my $dh;
            next unless opendir($dh, $pattern);
            for my $entry (sort readdir $dh) {
                next if $entry =~ /^\./;
                my $path = "$pattern$entry";
                push @FILES, $path if -f $path && -T $path;
            }
            closedir $dh;
        }
        else {
            # glob() expands the wildcard without starting a shell and
            # without breaking file names apart at blanks.
            for my $path (glob($pattern)) {
                push @FILES, $path if -f $path && -T $path;
            }
        }
    }
    return;
}

# Build the record for one search term.
#   $text      - the term as typed, without its +, - or $ prefix
#   $substring - true for "$term" string searches (no word boundaries)
# Returns a hashref with the keys text, substring and regex.
sub make_term {
    my ($text, $substring) = @_;

    # The term is visitor input: match it literally, never as a regex.
    my $literal = quotemeta $text;

    # Blanks inside a quoted phrase match any run of whitespace.
    $literal =~ s/\\ /\\s+/g;

    # Whole-word search: no word character directly before or after the
    # term. Unlike \W...\W this also matches at the very start and end
    # of the text.
    my $pattern = $substring ? $literal : "(?<!\\w)$literal(?!\\w)";

    # A term with at least one capital letter is case sensitive.
    my $regex = ($text =~ /[A-Z]/) ? qr/$pattern/ : qr/$pattern/i;

    return { text => $text, substring => $substring ? 1 : 0, regex => $regex };
}

# Return how many times $regex matches in $string.
sub count_matches {
    my ($string, $regex) = @_;
    my $count = () = $string =~ /$regex/g;
    return $count;
}

# Parse $FORM{'terms'} into @optional, @required and @forbidden, then
# scan every file in @FILES. For each matching file the relevance score
# is stored in %HITS, and %titles / %description are filled for the
# result page. $hitcount receives the number of matching files.
sub search {
    @optional  = ();
    @required  = ();
    @forbidden = ();
    %HITS        = ();
    %titles      = ();
    %description = ();
    $hitcount    = 0;

    my $query = defined $FORM{'terms'} ? $FORM{'terms'} : '';

    # Convert multiple blank spaces to single spaces and pad the query
    # so that every word, including the first and last, has a blank on
    # both sides:
    $query =~ s/\s+/ /g;
    $query = " $query ";

    # Convert NOT statements to minus signs, AND statements to plus
    # signs, and strip OR statements (OR is the default):
    $query =~ s/ not / -/ig;
    $query =~ s/ and / +/ig;
    $query =~ s/ or / /ig;

    # An asterisk anywhere in the query switches term matching off and
    # lists every file instead.
    $check = ($query =~ /\*/) ? '' : 'true';

    # Correct for grouped entries: splitting on the quote character
    # leaves the quoted phrases at the odd positions. Their blanks are
    # masked so the phrases survive the split on whitespace below.
    my @pieces = split /"/, $query;
    my $masked = '';
    for my $i (0 .. $#pieces) {
        my $piece = $pieces[$i];
        $piece =~ s/ /$PLACEHOLDER/g if $i % 2;
        $masked .= $piece;
    }

    for my $token (split /\s+/, $masked) {
        next if $token eq '';

        # Unmask grouped terms:
        $token =~ s/\Q$PLACEHOLDER\E/ /g;

        my $bucket = \@optional;
        if    ($token =~ s/^\+//) { $bucket = \@required; }
        elsif ($token =~ s/^-//)  { $bucket = \@forbidden; }

        # A leading dollar sign asks for a string search.
        my $substring = ($token =~ s/^\$//) ? 1 : 0;

        # A lone "+", "-" or "$" carries nothing to search for.
        next if $token eq '';

        push @{$bucket}, make_term($token, $substring);
    }

    FILE:
    for my $file (@FILES) {
        my $fh;
        next FILE unless open($fh, '<', $file);
        my $html = do { local $/; <$fh> };
        close $fh;
        $html = '' unless defined $html;
        $html =~ s/\n/ /g;

        # Weighted copies of the special fields are collected here. Each
        # copy gets a leading blank so that it still counts as a
        # separate word.
        my $weighted = '';

        # Extract the title, if there is one. It already occurs once in
        # the document, so only the extra copies are added.
        if ($html =~ /<title>(.*?)<\/title>/i) {
            my $title = $1;
            $title =~ s/^\s+//;
            $title =~ s/\s+$//;
            $titles{$file} = $title;
            $weighted .= " $title" x ($title_x - 1) if $title_x > 1;
        }
        $titles{$file} = $file
            unless defined $titles{$file} && length $titles{$file};

        # Extract the description, if there is one; otherwise use the
        # first 25 words of the text.
        if ($html =~ /<meta\s+name="description"\s+content="([^"]*)"/i) {
            my $meta = $1;
            $description{$file} = $meta;
            $weighted .= " $meta" x $description_x if $description_x > 0;
        }
        else {
            my $text = $html;
            $text =~ s/<title>.*?<\/title>//ig;
            $text =~ s/<[^>]*>//g;
            my @words = grep { length } split /\s+/, $text;
            splice(@words, 25) if @words > 25;
            $description{$file} = join(' ', @words, '...');
        }

        # Extract the keywords, if they exist:
        if ($html =~ /<meta\s+name="keywords"\s+content="([^"]*)"/i) {
            my $keywords = $1;
            $weighted .= " $keywords" x $keywords_x if $keywords_x > 0;
        }

        # Weight the filename as needed:
        $weighted .= " $baseurl$file" x $name_x if $name_x > 0;

        # Now that we're done with the special HTML tags, strip HTML
        # tags so that they aren't used in the search:
        my $string = $html . $weighted;
        $string =~ s/<[^>]*>//g;

        my $included = 0;
        my $score    = 0;

        if ($check) {
            # Optional terms: one match is enough to include the file.
            for my $term (@optional) {
                my $found = count_matches($string, $term->{regex});
                next unless $found;
                $included = 1;
                $score += $found;
            }

            # Required terms: every one of them has to match.
            for my $term (@required) {
                my $found = count_matches($string, $term->{regex});
                if ($found) {
                    $included = 1;
                    $score += $found;
                }
                else {
                    $included = 0;
                    last;
                }
            }

            # Forbidden terms: a single match excludes the file.
            if ($included) {
                for my $term (@forbidden) {
                    if ($string =~ $term->{regex}) {
                        $included = 0;
                        last;
                    }
                }
            }
        }
        else {
            # Allow for wildcard-triggered listing:
            $included = 1;
        }

        if ($included) {
            $HITS{$file} = $score;
            $hitcount++;
        }
    }    # End loop through all files.
    return;
}

# Format a list of term records for the summary: string-search terms in
# italics, whole-word terms plain, separated by commas. The term text
# is HTML-escaped because it comes from the visitor.
sub format_terms {
    my (@terms) = @_;
    my @shown;
    for my $term (@terms) {
        my $text = escape_html($term->{text});
        push @shown, $term->{substring} ? "<I>$text</I>" : $text;
    }
    return join(', ', @shown);
}

# Append the search summary to $summary_file and print the result page.
sub return_html {
    # First we build a summary for the webmaster and the visitor:
    my $docstring;
    if    (!$hitcount)      { $docstring = "No Documents"; }
    elsif ($hitcount == 1)  { $docstring = "One Document"; }
    else                    { $docstring = "$hitcount Documents"; }

    my $summary = "<h3><I>Search Results: <TT>$docstring Found</TT></I></h3>\n";
    $summary .= "<BLOCKQUOTE>\n<PRE>\n";
    $summary .= " Optional Terms: " . format_terms(@optional) if @optional;
    $summary .= "\n Required Terms: " . format_terms(@required) if @required;
    $summary .= "\n Forbidden Terms: " . format_terms(@forbidden) if @forbidden;
    $summary .= "\n</PRE></BLOCKQUOTE>\n";

    # Logging is best effort: a summary file that cannot be written must
    # not keep the visitor from getting results.
    my $visitor = $ENV{'REMOTE_HOST'};
    $visitor = $ENV{'REMOTE_ADDR'} unless defined $visitor && length $visitor;
    $visitor = escape_html($visitor);
    my $log;
    if (open($log, '>>', $summary_file)) {
        print {$log} "Search by $visitor:<BR>\n";
        print {$log} $summary;
        close $log;
    }

    # Now that the webmaster knows what's going on, we print the
    # results for the visitor:
    print "Content-type: text/html\n\n";
    print <<EOM;
<HTML>
<HEAD>
<TITLE>Results of Your Search</TITLE>
</head>
<body>
$summary
<font class=grey3>
<DL>
EOM

    if ($hitcount > 0) {
        # Most relevant first, compared as numbers; files with the same
        # score are listed in descending name order.
        my @ranked = sort { $HITS{$b} <=> $HITS{$a} || $b cmp $a } keys %HITS;

        for my $file (@ranked) {
            my $bytes = -s "$basedir$file";
            $bytes = 0 unless defined $bytes;
            my $size = $bytes > 1500 ? int($bytes / 1000) . " K" : "$bytes bytes";
            my $last = Last_Modified("$basedir$file");

            print "<P><DT><font class=grey2><a href=\"$baseurl$file\">$titles{$file}</a></DT>\n";
            print "<DD><font class=grey2>$description{$file}<BR>\n";
            print "<CITE><font class=grey2><A HREF=\"$baseurl$file\">$baseurl$file</A><FONT SIZE=-1>";
            print " - $size - $last</FONT></CITE></DD></i>\n";
        }
    }
    else {
        print <<EOM;
<BLOCKQUOTE>Unfortunately, we didn't find any documents which
matched your search terms. You may want to visit our
<A HREF="$cgi_url?tips">search tips</A> page to better refine
your queries.</BLOCKQUOTE>
EOM
    }

    print <<EOM;
</DL>
<CENTER>
<BR><BR><FORM METHOD=POST ACTION="$cgi_url">
<INPUT TYPE=TEXT NAME="terms" SIZE=40>
<INPUT TYPE=SUBMIT VALUE="New Search"></FORM>
</CENTER>
<BR><H5 ALIGN=CENTER>
<A HREF="$cgi_url?tips">Search Tips</A> -
<A HREF="$link_url">$link_title</A>
<HR SIZE=1 NOSHADE WIDTH=50%>
Intermediate Search, Version 1.1 is Copyright 1997 (freeware) by
<A HREF="http://www.xav.com">Fluid Dynamics</A>.<BR>
Visit the <A HREF="http://www.xav.com/scripts/search">Search Page</A>
for help files and most recent version.</H5></BODY></HTML>
EOM
    return;
}

# Return the modification date of $filename as "day Mon year", e.g.
# "9 Sep 2001", or "unknown date" when the file cannot be examined.
# Based on a snippet by Jeff Carnahan of Terminal Productions
# (www.terminalp.com).
sub Last_Modified {
    my ($filename) = @_;

    my $mtime = (stat($filename))[9];
    return "unknown date" unless defined $mtime;

    my ($mday, $mon, $year) = (localtime($mtime))[3, 4, 5];
    my @months = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);

    # localtime() counts years from 1900.
    return "$mday $months[$mon] " . ($year + 1900);
}

# Tell the visitor that the configured base directory is unusable and
# stop the script.
sub bad_base {
    print "Content-type: text/html\n\n";
    print "I tried to find the base directory you specified:\n";
    print "<BLOCKQUOTE><PRE>$basedir</PRE></BLOCKQUOTE>\n";
    print "But the system told me that it did not exist (or could not be entered).\n";
    exit;
}