search.php

Go to the documentation of this file.
00001 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
00002 <html><head><meta http-equiv="Content-Type" content="text/html;charset=iso-8859-1">
00003 <title>Search</title>
00004 <link href="doxygen.css" rel="stylesheet" type="text/css">
00005 <link href="tabs.css" rel="stylesheet" type="text/css">
00006 </head><body>
00007 <!-- Generated by Doxygen 1.4.7 -->
00008 <div class="tabs">
00009   <ul>
00010     <li><a href="main.html"><span>Main&nbsp;Page</span></a></li>
00011     <li><a href="annotated.html"><span>Classes</span></a></li>
00012     <li><a href="files.html"><span>Files</span></a></li>
00013     <li><a href="dirs.html"><span>Directories</span></a></li>
00014     <li>
00015       <form action="search.php" method="get">
00016         <table cellspacing="0" cellpadding="0" border="0">
00017           <tr>
00018             <td><label>&nbsp;<u>S</u>earch&nbsp;for&nbsp;</label></td>
00019 
00020 <?php
00021 
/**
 * Caption displayed as the heading of the results table.
 *
 * @return string the fixed heading text
 */
function search_results()
{
  $caption = "Search Results";
  return $caption;
}
00026 
/**
 * Builds the one-line summary describing how many documents matched.
 *
 * @param int $num number of matching documents
 * @return string HTML fragment: an apology for 0, a singular sentence for 1,
 *                and a plural sentence (with the count in bold) otherwise
 */
function matches_text($num)
{
  // guard clauses for the two special-cased counts
  if ($num==0) return "Sorry, no documents matching your query.";
  if ($num==1) return "Found <b>1</b> document matching your query.";

  // every other count is reported verbatim
  return "Found <b>".$num."</b> documents matching your query. Showing best matches first.";
}
00042 
/**
 * Label that precedes the list of matched words under each hit.
 *
 * @return string the fixed label text (including its trailing space)
 */
function report_matches()
{
  $label = "Matches: ";
  return $label;
}
/**
 * Emits the query input field and closes the search form / tab bar markup
 * that the static HTML template opened before the PHP block.
 *
 * The value comes straight from the request, so it is HTML-escaped before
 * being placed inside the value="..." attribute; otherwise a query such as
 * "><script>... would be reflected into the page verbatim.
 *
 * @param string $value text to pre-fill the search box with (raw, unescaped)
 * @return void output is written with echo
 */
function end_form($value)
{
  // The page declares charset=iso-8859-1, so say so explicitly: newer PHP
  // versions default to UTF-8 and return "" for bytes that are not valid UTF-8.
  $safeValue = htmlspecialchars((string)$value, ENT_QUOTES, "ISO-8859-1");
  echo "            <td><input type=\"text\" name=\"query\" value=\"$safeValue\" size=\"20\" accesskey=\"s\"/></td>\n          </tr>\n        </table>\n      </form>\n    </li>\n  </ul>\n</div>\n";
}
00051 
/**
 * Reads a 4-byte big-endian integer from the current file position.
 *
 * @param resource $file open stream positioned at the integer
 * @return int the four bytes combined, most significant byte first
 */
function readInt($file)
{
  $value = 0;
  // shift the accumulated value up one byte and append the next byte
  for ($n=0; $n<4; $n++)
  {
    $value = ($value<<8) | ord(fgetc($file));
  }
  return $value;
}
00058 
/**
 * Reads characters from the current file position up to (and consuming)
 * the next zero byte; reading also stops when no more characters are available.
 *
 * @param resource $file open stream positioned at the start of the string
 * @return string the characters read, without the terminating zero byte
 */
function readString($file)
{
  $text = "";
  for (;;)
  {
    $ch = fgetc($file);
    if (ord($ch)==0) break; // terminator reached (or nothing left to read)
    $text .= $ch;
  }
  return $text;
}
00065 
/**
 * Reads the next four characters of the file as a string; main() compares
 * the result against the expected index signature.
 *
 * @param resource $file open stream positioned at the header
 * @return string up to four characters read from the stream
 */
function readHeader($file)
{
  $signature = "";
  for ($n=0; $n<4; $n++)
  {
    $signature .= fgetc($file);
  }
  return $signature;
}
00072 
/**
 * Computes the slot number of a word in the index table.
 *
 * The slot is derived from the first two bytes of the word only, so all
 * words sharing a two-character prefix land in the same slot; this is what
 * allows prefix (substring) searches.
 *
 * @param string $word word to hash
 * @return int first byte * 256 + second byte, or -1 when the word is
 *             shorter than two bytes or one of those bytes is zero
 */
function computeIndex($word)
{
  if (strlen($word)<2) return -1;
  // high char of the index
  // (square brackets: the $word{0} offset syntax is a parse error on PHP 8)
  $hi = ord($word[0]);
  if ($hi==0) return -1;
  // low char of the index
  $lo = ord($word[1]);
  if ($lo==0) return -1;
  // return index
  return $hi*256+$lo;
}
00096 
/**
 * Looks up one query word in the index file and appends an entry to
 * $statsList for every indexed word that starts with it.
 *
 * Each appended entry is an array with the keys
 *   "word"  => the query word,
 *   "match" => the indexed word that matched,
 *   "index" => file offset of the match's statistics record,
 *   "full"  => true when the match equals the query word exactly,
 *   "docs"  => list of arrays with "idx", "freq", "rank", "hi", "name", "url".
 *
 * @param resource $file       open index file
 * @param string   $word       query word (the caller passes it lower-cased)
 * @param array    $statsList  result list, extended in place
 * @return array the (extended) $statsList
 */
function search($file,$word,&$statsList)
{
  $index = computeIndex($word);
  if ($index==-1) return $statsList; // word cannot be hashed: nothing to look up

  fseek($file,$index*4+4); // 4 bytes per entry, skip header
  $listOffset = readInt($file);
  if (!$listOffset) return $statsList; // no words stored under this hash key

  $start=sizeof($statsList);
  $count=$start;
  $wordLen=strlen($word);
  fseek($file,$listOffset);
  $w = readString($file);
  // an empty string ends the word list; compare explicitly so that a
  // string like "0" is not mistaken for the terminator
  while ($w!=="")
  {
    $statIdx = readInt($file);
    // strict comparison: with == two numeric-looking strings such as
    // "1.0" and "1.00" would be considered equal
    if ($word===substr($w,0,$wordLen))
    { // found word that matches (as substring)
      $statsList[$count++]=array(
          "word"=>$word,
          "match"=>$w,
          "index"=>$statIdx,
          "full"=>strlen($w)==$wordLen,
          "docs"=>array()
          );
    }
    $w = readString($file);
  }

  // first pass: load the per-document data of every new match and
  // accumulate the frequency totals used for normalisation
  $totalHi=0;
  $totalFreqHi=0;
  $totalFreqLo=0;
  for ($count=$start;$count<sizeof($statsList);$count++)
  {
    // whole word matches have a double weight
    $multiplier = $statsList[$count]["full"] ? 2 : 1;
    fseek($file,$statsList[$count]["index"]);
    $numDocs = readInt($file);
    $docInfo = array();
    // read docs info + occurrence frequency of the word
    for ($i=0;$i<$numDocs;$i++)
    {
      $idx=readInt($file);
      $freq=readInt($file);
      // lowest bit flags a high priority doc, the remaining bits hold the frequency
      $docInfo[$i]=array("idx"  => $idx,
                         "freq" => $freq>>1,
                         "rank" => 0.0,
                         "hi"   => $freq&1
                        );
      if ($freq&1) // word occurs in high priority doc
      {
        $totalHi++;
        $totalFreqHi+=$freq*$multiplier;
      }
      else // word occurs in low priority doc
      {
        $totalFreqLo+=$freq*$multiplier;
      }
    }
    // read name and url info for the doc
    for ($i=0;$i<$numDocs;$i++)
    {
      fseek($file,$docInfo[$i]["idx"]);
      $docInfo[$i]["name"]=readString($file);
      $docInfo[$i]["url"]=readString($file);
    }
    $statsList[$count]["docs"]=$docInfo;
  }

  // second pass: turn frequencies into ranks relative to the totals
  $totalFreq=($totalHi+1)*$totalFreqLo + $totalFreqHi;
  if ($totalFreq!=0) // a zero total would divide by zero; ranks then stay 0.0
  {
    for ($count=$start;$count<sizeof($statsList);$count++)
    {
      // whole word matches have a double weight
      $multiplier = $statsList[$count]["full"] ? 2 : 1;
      $numDocs = sizeof($statsList[$count]["docs"]);
      for ($i=0;$i<$numDocs;$i++)
      {
        // compute frequency rank of the word in each doc
        $weight=$statsList[$count]["docs"][$i]["freq"]*$multiplier;
        if ($statsList[$count]["docs"][$i]["hi"])
        {
          // high priority docs are boosted by the low priority total
          $weight+=$totalFreqLo;
        }
        $statsList[$count]["docs"][$i]["rank"]=(float)$weight/$totalFreq;
      }
    }
  }
  return $statsList;
}
00194 
/**
 * Merges the per-word search results into one entry per document.
 *
 * Documents are keyed by their url. A document hit by several words gets the
 * sum of the individual ranks, and every hit is recorded in its "words" list
 * as array("word"=>..., "match"=>..., "freq"=>...).
 *
 * @param array $results list of word entries as produced by search()
 * @param array $docs    url => document map, extended in place
 * @return array the (extended) $docs
 */
function combine_results($results,&$docs)
{
  foreach ($results as $wordInfo)
  {
    foreach ($wordInfo["docs"] as $di)
    {
      $key=$di["url"];
      $rank=$di["rank"];
      // Direct key lookup instead of in_array(array_keys()): that scanned all
      // keys for every hit and compared loosely, so e.g. "1e1" matched key 10.
      if (array_key_exists($key,$docs))
      {
        $docs[$key]["rank"]+=$rank;
      }
      else
      {
        $docs[$key] = array("url"=>$key,
            "name"=>$di["name"],
            "rank"=>$rank
            );
      }
      $docs[$key]["words"][] = array(
               "word"=>$wordInfo["word"],
               "match"=>$wordInfo["match"],
               "freq"=>$di["freq"]
               );
    }
  }
  return $docs;
}
00224 
/**
 * Drops documents that miss a required word or contain a forbidden word.
 *
 * A document is kept only when every entry of $requiredWords appears as the
 * "word" of one of its hits and none of its hits' "word" values is listed in
 * $forbiddenWords. Words are compared exactly (case-sensitive).
 *
 * @param array $docs           url => document map as built by combine_results()
 * @param array $requiredWords  words that must all be present
 * @param array $forbiddenWords words of which none may be present
 * @return array the surviving documents, keys and order preserved
 */
function filter_results($docs,&$requiredWords,&$forbiddenWords)
{
  $filteredDocs=array();
  // foreach instead of each(): each() no longer exists in PHP 8
  foreach ($docs as $key => $doc)
  {
    $words = $doc["words"];
    $copy=1; // copy entry by default
    foreach ($requiredWords as $reqWord)
    {
      $found=0;
      foreach ($words as $wordInfo)
      {
        if ((string)$wordInfo["word"]===(string)$reqWord)
        {
          $found=1;
          break;
        }
      }
      if (!$found)
      {
        $copy=0; // document lacks at least one required word
        break;
      }
    }
    if ($copy)
    {
      foreach ($words as $wordInfo)
      {
        if (in_array($wordInfo["word"],$forbiddenWords,true))
        {
          $copy=0; // document contains a forbidden word
          break;
        }
      }
    }
    if ($copy) $filteredDocs[$key]=$doc;
  }
  return $filteredDocs;
}
00264 
/**
 * Comparison callback that orders documents by descending rank.
 *
 * @param array $a document entry with a "rank" key
 * @param array $b document entry with a "rank" key
 * @return int -1 when $a ranks higher, 1 when $b ranks higher, 0 on a tie
 */
function compare_rank($a,$b)
{
  $left = $a["rank"];
  $right = $b["rank"];
  if ($left != $right)
  {
    // higher rank sorts first
    return ($left>$right) ? -1 : 1;
  }
  return 0;
}
00273 
/**
 * Produces a copy of the documents ordered with the compare_rank callback.
 *
 * usort() re-indexes the list, so the url keys of $docs are replaced by
 * consecutive integers in the result.
 *
 * @param array $docs   documents to order (left untouched)
 * @param array $sorted receives the ordered list
 * @return array the same ordered list
 */
function sort_results($docs,&$sorted)
{
  $ranked = $docs;
  usort($ranked,"compare_rank");
  $sorted = $ranked;
  return $sorted;
}
00280 
/**
 * Prints the results table: a heading, a summary line and, per document,
 * one row with its number and link followed by one row listing the matched
 * words with their frequencies.
 *
 * @param array $docs list of documents, each with "url", "name" and a
 *                    "words" list of array("word", "match", "freq")
 * @return void output is written with echo
 */
function report_results(&$docs)
{
  echo "<table cellspacing=\"2\">\n";
  echo "  <tr>\n";
  echo "    <td colspan=\"2\"><h2>".search_results()."</h2></td>\n";
  echo "  </tr>\n";
  $numDocs = sizeof($docs);
  if ($numDocs==0)
  {
    echo "  <tr>\n";
    echo "    <td colspan=\"2\">".matches_text(0)."</td>\n";
    echo "  </tr>\n";
  }
  else
  {
    echo "  <tr>\n";
    echo "    <td colspan=\"2\">".matches_text($numDocs);
    echo "\n";
    echo "    </td>\n";
    echo "  </tr>\n";
    $num=1;
    foreach ($docs as $doc)
    {
      echo "  <tr>\n";
      echo "    <td align=\"right\">$num.</td>";
      echo     "<td><a class=\"el\" href=\"".$doc["url"]."\">".$doc["name"]."</a></td>\n";
      // close the title row before opening the row with the matched words
      // (a second <tr> used to be opened here, leaving the first one unclosed)
      echo "  </tr>\n";
      echo "  <tr>\n";
      echo "    <td></td><td class=\"tiny\">".report_matches()." ";
      foreach ($doc["words"] as $wordInfo)
      {
        $word = $wordInfo["word"];
        // part of the indexed word that follows the typed prefix
        $matchRight = (string)substr($wordInfo["match"],strlen($word));
        // the word originates from the request, so escape it before printing;
        // charset given explicitly because the page is served as iso-8859-1
        $safeWord = htmlspecialchars($word,ENT_QUOTES,"ISO-8859-1");
        $safeRight = htmlspecialchars($matchRight,ENT_QUOTES,"ISO-8859-1");
        echo "<b>$safeWord</b>$safeRight(".$wordInfo["freq"].") ";
      }
      echo "    </td>\n";
      echo "  </tr>\n";
      $num++;
    }
  }
  echo "</table>\n";
}
00322 
/**
 * Entry point of the search page: validates the environment and the index
 * file, finishes the search form, runs the query and prints the results.
 *
 * Query syntax: words are separated by spaces; a leading '+' marks a word as
 * required, a leading '-' marks it as forbidden. Words are matched in lower
 * case.
 *
 * @return void terminates the script with die() when the PHP version is too
 *              old or the index file is missing or has an invalid header
 */
function main()
{
  // version_compare instead of strcmp: a plain string comparison would
  // consider "10.0.0" to be older than "4.1.0"
  if (version_compare(phpversion(), '4.1.0', '<'))
  {
    die("Error: PHP version 4.1.0 or above required!");
  }
  if (!($file=fopen("search.idx","rb")))
  {
    die("Error: Search index file could NOT be opened!");
  }
  if (readHeader($file)!="DOXS")
  {
    die("Error: Header of index file is invalid!");
  }
  $query="";
  // ignore non-string values such as ?query[]=x
  if (array_key_exists("query", $_GET) && is_string($_GET["query"]))
  {
    $query=$_GET["query"];
  }
  end_form($query);
  echo "&nbsp;\n<div class=\"searchresults\">\n";
  $results = array();
  $requiredWords = array();
  $forbiddenWords = array();
  $foundWords = array();
  // for each word in the search query; compare against false explicitly so
  // that a token "0" does not end the loop early
  for ($word=strtok($query," "); $word!==false; $word=strtok(" "))
  {
    // search() stores the lower-cased word in its results, so the required
    // and forbidden lists must hold the lower-cased form too or the filters
    // never match input such as "+Foo"
    $word = strtolower($word);
    // square brackets: the $word{0} offset syntax is a parse error on PHP 8
    if ($word!=="" && $word[0]=='+')
    {
      $word=(string)substr($word,1);
      if ($word!=="") $requiredWords[]=$word;
    }
    if ($word!=="" && $word[0]=='-')
    {
      $word=(string)substr($word,1);
      if ($word!=="") $forbiddenWords[]=$word;
    }
    if ($word==="") continue; // a bare "+" or "-" carries no search word
    if (!in_array($word,$foundWords,true))
    {
      $foundWords[]=$word;
      search($file,$word,$results);
    }
  }
  $docs = array();
  combine_results($results,$docs);
  // filter out documents with forbidden word or that do not contain
  // required words
  $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);
  // sort the results based on rank
  $sorted = array();
  sort_results($filteredDocs,$sorted);
  // report results to the user
  report_results($sorted);
  echo "</div>\n";
  fclose($file);
}

main();
00375 
00376 
00377 ?>
00378 <hr size="1"><address style="align: right;"><small>Generated on Mon Oct 9 12:39:32 2006 for Strongtalk VM by&nbsp;
00379 <a href="http://www.doxygen.org/index.html">
00380 <img src="doxygen.png" alt="doxygen" align="middle" border="0"></a> 1.4.7 </small></address>
00381 </body>
00382 </html>

Generated on Mon Oct 9 13:37:27 2006 for Strongtalk VM by  doxygen 1.4.7