package searchengine;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
   WordCounter provides static helpers for working with text files:
   stripping the HTML tags out of a file, and counting the number of times
   each of an array of "keywords" occurs in a text file.
*/
public class WordCounter
{
   /** Name of the file that {@link #stripHTML(String)} writes its result to. */
   private static final String DEFAULT_STRIP_OUTPUT = "pagestrip.txt";

   /** Matches one HTML tag that opens and closes on the same line. */
   private static final Pattern HTML_TAG = Pattern.compile("<[^>]*>");

   /**
      Strips off the HTML tags in a file and writes the results
      into a txt file called pagestrip.txt (in the current working directory).
      Equivalent to calling {@link #stripHTML(String, String)} with
      "pagestrip.txt" as the output file.
      @param filename the text file to strip off the HTML tags
      @throws Exception if the input cannot be read or the output cannot be written
   */
   public static void stripHTML(String filename) throws Exception
   {
      stripHTML(filename, DEFAULT_STRIP_OUTPUT);
   } /* End method stripHTML */


   /**
      Strips off the HTML tags in a file and writes the results into the
      given output file, replacing any previous content of that file.
      The input is processed one line at a time: each line is split around
      its tags and every resulting piece (including an empty piece in front
      of a tag that starts the line) is written on its own output line.
      Because of the line-by-line processing, a tag that spans several
      lines is not recognized and is left in the output.
      @param filename the text file to strip off the HTML tags
      @param outputFilename the file to write the stripped text into
      @throws Exception if the input cannot be read or the output cannot be written
   */
   public static void stripHTML(String filename, String outputFilename) throws Exception
   {
      /*
         The reader is opened first so that a missing or unreadable input
         file does not truncate the output file. try-with-resources closes
         both files even when reading or writing fails part-way through.
      */
      try (BufferedReader in = new BufferedReader(new FileReader(filename));
           PrintWriter out = new PrintWriter(new FileWriter(outputFilename)))
      {
         String inputLine;

         /* Read file one line at a time */
         while ((inputLine = in.readLine()) != null)
         {
            /* Strip out HTML tags and write each remaining piece on its own line */
            for (String piece : HTML_TAG.split(inputLine))
            {
               out.println(piece);
            }
         } /* End while loop to get to end of file */

         /* PrintWriter never throws on write errors; surface them explicitly */
         if (out.checkError())
         {
            throw new IOException("Error writing to " + outputFilename);
         }
      }
   } /* End method stripHTML */


   /**
      Counts the number of times some keywords occur in a text file
      and returns the results in an array.
      Each keyword is compiled as a regular expression and matched against
      each line of the file after the line has been converted to lower case,
      so keywords should be given in lower case. Occurrences are counted
      per line and do not overlap; a keyword split across two lines is
      not counted.
      @param filename the text file to count words with
      @param keywords the list of keywords (regular expressions) to look for
      @return the list containing the count for each keyword, in the same
              order as keywords
      @throws Exception if the file cannot be read, or a keyword is not a
              valid regular expression
   */
   public static int[] count(String filename, String[] keywords) throws Exception
   {
      /* A new int array is already filled with zeros */
      int[] counts = new int[keywords.length];
      Pattern[] keywordPatterns = new Pattern[keywords.length];

      /* Compile keywords into regular expressions once, up front */
      for (int i = 0; i < keywords.length; i++)
      {
         keywordPatterns[i] = Pattern.compile(keywords[i]);
      }

      /* try-with-resources closes the file even if reading fails */
      try (BufferedReader in = new BufferedReader(new FileReader(filename)))
      {
         String inputLine;

         /* Read file one line at a time */
         while ((inputLine = in.readLine()) != null)
         {
            /*
               Lower-case the line once per line rather than once per keyword.
               Locale.ROOT keeps the result independent of the platform locale.
            */
            String loweredLine = inputLine.toLowerCase(Locale.ROOT);

            /* Compare each keyword with the line read from file */
            for (int i = 0; i < keywordPatterns.length; i++)
            {
               Matcher matcher = keywordPatterns[i].matcher(loweredLine);

               /*
                  find() resumes right after the previous match, so adjacent
                  occurrences such as "catcat" are each counted.
               */
               while (matcher.find())
               {
                  counts[i]++;
               }
            } /* End for loop i*/
         } /* End while loop to get to end of file */
      }

      return counts;

   } /* End method count */

} /* End class WordCounter */
