/*
Copyright (C) 2000-2010  Ministere de la culture et de la communication (France), AJLSM
See LICENCE file
 */

package fr.gouv.culture.sdx.search.lucene.analysis;

import java.io.Reader;

import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.logger.Logger;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;

import gpl.pierrick.brihaye.aramorph.lucene.ArabicGlosser;
import gpl.pierrick.brihaye.aramorph.lucene.ArabicGrammaticalFilter;
import gpl.pierrick.brihaye.aramorph.lucene.ArabicTokenizer;
import gpl.pierrick.brihaye.aramorph.lucene.WhitespaceFilter;

/**
 * An English glosser for the Arabic language. This glosser uses Tim Buckwalter's algorithm
 * (available at <a href="http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002L49">LDC
 * Catalog</a>) to identify the morphological category of Arabic tokens and then return their glosses.
 * The meaningful morphological categories are still to be determined but the current list gives
 * good results.
 * @author Pierrick Brihaye, 2003
 */
public final class Glosser_ar_en extends AbstractAnalyzer {

    /** The analyzer type name reported by {@link #getAnalyzerType()}. */
    protected final static String ANALYZER_TYPE = "Glosser_ar_en";

    /** An array containing some common English words that are usually not
     * useful for searching. */
    public static final String[] STOP_WORDS = {
        "a", "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "s", "such",
        "t", "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    /**
     * Stop set derived from {@link #STOP_WORDS}. It is built once when the class is
     * initialized instead of on every call to {@link #tokenStream(String, Reader)};
     * as a consequence, later modifications of the {@link #STOP_WORDS} array are not
     * reflected here. Must stay declared after {@link #STOP_WORDS} so that the array
     * is initialized first.
     */
    private static final java.util.Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);

    /**
     * Returns the type name of this analyzer.
     * @return The value of {@link #ANALYZER_TYPE}
     * @see fr.gouv.culture.sdx.search.lucene.analysis.AbstractAnalyzer#getAnalyzerType()
     */
    protected String getAnalyzerType() {
        return Glosser_ar_en.ANALYZER_TYPE;
    }

    /** Configures the glosser by delegating to the superclass.
     * @param configuration The configuration object
     * @throws ConfigurationException If a problem occurs during configuration
     */
    public void configure(Configuration configuration) throws ConfigurationException {
        super.configure(configuration);
    }

    /** Passes the logger on to the superclass.
     * @param logger The logger to use
     */
    public void enableLogging(Logger logger) {
        super.enableLogging(logger);
    }

    /**
     * Returns a token stream of glosses of Arabic words whose morphological categories
     * are found to be semantically meaningful.
     * <p>
     * The chain is built in this order: Arabic tokenizer, glosser, grammatical filter,
     * whitespace filter, standard filter, lower-case filter, English stop-word filter.
     * If building any stage throws, the error is logged and the chain built so far is
     * returned; this is <code>null</code> when even the tokenizer could not be created.
     * @param fieldName The name of the field being analyzed; not used by this method
     * @param reader The reader providing the text to analyze
     * @return The token stream, possibly incomplete or <code>null</code> after an error
     */
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream result = null;
        try {
            result = new ArabicTokenizer(reader);
            result = new ArabicGlosser(result);
            result = new ArabicGrammaticalFilter(result);
            result = new WhitespaceFilter(result);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            // Lucene 2.1.0 update: stop words are passed as a Set (makeStopSet),
            // no longer as a Hashtable (makeStopTable).
            result = new StopFilter(result, STOP_SET);
        } catch (Exception e) {
            // Best effort: report the failure and hand back whatever was built.
            this.logger.error("Arabic glosser error", e);
        }
        return result;
    }

    /**
     * Creates a TokenStream which tokenizes all the text in the provided
     * Reader. Provided for backward compatibility only; delegates to
     * {@link #tokenStream(String, Reader)} with a <code>null</code> field name.
     * @param reader The reader providing the text to analyze
     * @return The token stream
     * @deprecated use tokenStream(String, Reader) instead.
     * @see fr.gouv.culture.sdx.search.lucene.analysis.Analyzer#tokenStream(java.io.Reader)
     * @author Malo Pichot, 2007
     */
    public TokenStream tokenStream(Reader reader) {
        return tokenStream(null, reader);
    }

}



