/*
SDX: Documentary System in XML.
Copyright (C) 2000, 2001, 2003  Ministere de la culture et de la communication (France), AJLSM

Ministere de la culture et de la communication,
Mission de la recherche et de la technologie
3 rue de Valois, 75042 Paris Cedex 01 (France)
mrt@culture.fr, michel.bottin@culture.fr

AJLSM, 17, rue Vital Carles, 33000 Bordeaux (France)
sevigny@ajlsm.com

Pierrick Brihaye, 2003
pierrick.brihaye@wanadoo.fr

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the
Free Software Foundation, Inc.
59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
or connect to:
http://www.fsf.org/copyleft/gpl.html
*/

package fr.gouv.culture.sdx.search.lucene.analysis;

import gpl.pierrick.brihaye.aramorph.lucene.ArabicGrammaticalFilter;
import gpl.pierrick.brihaye.aramorph.lucene.ArabicStemmer;
import gpl.pierrick.brihaye.aramorph.lucene.ArabicTokenizer;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.logger.Logger;
import org.apache.lucene.analysis.TokenStream;

import java.io.Reader;

/** Analyzer for the arabic language. This analyzer uses Tim Buckwalter's algorithm
 * (avalaible at <a href="http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002L49">LDC
 * Catalog</a>) to identify the morphological category of arabic tokens.
 * The relevant categories are still to be determined but the current list gives
 * good results.
 * Final tokens are a romanized canonical version of the word.
 * @author Pierrick Brihaye, 2003
 */
public final class Analyzer_ar extends AbstractAnalyzer {

	protected final static String ANALYZER_TYPE="Analyzer_ar";
	
    /** Configure the glosser.
     * @param configuration The configuration object
     * @throws ConfigurationException If a problem occurs during configuration
     */
    public void configure(Configuration configuration) throws ConfigurationException {
        super.configure(configuration);
    }

    /** Transmits a super.getLog() to the class.
     * @param logger The super.getLog()
     */
    public void enableLogging(Logger logger) {
        super.enableLogging(logger);
    }

    /** Returns a token stream of romanized arabic words whose morphological categories are found to be semantically relevant.
     * @return The token stream
     * @param reader The reader
     *@param fieldName The field
     */
    public final TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream result = null;
        try {
            result = new ArabicTokenizer(reader);
            result = new ArabicStemmer(result);
            result = new ArabicGrammaticalFilter(result);
        } catch (Exception e) {
            this.logger.error("Arabic analyzer error", e);
        }
        return result;
    }

	/* (non-Javadoc)
	 * @see fr.gouv.culture.sdx.search.lucene.analysis.AbstractAnalyzer#getAnalyserType()
	 */
	protected String getAnalyzerType() {
		return Analyzer_ar.ANALYZER_TYPE;
	}
}

