/*
SDX: Documentary System in XML.
Copyright (C) 2000, 2001, 2002  Ministere de la culture et de la communication (France), AJLSM

Ministere de la culture et de la communication,
Mission de la recherche et de la technologie
3 rue de Valois, 75042 Paris Cedex 01 (France)
mrt@culture.fr, michel.bottin@culture.fr

AJLSM, 17, rue Vital Carles, 33000 Bordeaux (France)
sevigny@ajlsm.com

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the
Free Software Foundation, Inc.
59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
or connect to:
http://www.fsf.org/copyleft/gpl.html
*/
package fr.gouv.culture.sdx.search.lucene.analysis;

import fr.gouv.culture.sdx.search.lucene.analysis.filter.FrenchStandardFilter;
import fr.gouv.culture.sdx.search.lucene.analysis.filter.ISOLatin1AccentFilter;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import java.io.Reader;

/**
 * An analyzer for the French language.
 *
 * <p>
 * This analyzer performs these tasks:
 * <ul>
 * <li>all letters are converted to lower case
 * <li>accents from ISO-8859-1 can be removed
 * <li>stop words are removed, the list of words removed can come from a configuration file or use a default one
 * </ul>
 * <p>
 * The possible configurations of this analyzer are :
 * <ul>
 * <li>A list of stop words can be given in the configuration file, a default list is hardcoded.
 * <li>Accented characters are converted to their unaccented form by default, but this can be overridden with the keepAccents attribute
 * </ul>
 */
public class Analyzer_fr extends DefaultAnalyzer {

    /** Name of the configuration attribute that controls whether accents are kept. */
    private static final String KEEP_ACCENTS_ATTRIBUTE = "keepAccents";

    /** Accents are removed from tokens unless this flag is true. */
    private boolean keepAccents = false;

    /**
     * Configures this analyzer from the given configuration.
     * <p>
     * The superclass is configured first; it is expected to take care of the
     * stop words. The <code>keepAccents</code> attribute is then read as a
     * boolean, defaulting to <code>false</code> when it is absent. When the
     * configuration is <code>null</code>, the current accent setting is left
     * unchanged.
     *
     * @param configuration the configuration to read, may be <code>null</code>
     * @throws ConfigurationException declared by this method; in this class it can
     *         only originate from the superclass call
     */
    public void configure(Configuration configuration) throws ConfigurationException {
        super.configure(configuration);

        // Without a configuration there is no attribute to read.
        if (configuration == null) {
            return;
        }

        keepAccents = configuration.getAttributeAsBoolean(KEEP_ACCENTS_ATTRIBUTE, false);
    }

    /**
     * Assembles the chain of tokenizer and filters applied to the text.
     * <p>
     * The stages, in order, are:
     * <ol>
     * <li>Lucene's standard tokenizer
     * <li>French standard filter
     * <li>Lucene's lower case filter
     * <li>ISOLatin1 accent filter, skipped when accents must be kept
     * <li>Stop words filter, skipped when no stop table is available
     * </ol>
     *
     * @param fieldName the name of the field being analyzed (not used here)
     * @param reader    the source of the text to tokenize
     * @return the last stage of the chain
     */
    public final TokenStream tokenStream(String fieldName, Reader reader) {

        // Head of the chain: the raw tokenizer over the incoming text.
        TokenStream tokenizer = new StandardTokenizer(reader);

        // The French filter is wired in two steps: logging first, then its input.
        FrenchStandardFilter frenchFilter = new FrenchStandardFilter();
        frenchFilter.enableLogging(logger);
        frenchFilter.setUp(tokenizer);

        TokenStream chain = new LowerCaseFilter(frenchFilter);

        // Accent handling is optional and driven by the configuration.
        if (!keepAccents) {
            ISOLatin1AccentFilter accentFilter = new ISOLatin1AccentFilter();
            accentFilter.enableLogging(logger);
            accentFilter.setUp(chain);
            chain = accentFilter;
        }

        // No stop table means the chain ends here.
        if (stopTable == null) {
            return chain;
        }

        return new StopFilter(chain, stopTable);
    }
}
