package fr.gouv.culture.sdx.search.lucene.analysis;

/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2001 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.de.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import java.io.*;
import java.util.Hashtable;

/**
 * Analyzer for the Czech language. Text is split by a StandardTokenizer,
 * passed through StandardFilter and LowerCaseFilter, and finally stripped of
 * stop words (words that will not be indexed at all).
 *
 * <p>Stop words are supplied through one of the constructors or loaded with
 * {@link #loadStopWords(InputStream, String)}; they are kept in the inherited
 * <code>stopTable</code> field. The no-argument constructor does <em>not</em>
 * install {@link #DEFAULT_STOP_WORDS}; pass that array to
 * {@link #Analyzer_cz(String[])} to use the built-in list.
 *
 * @author    Lukas Zapletal [lzap@root.cz]
 */
public final class Analyzer_cz extends DefaultAnalyzer {

    /**
     * List of typical Czech stop words. Non-ASCII letters are written as
     * Unicode escapes so the source file stays encoding-independent.
     */
    public static final String[] DEFAULT_STOP_WORDS = {
        "a", "s", "k", "o", "i", "u", "v", "z", "dnes", "cz", "t\u00edmto", "bude\u0161", "budem",
        "byli", "jse\u0161", "m\u016fj", "sv\u00fdm", "ta", "tomto", "tohle", "tuto", "tyto",
        "jej", "zda", "pro\u010d", "m\u00e1te", "tato", "kam", "tohoto", "kdo", "kte\u0159\u00ed",
        "mi", "n\u00e1m", "tom", "tomuto", "m\u00edt", "nic", "proto", "kterou", "byla",
        "toho", "proto\u017ee", "asi", "ho", "na\u0161i", "napi\u0161te", "re", "co\u017e", "t\u00edm",
        "tak\u017ee", "sv\u00fdch", "jej\u00ed", "sv\u00fdmi", "jste", "aj", "tu", "tedy", "teto",
        "bylo", "kde", "ke", "prav\u00e9", "ji", "nad", "nejsou", "\u010di", "pod", "t\u00e9ma",
        "mezi", "p\u0159es", "ty", "pak", "v\u00e1m", "ani", "kdy\u017e", "v\u0161ak", "neg", "jsem",
        "tento", "\u010dl\u00e1nku", "\u010dl\u00e1nky", "aby", "jsme", "p\u0159ed", "pta", "jejich",
        "byl", "je\u0161t\u011b", "a\u017e", "bez", "tak\u00e9", "pouze", "prvn\u00ed", "va\u0161e", "kter\u00e1",
        "n\u00e1s", "nov\u00fd", "tipy", "pokud", "m\u016f\u017ee", "strana", "jeho", "sv\u00e9", "jin\u00e9",
        "zpr\u00e1vy", "nov\u00e9", "nen\u00ed", "v\u00e1s", "jen", "podle", "zde", "u\u017e", "b\u00fdt", "v\u00edce",
        "bude", "ji\u017e", "ne\u017e", "kter\u00fd", "by", "kter\u00e9", "co", "nebo", "ten", "tak",
        "m\u00e1", "p\u0159i", "od", "po", "jsou", "jak", "dal\u0161\u00ed", "ale", "si", "se", "ve",
        "to", "jako", "za", "zp\u011bt", "ze", "do", "pro", "je", "na", "atd", "atp",
        "jakmile", "p\u0159i\u010dem\u017e", "j\u00e1", "on", "ona", "ono", "oni", "ony", "my", "vy",
        "j\u00ed", "ji", "m\u011b", "mne", "jemu", "tomu", "t\u011bm", "t\u011bmu", "n\u011bmu", "n\u011bmu\u017e",
        "jeho\u017e", "j\u00ed\u017e", "jeliko\u017e", "je\u017e", "jako\u017e", "na\u010de\u017e",
    };


    /**
     * Builds an analyzer without assigning a stop word table.
     */
    public Analyzer_cz() {
        // NOTE(review): installing DEFAULT_STOP_WORDS here was disabled in the
        // original code; presumably the table is provided later by the
        // superclass or by loadStopWords() -- confirm against DefaultAnalyzer.
    }

    /**
     * Builds an analyzer with the given stop words.
     *
     * @param   stopwords   Words to exclude from the token stream
     */
    public Analyzer_cz(String[] stopwords) {
        stopTable = StopFilter.makeStopTable(stopwords);
    }

    /**
     * Builds an analyzer with the given stop words.
     *
     * @param   stopwords   Stop word table; the reference is stored as is, not copied
     */
    public Analyzer_cz(Hashtable stopwords) {
        stopTable = stopwords;
    }

    /**
     * Builds an analyzer with the stop words read from the given file.
     *
     * @param   stopwords   File handed to WordlistLoader.getWordtable(File)
     */
    public Analyzer_cz(File stopwords) {
        stopTable = WordlistLoader.getWordtable(stopwords);
    }

    /**
     * Loads the stop word table from a stream (file, database...), one word per
     * line. Surrounding whitespace is trimmed and blank lines are skipped, since
     * such entries could never equal a tokenized term.
     *
     * <p>The new table replaces the current one only after the whole stream has
     * been read. If reading fails, or the encoding name is not supported, the
     * stop table is set to <code>null</code>, which makes
     * {@link #tokenStream(String, Reader)} skip stop word removal.
     *
     * <p>The stream is not closed by this method; the caller stays responsible
     * for closing it.
     *
     * @param   wordfile    Stream containing the word list; <code>null</code>
     *                      results in an empty stop table
     * @param   encoding    Encoding used (win-1250, iso-8859-2, ...), null for
     *                      default system encoding
     */
    public void loadStopWords(InputStream wordfile, String encoding) {
        if (wordfile == null) {
            stopTable = new Hashtable();
            return;
        }

        // Fill a local table first so a failure half-way through never leaves a
        // partially populated table in the field.
        Hashtable loaded = new Hashtable();
        try {
            Reader decoder = (encoding == null)
                    ? new InputStreamReader(wordfile)
                    : new InputStreamReader(wordfile, encoding);

            BufferedReader lines = new BufferedReader(decoder);
            String line;
            while ((line = lines.readLine()) != null) {
                String word = line.trim();
                if (word.length() > 0) {
                    loaded.put(word, word);
                }
            }
        } catch (IOException e) {
            // Kept from the original contract: a failed load leaves no stop
            // table at all rather than an incomplete one.
            stopTable = null;
            return;
        }
        stopTable = loaded;
    }

    /**
     * Creates a TokenStream which tokenizes all the text in the provided Reader.
     *
     * @param   fieldName   Name of the field being analyzed (not used here)
     * @param   reader      Source of the text to tokenize
     * @return  A TokenStream built from a StandardTokenizer filtered with
     *          StandardFilter, LowerCaseFilter and, when a stop table is
     *          available, StopFilter
     */
    public final TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream result = new StandardTokenizer(reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        // stopTable can be null (loadStopWords() sets it to null on failure, and
        // the no-argument constructor never assigns it); do not hand a null
        // table to StopFilter.
        if (stopTable != null) {
            result = new StopFilter(result, stopTable);
        }
        return result;
    }


}

