/*
 * Decompiled with CFR 0.152.
 */
package org.openimaj.text.nlp;

import gov.sandia.cognition.text.token.DefaultToken;
import gov.sandia.cognition.text.token.Token;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;
import org.openimaj.text.nlp.TweetTokeniserException;
import org.openimaj.text.nlp.patterns.AbbreviationPatternProvider;
import org.openimaj.text.nlp.patterns.ComplicatedNumberPatternProvider;
import org.openimaj.text.nlp.patterns.EmailPatternProvider;
import org.openimaj.text.nlp.patterns.EmbeddedApostrophePatternProvider;
import org.openimaj.text.nlp.patterns.EmbeddedDashPatternProvider;
import org.openimaj.text.nlp.patterns.EmoticonPatternProvider;
import org.openimaj.text.nlp.patterns.EntityPatternProvider;
import org.openimaj.text.nlp.patterns.PunctuationPatternProvider;
import org.openimaj.text.nlp.patterns.TimePatternProvider;
import org.openimaj.text.nlp.patterns.TwitterStuffPatternProvider;
import org.openimaj.text.nlp.patterns.URLPatternProvider;
import org.openimaj.text.util.RegexUtil;

/**
 * Tokenises a piece of text in two passes: spans matching any of the
 * "protected" patterns (Twitter-specific parts, emoticons, URLs, e-mail
 * addresses, entities, times, numbers, punctuation, abbreviations, separators
 * and decorations) are each emitted as a single token, and the text between
 * those spans is split on whitespace.
 *
 * <p>Tokenisation happens once, in the constructor. The resulting tokens are
 * available through {@link #iterator()}, {@link #getTokens()} and the
 * {@code get...StringTokens()} accessors.
 */
public class EntityTweetTokeniser implements Iterable<Token> {
    /** The text being tokenised, after entity unescaping and whitespace squeezing. */
    private String text;
    /** Every token, in the order it occurs in {@link #text}. */
    private List<Token> tokenize;
    /** Only the tokens produced by a match of {@link #Protect_RE}. */
    private List<Token> protectedTokens;
    /** Only the tokens produced by whitespace-splitting the unmatched text. */
    private List<Token> unprotectedTokens;

    /**
     * Languages reported as invalid by {@link #isValid(String)}.
     *
     * <p>The ISO 639-1 code for Japanese is "ja". The historical (incorrect)
     * entry "jp" is kept so that callers which pass "jp" keep getting
     * {@code false}.
     */
    private static final Locale[] invalidLanguages = new Locale[]{
        new Locale("zh"), new Locale("ko"), new Locale("ja"), new Locale("jp")
    };

    // NOTE: the order of the static fields below matters, because later
    // initialisers consume the providers created by earlier ones.
    static EmoticonPatternProvider emoticons = new EmoticonPatternProvider();
    static PunctuationPatternProvider punctuation = new PunctuationPatternProvider();
    static EntityPatternProvider entity = new EntityPatternProvider();
    static URLPatternProvider url = new URLPatternProvider();
    static TimePatternProvider time = new TimePatternProvider();
    static ComplicatedNumberPatternProvider number = new ComplicatedNumberPatternProvider();
    static TwitterStuffPatternProvider twitterPart = new TwitterStuffPatternProvider();
    static EmailPatternProvider email = new EmailPatternProvider();
    static AbbreviationPatternProvider abbrev = new AbbreviationPatternProvider(entity);

    /** Regular expression for one or more whitespace characters. */
    private static final String spaceRegex = "\\s+";
    /** {@link #spaceRegex} compiled once instead of on every split/replace. */
    private static final Pattern WHITESPACE = Pattern.compile(spaceRegex);

    /** Runs of two or more dashes, or a horizontal bar character. */
    static String Separators = RegexUtil.regex_or_match("--+", "\u2015");
    /** Runs of one or more beamed-eighth-note characters. */
    static String Decorations = "[\u266b]+";
    static EmbeddedApostrophePatternProvider embedded = new EmbeddedApostrophePatternProvider(punctuation);
    static EmbeddedDashPatternProvider embeddedDash = new EmbeddedDashPatternProvider(punctuation);

    /** Pattern strings whose matches are kept as single tokens. */
    static String[] ProtectThese = new String[]{
        twitterPart.patternString(),
        emoticons.patternString(),
        url.patternString(),
        email.patternString(),
        entity.patternString(),
        time.patternString(),
        number.patternString(),
        punctuation.patternString(),
        abbrev.patternString(),
        Separators,
        Decorations
    };
    /** All of {@link #ProtectThese} combined into one alternation. */
    static String oredProtect = RegexUtil.regex_or_match(ProtectThese);
    /** Compiled form of {@link #oredProtect}; case-insensitive with Unicode case folding. */
    static Pattern Protect_RE = Pattern.compile(oredProtect, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    /**
     * Checks whether the language of the given locale is accepted.
     *
     * @param locale the locale to check; must not be {@code null}
     * @return {@code false} if the locale's language is in the invalid list,
     *         {@code true} otherwise
     */
    public static boolean isValid(Locale locale) {
        return EntityTweetTokeniser.isValid(locale.getLanguage());
    }

    /**
     * Checks whether the given language code is accepted.
     *
     * @param locale the language code to check (compared case-sensitively
     *        against the language codes of the invalid list)
     * @return {@code false} if the code equals one of the invalid language
     *         codes, {@code true} otherwise (including for {@code null})
     */
    public static boolean isValid(String locale) {
        for (Locale invalid : invalidLanguages) {
            if (invalid.getLanguage().equals(locale)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Prepares and tokenises the given text.
     *
     * @param s the text to tokenise; must not be {@code null}
     * @throws UnsupportedEncodingException if the UTF-8 charset is unavailable
     * @throws TweetTokeniserException declared for callers; not thrown by the
     *         code visible in this class
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public EntityTweetTokeniser(String s) throws UnsupportedEncodingException, TweetTokeniserException {
        if (s == null) {
            throw new NullPointerException("s");
        }
        // Strings are immutable, so no defensive copy is needed.
        this.text = s;
        this.fixEncoding();
        this.squeeze_whitespace();
        this.simple_tokenize();
    }

    /**
     * Walks over every match of {@link #Protect_RE} in the text. The text
     * between two matches is whitespace-split into unprotected tokens and each
     * match itself becomes one protected token. Fills {@link #tokenize},
     * {@link #protectedTokens} and {@link #unprotectedTokens}.
     */
    private void simple_tokenize() throws TweetTokeniserException {
        this.edge_punct_munge();

        List<Token> allTokens = new ArrayList<Token>();
        List<Token> protectedFound = new ArrayList<Token>();
        List<Token> unprotectedFound = new ArrayList<Token>();

        // Pattern.matcher never returns null, so no null check is required.
        Matcher matches = Protect_RE.matcher(this.text);
        int cursor = 0;
        while (matches.find()) {
            // Text between the previous match (or the start) and this match.
            List<Token> gapTokens = this.unprotected_tokenize(this.text.substring(cursor, matches.start()));
            allTokens.addAll(gapTokens);
            unprotectedFound.addAll(gapTokens);

            // NOTE(review): the second DefaultToken argument is always 0 here,
            // so the tokens do not carry the match position.
            Token protectedToken = new DefaultToken(matches.group(), 0);
            allTokens.add(protectedToken);
            protectedFound.add(protectedToken);

            cursor = matches.end();
        }

        // Whatever follows the last match (or the whole text if nothing matched).
        List<Token> tailTokens = this.unprotected_tokenize(this.text.substring(cursor));
        allTokens.addAll(tailTokens);
        unprotectedFound.addAll(tailTokens);

        this.tokenize = this.post_process(allTokens);
        this.protectedTokens = this.post_process(protectedFound);
        this.unprotectedTokens = this.post_process(unprotectedFound);
    }

    /**
     * Post-processing hook; currently returns its argument unchanged.
     *
     * @param res the tokens to post-process
     * @return the same list instance
     */
    private List<Token> post_process(List<Token> res) {
        return res;
    }

    /**
     * Splits text on whitespace and wraps each non-empty piece in a token.
     *
     * @param goodString text that contains no protected span
     * @return the tokens in order of occurrence; empty if there are none
     */
    private List<Token> unprotected_tokenize(String goodString) {
        List<Token> tokens = new ArrayList<Token>();
        for (String piece : WHITESPACE.split(goodString)) {
            // Leading whitespace produces an empty first element; skip it.
            if (!piece.isEmpty()) {
                tokens.add(new DefaultToken(piece, 0));
            }
        }
        return tokens;
    }

    /** Placeholder hook invoked before tokenising; currently does nothing. */
    private void edge_punct_munge() {
    }

    /** Replaces every run of whitespace in the text with a single space. */
    private void squeeze_whitespace() {
        this.text = WHITESPACE.matcher(this.text).replaceAll(" ");
    }

    /**
     * Round-trips the text through UTF-8 bytes and then unescapes HTML
     * entities in it.
     *
     * @throws UnsupportedEncodingException if the UTF-8 charset is unavailable
     */
    private void fixEncoding() throws UnsupportedEncodingException {
        this.text = new String(this.text.getBytes("UTF-8"), "UTF-8");
        this.text = StringEscapeUtils.unescapeHtml(this.text);
    }

    /**
     * @return an iterator over all tokens in order of occurrence
     */
    @Override
    public Iterator<Token> iterator() {
        return this.tokenize.iterator();
    }

    /**
     * @return the internal list of all tokens (not a copy)
     */
    public List<Token> getTokens() {
        return this.tokenize;
    }

    /**
     * @return a new list holding the text of every token
     */
    public List<String> getStringTokens() {
        return textsOf(this.tokenize);
    }

    /**
     * @return a new list holding the text of every protected token
     */
    public List<String> getProtectedStringTokens() {
        return textsOf(this.protectedTokens);
    }

    /**
     * @return a new list holding the text of every unprotected token
     */
    public List<String> getUnprotectedStringTokens() {
        return textsOf(this.unprotectedTokens);
    }

    /** Collects the text of each token into a new list, preserving order. */
    private static List<String> textsOf(List<Token> tokens) {
        List<String> texts = new ArrayList<String>(tokens.size());
        for (Token token : tokens) {
            texts.add(token.getText());
        }
        return texts;
    }
}

