package edu.stanford.nlp.process;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.Timing;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/process/PTBTokenizer.class */
public class PTBTokenizer<T extends HasWord> extends AbstractTokenizer<T> {
    private PTBLexer lexer;

    /* loaded from: input_file:edu/stanford/nlp/process/PTBTokenizer$PTBTokenizerFactory.class */
    /**
     * Factory for {@link PTBTokenizer} instances. Every tokenizer it creates receives this
     * factory's token factory and its current options string.
     *
     * @param <T> type of token produced by the tokenizers this factory creates
     */
    public static class PTBTokenizerFactory<T extends HasWord> implements TokenizerFactory<T> {
        /** Token factory passed to every tokenizer created by this factory. */
        protected LexedTokenFactory<T> factory;

        /** Options string passed to every tokenizer created by this factory. */
        protected String options;

        /**
         * Creates a factory that produces {@link Word} tokens with an empty options string.
         *
         * @return a new factory
         */
        public static TokenizerFactory<Word> newTokenizerFactory() {
            return newPTBTokenizerFactory(new WordTokenFactory(), "");
        }

        /**
         * Creates a factory that produces {@link Word} tokens with {@code ptb3Escaping=true}.
         *
         * @param z when {@code true}, {@code ",tokenizeNLs"} is appended to the options
         * @return a new factory
         */
        public static PTBTokenizerFactory<Word> newPTBTokenizerFactory(boolean z) {
            return new PTBTokenizerFactory<Word>(z, false, false, new WordTokenFactory());
        }

        /**
         * Creates a factory that produces {@link Word} tokens.
         *
         * @param str options string handed unchanged to each tokenizer
         * @return a new factory
         */
        public static PTBTokenizerFactory<Word> newWordTokenizerFactory(String str) {
            return new PTBTokenizerFactory<Word>(new WordTokenFactory(), str);
        }

        /**
         * Creates a factory that produces {@link CoreLabel} tokens.
         *
         * @param str options string handed unchanged to each tokenizer
         * @return a new factory
         */
        public static PTBTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String str) {
            return new PTBTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), str);
        }

        /**
         * Creates a factory from an arbitrary token factory and options string.
         *
         * @param lexedTokenFactory token factory handed to each tokenizer
         * @param str options string handed unchanged to each tokenizer
         * @param <T> token type
         * @return a new factory
         */
        public static <T extends HasWord> PTBTokenizerFactory<T> newPTBTokenizerFactory(LexedTokenFactory<T> lexedTokenFactory, String str) {
            return new PTBTokenizerFactory<T>(lexedTokenFactory, str);
        }

        /**
         * Creates a factory that produces {@link CoreLabel} tokens with {@code ptb3Escaping=true}.
         *
         * @param z when {@code true}, {@code ",tokenizeNLs"} is appended to the options
         * @param z2 when {@code true}, {@code ",invertible"} is appended to the options
         * @return a new factory
         */
        public static PTBTokenizerFactory<CoreLabel> newPTBTokenizerFactory(boolean z, boolean z2) {
            return new PTBTokenizerFactory<CoreLabel>(z, z2, false, new CoreLabelTokenFactory());
        }

        /**
         * Builds the options string from three flags.
         *
         * @param tokenizeNLs appends {@code ",tokenizeNLs"} when set
         * @param invertible appends {@code ",invertible"} when set
         * @param suppressEscaping selects {@code ptb3Escaping=false} when set, otherwise
         *     {@code ptb3Escaping=true}
         * @param lexedTokenFactory token factory handed to each tokenizer
         */
        private PTBTokenizerFactory(boolean tokenizeNLs, boolean invertible, boolean suppressEscaping, LexedTokenFactory<T> lexedTokenFactory) {
            this.factory = lexedTokenFactory;
            StringBuilder sb = new StringBuilder(suppressEscaping ? "ptb3Escaping=false" : "ptb3Escaping=true");
            if (tokenizeNLs) {
                sb.append(",tokenizeNLs");
            }
            if (invertible) {
                sb.append(",invertible");
            }
            this.options = sb.toString();
        }

        /** Stores the given token factory and options string as-is. */
        private PTBTokenizerFactory(LexedTokenFactory<T> lexedTokenFactory, String str) {
            this.factory = lexedTokenFactory;
            this.options = str;
        }

        /**
         * Returns a tokenizer over {@code reader}; identical to {@link #getTokenizer(Reader)}.
         */
        @Override // edu.stanford.nlp.objectbank.IteratorFromReaderFactory
        public Iterator<T> getIterator(Reader reader) {
            return getTokenizer(reader);
        }

        /**
         * Creates a new tokenizer over {@code reader} using this factory's token factory and the
         * options string current at the time of the call.
         */
        @Override // edu.stanford.nlp.objectbank.TokenizerFactory
        public Tokenizer<T> getTokenizer(Reader reader) {
            return new PTBTokenizer<T>(reader, this.factory, this.options);
        }

        /**
         * Replaces the options string used for tokenizers created after this call.
         */
        @Override // edu.stanford.nlp.objectbank.TokenizerFactory
        public void setOptions(String str) {
            this.options = str;
        }

        // The two constructors below are compiler-generated access bridges recovered by the
        // decompiler. The trailing PTBTokenizerFactory argument is an unused marker. They are
        // retained because code in the enclosing class still calls them.

        /* synthetic */ PTBTokenizerFactory(boolean z, boolean z2, boolean z3, LexedTokenFactory lexedTokenFactory, PTBTokenizerFactory pTBTokenizerFactory) {
            this(z, z2, z3, lexedTokenFactory);
        }

        /* synthetic */ PTBTokenizerFactory(LexedTokenFactory lexedTokenFactory, String str, PTBTokenizerFactory pTBTokenizerFactory) {
            this(lexedTokenFactory, str);
        }
    }

    /**
     * Creates a {@link Word}-producing tokenizer over {@code reader}, passing {@code false} as
     * the flag of {@link #newPTBTokenizer(Reader, boolean)}.
     *
     * @param reader source of the text to tokenize
     * @return a new tokenizer
     */
    public static PTBTokenizer<Word> newPTBTokenizer(Reader reader) {
        final boolean tokenizeNLs = false;
        return newPTBTokenizer(reader, tokenizeNLs);
    }

    /**
     * Creates a {@link Word}-producing tokenizer over {@code reader}.
     *
     * @param reader source of the text to tokenize
     * @param z forwarded as the first flag of the private flag-based constructor; the other two
     *     flags are {@code false}
     * @return a new tokenizer
     */
    public static PTBTokenizer<Word> newPTBTokenizer(Reader reader, boolean z) {
        WordTokenFactory wordFactory = new WordTokenFactory();
        return new PTBTokenizer<Word>(reader, z, false, false, wordFactory);
    }

    /**
     * Creates a {@link CoreLabel}-producing tokenizer over {@code reader}.
     *
     * @param reader source of the text to tokenize
     * @param z forwarded as the first flag of the private flag-based constructor
     * @param z2 forwarded as the second flag of that constructor; the third flag is
     *     {@code false}
     * @return a new tokenizer
     */
    public static PTBTokenizer<CoreLabel> newPTBTokenizer(Reader reader, boolean z, boolean z2) {
        CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
        return new PTBTokenizer<CoreLabel>(reader, z, z2, false, labelFactory);
    }

    /**
     * Creates a tokenizer whose lexer options are assembled from three flags.
     *
     * @param reader source of the text to tokenize
     * @param z appends {@code ",tokenizeNLs"} to the options when {@code true}
     * @param z2 appends {@code ",invertible"} to the options when {@code true}
     * @param z3 selects {@code ptb3Escaping=false} when {@code true}, otherwise
     *     {@code ptb3Escaping=true}
     * @param lexedTokenFactory token factory handed to the lexer
     */
    private PTBTokenizer(Reader reader, boolean z, boolean z2, boolean z3, LexedTokenFactory<T> lexedTokenFactory) {
        String lexerOptions = z3 ? "ptb3Escaping=false" : "ptb3Escaping=true";
        if (z) {
            lexerOptions = lexerOptions + ",tokenizeNLs";
        }
        if (z2) {
            lexerOptions = lexerOptions + ",invertible";
        }
        this.lexer = new PTBLexer(reader, lexedTokenFactory, lexerOptions);
    }

    /**
     * Creates a tokenizer backed by a new {@code PTBLexer} built from the given arguments.
     *
     * @param reader source of the text to tokenize
     * @param lexedTokenFactory token factory handed to the lexer
     * @param str options string handed unchanged to the lexer
     */
    public PTBTokenizer(Reader reader, LexedTokenFactory<T> lexedTokenFactory, String str) {
        PTBLexer ptbLexer = new PTBLexer(reader, lexedTokenFactory, str);
        this.lexer = ptbLexer;
    }

    /**
     * Fetches the next token from the underlying lexer.
     *
     * <p>Any exception thrown by the lexer is deliberately treated as end of input: the
     * {@code nextToken} field is cleared and {@code null} is returned rather than propagating
     * the failure.
     *
     * @return the next token, or {@code null} when the lexer returns {@code null} or throws
     */
    @Override // edu.stanford.nlp.process.AbstractTokenizer
    @SuppressWarnings("unchecked")
    public T getNext() {
        try {
            // The lexer is not generic, so its result must be cast to T (not to HasWord, which
            // is not assignable to T). Presumably the lexer builds its tokens with the
            // LexedTokenFactory<T> it was constructed with, which is what makes this cast safe.
            return (T) this.lexer.next();
        } catch (Exception e) {
            // Best effort: a lexer failure ends the token stream.
            this.nextToken = null;
            return null;
        }
    }

    /**
     * Runs {@code str} through a {@code PTB2TextLexer} and concatenates every piece the lexer
     * returns.
     *
     * <p>If the lexer throws an {@link IOException}, the stack trace is printed and the text
     * converted so far is returned. The try block encloses the whole loop so that a failure
     * ends the conversion instead of being retried indefinitely.
     *
     * @param str text to convert; must not be {@code null}
     * @return the concatenated lexer output
     */
    public static String ptb2Text(String str) {
        StringBuilder sb = new StringBuilder(str.length());
        PTB2TextLexer textLexer = new PTB2TextLexer(new StringReader(str));
        try {
            for (String piece = textLexer.next(); piece != null; piece = textLexer.next()) {
                sb.append(piece);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Converts a single token by padding it with one space on each side, running the padded
     * string through {@link #ptb2Text(String)}, and trimming the result.
     *
     * @param str the token to convert
     * @return the converted, trimmed text
     */
    public static String ptbToken2Text(String str) {
        String padded = " " + str + " ";
        String converted = ptb2Text(padded);
        return converted.trim();
    }

    /**
     * Streams text from {@code reader} through a {@code PTB2TextLexer}, writing each piece the
     * lexer returns to {@code writer}.
     *
     * @param reader source of the text to convert
     * @param writer destination for the converted text; it is neither flushed nor closed here
     * @return the number of pieces written
     * @throws IOException if the lexer or the writer fails
     */
    public static int ptb2Text(Reader reader, Writer writer) throws IOException {
        PTB2TextLexer textLexer = new PTB2TextLexer(reader);
        int written = 0;
        for (String piece = textLexer.next(); piece != null; piece = textLexer.next()) {
            written++;
            writer.write(piece);
        }
        return written;
    }

    /**
     * Converts each input through {@link #ptb2Text(Reader, Writer)} and reports throughput on
     * {@code System.err}.
     *
     * <p>With no input files, text is read from {@code System.in} and written to
     * {@code System.out}. Otherwise each input file is written to {@code System.out} when
     * {@code outputFileList} is {@code null}, or to the output file at the same index.
     *
     * <p>Writers wrapping {@code System.out} are flushed, never closed: closing one would close
     * standard output and silently discard the output for every later input file. Input readers
     * and file writers are closed even when conversion fails.
     *
     * @param inputFileList input file names; may be empty
     * @param outputFileList output file names parallel to {@code inputFileList}, or {@code null}
     * @param charset name of the character encoding for reading and writing
     * @throws IOException if an input cannot be read or an output file cannot be opened
     */
    private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException {
        Timing timing = new Timing();
        int numTokens = 0;
        if (inputFileList.isEmpty()) {
            PrintWriter stdout = new PrintWriter(System.out, true);
            numTokens = ptb2Text(new InputStreamReader(System.in, charset), stdout);
            stdout.flush();
        } else {
            for (int j = 0; j < inputFileList.size(); j++) {
                try (BufferedReader in = IOUtils.readReaderFromString(inputFileList.get(j), charset)) {
                    if (outputFileList == null) {
                        PrintWriter stdout = new PrintWriter(System.out, true);
                        numTokens += ptb2Text(in, stdout);
                        stdout.flush();
                    } else {
                        try (PrintWriter out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true)) {
                            numTokens += ptb2Text(in, out);
                        }
                    }
                }
            }
        }
        System.err.println("PTBTokenizer untokenized " + numTokens + " tokens at " + new DecimalFormat("0.00").format(numTokens / (timing.stop() / 1000.0d)) + " tokens per second.");
    }

    /**
     * Joins the given strings with {@code StringUtils.join} (using that helper's default
     * separator) and converts the result with {@link #ptb2Text(String)}.
     *
     * @param list the strings to join and convert
     * @return the converted text
     */
    public static String ptb2Text(List<String> list) {
        String joined = StringUtils.join(list);
        return ptb2Text(joined);
    }

    /**
     * Collects the {@code word()} of every element, in order, and converts the resulting list
     * with {@link #ptb2Text(List)}.
     *
     * @param list the tokens whose words are converted
     * @return the converted text
     */
    public static String labelList2Text(List<? extends HasWord> list) {
        List<String> words = new ArrayList<String>(list.size());
        for (HasWord token : list) {
            words.add(token.word());
        }
        return ptb2Text(words);
    }

    /**
     * Tokenizes each input with {@code tokReader} and reports throughput on {@code System.err}.
     *
     * <p>With no input files, {@code System.in} is read one line at a time; each line is
     * tokenized on its own and written to {@code System.out}. Otherwise each input file is
     * written to {@code System.out} when {@code outputFileList} is {@code null}, or to the
     * output file at the same index.
     *
     * <p>Writers wrapping {@code System.out} are flushed explicitly, because auto-flush only
     * fires on {@code println} and preserve-lines output is written with {@code print}. They
     * are never closed. Input readers and file writers are closed even when tokenization fails.
     *
     * @param inputFileList input file names; may be empty
     * @param outputFileList output file names parallel to {@code inputFileList}, or {@code null}
     * @param charset name of the character encoding for reading and writing
     * @param parseInsideBegin forwarded to {@code tokReader}; may be {@code null}
     * @param parseInsideEnd forwarded to {@code tokReader}; may be {@code null}
     * @param options tokenizer options string forwarded to {@code tokReader}
     * @param preserveLines forwarded to {@code tokReader}
     * @param dump forwarded to {@code tokReader}
     * @param lowerCase forwarded to {@code tokReader}
     * @throws IOException if an input cannot be read or an output file cannot be opened
     */
    private static void tok(List<String> inputFileList, List<String> outputFileList, String charset, Pattern parseInsideBegin, Pattern parseInsideEnd, String options, boolean preserveLines, boolean dump, boolean lowerCase) throws IOException {
        Timing timing = new Timing();
        int numTokens = 0;
        if (inputFileList.isEmpty()) {
            BufferedReader stdin = new BufferedReader(new InputStreamReader(System.in, charset));
            PrintWriter stdout = new PrintWriter(System.out, true);
            // NOTE(review): readLine() strips the line terminator, so in this mode the
            // tokenizer never sees a newline; with preserveLines set, no line breaks appear to
            // be emitted between input lines. Confirm whether that is intended.
            for (String line = stdin.readLine(); line != null; line = stdin.readLine()) {
                numTokens += tokReader(new StringReader(line), stdout, parseInsideBegin, parseInsideEnd, options, preserveLines, dump, lowerCase);
                stdout.flush();
            }
        } else {
            for (int j = 0; j < inputFileList.size(); j++) {
                try (BufferedReader in = IOUtils.readReaderFromString(inputFileList.get(j), charset)) {
                    if (outputFileList == null) {
                        PrintWriter stdout = new PrintWriter(System.out, true);
                        numTokens += tokReader(in, stdout, parseInsideBegin, parseInsideEnd, options, preserveLines, dump, lowerCase);
                        stdout.flush();
                    } else {
                        try (PrintWriter out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset)), true)) {
                            numTokens += tokReader(in, out, parseInsideBegin, parseInsideEnd, options, preserveLines, dump, lowerCase);
                        }
                    }
                }
            }
        }
        System.err.println("PTBTokenizer tokenized " + numTokens + " tokens at " + new DecimalFormat("0.00").format(numTokens / (timing.stop() / 1000.0d)) + " tokens per second.");
    }

    /**
     * Tokenizes {@code reader} into {@link CoreLabel}s and prints the selected tokens.
     *
     * <p>Printing starts enabled only when {@code parseInsideBegin} is {@code null}. A token
     * whose text fully matches {@code parseInsideBegin} enables printing and a token fully
     * matching {@code parseInsideEnd} disables it; such marker tokens are never printed
     * themselves. Every token, printed or not, is counted.
     *
     * @param reader source of the text to tokenize
     * @param out destination for the printed tokens; it is not flushed or closed here
     * @param parseInsideBegin pattern whose match enables printing, or {@code null}
     * @param parseInsideEnd pattern whose match disables printing, or {@code null}
     * @param options options string for the tokenizer
     * @param preserveLines when {@code false}, each token is printed on its own line; when
     *     {@code true}, tokens are separated by single spaces and a line break is printed for
     *     each {@code PTBLexer.NEWLINE_TOKEN}
     * @param dump when {@code true}, the token's {@code toString()} is printed instead of its
     *     text
     * @param lowerCase when {@code true}, the token text is lower-cased (English locale) and
     *     stored back into the token before matching and printing
     * @return the number of tokens read
     */
    private static int tokReader(Reader reader, PrintWriter out, Pattern parseInsideBegin, Pattern parseInsideEnd, String options, boolean preserveLines, boolean dump, boolean lowerCase) {
        PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(reader, new CoreLabelTokenFactory(), options);
        int numTokens = 0;
        boolean printing = parseInsideBegin == null;
        boolean atLineStart = true;
        while (tokenizer.hasNext()) {
            CoreLabel token = tokenizer.next();
            String text = (String) token.get(CoreAnnotations.TextAnnotation.class);
            if (lowerCase) {
                text = text.toLowerCase(Locale.ENGLISH);
                token.set(CoreAnnotations.TextAnnotation.class, text);
            }
            if (parseInsideBegin != null && parseInsideBegin.matcher(text).matches()) {
                printing = true;
            } else if (parseInsideEnd != null && parseInsideEnd.matcher(text).matches()) {
                printing = false;
            } else if (printing) {
                if (dump) {
                    text = token.toString();
                }
                if (!preserveLines) {
                    out.println(text);
                } else if (PTBLexer.NEWLINE_TOKEN.equals(text)) {
                    atLineStart = true;
                    out.println();
                } else {
                    // Separate tokens on the same line with one space, but do not lead with one.
                    if (atLineStart) {
                        atLineStart = false;
                    } else {
                        out.print(" ");
                    }
                    out.print(text);
                }
            }
            numTokens++;
        }
        return numTokens;
    }

    /**
     * Returns the factory created by {@code PTBTokenizerFactory.newTokenizerFactory()}.
     *
     * @return a new {@link Word} tokenizer factory
     */
    public static TokenizerFactory<Word> factory() {
        TokenizerFactory<Word> wordFactory = PTBTokenizerFactory.newTokenizerFactory();
        return wordFactory;
    }

    /**
     * Creates a tokenizer factory for an arbitrary token type using the flag-based options.
     *
     * @param z forwarded as the first flag of the factory's flag-based constructor; the other
     *     two flags are {@code false}
     * @param lexedTokenFactory token factory handed to each tokenizer
     * @param <T> token type
     * @return a new factory
     */
    public static <T extends HasWord> TokenizerFactory<T> factory(boolean z, LexedTokenFactory<T> lexedTokenFactory) {
        // The nested class's private constructor is accessible from the enclosing class, so no
        // raw type or dummy marker argument is needed.
        return new PTBTokenizerFactory<T>(z, false, false, lexedTokenFactory);
    }

    /**
     * Returns the factory created by
     * {@code PTBTokenizerFactory.newPTBTokenizerFactory(boolean, boolean)}.
     *
     * @param z forwarded as the first argument
     * @param z2 forwarded as the second argument
     * @return a new {@link CoreLabel} tokenizer factory
     */
    public static TokenizerFactory<CoreLabel> factory(boolean z, boolean z2) {
        TokenizerFactory<CoreLabel> labelFactory = PTBTokenizerFactory.newPTBTokenizerFactory(z, z2);
        return labelFactory;
    }

    /**
     * Creates a tokenizer factory from an arbitrary token factory and options string.
     *
     * @param lexedTokenFactory token factory handed to each tokenizer
     * @param str options string handed unchanged to each tokenizer
     * @param <T> token type
     * @return a new factory
     */
    public static <T extends HasWord> TokenizerFactory<T> factory(LexedTokenFactory<T> lexedTokenFactory, String str) {
        // The nested class's private constructor is accessible from the enclosing class, so no
        // raw type or dummy marker argument is needed.
        return new PTBTokenizerFactory<T>(lexedTokenFactory, str);
    }

    /**
     * Command-line entry point: tokenizes (or, with {@code -untok}, untokenizes) the named files,
     * or standard input when no file is given.
     *
     * <p>Leading arguments starting with {@code -} are options:
     * {@code -options <opts>}, {@code -preserveLines}, {@code -lowerCase}, {@code -dump},
     * {@code -ioFileList}, {@code -charset <name>}, {@code -parseInside <regex>}, {@code -untok},
     * and {@code -h}/{@code -help}/{@code --help}. An option that needs a value but is the last
     * argument is reported as an unknown option. Remaining arguments are input files, or, with
     * {@code -ioFileList}, files that list input/output file pairs.
     *
     * @param strArr command-line arguments
     * @throws IOException if a file cannot be read or written
     */
    public static void main(String[] strArr) throws IOException {
        int i = 0;
        String charset = "utf-8";
        Pattern parseInsideBegin = null;
        Pattern parseInsideEnd = null;
        StringBuilder options = new StringBuilder();
        boolean preserveLines = false;
        boolean ioFileList = false;
        boolean dump = false;
        boolean untokenize = false;
        boolean lowerCase = false;
        // startsWith (rather than charAt(0)) keeps an empty-string argument from throwing.
        while (i < strArr.length && strArr[i].startsWith("-")) {
            String arg = strArr[i];
            boolean hasValue = i < strArr.length - 1;
            if ("-options".equals(arg) && hasValue) {
                i++;
                options.append(',');
                options.append(strArr[i]);
            } else if ("-preserveLines".equals(arg)) {
                options.append(",tokenizeNLs");
                preserveLines = true;
            } else if ("-lowerCase".equals(arg)) {
                lowerCase = true;
            } else if ("-dump".equals(arg)) {
                dump = true;
            } else if ("-ioFileList".equals(arg)) {
                ioFileList = true;
            } else if ("-charset".equals(arg) && hasValue) {
                i++;
                charset = strArr[i];
            } else if ("-parseInside".equals(arg) && hasValue) {
                i++;
                try {
                    parseInsideBegin = Pattern.compile("<(?:" + strArr[i] + ")[^>]*?>");
                    parseInsideEnd = Pattern.compile("</(?:" + strArr[i] + ")[^>]*?>");
                } catch (IllegalArgumentException e) {
                    // PatternSyntaxException is an IllegalArgumentException. Fall back to
                    // processing the whole input, but tell the user the option was dropped.
                    System.err.println("Ignoring invalid -parseInside regex: " + strArr[i]);
                    parseInsideBegin = null;
                    parseInsideEnd = null;
                }
            } else if ("-untok".equals(arg)) {
                untokenize = true;
            } else {
                if ("-h".equals(arg) || "-help".equals(arg) || "--help".equals(arg)) {
                    System.err.println("usage: java edu.stanford.nlp.process.PTBTokenizer [options]* filename*");
                    System.err.println("  options: -preserveLines|-lowerCase|-dump|-ioFileList|-charset|-parseInside elementRegex|-options options|-h");
                    return;
                }
                System.err.println("Unknown option: " + arg);
            }
            i++;
        }
        List<String> inputFileList = new ArrayList<String>();
        List<String> outputFileList = null;
        if (ioFileList) {
            outputFileList = new ArrayList<String>();
            readIoFileList(strArr, i, charset, inputFileList, outputFileList);
        } else {
            inputFileList.addAll(Arrays.asList(strArr).subList(i, strArr.length));
        }
        if (untokenize) {
            untok(inputFileList, outputFileList, charset);
        } else {
            tok(inputFileList, outputFileList, charset, parseInsideBegin, parseInsideEnd, options.toString(), preserveLines, dump, lowerCase);
        }
    }

    /**
     * Reads input/output file pairs from the list files {@code listFiles[start..]}. Each
     * non-blank line holds an input file name optionally followed by whitespace and an output
     * file name; a missing output name defaults to the input name plus {@code ".tok"}. Lines
     * are trimmed first and blank lines are skipped, so neither yields an empty file name.
     */
    private static void readIoFileList(String[] listFiles, int start, String charset, List<String> inputFileList, List<String> outputFileList) throws IOException {
        for (int k = start; k < listFiles.length; k++) {
            try (FileInputStream stream = new FileInputStream(listFiles[k]);
                    BufferedReader listReader = new BufferedReader(new InputStreamReader(stream, charset))) {
                for (String line = listReader.readLine(); line != null; line = listReader.readLine()) {
                    String trimmed = line.trim();
                    if (trimmed.isEmpty()) {
                        continue;
                    }
                    String[] fields = trimmed.split("\\s+");
                    inputFileList.add(fields[0]);
                    outputFileList.add(fields.length > 1 ? fields[1] : fields[0] + ".tok");
                }
            }
        }
    }
}
