package dkpro.similarity.experiments.rte.util;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.gate.GateLemmatizer;
import de.tudarmstadt.ukp.dkpro.core.opennlp.OpenNlpPosTagger;
import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter;
import dkpro.similarity.experiments.rte.Pipeline;
import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

/* loaded from: input_file:dkpro/similarity/experiments/rte/util/WordIdfValuesGenerator.class */
/**
 * Computes inverse-document-frequency (idf) values for the lemmas found in a
 * dataset's plaintext files and stores them in a tab-separated text file.
 *
 * <p>Input files are read from {@code target/utils/plaintexts/<dataset>/*.txt};
 * the result is written to {@code target/utils/word-idf/<dataset>.txt}, one
 * {@code lemma<TAB>idf} pair per line.
 */
public class WordIdfValuesGenerator {
    /** Platform line separator used to terminate each line of the idf file. */
    static final String LF = System.getProperty("line.separator");

    /** Directory prefix under which the per-dataset plaintext folders are expected. */
    private static final String PLAINTEXT_DIR = "target/utils/plaintexts/";

    /** Directory prefix under which the per-dataset idf files are written. */
    private static final String IDF_DIR = "target/utils/word-idf/";

    /**
     * Computes {@code log10(N / df)} for every lower-cased lemma occurring in
     * the dataset's plaintext files, where {@code N} is the number of files and
     * {@code df} the number of files containing the lemma, and writes the
     * values to the dataset's idf file.
     *
     * <p>If the idf file already exists, nothing is computed.
     *
     * @param dataset the dataset whose common name selects the input directory
     *                and the output file
     * @throws Exception if reading the input, running the lemmatization
     *                   pipeline, or writing the output fails
     */
    public static void computeIdfScores(Pipeline.Dataset dataset) throws Exception {
        String datasetName = RteUtil.getCommonDatasetName(dataset);
        File idfFile = new File(IDF_DIR + datasetName + ".txt");
        System.out.println("Computing word idf values");
        if (idfFile.exists()) {
            System.out.println(" - skipping, already exists");
            return;
        }
        System.out.println(" - this may take a while...");

        Collection<File> textFiles = FileUtils.listFiles(new File(PLAINTEXT_DIR + datasetName), new String[]{"txt"}, false);
        Map<String, Integer> documentFrequencies = countDocumentFrequencies(textFiles);

        // N is the number of files; every file contributes at most once to a
        // lemma's document frequency, so N / df is always >= 1.
        int documentCount = textFiles.size();
        StringBuilder output = new StringBuilder();
        for (Map.Entry<String, Integer> entry : documentFrequencies.entrySet()) {
            double idf = Math.log10(documentCount / entry.getValue().doubleValue());
            output.append(entry.getKey()).append("\t").append(idf).append(LF);
        }
        FileUtils.writeStringToFile(idfFile, output.toString());
        System.out.println(" - done");
    }

    /**
     * Counts, for every distinct lower-cased lemma, in how many of the given
     * files it occurs. Each file is counted individually, even if two files
     * have identical content.
     *
     * @param textFiles the plaintext files to analyze
     * @return a map from lemma to the number of files containing it
     * @throws Exception if a file cannot be read or the pipeline fails
     */
    private static Map<String, Integer> countDocumentFrequencies(Collection<File> textFiles) throws Exception {
        Map<String, Integer> documentFrequencies = new HashMap<String, Integer>();
        if (textFiles.isEmpty()) {
            // Avoid building the (expensive) pipeline when there is nothing to process.
            return documentFrequencies;
        }

        // The pipeline is built once and reused for all files.
        AnalysisEngine engine = createLemmatizerEngine();
        try {
            JCas jcas = engine.newJCas();
            for (File textFile : textFiles) {
                String text = FileUtils.readFileToString(textFile);
                for (String lemma : extractDistinctLemmas(engine, jcas, text)) {
                    Integer seen = documentFrequencies.get(lemma);
                    documentFrequencies.put(lemma, seen == null ? 1 : seen + 1);
                }
            }
        } finally {
            engine.destroy();
        }
        return documentFrequencies;
    }

    /**
     * Runs the pipeline on the given text and returns the distinct lower-cased
     * lemma values it produced. Lemma annotations without a value are reported
     * on {@code System.err} and skipped.
     *
     * @param engine the lemmatization pipeline
     * @param jcas   a CAS created by {@code engine}; it is reset before use
     * @param text   the document text to analyze
     * @return the set of lower-cased lemma values found in the text
     * @throws Exception if processing fails
     */
    private static Set<String> extractDistinctLemmas(AnalysisEngine engine, JCas jcas, String text) throws Exception {
        jcas.reset();
        jcas.setDocumentText(text);
        engine.process(jcas);

        Set<String> lemmas = new HashSet<String>();
        for (Lemma lemma : JCasUtil.select(jcas, Lemma.class)) {
            String value = lemma.getValue();
            if (value == null) {
                System.err.println(" - unparsable token: " + lemma.getCoveredText());
                continue;
            }
            lemmas.add(value.toLowerCase());
        }
        return lemmas;
    }

    /**
     * Builds the aggregate analysis engine: segmenter, then POS tagger
     * (configured with language "en"), then lemmatizer.
     *
     * @return the instantiated aggregate engine
     * @throws Exception if a component cannot be created
     */
    private static AnalysisEngine createLemmatizerEngine() throws Exception {
        AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
        AnalysisEngineDescription posTagger = AnalysisEngineFactory.createEngineDescription(OpenNlpPosTagger.class, "language", "en");
        AnalysisEngineDescription lemmatizer = AnalysisEngineFactory.createEngineDescription(GateLemmatizer.class);
        AnalysisEngineDescription aggregate = AnalysisEngineFactory.createEngineDescription(segmenter, posTagger, lemmatizer);
        return AnalysisEngineFactory.createEngine(aggregate);
    }
}
