package slib.sml.sm.core.measures.corpus;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import slib.utils.FileUtils;
import slib.utils.ex.SLIB_Ex_Critic;

/* loaded from: input_file:slib/sml/sm/core/measures/corpus/VocContextMatrixBuilder.class */
/**
 * Fills a {@link Matrix} with word occurrence counts extracted from Lucene
 * {@link Document}s.
 *
 * <p>For every configured document field the stored text is split on whitespace
 * and reduced to its set of distinct tokens. Depending on the {@link MatrixType}:
 * <ul>
 *   <li>{@code WORD_WORD}: for every unordered pair of distinct tokens (restricted
 *       to the vocabulary when one is given) the value 1.0 is added to both
 *       {@code (a, b)} and {@code (b, a)};</li>
 *   <li>{@code WORD_DOC}: for every distinct token the value 1.0 is added to
 *       {@code (token, document)}. The vocabulary is not applied in this mode.</li>
 * </ul>
 */
public class VocContextMatrixBuilder {
    /** One or more whitespace characters; compiled once because it is used for every field of every document. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    Logger logger;
    List<String> docFields;
    Set<String> vocabulary;
    Matrix matrix;
    MatrixType matrixType;

    /**
     * Creates a builder without any document field configured; fields must be
     * registered with {@link #addDocFields(String)} or
     * {@link #setDocFields(Collection)} before {@link #buildMatrix(Iterable)} is called.
     *
     * @param matrixType kind of matrix to build
     * @param vocabulary words to restrict the word/word co-occurrences to, or
     *                   {@code null} to accept every token
     */
    public VocContextMatrixBuilder(MatrixType matrixType, Set<String> vocabulary) {
        this.logger = LoggerFactory.getLogger(getClass());
        this.matrixType = matrixType;
        this.vocabulary = vocabulary;
        this.docFields = new ArrayList<String>();
        switch (matrixType) {
            case WORD_WORD:
            case WORD_DOC:
                this.matrix = new Matrix();
                break;
            default:
                // Any other matrix type leaves the matrix unset.
                break;
        }
        this.logger.info("Matrix Builder: {}", this.matrixType);
    }

    /**
     * @return the matrix populated so far by {@link #process(Document)}
     */
    public Matrix getMatrix() {
        return this.matrix;
    }

    /**
     * Creates a builder and registers the given document fields.
     *
     * @param matrixType kind of matrix to build
     * @param vocabulary words to restrict the word/word co-occurrences to, or
     *                   {@code null} to accept every token
     * @param docFields  names of the document fields to analyse
     */
    public VocContextMatrixBuilder(MatrixType matrixType, Set<String> vocabulary, List<String> docFields) {
        this(matrixType, vocabulary);
        setDocFields(docFields);
    }

    /**
     * @return the live list of field names analysed in each document
     */
    public List<String> getDocFields() {
        return this.docFields;
    }

    /**
     * Registers every field name of the given collection; names that are
     * already registered are ignored. Previously registered names are kept.
     *
     * @param fields names of the document fields to analyse
     */
    public final void setDocFields(Collection<String> fields) {
        for (String field : fields) {
            addDocFields(field);
        }
    }

    /**
     * Registers a field name unless it is already registered.
     *
     * @param field name of the document field to analyse
     */
    public void addDocFields(String field) {
        if (!this.docFields.contains(field)) {
            this.docFields.add(field);
        }
    }

    /**
     * Processes every document of the given source in iteration order.
     *
     * @param documents documents to analyse
     * @throws SLIB_Ex_Critic if no document field has been registered
     */
    public void buildMatrix(Iterable<Document> documents) throws SLIB_Ex_Critic {
        this.logger.info("Building voc-context matrix");
        this.logger.info("voc: {}", this.vocabulary);
        this.logger.info("fields: {}", this.docFields);
        if (this.docFields.isEmpty()) {
            throw new SLIB_Ex_Critic("Please specify a field to analyse in the given documents");
        }
        for (Document document : documents) {
            process(document);
        }
    }

    /**
     * Adds the occurrences found in the registered fields of one document to
     * the matrix. A field that is missing from the document, or that carries no
     * string value, is skipped with a warning; the remaining fields are still
     * processed.
     *
     * @param document document to analyse
     * @throws SLIB_Ex_Critic declared for callers; not thrown directly by this method
     */
    public void process(Document document) throws SLIB_Ex_Critic {
        this.logger.debug("processing doc: {}", document);
        for (String field : this.docFields) {
            // Document.get yields null both when the field is absent and when it
            // only holds a non-string (binary) value, so one check covers both.
            String text = document.get(field);
            this.logger.info("{} : {}", field, text);
            if (text == null) {
                this.logger.warn("Skip field {}", field);
                continue;
            }
            Set<String> tokens = tokenize(text);
            switch (this.matrixType) {
                case WORD_WORD:
                    addWordCooccurrences(tokens);
                    break;
                case WORD_DOC:
                    addDocumentOccurrences(tokens, document);
                    break;
                default:
                    break;
            }
        }
    }

    /** Splits the text on whitespace runs and returns its distinct, non-empty tokens. */
    private static Set<String> tokenize(String text) {
        Set<String> tokens = new HashSet<String>();
        for (String token : WHITESPACE.split(text)) {
            // Leading whitespace yields one empty leading token; it is not a word.
            if (!token.isEmpty()) {
                tokens.add(token);
            }
        }
        return tokens;
    }

    /** Adds 1.0 to both (a, b) and (b, a) for every unordered pair of accepted tokens. */
    private void addWordCooccurrences(Set<String> tokens) {
        // Filter by vocabulary before pairing so the quadratic loop only runs
        // over the words that can actually contribute.
        List<String> words = new ArrayList<String>(tokens.size());
        for (String token : tokens) {
            if (this.vocabulary == null || this.vocabulary.contains(token)) {
                words.add(token);
            }
        }
        for (int i = 0; i < words.size(); i++) {
            String first = words.get(i);
            for (int j = i + 1; j < words.size(); j++) {
                String second = words.get(j);
                this.matrix.addValue(first, second, 1.0d);
                this.matrix.addValue(second, first, 1.0d);
            }
        }
    }

    /** Adds 1.0 to (token, document) for every distinct token; the vocabulary is not consulted. */
    private void addDocumentOccurrences(Set<String> tokens, Document document) {
        for (String token : tokens) {
            this.matrix.addValue(token, document, 1.0d);
        }
    }

    /**
     * Demo entry point: builds a word/word matrix from the {@code txt} files of
     * a directory (at most 100000 files are requested) and prints, for every
     * row key, the size of its row and the row itself.
     *
     * @param args optional; {@code args[0]} is the corpus directory, defaulting
     *             to {@code /data/tmp/wiki/}
     * @throws SLIB_Ex_Critic propagated from the builder
     * @throws IOException    if a file cannot be read
     */
    public static void main(String[] args) throws SLIB_Ex_Critic, IOException {
        String corpusDir = (args != null && args.length > 0) ? args[0] : "/data/tmp/wiki/";
        List<File> files = FileUtils.listFilesForFolder(corpusDir, Arrays.asList("txt"), 100000);

        List<String> fields = new ArrayList<String>();
        fields.add("content");
        Set<String> vocabulary = new HashSet<String>(Arrays.asList(
                "lion", "panthera", "Africa", "lamb", "insecticides", "animal",
                "Genealogists", "rugby", "football", "Sydney", "Australia"));
        VocContextMatrixBuilder builder = new VocContextMatrixBuilder(MatrixType.WORD_WORD, vocabulary, fields);

        for (File file : files) {
            Document document = new Document();
            document.add(new StoredField("content", FileUtils.readFile(file.getAbsolutePath(), Charset.defaultCharset())));
            builder.process(document);
        }

        Matrix matrix = builder.getMatrix();
        System.out.println("size: " + matrix.getInternalStorage().keySet().size());
        for (String word : matrix.getInternalStorage().keySet()) {
            System.out.println(word + "\t(" + ((Map<?, ?>) matrix.getInternalStorage().get(word)).size() + ")\t" + matrix.getInternalStorage().get(word));
        }
    }
}
