package org.elasticsearch.xpack.ml.inference.nlp.tokenizers;

import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.xpack.core.ml.inference.trainedmodel.Tokenization;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.MPNetTokenizationResult;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.TokenizationResult;

/**
 * Tokenizer for MPNet models.
 *
 * <p>This is a {@link BertTokenizer} configured with the MPNet special-token strings
 * ({@code <s>}, {@code </s>}, {@code <pad>}, {@code <mask>}, {@code [UNK]}) and producing
 * {@link MPNetTokenizationResult} instances. Instances are created through
 * {@link #mpBuilder(List, Tokenization)}.
 */
public class MPNetTokenizer extends BertTokenizer {
    // Special-token strings handed to the BertTokenizer constructor.
    public static final String UNKNOWN_TOKEN = "[UNK]";
    public static final String SEPARATOR_TOKEN = "</s>";
    public static final String PAD_TOKEN = "<pad>";
    public static final String CLASS_TOKEN = "<s>";
    public static final String MASK_TOKEN = "<mask>";

    /** Tokens always added to the caller-supplied never-split set. */
    private static final Set<String> NEVER_SPLIT = Set.of(MASK_TOKEN);

    /**
     * Fluent builder for {@link MPNetTokenizer}.
     *
     * <p>{@code doLowerCase}, {@code withSpecialTokens} and {@code maxSequenceLength} are seeded
     * from the {@link Tokenization} passed to the constructor and may be overridden with the
     * setters before {@link #build()} is called.
     */
    public static class Builder {
        protected final List<String> originalVocab;
        protected final SortedMap<String, Integer> vocab;
        protected boolean doLowerCase;
        protected boolean withSpecialTokens;
        protected int maxSequenceLength;
        protected Set<String> neverSplit;
        protected boolean doTokenizeCjKChars = true;
        // null means "not set"; build() then falls back to the value of doLowerCase.
        protected Boolean doStripAccents = null;

        /**
         * @param vocab        vocabulary tokens; a token's position in the list becomes its id
         * @param tokenization source of the initial lower-casing, special-token and
         *                     max-sequence-length settings
         */
        protected Builder(List<String> vocab, Tokenization tokenization) {
            this.originalVocab = vocab;
            this.vocab = buildSortedVocab(vocab);
            this.doLowerCase = tokenization.doLowerCase();
            this.withSpecialTokens = tokenization.withSpecialTokens();
            this.maxSequenceLength = tokenization.maxSequenceLength();
        }

        /**
         * Builds a sorted token-to-index map from the vocabulary list.
         *
         * <p>If a token occurs more than once, the index of its last occurrence is kept.
         *
         * @param vocab vocabulary tokens in id order
         * @return map from each token to its position in {@code vocab}
         */
        private static SortedMap<String, Integer> buildSortedVocab(List<String> vocab) {
            SortedMap<String, Integer> sortedVocab = new TreeMap<>();
            // Iterate rather than call get(i) so non-random-access lists are not walked repeatedly.
            int index = 0;
            for (String token : vocab) {
                sortedVocab.put(token, index);
                index++;
            }
            return sortedVocab;
        }

        /** Sets the lower-casing flag passed to the tokenizer. */
        public Builder setDoLowerCase(boolean doLowerCase) {
            this.doLowerCase = doLowerCase;
            return this;
        }

        /** Sets the CJK-character tokenization flag passed to the tokenizer (defaults to {@code true}). */
        public Builder setDoTokenizeCjKChars(boolean doTokenizeCjKChars) {
            this.doTokenizeCjKChars = doTokenizeCjKChars;
            return this;
        }

        /**
         * Sets the accent-stripping flag passed to the tokenizer.
         *
         * @param doStripAccents the flag, or {@code null} to have {@link #build()} use the
         *                       lower-casing flag instead
         */
        public Builder setDoStripAccents(Boolean doStripAccents) {
            this.doStripAccents = doStripAccents;
            return this;
        }

        /**
         * Sets the tokens that must never be split.
         *
         * @param neverSplit the tokens, or {@code null} to have {@link #build()} use an empty set;
         *                   {@code <mask>} is always added by the tokenizer constructor
         */
        public Builder setNeverSplit(Set<String> neverSplit) {
            this.neverSplit = neverSplit;
            return this;
        }

        /** Sets the maximum sequence length passed to the tokenizer. */
        public Builder setMaxSequenceLength(int maxSequenceLength) {
            this.maxSequenceLength = maxSequenceLength;
            return this;
        }

        /** Sets the special-tokens flag passed to the tokenizer. */
        public Builder setWithSpecialTokens(boolean withSpecialTokens) {
            this.withSpecialTokens = withSpecialTokens;
            return this;
        }

        /**
         * Creates the tokenizer from the current builder state.
         *
         * <p>Unset values are resolved first: {@code doStripAccents} falls back to
         * {@code doLowerCase} and {@code neverSplit} to an empty set. The resolved values are
         * written back to the builder's fields, so they remain in effect for later calls.
         *
         * @return a new {@link MPNetTokenizer}
         */
        public MPNetTokenizer build() {
            if (this.doStripAccents == null) {
                this.doStripAccents = this.doLowerCase;
            }
            if (this.neverSplit == null) {
                this.neverSplit = Collections.emptySet();
            }
            return new MPNetTokenizer(
                this.originalVocab,
                this.vocab,
                this.doLowerCase,
                this.doTokenizeCjKChars,
                this.doStripAccents,
                this.withSpecialTokens,
                this.maxSequenceLength,
                this.neverSplit
            );
        }
    }

    /**
     * Creates the tokenizer; arguments are forwarded to {@link BertTokenizer} together with the
     * MPNet special-token constants.
     *
     * @param originalVocab      vocabulary tokens in id order
     * @param vocab              sorted token-to-id map
     * @param doLowerCase        lower-casing flag
     * @param doTokenizeCjKChars CJK-character tokenization flag
     * @param doStripAccents     accent-stripping flag
     * @param withSpecialTokens  special-tokens flag
     * @param maxSequenceLength  maximum sequence length
     * @param neverSplit         tokens that must not be split; {@code <mask>} is added to this set
     */
    protected MPNetTokenizer(
        List<String> originalVocab,
        SortedMap<String, Integer> vocab,
        boolean doLowerCase,
        boolean doTokenizeCjKChars,
        boolean doStripAccents,
        boolean withSpecialTokens,
        int maxSequenceLength,
        Set<String> neverSplit
    ) {
        super(
            originalVocab,
            vocab,
            doLowerCase,
            doTokenizeCjKChars,
            doStripAccents,
            withSpecialTokens,
            maxSequenceLength,
            Sets.union(neverSplit, NEVER_SPLIT),
            SEPARATOR_TOKEN,
            CLASS_TOKEN,
            PAD_TOKEN,
            MASK_TOKEN,
            UNKNOWN_TOKEN
        );
    }

    /**
     * Overrides the hook inherited from {@link BertTokenizer} / {@code NlpTokenizer}.
     *
     * <p>NOTE(review): 4 presumably corresponds to the MPNet pair layout
     * {@code <s> A </s></s> B </s>} - confirm against {@code MPNetTokensBuilder}.
     *
     * @return always 4
     */
    @Override
    protected int getNumExtraTokensForSeqPair() {
        return 4;
    }

    /**
     * Creates the MPNet-specific tokens builder.
     *
     * <p>The arguments are forwarded to {@code MPNetTokensBuilder} with the boolean first,
     * followed by the two ints in their original order.
     * NOTE(review): the parameter names are decompiler placeholders; their meaning is defined by
     * the overridden method and should be confirmed there.
     */
    @Override
    TokenizationResult.TokensBuilder createTokensBuilder(int i, int i2, boolean z) {
        return new MPNetTokenizationResult.MPNetTokensBuilder(z, i, i2);
    }

    /**
     * Wraps the given tokens in an {@link MPNetTokenizationResult} together with the original
     * vocabulary and the pad token id.
     *
     * <p>Fails via {@code orElseThrow()} when {@code getPadTokenId()} yields no value.
     *
     * @param tokenizations the tokens to wrap
     * @return the tokenization result
     */
    @Override
    public TokenizationResult buildTokenizationResult(List<TokenizationResult.Tokens> tokenizations) {
        return new MPNetTokenizationResult(this.originalVocab, tokenizations, getPadTokenId().orElseThrow());
    }

    /**
     * Entry point for constructing an {@link MPNetTokenizer}.
     *
     * @param vocab        vocabulary tokens in id order
     * @param tokenization settings used to seed the builder
     * @return a new {@link Builder}
     */
    public static Builder mpBuilder(List<String> vocab, Tokenization tokenization) {
        return new Builder(vocab, tokenization);
    }
}
