package com.code972.hebmorph;

import com.code972.hebmorph.datastructures.DictRadix;
import com.code972.hebmorph.hspell.Constants;
import com.code972.hebmorph.hspell.LingInfo;
import java.io.IOException;
import java.io.Reader;

/* loaded from: input_file:com/code972/hebmorph/Tokenizer.class */
/**
 * Reads characters from a {@link Reader} and cuts them into tokens, with special handling for
 * Hebrew letters, niqqud, geresh/gershayim marks and the makaf (hyphen).
 *
 * <p>Each call to {@link #nextToken(Reference)} yields one token and a bit mask of
 * {@link TokenType} flags describing it. The instance keeps read-ahead state in an internal
 * buffer and is not synchronized.
 */
public class Tokenizer {
    private static final int IO_BUFFER_SIZE = 4096;

    /** Apostrophe plus the look-alike characters that are treated as a geresh. */
    public static final char[] Geresh = {'\'', '\u05F3', '\u2018', '\u2019', '\u201B'};

    /** Double quote plus the look-alike characters that are treated as gershayim. */
    public static final char[] Gershayim =
            {'"', '\u05F4', '\u201C', '\u201D', '\u201F', '\u275E', '\uFF02'};

    /** Hyphen-minus plus the dash-like characters that are treated as a makaf. */
    public static final char[] Makaf = {'-', '\u2012', '\u2013', '\u2014', '\u2015', '\u05BE'};

    /** Union of {@link #Geresh}, {@link #Gershayim} and {@link #Makaf}, in that order. */
    public static final char[] CharsFollowingPrefixes =
            concatenateCharArrays(Geresh, Gershayim, Makaf);

    /** Letters after which a trailing geresh is kept on a Hebrew token (U+05D6, 05D2, 05E5, 05E6, 05D7). */
    public static final char[] LettersAcceptingGeresh =
            {'\u05D6', '\u05D2', '\u05E5', '\u05E6', '\u05D7'};

    /** Placeholder value stored with every special case; only the key's presence is used here. */
    private static final Byte DUMMY_DATA = Byte.valueOf((byte) 0);

    private Reader input;
    /** Number of valid characters currently held in {@link #ioBuffer}. */
    private int dataLen;
    /** Offset in the source stream of the first character of {@link #ioBuffer}. */
    private int inputOffset;
    private int tokenOffset;
    private int tokenLengthInSource;
    private Character suffixForExactMatch;
    private final DictRadix<Integer> hebrewPrefixes;
    private final DictRadix<Byte> specialCases;
    private final char[] ioBuffer;
    /** Index in {@link #ioBuffer} of the next character to consume. */
    private int ioBufferIndex;
    private final char[] wordBuffer;

    /**
     * Bit flags that are OR-ed together to form the return value of
     * {@link Tokenizer#nextToken(Reference)}.
     */
    public static class TokenType {
        public static int Hebrew = 1;
        public static int NonHebrew = 2;
        public static int Numeric = 4;
        public static int Mixed = 8;
        public static int Construct = 16;
        public static int Acronym = 32;
        public static int Exact = 64;
        // NOTE(review): the aliasing to DMask.D_DOUBLE looks like a decompiler constant
        // substitution; confirm the intended literal for this flag.
        public static int Custom = Constants.DMask.D_DOUBLE;
    }

    /**
     * Tells whether {@code c} occurs in {@code chars}.
     *
     * @param c the character to look for
     * @param chars the candidate characters
     * @return {@code true} if any element of {@code chars} equals {@code c}
     */
    public static boolean isOfChars(char c, char[] chars) {
        for (char candidate : chars) {
            if (candidate == c) {
                return true;
            }
        }
        return false;
    }

    /**
     * Joins the given arrays into one newly allocated array, preserving order.
     *
     * @param arrays the arrays to join
     * @return a new array holding the contents of all inputs back to back
     */
    public static char[] concatenateCharArrays(char[]... arrays) {
        int total = 0;
        for (char[] array : arrays) {
            total += array.length;
        }
        char[] joined = new char[total];
        int position = 0;
        for (char[] array : arrays) {
            System.arraycopy(array, 0, joined, position, array.length);
            position += array.length;
        }
        return joined;
    }

    /** Returns {@code true} for characters in the range U+05D0..U+05EA. */
    public static boolean isHebrewLetter(char c) {
        return c >= '\u05D0' && c <= '\u05EA';
    }

    /** Returns {@code true} for the five final-form letters U+05E3, U+05DA, U+05DD, U+05E5, U+05DF. */
    public static boolean isFinalHebrewLetter(char c) {
        return c == '\u05E3' || c == '\u05DA' || c == '\u05DD' || c == '\u05E5' || c == '\u05DF';
    }

    /** Returns {@code true} for U+05B0..U+05B9, U+05BB, U+05BC, U+05C1 and U+05C2. */
    public static boolean isNiqqudChar(char c) {
        return (c >= '\u05B0' && c <= '\u05B9')
                || c == '\u05C1' || c == '\u05C2' || c == '\u05BB' || c == '\u05BC';
    }

    /** Returns the source offset recorded for the most recent token. */
    public final int getOffset() {
        return this.tokenOffset;
    }

    public void setOffset(int offset) {
        this.tokenOffset = offset;
    }

    /** Returns the number of source characters recorded for the most recent token. */
    public int getLengthInSource() {
        return this.tokenLengthInSource;
    }

    public void setLengthInSource(int length) {
        this.tokenLengthInSource = length;
    }

    /** Returns the character that marks a token as {@link TokenType#Exact}, or {@code null}. */
    public Character getSuffixForExactMatch() {
        return this.suffixForExactMatch;
    }

    /**
     * Sets the character which, when it terminates a token, flags that token as
     * {@link TokenType#Exact}. Pass {@code null} to disable.
     */
    public void setSuffixForExactMatch(Character suffix) {
        this.suffixForExactMatch = suffix;
    }

    /**
     * Registers a string that should be kept as one token even though it contains characters
     * that would normally end a token.
     *
     * @param specialCase the string to register
     */
    public void addSpecialCase(String specialCase) {
        this.specialCases.addNode(specialCase, DUMMY_DATA);
    }

    /**
     * Tells whether the first {@code length} characters of {@code chars} are a key of
     * {@code prefixes}.
     *
     * @param chars buffer holding the candidate prefix at index 0
     * @param length number of characters to test
     * @param prefixes the prefix tree to consult
     * @return {@code true} if the lookup succeeds, {@code false} if it throws
     *     {@link IllegalArgumentException}
     */
    public static boolean isLegalPrefix(char[] chars, int length, DictRadix<Integer> prefixes) {
        try {
            prefixes.lookup(chars, 0, length, false);
            return true;
        } catch (IllegalArgumentException ignored) {
            // The tree reports a missing key by throwing.
            return false;
        }
    }

    /** Creates a tokenizer over {@code reader} with an empty set of special cases. */
    public Tokenizer(Reader reader) {
        this(reader, null);
    }

    /**
     * Creates a tokenizer over {@code reader}.
     *
     * @param reader the character source
     * @param specialCases tree of special-case strings, or {@code null} to start with an empty one
     */
    public Tokenizer(Reader reader, DictRadix<Byte> specialCases) {
        this.dataLen = 0;
        this.inputOffset = 0;
        this.tokenOffset = 0;
        this.tokenLengthInSource = 0;
        this.suffixForExactMatch = null;
        this.ioBuffer = new char[IO_BUFFER_SIZE];
        this.ioBufferIndex = 0;
        this.wordBuffer = new char[Constants.MaxWordLength];
        this.input = reader;
        this.specialCases = specialCases != null ? specialCases : new DictRadix<>(false);
        this.hebrewPrefixes = LingInfo.buildPrefixTree(false);
    }

    /** Same as {@link #isRecognizedException(char[], int)} for the token extended by {@code next}. */
    private boolean isRecognizedException(char[] token, int length, char next) {
        char[] extended = new char[length + 1];
        System.arraycopy(token, 0, extended, 0, length);
        extended[length] = next;
        return isRecognizedException(extended, length + 1);
    }

    /**
     * Tells whether the first {@code length} characters of {@code token} match a special case,
     * either as a whole or after stripping a legal Hebrew prefix.
     *
     * <p>NOTE(review): reconstructed from decompiler output that did not compile (an int was
     * compared against {@code true} and a statement followed the final return). The
     * prefix-stripping order below is the most plausible reading; confirm against the
     * original sources.
     */
    private boolean isRecognizedException(char[] token, int length) {
        int prefixLength = 0;
        while (prefixLength < length
                && isHebrewLetter(token[prefixLength])
                && isLegalPrefix(token, prefixLength + 1, this.hebrewPrefixes)) {
            prefixLength++;
            if (prefixLength < length
                    && isSpecialCase(token, prefixLength, length - prefixLength)) {
                return true;
            }
        }
        return isSpecialCase(token, 0, length);
    }

    /** Looks up {@code count} characters starting at {@code offset} in the special-case tree. */
    private boolean isSpecialCase(char[] chars, int offset, int count) {
        try {
            this.specialCases.lookup(chars, offset, count, true);
            return true;
        } catch (IllegalArgumentException ignored) {
            // The tree reports a missing key by throwing.
            return false;
        }
    }

    /**
     * Reads the next token from the input.
     *
     * <p>On return, {@link #getOffset()} and {@link #getLengthInSource()} describe where the
     * token was found. Tokens longer than the word buffer are clipped.
     *
     * @param reference receives the token text in its {@code ref} field; set to the empty
     *     string when no token is returned
     * @return a bit mask of {@link TokenType} flags, or {@code 0} when the input is exhausted
     * @throws IOException if reading from the underlying reader fails
     */
    public int nextToken(Reference<String> reference) throws IOException {
        int length = 0;
        int tokenType = 0;
        // Token length at the moment a special-case ("custom") match started, for fallback.
        int lengthBeforeCustom = -1;
        // The character that ended the token; stays '\0' when the input ended instead.
        char c = '\0';
        this.tokenOffset = 0;

        while (true) {
            if (this.ioBufferIndex >= this.dataLen) {
                this.inputOffset += this.dataLen;
                this.dataLen = this.input.read(this.ioBuffer, 0, this.ioBuffer.length);
                if (this.dataLen <= 0) {
                    this.dataLen = 0; // keeps the next "inputOffset += dataLen" from going backwards
                    boolean nothingPending = length <= 0;
                    if (nothingPending
                            || ((tokenType & TokenType.Custom) > 0
                                    && !isRecognizedException(this.wordBuffer, length))) {
                        return endOfInput(reference);
                    }
                    // Emit the pending token. Without this break the loop would go on to
                    // read past the valid part of the I/O buffer.
                    c = '\0';
                    break;
                }
                this.ioBufferIndex = 0;
            }

            c = this.ioBuffer[this.ioBufferIndex++];
            boolean append = false;

            if (length == 0) {
                // First character decides the token family; anything else is skipped.
                if (isHebrewLetter(c)) {
                    if (!isFinalHebrewLetter(c)) {
                        tokenType |= TokenType.Hebrew;
                        append = true;
                    }
                } else if (Character.isLetterOrDigit(c)) {
                    tokenType |= TokenType.NonHebrew;
                    if (Character.isDigit(c)) {
                        tokenType |= TokenType.Numeric;
                    }
                    append = true;
                }
            } else if ((tokenType & TokenType.Custom) > 0 && !Character.isSpaceChar(c)) {
                // Inside a special-case match: keep going only while it still matches.
                this.wordBuffer[length] = c;
                if (!isRecognizedException(this.wordBuffer, length + 1)) {
                    tokenType &= ~TokenType.Custom;
                    length = lengthBeforeCustom;
                    this.ioBufferIndex--; // only the offending character is pushed back
                    break;
                }
                append = true;
            } else if (isHebrewLetter(c) || isNiqqudChar(c)) {
                append = true;
            } else if (Character.isLetterOrDigit(c)) {
                if (tokenType == TokenType.Hebrew) {
                    tokenType |= TokenType.Mixed;
                }
                append = true;
            } else if (isOfChars(c, Gershayim)) {
                c = '"';
                char previous = this.wordBuffer[length - 1];
                if (!isHebrewLetter(previous) && !isNiqqudChar(previous)) {
                    break;
                }
                tokenType |= TokenType.Acronym;
                append = true;
            } else if (isOfChars(c, Geresh)) {
                c = '\'';
                char previous = this.wordBuffer[length - 1];
                if ((tokenType & TokenType.Hebrew) > 0
                        && !isHebrewLetter(previous)
                        && !isNiqqudChar(previous)
                        && !isOfChars(previous, Geresh)) {
                    break;
                }
                append = true;
            } else {
                if (isSuffixForExactMatch(c)
                        || Character.isSpaceChar(c)
                        || !isRecognizedException(this.wordBuffer, length, c)) {
                    break;
                }
                lengthBeforeCustom = length;
                tokenType |= TokenType.Custom;
                append = true;
            }

            if (append) {
                if (length == 0) {
                    this.tokenOffset = (this.inputOffset + this.ioBufferIndex) - 1;
                } else if (length == this.wordBuffer.length - 1) {
                    continue; // clip over-long tokens instead of overrunning the word buffer
                }
                if (isOfChars(c, Geresh) && this.wordBuffer[length - 1] == c) {
                    // Two consecutive geresh marks are folded into one gershayim.
                    this.wordBuffer[length - 1] = '"';
                    tokenType |= TokenType.Acronym;
                } else {
                    this.wordBuffer[length++] = c;
                }
            }
        }

        if (isOfChars(c, Makaf)) {
            tokenType |= TokenType.Construct;
        } else if (isSuffixForExactMatch(c)) {
            tokenType |= TokenType.Exact;
        }

        if (this.dataLen <= 0) {
            this.tokenLengthInSource = Math.max(this.inputOffset - this.tokenOffset, 0);
        } else {
            this.tokenLengthInSource =
                    Math.max(((this.inputOffset + this.ioBufferIndex) - 1) - this.tokenOffset, 0);
        }

        // Drop a trailing gershayim.
        if (isOfChars(this.wordBuffer[length - 1], Gershayim)) {
            length--;
            this.wordBuffer[length] = '\0';
            this.tokenLengthInSource = Math.max(this.tokenLengthInSource - 1, 0);
        }
        // Drop a trailing geresh unless the token is Hebrew and the letter before it accepts one.
        if (length > 2
                && this.wordBuffer[length - 1] == '\''
                && ((tokenType & TokenType.Hebrew) == 0
                        || !isOfChars(this.wordBuffer[length - 2], LettersAcceptingGeresh))) {
            length--;
            this.wordBuffer[length] = '\0';
            this.tokenLengthInSource = Math.max(this.tokenLengthInSource - 1, 0);
        }

        reference.ref = new String(this.wordBuffer, 0, length);
        return tokenType;
    }

    /** Reports "no token": empty text, zero length, offset at the end of what was read. */
    private int endOfInput(Reference<String> reference) {
        reference.ref = "";
        this.tokenLengthInSource = 0;
        this.tokenOffset = this.inputOffset;
        return 0;
    }

    /** Tells whether {@code c} is the configured exact-match suffix character. */
    private boolean isSuffixForExactMatch(char c) {
        return this.suffixForExactMatch != null && c == this.suffixForExactMatch.charValue();
    }

    /**
     * Switches to a new input and clears all read-ahead and offset state. The special cases
     * and the exact-match suffix are left untouched.
     *
     * @param reader the new character source
     */
    public final void reset(Reader reader) {
        this.input = reader;
        this.inputOffset = 0;
        this.dataLen = 0;
        this.ioBufferIndex = 0;
        this.tokenOffset = 0;
        this.tokenLengthInSource = 0;
    }
}
