| /* |
| ********************************************************************** |
| * Copyright (C) 2005-2009, International Business Machines |
| * Corporation and others. All Rights Reserved. |
| ********************************************************************** |
| */ |
| |
| #ifndef __CSRSBCS_H |
| #define __CSRSBCS_H |
| |
| #include "unicode/uobject.h" |
| |
| #if !UCONFIG_NO_CONVERSION |
| |
| #include "csrecog.h" |
| |
| U_NAMESPACE_BEGIN |
| |
| class NGramParser : public UMemory |
| { |
| private: |
| int32_t byteIndex; |
| int32_t ngram; |
| |
| const int32_t *ngramList; |
| const uint8_t *charMap; |
| |
| int32_t ngramCount; |
| int32_t hitCount; |
| |
| public: |
| NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap); |
| |
| private: |
| /* |
| * Binary search for value in table, which must have exactly 64 entries. |
| */ |
| int32_t search(const int32_t *table, int32_t value); |
| |
| void lookup(int32_t thisNgram); |
| void addByte(int32_t b); |
| int32_t nextByte(InputText *det); |
| |
| public: |
| int32_t parse(InputText *det); |
| |
| }; |
| |
| class CharsetRecog_sbcs : public CharsetRecognizer |
| { |
| protected: |
| UBool haveC1Bytes; |
| |
| public: |
| CharsetRecog_sbcs(); |
| |
| virtual ~CharsetRecog_sbcs(); |
| |
| virtual const char *getName() const = 0; |
| |
| virtual int32_t match(InputText *det) = 0; |
| |
| int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]); |
| }; |
| |
| class CharsetRecog_8859_1 : public CharsetRecog_sbcs |
| { |
| public: |
| virtual ~CharsetRecog_8859_1(); |
| |
| const char *getName() const; |
| }; |
| |
| class CharsetRecog_8859_2 : public CharsetRecog_sbcs |
| { |
| public: |
| virtual ~CharsetRecog_8859_2(); |
| |
| const char *getName() const; |
| }; |
| |
| class CharsetRecog_8859_5 : public CharsetRecog_sbcs |
| { |
| public: |
| virtual ~CharsetRecog_8859_5(); |
| |
| const char *getName() const; |
| }; |
| |
| class CharsetRecog_8859_6 : public CharsetRecog_sbcs |
| { |
| public: |
| virtual ~CharsetRecog_8859_6(); |
| |
| const char *getName() const; |
| }; |
| |
| class CharsetRecog_8859_7 : public CharsetRecog_sbcs |
| { |
| public: |
| virtual ~CharsetRecog_8859_7(); |
| |
| const char *getName() const; |
| }; |
| |
| class CharsetRecog_8859_8 : public CharsetRecog_sbcs |
| { |
| public: |
| virtual ~CharsetRecog_8859_8(); |
| |
| virtual const char *getName() const; |
| }; |
| |
| class CharsetRecog_8859_9 : public CharsetRecog_sbcs |
| { |
| public: |
| virtual ~CharsetRecog_8859_9(); |
| |
| const char *getName() const; |
| }; |
| |
| class CharsetRecog_8859_1_en : public CharsetRecog_8859_1 |
| { |
| public: |
| virtual ~CharsetRecog_8859_1_en(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_1_da : public CharsetRecog_8859_1 |
| { |
| public: |
| virtual ~CharsetRecog_8859_1_da(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_1_de : public CharsetRecog_8859_1 |
| { |
| public: |
| virtual ~CharsetRecog_8859_1_de(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_1_es : public CharsetRecog_8859_1 |
| { |
| public: |
| virtual ~CharsetRecog_8859_1_es(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_1_fr : public CharsetRecog_8859_1 |
| { |
| public: |
| virtual ~CharsetRecog_8859_1_fr(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_1_it : public CharsetRecog_8859_1 |
| { |
| public: |
| virtual ~CharsetRecog_8859_1_it(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_1_nl : public CharsetRecog_8859_1 |
| { |
| public: |
| virtual ~CharsetRecog_8859_1_nl(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_1_no : public CharsetRecog_8859_1 |
| { |
| public: |
| virtual ~CharsetRecog_8859_1_no(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_1_pt : public CharsetRecog_8859_1 |
| { |
| public: |
| virtual ~CharsetRecog_8859_1_pt(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_1_sv : public CharsetRecog_8859_1 |
| { |
| public: |
| virtual ~CharsetRecog_8859_1_sv(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_2_cs : public CharsetRecog_8859_2 |
| { |
| public: |
| virtual ~CharsetRecog_8859_2_cs(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_2_hu : public CharsetRecog_8859_2 |
| { |
| public: |
| virtual ~CharsetRecog_8859_2_hu(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_2_pl : public CharsetRecog_8859_2 |
| { |
| public: |
| virtual ~CharsetRecog_8859_2_pl(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_2_ro : public CharsetRecog_8859_2 |
| { |
| public: |
| virtual ~CharsetRecog_8859_2_ro(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 |
| { |
| public: |
| virtual ~CharsetRecog_8859_5_ru(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 |
| { |
| public: |
| virtual ~CharsetRecog_8859_6_ar(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 |
| { |
| public: |
| virtual ~CharsetRecog_8859_7_el(); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 |
| { |
| public: |
| virtual ~CharsetRecog_8859_8_I_he(); |
| |
| const char *getName() const; |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 |
| { |
| public: |
| virtual ~CharsetRecog_8859_8_he (); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 |
| { |
| public: |
| virtual ~CharsetRecog_8859_9_tr (); |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_windows_1256 : public CharsetRecog_sbcs |
| { |
| public: |
| virtual ~CharsetRecog_windows_1256(); |
| |
| const char *getName() const; |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_windows_1251 : public CharsetRecog_sbcs |
| { |
| public: |
| virtual ~CharsetRecog_windows_1251(); |
| |
| const char *getName() const; |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| |
| class CharsetRecog_KOI8_R : public CharsetRecog_sbcs |
| { |
| public: |
| virtual ~CharsetRecog_KOI8_R(); |
| |
| const char *getName() const; |
| |
| const char *getLanguage() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_IBM424_he : public CharsetRecog_sbcs |
| { |
| public: |
| virtual ~CharsetRecog_IBM424_he(); |
| |
| const char *getLanguage() const; |
| }; |
| |
| class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he { |
| public: |
| virtual ~CharsetRecog_IBM424_he_rtl(); |
| |
| const char *getName() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he { |
| virtual ~CharsetRecog_IBM424_he_ltr(); |
| |
| const char *getName() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs |
| { |
| public: |
| virtual ~CharsetRecog_IBM420_ar(); |
| |
| const char *getLanguage() const; |
| |
| protected: |
| void matchInit(InputText *textIn); |
| void matchFinish(InputText *textIn); |
| |
| private: |
| uint8_t *prev_fInputBytes; |
| int32_t prev_fInputBytesLength; |
| UBool deleteBuffer; |
| |
| UBool isLamAlef(uint8_t b); |
| uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length); |
| uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length); |
| }; |
| |
| class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar { |
| public: |
| virtual ~CharsetRecog_IBM420_ar_rtl(); |
| |
| const char *getName() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar { |
| virtual ~CharsetRecog_IBM420_ar_ltr(); |
| |
| const char *getName() const; |
| |
| int32_t match(InputText *textIn); |
| }; |
| |
| U_NAMESPACE_END |
| |
| #endif |
| #endif /* __CSRSBCS_H */ |