// blob: cc26b057b48754796b5e309132fd779db5db0020 [file] [log] [blame]
/*
**********************************************************************
* Copyright (C) 2005-2012, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#ifndef __CSRSBCS_H
#define __CSRSBCS_H
#include "unicode/uobject.h"
#if !UCONFIG_NO_CONVERSION
#include "csrecog.h"
U_NAMESPACE_BEGIN
/*
 * Helper that walks the bytes of an InputText and keeps n-gram statistics
 * (see the ngramCount / hitCount members). It is constructed from an n-gram
 * table and a byte map supplied by the caller; both are held as raw pointers.
 * NOTE(review): presumably the tables must outlive the parser -- confirm
 * against the implementation file.
 */
class NGramParser : public UMemory {
public:
    /* Builds a parser over the given n-gram list and character map. */
    NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);

    /*
     * Runs the parser over the supplied input text.
     * NOTE(review): the meaning of the returned value is defined by the
     * implementation, which is not visible in this header.
     */
    int32_t parse(InputText *det);

private:
    /*
     * Binary search for value in table, which must have exactly 64 entries.
     */
    int32_t search(const int32_t *table, int32_t value);

    /* Internal step helpers; their definitions live outside this header. */
    void lookup(int32_t thisNgram);
    void addByte(int32_t b);
    int32_t nextByte(InputText *det);

private:
    /* Parser state. Declaration order is kept as in the original layout. */
    int32_t byteIndex;
    int32_t ngram;
    const int32_t *ngramList;
    const uint8_t *charMap;
    int32_t ngramCount;
    int32_t hitCount;
};
/*
 * Abstract base for the recognizers declared in this header. It redeclares
 * getName() and match() as pure virtual and adds match_sbcs(), which takes
 * an n-gram table and a character map in addition to the input text.
 */
class CharsetRecog_sbcs : public CharsetRecognizer {
public:
    CharsetRecog_sbcs();
    virtual ~CharsetRecog_sbcs();

    /* Pure virtual: every instantiable subclass must supply a name. */
    virtual const char *getName() const = 0;

    /* Pure virtual: every instantiable subclass must supply a matcher. */
    virtual UBool match(InputText *det, CharsetMatch *results) const = 0;

    /*
     * Matching routine parameterised by an n-gram list and a byte map.
     * NOTE(review): presumably returns a confidence value -- confirm in the
     * implementation file.
     */
    virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
};
/*
 * Recognizer derived from CharsetRecog_sbcs that overrides both pure
 * virtuals redeclared there (getName and match). Definitions are not in
 * this header.
 */
class CharsetRecog_8859_1 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_8859_1();

    /* Overrides CharsetRecog_sbcs::getName(). */
    virtual const char *getName() const;

    /* Overrides CharsetRecog_sbcs::match(). */
    virtual UBool match(InputText *det, CharsetMatch *results) const;
};
/*
 * Recognizer derived from CharsetRecog_sbcs that overrides both pure
 * virtuals redeclared there (getName and match). Definitions are not in
 * this header.
 */
class CharsetRecog_8859_2 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_8859_2();

    /* Overrides CharsetRecog_sbcs::getName(). */
    virtual const char *getName() const;

    /* Overrides CharsetRecog_sbcs::match(). */
    virtual UBool match(InputText *det, CharsetMatch *results) const;
};
/*
 * Intermediate recognizer: supplies getName() only. match() is not declared
 * here, so it stays pure virtual at this level; CharsetRecog_8859_5_ru in
 * this header declares it.
 */
class CharsetRecog_8859_5 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_8859_5();

    /* Overrides CharsetRecog_sbcs::getName(). */
    virtual const char *getName() const;
};
/*
 * Intermediate recognizer: supplies getName() only. match() is not declared
 * here, so it stays pure virtual at this level; CharsetRecog_8859_6_ar in
 * this header declares it.
 */
class CharsetRecog_8859_6 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_8859_6();

    /* Overrides CharsetRecog_sbcs::getName(). */
    virtual const char *getName() const;
};
/*
 * Intermediate recognizer: supplies getName() only. match() is not declared
 * here, so it stays pure virtual at this level; CharsetRecog_8859_7_el in
 * this header declares it.
 */
class CharsetRecog_8859_7 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_8859_7();

    /* Overrides CharsetRecog_sbcs::getName(). */
    virtual const char *getName() const;
};
/*
 * Intermediate recognizer: supplies getName() only. match() is not declared
 * here, so it stays pure virtual at this level; CharsetRecog_8859_8_I_he and
 * CharsetRecog_8859_8_he in this header declare it.
 */
class CharsetRecog_8859_8 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_8859_8();

    /* Overrides CharsetRecog_sbcs::getName(). */
    virtual const char *getName() const;
};
/*
 * Intermediate recognizer: supplies getName() only. match() is not declared
 * here, so it stays pure virtual at this level; CharsetRecog_8859_9_tr in
 * this header declares it.
 */
class CharsetRecog_8859_9 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_8859_9();

    /* Overrides CharsetRecog_sbcs::getName(). */
    virtual const char *getName() const;
};
/*
 * Language-specific subclass of CharsetRecog_8859_5: adds getLanguage() and
 * the match() override; getName() is inherited from the parent.
 * NOTE(review): the "_ru" suffix presumably denotes Russian -- confirm in the
 * implementation of getLanguage().
 */
class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 {
public:
    virtual ~CharsetRecog_8859_5_ru();

    /* Language accessor; the base declaration is outside this header. */
    const char *getLanguage() const;

    /* Overrides CharsetRecog_sbcs::match(). */
    virtual UBool match(InputText *det, CharsetMatch *results) const;
};
/*
 * Language-specific subclass of CharsetRecog_8859_6: adds getLanguage() and
 * the match() override; getName() is inherited from the parent.
 * NOTE(review): the "_ar" suffix presumably denotes Arabic -- confirm in the
 * implementation of getLanguage().
 */
class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 {
public:
    virtual ~CharsetRecog_8859_6_ar();

    /* Language accessor; the base declaration is outside this header. */
    const char *getLanguage() const;

    /* Overrides CharsetRecog_sbcs::match(). */
    virtual UBool match(InputText *det, CharsetMatch *results) const;
};
/*
 * Language-specific subclass of CharsetRecog_8859_7: adds getLanguage() and
 * the match() override; getName() is inherited from the parent.
 * NOTE(review): the "_el" suffix presumably denotes Greek -- confirm in the
 * implementation of getLanguage().
 */
class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 {
public:
    virtual ~CharsetRecog_8859_7_el();

    /* Language accessor; the base declaration is outside this header. */
    const char *getLanguage() const;

    /* Overrides CharsetRecog_sbcs::match(). */
    virtual UBool match(InputText *det, CharsetMatch *results) const;
};
/*
 * Subclass of CharsetRecog_8859_8 that replaces the parent's getName() and
 * adds getLanguage() plus the match() override.
 * NOTE(review): the "_I_he" suffix presumably denotes the "8859-8-I" Hebrew
 * variant -- confirm against the strings returned by the implementation.
 */
class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 {
public:
    virtual ~CharsetRecog_8859_8_I_he();

    /* Overrides CharsetRecog_8859_8::getName(). */
    virtual const char *getName() const;

    /* Language accessor; the base declaration is outside this header. */
    const char *getLanguage() const;

    /* Overrides CharsetRecog_sbcs::match(). */
    virtual UBool match(InputText *det, CharsetMatch *results) const;
};
/*
 * Language-specific subclass of CharsetRecog_8859_8: adds getLanguage() and
 * the match() override; getName() is inherited from the parent.
 * NOTE(review): the "_he" suffix presumably denotes Hebrew -- confirm in the
 * implementation of getLanguage().
 */
class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 {
public:
    virtual ~CharsetRecog_8859_8_he();

    /* Language accessor; the base declaration is outside this header. */
    const char *getLanguage() const;

    /* Overrides CharsetRecog_sbcs::match(). */
    virtual UBool match(InputText *det, CharsetMatch *results) const;
};
/*
 * Language-specific subclass of CharsetRecog_8859_9: adds getLanguage() and
 * the match() override; getName() is inherited from the parent.
 * NOTE(review): the "_tr" suffix presumably denotes Turkish -- confirm in the
 * implementation of getLanguage().
 */
class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 {
public:
    virtual ~CharsetRecog_8859_9_tr();

    /* Language accessor; the base declaration is outside this header. */
    const char *getLanguage() const;

    /* Overrides CharsetRecog_sbcs::match(). */
    virtual UBool match(InputText *det, CharsetMatch *results) const;
};
/*
 * Recognizer derived directly from CharsetRecog_sbcs that declares
 * getName(), getLanguage() and match(). Definitions are not in this header.
 */
class CharsetRecog_windows_1256 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_windows_1256();

    /* Overrides CharsetRecog_sbcs::getName(). */
    virtual const char *getName() const;

    /* Language accessor; the base declaration is outside this header. */
    const char *getLanguage() const;

    /* Overrides CharsetRecog_sbcs::match(). */
    virtual UBool match(InputText *det, CharsetMatch *results) const;
};
/*
 * Recognizer derived directly from CharsetRecog_sbcs that declares
 * getName(), getLanguage() and match(). Definitions are not in this header.
 */
class CharsetRecog_windows_1251 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_windows_1251();

    /* Overrides CharsetRecog_sbcs::getName(). */
    virtual const char *getName() const;

    /* Language accessor; the base declaration is outside this header. */
    const char *getLanguage() const;

    /* Overrides CharsetRecog_sbcs::match(). */
    virtual UBool match(InputText *det, CharsetMatch *results) const;
};
/*
 * Recognizer derived directly from CharsetRecog_sbcs that declares
 * getName(), getLanguage() and match(). Definitions are not in this header.
 */
class CharsetRecog_KOI8_R : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_KOI8_R();

    /* Overrides CharsetRecog_sbcs::getName(). */
    virtual const char *getName() const;

    /* Language accessor; the base declaration is outside this header. */
    const char *getLanguage() const;

    /* Overrides CharsetRecog_sbcs::match(). */
    virtual UBool match(InputText *det, CharsetMatch *results) const;
};
/*
 * Shared parent of CharsetRecog_IBM424_he_rtl and CharsetRecog_IBM424_he_ltr.
 * It declares only getLanguage(); getName() and match() are left to those
 * subclasses, so they remain pure virtual at this level.
 */
class CharsetRecog_IBM424_he : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_IBM424_he();

    /* Language accessor; the base declaration is outside this header. */
    const char *getLanguage() const;
};
/*
 * Subclass of CharsetRecog_IBM424_he that supplies getName() and match().
 * NOTE(review): the "_rtl" suffix presumably refers to right-to-left text
 * order -- confirm in the implementation file.
 */
class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he
{
public:
    virtual ~CharsetRecog_IBM424_he_rtl();

    /* Overrides CharsetRecog_sbcs::getName(). */
    virtual const char *getName() const;

    /* Overrides CharsetRecog_sbcs::match(). */
    virtual UBool match(InputText *det, CharsetMatch *results) const;
};
/*
 * Subclass of CharsetRecog_IBM424_he that supplies getName() and match().
 *
 * The members are explicitly public, matching CharsetRecog_IBM424_he_rtl.
 * Without an access specifier a `class` defaults to private, which left the
 * destructor, getName() and match() unusable through this type directly.
 *
 * NOTE(review): the "_ltr" suffix presumably refers to left-to-right text
 * order -- confirm in the implementation file.
 */
class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he {
public:
    virtual ~CharsetRecog_IBM424_he_ltr();

    /* Overrides CharsetRecog_sbcs::getName(). */
    const char *getName() const;

    /* Overrides CharsetRecog_sbcs::match(). */
    virtual UBool match(InputText *det, CharsetMatch *results) const;
};
/*
 * Shared parent of CharsetRecog_IBM420_ar_rtl and CharsetRecog_IBM420_ar_ltr.
 * It declares getLanguage(), two protected non-const hooks (matchInit and
 * matchFinish) and private byte-buffer helpers. getName() and match() are
 * left to the subclasses.
 */
class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_IBM420_ar();

    /* Language accessor; the base declaration is outside this header. */
    const char *getLanguage() const;

protected:
    /*
     * NOTE(review): presumably called before/after matching to prepare and
     * restore the input text -- confirm in the implementation file. Both are
     * non-const, unlike match().
     */
    void matchInit(InputText *textIn);
    void matchFinish(InputText *textIn);

private:
    /*
     * NOTE(review): these look like saved-input bookkeeping for
     * matchInit/matchFinish; ownership of prev_fInputBytes is not visible
     * from this header.
     */
    uint8_t *prev_fInputBytes;
    int32_t prev_fInputBytesLength;
    UBool deleteBuffer;

    /* Byte classification helper; definition is outside this header. */
    UBool isLamAlef(uint8_t b);

    /*
     * Buffer transforms that take an input byte range and report a length
     * through the `length` reference parameter.
     * NOTE(review): ownership of the returned pointer is not visible here.
     */
    uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
    uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
};
/*
 * Subclass of CharsetRecog_IBM420_ar that supplies getName() and match().
 * NOTE(review): the "_rtl" suffix presumably refers to right-to-left text
 * order -- confirm in the implementation file.
 */
class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar
{
public:
    virtual ~CharsetRecog_IBM420_ar_rtl();

    /* Overrides CharsetRecog_sbcs::getName(). */
    virtual const char *getName() const;

    /* Overrides CharsetRecog_sbcs::match(). */
    virtual UBool match(InputText *det, CharsetMatch *results) const;
};
/*
 * Subclass of CharsetRecog_IBM420_ar that supplies getName() and match().
 *
 * The members are explicitly public, matching CharsetRecog_IBM420_ar_rtl.
 * Without an access specifier a `class` defaults to private, which left the
 * destructor, getName() and match() unusable through this type directly.
 *
 * NOTE(review): the "_ltr" suffix presumably refers to left-to-right text
 * order -- confirm in the implementation file.
 */
class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {
public:
    virtual ~CharsetRecog_IBM420_ar_ltr();

    /* Overrides CharsetRecog_sbcs::getName(). */
    const char *getName() const;

    /* Overrides CharsetRecog_sbcs::match(). */
    virtual UBool match(InputText *det, CharsetMatch *results) const;
};
U_NAMESPACE_END
#endif /* !UCONFIG_NO_CONVERSION */
#endif /* __CSRSBCS_H */