/* src/third_party/sqlite/src/ext/fts1/simple_tokenizer.c - cobalt - Git at Google */

 /*
 ** The author disclaims copyright to this source code.
 **
 *************************************************************************
 ** Implementation of the "simple" full-text-search tokenizer.
 */

 #include <assert.h>
 #if !defined(__APPLE__)
 #include <malloc.h>
 #else
 #include <stdlib.h>
 #endif
 #include <stdio.h>
 #include <string.h>
 #include <ctype.h>

 #include "tokenizer.h"

 /* Duplicate a string; the caller must free() the returned string.
 ** Returns NULL if s is NULL or if the copy cannot be allocated.
 ** (We don't use strdup() since it's not part of the standard C library
 ** and may not be available everywhere.)
 */
 /* TODO(shess) Copied from fulltext.c, consider util.c for such
 ** things. */
 static char *string_dup(const char *s){
   size_t nCopy;                /* bytes to copy, including the terminator */
   char *str;

   if( s==NULL ) return NULL;
   nCopy = strlen(s) + 1;
   str = malloc(nCopy);
   if( str!=NULL ){
     /* Length is already known, so memcpy() avoids a second scan. */
     memcpy(str, s, nCopy);
   }
   return str;
 }

 /* Tokenizer object for the "simple" tokenizer: the generic tokenizer
 ** object followed by the set of delimiter bytes used to split input.
 */
 typedef struct simple_tokenizer {
   /* Generic part.  Kept as the first member: this file converts between
   ** sqlite3_tokenizer* and simple_tokenizer* with plain casts. */
   sqlite3_tokenizer base;
   /* Nul-terminated list of delimiter bytes.  Heap copy made in
   ** simpleCreate() and released in simpleDestroy(). */
   const char *zDelim;          /* token delimiters */
 } simple_tokenizer;

 /* Iteration state for one tokenization pass over a single input buffer.
 ** Allocated by simpleOpen() and released by simpleClose().
 */
 typedef struct simple_tokenizer_cursor {
   /* Generic part.  Kept as the first member: this file converts between
   ** sqlite3_tokenizer_cursor* and simple_tokenizer_cursor* with casts. */
   sqlite3_tokenizer_cursor base;
   /* The pointer passed to simpleOpen() is stored, not copied. */
   const char *pInput;          /* input we are tokenizing */
   int nBytes;                  /* size of the input */
   const char *pCurrent;        /* current position in pInput */
   int iToken;                  /* index of next token to be returned */
   /* Heap buffer holding the lower-cased copy of the token; NULL until
   ** the first token is produced. */
   char *zToken;                /* storage for current token */
   int nTokenBytes;             /* actual size of current token */
   int nTokenAllocated;         /* space allocated to zToken buffer */
 } simple_tokenizer_cursor;

 /* Declared here without an initializer; the definition that fills in the
 ** method pointers appears after the tokenizer functions. */
 static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */

 /* Create a "simple" tokenizer.
 **
 ** If argc>1, argv[1] supplies the delimiter bytes.  Otherwise every ASCII
 ** byte in 0x01..0x7f that is not alphanumeric is a delimiter.
 **
 ** On success *ppTokenizer receives the new tokenizer (release it with
 ** simpleDestroy()) and SQLITE_OK is returned.  If an allocation fails,
 ** SQLITE_NOMEM is returned and *ppTokenizer is left untouched.
 */
 static int simpleCreate(
   int argc, const char **argv,
   sqlite3_tokenizer **ppTokenizer
 ){
   simple_tokenizer *t;

   /* calloc() so that the generic base object starts out zeroed rather
   ** than holding indeterminate bytes. */
   t = (simple_tokenizer *) calloc(1, sizeof(*t));
   if( t==NULL ) return SQLITE_NOMEM;

   /* TODO(shess) Delimiters need to remain the same from run to run,
   ** else we need to reindex.  One solution would be a meta-table to
   ** track such information in the database, then we'd only want this
   ** information on the initial create.
   */
   if( argc>1 ){
     t->zDelim = string_dup(argv[1]);
   } else {
     /* Build a string excluding alphanumeric ASCII characters */
     char zDelim[0x80];               /* nul-terminated, so nul not a member */
     int i, j;
     for(i=1, j=0; i<0x80; i++){
       if( !isalnum(i) ){
         zDelim[j++] = (char) i;
       }
     }
     zDelim[j++] = '\0';
     assert( j<=(int) sizeof(zDelim) );
     t->zDelim = string_dup(zDelim);
   }

   /* Do not hand back a tokenizer without a delimiter set: simpleNext()
   ** dereferences zDelim unconditionally. */
   if( t->zDelim==NULL ){
     free(t);
     return SQLITE_NOMEM;
   }

   *ppTokenizer = &t->base;
   return SQLITE_OK;
 }

 /* Release a tokenizer: frees its delimiter string and then the tokenizer
 ** object itself.  Always returns SQLITE_OK.
 */
 static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
   simple_tokenizer *pSimple = (simple_tokenizer *) pTokenizer;
   void *pDelim = (void *) pSimple->zDelim;   /* drop const so it can be freed */

   free(pDelim);
   free(pSimple);
   return SQLITE_OK;
 }

 /* Open a cursor for tokenizing pInput.
 **
 ** pInput is stored, not copied, so it must stay valid for as long as the
 ** cursor is in use.  nBytes is the input length in bytes; a negative
 ** value means "measure it with strlen()".  A NULL pInput is treated as
 ** empty input.  pTokenizer is not used by this function.
 **
 ** On success *ppCursor receives the new cursor (release it with
 ** simpleClose()) and SQLITE_OK is returned.  If the cursor cannot be
 ** allocated, SQLITE_NOMEM is returned and *ppCursor is left untouched.
 */
 static int simpleOpen(
   sqlite3_tokenizer *pTokenizer,
   const char *pInput, int nBytes,
   sqlite3_tokenizer_cursor **ppCursor
 ){
   simple_tokenizer_cursor *c;

   c = (simple_tokenizer_cursor *) malloc(sizeof(*c));
   if( c==NULL ) return SQLITE_NOMEM;

   c->pInput = pInput;
   if( pInput==NULL ){
     c->nBytes = 0;                /* never call strlen() on NULL */
   }else if( nBytes<0 ){
     c->nBytes = (int) strlen(pInput);
   }else{
     c->nBytes = nBytes;
   }
   c->pCurrent = c->pInput;        /* start tokenizing at the beginning */
   c->iToken = 0;
   c->zToken = NULL;               /* no space allocated, yet. */
   c->nTokenBytes = 0;
   c->nTokenAllocated = 0;

   *ppCursor = &c->base;
   return SQLITE_OK;
 }

 /* Close a cursor: frees the token buffer, if one was ever allocated, and
 ** then the cursor itself.  Always returns SQLITE_OK.
 */
 static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
   simple_tokenizer_cursor *pCsr = (simple_tokenizer_cursor *) pCursor;

   /* free(NULL) is a no-op, so an unused token buffer needs no check. */
   free(pCsr->zToken);
   free(pCsr);
   return SQLITE_OK;
 }

 /* Extract the next token from the cursor's input.
 **
 ** A token is a maximal run of bytes none of which is in the tokenizer's
 ** delimiter set; a NUL byte in the input always ends a token.  Bytes
 ** below 0x80 are passed through tolower(); other bytes are copied
 ** unchanged.  Scanning never reads beyond the nBytes recorded for the
 ** input.
 **
 ** On SQLITE_OK:
 **   *ppToken        nul-terminated token text, stored in the cursor's
 **                   buffer and overwritten by the next call
 **   *pnBytes        token length in bytes
 **   *piStartOffset  byte offset of the token's first byte in the input
 **   *piEndOffset    byte offset just past the token's last byte
 **   *piPosition     zero-based index of this token
 **
 ** Returns SQLITE_DONE when no tokens remain, or SQLITE_NOMEM if the token
 ** buffer cannot be grown (the cursor position is then left unchanged).
 */
 static int simpleNext(
   sqlite3_tokenizer_cursor *pCursor,
   const char **ppToken, int *pnBytes,
   int *piStartOffset, int *piEndOffset, int *piPosition
 ){
   simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
   simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
   const char *pEnd = c->pInput + c->nBytes;   /* one past the last byte */
   const char *pStart = c->pCurrent;           /* first byte of the token */
   const char *pStop;                          /* one past the token */
   int n, ii;

   /* Skip delimiters.  strchr() also matches the terminator of zDelim, so
   ** a NUL byte in the input counts as a delimiter, as it effectively did
   ** with strcspn(). */
   while( pStart<pEnd && strchr(t->zDelim, *pStart)!=NULL ){
     pStart++;
   }
   if( pStart>=pEnd ){
     c->pCurrent = pEnd;
     return SQLITE_DONE;
   }

   /* Find the end of the token without running past the input. */
   pStop = pStart;
   while( pStop<pEnd && strchr(t->zDelim, *pStop)==NULL ){
     pStop++;
   }
   n = (int) (pStop-pStart);

   /* Grow the token buffer only when needed; keep the old buffer intact
   ** if realloc() fails so that simpleClose() can still free it. */
   if( n+1>c->nTokenAllocated ){
     char *zNew = realloc(c->zToken, (size_t) n+1);
     if( zNew==NULL ) return SQLITE_NOMEM;
     c->zToken = zNew;
     c->nTokenAllocated = n+1;
   }

   for(ii=0; ii<n; ii++){
     /* TODO(shess) This needs expansion to handle UTF-8
     ** case-insensitivity.
     */
     char ch = pStart[ii];
     c->zToken[ii] = (unsigned char)ch<0x80 ? (char) tolower(ch) : ch;
   }
   c->zToken[n] = '\0';
   c->nTokenBytes = n;

   *ppToken = c->zToken;
   *pnBytes = n;
   *piStartOffset = (int) (pStart-c->pInput);
   *piEndOffset = *piStartOffset+n;
   *piPosition = c->iToken++;

   /* Resume at the byte after the token; any delimiter there is skipped
   ** at the start of the next call. */
   c->pCurrent = pStop;
   return SQLITE_OK;
 }

 /* Method table for the "simple" tokenizer, handed out by
 ** get_simple_tokenizer_module().  The initializer is positional, so the
 ** order below must match the field order of sqlite3_tokenizer_module as
 ** declared in tokenizer.h.
 */
 static sqlite3_tokenizer_module simpleTokenizerModule = {
   0,               /* presumably an interface version -- confirm in tokenizer.h */
   simpleCreate,    /* build a tokenizer */
   simpleDestroy,   /* free a tokenizer */
   simpleOpen,      /* start tokenizing an input buffer */
   simpleClose,     /* free a cursor */
   simpleNext,      /* fetch the next token */
 };

 /* Store the address of this file's tokenizer method table in *ppModule.
 ** The table has static storage, so the caller must not free it.
 */
 void get_simple_tokenizer_module(sqlite3_tokenizer_module **ppModule){
   sqlite3_tokenizer_module *pSimpleModule = &simpleTokenizerModule;

   *ppModule = pSimpleModule;
 }
	/*
	** The author disclaims copyright to this source code.
	**
	*************************************************************************
	** Implementation of the "simple" full-text-search tokenizer.
	*/

	#include <assert.h>
	#if !defined(__APPLE__)
	#include <malloc.h>
	#else
	#include <stdlib.h>
	#endif
	#include <stdio.h>
	#include <string.h>
	#include <ctype.h>

	#include "tokenizer.h"

	/* Duplicate a string; the caller must free() the returned string.
	** Returns NULL if s is NULL or if the copy cannot be allocated.
	** (We don't use strdup() since it's not part of the standard C library
	** and may not be available everywhere.)
	*/
	/* TODO(shess) Copied from fulltext.c, consider util.c for such
	** things. */
	static char *string_dup(const char *s){
	  size_t nCopy;                /* bytes to copy, including the terminator */
	  char *str;

	  if( s==NULL ) return NULL;
	  nCopy = strlen(s) + 1;
	  str = malloc(nCopy);
	  if( str!=NULL ){
	    /* Length is already known, so memcpy() avoids a second scan. */
	    memcpy(str, s, nCopy);
	  }
	  return str;
	}

	/* Tokenizer object for the "simple" tokenizer: the generic tokenizer
	** object followed by the set of delimiter bytes used to split input.
	*/
	typedef struct simple_tokenizer {
	  /* Generic part.  Kept as the first member: this file converts between
	  ** sqlite3_tokenizer* and simple_tokenizer* with plain casts. */
	  sqlite3_tokenizer base;
	  /* Nul-terminated list of delimiter bytes.  Heap copy made in
	  ** simpleCreate() and released in simpleDestroy(). */
	  const char *zDelim;          /* token delimiters */
	} simple_tokenizer;

	/* Iteration state for one tokenization pass over a single input buffer.
	** Allocated by simpleOpen() and released by simpleClose().
	*/
	typedef struct simple_tokenizer_cursor {
	  /* Generic part.  Kept as the first member: this file converts between
	  ** sqlite3_tokenizer_cursor* and simple_tokenizer_cursor* with casts. */
	  sqlite3_tokenizer_cursor base;
	  /* The pointer passed to simpleOpen() is stored, not copied. */
	  const char *pInput;          /* input we are tokenizing */
	  int nBytes;                  /* size of the input */
	  const char *pCurrent;        /* current position in pInput */
	  int iToken;                  /* index of next token to be returned */
	  /* Heap buffer holding the lower-cased copy of the token; NULL until
	  ** the first token is produced. */
	  char *zToken;                /* storage for current token */
	  int nTokenBytes;             /* actual size of current token */
	  int nTokenAllocated;         /* space allocated to zToken buffer */
	} simple_tokenizer_cursor;

	/* Declared here without an initializer; the definition that fills in the
	** method pointers appears after the tokenizer functions. */
	static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */

	/* Create a "simple" tokenizer.
	**
	** If argc>1, argv[1] supplies the delimiter bytes.  Otherwise every ASCII
	** byte in 0x01..0x7f that is not alphanumeric is a delimiter.
	**
	** On success *ppTokenizer receives the new tokenizer (release it with
	** simpleDestroy()) and SQLITE_OK is returned.  If an allocation fails,
	** SQLITE_NOMEM is returned and *ppTokenizer is left untouched.
	*/
	static int simpleCreate(
	  int argc, const char **argv,
	  sqlite3_tokenizer **ppTokenizer
	){
	  simple_tokenizer *t;

	  /* calloc() so that the generic base object starts out zeroed rather
	  ** than holding indeterminate bytes. */
	  t = (simple_tokenizer *) calloc(1, sizeof(*t));
	  if( t==NULL ) return SQLITE_NOMEM;

	  /* TODO(shess) Delimiters need to remain the same from run to run,
	  ** else we need to reindex. One solution would be a meta-table to
	  ** track such information in the database, then we'd only want this
	  ** information on the initial create.
	  */
	  if( argc>1 ){
	    t->zDelim = string_dup(argv[1]);
	  } else {
	    /* Build a string excluding alphanumeric ASCII characters */
	    char zDelim[0x80]; /* nul-terminated, so nul not a member */
	    int i, j;
	    for(i=1, j=0; i<0x80; i++){
	      if( !isalnum(i) ){
	        zDelim[j++] = (char) i;
	      }
	    }
	    zDelim[j++] = '\0';
	    assert( j<=(int) sizeof(zDelim) );
	    t->zDelim = string_dup(zDelim);
	  }

	  /* Do not hand back a tokenizer without a delimiter set: simpleNext()
	  ** dereferences zDelim unconditionally. */
	  if( t->zDelim==NULL ){
	    free(t);
	    return SQLITE_NOMEM;
	  }

	  *ppTokenizer = &t->base;
	  return SQLITE_OK;
	}

	/* Release a tokenizer: frees its delimiter string and then the tokenizer
	** object itself.  Always returns SQLITE_OK.
	*/
	static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
	  /* The object is handled through a pointer; it is never copied by value. */
	  simple_tokenizer *t = (simple_tokenizer *) pTokenizer;

	  free((void *) t->zDelim);     /* cast drops const so it can be freed */
	  free(t);

	  return SQLITE_OK;
	}

	/* Open a cursor for tokenizing pInput.
	**
	** pInput is stored, not copied, so it must stay valid for as long as the
	** cursor is in use.  nBytes is the input length in bytes; a negative
	** value means "measure it with strlen()".  A NULL pInput is treated as
	** empty input.  pTokenizer is not used by this function.
	**
	** On success *ppCursor receives the new cursor (release it with
	** simpleClose()) and SQLITE_OK is returned.  If the cursor cannot be
	** allocated, SQLITE_NOMEM is returned and *ppCursor is left untouched.
	*/
	static int simpleOpen(
	  sqlite3_tokenizer *pTokenizer,
	  const char *pInput, int nBytes,
	  sqlite3_tokenizer_cursor **ppCursor
	){
	  simple_tokenizer_cursor *c;

	  c = (simple_tokenizer_cursor *) malloc(sizeof(*c));
	  if( c==NULL ) return SQLITE_NOMEM;

	  c->pInput = pInput;
	  if( pInput==NULL ){
	    c->nBytes = 0; /* never call strlen() on NULL */
	  }else if( nBytes<0 ){
	    c->nBytes = (int) strlen(pInput);
	  }else{
	    c->nBytes = nBytes;
	  }
	  c->pCurrent = c->pInput; /* start tokenizing at the beginning */
	  c->iToken = 0;
	  c->zToken = NULL; /* no space allocated, yet. */
	  c->nTokenBytes = 0;
	  c->nTokenAllocated = 0;

	  *ppCursor = &c->base;
	  return SQLITE_OK;
	}

	/* Close a cursor: frees the token buffer, if one was ever allocated, and
	** then the cursor itself.  Always returns SQLITE_OK.
	*/
	static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
	  /* The cursor is handled through a pointer; it is never copied by value. */
	  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;

	  /* free(NULL) is a no-op, so an unused token buffer needs no check. */
	  free(c->zToken);
	  free(c);

	  return SQLITE_OK;
	}

	/* Extract the next token from the cursor's input.
	**
	** A token is a maximal run of bytes none of which is in the tokenizer's
	** delimiter set; a NUL byte in the input always ends a token.  Bytes
	** below 0x80 are passed through tolower(); other bytes are copied
	** unchanged.  Scanning never reads beyond the nBytes recorded for the
	** input.
	**
	** On SQLITE_OK:
	**   *ppToken        nul-terminated token text, stored in the cursor's
	**                   buffer and overwritten by the next call
	**   *pnBytes        token length in bytes
	**   *piStartOffset  byte offset of the token's first byte in the input
	**   *piEndOffset    byte offset just past the token's last byte
	**   *piPosition     zero-based index of this token
	**
	** Returns SQLITE_DONE when no tokens remain, or SQLITE_NOMEM if the token
	** buffer cannot be grown (the cursor position is then left unchanged).
	*/
	static int simpleNext(
	  sqlite3_tokenizer_cursor *pCursor,
	  const char **ppToken, int *pnBytes,
	  int *piStartOffset, int *piEndOffset, int *piPosition
	){
	  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
	  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
	  const char *pEnd = c->pInput + c->nBytes;   /* one past the last byte */
	  const char *pStart = c->pCurrent;           /* first byte of the token */
	  const char *pStop;                          /* one past the token */
	  int n, ii;

	  /* Skip delimiters.  strchr() also matches the terminator of zDelim, so
	  ** a NUL byte in the input counts as a delimiter, as it effectively did
	  ** with strcspn(). */
	  while( pStart<pEnd && strchr(t->zDelim, *pStart)!=NULL ){
	    pStart++;
	  }
	  if( pStart>=pEnd ){
	    c->pCurrent = pEnd;
	    return SQLITE_DONE;
	  }

	  /* Find the end of the token without running past the input. */
	  pStop = pStart;
	  while( pStop<pEnd && strchr(t->zDelim, *pStop)==NULL ){
	    pStop++;
	  }
	  n = (int) (pStop-pStart);

	  /* Grow the token buffer only when needed; keep the old buffer intact
	  ** if realloc() fails so that simpleClose() can still free it. */
	  if( n+1>c->nTokenAllocated ){
	    char *zNew = realloc(c->zToken, (size_t) n+1);
	    if( zNew==NULL ) return SQLITE_NOMEM;
	    c->zToken = zNew;
	    c->nTokenAllocated = n+1;
	  }

	  for(ii=0; ii<n; ii++){
	    /* TODO(shess) This needs expansion to handle UTF-8
	    ** case-insensitivity.
	    */
	    char ch = pStart[ii];
	    c->zToken[ii] = (unsigned char)ch<0x80 ? (char) tolower(ch) : ch;
	  }
	  c->zToken[n] = '\0';
	  c->nTokenBytes = n;

	  *ppToken = c->zToken;
	  *pnBytes = n;
	  *piStartOffset = (int) (pStart-c->pInput);
	  *piEndOffset = *piStartOffset+n;
	  *piPosition = c->iToken++;

	  /* Resume at the byte after the token; any delimiter there is skipped
	  ** at the start of the next call. */
	  c->pCurrent = pStop;
	  return SQLITE_OK;
	}

	/* Method table for the "simple" tokenizer, handed out by
	** get_simple_tokenizer_module().  The initializer is positional, so the
	** order below must match the field order of sqlite3_tokenizer_module as
	** declared in tokenizer.h.
	*/
	static sqlite3_tokenizer_module simpleTokenizerModule = {
	/* presumably an interface version -- confirm in tokenizer.h */
	0,
	/* build a tokenizer */
	simpleCreate,
	/* free a tokenizer */
	simpleDestroy,
	/* start tokenizing an input buffer */
	simpleOpen,
	/* free a cursor */
	simpleClose,
	/* fetch the next token */
	simpleNext,
	};

	/* Store the address of this file's tokenizer method table in *ppModule.
	** The table has static storage, so the caller must not free it.
	*/
	void get_simple_tokenizer_module(sqlite3_tokenizer_module **ppModule){
	  sqlite3_tokenizer_module *pSimpleModule = &simpleTokenizerModule;

	  *ppModule = pSimpleModule;
	}