Main Page | Class Hierarchy | Class List | File List | Class Members | File Members

fti-new.c

Go to the documentation of this file.
00001 #include "server/postgres.h"
00002 #include <ctype.h>
00003 #include <wctype.h>
00004 #include <wchar.h>
00005 #include <iconv.h>
00006 #include <errno.h>
00007 #include "server/executor/spi.h"
00008 #include "server/commands/trigger.h"
00009 #include <stdlib.h>
00010 #include <unistd.h>
00011 #include <locale.h>
00012 #include <time.h>
00013 
00014 
00015 typedef struct SubStrRec {
00016   Oid      oid;
00017   wchar_t *text;
00018   struct SubStrRec *next;
00019 } SubStrRec;
00020 
/*
 * Profiling counters.  Each one accumulates clock() tick differences for
 * the phase named in its identifier.
 */
double codec_time           = 0.0;
double breakup_time         = 0.0;
double overall_time         = 0.0;
double ins_substr_time      = 0.0;
double lookup_substr_time   = 0.0;
double lookup_stopword_time = 0.0;
double sql_time             = 0.0;
00028 
00029 extern Datum fti(PG_FUNCTION_ARGS);
00030 static wchar_t *breakup(wchar_t *, long);
00031 static int stopword_cmp(const void *a, const void *b);
00032 static bool is_stopword(wchar_t *);
00033 static bool new_tuple = false;
00034 static Oid lookupSubString(wchar_t *);
00035 static void insertSubString(wchar_t *, Oid);
00036 static bool ischar(const wint_t c);
00037 static wchar_t *mb2wc(char *s, long *len);
00038 static char *wc2mb(wchar_t *s, long *len);
00039 static char *codec(char *s, int len, char *from, char *to, long *wclen);
00040 static long mywcslen(wchar_t *s);
00041 static void insert(char *column, HeapTuple t, Oid oid, char *relname);
00042 static void del(Oid oid, char *relname);
00043 wchar_t **stopwords=0L;
00044 int    stopwordcount=0;
00045 #define substrhashsize 10000
00046 SubStrRec *substrhash[substrhashsize];
00047 
00048 /***********************************************************************/
00049 PG_FUNCTION_INFO_V1(fti);
00050 
00051 Datum
00052 fti(PG_FUNCTION_ARGS)
00053 {
00054   int ret;
00055   if ((ret = SPI_connect()) < 0)
00056     elog(ERROR, "Full Text Indexing: SPI_connect failed, returned %d\n", ret);
00057 
00058 
00059 
00060 //  nanosleep(100);
00061   clock_t tmptime=clock();
00062   stopwordcount=0;
00063   stopwords=NULL;
00064   memset(substrhash, 0, substrhashsize*sizeof(SubStrRec*));
00065 
00066   setlocale(LC_ALL, "de_DE@euro");
00067   elog(NOTICE, "====================== fti() neu =========================");
00068 //  elog(NOTICE, "TESTTESTTESTTEST");
00069   TriggerData *trigdata = (TriggerData *) fcinfo->context;
00070   Trigger     *trigger;         /* to get trigger name */
00071   int          nargs;           /* # of arguments */
00072   char       **args;            /* arguments */
00073   char        *relname;         /* triggered relation name */
00074   Relation     rel;             /* triggered relation */
00075   HeapTuple    rettuple = NULL;
00076   TupleDesc    tupdesc;         /* tuple description */
00077   bool         isinsert = false;
00078   bool         isdelete = false;
00079   //  int          ret;
00080   Oid          oid;
00081   char         *column=NULL;
00082   
00083   /*
00084    * FILE                *debug;
00085    */
00086   
00087   /*
00088    * debug = fopen("/dev/xconsole", "w"); fprintf(debug, "FTI: entered
00089    * function\n"); fflush(debug);
00090    */
00091   if (!CALLED_AS_TRIGGER(fcinfo))
00092     elog(ERROR, "Full Text Indexing: not fired by trigger manager");
00093   if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event))
00094     elog(ERROR, "Full Text Indexing: can't process STATEMENT events");
00095   if (TRIGGER_FIRED_BEFORE(trigdata->tg_event))
00096     elog(ERROR, "Full Text Indexing: must be fired AFTER event");
00097   
00098   if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
00099     isinsert = true;
00100   if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event)) {
00101     isdelete = true;
00102     isinsert = true;
00103   }
00104   if (TRIGGER_FIRED_BY_DELETE(trigdata->tg_event))
00105     isdelete = true;
00106     
00107   trigger = trigdata->tg_trigger;
00108   rel = trigdata->tg_relation;
00109   relname = SPI_getrelname(rel);
00110   rettuple = trigdata->tg_trigtuple;
00111   if (isdelete && isinsert) 
00112     rettuple = trigdata->tg_newtuple;
00113   
00114   //  if ((ret = SPI_connect()) < 0)
00115   //    elog(ERROR, "Full Text Indexing: SPI_connect failed, returned %d\n", ret);
00116   
00117   nargs = trigger->tgnargs;
00118   if (nargs != 2)
00119     elog(ERROR, "Full Text Indexing: trigger can only have 2 arguments");
00120     
00121   args = trigger->tgargs;
00122   tupdesc = rel->rd_att; 
00123   
00124   oid = HeapTupleGetOid(rettuple);
00125   if (!OidIsValid(oid))
00126     elog(ERROR, "Full Text Indexing: oid of current tuple is NULL");
00127   
00128   
00129   if(isdelete && isinsert) {
00130     Oid     *argtypes;
00131     int      colnum;
00132     
00133     argtypes = (Oid *) palloc(sizeof(Oid));
00134     argtypes[0] = VARCHAROID;  
00135     
00136     colnum = SPI_fnumber(tupdesc, args[0]);
00137     
00138     if (colnum == SPI_ERROR_NOATTRIBUTE)
00139       elog(ERROR, "Full Text Indexing: column '%s' not found", args[0]);
00140     column = SPI_getvalue(rettuple, tupdesc, colnum);
00141     
00142     char *buffer=SPI_getvalue(trigdata->tg_trigtuple, tupdesc, colnum);
00143     elog(NOTICE, "update");
00144     if(!column || (buffer && colnum && !strcmp(buffer, column)))
00145       return PointerGetDatum(rettuple);
00146   }
00147   
00148   if (isdelete) {
00149     Oid     *argtypes;
00150     int      nonum;
00151     
00152     argtypes = (Oid *) palloc(2 * sizeof(Oid));
00153     argtypes[0] = VARCHAROID; 
00154     argtypes[1] = INT4OID;
00155 
00156     nonum = SPI_fnumber(tupdesc, args[1]);
00157     if (nonum == SPI_ERROR_NOATTRIBUTE)
00158       elog(ERROR, "Full Text Indexing: column '%s' not found", args[1]);
00159     oid = strtoul(SPI_getvalue(rettuple, tupdesc, nonum), NULL, 10);
00160     char txt[100];
00161     sprintf(txt, "oid=%i", oid);
00162     elog(NOTICE, txt);
00163     del(oid, relname);
00164   }
00165   
00166   if (isinsert) {
00167     Oid     *argtypes;
00168     int      colnum, nonum;
00169     
00170     argtypes = (Oid *) palloc(2 * sizeof(Oid));
00171     argtypes[0] = VARCHAROID; 
00172     argtypes[1] = INT4OID;
00173     
00174     if(!column) {
00175       colnum = SPI_fnumber(tupdesc, args[0]);
00176       if (colnum == SPI_ERROR_NOATTRIBUTE)
00177         elog(ERROR, "Full Text Indexing: column '%s' not found", args[0]);
00178       column = SPI_getvalue(rettuple, tupdesc, colnum);
00179     }
00180 
00181     nonum = SPI_fnumber(tupdesc, args[1]);
00182     if (nonum == SPI_ERROR_NOATTRIBUTE)
00183       elog(ERROR, "Full Text Indexing: column '%s' not found", args[1]);
00184     
00185     oid = strtoul(SPI_getvalue(rettuple, tupdesc, nonum), NULL, 10);
00186     
00187     insert(column, rettuple, oid, relname);
00188   }
00189   
00190   
00191   overall_time+=clock()-tmptime;
00192   /*
00193   char txt[100];
00194   sprintf(txt, "\noverall: %f (100)\ncodec: %f (%f)\nbreakup %f (%f)\nlookup_stopword: %f (%f)\nlookup_substr: %f (%f)\nins_substr: %f (%f)\nsqltime: %f (%f)",
00195           overall_time,  
00196           codec_time, codec_time/overall_time*100, 
00197           breakup_time, breakup_time/overall_time*100, 
00198           lookup_stopword_time, lookup_stopword_time/overall_time*100, 
00199           lookup_substr_time, lookup_substr_time/overall_time*100, 
00200           ins_substr_time, ins_substr_time/overall_time*100,
00201           sql_time, sql_time/overall_time*100);
00202   elog(NOTICE, txt);
00203   */
00204   SPI_finish();  
00205   return PointerGetDatum(rettuple);
00206 }
00207 
00208 
00209   
00210 static void del(Oid oid, char *relname) {
00211   elog(NOTICE, "del()");
00212   char    query[8192];
00213   sprintf(query, "DELETE FROM search_index WHERE id=%i and tab=tabid('%s')", oid, relname);
00214   int ret = SPI_exec(query, 0);
00215   if (ret != SPI_OK_DELETE)
00216     elog(ERROR, "Full Text Indexing: error executing plan in delete");
00217 }
00218 
00219 static void insert(char *column, HeapTuple t, Oid oid, char *relname) {
00220   long strlen;
00221   char query[8192];
00222   wchar_t *substring;
00223   wchar_t *string;
00224   int ret;
00225   
00226   //  elog(NOTICE, "insert()");
00227   
00228   string=mb2wc(column, &strlen);
00229   if(string) {
00230     //    wchar_t   *buff=NULL;
00231     
00232     int i;
00233     for(i=0; i<strlen; i++) // mywcslen(string); i++)
00234       string[i] = towlower(string[i]);
00235     
00236     //    buff = palloc((strlen+1)*sizeof(wchar_t)); // ((mywcslen(string)+ 1)*sizeof(wchar_t));
00237     new_tuple = true;
00238     
00239     //    elog(NOTICE, "nach stopwords");
00240     
00241     ret = SPI_exec("select word from search_stopword order by word", 0);
00242     stopwordcount = SPI_processed;
00243     if ( ret == SPI_OK_SELECT && SPI_processed > 0 ) {
00244       TupleDesc tupdesc = SPI_tuptable->tupdesc;
00245       SPITupleTable *tuptable = SPI_tuptable;
00246       int j;
00247       stopwords=palloc(stopwordcount*sizeof(wchar_t*));
00248       
00249       for (j = 0; j < stopwordcount; j++) {
00250         HeapTuple tuple = tuptable->vals[j];
00251         wchar_t *s=mb2wc(SPI_getvalue(tuple, tupdesc, 1), NULL);
00252         stopwords[j]=s;
00253       }
00254     }
00255     
00256     
00257     while((substring = breakup(string, strlen))) {
00258       //      elog(NOTICE, "nach breakup()");
00259       Oid wordoid=InvalidOid;
00260       if(wcslen(substring)<4) continue;
00261       if(is_stopword(substring)) continue;
00262       if(lookupSubString(substring)!=InvalidOid) continue;
00263       //      elog(NOTICE, wc2mb(substring, NULL));
00264       
00265       // keyword already in database? wordoid = oid of the keyword or InvalidOid, if not in db
00266       sprintf(query, "select oid from search_wordlist where string='%s'", wc2mb(substring, NULL));
00267       ret=SPI_exec(query, 0);
00268       if(ret != SPI_OK_SELECT)
00269         elog(ERROR, "Full Text Indexing: error executing select from wordlist");
00270       
00271       if(SPI_processed==1) {
00272         bool null;
00273         wordoid=DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &null));
00274       }
00275       SPI_freetuptable(SPI_tuptable);
00276       
00277       // if keyword not in db...
00278       if(wordoid==InvalidOid) {
00279 
00280         // write keyword to db, oid result in wordoid
00281         sprintf(query, "insert into search_wordlist (string) values ('%s')", 
00282                 wc2mb(substring, NULL));
00283         ret = SPI_exec(query, 0);
00284         if (ret != SPI_OK_INSERT)
00285           elog(ERROR, "Full Text Indexing: error executing plan in insert");
00286         wordoid=SPI_lastoid;
00287 
00288         SPI_freetuptable(SPI_tuptable);
00289         
00290         // write substrings to db
00291         int parent=wordoid;
00292         int id=0;
00293         if(wcslen(substring)>3) {
00294           for(i=1; i<wcslen(substring)-3; i++) {
00295             wchar_t *s=substring+i;
00296 
00297             // substring already in db? res to soid.
00298             Oid soid=InvalidOid;
00299             sprintf(query, "select oid from search_wordlist where string='%s'", wc2mb(s, NULL));
00300             ret=SPI_exec(query, 0);
00301             if(ret != SPI_OK_SELECT)
00302               elog(ERROR, "Full Text Indexing: error executing select from wordlist");
00303             if(SPI_processed==1) {
00304               bool null;
00305               soid=DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &null));
00306             }
00307             SPI_freetuptable(SPI_tuptable);
00308             
00309             // substring not in db => write to db
00310             if(soid==InvalidOid) {
00311               sprintf(query, "insert into search_wordlist (string) values (lower('%s'))", wc2mb(s, NULL));
00312               ret = SPI_exec(query, 0);
00313               if (ret != SPI_OK_INSERT)
00314                 elog(ERROR, "Full Text Indexing: error executing plan in insert");
00315               id=SPI_lastoid;
00316               SPI_freetuptable(SPI_tuptable);
00317             }
00318             else id=soid;
00319 
00320             // insert substring-id as child to parent id.
00321             sprintf(query, "insert into search_parent(id, parent) values (%i, %i)", id, parent);
00322             ret = SPI_exec(query, 0);
00323             if (ret != SPI_OK_INSERT)
00324               elog(ERROR, "Full Text Indexing: error executing plan in insert");
00325             SPI_freetuptable(SPI_tuptable);
00326 
00327             // substring id becomes new parent id.
00328             parent=id;
00329 
00330             if(soid!=InvalidOid) break;
00331           }
00332         }
00333       }
00334 
00335       insertSubString(substring, wordoid);
00336       
00337       clock_t sqltime=clock();
00338       sprintf(query, "insert into search_index (string, id, tab) values (%u, %u, tabid('%s'))", wordoid, oid, relname);
00339       
00340       ret=SPI_exec(query, 0);
00341       if (ret != SPI_OK_INSERT)
00342         elog(ERROR, "Full Text Indexing: error executing plan in insert");
00343       SPI_freetuptable(SPI_tuptable);
00344       
00345       sql_time+=clock()-sqltime;
00346     }
00347     
00348     //    pfree(buff);
00349   }
00350 }
00351 
00352 static Oid lookupSubString(wchar_t *substring) {
00353   clock_t tmptime=clock();
00354   SubStrRec *r;
00355   int len=wcslen(substring);
00356   int h=0;
00357   int i;
00358 
00359   //  elog(NOTICE, "lookupSubString()");
00360 
00361   unsigned char *s=(unsigned char *)substring;
00362   for(i=0; i<len*sizeof(wchar_t); i++) h+=s[i];
00363 
00364   h=h%substrhashsize;
00365   r=substrhash[h];
00366   
00367   while(r && wcscmp(r->text, substring)!=0) {
00368     // elog(NOTICE, wc2mb(r->text, NULL));
00369     r=r->next;
00370   }
00371 
00372   lookup_substr_time+=clock()-tmptime;
00373   if(!r) return InvalidOid;
00374   else return r->oid;
00375 }
00376 
00377 void insertSubString(wchar_t *substring, Oid oid) {
00378   //  elog(NOTICE, "insertSubString()");
00379 
00380   clock_t tmptime=clock();
00381   int len=wcslen(substring);
00382   int h=0;
00383   int i;
00384 
00385   unsigned char *s=(unsigned char *)substring;
00386   for(i=0; i<len*sizeof(wchar_t); i++) h+=s[i];
00387   h=h%substrhashsize;
00388 
00389   SubStrRec *r=palloc(sizeof(SubStrRec));
00390   r->oid=oid;
00391   //  elog(NOTICE, wc2mb(substring, NULL));
00392   r->text=palloc((len+2)*sizeof(wchar_t));
00393   wcscpy(r->text, substring);
00394   r->next=NULL;
00395   if(!substrhash[h]) substrhash[h]=r;
00396   else {
00397     SubStrRec *s=substrhash[h];
00398     while(s->next) s=s->next;
00399     s->next=r;
00400   }
00401   ins_substr_time+=clock()-tmptime;
00402 }
00403 
00404 static wchar_t*
00405 breakup(wchar_t *string, long strlen)
00406 {
00407   //  elog(NOTICE, "breakup() start");
00408   // elog(NOTICE, wc2mb(string, NULL));
00409   clock_t tmptime=clock();
00410   static wchar_t *cur_pos;
00411   static wchar_t *substring;
00412   
00413   if (new_tuple) {
00414     cur_pos = string; 
00415     new_tuple = false;  /* don't initialize this next time */
00416   }
00417   
00418   while (cur_pos < string+strlen-1 && !ischar(*cur_pos)) 
00419     cur_pos++;
00420   if(cur_pos==string+strlen-1) return NULL;
00421   substring=cur_pos;
00422   while(cur_pos < string+strlen-1 && ischar(*cur_pos)) 
00423     cur_pos++;
00424   *cur_pos=L'\0';
00425 
00426   breakup_time+=clock()-tmptime;
00427   //  elog(NOTICE, wc2mb(substring, NULL));
00428   //  elog(NOTICE, wc2mb(cur_pos, NULL));
00429   return substring;
00430 }
00431 
00432 
/*
 * bsearch() comparator for the stopword array: `a` is the key, a wide
 * string; `b` points at an array element, i.e. at a pointer to a wide
 * string.  Returns the wcscmp() ordering of key versus element.
 */
int stopword_cmp(const void *a, const void *b) {
  const wchar_t  *key = a;
  wchar_t *const *slot = b;

  return wcscmp(key, *slot);
}
00438 
00439 /* copied from src/backend/parser/keywords.c and adjusted for our situation*/
00440 static bool is_stopword(wchar_t *text) 
00441 {
00442   clock_t tmptime=clock();
00443   bool ret= !(bsearch(text, stopwords, stopwordcount, sizeof(wchar_t*), stopword_cmp)==0L);
00444   lookup_stopword_time+=clock()-tmptime;
00445   return ret;
00446 }
00447 
/*
 * Return true if `c` is an alphabetic wide character in the current
 * locale.
 *
 * iswalpha() only promises "non-zero" for a match.  The result is compared
 * against 0 so that it is exactly true/false and cannot be truncated to
 * zero if bool is a narrow integer type.
 */
bool ischar(const wint_t c) {
  return iswalpha(c) != 0;
}
00451 
00452 
/*
 * Convert the NUL-terminated UTF-8 string `s` to a wide-character string
 * by way of codec().  If `len` is not NULL it is handed to codec() to
 * receive the length it reports; otherwise a throw-away local is used.
 * Returns NULL when `s` is NULL, else whatever codec() returns.
 */
wchar_t *mb2wc(char *s, long *len) {
  long  ignored;
  long *out = len ? len : &ignored;

  if (s == NULL)
    return NULL;

  return (wchar_t*) codec(s, strlen(s), "UTF8", "WCHAR_T", out);
}
00458 
/*
 * Convert the NUL-terminated wide-character string `s` to UTF-8 by way of
 * codec().  If `len` is not NULL it is handed to codec() to receive the
 * length it reports; otherwise a throw-away local is used.
 *
 * Returns NULL when `s` is NULL (matching mb2wc(); previously a NULL
 * argument was dereferenced by mywcslen()), else whatever codec() returns.
 */
char *wc2mb(wchar_t *s, long *len) {
  long l;

  if (s == NULL)
    return NULL;

  return codec((char*)s, mywcslen(s)*sizeof(wchar_t), "WCHAR_T", "UTF8", (len?len:&l));
}
00463 
00464 char *codec(char *s, int len, char *from, char *to, long *wclen) {
00465   clock_t tmptime=clock();
00466   if(!s || len==0 || !*s) return NULL;
00467   iconv_t cd = iconv_open(to, from);
00468   size_t avail=0;
00469   if(cd==(iconv_t)-1) elog(INFO, strerror(errno));
00470   int buflen=(len+1)*sizeof(wchar_t); // +len/4;
00471   avail=buflen;
00472   char *buffer = (char *) palloc(buflen);
00473   char *bufpointer=buffer;
00474   char *spointer=s;
00475   size_t nconv=0;
00476   size_t l=len;
00477   
00478   nconv=iconv(cd, &spointer, &l, &bufpointer, &avail);
00479   if(nconv==(size_t)-1) {
00480     if(errno==EILSEQ) elog(INFO, "EILSEQ");
00481     if(errno==EINVAL) elog(INFO, "EINVAL");
00482     if(errno==EBADF) elog(INFO, "EBADF");
00483     elog(ERROR, strerror(errno));
00484   }
00485   
00486   nconv=iconv(cd, NULL, NULL, &bufpointer, &avail);
00487 
00488   buffer=(char*) repalloc(buffer, buflen-avail+sizeof(wchar_t));
00489   *(wchar_t*)bufpointer=L'\0';
00490   iconv_close(cd);
00491   codec_time+=clock()-tmptime;
00492 
00493   *wclen=(buflen-avail)/sizeof(wchar_t);
00494 
00495   return buffer;
00496 }
00497 
/*
 * Count the wide characters in `s` up to, but not including, the
 * terminating L'\0'.  `s` must not be NULL.
 */
static long mywcslen(wchar_t *s) {
  const wchar_t *p = s;

  while (*p != L'\0')
    p++;

  return (long) (p - s);
}

Generated on Sat Mar 27 19:20:39 2004 for Literature by doxygen 1.3.6-20040222