00001 #include "server/postgres.h"
00002 #include <ctype.h>
00003 #include <wctype.h>
00004 #include <wchar.h>
00005 #include <iconv.h>
00006 #include <errno.h>
00007 #include "server/executor/spi.h"
00008 #include "server/commands/trigger.h"
00009 #include <stdlib.h>
00010 #include <unistd.h>
00011 #include <locale.h>
00012 #include <time.h>
00013
00014
00015 typedef struct SubStrRec {
00016 Oid oid;
00017 wchar_t *text;
00018 struct SubStrRec *next;
00019 } SubStrRec;
00020
/*
 * Profiling counters.  Each one accumulates clock() tick differences
 * measured around the corresponding piece of work.
 */
double codec_time           = 0;  /* character set conversion (codec) */
double breakup_time         = 0;  /* tokenising (breakup) */
double overall_time         = 0;  /* whole trigger invocation (fti) */
double ins_substr_time      = 0;  /* word cache insertion (insertSubString) */
double lookup_substr_time   = 0;  /* word cache lookup (lookupSubString) */
double lookup_stopword_time = 0;  /* stopword search (is_stopword) */
double sql_time             = 0;  /* search_index inserts (insert) */
00028
00029 extern Datum fti(PG_FUNCTION_ARGS);
00030 static wchar_t *breakup(wchar_t *, long);
00031 static int stopword_cmp(const void *a, const void *b);
00032 static bool is_stopword(wchar_t *);
00033 static bool new_tuple = false;
00034 static Oid lookupSubString(wchar_t *);
00035 static void insertSubString(wchar_t *, Oid);
00036 static bool ischar(const wint_t c);
00037 static wchar_t *mb2wc(char *s, long *len);
00038 static char *wc2mb(wchar_t *s, long *len);
00039 static char *codec(char *s, int len, char *from, char *to, long *wclen);
00040 static long mywcslen(wchar_t *s);
00041 static void insert(char *column, HeapTuple t, Oid oid, char *relname);
00042 static void del(Oid oid, char *relname);
00043 wchar_t **stopwords=0L;
00044 int stopwordcount=0;
00045 #define substrhashsize 10000
00046 SubStrRec *substrhash[substrhashsize];
00047
00048
00049 PG_FUNCTION_INFO_V1(fti);
00050
00051 Datum
00052 fti(PG_FUNCTION_ARGS)
00053 {
00054 int ret;
00055 if ((ret = SPI_connect()) < 0)
00056 elog(ERROR, "Full Text Indexing: SPI_connect failed, returned %d\n", ret);
00057
00058
00059
00060
00061 clock_t tmptime=clock();
00062 stopwordcount=0;
00063 stopwords=NULL;
00064 memset(substrhash, 0, substrhashsize*sizeof(SubStrRec*));
00065
00066 setlocale(LC_ALL, "de_DE@euro");
00067 elog(NOTICE, "====================== fti() neu =========================");
00068
00069 TriggerData *trigdata = (TriggerData *) fcinfo->context;
00070 Trigger *trigger;
00071 int nargs;
00072 char **args;
00073 char *relname;
00074 Relation rel;
00075 HeapTuple rettuple = NULL;
00076 TupleDesc tupdesc;
00077 bool isinsert = false;
00078 bool isdelete = false;
00079
00080 Oid oid;
00081 char *column=NULL;
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091 if (!CALLED_AS_TRIGGER(fcinfo))
00092 elog(ERROR, "Full Text Indexing: not fired by trigger manager");
00093 if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event))
00094 elog(ERROR, "Full Text Indexing: can't process STATEMENT events");
00095 if (TRIGGER_FIRED_BEFORE(trigdata->tg_event))
00096 elog(ERROR, "Full Text Indexing: must be fired AFTER event");
00097
00098 if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
00099 isinsert = true;
00100 if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event)) {
00101 isdelete = true;
00102 isinsert = true;
00103 }
00104 if (TRIGGER_FIRED_BY_DELETE(trigdata->tg_event))
00105 isdelete = true;
00106
00107 trigger = trigdata->tg_trigger;
00108 rel = trigdata->tg_relation;
00109 relname = SPI_getrelname(rel);
00110 rettuple = trigdata->tg_trigtuple;
00111 if (isdelete && isinsert)
00112 rettuple = trigdata->tg_newtuple;
00113
00114
00115
00116
00117 nargs = trigger->tgnargs;
00118 if (nargs != 2)
00119 elog(ERROR, "Full Text Indexing: trigger can only have 2 arguments");
00120
00121 args = trigger->tgargs;
00122 tupdesc = rel->rd_att;
00123
00124 oid = HeapTupleGetOid(rettuple);
00125 if (!OidIsValid(oid))
00126 elog(ERROR, "Full Text Indexing: oid of current tuple is NULL");
00127
00128
00129 if(isdelete && isinsert) {
00130 Oid *argtypes;
00131 int colnum;
00132
00133 argtypes = (Oid *) palloc(sizeof(Oid));
00134 argtypes[0] = VARCHAROID;
00135
00136 colnum = SPI_fnumber(tupdesc, args[0]);
00137
00138 if (colnum == SPI_ERROR_NOATTRIBUTE)
00139 elog(ERROR, "Full Text Indexing: column '%s' not found", args[0]);
00140 column = SPI_getvalue(rettuple, tupdesc, colnum);
00141
00142 char *buffer=SPI_getvalue(trigdata->tg_trigtuple, tupdesc, colnum);
00143 elog(NOTICE, "update");
00144 if(!column || (buffer && colnum && !strcmp(buffer, column)))
00145 return PointerGetDatum(rettuple);
00146 }
00147
00148 if (isdelete) {
00149 Oid *argtypes;
00150 int nonum;
00151
00152 argtypes = (Oid *) palloc(2 * sizeof(Oid));
00153 argtypes[0] = VARCHAROID;
00154 argtypes[1] = INT4OID;
00155
00156 nonum = SPI_fnumber(tupdesc, args[1]);
00157 if (nonum == SPI_ERROR_NOATTRIBUTE)
00158 elog(ERROR, "Full Text Indexing: column '%s' not found", args[1]);
00159 oid = strtoul(SPI_getvalue(rettuple, tupdesc, nonum), NULL, 10);
00160 char txt[100];
00161 sprintf(txt, "oid=%i", oid);
00162 elog(NOTICE, txt);
00163 del(oid, relname);
00164 }
00165
00166 if (isinsert) {
00167 Oid *argtypes;
00168 int colnum, nonum;
00169
00170 argtypes = (Oid *) palloc(2 * sizeof(Oid));
00171 argtypes[0] = VARCHAROID;
00172 argtypes[1] = INT4OID;
00173
00174 if(!column) {
00175 colnum = SPI_fnumber(tupdesc, args[0]);
00176 if (colnum == SPI_ERROR_NOATTRIBUTE)
00177 elog(ERROR, "Full Text Indexing: column '%s' not found", args[0]);
00178 column = SPI_getvalue(rettuple, tupdesc, colnum);
00179 }
00180
00181 nonum = SPI_fnumber(tupdesc, args[1]);
00182 if (nonum == SPI_ERROR_NOATTRIBUTE)
00183 elog(ERROR, "Full Text Indexing: column '%s' not found", args[1]);
00184
00185 oid = strtoul(SPI_getvalue(rettuple, tupdesc, nonum), NULL, 10);
00186
00187 insert(column, rettuple, oid, relname);
00188 }
00189
00190
00191 overall_time+=clock()-tmptime;
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204 SPI_finish();
00205 return PointerGetDatum(rettuple);
00206 }
00207
00208
00209
00210 static void del(Oid oid, char *relname) {
00211 elog(NOTICE, "del()");
00212 char query[8192];
00213 sprintf(query, "DELETE FROM search_index WHERE id=%i and tab=tabid('%s')", oid, relname);
00214 int ret = SPI_exec(query, 0);
00215 if (ret != SPI_OK_DELETE)
00216 elog(ERROR, "Full Text Indexing: error executing plan in delete");
00217 }
00218
00219 static void insert(char *column, HeapTuple t, Oid oid, char *relname) {
00220 long strlen;
00221 char query[8192];
00222 wchar_t *substring;
00223 wchar_t *string;
00224 int ret;
00225
00226
00227
00228 string=mb2wc(column, &strlen);
00229 if(string) {
00230
00231
00232 int i;
00233 for(i=0; i<strlen; i++)
00234 string[i] = towlower(string[i]);
00235
00236
00237 new_tuple = true;
00238
00239
00240
00241 ret = SPI_exec("select word from search_stopword order by word", 0);
00242 stopwordcount = SPI_processed;
00243 if ( ret == SPI_OK_SELECT && SPI_processed > 0 ) {
00244 TupleDesc tupdesc = SPI_tuptable->tupdesc;
00245 SPITupleTable *tuptable = SPI_tuptable;
00246 int j;
00247 stopwords=palloc(stopwordcount*sizeof(wchar_t*));
00248
00249 for (j = 0; j < stopwordcount; j++) {
00250 HeapTuple tuple = tuptable->vals[j];
00251 wchar_t *s=mb2wc(SPI_getvalue(tuple, tupdesc, 1), NULL);
00252 stopwords[j]=s;
00253 }
00254 }
00255
00256
00257 while((substring = breakup(string, strlen))) {
00258
00259 Oid wordoid=InvalidOid;
00260 if(wcslen(substring)<4) continue;
00261 if(is_stopword(substring)) continue;
00262 if(lookupSubString(substring)!=InvalidOid) continue;
00263
00264
00265
00266 sprintf(query, "select oid from search_wordlist where string='%s'", wc2mb(substring, NULL));
00267 ret=SPI_exec(query, 0);
00268 if(ret != SPI_OK_SELECT)
00269 elog(ERROR, "Full Text Indexing: error executing select from wordlist");
00270
00271 if(SPI_processed==1) {
00272 bool null;
00273 wordoid=DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &null));
00274 }
00275 SPI_freetuptable(SPI_tuptable);
00276
00277
00278 if(wordoid==InvalidOid) {
00279
00280
00281 sprintf(query, "insert into search_wordlist (string) values ('%s')",
00282 wc2mb(substring, NULL));
00283 ret = SPI_exec(query, 0);
00284 if (ret != SPI_OK_INSERT)
00285 elog(ERROR, "Full Text Indexing: error executing plan in insert");
00286 wordoid=SPI_lastoid;
00287
00288 SPI_freetuptable(SPI_tuptable);
00289
00290
00291 int parent=wordoid;
00292 int id=0;
00293 if(wcslen(substring)>3) {
00294 for(i=1; i<wcslen(substring)-3; i++) {
00295 wchar_t *s=substring+i;
00296
00297
00298 Oid soid=InvalidOid;
00299 sprintf(query, "select oid from search_wordlist where string='%s'", wc2mb(s, NULL));
00300 ret=SPI_exec(query, 0);
00301 if(ret != SPI_OK_SELECT)
00302 elog(ERROR, "Full Text Indexing: error executing select from wordlist");
00303 if(SPI_processed==1) {
00304 bool null;
00305 soid=DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &null));
00306 }
00307 SPI_freetuptable(SPI_tuptable);
00308
00309
00310 if(soid==InvalidOid) {
00311 sprintf(query, "insert into search_wordlist (string) values (lower('%s'))", wc2mb(s, NULL));
00312 ret = SPI_exec(query, 0);
00313 if (ret != SPI_OK_INSERT)
00314 elog(ERROR, "Full Text Indexing: error executing plan in insert");
00315 id=SPI_lastoid;
00316 SPI_freetuptable(SPI_tuptable);
00317 }
00318 else id=soid;
00319
00320
00321 sprintf(query, "insert into search_parent(id, parent) values (%i, %i)", id, parent);
00322 ret = SPI_exec(query, 0);
00323 if (ret != SPI_OK_INSERT)
00324 elog(ERROR, "Full Text Indexing: error executing plan in insert");
00325 SPI_freetuptable(SPI_tuptable);
00326
00327
00328 parent=id;
00329
00330 if(soid!=InvalidOid) break;
00331 }
00332 }
00333 }
00334
00335 insertSubString(substring, wordoid);
00336
00337 clock_t sqltime=clock();
00338 sprintf(query, "insert into search_index (string, id, tab) values (%u, %u, tabid('%s'))", wordoid, oid, relname);
00339
00340 ret=SPI_exec(query, 0);
00341 if (ret != SPI_OK_INSERT)
00342 elog(ERROR, "Full Text Indexing: error executing plan in insert");
00343 SPI_freetuptable(SPI_tuptable);
00344
00345 sql_time+=clock()-sqltime;
00346 }
00347
00348
00349 }
00350 }
00351
00352 static Oid lookupSubString(wchar_t *substring) {
00353 clock_t tmptime=clock();
00354 SubStrRec *r;
00355 int len=wcslen(substring);
00356 int h=0;
00357 int i;
00358
00359
00360
00361 unsigned char *s=(unsigned char *)substring;
00362 for(i=0; i<len*sizeof(wchar_t); i++) h+=s[i];
00363
00364 h=h%substrhashsize;
00365 r=substrhash[h];
00366
00367 while(r && wcscmp(r->text, substring)!=0) {
00368
00369 r=r->next;
00370 }
00371
00372 lookup_substr_time+=clock()-tmptime;
00373 if(!r) return InvalidOid;
00374 else return r->oid;
00375 }
00376
00377 void insertSubString(wchar_t *substring, Oid oid) {
00378
00379
00380 clock_t tmptime=clock();
00381 int len=wcslen(substring);
00382 int h=0;
00383 int i;
00384
00385 unsigned char *s=(unsigned char *)substring;
00386 for(i=0; i<len*sizeof(wchar_t); i++) h+=s[i];
00387 h=h%substrhashsize;
00388
00389 SubStrRec *r=palloc(sizeof(SubStrRec));
00390 r->oid=oid;
00391
00392 r->text=palloc((len+2)*sizeof(wchar_t));
00393 wcscpy(r->text, substring);
00394 r->next=NULL;
00395 if(!substrhash[h]) substrhash[h]=r;
00396 else {
00397 SubStrRec *s=substrhash[h];
00398 while(s->next) s=s->next;
00399 s->next=r;
00400 }
00401 ins_substr_time+=clock()-tmptime;
00402 }
00403
00404 static wchar_t*
00405 breakup(wchar_t *string, long strlen)
00406 {
00407
00408
00409 clock_t tmptime=clock();
00410 static wchar_t *cur_pos;
00411 static wchar_t *substring;
00412
00413 if (new_tuple) {
00414 cur_pos = string;
00415 new_tuple = false;
00416 }
00417
00418 while (cur_pos < string+strlen-1 && !ischar(*cur_pos))
00419 cur_pos++;
00420 if(cur_pos==string+strlen-1) return NULL;
00421 substring=cur_pos;
00422 while(cur_pos < string+strlen-1 && ischar(*cur_pos))
00423 cur_pos++;
00424 *cur_pos=L'\0';
00425
00426 breakup_time+=clock()-tmptime;
00427
00428
00429 return substring;
00430 }
00431
00432
/*
 * stopword_cmp - bsearch() comparator for the stopword list.
 *
 * a  the search key: a wide-character string
 * b  an array element: a pointer to a wide-character string pointer
 *
 * Returns the wcscmp() result of the key against the element's string.
 */
int stopword_cmp(const void *a, const void *b) {
    wchar_t *key = (wchar_t *) a;
    wchar_t *entry = *(wchar_t **) b;

    return wcscmp(key, entry);
}
00438
00439
00440 static bool is_stopword(wchar_t *text)
00441 {
00442 clock_t tmptime=clock();
00443 bool ret= !(bsearch(text, stopwords, stopwordcount, sizeof(wchar_t*), stopword_cmp)==0L);
00444 lookup_stopword_time+=clock()-tmptime;
00445 return ret;
00446 }
00447
/*
 * ischar - true if c is an alphabetic wide character in the current locale.
 *
 * iswalpha() only promises "nonzero" for a match.  The result is compared
 * against 0 so that it cannot be truncated to false where bool is a
 * narrow integer type.
 */
bool ischar(const wint_t c) {
    return iswalpha(c) != 0;
}
00451
00452
/*
 * mb2wc - convert a NUL-terminated "UTF8" string to wide characters.
 *
 * s    input string; NULL yields NULL
 * len  if not NULL, receives the count that codec() reports
 *
 * Returns the buffer produced by codec(), cast to wchar_t *.
 */
wchar_t *mb2wc(char *s, long *len) {
    long ignored;
    long *count = len ? len : &ignored;

    if (!s)
        return 0L;
    return (wchar_t *) codec(s, strlen(s), "UTF8", "WCHAR_T", count);
}
00458
/*
 * wc2mb - convert a NUL-terminated wide-character string to "UTF8".
 *
 * s    input string; NULL yields NULL (as mb2wc() does) instead of being
 *      dereferenced while measuring its length
 * len  if not NULL, receives the count that codec() reports
 *
 * Returns the buffer produced by codec(), or NULL when there is nothing to
 * convert.
 */
char *wc2mb(wchar_t *s, long *len) {
    long ignored;

    if (!s)
        return NULL;
    return codec((char *) s, mywcslen(s) * sizeof(wchar_t), "WCHAR_T", "UTF8",
                 (len ? len : &ignored));
}
00463
00464 char *codec(char *s, int len, char *from, char *to, long *wclen) {
00465 clock_t tmptime=clock();
00466 if(!s || len==0 || !*s) return NULL;
00467 iconv_t cd = iconv_open(to, from);
00468 size_t avail=0;
00469 if(cd==(iconv_t)-1) elog(INFO, strerror(errno));
00470 int buflen=(len+1)*sizeof(wchar_t);
00471 avail=buflen;
00472 char *buffer = (char *) palloc(buflen);
00473 char *bufpointer=buffer;
00474 char *spointer=s;
00475 size_t nconv=0;
00476 size_t l=len;
00477
00478 nconv=iconv(cd, &spointer, &l, &bufpointer, &avail);
00479 if(nconv==(size_t)-1) {
00480 if(errno==EILSEQ) elog(INFO, "EILSEQ");
00481 if(errno==EINVAL) elog(INFO, "EINVAL");
00482 if(errno==EBADF) elog(INFO, "EBADF");
00483 elog(ERROR, strerror(errno));
00484 }
00485
00486 nconv=iconv(cd, NULL, NULL, &bufpointer, &avail);
00487
00488 buffer=(char*) repalloc(buffer, buflen-avail+sizeof(wchar_t));
00489 *(wchar_t*)bufpointer=L'\0';
00490 iconv_close(cd);
00491 codec_time+=clock()-tmptime;
00492
00493 *wclen=(buflen-avail)/sizeof(wchar_t);
00494
00495 return buffer;
00496 }
00497
/* mywcslen - number of wide characters in s before the terminating L'\0'. */
static long mywcslen(wchar_t *s) {
    wchar_t *end = s;

    while (*end != L'\0')
        end++;
    return (long) (end - s);
}