/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include "udm_config.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <ctype.h>

#include "udm_common.h"
#include "udm_textlist.h"
#include "udm_parsehtml.h"
#include "udm_utils.h"
#include "udm_url.h"
#include "udm_match.h"
#include "udm_log.h"
#include "udm_xmalloc.h"
#include "udm_server.h"
#include "udm_hrefs.h"
#include "udm_word.h"
#include "udm_crossword.h"
#include "udm_spell.h"
#include "udm_unicode.h"
#include "udm_unidata.h"
#include "udm_uniconv.h"
#include "udm_sgml.h"
#include "udm_guesser.h"
#include "udm_crc32.h"
#include "udm_vars.h"
#include "udm_mutex.h"
#include "udm_searchtool.h"

/****************************************************************/


/*
 * Prepare a section's value buffer for appending one more text fragment.
 *
 * First use (Sec->val == NULL): allocate a buffer of Sec->maxlen + 1 bytes.
 * Later uses: append the separator configured in the variable
 * "separator.<section name>" (default " ") at Sec->curlen.  If the separator
 * does not fit with at least one byte to spare, the section is marked full
 * by setting Sec->curlen to Sec->maxlen.
 *
 * Returns UDM_OK, or UDM_ERROR when the value buffer cannot be allocated
 * (Sec->val is then still NULL and must not be written to).
 */
static int UdmReallocSection(UDM_AGENT *Indexer, UDM_VAR *Sec)
{
	if(!Sec->val){
		Sec->val=(char*)UdmMalloc(Sec->maxlen+1);
		if(!Sec->val){
			return UDM_ERROR;
		}
	}else{
		/* Add separator */
		const char *uspace = " ";
		char *vn = UdmStrStore(NULL, "separator.");
		size_t uspacel;
		size_t space_left;

		/* Only build the full name if the prefix was stored; otherwise */
		/* the lookup below would use the bare section name.            */
		if (vn != NULL) {
			vn = UdmStrStore(vn, Sec->name);
		}

		/* The separator string is owned by the configuration, so it is */
		/* copied while the configuration lock is held.                 */
		UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
		if (vn != NULL) {
			uspace = UdmVarListFindStr(&Indexer->Conf->Vars, vn, " ");
		}
		uspacel = uspace ? strlen(uspace) : 0;
		space_left = (Sec->curlen < Sec->maxlen) ? (Sec->maxlen - Sec->curlen) : 0;
		if (space_left > uspacel) {
			/* curlen + uspacel < maxlen, so the terminator stays in bounds */
			if (uspacel > 0) {
				memcpy(Sec->val + Sec->curlen, uspace, uspacel);
			}
			Sec->curlen += uspacel;
			Sec->val[Sec->curlen] = '\0';
		} else {
			Sec->curlen = Sec->maxlen;
		}
		UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);

		UDM_FREE(vn);
	}
	return UDM_OK;
}

/*
 * Convert the collected text items of a document into words and sections.
 *
 * For every item of Doc->TextList the function
 *   - converts Item->str from the document charset ("Parser.Charset",
 *     then "Charset", then "RemoteCharset", default iso-8859-1) to the
 *     internal "sys-int" representation and removes double spaces;
 *   - feeds the converted text into a running CRC32, unless the item's
 *     section name starts with "url";
 *   - if Item->section is non-zero: lowercases and segments the text and
 *     adds every token whose length is within [min_word_len, max_word_len]
 *     to the word list, plus a crossword entry when the item has an href
 *     and a "crosswords" section is defined;
 *   - appends the text (converted to the local charset) to the section
 *     named Item->section_name, and the unconverted bytes to the section
 *     "Raw.<section_name>", both limited by the section's maxlen.
 *
 * Finally the CRC32 is stored in the "crc32" section variable.
 *
 * Returns UDM_OK on success, the UdmWordListAdd() result if adding a word
 * failed, or UDM_ERROR when memory cannot be allocated (in that case the
 * "crc32" variable is not updated).
 */
int UdmPrepareWords(UDM_AGENT * Indexer,UDM_DOCUMENT * Doc){
	size_t		i;
	const char	*doccset;
	UDM_CHARSET	*doccs;
	UDM_CHARSET	*loccs;
	UDM_CHARSET	*sys_int;
	UDM_CONV	dc_uni;    /* document charset -> sys-int   */
	UDM_CONV	uni_lc;    /* sys-int -> local charset      */
	UDM_TEXTLIST	*tlist=&Doc->TextList;
	UDM_VAR		*Sec;
	int		res=UDM_OK;
	int		crc32=0;
	int		crossec;
	int             *uword = NULL;    /* Word in UNICODE      */
	char            *lcsword = NULL;  /* Word in LocalCharset */
	int             *ustr = NULL;     /* Current item, converted and segmented  */
	int             *UStr = NULL;     /* Current item, converted, original case */
	size_t          max_word_len, min_word_len, uwordlen = UDM_MAXWORDSIZE;
#ifdef USE_PARANOIA
	void *paran = UdmViolationEnter();
#endif
	
	if ((uword = (int*)UdmMalloc((uwordlen + 1) * sizeof(int))) == NULL) {
		goto fail;
	}
	if ((lcsword = (char*)UdmMalloc(12 * uwordlen + 1)) == NULL) {
		goto fail;
	}

	Sec=UdmVarListFind(&Doc->Sections,"crosswords");
	crossec=Sec?Sec->section:0;
	
	doccset=UdmVarListFindStr(&Doc->Sections,"Parser.Charset",NULL);
	if(!doccset)doccset=UdmVarListFindStr(&Doc->Sections,"Charset",NULL);
	if(!doccset||!*doccset)doccset=UdmVarListFindStr(&Doc->Sections,"RemoteCharset","iso-8859-1");
	doccs=UdmGetCharSet(doccset);
	if(!doccs)doccs=UdmGetCharSet("iso-8859-1");
	loccs = Doc->lcs;
	if(!loccs)loccs=UdmGetCharSet("iso-8859-1");
	sys_int=UdmGetCharSet("sys-int");

	UdmConvInit(&dc_uni,doccs,sys_int,UDM_RECODE_HTML);
	UdmConvInit(&uni_lc,sys_int,loccs,UDM_RECODE_HTML);

	UDM_GETLOCK(Indexer, UDM_LOCK_CONF);
	max_word_len = Indexer->Conf->WordParam.max_word_len;
	min_word_len = Indexer->Conf->WordParam.min_word_len;
	UDM_RELEASELOCK(Indexer, UDM_LOCK_CONF);
	
	
	/* Now convert everything to UNICODE format and calculate CRC32 */
	
	for(i=0;i<tlist->nitems;i++){
		size_t		srclen;
		size_t		srclen0;
		size_t		dstlen;
		size_t		reslen;
		char		*src,*dst;
		int		*lt, *tok;
		UDM_TEXTITEM	*Item=&tlist->Item[i];
		char		secname[128];
		
		/* Start every item with clean pointers so that the shared */
		/* cleanup code never sees buffers of a previous item.     */
		ustr = NULL;
		UStr = NULL;
		
		srclen0=strlen(Item->str);
		srclen=srclen0+1;			/* with '\0' */
		dstlen=(3*srclen+1)*sizeof(int);	/* with '\0' */
		
		if ((ustr = (int*)UdmMalloc(dstlen)) == NULL) {
			/* dstlen is a size_t: cast it to match the conversion specifier */
			UdmLog(Indexer, UDM_LOG_ERROR, "%s:%d Can't alloc %lu bytes", __FILE__, __LINE__, (unsigned long)dstlen);
			goto fail;
		}
		
		src=Item->str;
		dst=(char*)ustr;
		UdmConv(&dc_uni,dst,dstlen,src,srclen);
		UdmUniRemoveDoubleSpaces(ustr);
		if ((UStr = UdmUniDup(ustr)) == NULL) {
			UdmLog(Indexer, UDM_LOG_ERROR, "%s:%d Can't UdmUniDup", __FILE__, __LINE__);
			goto fail;
		}
		reslen = UdmUniLen(ustr);
		
		/*
		TODO for clones detection:
		Replace any separators into space to ignore 
		various pseudo-graphics, commas, semicolons
		and so on to improve clone detection quality
		*/
		
		/* NOTE(review): reslen is the value returned by UdmUniLen(), while  */
		/* the buffer is passed as char*.  If UdmUniLen() counts int-sized   */
		/* units, only part of the text is hashed.  Left unchanged so that   */
		/* computed values stay comparable with existing ones - confirm.     */
		if (strncasecmp(Item->section_name, "url", 3) != 0) /* do not calculate crc32  on url* sections */
			crc32=UdmCRC32Update(crc32,(char*)ustr,reslen);
		
		if(Item->section){
			UdmUniStrToLower(ustr);
			ustr = UdmUniSegment(Indexer, ustr, UdmVarListFindStr(&Doc->Sections, "Content-Language", ""));

			for(tok=UdmUniGetToken(ustr,&lt); tok ; tok = UdmUniGetToken(NULL, &lt) ){
				size_t	tlen;				/* Word length          */ 
				
				tlen=lt-tok;
				
				if (tlen <= max_word_len && tlen >= min_word_len) {
				
					/* Grow both word buffers when the token is longer */
					/* than anything seen so far.                      */
					/* NOTE(review): on failure the old block is not   */
					/* freed here because it is not visible whether    */
					/* UdmRealloc() releases it itself - confirm.      */
					if (tlen > uwordlen) {
						uwordlen = tlen;
						if ((uword = (int*)UdmRealloc(uword, (uwordlen + 1) * sizeof(int))) == NULL) {
							goto fail;
						}
						if ((lcsword = (char*)UdmRealloc(lcsword, 12 * uwordlen + 1)) == NULL) {
							goto fail;
						}
					}

					memcpy(uword,tok,tlen*sizeof(int));
					uword[tlen]=0;
				
					UdmConv(&uni_lc, lcsword, 12 * uwordlen + 1, (char*)uword, sizeof(*uword) * (tlen + 1));
				
					res=UdmWordListAdd(Doc,lcsword,Item->section);
					if(res!=UDM_OK)break;
				
					if(Item->href && crossec){
						UDM_CROSSWORD cw;
						cw.url=Item->href;
						cw.weight = crossec;
						cw.pos=Doc->CrossWords.wordpos;
						cw.word=lcsword;
						UdmCrossListAdd(Doc,&cw);
					}
				}
			}
		}
		
		/* Append the text in the local charset to its section */
		if((Sec=UdmVarListFind(&Doc->Sections,Item->section_name))){

			if(Sec->curlen < Sec->maxlen){
				int cnvres;

				UdmReallocSection(Indexer,Sec);
				if (Sec->val == NULL) {
					/* The section buffer could not be allocated */
					UdmLog(Indexer, UDM_LOG_ERROR, "%s:%d Can't alloc section buffer", __FILE__, __LINE__);
					goto fail;
				}

				src = (char*)UStr;
				srclen = UdmUniLen(UStr) * sizeof(int);
				dstlen=Sec->maxlen-Sec->curlen;
				cnvres=UdmConv(&uni_lc,Sec->val+Sec->curlen,dstlen,src,srclen);
				Sec->curlen+=uni_lc.obytes;
				Sec->val[Sec->curlen]='\0';
				
				/* A negative conversion result marks the section as full */
				if (cnvres<0){
					Sec->curlen=Sec->maxlen;
				}
			}
		}
		
		/* Append the unconverted bytes to the "Raw." twin of the section */
		udm_snprintf(secname,sizeof(secname)-1,"Raw.%s",Item->section_name);
		if ((Sec=UdmVarListFind(&Doc->Sections,secname)))
		{
			if(Sec->curlen < Sec->maxlen){
				size_t nbytes;
				
				UdmReallocSection(Indexer,Sec);
				if (Sec->val == NULL) {
					/* The section buffer could not be allocated */
					UdmLog(Indexer, UDM_LOG_ERROR, "%s:%d Can't alloc section buffer", __FILE__, __LINE__);
					goto fail;
				}
				
				dstlen= Sec->maxlen-Sec->curlen;
				nbytes= dstlen < srclen0 ? dstlen : srclen0;
				memcpy(Sec->val+Sec->curlen,Item->str,nbytes);
				Sec->curlen+=nbytes;
				Sec->val[Sec->curlen]='\0';
				
				/* The text was truncated: mark the section as full */
				if (dstlen<srclen0){
					Sec->curlen=Sec->maxlen;
				}
			}
		}
		
		UDM_FREE(ustr);
		UDM_FREE(UStr);
		if(res!=UDM_OK)break;
		
	}
	UdmVarListReplaceInt(&Doc->Sections,"crc32",crc32);
	
	UDM_FREE(uword); UDM_FREE(lcsword);
#ifdef USE_PARANOIA
	UdmViolationExit(paran);
#endif
	return res;

fail:
	/* Allocation failure: release whatever is held and report an error */
	UDM_FREE(ustr); UDM_FREE(UStr);
	UDM_FREE(uword); UDM_FREE(lcsword);
#ifdef USE_PARANOIA
	UdmViolationExit(paran);
#endif
	return UDM_ERROR;
}


/**************************** Built-in Parsers ***************************/

/*
 * Add the parts of the document's current URL to its text list.
 *
 * For each of the sections "url.proto", "url.host", "url.path" and
 * "url.file" that is defined in Doc->Sections, one text item is added with
 * the corresponding field of Doc->CurURL (schema, hostname, path, filename).
 * The file name is passed through UdmUnescapeCGIQuery() first; if the
 * temporary buffer for that cannot be allocated the file name is skipped.
 *
 * Always returns UDM_OK.
 */
int UdmParseURLText(UDM_AGENT *A,UDM_DOCUMENT *Doc){
	UDM_TEXTITEM	Item;
	UDM_VAR		*Sec;
	char		proto_sec[] = "url.proto";
	char		host_sec[]  = "url.host";
	char		path_sec[]  = "url.path";
	char		file_sec[]  = "url.file";

	Item.href = NULL;

	/* Protocol (schema) */
	Sec = UdmVarListFind(&Doc->Sections, "url.proto");
	if (Sec != NULL) {
		Item.section_name = proto_sec;
		Item.section = Sec->section;
		Item.str = UDM_NULL2EMPTY(Doc->CurURL.schema);
		UdmTextListAdd(&Doc->TextList, &Item);
	}

	/* Host name */
	Sec = UdmVarListFind(&Doc->Sections, "url.host");
	if (Sec != NULL) {
		Item.section_name = host_sec;
		Item.section = Sec->section;
		Item.str = UDM_NULL2EMPTY(Doc->CurURL.hostname);
		UdmTextListAdd(&Doc->TextList, &Item);
	}

	/* Path */
	Sec = UdmVarListFind(&Doc->Sections, "url.path");
	if (Sec != NULL) {
		Item.section_name = path_sec;
		Item.section = Sec->section;
		Item.str = UDM_NULL2EMPTY(Doc->CurURL.path);
		UdmTextListAdd(&Doc->TextList, &Item);
	}

	/* File name, unescaped into a temporary buffer */
	Sec = UdmVarListFind(&Doc->Sections, "url.file");
	if (Sec != NULL) {
		char *unescaped;

		unescaped = (char*)UdmMalloc(strlen(UDM_NULL2EMPTY(Doc->CurURL.filename)) + 1);
		if (unescaped != NULL) {
			UdmUnescapeCGIQuery(unescaped, UDM_NULL2EMPTY(Doc->CurURL.filename));
			Item.section_name = file_sec;
			Item.section = Sec->section;
			Item.str = unescaped;
			UdmTextListAdd(&Doc->TextList, &Item);
			UDM_FREE(unescaped);
		}
	}

	return UDM_OK;
}

/*
 * Add document variables to the text list under their "header." sections.
 *
 * For every variable NAME in Doc->Sections, a section called "header.NAME"
 * is looked up; when it exists, the variable's value is added to the text
 * list with that section's number and name.
 *
 * Always returns UDM_OK.
 */
int UdmParseHeaders(UDM_AGENT *Indexer,UDM_DOCUMENT *Doc){
	UDM_TEXTITEM	Item;
	size_t		n;

	Item.href = NULL;

	for (n = 0; n < Doc->Sections.nvars; n++) {
		char	hdrname[128];
		UDM_VAR	*HdrSec;

		udm_snprintf(hdrname, sizeof(hdrname), "header.%s", Doc->Sections.Var[n].name);
		hdrname[sizeof(hdrname) - 1] = '\0';

		HdrSec = UdmVarListFind(&Doc->Sections, hdrname);
		if (HdrSec == NULL)
			continue;

		/* NOTE(review): the value is passed on as is, even when it is NULL; */
		/* whether UdmTextListAdd() copes with a NULL string is not visible  */
		/* here - confirm.                                                   */
		Item.section_name = hdrname;
		Item.section = HdrSec->section;
		Item.str = Doc->Sections.Var[n].val;
		UdmTextListAdd(&Doc->TextList, &Item);
	}

	return UDM_OK;
}

/*
 * Plain-text parser: add every line of the document to the "body" section.
 *
 * Nothing is added unless a "body" section is defined, the document has
 * content, and Doc->Spider.index is set.  The content buffer is split in
 * place with udm_strtok_r() on '\r' and '\n'.
 *
 * Always returns UDM_OK.
 */
int UdmParseText(UDM_AGENT * Indexer,UDM_DOCUMENT * Doc){
	UDM_TEXTITEM	Item;
	UDM_VAR		*BodySec = UdmVarListFind(&Doc->Sections, "body");
	char		body_name[] = "body";
	char		*save;

	Item.href = NULL;

	if (!BodySec || !Doc->Buf.content || !Doc->Spider.index)
		return(UDM_OK);

	Item.section = BodySec->section;
	Item.section_name = body_name;

	/* One text item per line */
	for (Item.str = udm_strtok_r(Doc->Buf.content, "\r\n", &save);
	     Item.str;
	     Item.str = udm_strtok_r(NULL, "\r\n", &save)) {
		UdmTextListAdd(&Doc->TextList, &Item);
	}

	return(UDM_OK);
}


/* Advance the 'b' cursor of the tokenizer state passed in d by one char. */
static void UdmNextCharB(void *d) {
  ((UDM_HTMLTOK *)d)->b += 1;
}

/* Advance the 'e' cursor of the tokenizer state passed in d by one char. */
static void UdmNextCharE(void *d) {
  ((UDM_HTMLTOK *)d)->e += 1;
}


/*
 * Reset an HTML tokenizer state: every byte of *tag is cleared, then the
 * cursor-advance callbacks are set to the default one-character steppers.
 *
 * Always returns UDM_OK.
 */
int UdmHTMLTOKInit(UDM_HTMLTOK *tag) {
  /* memset() is standard C and declared by <string.h>, unlike bzero() */
  memset(tag, 0, sizeof(*tag));
  tag->next_b = &UdmNextCharB;
  tag->next_e = &UdmNextCharE;
  return UDM_OK;
}


/*
 * Return the next HTML token of a buffer.
 *
 * s   - start of the buffer for the first call, or NULL to continue at the
 *       position stored in *lt by the previous call.
 * lt  - in/out: receives the position right after the returned token.
 * t   - tokenizer state.  t->type is set to UDM_HTML_COM (the token starts
 *       with "<!--"), UDM_HTML_TAG (it starts with '<') or UDM_HTML_TXT.
 *
 * For a tag, t->toks[0 .. t->ntoks-1] describe the tag name (token 0) and
 * its attributes as name/value spans that point into the buffer; nothing is
 * copied or NUL-terminated.  The script, comment, style and body flags of
 * the state are updated from the tag name, and the comment flag also from
 * the <!--UdmComment--> / <!--/UdmComment--> markers.
 *
 * Returns the start of the token, or NULL when there is no more input.
 */
const char * UdmHTMLToken(const char * s, const char ** lt,UDM_HTMLTOK *t){

	t->ntoks=0;
	t->s = s;
	t->lt = lt;
	
	/* Continue from the saved position when no new buffer is given */
	if(t->s == NULL && (t->s = *lt) == NULL)
		return NULL;

	if(!*t->s) return NULL;
	
	if(!strncmp(t->s,"<!--",4))t->type=UDM_HTML_COM;
	else	
	if(*t->s=='<')t->type=UDM_HTML_TAG;
	else	t->type=UDM_HTML_TXT;

	switch(t->type){
		case UDM_HTML_TAG:

			/* Each iteration parses one name or name=value pair */
			for(*lt = t->b = t->s + 1; *t->b; ) {
				const char * valbeg=NULL;
				const char * valend=NULL;
				size_t nt=t->ntoks;
				
				
				/* Skip leading spaces */
				while((*t->b)&&strchr(" \t\r\n",*t->b)) (*t->next_b)(t);

				if(*t->b=='>'){
					*lt = t->b + 1;
					return(t->s);
				}

				if(*t->b=='<'){ /* Probably a broken tag: stop before the next '<' */
					*lt = t->b;
					return(t->s);
				}

				/* Skip non-spaces, i.e. name */
				for(t->e = t->b; (*t->e) && !strchr(" =>\t\r\n", *t->e); (*t->next_e)(t));
				
				/* The token count stops growing at UDM_MAXTAGVAL; further    */
				/* attributes are all written to the slot at index nt.        */
				/* NOTE(review): this presumably relies on toks[] having one  */
				/* spare element beyond UDM_MAXTAGVAL - confirm in the header. */
				if(t->ntoks<UDM_MAXTAGVAL)
					t->ntoks++;
				
				t->toks[nt].val=0;
				t->toks[nt].vlen=0;
				t->toks[nt].name = t->b;
				t->toks[nt].nlen = t->e - t->b;

				/* The first token is the tag name: update the state flags. */
				/* Only a prefix of the name is compared.                   */
				if (nt == 0) {
				  if(!strncasecmp(t->b,"script",6)) t->script = 1;
				  if(!strncasecmp(t->b,"/script",7)) t->script = 0;
				  if(!strncasecmp(t->b, "noindex", 7)) t->comment = 1;
				  if(!strncasecmp(t->b, "/noindex", 8)) t->comment = 0;
				  if(!strncasecmp(t->b, "style", 5)) t->style = 1;
				  if(!strncasecmp(t->b, "/style", 6)) t->style = 0;
				  if(!strncasecmp(t->b, "body", 4)) t->body = 1;
				  if(!strncasecmp(t->b, "/body", 5)) t->body = 0;
				}

				if(*t->e=='>'){
					*lt = t->e + 1;
					return(t->s);
				}

				/* End of input in the middle of the tag */
				if(!(*t->e)){
					*lt = t->e;
					return(t->s);
				}
				
				/* Skip spaces */
				while((*t->e) && strchr(" \t\r\n",*t->e))(*t->next_e)(t);
				
				/* A name without a value: go on with the next name */
				if(*t->e != '='){
					t->b = t->e;
				       *lt = t->b;        /* keep *lt moving forward, so that a page with a broken tag cannot hang the parser */
					continue;
				}
				
				/* Skip spaces */
				for(t->b = t->e + 1; (*t->b) && strchr(" \r\n\t", *t->b); (*t->next_b)(t));
				
				/* Double-quoted value */
				if(*t->b == '"'){
					t->b++;
					
					valbeg = t->b;
					for(t->e = t->b; (*t->e) && (*t->e != '"'); (*t->next_e)(t));
					valend = t->e;
					
					t->b = t->e;
					if(*t->b == '"')(*t->next_b)(t);
				}else
				/* Single-quoted value */
				if(*t->b == '\''){
					t->b++;
					
					valbeg = t->b;
					for(t->e = t->b; (*t->e) && (*t->e != '\''); (*t->next_e)(t));
					valend = t->e;
					
					t->b = t->e;
					if(*t->b == '\'')(*t->next_b)(t);
				}else{
					/* Unquoted value: runs up to a space or '>' */
					valbeg = t->b;
					for(t->e = t->b; (*t->e) && !strchr(" >\t\r\n", *t->e);(*t->next_e)(t));
					valend = t->e;
					
					t->b = t->e;
				}
				*lt = t->b;
				t->toks[nt].val=valbeg;
				t->toks[nt].vlen=valend-valbeg;
			}
			break;

		case UDM_HTML_COM: /* comment */
			
			if(!strncasecmp(t->s, "<!--UdmComment-->",17))
				t->comment=1;
			else
			if(!strncasecmp(t->s, "<!--/UdmComment-->",18))
				t->comment=0;

			/* Find the end of the comment; without "-->" it runs to the end of input */
			for(t->e = t->s; (*t->e) && (strncmp(t->e, "-->", 3)); (*t->next_e)(t));
			if(!strncmp(t->e, "-->", 3)) *lt = t->e + 3;
			else	*lt = t->e;
			break;

		case UDM_HTML_TXT: /* text */
		default:
			/* Special case when script  */
			/* body is not commented:    */
			/* <script> x="<"; </script> */
			/* We should find </script>  */
			
			for(t->e = t->s; *t->e; (*t->next_e)(t)){
				if(*t->e == '<'){
					if(t->script){
						if(!strncasecmp(t->e, "</script>",9)){
							/* This is when script body  */
							/* is not hidden using <!--  */
							break;
						}else
						if(!strncmp(t->e, "<!--",4)){
							/* This is when script body  */
							/* is hidden but there are   */
							/* several spaces between    */
							/* <SCRIPT> and <!--         */
							break;
						}
					}else{
						break;
					}
				}
			}
			
			*lt = t->e;
			break;
	}
	return t->s;
}


/*
 * Duplicate the attribute value val[0..vlen) with leading/trailing blanks
 * removed by UdmTrim().  Returns a newly allocated string that the caller
 * must free, or NULL if an allocation failed.
 */
static char *UdmHTMLDupTrimmed(const char *val, size_t vlen) {
	char *raw = UdmStrndup(val, vlen);
	char *res = NULL;

	if (raw != NULL) {
		res = (char*)UdmStrdup(UdmTrim(raw, " \t\r\n"));
		UDM_FREE(raw);
	}
	return res;
}

/*
 * Process one HTML tag previously split by UdmHTMLToken().
 *
 * tag - tokenizer state; toks[0] is the tag name, the other tokens are its
 *       attributes.  The title/body/script/style/comment/follow/index flags
 *       and lasthref of the state are updated.
 * Doc - document being parsed; receives text items, hrefs and variables.
 *
 * Effects, depending on the tag:
 *   - attributes other than name, http-equiv, content, href, src and lang
 *     are added to the text list when a section "attribute.<name>" exists
 *     and Doc->Spider.index is set;
 *   - <html lang> and <body lang> set "Meta-Language";
 *   - <meta> adds its content to section "meta.<name>" and handles
 *     Content-Type (charset), Content-Language / DC.Language, refresh and
 *     robots;
 *   - <base href> sets "base.href";
 *   - any remaining href/src value (or refresh URL) is added to Doc->Hrefs
 *     unless following links is disabled, and remembered in tag->lasthref.
 *
 * Tags without tokens, without a name, or with a name longer than 127
 * characters are ignored.  Always returns 0.
 */
int UdmHTMLParseTag(UDM_HTMLTOK * tag,UDM_DOCUMENT * Doc){
	UDM_TEXTITEM Item;
	UDM_VAR	*Sec;
	int opening;
	char name[128];
	char * n;
	char *metaname=NULL;
	char *metacont=NULL;
	char *href=NULL;
	char *lang = NULL;
	char *secname = NULL;
	size_t i, seclen = 128;

#ifdef USE_PARANOIA
	void *paran = UdmViolationEnter();
#endif

	if(!tag->ntoks || !tag->toks[0].name || tag->toks[0].nlen>sizeof(name)-1) {
		goto done;
	}

	secname = (char*)UdmMalloc(seclen);
	if (secname == NULL) {
		goto done;
	}
	memcpy(name,tag->toks[0].name,tag->toks[0].nlen);
	name[tag->toks[0].nlen]='\0';
	
	/* Collect the attributes.  When an attribute is repeated the last */
	/* value wins; the earlier copy is released first.                 */
	for(i=0;i<tag->ntoks;i++){
		if(ISTAG(i,"name")){
			UDM_FREE(metaname);
			metaname = UdmStrndup(tag->toks[i].val,tag->toks[i].vlen);
		}else
		if(ISTAG(i,"http-equiv")){
			UDM_FREE(metaname);
			metaname = UdmStrndup(tag->toks[i].val,tag->toks[i].vlen);
		}else
		if(ISTAG(i,"content")){
			UDM_FREE(metacont);
			metacont = UdmStrndup(tag->toks[i].val,tag->toks[i].vlen);
		}else
		if(ISTAG(i,"href")){
			/* A, LINK, AREA*/
			UDM_FREE(href);
			href = UdmHTMLDupTrimmed(tag->toks[i].val,tag->toks[i].vlen);
		}else
		if(ISTAG(i,"src")){
			/* IMG, FRAME */
			UDM_FREE(href);
			href = UdmHTMLDupTrimmed(tag->toks[i].val,tag->toks[i].vlen);
		}else
		if (ISTAG(i, "lang")) {
			UDM_FREE(lang);
			lang = UdmHTMLDupTrimmed(tag->toks[i].val,tag->toks[i].vlen);
			if (lang != NULL) {
				/* tolower() needs a value representable as unsigned char */
				for(n = lang; *n; n++) *n = (char)tolower((unsigned char)*n);
			}
		} else {
			/* "attribute." (10) + name + '\0' always fits into nlen + 12 */
			if (tag->toks[i].nlen + 12 > seclen) {
				seclen = tag->toks[i].nlen + 12;
				secname = (char*)UdmRealloc(secname, seclen);
				if (secname == NULL) {
					goto done;
				}
			}
			
			strcpy(secname, "attribute.");
			strncat(secname + 10, tag->toks[i].name, tag->toks[i].nlen);
			secname[seclen - 1]='\0';

			if ((Sec = UdmVarListFind(&Doc->Sections, secname)) && Doc->Spider.index) {
				char *y = UdmStrndup(tag->toks[i].val,tag->toks[i].vlen);
				Item.str = y;
				Item.section = Sec->section;
				Item.section_name = secname;
				Item.href = NULL;
				UdmTextListAdd(&Doc->TextList, &Item);
				UDM_FREE(y);
			}
		}
	}
	
	for(n=name;*n;n++) *n=(char)tolower((unsigned char)*n);
	
	/* A leading '/' marks a closing tag; strip it from the name */
	if(name[0]=='/'){
		opening=0;
		memmove(name,name+1,strlen(name+1)+1);
	}else{
		opening=1;
	}

	/* Let's find tag name in order of frequency */

	if(!strcmp(name,"a")){
		UDM_FREE(tag->lasthref);			/*117941*/
	}else
	if(!strcmp(name,"title"))	tag->title=opening;	/*6192*/
	else
	if(!strcmp(name,"html") && opening && (lang != NULL)) {
		UdmVarListReplaceStr(&Doc->Sections, "Meta-Language", lang);
	}else
	if(!strcmp(name,"body")) {
		tag->body=opening;	/*5146*/
		if (opening && (lang != NULL)) {
			UdmVarListReplaceStr(&Doc->Sections, "Meta-Language", lang);
		}
	}else
	if((!strcmp(name,"meta"))&&(metaname)&&(metacont)){ 
		
		/* strncat() appends its own '\0' after the copied characters, so */
		/* at most seclen - 6 characters may follow the 5-byte prefix.    */
		strcpy(secname,"meta.");
		strncat(secname + 5 ,metaname, seclen - 6);
		secname[seclen - 1]='\0';
		
		if((!tag->comment) && (Sec=UdmVarListFind(&Doc->Sections,secname)) && Doc->Spider.index) {
			UdmSGMLUnescape(metacont);
			Item.str=metacont;
			Item.section=Sec->section;
			Item.section_name=secname;
			Item.href = NULL;
			UdmTextListAdd(&Doc->TextList,&Item);
		}
		
		if(!strcasecmp(metaname,"Content-Type")){
			char *p;
			if((p=strstr(metacont,"charset="))){
				const char *cs = UdmCharsetCanonicalName(UdmTrim(p + 8, " \t"));
				UdmVarListReplaceStr(&Doc->Sections, "Meta-Charset", cs ? cs : p + 8);
			}
		}else
		if(!strcasecmp(metaname, "Content-Language") || !strcasecmp(metaname, "DC.Language")) {
			char *l;
			l = (char*)UdmStrdup(metacont);
			if (l != NULL) {
				for(n = l; *n; n++) *n = (char)tolower((unsigned char)*n);
				UdmVarListReplaceStr(&Doc->Sections, "Meta-Language", l);
				UDM_FREE(l);
			}
		}else
		if(!strcasecmp(metaname,"refresh")){
			/* Format: "10; Url=http://something/" */
			/* URL can be written in different     */
			/* forms: URL, url, Url and so on      */
			char *p;
			
			if((p = strchr(metacont, '='))){
				/* Any href collected from the attributes is dropped here */
				UDM_FREE(href);
				if((p >= metacont + 3) && (!strncasecmp(p-3,"URL=",4))){
					href = (char*)UdmStrdup(p + 1);
				}
			}
		}else
		if(!strcasecmp(metaname,"robots")&&(Doc->Spider.use_robots)){
			/* The same delimiters are used for every token, so that */
			/* "a,b,c" is split just like "a, b, c".                 */
			const char *delim = " ,\r\n\t";
			char * lt;
			char * rtok;
					
			rtok = udm_strtok_r(metacont,delim,&lt);
			while(rtok){
				if(!strcasecmp(rtok,"ALL")){
					/* Set Server parameters */
					tag->follow=Doc->Spider.follow;
					tag->index=Doc->Spider.index;
				}else
				if(!strcasecmp(rtok,"NONE")){
					tag->follow=UDM_FOLLOW_NO;
					tag->index=0;
					Doc->Spider.follow = UDM_FOLLOW_NO;
					Doc->Spider.index = 0;
				}else
				if(!strcasecmp(rtok,"NOINDEX")) {
					tag->index=0;
					Doc->Spider.index = 0;
				}else
				if(!strcasecmp(rtok,"NOFOLLOW")) {
					tag->follow=UDM_FOLLOW_NO;
					Doc->Spider.follow = UDM_FOLLOW_NO;
				}else
				if(!strcasecmp(rtok,"NOARCHIVE")) {
					UdmVarListReplaceStr(&Doc->Sections, "Z", "");
				}else
				if(!strcasecmp(rtok,"INDEX")) {
					tag->index = Doc->Spider.index;
				}else
				if(!strcasecmp(rtok,"FOLLOW")) {
					tag->follow=Doc->Spider.follow;
				}
				rtok = udm_strtok_r(NULL,delim,&lt);
			}
		}
	}
	else	if(!strcmp(name,"script"))	tag->script=opening;
	else	if(!strcmp(name,"style"))	tag->style=opening;
	else	if(!strcmp(name,"noindex"))	tag->comment=opening;
	else	
	if((!strcmp(name,"base"))&&(href)){
		
		UdmVarListReplaceStr(&Doc->Sections,"base.href",href);
		
		/* Do not add BASE HREF itself into database.      */
		/* It will be used only to compose relative links. */
		UDM_FREE(href);
	}

	if((href)&&(tag->follow!=UDM_FOLLOW_NO)){
		UDM_HREF	Href;
		
		UdmSGMLUnescape(href);
		UdmHrefInit(&Href);
		Href.referrer = UdmVarListFindInt(&Doc->Sections, "Referrer-ID", 0);
		Href.hops=1+UdmVarListFindInt(&Doc->Sections,"Hops",0);
		Href.site_id = UdmVarListFindInt(&Doc->Sections, "Site_id", 0);
		Href.url=href;
		Href.method=UDM_METHOD_GET;
		UdmHrefListAdd(&Doc->Hrefs,&Href);
		
		/* For crosswords */
		UDM_FREE(tag->lasthref);
		tag->lasthref = (char*)UdmStrdup(href);
	}

done:
	UDM_FREE(metaname);
	UDM_FREE(metacont);
	UDM_FREE(href);
	UDM_FREE(lang);
	UDM_FREE(secname);
	
#ifdef USE_PARANOIA
	UdmViolationExit(paran);
#endif
	return 0;
}


/* Capacity of the open-tag stack used by the NOTUSED_FOR_XML code in UdmHTMLParse() */
#define MAXSTACK	1024

/* One entry of that stack: a tag name stored inside the path buffer */
typedef struct {
	size_t len;	/* length of the tag name                  */
	char * ofs;	/* start of the tag name in the path buffer */
} UDM_TAGSTACK;

/*
 * HTML parser: walk through Doc->Buf.content token by token.
 *
 * Text tokens are trimmed of surrounding blanks and added to the text list:
 * to the "body" section when inside <body>, outside script/style/noindex
 * blocks and indexing is allowed (with the href of the enclosing link, used
 * for crosswords), and to the "title" section when inside <title>.
 * Tag tokens are handed to UdmHTMLParseTag(); comments are skipped.
 *
 * A document without content yields no tokens.  Always returns UDM_OK.
 */
int UdmHTMLParse(UDM_AGENT *Indexer,UDM_DOCUMENT *Doc){
	UDM_HTMLTOK	tag;
#ifdef NOTUSED_FOR_XML
	char		stack[MAXSTACK*8]="";
	char   		*sbot=stack;
	size_t		nstack=0;
	UDM_TAGSTACK	Stack[MAXSTACK];
#endif
	UDM_TEXTITEM	Item;
	const char	*htok;
	/* Must start as NULL: the tokenizer reads *lt when the buffer is NULL */
	const char	*last = NULL;
	UDM_VAR		*BSec=UdmVarListFind(&Doc->Sections,"body");
	UDM_VAR		*TSec=UdmVarListFind(&Doc->Sections,"title");
	int		body_sec  = BSec ? BSec->section : 0;
	int		title_sec = TSec ? TSec->section : 0;
	char		scb[]="body";
	char		sct[]="title";

#ifdef USE_PARANOIA
	void *paran = UdmViolationEnter();
#endif
	
	memset(&Item, 0, sizeof(Item));
	UdmHTMLTOKInit(&tag);
	tag.follow=Doc->Spider.follow;
	tag.index=Doc->Spider.index;

	htok=UdmHTMLToken(Doc->Buf.content,&last,&tag);
	
	while(htok){
		char       *tmp=NULL;
		const char *tmpbeg;
		const char *tmpend;

		switch(tag.type){
			
			case UDM_HTML_COM:

				break;

			case UDM_HTML_TXT:

				/* Trim blanks: tmpbeg/tmpend become the first and */
				/* the last non-blank character of the token.      */
				for( tmpbeg=htok;   tmpbeg<last && strchr(" \r\n\t",tmpbeg[0]) ; tmpbeg++);
				for( tmpend=last-1; htok<tmpend && strchr(" \r\n\t",tmpend[0]) ; tmpend--);
				/* tmpbeg passes tmpend only for all-blank text; when  */
				/* they are equal the text is one character and is kept */
				if(tmpbeg>tmpend)break;
				
				tmp = UdmStrndup(tmpbeg,(size_t)(tmpend-tmpbeg+1));
				
				if (BSec && !tag.comment && tag.body && !tag.script && !tag.style && tag.index) {
					Item.href=tag.lasthref;
					Item.str=tmp;
					Item.section=body_sec;
					Item.section_name=scb;
					UdmTextListAdd(&Doc->TextList,&Item);
				}
				if (TSec && !tag.comment && tag.title && tag.index) {
					Item.href=NULL;
					Item.str=tmp;
					Item.section=title_sec;
					Item.section_name=sct;
					UdmTextListAdd(&Doc->TextList,&Item);
				}
				UDM_FREE(tmp);
				break;
		
			case UDM_HTML_TAG:
				
#ifdef NOTUSED_FOR_XML
				if(0){
					size_t i;
					char str[100];
					strncpy(str,tag.toks[0].name,tag.toks[0].nlen);
					str[tag.toks[0].nlen]='\0';
					fprintf(stderr,"T=%s\n",str);
					
					switch(tag.toks[0].name[0]){
						case '!':
							
							break;
						case '/':
							for(i=nstack;i>0;i--){
								if(Stack[i-1].len+1==tag.toks[0].nlen){
									if(!strncasecmp(Stack[i-1].ofs,tag.toks[0].name+1,Stack[i-1].len)){
										nstack=i-1;
										sbot=Stack[i-1].ofs;
										if(sbot>stack)sbot--;
										*sbot='\0';
										break;
									}
								}
							}
							break;
						default:
							if(*stack){
								strcat(sbot,".");
								sbot++;
							}
							strncpy(sbot,tag.toks[0].name,tag.toks[0].nlen);
							sbot[tag.toks[0].nlen]='\0';
							Stack[nstack].ofs=sbot;
							Stack[nstack].len=tag.toks[0].nlen;
							nstack++;
							sbot=sbot+strlen(sbot);
						
					}
					fprintf(stderr,"S=%s\n\n",stack);
				}
#endif
				UdmHTMLParseTag(&tag,Doc);
				break;
		}
		htok=UdmHTMLToken(NULL,&last,&tag);
		
	}
	UDM_FREE(tag.lasthref);	
#ifdef USE_PARANOIA
	UdmViolationExit(paran);
#endif
	return UDM_OK;
}
