/* wnfilter.c -- filter WordNet index and data files into formatted text plus a sorted index
 * Created: Sat Jul 12 20:28:49 1997 by faith@acm.org
 * Revised: Sun Jan 25 19:06:23 1998 by faith@acm.org
 * Copyright 1997, 1998 Rickard E. Faith (faith@acm.org)
 * 
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 1, or (at your option) any
 * later version.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 675 Mass Ave, Cambridge, MA 02139, USA.
 * 
 * $Id: wnfilter.c,v 1.5 1998/02/16 02:06:20 faith Exp $
 *
 * Modified header text in lines 400+ for WordNet 1.7
 * RDH 6/23/01       
 */

#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <time.h>

/* Column limit used by fmt_string(): a word that would end past this
   column is moved to a new line. */
#define FMT_MAXPOS 65
/* Indentation, in columns, that fmt_newdef() uses for a definition line. */
#define FMT_INDENT  5

/* Size of the line buffers used for index lines and data-file records. */
#define BSIZE 10240

/* Part-of-speech codes as tested by fmt_newdef() and getentry(). */
#define ADJ  'a'
#define ADV  'r'
#define NOUN 'n'
#define VERB 'v'

/* Indices into data[] for the data file of each part of speech. */
#define ADJ_FILE  0
#define ADV_FILE  1
#define NOUN_FILE 2
#define VERB_FILE 3

/* When non-zero, main() skips the version 1.6 tagsense_cnt field that
   precedes the synset offsets on each index line. */
#define WN16 1

/* Data files opened by main(), indexed by the *_FILE constants above. */
static FILE *data[4];

/* Digit alphabet for |b64_encode|; the digit for value 0 is 'A'. */
static unsigned char b64_list[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* |b64_encode| renders the low 32 bits of |val| as printable base 64
   digits, most significant digit first.  Leading zero digits ('A') are
   dropped, but at least one digit is always returned.  The result lives in
   a static buffer that the next call overwrites. */

static const char *b64_encode( unsigned long val )
{
   static char   result[7];
   unsigned long rest = val & 0xffffffffUL;
   const char    *first = result;
   int           slot;

   /* Fill the six digits from least to most significant. */
   for (slot = 5; slot >= 0; slot--) {
      result[slot] = b64_list[ rest & 0x3f ];
      rest >>= 6;
   }
   result[6] = 0;

   /* Skip leading zero digits, always keeping the last digit. */
   while (first < result + 5 && *first == b64_list[0]) ++first;
   return first;
}

/* State shared by the fmt_* output routines. */

/* Pipe opened by fmt_openindex() that receives the index lines; it stays
   NULL when no index was opened. */
static FILE *fmt_str;
/* Number of spaces fmt_newline() writes at the start of each new line. */
static int  fmt_indent;
/* Current output column as tracked by fmt_newline() and fmt_string(). */
static int  fmt_pos;
/* Non-zero when fmt_string() still owes a space before the next word. */
static int  fmt_pending;
/* Count maintained by fmt_newheadword() and reported on stderr. */
static int  fmt_hwcount;

/* |fmt_openindex| starts a "sort -df" process whose output is redirected
   to |filename| and stores the write end of the pipe in |fmt_str|.  A NULL
   |filename| leaves the index disabled.  The program exits if the command
   does not fit in the local buffer or the pipe cannot be opened.

   NOTE(review): |filename| is pasted into a shell command unquoted, so it
   must not contain shell metacharacters. */

static void fmt_openindex( const char *filename )
{
   char buffer[1024];
   int  n;

   if (!filename) return;

   /* Bounded formatting: a long filename must not overrun |buffer|. */
   n = snprintf( buffer, sizeof buffer, "sort -df > %s\n", filename );
   if (n < 0 || (size_t)n >= sizeof buffer) {
      fprintf( stderr, "Index filename too long: %s\n", filename );
      exit(1);
   }

   if (!(fmt_str = popen( buffer, "w" ))) {
      fprintf( stderr, "Cannot open %s for write\n", buffer );
      exit(1);
   }
}

/* |fmt_newline| ends the current line on stdout and begins the next one
   with |fmt_indent| spaces.  Afterwards |fmt_pos| equals the indentation
   and no inter-word space is pending. */

static void fmt_newline( void )
{
   int remaining = fmt_indent;

   putchar('\n');
   while (remaining-- > 0) putchar(' ');

   fmt_pending = 0;
   fmt_pos     = fmt_indent;
}

/* |fmt_string| writes |s| to stdout one space-separated word at a time,
   turning every '_' into a space first.  A word that would end past column
   |FMT_MAXPOS| is preceded by fmt_newline() when a space is pending.
   A separating space is not written immediately: it is remembered in
   |fmt_pending| and emitted just before the next word, so a line never
   ends with a stray blank.  The program exits if memory cannot be
   allocated for the working copy. */

static void fmt_string( const char *s )
{
   char *sdup = malloc( strlen(s) + 1 );
   char *pt;
   char *p;
   char *t;
   int  len;

   if (!sdup) {
      fprintf( stderr, "Out of memory\n" );
      exit(1);
   }

   /* Working copy with underscores replaced by spaces. */
   for (t = sdup; *s; s++) {
      if (*s == '_') *t++ = ' ';
      else *t++ = *s;
   }
   *t = '\0';

   /* Every word that is followed by a space. */
   pt = p = sdup;
   while ((pt = strchr(pt, ' '))) {
      *pt++ = '\0';
      len = (int)strlen(p);
      if (fmt_pending && fmt_pos + len > FMT_MAXPOS) {
         fmt_newline();         /* also clears fmt_pending */
      }
      if (fmt_pending) {
         putchar(' ');
         ++fmt_pos;
         fmt_pending = 0;
      }
      printf( "%s", p );
      fmt_pos += len;
      p = pt;
      fmt_pending = 1;          /* the space just consumed is now owed */
   }

   /* The final word, which has no trailing space. */
   len = (int)strlen(p);
   if (fmt_pending && fmt_pos + len > FMT_MAXPOS) {
      fmt_newline();
   }
   if (len && fmt_pending) {
      putchar(' ');
      ++fmt_pos;
      fmt_pending = 0;
   }
   if (!len) {
      /* |s| ended with a space (or was empty): keep the space owed. */
      fmt_pending = 1;
   } else {
      printf( "%s", p );
      fmt_pos += len;
   }

   free(sdup);
}

/* |fmt_newheadword| finishes the entry of the previous headword and, when
   |word| is not NULL, starts a new entry for |word|.

   Finishing an entry means ending its last line and, if an index is open,
   writing "headword <TAB> start <TAB> length" to |fmt_str|, where start and
   length are stdout offsets encoded with b64_encode().  Passing NULL only
   finishes the pending entry and forgets it, so repeated NULL calls write
   nothing further.

   The headword is remembered with '_' replaced by ' '; the program exits
   if it does not fit in the local buffer.  Every 100 headwords a progress
   line is written to stderr.

   NOTE(review): offsets come from ftell(stdout), so the index is only
   meaningful when stdout is a seekable file -- confirm how the program is
   invoked. */

static void fmt_newheadword( const char *word )
{
   static char prev[1024] = "";
   static long start = 0;
   long        end;
   const char  *s;
   char        *d;

   fmt_indent = 0;
   if (*prev) fmt_newline();
   fflush(stdout);              /* make ftell() reflect everything written */
   end = ftell(stdout);

   if (fmt_str && *prev) {
      /* Two calls: b64_encode() returns a static buffer, so its two
         results cannot be passed to one fprintf(). */
      fprintf( fmt_str, "%s\t%s\t", prev, b64_encode(start) );
      fprintf( fmt_str, "%s\n", b64_encode(end-start) );
   }

   if (!word) {
      *prev = '\0';             /* entry flushed; nothing is pending now */
      return;
   }

   if (strlen(word) >= sizeof prev) {
      fprintf( stderr, "Headword too long: %s\n", word );
      exit(1);
   }
   for (d = prev, s = word; *s; ++s, ++d) {
      if (*s == '_') *d = ' ';
      else           *d = *s;
   }
   *d = '\0';
   start = end;
   fmt_string(word);

   if (fmt_hwcount && !(fmt_hwcount % 100)) {
      fprintf( stderr, "%10d headwords\r", fmt_hwcount );
   }
   ++fmt_hwcount;
}

/* |fmt_closeindex| flushes the last headword with fmt_newheadword(NULL),
   closes the index pipe if one is open, and reports the headword count on
   stderr.  A warning is printed when pclose() reports a failure or a
   non-zero exit status of the sort command, since the index file is then
   not trustworthy. */

static void fmt_closeindex( void )
{
   fmt_newheadword(NULL);
   if (fmt_str) {
      if (pclose( fmt_str ) != 0) {
         fprintf( stderr, "Index sort command failed\n" );
      }
      fmt_str = NULL;           /* the stream must not be used again */
   }
   fprintf( stderr, "%12d headwords\n", fmt_hwcount );
}

/* |fmt_newdef| starts a new definition line, indented by |FMT_INDENT|.
   The line begins with a label made of the part-of-speech abbreviation for
   |pos| (nothing when |pos| is 0) followed by the sense number |def| and a
   colon; with |def| equal to 0 the number is omitted, and with both equal
   to 0 the label is empty.  Continuation lines are indented past the
   label.  An unknown |pos| terminates the program. */

static void fmt_newdef( int def, char pos )
{
   const char *abbrev;
   char       label[1024];

   if (pos == ADJ || pos == 's') {
      abbrev = "adj ";
   } else if (pos == ADV) {
      abbrev = "adv ";
   } else if (pos == NOUN) {
      abbrev = "n ";
   } else if (pos == VERB) {
      abbrev = "v ";
   } else if (pos == 0) {
      abbrev = "";
   } else {
      fprintf( stderr, "Illegal pos: %c\n", pos );
      exit(1);
   }

   if (def)      sprintf( label, "%s%d:", abbrev, def );
   else if (pos) sprintf( label, "%s:", abbrev );
   else          label[0] = '\0';

   fmt_indent = FMT_INDENT;
   fmt_newline();
   fputs( label, stdout );
   fmt_indent += strlen(label) + 1;
}


/* |getentry| reads one record from a data file.  |offset| is the decimal
   byte offset of the record (parsing stops at the first non-digit) and the
   first character of |file| selects the data file by part of speech.
   Trailing newline, carriage return and space characters are removed.

   The record is returned in a static buffer that the next call overwrites.
   The program exits if the part of speech is unknown or the record cannot
   be read. */

static char *getentry( const char *offset, const char *file )
{
   static char buffer[BSIZE];
   int         f;
   long        o = atol(offset);
   size_t      len;

   switch (*file) {
   case ADJ:  f = ADJ_FILE;  break;
   case ADV:  f = ADV_FILE;  break;
   case NOUN: f = NOUN_FILE; break;
   case VERB: f = VERB_FILE; break;
   default:
      fprintf( stderr, "File type %s illegal\n", file );
      exit(1);
   }

   /* Without these checks a failed read would silently hand back whatever
      the buffer held from the previous call. */
   if (fseek( data[f], o, SEEK_SET ) != 0
       || !fgets( buffer, BSIZE-1, data[f] )) {
      fprintf( stderr, "Cannot read entry at offset %ld (file type %c)\n",
               o, *file );
      exit(1);
   }

   len = strlen( buffer );
   while (len && (buffer[len-1] == '\n'
                  || buffer[len-1] == '\r'
                  || buffer[len-1] == ' ')) {
      buffer[--len] = '\0';
   }

   return buffer;
}

/* |wn_token| wraps strtok() with " " as the separator: |str| starts a new
   scan, NULL continues the current one.  The program exits with a
   diagnostic naming the missing |field| when the record for |word| ends
   early, instead of passing a NULL token on. */

static char *wn_token( char *str, const char *field, const char *word )
{
   char *tok = strtok( str, " " );

   if (!tok) {
      fprintf( stderr, "Malformed entry for %s: missing %s\n", word, field );
      exit(1);
   }
   return tok;
}

/* |wn_firstword| isolates the fifth space-separated field of |entry| (the
   first word of a data record, after synset_offset, lex_file_num, pos and
   count) by terminating it in place.  Returns NULL if the record is too
   short. */

static char *wn_firstword( char *entry )
{
   char *pt = entry;
   char *end;
   int  i;

   for (i = 0; i < 4; i++) {
      if (!(pt = strchr( pt, ' ' ))) return NULL;
      ++pt;
   }
   if (!(end = strchr( pt, ' ' ))) return NULL;
   *end = '\0';
   return pt;
}

/* |fmt_listitem| prints one "{item}" element of a bracketed list: |opener|
   precedes the first element, ", " every later one. */

static void fmt_listitem( const char *opener, int first, const char *item )
{
   fmt_string( first ? opener : ", " );
   fmt_string( "{" );
   fmt_string( item );
   fmt_string( "}" );
}

/* |processentry| formats one sense of |word| from the data record |entry|.

   |flag| is non-zero when the senses of |word| are to be numbered.  |entry|
   is tokenized in place (a private copy is tokenized a second time for the
   synonym and pointer fields).

   A record with a single word and no gloss produces no output.  Otherwise
   a new headword is started whenever |word| differs from the previous
   call, a part-of-speech label is printed whenever the part of speech
   changes, and the sense consists of the gloss (the text after '|'), a
   "[syn: ...]" list of the other words of the synset and an "[ant: ...]"
   list built from the pointers whose symbol starts with '!'.  For each
   such pointer the first word of the target record is shown.

   The program exits on a malformed record or when memory runs out. */

static void processentry( int flag, char *word, char *entry )
{
   static char prevPos  = 0;
   static char prevWord[BSIZE] = "";
   static int  def = 0;
   char        *bufcopy;
   char        *pos;
   char        *wordCap = NULL;
   char        *gloss;
   int         count;
   int         p_cnt;
   int         syns = 0;
   int         ants = 0;
   int         i;

   if (!(bufcopy = strdup( entry ))) {
      fprintf( stderr, "Out of memory\n" );
      exit(1);
   }

   /* First pass over |entry|: part of speech, word count, and the
      spelling of |word| as capitalized in the data file. */
   wn_token( entry, "synset_offset", word );
   wn_token( NULL, "lex_file_num", word );
   pos = wn_token( NULL, "pos", word );
   if (*pos == 's') *pos = ADJ; /* Adjective Satellite */
   count = strtol( wn_token( NULL, "count", word ), NULL, 16 );

   for (i = 0; i < count; i++) {
      char *candidate = wn_token( NULL, "word", word );

      wn_token( NULL, "id", word );
      if (!wordCap && !strcasecmp( candidate, word )) wordCap = candidate;
   }
   if (!wordCap) wordCap = word;

   gloss = strchr( bufcopy, '|' );

   /* Nothing to say about a lone word without a gloss. */
   if (count == 1 && !gloss) {
      free( bufcopy );
      return;
   }

   if (strcmp( word, prevWord )) {
      if (strlen( word ) >= sizeof prevWord) {
         fprintf( stderr, "Word too long: %s\n", word );
         exit(1);
      }
      def = 1;
      strcpy( prevWord, word );
      prevPos = *pos;
      fmt_newheadword( wordCap );
      fmt_newdef( (flag||def>1) ? def++ : 0, *pos );
   } else if (!prevPos || prevPos != *pos) {
      /* Same headword, new part of speech: numbering restarts. */
      prevPos = *pos;
      def = 1;
      fmt_newdef( (flag||def>1) ? def++ : 0, *pos );
   } else {
      fmt_newdef( (flag||def>1) ? def++ : 0, 0 );
   }

   /* The gloss must be printed before |bufcopy| is tokenized below. */
   if (gloss) fmt_string( gloss + 1 );

   /* Second pass, over the copy: synonyms, then pointers. */
   wn_token( bufcopy, "synset_offset", word );
   wn_token( NULL, "lex_file_num", word );
   wn_token( NULL, "pos", word );
   wn_token( NULL, "count", word );

   for (i = 0; i < count; i++) {
      char *syn = wn_token( NULL, "word", word );

      wn_token( NULL, "id", word );
      if (!strcmp( syn, word )) continue;
      fmt_listitem( " [syn: ", !syns, syn );
      ++syns;
   }
   if (syns) fmt_string( "]" );

   p_cnt = atoi( wn_token( NULL, "p_cnt", word ) );
   for (i = 0; i < p_cnt; i++) {
      char *ptr    = wn_token( NULL, "pointer symbol", word );
      char *offset = wn_token( NULL, "pointer offset", word );
      char *ptrPos = wn_token( NULL, "pointer pos", word );

      wn_token( NULL, "source/target", word );

      if (*ptr == '!') {
         /* getentry() reuses its static buffer; that is safe here because
            the strtok() scan is running over |bufcopy|. */
         char *ant = wn_firstword( getentry( offset, ptrPos ) );

         if (!ant) {
            fprintf( stderr, "Malformed antonym entry for %s\n", word );
            exit(1);
         }
         fmt_listitem( " [ant: ", !ants, ant );
         ++ants;
      }
   }
   if (ants) fmt_string( "]" );

   free( bufcopy );
}

/* |index_fields| splits one index line in place.  The word at the start of
   |line| and the part-of-speech field after it are NUL-terminated, |*pos|
   is set to the part-of-speech field and |*sns_cnt| to the sense count.
   The pointer-count field and the pointer types it announces are skipped.

   Returns a pointer into the field that precedes the first synset offset
   (each offset is found by searching for the next space from there), or
   NULL if the line has too few fields. */

static char *index_fields( char *line, char **pos, int *sns_cnt )
{
   char *field;
   char *pt;
   int  p_cnt;
   int  i;

   if (!(field = strchr( line, ' ' ))) return NULL;   /* word */
   *field++ = '\0';
   *pos = field;
   if (!(pt = strchr( field, ' ' ))) return NULL;     /* pos */
   *pt++ = '\0';
   if (!(pt = strchr( pt, ' ' ))) return NULL;        /* poly_cnt */
   p_cnt = atoi( ++pt );
   for (i = 0; i <= p_cnt; i++) {                     /* p_cnt, ptr_types */
      if (!(pt = strchr( pt+1, ' ' ))) return NULL;
   }
   *sns_cnt = atoi( ++pt );
#if WN16
   /* skip version 1.6 tagsense_cnt */
   if (!(pt = strchr( pt, ' ' ))) return NULL;
   ++pt;
#endif
   return pt;
}

/* |main| opens the four data files and the index pipe, writes the
   00-database-* header entries (including the notice found at the top of
   the adjective data file), then reads index lines from stdin and formats
   every sense of every word with processentry().  Lines that start with a
   space are ignored; lines with too few fields are reported on stderr and
   skipped.  The command-line arguments are not used. */

int main( int argc, char **argv )
{
   const char *filenames[4] = { "data/data.adj",
                                "data/data.adv",
                                "data/data.noun",
                                "data/data.verb" };
   int        i;
   char       *entry;
   char       buffer[BSIZE];
   char       *pt;
   char       *word = buffer;
   char       *pos;
   time_t     t;
   size_t     len;

   (void)argc;
   (void)argv;

   for (i = 0; i < 4; i++) {
      if (!(data[i] = fopen( filenames[i], "r" ))) {
         fprintf( stderr, "Cannot open %s for read\n", filenames[i] );
         exit(1);
      }
   }

   fmt_openindex( "wn.index" );

   fmt_newheadword("00-database-url");
   fmt_newdef(0,0);
   fmt_string( "http://www.cogsci.princeton.edu/~wn/" );

   fmt_newheadword("00-database-short");
   fmt_newdef(0,0);
   fmt_string( "WordNet (r) 1.7.1 (July 2002)" );

   fmt_newheadword("00-database-long");
   fmt_newdef(0,0);
   fmt_string( "WordNet (r): A Lexical Database for English from"
              " the Cognitive Science Laboratory at Princeton University" );

   fmt_newheadword("00-database-info");
   fmt_newdef(0,0);
   fmt_string("This file was converted from the original database on:" );
   fmt_newline();
   time(&t);
   sprintf( buffer, "          %25.25s", ctime(&t) );
   fmt_string( buffer );
   fmt_newline();
   fmt_newline();
   fmt_string( "The original data is available from:" );
   fmt_newline();
   fmt_string( "   http://www.cogsci.princeton.edu/~wn/" );
   fmt_newline();
   fmt_newline();
   fmt_string(
      "The original data was distributed with the notice shown below."
      "  No additional restrictions are claimed.  Please redistribute"
      " this changed version under the same conditions and restriction"
      " that apply to the original version." );
   fmt_newline();
   fmt_indent += 3;
   fmt_newline();

   /* Copy the notice: the leading lines of the first data file that start
      with a space.  Text after the second-column field is appended to the
      running paragraph; a line without such text ends the paragraph. */
   rewind(data[0]);
   while (fgets(buffer,BSIZE-1,data[0])) {
      if (*buffer != ' ') break;
      len = strlen(buffer);
      /* Length-based trim: never steps in front of the buffer, even for a
         line made only of blanks. */
      while (len && (buffer[len-1] == '\n'
                     || buffer[len-1] == '\r'
                     || buffer[len-1] == ' ')) {
         buffer[--len] = '\0';
      }
      if (len >= 2 && (pt = strchr( buffer + 2, ' '))) {
         fmt_string(pt+1);
         fmt_string(" ");
      } else {
         fmt_newline();
         fmt_newline();
      }
   }
   fmt_indent = 0;
   fmt_newline();

   while (fgets(buffer,BSIZE-1,stdin)) {
      int sns_cnt;

      if (*buffer == ' ') continue;

      pt = index_fields( buffer, &pos, &sns_cnt );
      if (!pt) {
         fprintf( stderr, "Skipping malformed index line: %s\n", buffer );
         continue;
      }

      for (i = 0; i < sns_cnt; i++) {
         pt = strchr( pt+1, ' ' );      /* synset_offset */
         if (!pt) {
            fprintf( stderr, "Missing synset offset %d of %d for %s\n",
                     i + 1, sns_cnt, word );
            break;
         }
         entry = getentry( pt + 1, pos );
         /* Senses are numbered only when the word has more than one. */
         processentry( sns_cnt > 1, word, entry );
      }
   }

   /* fmt_closeindex() flushes the final headword itself; flushing it here
      as well would write its index entry twice. */
   fmt_closeindex();
   return 0;
}
