#! /usr/bin/python
#
# Copyright (C) 2005 Laurent Pelecq <laurent.pelecq@soleil.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
# 02111-1307, USA.
# $Id: html2rest 29470 2005-11-11 14:28:16Z madarche $

import sys, os
import optparse

import re
import HTMLParser
import StringIO

### Global

# Program version; interpolated into the text printed by the --version option.
version = '1.0'

### Functions

def warn(msg):
    """Write msg to standard error exactly as given (no newline is added)."""
    stream = sys.stderr
    stream.write(msg)

def die(msg, status=1):
    """Report msg through warn(), then terminate with exit code status."""
    warn(msg)
    # sys.exit() is a thin wrapper that raises SystemExit with its argument.
    raise SystemExit(status)

def noop(msg):
    """Discard msg; used in place of warn() when an output channel is off."""
    return None

### Parsing command line

usage = ("%prog [options] base_url source_dir dest_dir\n"
         "\n"
         "This command converts HTML pages generated by Wiki engines to\n"
         "restructured text.\n"
         "\n"
         "Currently only Zwiki is supported with option --zwiki.\n")

option_parser = optparse.OptionParser(usage=usage,
                                      version="%%prog %s" % version)

# Every option is a boolean switch that is off by default, so the options
# are described as data: (flags, destination attribute, help text).  The
# order below is the order in which they are registered with the parser.
_switches = [
    (("--debug",), "debug",
     "Debug mode. Print stack trace on error (for developers only)"),
    (("--trace",), "trace",
     "Trace processing (for developers only)"),
    (("-v", "--verbose"), "verbose",
     "print status messages to stdout"),
    (("-n", "--no-act"), "no_act",
     "print what would be done without executing anything"),
    (("--zwiki",), "zwiki",
     "parse page generated by ZWiki"),
]

for _flags, _dest, _help in _switches:
    option_parser.add_option(dest=_dest, action="store_true", default=False,
                             help=_help, *_flags)

# Do not leave the loop temporaries behind in the module namespace.
del _switches, _flags, _dest, _help

### Main

# Wiki-specific settings shared by the stylers, the parser and convert_page.
# Stays None unless --zwiki is given, in which case the main section below
# rebinds it to a ZwikiContext instance.
context = None

class ZwikiContext:

    """Zwiki specific settings.

    Holds the HTML comments that delimit the page body, the regular
    expression recognising bare wiki names, and the helpers used to
    rewrite wiki links and page file names.
    """

    # HTML comments bracketing the part of the page that is converted.
    page_start_comment = ' end of header '

    page_end_comment = ' start of footer '

    # Character ranges used to build the wiki-name patterns: upper case and
    # lower case letters, each including a range of high (accented) characters.
    U = 'A-Z\xc0-\xdf'
    L = 'a-z\xe0-\xff'
    # A wiki name must not be preceded by a letter or a digit.
    b = '(?<![%s0-9])' % (U+L)
    # The patterns only use explicit character classes, so a locale flag has
    # no effect on them; the former inline "(?L)" was dropped because newer
    # versions of the re module reject a global flag in mid-pattern.
    wikiname1 = r'%s[%s]+[%s]+[%s][%s]*[0-9]*' % (b,U,L,U,U+L)
    wikiname2 = r'%s[%s][%s]+[%s][%s]*[0-9]*'  % (b,U,U,L,U+L)
    wikilink  = r'(?:%s|%s)' % (wikiname1,wikiname2)
    wikilink_re = re.compile(wikilink)

    #wikilink_re = re.compile(r'(?:[A-Z][a-z]+){2,}[0-9]*')

    # An underscore followed by two lower-case hexadecimal digits.  Only hex
    # digits are accepted so that names such as "my_page" are left alone
    # instead of making int(..., 16) fail in convert_page_name().
    char_re = re.compile(r'_[0-9a-f]{2}')

    def __init__(self, base_url, pages):
        """Remember the wiki base URL and the collection of known page names."""
        self.base_url = base_url
        self.pages = pages

    def match(self, href, text):
        """Return the text to insert if it is a wiki link.

        It returns None if it's not a wiki link and an empty string
        if this link must be discarded."""
        if text == '?':
            return '' # Non existent page, text before link is sufficient
        url_base, url_name = os.path.split(href)
        if url_base == self.base_url:
            if self.wikilink_re.match(text):
                return text
            elif url_name in self.pages:
                return '[%s]'%(text) # Preserve bracketed Wiki names
            return text
        return None

    def convert_page_name(self, page_name):
        """Return page_name with each "_XX" hex escape replaced by chr(0xXX).

        Text that does not match char_re is copied unchanged.
        """
        def decode(m):
            return chr(int(m.group(0)[1:], 16))
        return self.char_re.sub(decode, page_name)


class Formatter:

    """Base block formatter: collects output lines and returns them as is.

    Subclasses override parse() and/or finish() to decorate the lines
    produced by the children of a block.
    """

    class Error(Exception):
        """Raised by formatters on unsupported or malformed structure."""

    def __init__(self, parent=None):
        self.lines = []
        self.parent = parent

    def add_lines(self, lines):
        """Append every item of lines to the accumulated output."""
        self.lines.extend(lines)

    # By default, parsing a child's lines just stores them.
    parse = add_lines

    def finish(self):
        """Return the accumulated output lines."""
        return self.lines

    def add_prefix(self, lines, first_line_prefix, line_prefix=None):
        """Return a new list with a prefix prepended to each line.

        The first line gets first_line_prefix; the remaining lines get
        line_prefix, which falls back to first_line_prefix when it is
        missing or empty.
        """
        if not line_prefix:
            line_prefix = first_line_prefix
        prefixed = []
        for position, text in enumerate(lines):
            if position == 0:
                prefixed.append(first_line_prefix + text)
            else:
                prefixed.append(line_prefix + text)
        return prefixed

    def get_child_width(self, total_width):
        """Return the width available to children (unchanged by default)."""
        return total_width

    @classmethod
    def factory(cls, parent, tag, attrs):
        """Build a formatter for tag; the base version ignores tag and attrs."""
        return cls(parent)


class ListFormatter(Formatter):

    """Formatter for a list container (built for 'ul' and 'ol' tags).

    It keeps the numbering flag, the nesting level and the running item
    count that its ListItemFormatter children read and update.
    """

    def __init__(self, parent, is_enum):
        Formatter.__init__(self, parent)
        self.is_enum = is_enum
        # Number of items created so far; incremented by each list item.
        self.count = 0
        # A list directly inside another list is one level deeper.
        if isinstance(parent, self.__class__):
            self.level = parent.level + 1
        else:
            self.level = 0

    @classmethod
    def factory(cls, parent, tag, attrs):
        """Build a list formatter; the list is enumerated when tag is 'ol'."""
        return cls(parent, tag == 'ol')


class ListItemFormatter(Formatter):

    """Formatter for one list item; its parent must be a ListFormatter."""

    # Bullet characters, cycled through according to the list nesting level.
    bullets = [ '*', '-', '+' ]

    def __init__(self, parent=None):
        """Register the item with its list.

        Raises Formatter.Error if parent is not a ListFormatter.
        """
        Formatter.__init__(self, parent)
        if not isinstance(parent, ListFormatter):
            raise self.Error('syntax error: list item outside list.')
        # Zero-based position of this item within its list.
        self.index = self.parent.count
        self.parent.count += 1

    def get_prefixes(self):
        """Return (first_line_prefix, continuation_prefix) for this item.

        The continuation prefix is blank and as wide as the first one so
        that wrapped lines stay aligned with the item text.
        """
        if self.parent.is_enum:
            # The space after the dot is required: "1.text" is not
            # recognised as an enumerated list item by reStructuredText.
            first_line_prefix = '%d. '%(self.index + 1)
        else:
            bullet = self.bullets[self.parent.level % len(self.bullets)]
            first_line_prefix = '%s '%(bullet)
        line_prefix = ' ' * len(first_line_prefix)
        return first_line_prefix, line_prefix

    def get_child_width(self, total_width):
        """Return the width left for the item text once the prefix is removed."""
        first_line_prefix, line_prefix = self.get_prefixes()
        return total_width - len(line_prefix)

    def parse(self, lines):
        """Store lines with the item marker on the first line."""
        first_line_prefix, line_prefix = self.get_prefixes()
        self.add_lines(self.add_prefix(lines, first_line_prefix, line_prefix))


class HorizontalRuleFormatter(Formatter):

    """Formatter for 'hr': emits one transition line.

    The rule is produced in finish() rather than in parse(), because
    parse() is only called once per child and a self-closing <hr/> has no
    children, so it used to produce no output at all.
    """

    def parse(self, lines):
        """Keep any content that ended up nested under the rule.

        This happens when the tag is never closed; the content is kept
        rather than being replaced by additional rules.
        """
        self.add_lines(lines)

    def finish(self):
        """Return the rule, a blank line, then any captured content."""
        return [ '----', '' ] + self.lines


class LineBreakFormatter(Formatter):

    """Formatter provided by 'br' tags: marks every line with '| '."""

    def parse(self, lines):
        """Store lines, each one prefixed with '| '."""
        marked = [ '| ' + text for text in lines ]
        self.add_lines(marked)


class QuoteFormatter(Formatter):

    """Formatter for 'blockquote': marks every line with '%  '."""

    def parse(self, lines):
        """Store lines, each one prefixed with '%  '."""
        quoted = [ '%  ' + text for text in lines ]
        self.add_lines(quoted)


class VerbatimFormatter(Formatter):

    """Formatter for 'pre': indents every line of the literal block."""

    # Indentation placed in front of each verbatim line.
    indent = '  '

    def get_child_width(self, total_width):
        """Return the width left for the content once it is indented."""
        margin = len(self.indent)
        return total_width - margin

    def parse(self, lines):
        """Store lines, each one shifted right by the indentation."""
        margin = self.indent
        self.add_lines([ margin + text for text in lines ])


class TitleFormatter(Formatter):

    """Formatter for 'h1'..'h6': renders an underlined section title."""

    # Underline character for each heading level, indexed by level - 1.
    delimiters = [ '=', '=', '-', '~', ':', '%', '@' ]

    def __init__(self, parent, level):
        Formatter.__init__(self, parent)
        self.title = ''
        self.level = level

    def parse(self, lines):
        """Append the lines, joined with spaces and stripped, to the title."""
        fragment = ' '.join(lines)
        self.title = self.title + fragment.strip()

    def finish(self):
        """Return the title, its underline and a blank line.

        Level 1 titles also get an overline identical to the underline.
        """
        heading = self.title
        rule = self.delimiters[self.level - 1] * len(heading)
        if self.level == 1:
            return [ rule, heading, rule, '' ]
        return [ heading, rule, '' ]

    @classmethod
    def factory(cls, parent, tag, attrs):
        """Build a title formatter; the level is the number after the 'h'."""
        level = int(tag[1:])
        return cls(parent, level)


class TableFormatter(Formatter):

    """Formatter for 'table': renders the collected rows as a grid table.

    Rows are registered through add_row() (called by TableRowFormatter);
    the text passed to parse() is ignored and the whole table is drawn by
    finish().
    """

    class Cell:

        """Text of one table cell plus the logic to draw its borders."""

        vertical_bar = '|'

        header_horizontal_bar = '='

        horizontal_bar = '-'

        crossing = '+'

        def __init__(self, lines=None, is_header=False):
            """Store a private copy of lines (None means an empty cell).

            A copy is taken so that neither the caller's list nor a shared
            default value can be modified through this cell.
            """
            if lines is None:
                lines = []
            self.lines = list(lines)
            self.is_header = is_header
            self.width = 0
            if self.lines:
                self.width = max(( len(x) for x in self.lines ))
            self.height = len(self.lines)

        def format(self, row, col, width, height):
            """Return the cell drawn as a list of text lines.

            row and col are the zero-based position of the cell; width and
            height are the dimensions of its column and of its row.  The
            top border is only drawn on the first row and the left border
            only on the first column, since the other cells reuse the
            borders of their neighbours.
            """
            crossing = self.crossing
            vbar = self.vertical_bar
            left_crossing = ''
            left_vbar = ''
            if col == 0:
                left_crossing = crossing
                left_vbar = vbar
            hbar = self.horizontal_bar
            if self.is_header:
                hbar = self.header_horizontal_bar

            # Pad to the row height on a copy; multiplying by a negative
            # count yields an empty list, so taller cells are left alone.
            lines = self.lines + [ '' ] * (height - len(self.lines))

            result = []
            if row == 0:
                top_hbar = self.horizontal_bar
                result.append('%s%s%s'%(left_crossing, top_hbar * width, crossing))
            result += [ '%s%-*s%s'%(left_vbar, width, x, vbar) for x in lines ]
            result.append('%s%s%s'%(left_crossing, hbar * width, crossing))
            return result

    class Col:

        """Width of one table column: the widest cell found at that index."""

        def __init__(self, rows, index):
            self.width = 0
            for row in rows:
                if index < len(row):
                    self.width = max(self.width, row[index].width)

    class Row:

        """A sequence of cells together with the height of the tallest one."""

        def __init__(self, cells):
            self.cells = cells
            self.height = 0
            if cells:
                self.height = max(( x.height for x in cells ))

        def __len__(self):
            return len(self.cells)

        def __getitem__(self, i):
            return self.cells[i]


    def __init__(self, parent):
        Formatter.__init__(self, parent)
        # Incremented by each TableRowFormatter created under this table.
        self.nlines = 0
        self.rows = []

    def add_row(self, cells):
        """Append a row made of the given list of Cell objects."""
        self.rows.append(self.Row(cells))

    def merge_cell(self, row_lines, cell_lines):
        """Return row_lines with cell_lines appended on the right.

        row_lines is None for the first cell of a row.
        """
        if row_lines is None:
            result = cell_lines
        else:
            result = []
            for left, right in zip(row_lines, cell_lines):
                result.append(left + right)
        return result

    def parse(self, lines):
        """Ignore direct content; the rows arrive through add_row()."""
        pass

    def finish(self):
        """Return the lines of the table followed by a blank line.

        Rows shorter than the widest row are completed with empty cells.
        A table without any cell produces no output at all.
        """
        rows = self.rows
        ncols = 0
        for row in rows:
            ncols = max(ncols, len(row))
        if ncols == 0:
            # No rows, or only empty rows: nothing can be drawn.
            return []
        cols = [ self.Col(rows, i) for i in range(ncols) ]

        result = []
        for r, row in enumerate(rows):
            row_lines = None
            for c in range(ncols):
                if c < len(row):
                    cell = row[c]
                else:
                    cell = self.Cell() # Filler for a missing trailing cell
                cell_lines = cell.format(r, c, cols[c].width, row.height)
                row_lines = self.merge_cell(row_lines, cell_lines)
            result += row_lines
        return result + [ '' ]


class TableRowFormatter(Formatter):

    """Formatter for 'tr': gathers the cells of one row for its table."""

    def __init__(self, parent):
        Formatter.__init__(self, parent)
        # Incremented by each TableCellFormatter created under this row.
        self.ncols = 0
        parent.nlines += 1
        self.cells = []

    def get_child_width(self, total_width):
        """Return the text width available to each cell of the row.

        One character is reserved for the left border and one per cell
        for its right border.  A row without cells has nothing to share
        the width between, so total_width is returned unchanged instead of
        dividing by zero.
        """
        if not self.ncols:
            return total_width
        # Floor division keeps the result an integer number of characters.
        return (total_width - 1) // self.ncols - 1

    def add_cell(self, lines, is_header):
        """Append a cell holding lines to the row."""
        self.cells.append(TableFormatter.Cell(lines, is_header))

    def parse(self, lines):
        """Ignore direct content; the cells arrive through add_cell()."""
        pass

    def finish(self):
        """Hand the collected cells to the table; the row emits no lines."""
        self.parent.add_row(self.cells)
        return []


class TableCellFormatter(Formatter):

    """Formatter for 'td' and 'th': delivers one cell to the parent row.

    The content of all children is accumulated and the cell is registered
    once, in finish().  Registering from parse() would create one cell per
    child and none for an empty cell, shifting the columns of the row.
    """

    def __init__(self, parent, is_header):
        Formatter.__init__(self, parent)
        self.is_header = is_header
        parent.ncols += 1

    def parse(self, lines):
        """Accumulate the lines of one child of the cell."""
        self.add_lines(lines)

    def finish(self):
        """Register the cell, possibly empty, with the row; emit no lines."""
        self.parent.add_cell(self.lines, self.is_header)
        return []

    @classmethod
    def factory(self, parent, tag, attrs):
        """Build a cell formatter; the cell is a header when tag is 'th'.

        Raises Formatter.Error if a rowspan or colspan attribute has a
        value other than 1, since spanning cells are not supported.
        """
        for aname, aval in attrs:
            if (aname == 'rowspan' or aname == 'colspan') and int(aval) != 1:
                raise self.Error("%s: attribute not supported on %s"%(aname, tag))
        return self(parent, tag == 'th')


class Styler:

    """Accumulates inline text and supports nested inline styles.

    A styler has at most one active child; push() and write() are
    forwarded to the child while there is one, so nested styles form a
    chain whose innermost element receives the text.
    """

    class Error(Exception):
        """Exception type reserved for styling errors."""

    class Link:
        """Plain record pairing a URL with a name."""
        def __init__(self, url, name):
            self.url = url
            self.name = name

    def __init__(self):
        # Link/image definition lines to be emitted after the paragraph.
        self.links = []
        self.child = None
        self.data = StringIO.StringIO()

    def write(self, text):
        """Send text to the active child if any, else to the own buffer."""
        if self.child:
            self.child.write(text)
        else:
            self.data.write(text)

    def push(self, styler):
        """Make styler the innermost active style."""
        if not self.child:
            self.child = styler
            return
        self.child.push(styler)

    def pop(self):
        """Close the innermost active style.

        Without a child, return the styled text of this styler.  With a
        child, the request is forwarded; when the child itself was the
        innermost style, its text and links are absorbed and it is
        detached.  In that case nothing (None) is returned.
        """
        if not self.child:
            return self.getvalue()
        text = self.child.pop()
        if text is not None:
            self.data.write(text)
            self.links.extend(self.child.links)
            self.child = None

    def getvalue(self):
        """Return the text accumulated in the own buffer."""
        return self.data.getvalue()

    @classmethod
    def factory(cls, tag, attrs):
        """Build a styler; the base version ignores tag and attrs."""
        return cls()


class CharStyler(Styler):

    """Styler for character markup: wraps the text between two markers."""

    # Marker placed on both sides of the text, for each supported tag.
    markers = {
        'strong': '**', 'b': '**',
        'em': '*', 'i': '*',
        'code': '`'
    }

    def __init__(self, marker):
        Styler.__init__(self)
        self.marker = marker

    def getvalue(self):
        """Return the accumulated text enclosed in the marker."""
        inner = Styler.getvalue(self)
        fence = self.marker
        return '%s%s%s'%(fence, inner, fence)

    @classmethod
    def factory(cls, tag, attrs):
        """Build the styler using the marker registered for tag."""
        marker = cls.markers[tag]
        return cls(marker)


class LinkStyler(Styler):

    """Styler for 'a': turns an anchor into a named hyperlink reference."""

    # Text made only of word characters needs no backquotes in a reference.
    word_re = re.compile(r'(?:\w|\d)+$')

    def __init__(self, href):
        Styler.__init__(self)
        self.href = href

    def getvalue(self):
        """Return the text that replaces the anchor.

        The plain text is returned when there is no target or when the
        text is the target itself.  When a wiki context is active and
        recognises the link, the text it supplies is used.  Otherwise a
        target definition is queued in self.links and a reference to it
        is returned.
        """
        text = Styler.getvalue(self)
        href = self.href
        if not href or text == href:
            return text
        if context:
            replacement = context.match(href, text)
            if replacement is not None:
                return replacement
        self.links.append('.. _%s: %s'%(text, href))
        if self.word_re.match(text):
            return '%s_'%(text)
        return '`%s`_'%(text)

    @classmethod
    def factory(cls, tag, attrs):
        """Build the styler from the first 'href' attribute, if any."""
        for aname, aval in attrs:
            if aname == 'href':
                return cls(aval)
        return cls(None)


class ImageStyler(Styler):

    """Styler for 'img': emits a substitution reference for the image."""

    class Counter:
        """Counter yielding 1, 2, 3, ... on successive calls to next()."""
        def __init__(self):
            self.n = 0
        def next(self):
            self.n += 1
            return self.n

    # Shared by all images; numbers the names made up for images without
    # alternative text.
    index = Counter()

    def __init__(self, src, alt=None):
        """Remember the image source and its substitution name.

        When alt is missing or empty, a name of the form 'imgN' is used.
        """
        Styler.__init__(self)
        self.src = src
        self.alt = alt or 'img%d'%(self.index.next())

    def getvalue(self):
        """Queue the image definition and return the '|name|' reference."""
        text = '|%s|'%(self.alt)
        self.links.append('.. %s image:: %s'%(text, self.src))
        return text

    @classmethod
    def factory(self, tag, attrs):
        """Build the styler from the 'src' and 'alt' attributes.

        If an attribute is repeated, its last value is used.
        """
        alt = None
        src = None
        for aname, aval in attrs:
            if aname == 'src':
                src = aval
            elif aname == 'alt':
                alt = aval
        # The alternative text must be forwarded, otherwise it is parsed
        # for nothing and every image gets a generated name.
        return self(src, alt)


class Cdata(Styler):

    """Running text of a paragraph, wrapped to a width on output."""

    # A wrapping unit: either a backquoted phrase immediately followed by
    # non-blank characters (such as `some text`_), or a run of non-blank
    # characters.  The backquoted form must be tried first: a plain run
    # also matches at a backquote and would otherwise always win, cutting
    # the phrase at its first blank.
    word_re = re.compile(r'(?:`[^`]*`\S+|\S+)')

    def __init__(self):
        Styler.__init__(self)

    def wrap(self, text, wrap_length):
        """Wrap text but preserve wiki links.

        Return the list of lines.  Words are joined with single spaces;
        a new line is started when adding the next word, including its
        separating space, would exceed wrap_length.  A word longer than
        wrap_length never triggers a line break.  At least one line is
        always returned, possibly empty.
        """
        result = []
        words = []      # Words of the line being built
        line_length = 0 # Length of those words once joined
        for w in self.word_re.findall(text):
            wl = len(w)
            needed = line_length + wl
            if words:
                needed += 1 # Separating space
            if words and needed > wrap_length and wl <= wrap_length:
                result.append(' '.join(words))
                words = []
                needed = wl
            words.append(w)
            line_length = needed
        result.append(' '.join(words))
        return result

    def get_content(self, width):
        """Return the wrapped text as lines, followed by a blank line.

        Queued link definitions are inserted after the text, separated
        from it by a blank line.  Blank text yields an empty list.
        """
        text = self.getvalue().strip()
        if not text:
            return []
        lines = self.wrap(text, width)
        links = self.links
        if links:
            lines.append('')
            lines += links
        lines.append('')
        return lines

    def __str__(self):
        """Return the repr of the stripped text (used in debug dumps)."""
        return repr(self.getvalue().strip())


class RawData(StringIO.StringIO):

    """String buffer holding verbatim text that must not be re-wrapped."""

    def get_content(self, width):
        """Return the buffered text split at each newline.

        width is accepted to match the get_content() signature of the
        other paragraph children; it is not used.
        """
        raw_text = self.getvalue()
        return raw_text.split('\n')


class Para:

    """Node of the document tree: a block with an optional formatter.

    The children are either nested Para objects or data holders (Cdata
    or RawData); self.data is the holder currently receiving text, if
    any.
    """

    class Error(Exception):
        """Raised when two different formatters are set on one block."""

    def __init__(self, formatter=None):
        self.formatter = formatter
        self.children = []
        self.data = None

    def get_data(self, data_class):
        """Return the current data holder, creating one when needed.

        A new data_class instance is created and appended to the
        children when there is no current holder or when it is not an
        instance of data_class.
        """
        current = self.data
        if current and isinstance(current, data_class):
            return current
        fresh = data_class()
        self.data = fresh
        self.children.append(fresh)
        return fresh

    def provide_formatter(self, formatter):
        """Set formatter if not already set."""
        if self.formatter:
            return
        self.formatter = formatter

    def set_formatter(self, formatter):
        """Set the formatter of the block.

        Nothing happens when the current formatter has the same class.
        Raises Para.Error when a formatter of another class is already
        set.
        """
        existing = self.formatter
        if existing.__class__ == formatter.__class__:
            return
        if existing:
            raise self.Error('duplicate formatter: %s != %s'
                             %(existing.__class__.__name__,
                               formatter.__class__.__name__))
        self.formatter = formatter

    def add_child(self, para):
        """Append a nested block; later text goes to a new data holder."""
        self.data = None
        self.children.append(para)

    def add_raw_data(self, text):
        """Append verbatim text to the current RawData holder."""
        trace('Add raw data: <%s>\n'%(text))
        holder = self.get_data(RawData)
        holder.write(text)

    def add_cdata(self, text):
        """Append running text to the current Cdata holder."""
        trace('Add cdata: <%s>\n'%(text))
        holder = self.get_data(Cdata)
        holder.write(text)

    def push_styler(self, styler):
        """Open an inline style on the current Cdata holder."""
        holder = self.get_data(Cdata)
        holder.push(styler)

    def pop_styler(self):
        """Close the innermost inline style of the current holder, if any."""
        if not self.data: # May have extra closing tag in malformed HTML
            return
        self.data.pop()

    def get_content(self, width):
        """Return the lines of the block rendered for the given width.

        Each child is rendered for the width granted by the formatter
        and handed to its parse(); the result is what finish() returns,
        copied into a new list.
        """
        formatter = self.formatter or Formatter()
        child_width = formatter.get_child_width(width)
        for child in self.children:
            formatter.parse(child.get_content(child_width))
        return [] + formatter.finish()

    def __str__(self):
        """Return a debug dump: the formatter class, then one line per child."""
        formatter = self.formatter or Formatter()
        parts = [ 'Formatter: %s\n'%(formatter.__class__) ]
        for child in self.children:
            parts.append('  %s\n'%(child))
        return ''.join(parts)


class Html2ReStructuredTextParser(HTMLParser.HTMLParser):

    """HTML parser building a tree of Para objects and rendering it.

    Block tags open a nested Para with the matching formatter, inline
    tags open a styler on the current Para and modifier tags provide a
    formatter to the current Para.  Conversion only takes place inside
    the page body, which is delimited either by the <body> element or,
    when a wiki context is active, by two HTML comments.
    """

    # Block tags and the formatter class used for them (None: no formatter).
    block_tag = {
        'p': None,
        'ul': ListFormatter,
        'ol': ListFormatter,
        'li': ListItemFormatter,
        'pre': VerbatimFormatter,
        'hr': HorizontalRuleFormatter,
        'table': TableFormatter,
        'blockquote': QuoteFormatter,
        'tr': TableRowFormatter,
        'td': TableCellFormatter,
        'th': TableCellFormatter,
        'h1': TitleFormatter,
        'h2': TitleFormatter,
        'h3': TitleFormatter,
        'h4': TitleFormatter,
        'h5': TitleFormatter,
        'h6': TitleFormatter
        }

    # Tags that provide a formatter to the enclosing block.
    modifier_tag = {
        'br': LineBreakFormatter
        }

    # Inline tags and the styler class used for them.
    inline_tag = {
        'a': LinkStyler,
        'strong': CharStyler,
        'b': CharStyler,
        'em': CharStyler,
        'i': CharStyler,
        'code': CharStyler,
        'img': ImageStyler
    }

    # Tags whose direct text content is dropped (see add_text).
    pure_tag = {
        'table': True, 'tr': True, 'th': True, 'td': True
    }

    # Replacement text for the named entities that are understood.
    entities = { 'amp': '&', 'lt': '<', 'gt': '>', 'nbsp': ' ' }

    # Plain replacements for some numeric character references.
    charref = {
        8211: '-',
        8217: "'",
        8220: '"',
        8221: '"',
        8482: "TM"
    }

    page_width = 78

    class Error(Exception):
        """Raised on input that cannot be converted."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.in_body = False
        self.in_verbatim = False
        self.para = Para()
        # Stack of open block tags; self.current is the matching stack of
        # Para objects, with the root Para at the bottom.
        self.tags = []
        self.current = [ self.para ]
        self.start_comment = self.end_comment = None
        if context:
            self.start_comment = context.page_start_comment
            self.end_comment = context.page_end_comment

    def open_tag(self, tag, attrs, single=False):
        """Return the source text of an opening (or self-closing) tag."""
        s = '<%s'%(tag)
        for aname, aval in attrs:
            s += ' %s="%s"'%(aname, aval)
        if single:
            s += '/'
        s += '>'
        return s

    def close_tag(self, tag, single=False):
        """Return the source text of a closing tag ('' if self-closing)."""
        if single:
            return ''
        return '</%s>'%(tag)

    def push_para(self, tag, formatter=None):
        """Open a nested block for tag under the current block."""
        para = Para(formatter)
        self.current[-1].add_child(para)
        self.current.append(para)
        self.tags.append(tag)

    def pop_para(self, tag=None):
        """Close the current block.

        With a tag, the block is only closed when tag is the innermost
        open block tag; a closing tag that matches nothing, including one
        met while no block is open at all, is ignored.  Without a tag,
        the current block is closed unconditionally and the tag stack is
        left alone.
        """
        if not tag or (self.tags and tag == self.tags[-1]):
            self.current.pop()
            if tag:
                self.tags.pop()

    def push_styler(self, styler):
        self.current[-1].push_styler(styler)

    def pop_styler(self):
        self.current[-1].pop_styler()

    def add_text(self, text):
        """Add text to the current block.

        Verbatim text is stored raw.  Other text is only kept inside the
        body, and not when the innermost open block tag is a pure tag.
        """
        para = self.current[-1]
        if self.in_verbatim:
            para.add_raw_data(text)
        elif self.in_body and (not self.tags or not self.tags[-1] in self.pure_tag):
            para.add_cdata(text)

    def set_formatter(self, formatter):
        self.current[-1].set_formatter(formatter)

    def provide_formatter(self, formatter):
        self.current[-1].provide_formatter(formatter)

    def handle_startendtag(self, tag, attrs):
        """Handle a self-closing tag as an opening plus a closing tag."""
        self.handle_starttag(tag, attrs, single=True)
        self.handle_endtag(tag, single=True)

    def handle_starttag(self, tag, attrs, single=False):
        """Dispatch an opening tag according to its kind.

        Inside verbatim text the tag is copied as text.  Tags that are
        not known and not pure are copied as text as well.
        """
        if self.in_body:
            if self.in_verbatim:
                self.add_text(self.open_tag(tag, attrs, single))
            else:
                parent = None
                in_verbatim = (tag == 'pre')
                if len(self.current) > 0:
                    parent = self.current[-1].formatter
                if tag in self.block_tag:
                    if in_verbatim:
                        # Literal block marker, added to the enclosing block
                        self.add_text('::')
                    formatter = None
                    formatter_class = self.block_tag[tag]
                    if formatter_class:
                        formatter = formatter_class.factory(parent, tag, attrs)
                    self.push_para(tag, formatter)
                elif tag in self.inline_tag:
                    factory = self.inline_tag[tag].factory
                    self.push_styler(factory(tag, attrs))
                elif tag in self.modifier_tag:
                    factory = self.modifier_tag[tag].factory
                    self.provide_formatter(factory(parent, tag, attrs))
                elif not tag in self.pure_tag:
                    self.add_text('<%s'%(tag))
                    for aname, aval in attrs:
                        self.add_text(' %s="%s"'%(aname, aval))
                    self.add_text('>')
                self.in_verbatim = in_verbatim
                trace('Start tag: %s %s\n'%('.'*len(self.current),
                                            self.open_tag(tag, attrs)))
        elif tag == 'body' and not self.start_comment:
            self.in_body = True

    def handle_endtag(self, tag, single=False):
        """Dispatch a closing tag according to its kind."""
        if self.in_body:
            if tag == 'pre':
                self.in_verbatim = False
            if self.in_verbatim:
                self.add_text(self.close_tag(tag, single))
            elif not self.end_comment and tag == 'body':
                self.in_body = False
            else:
                trace('End tag:   %s /%s\n'%('.'*len(self.current), tag))
                if tag in self.block_tag:
                    self.pop_para(tag)
                elif tag in self.inline_tag:
                    self.pop_styler()
                elif not tag in self.modifier_tag:
                    self.add_text(self.close_tag(tag, single))

    def handle_data(self, data):
        self.add_text(data)

    def handle_charref(self, name):
        """Add the character designated by a numeric reference.

        name is the text between '&#' and ';', either decimal or, with a
        leading 'x' or 'X', hexadecimal.  Codes listed in self.charref
        are replaced by their plain equivalent; other codes are converted
        with chr().  Raises Error when the reference is not a number or
        when chr() cannot convert the code.
        """
        try:
            if name[:1] in ('x', 'X'):
                c = int(name[1:], 16)
            else:
                c = int(name)
        except ValueError:
            raise self.Error('invalid char reference: &#%s;'%(name))
        if c in self.charref:
            self.add_text(self.charref[c])
            return
        try:
            text = chr(c)
        except (ValueError, OverflowError):
            raise self.Error('invalid char reference: &#%s;'%(name))
        self.add_text(text)

    def handle_entityref(self, name):
        """Add the text of a named entity; unknown ones are copied as is."""
        if name in self.entities:
            self.add_text(self.entities[name])
        else:
            self.add_text('&%s;'%(name))

    def handle_comment(self, data):
        """Enter or leave the body on the context's delimiting comments."""
        if data == self.start_comment:
            self.in_body = True
        elif data == self.end_comment:
            self.in_body = False

    def convert(self, infd, outfd):
        """Parse the HTML read from infd and write the result to outfd.

        infd is iterated line by line; outfd only needs a write() method.
        """
        self.outfd = outfd
        for line in infd:
            self.feed(line)
        # Close whatever is still open, including the root block.
        while self.current:
            self.pop_para()
        lines = self.para.get_content(self.page_width)
        for s in lines:
            outfd.write('%s\n'%(s))


def convert_page(src_dir, dest_dir, pagename):
    """Convert the page pagename of src_dir into a file of dest_dir.

    When a wiki context is active, the output file name is the page name
    as converted by the context.  With the --no-act option, the planned
    conversion is only printed on standard output.
    """
    infile = os.path.join(src_dir, pagename)
    out_pagename = pagename
    if context:
        out_pagename = context.convert_page_name(pagename)
    outfile = os.path.join(dest_dir, out_pagename)
    verbose("Converting %s\n"%(infile))
    if options.no_act:
        sys.stdout.write("Convert %s to %s\n"%(infile, outfile))
        return
    infd = open(infile)
    try:
        outfd = open(outfile, "w")
        try:
            parser = Html2ReStructuredTextParser()
            parser.convert(infd, outfd)
        finally:
            outfd.close()
    finally:
        # Runs even if closing the output file failed.
        infd.close()


options, args = option_parser.parse_args()

try:
    # Status and trace messages go to standard error when enabled and are
    # discarded otherwise.
    verbose = noop
    if options.verbose:
        verbose = warn

    trace = noop
    if options.trace:
        trace = warn

    if len(args) != 3:
        option_parser.error("invalid number of arguments: %d"%(len(args)))

    base_url, src_dir, dest_dir = tuple(args)
    # Links are compared with the base URL without its trailing slash.
    if base_url.endswith('/'):
        base_url = base_url[:-1]

    pages = os.listdir(src_dir)

    if options.zwiki:
        context = ZwikiContext(base_url, pages)

    for pagename in pages:
        convert_page(src_dir, dest_dir, pagename)
except (SystemExit, KeyboardInterrupt):
    # Exit requests (including the one issued by option_parser.error) and
    # user interruptions are not conversion errors: let them through.
    raise
except Exception:
    if options.debug:
        import traceback
        traceback.print_exc()
        sys.exit(1)
    # Report the error and exit with a non-zero status.
    die('Error: %s\n'%(sys.exc_info()[1]))

#  Local Variables: ***
#  mode: python ***
#  End: ***
