from Products.PortalTransforms.interfaces import itransform
from Products.CMFDefault.utils import bodyfinder
from Products.CMFDefault.utils import IllegalHTML
from zLOG import LOG, PROBLEM

from sgmllib import SGMLParser
from Products.CMFDefault.utils import SimpleHTMLParser
from Products.CMFDefault.utils import VALID_TAGS
from Products.CMFDefault.utils import NASTY_TAGS

# Tag tables, keyed by tag name.  For VALID_TAGS the value selects how
# StrippingParser writes the tag: a true value means a "long" tag that is
# emitted as <tag ...> and keeps its closing tag, a false value means a
# "short" tag that is emitted as <tag ... /> with no closing tag.
# Private copies are taken so the additions below stay local to this module.
NASTY_TAGS = NASTY_TAGS.copy()
VALID_TAGS = VALID_TAGS.copy()

# Extra tags accepted by this transform.  This should be fixed in CMFDefault.
VALID_TAGS.update({'ins': 1, 'del': 1, 'q': 1})

# HTML fragment used to report a problem in place of the converted body.
# Filled with a (title, message) pair via the % operator.
msg_pat = """
<div class="system-message">
<p class="system-message-title">System message: %s</p>
%s</div>
"""

class StrippingParser(SGMLParser):
    """Pass only allowed tags;  raise exception for known-bad.
    
    Copied from Products.CMFDefault.utils
    Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
    """

    from htmlentitydefs import entitydefs # replace entitydefs from sgmllib

    def __init__(self, valid, nasty, raise_error):
        SGMLParser.__init__( self )
        self.result = []
        self.valid = valid
        self.nasty = nasty
        self.raise_error = raise_error
        self.suppress = False

    def handle_data(self, data):
        if self.suppress: return
        if data:
            self.result.append(data)

    def handle_charref(self, name):
        if self.suppress: return
        self.result.append('&#%s;' % name)

    def handle_comment(self, comment):
        pass

    def handle_decl(self, data):
        pass

    def handle_entityref(self, name):
        if self.suppress: return
        if self.entitydefs.has_key(name):
            x = ';'
        else:
            # this breaks unstandard entities that end with ';'
            x = ''

        self.result.append('&%s%s' % (name, x))

    def unknown_starttag(self, tag, attrs):
        """ Delete all tags except for legal ones.
        """
        if self.suppress: return
        if self.valid.has_key(tag):
            self.result.append('<' + tag)

            for k, v in attrs:
                if k.strip().lower().startswith('on'):
                    if not self.raise_error: continue
                    else: raise IllegalHTML, 'Javascript event "%s" not allowed.' % k
                elif v.strip().lower().startswith('javascript:' ):
                    if not self.raise_error: continue
                    else: raise IllegalHTML, 'Javascript URI "%s" not allowed.' % v
                else:
                    self.result.append(' %s="%s"' % (k, v))

            #UNUSED endTag = '</%s>' % tag
            if self.valid.get(tag):
                self.result.append('>')
            else:
                self.result.append(' />')
        elif self.nasty.has_key(tag):
            self.suppress = True
            if self.raise_error:
                raise IllegalHTML, 'Dynamic tag "%s" not allowed.' % tag
        else:
            # omit tag
            pass

    def unknown_endtag(self, tag):
        if self.nasty.has_key(tag):
            self.suppress = False
        if self.suppress: return
        if self.valid.get(tag):
            self.result.append('</%s>' % tag)
            #remTag = '</%s>' % tag

    def getResult(self):
        return ''.join(self.result)

def scrubHTML(html, valid=VALID_TAGS, nasty=NASTY_TAGS, raise_error=True):
    """ Return ``html`` after running it through a StrippingParser.

    ``valid`` and ``nasty`` are the tag tables handed to the parser and
    ``raise_error`` is passed on to it unchanged; the parser's collected
    output is returned as a single string.
    """
    stripper = StrippingParser(valid, nasty, raise_error)
    stripper.feed(html)
    stripper.close()
    cleaned = stripper.getResult()
    return cleaned

class SafeHTML:
    """Simple transform which uses CMFDefault functions to
    clean potentially bad tags"""

    __implements__ = itransform

    __name__ = "safe_html"
    inputs   = ('text/html',)
    output = "text/x-html-safe"

    def __init__(self, name=None):
        self.config_metadata = {
            'inputs' : ('list', 'Inputs', 'Input(s) MIME type. Change with care.'),
            }
        if name:
            self.__name__ = name

    def name(self):
        return self.__name__

    def __getattr__(self, attr):
        if attr == 'inputs':
            return self.config['inputs']
        if attr == 'output':
            return self.config['output']
        raise AttributeError(attr)

    def convert(self, orig, data, **kwargs):
        try:
             safe = scrubHTML(bodyfinder(orig), raise_error=False)
        except IllegalHTML, inst:
            data.setData(msg_pat % ("Error", str(inst)))
        else:
            data.setData(safe)
        return data

def register():
    """Create and return a SafeHTML transform with its default name."""
    transform = SafeHTML()
    return transform
