///###////////////////////////////////////////////////////////////////////////
//
// Burton Computer Corporation
// http://www.burton-computer.com
// http://www.cooldevtools.com
// $Id: UrlOnlyHtmlTokenizer.cc 86 2004-11-11 14:48:57Z brian $
//
// Copyright (C) 2000 Burton Computer Corporation
// ALL RIGHTS RESERVED
//
// This program is open source software; you can redistribute it
// and/or modify it under the terms of the Q Public License (QPL)
// version 1.0. Use of this software in whole or in part, including
// linking it (modified or unmodified) into other programs is
// subject to the terms of the QPL.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// Q Public License for more details.
//
// You should have received a copy of the Q Public License
// along with this program; see the file LICENSE.txt.  If not, visit
// the Burton Computer Corporation or CoolDevTools web site
// QPL pages at:
//
//    http://www.burton-computer.com/qpl.html
//    http://www.cooldevtools.com/qpl.html
//

#include "UrlOnlyHtmlTokenizer.h"

// Pattern used to locate URL-bearing attributes inside an HTML tag body.
//
//   [^a-z0-9_]        one character that cannot be part of an attribute name,
//                     so "href"/"src" is only matched at the start of a name
//   (href|src)        group 1: the attribute name
//   [ \t\r\n]*=[ \t\r\n]*
//                     the '=' sign with optional surrounding whitespace
//   ( ... )           group 2: the attribute value, one of
//                       '...   a run starting with a single quote
//                       "...   a run starting with a double quote
//                       ...    an unquoted run
//                     each run stops at '>', whitespace, or (for the quoted
//                     forms) the matching quote character.
//
// Note that for the quoted forms the opening quote character is part of
// group 2; the closing quote is not.
static const string URL_REGEX("[^a-z0-9_](href|src)[ \t\r\n]*=[ \t\r\n]*('[^>' \t\r\n]+|\"[^>\" \t\r\n]+|[^> \t\r\n]+)");

// Constructs the tokenizer.
//
// All four arguments are handed unchanged to the HtmlTokenizer base class.
// The only state added by this class is m_urlRegex, which is initialized
// from URL_REGEX.
//
// NOTE(review): the literal arguments 3 and true passed to m_urlRegex are
// presumably the number of (sub)matches to track and a case-insensitivity
// flag -- confirm against the RegularExpression constructor.
UrlOnlyHtmlTokenizer::UrlOnlyHtmlTokenizer(
    AbstractTokenizer *textTokenizer,
    AbstractTokenizer *htmlTokenizer,
    int maxTagLength,
    AbstractTokenReceiver *tagReceiver)
    : HtmlTokenizer(textTokenizer, htmlTokenizer, maxTagLength, tagReceiver),
      m_urlRegex(URL_REGEX, 3, true)
{
}

// Destructor. This class performs no cleanup of its own here; the body is
// intentionally empty.
UrlOnlyHtmlTokenizer::~UrlOnlyHtmlTokenizer() {}

void UrlOnlyHtmlTokenizer::processTagBody(const string &tag)
{
    if (isCommentTag(tag)) {
        return;
    }

    int offset = 0;
    string url, decoded_url;
    RegularExpression::MatchData match;
    while (m_urlRegex.match(tag.c_str() + offset)) {
        m_urlRegex.getMatch(2, url);
        m_urlRegex.getMatch(2, match);
        HtmlTokenizer::processTagBody(decodeUrl(url, decoded_url));
        offset += match.end_pos;
        assert(offset <= tag.length());
    }
}

// Percent-decodes a URL.
//
// Every "%XY" sequence in url, where X and Y both satisfy is_xdigit(), is
// replaced by the single character whose value is
// (hex_to_int(X) << 4) | hex_to_int(Y).  All other characters -- including a
// '%' that is not followed by two such characters -- are copied unchanged.
//
// @param url     text to decode
// @param buffer  receives the decoded text; any previous contents are erased
// @return        reference to buffer
//
// When the is_debug flag is set, the original and decoded URLs are written
// to cerr.
const string &UrlOnlyHtmlTokenizer::decodeUrl(const string &url,
                                              string &buffer)
{
    buffer.erase();

    const char *cursor = url.c_str();
    const char *const stop = cursor + url.length();

    while (cursor < stop) {
        // Looking at cursor[1] and cursor[2] relies on the terminator that
        // c_str() places after the last character together with the
        // left-to-right short-circuit evaluation of &&.
        const bool is_escape = cursor[0] == '%'
                               && is_xdigit(cursor[1])
                               && is_xdigit(cursor[2]);
        if (is_escape) {
            buffer += static_cast<char>(hex_to_int(cursor[1]) << 4 | hex_to_int(cursor[2]));
            cursor += 3;
        } else {
            buffer += cursor[0];
            cursor += 1;
        }
    }

    if (is_debug) {
        cerr << "ORIG URL '" << url << "' DECODED '" << buffer << "'" << endl;
    }
    return buffer;
}

