/* This file is part of the KDE project
 * Copyright (C) 2007 Montel Laurent <montel@kde.org>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation version 2.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 */

#define STRIGI_IMPORT_API
#include <strigi/streamthroughanalyzer.h>
#include <strigi/analyzerplugin.h>
#include <strigi/fieldtypes.h>
#include <strigi/analysisresult.h>

//kde include
#include <KUrl>
#include <kdebug.h>

//qt include
#include <QByteArray>
#include <QFile>
#include <QRegExp>
#include <QString>
#include <QTextCodec>
#include <QTextStream>
using namespace std;
using namespace Strigi;

class HtmlThroughAnalyzerFactory;
class HtmlThroughAnalyzer : public StreamThroughAnalyzer {
    private:
        const HtmlThroughAnalyzerFactory* factory;
        AnalysisResult* idx;
        const char* name() const {
           return "HtmlThroughAnalyzer";
        }

        void setIndexable( AnalysisResult *i ) {
            idx = i;
        }
        InputStream* connectInputStream( InputStream *in );
        bool isReadyWithStream() { return true; }
    public:
        HtmlThroughAnalyzer( const HtmlThroughAnalyzerFactory* f ) : factory( f ) {}
};

/**
 * Factory for HtmlThroughAnalyzer.
 *
 * Besides creating analyzer instances, it holds the pointers to the
 * registered fields that the analyzers write their values to.
 */
class HtmlThroughAnalyzerFactory : public StreamThroughAnalyzerFactory {
public:
    // Assigned in registerFields(); read by HtmlThroughAnalyzer.
    const RegisteredField* docTypeNameField;
    const RegisteredField* javascriptNameField;
    const RegisteredField* titleNameField;

private:
    // Field names handed to the FieldRegister in registerFields().
    static const std::string docTypeName;
    static const std::string javascriptName;
    static const std::string titleName;

    /// @return the fixed analyzer name "HtmlThroughAnalyzer"
    const char* name() const
    {
        return "HtmlThroughAnalyzer";
    }

    /// @return a newly allocated analyzer bound to this factory
    StreamThroughAnalyzer* newInstance() const
    {
        return new HtmlThroughAnalyzer(this);
    }

    /// Registers the three fields above; defined out of line.
    void registerFields( FieldRegister& reg );
};

// Names under which the analyzer's fields are registered.
const std::string HtmlThroughAnalyzerFactory::docTypeName = "document type";
const std::string HtmlThroughAnalyzerFactory::javascriptName = "javascript";
const std::string HtmlThroughAnalyzerFactory::titleName = "title";

/**
 * Registers the doctype, javascript and title fields as string fields and
 * stores the returned handles in the corresponding public members.
 *
 * @param reg register that the fields are added to
 */
void HtmlThroughAnalyzerFactory::registerFields( FieldRegister& reg )
{
    docTypeNameField =
        reg.registerField(docTypeName, FieldRegister::stringType, 1, 0 );
    javascriptNameField =
        reg.registerField(javascriptName, FieldRegister::stringType, 1, 0 );
    titleNameField =
        reg.registerField(titleName, FieldRegister::stringType, 1, 0 );
}

/**
 * Extracts header metadata from the HTML file being analyzed and records it
 * on the current AnalysisResult: the doctype, the title (re-decoded with the
 * charset announced in a meta tag, if any) and whether a <script> tag occurs.
 *
 * The file is opened separately through the path reported by the
 * AnalysisResult; the stream passed in is neither read nor modified.
 *
 * @param in input stream supplied by the caller
 * @return @p in, unchanged; also returned untouched when the file cannot
 *         be opened
 */
InputStream* HtmlThroughAnalyzer::connectInputStream( InputStream* in ) {
    const std::string& path = idx->path();
    QFile f(path.c_str());
    if (!f.open(QIODevice::ReadOnly))
        return in;

    // We're only interested in the header, so a bounded prefix of the file is
    // enough. Think a 0-filled 3GB file with an .html extension: the size is
    // compared as qint64 so that such a file cannot overflow an int and yield
    // a negative buffer size.
    const qint64 maxBufSize = qMin(f.size(), qint64(32768));
    // read() returns exactly the bytes that were read, so a short read never
    // leaves uninitialized bytes in the buffer.
    const QByteArray data = f.read(maxBufSize);
    const QString s(data);

    QRegExp exp;
    exp.setCaseSensitivity(Qt::CaseInsensitive);
    exp.setMinimal(true);

    // document type
    exp.setPattern("\\s*<\\s*!doctype\\s*([^>]*)\\s*>");
    if (exp.indexIn(s) != -1)
    {
        //kDebug(7034) << "DocType: " << exp.capturedTexts().join("-") << endl;
        const QByteArray docType = exp.cap(1).toUtf8();
        idx->addValue( factory->docTypeNameField,
                       std::string(docType.constData(), docType.size()) );
    }

    // The title has to be matched while the title pattern is still active:
    // exp is reused for the meta tags below, which replaces the pattern.
    QString title;
    exp.setPattern("<\\s*title\\s*>\\s*(.*)\\s*<\\s*/\\s*title\\s*>");
    if (exp.indexIn(s) != -1)
    {
        title = exp.cap(1);
    }

    exp.setPattern("<\\s*meta\\s*([^>]*)\\s*>");
    QRegExp rxName("(?:name|http-equiv)\\s*=\\s*\"([^\"]+)\"", Qt::CaseInsensitive);
    QRegExp rxContent("content\\s*=\\s*\"([^\"]+)\"", Qt::CaseInsensitive);
    QRegExp rxCharset("charset\\s*=\\s*(.*)", Qt::CaseInsensitive);
    QTextCodec *codec = 0;

    // Walk over all meta tags; the last one that carries a charset decides
    // which codec is used for the title.
    int last = 0;
    int start = 0;
    while ((start = exp.indexIn(s, last)) != -1)
    {
        const QString meta = exp.cap(1);
        last = start + exp.matchedLength();

        //kDebug(7034) << "Found Meta: " << meta << endl;

        // only meta tags with both a name/http-equiv and a content attribute
        // are considered
        if (rxName.indexIn(meta) == -1)
            continue;
        if (rxContent.indexIn(meta) == -1)
            continue;
        const QString content = rxContent.cap(1);

        // check if it has a charset defined
        if (rxCharset.indexIn(content) != -1)
        {
            //kDebug(7034) << "CodecForName : " << rxCharset.cap(1) << endl;
            codec = QTextCodec::codecForName(rxCharset.cap(1).toAscii());
        }
    }

    if ( ! title.isEmpty() )
    {
        if ( codec )
        {
            title = codec->toUnicode(title.toAscii());
            //kDebug(7034) << "Codec : " << codec->name() << endl;
        }
        const QByteArray utf8Title = title.toUtf8();
        idx->addValue( factory->titleNameField,
                       std::string(utf8Title.constData(), utf8Title.size()) );
    }

    // find out if it contains javascript
    // NOTE(review): the pattern is the literal "<script>", so script tags
    // that carry attributes (e.g. type="...") are not detected.
    exp.setPattern("<script>");
    idx->addValue( factory->javascriptNameField,
                   (s.indexOf(exp) != -1) ? "true" : "false" );
    return in;
}

class Factory : public AnalyzerFactoryFactory {
public:
    std::list<StreamThroughAnalyzerFactory*>
    streamThroughAnalyzerFactories() const {
        std::list<StreamThroughAnalyzerFactory*> af;
        af.push_back(new HtmlThroughAnalyzerFactory());
        return af;
    }
};

// Passes the Factory class to Strigi's plugin macro; presumably this exports
// the plugin entry points declared in strigi/analyzerplugin.h - confirm
// against that header.
STRIGI_ANALYZER_FACTORY(Factory) 

