import java.io.*;
import java.net.*;
import java.util.*;


/**
 * Class to build a list of files from hyperlinks
 *
 * @author Ben Secrest &lt;blsecres@users.sourceforge.net&gt;
 * @version $Id: LinkScanner.java,v 1.20 2002/09/23 10:37:27 howama Exp $
 */
public class LinkScanner extends FileListBuilder {
    /** Set of files (paths or URL strings) already taken off the stack */
    private HashSet processedFiles;

    /** Stack of files (paths or URL strings) still to be processed */
    private Stack processStack;

    /** The project settings (not referenced anywhere in this class) */
    private Project project;

    /** Filters for files to scan for links; never <tt>null</tt> */
    private String[] scanIncludeFilters;

    /** Filters for files to not scan for links; never <tt>null</tt> */
    private String[] scanExcludeFilters;

    /**
     * Determines whether or not links will be scanned from local files or
     * live web pages
     */
    private boolean useNetwork;


    /**
     * Constructor
     * @param logObj The object to use for logging
     * @param parserChooser The parser chooser object used to parse files
     * @param indexIncFilters Filters for files to include in index
     * @param indexExcFilters Filters for files to exclude from index
     * @param scanIncFilters Filters for files to scan for links;
     *	<tt>null</tt> is treated as "no filters"
     * @param scanExcFilters Filters for files to not scan for links;
     *	<tt>null</tt> is treated as "no filters"
     * @param useNet Use network connection to obtain files
     * @param targetList Object to build FileList in
     */
    public LinkScanner(IGLog logObj, ParserChooser parserChooser,
            String[] indexIncFilters, String[] indexExcFilters,
            String[] scanIncFilters, String[] scanExcFilters,
            boolean useNet, FileList targetList) {
        super(logObj, parserChooser, indexIncFilters, indexExcFilters,
                targetList);

        parser = parserChooser;

        // normalise missing filter lists so filePassScanFilters() never has
        // to deal with a null array
        scanIncludeFilters =
            (scanIncFilters == null ? new String[0] : scanIncFilters);
        scanExcludeFilters =
            (scanExcFilters == null ? new String[0] : scanExcFilters);

        useNetwork = useNet;

        processedFiles = new HashSet();
        processStack = new Stack();
    }


    /**
     * Method to scan a file parsing out links and any other necessary
     * information.  Starting from <tt>file</tt>, every file that parses
     * without an IOException is added to the file list and the links
     * extracted from it are resolved and queued in turn, until no
     * unprocessed link is left.  A file whose parsing raises an IOException
     * is reported as a dead link and skipped.
     *
     * @param file The filename or URL to scan for links
     * @throws IOException if an error occurs processing a file; this
     *	includes a malformed start URL when the network is used
     * @throws IllegalVariableException declared by this method; the throwing
     *	call is not visible in this class
     */
    public void scan(String file) throws IOException, IllegalVariableException {
        if (LOGLEVEL >= IGLog.PROCEDURE)
            log.add(IGLog.PROCEDURE, "LinkScanner.scan(String[" + file + "])");

        String protocol = null;         // protocol to use for server access
        String host = null;             // host to download files from
        String virtualRoot = null;      // virtual root directory for
                                        // filesystem searches
        String fileSeparator;           // separator used in queued names
        int port = -1;

        if (useNetwork) {
            URL site = new URL(file);
            protocol = site.getProtocol();
            host = site.getHost();
            port = site.getPort();
            fileSeparator = "/";
        } else {
            fileSeparator = System.getProperty("file.separator");
            int rootEnd = file.lastIndexOf(fileSeparator);
            // a bare file name has no directory part: anchor absolute links
            // at the current directory instead of failing in substring()
            virtualRoot = (rootEnd == -1 ? "." : file.substring(0, rootEnd));
        }

        processStack.push(file);

        // process extracted links until nothing left
        while (! processStack.empty()) {
            String curFile = (String) processStack.pop();

            // this file has already been processed
            if (processedFiles.contains(curFile))
                continue;

            URL curURL = (useNetwork ? new URL(curFile) : null);
            IGFile igFile = new IGFile(curFile);
            // make sure this file doesn't get processed more than once
            processedFiles.add(curFile);

            String[] links = null;
            try {
                parser.wantLinks(filePassScanFilters(curFile));
                parser.parse(igFile);

                links = parser.getLinks(igFile);
            } catch (java.io.IOException ioe) {
                // an IOException is thrown by ParserChooser.parse() if the
                // file does not exist
                log.addWarning(109, "LS_DEAD_LINK", new String[]{curFile});
                continue;
            }

            // add igFile to FileList
            addFile(igFile);

            // links will be null if the file didn't pass the scan filter
            if (links == null)
                continue;

            /*
             * URL.getFile() and URL.getPath() return the exact same
             * string.  Get this string, assume all directory names
             * will be followed by a separator and create a base path for
             * links relative to the current file
             */
            String basePath = (useNetwork ? curURL.getFile() : curFile);
            int lastSep = basePath.lastIndexOf(fileSeparator);
            // a local file without any directory part lives in the current
            // directory; its relative links must stay relative
            boolean bareName = (! useNetwork && lastSep == -1);
            basePath = (lastSep < 1 ? "" : basePath.substring(0, lastSep));

            for (int i = 0; i < links.length; i++) {
                // work on a copy; the array belongs to the parser
                String link = links[i];
                if (link == null)
                    continue;

                // strip references
                int hash = link.indexOf('#');
                if (hash != -1)
                    link = link.substring(0, hash);

                // a pure "#fragment" points back into the current document
                if (link.length() == 0)
                    continue;

                // TODO more robust link .. notation handling
                // Only leading "../" segments are consumed, each one popping
                // a directory off the base path.  A ".." further inside the
                // link (or inside an absolute URL) is left untouched.
                String path = basePath;
                while (! bareName && link.startsWith("../")) {
                    link = link.substring(3);
                    int parentEnd = path.lastIndexOf(fileSeparator);
                    path = path.substring(0, (parentEnd == -1 ? 0 : parentEnd));
                }

                /*
                 * only include files in the process list that pass the index
                 * filters
                 */
                if (! filePassesFilters(link))
                    continue;

                String extracted;       // the file taken from the current doc

                // see if the link is a URL or just a path
                if (IGMisc.isURL(link)) {
                    /*
                     * continue if this is a file system search and we've
                     * been given a URL
                     */
                    if (! useNetwork)
                        continue;

                    String linkHost;
                    try {
                        linkHost = new URL(link).getHost();
                    } catch (MalformedURLException mue) {
                        // NOTE(review): only reachable if IGMisc.isURL()
                        // accepts strings java.net.URL rejects.  One bad
                        // link should not abort the whole scan.
                        if (LOGLEVEL >= IGLog.PROCEDURE)
                            log.add(IGLog.PROCEDURE,
                                    "LinkScanner.scan: skipping malformed URL "
                                    + link);
                        continue;
                    }

                    // only follow links on the host being processed
                    if (! linkHost.equalsIgnoreCase(host))
                        continue;

                    extracted = link;
                } else /* link isn't a URL */ {
                    // links are written in URL notation, so '/' marks an
                    // absolute path whatever the system separator is
                    boolean absolute = link.startsWith("/")
                            || link.startsWith(fileSeparator);

                    if (absolute) {
                        // absolute paths on file system search need to be
                        // moved to the virtualRoot
                        extracted = (useNetwork ? "" : virtualRoot) + link;
                    } else if (bareName) {
                        extracted = link;
                    } else {
                        extracted = path + fileSeparator + link;
                    }

                    if (useNetwork) {
                        // create a new URL from the extracted link
                        extracted = new URL(protocol, host, port,
                                extracted).toString();
                    } else if (! fileSeparator.equals("/")) {
                        // if file separator isnt '/', convert '/' in links
                        // to fileSeparator
                        StringBuffer buffer = new StringBuffer(extracted);

                        for (int j = 0; j < buffer.length(); j++)
                            if (buffer.charAt(j) == '/')
                                buffer.replace(j, j + 1, fileSeparator);

                        extracted = buffer.toString();
                    }
                }

                // ensure this file hasn't been processed already
                if (! processedFiles.contains(extracted)) {
                    processStack.push(extracted);
                    if (LOGLEVEL >= IGLog.PROGRESS)
                        log.addResource(IGLog.PROGRESS, "LS_PROCESS",
                                new String[]{extracted});
                }
            }
        }
    }


    /**
     * Check a filename against the scan include and exclude filters.  A
     * filter matches when it occurs anywhere in the filename.  Exclude
     * filters are checked first and empty exclude filters are ignored; with
     * no include filters at all every non-excluded file passes.
     * @param filename The name of the file to run through the filters
     * @return <tt>true</tt> if the file passes the filters, <tt>false</tt>
     * 	otherwise
     */
    private boolean filePassScanFilters(String filename) {
        // TODO Right now this is very basic and based on the index filters.
        // TODO In the future, there are hopes that it will behave similar to
        // TODO w3m's mirror mode.
        if (LOGLEVEL >= IGLog.RESULT)
            log.addResource(IGLog.RESULT, "FLB_SCAN_EXFILTER",
                    new Object[]{new Integer(scanExcludeFilters.length)});

        for (int i = 0; i < scanExcludeFilters.length; i++) {
            if (scanExcludeFilters[i].equals("")) {
                continue;
            }
            if (LOGLEVEL >= IGLog.PROCEDURE)
                log.addResource(IGLog.PROCEDURE, "FLB_CHECK_EXFILTER",
                                new String[]{scanExcludeFilters[i]});
            if (filename.indexOf(scanExcludeFilters[i]) > -1) {
                if (LOGLEVEL >= IGLog.PROGRESS)
                    log.addResource(IGLog.PROGRESS, "FLB_FILE_EXCLUDED",
                                    new String[]{filename});
                return false;
            }
        }

        // No include filters == allow all
        if (scanIncludeFilters.length == 0) {
            return true;
        }

        // scan include filters
        for (int i = 0; i < scanIncludeFilters.length; i++) {
            if (filename.indexOf(scanIncludeFilters[i]) > -1) {
                return true;
            }
        }

        // reached only when no include filter matched
        // NOTE(review): confirm the text behind FLB_FILE_MATCH fits this case
        if (LOGLEVEL >= IGLog.PROGRESS)
            log.addResource(IGLog.PROGRESS, "FLB_FILE_MATCH",
                            new String[]{filename});

        return false;
    }
}
