/*
 * Written by Bastien Chevreux (BaCh)
 *
 * Copyright (C) 1997-2000 by the German Cancer Research Center (Deutsches
 *   Krebsforschungszentrum, DKFZ Heidelberg) and Bastien Chevreux
 * Copyright (C) 2000 and later by Bastien Chevreux
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 *
 */

// functions to process reads
// currently in namespace and object assembly


#include "boost/unordered_map.hpp"
#include <boost/regex.hpp>
#include <boost/filesystem.hpp>

#include "assembly.H"
#include "dataprocessing.H"
#include "hashstats.H"
#include <ctype.h>


using namespace std;


//#define CEBUG(bla)   {if(CEBUGFLAG) {cout << bla; cout.flush();}}
#define CEBUG(bla)




//#define CEBUG(bla)   {if(id1==2282 && id2==342) {cout << bla; cout.flush();}}
//#define CEBUG(bla)   {cout << bla; cout.flush();}
//#define CEBUG(bla)   {cout << bla;}






/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

// Builds (or reloads from the "static_hashstat.bin" checkpoint file) the hash
// statistics of the read pool, assigns the per-base statistics to all reads
// and then, depending on the parameters of AS_miraparams[0]:
//  - buntifies the reads by hash frequency (as_buntify_reads)
//  - writes the read repeat info file (hs_repeatlevel_in_infofile)
//  - performs digital normalisation (hs_masknastyrepeats together with
//    hs_apply_digitalnormalisation)
//  - dumps all reads to an elog file (AS_logflag_dumphashanalysis)
//
//  useminkmer: when true, as_clipmask_rarekmers of every parameter set is
//              handed over to assignReadBaseStatistics_MultiThread(); when
//              false an empty vector is passed
//  version, prefix, postfix, logname:
//              only used to build the name of the repeat info file, and
//              only if logname is not empty
void Assembly::performHashAnalysis(bool useminkmer, int32 version, const string prefix, const string postfix, const string logname)
{
  FUNCSTART("void Assembly::performHashAnalysis()");
  //CEBUG("BEFORE\n");
  //for(uint32 actid=0; actid<AS_readpool.size(); actid++){
  //  Read & r=AS_readpool.getRead(actid);
  //  r.integrityCheck();
  //  Read::setCoutType(Read::AS_TEXT);
  //  cout << r;
  //}

  assembly_parameters const & as_fixparams= AS_miraparams[0].getAssemblyParams();
  skim_parameters const & skim_params= AS_miraparams[0].getSkimParams();
  hashstatistics_parameters const & hs_params= AS_miraparams[0].getHashStatisticsParams();

  // hashes longer than 15 bases are only used when uint64 really has 64 bits
  uint8 basesperhash=skim_params.sk_basesperhash;
  if(sizeof(uint64) < 8 && basesperhash > 15) basesperhash=15;

  // stays 0 (== no nasty repeat masking) unless masking is requested
  uint32 nastyrepeatratio=0;
  if(AS_needsskimfornastyrepeats && hs_params.hs_masknastyrepeats){
    // BaCh 22.04.2013: MNR tags are now always deleted by assignReadBaseStatistics(), so must be also re-set
    // TODO: check whether AS_needsskimfornastyrepeats is still needed
    //AS_needsskimfornastyrepeats=false;
    AS_needsskimfornastyrepeats=true;
    nastyrepeatratio=hs_params.hs_nastyrepeatratio;
  }

  HashStatistics s3;
  string filenameforhs;   // filled by prepareHashStatistics(), needed for the rename further down
  string stathsfn(buildDefaultCheckpointFileName("static_hashstat.bin"));
  {
    s3.setHashFrequencyRatios(hs_params.hs_freqest_minnormal,
                              hs_params.hs_freqest_maxnormal,
                              hs_params.hs_freqest_repeat,
                              hs_params.hs_freqest_heavyrepeat,
                              hs_params.hs_freqest_crazyrepeat,
                              hs_params.hs_nastyrepeatratio,
                              hs_params.hs_nastyrepeatcoverage);

    s3.setAvgHashFreqMinimum(hs_params.hs_freq_covestmin);

    vector<uint32> minkmer;
    if(useminkmer){
      for(auto & mp : AS_miraparams){
        minkmer.push_back(mp.getAssemblyParams().as_clipmask_rarekmers);
      }
    }

    // prefer the checkpointed statistics if they exist, else compute anew
    if(fileExists(stathsfn)){
      s3.loadHashStatistics(AS_readpool,stathsfn,basesperhash);
    }else{
      s3.prepareHashStatistics(AS_miraparams[0].getDirectoryParams().dir_tmp,
                               AS_readpool,
                               true,
                               false,
                               false,
                               true,
                               1,
                               basesperhash,
                               filenameforhs
        );
    }
    s3.showHashStatisticsInfo();
    cout << "Assigning statistics values:\n";
    if(as_fixparams.as_dateoutput) dateStamp(cout);
    s3.assignReadBaseStatistics_MultiThread(skim_params.sk_numthreads, nastyrepeatratio>0,minkmer);
    if(as_fixparams.as_dateoutput) dateStamp(cout);

    if(as_fixparams.as_buntify_reads){
      AS_dataprocessing.buntifyReadsByHashFreq_Pool(AS_readpool,basesperhash);
    }
  }

  //CEBUG("AFTER\n");
  //for(uint32 actid=0; actid<AS_readpool.size(); actid++){
  //  Read & r=AS_readpool.getRead(actid);
  //
  //  Read::setCoutType(Read::AS_TEXT);
  //
  //  if(r.getName()=="FF5UQ0101A62BE.fn"
  //     || r.getName()=="FFPHEER01DATWZ"
  //     || r.getName()=="FFPHEER01AK3C0"){
  //    cout << r;
  //  }
  //}

  //if(nastyrepeatratio){
  if(hs_params.hs_repeatlevel_in_infofile){
    string filename;

    if(logname.size()){
      filename=buildFileName(version, prefix, postfix, logname, "");
    }else{
      //filename=buildFileName(version, prefix, postfix,
      //                       as_fixparams.as_outfile_stats_readrepeats,
      //                       ".lst");

      //filename=buildDefaultInfoFileName(version, prefix, postfix,
      filename=buildDefaultInfoFileName(-1, "", "",
                                        "",
                                        as_fixparams.as_outfile_stats_readrepeats,
                                        ".lst",
                                        true);
    }

    cout << "Writing read repeat info to: " << filename << " ... ";
    cout.flush();

    uint32 howmanys=0;   // number of reads with at least one reported stretch
    uint32 howmanyt=0;   // number of reported stretches (tags)

    // the requested level is clamped to the range 5..8
    uint32 repanalysislevel=hs_params.hs_repeatlevel_in_infofile;
    if(repanalysislevel<5) repanalysislevel=5;
    if(repanalysislevel>8) repanalysislevel=8;

    ofstream fout;
    fout.open(filename.c_str(), ios::out|ios::trunc);
    for(uint32 rpi=0; rpi<AS_readpool.size(); rpi++){
      Read & actread= AS_readpool.getRead(rpi);
      if(!actread.hasValidData()
         || !actread.isUsedInAssembly()) continue;

      // quick pre-check on read level: only the first tag type found (in
      //  the order HAF5, HAF6, HAF7, MNRr) decides whether the tags of
      //  this read are looked at in detail
      bool mustshow=false;
      if(actread.hasTag(Read::REA_tagentry_idHAF5,-1)) {
        if(repanalysislevel==5) mustshow=true;
      }else if(actread.hasTag(Read::REA_tagentry_idHAF6,-1)) {
        if(repanalysislevel<=6) mustshow=true;
      }else if(actread.hasTag(Read::REA_tagentry_idHAF7,-1)) {
        if(repanalysislevel<=7) mustshow=true;
      }else if(actread.hasTag(Read::REA_tagentry_idMNRr,-1)) {
        if(repanalysislevel<=8) mustshow=true;
      }
      if(mustshow){
        bool countedthisseq=false;
        for(uint32 tn=0; tn<actread.getNumOfTags(); tn++){
          const multitag_t & acttag=actread.getTag(tn);
          // only tags spanning at least one complete hash are reported
          if(acttag.to-acttag.from +1 >= basesperhash){
            // from here on mustshow is re-evaluated for every single tag
            mustshow=false;
            if(acttag.identifier==Read::REA_tagentry_idHAF5) {
              if(repanalysislevel==5) mustshow=true;
            }else if(acttag.identifier==Read::REA_tagentry_idHAF6) {
              if(repanalysislevel<=6) mustshow=true;
            }else if(acttag.identifier==Read::REA_tagentry_idHAF7) {
              if(repanalysislevel<=7) mustshow=true;
            }else if(acttag.identifier==Read::REA_tagentry_idMNRr) {
              if(repanalysislevel<=8) mustshow=true;
            }
            if(mustshow){
              if(!countedthisseq){
                // plain assignment: incrementing a bool is ill-formed
                //  since C++17
                countedthisseq=true;
                ++howmanys;
              }
              ++howmanyt;
              // one line per stretch: read name, tag identifier, sequence
              //  of the stretch in upper case
              fout << actread.getName() << '\t'
                   << acttag.getIdentifierStr() << '\t';
              for(uint32 readpos=acttag.from; readpos<=acttag.to; readpos++){
                fout << static_cast<char>(toupper(actread.getBaseInSequence(readpos)));
              }
              fout << '\n';
            }
          }
        }
      }
    }

    cout << howmanys << " sequences with " << howmanyt << " masked stretches." << endl;
  }

  if(hs_params.hs_masknastyrepeats && hs_params.hs_apply_digitalnormalisation){
    // keep the freshly computed statistics as checkpoint file
    if(!fileExists(stathsfn)){
      //cout << "Renaming / moving static hash statistics " << filenameforhs << " to " << stathsfn << " ... "; cout.flush();
      cout << "Renaming / moving static hash statistics ... "; cout.flush();
      fileRename(filenameforhs,stathsfn);
      cout << "done." << endl;
    }
    if(as_fixparams.as_dateoutput) dateStamp(cout);
    cout << "Performing digital normalisation: "; cout.flush();
    AS_dataprocessing.performDigitalNormalisation_Pool(AS_readpool,s3,&AS_debrisreason);
    cout << "done\n";
    if(as_fixparams.as_dateoutput) dateStamp(cout);
  }


  if(AS_logflag_dumphashanalysis){
    string logfilename=AS_miraparams[0].getDirectoryParams().dir_tmp+"/elog.dp.hashanalysis.lst";

    cout << "elog hashan: " << logfilename << endl;
    ofstream logfout;
    logfout.open(logfilename.c_str(), ios::out|ios::trunc);

    for(uint32 rpi=0; rpi<AS_readpool.size(); rpi++){
      Read::setCoutType(Read::AS_TEXT);
      logfout << AS_readpool[rpi];
    }
  }

  FUNCEND();
  return;
}









/*************************************************************************
 *
 * expects reads to have baseflags set  (by performHashAnalysis())
 *
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}

// Computes hash statistics with the hash size as_clip_pec_basesperhash and
// uses the resulting per-base statistics to cut back the left and right ends
// of reads: starting at the current clip points, bases are skipped until one
// of the enabled stop criteria (frequency, confirmed fwd/rev, confirmed by
// multiple sequencing types, seen at low position) is met.
//
// Reads are only handled if they have valid data and base hash statistics,
// are neither backbone nor rail, and as_clip_proposeendclips is set for
// their sequencing type.
//
// Further effects visible in this function:
//  - may switch as_clip_pec_mkfr of AS_miraparams[0] to 2
//  - sets ads_enforce_clean_ends / ads_clean_end_distance in the align
//    parameters of all sequencing types
//  - calls clipPolyBaseAtEnd_Pool() on the read pool at the end
//
//  logname:   if not empty, clip decisions are appended to this file;
//             failing to open it is reported via MIRANOTIFY(Notify::FATAL)
//  logprefix: written in front of every log line
//  returns:   number of bases removed by the cutbacks performed in the read
//             loop of this function; 0 if as_clip_proposeendclips is not
//             active for any sequencing type present
uint64 Assembly::performNewProposedCutbackClips(const string & logname, const string & logprefix)
{
  FUNCSTART("uint64 Assembly::performNewProposedCutbackClips(const string & logname, const string & logprefix)");

  bool doit=false;
  for(size_t mpi=0; mpi < AS_miraparams.size(); ++mpi){
    doit|=AS_seqtypespresent[mpi] && AS_miraparams[mpi].getAssemblyParams().as_clip_proposeendclips;
  }

  if(!doit) return 0;

  cout << "Hash analysis for proposed cutbacks:";

  skim_parameters const & skim_params= AS_miraparams[0].getSkimParams();
  assembly_parameters const & as_fixparams= AS_miraparams[0].getAssemblyParams();
  hashstatistics_parameters const & hs_params= AS_miraparams[0].getHashStatisticsParams();

  {
    // hashes longer than 15 bases are only used when uint64 really has 64 bits
    uint8 basesperhash=as_fixparams.as_clip_pec_basesperhash;
    if(sizeof(uint64) < 8 && basesperhash > 15) basesperhash=15;

    HashStatistics s3;

    s3.setHashFrequencyRatios(hs_params.hs_freqest_minnormal,
                              hs_params.hs_freqest_maxnormal,
                              hs_params.hs_freqest_repeat,
                              hs_params.hs_freqest_heavyrepeat,
                              hs_params.hs_freqest_crazyrepeat,
                              hs_params.hs_nastyrepeatratio,
                              hs_params.hs_nastyrepeatcoverage);

    vector<uint32> dummy;

    string filenameforhs;
    s3.prepareHashStatistics(AS_miraparams[0].getDirectoryParams().dir_tmp,
                             AS_readpool,
                             true,
                             false,
                             false,
                             true,
                             as_fixparams.as_clip_pec_mkfr,
                             basesperhash,
                             filenameforhs
      );
    s3.showHashStatisticsInfo();
    cout << "Assigning statistics values:\n";
    if(as_fixparams.as_dateoutput) dateStamp(cout);
    s3.assignReadBaseStatistics_MultiThread(skim_params.sk_numthreads, false, dummy);
    if(as_fixparams.as_dateoutput) dateStamp(cout);


    auto avgcov=s3.getAvgHashFreqRaw();

    if(basesperhash>=17
       && as_fixparams.as_clip_pec_mkfr <2){
      if(AS_miraparams[0].getPathfinderParams().paf_use_genomic_algorithms
         && AS_seqtypespresent[ReadGroupLib::SEQTYPE_SOLEXA] && avgcov >=50){
        cout << "Detected probable higher coverage in Illumina genome project, setting: -CL:pmkfr=2\n";
        // the parameters are only handed out as const, hence the cast
        const_cast<assembly_parameters &>(AS_miraparams[0].getAssemblyParams()).as_clip_pec_mkfr=2;
      }
    }
  }

  ofstream logfout;
  if(!logname.empty()){
    logfout.open(logname.c_str(), ios::out|ios::app);
    if(!logfout){
      MIRANOTIFY(Notify::FATAL, "Could not open log for appending: " << logname);
    }
  }

  if(as_fixparams.as_dateoutput) dateStamp(cout);
  cout << '\n';

  static const string ggcproblem="ggc";

  cout << "Looking for proposed cutbacks ... "; cout.flush();
  Read::setCoutType(Read::AS_TEXT);

  uint32 cbleft=0;            // reads clipped on the left
  uint32 cbright=0;           // reads clipped on the right
  uint32 killed=0;            // reads falling below the minimum read length
  uint64 numbasesclipped=0;
  for(uint32 actid=0; actid<AS_readpool.size(); actid++){
    Read & r=AS_readpool.getRead(actid);

    if(!r.hasValidData()) continue;

    // parameters of the sequencing type of this read, looked up once
    assembly_parameters const & rparams=AS_miraparams[r.getSequencingType()].getAssemblyParams();

    if(!rparams.as_clip_proposeendclips
       || !r.hasBaseHashStats()
       || r.isBackbone()
       || r.isRail()) continue;

    CEBUG("lar " << r.getName() << " ");

    bool hasbeenclipped=false;

    uint32 oldlen=r.getLenClippedSeq();

    // left side: walk forward from the left clip until a stop criterion hits
    {
      int32 lpos=r.getLeftClipoff();
      vector<Read::bposhashstat_t>::const_iterator bhsI=r.getBPosHashStats().begin();
      advance(bhsI,lpos);
      for(; lpos<static_cast<int32>(r.getLenSeq()); ++lpos, ++bhsI) {
        if(rparams.as_clip_pec_ffreq >0
           && (bhsI->fwd.getFrequency() > rparams.as_clip_pec_ffreq
               || bhsI->rev.getFrequency() > rparams.as_clip_pec_ffreq)) {
          CEBUG("ffreq stop at " << lpos << "\n");
          break;
        }
        if(rparams.as_clip_pec_ffr
           && ( bhsI->fwd.hasConfirmedFwdRev()
                || bhsI->rev.hasConfirmedFwdRev())) {
          CEBUG("ffore stop at " << lpos << "\n");
          break;
        }
        if(rparams.as_clip_pec_fcmst
           && ( bhsI->fwd.hasConfirmedMultipleSeqType()
                || bhsI->rev.hasConfirmedMultipleSeqType())) {
          CEBUG("fcmst stop at " << lpos << "\n");
          break;
        }
        if(rparams.as_clip_pec_fsalp
           && ( bhsI->fwd.hasSeenAtLowPos()
                || bhsI->rev.hasSeenAtLowPos())) {
          CEBUG("fsalp stop at " << lpos << "\n");
          break;
        }
      }

      if(lpos != r.getLeftClipoff()){
        hasbeenclipped=true;

        if(lpos>0 && lpos>r.getLenSeq()) lpos=r.getLenSeq();
        CEBUG("pcb l: " << r.getName() << " " << r.getLeftClipoff()
              << " " << lpos << endl);
        logfout << logprefix << " left "
                << r.getName() << '\t'
                << r.getLeftClipoff() << " -> ";
        if(lpos==r.getLenSeq()){
          // no stop criterion met up to the end of the sequence: instead of
          //  moving the left clip, the right clip is pulled onto it
          r.setRQClipoff(r.getLeftClipoff());
          logfout << "killed\n";
        }else{
          r.setLQClipoff(lpos);
          logfout << r.getLeftClipoff() << '\n';
        }
        cbleft++;
      }
    }

    // right side: walk backward from the right clip until a stop criterion
    //  hits
    {
      int32 rpos=r.getRightClipoff();
      vector<Read::bposhashstat_t>::const_iterator bhsI=r.getBPosHashStats().begin();
      advance(bhsI,rpos);

      for(; rpos >0; --rpos){
        --bhsI;
        if(rparams.as_clip_pec_bfreq
           && (bhsI->fwd.getFrequency() > rparams.as_clip_pec_bfreq
               || bhsI->rev.getFrequency() > rparams.as_clip_pec_bfreq)) {
          CEBUG("bfreq stop at " << rpos << "\n");
          break;
        }
        if(rparams.as_clip_pec_bfr
           && ( bhsI->fwd.hasConfirmedFwdRev()
                || bhsI->rev.hasConfirmedFwdRev())) {
          CEBUG("bfore stop at " << rpos << "\n");
          break;
        }
        if(rparams.as_clip_pec_bcmst
           && ( bhsI->fwd.hasConfirmedMultipleSeqType()
                || bhsI->rev.hasConfirmedMultipleSeqType())) {
          CEBUG("bcmst stop at " << rpos << "\n");
          break;
        }
        if(rparams.as_clip_pec_bsalp
           && ( bhsI->fwd.hasSeenAtLowPos()
                || bhsI->rev.hasSeenAtLowPos())) {
          CEBUG("bsalp stop at " << rpos << "\n");
          break;
        }
      }

      if(rpos != r.getRightClipoff()){
        hasbeenclipped=true;

        CEBUG("pcb r: " << r.getName() << " " << r.getRightClipoff()
              << " " << rpos << endl);
        logfout << logprefix << " right "
                << r.getName() << '\t'
                << r.getRightClipoff() << " -> ";
        r.setRQClipoff(rpos);
        cbright++;
        logfout << r.getRightClipoff() << '\n';

        // special handling of Solexa GGC.G error
        // from point of right clip, 15 bases backwards:
        //  search for first ggc.g and clip there
        if(r.getSequencingType()==ReadGroupLib::SEQTYPE_SOLEXA
           && as_fixparams.as_clip_pec_sxaggcxg
           && r.getLenClippedSeq() >=15){
          //Read::setCoutType(Read::AS_TEXTSHORT);
          //cout << r;
          string searchstr=r.getSeqAsChar();
          boost::to_lower(searchstr);
          int64 searchstart=r.getRightClipoff()-15;
          if(searchstart<0) searchstart=0;
          size_t found;
          do{
            found=searchstr.find(ggcproblem,searchstart);
            if (found!=string::npos){
              searchstart=found+1;
              if(found < r.getRightClipoff()
                 && found+4<r.getRightClipoff()){
                if(searchstr[found+4]=='g'){
                  logfout << logprefix << "possible Solexa GGC.G problem "
                          << r.getName() << '\t' << r.getRQClipoff() << " -> ";
                  r.setRQClipoff(static_cast<int32>(found+4));
                  logfout << r.getRQClipoff() << '\n';
                  found=string::npos; // stop the loop
                }
              }else{
                // hits are found in ascending order, so once one does not
                //  fit in front of the right clip anymore none of the
                //  following can: stop the loop
                found=string::npos;
              }
            }
          }while(found!=string::npos);
        }
      }
    }

    if(hasbeenclipped){
      CEBUG("clipstat yes\n" << r << endl);
      numbasesclipped+=oldlen-r.getLenClippedSeq();
      if(oldlen
         && (r.getLenClippedSeq() < rparams.as_minimum_readlength )){
        killed++;
        logfout << logprefix << " "
                << r.getName() << " killed, remaining length ("
                << r.getLenClippedSeq() << ")\n";
      }
    }else{
      CEBUG("clipstat no\n" << r << endl);
    }
  }

  logfout.close();

  cout << "done.\nPerformed clips:"
       << "\n\tNum reads cliped left: " << cbleft
       << "\n\tNum reads cliped right: " << cbright
       << "\n\tNum reads completely killed: " << killed
       << "\n\tTotal bases clipped         : " << numbasesclipped
       << "\n\n";

  // now, set the align parameters to enforce clean ends
  for(uint32 st=0; st<ReadGroupLib::SEQTYPE_END; st++){
    align_parameters & alpar=const_cast<align_parameters &>(AS_miraparams[st].getAlignParams());
    alpar.ads_enforce_clean_ends=true;
    alpar.ads_clean_end_distance=AS_miraparams[0].getSkimParams().sk_basesperhash;
  }


  AS_dataprocessing.clipPolyBaseAtEnd_Pool(AS_readpool,logprefix);

  FUNCEND();

  return numbasesclipped;
}
//#define CEBUG(bla)






/*************************************************************************
 *
 *
 *
 *************************************************************************/

void Assembly::cutBackPossibleChimeras(const string & logname, const string & logprefix, const vector<int32> & chuntleftcut, const vector<int32> & chuntrightcut, vector<bool> & chimeracutflag)
{
  FUNCSTART("void Assembly::cutBackPossibleChimeras(const string & logname, const string & logprefix, const vector<int32> & chuntleftcut, const vector<int32> & chuntrightcut)");

  BUGIFTHROW(chuntleftcut.size()!=chuntrightcut.size() && chuntleftcut.size() != AS_readpool.size(),"Arrays mismatch? chuntleftcut.size()!=chuntrightcut.size && chuntleftcut.size() != AS_readpool.size()");

  ofstream logfout;
  if(!logname.empty()){
    logfout.open(logname.c_str(), ios::out|ios::app);
    if(!logfout){
      MIRANOTIFY(Notify::FATAL, "Could not open log for appending: " << logname);
    }
  }

  cout << "Cutting back possible chimeras ... "; cout.flush();

  if(!chimeracutflag.empty()){
    chimeracutflag.clear();
    chimeracutflag.resize(chuntleftcut.size(),false);
  }

  assembly_parameters const & as_fixparams= AS_miraparams[0].getAssemblyParams();

  for(uint32 actreadid=0;actreadid<chuntleftcut.size();actreadid++){
    Read & actread=AS_readpool.getRead(actreadid);
    if(actread.hasValidData()
       && !(actread.isBackbone()
	    || actread.isRail())){
      bool didcut=false;
      if(as_fixparams.as_clip_skimchimeradetection
	 && (chuntleftcut[actreadid]>0
	     || chuntrightcut[actreadid]>0)){
	logfout << logprefix << " possible chimera: " << actread.getName()
		<< "\t["
		<< actread.getLeftClipoff()
		<< ","
		<< actread.getRightClipoff()
		<< "[ using cfrag " << chuntleftcut[actreadid] << ":" << chuntrightcut[actreadid]
		<< " cut back to ";

	actread.setLSClipoff(actread.getLeftClipoff()+chuntleftcut[actreadid]);
	actread.setRSClipoff(actread.getLeftClipoff()+(chuntrightcut[actreadid]-chuntleftcut[actreadid])+1);
	didcut=true;
	if(!chimeracutflag.empty()){
	  chimeracutflag[actreadid]=true;
	}

	logfout << '['
		<< actread.getLeftClipoff()
		<< ","
		<< actread.getRightClipoff()
		<< "[\n";
      }

      if(!didcut
	 && (chuntleftcut[actreadid]<0
	     || chuntrightcut[actreadid]<0)){
	if(as_fixparams.as_clip_skimjunkdetection){
	  logfout << logprefix << " removed possible junk: " ;
	}else{
	  logfout << logprefix << " untouched possible junk: " ;
	}
	logfout << actread.getName()
		<< "\t["
		<< -chuntleftcut[actreadid]
		<< ","
		<< -chuntrightcut[actreadid]
		<< '\n';
	if(as_fixparams.as_clip_skimjunkdetection){
	  actread.setLSClipoff(actread.getLeftClipoff()-chuntleftcut[actreadid]);
	  actread.setRSClipoff(actread.getRightClipoff()+chuntrightcut[actreadid]);
	  if(!chimeracutflag.empty()){
	    chimeracutflag[actreadid]=true;
	  }
	}
      }
    }
  }

  cout << "done.\n";
}




/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/
/*
void Assembly::performPool_AdaptorRightClip(const string & logname, const string & logprefix, const uint8 seqtype)
{
  FUNCSTART("void Assembly::performPool_SolexaAdaptorRightClip(const string & logname, const string & logprefix, const uint8 seqtype);)");

// BOOST: regex not compatible with _GLIBCXX_DEBUG
#ifdef _GLIBCXX_DEBUG
  cout << "_GLIBCXX_DEBUG not compatible with BOOST::regex :-(\n";
  return;
#endif

  BUGIFTHROW(seqtype>=ReadGroupLib::SEQTYPE_END,"Unknown seqtype " << static_cast<uint16>(seqtype) << "given.");

#if CPP_READ_SEQTYPE_END != 8
#error "This code is made for 8 sequencing types, adapt!"
#endif

  struct masterslavere_t {
    boost::regex masterre;
    std::list<boost::regex> slaveres;
    bool hasmaster;

    masterslavere_t(): hasmaster(false) {};
  };

  // prepare regular expressions
  list<masterslavere_t> adapres;
  {
    istringstream tmpis;
    if(seqtype==ReadGroupLib::SEQTYPE_SOLEXA){
      static const char regexfile[] = {
#include "adaptorsregex.solexa.xxd.H"
	,0
      };
      tmpis.str(regexfile);
    }else if(seqtype==ReadGroupLib::SEQTYPE_IONTORRENT){
      static const char regexfile[] = {
#include "adaptorsregex.iontor.xxd.H"
	,0
      };
      tmpis.str(regexfile);
    }

    masterslavere_t tmpmsre;
    string line;

    while(true){
      getline(tmpis,line);
      if(tmpis.eof()) break;
      if(line[0]=='>'){
	adapres.push_back(tmpmsre);
	line.erase(0,1);         // get away the ">"
	boost::trim(line);
	if(!line.empty()){
	  boost::to_upper(line);
	  adapres.back().masterre=boost::regex(line);
	  adapres.back().hasmaster=true;
	}
      }else{
	BUGIFTHROW(adapres.empty(),"Oooops, found no master expression?");
	boost::to_upper(line);
	adapres.back().slaveres.push_back(boost::regex(line));
      }
    }
  }

  ReadPool adappool(&AS_miraparams);
  {
    istringstream tmpis;

    if(seqtype==ReadGroupLib::SEQTYPE_SOLEXA){
      static const char adapfile[] = {
#include "adaptorsforclip.solexa.xxd.H"
	,0
      };
      tmpis.str(adapfile);
    }else if(seqtype==ReadGroupLib::SEQTYPE_IONTORRENT){
      static const char adapfile[] = {
#include "adaptorsforclip.iontor.xxd.H"
	,0
      };
      tmpis.str(adapfile);
    }else if(seqtype==ReadGroupLib::SEQTYPE_454GS20){
      static const char adapfile[] = {
#include "adaptorsforclip.454.xxd.H"
	,0
      };
      tmpis.str(adapfile);
    }

    string line;
    while(true){
      getline(tmpis,line);
      if(tmpis.eof()) break;
      line.erase(0,1);         // get away the ">"
      if(!line.empty()){
	size_t ereadidx=adappool.provideEmptyRead();
	Read & actread=adappool[ereadidx];
	actread.disallowAdjustments();
	actread.setName(line);
	getline(tmpis,line);
	if(tmpis.eof()) break;
	actread.setSequenceFromString(line);
      }
    }
  }

  //adappool.dumpPoolInfo(cout);

  // Go back if nothing to be searched
  if(adappool.size()==0 && adapres.size()==0) return;

  cout << "Starting " << ReadGroupLib::getNameOfSequencingType(seqtype) << " known adaptor right clip ... "; cout.flush();

  Skim adapskim;
  adapskim.skimStreamPrepare(adappool,7,1);

  if(AS_miraparams[0].getAssemblyParams().as_dateoutput) dateStamp(cout);
  cout << "Searching multithread now ... \n"; cout.flush();

  cout << static_cast<int16>(AS_miraparams[0].getSkimParams().sk_numthreads) << endl;

  vector<int32> clipres;
  adapskim.findAdaptorRightClip(AS_readpool,clipres,seqtype,9,AS_miraparams[0].getSkimParams().sk_numthreads);
  //adapskim.findAdaptorRightClip(AS_readpool,clipres,seqtype,9,1);
  //adapskim.findAdaptorRightClip(AS_readpool,clipres,seqtype,9,8);

  BUGIFTHROW(clipres.size()!=AS_readpool.size(),"clipres.size()!=AS_readpool.size()???");

  ofstream logfout;
  if(!logname.empty()){
    logfout.open(logname.c_str(), ios::out|ios::app);
    if(!logfout){
      MIRANOTIFY(Notify::FATAL, "Could not open log for appending: " << logname);
    }
  }

  uint32 numclipped=0;

  if(AS_miraparams[0].getAssemblyParams().as_dateoutput) dateStamp(cout);
  cout << "Searching for " <<  ReadGroupLib::getNameOfSequencingType(seqtype) << " partial end adaptors ... \n"; cout.flush();
  ProgressIndicator<int64> P(0, AS_readpool.size());
  for(uint32 actid=0; actid < AS_readpool.size(); actid++){
    P.progress(actid);
    Read & actread = AS_readpool.getRead(actid);
    if(actread.hasValidData()
       && actread.getSequencingType()==seqtype
       && !(actread.isBackbone() || actread.isRail())){

      auto oldrsclip=actread.getRSClipoff();
      if(clipres[actid]>=0){
	if(clipres[actid] < oldrsclip){
	  ++numclipped;
	  actread.setRSClipoff(clipres[actid]);
	  logfout << logprefix << " "
		  << ReadGroupLib::getNameOfSequencingType(seqtype)
		  << " adaptor: " << actread.getName()
		  << " changed right clip from " << oldrsclip << " to " << clipres[actid] << "\n";
	}
      }else if(!adapres.empty()){
	string seq(actread.getSeqAsChar());
	boost::to_upper(seq);

	boost::match_results<std::string::const_iterator> what;
	boost::match_flag_type flags = boost::match_default;
	std::string::const_iterator start, end;

	for(auto & msre : adapres){
	  bool dosearch=true;
	  if(msre.hasmaster){
	    if(!regex_search(start, end, what, msre.masterre, flags)) {
	      dosearch=false;
	    }
	  }
	  bool breakit=false;
	  if(dosearch){
	    for(auto & thisre : msre.slaveres){
	      start = seq.begin();
	      end = seq.end();
	      if(regex_search(start, end, what, thisre, flags)) {
		if(what.position()< oldrsclip){
		  actread.setRSClipoff(what.position());
		  logfout << logprefix << " "
			  << ReadGroupLib::getNameOfSequencingType(actread.getSequencingType())
			  << " partial end adaptor: " << actread.getName()
			  << " changed right clip from " << oldrsclip << " to " << what.position() << "\n";
		  breakit=true;
		  break;
		}
	      }
	    }
	  }
	  if(breakit) break;
	}

      }
    }
  }

  P.finishAtOnce();

  if(!logname.empty()){
    logfout.close();
  }

  cout << "done. Clipped " << numclipped << " reads.\n";

  if(AS_miraparams[0].getAssemblyParams().as_dateoutput) dateStamp(cout);

  FUNCEND();
  return;
}

*/


/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

// Runs the automatic editor over every contig in AS_contigs. For each contig:
// editContigBack(), discarding of the SCF buffer, deletion of star-only
// columns over the whole contig length and marking of possible repeats.
// A Notify thrown while working on a contig is passed to its handleError().
//
// Does nothing when compiled without MIRA_HAS_EDIT.
void Assembly::correctContigs()
{
#ifdef MIRA_HAS_EDIT
  FUNCSTART("void Assembly::correctContigs()");

  if(AS_miraparams[0].getAssemblyParams().as_dateoutput) dateStamp(cout);
  cout << "\nEditing contigs:" << endl;

  EDITParameters eparams;

  //  eparams.setDoEval();
  eparams.setStrictEvaluation(false);
  eparams.setConfirmationThreshold(0.5);
  eparams.setShowProgress(true);
  eparams.setVerbose(0);

  int32 ccounter=0;
  ProgressIndicator<int64> P(0, AS_contigs.size());

  for(Contig & actcon : AS_contigs){
    P.progress(ccounter);
    try {
      //        CEBUG("Editing contig:" << ccounter << endl);
      //        CEBUG(actcon);
      cout << "Editing contig:" << ccounter << endl;
      editContigBack(actcon, eparams);
      ScfBuffer::discard();
      cout << "deleting star columns" << ccounter << endl;
      actcon.deleteStarOnlyColumns(0, actcon.getContigLength()-1);
      cout << "marking repeats" << ccounter << endl;

      Contig::repeatmarker_stats_t repstats;
      vector<bool> readsmarkedsrm;
      actcon.newMarkPossibleRepeats(repstats, readsmarkedsrm);

      //        CEBUG("Corrected contig:" << endl);
      //        CEBUG(actcon);
    }
    // caught by reference: no copy of the exception object, no slicing
    catch(Notify & n){
      n.handleError("Error while examining fault-region");
    }

    ++ccounter;
  }

  P.finishAtOnce();

  cout << endl;

  FUNCEND();
#endif
  return;
}






/*************************************************************************
 *
 * Calculates possible sequence vector leftovers at the left side of a read
 * Reads that get a clip must be of Sanger type
 *
 * Does not clip backbone reads, rail reads, multicopyreads
 *  AND not areas protected by Staden GenBank Feature tags
 *
 * Clipping itself must be done afterwards in the performSeqVectorClippings()
 *  function. This was split in two parts to allow releasing of the
 *  big memory chunks AS_readhmcovered, AS_readhitmiss, etc.
 *
 *************************************************************************/


// Fills AS_clipleft with the number of bases to clip on the left of each
// read (-1 == nothing to clip) and writes the decisions to a log file.
// The file name is built from logname if given, else from as_tmpf_vectorclip.
void Assembly::calcPossibleSeqVectorClipoffs(int32 version, const string prefix, const string postfix, const string logname)
{
  FUNCSTART("void Assembly::calcPossibleSeqVectorClipoffs(int32 version, const string prefix, const string postfix, const string logname)");

  const bool noclipinfo = (AS_readhmcovered.size()==0) || (AS_readhitmiss.size()==0);
  if(noclipinfo) {
    cout << "\nNo vector clipping information available, aborting vector clip.\n";
    FUNCEND();
    return;
  }

  if(AS_miraparams[0].getAssemblyParams().as_dateoutput) dateStamp(cout);
  cout << "\nCalculating possible vector leftovers ... ";
  cout.flush();

  // start with "no clip" for every read
  AS_clipleft.clear();
  AS_clipright.clear();
  AS_clipleft.resize(AS_readhmcovered.size(),-1);
  AS_clipright.resize(AS_readhmcovered.size(),-1);

  string filename;
  if(logname.empty()){
    filename=buildFileName(version, prefix, postfix,
                           AS_miraparams[0].getAssemblyParams().as_tmpf_vectorclip,
                           ".txt");
  }else{
    filename=buildFileName(version, prefix, postfix, logname, ".txt");
  }

  ofstream logout(filename.c_str(), ios::out | ios::trunc);

  for(uint32 readid=0; readid<AS_readhmcovered.size(); ++readid) {
    Read & actread=AS_readpool.getRead(readid);

    // only Sanger reads which are neither backbone, rail nor multicopy
    const bool skipread = actread.getSequencingType() != ReadGroupLib::SEQTYPE_SANGER
      || actread.isBackbone()
      || actread.isRail()
      || AS_multicopies[readid]>0;
    if(skipread) continue;

    // look for the last position with coverage >=4 where at least 30% of
    //  the covering hits are misses; give up once more than 5 positions
    //  have passed since the last such position
    const auto & covered=AS_readhmcovered[readid];
    const auto & hitmiss=AS_readhitmiss[readid];
    uint32 clippos=0;
    bool mustclip=false;
    for(uint32 actpos=0; actpos<covered.size(); ++actpos) {
      if(actpos-clippos > 5) break;
      if(covered[actpos]<4 || !hitmiss[actpos]) continue;
      const double misspercent=100.0/static_cast<double>(covered[actpos])*static_cast<double>(hitmiss[actpos]);
      if(misspercent >= 30.0) {
        clippos=actpos;
        mustclip=true;
      }
    }
    ++clippos;

    // check that no GenBank Feature tags protect the area, else clip less
    // FIXME: put all checks for that into read.C (*sigh*)
    for(uint32 tagi=0; tagi<actread.getNumOfTags(); ++tagi){
      const multitag_t & acttag=actread.getTag(tagi);
      if(acttag.isSourceMIRA()) continue;
      if(acttag.from<clippos) clippos=acttag.from;
      if(acttag.to<=clippos) clippos=0;
    }

    // do not clip if only one base is affected (looks too silly)
    if(!mustclip || clippos<=1) continue;

    const uint32 maxcliplenallowed=AS_miraparams[actread.getSequencingType()].getAssemblyParams().as_clip_vector_maxlenallowed;

    // a maximum of 0 means "no limit"
    const bool lengthok = (maxcliplenallowed == 0) || (clippos <= maxcliplenallowed);
    if(lengthok) {
      // only the amount is stored here, it gets added to the left clip
      //  of the read later
      AS_clipleft[readid]=clippos;

      logout << "Clipped " << clippos << " bases on the left of " << actread.getName() << "\n";
    } else {
      logout << "Not clipped " << clippos << " bases on the left of " << actread.getName() << " , too long.\n";
    }
  }

  logout.close();

  cout << "done.\n";

  AS_steps[ASVECTORSCLIPPED]=1;
  AS_steps[ASADSLISTOK]=0;

  FUNCEND();
}




/*************************************************************************
 *
 * Reads must be Sanger type
 *
 *
 *************************************************************************/

void Assembly::performSeqVectorClippings()
{
  FUNCSTART("void Assembly::performSeqVectorClippings()");

  cout << "\nPerforming vector clipping ... ";
  cout.flush();

  for(uint32 id=0; id<AS_clipleft.size(); id++) {
    if(AS_clipleft[id]>=0
       && AS_readpool.getRead(id).isSequencingType(ReadGroupLib::SEQTYPE_SANGER)) {
      AS_readpool.getRead(id).setClipoffs(AS_readpool.getRead(id).getLeftClipoff()+AS_clipleft[id],
					  AS_readpool.getRead(id).getRightClipoff(),
					  false);
    }
  }
  FUNCEND();

  AS_clipleft.clear();

  cout << "done." << endl;

  return;
}



/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

// Per-read bookkeeping for Assembly::extendADS(): one entry per read in
//  the read pool.
struct cliplen_t{
  // number of bases the read may be extended by; extendADS() keeps the
  //  largest value found over all alignments of that read
  int32 len;
  // set to true as soon as an extension has been recorded in 'len'
  bool changed;
};


//#define CEBUGFLAG 1
// Searches for possible extensions of reads into their clipped right
//  part and applies them.
//
// For every entry of AS_adsfacts where both reads have the same
//  direction, the clipped sequences plus their right extends are
//  aligned. If that gives no alignment, a second, less radical try with
//  half of the extension is made. From the best (highest weight) valid
//  alignment, clipper() (called with as_readextension_window_len and
//  as_readextension_window_maxerrors) yields new lengths; the gain per
//  read is remembered (largest gain wins). Afterwards the right clipoff
//  of every changed read is moved accordingly and the read name together
//  with the gain is written to a log file.
//
// Pairs are skipped when: both sequencing types have read extension
//  switched off, the overlap is permanently banned, one of the reads has
//  a clipped length of 0, or one of the reads is of type PacBio LQ,
//  IonTorrent, Solexa, Text or ABI SOLiD.
//
// version, prefix, postfix, logname: passed to buildFileName() to build
//  the name of the log file; if logname is empty, as_tmpf_adsextend is
//  used as name instead.
//
// Side effect: AS_steps[ASADSLISTOK] is set to 0.
void Assembly::extendADS(int32 version, const string prefix, const string postfix, const string logname)
{
  FUNCSTART("void Assembly::extendADS(int32 version, const string prefix, const string postfix, const string logname)");

//  if(AS_steps[ASADSLISTOK]==0){
//    makeAlignments();
//  }


#if CPP_READ_SEQTYPE_END != 8
#error "This code is made for 8 sequencing types, adapt!"
#endif

  // TODO: change to use different Aligns / MIRAparams depending
  //   on Sanger / 454 (/ PacBio ???) reads

  // TODO: what about PacBio? currently not used, but should it?

  // work on a copy of the first parameter set so that lowering the
  //  minimum relative score does not touch the real parameters
  MIRAParameters tmpparams = AS_miraparams[0];

  const_cast<align_parameters &>(tmpparams.getAlignParams()).al_min_relscore=5;

  assembly_parameters const & as_params= tmpparams.getAssemblyParams();

  string filename;
  if(logname.size()){
    filename=buildFileName(version, prefix, postfix, logname, ".txt");
  }else{
    filename=buildFileName(version, prefix, postfix,
                           as_params.as_tmpf_adsextend,
                           ".txt");
  }

  ofstream logout(filename.c_str(), ios::out | ios::trunc);


  // one entry per read: best extension found so far
  vector<cliplen_t> clips(AS_readpool.size());
  for(uint32 i=0; i<clips.size(); i++){
    clips[i].len=0;
    clips[i].changed=false;
  }

  list<AlignedDualSeq> madsl;

  try{
    Align bla(&tmpparams);

    cout << "\n";
    if(as_params.as_dateoutput) dateStamp(cout);
    cout << "\nSearching possible read extensions (for Sanger and/or 454):\n";

    ProgressIndicator<int32> P(0, static_cast<int32>(AS_adsfacts.size())-1);
    uint32 pindic=0;

    vector<AlignedDualSeqFacts>::const_iterator I = AS_adsfacts.begin();
    for(;I!=AS_adsfacts.end();I++){
      P.progress(pindic++);
      // first try: prolongate to end.
      int32 id1=I->getID1();
      int32 id2=I->getID2();

      Read & read1=AS_readpool.getRead(id1);
      Read & read2=AS_readpool.getRead(id2);

      const bool useext1=AS_miraparams[read1.getSequencingType()].getAssemblyParams().as_use_read_extension;
      const bool useext2=AS_miraparams[read2.getSequencingType()].getAssemblyParams().as_use_read_extension;

      // no sense to calc read extensions for reads where both seqtypes are said
      //  not to use extensions
      if(!useext1 && !useext2) continue;

      if(AS_permanent_overlap_bans.checkIfBanned(id1,id2)) {
        CEBUG("PermBan for: " << id1 << " " << id2 <<"\tskipping\n");
        continue;
      }

      CEBUG("\n\nid1: " << id1 << "\t" << read1.getName() <<endl);
      CEBUG("id2: " << id2 << "\t" << read2.getName() <<endl);

      // normally the sequences should have a length >0
      // but due to some clipping being done after SKIM (chimera etc.), it
      //  may happen they are 0 now. If that's the case, don't bother
      //  looking at.
      if(read1.getLenClippedSeq() == 0
         || read2.getLenClippedSeq() == 0) continue;

      // check for sequencing types
      if( read1.isSequencingType(ReadGroupLib::SEQTYPE_PACBIOLQ)
          || read2.isSequencingType(ReadGroupLib::SEQTYPE_PACBIOLQ)) continue;
      // let's allow PacBio HQ

      if( read1.isSequencingType(ReadGroupLib::SEQTYPE_IONTORRENT)
          || read2.isSequencingType(ReadGroupLib::SEQTYPE_IONTORRENT)) continue;

      if( read1.isSequencingType(ReadGroupLib::SEQTYPE_SOLEXA)
          || read2.isSequencingType(ReadGroupLib::SEQTYPE_SOLEXA)) continue;

      if( read1.isSequencingType(ReadGroupLib::SEQTYPE_TEXT)
          || read2.isSequencingType(ReadGroupLib::SEQTYPE_TEXT)) continue;

      if( read1.isSequencingType(ReadGroupLib::SEQTYPE_ABISOLID)
          || read2.isSequencingType(ReadGroupLib::SEQTYPE_ABISOLID)) continue;

      //if(clips[id1].changed && clips[id2].changed){
      //	CEBUG(id1 << " and " << id2 <<" already changed.\n");
      //	continue;
      //}

      madsl.clear();

#if CEBUGFLAG > 0
      //Read::setCoutType(Read::AS_TEXT);
      Read::setCoutType(Read::AS_TEXTCLIPS);
      CEBUG(read1);
      CEBUG(read2);
#endif

      // only overlaps where both reads have the same direction are
      //  looked at, nothing is done for the others
      if(I->getSequenceDirection(id1) * I->getSequenceDirection(id2) > 0){

        CEBUG("doalign\n");

        // evil hack warning
        // the &(* ...) construction is needed for gcc3 as it cannot convert
        //  a vector<char> iterator to char *   (*sigh*)

        int32 extendlen1=read1.getRightExtend();
        int32 extendlen2=read2.getRightExtend();

        if(!useext1) {
          extendlen1=0;
        }
        if(!useext2){
          extendlen2=0;
        }

        CEBUG("l1: " <<read1.getLenClippedSeq() << endl);
        CEBUG("e1: " <<extendlen1 << endl);
        CEBUG("l2: " <<read2.getLenClippedSeq() << endl);
        CEBUG("e2: " <<extendlen2 << endl);

        if(extendlen1 >= 10 || extendlen2 >= 10){
          bla.acquireSequences(
            &(*read1.getActualSequence().begin())
            +read1.getLeftClipoff(),
            read1.getLenClippedSeq()+extendlen1,
            &(*read2.getActualSequence().begin())
            +read2.getLeftClipoff(),
            read2.getLenClippedSeq()+extendlen2,
            id1, id2, 1, 1, true, I->getOffsetInAlignment(id2));
          bla.fullAlign(&madsl,false,false);

          if(madsl.size()==0){
            CEBUG("No results, less radical try.\n");

            // second try: use only half of the not yet gained extension
            // NOTE(review): a read whose sequencing type does not use
            //  extension keeps a try length of 0 here, which prevents the
            //  second try altogether (see check below) -- confirm whether
            //  that is intended
            int32 tryseqlen1=0;
            if(useext1) {
              if(clips[id1].changed){
                extendlen1-=clips[id1].len;
              }
              extendlen1/=2;
              tryseqlen1=read1.getLenClippedSeq()+extendlen1;
              if(clips[id1].changed){
                tryseqlen1+=clips[id1].len;
              }
              if(read1.getLeftClipoff()+tryseqlen1 >= static_cast<int32>(read1.getLenSeq())) {
                CEBUG("t1o: " <<tryseqlen1 << endl);
                tryseqlen1=read1.getLenClippedSeq()+read1.getRightExtend();
                CEBUG("t1n: " <<tryseqlen1 << endl);
              }
            }

            int32 tryseqlen2=0;
            if(useext2) {
              if(clips[id2].changed){
                extendlen2-=clips[id2].len;
              }
              extendlen2/=2;
              tryseqlen2=read2.getLenClippedSeq()+extendlen2;
              if(clips[id2].changed){
                tryseqlen2+=clips[id2].len;
              }
              if(read2.getLeftClipoff()+tryseqlen2 >= static_cast<int32>(read2.getLenSeq())) {
                CEBUG("t2o: " <<tryseqlen2 << endl);
                tryseqlen2=read2.getLenClippedSeq()+read2.getRightExtend();
                CEBUG("t2n: " <<tryseqlen2 << endl);
              }
            }

            CEBUG("cc1: " <<clips[id1].changed << endl);
            CEBUG("cl1: " <<clips[id1].len << endl);
            CEBUG("l1: " <<read1.getLenClippedSeq() << endl);
            CEBUG("t1: " <<tryseqlen1 << endl);
            CEBUG("cc2: " <<clips[id2].changed << endl);
            CEBUG("cl2: " <<clips[id2].len << endl);
            CEBUG("l2: " <<read2.getLenClippedSeq() << endl);
            CEBUG("t2: " <<tryseqlen2 << endl);
            if(extendlen1 < 5 && extendlen2 < 5) {
              CEBUG("skip" << endl);
              continue;
            }

            if(tryseqlen1>0 && tryseqlen2>0){
              bla.acquireSequences(
                &(*read1.getActualSequence().begin())
                +read1.getLeftClipoff(),
                tryseqlen1,
                &(*read2.getActualSequence().begin())
                +read2.getLeftClipoff(),
                tryseqlen2,
                id1, id2, 1, 1, true, I->getOffsetInAlignment(id2));
              // the acquired sequences must also be aligned, else this
              //  second try can never produce a result
              bla.fullAlign(&madsl,false,false);
            }
          }
        }
      }

      if(madsl.empty()){
        CEBUG("No results\n");
        continue;
      }

      // throw out invalid alignments, remember best weight of the rest
      int32 bestweight=0;
      list<AlignedDualSeq>::iterator J;
      for(J= madsl.begin(); J!=madsl.end(); ){
        if(J->isValid()==false){
          J=madsl.erase(J);
        }else{
          if(J->getWeight()>bestweight) bestweight=J->getWeight();
          J++;
        }
      }
      // take only the best
      for(J= madsl.begin(); J!=madsl.end();){
        if(J->getWeight() != bestweight){
          J=madsl.erase(J);
        } else {
          J++;
        }
      }
//	  cout << "Ext. 1st success: " << id1 << "\t" << id2 << "\n";
//	  cout << *I;
//	  cout << *(madsl.begin());

      // filtering above may have removed every alignment (all invalid or
      //  none with a weight >0): nothing to look at then
      if(madsl.empty()){
        CEBUG("No valid results\n");
        continue;
      }

      int32 lens1=0;
      int32 lens2=0;
      if(madsl.begin()->clipper(as_params.as_readextension_window_len,
                                as_params.as_readextension_window_maxerrors,
                                lens1, lens2)){
//	    cout << "Lalala\n";

        // from here on, lens1 / lens2 hold the gain in length
        lens1-=read1.getLenClippedSeq();
        lens2-=read2.getLenClippedSeq();
        CEBUG("o1: " << read1.getLenClippedSeq() << "\tn: " << lens1);
        CEBUG("\no2: " << read2.getLenClippedSeq() << "\tn: " << lens2<<endl);


        if(useext1){
          if(lens1>5 && lens1>clips[id1].len){
            clips[id1].len=lens1;
            clips[id1].changed=true;
          }
        }

        if(useext2){
          if(lens2>5 && lens2>clips[id2].len){
            clips[id2].len=lens2;
            clips[id2].changed=true;
          }
        }
      }
    }
    P.finishAtOnce();
  }
  catch(Notify n){
    n.handleError(THISFUNC);
  }

  // apply the extensions found
  int32 lenplus=0;
  int32 numchanged=0;
  for(uint32 rid=0; rid<clips.size(); rid++){
    Read & actread=AS_readpool.getRead(rid);

    if(actread.isBackbone()
       || actread.isRail()) continue;
    // contig join spoiler! do not extend back again!
    if(actread.hasTag(Read::REA_defaulttag_CJSP.identifier)) continue;
    // only reads whose sequencing type uses read extension may be extended
    if(AS_miraparams[actread.getSequencingType()].getAssemblyParams().as_use_read_extension == false) continue;

    if(clips[rid].changed){
      CEBUG("ID: " << rid << "\t" << actread.getName() << "\toldlen: " << actread.getLenClippedSeq());
      CEBUG("\tgained: " << clips[rid].len << endl);
      numchanged++;
      lenplus+=clips[rid].len;

      logout << actread.getName() << "\t" << clips[rid].len << "\n";

      actread.setClipoffs(actread.getLeftClipoff(),
                          actread.getLeftClipoff()+actread.getLenClippedSeq()+clips[rid].len-1,
                          false);

      if(actread.checkRead()){
        cout << actread;
        MIRANOTIFY(Notify::INTERNAL, actread.checkRead()) ;
      }
    }
  }

  cout << "\nChanged length of " << numchanged << " sequences."<< endl;
  if(numchanged!=0){
    cout << "Mean length gained in these sequences: " << static_cast<double>(lenplus)/ static_cast<double>(numchanged) << " bases." << endl;
  }

  logout.close();

  AS_steps[ASADSLISTOK]=0;

  FUNCEND();
  return;
}
//#define CEBUGFLAG 0




#define CEBUG(bla)   {cout << bla; cout.flush();}

// For each of the values 0..6, determines the longest run of consecutive
//  entries in 'profile' having that value. If the longest run of value 3
//  is shorter than 5 and the skim edge 'seI' is still marked as taken in
//  AS_skimstaken, the edge is reported on cout, unmarked and the overlap
//  counters of both reads (rid1 and linked_with) are decremented.
// 'adse' is not used.
void Assembly::analyseOverlapHashProfile(vector<uint8> & profile, vector<skimedges_t>::const_iterator seI, ADSEstimator & adse)
{
  const size_t numlevels=7;
  const size_t decisivelevel=3;
  const uint32 minrunwanted=5;

  vector<uint32> maxrun(numlevels,0);
  vector<uint32> actrun(numlevels,0);

  for(vector<uint8>::const_iterator pI=profile.begin(); pI!=profile.end(); ++pI){
    //CEBUG(pi << '\t' << static_cast<uint16>(profile[pi]) << '\n');
    for(size_t level=0; level<numlevels; ++level){
      if(level!=*pI){
        // run of this level (if any) is interrupted
        actrun[level]=0;
        continue;
      }
      ++actrun[level];
      if(actrun[level]>maxrun[level]) maxrun[level]=actrun[level];
    }
  }

  // long enough run found: edge stays
  if(!(maxrun[decisivelevel]<minrunwanted)) return;

  if(AS_skimstaken[seI->skimindex]==true){
    cout << "Remove seI: " << *seI;
    cout << "stretches:\n";
    for(size_t level=0; level<numlevels; ++level){
      cout << level << ' ' << maxrun[level] << endl;
    }

    AS_skimstaken[seI->skimindex]=false;
    AS_numskimoverlaps[seI->rid1]--;
    AS_numskimoverlaps[seI->linked_with]--;
  }
}

#define CEBUG(bla)






/////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////
/////////////////////////        Obsolete         ///////////////////////
/////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////


/*************************************************************************
 *
 * expects reads to have baseflags set  (by performHashAnalysis())
 *
 * doesn't seem to be a good idea
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}

/*

void Assembly::performHashEditing()
{
  FUNCSTART("void Assembly::performHashEditing()");

  cout << "Hash analysis for editing:";

  skim_parameters const & skim_params= AS_miraparams[0].getSkimParams();
  assembly_parameters const & as_fixparams= AS_miraparams[0].getAssemblyParams();

  uint32 basesperhash=as_fixparams.as_clip_pec_basesperhash;
  if(sizeof(uint64) < 8 && basesperhash > 15) basesperhash=15;
  {

    Skim s3;

    s3.setHashFrequencyRatios(skim_params.hs_freqest_minnormal,
			      skim_params.hs_freqest_maxnormal,
			      skim_params.hs_freqest_repeat,
			      skim_params.hs_freqest_heavyrepeat,
			      skim_params.hs_freqest_crazyrepeat,
			      skim_params.hs_nastyrepeatratio);

    s3.analyseHashes(AS_miraparams[0].getDirectoryParams().dir_tmp,
		     AS_readpool,
		     true,
		     false,
		     false,
		     true,
		     1,
		     basesperhash,
		     1,
		     false);
  }

  if(as_fixparams.as_dateoutput) dateStamp(cout);
  cout << '\n';

  cout << "Looking for proposed edits ... "; cout.flush();

  vector<uint8> maxhf;
  maxhf.reserve(10000);

  uint64 numbaseschanged=0;
  uint64 numreadschanged=0;

  for(uint32 actid=0; actid<AS_readpool.size(); actid++){
    Read & r=AS_readpool.getRead(actid);

    if(r.hasValidData()
       && r.hasBaseHashStats()
       && !(r.isBackbone()
	    || r.isRail())){

      maxhf.clear();
      maxhf.resize(r.getLenSeq(),0);

      bool wasedited=false;

      {
	int32 lpos=r.getLeftClipoff();
	vector<Read::bposhashstat_t>::const_iterator bhsI=r.getBPosHashStats().begin();
	vector<uint8>::iterator mhfI=maxhf.begin();
	advance(bhsI,lpos);
	advance(mhfI,lpos);

	uint32 counter=basesperhash;
	for(; lpos<static_cast<int32>(r.getLenSeq()); lpos++, bhsI++, mhfI++) {
	  *mhfI=(bhsI->fwd.getFrequency())>1;
	  if(*mhfI) counter=basesperhash;
	  if(counter) {
	    *mhfI=4;
	    --counter;
	  }
	}

	lpos=r.getLeftClipoff();
	mhfI=maxhf.begin();
	advance(mhfI,lpos);

	//for(; lpos<static_cast<int32>(r.getLenSeq()); lpos++) {
	//  cout << (uint16) maxhf[lpos] << ' ';
	//}
	//cout << endl;
	//lpos=r.getLeftClipoff();
	//for(; lpos<static_cast<int32>(r.getLenSeq()); lpos++) {
	//  cout << r.getBaseInSequence(lpos) << ' ';
	//}
	//cout << endl;
	//Read::setCoutType(Read::AS_TEXT);
	//cout << r;

	lpos=r.getLeftClipoff();
	for(; lpos<static_cast<int32>(r.getLenSeq()); lpos++, mhfI++) {
	  if(*mhfI) break;
	}

	int32 editstart=-1;
	for(; lpos<static_cast<int32>(r.getLenSeq()); lpos++, mhfI++) {
	  if(editstart<0){
	    if(*mhfI==0) {
	      editstart=lpos;
	    }
	  }else{
	    if(*mhfI) {
	      for(int32 ii=editstart; ii<lpos; ii++) {
		//editpositions.push_back(ii);
		r.changeBaseInSequence('n',0,ii);
		numbaseschanged++;
		wasedited=true;
	      }
	      editstart=-1;
	    }
	  }
	}

      }
      if(wasedited) numreadschanged++;

      //if(editpositions.size()){
      //	cout << r.getName() << ": wants to edit " << editpositions.size() << " positions\n";
      //}
    }
  }

  cout << "changed " << numbaseschanged << " bases to 'n' in " << numreadschanged << " reads.\n";

  FUNCEND();

  return;
}
//#define CEBUG(bla)
*/


/*************************************************************************
 *
 * BaCh 31.12.2012: Errrrm ... what's that function for? Was probably a
 *  quick hack for something or I planned some "extra" file for very
 *  special cases. Should probably be removed.
 *
 * REMOVEME!
 *
 * ugly and slow, but works and is fast enough
 *
 *************************************************************************/
//#define CEBUG(bla)   {cout << bla; cout.flush(); }
/*
void Assembly::mergeTemplateInfo(const string & tifile, const string & logname, const string & logprefix)
{
  FUNCSTART("void Assembly::mergeTemplateInfo(const string & tifile, const string & logname, const string & logprefix)");

  cout << "Merging template info from " << tifile << ":\n";

  CEBUG("Building hash table ... "); cout.flush();

  typedef boost::unordered_map<std::string, int32> strmap;
  strmap rnmap;
  strmap::iterator rnI;

  for(uint32 i=0; i<AS_readpool.size();i++){
    if(!AS_readpool[i].getName().empty()) {
      rnmap[AS_readpool[i].getName()]=i;
    }
  }
  CEBUG("done." << endl);

  ofstream logfout;
  if(!logname.empty()){
    logfout.open(logname.c_str(), ios::out|ios::app);
    if(!logfout){
      MIRANOTIFY(Notify::FATAL, "Could not open log for appending: " << logname);
    }
  }

  ifstream tifin;
  tifin.open(tifile.c_str(), ios::in|ios::ate);
  if(!tifin){
    MIRANOTIFY(Notify::FATAL, "File not found: " << tifile);
  }
  streampos tifsize=tifin.tellg();
  tifin.seekg(0, ios::beg);

  ProgressIndicator<streamsize> P (0, tifsize,1000);

  string token;

  while(!tifin.eof()){
    tifin >> token;
    if(tifin.eof()) break;
    if(P.delaytrigger()) P.progress(tifin.tellg());

    //tifin >> sd_score >> sd_readname;

    if(tifin.eof()) break;

    if(token[0]=='+'){
      // new lib
    }else{
      // existing name
      bool foundname=false;
      rnI=rnmap.find(token);
      if(rnI==rnmap.end()) {
	CEBUG("Not found: " << token << endl);
	continue;
      }
      uint32 foundreadid=rnI->second;
      if(!AS_readpool[foundreadid].hasValidData()) continue;

      Read actread(AS_readpool[foundreadid]);
      assembly_parameters const & as_params= AS_miraparams[actread.getSequencingType()].getAssemblyParams();
    }
  }
  P.finishAtOnce();

  tifin.close();

  if(!logname.empty()){
    logfout.close();
  }

  cout << "\nDone." << endl;


  FUNCEND();
  return;
}
//#define CEBUG(bla)
*/




/*************************************************************************
 *
 * splits a sequence into overlapping subsequences
 *
 * AND
 *
 * saves pre-computed adsfacts file into log directory for
 *  later reading
 * number of generated adsfacts is put in AS_numADSFacts_fromshreds
 *
 *
 * This saves enormous amount of time, but is not the "real" thing:
 *  matches between shreds that are non-overlapping from the start on are
 *  not made
 *
 *************************************************************************/

/*
void Assembly::shredReadsIntoReadPool(ReadPool & sourcepool, uint32 shredlen, uint32 shredoffsetinc, uint8 shredreadtype, const string & shredstrain)
{
  FUNCSTART("void Assembly::shredReadsIntoReadPool(ReadPool & sourcepool, uint32 shredlen, uint32 shredoffsetinc, uint8 shredreadtype, const string & shredstrain)");

  AS_numADSFacts_fromshreds=0;
  string adsfshredsfilename=AS_miraparams[0].getDirectoryParams().dir_tmp+"/shred.adsfacts";
  ofstream adsfout;
  adsfout.open((adsfshredsfilename+".adsfacts").c_str(), ios::out|ios::trunc);

  deque<uint32> overlapfifo;

  string shredseq;
  shredseq.reserve(shredlen);
  vector<base_quality_t> shredqual;
  shredqual.reserve(shredlen+10);
  string shredname;

  for(uint32 actsourceid=0; actsourceid < sourcepool.size(); actsourceid++){
    Read & sourceread = sourcepool.getRead(actsourceid);
    if(!sourceread.hasValidData()) continue;
    if(sourceread.getLenSeq() < shredlen) continue;

    uint32 actoffset=0;
    uint32 shredcounter=0;
    for(bool doloop=true; doloop; actoffset+=shredoffsetinc){
      uint32 fromi=actoffset;
      uint32 toi=actoffset+shredlen;
      if(toi>=sourceread.getLenSeq()) {
	toi=sourceread.getLenSeq();
	doloop=false;
      }
      shredseq.clear();
      shredqual.clear();
      for(; fromi<toi; fromi++){
	shredseq+=sourceread.getBaseInSequence(fromi);
	shredqual.push_back(sourceread.getQualityInSequence(fromi));
      }

      // if wished: lower quals to max as_cap454consensusqual
      if(AS_miraparams[0].getAssemblyParams().as_cap454consensusqual>0){
	vector<base_quality_t>::iterator qI=shredqual.begin();
	base_quality_t maxqual=AS_miraparams[0].getAssemblyParams().as_cap454consensusqual;
	for(;qI != shredqual.end(); qI++){
	  if(*qI>maxqual) *qI=maxqual;
	}
      }

      ostringstream ostr;
      ostr << "shred_" << shredcounter << "_" << sourceread.getName();
      shredname=ostr.str();

      AS_readpool.addNewEmptyRead();
      uint32 newreadid=AS_readpool.size()-1;
      Read & newread=AS_readpool.getRead(newreadid);
      newread.setName(shredname);
      newread.setSequenceFromString(shredseq);
      newread.setQualities(shredqual);
      newread.setStrain(shredstrain.c_str());
      newread.setSequencingType(shredreadtype);

      //cout << "\n----------------------------------------\nAdded " << shredname << '\n';
      // now insert the weights
      {
	overlapfifo.push_front(newreadid);
	deque<uint32>::iterator OFI=overlapfifo.begin();
	OFI++;
	int32 overlaplen=shredlen-shredoffsetinc;
	int32 totalshredoffset=shredoffsetinc;
	uint32 numelements=1;
	while(OFI != overlapfifo.end()) {
	  if(overlaplen<=0) break;

	  AlignedDualSeqFacts tmpadsf;
	  tmpadsf.publicinit(
	    *OFI,
	    newreadid,
	    static_cast<uint16>(totalshredoffset),
	    static_cast<uint16>(totalshredoffset
				-(AS_readpool.getRead(*OFI).getLenSeq()
				  -AS_readpool.getRead(newreadid).getLenSeq())),
	    0,
	    static_cast<uint16>((AS_readpool.getRead(*OFI).getLenSeq()+
				 AS_readpool.getRead(newreadid).getLenSeq()-overlaplen)),
	    1,
	    1,
	    100);

	  // output of the ADSfacts to file
	  // TODO: real ouput
	  // first weight and direction
	  // TODO: reduce weight to favorise real reads in assembly???
	  adsfout << overlaplen*10000 << "\t1\t";
	  tmpadsf.serialiseOut(adsfout);
	  adsfout << '\n';

	  AS_numADSFacts_fromshreds++;

	  OFI++;
	  overlaplen-=shredoffsetinc;
	  totalshredoffset+=shredoffsetinc;
	  numelements++;
	}
	if(overlapfifo.size()>numelements) overlapfifo.resize(numelements);
      }
      shredcounter++;
    }
    cout << "Shredded " << sourceread.getName() << " into " << shredcounter << " pieces.\n";
  }

  adsfout.close();

  FUNCEND();
}
*/
