#!/bin/bash

# flowt overlap testing, given input file $1:
# for input, and for flowt-reduced input
# gen fasta files, blast against itself, check best hit vs second

input=$1

flowt="../flowt -s=flowt.summary -t 2000"

count(){
    echo " --- count($1)"
    echo -n "Reads extracted from $1: "
    grep '^>' $1 | wc -l
}

# only do this if fasta is newer
# only do this for orig, just use grep for flowt
blast(){
  echo " --- blast($1)"
  formatdb -i $1 -pF
  blastall -p blastn -a 2 -i $1 -d $1 -e 1e-50 -m 8 > ${1}_blast.tab 2> blast.ERR
}

summarize(){
    echo " --- summarize($1)"
    echo "extract bitscore best/second best"
    echo "count number of 'exact' matches"
}

plot(){
    echo " --- plot($1)"
    echo "generate plots of best/second best"
}

# build groups of blast hits, one file per input sequence
group(){
  echo " --- group($1)"
  dir=${1}.d
  mkdir -p $dir
  rm -f $dir/*

  cat $1 | while read line
  do
     out=`echo $line | cut -d' ' -f 1`
     echo $line | sed -e 's/  */	/g' >> $dir/$out.tab
  done
}

# filter out redundant blast results
blastuniq(){ 
    awk '{ if ($1 >= $2) { print } else {}}' 
}

histogram(){
  echo " --- histogram($1)"
  for a in `find $1`; do wc -l $a ; done | sort -n | grep $1 | sed -e 's/ [^ 0-9].*$//g'| uniq -c
}

# identify likely clones in grouped blast files
# $7 and $8 are query begin and end, $9 and $10 are subject begin and end
# divide the number by two (symmetry of blast)
# caveat: LCRs cause clones to be split into separate hits - also no way to check that we match full length
# todo: make it symmetric?
# todo: check that 'self' gets a value - this often fails.
clones(){
   echo "counting probable duplicates in $1, same direction:"
   for a in `find $1.d -name \*.tab`; do
     self=`awk '{if ($1 == $2) { print $8}}' < $a | head -1` #`head -1 $a | cut -f 8`
     # echo DEBUG file= $a self= $self 
     tail -n +2 $a | awk '{if($1 != $2 && $7 < 5 && $9 < 5 && $8 > '$self'*0.9) { print }}' > $a.clones
   done
   echo -n `ls $1.d/*tab.clones | wc -l`" groups, containing matches: "
   ls $1.d/*.tab.clones | wc -l
}

revclones(){
   echo "counting probable duplicates in $1, reverse direction: "
   for a in `find $1.d -name \*.tab`; do
     self=`awk '{if ($1 == $2) { print $8}}' < $a | head -1` #`head -1 $a | cut -f 8`
     # echo DEBUG file= $a self= $self 
     tail -n +2 $a | awk '{if($1 != $2 && $7 < 5 && $10 < 5 && $8 > '$self'*0.9) { print }}'  > $a.revclones
   done
   echo -n `ls $1.d/*tab.revclones | wc -l`" groups, containing matches: "
   ls $1.d/*.tab.revclones | wc -l
}

# plot orig.fasta_blast.tab.dat flowt.fasta_blast.tab.dat
mkplots(){
   for a in orig flowt; do
       for b in clones revclones; do
            wc -l $a.fasta*d/*.tab.$b | sort -n | sed -e 's/ '$a'.*$//g' | grep -v total | uniq -c > $a-$b
       done
   done

   gnuplot <<EOF
set term postscript color
set out "clones.ps"
set log y
set xtics 5
set xrange [-0.5:]
set xlabel "number of duplicates"
set ylabel "number of reads (log scale)"
plot "orig-clones" using (\$2):((\$1)) with impulses title "unfiltered", "flowt-clones" using (\$2+0.1):((\$1)) with impulses title "filtered"
EOF

   gnuplot <<EOF
set term postscript color
set out "revclones.ps"
set log y
set xtics 5
set xrange [-0.5:]
set xlabel "number of duplicates"
set ylabel "number of reads (log scale)"
plot "orig-revclones" using (\$2):((\$1)) with impulses title "unfiltered", "flowt-revclones" using (\$2+0.1):((\$1)) with impulses title "filtered"
EOF

}

group test.tab
exit -1

# one function to bring them all...
run(){
    count $1
    blast $1
    group ${1}_blast.tab
    histogram ${1}_blast.tab.d
    echo "Total number of blast hits: " `wc -l ${1}_blast.tab`
    clones ${1}_blast.tab.d
    revclones ${1}_blast.tab.d
    summarize ${1}_blast.tab
}

flower -r $input > orig.fasta
run orig.fasta

$flowt $input
flower -r unique.sff > flowt.fasta
flower -r duplic.sff >> flowt.fasta
run flowt.fasta

mkplots


