mirror of
https://git.FreeBSD.org/ports.git
synced 2024-12-16 03:24:07 +00:00
d06281fc0f
Approved by: jrm (mentor) Differential Revision: https://reviews.freebsd.org/D15139
381 lines
9.9 KiB
Bash
381 lines
9.9 KiB
Bash
#!/usr/bin/env bash
|
|
|
|
set -e
|
|
|
|
##########################################################################
|
|
# Function description:
|
|
# Pause until user presses return
|
|
##########################################################################
|
|
|
|
pause()
|
|
{
|
|
local junk
|
|
|
|
printf "Press return to continue..."
|
|
read junk
|
|
}
|
|
|
|
|
|
##########################################################################
|
|
# Not necessary if invoking scripts with "bash script-name" as shown
|
|
# in the dDocent tutorial, but supplied for completeness. Tutorial
|
|
# scripts have also been patched upstream to usr /usr/bin/env, but
|
|
# we won't assume they'll all stay that way.
|
|
##########################################################################
|
|
|
|
fix_bash_path()
|
|
{
|
|
# Don't echo these commands
|
|
{ set +x; } 2>/dev/null
|
|
if [ $# != 1 ]; then
|
|
printf "Usage: $0 script\n"
|
|
exit 1
|
|
fi
|
|
sed -i.bak -e "s|#!/bin/bash|#!/usr/bin/env bash|g" $1
|
|
set -x
|
|
}
|
|
|
|
|
|
##########################################################################
|
|
# Main
|
|
##########################################################################
|
|
|
|
##########################################################################
|
|
# Workarounds for running dDocent from FreeBSD ports and Pkgsrc.
|
|
# Include this section in any scripts running dDocent commands.
|
|
##########################################################################
|
|
|
|
# Hack to allow remake_reference.sh, etc. to find trimmomatic.
|
|
# trimmomatic.jar and adapter files are not an executables and should not
|
|
# be in PATH according to filesystem hierarchy standards, but the dDocent
|
|
# scripts are coded to look for them there, because that's how dDocent
|
|
# installs the bundled Trimmomatic.
|
|
if [ -e /usr/local/share/java/classes/trimmomatic.jar ]; then
|
|
export PATH=${PATH}:/usr/local/share/java/classes:/usr/local/share/trimmomatic/adapters
|
|
elif [ -e $PKGSRC/lib/java/trimmomatic.jar ]; then
|
|
export PATH=${PATH}:$PKGSRC/lib/java:$PKGSRC/share/trimmomatic/adapters
|
|
else
|
|
printf "Error: Trimmomatic is not installed in a known location.\n" >> /dev/stderr
|
|
fi
|
|
|
|
if ! pwd | fgrep dDocent-test; then
|
|
mkdir -p dDocent-test
|
|
cd dDocent-test
|
|
fi
|
|
|
|
# remake_reference.sh and some other scripts use GNU extensions in awk
|
|
# commands. Trick systems into using gawk to get around this without
|
|
# having to patch every script.
|
|
mkdir -p ddocent-bin
|
|
ln -sf `which gawk` ddocent-bin/awk
|
|
ln -sf `which python2.7` ddocent-bin/python
|
|
export PATH=`pwd`/ddocent-bin:$PATH
|
|
|
|
##########################################################################
|
|
# Actual dDocent commands below
|
|
##########################################################################
|
|
|
|
##########################################################################
|
|
# Command sequence from the dDocent tutorial on Github. See link below.
|
|
##########################################################################
|
|
|
|
cat << EOM
|
|
|
|
This script runs the test commands described at
|
|
|
|
http://ddocent.com/assembly
|
|
|
|
You may want to follow along on this web page as the script runs. It will
|
|
pause after each step to allow checking the output.
|
|
|
|
EOM
|
|
pause
|
|
|
|
mkdir -p Data
|
|
cd Data
|
|
|
|
printf "Downloading and unpacking data.zip...\n"
|
|
if [ -e data.zip ]; then
|
|
mv data.zip /tmp/dDocent-data.zip
|
|
rm -rf *
|
|
mv /tmp/dDocent-data.zip data.zip
|
|
else
|
|
rm -rf *
|
|
curl --insecure -L -o data.zip \
|
|
'https://www.dropbox.com/s/t09xjuudev4de72/data.zip?dl=0'
|
|
fi
|
|
|
|
# Hack: current data.zip contains data.zip. ??!
|
|
if [ ! -e simRRLs2.py ]; then
|
|
yes | unzip data.zip || true
|
|
fi
|
|
ls -l
|
|
pause
|
|
|
|
set -x
|
|
head SimRAD.barcodes
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
cut -f2 SimRAD.barcodes > barcodes
|
|
head barcodes
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
process_radtags -1 SimRAD_R1.fastq.gz -2 SimRAD_R2.fastq.gz -b barcodes \
|
|
-e ecoRI --renz_2 mspI -r -i gzfastq
|
|
ls | fgrep sample_ | head
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
rm *rem*
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
rm -f Rename_for_dDocent.sh # Always get the latest
|
|
set -x
|
|
curl --insecure -L -O https://github.com/jpuritz/dDocent/raw/master/Rename_for_dDocent.sh
|
|
more Rename_for_dDocent.sh
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
bash Rename_for_dDocent.sh SimRAD.barcodes
|
|
{ set +x; } 2>/dev/null
|
|
|
|
set -x
|
|
ls *.fq.gz
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
ls *.F.fq.gz > namelist
|
|
sed -i'' -e 's/.F.fq.gz//g' namelist
|
|
AWK1='BEGIN{P=1}{if(P==1||P==2){gsub(/^[@]/,">");print}; if(P==4)P=0; P++}'
|
|
AWK2='!/>/'
|
|
AWK3='!/NNN/'
|
|
PERLT='while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}'
|
|
|
|
cat namelist | parallel --no-notice -j 8 "zcat {}.F.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.forward"
|
|
cat namelist | parallel --no-notice -j 8 "zcat {}.R.fq.gz | mawk '$AWK1' | mawk '$AWK2' > {}.reverse"
|
|
cat namelist | parallel --no-notice -j 8 "paste -d '-' {}.forward {}.reverse | mawk '$AWK3' | sed 's/-/NNNNNNNNNN/' | perl -e '$PERLT' > {}.uniq.seqs"
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
cat *.uniq.seqs > uniq.seqs
|
|
for i in {2..20};
|
|
do
|
|
echo $i >> pfile
|
|
done
|
|
cat pfile | parallel --no-notice \
|
|
"echo -n {}xxx && mawk -v x={} '\$1 >= x' uniq.seqs | wc -l" \
|
|
| mawk '{gsub("xxx","\t",$0); print;}'| sort -g > uniqseq.data
|
|
rm pfile
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
more uniqseq.data
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
gnuplot << \EOF
|
|
set terminal dumb size 80, 30
|
|
set autoscale
|
|
set xrange [2:20]
|
|
unset label
|
|
set title "Number of Unique Sequences with More than X Coverage (Counted within individuals)"
|
|
set xlabel "Coverage"
|
|
set ylabel "Number of Unique Sequences"
|
|
plot 'uniqseq.data' with lines notitle
|
|
pause -1
|
|
EOF
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
parallel --no-notice -j 8 mawk -v x=4 \''$1 >= x'\' ::: *.uniq.seqs | cut -f2 | perl -e 'while (<>) {chomp; $z{$_}++;} while(($k,$v) = each(%z)) {print "$v\t$k\n";}' > uniqCperindv
|
|
wc -l uniqCperindv
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
for ((i = 2; i <= 10; i++));
|
|
do
|
|
echo $i >> ufile
|
|
done
|
|
|
|
cat ufile | parallel --no-notice "echo -n {}xxx && mawk -v x={} '\$1 >= x' uniqCperindv | wc -l" | mawk '{gsub("xxx","\t",$0); print;}'| sort -g > uniqseq.peri.data
|
|
rm ufile
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
gnuplot << \EOF
|
|
set terminal dumb size 80, 30
|
|
set autoscale
|
|
unset label
|
|
set title "Number of Unique Sequences present in more than X Individuals"
|
|
set xlabel "Number of Individuals"
|
|
set ylabel "Number of Unique Sequences"
|
|
plot 'uniqseq.peri.data' with lines notitle
|
|
pause -1
|
|
EOF
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
mawk -v x=4 '$1 >= x' uniqCperindv > uniq.k.4.c.4.seqs
|
|
wc -l uniq.k.4.c.4.seqs
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
cut -f2 uniq.k.4.c.4.seqs > totaluniqseq
|
|
mawk '{c= c + 1; print ">Contig_" c "\n" $1}' totaluniqseq > uniq.fasta
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
sed -e 's/NNNNNNNNNN/\t/g' uniq.fasta | cut -f1 > uniq.F.fasta
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
cd-hit-est -i uniq.F.fasta -o xxx -c 0.8 -T 0 -M 0 -g 1
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
mawk '{if ($1 ~ /Cl/) clus = clus + 1; else print $3 "\t" clus}' xxx.clstr | sed 's/[>Contig_,...]//g' | sort -g -k1 > sort.contig.cluster.ids
|
|
paste sort.contig.cluster.ids totaluniqseq > contig.cluster.totaluniqseq
|
|
sort -k2,2 -g contig.cluster.totaluniqseq | sed -e 's/NNNNNNNNNN/\t/g' > rcluster
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
head rcluster
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
cut -f2 rcluster | uniq | wc -l
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
rainbow div -i rcluster -o rbdiv.out
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
rainbow div -i rcluster -o rbdiv.out -f 0.5 -K 10
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
rainbow merge -o rbasm.out -a -i rbdiv.out
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
rainbow merge -o rbasm.out -a -i rbdiv.out -r 2
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
# If missing fdesc mount for bash, <(echo "E") will not work
|
|
# cat rbasm.out <(echo "E") |sed 's/[0-9]*:[0-9]*://g' | mawk ' {
|
|
set -x
|
|
echo "E" > endfile
|
|
cat rbasm.out endfile |sed 's/[0-9]*:[0-9]*://g' | mawk ' {
|
|
if (NR == 1) e=$2;
|
|
else if ($1 ~/E/ && lenp > len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq2 "NNNNNNNNNN" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
|
|
else if ($1 ~/E/ && lenp <= len1) {c=c+1; print ">dDocent_Contig_" e "\n" seq1; seq1=0; seq2=0;lenp=0;e=$2;fclus=0;len1=0;freqp=0;lenf=0}
|
|
else if ($1 ~/C/) clus=$2;
|
|
else if ($1 ~/L/) len=$2;
|
|
else if ($1 ~/S/) seq=$2;
|
|
else if ($1 ~/N/) freq=$2;
|
|
else if ($1 ~/R/ && $0 ~/0/ && $0 !~/1/ && len > lenf) {seq1 = seq; fclus=clus;lenf=len}
|
|
else if ($1 ~/R/ && $0 ~/0/ && $0 ~/1/) {seq1 = seq; fclus=clus; len1=len}
|
|
else if ($1 ~/R/ && $0 ~!/0/ && freq > freqp && len >= lenp || $1 ~/R/ && $0 ~!/0/ && freq == freqp && len > lenp) {seq2 = seq; lenp = len; freqp=freq}
|
|
}' > rainbow.fasta
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
cd-hit-est -i rainbow.fasta -o referenceRC.fasta -M 0 -T 0 -c 0.9
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
rm -f remake_reference.sh
|
|
set -x
|
|
curl --insecure -L -O https://github.com/jpuritz/dDocent/raw/master/scripts/remake_reference.sh
|
|
more remake_reference.sh
|
|
#fix_bash_path remake_reference.sh
|
|
|
|
bash remake_reference.sh 4 4 0.90 PE 2
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
rm -f ReferenceOpt.sh
|
|
set -x
|
|
curl --insecure -L -O https://github.com/jpuritz/dDocent/raw/master/scripts/ReferenceOpt.sh
|
|
more ReferenceOpt.sh
|
|
|
|
bash ReferenceOpt.sh 4 8 4 8 PE 16
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
bash remake_reference.sh 4 4 0.90 PE 2
|
|
head reference.fasta
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
wc -l reference.fasta
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
mawk '/>/' reference.fasta | wc -l
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
set -x
|
|
mawk '/^NAATT.*N*.*CCGN$/' reference.fasta | wc -l
|
|
grep '^NAATT.*N*.*CCGN$' reference.fasta | wc -l
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
|
|
printf "Bonus Section: Optimize reference assemblies? (takes a long time) y/[n] "
|
|
read bonus
|
|
if [ 0$bonus = 0y ]; then
|
|
set -x
|
|
curl -L -O https://raw.githubusercontent.com/jpuritz/dDocent/master/scripts/RefMapOpt.sh
|
|
{ set +x; } 2>/dev/null
|
|
printf "Running dDocent to trim reads.\n"
|
|
pause
|
|
set -x
|
|
cat << EOM | dDocent || true
|
|
yes
|
|
8
|
|
10g
|
|
yes
|
|
no
|
|
no
|
|
no
|
|
bacon@uwm.edu
|
|
EOM
|
|
bash RefMapOpt.sh 4 8 4 8 0.9 64 PE
|
|
{ set +x; } 2>/dev/null
|
|
pause
|
|
more mapping.results
|
|
pause
|
|
fi
|