Parasites dominate hyperdiverse soil protist communities in neotropical rainforests

Table of Contents

Frédéric Mahé, Colomban de Vargas, David Bass, Lucas Czech, Alexandros Stamatakis, Enrique Lara, Jordan Mayor, John Bunge, Sarah Sernaker, Tobias Siemensmeyer, Isabelle Trautmann, Sarah Romac, Cédric Berney, Alexey Kozlov, Edward A. D. Mitchell, Christophe V. W. Seppey, David Singer, Elianne Egge, Rainer Wirth, Gabriel Trueba, and Micah Dunthorn

Supplementary file

1 Disclaimer

The purpose of this document is to provide the reader with details on the bioinformatics methods used to prepare this paper. The code snippets and shell commands presented here were executed on a Debian GNU/Linux 8, and might have to be adapted to your particular system. Use them carefully.

2 Sample geographical coordinates

Sample name Field station Month & year UTM coordinates of 1st sample UTM coordinates of 2nd sample
B005_B006 Baro Colorado Island, Panama October 2012 UTM17P 0628970 1012760 UTM17P 0628778 1012696
B007_B008 Baro Colorado Island, Panama October 2012 UTM17P 0628598 1012614 UTM17P 0628436 1012656
B010 Baro Colorado Island, Panama October 2012 UTM17P 0628105 1012533  
B011_B012 Baro Colorado Island, Panama October 2012 UTM17P 0628590 1012512 UTM17P 0628465 1012407
B013_B014 Baro Colorado Island, Panama October 2012 UTM17P 0628415 1012269 UTM17P 0628342 1012108
B020 Baro Colorado Island, Panama October 2012 UTM17P 0629156 1011959  
B029_B030 Baro Colorado Island, Panama October 2012 UTM17P 0627727 1011629 UTM17P 0627712 1011853
B031_B032 Baro Colorado Island, Panama October 2012 UTM17P 0627585 1011950 UTM17P 0627360 1012080
B033_B034 Baro Colorado Island, Panama October 2012 UTM17P 0627316 1012254 UTM17P 0627251 1012431
B035_B036 Baro Colorado Island, Panama October 2012 UTM17P 0627236 1012614 UTM17P 0627369 1012832
B037_B038 Baro Colorado Island, Panama October 2012 UTM17P 0627269 1012952 UTM17P 0627389 1013115
B039_B040 Baro Colorado Island, Panama October 2012 UTM17P 0627514 1013261 UTM17P 0627457 1013396
B043_B044 Baro Colorado Island, Panama October 2012 UTM17P 0627008 1013156 UTM17P 0626933 1013191
B045_B046 Baro Colorado Island, Panama October 2012 UTM17P 0626925 1013368 UTM17P 0626948 1013575
B047_B048 Baro Colorado Island, Panama October 2012 UTM17P 0626961 1013769 UTM17P 0626809 1013035
B050 Baro Colorado Island, Panama October 2012 UTM17P 0626654 1013151  
B051_B052 Baro Colorado Island, Panama October 2012 UTM17P 0625351 1010594 UTM17P 0625463 1010743
B060 Baro Colorado Island, Panama October 2012 UTM17P 0625853 1011771  
B070 Baro Colorado Island, Panama October 2012 UTM17P 0625462 1012036  
B080 Baro Colorado Island, Panama October 2012 UTM17P 0625613 1011717  
B081_B082 Baro Colorado Island, Panama October 2012 UTM17P 0625691 1010992 UTM17P 0625784 1011005
B090 Baro Colorado Island, Panama October 2012 UTM17P 0626449 1011141  
B100 Baro Colorado Island, Panama October 2012 UTM17P 0626820 1010851  
B129_B130 Baro Colorado Island, Panama June 2013 UTM17P 0627725 1011628 UTM17P 0627708 1011851
B133_B134 Baro Colorado Island, Panama June 2013 UTM17P 0627313 1012256 UTM17P 0627249 1012431
B135_B136 Baro Colorado Island, Panama June 2013 UTM17P 0627241 1012619 UTM17P 0627364 1012833
B143_B144 Baro Colorado Island, Panama June 2013 UTM17P 0627003 1013154 UTM17P 0626930 1013189
B145_B146 Baro Colorado Island, Panama June 2013 UTM17P 0626926 1013364 UTM17P 0626940 1013567
B155_B156 Baro Colorado Island, Panama June 2013 UTM17P 0625602 1011278 UTM17P 0625678 1011447
B163_B164 Baro Colorado Island, Panama June 2013 UTM17P 0626219 1012716 UTM17P 0626088 1012872
B167_B168 Baro Colorado Island, Panama June 2013 UTM17P 0625741 1012089 UTM17P 0625654 1012060
B173_B174 Baro Colorado Island, Panama June 2013 UTM17P 0625124 1012101 UTM17P 0625041 1012107
B175_B176 Baro Colorado Island, Panama June 2013 UTM17P 0624940 1012141 UTM17P 0624848 1012169
B177_B178 Baro Colorado Island, Panama June 2013 UTM17P 0624683 1012232 UTM17P 0625349 1011959
B183_B184 Baro Colorado Island, Panama June 2013 UTM17P 0625858 1011059 UTM17P 0625868 1011167
B185_B186 Baro Colorado Island, Panama June 2013 UTM17P 0626968 1011326 UTM17P 0626048 1011324
B193_B194 Baro Colorado Island, Panama June 2013 UTM17P 0626634 1011016 UTM17P 0626615 1010921
B197_B198 Baro Colorado Island, Panama June 2013 UTM17P 0626917 1010726 UTM17P 0626991 1010781
B199_B200 Baro Colorado Island, Panama June 2013 UTM17P 0626909 1010818 UTM17P 0626819 1010851
L001_L002 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827508 1151992 UTM16P 0827291 1151572
L005_L006 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827077 1151198 UTM16P 0826996 1151174
L007_L008 La Selva Biological Station, Costa Rica October 2012 UTM16P 0826878 1151240 UTM16P 0826762 1151312
L010 La Selva Biological Station, Costa Rica October 2012 UTM16P 0826519 1151490  
L011_L012 La Selva Biological Station, Costa Rica October 2012 UTM16P 0826394 1151574 UTM16P 0826269 1151653
L013_L014 La Selva Biological Station, Costa Rica October 2012 UTM16P 0826150 1151733 UTM16P 0826029 1151811
L015_L016 La Selva Biological Station, Costa Rica October 2012 UTM16P 0825892 1151890 UTM16P 0825766 1151969
L018 La Selva Biological Station, Costa Rica October 2012 UTM16P 0825996 1152327  
L019_L020 La Selva Biological Station, Costa Rica October 2012 UTM16P 0826174 1152210 UTM16P 0826365 1152076
L021_L022 La Selva Biological Station, Costa Rica October 2012 UTM16P 0826485 1152007 UTM16P 0826584 1152115
L023_L024 La Selva Biological Station, Costa Rica October 2012 UTM16P 0826646 1152202 UTM16P 0826667 1152296
L025_L026 La Selva Biological Station, Costa Rica October 2012 UTM16P 0826806 1152440 UTM16P 0826967 1152538
L027_L028 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827145 1152596 UTM16P 0827146 1152783
L030 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827269 1153203  
L031_L032 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827271 1153264 UTM16P 0827195 1153341
L035_L036 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827424 1153638 UTM16P 0827532 1153786
L037_L038 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827560 1153865 UTM16P 0827648 1153883
L039_L040 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827721 1153851 UTM16P 0827807 1153847
L041_L042 La Selva Biological Station, Costa Rica October 2012 UTM16P 0828171 1153424 UTM16P 0828107 1153293
L043_L044 La Selva Biological Station, Costa Rica October 2012 UTM16P 0828029 1153133 UTM16P 0828001 1153148
L045_L046 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827919 1153208 UTM16P 0827838 1153261
L049_L050 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827539 1153343 UTM16P 0827420 1153464
L051_L052 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827116 1154647 UTM16P 0827062 1154729
L053_L054 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827935 1154770 UTM16P 0826975 1154978
L055_L056 La Selva Biological Station, Costa Rica October 2012 UTM16P 0826988 1155125 UTM16P 0827084 1155185
L057_L058 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827174 1155148 UTM16P 0827040 1154956
L059_L060 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827158 1154892 UTM16P 0827310 1154811
L061_L062 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827402 1153917 UTM16P 0827384 1154098
L063_L064 La Selva Biological Station, Costa Rica October 2012 UTM16P 0827207 1154131 UTM16P 0827034 1154065
L065_L066 La Selva Biological Station, Costa Rica October 2012 UTM16P 0826917 1153921 UTM16P 0826813 1153803
L067_L068 La Selva Biological Station, Costa Rica October 2012 UTM16P 0826690 1153656 UTM16P 0826453 1153688
L069_L070 La Selva Biological Station, Costa Rica October 2012 UTM16P 0826297 1153706 UTM16P 0826165 1153716
L071_L072 La Selva Biological Station, Costa Rica October 2012 UTM16P 0826083 1153577 UTM16P 0826030 1153502
L073_L074 La Selva Biological Station, Costa Rica October 2012 UTM16P 0825967 1153430 UTM16P 0825871 1153305
L075_L076 La Selva Biological Station, Costa Rica October 2012 UTM16P 0825850 1153264 UTM16P 0825795 1153187
L077_L078 La Selva Biological Station, Costa Rica October 2012 UTM16P 0825693 1153019 UTM16P 0825645 1152937
L079_L080 La Selva Biological Station, Costa Rica October 2012 UTM16P 0825587 1152850 UTM16P 0825554 1152810
L081_L082 La Selva Biological Station, Costa Rica October 2012 UTM16P 0825522 1152730 UTM16P 0825462 1152695
L083_L084 La Selva Biological Station, Costa Rica October 2012 UTM16P 0825410 1152612 UTM16P 0825360 1152529
L085_L086 La Selva Biological Station, Costa Rica October 2012 UTM16P 0825433 1152750 UTM16P 0825436 1152843
L089_L090 La Selva Biological Station, Costa Rica October 2012 UTM16P 0825676 1153060 UTM16P 0825715 1153131
L092 La Selva Biological Station, Costa Rica October 2012 UTM16P 0824007 1154316  
L093_L094 La Selva Biological Station, Costa Rica October 2012 UTM16P 0823943 1154246 UTM16P 0823882 1154176
L095_L096 La Selva Biological Station, Costa Rica October 2012 UTM16P 0823820 1154084 UTM16P 0823952 1153994
L097_L098 La Selva Biological Station, Costa Rica October 2012 UTM16P 0823975 1153968 UTM16P 0824068 1153931
L099_L100 La Selva Biological Station, Costa Rica October 2012 UTM16P 0824104 1153916 UTM16P 0824192 1153852
L101_L102 La Selva Biological Station, Costa Rica June 2013 UTM16P 0827504 1151992 UTM16P 0827829 1151568
L103_L104 La Selva Biological Station, Costa Rica June 2013 UTM16P 0827209 1151406 UTM16P 0827137 1151325
L109_L110 La Selva Biological Station, Costa Rica June 2013 UTM16P 0826633 1151391 UTM16P 0826519 1151490
L111_L112 La Selva Biological Station, Costa Rica June 2013 UTM16P 0826394 1151576 UTM16P 0826271 1151650
L115_L116 La Selva Biological Station, Costa Rica June 2013 UTM16P 0825894 1151894 UTM16P 0825766 1151965
L117_L118 La Selva Biological Station, Costa Rica June 2013 UTM16P 0825642 1152064 UTM16P 0825989 1152325
L119_L120 La Selva Biological Station, Costa Rica June 2013 UTM16P 0826171 1152214 UTM16P 0826364 1152082
L123_L124 La Selva Biological Station, Costa Rica June 2013 UTM16P 0826651 1152202 UTM16P 0826670 1152300
L125_L126 La Selva Biological Station, Costa Rica June 2013 UTM16P 0826812 1152435 UTM16P 0826966 1152539
L129_L130 La Selva Biological Station, Costa Rica June 2013 UTM16P 0827187 1152981 UTM16P 0827272 1153202
L131_L132 La Selva Biological Station, Costa Rica June 2013 UTM16P 0827263 1153262 UTM16P 0827192 1153339
L137_L138 La Selva Biological Station, Costa Rica June 2013 UTM16P 0827557 1153864 UTM16P 0827651 1153880
L139_L140 La Selva Biological Station, Costa Rica June 2013 UTM16P 0827718 1153849 UTM16P 0827807 1153842
L145_L146 La Selva Biological Station, Costa Rica June 2013 UTM16P 0827921 1153205 UTM16P 0827834 1153262
L151_L152 La Selva Biological Station, Costa Rica June 2013 UTM16P 0827116 1154647 UTM16P 0827062 1154729
L155_L156 La Selva Biological Station, Costa Rica June 2013 UTM16P 0826988 1155125 UTM16P 0827084 1155185
L159_L160 La Selva Biological Station, Costa Rica June 2013 UTM16P 0827158 1154892 UTM16P 0827310 1154811
L161_L162 La Selva Biological Station, Costa Rica June 2013 UTM16P 0827406 1153923 UTM16P 0827385 1154093
L165_L166 La Selva Biological Station, Costa Rica June 2013 UTM16P 0826917 1153919 UTM16P 0826811 1153803
L171_L172 La Selva Biological Station, Costa Rica June 2013 UTM16P 0826083 1153578 UTM16P 0826029 1153498
L173_L174 La Selva Biological Station, Costa Rica June 2013 UTM16P 0825965 1153426 UTM16P 0825873 1153301
L175_L176 La Selva Biological Station, Costa Rica June 2013 UTM16P 0825854 1153267 UTM16P 0825794 1153187
L177_L178 La Selva Biological Station, Costa Rica June 2013 UTM16P 0825694 1153018 UTM16P 0825643 1152935
L179_L180 La Selva Biological Station, Costa Rica June 2013 UTM16P 0825588 1152849 UTM16P 0825552 1152809
L181_L182 La Selva Biological Station, Costa Rica June 2013 UTM16P 0825519 1152726 UTM16P 0825464 1152695
L183_L184 La Selva Biological Station, Costa Rica June 2013 UTM16P 0825412 1152618 UTM16P 0825359 1152531
L185_L186 La Selva Biological Station, Costa Rica June 2013 UTM16P 0825434 1152750 UTM16P 0825433 1152845
L187_L188 La Selva Biological Station, Costa Rica June 2013 UTM16P 0825520 1152898 UTM16P 0825598 1152952
L189_L190 La Selva Biological Station, Costa Rica June 2013 UTM16P 0825677 1153056 UTM16P 0825718 1153130
L191_L192 La Selva Biological Station, Costa Rica June 2013 UTM16P 0824008 1154665 UTM16P 0824007 1154316
L193_L194 La Selva Biological Station, Costa Rica June 2013 UTM16P 0823943 1154246 UTM16P 0823882 1154176
L195_L196 La Selva Biological Station, Costa Rica June 2013 UTM16P 0823820 1154084 UTM16P 0823952 1153994
L197_L198 La Selva Biological Station, Costa Rica June 2013 UTM16P 0823975 1153968 UTM16P 0824068 1153931
L199_L200 La Selva Biological Station, Costa Rica June 2013 UTM16P 0824104 1153916 UTM16P 0824192 1153852
T105_T106 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0373189 9928059 UTM18M 0373467 9928335
T107_T108 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0373335 9928669 UTM18M 0373011 9928958
T109_T110 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0372841 9929029 UTM18M 0372722 9929161
T111 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0372252 9929207  
T125_T126 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0371606 9929833 UTM18M 0371890 9929755
T127_T128 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0372509 9929532 UTM18M 0372598 9929440
T143_T144 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0371293 9930778 UTM18M 0371298 9930670
T151_T152 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0371323 9929828 UTM18M 0371193 9929798
T154 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0370864 9930009  
T159_T160 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0370561 9930026 UTM18M 0370415 9929830
T163_T164 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0370685 9929529 UTM18M 0370873 9929503
T165 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0371053 9929512  
T167_T168 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0371389 9929450 UTM18M 0371462 9929351
T169_T170 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0371579 9929435 UTM18M 0371582 9929323
T171_T172 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0370413 9929909 UTM18M 0370369 9930000
T174 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0370132 9930224  
T175_T176 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0370047 9930268 UTM18M 0369980 9930348
T177_T178 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0369858 9930414 UTM18M 0369852 9930521
T179_T180 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0369835 9930615 UTM18M 0369821 9930699
T182 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0369655 9931026  
T185_186 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0369826 9931183 UTM18M 0369992 9931191
T194 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0370585 9931519  
T195_T196 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0370681 9931513 UTM18M 0370763 9931521
T197_T198 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0370763 9931521 UTM18M 0370970 9931517
T199_T200 Tiputini Biodiversity Station, Ecuador October 2013 UTM18M 0371042 9931492 UTM18M 0371155 9931461

3 Illumina runs (universal V4 primers)

3.1 Assemble paired-ends

# kl
cd ${HOME}/neotropical_diversity/data/

# Assemble
# Merge paired-end Illumina reads with PEAR; the fastq quality
# encoding is guessed from the forward file before each merge.
PEAR="${HOME}/bin/PEAR/src/pear"
GUESS_ENCODING="${HOME}/src/guess-encoding.py"
THREADS=8

find . -name "*_1_1.fastq.bz2" | \
while read R1 ; do
    # Derive the reverse-read file name from the forward one
    R2=$(sed -e 's/_1_1/_1_2/' <<< ${R1})
    # Decompress both files, keeping the compressed originals (-k)
    bunzip2 -k ${R1} ${R2}
    # Quality lines are every 4th line of a fastq file
    ENCODING=$(awk 'NR % 4 == 0' ${R1/.bz2/} | python ${GUESS_ENCODING} 2> /dev/null)
    echo -e "${R1}\t${ENCODING}"
    # Merge the read pairs (output prefix = sample name)
    ${PEAR} -b ${ENCODING} -j ${THREADS} -f ${R1/.bz2/} -r ${R2/.bz2/} -o ${R1/_1.fastq.bz2/}
    # Drop discarded/unassembled reads and the decompressed inputs
    rm -f ${R1/_1.fastq.bz2/}{.discarded,.unassembled.{forward,reverse}}.fastq ${R1/.bz2/} ${R2/.bz2/}
done

exit 0

3.2 Run cutadapt

PRIMERS V4

PRIMER_F="CCAGCASCYGCGGTAATTCC" PRIMER_R="TYRATCAAGAACGAAAGT" ANTI_PRIMER_F="GGAATTACCGCRGSTGCTGG" ANTI_PRIMER_R="ACTTTCGTTCTTGATYRA"

kl
cd ${HOME}/neotropical_diversity/data/2013-12-30/
# Primer-trim and clean every assembled fastq file (script below)
for f in *.assembled.fastq ; do
    bash ../../src/clean_fastq_files.sh ${f}
done

where clean_fastq_files.sh contains:

#! /bin/bash

# Trim V4 primers with cutadapt, keep only reads that carry both
# primers (in either orientation), convert to fasta, dereplicate
# with vsearch, and rename each unique sequence with a hash value.
# Usage: bash clean_fastq_files.sh file.assembled.fastq
# Produces file.assembled.fasta and file.assembled.log

module load python/latest-2.7
export LC_ALL=C

INPUT="${1}"
CUTADAPT="/home/mahe/.local/bin/cutadapt"
# NOTE(review): an OS X binary is referenced here although the
# disclaimer mentions Debian GNU/Linux — confirm the path.
VSEARCH="${HOME}/bin/vsearch/bin/vsearch-1.1.1-osx-x86_64"
HASHING="${HOME}/src/hashing.py"

# Universal V4 primers and their reverse-complements
PRIMER_F="CCAGCASCYGCGGTAATTCC"
PRIMER_R="TYRATCAAGAACGAAAGT"
ANTI_PRIMER_F="GGAATTACCGCRGSTGCTGG"
ANTI_PRIMER_R="ACTTTCGTTCTTGATYRA"

TMP_FORWARD=$(mktemp)
TMP_ANTI_FORWARD=$(mktemp)
TMP_REVERSE=$(mktemp)
TMP_ANTI_REVERSE=$(mktemp)
TMP_FASTA=$(mktemp)
TMP_FASTA_DEREPLICATED=$(mktemp)
FINAL_FASTA=${INPUT/.fastq/.fasta}
LOG=${INPUT/.fastq/.log}

# Get reads containing forward primer (5' position, trimmed)
${CUTADAPT} --discard-untrimmed \
    --format=fastq \
    -g PRIMER_F=${PRIMER_F} \
    -o ${TMP_FORWARD} ${INPUT} > ${LOG}

# Get reads containing reverse primer (3' position, trimmed)
${CUTADAPT} --discard-untrimmed \
    --format=fastq \
    -a PRIMER_R=${PRIMER_R} \
    -o ${TMP_REVERSE} ${TMP_FORWARD} >> ${LOG}

# Get reads containing anti-reverse primer (in 5' position);
# these are reads merged in the reverse orientation
${CUTADAPT} --discard-untrimmed \
    --format=fastq \
    -g ANTI_PRIMER_R=${ANTI_PRIMER_R} \
    -o ${TMP_ANTI_FORWARD} ${INPUT} >> ${LOG}

# Get reads containing anti-forward primer (in 3' position)
${CUTADAPT} --discard-untrimmed \
    --format=fastq \
    -a ANTI_PRIMER_F=${ANTI_PRIMER_F} \
    -o ${TMP_ANTI_REVERSE} ${TMP_ANTI_FORWARD} >> ${LOG}

# Convert fastq to fasta (reverse-complement the second file) and
# discard reads with ambiguous nucleotides. The grep pattern MUST be
# quoted: the unquoted "[^ACGTacgt]" was subject to shell filename
# expansion and could silently change the filter.
(awk '(NR - 2) % 4 == 0' ${TMP_REVERSE}
 awk '(NR - 2) % 4 == 0' ${TMP_ANTI_REVERSE} | \
     tr "acgturykmbdhvACGTURYKMBDHV" "tgcaayrmkvhdbTGCAAYRMKVHDB" | rev) | \
     grep -v "[^ACGTacgt]" | awk '{printf ">a%d\n%s\n", NR, $1}' > ${TMP_FASTA}

rm -f ${TMP_FORWARD} ${TMP_ANTI_FORWARD} ${TMP_REVERSE} ${TMP_ANTI_REVERSE}

# Dereplicate (vsearch), annotating abundances with ";size="
"${VSEARCH}" --threads 1 \
    --derep_fulllength ${TMP_FASTA} \
    --sizeout \
    --fasta_width 0 \
    --output ${TMP_FASTA_DEREPLICATED}

# Rename each unique sequence with a hash value
python ${HASHING} ${TMP_FASTA_DEREPLICATED} > ${FINAL_FASTA}

# Get some basic statistics
# (assumes headers end in "=<abundance>" so that splitting on "="
# exposes the abundance as field 2 — confirm against hashing.py)
awk -F "=" 'BEGIN {OFS = "\t"}
            /^>/ {c += 1 ; s += $2}
            END {
                printf "\n%s\n%s\t%d\n%s\t%d\n", "Basic stats:", "uniques", c, "reads", s
            }' ${FINAL_FASTA} >> ${LOG}

rm -f ${TMP_FASTA} ${TMP_FASTA_DEREPLICATED}

exit 0

3.3 Summarize assembly stats

kl
cd ${HOME}/neotropical_diversity/data/2013-12-30/
# For each sample, compare raw, assembled and primer-trimmed read
# counts, and print one table row per sample
for TARGET in *_1_1.fastq.bz2 ; do
    # fastq files have 4 lines per read
    ASSEMBLED=$(wc -l < ${TARGET/_1.fastq.bz2/.assembled.fastq})
    RAW=$(bzcat ${TARGET} | sed '/^$/d' | wc -l)
    # the last line of the cleaning log holds the clean-read count
    CLEAN_READS=$(tail -n 1 ${TARGET/_1.fastq.bz2/.assembled.log} | cut -f 2)
    awk -v after=$ASSEMBLED -v before=$RAW -v file=$TARGET -v clean_reads=$CLEAN_READS 'BEGIN {printf "| %s | %s | %s | %.1f | %s | %.1f |\n", file, before / 4, after / 4, 100 * after / before, clean_reads, 100 * clean_reads / (after / 4)}'
done
Sample raw assembled % primers %  
B010 335925 302963 90.2 271333 89.6  
B020 698743 628051 89.9 548572 87.3  
B030 513097 464403 90.5 417321 89.9  
B040 715818 647954 90.5 571901 88.3  
B050 290376 234447 80.7 196771 83.9  
B060 749450 673216 89.8 580906 86.3  
B070 103417 78711 76.1 62909 79.9 BAD
B080 861437 786409 91.3 720303 91.6  
B090 926832 845395 91.2 783374 92.7  
B100 1164491 1062649 91.3 964485 90.8  
L010 648490 581480 89.7 516281 88.8  
L020 365611 328282 89.8 295923 90.1  
L030 337266 299225 88.7 259695 86.8  
L040 798176 729663 91.4 655327 89.8  
L050 809286 742541 91.8 679930 91.6  
L060 778942 713255 91.6 634552 89.0  
L070 721544 660151 91.5 588745 89.2  
L080 357786 322517 90.1 279493 86.7  
L090 721010 653954 90.7 588390 90.0  
L100 850231 773550 91.0 682484 88.2  
B033_B034 1254058 1229465 98.0 1216202 98.9  
B035_B036 958535 939631 98.0 933791 99.4  
B037_B038 778871 747752 96.0 741346 99.1  
B039_B040 1219062 1184386 97.2 1176220 99.3  
B043_B044 996756 944365 94.7 935807 99.1  
B045_B046 1049071 1020981 97.3 1013012 99.2  
B047_B048 775763 660413 85.1 649547 98.4  
B051_B052 1307632 1287944 98.5 1275897 99.1  
B081_B082 1038080 1029870 99.2 1024313 99.5  
L049_L050 662252 648952 98.0 631336 97.3  
L051_L052 1301146 1259997 96.8 1247906 99.0  
L053_L054 1185115 1166701 98.4 1153604 98.9  
L055_L056 832961 817196 98.1 793091 97.1  
L057_L058 882125 864370 98.0 840963 97.3  
L059_L060 828171 815897 98.5 804766 98.6  
L061_L062 1014607 988154 97.4 973130 98.5  
L063_L064 1180449 1147513 97.2 1114401 97.1  
L065_L066 900278 892207 99.1 885233 99.2  
L067_L068 845596 813664 96.2 800978 98.4  
L069_L070 924607 908685 98.3 900132 99.1  
L071_L072 1113591 1084217 97.4 1071586 98.8  
L073_L074 649309 632805 97.5 616349 97.4  
L075_L076 829967 812226 97.9 803422 98.9  
L077_L078 900100 876617 97.4 863032 98.5  
L079_L080 1015440 985229 97.0 971827 98.6  
L081_L082 763960 749317 98.1 742182 99.0  
L083_L084 940442 903757 96.1 876820 97.0  
L085_L086 859544 827219 96.2 805254 97.3  
L089_L090 715222 700907 98.0 684859 97.7  
L092 1507356 1486775 98.6 1474852 99.2  
L093_L094 1176553 1155835 98.2 1143333 98.9  
L095_L096 1188829 1072318 90.2 919568 85.8  
L097_L098 996241 938988 94.3 905526 96.4  
L099_L100 734520 371077 50.5 343536 92.6 BAD
B129_B130 970167 939324 96.8 797114 84.9  
B133_B134 799005 780607 97.7 657427 84.2  
B135_B136 811205 789760 97.4 693092 87.8  
B143_B144 692442 682453 98.6 612093 89.7  
B145_B146 664351 658744 99.2 552616 83.9  
B155_B156 706701 701490 99.3 594226 84.7  
B163_B164 869092 858907 98.8 738102 85.9  
B167_B168 310984 292632 94.1 245006 83.7  
B173_B174 504626 492632 97.6 414030 84.0  
B175_B176 749182 733646 97.9 613849 83.7  
B177_B178 2222946 2180551 98.1 1842450 84.5  
B183_B184 879922 871787 99.1 742968 85.2  
B185_B186 742558 734968 99.0 622912 84.8  
B193_B194 894797 886750 99.1 767107 86.5  
B197_B198 361308 332624 92.1 279561 84.0  
B199_B200 872090 862571 98.9 852764 98.9  
L101_L102 1089197 1064229 97.7 1052283 98.9  
L103_L104 649516 631893 97.3 624284 98.8  
L109_L110 818090 798549 97.6 788820 98.8  
L111_L112 785 776 98.9 768 99.0  
L115_L116 864423 845904 97.9 833297 98.5  
L117_L118 1256730 1227061 97.6 1213172 98.9  
L119_L120 722925 696831 96.4 687580 98.7  
L123_L124 799617 778266 97.3 769025 98.8  
L125_L126 915100 903664 98.8 892299 98.7  
L129_L130 267592 260098 97.2 256776 98.7  
L131_L132 733208 716874 97.8 708469 98.8  
L137_L138 1064056 1042652 98.0 1027034 98.5  
L139_L140 776861 758230 97.6 748716 98.7  
L145_L146 405953 376741 92.8 371367 98.6  
L151_L152 828183 815400 98.5 802965 98.5  
L155_L156 868635 856459 98.6 845705 98.7  
L159_L160 1050291 1036189 98.7 1021533 98.6  
L161_L162 680263 574569 84.5 566274 98.6  
L165_L166 970285 934520 96.3 921133 98.6  
T105_T106 1330489 1290427 97.0 1217516 94.3  
T107_T108 925287 898217 97.1 847051 94.3  
T109_T110 908225 888190 97.8 838486 94.4  
T111 1000369 972635 97.2 918878 94.5  
T125_T126 534415 523947 98.0 494750 94.4  
T127_T128 1189530 1144207 96.2 1079855 94.4  
T143_T144 1210476 1193732 98.6 1128481 94.5  
T151_T152 982993 958941 97.6 905167 94.4  
T154 953346 928628 97.4 877436 94.5  
T159_T160 1169335 1127957 96.5 1065079 94.4  
T163_T164 1046247 1016207 97.1 958921 94.4  
T165 1148516 1121331 97.6 1062909 94.8  
T167_T168 793491 764693 96.4 721980 94.4  
T169_T170 1239551 1199528 96.8 1131511 94.3  
T171_T172 893903 870180 97.3 823720 94.7  
T174 96566 92912 96.2 87728 94.4  
T175_T176 972466 930110 95.6 878340 94.4  
T177_T178 978776 913983 93.4 861804 94.3  
T179_T180 866585 842974 97.3 800664 95.0  
T182 479330 470952 98.3 444419 94.4  
T185_186 751697 726801 96.7 613303 84.4  
T194 951579 936589 98.4 804032 85.8  
T195_T196 747342 718620 96.2 613470 85.4  
T197_T198 624433 610147 97.7 521303 85.4  
T199_T200 756374 740389 97.9 623324 84.2  
L001_L002 1072934 1031951 96.2 1008993 97.8  
L005_L006 956295 928881 97.1 911040 98.1  
L007_L008 1063327 1002773 94.3 968882 96.6  
L011_L012 1047186 1013617 96.8 989332 97.6  
L013_L014 1112144 1048956 94.3 1031591 98.3  
L015_L016 1017455 965045 94.8 928228 96.2  
L018 1161781 1101485 94.8 1076679 97.7  
L019_L020 966222 455317 47.1 423843 93.1 BAD
L021_L022 1025153 897418 87.5 862669 96.1  
L023_L024 976304 942437 96.5 933374 99.0  
L025_L026 1196689 1168647 97.7 1146959 98.1  
L027_L028 354453 307397 86.7 223094 72.6 BAD
L030 799487 601927 75.3 439269 73.0 BAD
L031_L032 1043739 1013836 97.1 1004410 99.1  
L035_L036 1164928 1128927 96.9 1112636 98.6  
L037_L038 1405244 1369454 97.5 1318370 96.3  
L039_L040 1097195 766762 69.9 710948 92.7 BAD
L041_L042 1172760 1145787 97.7 1135095 99.1  
L043_L044 845314 821167 97.1 809682 98.6  
L045_L046 1049705 647697 61.7 517090 79.8 BAD
L001_L002 769322 738370 96.0 710346 96.2  
L005_L006 694706 670104 96.5 644291 96.1  
L007_L008 756507 712949 94.2 679395 95.3  
L011_L012 742672 718643 96.8 690414 96.1  
L013_L014 798003 753404 94.4 729373 96.8  
L015_L016 713492 677891 95.0 642509 94.8  
L018 828511 783668 94.6 754147 96.2  
L019_L020 712143 343580 48.2 334496 97.4 BAD
L021_L022 756050 656873 86.9 624906 95.1  
L023_L024 688027 664093 96.5 648445 97.6  
L025_L026 834359 816106 97.8 789868 96.8  
L027_L028 258882 224324 86.7 163289 72.8 BAD
L030 586558 443774 75.7 329815 74.3 BAD
L031_L032 729024 708339 97.2 690625 97.5  
L035_L036 840351 811829 96.6 786883 96.9  
L037_L038 1012734 986925 97.5 934934 94.7  
L039_L040 816919 567508 69.5 529382 93.3 BAD
L041_L042 826382 806477 97.6 785119 97.4  
L043_L044 583848 567893 97.3 552111 97.2  
L045_L046 776990 478406 61.6 394172 82.4 BAD
L171_L172 388910 378917 97.4 350808 92.6  
L173_L174 376048 366111 97.4 340209 92.9  
L175_L176 69712 67878 97.4 63302 93.3  
L177_L178 991293 964861 97.3 894549 92.7  
L179_L180 849174 833633 98.2 772079 92.6  
L181_L182 1015206 1004323 98.9 933643 93.0  
L183_L184 1052554 1020520 97.0 947722 92.9  
L185_L186 988120 980530 99.2 915254 93.3  
L187_L188 1023155 995206 97.3 923912 92.8  
L189_L190 883787 872201 98.7 803930 92.2  
L191_L192 1024683 1011324 98.7 933793 92.3  
L193_L194 789859 765662 96.9 706206 92.2  
L195_L196 924282 909057 98.4 835502 91.9  
L197_L198 824319 814188 98.8 750542 92.2  
L199_L200 1092293 1072852 98.2 988862 92.2  

3.4 Dereplicate the data

Some samples are an equimolar mix of two individual physical samples. I will count them as single molecular samples. In practice, the number of samples is equal to the number of individual fasta files.

kl
cd ${HOME}/neotropical_diversity/data/

# Pool every per-sample fasta file and dereplicate globally.
VSEARCH="${HOME}/bin/vsearch/bin/vsearch"
OUTPUT="neotropical_soil_175_samples.fas"
POOLED=$(mktemp)
DEREPLICATED=$(mktemp)

# Concatenate all sample files from the dated run folders
cat ./201[35]*/[LTB][0-9][0-9][0-9]*.fas > ${POOLED}

# Global dereplication, summing per-sample abundances (vsearch)
"${VSEARCH}" --threads 1 \
    --derep_fulllength ${POOLED} \
    --sizein \
    --sizeout \
    --fasta_width 0 \
    --output ${DEREPLICATED}

# Switch abundance annotations from usearch's ";size=" style to the
# "_" style, and drop the trailing ";" from headers
sed -e '/^>/ s/;size=/_/' \
    -e '/^>/ s/;$//' < ${DEREPLICATED} > ${OUTPUT}

# Compress a copy in the background
bzip2 -9k ${OUTPUT} &

rm ${POOLED} ${DEREPLICATED}

3.5 Clustering (swarm)

# kl
cd ${HOME}/src/
# Target the big memory node (1024 GB)
# Submit the clustering job to LSF: 16 threads on a single host,
# ~200 GB of memory reserved
bsub -q normal -n 16 -R "span[hosts=1] select[model==XEON_E5_4650] rusage[mem=200000]" bash swarm_fastidious.sh ../neotropical_diversity/data/neotropical_soil_175_samples.fas

where swarm.sh contains:

#!/bin/bash -

# Cluster a dereplicated fasta file with swarm (local clustering
# threshold d = 1).
# Target SSE4.1-able nodes, keep all the threads on one host
# Usage: bsub -q normal-s -n 16 -R "span[hosts=1] select[model==XEON_E5_2670] rusage[mem=32768]" bash swarm.sh target.fas

SWARM="${HOME}/bin/swarm/bin/swarm"
FASTA_FILE=$(readlink -f "${1}")
RESOLUTION="1"
THREADS="16"
OUTPUT_SWARMS="${FASTA_FILE%%.*}_${RESOLUTION}.swarms"
OUTPUT_STATS="${FASTA_FILE%%.*}_${RESOLUTION}.stats"
OUTPUT_STRUCT="${FASTA_FILE%%.*}_${RESOLUTION}.struct"
OUTPUT_REPRESENTATIVES="${FASTA_FILE%%.*}_${RESOLUTION}_representatives.fas"

# check if compressed file (decompress alongside the original)
if [[ ${FASTA_FILE##*.} == "bz2" ]] ; then
    bzcat "${FASTA_FILE}" > "${FASTA_FILE/.bz2/}"
    FASTA_FILE="${FASTA_FILE/.bz2/}"
fi

# Common swarm options, kept in an array so the optional "-z" can be
# appended without duplicating the swarm invocation (and so an empty
# option string is never passed as a spurious argument)
SWARM_OPTIONS=(-d "${RESOLUTION}"
               -w "${OUTPUT_REPRESENTATIVES}"
               -i "${OUTPUT_STRUCT}"
               -t "${THREADS}"
               -s "${OUTPUT_STATS}")

# Verify the abundance annotation style: usearch's ";size=" requires
# swarm's "-z" option, the "_" style requires nothing. Testing for
# ";size=" first also copes with headers that happen to contain both
# patterns (the previous 'grep -o ";size=\|_"' detection produced
# multi-line output in that case and wrongly aborted).
if head -n 1 "${FASTA_FILE}" | grep -q ";size=" ; then
    SWARM_OPTIONS+=(-z)
elif ! head -n 1 "${FASTA_FILE}" | grep -q "_" ; then
    echo "Unidentified abundance annotation (\"_\" or \";size=\")." 1>&2
    exit 1
fi


# run swarm
"${SWARM}" "${SWARM_OPTIONS[@]}" < "${FASTA_FILE}" > "${OUTPUT_SWARMS}"


exit 0

3.6 Taxonomic assignment

kl
cd ${HOME}/src/
# Assign a taxonomic path to each OTU representative (V4 reference set)
bash stampa.sh ../neotropical_diversity/data/neotropical_soil_175_samples_1f_representatives.fas SSU_V4

(see stampa for a complete description of the taxonomic assignment process and an updated version of the scripts)

3.7 Basic stats

The first 155 new samples represent 43,834,769 unique sequences (122,020,527 raw reads).

The first 175 samples represent 46,769,632 unique sequences (132,319,222 raw reads).

kl
cd ${HOME}/neotropical_diversity/data/
# Sum abundances (s) and count unique sequences (c); headers are
# ">identifier_abundance", so splitting on "_" exposes the abundance
# as field 2
awk -F "_" '/^>/ {s += $2 ; c += 1} END {print s, c}' neotropical_soil_175_samples.fas

Reads assigned to protists

cd ~/neotropical_diversity/results/stampa/
SOURCE="neotropical_soil_175_samples.OTU.protists_reads.stampa"
# Sum the per-OTU read counts (column 2)
awk '{s += $2} END {print s}' "${SOURCE}"

We have 50,118,536 reads assigned to protists.

Reads assigned to fungi

cd ~/neotropical_diversity/results/stampa/
SOURCE="neotropical_soil_175_samples.OTU.fungi_reads.stampa"
# Sum the per-OTU read counts (column 2)
awk '{s += $2} END {print s}' "${SOURCE}"

We have 44,430,705 reads assigned to fungi.

3.8 Chimera checking

The idea is to perform chimera checking after swarm. I need to:

  • extract OTU representatives into a new fasta file,
  • launch vsearch,
  • collect a list of OTU representatives that are chimeric,

I relaunch the extraction with the new swarm fastidious results, on the new dataset.

kl

DIR="${HOME}/neotropical_diversity/data/"
SRC="${HOME}/src/"
INPUT="neotropical_soil_175_samples_1f_representatives.fas"
VSEARCH_UCHIME="vsearch_chimera.sh"

# Chimera check with vsearch
# (single-threaded LSF job on the long queue, ~20 GB of memory)
cd "${SRC}"
bsub -q long \
    -n 1 \
    -R "select[model==XEON_E5_2670] rusage[mem=20000]" \
    bash "${VSEARCH_UCHIME}" "${DIR}${INPUT}"

where vsearch_chimera.sh contains:

#!/bin/bash -

# De novo chimera detection on OTU representatives with vsearch.
# Target SSE4.1-able nodes, keep all the threads on one host
# Usage: bsub -q long -n 1 -R "select[model==XEON_E5_2670] rusage[mem=16000]" bash vsearch_chimera.sh FASTA

VSEARCH="${HOME}/bin/vsearch/bin/vsearch"
QUERIES=$(readlink -f "${1}")
CHIMERAS="${QUERIES%.*}_chimeras.fas"
UCHIME="${QUERIES%.*}.uchime"

## Verify the abundance annotations (expect ";size="); test grep's
## exit status directly instead of capturing its output only to test
## it for emptiness
if ! head "${QUERIES}" | grep -q ";size=" ; then
    echo "Abundance annotations must be in usearch's style (;size=)." 1>&2
    echo "Creating a temporary file with modified annotation style." 1>&2
    TMP="${QUERIES%.*}.tmp"
    # Replace the first "_" of each header with ";size="
    sed -e '/^>/ s/_/;size=/' "${QUERIES}" > "${TMP}"
    QUERIES="${TMP}"
fi

# Chimera detection (vsearch)
"${VSEARCH}" --uchime_denovo "${QUERIES}" \
    --fasta_width 0 \
    --chimeras "${CHIMERAS}" \
    --uchimeout "${UCHIME}"

# Clean temporary file (if any)
[[ ${QUERIES##*.} == "tmp" ]] && rm -f "${QUERIES}"

exit 0

3.9 Contingency tables

3.9.1 Amplicon table

We have 20 samples that are replicated (double or triple). I decide to merge identical files first (modification of the script):

kl
cd ${HOME}/neotropical_diversity/data/
# List sample names that appear in more than one run folder
# (i.e. the replicated samples)
ls -1 ./201[35]*/[LTB][0-9][0-9][0-9]*.fas | cut -d "/" -f 3 | sort -d | uniq -d
sample #
L001_L002 2
L005_L006 2
L007_L008 2
L011_L012 2
L013_L014 2
L015_L016 2
L018 2
L019_L020 2
L021_L022 2
L023_L024 2
L025_L026 2
L027_L028 2
L030 3
L031_L032 2
L035_L036 2
L037_L038 2
L039_L040 2
L041_L042 2
L043_L044 2
L045_L046 2
kl
cd ${HOME}/neotropical_diversity/src/
module load python/latest-2.7
# Build the amplicons x samples contingency table (script below);
# replicated samples are merged by the script
python amplicon_contingency_table.py ../data/201[35]*/[LTB][0-9][0-9][0-9]*.fas > ../data/neotropical_soil_175_samples.amplicons.table
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
    Read all fasta files and build a sorted amplicon contingency
    table. Usage: python amplicon_contingency_table.py samples_*.fas
"""

from __future__ import print_function

__author__ = "Frédéric Mahé <mahe@rhrk.uni-kl.fr>"
__date__ = "2015/03/10"
__version__ = "$Revision: 1.0"

import os
import sys
import operator

#*****************************************************************************#
#                                                                             #
#                                  Functions                                  #
#                                                                             #
#*****************************************************************************#


def fasta_parse():
    """
    Map amplicon ids, abundances and samples
    """
    separator = ";size="
    fasta_files = sys.argv[1:]
    all_amplicons = dict()
    samples = dict()
    amplicons2samples = dict()
    for fasta_file in fasta_files:
        sample = os.path.basename(fasta_file)
        sample = os.path.splitext(sample)[0]
        samples[sample] = samples.get(sample, 0) + 1
        with open(fasta_file, "rU") as fasta_file:
            for line in fasta_file:
                if line.startswith(">"):
                    amplicon, abundance = line.strip(">\n").split(separator)
                    abundance = int(abundance)
                    if amplicon not in amplicons2samples:
                        amplicons2samples[amplicon] = {sample: abundance}
                    else:
                        # deal with duplicated samples
                        amplicons2samples[amplicon][sample] = amplicons2samples[amplicon].get(sample, 0) + abundance
                    all_amplicons[amplicon] = all_amplicons.get(amplicon, 0) + abundance

    # deal with duplicated samples
    duplicates = [sample for sample in samples if samples[sample] > 1]
    if duplicates:
        print("Warning: some samples are duplicated", file=sys.stderr)
        print("\n".join(duplicates), file=sys.stderr)
    samples = sorted(samples.keys())

    return all_amplicons, amplicons2samples, samples


def main():
    """
    Read all fasta files and build a sorted amplicon contingency table.

    Writes the table to stdout: one row per amplicon, one column per
    sample plus a "total" column, sorted by decreasing total abundance.
    Exits with a non-zero status if per-sample counts do not add up.
    """
    # Parse command line
    all_amplicons, amplicons2samples, samples = fasta_parse()

    # Sort amplicons by decreasing abundance (ties broken by amplicon
    # name). items() replaces the Python 2 only iteritems(), and
    # reverse=True replaces the sort-then-reverse idiom.
    sorted_all_amplicons = sorted(all_amplicons.items(),
                                  key=operator.itemgetter(1, 0),
                                  reverse=True)

    # Print table header
    print("amplicon", "\t".join(samples), "total", sep="\t", file=sys.stdout)

    # Print table content
    for amplicon, abundance in sorted_all_amplicons:
        abundances = [amplicons2samples[amplicon].get(sample, 0)
                      for sample in samples]
        total = sum(abundances)
        abundances = [str(i) for i in abundances]
        # Sanity check: per-sample counts must add up to the total
        if total == abundance:
            print(amplicon, "\t".join(abundances), total, sep="\t",
                  file=sys.stdout)
        else:
            print("Abundance sum is not correct for this amplicon",
                  amplicon, abundance, total, file=sys.stderr)
            sys.exit(-1)

    return


#*****************************************************************************#
#                                                                             #
#                                     Body                                    #
#                                                                             #
#*****************************************************************************#

if __name__ == '__main__':

    main()
    # Exit only when run as a script: the former module-level
    # sys.exit(0) also raised SystemExit when the file was imported.
    sys.exit(0)

Sanity check

kl
cd ${HOME}/neotropical_diversity/data/
# The table should have one line per amplicon (plus a header); compare
# with the number of fasta headers
wc -l neotropical_soil_175_samples.amplicons.table
grep -c "^>" neotropical_soil_175_samples.fas

OK.

3.9.2 OTU table

We have 20 samples that are replicated (in duplicate or triplicate). Since that technical replication was not done systematically, we cannot use it as a denoising approach (sample intersections). To reduce noise, we will keep only abundant OTUs (mass of 3 or more), or small OTUs spread over more than one sample (mass of 2 or 3, present in 2 or 3 samples). Additionally, discarded small OTUs can be salvaged if their proximity with a reference is high (99% for example?). OTUs marked as chimeras will be discarded (but chimeras with high identity with references should be investigated).

kl
cd ${HOME}/projects/Deep_Sea_protists/data/V9/
# Count non-chimeric ($21 == "N"), rare ($20 < 3) OTUs with a high
# identity to a reference; note that "$22 >= 99.0" already covers
# "$22 == 100.0", so the second test is redundant
awk '$21 == "N" && $20 < 3 && ($22 >= 99.0 || $22 == 100.0)' sediment_v9_17_samples.OTU.table2 | wc -l

It saves 82 OTUs out of 664,560. Again, it puts all the pressure on the reference database. It has to be curated carefully!

Modification of the amplicon table python script to produce the OTU table.

# kl
cd ${HOME}/neotropical_diversity/data/

# Input files derived from the fasta basename: swarm stats and swarms,
# stampa taxonomic assignments, and uchime chimera status
FOLDER="${HOME}/neotropical_diversity/src"
SCRIPT="OTU_contingency_table.py"
FASTA="neotropical_soil_175_samples.fas"  # CHANGE HERE!
STATS="${FASTA/.fas/_1f.stats}"
SWARMS="${FASTA/.fas/_1f.swarms}"
STAMPA="${FASTA/.fas/_representatives.results}"
UCHIME="${FASTA/.fas/_representatives.uchime}"
OTU_TABLE="${FASTA/.fas/.OTU.table}"

module load python/latest-2.7

# Positional arguments: stampa results, stats, swarms, uchime, then
# all per-sample fasta files
python "${FOLDER}/${SCRIPT}" "${STAMPA}" "${STATS}" "${SWARMS}" "${UCHIME}" ./201[35]*/[LTB][0-9][0-9][0-9]*.fas > "${OTU_TABLE}"
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
    Read all fasta files and build a sorted amplicon contingency
    table. Usage: python OTU_contingency_table.py [input files]
"""

from __future__ import print_function

__author__ = "Frédéric Mahé <mahe@rhrk.uni-kl.fr>"
__date__ = "2016/03/03"
__version__ = "$Revision: 3.0"

import os
import re
import sys
import operator

#*****************************************************************************#
#                                                                             #
#                                  Functions                                  #
#                                                                             #
#*****************************************************************************#


def stampa_parse():
    """
    Map amplicon ids to taxonomic assignments.

    Reads the stampa results file given as sys.argv[1]; each line has
    five tab-separated fields: amplicon, abundance, identity, taxonomy,
    references.

    Returns a dict: amplicon -> (identity, taxonomy, references).
    """
    separator = "\t"
    stampa_file = sys.argv[1]
    stampa = dict()
    # "r" instead of the deprecated "rU" mode (removed in Python 3.11)
    with open(stampa_file, "r") as handle:
        for line in handle:
            amplicon, abundance, identity, taxonomy, references = line.strip().split(separator)
            stampa[amplicon] = (identity, taxonomy, references)

    return stampa


def stats_parse():
    """
    Map OTU seeds to their mass (total abundance).

    Reads the swarm stats file given as sys.argv[2]; the second and
    third tab-separated columns hold the OTU mass and the seed id.

    Returns (stats, sorted_stats):
      - stats: dict seed -> mass,
      - sorted_stats: list of (seed, mass) tuples sorted by decreasing
        mass, ties broken by seed name.
    """
    separator = "\t"
    stats_file = sys.argv[2]
    stats = dict()
    # "r" instead of the deprecated "rU" mode (removed in Python 3.11)
    with open(stats_file, "r") as handle:
        for line in handle:
            mass, seed = line.strip().split(separator)[1:3]
            stats[seed] = int(mass)
    # Sort OTUs by decreasing mass; items() replaces the Python 2 only
    # iteritems(), reverse=True replaces the sort-then-reverse idiom
    sorted_stats = sorted(stats.items(),
                          key=operator.itemgetter(1, 0),
                          reverse=True)

    return stats, sorted_stats


def swarms_parse():
    """
    Map OTU seeds to the amplicons contained in each OTU (swarm).

    Reads the swarms file given as sys.argv[3]: one OTU per line,
    space-separated amplicons with "_abundance" or ";size=N;" style
    abundance annotations.

    Returns a dict: seed -> [list of amplicon ids] (the value is a
    one-element list of lists, the shape iterated by print_table()).
    """
    separator = "_[0-9]+|;size=[0-9]+;| "  # parsing of abundance annotations
    swarms_file = sys.argv[3]
    swarms = dict()
    # "r" instead of the deprecated "rU" mode (removed in Python 3.11)
    with open(swarms_file, "r") as handle:
        for line in handle:
            line = line.strip()
            # re.split leaves empty strings between back-to-back
            # separators ("_10" then " "); keeping every other element
            # ([0::2]) retains only the amplicon ids
            amplicons = re.split(separator, line)[0::2]
            seed = amplicons[0]
            swarms[seed] = [amplicons]

    return swarms


def uchime_parse():
    """
    Map OTU seeds to their chimera status flag.

    Reads the uchime output given as sys.argv[4] (tab-separated): the
    seed id is in the second column (its ";size=" annotation stripped)
    and the status flag in the 18th column. Lines without a second
    column are skipped; lines without an 18th column (unfinished runs)
    get the status "NA".

    Returns a dict: seed -> status.
    """
    uchime_file = sys.argv[4]
    uchime = dict()
    # "r" instead of the deprecated "rU" mode (removed in Python 3.11);
    # the unused "separator" local was removed
    with open(uchime_file, "r") as handle:
        for line in handle:
            fields = line.strip().split("\t")
            try:
                seed = fields[1].split(";")[0]
            except IndexError:  # deal with partial line (missing seed)
                continue
            try:
                status = fields[17]
            except IndexError:  # deal with unfinished chimera detection runs
                status = "NA"
            uchime[seed] = status

    return uchime


def fasta_parse():
    """
    Map amplicon ids, abundances and samples.

    Reads the per-sample fasta files from sys.argv[5:]. Headers follow
    usearch's abundance style (">amplicon;size=N"). Sample names are
    file basenames without extension, truncated at the first "_", so
    replicated samples collapse onto the same name and their
    abundances are summed.

    Returns a 3-tuple:
      - all_amplicons: dict amplicon -> total abundance (not used to
        produce the final table),
      - amplicons2samples: dict amplicon -> {sample: abundance},
      - samples: sorted list of sample names.
    """
    separator = ";size="
    fasta_files = sys.argv[5:]
    all_amplicons = dict()
    samples = dict()
    amplicons2samples = dict()
    for fasta_file in fasta_files:
        sample = os.path.basename(fasta_file)
        sample = os.path.splitext(sample)[0]
        sample = sample.split("_")[0]
        samples[sample] = samples.get(sample, 0) + 1
        # "r" instead of the deprecated "rU" mode (removed in Python 3.11)
        with open(fasta_file, "r") as handle:
            for line in handle:
                if line.startswith(">"):
                    amplicon, abundance = line.strip(">\n").split(separator)
                    abundance = int(abundance)
                    if amplicon not in amplicons2samples:
                        amplicons2samples[amplicon] = {sample: abundance}
                    else:
                        # deal with duplicated samples: sum abundances
                        amplicons2samples[amplicon][sample] = amplicons2samples[amplicon].get(sample, 0) + abundance
                    all_amplicons[amplicon] = all_amplicons.get(amplicon, 0) + abundance
    # Warn about samples seen in more than one input file
    duplicates = [sample for sample in samples if samples[sample] > 1]
    if duplicates:
        print("Warning: some samples are duplicated", file=sys.stderr)
        print("\n".join(duplicates), file=sys.stderr)
    samples = sorted(samples.keys())

    return all_amplicons, amplicons2samples, samples


def print_table(stampa, stats, sorted_stats,
                swarms, uchime, amplicons2samples, samples):
    """
    Export the OTU contingency table to stdout.

    One row per OTU (by decreasing mass): rank, seed id, per-sample
    read counts, total, chimera status, and the stampa columns
    (identity, taxonomy, references). Exits with a non-zero status if
    the per-sample counts of an OTU do not add up to its mass.
    """
    # Print table header
    print("OTU", "amplicon", "\t".join(samples),
          "total", "chimera", "identity",
          "taxonomy", "references",
          sep="\t", file=sys.stdout)

    # Print table content
    i = 1
    for seed, abundance in sorted_stats:
        occurrences = dict([(sample, 0) for sample in samples])
        # Accumulate per-sample abundances over all amplicons of the OTU
        for amplicons in swarms[seed]:
            for amplicon in amplicons:
                for sample in samples:
                    occurrences[sample] += amplicons2samples[amplicon].get(sample, 0)
        total = sum([occurrences[sample] for sample in samples])

        # Sanity check: per-sample counts must add up to the OTU mass
        if total == stats[seed]:
            # Deal with incomplete chimera checking
            if seed in uchime:
                chimera_status = uchime[seed]
            else:
                chimera_status = "NA"
            print(i, seed,
                  "\t".join([str(occurrences[sample]) for sample in samples]),
                  total, chimera_status,
                  "\t".join(stampa[seed]), sep="\t", file=sys.stdout)
        else:
            # Bug fix: report the OTU seed, not the stale inner-loop
            # "amplicon" variable (which held the last amplicon seen,
            # and could even be undefined for an empty swarm)
            print("Abundance sum is not correct for this OTU",
                  seed, abundance, total, file=sys.stderr)
            sys.exit(-1)
        i += 1

    return


def main():
    """
    Assemble and print the OTU contingency table: parse the four
    result files and the per-sample fasta files named on the command
    line, then write the table to stdout.
    """
    taxonomic_assignments = stampa_parse()             # sys.argv[1]
    otu_mass, otus_by_mass = stats_parse()             # sys.argv[2]
    otu_members = swarms_parse()                       # sys.argv[3]
    chimera_flags = uchime_parse()                     # sys.argv[4]
    _, amplicon_occurrences, sample_names = fasta_parse()  # sys.argv[5:]

    print_table(taxonomic_assignments, otu_mass, otus_by_mass,
                otu_members, chimera_flags, amplicon_occurrences,
                sample_names)

    return


#*****************************************************************************#
#                                                                             #
#                                     Body                                    #
#                                                                             #
#*****************************************************************************#

if __name__ == '__main__':

    main()
    # Exit only when run as a script: the former module-level
    # sys.exit(0) also raised SystemExit when the file was imported.
    sys.exit(0)

Sanity check

kl
cd ${HOME}/neotropical_diversity/data/
# OTUs: the table should have one header line more than stats/swarms
wc -l neotropical_soil_175_samples.OTU.table neotropical_soil_175_samples_1f.stats neotropical_soil_175_samples_1f.swarms
# reads: sum the "total" column (157) and compare with the sum of the
# "_abundance" suffixes of the fasta headers
tail -n +2 neotropical_soil_175_samples.OTU.table | awk '{s += $157} END {print s}'
awk -F "_" '/^>/ {s += $2} END {print s}' neotropical_soil_175_samples.fas

Everything's good: 23734698 neotropical_soil_175_samples.OTU.table 23734697 neotropical_soil_175_samples_1f.stats 23734697 neotropical_soil_175_samples_1f.swarms

132319222 reads are present in the fasta and table.

3.9.3 OTU table filtering (protists only)

kl
cd ${HOME}/neotropical_diversity/src/

FOLDER="../data"
SCRIPT="OTU_table_cleaner_protists.py"
OTU_TABLE="neotropical_soil_175_samples.OTU.table"
OTU_FILTERED="${OTU_TABLE/.table/.protists.table}"

module load python/latest-2.7

# Keep non-chimeric protist OTUs; the 99.5 argument is the identity
# threshold used to salvage rare OTUs close to a reference
python "${SCRIPT}" "${FOLDER}/${OTU_TABLE}" 99.5 > "${FOLDER}/${OTU_FILTERED}"

I decide to salvage small OTUs with 99.5% identity with references (appr. 2 differences for V4 sequences).

The reduction of the OTU number is drastic: 23735052 neotropical_soil_175_samples.OTU.table 29093 neotropical_soil_175_samples.OTU.protists.table

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
    Parse the OTU table, and filter out OTUs based on taxonomic
    assignment, chimera status and ecological distribution.
"""

from __future__ import print_function

__author__ = "Frédéric Mahé <mahe@rhrk.uni-kl.fr>"
__date__ = "2015/03/30"
__version__ = "$Revision: 1.0"

import sys
import csv

#*****************************************************************************#
#                                                                             #
#                                  Functions                                  #
#                                                                             #
#*****************************************************************************#


def parse_table(input_file):
    """Parse the OTU table and print the rows of valid protist OTUs.

    An OTU is kept when it is not flagged as a chimera ("N" or "NA"),
    is assigned to a protist (Eukaryota minus Fungi, Metazoa and
    Streptophyta), and is either abundant (total >= 3), or rare but
    close to a reference (identity >= threshold), or rare but observed
    in more than one sample. The identity threshold defaults to 99.5
    and can be overridden with sys.argv[2].
    """
    # Text mode: csv.DictReader expects str lines under Python 3 (the
    # former "rb" mode was a Python 2 idiom)
    with open(input_file, "r") as handle:
        reader = csv.DictReader(handle, delimiter="\t")

        threshold = 99.5
        # Guard against a missing optional argument (was an IndexError)
        if len(sys.argv) > 2 and sys.argv[2]:
            threshold = float(sys.argv[2])
        first_time = True
        not_chimeric = set(["N", "NA"])
        non_protists = set(["Fungi", "Metazoa", "Streptophyta"])
        non_samples = ["OTU", "amplicon", "total", "chimera",
                       "identity", "taxonomy", "references"]

        for row in reader:
            # List samples and print the header (first row only)
            if first_time:
                samples = sorted(set(row.keys()) - set(non_samples))
                print("\t".join(non_samples[0:2]), "\t".join(samples),
                      "\t".join(non_samples[2:]), sep="\t", file=sys.stdout)
                first_time = False

            # Exclude chimeras
            if row["chimera"] not in not_chimeric:
                continue

            # Exclude non-protists: keep only Eukaryota whose third
            # taxonomic field is not one of the excluded kingdoms
            taxon = row["taxonomy"].split("|")
            protist = taxon[0] == "Eukaryota" and taxon[2] not in non_protists
            if not protist:
                continue

            # Filter rows on abundance, identity and spread
            identity = float(row["identity"])
            total = int(row["total"])
            if total == 1:
                if identity >= threshold:
                    print_row(row, samples)
            elif total == 2:
                # Bug fix: "occurrences" used to be computed only in
                # the else branch of the identity test but was checked
                # unconditionally afterwards, risking a NameError or a
                # duplicated output row
                occurrences = len([sample for sample in samples
                                   if int(row[sample])])
                if identity >= threshold or occurrences > 1:
                    print_row(row, samples)
            else:
                print_row(row, samples)
    return


def print_row(row, samples):
    """Print one OTU row to stdout: metadata, per-sample counts, then
    the taxonomic columns, all tab-separated."""
    occurrences = [str(row[sample]) for sample in samples]

    print(row["OTU"], row["amplicon"], "\t".join(occurrences),
          row["total"], row["chimera"], row["identity"],
          row["taxonomy"], row["references"], sep="\t", file=sys.stdout)

    return


def main():
    """Command-line entry point: filter the OTU table given as first
    argument and write the protist rows to stdout."""
    parse_table(sys.argv[1])


#*****************************************************************************#
#                                                                             #
#                                     Body                                    #
#                                                                             #
#*****************************************************************************#

if __name__ == '__main__':

    main()
    # Exit only when run as a script: the former module-level
    # sys.exit(0) also raised SystemExit when the file was imported.
    sys.exit(0)
kl
cd ${HOME}/neotropical_diversity/data/
# Sum the reads (column 157) of non-chimeric OTUs (column 158 is "N"
# or "NA") assigned to Eukaryota, excluding the non-protist kingdoms
grep "Eukaryota" neotropical_soil_175_samples.OTU.table | \
    grep -v "Metazoa\|Streptophyta\|Fungi" | \
    awk '{if ($158 == "N" || $158 == "NA") s += $157} END {print s}'

99.88% of OTUs are discarded. The filtered table contains 50,118,359 reads (out of 132,319,222, with 66,756,703 reads assigned to non-protist eukaryotes).

The OTU filtering drops 17.85% of the reads that were assigned to protists and were not chimeras (50,118,359 out of 61,010,252 remain).

  1. # of OTUs observed in the combined dataset of all three forests
kl
cd ${HOME}/neotropical_diversity/data/
# Number of protist OTUs (table lines minus the header)
tail -n +2 neotropical_soil_175_samples.OTU.protists.table | wc -l

We have 29,092 protist OTUs.

3.9.4 OTU table filtering (fungi only)

kl
cd ${HOME}/neotropical_diversity/src/

FOLDER="../data"
SCRIPT="OTU_table_cleaner_fungi.py"
OTU_TABLE="neotropical_soil_175_samples.OTU.table"
OTU_FILTERED="${OTU_TABLE/.table/.fungi.table}"

module load python/latest-2.7

# Keep non-chimeric fungal OTUs; the 99.5 argument is the identity
# threshold used to salvage rare OTUs close to a reference
python "${SCRIPT}" "${FOLDER}/${OTU_TABLE}" 99.5 > "${FOLDER}/${OTU_FILTERED}"

I decide to salvage small OTUs with 99.5% identity with references (appr. 2 differences for V4 sequences).

The reduction of the OTU number is drastic: 23735052 neotropical_soil_175_samples.OTU.table 29093 neotropical_soil_175_samples.OTU.fungi.table

kl
cd ${HOME}/neotropical_diversity/data/
# NOTE(review): this command EXCLUDES Fungi and appears copy-pasted
# from the protists section; for fungal read counts one would expect a
# grep for Fungi instead — confirm before reusing the reported numbers
grep "Eukaryota" neotropical_soil_175_samples.OTU.table | \
    grep -v "Metazoa\|Streptophyta\|Fungi" | \
    awk '{if ($158 == "N" || $158 == "NA") s += $157} END {print s}'

99.88% of OTUs are discarded. The filtered table contains 50,118,359 reads (out of 132,319,222, with 66,756,703 reads assigned to non-protist eukaryotes).

The OTU filtering drops 17.85% of the reads that were assigned to protists and were not chimeras (50,118,359 out of 61,010,252 remain).

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
    Parse the OTU table, and filter out OTUs based on taxonomic
    assignment, chimera status and ecological distribution.
"""

from __future__ import print_function

__author__ = "Frédéric Mahé <mahe@rhrk.uni-kl.fr>"
__date__ = "2015/04/16"
__version__ = "$Revision: 1.0"

import sys
import csv

#*****************************************************************************#
#                                                                             #
#                                  Functions                                  #
#                                                                             #
#*****************************************************************************#


def parse_table(input_file):
    """Parse the OTU table and print the rows of valid fungal OTUs.

    An OTU is kept when it is not flagged as a chimera ("N" or "NA"),
    is assigned to Fungi (within Eukaryota), and is either abundant
    (total >= 3), or rare but close to a reference (identity >=
    threshold), or rare but observed in more than one sample. The
    identity threshold defaults to 99.5 and can be overridden with
    sys.argv[2].
    """
    # Text mode: csv.DictReader expects str lines under Python 3 (the
    # former "rb" mode was a Python 2 idiom)
    with open(input_file, "r") as handle:
        reader = csv.DictReader(handle, delimiter="\t")

        threshold = 99.5
        # Guard against a missing optional argument (was an IndexError)
        if len(sys.argv) > 2 and sys.argv[2]:
            threshold = float(sys.argv[2])
        first_time = True
        not_chimeric = set(["N", "NA"])
        non_samples = ["OTU", "amplicon", "total", "chimera",
                       "identity", "taxonomy", "references"]

        for row in reader:
            # List samples and print the header (first row only)
            if first_time:
                samples = sorted(set(row.keys()) - set(non_samples))
                print("\t".join(non_samples[0:2]), "\t".join(samples),
                      "\t".join(non_samples[2:]), sep="\t", file=sys.stdout)
                first_time = False

            # Exclude chimeras
            if row["chimera"] not in not_chimeric:
                continue

            # Keep only Eukaryota whose third taxonomic field is Fungi
            taxon = row["taxonomy"].split("|")
            fungi = taxon[0] == "Eukaryota" and taxon[2] == "Fungi"
            if not fungi:
                continue

            # Filter rows on abundance, identity and spread
            identity = float(row["identity"])
            total = int(row["total"])
            if total == 1:
                if identity >= threshold:
                    print_row(row, samples)
            elif total == 2:
                # Bug fix: "occurrences" used to be computed only in
                # the else branch of the identity test but was checked
                # unconditionally afterwards, risking a NameError or a
                # duplicated output row
                occurrences = len([sample for sample in samples
                                   if int(row[sample])])
                if identity >= threshold or occurrences > 1:
                    print_row(row, samples)
            else:
                print_row(row, samples)
    return


def print_row(row, samples):
    """Print one OTU row to stdout: metadata, per-sample counts, then
    the taxonomic columns, all tab-separated."""
    occurrences = [str(row[sample]) for sample in samples]

    print(row["OTU"], row["amplicon"], "\t".join(occurrences),
          row["total"], row["chimera"], row["identity"],
          row["taxonomy"], row["references"], sep="\t", file=sys.stdout)

    return


def main():
    """Command-line entry point: filter the OTU table given as first
    argument and write the fungal rows to stdout."""
    parse_table(sys.argv[1])


#*****************************************************************************#
#                                                                             #
#                                     Body                                    #
#                                                                             #
#*****************************************************************************#

if __name__ == '__main__':

    main()
    # Exit only when run as a script: the former module-level
    # sys.exit(0) also raised SystemExit when the file was imported.
    sys.exit(0)

How many OTUs and reads are assigned to selected fungal taxa?

# aragorn
cd ~/neotropical_diversity/results/first_155_samples/

TABLE="neotropical_soil_175_samples.OTU.fungi.table"
# 1-based index of the "total" column in the header
TOTAL=$(head -n 1 "${TABLE}" | tr "\t" "\n" | nl -n ln | grep total | cut -f 1)

# Count OTUs and reads per taxon of interest; inside awk, "$TOTAL"
# dereferences the column whose index is held in the TOTAL variable
for TAXA in Cryptomycota Mucoromycota ; do
    grep "${TAXA}" "${TABLE}" | \
        awk \
        -v TOTAL="${TOTAL}" \
        -v TAXA="${TAXA}" \
        '{OTU += 1 ; reads += $TOTAL} END {print TAXA, OTU, reads}'
done
Taxa OTUs reads
Cryptomycota 1712 1411408
Mucoromycota 692 977271

3.9.5 OTU table filtering (apicomplexans only)

Extract OTUs assigned to Apicomplexa

cd ~/neotropical_diversity/results/first_155_samples/

PROTISTS="neotropical_soil_175_samples.OTU.protists.table"
APICOMPLEXA="${PROTISTS/protists/apicomplexa}"

# Keep the header, then every row whose taxonomy mentions Apicomplexa
head -n 1 "${PROTISTS}" > "${APICOMPLEXA}"
grep "Apicomplexa" "${PROTISTS}" >> "${APICOMPLEXA}"

Use R to summarize data

library(dplyr)
library(tidyr)
library(ggplot2)
library(reshape2)

## Load data
setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "neotropical_soil_175_samples.OTU.apicomplexa.table"

## Import and format data
otu_table <- read.table(input, sep = "\t", header = TRUE, dec = ".") %>%
    tbl_df()

## One read-count variable per forest (sample columns start with "B",
## "L" or "T"; "taxonomy" and "total" are dropped after selecting on
## "T", presumably because starts_with() matches case-insensitively)
otu_table$Barro <- otu_table %>% select(starts_with("B")) %>% rowSums()
otu_table$LaSelva <- otu_table %>% select(starts_with("L")) %>% rowSums()
otu_table$Tiputini <- otu_table %>%
    select(starts_with("T")) %>%
    select(-taxonomy, -total) %>%
    rowSums()

## Long format: one row per OTU x forest, grouped by forest
otu_table <- otu_table %>%
    select(one_of("OTU", "Barro", "Tiputini", "LaSelva")) %>%
    gather("forest", "reads", 2:4) %>%
    group_by(forest)

## Number of apicomplexan reads per forest
reads_per_forest <- summarise(otu_table, sum = sum(reads))
print(reads_per_forest)

## Number of apicomplexan OTUs per forest (OTUs with at least one read)
otus_per_forest <- otu_table %>% filter(reads > 0) %>% count(forest)
print(otus_per_forest)

quit(save = "no")
  1. % of total reads in just La Selva that are taxonomically assigned to the Apicomplexans
  2. % of total reads in just Barro that are taxonomically assigned to the Apicomplexans
  3. % of total reads in just Tiputini that are taxonomically assigned to the Apicomplexans
Forest Protists reads Apicomplexa reads % protist OTUs Apicomplexa OTUs %
Barro 17624328 16387261 92.98 7669 4606 60.06
LaSelva 23209275 17784839 76.63 18224 7870 43.18
Tiputini 9284933 8315426 89.56 5718 3514 61.46

3.9.6 OTU table filtering (apicomplexans only, after removing Lucas' unplaced OTUs)

Extract OTUs assigned to Apicomplexa

# aragorn
cd ~/neotropical_diversity/results/first_155_samples/

PROTISTS="neotropical_soil_175_samples.OTU.protists_cleaned.table"
APICOMPLEXA="${PROTISTS/protists/apicomplexa}"

# Keep the header, then every row whose taxonomy mentions Apicomplexa
head -n 1 "${PROTISTS}" > "${APICOMPLEXA}"
grep "Apicomplexa" "${PROTISTS}" >> "${APICOMPLEXA}"

Use R to summarize data

library(dplyr)
library(tidyr)
library(ggplot2)
library(reshape2)

## Load data
setwd("~/neotropical_diversity/results/first_155_samples/")

## ---------------------------- protists ----------------------------------- ##

## Import and format data
input <- "neotropical_soil_175_samples.OTU.protists_cleaned.table"
d <- read.table(input, sep = "\t", header = TRUE, dec = ".") %>% tbl_df()

## Create forest variables (sample columns are named after the forest
## initial: B = Barro, L = La Selva, T = Tiputini). "taxonomy" and
## "total" are deselected after selecting on "T", presumably because
## starts_with() matches case-insensitively — confirm against the
## installed dplyr version.
d$Barro <- rowSums(select(d, starts_with("B")))
d$LaSelva <- rowSums(select(d, starts_with("L")))
d$Tiputini <- rowSums(select(d, starts_with("T")) %>%
                  select(-taxonomy, -total))

## Summarize data: long format (one row per OTU x forest), grouped by forest
d <- select(d, one_of("OTU", "Barro", "Tiputini", "LaSelva")) %>%
    gather("forest", "reads", 2:4) %>%
    group_by(forest)

## Number of protist reads per forest
d1 <- summarise(d, sum = sum(reads))
print(d1)

## Number of protist OTUs per forest (OTUs with at least one read)
d2 <- filter(d, reads > 0) %>% count(forest)
print(d2)


## ------------------------- apicomplexans --------------------------------- ##


## Import and format data (same pipeline as above, applied to the
## apicomplexan subset of the cleaned table)
input <- "neotropical_soil_175_samples.OTU.apicomplexa_cleaned.table"
d <- read.table(input, sep = "\t", header = TRUE, dec = ".") %>% tbl_df()

## Create forest variables
d$Barro <- rowSums(select(d, starts_with("B")))
d$LaSelva <- rowSums(select(d, starts_with("L")))
d$Tiputini <- rowSums(select(d, starts_with("T")) %>%
                  select(-taxonomy, -total))

## Summarize data
d <- select(d, one_of("OTU", "Barro", "Tiputini", "LaSelva")) %>%
    gather("forest", "reads", 2:4) %>%
    group_by(forest)

## Number of apicomplexan reads per forest
d1 <- summarise(d, sum = sum(reads))
print(d1)

## Number of apicomplexan OTUs per forest
d2 <- filter(d, reads > 0) %>% count(forest)
print(d2)

quit(save = "no")
  1. % of total reads in just La Selva that are taxonomically assigned to the Apicomplexans
  2. % of total reads in just Barro that are taxonomically assigned to the Apicomplexans
  3. % of total reads in just Tiputini that are taxonomically assigned to the Apicomplexans
Forest Protist reads Apicomplexa reads % Protist OTUs Apicomplexa OTUs %
Barro 16232500 15035494 92.63 7065 4141 58.61
LaSelva 21515829 16250286 75.53 16935 7213 42.59
Tiputini 8903877 8072110 90.66 5217 3197 61.28
Total 46652206 39357890 84.36 26860 13578 50.55

(don't forget that the sum of OTUs per forest is always higher than the total number of OTUs. There are some shared OTUs)

3.9.7 OTU table filtering (Metazoa only)

kl
cd ${HOME}/neotropical_diversity/src/

FOLDER="../data"
SCRIPT="OTU_table_cleaner_metazoa.py"
OTU_TABLE="neotropical_soil_175_samples.OTU.table"
OTU_FILTERED="${OTU_TABLE/.table/.metazoa.table}"

module load python/latest-2.7

# Keep non-chimeric metazoan OTUs; the 99.5 argument is the identity
# threshold used to salvage rare OTUs close to a reference
python "${SCRIPT}" "${FOLDER}/${OTU_TABLE}" 99.5 > "${FOLDER}/${OTU_FILTERED}"

cd ../data
bzip2 -9k neotropical_soil_175_samples.OTU.metazoa.table
# Bug fix: a stray "${OTU_FILTERED}" expansion stood alone on the next
# line, which would try to execute the table file as a command; list
# the resulting file instead
ls -l "${OTU_FILTERED}"

I decide to salvage small OTUs with 99.5% identity with references (appr. 2 differences for V4 sequences).

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
    Parse the OTU table, and filter out OTUs based on taxonomic
    assignment, chimera status and ecological distribution.
"""

from __future__ import print_function

__author__ = "Frédéric Mahé <frederic.mahe@cirad.fr>"
__date__ = "2015/10/11"
__version__ = "$Revision: 1.0"

import sys
import csv

#*****************************************************************************#
#                                                                             #
#                                  Functions                                  #
#                                                                             #
#*****************************************************************************#


def parse_table(input_file):
    """Parse the OTU table and print the rows of valid metazoan OTUs.

    An OTU is kept when it is not flagged as a chimera ("N" or "NA"),
    is assigned to Metazoa (within Eukaryota), and is either abundant
    (total >= 3), or rare but close to a reference (identity >=
    threshold), or rare but observed in more than one sample. The
    identity threshold defaults to 99.5 and can be overridden with
    sys.argv[2].
    """
    # Text mode: csv.DictReader expects str lines under Python 3 (the
    # former "rb" mode was a Python 2 idiom)
    with open(input_file, "r") as handle:
        reader = csv.DictReader(handle, delimiter="\t")

        # Guard against a missing optional argument (was an IndexError)
        if len(sys.argv) > 2 and sys.argv[2]:
            threshold = float(sys.argv[2])
        else:
            threshold = 99.5
        first_time = True
        not_chimeric = set(["N", "NA"])
        non_samples = ["OTU", "amplicon", "total", "chimera",
                       "identity", "taxonomy", "references"]

        for row in reader:
            # List samples and print the header (first row only)
            if first_time:
                samples = sorted(set(row.keys()) - set(non_samples))
                print("\t".join(non_samples[0:2]), "\t".join(samples),
                      "\t".join(non_samples[2:]), sep="\t", file=sys.stdout)
                first_time = False

            # Exclude chimeras
            if row["chimera"] not in not_chimeric:
                continue

            # Keep only Eukaryota whose third taxonomic field is Metazoa
            taxon = row["taxonomy"].split("|")
            metazoa = taxon[0] == "Eukaryota" and taxon[2] == "Metazoa"
            if not metazoa:
                continue

            # Filter rows on abundance, identity and spread
            identity = float(row["identity"])
            total = int(row["total"])
            if total == 1:
                if identity >= threshold:
                    print_row(row, samples)
            elif total == 2:
                # Bug fix: "occurrences" used to be computed only in
                # the else branch of the identity test but was checked
                # unconditionally afterwards, risking a NameError or a
                # duplicated output row
                occurrences = len([sample for sample in samples
                                   if int(row[sample])])
                if identity >= threshold or occurrences > 1:
                    print_row(row, samples)
            else:
                print_row(row, samples)
    return


def print_row(row, samples):
    """Print one OTU row to stdout: metadata, per-sample counts, then
    the taxonomic columns, all tab-separated."""
    occurrences = [str(row[sample]) for sample in samples]

    print(row["OTU"], row["amplicon"], "\t".join(occurrences),
          row["total"], row["chimera"], row["identity"],
          row["taxonomy"], row["references"], sep="\t", file=sys.stdout)

    return


def main():
    """Entry point: filter the OTU table named as first CLI argument."""
    otu_table = sys.argv[1]
    parse_table(otu_table)
    return


#*****************************************************************************#
#                                                                             #
#                                     Body                                    #
#                                                                             #
#*****************************************************************************#

# Run the filter only when executed as a script (not when imported).
if __name__ == '__main__':

    main()

# Unconditional module-level exit with success status.
sys.exit(0)
# aragorn
cd ~/neotropical_diversity/results/first_155_samples/

OTU_TABLE="neotropical_soil_175_samples.OTU.table"
METAZOA_TABLE="neotropical_soil_175_samples.OTU.metazoa.table"

# Number of OTUs (lines) before and after the metazoa filtering
wc -l "${OTU_TABLE}" "${METAZOA_TABLE}"

# Find the 1-based position of the "total" column in the header
ABUNDANCES=$(head -n 1 "${METAZOA_TABLE}" | tr "\t" "\n" | nl | grep total | cut -f 1 | tr -d " ")

# Sum reads of non-chimeric OTUs (column after "total" holds the chimera flag)
awk \
    -v ABUNDANCES=${ABUNDANCES} \
    '{if (NR > 1 && $(ABUNDANCES + 1) == "N") s += $ABUNDANCES} END {print s}' "${OTU_TABLE}"

# Sum reads retained in the metazoa table (header excluded)
awk \
    -v ABUNDANCES=${ABUNDANCES} \
    '{if (NR > 1) s += $ABUNDANCES} END {print s}' "${METAZOA_TABLE}"

The reduction in the number of OTUs is drastic: 23,734,698 lines in neotropical_soil_175_samples.OTU.table versus 4,384 lines in neotropical_soil_175_samples.OTU.metazoa.table.

From 109,840,006 to 5,715,309 reads.

In the Metazoa table, we have 4,383 OTUs representing 5,715,309 reads.

Our top OTU (1 million reads) is assigned to Craniata and may be a contamination of human DNA. Most mammals have exactly the same 18S rRNA V4 sequence (checked on GenBank [2015-12-24 jeu.]), so these could also be real sequences left by mammals living in the forest. Probably a mix of the two. The sequences mostly come from 4 samples. We may investigate.

3.9.8 OTU table filtering (streptophyta only)

# kl
cd ${HOME}/neotropical_diversity/src/

FOLDER="../data"
SCRIPT="OTU_table_cleaner_streptophyta.py"
OTU_TABLE="neotropical_soil_175_samples.OTU.table"
OTU_FILTERED="${OTU_TABLE/.table/.streptophyta.table}"

module load python/latest-2.7

python "${SCRIPT}" "${FOLDER}/${OTU_TABLE}" 99.5 > "${FOLDER}/${OTU_FILTERED}"

I decided to salvage small OTUs with at least 99.5% identity to references (approx. 2 differences for V4 sequences).

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
    Parse the OTU table, and filter out OTUs based on taxonomic
    assignment, chimera status and ecological distribution.
"""

from __future__ import print_function

__author__ = "Frédéric Mahé <mahe@rhrk.uni-kl.fr>"
__date__ = "2016/03/09"
__version__ = "$Revision: 1.0"

import sys
import csv

#*****************************************************************************#
#                                                                             #
#                                  Functions                                  #
#                                                                             #
#*****************************************************************************#


def parse_table(input_file):
    """Parse the OTU table and print rows for valid Streptophyta OTUs.

    Keeps non-chimeric OTUs assigned to Streptophyta.  Singleton OTUs
    (total == 1) are kept only when their identity to the reference is
    at least `threshold`; doubleton OTUs (total == 2) are kept when
    they pass the identity threshold or occur in more than one sample;
    larger OTUs are always kept.

    The identity threshold (in percent) defaults to 99.5 and can be
    overridden by an optional second command line argument.
    """

    with open(input_file, "rb") as input_file:  # "rb": script targets Python 2's csv
        reader = csv.DictReader(input_file, delimiter="\t")

        # Bounds-checked: the original indexed sys.argv[2] directly,
        # which raised IndexError when the argument was omitted and
        # made the 99.5 default unreachable.
        threshold = 99.5
        if len(sys.argv) > 2 and sys.argv[2]:
            threshold = float(sys.argv[2])
        first_time = True
        # Values of the "chimera" column that mark NON-chimeric OTUs.
        non_chimera_flags = set(["N", "NA"])
        non_samples = ["OTU", "amplicon", "total", "chimera",
                       "identity", "taxonomy", "references"]

        for row in reader:
            # List samples and print header (first row only)
            if first_time:
                samples = list(set(row.keys()) - set(non_samples))
                samples = sorted(samples)
                print("\t".join(non_samples[0:2]), "\t".join(samples),
                      "\t".join(non_samples[2:]), sep="\t", file=sys.stdout)
                first_time = False

            # Exclude chimeras
            if row["chimera"] not in non_chimera_flags:
                continue

            # Exclude non-streptophyta (kingdom is the 3rd taxonomy field)
            streptophyta = False
            taxon = row["taxonomy"].split("|")
            domain = taxon[0]
            if domain == "Eukaryota":
                kingdom = taxon[2]
                if kingdom == "Streptophyta":
                    streptophyta = True

            # Filter rows by abundance class
            identity = float(row["identity"])
            total = int(row["total"])
            if total == 1:
                if streptophyta and identity >= threshold:
                    print_row(row, samples)
            elif total == 2:
                if streptophyta:
                    if identity >= threshold:
                        print_row(row, samples)
                    else:
                        # Below the identity threshold: salvage the OTU
                        # only if it occurs in more than one sample.
                        # (Bug fix: this check was previously outside the
                        # else branch, so `occurrences` could be unbound
                        # or stale from a previous row, causing a
                        # NameError or a duplicated output row.)
                        occurrences = len([sample for sample in samples
                                           if int(row[sample])])
                        if occurrences > 1:
                            print_row(row, samples)
            else:
                if streptophyta:
                    print_row(row, samples)
    return


def print_row(row, samples):
    """Write one OTU table row to stdout as tab-separated values.

    Per-sample abundances are printed in the order given by `samples`,
    framed by the OTU metadata columns.
    """
    abundances = "\t".join(str(row[sample]) for sample in samples)
    fields = (row["OTU"], row["amplicon"], abundances,
              row["total"], row["chimera"], row["identity"],
              row["taxonomy"], row["references"])
    print(*fields, sep="\t", file=sys.stdout)
    return


def main():
    """Entry point: filter the OTU table named as first CLI argument."""
    otu_table = sys.argv[1]
    parse_table(otu_table)
    return


#*****************************************************************************#
#                                                                             #
#                                     Body                                    #
#                                                                             #
#*****************************************************************************#

# Run the filter only when executed as a script (not when imported).
if __name__ == '__main__':

    main()

# Unconditional module-level exit with success status.
sys.exit(0)

3.9.9 OTU table filtering (colpodean only)

Extract OTUs assigned to Colpodea

cd ~/neotropical_diversity/results/first_155_samples/

PROTISTS="neotropical_soil_175_samples.OTU.protists.table"
COLPODEA="${PROTISTS/protists/colpodea}"

# Copy the header, then append all rows mentioning Colpodea
head -n 1 "${PROTISTS}" > "${COLPODEA}"
grep "Colpodea" "${PROTISTS}" >> "${COLPODEA}"

Use R to summarize data

library(dplyr)
library(tidyr)
library(ggplot2)
library(reshape2)

## Load data
setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "neotropical_soil_175_samples.OTU.colpodea.table"

## Import and format data
d <- read.table(input, sep = "\t", header = TRUE, dec = ".") %>% tbl_df()

## Create forest variables: sum sample columns by forest prefix
## (B = Barro Colorado, L = La Selva, T = Tiputini).  NOTE(review):
## starts_with() appears to match case-insensitively here, which is
## presumably why "taxonomy" and "total" must be dropped from the
## "T" selection — confirm against the dplyr version in use.
d$Barro <- rowSums(select(d, starts_with("B")))
d$LaSelva <- rowSums(select(d, starts_with("L")))
d$Tiputini <- rowSums(select(d, starts_with("T")) %>%
                          select(-taxonomy, -total))

## Summarize data: long format (one row per OTU x forest), grouped by forest
d <- select(d, one_of("OTU", "Barro", "Tiputini", "LaSelva")) %>%
    gather("forest", "reads", 2:4) %>%
        group_by(forest)

## Number of colpodean reads per forest
d1 <- summarise(d, sum = sum(reads))
print(d1)

## Number of colpodean OTUs per forest (OTUs with at least one read)
d2 <- filter(d, reads > 0) %>% count(forest)
print(d2)

quit(save = "no")

3.9.10 Summary table protists, animals, plants and fungi (OTUs and reads)

  1. Table of OTUs for tropical soil OTUs for animals, plants, and fungi.
# kl
cd ${HOME}/neotropical_diversity/data/

# For each taxonomic group table, report: group name, number of
# non-chimeric OTUs, and their summed read abundance.
for TABLE in neotropical_soil_175_samples.OTU.{metazoa,streptophyta,fungi,protists_cleaned}.table ; do
    # Derive the group name from the file name (e.g. "metazoa")
    TAXA=${TABLE/.table/}
    TAXA=${TAXA/*./}
    TAXA=${TAXA/_*/}
    # 1-based position of the "total" column in the header
    ABUNDANCES=$(head -n 1 "${TABLE}" | tr "\t" "\n" | nl | grep total | cut -f 1 | tr -d " ")
    # Count OTUs (c) and sum reads (s) for non-chimeric rows only
    # (the column after "total" holds the chimera flag)
    awk \
        -v ABUNDANCES=${ABUNDANCES} \
        -v TAXA=${TAXA} \
        'BEGIN {FS = "\t" ; OFS = "\t"}
         {if (NR > 1 && $(ABUNDANCES + 1) == "N") {
              s += $ABUNDANCES ; c += 1
          }
         } END {print TAXA, c, s}' "${TABLE}"
done
Taxa OTUs reads
Metazoa 4374 5715300
Streptophyta 3089 3874669
Fungi 17849 44430656
Protists 26841 46652187

(small differences in OTU numbers come from the filtering of chimeras. The above table discards chimeras)

3.10 Taxonomic profiles (high taxonomic level)

Produce a barplot (or barchart) for all samples and the top taxonomic groups (+ other).

Create the comparison table

# kl
cd ${HOME}/neotropical_diversity/src/

SCRIPT="taxonomic_profiles.py"
DIR="../data"
OTU_TABLE="neotropical_soil_175_samples.OTU.table"
PROFILES="${OTU_TABLE/.OTU.table/_taxonomic_profiles.csv}"

module load python/latest-2.7 

# Aggregate the OTU table into per-sample taxonomic profiles
python "${SCRIPT}" "${DIR}/${OTU_TABLE}" > "${DIR}/${PROFILES}"
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
    Parse the OTU table, group OTUs assigned to the same taxa per
    sample and produce taxonomic profiles.
"""

from __future__ import print_function

__author__ = "Frédéric Mahé <mahe@rhrk.uni-kl.fr>"
__date__ = "2015/03/25"
__version__ = "$Revision: 1.0"

import os
import sys
import csv

#*****************************************************************************#
#                                                                             #
#                                  Functions                                  #
#                                                                             #
#*****************************************************************************#


def parse_table(input_file):
    """Parse the OTU table and accumulate per-sample read counts.

    Non-chimeric OTUs are binned by taxonomic group: eukaryotes are
    grouped by kingdom (3rd taxonomy field), other domains (Bacteria,
    Archaea, Organelle) and unassigned reads are each summed into a
    single bin.  Returns (samples, data, grand_total) where `data`
    maps domain -> {sample: count} (and, for "Eukaryota",
    domain -> kingdom -> {sample: count}).
    """

    # Non-eukaryotic domains that get one flat per-sample counter each
    other_domains = set(["Bacteria", "Archaea", "Organelle"])
    # Domain strings treated as unassigned/unknown
    unknowns = set(["*", "No_hit", "not_rRNA", "_X"])

    with open(input_file, "rb") as input_file:  # "rb": Python 2 csv convention
        reader = csv.DictReader(input_file, delimiter="\t")
        grand_total = 0
        first_time = True
        # Metadata columns; every other column is a sample
        non_samples = set(["OTU", "amplicon", "total", "chimera",
                           "identity", "taxonomy", "references"])
        for row in reader:
            # List samples and build the data structure (first row only)
            if first_time:
                samples = list(set(row.keys()) - non_samples)
                samples = sorted(samples)
                first_time = False
                data = {"Eukaryota": dict(),
                        "Unknown": {sample: 0 for sample in samples},
                        "Organelle": {sample: 0 for sample in samples},
                        "Bacteria": {sample: 0 for sample in samples},
                        "Archaea": {sample: 0 for sample in samples}}

            # Keep only non-chimeric OTUs ("N" or "NA" flags)
            if row["chimera"] == "N" or row["chimera"] == "NA":
                grand_total += int(row["total"])
                if row["taxonomy"] == "No_hit":
                    domain, super_kingdom, kingdom = "No_hit", "No_hit", "No_hit"
                else:
                    domain, super_kingdom, kingdom =  row["taxonomy"].split("|")[0:3]
                if domain == "Eukaryota":
                    if super_kingdom == "*" or super_kingdom == "Eukaryota_X":
                        kingdom = "Unknown Eukaryota"
                    else:
                        if kingdom == "*" or kingdom.endswith("_X"):
                            # Deal with "Taxa_X" situations: fall back to
                            # the super-kingdom name, stripped of "_X"
                            kingdom = "Unknown " + super_kingdom.split("_X")[0]
                    # Lazily create the per-kingdom sample counters
                    if kingdom not in data[domain]:
                        data[domain][kingdom] = {sample: 0 for sample in samples}
                    for sample in samples:
                        data[domain][kingdom][sample] += int(row[sample])
                elif domain in other_domains:
                    for sample in samples:
                        data[domain][sample] += int(row[sample])
                elif domain in unknowns:
                    domain = "Unknown"
                    for sample in samples:
                        data[domain][sample] += int(row[sample])
                else:
                    # Unexpected domain string: fail loudly rather than
                    # silently mis-binning reads
                    print("Something's wrong! Unknown domain:\n",
                          domain, row, file=sys.stderr)
                    sys.exit(-1)
    return samples, data, grand_total


def output_table(samples, data, grand_total):
    """Print taxonomic profiles as a tab-separated table on stdout.

    Emits a header, then one row per non-eukaryotic domain (Archaea,
    Bacteria, Organelle, Unknown) followed by one row per eukaryotic
    kingdom in alphabetical order.  Each row lists per-sample counts,
    the row total, and its percentage of grand_total.
    """

    def emit(label, counts):
        # One profile row: label, per-sample counts, total, percentage.
        row_total = sum(counts.values())
        share = round(100.0 * row_total / grand_total, 2)
        cells = "\t".join(str(counts[sample]) for sample in samples)
        print(label, cells, row_total, share, sep="\t", file=sys.stdout)

    # Header
    print("taxa", "\t".join(samples),
          "total", "percentage", sep="\t", file=sys.stdout)

    # Non-eukaryotic domains, fixed order
    for domain in ("Archaea", "Bacteria", "Organelle", "Unknown"):
        emit(domain, data[domain])

    # Eukaryotic kingdoms, alphabetical order
    for kingdom in sorted(data["Eukaryota"]):
        emit(kingdom, data["Eukaryota"][kingdom])

    return


def main():
    """Entry point: parse the OTU table named as first CLI argument
    and print the resulting taxonomic profiles."""
    otu_table = sys.argv[1]
    profile = parse_table(otu_table)
    output_table(*profile)
    return


#*****************************************************************************#
#                                                                             #
#                                     Body                                    #
#                                                                             #
#*****************************************************************************#

# Run the profiler only when executed as a script (not when imported).
if __name__ == '__main__':

    main()

# Unconditional module-level exit with success status.
sys.exit(0)

Sanity check

# kl  (fixed: the bare host marker "kl" would otherwise run as a command)
cd ${HOME}/neotropical_diversity/data/
# Sum the "total" column (second-to-last field) of the profiles table;
# the non-numeric header cell adds 0.
awk '{s += $(NF -1)} END {print s}' neotropical_soil_175_samples_taxonomic_profiles.csv
# 130367129
# Same sum from the raw OTU table, non-chimeric rows only
# (field 158 is presumably the chimera flag, field 157 the read total)
awk '{if ($158 == "N" || $158 == "NA") s += $157} END {print s}' neotropical_soil_175_samples.OTU.table
# 130367129

Make barcharts (all taxa > 0.1%)

## library(dplyr) added: the pipeline below uses %>%, filter() and
## select(), none of which are provided by ggplot2, scales or reshape2.
library(dplyr)
library(ggplot2)
library(scales)
library(reshape2)

setwd("~/neotropical_diversity/results/stampa/")

## Neotropical soil samples taxonomic profiles
input <- "neotropical_soil_175_samples_taxonomic_profiles.csv"
threshold <- 0.1

## Import and format data: keep taxa above the percentage threshold,
## then melt to long format (one row per taxon x sample)
df <- read.table(input, sep = "\t", header = TRUE)
df <- df %>%
    filter(percentage >= threshold) %>%
    select(-total, -percentage) %>%
    melt(id.vars = c("taxa"))
colnames(df) <- c("taxa", "sample", "abundance")

## Barcharts: stacked read counts per sample, flipped horizontally
## NOTE(review): the title mentions "Tree Line" — looks copied from
## another project; confirm the intended title.
ggplot(df, aes(x = sample, y = abundance, fill = taxa)) +
    geom_bar(stat = "identity") +
    scale_x_discrete(limits = rev(levels(df$sample))) +
    scale_y_continuous(labels = comma) +
    xlab("samples") +
    ylab("number of environmental sequences") +
    coord_flip() +
    theme_bw() +
    ggtitle("Tree Line taxonomic profiles (> 0.1%)") +
    theme(legend.justification=c(1,0),
          legend.position=c(1,0),
          legend.background = element_rect(colour="black", size=.1))

## Output to PDF (same base name as the input CSV)
output <- gsub(".csv", ".pdf", input, fixed = TRUE)
ggsave(file = output, width = 8 , height = 14)

quit(save="no")

3.11 Taxonomic profiles per forest (high taxonomic level)

Produce a barplot (or barchart) where samples are grouped by forest and by taxonomic groups.

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(reshape2)

## Load data: per-sample protist OTU table
setwd("~/neotropical_diversity/results/stampa/")
input <- "neotropical_soil_175_samples.OTU.protists.table"

## Multiple plot function
##
## ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
## - cols:   Number of columns in layout
## - layout: A matrix specifying the layout. If present, 'cols' is ignored.
##
## If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
## then plot 1 will go in the upper left, 2 will go in the upper right, and
## 3 will go all the way across the bottom.
##
## Arrange several ggplot objects on one page using a grid layout.
## NOTE(review): the `file` argument appears unused in the body.
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
    library(grid)

    ## Make a list from the ... arguments and plotlist
    plots <- c(list(...), plotlist)

    numPlots = length(plots)

    ## If layout is NULL, then use 'cols' to determine layout
    if (is.null(layout)) {
        ## Make the panel
        ## ncol: Number of columns of plots
        ## nrow: Number of rows needed, calculated from # of cols
        layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                         ncol = cols, nrow = ceiling(numPlots/cols))
    }

    if (numPlots==1) {
        print(plots[[1]])

    } else {
          ## Set up the page: one viewport cell per layout slot
          grid.newpage()
          pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

          ## Make each plot, in the correct location
          for (i in 1:numPlots) {
              ## Get the i,j matrix positions of the regions that contain this subplot
              matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

              print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                    layout.pos.col = matchidx$col))
          }
      }
}


## Import and format data
d <- read.table(input, sep = "\t", header = TRUE, dec=".") %>% tbl_df()

## Sum sample columns by forest prefix (B/L/T).  For some reason,
## mutate() did not work here, so columns are assigned directly.
d$Barro <- rowSums(select(d, starts_with("B")))
d$LaSelva <- rowSums(select(d, starts_with("L")))
d$Tiputini <- rowSums(select(d, starts_with("T")) %>%
                      select(-taxonomy, -total))

## Discard all other columns
d <- select(d, one_of("Barro", "Tiputini", "LaSelva", "taxonomy"))

## Extract the third field from the "taxonomy" and store in a new column
d$clade <- apply(d["taxonomy"], 1 , function(x) strsplit(x, "|", fixed = TRUE)[[1]][3])

d$LaSelva <- as.integer(d$LaSelva)
d$Tiputini <- as.integer(d$Tiputini)
d$Barro <- as.integer(d$Barro)

## Group by clade (sum reads)
d2 <- select(d, -taxonomy) %>%
    gather("forest", "abundance", -clade) %>%
        group_by(clade, forest)
d2$abundance <- as.integer(d2$abundance)
d2 <- tally(d2, wt = abundance, sort = FALSE)

## Replace "*" by "Unknown", rename incertae sedis clades,
## and discard "Chimera"
d2 <- d2 %>% filter(clade != "Chimera")
d2$clade[d2$clade == "*"] <- "Unknown"
d2$clade[d2$clade == "Alveolata_X"] <- "Alveolata incertae sedis"
d2$clade[d2$clade == "Amoebozoa_X"] <- "Amoebozoa incertae sedis"
d2$clade[d2$clade == "Stramenopiles_X"] <- "non-Ochrophyta Stramenopiles"

## List clades with significant abundances (share of reads > 0.1%)
main_taxa <- d2 %>%
    select(-forest) %>%
    group_by(clade) %>%
    tally(wt = n, sort = TRUE) %>%
    mutate(percentage = 100 * n / sum(d2$n)) %>%
    filter(percentage > 0.1) %>%
    select(-n, -percentage)

## All rows in d2 that have a match in main_taxa
d2 <- semi_join(d2, main_taxa, by = "clade")

## Order the legend
taxa_order_reads<- select(d2, clade) %>% distinct()

#------------------------ Absolute barplots -----------------------------------#

## Barcharts (reads)
p1 <- ggplot(d2, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity") +
    scale_y_continuous(labels = comma) +
    scale_fill_discrete(breaks = taxa_order_reads$clade,
                            name = "clade                         ") +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("number of observed reads") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0)) ##  +
    ## ggtitle("Neotropical Forest Soils: protist communities (175 samples, share > 0.1%)") +
    ## theme(legend.background = element_rect(colour="black", size=.1))

## Barcharts (OTUs)

## Group by clade, counting OTUs with at least one read per forest
d3 <- select(d, -taxonomy) %>%
    gather("forest", "abundance", -clade) %>%
        group_by(clade, forest) %>%
            filter(abundance != "0") %>%
                tally(sort= TRUE)

## Replace "*" by "Unknown", rename incertae sedis clades,
## and discard "Chimera"
d3 <- d3 %>% filter(clade != "Chimera")
d3$clade[d3$clade == "*"] <- "Unknown"
d3$clade[d3$clade == "Alveolata_X"] <- "Alveolata incertae sedis"
d3$clade[d3$clade == "Amoebozoa_X"] <- "Amoebozoa incertae sedis"
d3$clade[d3$clade == "Stramenopiles_X"] <- "non-Ochrophyta Stramenopiles"

## List clades with significant abundances (share of OTUs > 0.1%)
main_taxa <- d3 %>%
    select(-forest) %>%
    group_by(clade) %>%
    tally(sort = TRUE) %>%
    mutate(percentage = 100 * n / sum(d3$n)) %>%
    filter(percentage > 0.1) %>%
    select(-n, -percentage)

## All rows in d3 that have a match in main_taxa (at the same time,
## it sorts d3 by decreasing number of OTUs)
d3 <- semi_join(d3, main_taxa, by = "clade")

## Order the legend
taxa_order_OTUs <- select(d3, clade) %>% distinct()

## Barcharts
p2 <- ggplot(d3, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity") +
    scale_y_continuous(labels = comma) +
    scale_fill_discrete(breaks = taxa_order_OTUs$clade) +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("number of observed OTUs") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Output to PDF (multiplot)
output <- gsub(".table", "_group_by_forests_absolute.pdf", input, fixed = TRUE)
pdf(file = output, width = 11 , height = 10)
multiplot(p1, p2)
dev.off()


#-------------------------- Percentage barplots -------------------------------#

## Barcharts (reads), normalized to 100% per forest
p1 <- ggplot(d2, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity", position = "fill") +
    scale_y_continuous(labels = percent_format()) +
    scale_fill_discrete(breaks = taxa_order_reads$clade,
                            name = "clade                         ") +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("percentage of observed reads") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Barcharts (OTUs), normalized to 100% per forest
p2 <- ggplot(d3, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity", position = "fill") +
    scale_y_continuous(labels = percent_format()) +
    scale_fill_discrete(breaks = taxa_order_OTUs$clade) +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("percentage of observed OTUs") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Output to PDF (multiplot)
output <- gsub(".table", "_group_by_forests_relative.pdf", input, fixed = TRUE)
pdf(file = output, width = 11 , height = 10)
multiplot(p1, p2)
dev.off()

quit(save="no")

3.12 Taxonomic profiles per forest (high taxonomic level, after removal of non-placed OTUs)

Produce a barplot (or barchart) where samples are grouped by forest and by taxonomic groups (after removal of the 2,232 OTUs not placed by Lucas).

# aragorn
FOLDER="${HOME}/neotropical_diversity/results"
TABLE="neotropical_soil_175_samples.OTU.protists_cleaned.table"

# Make the cleaned table visible in the stampa working directory
cd ./stampa/
ln -sf ${FOLDER}/first_155_samples/${TABLE} ${TABLE}
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(reshape2)

## Load data: protist OTU table after removal of non-placed OTUs
setwd("~/neotropical_diversity/results/stampa/")
input <- "neotropical_soil_175_samples.OTU.protists_cleaned.table"

## Multiple plot function
##
## ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
## - cols:   Number of columns in layout
## - layout: A matrix specifying the layout. If present, 'cols' is ignored.
##
## If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
## then plot 1 will go in the upper left, 2 will go in the upper right, and
## 3 will go all the way across the bottom.
##
## Arrange several ggplot objects on one page using a grid layout.
## NOTE(review): the `file` argument appears unused in the body.
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
    library(grid)

    ## Make a list from the ... arguments and plotlist
    plots <- c(list(...), plotlist)

    numPlots = length(plots)

    ## If layout is NULL, then use 'cols' to determine layout
    if (is.null(layout)) {
        ## Make the panel
        ## ncol: Number of columns of plots
        ## nrow: Number of rows needed, calculated from # of cols
        layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                         ncol = cols, nrow = ceiling(numPlots/cols))
    }

    if (numPlots==1) {
        print(plots[[1]])

    } else {
          ## Set up the page: one viewport cell per layout slot
          grid.newpage()
          pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

          ## Make each plot, in the correct location
          for (i in 1:numPlots) {
              ## Get the i,j matrix positions of the regions that contain this subplot
              matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

              print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                    layout.pos.col = matchidx$col))
          }
      }
}


## Import and format data
d <- read.table(input, sep = "\t", header = TRUE, dec=".") %>% tbl_df()

## Sum sample columns by forest prefix (B/L/T).  For some reason,
## mutate() did not work here, so columns are assigned directly.
d$Barro <- rowSums(select(d, starts_with("B")))
d$LaSelva <- rowSums(select(d, starts_with("L")))
d$Tiputini <- rowSums(select(d, starts_with("T")) %>%
                      select(-taxonomy, -total))

## Discard all other columns
d <- select(d, one_of("Barro", "Tiputini", "LaSelva", "taxonomy"))

## Extract the third field from the "taxonomy" and store in a new column
d$clade <- apply(d["taxonomy"], 1 , function(x) strsplit(x, "|", fixed = TRUE)[[1]][3])

d$LaSelva <- as.integer(d$LaSelva)
d$Tiputini <- as.integer(d$Tiputini)
d$Barro <- as.integer(d$Barro)

## Group by clade (sum reads)
d2 <- select(d, -taxonomy) %>%
    gather("forest", "abundance", -clade) %>%
        group_by(clade, forest)
d2$abundance <- as.integer(d2$abundance)
d2 <- tally(d2, wt = abundance, sort = FALSE)

## Replace "*" by "Unknown", rename incertae sedis clades,
## and discard "Chimera"
d2 <- d2 %>% filter(clade != "Chimera")
d2$clade[d2$clade == "*"] <- "Unknown"
d2$clade[d2$clade == "Alveolata_X"] <- "Alveolata incertae sedis"
d2$clade[d2$clade == "Amoebozoa_X"] <- "Amoebozoa incertae sedis"
d2$clade[d2$clade == "Stramenopiles_X"] <- "non-Ochrophyta Stramenopiles"

## List clades with significant abundances (share of reads > 0.1%)
main_taxa <- d2 %>%
    select(-forest) %>%
    group_by(clade) %>%
    tally(wt = n, sort = TRUE) %>%
    mutate(percentage = 100 * n / sum(d2$n)) %>%
    filter(percentage > 0.1) %>%
    select(-n, -percentage)

## All rows in d2 that have a match in main_taxa
d2 <- semi_join(d2, main_taxa, by = "clade")

## Order the legend
taxa_order_reads<- select(d2, clade) %>% distinct()

#------------------------ Absolute barplots -----------------------------------#

## Barcharts (reads)
p1 <- ggplot(d2, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity") +
    scale_y_continuous(labels = comma) +
    scale_fill_discrete(breaks = taxa_order_reads$clade,
                            name = "clade                         ") +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("number of observed reads") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0)) ##  +

## Barcharts (OTUs)

## Group by clade, counting OTUs with at least one read per forest
d3 <- select(d, -taxonomy) %>%
    gather("forest", "abundance", -clade) %>%
    group_by(clade, forest) %>%
    filter(abundance != "0") %>%
    tally(sort= TRUE)

## Replace "*" by "Unknown", rename incertae sedis clades,
## and discard "Chimera"
d3 <- d3 %>% filter(clade != "Chimera")
d3$clade[d3$clade == "*"] <- "Unknown"
d3$clade[d3$clade == "Alveolata_X"] <- "Alveolata incertae sedis"
d3$clade[d3$clade == "Amoebozoa_X"] <- "Amoebozoa incertae sedis"
d3$clade[d3$clade == "Stramenopiles_X"] <- "non-Ochrophyta Stramenopiles"

## List clades with significant abundances (share of OTUs > 0.1%)
main_taxa <- d3 %>%
    select(-forest) %>%
    group_by(clade) %>%
    tally(sort = TRUE) %>%
    mutate(percentage = 100 * n / sum(d3$n)) %>%
    filter(percentage > 0.1) %>%
    select(-n, -percentage)

## All rows in d3 that have a match in main_taxa (at the same time,
## it sorts d3 by decreasing number of OTUs)
d3 <- semi_join(d3, main_taxa, by = "clade")

## Order the legend
taxa_order_OTUs <- select(d3, clade) %>% distinct()

## Barcharts
p2 <- ggplot(d3, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity") +
    scale_y_continuous(labels = comma) +
    scale_fill_discrete(breaks = taxa_order_OTUs$clade) +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("number of observed OTUs") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Output to PDF (multiplot)
output <- gsub(".table", "_group_by_forests_absolute.pdf", input, fixed = TRUE)
pdf(file = output, width = 11 , height = 10)
multiplot(p1, p2)
dev.off()


#-------------------------- Percentage barplots -------------------------------#

## Barcharts (reads), normalized to 100% per forest
p1 <- ggplot(d2, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity", position = "fill") +
    scale_y_continuous(labels = percent_format()) +
    scale_fill_discrete(breaks = taxa_order_reads$clade,
                            name = "clade                         ") +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("percentage of observed reads") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Barcharts (OTUs), normalized to 100% per forest
p2 <- ggplot(d3, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity", position = "fill") +
    scale_y_continuous(labels = percent_format()) +
    scale_fill_discrete(breaks = taxa_order_OTUs$clade) +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("percentage of observed OTUs") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Output to PDF (multiplot)
output <- gsub(".table", "_group_by_forests_relative.pdf", input, fixed = TRUE)
pdf(file = output, width = 11 , height = 10)
multiplot(p1, p2)
dev.off()

quit(save="no")

3.13 Taxonomic profiles per forest (Fungi; high taxonomic level)

Produce a barplot (or barchart) where samples are grouped by forest and by taxonomic groups (Fungi).

## Dependencies: dplyr/tidyr for wrangling, ggplot2/scales for plotting.
## reshape2 is loaded but not obviously used below -- TODO confirm.
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(reshape2)

## Load data: per-OTU read counts per sample plus taxonomy (Fungi subset)
setwd("~/neotropical_diversity/results/stampa/")
input <- "neotropical_soil_175_samples.OTU.fungi.table"

## Multiple plot function
##
## ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
## - cols:   Number of columns in layout
## - layout: A matrix specifying the layout. If present, 'cols' is ignored.
## - file:   Accepted for backward compatibility but never used by the body.
##
## If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
## then plot 1 will go in the upper left, 2 will go in the upper right, and
## 3 will go all the way across the bottom.
##
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
    library(grid)

    ## Make a list from the ... arguments and plotlist
    plots <- c(list(...), plotlist)
    numPlots <- length(plots)

    ## If layout is NULL, then use 'cols' to determine layout:
    ## 'cols' columns, enough rows to hold all plots.
    ## seq_len() (not 1:n) stays correct for the degenerate zero-plot call.
    if (is.null(layout)) {
        layout <- matrix(seq_len(cols * ceiling(numPlots / cols)),
                         ncol = cols, nrow = ceiling(numPlots / cols))
    }

    if (numPlots == 1) {
        print(plots[[1]])
    } else {
        ## Set up the page: a viewport grid matching the layout matrix
        grid.newpage()
        pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

        ## Draw each plot in the cell(s) whose layout value equals its index
        for (i in seq_len(numPlots)) {
            ## Get the i,j matrix positions of the regions holding this subplot
            matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
            print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                            layout.pos.col = matchidx$col))
        }
    }
}


## Import and format data (tab-separated OTU table, one column per sample)
d <- read.table(input, sep = "\t", header = TRUE, dec=".") %>% tbl_df()

## Pool sample columns into one read-count column per forest. Sample names
## are prefixed by forest (B = Barro Colorado, L = La Selva, T = Tiputini).
## For some reasons, the command below does not work with mutate.
## NOTE(review): the "T" selection needs the extra select(-taxonomy, -total),
## presumably because starts_with() ignores case by default and would also
## match "taxonomy" and "total" -- confirm.
d$Barro <- rowSums(select(d, starts_with("B")))
d$LaSelva <- rowSums(select(d, starts_with("L")))
d$Tiputini <- rowSums(select(d, starts_with("T")) %>%
                      select(-taxonomy, -total))

## Discard all other columns (keep only the forest sums and the taxonomy)
d <- select(d, one_of("Barro", "Tiputini", "LaSelva", "taxonomy"))

## Extract the fourth field from the "taxonomy" and store in a new column.
## vapply() over strsplit() avoids apply()'s data.frame -> matrix coercion
## and guarantees a character(1) result per row; as.character() copes with
## taxonomy having been read in as a factor.
d$clade <- vapply(strsplit(as.character(d$taxonomy), "|", fixed = TRUE),
                  function(fields) fields[4], character(1))

## Forest read sums come back as doubles from rowSums(); store as integers
d$LaSelva <- as.integer(d$LaSelva)
d$Tiputini <- as.integer(d$Tiputini)
d$Barro <- as.integer(d$Barro)

## Group by clade (sum reads): long format, one row per clade x forest;
## n = total number of reads for that clade in that forest
d2 <- select(d, -taxonomy) %>%
    gather("forest", "abundance", -clade) %>%
        group_by(clade, forest)
d2$abundance <- as.integer(d2$abundance)
d2 <- tally(d2, wt = abundance, sort = FALSE)

## Replace "*" by "Unknown", discard "Chimera", and collapse "_X"
## placeholder ranks into their parent clade name
d2 <- d2 %>% filter(clade != "Chimera")
d2$clade[d2$clade == "*"] <- "Unknown"
d2$clade[d2$clade == "Alveolata_X"] <- "Alveolata"
d2$clade[d2$clade == "Amoebozoa_X"] <- "Amoebozoa"
d2$clade[d2$clade == "Stramenopiles_X"] <- "Stramenopiles"

## List clades that have significant abundances (> 0.1% of all reads)
main_taxa <- d2 %>%
    select(-forest) %>%
    group_by(clade) %>%
    tally(wt = n, sort = TRUE) %>%
    mutate(percentage = 100 * n / sum(d2$n)) %>%
    filter(percentage > 0.1) %>%
    select(-n, -percentage)

## All rows in d2 that have a match in main_taxa (drops the rare clades)
d2 <- semi_join(d2, main_taxa, by = "clade")

## Order the legend: clades in their order of appearance in d2
taxa_order_reads<- select(d2, clade) %>% distinct()

#------------------------ Absolute barplots -----------------------------------#

## Barcharts (reads): stacked read counts per forest, one fill per clade
p1 <- ggplot(d2, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity") +
    scale_y_continuous(labels = comma) +
    ## padded legend title presumably reserves width for long clade names
    ## -- TODO confirm
    scale_fill_discrete(breaks = taxa_order_reads$clade,
                            name = "clade                         ") +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("number of observed reads") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0)) ##  +
    ## ggtitle("Neotropical Forest Soils: fungi communities (175 samples, share > 0.1%)") +
    ## theme(legend.background = element_rect(colour="black", size=.1))

## Barcharts (OTUs)

## Group by clade: count OTUs (not reads) present in each forest.
## Note: abundance is compared against the string "0" (coerced comparison).
d3 <- select(d, -taxonomy) %>%
    gather("forest", "abundance", -clade) %>%
        group_by(clade, forest) %>%
            filter(abundance != "0") %>%
                tally(sort= TRUE)

## Replace "*" by "Unknown", discard "Chimera", collapse "_X" ranks
d3 <- d3 %>% filter(clade != "Chimera")
d3$clade[d3$clade == "*"] <- "Unknown"
d3$clade[d3$clade == "Alveolata_X"] <- "Alveolata"
d3$clade[d3$clade == "Amoebozoa_X"] <- "Amoebozoa"
d3$clade[d3$clade == "Stramenopiles_X"] <- "Stramenopiles"

## List clades that have significant abundances (> 0.1% of all OTUs)
main_taxa <- d3 %>%
    select(-forest) %>%
    group_by(clade) %>%
    tally(sort = TRUE) %>%
    mutate(percentage = 100 * n / sum(d3$n)) %>%
    filter(percentage > 0.1) %>%
    select(-n, -percentage)

## ## All rows in d3 that have a match in main_taxa (in the same time,
## it sorts d3 by decreasing number of OTUs)
d3 <- semi_join(d3, main_taxa, by = "clade")

## Order the legend
taxa_order_OTUs <- select(d3, clade) %>% distinct()

## Barcharts: stacked OTU counts per forest
p2 <- ggplot(d3, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity") +
    scale_y_continuous(labels = comma) +
    scale_fill_discrete(breaks = taxa_order_OTUs$clade) +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("number of observed OTUs") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Output to PDF (multiplot): reads on top, OTUs below
output <- gsub(".table", "_group_by_forests_absolute.pdf", input, fixed = TRUE)
pdf(file = output, width = 11 , height = 10)
multiplot(p1, p2)
dev.off()


#-------------------------- Percentage barplots -------------------------------#

## Barcharts (reads)
p1 <- ggplot(d2, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity", position = "fill") +
    scale_y_continuous(labels = percent_format()) +
    scale_fill_discrete(breaks = taxa_order_reads$clade,
                            name = "clade                         ") +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("percentage of observed reads") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Barcharts (OTUs)
p2 <- ggplot(d3, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity", position = "fill") +
    scale_y_continuous(labels = percent_format()) +
    scale_fill_discrete(breaks = taxa_order_OTUs$clade) +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("percentage of observed OTUs") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Output to PDF (multiplot)
output <- gsub(".table", "_group_by_forests_relative.pdf", input, fixed = TRUE)
pdf(file = output, width = 11 , height = 10)
multiplot(p1, p2)
dev.off()

quit(save="no")

3.14 Taxonomic profiles per forest (Fungi; fifth taxonomic level)

Produce a barplot (or barchart) where samples are grouped by forest and by taxonomic groups (Fungi, 5th level).

# aragorn
cd ~/neotropical_diversity/results/stampa/
TABLE="neotropical_soil_175_samples.OTU.fungi.table"
# Copy the Archaeorhizomyces name up into the two placeholder ranks above it
# (presumably so the group is visible at the plotted taxonomic level --
# confirm), and relabel unassigned fungi ("Fungi_XX") as "Unknown".
# The result goes to a temporary table consumed by the R session below.
sed 's/Ascomycota_X|Ascomycota_XX|Archaeorhizomyces/Archaeorhizomyces|Archaeorhizomyces|Archaeorhizomyces/' ${TABLE}| \
sed 's/Fungi_XX/Unknown/' > "${TABLE}.tmp"
## Dependencies: dplyr/tidyr for wrangling, ggplot2/scales for plotting.
## reshape2 is loaded but not obviously used below -- TODO confirm.
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(reshape2)

## Load data: the relabelled fungi table produced by the sed commands above
setwd("~/neotropical_diversity/results/stampa/")
input <- "neotropical_soil_175_samples.OTU.fungi.table.tmp"

## Multiple plot function
##
## ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
## - cols:   Number of columns in layout
## - layout: A matrix specifying the layout. If present, 'cols' is ignored.
## - file:   Accepted for backward compatibility but never used by the body.
##
## If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
## then plot 1 will go in the upper left, 2 will go in the upper right, and
## 3 will go all the way across the bottom.
##
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
    library(grid)

    ## Make a list from the ... arguments and plotlist
    plots <- c(list(...), plotlist)
    numPlots <- length(plots)

    ## If layout is NULL, then use 'cols' to determine layout:
    ## 'cols' columns, enough rows to hold all plots.
    ## seq_len() (not 1:n) stays correct for the degenerate zero-plot call.
    if (is.null(layout)) {
        layout <- matrix(seq_len(cols * ceiling(numPlots / cols)),
                         ncol = cols, nrow = ceiling(numPlots / cols))
    }

    if (numPlots == 1) {
        print(plots[[1]])
    } else {
        ## Set up the page: a viewport grid matching the layout matrix
        grid.newpage()
        pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

        ## Draw each plot in the cell(s) whose layout value equals its index
        for (i in seq_len(numPlots)) {
            ## Get the i,j matrix positions of the regions holding this subplot
            matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
            print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                            layout.pos.col = matchidx$col))
        }
    }
}


## Import and format data (tab-separated OTU table, one column per sample)
d <- read.table(input, sep = "\t", header = TRUE, dec=".") %>% tbl_df()

## Pool sample columns into one read-count column per forest. Sample names
## are prefixed by forest (B = Barro Colorado, L = La Selva, T = Tiputini).
## For some reasons, the command below does not work with mutate.
## NOTE(review): the "T" selection needs the extra select(-taxonomy, -total),
## presumably because starts_with() ignores case by default -- confirm.
d$Barro <- rowSums(select(d, starts_with("B")))
d$LaSelva <- rowSums(select(d, starts_with("L")))
d$Tiputini <- rowSums(select(d, starts_with("T")) %>%
                      select(-taxonomy, -total))

## Discard all other columns (keep only the forest sums and the taxonomy)
d <- select(d, one_of("Barro", "Tiputini", "LaSelva", "taxonomy"))

## Extract the fifth field from the "taxonomy" and store in a new column.
## vapply() over strsplit() avoids apply()'s data.frame -> matrix coercion
## and guarantees a character(1) result per row; as.character() copes with
## taxonomy having been read in as a factor.
d$clade <- vapply(strsplit(as.character(d$taxonomy), "|", fixed = TRUE),
                  function(fields) fields[5], character(1))
## Forest read sums come back as doubles from rowSums(); store as integers
d$LaSelva <- as.integer(d$LaSelva)
d$Tiputini <- as.integer(d$Tiputini)
d$Barro <- as.integer(d$Barro)

## Group by clade (sum reads): one row per clade x forest, n = read count
d2 <- select(d, -taxonomy) %>%
    gather("forest", "abundance", -clade) %>%
    group_by(clade, forest)
d2$abundance <- as.integer(d2$abundance)
d2 <- tally(d2, wt = abundance, sort = FALSE)

## Replace "*" by "Unknown", and discard "Chimera"
d2 <- d2 %>% filter(clade != "Chimera")
d2$clade[d2$clade == "*"] <- "Unknown"

## List clades that have significant abundances (> 0.1% of all reads)
main_taxa <- d2 %>%
    select(-forest) %>%
    group_by(clade) %>%
    tally(wt = n, sort = TRUE) %>%
    mutate(percentage = 100 * n / sum(d2$n)) %>%
    filter(percentage > 0.1) %>%
    select(-n, -percentage)

## All rows in d2 that have a match in main_taxa, sorted alphabetically
d2 <- semi_join(d2, main_taxa, by = "clade") %>%
    arrange(clade)

## Barcharts (OTUs)
## Group by clade: count OTUs (not reads) present in each forest.
## Note: abundance is compared against the string "0" (coerced comparison).
d3 <- select(d, -taxonomy) %>%
    gather("forest", "abundance", -clade) %>%
    group_by(clade, forest) %>%
    filter(abundance != "0") %>%
    tally(sort= TRUE)

## Replace "*" by "Unknown", and discard "Chimera"
d3 <- d3 %>% filter(clade != "Chimera")
d3$clade[d3$clade == "*"] <- "Unknown"
## d3$clade[d3$clade == "Taphrinomycotina"] <- "Taphrinomycotina (excluding the Archaeorhizomycetes)"

## List clades that have significant abundances (> 0.1% of all OTUs)
main_taxa <- d3 %>%
    select(-forest) %>%
    group_by(clade) %>%
    tally(sort = TRUE) %>%
    mutate(percentage = 100 * n / sum(d3$n)) %>%
    filter(percentage > 0.1) %>%
    select(-n, -percentage)

## ## All rows in d3 that have a match in main_taxa
d3 <- semi_join(d3, main_taxa, by = "clade") %>% arrange(clade)

## Order the legend: union of the clades kept in d2 and d3, alphabetical,
## shared by both panels so their colors match
taxa_order <- bind_rows(select(d2, clade), select(d3, clade)) %>% arrange(clade) %>% distinct()

#------------------------ Absolute barplots -----------------------------------#

## Barcharts (reads): stacked read counts per forest at the 5th level
p1 <- ggplot(d2, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity") +
    scale_y_continuous(labels = comma) +
    ## padded legend title presumably reserves width for long clade names
    ## -- TODO confirm
    scale_fill_discrete(breaks = taxa_order$clade,
                            name = "clade                         ") +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("number of observed reads") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Barcharts (OTUs): same clade ordering as p1 so legend colors agree
p2 <- ggplot(d3, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity") +
    scale_y_continuous(labels = comma) +
    scale_fill_discrete(breaks = taxa_order$clade) +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("number of observed OTUs") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Output to PDF (multiplot); the ".tmp" suffix is stripped from the name
output <- gsub(".table.tmp", "_5th_level_group_by_forests_absolute.pdf", input, fixed = TRUE)
pdf(file = output, width = 11 , height = 10)
multiplot(p1, p2)
dev.off()


#-------------------------- Percentage barplots -------------------------------#

## Barcharts (reads): position = "fill" rescales each stack to proportions
p1 <- ggplot(d2, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity", position = "fill") +
    scale_y_continuous(labels = percent_format()) +
    scale_fill_discrete(breaks = taxa_order$clade,
                            name = "clade                         ") +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("percentage of observed reads") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Barcharts (OTUs)
p2 <- ggplot(d3, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity", position = "fill") +
    scale_y_continuous(labels = percent_format()) +
    scale_fill_discrete(breaks = taxa_order$clade) +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("percentage of observed OTUs") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Output to PDF (multiplot)
output <- gsub(".table.tmp", "_5th_level_group_by_forests_relative.pdf", input, fixed = TRUE)
pdf(file = output, width = 11 , height = 10)
multiplot(p1, p2)
dev.off()

quit(save="no")
# aragorn
cd ~/neotropical_diversity/results/stampa/
# clean: remove the temporary relabelled table created before the R session
TABLE="neotropical_soil_175_samples.OTU.fungi.table"
rm -f "${TABLE}.tmp"

3.15 Alpha-diversity estimates (vegan)

library(dplyr)
library(tidyr)
library(vegan)

## NOTE(review): other sections work in results/stampa/ -- confirm this path.
setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "neotropical_soil_175_samples.OTU.protists_cleaned.table"

## Import and format data: keep only the pooled "total" read count per OTU
d <- read.table(input, sep = "\t", header = TRUE, dec=".") %>%
    tbl_df() %>%
    select(total)

## transpose (to get OTUs in columns): a single row = the pooled community
d <- t(d)

## -------------------------------------------------------------------------- ##
## Alpha diversity (global and per sample)

## richness (observed, Chao1 and ACE estimators)
estimateR(d)

## Shannon index H (richness + evenness)
H <- diversity(d, index = "shannon", MARGIN = 1, base = exp(1))
H

## Pielou’s index of evenness: (0-1, 1 = max. evenness)
## dim(d)[2] is the number of OTUs (richness S), so J = H / ln(S)
J <- H/log(dim(d)[2])
J

## Simpson's D index: (richness + evenness, 0-1; 1 - D rises as evenness increases)
D <- diversity(d, "simpson")
D
inv_D <- diversity(d, "invsimpson")
inv_D

quit(save = "no")

Initial protist dataset

total S.obs 2.909200e+04 S.chao1 2.909220e+04 se.chao1 4.626483e-01 S.ACE 2.909618e+04 se.ACE 8.492294e+01

Shannon index H (richness + evenness) 4.876584

Pielou’s index of evenness: (0-1, 1 = max. evenness) 0.4744581

Simpson's D index: (richness + evenness, 0-1; 1 - D rises as evenness increases) 0.9644248

Inverted Simpson 28.10947

Cleaned protist dataset (after taxonomic placement)

total S.obs 2.686000e+04 S.chao1 2.686018e+04 se.chao1 4.495074e-01 S.ACE 2.686400e+04 se.ACE 8.154326e+01

Shannon index H (richness + evenness) 4.731536

Pielou’s index of evenness: (0-1, 1 = max. evenness) 0.4639492

Simpson's D index: (richness + evenness, 0-1; 1 - D rises as evenness increases) 0.9596464

Inverted Simpson 24.78093

3.16 Alpha-diversity estimates (breakaway) (after removal of non-placed OTUs)

Group samples by forest (after removal of the 2,232 OTUs not placed by Lucas). The objective is to produce a new table with only five columns: OTU, forest 1, forest 2, forest 3, and total.

library(dplyr)
library(tidyr)
library(breakaway)

## Load data
setwd("~/neotropical_diversity/results/stampa/")
input <- "neotropical_soil_175_samples.OTU.protists_cleaned.table"

## Import and format data
d <- read.table(input, sep = "\t", header = TRUE, dec=".") %>% tbl_df()

## Pool sample columns per forest (sample names are prefixed B/L/T for
## Barro Colorado, La Selva and Tiputini).
## For some reasons, the command below does not work with mutate.
d$Barro <- rowSums(select(d, starts_with("B")))
d$LaSelva <- rowSums(select(d, starts_with("L")))
d$Tiputini <- rowSums(select(d, starts_with("T")) %>%
                      select(-taxonomy, -total))

## Discard all other columns (three forest sums + grand total)
d <- select(d, one_of("Barro", "Tiputini", "LaSelva", "total"))

## Write to file.
## NOTE(review): write.csv() also emits row names as an unnamed first
## column; the downstream shell loop reads fields 2-5, skipping it.
write.csv(file = "neotropical_soil_175_samples.OTU.protists_cleaned_forests.table", x = d)

quit(save="no")
cd ~/neotropical_diversity/results/stampa/

BREAKAWAY="../../src/breakaway.R"
TABLE="neotropical_soil_175_samples.OTU.protists_cleaned_forests.table"
OUTPUT="${TABLE/.table/.alphadiversity.table}"
TMP=$(mktemp)

# Loop over the samples (CSV fields 2-5 = Barro, Tiputini, LaSelva, total;
# field 1 holds the row names written by write.csv)
echo -e "SAMPLE\tMASS\tOBSERVED\tESTIMATES\tSE" > "${OUTPUT}"
for ((i=2 ; i<=5 ; i++)) ; do
    # One column = one sample; drop zero-abundance OTUs
    COLUMN=$(cut -d "," -f ${i} "${TABLE}" | grep -v "^0$")
    # Header cell = sample name (strip the CSV quotes)
    SAMPLE=$(head -n 1 <<< "${COLUMN}" | tr -d "\"")
    # Total number of reads in the sample
    MASS=$(awk '{s += $1} END {print s}' <<< "${COLUMN}")
    # Number of observed OTUs (line count minus the header line)
    OBSERVED=$(wc -l <<< "${COLUMN}")
    OBSERVED=$(( $OBSERVED - 1 ))
    # Frequency-count table expected by breakaway: "abundance  number-of-OTUs"
    tail -n +2 <<< "${COLUMN}" | sort -n | uniq -c | \
        awk '{print $2, $1}' > "${TMP}"
    ESTIMATES=""
    # breakaway.R prints the estimate and its SE ("[1] value" lines);
    # cut keeps the value, paste joins both lines with a tab
    ESTIMATES=$(Rscript "${BREAKAWAY}" "${TMP}" | cut -d " " -f 2 | paste - -)
    [[ ! "${ESTIMATES}" ]] && ESTIMATES=$(echo -e "NA\tNA")
    echo -e "${SAMPLE}\t${MASS}\t${OBSERVED}\t${ESTIMATES}"
done >> "${OUTPUT}"

rm "${TMP}"

Rscript used in the above block

#!/usr/bin/Rscript

## Breakaway wrapper: reads a two-column frequency-count table
## ("abundance  number-of-OTUs") and prints the rounded richness estimate
## and its standard error, one "[1] value" line each, for the calling
## shell loop to parse.
library(breakaway)

setwd("~/neotropical_diversity/results/stampa/")
args <- commandArgs(trailingOnly = TRUE)
input <- args[1]   # path to the frequency-count table

d <- read.table(input, sep = " ")

answers <- breakaway(d, plot = FALSE, print = FALSE,
                     answers = TRUE, force = FALSE)

round(answers$est)
round(answers$seest)

quit(save="no")
sample mass observed estimates se
Barro 16232500 7065 7218 44
Tiputini 8903877 5217 5287 23
LaSelva 21515829 16935 16968 17
total 46652206 26860 26863 2

3.17 Alpha-diversity estimates using betta and breakaway (Sarah Sernaker)

Analysis performed by Sarah Sernaker (Department of Statistical Science, Cornell University, Malott Hall, Ithaca, New York 14853, USA.), using the following code:

#' ---
#' title: "Neotropical Forests breakaway and betta analysis"
#' author: "Sarah Sernaker"
#' ---
# install.packages('breakaway')

library('breakaway')

datTab <- read.table("neotropical_soil_175_samples.OTU.protists_cleaned_forests_recleaned.table",
                     header = TRUE, sep=",")
names<-colnames(datTab)


# NOTE(review): `-1:-1` is simply -1 (drop the first element); the `names`
# variable also shadows base::names() and appears unused afterwards.
names<-names[-1:-1]
datTab<-datTab[,-1:-1]  # take off first column


# One frequency table per sample column, as a data frame
freqTab <- lapply(apply(datTab,2,table),as.data.frame) 
freqTab <- lapply(freqTab,function(x) x[x[,1]!=0,])  # create frequency vector f_i= #(i)

totalSpecies <- unlist(lapply(freqTab,function(x) sum(x[,2]))) # sum of observed species

# One row per sample: breakaway estimate, SE, and 95% CI bounds
estimates_baway_newdata <- matrix(NA,nrow=length(freqTab),ncol=4)
rownames(estimates_baway_newdata) <- names(freqTab)
colnames(estimates_baway_newdata) <- c("baway_est","baway_seest","baway_lcb","baway_ucb")

## breakaway estimates of total species, std error, and 95% C.I. per sample.
for (i in seq_along(freqTab)) {
    baway1 <- try(breakaway(freqTab[[i]], plot = FALSE, print = FALSE,
                            answers = TRUE),
                  silent = TRUE)
    ## Guard against try() failures explicitly: on a "try-error" object
    ## (an atomic vector), `baway1$est` would itself raise an error under
    ## current R, defeating the purpose of the try().
    if (!inherits(baway1, "try-error") && !is.null(baway1$est)) {
      # if it works, store it
      estimates_baway_newdata[i,1] <- baway1$est
      estimates_baway_newdata[i,2] <- baway1$seest
      estimates_baway_newdata[i,3:4] <- baway1$ci
    }
}
estimates_baway_newdata

# write.table(estimates_baway_newdata, "bwayEsts_total.csv", sep=",")

# apply betta function using breakaway total estimates as initial estimates
bettaEsts<-betta(estimates_baway_newdata[,1],estimates_baway_newdata[,2])

# plot with breakaway output
# NOTE(review): xlim and the `1:4` loop hard-code four estimates
# (three forests + total); adjust if the sample set changes.
# pdf("confInts_breakaway.pdf",height=6,width=10)
plot(0,0,type="n",bty="n",main="95% confidence intervals of estimated diversity"
     ,xlab=" ", ylab="breakaway estimates of total diversity", xaxt='n', xlim=c(.5,4), 
     ylim=c(0,30000))
for (i in 1:4){
  lines(c(i,i),c(estimates_baway_newdata[i,3],estimates_baway_newdata[i,4]))
  points(i,estimates_baway_newdata[i,1], pch='-')
}
axis(1,1:length(bettaEsts$blups),tick=F,labels=rownames(estimates_baway_newdata),
     las=3,cex.axis=1)
# dev.off()

# plot with betta output: approximate 95% CIs from the BLUPs +/- 1.96 SE,
# truncated at zero
# pdf("confInts_betta.pdf",height=6,width=10)
conf_ints <-matrix(rep(0), length(bettaEsts$blups),2)
rownames(conf_ints) <- rownames(estimates_baway_newdata)
colnames(conf_ints)<-c("Lower", "Upper")
plot(0,0,type="n",bty="n",main="95% confidence intervals of estimated diversity"
     ,xlab=" ",ylab="betta estimates of total diversity", xaxt='n', xlim=c(.5,4), 
     ylim=c(0,30000))
for (i in 1:length(bettaEsts$blups)){ #length(bettaEsts$blups)
  conf_ints[i,1]<-max(0,bettaEsts$blups[i]-1.96*bettaEsts$blupses[i])
  conf_ints[i,2]<-bettaEsts$blups[i]+1.96*bettaEsts$blupses[i]
  lines(c(i,i),c(conf_ints[i,1],conf_ints[i,2]))
  points(i,bettaEsts$blups[i], pch='-')
}
axis(1,1:length(bettaEsts$blups),tick=F,labels=rownames(conf_ints),las=3,cex.axis=1)
# NOTE(review): the matching pdf() call above is commented out, so this
# dev.off() closes whichever graphics device is currently open -- confirm.
dev.off()
# write.table(conf_ints, "betta_confints.csv", sep=",")

3.18 Beta-diversity comparisons (protists) (Jaccard index) (vegan)

library(vegan)

setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "neotropical_soil_175_samples.OTU.protists.table"
output <- gsub(".table", "", input, fixed = TRUE)

## Load the dataframe (row names = OTU identifiers)
d <- read.table(input, sep = "\t", header = TRUE, row.names = 1)

## -------------------------------------------------------------------------- ##
## Cleaning

## Samples with less than 10,000 reads
## B199_B200, B175_B176, L111_L112, B197_B198, B145_B146, B167_B168, L183_L184, L131_L132, L097_L098, L181_L182
## Outliers
## B173_B174  # Low diversity, mostly made of one OTU of Chlorophyceae Dunaliella

## Reduce (remove useless columns and low samples)
d <- subset(d,
            select = -c(amplicon, total, chimera, identity,
                        taxonomy, references,               
                        B199_B200, B175_B176, L111_L112, B197_B198,
                        B145_B146, B167_B168, L183_L184, L131_L132,
                        L097_L098, L181_L182,
                        B173_B174))

## Identify low samples and outliers
## NOTE(review): rows are still OTUs at this point (transpose comes below),
## so these are per-OTU sums; the fungi script (3.19) runs this check after
## t(d) -- confirm which order is intended.
quantile(rowSums(d))
rowSums(d)
min(rowSums(d))
hist(rowSums(d))

## transpose (to get OTUs in columns)
d <- t(d)

## Randomly subsample the table, so all samples have the same number of reads
d_rarefied <- rrarefy(d, min(rowSums(d)))


## -------------------------------------------------------------------------- ##
## Alpha diversity (global and per sample)

## richness (observed, Chao1 and ACE) on the rarefied table
estimateR(d_rarefied)

## Alpha diversity: evenness (global rank-abundance, then per-sample fits)
plot(colSums(d), log = "y", xlab = "Rank", ylab = "abundance")  # global
mod <- radfit(d)  # (per sample diversity)
plot(mod)
output_file <- paste(output, "_evenness.pdf", sep = "")
dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)

## ## Accumulation curves (using Ugland’s method) (global)
## CC_CURVugland <- specaccum(d, method = "exact", permutations=1000)
## plot(CC_CURVugland)
## output_file <- paste(output, "_ugland.pdf", sep = "")
## dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)

## ## Preston model (global)
## preston <- prestonfit(colSums(d))
## prestondistr <- prestondistr(colSums(d))
## plot(preston)
## lines(prestondistr, line.col = "blue3")
## output_file <- paste(output, "_preston.pdf", sep = "")
## dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)

## Shannon index H (richness + evenness)
## NOTE(review): computed on the full table d, not on d_rarefied -- confirm.
H <- diversity(d, index = "shannon", MARGIN = 1, base = exp(1))
H

## Pielou’s index of evenness: (0-1, 1 = max. evenness)
J <- H/log(dim(d)[2])
J

## Simpson's D index: (richness + evenness, 0-1; 1 - D rises as evenness increases)
D <- diversity(d, "simpson")
D
inv_D <- diversity(d, "invsimpson")
inv_D

## Rényi diversities
R <- renyi(d_rarefied)
plot(R)
output_file <- paste(output, "_renyi.pdf", sep = "")
dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)

## Rarefaction curves
## rarecurve(d, step = 100000, xlab = "sample size (number of reads)", ylab = "number of OTUs")
## output_file <- paste(output, "_rarefaction_curves.pdf", sep = "")
## dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)


## -------------------------------------------------------------------------- ##
## Beta diversity

## Bray Curtis dissimilarity matrix
## NOTE(review): the section heading announces the Jaccard index, but this
## script uses Bray-Curtis; the Jaccard heatmap is built in the next script.
d_rarefied.bray <- vegdist(d_rarefied, method = "bray")
d_rarefied.bray

## NMDS analyses
d_rarefied.bray.nmds <- monoMDS(d_rarefied.bray)
d_rarefied.bray.nmds
plot(d_rarefied.bray.nmds)
output_file <- paste(output, "_NMDS.pdf", sep = "")
dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)
stressplot(d_rarefied.bray.nmds)
output_file <- paste(output, "_NMDS_stress.pdf", sep = "")
dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)

## UPGMA (Unweighted Pair Group Method with Arithmetic Mean)
d_rarefied.bray.hclust <- hclust(d_rarefied.bray, "average")
plot(d_rarefied.bray.hclust)
output_file <- paste(output, "_UPGMA.pdf", sep = "")
dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)
d_rarefied.bray.hclust

quit(save = "no")

Heatmap

library(vegan)
library(tidyr)
library(dplyr)
library(ggplot2)
library(scales)
library(grid)

setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "neotropical_soil_175_samples.OTU.protists.table"
output <- gsub(".table", "", input, fixed = TRUE)

## Load the dataframe (row names = OTU identifiers)
d <- read.table(input, sep = "\t", header = TRUE, row.names = 1)

## -------------------------------------------------------------------------- ##
## Cleaning

## Samples with less than 10,000 reads
## B199_B200, B175_B176, L111_L112, B197_B198, B145_B146, B167_B168, L183_L184, L131_L132, L097_L098, L181_L182
## Outliers
## B173_B174  # Low diversity, mostly made of one OTU of Chlorophyceae Dunaliella

## Reduce (remove useless columns and low samples)
d <- subset(d,
            select = -c(amplicon, total, chimera, identity,
                        taxonomy, references,               
                        B199_B200, B175_B176, L111_L112, B197_B198,
                        B145_B146, B167_B168, L183_L184, L131_L132,
                        L097_L098, L181_L182,
                        B173_B174))

## transpose (to get OTUs in columns)
d <- t(d)

## Randomly subsample the table, so all samples have the same number of reads
d_rarefied <- rrarefy(d, min(rowSums(d)))

## -------------------------------------------------------------------------- ##
## Beta diversity

## Jaccard dissimilarity matrix
## NOTE(review): the output file is named "_jaccard.csv" but is written
## tab-separated -- confirm the extension is intended.
d_rarefied.jaccard <- vegdist(d_rarefied, method = "jaccard")
jaccard <- as.matrix(as.dist(d_rarefied.jaccard))
output_file <- paste(output, "_jaccard.csv", sep = "")
write.table(jaccard, file = output_file, sep = "\t", row.names = TRUE)

## Reshape the square matrix into long format (sampleA, sampleB, distance);
## distance > 0 drops the diagonal (self-comparisons)
## colnames(d) <- c("sampleA", "sampleB", "sizeA", "sizeB", "commonA", "commonB")
jaccard2 <- as.data.frame(jaccard)
jaccard2 <- tbl_df(jaccard2)
samples <- as.data.frame(colnames(jaccard2))
colnames(samples) <- c("sampleA")
jaccard3 <- bind_cols(samples, jaccard2)
jaccard3 <- gather(jaccard3, "sampleB", "distance", 2:ncol(jaccard3)) %>%
    filter(distance > 0) %>%
    mutate(similarity = 1 - distance)

## Axis positions of the last Barro and La Selva samples, presumably the
## forest boundaries on the heatmap.
## NOTE(review): levels() assumes sampleA was read in as a factor
## (stringsAsFactors-era R) -- confirm under R >= 4.0.
breaks <- which(levels(jaccard3$sampleA) %in% c("B193_B194", "L199_L200"))

## Plot: similarity heatmap with lines separating the three forests
ggplot(jaccard3, aes(x = sampleA, y = sampleB)) +
    theme_bw() +
    geom_tile(aes(fill = similarity), colour = "white") +
    scale_fill_gradient(low = "white", high = "steelblue") +
    theme(axis.text.x = element_text(size = 5, angle = 270, vjust = 0.5, hjust = 0),
          axis.text.y = element_text(size = 5),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_blank(),
          panel.background = element_blank(),
          axis.title.x = element_blank(),
          axis.title.y = element_blank()) +
     geom_vline(size = 0.1, xintercept = breaks + c(0.5, 0.5)) +
     geom_hline(size = 0.1, yintercept = breaks + c(0.5, 0.5))

## Output file (ggsave writes the last plot displayed)
output_file <- paste(output, "_jaccard.pdf", sep = "")
ggsave(file = output_file, width = 12 , height = 10)

quit(save = "no")

The three forests appear clearly. However, La Selva seems to be made of a core group of similar samples, and of a loose set of samples bearing little resemblance to each other or to samples from the other forests.

Tiputini samples are similar among themselves but show no resemblance to samples from the two other forests.

3.19 Beta-diversity comparisons (fungi) (Jaccard index) (vegan)

## Alpha- and beta-diversity of the fungal OTU table, computed with vegan
library(vegan)

setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "neotropical_soil_175_samples.OTU.fungi.table"
output <- gsub(".table", "", input, fixed = TRUE)

## Load the dataframe
d <- read.table(input, sep = "\t", header = TRUE)

## -------------------------------------------------------------------------- ##
## Cleaning

## Samples with less than 2,000 reads
## Outliers
## B005_B006  # Low diversity?

## Drop the metadata columns and the low-coverage/outlier samples listed
## above, keeping only sample abundance columns
d <- subset(d,
            select = -c(OTU, amplicon, total, chimera, identity,
                        taxonomy, references,
                        B155_B156, B173_B174, B013_B014, B133_B134,
                        L111_L112, B167_B168, B197_B198, B007_B008,
                        T195_T196, T174, B031_B032,
                        B005_B006))

## transpose (to get OTUs in columns)
d <- t(d)

## Identify low samples and outliers (interactive inspection of
## per-sample read counts)
quantile(rowSums(d))
sort(rowSums(d))[1:20]
min(rowSums(d))
hist(rowSums(d))

## Randomly subsample the table, so all samples have the same number of reads
## (rrarefy draws down to the size of the smallest sample)
d_rarefied <- rrarefy(d, min(rowSums(d)))

## -------------------------------------------------------------------------- ##
## Alpha diversity (global and per sample)

## richness (Chao1 / ACE estimators, on the rarefied table)
estimateR(d_rarefied)

## Alpha diversity: evenness
plot(colSums(d), log = "y", xlab = "Rank", ylab = "abundance")  # global
mod <- radfit(d)  # (per sample diversity)
plot(mod)
output_file <- paste(output, "_evenness.pdf", sep = "")
dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)

## ## Accumulation curves (using Ugland’s method) (global)
## CC_CURVugland <- specaccum(d, method = "exact", permutations=1000)
## plot(CC_CURVugland)
## output_file <- paste(output, "_ugland.pdf", sep = "")
## dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)

## ## Preston model (global)
## preston <- prestonfit(colSums(d))
## prestondistr <- prestondistr(colSums(d))
## plot(preston)
## lines(prestondistr, line.col = "blue3")
## output_file <- paste(output, "_preston.pdf", sep = "")
## dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)

## Shannon index H (richness + evenness), one value per sample (MARGIN = 1)
H <- diversity(d, index = "shannon", MARGIN = 1, base = exp(1))
H

## Pielou’s index of evenness: (0-1, 1 = max. evenness)
## J = H / ln(S), where S is the observed richness of each sample
## (specnumber). The previous formulation, H / log(dim(d)[2]), used the
## total number of OTU columns in the whole table as S, which inflates S
## for every sample and systematically deflates J.
J <- H/log(specnumber(d))
J

## Simpson's D index: (richness + evenness, 0-1; 1 - D rises as evenness increases)
D <- diversity(d, "simpson")
D
inv_D <- diversity(d, "invsimpson")
inv_D

## Rényi diversities (diversity profiles on the rarefied table)
R <- renyi(d_rarefied)
plot(R)
output_file <- paste(output, "_renyi.pdf", sep = "")
dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)

## Rarefaction curves
rarecurve(d, step = 100000, xlab = "sample size (number of reads)", ylab = "number of OTUs")
output_file <- paste(output, "_rarefaction_curves.pdf", sep = "")
dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)


## -------------------------------------------------------------------------- ##
## Beta diversity

## Bray Curtis dissimilarity matrix (computed on the rarefied table)
d_rarefied.bray <- vegdist(d_rarefied, method = "bray")
d_rarefied.bray

## NMDS analyses (ordination plot, then Shepard stress plot)
d_rarefied.bray.nmds <- monoMDS(d_rarefied.bray)
d_rarefied.bray.nmds
plot(d_rarefied.bray.nmds)
output_file <- paste(output, "_NMDS.pdf", sep = "")
dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)
stressplot(d_rarefied.bray.nmds)
output_file <- paste(output, "_NMDS_stress.pdf", sep = "")
dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)

## UPGMA (Unweighted Pair Group Method with Arithmetic Mean)
d_rarefied.bray.hclust <- hclust(d_rarefied.bray, "average")
plot(d_rarefied.bray.hclust)
output_file <- paste(output, "_UPGMA.pdf", sep = "")
dev.copy2pdf(device = x11, file = output_file, out.type = "pdf", width = 12, height = 6)
d_rarefied.bray.hclust

quit(save = "no")

Heatmap

## Heatmap of pairwise Jaccard similarities between samples (fungi)
library(vegan)
library(tidyr)
library(dplyr)
library(ggplot2)
library(scales)
library(grid)

setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "neotropical_soil_175_samples.OTU.fungi.table"
output <- gsub(".table", "", input, fixed = TRUE)

## Load the dataframe
d <- read.table(input, sep = "\t", header = TRUE)

## -------------------------------------------------------------------------- ##
## Cleaning

## Samples with less than 2,000 reads
## Outliers
## B005_B006  # Low diversity

## Reduce (remove useless columns and low samples)
d <- subset(d,
            select = -c(OTU, amplicon, total, chimera, identity,
                        taxonomy, references,
                        B155_B156, B173_B174, B013_B014, B133_B134,
                        L111_L112, B167_B168, B197_B198, B007_B008,
                        T195_T196, T174, B031_B032,
                        B005_B006))

## transpose (to get OTUs in columns)
d <- t(d)

## Randomly subsample the table, so all samples have the same number of reads
d_rarefied <- rrarefy(d, min(rowSums(d)))

## -------------------------------------------------------------------------- ##
## Beta diversity

## Jaccard dissimilarity matrix, written out as a tab-separated file
d_rarefied.jaccard <- vegdist(d_rarefied, method = "jaccard")
jaccard <- as.matrix(as.dist(d_rarefied.jaccard))
output_file <- paste(output, "_jaccard.csv", sep = "")
write.table(jaccard, file = output_file, sep = "\t", row.names = TRUE)

## colnames(d) <- c("sampleA", "sampleB", "sizeA", "sizeB", "commonA", "commonB")
## Reshape the square matrix into a long (sampleA, sampleB, distance)
## table, drop null distances (diagonal) and derive similarities
jaccard2 <- as.data.frame(jaccard)
jaccard2 <- tbl_df(jaccard2)
samples <- as.data.frame(colnames(jaccard2))
colnames(samples) <- c("sampleA")
jaccard3 <- bind_cols(samples, jaccard2)
jaccard3 <- gather(jaccard3, "sampleB", "distance", 2:ncol(jaccard3)) %>%
    filter(distance > 0) %>%
    mutate(similarity = 1 - distance)

## Positions of the last sample of two forests (separation lines)
breaks <- which(levels(jaccard3$sampleA) %in% c("B193_B194", "L199_L200"))

## Plot (fill scale pinned to [0, 1] so colours are comparable across plots)
ggplot(jaccard3, aes(x = sampleA, y = sampleB)) +
    theme_bw() +
    geom_tile(aes(fill = similarity), colour = "white") +
    scale_fill_gradient(low = "white", high = "steelblue") +
    expand_limits(fill = c(0, 1)) +
    theme(axis.text.x = element_text(size = 5, angle = 270, vjust = 0.5, hjust = 0),
          axis.text.y = element_text(size = 5),
          legend.title = element_text(face = "bold"),
          panel.grid.minor = element_blank(),
          panel.grid.major = element_blank(),
          panel.background = element_blank(),
          axis.title.x = element_blank(),
          axis.title.y = element_blank()) +
     geom_vline(size = 0.1, xintercept = breaks + c(0.5, 0.5)) +
     geom_hline(size = 0.1, yintercept = breaks + c(0.5, 0.5))

## Output file
output_file <- paste(output, "_jaccard.pdf", sep = "")
ggsave(file = output_file, width = 12 , height = 10)

quit(save = "no")

3.20 NMDS plot (protists and fungi)

protist

## NMDS ordination of the protist samples (Bray-Curtis distances),
## plotted with and without sample labels
library(vegan)
library(tidyr)
library(dplyr)
library(ggplot2)
library(ggrepel)

setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "neotropical_soil_175_samples.OTU.protists.table"
output <- gsub(".table", "", input, fixed = TRUE)
iterations <- 3e4  # max iterations for ggrepel label placement

## Load the dataframe
d <- read.table(input, sep = "\t", header = TRUE, row.names = 1)

## -------------------------------------------------------------------------- ##
## Cleaning
## Samples with less than 10,000 reads
## B199_B200, B175_B176, L111_L112, B197_B198, B145_B146, B167_B168, L183_L184, L131_L132, L097_L098, L181_L182
## Outliers
## B173_B174  # Low diversity, mostly made of one OTU of Chlorophyceae Dunaliella

## Reduce (remove useless columns and low samples)
d <- subset(d,
            select = -c(amplicon, total, chimera, identity,
                        taxonomy, references,               
                        B199_B200, B175_B176, L111_L112, B197_B198,
                        B145_B146, B167_B168, L183_L184, L131_L132,
                        L097_L098, L181_L182,
                        B173_B174))

## transpose (to get OTUs in columns)
d <- t(d)

## Randomly subsample the table, so all samples have the same number of reads
d_rarefied <- rrarefy(d, min(rowSums(d)))

## Bray Curtis dissimilarity matrix
d_rarefied.bray <- vegdist(d_rarefied, method = "bray")

## NMDS analyses
d_rarefied.bray.nmds <- monoMDS(d_rarefied.bray)

## Extract data scores (https://chrischizinski.github.io/rstats/2014/04/13/vegan-ggplot2/)
stress <- d_rarefied.bray.nmds$stress
samples <- rownames(d_rarefied)
data.scores <- as.data.frame(scores(d_rarefied.bray.nmds))
data.scores$site <- rownames(data.scores)
data.scores$samples <- samples
head(data.scores)

## Add a forest variable (first letter of the sample name encodes the
## field station: B, L or T)
data.scores <- mutate(data.scores, forest = substr(site, 1, 1)) %>% select(-site)
data.scores$forest[data.scores$forest == "B"] <- "Panama (Barro Colorado)"
data.scores$forest[data.scores$forest == "L"] <- "Costa Rica (La Selva)"
data.scores$forest[data.scores$forest == "T"] <- "Ecuador (Tiputini)"
x_min <- min(data.scores$MDS1)
y_max <- max(data.scores$MDS2)
stress_annotation <- paste("stress: ", round(stress, digits = 4), sep = "")
head(data.scores)

## Plot (samples coloured by forest, labelled with non-overlapping names)
ggplot(data = data.scores, aes(x = MDS1, y = MDS2, label = samples)) +
    geom_text_repel(alpha = 0.5,
                    size = 2.75,
                    segment.size = 0.25,
                    segment.color = "grey",
                    max.iter = iterations) +
    geom_point(aes(colour = forest), size = 2) +
    theme_bw(base_size = 16) +
    guides(colour = guide_legend(title = NULL)) +
    theme(legend.justification = c(1,0), legend.position = c(1,0)) +
    annotate("text", x = x_min + abs(x_min / 10), y = y_max, label = stress_annotation) +
    coord_equal()

output_file <- paste(output, "_MDS.pdf", sep = "")
ggsave(output_file, width = 13, height = 8.5)

## Plot (no sample names)
ggplot(data = data.scores, aes(x = MDS1, y = MDS2, colour = forest)) +
    geom_point(size = 3) +
    theme_bw(base_size = 16) +
    guides(colour = guide_legend(title = NULL)) +
    theme(legend.justification = c(1,0), legend.position = c(1,0)) +
    annotate("text", x = x_min + abs(x_min / 10), y = y_max, label = stress_annotation) +
    coord_equal()

output_file <- paste(output, "_MDS_no_labels.pdf", sep = "")
ggsave(output_file, width = 13, height = 8.5)

quit(save = "no")

fungi

## NMDS ordination of the fungal samples (Bray-Curtis distances),
## same layout as the protist script above
library(vegan)
library(tidyr)
library(dplyr)
library(ggplot2)
library(ggrepel)

setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "neotropical_soil_175_samples.OTU.fungi.table"
output <- gsub(".table", "", input, fixed = TRUE)
iterations <- 3e4  # max iterations for ggrepel label placement

## Load the dataframe
d <- read.table(input, sep = "\t", header = TRUE)

## -------------------------------------------------------------------------- ##
## Cleaning
## Samples with less than 2,000 reads
## Outliers
## B005_B006  # Low diversity?

d <- subset(d,
            select = -c(OTU, amplicon, total, chimera, identity,
                        taxonomy, references,
                        B155_B156, B173_B174, B013_B014, B133_B134,
                        L111_L112, B167_B168, B197_B198, B007_B008,
                        T195_T196, T174, B031_B032,
                        B005_B006))

## transpose (to get OTUs in columns)
d <- t(d)

## Randomly subsample the table, so all samples have the same number of reads
d_rarefied <- rrarefy(d, min(rowSums(d)))

## Bray Curtis dissimilarity matrix
d_rarefied.bray <- vegdist(d_rarefied, method = "bray")

## NMDS analyses
d_rarefied.bray.nmds <- monoMDS(d_rarefied.bray)

## Extract data scores (https://chrischizinski.github.io/rstats/2014/04/13/vegan-ggplot2/)
samples <- rownames(d_rarefied)
stress <- d_rarefied.bray.nmds$stress
data.scores <- as.data.frame(scores(d_rarefied.bray.nmds))
data.scores$site <- rownames(data.scores)
data.scores$samples <- samples
head(data.scores)

## Add a forest variable (first letter of the sample name encodes the
## field station: B, L or T)
data.scores <- mutate(data.scores, forest = substr(site, 1, 1)) %>% select(-site)
data.scores$forest[data.scores$forest == "B"] <- "Panama (Barro Colorado)"
data.scores$forest[data.scores$forest == "L"] <- "Costa Rica (La Selva)"
data.scores$forest[data.scores$forest == "T"] <- "Ecuador (Tiputini)"
x_min <- min(data.scores$MDS1)
y_max <- max(data.scores$MDS2)
stress_annotation <- paste("stress: ", round(stress, digits = 4), sep = "")
head(data.scores)


## Plot (samples coloured by forest, labelled with non-overlapping names)
ggplot(data = data.scores, aes(x = MDS1, y = MDS2, label = samples)) +
    geom_text_repel(alpha = 0.5,
                    size = 2.75,
                    segment.size = 0.25,
                    segment.color = "grey",
                    max.iter = iterations) +
    geom_point(aes(colour = forest), size = 2) +
    theme_bw(base_size = 16) +
    guides(colour = guide_legend(title = NULL)) +
    theme(legend.justification = c(1,0), legend.position = c(1,0)) +
    annotate("text", x = x_min + abs(x_min / 10),
             y = y_max, label = stress_annotation) +
    coord_equal()

output_file <- paste(output, "_MDS.pdf", sep = "")
ggsave(output_file, width = 13, height = 8.5)

## Plot (no sample names)
ggplot(data = data.scores, aes(x = MDS1, y = MDS2, colour = forest)) +
    geom_point(size = 3) +
    theme_bw(base_size = 16) +
    guides(colour = guide_legend(title = NULL)) +
    theme(legend.justification = c(1,0), legend.position = c(1,0)) +
    annotate("text", x = x_min + abs(x_min / 10),
             y = y_max, label = stress_annotation) +
    coord_equal()

output_file <- paste(output, "_MDS_no_labels.pdf", sep = "")
ggsave(output_file, width = 13, height = 8.5)

quit(save = "no")

3.21 Dendrograms (protists and fungi)

protist

## UPGMA dendrogram of the protist samples (Bray-Curtis distances),
## drawn with dendextend
library(vegan)
library(tidyr)
library(dplyr)
library(ggplot2)
library(dendextend)
library(dendextendRcpp)

setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "neotropical_soil_175_samples.OTU.protists.table"
output <- gsub(".table", "", input, fixed = TRUE)

## Load the dataframe
d <- read.table(input, sep = "\t", header = TRUE, row.names = 1)

## -------------------------------------------------------------------------- ##
## Cleaning
## Samples with less than 10,000 reads
## B199_B200, B175_B176, L111_L112, B197_B198, B145_B146, B167_B168, L183_L184, L131_L132, L097_L098, L181_L182
## Outliers
## B173_B174  # Low diversity, mostly made of one OTU of Chlorophyceae Dunaliella

## Reduce (remove useless columns and low samples)
d <- subset(d,
            select = -c(amplicon, total, chimera, identity,
                        taxonomy, references,               
                        B199_B200, B175_B176, L111_L112, B197_B198,
                        B145_B146, B167_B168, L183_L184, L131_L132,
                        L097_L098, L181_L182,
                        B173_B174))

## transpose (to get OTUs in columns)
d <- t(d)

## Randomly subsample the table, so all samples have the same number of reads
d_rarefied <- rrarefy(d, min(rowSums(d)))

## Bray Curtis dissimilarity matrix
d_rarefied.bray <- vegdist(d_rarefied, method = "bray")

## UPGMA (Unweighted Pair Group Method with Arithmetic Mean)
dendrogram <- hclust(d_rarefied.bray, "average") %>% as.dendrogram()

## Colour the leaves by forest of origin, based on the first letter of
## each sample name (T = Tiputini, B = Barro Colorado, L = La Selva).
## (A leftover debug expression, grepl("^T", labels(dendrogram)), whose
## value was discarded, has been removed.)
colors <- substr(labels(dendrogram), 1, 1)
colors <- gsub("T", "#7CAE00", colors)
colors <- gsub("B", "#00BFC4", colors)
colors <- gsub("L", "#F8766D", colors)

## Plot and Save
output_file <- paste(output, "_UPGMA_prettier.pdf", sep = "")
pdf(file = output_file, height = 6, width = 14)

## Round coloured leaves, small labels, hanging branches
dendrogram %>%
    set("leaves_pch", 19) %>%
    set("leaves_cex", 1.1) %>%
    set("leaves_col", colors) %>%
    set("labels_cex", 0.6) %>%
    hang.dendrogram %>%
    plot()

dev.off()

quit(save = "no")

fungi

## NOTE(review): this appears under the "Dendrograms" heading but the
## code below reproduces the fungi NMDS plots of section 3.20 (without
## the stress annotation, and with a hard-coded max.iter of 2e4 instead
## of the `iterations` variable used elsewhere). Confirm whether a
## dendextend-based dendrogram, as for the protists above, was intended.
library(vegan)
library(tidyr)
library(dplyr)
library(ggplot2)
library(ggrepel)

setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "neotropical_soil_175_samples.OTU.fungi.table"
output <- gsub(".table", "", input, fixed = TRUE)

## Load the dataframe
d <- read.table(input, sep = "\t", header = TRUE)

## -------------------------------------------------------------------------- ##
## Cleaning
## Samples with less than 2,000 reads
## Outliers
## B005_B006  # Low diversity?

d <- subset(d,
            select = -c(OTU, amplicon, total, chimera, identity,
                        taxonomy, references,
                        B155_B156, B173_B174, B013_B014, B133_B134,
                        L111_L112, B167_B168, B197_B198, B007_B008,
                        T195_T196, T174, B031_B032,
                        B005_B006))

## transpose (to get OTUs in columns)
d <- t(d)

## Randomly subsample the table, so all samples have the same number of reads
d_rarefied <- rrarefy(d, min(rowSums(d)))

## Bray Curtis dissimilarity matrix
d_rarefied.bray <- vegdist(d_rarefied, method = "bray")

## NMDS analyses
d_rarefied.bray.nmds <- monoMDS(d_rarefied.bray)

## Extract data scores (https://chrischizinski.github.io/rstats/2014/04/13/vegan-ggplot2/)
samples <- rownames(d_rarefied)
data.scores <- as.data.frame(scores(d_rarefied.bray.nmds))
data.scores$site <- rownames(data.scores)
data.scores$samples <- samples
head(data.scores)

## Add a forest variable (first letter of the sample name)
data.scores <- mutate(data.scores, forest = substr(site, 1, 1)) %>% select(-site)
data.scores$forest[data.scores$forest == "B"] <- "Panama (Barro Colorado)"
data.scores$forest[data.scores$forest == "L"] <- "Costa Rica (La Selva)"
data.scores$forest[data.scores$forest == "T"] <- "Ecuador (Tiputini)"
head(data.scores)


## Plot
ggplot(data = data.scores, aes(x = MDS1, y = MDS2, label = samples)) +
    geom_text_repel(alpha = 0.5,
                    size = 2.75,
                    segment.size = 0.25,
                    segment.color = "grey",
                    max.iter = 2e4) +
    geom_point(aes(colour = forest), size = 2) +
    theme_bw(base_size = 16) +
    guides(colour = guide_legend(title = NULL)) +
    theme(legend.justification = c(1,0), legend.position = c(1,0)) +
    coord_equal()

output_file <- paste(output, "_MDS.pdf", sep = "")
ggsave(output_file, width = 13, height = 8.5)

## Plot (no sample names)
ggplot(data = data.scores, aes(x = MDS1, y = MDS2, colour = forest)) +
    geom_point(size = 3) +
    theme_bw(base_size = 16) +
    guides(colour = guide_legend(title = NULL)) +
    theme(legend.justification = c(1,0), legend.position = c(1,0)) +
    coord_equal()

output_file <- paste(output, "_MDS_no_labels.pdf", sep = "")
ggsave(output_file, width = 13, height = 8.5)

quit(save = "no")

3.22 OTU accumulation plots (per sample and per forest)

For all n values (from 1 to 175), compute all possible sets of n samples, count the number of OTUs and tally the results (jackknifing).

It is easier for me to do it in python.

# aragorn  (presumably the execution host -- confirm)
cd ~/neotropical_diversity/results/first_155_samples/
# Jackknife OTU accumulation: for each sample-set size n, sample sets of
# n samples per forest and tally OTU counts (median/min/max per forest)
python ../../src/rarefaction_by_sample.py neotropical_soil_175_samples.OTU.protists_cleaned.table > neotropical_soil_175_samples.OTU.protists_cleaned_rarefaction_by_sample_by_forest.data
## Plot the per-forest OTU accumulation (jackknife) curves produced by
## rarefaction_by_sample.py: median number of OTUs with min-max error
## bars, as a function of the number of pooled samples.
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)

setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "neotropical_soil_175_samples.OTU.protists_cleaned_rarefaction_by_sample_by_forest.data"
## input <- "tmp.data"
output <- gsub(".data", "", input, fixed = TRUE)

## Load the dataframe (space-separated: forest, n, median, min, max)
d <- read.table(input, sep = " ", header = FALSE)
colnames(d) <- c("forest", "number_of_samples",
                 "median_OTUs", "min_OTUs", "max_OTUs")

## About 22 evenly spaced x-axis tick positions, derived from the
## largest observed sample count rather than the hard-coded value 88
## (computed before number_of_samples is turned into a factor)
breaks <- floor(seq(1, max(d$number_of_samples), length.out = 22))
d$number_of_samples <- as.factor(d$number_of_samples)

## Rename forests
levels(d$forest)[levels(d$forest) == "Barro"] <- "Panama (Barro Colorado)"
levels(d$forest)[levels(d$forest) == "LaSelva"] <- "Costa Rica (La Selva)"
levels(d$forest)[levels(d$forest) == "Tiputini"] <- "Ecuador (Tiputini)"

## Plot (one facet per forest)
ggplot(data = d, aes(x = number_of_samples, y = median_OTUs)) +
    geom_point(shape = 19, size = 2, color = "firebrick") +
    geom_errorbar(aes(ymax = max_OTUs, ymin = min_OTUs), width = 0.2, color = "darkgrey") +
    scale_y_continuous(labels = comma) +
    scale_x_discrete(breaks = breaks, labels = breaks) +
    theme_bw(base_size = 16) +
    xlab("number of samples") +
    ylab("number of OTUs") +
    facet_grid(forest ~ .)

output_file <- paste(output, "_median.pdf", sep = "")
ggsave(output_file, width = 10, height = 12)

quit(save = "no")

Compute all possible combinations or 1,000,000 combinations, whichever comes first.

3.23 Ciliophora and Colpodea figures

# kl
cd ${HOME}/neotropical_diversity/data/
PROTISTS="neotropical_soil_175_samples.OTU.protists.table"
# Count OTUs (c) and sum reads (s) for each group; column 157 holds the
# per-OTU read total -- TODO confirm against the table header
grep "Ciliophora" "${PROTISTS}" |\
      awk '{c += 1 ; s += $157} END {print c, s}'
grep "Colpodea" "${PROTISTS}" |\
      awk '{c += 1 ; s += $157} END {print c, s}'

After OTU filtering, the protist dataset represents 29,094 OTUs (50,118,359 reads). Ciliophora represent 2,018 OTUs (1,528,421 reads; 3.0% of the protists); and Colpodea represent 1,017 OTUs (999,245 reads; 2.0% of the protists).

kl
cd ${HOME}/neotropical_diversity/data/
PROTISTS="neotropical_soil_175_samples.OTU.protists.table"
# Taxonomic breakdown of Colpodea: keep the read-total column (157) and
# the 6th "|"-separated field of the taxonomy column (160), then sum
# reads (a) and count OTUs (b) per order, sorted by decreasing reads
grep "Colpodea" "${PROTISTS}" | cut -f 157,160 | tr "|" "\t" | cut -f 1,6 | \
    awk '{a[$2] += $1 ; b[$2] += 1} END {for (i in a) {print i, a[i], b[i]}}' | \
    sort -k2,2nr

Colpodea taxonomic breakdown

Colpodea reads OTUs
Platyophryida 704888 424
Colpodida 182506 469
Cyrtolophosidida 105014 105
Bursariomorphida 6837 19

Get the number of unique amplicons:

kl
cd ${HOME}/neotropical_diversity/data/
# Sum the number of unique amplicons assigned to Colpodea. PROTISTS must
# be (re)defined here: this snippet runs in a fresh shell session, so
# the value set in the earlier snippets is gone and grep would otherwise
# receive an empty file name.
PROTISTS="neotropical_soil_175_samples.OTU.protists.table"
STATS="neotropical_soil_175_samples_1f.stats"
grep -F -f <(grep "Colpodea" "${PROTISTS}" | cut -f 2) "${STATS}" | \
awk '{s += $1} END {print s}'

262,787 unique amplicons assigned to Colpodea.

3.24 Stampa plots

3.24.1 Global plots

kl
cd ${HOME}/neotropical_diversity/data/
export LC_ALL=C
DATASET="neotropical_soil_175_samples"

# For each OTU table, build two "stampa" distributions: number of reads,
# and number of OTUs, per percentage of identity with the reference
for TABLE in ${DATASET}.OTU.{protists,fungi}.table ; do
    # Get starting column (position of the "total" column in the header;
    # the identity value sits two columns to its right)
    START=$(head -n 1 "${TABLE}" | tr "\t" "\n" | nl | grep "total" | awk '{print $1}')

    # Sum reads
    awk -v START="${START}" \
        'BEGIN {FS = "\t"}
         {if (NR == 1) {next}
          stampa[$(START+2)] += $START
         } END {
          for (id in stampa) {
              print id, stampa[id]
          }
         }' "${TABLE}" | sort -k1,1n > "${TABLE/.table/_reads.stampa}"

    # Count each OTU as one observation
    awk -v START="${START}" \
        'BEGIN {FS = "\t"}
         {if (NR == 1) {next}
          stampa[$(START+2)] += 1
         } END {
          for (id in stampa) {
              print id, stampa[id]
          }
         }' "${TABLE}" | sort -k1,1n > "${TABLE/.table/_OTUs.stampa}"

done

Inject the results in ggplot2

## Draw one stampa plot (abundance per identity percentage) for each of
## the four distributions produced by the shell loop above.
library(ggplot2)
library(scales)

setwd("~/neotropical_diversity/results/stampa/")
DATASET <- "neotropical_soil_175_samples"

inputs <- paste(DATASET, c(".OTU.protists_reads.stampa",
                           ".OTU.protists_OTUs.stampa",
                           ".OTU.fungi_reads.stampa",
                           ".OTU.fungi_OTUs.stampa"), sep = "")

for (input in inputs) {
    # Build a human-readable title from the file name
    TAXO <- sub("^.*((fungi|protists)_(reads|OTUs)).*$", "\\1", input, perl = TRUE)
    TAXO <- sub("_", " ", TAXO, fixed = TRUE)
    DATASET <- sub(".OTU.*$", "", input)
    DATASET <- sub("biomarks", "BioMarKs", DATASET, fixed = TRUE)
    DATASET <- gsub("_", " ", DATASET, fixed = TRUE)
    TITLE <- paste(DATASET, " (", TAXO, ")", sep="")

    # Load the data (identity percentage, abundance)
    d <- read.table(input, sep = " ", dec = ".")
    colnames(d) <- c("identities", "abundance")
    d$identities <- d$identities / 100

    # Get the max abundance value (used to position the title annotation)
    y_max <- max(d$abundance)

    # Plot. Inside a for-loop, a ggplot object is not auto-printed, so
    # last_plot() is never updated; store the plot and pass it to
    # ggsave() explicitly, otherwise ggsave() would save the previously
    # displayed plot (or fail on the first iteration).
    p <- ggplot(d, aes(x = identities, y = abundance)) +
        geom_segment(aes(xend = identities, yend = 0), colour = "darkred", size = 1) +
        scale_x_continuous(labels = percent, limits = c(0.5, 1)) +
        scale_y_continuous(labels=comma) +
        xlab("identity with a reference sequence") +
        ylab("number of environmental sequences") +
        annotate("text", x = 0.50, y = y_max * 0.9, hjust = 0, colour = "grey", size = 8, label = TITLE)

    ## Output to PDF
    output <- gsub(".stampa", "_stampa.pdf", input, fixed = TRUE)
    ggsave(file = output, plot = p, width = 8 , height = 5)
}

quit(save="no")

3.24.2 Compare with TARA

## Compare protist identity distributions across environments:
## neotropical forest soils vs TARA oceanic surface waters
library(ggplot2)
library(scales)

setwd("~/neotropical_diversity/results/stampa/")

## Oceanic Surface Waters
input <- "~/Science/Projects/TARA/results/Stampa/TARA_V9_370_samples.OTU.protists_reads.stampa"
surface <- read.table(input, sep = " ", dec = ".")
colnames(surface) <- c("identities", "abundance")
surface <- cbind("environment" = "Oceanic Surface Waters", surface)

## Oceanic Deep Sediments (loaded but deliberately left out of the
## final plot, see the commented rbind below)
input <- "~/Science/Projects/Deep_Sea_protists/results/all_samples_protists.data"
benthos <- read.table(input, sep = " ", dec = ".")
colnames(benthos) <- c("identities", "abundance")
benthos <- cbind("environment" = "Oceanic Deep Sediments", benthos)

## Neotropical Forest Soils
input <- "neotropical_soil_175_samples.OTU.protists_reads.stampa"
soil <- read.table(input, sep = " ", dec = ".")
colnames(soil) <- c("identities", "abundance")
soil <- cbind("environment" = "Neotropical Forest Soils", soil)

## Merge and clean the dataset
## environments <- rbind(surface, benthos, soil)
environments <- rbind(soil, surface)
environments <- na.omit(environments)
environments$identities <- environments$identities / 100

## One facet per environment, free y scale
ggplot(environments, aes(x = identities, y = abundance)) +
    ## geom_line(size = 0.1) +
    geom_segment(aes(xend = identities, yend = 0), colour = "darkred", size = 1) +
    scale_x_continuous(labels = percent, limits = c(0.4, 1)) +
    scale_y_continuous(labels=comma) +
    xlab("max % of similarity to reference database") +
    ylab("number of reads") +
    facet_grid(environment ~ ., scales="free_y") +
    theme_bw() +
    theme(axis.title.x = element_text(vjust = 0),
          axis.title.y = element_text(vjust = 1),
          strip.text.y = element_text(size = 9.5))

## Output to PDF
output <- "environments_comparison_stampa_protists.pdf"
ggsave(file = output, width = 5 , height = 5)  ## use height = 7 for plotting 3 environments

quit(save="no")

3.24.3 BioMarKs Illumina V4, V9, Swiss Soils V9 and Neotrop Fungi

## Compare identity distributions across four datasets: BioMarKs
## Illumina V4 and V9 (protists), Swiss soils V9 (protists), and the
## neotropical forest soils (fungi); one facet per environment.
library(ggplot2)
library(scales)

setwd("~/neotropical_diversity/results/stampa/")

## Read a two-column stampa file (identity, abundance) and tag each row
## with the name of the environment it describes
load_stampa <- function(path, environment_name) {
    stampa <- read.table(path, sep = " ", dec = ".")
    colnames(stampa) <- c("identities", "abundance")
    cbind("environment" = environment_name, stampa)
}

## European Coastal Waters (BioMarKs V4 protists)
coast_v4 <- load_stampa("~/Science/Projects/BioMarks/results/Stampa/Illumina_V4/biomarks_v4_illumina.OTU.protists_reads.stampa",
                        "BioMarKs (protists V4)")

## European Coastal Waters (BioMarKs V9 protists)
coast_v9 <- load_stampa("~/Science/Projects/BioMarks/results/Stampa/Illumina_V9/biomarks_v9_illumina.OTU.protists_reads.stampa",
                        "BioMarKs (protists V9)")

## Swiss Soils (V9 protists)
swiss_soils_v9 <- load_stampa("~/Science/Projects/Swiss_forests/data/swiss_forests_V9_29_samples.OTU.protists_reads.stampa",
                              "Swiss Soils (protists V9)")

## Neotropical Forest Soils (fungi)
soil <- load_stampa("neotropical_soil_175_samples.OTU.fungi_reads.stampa",
                    "Neotropical Forest Soils (fungi)")

## Merge and clean the dataset
environments <- rbind(coast_v4, coast_v9, swiss_soils_v9, soil)
environments <- na.omit(environments)
environments$identities <- environments$identities / 100

ggplot(environments, aes(x = identities, y = abundance)) +
    geom_segment(aes(xend = identities, yend = 0), colour = "darkred", size = 1) +
    scale_x_continuous(labels = percent, limits = c(0.4, 1)) +
    scale_y_continuous(labels=comma) +
    xlab("max % of similarity to reference database") +
    ylab("number of reads") +
    facet_grid(environment ~ ., scales="free_y") +
    theme_bw() +
    theme(axis.title.x = element_text(vjust = 0),
          axis.title.y = element_text(vjust = 1),
          strip.text.y = element_text(size = 9.5))

## Output to PDF
output <- "environments_comparison_stampa_biomarks_swiss_protists.pdf"
ggsave(file = output, width = 5 , height = 9)  ## use height = 7 for plotting 3 environments

quit(save="no")

3.24.4 Percentage of assignments (strictly) below 80%

cd ~/neotropical_diversity/results/stampa/

PROJECT="neotropical_soil_175_samples.OTU."

# For each stampa distribution, percentage of reads/OTUs (column 2)
# whose best identity (column 1) is strictly below 80%
for f in ${PROJECT}*.stampa ; do
    awk '{s += $2
          if ($1 < 80.0) {b += $2}
         } END {
             printf "%s\t%.1f\n", FILENAME, 100 * b / s
         }' "${f}" | \
        sed -e "s/${PROJECT}//"
done
  % of reads % of OTUs
protists 75.3 43.6
fungi 2.4 9.1

3.24.5 Percentage of assignments (equal or above 95%)

cd ~/neotropical_diversity/results/stampa/

PROJECT="neotropical_soil_175_samples.OTU."

# For each stampa distribution, percentage of reads/OTUs (column 2)
# whose best identity (column 1) is equal to or above 95%
for f in ${PROJECT}{protists,fungi}_{reads,OTUs}.stampa ; do
    awk '{s += $2
          if ($1 >= 95.0) {b += $2}
         } END {
             printf "%s\t%.1f\n", FILENAME, 100 * b / s
         }' "${f}" | \
        sed -e "s/${PROJECT}//"
done

cd ~/Science/Projects/TARA/results/Stampa/

PROJECT="TARA_V9_370_samples.OTU.protists_reads.stampa"

# Same computation for the TARA table ("$1 >= 95.0" already covers
# "$1 == 100.0", so the redundant second test was removed)
for f in ${PROJECT} ; do
    awk '{s += $2
          if ($1 >= 95.0) {b += $2}
         } END {
             printf "%s\t%.1f\n", FILENAME, 100 * b / s
         }' "${f}"
done
  % of reads % of OTUs
protists 8.1 19.4
fungi 91.2 66.0

For TARA, 68.1% of the reads are assigned with an identity equal to or greater than 95%.

3.24.6 Stampa plots for the top protists taxa

For the top protist taxa (in reads and OTUs), plot stampa plots (6 per plate: 2 columns of 3 taxa).

## Stampa plots for the most abundant protist clades (in reads):
## one facet per clade, reads per identity percentage
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(reshape2)

## Load data
setwd("~/neotropical_diversity/results/stampa/")
input <- "neotropical_soil_175_samples.OTU.protists_cleaned.table"

## Import and format data
d <- read.table(input, sep = "\t", header = TRUE, dec=".") %>% tbl_df()

## Discard all other columns
d <- select(d, one_of("total", "taxonomy", "identity"))

## Extract the third field from the "taxonomy" and store in a new column
d$clade <- apply(d["taxonomy"], 1 , function(x) strsplit(x, "|", fixed = TRUE)[[1]][3])

## Duplicate d
d2 <- select(d, -taxonomy)

## Replace "*" by "Unknown", rename ambiguous clade labels, and discard "Chimera"
d2 <- d2 %>% filter(clade != "Chimera")
d2$clade[d2$clade == "*"] <- "Unknown"
d2$clade[d2$clade == "Alveolata_X"] <- "Alveolata incertae sedis"
d2$clade[d2$clade == "Amoebozoa_X"] <- "Amoebozoa incertae sedis"
d2$clade[d2$clade == "Stramenopiles_X"] <- "non-Ochrophyta Stramenopiles"

## Group by clade and percentage of identity ---------------------- (sum reads)
d2 <- d2 %>%
    group_by(clade, identity) %>%
        summarise(reads = sum(total))

## List clades that have significative abundances (> 0.1% of reads)
main_taxa <- d2 %>%
    group_by(clade) %>%
    tally(wt = reads, sort = TRUE) %>%
    mutate(percentage = 100 * n / sum(d2$reads)) %>%
    filter(percentage > 0.1) %>%
    select(-n, -percentage)

main_taxa

## All rows in d2 that have a match in main_taxa
d2 <- semi_join(d2, main_taxa, by = "clade")

## Produce the stampa plots (facet)
ggplot(d2, aes(x = identity / 100, y = reads)) +
    geom_segment(aes(xend = identity / 100, yend = 0), colour = "darkred", size = 1) +
    scale_x_continuous(labels = percent, limits = c(0.4, 1)) +
    scale_y_continuous(labels = comma) +
    xlab("max % of similarity to reference database") +
    ylab("number of reads") +
    facet_wrap(~ clade, scales = "free_y", ncol = 2) +
    theme_bw() +
    theme(axis.title.x = element_text(vjust = 0),
          axis.title.y = element_text(vjust = 1),
          strip.text.y = element_text(size = 9.5))

output <- "neotropical_soil_175_samples.OTU.protists_cleaned_top_taxa_reads_stampa_plots.pdf"
ggsave(output, width = 18 , height = 26, units = "cm")

## ----------------------------------------------------------------- (sum OTUs)

## Duplicate d (same cleaning as above, but counting OTUs instead of reads)
d2 <- select(d, -taxonomy)

## Replace "*" by "Unknown", rename ambiguous clade labels, and discard "Chimera"
d2 <- d2 %>% filter(clade != "Chimera")
d2$clade[d2$clade == "*"] <- "Unknown"
d2$clade[d2$clade == "Alveolata_X"] <- "Alveolata incertae sedis"
d2$clade[d2$clade == "Amoebozoa_X"] <- "Amoebozoa incertae sedis"
d2$clade[d2$clade == "Stramenopiles_X"] <- "non-Ochrophyta Stramenopiles"

## List clades that have significative abundances (> 0.1% of reads)
## NOTE(review): the denominator nrow(d) still includes the Chimera rows
## filtered out of d2 above -- confirm this slight inflation of the
## denominator is intended
main_taxa <- d2 %>%
    count(clade) %>%
    arrange(desc(n)) %>%
    mutate(percentage = 100 * n / nrow(d)) %>%
    filter(percentage > 0.1) %>%
    select(-n, -percentage)

main_taxa

## Group by clade and percentage of identity (one OTU = one observation)
d2 <- d2 %>%
    group_by(clade, identity) %>%
    tally() %>%
    rename(OTUs = n)

## All rows in d2 that have a match in main_taxa
d2 <- semi_join(d2, main_taxa, by = "clade")

## Produce the stampa plots (facet)
ggplot(d2, aes(x = identity / 100, y = OTUs)) +
    geom_segment(aes(xend = identity / 100, yend = 0), colour = "darkred", size = 1) +
    scale_x_continuous(labels = percent, limits = c(0.4, 1)) +
    scale_y_continuous(labels = comma) +
    xlab("max % of similarity to reference database") +
    ylab("number of OTUs") +
    facet_wrap(~ clade, scales = "free_y", ncol = 3) +
    theme_bw() +
    theme(axis.title.x = element_text(vjust = 0),
          axis.title.y = element_text(vjust = 1),
          strip.text.y = element_text(size = 9.5))

output <- "neotropical_soil_175_samples.OTU.protists_cleaned_top_taxa_OTUs_stampa_plots.pdf"
ggsave(output, width = 20 , height = 19, units = "cm")

quit(save="no")

3.25 Extract all protists amplicons for all samples

They need the cleaned reads from just the protists. I will prepare two files: one with the OTU representatives only, and one with all amplicons.

kl
cd ${HOME}/neotropical_diversity/data/

# Derive all input/output file names from the protist OTU table name
OTUs="neotropical_soil_175_samples.OTU.protists.table"
OTU_REPRESENTATIVES="${OTUs/.OTU.protists.table/_1f_representatives.fas}"
PROTIST_OTU_REPRESENTATIVES="${OTU_REPRESENTATIVES/_1f_representatives/_protist_OTU}"
PROTIST_OTU_ALL_AMPLICONS="${OTU_REPRESENTATIVES/_1f_representatives/_protist_amplicons}"
SWARMS="${OTU_REPRESENTATIVES/_representatives.fas/.swarms}"
ALL_FASTA="${OTUs/.OTU.protists.table/.fas}"
AMPLICON_TABLE="neotropical_soil_175_samples.amplicons.table"
PROTIST_AMPLICONS_FASTA="${AMPLICON_TABLE/.amplicons.table/_protist_amplicons.fas}"
PROTIST_AMPLICONS_TABLE="${AMPLICON_TABLE/.amplicons.table/_protist_amplicons.table}"

export LC_ALL=C

# Make a new fasta file (representatives)
# (column 2 of the OTU table holds the representative amplicon name)
SEEDS=$(mktemp)
tail -n +2 "${OTUs}" | cut -f 2 > "${SEEDS}"
grep -A 1 -F -f "${SEEDS}" "${OTU_REPRESENTATIVES}" | \
    sed -e '/^--$/d' > "${PROTIST_OTU_REPRESENTATIVES}"

# Make a new fasta file (all amplicons): collect all amplicon names from
# the swarms whose seed is a protist OTU, then pull their sequences
PROTISTS=$(mktemp)
grep -F -f "${SEEDS}" "${SWARMS}" | \
    sed -e '/^--$/d' | tr " " "\n" > "${PROTISTS}"
grep -A 1 -F -f "${PROTISTS}" "${ALL_FASTA}" | \
    sed -e '/^--$/d' > "${PROTIST_OTU_ALL_AMPLICONS}"
rm -f "${PROTISTS}" "${SEEDS}"

# Provide an amplicon2sample mapping
# NOTE(review): ${PROTIST_AMPLICONS_FASTA} is read here but never
# created by this snippet -- presumably it is the file produced above as
# ${PROTIST_OTU_ALL_AMPLICONS}; confirm before re-running
AMPLICONS=$(mktemp)
grep "^>" "${PROTIST_AMPLICONS_FASTA}" | \
    tr -d ">" | cut -d "_" -f 1 > "${AMPLICONS}"
grep -F -f "${AMPLICONS}" "${AMPLICON_TABLE}" | \
    sed -e '/^--$/d' > "${PROTIST_AMPLICONS_TABLE}"
rm "${AMPLICONS}"

# Compress and publish
tar cvfj neotropical_175_samples.tar.bz2 "${PROTIST_OTU_REPRESENTATIVES}" "${PROTIST_OTU_ALL_AMPLICONS}" "${PROTIST_AMPLICONS_TABLE}" "${SWARMS}" "${OTUs}"
cp neotropical_175_samples.tar.bz2 /scratch/WWW/

Sanity check

# kl: presumably a shell alias/login for the compute node -- TODO confirm
kl
cd ${HOME}/neotropical_diversity/data/
# Total reads in the protist fasta (headers are ">name_abundance")
awk -F "_" '/^>/ {s += $2} END {print s}' neotropical_soil_175_samples_protist_amplicons.fas
# Total reads in the OTU table (column 157 -- presumably the "total" column; verify)
tail -n +2 neotropical_soil_175_samples.OTU.protists.table | cut -f 157 | awk '{s += $1} END {print s}'

# Amplicon table
# The two counts below should match: one table line per amplicon, one
# fasta header per amplicon
wc -l neotropical_soil_175_samples_protist_amplicons.table
grep -c "^>" neotropical_soil_175_samples_protist_amplicons.fas

Correct: 50,118,536 reads; 10,567,804 amplicons

Compute sample size

# kl: presumably a shell alias/login for the compute node -- TODO confirm
kl
cd ${HOME}/neotropical_diversity/data/
# Sum the reads of each sample column of the amplicon table.
# Row 1 holds the sample names (stored in n[]); data fields start at
# column 2. In the END block NF comes from the last input record (all
# rows have the same width, so this is safe).
# NOTE(review): "i < NF" skips the LAST column -- presumably a "total"
# column that must not be counted as a sample; TODO confirm.
awk '{if (NR == 1) {
         for (i=2 ; i<NF; i++) {
             n[i] = $i
         }
      } else {
         for (i=2 ; i<NF; i++) {
             s[i] += $i
         }
      }
     } END {
         OFS = "\t"
         for (i=2; i<NF ; i++) {
             print i - 1, n[i], s[i]
         }
     }' neotropical_soil_175_samples.amplicons.table

Sample sizes

  sample reads
1 B005_B006 544015
2 B007_B008 626531
3 B010 271333
4 B011_B012 848347
5 B013_B014 170268
6 B020 548572
7 B029_B030 933695
8 B030 417321
9 B031_B032 959889
10 B033_B034 1216202
11 B035_B036 933791
12 B037_B038 741346
13 B039_B040 1176220
14 B040 571901
15 B043_B044 935807
16 B045_B046 1013012
17 B047_B048 649547
18 B050 196771
19 B051_B052 1275897
20 B060 580906
21 B070 62909
22 B080 720303
23 B081_B082 1024313
24 B090 783374
25 B100 964485
26 B129_B130 797114
27 B133_B134 657427
28 B135_B136 693092
29 B143_B144 612093
30 B145_B146 552616
31 B155_B156 594226
32 B163_B164 738102
33 B167_B168 245006
34 B173_B174 414030
35 B175_B176 613849
36 B177_B178 1842450
37 B183_B184 742968
38 B185_B186 622912
39 B193_B194 767107
40 B197_B198 279561
41 B199_B200 852764
42 L001_L002 1719339
43 L005_L006 1555331
44 L007_L008 1648277
45 L010 516281
46 L011_L012 1679746
47 L013_L014 1760964
48 L015_L016 1570737
49 L018 1830826
50 L019_L020 758339
51 L020 295923
52 L021_L022 1487575
53 L023_L024 1581819
54 L025_L026 1936827
55 L027_L028 386383
56 L030 1028779
57 L031_L032 1695035
58 L035_L036 1899519
59 L037_L038 2253304
60 L039_L040 1240330
61 L040 655327
62 L041_L042 1920214
63 L043_L044 1361793
64 L045_L046 911262
65 L049_L050 631336
66 L050 679930
67 L051_L052 1247906
68 L053_L054 1153604
69 L055_L056 793091
70 L057_L058 840963
71 L059_L060 804766
72 L060 634552
73 L061_L062 973130
74 L063_L064 1114401
75 L065_L066 885233
76 L067_L068 800978
77 L069_L070 900132
78 L070 588745
79 L071_L072 1071586
80 L073_L074 616349
81 L075_L076 803422
82 L077_L078 863032
83 L079_L080 971827
84 L080 279493
85 L081_L082 742182
86 L083_L084 876820
87 L085_L086 805254
88 L089_L090 684859
89 L090 588390
90 L092 1474852
91 L093_L094 1143333
92 L095_L096 919568
93 L097_L098 905526
94 L099_L100 343536
95 L100 682484
96 L101_L102 1052283
97 L103_L104 624284
98 L109_L110 788820
99 L111_L112 768
100 L115_L116 833297
101 L117_L118 1213172
102 L119_L120 687580
103 L123_L124 769025
104 L125_L126 892299
105 L129_L130 256776
106 L131_L132 708469
107 L137_L138 1027034
108 L139_L140 748716
109 L145_L146 371367
110 L151_L152 802965
111 L155_L156 845705
112 L159_L160 1021533
113 L161_L162 566274
114 L165_L166 921133
115 L171_L172 350808
116 L173_L174 340209
117 L175_L176 63302
118 L177_L178 894549
119 L179_L180 772079
120 L181_L182 933643
121 L183_L184 947722
122 L185_L186 915254
123 L187_L188 923912
124 L189_L190 803930
125 L191_L192 933793
126 L193_L194 706206
127 L195_L196 835502
128 L197_L198 750542
129 L199_L200 988862
130 T105_T106 1217516
131 T107_T108 847051
132 T109_T110 838486
133 T111 918878
134 T125_T126 494750
135 T127_T128 1079855
136 T143_T144 1128481
137 T151_T152 905167
138 T154 877436
139 T159_T160 1065079
140 T163_T164 958921
141 T165 1062909
142 T167_T168 721980
143 T169_T170 1131511
144 T171_T172 823720
145 T174 87728
146 T175_T176 878340
147 T177_T178 861804
148 T179_T180 800664
149 T182 444419
150 T185_T186 613303
151 T194 804032
152 T195_T196 613470
153 T197_T198 521303
154 T199_T200 623324

The smallest sample is L111_L112 with 768 reads.

100 L111_L112 768 22 B070 62909 118 L175_L176 63302 146 T174 87728 6 B013_B014 170268 19 B050 196771 34 B167_B168 245006 106 L129_L130 256776 4 B010 271333 85 L080 279493 … 47 L011_L012 1679746 58 L031_L032 1695035 43 L001_L002 1719339 48 L013_L014 1760964 50 L018 1830826 37 B177_B178 1842450 59 L035_L036 1899519 63 L041_L042 1920214 55 L025_L026 1936827 60 L037_L038 2253304

library(ggplot2)
library(scales)

## FIX: the original called setwd("${HOME}neotropical_diversity/results/");
## "${HOME}" is shell syntax that R does not expand (and the "/" was
## missing), so the setwd() failed. R expands "~" itself.
setwd("~/neotropical_diversity/results/")
input <- "neotropical_soil_175_samples.sample_sizes"

## Load the per-sample read counts (rank, sample name, number of reads)
d <- read.table(input, sep = "\t")
colnames(d) <- c("number", "samples", "reads")

## Sample names sorted by ascending read count (x-axis order below)
sorted_index <- order(d$reads)

ordered_samples <- d[sorted_index, c(2)]

## Report the median sample size
median(d$reads)

## One point per sample, sorted by size, median as a dashed line,
## rug on the right to show the density of sample sizes
ggplot(d, aes(x = samples, y = reads)) +
    geom_point() +
    theme_bw() +
    geom_hline(aes(yintercept = median(reads)),
               color = "darkred", linetype = "dashed", size = 0.5) +
    geom_rug(sides = "r", color = "grey") +
    scale_y_continuous(labels = comma,
                       name = "sample size (in reads)") +
    scale_x_discrete(limits = ordered_samples,
                     breaks = NULL,
                     name = "all 154 samples (sorted by size)")

## Output to PDF
output <- gsub(".sample_sizes", ".sample_sizes.pdf", input, fixed = TRUE)
ggsave(file = output, width = 7 , height = 4)

quit(save="no")

3.26 Group specific Stampa plots

Group specific (10 top taxa) stampa plots.

protocol:

  • get the table,
  • list top 10 taxa,
  • produce the stampa plots,
# aragorn
cd ~/neotropical_diversity/results/stampa/
export LC_ALL=C
TABLE="neotropical_soil_175_samples.OTU.protists.table"
TMP_TABLE=$(mktemp)
TMP_LIST=$(mktemp)

# Get starting column
# Column number of the "total" field in the table header
START=$(head -n 1 "${TABLE}" | tr "\t" "\n" | nl | grep "total" | awk '{print $1}')

# Sum reads per taxonomic group (level 3)
# The taxonomy string (column START+3) is "|"-separated; a[3] is the
# third level. Empty or "*" assignments are binned as "Unknown".
awk -v START="${START}" \
    'BEGIN {FS = "\t"}
     {if (NR == 1) {next}
          split($(START+3), a, "|")
          taxon = a[3]
          if (taxon == "" || taxon == "*") {
              taxon = "Unknown"
          }
          taxa[taxon] += $START
     } END {
      for (taxon in taxa) {
          print taxon, taxa[taxon]
      }
     }' "${TABLE}" | sort -k2,2nr | \
         nl -w1 -s " " | tr " " "\t" > "${TMP_LIST}"

# Produce stampa data
# TMP_LIST lines are: rank <tab> taxon <tab> reads
while read RANK TAXON READS ; do
    grep "|${TAXON}|" "${TABLE}" > "${TMP_TABLE}"
    # Sum reads per percentage of identity (column START+2)
    awk -v START="${START}" \
        'BEGIN {FS = "\t"}
         {if (NR == 1) {next}
          stampa[$(START+2)] += $START
         } END {
          for (id in stampa) {
              print id, stampa[id]
          }
         }' "${TMP_TABLE}" | sort -k1,1n > "stampa_${RANK}_${TAXON}.tmp"
    # Produce plots (R script is available in the next code block)
    Rscript stampa_plots.R ${RANK} ${TAXON} ${READS}
    rm "stampa_${RANK}_${TAXON}.tmp"
done < "${TMP_LIST}"

zip -r neotropical_soil_175_samples_protists_group_specific stampa_*.pdf

# clean
rm "${TMP_TABLE}" "${TMP_LIST}" Rplots.pdf stampa_*.pdf

1 Apicomplexa 42487526 2 Cercozoa 2418013 3 Ciliophora 1528426 4 Conosa 1062845 5 Lobosa 648049 6 Chlorophyta 485348 7 Ochrophyta 464849 8 Stramenopiles_X 274776 9 Dinophyta 142656 10 Amoebozoa_X 136324 11 Rhodophyta 135433 12 Haptophyta 69650 13 Hilomonadea 61479 14 Unknown 58279 15 Discoba 57474 16 Choanoflagellida 25558 17 Centroheliozoa 24596 18 Mesomycetozoa 19823 19 Apusomonadidae 8120 20 Perkinsea 4862 21 Malawimonadidae 1181 22 Telonemia 650 23 Glaucophyta 546 24 Metamonada 543 25 Cryptophyta 541 26 Alveolata_X 473 27 Katablepharidophyta 216 28 Opisthokonta_X 153 29 Breviatea 134 30 Chimera 11 31 Radiolaria 2

#!/usr/bin/Rscript

library(ggplot2)
library(scales)

setwd("~/neotropical_diversity/results/stampa/")
DATASET <- "neotropical_soil_175_samples"



## NOTE(review): this quit() makes everything below unreachable. The code
## after it also ignores the RANK/TAXON/READS arguments this script is
## invoked with (see the shell loop above), and read.table() is handed a
## 4-element vector (only the first element would be read, with a
## warning). Presumably a leftover from an earlier, generic stampa
## script -- TODO confirm which version was actually executed.
quit(save="no")

## Build the four expected input file names (reads/OTUs x protists/fungi)
input <- paste(DATASET, c(".OTU.protists_reads.stampa",
                           ".OTU.protists_OTUs.stampa",
                           ".OTU.fungi_reads.stampa",
                           ".OTU.fungi_OTUs.stampa"), sep = "")

# Get the name of the taxa
TAXO <- sub("^.*((fungi|protists)_(reads|OTUs)).*$", "\\1", input, perl = TRUE)
TAXO <- sub("_", " ", TAXO, fixed = TRUE)
DATASET <- sub(".OTU.*$", "", input)
DATASET <- sub("biomarks", "BioMarKs", DATASET, fixed = TRUE)
DATASET <- gsub("_", " ", DATASET, fixed = TRUE)
TITLE <- paste(DATASET, " (", TAXO, ")", sep="")

# Load the data (two columns: % identity, abundance)
d <- read.table(input, sep = " ", dec = ".")
colnames(d) <- c("identities", "abundance")
d$identities <- d$identities / 100

# Get the max abundance value (used to place the title annotation)
y_max <- max(d$abundance)

# Plot: abundance of environmental sequences per identity level
ggplot(d, aes(x = identities, y = abundance)) +
    geom_segment(aes(xend = identities, yend = 0), colour = "darkred", size = 1) +
    scale_x_continuous(labels = percent, limits = c(0.5, 1)) +
    scale_y_continuous(labels=comma) +
    xlab("identity with a reference sequence") +
    ylab("number of environmental sequences") +
    annotate("text", x = 0.50, y = y_max * 0.9, hjust = 0, colour = "grey", size = 8, label = TITLE)

## Output to PDF
output <- gsub(".stampa", "_stampa.pdf", input, fixed = TRUE)
ggsave(file = output, width = 8 , height = 5)

quit(save="no")

3.27 Phylogenetic placement

For a detailed description see code_supplement_epa.html.

3.27.1 Unassigned and ambiguously placed amplicons

Lucas sent us two files:

  • no_clade (amplicons falling outside any known big clade),
  • uncertain (amplicons that can be placed in at least two big clades)

How many reads does it represent?

# aragorn
cd ~/neotropical_diversity/results/first_155_samples/phylogenetic_placement/

BIG_FASTA="../neotropical_soil_175_samples.fas"
BIG_SWARMS="../neotropical_soil_175_samples_1f.swarms"
BIG_TABLE="../neotropical_soil_175_samples.OTU.table"
TMP=$(mktemp)

# Most steps below are commented out -- presumably they were run once to
# produce the counts reported in the table further down, and only the
# final seeds-only extraction still executes; TODO confirm.
for f in no_clade uncertain ; do
    # # extract fasta
    # grep -A 1 -F -f ${f} ${BIG_FASTA} | sed '/^--$/d' > ${f}.fas
    # # how many reads?
    # awk 'BEGIN {FS = "_"} {s += $2} END {print s}' ${f}.fas
    # # how many OTUs?
    # grep -F -f ${f} ${BIG_SWARMS} | \
    #     sed '/^--$/d' | \
    #     cut -d "_" -f 1 | \
    #     sort -du > ${TMP}
    # wc -l < ${TMP}
    # # extract OTU table
    # (head -n 1 ${BIG_TABLE} ; \
    #     grep -F -f ${TMP} ${BIG_TABLE} | sed '/^--$/d') > ${f}.OTU.table
    # # How many reads in the OTUs?
    # tail -n +2 ${f}.OTU.table | awk '{s += $157} END {print s}'
    # select only OTUs where unclassified or uncertain amplicons are seeds
    (head -n 1 ${f}.OTU.table ; \
        grep -F -f ${f} ${f}.OTU.table | sed '/^--$/d') > ${f}.OTU.seeds_only.table
done

rm -f ${TMP}

# clean uncompressed files
dataset unique amplicons #reads #OTUs #reads in OTUs #OTUs (seeds only)
no_clade 1314 2807 41 60175 21
uncertain 574833 2848711 3672 32814164 2211

Lucas sent us two lists of unique amplicons. Each amplicon can represent one or several reads (i.e. observations). For instance, the 1,314 unassigned unique amplicons ("no clade") represent 2,807 reads. According to swarm, those 1,314 unique amplicons belong to 41 distinct OTUs, themselves representing 60,175 reads. As you can see, the numbers of reads do not match.

Let's take a step back. The objective of swarm is to reduce noise by identifying locally significant amplicons (seeds) and attaching surrounding less-abundant amplicons to them. Here "locally" means in a particular region of the amplicon-space, not a particular sampling site. Empirical results show that the radius of an OTU is correlated to the abundance of its seed: an abundant seed will result in a larger OTU. In some cases the gravity center of those OTUs is assigned to a given clade, while some less-abundant amplicons living at the margin of the OTU are assigned outside that clade. That's expected, and explains the discrepancy.

Swarm does an excellent job at clustering, with very little over-grouping. Knowing that, I consider it is safe to focus only on OTUs where a "no clade" amplicon is the seed ("seeds_only.table").

For the "no clade" amplicons, the largest and most interesting OTU is e6fe43d2b871250a4670f6bfc167941ee18b5c3d, containing 834 unique amplicons, representing 1,950 reads, assigned to the Amoebozoa Lobosa (Stygamoebida) with 79.2% similarity (reference: AB330051.1). That OTU is present in several samples.

# aragorn
cd ~/neotropical_diversity/results/first_155_samples/phylogenetic_placement/
# Swarm stats line of the largest "no clade" OTU (looked up by its seed)
bzcat ../neotropical_soil_175_samples_1f.stats.bz2 | grep e6fe43d2b871250a4670f6bfc167941ee18b5c3d
# how many reads were tagged as "no clade"?
# Counts the amplicon names present BOTH in the no_clade list and in that
# OTU's swarm line (uniq -d keeps only duplicated, i.e. shared, names)
(cat no_clade ;
    bzcat ../neotropical_soil_175_samples_1f.swarms.bz2 | \
        grep -m 1 "^e6fe43d2b871250a4670f6bfc167941ee18b5c3d" | \
        tr " " "\n" | cut -d "_" -f 1) | sort -d | uniq -d | wc -l

Of the 834 unique amplicons contained in that swarm OTU, 833 are tagged as "no clade" by Lucas's protocol. I think it speaks in favor of swarm's recall.

3.27.2 Unassigned and ambiguously placed OTUs

  1. 2,231 protist OTUs were removed because their centroid was placed outside of any known clade, or equally distributed outside and within known clades. How many reads were in these 2,231 OTUs?

Lucas listed 21 OTUs placed in no clade and 2,211 OTUs with uncertain placement. That's a total of 2,232 OTUs we can discard from our dataset.

# aragorn
# FIX: the original had "cd ${HOME}neotropical_diversity/..." -- the "/"
# after ${HOME} was missing, so the cd failed (compare with the other
# code blocks in this document).
cd "${HOME}/neotropical_diversity/results/first_155_samples/"

TABLE="neotropical_soil_175_samples.OTU.protists.table"
# Column number of the "total" field in the table header
TOTAL=$(head -n 1 "${TABLE}" | \
    tr "\t" "\n" | nl | grep "total" | awk '{print $1}')
TMP=$(mktemp)
# OTUs flagged by the phylogenetic placement as unplaceable ("no_clade")
# or ambiguously placed ("uncertain")
cat ../phylogenetic_placement/otus/{no_clade,uncertain} > "${TMP}"

# how many OTUs to discard?
wc -l < "${TMP}"

# how many reads do these OTUs represent?
# ($TOTAL in awk is the field whose number is the shell variable TOTAL)
grep -F -f "${TMP}" "${TABLE}" | awk -v TOTAL="${TOTAL}" '{s += $TOTAL} END {print s}'

# build a clean table (make a dict of rejected OTUs and check against it)
# Column 2 of the table holds the OTU seed names; rows whose seed is in
# the rejected set are dropped (the header line is never in the set)
awk -v otus="${TMP}" \
    'BEGIN {OFS = "\t"
            while (getline < otus) {
                bad_otus[$1] = 1
            }
            close(otus)
     }
     {if (! bad_otus[$2]) {
          print $0
      }
     }' "${TABLE}" > "${TABLE/.table/_cleaned.table}"

# how many OTUs remaining? (subtract the header line)
echo $(( $(wc -l < "${TABLE/.table/_cleaned.table}") - 1 ))

rm "${TMP}"

The discarded OTUs represent 3,466,330 reads (out of 50,118,536 clean protist reads).

  1. After the removal of the 2,231 OTUs, were there 26,861 OTUs remaining?

26,860 OTUs remaining.

  1. Given the removal of the 2,231 OTUs, what is the new taxonomic assignment figure? Please send this updated bar-chart figure.

(see here for the code)

  1. From this updated figure what is:
  2. % reads that match the Apicomplexa in the total dataset?
  3. % reads that match the Apicomplexa in each forest?
  4. % OTUs that match the Apicomplexa in the total dataset?
  5. % OTUs that match the Apicomplexa in each forest?

(see here for the code)

Forest Protist reads Apicomplexa reads % Protist OTUs Apicomplexa OTUs %
Barro 16232500 15035494 92.63 7065 4141 58.61
LaSelva 21515829 16250286 75.53 16935 7213 42.59
Tiputini 8903877 8072110 90.66 5217 3197 61.28
Total 46652206 39357890 84.36 26860 13578 50.55

(don't forget that the sum of OTUs per forest is always higher than the total number of OTUs. There are some shared OTUs)

  1. Given the removal of the 2,231 OTUs, please resend the updated table of the hyperdominant taxa. The last version stated: 70% of the protist reads from the soils derived from just 50 OTUs.

(see here for the code)

  1. Some samples were removed from Tara Oceans to match what was used in the paper. Given that removal:

What is the total number of observed OTUs in the total dataset?

What is the total number of observed OTUs in just the Atlantic?

What is the total number of observed OTUs in just the Pacific?

3.28 Hyperdominance assessment (without Lucas' unplaced OTUs)

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)

setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "neotropical_soil_175_samples.OTU.protists_cleaned.table"

## Load data
d <- read.table(input, sep = "\t", header = TRUE)

## Clean and compute the cumulative sum
all_reads <- sum(d$total)
ranks <- length(d$total)
d <- select(d, -matches("[TBL][0-9]"), -amplicon, -chimera, -identity, -taxonomy, -references) %>%
    mutate(cumulative = cumsum(d$total) / all_reads) %>%
    mutate(rank = seq(1, ranks)) %>%
    select(-OTU, -total) %>%
    slice(1:200)

glimpse(d)

## Plot
ggplot(d, aes(x = rank, y = cumulative)) +
    geom_line(colour = "darkred", size = 1) +
    scale_y_continuous(labels = percent, limits = c(0, 1)) +
    scale_x_continuous() +
    xlab("Number of OTUs") +
    ylab("percentage of observations") +
    theme_bw(base_size = 16)

## Output to PDF
output <- gsub(".table", ".hyperdominance.pdf", input, fixed = TRUE)
ggsave(file = output, width = 8 , height = 5)

quit(save="no")

Make a table

cd ~/neotropical_diversity/results/first_155_samples/

TABLE="neotropical_soil_175_samples.OTU.protists_cleaned.table"
MAX=50

# Return the column number of the first header field matching ${1}.
# NOTE(review): grep matches substrings, so the argument must not occur
# inside any other column name -- TODO confirm for total/identity/taxonomy.
function get_column_number() {
    head -n 1 "${TABLE}" | tr "\t" "\n" | nl -n ln | grep "${1}" | cut -d " " -f 1
}

TOTAL=$(get_column_number total)
IDENTITY=$(get_column_number identity)
TAXONOMY=$(get_column_number taxonomy)
# Total reads over all OTUs (the header contributes 0 to the sum)
GRAND_TOTAL=$(awk -v COLUMN=${TOTAL} 'BEGIN {FS = "\t"} {s += $COLUMN} END {print s}' "${TABLE}")

# Keep the MAX most abundant OTUs and compute their individual and
# cumulative percentages of reads; the final sed rewrites the header line
# (whose computed percentages are 0) into column labels
cut -f 1,${TOTAL},${IDENTITY},${TAXONOMY} "${TABLE}" | \
    head -n $(( ${MAX} + 1 )) | \
    awk -v GRAND_TOTAL=${GRAND_TOTAL} \
        'BEGIN {FS = OFS = "\t"}
         {perc = 100 * $2 / GRAND_TOTAL
          cum_perc += perc
          print $1, $2, perc, cum_perc, $3, $4}' | \
    sed 's/total\t0\t0/total\t%\tcum_%/' > "${TABLE/.table/.hyperdominance.csv}"

(only works when copy-pasted into a terminal)

Is each OTU evenly distributed?

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)

setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "neotropical_soil_175_samples.OTU.protists_cleaned.table"

## Load data
d <- read.table(input, sep = "\t", header = TRUE)

## Compute sample sizes (total reads per sample; sample columns match [TBL][0-9])
sum_all_reads <- sum(d$total)
ranks <- length(d$total)
sample_sizes <- select(d, matches("[TBL][0-9]")) %>%
    mutate(rank = seq(1, ranks)) %>%
    gather("samples", "reads", matches("[TBL][0-9]")) %>%
    group_by(samples) %>%
    summarize(sum = sum(reads))

## Sample names ordered by ascending total reads (x-axis order below)
ordered_samples <- arrange(sample_sizes, sum) %>%
    select(samples)

## Isolate the first (most abundant) OTU
OTU_first <- select(d, matches("[TBL][0-9]")) %>%
    slice(1:1) %>%
    gather("samples2", "reads", matches("[TBL][0-9]"))

sum_reads <- sum(OTU_first$reads)
glimpse(OTU_first)

## Join the two tables.
## NOTE(review): bind_cols pairs rows by POSITION; this assumes the
## alphabetical order produced by group_by/summarize matches the table's
## column order -- TODO confirm.
OTU_distribution <- bind_cols(sample_sizes, OTU_first) %>%
    select(-samples2) %>%
    mutate(ratio = reads / sum)

## Expected fraction of this OTU's reads under an even distribution
average <- sum_reads / sum_all_reads

## Plot the relative deviation from an even distribution.
## FIX: "samples" is a character column (gather's default), so the
## original levels(ordered_samples$samples) returned NULL and the sorted
## x-axis ordering was silently dropped; pass the ordered names directly.
ggplot(OTU_distribution, aes(x = samples, y = (ratio - average) / average)) +
    geom_point() +
    scale_y_continuous() +
    scale_x_discrete(limits = ordered_samples$samples,
                     breaks = NULL) +
    xlab("Samples (sorted by ascending number of reads)") +
    ylab("percentage of observations per sample") +
    theme_bw()

## Output to PDF
output <- gsub(".table", ".hyperdominance_distribution.pdf", input, fixed = TRUE)
ggsave(file = output, width = 8 , height = 5)

quit(save="no")

3.29 Group specific: Haptophyta

# kl: presumably a shell alias/login for the compute node -- TODO confirm
kl
cd ${HOME}/neotropical_diversity/data/

INPUT="neotropical_soil_175_samples.OTU.protists.table"
REPRESENTATIVES="${INPUT/.OTU.protists.table/_1f_representatives.fas}"
FASTA="${INPUT/.OTU.protists.table/_1f_representatives_haptophyta.fas}"

# Extract the representative sequences (header line + sequence line) of
# all OTUs assigned to Haptophyta; seed names are in column 2 of the
# table, and --no-group-separator suppresses the "--" lines grep -A adds
grep --no-group-separator \
     -A 1 -F \
     -f <(grep "Haptophyta" "${INPUT}" | cut -f 2) "${REPRESENTATIVES}" > "${FASTA}"

3.30 Seed vs crown vs radius plots (protists only) (multiplot with TARA, Swiss soils and BioMarKs)

Red-white-blue plate for the the tropical soils, Tara, BioMarKs V4, BioMarKs V9, Swiss soils. You have these in separate files.

Five plots on one plate:

  • tropical soils,
  • Tara,
  • BioMarKs V4,
  • BioMarKs V9,
  • Swiss soils

I made a first plate with free scales (all plots are independent).

library(ggplot2)
library(scales)
## FIX: use library() instead of require(): library() stops with an error
## if the package is missing, whereas require() only returns FALSE and
## lets the script fail later with a confusing message.
library(cowplot)

#----------------------------- Neotropical data -------------------------------#

setwd("~/neotropical_diversity/results/first_155_samples/")

## Study name (change here)
p1_input <- "neotropical_soil_175_samples_1.stats2_protists"
p1_title <- "Neotropical Soils (V4)"

## Load stats (one row per swarm OTU)
p1_stats <- read.table(p1_input, sep = "\t")

## Group data frames and name variables
colnames(p1_stats) <- c("size", "mass", "first_amplicon_id",
                        "first_amplicon_abundance", "singletons",
                        "radius", "steps", "similarity")

## Plot thresholds: drop OTUs with seed abundance < 11 or fewer than 2 amplicons
p1_min_abundance <- 11
p1_min_size <- 2
p1_max_abundance <- max(p1_stats$first_amplicon_abundance)
p1_max_size <- max(p1_stats$size)

## Eliminate small swarms
p1_reduced_stats <- subset.data.frame(p1_stats,
                                      p1_stats$size >= p1_min_size &
                                          p1_stats$first_amplicon_abundance >= p1_min_abundance)

## The gradient values are always expressed on a 0 to 1 scale . My own
## values (low, high) have to be converted as such: y = (x - low) /
## (high - low); where x is my value and y is the equivalent in the 0
## to 1 range.
## Compute the position of the 90% and 97% color limits
p1_low <- min(p1_reduced_stats$similarity) ## 76.9 in TARA V9 908
p1_high <- max(p1_reduced_stats$similarity) ## 99.4 in TARA V9 908
p1_ninety <- (90.0 - p1_low) / (p1_high - p1_low)
p1_ninetyseven <- (97.0 - p1_low) / (p1_high - p1_low)

## Plot: OTU size vs seed abundance (log-log), coloured red-white-blue
## by similarity to the reference database (90% and 97% pivots)
p1 <- ggplot(p1_reduced_stats,
             aes(x = first_amplicon_abundance,
                 y = size,
                 colour = similarity / 100)) +
      geom_point(shape = 21) +
      labs(title = p1_title) +
      scale_x_log10(name = "abundance of central amplicon",
                    breaks = trans_breaks("log10", function(x) 10^x),
                    labels = trans_format("log10", math_format(10^.x)),
                    limits = c(p1_min_abundance, p1_max_abundance)) +
      scale_y_log10(name = "number of amplicons in the OTU",
                    breaks = trans_breaks("log10", function(x) 10^x),
                    labels = trans_format("log10", math_format(10^.x)),
                    limits = c(p1_min_size, p1_max_size)) +
      scale_colour_gradientn(name = "similarity (%)",
                             colours = c("darkred", "red", "white", "dodgerblue4"),
                             values = c(0, p1_ninety, p1_ninetyseven, 1),
                             breaks = c(0.80, 0.90, 0.97, 1),
                             labels = c("80", "90", "97", "100")) +
      theme(legend.justification = c(1, 0),
            legend.position = c(1, 0),
            legend.background = element_rect(colour = "grey"))

#----------------------------- TARA data --------------------------------------#
## The three blocks below (p2, p3, p4) repeat the p1 recipe for the other
## datasets: load stats, filter small swarms, rescale the 90%/97% colour
## pivots to this dataset's similarity range, and draw the log-log plot.

setwd("~/Science/Projects/TARA/results/Swarms/")

## Study name (change here)
p2_input <- "TARA_V9_370_samples_1.stats2_protists"
p2_title <- "TARA V9 (V9)"

## Load stats
p2_stats <- read.table(p2_input, sep = "\t")

## Group data frames and name variables
colnames(p2_stats) <- c("size", "mass", "first_amplicon_id",
                        "first_amplicon_abundance", "singletons",
                        "radius", "steps", "similarity")

## Variable
p2_min_abundance <- 11
p2_min_size <- 2
p2_max_abundance <-  max(p2_stats$first_amplicon_abundance)
p2_max_size <- max(p2_stats$size)

## Eliminate small swarms
p2_reduced_stats <- subset.data.frame(p2_stats,
                                      p2_stats$size >= p2_min_size &
                                          p2_stats$first_amplicon_abundance >= p2_min_abundance)

## The gradient values are always expressed on a 0 to 1 scale . My own
## values (low, high) have to be converted as such: y = (x - low) /
## (high - low); where x is my value and y is the equivalent in the 0
## to 1 range.
## Compute the position of the 90% and 97% color limits
p2_low <- min(p2_reduced_stats$similarity)
p2_high <- max(p2_reduced_stats$similarity)
p2_ninety <- (90 - p2_low) / (p2_high - p2_low)
p2_ninetyseven <- (97 - p2_low) / (p2_high - p2_low)

## Plot
p2 <- ggplot(p2_reduced_stats,
             aes(x = first_amplicon_abundance,
                 y = size,
                 colour = similarity / 100)) +
      geom_point(shape = 21) +
      labs(title = p2_title) +
      scale_x_log10(name = "abundance of central amplicon",
                    breaks = trans_breaks("log10", function(x) 10^x),
                    labels = trans_format("log10", math_format(10^.x)),
                    limits = c(p2_min_abundance, p2_max_abundance)) +
      scale_y_log10(name = "number of amplicons in the OTU",
                    breaks = trans_breaks("log10", function(x) 10^x),
                    labels = trans_format("log10", math_format(10^.x)),
                    limits = c(p2_min_size, p2_max_size)) +
      scale_colour_gradientn(name = "similarity (%)",
                             colours = c("darkred", "red", "white", "dodgerblue4"),
                             values = c(0, p2_ninety, p2_ninetyseven, 1),
                             breaks = c(0.80, 0.90, 0.97, 1),
                             labels = c("80", "90", "97", "100")) +
      theme(legend.justification = c(1, 0),
            legend.position = c(1, 0),
            legend.background = element_rect(colour = "grey"))

#----------------------------- Swiss data -------------------------------------#

setwd("~/Science/Projects/Swiss_forests/data/")

## Study name (change here)
p3_input <- "swiss_forests_V9_29_samples_1.stats2_protists"
p3_title <- "Swiss Soils (V9)"

## Load stats
p3_stats <- read.table(p3_input, sep = "\t")

## Group data frames and name variables
colnames(p3_stats) <- c("size", "mass", "first_amplicon_id",
                        "first_amplicon_abundance", "singletons",
                        "radius", "steps", "similarity")

## Variables
p3_min_abundance <- 11
p3_min_size <- 2
p3_max_abundance <- max(p3_stats$first_amplicon_abundance)
p3_max_size <- max(p3_stats$size)

## Eliminate small swarms
p3_reduced_stats <- subset.data.frame(p3_stats,
                                      p3_stats$size >= p3_min_size &
                                          p3_stats$first_amplicon_abundance >= p3_min_abundance)

## The gradient values are always expressed on a 0 to 1 scale . My own
## values (low, high) have to be converted as such: y = (x - low) /
## (high - low); where x is my value and y is the equivalent in the 0
## to 1 range.
## Compute the position of the 90% and 97% color limits
p3_low <- min(p3_reduced_stats$similarity)
p3_high <- max(p3_reduced_stats$similarity)
p3_ninety <- (90 - p3_low) / (p3_high - p3_low)
p3_ninetyseven <- (97 - p3_low) / (p3_high - p3_low)

## Plot
p3 <- ggplot(p3_reduced_stats,
       aes(x = first_amplicon_abundance,
           y = size,
           colour = similarity / 100)) +
    geom_point(shape = 21) +
    labs(title = p3_title) +
    scale_x_log10(name = "abundance of central amplicon",
                  breaks = trans_breaks("log10", function(x) 10^x),
                  labels = trans_format("log10", math_format(10^.x)),
                  limits = c(p3_min_abundance, p3_max_abundance)) +
    scale_y_log10(name = "number of amplicons in the OTU",
                  breaks = trans_breaks("log10", function(x) 10^x),
                  labels = trans_format("log10", math_format(10^.x)),
                  limits = c(p3_min_size, p3_max_size)) +
    scale_colour_gradientn(name = "similarity (%)",
                           colours = c("darkred", "red", "white", "dodgerblue4"),
                           values = c(0, p3_ninety, p3_ninetyseven, 1),
                           breaks = c(0.80, 0.90, 0.97, 1),
                           labels = c("80", "90", "97", "100")) +
      theme(legend.justification = c(1, 0),
            legend.position = c(1, 0),
            legend.background = element_rect(colour = "grey"))

#----------------------------- BioMarKs V4 ------------------------------------#

setwd("~/Science/Projects/BioMarks/results/")

## Study name (change here)
p4_input <- "biomarks_v4_illumina_1.stats2_protists"
p4_title <- "BioMarKs (V4)"

## Load stats
p4_stats <- read.table(p4_input, sep = "\t")

## Group data frames and name variables
colnames(p4_stats) <- c("size", "mass", "first_amplicon_id",
                        "first_amplicon_abundance", "singletons",
                        "radius", "steps", "similarity")

## Variables
p4_min_abundance <- 11
p4_min_size <- 2
p4_max_abundance <- max(p4_stats$first_amplicon_abundance)
p4_max_size <- max(p4_stats$size)

## Eliminate small swarms
p4_reduced_stats <- subset.data.frame(p4_stats,
                                      p4_stats$size >= p4_min_size &
                                          p4_stats$first_amplicon_abundance >= p4_min_abundance)

## The gradient values are always expressed on a 0 to 1 scale . My own
## values (low, high) have to be converted as such: y = (x - low) /
## (high - low); where x is my value and y is the equivalent in the 0
## to 1 range.
## Compute the position of the 90% and 97% color limits
p4_low <- min(p4_reduced_stats$similarity)
p4_high <- max(p4_reduced_stats$similarity)
p4_ninety <- (90 - p4_low) / (p4_high - p4_low)
p4_ninetyseven <- (97 - p4_low) / (p4_high - p4_low)

## Plot
p4 <- ggplot(p4_reduced_stats,
             aes(x = first_amplicon_abundance,
                 y = size,
                 colour = similarity / 100)) +
    geom_point(shape = 21) +
    labs(title = p4_title) +
    scale_x_log10(name = "abundance of central amplicon",
                  breaks = trans_breaks("log10", function(x) 10^x),
                  labels = trans_format("log10", math_format(10^.x)),
                  limits = c(p4_min_abundance, p4_max_abundance)) +
    scale_y_log10(name = "number of amplicons in the OTU",
                  breaks = trans_breaks("log10", function(x) 10^x),
                  labels = trans_format("log10", math_format(10^.x)),
                  limits = c(p4_min_size, p4_max_size)) +
    scale_colour_gradientn(name = "similarity (%)",
                           colours = c("darkred", "red", "white", "dodgerblue4"),
                           values = c(0, p4_ninety, p4_ninetyseven, 1),
                           breaks = c(0.80, 0.90, 0.97, 1),
                           labels = c("80", "90", "97", "100")) +
      theme(legend.justification = c(1, 0),
            legend.position = c(1, 0),
            legend.background = element_rect(colour = "grey"))

#----------------------------- BioMarKs V9 ------------------------------------#

setwd("~/Science/Projects/BioMarks/results")

## Input file and panel title (change here)
p5_input <- "biomarks_v9_illumina_1.stats2_protists"
p5_title <- "BioMarKs (V9)"

## Load the per-OTU statistics table and name its columns
p5_stats <- read.table(p5_input, sep = "\t")
colnames(p5_stats) <- c("size", "mass", "first_amplicon_id",
                        "first_amplicon_abundance", "singletons",
                        "radius", "steps", "similarity")

## Filtering thresholds and axis limits
p5_min_abundance <- 11
p5_min_size <- 2
p5_max_abundance <- max(p5_stats$first_amplicon_abundance)
p5_max_size <- max(p5_stats$size)

## Discard small, low-abundance swarms
p5_reduced_stats <- subset(p5_stats,
                           size >= p5_min_size &
                               first_amplicon_abundance >= p5_min_abundance)

## Map the 90% and 97% similarity thresholds onto the 0-1 scale that
## scale_colour_gradientn() expects: y = (x - low) / (high - low)
p5_low <- min(p5_reduced_stats$similarity)
p5_high <- max(p5_reduced_stats$similarity)
p5_range <- p5_high - p5_low
p5_ninety <- (90 - p5_low) / p5_range
p5_ninetyseven <- (97 - p5_low) / p5_range

## Scatter plot: central-amplicon abundance (x, log) vs. OTU size (y,
## log), points coloured by within-OTU similarity
p5 <- ggplot(p5_reduced_stats,
             aes(x = first_amplicon_abundance,
                 y = size,
                 colour = similarity / 100)) +
    geom_point(shape = 21) +
    scale_colour_gradientn(name = "similarity (%)",
                           colours = c("darkred", "red", "white", "dodgerblue4"),
                           values = c(0, p5_ninety, p5_ninetyseven, 1),
                           breaks = c(0.80, 0.90, 0.97, 1),
                           labels = c("80", "90", "97", "100")) +
    scale_y_log10(name = "number of amplicons in the OTU",
                  breaks = trans_breaks("log10", function(x) 10^x),
                  labels = trans_format("log10", math_format(10^.x)),
                  limits = c(p5_min_size, p5_max_size)) +
    scale_x_log10(name = "abundance of central amplicon",
                  breaks = trans_breaks("log10", function(x) 10^x),
                  labels = trans_format("log10", math_format(10^.x)),
                  limits = c(p5_min_abundance, p5_max_abundance)) +
    labs(title = p5_title) +
    theme(legend.position = c(1, 0),
          legend.justification = c(1, 0),
          legend.background = element_rect(colour = "grey"))

#----------------------------- Create PDF -------------------------------------#

## Assemble the five panels into a single plate and write it to disk
setwd("~/neotropical_diversity/results/first_155_samples/")
output_file <- "seed_vs_crown_vs_radius_free_scales.pdf"
theme_set(theme_bw())

## Panel layout: A = Neotrop V4 (p1), B = TARA V9 (p2), C = BioMarKs V4
## (p4), D = BioMarKs V9 (p5), E = Swiss V9 (p3); one slot left empty
plate <- plot_grid(p1, p2, p4, p5, NULL, p3,
                   labels = c("A", "B", "C", "D", " ", "E"),
                   ncol = 2, nrow = 3, align = "hv")
save_plot(output_file, plate,
          ncol = 2, nrow = 3,
          base_width = 6)

quit(save = "no")

I make a second plate with synchronized scales, to show the effect of sequencing depth increase (OTU radii grow from blue to white to red).

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(cowplot)  # library() (not require()) so a missing package fails loudly

## Build a minimal two-row stats table whose only purpose is to pin the
## similarity colour scale of every panel to the same range, making
## colours comparable across plots.  The two rows carry the scale's
## lower and upper similarity bounds; all other columns are dummies.
make_fake_stats <- function(similarity) {
    data.frame(size = c(11L, 11L),
               mass = c(11L, 11L),
               first_amplicon_id = c("a", "b"),
               first_amplicon_abundance = c(11L, 11L),
               singletons = c(11L, 11L),
               radius = c(11L, 11L),
               steps = c(11L, 11L),
               similarity = similarity)
}

## V9 panels share an 85-100% range, V4 panels a 96-100% range
fake_v9 <- make_fake_stats(c(85.0, 100.0))
fake_v4 <- make_fake_stats(c(96.0, 100.0))

#----------------------------- Neotropical data -------------------------------#

setwd("~/neotropical_diversity/results/first_155_samples/")

## Input file and panel title (change here)
p1_input <- "neotropical_soil_175_samples_1.stats2_protists"
p1_title <- "Neotropical Soils (V4)"

## Load the per-OTU statistics and drop incomplete rows
p1_stats <- read.table(p1_input, sep = "\t")
p1_stats <- na.omit(p1_stats)

## Name the columns
colnames(p1_stats) <- c("size", "mass", "first_amplicon_id",
                        "first_amplicon_abundance", "singletons",
                        "radius", "steps", "similarity")

## Filtering thresholds and axis limits
p1_min_abundance <- 11
p1_min_size <- 2
p1_max_abundance <- max(p1_stats$first_amplicon_abundance)
p1_max_size <- max(p1_stats$size)

## Eliminate small swarms
p1_reduced_stats <- subset.data.frame(p1_stats,
                                      p1_stats$size >= p1_min_size &
                                          p1_stats$first_amplicon_abundance >= p1_min_abundance)

## Prepend the fake rows that pin the colour scale (V4 panels share the
## 96-100% similarity range)
p1_reduced_stats <- rbind(fake_v4, p1_reduced_stats)

## The gradient values are always expressed on a 0 to 1 scale.  My own
## values (low, high) have to be converted as such: y = (x - low) /
## (high - low); where x is my value and y is the equivalent in the 0
## to 1 range.
## Compute the position of the 90% and 97% color limits
p1_low <- min(p1_reduced_stats$similarity)
p1_high <- max(p1_reduced_stats$similarity)
p1_ninety <- (90.0 - p1_low) / (p1_high - p1_low)
p1_ninetyseven <- (97.0 - p1_low) / (p1_high - p1_low)

## Plot: central-amplicon abundance (x, log) vs. OTU size (y, log),
## points coloured by within-OTU similarity
p1 <- ggplot(p1_reduced_stats,
             aes(x = first_amplicon_abundance,
                 y = size,
                 colour = similarity / 100)) +
      geom_point(shape = 21) +
      labs(title = p1_title) +
      scale_x_log10(name = "abundance of central amplicon",
                    breaks = trans_breaks("log10", function(x) 10^x),
                    labels = trans_format("log10", math_format(10^.x)),
                    limits = c(p1_min_abundance, p1_max_abundance)) +
      scale_y_log10(name = "number of amplicons in the OTU",
                    breaks = trans_breaks("log10", function(x) 10^x),
                    labels = trans_format("log10", math_format(10^.x)),
                    limits = c(p1_min_size, p1_max_size)) +
      scale_colour_gradientn(name = "similarity (%)",
                             colours = c("darkred", "red", "white", "dodgerblue4"),
                             values = c(0, p1_ninety, p1_ninetyseven, 1),
                             breaks = c(0.80, 0.90, 0.97, 1),
                             labels = c("80", "90", "97", "100")) +
      theme(legend.justification = c(1, 0),
            legend.position = c(1, 0),
            legend.background = element_rect(colour = "grey"))

#----------------------------- TARA data --------------------------------------#

setwd("~/Science/Projects/TARA/results/Swarms/")

## Input file and panel title (change here).  Title follows the same
## "project (marker)" pattern as the other panels; the original
## "TARA V9 (V9)" duplicated the marker.
p2_input <- "TARA_V9_370_samples_1.stats2_protists"
p2_title <- "TARA Oceans (V9)"

## Load the per-OTU statistics
p2_stats <- read.table(p2_input, sep = "\t")

## Name the columns
colnames(p2_stats) <- c("size", "mass", "first_amplicon_id",
                        "first_amplicon_abundance", "singletons",
                        "radius", "steps", "similarity")

## Filtering thresholds and axis limits
p2_min_abundance <- 11
p2_min_size <- 2
p2_max_abundance <-  max(p2_stats$first_amplicon_abundance)
p2_max_size <- max(p2_stats$size)

## Eliminate small swarms
p2_reduced_stats <- subset.data.frame(p2_stats,
                                      p2_stats$size >= p2_min_size &
                                          p2_stats$first_amplicon_abundance >= p2_min_abundance)

## Prepend the fake rows that pin the colour scale (V9 panels share the
## 85-100% similarity range)
p2_reduced_stats <- rbind(fake_v9, p2_reduced_stats)

## The gradient values are always expressed on a 0 to 1 scale.  My own
## values (low, high) have to be converted as such: y = (x - low) /
## (high - low); where x is my value and y is the equivalent in the 0
## to 1 range.
## Compute the position of the 90% and 97% color limits
p2_low <- min(p2_reduced_stats$similarity)
p2_high <- max(p2_reduced_stats$similarity)
p2_ninety <- (90 - p2_low) / (p2_high - p2_low)
p2_ninetyseven <- (97 - p2_low) / (p2_high - p2_low)

## Plot: central-amplicon abundance (x, log) vs. OTU size (y, log),
## points coloured by within-OTU similarity
p2 <- ggplot(p2_reduced_stats,
             aes(x = first_amplicon_abundance,
                 y = size,
                 colour = similarity / 100)) +
      geom_point(shape = 21) +
      labs(title = p2_title) +
      scale_x_log10(name = "abundance of central amplicon",
                    breaks = trans_breaks("log10", function(x) 10^x),
                    labels = trans_format("log10", math_format(10^.x)),
                    limits = c(p2_min_abundance, p2_max_abundance)) +
      scale_y_log10(name = "number of amplicons in the OTU",
                    breaks = trans_breaks("log10", function(x) 10^x),
                    labels = trans_format("log10", math_format(10^.x)),
                    limits = c(p2_min_size, p2_max_size)) +
      scale_colour_gradientn(name = "similarity (%)",
                             colours = c("darkred", "red", "white", "dodgerblue4"),
                             values = c(0, p2_ninety, p2_ninetyseven, 1),
                             breaks = c(0.80, 0.90, 0.97, 1),
                             labels = c("80", "90", "97", "100")) +
      theme(legend.justification = c(1, 0),
            legend.position = c(1, 0),
            legend.background = element_rect(colour = "grey"))

#----------------------------- Swiss data -------------------------------------#

setwd("~/Science/Projects/Swiss_forests/data/")

## Input file and panel title (change here)
p3_input <- "swiss_forests_V9_29_samples_1.stats2_protists"
p3_title <- "Swiss Soils (V9)"

## Load the per-OTU statistics table and name its columns
p3_stats <- read.table(p3_input, sep = "\t")
colnames(p3_stats) <- c("size", "mass", "first_amplicon_id",
                        "first_amplicon_abundance", "singletons",
                        "radius", "steps", "similarity")

## Filtering thresholds; axis limits are synced to the TARA maxima so
## this panel shares the TARA V9 axes
p3_min_abundance <- 11
p3_min_size <- 2
p3_max_abundance <- max(p2_stats$first_amplicon_abundance)  ## max of TARA
p3_max_size <- max(p2_stats$size)  ## max of TARA

## Discard small, low-abundance swarms
p3_reduced_stats <- subset(p3_stats,
                           size >= p3_min_size &
                               first_amplicon_abundance >= p3_min_abundance)

## Prepend the fake rows that pin the V9 colour scale (85-100%)
p3_reduced_stats <- rbind(fake_v9, p3_reduced_stats)

## Map the 90% and 97% similarity thresholds onto the 0-1 scale that
## scale_colour_gradientn() expects: y = (x - low) / (high - low)
p3_low <- min(p3_reduced_stats$similarity)
p3_high <- max(p3_reduced_stats$similarity)
p3_range <- p3_high - p3_low
p3_ninety <- (90 - p3_low) / p3_range
p3_ninetyseven <- (97 - p3_low) / p3_range

## Scatter plot: central-amplicon abundance (x, log) vs. OTU size (y,
## log), points coloured by within-OTU similarity
p3 <- ggplot(p3_reduced_stats,
             aes(x = first_amplicon_abundance,
                 y = size,
                 colour = similarity / 100)) +
    geom_point(shape = 21) +
    scale_colour_gradientn(name = "similarity (%)",
                           colours = c("darkred", "red", "white", "dodgerblue4"),
                           values = c(0, p3_ninety, p3_ninetyseven, 1),
                           breaks = c(0.80, 0.90, 0.97, 1),
                           labels = c("80", "90", "97", "100")) +
    scale_y_log10(name = "number of amplicons in the OTU",
                  breaks = trans_breaks("log10", function(x) 10^x),
                  labels = trans_format("log10", math_format(10^.x)),
                  limits = c(p3_min_size, p3_max_size)) +
    scale_x_log10(name = "abundance of central amplicon",
                  breaks = trans_breaks("log10", function(x) 10^x),
                  labels = trans_format("log10", math_format(10^.x)),
                  limits = c(p3_min_abundance, p3_max_abundance)) +
    labs(title = p3_title) +
    theme(legend.position = c(1, 0),
          legend.justification = c(1, 0),
          legend.background = element_rect(colour = "grey"))

#----------------------------- BioMarKs V4 ------------------------------------#

setwd("~/Science/Projects/BioMarks/results/")

## Input file and panel title (change here)
p4_input <- "biomarks_v4_illumina_1.stats2_protists"
p4_title <- "BioMarKs (V4)"

## Load the per-OTU statistics table and name its columns
p4_stats <- read.table(p4_input, sep = "\t")
colnames(p4_stats) <- c("size", "mass", "first_amplicon_id",
                        "first_amplicon_abundance", "singletons",
                        "radius", "steps", "similarity")

## Filtering thresholds; axis limits are synced to the Neotropical
## maxima so this panel shares the Neotrop V4 axes
p4_min_abundance <- 11
p4_min_size <- 2
p4_max_abundance <- max(p1_stats$first_amplicon_abundance)  ## max of Neotrop
p4_max_size <- max(p1_stats$size)  ## max of Neotrop

## Discard small, low-abundance swarms
p4_reduced_stats <- subset(p4_stats,
                           size >= p4_min_size &
                               first_amplicon_abundance >= p4_min_abundance)

## Prepend the fake rows that pin the V4 colour scale (96-100%)
p4_reduced_stats <- rbind(fake_v4, p4_reduced_stats)

## Map the 90% and 97% similarity thresholds onto the 0-1 scale that
## scale_colour_gradientn() expects: y = (x - low) / (high - low)
p4_low <- min(p4_reduced_stats$similarity)
p4_high <- max(p4_reduced_stats$similarity)
p4_range <- p4_high - p4_low
p4_ninety <- (90 - p4_low) / p4_range
p4_ninetyseven <- (97 - p4_low) / p4_range

## Scatter plot: central-amplicon abundance (x, log) vs. OTU size (y,
## log), points coloured by within-OTU similarity
p4 <- ggplot(p4_reduced_stats,
             aes(x = first_amplicon_abundance,
                 y = size,
                 colour = similarity / 100)) +
    geom_point(shape = 21) +
    scale_colour_gradientn(name = "similarity (%)",
                           colours = c("darkred", "red", "white", "dodgerblue4"),
                           values = c(0, p4_ninety, p4_ninetyseven, 1),
                           breaks = c(0.80, 0.90, 0.97, 1),
                           labels = c("80", "90", "97", "100")) +
    scale_y_log10(name = "number of amplicons in the OTU",
                  breaks = trans_breaks("log10", function(x) 10^x),
                  labels = trans_format("log10", math_format(10^.x)),
                  limits = c(p4_min_size, p4_max_size)) +
    scale_x_log10(name = "abundance of central amplicon",
                  breaks = trans_breaks("log10", function(x) 10^x),
                  labels = trans_format("log10", math_format(10^.x)),
                  limits = c(p4_min_abundance, p4_max_abundance)) +
    labs(title = p4_title) +
    theme(legend.position = c(1, 0),
          legend.justification = c(1, 0),
          legend.background = element_rect(colour = "grey"))

#----------------------------- BioMarKs V9 ------------------------------------#

setwd("~/Science/Projects/BioMarks/results")

## Input file and panel title (change here)
p5_input <- "biomarks_v9_illumina_1.stats2_protists"
p5_title <- "BioMarKs (V9)"

## Load the per-OTU statistics table and name its columns
p5_stats <- read.table(p5_input, sep = "\t")
colnames(p5_stats) <- c("size", "mass", "first_amplicon_id",
                        "first_amplicon_abundance", "singletons",
                        "radius", "steps", "similarity")

## Filtering thresholds; axis limits are synced to the TARA maxima so
## this panel shares the TARA V9 axes
p5_min_abundance <- 11
p5_min_size <- 2
p5_max_abundance <- max(p2_stats$first_amplicon_abundance)  ## max of TARA
p5_max_size <- max(p2_stats$size)  ## max of TARA

## Discard small, low-abundance swarms
p5_reduced_stats <- subset(p5_stats,
                           size >= p5_min_size &
                               first_amplicon_abundance >= p5_min_abundance)

## Prepend the fake rows that pin the V9 colour scale (85-100%)
p5_reduced_stats <- rbind(fake_v9, p5_reduced_stats)

## Map the 90% and 97% similarity thresholds onto the 0-1 scale that
## scale_colour_gradientn() expects: y = (x - low) / (high - low)
p5_low <- min(p5_reduced_stats$similarity)
p5_high <- max(p5_reduced_stats$similarity)
p5_range <- p5_high - p5_low
p5_ninety <- (90 - p5_low) / p5_range
p5_ninetyseven <- (97 - p5_low) / p5_range

## Scatter plot: central-amplicon abundance (x, log) vs. OTU size (y,
## log), points coloured by within-OTU similarity
p5 <- ggplot(p5_reduced_stats,
             aes(x = first_amplicon_abundance,
                 y = size,
                 colour = similarity / 100)) +
    geom_point(shape = 21) +
    scale_colour_gradientn(name = "similarity (%)",
                           colours = c("darkred", "red", "white", "dodgerblue4"),
                           values = c(0, p5_ninety, p5_ninetyseven, 1),
                           breaks = c(0.80, 0.90, 0.97, 1),
                           labels = c("80", "90", "97", "100")) +
    scale_y_log10(name = "number of amplicons in the OTU",
                  breaks = trans_breaks("log10", function(x) 10^x),
                  labels = trans_format("log10", math_format(10^.x)),
                  limits = c(p5_min_size, p5_max_size)) +
    scale_x_log10(name = "abundance of central amplicon",
                  breaks = trans_breaks("log10", function(x) 10^x),
                  labels = trans_format("log10", math_format(10^.x)),
                  limits = c(p5_min_abundance, p5_max_abundance)) +
    labs(title = p5_title) +
    theme(legend.position = c(1, 0),
          legend.justification = c(1, 0),
          legend.background = element_rect(colour = "grey"))

#----------------------------- Create PDF -------------------------------------#

## Assemble the five synced-scale panels and write the plate to disk
setwd("~/neotropical_diversity/results/first_155_samples/")
output_file <- "seed_vs_crown_vs_radius_synced_scales.pdf"
theme_set(theme_gray())  # the free-scale plate used theme_bw() instead

## Panel layout: A = Neotrop V4 (p1), B = TARA V9 (p2), C = BioMarKs V4
## (p4), D = BioMarKs V9 (p5), E = Swiss V9 (p3); one slot left empty
plate <- plot_grid(p1, p2, p4, p5, NULL, p3,
                   labels = c("A", "B", "C", "D", " ", "E"),
                   ncol = 2, nrow = 3, align = "hv")
save_plot(output_file, plate,
          ncol = 2, nrow = 3,
          base_width = 6)

quit(save = "no")

3.31 How many Apicomplexa reads and OTUs in all projects?

For each of TARA, BioMarKs V4, BioMarKs V9, and Swiss soils V9, compute: the total number of cleaned reads, the percentage of Apicomplexa reads, and the percentage of Apicomplexa OTUs.

I already did that for the three neotropical forests.

library(dplyr)
library(tidyr)
library(vegan)

## Tally protist and Apicomplexa reads/OTUs for each project; one
## result row is appended per input table.
results <- data_frame()
setwd("~/neotropical_diversity/results/first_155_samples/")
targets <- list(c("biomarks_v9_illumina.OTU.protists.table", "BioMarKs V9"),
                c("biomarks_v4_illumina.OTU.protists.table", "BioMarKs V4"),
                c("swiss_forests_V9_29_samples.OTU.protists.table", "Swiss Soils V9"),
                c("neotropical_soil_175_samples.OTU.protists_cleaned.table", "Neotropical Soils V4"))

for (target in targets) {
    input <- target[1]
    project <- data_frame(name = target[2])

    ## Import the OTU table, keeping only read totals and taxonomy
    d <- read.table(input, sep = "\t", header = TRUE, dec = ".") %>%
        tbl_df() %>%
            select(total, taxonomy)

    ## Extract the clade from the pipe-separated taxonomy string (the
    ## Swiss V9 assignments carry extra leading ranks, hence field 5).
    ## vapply over strsplit avoids apply()'s matrix coercion; `[` still
    ## yields NA when a taxonomy string has fewer fields.
    field <- if (target[2] == "Swiss Soils V9") 5 else 3
    d$clade <- vapply(strsplit(as.character(d$taxonomy), "|", fixed = TRUE),
                      function(fields) fields[field], character(1))
    d <- select(d, -taxonomy)

    ## Tally protists (all rows)
    w <- summarise(d, protists_reads = sum(total))
    x <- summarise(d, protists_OTUs = n())

    ## Tally apicomplexans
    apicomplexa <- filter(d, clade == "Apicomplexa")
    y <- summarise(apicomplexa, apicomplexa_reads = sum(total))
    z <- summarise(apicomplexa, apicomplexa_OTUs = n())

    ## One result row per project, with percentages
    row <- bind_cols(project,
                     w, y, data_frame(perc_reads = 100 * y$apicomplexa_reads / w$protists_reads),
                     x, z, data_frame(perc_OTUs = 100 * z$apicomplexa_OTUs / x$protists_OTUs))
    results <- bind_rows(results, row)
}

#*****************************************************************************#
#                                                                             #
#                                   TARA V9                                   #
#                                                                             #
#*****************************************************************************#

## TARA is a special case: a set of samples must be excluded first,
## then the per-OTU read totals recomputed from the remaining samples

input <- "TARA_V9_370_samples.OTU.protists.table"
project <- data_frame(name = "TARA Oceans V9")

## Import the OTU table, keep OTU id, sample counts and taxonomy, and
## drop the excluded samples
d <- read.table(input, sep = "\t", header = TRUE, dec = ".") %>%
    tbl_df() %>%
    select(OTU, starts_with("ERR"), taxonomy) %>%
    select(-ERR562392, -ERR562720, -ERR562389, -ERR562572,
           -ERR562599, -ERR562631, -ERR562632, -ERR562635,
           -ERR562653, -ERR562663, -ERR562685, -ERR562689,
           -ERR562690, -ERR562703, -ERR562725)

## Clade = third field of the pipe-separated taxonomy string
field <- 3
d$clade <- apply(d["taxonomy"], 1, function(x) strsplit(x, "|", fixed = TRUE)[[1]][field])

## Long format, one row per OTU x sample; zero counts removed.  The two
## last columns (taxonomy, clade) are not gathered.
end <- ncol(d) - 2
d <- gather(d, "samples", "n", 2:end) %>%
    select(-taxonomy, -samples) %>%
    filter(n != "0")

## Recompute per-OTU read totals from the retained samples
d <- group_by(d, clade, OTU) %>%
    tally(wt = n, sort = FALSE) %>%
    rename(total = n) %>%
    ungroup() %>%
    select(-OTU)

## Tally protists (all rows) and apicomplexans
protist_reads <- summarise(d, protists_reads = sum(total))
protist_otus <- summarise(d, protists_OTUs = n())
apicomplexa <- filter(d, clade == "Apicomplexa")
api_reads <- summarise(apicomplexa, apicomplexa_reads = sum(total))
api_otus <- summarise(apicomplexa, apicomplexa_OTUs = n())

## Append the TARA row to the results table
row <- bind_cols(project,
                 protist_reads, api_reads,
                 data_frame(perc_reads = 100 * api_reads$apicomplexa_reads / protist_reads$protists_reads),
                 protist_otus, api_otus,
                 data_frame(perc_OTUs = 100 * api_otus$apicomplexa_OTUs / protist_otus$protists_OTUs))
results <- bind_rows(results, row)

## Output
print(results)
write.csv(results, file = "tmp.csv")
name p_reads a_reads perc_reads p_OTUs a_OTUs perc_OTUs
BioMarKs V9 49794719 623728 1.252599 50161 932 1.858017
BioMarKs V4 6092106 76796 1.260582 106013 1624 1.531888
Swiss Soils V9 9445114 176711 1.870925 58732 1218 2.073827
Neotropical Soils V4 46652206 39357890 84.364478 26860 13578 50.551005
TARA Oceans V9 367757099 5566151 1.513540 302663 7735 2.555648

3.32 Compare with TARA, BioMarKs and Swiss soils

library(cowplot)
library(ggplot2)
library(tidyr)
library(dplyr)
library(scales)
library(grid)
library(gridExtra)

#*****************************************************************************#
#                                                                             #
#                          Neotropical Soils (V4)                             #
#                                                                             #
#*****************************************************************************#

## Load data
setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "neotropical_soil_175_samples.OTU.protists_cleaned.table"

## Import the OTU table, keeping only read totals and taxonomy
d <- read.table(input, sep = "\t", header = TRUE, dec = ".") %>%
    tbl_df() %>%
    select(total, taxonomy)

## Clade = third field of the pipe-separated taxonomy string; relabel
## the unclassified stramenopiles
d$clade <- apply(d["taxonomy"], 1, function(x) strsplit(x, "|", fixed = TRUE)[[1]][3])
d$clade[d$clade == "Stramenopiles_X"] <- "non-Ochrophyta Stramenopiles"

## Drop empty OTUs and group by clade
d <- d %>%
    select(-taxonomy) %>%
    group_by(clade) %>%
    filter(total != "0")

## Read counts per clade
neotrop <- tally(d, wt = total, sort = FALSE) %>%
    rename(abundance = n) %>%
    mutate(project = "Neotropical Forest Soils")

## OTU counts per clade
neotrop_OTUs <- tally(d, sort = TRUE) %>%
    rename(abundance = n) %>%
    mutate(project = "Neotropical Forest Soils")

#*****************************************************************************#
#                                                                             #
#                                   TARA V9                                   #
#                                                                             #
#*****************************************************************************#

## Load data
setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "TARA_V9_370_samples.OTU.protists.table"

## Import the OTU table and drop the excluded samples
d <- read.table(input, sep = "\t", header = TRUE, dec = ".") %>%
    tbl_df() %>%
    select(OTU, starts_with("ERR"), taxonomy) %>%
    select(-ERR562392, -ERR562720, -ERR562389, -ERR562572,
           -ERR562599, -ERR562631, -ERR562632, -ERR562635,
           -ERR562653, -ERR562663, -ERR562685, -ERR562689,
           -ERR562690, -ERR562703, -ERR562725)

## Clade = third field of the pipe-separated taxonomy string
d$clade <- apply(d["taxonomy"], 1, function(x) strsplit(x, "|", fixed = TRUE)[[1]][3])

## Long format, one row per OTU x sample; zero counts removed.  The
## last two columns (taxonomy, clade) are not gathered.
end <- ncol(d) - 2
d <- gather(d, "samples", "n", 2:end) %>%
    select(-taxonomy, -samples) %>%
    filter(n != "0")

## Recompute per-OTU read totals from the retained samples
d <- group_by(d, clade, OTU) %>%
    tally(wt = n, sort = FALSE) %>%
    rename(total = n)

## Read counts per clade
tara_V9 <- tally(d, wt = total) %>%
    rename(abundance = n) %>%
    arrange(clade) %>%
    mutate(project = "TARA Oceans")

## OTU counts per clade
tara_V9_OTUs <- tally(d) %>%
    rename(abundance = n) %>%
    arrange(clade) %>%
    mutate(project = "TARA Oceans")


#*****************************************************************************#
#                                                                             #
#                                 BioMarKs V9                                 #
#                                                                             #
#*****************************************************************************#

setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "biomarks_v9_illumina.OTU.protists.table"

## Import and format data
d <- read.table(input, sep = "\t", header = TRUE, dec=".") %>%
    tbl_df() %>%
    select(total, taxonomy)

## Extract the third field from the "taxonomy" and store in a new column
d$clade <- apply(d["taxonomy"], 1 , function(x) strsplit(x, "|", fixed = TRUE)[[1]][3])

## Group by clade
d <- select(d, -taxonomy) %>%
    group_by(clade) %>%
    filter(total != "0")

## Sum reads
biomarks_V9 <- tally(d, wt = total, sort = FALSE) %>%
    rename(abundance = n) %>%
    mutate(project = "BioMarKs (V9)")

## Tally OTUs
biomarks_V9_OTUs <- tally(d, sort = TRUE) %>%
    rename(abundance = n) %>%
    mutate(project = "BioMarKs (V9)")


#*****************************************************************************#
#                                                                             #
#                                 BioMarKs V4                                 #
#                                                                             #
#*****************************************************************************#

setwd("~/neotropical_diversity/results/first_155_samples/")
input <- "biomarks_v4_illumina.OTU.protists.table"

## Import the OTU table, keeping only read totals and taxonomy
d <- read.table(input, sep = "\t", header = TRUE, dec = ".") %>%
    tbl_df() %>%
    select(total, taxonomy)

## Clade = third field of the pipe-separated taxonomy string; relabel
## the unclassified stramenopiles
d$clade <- apply(d["taxonomy"], 1, function(x) strsplit(x, "|", fixed = TRUE)[[1]][3])
d$clade[d$clade == "Stramenopiles_X"] <- "non-Ochrophyta Stramenopiles"

## Drop empty OTUs and group by clade
d <- d %>%
    select(-taxonomy) %>%
    group_by(clade) %>%
    filter(total != "0")

## Read counts per clade
biomarks_V4 <- tally(d, wt = total, sort = FALSE) %>%
    rename(abundance = n) %>%
    mutate(project = "BioMarKs (V4)")

## OTU counts per clade
biomarks_V4_OTUs <- tally(d, sort = TRUE) %>%
    rename(abundance = n) %>%
    mutate(project = "BioMarKs (V4)")

## Clean up the per-project temporaries
rm("d", "input")


#*****************************************************************************#
#                                                                             #
#                                    Merge                                    #
#                                                                             #
#*****************************************************************************#

## Stack the per-project clade tallies (reads and OTUs)
projects <- bind_rows(neotrop, tara_V9, biomarks_V4, biomarks_V9)
projects_OTUs <- bind_rows(neotrop_OTUs, tara_V9_OTUs,
                           biomarks_V4_OTUs, biomarks_V9_OTUs)

## Drop chimeras and give ambiguous clade labels readable names.
## NOTE(review): "Stramenopiles_X" is relabelled "Ochrophyta" here,
## whereas the per-project code above relabelled it "non-Ochrophyta
## Stramenopiles" — confirm this residual mapping is intended.
relabel_clades <- function(x) {
    x <- filter(x, clade != "Chimera")
    x$clade[x$clade == "*"] <- "Unknown"
    x$clade[x$clade == "Alveolata_X"] <- "Alveolata incertae sedis"
    x$clade[x$clade == "Amoebozoa_X"] <- "Amoebozoa incertae sedis"
    x$clade[x$clade == "Stramenopiles_X"] <- "Ochrophyta"
    x
}
projects <- relabel_clades(projects)
projects_OTUs <- relabel_clades(projects_OTUs)

## Keep only clades that represent more than 0.5% of the grand total
main_clades <- function(x, grand_total) {
    x %>%
        group_by(clade) %>%
        tally(wt = abundance, sort = TRUE) %>%
        mutate(percentage = 100 * n / grand_total) %>%
        filter(percentage > 0.5) %>%
        select(-n, -percentage)
}

## Clades with significant read abundances
total_abundance <- sum(projects$abundance)
main_taxa <- main_clades(projects, total_abundance)

## Clades with significant OTU richness
total_richness <- sum(projects_OTUs$abundance)
main_taxa_OTUs <- main_clades(projects_OTUs, total_richness)

## All rows in projects that have a match in main_taxa
projects <- semi_join(projects, main_taxa, by = "clade")
projects_OTUs <- semi_join(projects_OTUs, main_taxa_OTUs, by = "clade")

## Clade order used for the plot legends
taxa_order_reads <- select(projects, clade) %>% distinct()
taxa_order_OTUs <- select(projects_OTUs, clade) %>% distinct()

#*****************************************************************************#
#                                                                             #
#                                Read Plot                                    #
#                                                                             #
#*****************************************************************************#

## Project display order (reversed on the axis because of coord_flip)
read_plot_projects <- c("Neotropical Forest Soils", "TARA Oceans",
                        "BioMarKs (V4)", "BioMarKs (V9)")

## Shared plot code for the read-abundance barplot
p0 <- ggplot(projects, aes(x = project, y = abundance, fill = clade)) +
    ylab("number of observed reads") +
    scale_fill_discrete(breaks = taxa_order_reads$clade,
                        name = "clade                                   ") +
    scale_x_discrete(limits = rev(read_plot_projects),
                     breaks = read_plot_projects,
                     labels = read_plot_projects) +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.y = element_blank(),
          legend.title = element_blank(),
          legend.text = element_text(size = 8))

## Stacked percentage barplot (one bar per project)
p1 <- p0 +
    geom_bar(stat = "identity", position = "fill") +
    scale_y_continuous(labels = percent)


#*****************************************************************************#
#                                                                             #
#                                 OTU Plots                                   #
#                                                                             #
#*****************************************************************************#

## Shared plot skeleton (OTUs): one horizontal bar per sequencing
## project, filled by taxonomic clade; taxa_order_OTUs fixes the clade
## order in the legend.  The project labels are factored out so the
## three scale_x_discrete arguments stay in sync.
otu_panel_order <- c("Neotropical Forest Soils", "TARA Oceans",
                     "BioMarKs (V4)", "BioMarKs (V9)")

p3 <- ggplot(projects_OTUs, aes(x = project, y = abundance, fill = clade)) +
    ylab("number of observed OTUs") +
    scale_fill_discrete(breaks = taxa_order_OTUs$clade, name = "clade") +
    scale_x_discrete(limits = rev(otu_panel_order),
                     breaks = otu_panel_order,
                     labels = otu_panel_order) +
    coord_flip() +
    theme_bw() +
    theme(axis.text = element_text(size = 11),
          axis.title.y = element_blank(),
          legend.title = element_blank(),
          legend.text = element_text(size = 8))

## Stacked percentage barplot built on the shared skeleton.
p4 <- p3 +
    geom_bar(stat = "identity", position = "fill") +
    scale_y_continuous(labels = percent)


#*****************************************************************************#
#                                                                             #
#                      Neotropical Ciliate-specific runs                      #
#                                                                             #
#*****************************************************************************#

## Import the ciliate-specific run's OTU contingency table (one row per
## OTU; per-sample read-count columns plus "taxonomy" and "total").
setwd("~/neotropical_diversity/results/stampa/")
input <- "neotropical_soil_20_samples_454_ciliate.OTU.protists.table"
d <- read.table(input, sep = "\t", header = TRUE, dec=".") %>% tbl_df()

## Per-forest read counts: sample columns are prefixed with the forest
## code (B = Barro Colorado, L = La Selva, T = Tiputini).  Plain
## assignments are used because the same expressions failed inside
## mutate().
d$Barro <- rowSums(select(d, starts_with("B")))
d$LaSelva <- rowSums(select(d, starts_with("L")))
## starts_with("T") also matches the "taxonomy" and "total" columns,
## hence the extra select() dropping them before summing.
d$Tiputini <- rowSums(select(d, starts_with("T")) %>%
                      select(-taxonomy, -total))

## Keep only the per-forest totals and the taxonomy string.
d <- select(d, one_of("Barro", "Tiputini", "LaSelva", "taxonomy"))

## Clade = third "|"-separated field of the taxonomy string.
d$clade <- apply(d["taxonomy"], 1 , function(x) strsplit(x, "|", fixed = TRUE)[[1]][3])

## rowSums() returns doubles; coerce the counts back to integers.
d$LaSelva <- as.integer(d$LaSelva)
d$Tiputini <- as.integer(d$Tiputini)
d$Barro <- as.integer(d$Barro)

## Long format (clade, forest, abundance), then sum the reads of each
## clade x forest pair with tally(wt = abundance).
d4 <- select(d, -taxonomy) %>%
    gather("forest", "abundance", -clade) %>%
    group_by(clade, forest)
d4$abundance <- as.integer(d4$abundance)
d4 <- tally(d4, wt = abundance, sort = FALSE)

## Clean the clade labels: drop chimeras, rename unassigned OTUs ("*")
## and the "_X" incertae sedis placeholders.
d4 <- d4 %>% filter(clade != "Chimera")
d4$clade[d4$clade == "*"] <- "Unknown"
d4$clade[d4$clade == "Alveolata_X"] <- "Alveolata incertae sedis"
d4$clade[d4$clade == "Amoebozoa_X"] <- "Amoebozoa incertae sedis"
d4$clade[d4$clade == "Stramenopiles_X"] <- "non-Ochrophyta Stramenopiles"

## Keep only clades representing more than 0.1% of the total reads;
## sum(d4$n) is computed after the chimera filter above, so percentages
## are relative to the chimera-free total.
main_taxa2 <- d4 %>%
    select(-forest) %>%
    group_by(clade) %>%
    tally(wt = n, sort = TRUE) %>%
    mutate(percentage = 100 * n / sum(d4$n)) %>%
    filter(percentage > 0.1) %>%
    select(-n, -percentage)

## Keep only the rows of d4 whose clade passed the 0.1% filter.
d4 <- semi_join(d4, main_taxa2, by = "clade")

## Clade order used by the plot legend.
taxa_order_reads2 <- select(d4, clade) %>% distinct()

#-------------------------- Percentage barplots -------------------------------#

## Per-forest stacked percentage barplot of reads for the
## ciliate-specific run.  NOTE(review): only La Selva and Barro
## Colorado appear in the x limits, Tiputini is left out of this
## figure -- confirm this matches the sampling design of the 454
## ciliate run (the run's 20 samples come from these two forests).
p2 <- ggplot(d4, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity", position = "fill") +
    scale_y_continuous(labels = percent_format()) +
    scale_fill_discrete(breaks = taxa_order_reads2$clade,
                            name = "clade") +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro")),
                     breaks = c("LaSelva", "Barro"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)")) +
    ylab("percentage of observed reads") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0),
          axis.title.y = element_blank(),
          legend.title = element_blank(),
          legend.text = element_text(size = 8))


#*****************************************************************************#
#                                                                             #
#                                    Plots                                    #
#                                                                             #
#*****************************************************************************#

## Align the three plots vertically by giving their gtables identical
## column widths (https://gist.github.com/tomhopper/faa24797bb44addeba79).
gA <- ggplot_gtable(ggplot_build(p1))
gB <- ggplot_gtable(ggplot_build(p2))
gC <- ggplot_gtable(ggplot_build(p4))
maxWidth = grid::unit.pmax(gA$widths, gB$widths, gC$widths)
gA$widths <- as.list(maxWidth)
gB$widths <- as.list(maxWidth)
gC$widths <- as.list(maxWidth)
## NOTE(review): grid.newpage() runs before pdf() opens the output
## device, so it acts on the default device -- probably unnecessary.
grid.newpage()

## Render the three aligned plots (40% / 20% / 40% of the page height)
## to a single PDF.
setwd("~/neotropical_diversity/results/stampa/")
output <- "environment_comparisons_protists_reads2.pdf"
pdf(file = output, width = 11, height = 10)
grid.arrange(arrangeGrob(gA, gB, gC, nrow = 3, heights = c(0.4, 0.2, 0.4)))
dev.off()

quit(save="no")

3.33 Oomycota

3.33.1 Functional profiles per forest (Oomycota)

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(reshape2)

## Input files: the cleaned protist OTU table (shared across sections)
## and the per-OTU functional annotations for Oomycota.
setwd("~/neotropical_diversity/results/first_155_samples/Oomycota/")
input <- "../../neotropical_soil_175_samples.OTU.protists_cleaned.table"
input_functions <- "oomycota_OTU_table_function.csv"

## Print several ggplot/grid objects on one page.
##
## ggplot objects can be passed in ..., or via plotlist (a list of
## ggplot objects).
## - cols:   number of columns in the layout.
## - layout: a matrix specifying the layout; if present, 'cols' is
##   ignored.  E.g. matrix(c(1,2,3,3), nrow = 2, byrow = TRUE) puts
##   plot 1 in the upper left, plot 2 in the upper right, and plot 3
##   across the whole bottom row.
## - file:   unused; kept for backward compatibility with callers.
##
## Called for its side effect of drawing on the current graphics
## device; returns NULL invisibly.
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
    library(grid)

    ## Merge the ... arguments and plotlist into a single list.
    plots <- c(list(...), plotlist)
    n_plots <- length(plots)

    ## Nothing to draw: return early instead of building an empty grid
    ## layout (which would error).
    if (n_plots == 0) {
        return(invisible(NULL))
    }

    ## Without an explicit layout, fill a cols-wide grid row by row.
    if (is.null(layout)) {
        layout <- matrix(seq_len(cols * ceiling(n_plots / cols)),
                         ncol = cols, nrow = ceiling(n_plots / cols))
    }

    if (n_plots == 1) {
        print(plots[[1]])
    } else {
        ## Set up the page, then draw each plot in its grid cell(s).
        grid.newpage()
        pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
        for (i in seq_len(n_plots)) {
            ## Rows/columns of the layout cells assigned to plot i (a
            ## plot may span several cells).
            matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
            print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                            layout.pos.col = matchidx$col))
        }
    }
    invisible(NULL)
}

## Import the per-OTU functional annotations (tab-separated despite the
## .csv extension) and map the raw labels to readable legend labels.
f <- read.table(input_functions, sep = "\t", header = TRUE, dec = ".") %>%
    tbl_df()

## read.table() only returns factor columns when stringsAsFactors is
## TRUE (the default before R 4.0).  Coerce explicitly so the levels()
## renaming below works on any R version instead of silently doing
## nothing on a character column.  A no-op when the column is already a
## factor.
f$functions <- factor(f$functions)

levels(f$functions)[levels(f$functions) == "Unknown"] <- "unknown"
levels(f$functions)[levels(f$functions) == "facultative_associated_to_animals_decomposer"] <- "facultative animal saprotroph"
levels(f$functions)[levels(f$functions) == "facultative_associated_to_arthropods"] <- "facultative arthropod parasite"
levels(f$functions)[levels(f$functions) == "facultative_plant_parasite"] <- "facultative plant parasite"
levels(f$functions)[levels(f$functions) == "facultative_probably_associated_to_arthropods"] <- "facultative probable arthropod parasite"
levels(f$functions)[levels(f$functions) == "obligate_invertebrate_parasite"] <- "obligate invertebrate parasite"
levels(f$functions)[levels(f$functions) == "obligate_nematode_parasite"] <- "obligate nematode parasite"
levels(f$functions)[levels(f$functions) == "saprotrophic not parasite"] <- "saprotroph not parasite"
## Echo the final levels for visual inspection.
levels(f$functions)
functional_annotations <- f
rm(f)

## Import the cleaned OTU contingency table (one row per OTU;
## per-sample read-count columns plus "OTU", "taxonomy" and "total").
d <- read.table(input, sep = "\t", header = TRUE, dec=".") %>% tbl_df()

## Per-forest read counts (B = Barro Colorado, L = La Selva,
## T = Tiputini); plain assignments because mutate() failed here.
d$Barro <- rowSums(select(d, starts_with("B")))
d$LaSelva <- rowSums(select(d, starts_with("L")))
## starts_with("T") also matches "taxonomy" and "total"; drop them
## before summing.
d$Tiputini <- rowSums(select(d, starts_with("T")) %>%
                      select(-taxonomy, -total))

## Keep only the per-forest totals, the taxonomy string and the OTU id.
d <- select(d, one_of("Barro", "Tiputini", "LaSelva", "taxonomy", "OTU"))

## Clade = fourth "|"-separated field of the taxonomy string.
d$clade <- apply(d["taxonomy"], 1 , function(x) strsplit(x, "|", fixed = TRUE)[[1]][4])

## rowSums() returns doubles; coerce the counts back to integers.
d$LaSelva <- as.integer(d$LaSelva)
d$Tiputini <- as.integer(d$Tiputini)
d$Barro <- as.integer(d$Barro)

## Keep only Oomycota OTUs, then drop the columns that are no longer
## needed.
d <- filter(d, clade == "Oomycota") %>% select(-taxonomy, -clade)

## Attach the functional annotation of each OTU (OTUs without an
## annotation get NA), then drop the OTU id.
d <- left_join(d, functional_annotations, by = "OTU") %>% select(-OTU)

## Long format, then sum the reads of each function x forest pair.
d2 <- gather(d, "forest", "abundance", -functions) %>%
      group_by(functions, forest) %>%
      tally(wt = abundance, sort = FALSE)

#------------------------ Absolute barplots -----------------------------------#

## Stacked barplot of read counts per forest, colored by function.
p1 <- ggplot(d2, aes(x = forest, y = n, fill = functions)) +
    geom_bar(stat = "identity", colour = "black", size = 0.1) +
    scale_y_continuous(labels = comma) +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("number of observed reads") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0)) ##  +

## Count OTUs (not reads) per function x forest: drop zero-abundance
## rows, then tally() without "wt" counts the remaining rows.  The
## comparison against the string "0" relies on R coercing abundance to
## character; it is equivalent to abundance != 0 here.
d3 <- gather(d, "forest", "abundance", -functions) %>%
    group_by(functions, forest) %>%
    filter(abundance != "0") %>%
    tally(sort= TRUE)

## Stacked barplot of OTU counts per forest, colored by function.
p2 <- ggplot(d3, aes(x = forest, y = n, fill = functions)) +
    geom_bar(stat = "identity", colour = "black", size = 0.1) +
    scale_y_continuous(labels = comma) +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("number of observed OTUs") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Both barplots on one page; the output name is derived from the input
## table name.
output <- gsub(".table", "_group_by_forests_absolute_oomycota_functions.pdf", input, fixed = TRUE)
pdf(file = output, width = 11 , height = 10)
multiplot(p1, p2)
dev.off()


#-------------------------- Percentage barplots -------------------------------#

## Same read barplot as above, but normalized to 100% per forest
## (position = "fill").
p1 <- ggplot(d2, aes(x = forest, y = n, fill = functions)) +
    geom_bar(stat = "identity", position = "fill", colour = "black", size = 0.1) +
    scale_y_continuous(labels = percent_format()) +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("percentage of observed reads") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Same OTU barplot as above, but normalized to 100% per forest.
p2 <- ggplot(d3, aes(x = forest, y = n, fill = functions)) +
    geom_bar(stat = "identity", position = "fill", colour = "black", size = 0.1) +
    scale_y_continuous(labels = percent_format()) +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("percentage of observed OTUs") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Both percentage barplots on one page.
output <- gsub(".table", "_group_by_forests_relative_oomycota_functions.pdf", input, fixed = TRUE)
pdf(file = output, width = 11 , height = 10)
multiplot(p1, p2)
dev.off()

quit(save="no")

3.33.2 Heatmaps

## Dependencies for the heatmap section.  BUG FIX: the original read
## "ibrary(ggplot2)" (missing "l"), which aborts the script.
library(ggplot2)
library(scales)
library(plyr)
library(dplyr)
library(tidyr)
library(vegan)

## Legend label formatter: scales::comma with one digit.  NOTE(review):
## recent scales versions replaced the "digits" argument with
## "accuracy" -- confirm against the installed scales version.
comma1 <- function(...) {
    comma(..., digits = 1)
}

# Load the OTU table: one row per OTU; columns 3 to (ncol - 5) are
# assumed to hold the per-sample read counts, with the trailing columns
# holding totals, taxonomy, similarity and best matches -- TODO confirm
# against the actual table layout.
mat_protist<-read.table("neotropical_soil_175_samples.OTU.protists_cleaned.table",h=T)
# Transpose to a samples x OTUs count matrix.
mat<-t(mat_protist[,3:(ncol(mat_protist)-5)])
colnames(mat)<-paste("otu",mat_protist[,1],sep="_")
# Split the "|"-separated taxonomy strings ("+" -> space, "*" ->
# "unclassified") into one column per rank, then append the similarity
# and best-match columns.
taxo<-cbind.data.frame(ldply(strsplit(gsub("\\+"," ",gsub("\\*","unclassified",as.character(mat_protist[,(ncol(mat_protist)-1)]))),
                                      split="|",fixed=T)),mat_protist[,c((ncol(mat_protist)-2),ncol(mat_protist))])
rownames(taxo)<-paste("otu",mat_protist[,1],sep="_")
ranks<-c("Domain","Kingdom","Phylum","Class","Order","Family","Genus","Species","Genus_2","Species_2")
colnames(taxo)<-c(ranks,"similarity","best_matches")
# Keep Oomycota OTUs only, drop samples without any Oomycota read, and
# convert the counts to within-sample relative abundances.
taxo_oo<-taxo[which(taxo$Class=="Oomycota"),]
mat_oo<-mat[,rownames(taxo_oo)]
mat_oo<-mat_oo[which(rowSums(mat_oo)>0),]
ra_oo<-mat_oo/rowSums(mat_oo)

# Replace "unclassified" with "previous-rank_X" following the PR2
# naming rule, for the Family (6), Genus (7) and Species (8) columns;
# doubly-unclassified entries get the grandparent name plus "_XX".
for (i in 6:8) {
  tmp_pos<-which(taxo_oo[,i]=="unclassified")
  tmp_prev<-taxo_oo[tmp_pos,i-1]
  taxo_oo[tmp_pos,i]<-ifelse(tmp_prev=="unclassified",
         ifelse(grepl("X$",taxo_oo[tmp_pos,i-2]),paste0(taxo_oo[tmp_pos,i-2],"XX"),paste0(taxo_oo[tmp_pos,i-2],"_XX")),
         ifelse(grepl("X$",tmp_prev),paste0(tmp_prev,"X"),paste0(tmp_prev,"_X")))}
taxo_oo[,ranks]<-lapply(taxo_oo[,ranks],factor)

# Map the first letter of each sample name to its forest.
loc<-rbind(c("B","Panama\n(Barro Colorado)"),c("L","Costa Rica\n(La Selva)"),c("T","Ecuador\n(Tiputini)"))
colnames(loc)<-c("ab","forest")
# NOTE(review): merge() may reorder rows; the cbind below pairs
# env$forest with the rows of ra_oo positionally, which is only correct
# if the samples remain in rowname order -- confirm.
env<-merge(loc,cbind.data.frame(ab=substr(rownames(mat_oo),1,1),sample=rownames(mat_oo)),by="ab")

# Average OTU relative abundance per forest, long format, with genus
# names joined in for labelling.
# BUG FIX: the original used the undefined object "ra"; the relative
# abundance matrix computed above is "ra_oo".
forest_mean<-cbind.data.frame(forest=env$forest,ra_oo) %>% tbl_df() %>%
  group_by(forest) %>% summarize_each(funs(mean)) %>%
  gather("otu","ral",-forest) %>%
  mutate_each(funs(factor),otu) %>%
  full_join(cbind.data.frame(otu=rownames(taxo_oo),genus=taxo_oo$Genus)) %>%
  mutate(name=paste(otu,genus,sep=" \t"))

# Heatmap: one row per OTU (ordered by relative abundance), one column
# per forest, log-transformed white-to-steelblue fill.  NOTE(review):
# legend.margin=unit(...) is deprecated in recent ggplot2 (expects
# margin()) -- confirm against the installed version.
g1<-ggplot(forest_mean, aes(forest,reorder(otu,ral))) +
  geom_tile(aes(fill=ral)) + # or geom_tile to allow space between rectangles
  scale_fill_continuous(name="Relative\nabundance",label=comma1,low="white",high="steelblue",na.value="white",trans='log') +
                        #breaks=c(10^-6,5*10^(-5:-1)+10^-6),labels=c("0",paste0("5e-",5:1))) + # for legend with scientific notation
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  theme_grey() +
  theme(axis.ticks=element_blank(),
        axis.title=element_blank(),
        axis.text=element_text(colour="black",size=9),
        axis.text.y=element_text(hjust=0,size=6),
        legend.title.align=0.5,
        legend.margin=unit(1,"points"),
        plot.margin=unit(c(5,0,5,5),"points"))
ggsave("Heatmap_Oomycota_OTUs.pdf",g1,width=4,height=10)

# Sum the per-forest mean OTU relative abundances per family.
family_forest<-cbind.data.frame(otu=rownames(taxo_oo),family=taxo_oo$Family) %>%
  tbl_df() %>%
  full_join(forest_mean) %>%
  group_by(forest,family) %>%
  summarize_each(funs(sum),-otu,-name,-genus)

# Family-level heatmap: one row per family, one column per forest.
g2<-ggplot(family_forest, aes(forest,reorder(family,ral))) +
  geom_tile(aes(fill=ral)) +
  scale_fill_continuous(name="Relative\nabundance",low="white",high="steelblue") +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  ylab(label="family") +
  theme(axis.ticks=element_blank(),
        axis.title=element_blank(),
        axis.text=element_text(colour="black",size=8),
        axis.text.y=element_text(hjust=0,size=7),
        legend.title.align=0.5,
        legend.margin=unit(1,"points"),
        plot.margin=unit(c(5,0,5,5),"points"))
ggsave("Heatmap_Oomycota_family.pdf",g2,width=4,height=3)

# Sum the per-forest mean OTU relative abundances per genus (the genus
# column is already present in forest_mean).
genus_forest<-group_by(forest_mean,forest,genus) %>%
  summarize_each(funs(sum),-otu,-name)

# Genus-level heatmap: one row per genus, one column per forest.
g3<-ggplot(genus_forest, aes(forest,reorder(genus,ral))) +
  geom_tile(aes(fill=ral)) +
  scale_fill_continuous(name="Relative\nabundance",low="white",high="steelblue") +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  ylab(label="genus") +
  theme(axis.ticks=element_blank(),
        axis.title=element_blank(),
        axis.text=element_text(colour="black",size=8),
        axis.text.y=element_text(hjust=0,size=7),
        legend.title.align=0.5,
        legend.margin=unit(1,"points"),
        plot.margin=unit(c(5,0,5,5),"points"))
ggsave("Heatmap_Oomycota_genus.pdf",g3,width=4,height=5)

3.34 Sample median size

# kl (presumably the host this was run on -- TODO confirm)
cd ~/Ciliata_neotropical/data/

# Per-sample read totals: columns 3-156 hold the sample counts; the awk
# program sums each column, and sort -n orders the totals so the median
# can be read off the middle value.
cut -f 3-156 neotropical_soil_175_samples.OTU.table | \
    awk -F "\t" '{for (i=1 ; i<=NF ; i++) {a[i] += $i}} END {for (i in a) {print a[i]}}' | \
    sort -n

median: 803,000

3.35 EPA likelihood histograms

library(tidyr)
library(dplyr)
library(ggplot2)
library(scales)

setwd("~/Ciliata_neotropical_diversity/supplement_placement_confidence/")
## Colorblind-friendly palette.
cbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

## Histogram data for the most probable placement's likelihood weights.
## The inputs use decimal commas, hence dec = ",".
input <- "trops_placement_dist_hist_1st.csv"
first_placement <- read.table(input, sep = "\t", dec = ",", header = TRUE) %>%
    select(Range.Start, Count) %>%
    mutate(rank = "most probable placement")

## Same histogram data for the second most probable placement.
input2 <- "trops_placement_dist_hist_2nd.csv"
second_placement <- read.table(input2, sep = "\t", dec = ",", header = TRUE) %>%
    select(Range.Start, Count) %>%
    mutate(rank = "second most probable placement")

## Stack both histograms into one long table.
placements <- bind_rows(first_placement, second_placement)

## Normalize the decimal commas left in Range.Start to dots; the result
## is character, so ggplot treats the x axis as discrete bins.
placements$Range.Start <- sub(",", ".", placements$Range.Start)

## Dodged barplot comparing the likelihood-weight distributions of the
## first and second most probable placements.
ggplot(placements, aes(x = Range.Start, y = Count, fill = rank)) +
    geom_bar(stat = "identity", position = position_dodge()) +
    scale_x_discrete() +
    scale_y_continuous(labels = comma) +
    scale_fill_manual(values = cbPalette) +
    xlab("likelihood weights") +
    ylab("number of amplicons") +
    theme_bw(base_size = 16) +
    theme(legend.justification = c(1, 0),
          legend.position = c(0.8, 0.8),
          legend.title = element_blank(),
          axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 11),
          legend.background = element_rect(fill = "white", colour = "gray",
              size = 0.5, linetype = "solid"))

## ggsave() without an explicit plot saves the last plot displayed.
output <- "hist_first_and_second_placement.eps"
ggsave(file = output, width = 14 , height = 5.5)


## Histogram data for the expected distance between placement locations
## (EDPL).
input3 <- "trops_placement_dist_hist_edpl.csv"
edpl <- read.table(input3, sep = "\t", dec = ",", header = TRUE) %>%
    select(Range.Start, Count)

## Normalize the decimal commas to dots (see above).
edpl$Range.Start <- sub(",", ".", edpl$Range.Start)

## NOTE(review): the x label still says "likelihood weights", but this
## plot shows EDPL values -- confirm the intended axis label.
ggplot(edpl, aes(x = Range.Start, y = Count)) +
    geom_bar(stat = "identity", fill = "#E69F00", width = 0.55) +
    scale_x_discrete() +
    scale_y_continuous(labels = comma) +
    xlab("likelihood weights") +
    ylab("number of amplicons") +
    theme_bw(base_size = 16) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 11))

## Output to EPS
output <- "hist_edpl_placement.eps"
ggsave(file = output, width = 14 , height = 5.5)

## ggplot(placements, aes(x = Range.Start, y = Count, colour = rank)) +
##     geom_segment(aes(xend = Range.Start, yend = 0),
##                  size = 2, alpha = 0.5) +
##     scale_x_discrete() +
##     scale_y_continuous(labels = comma) +
##     xlab("likelihood weights") +
##     ylab("number of amplicons") +
##     theme_bw(base_size = 16) +
##     theme(legend.justification = c(1, 0), legend.position = c(0.8, 0.8),
##               legend.background = element_rect(fill="gray90", size=.5, linetype="dotted"))

## +
##     theme(axis.title.x = element_text(vjust = 0),
##           axis.title.y = element_text(vjust = 1),
##           strip.text.y = element_text(size = 9.5))

##     facet_grid(environment ~ ., scales="free_y") +

quit(save="no")

4 First 454 run (universal V4 primers)

The first 10 samples are from Costa Rica, and the second 10 are from Panama. The samples are evenly spread out across the field sites; that is, about as far apart as possible.

The amplicons are already split into 20 files, corresponding to the 20 samples.

4.1 PCR conditions

The primers are:

V4F = CCAGCASCYGCGGTAATTCC V4R = ACTTTCGTTCTTGATYRA

Also known as:

TAReuk454FWD1 (Forward): 5‘-CCAGCA(G/C)C(C/T)GCGGTAATTCC-3‘ TAReukRev3 (Reverse): 5‘-ACTTTCGTTCTTGAT(C/T)(A/-G)A-3‘

Standard master mix (in μl):

  • 19,75 H20
  • 2,5 10x Buffer
  • 0,5 dNTP
  • 0,5 V4F
  • 0,5 V4R
  • 0,25 polymerase
  • 1 template

Cycling conditions:

  1. 95°C 5 min
  2. 94°C 0:30 min
  3. 47°C 0:45 min
  4. 72°C 1 min
  5. 72°C 5min; 2-4 = 29x

4.2 Samples

We have 10 samples from "La Selva" and 10 from "Barro".

  Barcode Sample Library
1 A35505 L010 lib28265
2 A35506 L020 lib28266
3 A35507 L030 lib28267
4 A43981 L040 lib28268
5 A43982 L050 lib28269
6 A43983 L060 lib28270
7 A43984 L070 lib28271
8 A43985 L080 lib28272
9 A43986 L090 lib28273
10 A43987 L100 lib28274
11 A43988 B010 lib28275
12 A43989 B020 lib28276
13 A43990 B030 lib28277
14 A43991 B040 lib28278
15 A43992 B050 lib28279
16 A43993 B060 lib28280
17 A43994 B070 lib28281
18 A43995 B080 lib28282
19 A43996 B090 lib28283
20 A43997 B100 lib28284

4.3 Extract, chimera check and dereplicate

We used a conservative approach to extract amplicons by keeping only amplicons containing both forward and reverse primers, using the following regex:

V4F = CCAGCA[ACGT]C[ACGT]GCGGTAATTC[ACGT] V4R = T[ACGT][ACGT]ATCAAGAACGAAAGT

Degenerate primers are replaced by [ACGT]. The last base of the V4F primer is also replaced by an unknown nucleotide. The V4R primer is reverse-complemented.

## Per-sample pipeline: extract the reads bounded by both primers,
## dereplicate identical sequences, then remove de novo chimeras.
cd ~/neotropical_diversity/data/amplicons/first_454_run/
USEARCH="${HOME}/bin/usearch7.0.1001_i86linux32"
INPUT=$(mktemp)
READS=$(mktemp)
NOCHIMERAS=$(mktemp)
# Unzip the raw per-sample fasta files
unzip -j *.zip \*.fna
# Extract and dereplicate, one sample (one .fna file) at a time
for f in *.fna ; do
    # Sample name = text between the first "_" and "_lib"
    sample=${f%%_lib*}
    sample=${sample#*_}
    # Linearize the fasta (one sequence per line), uppercase, and keep
    # only the insert between the forward and reverse primer matches
    awk 'NR==1 {print ; next} {printf /^>/ ? "\n"$0"\n" : $1} END {print}' "${f}" |
    grep -v "^>" | tr "acgt" "ACGT" |
    sed -n 's/.*CCAGCA[ACGT]C[ACGT]GCGGTAATTC[ACGT]\([ACGTN][ACGTN]*\)T[ACGT][ACGT]ATCAAGAACGAAAGT.*/\1/p' > "${READS}"
    PRIMERS=$(wc -l < "${READS}")
    # Drop reads containing Ns, dereplicate (uniq -c gives abundances),
    # name each unique sequence by its SHA1 hash, and sort by
    # decreasing abundance (ties broken by hash)
    grep -v "N" "${READS}" | sort -d | uniq -c |
    while read abundance sequence ; do
        hash=$(printf "${sequence}" | sha1sum)
        hash=${hash:0:40}
        printf ">%s_%d_%s\n" "${hash}" "${abundance}" "${sequence}"
    done | sort -t "_" -k2,2nr -k1.2,1d |
    sed -e 's/\_\([0-9][0-9]*\)/;size=\1;/1' |
    sed 's/\_/\n/1' > "${INPUT}"
    # Chimera detection (call uchime, modify and linearize again)
    # Uchime only works on capital letters... bad design?
    "${USEARCH}" -uchime_denovo "${INPUT}" -nonchimeras "${NOCHIMERAS}"
    sed -e 's/;size=\([0-9][0-9]*\);/\_\1/' "${NOCHIMERAS}" |
    awk 'NR==1 {print ; next} {printf /^>/ ? "\n"$0"\n" : $1} END {print}' > "${sample}.fas"
    # Per-sample statistics: raw reads, reads with both primers, reads
    # without Ns, and reads left after chimera removal (sums + uniques)
    RAW=$(grep -c "^>" "${f}")
    NO_Ns=$(awk -F "[;=]" '/^>/ {sum += $3 ; unique += 1} END {print sum, unique}' "${INPUT}")
    CHIMERAS=$(awk -F "_" '/^>/ {sum += $2 ; unique += 1} END {print sum, unique}' "${sample}.fas")
    echo "${sample} ${RAW} ${PRIMERS} ${NO_Ns} ${CHIMERAS}" >> tmp.log
done
cat tmp.log
rm -f "${INPUT}" "${NOCHIMERAS}" "${READS}" *.fna tmp.log

## Dereplicate across the whole project: merge the per-sample fastas,
## summing abundances of identical amplicons (keyed by SHA1) in an Awk
## table, then sort by decreasing abundance.
cat [BL]*.fas |
awk 'BEGIN {RS = ">" ; FS = "[_\n]"} {if (NR != 1) {abundances[$1] += $2 ; sequences[$1] = $3}} END {for (amplicon in sequences) {print ">" amplicon "_" abundances[amplicon] "_" sequences[amplicon]}}' |
sort -t "_" -k2,2nr -k1.2,1d |
sed -e 's/\_/\n/2' > first_20_samples.fas
bzip2 -9kf first_20_samples.fas
sample raw primers uniques no Ns chimeras uniques %
B010 20786 19049 3017 18989 18694 2840 89.9
B020 26335 24674 3586 24582 23905 3512 90.8
B030 38128 35514 4311 35401 34966 4098 91.7
B040 35380 32676 4742 32579 31607 4214 89.3
B050 29504 27137 3476 27018 26861 3407 91.0
B060 35290 32843 5319 32733 32338 5087 91.6
B070 20746 19235 2705 19176 18905 2685 91.1
B080 39007 36452 3800 36406 35848 3528 91.9
B090 42257 39374 6563 39266 37055 5823 87.7
B100 63626 59038 5637 58967 54865 5153 86.2
L010 31290 26985 6632 26880 26429 6276 84.5
L020 42868 40048 3253 39971 39044 3134 91.1
L030 46168 38578 9464 38384 38234 9324 82.8
L040 41479 38634 9696 38552 37174 8953 89.6
L050 39664 36812 8254 36713 35515 7576 89.5
L060 27499 24977 5206 24910 24202 4862 88.0
L070 32698 30031 7153 29965 28807 6796 88.1
L080 19319 17463 4657 17395 17064 4396 88.3
L090 29568 27078 5484 26984 26519 5232 89.7
L100 31112 28483 6759 28393 27932 6419 89.8
Total 692724 635081 109714 633264 615964 103315 88.9

4.3.1 Effect of truncating the reverse primer

The table below shows the effect of truncating the reverse primer on the number of collected amplicons (for the first sample: NG-6848_B010_lib28275_1779_01.zip).

Primer R captured amplicons
T[ACGT][ACGT]ATCAAGAACGAAAGT 18989
T[ACGT][ACGT]ATCAAGAACGAAAG 19002
T[ACGT][ACGT]ATCAAGAACGAAA 19346
T[ACGT][ACGT]ATCAAGAACGAA 19464
T[ACGT][ACGT]ATCAAGAACGA 19471
T[ACGT][ACGT]ATCAAGAACG 19504
T[ACGT][ACGT]ATCAAGAAC 19527
T[ACGT][ACGT]ATCAAGAA 19566
T[ACGT][ACGT]ATCAAGA 19680
T[ACGT][ACGT]ATCAAG 19701
T[ACGT][ACGT]ATCAA 19908
T[ACGT][ACGT]ATCA 19980
T[ACGT][ACGT]ATC 20153
T[ACGT][ACGT]AT 20310
T[ACGT][ACGT]A 20377
[ACGT][ACGT][ACGT]ATCAAGAACGAAAGT 19223

4.4 Clustering

4.4.1 Use swarm to clusterize the dataset

cd ~/neotropical_diversity/results/swarm/

# Cluster the dereplicated amplicons with swarm, refine the clusters
# with swarm_breaker.py, then build per-OTU statistics (number of
# amplicons, number of reads, seed id, seed abundance, singletons),
# sorted by decreasing reads then amplicons.
BREAKER="../../../Swarms/swarm/scripts/swarm_breaker.py"
FASTA="first_20_samples.fas"
swarm < "${FASTA}" > "${FASTA/.fas/_1.swarms}"
python "${BREAKER}" -f "${FASTA}" -s "${FASTA/.fas/_1.swarms}" 2> /dev/null > "${FASTA/.fas/_1.swarms_new}"
awk 'BEGIN {FS = "[_ ]"; OFS = "\t"} {sum = 0 ; singletons = 0 ; for (i=2; i<=NF; i=i+2) {sum += $i ; if ($i == 1) {singletons += 1}} ; print NF - 1, sum, $1, $2, singletons}' "${FASTA/.fas/_1.swarms_new}" |
sort -k2,2nr -k1,1nr -k3,3d > "${FASTA/.fas/_1.stats_new}"

4.4.2 Error repartition for the top seed

cd ~/neotropical_diversity/results/swarm/

FASTA="first_20_samples.fas"
SWARMS="first_20_samples_1.swarms_new"
SCRIPT="../../src/seed_vs_crown_error_distributions.py"
AMPLICONS=$(mktemp)
PAIRS=$(mktemp)
TMP_FASTA=$(mktemp)

# Extract the top seed (the fasta is abundance-sorted, so the first
# entry) and all the amplicons belonging to its OTU
SEED=$(head -n 1 "${FASTA}" | tr -d ">" | cut -d "_" -f 1)
grep -A 1 -F -f <(grep -m 1 "^${SEED}" "${SWARMS}" | tr " " "\n") "${FASTA}" | sed -e '/^--$/d' > "${AMPLICONS}"
# swarm -b writes the seed/micro-variant pairs on stderr; keep only the
# pairs that involve the top seed
swarm -d 1 -b "${AMPLICONS}" 2>&1 >/dev/null | grep "${SEED}" > "${PAIRS}"
grep -A 1 -F -f <(echo "${SEED}" ; cut -f 3 "${PAIRS}") "${FASTA}" |
sed -e '/^--$/d' > "${TMP_FASTA}"

# Localize each error (python script)
python "${SCRIPT}" -i "${TMP_FASTA}" > "${FASTA/.fas/_}${SEED}.errors"

# clean up the temporary files
rm "${AMPLICONS}" "${PAIRS}" "${TMP_FASTA}"

Produce a visualization (mutations, insertions, deletions)

library(ggplot2)
setwd("~/neotropical_diversity/results/swarm/")

## Error positions (mutations, insertions, deletions) between the most
## abundant seed and each of its micro-variants.
errors <- read.table("first_20_samples_825ec635c410c114e9ca0486a7f0aa3ab2751a97.errors", sep =" ")
colnames(errors) <- c("seed", "seed_abundance", "seed_length", "subseed",
                      "subseed_abundance", "error_type", "position")

## Every row refers to the same seed, so its length can be read from
## the first row.
seed_length <- errors$seed_length[1]

## One panel per error type: micro-variant abundance (log scale) as a
## function of the error position along the seed sequence.
error_cloud <- ggplot(errors, aes(x = position, y = subseed_abundance)) +
    geom_point() +
    scale_x_continuous(limits = c(0, seed_length)) +
    scale_y_log10() +
    xlab("position along the seed sequence") +
    ylab("micro-variant copy-numbers") +
    facet_grid(error_type ~ .)
print(error_cloud)

ggsave(file = "first_20_samples_825ec635c410c114e9ca0486a7f0aa3ab2751a97.error_cloud.pdf",
       plot = error_cloud, width = 7, height = 6)
ggsave(file = "first_20_samples_825ec635c410c114e9ca0486a7f0aa3ab2751a97.error_cloud.svg",
       plot = error_cloud, width = 7, height = 6)

quit(save="no")

4.5 Taxonomic assignments

# n0 (presumably the host this was run on -- TODO confirm)
cd ~
# Unpack the V4 reference sequences extracted from PR2
FILE="CCAGCASCYGCGGTAATTCC_ACTTTCGTTCTTGATYRA-k3_extracted.fasta.tar.gz"
tar -xzf "${FILE}"
# Clean the fasta headers (keep the accession plus the taxonomy fields)
awk 'BEGIN {FS = "|" ; OFS="|"} {if (/^>/) {printf "%s %s", $1, $7 ; for (i=8 ; i<=NF ; i++) {printf "|%s", $i} ; print ""} else print}' "${FILE/.tar.gz/}" > "V4_${FILE/-k3_extracted.fasta.tar.gz/}_20140217.fasta"
rm -f "${FILE/.tar.gz/}"
# Assign each amplicon against the V4 reference database
bash mass_pairwise.sh ../neotropical_diversity_soils/data/first_20_samples.fas V4_PR2

4.6 Contingency tables

4.6.1 Relations between amplicons and samples

Add the taxonomic assignments of amplicons to the table. Computation is slow, but it works.

cd ~/neotropical_diversity/results/
# Header: amplicon id, total abundance, one column per sample, then the
# taxonomic-assignment columns
(echo -ne "Amplicon\tTotal\t" ;
for f in ../data/amplicons/first_454_run/[BL]*.fas ; do
    f=${f##*/}
    echo ${f/.fas/}
done | tr "\n" "\t" ;
echo -e "Identity\tTaxonomy\tReferences") > amplicons2samples.csv

# One row per amplicon: per-sample abundances are looked up with grep
# in each sample file (amplicons absent from a sample get 0), and the
# taxonomic assignment comes from the stampa results
grep "^>" first_20_samples.fas | tr "_" "\t" | tr -d ">" |
while read amplicon total ; do
    echo -ne "${amplicon}\t${total}\t"
    for f in ../data/amplicons/first_454_run/[BL]*.fas ; do
        abundance=$(grep -m 1 "^>${amplicon}" ${f})
        abundance=${abundance##*_}
        echo -ne "${abundance:-0}\t"
    done
    assignment=$(grep -m 1 "^${amplicon}" ./stampa/first_20_samples.results | cut -f 3-)
    echo "${assignment}"
done >> amplicons2samples.csv

Rewrite using Awk (work in progress)

# Same lookup in pure Awk (requires gawk >= 4 for arrays of arrays; the
# substr offsets are hardcoded to the length of the input path)
awk 'BEGIN {FS = "_"} {if (FNR == 1) {file = substr(FILENAME, 33, 4)} ; if (/^>/) {a[file][substr($1, 2)] = $2}} END {for (file in a) for (amplicon in a[file]) print file, amplicon, a[file][amplicon]}' ../data/amplicons/first_454_run/B*.fas > /dev/null

4.6.2 Relations between OTUs and samples

cd ~/neotropical_diversity/results/
# Header: OTU number followed by the amplicons2samples.csv header
echo -e "OTU\t$(head -n 1 amplicons2samples.csv)" > OTUs2samples.csv
AMPLICONS=$(mktemp)
OTU=1
# Walk the OTUs by decreasing abundance; for each, collect its amplicon
# ids and sum their per-sample abundances
sort -k2,2nr -k1,1nr ./swarm/first_20_samples_1.stats |
while read a b c d e f g ; do
    grep -m 1 "^${c}" ./swarm/first_20_samples_1.swarms | tr " " "\n" | cut -d "_" -f 1 > "${AMPLICONS}"
    ## The number of samples is hardcoded (end before the 23rd
    ## column). Use tac to be sure that the last line treated is the
    ## one with the most abundant amplicon.
    grep -F -f "${AMPLICONS}" amplicons2samples.csv | tac |
    awk -v OTU=$OTU 'BEGIN {FS = "\t"} {for (i = 2 ; i < 23 ; i++) {sums[i] += $i}} END {{printf "%s\t%s\t", OTU, $1} ; for (i = 2 ; i < 23 ; i++) {printf "%i\t", sums[i]} ; {printf "%s\t%s\t%s\n", $23, $24, $25}}'
    OTU=$(( $OTU + 1 ))
done >> OTUs2samples.csv
rm "${AMPLICONS}"

5 Second 454 run (ciliate specific V4 primers)

5.1 PCR conditions

Half a plate with the same 20 samples, using a two step amplification:

  • ciliate specific primers,
  • universal V4 primers

The primers are:

Cil_F: 5’ - TGG TAG TGT ATT GGA CWA CCA -3’

Cil R (1-3):

5’ - TCT GAT CGT CTT TGA TCC CTT A – 3’ 5’ - TCT RAT CGT CTT TGA TCC CCT A – 3’ 5’ - TCT GAT TGT CTT TGA TCC CCT A – 3’

The reverse primer is a mix of 3 different primers (1/3rd each).

The PCR Program:

(with Hot Start Taq from Qiagen)

95°C 5min 94°C 30sec 56°C 30sec 72°C 60sec 72°C 10min 4°C end

25 Cycles should be enough.

5.2 Extract, chimera check and dereplicate

We used a conservative approach to extract amplicons by keeping only amplicons containing both forward and reverse primers, using the following regex:

V4F = CCAGCA[ACGT]C[ACGT]GCGGTAATTC[ACGT] V4R = T[ACGT][ACGT]ATCAAGAACGAAAGT

Degenerate primers are replaced by [ACGT]. The last base of the V4F primer is also replaced by an unknown nucleotide. The V4R primer is reverse-complemented.

## Dereplicate each sample: extract reads containing both V4 primers,
## discard reads with Ns, dereplicate, then remove de novo chimeras.
cd ~/neotropical_diversity/data/amplicons/second_454_run/
USEARCH="${HOME}/bin/usearch7.0.1001_i86linux32"
INPUT=$(mktemp)
READS=$(mktemp)
NOCHIMERAS=$(mktemp)
# Unzip (junk the zip's directory structure, keep only the fasta files)
unzip -j *.zip \*.fna
# Extract and dereplicate
for f in *.fna ; do
    # Sample name: the part between the first "_" and "_lib"
    sample=${f%%_lib*}
    sample=${sample#*_}
    # Linearize the fasta (one sequence per line), drop headers,
    # uppercase, and keep only the region between the two primers
    awk 'NR==1 {print ; next} {printf /^>/ ? "\n"$0"\n" : $1} END {print}' "${f}" |
    grep -v "^>" | tr "acgt" "ACGT" |
    sed -n 's/.*CCAGCA[ACGT]C[ACGT]GCGGTAATTC[ACGT]\([ACGTN][ACGTN]*\)T[ACGT][ACGT]ATCAAGAACGAAAGT.*/\1/p' > "${READS}"
    PRIMERS=$(wc -l < "${READS}")
    # Discard reads with Ns, dereplicate (sort | uniq -c) and name each
    # unique sequence ">hash_abundance" (first 40 hex chars of SHA-1)
    grep -v "N" "${READS}" | sort -d | uniq -c |
    while read abundance sequence ; do
        # Always pass data through an explicit "%s" format: never use
        # data as printf's format string
        hash=$(printf "%s" "${sequence}" | sha1sum)
        hash=${hash:0:40}
        printf ">%s_%d_%s\n" "${hash}" "${abundance}" "${sequence}"
    done | sort -t "_" -k2,2nr -k1.2,1d |
    sed -e 's/\_\([0-9][0-9]*\)/;size=\1;/1' |
    sed 's/\_/\n/1' > "${INPUT}"
    # Chimera detection (call uchime, modify and linearize again)
    # Uchime only works on capital letters... bad design?
    "${USEARCH}" -uchime_denovo "${INPUT}" -nonchimeras "${NOCHIMERAS}"
    sed -e 's/;size=\([0-9][0-9]*\);/\_\1/' "${NOCHIMERAS}" |
    awk 'NR==1 {print ; next} {printf /^>/ ? "\n"$0"\n" : $1} END {print}' > "${sample}.fas"
    # Stats: reads and unique sequences remaining after each step
    RAW=$(grep -c "^>" "${f}")
    NO_Ns=$(awk -F "[;=]" '/^>/ {sum += $3 ; unique += 1} END {print sum, unique}' "${INPUT}")
    CHIMERAS=$(awk -F "_" '/^>/ {sum += $2 ; unique += 1} END {print sum, unique}' "${sample}.fas")
    echo "${sample} ${RAW} ${PRIMERS} ${NO_Ns} ${CHIMERAS}" >> tmp.log
done
cat tmp.log
rm -f "${INPUT}" "${NOCHIMERAS}" "${READS}" *.fna tmp.log

## Dereplicate the whole project (using an Awk table)
## Merge the per-sample fasta files: sum abundances per unique amplicon
## (keyed by hash), re-sort by decreasing abundance then hash, and put
## the sequence back on its own line (sed replaces the 2nd "_").
cat [BL]*.fas |
awk 'BEGIN {RS = ">" ; FS = "[_\n]"} {if (NR != 1) {abundances[$1] += $2 ; sequences[$1] = $3}} END {for (amplicon in sequences) {print ">" amplicon "_" abundances[amplicon] "_" sequences[amplicon]}}' |
sort -t "_" -k2,2nr -k1.2,1d |
sed -e 's/\_/\n/2' > second_20_samples.fas
bzip2 -9k second_20_samples.fas
sample raw primers uniques no Ns chimeras uniques %
B010 17443 15778 1498 15763 15680 1457 89.9
B020 87 77 22 77 77 22 88.5
B030 13367 12061 1928 11987 11931 1875 89.3
B040 29662 26727 1709 26701 26696 1704 90.0
B050 70 60 12 60 60 12 85.7
B060 15508 13916 1818 13889 13860 1790 89.4
B070 30 21 13 21 21 13 70.0
B080 22304 20605 1470 20581 20501 1405 91.9
B090 25171 22560 2882 22507 22387 2790 88.9
B100 28365 24729 1357 24701 24696 1352 87.1
L010 19887 18161 2810 18102 18010 2724 90.6
L020 682 632 196 630 630 196 92.4
L030 18829 17094 2534 16993 16984 2525 90.2
L040 20388 18451 3924 18372 18145 3717 89.0
L050 23487 21592 3636 21495 21416 3567 91.2
L060 23196 20907 3427 20813 20767 3384 89.5
L070 21535 19527 3656 19409 19346 3599 89.8
L080 15725 14216 2790 14141 14103 2754 89.7
L090 21691 19891 3323 19822 19726 3237 90.9
L100 20914 18632 2917 18571 18476 2829 88.3
Total 338341 305637 41922 304635 303512 40952 89.7

5.3 Swarm 2.0

  • swarm,
  • chimera detection,
  • taxonomic assignment,
  • contingency tables,
  • stampa plots
# NOTE(review): "kl" looks like a shell alias/login to the compute
# cluster where these jobs run — confirm before replaying.
kl
cd src/

FASTA="../neotropical_diversity/data/Roche454_Ciliate_specific/neotropical_soil_20_samples_454_ciliate.fas"
# Cluster the dereplicated reads, chimera-check the OTU representatives,
# then taxonomically assign with stampa (SSU V4 references)
bash swarm_fastidious.sh "${FASTA}"
bash vsearch_chimera.sh "${FASTA/.fas/_1f_representatives.fas}"
bash stampa.sh "${FASTA}" SSU_V4

5.4 Basic stats

We have 20 samples, 34,421 unique sequences representing 303,512 reads.

cd ${HOME}/neotropical_diversity/data/Roche454_Ciliate_specific/
# Sum the abundances stored in fasta headers (">hash_abundance") and
# count sequences; NR/2 assumes a strictly linearized fasta file
# (one header line + one sequence line per entry)
awk 'BEGIN {FS = "_"}
     {if (/^>/) s+= $2
     } END {
         print s, NR/2
     }' neotropical_soil_20_samples_454_ciliate.fas

5.5 Contingency tables

5.5.1 OTU table

# NOTE(review): "kl" looks like a login alias for the compute cluster — confirm.
kl
cd ${HOME}/neotropical_diversity/data/Roche454_Ciliate_specific/

# Build the OTU contingency table from the swarm (.stats, .swarms),
# stampa (.results) and uchime outputs
FOLDER="${HOME}/neotropical_diversity/src"
SCRIPT="OTU_contingency_table.py"
FASTA="neotropical_soil_20_samples_454_ciliate.fas"  # CHANGE HERE!
STATS="${FASTA/.fas/_1f.stats}"
SWARMS="${FASTA/.fas/_1f.swarms}"
STAMPA="${FASTA/.fas/.results}"
UCHIME="${FASTA/.fas/_1f_representatives.uchime}"
OTU_TABLE="${FASTA/.fas/.OTU.table}"

module load python/latest-2.7

# The per-sample fasta files ([BL]*.fas) provide per-sample abundances
python "${FOLDER}/${SCRIPT}" "${STAMPA}" "${STATS}" "${SWARMS}" "${UCHIME}" ./[BL]*.fas > "${OTU_TABLE}"

5.5.2 OTU table filtering (protists only)

# NOTE(review): "kl" looks like a login alias for the compute cluster — confirm.
kl
cd ${HOME}/neotropical_diversity/data/Roche454_Ciliate_specific/

FOLDER="${HOME}/neotropical_diversity/src"
SCRIPT="OTU_table_cleaner_protists.py"
OTU_TABLE="neotropical_soil_20_samples_454_ciliate.OTU.table"
OTU_FILTERED="${OTU_TABLE/.table/.protists.table}"

module load python/latest-2.7

# 99.5 is the minimum % identity to references used to salvage small
# OTUs (~2 differences for a V4 sequence, see text below)
python "${FOLDER}/${SCRIPT}" "${OTU_TABLE}" 99.5 > "${OTU_FILTERED}"

# Stats: number of OTUs and total reads. Column 23 is presumably the
# "total" column of this 20-sample table — confirm; NR == 1 skips the header
awk '{if (NR == 1) {next} ; c += 1 ; s += $23} END {print c, s}' "${OTU_FILTERED}"

I decided to salvage small OTUs with at least 99.5% identity to references (approx. 2 differences for V4 sequences).

I end up with 1,082 OTUs representing 297,892 reads.

  1. total number of reads from combined La Selva and Barro
  2. total number of OTUs from combined La Selva and Barro

Use R to summarize data

library(dplyr)
library(tidyr)
library(ggplot2)
library(reshape2)

## Load data (protist OTU table)
setwd("~/neotropical_diversity/results/stampa/")
input <- "neotropical_soil_20_samples_454_ciliate.OTU.protists.table"

## Import and format data
d <- read.table(input, sep = "\t", header = TRUE, dec = ".") %>% tbl_df()

## Create forest variables: per-OTU read sums over each forest's
## samples (sample columns start with "B" for Barro Colorado Island,
## "L" for La Selva)
d$Barro <- rowSums(select(d, starts_with("B")))
d$LaSelva <- rowSums(select(d, starts_with("L")))
## d$Tiputini <- rowSums(select(d, starts_with("T")) %>%
##                           select(-taxonomy, -total))

## Summarize data: long format, one row per OTU x forest
d <- select(d, one_of("OTU", "Barro", "LaSelva")) %>%
    gather("forest", "reads", 2:3) %>%
        group_by(forest)

## Number of protist reads per forest
d1 <- summarise(d, sum = sum(reads))
print(d1)

## Number of protist OTUs per forest (OTUs with at least one read)
d2 <- filter(d, reads > 0) %>% count(forest)
print(d2)

quit(save = "no")
forest reads OTUs
Barro 133957 271
LaSelva 163935 874

I end up with 1,082 OTUs representing 297,892 reads.

5.5.3 OTU table filtering (apicomplexans only)

  1. % of total reads in the combined dataset of the two forests that are taxonomically assigned to the Apicomplexans
  2. % of total reads in just La Selva that are taxonomically assigned to the Apicomplexans
  3. % of total reads in just Barro that are taxonomically assigned to the Apicomplexans
  4. % of total OTUs in the combined dataset of the two forests that are taxonomically assigned to the Apicomplexans
  5. % of total OTUs in just La Selva that are taxonomically assigned to the Apicomplexans
  6. % of total OTUs in just Barro that are taxonomically assigned to the Apicomplexans

Extract OTUs assigned to Apicomplexa

cd ~/neotropical_diversity/results/stampa/

PROTISTS="neotropical_soil_20_samples_454_ciliate.OTU.protists.table"
APICOMPLEXA="${PROTISTS/protists/apicomplexa}"

# Keep the header plus every row whose taxonomy mentions Apicomplexa
head -n 1 "${PROTISTS}" > "${APICOMPLEXA}"
grep "Apicomplexa" "${PROTISTS}" >> "${APICOMPLEXA}"

# Apicomplexa reads and OTUs: locate the "total" column by name (nl
# numbers the header fields), then sum it (awk's $TOTAL expands to
# that column); NR - 1 excludes the header line from the OTU count
TOTAL=$(head -n 1 "${APICOMPLEXA}" | tr "\t" "\n" | nl | grep "total" | awk '{print $1}')
awk -v TOTAL=${TOTAL} '{s += $TOTAL} END {print s, NR - 1}' "${APICOMPLEXA}"

Use R to summarize data

library(dplyr)
library(tidyr)
library(ggplot2)
library(reshape2)

## Load the apicomplexan OTU table
setwd("~/neotropical_diversity/results/stampa/")
input <- "neotropical_soil_20_samples_454_ciliate.OTU.apicomplexa.table"

## Read the contingency table
otus <- read.table(input, sep = "\t", header = TRUE, dec = ".") %>% tbl_df()

## Per-forest read sums (sample columns are prefixed with "B" for
## Barro Colorado Island and "L" for La Selva)
otus$Barro <- rowSums(select(otus, starts_with("B")))
otus$LaSelva <- rowSums(select(otus, starts_with("L")))

## Reshape to long format (one row per OTU x forest), grouped by forest
by_forest <- otus %>%
    select(OTU, Barro, LaSelva) %>%
    gather("forest", "reads", Barro, LaSelva) %>%
    group_by(forest)

## Number of apicomplexan reads per forest
d1 <- by_forest %>% summarise(sum = sum(reads))
print(d1)

## Number of apicomplexan OTUs per forest (OTUs with at least one read)
d2 <- by_forest %>% filter(reads > 0) %>% count(forest)
print(d2)

quit(save = "no")
forest protist reads Apicomplexa reads % protist OTUs Apicomplexa OTUs %
Barro 133957 110617 82.58 271 133 49.08
LaSelva 163935 31672 19.32 874 201 23.00

Between the two forests, we have 142,289 reads assigned to Apicomplexa and 305 OTUs.

5.5.4 OTU table filtering (fungi only)

# NOTE(review): "kl" looks like a login alias for the compute cluster — confirm.
kl
cd ${HOME}/neotropical_diversity/data/Roche454_Ciliate_specific/

FOLDER="${HOME}/neotropical_diversity/src"
SCRIPT="OTU_table_cleaner_fungi.py"
OTU_TABLE="neotropical_soil_20_samples_454_ciliate.OTU.table"
OTU_FILTERED="${OTU_TABLE/.table/.fungi.table}"

module load python/latest-2.7

# Same 99.5% identity threshold as the protist filtering above
python "${FOLDER}/${SCRIPT}" "${OTU_TABLE}" 99.5 > "${OTU_FILTERED}"

# Stats: number of OTUs and total reads. Column 23 is presumably the
# "total" column of this 20-sample table — confirm; NR == 1 skips the header
awk '{if (NR == 1) {next} ; c += 1 ; s += $23} END {print c, s}' "${OTU_FILTERED}"

I collect 19 OTUs representing 522 reads.

5.6 Taxonomic profiles per forest (high taxonomic level)

Produce a barplot (or barchart) where samples are grouped by forest and by taxonomic groups.

# Run on the local workstation ("aragorn"): fetch the protist OTU
# table from the cluster ("kl") into the local results folder
cd ${HOME}/neotropical_diversity/results/stampa/

SOURCE="${HOME}/neotropical_diversity/data/Roche454_Ciliate_specific/"
TABLE="neotropical_soil_20_samples_454_ciliate.OTU.protists.table"

scp kl:${SOURCE}${TABLE} .
# Strip "#" characters so R does not treat them as comment markers
# (read.table defaults to comment.char = "#")
sed -i 's/#//g' ${TABLE}
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)    # comma / percent axis formatters used below
library(reshape2)

## Load data (protist OTU table, fetched from the cluster above)
setwd("~/neotropical_diversity/results/stampa/")
input <- "neotropical_soil_20_samples_454_ciliate.OTU.protists.table"

## Multiple plot function
##
## Arrange several ggplot objects on a single page.
##
## ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
## - cols:   Number of columns in layout
## - layout: A matrix specifying the layout. If present, 'cols' is ignored.
## - file:   Unused; kept only for backward compatibility with callers.
##
## If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
## then plot 1 will go in the upper left, 2 will go in the upper right, and
## 3 will go all the way across the bottom.
##
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
    library(grid)

    ## Make a list from the ... arguments and plotlist
    plots <- c(list(...), plotlist)

    numPlots <- length(plots)

    ## If layout is NULL, then use 'cols' to determine layout
    if (is.null(layout)) {
        ## Make the panel
        ## ncol: Number of columns of plots
        ## nrow: Number of rows needed, calculated from # of cols
        layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                         ncol = cols, nrow = ceiling(numPlots/cols))
    }

    if (numPlots == 1) {
        ## A single plot needs no grid machinery
        print(plots[[1]])
    } else {
        ## Set up the page
        grid.newpage()
        pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

        ## Make each plot, in the correct location
        for (i in seq_len(numPlots)) {
            ## Get the i,j matrix positions of the regions that contain this subplot
            matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

            print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                            layout.pos.col = matchidx$col))
        }
    }
}


## Import and format data
d <- read.table(input, sep = "\t", header = TRUE, dec=".") %>% tbl_df()

## For some reason, the command below does not work with mutate.
## Per-OTU read sums for each forest (sample columns start with "B",
## "L" or "T")
d$Barro <- rowSums(select(d, starts_with("B")))
d$LaSelva <- rowSums(select(d, starts_with("L")))
## starts_with("T") also matches the "taxonomy" and "total" columns,
## hence the extra select() to drop them
d$Tiputini <- rowSums(select(d, starts_with("T")) %>%
                      select(-taxonomy, -total))

## Discard all other columns
d <- select(d, one_of("Barro", "Tiputini", "LaSelva", "taxonomy"))

## Extract the third field from the "taxonomy" and store in a new column
## (fields are "|"-separated; field 3 holds values such as "Alveolata_X")
d$clade <- apply(d["taxonomy"], 1 , function(x) strsplit(x, "|", fixed = TRUE)[[1]][3])

d$LaSelva <- as.integer(d$LaSelva)
d$Tiputini <- as.integer(d$Tiputini)
d$Barro <- as.integer(d$Barro)

## Group by clade (sum reads): long format, one row per clade x forest
d2 <- select(d, -taxonomy) %>%
    gather("forest", "abundance", -clade) %>%
        group_by(clade, forest)
d2$abundance <- as.integer(d2$abundance)
d2 <- tally(d2, wt = abundance, sort = FALSE)

## Replace "*" by "Unknown", rename incertae sedis groups, discard "Chimera"
d2 <- d2 %>% filter(clade != "Chimera")
d2$clade[d2$clade == "*"] <- "Unknown"
d2$clade[d2$clade == "Alveolata_X"] <- "Alveolata incertae sedis"
d2$clade[d2$clade == "Amoebozoa_X"] <- "Amoebozoa incertae sedis"
d2$clade[d2$clade == "Stramenopiles_X"] <- "non-Ochrophyta Stramenopiles"

## List clades that have significant abundances (> 0.1% of all reads)
main_taxa <- d2 %>%
    select(-forest) %>%
    group_by(clade) %>%
    tally(wt = n, sort = TRUE) %>%
    mutate(percentage = 100 * n / sum(d2$n)) %>%
    filter(percentage > 0.1) %>%
    select(-n, -percentage)

## All rows in d2 that have a match in main_taxa
d2 <- semi_join(d2, main_taxa, by = "clade")

## Order the legend
taxa_order_reads<- select(d2, clade) %>% distinct()

#------------------------ Absolute barplots -----------------------------------#

## Barcharts (reads): stacked bars of read counts per clade, one bar
## per forest, flipped to horizontal
p1 <- ggplot(d2, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity") +
    scale_y_continuous(labels = comma) +
    ## NB: trailing spaces in the legend title — presumably to widen
    ## the legend box; confirm before removing
    scale_fill_discrete(breaks = taxa_order_reads$clade,
                            name = "clade                         ") +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("number of observed reads") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0)) ##  +
    ## ggtitle("Neotropical Forest Soils: protist communities (175 samples, share > 0.1%)") +
    ## theme(legend.background = element_rect(colour="black", size=.1))
## Barcharts (OTUs)

## Group by clade: count OTUs instead of summing reads (keep only
## clade x forest rows with at least one read, then tally rows)
d3 <- select(d, -taxonomy) %>%
    gather("forest", "abundance", -clade) %>%
        group_by(clade, forest) %>%
            filter(abundance != "0") %>%
                tally(sort= TRUE)

## Replace "*" by "Unknown", rename incertae sedis groups, discard "Chimera"
d3 <- d3 %>% filter(clade != "Chimera")
d3$clade[d3$clade == "*"] <- "Unknown"
d3$clade[d3$clade == "Alveolata_X"] <- "Alveolata incertae sedis"
d3$clade[d3$clade == "Amoebozoa_X"] <- "Amoebozoa incertae sedis"
d3$clade[d3$clade == "Stramenopiles_X"] <- "non-Ochrophyta Stramenopiles"

## List clades that have significant abundances (> 0.1% of all OTUs)
main_taxa <- d3 %>%
    select(-forest) %>%
    group_by(clade) %>%
    tally(sort = TRUE) %>%
    mutate(percentage = 100 * n / sum(d3$n)) %>%
    filter(percentage > 0.1) %>%
    select(-n, -percentage)

## All rows in d3 that have a match in main_taxa (at the same time,
## it sorts d3 by decreasing number of OTUs)
d3 <- semi_join(d3, main_taxa, by = "clade")

## Order the legend
taxa_order_OTUs <- select(d3, clade) %>% distinct()

## Barcharts
p2 <- ggplot(d3, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity") +
    scale_y_continuous(labels = comma) +
    scale_fill_discrete(breaks = taxa_order_OTUs$clade) +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro", "Tiputini")),
                     breaks = c("LaSelva", "Barro", "Tiputini"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)",
                         "Ecuador\n(Tiputini)")) +
    ylab("number of observed OTUs") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Output to PDF (multiplot: reads plot on top, OTUs plot below)
output <- gsub(".table", "_group_by_forests_absolute.pdf", input, fixed = TRUE)
pdf(file = output, width = 11 , height = 10)
multiplot(p1, p2)
dev.off()


#-------------------------- Percentage barplots -------------------------------#

## Barcharts (reads): position = "fill" turns stacked counts into
## within-forest proportions (Tiputini is omitted from these plots)
p1 <- ggplot(d2, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity", position = "fill") +
    scale_y_continuous(labels = percent_format()) +
    ## NB: trailing spaces in the legend title — presumably to widen
    ## the legend box; confirm before removing
    scale_fill_discrete(breaks = taxa_order_reads$clade,
                            name = "clade                         ") +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro")),
                     breaks = c("LaSelva", "Barro"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)")) +
    ylab("percentage of observed reads") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Barcharts (OTUs)
p2 <- ggplot(d3, aes(x = forest, y = n, fill = clade)) +
    geom_bar(stat = "identity", position = "fill") +
    scale_y_continuous(labels = percent_format()) +
    scale_fill_discrete(breaks = taxa_order_OTUs$clade) +
    scale_x_discrete(limits = rev(c("LaSelva", "Barro")),
                     breaks = c("LaSelva", "Barro"),
                     labels = c("Costa Rica\n(La Selva)",
                         "Panama\n(Barro\nColorado)")) +
    ylab("percentage of observed OTUs") +
    xlab("forests") +
    coord_flip() +
    theme_bw() +
    theme(axis.text  = element_text(size = 11),
          axis.title.x = element_text(vjust = 0))

## Output to PDF (multiplot: reads plot on top, OTUs plot below)
output <- gsub(".table", "_group_by_forests_relative.pdf", input, fixed = TRUE)
pdf(file = output, width = 11 , height = 7.25)
multiplot(p1, p2)
dev.off()

quit(save="no")

5.7 Hyperdominance assessment

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)

setwd("~/neotropical_diversity/results/stampa/")
input <- "neotropical_soil_20_samples_454_ciliate.OTU.protists.table"

## Load data
d <- read.table(input, sep = "\t", header = TRUE)

## Clean and compute the cumulative sum of reads over ranked OTUs
## (assumes the table is sorted by decreasing OTU abundance — confirm)
all_reads <- sum(d$total)
ranks <- length(d$total)
## NB: inside mutate(), d$total deliberately references the original
## data frame (the right-hand side is evaluated before d is reassigned)
d <- select(d, -matches("[TBL][0-9]"), -amplicon, -chimera, -identity, -taxonomy, -references) %>%
    mutate(cumulative = cumsum(d$total) / all_reads) %>%
        mutate(rank = seq(1, ranks)) %>%
            select(-OTU, -total) %>%
                slice(1:200)

glimpse(d)

## Plot: cumulative share of reads captured by the 200 top-ranked OTUs
ggplot(d, aes(x = rank, y = cumulative)) +
    geom_line(colour = "darkred", size = 1) +
        scale_y_continuous(labels = percent, limits = c(0, 1)) +
            scale_x_continuous() +
                xlab("Number of OTUs") +
                    ylab("percentage of observations")
                        ## annotate("text", x = 0.50, y = y_max * 0.9, hjust = 0, colour = "grey", size = 8, label = TITLE)

## Output to PDF (ggsave writes the last plot displayed)
output <- gsub(".table", ".hyperdominance.pdf", input, fixed = TRUE)
ggsave(file = output, width = 8 , height = 5)

quit(save="no")

Make a table

cd ~/neotropical_diversity/results/stampa/

TABLE="neotropical_soil_20_samples_454_ciliate.OTU.protists.table"
MAX=50

# Return the 1-based index of the first header field matching $1
# (nl numbers the tab-separated header fields, one per line)
function get_column_number() {
    head -n 1 "${TABLE}" | tr "\t" "\n" | nl -n ln | grep "${1}" | cut -d " " -f 1
}

TOTAL=$(get_column_number total)
IDENTITY=$(get_column_number identity)
TAXONOMY=$(get_column_number taxonomy)
# Grand total of reads (the header contributes 0 to the sum)
GRAND_TOTAL=$(awk -v COLUMN=${TOTAL} 'BEGIN {FS = "\t"} {s += $COLUMN} END {print s}' "${TABLE}")

# Keep the MAX most abundant OTUs (plus the header) and add per-OTU
# and cumulative percentages. On the header line both percentages
# compute to 0, so the final sed rewrites "total<TAB>0<TAB>0" into the
# intended header "total<TAB>%<TAB>cum_%"
cut -f 1,${TOTAL},${IDENTITY},${TAXONOMY} "${TABLE}" | \
    head -n $(( ${MAX} + 1 )) | \
    awk -v GRAND_TOTAL=${GRAND_TOTAL} \
        'BEGIN {FS = OFS = "\t"}
         {perc = 100 * $2 / GRAND_TOTAL
          cum_perc += perc
          print $1, $2, perc, cum_perc, $3, $4}' | \
    sed 's/total\t0\t0/total\t%\tcum_%/' > "${TABLE/.table/.hyperdominance.csv}"

Date: [2017-01-15 dim.]

Author: Frédéric Mahé

Created: 2017-01-15 dim. 16:45

Emacs 24.4.1 (Org mode 8.2.10)

Validate