From 96f5e9a4d77fa5e0d44424a89e97f477caf14ffc Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 11 Jul 2024 14:49:49 +0200
Subject: [PATCH 001/310] Update Snakefile

---
 Snakefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Snakefile b/Snakefile
index 71c2bb3..e23b74c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -119,10 +119,10 @@ rule ragtag_scaffolding:
             -o {output.fa} 2>&1 | \
             tee {log.cmd}
 
-        #if [[ -z $(zgrep '[^[:space:]]' {output.fa}) ]] ; then
-        #    echo "Error : Empty final fasta"
-        #    exit 1
-        #fi
+        if [[ -z $(grep '[^[:space:]]' {output.fa}) ]] ; then
+            echo "Error : Empty final fasta"
+            exit 1
+        fi
         """
 
 rule quast_stats:
-- 
GitLab


From de6e96c37a3fa07514e09b52d38d0957ef6f093f Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Fri, 12 Jul 2024 14:43:27 +0200
Subject: [PATCH 002/310] Added mm2_params for RagTag

---
 Snakefile                | 4 +++-
 config.yaml              | 4 ++++
 example/config_CICD.yaml | 4 ++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index e23b74c..f97c38c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -103,7 +103,8 @@ rule ragtag_scaffolding:
     retries: 1
     priority: 100
     params:
-        app_path=config["app.path"]
+        app_path=config["app.path"],
+        mm2_config=config["ragtag_mm2_conf"]
     log: 
         cmd="logs/ragtag/{haplotype}.ragtag.cmd.log",
         time="logs/ragtag/{haplotype}.ragtag.time.log"
@@ -116,6 +117,7 @@ rule ragtag_scaffolding:
             -t {threads} \
             -r {input.ref} \
             -q {input.fa} \
+            --mm2-params {params.mm2_config} \
             -o {output.fa} 2>&1 | \
             tee {log.cmd}
 
diff --git a/config.yaml b/config.yaml
index 0a8502a..1367008 100644
--- a/config.yaml
+++ b/config.yaml
@@ -7,6 +7,10 @@ reference: '<reference_name>'
 app.path: '<path>'
 
 # Core parameters
+# RagTag parameters
+ragtag_mm2_conf: '-x asm5'
+## Add -f 0.0002 for large genomes
+
 # Wfmash alignement parameters :
 wfmash.segment_length: 10000
 wfmash.mapping_id: 95
diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml
index cc0c279..dfd0ad2 100644
--- a/example/config_CICD.yaml
+++ b/example/config_CICD.yaml
@@ -7,6 +7,10 @@ reference: 'R64.hap1.fa.gz'
 app.path: 'appimgs/'
 
 # Core parameters
+# RagTag parameters
+ragtag_mm2_conf: '-x asm5'
+## Add -f 0.0002 for large genomes
+
 # Wfmash alignement parameters :
 wfmash.segment_length: 5000
 wfmash.mapping_id: 90
-- 
GitLab


From c7cc3e0c529cd57ff5fd7b90d19e62a57a92d8ac Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Fri, 12 Jul 2024 15:55:04 +0200
Subject: [PATCH 003/310] Added chrInputs SyRI figures to the final report when generated

---
 Snakefile | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/Snakefile b/Snakefile
index f97c38c..7860363 100644
--- a/Snakefile
+++ b/Snakefile
@@ -739,6 +739,12 @@ def get_report_sections(wildcards):
             haplotype=SAMPLES_NOREF
             )
 
+    if config["get_chrInputs_SyRI"] == "True":
+        sections["SyRI_on_chrInputs_figs"] = expand(
+            "output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", 
+            chromosome=CHRLIST
+            )
+
     return sections      
 
 rule create_pan1c_report:
@@ -750,7 +756,8 @@ rule create_pan1c_report:
     threads: 4
     params:
         app_path=config['app.path'],
-        add_SyRI=config['get_ASMs_SyRI']
+        add_ASMs_SyRI=config['get_ASMs_SyRI'],
+        add_chrInputs_SyRI=config['get_chrInputs_SyRI']
     run:
         shell("touch {output.report}")
 
@@ -780,21 +787,28 @@ rule create_pan1c_report:
         shell("echo '' >> {output.report}")
 
         # Adding chromosomes figures
-        fig_list = [fig for fig in input.odgifigs]
-        print(fig_list)
-        fig_list.sort()
+        odgi_figs_list = [fig for fig in input.odgifigs]
+        odgi_figs_list.sort()
+
+        if params.add_chrInputs_SyRI:
+            syri_figs_dict = {
+                os.path.basename(fig).split('.')[1]:os.path.basename(fig) 
+                for fig in input.SyRI_on_chrInputs_figs
+                }
         
         shell("echo '# Chromosome-scale odgi graphs' >> {output.report}")
-        for fig in fig_list:
-            basename=os.path.basename(fig)
+        for i in range(len(odgi_figs_list)):
+            odgi_basename=os.path.basename(odgi_figs_list[i])
             chr_name=basename.split('.')[1]
             
             shell("echo '## {chr_name}' >> {output.report}")
             shell("echo '![{basename}](./report/{basename})' >> {output.report}")
+            if params.add_chrInputs_SyRI:
+                shell("echo '![{syri_figs_dict[chr_name]}](./chrInput.syri.figs/{syri_figs_dict[chr_name]}})' >> {output.report}")
         shell("echo '' >> {output.report}")
 
         # Adding SyRI figs if produced
-        if params.add_SyRI:
+        if params.add_ASMs_SyRI:
 
             fig_list = [fig for fig in input.SyRI_on_ASMs_figs]
             fig_list.sort()
-- 
GitLab


From 90e3fcadbf24c9754d2173eff075696a0390b678 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 15 Jul 2024 09:12:30 +0200
Subject: [PATCH 004/310] Updated RagTag to take mm2 parameters

---
 Snakefile                   | 2 +-
 scripts/ragtagChromInfer.sh | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/Snakefile b/Snakefile
index 7860363..48318c5 100644
--- a/Snakefile
+++ b/Snakefile
@@ -117,7 +117,7 @@ rule ragtag_scaffolding:
             -t {threads} \
             -r {input.ref} \
             -q {input.fa} \
-            --mm2-params {params.mm2_config} \
+            -m {params.mm2_config} \
             -o {output.fa} 2>&1 | \
             tee {log.cmd}
 
diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh
index b8dc8ea..5db4a89 100755
--- a/scripts/ragtagChromInfer.sh
+++ b/scripts/ragtagChromInfer.sh
@@ -8,9 +8,10 @@ threads=""      # Threads count
 inputref=""     # Reference fasta
 inputquery=""   # Query fasta
 output=""       # Output fasta
+mm2params=""    # Minimap2 parameters
 
 ## Getting arguments
-while getopts "d:a:t:r:q:c:o:" option; do
+while getopts "d:a:t:r:q:c:o:m:" option; do
     case "$option" in
         d) tmpdir="$OPTARG";;
         a) appdir="$OPTARG";;
@@ -19,7 +20,8 @@ while getopts "d:a:t:r:q:c:o:" option; do
         q) inputquery="$OPTARG";;
         c) rtcommand="$OPTARG";;
         o) output="$OPTARG";;
-        \?) echo "Usage: $0 [-d tmpdir] [-a apptainer dir] [-t threads] [-r inputref] [-q inputquery] [-o output fasta] [-n pangenome name]" >&2
+        m) mm2params="$OPTARG";;
+        \?) echo "Usage: $0 [-d tmpdir] [-a apptainer dir] [-t threads] [-r inputref] [-q inputquery] [-o output fasta] [-n pangenome name] [-m mm2-params]" >&2
             exit 1;;
     esac
 done
@@ -34,7 +36,7 @@ mkdir -p $tmpdir
 
 # Running ragtag scaffold
 apptainer run $appdir/pan1c-env.sif ragtag.py scaffold \
-    -t $threads -o $tmpdir $inputref $inputquery
+    -t $threads --mm2-params $mm2params -o $tmpdir $inputref $inputquery
 
 # Renaming sequence according to naming scheme
 grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \
-- 
GitLab


From 10e2f77305045ff8ed69f116196be0a47b642601 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 15 Jul 2024 09:24:41 +0200
Subject: [PATCH 005/310] Update ragtagChromInfer.sh

---
 scripts/ragtagChromInfer.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh
index 5db4a89..293f187 100755
--- a/scripts/ragtagChromInfer.sh
+++ b/scripts/ragtagChromInfer.sh
@@ -36,7 +36,7 @@ mkdir -p $tmpdir
 
 # Running ragtag scaffold
 apptainer run $appdir/pan1c-env.sif ragtag.py scaffold \
-    -t $threads --mm2-params $mm2params -o $tmpdir $inputref $inputquery
+    -t $threads --mm2-params "$mm2params" -o $tmpdir $inputref $inputquery
 
 # Renaming sequence according to naming scheme
 grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \
-- 
GitLab


From a3959ec26fa6a7c9fa4183e2d1cf49b20120a3d7 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 15 Jul 2024 09:36:18 +0200
Subject: [PATCH 006/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 48318c5..938b10b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -117,7 +117,7 @@ rule ragtag_scaffolding:
             -t {threads} \
             -r {input.ref} \
             -q {input.fa} \
-            -m {params.mm2_config} \
+            -m "{params.mm2_config}" \
             -o {output.fa} 2>&1 | \
             tee {log.cmd}
 
-- 
GitLab


From 9ebec505fd12ecbd9b58574e6ce9bf7cf646deba Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 15 Jul 2024 09:47:52 +0200
Subject: [PATCH 007/310] Update config.yaml

---
 scripts/ragtagChromInfer.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh
index 293f187..21d14cb 100755
--- a/scripts/ragtagChromInfer.sh
+++ b/scripts/ragtagChromInfer.sh
@@ -36,7 +36,7 @@ mkdir -p $tmpdir
 
 # Running ragtag scaffold
 apptainer run $appdir/pan1c-env.sif ragtag.py scaffold \
-    -t $threads --mm2-params "$mm2params" -o $tmpdir $inputref $inputquery
+    --mm2-params "$mm2params -t $threads" -o $tmpdir $inputref $inputquery
 
 # Renaming sequence according to naming scheme
 grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \
-- 
GitLab
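
Patches 002 through 007 all deal with getting the minimap2 options from the new
ragtag_mm2_conf entry down to ragtag.py as a single argument, with the thread count
now riding inside that option string. A minimal sketch of the resulting call,
assuming the example value '-x asm5' and the appimgs/ image directory from the
CI/CD config (file names and the working directory are placeholders):

    mm2params='-x asm5'   # ragtag_mm2_conf; the config suggests adding '-f 0.0002' for large genomes
    threads=8
    tmpdir=ragtag_wrkdir
    # Quoting keeps the option string as a single value for --mm2-params; unquoted,
    # each token would be consumed as a separate ragtag.py argument.
    apptainer run appimgs/pan1c-env.sif ragtag.py scaffold \
        --mm2-params "$mm2params -t $threads" \
        -o "$tmpdir" reference.fa query.fa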


From dee5c68d0b35e9f3b9c40c25e8135ed01a02b575 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 16 Jul 2024 09:51:51 +0200
Subject: [PATCH 008/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 938b10b..3b0e964 100644
--- a/Snakefile
+++ b/Snakefile
@@ -310,7 +310,7 @@ rule SyRI_on_chrInput:
             -o $(basename {output.fig}) \
             -r {output.wrkdir}/"${{refname}}.fa.gz" \
             -q "${{AllAsmList[*]}}" \
-            -h 9 -w 16
+            -h 10 -w 20
         mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
         rm {output.wrkdir}/*.fa
         """
-- 
GitLab


From 1b64b7fc2202e0130a11f7ea0f16df92a9052238 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 16 Jul 2024 10:14:02 +0200
Subject: [PATCH 009/310] Update Snakefile

---
 Snakefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Snakefile b/Snakefile
index 3b0e964..484b1da 100644
--- a/Snakefile
+++ b/Snakefile
@@ -790,7 +790,7 @@ rule create_pan1c_report:
         odgi_figs_list = [fig for fig in input.odgifigs]
         odgi_figs_list.sort()
 
-        if params.add_chrInputs_SyRI:
+        if params.add_chrInputs_SyRI == "True":
             syri_figs_dict = {
                 os.path.basename(fig).split('.')[1]:os.path.basename(fig) 
                 for fig in input.SyRI_on_chrInputs_figs
@@ -799,16 +799,16 @@ rule create_pan1c_report:
         shell("echo '# Chromosome-scale odgi graphs' >> {output.report}")
         for i in range(len(odgi_figs_list)):
             odgi_basename=os.path.basename(odgi_figs_list[i])
-            chr_name=basename.split('.')[1]
+            chr_name=odgi_basename.split('.')[1]
             
             shell("echo '## {chr_name}' >> {output.report}")
             shell("echo '![{basename}](./report/{basename})' >> {output.report}")
-            if params.add_chrInputs_SyRI:
+            if params.add_chrInputs_SyRI == "True":
                 shell("echo '![{syri_figs_dict[chr_name]}](./chrInput.syri.figs/{syri_figs_dict[chr_name]}})' >> {output.report}")
         shell("echo '' >> {output.report}")
 
         # Adding SyRI figs if produced
-        if params.add_ASMs_SyRI:
+        if params.add_ASMs_SyRI == "True":
 
             fig_list = [fig for fig in input.SyRI_on_ASMs_figs]
             fig_list.sort()
-- 
GitLab


From 7bab69de4270dcf8b79efa099e5f40abc27adb1d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 16 Jul 2024 10:15:52 +0200
Subject: [PATCH 010/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 484b1da..fd3403c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -802,7 +802,7 @@ rule create_pan1c_report:
             chr_name=odgi_basename.split('.')[1]
             
             shell("echo '## {chr_name}' >> {output.report}")
-            shell("echo '![{basename}](./report/{basename})' >> {output.report}")
+            shell("echo '![{odgi_basename}](./report/{odgi_basename})' >> {output.report}")
             if params.add_chrInputs_SyRI == "True":
                 shell("echo '![{syri_figs_dict[chr_name]}](./chrInput.syri.figs/{syri_figs_dict[chr_name]}})' >> {output.report}")
         shell("echo '' >> {output.report}")
-- 
GitLab


From d645fc1e30edcb04b4f79fe990a2c52e675bc86f Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 16 Jul 2024 11:00:00 +0200
Subject: [PATCH 011/310] Update Snakefile

---
 Snakefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index fd3403c..3659534 100644
--- a/Snakefile
+++ b/Snakefile
@@ -804,7 +804,8 @@ rule create_pan1c_report:
             shell("echo '## {chr_name}' >> {output.report}")
             shell("echo '![{odgi_basename}](./report/{odgi_basename})' >> {output.report}")
             if params.add_chrInputs_SyRI == "True":
-                shell("echo '![{syri_figs_dict[chr_name]}](./chrInput.syri.figs/{syri_figs_dict[chr_name]}})' >> {output.report}")
+                syri_fig = syri_figs_dict[chr_name]
+                shell("echo '![{syri_fig}](./chrInput.syri.figs/{syri_fig})' >> {output.report}")
         shell("echo '' >> {output.report}")
 
         # Adding SyRI figs if produced
-- 
GitLab


From d3538475dc4162d9793752d93d0c4875942f0aa3 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 16 Jul 2024 14:51:36 +0200
Subject: [PATCH 012/310] Updated SyRI script

The script now takes the font size and the spacing between homologous chromosomes as input.
---
 Snakefile              |  2 +-
 scripts/getSyriFigs.sh | 10 +++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/Snakefile b/Snakefile
index 3659534..b2e421f 100644
--- a/Snakefile
+++ b/Snakefile
@@ -310,7 +310,7 @@ rule SyRI_on_chrInput:
             -o $(basename {output.fig}) \
             -r {output.wrkdir}/"${{refname}}.fa.gz" \
             -q "${{AllAsmList[*]}}" \
-            -h 10 -w 20
+            -h 10 -w 20 -s "0.7" -f 10
         mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
         rm {output.wrkdir}/*.fa
         """
diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh
index 84f93c2..273b30d 100755
--- a/scripts/getSyriFigs.sh
+++ b/scripts/getSyriFigs.sh
@@ -11,9 +11,11 @@ wrkdir=""       # Working directory (directory used by pggb to store step files
 output=""       # Output Syri figure(s)
 height=16       # Figure height
 width=9         # Figure width
+fontsize=12     
+space="0.7"     # Space for homologous chromosomes 
 
 ## Getting arguments
-while getopts "r:q:a:t:d:o:h:w:" option; do
+while getopts "r:q:a:t:d:o:h:w:f:s:" option; do
     case "$option" in
         r) ref="$OPTARG";;
         q) qry="$OPTARG";;
@@ -23,7 +25,9 @@ while getopts "r:q:a:t:d:o:h:w:" option; do
         o) output="$OPTARG";;
         h) height="$OPTARG";;
         w) width="$OPTARG";;
-        \?) echo "Usage: $0 [-r ref] [-q query] [-a appdir] [-t threads] [-d wrkdir] [-o output] [-h height] [-w width]" >&2
+        f) fontsize="$OPTARG";;
+        s) space="$OPTARG";;
+        \?) echo "Usage: $0 [-r ref] [-q query] [-a appdir] [-t threads] [-d wrkdir] [-o output] [-h height] [-w width] [-f fontsize] [-s space]" >&2
             exit 1;;
     esac
 done
@@ -102,7 +106,7 @@ for asm in "${asmList[@]}"; do
 done
 
 # Generating the plotsr command
-command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f 12 -H $height -W $width -d 600 "
+command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f $fontsize -S $space -H $height -W $width -d 600 "
 
 # Adding syri files to the command as each needs to be specified using "--sr" argument 
 for file in "${syriFileList[@]}"; do
-- 
GitLab
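
Patch 012 threads the font size (-f) and the spacing between homologous chromosomes
(-s, mapped to plotsr's -S) through getSyriFigs.sh. With the values set in rule
SyRI_on_chrInput, the assembled plotsr arguments look roughly as follows (the working
directory, figure name and syri files are placeholders; the actual invocation is
whatever the script already uses):

    plotsr --genomes syri_wrkdir/genomes.txt \
        --sr syri_wrkdir/hap1.syri.out --sr syri_wrkdir/hap2.syri.out \
        -o syri_wrkdir/example.syri.png \
        -f 10 -S 0.7 -H 10 -W 20 -d 600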


From 0cf7af815784699fe4f3155b9d886915c35b6030 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 16 Jul 2024 15:17:26 +0200
Subject: [PATCH 013/310] Report in HTML

With GitHub-flavoured CSS
---
 Snakefile               |  15 +-
 src/README.md           |   2 +
 src/github-markdown.css | 938 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 952 insertions(+), 3 deletions(-)
 create mode 100644 src/README.md
 create mode 100644 src/github-markdown.css

diff --git a/Snakefile b/Snakefile
index b2e421f..3892169 100644
--- a/Snakefile
+++ b/Snakefile
@@ -752,7 +752,8 @@ rule create_pan1c_report:
     input:
         unpack(get_report_sections)
     output:
-        report="output/pan1c."+config['name']+".report.md"
+        report="output/pan1c."+config['name']+".report.md",
+        html="output/pan1c."+config['name']+".report.html"
     threads: 4
     params:
         app_path=config['app.path'],
@@ -763,7 +764,12 @@ rule create_pan1c_report:
 
         # Adding Summary (made for importing in Joplin)
         shell("echo '# Summary' >> {output.report}")
-        shell("echo '[TOC]' >> {output.report}")
+        shell("echo '- [Graph metadata](#graph-metadata)' >> {output.report}")
+        shell("echo '- [General stats](#general-stats)' >> {output.report}")
+        shell("echo '- [Path stats](#path-stats)' >> {output.report}")
+        shell("echo '- [Chromosome-scale odgi graphs](#chromosome-scale-odgi-graphs)' >> {output.report}")
+        if params.add_ASMs_SyRI == "True":
+            shell("echo '- [SyRI on input assemblies](#syri-on-input-assemblies)' >> {output.report}")
         shell("echo '' >> {output.report}")
         
         # Adding graph construction info
@@ -822,4 +828,7 @@ rule create_pan1c_report:
                 shell("echo '## {hap_name[0]}, {hap_name[1]}' >> {output.report}")
                 shell("echo '![{basename}](./asm.syri.figs/{basename})' >> {output.report}")
 
-            shell("echo '' >> {output.report}")
\ No newline at end of file
+            shell("echo '' >> {output.report}")
+
+        # Converting to HTML
+        shell("pandoc --standalone -c src/github-markdown.css -f gfm -t html {output.report} > {output.html}")
\ No newline at end of file
diff --git a/src/README.md b/src/README.md
new file mode 100644
index 0000000..0164645
--- /dev/null
+++ b/src/README.md
@@ -0,0 +1,2 @@
+# Github flavoured markdown CSS
+From https://gist.github.com/fergiemcdowall/9ecbea41f67465b5cfcd3560508eb100#file-github-markdown-css
\ No newline at end of file
diff --git a/src/github-markdown.css b/src/github-markdown.css
new file mode 100644
index 0000000..1c5d981
--- /dev/null
+++ b/src/github-markdown.css
@@ -0,0 +1,938 @@
+@font-face {
+  font-family: octicons-link;
+  src: url(data:font/woff;charset=utf-8;base64,d09GRgABAAAAAAZwABAAAAAACFQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABEU0lHAAAGaAAAAAgAAAAIAAAAAUdTVUIAAAZcAAAACgAAAAoAAQAAT1MvMgAAAyQAAABJAAAAYFYEU3RjbWFwAAADcAAAAEUAAACAAJThvmN2dCAAAATkAAAABAAAAAQAAAAAZnBnbQAAA7gAAACyAAABCUM+8IhnYXNwAAAGTAAAABAAAAAQABoAI2dseWYAAAFsAAABPAAAAZwcEq9taGVhZAAAAsgAAAA0AAAANgh4a91oaGVhAAADCAAAABoAAAAkCA8DRGhtdHgAAAL8AAAADAAAAAwGAACfbG9jYQAAAsAAAAAIAAAACABiATBtYXhwAAACqAAAABgAAAAgAA8ASm5hbWUAAAToAAABQgAAAlXu73sOcG9zdAAABiwAAAAeAAAAME3QpOBwcmVwAAAEbAAAAHYAAAB/aFGpk3jaTY6xa8JAGMW/O62BDi0tJLYQincXEypYIiGJjSgHniQ6umTsUEyLm5BV6NDBP8Tpts6F0v+k/0an2i+itHDw3v2+9+DBKTzsJNnWJNTgHEy4BgG3EMI9DCEDOGEXzDADU5hBKMIgNPZqoD3SilVaXZCER3/I7AtxEJLtzzuZfI+VVkprxTlXShWKb3TBecG11rwoNlmmn1P2WYcJczl32etSpKnziC7lQyWe1smVPy/Lt7Kc+0vWY/gAgIIEqAN9we0pwKXreiMasxvabDQMM4riO+qxM2ogwDGOZTXxwxDiycQIcoYFBLj5K3EIaSctAq2kTYiw+ymhce7vwM9jSqO8JyVd5RH9gyTt2+J/yUmYlIR0s04n6+7Vm1ozezUeLEaUjhaDSuXHwVRgvLJn1tQ7xiuVv/ocTRF42mNgZGBgYGbwZOBiAAFGJBIMAAizAFoAAABiAGIAznjaY2BkYGAA4in8zwXi+W2+MjCzMIDApSwvXzC97Z4Ig8N/BxYGZgcgl52BCSQKAA3jCV8CAABfAAAAAAQAAEB42mNgZGBg4f3vACQZQABIMjKgAmYAKEgBXgAAeNpjYGY6wTiBgZWBg2kmUxoDA4MPhGZMYzBi1AHygVLYQUCaawqDA4PChxhmh/8ODDEsvAwHgMKMIDnGL0x7gJQCAwMAJd4MFwAAAHjaY2BgYGaA4DAGRgYQkAHyGMF8NgYrIM3JIAGVYYDT+AEjAwuDFpBmA9KMDEwMCh9i/v8H8sH0/4dQc1iAmAkALaUKLgAAAHjaTY9LDsIgEIbtgqHUPpDi3gPoBVyRTmTddOmqTXThEXqrob2gQ1FjwpDvfwCBdmdXC5AVKFu3e5MfNFJ29KTQT48Ob9/lqYwOGZxeUelN2U2R6+cArgtCJpauW7UQBqnFkUsjAY/kOU1cP+DAgvxwn1chZDwUbd6CFimGXwzwF6tPbFIcjEl+vvmM/byA48e6tWrKArm4ZJlCbdsrxksL1AwWn/yBSJKpYbq8AXaaTb8AAHja28jAwOC00ZrBeQNDQOWO//sdBBgYGRiYWYAEELEwMTE4uzo5Zzo5b2BxdnFOcALxNjA6b2ByTswC8jYwg0VlNuoCTWAMqNzMzsoK1rEhNqByEyerg5PMJlYuVueETKcd/89uBpnpvIEVomeHLoMsAAe1Id4AAAAAAAB42oWQT07CQBTGv0JBhagk7HQzKxca2sJCE1hDt4QF+9JOS0nbaaYDCQfwCJ7Au3AHj+LO13FMmm6cl7785vven0kBjHCBhfpYuNa5Ph1c0e2Xu3jEvWG7UdPDLZ4N92nOm+EBXuAbHmIMSRMs+4aUEd4Nd3CHD8NdvOLTsA2GL8M9PODbcL+hD7C1xoaHeLJSEao0FEW14ckxC+TU8TxvsY6X0eLPmRhry2WVioLpkrbp84LLQPGI7c6sOiUzpWIWS5GzlSgUzzLBSikOPFTOXqly7rqx0Z1Q5BAIoZBSFihQYQOOBEdkCOgXTOHA07HAGjGWiIjaPZNW13/+lm6S9FT7rLHFJ6fQbkATOG1j2OFMucKJJsxIVfQORl+9Jyda6Sl1dUYhSCm1dyClfoeDve4qMYdLEbfqHf3O/AdDumsjAAB42mNgYoAAZQYjBmyAGYQZmdhL8zLdDEydARfoAqIAAAABAAMABwAKABMAB///AA8AAQAAAAAAAAAAAAAAAAABAAAAAA==) format('woff');
+}
+
+.octicon {
+  display: inline-block;
+  fill: currentColor;
+  vertical-align: text-bottom;
+}
+
+.anchor {
+  float: left;
+  line-height: 1;
+  margin-left: -20px;
+  padding-right: 4px;
+}
+
+.anchor:focus {
+  outline: none;
+}
+
+h1 .octicon-link,
+h2 .octicon-link,
+h3 .octicon-link,
+h4 .octicon-link,
+h5 .octicon-link,
+h6 .octicon-link {
+  color: #1b1f23;
+  vertical-align: middle;
+  visibility: hidden;
+}
+
+h1:hover .anchor,
+h2:hover .anchor,
+h3:hover .anchor,
+h4:hover .anchor,
+h5:hover .anchor,
+h6:hover .anchor {
+  text-decoration: none;
+}
+
+h1:hover .anchor .octicon-link,
+h2:hover .anchor .octicon-link,
+h3:hover .anchor .octicon-link,
+h4:hover .anchor .octicon-link,
+h5:hover .anchor .octicon-link,
+h6:hover .anchor .octicon-link {
+  visibility: visible;
+}
+
+body {
+  -ms-text-size-adjust: 100%;
+  -webkit-text-size-adjust: 100%;
+  color: #24292e;
+  line-height: 1.5;
+  font-family: -apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol;
+  font-size: 16px;
+  line-height: 1.5;
+  word-wrap: break-word;
+}
+
+.pl-c {
+  color: #6a737d;
+}
+
+.pl-c1,
+.pl-s .pl-v {
+  color: #005cc5;
+}
+
+.pl-e,
+.pl-en {
+  color: #6f42c1;
+}
+
+.pl-s .pl-s1,
+.pl-smi {
+  color: #24292e;
+}
+
+.pl-ent {
+  color: #22863a;
+}
+
+.pl-k {
+  color: #d73a49;
+}
+
+.pl-pds,
+.pl-s,
+.pl-s .pl-pse .pl-s1,
+.pl-sr,
+.pl-sr .pl-cce,
+.pl-sr .pl-sra,
+.pl-sr .pl-sre {
+  color: #032f62;
+}
+
+.pl-smw,
+.pl-v {
+  color: #e36209;
+}
+
+.pl-bu {
+  color: #b31d28;
+}
+
+.pl-ii {
+  background-color: #b31d28;
+  color: #fafbfc;
+}
+
+.pl-c2 {
+  background-color: #d73a49;
+  color: #fafbfc;
+}
+
+.pl-c2:before {
+  content: "^M";
+}
+
+.pl-sr .pl-cce {
+  color: #22863a;
+  font-weight: 700;
+}
+
+.pl-ml {
+  color: #735c0f;
+}
+
+.pl-mh,
+.pl-mh .pl-en,
+.pl-ms {
+  color: #005cc5;
+  font-weight: 700;
+}
+
+.pl-mi {
+  color: #24292e;
+  font-style: italic;
+}
+
+.pl-mb {
+  color: #24292e;
+  font-weight: 700;
+}
+
+.pl-md {
+  background-color: #ffeef0;
+  color: #b31d28;
+}
+
+.pl-mi1 {
+  background-color: #f0fff4;
+  color: #22863a;
+}
+
+.pl-mc {
+  background-color: #ffebda;
+  color: #e36209;
+}
+
+.pl-mi2 {
+  background-color: #005cc5;
+  color: #f6f8fa;
+}
+
+.pl-mdr {
+  color: #6f42c1;
+  font-weight: 700;
+}
+
+.pl-ba {
+  color: #586069;
+}
+
+.pl-sg {
+  color: #959da5;
+}
+
+.pl-corl {
+  color: #032f62;
+  text-decoration: underline;
+}
+
+details {
+  display: block;
+}
+
+summary {
+  display: list-item;
+}
+
+a {
+  background-color: transparent;
+}
+
+a:active,
+a:hover {
+  outline-width: 0;
+}
+
+strong {
+  font-weight: inherit;
+  font-weight: bolder;
+}
+
+h1 {
+  font-size: 2em;
+  margin: .67em 0;
+}
+
+img {
+  border-style: none;
+}
+
+code,
+kbd,
+pre {
+  font-family: monospace,monospace;
+  font-size: 1em;
+}
+
+hr {
+  box-sizing: content-box;
+  height: 0;
+  overflow: visible;
+}
+
+input {
+  font: inherit;
+  margin: 0;
+}
+
+input {
+  overflow: visible;
+}
+
+[type=checkbox] {
+  box-sizing: border-box;
+  padding: 0;
+}
+
+* {
+  box-sizing: border-box;
+}
+
+input {
+  font-family: inherit;
+  font-size: inherit;
+  line-height: inherit;
+}
+
+a {
+  color: #0366d6;
+  text-decoration: none;
+}
+
+a:hover {
+  text-decoration: underline;
+}
+
+strong {
+  font-weight: 600;
+}
+
+hr {
+  background: transparent;
+  border: 0;
+  border-bottom: 1px solid #dfe2e5;
+  height: 0;
+  margin: 15px 0;
+  overflow: hidden;
+}
+
+hr:before {
+  content: "";
+  display: table;
+}
+
+hr:after {
+  clear: both;
+  content: "";
+  display: table;
+}
+
+table {
+  border-collapse: collapse;
+  border-spacing: 0;
+}
+
+td,
+th {
+  padding: 0;
+}
+
+details summary {
+  cursor: pointer;
+}
+
+h1,
+h2,
+h3,
+h4,
+h5,
+h6 {
+  margin-bottom: 0;
+  margin-top: 0;
+}
+
+h1 {
+  font-size: 32px;
+}
+
+h1,
+h2 {
+  font-weight: 600;
+}
+
+h2 {
+  font-size: 24px;
+}
+
+h3 {
+  font-size: 20px;
+}
+
+h3,
+h4 {
+  font-weight: 600;
+}
+
+h4 {
+  font-size: 16px;
+}
+
+h5 {
+  font-size: 14px;
+}
+
+h5,
+h6 {
+  font-weight: 600;
+}
+
+h6 {
+  font-size: 12px;
+}
+
+p {
+  margin-bottom: 10px;
+  margin-top: 0;
+}
+
+blockquote {
+  margin: 0;
+}
+
+ol,
+ul {
+  margin-bottom: 0;
+  margin-top: 0;
+  padding-left: 0;
+}
+
+ol ol,
+ul ol {
+  list-style-type: lower-roman;
+}
+
+ol ol ol,
+ol ul ol,
+ul ol ol,
+ul ul ol {
+  list-style-type: lower-alpha;
+}
+
+dd {
+  margin-left: 0;
+}
+
+code,
+pre {
+  font-family: SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace;
+  font-size: 12px;
+}
+
+pre {
+  margin-bottom: 0;
+  margin-top: 0;
+}
+
+input::-webkit-inner-spin-button,
+input::-webkit-outer-spin-button {
+  -webkit-appearance: none;
+  appearance: none;
+  margin: 0;
+}
+
+.border {
+  border: 1px solid #e1e4e8!important;
+}
+
+.border-0 {
+  border: 0!important;
+}
+
+.border-bottom {
+  border-bottom: 1px solid #e1e4e8!important;
+}
+
+.rounded-1 {
+  border-radius: 3px!important;
+}
+
+.bg-white {
+  background-color: #fff!important;
+}
+
+.bg-gray-light {
+  background-color: #fafbfc!important;
+}
+
+.text-gray-light {
+  color: #6a737d!important;
+}
+
+.mb-0 {
+  margin-bottom: 0!important;
+}
+
+.my-2 {
+  margin-bottom: 8px!important;
+  margin-top: 8px!important;
+}
+
+.pl-0 {
+  padding-left: 0!important;
+}
+
+.py-0 {
+  padding-bottom: 0!important;
+  padding-top: 0!important;
+}
+
+.pl-1 {
+  padding-left: 4px!important;
+}
+
+.pl-2 {
+  padding-left: 8px!important;
+}
+
+.py-2 {
+  padding-bottom: 8px!important;
+  padding-top: 8px!important;
+}
+
+.pl-3,
+.px-3 {
+  padding-left: 16px!important;
+}
+
+.px-3 {
+  padding-right: 16px!important;
+}
+
+.pl-4 {
+  padding-left: 24px!important;
+}
+
+.pl-5 {
+  padding-left: 32px!important;
+}
+
+.pl-6 {
+  padding-left: 40px!important;
+}
+
+.f6 {
+  font-size: 12px!important;
+}
+
+.lh-condensed {
+  line-height: 1.25!important;
+}
+
+.text-bold {
+  font-weight: 600!important;
+}
+
+a:not([href]) {
+  color: inherit;
+  text-decoration: none;
+}
+
+blockquote,
+dl,
+ol,
+p,
+pre,
+table,
+ul {
+  margin-bottom: 16px;
+  margin-top: 0;
+}
+
+hr {
+  background-color: #e1e4e8;
+  border: 0;
+  height: .25em;
+  margin: 24px 0;
+  padding: 0;
+}
+
+blockquote {
+  border-left: .25em solid #dfe2e5;
+  color: #6a737d;
+  padding: 0 1em;
+}
+
+blockquote>:first-child {
+  margin-top: 0;
+}
+
+blockquote>:last-child {
+  margin-bottom: 0;
+}
+
+kbd {
+  background-color: #fafbfc;
+  border: 1px solid #c6cbd1;
+  border-bottom-color: #959da5;
+  border-radius: 3px;
+  box-shadow: inset 0 -1px 0 #959da5;
+  color: #444d56;
+  display: inline-block;
+  font-size: 11px;
+  line-height: 10px;
+  padding: 3px 5px;
+  vertical-align: middle;
+}
+
+h1,
+h2,
+h3,
+h4,
+h5,
+h6 {
+  font-weight: 600;
+  line-height: 1.25;
+  margin-bottom: 16px;
+  margin-top: 24px;
+}
+
+h1 {
+  font-size: 2em;
+}
+
+h1,
+h2 {
+  border-bottom: 1px solid #eaecef;
+  padding-bottom: .3em;
+}
+
+h2 {
+  font-size: 1.5em;
+}
+
+h3 {
+  font-size: 1.25em;
+}
+
+h4 {
+  font-size: 1em;
+}
+
+h5 {
+  font-size: .875em;
+}
+
+h6 {
+  color: #6a737d;
+  font-size: .85em;
+}
+
+ol,
+ul {
+  padding-left: 2em;
+}
+
+ol ol,
+ol ul,
+ul ol,
+ul ul {
+  margin-bottom: 0;
+  margin-top: 0;
+}
+
+li {
+  word-wrap: break-all;
+}
+
+li>p {
+  margin-top: 16px;
+}
+
+li+li {
+  margin-top: .25em;
+}
+
+dl {
+  padding: 0;
+}
+
+dl dt {
+  font-size: 1em;
+  font-style: italic;
+  font-weight: 600;
+  margin-top: 16px;
+  padding: 0;
+}
+
+dl dd {
+  margin-bottom: 16px;
+  padding: 0 16px;
+}
+
+table {
+  display: block;
+  overflow: auto;
+  width: 100%;
+}
+
+table th {
+  font-weight: 600;
+}
+
+table td,
+table th {
+  border: 1px solid #dfe2e5;
+  padding: 6px 13px;
+}
+
+table tr {
+  background-color: #fff;
+  border-top: 1px solid #c6cbd1;
+}
+
+table tr:nth-child(2n) {
+  background-color: #f6f8fa;
+}
+
+img {
+  background-color: #fff;
+  box-sizing: content-box;
+  max-width: 100%;
+}
+
+img[align=right] {
+  padding-left: 20px;
+}
+
+img[align=left] {
+  padding-right: 20px;
+}
+
+code {
+  background-color: rgba(27,31,35,.05);
+  border-radius: 3px;
+  font-size: 85%;
+  margin: 0;
+  padding: .2em .4em;
+}
+
+pre {
+  word-wrap: normal;
+}
+
+pre>code {
+  background: transparent;
+  border: 0;
+  font-size: 100%;
+  margin: 0;
+  padding: 0;
+  white-space: pre;
+  word-break: normal;
+}
+
+.highlight {
+  margin-bottom: 16px;
+}
+
+.highlight pre {
+  margin-bottom: 0;
+  word-break: normal;
+}
+
+.highlight pre,
+pre {
+  background-color: #f6f8fa;
+  border-radius: 3px;
+  font-size: 85%;
+  line-height: 1.45;
+  overflow: auto;
+  padding: 16px;
+}
+
+pre code {
+  background-color: transparent;
+  border: 0;
+  display: inline;
+  line-height: inherit;
+  margin: 0;
+  max-width: auto;
+  overflow: visible;
+  padding: 0;
+  word-wrap: normal;
+}
+
+.commit-tease-sha {
+  color: #444d56;
+  display: inline-block;
+  font-family: SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace;
+  font-size: 90%;
+}
+
+.blob-wrapper {
+  border-bottom-left-radius: 3px;
+  border-bottom-right-radius: 3px;
+  overflow-x: auto;
+  overflow-y: hidden;
+}
+
+.blob-wrapper-embedded {
+  max-height: 240px;
+  overflow-y: auto;
+}
+
+.blob-num {
+  -moz-user-select: none;
+  -ms-user-select: none;
+  -webkit-user-select: none;
+  color: rgba(27,31,35,.3);
+  cursor: pointer;
+  font-family: SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace;
+  font-size: 12px;
+  line-height: 20px;
+  min-width: 50px;
+  padding-left: 10px;
+  padding-right: 10px;
+  text-align: right;
+  user-select: none;
+  vertical-align: top;
+  white-space: nowrap;
+  width: 1%;
+}
+
+.blob-num:hover {
+  color: rgba(27,31,35,.6);
+}
+
+.blob-num:before {
+  content: attr(data-line-number);
+}
+
+.blob-code {
+  line-height: 20px;
+  padding-left: 10px;
+  padding-right: 10px;
+  position: relative;
+  vertical-align: top;
+}
+
+.blob-code-inner {
+  color: #24292e;
+  font-family: SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace;
+  font-size: 12px;
+  overflow: visible;
+  white-space: pre;
+  word-wrap: normal;
+}
+
+.pl-token.active,
+.pl-token:hover {
+  background: #ffea7f;
+  cursor: pointer;
+}
+
+kbd {
+  background-color: #fafbfc;
+  border: 1px solid #d1d5da;
+  border-bottom-color: #c6cbd1;
+  border-radius: 3px;
+  box-shadow: inset 0 -1px 0 #c6cbd1;
+  color: #444d56;
+  display: inline-block;
+  font: 11px SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace;
+  line-height: 10px;
+  padding: 3px 5px;
+  vertical-align: middle;
+}
+
+:checked+.radio-label {
+  border-color: #0366d6;
+  position: relative;
+  z-index: 1;
+}
+
+.tab-size[data-tab-size="1"] {
+  -moz-tab-size: 1;
+  tab-size: 1;
+}
+
+.tab-size[data-tab-size="2"] {
+  -moz-tab-size: 2;
+  tab-size: 2;
+}
+
+.tab-size[data-tab-size="3"] {
+  -moz-tab-size: 3;
+  tab-size: 3;
+}
+
+.tab-size[data-tab-size="4"] {
+  -moz-tab-size: 4;
+  tab-size: 4;
+}
+
+.tab-size[data-tab-size="5"] {
+  -moz-tab-size: 5;
+  tab-size: 5;
+}
+
+.tab-size[data-tab-size="6"] {
+  -moz-tab-size: 6;
+  tab-size: 6;
+}
+
+.tab-size[data-tab-size="7"] {
+  -moz-tab-size: 7;
+  tab-size: 7;
+}
+
+.tab-size[data-tab-size="8"] {
+  -moz-tab-size: 8;
+  tab-size: 8;
+}
+
+.tab-size[data-tab-size="9"] {
+  -moz-tab-size: 9;
+  tab-size: 9;
+}
+
+.tab-size[data-tab-size="10"] {
+  -moz-tab-size: 10;
+  tab-size: 10;
+}
+
+.tab-size[data-tab-size="11"] {
+  -moz-tab-size: 11;
+  tab-size: 11;
+}
+
+.tab-size[data-tab-size="12"] {
+  -moz-tab-size: 12;
+  tab-size: 12;
+}
+
+.task-list-item {
+  list-style-type: none;
+}
+
+.task-list-item+.task-list-item {
+  margin-top: 3px;
+}
+
+.task-list-item input {
+  margin: 0 .2em .25em -1.6em;
+  vertical-align: middle;
+}
+
+hr {
+  border-bottom-color: #eee;
+}
+
+.pl-0 {
+  padding-left: 0!important;
+}
+
+.pl-1 {
+  padding-left: 4px!important;
+}
+
+.pl-2 {
+  padding-left: 8px!important;
+}
+
+.pl-3 {
+  padding-left: 16px!important;
+}
+
+.pl-4 {
+  padding-left: 24px!important;
+}
+
+.pl-5 {
+  padding-left: 32px!important;
+}
+
+.pl-6 {
+  padding-left: 40px!important;
+}
+
+.pl-7 {
+  padding-left: 48px!important;
+}
+
+.pl-8 {
+  padding-left: 64px!important;
+}
+
+.pl-9 {
+  padding-left: 80px!important;
+}
+
+.pl-10 {
+  padding-left: 96px!important;
+}
+
+.pl-11 {
+  padding-left: 112px!important;
+}
+
+.pl-12 {
+  padding-left: 128px!important;
+}
-- 
GitLab
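
The report is now also rendered to HTML with pandoc, reading GitHub-flavoured
markdown and linking the bundled CSS. The same conversion can be rerun by hand on an
existing report (the pangenome name below is a placeholder); reading with -f gfm
should also give the headings GitHub-style ids, so the hand-written '#graph-metadata'
links in the summary resolve:

    pandoc --standalone -c src/github-markdown.css -f gfm -t html \
        output/pan1c.example.report.md > output/pan1c.example.report.html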


From 47f99b977adc9fa6717c92884bbc957ca8e5ec5c Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 16 Jul 2024 15:57:01 +0200
Subject: [PATCH 014/310] Update Snakefile

Switched to PanGeTools for wfmash
---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 3892169..2b22216 100644
--- a/Snakefile
+++ b/Snakefile
@@ -339,7 +339,7 @@ rule wfmash_on_chr:
         """
         ## Mapping
         /usr/bin/time -v -o {log.time_map} \
-            apptainer exec {params.app_path}/pggb.sif wfmash \
+            apptainer exec {params.app_path}/PanGeTools.sif wfmash \
             -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \
             -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \
             --tmp-base $(dirname {output.aln}) \
@@ -349,7 +349,7 @@ rule wfmash_on_chr:
 
         ## Aligning
         /usr/bin/time -v -o {log.time_aln} \
-            apptainer exec {params.app_path}/pggb.sif wfmash \
+            apptainer exec {params.app_path}/PanGeTools.sif wfmash \
             -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \
             -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \
             --tmp-base $(dirname {output.aln}) \
-- 
GitLab


From 72c8d0091c1786ef2eb8e60b58b2078653fd7a1b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 16 Jul 2024 16:00:00 +0200
Subject: [PATCH 015/310] Trying S=1 for SyRI

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 2b22216..1d771c8 100644
--- a/Snakefile
+++ b/Snakefile
@@ -310,7 +310,7 @@ rule SyRI_on_chrInput:
             -o $(basename {output.fig}) \
             -r {output.wrkdir}/"${{refname}}.fa.gz" \
             -q "${{AllAsmList[*]}}" \
-            -h 10 -w 20 -s "0.7" -f 10
+            -h 10 -w 20 -s "1" -f 10
         mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
         rm {output.wrkdir}/*.fa
         """
-- 
GitLab


From 61e30405a43db02405d1a627bf8aeef7674e1ab2 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 16 Jul 2024 16:31:14 +0200
Subject: [PATCH 016/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 1d771c8..816df78 100644
--- a/Snakefile
+++ b/Snakefile
@@ -310,7 +310,7 @@ rule SyRI_on_chrInput:
             -o $(basename {output.fig}) \
             -r {output.wrkdir}/"${{refname}}.fa.gz" \
             -q "${{AllAsmList[*]}}" \
-            -h 10 -w 20 -s "1" -f 10
+            -h 10 -w 20 -s "0.9" -f 10
         mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
         rm {output.wrkdir}/*.fa
         """
-- 
GitLab


From ec2d4877c65cda633cb13fceaeced73470a8b123 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 24 Jul 2024 10:20:08 +0200
Subject: [PATCH 017/310] Create analyze_VCF.R

Analyzes the VCF from 'vg deconstruct' to produce a figure showing the INS/DEL length distribution
---
 scripts/analyze_VCF.R | 137 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 scripts/analyze_VCF.R

diff --git a/scripts/analyze_VCF.R b/scripts/analyze_VCF.R
new file mode 100644
index 0000000..2f4c273
--- /dev/null
+++ b/scripts/analyze_VCF.R
@@ -0,0 +1,137 @@
+# Script to plot the INS/DEL length from a VCF produced by 'VG deconstruct'
+# 
+# @author: alexis.mergez@inrae.fr
+# @version: 1.0
+
+library(optparse)
+
+#% Parsing arguments
+option_list = list(
+    make_option(c("-b", "--binwidth"), type="double", default=0.02, 
+        help="Bin width", metavar="double"),
+    make_option(c("-t", "--tsv"), type="character", default=NULL, 
+        help=".tsv input", metavar="character"),
+    make_option(c("-o", "--out"), type="character", default=NULL, 
+        help="output file name", metavar="character"),
+    make_option(c("-p", "--panname"), type="character", default=NULL, 
+        help="pangenome name", metavar="character"),
+    make_option(c("-W", "--width"), type="integer", default=12, 
+        help="Figure width", metavar="integer"),
+    make_option(c("-H", "--height"), type="integer", default=9, 
+        help="Figure height", metavar="integer")
+);
+
+opt_parser = OptionParser(option_list=option_list);
+opt = parse_args(opt_parser);
+
+## Accesing arguments with opt$<arg>. For example : opt$bands, opt$tsv, ...
+
+library(ggplot2)
+library(tidyverse)
+library(gridExtra)
+
+#% Parsing TSV file
+write("[analyze_VCF] Parsing TSV ...", stdout())
+x <- read.delim(opt$tsv)
+
+sample = str_split_1(x$CHROM[1], "#")[1]
+
+## Filtering too long and too short INS/DEL, splitting data into 2 dataframe by type 
+write("[analyze_VCF] Filtering ...", stdout())
+INS = x[which(x$LEN >= -100000 & x$LEN <= -50), ]
+INS$LEN = -INS$LEN
+
+DEL = x[which(x$LEN <= 100000 & x$LEN >= 50), ]
+
+
+## Passing to LOG for scale reasons
+INS$LOGLEN = log10(INS$LEN)
+DEL$LOGLEN = -log10(DEL$LEN)
+
+#% Figures section
+title_text = element_text(face="bold", size = 12)
+colours = c("#78ABA8", "#C8CFA0", "#FCDC94", "#EF9C66")
+
+## Function to retrieve the legend from a plot as a dedicated plot (used in the multiplot command)
+get_legend<-function(myggplot){
+  tmp <- ggplot_gtable(ggplot_build(myggplot))
+  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
+  legend <- tmp$grobs[[leg]]
+  return(legend)
+}
+
+
+## Creating Bins from the log of the length
+write("[analyze_VCF] Binning ...", stdout())
+INS = INS %>% mutate(
+    Bin = cut(
+        LOGLEN, 
+        breaks = seq(0, log10(100000), 
+        by = opt$binwidth), 
+        include.lowest = TRUE, 
+        right = FALSE, 
+        ordered_result = TRUE, 
+        dig.lab=3)
+    )
+DEL = DEL %>% mutate(
+    Bin = cut(
+        LOGLEN, 
+        breaks = seq(-log10(100000)-opt$binwidth, 0, by = opt$binwidth), 
+        include.lowest = TRUE, 
+        right = FALSE, 
+        ordered_result = TRUE, 
+        dig.lab=3)
+    )
+
+## Summerizing the dataframe by haplotypes
+write("[analyze_VCF] Summerizing ...", stdout())
+INS_F = INS %>% 
+	group_by(HAP, Bin) %>%
+	summarise(
+		Count = n(),
+		.groups = 'drop'
+	) %>%
+	mutate(x = factor(parse_number(as.character(Bin), locale = locale(decimal_mark = ".", grouping_mark = ""))))
+
+DEL_F = DEL %>% 
+	group_by(HAP, Bin) %>%
+	summarise(
+		Count = n(),
+		.groups = 'drop'
+	) %>%
+	mutate(x = factor(parse_number(as.character(Bin), locale = locale(decimal_mark = ".", grouping_mark = ""))))
+
+## Creating the graphs
+write("[analyze_VCF] Creating graphs ...", stdout())
+gInsHap = ggplot(INS_F, aes(x=x)) +
+	geom_line(aes(y=Count, group=HAP, color=HAP)) +
+	scale_y_continuous(trans='log10', position = "right") +
+	scale_x_discrete(
+		breaks=seq(0,5),
+		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
+	xlab("INS") +
+	theme_bw() +
+	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
+
+gDelHap = ggplot(DEL_F, aes(x=x)) +
+	geom_line(aes(y=Count, group=HAP, color=HAP)) +
+	scale_y_continuous(trans='log10') +
+	scale_x_discrete(
+		breaks=-seq(0,5),
+		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
+	xlab("DEL") +
+	theme_bw() +
+	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none") 
+
+## Exporting the legend
+legend = get_legend(gInsHap)
+gInsHap = gInsHap + theme(legend.position = "none")
+
+## Combining the plots and the legend
+write("[analyze_VCF] Assembling graphs ...", stdout())
+gfHap = grid.arrange(gDelHap, gInsHap, legend, nrow = 1, top=paste0(c("Pan1c - ",opt$panname," - ARS1-2"), collapse=""), widths=c(2.3, 2.3, 0.8))
+
+#% Saving the figure
+write("[analyze_VCF] Exporting ...", stdout())
+ggsave(opt$out, plot=gfHap, width=opt$width, height=opt$height)
+write("[analyze_VCF] Done !", stdout())
\ No newline at end of file
-- 
GitLab
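
A hypothetical invocation of the new script, assuming a tab-separated input that
already carries the CHROM, HAP and LEN columns it reads, and an R environment with
optparse, ggplot2, tidyverse and gridExtra installed (file names are placeholders;
the numeric values are the script defaults made explicit):

    Rscript scripts/analyze_VCF.R \
        --tsv pan1c.example.vcf.tsv \
        --out pan1c.example.vcf.png \
        --panname example \
        --binwidth 0.02 --height 9 --width 12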


From f7a0564aad71fdc9cfdb69abac015d479f7d293e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 24 Jul 2024 15:27:36 +0200
Subject: [PATCH 018/310] Added VCF figure as a possible output

Added rules:
- gfa_2_xg: converts a GFA to XG
- vg_deconstruct: produces a VCF using the reference haplotype as target paths
- vcf_fig: generates the TSV summarizing the VCF and produces the figure using R

Added scripts:
- vcf_2_tsv.awk: creates the TSV for the R script in rule vcf_fig
- analyze_VCF.R: generates the INS/DEL distribution figure from the TSV in rule vcf_fig
---
 Snakefile             | 68 +++++++++++++++++++++++++++++++++++++++++--
 config.yaml           |  3 ++
 rules/tools.smk       | 15 ++++++++++
 scripts/analyze_VCF.R |  6 ++--
 scripts/vcf_2_tsv.awk | 22 ++++++++++++++
 5 files changed, 109 insertions(+), 5 deletions(-)
 create mode 100644 scripts/vcf_2_tsv.awk

diff --git a/Snakefile b/Snakefile
index 816df78..2f61ae3 100644
--- a/Snakefile
+++ b/Snakefile
@@ -71,7 +71,10 @@ def which_analysis():
             analysis_inputs.append(
                 "output/pan1c."+config['name']+".report.md"
             )
-
+    if config["get_VCF"] == "True": # VCF from the final graph against the "reference"
+        analysis_inputs.append(
+            "output/pan1c."+config['name']+".vcf.png"
+        ) 
     return analysis_inputs
 
 """
@@ -685,6 +688,56 @@ rule panacus_stats:
             tee {log.cmd}
         """
 
+rule vg_deconstruct:
+    # Produce a VCF based on the "reference" haplotype
+    input:
+        graph="output/pan1c."+config['name']+".xg",
+    output:
+        vcf=temp("output/pan1c."+config['name']+".vcf"),
+    threads: 8
+    params:
+        app_path=config['app.path'],
+        ref=config['reference']
+    log: 
+        cmd="logs/vg_deconstruct/vg_deconstruct.cmd.log",
+        time="logs/vg_deconstruct/vg_deconstruct.time.log"
+    shell:
+        """
+        /usr/bin/time -v -o {log.time} \
+            apptainer run --app vg {params.app_path}/PanGeTools.sif \
+                deconstruct -a -e \
+                -P $(echo {params.ref} | cut -f1 -d'.') \
+                {input.graph} \
+                -t {threads} -v \
+                1> {output.vcf} \
+                2> >(tee {log.cmd} >&2)
+        """
+
+rule vcf_fig:
+    # Produce a figure describing INS/DEL length distribution
+    input:
+        vcf="output/pan1c."+config['name']+".vcf.gz"
+    output:
+        tsv=temp("output/pan1c."+config['name']+".vcf.tsv"),
+        vcf_fig="output/pan1c."+config['name']+".vcf.png"
+    threads: 1
+    params:
+        app_path=config['app.path'],
+        fig_config=config['vcf_fig.params'],
+        pan_name=config['name']
+    shell:
+        """
+        ## Producing TSV for the figure
+        awk -f scripts/vcf_2_tsv.awk {input} > {output.tsv}
+
+        ## Running R to get the figure
+        apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF.R \
+            -t {output.tsv} \
+            -o {output.vcf_fig} \
+            -p {params.pan_name} \
+            {params.fig_config}
+        """
+
 rule create_pan1c_report_fig:
     # Produces a markdown report figure of chromosomes graphs
     input:
@@ -745,6 +798,9 @@ def get_report_sections(wildcards):
             chromosome=CHRLIST
             )
 
+    if config['get_VCF'] == "True":
+        sections['VCF_fig'] = "output/pan1c."+config['name']+".vcf.png"
+
     return sections      
 
 rule create_pan1c_report:
@@ -758,7 +814,8 @@ rule create_pan1c_report:
     params:
         app_path=config['app.path'],
         add_ASMs_SyRI=config['get_ASMs_SyRI'],
-        add_chrInputs_SyRI=config['get_chrInputs_SyRI']
+        add_chrInputs_SyRI=config['get_chrInputs_SyRI'],
+        add_VCF_fig=config['get_VCF']
     run:
         shell("touch {output.report}")
 
@@ -830,5 +887,12 @@ rule create_pan1c_report:
 
             shell("echo '' >> {output.report}")
 
+        # Adding VCF figure
+        if params.add_VCF_fig == "True":
+            basename = os.path.basename(input.VCF_fig)
+            shell("echo '# INS/DEL distribution' >> {output.report}")
+            shell("echo '![{basename}](./{basename})' >> {output.report}")
+            shell("echo '' >> {output.report}")
+
         # Converting to HTML
         shell("pandoc --standalone -c src/github-markdown.css -f gfm -t html {output.report} > {output.html}")
\ No newline at end of file
diff --git a/config.yaml b/config.yaml
index 1367008..2347007 100644
--- a/config.yaml
+++ b/config.yaml
@@ -33,5 +33,8 @@ get_PAV: 'False'
 # Computes SyRI figures for haplotypes 
 get_ASMs_SyRI: 'False' # Haplotype vs Reference
 get_chrInputs_SyRI: 'False' # SyRI on chrInputs
+# Producing VCF and its associated INS/DEL figure
+get_VCF: 'False'
+vcf_fig.params: '--binwidth 0.05 --height 6 --width 18'
 # Creating final report
 create_report: 'True'
diff --git a/rules/tools.smk b/rules/tools.smk
index 53fe1ff..262f63b 100644
--- a/rules/tools.smk
+++ b/rules/tools.smk
@@ -28,4 +28,19 @@ rule run_bgzip:
         """
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
             -@ {threads} {input}
+        """
+
+rule gfa_2_xg:
+    # Convert a GFA to a XG
+    input:
+        "{graph}.gfa"
+    output:
+        "{graph}.xg"
+    threads: 8
+    params:
+        app_path=config["app.path"]
+    shell:
+        """
+        apptainer run --app vg {params.app_path}/PanGeTools.sif \
+            convert -g -x -t {threads} {input} > {output}
         """
\ No newline at end of file
diff --git a/scripts/analyze_VCF.R b/scripts/analyze_VCF.R
index 2f4c273..29fe61c 100644
--- a/scripts/analyze_VCF.R
+++ b/scripts/analyze_VCF.R
@@ -15,9 +15,9 @@ option_list = list(
         help="output file name", metavar="character"),
     make_option(c("-p", "--panname"), type="character", default=NULL, 
         help="pangenome name", metavar="character"),
-    make_option(c("-W", "--width"), type="integer", default=12, 
+    make_option(c("-W", "--width"), type="integer", default=18, 
         help="Figure width", metavar="integer"),
-    make_option(c("-H", "--height"), type="integer", default=9, 
+    make_option(c("-H", "--height"), type="integer", default=6, 
         help="Figure height", metavar="integer")
 );
 
@@ -129,7 +129,7 @@ gInsHap = gInsHap + theme(legend.position = "none")
 
 ## Combining the plots and the legend
 write("[analyze_VCF] Assembling graphs ...", stdout())
-gfHap = grid.arrange(gDelHap, gInsHap, legend, nrow = 1, top=paste0(c("Pan1c - ",opt$panname," - ARS1-2"), collapse=""), widths=c(2.3, 2.3, 0.8))
+gfHap = grid.arrange(gDelHap, gInsHap, legend, nrow = 1, top=paste0(c("Pan1c - ",opt$panname," - ",sample), collapse=""), widths=c(2.3, 2.3, 0.8))
 
 #% Saving the figure
 write("[analyze_VCF] Exporting ...", stdout())
diff --git a/scripts/vcf_2_tsv.awk b/scripts/vcf_2_tsv.awk
new file mode 100644
index 0000000..04f6be9
--- /dev/null
+++ b/scripts/vcf_2_tsv.awk
@@ -0,0 +1,22 @@
+# Header
+/^#CHROM/ {
+    for (i=10;i<=NF;i++) {HAP[i]=$i}; print "CHROM\tPOS\tID\tHAP\tLEN"
+}
+
+# INS/DEL counter
+!/^#/ { 
+    ALTLEN[0]=0; split($5, arr, ",")
+	for (i in arr) {
+        ALTLEN[i]=(length($4) - length(arr[i]))
+    } 
+    for (i=10; i<=NF; i++) { 
+		split($i, ALTs, "|")
+        hap=0 
+		for (a in ALTs) { 
+			if (ALTs[a] != "." && ALTLEN[ALTs[a]] != 0) { 
+				hap+=1 
+				printf("%s\t%s\t%s\t%s#%s\t%s\n", $1, $2, $3, HAP[i], hap, ALTLEN[ALTs[a]])
+            }
+        }
+    }
+}
\ No newline at end of file
-- 
GitLab
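
Taken together, the new rules form a small chain from the final graph to the INS/DEL
figure. A sketch of the equivalent manual commands, assuming the appimgs/ image
directory from the CI/CD config and placeholder file names ('R64' standing for the
reference prefix derived from config['reference']):

    # GFA -> XG (rule gfa_2_xg)
    apptainer run --app vg appimgs/PanGeTools.sif \
        convert -g -x -t 8 pan1c.example.gfa > pan1c.example.xg
    # XG -> VCF against the reference paths (rule vg_deconstruct)
    apptainer run --app vg appimgs/PanGeTools.sif \
        deconstruct -a -e -P R64 pan1c.example.xg -t 8 -v > pan1c.example.vcf
    # VCF -> TSV (CHROM, POS, ID, HAP, LEN), then TSV -> figure (rule vcf_fig)
    awk -f scripts/vcf_2_tsv.awk pan1c.example.vcf > pan1c.example.vcf.tsv
    apptainer run --app Renv appimgs/pan1c-env.sif Rscript scripts/analyze_VCF.R \
        -t pan1c.example.vcf.tsv -o pan1c.example.vcf.png -p example \
        --binwidth 0.05 --height 6 --width 18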


From feb1272fd26462af47044f06bcce751d0fa3fdd2 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 24 Jul 2024 15:28:55 +0200
Subject: [PATCH 019/310] Updated example config.yaml accordingly

---
 example/config_CICD.yaml | 3 +++
 rules/tools.smk          | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml
index dfd0ad2..715ce04 100644
--- a/example/config_CICD.yaml
+++ b/example/config_CICD.yaml
@@ -34,5 +34,8 @@ get_PAV: 'False'
 #get_allASM_SyRI: 'False' # All vs all
 get_ASMs_SyRI: 'True' # Haplotype vs Reference
 get_chrInputs_SyRI: 'True' # SyRI on chrInputs
+# Producing VCF and its associated INS/DEL figure
+get_VCF: 'False'
+vcf_fig.params: '--binwidth 0.05 --height 6 --width 18'
 # Creating final report
 create_report: 'True'
diff --git a/rules/tools.smk b/rules/tools.smk
index 262f63b..f1dd1c1 100644
--- a/rules/tools.smk
+++ b/rules/tools.smk
@@ -31,7 +31,7 @@ rule run_bgzip:
         """
 
 rule gfa_2_xg:
-    # Convert a GFA to a XG
+    # Convert a GFA to XG
     input:
         "{graph}.gfa"
     output:
-- 
GitLab


From 9ff61f7b004f285d35e590738ebe6deb23cefbf1 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 24 Jul 2024 15:59:00 +0200
Subject: [PATCH 020/310] Added memory resources

Added:
- A function to allocate memory (formula: job_default_mem * n_retries * mem_multiplier * threads); see the worked example below
- Default memory requests for all rules
- A mem_multiplier parameter in config.yaml
---
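
A worked example of the sizing formula above, assuming Snakemake's attempt counter
(1 on the first try) plays the role of n_retries, with the 2000 MB base used by
ragtag_scaffolding, the default mem_multiplier of 1 and 8 threads:

    # mem_mb = attempt * base * multiplier * threads
    base=2000; multiplier=1; threads=8
    echo $(( 1 * base * multiplier * threads ))   # first attempt:  16000 MB
    echo $(( 2 * base * multiplier * threads ))   # after a retry:  32000 MB
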
 Snakefile                | 56 ++++++++++++++++++++++++++++++++++++++--
 config.yaml              |  4 +++
 example/config_CICD.yaml |  4 +++
 rules/tools.smk          |  6 +++++
 4 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 2f61ae3..5b6ca15 100644
--- a/Snakefile
+++ b/Snakefile
@@ -77,6 +77,12 @@ def which_analysis():
         ) 
     return analysis_inputs
 
+"""
+Functions   ---------------------------------------------------------------------------------------
+"""
+def get_mem_mb(wildcards, base, attempt, threads, multiplier=config["mem_multiplier"]):
+    return attempt * base * multiplier * threads
+
 """
 Rules   -------------------------------------------------------------------------------------------
 """
@@ -103,6 +109,8 @@ rule ragtag_scaffolding:
         fa=temp("data/hap.ragtagged/{haplotype}.ragtagged.fa"),
         tar="data/hap.ragtagged/{haplotype}.tar.gz"
     threads: 8
+    resources:
+        mem_mb=get_mem_mb(base=2000)
     retries: 1
     priority: 100
     params:
@@ -138,6 +146,8 @@ rule quast_stats:
     output:
         report="output/quast/"+config['name']+".quast.report.html"
     threads: 8
+    resources:
+        mem_mb=get_mem_mb(base=2000)
     params:
         app_path=config["app.path"],
         pan_name=config["name"]
@@ -177,6 +187,8 @@ rule contig_position:
         fig="output/chr.contig/{chromosome}.contig.png",
         outdir=directory("output/chr.contig/{chromosome}")
     threads: 1
+    resources:
+        mem_mb=get_mem_mb(base=16000)
     params:
         app_path=config["app.path"]
     shell:
@@ -223,6 +235,8 @@ rule chromosome_clustering:
     output:
         temp(expand('data/chrInputs/'+config['name']+'.{chromosome}.fa', chromosome=CHRLIST))
     threads: 1
+    resources:
+        mem_mb=get_mem_mb(base=16000)
     priority: 100
     params:
         app_path=config["app.path"],
@@ -247,9 +261,9 @@ rule SyRI_on_ASM:
         cmd="logs/SyRI_ASM/{haplotype}.SyRI_ASM.cmd.log",
         time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.time.log"
     threads: 4
-    retries: 1
     resources:
-        mem_gb=48
+        mem_mb=get_mem_mb(base=12000)
+    retries: 1
     params:
         app_path=config["app.path"]
     shell:
@@ -280,6 +294,8 @@ rule SyRI_on_chrInput:
         fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png",
         wrkdir=directory('data/chrInputs/syri/{chromosome}')
     threads: 8
+    resources:
+        mem_mb=get_mem_mb(base=12000)
     params:
         app_path=config["app.path"],
         ref=config['reference']
@@ -327,6 +343,8 @@ rule wfmash_on_chr:
         mapping=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf"),
         aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf")
     threads: 16
+    resources:
+        mem_mb=get_mem_mb(base=2000)
     priority: 100
     params:
         app_path=config['app.path'],
@@ -377,6 +395,8 @@ rule seqwish:
     output:
         temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa")
     threads: 8
+    resources:
+        mem_mb=get_mem_mb(base=4000)
     priority: 100
     params:
         app_path=config['app.path'],
@@ -406,6 +426,8 @@ rule gfaffix_on_chr:
         gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa"),
         transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt"
     threads: 1
+    resources:
+        mem_mb=get_mem_mb(base=24000)
     priority: 100
     params:
         app_path=config['app.path']
@@ -430,6 +452,8 @@ rule odgi_postprocessing:
     output:
         gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa'
     threads: 8
+    resources:
+        mem_mb=get_mem_mb(base=4000)
     priority: 100
     params:
         app_path=config['app.path']
@@ -493,6 +517,8 @@ rule generate_graph_list:
     output:
         "data/chrGraphs/graphsList.txt"
     threads: 1
+    resources:
+        mem_mb=get_mem_mb(base=100)
     priority: 100
     run:
         with open(output[0], "w") as handle:
@@ -510,6 +536,8 @@ rule graph_squeeze:
         cmd="logs/squeeze/"+config['name']+".squeeze.cmd.log",
         time="logs/squeeze/"+config['name']+".squeeze.time.log",
     threads: 16
+    resources:
+        mem_mb=get_mem_mb(base=2000)
     priority: 100
     params:
         app_path=config['app.path']
@@ -534,6 +562,8 @@ rule graph_stats:
         genstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv",
         pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv"
     threads: 4
+    resources:
+        mem_mb=get_mem_mb(base=6000)
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -553,6 +583,8 @@ rule graph_figs:
         oneDviz="output/chrGraphs.figs/"+config['name']+".{chromosome}.1Dviz.png",
         pcov="output/chrGraphs.figs/"+config['name']+".{chromosome}.pcov.png"
     threads: 4
+    resources:
+        mem_mb=get_mem_mb(base=4000)
     params:
         app_path=config['app.path'],
         oneDviz=config['odgi.1Dviz.params'],
@@ -576,6 +608,8 @@ rule aggregate_graphs_stats:
         genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv",
         pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv"
     threads: 1
+    resources:
+        mem_mb=get_mem_mb(base=2000)
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -593,6 +627,8 @@ rule final_graph_tagging:
     output:
         "output/pan1c."+config['name']+".gfa.metadata"
     threads: 1
+    resources:
+        mem_mb=get_mem_mb(base=2000)
     priority: 100
     params:
         app_path=config['app.path'],
@@ -611,6 +647,8 @@ rule pggb_input_stats:
     output:
         "output/stats/pan1c."+config['name']+".chrInput.stats.tsv"
     threads: 1
+    resources:
+        mem_mb=get_mem_mb(base=2000)
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -629,6 +667,8 @@ rule core_statistics:
         tsv = "output/stats/pan1c."+config['name']+".core.stats.tsv",
         dir = directory("output/pggb.usage.figs")
     threads: 1
+    resources:
+        mem_mb=get_mem_mb(base=2000)
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -650,6 +690,8 @@ rule get_pav:
     output:
         directory("output/pav.matrices")
     threads: 16
+    resources:
+        mem_mb=get_mem_mb(base=2000)
     params:
         app_path=config['app.path']
     run:
@@ -675,6 +717,8 @@ rule panacus_stats:
         pan_name=config['name'],
         refname=config['reference']
     threads: 8
+    resources:
+        mem_mb=get_mem_mb(base=2000)
     shell:
         """
         /usr/bin/time -v -o {log.time} \
@@ -695,6 +739,8 @@ rule vg_deconstruct:
     output:
         vcf=temp("output/pan1c."+config['name']+".vcf"),
     threads: 8
+    resources:
+        mem_mb=get_mem_mb(base=32000)
     params:
         app_path=config['app.path'],
         ref=config['reference']
@@ -721,6 +767,8 @@ rule vcf_fig:
         tsv=temp("output/pan1c."+config['name']+".vcf.tsv"),
         vcf_fig="output/pan1c."+config['name']+".vcf.png"
     threads: 1
+    resources:
+        mem_mb=get_mem_mb(base=8000)
     params:
         app_path=config['app.path'],
         fig_config=config['vcf_fig.params'],
@@ -748,6 +796,8 @@ rule create_pan1c_report_fig:
         namefig=temp("tmp/{chromosome}.name.png"),
         reportfig="output/report/"+config['name']+".{chromosome}.report.fig.png"
     threads: 4
+    resources:
+        mem_mb=get_mem_mb(base=2000)
     params:
         app_path=config['app.path']
     shell:
@@ -811,6 +861,8 @@ rule create_pan1c_report:
         report="output/pan1c."+config['name']+".report.md",
         html="output/pan1c."+config['name']+".report.html"
     threads: 4
+    resources:
+        mem_mb=get_mem_mb(base=1000)
     params:
         app_path=config['app.path'],
         add_ASMs_SyRI=config['get_ASMs_SyRI'],
diff --git a/config.yaml b/config.yaml
index 2347007..20a6240 100644
--- a/config.yaml
+++ b/config.yaml
@@ -6,6 +6,10 @@ reference: '<reference_name>'
 # Directory of apptainer images (downloaded with getApps.sh)
 app.path: '<path>'
 
+## Resources
+# Memory multiplier (increase when OOM). Formula : job_default_mem * n_retries * mem_multiplier * threads
+mem_multiplier: 1
+
 # Core parameters
 # RagTag parameters
 ragtag_mm2_conf: '-x asm5'
diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml
index 715ce04..4dc0dd4 100644
--- a/example/config_CICD.yaml
+++ b/example/config_CICD.yaml
@@ -6,6 +6,10 @@ reference: 'R64.hap1.fa.gz'
 # Directory of apptainer images (downloaded with getApps.sh)
 app.path: 'appimgs/'
 
+## Resources
+# Memory multiplier (increase when OOM). Formula : job_default_mem * n_retries * mem_multiplier * threads
+mem_multiplier: 1
+
 # Core parameters
 # RagTag parameters
 ragtag_mm2_conf: '-x asm5'
diff --git a/rules/tools.smk b/rules/tools.smk
index f1dd1c1..d9b033f 100644
--- a/rules/tools.smk
+++ b/rules/tools.smk
@@ -9,6 +9,8 @@ rule samtools_index:
         "{sample}.fa.gz.fai",
         "{sample}.fa.gz.gzi"
     threads: 1
+    resources:
+        mem_mb=get_mem_mb(base=2000)
     params: 
         app_path=config["app.path"]
     shell:
@@ -22,6 +24,8 @@ rule run_bgzip:
     output:
         "{file}.gz"
     threads: 4
+    resources:
+        mem_mb=get_mem_mb(base=8000)
     params:
         app_path=config["app.path"]
     shell:
@@ -37,6 +41,8 @@ rule gfa_2_xg:
     output:
         "{graph}.xg"
     threads: 8
+    resources:
+        mem_mb=get_mem_mb(base=8000)
     params:
         app_path=config["app.path"]
     shell:
-- 
GitLab


From c7b57cb10845df4a70c2c4c86f0aa9e36c49a1c5 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 24 Jul 2024 16:00:33 +0200
Subject: [PATCH 021/310] Update tools.smk

---
 rules/tools.smk | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/rules/tools.smk b/rules/tools.smk
index d9b033f..11e9328 100644
--- a/rules/tools.smk
+++ b/rules/tools.smk
@@ -1,3 +1,9 @@
+"""
+Functions   ---------------------------------------------------------------------------------------
+"""
+def get_mem_mb(wildcards, base, attempt, threads, multiplier=config["mem_multiplier"]):
+    return attempt * base * multiplier * threads
+
 """
 Tool rules  ---------------------------------------------------------------------------------------
 """
-- 
GitLab
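
[Editor's illustration, not part of the patch series] A simplified, runnable sketch of how the get_mem_mb helper added in this patch scales a rule's memory request. It drops the Snakemake-specific wildcards argument and hard-codes the multiplier default; the base, thread count and attempt numbers below are example values, not ones taken from the workflow config.

    # Minimal sketch of the formula: attempt * base * multiplier * threads.
    def get_mem_mb(base, attempt, threads, multiplier=1):
        return attempt * base * multiplier * threads

    print(get_mem_mb(base=2000, attempt=1, threads=8))   # 16000 MB on the first attempt
    print(get_mem_mb(base=2000, attempt=2, threads=8))   # 32000 MB after one retry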


From b9ba40d8f344a1630b3467559a9ac4cd33ce3715 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 24 Jul 2024 16:09:48 +0200
Subject: [PATCH 022/310] Moved base memory allocation value

---
 Snakefile       | 52 ++++++++++++++++++++++++-------------------------
 rules/tools.smk | 12 +++++++-----
 2 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/Snakefile b/Snakefile
index 5b6ca15..22a02dd 100644
--- a/Snakefile
+++ b/Snakefile
@@ -80,8 +80,8 @@ def which_analysis():
 """
 Functions   ---------------------------------------------------------------------------------------
 """
-def get_mem_mb(wildcards, base, attempt, threads, multiplier=config["mem_multiplier"]):
-    return attempt * base * multiplier * threads
+def get_mem_mb(wildcards, attempt, threads, multiplier=config["mem_multiplier"]):
+    return attempt * multiplier * threads
 
 """
 Rules   -------------------------------------------------------------------------------------------
@@ -110,7 +110,7 @@ rule ragtag_scaffolding:
         tar="data/hap.ragtagged/{haplotype}.tar.gz"
     threads: 8
     resources:
-        mem_mb=get_mem_mb(base=2000)
+        mem_mb = 2000 * get_mem_mb
     retries: 1
     priority: 100
     params:
@@ -147,7 +147,7 @@ rule quast_stats:
         report="output/quast/"+config['name']+".quast.report.html"
     threads: 8
     resources:
-        mem_mb=get_mem_mb(base=2000)
+        mem_mb = 2000 * get_mem_mb
     params:
         app_path=config["app.path"],
         pan_name=config["name"]
@@ -188,7 +188,7 @@ rule contig_position:
         outdir=directory("output/chr.contig/{chromosome}")
     threads: 1
     resources:
-        mem_mb=get_mem_mb(base=16000)
+        mem_mb = 16000 * get_mem_mb
     params:
         app_path=config["app.path"]
     shell:
@@ -236,7 +236,7 @@ rule chromosome_clustering:
         temp(expand('data/chrInputs/'+config['name']+'.{chromosome}.fa', chromosome=CHRLIST))
     threads: 1
     resources:
-        mem_mb=get_mem_mb(base=16000)
+        mem_mb = 16000 * get_mem_mb
     priority: 100
     params:
         app_path=config["app.path"],
@@ -262,7 +262,7 @@ rule SyRI_on_ASM:
         time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.time.log"
     threads: 4
     resources:
-        mem_mb=get_mem_mb(base=12000)
+        mem_mb = 12000 * get_mem_mb
     retries: 1
     params:
         app_path=config["app.path"]
@@ -295,7 +295,7 @@ rule SyRI_on_chrInput:
         wrkdir=directory('data/chrInputs/syri/{chromosome}')
     threads: 8
     resources:
-        mem_mb=get_mem_mb(base=12000)
+        mem_mb = 12000 * get_mem_mb
     params:
         app_path=config["app.path"],
         ref=config['reference']
@@ -344,7 +344,7 @@ rule wfmash_on_chr:
         aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf")
     threads: 16
     resources:
-        mem_mb=get_mem_mb(base=2000)
+        mem_mb = 2000 * get_mem_mb
     priority: 100
     params:
         app_path=config['app.path'],
@@ -396,7 +396,7 @@ rule seqwish:
         temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa")
     threads: 8
     resources:
-        mem_mb=get_mem_mb(base=4000)
+        mem_mb = 4000 * get_mem_mb
     priority: 100
     params:
         app_path=config['app.path'],
@@ -427,7 +427,7 @@ rule gfaffix_on_chr:
         transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt"
     threads: 1
     resources:
-        mem_mb=get_mem_mb(base=24000)
+        mem_mb = 24000 * get_mem_mb
     priority: 100
     params:
         app_path=config['app.path']
@@ -453,7 +453,7 @@ rule odgi_postprocessing:
         gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa'
     threads: 8
     resources:
-        mem_mb=get_mem_mb(base=4000)
+        mem_mb = 4000 * get_mem_mb
     priority: 100
     params:
         app_path=config['app.path']
@@ -518,7 +518,7 @@ rule generate_graph_list:
         "data/chrGraphs/graphsList.txt"
     threads: 1
     resources:
-        mem_mb=get_mem_mb(base=100)
+        mem_mb = 100 * get_mem_mb
     priority: 100
     run:
         with open(output[0], "w") as handle:
@@ -537,7 +537,7 @@ rule graph_squeeze:
         time="logs/squeeze/"+config['name']+".squeeze.time.log",
     threads: 16
     resources:
-        mem_mb=get_mem_mb(base=2000)
+        mem_mb = 2000 * get_mem_mb
     priority: 100
     params:
         app_path=config['app.path']
@@ -563,7 +563,7 @@ rule graph_stats:
         pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv"
     threads: 4
     resources:
-        mem_mb=get_mem_mb(base=6000)
+        mem_mb = 6000 * get_mem_mb
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -584,7 +584,7 @@ rule graph_figs:
         pcov="output/chrGraphs.figs/"+config['name']+".{chromosome}.pcov.png"
     threads: 4
     resources:
-        mem_mb=get_mem_mb(base=4000)
+        mem_mb = 4000 * get_mem_mb
     params:
         app_path=config['app.path'],
         oneDviz=config['odgi.1Dviz.params'],
@@ -609,7 +609,7 @@ rule aggregate_graphs_stats:
         pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv"
     threads: 1
     resources:
-        mem_mb=get_mem_mb(base=2000)
+        mem_mb = 2000 * get_mem_mb
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -628,7 +628,7 @@ rule final_graph_tagging:
         "output/pan1c."+config['name']+".gfa.metadata"
     threads: 1
     resources:
-        mem_mb=get_mem_mb(base=2000)
+        mem_mb = 2000 * get_mem_mb
     priority: 100
     params:
         app_path=config['app.path'],
@@ -648,7 +648,7 @@ rule pggb_input_stats:
         "output/stats/pan1c."+config['name']+".chrInput.stats.tsv"
     threads: 1
     resources:
-        mem_mb=get_mem_mb(base=2000)
+        mem_mb = 2000 * get_mem_mb
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -668,7 +668,7 @@ rule core_statistics:
         dir = directory("output/pggb.usage.figs")
     threads: 1
     resources:
-        mem_mb=get_mem_mb(base=2000)
+        mem_mb = 2000 * get_mem_mb
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -691,7 +691,7 @@ rule get_pav:
         directory("output/pav.matrices")
     threads: 16
     resources:
-        mem_mb=get_mem_mb(base=2000)
+        mem_mb = 2000 * get_mem_mb
     params:
         app_path=config['app.path']
     run:
@@ -718,7 +718,7 @@ rule panacus_stats:
         refname=config['reference']
     threads: 8
     resources:
-        mem_mb=get_mem_mb(base=2000)
+        mem_mb = 2000 * get_mem_mb
     shell:
         """
         /usr/bin/time -v -o {log.time} \
@@ -740,7 +740,7 @@ rule vg_deconstruct:
         vcf=temp("output/pan1c."+config['name']+".vcf"),
     threads: 8
     resources:
-        mem_mb=get_mem_mb(base=32000)
+        mem_mb = 32000 * get_mem_mb
     params:
         app_path=config['app.path'],
         ref=config['reference']
@@ -768,7 +768,7 @@ rule vcf_fig:
         vcf_fig="output/pan1c."+config['name']+".vcf.png"
     threads: 1
     resources:
-        mem_mb=get_mem_mb(base=8000)
+        mem_mb = 8000 * get_mem_mb
     params:
         app_path=config['app.path'],
         fig_config=config['vcf_fig.params'],
@@ -797,7 +797,7 @@ rule create_pan1c_report_fig:
         reportfig="output/report/"+config['name']+".{chromosome}.report.fig.png"
     threads: 4
     resources:
-        mem_mb=get_mem_mb(base=2000)
+        mem_mb = 2000 * get_mem_mb
     params:
         app_path=config['app.path']
     shell:
@@ -862,7 +862,7 @@ rule create_pan1c_report:
         html="output/pan1c."+config['name']+".report.html"
     threads: 4
     resources:
-        mem_mb=get_mem_mb(base=1000)
+        mem_mb = 500 * get_mem_mb
     params:
         app_path=config['app.path'],
         add_ASMs_SyRI=config['get_ASMs_SyRI'],
diff --git a/rules/tools.smk b/rules/tools.smk
index 11e9328..c0a9985 100644
--- a/rules/tools.smk
+++ b/rules/tools.smk
@@ -1,8 +1,8 @@
 """
 Functions   ---------------------------------------------------------------------------------------
 """
-def get_mem_mb(wildcards, base, attempt, threads, multiplier=config["mem_multiplier"]):
-    return attempt * base * multiplier * threads
+def get_mem_mb(wildcards, attempt, threads, multiplier=config["mem_multiplier"]):
+    return attempt * multiplier * threads
 
 """
 Tool rules  ---------------------------------------------------------------------------------------
@@ -15,8 +15,9 @@ rule samtools_index:
         "{sample}.fa.gz.fai",
         "{sample}.fa.gz.gzi"
     threads: 1
+    retries: 1
     resources:
-        mem_mb=get_mem_mb(base=2000)
+        mem_mb = 2000 * get_mem_mb
     params: 
         app_path=config["app.path"]
     shell:
@@ -30,8 +31,9 @@ rule run_bgzip:
     output:
         "{file}.gz"
     threads: 4
+    retries: 1
     resources:
-        mem_mb=get_mem_mb(base=8000)
+        mem_mb = 8000 * get_mem_mb
     params:
         app_path=config["app.path"]
     shell:
@@ -48,7 +50,7 @@ rule gfa_2_xg:
         "{graph}.xg"
     threads: 8
     resources:
-        mem_mb=get_mem_mb(base=8000)
+        mem_mb = 8000 * get_mem_mb
     params:
         app_path=config["app.path"]
     shell:
-- 
GitLab


From 6e081c1a8f825650fc4b6c2ef864c9aa0dc702ba Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 24 Jul 2024 16:17:12 +0200
Subject: [PATCH 023/310] Switched to lambda expressions

---
 Snakefile       | 48 ++++++++++++++++++++++++------------------------
 rules/tools.smk |  6 +++---
 2 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/Snakefile b/Snakefile
index 22a02dd..f8d3b72 100644
--- a/Snakefile
+++ b/Snakefile
@@ -110,7 +110,7 @@ rule ragtag_scaffolding:
         tar="data/hap.ragtagged/{haplotype}.tar.gz"
     threads: 8
     resources:
-        mem_mb = 2000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000 
     retries: 1
     priority: 100
     params:
@@ -147,7 +147,7 @@ rule quast_stats:
         report="output/quast/"+config['name']+".quast.report.html"
     threads: 8
     resources:
-        mem_mb = 2000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
     params:
         app_path=config["app.path"],
         pan_name=config["name"]
@@ -188,7 +188,7 @@ rule contig_position:
         outdir=directory("output/chr.contig/{chromosome}")
     threads: 1
     resources:
-        mem_mb = 16000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 16000
     params:
         app_path=config["app.path"]
     shell:
@@ -236,7 +236,7 @@ rule chromosome_clustering:
         temp(expand('data/chrInputs/'+config['name']+'.{chromosome}.fa', chromosome=CHRLIST))
     threads: 1
     resources:
-        mem_mb = 16000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 16000
     priority: 100
     params:
         app_path=config["app.path"],
@@ -262,7 +262,7 @@ rule SyRI_on_ASM:
         time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.time.log"
     threads: 4
     resources:
-        mem_mb = 12000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 12000
     retries: 1
     params:
         app_path=config["app.path"]
@@ -295,7 +295,7 @@ rule SyRI_on_chrInput:
         wrkdir=directory('data/chrInputs/syri/{chromosome}')
     threads: 8
     resources:
-        mem_mb = 12000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 12000
     params:
         app_path=config["app.path"],
         ref=config['reference']
@@ -344,7 +344,7 @@ rule wfmash_on_chr:
         aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf")
     threads: 16
     resources:
-        mem_mb = 2000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
     priority: 100
     params:
         app_path=config['app.path'],
@@ -396,7 +396,7 @@ rule seqwish:
         temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa")
     threads: 8
     resources:
-        mem_mb = 4000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 4000
     priority: 100
     params:
         app_path=config['app.path'],
@@ -427,7 +427,7 @@ rule gfaffix_on_chr:
         transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt"
     threads: 1
     resources:
-        mem_mb = 24000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 24000
     priority: 100
     params:
         app_path=config['app.path']
@@ -453,7 +453,7 @@ rule odgi_postprocessing:
         gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa'
     threads: 8
     resources:
-        mem_mb = 4000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 4000
     priority: 100
     params:
         app_path=config['app.path']
@@ -518,7 +518,7 @@ rule generate_graph_list:
         "data/chrGraphs/graphsList.txt"
     threads: 1
     resources:
-        mem_mb = 100 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 100
     priority: 100
     run:
         with open(output[0], "w") as handle:
@@ -537,7 +537,7 @@ rule graph_squeeze:
         time="logs/squeeze/"+config['name']+".squeeze.time.log",
     threads: 16
     resources:
-        mem_mb = 2000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
     priority: 100
     params:
         app_path=config['app.path']
@@ -563,7 +563,7 @@ rule graph_stats:
         pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv"
     threads: 4
     resources:
-        mem_mb = 6000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 6000
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -584,7 +584,7 @@ rule graph_figs:
         pcov="output/chrGraphs.figs/"+config['name']+".{chromosome}.pcov.png"
     threads: 4
     resources:
-        mem_mb = 4000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 4000
     params:
         app_path=config['app.path'],
         oneDviz=config['odgi.1Dviz.params'],
@@ -609,7 +609,7 @@ rule aggregate_graphs_stats:
         pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv"
     threads: 1
     resources:
-        mem_mb = 2000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -628,7 +628,7 @@ rule final_graph_tagging:
         "output/pan1c."+config['name']+".gfa.metadata"
     threads: 1
     resources:
-        mem_mb = 2000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
     priority: 100
     params:
         app_path=config['app.path'],
@@ -648,7 +648,7 @@ rule pggb_input_stats:
         "output/stats/pan1c."+config['name']+".chrInput.stats.tsv"
     threads: 1
     resources:
-        mem_mb = 2000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -668,7 +668,7 @@ rule core_statistics:
         dir = directory("output/pggb.usage.figs")
     threads: 1
     resources:
-        mem_mb = 2000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -691,7 +691,7 @@ rule get_pav:
         directory("output/pav.matrices")
     threads: 16
     resources:
-        mem_mb = 2000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
     params:
         app_path=config['app.path']
     run:
@@ -718,7 +718,7 @@ rule panacus_stats:
         refname=config['reference']
     threads: 8
     resources:
-        mem_mb = 2000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
     shell:
         """
         /usr/bin/time -v -o {log.time} \
@@ -740,7 +740,7 @@ rule vg_deconstruct:
         vcf=temp("output/pan1c."+config['name']+".vcf"),
     threads: 8
     resources:
-        mem_mb = 32000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 32000
     params:
         app_path=config['app.path'],
         ref=config['reference']
@@ -768,7 +768,7 @@ rule vcf_fig:
         vcf_fig="output/pan1c."+config['name']+".vcf.png"
     threads: 1
     resources:
-        mem_mb = 8000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 8000
     params:
         app_path=config['app.path'],
         fig_config=config['vcf_fig.params'],
@@ -797,7 +797,7 @@ rule create_pan1c_report_fig:
         reportfig="output/report/"+config['name']+".{chromosome}.report.fig.png"
     threads: 4
     resources:
-        mem_mb = 2000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
     params:
         app_path=config['app.path']
     shell:
@@ -862,7 +862,7 @@ rule create_pan1c_report:
         html="output/pan1c."+config['name']+".report.html"
     threads: 4
     resources:
-        mem_mb = 500 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 500
     params:
         app_path=config['app.path'],
         add_ASMs_SyRI=config['get_ASMs_SyRI'],
diff --git a/rules/tools.smk b/rules/tools.smk
index c0a9985..bfb0da1 100644
--- a/rules/tools.smk
+++ b/rules/tools.smk
@@ -17,7 +17,7 @@ rule samtools_index:
     threads: 1
     retries: 1
     resources:
-        mem_mb = 2000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
     params: 
         app_path=config["app.path"]
     shell:
@@ -33,7 +33,7 @@ rule run_bgzip:
     threads: 4
     retries: 1
     resources:
-        mem_mb = 8000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 8000
     params:
         app_path=config["app.path"]
     shell:
@@ -50,7 +50,7 @@ rule gfa_2_xg:
         "{graph}.xg"
     threads: 8
     resources:
-        mem_mb = 8000 * get_mem_mb
+        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 8000
     params:
         app_path=config["app.path"]
     shell:
-- 
GitLab


From 7cc28db981b1b9962058a4b30618c8379dd4e174 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 25 Jul 2024 10:09:13 +0200
Subject: [PATCH 024/310] Fixed RAM allocation

---
 Snakefile       | 48 ++++++++++++++++++++++++------------------------
 rules/tools.smk |  6 +++---
 2 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/Snakefile b/Snakefile
index f8d3b72..2cb4e8f 100644
--- a/Snakefile
+++ b/Snakefile
@@ -110,7 +110,7 @@ rule ragtag_scaffolding:
         tar="data/hap.ragtagged/{haplotype}.tar.gz"
     threads: 8
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000 
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000 
     retries: 1
     priority: 100
     params:
@@ -147,7 +147,7 @@ rule quast_stats:
         report="output/quast/"+config['name']+".quast.report.html"
     threads: 8
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
     params:
         app_path=config["app.path"],
         pan_name=config["name"]
@@ -188,7 +188,7 @@ rule contig_position:
         outdir=directory("output/chr.contig/{chromosome}")
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 16000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
     params:
         app_path=config["app.path"]
     shell:
@@ -236,7 +236,7 @@ rule chromosome_clustering:
         temp(expand('data/chrInputs/'+config['name']+'.{chromosome}.fa', chromosome=CHRLIST))
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 16000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
     priority: 100
     params:
         app_path=config["app.path"],
@@ -262,7 +262,7 @@ rule SyRI_on_ASM:
         time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.time.log"
     threads: 4
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 12000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000
     retries: 1
     params:
         app_path=config["app.path"]
@@ -295,7 +295,7 @@ rule SyRI_on_chrInput:
         wrkdir=directory('data/chrInputs/syri/{chromosome}')
     threads: 8
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 12000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000
     params:
         app_path=config["app.path"],
         ref=config['reference']
@@ -344,7 +344,7 @@ rule wfmash_on_chr:
         aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf")
     threads: 16
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
     priority: 100
     params:
         app_path=config['app.path'],
@@ -396,7 +396,7 @@ rule seqwish:
         temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa")
     threads: 8
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 4000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
     priority: 100
     params:
         app_path=config['app.path'],
@@ -427,7 +427,7 @@ rule gfaffix_on_chr:
         transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt"
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 24000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 24000
     priority: 100
     params:
         app_path=config['app.path']
@@ -453,7 +453,7 @@ rule odgi_postprocessing:
         gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa'
     threads: 8
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 4000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
     priority: 100
     params:
         app_path=config['app.path']
@@ -518,7 +518,7 @@ rule generate_graph_list:
         "data/chrGraphs/graphsList.txt"
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 100
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 100
     priority: 100
     run:
         with open(output[0], "w") as handle:
@@ -537,7 +537,7 @@ rule graph_squeeze:
         time="logs/squeeze/"+config['name']+".squeeze.time.log",
     threads: 16
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
     priority: 100
     params:
         app_path=config['app.path']
@@ -563,7 +563,7 @@ rule graph_stats:
         pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv"
     threads: 4
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 6000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 6000
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -584,7 +584,7 @@ rule graph_figs:
         pcov="output/chrGraphs.figs/"+config['name']+".{chromosome}.pcov.png"
     threads: 4
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 4000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
     params:
         app_path=config['app.path'],
         oneDviz=config['odgi.1Dviz.params'],
@@ -609,7 +609,7 @@ rule aggregate_graphs_stats:
         pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv"
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -628,7 +628,7 @@ rule final_graph_tagging:
         "output/pan1c."+config['name']+".gfa.metadata"
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
     priority: 100
     params:
         app_path=config['app.path'],
@@ -648,7 +648,7 @@ rule pggb_input_stats:
         "output/stats/pan1c."+config['name']+".chrInput.stats.tsv"
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -668,7 +668,7 @@ rule core_statistics:
         dir = directory("output/pggb.usage.figs")
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -691,7 +691,7 @@ rule get_pav:
         directory("output/pav.matrices")
     threads: 16
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
     params:
         app_path=config['app.path']
     run:
@@ -718,7 +718,7 @@ rule panacus_stats:
         refname=config['reference']
     threads: 8
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
     shell:
         """
         /usr/bin/time -v -o {log.time} \
@@ -740,7 +740,7 @@ rule vg_deconstruct:
         vcf=temp("output/pan1c."+config['name']+".vcf"),
     threads: 8
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 32000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000
     params:
         app_path=config['app.path'],
         ref=config['reference']
@@ -768,7 +768,7 @@ rule vcf_fig:
         vcf_fig="output/pan1c."+config['name']+".vcf.png"
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 8000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
     params:
         app_path=config['app.path'],
         fig_config=config['vcf_fig.params'],
@@ -797,7 +797,7 @@ rule create_pan1c_report_fig:
         reportfig="output/report/"+config['name']+".{chromosome}.report.fig.png"
     threads: 4
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
     params:
         app_path=config['app.path']
     shell:
@@ -862,7 +862,7 @@ rule create_pan1c_report:
         html="output/pan1c."+config['name']+".report.html"
     threads: 4
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 500
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 500
     params:
         app_path=config['app.path'],
         add_ASMs_SyRI=config['get_ASMs_SyRI'],
diff --git a/rules/tools.smk b/rules/tools.smk
index bfb0da1..117f65e 100644
--- a/rules/tools.smk
+++ b/rules/tools.smk
@@ -17,7 +17,7 @@ rule samtools_index:
     threads: 1
     retries: 1
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 2000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
     params: 
         app_path=config["app.path"]
     shell:
@@ -33,7 +33,7 @@ rule run_bgzip:
     threads: 4
     retries: 1
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 8000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
     params:
         app_path=config["app.path"]
     shell:
@@ -50,7 +50,7 @@ rule gfa_2_xg:
         "{graph}.xg"
     threads: 8
     resources:
-        mem_mb = lambda wildcards, threads: 100 * threads * config["mem_multiplier"] * 8000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
     params:
         app_path=config["app.path"]
     shell:
-- 
GitLab
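
[Editor's illustration, not part of the patch series] After this patch every rule uses the same inline lambda, mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * base. A plain-Python check of what that evaluates to for a 16-thread rule with a 2000 MB base (the wfmash_on_chr values above), assuming the default mem_multiplier of 1:

    # Memory now grows linearly with threads and the user-set multiplier;
    # retry-based scaling is no longer part of the formula.
    config = {"mem_multiplier": 1}                        # default from config.yaml
    mem_mb = (lambda wildcards, threads: threads * config["mem_multiplier"] * 2000)(None, 16)
    print(mem_mb)                                         # 32000 MB requested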


From 844bf89c94507fda5f618a724eba43cb5f9b6c09 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 26 Jul 2024 17:05:46 +0200
Subject: [PATCH 025/310] Create runSnakemakeSLURM.sh

Template SLURM submission script for running the workflow on multiple nodes
---
 runSnakemakeSLURM.sh | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100755 runSnakemakeSLURM.sh

diff --git a/runSnakemakeSLURM.sh b/runSnakemakeSLURM.sh
new file mode 100755
index 0000000..3a9616e
--- /dev/null
+++ b/runSnakemakeSLURM.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#SBATCH --cpus-per-task=1
+#SBATCH -p unlimitq
+#SBATCH -o <log path>.log
+#SBATCH -J <jname>
+#SBATCH --mem=24G
+#SBATCH --mail-type=BEGIN,END,FAIL
+#SBATCH --mail-user=<mail>
+
+module purge
+module load containers/Apptainer/1.2.5
+module load devel/Miniconda/Miniconda3
+
+source activate <path_to_pan1c_env>
+
+apppath=<path_to_pan1c-box>
+
+# Creating DAG
+echo "Creating DAG ..."
+snakemake -c $(nproc) --dag | dot -Tsvg > workflow.svg
+# Running the workflow
+echo "Running the workflow ..."
+/usr/bin/time -v -o whole.run.time.log snakemake --executor slurm --jobs 10
+
-- 
GitLab


From bce1cd4970f856675e5d6928ae105f14fc64f6c3 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 29 Jul 2024 09:47:23 +0200
Subject: [PATCH 026/310] Fixed VCF_fig

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 2cb4e8f..cc43b08 100644
--- a/Snakefile
+++ b/Snakefile
@@ -776,7 +776,7 @@ rule vcf_fig:
     shell:
         """
         ## Producing TSV for the figure
-        awk -f scripts/vcf_2_tsv.awk {input} > {output.tsv}
+        zcat {input} | awk -f scripts/vcf_2_tsv.awk > {output.tsv}
 
         ## Running R to get the figure
         apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF.R \
-- 
GitLab


From 204ad6f065077424b0a92c0c767086f599a84424 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 29 Jul 2024 09:52:45 +0200
Subject: [PATCH 027/310] Fixed generate_graph_list

Bumped up the amount of memory required to prevent OOM
---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index cc43b08..6f40b95 100644
--- a/Snakefile
+++ b/Snakefile
@@ -518,7 +518,7 @@ rule generate_graph_list:
         "data/chrGraphs/graphsList.txt"
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 100
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
     priority: 100
     run:
         with open(output[0], "w") as handle:
-- 
GitLab
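
[Editor's illustration, not part of the patch series] A quick back-of-the-envelope comparison of the request before and after this bump, using the values from the diff above and the default mem_multiplier of 1:

    # generate_graph_list runs single-threaded, so the old 100 MB base left
    # almost no headroom; the new 2000 MB base matches the other light rules.
    threads, mem_multiplier = 1, 1
    old_mem_mb = threads * mem_multiplier * 100      # 100 MB before this patch
    new_mem_mb = threads * mem_multiplier * 2000     # 2000 MB after this patch
    print(old_mem_mb, new_mem_mb)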


From 2e3e02f5a3a97e2487c963d11e72ddd4691fe50a Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 29 Jul 2024 10:50:41 +0200
Subject: [PATCH 028/310] Update Snakefile

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 6f40b95..65aca82 100644
--- a/Snakefile
+++ b/Snakefile
@@ -893,12 +893,12 @@ rule create_pan1c_report:
 
         # Adding General stats
         shell("echo '# General stats' >> {output.report}")
-        shell("cat {input.genstats} | csv2md -d $'\\t' >> {output.report}")
+        shell("cat {input.genstats} | apptainer {params.app_path}/pan1c-env.sif csv2md -d $'\\t' >> {output.report}")
         shell("echo '' >> {output.report}")
 
         # Adding Path stats
         shell("echo '# Path stats' >> {output.report}")
-        shell("cat {input.pathstats} | csv2md -d $'\\t' >> {output.report}")
+        shell("cat {input.pathstats} | apptainer {params.app_path}/pan1c-env.sif csv2md -d $'\\t' >> {output.report}")
         shell("echo '' >> {output.report}")
 
         # Adding chromosomes figures
-- 
GitLab


From 6435aafc374a59486b7ba6c953e617e024afebb5 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 29 Jul 2024 10:54:28 +0200
Subject: [PATCH 029/310] Update Snakefile

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 65aca82..e592033 100644
--- a/Snakefile
+++ b/Snakefile
@@ -893,12 +893,12 @@ rule create_pan1c_report:
 
         # Adding General stats
         shell("echo '# General stats' >> {output.report}")
-        shell("cat {input.genstats} | apptainer {params.app_path}/pan1c-env.sif csv2md -d $'\\t' >> {output.report}")
+        shell("cat {input.genstats} | apptainer run {params.app_path}/pan1c-env.sif csv2md -d $'\\t' >> {output.report}")
         shell("echo '' >> {output.report}")
 
         # Adding Path stats
         shell("echo '# Path stats' >> {output.report}")
-        shell("cat {input.pathstats} | apptainer {params.app_path}/pan1c-env.sif csv2md -d $'\\t' >> {output.report}")
+        shell("cat {input.pathstats} | apptainer run {params.app_path}/pan1c-env.sif csv2md -d $'\\t' >> {output.report}")
         shell("echo '' >> {output.report}")
 
         # Adding chromosomes figures
-- 
GitLab


From 82d0321c5fe2ea1b6fb72a7c68173d75077a305e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 29 Jul 2024 16:30:38 +0200
Subject: [PATCH 030/310] Updated VCF figure script

---
 Snakefile             |  23 +++++----
 scripts/analyze_VCF.R | 105 ++++++++++++++++++++++++++++++++----------
 2 files changed, 95 insertions(+), 33 deletions(-)

diff --git a/Snakefile b/Snakefile
index e592033..5c2ccee 100644
--- a/Snakefile
+++ b/Snakefile
@@ -765,7 +765,7 @@ rule vcf_fig:
         vcf="output/pan1c."+config['name']+".vcf.gz"
     output:
         tsv=temp("output/pan1c."+config['name']+".vcf.tsv"),
-        vcf_fig="output/pan1c."+config['name']+".vcf.png"
+        vcf_fig=directory("output/vcf.figs")
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
@@ -775,10 +775,10 @@ rule vcf_fig:
         pan_name=config['name']
     shell:
         """
-        ## Producing TSV for the figure
+        ## Producing TSV for the figures
         zcat {input} | awk -f scripts/vcf_2_tsv.awk > {output.tsv}
 
-        ## Running R to get the figure
+        ## Running R to get the figures
         apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF.R \
             -t {output.tsv} \
             -o {output.vcf_fig} \
@@ -849,7 +849,7 @@ def get_report_sections(wildcards):
             )
 
     if config['get_VCF'] == "True":
-        sections['VCF_fig'] = "output/pan1c."+config['name']+".vcf.png"
+        sections['VCF_fig'] = "output/vcf.figs"
 
     return sections      
 
@@ -941,10 +941,17 @@ rule create_pan1c_report:
 
         # Adding VCF figure
         if params.add_VCF_fig == "True":
-            basename = os.path.basename(input.VCF_fig)
-            shell("echo '# INS/DEL distribution' >> {output.report}")
-            shell("echo '![{basename}](./{basename})' >> {output.report}")
-            shell("echo '' >> {output.report}")
+            shell("echo '# INS/DEL length distribution' >> {output.report}")
+            figures = [
+                fig for fig in os.listdir(input.VCF_fig)
+                if fig.split('.')[2] != "general"
+            ]
+            figures = ["pan1c."+config["name"]+".general.vcf.png", "pan1c."+config["name"]+".general.log.vcf.png"] + figures
+            for basename in figures:
+                name = basename.split('.')[2]
+                shell("echo '## {name}' >> {output.report}")
+                shell("echo '![{basename}](./vcf.figs/{basename})' >> {output.report}")
+                shell("echo '' >> {output.report}")
 
         # Converting to HTML
         shell("pandoc --standalone -c src/github-markdown.css -f gfm -t html {output.report} > {output.html}")
\ No newline at end of file
diff --git a/scripts/analyze_VCF.R b/scripts/analyze_VCF.R
index 29fe61c..df423e3 100644
--- a/scripts/analyze_VCF.R
+++ b/scripts/analyze_VCF.R
@@ -1,7 +1,7 @@
 # Script to plot the INS/DEL length from a VCF produced by 'VG deconstruct'
 # 
 # @author: alexis.mergez@inrae.fr
-# @version: 1.0
+# @version: 1.1
 
 library(optparse)
 
@@ -12,7 +12,7 @@ option_list = list(
     make_option(c("-t", "--tsv"), type="character", default=NULL, 
         help=".tsv input", metavar="character"),
     make_option(c("-o", "--out"), type="character", default=NULL, 
-        help="output file name", metavar="character"),
+        help="output directory", metavar="character"),
     make_option(c("-p", "--panname"), type="character", default=NULL, 
         help="pangenome name", metavar="character"),
     make_option(c("-W", "--width"), type="integer", default=18, 
@@ -35,8 +35,9 @@ write("[analyze_VCF] Parsing TSV ...", stdout())
 x <- read.delim(opt$tsv)
 
 sample = str_split_1(x$CHROM[1], "#")[1]
+x[c("HAPNAME", "HAPID")] = str_split_fixed(x$HAP, "#", 2)
 
-## Filtering too long and too short INS/DEL, splitting data into 2 dataframe by type 
+#% Filtering too long and too short INS/DEL, splitting data into 2 dataframe by type 
 write("[analyze_VCF] Filtering ...", stdout())
 INS = x[which(x$LEN >= -100000 & x$LEN <= -50), ]
 INS$LEN = -INS$LEN
@@ -44,7 +45,7 @@ INS$LEN = -INS$LEN
 DEL = x[which(x$LEN <= 100000 & x$LEN >= 50), ]
 
 
-## Passing to LOG for scale reasons
+#% Passing to LOG for scale reasons
 INS$LOGLEN = log10(INS$LEN)
 DEL$LOGLEN = -log10(DEL$LEN)
 
@@ -52,7 +53,7 @@ DEL$LOGLEN = -log10(DEL$LEN)
 title_text = element_text(face="bold", size = 12)
 colours = c("#78ABA8", "#C8CFA0", "#FCDC94", "#EF9C66")
 
-## Function to retrieve the legend from a plot as a dedicated plot (used in the multiplot command)
+#% Function to retrieve the legend from a plot as a dedicated plot (used in the multiplot command)
 get_legend<-function(myggplot){
   tmp <- ggplot_gtable(ggplot_build(myggplot))
   leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
@@ -60,8 +61,7 @@ get_legend<-function(myggplot){
   return(legend)
 }
 
-
-## Creating Bins from the log of the length
+#% Creating Bins from the log of the length
 write("[analyze_VCF] Binning ...", stdout())
 INS = INS %>% mutate(
     Bin = cut(
@@ -83,7 +83,7 @@ DEL = DEL %>% mutate(
         dig.lab=3)
     )
 
-## Summerizing the dataframe by haplotypes
+#% Summerizing the dataframe by haplotypes
 write("[analyze_VCF] Summerizing ...", stdout())
 INS_F = INS %>% 
 	group_by(HAP, Bin) %>%
@@ -92,6 +92,7 @@ INS_F = INS %>%
 		.groups = 'drop'
 	) %>%
 	mutate(x = factor(parse_number(as.character(Bin), locale = locale(decimal_mark = ".", grouping_mark = ""))))
+INS_F[c("HAPNAME", "HAPID")] = str_split_fixed(INS_F$HAP, "#", 2)
 
 DEL_F = DEL %>% 
 	group_by(HAP, Bin) %>%
@@ -100,11 +101,14 @@ DEL_F = DEL %>%
 		.groups = 'drop'
 	) %>%
 	mutate(x = factor(parse_number(as.character(Bin), locale = locale(decimal_mark = ".", grouping_mark = ""))))
-
-## Creating the graphs
-write("[analyze_VCF] Creating graphs ...", stdout())
-gInsHap = ggplot(INS_F, aes(x=x)) +
-	geom_line(aes(y=Count, group=HAP, color=HAP)) +
+DEL_F[c("HAPNAME", "HAPID")] = str_split_fixed(DEL_F$HAP, "#", 2)
+#% Creating the general graph, in log and non-log version
+
+## General function
+get_fig_log = function(INS_F, DEL_F, top_name){
+    # Insertion figure
+    figA = ggplot(INS_F, aes(x=x)) +
+	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
 	scale_y_continuous(trans='log10', position = "right") +
 	scale_x_discrete(
 		breaks=seq(0,5),
@@ -113,25 +117,76 @@ gInsHap = ggplot(INS_F, aes(x=x)) +
 	theme_bw() +
 	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
 
-gDelHap = ggplot(DEL_F, aes(x=x)) +
-	geom_line(aes(y=Count, group=HAP, color=HAP)) +
+    # Deletion figure
+    figB = ggplot(DEL_F, aes(x=x)) +
+	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
 	scale_y_continuous(trans='log10') +
 	scale_x_discrete(
 		breaks=-seq(0,5),
 		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
 	xlab("DEL") +
 	theme_bw() +
-	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none") 
+	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
+
+    legend = get_legend(figA)
+    figA = figA + theme(legend.position = "none")
 
-## Exporting the legend
-legend = get_legend(gInsHap)
-gInsHap = gInsHap + theme(legend.position = "none")
+    ## Combining the plots and the legend
+    figF = grid.arrange(figB, figA, legend, nrow = 1, top=paste0(c("Pan1c - ",top_name," - ",sample), collapse=""), widths=c(2.3, 2.3, 0.8))
+    return(figF)
+}
+
+get_fig = function(INS_F, DEL_F, top_name){
+    # Insertion figure
+    figA = ggplot(INS_F, aes(x=x)) +
+	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
+	scale_y_continuous(position = "right") +
+	scale_x_discrete(
+		breaks=seq(0,5),
+		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
+	xlab("INS") +
+	theme_bw() +
+	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
 
-## Combining the plots and the legend
-write("[analyze_VCF] Assembling graphs ...", stdout())
-gfHap = grid.arrange(gDelHap, gInsHap, legend, nrow = 1, top=paste0(c("Pan1c - ",opt$panname," - ",sample), collapse=""), widths=c(2.3, 2.3, 0.8))
+    # Deletion figure
+    figB = ggplot(DEL_F, aes(x=x)) +
+	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
+	scale_x_discrete(
+		breaks=-seq(0,5),
+		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
+	xlab("DEL") +
+	theme_bw() +
+	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
+
+    legend = get_legend(figA)
+    figA = figA + theme(legend.position = "none")
+
+    ## Combining the plots and the legend
+    figF = grid.arrange(figB, figA, legend, nrow = 1, top=paste0(c("Pan1c - ",top_name," - ",sample), collapse=""), widths=c(2.3, 2.3, 0.8))
+    return(figF)
+}
+
+#% Log version
+write("[analyze_VCF] Creating general graph (Log version) ...", stdout())
+FIG = get_fig_log(INS_F, DEL_F, opt$panname)
+sub_name = paste0(c("pan1c",opt$panname,"General_log","vcf","png"), collapse=".")
+ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
+
+#% Non log version
+write("[analyze_VCF] Creating general graph (Non-log version) ...", stdout())
+FIG = get_fig(INS_F, DEL_F, opt$panname)
+sub_name = paste0(c("pan1c",opt$panname,"General","vcf","png"), collapse=".")
+ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
+
+#% Individual haplotypes plots
+write("[analyze_VCF] Creating haplotype graphs  ...", stdout())
+for (hapname in unique(x$HAPNAME)){
+    # Getting the haps 
+    hapids = unique(x[x$HAPNAME == hapname,]$HAPID)
+    haps = paste(hapname, hapids, sep="#")
+    FIG = get_fig(INS_F[INS_F$HAP == haps,], DEL_F[DEL_F$HAP == haps,], paste0(c(opt$panname," - ",hapname), collapse=''))
+    sub_name = paste0(c("pan1c",opt$panname,hapname,"vcf","png"), collapse=".")
+    ggsave(paste0(c(opt$out, sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
+}
 
-#% Saving the figure
-write("[analyze_VCF] Exporting ...", stdout())
-ggsave(opt$out, plot=gfHap, width=opt$width, height=opt$height)
 write("[analyze_VCF] Done !", stdout())
\ No newline at end of file
-- 
GitLab
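
[Editor's illustration, not part of the patch series] The report rule above depends on the figure filename layout written by analyze_VCF.R, pan1c.<pan_name>.<tag>.vcf.png, where the third dot-separated field is either a haplotype name or the General/General_log tag. A small illustration with hypothetical names (it assumes the pangenome name itself contains no dots):

    # Index 2 of the dot-split is what the report uses as the section title.
    fig = "pan1c.mypan.hap1.vcf.png"   # hypothetical pangenome and haplotype names
    print(fig.split('.')[2])           # -> "hap1" ("General"/"General_log" for the summary figures)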


From 8b1b7965bcdece5bdaba067190c90aeff0b99a61 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 29 Jul 2024 16:32:47 +0200
Subject: [PATCH 031/310] Update Snakefile

---
 Snakefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Snakefile b/Snakefile
index 5c2ccee..a7d9a65 100644
--- a/Snakefile
+++ b/Snakefile
@@ -276,6 +276,7 @@ rule SyRI_on_ASM:
             -d {output.wrkdir} \
             -o $(basename {output.fig}) \
             -r {input.ref} \
+            -h 10 -w 20 -s "0.9" -f 10 \
             -q "{input.qry}" 2>&1 | \
             tee {log.cmd}
         
-- 
GitLab


From 8e9bd1ad6ce52618e33b0232a3a748ed259b803b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 29 Jul 2024 16:39:10 +0200
Subject: [PATCH 032/310] Update Snakefile

---
 Snakefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Snakefile b/Snakefile
index a7d9a65..67bb7ee 100644
--- a/Snakefile
+++ b/Snakefile
@@ -776,6 +776,8 @@ rule vcf_fig:
         pan_name=config['name']
     shell:
         """
+        mkdir {output.vcf_fig}
+
         ## Producing TSV for the figures
         zcat {input} | awk -f scripts/vcf_2_tsv.awk > {output.tsv}
 
-- 
GitLab


From 42c1760ddaf1466131d6cf3bef473d4893d9c44e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 29 Jul 2024 16:47:54 +0200
Subject: [PATCH 033/310] Update Snakefile

---
 Snakefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 67bb7ee..99733d0 100644
--- a/Snakefile
+++ b/Snakefile
@@ -947,7 +947,8 @@ rule create_pan1c_report:
             shell("echo '# INS/DEL length distribution' >> {output.report}")
             figures = [
                 fig for fig in os.listdir(input.VCF_fig)
-                if fig.split('.')[2] != "general"
+                if fig[-3:] == "png"
+                and fig.split('.')[2] != "general"
             ]
             figures = ["pan1c."+config["name"]+".general.vcf.png", "pan1c."+config["name"]+".general.log.vcf.png"] + figures
             for basename in figures:
-- 
GitLab


From 560916082ca99d89c51ae86756720661f556c6dd Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 29 Jul 2024 16:58:06 +0200
Subject: [PATCH 034/310] Update Snakefile

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 99733d0..028e877 100644
--- a/Snakefile
+++ b/Snakefile
@@ -948,9 +948,9 @@ rule create_pan1c_report:
             figures = [
                 fig for fig in os.listdir(input.VCF_fig)
                 if fig[-3:] == "png"
-                and fig.split('.')[2] != "general"
+                and fig.split('.')[2][:7] != "General"
             ]
-            figures = ["pan1c."+config["name"]+".general.vcf.png", "pan1c."+config["name"]+".general.log.vcf.png"] + figures
+            figures = ["pan1c."+config["name"]+".General.vcf.png", "pan1c."+config["name"]+".General_log.vcf.png"] + figures
             for basename in figures:
                 name = basename.split('.')[2]
                 shell("echo '## {name}' >> {output.report}")
-- 
GitLab


From e6c71599cf63bd721ee22815a3bcf0d2bc9387d8 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 29 Jul 2024 17:08:46 +0200
Subject: [PATCH 035/310] Update Snakefile

---
 Snakefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Snakefile b/Snakefile
index 028e877..71fc98a 100644
--- a/Snakefile
+++ b/Snakefile
@@ -950,6 +950,7 @@ rule create_pan1c_report:
                 if fig[-3:] == "png"
                 and fig.split('.')[2][:7] != "General"
             ]
+            figures.sort()
             figures = ["pan1c."+config["name"]+".General.vcf.png", "pan1c."+config["name"]+".General_log.vcf.png"] + figures
             for basename in figures:
                 name = basename.split('.')[2]
-- 
GitLab


From 1745ae37cea042a764c66f4e6bec13f5c8f89af0 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 29 Jul 2024 17:32:28 +0200
Subject: [PATCH 036/310] Update analyze_VCF.R

---
 scripts/analyze_VCF.R | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scripts/analyze_VCF.R b/scripts/analyze_VCF.R
index df423e3..06a6ac3 100644
--- a/scripts/analyze_VCF.R
+++ b/scripts/analyze_VCF.R
@@ -113,6 +113,7 @@ get_fig_log = function(INS_F, DEL_F, top_name){
 	scale_x_discrete(
 		breaks=seq(0,5),
 		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
+    expand_limits(x=c(0, 5)) +
 	xlab("INS") +
 	theme_bw() +
 	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
@@ -124,6 +125,7 @@ get_fig_log = function(INS_F, DEL_F, top_name){
 	scale_x_discrete(
 		breaks=-seq(0,5),
 		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
+    expand_limits(x=c(0, 5)) +
 	xlab("DEL") +
 	theme_bw() +
 	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
@@ -144,6 +146,7 @@ get_fig = function(INS_F, DEL_F, top_name){
 	scale_x_discrete(
 		breaks=seq(0,5),
 		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
+    expand_limits(x=c(0, 5)) +
 	xlab("INS") +
 	theme_bw() +
 	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
@@ -154,6 +157,7 @@ get_fig = function(INS_F, DEL_F, top_name){
 	scale_x_discrete(
 		breaks=-seq(0,5),
 		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
+    expand_limits(x=c(0, 5)) +
 	xlab("DEL") +
 	theme_bw() +
 	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
-- 
GitLab


From b6ac62c2bc005cb4acb02afa1e4951d86f5dc815 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 29 Jul 2024 17:52:05 +0200
Subject: [PATCH 037/310] Update analyze_VCF.R

---
 scripts/analyze_VCF.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/analyze_VCF.R b/scripts/analyze_VCF.R
index 06a6ac3..c5bd5f1 100644
--- a/scripts/analyze_VCF.R
+++ b/scripts/analyze_VCF.R
@@ -113,7 +113,7 @@ get_fig_log = function(INS_F, DEL_F, top_name){
 	scale_x_discrete(
 		breaks=seq(0,5),
 		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
-    expand_limits(x=c(0, 5)) +
+    expand_limits(x=c(0)) +
 	xlab("INS") +
 	theme_bw() +
 	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
@@ -125,7 +125,7 @@ get_fig_log = function(INS_F, DEL_F, top_name){
 	scale_x_discrete(
 		breaks=-seq(0,5),
 		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
-    expand_limits(x=c(0, 5)) +
+    expand_limits(x=c(0)) +
 	xlab("DEL") +
 	theme_bw() +
 	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
@@ -146,7 +146,7 @@ get_fig = function(INS_F, DEL_F, top_name){
 	scale_x_discrete(
 		breaks=seq(0,5),
 		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
-    expand_limits(x=c(0, 5)) +
+    expand_limits(x=c(0)) +
 	xlab("INS") +
 	theme_bw() +
 	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
@@ -157,7 +157,7 @@ get_fig = function(INS_F, DEL_F, top_name){
 	scale_x_discrete(
 		breaks=-seq(0,5),
 		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
-    expand_limits(x=c(0, 5)) +
+    expand_limits(x=c(0)) +
 	xlab("DEL") +
 	theme_bw() +
 	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
-- 
GitLab


From 5f30682c1c27f82206c27fda8743863515ade9d3 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 29 Jul 2024 17:53:13 +0200
Subject: [PATCH 038/310] Update analyze_VCF.R

---
 scripts/analyze_VCF.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/analyze_VCF.R b/scripts/analyze_VCF.R
index c5bd5f1..46d4b9a 100644
--- a/scripts/analyze_VCF.R
+++ b/scripts/analyze_VCF.R
@@ -193,4 +193,5 @@ for (hapname in unique(x$HAPNAME)){
     ggsave(paste0(c(opt$out, sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
 }
 
-write("[analyze_VCF] Done !", stdout())
\ No newline at end of file
+write("[analyze_VCF] Done !", stdout())
+warnings()
\ No newline at end of file
-- 
GitLab


From e03e5262b8f34b89df66798d2c51e6982e7aca1f Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 29 Jul 2024 17:55:24 +0200
Subject: [PATCH 039/310] Update analyze_VCF.R

---
 scripts/analyze_VCF.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/analyze_VCF.R b/scripts/analyze_VCF.R
index 46d4b9a..400adcd 100644
--- a/scripts/analyze_VCF.R
+++ b/scripts/analyze_VCF.R
@@ -188,7 +188,7 @@ for (hapname in unique(x$HAPNAME)){
     # Getting the haps 
     hapids = unique(x[x$HAPNAME == hapname,]$HAPID)
     haps = paste(hapname, hapids, sep="#")
-    FIG = get_fig(INS_F[INS_F$HAP == haps,], DEL_F[DEL_F$HAP == haps,], paste0(c(opt$panname," - ",hapname), collapse=''))
+    FIG = get_fig(INS_F[INS_F$HAP %in% haps,], DEL_F[DEL_F$HAP %in% haps,], paste0(c(opt$panname," - ",hapname), collapse=''))
     sub_name = paste0(c("pan1c",opt$panname,hapname,"vcf","png"), collapse=".")
     ggsave(paste0(c(opt$out, sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
 }
-- 
GitLab


From d4fc60574ba58484e3ad71e948c238919ef109da Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 29 Jul 2024 17:57:16 +0200
Subject: [PATCH 040/310] Update analyze_VCF.R

---
 scripts/analyze_VCF.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/analyze_VCF.R b/scripts/analyze_VCF.R
index 400adcd..ebcde27 100644
--- a/scripts/analyze_VCF.R
+++ b/scripts/analyze_VCF.R
@@ -110,10 +110,10 @@ get_fig_log = function(INS_F, DEL_F, top_name){
     figA = ggplot(INS_F, aes(x=x)) +
 	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
 	scale_y_continuous(trans='log10', position = "right") +
+    expand_limits(x=c(0)) +
 	scale_x_discrete(
 		breaks=seq(0,5),
 		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
-    expand_limits(x=c(0)) +
 	xlab("INS") +
 	theme_bw() +
 	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
@@ -122,10 +122,10 @@ get_fig_log = function(INS_F, DEL_F, top_name){
     figB = ggplot(DEL_F, aes(x=x)) +
 	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
 	scale_y_continuous(trans='log10') +
+    expand_limits(x=c(0)) +
 	scale_x_discrete(
 		breaks=-seq(0,5),
 		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
-    expand_limits(x=c(0)) +
 	xlab("DEL") +
 	theme_bw() +
 	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
@@ -143,10 +143,10 @@ get_fig = function(INS_F, DEL_F, top_name){
     figA = ggplot(INS_F, aes(x=x)) +
 	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
 	scale_y_continuous(position = "right") +
+    expand_limits(x=c(0)) +
 	scale_x_discrete(
 		breaks=seq(0,5),
 		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
-    expand_limits(x=c(0)) +
 	xlab("INS") +
 	theme_bw() +
 	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
@@ -154,10 +154,10 @@ get_fig = function(INS_F, DEL_F, top_name){
     # Deletion figure
     figB = ggplot(DEL_F, aes(x=x)) +
 	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
+    expand_limits(x=c(0)) +
 	scale_x_discrete(
 		breaks=-seq(0,5),
 		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
-    expand_limits(x=c(0)) +
 	xlab("DEL") +
 	theme_bw() +
 	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
-- 
GitLab


From f0d32576db9c1b555e93c8d6c6a287ebebf3e673 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 30 Jul 2024 11:27:08 +0200
Subject: [PATCH 041/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 71fc98a..34ae016 100644
--- a/Snakefile
+++ b/Snakefile
@@ -73,7 +73,7 @@ def which_analysis():
             )
     if config["get_VCF"] == "True": # VCF from the final graph against the "reference"
         analysis_inputs.append(
-            "output/pan1c."+config['name']+".vcf.png"
+            "output/vcf.figs"
         ) 
     return analysis_inputs
 
-- 
GitLab


From e37bd43d1caa253b807b4fac7043f8e3453ae072 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 30 Jul 2024 13:40:37 +0200
Subject: [PATCH 042/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 34ae016..2f65eff 100644
--- a/Snakefile
+++ b/Snakefile
@@ -629,7 +629,7 @@ rule final_graph_tagging:
         "output/pan1c."+config['name']+".gfa.metadata"
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
     priority: 100
     params:
         app_path=config['app.path'],
-- 
GitLab


From e280d1fd1ab7020d79842600cbf1568912628835 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 31 Jul 2024 10:05:30 +0200
Subject: [PATCH 043/310] Bumped default resources for Quast

---
 Snakefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Snakefile b/Snakefile
index 2f65eff..f1301d1 100644
--- a/Snakefile
+++ b/Snakefile
@@ -110,7 +110,7 @@ rule ragtag_scaffolding:
         tar="data/hap.ragtagged/{haplotype}.tar.gz"
     threads: 8
     resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000 
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000 
     retries: 1
     priority: 100
     params:
@@ -145,9 +145,9 @@ rule quast_stats:
         ref="data/haplotypes/"+config['reference']
     output:
         report="output/quast/"+config['name']+".quast.report.html"
-    threads: 8
+    threads: 16
     resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
     params:
         app_path=config["app.path"],
         pan_name=config["name"]
-- 
GitLab


From 5b82faceb25f82b026f2675e5f614c33f0488108 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 31 Jul 2024 11:54:38 +0200
Subject: [PATCH 044/310] Added INDEL distribution for SyRI

---
 Snakefile                                   | 66 ++++++++++++++++++---
 scripts/vcf_2_tsv_syri.awk                  | 12 ++++
 scripts/{vcf_2_tsv.awk => vcf_2_tsv_vg.awk} |  0
 3 files changed, 69 insertions(+), 9 deletions(-)
 create mode 100644 scripts/vcf_2_tsv_syri.awk
 rename scripts/{vcf_2_tsv.awk => vcf_2_tsv_vg.awk} (100%)

diff --git a/Snakefile b/Snakefile
index f1301d1..e944eea 100644
--- a/Snakefile
+++ b/Snakefile
@@ -73,7 +73,7 @@ def which_analysis():
             )
     if config["get_VCF"] == "True": # VCF from the final graph against the "reference"
         analysis_inputs.append(
-            "output/vcf.figs"
+            "output/vcf.figs.vg"
         ) 
     return analysis_inputs
 
@@ -760,13 +760,13 @@ rule vg_deconstruct:
                 2> >(tee {log.cmd} >&2)
         """
 
-rule vcf_fig:
-    # Produce a figure describing INS/DEL length distribution
+rule vcf_fig_vg:
+    # Produce a figure describing INS/DEL length distribution from vg deconstruct
     input:
         vcf="output/pan1c."+config['name']+".vcf.gz"
     output:
         tsv=temp("output/pan1c."+config['name']+".vcf.tsv"),
-        vcf_fig=directory("output/vcf.figs")
+        vcf_fig=directory("output/vcf.figs.vg")
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
@@ -779,7 +779,53 @@ rule vcf_fig:
         mkdir {output.vcf_fig}
 
         ## Producing TSV for the figures
-        zcat {input} | awk -f scripts/vcf_2_tsv.awk > {output.tsv}
+        zcat {input} | awk -f scripts/vcf_2_tsv_vg.awk > {output.tsv}
+
+        ## Running R to get the figures
+        apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF.R \
+            -t {output.tsv} \
+            -o {output.vcf_fig} \
+            -p {params.pan_name} \
+            {params.fig_config}
+        """
+
+rule vcf_fig_syri:
+    # Produce a figure describing INS/DEL length distribution from syri
+    input: 
+        folder='data/asm.syri/',
+        flags=expand('data/asm.syri/{haplotype}', haplotype=SAMPLES_NOREF)
+    output:
+        inter_tsv=expand('data/asm.syri/{haplotype}.vcf.tsv', haplotype=SAMPLES_NOREF)
+        tsv=temp("output/pan1c."+config['name']+".vcf.syri.tsv"),
+        vcf_fig=directory("output/vcf.figs.syri")
+    threads: 1
+    resources:
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
+    params:
+        app_path=config['app.path'],
+        fig_config=config['vcf_fig.params'],
+        pan_name=config['name'],
+        refname=config['reference']
+    shell:
+        """
+        mkdir {output.vcf_fig}
+
+        RHAP = $(basename {params.refname} .fa.gz | cut -f1 -d'.')
+        RHAPN = $(basename {params.refname} .fa.gz | cut -f2 -d'.' | cut -f2 -d'p')
+
+        ## Going through all folders
+        for folder in {input.folder}; do
+            THAP = $(basename $folder | cut -f1 -d'.')
+            THAPN = $(basename $folder | cut -f2 -d'.' | cut -f2 -d'p')
+
+            # Producing intermediate TSVs
+            cat $folder/*.vcf | \
+                awk -v THAP=$THAP -v THAPN=$THAPN -v RHAP=$RHAP -v RHAPN=$RHAPN -f scripts/vcf_2_tsv_syri.awk \
+                > {input.folder}/$(basename $folder).vcf.tsv
+
+        ## Merging TSVs
+        head -n1 {input.folder}/$(basename $folder).vcf.tsv > {output.tsv}
+        tail -n +2 {input.folder}/*.vcf.tsv >> {output.tsv}
 
         ## Running R to get the figures
         apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF.R \
@@ -852,7 +898,8 @@ def get_report_sections(wildcards):
             )
 
     if config['get_VCF'] == "True":
-        sections['VCF_fig'] = "output/vcf.figs"
+        sections['VCF_fig_vg'] = "output/vcf.figs.vg"
+        sections['VCF_fig_syri'] = "output/vcf.figs.syri"
 
     return sections      
 
@@ -942,11 +989,11 @@ rule create_pan1c_report:
 
             shell("echo '' >> {output.report}")
 
-        # Adding VCF figure
+        # Adding VCF figure from vg
         if params.add_VCF_fig == "True":
             shell("echo '# INS/DEL length distribution' >> {output.report}")
             figures = [
-                fig for fig in os.listdir(input.VCF_fig)
+                fig for fig in os.listdir(input.VCF_fig_vg)
                 if fig[-3:] == "png"
                 and fig.split('.')[2][:7] != "General"
             ]
@@ -955,7 +1002,8 @@ rule create_pan1c_report:
             for basename in figures:
                 name = basename.split('.')[2]
                 shell("echo '## {name}' >> {output.report}")
-                shell("echo '![{basename}](./vcf.figs/{basename})' >> {output.report}")
+                shell("echo '![{basename}](./vcf.figs.vg/{basename})' >> {output.report}")
+                shell("echo '![{basename}](./vcf.figs.syri/{basename})' >> {output.report}")
                 shell("echo '' >> {output.report}")
 
         # Converting to HTML
diff --git a/scripts/vcf_2_tsv_syri.awk b/scripts/vcf_2_tsv_syri.awk
new file mode 100644
index 0000000..1546fa4
--- /dev/null
+++ b/scripts/vcf_2_tsv_syri.awk
@@ -0,0 +1,12 @@
+# Header
+/^#CHROM/ { print "CHROM\tPOS\tID\tHAP\tLEN" }
+
+
+# INS/DEL counter
+!/^#/ { 
+    TYPE=substr($3, 1, 3)
+    if (TYPE=="DEL" || TYPE=="INS") {
+	    ALTLEN=length($4)-length($5)
+	    printf("%s#%s#%s\t%s\t%s\t%s#%s\t%s\n", RHAP, RHAPN, $1, $2, $3, THAP, THAPN, ALTLEN)
+	}
+}
\ No newline at end of file
diff --git a/scripts/vcf_2_tsv.awk b/scripts/vcf_2_tsv_vg.awk
similarity index 100%
rename from scripts/vcf_2_tsv.awk
rename to scripts/vcf_2_tsv_vg.awk
-- 
GitLab


From 2f3af9c8d18406d6dc524633b2f149cbc78da28b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 31 Jul 2024 11:55:29 +0200
Subject: [PATCH 045/310] Typo

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index e944eea..e53f6f6 100644
--- a/Snakefile
+++ b/Snakefile
@@ -795,7 +795,7 @@ rule vcf_fig_syri:
         folder='data/asm.syri/',
         flags=expand('data/asm.syri/{haplotype}', haplotype=SAMPLES_NOREF)
     output:
-        inter_tsv=expand('data/asm.syri/{haplotype}.vcf.tsv', haplotype=SAMPLES_NOREF)
+        inter_tsv=expand('data/asm.syri/{haplotype}.vcf.tsv', haplotype=SAMPLES_NOREF),
         tsv=temp("output/pan1c."+config['name']+".vcf.syri.tsv"),
         vcf_fig=directory("output/vcf.figs.syri")
     threads: 1
-- 
GitLab


From 51f553dcf97657acf63541e9899dfcd148298c52 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 31 Jul 2024 12:02:05 +0200
Subject: [PATCH 046/310] Typo

---
 Snakefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Snakefile b/Snakefile
index e53f6f6..6d91adb 100644
--- a/Snakefile
+++ b/Snakefile
@@ -810,13 +810,13 @@ rule vcf_fig_syri:
         """
         mkdir {output.vcf_fig}
 
-        RHAP = $(basename {params.refname} .fa.gz | cut -f1 -d'.')
-        RHAPN = $(basename {params.refname} .fa.gz | cut -f2 -d'.' | cut -f2 -d'p')
+        RHAP=$(basename {params.refname} .fa.gz | cut -f1 -d'.')
+        RHAPN=$(basename {params.refname} .fa.gz | cut -f2 -d'.' | cut -f2 -d'p')
 
         ## Going through all folders
         for folder in {input.folder}; do
-            THAP = $(basename $folder | cut -f1 -d'.')
-            THAPN = $(basename $folder | cut -f2 -d'.' | cut -f2 -d'p')
+            THAP=$(basename $folder | cut -f1 -d'.')
+            THAPN=$(basename $folder | cut -f2 -d'.' | cut -f2 -d'p')
 
             # Producing intermediate TSVs
             cat $folder/*.vcf | \
-- 
GitLab


From ea65739b29f7ea33ad87690c9c65c9d897d5b736 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 31 Jul 2024 12:10:46 +0200
Subject: [PATCH 047/310] Update Snakefile

---
 Snakefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 6d91adb..1511d0b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -822,7 +822,8 @@ rule vcf_fig_syri:
             cat $folder/*.vcf | \
                 awk -v THAP=$THAP -v THAPN=$THAPN -v RHAP=$RHAP -v RHAPN=$RHAPN -f scripts/vcf_2_tsv_syri.awk \
                 > {input.folder}/$(basename $folder).vcf.tsv
-
+        done
+        
         ## Merging TSVs
         head -n1 {input.folder}/$(basename $folder).vcf.tsv > {output.tsv}
         tail -n +2 {input.folder}/*.vcf.tsv >> {output.tsv}
-- 
GitLab


From 5ee15f28ef822d47f0d288fbc77a36a03f6c62d8 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 31 Jul 2024 12:17:06 +0200
Subject: [PATCH 048/310] Update Snakefile

---
 Snakefile | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/Snakefile b/Snakefile
index 1511d0b..c98fa65 100644
--- a/Snakefile
+++ b/Snakefile
@@ -814,16 +814,18 @@ rule vcf_fig_syri:
         RHAPN=$(basename {params.refname} .fa.gz | cut -f2 -d'.' | cut -f2 -d'p')
 
         ## Going through all folders
-        for folder in {input.folder}; do
-            THAP=$(basename $folder | cut -f1 -d'.')
-            THAPN=$(basename $folder | cut -f2 -d'.' | cut -f2 -d'p')
-
-            # Producing intermediate TSVs
-            cat $folder/*.vcf | \
-                awk -v THAP=$THAP -v THAPN=$THAPN -v RHAP=$RHAP -v RHAPN=$RHAPN -f scripts/vcf_2_tsv_syri.awk \
-                > {input.folder}/$(basename $folder).vcf.tsv
+        for folder in {input.folder}/*; do
+            if [ -d $folder ]; then
+                THAP=$(basename $folder | cut -f1 -d'.')
+                THAPN=$(basename $folder | cut -f2 -d'.' | cut -f2 -d'p')
+
+                # Producing intermediate TSVs
+                cat $folder/*.vcf | \
+                    awk -v THAP=$THAP -v THAPN=$THAPN -v RHAP=$RHAP -v RHAPN=$RHAPN -f scripts/vcf_2_tsv_syri.awk \
+                    > {input.folder}/$(basename $folder).vcf.tsv
+            fi
         done
-        
+
         ## Merging TSVs
         head -n1 {input.folder}/$(basename $folder).vcf.tsv > {output.tsv}
         tail -n +2 {input.folder}/*.vcf.tsv >> {output.tsv}
-- 
GitLab


From b7ccc387622d004e5fad271e0edd0a204d6a7d11 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 31 Jul 2024 12:22:29 +0200
Subject: [PATCH 049/310] Debug

---
 Snakefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Snakefile b/Snakefile
index c98fa65..3492358 100644
--- a/Snakefile
+++ b/Snakefile
@@ -829,6 +829,7 @@ rule vcf_fig_syri:
         ## Merging TSVs
         head -n1 {input.folder}/$(basename $folder).vcf.tsv > {output.tsv}
         tail -n +2 {input.folder}/*.vcf.tsv >> {output.tsv}
+        cp {output.tsv} {output.tsv}.SAVE
 
         ## Running R to get the figures
         apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF.R \
-- 
GitLab


From b8368c8aef2d3473f34020e1c7b9ee4e1046449e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 31 Jul 2024 12:26:04 +0200
Subject: [PATCH 050/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 3492358..0b3a738 100644
--- a/Snakefile
+++ b/Snakefile
@@ -828,7 +828,7 @@ rule vcf_fig_syri:
 
         ## Merging TSVs
         head -n1 {input.folder}/$(basename $folder).vcf.tsv > {output.tsv}
-        tail -n +2 {input.folder}/*.vcf.tsv >> {output.tsv}
+        tail -n +2  -q {input.folder}/*.vcf.tsv >> {output.tsv}
         cp {output.tsv} {output.tsv}.SAVE
 
         ## Running R to get the figures
-- 
GitLab


From 322b8a585fa013e56177cbb2447bc1ca48e3782b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 31 Jul 2024 12:35:23 +0200
Subject: [PATCH 051/310] Added tool name to analyze_VCF figures

---
 Snakefile             |  2 ++
 scripts/analyze_VCF.R | 16 +++++++++-------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/Snakefile b/Snakefile
index 0b3a738..8709808 100644
--- a/Snakefile
+++ b/Snakefile
@@ -786,6 +786,7 @@ rule vcf_fig_vg:
             -t {output.tsv} \
             -o {output.vcf_fig} \
             -p {params.pan_name} \
+            --tool "VG" \
             {params.fig_config}
         """
 
@@ -836,6 +837,7 @@ rule vcf_fig_syri:
             -t {output.tsv} \
             -o {output.vcf_fig} \
             -p {params.pan_name} \
+            --tool "SyRI" \
             {params.fig_config}
         """
 
diff --git a/scripts/analyze_VCF.R b/scripts/analyze_VCF.R
index ebcde27..9d0f598 100644
--- a/scripts/analyze_VCF.R
+++ b/scripts/analyze_VCF.R
@@ -19,6 +19,8 @@ option_list = list(
         help="Figure width", metavar="integer"),
     make_option(c("-H", "--height"), type="integer", default=6, 
         help="Figure height", metavar="integer")
+    make_option(c("--tool"), type="character", default=NULL, 
+        help="VCF creation tool (VG, SyRI, ...)", metavar="character"),
 );
 
 opt_parser = OptionParser(option_list=option_list);
@@ -105,7 +107,7 @@ DEL_F[c("HAPNAME", "HAPID")] = str_split_fixed(DEL_F$HAP, "#", 2)
 #% Creating the general graph, in log and non-log version
 
 ## General function
-get_fig_log = function(INS_F, DEL_F, top_name){
+get_fig_log = function(INS_F, DEL_F, top_name, tool_name){
     # Insertion figure
     figA = ggplot(INS_F, aes(x=x)) +
 	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
@@ -134,11 +136,11 @@ get_fig_log = function(INS_F, DEL_F, top_name){
     figA = figA + theme(legend.position = "none")
 
     ## Combining the plots and the legend
-    figF = grid.arrange(figB, figA, legend, nrow = 1, top=paste0(c("Pan1c - ",top_name," - ",sample), collapse=""), widths=c(2.3, 2.3, 0.8))
+    figF = grid.arrange(figB, figA, legend, nrow = 1, top=paste0(c("Pan1c - ",top_name," - ",sample," - ",tool_name), collapse=""), widths=c(2.3, 2.3, 0.8))
     return(figF)
 }
 
-get_fig = function(INS_F, DEL_F, top_name){
+get_fig = function(INS_F, DEL_F, top_name, tool_name){
     # Insertion figure
     figA = ggplot(INS_F, aes(x=x)) +
 	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
@@ -166,19 +168,19 @@ get_fig = function(INS_F, DEL_F, top_name){
     figA = figA + theme(legend.position = "none")
 
     ## Combining the plots and the legend
-    figF = grid.arrange(figB, figA, legend, nrow = 1, top=paste0(c("Pan1c - ",top_name," - ",sample), collapse=""), widths=c(2.3, 2.3, 0.8))
+    figF = grid.arrange(figB, figA, legend, nrow = 1, top=paste0(c("Pan1c - ",top_name," - ",sample," - ",tool_name), collapse=""), widths=c(2.3, 2.3, 0.8))
     return(figF)
 }
 
 #% Log version
 write("[analyze_VCF] Creating general graph (Log version) ...", stdout())
-FIG = get_fig_log(INS_F, DEL_F, opt$panname)
+FIG = get_fig_log(INS_F, DEL_F, opt$panname, opt$tool)
 sub_name = paste0(c("pan1c",opt$panname,"General_log","vcf","png"), collapse=".")
 ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
 
 #% Non log version
 write("[analyze_VCF] Creating general graph (Non-log version) ...", stdout())
-FIG = get_fig(INS_F, DEL_F, opt$panname)
+FIG = get_fig(INS_F, DEL_F, opt$panname, opt$tool)
 sub_name = paste0(c("pan1c",opt$panname,"General","vcf","png"), collapse=".")
 ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
 
@@ -188,7 +190,7 @@ for (hapname in unique(x$HAPNAME)){
     # Getting the haps 
     hapids = unique(x[x$HAPNAME == hapname,]$HAPID)
     haps = paste(hapname, hapids, sep="#")
-    FIG = get_fig(INS_F[INS_F$HAP %in% haps,], DEL_F[DEL_F$HAP %in% haps,], paste0(c(opt$panname," - ",hapname), collapse=''))
+    FIG = get_fig(INS_F[INS_F$HAP %in% haps,], DEL_F[DEL_F$HAP %in% haps,], paste0(c(opt$panname," - ",hapname), collapse=''), opt$tool)
     sub_name = paste0(c("pan1c",opt$panname,hapname,"vcf","png"), collapse=".")
     ggsave(paste0(c(opt$out, sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
 }
-- 
GitLab


From 01d43dfb0270aea5e1661976f7181dd73572ae48 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 31 Jul 2024 12:39:31 +0200
Subject: [PATCH 052/310] Typo

---
 scripts/analyze_VCF.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/analyze_VCF.R b/scripts/analyze_VCF.R
index 9d0f598..37884a7 100644
--- a/scripts/analyze_VCF.R
+++ b/scripts/analyze_VCF.R
@@ -18,7 +18,7 @@ option_list = list(
     make_option(c("-W", "--width"), type="integer", default=18, 
         help="Figure width", metavar="integer"),
     make_option(c("-H", "--height"), type="integer", default=6, 
-        help="Figure height", metavar="integer")
+        help="Figure height", metavar="integer"),
     make_option(c("--tool"), type="character", default=NULL, 
         help="VCF creation tool (VG, SyRI, ...)", metavar="character"),
 );
-- 
GitLab


From 93dfa6a43ffefc41599e2b3090c58ff4713aebef Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 31 Jul 2024 12:58:12 +0200
Subject: [PATCH 053/310] Updated analyze_VCF args

---
 Snakefile             | 4 ++--
 scripts/analyze_VCF.R | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Snakefile b/Snakefile
index 8709808..c253111 100644
--- a/Snakefile
+++ b/Snakefile
@@ -786,7 +786,7 @@ rule vcf_fig_vg:
             -t {output.tsv} \
             -o {output.vcf_fig} \
             -p {params.pan_name} \
-            --tool "VG" \
+            -T "VG" \
             {params.fig_config}
         """
 
@@ -837,7 +837,7 @@ rule vcf_fig_syri:
             -t {output.tsv} \
             -o {output.vcf_fig} \
             -p {params.pan_name} \
-            --tool "SyRI" \
+            -T "SyRI" \
             {params.fig_config}
         """
 
diff --git a/scripts/analyze_VCF.R b/scripts/analyze_VCF.R
index 37884a7..f7be795 100644
--- a/scripts/analyze_VCF.R
+++ b/scripts/analyze_VCF.R
@@ -15,12 +15,12 @@ option_list = list(
         help="output directory", metavar="character"),
     make_option(c("-p", "--panname"), type="character", default=NULL, 
         help="pangenome name", metavar="character"),
+    make_option(c("-T, --tool"), type="character", default=NULL, 
+        help="VCF creation tool (VG, SyRI, ...)", metavar="character"),
     make_option(c("-W", "--width"), type="integer", default=18, 
         help="Figure width", metavar="integer"),
     make_option(c("-H", "--height"), type="integer", default=6, 
-        help="Figure height", metavar="integer"),
-    make_option(c("--tool"), type="character", default=NULL, 
-        help="VCF creation tool (VG, SyRI, ...)", metavar="character"),
+        help="Figure height", metavar="integer")
 );
 
 opt_parser = OptionParser(option_list=option_list);
-- 
GitLab


From bc0480a0e77086d399bb22d9aeae910e24a9a322 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 31 Jul 2024 13:02:17 +0200
Subject: [PATCH 054/310] Update analyze_VCF.R

---
 scripts/analyze_VCF.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/analyze_VCF.R b/scripts/analyze_VCF.R
index f7be795..40886f9 100644
--- a/scripts/analyze_VCF.R
+++ b/scripts/analyze_VCF.R
@@ -15,7 +15,7 @@ option_list = list(
         help="output directory", metavar="character"),
     make_option(c("-p", "--panname"), type="character", default=NULL, 
         help="pangenome name", metavar="character"),
-    make_option(c("-T, --tool"), type="character", default=NULL, 
+    make_option(c("-T", "--tool"), type="character", default=NULL, 
         help="VCF creation tool (VG, SyRI, ...)", metavar="character"),
     make_option(c("-W", "--width"), type="integer", default=18, 
         help="Figure width", metavar="integer"),
-- 
GitLab


From 15485fa23074172ee20c5902f2dd2bb77c857b49 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 31 Jul 2024 13:40:40 +0200
Subject: [PATCH 055/310] Update vcf_2_tsv_syri.awk

---
 scripts/vcf_2_tsv_syri.awk | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/scripts/vcf_2_tsv_syri.awk b/scripts/vcf_2_tsv_syri.awk
index 1546fa4..598653d 100644
--- a/scripts/vcf_2_tsv_syri.awk
+++ b/scripts/vcf_2_tsv_syri.awk
@@ -2,11 +2,20 @@
 /^#CHROM/ { print "CHROM\tPOS\tID\tHAP\tLEN" }
 
 
-# INS/DEL counter
-!/^#/ { 
-    TYPE=substr($3, 1, 3)
-    if (TYPE=="DEL" || TYPE=="INS") {
-	    ALTLEN=length($4)-length($5)
-	    printf("%s#%s#%s\t%s\t%s\t%s#%s\t%s\n", RHAP, RHAPN, $1, $2, $3, THAP, THAPN, ALTLEN)
-	}
+# INS/DEL counter (Counting SyRI typed INDEL)
+#!/^#/ { 
+#    TYPE=substr($3, 1, 3)
+#    if (TYPE=="DEL" || TYPE=="INS") {
+#	    ALTLEN=length($4)-length($5)
+#	    printf("%s#%s#%s\t%s\t%s\t%s#%s\t%s\n", RHAP, RHAPN, $1, $2, $3, THAP, THAPN, ALTLEN)
+#	}
+#}
+
+# INS/DEL counter (Dumb style counting)
+!/^#/ {
+    TYPE=substr($3, 1, 1)
+    if (TYPE!="<") {
+        ALTLEN=length($4)-length($5)
+        printf("%s#%s#%s\t%s\t%s\t%s#%s\t%s\n", RHAP, RHAPN, $1, $2, $3, THAP, THAPN, ALTLEN)
+    }
 }
\ No newline at end of file
-- 
GitLab


From aa2c5eee050a42c7b01b3860fe7842741110f35d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 1 Aug 2024 09:49:36 +0200
Subject: [PATCH 056/310] Bumped pggb_input_stats RAM alloc

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index c253111..376b21c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -649,7 +649,7 @@ rule pggb_input_stats:
         "output/stats/pan1c."+config['name']+".chrInput.stats.tsv"
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 6000
     params:
         app_path=config['app.path'],
         pan_name=config['name']
-- 
GitLab


From 6bfc6b50e4237f8480770db5995dd823ca719771 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 1 Aug 2024 10:19:41 +0200
Subject: [PATCH 057/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 376b21c..d98f891 100644
--- a/Snakefile
+++ b/Snakefile
@@ -649,7 +649,7 @@ rule pggb_input_stats:
         "output/stats/pan1c."+config['name']+".chrInput.stats.tsv"
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 6000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000
     params:
         app_path=config['app.path'],
         pan_name=config['name']
-- 
GitLab


From 120bbb7a1676e8d4f88c8d8b7e348b03e2b23686 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 1 Aug 2024 10:33:37 +0200
Subject: [PATCH 058/310] Removed Debug

---
 Snakefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index d98f891..e5659b9 100644
--- a/Snakefile
+++ b/Snakefile
@@ -830,7 +830,6 @@ rule vcf_fig_syri:
         ## Merging TSVs
         head -n1 {input.folder}/$(basename $folder).vcf.tsv > {output.tsv}
         tail -n +2  -q {input.folder}/*.vcf.tsv >> {output.tsv}
-        cp {output.tsv} {output.tsv}.SAVE
 
         ## Running R to get the figures
         apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF.R \
-- 
GitLab


From e0295dfe7927224ab142946b55a9663435db2c6b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 1 Aug 2024 11:04:19 +0200
Subject: [PATCH 059/310] Revert "Removed Debug"

This reverts commit 120bbb7a1676e8d4f88c8d8b7e348b03e2b23686.
---
 Snakefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Snakefile b/Snakefile
index e5659b9..d98f891 100644
--- a/Snakefile
+++ b/Snakefile
@@ -830,6 +830,7 @@ rule vcf_fig_syri:
         ## Merging TSVs
         head -n1 {input.folder}/$(basename $folder).vcf.tsv > {output.tsv}
         tail -n +2  -q {input.folder}/*.vcf.tsv >> {output.tsv}
+        cp {output.tsv} {output.tsv}.SAVE
 
         ## Running R to get the figures
         apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF.R \
-- 
GitLab


From a9f5d2dd4cd7896288c8df6d5cf6d592ac3e65e0 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 1 Aug 2024 11:15:56 +0200
Subject: [PATCH 060/310] Update Snakefile

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index d98f891..f1ed3f5 100644
--- a/Snakefile
+++ b/Snakefile
@@ -256,7 +256,8 @@ rule SyRI_on_ASM:
         qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
     output:
         fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png",
-        wrkdir=directory('data/asm.syri/{haplotype}')
+        wrkdir=directory('data/asm.syri/{haplotype}'),
+        maindir=directory('data/asm.syri')
     log: 
         cmd="logs/SyRI_ASM/{haplotype}.SyRI_ASM.cmd.log",
         time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.time.log"
@@ -830,7 +831,6 @@ rule vcf_fig_syri:
         ## Merging TSVs
         head -n1 {input.folder}/$(basename $folder).vcf.tsv > {output.tsv}
         tail -n +2  -q {input.folder}/*.vcf.tsv >> {output.tsv}
-        cp {output.tsv} {output.tsv}.SAVE
 
         ## Running R to get the figures
         apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF.R \
-- 
GitLab


From 8efa87de941bbc572e9ccc27c785026288a65fc3 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 1 Aug 2024 11:17:24 +0200
Subject: [PATCH 061/310] Revert "Update Snakefile"

This reverts commit a9f5d2dd4cd7896288c8df6d5cf6d592ac3e65e0.
---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index f1ed3f5..d98f891 100644
--- a/Snakefile
+++ b/Snakefile
@@ -256,8 +256,7 @@ rule SyRI_on_ASM:
         qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
     output:
         fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png",
-        wrkdir=directory('data/asm.syri/{haplotype}'),
-        maindir=directory('data/asm.syri')
+        wrkdir=directory('data/asm.syri/{haplotype}')
     log: 
         cmd="logs/SyRI_ASM/{haplotype}.SyRI_ASM.cmd.log",
         time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.time.log"
@@ -831,6 +830,7 @@ rule vcf_fig_syri:
         ## Merging TSVs
         head -n1 {input.folder}/$(basename $folder).vcf.tsv > {output.tsv}
         tail -n +2  -q {input.folder}/*.vcf.tsv >> {output.tsv}
+        cp {output.tsv} {output.tsv}.SAVE
 
         ## Running R to get the figures
         apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF.R \
-- 
GitLab


From a58a746a1d4beb8dd08a8dc081ed1bf6b77926fc Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 1 Aug 2024 11:23:50 +0200
Subject: [PATCH 062/310] Update Snakefile

---
 Snakefile | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/Snakefile b/Snakefile
index d98f891..9228779 100644
--- a/Snakefile
+++ b/Snakefile
@@ -793,7 +793,6 @@ rule vcf_fig_vg:
 rule vcf_fig_syri:
     # Produce a figure describing INS/DEL length distribution from syri
     input: 
-        folder='data/asm.syri/',
         flags=expand('data/asm.syri/{haplotype}', haplotype=SAMPLES_NOREF)
     output:
         inter_tsv=expand('data/asm.syri/{haplotype}.vcf.tsv', haplotype=SAMPLES_NOREF),
@@ -813,9 +812,9 @@ rule vcf_fig_syri:
 
         RHAP=$(basename {params.refname} .fa.gz | cut -f1 -d'.')
         RHAPN=$(basename {params.refname} .fa.gz | cut -f2 -d'.' | cut -f2 -d'p')
-
+        FOLDER=$(dirname {input.flags[0]})
         ## Going through all folders
-        for folder in {input.folder}/*; do
+        for folder in $FOLDER/*; do
             if [ -d $folder ]; then
                 THAP=$(basename $folder | cut -f1 -d'.')
                 THAPN=$(basename $folder | cut -f2 -d'.' | cut -f2 -d'p')
@@ -823,14 +822,13 @@ rule vcf_fig_syri:
                 # Producing intermediate TSVs
                 cat $folder/*.vcf | \
                     awk -v THAP=$THAP -v THAPN=$THAPN -v RHAP=$RHAP -v RHAPN=$RHAPN -f scripts/vcf_2_tsv_syri.awk \
-                    > {input.folder}/$(basename $folder).vcf.tsv
+                    > $FOLDER/$(basename $folder).vcf.tsv
             fi
         done
 
         ## Merging TSVs
-        head -n1 {input.folder}/$(basename $folder).vcf.tsv > {output.tsv}
-        tail -n +2  -q {input.folder}/*.vcf.tsv >> {output.tsv}
-        cp {output.tsv} {output.tsv}.SAVE
+        head -n1 $FOLDER/$(basename $folder).vcf.tsv > {output.tsv}
+        tail -n +2  -q $FOLDER/*.vcf.tsv >> {output.tsv}
 
         ## Running R to get the figures
         apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF.R \
-- 
GitLab


From 1f28fdf9192037605a3a91be72ef031dcae3b050 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 1 Aug 2024 14:45:37 +0200
Subject: [PATCH 063/310] Update ragtagChromInfer.sh

Redirecting RagTag stderr to stdout (i.e. the log file) to make debugging easier
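
A minimal sketch of the effect (command and log names here are illustrative, not the exact wrapper call):

    # With 2>&1, stderr joins stdout, so a single tee captures RagTag and minimap2 messages
    ragtag.py scaffold -o tmpdir ref.fa query.fa 2>&1 | tee ragtag.cmd.log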
---
 scripts/ragtagChromInfer.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh
index 21d14cb..f4c8b74 100755
--- a/scripts/ragtagChromInfer.sh
+++ b/scripts/ragtagChromInfer.sh
@@ -36,7 +36,7 @@ mkdir -p $tmpdir
 
 # Running ragtag scaffold
 apptainer run $appdir/pan1c-env.sif ragtag.py scaffold \
-    --mm2-params "$mm2params -t $threads" -o $tmpdir $inputref $inputquery
+    --mm2-params "$mm2params -t $threads" -o $tmpdir $inputref $inputquery 2>&1
 
 # Renaming sequence according to naming scheme
 grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \
-- 
GitLab


From d0d002a4e5365c095574b59db454d73fb207d9fa Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 1 Aug 2024 17:54:02 +0200
Subject: [PATCH 064/310] Updated PanGeTools

- Added seqwish v0.7.9
- Updated vg to v1.58.0
---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 9228779..2105cd1 100644
--- a/Snakefile
+++ b/Snakefile
@@ -437,7 +437,7 @@ rule gfaffix_on_chr:
     shell:
         """
         /usr/bin/time -v -o {log.time} \
-            apptainer exec {params.app_path}/pggb.sif gfaffix \
+            apptainer exec {params.app_path}/PanGeTools.sif gfaffix \
             {input} -o {output.gfa} -t {output.transform} \
             > /dev/null
 
-- 
GitLab


From b94c049da85064d520d6c9858be3bc84ce4c2229 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 1 Aug 2024 18:06:19 +0200
Subject: [PATCH 065/310] Removed the need for pggb.sif

Everything was moved to PanGeTools (a quick image-version check is sketched below):
- Added minimum image version requirement
- Updated getTags.py accordingly
- Updated seqwish and gfaffix rules to use PanGeTools
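
As a quick check that a pulled image meets these minimums, the same labels read by getTags.py can be listed by hand (image path is illustrative):

    # Print the version labels embedded in the image
    apptainer inspect -j appimgs/PanGeTools.sif | grep -i 'version'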
---
 README.md          |  6 +++++-
 Snakefile          |  4 ++--
 getApps.sh         |  1 -
 scripts/getTags.py | 19 -------------------
 4 files changed, 7 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 04529fb..ab80202 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,15 @@
 Pan1c : a snakemake workflow for creating pangenomes at chromosomic scale.
 The workflow use a set of apptainer images : 
 - PanGeTools (Odgi, VG, ...): https://forgemia.inra.fr/alexis.mergez/pangetools
-- PanGraTools (PGGB): https://forgemia.inra.fr/alexis.mergez/pangratools
 - Pan1c-Apps (Python, Snakemake): https://forgemia.inra.fr/alexis.mergez/pan1capps
 
 > An example of input files and a config file is available in `example/`.  
 
+# Minimum image versions
+- PanGeTools >= v1.10.0
+- Pan1c-env >= v1.0.14
+- Pan1c-box >= v1.0.14
+
 # Prepare your data
 This workflow can take chromosome level assemblies as well as contig level assemblies but requires a reference assembly.  
 **Fasta files need to be compressed** using **bgzip2** (included in [PanGeTools](https://forgemia.inra.fr/alexis.mergez/pangetools)).
diff --git a/Snakefile b/Snakefile
index 2105cd1..28b63a6 100644
--- a/Snakefile
+++ b/Snakefile
@@ -408,7 +408,7 @@ rule seqwish:
     shell:
         """
         /usr/bin/time -v -o {log.time} \
-            apptainer exec {params.app_path}/pggb.sif seqwish \
+            apptainer exec {params.app_path}/PanGeTools.sif seqwish \
             -s {input.fa} -p {input.aln} -g {output} \
             {params.seqwish} -t {threads} \
             --temp-dir $(dirname {output}) -P 2>&1 | \
@@ -752,7 +752,7 @@ rule vg_deconstruct:
         """
         /usr/bin/time -v -o {log.time} \
             apptainer run --app vg {params.app_path}/PanGeTools.sif \
-                deconstruct -a -e \
+                deconstruct -e \
                 -P $(echo {params.ref} | cut -f1 -d'.') \
                 {input.graph} \
                 -t {threads} -v \
diff --git a/getApps.sh b/getApps.sh
index 1ba6cc9..b182b0a 100755
--- a/getApps.sh
+++ b/getApps.sh
@@ -17,5 +17,4 @@ done
 # Script
 apptainer pull $appdir/PanGeTools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pangetools:latest  
 apptainer pull $appdir/pan1c-env.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cenv:latest  
-apptainer pull $appdir/pggb.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/pggb:latest  
 apptainer pull $appdir/pan1c-box.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cbox:latest 
diff --git a/scripts/getTags.py b/scripts/getTags.py
index 2e9c010..2cd7ea4 100644
--- a/scripts/getTags.py
+++ b/scripts/getTags.py
@@ -79,25 +79,6 @@ for key in labels.keys():
     if ".Version" in key:
         tags["pangetools"][key.lower()] = labels[key]
 
-### PGGB image section
-tags["pggb"] = {}
-
-# Reading the apps versions from the apptainer tags
-_output = subprocess.run(
-    ["apptainer", "inspect", "-j", f"{args.appdir}/pggb.sif"],
-    capture_output=True, 
-    text=True
-).stdout
-_output = json.loads(_output)
-labels = _output['data']['attributes']['labels']
-tags["pggb"]["image.version"] = labels['Version']
-tags["pggb"]["image.home"] = labels['about.home']
-
-# Adding app versions to the tag dictionnary
-for key in labels.keys():
-    if ".Version" in key:
-        tags["pggb"][key.lower()] = labels[key]
-
 ### Pan1c-Env section
 tags["pan1c-env"] = {}
 
-- 
GitLab


From 8dcc5513539e596fe816a7bf2d48e271b10fdd04 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 2 Aug 2024 18:41:56 +0200
Subject: [PATCH 066/310] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ab80202..2ce151b 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ The workflow use a set of apptainer images :
 
 > An example of input files and a config file is available in `example/`.  
 
-# Minimum image versions
+# Minimum image version
 - PanGeTools >= v1.10.0
 - Pan1c-env >= v1.0.14
 - Pan1c-box >= v1.0.14
-- 
GitLab


From a71c4e06de5cd64891a615c5d98096a1b63c8d2e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 5 Aug 2024 11:54:05 +0200
Subject: [PATCH 067/310] Update Snakefile

---
 Snakefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 28b63a6..2395b9b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -366,6 +366,7 @@ rule wfmash_on_chr:
             -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \
             --tmp-base $(dirname {output.aln}) \
             {input.fa} --approx-map \
+            --lower-triangular \
             1> {output.mapping} \
             2> >(tee {log.cmd_map} >&2)
 
@@ -375,8 +376,8 @@ rule wfmash_on_chr:
             -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \
             -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \
             --tmp-base $(dirname {output.aln}) \
+            --lower-triangular \
             {input.fa} -i {output.mapping} \
-            --invert-filtering \
             1> {output.aln} \
             2> >(tee {log.cmd_aln} >&2)
 
-- 
GitLab


From 3daf136dd5216922908e48114819079bd328e070 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 5 Aug 2024 15:03:57 +0200
Subject: [PATCH 068/310] Update README.md

---
 README.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/README.md b/README.md
index 2ce151b..d100eaa 100644
--- a/README.md
+++ b/README.md
@@ -35,7 +35,16 @@ Before running the worflow, some apptainer images needs to be downloaded. Use th
 Clone this repository and create a `data/haplotypes` directory where you will place all your haplotypes.  
 Update the reference name and the apptainer image directory in `config.yaml`.  
 Then, modify the variables in `runSnakemake.sh` to match your requirements (number of threads, memory, job name, email, etc.).  
+## Single machine mode
 Navigate to the root directory of the repository and execute `sbatch runSnakemake.sh`!
+The default script uses a single node and runs everything on it. This method only requires apptainer but isn't the most efficient for job distribution.
+## Cluster execution
+To execute each step as a separate SLURM job, install a custom conda environment with this command:
+```
+conda create -n Pan1c -c conda-forge -c bioconda snakemake=8.4.7 snakemake-executor-plugin-slurm
+```
+This works by having a job that runs Snakemake, which in turn submits the other jobs. To do so, configure `runSnakemakeSLURM.sh` and submit it using `sbatch`.
+> If you get OOM errors, increase `mem_multiplier` in `config.yaml` to allocate more memory to jobs.
 
 # Outputs
 The workflow generates several key files :
-- 
GitLab


From 45b72895923164324b796687ff096fc9969f04bb Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 5 Aug 2024 17:34:54 +0200
Subject: [PATCH 069/310] Customizing SyRI figures

---
 Snakefile              |  9 ++++++---
 scripts/getSyriFigs.sh |  6 ++++--
 src/plotsr-base.cfg    | 29 +++++++++++++++++++++++++++++
 3 files changed, 39 insertions(+), 5 deletions(-)
 create mode 100644 src/plotsr-base.cfg

diff --git a/Snakefile b/Snakefile
index 2395b9b..188dae5 100644
--- a/Snakefile
+++ b/Snakefile
@@ -265,7 +265,8 @@ rule SyRI_on_ASM:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000
     retries: 1
     params:
-        app_path=config["app.path"]
+        app_path=config["app.path"],
+        plotsr_cfg="src/plotsr-base.cfg"
     shell:
         """
         mkdir -p {output.wrkdir}
@@ -277,6 +278,7 @@ rule SyRI_on_ASM:
             -o $(basename {output.fig}) \
             -r {input.ref} \
             -h 10 -w 20 -s "0.9" -f 10 \
+            -c {params.plotsr_cfg} \
             -q "{input.qry}" 2>&1 | \
             tee {log.cmd}
         
@@ -288,7 +290,6 @@ Core section : Running PGGB
 """
 
 rule SyRI_on_chrInput:
-    # WIP
     input:
         fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz'
     output:
@@ -299,7 +300,8 @@ rule SyRI_on_chrInput:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000
     params:
         app_path=config["app.path"],
-        ref=config['reference']
+        ref=config['reference'],
+        plotsr_cfg="src/plotsr-base.cfg"
     shell:
         """
         mkdir {output.wrkdir}
@@ -330,6 +332,7 @@ rule SyRI_on_chrInput:
             -o $(basename {output.fig}) \
             -r {output.wrkdir}/"${{refname}}.fa.gz" \
             -q "${{AllAsmList[*]}}" \
+            -c {params.plotsr_cfg} \
             -h 10 -w 20 -s "0.9" -f 10
         mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
         rm {output.wrkdir}/*.fa
diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh
index 273b30d..a76ca80 100755
--- a/scripts/getSyriFigs.sh
+++ b/scripts/getSyriFigs.sh
@@ -13,6 +13,7 @@ height=16       # Figure height
 width=9         # Figure width
 fontsize=12     
 space="0.7"     # Space for homologous chromosomes 
+config=""       # Plotsr config file
 
 ## Getting arguments
 while getopts "r:q:a:t:d:o:h:w:f:s:" option; do
@@ -27,7 +28,8 @@ while getopts "r:q:a:t:d:o:h:w:f:s:" option; do
         w) width="$OPTARG";;
         f) fontsize="$OPTARG";;
         s) space="$OPTARG";;
-        \?) echo "Usage: $0 [-r ref] [-q query] [-a appdir] [-t threads] [-d wrkdir] [-o output] [-h height] [-w width] [-f fontsize] [-s space]" >&2
+        c) config="$OPTARG";;
+        \?) echo "Usage: $0 [-r ref] [-q query] [-a appdir] [-t threads] [-d wrkdir] [-o output] [-h height] [-w width] [-f fontsize] [-s space] [-c config]" >&2
             exit 1;;
     esac
 done
@@ -106,7 +108,7 @@ for asm in "${asmList[@]}"; do
 done
 
 # Generating the plotsr command
-command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f $fontsize -S $space -H $height -W $width -d 600 "
+command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f $fontsize -S $space -H $height -W $width -d 600 -c $config"
 
 # Adding syri files to the command as each needs to be specified using "--sr" argument 
 for file in "${syriFileList[@]}"; do
diff --git a/src/plotsr-base.cfg b/src/plotsr-base.cfg
new file mode 100644
index 0000000..ff2f157
--- /dev/null
+++ b/src/plotsr-base.cfg
@@ -0,0 +1,29 @@
+## COLOURS and transparency for alignments (syntenic, inverted, translocated, and duplicated)
+syncol:#CCCCCC
+invcol:#FFA500
+tracol:#9ACD32
+dupcol:#00BBFF
+synlwd:0                ## Line width for syntenic annotations
+invlwd:0.1              ## Line width for inversions
+tralwd:0.1              ## Line width for translocations
+duplwd:0.1              ## Line width for duplications
+alpha:0.8
+
+## Margins and dimensions:
+chrmar:0.1              ## Adjusts the gap between chromosomes and tracks. Higher values leads to more gap
+exmar:0.1               ## Extra margin at the top and bottom of plot area
+marginchr:0.1           ## Margin between adjacent chromosomes when using --itx
+
+## Legend
+legend:T                ## To plot legend use T, use F to not plot legend
+genlegcol:4            ## Number of columns for genome legend, set -1 for automatic setup
+bbox:0,1.01,0.5,0.3		## [Left edge, bottom edge, width, height]
+bbox_v:0,1.1,0.5,0.3	## For vertical chromosomes (using -v option)
+bboxmar:0.8             ## Margin between genome and annotation legends
+
+## Tracks
+norm:T                  ## For each chromosome, independently normalise the y-axis of tracks. Use T for normalising independently, and F to normalise based on max value across all chromosomes
+
+## Axis
+maxl:-1                 ## Manually set maximum chromosome position. Use `-1` for automatic selection. Does not work with --itx
+genname:T               ## Write genome names adjacent to the chromosome (T) or not (F)
\ No newline at end of file
-- 
GitLab


From 97f4d694c7eb9e6ad59d54e1eb70b761a574d54f Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 5 Aug 2024 17:56:26 +0200
Subject: [PATCH 070/310] Update getSyriFigs.sh

---
 scripts/getSyriFigs.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh
index a76ca80..409871f 100755
--- a/scripts/getSyriFigs.sh
+++ b/scripts/getSyriFigs.sh
@@ -16,7 +16,7 @@ space="0.7"     # Space for homologous chromosomes
 config=""       # Plotsr config file
 
 ## Getting arguments
-while getopts "r:q:a:t:d:o:h:w:f:s:" option; do
+while getopts "r:q:a:t:d:o:h:w:f:s:c:" option; do
     case "$option" in
         r) ref="$OPTARG";;
         q) qry="$OPTARG";;
-- 
GitLab


From 4c1ce57da1578fe21ca35f781cb9eeb1db7f58e9 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 6 Aug 2024 10:22:25 +0200
Subject: [PATCH 071/310] Update getSyriFigs.sh

---
 scripts/getSyriFigs.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh
index 409871f..75b216d 100755
--- a/scripts/getSyriFigs.sh
+++ b/scripts/getSyriFigs.sh
@@ -108,7 +108,7 @@ for asm in "${asmList[@]}"; do
 done
 
 # Generating the plotsr command
-command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f $fontsize -S $space -H $height -W $width -d 600 -c $config"
+command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f $fontsize -S $space -H $height -W $width -d 600 -c $config "
 
 # Adding syri files to the command as each needs to be specified using "--sr" argument 
 for file in "${syriFileList[@]}"; do
-- 
GitLab


From 1c892e32fcb3c12c2670f96e6acb75c82e94ee36 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 6 Aug 2024 10:47:46 +0200
Subject: [PATCH 072/310] Fixed flag for plotsr config file

---
 scripts/getSyriFigs.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh
index 75b216d..f5871a0 100755
--- a/scripts/getSyriFigs.sh
+++ b/scripts/getSyriFigs.sh
@@ -108,7 +108,7 @@ for asm in "${asmList[@]}"; do
 done
 
 # Generating the plotsr command
-command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f $fontsize -S $space -H $height -W $width -d 600 -c $config "
+command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f $fontsize -S $space -H $height -W $width -d 600 --cfg $config "
 
 # Adding syri files to the command as each needs to be specified using "--sr" argument 
 for file in "${syriFileList[@]}"; do
-- 
GitLab


From eb2ad61f2b4eb85c695730a6677ae56455cd4cd2 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 6 Aug 2024 13:31:02 +0200
Subject: [PATCH 073/310] Update plotsr-base.cfg

---
 src/plotsr-base.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plotsr-base.cfg b/src/plotsr-base.cfg
index ff2f157..043db7a 100644
--- a/src/plotsr-base.cfg
+++ b/src/plotsr-base.cfg
@@ -19,7 +19,7 @@ legend:T                ## To plot legend use T, use F to not plot legend
 genlegcol:4            ## Number of columns for genome legend, set -1 for automatic setup
 bbox:0,1.01,0.5,0.3		## [Left edge, bottom edge, width, height]
 bbox_v:0,1.1,0.5,0.3	## For vertical chromosomes (using -v option)
-bboxmar:0.8             ## Margin between genome and annotation legends
+bboxmar:0.5             ## Margin between genome and annotation legends
 
 ## Tracks
 norm:T                  ## For each chromosome, independently normalise the y-axis of tracks. Use T for normalising independently, and F to normalise based on max value across all chromosomes
-- 
GitLab


From da5d3be3dcac01399ce1dffbb491732617e597d8 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 6 Aug 2024 13:36:08 +0200
Subject: [PATCH 074/310] Update getSyriFigs.sh

Added haplotype ID to genome name in SyRI figures
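
The genome label is built from the FASTA file name; a small sketch assuming the <sample>.hap<N>.fa.gz naming convention (the file name below is made up):

    asm=GenomeA.hap2.fa.gz
    name=$(basename $asm .fa.gz | cut -d'.' -f1)                   # -> GenomeA
    hapid=$(basename $asm .fa.gz | cut -d'.' -f2 | cut -d'p' -f2)  # -> 2
    echo -e "${name}#${hapid}"                                     # -> GenomeA#2, the name written to genomes.txt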
---
 scripts/getSyriFigs.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh
index f5871a0..4f3c6ba 100755
--- a/scripts/getSyriFigs.sh
+++ b/scripts/getSyriFigs.sh
@@ -104,7 +104,7 @@ done
 # Each line contains 2 columns : fasta filepath and its simpler name
 echo -e "#files\tname" > $wrkdir/genomes.txt
 for asm in "${asmList[@]}"; do
-    echo -e "$wrkdir/$(basename ${asm} .gz)\t$(basename $asm .fa.gz | cut -d'.' -f1)" >> $wrkdir/genomes.txt
+    echo -e "$wrkdir/$(basename ${asm} .gz)\t$(basename $asm .fa.gz | cut -d'.' -f1)#$(basename $asm .fa.gz | cut -d'.' -f2 | cut -d'p' -f2)" >> $wrkdir/genomes.txt
 done
 
 # Generating the plotsr command
-- 
GitLab


From 073e62c568bad79d44f2333a00c9752580f922b5 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 7 Aug 2024 09:25:30 +0200
Subject: [PATCH 075/310] Update Snakefile

Marked some intermediate outputs as temporary
---
 Snakefile | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/Snakefile b/Snakefile
index 188dae5..6c3804c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -144,18 +144,21 @@ rule quast_stats:
         fas=expand("data/haplotypes/{haplotype}.fa.gz", haplotype=SAMPLES_NOREF),
         ref="data/haplotypes/"+config['reference']
     output:
-        report="output/quast/"+config['name']+".quast.report.html"
+        report="output/"+config['name']+".quast.report.html"
     threads: 16
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
     params:
         app_path=config["app.path"],
-        pan_name=config["name"]
+        pan_name=config["name"],
+        tmp_dir="output/quast"
     log: 
         cmd="logs/quast/quast.cmd.log",
         time="logs/quast/quast.time.log"
     shell:
         """
+        mkdir {params.tmp_dir}
+
         /usr/bin/time -v -o {log.time} \
             apptainer run {params.app_path}/pan1c-env.sif quast.py \
             -t {threads} \
@@ -169,13 +172,8 @@ rule quast_stats:
             {input.fas} 2>&1 | \
             tee {log.cmd}
 
-        # Compressing temporary files
-        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} $(dirname {output.report})/contigs_reports/*.fa
-        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} $(dirname {output.report})/contigs_reports/minimap_output/*
-
-        mv $(dirname {output.report})/report.html {output.report}
+        mv ${params.tmp_dir}/report.html {output.report}
+        rm -r {params.tmp_dir}
         """
 
 rule contig_position:
@@ -185,7 +183,7 @@ rule contig_position:
         fai="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz.fai"
     output:
         fig="output/chr.contig/{chromosome}.contig.png",
-        outdir=directory("output/chr.contig/{chromosome}")
+        outdir=temp(directory("output/chr.contig/{chromosome}"))
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
@@ -256,7 +254,7 @@ rule SyRI_on_ASM:
         qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
     output:
         fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png",
-        wrkdir=directory('data/asm.syri/{haplotype}')
+        wrkdir=temp(directory('data/asm.syri/{haplotype}'))
     log: 
         cmd="logs/SyRI_ASM/{haplotype}.SyRI_ASM.cmd.log",
         time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.time.log"
@@ -294,7 +292,7 @@ rule SyRI_on_chrInput:
         fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz'
     output:
         fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png",
-        wrkdir=directory('data/chrInputs/syri/{chromosome}')
+        wrkdir=temp(directory('data/chrInputs/syri/{chromosome}'))
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000
@@ -455,7 +453,7 @@ rule odgi_postprocessing:
     input:
         rules.gfaffix_on_chr.output.gfa
     output:
-        gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa'
+        gfa=temp('data/chrGraphs/'+config['name']+'.{chromosome}.gfa')
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -511,8 +509,8 @@ rule odgi_postprocessing:
         sed -i "/^H/r $(dirname {input})/metadata.txt" {output.gfa}
 
         # Compressing
-        # apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-        #     -@ {threads} -k {output.gfa}
+        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
+            -@ {threads} -k {output.gfa}
         """
 
 rule generate_graph_list:
-- 
GitLab


From 7dc05fe9a45c68bf72f5e4f175597e2be8130c1b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 09:37:10 +0200
Subject: [PATCH 076/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 6c3804c..32498cf 100644
--- a/Snakefile
+++ b/Snakefile
@@ -60,7 +60,7 @@ def which_analysis():
         )
     if config["run_Quast"] == "True": # Running Quast on input haplotypes
         analysis_inputs.append(
-            "output/quast/"+config['name']+".quast.report.html"
+            "output/"+config['name']+".quast.report.html"
         )
     if config["get_contig_pos"] == "True": # Chromosome decomposition into its contig figure
         analysis_inputs.append(
-- 
GitLab


From 792906f22297622ce6f8e380959c6f28d7bfe1ca Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 10:30:27 +0200
Subject: [PATCH 077/310] Fixed error in SV from VG figure
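
Before this fix, hap was only incremented when an allele passed the length
filter, so skipped alleles shifted the haplotype index in the output. A
minimal sketch of the corrected counting, using a hypothetical genotype
field "0|1" and a simplified stand-in for the ALTLEN test (assuming
in-order iteration, as in the script):

    echo "0|1" | awk '{
        split($1, ALTs, "|"); hap=0
        for (a in ALTs) {
            hap+=1                                # count every haplotype, kept or not
            if (ALTs[a] != "." && ALTs[a] != 0)   # simplified filter
                printf("haplotype %d carries ALT %s\n", hap, ALTs[a])
        }
    }'
    # -> haplotype 2 carries ALT 1: the second haplotype keeps index 2
    #    even though the first one is skipped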

---
 scripts/vcf_2_tsv_vg.awk | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/scripts/vcf_2_tsv_vg.awk b/scripts/vcf_2_tsv_vg.awk
index 04f6be9..1262ba2 100644
--- a/scripts/vcf_2_tsv_vg.awk
+++ b/scripts/vcf_2_tsv_vg.awk
@@ -5,16 +5,24 @@
 
 # INS/DEL counter
 !/^#/ { 
-    ALTLEN[0]=0; split($5, arr, ",")
+    ## Get allele length compared to the ref
+    # Adding the reference which is 0
+    ALTLEN[0]=0 
+    # Split the comma separated list of alternative alleles
+    split($5, arr, ",")
+    # Compute the length of each alternative allele
 	for (i in arr) {
         ALTLEN[i]=(length($4) - length(arr[i]))
     } 
-    for (i=10; i<=NF; i++) { 
+    # For each sample
+    for (i=10; i<=NF; i++) {
+        # Splitting haplotypes 
 		split($i, ALTs, "|")
-        hap=0 
+        hap=0
+        # Iterating over haplotypes
 		for (a in ALTs) { 
-			if (ALTs[a] != "." && ALTLEN[ALTs[a]] != 0) { 
-				hap+=1 
+            hap+=1
+			if (ALTs[a] != "." && ALTLEN[ALTs[a]] != 0) {  
 				printf("%s\t%s\t%s\t%s#%s\t%s\n", $1, $2, $3, HAP[i], hap, ALTLEN[ALTs[a]])
             }
         }
-- 
GitLab


From 3cceafdcc3697bc24cda89e963ba6d34411191c6 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 10:41:50 +0200
Subject: [PATCH 078/310] Update Snakefile
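
Guarding the mkdir avoids a failure when output/quast is left over from a
previous run. A shorter equivalent (a sketch, not what the rule uses) would
be the idempotent form:

    mkdir -p output/quast    # no error if the directory already exists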

---
 Snakefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 32498cf..63fda13 100644
--- a/Snakefile
+++ b/Snakefile
@@ -157,7 +157,9 @@ rule quast_stats:
         time="logs/quast/quast.time.log"
     shell:
         """
-        mkdir {params.tmp_dir}
+        if [ ! -d {params.tmp_dir} ]; then
+            mkdir {params.tmp_dir}
+        fi
 
         /usr/bin/time -v -o {log.time} \
             apptainer run {params.app_path}/pan1c-env.sif quast.py \
-- 
GitLab


From 24f31321943a8bd6056e169c99673d5ab0b2a2b1 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 11:24:08 +0200
Subject: [PATCH 079/310] Removing wfmash 1.18 args

---
 Snakefile | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 63fda13..39f7086 100644
--- a/Snakefile
+++ b/Snakefile
@@ -369,7 +369,6 @@ rule wfmash_on_chr:
             -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \
             --tmp-base $(dirname {output.aln}) \
             {input.fa} --approx-map \
-            --lower-triangular \
             1> {output.mapping} \
             2> >(tee {log.cmd_map} >&2)
 
@@ -379,7 +378,6 @@ rule wfmash_on_chr:
             -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \
             -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \
             --tmp-base $(dirname {output.aln}) \
-            --lower-triangular \
             {input.fa} -i {output.mapping} \
             1> {output.aln} \
             2> >(tee {log.cmd_aln} >&2)
-- 
GitLab


From 6812f1b10adfcf986e8660e935862772a88601d1 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 13:57:06 +0200
Subject: [PATCH 080/310] Update Snakefile
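
In a Snakemake shell block, {params.tmp_dir} is substituted by Snakemake
itself; the stray "$" in the old line survived substitution and was then
expanded by bash as an undefined shell variable. Roughly what bash received,
assuming tmp_dir resolves to output/quast and a hypothetical pangenome name:

    # old line after substitution: bash expands the undefined $output,
    # so the source path collapses to "/quast/report.html"
    mv $output/quast/report.html output/panname.quast.report.html
    # fixed line after substitution: a plain path
    mv output/quast/report.html output/panname.quast.report.html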

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 39f7086..1f5e75d 100644
--- a/Snakefile
+++ b/Snakefile
@@ -174,7 +174,7 @@ rule quast_stats:
             {input.fas} 2>&1 | \
             tee {log.cmd}
 
-        mv ${params.tmp_dir}/report.html {output.report}
+        mv {params.tmp_dir}/report.html {output.report}
         rm -r {params.tmp_dir}
         """
 
-- 
GitLab


From 530967de9c1ddd154d07b66117670c6be8aa40e5 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 14:32:35 +0200
Subject: [PATCH 081/310] Create analyze_VCF_v2.R

---
 scripts/analyze_VCF_v2.R | 256 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 256 insertions(+)
 create mode 100644 scripts/analyze_VCF_v2.R

diff --git a/scripts/analyze_VCF_v2.R b/scripts/analyze_VCF_v2.R
new file mode 100644
index 0000000..402a59a
--- /dev/null
+++ b/scripts/analyze_VCF_v2.R
@@ -0,0 +1,256 @@
+# Script to plot the INS/DEL length distributions from TSVs derived from 'vg deconstruct' and SyRI VCFs
+# 
+# @author: alexis.mergez@inrae.fr
+# @version: 1.2
+
+library(optparse)
+
+#% Parsing arguments
+option_list = list(
+    make_option(c("-b", "--binwidth"), type="double", default=0.02, 
+        help="Bin width", metavar="double"),
+    make_option(c("-v", "--vg"), type="character", default=NULL, 
+        help="vg TSV", metavar="character"),
+    make_option(c("-s", "--syri"), type="character", default=NULL, 
+        help="SyRi TSV", metavar="character"),
+    make_option(c("-o", "--out"), type="character", default=NULL, 
+        help="output directory", metavar="character"),
+    make_option(c("-p", "--panname"), type="character", default=NULL, 
+        help="pangenome name", metavar="character"),
+    make_option(c("-W", "--width"), type="integer", default=18, 
+        help="Figure width", metavar="integer"),
+    make_option(c("-H", "--height"), type="integer", default=6, 
+        help="Figure height", metavar="integer")
+);
+
+opt_parser = OptionParser(option_list=option_list);
+opt = parse_args(opt_parser);
+
+## Accessing arguments with opt$<arg>. For example: opt$vg, opt$syri, ...
+
+library(ggplot2)
+library(tidyverse)
+library(gridExtra)
+
+#% Parsing TSV file
+write("[analyze_VCF] Parsing TSV ...", stdout())
+vg <- read.delim(opt$vg)
+syri <- read.delim(opt$syri)
+
+sample = str_split_1(vg$CHROM[1], "#")[1]
+vg[c("HAPNAME", "HAPID")] = str_split_fixed(vg$HAP, "#", 2)
+syri[c("HAPNAME", "HAPID")] = str_split_fixed(syri$HAP, "#", 2)
+
+#% Prepare data
+get_data<-function(data){
+    #% Filtering too long and too short INS/DEL, splitting data into 2 dataframe by type
+    INS = data[which(data$LEN >= -100000 & data$LEN <= -50), ]
+    INS$LEN = -INS$LEN
+
+    DEL = data[which(data$LEN <= 100000 & data$LEN >= 50), ]
+
+    #% Passing to LOG for scale reasons
+    INS$LOGLEN = log10(INS$LEN)
+    DEL$LOGLEN = -log10(DEL$LEN)
+
+    #% Creating Bins from the log of the length
+    INS = INS %>% mutate(
+        Bin = cut(
+            LOGLEN, 
+            breaks = seq(0, log10(100000), 
+            by = opt$binwidth), 
+            include.lowest = TRUE, 
+            right = FALSE, 
+            ordered_result = TRUE, 
+            dig.lab=3)
+        )
+    DEL = DEL %>% mutate(
+        Bin = cut(
+            LOGLEN, 
+            breaks = seq(-log10(100000)-opt$binwidth, 0, by = opt$binwidth), 
+            include.lowest = TRUE, 
+            right = FALSE, 
+            ordered_result = TRUE, 
+            dig.lab=3)
+        )
+
+    #% Summarizing the dataframe by haplotypes
+    INS_F = INS %>% 
+        group_by(HAP, Bin) %>%
+        summarise(
+            Count = n(),
+            .groups = 'drop'
+        ) %>%
+        mutate(x = factor(parse_number(as.character(Bin), locale = locale(decimal_mark = ".", grouping_mark = ""))))
+    INS_F[c("HAPNAME", "HAPID")] = str_split_fixed(INS_F$HAP, "#", 2)
+
+    DEL_F = DEL %>% 
+        group_by(HAP, Bin) %>%
+        summarise(
+            Count = n(),
+            .groups = 'drop'
+        ) %>%
+        mutate(x = factor(parse_number(as.character(Bin), locale = locale(decimal_mark = ".", grouping_mark = ""))))
+    DEL_F[c("HAPNAME", "HAPID")] = str_split_fixed(DEL_F$HAP, "#", 2)
+
+    return(INS_F, DEL_F)
+}
+
+vINS_F, vDEL_F = get_data(vg)
+sINF_F, sDEL_F = get_data(syri)
+
+#% Figures section
+title_text = element_text(face="bold", size = 12)
+colours = c("#78ABA8", "#C8CFA0", "#FCDC94", "#EF9C66")
+
+#% Function to retrieve the legend from a plot as a dedicated plot (used in the multiplot command)
+get_legend<-function(myggplot){
+  tmp <- ggplot_gtable(ggplot_build(myggplot))
+  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
+  legend <- tmp$grobs[[leg]]
+  return(legend)
+}
+
+
+#% Creating the general graph, in log and non-log version
+
+## General function
+get_fig_log = function(INS_F, DEL_F, top_name, tool_name){
+    # Insertion figure
+    figA = ggplot(INS_F, aes(x=x)) +
+	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
+	scale_y_continuous(trans='log10', position = "right") +
+    expand_limits(x=c(0)) +
+	scale_x_discrete(
+		breaks=seq(0,5),
+		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
+	xlab("INS") +
+	theme_bw() +
+	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
+
+    # Deletion figure
+    figB = ggplot(DEL_F, aes(x=x)) +
+	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
+	scale_y_continuous(trans='log10') +
+    expand_limits(x=c(0)) +
+	scale_x_discrete(
+		breaks=-seq(0,5),
+		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
+	xlab("DEL") +
+	theme_bw() +
+	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
+
+    legend = get_legend(figA)
+    figA = figA + theme(legend.position = "none")
+
+    ## Combining the plots and the legend
+    figF = grid.arrange(figB, figA, legend, nrow = 1, top=paste0(c("Pan1c - ",top_name," - ",sample," - ",tool_name), collapse=""), widths=c(2.3, 2.3, 0.8))
+    return(figF)
+}
+
+get_fig = function(INS_F, DEL_F, top_name, tool_name){
+    # Insertion figure
+    figA = ggplot(INS_F, aes(x=x)) +
+	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
+	scale_y_continuous(position = "right") +
+    expand_limits(x=c(0)) +
+	scale_x_discrete(
+		breaks=seq(0,5),
+		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
+	xlab("INS") +
+	theme_bw() +
+	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
+
+    # Deletion figure
+    figB = ggplot(DEL_F, aes(x=x)) +
+	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
+    expand_limits(x=c(0)) +
+	scale_x_discrete(
+		breaks=-seq(0,5),
+		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
+	xlab("DEL") +
+	theme_bw() +
+	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
+
+    legend = get_legend(figA)
+    figA = figA + theme(legend.position = "none")
+
+    ## Combining the plots and the legend
+    figF = grid.arrange(figB, figA, legend, nrow = 1, top=paste0(c("Pan1c - ",top_name," - ",sample," - ",tool_name), collapse=""), widths=c(2.3, 2.3, 0.8))
+    return(figF)
+}
+
+get_fig_single = function(vINS_F, vDEL_F, sINS_F, sDEL_F, top_name){
+    # Insertion figure
+    figA = ggplot(vINS_F, aes(x=x, y=Count, group=HAP, color=HAP, linetype=HAPID)) +
+	geom_line() +
+    geom_line(data = sINS_F) +
+	scale_y_continuous(position = "right") +
+    expand_limits(x=c(0)) +
+	scale_x_discrete(
+		breaks=seq(0,5),
+		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
+	xlab("INS") +
+	theme_bw() +
+	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
+
+    # Deletion figure
+    figB = ggplot(vDEL_F, aes(x=x, y=Count, group=HAP, color=HAP, linetype=HAPID)) +
+    geom_line() +
+	geom_line(data = sDEL_F) +
+    expand_limits(x=c(0)) +
+	scale_x_discrete(
+		breaks=-seq(0,5),
+		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
+	xlab("DEL") +
+	theme_bw() +
+	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
+
+    legend = get_legend(figA)
+    figA = figA + theme(legend.position = "none")
+
+    ## Combining the plots and the legend
+    figF = grid.arrange(figB, figA, legend, nrow = 1, top=paste0(c("Pan1c - ",top_name," - ",sample), collapse=""), widths=c(2.3, 2.3, 0.8))
+    return(figF)
+}
+
+#% Log version
+write("[analyze_VCF] Creating vg general graph (Log version) ...", stdout())
+FIG = get_fig_log(vINS_F, vDEL_F, opt$panname, "VG")
+sub_name = paste0(c("pan1c",opt$panname,"General_log","vcf","vg","png"), collapse=".")
+ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
+
+write("[analyze_VCF] Creating SyRI general graph (Log version) ...", stdout())
+FIG = get_fig_log(sINS_F, sDEL_F, opt$panname, "SyRI")
+sub_name = paste0(c("pan1c",opt$panname,"General_log","vcf","syri","png"), collapse=".")
+ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
+
+#% Non log version
+write("[analyze_VCF] Creating vg general graph (Non-log version) ...", stdout())
+FIG = get_fig(vINS_F, vDEL_F, opt$panname, "VG")
+sub_name = paste0(c("pan1c",opt$panname,"General","vcf","vg","png"), collapse=".")
+ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
+
+write("[analyze_VCF] Creating SyRI general graph (Non-log version) ...", stdout())
+FIG = get_fig(sINS_F, sDEL_F, opt$panname, "SyRI")
+sub_name = paste0(c("pan1c",opt$panname,"General","vcf","vg","png"), collapse=".")
+ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
+
+#% Individual haplotypes plots
+write("[analyze_VCF] Creating haplotype graphs  ...", stdout())
+for (hapname in unique(vg$HAPNAME)){
+    # Getting the haps 
+    hapids = unique(vg[vg$HAPNAME == hapname,]$HAPID)
+    haps = paste(hapname, hapids, sep="#")
+    FIG = get_fig_single(
+        vINS_F[vINS_F$HAP %in% haps,], 
+        vDEL_F[vDEL_F$HAP %in% haps,], 
+        sINS_F[sINS_F$HAP %in% haps,], 
+        sDEL_F[sDEL_F$HAP %in% haps,], 
+        paste0(c(opt$panname," - ",hapname), collapse='')
+    )
+    sub_name = paste0(c("pan1c",opt$panname,hapname,"vcf","both","png"), collapse=".")
+    ggsave(paste0(c(opt$out, sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
+}
+
+write("[analyze_VCF] Done !", stdout())
\ No newline at end of file
-- 
GitLab


From 2832dda2800d52996042b81dbbf13aeae0951ed7 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 14:42:00 +0200
Subject: [PATCH 082/310] Update analyze_VCF_v2.R

---
 scripts/analyze_VCF_v2.R | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/scripts/analyze_VCF_v2.R b/scripts/analyze_VCF_v2.R
index 402a59a..152caf6 100644
--- a/scripts/analyze_VCF_v2.R
+++ b/scripts/analyze_VCF_v2.R
@@ -93,11 +93,14 @@ get_data<-function(data){
         mutate(x = factor(parse_number(as.character(Bin), locale = locale(decimal_mark = ".", grouping_mark = ""))))
     DEL_F[c("HAPNAME", "HAPID")] = str_split_fixed(DEL_F$HAP, "#", 2)
 
-    return(INS_F, DEL_F)
+    return(list(INS = INS_F, DEL = DEL_F))
 }
-
-vINS_F, vDEL_F = get_data(vg)
-sINF_F, sDEL_F = get_data(syri)
+tmp = get_data(vg)
+vINS_F = tmp$INS
+vDEL_F = tmp$DEL
+tmp = get_data(syri)
+sINF_F = tmp$INS
+sDEL_F = tmp$DEL
 
 #% Figures section
 title_text = element_text(face="bold", size = 12)
-- 
GitLab


From 7ae1ecd74a21b7ebabfe004097a946b6a6161a88 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 14:43:46 +0200
Subject: [PATCH 083/310] Update analyze_VCF_v2.R

---
 scripts/analyze_VCF_v2.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/analyze_VCF_v2.R b/scripts/analyze_VCF_v2.R
index 152caf6..34c3a9c 100644
--- a/scripts/analyze_VCF_v2.R
+++ b/scripts/analyze_VCF_v2.R
@@ -99,7 +99,7 @@ tmp = get_data(vg)
 vINS_F = tmp$INS
 vDEL_F = tmp$DEL
 tmp = get_data(syri)
-sINF_F = tmp$INS
+sINS_F = tmp$INS
 sDEL_F = tmp$DEL
 
 #% Figures section
-- 
GitLab


From 54e294ec65d2217a65bd21b3b2661646f6aa2261 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 14:47:33 +0200
Subject: [PATCH 084/310] Update analyze_VCF_v2.R

---
 scripts/analyze_VCF_v2.R | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/scripts/analyze_VCF_v2.R b/scripts/analyze_VCF_v2.R
index 34c3a9c..c17c971 100644
--- a/scripts/analyze_VCF_v2.R
+++ b/scripts/analyze_VCF_v2.R
@@ -184,8 +184,13 @@ get_fig = function(INS_F, DEL_F, top_name, tool_name){
 }
 
 get_fig_single = function(vINS_F, vDEL_F, sINS_F, sDEL_F, top_name){
+    vINS_F$TOOL = rep("VG", nrow(vINS_F))
+    sINS_F$TOOL = rep("SyRI", nrow(sINS_F))
+    vDEL_F$TOOL = rep("VG", nrow(vDEL_F))
+    sDEL_F$TOOL = rep("VG", nrow(sDEL_F))
+
     # Insertion figure
-    figA = ggplot(vINS_F, aes(x=x, y=Count, group=HAP, color=HAP, linetype=HAPID)) +
+    figA = ggplot(vINS_F, aes(x=x, y=Count, group=TOOL, color=TOOL, linetype=HAPID)) +
 	geom_line() +
     geom_line(data = sINS_F) +
 	scale_y_continuous(position = "right") +
@@ -198,7 +203,7 @@ get_fig_single = function(vINS_F, vDEL_F, sINS_F, sDEL_F, top_name){
 	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
 
     # Deletion figure
-    figB = ggplot(vDEL_F, aes(x=x, y=Count, group=HAP, color=HAP, linetype=HAPID)) +
+    figB = ggplot(vDEL_F, aes(x=x, y=Count, group=TOOL, color=TOOL, linetype=HAPID)) +
     geom_line() +
 	geom_line(data = sDEL_F) +
     expand_limits(x=c(0)) +
-- 
GitLab


From d210b3930fb6ace951081eb336a095c7182726e5 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 14:49:05 +0200
Subject: [PATCH 085/310] Update analyze_VCF_v2.R

---
 scripts/analyze_VCF_v2.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/analyze_VCF_v2.R b/scripts/analyze_VCF_v2.R
index c17c971..5657140 100644
--- a/scripts/analyze_VCF_v2.R
+++ b/scripts/analyze_VCF_v2.R
@@ -187,7 +187,7 @@ get_fig_single = function(vINS_F, vDEL_F, sINS_F, sDEL_F, top_name){
     vINS_F$TOOL = rep("VG", nrow(vINS_F))
     sINS_F$TOOL = rep("SyRI", nrow(sINS_F))
     vDEL_F$TOOL = rep("VG", nrow(vDEL_F))
-    sDEL_F$TOOL = rep("VG", nrow(sDEL_F))
+    sDEL_F$TOOL = rep("SyRI", nrow(sDEL_F))
 
     # Insertion figure
     figA = ggplot(vINS_F, aes(x=x, y=Count, group=TOOL, color=TOOL, linetype=HAPID)) +
@@ -241,7 +241,7 @@ ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, hei
 
 write("[analyze_VCF] Creating SyRI general graph (Non-log version) ...", stdout())
 FIG = get_fig(sINS_F, sDEL_F, opt$panname, "SyRI")
-sub_name = paste0(c("pan1c",opt$panname,"General","vcf","vg","png"), collapse=".")
+sub_name = paste0(c("pan1c",opt$panname,"General","vcf","syri","png"), collapse=".")
 ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
 
 #% Individual haplotypes plots
-- 
GitLab


From bbce95cde9d0b92bcdc780883ddc65cac8f18d8d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 15:43:27 +0200
Subject: [PATCH 086/310] Updated SyRI/VG INDEL figs rules
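
SyRI VCFs are now kept as per-haplotype files, and the vcf_fig_syri rule
parses the sample name and haplotype number back out of the file name. A
minimal sketch, assuming a hypothetical file panname.genome1.hap1.syri.vcf.gz:

    vcf=panname.genome1.hap1.syri.vcf.gz
    basename $vcf .syri.vcf.gz | cut -f2 -d'.'                   # -> genome1 (THAP)
    basename $vcf .syri.vcf.gz | cut -f3 -d'.' | cut -f2 -d'p'   # -> 1       (THAPN)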

---
 Snakefile | 75 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 43 insertions(+), 32 deletions(-)

diff --git a/Snakefile b/Snakefile
index 1f5e75d..60621cf 100644
--- a/Snakefile
+++ b/Snakefile
@@ -52,7 +52,7 @@ def which_analysis():
 
     if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each input assembly 
         analysis_inputs.append(
-            expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF)
+            expand("output/asm.syri.figs/"+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF)
         )
     if config["get_chrInputs_SyRI"] == "True": # Creating SyRI figures for each PGGB input
         analysis_inputs.append(
@@ -73,7 +73,8 @@ def which_analysis():
             )
     if config["get_VCF"] == "True": # VCF from the final graph against the "reference"
         analysis_inputs.append(
-            "output/vcf.figs.vg"
+            "output/vcf.figs.vg",
+            "output/vcf.figs.syri"
         ) 
     return analysis_inputs
 
@@ -255,8 +256,8 @@ rule SyRI_on_ASM:
         ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
         qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
     output:
-        fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png",
-        wrkdir=temp(directory('data/asm.syri/{haplotype}'))
+        fig="output/asm.syri.figs/"+config['name']+".{haplotype}.syri.png",
+        vcf="data/asm.syri/"+config['name']+".{haplotype}.syri.vcf.gz"
     log: 
         cmd="logs/SyRI_ASM/{haplotype}.SyRI_ASM.cmd.log",
         time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.time.log"
@@ -266,15 +267,18 @@ rule SyRI_on_ASM:
     retries: 1
     params:
         app_path=config["app.path"],
+        wrk_dir="data/asm.syri",
         plotsr_cfg="src/plotsr-base.cfg"
     shell:
         """
-        mkdir -p {output.wrkdir}
+        dir="{params.wrk_dir}/{wildcars.haplotype}"
+
+        mkdir -p $dir
         /usr/bin/time -v -o {log.time} \
             bash scripts/getSyriFigs.sh \
             -a {params.app_path} \
             -t {threads} \
-            -d {output.wrkdir} \
+            -d $dir \
             -o $(basename {output.fig}) \
             -r {input.ref} \
             -h 10 -w 20 -s "0.9" -f 10 \
@@ -282,7 +286,13 @@ rule SyRI_on_ASM:
             -q "{input.qry}" 2>&1 | \
             tee {log.cmd}
         
-        mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
+        mv $dir/$(basename {output.fig}) {output.fig}
+        mv $dir/*.vcf $(basename {output.vcf} .gz)
+
+        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
+            -@ {threads} $(basename {output.vcf} .gz)
+
+        rm -r $dir
         """
 
 """
@@ -293,31 +303,32 @@ rule SyRI_on_chrInput:
     input:
         fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz'
     output:
-        fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png",
-        wrkdir=temp(directory('data/chrInputs/syri/{chromosome}'))
+        fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.syri.png"
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000
     params:
         app_path=config["app.path"],
         ref=config['reference'],
+        wrk_dir="data/chrInput.syri",
         plotsr_cfg="src/plotsr-base.cfg"
     shell:
         """
-        mkdir {output.wrkdir}
+        dir="{params.wrk_dir}/{wildcards.chromosome}"
+
+        mkdir $dir
         refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1,2)
 
         ## Creating single fasta from multifasta
         zcat {input.fasta} | awk -F"#" \
-            '/^>/ {{OUT="{output.wrkdir}/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}'
+            '/^>/ {{OUT="$dir/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}'
 
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} {output.wrkdir}/*.fa
-        #zgrep '>' {output.wrkdir}/*.fa.gz
+            -@ {threads} $dir/*.fa
 
         ## Getting the list of sequences
         AllAsmList=()
-        for file in {output.wrkdir}/*.fa.gz; do
+        for file in $dir/*.fa.gz; do
             asm="$(basename $file .fa.gz | cut -f1,2 -d"#" | sed 's/#/\\.hap/').fa.gz"
             mv $file "$(dirname $file)/$asm"
             AllAsmList+=("$(dirname $file)/$asm")
@@ -328,14 +339,15 @@ rule SyRI_on_chrInput:
         bash scripts/getSyriFigs.sh \
             -a {params.app_path} \
             -t {threads} \
-            -d {output.wrkdir} \
+            -d $dir \
             -o $(basename {output.fig}) \
-            -r {output.wrkdir}/"${{refname}}.fa.gz" \
+            -r $dir/"${{refname}}.fa.gz" \
             -q "${{AllAsmList[*]}}" \
             -c {params.plotsr_cfg} \
             -h 10 -w 20 -s "0.9" -f 10
-        mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
-        rm {output.wrkdir}/*.fa
+
+        mv $dir/$(basename {output.fig}) {output.fig}
+        rm -r $dir
         """
 
 rule wfmash_on_chr:
@@ -795,9 +807,8 @@ rule vcf_fig_vg:
 rule vcf_fig_syri:
     # Produce a figure describing INS/DEL length distribution from syri
     input: 
-        flags=expand('data/asm.syri/{haplotype}', haplotype=SAMPLES_NOREF)
+        vcfs=expand("data/asm.syri/"+config['name']+".{haplotype}.syri.vcf.gz", haplotype=SAMPLES_NOREF)
     output:
-        inter_tsv=expand('data/asm.syri/{haplotype}.vcf.tsv', haplotype=SAMPLES_NOREF),
         tsv=temp("output/pan1c."+config['name']+".vcf.syri.tsv"),
         vcf_fig=directory("output/vcf.figs.syri")
     threads: 1
@@ -814,22 +825,22 @@ rule vcf_fig_syri:
 
         RHAP=$(basename {params.refname} .fa.gz | cut -f1 -d'.')
         RHAPN=$(basename {params.refname} .fa.gz | cut -f2 -d'.' | cut -f2 -d'p')
-        FOLDER=$(dirname {input.flags[0]})
+        FOLDER=$(dirname {input.vcfs[0]})
+
         ## Going through all folders
-        for folder in $FOLDER/*; do
-            if [ -d $folder ]; then
-                THAP=$(basename $folder | cut -f1 -d'.')
-                THAPN=$(basename $folder | cut -f2 -d'.' | cut -f2 -d'p')
-
-                # Producing intermediate TSVs
-                cat $folder/*.vcf | \
-                    awk -v THAP=$THAP -v THAPN=$THAPN -v RHAP=$RHAP -v RHAPN=$RHAPN -f scripts/vcf_2_tsv_syri.awk \
-                    > $FOLDER/$(basename $folder).vcf.tsv
+        for vcf in $FOLDER/*.vcf.gz; do
+            THAP=$(basename $vcf .syri.vcf.gz | cut -f2 -d'.')
+            THAPN=$(basename $vcf .syri.vcf.gz | cut -f3 -d'.' | cut -f2 -d'p')
+
+            # Producing intermediate TSVs
+            zcat $vcf | \
+                awk -v THAP=$THAP -v THAPN=$THAPN -v RHAP=$RHAP -v RHAPN=$RHAPN -f scripts/vcf_2_tsv_syri.awk \
+                > $FOLDER/$(basename $vcf .gz).tsv
             fi
         done
 
         ## Merging TSVs
-        head -n1 $FOLDER/$(basename $folder).vcf.tsv > {output.tsv}
+        head -n1 $FOLDER/$(basename $vcf .gz).tsv > {output.tsv}
         tail -n +2  -q $FOLDER/*.vcf.tsv >> {output.tsv}
 
         ## Running R to get the figures
@@ -893,7 +904,7 @@ def get_report_sections(wildcards):
 
     if config["get_ASMs_SyRI"] == "True":
         sections["SyRI_on_ASMs_figs"] = expand(
-            "output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", 
+            "output/asm.syri.figs/"+config['name']+".{haplotype}.syri.png", 
             haplotype=SAMPLES_NOREF
             )
 
-- 
GitLab


From 4b6f18505e415c8c1f30ae4a56dfa70cc4ba049e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 15:49:23 +0200
Subject: [PATCH 087/310] Update Snakefile

---
 Snakefile | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/Snakefile b/Snakefile
index 60621cf..c304df3 100644
--- a/Snakefile
+++ b/Snakefile
@@ -72,10 +72,9 @@ def which_analysis():
                 "output/pan1c."+config['name']+".report.md"
             )
     if config["get_VCF"] == "True": # VCF from the final graph against the "reference"
-        analysis_inputs.append(
-            "output/vcf.figs.vg",
-            "output/vcf.figs.syri"
-        ) 
+        analysis_inputs.append("output/vcf.figs.vg") 
+        analysis_inputs.append("output/vcf.figs.syri")
+        
     return analysis_inputs
 
 """
-- 
GitLab


From d29af35604eb8a2cf147a208f31490eb869ee0e7 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 15:55:59 +0200
Subject: [PATCH 088/310] Update Snakefile

---
 Snakefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Snakefile b/Snakefile
index c304df3..1f9744b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -56,7 +56,7 @@ def which_analysis():
         )
     if config["get_chrInputs_SyRI"] == "True": # Creating SyRI figures for each PGGB input
         analysis_inputs.append(
-            expand("output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", chromosome=CHRLIST)
+            expand("output/chrInput.syri.figs/"+config['name']+".{chromosome}.syri.png", chromosome=CHRLIST)
         )
     if config["run_Quast"] == "True": # Running Quast on input haplotypes
         analysis_inputs.append(
@@ -74,7 +74,7 @@ def which_analysis():
     if config["get_VCF"] == "True": # VCF from the final graph against the "reference"
         analysis_inputs.append("output/vcf.figs.vg") 
         analysis_inputs.append("output/vcf.figs.syri")
-        
+
     return analysis_inputs
 
 """
@@ -909,7 +909,7 @@ def get_report_sections(wildcards):
 
     if config["get_chrInputs_SyRI"] == "True":
         sections["SyRI_on_chrInputs_figs"] = expand(
-            "output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", 
+            "output/chrInput.syri.figs/"+config['name']+".{chromosome}.syri.png", 
             chromosome=CHRLIST
             )
 
-- 
GitLab


From 2fb5d5fd923746c43cbd5b1c97bafefc96502552 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 15:58:45 +0200
Subject: [PATCH 089/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 1f9744b..0d78d06 100644
--- a/Snakefile
+++ b/Snakefile
@@ -270,7 +270,7 @@ rule SyRI_on_ASM:
         plotsr_cfg="src/plotsr-base.cfg"
     shell:
         """
-        dir="{params.wrk_dir}/{wildcars.haplotype}"
+        dir="{params.wrk_dir}/{wildcards.haplotype}"
 
         mkdir -p $dir
         /usr/bin/time -v -o {log.time} \
-- 
GitLab


From 0d5182d814493517c81ba2433f4ae41fcd63ac24 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 16:10:39 +0200
Subject: [PATCH 090/310] Update Snakefile

---
 Snakefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 0d78d06..ec97afc 100644
--- a/Snakefile
+++ b/Snakefile
@@ -164,7 +164,7 @@ rule quast_stats:
         /usr/bin/time -v -o {log.time} \
             apptainer run {params.app_path}/pan1c-env.sif quast.py \
             -t {threads} \
-            -o "$(dirname {output.report})" \
+            -o {params.tmp_dir} \
             -r {input.ref} \
             --plots-format png \
             --no-read-stats \
@@ -263,7 +263,6 @@ rule SyRI_on_ASM:
     threads: 4
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000
-    retries: 1
     params:
         app_path=config["app.path"],
         wrk_dir="data/asm.syri",
-- 
GitLab


From 83c3575ec3efedf8349e2f57244343093a13fcac Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 16:15:09 +0200
Subject: [PATCH 091/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index ec97afc..706b462 100644
--- a/Snakefile
+++ b/Snakefile
@@ -314,7 +314,7 @@ rule SyRI_on_chrInput:
         """
         dir="{params.wrk_dir}/{wildcards.chromosome}"
 
-        mkdir $dir
+        mkdir -p $dir
         refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1,2)
 
         ## Creating single fasta from multifasta
-- 
GitLab


From ba2cbe5c9f99601e2dce327b39ea5cf36b6786b1 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 16:22:45 +0200
Subject: [PATCH 092/310] Update Snakefile
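
Shell variables are not expanded inside a single-quoted awk program, so the
old line built output paths containing a literal "$dir"; the directory is
now passed in with -v instead. A minimal sketch, assuming a hypothetical
FASTA header genome1#1#chr01:

    dir=data/chrInput.syri/chr01
    echo ">genome1#1#chr01" | awk -F"#" -v DIR=$dir \
        '/^>/ {print DIR "/" substr($0,2) ".fa"}'
    # -> data/chrInput.syri/chr01/genome1#1#chr01.fa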

---
 Snakefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Snakefile b/Snakefile
index 706b462..9edbc65 100644
--- a/Snakefile
+++ b/Snakefile
@@ -318,8 +318,8 @@ rule SyRI_on_chrInput:
         refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1,2)
 
         ## Creating single fasta from multifasta
-        zcat {input.fasta} | awk -F"#" \
-            '/^>/ {{OUT="$dir/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}'
+        zcat {input.fasta} | awk -F"#" -v DIR=$dir \
+            '/^>/ {{OUT= DIR "/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}'
 
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
             -@ {threads} $dir/*.fa
@@ -339,7 +339,7 @@ rule SyRI_on_chrInput:
             -t {threads} \
             -d $dir \
             -o $(basename {output.fig}) \
-            -r $dir/"${{refname}}.fa.gz" \
+            -r "${{dir}}/${{refname}}.fa.gz" \
             -q "${{AllAsmList[*]}}" \
             -c {params.plotsr_cfg} \
             -h 10 -w 20 -s "0.9" -f 10
-- 
GitLab


From 7078c4e5e7a7b8c2d45cb1ce796e2d03a1d94f43 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 17:17:35 +0200
Subject: [PATCH 093/310] Fixed plotsr error

---
 Snakefile           |  4 ++--
 src/plotsr-base.cfg | 13 +------------
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/Snakefile b/Snakefile
index 9edbc65..2bb0d5d 100644
--- a/Snakefile
+++ b/Snakefile
@@ -285,10 +285,10 @@ rule SyRI_on_ASM:
             tee {log.cmd}
         
         mv $dir/$(basename {output.fig}) {output.fig}
-        mv $dir/*.vcf $(basename {output.vcf} .gz)
+        mv $dir/*.vcf {params.wrk_dir}/$(basename {output.vcf} .gz)
 
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} $(basename {output.vcf} .gz)
+            -@ {threads} {params.wrk_dir}/$(basename {output.vcf} .gz)
 
         rm -r $dir
         """
diff --git a/src/plotsr-base.cfg b/src/plotsr-base.cfg
index 043db7a..ad942be 100644
--- a/src/plotsr-base.cfg
+++ b/src/plotsr-base.cfg
@@ -3,10 +3,6 @@ syncol:#CCCCCC
 invcol:#FFA500
 tracol:#9ACD32
 dupcol:#00BBFF
-synlwd:0                ## Line width for syntenic annotations
-invlwd:0.1              ## Line width for inversions
-tralwd:0.1              ## Line width for translocations
-duplwd:0.1              ## Line width for duplications
 alpha:0.8
 
 ## Margins and dimensions:
@@ -19,11 +15,4 @@ legend:T                ## To plot legend use T, use F to not plot legend
 genlegcol:4            ## Number of columns for genome legend, set -1 for automatic setup
 bbox:0,1.01,0.5,0.3		## [Left edge, bottom edge, width, height]
 bbox_v:0,1.1,0.5,0.3	## For vertical chromosomes (using -v option)
-bboxmar:0.5             ## Margin between genome and annotation legends
-
-## Tracks
-norm:T                  ## For each chromosome, independently normalise the y-axis of tracks. Use T for normalising independently, and F to normalise based on max value across all chromosomes
-
-## Axis
-maxl:-1                 ## Manually set maximum chromosome position. Use `-1` for automatic selection. Does not work with --itx
-genname:T               ## Write genome names adjacent to the chromosome (T) or not (F)
\ No newline at end of file
+bboxmar:0.5             ## Margin between genome and annotation legends
\ No newline at end of file
-- 
GitLab


From 8d1ca1275fa700740b7a1791d2d19428e84f5ce4 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 7 Aug 2024 20:33:15 +0200
Subject: [PATCH 094/310] Update Snakefile

---
 Snakefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 2bb0d5d..2977338 100644
--- a/Snakefile
+++ b/Snakefile
@@ -834,7 +834,6 @@ rule vcf_fig_syri:
             zcat $vcf | \
                 awk -v THAP=$THAP -v THAPN=$THAPN -v RHAP=$RHAP -v RHAPN=$RHAPN -f scripts/vcf_2_tsv_syri.awk \
                 > $FOLDER/$(basename $vcf .gz).tsv
-            fi
         done
 
         ## Merging TSVs
-- 
GitLab


From fe26374da39a30f8a140f66f75c3eb8f04f0c3bb Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 09:48:57 +0200
Subject: [PATCH 095/310] New vcf figures
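
The per-haplotype SyRI TSVs are now merged with a header-preserving
concatenation before plotting. As a standalone sketch with hypothetical
paths:

    head -n1 data/asm.syri/first.syri.vcf.tsv  > merged.tsv   # keep one header line
    tail -n +2 -q data/asm.syri/*.vcf.tsv     >> merged.tsv   # append rows, headers stripped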

---
 Snakefile | 93 ++++++++++++++++++++++++-------------------------------
 1 file changed, 41 insertions(+), 52 deletions(-)

diff --git a/Snakefile b/Snakefile
index 2977338..851a85f 100644
--- a/Snakefile
+++ b/Snakefile
@@ -72,8 +72,7 @@ def which_analysis():
                 "output/pan1c."+config['name']+".report.md"
             )
     if config["get_VCF"] == "True": # VCF from the final graph against the "reference"
-        analysis_inputs.append("output/vcf.figs.vg") 
-        analysis_inputs.append("output/vcf.figs.syri")
+        analysis_inputs.append("output/vcf.figs") 
 
     return analysis_inputs
 
@@ -772,13 +771,13 @@ rule vg_deconstruct:
                 2> >(tee {log.cmd} >&2)
         """
 
-rule vcf_fig_vg:
-    # Produce a figure describing INS/DEL length distribution from vg deconstruct
+rule vcf_fig:
+    # Produce a figure describing INS/DEL length distribution from vg deconstruct and SyRI
     input:
-        vcf="output/pan1c."+config['name']+".vcf.gz"
+        vg="output/pan1c."+config['name']+".vcf.gz"
+        syris=expand("data/asm.syri/"+config['name']+".{haplotype}.syri.vcf.gz", haplotype=SAMPLES_NOREF)
     output:
-        tsv=temp("output/pan1c."+config['name']+".vcf.tsv"),
-        vcf_fig=directory("output/vcf.figs.vg")
+        vcf_fig=directory("output/vcf.figs")
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
@@ -788,43 +787,13 @@ rule vcf_fig_vg:
         pan_name=config['name']
     shell:
         """
-        mkdir {output.vcf_fig}
-
-        ## Producing TSV for the figures
-        zcat {input} | awk -f scripts/vcf_2_tsv_vg.awk > {output.tsv}
-
-        ## Running R to get the figures
-        apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF.R \
-            -t {output.tsv} \
-            -o {output.vcf_fig} \
-            -p {params.pan_name} \
-            -T "VG" \
-            {params.fig_config}
-        """
-
-rule vcf_fig_syri:
-    # Produce a figure describing INS/DEL length distribution from syri
-    input: 
-        vcfs=expand("data/asm.syri/"+config['name']+".{haplotype}.syri.vcf.gz", haplotype=SAMPLES_NOREF)
-    output:
-        tsv=temp("output/pan1c."+config['name']+".vcf.syri.tsv"),
-        vcf_fig=directory("output/vcf.figs.syri")
-    threads: 1
-    resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
-    params:
-        app_path=config['app.path'],
-        fig_config=config['vcf_fig.params'],
-        pan_name=config['name'],
-        refname=config['reference']
-    shell:
-        """
-        mkdir {output.vcf_fig}
+        mkdir -p {output.vcf_fig}
 
         RHAP=$(basename {params.refname} .fa.gz | cut -f1 -d'.')
         RHAPN=$(basename {params.refname} .fa.gz | cut -f2 -d'.' | cut -f2 -d'p')
         FOLDER=$(dirname {input.vcfs[0]})
 
+        #% SyRI VCF
         ## Going through all folders
         for vcf in $FOLDER/*.vcf.gz; do
             THAP=$(basename $vcf .syri.vcf.gz | cut -f2 -d'.')
@@ -837,16 +806,24 @@ rule vcf_fig_syri:
         done
 
         ## Merging TSVs
-        head -n1 $FOLDER/$(basename $vcf .gz).tsv > {output.tsv}
-        tail -n +2  -q $FOLDER/*.vcf.tsv >> {output.tsv}
+        head -n1 $FOLDER/$(basename $vcf .gz).tsv > {output.vcf_fig}/syri.tsv
+        tail -n +2  -q $FOLDER/*.vcf.tsv >> {output.vcf_fig}/syri.tsv
+
+        rm $FOLDER/*.tsv
+
+        #% VG VCF
+        ## Producing TSV for the figures
+        zcat {input} | awk -f scripts/vcf_2_tsv_vg.awk > {output.vcf_fig}/vg.tsv
 
-        ## Running R to get the figures
-        apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF.R \
-            -t {output.tsv} \
+        #% Running R to get the figures
+        apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF_v2.R \
+            -v {output.vcf_fig}/vg.tsv \
+            -s {output.vcf_fig}/syri.tsv \
             -o {output.vcf_fig} \
             -p {params.pan_name} \
-            -T "SyRI" \
             {params.fig_config}
+
+        rm {output.vcf_fig}/*.tsv
         """
 
 rule create_pan1c_report_fig:
@@ -912,8 +889,7 @@ def get_report_sections(wildcards):
             )
 
     if config['get_VCF'] == "True":
-        sections['VCF_fig_vg'] = "output/vcf.figs.vg"
-        sections['VCF_fig_syri'] = "output/vcf.figs.syri"
+        sections['VCF_figs'] = "output/vcf.figs"
 
     return sections      
 
@@ -1007,17 +983,30 @@ rule create_pan1c_report:
         if params.add_VCF_fig == "True":
             shell("echo '# INS/DEL length distribution' >> {output.report}")
             figures = [
-                fig for fig in os.listdir(input.VCF_fig_vg)
+                fig for fig in os.listdir(input.VCF_figs)
                 if fig[-3:] == "png"
                 and fig.split('.')[2][:7] != "General"
             ]
             figures.sort()
-            figures = ["pan1c."+config["name"]+".General.vcf.png", "pan1c."+config["name"]+".General_log.vcf.png"] + figures
+
+            # Adding back 'general' figures
+            figures = [
+                f"pan1c."+config["name"]+".{fig_type}.vcf.{tool}.png" 
+                for fig_type in ["General", "General_log"]
+                for tool in ["vg", "syri"]
+            ] + figures
+
+            general_fig = None
             for basename in figures:
                 name = basename.split('.')[2]
-                shell("echo '## {name}' >> {output.report}")
-                shell("echo '![{basename}](./vcf.figs.vg/{basename})' >> {output.report}")
-                shell("echo '![{basename}](./vcf.figs.syri/{basename})' >> {output.report}")
+
+                if name[:7] == "General" and general_fig is None :
+                    shell("echo '## {name}' >> {output.report}")
+                    general_fig = name
+                else :
+                    general_fig = None
+
+                shell("echo '![{basename}](./vcf.figs/{basename})' >> {output.report}")
                 shell("echo '' >> {output.report}")
 
         # Converting to HTML
-- 
GitLab


From d44512f8d8b2f1fbee6211a3c0159bdda2d77a41 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 09:51:28 +0200
Subject: [PATCH 096/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 851a85f..8410822 100644
--- a/Snakefile
+++ b/Snakefile
@@ -774,7 +774,7 @@ rule vg_deconstruct:
 rule vcf_fig:
     # Produce a figure describing INS/DEL length distribution from vg deconstruct and SyRI
     input:
-        vg="output/pan1c."+config['name']+".vcf.gz"
+        vg="output/pan1c."+config['name']+".vcf.gz",
         syris=expand("data/asm.syri/"+config['name']+".{haplotype}.syri.vcf.gz", haplotype=SAMPLES_NOREF)
     output:
         vcf_fig=directory("output/vcf.figs")
-- 
GitLab


From 3e35f069b5c0843495384c1f59af51f3e0dd7b89 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 09:55:59 +0200
Subject: [PATCH 097/310] Update Snakefile

---
 Snakefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 8410822..20f50e9 100644
--- a/Snakefile
+++ b/Snakefile
@@ -784,7 +784,8 @@ rule vcf_fig:
     params:
         app_path=config['app.path'],
         fig_config=config['vcf_fig.params'],
-        pan_name=config['name']
+        pan_name=config['name'],
+        refname=config['reference']
     shell:
         """
         mkdir -p {output.vcf_fig}
-- 
GitLab


From c34b68069cd7d1e198898d3a20c2faf3dd3344ce Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 09:58:47 +0200
Subject: [PATCH 098/310] Update Snakefile

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 20f50e9..4ca851b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -792,7 +792,7 @@ rule vcf_fig:
 
         RHAP=$(basename {params.refname} .fa.gz | cut -f1 -d'.')
         RHAPN=$(basename {params.refname} .fa.gz | cut -f2 -d'.' | cut -f2 -d'p')
-        FOLDER=$(dirname {input.vcfs[0]})
+        FOLDER=$(dirname {input.syris[0]})
 
         #% SyRI VCF
         ## Going through all folders
@@ -814,7 +814,7 @@ rule vcf_fig:
 
         #% VG VCF
         ## Producing TSV for the figures
-        zcat {input} | awk -f scripts/vcf_2_tsv_vg.awk > {output.vcf_fig}/vg.tsv
+        zcat {input.vg} | awk -f scripts/vcf_2_tsv_vg.awk > {output.vcf_fig}/vg.tsv
 
         #% Running R to get the figures
         apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF_v2.R \
-- 
GitLab


From f51187c7d50f2ff1e5301a26f091fa07b7f4a8bb Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 10:08:59 +0200
Subject: [PATCH 099/310] Update analyze_VCF_v2.R

---
 scripts/analyze_VCF_v2.R | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/analyze_VCF_v2.R b/scripts/analyze_VCF_v2.R
index 5657140..1fe91f0 100644
--- a/scripts/analyze_VCF_v2.R
+++ b/scripts/analyze_VCF_v2.R
@@ -133,7 +133,7 @@ get_fig_log = function(INS_F, DEL_F, top_name, tool_name){
 
     # Deletion figure
     figB = ggplot(DEL_F, aes(x=x)) +
-	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
+	geom_line(aes(y=Count, group=interaction(HAP, HAPID), color=HAP, linetype=HAPID)) +
 	scale_y_continuous(trans='log10') +
     expand_limits(x=c(0)) +
 	scale_x_discrete(
@@ -154,7 +154,7 @@ get_fig_log = function(INS_F, DEL_F, top_name, tool_name){
 get_fig = function(INS_F, DEL_F, top_name, tool_name){
     # Insertion figure
     figA = ggplot(INS_F, aes(x=x)) +
-	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
+	geom_line(aes(y=Count, group=interaction(HAP, HAPID), color=HAP, linetype=HAPID)) +
 	scale_y_continuous(position = "right") +
     expand_limits(x=c(0)) +
 	scale_x_discrete(
@@ -166,7 +166,7 @@ get_fig = function(INS_F, DEL_F, top_name, tool_name){
 
     # Deletion figure
     figB = ggplot(DEL_F, aes(x=x)) +
-	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
+	geom_line(aes(y=Count, group=interaction(HAP, HAPID), color=HAP, linetype=HAPID)) +
     expand_limits(x=c(0)) +
 	scale_x_discrete(
 		breaks=-seq(0,5),
@@ -190,7 +190,7 @@ get_fig_single = function(vINS_F, vDEL_F, sINS_F, sDEL_F, top_name){
     sDEL_F$TOOL = rep("SyRI", nrow(sDEL_F))
 
     # Insertion figure
-    figA = ggplot(vINS_F, aes(x=x, y=Count, group=TOOL, color=TOOL, linetype=HAPID)) +
+    figA = ggplot(vINS_F, aes(x=x, y=Count, group=interaction(TOOL, HAPID), color=TOOL, linetype=HAPID)) +
 	geom_line() +
     geom_line(data = sINS_F) +
 	scale_y_continuous(position = "right") +
-- 
GitLab


From 8969e50e3302b9b53cecc77d8621cba69989e4c0 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 10:41:40 +0200
Subject: [PATCH 100/310] Update analyze_VCF_v2.R

---
 scripts/analyze_VCF_v2.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/analyze_VCF_v2.R b/scripts/analyze_VCF_v2.R
index 1fe91f0..708f712 100644
--- a/scripts/analyze_VCF_v2.R
+++ b/scripts/analyze_VCF_v2.R
@@ -121,7 +121,7 @@ get_legend<-function(myggplot){
 get_fig_log = function(INS_F, DEL_F, top_name, tool_name){
     # Insertion figure
     figA = ggplot(INS_F, aes(x=x)) +
-	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
+	geom_line(aes(y=Count, group=interaction(HAP, HAPID), color=HAP, linetype=HAPID)) +
 	scale_y_continuous(trans='log10', position = "right") +
     expand_limits(x=c(0)) +
 	scale_x_discrete(
-- 
GitLab


From 099a816136a4b0ff44d48e3610bef88e3074a27d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 10:48:07 +0200
Subject: [PATCH 101/310] Update analyze_VCF_v2.R

---
 scripts/analyze_VCF_v2.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/analyze_VCF_v2.R b/scripts/analyze_VCF_v2.R
index 708f712..5a66710 100644
--- a/scripts/analyze_VCF_v2.R
+++ b/scripts/analyze_VCF_v2.R
@@ -203,7 +203,7 @@ get_fig_single = function(vINS_F, vDEL_F, sINS_F, sDEL_F, top_name){
 	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
 
     # Deletion figure
-    figB = ggplot(vDEL_F, aes(x=x, y=Count, group=TOOL, color=TOOL, linetype=HAPID)) +
+    figB = ggplot(vDEL_F, aes(x=x, y=Count, group=interaction(TOOL, HAPID), color=TOOL, linetype=HAPID)) +
     geom_line() +
 	geom_line(data = sDEL_F) +
     expand_limits(x=c(0)) +
-- 
GitLab


From 95b398ac16252d37bf9a490b6aa63ef0d761039e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 11:11:28 +0200
Subject: [PATCH 102/310] Update Snakefile

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 4ca851b..5160d1e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -973,7 +973,7 @@ rule create_pan1c_report:
             shell("echo '# SyRI on input assemblies' >> {output.report}")
             for fig in fig_list:
                 basename = os.path.basename(fig)
-                hap_name = basename.split('.')[2:4]
+                hap_name = basename.split('.')[1:3]
 
                 shell("echo '## {hap_name[0]}, {hap_name[1]}' >> {output.report}")
                 shell("echo '![{basename}](./asm.syri.figs/{basename})' >> {output.report}")
@@ -992,7 +992,7 @@ rule create_pan1c_report:
 
             # Adding back 'general' figures
             figures = [
-                f"pan1c."+config["name"]+".{fig_type}.vcf.{tool}.png" 
+                f"pan1c.{config['name']}.{fig_type}.vcf.{tool}.png" 
                 for fig_type in ["General", "General_log"]
                 for tool in ["vg", "syri"]
             ] + figures
-- 
GitLab


From 19098028f863ab38551daeb70680742dcad4510e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 11:18:52 +0200
Subject: [PATCH 103/310] Update Snakefile

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 5160d1e..bebcc53 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1001,9 +1001,9 @@ rule create_pan1c_report:
             for basename in figures:
                 name = basename.split('.')[2]
 
-                if name[:7] == "General" and general_fig is None :
+                if general_fig is None :
                     shell("echo '## {name}' >> {output.report}")
-                    general_fig = name
+                    if name[:7] == "General": general_fig = name
                 else :
                     general_fig = None
 
-- 
GitLab


From 2c111b0799ddd61f348841a3dd2354a37bda984e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 13:52:23 +0200
Subject: [PATCH 104/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index bebcc53..58fa815 100644
--- a/Snakefile
+++ b/Snakefile
@@ -780,7 +780,7 @@ rule vcf_fig:
         vcf_fig=directory("output/vcf.figs")
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
     params:
         app_path=config['app.path'],
         fig_config=config['vcf_fig.params'],
-- 
GitLab


From 5ed049f4bb410bb635829d57410d81dc2e16195d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 16:19:30 +0200
Subject: [PATCH 105/310] Changed big genome tip

---
 config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config.yaml b/config.yaml
index 20a6240..e776404 100644
--- a/config.yaml
+++ b/config.yaml
@@ -13,7 +13,7 @@ mem_multiplier: 1
 # Core parameters
 # RagTag parameters
 ragtag_mm2_conf: '-x asm5'
-## Add -f 0.0002 for large genomes
+## Add -f 0.02 for large genomes
 
 # Wfmash alignement parameters :
 wfmash.segment_length: 10000
-- 
GitLab


From 9cd39a2191a0db1c71e482c8df609977527ec73a Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 16:34:54 +0200
Subject: [PATCH 106/310] Update plotsr-base.cfg

---
 src/plotsr-base.cfg | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/plotsr-base.cfg b/src/plotsr-base.cfg
index ad942be..394e11c 100644
--- a/src/plotsr-base.cfg
+++ b/src/plotsr-base.cfg
@@ -12,7 +12,7 @@ marginchr:0.1           ## Margin between adjacent chromosomes when using --itx
 
 ## Legend
 legend:T                ## To plot legend use T, use F to not plot legend
-genlegcol:4            ## Number of columns for genome legend, set -1 for automatic setup
+genlegcol:5            ## Number of columns for genome legend, set -1 for automatic setup
 bbox:0,1.01,0.5,0.3		## [Left edge, bottom edge, width, height]
 bbox_v:0,1.1,0.5,0.3	## For vertical chromosomes (using -v option)
-bboxmar:0.5             ## Margin between genome and annotation legends
\ No newline at end of file
+bboxmar:0.8             ## Margin between genome and annotation legends
\ No newline at end of file
-- 
GitLab


From 0102452b7d63ab2d9e437b6cff7188fdb10977de Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 17:12:03 +0200
Subject: [PATCH 107/310] Fixed vg deconstruct command

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 58fa815..d5cc1bf 100644
--- a/Snakefile
+++ b/Snakefile
@@ -763,7 +763,7 @@ rule vg_deconstruct:
         """
         /usr/bin/time -v -o {log.time} \
             apptainer run --app vg {params.app_path}/PanGeTools.sif \
-                deconstruct -e \
+                deconstruct -a \
                 -P $(echo {params.ref} | cut -f1 -d'.') \
                 {input.graph} \
                 -t {threads} -v \
-- 
GitLab


From 2f890a405b7f7bb7e44fb85978cb817e62d37299 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 17:58:45 +0200
Subject: [PATCH 108/310] Forcing compression

---
 Snakefile       | 27 ++++++++++++++++-----------
 rules/tools.smk | 14 ++++++++++++++
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/Snakefile b/Snakefile
index d5cc1bf..b942d23 100644
--- a/Snakefile
+++ b/Snakefile
@@ -89,7 +89,7 @@ Rules   ------------------------------------------------------------------------
 # Main target rule
 rule all:
     input:
-        "output/pan1c."+config['name']+".gfa", # Final graph (main output)
+        "output/pan1c."+config['name']+".gfa.gz", # Final graph (main output)
         "output/pan1c."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line)
         which_analysis()
 
@@ -354,7 +354,9 @@ rule wfmash_on_chr:
         fai='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai'
     output:
         mapping=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf"),
-        aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf")
+        aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf"),
+        mapping_gz="data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf.gz",
+        aln_gz="data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf.gz"
     threads: 16
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
@@ -405,7 +407,8 @@ rule seqwish:
         fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz',
         aln=rules.wfmash_on_chr.output.aln
     output:
-        temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa")
+        gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa"),
+        gfa_gz="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa.gz"
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -420,22 +423,23 @@ rule seqwish:
         """
         /usr/bin/time -v -o {log.time} \
             apptainer exec {params.app_path}/PanGeTools.sif seqwish \
-            -s {input.fa} -p {input.aln} -g {output} \
+            -s {input.fa} -p {input.aln} -g {output.gfa} \
             {params.seqwish} -t {threads} \
-            --temp-dir $(dirname {output}) -P 2>&1 | \
+            --temp-dir $(dirname {output.gfa}) -P 2>&1 | \
             tee {log.cmd}
 
         # Compressing
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} -k {output}
+            -@ {threads} -k {output.gfa}
         """
 
 rule gfaffix_on_chr:
     # Run gfaffix on seqwish graph
     input:
-        rules.seqwish.output
+        rules.seqwish.output.gfa
     output:
         gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa"),
+        gfa_gz="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa",
         transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt"
     threads: 1
     resources:
@@ -462,7 +466,8 @@ rule odgi_postprocessing:
     input:
         rules.gfaffix_on_chr.output.gfa
     output:
-        gfa=temp('data/chrGraphs/'+config['name']+'.{chromosome}.gfa')
+        gfa=temp('data/chrGraphs/'+config['name']+'.{chromosome}.gfa'),
+        gfa_gz='data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz'
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -543,7 +548,7 @@ rule graph_squeeze:
         glist="data/chrGraphs/graphsList.txt",
         graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST)
     output:
-        "output/pan1c."+config['name']+".gfa"
+        gfa=temp("output/pan1c."+config['name']+".gfa")
     log: 
         cmd="logs/squeeze/"+config['name']+".squeeze.cmd.log",
         time="logs/squeeze/"+config['name']+".squeeze.time.log",
@@ -557,11 +562,11 @@ rule graph_squeeze:
         """
         /usr/bin/time -v -o {log.time} \
             apptainer run --app odgi {params.app_path}/PanGeTools.sif \
-            squeeze -t {threads} -O -P -f {input.glist} -o {output}.og 2>&1 | \
+            squeeze -t {threads} -O -P -f {input.glist} -o {output.gfa}.og 2>&1 | \
             tee {log.cmd}
 
         apptainer run --app odgi {params.app_path}/PanGeTools.sif \
-            view -t {threads} -P -i {output}.og -g > {output}
+            view -t {threads} -P -i {output.gfa}.og -g > {output}
 
         rm {output}.og
         """
diff --git a/rules/tools.smk b/rules/tools.smk
index 117f65e..35d02cb 100644
--- a/rules/tools.smk
+++ b/rules/tools.smk
@@ -42,6 +42,20 @@ rule run_bgzip:
             -@ {threads} {input}
         """
 
+rule uncompress:
+    # Run BGZIP on the file
+    input: 
+        "{file}.gz"
+    output:
+        "{file}"
+    threads: 1
+    resources:
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
+    shell:
+        """
+        gzip -d -k {input}
+        """
+
 rule gfa_2_xg:
     # Convert a GFA to XG
     input:
-- 
GitLab


From 486e752a31856df0a431b992a95716ff0ba305a0 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 18:06:02 +0200
Subject: [PATCH 109/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index b942d23..e651a4a 100644
--- a/Snakefile
+++ b/Snakefile
@@ -439,7 +439,7 @@ rule gfaffix_on_chr:
         rules.seqwish.output.gfa
     output:
         gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa"),
-        gfa_gz="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa",
+        gfa_gz="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa.gz",
         transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt"
     threads: 1
     resources:
-- 
GitLab


From f6f1bae2d0c2e335f092b5833e5f11aad7f0bb9e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 8 Aug 2024 18:07:22 +0200
Subject: [PATCH 110/310] Update tools.smk

---
 rules/tools.smk | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/rules/tools.smk b/rules/tools.smk
index 35d02cb..117f65e 100644
--- a/rules/tools.smk
+++ b/rules/tools.smk
@@ -42,20 +42,6 @@ rule run_bgzip:
             -@ {threads} {input}
         """
 
-rule uncompress:
-    # Run BGZIP on the file
-    input: 
-        "{file}.gz"
-    output:
-        "{file}"
-    threads: 1
-    resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
-    shell:
-        """
-        gzip -d -k {input}
-        """
-
 rule gfa_2_xg:
     # Convert a GFA to XG
     input:
-- 
GitLab


From c6261504f48b4d6fd8d7998a98c5abc97f1bff4b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Fri, 9 Aug 2024 13:46:20 +0200
Subject: [PATCH 111/310] Tagging when squeezing

---
 Snakefile | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/Snakefile b/Snakefile
index e651a4a..4735291 100644
--- a/Snakefile
+++ b/Snakefile
@@ -546,7 +546,8 @@ rule graph_squeeze:
     # Using odgi to merge every subgraphs into a final one
     input:
         glist="data/chrGraphs/graphsList.txt",
-        graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST)
+        graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST),
+        tags="output/pan1c."+config['name']+".gfa.metadata"
     output:
         gfa=temp("output/pan1c."+config['name']+".gfa")
     log: 
@@ -569,6 +570,8 @@ rule graph_squeeze:
             view -t {threads} -P -i {output.gfa}.og -g > {output}
 
         rm {output}.og
+
+        sed -i '/^H/r {input.tags}' {output}
         """
 
 rule graph_stats:
@@ -640,7 +643,7 @@ rule aggregate_graphs_stats:
 rule final_graph_tagging:
     # Add metadata to the final GFA
     input:
-        graph="output/pan1c."+config['name']+".gfa",
+        "config.yaml"
     output:
         "output/pan1c."+config['name']+".gfa.metadata"
     threads: 1
@@ -654,7 +657,6 @@ rule final_graph_tagging:
         """
         python scripts/getTags.py \
             --appdir {params.app_path} --config-file config.yaml > {output}
-        sed -i '/^H/r {output}' {input.graph}
         """
 
 rule pggb_input_stats:
-- 
GitLab
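
The sed call added above ("sed -i '/^H/r {input.tags}' ...") injects the metadata file immediately after every GFA header line, i.e. every line starting with H. A rough Python equivalent of that sed behaviour, assuming a plain-text tags file:

    def tag_gfa(gfa_path, tags_path):
        """Insert the contents of tags_path after each line starting with 'H'."""
        with open(tags_path) as fh:
            tags = fh.read()
            if not tags.endswith("\n"):
                tags += "\n"
        with open(gfa_path) as fh:
            lines = fh.readlines()
        with open(gfa_path, "w") as fh:
            for line in lines:
                fh.write(line)
                if line.startswith("H"):
                    fh.write(tags)

    # tag_gfa("output/pan1c.demo.gfa", "output/pan1c.demo.gfa.metadata")  # hypothetical paths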


From ff0ffa01d855e7ca0e98e3b69f0a03bcd03b014b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Fri, 9 Aug 2024 13:48:55 +0200
Subject: [PATCH 112/310] Producing Tags once

---
 Snakefile | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/Snakefile b/Snakefile
index 4735291..baae0c3 100644
--- a/Snakefile
+++ b/Snakefile
@@ -464,7 +464,8 @@ rule gfaffix_on_chr:
 rule odgi_postprocessing:
     # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph
     input:
-        rules.gfaffix_on_chr.output.gfa
+        gfa=rules.gfaffix_on_chr.output.gfa,
+        tags="output/pan1c."+config['name']+".gfa.metadata"
     output:
         gfa=temp('data/chrGraphs/'+config['name']+'.{chromosome}.gfa'),
         gfa_gz='data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz'
@@ -485,12 +486,12 @@ rule odgi_postprocessing:
         time_view="logs/pggb/{chromosome}.odgi_view.time.log"
     shell:
         """
-        OGfile="$(dirname {input})/$(basename {input} .gfa)"
+        OGfile="$(dirname {input.gfa})/$(basename {input.gfa} .gfa)"
 
         ## Converting to odgi format and optimizing namespace
         /usr/bin/time -v -o {log.time_build} \
             apptainer run --app odgi {params.app_path}/PanGeTools.sif \
-            build -t {threads} -P -g {input} -o $OGfile.og -O 2>&1 | \
+            build -t {threads} -P -g {input.gfa} -o $OGfile.og -O 2>&1 | \
             tee {log.cmd_build}
         
         ## Unchoping the nodes (merges unitigs into single nodes)
@@ -514,13 +515,10 @@ rule odgi_postprocessing:
             2> >(tee {log.cmd_view} >&2) 
 
         ## Removing .og files for space savings
-        rm $(dirname {input})/*.og
+        rm $(dirname {input.gfa})/*.og
 
         ## Adding metadata
-        python scripts/getTags.py \
-            --appdir {params.app_path} --config-file config.yaml \
-            > "$(dirname {input})/metadata.txt"
-        sed -i "/^H/r $(dirname {input})/metadata.txt" {output.gfa}
+        sed -i '/^H/r {input.tags}' {output.gfa}
 
         # Compressing
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-- 
GitLab


From b85a22402bd4bbaa19a6c3f94270729afc88f116 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Fri, 9 Aug 2024 18:03:46 +0200
Subject: [PATCH 113/310] Update Snakefile

---
 Snakefile | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/Snakefile b/Snakefile
index baae0c3..983b7ad 100644
--- a/Snakefile
+++ b/Snakefile
@@ -547,7 +547,8 @@ rule graph_squeeze:
         graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST),
         tags="output/pan1c."+config['name']+".gfa.metadata"
     output:
-        gfa=temp("output/pan1c."+config['name']+".gfa")
+        gfa=temp("output/pan1c."+config['name']+".gfa"),
+        gfa_gz="output/pan1c."+config['name']+".gfa.gz"
     log: 
         cmd="logs/squeeze/"+config['name']+".squeeze.cmd.log",
         time="logs/squeeze/"+config['name']+".squeeze.time.log",
@@ -565,11 +566,16 @@ rule graph_squeeze:
             tee {log.cmd}
 
         apptainer run --app odgi {params.app_path}/PanGeTools.sif \
-            view -t {threads} -P -i {output.gfa}.og -g > {output}
+            view -t {threads} -P -i {output.gfa}.og -g > {output.gfa}
 
-        rm {output}.og
+        rm {output.gfa}.og
 
-        sed -i '/^H/r {input.tags}' {output}
+        # Tagging
+        sed -i '/^H/r {input.tags}' {output.gfa}
+
+        # Compressing
+        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
+            -@ {threads} -k {output.gfa}
         """
 
 rule graph_stats:
-- 
GitLab


From 8c39f1e186a37b8ccca62a50827aee4c4ab8550f Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 12 Aug 2024 12:35:50 +0200
Subject: [PATCH 114/310] Compressing output by default

---
 Snakefile | 48 +++++++++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/Snakefile b/Snakefile
index 983b7ad..aaa1b58 100644
--- a/Snakefile
+++ b/Snakefile
@@ -405,7 +405,7 @@ rule seqwish:
     # Run seqwish on alignement produced by wfmash
     input:
         fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz',
-        aln=rules.wfmash_on_chr.output.aln
+        aln=rules.wfmash_on_chr.output.aln_gz
     output:
         gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa"),
         gfa_gz="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa.gz"
@@ -423,22 +423,21 @@ rule seqwish:
         """
         /usr/bin/time -v -o {log.time} \
             apptainer exec {params.app_path}/PanGeTools.sif seqwish \
-            -s {input.fa} -p {input.aln} -g {output.gfa} \
+            -s {input.fa} -p {input.aln} -g $(basename {output.gfa_gz} .gz) \
             {params.seqwish} -t {threads} \
-            --temp-dir $(dirname {output.gfa}) -P 2>&1 | \
+            --temp-dir $(dirname {output.gfa_gz}) -P 2>&1 | \
             tee {log.cmd}
 
         # Compressing
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} -k {output.gfa}
+            -@ {threads} $(basename {output.gfa_gz} .gz)
         """
 
 rule gfaffix_on_chr:
     # Run gfaffix on seqwish graph
     input:
-        rules.seqwish.output.gfa
+        rules.seqwish.output.gfa_gz
     output:
-        gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa"),
         gfa_gz="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa.gz",
         transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt"
     threads: 1
@@ -451,23 +450,27 @@ rule gfaffix_on_chr:
         time="logs/pggb/{chromosome}.gfaffix.time.log"
     shell:
         """
+        # Decompressing Seqwish graph
+        gzip -d -k {input}
+
         /usr/bin/time -v -o {log.time} \
             apptainer exec {params.app_path}/PanGeTools.sif gfaffix \
-            {input} -o {output.gfa} -t {output.transform} \
+            $(basename {input} .gz) -o {output.gfa} -t {output.transform} \
             > /dev/null
 
         # Compressing
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} -k {output.gfa}
+            -@ {threads} {output.gfa}
+
+        rm $(basename {input} .gz)
         """
 
 rule odgi_postprocessing:
     # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph
     input:
-        gfa=rules.gfaffix_on_chr.output.gfa,
+        gfa=rules.gfaffix_on_chr.output.gfa_gz,
         tags="output/pan1c."+config['name']+".gfa.metadata"
     output:
-        gfa=temp('data/chrGraphs/'+config['name']+'.{chromosome}.gfa'),
         gfa_gz='data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz'
     threads: 8
     resources:
@@ -486,12 +489,14 @@ rule odgi_postprocessing:
         time_view="logs/pggb/{chromosome}.odgi_view.time.log"
     shell:
         """
-        OGfile="$(dirname {input.gfa})/$(basename {input.gfa} .gfa)"
+        OGfile="$(dirname {input.gfa})/$(basename {input.gfa} .gfa.gz)"
+
+        gzip -d -k {input.gfa}
 
         ## Converting to odgi format and optimizing namespace
         /usr/bin/time -v -o {log.time_build} \
             apptainer run --app odgi {params.app_path}/PanGeTools.sif \
-            build -t {threads} -P -g {input.gfa} -o $OGfile.og -O 2>&1 | \
+            build -t {threads} -P -g $(basename {input.gfa} .gz) -o $OGfile.og -O 2>&1 | \
             tee {log.cmd_build}
         
         ## Unchoping the nodes (merges unitigs into single nodes)
@@ -522,29 +527,34 @@ rule odgi_postprocessing:
 
         # Compressing
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} -k {output.gfa}
+            -@ {threads} {output.gfa}
         """
 
 rule generate_graph_list:
     # Generate a text file containing all created graphs
     input:
-        expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST)
+        expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz', chromosome=CHRLIST)
     output:
-        "data/chrGraphs/graphsList.txt"
+        "data/chrGraphs/graphsList.txt",
+        temp(expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST))
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
     priority: 100
     run:
         with open(output[0], "w") as handle:
             for file in input:
-                handle.write(file+"\n")
+                handle.write(file[:-3]+"\n")
+        
+        for file in input:
+            shell("gzip -d -k {file}")
+
 
 rule graph_squeeze:
     # Using odgi to merge every subgraphs into a final one
     input:
         glist="data/chrGraphs/graphsList.txt",
-        graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST),
+        graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz', chromosome=CHRLIST),
         tags="output/pan1c."+config['name']+".gfa.metadata"
     output:
         gfa=temp("output/pan1c."+config['name']+".gfa"),
@@ -581,7 +591,7 @@ rule graph_squeeze:
 rule graph_stats:
     # Using GFAstats to produce stats on every chromosome graphs
     input:
-        graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa'
+        graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz'
     output:
         genstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv",
         pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv"
-- 
GitLab
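
With compression on by default, generate_graph_list receives .gfa.gz paths while odgi squeeze still reads the plain .gfa names from graphsList.txt, so the run block strips the ".gz" suffix (file[:-3]) before writing each line and decompresses the graphs alongside. A minimal sketch of that bookkeeping, with hypothetical paths:

    graphs = [
        "data/chrGraphs/pan1c.demo.chr01.gfa.gz",   # hypothetical inputs
        "data/chrGraphs/pan1c.demo.chr02.gfa.gz",
    ]
    with open("graphsList.txt", "w") as handle:
        for path in graphs:
            handle.write(path[:-3] + "\n")          # drop the trailing ".gz"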


From f0524e2b7f21f18659008bf494d94888a17a66d5 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 12 Aug 2024 12:42:03 +0200
Subject: [PATCH 115/310] Update Snakefile

---
 Snakefile | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/Snakefile b/Snakefile
index aaa1b58..77f26d3 100644
--- a/Snakefile
+++ b/Snakefile
@@ -471,7 +471,7 @@ rule odgi_postprocessing:
         gfa=rules.gfaffix_on_chr.output.gfa_gz,
         tags="output/pan1c."+config['name']+".gfa.metadata"
     output:
-        gfa_gz='data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz'
+        gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa'
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -524,10 +524,6 @@ rule odgi_postprocessing:
 
         ## Adding metadata
         sed -i '/^H/r {input.tags}' {output.gfa}
-
-        # Compressing
-        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} {output.gfa}
         """
 
 rule generate_graph_list:
-- 
GitLab


From a4f003a19e7ff32f6638d7b095474d6d7d7bc23e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 12 Aug 2024 12:59:13 +0200
Subject: [PATCH 116/310] Update Snakefile

---
 Snakefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 77f26d3..1b98216 100644
--- a/Snakefile
+++ b/Snakefile
@@ -407,7 +407,6 @@ rule seqwish:
         fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz',
         aln=rules.wfmash_on_chr.output.aln_gz
     output:
-        gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa"),
         gfa_gz="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa.gz"
     threads: 8
     resources:
@@ -471,7 +470,7 @@ rule odgi_postprocessing:
         gfa=rules.gfaffix_on_chr.output.gfa_gz,
         tags="output/pan1c."+config['name']+".gfa.metadata"
     output:
-        gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa'
+        gfa=temp('data/chrGraphs/'+config['name']+'.{chromosome}.gfa')
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
-- 
GitLab


From 08157297fd4cb97242400fcf45b51750f134b18d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 12 Aug 2024 15:16:01 +0200
Subject: [PATCH 117/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 1b98216..e08e155 100644
--- a/Snakefile
+++ b/Snakefile
@@ -422,7 +422,7 @@ rule seqwish:
         """
         /usr/bin/time -v -o {log.time} \
             apptainer exec {params.app_path}/PanGeTools.sif seqwish \
-            -s {input.fa} -p {input.aln} -g $(basename {output.gfa_gz} .gz) \
+            -s {input.fa} -p {input.aln} -g $(dirname {output.gfa_gz})/$(basename {output.gfa_gz} .gz) \
             {params.seqwish} -t {threads} \
             --temp-dir $(dirname {output.gfa_gz}) -P 2>&1 | \
             tee {log.cmd}
-- 
GitLab
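
The fix above matters because basename alone drops the directory: with the previous command, seqwish would have written its GFA into the working directory instead of next to {output.gfa_gz}. Prefixing with dirname keeps the file in the chromosome folder. The same derivation in Python, on a hypothetical path:

    import os

    gfa_gz = "data/chrGraphs/chr01/chr01.seqwish.gfa.gz"    # hypothetical output path
    name_only = os.path.basename(gfa_gz)[:-len(".gz")]      # chr01.seqwish.gfa (directory lost)
    full_path = os.path.join(os.path.dirname(gfa_gz), name_only)
    print(full_path)                                        # data/chrGraphs/chr01/chr01.seqwish.gfa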


From fad1af618d4deb51398db1f2611d84a10c617efa Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 12 Aug 2024 16:10:07 +0200
Subject: [PATCH 118/310] Update Snakefile

---
 Snakefile | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/Snakefile b/Snakefile
index e08e155..0be8436 100644
--- a/Snakefile
+++ b/Snakefile
@@ -420,16 +420,18 @@ rule seqwish:
         time="logs/pggb/{chromosome}.seqwish.time.log"
     shell:
         """
+        gfa_out="$(dirname {output.gfa_gz})/$(basename {output.gfa_gz} .gz)"
+
         /usr/bin/time -v -o {log.time} \
             apptainer exec {params.app_path}/PanGeTools.sif seqwish \
-            -s {input.fa} -p {input.aln} -g $(dirname {output.gfa_gz})/$(basename {output.gfa_gz} .gz) \
+            -s {input.fa} -p {input.aln} -g $gfa_out \
             {params.seqwish} -t {threads} \
             --temp-dir $(dirname {output.gfa_gz}) -P 2>&1 | \
             tee {log.cmd}
 
         # Compressing
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} $(basename {output.gfa_gz} .gz)
+            -@ {threads} $gfa_out
         """
 
 rule gfaffix_on_chr:
@@ -451,23 +453,25 @@ rule gfaffix_on_chr:
         """
         # Decompressing Seqwish graph
         gzip -d -k {input}
+        gfa_in="$(dirname {input})/$(basename {input} .gz)"
+        gfa_out="$(dirname {output.gfa_gz})/$(basename {output.gfa_gz} .gz)"
 
         /usr/bin/time -v -o {log.time} \
             apptainer exec {params.app_path}/PanGeTools.sif gfaffix \
-            $(basename {input} .gz) -o {output.gfa} -t {output.transform} \
+            $gfa_in -o $gfa_out -t {output.transform} \
             > /dev/null
 
         # Compressing
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} {output.gfa}
+            -@ {threads} $gfa_out
 
-        rm $(basename {input} .gz)
+        rm $gfa_in
         """
 
 rule odgi_postprocessing:
     # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph
     input:
-        gfa=rules.gfaffix_on_chr.output.gfa_gz,
+        gfa_gz=rules.gfaffix_on_chr.output.gfa_gz,
         tags="output/pan1c."+config['name']+".gfa.metadata"
     output:
         gfa=temp('data/chrGraphs/'+config['name']+'.{chromosome}.gfa')
@@ -488,14 +492,15 @@ rule odgi_postprocessing:
         time_view="logs/pggb/{chromosome}.odgi_view.time.log"
     shell:
         """
-        OGfile="$(dirname {input.gfa})/$(basename {input.gfa} .gfa.gz)"
+        OGfile="$(dirname {input.gfa_gz})/$(basename {input.gfa_gz} .gfa.gz)"
 
-        gzip -d -k {input.gfa}
+        gzip -d -k {input.gfa_gz}
+        gfa_in="$(dirname {input.gfa_gz})/$(basename {input.gfa_gz} .gz)"
 
         ## Converting to odgi format and optimizing namespace
         /usr/bin/time -v -o {log.time_build} \
             apptainer run --app odgi {params.app_path}/PanGeTools.sif \
-            build -t {threads} -P -g $(basename {input.gfa} .gz) -o $OGfile.og -O 2>&1 | \
+            build -t {threads} -P -g $gfa_in -o $OGfile.og -O 2>&1 | \
             tee {log.cmd_build}
         
         ## Unchoping the nodes (merges unitigs into single nodes)
@@ -520,6 +525,7 @@ rule odgi_postprocessing:
 
         ## Removing .og files for space savings
         rm $(dirname {input.gfa})/*.og
+        rm gfa_in
 
         ## Adding metadata
         sed -i '/^H/r {input.tags}' {output.gfa}
-- 
GitLab


From 238bf9727b6748930d9dd1301461bdd8e369f9fc Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 12 Aug 2024 16:25:26 +0200
Subject: [PATCH 119/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 0be8436..ec66361 100644
--- a/Snakefile
+++ b/Snakefile
@@ -524,7 +524,7 @@ rule odgi_postprocessing:
             2> >(tee {log.cmd_view} >&2) 
 
         ## Removing .og files for space savings
-        rm $(dirname {input.gfa})/*.og
+        rm $(dirname {input.gfa_gz})/*.og
         rm gfa_in
 
         ## Adding metadata
-- 
GitLab


From 84f712c45aec909ae8df74de9a431913ebce57fb Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 12 Aug 2024 17:51:30 +0200
Subject: [PATCH 120/310] Update Snakefile

---
 Snakefile | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/Snakefile b/Snakefile
index ec66361..dc53111 100644
--- a/Snakefile
+++ b/Snakefile
@@ -536,8 +536,7 @@ rule generate_graph_list:
     input:
         expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz', chromosome=CHRLIST)
     output:
-        "data/chrGraphs/graphsList.txt",
-        temp(expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST))
+        "data/chrGraphs/graphsList.txt"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -546,9 +545,6 @@ rule generate_graph_list:
         with open(output[0], "w") as handle:
             for file in input:
                 handle.write(file[:-3]+"\n")
-        
-        for file in input:
-            shell("gzip -d -k {file}")
 
 
 rule graph_squeeze:
@@ -559,7 +555,8 @@ rule graph_squeeze:
         tags="output/pan1c."+config['name']+".gfa.metadata"
     output:
         gfa=temp("output/pan1c."+config['name']+".gfa"),
-        gfa_gz="output/pan1c."+config['name']+".gfa.gz"
+        gfa_gz="output/pan1c."+config['name']+".gfa.gz",
+        sgl_graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST)
     log: 
         cmd="logs/squeeze/"+config['name']+".squeeze.cmd.log",
         time="logs/squeeze/"+config['name']+".squeeze.time.log",
@@ -571,6 +568,9 @@ rule graph_squeeze:
         app_path=config['app.path']
     shell:
         """
+        # Decompressing every graphs
+        gzip -d -k {input.graphs}
+
         /usr/bin/time -v -o {log.time} \
             apptainer run --app odgi {params.app_path}/PanGeTools.sif \
             squeeze -t {threads} -O -P -f {input.glist} -o {output.gfa}.og 2>&1 | \
@@ -580,6 +580,9 @@ rule graph_squeeze:
             view -t {threads} -P -i {output.gfa}.og -g > {output.gfa}
 
         rm {output.gfa}.og
+        
+        # Removing decompressed GFAs
+        rm $(dirname {input.glist})/*.gfa
 
         # Tagging
         sed -i '/^H/r {input.tags}' {output.gfa}
@@ -633,6 +636,7 @@ rule graph_figs:
         ## Pcov viz
         apptainer run --app odgi {params.app_path}/PanGeTools.sif \
             viz -i {input.graph} -o {output.pcov} {params.pcov} -t {threads} -P
+
         """
 
 rule aggregate_graphs_stats:
-- 
GitLab


From 474316a2bfe3c8ff46b33c3ee29c528c729dbc57 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 13 Aug 2024 18:17:54 +0200
Subject: [PATCH 121/310] Compressing by default

---
 Snakefile       | 26 ++++++++++++++------------
 rules/tools.smk | 16 ++++++++++++++++
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/Snakefile b/Snakefile
index dc53111..93b45cc 100644
--- a/Snakefile
+++ b/Snakefile
@@ -247,7 +247,7 @@ rule chromosome_clustering:
             --fasta {input} --output $(dirname {output[0]}) --panname {params.pan_name}
         """
 
-rule SyRI_on_ASM:
+rule SyRI_on_ASM_mm2:
     # Run SyRI on a single assembly. 
     # The assembly is mapped on the 'reference' and SyRI search for SV.
     input:
@@ -474,7 +474,7 @@ rule odgi_postprocessing:
         gfa_gz=rules.gfaffix_on_chr.output.gfa_gz,
         tags="output/pan1c."+config['name']+".gfa.metadata"
     output:
-        gfa=temp('data/chrGraphs/'+config['name']+'.{chromosome}.gfa')
+        gfa_gz='data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz'
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -496,6 +496,7 @@ rule odgi_postprocessing:
 
         gzip -d -k {input.gfa_gz}
         gfa_in="$(dirname {input.gfa_gz})/$(basename {input.gfa_gz} .gz)"
+        gfa_out="$(dirname {output.gfa_gz})/$(basename {output.gfa_gz} .gz)"
 
         ## Converting to odgi format and optimizing namespace
         /usr/bin/time -v -o {log.time_build} \
@@ -520,15 +521,19 @@ rule odgi_postprocessing:
         /usr/bin/time -v -o {log.time_view} \
             apptainer run --app odgi {params.app_path}/PanGeTools.sif \
             view -i $OGfile.unchoped.sorted.og -g \
-            1> {output.gfa} \
+            1> $gfa_out \
             2> >(tee {log.cmd_view} >&2) 
 
         ## Removing .og files for space savings
         rm $(dirname {input.gfa_gz})/*.og
-        rm gfa_in
+        rm $gfa_in
 
         ## Adding metadata
-        sed -i '/^H/r {input.tags}' {output.gfa}
+        sed -i '/^H/r {input.tags}' $gfa_out
+
+        ## Compressing
+        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
+            -@ {threads} $gfa_out
         """
 
 rule generate_graph_list:
@@ -546,17 +551,14 @@ rule generate_graph_list:
             for file in input:
                 handle.write(file[:-3]+"\n")
 
-
 rule graph_squeeze:
     # Using odgi to merge every subgraphs into a final one
     input:
         glist="data/chrGraphs/graphsList.txt",
-        graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz', chromosome=CHRLIST),
+        graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST),
         tags="output/pan1c."+config['name']+".gfa.metadata"
     output:
-        gfa=temp("output/pan1c."+config['name']+".gfa"),
-        gfa_gz="output/pan1c."+config['name']+".gfa.gz",
-        sgl_graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST)
+        gfa_gz="output/pan1c."+config['name']+".gfa.gz"
     log: 
         cmd="logs/squeeze/"+config['name']+".squeeze.cmd.log",
         time="logs/squeeze/"+config['name']+".squeeze.time.log",
@@ -589,7 +591,7 @@ rule graph_squeeze:
 
         # Compressing
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} -k {output.gfa}
+            -@ {threads} {output.gfa}
         """
 
 rule graph_stats:
@@ -659,7 +661,7 @@ rule aggregate_graphs_stats:
             --outputPaths {output.pathstats} --panname {params.pan_name} 
         """
 
-rule final_graph_tagging:
+rule get_graph_tags:
     # Add metadata to the final GFA
     input:
         "config.yaml"
diff --git a/rules/tools.smk b/rules/tools.smk
index 117f65e..7683ed1 100644
--- a/rules/tools.smk
+++ b/rules/tools.smk
@@ -42,6 +42,22 @@ rule run_bgzip:
             -@ {threads} {input}
         """
 
+rule decompress_graph:
+    # Decompressing graph if required
+    input: 
+        "{file}.gfa.gz"
+    output:
+        temp("{file}.gfa")
+    threads: 1
+    resources:
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
+    params:
+        app_path=config["app.path"]
+    shell:
+        """
+        gzip -d -k {input} 
+        """
+
 rule gfa_2_xg:
     # Convert a GFA to XG
     input:
-- 
GitLab


From b7b7a3759ec01196b92cf9c167bc4bebde2fb2f3 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 13 Aug 2024 18:27:46 +0200
Subject: [PATCH 122/310] Update Snakefile

---
 Snakefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Snakefile b/Snakefile
index 93b45cc..258eb7a 100644
--- a/Snakefile
+++ b/Snakefile
@@ -2,6 +2,7 @@ configfile: "config.yaml"
 
 include: "rules/tools.smk"
 
+ruleorder: odgi_postprocessing > run_bgzip
 
 ## Modules
 import os
-- 
GitLab
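
Since the previous patch, odgi_postprocessing writes data/chrGraphs/<name>.<chromosome>.gfa.gz itself, a target that the generic run_bgzip helper in rules/tools.smk could also claim, so Snakemake needs a tie-breaker; the ruleorder directive tells it to prefer odgi_postprocessing. A self-contained toy illustration of the same pattern in Snakemake (rule names and files are hypothetical, not part of the workflow):

    ruleorder: build_graph > compress_any

    rule build_graph:
        # Produces the compressed graph directly.
        output:
            "results/{name}.gfa.gz"
        shell:
            "echo H | gzip > {output}"

    rule compress_any:
        # Generic helper that could also produce results/{name}.gfa.gz.
        input:
            "results/{name}.gfa"
        output:
            "results/{name}.gfa.gz"
        shell:
            "gzip -k {input}"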


From 9f297e4dc876e975c3ee1929f2408ba66655e43e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 13 Aug 2024 18:39:20 +0200
Subject: [PATCH 123/310] Update Snakefile

---
 Snakefile | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/Snakefile b/Snakefile
index 258eb7a..0d60d4b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -573,26 +573,24 @@ rule graph_squeeze:
         """
         # Decompressing every graphs
         gzip -d -k {input.graphs}
+        gfa_out="$(dirname {output.gfa_gz})/$(basename {output.gfa_gz} .gz)"
 
         /usr/bin/time -v -o {log.time} \
             apptainer run --app odgi {params.app_path}/PanGeTools.sif \
-            squeeze -t {threads} -O -P -f {input.glist} -o {output.gfa}.og 2>&1 | \
+            squeeze -t {threads} -O -P -f {input.glist} -o $gfa_out.og 2>&1 | \
             tee {log.cmd}
 
         apptainer run --app odgi {params.app_path}/PanGeTools.sif \
-            view -t {threads} -P -i {output.gfa}.og -g > {output.gfa}
+            view -t {threads} -P -i $gfa_out.og -g > $gfa_out
 
-        rm {output.gfa}.og
-        
-        # Removing decompressed GFAs
-        rm $(dirname {input.glist})/*.gfa
+        rm $gfa_out.og
 
         # Tagging
-        sed -i '/^H/r {input.tags}' {output.gfa}
+        sed -i '/^H/r {input.tags}' $gfa_out
 
         # Compressing
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} {output.gfa}
+            -@ {threads} $gfa_out
         """
 
 rule graph_stats:
-- 
GitLab


From 394315bb84f3067dd4f5d66603a8f360a42f8da5 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 13 Aug 2024 19:14:03 +0200
Subject: [PATCH 124/310] Update Snakefile

---
 Snakefile | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 0d60d4b..8f10a9d 100644
--- a/Snakefile
+++ b/Snakefile
@@ -571,8 +571,6 @@ rule graph_squeeze:
         app_path=config['app.path']
     shell:
         """
-        # Decompressing every graphs
-        gzip -d -k {input.graphs}
         gfa_out="$(dirname {output.gfa_gz})/$(basename {output.gfa_gz} .gz)"
 
         /usr/bin/time -v -o {log.time} \
-- 
GitLab


From ff8eebbb544502cc7d669b7fcd7696d4d10d0da1 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 14 Aug 2024 11:36:07 +0200
Subject: [PATCH 125/310] Update Snakefile

---
 Snakefile | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 8f10a9d..a3948ec 100644
--- a/Snakefile
+++ b/Snakefile
@@ -505,6 +505,9 @@ rule odgi_postprocessing:
             build -t {threads} -P -g $gfa_in -o $OGfile.og -O 2>&1 | \
             tee {log.cmd_build}
         
+        ## Removing input GFA
+        rm $gfa_in
+        
         ## Unchoping the nodes (merges unitigs into single nodes)
         /usr/bin/time -v -o {log.time_unchop} \
             apptainer run --app odgi {params.app_path}/PanGeTools.sif \
@@ -527,7 +530,6 @@ rule odgi_postprocessing:
 
         ## Removing .og files for space savings
         rm $(dirname {input.gfa_gz})/*.og
-        rm $gfa_in
 
         ## Adding metadata
         sed -i '/^H/r {input.tags}' $gfa_out
-- 
GitLab


From b0f6ebb2fff0ab757bc2aa568adf1cfd5a10b999 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 28 Aug 2024 16:40:00 +0200
Subject: [PATCH 126/310] Preparing for GFAstats and chrGraphs figs

- Works with GFAstats v0.4.2
- Added the chrGraphs figure script and its related rule (not used yet)
- Renamed some scripts for clarity
---
 Snakefile                                     |  29 ++-
 ...sAggregation.py => chrGraphs_aggregate.py} |   0
 scripts/chrGraphs_figs.py                     | 245 ++++++++++++++++++
 .../{chrInputStats.py => chrInput_stats.py}   |   0
 scripts/{coreStats.py => core_stats.py}       |   0
 5 files changed, 271 insertions(+), 3 deletions(-)
 rename scripts/{chrStatsAggregation.py => chrGraphs_aggregate.py} (100%)
 create mode 100644 scripts/chrGraphs_figs.py
 rename scripts/{chrInputStats.py => chrInput_stats.py} (100%)
 rename scripts/{coreStats.py => core_stats.py} (100%)

diff --git a/Snakefile b/Snakefile
index a3948ec..3ea4152 100644
--- a/Snakefile
+++ b/Snakefile
@@ -655,7 +655,7 @@ rule aggregate_graphs_stats:
         pan_name=config['name']
     shell:
         """
-        apptainer run {params.app_path}/pan1c-env.sif python scripts/chrStatsAggregation.py \
+        apptainer run {params.app_path}/pan1c-env.sif python scripts/chrGraphs_aggregate.py \
             --input $(dirname {input[0]}) --outputGeneral {output.genstats} \
             --outputPaths {output.pathstats} --panname {params.pan_name} 
         """
@@ -693,7 +693,7 @@ rule pggb_input_stats:
         pan_name=config['name']
     shell:
         """
-        apptainer run {params.app_path}/pan1c-env.sif python scripts/chrInputStats.py \
+        apptainer run {params.app_path}/pan1c-env.sif python scripts/chrInput_stats.py \
             -f data/chrInputs/*.fa.gz -o {output} -p {params.pan_name}
         """
 
@@ -714,7 +714,7 @@ rule core_statistics:
     shell:
         """
         mkdir -p {output.dir}
-        apptainer run {params.app_path}/pan1c-env.sif python scripts/coreStats.py \
+        apptainer run {params.app_path}/pan1c-env.sif python scripts/core_stats.py \
             --pggbStats logs/pggb --chrInputStats {input.chrInputStats} \
             --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f {output.dir} -p {params.pan_name}
         """
@@ -891,6 +891,29 @@ rule create_pan1c_report_fig:
         composite -geometry "+790+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig}
         """
 
+rule create_chrGraphs_figs:
+    # Produce figures based on aggregated path stats
+    input:
+        pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv"
+    output:
+        barplots=expand("output/chrGraphs.stats.figs/"+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST),
+        scatters=expand("output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST),
+        heatmaps=expand("output/chrGraphs.stats.figs/"+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST),
+        barplot_mean="output/chrGraphs.stats.figs/"+config['name']+".path.decomp.mean.png",
+        scatter_mean="output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.mean.png"
+    threads: 1
+    ressources:
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
+    params:
+        app_path=config['app.path'],
+        pan_name=config['name']
+    shell:
+        """
+        apptainer run {params.app_path}/pan1c-env.sif python scripts/chrGraphs_figs.py \
+            --input {input.pathstats} --output_dir {output.genstats} \
+            --panname {params.pan_name} 
+        """
+
 def get_report_sections(wildcards):
     """
     Return 'create_pan1c_report' optional inputs to add them to the final report.
diff --git a/scripts/chrStatsAggregation.py b/scripts/chrGraphs_aggregate.py
similarity index 100%
rename from scripts/chrStatsAggregation.py
rename to scripts/chrGraphs_aggregate.py
diff --git a/scripts/chrGraphs_figs.py b/scripts/chrGraphs_figs.py
new file mode 100644
index 0000000..dc7ea4a
--- /dev/null
+++ b/scripts/chrGraphs_figs.py
@@ -0,0 +1,245 @@
+"""
+Chromosomes statistics figure script for Pan1c workflow
+
+Use aggregated TSV produced with chrGraphs_aggregate.py
+
+@author: alexis.mergez@inrae.fr
+@version: 1.0
+"""
+
+#% Librairies
+import os
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+import seaborn as sns
+import pandas as pd
+import argparse
+from sklearn.cluster import DBSCAN
+from adjustText import adjust_text
+from plottable import Table, ColDef
+
+#% Parsing arguments
+arg_parser = argparse.ArgumentParser(description='Statistic figures for Pan1c workflow')
+arg_parser.add_argument(
+    "--input",
+    "-i",
+    dest = "input",
+    required = True,
+    help = "Input TSV containing chromosome path stats"
+    )
+arg_parser.add_argument(
+    "--output_dir",
+    dest = "dir",
+    required = True,
+    help = "Output dir for figures"
+    )
+arg_parser.add_argument(
+    "--panname",
+    "-p",
+    dest = "panname",
+    required = True,
+    help = "Pangenome name"
+    )
+args = arg_parser.parse_args()
+
+#% Loading and preparing data
+def get_path_name(pathname):
+    return pathname.rsplit("#", 1)[0]
+
+# Reading data
+gData = pd.read_csv(args.input, sep='\t')
+gData["Path.name"] = gData["Path.name"].apply(get_path_name)
+gData.set_index(["Pangenome.name","Chr.id", "Path.name"], inplace=True)
+
+# Splitting shared content and other stats
+shared_content = gData.loc[:, ["Path.length", "Shared.content"]].to_dict()
+gData.drop("Shared.content", axis=1, inplace=True)
+
+# Summing Private and core for the barplot figure
+gData["Path.PrivCore.R.Length"] = gData["Path.private.R.length"] + gData["Path.core.R.length"]
+
+# Switching to Mbp instead of Bp
+gData[["Path.length", "Path.PrivCore.R.Length", "Path.core.R.length"]] = gData[["Path.length", "Path.PrivCore.R.Length", "Path.core.R.length"]].astype(float)
+gData.loc[:, ["Path.length", "Path.PrivCore.R.Length", "Path.core.R.length"]] = gData.loc[:, ["Path.length", "Path.PrivCore.R.Length", "Path.core.R.length"]] / 1000000
+
+# Adding percentages relative to path length
+gData["Path.private.R.per"] = (gData["Path.private.R.length"]*100)/(gData["Path.length"]*1000000)
+gData["Path.core.R.per"] = (gData["Path.core.R.length"]*100)/gData["Path.length"]
+
+# Shared content
+shared_dict = {}
+A = 0
+for key, value in shared_content["Shared.content"].items():
+    for elem in value.split(';'):
+        target, stats = elem.split(":")
+        target = target.rsplit("#", 1)[0]
+
+        shared_dict[A] = list(key)+[target]+[shared_content["Path.length"][key]]+[int(val) for val in stats.split(',')]
+        A+=1
+sData = pd.DataFrame.from_dict(shared_dict, orient='index', columns = ["Pangenome.name", "Chr.id", "Query.name", "Target.name", "Path.length", "Shared.nodes.count", "Shared.length", "Shared.R.length"])
+sData.set_index(["Pangenome.name","Chr.id", "Query.name", "Target.name"], inplace=True)
+sData.loc[:, "Shared.prop"] = sData["Shared.length"]*100/sData["Path.length"]
+sData.loc[:, "Shared.R.prop"] = sData["Shared.R.length"]*100/sData["Path.length"]
+
+# Pivoting the dataframe for the heatmap
+sTable = sData.reset_index().pivot(values=["Shared.R.prop"], index=["Pangenome.name", "Chr.id", "Query.name"], columns=["Target.name"])
+
+# Removing multi-index from columns
+sTable.columns = sTable.columns.droplevel()
+
+#% Figures functions
+# Bar plots with path decomposition into Core, Private and Other proportion
+def get_group_decomp_fig(data, title, savedir):
+    sns.set_style("ticks")
+    plt.figure(figsize=(16, 9), dpi = 256)
+    
+    # Adding bars
+    bar = sns.barplot(data, y="Path.name", x="Path.length", color='#FCDC94', estimator=np.mean, errorbar=None, orient="h")
+    
+    priv_bar = sns.barplot(data, y="Path.name", x="Path.PrivCore.R.Length", color='#78ABA8', estimator=np.mean, errorbar=None, orient="h")
+    
+    core_bar = sns.barplot(data, y="Path.name", x="Path.core.R.length", color='#EF9C66', estimator=np.mean, errorbar=None, orient="h")
+
+    # Adding relative percentages
+    for path_name in data.index.unique("Path.name"):
+        _len = data.loc[path_name, "Path.length"]
+
+        columns = ["Path.length", "Path.PrivCore.R.Length", "Path.core.R.length"]
+        for i in range(len(columns)):
+            try :
+                _var = data.loc[path_name, columns[i:i+2]]
+                x = _var.mean()       
+                value = ((_var.iloc[0] - _var.iloc[1])/_len)*100
+            except :
+                _var = data.loc[path_name, columns[i]]
+                x = _var/2       
+                value = ((_var - 0)/_len)*100
+            plt.text(x, path_name, f"{round(value, 2)}%", horizontalalignment='center', verticalalignment='center', fontsize=10)
+    
+    # Adding legend
+    top_bar = mpatches.Patch(color='#FCDC94', label='Other')
+    middle_bar = mpatches.Patch(color='#78ABA8', label='Private')
+    bottom_bar = mpatches.Patch(color='#EF9C66', label='Core')
+    plt.legend(handles=[top_bar, middle_bar, bottom_bar], frameon=False)
+    sns.move_legend(bar, 'best', bbox_to_anchor=(1, 0.5))
+
+    # Adding/Modifying titles
+    plt.title(title, fontsize=15 , fontweight="bold")
+    plt.xlabel('Path length (Mbp)', loc='right', fontweight="bold")
+    plt.ylabel(None)
+
+    sns.despine()
+    plt.savefig(savedir)
+
+# 2D scatter Core vs Private 
+def get_group_2d_fig(data, title, savedir):
+    sns.set_style("ticks")
+    fig, (ax, ax_table) = plt.subplots(1, 2, gridspec_kw={'width_ratios': [3, 2], 'wspace': 0.05}, figsize=(16, 9), dpi=256)
+    
+    # Clustering with DBSCAN
+    X = data.loc[:, ["Path.private.R.per", "Path.core.R.per"]].to_numpy()
+    data["Clusters"] = DBSCAN(eps=2, min_samples = 1).fit_predict(X).tolist()
+
+    # Plotting scatter
+    sns.scatterplot(data, x = "Path.private.R.per", y = "Path.core.R.per", hue="Clusters", palette="tab10", ax=ax)
+    
+    # Adding labels
+    TEXTS = []
+    for genome in data.index.unique("Path.name"):
+        x, y = data.loc[genome, ["Path.private.R.per", "Path.core.R.per"]]
+        TEXTS.append(ax.text(x, y, genome, color="#7F7F7F", fontsize=9))
+    adjust_text(
+        TEXTS,
+        expand=(2,2),
+        arrowprops=dict(
+            arrowstyle="->", 
+            color="#b3b3b3", 
+            lw=0.5,
+            shrinkA=0.2
+        ),
+    ax=fig.axes[0]
+    )
+    
+    # Adding table
+    table_data = {
+        f"Cluster {cluster_id}" : data[data["Clusters"] == cluster_id].index.to_list()
+        for cluster_id in sorted(data["Clusters"].unique())
+    }
+    max_row = max((len(genomes) for genomes in table_data.values()))
+    for cluster_id in table_data.keys():
+        table_data[cluster_id] += (max_row - len(table_data[cluster_id])) * ['']
+    table_df = pd.DataFrame.from_dict(table_data)
+
+    ax_table = Table(
+        table_df, 
+        row_dividers=False, 
+        column_definitions = [
+            ColDef("index", width=0, title="", textprops={"fontsize": 0})
+        ],
+        textprops={"ha": "left"}
+    )
+    
+    # Adding/Modifying titles
+    ax.set_title(title, fontsize=15 , fontweight="bold")
+    ax.set_xlabel('Private relative proportion (%)', loc='right', fontweight="bold")
+    ax.set_ylabel('Core relative proportion (%)', loc='top', fontweight="bold")
+
+    sns.despine()
+    plt.savefig(savedir)
+
+# Shared content heatmap
+def get_hm_shared_fig(data, title, savedir):
+    sns.set_style("ticks")
+    plt.figure(figsize=(16, 9), dpi=256)
+    
+    # Heatmap
+    ax = sns.heatmap(data, cmap="Spectral_r", square = True)
+    
+    # Adding/Modifying titles
+    plt.title(title, fontsize=15 , fontweight="bold")
+    plt.xlabel('Target', fontweight="bold")
+    plt.ylabel('Query', fontweight="bold")
+
+    sns.despine()
+    plt.savefig(savedir)
+
+#% Figure generation
+# Bar plots
+for pangenome in alt_gData.index.unique("Pangenome.name"):
+    ## For each chromosome
+    for chrid in alt_gData.index.unique("Chr.id"):
+        get_group_decomp_fig(
+            data = alt_gData.loc[pangenome, chrid,:], 
+            title = f"Path composition by groups - {pangenome} - {chrid}",
+            savedir = os.path.join(args.dir, f"{pangenome}.path.decomp.{chrid}.png")
+        )
+    ## For the mean across chromosomes 
+    get_group_decomp_fig(
+        data = alt_gData.groupby("Path.name").sum(), 
+        title = "Haplotypes mean composition accross chromosomes",
+        savedir = os.path.join(args.dir, f"{pangenome}.path.decomp.mean.png")
+    )
+
+# 2D Scatter Core vs Private
+for pangenome in alt_gData.index.unique("Pangenome.name"):
+    for chrid in alt_gData.index.unique("Chr.id"):
+        get_group_2d_fig(
+            data = alt_gData.loc[pangenome, chrid,:], 
+            title = f"Path composition by groups - {pangenome} - {chrid}",
+            savedir = os.path.join(args.dir, f"{pangenome}.2D.scatter.{chrid}.png")
+        )
+    get_group_2d_fig(
+        data = alt_gData.groupby("Path.name").mean(), 
+        title = "Haplotypes mean composition accross chromosomes",
+        savedir = os.path.join(args.dir, f"{pangenome}.2D.scatter.mean.png")
+    )
+
+# Shared content heatmap
+for pangenome in sTable.index.unique("Pangenome.name"):
+    for chrid in sTable.index.unique("Chr.id"):
+        get_hm_shared_fig(
+            data = sTable.loc[pangenome, chrid,:], 
+            title = f"Shared content (%) - {pangenome} - {chrid}",
+            savedir = os.path.join(args.dir, f"{pangenome}.sharred.content.{chrid}.png")
+        )
\ No newline at end of file
diff --git a/scripts/chrInputStats.py b/scripts/chrInput_stats.py
similarity index 100%
rename from scripts/chrInputStats.py
rename to scripts/chrInput_stats.py
diff --git a/scripts/coreStats.py b/scripts/core_stats.py
similarity index 100%
rename from scripts/coreStats.py
rename to scripts/core_stats.py
-- 
GitLab
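
chrGraphs_figs.py expects the Shared.content column to pack one entry per target path, separated by ';', each of the form target:nodes,length,r_length, where the target still carries a '#'-suffixed haplotype tag that rsplit("#", 1) removes. A toy walk-through of that decomposition with made-up values:

    shared = "genomeB#1:120,45000,43000;genomeC#1:98,39000,37500"   # hypothetical field value
    path_length = 1_000_000                                         # hypothetical path length in bp

    rows = []
    for elem in shared.split(";"):
        target, stats = elem.split(":")
        target = target.rsplit("#", 1)[0]                           # drop the haplotype suffix
        nodes, length, r_length = (int(v) for v in stats.split(","))
        rows.append((target, nodes, length, r_length, 100 * r_length / path_length))

    for row in rows:
        print(row)    # ('genomeB', 120, 45000, 43000, 4.3), then the genomeC entry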


From 4b898234c4085cd9e78afe759d750020daf277a3 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 28 Aug 2024 16:54:05 +0200
Subject: [PATCH 127/310] Update Snakefile

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 3ea4152..3e632e8 100644
--- a/Snakefile
+++ b/Snakefile
@@ -602,7 +602,7 @@ rule graph_stats:
         pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv"
     threads: 4
     resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 6000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -902,7 +902,7 @@ rule create_chrGraphs_figs:
         barplot_mean="output/chrGraphs.stats.figs/"+config['name']+".path.decomp.mean.png",
         scatter_mean="output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.mean.png"
     threads: 1
-    ressources:
+    resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
     params:
         app_path=config['app.path'],
-- 
GitLab


From 7dec93f0827bf12f7bf09312aebd7cd655a9fd05 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 28 Aug 2024 17:25:06 +0200
Subject: [PATCH 128/310] Added chrGraphs figures to final report

---
 Snakefile | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 3e632e8..0512c53 100644
--- a/Snakefile
+++ b/Snakefile
@@ -926,6 +926,11 @@ def get_report_sections(wildcards):
     sections["odgifigs"] = expand("output/report/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST)
     sections["genstats"] = "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv"
     sections["pathstats"] = "output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv"
+    sections["barplots"] = expand("output/chrGraphs.stats.figs/"+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST)
+    sections["scatters"] = expand("output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST),
+    sections["heatmaps"] = expand("output/chrGraphs.stats.figs/"+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST),
+    sections["barplot_mean"] = "output/chrGraphs.stats.figs/"+config['name']+".path.decomp.mean.png",
+    sections["scatter_mean"] = "output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.mean.png"
 
     if config["get_ASMs_SyRI"] == "True":
         sections["SyRI_on_ASMs_figs"] = expand(
@@ -962,7 +967,7 @@ rule create_pan1c_report:
     run:
         shell("touch {output.report}")
 
-        # Adding Summary (made for importing in Joplin)
+        # Adding Summary
         shell("echo '# Summary' >> {output.report}")
         shell("echo '- [Graph metadata](#graph-metadata)' >> {output.report}")
         shell("echo '- [General stats](#general-stats)' >> {output.report}")
@@ -989,7 +994,50 @@ rule create_pan1c_report:
 
         # Adding Path stats
         shell("echo '# Path stats' >> {output.report}")
-        shell("cat {input.pathstats} | apptainer run {params.app_path}/pan1c-env.sif csv2md -d $'\\t' >> {output.report}")
+
+        ## Barplots sub section
+        shell("echo '## Path composition' >> {output.report}")
+        _basename = os.path.basename(input.barplot_mean)
+        shell("echo '![{_basename}](./chrGraphs.stats.figs/{_basename})' >> {output.report}")
+
+        barplot_figs_list = [fig for fig in input.barplots]
+        barplot_figs_list.sort()
+
+        for i in range(len(barplot_figs_list)):
+            barplot_basename=os.path.basename(barplot_figs_list[i])
+            chr_name=barplot_basename.split('.')[-2]
+
+            shell("echo '### {chr_name}' >> {output.report}")
+            shell("echo '![{barplot_basename}](./chrGraphs.stats.figs/{barplot_basename})' >> {output.report}")
+        
+        ## 2D scatter sub section
+        shell("echo '## Core vs Private' >> {output.report}")
+        _basename = os.path.basename(input.scatter_mean)
+        shell("echo '![{_basename}](./chrGraphs.stats.figs/{_basename})' >> {output.report}")
+
+        scatter_figs_list = [fig for fig in input.scatters]
+        scatter_figs_list.sort()
+
+        for i in range(len(scatter_figs_list)):
+            scatter_basename=os.path.basename(scatter_figs_list[i])
+            chr_name=scatter_basename.split('.')[-2]
+
+            shell("echo '### {chr_name}' >> {output.report}")
+            shell("echo '![{scatter_basename}](./chrGraphs.stats.figs/{scatter_basename})' >> {output.report}")
+
+        ## Heatmap section
+        shell("echo '## Pairwise shared content' >> {output.report}")
+        hm_figs_list = [fig for fig in input.heatmaps]
+        hm_figs_list.sort()
+
+        for i in range(len(hm_figs_list)):
+            hm_basename=os.path.basename(hm_figs_list[i])
+            chr_name=hm_basename.split('.')[-2]
+
+            shell("echo '### {chr_name}' >> {output.report}")
+            shell("echo '![{hm_basename}](./chrGraphs.stats.figs/{hm_basename})' >> {output.report}")
+
+        #shell("cat {input.pathstats} | apptainer run {params.app_path}/pan1c-env.sif csv2md -d $'\\t' >> {output.report}")
         shell("echo '' >> {output.report}")
 
         # Adding chromosomes figures
-- 
GitLab
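
The echo loops above assemble the Markdown report one figure at a time, pulling the chromosome id out of each file name. A minimal, standalone Python sketch of that pattern; the pan1c.demo file names are illustrative, not taken from the pipeline:

    # Sketch of the report-section pattern: one "##" heading per figure family,
    # one "###" heading per chromosome. File names follow the
    # "<name>.path.decomp.<chromosome>.png" convention, so the chromosome id is
    # the second-to-last dot-separated field.
    import os

    def add_figure_section(report_lines, heading, fig_paths, rel_dir="./chrGraphs.stats.figs"):
        report_lines.append(f"## {heading}")
        for fig in sorted(fig_paths):
            basename = os.path.basename(fig)
            chr_name = basename.split('.')[-2]      # e.g. "chr01"
            report_lines.append(f"### {chr_name}")
            report_lines.append(f"![{basename}]({rel_dir}/{basename})")
        return report_lines

    figs = [
        "output/chrGraphs.stats.figs/pan1c.demo.path.decomp.chr02.png",
        "output/chrGraphs.stats.figs/pan1c.demo.path.decomp.chr01.png",
    ]
    print("\n".join(add_figure_section([], "Path composition", figs)))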


From da16e6c4593a16f5a3b56be548f7f085c886e7c9 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 28 Aug 2024 18:14:36 +0200
Subject: [PATCH 129/310] Update Snakefile

---
 Snakefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Snakefile b/Snakefile
index 0512c53..c63b2d1 100644
--- a/Snakefile
+++ b/Snakefile
@@ -927,9 +927,9 @@ def get_report_sections(wildcards):
     sections["genstats"] = "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv"
     sections["pathstats"] = "output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv"
     sections["barplots"] = expand("output/chrGraphs.stats.figs/"+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST)
-    sections["scatters"] = expand("output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST),
-    sections["heatmaps"] = expand("output/chrGraphs.stats.figs/"+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST),
-    sections["barplot_mean"] = "output/chrGraphs.stats.figs/"+config['name']+".path.decomp.mean.png",
+    sections["scatters"] = expand("output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST)
+    sections["heatmaps"] = expand("output/chrGraphs.stats.figs/"+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST)
+    sections["barplot_mean"] = "output/chrGraphs.stats.figs/"+config['name']+".path.decomp.mean.png"
     sections["scatter_mean"] = "output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.mean.png"
 
     if config["get_ASMs_SyRI"] == "True":
-- 
GitLab
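
The commas removed in this patch were not cosmetic: in Python, an assignment whose right-hand side ends with a comma builds a one-element tuple, so the affected report sections held tuples wrapping the expected lists. A minimal illustration:

    # Why the trailing commas mattered: the right-hand side becomes a 1-tuple.
    sections = {}
    sections["scatters"] = ["chr01.png", "chr02.png"],   # note the trailing comma
    sections["heatmaps"] = ["chr01.png", "chr02.png"]    # no trailing comma

    print(type(sections["scatters"]))   # <class 'tuple'> -> (['chr01.png', 'chr02.png'],)
    print(type(sections["heatmaps"]))   # <class 'list'>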


From 6005b7eba0a54bb973dbbd77af5be92b9005724e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 28 Aug 2024 18:17:07 +0200
Subject: [PATCH 130/310] Update Snakefile

---
 Snakefile | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index c63b2d1..138b75c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -900,7 +900,8 @@ rule create_chrGraphs_figs:
         scatters=expand("output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST),
         heatmaps=expand("output/chrGraphs.stats.figs/"+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST),
         barplot_mean="output/chrGraphs.stats.figs/"+config['name']+".path.decomp.mean.png",
-        scatter_mean="output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.mean.png"
+        scatter_mean="output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.mean.png",
+        outdir="output/chrGraphs.stats.figs/"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
@@ -909,8 +910,10 @@ rule create_chrGraphs_figs:
         pan_name=config['name']
     shell:
         """
+        mkdir -p {output.outdir}
+
         apptainer run {params.app_path}/pan1c-env.sif python scripts/chrGraphs_figs.py \
-            --input {input.pathstats} --output_dir {output.genstats} \
+            --input {input.pathstats} --output_dir {output.outdir} \
             --panname {params.pan_name} 
         """
 
-- 
GitLab


From ce8a0d66fa3f521d56d62dcab4a4f2ef563aab5b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 28 Aug 2024 18:21:29 +0200
Subject: [PATCH 131/310] Update chrGraphs_figs.py

---
 scripts/chrGraphs_figs.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/scripts/chrGraphs_figs.py b/scripts/chrGraphs_figs.py
index dc7ea4a..c2d3aa0 100644
--- a/scripts/chrGraphs_figs.py
+++ b/scripts/chrGraphs_figs.py
@@ -206,31 +206,31 @@ def get_hm_shared_fig(data, title, savedir):
 
 #% Figure generation
 # Bar plots
-for pangenome in alt_gData.index.unique("Pangenome.name"):
+for pangenome in gData.index.unique("Pangenome.name"):
     ## For each chromosome
-    for chrid in alt_gData.index.unique("Chr.id"):
+    for chrid in gData.index.unique("Chr.id"):
         get_group_decomp_fig(
-            data = alt_gData.loc[pangenome, chrid,:], 
+            data = gData.loc[pangenome, chrid,:], 
             title = f"Path composition by groups - {pangenome} - {chrid}",
             savedir = os.path.join(args.dir, f"{pangenome}.path.decomp.{chrid}.png")
         )
     ## For the mean across chromosomes 
     get_group_decomp_fig(
-        data = alt_gData.groupby("Path.name").sum(), 
+        data = gData.groupby("Path.name").sum(), 
         title = "Haplotypes mean composition accross chromosomes",
         savedir = os.path.join(args.dir, f"{pangenome}.path.decomp.mean.png")
     )
 
 # 2D Scatter Core vs Private
-for pangenome in alt_gData.index.unique("Pangenome.name"):
-    for chrid in alt_gData.index.unique("Chr.id"):
+for pangenome in gData.index.unique("Pangenome.name"):
+    for chrid in gData.index.unique("Chr.id"):
         get_group_2d_fig(
-            data = alt_gData.loc[pangenome, chrid,:], 
+            data = gData.loc[pangenome, chrid,:], 
             title = f"Path composition by groups - {pangenome} - {chrid}",
             savedir = os.path.join(args.dir, f"{pangenome}.2D.scatter.{chrid}.png")
         )
     get_group_2d_fig(
-        data = alt_gData.groupby("Path.name").mean(), 
+        data = gData.groupby("Path.name").mean(), 
         title = "Haplotypes mean composition accross chromosomes",
         savedir = os.path.join(args.dir, f"{pangenome}.2D.scatter.mean.png")
     )
-- 
GitLab
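
The get_group_2d_fig calls updated above cluster haplotypes by their private/core proportions with DBSCAN before labelling the scatter. A reduced, self-contained sketch of that clustering step, using toy coordinates and the eps=2, min_samples=1 settings visible in the script:

    # DBSCAN with eps=2 and min_samples=1 groups haplotypes whose
    # (private %, core %) coordinates lie within 2 percentage points of a
    # neighbour; min_samples=1 means an isolated haplotype becomes its own
    # cluster instead of being flagged as noise.
    import pandas as pd
    from sklearn.cluster import DBSCAN

    data = pd.DataFrame(
        {
            "Path.private.R.per": [1.2, 1.4, 8.5, 8.9, 25.0],
            "Path.core.R.per":    [70.1, 69.5, 55.0, 54.2, 30.0],
        },
        index=["hapA", "hapB", "hapC", "hapD", "hapE"],   # illustrative haplotype names
    )

    X = data[["Path.private.R.per", "Path.core.R.per"]].to_numpy()
    data["Clusters"] = DBSCAN(eps=2, min_samples=1).fit_predict(X)
    print(data)   # hapA/hapB -> cluster 0, hapC/hapD -> cluster 1, hapE -> cluster 2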


From 880460aabd961fc0debfe669e83c5a5dd058568b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 28 Aug 2024 18:25:57 +0200
Subject: [PATCH 132/310] Update Snakefile

---
 Snakefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 138b75c..74648d6 100644
--- a/Snakefile
+++ b/Snakefile
@@ -901,7 +901,6 @@ rule create_chrGraphs_figs:
         heatmaps=expand("output/chrGraphs.stats.figs/"+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST),
         barplot_mean="output/chrGraphs.stats.figs/"+config['name']+".path.decomp.mean.png",
         scatter_mean="output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.mean.png",
-        outdir="output/chrGraphs.stats.figs/"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
@@ -913,7 +912,7 @@ rule create_chrGraphs_figs:
         mkdir -p {output.outdir}
 
         apptainer run {params.app_path}/pan1c-env.sif python scripts/chrGraphs_figs.py \
-            --input {input.pathstats} --output_dir {output.outdir} \
+            --input {input.pathstats} --output_dir $(dirname {output.barplot_mean}) \
             --panname {params.pan_name} 
         """
 
-- 
GitLab


From ddbc91de89471b463a53d6537eb01f577a27d266 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 28 Aug 2024 18:26:48 +0200
Subject: [PATCH 133/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 74648d6..19a68b7 100644
--- a/Snakefile
+++ b/Snakefile
@@ -909,7 +909,7 @@ rule create_chrGraphs_figs:
         pan_name=config['name']
     shell:
         """
-        mkdir -p {output.outdir}
+        mkdir -p $(dirname {output.barplot_mean})
 
         apptainer run {params.app_path}/pan1c-env.sif python scripts/chrGraphs_figs.py \
             --input {input.pathstats} --output_dir $(dirname {output.barplot_mean}) \
-- 
GitLab
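
Patches 130 to 133 converge on deriving the figure directory from one declared output file instead of declaring the directory itself as a Snakemake output. The shell `mkdir -p $(dirname {output.barplot_mean})` pattern has a direct Python equivalent; the path below is illustrative:

    # Derive the output directory from a target file and create it if missing,
    # mirroring `mkdir -p $(dirname ...)`.
    import os

    barplot_mean = "output/chrGraphs.stats.figs/pan1c.demo.path.decomp.mean.png"
    out_dir = os.path.dirname(barplot_mean)
    os.makedirs(out_dir, exist_ok=True)   # no error if the directory already exists
    print(out_dir)                        # output/chrGraphs.stats.figs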


From 5c8d472abc5acb9f8c0a04530957ac57d254a707 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 2 Sep 2024 17:47:06 +0200
Subject: [PATCH 134/310] Renaming and graph addition

- Renamed scripts to follow a unified naming scheme
- Added figures to general path stats
---
 Snakefile                                     |  16 +-
 .../{chrGraphs_figs.py => VCF.stats_figs.py}  |   4 +-
 ...regate.py => chrGraphs.stats_aggregate.py} |   0
 scripts/chrGraphs.stats_figs.py               | 320 ++++++++++++++++++
 ...put_stats.py => chrInput.stats_compute.py} |   0
 .../{contig_position.R => contig.pos_figs.R}  |  11 +-
 .../{core_stats.py => core.stats_compute.py}  |   0
 scripts/vcf_2_tsv_vg.awk                      |   3 +-
 8 files changed, 344 insertions(+), 10 deletions(-)
 rename scripts/{chrGraphs_figs.py => VCF.stats_figs.py} (98%)
 rename scripts/{chrGraphs_aggregate.py => chrGraphs.stats_aggregate.py} (100%)
 create mode 100644 scripts/chrGraphs.stats_figs.py
 rename scripts/{chrInput_stats.py => chrInput.stats_compute.py} (100%)
 rename scripts/{contig_position.R => contig.pos_figs.R} (90%)
 rename scripts/{core_stats.py => core.stats_compute.py} (100%)

diff --git a/Snakefile b/Snakefile
index 19a68b7..504fc26 100644
--- a/Snakefile
+++ b/Snakefile
@@ -222,7 +222,7 @@ rule contig_position:
         #cat $base/{wildcards.chromosome}.tsv
 
         # Creating figure
-        apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/contig_position.R \
+        apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/contig.pos_figs.R \
             -b $base/{wildcards.chromosome}.bands \
             -t $base/{wildcards.chromosome}.tsv \
             -o {output.fig}
@@ -655,7 +655,7 @@ rule aggregate_graphs_stats:
         pan_name=config['name']
     shell:
         """
-        apptainer run {params.app_path}/pan1c-env.sif python scripts/chrGraphs_aggregate.py \
+        apptainer run {params.app_path}/pan1c-env.sif python scripts/chrGraphs.stats_aggregate.py \
             --input $(dirname {input[0]}) --outputGeneral {output.genstats} \
             --outputPaths {output.pathstats} --panname {params.pan_name} 
         """
@@ -693,7 +693,7 @@ rule pggb_input_stats:
         pan_name=config['name']
     shell:
         """
-        apptainer run {params.app_path}/pan1c-env.sif python scripts/chrInput_stats.py \
+        apptainer run {params.app_path}/pan1c-env.sif python scripts/chrInput.stats_compute.py \
             -f data/chrInputs/*.fa.gz -o {output} -p {params.pan_name}
         """
 
@@ -714,7 +714,7 @@ rule core_statistics:
     shell:
         """
         mkdir -p {output.dir}
-        apptainer run {params.app_path}/pan1c-env.sif python scripts/core_stats.py \
+        apptainer run {params.app_path}/pan1c-env.sif python scripts/core.stats_compute.py \
             --pggbStats logs/pggb --chrInputStats {input.chrInputStats} \
             --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f {output.dir} -p {params.pan_name}
         """
@@ -901,6 +901,7 @@ rule create_chrGraphs_figs:
         heatmaps=expand("output/chrGraphs.stats.figs/"+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST),
         barplot_mean="output/chrGraphs.stats.figs/"+config['name']+".path.decomp.mean.png",
         scatter_mean="output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.mean.png",
+        heatmap_diff="output/chrGraphs.stats.figs/"+config['name']+".sharred.content.diff.png"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
@@ -911,7 +912,7 @@ rule create_chrGraphs_figs:
         """
         mkdir -p $(dirname {output.barplot_mean})
 
-        apptainer run {params.app_path}/pan1c-env.sif python scripts/chrGraphs_figs.py \
+        apptainer run {params.app_path}/pan1c-env.sif python scripts/chrGraphs.stats_figs.py \
             --input {input.pathstats} --output_dir $(dirname {output.barplot_mean}) \
             --panname {params.pan_name} 
         """
@@ -933,6 +934,7 @@ def get_report_sections(wildcards):
     sections["heatmaps"] = expand("output/chrGraphs.stats.figs/"+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST)
     sections["barplot_mean"] = "output/chrGraphs.stats.figs/"+config['name']+".path.decomp.mean.png"
     sections["scatter_mean"] = "output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.mean.png"
+    sections["heatmap_diff"] = "output/chrGraphs.stats.figs/"+config['name']+".sharred.content.diff.png"
 
     if config["get_ASMs_SyRI"] == "True":
         sections["SyRI_on_ASMs_figs"] = expand(
@@ -1029,6 +1031,10 @@ rule create_pan1c_report:
 
         ## Heatmap section
         shell("echo '## Pairwise shared content' >> {output.report}")
+        shell("echo '### Pairwise euclid distance betwwen chromosomes ' >> {output.report}")
+        _basename = os.path.basename(input.heatmap_diff)
+        shell("echo '![{_basename}](./chrGraphs.stats.figs/{_basename})' >> {output.report}")
+
         hm_figs_list = [fig for fig in input.heatmaps]
         hm_figs_list.sort()
 
diff --git a/scripts/chrGraphs_figs.py b/scripts/VCF.stats_figs.py
similarity index 98%
rename from scripts/chrGraphs_figs.py
rename to scripts/VCF.stats_figs.py
index c2d3aa0..4882337 100644
--- a/scripts/chrGraphs_figs.py
+++ b/scripts/VCF.stats_figs.py
@@ -1,7 +1,7 @@
 """
-Chromosomes statistics figure script for Pan1c workflow
+VCF statistics figure script for Pan1c workflow
 
-Use aggregated TSV produced with chrGraphs_aggregate.py
+Use aggregated TSV produced with vcf_2_tsv_syri.awk and vcf_2_tsv_vg.awk
 
 @author: alexis.mergez@inrae.fr
 @version: 1.0
diff --git a/scripts/chrGraphs_aggregate.py b/scripts/chrGraphs.stats_aggregate.py
similarity index 100%
rename from scripts/chrGraphs_aggregate.py
rename to scripts/chrGraphs.stats_aggregate.py
diff --git a/scripts/chrGraphs.stats_figs.py b/scripts/chrGraphs.stats_figs.py
new file mode 100644
index 0000000..dcad8f5
--- /dev/null
+++ b/scripts/chrGraphs.stats_figs.py
@@ -0,0 +1,320 @@
+"""
+Chromosomes statistics figure script for Pan1c workflow
+
+Use aggregated TSV produced with chrGraphs.stats_aggregate.py
+
+@author: alexis.mergez@inrae.fr
+@version: 1.0
+"""
+
+#% Libraries
+import os
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+import seaborn as sns
+import pandas as pd
+import argparse
+from sklearn.cluster import DBSCAN
+from adjustText import adjust_text
+from plottable import Table, ColDef
+
+#% Parsing arguments
+arg_parser = argparse.ArgumentParser(description='Statistic figures for Pan1c workflow')
+arg_parser.add_argument(
+    "--input",
+    "-i",
+    dest = "input",
+    required = True,
+    help = "Input TSV containing chromosome path stats"
+    )
+arg_parser.add_argument(
+    "--output_dir",
+    dest = "dir",
+    required = True,
+    help = "Output dir for figures"
+    )
+arg_parser.add_argument(
+    "--panname",
+    "-p",
+    dest = "panname",
+    required = True,
+    help = "Pangenome name"
+    )
+args = arg_parser.parse_args()
+
+#% Loading and preparing data
+def get_path_name(pathname):
+    return pathname.rsplit("#", 1)[0]
+
+# Reading data
+gData = pd.read_csv("/home/amergez/Téléchargements/GFAsp/pan1c.37Bra-v3c.chrGraph.path.stats.tsv", sep='\t')
+gData["Path.name"] = gData["Path.name"].apply(get_path_name)
+gData.set_index(["Pangenome.name","Chr.id", "Path.name"], inplace=True)
+
+# Splitting shared content and other stats
+shared_content = gData.loc[:, ["Path.length", "Shared.content"]].to_dict()
+gData.drop("Shared.content", axis=1, inplace=True)
+
+# Summing Private and core for the barplot figure
+gData["Path.PrivCore.R.Length"] = gData["Path.private.R.length"] + gData["Path.core.R.length"]
+
+# Switching to Mbp instead of Bp
+gData[["Path.length", "Path.PrivCore.R.Length", "Path.core.R.length"]] = gData[["Path.length", "Path.PrivCore.R.Length", "Path.core.R.length"]].astype(float)
+gData.loc[:, ["Path.length", "Path.PrivCore.R.Length", "Path.core.R.length"]] = gData.loc[:, ["Path.length", "Path.PrivCore.R.Length", "Path.core.R.length"]] / 1000000
+
+# Adding percentages relative to path length
+gData["Path.private.R.per"] = (gData["Path.private.R.length"]*100)/(gData["Path.length"]*1000000)
+gData["Path.core.R.per"] = (gData["Path.core.R.length"]*100)/gData["Path.length"]
+
+# Shared content
+shared_dict = {}
+A = 0
+for key, value in shared_content["Shared.content"].items():
+    for elem in value.split(';'):
+        target, stats = elem.split(":")
+        target = target.rsplit("#", 1)[0]
+
+        shared_dict[A] = list(key)+[target]+[shared_content["Path.length"][key]]+[int(val) for val in stats.split(',')]
+        A+=1
+sData = pd.DataFrame.from_dict(shared_dict, orient='index', columns = ["Pangenome.name", "Chr.id", "Query.name", "Target.name", "Path.length", "Shared.nodes.count", "Shared.length", "Shared.R.length"])
+sData.set_index(["Pangenome.name","Chr.id", "Query.name", "Target.name"], inplace=True)
+sData.loc[:, "Shared.prop"] = sData["Shared.length"]*100/sData["Path.length"]
+sData.loc[:, "Shared.R.prop"] = sData["Shared.R.length"]*100/sData["Path.length"]
+sData.loc[:, "Shared.length.mb"] = sData["Shared.length"]/1000000
+
+#% Figures functions
+# Bar plots with path decomposition into Core, Private and Other proportion
+def get_group_decomp_fig(data, title, savedir):
+    sns.set_style("ticks")
+    fig, (mbax, perax) = plt.subplots(1, 2, gridspec_kw={'wspace': 0.2, 'hspace':0}, figsize=(16, 9), dpi=300)
+    
+    # Adding bars for Mbax (in megabase)
+    sns.barplot(data, y="Path.name", x="Path.length", color='#FCDC94', estimator=np.mean, errorbar=None, orient="h", ax=mbax)
+    sns.barplot(data, y="Path.name", x="Path.PrivCore.R.Length", color='#78ABA8', estimator=np.mean, errorbar=None, orient="h", ax=mbax)
+    sns.barplot(data, y="Path.name", x="Path.core.R.length", color='#EF9C66', estimator=np.mean, errorbar=None, orient="h", ax=mbax)
+    
+    # Adding corresponding size labels on Mbax
+    for path_name in data.index.unique("Path.name"):
+        _len = data.loc[path_name, "Path.length"]
+
+        columns = ["Path.length", "Path.PrivCore.R.Length", "Path.core.R.length"]
+        for i in range(len(columns)):
+            try :
+                _var = data.loc[path_name, columns[i:i+2]]
+                x = _var.mean()       
+                value = (_var.iloc[0] - _var.iloc[1])
+            except :
+                _var = data.loc[path_name, columns[i]]
+                x = _var/2       
+                value = (_var - 0)
+            mbax.text(x, path_name, f"{round(value, 1)}Mb", horizontalalignment='center', verticalalignment='center', fontsize=10)
+
+    # Computing the relative percentages
+    data["Path.core.R.length.per"] = data["Path.core.R.length"] * 100 / data["Path.length"]
+    data["Path.PrivCore.R.Length.per"] = data["Path.PrivCore.R.Length"] * 100 / data["Path.length"]
+    data["Path.length.per"] = [100] * len(data)
+
+    # Adding bars for Perax (in percentage)
+    #sns.barplot(data, y="Path.name", x="Path.length.per", color='#FCDC94', estimator=np.mean, errorbar=None, orient="h", ax=perax)
+    sns.barplot(data, y="Path.name", x="Path.PrivCore.R.Length.per", color='#78ABA8', estimator=np.mean, errorbar=None, orient="h", ax=perax)
+    sns.barplot(data, y="Path.name", x="Path.core.R.length.per", color='#EF9C66', estimator=np.mean, errorbar=None, orient="h", ax=perax)
+
+    # Adding corresponding size labels on Perax
+    for path_name in data.index.unique("Path.name"):
+
+        columns = ["Path.PrivCore.R.Length.per", "Path.core.R.length.per"]
+        for i in range(len(columns)):
+            try :
+                _var = data.loc[path_name, columns[i:i+2]]
+                x = _var.mean()       
+                value = (_var.iloc[0] - _var.iloc[1])
+            except :
+                _var = data.loc[path_name, columns[i]]
+                x = _var/2       
+                value = (_var - 0)
+            perax.text(x, path_name, f"{round(value, 1)}%", horizontalalignment='center', verticalalignment='center', fontsize=10)
+    
+    # Adding legend
+    top_bar = mpatches.Patch(color='#FCDC94', label='Other')
+    middle_bar = mpatches.Patch(color='#78ABA8', label='Private')
+    bottom_bar = mpatches.Patch(color='#EF9C66', label='Core')
+    perax.legend(handles=[top_bar, middle_bar, bottom_bar], frameon=False)
+    sns.move_legend(perax, 'best', bbox_to_anchor=(1, 0.5))
+
+    # Adding/Modifying titles
+    fig.suptitle(title, fontsize=15 , fontweight="bold")
+    mbax.set_xlabel('Path length (Mbp)', loc='right', fontweight="bold")
+    perax.set_xlabel('Path relative proportion (%)', loc='right', fontweight="bold")
+    mbax.set_ylabel(None)
+    perax.set_ylabel(None)
+
+    sns.despine()
+    plt.savefig(savedir)
+
+# 2D scatter Core vs Private 
+def get_group_2d_fig(data, title, savedir):
+    sns.set_style("ticks")
+    fig, (ax, ax_table) = plt.subplots(1, 2, gridspec_kw={'width_ratios': [3, 2], 'wspace': 0.05}, figsize=(16, 9), dpi=300)
+    
+    # Clustering with DBSCAN
+    X = data.loc[:, ["Path.private.R.per", "Path.core.R.per"]].to_numpy()
+    data["Clusters"] = DBSCAN(eps=2, min_samples = 1).fit_predict(X).tolist()
+
+    # Plotting scatter
+    sns.scatterplot(data, x = "Path.private.R.per", y = "Path.core.R.per", hue="Clusters", palette="tab10", ax=ax)
+    
+    # Adding labels
+    TEXTS = []
+    for genome in data.index.unique("Path.name"):
+        x, y = data.loc[genome, ["Path.private.R.per", "Path.core.R.per"]]
+        TEXTS.append(ax.text(x, y, genome, color="#7F7F7F", fontsize=9))
+    adjust_text(
+        TEXTS,
+        expand=(2,2),
+        arrowprops=dict(
+            arrowstyle="->", 
+            color="#b3b3b3", 
+            lw=0.5,
+            shrinkA=0.2
+        ),
+    ax=fig.axes[0]
+    )
+    
+    # Adding table
+    table_data = {
+        f"Cluster {cluster_id}" : data[data["Clusters"] == cluster_id].index.to_list()
+        for cluster_id in sorted(data["Clusters"].unique())
+    }
+    max_row = max((len(genomes) for genomes in table_data.values()))
+    for cluster_id in table_data.keys():
+        table_data[cluster_id] += (max_row - len(table_data[cluster_id])) * ['']
+    table_df = pd.DataFrame.from_dict(table_data)
+
+    ax_table = Table(
+        table_df, 
+        row_dividers=False, 
+        column_definitions = [
+            ColDef("index", width=0, title="", textprops={"fontsize": 0})
+        ],
+        textprops={"ha": "left"}
+    )
+    
+    # Adding/Modifying titles
+    ax.set_title(title, fontsize=15 , fontweight="bold")
+    ax.set_xlabel('Private relative proportion (%)', loc='right', fontweight="bold")
+    ax.set_ylabel('Core relative proportion (%)', loc='top', fontweight="bold")
+
+    sns.despine()
+    plt.savefig(savedir)
+
+# Shared content heatmap
+def get_hm_shared_fig(data, title, savedir):
+    sns.set_style("ticks")
+    fig, ((Rax, NRax), (Rax_bar, NRax_bar)) = plt.subplots(2, 2, gridspec_kw={'wspace': 0.3, 'hspace': 0.2, 'height_ratios':[4, 0.15]}, figsize=(16, 9), dpi=300)
+
+    # Pivoting the dataframe for the heatmap
+    data["Shared.length"] = data["Shared.length"]/1000000
+    srTable = data.reset_index().pivot(values=["Shared.R.prop"], index=["Query.name"], columns=["Target.name"]).fillna(100)
+    snrTable = data.reset_index().pivot(values=["Shared.length"], index=["Query.name"], columns=["Target.name"])
+    
+    # Removing multi-index from columns
+    srTable.columns = srTable.columns.droplevel()
+    snrTable.columns = snrTable.columns.droplevel()
+    
+    # Heatmap
+    sns.heatmap(srTable, cmap="Spectral_r", square = True, ax=Rax, cbar=False)
+    fig.colorbar(Rax.collections[0], cax=Rax_bar, orientation='horizontal')
+    sns.heatmap(snrTable, cmap="Spectral_r", square = True, ax=NRax, mask=np.triu(snrTable), cbar=False)
+    fig.colorbar(NRax.collections[0], cax=NRax_bar, orientation='horizontal')
+    
+    # Adding/Modifying titles
+    fig.suptitle(title, fontsize=15 , fontweight="bold")
+    Rax.set_title("Relative proportion of shared content (%)", fontsize=13 , fontweight="bold")
+    NRax.set_title("Node size of shared content (Mbp)", fontsize=13 , fontweight="bold")
+    Rax.set_xlabel(None)
+    NRax.set_xlabel(None)
+    Rax.set_ylabel('Query', fontweight="bold")
+    NRax.set_ylabel(None)
+
+    sns.despine()
+    plt.savefig(savedir)
+
+# Shared content difference between chromosomes heatmap
+def get_hm_diff_fig(data, title, savedir):
+    sns.set_style("ticks")
+    fig, ax = plt.subplots(1, 1, figsize=(10, 9), dpi=300)
+    
+    # Heatmap
+    sns.heatmap(data, cmap="hot_r", square = True, ax=ax, mask=np.triu(data))
+    
+    # Adding/Modifying titles
+    fig.suptitle(title, fontsize=15 , fontweight="bold")
+    ax.set_xlabel(None)
+    ax.set_ylabel('Query', fontweight="bold")
+
+    sns.despine()
+    plt.savefig(savedir)
+
+#% Figure generation
+# Bar plots
+for pangenome in gData.index.unique("Pangenome.name"):
+    ## For each chromosome
+    for chrid in gData.index.unique("Chr.id"):
+        get_group_decomp_fig(
+            data = gData.loc[pangenome, chrid,:].copy(), 
+            title = f"Path composition by groups - {pangenome} - {chrid}",
+            savedir = os.path.join(args.dir, f"{pangenome}.path.decomp.{chrid}.png")
+        )
+    ## For the mean across chromosomes 
+    get_group_decomp_fig(
+        data = gData.groupby("Path.name").sum().copy(), 
+        title = "Haplotypes mean composition accross chromosomes",
+        savedir = os.path.join(args.dir, f"{pangenome}.path.decomp.mean.png")
+    )
+
+# 2D Scatter Core vs Private
+for pangenome in gData.index.unique("Pangenome.name"):
+    for chrid in gData.index.unique("Chr.id"):
+        get_group_2d_fig(
+            data = gData.loc[pangenome, chrid,:].copy(), 
+            title = f"Path composition by groups - {pangenome} - {chrid}",
+            savedir = os.path.join(args.dir, f"{pangenome}.2D.scatter.{chrid}.png")
+        )
+    get_group_2d_fig(
+        data = gData.groupby("Path.name").mean().copy(), 
+        title = "Haplotypes mean composition accross chromosomes",
+        savedir = os.path.join(args.dir, f"{pangenome}.2D.scatter.mean.png")
+    )
+
+# Shared content heatmap
+for pangenome in sTable.index.unique("Pangenome.name"):
+    for chrid in sTable.index.unique("Chr.id"):
+        get_hm_shared_fig(
+            data = sTable.loc[pangenome, chrid,:].copy(), 
+            title = f"Shared content (%) - {pangenome} - {chrid}",
+            savedir = os.path.join(args.dir, f"{pangenome}.sharred.content.{chrid}.png")
+        )
+
+# Shared content difference between chromosomes heatmap
+dData = {"Q":[], "T":[], "Diff":[]}
+for pangenome in sData.index.unique("Pangenome.name"):
+    # Iterating over chromosomes twice to make pairs
+    for Q in sData.index.unique("Chr.id"):
+        for T in sData.index.unique("Chr.id"):
+            Qtable = sData.loc[pangenome, Q, :].copy().reset_index().pivot(values=["Shared.prop"], index=["Query.name"], columns=["Target.name"]).fillna(0)
+            Ttable = sData.loc[pangenome, T, :].copy().reset_index().pivot(values=["Shared.prop"], index=["Query.name"], columns=["Target.name"]).fillna(0)
+
+            # Computing Euclidean distance using the Frobenius norm
+            dData["Q"].append(Q)
+            dData["T"].append(T)
+            dData["Diff"].append(np.linalg.norm(Qtable.values-Ttable.values, ord = 'fro'))
+            
+    dData = pd.DataFrame.from_dict(dData).pivot(values=["Diff"], index=["Q"], columns=["T"])
+    dData.columns = dData.columns.droplevel()
+    
+    get_hm_diff_fig(
+        dData, 
+        title = f"Matrix distance between chromosomes - {pangenome}",
+        savedir = os.path.join(args.dir, f"{pangenome}.sharred.content.diff.png")
+    )
\ No newline at end of file
diff --git a/scripts/chrInput_stats.py b/scripts/chrInput.stats_compute.py
similarity index 100%
rename from scripts/chrInput_stats.py
rename to scripts/chrInput.stats_compute.py
diff --git a/scripts/contig_position.R b/scripts/contig.pos_figs.R
similarity index 90%
rename from scripts/contig_position.R
rename to scripts/contig.pos_figs.R
index b0eeb0a..58d849f 100644
--- a/scripts/contig_position.R
+++ b/scripts/contig.pos_figs.R
@@ -1,3 +1,12 @@
+"""
+Contig position figure script for Pan1c workflow
+
+Use contig positions to produce a figure showing their location within their respective chromosomes
+
+@author: alexis.mergez@inrae.fr, adapted from Cedric Cabau
+@version: 1.0
+"""
+
 library("karyoploteR")
 library("optparse")
 
@@ -70,8 +79,6 @@ if (nChr >= 4){
 
 }
 
-
-
 kp <- plotKaryotype(genome=my.genome, cytobands=my.cytobands, plot.params=pp, chromosomes="all")
 kp <- kpAddBaseNumbers(kp, cex=0.6, tick.dist=5000000)
 dev.off()
\ No newline at end of file
diff --git a/scripts/core_stats.py b/scripts/core.stats_compute.py
similarity index 100%
rename from scripts/core_stats.py
rename to scripts/core.stats_compute.py
diff --git a/scripts/vcf_2_tsv_vg.awk b/scripts/vcf_2_tsv_vg.awk
index 1262ba2..75563c2 100644
--- a/scripts/vcf_2_tsv_vg.awk
+++ b/scripts/vcf_2_tsv_vg.awk
@@ -1,5 +1,6 @@
 # Header
 /^#CHROM/ {
+    # Getting the name of haplotypes
     for (i=10;i<=NF;i++) {HAP[i]=$i}; print "CHROM\tPOS\tID\tHAP\tLEN"
 }
 
@@ -23,7 +24,7 @@
 		for (a in ALTs) { 
             hap+=1
 			if (ALTs[a] != "." && ALTLEN[ALTs[a]] != 0) {  
-				printf("%s\t%s\t%s\t%s#%s\t%s\n", $1, $2, $3, HAP[i], hap, ALTLEN[ALTs[a]])
+				printf("%s\t%s\t%s|%s\t%s#%s\t%s\n", $1, $2, $3, ALTs[a], HAP[i], hap, ALTLEN[ALTs[a]])
             }
         }
     }
-- 
GitLab
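
The new chromosome-distance heatmap summarises each chromosome by its pairwise shared-content matrix (query x target, in %) and compares two chromosomes with the Frobenius norm of the matrix difference, i.e. the Euclidean distance between the matrices flattened into vectors. A minimal sketch with two toy 3x3 matrices:

    # Frobenius-norm distance between two shared-content matrices; identical
    # chromosomes give 0, and the value grows with how differently their
    # haplotypes share sequence.
    import numpy as np

    chr1 = np.array([[100.0, 80.0, 60.0],
                     [ 78.0, 100.0, 55.0],
                     [ 61.0,  57.0, 100.0]])   # toy matrix for chromosome 1
    chr2 = np.array([[100.0, 75.0, 65.0],
                     [ 74.0, 100.0, 50.0],
                     [ 66.0,  52.0, 100.0]])   # toy matrix for chromosome 2

    dist = np.linalg.norm(chr1 - chr2, ord="fro")
    print(round(dist, 2))   # same as np.sqrt(((chr1 - chr2) ** 2).sum())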


From ab85ef3a3db88fe21928ce508e1f735aff158312 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 2 Sep 2024 18:46:02 +0200
Subject: [PATCH 135/310] Switched to python for the VCF figures

- Switched from analyze_VCF_v2.R to VCF.stats_figs.py (R to Python)
- Fixed absolute path in chrGraphs.stats_figs.py
- Removed vcf_fig config entry from config.yaml and config_CICD.yaml
---
 Snakefile                       |  15 +-
 config.yaml                     |   1 -
 example/config_CICD.yaml        |   1 -
 scripts/VCF.stats_figs.py       | 473 +++++++++++++++++++-------------
 scripts/chrGraphs.stats_figs.py |   2 +-
 5 files changed, 286 insertions(+), 206 deletions(-)

diff --git a/Snakefile b/Snakefile
index 504fc26..423a159 100644
--- a/Snakefile
+++ b/Snakefile
@@ -844,14 +844,13 @@ rule vcf_fig:
         zcat {input.vg} | awk -f scripts/vcf_2_tsv_vg.awk > {output.vcf_fig}/vg.tsv
 
         #% Running R to get the figures
-        apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/analyze_VCF_v2.R \
-            -v {output.vcf_fig}/vg.tsv \
-            -s {output.vcf_fig}/syri.tsv \
-            -o {output.vcf_fig} \
-            -p {params.pan_name} \
-            {params.fig_config}
-
-        rm {output.vcf_fig}/*.tsv
+        apptainer run {params.app_path}/pan1c-env.sif python scripts/VCF.stats_figs.py \
+            --vg {output.vcf_fig}/vg.tsv \
+            --syri {output.vcf_fig}/syri.tsv \
+            --output_dir {output.vcf_fig} \
+            --panname {params.pan_name}
+
+        #rm {output.vcf_fig}/*.tsv
         """
 
 rule create_pan1c_report_fig:
diff --git a/config.yaml b/config.yaml
index e776404..06c5ab2 100644
--- a/config.yaml
+++ b/config.yaml
@@ -39,6 +39,5 @@ get_ASMs_SyRI: 'False' # Haplotype vs Reference
 get_chrInputs_SyRI: 'False' # SyRI on chrInputs
 # Producing VCF and its associated INS/DEL figure
 get_VCF: 'False'
-vcf_fig.params: '--binwidth 0.05 --height 6 --width 18'
 # Creating final report
 create_report: 'True'
diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml
index 4dc0dd4..f3adb7d 100644
--- a/example/config_CICD.yaml
+++ b/example/config_CICD.yaml
@@ -40,6 +40,5 @@ get_ASMs_SyRI: 'True' # Haplotype vs Reference
 get_chrInputs_SyRI: 'True' # SyRI on chrInputs
 # Producing VCF and its associated INS/DEL figure
 get_VCF: 'False'
-vcf_fig.params: '--binwidth 0.05 --height 6 --width 18'
 # Creating final report
 create_report: 'True'
diff --git a/scripts/VCF.stats_figs.py b/scripts/VCF.stats_figs.py
index 4882337..3ef407b 100644
--- a/scripts/VCF.stats_figs.py
+++ b/scripts/VCF.stats_figs.py
@@ -9,24 +9,24 @@ Use aggregated TSV produced with vcf_2_tsv_syri.awk and vcf_2_tsv_vg.awk
 
 #% Librairies
 import os
-import numpy as np
+import pandas as pd
 import matplotlib.pyplot as plt
-import matplotlib.patches as mpatches
+from matplotlib.lines import Line2D
 import seaborn as sns
-import pandas as pd
-import argparse
-from sklearn.cluster import DBSCAN
-from adjustText import adjust_text
-from plottable import Table, ColDef
 
 #% Parsing arguments
-arg_parser = argparse.ArgumentParser(description='Statistic figures for Pan1c workflow')
+arg_parser = argparse.ArgumentParser(description='VCF statistic figures for Pan1c workflow')
+arg_parser.add_argument(
+    "--vg",
+    dest = "vg",
+    required = True,
+    help = "Input TSV containing vg variants"
+    )
 arg_parser.add_argument(
-    "--input",
-    "-i",
-    dest = "input",
+    "--syri",
+    dest = "syri",
     required = True,
-    help = "Input TSV containing chromosome path stats"
+    help = "Input TSV containing syri detected variants"
     )
 arg_parser.add_argument(
     "--output_dir",
@@ -44,202 +44,285 @@ arg_parser.add_argument(
 args = arg_parser.parse_args()
 
 #% Loading and preparing data
-def get_path_name(pathname):
-    return pathname.rsplit("#", 1)[0]
-
-# Reading data
-gData = pd.read_csv(args.input, sep='\t')
-gData["Path.name"] = gData["Path.name"].apply(get_path_name)
-gData.set_index(["Pangenome.name","Chr.id", "Path.name"], inplace=True)
-
-# Splitting shared content and other stats
-shared_content = gData.loc[:, ["Path.length", "Shared.content"]].to_dict()
-gData.drop("Shared.content", axis=1, inplace=True)
-
-# Summing Private and core for the barplot figure
-gData["Path.PrivCore.R.Length"] = gData["Path.private.R.length"] + gData["Path.core.R.length"]
-
-# Switching to Mbp instead of Bp
-gData[["Path.length", "Path.PrivCore.R.Length", "Path.core.R.length"]] = gData[["Path.length", "Path.PrivCore.R.Length", "Path.core.R.length"]].astype(float)
-gData.loc[:, ["Path.length", "Path.PrivCore.R.Length", "Path.core.R.length"]] = gData.loc[:, ["Path.length", "Path.PrivCore.R.Length", "Path.core.R.length"]] / 1000000
-
-# Adding percentages relative to path length
-gData["Path.private.R.per"] = (gData["Path.private.R.length"]*100)/(gData["Path.length"]*1000000)
-gData["Path.core.R.per"] = (gData["Path.core.R.length"]*100)/gData["Path.length"]
-
-# Shared content
-shared_dict = {}
-A = 0
-for key, value in shared_content["Shared.content"].items():
-    for elem in value.split(';'):
-        target, stats = elem.split(":")
-        target = target.rsplit("#", 1)[0]
-
-        shared_dict[A] = list(key)+[target]+[shared_content["Path.length"][key]]+[int(val) for val in stats.split(',')]
-        A+=1
-sData = pd.DataFrame.from_dict(shared_dict, orient='index', columns = ["Pangenome.name", "Chr.id", "Query.name", "Target.name", "Path.length", "Shared.nodes.count", "Shared.length", "Shared.R.length"])
-sData.set_index(["Pangenome.name","Chr.id", "Query.name", "Target.name"], inplace=True)
-sData.loc[:, "Shared.prop"] = sData["Shared.length"]*100/sData["Path.length"]
-sData.loc[:, "Shared.R.prop"] = sData["Shared.R.length"]*100/sData["Path.length"]
-
-# Pivoting the dataframe for the heatmap
-sTable = sData.reset_index().pivot(values=["Shared.R.prop"], index=["Pangenome.name", "Chr.id", "Query.name"], columns=["Target.name"])
-
-# Removing multi-index from columns
-sTable.columns = sTable.columns.droplevel()
-
-#% Figures functions
-# Bar plots with path decomposition into Core, Private and Other proportion
-def get_group_decomp_fig(data, title, savedir):
-    sns.set_style("ticks")
-    plt.figure(figsize=(16, 9), dpi = 256)
-    
-    # Adding bars
-    bar = sns.barplot(data, y="Path.name", x="Path.length", color='#FCDC94', estimator=np.mean, errorbar=None, orient="h")
-    
-    priv_bar = sns.barplot(data, y="Path.name", x="Path.PrivCore.R.Length", color='#78ABA8', estimator=np.mean, errorbar=None, orient="h")
-    
-    core_bar = sns.barplot(data, y="Path.name", x="Path.core.R.length", color='#EF9C66', estimator=np.mean, errorbar=None, orient="h")
-
-    # Adding relative percentages
-    for path_name in data.index.unique("Path.name"):
-        _len = data.loc[path_name, "Path.length"]
-
-        columns = ["Path.length", "Path.PrivCore.R.Length", "Path.core.R.length"]
-        for i in range(len(columns)):
-            try :
-                _var = data.loc[path_name, columns[i:i+2]]
-                x = _var.mean()       
-                value = ((_var.iloc[0] - _var.iloc[1])/_len)*100
-            except :
-                _var = data.loc[path_name, columns[i]]
-                x = _var/2       
-                value = ((_var - 0)/_len)*100
-            plt.text(x, path_name, f"{round(value, 2)}%", horizontalalignment='center', verticalalignment='center', fontsize=10)
-    
-    # Adding legend
-    top_bar = mpatches.Patch(color='#FCDC94', label='Other')
-    middle_bar = mpatches.Patch(color='#78ABA8', label='Private')
-    bottom_bar = mpatches.Patch(color='#EF9C66', label='Core')
-    plt.legend(handles=[top_bar, middle_bar, bottom_bar], frameon=False)
-    sns.move_legend(bar, 'best', bbox_to_anchor=(1, 0.5))
-
-    # Adding/Modifying titles
-    plt.title(title, fontsize=15 , fontweight="bold")
-    plt.xlabel('Path length (Mbp)', loc='right', fontweight="bold")
-    plt.ylabel(None)
-
-    sns.despine()
-    plt.savefig(savedir)
+## VG dataset
+vg = pd.read_csv(args.vg, sep="\t")
+vg.rename(columns={"CHROM": "NAME"}, inplace=True)
+vg[["QUERY", "CHROM"]] = vg["NAME"].str.rsplit("#", n=1, expand=True)
+vg[["Genome", "Haplotype"]] = vg["HAP"].str.rsplit("#", n=1, expand=True)
+vg.set_index(["CHROM", "QUERY", "HAP"], inplace=True)
+vg.drop("NAME", axis=1, inplace=True)
+vg.loc[:,"LEN"] = -vg["LEN"]
+
+## Syri dataset
+sr = pd.read_csv(args.syri, sep="\t")
+sr.rename(columns={"CHROM": "NAME"}, inplace=True)
+sr[["QUERY", "CHROM"]] = sr["NAME"].str.rsplit("#", n=1, expand=True)
+sr[["Genome", "Haplotype"]] = sr["HAP"].str.rsplit("#", n=1, expand=True)
+sr.set_index(["CHROM", "QUERY", "HAP"], inplace=True)
+sr.drop("NAME", axis=1, inplace=True)
+sr.loc[:,"LEN"] = -sr["LEN"]
+
+## Getting ref name
+REF = vg.index.unique("QUERY")[0].split("#")[0]
+
+#% Figure settings
+## CMAP & Genome order
+hue_order = sorted(vg.index.unique("HAP"))
+    
+# Creating color palette based on genome instead of haplotype
+genomes = sorted(vg["Genome"].unique())
+col_values = sns.color_palette(n_colors=len(genomes))
+colors = dict(zip(genomes, col_values))
+cmap = {
+    f"{name}#{hap}": color 
+    for name, color in colors.items()
+    for hap in range(1, int(vg["Haplotype"].max())+1)
+}
+
+#% Figure functions
+# General histogram
+def hist_general(data, title, savedir, hue_order=hue_order, colors=colors, cmap=cmap, genomes=genomes):
+    INS = data.query("LEN > 50 & LEN < 100000").rename(columns={"LEN": "INS"})
+    DEL = data.query("LEN < -50 & LEN > -100000").rename(columns={"LEN": "DEL"})
+    DEL.loc[:,"DEL"] = -DEL["DEL"]
 
-# 2D scatter Core vs Private 
-def get_group_2d_fig(data, title, savedir):
-    sns.set_style("ticks")
-    fig, (ax, ax_table) = plt.subplots(1, 2, gridspec_kw={'width_ratios': [3, 2], 'wspace': 0.05}, figsize=(16, 9), dpi=256)
-    
-    # Clustering with DBSCAN
-    X = data.loc[:, ["Path.private.R.per", "Path.core.R.per"]].to_numpy()
-    data["Clusters"] = DBSCAN(eps=2, min_samples = 1).fit_predict(X).tolist()
-
-    # Plotting scatter
-    sns.scatterplot(data, x = "Path.private.R.per", y = "Path.core.R.per", hue="Clusters", palette="tab10", ax=ax)
-    
-    # Adding labels
-    TEXTS = []
-    for genome in data.index.unique("Path.name"):
-        x, y = data.loc[genome, ["Path.private.R.per", "Path.core.R.per"]]
-        TEXTS.append(ax.text(x, y, genome, color="#7F7F7F", fontsize=9))
-    adjust_text(
-        TEXTS,
-        expand=(2,2),
-        arrowprops=dict(
-            arrowstyle="->", 
-            color="#b3b3b3", 
-            lw=0.5,
-            shrinkA=0.2
-        ),
-    ax=fig.axes[0]
+    LS = ['solid', 'dashed', 'dashdot', 'dotted']
+    
+    fig, (delax, insax) = plt.subplots(1, 2, gridspec_kw={'wspace': 0}, figsize=(16, 9), dpi=300, sharey=True)
+    
+    # Plotting deletions
+    sns.histplot(
+        data=DEL, 
+        x="DEL", 
+        hue="HAP", 
+        hue_order=hue_order,
+        palette = cmap,
+        binwidth=0.05, 
+        log_scale=True, 
+        element='poly', 
+        fill=False,
+        ax=delax,
+        legend=False
     )
     
-    # Adding table
-    table_data = {
-        f"Cluster {cluster_id}" : data[data["Clusters"] == cluster_id].index.to_list()
-        for cluster_id in sorted(data["Clusters"].unique())
-    }
-    max_row = max((len(genomes) for genomes in table_data.values()))
-    for cluster_id in table_data.keys():
-        table_data[cluster_id] += (max_row - len(table_data[cluster_id])) * ['']
-    table_df = pd.DataFrame.from_dict(table_data)
-
-    ax_table = Table(
-        table_df, 
-        row_dividers=False, 
-        column_definitions = [
-            ColDef("index", width=0, title="", textprops={"fontsize": 0})
-        ],
-        textprops={"ha": "left"}
+    # Reversing xaxis on DEL
+    delax.set(xlim=delax.get_xlim()[::-1])
+    sns.despine(ax=delax)
+    delax.grid()
+    
+    # Plotting insertions
+    sns.histplot(
+        data=INS, 
+        x="INS", 
+        hue="HAP", 
+        hue_order=hue_order, 
+        palette = cmap,
+        binwidth=0.05, 
+        log_scale=True, 
+        element='poly', 
+        fill=False,
+        ax=insax,
+        legend=False
     )
     
-    # Adding/Modifying titles
-    ax.set_title(title, fontsize=15 , fontweight="bold")
-    ax.set_xlabel('Private relative proportion (%)', loc='right', fontweight="bold")
-    ax.set_ylabel('Core relative proportion (%)', loc='top', fontweight="bold")
+    insax.grid()
+    
+    # Changing linestyle based on haplotype id 
+    for ax in [insax, delax]:
+        for line, hap in zip(ax.lines, hue_order):
+            # Apply the line style according to the haplotype ID
+            line.set_linestyle(LS[int(hap.split('#')[-1])-1])
 
-    sns.despine()
+    # Custom legend
+    genome_legend_elements = [
+        Line2D([0],[0], color=colors[Genome], linestyle="solid", label=Genome)
+        for Genome in genomes
+    ]
+    hap_legend_elements = [
+        Line2D([0],[0], color="black", linestyle=LS[i-1], label=i)
+        for i in range(1, int(data["Haplotype"].max())+1)
+    ]
+    leg_handles = [Line2D([],[], color="w", label="\n$\\bf{Haplotypes}$")]+genome_legend_elements+[Line2D([],[], color="w", label="\n$\\bf{Hap ID}$")]+hap_legend_elements
+    insax.legend(handles = leg_handles, ncol=2, frameon=False)
+    
+    # Y axis to the right
+    insax.yaxis.tick_right()
+    
+    # Changing labels of xTicks
+    insax.set_xticks([100, 1000, 10000, 100000])
+    insax.set_xticklabels(["100bp", "1kbp", "10kbp", "100kbp"])
+    delax.set_xticks([100, 1000, 10000, 100000])
+    delax.set_xticklabels(["-100bp", "-1kbp", "-10kbp", "-100kbp"])
+    
+    sns.despine(ax=insax, right=False)
+    
+    # Moving legend outside the figure
+    sns.move_legend(insax, 'best', bbox_to_anchor=(1.3, 1.05))
+   
+    fig.suptitle(title, fontsize=15 , fontweight="bold")
+
+    plt.tight_layout()
+    
     plt.savefig(savedir)
+    plt.close()
 
-# Shared content heatmap
-def get_hm_shared_fig(data, title, savedir):
-    sns.set_style("ticks")
-    plt.figure(figsize=(16, 9), dpi=256)
+# Genome-specific histograms
+def hist_genome(data, title, savedir):
+    INS = data.query("LEN > 50 & LEN < 100000").rename(columns={"LEN": "INS"})
+    DEL = data.query("LEN < -50 & LEN > -100000").rename(columns={"LEN": "DEL"})
+    DEL.loc[:,"DEL"] = -DEL["DEL"]
+
+    genomes = sorted(data["Genome"].unique())
+    genomes = [(tool, name) for name in genomes for tool in data.index.unique("TOOL")]
+    col_values = sns.color_palette(n_colors=len(genomes)*2)
+    colors = dict(zip(genomes, col_values))
+    cmap = {
+            cur_tool: {
+            f"{name}#{hap}": color 
+            for (tool, name), color in colors.items()
+            for hap in range(1, int(data["Haplotype"].max())+1)
+            if tool == cur_tool
+        }
+        for cur_tool in data.index.unique("TOOL")
+    }
+
+    LS = ['solid', 'dashed', 'dashdot', 'dotted']
+    
+    fig, (delax, insax) = plt.subplots(1, 2, gridspec_kw={'wspace': 0}, figsize=(16, 9), dpi=300, sharey=True)
+    
+    # Plotting deletions
+    sns.histplot(
+        data=DEL.query("TOOL == 'VG'"), 
+        x="DEL", 
+        hue="HAP", 
+        #hue_order=hue_order,
+        palette = cmap['VG'],
+        binwidth=0.05, 
+        log_scale=True, 
+        element='poly', 
+        fill=False,
+        ax=delax,
+        legend=False
+    )
+    sns.histplot(
+        data=DEL.query("TOOL == 'Syri'"), 
+        x="DEL", 
+        hue="HAP", 
+        #hue_order=hue_order,
+        palette = cmap['Syri'],
+        binwidth=0.05, 
+        log_scale=True, 
+        element='poly', 
+        fill=False,
+        ax=delax,
+        legend=False
+    )
+    
+    # Reversing xaxis on DEL
+    delax.set(xlim=delax.get_xlim()[::-1])
+    sns.despine(ax=delax)
+    delax.grid()
+    
+    # Plotting insertions
+    sns.histplot(
+        data=INS.query("TOOL == 'VG'"), 
+        x="INS", 
+        hue="HAP", 
+        #hue_order=hue_order, 
+        palette = cmap['VG'],
+        binwidth=0.05, 
+        log_scale=True, 
+        element='poly', 
+        fill=False,
+        ax=insax,
+        legend=False
+    )
+    sns.histplot(
+        data=INS.query("TOOL == 'Syri'"), 
+        x="INS", 
+        hue="HAP", 
+        #hue_order=hue_order, 
+        palette = cmap['Syri'],
+        binwidth=0.05, 
+        log_scale=True, 
+        element='poly', 
+        fill=False,
+        ax=insax,
+        legend=False
+    )
     
-    # Heatmap
-    ax = sns.heatmap(data, cmap="Spectral_r", square = True)
+    insax.grid()
     
-    # Adding/Modifying titles
-    plt.title(title, fontsize=15 , fontweight="bold")
-    plt.xlabel('Target', fontweight="bold")
-    plt.ylabel('Query', fontweight="bold")
+    # Changing linestyle based on haplotype id 
+    for ax in [insax, delax]:
+        for line, hap in zip(ax.lines, hue_order):
+            # Apply the line style according to the haplotype ID
+            line.set_linestyle(LS[int(hap.split('#')[-1])-1])
 
-    sns.despine()
+    # Custom legend
+    tool_legend_elements = [
+        Line2D([0],[0], color=colors[(tool, Genome)], linestyle="solid", label=tool)
+        for tool, Genome in genomes
+    ]
+    
+    hap_legend_elements = [
+        Line2D([0],[0], color="black", linestyle=LS[i-1], label=i)
+        for i in range(1, int(data["Haplotype"].max())+1)
+    ]
+    leg_handles = [Line2D([],[], color="w", label="\n$\\bf{Tool}$")]+tool_legend_elements+[Line2D([],[], color="w", label="\n$\\bf{Hap ID}$")]+hap_legend_elements
+
+    insax.legend(handles = leg_handles, frameon=False)
+    
+    # Y axis to the right
+    insax.yaxis.tick_right()
+    
+    # Changing labels of xTicks
+    insax.set_xticks([100, 1000, 10000, 100000])
+    insax.set_xticklabels(["100bp", "1kbp", "10kbp", "100kbp"])
+    delax.set_xticks([100, 1000, 10000, 100000])
+    delax.set_xticklabels(["-100bp", "-1kbp", "-10kbp", "-100kbp"])
+    
+    sns.despine(ax=insax, right=False)
+    
+    # Moving legend outside the figure
+    sns.move_legend(insax, 'best', bbox_to_anchor=(1.2, 1))
+  
+    fig.suptitle(title, fontsize=15 , fontweight="bold")
+
+    plt.tight_layout()
+    
     plt.savefig(savedir)
+    plt.close()
 
-#% Figure generation
-# Bar plots
-for pangenome in gData.index.unique("Pangenome.name"):
-    ## For each chromosome
-    for chrid in gData.index.unique("Chr.id"):
-        get_group_decomp_fig(
-            data = gData.loc[pangenome, chrid,:], 
-            title = f"Path composition by groups - {pangenome} - {chrid}",
-            savedir = os.path.join(args.dir, f"{pangenome}.path.decomp.{chrid}.png")
-        )
-    ## For the mean across chromosomes 
-    get_group_decomp_fig(
-        data = gData.groupby("Path.name").sum(), 
-        title = "Haplotypes mean composition accross chromosomes",
-        savedir = os.path.join(args.dir, f"{pangenome}.path.decomp.mean.png")
-    )
+#% Generating figures
+# General histograms
+hist_general(
+    vg, 
+    f"Pan1c - {args.panname} - {REF} - VG", 
+    os.path.join(ars.output_dir, f"pan1c.{args.panname}.General.vcf.vg.png")
+)
+hist_general(
+    sr, 
+    f"Pan1c - {args.panname} - {REF} - Syri", 
+    os.path.join(ars.output_dir, f"pan1c.{args.panname}.General.vcf.syri.png")
+)
 
-# 2D Scatter Core vs Private
-for pangenome in gData.index.unique("Pangenome.name"):
-    for chrid in gData.index.unique("Chr.id"):
-        get_group_2d_fig(
-            data = gData.loc[pangenome, chrid,:], 
-            title = f"Path composition by groups - {pangenome} - {chrid}",
-            savedir = os.path.join(args.dir, f"{pangenome}.2D.scatter.{chrid}.png")
-        )
-    get_group_2d_fig(
-        data = gData.groupby("Path.name").mean(), 
-        title = "Haplotypes mean composition accross chromosomes",
-        savedir = os.path.join(args.dir, f"{pangenome}.2D.scatter.mean.png")
+# Genome specific histograms
+# Iterating over genomes
+for genome in sorted(vg["Genome"].unique()):
+    
+    # Concatenating data from both datasets
+    data = pd.concat(
+        {
+            "VG": vg.query("Genome == @genome"),
+            "Syri": sr.query("Genome == @genome")
+        },
+        names=["TOOL"]
     )
 
-# Shared content heatmap
-for pangenome in sTable.index.unique("Pangenome.name"):
-    for chrid in sTable.index.unique("Chr.id"):
-        get_hm_shared_fig(
-            data = sTable.loc[pangenome, chrid,:], 
-            title = f"Shared content (%) - {pangenome} - {chrid}",
-            savedir = os.path.join(args.dir, f"{pangenome}.sharred.content.{chrid}.png")
-        )
\ No newline at end of file
+    # Creating the figure
+    hist_genome(
+        data, 
+        f"Pan1c - {args.panname} - {REF} - {genome}", 
+        os.path.join(ars.output_dir, f"pan1c.{args.panname}.{genome}.vcf.both.png")
+    )
\ No newline at end of file
diff --git a/scripts/chrGraphs.stats_figs.py b/scripts/chrGraphs.stats_figs.py
index dcad8f5..8132405 100644
--- a/scripts/chrGraphs.stats_figs.py
+++ b/scripts/chrGraphs.stats_figs.py
@@ -48,7 +48,7 @@ def get_path_name(pathname):
     return pathname.rsplit("#", 1)[0]
 
 # Reading data
-gData = pd.read_csv("/home/amergez/Téléchargements/GFAsp/pan1c.37Bra-v3c.chrGraph.path.stats.tsv", sep='\t')
+gData = pd.read_csv(args.input, sep='\t')
 gData["Path.name"] = gData["Path.name"].apply(get_path_name)
 gData.set_index(["Pangenome.name","Chr.id", "Path.name"], inplace=True)
 
-- 
GitLab
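
The core of the new VCF.stats_figs.py is a pair of log-scale size histograms (deletions on the left, insertions on the right) drawn as unfilled 'poly' outlines so several haplotypes can be overlaid. A reduced sketch of one such panel with synthetic lengths; the 50 bp-100 kbp window and binwidth=0.05 match the values used in the script, everything else is illustrative:

    # One insertion-size panel: log-scaled x axis, bins of width 0.05 in log10
    # units, unfilled polygon outlines coloured by haplotype.
    import numpy as np
    import pandas as pd
    import matplotlib
    matplotlib.use("Agg")                 # headless backend for the sketch
    import matplotlib.pyplot as plt
    import seaborn as sns

    rng = np.random.default_rng(0)
    lengths = pd.DataFrame({
        "LEN": rng.lognormal(mean=6, sigma=1.5, size=2000).astype(int),  # synthetic sizes
        "HAP": rng.choice(["genomeA#1", "genomeB#1"], size=2000),
    })
    ins = lengths.query("LEN > 50 & LEN < 100000")

    fig, ax = plt.subplots(figsize=(8, 5), dpi=150)
    sns.histplot(data=ins, x="LEN", hue="HAP", binwidth=0.05, log_scale=True,
                 element="poly", fill=False, ax=ax)
    ax.set_xticks([100, 1000, 10000, 100000])
    ax.set_xticklabels(["100bp", "1kbp", "10kbp", "100kbp"])
    plt.savefig("ins_hist_demo.png")
    plt.close(fig)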


From 7bed21c96390e5544e9871b1b86ad96219de8a7b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 2 Sep 2024 18:49:51 +0200
Subject: [PATCH 136/310] Update Snakefile

---
 Snakefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 423a159..8b8c965 100644
--- a/Snakefile
+++ b/Snakefile
@@ -810,7 +810,6 @@ rule vcf_fig:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
     params:
         app_path=config['app.path'],
-        fig_config=config['vcf_fig.params'],
         pan_name=config['name'],
         refname=config['reference']
     shell:
-- 
GitLab
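
The dropped vcf_fig.params entry carried plot options for the old R script; the Python replacement reads the two awk-generated TSVs directly and derives genome and haplotype labels from their columns, splitting the composite CHROM ("query#chromosome") and HAP ("genome#haplotype") fields on their last '#'. A minimal sketch of that splitting step; column names follow the script, the rows are invented:

    # Split the composite name columns on the last '#' to recover the query,
    # chromosome, genome and haplotype labels.
    import pandas as pd

    vg = pd.DataFrame({
        "CHROM": ["genomeA#1#chr01", "genomeB#2#chr01"],   # toy path names
        "HAP": ["genomeA#1", "genomeB#2"],
        "LEN": [120, -340],
    })
    vg = vg.rename(columns={"CHROM": "NAME"})
    vg[["QUERY", "CHROM"]] = vg["NAME"].str.rsplit("#", n=1, expand=True)
    vg[["Genome", "Haplotype"]] = vg["HAP"].str.rsplit("#", n=1, expand=True)
    print(vg[["QUERY", "CHROM", "Genome", "Haplotype", "LEN"]])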


From 11b65c7ea25ab5aeae9d4f2c8eb592b2fb43ada2 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 2 Sep 2024 23:27:49 +0200
Subject: [PATCH 137/310] Update chrGraphs.stats_figs.py

---
 scripts/chrGraphs.stats_figs.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/scripts/chrGraphs.stats_figs.py b/scripts/chrGraphs.stats_figs.py
index 8132405..17bc798 100644
--- a/scripts/chrGraphs.stats_figs.py
+++ b/scripts/chrGraphs.stats_figs.py
@@ -151,6 +151,7 @@ def get_group_decomp_fig(data, title, savedir):
 
     sns.despine()
     plt.savefig(savedir)
+    plt.close()
 
 # 2D scatter Core vs Private 
 def get_group_2d_fig(data, title, savedir):
@@ -207,6 +208,7 @@ def get_group_2d_fig(data, title, savedir):
 
     sns.despine()
     plt.savefig(savedir)
+    plt.close()
 
 # Shared content heatmap
 def get_hm_shared_fig(data, title, savedir):
@@ -239,6 +241,7 @@ def get_hm_shared_fig(data, title, savedir):
 
     sns.despine()
     plt.savefig(savedir)
+    plt.close()
 
 # Shared content difference between chromosomes heatmap
 def get_hm_diff_fig(data, title, savedir):
@@ -255,6 +258,7 @@ def get_hm_diff_fig(data, title, savedir):
 
     sns.despine()
     plt.savefig(savedir)
+    plt.close()
 
 #% Figure generation
 # Bar plots
@@ -288,10 +292,10 @@ for pangenome in gData.index.unique("Pangenome.name"):
     )
 
 # Shared content heatmap
-for pangenome in sTable.index.unique("Pangenome.name"):
-    for chrid in sTable.index.unique("Chr.id"):
+for pangenome in sData.index.unique("Pangenome.name"):
+    for chrid in sData.index.unique("Chr.id"):
         get_hm_shared_fig(
-            data = sTable.loc[pangenome, chrid,:].copy(), 
+            data = sData.loc[pangenome, chrid,:].copy(), 
             title = f"Shared content (%) - {pangenome} - {chrid}",
             savedir = os.path.join(args.dir, f"{pangenome}.sharred.content.{chrid}.png")
         )
-- 
GitLab
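
The plt.close() calls added after each plt.savefig() matter because the script builds one figure per chromosome in a loop: without closing, every figure stays registered with matplotlib until the process exits and memory use grows with the number of chromosomes. A minimal illustration of the save-then-close pattern; file names are illustrative:

    # Save each figure to disk, then release it so figures do not pile up.
    import matplotlib
    matplotlib.use("Agg")                  # headless backend for the sketch
    import matplotlib.pyplot as plt

    for chrid in ["chr01", "chr02", "chr03"]:        # illustrative chromosome ids
        fig, ax = plt.subplots(figsize=(4, 3), dpi=100)
        ax.plot([0, 1], [0, 1])
        ax.set_title(chrid)
        plt.savefig(f"demo.{chrid}.png")
        plt.close(fig)                               # frees the figure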


From 134b0da40e4c86b6b26862967e9bb47488d5a17d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 2 Sep 2024 23:48:39 +0200
Subject: [PATCH 138/310] Update chrGraphs.stats_figs.py

- Increased spacing between subplots in the figures
---
 scripts/chrGraphs.stats_figs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/chrGraphs.stats_figs.py b/scripts/chrGraphs.stats_figs.py
index 17bc798..8f93ceb 100644
--- a/scripts/chrGraphs.stats_figs.py
+++ b/scripts/chrGraphs.stats_figs.py
@@ -87,7 +87,7 @@ sData.loc[:, "Shared.length.mb"] = sData["Shared.length"]/1000000
 # Bar plots with path decomposition into Core, Private and Other proportion
 def get_group_decomp_fig(data, title, savedir):
     sns.set_style("ticks")
-    fig, (mbax, perax) = plt.subplots(1, 2, gridspec_kw={'wspace': 0.2, 'hspace':0}, figsize=(16, 9), dpi=300)
+    fig, (mbax, perax) = plt.subplots(1, 2, gridspec_kw={'wspace': 0.45, 'hspace':0}, figsize=(16, 10), dpi=300)
     
     # Adding bars for Mbax (in megabase)
     sns.barplot(data, y="Path.name", x="Path.length", color='#FCDC94', estimator=np.mean, errorbar=None, orient="h", ax=mbax)
@@ -213,7 +213,7 @@ def get_group_2d_fig(data, title, savedir):
 # Shared content heatmap
 def get_hm_shared_fig(data, title, savedir):
     sns.set_style("ticks")
-    fig, ((Rax, NRax), (Rax_bar, NRax_bar)) = plt.subplots(2, 2, gridspec_kw={'wspace': 0.3, 'hspace': 0.2, 'height_ratios':[4, 0.15]}, figsize=(16, 9), dpi=300)
+    fig, ((Rax, NRax), (Rax_bar, NRax_bar)) = plt.subplots(2, 2, gridspec_kw={'wspace': 0.5, 'hspace': 0.3, 'height_ratios':[4, 0.15]}, figsize=(16, 9), dpi=300)
 
     # Pivoting the dataframe for the heatmap
     data["Shared.length"] = data["Shared.length"]/1000000
-- 
GitLab
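
The wspace/hspace values raised above are GridSpec parameters: each is a fraction of the average subplot width or height reserved as blank space between panels, so moving wspace from 0.2 to 0.45 roughly doubles the horizontal gap between the Mbp panel and the percentage panel. A minimal sketch of how they are passed through plt.subplots; the values come from the patch, the titles are illustrative:

    # Two side-by-side panels with a wider horizontal gap between them.
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    fig, (left, right) = plt.subplots(
        1, 2,
        gridspec_kw={"wspace": 0.45, "hspace": 0},   # values from the patch
        figsize=(16, 10),
        dpi=100,
    )
    left.set_title("Path length (Mbp)")
    right.set_title("Path relative proportion (%)")
    plt.savefig("spacing_demo.png")
    plt.close(fig)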


From cb5ff607a3ca3272a5b2ac98fdd39164e9468f33 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 3 Sep 2024 09:58:50 +0200
Subject: [PATCH 139/310] Update VCF.stats_figs.py

---
 scripts/VCF.stats_figs.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/VCF.stats_figs.py b/scripts/VCF.stats_figs.py
index 3ef407b..a074f0c 100644
--- a/scripts/VCF.stats_figs.py
+++ b/scripts/VCF.stats_figs.py
@@ -13,6 +13,7 @@ import pandas as pd
 import matplotlib.pyplot as plt
 from matplotlib.lines import Line2D
 import seaborn as sns
+import argparse
 
 #% Parsing arguments
 arg_parser = argparse.ArgumentParser(description='VCF statistic figures for Pan1c workflow')
-- 
GitLab


From 1749190c327bc4fc9f1e287feb12fe54a186a052 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 3 Sep 2024 10:30:55 +0200
Subject: [PATCH 140/310] Update VCF.stats_figs.py

---
 scripts/VCF.stats_figs.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/VCF.stats_figs.py b/scripts/VCF.stats_figs.py
index a074f0c..c0dd862 100644
--- a/scripts/VCF.stats_figs.py
+++ b/scripts/VCF.stats_figs.py
@@ -300,12 +300,12 @@ def hist_genome(data, title, savedir):
 hist_general(
     vg, 
     f"Pan1c - {args.panname} - {REF} - VG", 
-    os.path.join(ars.output_dir, f"pan1c.{args.panname}.General.vcf.vg.png")
+    os.path.join(args.output_dir, f"pan1c.{args.panname}.General.vcf.vg.png")
 )
 hist_general(
     sr, 
     f"Pan1c - {args.panname} - {REF} - Syri", 
-    os.path.join(ars.output_dir, f"pan1c.{args.panname}.General.vcf.syri.png")
+    os.path.join(args.output_dir, f"pan1c.{args.panname}.General.vcf.syri.png")
 )
 
 # Genome specific histograms
@@ -325,5 +325,5 @@ for genome in sorted(vg["Genome"].unique()):
     hist_genome(
         data, 
         f"Pan1c - {args.panname} - {REF} - {genome}", 
-        os.path.join(ars.output_dir, f"pan1c.{args.panname}.{genome}.vcf.both.png")
+        os.path.join(args.output_dir, f"pan1c.{args.panname}.{genome}.vcf.both.png")
     )
\ No newline at end of file
-- 
GitLab


From 05d5db92694f56b7d0e394c6847c1a01b4f1004d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 3 Sep 2024 10:51:50 +0200
Subject: [PATCH 141/310] Update VCF.stats_figs.py

---
 scripts/VCF.stats_figs.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/VCF.stats_figs.py b/scripts/VCF.stats_figs.py
index c0dd862..e7fa687 100644
--- a/scripts/VCF.stats_figs.py
+++ b/scripts/VCF.stats_figs.py
@@ -300,12 +300,12 @@ def hist_genome(data, title, savedir):
 hist_general(
     vg, 
     f"Pan1c - {args.panname} - {REF} - VG", 
-    os.path.join(args.output_dir, f"pan1c.{args.panname}.General.vcf.vg.png")
+    os.path.join(args.dir, f"pan1c.{args.panname}.General.vcf.vg.png")
 )
 hist_general(
     sr, 
     f"Pan1c - {args.panname} - {REF} - Syri", 
-    os.path.join(args.output_dir, f"pan1c.{args.panname}.General.vcf.syri.png")
+    os.path.join(args.dir, f"pan1c.{args.panname}.General.vcf.syri.png")
 )
 
 # Genome specific histograms
@@ -325,5 +325,5 @@ for genome in sorted(vg["Genome"].unique()):
     hist_genome(
         data, 
         f"Pan1c - {args.panname} - {REF} - {genome}", 
-        os.path.join(args.output_dir, f"pan1c.{args.panname}.{genome}.vcf.both.png")
+        os.path.join(args.dir, f"pan1c.{args.panname}.{genome}.vcf.both.png")
     )
\ No newline at end of file
-- 
GitLab
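
The last two patches are successive passes over the same save paths: first the ars typo, then an attribute that apparently does not match the parser's (output_dir vs. dir). All the paths follow one naming pattern, sketched here with placeholder values:

    import os

    # Stand-ins for the parsed arguments used by the script (hypothetical values)
    class Args:
        panname = "mypangenome"
        dir = "output/vcf.figs"

    args = Args()

    def general_fig_path(tool):
        # e.g. output/vcf.figs/pan1c.mypangenome.General.vcf.vg.png
        return os.path.join(args.dir, f"pan1c.{args.panname}.General.vcf.{tool}.png")

    for tool in ("vg", "syri"):
        print(general_fig_path(tool))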


From 65a8aac14b57ab3b83f094f581deada458da775e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 3 Sep 2024 18:55:47 +0200
Subject: [PATCH 142/310] Update contig.pos_figs.R

---
 scripts/contig.pos_figs.R | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/scripts/contig.pos_figs.R b/scripts/contig.pos_figs.R
index 58d849f..6a46ee2 100644
--- a/scripts/contig.pos_figs.R
+++ b/scripts/contig.pos_figs.R
@@ -1,11 +1,9 @@
-"""
-Contig position figure script for Pan1c workflow
-
-Use contig position to produce a figure showing their position into respective chromosome
-
-@author: alexis.mergez@inrae.fr, adapted from Cedric Cabau
-@version: 1.0
-"""
+# Contig position figure script for Pan1c workflow
+#
+# Use contig position to produce a figure showing their position into respective chromosome
+#
+# @author: alexis.mergez@inrae.fr, adapted from Cedric Cabau
+# @version: 1.0
 
 library("karyoploteR")
 library("optparse")
-- 
GitLab


From ccc8c0d22511016e8850f1d320973e60a5f0e189 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 4 Sep 2024 09:51:44 +0200
Subject: [PATCH 143/310] Update chrGraphs.stats_figs.py

---
 scripts/chrGraphs.stats_figs.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/scripts/chrGraphs.stats_figs.py b/scripts/chrGraphs.stats_figs.py
index 8f93ceb..fb60273 100644
--- a/scripts/chrGraphs.stats_figs.py
+++ b/scripts/chrGraphs.stats_figs.py
@@ -312,8 +312,11 @@ for pangenome in sData.index.unique("Pangenome.name"):
             # Computing Euclid distance using Frobenious norm
             dData["Q"].append(Q)
             dData["T"].append(T)
-            dData["Diff"].append(np.linalg.norm(Qtable.values-Ttable.values, ord = 'fro'))
-            
+            try : # Catching non-similar shapes in case some paths are not available in both matrices
+                dData["Diff"].append(np.linalg.norm(Qtable.values-Ttable.values, ord = 'fro'))
+            except : 
+                dData["Diff"].append(np.nan)
+
     dData = pd.DataFrame.from_dict(dData).pivot(values=["Diff"], index=["Q"], columns=["T"])
     dData.columns = dData.columns.droplevel()
     
-- 
GitLab
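
The guard added above exists because the two pivoted shared-content matrices may not list the same paths, in which case the element-wise subtraction fails before the Frobenius norm is computed. A small sketch with toy matrices; aligning the frames on a common index, shown as an alternative, avoids the bare except entirely:

    import numpy as np
    import pandas as pd

    # Toy shared-content matrices for two chromosomes; the second lacks path "C"
    chr1 = pd.DataFrame(np.ones((3, 3)), index=list("ABC"), columns=list("ABC"))
    chr2 = pd.DataFrame(np.zeros((2, 2)), index=list("AB"), columns=list("AB"))

    # As in the patch: catch the shape mismatch and record NaN
    try:
        diff = np.linalg.norm(chr1.values - chr2.values, ord="fro")
    except ValueError:
        diff = np.nan

    # Alternative: align both matrices on the union of paths first
    paths = chr1.index.union(chr2.index)
    a = chr1.reindex(index=paths, columns=paths).fillna(0)
    b = chr2.reindex(index=paths, columns=paths).fillna(0)
    aligned_diff = np.linalg.norm(a.values - b.values, ord="fro")

    print(diff, aligned_diff)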


From 258ecb8c84ba3c4570615fca0fa599774cddaaac Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 4 Sep 2024 11:49:28 +0200
Subject: [PATCH 144/310] Update VCF.stats_figs.py

---
 scripts/VCF.stats_figs.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/VCF.stats_figs.py b/scripts/VCF.stats_figs.py
index e7fa687..e345a9f 100644
--- a/scripts/VCF.stats_figs.py
+++ b/scripts/VCF.stats_figs.py
@@ -89,7 +89,7 @@ def hist_general(data, title, savedir, hue_order=hue_order, colors=colors, cmap=
 
     LS = ['solid', 'dashed', 'dashdot', 'dotted']
     
-    fig, (delax, insax) = plt.subplots(1, 2, gridspec_kw={'wspace': 0}, figsize=(16, 9), dpi=300, sharey=True)
+    fig, (delax, insax) = plt.subplots(1, 2, gridspec_kw={'wspace': 0}, figsize=(18, 9), dpi=300, sharey=True)
     
     # Plotting deletions
     sns.histplot(
@@ -158,7 +158,7 @@ def hist_general(data, title, savedir, hue_order=hue_order, colors=colors, cmap=
     sns.despine(ax=insax, right=False)
     
     # Moving legend outside the figure
-    sns.move_legend(insax, 'best', bbox_to_anchor=(1.3, 1.05))
+    sns.move_legend(insax, "upper left", bbox_to_anchor=(1.3, 1.05))
    
     fig.suptitle(title, fontsize=15 , fontweight="bold")
 
@@ -189,7 +189,7 @@ def hist_genome(data, title, savedir):
 
     LS = ['solid', 'dashed', 'dashdot', 'dotted']
     
-    fig, (delax, insax) = plt.subplots(1, 2, gridspec_kw={'wspace': 0}, figsize=(16, 9), dpi=300, sharey=True)
+    fig, (delax, insax) = plt.subplots(1, 2, gridspec_kw={'wspace': 0}, figsize=(18, 9), dpi=300, sharey=True)
     
     # Plotting deletions
     sns.histplot(
@@ -286,7 +286,7 @@ def hist_genome(data, title, savedir):
     sns.despine(ax=insax, right=False)
     
     # Moving legend outside the figure
-    sns.move_legend(insax, 'best', bbox_to_anchor=(1.2, 1))
+    sns.move_legend(insax, "upper left", bbox_to_anchor=(1, 1))
   
     fig.suptitle(title, fontsize=15 , fontweight="bold")
 
-- 
GitLab
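
The legend tweaks above replace seaborn's automatic 'best' placement with an explicit anchor: sns.move_legend repositions an existing axes-level legend, and bbox_to_anchor pins the chosen loc corner in axes coordinates, so 'upper left' with (1, 1) places the legend just outside the top-right corner of the axes. A compact sketch on synthetic data:

    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    import pandas as pd
    import seaborn as sns

    # Synthetic stand-in for the SV length table
    data = pd.DataFrame({
        "LEN": [60, 120, 300, 75, 900, 1500, 80, 64],
        "HAP": ["hap#1", "hap#1", "hap#2", "hap#2", "hap#1", "hap#2", "hap#1", "hap#2"],
    })

    fig, ax = plt.subplots(figsize=(8, 5), dpi=100)
    sns.histplot(data=data, x="LEN", hue="HAP", log_scale=True, element="step", ax=ax)

    # Pin the legend's upper-left corner to the top-right corner of the axes
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

    fig.savefig("legend_demo.png", bbox_inches="tight")
    plt.close(fig)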


From a4d89ab6f8811e9b9b1f3b191039f7433e29f207 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 4 Sep 2024 11:51:24 +0200
Subject: [PATCH 145/310] Update VCF.stats_figs.py

---
 scripts/VCF.stats_figs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/VCF.stats_figs.py b/scripts/VCF.stats_figs.py
index e345a9f..025cfd4 100644
--- a/scripts/VCF.stats_figs.py
+++ b/scripts/VCF.stats_figs.py
@@ -158,7 +158,7 @@ def hist_general(data, title, savedir, hue_order=hue_order, colors=colors, cmap=
     sns.despine(ax=insax, right=False)
     
     # Moving legend outside the figure
-    sns.move_legend(insax, "upper left", bbox_to_anchor=(1.3, 1.05))
+    sns.move_legend(insax, "upper left", bbox_to_anchor=(1, 1))
    
     fig.suptitle(title, fontsize=15 , fontweight="bold")
 
-- 
GitLab


From ba3a22bbf2e559c7072f763bdcaa801cf286b92b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 4 Sep 2024 12:56:11 +0200
Subject: [PATCH 146/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 8b8c965..ad0e96c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -807,7 +807,7 @@ rule vcf_fig:
         vcf_fig=directory("output/vcf.figs")
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 20000
     params:
         app_path=config['app.path'],
         pan_name=config['name'],
-- 
GitLab


From 749774fe57dec699c0bae81e94cf3ca7a6072e53 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 4 Sep 2024 13:48:59 +0200
Subject: [PATCH 147/310] Fixed linestyle bug

---
 scripts/VCF.stats_figs.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/scripts/VCF.stats_figs.py b/scripts/VCF.stats_figs.py
index 025cfd4..a313878 100644
--- a/scripts/VCF.stats_figs.py
+++ b/scripts/VCF.stats_figs.py
@@ -173,6 +173,7 @@ def hist_genome(data, title, savedir):
     DEL = data.query("LEN < -50 & LEN > -100000").rename(columns={"LEN": "DEL"})
     DEL.loc[:,"DEL"] = -DEL["DEL"]
 
+    hue_order = sorted(data.index.unique("HAP"))
     genomes = sorted(data["Genome"].unique())
     genomes = [(tool, name) for name in genomes for tool in data.index.unique("TOOL")]
     col_values = sns.color_palette(n_colors=len(genomes)*2)
@@ -196,7 +197,7 @@ def hist_genome(data, title, savedir):
         data=DEL.query("TOOL == 'VG'"), 
         x="DEL", 
         hue="HAP", 
-        #hue_order=hue_order,
+        hue_order=hue_order,
         palette = cmap['VG'],
         binwidth=0.05, 
         log_scale=True, 
@@ -209,7 +210,7 @@ def hist_genome(data, title, savedir):
         data=DEL.query("TOOL == 'Syri'"), 
         x="DEL", 
         hue="HAP", 
-        #hue_order=hue_order,
+        hue_order=hue_order,
         palette = cmap['Syri'],
         binwidth=0.05, 
         log_scale=True, 
@@ -256,8 +257,8 @@ def hist_genome(data, title, savedir):
     
     # Changing linestyle based on haplotype id 
     for ax in [insax, delax]:
-        for line, hap in zip(ax.lines, hue_order):
-            # Appliquer le style selon le haplotype
+        for line, hap in zip(ax.lines, 2*hue_order):
+            # Applying style according to haplotype ID
             line.set_linestyle(LS[int(hap.split('#')[-1])-1])
 
     # Custom legend
-- 
GitLab
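
The bug fixed here is a zip pitfall: each axis ends up holding one line per haplotype from each of the two histplot calls (VG and Syri), so pairing ax.lines with a single copy of hue_order styles only the first half and leaves the second tool's lines untouched; doubling the list restores the one-to-one pairing. A reduced sketch of the styling loop on toy data (the haplotype labels are hypothetical):

    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    LS = ["solid", "dashed", "dashdot", "dotted"]
    hue_order = ["genomeA#1", "genomeA#2"]  # hypothetical haplotype names

    fig, ax = plt.subplots()
    # Two tools x two haplotypes -> four lines, in the same order as 2*hue_order
    for tool in ("VG", "Syri"):
        for hap in hue_order:
            ax.plot([1, 2, 3], [1, 2, 3], label=f"{tool} {hap}")

    # Style each line according to the haplotype id after the '#'
    for line, hap in zip(ax.lines, 2 * hue_order):
        line.set_linestyle(LS[int(hap.split("#")[-1]) - 1])

    fig.savefig("linestyle_demo.png")
    plt.close(fig)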


From e946d9a018ddc27bfac169fb9d7755b46770cb9b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 4 Sep 2024 14:04:42 +0200
Subject: [PATCH 148/310] Removed log-scale INS/DEL figures from the report

---
 Snakefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index ad0e96c..eadc573 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1096,8 +1096,7 @@ rule create_pan1c_report:
 
             # Adding back 'general' figures
             figures = [
-                f"pan1c.{config['name']}.{fig_type}.vcf.{tool}.png" 
-                for fig_type in ["General", "General_log"]
+                f"pan1c.{config['name']}.General.vcf.{tool}.png" 
                 for tool in ["vg", "syri"]
             ] + figures
 
-- 
GitLab
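
With the log-scale variants gone, the report only needs the linear 'General' figures, so the nested comprehension collapses to a single loop over the two VCF tools. The resulting naming convention, sketched with a placeholder pangenome name:

    name = "mypangenome"  # placeholder for config['name']

    figures = [
        f"pan1c.{name}.General.vcf.{tool}.png"
        for tool in ["vg", "syri"]
    ]
    # ['pan1c.mypangenome.General.vcf.vg.png', 'pan1c.mypangenome.General.vcf.syri.png']
    print(figures)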


From f4735e4cfd172e60356b89c2ce36f62a89a1f120 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 4 Sep 2024 18:09:31 +0200
Subject: [PATCH 149/310] Moved graph tagging to a dedicated rule

---
 Snakefile | 44 ++++++++++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/Snakefile b/Snakefile
index eadc573..a79c2e9 100644
--- a/Snakefile
+++ b/Snakefile
@@ -91,6 +91,7 @@ Rules   ------------------------------------------------------------------------
 rule all:
     input:
         "output/pan1c."+config['name']+".gfa.gz", # Final graph (main output)
+        "flags/pan1c."+config['name']+".done", # Tag
         "output/pan1c."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line)
         which_analysis()
 
@@ -472,8 +473,7 @@ rule gfaffix_on_chr:
 rule odgi_postprocessing:
     # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph
     input:
-        gfa_gz=rules.gfaffix_on_chr.output.gfa_gz,
-        tags="output/pan1c."+config['name']+".gfa.metadata"
+        gfa_gz=rules.gfaffix_on_chr.output.gfa_gz
     output:
         gfa_gz='data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz'
     threads: 8
@@ -531,18 +531,42 @@ rule odgi_postprocessing:
         ## Removing .og files for space savings
         rm $(dirname {input.gfa_gz})/*.og
 
-        ## Adding metadata
-        sed -i '/^H/r {input.tags}' $gfa_out
-
         ## Compressing
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
             -@ {threads} $gfa_out
         """
 
+rule graph_tagging:
+    # Add workflow metadata to a GFA
+    input:
+        tags="output/pan1c."+config['name']+".gfa.metadata",
+        gfa="{dir}/{graph}.gfa"
+    output:
+        gfa="{dir}/{graph}.gfa.gz",
+        flag=touch("flags/{graph}.done")
+    resources:
+        mem_mb = 8000
+    params:
+        app_path=config['app.path']
+    shell:
+        """
+        ## Removing any old tags
+        sed -i '/^#/d' {input.gfa}
+
+        ## Adding new tags
+        sed -i '/^H/r {input.tags}' {input.gfa}
+
+        ## Compressing
+        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
+            -@ {threads} {input.gfa}
+        """
+    
+
 rule generate_graph_list:
     # Generate a text file containing all created graphs
     input:
-        expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz', chromosome=CHRLIST)
+        gfas=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz', chromosome=CHRLIST),
+        tag_flags=expand('flags/'+config['name']+'.{chromosome}.done', chromosome=CHRLIST) 
     output:
         "data/chrGraphs/graphsList.txt"
     threads: 1
@@ -551,15 +575,14 @@ rule generate_graph_list:
     priority: 100
     run:
         with open(output[0], "w") as handle:
-            for file in input:
+            for file in input.gfas:
                 handle.write(file[:-3]+"\n")
 
 rule graph_squeeze:
     # Using odgi to merge every subgraphs into a final one
     input:
         glist="data/chrGraphs/graphsList.txt",
-        graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST),
-        tags="output/pan1c."+config['name']+".gfa.metadata"
+        graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST)
     output:
         gfa_gz="output/pan1c."+config['name']+".gfa.gz"
     log: 
@@ -585,9 +608,6 @@ rule graph_squeeze:
 
         rm $gfa_out.og
 
-        # Tagging
-        sed -i '/^H/r {input.tags}' $gfa_out
-
         # Compressing
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
             -@ {threads} $gfa_out
-- 
GitLab
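
The dedicated tagging rule boils down to two sed edits, drop any previous '#' metadata lines and replay the metadata file after the GFA 'H' header line, followed by bgzip compression. A rough Python equivalent of that text surgery, as a sketch of the idea only (the actual rule shells out to sed and bgzip, and a well-formed GFA is assumed to carry a single H line):

    def tag_gfa(gfa_path, tags_path, out_path):
        """Drop old '#' metadata lines and insert new ones right after each H line."""
        with open(tags_path) as fh:
            tags = fh.read().splitlines()
        out_lines = []
        with open(gfa_path) as fh:
            for line in fh:
                if line.startswith("#"):
                    continue                 # equivalent of: sed -i '/^#/d'
                out_lines.append(line.rstrip("\n"))
                if line.startswith("H"):
                    out_lines.extend(tags)   # equivalent of: sed -i '/^H/r tags'
        with open(out_path, "w") as fh:
            fh.write("\n".join(out_lines) + "\n")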


From 4da777918969e83a6d148931520e06821ff27608 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 5 Sep 2024 10:24:08 +0200
Subject: [PATCH 150/310] Update Snakefile

---
 Snakefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Snakefile b/Snakefile
index a79c2e9..0ac8d41 100644
--- a/Snakefile
+++ b/Snakefile
@@ -540,10 +540,10 @@ rule graph_tagging:
     # Add workflow metadata to a GFA
     input:
         tags="output/pan1c."+config['name']+".gfa.metadata",
-        gfa="{dir}/{graph}.gfa"
+        gfa="{graph}.gfa"
     output:
-        gfa="{dir}/{graph}.gfa.gz",
-        flag=touch("flags/{graph}.done")
+        gfa="{graph}.gfa.gz",
+        flag=touch(lambda wildcards: "flags/" + os.path.basename(wildcards.graph) + ".done")
     resources:
         mem_mb = 8000
     params:
-- 
GitLab


From e896c9933bc8012cd7ee4eca46bf11f9272c1d5d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 5 Sep 2024 10:26:26 +0200
Subject: [PATCH 151/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 0ac8d41..986ab29 100644
--- a/Snakefile
+++ b/Snakefile
@@ -543,7 +543,7 @@ rule graph_tagging:
         gfa="{graph}.gfa"
     output:
         gfa="{graph}.gfa.gz",
-        flag=touch(lambda wildcards: "flags/" + os.path.basename(wildcards.graph) + ".done")
+        flag=touch("flags/"+os.path.basename(wildcards.graph)+".done")
     resources:
         mem_mb = 8000
     params:
-- 
GitLab


From a7fea181527f21d51e5a91e131e52a3de3db8e3d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 5 Sep 2024 10:31:34 +0200
Subject: [PATCH 152/310] Update Snakefile

---
 Snakefile | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/Snakefile b/Snakefile
index 986ab29..43c150f 100644
--- a/Snakefile
+++ b/Snakefile
@@ -536,14 +536,14 @@ rule odgi_postprocessing:
             -@ {threads} $gfa_out
         """
 
-rule graph_tagging:
-    # Add workflow metadata to a GFA
+rule graph_tagging_chr_level:
+    # Add workflow metadata to chromosome level GFA
     input:
         tags="output/pan1c."+config['name']+".gfa.metadata",
-        gfa="{graph}.gfa"
+        gfa="data/chrGraphs/{graph}.gfa"
     output:
-        gfa="{graph}.gfa.gz",
-        flag=touch("flags/"+os.path.basename(wildcards.graph)+".done")
+        gfa="data/chrGraphs/{graph}.gfa.gz",
+        flag=touch("flags/{graph}.done")
     resources:
         mem_mb = 8000
     params:
@@ -613,6 +613,31 @@ rule graph_squeeze:
             -@ {threads} $gfa_out
         """
 
+rule graph_tagging_final:
+    # Add workflow metadata to chromosome level GFA
+    input:
+        tags="output/pan1c."+config['name']+".gfa.metadata",
+        gfa="output/{graph}.gfa"
+    output:
+        gfa="output/{graph}.gfa.gz",
+        flag=touch("flags/{graph}.done")
+    resources:
+        mem_mb = 8000
+    params:
+        app_path=config['app.path']
+    shell:
+        """
+        ## Removing any old tags
+        sed -i '/^#/d' {input.gfa}
+
+        ## Adding new tags
+        sed -i '/^H/r {input.tags}' {input.gfa}
+
+        ## Compressing
+        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
+            -@ {threads} {input.gfa}
+        """
+
 rule graph_stats:
     # Using GFAstats to produce stats on every chromosome graphs
     input:
-- 
GitLab


From 9cee6903ea8c73a5c2528ec5247023f826021563 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 5 Sep 2024 10:35:37 +0200
Subject: [PATCH 153/310] Update Snakefile

---
 Snakefile | 57 +++++++------------------------------------------------
 1 file changed, 7 insertions(+), 50 deletions(-)

diff --git a/Snakefile b/Snakefile
index 43c150f..56cd4f7 100644
--- a/Snakefile
+++ b/Snakefile
@@ -473,6 +473,7 @@ rule gfaffix_on_chr:
 rule odgi_postprocessing:
     # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph
     input:
+        tags="output/pan1c."+config['name']+".gfa.metadata",
         gfa_gz=rules.gfaffix_on_chr.output.gfa_gz
     output:
         gfa_gz='data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz'
@@ -531,36 +532,13 @@ rule odgi_postprocessing:
         ## Removing .og files for space savings
         rm $(dirname {input.gfa_gz})/*.og
 
-        ## Compressing
-        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} $gfa_out
-        """
-
-rule graph_tagging_chr_level:
-    # Add workflow metadata to chromosome level GFA
-    input:
-        tags="output/pan1c."+config['name']+".gfa.metadata",
-        gfa="data/chrGraphs/{graph}.gfa"
-    output:
-        gfa="data/chrGraphs/{graph}.gfa.gz",
-        flag=touch("flags/{graph}.done")
-    resources:
-        mem_mb = 8000
-    params:
-        app_path=config['app.path']
-    shell:
-        """
-        ## Removing any old tags
-        sed -i '/^#/d' {input.gfa}
-
         ## Adding new tags
-        sed -i '/^H/r {input.tags}' {input.gfa}
+        sed -i '/^H/r {input.tags}' $gfa_out
 
         ## Compressing
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} {input.gfa}
+            -@ {threads} $gfa_out
         """
-    
 
 rule generate_graph_list:
     # Generate a text file containing all created graphs
@@ -582,6 +560,7 @@ rule graph_squeeze:
     # Using odgi to merge every subgraphs into a final one
     input:
         glist="data/chrGraphs/graphsList.txt",
+        tags="output/pan1c."+config['name']+".gfa.metadata",
         graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST)
     output:
         gfa_gz="output/pan1c."+config['name']+".gfa.gz"
@@ -608,34 +587,12 @@ rule graph_squeeze:
 
         rm $gfa_out.og
 
-        # Compressing
-        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} $gfa_out
-        """
-
-rule graph_tagging_final:
-    # Add workflow metadata to chromosome level GFA
-    input:
-        tags="output/pan1c."+config['name']+".gfa.metadata",
-        gfa="output/{graph}.gfa"
-    output:
-        gfa="output/{graph}.gfa.gz",
-        flag=touch("flags/{graph}.done")
-    resources:
-        mem_mb = 8000
-    params:
-        app_path=config['app.path']
-    shell:
-        """
-        ## Removing any old tags
-        sed -i '/^#/d' {input.gfa}
-
         ## Adding new tags
-        sed -i '/^H/r {input.tags}' {input.gfa}
+        sed -i '/^H/r {input.tags}' $gfa_out
 
-        ## Compressing
+        # Compressing
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} {input.gfa}
+            -@ {threads} $gfa_out
         """
 
 rule graph_stats:
-- 
GitLab


From 24d621f8af80f850a5f9982585bce91b717cf531 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 5 Sep 2024 10:37:38 +0200
Subject: [PATCH 154/310] Update Snakefile

---
 Snakefile | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/Snakefile b/Snakefile
index 56cd4f7..0148db2 100644
--- a/Snakefile
+++ b/Snakefile
@@ -91,7 +91,6 @@ Rules   ------------------------------------------------------------------------
 rule all:
     input:
         "output/pan1c."+config['name']+".gfa.gz", # Final graph (main output)
-        "flags/pan1c."+config['name']+".done", # Tag
         "output/pan1c."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line)
         which_analysis()
 
@@ -543,8 +542,7 @@ rule odgi_postprocessing:
 rule generate_graph_list:
     # Generate a text file containing all created graphs
     input:
-        gfas=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz', chromosome=CHRLIST),
-        tag_flags=expand('flags/'+config['name']+'.{chromosome}.done', chromosome=CHRLIST) 
+        gfas=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz', chromosome=CHRLIST)
     output:
         "data/chrGraphs/graphsList.txt"
     threads: 1
-- 
GitLab


From 0e387ef74b0bde876ea5da4d6d70a05045a40125 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 5 Sep 2024 13:08:41 +0200
Subject: [PATCH 155/310] Update chrGraphs.stats_figs.py

Cosmetic changes: figure layout, colorbars, titles and tight bounding boxes
---
 scripts/chrGraphs.stats_figs.py | 66 ++++++++++++++++++++-------------
 1 file changed, 41 insertions(+), 25 deletions(-)

diff --git a/scripts/chrGraphs.stats_figs.py b/scripts/chrGraphs.stats_figs.py
index fb60273..1cc11f1 100644
--- a/scripts/chrGraphs.stats_figs.py
+++ b/scripts/chrGraphs.stats_figs.py
@@ -150,13 +150,13 @@ def get_group_decomp_fig(data, title, savedir):
     perax.set_ylabel(None)
 
     sns.despine()
-    plt.savefig(savedir)
+    plt.savefig(savedir, bbox_inches='tight')
     plt.close()
 
 # 2D scatter Core vs Private 
 def get_group_2d_fig(data, title, savedir):
     sns.set_style("ticks")
-    fig, (ax, ax_table) = plt.subplots(1, 2, gridspec_kw={'width_ratios': [3, 2], 'wspace': 0.05}, figsize=(16, 9), dpi=300)
+    fig, (ax, ax_table) = plt.subplots(1, 2, gridspec_kw={'width_ratios': [2.5, 2], 'wspace': 0.05, 'left':0.05, 'right':0.95}, figsize=(20, 9), dpi=300)
     
     # Clustering with DBSCAN
     X = data.loc[:, ["Path.private.R.per", "Path.core.R.per"]].to_numpy()
@@ -203,21 +203,21 @@ def get_group_2d_fig(data, title, savedir):
     
     # Adding/Modifying titles
     ax.set_title(title, fontsize=15 , fontweight="bold")
-    ax.set_xlabel('Private relative proportion (%)', loc='right', fontweight="bold")
-    ax.set_ylabel('Core relative proportion (%)', loc='top', fontweight="bold")
+    ax.set_xlabel('Relative Proportion of Private sequence (%)', loc='right', fontweight="bold", fontsize=11)
+    ax.set_ylabel('Relative Proportion of Core sequence (%)', loc='top', fontweight="bold", fontsize=11)
 
     sns.despine()
-    plt.savefig(savedir)
+    plt.savefig(savedir, bbox_inches='tight')
     plt.close()
 
 # Shared content heatmap
 def get_hm_shared_fig(data, title, savedir):
     sns.set_style("ticks")
-    fig, ((Rax, NRax), (Rax_bar, NRax_bar)) = plt.subplots(2, 2, gridspec_kw={'wspace': 0.5, 'hspace': 0.3, 'height_ratios':[4, 0.15]}, figsize=(16, 9), dpi=300)
+    fig, (Rax, NRax) = plt.subplots(1, 2, gridspec_kw={'wspace': 0.3, 'hspace': 0, 'top':1.1, 'bottom':0.01, "left":0.1, "right":1}, figsize=(20, 10), dpi=300)
 
     # Pivoting the dataframe for the heatmap
     data["Shared.length"] = data["Shared.length"]/1000000
-    srTable = data.reset_index().pivot(values=["Shared.R.prop"], index=["Query.name"], columns=["Target.name"]).fillna(100)
+    srTable = data.reset_index().pivot(values=["Shared.R.prop"], index=["Query.name"], columns=["Target.name"])
     snrTable = data.reset_index().pivot(values=["Shared.length"], index=["Query.name"], columns=["Target.name"])
     
     # Removing multi-index from columns
@@ -225,39 +225,55 @@ def get_hm_shared_fig(data, title, savedir):
     snrTable.columns = snrTable.columns.droplevel()
     
     # Heatmap
-    sns.heatmap(srTable, cmap="Spectral_r", square = True, ax=Rax, cbar=False)
-    fig.colorbar(Rax.collections[0], cax=Rax_bar, orientation='horizontal')
-    sns.heatmap(snrTable, cmap="Spectral_r", square = True, ax=NRax, mask=np.triu(snrTable), cbar=False)
-    fig.colorbar(NRax.collections[0], cax=NRax_bar, orientation='horizontal')
+    sns.heatmap(
+        srTable, 
+        cmap="Spectral_r", 
+        square = True, 
+        ax=Rax, 
+        cbar_kws=dict(shrink=0.6, label="Proportion relative to Query, with repeats (%)")
+    )
+    Rax.figure.axes[-1].yaxis.label.set(fontweight="bold", fontsize=13)
+    #fig.colorbar(Rax.collections[0], cax=Rax_bar, shrink=0.6)
+    sns.heatmap(
+        snrTable, 
+        cmap="Spectral_r", 
+        square = True, 
+        ax=NRax, 
+        mask=np.triu(snrTable), 
+        cbar_kws=dict(shrink=0.6, label="Size, without repeats (Mbp)")
+    )
+    NRax.figure.axes[-1].yaxis.label.set(fontweight="bold", fontsize=13)
+    #fig.colorbar(NRax.collections[0], cax=NRax_bar, shrink=0.6)
     
     # Adding/Modifying titles
-    fig.suptitle(title, fontsize=15 , fontweight="bold")
-    Rax.set_title("Relative proportion of shared content (%)", fontsize=13 , fontweight="bold")
-    NRax.set_title("Node size of shared content (Mbp)", fontsize=13 , fontweight="bold")
+    fig.suptitle(title, fontsize=18 , fontweight="bold")
+    Rax.set_title("Shared Sequence Percentages", fontsize=13 , fontweight="bold")
+    NRax.set_title("Shared Sequence Size", fontsize=13 , fontweight="bold")
     Rax.set_xlabel(None)
     NRax.set_xlabel(None)
     Rax.set_ylabel('Query', fontweight="bold")
     NRax.set_ylabel(None)
 
     sns.despine()
-    plt.savefig(savedir)
+    plt.savefig(savedir, bbox_inches='tight')
     plt.close()
 
 # Shared content difference between chromosomes heatmap
 def get_hm_diff_fig(data, title, savedir):
     sns.set_style("ticks")
-    fig, ax = plt.subplots(1, 1, figsize=(10, 9), dpi=300)
+    fig, ax = plt.subplots(1, 1, figsize=(10, 9), gridspec_kw={'wspace': 0.2, 'hspace': 0, 'top':0.98, 'left':0.05, 'right':1, 'bottom':0.01}, dpi=300)
     
     # Heatmap
-    sns.heatmap(data, cmap="hot_r", square = True, ax=ax, mask=np.triu(data))
+    sns.heatmap(data, cmap="hot_r", square = True, ax=ax, mask=np.triu(data), cbar_kws=dict(shrink=0.8, label='Euclidean distance'))
+    ax.figure.axes[-1].yaxis.label.set(fontweight="bold", fontsize=13)
     
     # Adding/Modifying titles
-    fig.suptitle(title, fontsize=15 , fontweight="bold")
+    fig.suptitle(title, fontsize=18 , fontweight="bold")
     ax.set_xlabel(None)
-    ax.set_ylabel('Query', fontweight="bold")
+    ax.set_ylabel(None)
 
     sns.despine()
-    plt.savefig(savedir)
+    plt.savefig(savedir, bbox_inches='tight')
     plt.close()
 
 #% Figure generation
@@ -273,7 +289,7 @@ for pangenome in gData.index.unique("Pangenome.name"):
     ## For the mean across chromosomes 
     get_group_decomp_fig(
         data = gData.groupby("Path.name").sum().copy(), 
-        title = "Haplotypes mean composition accross chromosomes",
+        title = "Path composition mean across chromosomes",
         savedir = os.path.join(args.dir, f"{pangenome}.path.decomp.mean.png")
     )
 
@@ -282,12 +298,12 @@ for pangenome in gData.index.unique("Pangenome.name"):
     for chrid in gData.index.unique("Chr.id"):
         get_group_2d_fig(
             data = gData.loc[pangenome, chrid,:].copy(), 
-            title = f"Path composition by groups - {pangenome} - {chrid}",
+            title = f"Private vs. Core Sequence for Each Graph Path - {pangenome} - {chrid}",
             savedir = os.path.join(args.dir, f"{pangenome}.2D.scatter.{chrid}.png")
         )
     get_group_2d_fig(
         data = gData.groupby("Path.name").mean().copy(), 
-        title = "Haplotypes mean composition accross chromosomes",
+        title = "Mean Private vs. Core Sequence for Each Graph Path",
         savedir = os.path.join(args.dir, f"{pangenome}.2D.scatter.mean.png")
     )
 
@@ -296,7 +312,7 @@ for pangenome in sData.index.unique("Pangenome.name"):
     for chrid in sData.index.unique("Chr.id"):
         get_hm_shared_fig(
             data = sData.loc[pangenome, chrid,:].copy(), 
-            title = f"Shared content (%) - {pangenome} - {chrid}",
+            title = f"Pairwise Path Comparison - {pangenome} - {chrid}",
             savedir = os.path.join(args.dir, f"{pangenome}.sharred.content.{chrid}.png")
         )
 
@@ -322,6 +338,6 @@ for pangenome in sData.index.unique("Pangenome.name"):
     
     get_hm_diff_fig(
         dData, 
-        title = f"Matrix distance between chromosomes - {pangenome}",
+        title = f"Pairwise Euclidean distance between path comparison matrices - {pangenome}",
         savedir = os.path.join(args.dir, f"{pangenome}.sharred.content.diff.png")
     )
\ No newline at end of file
-- 
GitLab
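
Most of the cosmetic work above replaces the hand-built colorbar axes with seaborn's own colorbar, configured through cbar_kws, and switches to bbox_inches='tight' so the enlarged titles and labels are not clipped. A compact heatmap sketch along those lines, on a toy matrix:

    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import seaborn as sns

    # Toy pairwise shared-content matrix
    rng = np.random.default_rng(0)
    paths = ["pathA", "pathB", "pathC", "pathD"]
    table = pd.DataFrame(rng.uniform(50, 100, size=(4, 4)), index=paths, columns=paths)

    fig, ax = plt.subplots(figsize=(8, 7), dpi=100)
    sns.heatmap(
        table,
        cmap="Spectral_r",
        square=True,
        ax=ax,
        mask=np.triu(table),  # hide the redundant upper triangle
        cbar_kws=dict(shrink=0.6, label="Proportion relative to Query (%)"),
    )
    # The colorbar is the last axes added to the figure; style its label
    ax.figure.axes[-1].yaxis.label.set(fontweight="bold", fontsize=13)

    fig.savefig("heatmap_demo.png", bbox_inches="tight")
    plt.close(fig)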


From ac966656b40ca38df36f9364afa572982dc9c392 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 5 Sep 2024 18:33:05 +0200
Subject: [PATCH 156/310] Removed old R VCF analysis scripts

---
 Snakefile                |   2 +
 config.yaml              |   2 +-
 scripts/analyze_VCF.R    | 199 -----------------------------
 scripts/analyze_VCF_v2.R | 264 ---------------------------------------
 4 files changed, 3 insertions(+), 464 deletions(-)
 delete mode 100644 scripts/analyze_VCF.R
 delete mode 100644 scripts/analyze_VCF_v2.R

diff --git a/Snakefile b/Snakefile
index 0148db2..fadbd6e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -381,6 +381,7 @@ rule wfmash_on_chr:
             -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \
             --tmp-base $(dirname {output.aln}) \
             {input.fa} --approx-map \
+            -X \
             1> {output.mapping} \
             2> >(tee {log.cmd_map} >&2)
 
@@ -391,6 +392,7 @@ rule wfmash_on_chr:
             -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \
             --tmp-base $(dirname {output.aln}) \
             {input.fa} -i {output.mapping} \
+            -X \
             1> {output.aln} \
             2> >(tee {log.cmd_aln} >&2)
 
diff --git a/config.yaml b/config.yaml
index 06c5ab2..baf408c 100644
--- a/config.yaml
+++ b/config.yaml
@@ -18,7 +18,7 @@ ragtag_mm2_conf: '-x asm5'
 # Wfmash alignement parameters :
 wfmash.segment_length: 10000
 wfmash.mapping_id: 95
-wfmash.secondary: '-k 19 -H 0.001 -X'
+wfmash.secondary: '-k 19 -H 0.001'
 
 # Seqwish parameters
 seqwish.params: '-B 10000000 -k 19 -f 0'
diff --git a/scripts/analyze_VCF.R b/scripts/analyze_VCF.R
deleted file mode 100644
index 40886f9..0000000
--- a/scripts/analyze_VCF.R
+++ /dev/null
@@ -1,199 +0,0 @@
-# Script to plot the INS/DEL length from a VCF produced by 'VG deconstruct'
-# 
-# @author: alexis.mergez@inrae.fr
-# @version: 1.1
-
-library(optparse)
-
-#% Parsing arguments
-option_list = list(
-    make_option(c("-b", "--binwidth"), type="double", default=0.02, 
-        help="Bin width", metavar="double"),
-    make_option(c("-t", "--tsv"), type="character", default=NULL, 
-        help=".tsv input", metavar="character"),
-    make_option(c("-o", "--out"), type="character", default=NULL, 
-        help="output directory", metavar="character"),
-    make_option(c("-p", "--panname"), type="character", default=NULL, 
-        help="pangenome name", metavar="character"),
-    make_option(c("-T", "--tool"), type="character", default=NULL, 
-        help="VCF creation tool (VG, SyRI, ...)", metavar="character"),
-    make_option(c("-W", "--width"), type="integer", default=18, 
-        help="Figure width", metavar="integer"),
-    make_option(c("-H", "--height"), type="integer", default=6, 
-        help="Figure height", metavar="integer")
-);
-
-opt_parser = OptionParser(option_list=option_list);
-opt = parse_args(opt_parser);
-
-## Accesing arguments with opt$<arg>. For example : opt$bands, opt$tsv, ...
-
-library(ggplot2)
-library(tidyverse)
-library(gridExtra)
-
-#% Parsing TSV file
-write("[analyze_VCF] Parsing TSV ...", stdout())
-x <- read.delim(opt$tsv)
-
-sample = str_split_1(x$CHROM[1], "#")[1]
-x[c("HAPNAME", "HAPID")] = str_split_fixed(x$HAP, "#", 2)
-
-#% Filtering too long and too short INS/DEL, splitting data into 2 dataframe by type 
-write("[analyze_VCF] Filtering ...", stdout())
-INS = x[which(x$LEN >= -100000 & x$LEN <= -50), ]
-INS$LEN = -INS$LEN
-
-DEL = x[which(x$LEN <= 100000 & x$LEN >= 50), ]
-
-
-#% Passing to LOG for scale reasons
-INS$LOGLEN = log10(INS$LEN)
-DEL$LOGLEN = -log10(DEL$LEN)
-
-#% Figures section
-title_text = element_text(face="bold", size = 12)
-colours = c("#78ABA8", "#C8CFA0", "#FCDC94", "#EF9C66")
-
-#% Function to retrieve the legend from a plot as a dedicated plot (used in the multiplot command)
-get_legend<-function(myggplot){
-  tmp <- ggplot_gtable(ggplot_build(myggplot))
-  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
-  legend <- tmp$grobs[[leg]]
-  return(legend)
-}
-
-#% Creating Bins from the log of the length
-write("[analyze_VCF] Binning ...", stdout())
-INS = INS %>% mutate(
-    Bin = cut(
-        LOGLEN, 
-        breaks = seq(0, log10(100000), 
-        by = opt$binwidth), 
-        include.lowest = TRUE, 
-        right = FALSE, 
-        ordered_result = TRUE, 
-        dig.lab=3)
-    )
-DEL = DEL %>% mutate(
-    Bin = cut(
-        LOGLEN, 
-        breaks = seq(-log10(100000)-opt$binwidth, 0, by = opt$binwidth), 
-        include.lowest = TRUE, 
-        right = FALSE, 
-        ordered_result = TRUE, 
-        dig.lab=3)
-    )
-
-#% Summerizing the dataframe by haplotypes
-write("[analyze_VCF] Summerizing ...", stdout())
-INS_F = INS %>% 
-	group_by(HAP, Bin) %>%
-	summarise(
-		Count = n(),
-		.groups = 'drop'
-	) %>%
-	mutate(x = factor(parse_number(as.character(Bin), locale = locale(decimal_mark = ".", grouping_mark = ""))))
-INS_F[c("HAPNAME", "HAPID")] = str_split_fixed(INS_F$HAP, "#", 2)
-
-DEL_F = DEL %>% 
-	group_by(HAP, Bin) %>%
-	summarise(
-		Count = n(),
-		.groups = 'drop'
-	) %>%
-	mutate(x = factor(parse_number(as.character(Bin), locale = locale(decimal_mark = ".", grouping_mark = ""))))
-DEL_F[c("HAPNAME", "HAPID")] = str_split_fixed(DEL_F$HAP, "#", 2)
-#% Creating the general graph, in log and non-log version
-
-## General function
-get_fig_log = function(INS_F, DEL_F, top_name, tool_name){
-    # Insertion figure
-    figA = ggplot(INS_F, aes(x=x)) +
-	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
-	scale_y_continuous(trans='log10', position = "right") +
-    expand_limits(x=c(0)) +
-	scale_x_discrete(
-		breaks=seq(0,5),
-		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
-	xlab("INS") +
-	theme_bw() +
-	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
-
-    # Deletion figure
-    figB = ggplot(DEL_F, aes(x=x)) +
-	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
-	scale_y_continuous(trans='log10') +
-    expand_limits(x=c(0)) +
-	scale_x_discrete(
-		breaks=-seq(0,5),
-		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
-	xlab("DEL") +
-	theme_bw() +
-	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
-
-    legend = get_legend(figA)
-    figA = figA + theme(legend.position = "none")
-
-    ## Combining the plots and the legend
-    figF = grid.arrange(figB, figA, legend, nrow = 1, top=paste0(c("Pan1c - ",top_name," - ",sample," - ",tool_name), collapse=""), widths=c(2.3, 2.3, 0.8))
-    return(figF)
-}
-
-get_fig = function(INS_F, DEL_F, top_name, tool_name){
-    # Insertion figure
-    figA = ggplot(INS_F, aes(x=x)) +
-	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
-	scale_y_continuous(position = "right") +
-    expand_limits(x=c(0)) +
-	scale_x_discrete(
-		breaks=seq(0,5),
-		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
-	xlab("INS") +
-	theme_bw() +
-	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
-
-    # Deletion figure
-    figB = ggplot(DEL_F, aes(x=x)) +
-	geom_line(aes(y=Count, group=HAP, color=HAP, linetype=HAPID)) +
-    expand_limits(x=c(0)) +
-	scale_x_discrete(
-		breaks=-seq(0,5),
-		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
-	xlab("DEL") +
-	theme_bw() +
-	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
-
-    legend = get_legend(figA)
-    figA = figA + theme(legend.position = "none")
-
-    ## Combining the plots and the legend
-    figF = grid.arrange(figB, figA, legend, nrow = 1, top=paste0(c("Pan1c - ",top_name," - ",sample," - ",tool_name), collapse=""), widths=c(2.3, 2.3, 0.8))
-    return(figF)
-}
-
-#% Log version
-write("[analyze_VCF] Creating general graph (Log version) ...", stdout())
-FIG = get_fig_log(INS_F, DEL_F, opt$panname, opt$tool)
-sub_name = paste0(c("pan1c",opt$panname,"General_log","vcf","png"), collapse=".")
-ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
-
-#% Non log version
-write("[analyze_VCF] Creating general graph (Non-log version) ...", stdout())
-FIG = get_fig(INS_F, DEL_F, opt$panname, opt$tool)
-sub_name = paste0(c("pan1c",opt$panname,"General","vcf","png"), collapse=".")
-ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
-
-#% Individual haplotypes plots
-write("[analyze_VCF] Creating haplotype graphs  ...", stdout())
-for (hapname in unique(x$HAPNAME)){
-    # Getting the haps 
-    hapids = unique(x[x$HAPNAME == hapname,]$HAPID)
-    haps = paste(hapname, hapids, sep="#")
-    FIG = get_fig(INS_F[INS_F$HAP %in% haps,], DEL_F[DEL_F$HAP %in% haps,], paste0(c(opt$panname," - ",hapname), collapse=''), opt$tool)
-    sub_name = paste0(c("pan1c",opt$panname,hapname,"vcf","png"), collapse=".")
-    ggsave(paste0(c(opt$out, sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
-}
-
-write("[analyze_VCF] Done !", stdout())
-warnings()
\ No newline at end of file
diff --git a/scripts/analyze_VCF_v2.R b/scripts/analyze_VCF_v2.R
deleted file mode 100644
index 5a66710..0000000
--- a/scripts/analyze_VCF_v2.R
+++ /dev/null
@@ -1,264 +0,0 @@
-# Script to plot the INS/DEL length from a VCF produced by 'VG deconstruct'
-# 
-# @author: alexis.mergez@inrae.fr
-# @version: 1.2
-
-library(optparse)
-
-#% Parsing arguments
-option_list = list(
-    make_option(c("-b", "--binwidth"), type="double", default=0.02, 
-        help="Bin width", metavar="double"),
-    make_option(c("-v", "--vg"), type="character", default=NULL, 
-        help="vg TSV", metavar="character"),
-    make_option(c("-s", "--syri"), type="character", default=NULL, 
-        help="SyRi TSV", metavar="character"),
-    make_option(c("-o", "--out"), type="character", default=NULL, 
-        help="output directory", metavar="character"),
-    make_option(c("-p", "--panname"), type="character", default=NULL, 
-        help="pangenome name", metavar="character"),
-    make_option(c("-W", "--width"), type="integer", default=18, 
-        help="Figure width", metavar="integer"),
-    make_option(c("-H", "--height"), type="integer", default=6, 
-        help="Figure height", metavar="integer")
-);
-
-opt_parser = OptionParser(option_list=option_list);
-opt = parse_args(opt_parser);
-
-## Accesing arguments with opt$<arg>. For example : opt$bands, opt$tsv, ...
-
-library(ggplot2)
-library(tidyverse)
-library(gridExtra)
-
-#% Parsing TSV file
-write("[analyze_VCF] Parsing TSV ...", stdout())
-vg <- read.delim(opt$vg)
-syri <- read.delim(opt$syri)
-
-sample = str_split_1(vg$CHROM[1], "#")[1]
-vg[c("HAPNAME", "HAPID")] = str_split_fixed(vg$HAP, "#", 2)
-syri[c("HAPNAME", "HAPID")] = str_split_fixed(syri$HAP, "#", 2)
-
-#% Prepare data
-get_data<-function(data){
-    #% Filtering too long and too short INS/DEL, splitting data into 2 dataframe by type
-    INS = data[which(data$LEN >= -100000 & data$LEN <= -50), ]
-    INS$LEN = -INS$LEN
-
-    DEL = data[which(data$LEN <= 100000 & data$LEN >= 50), ]
-
-    #% Passing to LOG for scale reasons
-    INS$LOGLEN = log10(INS$LEN)
-    DEL$LOGLEN = -log10(DEL$LEN)
-
-    #% Creating Bins from the log of the length
-    INS = INS %>% mutate(
-        Bin = cut(
-            LOGLEN, 
-            breaks = seq(0, log10(100000), 
-            by = opt$binwidth), 
-            include.lowest = TRUE, 
-            right = FALSE, 
-            ordered_result = TRUE, 
-            dig.lab=3)
-        )
-    DEL = DEL %>% mutate(
-        Bin = cut(
-            LOGLEN, 
-            breaks = seq(-log10(100000)-opt$binwidth, 0, by = opt$binwidth), 
-            include.lowest = TRUE, 
-            right = FALSE, 
-            ordered_result = TRUE, 
-            dig.lab=3)
-        )
-
-    #% Summerizing the dataframe by haplotypes
-    INS_F = INS %>% 
-        group_by(HAP, Bin) %>%
-        summarise(
-            Count = n(),
-            .groups = 'drop'
-        ) %>%
-        mutate(x = factor(parse_number(as.character(Bin), locale = locale(decimal_mark = ".", grouping_mark = ""))))
-    INS_F[c("HAPNAME", "HAPID")] = str_split_fixed(INS_F$HAP, "#", 2)
-
-    DEL_F = DEL %>% 
-        group_by(HAP, Bin) %>%
-        summarise(
-            Count = n(),
-            .groups = 'drop'
-        ) %>%
-        mutate(x = factor(parse_number(as.character(Bin), locale = locale(decimal_mark = ".", grouping_mark = ""))))
-    DEL_F[c("HAPNAME", "HAPID")] = str_split_fixed(DEL_F$HAP, "#", 2)
-
-    return(list(INS = INS_F, DEL = DEL_F))
-}
-tmp = get_data(vg)
-vINS_F = tmp$INS
-vDEL_F = tmp$DEL
-tmp = get_data(syri)
-sINS_F = tmp$INS
-sDEL_F = tmp$DEL
-
-#% Figures section
-title_text = element_text(face="bold", size = 12)
-colours = c("#78ABA8", "#C8CFA0", "#FCDC94", "#EF9C66")
-
-#% Function to retrieve the legend from a plot as a dedicated plot (used in the multiplot command)
-get_legend<-function(myggplot){
-  tmp <- ggplot_gtable(ggplot_build(myggplot))
-  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
-  legend <- tmp$grobs[[leg]]
-  return(legend)
-}
-
-
-#% Creating the general graph, in log and non-log version
-
-## General function
-get_fig_log = function(INS_F, DEL_F, top_name, tool_name){
-    # Insertion figure
-    figA = ggplot(INS_F, aes(x=x)) +
-	geom_line(aes(y=Count, group=interaction(HAP, HAPID), color=HAP, linetype=HAPID)) +
-	scale_y_continuous(trans='log10', position = "right") +
-    expand_limits(x=c(0)) +
-	scale_x_discrete(
-		breaks=seq(0,5),
-		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
-	xlab("INS") +
-	theme_bw() +
-	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
-
-    # Deletion figure
-    figB = ggplot(DEL_F, aes(x=x)) +
-	geom_line(aes(y=Count, group=interaction(HAP, HAPID), color=HAP, linetype=HAPID)) +
-	scale_y_continuous(trans='log10') +
-    expand_limits(x=c(0)) +
-	scale_x_discrete(
-		breaks=-seq(0,5),
-		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
-	xlab("DEL") +
-	theme_bw() +
-	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
-
-    legend = get_legend(figA)
-    figA = figA + theme(legend.position = "none")
-
-    ## Combining the plots and the legend
-    figF = grid.arrange(figB, figA, legend, nrow = 1, top=paste0(c("Pan1c - ",top_name," - ",sample," - ",tool_name), collapse=""), widths=c(2.3, 2.3, 0.8))
-    return(figF)
-}
-
-get_fig = function(INS_F, DEL_F, top_name, tool_name){
-    # Insertion figure
-    figA = ggplot(INS_F, aes(x=x)) +
-	geom_line(aes(y=Count, group=interaction(HAP, HAPID), color=HAP, linetype=HAPID)) +
-	scale_y_continuous(position = "right") +
-    expand_limits(x=c(0)) +
-	scale_x_discrete(
-		breaks=seq(0,5),
-		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
-	xlab("INS") +
-	theme_bw() +
-	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
-
-    # Deletion figure
-    figB = ggplot(DEL_F, aes(x=x)) +
-	geom_line(aes(y=Count, group=interaction(HAP, HAPID), color=HAP, linetype=HAPID)) +
-    expand_limits(x=c(0)) +
-	scale_x_discrete(
-		breaks=-seq(0,5),
-		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
-	xlab("DEL") +
-	theme_bw() +
-	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
-
-    legend = get_legend(figA)
-    figA = figA + theme(legend.position = "none")
-
-    ## Combining the plots and the legend
-    figF = grid.arrange(figB, figA, legend, nrow = 1, top=paste0(c("Pan1c - ",top_name," - ",sample," - ",tool_name), collapse=""), widths=c(2.3, 2.3, 0.8))
-    return(figF)
-}
-
-get_fig_single = function(vINS_F, vDEL_F, sINS_F, sDEL_F, top_name){
-    vINS_F$TOOL = rep("VG", nrow(vINS_F))
-    sINS_F$TOOL = rep("SyRI", nrow(sINS_F))
-    vDEL_F$TOOL = rep("VG", nrow(vDEL_F))
-    sDEL_F$TOOL = rep("SyRI", nrow(sDEL_F))
-
-    # Insertion figure
-    figA = ggplot(vINS_F, aes(x=x, y=Count, group=interaction(TOOL, HAPID), color=TOOL, linetype=HAPID)) +
-	geom_line() +
-    geom_line(data = sINS_F) +
-	scale_y_continuous(position = "right") +
-    expand_limits(x=c(0)) +
-	scale_x_discrete(
-		breaks=seq(0,5),
-		labels=c("","10bp","100bp","1kb","10kb","100kb")) +
-	xlab("INS") +
-	theme_bw() +
-	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), axis.title = title_text) 
-
-    # Deletion figure
-    figB = ggplot(vDEL_F, aes(x=x, y=Count, group=interaction(TOOL, HAPID), color=TOOL, linetype=HAPID)) +
-    geom_line() +
-	geom_line(data = sDEL_F) +
-    expand_limits(x=c(0)) +
-	scale_x_discrete(
-		breaks=-seq(0,5),
-		labels=c("","-10bp","-100bp","-1kb","-10kb","-100kb")) +
-	xlab("DEL") +
-	theme_bw() +
-	theme(plot.margin = unit(c(0.2, 0, 0, 0), "inches"), title = title_text, legend.position = "none")
-
-    legend = get_legend(figA)
-    figA = figA + theme(legend.position = "none")
-
-    ## Combining the plots and the legend
-    figF = grid.arrange(figB, figA, legend, nrow = 1, top=paste0(c("Pan1c - ",top_name," - ",sample), collapse=""), widths=c(2.3, 2.3, 0.8))
-    return(figF)
-}
-
-#% Log version
-write("[analyze_VCF] Creating vg general graph (Log version) ...", stdout())
-FIG = get_fig_log(vINS_F, vDEL_F, opt$panname, "VG")
-sub_name = paste0(c("pan1c",opt$panname,"General_log","vcf","vg","png"), collapse=".")
-ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
-
-write("[analyze_VCF] Creating SyRI general graph (Log version) ...", stdout())
-FIG = get_fig_log(sINS_F, sDEL_F, opt$panname, "SyRI")
-sub_name = paste0(c("pan1c",opt$panname,"General_log","vcf","syri","png"), collapse=".")
-ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
-
-#% Non log version
-write("[analyze_VCF] Creating vg general graph (Non-log version) ...", stdout())
-FIG = get_fig(vINS_F, vDEL_F, opt$panname, "VG")
-sub_name = paste0(c("pan1c",opt$panname,"General","vcf","vg","png"), collapse=".")
-ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
-
-write("[analyze_VCF] Creating SyRI general graph (Non-log version) ...", stdout())
-FIG = get_fig(sINS_F, sDEL_F, opt$panname, "SyRI")
-sub_name = paste0(c("pan1c",opt$panname,"General","vcf","syri","png"), collapse=".")
-ggsave(paste0(c(opt$out,sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
-
-#% Individual haplotypes plots
-write("[analyze_VCF] Creating haplotype graphs  ...", stdout())
-for (hapname in unique(vg$HAPNAME)){
-    # Getting the haps 
-    hapids = unique(vg[vg$HAPNAME == hapname,]$HAPID)
-    haps = paste(hapname, hapids, sep="#")
-    FIG = get_fig_single(
-        vINS_F[vINS_F$HAP %in% haps,], 
-        vDEL_F[vDEL_F$HAP %in% haps,], 
-        sINS_F[sINS_F$HAP %in% haps,], 
-        sDEL_F[sDEL_F$HAP %in% haps,], 
-        paste0(c(opt$panname," - ",hapname), collapse='')
-    )
-    sub_name = paste0(c("pan1c",opt$panname,hapname,"vcf","both","png"), collapse=".")
-    ggsave(paste0(c(opt$out, sub_name), collapse="/"), plot=FIG, width=opt$width, height=opt$height)
-}
-
-write("[analyze_VCF] Done !", stdout())
\ No newline at end of file
-- 
GitLab
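
For the record, the two deleted R scripts implemented the same INS/DEL length histograms that the Python VCF.stats_figs.py now produces: keep variants with 50 bp <= |LEN| <= 100 kb, work on log10 lengths, bin them with a fixed bin width and count occurrences per haplotype. A condensed pandas sketch of that logic on a toy table (column names as in the scripts; which sign maps to INS or DEL depends on the caller's convention):

    import numpy as np
    import pandas as pd

    binwidth = 0.02

    # Toy variant table with the columns used by the removed scripts
    x = pd.DataFrame({
        "LEN": [-60, -2500, 55, 80000, -30, 200000, 150],
        "HAP": ["g1#1", "g1#2", "g1#1", "g1#2", "g1#1", "g1#1", "g1#2"],
    })

    # Split by sign and keep only 50 bp to 100 kb events
    neg = x.query("LEN <= -50 & LEN >= -100000").assign(LEN=lambda d: -d["LEN"])
    pos = x.query("LEN >= 50 & LEN <= 100000")

    def bin_counts(df):
        loglen = np.log10(df["LEN"])
        bins = np.arange(0, np.log10(100000) + binwidth, binwidth)
        binned = pd.cut(loglen, bins=bins, right=False)
        return df.assign(Bin=binned).groupby(["HAP", "Bin"], observed=True).size()

    print(bin_counts(neg))
    print(bin_counts(pos))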


From 38784e864fb39d6dbd11a37ee6e1f4cb20825cae Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 6 Sep 2024 14:40:59 +0200
Subject: [PATCH 157/310] Preparing for wfmash Syri figures

---
 Snakefile                                    | 24 ++++++++++----------
 scripts/{getSyriFigs.sh => Syri.figs_mm2.sh} |  2 +-
 2 files changed, 13 insertions(+), 13 deletions(-)
 rename scripts/{getSyriFigs.sh => Syri.figs_mm2.sh} (98%)

diff --git a/Snakefile b/Snakefile
index fadbd6e..16db5a8 100644
--- a/Snakefile
+++ b/Snakefile
@@ -53,7 +53,7 @@ def which_analysis():
 
     if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each input assembly 
         analysis_inputs.append(
-            expand("output/asm.syri.figs/"+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF)
+            expand("output/asm.syri.figs/"+config['name']+".{haplotype}.syri.mm2.png", haplotype=SAMPLES_NOREF)
         )
     if config["get_chrInputs_SyRI"] == "True": # Creating SyRI figures for each PGGB input
         analysis_inputs.append(
@@ -255,17 +255,17 @@ rule SyRI_on_ASM_mm2:
         ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
         qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
     output:
-        fig="output/asm.syri.figs/"+config['name']+".{haplotype}.syri.png",
-        vcf="data/asm.syri/"+config['name']+".{haplotype}.syri.vcf.gz"
+        fig="output/asm.syri.figs/"+config['name']+".{haplotype}.syri.mm2.png",
+        vcf="data/asm.syri.mm2/"+config['name']+".{haplotype}.syri.mm2.vcf.gz"
     log: 
-        cmd="logs/SyRI_ASM/{haplotype}.SyRI_ASM.cmd.log",
-        time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.time.log"
+        cmd="logs/SyRI_ASM/{haplotype}.SyRI_ASM.mm2.cmd.log",
+        time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.mm2.time.log"
     threads: 4
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000
     params:
         app_path=config["app.path"],
-        wrk_dir="data/asm.syri",
+        wrk_dir="data/asm.syri.mm2",
         plotsr_cfg="src/plotsr-base.cfg"
     shell:
         """
@@ -273,7 +273,7 @@ rule SyRI_on_ASM_mm2:
 
         mkdir -p $dir
         /usr/bin/time -v -o {log.time} \
-            bash scripts/getSyriFigs.sh \
+            bash scripts/Syri.figs_mm2.sh \
             -a {params.app_path} \
             -t {threads} \
             -d $dir \
@@ -334,7 +334,7 @@ rule SyRI_on_chrInput:
         
         #echo "The ASM Array : ${{AllAsmList[@]}}"
 
-        bash scripts/getSyriFigs.sh \
+        bash scripts/Syri.figs_mm2.sh \
             -a {params.app_path} \
             -t {threads} \
             -d $dir \
@@ -804,7 +804,7 @@ rule vcf_fig:
     # Produce a figure describing INS/DEL length distribution from vg deconstruct and SyRI
     input:
         vg="output/pan1c."+config['name']+".vcf.gz",
-        syris=expand("data/asm.syri/"+config['name']+".{haplotype}.syri.vcf.gz", haplotype=SAMPLES_NOREF)
+        syris_mm2=expand("data/asm.syri.mm2/"+config['name']+".{haplotype}.syri.mm2.vcf.gz", haplotype=SAMPLES_NOREF)
     output:
         vcf_fig=directory("output/vcf.figs")
     threads: 1
@@ -820,9 +820,9 @@ rule vcf_fig:
 
         RHAP=$(basename {params.refname} .fa.gz | cut -f1 -d'.')
         RHAPN=$(basename {params.refname} .fa.gz | cut -f2 -d'.' | cut -f2 -d'p')
-        FOLDER=$(dirname {input.syris[0]})
+        FOLDER=$(dirname {input.syris_mm2[0]})
 
-        #% SyRI VCF
+        #% SyRI VCF MM2
         ## Going through all folders
         for vcf in $FOLDER/*.vcf.gz; do
             THAP=$(basename $vcf .syri.vcf.gz | cut -f2 -d'.')
@@ -938,7 +938,7 @@ def get_report_sections(wildcards):
 
     if config["get_ASMs_SyRI"] == "True":
         sections["SyRI_on_ASMs_figs"] = expand(
-            "output/asm.syri.figs/"+config['name']+".{haplotype}.syri.png", 
+            "output/asm.syri.figs/"+config['name']+".{haplotype}.syri.mm2.png", 
             haplotype=SAMPLES_NOREF
             )
 
diff --git a/scripts/getSyriFigs.sh b/scripts/Syri.figs_mm2.sh
similarity index 98%
rename from scripts/getSyriFigs.sh
rename to scripts/Syri.figs_mm2.sh
index 4f3c6ba..08e1ba7 100755
--- a/scripts/getSyriFigs.sh
+++ b/scripts/Syri.figs_mm2.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Create a Syri figure for the given genomes
+# Create a Syri figure for the given genomes using minimap2
 # @author: alexis.mergez@inrae.fr
 
 # Initializing arguments
-- 
GitLab


From 12b52f813b6e54be6b376c5011bdfa9264de6059 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 6 Sep 2024 15:51:05 +0200
Subject: [PATCH 158/310] Added SyRI on haplotypes using wfmash alignments

---
 Snakefile                |  53 ++++++++++++++++-
 scripts/Syri.figs_wfm.sh | 123 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 174 insertions(+), 2 deletions(-)
 create mode 100755 scripts/Syri.figs_wfm.sh

diff --git a/Snakefile b/Snakefile
index 16db5a8..508f967 100644
--- a/Snakefile
+++ b/Snakefile
@@ -53,7 +53,7 @@ def which_analysis():
 
     if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each input assembly 
         analysis_inputs.append(
-            expand("output/asm.syri.figs/"+config['name']+".{haplotype}.syri.mm2.png", haplotype=SAMPLES_NOREF)
+            expand("output/asm.syri.figs/"+config['name']+".{haplotype}.syri.{tool}.png", haplotype=SAMPLES_NOREF, tool=["mm2", "wfm"])
         )
     if config["get_chrInputs_SyRI"] == "True": # Creating SyRI figures for each PGGB input
         analysis_inputs.append(
@@ -250,7 +250,7 @@ rule chromosome_clustering:
 
 rule SyRI_on_ASM_mm2:
     # Run SyRI on a single assembly. 
-    # The assembly is mapped on the 'reference' and SyRI search for SV.
+    # The assembly is mapped on the 'reference' with Minimap2 and SyRI search for SV.
     input:
         ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
         qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
@@ -292,6 +292,55 @@ rule SyRI_on_ASM_mm2:
 
         rm -r $dir
         """
+    
+rule SyRI_on_ASM_wfm:
+    # Run SyRI on a single assembly. 
+    # The assembly is mapped on the 'reference' with Wfmash and SyRI search for SV.
+    input:
+        ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
+        qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
+    output:
+        fig="output/asm.syri.figs/"+config['name']+".{haplotype}.syri.wfm.png",
+        vcf="data/asm.syri.wfm/"+config['name']+".{haplotype}.syri.wfm.vcf.gz"
+    log: 
+        cmd="logs/SyRI_ASM/{haplotype}.SyRI_ASM.wfm.cmd.log",
+        time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.wfm.time.log"
+    threads: 4
+    resources:
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000
+    params:
+        app_path=config["app.path"],
+        wrk_dir="data/asm.syri.wfm",
+        plotsr_cfg="src/plotsr-base.cfg",
+        segment_length=config['wfmash.segment_length'],
+        mapping_id=config['wfmash.mapping_id'],
+        wfmash_sec=config['wfmash.secondary']
+    shell:
+        """
+        dir="{params.wrk_dir}/{wildcards.haplotype}"
+
+        mkdir -p $dir
+        /usr/bin/time -v -o {log.time} \
+            bash scripts/Syri.figs_wfm.sh \
+            -a {params.app_path} \
+            -t {threads} \
+            -d $dir \
+            -o $(basename {output.fig}) \
+            -r {input.ref} \
+            -h 10 -w 20 -s "0.9" -f 10 \
+            -c {params.plotsr_cfg} \
+            -w "-p {params.mapping_id} -s {params.segment_length} -l $(( {params.segment_length} * 5 )) {params.wfmash_sec}" \
+            -q "{input.qry}" 2>&1 | \
+            tee {log.cmd}
+        
+        mv $dir/$(basename {output.fig}) {output.fig}
+        mv $dir/*.vcf {params.wrk_dir}/$(basename {output.vcf} .gz)
+
+        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
+            -@ {threads} {params.wrk_dir}/$(basename {output.vcf} .gz)
+
+        rm -r $dir
+        """
 
 """
 Core section : Running PGGB
diff --git a/scripts/Syri.figs_wfm.sh b/scripts/Syri.figs_wfm.sh
new file mode 100755
index 0000000..d263423
--- /dev/null
+++ b/scripts/Syri.figs_wfm.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+# Create a Syri figure for the given genomes using wfmash
+# @author: alexis.mergez@inrae.fr
+
+# Initializing arguments
+ref=""          # Reference fasta
+qry=""          # Queries fasta
+appdir=""       # Directory containing apptainer images
+threads=""      
+wrkdir=""       # Working directory (used to store intermediate files: .paf/.sam, syri outputs, etc.)
+output=""       # Output Syri figure(s)
+height=16       # Figure height
+width=9         # Figure width
+fontsize=12     
+space="0.7"     # Space for homologous chromosomes 
+config=""       # Plotsr config file
+wfm_args=""     # wfmash arguments
+
+## Getting arguments
+while getopts "r:q:a:t:d:o:h:w:f:s:c:w:" option; do
+    case "$option" in
+        r) ref="$OPTARG";;
+        q) qry="$OPTARG";;
+        a) appdir="$OPTARG";;
+        t) threads="$OPTARG";;
+        d) wrkdir="$OPTARG";;
+        o) output="$OPTARG";;
+        h) height="$OPTARG";;
+        w) width="$OPTARG";;
+        f) fontsize="$OPTARG";;
+        s) space="$OPTARG";;
+        c) config="$OPTARG";;
+        w) wfm_args="$OPTARG";;
+        \?) echo "Usage: $0 [-r ref] [-q query] [-a appdir] [-t threads] [-d wrkdir] [-o output] [-h height] [-w width] [-f fontsize] [-s space] [-c config] [-w wfmash arguments]" >&2
+            exit 1;;
+    esac
+done
+
+## Main script
+# Reading query argument and creating an array containing the path to query fasta files
+IFS=' ' read -r -a temp <<< "$qry"
+readarray -td '' temp_sorted < <(printf '%s\0' "${temp[@]}" | sort -z)
+
+#echo "Query : $qry"
+#echo "tempArr : ${temp[@]}"
+
+asmList=("$ref")
+
+# Sorting the array to put the reference first
+for item in "${temp_sorted[@]}"; do
+    if [[ $item != "$ref" ]]; then
+        asmList+=($item)
+    fi
+done
+
+#echo "The array : ${asmList[@]}"
+
+# Array to store the created syri files 
+syriFileList=()
+
+# Iterating 2 by 2 with overlap, over the array of fasta files
+for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
+
+    # Setting filepaths for later
+    bamFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).bam"
+    syriFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).syri.out"
+    hapID=$(basename ${asmList[i + 1]})
+    refID=$(basename ${asmList[i]})
+
+    # Debug
+    echo -e "\n[Debug::SyRI_script::$hapID] hapID: $hapID\trefID: $refID\n"
+    
+    # Adding the output syri file to the array
+    syriFileList+=($syriFile)
+
+    # Pruning fasta files based on common chromosomes
+    apptainer run $appdir/pan1c-env.sif python scripts/syri_pruning.py \
+        -r ${asmList[i]} -f ${asmList[i + 1]} -o $wrkdir -d
+
+    #echo "\n[Debug::SyRI_script::$hapID] REF"
+    #grep '>' $wrkdir/$(basename ${asmList[i]} .gz)
+    #echo "\n[Debug::SyRI_script::$hapID] QRY"
+    #grep '>' $wrkdir/$(basename ${asmList[i + 1]} .gz)
+
+    # Renaming chromosomes with same ids as the reference (Not working)
+    #sed -i "s/${hapID}#chr/${refID}#chr/g" $wrkdir/$(basename ${asmList[i + 1]} .gz)
+
+    # Minimap2 genome vs genome alignment
+    apptainer run --app wfmash $appdir/PanGeTools.sif \
+        $wrkdir/$(basename ${asmList[i]} .gz) $wrkdir/$(basename ${asmList[i + 1]} .gz) \
+        "$wfm_args" \
+        -n $(grep '>' $wrkdir/$(basename ${asmList[i]} .gz) | wc -l) \
+        -t $threads -a > $wrkdir/$bamFile.sam
+
+    apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads $wrkdir/$bamFile.sam > $wrkdir/$bamFile
+    rm $wrkdir/$bamFile.sam 
+    #rm $wrkdir/*.fa
+
+    # Syri on previous alignment
+    apptainer run $appdir/pan1c-env.sif \
+        syri -c $wrkdir/$bamFile -r $wrkdir/$(basename ${asmList[i]} .gz) -q $wrkdir/$(basename ${asmList[i + 1]} .gz) -k -F B \
+        --nc $threads \
+        --dir $wrkdir --prefix "$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz)."    
+done
+
+# Creating genomes.txt for plotsr. It is used to give simple names to fasta files in the final figure
+# Each line contains 2 columns : fasta filepath and its simpler name
+echo -e "#files\tname" > $wrkdir/genomes.txt
+for asm in "${asmList[@]}"; do
+    echo -e "$wrkdir/$(basename ${asm} .gz)\t$(basename $asm .fa.gz | cut -d'.' -f1)#$(basename $asm .fa.gz | cut -d'.' -f2 | cut -d'p' -f2)" >> $wrkdir/genomes.txt
+done
+
+# Generating the plotsr command
+command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f $fontsize -S $space -H $height -W $width -d 600 --cfg $config "
+
+# Adding syri files to the command as each needs to be specified using "--sr" argument 
+for file in "${syriFileList[@]}"; do
+    command+="--sr $wrkdir/$file "
+done
+
+# Running plotsr
+apptainer run $appdir/pan1c-env.sif plotsr \
+    $command
-- 
GitLab
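
The new Syri.figs_wfm.sh mirrors the existing minimap2 variant: the reference is placed first, the remaining assemblies are sorted, and each consecutive pair is aligned so that plotsr can stack the resulting syri.out files. A minimal Python sketch of that ordering logic (file names are hypothetical):

    # Illustrative sketch of the assembly ordering used by Syri.figs_wfm.sh:
    # reference first, then the other haplotypes in sorted order, with one
    # alignment (and one syri.out) per consecutive pair. File names are made up.
    ref = "R64.hap1.ragtagged.fa.gz"
    queries = ["CEN.hap2.ragtagged.fa.gz", "CEN.hap1.ragtagged.fa.gz"]

    asm_list = [ref] + sorted(q for q in queries if q != ref)

    for target, query in zip(asm_list, asm_list[1:]):
        print(f"align {query} against {target}")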


From 4a94c8a65996a82e3c9ff09be81115d2e2e2ce76 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 6 Sep 2024 16:22:20 +0200
Subject: [PATCH 159/310] Update Syri.figs_wfm.sh

---
 scripts/Syri.figs_wfm.sh | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/scripts/Syri.figs_wfm.sh b/scripts/Syri.figs_wfm.sh
index d263423..0fb3290 100755
--- a/scripts/Syri.figs_wfm.sh
+++ b/scripts/Syri.figs_wfm.sh
@@ -85,9 +85,19 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
     # Renaming chromosomes with same ids as the reference (Not working)
     #sed -i "s/${hapID}#chr/${refID}#chr/g" $wrkdir/$(basename ${asmList[i + 1]} .gz)
 
-    # Minimap2 genome vs genome alignment
+    # Compressing and indexing
+    apptainer run $appdir/PanGeTools.sif bgzip \
+        -@ $threads $wrkdir/$(basename ${asmList[i]} .gz)
+    apptainer run $appdir/PanGeTools.sif bgzip \
+        -@ $threads $wrkdir/$(basename ${asmList[i+1]} .gz)
+    apptainer run --app samtools $appdir/PanGeTools.sif \
+        faidx $wrkdir/$(basename ${asmList[i]})
+    apptainer run --app samtools $appdir/PanGeTools.sif \
+        faidx $wrkdir/$(basename ${asmList[i+1]})
+
+    # Wfmash genome vs genome alignment
     apptainer run --app wfmash $appdir/PanGeTools.sif \
-        $wrkdir/$(basename ${asmList[i]} .gz) $wrkdir/$(basename ${asmList[i + 1]} .gz) \
+        $wrkdir/$(basename ${asmList[i]}) $wrkdir/$(basename ${asmList[i + 1]}) \
         "$wfm_args" \
         -n $(grep '>' $wrkdir/$(basename ${asmList[i]} .gz) | wc -l) \
         -t $threads -a > $wrkdir/$bamFile.sam
-- 
GitLab


From 614c0ee3693c16ef7b11e61bf31c53e80f28568d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 6 Sep 2024 16:49:29 +0200
Subject: [PATCH 160/310] Update Syri.figs_wfm.sh

---
 scripts/Syri.figs_wfm.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/Syri.figs_wfm.sh b/scripts/Syri.figs_wfm.sh
index 0fb3290..847f838 100755
--- a/scripts/Syri.figs_wfm.sh
+++ b/scripts/Syri.figs_wfm.sh
@@ -87,9 +87,9 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
 
     # Compressing and indexing
     apptainer run $appdir/PanGeTools.sif bgzip \
-        -@ $threads $wrkdir/$(basename ${asmList[i]} .gz)
+        -@ $threads -k $wrkdir/$(basename ${asmList[i]} .gz)
     apptainer run $appdir/PanGeTools.sif bgzip \
-        -@ $threads $wrkdir/$(basename ${asmList[i+1]} .gz)
+        -@ $threads -k $wrkdir/$(basename ${asmList[i+1]} .gz)
     apptainer run --app samtools $appdir/PanGeTools.sif \
         faidx $wrkdir/$(basename ${asmList[i]})
     apptainer run --app samtools $appdir/PanGeTools.sif \
@@ -99,7 +99,7 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
     apptainer run --app wfmash $appdir/PanGeTools.sif \
         $wrkdir/$(basename ${asmList[i]}) $wrkdir/$(basename ${asmList[i + 1]}) \
         "$wfm_args" \
-        -n $(grep '>' $wrkdir/$(basename ${asmList[i]} .gz) | wc -l) \
+        -n $(grep '>' $wrkdir/$(basename ${asmList[i]}) | wc -l) \
         -t $threads -a > $wrkdir/$bamFile.sam
 
     apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads $wrkdir/$bamFile.sam > $wrkdir/$bamFile
-- 
GitLab


From 5b5269f0ba60b47ac693b2a19cbff93b40670590 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 6 Sep 2024 17:01:42 +0200
Subject: [PATCH 161/310] Update Syri.figs_wfm.sh

---
 scripts/Syri.figs_wfm.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/Syri.figs_wfm.sh b/scripts/Syri.figs_wfm.sh
index 847f838..f59ecd5 100755
--- a/scripts/Syri.figs_wfm.sh
+++ b/scripts/Syri.figs_wfm.sh
@@ -96,11 +96,11 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
         faidx $wrkdir/$(basename ${asmList[i+1]})
 
     # Wfmash genome vs genome alignment
-    apptainer run --app wfmash $appdir/PanGeTools.sif \
-        $wrkdir/$(basename ${asmList[i]}) $wrkdir/$(basename ${asmList[i + 1]}) \
-        "$wfm_args" \
-        -n $(grep '>' $wrkdir/$(basename ${asmList[i]}) | wc -l) \
-        -t $threads -a > $wrkdir/$bamFile.sam
+    /usr/bin/time -v -o WFM.log \
+        apptainer run --app wfmash $appdir/PanGeTools.sif \
+            $wrkdir/$(basename ${asmList[i]}) $wrkdir/$(basename ${asmList[i + 1]}) \
+            "$wfm_args" \
+            -t $threads -a > $wrkdir/$bamFile.sam
 
     apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads $wrkdir/$bamFile.sam > $wrkdir/$bamFile
     rm $wrkdir/$bamFile.sam 
-- 
GitLab


From 915a26662059401f98a5048b60db304992b4d96c Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 6 Sep 2024 17:13:36 +0200
Subject: [PATCH 162/310] Update Syri.figs_wfm.sh

---
 scripts/Syri.figs_wfm.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/Syri.figs_wfm.sh b/scripts/Syri.figs_wfm.sh
index f59ecd5..8a45d0d 100755
--- a/scripts/Syri.figs_wfm.sh
+++ b/scripts/Syri.figs_wfm.sh
@@ -95,8 +95,10 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
     apptainer run --app samtools $appdir/PanGeTools.sif \
         faidx $wrkdir/$(basename ${asmList[i+1]})
 
+    echo $wfm_args
+    
     # Wfmash genome vs genome alignment
-    /usr/bin/time -v -o WFM.log \
+    /usr/bin/time -v -o $wrkdir/WFM.log \
         apptainer run --app wfmash $appdir/PanGeTools.sif \
             $wrkdir/$(basename ${asmList[i]}) $wrkdir/$(basename ${asmList[i + 1]}) \
             "$wfm_args" \
-- 
GitLab


From 8b26d7aec3c22c528c17cae7acc8bcd364df3c5a Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 6 Sep 2024 17:25:47 +0200
Subject: [PATCH 163/310] Removed wfmash arguments for the moment

---
 Snakefile                | 1 -
 scripts/Syri.figs_wfm.sh | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/Snakefile b/Snakefile
index 508f967..fd7fe19 100644
--- a/Snakefile
+++ b/Snakefile
@@ -329,7 +329,6 @@ rule SyRI_on_ASM_wfm:
             -r {input.ref} \
             -h 10 -w 20 -s "0.9" -f 10 \
             -c {params.plotsr_cfg} \
-            -w "-p {params.mapping_id} -s {params.segment_length} -l $(( {params.segment_length} * 5 )) {params.wfmash_sec}" \
             -q "{input.qry}" 2>&1 | \
             tee {log.cmd}
         
diff --git a/scripts/Syri.figs_wfm.sh b/scripts/Syri.figs_wfm.sh
index 8a45d0d..24b3044 100755
--- a/scripts/Syri.figs_wfm.sh
+++ b/scripts/Syri.figs_wfm.sh
@@ -14,7 +14,7 @@ width=9         # Figure width
 fontsize=12     
 space="0.7"     # Space for homologous chromosomes 
 config=""       # Plotsr config file
-wfm_args=""     # wfmash arguments
+wfm_args="-p 95 -s 10000 -l 50000 -k19 -H 0.001"     # wfmash arguments
 
 ## Getting arguments
 while getopts "r:q:a:t:d:o:h:w:f:s:c:w:" option; do
@@ -96,7 +96,7 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
         faidx $wrkdir/$(basename ${asmList[i+1]})
 
     echo $wfm_args
-    
+
     # Wfmash genome vs genome alignment
     /usr/bin/time -v -o $wrkdir/WFM.log \
         apptainer run --app wfmash $appdir/PanGeTools.sif \
-- 
GitLab


From 5c81595bce41a72ccfdea8c99484fb048e15c918 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 6 Sep 2024 17:28:34 +0200
Subject: [PATCH 164/310] Update Syri.figs_wfm.sh

---
 scripts/Syri.figs_wfm.sh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/scripts/Syri.figs_wfm.sh b/scripts/Syri.figs_wfm.sh
index 24b3044..9cd1748 100755
--- a/scripts/Syri.figs_wfm.sh
+++ b/scripts/Syri.figs_wfm.sh
@@ -95,13 +95,11 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
     apptainer run --app samtools $appdir/PanGeTools.sif \
         faidx $wrkdir/$(basename ${asmList[i+1]})
 
-    echo $wfm_args
-
     # Wfmash genome vs genome alignment
     /usr/bin/time -v -o $wrkdir/WFM.log \
         apptainer run --app wfmash $appdir/PanGeTools.sif \
             $wrkdir/$(basename ${asmList[i]}) $wrkdir/$(basename ${asmList[i + 1]}) \
-            "$wfm_args" \
+            -p 95 -s 10000 -l 50000 -k19 -H 0.001 \
             -t $threads -a > $wrkdir/$bamFile.sam
 
     apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads $wrkdir/$bamFile.sam > $wrkdir/$bamFile
-- 
GitLab


From 50bf5037b47ed2f4de7fdc9f2e67dbac39980c01 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 6 Sep 2024 17:46:26 +0200
Subject: [PATCH 165/310] Update Syri.figs_wfm.sh

---
 scripts/Syri.figs_wfm.sh | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/scripts/Syri.figs_wfm.sh b/scripts/Syri.figs_wfm.sh
index 9cd1748..81ad3fa 100755
--- a/scripts/Syri.figs_wfm.sh
+++ b/scripts/Syri.figs_wfm.sh
@@ -96,11 +96,10 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
         faidx $wrkdir/$(basename ${asmList[i+1]})
 
     # Wfmash genome vs genome alignment
-    /usr/bin/time -v -o $wrkdir/WFM.log \
-        apptainer run --app wfmash $appdir/PanGeTools.sif \
-            $wrkdir/$(basename ${asmList[i]}) $wrkdir/$(basename ${asmList[i + 1]}) \
-            -p 95 -s 10000 -l 50000 -k19 -H 0.001 \
-            -t $threads -a > $wrkdir/$bamFile.sam
+    apptainer run --app wfmash $appdir/PanGeTools.sif \
+        $wrkdir/$(basename ${asmList[i]}) $wrkdir/$(basename ${asmList[i + 1]}) \
+        -p 95 -s 10000 -l 50000 -k19 -H 0.001 \
+        -t $threads -a | sed "s/_[0-9]//g" > $wrkdir/$bamFile.sam
 
     apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads $wrkdir/$bamFile.sam > $wrkdir/$bamFile
     rm $wrkdir/$bamFile.sam 
-- 
GitLab
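
The `sed "s/_[0-9]//g"` filter added to the wfmash output rewrites the SAM stream on the fly: any underscore followed by a digit is dropped from every line, presumably so that numeric suffixes on sequence names do not prevent SyRI from matching query and reference chromosomes. A rough Python equivalent, for illustration only:

    import re

    # Rough Python equivalent of the `sed "s/_[0-9]//g"` step above: every
    # "_<digit>" occurrence is removed from each line of the SAM stream,
    # which in practice strips numeric suffixes from sequence names.
    # The example line is made up.
    line = "chr01_1\t0\tchr01\t1\t60\t100M\t*\t0\t0\tACGT\t*"
    print(re.sub(r"_[0-9]", "", line))   # -> chr01\t0\tchr01\t1\t60\t100M\t*\t0\t0\tACGT\t*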


From beda6dd685a5bbfaf509681d89cf4091c2dfef95 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 6 Sep 2024 17:56:59 +0200
Subject: [PATCH 166/310] Update Syri.figs_wfm.sh

---
 scripts/Syri.figs_wfm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/Syri.figs_wfm.sh b/scripts/Syri.figs_wfm.sh
index 81ad3fa..feb818b 100755
--- a/scripts/Syri.figs_wfm.sh
+++ b/scripts/Syri.figs_wfm.sh
@@ -102,7 +102,7 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
         -t $threads -a | sed "s/_[0-9]//g" > $wrkdir/$bamFile.sam
 
     apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads $wrkdir/$bamFile.sam > $wrkdir/$bamFile
-    rm $wrkdir/$bamFile.sam 
+    #rm $wrkdir/$bamFile.sam 
     #rm $wrkdir/*.fa
 
     # Syri on previous alignment
-- 
GitLab


From abb3542854571de7ba26e6e1b1ece1aed759c666 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 6 Sep 2024 18:01:08 +0200
Subject: [PATCH 167/310] Update Syri.figs_wfm.sh

---
 scripts/Syri.figs_wfm.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/Syri.figs_wfm.sh b/scripts/Syri.figs_wfm.sh
index feb818b..902aa60 100755
--- a/scripts/Syri.figs_wfm.sh
+++ b/scripts/Syri.figs_wfm.sh
@@ -99,10 +99,11 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
     apptainer run --app wfmash $appdir/PanGeTools.sif \
         $wrkdir/$(basename ${asmList[i]}) $wrkdir/$(basename ${asmList[i + 1]}) \
         -p 95 -s 10000 -l 50000 -k19 -H 0.001 \
+        -n $(grep '>' $wrkdir/$(basename ${asmList[i]} .gz) | wc -l) \
         -t $threads -a | sed "s/_[0-9]//g" > $wrkdir/$bamFile.sam
 
     apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads $wrkdir/$bamFile.sam > $wrkdir/$bamFile
-    #rm $wrkdir/$bamFile.sam 
+    rm $wrkdir/$bamFile.sam 
     #rm $wrkdir/*.fa
 
     # Syri on previous alignment
-- 
GitLab


From eb9632d5144c07a3b761b815dbedc2620ab72b11 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 6 Sep 2024 18:08:36 +0200
Subject: [PATCH 168/310] Update Syri.figs_wfm.sh

---
 scripts/Syri.figs_wfm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/Syri.figs_wfm.sh b/scripts/Syri.figs_wfm.sh
index 902aa60..0bbd825 100755
--- a/scripts/Syri.figs_wfm.sh
+++ b/scripts/Syri.figs_wfm.sh
@@ -98,7 +98,7 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
     # Wfmash genome vs genome alignment
     apptainer run --app wfmash $appdir/PanGeTools.sif \
         $wrkdir/$(basename ${asmList[i]}) $wrkdir/$(basename ${asmList[i + 1]}) \
-        -p 95 -s 10000 -l 50000 -k19 -H 0.001 \
+        -p 95 -s 1000 -l 5000 -k19 -H 0.001 \
         -n $(grep '>' $wrkdir/$(basename ${asmList[i]} .gz) | wc -l) \
         -t $threads -a | sed "s/_[0-9]//g" > $wrkdir/$bamFile.sam
 
-- 
GitLab


From dde879d423c2f58c51d01599e7f1fc670284727c Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Mon, 9 Sep 2024 16:57:09 +0200
Subject: [PATCH 169/310] Removed wfm from default results

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index fd7fe19..109525e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -53,7 +53,7 @@ def which_analysis():
 
     if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each input assembly 
         analysis_inputs.append(
-            expand("output/asm.syri.figs/"+config['name']+".{haplotype}.syri.{tool}.png", haplotype=SAMPLES_NOREF, tool=["mm2", "wfm"])
+            expand("output/asm.syri.figs/"+config['name']+".{haplotype}.syri.{tool}.png", haplotype=SAMPLES_NOREF, tool=["mm2"])
         )
     if config["get_chrInputs_SyRI"] == "True": # Creating SyRI figures for each PGGB input
         analysis_inputs.append(
-- 
GitLab


From ba4159454969b84124b17a6a6cfa205583d433e9 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 10 Sep 2024 15:39:26 +0200
Subject: [PATCH 170/310] Added RagTag args

Set confidence to 0.4 instead of 0.2
---
 Snakefile                   | 4 +++-
 config.yaml                 | 3 ++-
 example/config_CICD.yaml    | 3 ++-
 scripts/ragtagChromInfer.sh | 3 ++-
 4 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/Snakefile b/Snakefile
index 109525e..1fc6f35 100644
--- a/Snakefile
+++ b/Snakefile
@@ -115,7 +115,8 @@ rule ragtag_scaffolding:
     priority: 100
     params:
         app_path=config["app.path"],
-        mm2_config=config["ragtag_mm2_conf"]
+        mm2_config=config["ragtag_mm2_conf"],
+        rt_config=config["ragtag_args"]
     log: 
         cmd="logs/ragtag/{haplotype}.ragtag.cmd.log",
         time="logs/ragtag/{haplotype}.ragtag.time.log"
@@ -128,6 +129,7 @@ rule ragtag_scaffolding:
             -t {threads} \
             -r {input.ref} \
             -q {input.fa} \
+            -c "{params.args}" \
             -m "{params.mm2_config}" \
             -o {output.fa} 2>&1 | \
             tee {log.cmd}
diff --git a/config.yaml b/config.yaml
index baf408c..80ab808 100644
--- a/config.yaml
+++ b/config.yaml
@@ -11,7 +11,8 @@ app.path: '<path>'
 mem_multiplier: 1
 
 # Core parameters
-# RagTag parameters
+# RagTag parameters (see: https://github.com/malonge/RagTag/wiki/scaffold)
+ragtag_args: '-i 0.4'
 ragtag_mm2_conf: '-x asm5'
 ## Add -f 0.02 for large genomes
 
diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml
index f3adb7d..11f9571 100644
--- a/example/config_CICD.yaml
+++ b/example/config_CICD.yaml
@@ -11,7 +11,8 @@ app.path: 'appimgs/'
 mem_multiplier: 1
 
 # Core parameters
-# RagTag parameters
+# RagTag parameters (see: https://github.com/malonge/RagTag/wiki/scaffold)
+ragtag_args: '-i 0.4'
 ragtag_mm2_conf: '-x asm5'
 ## Add -f 0.0002 for large genomes
 
diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh
index f4c8b74..05085dc 100755
--- a/scripts/ragtagChromInfer.sh
+++ b/scripts/ragtagChromInfer.sh
@@ -9,6 +9,7 @@ inputref=""     # Reference fasta
 inputquery=""   # Query fasta
 output=""       # Output fasta
 mm2params=""    # Minimap2 parameters
+rtcommand=""    # RagTag commands
 
 ## Getting arguments
 while getopts "d:a:t:r:q:c:o:m:" option; do
@@ -36,7 +37,7 @@ mkdir -p $tmpdir
 
 # Running ragtag scaffold
 apptainer run $appdir/pan1c-env.sif ragtag.py scaffold \
-    --mm2-params "$mm2params -t $threads" -o $tmpdir $inputref $inputquery 2>&1
+    --mm2-params "$mm2params -t $threads" $rtcommand -o $tmpdir $inputref $inputquery 2>&1
 
 # Renaming sequence according to naming scheme
 grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \
-- 
GitLab
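
With this patch the RagTag call is driven by two config values: ragtag_mm2_conf is passed through --mm2-params, and the new ragtag_args string (per the commit message, raising the grouping confidence -i from 0.2 to 0.4) is appended to the scaffold command itself. A simplified sketch of the command assembled by ragtagChromInfer.sh (container invocation and real paths omitted; file names are placeholders):

    # Sketch of how the two RagTag-related config values end up on the ragtag.py
    # command line built by ragtagChromInfer.sh.
    config = {"ragtag_args": "-i 0.4", "ragtag_mm2_conf": "-x asm5"}
    threads = 8

    cmd = (
        "ragtag.py scaffold "
        f'--mm2-params "{config["ragtag_mm2_conf"]} -t {threads}" '
        f"{config['ragtag_args']} "
        "-o tmpdir reference.fa.gz query.fa.gz"
    )
    print(cmd)
    # ragtag.py scaffold --mm2-params "-x asm5 -t 8" -i 0.4 -o tmpdir reference.fa.gz query.fa.gz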


From e84b879f51e897a36949403b57e8943dad0ee082 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 10 Sep 2024 15:43:11 +0200
Subject: [PATCH 171/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 1fc6f35..bc03f11 100644
--- a/Snakefile
+++ b/Snakefile
@@ -129,7 +129,7 @@ rule ragtag_scaffolding:
             -t {threads} \
             -r {input.ref} \
             -q {input.fa} \
-            -c "{params.args}" \
+            -c "{params.ragtag_args}" \
             -m "{params.mm2_config}" \
             -o {output.fa} 2>&1 | \
             tee {log.cmd}
-- 
GitLab


From c1096a7c9eafd68a44a1331c1b2fd93f3e2e260a Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 10 Sep 2024 15:44:01 +0200
Subject: [PATCH 172/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index bc03f11..f9eaac2 100644
--- a/Snakefile
+++ b/Snakefile
@@ -129,7 +129,7 @@ rule ragtag_scaffolding:
             -t {threads} \
             -r {input.ref} \
             -q {input.fa} \
-            -c "{params.ragtag_args}" \
+            -c "{params.rt_config}" \
             -m "{params.mm2_config}" \
             -o {output.fa} 2>&1 | \
             tee {log.cmd}
-- 
GitLab


From 9780e6811c96b7bbefb85bc29ce804f2451a68dd Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 11 Sep 2024 10:22:55 +0200
Subject: [PATCH 173/310] Normalization by reference length

---
 Snakefile                       |  7 +++++--
 scripts/chrGraphs.stats_figs.py | 15 ++++++++++++---
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/Snakefile b/Snakefile
index f9eaac2..fa5a489 100644
--- a/Snakefile
+++ b/Snakefile
@@ -957,14 +957,17 @@ rule create_chrGraphs_figs:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
     params:
         app_path=config['app.path'],
-        pan_name=config['name']
+        pan_name=config['name'],
+        ref_name=config['reference']
     shell:
         """
         mkdir -p $(dirname {output.barplot_mean})
 
+        ref="$(basename {params.ref_name} .fa.gz | cut -f1 -d'.')#$(basename {params.ref_name} .fa.gz | cut -f2 -d'.' | cut -f2 -d'p')"
+
         apptainer run {params.app_path}/pan1c-env.sif python scripts/chrGraphs.stats_figs.py \
             --input {input.pathstats} --output_dir $(dirname {output.barplot_mean}) \
-            --panname {params.pan_name} 
+            --panname {params.pan_name} --reference "$ref"
         """
 
 def get_report_sections(wildcards):
diff --git a/scripts/chrGraphs.stats_figs.py b/scripts/chrGraphs.stats_figs.py
index 1cc11f1..5e586b6 100644
--- a/scripts/chrGraphs.stats_figs.py
+++ b/scripts/chrGraphs.stats_figs.py
@@ -4,7 +4,7 @@ Chromosomes statistics figure script for Pan1c workflow
 Use aggregated TSV produced with chrGraphs.stats_aggregate.py
 
 @author: alexis.mergez@inrae.fr
-@version: 1.0
+@version: 1.1
 """
 
 #% Librairies
@@ -41,6 +41,13 @@ arg_parser.add_argument(
     required = True,
     help = "Pangenome name"
     )
+arg_parser.add_argument(
+    "--reference",
+    "-r",
+    dest = "ref",
+    required = True,
+    help = "Reference name"
+    )
 args = arg_parser.parse_args()
 
 #% Loading and preparing data
@@ -322,8 +329,10 @@ for pangenome in sData.index.unique("Pangenome.name"):
     # Iterating over chromosomes twice to make pairs
     for Q in sData.index.unique("Chr.id"):
         for T in sData.index.unique("Chr.id"):
-            Qtable = sData.loc[pangenome, Q, :].copy().reset_index().pivot(values=["Shared.prop"], index=["Query.name"], columns=["Target.name"]).fillna(0)
-            Ttable = sData.loc[pangenome, T, :].copy().reset_index().pivot(values=["Shared.prop"], index=["Query.name"], columns=["Target.name"]).fillna(0)
+            Qtable = sData.loc[pangenome, Q, :].copy().reset_index().pivot(values=["Shared.length"], index=["Query.name"], columns=["Target.name"]).fillna(0)
+            Qtable = Qtable / sData.loc[("37Bra-v3c", Q, args.ref),:].iloc[0]["Path.length"]
+            Ttable = sData.loc[pangenome, T, :].copy().reset_index().pivot(values=["Shared.length"], index=["Query.name"], columns=["Target.name"]).fillna(0)
+            Ttable = Ttable / sData.loc[("37Bra-v3c", T, args.ref),:].iloc[0]["Path.length"]
 
             # Computing Euclidean distance using the Frobenius norm
             dData["Q"].append(Q)
-- 
GitLab
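
The change to chrGraphs.stats_figs.py replaces the precomputed Shared.prop with Shared.length divided by the reference path length of the same chromosome, so the heatmaps are normalised against the reference rather than against each query. A small pandas sketch of that normalisation on a toy table with the same column names (note the pangenome name is still hardcoded in this patch; the next one substitutes the loop variable):

    import pandas as pd

    # Toy stand-in for the aggregated path-stats table (column names follow the
    # script; all values are made up). One row per (pangenome, chromosome,
    # query path, target path) combination.
    df = pd.DataFrame({
        "Pangenome.name": ["demo"] * 4,
        "Chr.id":         ["chr01"] * 4,
        "Query.name":     ["R64#1", "R64#1", "CEN#1", "CEN#1"],
        "Target.name":    ["R64#1", "CEN#1", "R64#1", "CEN#1"],
        "Shared.length":  [230_000, 180_000, 180_000, 235_000],
        "Path.length":    [230_000, 230_000, 235_000, 235_000],
    }).set_index(["Pangenome.name", "Chr.id", "Query.name"])

    ref = "R64#1"

    # Pivot the raw shared lengths into a Query x Target matrix for one chromosome...
    table = (
        df.loc[("demo", "chr01"), :]
          .reset_index()
          .pivot(values="Shared.length", index="Query.name", columns="Target.name")
          .fillna(0)
    )
    # ...then normalise by the reference path length of that chromosome, as in the patch.
    ref_len = df.loc[("demo", "chr01", ref), :].iloc[0]["Path.length"]
    print(table / ref_len)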


From ae26350b1f6f97e5eb41310de9f58bda84e203e2 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 11 Sep 2024 10:28:03 +0200
Subject: [PATCH 174/310] Fixed chrGraphs.stats_figs.py

---
 scripts/chrGraphs.stats_figs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/chrGraphs.stats_figs.py b/scripts/chrGraphs.stats_figs.py
index 5e586b6..9d78d94 100644
--- a/scripts/chrGraphs.stats_figs.py
+++ b/scripts/chrGraphs.stats_figs.py
@@ -330,9 +330,9 @@ for pangenome in sData.index.unique("Pangenome.name"):
     for Q in sData.index.unique("Chr.id"):
         for T in sData.index.unique("Chr.id"):
             Qtable = sData.loc[pangenome, Q, :].copy().reset_index().pivot(values=["Shared.length"], index=["Query.name"], columns=["Target.name"]).fillna(0)
-            Qtable = Qtable / sData.loc[("37Bra-v3c", Q, args.ref),:].iloc[0]["Path.length"]
+            Qtable = Qtable / sData.loc[(pangenome, Q, args.ref),:].iloc[0]["Path.length"]
             Ttable = sData.loc[pangenome, T, :].copy().reset_index().pivot(values=["Shared.length"], index=["Query.name"], columns=["Target.name"]).fillna(0)
-            Ttable = Ttable / sData.loc[("37Bra-v3c", T, args.ref),:].iloc[0]["Path.length"]
+            Ttable = Ttable / sData.loc[(pangenome, T, args.ref),:].iloc[0]["Path.length"]
 
             # Computing Euclidean distance using the Frobenius norm
             dData["Q"].append(Q)
-- 
GitLab


From 1c0c10d6b34ec29402ac78bbf4c817c50f0c1516 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 11 Sep 2024 14:36:26 +0200
Subject: [PATCH 175/310] Added description to figures

---
 Snakefile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Snakefile b/Snakefile
index fa5a489..cb431ee 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1054,6 +1054,7 @@ rule create_pan1c_report:
 
         ## Barplots sub section
         shell("echo '## Path composition' >> {output.report}")
+        shell("echo -e 'First, the list of unique node ids belonging to each group is computed:\n- Core: node ids found in every path of the graph\n- Private: node ids found in one and only one path of the graph\n- Other: node ids that do not belong to the previous groups\n\nEach group length is computed as the sum of node lengths, weighted by the number of copies found in a given Path (left panel). This sum is then divided by the Path length (right panel).' >> {output.report}")
         _basename = os.path.basename(input.barplot_mean)
         shell("echo '![{_basename}](./chrGraphs.stats.figs/{_basename})' >> {output.report}")
 
@@ -1069,6 +1070,7 @@ rule create_pan1c_report:
         
         ## 2D scatter sub section
         shell("echo '## Core vs Private' >> {output.report}")
+        shell("echo -e 'Using the same method as in the previous figures to compute the Core and Private proportions, Private is plotted against Core: a Path in the lower right has proportionally more Private than Core sequence compared to the other Paths. The clustering is done using DBSCAN.' >> {output.report}")
         _basename = os.path.basename(input.scatter_mean)
         shell("echo '![{_basename}](./chrGraphs.stats.figs/{_basename})' >> {output.report}")
 
@@ -1084,6 +1086,7 @@ rule create_pan1c_report:
 
         ## Heatmap section
         shell("echo '## Pairwise shared content' >> {output.report}")
+        shell("echo -e 'For each pair of paths, the length and relative proportion of shared nodes are computed. The left panel shows the relative proportion in the Query path, including repetitions. The right panel shows the absolute length of shared nodes, without repeats.' >> {output.report}")
         shell("echo '### Pairwise Euclidean distance between chromosomes ' >> {output.report}")
         _basename = os.path.basename(input.heatmap_diff)
         shell("echo '![{_basename}](./chrGraphs.stats.figs/{_basename})' >> {output.report}")
-- 
GitLab


From c58b1d3c4a9663de39e24ca4eee28acb4869151e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 10:22:32 +0200
Subject: [PATCH 176/310] Specifying graph tool used

---
 Snakefile                | 146 ++++++++++++++++++++-------------------
 config.yaml              |   2 +
 example/config_CICD.yaml |   2 +
 3 files changed, 78 insertions(+), 72 deletions(-)

diff --git a/Snakefile b/Snakefile
index cb431ee..17e3386 100644
--- a/Snakefile
+++ b/Snakefile
@@ -36,15 +36,17 @@ nHAP = len(SAMPLES)
 with gzip.open("data/haplotypes/"+config['reference'], "r") as handle:
     CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"]
 
+graph_tools = ["pan1c"] + (config["get_MC"] == True)*["MC"] 
+
 # Adding optional output based on config.yaml, using the following function
 def which_analysis():
     
     ## Default analysis
     analysis_inputs = [     
-        "output/stats/pan1c."+config['name']+".core.stats.tsv", # core stats
-        expand("output/panacus.reports/"+config['name']+".{chromosome}.histgrowth.html", chromosome=CHRLIST), # panacus histgrowth 
-        expand("output/chrGraphs.figs/"+config['name']+".{chromosome}.1Dviz.png", chromosome=CHRLIST), # visualizations from odgi on chromosome graphs
-        "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" # chromosomes graph statistics
+        expand("output/stats/{gtool}."+config['name']+".core.stats.tsv", gtool=graph_tools), # core stats
+        expand("output/panacus.reports/{gtool}."+config['name']+".{chromosome}.histgrowth.html", chromosome=CHRLIST, gtool=graph_tools), # panacus histgrowth 
+        expand("output/chrGraphs.figs/{gtool}."+config['name']+".{chromosome}.1Dviz.png", chromosome=CHRLIST, gtool=graph_tools), # visualizations from odgi on chromosome graphs
+        expand("output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv", gtool=graph_tools) # chromosomes graph statistics
     ]
     
     ## Optionals analysis steps
@@ -70,10 +72,12 @@ def which_analysis():
 
         if config["create_report"] == "True": # Creating report (need contig)
             analysis_inputs.append(
-                "output/pan1c."+config['name']+".report.md"
+                expand("output/{gtool}."+config['name']+".report.md", gtool=graph_tools)
             )
     if config["get_VCF"] == "True": # VCF from the final graph against the "reference"
-        analysis_inputs.append("output/vcf.figs") 
+        analysis_inputs.append(
+            expand("output/{gtool}.vcf.figs", gtool=graph_tools)
+        )
 
     return analysis_inputs
 
@@ -404,10 +408,10 @@ rule wfmash_on_chr:
         fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz',
         fai='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai'
     output:
-        mapping=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf"),
-        aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf"),
-        mapping_gz="data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf.gz",
-        aln_gz="data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf.gz"
+        mapping=temp("data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.wfmash.mapping.paf"),
+        aln=temp("data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.wfmash.aln.paf"),
+        mapping_gz="data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.wfmash.mapping.paf.gz",
+        aln_gz="data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.wfmash.aln.paf.gz"
     threads: 16
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
@@ -460,7 +464,7 @@ rule seqwish:
         fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz',
         aln=rules.wfmash_on_chr.output.aln_gz
     output:
-        gfa_gz="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa.gz"
+        gfa_gz="data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.seqwish.gfa.gz"
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -492,8 +496,8 @@ rule gfaffix_on_chr:
     input:
         rules.seqwish.output.gfa_gz
     output:
-        gfa_gz="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa.gz",
-        transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt"
+        gfa_gz="data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.seqwish.gfaffixD.gfa.gz",
+        transform="data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.seqwish.gfaffixD.transform.txt"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 24000
@@ -527,7 +531,7 @@ rule odgi_postprocessing:
         tags="output/pan1c."+config['name']+".gfa.metadata",
         gfa_gz=rules.gfaffix_on_chr.output.gfa_gz
     output:
-        gfa_gz='data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz'
+        gfa_gz='data/chrGraphs/pan1c.'+config['name']+'.{chromosome}.gfa.gz'
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -594,9 +598,9 @@ rule odgi_postprocessing:
 rule generate_graph_list:
     # Generate a text file containing all created graphs
     input:
-        gfas=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz', chromosome=CHRLIST)
+        gfas=expand('data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.gfa.gz', chromosome=CHRLIST)
     output:
-        "data/chrGraphs/graphsList.txt"
+        "data/chrGraphs/graphsList.{gtool}.txt"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -609,14 +613,14 @@ rule generate_graph_list:
 rule graph_squeeze:
     # Using odgi to merge every subgraphs into a final one
     input:
-        glist="data/chrGraphs/graphsList.txt",
+        glist="data/chrGraphs/graphsList.{gtool}.txt",
         tags="output/pan1c."+config['name']+".gfa.metadata",
-        graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST)
+        graphs=expand('data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST)
     output:
-        gfa_gz="output/pan1c."+config['name']+".gfa.gz"
+        gfa_gz="output/{gtool}."+config['name']+".gfa.gz"
     log: 
-        cmd="logs/squeeze/"+config['name']+".squeeze.cmd.log",
-        time="logs/squeeze/"+config['name']+".squeeze.time.log",
+        cmd="logs/squeeze/{gtool}."+config['name']+".squeeze.cmd.log",
+        time="logs/squeeze/{gtool}."+config['name']+".squeeze.time.log",
     threads: 16
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
@@ -648,10 +652,10 @@ rule graph_squeeze:
 rule graph_stats:
     # Using GFAstats to produce stats on every chromosome graphs
     input:
-        graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa.gz'
+        graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.gfa.gz'
     output:
-        genstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv",
-        pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv"
+        genstats="output/stats/chrGraphs/{gtool}."+config['name']+".{chromosome}.general.stats.tsv",
+        pathstats="output/stats/chrGraphs/{gtool}."+config['name']+".{chromosome}.path.stats.tsv"
     threads: 4
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
@@ -669,10 +673,10 @@ rule graph_stats:
 rule graph_figs:
     # Creating figures using odgi viz 
     input:
-        graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa'
+        graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.gfa'
     output:
-        oneDviz="output/chrGraphs.figs/"+config['name']+".{chromosome}.1Dviz.png",
-        pcov="output/chrGraphs.figs/"+config['name']+".{chromosome}.pcov.png"
+        oneDviz="output/chrGraphs.figs/{gtool}."+config['name']+".{chromosome}.1Dviz.png",
+        pcov="output/chrGraphs.figs/{gtool}."+config['name']+".{chromosome}.pcov.png"
     threads: 4
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -695,10 +699,10 @@ rule graph_figs:
 rule aggregate_graphs_stats:
     # Reading and merging all stats files from chromosome graphs into a .tsv.
     input:
-        genstats=expand("output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST)
+        genstats=expand("output/stats/chrGraphs/{gtool}."+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST)
     output:
-        genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv",
-        pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv"
+        genstats="output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv",
+        pathstats="output/stats/{gtool}."+config['name']+".chrGraph.path.stats.tsv"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
@@ -734,9 +738,9 @@ rule get_graph_tags:
 rule pggb_input_stats:
     # Produces statistics on pggb input sequences
     input:
-        flag="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv"
+        flag="output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv"
     output:
-        "output/stats/pan1c."+config['name']+".chrInput.stats.tsv"
+        "output/stats/{gtool}."+config['name']+".chrInput.stats.tsv"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000
@@ -752,11 +756,10 @@ rule pggb_input_stats:
 rule core_statistics:
     # Aggregate chrInput, chrGraph and pggb statistics into a single tsv 
     input:
-        chrInputStats = "output/stats/pan1c."+config['name']+".chrInput.stats.tsv",
-        chrGraphStats = "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv"
+        chrInputStats = "output/stats/{gtool}."+config['name']+".chrInput.stats.tsv",
+        chrGraphStats = "output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv"
     output:
-        tsv = "output/stats/pan1c."+config['name']+".core.stats.tsv",
-        dir = directory("output/pggb.usage.figs")
+        tsv = "output/stats/{gtool}."+config['name']+".core.stats.tsv"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
@@ -765,10 +768,9 @@ rule core_statistics:
         pan_name=config['name']
     shell:
         """
-        mkdir -p {output.dir}
         apptainer run {params.app_path}/pan1c-env.sif python scripts/core.stats_compute.py \
             --pggbStats logs/pggb --chrInputStats {input.chrInputStats} \
-            --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f {output.dir} -p {params.pan_name}
+            --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f /dev/null -p {params.pan_name}
         """
 
 """
@@ -777,7 +779,7 @@ Post-processing section
 rule get_pav:
     # Create PAV matrix readable by panache for a given chromosome scale graph
     input:
-        "data/chrGraphs/graphsList.txt"
+        "data/chrGraphs/graphsList.{gtool}.txt"
     output:
         directory("output/pav.matrices")
     threads: 16
@@ -797,12 +799,12 @@ rule get_pav:
 rule panacus_stats:
     # Produces panacus reports for a chromosome graph
     input:
-        graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa'
+        graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.gfa'
     output:
-        html='output/panacus.reports/'+config['name']+'.{chromosome}.histgrowth.html'
+        html='output/panacus.reports/{gtool}.'+config['name']+'.{chromosome}.histgrowth.html'
     log: 
-        cmd="logs/panacus/{chromosome}.panacus.cmd.log",
-        time="logs/panacus/{chromosome}.panacus.time.log"
+        cmd="logs/panacus/{gtool}.{chromosome}.panacus.cmd.log",
+        time="logs/panacus/{gtool}.{chromosome}.panacus.time.log"
     params:
         app_path=config['app.path'],
         pan_name=config['name'],
@@ -826,9 +828,9 @@ rule panacus_stats:
 rule vg_deconstruct:
     # Produce a VCF based on the "reference" haplotype
     input:
-        graph="output/pan1c."+config['name']+".xg",
+        graph="output/{gtool}."+config['name']+".xg",
     output:
-        vcf=temp("output/pan1c."+config['name']+".vcf"),
+        vcf=temp("output/{gtool}."+config['name']+".vcf"),
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000
@@ -836,8 +838,8 @@ rule vg_deconstruct:
         app_path=config['app.path'],
         ref=config['reference']
     log: 
-        cmd="logs/vg_deconstruct/vg_deconstruct.cmd.log",
-        time="logs/vg_deconstruct/vg_deconstruct.time.log"
+        cmd="logs/vg_deconstruct/{gtool}.vg_deconstruct.cmd.log",
+        time="logs/vg_deconstruct/{gtool}.vg_deconstruct.time.log"
     shell:
         """
         /usr/bin/time -v -o {log.time} \
@@ -853,10 +855,10 @@ rule vg_deconstruct:
 rule vcf_fig:
     # Produce a figure describing INS/DEL length distribution from vg deconstruct and SyRI
     input:
-        vg="output/pan1c."+config['name']+".vcf.gz",
+        vg="output/{gtool}."+config['name']+".vcf.gz",
         syris_mm2=expand("data/asm.syri.mm2/"+config['name']+".{haplotype}.syri.mm2.vcf.gz", haplotype=SAMPLES_NOREF)
     output:
-        vcf_fig=directory("output/vcf.figs")
+        vcf_fig=directory("output/{gtool}.vcf.figs")
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 20000
@@ -907,12 +909,12 @@ rule vcf_fig:
 rule create_pan1c_report_fig:
     # Produces a markdown report figure of chromosomes graphs
     input:
-        graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa',
+        graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.gfa',
         contigfig="output/chr.contig/{chromosome}.contig.png",
     output:
-        odgifig=temp("tmp/{chromosome}.odgi.png"),
-        namefig=temp("tmp/{chromosome}.name.png"),
-        reportfig="output/report/"+config['name']+".{chromosome}.report.fig.png"
+        odgifig=temp("tmp/{gtool}.{chromosome}.odgi.png"),
+        namefig=temp("tmp/{gtool}.{chromosome}.name.png"),
+        reportfig="output/report/{gtool}."+config['name']+".{chromosome}.report.fig.png"
     threads: 4
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
@@ -944,14 +946,14 @@ rule create_pan1c_report_fig:
 rule create_chrGraphs_figs:
     # Produce figures based on aggregated path stats
     input:
-        pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv"
+        pathstats="output/stats/{gtool}."+config['name']+".chrGraph.path.stats.tsv"
     output:
-        barplots=expand("output/chrGraphs.stats.figs/"+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST),
-        scatters=expand("output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST),
-        heatmaps=expand("output/chrGraphs.stats.figs/"+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST),
-        barplot_mean="output/chrGraphs.stats.figs/"+config['name']+".path.decomp.mean.png",
-        scatter_mean="output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.mean.png",
-        heatmap_diff="output/chrGraphs.stats.figs/"+config['name']+".sharred.content.diff.png"
+        barplots=expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST),
+        scatters=expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST),
+        heatmaps=expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST),
+        barplot_mean="output/chrGraphs.stats.figs/{gtool}."+config['name']+".path.decomp.mean.png",
+        scatter_mean="output/chrGraphs.stats.figs/{gtool}."+config['name']+".2D.scatter.mean.png",
+        heatmap_diff="output/chrGraphs.stats.figs/{gtool}."+config['name']+".sharred.content.diff.png"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
@@ -979,15 +981,15 @@ def get_report_sections(wildcards):
     sections = dict()
 
     sections["metadata"] = "output/pan1c."+config['name']+".gfa.metadata"
-    sections["odgifigs"] = expand("output/report/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST)
-    sections["genstats"] = "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv"
-    sections["pathstats"] = "output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv"
-    sections["barplots"] = expand("output/chrGraphs.stats.figs/"+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST)
-    sections["scatters"] = expand("output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST)
-    sections["heatmaps"] = expand("output/chrGraphs.stats.figs/"+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST)
-    sections["barplot_mean"] = "output/chrGraphs.stats.figs/"+config['name']+".path.decomp.mean.png"
-    sections["scatter_mean"] = "output/chrGraphs.stats.figs/"+config['name']+".2D.scatter.mean.png"
-    sections["heatmap_diff"] = "output/chrGraphs.stats.figs/"+config['name']+".sharred.content.diff.png"
+    sections["odgifigs"] = expand("output/report/{wildcards.gtool}."+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST)
+    sections["genstats"] = "output/stats/{wildcards.gtool}."+config['name']+".chrGraph.general.stats.tsv"
+    sections["pathstats"] = "output/stats/{wildcards.gtool}."+config['name']+".chrGraph.path.stats.tsv"
+    sections["barplots"] = expand("output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST)
+    sections["scatters"] = expand("output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST)
+    sections["heatmaps"] = expand("output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST)
+    sections["barplot_mean"] = "output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".path.decomp.mean.png"
+    sections["scatter_mean"] = "output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".2D.scatter.mean.png"
+    sections["heatmap_diff"] = "output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".sharred.content.diff.png"
 
     if config["get_ASMs_SyRI"] == "True":
         sections["SyRI_on_ASMs_figs"] = expand(
@@ -1002,7 +1004,7 @@ def get_report_sections(wildcards):
             )
 
     if config['get_VCF'] == "True":
-        sections['VCF_figs'] = "output/vcf.figs"
+        sections['VCF_figs'] = "output/{wildcards.gtool}.vcf.figs"
 
     return sections      
 
@@ -1011,8 +1013,8 @@ rule create_pan1c_report:
     input:
         unpack(get_report_sections)
     output:
-        report="output/pan1c."+config['name']+".report.md",
-        html="output/pan1c."+config['name']+".report.html"
+        report="output/{gtool}."+config['name']+".report.md",
+        html="output/{gtool}."+config['name']+".report.html"
     threads: 4
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 500
diff --git a/config.yaml b/config.yaml
index 80ab808..ae25ac7 100644
--- a/config.yaml
+++ b/config.yaml
@@ -31,6 +31,8 @@ odgi.pcov.params: '-x 2000 -O'
 ## Optional parts of the workflow
 # Running Quast to get statistics on input haplotypes
 run_Quast: 'True'
+# Make Minigraph-Cactus graph using the same method (chromosome level)
+get_MC: 'False'
 # Getting figures showing chromosome decomposition into contigs
 get_contig_pos: 'True'
 # Computes Presence Absence Variant matrices for Panache (not recommended; very long)
diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml
index 11f9571..ba2ac18 100644
--- a/example/config_CICD.yaml
+++ b/example/config_CICD.yaml
@@ -31,6 +31,8 @@ odgi.pcov.params: '-x 2000 -a 25 -O'
 ## Optional parts of the workflow
 # Running Quast to get statistics on input haplotypes
 run_Quast: 'True'
+# Make Minigraph-Cactus graph using the same method (chromosome level)
+get_MC: 'True'
 # Getting figures showing chromosome decomposition into contigs
 get_contig_pos: 'True'
 # Computes Presence Absence Variant matrices for Panache (not recommended; very long)
-- 
GitLab
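
This patch threads a {gtool} wildcard through most outputs so the same rules can serve both the PGGB-based 'pan1c' graph and an optional Minigraph-Cactus ('MC') graph, with the tool list derived from the new get_MC flag. A minimal sketch of the idea, outside the real Snakefile (it uses the string comparison the workflow's other flags rely on, and assumes a Snakemake environment for expand()):

    # Minimal sketch of how the new {gtool} axis multiplies the target list.
    # Note that this workflow stores its flags as the strings 'True'/'False'.
    from snakemake.io import expand

    config = {"name": "demo", "get_MC": "True"}
    CHRLIST = ["chr01", "chr02"]

    # "pan1c" (the PGGB-based graph) is always built; "MC" is added when enabled.
    graph_tools = ["pan1c"] + (["MC"] if config["get_MC"] == "True" else [])

    targets = expand(
        "output/panacus.reports/{gtool}." + config["name"] + ".{chromosome}.histgrowth.html",
        gtool=graph_tools,
        chromosome=CHRLIST,
    )
    print(targets)   # 2 tools x 2 chromosomes -> 4 target paths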


From 3fbc254a78db6f9a1ddc094a9855e2166baec1d9 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 10:25:01 +0200
Subject: [PATCH 177/310] Update Snakefile

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 17e3386..921a853 100644
--- a/Snakefile
+++ b/Snakefile
@@ -94,8 +94,8 @@ Rules   ------------------------------------------------------------------------
 # Main target rule
 rule all:
     input:
-        "output/pan1c."+config['name']+".gfa.gz", # Final graph (main output)
-        "output/pan1c."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line)
+        expand("output/{gtool}."+config['name']+".gfa.gz", gtool=graph_tools), # Final graph (main output)
+        expand("output/{gtool}."+config['name']+".gfa.metadata", gtool=graph_tools), # Metadata for the final (also in top of gfa files as # line)
         which_analysis()
 
 """
-- 
GitLab


From 3b4ac86c53be4ce9dd952181c63f5e9410a28191 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 10:40:28 +0200
Subject: [PATCH 178/310] Update Snakefile

---
 Snakefile | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/Snakefile b/Snakefile
index 921a853..84d281d 100644
--- a/Snakefile
+++ b/Snakefile
@@ -598,7 +598,7 @@ rule odgi_postprocessing:
 rule generate_graph_list:
     # Generate a text file containing all created graphs
     input:
-        gfas=expand('data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.gfa.gz', chromosome=CHRLIST)
+        gfas=expand('data/chrGraphs/{{gtool}}.'+config['name']+'.{chromosome}.gfa.gz', chromosome=CHRLIST)
     output:
         "data/chrGraphs/graphsList.{gtool}.txt"
     threads: 1
@@ -615,7 +615,7 @@ rule graph_squeeze:
     input:
         glist="data/chrGraphs/graphsList.{gtool}.txt",
         tags="output/pan1c."+config['name']+".gfa.metadata",
-        graphs=expand('data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST)
+        graphs=expand('data/chrGraphs/{{gtool}}.'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST)
     output:
         gfa_gz="output/{gtool}."+config['name']+".gfa.gz"
     log: 
@@ -699,7 +699,7 @@ rule graph_figs:
 rule aggregate_graphs_stats:
     # Reading and merging all stats files from chromosome graphs into a .tsv.
     input:
-        genstats=expand("output/stats/chrGraphs/{gtool}."+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST)
+        genstats=expand("output/stats/chrGraphs/{{gtool}}."+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST)
     output:
         genstats="output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv",
         pathstats="output/stats/{gtool}."+config['name']+".chrGraph.path.stats.tsv"
@@ -948,9 +948,9 @@ rule create_chrGraphs_figs:
     input:
         pathstats="output/stats/{gtool}."+config['name']+".chrGraph.path.stats.tsv"
     output:
-        barplots=expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST),
-        scatters=expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST),
-        heatmaps=expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST),
+        barplots=expand("output/chrGraphs.stats.figs/{{gtool}}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST),
+        scatters=expand("output/chrGraphs.stats.figs/{{gtool}}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST),
+        heatmaps=expand("output/chrGraphs.stats.figs/{{gtool}}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST),
         barplot_mean="output/chrGraphs.stats.figs/{gtool}."+config['name']+".path.decomp.mean.png",
         scatter_mean="output/chrGraphs.stats.figs/{gtool}."+config['name']+".2D.scatter.mean.png",
         heatmap_diff="output/chrGraphs.stats.figs/{gtool}."+config['name']+".sharred.content.diff.png"
@@ -981,12 +981,12 @@ def get_report_sections(wildcards):
     sections = dict()
 
     sections["metadata"] = "output/pan1c."+config['name']+".gfa.metadata"
-    sections["odgifigs"] = expand("output/report/{wildcards.gtool}."+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST)
+    sections["odgifigs"] = expand("output/report/{{wildcards.gtool}}."+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST)
     sections["genstats"] = "output/stats/{wildcards.gtool}."+config['name']+".chrGraph.general.stats.tsv"
     sections["pathstats"] = "output/stats/{wildcards.gtool}."+config['name']+".chrGraph.path.stats.tsv"
-    sections["barplots"] = expand("output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST)
-    sections["scatters"] = expand("output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST)
-    sections["heatmaps"] = expand("output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST)
+    sections["barplots"] = expand("output/chrGraphs.stats.figs/{{wildcards.gtool}}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST)
+    sections["scatters"] = expand("output/chrGraphs.stats.figs/{{wildcards.gtool}}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST)
+    sections["heatmaps"] = expand("output/chrGraphs.stats.figs/{{wildcards.gtool}}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST)
     sections["barplot_mean"] = "output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".path.decomp.mean.png"
     sections["scatter_mean"] = "output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".2D.scatter.mean.png"
     sections["heatmap_diff"] = "output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".sharred.content.diff.png"
-- 
GitLab
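
The hunks above escape the gtool placeholder with double braces inside expand(). In Snakemake, expand() fills every single-braced placeholder from its keyword arguments, so a placeholder that must remain a rule wildcard has to be written as {{gtool}}. A minimal, self-contained sketch of the difference (file names here are illustrative, not the workflow's real outputs):

    from snakemake.io import expand

    CHRLIST = ["chr1", "chr2"]

    # Single braces are consumed by expand() and need a value:
    print(expand("data/chrGraphs/{gtool}.pan.{chromosome}.gfa.gz",
                 gtool="pan1c", chromosome=CHRLIST))

    # Double braces survive expansion as a literal {gtool} wildcard,
    # resolved later when the rule is instantiated:
    print(expand("data/chrGraphs/{{gtool}}.pan.{chromosome}.gfa.gz",
                 chromosome=CHRLIST))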


From e11a4281193b64aa6d07082d49159063a49a0d06 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 10:41:37 +0200
Subject: [PATCH 179/310] Update .gitlab-ci.yml

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 828b4de..259e6b2 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -23,7 +23,7 @@ shallow_run_workflow:
     - cp example/*.fa.gz data/haplotypes
     - HOME=$(pwd)
     - rm config.yaml && mv example/config_CICD.yaml config.yaml
-    - apptainer run --bind $HOME:/root appimgs/snakebox.sif snakemake -c1 --dag > workflow.dot
+    - apptainer run --bind $HOME:/root appimgs/snakebox.sif snakemake --debug -c1 --dag > workflow.dot
   artifacts:
     paths:
       - workflow.dot
-- 
GitLab


From 46bf9834e3e7c7c8ff7d03e1761af88c58d79795 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 10:45:40 +0200
Subject: [PATCH 180/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 84d281d..0c0d431 100644
--- a/Snakefile
+++ b/Snakefile
@@ -2,7 +2,7 @@ configfile: "config.yaml"
 
 include: "rules/tools.smk"
 
-ruleorder: odgi_postprocessing > run_bgzip
+ruleorder: odgi_postprocessing > graph_squeeze > run_bgzip
 
 ## Modules
 import os
-- 
GitLab


From 71c91b8587906afce6bc3d91e9049ca225e1e21d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 10:49:19 +0200
Subject: [PATCH 181/310] Update tools.smk

---
 rules/tools.smk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rules/tools.smk b/rules/tools.smk
index 7683ed1..ce71c46 100644
--- a/rules/tools.smk
+++ b/rules/tools.smk
@@ -45,9 +45,9 @@ rule run_bgzip:
 rule decompress_graph:
     # Decompressing graph if required
     input: 
-        "{file}.gfa.gz"
+        "{dir}/{tool}.{file}.gfa.gz"
     output:
-        temp("{file}.gfa")
+        temp("{dir}/{tool}.{file}.gfa")
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
-- 
GitLab


From a42b16a96aece7554ca454ccacfb928d957d8715 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 10:53:17 +0200
Subject: [PATCH 182/310] Added .tmp suffix to decompressed GFA

---
 Snakefile       | 8 ++++----
 rules/tools.smk | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Snakefile b/Snakefile
index 0c0d431..595707b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -615,7 +615,7 @@ rule graph_squeeze:
     input:
         glist="data/chrGraphs/graphsList.{gtool}.txt",
         tags="output/pan1c."+config['name']+".gfa.metadata",
-        graphs=expand('data/chrGraphs/{{gtool}}.'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST)
+        graphs=expand('data/chrGraphs/{{gtool}}.'+config['name']+'.{chromosome}.tmp.gfa', chromosome=CHRLIST)
     output:
         gfa_gz="output/{gtool}."+config['name']+".gfa.gz"
     log: 
@@ -673,7 +673,7 @@ rule graph_stats:
 rule graph_figs:
     # Creating figures using odgi viz 
     input:
-        graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.gfa'
+        graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.tmp.gfa'
     output:
         oneDviz="output/chrGraphs.figs/{gtool}."+config['name']+".{chromosome}.1Dviz.png",
         pcov="output/chrGraphs.figs/{gtool}."+config['name']+".{chromosome}.pcov.png"
@@ -799,7 +799,7 @@ rule get_pav:
 rule panacus_stats:
     # Produces panacus reports for a chromosome graph
     input:
-        graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.gfa'
+        graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.tmp.gfa'
     output:
         html='output/panacus.reports/{gtool}.'+config['name']+'.{chromosome}.histgrowth.html'
     log: 
@@ -909,7 +909,7 @@ rule vcf_fig:
 rule create_pan1c_report_fig:
     # Produces a markdown report figure of chromosomes graphs
     input:
-        graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.gfa',
+        graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.tmp.gfa',
         contigfig="output/chr.contig/{chromosome}.contig.png",
     output:
         odgifig=temp("tmp/{gtool}.{chromosome}.odgi.png"),
diff --git a/rules/tools.smk b/rules/tools.smk
index ce71c46..4368d77 100644
--- a/rules/tools.smk
+++ b/rules/tools.smk
@@ -45,9 +45,9 @@ rule run_bgzip:
 rule decompress_graph:
     # Decompressing graph if required
     input: 
-        "{dir}/{tool}.{file}.gfa.gz"
+        "{file}.gfa.gz"
     output:
-        temp("{dir}/{tool}.{file}.gfa")
+        temp("{file}.tmp.gfa")
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
-- 
GitLab


From 56b9ee1ea05e0add5b310b9768e20e5d1153b193 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 10:58:02 +0200
Subject: [PATCH 183/310] Update config_CICD.yaml

---
 example/config_CICD.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml
index ba2ac18..e058b23 100644
--- a/example/config_CICD.yaml
+++ b/example/config_CICD.yaml
@@ -32,7 +32,7 @@ odgi.pcov.params: '-x 2000 -a 25 -O'
 # Running Quast to get statistics on input haplotypes
 run_Quast: 'True'
 # Make Minigraph-Cactus graph using the same method (chromosome level)
-get_MC: 'True'
+get_MC: 'False'
 # Getting figures showing chromosome decomposition into contigs
 get_contig_pos: 'True'
 # Computes Presence Absence Variant matrices for Panache (not recommended; very long)
-- 
GitLab


From 17da1e0acb631fd971c3a8f72a68622b63002b4d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 11:03:42 +0200
Subject: [PATCH 184/310] Update Snakefile

---
 Snakefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Snakefile b/Snakefile
index 595707b..584286f 100644
--- a/Snakefile
+++ b/Snakefile
@@ -981,12 +981,12 @@ def get_report_sections(wildcards):
     sections = dict()
 
     sections["metadata"] = "output/pan1c."+config['name']+".gfa.metadata"
-    sections["odgifigs"] = expand("output/report/{{wildcards.gtool}}."+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST)
+    sections["odgifigs"] = expand("output/report/{gtool}."+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
     sections["genstats"] = "output/stats/{wildcards.gtool}."+config['name']+".chrGraph.general.stats.tsv"
     sections["pathstats"] = "output/stats/{wildcards.gtool}."+config['name']+".chrGraph.path.stats.tsv"
-    sections["barplots"] = expand("output/chrGraphs.stats.figs/{{wildcards.gtool}}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST)
-    sections["scatters"] = expand("output/chrGraphs.stats.figs/{{wildcards.gtool}}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST)
-    sections["heatmaps"] = expand("output/chrGraphs.stats.figs/{{wildcards.gtool}}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST)
+    sections["barplots"] = expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
+    sections["scatters"] = expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
+    sections["heatmaps"] = expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
     sections["barplot_mean"] = "output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".path.decomp.mean.png"
     sections["scatter_mean"] = "output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".2D.scatter.mean.png"
     sections["heatmap_diff"] = "output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".sharred.content.diff.png"
-- 
GitLab


From d11f5e0e388ebe9e8bf21f1dc69b969a7ae67185 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 11:04:53 +0200
Subject: [PATCH 185/310] Update Snakefile

---
 Snakefile | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Snakefile b/Snakefile
index 584286f..c028ab4 100644
--- a/Snakefile
+++ b/Snakefile
@@ -982,14 +982,14 @@ def get_report_sections(wildcards):
 
     sections["metadata"] = "output/pan1c."+config['name']+".gfa.metadata"
     sections["odgifigs"] = expand("output/report/{gtool}."+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
-    sections["genstats"] = "output/stats/{wildcards.gtool}."+config['name']+".chrGraph.general.stats.tsv"
-    sections["pathstats"] = "output/stats/{wildcards.gtool}."+config['name']+".chrGraph.path.stats.tsv"
+    sections["genstats"] = f"output/stats/{wildcards.gtool}."+config['name']+".chrGraph.general.stats.tsv"
+    sections["pathstats"] = f"output/stats/{wildcards.gtool}."+config['name']+".chrGraph.path.stats.tsv"
     sections["barplots"] = expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
     sections["scatters"] = expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
     sections["heatmaps"] = expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
-    sections["barplot_mean"] = "output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".path.decomp.mean.png"
-    sections["scatter_mean"] = "output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".2D.scatter.mean.png"
-    sections["heatmap_diff"] = "output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".sharred.content.diff.png"
+    sections["barplot_mean"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".path.decomp.mean.png"
+    sections["scatter_mean"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".2D.scatter.mean.png"
+    sections["heatmap_diff"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".sharred.content.diff.png"
 
     if config["get_ASMs_SyRI"] == "True":
         sections["SyRI_on_ASMs_figs"] = expand(
@@ -1004,7 +1004,7 @@ def get_report_sections(wildcards):
             )
 
     if config['get_VCF'] == "True":
-        sections['VCF_figs'] = "output/{wildcards.gtool}.vcf.figs"
+        sections['VCF_figs'] = f"output/{wildcards.gtool}.vcf.figs"
 
     return sections      
 
-- 
GitLab
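
The two patches above fix get_report_sections(): inside an input function, Snakemake does not format plain strings, so a path containing {wildcards.gtool} is returned verbatim unless Python fills the value in, either through an f-string or by passing it to expand(). A small sketch of both working patterns, with made-up names:

    from snakemake.io import expand

    class Wildcards:          # stand-in for the object an input function receives
        gtool = "pan1c"

    wildcards = Wildcards()
    CHRLIST = ["chr1", "chr2"]

    # Pattern 1: let expand() fill both placeholders.
    figs = expand("output/report/{gtool}.pan.{chromosome}.report.fig.png",
                  chromosome=CHRLIST, gtool=[wildcards.gtool])

    # Pattern 2: an f-string for single paths.
    genstats = f"output/stats/{wildcards.gtool}.pan.chrGraph.general.stats.tsv"

    print(figs)
    print(genstats)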


From 61a6de99bbffeb78bc9f9fdf0770801183657168 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 11:07:07 +0200
Subject: [PATCH 186/310] Update tools.smk

---
 rules/tools.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rules/tools.smk b/rules/tools.smk
index 4368d77..0a68159 100644
--- a/rules/tools.smk
+++ b/rules/tools.smk
@@ -61,7 +61,7 @@ rule decompress_graph:
 rule gfa_2_xg:
     # Convert a GFA to XG
     input:
-        "{graph}.gfa"
+        "{graph}.tmp.gfa"
     output:
         "{graph}.xg"
     threads: 8
-- 
GitLab


From a892ac0efd75a1dee324484e042fdc7c4298e94a Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 11:37:45 +0200
Subject: [PATCH 187/310] Added MC graph creation rule

---
 Snakefile                | 43 ++++++++++++++++++++++++++++++++++++++++
 example/config_CICD.yaml |  2 +-
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index c028ab4..05b806b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -595,6 +595,49 @@ rule odgi_postprocessing:
             -@ {threads} $gfa_out
         """
 
+rule MC_graph:
+    input:
+        tags="output/pan1c."+config['name']+".gfa.metadata",
+        fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz'
+    output:
+        gfa_gz='data/chrGraphs/MC.'+config['name']+'.{chromosome}.gfa.gz'
+    threads: 16
+    resources:
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000
+    params:
+        tmp_dir='data/chrGraphs/MC.{chromosome}',
+        ref_name=config['name'],
+        app_path=config['app.path']
+    log:
+        stdout="logs/MC/{chromosome}.mc.stdout.log",
+        stderr="logs/MC/{chromosome}.mc.stderr.log"
+    shell:
+        """
+        mkdir -p {params.tmp_dir}
+
+        # Creating a fasta for each sequence
+        zcat {input.fa} | awk -F"#" -v DIR={params.tmp_dir} \
+            '/^>/ {{OUT= DIR "/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}'
+
+        # Listing fasta files
+        for hap in {params.tmp_dir}/*.fa; do
+            fullname=$(basename $hap .fa)
+            genome=$(echo $fullname | cut -f1 -d'#')
+            hapid=$(echo $fullname | cut -f2 -d '#')
+            echo -e "${genome}.${hapid}\t${hap}" >> {params.tmp_dir}/{wildcards.chromosome}.genomes.txt
+        done
+
+        apptainer run {params.app_path}/minigraph-cactus_v2.7.0.sif \
+            {params.tmp_dir}/tmp {params.tmp_dir}/{wildcards.chromosome}.genomes.txt \
+            --outDir {params.tmp_dir} \
+            --outName $(basename {output.gfa_gz} .gfa.gz) \
+            --reference "$(basename {params.ref_name} .fa.gz | cut -f1 -d'.').$(basename {params.ref_name} .fa.gz | cut -f2 -d '.' | cut -f2 -d'p')" \
+            --gfa \
+            --clip 0 --filter 0
+        
+        mv {params.tmp_dir}/$(basename {output.gfa_gz} .gfa.gz).full.gfa.gz {output.gfa_gz}
+        """
+
 rule generate_graph_list:
     # Generate a text file containing all created graphs
     input:
diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml
index e058b23..ba2ac18 100644
--- a/example/config_CICD.yaml
+++ b/example/config_CICD.yaml
@@ -32,7 +32,7 @@ odgi.pcov.params: '-x 2000 -a 25 -O'
 # Running Quast to get statistics on input haplotypes
 run_Quast: 'True'
 # Make Minigraph-Cactus graph using the same method (chromosome level)
-get_MC: 'False'
+get_MC: 'True'
 # Getting figures showing chromosome decomposition into contigs
 get_contig_pos: 'True'
 # Computes Presence Absence Variant matrices for Panache (not recommended; very long)
-- 
GitLab


From adf76ef1d9a2b12d8fc8eb9ebcf7b81d8d0a879d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 11:39:51 +0200
Subject: [PATCH 188/310] Update Snakefile

---
 Snakefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Snakefile b/Snakefile
index 05b806b..7d1685d 100644
--- a/Snakefile
+++ b/Snakefile
@@ -37,6 +37,7 @@ with gzip.open("data/haplotypes/"+config['reference'], "r") as handle:
     CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"]
 
 graph_tools = ["pan1c"] + (config["get_MC"] == True)*["MC"] 
+print(graph_tools)
 
 # Adding optionnal output based on config.yaml, using the following function
 def which_analysis():
-- 
GitLab


From 1f5bb4125a47d1d90bd3b3e2954cc0402dde92d5 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 11:40:47 +0200
Subject: [PATCH 189/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 7d1685d..9e1d4de 100644
--- a/Snakefile
+++ b/Snakefile
@@ -36,7 +36,7 @@ nHAP = len(SAMPLES)
 with gzip.open("data/haplotypes/"+config['reference'], "r") as handle:
     CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"]
 
-graph_tools = ["pan1c"] + (config["get_MC"] == True)*["MC"] 
+graph_tools = ["pan1c"] + (config["get_MC"] == "True")*["MC"] 
 print(graph_tools)
 
 # Adding optionnal output based on config.yaml, using the following function
-- 
GitLab
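
The comparison above matters because get_MC is stored as the quoted string 'True' in config.yaml, so testing it against the Python boolean never matches and MC would silently be left out of graph_tools. The list-multiplication idiom itself is plain conditional concatenation; a stand-alone sketch:

    config = {"get_MC": "True"}  # quoted in the YAML, so it arrives as a string

    # Comparing to the boolean drops "MC" (the test is False, so the list is empty):
    print(["pan1c"] + (config["get_MC"] == True) * ["MC"])    # ['pan1c']

    # Comparing to the string keeps it:
    print(["pan1c"] + (config["get_MC"] == "True") * ["MC"])  # ['pan1c', 'MC']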


From 32ae9af09e6f6556626644b673aa7829ffcea0a1 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 11:42:24 +0200
Subject: [PATCH 190/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 9e1d4de..e843517 100644
--- a/Snakefile
+++ b/Snakefile
@@ -96,7 +96,7 @@ Rules   ------------------------------------------------------------------------
 rule all:
     input:
         expand("output/{gtool}."+config['name']+".gfa.gz", gtool=graph_tools), # Final graph (main output)
-        expand("output/{gtool}."+config['name']+".gfa.metadata", gtool=graph_tools), # Metadata for the final (also in top of gfa files as # line)
+        "output/pan1c."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line)
         which_analysis()
 
 """
-- 
GitLab


From 9c3469c74a3cd5e34e9339795e17108d0cd5c04d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 11:44:06 +0200
Subject: [PATCH 191/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index e843517..e7a7bad 100644
--- a/Snakefile
+++ b/Snakefile
@@ -625,7 +625,7 @@ rule MC_graph:
             fullname=$(basename $hap .fa)
             genome=$(echo $fullname | cut -f1 -d'#')
             hapid=$(echo $fullname | cut -f2 -d '#')
-            echo -e "${genome}.${hapid}\t${hap}" >> {params.tmp_dir}/{wildcards.chromosome}.genomes.txt
+            echo -e "${{genome}}.${{hapid}}\t${{hap}}" >> {params.tmp_dir}/{wildcards.chromosome}.genomes.txt
         done
 
         apptainer run {params.app_path}/minigraph-cactus_v2.7.0.sif \
-- 
GitLab


From 0229b84c16cc61bf1b7c23142340ebf396f369dc Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 11:46:24 +0200
Subject: [PATCH 192/310] Update Snakefile

---
 Snakefile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index e7a7bad..25e56fe 100644
--- a/Snakefile
+++ b/Snakefile
@@ -37,7 +37,6 @@ with gzip.open("data/haplotypes/"+config['reference'], "r") as handle:
     CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"]
 
 graph_tools = ["pan1c"] + (config["get_MC"] == "True")*["MC"] 
-print(graph_tools)
 
 # Adding optionnal output based on config.yaml, using the following function
 def which_analysis():
-- 
GitLab


From b23d16571dba75734ecff2388597ceed60102ba1 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 11:56:59 +0200
Subject: [PATCH 193/310] Update Snakefile

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 25e56fe..6bcd0d7 100644
--- a/Snakefile
+++ b/Snakefile
@@ -606,7 +606,7 @@ rule MC_graph:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000
     params:
         tmp_dir='data/chrGraphs/MC.{chromosome}',
-        ref_name=config['name'],
+        ref_name=config['reference'],
         app_path=config['app.path']
     log:
         stdout="logs/MC/{chromosome}.mc.stdout.log",
@@ -623,7 +623,7 @@ rule MC_graph:
         for hap in {params.tmp_dir}/*.fa; do
             fullname=$(basename $hap .fa)
             genome=$(echo $fullname | cut -f1 -d'#')
-            hapid=$(echo $fullname | cut -f2 -d '#')
+            hapid=$(echo $fullname | cut -f2 -d'#')
             echo -e "${{genome}}.${{hapid}}\t${{hap}}" >> {params.tmp_dir}/{wildcards.chromosome}.genomes.txt
         done
 
-- 
GitLab


From 8335ee57016fc2d5714f78f9795d66902384c566 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 12:27:24 +0200
Subject: [PATCH 194/310] Update Snakefile

---
 Snakefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Snakefile b/Snakefile
index 6bcd0d7..7ffdbb7 100644
--- a/Snakefile
+++ b/Snakefile
@@ -613,6 +613,7 @@ rule MC_graph:
         stderr="logs/MC/{chromosome}.mc.stderr.log"
     shell:
         """
+        if [ -d {params.tmp_dir} ]; then rm -r {params.tmp_dir}; fi
         mkdir -p {params.tmp_dir}
 
         # Creating a fasta for each sequence
-- 
GitLab


From 9d5348259e17b594062337eac36367500e294e18 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 12:39:36 +0200
Subject: [PATCH 195/310] Update Snakefile

---
 Snakefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Snakefile b/Snakefile
index 7ffdbb7..5e3eb71 100644
--- a/Snakefile
+++ b/Snakefile
@@ -618,13 +618,13 @@ rule MC_graph:
 
         # Creating a fasta for each sequence
         zcat {input.fa} | awk -F"#" -v DIR={params.tmp_dir} \
-            '/^>/ {{OUT= DIR "/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}'
+            '/^>/ {{OUT= DIR "/" gsub(/#/, "_", substr($0,2)) ".fa"}}; {{print >> OUT; close(OUT)}}'
 
         # Listing fasta files
         for hap in {params.tmp_dir}/*.fa; do
             fullname=$(basename $hap .fa)
-            genome=$(echo $fullname | cut -f1 -d'#')
-            hapid=$(echo $fullname | cut -f2 -d'#')
+            genome=$(echo $fullname | cut -f1 -d'_')
+            hapid=$(echo $fullname | cut -f2 -d'_')
             echo -e "${{genome}}.${{hapid}}\t${{hap}}" >> {params.tmp_dir}/{wildcards.chromosome}.genomes.txt
         done
 
-- 
GitLab


From dcd4b78e419bc9eacc45e633f17c5843aa167766 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 12:43:20 +0200
Subject: [PATCH 196/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 5e3eb71..2fcc69e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -618,7 +618,7 @@ rule MC_graph:
 
         # Creating a fasta for each sequence
         zcat {input.fa} | awk -F"#" -v DIR={params.tmp_dir} \
-            '/^>/ {{OUT= DIR "/" gsub(/#/, "_", substr($0,2)) ".fa"}}; {{print >> OUT; close(OUT)}}'
+            '/^>/ {{name=substr($0,2); OUT= DIR "/" gsub(/#/, "_", name) ".fa"}}; {{print >> OUT; close(OUT)}}'
 
         # Listing fasta files
         for hap in {params.tmp_dir}/*.fa; do
-- 
GitLab


From 18bddc7be40357933634737c5d52ef610d65c212 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 13:01:34 +0200
Subject: [PATCH 197/310] Update Snakefile

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 2fcc69e..f9049e7 100644
--- a/Snakefile
+++ b/Snakefile
@@ -617,8 +617,8 @@ rule MC_graph:
         mkdir -p {params.tmp_dir}
 
         # Creating a fasta for each sequence
-        zcat {input.fa} | awk -F"#" -v DIR={params.tmp_dir} \
-            '/^>/ {{name=substr($0,2); OUT= DIR "/" gsub(/#/, "_", name) ".fa"}}; {{print >> OUT; close(OUT)}}'
+        zcat {input.fa} | awk -v DIR={params.tmp_dir} \
+            '/^>/ {{name=substr($0, 2); gsub(/#/, "_", name); OUT= DIR "/" name ".fa"}}; {{print >> OUT; close(OUT)}}'
 
         # Listing fasta files
         for hap in {params.tmp_dir}/*.fa; do
-- 
GitLab


From 212a8c9e2cad74476834b2651b06894d518cdbb2 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 13:06:43 +0200
Subject: [PATCH 198/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index f9049e7..d5aeba6 100644
--- a/Snakefile
+++ b/Snakefile
@@ -618,7 +618,7 @@ rule MC_graph:
 
         # Creating a fasta for each sequence
         zcat {input.fa} | awk -v DIR={params.tmp_dir} \
-            '/^>/ {{name=substr($0, 2); gsub(/#/, "_", name); OUT= DIR "/" name ".fa"}}; {{print >> OUT; close(OUT)}}'
+            '/^>/ {{name=substr($0, 2); OUT= DIR "/" name ".fa"}}; {{print >> OUT; close(OUT)}}'
 
         # Listing fasta files
         for hap in {params.tmp_dir}/*.fa; do
-- 
GitLab


From 8e16b79c6f88fc34b457ee3382b215062fce4cb1 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 13:09:32 +0200
Subject: [PATCH 199/310] Update Snakefile

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index d5aeba6..30ad6a4 100644
--- a/Snakefile
+++ b/Snakefile
@@ -623,8 +623,8 @@ rule MC_graph:
         # Listing fasta files
         for hap in {params.tmp_dir}/*.fa; do
             fullname=$(basename $hap .fa)
-            genome=$(echo $fullname | cut -f1 -d'_')
-            hapid=$(echo $fullname | cut -f2 -d'_')
+            genome=$(echo $fullname | cut -f1 -d'#')
+            hapid=$(echo $fullname | cut -f2 -d'#')
             echo -e "${{genome}}.${{hapid}}\t${{hap}}" >> {params.tmp_dir}/{wildcards.chromosome}.genomes.txt
         done
 
-- 
GitLab


From 98751fc7c287518c1733d4b2c59a32ba4c3468a7 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 13:15:19 +0200
Subject: [PATCH 200/310] Update Snakefile

---
 Snakefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Snakefile b/Snakefile
index 30ad6a4..74fbe55 100644
--- a/Snakefile
+++ b/Snakefile
@@ -618,13 +618,13 @@ rule MC_graph:
 
         # Creating a fasta for each sequence
         zcat {input.fa} | awk -v DIR={params.tmp_dir} \
-            '/^>/ {{name=substr($0, 2); OUT= DIR "/" name ".fa"}}; {{print >> OUT; close(OUT)}}'
+            '/^>/ {{name=substr($0, 2); gsub(/#/, ".", name); OUT= DIR "/" name ".fa"}}; {{print >> OUT; close(OUT)}}'
 
         # Listing fasta files
         for hap in {params.tmp_dir}/*.fa; do
             fullname=$(basename $hap .fa)
-            genome=$(echo $fullname | cut -f1 -d'#')
-            hapid=$(echo $fullname | cut -f2 -d'#')
+            genome=$(echo $fullname | cut -f1 -d'.')
+            hapid=$(echo $fullname | cut -f2 -d'.')
             echo -e "${{genome}}.${{hapid}}\t${{hap}}" >> {params.tmp_dir}/{wildcards.chromosome}.genomes.txt
         done
 
-- 
GitLab
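
The awk one-liner above splits the per-chromosome FASTA into one file per sequence, rewriting '#'-separated headers into dot-separated file names so the following cut calls can recover the genome and haplotype fields. A rough Python equivalent of that splitting step (paths and header layout are assumptions, not taken from the workflow):

    import gzip
    import os

    tmp_dir = "data/chrGraphs/MC.chr01"                            # hypothetical tmp_dir
    os.makedirs(tmp_dir, exist_ok=True)

    out = None
    with gzip.open("data/chrInputs/pan.chr01.fa.gz", "rt") as fa:  # hypothetical input
        for line in fa:
            if line.startswith(">"):
                # gsub(/#/, ".", name): "genome#1#chr01" -> "genome.1.chr01"
                name = line[1:].strip().replace("#", ".")
                if out is not None:
                    out.close()
                out = open(os.path.join(tmp_dir, name + ".fa"), "w")
            out.write(line)                                        # header and sequence lines alike
    if out is not None:
        out.close()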


From 9d0d4c753403daa42bf0cfaf134e1525d2f7ab0f Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 14:05:22 +0200
Subject: [PATCH 201/310] Fixes

- decompression rule not writing to the declared output
- gfastats called with the wrong output name
---
 Snakefile       | 2 +-
 rules/tools.smk | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 74fbe55..1ad9bc9 100644
--- a/Snakefile
+++ b/Snakefile
@@ -710,7 +710,7 @@ rule graph_stats:
         """
         apptainer run --app gfastats {params.app_path}/PanGeTools.sif \
             -g {input.graph} -P \
-            -o $(dirname {output.genstats})/{params.pan_name}.{wildcards.chromosome} \
+            -o $(dirname {output.genstats})/{wildcards.gtool}.{params.pan_name}.{wildcards.chromosome} \
             -t {threads}
         """
 
diff --git a/rules/tools.smk b/rules/tools.smk
index 0a68159..e5d36ba 100644
--- a/rules/tools.smk
+++ b/rules/tools.smk
@@ -55,7 +55,7 @@ rule decompress_graph:
         app_path=config["app.path"]
     shell:
         """
-        gzip -d -k {input} 
+        gzip -d -c {input} > {output} 
         """
 
 rule gfa_2_xg:
-- 
GitLab
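
The decompression fix above replaces gzip -d -k, which writes the result next to the input under the input's own name, with gzip -d -c redirected into the path the rule actually declares as output (now ending in .tmp.gfa). The same streaming decompression in Python, with illustrative file names:

    import gzip
    import shutil

    src_path = "data/chrGraphs/pan1c.pan.chr01.gfa.gz"     # hypothetical input
    dst_path = "data/chrGraphs/pan1c.pan.chr01.tmp.gfa"    # declared output

    # Mirrors: gzip -d -c {input} > {output}, leaving the compressed file untouched
    with gzip.open(src_path, "rb") as src, open(dst_path, "wb") as dst:
        shutil.copyfileobj(src, dst)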


From 60894a9501f3584651aa05f14c98e7c551636615 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 14:14:44 +0200
Subject: [PATCH 202/310] Converting MC output to GFA 1.0

---
 Snakefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 1ad9bc9..7d89806 100644
--- a/Snakefile
+++ b/Snakefile
@@ -628,6 +628,7 @@ rule MC_graph:
             echo -e "${{genome}}.${{hapid}}\t${{hap}}" >> {params.tmp_dir}/{wildcards.chromosome}.genomes.txt
         done
 
+        # Running MC
         apptainer run {params.app_path}/minigraph-cactus_v2.7.0.sif \
             {params.tmp_dir}/tmp {params.tmp_dir}/{wildcards.chromosome}.genomes.txt \
             --outDir {params.tmp_dir} \
@@ -636,7 +637,10 @@ rule MC_graph:
             --gfa \
             --clip 0 --filter 0
         
-        mv {params.tmp_dir}/$(basename {output.gfa_gz} .gfa.gz).full.gfa.gz {output.gfa_gz}
+        # Converting to GFA 1.0
+        apptainer run --app gfavc {params.app_path}/PanGeTools.sif \
+            --gfa1 {params.tmp_dir}/$(basename {output.gfa_gz} .gfa.gz).full.gfa.gz \
+            --outName {output.gfa_gz}
         """
 
 rule generate_graph_list:
-- 
GitLab


From 4a4e96408685297b13be5f39231780def210e914 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 14:25:12 +0200
Subject: [PATCH 203/310] Fixed Panacus script

---
 Snakefile               | 2 +-
 scripts/getPanacusHG.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 7d89806..93f737a 100644
--- a/Snakefile
+++ b/Snakefile
@@ -866,7 +866,7 @@ rule panacus_stats:
             bash scripts/getPanacusHG.sh \
             -g {input.graph} \
             -r $(basename {params.refname} .fa.gz) \
-            -d data/chrGraphs/{wildcards.chromosome} \
+            -d data/chrGraphs/{wildcards.gtool}.{wildcards.chromosome} \
             -o {output.html} \
             -a {params.app_path} \
             -t {threads} 2>&1 | \
diff --git a/scripts/getPanacusHG.sh b/scripts/getPanacusHG.sh
index d26dda3..b25fc6d 100755
--- a/scripts/getPanacusHG.sh
+++ b/scripts/getPanacusHG.sh
@@ -25,7 +25,7 @@ while getopts "g:r:a:t:d:o:" option; do
 done
 
 # Getting chromosome name
-chrname=$(basename ${gfa} .gfa | cut -d'.' -f2)
+chrname=$(basename ${gfa} .gfa | cut -d'.' -f3)
 ref=$(echo $refname | sed 's/.hap/#/')
 
 # Getting paths in chromosome graph
-- 
GitLab


From 4b197eb82f8f2c411ce07ff1caffa7eeea51e819 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 14:36:47 +0200
Subject: [PATCH 204/310] Fixing more things

- Typo in the shared-content figure name ('sharred' to 'shared')
- Adding the graph tool name to chrGraphs.stats_figs.py output names
---
 Snakefile                       |  4 ++--
 scripts/chrGraphs.stats_figs.py | 19 +++++++++++++------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/Snakefile b/Snakefile
index 93f737a..8b7c7ab 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1001,7 +1001,7 @@ rule create_chrGraphs_figs:
         heatmaps=expand("output/chrGraphs.stats.figs/{{gtool}}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST),
         barplot_mean="output/chrGraphs.stats.figs/{gtool}."+config['name']+".path.decomp.mean.png",
         scatter_mean="output/chrGraphs.stats.figs/{gtool}."+config['name']+".2D.scatter.mean.png",
-        heatmap_diff="output/chrGraphs.stats.figs/{gtool}."+config['name']+".sharred.content.diff.png"
+        heatmap_diff="output/chrGraphs.stats.figs/{gtool}."+config['name']+".shared.content.diff.png"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
@@ -1037,7 +1037,7 @@ def get_report_sections(wildcards):
     sections["heatmaps"] = expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
     sections["barplot_mean"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".path.decomp.mean.png"
     sections["scatter_mean"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".2D.scatter.mean.png"
-    sections["heatmap_diff"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".sharred.content.diff.png"
+    sections["heatmap_diff"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".shared.content.diff.png"
 
     if config["get_ASMs_SyRI"] == "True":
         sections["SyRI_on_ASMs_figs"] = expand(
diff --git a/scripts/chrGraphs.stats_figs.py b/scripts/chrGraphs.stats_figs.py
index 9d78d94..20f8745 100644
--- a/scripts/chrGraphs.stats_figs.py
+++ b/scripts/chrGraphs.stats_figs.py
@@ -41,6 +41,13 @@ arg_parser.add_argument(
     required = True,
     help = "Pangenome name"
     )
+arg_parser.add_argument(
+    "--grapher",
+    "-g",
+    dest = "grapher",
+    required = True,
+    help = "Graph creation tool"
+    )
 arg_parser.add_argument(
     "--reference",
     "-r",
@@ -291,13 +298,13 @@ for pangenome in gData.index.unique("Pangenome.name"):
         get_group_decomp_fig(
             data = gData.loc[pangenome, chrid,:].copy(), 
             title = f"Path composition by groups - {pangenome} - {chrid}",
-            savedir = os.path.join(args.dir, f"{pangenome}.path.decomp.{chrid}.png")
+            savedir = os.path.join(args.dir, f"{args.grapher}.{pangenome}.path.decomp.{chrid}.png")
         )
     ## For the mean across chromosomes 
     get_group_decomp_fig(
         data = gData.groupby("Path.name").sum().copy(), 
         title = "Path composition mean accross chromosomes",
-        savedir = os.path.join(args.dir, f"{pangenome}.path.decomp.mean.png")
+        savedir = os.path.join(args.dir, f"{args.grapher}.{pangenome}.path.decomp.mean.png")
     )
 
 # 2D Scatter Core vs Private
@@ -306,12 +313,12 @@ for pangenome in gData.index.unique("Pangenome.name"):
         get_group_2d_fig(
             data = gData.loc[pangenome, chrid,:].copy(), 
             title = f"Private vs. Core Sequence for Each Graph Path - {pangenome} - {chrid}",
-            savedir = os.path.join(args.dir, f"{pangenome}.2D.scatter.{chrid}.png")
+            savedir = os.path.join(args.dir, f"{args.grapher}.{pangenome}.2D.scatter.{chrid}.png")
         )
     get_group_2d_fig(
         data = gData.groupby("Path.name").mean().copy(), 
         title = "Mean Private vs. Core Sequence for Each Graph Path",
-        savedir = os.path.join(args.dir, f"{pangenome}.2D.scatter.mean.png")
+        savedir = os.path.join(args.dir, f"{args.grapher}.{pangenome}.2D.scatter.mean.png")
     )
 
 # Shared content heatmap
@@ -320,7 +327,7 @@ for pangenome in sData.index.unique("Pangenome.name"):
         get_hm_shared_fig(
             data = sData.loc[pangenome, chrid,:].copy(), 
             title = f"Pairwise Path Comparison - {pangenome} - {chrid}",
-            savedir = os.path.join(args.dir, f"{pangenome}.sharred.content.{chrid}.png")
+            savedir = os.path.join(args.dir, f"{args.grapher}.{pangenome}.sharred.content.{chrid}.png")
         )
 
 # Shared content difference between chromosomes heatmap
@@ -348,5 +355,5 @@ for pangenome in sData.index.unique("Pangenome.name"):
     get_hm_diff_fig(
         dData, 
         title = f"Pairwise Euclidean distance between path comparison matrices - {pangenome}",
-        savedir = os.path.join(args.dir, f"{pangenome}.sharred.content.diff.png")
+        savedir = os.path.join(args.dir, f"{args.grapher}.{pangenome}.shared.content.diff.png")
     )
\ No newline at end of file
-- 
GitLab


From 3779a62f7cc72ca53e87268ba43c3fcf0dadb21a Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 14:41:09 +0200
Subject: [PATCH 205/310] Fixing figure generation

- Actually using the graph tool argument
- Adding the tool name to figure titles
---
 Snakefile                       |  2 +-
 scripts/chrGraphs.stats_figs.py | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/Snakefile b/Snakefile
index 8b7c7ab..3d0deaa 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1017,7 +1017,7 @@ rule create_chrGraphs_figs:
 
         apptainer run {params.app_path}/pan1c-env.sif python scripts/chrGraphs.stats_figs.py \
             --input {input.pathstats} --output_dir $(dirname {output.barplot_mean}) \
-            --panname {params.pan_name} --reference "$ref"
+            --panname {params.pan_name} --reference "$ref" --grapher {wildcards.gtool}
         """
 
 def get_report_sections(wildcards):
diff --git a/scripts/chrGraphs.stats_figs.py b/scripts/chrGraphs.stats_figs.py
index 20f8745..ac459d0 100644
--- a/scripts/chrGraphs.stats_figs.py
+++ b/scripts/chrGraphs.stats_figs.py
@@ -297,13 +297,13 @@ for pangenome in gData.index.unique("Pangenome.name"):
     for chrid in gData.index.unique("Chr.id"):
         get_group_decomp_fig(
             data = gData.loc[pangenome, chrid,:].copy(), 
-            title = f"Path composition by groups - {pangenome} - {chrid}",
+            title = f"Path composition by groups - {args.grapher}.{pangenome} - {chrid}",
             savedir = os.path.join(args.dir, f"{args.grapher}.{pangenome}.path.decomp.{chrid}.png")
         )
     ## For the mean across chromosomes 
     get_group_decomp_fig(
         data = gData.groupby("Path.name").sum().copy(), 
-        title = "Path composition mean accross chromosomes",
+        title = f"Path composition mean accross chromosomes - {args.grapher}.{pangenome}",
         savedir = os.path.join(args.dir, f"{args.grapher}.{pangenome}.path.decomp.mean.png")
     )
 
@@ -312,12 +312,12 @@ for pangenome in gData.index.unique("Pangenome.name"):
     for chrid in gData.index.unique("Chr.id"):
         get_group_2d_fig(
             data = gData.loc[pangenome, chrid,:].copy(), 
-            title = f"Private vs. Core Sequence for Each Graph Path - {pangenome} - {chrid}",
+            title = f"Private vs. Core Sequence for Each Graph Path - {args.grapher}.{pangenome} - {chrid}",
             savedir = os.path.join(args.dir, f"{args.grapher}.{pangenome}.2D.scatter.{chrid}.png")
         )
     get_group_2d_fig(
         data = gData.groupby("Path.name").mean().copy(), 
-        title = "Mean Private vs. Core Sequence for Each Graph Path",
+        title = f"Mean Private vs. Core Sequence for Each Graph Path - {args.grapher}.{pangenome}",
         savedir = os.path.join(args.dir, f"{args.grapher}.{pangenome}.2D.scatter.mean.png")
     )
 
@@ -326,7 +326,7 @@ for pangenome in sData.index.unique("Pangenome.name"):
     for chrid in sData.index.unique("Chr.id"):
         get_hm_shared_fig(
             data = sData.loc[pangenome, chrid,:].copy(), 
-            title = f"Pairwise Path Comparison - {pangenome} - {chrid}",
+            title = f"Pairwise Path Comparison - {args.grapher}.{pangenome} - {chrid}",
             savedir = os.path.join(args.dir, f"{args.grapher}.{pangenome}.sharred.content.{chrid}.png")
         )
 
@@ -354,6 +354,6 @@ for pangenome in sData.index.unique("Pangenome.name"):
     
     get_hm_diff_fig(
         dData, 
-        title = f"Pairwise Euclidean distance between path comparison matrices - {pangenome}",
+        title = f"Pairwise Euclidean distance between path comparison matrices - {args.grapher}.{pangenome}",
         savedir = os.path.join(args.dir, f"{args.grapher}.{pangenome}.shared.content.diff.png")
     )
\ No newline at end of file
-- 
GitLab


From 682ddcc40422ef69a4b349fad0b6308d79c1f064 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 15:07:55 +0200
Subject: [PATCH 206/310] Update Snakefile

---
 Snakefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Snakefile b/Snakefile
index 3d0deaa..0017fc7 100644
--- a/Snakefile
+++ b/Snakefile
@@ -759,6 +759,8 @@ rule aggregate_graphs_stats:
         pan_name=config['name']
     shell:
         """
+        echo "{input.genstats}"
+
         apptainer run {params.app_path}/pan1c-env.sif python scripts/chrGraphs.stats_aggregate.py \
             --input $(dirname {input[0]}) --outputGeneral {output.genstats} \
             --outputPaths {output.pathstats} --panname {params.pan_name} 
-- 
GitLab


From 8aff9a322d7e65f281cf0d8a8d3db18ab676ad2e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 15:19:08 +0200
Subject: [PATCH 207/310] Update Snakefile

---
 Snakefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 0017fc7..f72f61f 100644
--- a/Snakefile
+++ b/Snakefile
@@ -640,7 +640,11 @@ rule MC_graph:
         # Converting to GFA 1.0
         apptainer run --app gfavc {params.app_path}/PanGeTools.sif \
             --gfa1 {params.tmp_dir}/$(basename {output.gfa_gz} .gfa.gz).full.gfa.gz \
-            --outName {output.gfa_gz}
+            --outName "$(dirname {output.gfa_gz})/$(basename {output.gfa_gz} .gz)"
+
+        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
+            -@ {threads} "$(dirname {output.gfa_gz})/$(basename {output.gfa_gz} .gz)"
+        
         """
 
 rule generate_graph_list:
-- 
GitLab


From 56c6d789253d634a2e726be8a13ad63a8465eb19 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 15:25:15 +0200
Subject: [PATCH 208/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index f72f61f..e35e700 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1173,7 +1173,7 @@ rule create_pan1c_report:
         shell("echo '# Chromosome-scale odgi graphs' >> {output.report}")
         for i in range(len(odgi_figs_list)):
             odgi_basename=os.path.basename(odgi_figs_list[i])
-            chr_name=odgi_basename.split('.')[1]
+            chr_name=odgi_basename.split('.')[2]
             
             shell("echo '## {chr_name}' >> {output.report}")
             shell("echo '![{odgi_basename}](./report/{odgi_basename})' >> {output.report}")
-- 
GitLab
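
The index change above follows from the new file naming: report figures are now prefixed with the graph tool, so the chromosome becomes the third dot-separated field of the basename instead of the second. For illustration (names are made up):

    old = "pan.chr01.report.fig.png"         # before the tool prefix
    new = "pan1c.pan.chr01.report.fig.png"   # after the tool prefix was added

    print(old.split(".")[1])   # chr01
    print(new.split(".")[2])   # chr01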


From 4363214c421afca24d83ae6beff2f8d7c3eec1cf Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 15:43:43 +0200
Subject: [PATCH 209/310] Update Snakefile

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index e35e700..1b11c5d 100644
--- a/Snakefile
+++ b/Snakefile
@@ -650,7 +650,7 @@ rule MC_graph:
 rule generate_graph_list:
     # Generate a text file containing all created graphs
     input:
-        gfas=expand('data/chrGraphs/{{gtool}}.'+config['name']+'.{chromosome}.gfa.gz', chromosome=CHRLIST)
+        gfas=expand('data/chrGraphs/{{gtool}}.'+config['name']+'.{chromosome}.tmp.gfa', chromosome=CHRLIST)
     output:
         "data/chrGraphs/graphsList.{gtool}.txt"
     threads: 1
@@ -660,7 +660,7 @@ rule generate_graph_list:
     run:
         with open(output[0], "w") as handle:
             for file in input.gfas:
-                handle.write(file[:-3]+"\n")
+                handle.write(file+"\n")
 
 rule graph_squeeze:
     # Using odgi to merge every subgraphs into a final one
-- 
GitLab


From a394dc07bfd2dca37d427313b01a324304dc97fc Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 12 Sep 2024 15:56:15 +0200
Subject: [PATCH 210/310] Update VCF.stats_figs.py

---
 scripts/VCF.stats_figs.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scripts/VCF.stats_figs.py b/scripts/VCF.stats_figs.py
index a313878..adad390 100644
--- a/scripts/VCF.stats_figs.py
+++ b/scripts/VCF.stats_figs.py
@@ -322,6 +322,8 @@ for genome in sorted(vg["Genome"].unique()):
         names=["TOOL"]
     )
 
+    print(data)
+
     # Creating the figure
     hist_genome(
         data, 
-- 
GitLab


From 050b7f418e8925d5922b8408c6bf72934b929e82 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 13 Sep 2024 11:31:31 +0200
Subject: [PATCH 211/310] Simplifying MC path names

---
 Snakefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Snakefile b/Snakefile
index 1b11c5d..c5a658b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -640,6 +640,7 @@ rule MC_graph:
         # Converting to GFA 1.0
         apptainer run --app gfavc {params.app_path}/PanGeTools.sif \
             --gfa1 {params.tmp_dir}/$(basename {output.gfa_gz} .gfa.gz).full.gfa.gz \
+            --simplify \
             --outName "$(dirname {output.gfa_gz})/$(basename {output.gfa_gz} .gz)"
 
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-- 
GitLab


From 8796d06f4236d59fd1e224b336bfb88ed7eb8466 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 13 Sep 2024 11:55:18 +0200
Subject: [PATCH 212/310] Fixed stats aggregation bug

---
 Snakefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Snakefile b/Snakefile
index c5a658b..8571e4b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -707,8 +707,8 @@ rule graph_stats:
     input:
         graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.gfa.gz'
     output:
-        genstats="output/stats/chrGraphs/{gtool}."+config['name']+".{chromosome}.general.stats.tsv",
-        pathstats="output/stats/chrGraphs/{gtool}."+config['name']+".{chromosome}.path.stats.tsv"
+        genstats="output/stats/chrGraphs.{gtool}/{gtool}."+config['name']+".{chromosome}.general.stats.tsv",
+        pathstats="output/stats/chrGraphs.{gtool}/{gtool}."+config['name']+".{chromosome}.path.stats.tsv"
     threads: 4
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
@@ -752,7 +752,7 @@ rule graph_figs:
 rule aggregate_graphs_stats:
     # Reading and merging all stats files from chromosome graphs into a .tsv.
     input:
-        genstats=expand("output/stats/chrGraphs/{{gtool}}."+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST)
+        genstats=expand("output/stats/chrGraphs.{{gtool}}/{{gtool}}."+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST)
     output:
         genstats="output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv",
         pathstats="output/stats/{gtool}."+config['name']+".chrGraph.path.stats.tsv"
-- 
GitLab


From 1278cbfe50da731cb3b29d32e8dcf7398c56a930 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 13 Sep 2024 13:33:25 +0200
Subject: [PATCH 213/310] Fixed VCF_figs path in reports

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 8571e4b..4564ed4 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1225,7 +1225,7 @@ rule create_pan1c_report:
                 else :
                     general_fig = None
 
-                shell("echo '![{basename}](./vcf.figs/{basename})' >> {output.report}")
+                shell("echo '![{basename}](./{wildcards.gtool}.vcf.figs/{basename})' >> {output.report}")
                 shell("echo '' >> {output.report}")
 
         # Converting to HTML
-- 
GitLab


From 0818edb20f77688e7972828189915037e8a0e98b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 25 Sep 2024 10:18:31 +0200
Subject: [PATCH 214/310] Added Quast assembly stats TSV-to-JSON converter

---
 Snakefile                      |  8 +++++++-
 scripts/asmStats.tsv_2_json.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 scripts/asmStats.tsv_2_json.py

diff --git a/Snakefile b/Snakefile
index 4564ed4..e0115f1 100644
--- a/Snakefile
+++ b/Snakefile
@@ -150,7 +150,8 @@ rule quast_stats:
         fas=expand("data/haplotypes/{haplotype}.fa.gz", haplotype=SAMPLES_NOREF),
         ref="data/haplotypes/"+config['reference']
     output:
-        report="output/"+config['name']+".quast.report.html"
+        report="output/"+config['name']+".quast.report.html",
+        json="output/report_data/"+config['name']+".assembly_stats.quast.json"
     threads: 16
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -181,6 +182,11 @@ rule quast_stats:
             tee {log.cmd}
 
         mv {params.tmp_dir}/report.html {output.report}
+        
+        apptainer run {params.app_path}/pan1c-env.sif python scripts/asmStats.tsv_2_json.py \
+            --input {params.tmp_dir}/transposed_report.tsv \
+            --output {output.json}
+
         rm -r {params.tmp_dir}
         """
 
diff --git a/scripts/asmStats.tsv_2_json.py b/scripts/asmStats.tsv_2_json.py
new file mode 100644
index 0000000..d975ebf
--- /dev/null
+++ b/scripts/asmStats.tsv_2_json.py
@@ -0,0 +1,31 @@
+"""
+Converter from Quast assembly stats TSV to json
+
+@author: alexis.mergez@inrae.fr
+@version: 1.0
+"""
+
+import os
+import argparse
+import pandas as pd
+
+## Arguments
+arg_parser = argparse.ArgumentParser(description='Quast ASM stats to JSON')
+arg_parser.add_argument(
+    "--input",
+    "-i",
+    dest = "input",
+    required = True,
+    help = "Quast transposed_report TSV"
+    )
+arg_parser.add_argument(
+    "--output",
+    dest = "output",
+    required = True,
+    help = "Output path"
+    )
+args = arg_parser.parse_args()
+
+data=pd.read_csv(args.input, sep="\t", index_col = 0)
+
+data.to_json(args.output, orient="index")
\ No newline at end of file
-- 
GitLab
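
The converter added above leans on pandas to pivot Quast's transposed_report.tsv, indexed by assembly name, into a JSON object keyed by that name. A small sketch of what orient="index" produces, using made-up values:

    import pandas as pd

    data = pd.DataFrame(
        {"Total length": [12000000, 11800000], "N50": [905000, 872000]},
        index=["genomeA.hap1", "genomeB.hap1"],   # placeholder assembly names
    )

    # orient="index" keys the JSON by row label:
    # {"genomeA.hap1": {"Total length": 12000000, "N50": 905000}, ...}
    print(data.to_json(orient="index"))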


From f49e4659db55bb1bc3bc637975db732c178ab625 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 2 Oct 2024 11:42:57 +0200
Subject: [PATCH 215/310] Started Pan1c-QC JSON creation

---
 Snakefile                | 14 +++++++++++--
 scripts/getTags.py       | 10 +++++++++
 scripts/pan1c_QC_json.py | 45 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 67 insertions(+), 2 deletions(-)
 create mode 100644 scripts/pan1c_QC_json.py

diff --git a/Snakefile b/Snakefile
index e0115f1..84c8557 100644
--- a/Snakefile
+++ b/Snakefile
@@ -782,7 +782,8 @@ rule get_graph_tags:
     input:
         "config.yaml"
     output:
-        "output/pan1c."+config['name']+".gfa.metadata"
+        md="output/pan1c."+config['name']+".gfa.metadata",
+        json="output/report_data/"+config['name']+".tags.json"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
@@ -793,7 +794,7 @@ rule get_graph_tags:
     shell:
         """
         python scripts/getTags.py \
-            --appdir {params.app_path} --config-file config.yaml > {output}
+            --appdir {params.app_path} --config-file config.yaml --json {output.json} > {output.md}
         """
 
 rule pggb_input_stats:
@@ -1033,6 +1034,15 @@ rule create_chrGraphs_figs:
             --panname {params.pan_name} --reference "$ref" --grapher {wildcards.gtool}
         """
 
+rule Pan1cQC_JSON:
+    input:
+        quast="output/report_data/"+config['name']+".assembly_stats.quast.json",
+        tags="output/report_data/"+config['name']+".tags.json",
+        gfastats="output/stats/{gtool}."+config['name']+".chrGraph.path.stats.tsv",
+    output:
+        "output/report_data/{gtool}."+config['name']+".json"
+    threads: 1
+
 def get_report_sections(wildcards):
     """
     Return 'create_pan1c_report' optional inputs to add them to the final report.
diff --git a/scripts/getTags.py b/scripts/getTags.py
index 2cd7ea4..4eb91db 100644
--- a/scripts/getTags.py
+++ b/scripts/getTags.py
@@ -28,6 +28,12 @@ arg_parser.add_argument(
     required = True,
     help = "Pan1c config file"
     )
+arg_parser.add_argument(
+    "--json",
+    dest = "json",
+    required = True,
+    help = "JSON output"
+    )
 args = arg_parser.parse_args()
 
 ## Main script
@@ -117,6 +123,10 @@ for key in labels.keys():
     if ".Version" in key:
         tags["pan1c-box"][key.lower()] = labels[key]
 
+# Exporting tags to JSON
+with open(args.json, "w") as handle:
+    json.dump(tags, handle, indent=6)
+
 ## Exporting tags to stdout
 print("#\tThis graph have been created using the Pan1c workflow (https://forgemia.inra.fr/alexis.mergez/pan1c)\n#")
 print("#\tTool versions and commands\n#")
diff --git a/scripts/pan1c_QC_json.py b/scripts/pan1c_QC_json.py
new file mode 100644
index 0000000..3bd1ed7
--- /dev/null
+++ b/scripts/pan1c_QC_json.py
@@ -0,0 +1,45 @@
+"""
+JSON creator for Pan1c-QC
+
+@author: alexis.mergez@inrae.fr
+@version: 1.0
+"""
+
+import os
+import argparse
+import pandas as pd
+import json
+
+## Arguments
+arg_parser = argparse.ArgumentParser(description='JSON for Pan1c-QC')
+arg_parser.add_argument(
+    "--quast",
+    dest = "quast",
+    required = True,
+    help = "Quast JSON"
+    )
+arg_parser.add_argument(
+    "--gfastats",
+    dest = "gfastats",
+    required = True,
+    help = "chrGraphs aggregated stats"
+    )
+arg_parser.add_argument(
+    "--tags",
+    dest = "tags",
+    required = True,
+    help = "Graph metadata and tags in JSON"
+    )
+arg_parser.add_argument(
+    "--output",
+    dest = "output",
+    required = True,
+    help = "Output path"
+    )
+args = arg_parser.parse_args()
+
+# Reading tables
+quast = pd.read_csv(args.quast, sep="\t", index_col = 0)
+gfastats = pd.read_csv(args.gfastats, sep="\t")
+
+data.to_json(args.output, orient="index")
\ No newline at end of file
-- 
GitLab


From c3afcd020b198527698383ebadbf94c6e363b7c2 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 2 Oct 2024 17:40:03 +0200
Subject: [PATCH 216/310] Restructured getTags.py output into sections

---
 scripts/getTags.py | 75 ++++++++++++++++++++++++++++++----------------
 1 file changed, 49 insertions(+), 26 deletions(-)

diff --git a/scripts/getTags.py b/scripts/getTags.py
index 4eb91db..83198a2 100644
--- a/scripts/getTags.py
+++ b/scripts/getTags.py
@@ -45,7 +45,10 @@ Tags dictionnary :
 tags = {}
 
 ### Pan1c-workflow section
-tags["Pan1c"] = {}
+tags["Pangenome"] = {}
+tags["Parameters"] = {}
+tags["Tools"] = {}
+tags["Apptainer"] = {}
 
 # Using git to get the version of the Pan1c workflow 
 _output = subprocess.run(
@@ -55,8 +58,10 @@ _output = subprocess.run(
 ).stdout[:-1]
 
 # Adding tags
-tags["Pan1c"]["pan1c.version"] = _output
-tags["Pan1c"]["pan1c.home"] = "https://forgemia.inra.fr/alexis.mergez/pan1c"
+tags["Pan1c"] = {
+    "version": _output,
+    "pan1c.home": "https://forgemia.inra.fr/alexis.mergez/pan1c"
+}
 
 # Getting the parameters used in the workflow from the config file
 with open(args.config, 'r') as handle:
@@ -64,11 +69,14 @@ with open(args.config, 'r') as handle:
         line = line[:-1]
         if len(line) and line[0] != "#": # parameter
             _split = line.split(": ")
-            tags["Pan1c"][_split[0]] = _split[-1]
 
-### PanGeTools section
-tags["pangetools"] = {}
+            if _split[0] not in ["name", "reference"]:
+                tags["Parameters"][_split[0]] = _split[-1]
+            
+            else : 
+                tags["Pangenome"][_split[0].capitalize()] = _split[-1]
 
+### PanGeTools section
 # Reading the apps versions from the apptainer tags
 _output = subprocess.run(
     ["apptainer", "inspect", "-j", f"{args.appdir}/PanGeTools.sif"],
@@ -77,17 +85,19 @@ _output = subprocess.run(
 ).stdout
 _output = json.loads(_output)
 labels = _output['data']['attributes']['labels']
-tags["pangetools"]["image.version"] = labels['Version']
-tags["pangetools"]["image.home"] = labels['about.home']
 
-# Adding app versions to the tag dictionnary
+# Populating Apptainer section
+tags["Apptainer"]["pangetools"] = {
+    "version": labels['Version'],
+    "home": labels['about.home']
+}
+
+# Adding app versions to the Tool section
 for key in labels.keys():
     if ".Version" in key:
-        tags["pangetools"][key.lower()] = labels[key]
+        tags["Tools"][key.lower().split(".")[0]] = labels[key]
 
 ### Pan1c-Env section
-tags["pan1c-env"] = {}
-
 # Reading the apps versions from the apptainer tags
 _output = subprocess.run(
     ["apptainer", "inspect", "-j", f"{args.appdir}/pan1c-env.sif"],
@@ -96,17 +106,19 @@ _output = subprocess.run(
 ).stdout
 _output = json.loads(_output)
 labels = _output['data']['attributes']['labels']
-tags["pan1c-env"]["image.version"] = labels['Version']
-tags["pan1c-env"]["image.home"] = labels['about.home']
+
+# Populating Apptainer section
+tags["Apptainer"]["pan1c-env"] = {
+    "version": labels['Version'],
+    "home": labels['about.home']
+}
 
 # Adding app versions to the tag dictionnary
 for key in labels.keys():
     if ".Version" in key:
-        tags["pan1c-env"][key.lower()] = labels[key]
+        tags["Tools"][key.lower().split(".")[0]] = labels[key]
 
 ## Pan1c-Box section
-tags["pan1c-box"] = {}
-
 # Reading the apps versions from the apptainer tags
 _output = subprocess.run(
     ["apptainer", "inspect", "-j", f"{args.appdir}/pan1c-box.sif"],
@@ -115,13 +127,17 @@ _output = subprocess.run(
 ).stdout
 _output = json.loads(_output)
 labels = _output['data']['attributes']['labels']
-tags["pan1c-box"]["image.version"] = labels['Version']
-tags["pan1c-box"]["image.home"] = labels['about.home']
+
+# Populating Apptainer section
+tags["Apptainer"]["pan1c-box"] = {
+    "version": labels['Version'],
+    "home": labels['about.home']
+}
 
 # Adding app versions to the tag dictionnary
 for key in labels.keys():
     if ".Version" in key:
-        tags["pan1c-box"][key.lower()] = labels[key]
+        tags["Tools"][key.lower().split(".")[0]] = labels[key]
 
 # Exporting tags to JSON
 with open(args.json, "w") as handle:
@@ -130,9 +146,16 @@ with open(args.json, "w") as handle:
 ## Exporting tags to stdout
 print("#\tThis graph have been created using the Pan1c workflow (https://forgemia.inra.fr/alexis.mergez/pan1c)\n#")
 print("#\tTool versions and commands\n#")
-for first_elem in tags.keys():
-    print(f'#\t-- {first_elem} --')
-    for label in tags[first_elem].keys():
-        print(f"#\t{first_elem}\t{label}: {tags[first_elem][label]}")
-    print('#')
-    
+for section, svalues in tags.items():
+    print(f'#\t-- {section} --')
+
+    if section == "Apptainer":
+        for image_name, image_info in svalues.items():
+            for key, value in image_info.items():
+                print(f"#\t{image_name}\t{key}: {value}")
+        print('#')
+
+    else :
+        for key, value in svalues.items():
+            print(f"#\t{key}: {value}")
+        print('#')
-- 
GitLab
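
The reorganised getTags.py above groups metadata under Pangenome, Parameters, Tools and Apptainer, next to the Pan1c entry. A condensed sketch of the resulting structure and of the stdout header it produces (all values are placeholders):

    tags = {
        "Pan1c": {"version": "vX.Y", "pan1c.home": "https://forgemia.inra.fr/alexis.mergez/pan1c"},
        "Pangenome": {"Name": "example", "Reference": "sampleA.hap1.fa.gz"},
        "Parameters": {"some.parameter": "value"},
        "Tools": {"odgi": "0.8"},                           # versions harvested from the image labels
        "Apptainer": {"pangetools": {"version": "X.Y", "home": "https://example.org"}},
    }

    for section, svalues in tags.items():
        print(f"#\t-- {section} --")
        if section == "Apptainer":                          # one extra nesting level: image -> key -> value
            for image_name, image_info in svalues.items():
                for key, value in image_info.items():
                    print(f"#\t{image_name}\t{key}: {value}")
        else:
            for key, value in svalues.items():
                print(f"#\t{key}: {value}")
        print("#")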


From 2cf7ac4015b4b1f1e2443909833517b3eeb21885 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 2 Oct 2024 17:44:01 +0200
Subject: [PATCH 217/310] Fixed "'" in params

---
 scripts/getTags.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/getTags.py b/scripts/getTags.py
index 83198a2..8aefdeb 100644
--- a/scripts/getTags.py
+++ b/scripts/getTags.py
@@ -71,10 +71,10 @@ with open(args.config, 'r') as handle:
             _split = line.split(": ")
 
             if _split[0] not in ["name", "reference"]:
-                tags["Parameters"][_split[0]] = _split[-1]
+                tags["Parameters"][_split[0]] = _split[-1].replace("'", "")
             
             else : 
-                tags["Pangenome"][_split[0].capitalize()] = _split[-1]
+                tags["Pangenome"][_split[0].capitalize()] = _split[-1].replace("'", "")
 
 ### PanGeTools section
 # Reading the apps versions from the apptainer tags
-- 
GitLab
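
Because the config is read line by line rather than through a YAML parser, quoted values keep their literal quotes after the `": "` split; this patch strips them when filling the Parameters and Pangenome sections. A small sketch of that parsing step on a hypothetical config line:

    tags = {"Pangenome": {}, "Parameters": {}}

    line = "some.parameter: 'quoted value'"             # hypothetical config line
    if line and not line.startswith("#"):
        key, _, value = line.partition(": ")
        value = value.replace("'", "")                  # drop the quotes kept by the naive split
        if key in ("name", "reference"):
            tags["Pangenome"][key.capitalize()] = value
        else:
            tags["Parameters"][key] = value
    # tags["Parameters"]["some.parameter"] == "quoted value"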


From 3e179d01c2b208208d53dd190f55919a8526fd7f Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 3 Oct 2024 10:08:37 +0200
Subject: [PATCH 218/310] Temporary fix to alloc issue with BGZIP

---
 scripts/ragtagChromInfer.sh | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh
index 05085dc..53c278b 100755
--- a/scripts/ragtagChromInfer.sh
+++ b/scripts/ragtagChromInfer.sh
@@ -51,6 +51,8 @@ grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \
 mv $tmpdir/${sample}.ragtagged.fa $output
 
 # Compressing temporary files
-tar --remove-files -cf $tmpdir.tar $tmpdir
-apptainer run --app bgzip $appdir/PanGeTools.sif \
-    -@ $threads $tmpdir.tar
+#tar --remove-files -cf $tmpdir.tar $tmpdir
+#apptainer run --app bgzip $appdir/PanGeTools.sif \
+#    -@ $threads $tmpdir.tar
+
+rm -r $tmpdir
\ No newline at end of file
-- 
GitLab


From 1ea9dabd0a1e502fdff121b9a0de64203b151994 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 3 Oct 2024 10:43:06 +0200
Subject: [PATCH 219/310] Trying to fix

---
 scripts/ragtagChromInfer.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh
index 53c278b..9c3cdbb 100755
--- a/scripts/ragtagChromInfer.sh
+++ b/scripts/ragtagChromInfer.sh
@@ -36,10 +36,12 @@ ref=$(echo $fullref | sed 's/.hap/#/')
 mkdir -p $tmpdir
 
 # Running ragtag scaffold
+echo -e "\nRunning RagTag\n"
 apptainer run $appdir/pan1c-env.sif ragtag.py scaffold \
     --mm2-params "$mm2params -t $threads" $rtcommand -o $tmpdir $inputref $inputquery 2>&1
 
 # Renaming sequence according to naming scheme
+echo -e "\nRenaming sequences\n"
 grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \
     sed "s/${ref}#chr\([^_]*\)_RagTag/${hapID}#chr\1/g" > $tmpdir/${sample}.ragtagged.fa
 
@@ -48,6 +50,7 @@ grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \
 #     -@ $threads $tmpdir/${sample}.ragtagged.fa
 
 # Moving fa.gz to output dir
+echo -e "\nMoving final file\n"
 mv $tmpdir/${sample}.ragtagged.fa $output
 
 # Compressing temporary files
@@ -55,4 +58,5 @@ mv $tmpdir/${sample}.ragtagged.fa $output
 #apptainer run --app bgzip $appdir/PanGeTools.sif \
 #    -@ $threads $tmpdir.tar
 
+echo -e "\nRemoving tmp dir\n"
 rm -r $tmpdir
\ No newline at end of file
-- 
GitLab


From 1f38c054f935b95817a58251b73f5c13a95ab4cf Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 3 Oct 2024 11:12:49 +0200
Subject: [PATCH 220/310] Converging on error

---
 Snakefile | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/Snakefile b/Snakefile
index 84c8557..cb5e5cd 100644
--- a/Snakefile
+++ b/Snakefile
@@ -138,10 +138,10 @@ rule ragtag_scaffolding:
             -o {output.fa} 2>&1 | \
             tee {log.cmd}
 
-        if [[ -z $(grep '[^[:space:]]' {output.fa}) ]] ; then
-            echo "Error : Empty final fasta"
-            exit 1
-        fi
+        #if [[ -z $(grep '[^[:space:]]' {output.fa}) ]] ; then
+        #    echo "Error : Empty final fasta"
+        #    exit 1
+        #fi
         """
 
 rule quast_stats:
@@ -1042,6 +1042,7 @@ rule Pan1cQC_JSON:
     output:
         "output/report_data/{gtool}."+config['name']+".json"
     threads: 1
+    
 
 def get_report_sections(wildcards):
     """
-- 
GitLab
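
The guard disabled above fails the rule when the scaffolded FASTA contains nothing but whitespace. For reference, the same check expressed in Python (the path is illustrative; the workflow itself uses the bash/grep form shown in the diff):

    def is_effectively_empty(path: str) -> bool:
        """True when the file holds no non-whitespace character, mirroring grep '[^[:space:]]'."""
        with open(path) as handle:
            return not any(line.strip() for line in handle)

    if is_effectively_empty("sampleA.hap1.ragtagged.fa"):   # illustrative path
        raise SystemExit("Error : Empty final fasta")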


From 69e930b58ba89c25ebf63439cdc90cce16dc75f1 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 3 Oct 2024 11:47:50 +0200
Subject: [PATCH 221/310] Re-added compression of tmp for RagTag

---
 scripts/ragtagChromInfer.sh | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh
index 9c3cdbb..c163697 100755
--- a/scripts/ragtagChromInfer.sh
+++ b/scripts/ragtagChromInfer.sh
@@ -54,9 +54,7 @@ echo -e "\nMoving final file\n"
 mv $tmpdir/${sample}.ragtagged.fa $output
 
 # Compressing temporary files
-#tar --remove-files -cf $tmpdir.tar $tmpdir
-#apptainer run --app bgzip $appdir/PanGeTools.sif \
-#    -@ $threads $tmpdir.tar
-
-echo -e "\nRemoving tmp dir\n"
-rm -r $tmpdir
\ No newline at end of file
+echo -e "\nCompressing tmp dir\n"
+tar --remove-files -cf $tmpdir.tar $tmpdir
+apptainer run --app bgzip $appdir/PanGeTools.sif \
+    -@ $threads $tmpdir.tar
-- 
GitLab


From f566a5fdfa5753c079f9eed801020addaa545078 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 3 Oct 2024 14:16:20 +0200
Subject: [PATCH 222/310] Adding Assembly JSON report for Pan1c QC

---
 Snakefile                                     |  64 ++++++--
 scripts/asm.pan1c_QC.py                       | 149 ++++++++++++++++++
 scripts/pan1c_QC_json.py                      |  45 ------
 ...tats.tsv_2_json.py => quast.tsv_2_json.py} |   0
 4 files changed, 202 insertions(+), 56 deletions(-)
 create mode 100644 scripts/asm.pan1c_QC.py
 delete mode 100644 scripts/pan1c_QC_json.py
 rename scripts/{asmStats.tsv_2_json.py => quast.tsv_2_json.py} (100%)

diff --git a/Snakefile b/Snakefile
index cb5e5cd..7e10421 100644
--- a/Snakefile
+++ b/Snakefile
@@ -74,6 +74,7 @@ def which_analysis():
             analysis_inputs.append(
                 expand("output/{gtool}."+config['name']+".report.md", gtool=graph_tools)
             )
+            analysis_inputs.append("output/report_data/"+config['name']+".assembly.json")
     if config["get_VCF"] == "True": # VCF from the final graph against the "reference"
         analysis_inputs.append(
             expand("output/{gtool}.vcf.figs", gtool=graph_tools)
@@ -183,7 +184,7 @@ rule quast_stats:
 
         mv {params.tmp_dir}/report.html {output.report}
         
-        apptainer run {params.app_path}/pan1c-env.sif python scripts/asmStats.tsv_2_json.py \
+        apptainer run {params.app_path}/pan1c-env.sif python scripts/quast.tsv_2_json.py \
             --input {params.tmp_dir}/transposed_report.tsv \
             --output {output.json}
 
@@ -353,6 +354,57 @@ rule SyRI_on_ASM_wfm:
         rm -r $dir
         """
 
+def asm_json_inputs(wildcards):
+    sections = dict()
+
+    sections["quast"] = "output/report_data/"+config['name']+".assembly_stats.quast.json"
+    sections["fai"] = expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai', chromosome=CHRLIST)
+
+    if config["get_contig_pos"] == "True":
+        sections["contig_pos"] = expand(
+            "output/chr.contig/{chromosome}.contig.png",
+            chromosome=CHRLIST
+        )
+
+    if config["get_ASMs_SyRI"] == "True":
+        sections["SyRI_on_ASMs_figs"] = expand(
+            "output/asm.syri.figs/"+config['name']+".{haplotype}.syri.mm2.png", 
+            haplotype=SAMPLES_NOREF
+        )
+
+    if config["get_chrInputs_SyRI"] == "True":
+        sections["SyRI_on_chrInputs_figs"] = expand(
+            "output/chrInput.syri.figs/"+config['name']+".{chromosome}.syri.png", 
+            chromosome=CHRLIST
+        )
+
+
+rule asm_json:
+    # Produce the Assembly JSON for Pan1c QC
+    input:
+        unpack(asm_json_inputs)
+    output:
+        json="output/report_data/"+config['name']+".assembly.json"
+    threads: 1
+    resources:
+        mem_mb = lambda wildcards, threads: threads * 8000
+    params:
+        app_path=config["app.path"],
+        pan_name=config["name"],
+        ref=config['reference'],
+        add_ASMs_SyRI=config['get_ASMs_SyRI'],
+        add_chrInputs_SyRI=config['get_chrInputs_SyRI'],
+        add_contig_pos=config['get_contig_pos'],
+        tmp_dir="output/quast"
+    run:
+        command = ["--output {output.json} --quast {input.quast} --fai {input.fai} --name {params.pan_name} --ref {params.ref_name}"]
+        if params.add_ASMs_SyRI == "True": command.append("--syri_asm")
+        if params.add_chrInputs_SyRI == "True": command.append("--syri_chr")
+        if params.add_contig_pos == "True": command.append("--contig_pos")
+
+        command = " ".join(command)
+        shell("apptainer run {params.app_path}/pan1c-env.sif python scripts/asm.pan1c_QC.py {command}")
+
 """
 Core section : Running PGGB
 """
@@ -1034,16 +1086,6 @@ rule create_chrGraphs_figs:
             --panname {params.pan_name} --reference "$ref" --grapher {wildcards.gtool}
         """
 
-rule Pan1cQC_JSON:
-    input:
-        quast="output/report_data/"+config['name']+".assembly_stats.quast.json",
-        tags="output/report_data/"+config['name']+".tags.json",
-        gfastats="output/stats/{gtool}."+config['name']+".chrGraph.path.stats.tsv",
-    output:
-        "output/report_data/{gtool}."+config['name']+".json"
-    threads: 1
-    
-
 def get_report_sections(wildcards):
     """
     Return 'create_pan1c_report' optional inputs to add them to the final report.
diff --git a/scripts/asm.pan1c_QC.py b/scripts/asm.pan1c_QC.py
new file mode 100644
index 0000000..6e1b0e9
--- /dev/null
+++ b/scripts/asm.pan1c_QC.py
@@ -0,0 +1,149 @@
+"""
+Assembly JSON creator for Pan1c-QC
+
+@author: alexis.mergez@inrae.fr
+@version: 1.0
+"""
+
+import os
+import argparse
+import pandas as pd
+import json
+
+## Arguments
+arg_parser = argparse.ArgumentParser(description='Assembly JSON for Pan1c-QC')
+arg_parser.add_argument(
+    "--quast",
+    dest = "quast",
+    required = True,
+    help = "Quast JSON"
+    )
+arg_parser.add_argument(
+    "--fai",
+    dest = "fai",
+    required = True,
+    help = "concatenated chrInputs fasta index"
+    )
+arg_parser.add_argument(
+    "--name",
+    dest = "name",
+    required = True,
+    help = "Pangenome name"
+    )
+arg_parser.add_argument(
+    "--ref",
+    dest = "ref",
+    required = True,
+    help = "Workflow reference assembly name"
+    )
+arg_parser.add_argument(
+    '--syri_asm',
+    action="store_true",
+    dest = "syri_asm",
+    help = "Add path to Syri figures for haplotypes"
+)
+arg_parser.add_argument(
+    '--syri_chr',
+    action="store_true",
+    dest = "syri_chr",
+    help = "Add path to Syri figures for chrInputs"
+)
+arg_parser.add_argument(
+    '--contig_pos',
+    action="store_true",
+    dest = "contig_pos",
+    help = "Add path to contig pos figures"
+)
+arg_parser.add_argument(
+    "--output",
+    dest = "output",
+    required = True,
+    help = "Output path"
+    )
+args = arg_parser.parse_args()
+
+## Reading inputs
+# Quast JSON
+with open(args.quast, 'r') as file:
+    quast = json.load(file)
+
+
+## Preparing Quast stats
+qdf = pd.DataFrame.from_dict(quast, orient='index')
+
+# Renaming haplotypes according to PanSN
+rename_dict = { name: name.replace(".hap", "#").replace("broken", "contig") for name in qdf.index }
+for key, name in rename_dict.items():
+    if len(name.split("_contig")) != 2:
+        rename_dict[key] = f"{name}_scaffold"
+qdf.rename(index = rename_dict, inplace = True)
+
+# Computing deviation from mean length and reference length
+mean_length = qdf["Total length"].mean()
+qdf["Deviation from mean length (%)"] = round((qdf["Total length"] - mean_length)*100 / mean_length, 2) 
+qdf["Deviation from ref length (%)"] = round((qdf["Total length"] - qdf["Reference length"])*100 / qdf["Reference length"], 2)
+
+# Adding missing broken columns
+for hap in qdf.index:
+    decomp = hap.rsplit("_", maxsplit=1)
+    if decomp[-1] == "scaffold" and f"{decomp[0]}_contig" not in qdf.index:
+        qdf.loc[f"{decomp[0]}_contig"] = qdf.loc[hap]
+
+qdf = qdf.reset_index().rename(columns = {'index': 'raw_hap'})
+qdf[["Hap", "Type"]] = qdf["raw_hap"].str.rsplit("_", n=1, expand=True)
+
+# Removing unnecessary columns
+qdf.drop(columns=[col for col in qdf.columns if col not in ['Hap', 'Type', '# contigs', 'Total length', 'GC (%)', 'N50', 'L50', "# N's per 100 kbp", "Deviation from mean length (%)", "Deviation from ref length (%)"]], inplace=True) 
+qdf = qdf.sort_values(["Type", "Hap"]).set_index(["Type", "Hap"])
+
+## Preparing chromosome length table
+cdf = pd.read_csv(args.fai, sep="\t", header = None, usecols=[0,1], names=["Full_hap", "Length"])
+
+# Splitting full haplotype name and indexing based on chromosome and haplotype id
+cdf[["Hap", "Chr"]] = cdf["Full_hap"].str.rsplit("#", n=1, expand=True)
+cdf = cdf.set_index(["Chr", "Hap"]).drop(columns = "Full_hap")
+
+# Computing deviation from mean length and reference length
+cdf["Deviation from mean length (%)"] = cdf.groupby(level=0)["Length"].transform(
+    lambda x: round((x - x.mean()) * 100 / x.mean(), 2)
+)
+
+_dev_from_ref_ = []
+for index, row in cdf.iterrows():
+    ref_value = cdf.loc[(index[0], args.ref), "Length"]
+    _dev_from_ref_.append(
+        round((row["Length"] - ref_value)*100/ref_value, 2)
+    )
+
+cdf["Deviation from ref length (%)"] = _dev_from_ref_
+
+
+## Creating Assembly JSON
+assembly = { 
+    "Quast": {
+        "Scaffold": qdf.loc["scaffold",:].to_dict(orient="index"),
+        "Contig": qdf.loc["contig",:].to_dict(orient="index"),
+        "Path": f"data/{args.name}.quast.report.html"
+    },
+    "Chrom_length": {
+        chrid: cdf.loc[chrid].to_dict(orient='index') for chrid in cdf.index.get_level_values(0).unique()
+    }
+}
+
+if args.contig_pos:
+    assembly["Contig_pos"] = {
+        chrid: f"data/chr.contig/{chrid}.contig.png" for chrid in cdf.index.get_level_values(0).unique()
+    }
+
+if args.syri_asm:
+    assembly["Syri_hap"] = {
+        hap: f"data/asm.syri.figs/{args.name}.{hap.replace('#', '.hap')}.syri.mm2.png" for hap in cdf.index.get_level_values(1).unique()
+    }
+
+if args.syri_chr:
+    assembly["Syri_chr"] = {
+        chrid: f"data/chrInput.syri.figs/{args.name}.{chrid}.syri.png" for chrid in cdf.index.get_level_values(0).unique()
+    }
+
+with open(args.output, "w") as handle:
+    json.dump(assembly, handle, indent=6)
diff --git a/scripts/pan1c_QC_json.py b/scripts/pan1c_QC_json.py
deleted file mode 100644
index 3bd1ed7..0000000
--- a/scripts/pan1c_QC_json.py
+++ /dev/null
@@ -1,45 +0,0 @@
-"""
-JSON creator for Pan1c-QC
-
-@author: alexis.mergez@inrae.fr
-@version: 1.0
-"""
-
-import os
-import argparse
-import pandas as pd
-import json
-
-## Arguments
-arg_parser = argparse.ArgumentParser(description='JSON for Pan1c-QC')
-arg_parser.add_argument(
-    "--quast",
-    dest = "quast",
-    required = True,
-    help = "Quast JSON"
-    )
-arg_parser.add_argument(
-    "--gfastats",
-    dest = "gfastats",
-    required = True,
-    help = "chrGraphs aggregated stats"
-    )
-arg_parser.add_argument(
-    "--tags",
-    dest = "tags",
-    required = True,
-    help = "Graph metadata and tags in JSON"
-    )
-arg_parser.add_argument(
-    "--output",
-    dest = "output",
-    required = True,
-    help = "Output path"
-    )
-args = arg_parser.parse_args()
-
-# Reading tables
-quast = pd.read_csv(args.quast, sep="\t", index_col = 0)
-gfastats = pd.read_csv(args.gfastats, sep="\t")
-
-data.to_json(args.output, orient="index")
\ No newline at end of file
diff --git a/scripts/asmStats.tsv_2_json.py b/scripts/quast.tsv_2_json.py
similarity index 100%
rename from scripts/asmStats.tsv_2_json.py
rename to scripts/quast.tsv_2_json.py
-- 
GitLab
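
In asm.pan1c_QC.py above, the chromosome-length table gets two derived columns: deviation from the mean length of that chromosome across haplotypes, and deviation from the reference haplotype. A toy example of the groupby/transform step with invented names and lengths:

    import pandas as pd

    cdf = pd.DataFrame(
        {"Length": [100, 110, 90, 200, 210, 190]},
        index=pd.MultiIndex.from_product(
            [["chr01", "chr02"], ["ref#1", "sampleA#1", "sampleB#1"]], names=["Chr", "Hap"]
        ),
    )

    # deviation from the per-chromosome mean (level 0 of the index is the chromosome)
    cdf["Deviation from mean length (%)"] = cdf.groupby(level=0)["Length"].transform(
        lambda x: round((x - x.mean()) * 100 / x.mean(), 2)
    )

    # deviation from the reference haplotype of the same chromosome
    ref_len = cdf.xs("ref#1", level="Hap")["Length"]
    cdf["Deviation from ref length (%)"] = [
        round((row["Length"] - ref_len[chrom]) * 100 / ref_len[chrom], 2)
        for (chrom, _), row in cdf.iterrows()
    ]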


From 4697c48645892d1a0afbbae045c3fa7080ea1492 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 3 Oct 2024 14:20:14 +0200
Subject: [PATCH 223/310] Fixed asm_json_inputs

---
 Snakefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Snakefile b/Snakefile
index 7e10421..ccf0e00 100644
--- a/Snakefile
+++ b/Snakefile
@@ -378,6 +378,7 @@ def asm_json_inputs(wildcards):
             chromosome=CHRLIST
         )
 
+    return sections
 
 rule asm_json:
     # Produce the Assembly JSON for Pan1c QC
-- 
GitLab
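
Rule asm_json pulls its inputs through Snakemake's `unpack()`, which expects the input function to hand back a dict of named inputs; without the `return` added here the function evaluates to None and the rule cannot resolve its inputs. The pattern, stripped to its essentials (paths are illustrative; `config` is Snakemake's global config dict):

    def asm_json_inputs(wildcards):
        """Named inputs for rule asm_json; unpack() needs the dict returned."""
        sections = {
            "csv": ["data/haplotypes/sampleA.hap1.stats.csv"],
            "fai": ["data/chrInputs/example.chr01.fa.gz.fai"],
        }
        if config.get("get_contig_pos") == "True":      # optional inputs mirror the config switches
            sections["contig_pos"] = ["output/chr.contig/chr01.contig.png"]
        return sections

    # used in the Snakefile as:  input: unpack(asm_json_inputs)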


From f0897796be36f2079d2dd2f146ec5ebaae2aa6ef Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 3 Oct 2024 15:17:18 +0200
Subject: [PATCH 224/310] Fixing rule asm_json

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index ccf0e00..9196fe2 100644
--- a/Snakefile
+++ b/Snakefile
@@ -398,7 +398,7 @@ rule asm_json:
         add_contig_pos=config['get_contig_pos'],
         tmp_dir="output/quast"
     run:
-        command = ["--output {output.json} --quast {input.quast} --fai {input.fai} --name {params.pan_name} --ref {params.ref_name}"]
+        command = [f"--output {output.json} --quast {input.quast} --fai {input.fai} --name {params.pan_name} --ref {params.ref_name}"]
         if params.add_ASMs_SyRI == "True": command.append("--syri_asm")
         if params.add_chrInputs_SyRI == "True": command.append("--syri_chr")
         if params.add_contig_pos == "True": command.append("--contig_pos")
-- 
GitLab


From 2f45c595429a40f38adc4590033e17cf018b9523 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 3 Oct 2024 15:20:31 +0200
Subject: [PATCH 225/310] Fixing rule asm_json 2

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 9196fe2..6ace5e7 100644
--- a/Snakefile
+++ b/Snakefile
@@ -392,7 +392,7 @@ rule asm_json:
     params:
         app_path=config["app.path"],
         pan_name=config["name"],
-        ref=config['reference'],
+        ref_name=config['reference'],
         add_ASMs_SyRI=config['get_ASMs_SyRI'],
         add_chrInputs_SyRI=config['get_chrInputs_SyRI'],
         add_contig_pos=config['get_contig_pos'],
-- 
GitLab


From eef3cf066b5448314d4a3bf7796713273cb6f0f4 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 3 Oct 2024 15:27:52 +0200
Subject: [PATCH 226/310] Fixed asm_json rule

---
 Snakefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 6ace5e7..02fd72e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -398,7 +398,9 @@ rule asm_json:
         add_contig_pos=config['get_contig_pos'],
         tmp_dir="output/quast"
     run:
-        command = [f"--output {output.json} --quast {input.quast} --fai {input.fai} --name {params.pan_name} --ref {params.ref_name}"]
+        shell("cat {input.fai} > output/report_data/all.fai")
+
+        command = [f"--output {output.json} --quast {input.quast} --fai output/report_data/all.fai --name {params.pan_name} --ref {params.ref_name}"]
         if params.add_ASMs_SyRI == "True": command.append("--syri_asm")
         if params.add_chrInputs_SyRI == "True": command.append("--syri_chr")
         if params.add_contig_pos == "True": command.append("--contig_pos")
@@ -406,6 +408,8 @@ rule asm_json:
         command = " ".join(command)
         shell("apptainer run {params.app_path}/pan1c-env.sif python scripts/asm.pan1c_QC.py {command}")
 
+        shell("rm output/report_data/all.fai")
+
 """
 Core section : Running PGGB
 """
-- 
GitLab


From 54ca7bea2d86b3d0394b56c0578592b073f94877 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 3 Oct 2024 15:32:30 +0200
Subject: [PATCH 227/310] Fixed rule asm_json 3

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 02fd72e..4a2180d 100644
--- a/Snakefile
+++ b/Snakefile
@@ -400,7 +400,7 @@ rule asm_json:
     run:
         shell("cat {input.fai} > output/report_data/all.fai")
 
-        command = [f"--output {output.json} --quast {input.quast} --fai output/report_data/all.fai --name {params.pan_name} --ref {params.ref_name}"]
+        command = [f"--output {output.json} --quast {input.quast} --fai output/report_data/all.fai --name {params.pan_name} --ref {params.ref_name.rsplit(".", maxsplit=1)}"]
         if params.add_ASMs_SyRI == "True": command.append("--syri_asm")
         if params.add_chrInputs_SyRI == "True": command.append("--syri_chr")
         if params.add_contig_pos == "True": command.append("--contig_pos")
-- 
GitLab


From b5be6774bc616193e5906895a599942663a2cc79 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 3 Oct 2024 15:35:28 +0200
Subject: [PATCH 228/310] Fixing rule asm_json 4

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 4a2180d..71644c0 100644
--- a/Snakefile
+++ b/Snakefile
@@ -400,7 +400,7 @@ rule asm_json:
     run:
         shell("cat {input.fai} > output/report_data/all.fai")
 
-        command = [f"--output {output.json} --quast {input.quast} --fai output/report_data/all.fai --name {params.pan_name} --ref {params.ref_name.rsplit(".", maxsplit=1)}"]
+        command = [f"--output {output.json} --quast {input.quast} --fai output/report_data/all.fai --name {params.pan_name} --ref {params.ref_name.rsplit(".", maxsplit=2)[0]}"]
         if params.add_ASMs_SyRI == "True": command.append("--syri_asm")
         if params.add_chrInputs_SyRI == "True": command.append("--syri_chr")
         if params.add_contig_pos == "True": command.append("--contig_pos")
-- 
GitLab


From 817ead7d125a649123dc670e4d1f130eb61cbde0 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 3 Oct 2024 15:38:46 +0200
Subject: [PATCH 229/310] Fixing rule asm_json 5

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 71644c0..c500bed 100644
--- a/Snakefile
+++ b/Snakefile
@@ -400,7 +400,7 @@ rule asm_json:
     run:
         shell("cat {input.fai} > output/report_data/all.fai")
 
-        command = [f"--output {output.json} --quast {input.quast} --fai output/report_data/all.fai --name {params.pan_name} --ref {params.ref_name.rsplit(".", maxsplit=2)[0]}"]
+        command = [f"--output {output.json} --quast {input.quast} --fai output/report_data/all.fai --name {params.pan_name} --ref {params.ref_name.rsplit('.', maxsplit=2)[0].replace('.hap', '#')}"]
         if params.add_ASMs_SyRI == "True": command.append("--syri_asm")
         if params.add_chrInputs_SyRI == "True": command.append("--syri_chr")
         if params.add_contig_pos == "True": command.append("--contig_pos")
-- 
GitLab
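
The --ref value handed to asm.pan1c_QC.py is derived from the reference file name: strip the two extension parts, then turn the `.hapN` suffix into PanSN form. The quoting churn in the last few patches comes from needing single quotes inside an f-string that the surrounding double quotes already delimit. Worked on a hypothetical file name:

    ref_name = "sampleA.hap1.fa.gz"                                   # hypothetical reference file
    ref = ref_name.rsplit(".", maxsplit=2)[0].replace(".hap", "#")
    # rsplit(".", maxsplit=2) -> ["sampleA.hap1", "fa", "gz"], so [0] is "sampleA.hap1"
    # the replace then yields "sampleA#1", matching the PanSN-style path names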


From 4b97b926976f59a8b846910baf276599c9492952 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 8 Oct 2024 14:32:44 +0200
Subject: [PATCH 230/310] Switched from Quast to Assemblathon stats

---
 Snakefile               | 46 +++++++++++++++------
 scripts/asm.pan1c_QC.py | 90 +++++++++++++++++++++--------------------
 2 files changed, 80 insertions(+), 56 deletions(-)

diff --git a/Snakefile b/Snakefile
index c500bed..46c2ca1 100644
--- a/Snakefile
+++ b/Snakefile
@@ -151,8 +151,7 @@ rule quast_stats:
         fas=expand("data/haplotypes/{haplotype}.fa.gz", haplotype=SAMPLES_NOREF),
         ref="data/haplotypes/"+config['reference']
     output:
-        report="output/"+config['name']+".quast.report.html",
-        json="output/report_data/"+config['name']+".assembly_stats.quast.json"
+        report="output/"+config['name']+".quast.report.html"
     threads: 16
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -183,14 +182,35 @@ rule quast_stats:
             tee {log.cmd}
 
         mv {params.tmp_dir}/report.html {output.report}
-        
-        apptainer run {params.app_path}/pan1c-env.sif python scripts/quast.tsv_2_json.py \
-            --input {params.tmp_dir}/transposed_report.tsv \
-            --output {output.json}
 
         rm -r {params.tmp_dir}
         """
 
+rule assemblathon_stats:
+    input:
+        fa="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz",
+        reffai="data/haplotypes/"+config['reference']+".fai"
+    output:
+        csv="data/hap.ragtagged/{haplotype}.ragtagged.stats.csv"
+    threads: 1
+    resources:
+        mem_mb = lambda wildcards, threads: threads * 16000
+    params:
+        app_path=config["app.path"]
+    shell:
+        """
+        # Getting ref size
+        rsize=$(awk '{sum += $2} END {print sum}' {input.reffai})
+
+        # Running Assemblathon_stats
+        apptainer exec {params.app_path}/pan1c-env.sif assemblathon_stats.pl \
+            --csv \
+            --genome_size $rsize \
+            {input.fa}
+
+        mv data/hap.ragtagged/{wildcards.haplotype}.ragtagged.csv {output.csv}
+        """
+
 rule contig_position:
     # Produce figures with contig positions
     input:
@@ -357,7 +377,7 @@ rule SyRI_on_ASM_wfm:
 def asm_json_inputs(wildcards):
     sections = dict()
 
-    sections["quast"] = "output/report_data/"+config['name']+".assembly_stats.quast.json"
+    section["csv"] = csv=expand("data/hap.ragtagged/{haplotype}.ragtagged.stats.csv", haplotype=SAMPLES)
     sections["fai"] = expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai', chromosome=CHRLIST)
 
     if config["get_contig_pos"] == "True":
@@ -385,22 +405,24 @@ rule asm_json:
     input:
         unpack(asm_json_inputs)
     output:
-        json="output/report_data/"+config['name']+".assembly.json"
+        json="output/report_data/"+config['name']+".assembly.json",
+        merged="output/report_data/"+config['name']+".assemblathon_stats.tsv"
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: threads * 8000
+        mem_mb = lambda wildcards, threads: threads * 16000
     params:
         app_path=config["app.path"],
         pan_name=config["name"],
         ref_name=config['reference'],
         add_ASMs_SyRI=config['get_ASMs_SyRI'],
         add_chrInputs_SyRI=config['get_chrInputs_SyRI'],
-        add_contig_pos=config['get_contig_pos'],
-        tmp_dir="output/quast"
+        add_contig_pos=config['get_contig_pos']
     run:
         shell("cat {input.fai} > output/report_data/all.fai")
+        shell("awk 'FNR==1 && NR!=1 {next} {print}' {input.csv} | tr ',' '\\t' > {output.merged}")
+
+        command = [f"--output {output.json} --asm_stats {output.merged} --fai output/report_data/all.fai --name {params.pan_name} --ref {params.ref_name.rsplit('.', maxsplit=2)[0].replace('.hap', '#')}"]
 
-        command = [f"--output {output.json} --quast {input.quast} --fai output/report_data/all.fai --name {params.pan_name} --ref {params.ref_name.rsplit('.', maxsplit=2)[0].replace('.hap', '#')}"]
         if params.add_ASMs_SyRI == "True": command.append("--syri_asm")
         if params.add_chrInputs_SyRI == "True": command.append("--syri_chr")
         if params.add_contig_pos == "True": command.append("--contig_pos")
diff --git a/scripts/asm.pan1c_QC.py b/scripts/asm.pan1c_QC.py
index 6e1b0e9..ce8a8c0 100644
--- a/scripts/asm.pan1c_QC.py
+++ b/scripts/asm.pan1c_QC.py
@@ -2,7 +2,7 @@
 Assembly JSON creator for Pan1c-QC
 
 @author: alexis.mergez@inrae.fr
-@version: 1.0
+@version: 1.1
 """
 
 import os
@@ -13,10 +13,10 @@ import json
 ## Arguments
 arg_parser = argparse.ArgumentParser(description='Assembly JSON for Pan1c-QC')
 arg_parser.add_argument(
-    "--quast",
-    dest = "quast",
+    "--asm_stats",
+    dest = "asmstats",
     required = True,
-    help = "Quast JSON"
+    help = "Combined assemblathon stats"
     )
 arg_parser.add_argument(
     "--fai",
@@ -62,41 +62,7 @@ arg_parser.add_argument(
     )
 args = arg_parser.parse_args()
 
-## Reading inputs
-# Quast JSON
-with open(args.quast, 'r') as file:
-    quast = json.load(file)
-
-
-## Preparing Quast stats
-qdf = pd.DataFrame.from_dict(quast, orient='index')
-
-# Renaming haplotypes according to PanSN
-rename_dict = { name: name.replace(".hap", "#").replace("broken", "contig") for name in qdf.index }
-for key, name in rename_dict.items():
-    if len(name.split("_contig")) != 2:
-        rename_dict[key] = f"{name}_scaffold"
-qdf.rename(index = rename_dict, inplace = True)
-
-# Computing deviation from mean length and reference length
-mean_length = qdf["Total length"].mean()
-qdf["Deviation from mean length (%)"] = round((qdf["Total length"] - mean_length)*100 / mean_length, 2) 
-qdf["Deviation from ref length (%)"] = round((qdf["Total length"] - qdf["Reference length"])*100 / qdf["Reference length"], 2)
-
-# Adding missing broken columns
-for hap in qdf.index:
-    decomp = hap.rsplit("_", maxsplit=1)
-    if decomp[-1] == "scaffold" and f"{decomp[0]}_contig" not in qdf.index:
-        qdf.loc[f"{decomp[0]}_contig"] = qdf.loc[hap]
-
-qdf = qdf.reset_index().rename(columns = {'index': 'raw_hap'})
-qdf[["Hap", "Type"]] = qdf["raw_hap"].str.rsplit("_", n=1, expand=True)
-
-# Removing unnecessary columns
-qdf.drop(columns=[col for col in qdf.columns if col not in ['Hap', 'Type', '# contigs', 'Total length', 'GC (%)', 'N50', 'L50', "# N's per 100 kbp", "Deviation from mean length (%)", "Deviation from ref length (%)"]], inplace=True) 
-qdf = qdf.sort_values(["Type", "Hap"]).set_index(["Type", "Hap"])
-
-## Preparing chromosome length table
+## Preparing chromosome length table ------------------------------------------------------------------------
 cdf = pd.read_csv(args.fai, sep="\t", header = None, usecols=[0,1], names=["Full_hap", "Length"])
 
 # Splitting full haplotype name and indexing based on chromosome and haplotype id
@@ -117,13 +83,49 @@ for index, row in cdf.iterrows():
 
 cdf["Deviation from ref length (%)"] = _dev_from_ref_
 
+## Preparing Assemblathon stats -----------------------------------------------------------------------------
+adf = pd.read_csv(args.asmstats, sep="\t", index_col=0)
+
+# Renaming haplotypes according to PanSN
+rename_dict = { name: name.replace(".hap", "#").replace(".fa.gz", "") for name in adf.index }
+adf.rename(index = rename_dict, inplace = True)
+
+# Subsetting the whole dataframe
+column_ids = [
+    ["Number of scaffolds", "Total size of scaffolds", "N50 scaffold length", "L50 scaffold count", "scaffold %N", "scaffold %C", "scaffold %G"],
+    ["Number of contigs", "Total size of contigs", "N50 contig length", "L50 contig count", "contig %N", "contig %C", "contig %G"]
+]
+renamed_col = ["# contigs", "Total length", "N50", "L50", "# N's per 100kbp", "GC (%)"]
+
+ctgs = adf.loc[:, columns[1]]
+scfs = adf.loc[:, columns[0]]
+
+# Adding type, renaming columns and merging back
+ctgs["Type"] = ["Contig"]*len(ctgs)
+scfs["Type"] = ["Scaffold"]*len(scfs)
+
+ctgs["GC (%)"] = ctgs["contig %C"]+ctgs["contig %G"]
+scfs["GC (%)"] = scfs["scaffold %C"]+scfs["scaffold %G"]
+
+scfs = scfs.drop(columns=["scaffold %C", "scaffold %G"]).rename(columns = {column_ids[0][i]: renamed_col[i] for i in range(len(column_ids[0])-1)})
+ctgs = ctgs.drop(columns=["contig %C", "contig %G"]).rename(columns = {column_ids[1][i]: renamed_col[i] for i in range(len(column_ids[1])-1)})
+adf = pd.concat([scfs, ctgs], axis=0).reset_index()
+
+ref_len = cdf.xs(args.ref, level='Hap')["Length"].sum()
+
+# Computing deviation from mean length and reference length
+mean_length = adf["Total length"].mean()
+adf["Deviation from mean length (%)"] = round((adf["Total length"] - mean_length)*100 / mean_length, 2) 
+adf["Deviation from ref length (%)"] = round((adf["Total length"] - ref_len)*100 / ref_len, 2)
+
+adf = adf.sort_values(["Type", "Assembly"]).set_index(["Type", "Assembly"])
 
-## Creating Assembly JSON
+## Creating Assembly JSON -----------------------------------------------------------------------------------
 assembly = { 
-    "Quast": {
-        "Scaffold": qdf.loc["scaffold",:].to_dict(orient="index"),
-        "Contig": qdf.loc["contig",:].to_dict(orient="index"),
-        "Path": f"data/{args.name}.quast.report.html"
+    "ASM_stats": {
+        "Scaffold": adf.loc["Scaffold",:].to_dict(orient="index"),
+        "Contig": adf.loc["Contig",:].to_dict(orient="index"),
+        "Quast_path": f"data/{args.name}.quast.report.html"
     },
     "Chrom_length": {
         chrid: cdf.loc[chrid].to_dict(orient='index') for chrid in cdf.index.get_level_values(0).unique()
-- 
GitLab
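
With the switch above, asm.pan1c_QC.py reshapes the merged assemblathon table instead of the Quast JSON: it takes the scaffold- and contig-level column sets separately, derives GC (%) from the %C and %G columns, and renames everything to the report's column names before concatenating. A reduced sketch of the scaffold half on an invented two-assembly table:

    import pandas as pd

    adf = pd.DataFrame(
        {
            "Number of scaffolds": [120, 95],
            "Total size of scaffolds": [12_000_000, 11_500_000],
            "scaffold %C": [18.1, 18.3],
            "scaffold %G": [18.0, 18.2],
        },
        index=["sampleA#1", "sampleB#1"],               # names already rewritten to PanSN form
    )

    scfs = adf.copy()
    scfs["Type"] = "Scaffold"
    scfs["GC (%)"] = scfs["scaffold %C"] + scfs["scaffold %G"]
    scfs = scfs.drop(columns=["scaffold %C", "scaffold %G"]).rename(
        columns={"Number of scaffolds": "# contigs", "Total size of scaffolds": "Total length"}
    )
    # the contig columns get the same treatment, then both halves are concatenated and the
    # deviation-from-mean and deviation-from-reference columns are added as before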


From 86845e90d795c78ef2626d7fc363b338f6eac258 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 8 Oct 2024 14:34:23 +0200
Subject: [PATCH 231/310] typo

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 46c2ca1..3fb81f5 100644
--- a/Snakefile
+++ b/Snakefile
@@ -377,7 +377,7 @@ rule SyRI_on_ASM_wfm:
 def asm_json_inputs(wildcards):
     sections = dict()
 
-    section["csv"] = csv=expand("data/hap.ragtagged/{haplotype}.ragtagged.stats.csv", haplotype=SAMPLES)
+    sections["csv"] = csv=expand("data/hap.ragtagged/{haplotype}.ragtagged.stats.csv", haplotype=SAMPLES)
     sections["fai"] = expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai', chromosome=CHRLIST)
 
     if config["get_contig_pos"] == "True":
-- 
GitLab


From 844b6472707e03fbf37bd7002c23dde30eb69a56 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 8 Oct 2024 14:39:50 +0200
Subject: [PATCH 232/310] Another typo

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 3fb81f5..a7c6b25 100644
--- a/Snakefile
+++ b/Snakefile
@@ -200,7 +200,7 @@ rule assemblathon_stats:
     shell:
         """
         # Getting ref size
-        rsize=$(awk '{sum += $2} END {print sum}' {input.reffai})
+        rsize=$(awk '\{sum += $2\} END \{print sum\}' {input.reffai})
 
         # Running Assemblathon_stats
         apptainer exec {params.app_path}/pan1c-env.sif assemblathon_stats.pl \
@@ -419,7 +419,7 @@ rule asm_json:
         add_contig_pos=config['get_contig_pos']
     run:
         shell("cat {input.fai} > output/report_data/all.fai")
-        shell("awk 'FNR==1 && NR!=1 {next} {print}' {input.csv} | tr ',' '\\t' > {output.merged}")
+        shell("awk 'FNR==1 && NR!=1 \{next\} \{print\}' {input.csv} | tr ',' '\\t' > {output.merged}")
 
         command = [f"--output {output.json} --asm_stats {output.merged} --fai output/report_data/all.fai --name {params.pan_name} --ref {params.ref_name.rsplit('.', maxsplit=2)[0].replace('.hap', '#')}"]
 
-- 
GitLab


From 9d58bdeda4fd3c5a8c414a30d50b12e82eaaff4d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 8 Oct 2024 14:43:46 +0200
Subject: [PATCH 233/310] Typo 3

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index a7c6b25..b382454 100644
--- a/Snakefile
+++ b/Snakefile
@@ -419,7 +419,7 @@ rule asm_json:
         add_contig_pos=config['get_contig_pos']
     run:
         shell("cat {input.fai} > output/report_data/all.fai")
-        shell("awk 'FNR==1 && NR!=1 \{next\} \{print\}' {input.csv} | tr ',' '\\t' > {output.merged}")
+        shell("awk 'FNR==1 && NR!=1 {{next}} {{print}}' {input.csv} | tr ',' '\\t' > {output.merged}")
 
         command = [f"--output {output.json} --asm_stats {output.merged} --fai output/report_data/all.fai --name {params.pan_name} --ref {params.ref_name.rsplit('.', maxsplit=2)[0].replace('.hap', '#')}"]
 
-- 
GitLab


From 324bdab64df51dbba02af109edca575e6e626b91 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 8 Oct 2024 14:46:23 +0200
Subject: [PATCH 234/310] Typo 4

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index b382454..bd8c051 100644
--- a/Snakefile
+++ b/Snakefile
@@ -200,7 +200,7 @@ rule assemblathon_stats:
     shell:
         """
         # Getting ref size
-        rsize=$(awk '\{sum += $2\} END \{print sum\}' {input.reffai})
+        rsize=$(awk '{{sum += $2}} END {{print sum}}' {input.reffai})
 
         # Running Assemblathon_stats
         apptainer exec {params.app_path}/pan1c-env.sif assemblathon_stats.pl \
-- 
GitLab
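
The awk fixes in this and the previous two patches come down to how Snakemake formats `shell()` strings: `{...}` is reserved for interpolating inputs, outputs, params and wildcards, so literal awk braces have to be doubled, while backslash-escaping them (the first attempt) reaches the shell verbatim and breaks the awk program. Plain Python string formatting behaves the same way:

    template = "awk '{{sum += $2}} END {{print sum}}' {fai}"        # fai is a placeholder field
    print(template.format(fai="reference.fa.gz.fai"))
    # -> awk '{sum += $2} END {print sum}' reference.fa.gz.fai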


From 109caa636af0e305f2a8106137b6dc2e3421fc00 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 8 Oct 2024 15:48:40 +0200
Subject: [PATCH 235/310] Typo 5

---
 scripts/asm.pan1c_QC.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/asm.pan1c_QC.py b/scripts/asm.pan1c_QC.py
index ce8a8c0..e691fd4 100644
--- a/scripts/asm.pan1c_QC.py
+++ b/scripts/asm.pan1c_QC.py
@@ -97,8 +97,8 @@ column_ids = [
 ]
 renamed_col = ["# contigs", "Total length", "N50", "L50", "# N's per 100kbp", "GC (%)"]
 
-ctgs = adf.loc[:, columns[1]]
-scfs = adf.loc[:, columns[0]]
+ctgs = adf.loc[:, column_ids[1]]
+scfs = adf.loc[:, column_ids[0]]
 
 # Adding type, renaming columns and merging back
 ctgs["Type"] = ["Contig"]*len(ctgs)
-- 
GitLab


From 0ea92be6c0c1c486a30d772c0b2ff0933102b694 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 8 Oct 2024 15:54:54 +0200
Subject: [PATCH 236/310] Fixed path and switched to raw assemblies

---
 Snakefile               | 6 +++---
 scripts/asm.pan1c_QC.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Snakefile b/Snakefile
index bd8c051..f1a5955 100644
--- a/Snakefile
+++ b/Snakefile
@@ -188,10 +188,10 @@ rule quast_stats:
 
 rule assemblathon_stats:
     input:
-        fa="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz",
+        fa="data/haplotypes/{haplotype}.fa.gz",
         reffai="data/haplotypes/"+config['reference']+".fai"
     output:
-        csv="data/hap.ragtagged/{haplotype}.ragtagged.stats.csv"
+        csv="data/haplotypes/{haplotype}.stats.csv"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * 16000
@@ -377,7 +377,7 @@ rule SyRI_on_ASM_wfm:
 def asm_json_inputs(wildcards):
     sections = dict()
 
-    sections["csv"] = csv=expand("data/hap.ragtagged/{haplotype}.ragtagged.stats.csv", haplotype=SAMPLES)
+    sections["csv"] = csv=expand("data/haplotypes/{haplotype}.stats.csv", haplotype=SAMPLES)
     sections["fai"] = expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai', chromosome=CHRLIST)
 
     if config["get_contig_pos"] == "True":
diff --git a/scripts/asm.pan1c_QC.py b/scripts/asm.pan1c_QC.py
index e691fd4..e584fa8 100644
--- a/scripts/asm.pan1c_QC.py
+++ b/scripts/asm.pan1c_QC.py
@@ -87,7 +87,7 @@ cdf["Deviation from ref length (%)"] = _dev_from_ref_
 adf = pd.read_csv(args.asmstats, sep="\t", index_col=0)
 
 # Renaming haplotypes according to PanSN
-rename_dict = { name: name.replace(".hap", "#").replace(".fa.gz", "") for name in adf.index }
+rename_dict = { name: name.replace(".hap", "#").replace(".fa.gz", "").split("/")[-1] for name in adf.index }
 adf.rename(index = rename_dict, inplace = True)
 
 # Subsetting the whole dataframe
-- 
GitLab


From 59ec41a2f1c5db1bb5149274bc3aa6ffa87d0a20 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 8 Oct 2024 16:02:18 +0200
Subject: [PATCH 237/310] Fixed error in rule assemblathon_stats

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index f1a5955..ab28730 100644
--- a/Snakefile
+++ b/Snakefile
@@ -208,7 +208,7 @@ rule assemblathon_stats:
             --genome_size $rsize \
             {input.fa}
 
-        mv data/hap.ragtagged/{wildcards.haplotype}.ragtagged.csv {output.csv}
+        mv data/haplotypes/{wildcards.haplotype}.csv {output.csv}
         """
 
 rule contig_position:
-- 
GitLab


From 90238f130af7e320199942ebe6c344cf6c8e2495 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 10 Oct 2024 10:26:48 +0200
Subject: [PATCH 238/310] Update asm.pan1c_QC.py

---
 scripts/asm.pan1c_QC.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/asm.pan1c_QC.py b/scripts/asm.pan1c_QC.py
index e584fa8..17a7ea3 100644
--- a/scripts/asm.pan1c_QC.py
+++ b/scripts/asm.pan1c_QC.py
@@ -139,7 +139,7 @@ if args.contig_pos:
 
 if args.syri_asm:
     assembly["Syri_hap"] = {
-        hap: f"data/asm.syri.figs/{args.name}.{hap.replace('#', '.hap')}.syri.mm2.png" for hap in cdf.index.get_level_values(1).unique()
+        hap: f"data/asm.syri.figs/{args.name}.{hap.replace('#', '.hap')}.syri.mm2.png" for hap in cdf.index.get_level_values(1).unique() if hap != args.ref
     }
 
 if args.syri_chr:
-- 
GitLab


From 7d17c483dbec4f4cd14373d2d5975fdee02833c4 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 11 Oct 2024 17:02:35 +0200
Subject: [PATCH 239/310] Adding Graph JSON for Pan1c QC

---
 Snakefile                 |  30 +++++++++-
 scripts/asm.pan1c_QC.py   |  14 ++++-
 scripts/graph.pan1c_QC.py | 119 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 158 insertions(+), 5 deletions(-)
 create mode 100644 scripts/graph.pan1c_QC.py

diff --git a/Snakefile b/Snakefile
index ab28730..f40b144 100644
--- a/Snakefile
+++ b/Snakefile
@@ -377,7 +377,7 @@ rule SyRI_on_ASM_wfm:
 def asm_json_inputs(wildcards):
     sections = dict()
 
-    sections["csv"] = csv=expand("data/haplotypes/{haplotype}.stats.csv", haplotype=SAMPLES)
+    sections["csv"] = expand("data/haplotypes/{haplotype}.stats.csv", haplotype=SAMPLES)
     sections["fai"] = expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai', chromosome=CHRLIST)
 
     if config["get_contig_pos"] == "True":
@@ -416,7 +416,8 @@ rule asm_json:
         ref_name=config['reference'],
         add_ASMs_SyRI=config['get_ASMs_SyRI'],
         add_chrInputs_SyRI=config['get_chrInputs_SyRI'],
-        add_contig_pos=config['get_contig_pos']
+        add_contig_pos=config['get_contig_pos'],
+        add_quast=config["run_Quast"]
     run:
         shell("cat {input.fai} > output/report_data/all.fai")
         shell("awk 'FNR==1 && NR!=1 {{next}} {{print}}' {input.csv} | tr ',' '\\t' > {output.merged}")
@@ -426,6 +427,7 @@ rule asm_json:
         if params.add_ASMs_SyRI == "True": command.append("--syri_asm")
         if params.add_chrInputs_SyRI == "True": command.append("--syri_chr")
         if params.add_contig_pos == "True": command.append("--contig_pos")
+        if params.add_quast == "True": command.append("--quast")
 
         command = " ".join(command)
         shell("apptainer run {params.app_path}/pan1c-env.sif python scripts/asm.pan1c_QC.py {command}")
@@ -914,6 +916,30 @@ rule core_statistics:
             --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f /dev/null -p {params.pan_name}
         """
 
+rule graph_json:
+    # Produce the Graph JSON for Pan1c QC
+    input:
+        genstats = expand("output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv", gtool=graph_tools)
+        pathstats = expand("output/stats/{gtool}."+config['name']+".chrGraph.path.stats.tsv", gtool=graph_tools),
+        odgifigs = expand("output/report/{gtool}."+config['name']+".{chromosome}.report.fig.png", gtool=graph_tools, chromosome=CHRLIST)
+    output:
+        json="output/report_data/"+config['name']+".graph.json"
+    threads: 1
+    resources:
+        mem_mb = lambda wildcards, threads: threads * 16000
+    params:
+        app_path=config["app.path"],
+        pan_name=config["name"]
+    shell:
+        """
+        apptainer run {params.app_path}/pan1c-env.sif python scripts/graph.pan1c_QC.py \
+            --gen_stats {input.genstats} \
+            --path_stats {input.pathstats} \
+            --name {params.pan_name} \
+            --odgi_figs {input.odgifigs} \
+            --output {output.json}
+        """
+    
 """
 Post-processing section
 """
diff --git a/scripts/asm.pan1c_QC.py b/scripts/asm.pan1c_QC.py
index 17a7ea3..425103f 100644
--- a/scripts/asm.pan1c_QC.py
+++ b/scripts/asm.pan1c_QC.py
@@ -2,7 +2,7 @@
 Assembly JSON creator for Pan1c-QC
 
 @author: alexis.mergez@inrae.fr
-@version: 1.1
+@version: 1.2
 """
 
 import os
@@ -54,6 +54,12 @@ arg_parser.add_argument(
     dest = "contig_pos",
     help = "Add path to contig pos figures"
 )
+arg_parser.add_argument(
+    '--quast',
+    action="store_true",
+    dest = "quast",
+    help = "Add path to quast report"
+)
 arg_parser.add_argument(
     "--output",
     dest = "output",
@@ -124,14 +130,16 @@ adf = adf.sort_values(["Type", "Assembly"]).set_index(["Type", "Assembly"])
 assembly = { 
     "ASM_stats": {
         "Scaffold": adf.loc["Scaffold",:].to_dict(orient="index"),
-        "Contig": adf.loc["Contig",:].to_dict(orient="index"),
-        "Quast_path": f"data/{args.name}.quast.report.html"
+        "Contig": adf.loc["Contig",:].to_dict(orient="index")
     },
     "Chrom_length": {
         chrid: cdf.loc[chrid].to_dict(orient='index') for chrid in cdf.index.get_level_values(0).unique()
     }
 }
 
+if args.quast:
+    assembly["ASM_stats"]["Quast_path"] = f"data/{args.name}.quast.report.html"
+
 if args.contig_pos:
     assembly["Contig_pos"] = {
         chrid: f"data/chr.contig/{chrid}.contig.png" for chrid in cdf.index.get_level_values(0).unique()
diff --git a/scripts/graph.pan1c_QC.py b/scripts/graph.pan1c_QC.py
new file mode 100644
index 0000000..3e38e18
--- /dev/null
+++ b/scripts/graph.pan1c_QC.py
@@ -0,0 +1,119 @@
+"""
+Graph JSON creator for Pan1c-QC
+
+@author: alexis.mergez@inrae.fr
+@version: 1.0
+"""
+
+import os
+import argparse
+import pandas as pd
+import json
+
+## Arguments
+arg_parser = argparse.ArgumentParser(description='Graph JSON for Pan1c-QC')
+arg_parser.add_argument(
+    "--gen_stats",
+    dest = "general",
+    nargs="+",
+    required = True,
+    help = "General stats. Multiple allowed (MC, PGGB, etc...)"
+    )
+arg_parser.add_argument(
+    "--path_stats",
+    dest = "path",
+    nargs="+",
+    required = True,
+    help = "Path stats. Multiple allowed (MC, PGGB, etc...)"
+    )
+arg_parser.add_argument(
+    "--name",
+    dest = "name",
+    required = True,
+    help = "Pangenome name"
+    )
+arg_parser.add_argument(
+    '--odgi_figs',
+    dest = "odgifigs",
+    nargs="+",
+    help = "Path to odgi figures"
+    )
+arg_parser.add_argument(
+    "--output",
+    dest = "output",
+    required = True,
+    help = "Output path"
+    )
+args = arg_parser.parse_args()
+
+## General statistics
+
+gen_stats = {}
+
+for tsv in args.general:
+    gtool = gtool_translation[os.path.basename(tsv).split(".")[0]]
+    
+    gen_stats[gtool] = pd.read_csv(tsv, sep="\t").drop(columns="Pangenome.name").set_index("Chr.id").to_dict(orient="index")
+
+## Path statistics and shared content
+
+path_stats = {}
+shared_table = {} 
+
+for tsv in args.path:
+    gtool = gtool_translation[os.path.basename(tsv).split(".")[0]]
+    df = pd.read_csv(tsv, sep="\t").drop(columns="Pangenome.name")
+
+    # Path stats
+    path_stats[gtool] = {}
+    
+    for chrid in df["Chr.id"].unique():
+        path_stats[gtool][chrid] = df[df["Chr.id"] == chrid].drop(columns=["Chr.id", "Shared.content"]).set_index("Path.name").to_dict(orient="index")
+
+    # Shared content
+    shared_table[gtool] = {} 
+    shared_content = df.set_index(["Chr.id", "Path.name"]).loc[:, ["Path.length", "Shared.content"]].to_dict()
+    shared_dict = {}
+    A = 0
+    for key, value in shared_content["Shared.content"].items():
+        for elem in value.split(';'):
+            target, stats = elem.split(":")
+            target = target.rsplit("#", 1)[0]
+    
+            shared_dict[A] = list(key)+[target]+[shared_content["Path.length"][key]]+[int(val) for val in stats.split(',')]
+            A+=1
+    sdf = pd.DataFrame.from_dict(shared_dict, orient='index', columns = ["Chr.id", "Query.name", "Target.name", "Path.length", "Shared.nodes.count", "Shared.length", "Shared.R.length"])
+    sdf.set_index(["Chr.id", "Query.name", "Target.name"], inplace=True)
+    sdf.loc[:, "Shared.prop"] = sdf["Shared.length"]*100/sdf["Path.length"]
+    sdf.loc[:, "Shared.R.prop"] = sdf["Shared.R.length"]*100/sdf["Path.length"]
+    sdf.loc[:, "Shared.length.mb"] = sdf["Shared.length"]/1000000
+    sdf.reset_index(inplace=True)
+
+    for chrid in sdf["Chr.id"].unique():
+        chrdf = sdf[sdf["Chr.id"] == chrid].drop(columns="Chr.id")
+        shared_table[gtool][chrid] = {}
+        
+        for query in chrdf["Query.name"].unique():
+            shared_table[gtool][chrid][query] = chrdf[chrdf["Query.name"] == query].drop(columns="Query.name").set_index("Target.name").to_dict(orient="index")
+
+## Assembling output JSON
+
+Graph_JSON = {
+    "General_stats": gen_stats,
+    "Paths_stats": path_stats,
+    "Shared_content": shared_table,
+}
+
+avail_gtool = list(set([os.path.basename(figs).split('.')[0] for figs in args.odgifigs]))
+
+Graph_JSON["odgi_figs"] = {
+    gtool_translation[gtool]: {
+        chrid : f"{gtool}.{args.name}.{chrid}.report.fig.png"
+        for chrid in gen_stats[gtool_translation[gtool]].keys()
+    }
+    for gtool in avail_gtool
+}
+
+## Outputting to JSON
+with open(args.output, "w") as handle:
+    json.dump(Graph_JSON, handle, indent=6)
\ No newline at end of file
-- 
GitLab
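
In graph.pan1c_QC.py above, the Shared.content column packs a ';'-separated list of `target:node_count,shared_length,shared_r_length` entries for each path, which the script explodes into one row per query/target pair before computing the shared proportions. A toy parse of one such value (format inferred from the parsing code, values invented):

    shared = "sampleB#1#chr01:1500,250000,240000;sampleC#1#chr01:1300,230000,220000"

    rows = []
    for elem in shared.split(";"):
        target, stats = elem.split(":")
        target = target.rsplit("#", 1)[0]               # drop the chromosome part of the target name
        count, length, r_length = (int(v) for v in stats.split(","))
        rows.append((target, count, length, r_length))

    # rows == [("sampleB#1", 1500, 250000, 240000), ("sampleC#1", 1300, 230000, 220000)]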


From b0c367e0ce5cef56258d83addcd1d58e4115ee9e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 11 Oct 2024 17:12:09 +0200
Subject: [PATCH 240/310] Creating the Graph JSON by default

---
 Snakefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Snakefile b/Snakefile
index f40b144..d672248 100644
--- a/Snakefile
+++ b/Snakefile
@@ -75,6 +75,7 @@ def which_analysis():
                 expand("output/{gtool}."+config['name']+".report.md", gtool=graph_tools)
             )
             analysis_inputs.append("output/report_data/"+config['name']+".assembly.json")
+            analysis_inputs.append("output/report_data/"+config['name']+".graph.json")
     if config["get_VCF"] == "True": # VCF from the final graph against the "reference"
         analysis_inputs.append(
             expand("output/{gtool}.vcf.figs", gtool=graph_tools)
-- 
GitLab


From 1e6c59b92c30dd7c181910e5791fd8c79a410bce Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 11 Oct 2024 17:12:42 +0200
Subject: [PATCH 241/310] Typo

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index d672248..7db0fa1 100644
--- a/Snakefile
+++ b/Snakefile
@@ -920,7 +920,7 @@ rule core_statistics:
 rule graph_json:
     # Produce the Graph JSON for Pan1c QC
     input:
-        genstats = expand("output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv", gtool=graph_tools)
+        genstats = expand("output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv", gtool=graph_tools),
         pathstats = expand("output/stats/{gtool}."+config['name']+".chrGraph.path.stats.tsv", gtool=graph_tools),
         odgifigs = expand("output/report/{gtool}."+config['name']+".{chromosome}.report.fig.png", gtool=graph_tools, chromosome=CHRLIST)
     output:
-- 
GitLab


From 9440ebe881920afe0b572f9c52bf61eb1f6bc9f2 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 11 Oct 2024 17:16:39 +0200
Subject: [PATCH 242/310] Forgot translation dict in Graph JSON script

---
 scripts/graph.pan1c_QC.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scripts/graph.pan1c_QC.py b/scripts/graph.pan1c_QC.py
index 3e38e18..88e4e15 100644
--- a/scripts/graph.pan1c_QC.py
+++ b/scripts/graph.pan1c_QC.py
@@ -46,6 +46,8 @@ arg_parser.add_argument(
     )
 args = arg_parser.parse_args()
 
+gtool_translation = {"pan1c": "PGGB", "MC": "MC"}
+
 ## General statistics
 
 gen_stats = {}
-- 
GitLab


From da23aaee798f75a1b448d1169757a3d423397823 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 11 Oct 2024 17:24:41 +0200
Subject: [PATCH 243/310] Changed path to odgifigs

---
 scripts/graph.pan1c_QC.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/graph.pan1c_QC.py b/scripts/graph.pan1c_QC.py
index 88e4e15..08a4217 100644
--- a/scripts/graph.pan1c_QC.py
+++ b/scripts/graph.pan1c_QC.py
@@ -110,7 +110,7 @@ avail_gtool = list(set([os.path.basename(figs).split('.')[0] for figs in args.od
 
 Graph_JSON["odgi_figs"] = {
     gtool_translation[gtool]: {
-        chrid : f"{gtool}.{args.name}.{chrid}.report.fig.png"
+        chrid : f"data/odgifigs/{gtool}.{args.name}.{chrid}.report.fig.png"
         for chrid in gen_stats[gtool_translation[gtool]].keys()
     }
     for gtool in avail_gtool
-- 
GitLab


From 1f5b98dd25c58848e7c60412172152063e101b13 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 15 Oct 2024 10:43:26 +0200
Subject: [PATCH 244/310] Fixed order in chromosome report fig

---
 Snakefile | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 7db0fa1..b6541f4 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1090,9 +1090,15 @@ rule create_pan1c_report_fig:
         app_path=config['app.path']
     shell:
         """
+        ## Get path order (alphabetic)
+        apptainer run --app odgi {params.app_path}/PanGeTools.sif \
+            paths -i {input.graph} -L | sort > $(dirname {output.reportfig})/{wildcards.gtool}.{wildcards.chromosome}.paths.order.txt
+
         ## Odgi 1D viz
         apptainer run --app odgi {params.app_path}/PanGeTools.sif \
-            viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P
+            viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P -p $(dirname {output.reportfig})/{wildcards.gtool}.{wildcards.chromosome}.paths.order.txt
+
+        rm $(dirname {output.reportfig})/{wildcards.gtool}.{wildcards.chromosome}.paths.order.txt
 
         ## Getting legend from contig figure
         convert {input.contigfig} -crop 790x+0+0 +repage {output.namefig}
-- 
GitLab


From 6592d844ffd7e728281a916ab7bc45e9aa628b03 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 15 Oct 2024 16:58:01 +0200
Subject: [PATCH 245/310] Sorting haplotypes in contig.pos_figs.R

---
 scripts/contig.pos_figs.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/contig.pos_figs.R b/scripts/contig.pos_figs.R
index 6a46ee2..5a40eee 100644
--- a/scripts/contig.pos_figs.R
+++ b/scripts/contig.pos_figs.R
@@ -28,6 +28,7 @@ opt = parse_args(opt_parser);
 #print("Reading tsv file ...")
 x = read.table(opt$tsv, sep = "\t", comment.char="^")
 colnames(x) = x[1,]
+haplotypes = sort(colnames(x))
 x = x[-1,]
 my.genome <- toGRanges(x)
 nChr = nrow(x)
@@ -77,6 +78,6 @@ if (nChr >= 4){
 
 }
 
-kp <- plotKaryotype(genome=my.genome, cytobands=my.cytobands, plot.params=pp, chromosomes="all")
+kp <- plotKaryotype(genome=my.genome, cytobands=my.cytobands, plot.params=pp, chromosomes=haplotypes)
 kp <- kpAddBaseNumbers(kp, cex=0.6, tick.dist=5000000)
 dev.off()
\ No newline at end of file
-- 
GitLab


From c0643db00b535103f4d7740e67d9d6721d92f2ce Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 15 Oct 2024 17:27:37 +0200
Subject: [PATCH 246/310] Selecting shell sort

---
 scripts/contig.pos_figs.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/contig.pos_figs.R b/scripts/contig.pos_figs.R
index 5a40eee..d83d197 100644
--- a/scripts/contig.pos_figs.R
+++ b/scripts/contig.pos_figs.R
@@ -28,7 +28,7 @@ opt = parse_args(opt_parser);
 #print("Reading tsv file ...")
 x = read.table(opt$tsv, sep = "\t", comment.char="^")
 colnames(x) = x[1,]
-haplotypes = sort(colnames(x))
+haplotypes = sort(colnames(x), method="shell")
 x = x[-1,]
 my.genome <- toGRanges(x)
 nChr = nrow(x)
-- 
GitLab


From 1a5a60a6312717a5359edb554198c47a716f24df Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 16 Oct 2024 09:06:03 +0200
Subject: [PATCH 247/310] Adding mem_multiplier to JSON rules

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index b6541f4..9427ced 100644
--- a/Snakefile
+++ b/Snakefile
@@ -410,7 +410,7 @@ rule asm_json:
         merged="output/report_data/"+config['name']+".assemblathon_stats.tsv"
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: threads * 16000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
     params:
         app_path=config["app.path"],
         pan_name=config["name"],
@@ -927,7 +927,7 @@ rule graph_json:
         json="output/report_data/"+config['name']+".graph.json"
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: threads * 16000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
     params:
         app_path=config["app.path"],
         pan_name=config["name"]
-- 
GitLab
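
Note on the pattern above: the two JSON rules now scale their per-thread memory request by config["mem_multiplier"]. A minimal standalone sketch of the same idea as a reusable helper (the helper name and the inline config dict are illustrative only; in the Snakefile, config is provided by Snakemake):

config = {"mem_multiplier": 1}   # placeholder; Snakemake injects the real config dict

def get_mem_mb(base_mb):
    # Snakemake-style resources callable: per-thread base * global multiplier.
    return lambda wildcards, threads: threads * config["mem_multiplier"] * base_mb

# Hypothetical use inside a rule:
#   resources:
#       mem_mb = get_mem_mb(16000)
print(get_mem_mb(16000)(None, 1))   # -> 16000 with mem_multiplier = 1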


From 8cd4f8166b757c2de0c9bb46935e49bf38f18401 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 16 Oct 2024 11:09:13 +0200
Subject: [PATCH 248/310] Patching haplotype names

---
 scripts/graph.pan1c_QC.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/graph.pan1c_QC.py b/scripts/graph.pan1c_QC.py
index 08a4217..523f8a4 100644
--- a/scripts/graph.pan1c_QC.py
+++ b/scripts/graph.pan1c_QC.py
@@ -65,6 +65,7 @@ shared_table = {}
 for tsv in args.path:
     gtool = gtool_translation[os.path.basename(tsv).split(".")[0]]
     df = pd.read_csv(tsv, sep="\t").drop(columns="Pangenome.name")
+    df["Path.name"] = df["Path.name"].str.rsplit("#", n=1)[0]
 
     # Path stats
     path_stats[gtool] = {}
-- 
GitLab


From a9d2a1cab0bc5b26bf657947f1acaba2b5f72d03 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 16 Oct 2024 11:17:10 +0200
Subject: [PATCH 249/310] Update graph.pan1c_QC.py

---
 scripts/graph.pan1c_QC.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/graph.pan1c_QC.py b/scripts/graph.pan1c_QC.py
index 523f8a4..3c1bb07 100644
--- a/scripts/graph.pan1c_QC.py
+++ b/scripts/graph.pan1c_QC.py
@@ -65,7 +65,7 @@ shared_table = {}
 for tsv in args.path:
     gtool = gtool_translation[os.path.basename(tsv).split(".")[0]]
     df = pd.read_csv(tsv, sep="\t").drop(columns="Pangenome.name")
-    df["Path.name"] = df["Path.name"].str.rsplit("#", n=1)[0]
+    df["Path.name"] = df["Path.name"].str.rsplit("#", n=1).str[0]
 
     # Path stats
     path_stats[gtool] = {}
-- 
GitLab
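
Note on patches 248/249: pandas Series.str.rsplit() returns a Series of lists, so a plain [0] index picks the first row's list instead of the first token of every row; the .str[0] accessor applies the selection element-wise. A toy illustration (made-up data, not pipeline output):

import pandas as pd

df = pd.DataFrame({"Path.name": ["genomeA#1#chr1", "genomeB#2#chr1"]})
split = df["Path.name"].str.rsplit("#", n=1)   # Series of lists

print(split[0])       # ['genomeA#1', 'chr1']   -> first row only (the bug)
print(split.str[0])   # genomeA#1, genomeB#2    -> first token of every row (the fix)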


From 3c6682d92ee180df5ad4b6c5166610c383d1acc7 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Fri, 18 Oct 2024 08:59:10 +0200
Subject: [PATCH 250/310] Updated GFA to XG conversion

Creating an XG from a GFAv1.1, marking all sequences as reference
---
 rules/tools.smk | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/rules/tools.smk b/rules/tools.smk
index e5d36ba..db1f566 100644
--- a/rules/tools.smk
+++ b/rules/tools.smk
@@ -71,6 +71,13 @@ rule gfa_2_xg:
         app_path=config["app.path"]
     shell:
         """
+        # Converting to GFAv1.1
+        apptainer run --app gfavc {params.app_path}/PanGeTools.sif \
+            --gfa {input} \
+            --outName "{output}.v11.gfa"
+
         apptainer run --app vg {params.app_path}/PanGeTools.sif \
-            convert -g -x -t {threads} {input} > {output}
+            convert -g -x -t {threads} "{output}.v11.gfa" > {output}
+
+        rm "{output}.v11.gfa"
         """
\ No newline at end of file
-- 
GitLab


From d969b59a33384ff39c36e3c698d01701e7c3e2af Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Fri, 18 Oct 2024 09:04:39 +0200
Subject: [PATCH 251/310] Update contig.pos_figs.R

---
 scripts/contig.pos_figs.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/contig.pos_figs.R b/scripts/contig.pos_figs.R
index d83d197..814a60e 100644
--- a/scripts/contig.pos_figs.R
+++ b/scripts/contig.pos_figs.R
@@ -28,7 +28,8 @@ opt = parse_args(opt_parser);
 #print("Reading tsv file ...")
 x = read.table(opt$tsv, sep = "\t", comment.char="^")
 colnames(x) = x[1,]
-haplotypes = sort(colnames(x), method="shell")
+haplotypes = sort(unique(colnames(x)), method="shell")
+print(haplotypes)
 x = x[-1,]
 my.genome <- toGRanges(x)
 nChr = nrow(x)
-- 
GitLab


From dbdaff6895dfd7532fec1d824314c815a589fda9 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Fri, 18 Oct 2024 10:00:00 +0200
Subject: [PATCH 252/310] Update contig.pos_figs.R

---
 scripts/contig.pos_figs.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/contig.pos_figs.R b/scripts/contig.pos_figs.R
index 814a60e..eb3fe19 100644
--- a/scripts/contig.pos_figs.R
+++ b/scripts/contig.pos_figs.R
@@ -28,7 +28,7 @@ opt = parse_args(opt_parser);
 #print("Reading tsv file ...")
 x = read.table(opt$tsv, sep = "\t", comment.char="^")
 colnames(x) = x[1,]
-haplotypes = sort(unique(colnames(x)), method="shell")
+haplotypes = sort(unique(colnames(x[,1])), method="shell")
 print(haplotypes)
 x = x[-1,]
 my.genome <- toGRanges(x)
-- 
GitLab


From d6a5066ed310381907288a39c7c3616b3cf75fa9 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 24 Oct 2024 17:29:47 +0200
Subject: [PATCH 253/310] Added var_json for Pan1c_QC

---
 Snakefile               |  77 ++++++++++++++++++++++++++
 scripts/var.pan1c_QC.py | 117 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 194 insertions(+)
 create mode 100644 scripts/var.pan1c_QC.py

diff --git a/Snakefile b/Snakefile
index 9427ced..a2be6f5 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1074,6 +1074,83 @@ rule vcf_fig:
         #rm {output.vcf_fig}/*.tsv
         """
 
+rule vg_vcf_2_tsv:
+    input:
+        "output/{gtool}."+config['name']+".vcf.gz"
+    output:
+        temp("tmp/var_json/vg_{gtool}.tsv")
+    threads: 1
+    resources:
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000    
+    shell:
+        """
+        zcat {input} | awk -f scripts/vcf_2_tsv_vg.awk > {output}
+        """
+
+rule syri_vcf_2_tsv:
+    input:
+        expand("data/asm.syri.mm2/"+config['name']+".{haplotype}.syri.mm2.vcf.gz", haplotype=SAMPLES_NOREF)
+    output:
+        temp("tmp/var_json/syri_mm2.tsv")
+    threads: 1
+    resources:
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000   
+    params:
+        app_path=config['app.path'],
+        pan_name=config['name'],
+        refname=config['reference'] 
+    shell:
+        """
+        RHAP=$(basename {params.refname} .fa.gz | cut -f1 -d'.')
+        RHAPN=$(basename {params.refname} .fa.gz | cut -f2 -d'.' | cut -f2 -d'p')
+        FOLDER=$(dirname {input[0]})
+
+        #% SyRI VCF MM2
+        ## Going through all SyRI VCFs in the folder
+        for vcf in $FOLDER/*.vcf.gz; do
+            THAP=$(basename $vcf .syri.vcf.gz | cut -f2 -d'.')
+            THAPN=$(basename $vcf .syri.vcf.gz | cut -f3 -d'.' | cut -f2 -d'p')
+
+            # Producing intermediate TSVs
+            zcat $vcf | \
+                awk -v THAP=$THAP -v THAPN=$THAPN -v RHAP=$RHAP -v RHAPN=$RHAPN -f scripts/vcf_2_tsv_syri.awk \
+                > $FOLDER/$(basename $vcf .gz).tsv
+        done
+
+        ## Merging TSVs
+        head -n1 $FOLDER/$(basename $vcf .gz).tsv > {output}
+        tail -n +2  -q $FOLDER/*.vcf.tsv >> {output}
+
+        rm $FOLDER/*.tsv
+        """
+
+def var_json_inputs(wildcards):
+    # Inputs for var_json rule
+    inputs = {}
+    inputs["vg"] = expand("tmp/var_json/vg_{gtool}.tsv", gtool=graph_tools)
+    inputs["syri_mm2"] = "tmp/var_json/syri_mm2.tsv"
+
+    return inputs
+
+rule var_json:
+    # Produce the Variant JSON for Pan1c QC
+    input:
+        unpack(var_json_inputs)
+    output:
+        json="output/report_data/"+config['name']+".var.json"
+    threads: 1
+    resources:
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
+    params:
+        app_path=config['app.path'],
+        refname=config['reference']
+    shell:
+        """
+        apptainer run {params.app_path}/pan1c-env.sif python scripts/var.pan1c_QC.py \
+            --inputs {input.vg} {input.syri_mm2} \
+            --output {output.json}
+        """
+
 rule create_pan1c_report_fig:
     # Produces a markdown report figure of chromosomes graphs
     input:
diff --git a/scripts/var.pan1c_QC.py b/scripts/var.pan1c_QC.py
new file mode 100644
index 0000000..ddca7c5
--- /dev/null
+++ b/scripts/var.pan1c_QC.py
@@ -0,0 +1,117 @@
+"""
+Variant JSON creator for Pan1c-QC
+
+@author: alexis.mergez@inrae.fr
+@version: 1.0
+"""
+
+import os
+import argparse
+import pandas as pd
+import numpy as np
+import json
+
+## Arguments
+arg_parser = argparse.ArgumentParser(description='Variant JSON for Pan1c-QC')
+arg_parser.add_argument(
+    "--input",
+    "-i",
+    dest = "inputs",
+    nargs="+",
+    required = True,
+    help = "TSV(s) of variants. Filename should be the tool used (for example: df_pan1c, syri_mm2, ...)"
+    )
+arg_parser.add_argument(
+    "--output",
+    dest = "output",
+    required = True,
+    help = "Output path"
+    )
+args = arg_parser.parse_args()
+
+## Parsing functions
+def log_transform(x):
+    if x > 0:
+        return np.log10(x)      # log10 for positive values
+    elif x < 0:
+        return -np.log10(-x)    # -log10 of |x| for negative values
+    else:
+        return np.nan 
+
+def log_untransform(x):
+    if x >= 0:
+        return 10**(x)  # for non-negative values
+    else:
+        return -10**(-x)
+
+def parse_tsv(file):
+    # Parsing TSV into dataframe
+    df = pd.read_csv(file, sep="\t")
+    df.rename(columns={"CHROM": "NAME"}, inplace=True)
+    df[["QUERY", "CHROM"]] = df["NAME"].str.rsplit("#", n=1, expand=True)
+    df[["Genome", "Haplotype"]] = df["HAP"].str.rsplit("#", n=1, expand=True)
+    df.drop("NAME", axis=1, inplace=True)
+    df.loc[:,"LEN"] = -df["LEN"]
+
+    # Keeping variants with 50 <= |LEN| <= 100000
+    df = df.query('(-100000 <= LEN <= -50) or (50 <= LEN <= 100000)')
+
+    # Converting the length to Log
+    df.loc[:, "LOGLEN"] = df.loc[:, 'LEN'].apply(log_transform)
+
+    # Binning LOGLEN
+    bins = pd.interval_range(start=-5, freq=0.05, end=5)
+    df.loc[:, 'BIN'] = pd.cut(df.loc[:, 'LOGLEN'], bins=bins, precision=2)
+
+    # Counting variants for each bin, grouped by Query, Chromosome and Haplotype
+    fdata = {}
+    for query in df["QUERY"].unique():
+        fdata[query] = {}
+        
+        for hapid in df["HAP"].unique():
+            fdata[query][hapid] = {}
+            
+            tmp_data = {}
+
+            for chromid in df["CHROM"].unique():
+                
+                sub_df = df.query("QUERY == @query and CHROM == @chromid and HAP == @hapid")
+                bin_counts = sub_df['BIN'].value_counts(sort=False)
+                bin_counts = bin_counts.reindex(bins, fill_value=0)
+
+                # Dropping bins with |length| < 50 bp
+                bin_counts = bin_counts[np.array([(interval.left > np.log10(50)) or (interval.right < -np.log10(50)) for interval in bin_counts.index])]    
+
+                # Saving bin counts for summing later
+                tmp_data[chromid] = bin_counts.values
+
+                fdata[query][hapid][chromid] = list(bin_counts.values)
+                bins_string = [f"({int(round(log_untransform(interval.left), 0))}, {int(round(log_untransform(interval.right), 0))}]" for interval in bin_counts.index]
+                
+                if "index" in fdata: 
+                    assert (np.array(fdata["index"])==np.array(bins_string)).all()
+                else:
+                    fdata["index"] = bins_string
+            
+            all_chrom = pd.DataFrame.from_dict(tmp_data, orient='columns')
+            all_chrom.index = fdata["index"]
+            fdata[query][hapid]["All"] = list(all_chrom.sum(axis=1).values)
+
+    return fdata
+
+## Parsing all TSVs and aggregating into a final dictionary
+data = {}
+for file in args.inputs:
+    tool_name = os.path.basename(file).rsplit('.', 1)[0]
+    print(tool_name)
+    data[tool_name] = parse_tsv(file)
+
+    if "index" in data:
+        assert (np.array(data["index"])==np.array(data[tool_name]["index"])).all()
+    else:
+        data["index"] = data[tool_name]["index"]
+
+    del data[tool_name]["index"]
+
+with open(args.output, "w") as handle:
+    json.dump(data, handle, indent=6)
-- 
GitLab
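
Note on the script above: var.pan1c_QC.py places signed variant lengths on a symmetric log10 axis (negative for deletions, positive for insertions) and counts them in 0.05-wide bins between -5 and 5 before writing the counts to JSON. A short numeric sketch of that transform and binning, using made-up lengths:

import numpy as np
import pandas as pd

def log_transform(x):
    # Symmetric log10: the sign is kept, the magnitude is log-scaled.
    if x > 0:
        return np.log10(x)
    elif x < 0:
        return -np.log10(-x)
    return np.nan

lengths = pd.Series([-5000, -120, -60, 75, 300, 9000])   # signed variant lengths (bp)
loglen = lengths.apply(log_transform)                     # e.g. 75 -> ~1.88, -120 -> ~-2.08

bins = pd.interval_range(start=-5, end=5, freq=0.05)      # 0.05-wide bins on the log axis
binned = pd.cut(loglen, bins=bins, precision=2)
print(pd.concat([lengths, loglen.round(2), binned], axis=1, keys=["LEN", "LOGLEN", "BIN"]))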


From acab2126387a5ed079f574947dee057ee3b51643 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 24 Oct 2024 17:32:47 +0200
Subject: [PATCH 254/310] Update Snakefile

---
 Snakefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Snakefile b/Snakefile
index a2be6f5..9c2e080 100644
--- a/Snakefile
+++ b/Snakefile
@@ -80,6 +80,7 @@ def which_analysis():
         analysis_inputs.append(
             expand("output/{gtool}.vcf.figs", gtool=graph_tools)
         )
+        analysis_inputs.append("output/report_data/"+config['name']+".var.json")
 
     return analysis_inputs
 
-- 
GitLab


From 525c57db9209d067bc2f9aaf71474dd2486db7ff Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 25 Oct 2024 08:36:11 +0200
Subject: [PATCH 255/310] Typo

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 9c2e080..dd1f2d1 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1148,7 +1148,7 @@ rule var_json:
     shell:
         """
         apptainer run {params.app_path}/pan1c-env.sif python scripts/var.pan1c_QC.py \
-            --inputs {input.vg} {input.syri_mm2} \
+            --input {input.vg} {input.syri_mm2} \
             --output {output.json}
         """
 
-- 
GitLab
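
Note on this typo: in var.pan1c_QC.py the command-line flag is --input/-i while dest="inputs" names the Python attribute, so the trailing "s" exists only on the attribute side. A tiny standalone illustration of the flag/dest split:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--input", "-i", dest="inputs", nargs="+", required=True)

args = parser.parse_args(["--input", "a.tsv", "b.tsv"])
print(args.inputs)   # ['a.tsv', 'b.tsv'] -- attribute named by dest, not by the flag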


From 2dbd73f02c4fed377c5cc4329bef37ed8d18483b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 25 Oct 2024 09:37:41 +0200
Subject: [PATCH 256/310] Bumped memory for var_json

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index dd1f2d1..fbbc6ef 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1141,7 +1141,7 @@ rule var_json:
         json="output/report_data/"+config['name']+".var.json"
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 48000
     params:
         app_path=config['app.path'],
         refname=config['reference']
-- 
GitLab


From 9734a1bcf5daa3e387e8781f582fde06adfac5af Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 25 Oct 2024 10:53:24 +0200
Subject: [PATCH 257/310] Fixed var_json

---
 scripts/var.pan1c_QC.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/scripts/var.pan1c_QC.py b/scripts/var.pan1c_QC.py
index ddca7c5..a667f3c 100644
--- a/scripts/var.pan1c_QC.py
+++ b/scripts/var.pan1c_QC.py
@@ -85,7 +85,7 @@ def parse_tsv(file):
                 # Saving bin counts for summing later
                 tmp_data[chromid] = bin_counts.values
 
-                fdata[query][hapid][chromid] = list(bin_counts.values)
+                fdata[query][hapid][chromid] = ';'.join([str(k) for k in list(bin_counts.values)])
                 bins_string = [f"({int(round(log_untransform(interval.left), 0))}, {int(round(log_untransform(interval.right), 0))}]" for interval in bin_counts.index]
                 
                 if "index" in fdata: 
@@ -95,19 +95,20 @@ def parse_tsv(file):
             
             all_chrom = pd.DataFrame.from_dict(tmp_data, orient='columns')
             all_chrom.index = fdata["index"]
-            fdata[query][hapid]["All"] = list(all_chrom.sum(axis=1).values)
-
+            fdata[query][hapid]["All"] = ';'.join([str(k) for k in list(all_chrom.sum(axis=1).values)])
+    
+    fdata["index"] = ";".join(fdata["index"])
     return fdata
 
 ## Parsing all TSVs and aggregating into a final dictionary
 data = {}
-for file in args.inputs:
+for file in args_inputs:
     tool_name = os.path.basename(file).rsplit('.', 1)[0]
     print(tool_name)
     data[tool_name] = parse_tsv(file)
 
     if "index" in data:
-        assert (np.array(data["index"])==np.array(data[tool_name]["index"])).all()
+        assert (data["index"]==data[tool_name]["index"])
     else:
         data["index"] = data[tool_name]["index"]
 
-- 
GitLab


From 84c0c2d46a390f244a842af21465a6f0656a0185 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 25 Oct 2024 10:57:33 +0200
Subject: [PATCH 258/310] Typo

---
 scripts/var.pan1c_QC.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/var.pan1c_QC.py b/scripts/var.pan1c_QC.py
index a667f3c..7e6e886 100644
--- a/scripts/var.pan1c_QC.py
+++ b/scripts/var.pan1c_QC.py
@@ -102,7 +102,7 @@ def parse_tsv(file):
 
 ## Parsing all TSVs and aggregating into a final dictionary
 data = {}
-for file in args_inputs:
+for file in args.inputs:
     tool_name = os.path.basename(file).rsplit('.', 1)[0]
     print(tool_name)
     data[tool_name] = parse_tsv(file)
-- 
GitLab


From 0b31478ab6a5b3345a633b039edfdd5762fe2afa Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 25 Oct 2024 11:54:58 +0200
Subject: [PATCH 259/310] Better keys for var_json

---
 scripts/var.pan1c_QC.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/scripts/var.pan1c_QC.py b/scripts/var.pan1c_QC.py
index 7e6e886..17043ea 100644
--- a/scripts/var.pan1c_QC.py
+++ b/scripts/var.pan1c_QC.py
@@ -101,18 +101,25 @@ def parse_tsv(file):
     return fdata
 
 ## Parsing all TSVs and aggregating into a final dictionary
-data = {}
+data = {"x": {}, "y": {}}
 for file in args.inputs:
     tool_name = os.path.basename(file).rsplit('.', 1)[0]
     print(tool_name)
-    data[tool_name] = parse_tsv(file)
+    data["x"][tool_name] = parse_tsv(file)
 
     if "index" in data:
-        assert (data["index"]==data[tool_name]["index"])
+        assert (data["y"]["bin"]==data[tool_name]["index"])
     else:
-        data["index"] = data[tool_name]["index"]
+        data["y"]["bin"] = data["x"][tool_name]["index"]
 
-    del data[tool_name]["index"]
+    del data["x"][tool_name]["index"]
+
+data["y"]["logy"] = ';'.join(
+    [
+        str(np.array(interval[1:-1].split(", ")).astype('int').mean()) # Mean coordinate between the min and max of each bin
+        for interval in data["y"]["bin"].split(";")
+    ]
+)
 
 with open(args.output, "w") as handle:
     json.dump(data, handle, indent=6)
-- 
GitLab


From 7145f75999bc8e7d261faa01168018c65de716a8 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 25 Oct 2024 11:56:57 +0200
Subject: [PATCH 260/310] Fixed inverted x and y

---
 scripts/var.pan1c_QC.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scripts/var.pan1c_QC.py b/scripts/var.pan1c_QC.py
index 17043ea..1331aed 100644
--- a/scripts/var.pan1c_QC.py
+++ b/scripts/var.pan1c_QC.py
@@ -105,19 +105,19 @@ data = {"x": {}, "y": {}}
 for file in args.inputs:
     tool_name = os.path.basename(file).rsplit('.', 1)[0]
     print(tool_name)
-    data["x"][tool_name] = parse_tsv(file)
+    data["y"][tool_name] = parse_tsv(file)
 
     if "index" in data:
-        assert (data["y"]["bin"]==data[tool_name]["index"])
+        assert (data["x"]["bin"]==data["y"][tool_name]["index"])
     else:
-        data["y"]["bin"] = data["x"][tool_name]["index"]
+        data["x"]["bin"] = data["y"][tool_name]["index"]
 
-    del data["x"][tool_name]["index"]
+    del data["y"][tool_name]["index"]
 
-data["y"]["logy"] = ';'.join(
+data["x"]["logx"] = ';'.join(
     [
         str(np.array(interval[1:-1].split(", ")).astype('int').mean()) # Mean coordinate between the min and max of each bin
-        for interval in data["y"]["bin"].split(";")
+        for interval in data["x"]["bin"].split(";")
     ]
 )
 
-- 
GitLab


From ce57f61609b1a721d86d6ee5a43c20a9c4d743fb Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 25 Oct 2024 15:03:42 +0200
Subject: [PATCH 261/310] Renaming

---
 Snakefile                 | 216 ++++++++++++++++++--------------------
 scripts/graph.pan1c_QC.py |   4 +-
 2 files changed, 108 insertions(+), 112 deletions(-)

diff --git a/Snakefile b/Snakefile
index fbbc6ef..6684e67 100644
--- a/Snakefile
+++ b/Snakefile
@@ -36,17 +36,17 @@ nHAP = len(SAMPLES)
 with gzip.open("data/haplotypes/"+config['reference'], "r") as handle:
     CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"]
 
-graph_tools = ["pan1c"] + (config["get_MC"] == "True")*["MC"] 
+graph_tools = ["PGGB"] + (config["get_MC"] == "True")*["MC"] 
 
 # Adding optional output based on config.yaml, using the following function
 def which_analysis():
     
     ## Default analysis
     analysis_inputs = [     
-        expand("output/stats/{gtool}."+config['name']+".core.stats.tsv", gtool=graph_tools), # core stats
-        expand("output/panacus.reports/{gtool}."+config['name']+".{chromosome}.histgrowth.html", chromosome=CHRLIST, gtool=graph_tools), # panacus histgrowth 
-        expand("output/chrGraphs.figs/{gtool}."+config['name']+".{chromosome}.1Dviz.png", chromosome=CHRLIST, gtool=graph_tools), # visualizations from odgi on chromosome graphs
-        expand("output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv", gtool=graph_tools) # chromosomes graph statistics
+        expand("output/stats/Pan1c.{gtool}."+config['name']+".core.stats.tsv", gtool=graph_tools), # core stats
+        expand("output/panacus.reports/Pan1c.{gtool}."+config['name']+".{chromosome}.histgrowth.html", chromosome=CHRLIST, gtool=graph_tools), # panacus histgrowth 
+        expand("output/chrGraphs.figs/Pan1c.{gtool}."+config['name']+".{chromosome}.1Dviz.png", chromosome=CHRLIST, gtool=graph_tools), # visualizations from odgi on chromosome graphs
+        expand("output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.general.stats.tsv", gtool=graph_tools) # chromosomes graph statistics
     ]
     
     ## Optional analysis steps
@@ -55,41 +55,35 @@ def which_analysis():
 
     if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each input assembly 
         analysis_inputs.append(
-            expand("output/asm.syri.figs/"+config['name']+".{haplotype}.syri.{tool}.png", haplotype=SAMPLES_NOREF, tool=["mm2"])
+            expand("output/asm.syri.figs/Pan1c."+config['name']+".{haplotype}.syri_{tool}.png", haplotype=SAMPLES_NOREF, tool=["mm2"])
         )
     if config["get_chrInputs_SyRI"] == "True": # Creating SyRI figures for each PGGB input
         analysis_inputs.append(
-            expand("output/chrInput.syri.figs/"+config['name']+".{chromosome}.syri.png", chromosome=CHRLIST)
+            expand("output/chrInput.syri.figs/Pan1c."+config['name']+".{chromosome}.syri_mm2.png", chromosome=CHRLIST)
         )
     if config["run_Quast"] == "True": # Running Quast on input haplotypes
         analysis_inputs.append(
-            "output/"+config['name']+".quast.report.html"
+            "output/Pan1c."+config['name']+".quast.report.html"
         )
     if config["get_contig_pos"] == "True": # Chromosome decomposition into its contig figure
         analysis_inputs.append(
-            expand("output/chr.contig/{haplotype}.contig.png", haplotype=CHRLIST) 
+            expand("output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png", chromosome=CHRLIST) 
         )
 
         if config["create_report"] == "True": # Creating report (need contig)
             analysis_inputs.append(
-                expand("output/{gtool}."+config['name']+".report.md", gtool=graph_tools)
+                expand("output/Pan1c.{gtool}."+config['name']+".report.md", gtool=graph_tools)
             )
-            analysis_inputs.append("output/report_data/"+config['name']+".assembly.json")
-            analysis_inputs.append("output/report_data/"+config['name']+".graph.json")
+            analysis_inputs.append("output/report_data/Pan1c."+config['name']+".assembly.json")
+            analysis_inputs.append("output/report_data/Pan1c."+config['name']+".graph.json")
     if config["get_VCF"] == "True": # VCF from the final graph against the "reference"
         analysis_inputs.append(
             expand("output/{gtool}.vcf.figs", gtool=graph_tools)
         )
-        analysis_inputs.append("output/report_data/"+config['name']+".var.json")
+        analysis_inputs.append("output/report_data/Pan1c."+config['name']+".var.json")
 
     return analysis_inputs
 
-"""
-Functions   ---------------------------------------------------------------------------------------
-"""
-def get_mem_mb(wildcards, attempt, threads, multiplier=config["mem_multiplier"]):
-    return attempt * multiplier * threads
-
 """
 Rules   -------------------------------------------------------------------------------------------
 """
@@ -97,8 +91,8 @@ Rules   ------------------------------------------------------------------------
 # Main target rule
 rule all:
     input:
-        expand("output/{gtool}."+config['name']+".gfa.gz", gtool=graph_tools), # Final graph (main output)
-        "output/pan1c."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line)
+        expand("output/Pan1c.{gtool}."+config['name']+".gfa.gz", gtool=graph_tools), # Final graph (main output)
+        "output/Pan1c."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line)
         which_analysis()
 
 """
@@ -153,7 +147,7 @@ rule quast_stats:
         fas=expand("data/haplotypes/{haplotype}.fa.gz", haplotype=SAMPLES_NOREF),
         ref="data/haplotypes/"+config['reference']
     output:
-        report="output/"+config['name']+".quast.report.html"
+        report="output/Pan1c."+config['name']+".quast.report.html"
     threads: 16
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -213,13 +207,13 @@ rule assemblathon_stats:
         mv data/haplotypes/{wildcards.haplotype}.csv {output.csv}
         """
 
-rule contig_position:
+rule contig_positions:
     # Produce figures with contig positions
     input:
         fa="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz",
         fai="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz.fai"
     output:
-        fig="output/chr.contig/{chromosome}.contig.png",
+        fig="output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png",
         outdir=temp(directory("output/chr.contig/{chromosome}"))
     threads: 1
     resources:
@@ -290,8 +284,8 @@ rule SyRI_on_ASM_mm2:
         ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
         qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
     output:
-        fig="output/asm.syri.figs/"+config['name']+".{haplotype}.syri.mm2.png",
-        vcf="data/asm.syri.mm2/"+config['name']+".{haplotype}.syri.mm2.vcf.gz"
+        fig="output/asm.syri.figs/Pan1c."+config['name']+".{haplotype}.syri_mm2.png",
+        vcf="data/asm.syri.mm2/Pan1c."+config['name']+".{haplotype}.syri.mm2.vcf.gz"
     log: 
         cmd="logs/SyRI_ASM/{haplotype}.SyRI_ASM.mm2.cmd.log",
         time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.mm2.time.log"
@@ -335,8 +329,8 @@ rule SyRI_on_ASM_wfm:
         ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
         qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
     output:
-        fig="output/asm.syri.figs/"+config['name']+".{haplotype}.syri.wfm.png",
-        vcf="data/asm.syri.wfm/"+config['name']+".{haplotype}.syri.wfm.vcf.gz"
+        fig="output/asm.syri.figs/Pan1c."+config['name']+".{haplotype}.syri_wfm.png",
+        vcf="data/asm.syri.wfm/Pan1c."+config['name']+".{haplotype}.syri.wfm.vcf.gz"
     log: 
         cmd="logs/SyRI_ASM/{haplotype}.SyRI_ASM.wfm.cmd.log",
         time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.wfm.time.log"
@@ -384,19 +378,19 @@ def asm_json_inputs(wildcards):
 
     if config["get_contig_pos"] == "True":
         sections["contig_pos"] = expand(
-            "output/chr.contig/{chromosome}.contig.png",
+            "output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png",
             chromosome=CHRLIST
         )
 
     if config["get_ASMs_SyRI"] == "True":
         sections["SyRI_on_ASMs_figs"] = expand(
-            "output/asm.syri.figs/"+config['name']+".{haplotype}.syri.mm2.png", 
+            "output/asm.syri.figs/Pan1c."+config['name']+".{haplotype}.syri_mm2.png", 
             haplotype=SAMPLES_NOREF
         )
 
     if config["get_chrInputs_SyRI"] == "True":
         sections["SyRI_on_chrInputs_figs"] = expand(
-            "output/chrInput.syri.figs/"+config['name']+".{chromosome}.syri.png", 
+            "output/chrInput.syri.figs/Pan1c."+config['name']+".{chromosome}.syri_mm2.png", 
             chromosome=CHRLIST
         )
 
@@ -407,8 +401,8 @@ rule asm_json:
     input:
         unpack(asm_json_inputs)
     output:
-        json="output/report_data/"+config['name']+".assembly.json",
-        merged="output/report_data/"+config['name']+".assemblathon_stats.tsv"
+        json="output/report_data/Pan1c."+config['name']+".assembly.json",
+        merged="output/report_data/Pan1c."+config['name']+".assemblathon_stats.tsv"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
@@ -444,7 +438,7 @@ rule SyRI_on_chrInput:
     input:
         fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz'
     output:
-        fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.syri.png"
+        fig="output/chrInput.syri.figs/Pan1c."+config['name']+".{chromosome}.syri_mm2.png"
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000
@@ -497,10 +491,10 @@ rule wfmash_on_chr:
         fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz',
         fai='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai'
     output:
-        mapping=temp("data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.wfmash.mapping.paf"),
-        aln=temp("data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.wfmash.aln.paf"),
-        mapping_gz="data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.wfmash.mapping.paf.gz",
-        aln_gz="data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.wfmash.aln.paf.gz"
+        mapping=temp("data/chrGraphs/PGGB.{chromosome}/Pan1c."+config['name']+".{chromosome}.wfmash.mapping.paf"),
+        aln=temp("data/chrGraphs/PGGB.{chromosome}/Pan1c."+config['name']+".{chromosome}.wfmash.aln.paf"),
+        mapping_gz="data/chrGraphs/PGGB.{chromosome}/Pan1c."+config['name']+".{chromosome}.wfmash.mapping.paf.gz",
+        aln_gz="data/chrGraphs/PGGB.{chromosome}/Pan1c."+config['name']+".{chromosome}.wfmash.aln.paf.gz"
     threads: 16
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
@@ -553,7 +547,7 @@ rule seqwish:
         fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz',
         aln=rules.wfmash_on_chr.output.aln_gz
     output:
-        gfa_gz="data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.seqwish.gfa.gz"
+        gfa_gz="data/chrGraphs/PGGB.{chromosome}/Pan1c."+config['name']+".{chromosome}.seqwish.gfa.gz"
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -585,8 +579,8 @@ rule gfaffix_on_chr:
     input:
         rules.seqwish.output.gfa_gz
     output:
-        gfa_gz="data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.seqwish.gfaffixD.gfa.gz",
-        transform="data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.seqwish.gfaffixD.transform.txt"
+        gfa_gz="data/chrGraphs/PGGB.{chromosome}/Pan1c."+config['name']+".{chromosome}.seqwish.gfaffixD.gfa.gz",
+        transform="data/chrGraphs/PGGB.{chromosome}/Pan1c."+config['name']+".{chromosome}.seqwish.gfaffixD.transform.txt"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 24000
@@ -617,10 +611,10 @@ rule gfaffix_on_chr:
 rule odgi_postprocessing:
     # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph
     input:
-        tags="output/pan1c."+config['name']+".gfa.metadata",
+        tags="output/Pan1c."+config['name']+".gfa.metadata",
         gfa_gz=rules.gfaffix_on_chr.output.gfa_gz
     output:
-        gfa_gz='data/chrGraphs/pan1c.'+config['name']+'.{chromosome}.gfa.gz'
+        gfa_gz="data/chrGraphs/Pan1c.PGGB."+config['name']+".{chromosome}.gfa.gz"
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -686,10 +680,10 @@ rule odgi_postprocessing:
 
 rule MC_graph:
     input:
-        tags="output/pan1c."+config['name']+".gfa.metadata",
+        tags="output/Pan1c."+config['name']+".gfa.metadata",
         fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz'
     output:
-        gfa_gz='data/chrGraphs/MC.'+config['name']+'.{chromosome}.gfa.gz'
+        gfa_gz='data/chrGraphs/Pan1c.MC.'+config['name']+'.{chromosome}.gfa.gz'
     threads: 16
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000
@@ -740,9 +734,9 @@ rule MC_graph:
 rule generate_graph_list:
     # Generate a text file containing all created graphs
     input:
-        gfas=expand('data/chrGraphs/{{gtool}}.'+config['name']+'.{chromosome}.tmp.gfa', chromosome=CHRLIST)
+        gfas=expand('data/chrGraphs/Pan1c.{{gtool}}.'+config['name']+'.{chromosome}.tmp.gfa', chromosome=CHRLIST)
     output:
-        "data/chrGraphs/graphsList.{gtool}.txt"
+        temp("data/chrGraphs/graphsList.{gtool}.txt")
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -756,13 +750,13 @@ rule graph_squeeze:
     # Using odgi to merge every subgraphs into a final one
     input:
         glist="data/chrGraphs/graphsList.{gtool}.txt",
-        tags="output/pan1c."+config['name']+".gfa.metadata",
-        graphs=expand('data/chrGraphs/{{gtool}}.'+config['name']+'.{chromosome}.tmp.gfa', chromosome=CHRLIST)
+        tags="output/Pan1c."+config['name']+".gfa.metadata",
+        graphs=expand('data/chrGraphs/Pan1c.{{gtool}}.'+config['name']+'.{chromosome}.tmp.gfa', chromosome=CHRLIST)
     output:
-        gfa_gz="output/{gtool}."+config['name']+".gfa.gz"
+        gfa_gz="output/Pan1c.{gtool}."+config['name']+".gfa.gz"
     log: 
-        cmd="logs/squeeze/{gtool}."+config['name']+".squeeze.cmd.log",
-        time="logs/squeeze/{gtool}."+config['name']+".squeeze.time.log",
+        cmd="logs/squeeze/Pan1c.{gtool}."+config['name']+".squeeze.cmd.log",
+        time="logs/squeeze/Pan1c.{gtool}."+config['name']+".squeeze.time.log",
     threads: 16
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
@@ -794,10 +788,10 @@ rule graph_squeeze:
 rule graph_stats:
     # Using GFAstats to produce stats on every chromosome graphs
     input:
-        graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.gfa.gz'
+        graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.gfa.gz'
     output:
-        genstats="output/stats/chrGraphs.{gtool}/{gtool}."+config['name']+".{chromosome}.general.stats.tsv",
-        pathstats="output/stats/chrGraphs.{gtool}/{gtool}."+config['name']+".{chromosome}.path.stats.tsv"
+        genstats="output/stats/chrGraphs.{gtool}/Pan1c.{gtool}."+config['name']+".{chromosome}.general.stats.tsv",
+        pathstats="output/stats/chrGraphs.{gtool}/Pan1c.{gtool}."+config['name']+".{chromosome}.path.stats.tsv"
     threads: 4
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
@@ -815,10 +809,10 @@ rule graph_stats:
 rule graph_figs:
     # Creating figures using odgi viz 
     input:
-        graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.tmp.gfa'
+        graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.tmp.gfa'
     output:
-        oneDviz="output/chrGraphs.figs/{gtool}."+config['name']+".{chromosome}.1Dviz.png",
-        pcov="output/chrGraphs.figs/{gtool}."+config['name']+".{chromosome}.pcov.png"
+        oneDviz="output/chrGraphs.figs/Pan1c.{gtool}."+config['name']+".{chromosome}.1Dviz.png",
+        pcov="output/chrGraphs.figs/Pan1c.{gtool}."+config['name']+".{chromosome}.pcov.png"
     threads: 4
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
@@ -841,10 +835,10 @@ rule graph_figs:
 rule aggregate_graphs_stats:
     # Reading and merging all stats files from chromosome graphs into a .tsv.
     input:
-        genstats=expand("output/stats/chrGraphs.{{gtool}}/{{gtool}}."+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST)
+        genstats=expand("output/stats/chrGraphs.{{gtool}}/Pan1c.{{gtool}}."+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST)
     output:
-        genstats="output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv",
-        pathstats="output/stats/{gtool}."+config['name']+".chrGraph.path.stats.tsv"
+        genstats="output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.general.stats.tsv",
+        pathstats="output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.path.stats.tsv"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
@@ -865,8 +859,8 @@ rule get_graph_tags:
     input:
         "config.yaml"
     output:
-        md="output/pan1c."+config['name']+".gfa.metadata",
-        json="output/report_data/"+config['name']+".tags.json"
+        md="output/Pan1c."+config['name']+".gfa.metadata",
+        json="output/report_data/Pan1c."+config['name']+".tags.json"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
@@ -883,9 +877,9 @@ rule get_graph_tags:
 rule pggb_input_stats:
     # Produces statistics on pggb input sequences
     input:
-        flag="output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv"
+        flag="output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.general.stats.tsv"
     output:
-        "output/stats/{gtool}."+config['name']+".chrInput.stats.tsv"
+        "output/stats/Pan1c.{gtool}."+config['name']+".chrInput.stats.tsv"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000
@@ -901,10 +895,10 @@ rule pggb_input_stats:
 rule core_statistics:
     # Aggregate chrInput, chrGraph and pggb statistics into a single tsv 
     input:
-        chrInputStats = "output/stats/{gtool}."+config['name']+".chrInput.stats.tsv",
-        chrGraphStats = "output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv"
+        chrInputStats = "output/stats/Pan1c.{gtool}."+config['name']+".chrInput.stats.tsv",
+        chrGraphStats = "output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.general.stats.tsv"
     output:
-        tsv = "output/stats/{gtool}."+config['name']+".core.stats.tsv"
+        tsv = "output/stats/Pan1c.{gtool}."+config['name']+".core.stats.tsv"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
@@ -921,11 +915,11 @@ rule core_statistics:
 rule graph_json:
     # Produce the Graph JSON for Pan1c QC
     input:
-        genstats = expand("output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv", gtool=graph_tools),
-        pathstats = expand("output/stats/{gtool}."+config['name']+".chrGraph.path.stats.tsv", gtool=graph_tools),
-        odgifigs = expand("output/report/{gtool}."+config['name']+".{chromosome}.report.fig.png", gtool=graph_tools, chromosome=CHRLIST)
+        genstats = expand("output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.general.stats.tsv", gtool=graph_tools),
+        pathstats = expand("output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.path.stats.tsv", gtool=graph_tools),
+        odgifigs = expand("output/report/Pan1c.{gtool}."+config['name']+".{chromosome}.report.fig.png", gtool=graph_tools, chromosome=CHRLIST)
     output:
-        json="output/report_data/"+config['name']+".graph.json"
+        json="output/report_data/Pan1c."+config['name']+".graph.json"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
@@ -968,12 +962,12 @@ rule get_pav:
 rule panacus_stats:
     # Produces panacus reports for a chromosome graph
     input:
-        graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.tmp.gfa'
+        graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.tmp.gfa'
     output:
-        html='output/panacus.reports/{gtool}.'+config['name']+'.{chromosome}.histgrowth.html'
+        html='output/panacus.reports/Pan1c.{gtool}.'+config['name']+'.{chromosome}.histgrowth.html'
     log: 
-        cmd="logs/panacus/{gtool}.{chromosome}.panacus.cmd.log",
-        time="logs/panacus/{gtool}.{chromosome}.panacus.time.log"
+        cmd="logs/panacus/Pan1c.{gtool}.{chromosome}.panacus.cmd.log",
+        time="logs/panacus/Pan1c.{gtool}.{chromosome}.panacus.time.log"
     params:
         app_path=config['app.path'],
         pan_name=config['name'],
@@ -997,9 +991,9 @@ rule panacus_stats:
 rule vg_deconstruct:
     # Produce a VCF based on the "reference" haplotype
     input:
-        graph="output/{gtool}."+config['name']+".xg",
+        graph="output/Pan1c.{gtool}."+config['name']+".xg",
     output:
-        vcf=temp("output/{gtool}."+config['name']+".vcf"),
+        vcf=temp("output/Pan1c.{gtool}."+config['name']+".vcf"),
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000
@@ -1007,8 +1001,8 @@ rule vg_deconstruct:
         app_path=config['app.path'],
         ref=config['reference']
     log: 
-        cmd="logs/vg_deconstruct/{gtool}.vg_deconstruct.cmd.log",
-        time="logs/vg_deconstruct/{gtool}.vg_deconstruct.time.log"
+        cmd="logs/vg_deconstruct/Pan1c.{gtool}.vg_deconstruct.cmd.log",
+        time="logs/vg_deconstruct/Pan1c.{gtool}.vg_deconstruct.time.log"
     shell:
         """
         /usr/bin/time -v -o {log.time} \
@@ -1024,8 +1018,8 @@ rule vg_deconstruct:
 rule vcf_fig:
     # Produce a figure describing INS/DEL length distribution from vg deconstruct and SyRI
     input:
-        vg="output/{gtool}."+config['name']+".vcf.gz",
-        syris_mm2=expand("data/asm.syri.mm2/"+config['name']+".{haplotype}.syri.mm2.vcf.gz", haplotype=SAMPLES_NOREF)
+        vg="output/Pan1c.{gtool}."+config['name']+".vcf.gz",
+        syris_mm2=expand("data/asm.syri.mm2/Pan1c."+config['name']+".{haplotype}.syri.mm2.vcf.gz", haplotype=SAMPLES_NOREF)
     output:
         vcf_fig=directory("output/{gtool}.vcf.figs")
     threads: 1
@@ -1077,7 +1071,7 @@ rule vcf_fig:
 
 rule vg_vcf_2_tsv:
     input:
-        "output/{gtool}."+config['name']+".vcf.gz"
+        "output/Pan1c.{gtool}."+config['name']+".vcf.gz"
     output:
         temp("tmp/var_json/vg_{gtool}.tsv")
     threads: 1
@@ -1090,7 +1084,7 @@ rule vg_vcf_2_tsv:
 
 rule syri_vcf_2_tsv:
     input:
-        expand("data/asm.syri.mm2/"+config['name']+".{haplotype}.syri.mm2.vcf.gz", haplotype=SAMPLES_NOREF)
+        expand("data/asm.syri.mm2/Pan1c."+config['name']+".{haplotype}.syri.mm2.vcf.gz", haplotype=SAMPLES_NOREF)
     output:
         temp("tmp/var_json/syri_mm2.tsv")
     threads: 1
@@ -1138,7 +1132,7 @@ rule var_json:
     input:
         unpack(var_json_inputs)
     output:
-        json="output/report_data/"+config['name']+".var.json"
+        json="output/report_data/Pan1c."+config['name']+".var.json"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 48000
@@ -1155,12 +1149,12 @@ rule var_json:
 rule create_pan1c_report_fig:
     # Produces a markdown report figure of chromosomes graphs
     input:
-        graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.tmp.gfa',
-        contigfig="output/chr.contig/{chromosome}.contig.png",
+        graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.tmp.gfa',
+        contigfig="output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png",
     output:
-        odgifig=temp("tmp/{gtool}.{chromosome}.odgi.png"),
-        namefig=temp("tmp/{gtool}.{chromosome}.name.png"),
-        reportfig="output/report/{gtool}."+config['name']+".{chromosome}.report.fig.png"
+        odgifig=temp("tmp/Pan1c.{gtool}.{chromosome}.odgi.png"),
+        namefig=temp("tmp/Pan1c.{gtool}.{chromosome}.name.png"),
+        reportfig="output/report/Pan1c.{gtool}."+config['name']+".{chromosome}.report.fig.png"
     threads: 4
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
@@ -1198,14 +1192,14 @@ rule create_pan1c_report_fig:
 rule create_chrGraphs_figs:
     # Produce figures based on aggregated path stats
     input:
-        pathstats="output/stats/{gtool}."+config['name']+".chrGraph.path.stats.tsv"
+        pathstats="output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.path.stats.tsv"
     output:
-        barplots=expand("output/chrGraphs.stats.figs/{{gtool}}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST),
-        scatters=expand("output/chrGraphs.stats.figs/{{gtool}}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST),
-        heatmaps=expand("output/chrGraphs.stats.figs/{{gtool}}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST),
-        barplot_mean="output/chrGraphs.stats.figs/{gtool}."+config['name']+".path.decomp.mean.png",
-        scatter_mean="output/chrGraphs.stats.figs/{gtool}."+config['name']+".2D.scatter.mean.png",
-        heatmap_diff="output/chrGraphs.stats.figs/{gtool}."+config['name']+".shared.content.diff.png"
+        barplots=expand("output/chrGraphs.stats.figs/Pan1c.{{gtool}}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST),
+        scatters=expand("output/chrGraphs.stats.figs/Pan1c.{{gtool}}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST),
+        heatmaps=expand("output/chrGraphs.stats.figs/Pan1c.{{gtool}}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST),
+        barplot_mean="output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".path.decomp.mean.png",
+        scatter_mean="output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".2D.scatter.mean.png",
+        heatmap_diff="output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".shared.content.diff.png"
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
@@ -1232,26 +1226,26 @@ def get_report_sections(wildcards):
     """
     sections = dict()
 
-    sections["metadata"] = "output/pan1c."+config['name']+".gfa.metadata"
-    sections["odgifigs"] = expand("output/report/{gtool}."+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
-    sections["genstats"] = f"output/stats/{wildcards.gtool}."+config['name']+".chrGraph.general.stats.tsv"
-    sections["pathstats"] = f"output/stats/{wildcards.gtool}."+config['name']+".chrGraph.path.stats.tsv"
-    sections["barplots"] = expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
-    sections["scatters"] = expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
-    sections["heatmaps"] = expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
-    sections["barplot_mean"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".path.decomp.mean.png"
-    sections["scatter_mean"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".2D.scatter.mean.png"
-    sections["heatmap_diff"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".shared.content.diff.png"
+    sections["metadata"] = "output/Pan1c."+config['name']+".gfa.metadata"
+    sections["odgifigs"] = expand("output/report/Pan1c.{gtool}."+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
+    sections["genstats"] = f"output/stats/Pan1c.{wildcards.gtool}."+config['name']+".chrGraph.general.stats.tsv"
+    sections["pathstats"] = f"output/stats/Pan1c.{wildcards.gtool}."+config['name']+".chrGraph.path.stats.tsv"
+    sections["barplots"] = expand("output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
+    sections["scatters"] = expand("output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
+    sections["heatmaps"] = expand("output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
+    sections["barplot_mean"] = f"output/chrGraphs.stats.figs/Pan1c.{wildcards.gtool}."+config['name']+".path.decomp.mean.png"
+    sections["scatter_mean"] = f"output/chrGraphs.stats.figs/Pan1c.{wildcards.gtool}."+config['name']+".2D.scatter.mean.png"
+    sections["heatmap_diff"] = f"output/chrGraphs.stats.figs/Pan1c.{wildcards.gtool}."+config['name']+".shared.content.diff.png"
 
     if config["get_ASMs_SyRI"] == "True":
         sections["SyRI_on_ASMs_figs"] = expand(
-            "output/asm.syri.figs/"+config['name']+".{haplotype}.syri.mm2.png", 
+            "output/asm.syri.figs/Pan1c."+config['name']+".{haplotype}.syri_mm2.png", 
             haplotype=SAMPLES_NOREF
             )
 
     if config["get_chrInputs_SyRI"] == "True":
         sections["SyRI_on_chrInputs_figs"] = expand(
-            "output/chrInput.syri.figs/"+config['name']+".{chromosome}.syri.png", 
+            "output/chrInput.syri.figs/Pan1c."+config['name']+".{chromosome}.syri_mm2.png", 
             chromosome=CHRLIST
             )
 
@@ -1265,8 +1259,8 @@ rule create_pan1c_report:
     input:
         unpack(get_report_sections)
     output:
-        report="output/{gtool}."+config['name']+".report.md",
-        html="output/{gtool}."+config['name']+".report.html"
+        report="output/Pan1c.{gtool}."+config['name']+".report.md",
+        html="output/Pan1c.{gtool}."+config['name']+".report.html"
     threads: 4
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 500
diff --git a/scripts/graph.pan1c_QC.py b/scripts/graph.pan1c_QC.py
index 3c1bb07..8c41d82 100644
--- a/scripts/graph.pan1c_QC.py
+++ b/scripts/graph.pan1c_QC.py
@@ -2,7 +2,7 @@
 Graph JSON creator for Pan1c-QC
 
 @author: alexis.mergez@inrae.fr
-@version: 1.0
+@version: 1.1
 """
 
 import os
@@ -99,6 +99,8 @@ for tsv in args.path:
         for query in chrdf["Query.name"].unique():
             shared_table[gtool][chrid][query] = chrdf[chrdf["Query.name"] == query].drop(columns="Query.name").set_index("Target.name").to_dict(orient="index")
 
+
+
 ## Assembling output JSON
 
 Graph_JSON = {
-- 
GitLab


From 28d65573ea4a3482571302f45b99724fc83d0618 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 25 Oct 2024 15:57:16 +0200
Subject: [PATCH 262/310] Added Total, INS and DEL variant sums for each
 chromosome

---
 scripts/var.pan1c_QC.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/scripts/var.pan1c_QC.py b/scripts/var.pan1c_QC.py
index 1331aed..d53714f 100644
--- a/scripts/var.pan1c_QC.py
+++ b/scripts/var.pan1c_QC.py
@@ -72,6 +72,9 @@ def parse_tsv(file):
             fdata[query][hapid] = {}
             
             tmp_data = {}
+            fdata[query][hapid]["All_Total"] = 0
+            fdata[query][hapid]["All_INS"] = 0
+            fdata[query][hapid]["All_DEL"] = 0
 
             for chromid in df["CHROM"].unique():
                 
@@ -85,7 +88,17 @@ def parse_tsv(file):
                 # Saving bin counts for summing later
                 tmp_data[chromid] = bin_counts.values
 
+                # Joining bin counts into a ';'-separated string
                 fdata[query][hapid][chromid] = ';'.join([str(k) for k in list(bin_counts.values)])
+
+                # Computing Total number of variants, number of Deletions, number of Insertions 
+                fdata[query][hapid][f"{chromid}_Total"] = int(bin_counts.values.sum())
+                fdata[query][hapid]["All_Total"] += fdata[query][hapid][f"{chromid}_Total"]
+                fdata[query][hapid][f"{chromid}_DEL"] = int(bin_counts.values[:66].sum())
+                fdata[query][hapid]["All_DEL"] += fdata[query][hapid][f"{chromid}_DEL"]
+                fdata[query][hapid][f"{chromid}_INS"] = int(bin_counts.values[66:].sum())
+                fdata[query][hapid]["All_INS"] += fdata[query][hapid][f"{chromid}_INS"]
+                
                 bins_string = [f"({int(round(log_untransform(interval.left), 0))}, {int(round(log_untransform(interval.right), 0))}]" for interval in bin_counts.index]
                 
                 if "index" in fdata: 
@@ -96,6 +109,7 @@ def parse_tsv(file):
             all_chrom = pd.DataFrame.from_dict(tmp_data, orient='columns')
             all_chrom.index = fdata["index"]
             fdata[query][hapid]["All"] = ';'.join([str(k) for k in list(all_chrom.sum(axis=1).values)])
+            
     
     fdata["index"] = ";".join(fdata["index"])
     return fdata
-- 
GitLab
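
Note on the sums above: the split at index 66 relies on the first 66 retained bins covering the deletion (negative) side of the log axis, which holds for the current layout (0.05-wide bins from -5 to 5, filtered at +/-log10(50)). A sketch, not part of the pipeline, of deriving the same split from the bin edges rather than a hard-coded index (counts are random placeholders):

import numpy as np
import pandas as pd

bins = pd.interval_range(start=-5, end=5, freq=0.05)
kept = bins[np.array([iv.left > np.log10(50) or iv.right < -np.log10(50) for iv in bins])]
counts = pd.Series(np.random.randint(0, 10, len(kept)), index=kept)   # fake bin counts

is_del = np.array([iv.right < 0 for iv in counts.index])   # negative log-lengths = deletions
print("DEL bins:", int(is_del.sum()))                      # 66 with this layout
print("DEL total:", int(counts.values[is_del].sum()),
      "INS total:", int(counts.values[~is_del].sum()))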


From 1576c791b6f07c1fbefd5ed0df8cb5df0d943348 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 28 Oct 2024 12:28:59 +0100
Subject: [PATCH 263/310] Fixed rule graph_stats

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 6684e67..99fcfe2 100644
--- a/Snakefile
+++ b/Snakefile
@@ -802,7 +802,7 @@ rule graph_stats:
         """
         apptainer run --app gfastats {params.app_path}/PanGeTools.sif \
             -g {input.graph} -P \
-            -o $(dirname {output.genstats})/{wildcards.gtool}.{params.pan_name}.{wildcards.chromosome} \
+            -o $(dirname {output.genstats})/Pan1c.{wildcards.gtool}.{params.pan_name}.{wildcards.chromosome} \
             -t {threads}
         """
 
-- 
GitLab


From e3cec1f0390dd4f170649146e4dc0c606558e306 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 4 Nov 2024 16:54:17 +0100
Subject: [PATCH 264/310] Changed config.yaml - Made MC params apparent - Made
 a dedicated Pan1c-View JSON parameter

---
 Snakefile               | 36 +++++++++++++++++++-----------------
 config.yaml             |  6 ++++--
 scripts/var.pan1c_QC.py | 24 ++++++++++++------------
 3 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/Snakefile b/Snakefile
index 99fcfe2..e1a046c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -70,18 +70,25 @@ def which_analysis():
             expand("output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png", chromosome=CHRLIST) 
         )
 
-        if config["create_report"] == "True": # Creating report (need contig)
-            analysis_inputs.append(
-                expand("output/Pan1c.{gtool}."+config['name']+".report.md", gtool=graph_tools)
-            )
-            analysis_inputs.append("output/report_data/Pan1c."+config['name']+".assembly.json")
-            analysis_inputs.append("output/report_data/Pan1c."+config['name']+".graph.json")
+    if config["Pan1c-View_jsons"] == "True": # Creates JSONs for Pan1c-View
+        analysis_inputs.append("output/report_data/Pan1c."+config['name']+".assembly.json")
+        analysis_inputs.append("output/report_data/Pan1c."+config['name']+".graph.json")
+
+        if config["get_VCF"] == "True":
+            analysis_inputs.append("output/report_data/Pan1c."+config['name']+".var.json")
+
+    ### Legacy option, will soon be removed
+
+    if config["create_report"] == "True": # Creating legacy report
+        analysis_inputs.append(
+            expand("output/Pan1c.{gtool}."+config['name']+".report.md", gtool=graph_tools)
+        )
+
     if config["get_VCF"] == "True": # VCF from the final graph against the "reference"
         analysis_inputs.append(
             expand("output/{gtool}.vcf.figs", gtool=graph_tools)
         )
-        analysis_inputs.append("output/report_data/Pan1c."+config['name']+".var.json")
-
+        
     return analysis_inputs
 
 """
@@ -690,7 +697,8 @@ rule MC_graph:
     params:
         tmp_dir='data/chrGraphs/MC.{chromosome}',
         ref_name=config['reference'],
-        app_path=config['app.path']
+        app_path=config['app.path'],
+        mc=config['MC.params']
     log:
         stdout="logs/MC/{chromosome}.mc.stdout.log",
         stderr="logs/MC/{chromosome}.mc.stderr.log"
@@ -718,7 +726,7 @@ rule MC_graph:
             --outName $(basename {output.gfa_gz} .gfa.gz) \
             --reference "$(basename {params.ref_name} .fa.gz | cut -f1 -d'.').$(basename {params.ref_name} .fa.gz | cut -f2 -d '.' | cut -f2 -d'p')" \
             --gfa \
-            --clip 0 --filter 0
+            {MC.params}
         
         # Converting to GFA 1.0
         apptainer run --app gfavc {params.app_path}/PanGeTools.sif \
@@ -1162,15 +1170,9 @@ rule create_pan1c_report_fig:
         app_path=config['app.path']
     shell:
         """
-        ## Get path order (alphabetic)
-        apptainer run --app odgi {params.app_path}/PanGeTools.sif \
-            paths -i {input.graph} -L | sort > $(dirname {output.reportfig})/{wildcards.gtool}.{wildcards.chromosome}.paths.order.txt
-
         ## Odgi 1D viz
         apptainer run --app odgi {params.app_path}/PanGeTools.sif \
-            viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P -p $(dirname {output.reportfig})/{wildcards.gtool}.{wildcards.chromosome}.paths.order.txt
-
-        rm $(dirname {output.reportfig})/{wildcards.gtool}.{wildcards.chromosome}.paths.order.txt
+            viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P
 
         ## Getting legend from contig figure
         convert {input.contigfig} -crop 790x+0+0 +repage {output.namefig}
diff --git a/config.yaml b/config.yaml
index ae25ac7..1ed52d8 100644
--- a/config.yaml
+++ b/config.yaml
@@ -30,9 +30,10 @@ odgi.pcov.params: '-x 2000 -O'
 
 ## Optional parts of the workflow
 # Running Quast to get statistics on input haplotypes
-run_Quast: 'True'
+run_Quast: 'False'
 # Make Minigraph-Cactus graph using the same method (chromosome level)
 get_MC: 'False'
+MC.params: '--clip 0 --filter 0'
 # Getting figures showing chromosome decomposition into contigs
 get_contig_pos: 'True'
 # Computes Presence Absence Variant matrices for Panache (not recommended; very long)
@@ -40,7 +41,8 @@ get_PAV: 'False'
 # Computes SyRI figures for haplotypes 
 get_ASMs_SyRI: 'False' # Haplotype vs Reference
 get_chrInputs_SyRI: 'False' # SyRI on chrInputs
-# Producing VCF and its associated INS/DEL figure
+# Producing VCF from Pangenome graph
 get_VCF: 'False'
 # Creating final report
 create_report: 'True'
+Pan1c-View_jsons: 'True'
diff --git a/scripts/var.pan1c_QC.py b/scripts/var.pan1c_QC.py
index d53714f..447721e 100644
--- a/scripts/var.pan1c_QC.py
+++ b/scripts/var.pan1c_QC.py
@@ -69,12 +69,12 @@ def parse_tsv(file):
         fdata[query] = {}
         
         for hapid in df["HAP"].unique():
-            fdata[query][hapid] = {}
+            fdata[query][hapid] = {"Sums":{}, "Counts":{}}
             
             tmp_data = {}
-            fdata[query][hapid]["All_Total"] = 0
-            fdata[query][hapid]["All_INS"] = 0
-            fdata[query][hapid]["All_DEL"] = 0
+            fdata[query][hapid]["Sums"]["All_Total"] = 0
+            fdata[query][hapid]["Sums"]["All_INS"] = 0
+            fdata[query][hapid]["Sums"]["All_DEL"] = 0
 
             for chromid in df["CHROM"].unique():
                 
@@ -89,15 +89,15 @@ def parse_tsv(file):
                 tmp_data[chromid] = bin_counts.values
 
                 # Creating a concatenated list
-                fdata[query][hapid][chromid] = ';'.join([str(k) for k in list(bin_counts.values)])
+                fdata[query][hapid]["Counts"][chromid] = ';'.join([str(k) for k in list(bin_counts.values)])
 
                 # Computing Total number of variants, number of Deletions, number of Insertions 
-                fdata[query][hapid][f"{chromid}_Total"] = int(bin_counts.values.sum())
-                fdata[query][hapid]["All_Total"] += fdata[query][hapid][f"{chromid}_Total"]
-                fdata[query][hapid][f"{chromid}_DEL"] = int(bin_counts.values[:66].sum())
-                fdata[query][hapid]["All_DEL"] += fdata[query][hapid][f"{chromid}_DEL"]
-                fdata[query][hapid][f"{chromid}_INS"] = int(bin_counts.values[66:].sum())
-                fdata[query][hapid]["All_INS"] += fdata[query][hapid][f"{chromid}_INS"]
+                fdata[query][hapid]["Sums"][f"{chromid}_Total"] = int(bin_counts.values.sum())
+                fdata[query][hapid]["Sums"]["All_Total"] += fdata[query][hapid]["Sums"][f"{chromid}_Total"]
+                fdata[query][hapid]["Sums"][f"{chromid}_DEL"] = int(bin_counts.values[:66].sum())
+                fdata[query][hapid]["Sums"]["All_DEL"] += fdata[query][hapid]["Sums"][f"{chromid}_DEL"]
+                fdata[query][hapid]["Sums"][f"{chromid}_INS"] = int(bin_counts.values[66:].sum())
+                fdata[query][hapid]["Sums"]["All_INS"] += fdata[query][hapid]["Sums"][f"{chromid}_INS"]
                 
                 bins_string = [f"({int(round(log_untransform(interval.left), 0))}, {int(round(log_untransform(interval.right), 0))}]" for interval in bin_counts.index]
                 
@@ -108,7 +108,7 @@ def parse_tsv(file):
             
             all_chrom = pd.DataFrame.from_dict(tmp_data, orient='columns')
             all_chrom.index = fdata["index"]
-            fdata[query][hapid]["All"] = ';'.join([str(k) for k in list(all_chrom.sum(axis=1).values)])
+            fdata[query][hapid]["Counts"]["All"] = ';'.join([str(k) for k in list(all_chrom.sum(axis=1).values)])
             
     
     fdata["index"] = ";".join(fdata["index"])
-- 
GitLab


From b76b2696f6552ee3a28adc2681868831a5fb4f28 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 4 Nov 2024 17:13:51 +0100
Subject: [PATCH 265/310] Updated CICD

---
 example/config_CICD.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml
index ba2ac18..30a7503 100644
--- a/example/config_CICD.yaml
+++ b/example/config_CICD.yaml
@@ -33,6 +33,7 @@ odgi.pcov.params: '-x 2000 -a 25 -O'
 run_Quast: 'True'
 # Make Minigraph-Cactus graph using the same method (chromosome level)
 get_MC: 'True'
+MC.params: '--clip 0 --filter 0'
 # Getting figures showing chromosome decomposition into contigs
 get_contig_pos: 'True'
 # Computes Presence Absence Variant matrices for Panache (not recommended; very long)
@@ -42,6 +43,7 @@ get_PAV: 'False'
 get_ASMs_SyRI: 'True' # Haplotype vs Reference
 get_chrInputs_SyRI: 'True' # SyRI on chrInputs
 # Producing VCF and its associated INS/DEL figure
-get_VCF: 'False'
+get_VCF: 'True'
 # Creating final report
 create_report: 'True'
+Pan1c-View_jsons: 'True'
-- 
GitLab


From 14e4c2245ad2c093d3518b91272c7a08835e3d46 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 4 Nov 2024 17:34:03 +0100
Subject: [PATCH 266/310] Typo

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index e1a046c..9f42478 100644
--- a/Snakefile
+++ b/Snakefile
@@ -726,7 +726,7 @@ rule MC_graph:
             --outName $(basename {output.gfa_gz} .gfa.gz) \
             --reference "$(basename {params.ref_name} .fa.gz | cut -f1 -d'.').$(basename {params.ref_name} .fa.gz | cut -f2 -d '.' | cut -f2 -d'p')" \
             --gfa \
-            {MC.params}
+            {mc.params}
         
         # Converting to GFA 1.0
         apptainer run --app gfavc {params.app_path}/PanGeTools.sif \
-- 
GitLab


From ad9f0b9ae134a481460494dd68d6efcde84fd78d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 4 Nov 2024 17:38:24 +0100
Subject: [PATCH 267/310] typo x2

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 9f42478..a5c5750 100644
--- a/Snakefile
+++ b/Snakefile
@@ -726,7 +726,7 @@ rule MC_graph:
             --outName $(basename {output.gfa_gz} .gfa.gz) \
             --reference "$(basename {params.ref_name} .fa.gz | cut -f1 -d'.').$(basename {params.ref_name} .fa.gz | cut -f2 -d '.' | cut -f2 -d'p')" \
             --gfa \
-            {mc.params}
+            {params.mc}
         
         # Converting to GFA 1.0
         apptainer run --app gfavc {params.app_path}/PanGeTools.sif \
-- 
GitLab


From 88cc5e0f8268072890a48a912485a00f84840244 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 5 Nov 2024 11:26:49 +0100
Subject: [PATCH 268/310] Added rule Pan1c-View_data to create a tar.gz with
 the figures used in Pan1c-View

---
 Snakefile | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/Snakefile b/Snakefile
index a5c5750..7d28596 100644
--- a/Snakefile
+++ b/Snakefile
@@ -77,6 +77,8 @@ def which_analysis():
         if config["get_VCF"] == "True":
             analysis_inputs.append("output/report_data/Pan1c."+config['name']+".var.json")
 
+        analysis_inputs.append("output/"+config['name']+".Pan1c-View.data.tar.gz")
+
     ### Legacy option, will soon be removed
 
     if config["create_report"] == "True": # Creating legacy report
@@ -1154,6 +1156,47 @@ rule var_json:
             --output {output.json}
         """
 
+def Pan1c_view_data_inputs(wildcards):
+    inputs = list()
+
+    if config["get_contig_pos"] == "True":
+        inputs += expand(
+            "output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png",
+            chromosome=CHRLIST
+        )
+
+    if config["get_ASMs_SyRI"] == "True":
+        inputs += expand(
+            "output/asm.syri.figs/Pan1c."+config['name']+".{haplotype}.syri_mm2.png", 
+            haplotype=SAMPLES_NOREF
+        )
+
+    if config["get_chrInputs_SyRI"] == "True":
+        inputs += expand(
+            "output/chrInput.syri.figs/Pan1c."+config['name']+".{chromosome}.syri_mm2.png", 
+            chromosome=CHRLIST
+        )
+
+    inputs += expand(
+        "output/report/Pan1c.{gtool}."+config['name']+".{chromosome}.report.fig.png", 
+        gtool=graph_tools, chromosome=CHRLIST
+    )
+    
+    return inputs
+
+rule Pan1c_View_data:
+    input:
+        Pan1c_view_data_inputs
+    output:
+        "output/"+config['name']+".Pan1c-View.data.tar.gz"
+    threads: 1
+    resources:
+        mem_mb = 8000
+    shell:
+        """
+        tar --transform 's/output/data/' -czvf {output} {input}
+        """
+    
 rule create_pan1c_report_fig:
     # Produces a markdown report figure of chromosomes graphs
     input:
-- 
GitLab
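
The --transform 's/output/data/' option only rewrites member names inside the archive, not the files on disk; a rough Python equivalent, assuming the listed files exist, using tarfile's arcname remapping (the file names below are hypothetical):

    import tarfile

    # Hypothetical figure list; in the rule this comes from Pan1c_view_data_inputs()
    files = ["output/chr.contig/Pan1c.example.chr01.contig.png"]

    with tarfile.open("example.Pan1c-View.data.tar.gz", "w:gz") as tar:
        for path in files:
            # Mirror `tar --transform 's/output/data/'`: rename the leading directory
            tar.add(path, arcname=path.replace("output/", "data/", 1))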


From c35c910c8aea84b8ba4ee670246d5efbd404c5e4 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 5 Nov 2024 11:34:21 +0100
Subject: [PATCH 269/310] Adding Tags to MC graph

---
 Snakefile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Snakefile b/Snakefile
index 7d28596..d6dc7f5 100644
--- a/Snakefile
+++ b/Snakefile
@@ -736,6 +736,9 @@ rule MC_graph:
             --simplify \
             --outName "$(dirname {output.gfa_gz})/$(basename {output.gfa_gz} .gz)"
 
+        ## Adding new tags
+        sed -i '/^H/r {input.tags}' "$(dirname {output.gfa_gz})/$(basename {output.gfa_gz} .gz)"
+
         apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
             -@ {threads} "$(dirname {output.gfa_gz})/$(basename {output.gfa_gz} .gz)"
         
-- 
GitLab
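
The sed -i '/^H/r {input.tags}' idiom appends the contents of the tags file after every line matching ^H (the GFA header line); a rough Python equivalent, assuming a plain-text GFA (the function name and paths are illustrative only):

    def add_tags_after_header(gfa_path, tags_path):
        # Rough equivalent of sed -i '/^H/r tags' gfa: emit the tag lines
        # right after each header (H) line, keeping everything else untouched.
        with open(tags_path) as fh:
            tags = fh.readlines()
        out = []
        with open(gfa_path) as fh:
            for line in fh:
                out.append(line)
                if line.startswith("H"):
                    out.extend(tags)
        with open(gfa_path, "w") as fh:
            fh.writelines(out)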


From bb36285187e0ac95a55ab8c39e40880a7bf24682 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 5 Nov 2024 11:40:04 +0100
Subject: [PATCH 270/310] Added compression and cleaning pass to MC_graph rule

---
 Snakefile | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/Snakefile b/Snakefile
index d6dc7f5..ced46a8 100644
--- a/Snakefile
+++ b/Snakefile
@@ -697,7 +697,7 @@ rule MC_graph:
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000
     params:
-        tmp_dir='data/chrGraphs/MC.{chromosome}',
+        mc_dir='data/chrGraphs/MC.{chromosome}',
         ref_name=config['reference'],
         app_path=config['app.path'],
         mc=config['MC.params']
@@ -706,36 +706,41 @@ rule MC_graph:
         stderr="logs/MC/{chromosome}.mc.stderr.log"
     shell:
         """
-        if [ -d {params.tmp_dir} ]; then rm -r {params.tmp_dir}; fi
-        mkdir -p {params.tmp_dir}
+        if [ -d {params.mc_dir} ]; then rm -r {params.mc_dir}; fi
+        mkdir -p {params.mc_dir}
 
         # Creating a fasta for each sequence
-        zcat {input.fa} | awk -v DIR={params.tmp_dir} \
+        zcat {input.fa} | awk -v DIR={params.mc_dir} \
             '/^>/ {{name=substr($0, 2); gsub(/#/, ".", name); OUT= DIR "/" name ".fa"}}; {{print >> OUT; close(OUT)}}'
 
         # Listing fasta files
-        for hap in {params.tmp_dir}/*.fa; do
+        for hap in {params.mc_dir}/*.fa; do
             fullname=$(basename $hap .fa)
             genome=$(echo $fullname | cut -f1 -d'.')
             hapid=$(echo $fullname | cut -f2 -d'.')
-            echo -e "${{genome}}.${{hapid}}\t${{hap}}" >> {params.tmp_dir}/{wildcards.chromosome}.genomes.txt
+            echo -e "${{genome}}.${{hapid}}\t${{hap}}" >> {params.mc_dir}/{wildcards.chromosome}.genomes.txt
         done
 
         # Running MC
         apptainer run {params.app_path}/minigraph-cactus_v2.7.0.sif \
-            {params.tmp_dir}/tmp {params.tmp_dir}/{wildcards.chromosome}.genomes.txt \
-            --outDir {params.tmp_dir} \
+            {params.mc_dir}/tmp {params.mc_dir}/{wildcards.chromosome}.genomes.txt \
+            --outDir {params.mc_dir} \
             --outName $(basename {output.gfa_gz} .gfa.gz) \
             --reference "$(basename {params.ref_name} .fa.gz | cut -f1 -d'.').$(basename {params.ref_name} .fa.gz | cut -f2 -d '.' | cut -f2 -d'p')" \
             --gfa \
             {params.mc}
-        
+
         # Converting to GFA 1.0
         apptainer run --app gfavc {params.app_path}/PanGeTools.sif \
-            --gfa1 {params.tmp_dir}/$(basename {output.gfa_gz} .gfa.gz).full.gfa.gz \
+            --gfa1 {params.mc_dir}/$(basename {output.gfa_gz} .gfa.gz).full.gfa.gz \
             --simplify \
             --outName "$(dirname {output.gfa_gz})/$(basename {output.gfa_gz} .gz)"
 
+        # Cleaning and compressing
+        rm -r {params.mc_dir}/*.fa {params.mc_dir}/*.txt {params.mc_dir}/chrom-*
+        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
+            -@ {threads} {params.mc_dir}/*.hal {params.mc_dir}/*.paf
+
         ## Adding new tags
         sed -i '/^H/r {input.tags}' "$(dirname {output.gfa_gz})/$(basename {output.gfa_gz} .gz)"
 
-- 
GitLab
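
The awk one-liner kept above splits the chromosome multi-FASTA into one file per haplotype, replacing '#' with '.' in the file name; a minimal Python sketch of the same idea, assuming an uncompressed FASTA for brevity (names are illustrative):

    import os

    def split_fasta(fasta_path, out_dir):
        # One output FASTA per record; '#' in the header becomes '.' in the
        # file name, as in the awk command of rule MC_graph.
        os.makedirs(out_dir, exist_ok=True)
        handle = None
        with open(fasta_path) as fh:
            for line in fh:
                if line.startswith(">"):
                    if handle:
                        handle.close()
                    name = line[1:].strip().replace("#", ".")
                    handle = open(os.path.join(out_dir, name + ".fa"), "w")
                handle.write(line)
        if handle:
            handle.close()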


From a085b7d498a9bd2a990e902675da498aed0e003a Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 5 Nov 2024 15:55:06 +0100
Subject: [PATCH 271/310] Added null value for var_json

---
 scripts/var.pan1c_QC.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/scripts/var.pan1c_QC.py b/scripts/var.pan1c_QC.py
index 447721e..420e619 100644
--- a/scripts/var.pan1c_QC.py
+++ b/scripts/var.pan1c_QC.py
@@ -89,7 +89,9 @@ def parse_tsv(file):
                 tmp_data[chromid] = bin_counts.values
 
                 # Creating a concatenated list
-                fdata[query][hapid]["Counts"][chromid] = ';'.join([str(k) for k in list(bin_counts.values)])
+                tmp_counts = [str(k) for k in list(bin_counts.values)]
+                tmp_counts = tmp_counts[:66] + ["null"] + tmp_counts[66:] # Adding null for -50;50bp variants
+                fdata[query][hapid]["Counts"][chromid] = ';'.join(tmp_counts)
 
                 # Computing Total number of variants, number of Deletions, number of Insertions 
                 fdata[query][hapid]["Sums"][f"{chromid}_Total"] = int(bin_counts.values.sum())
@@ -100,7 +102,7 @@ def parse_tsv(file):
                 fdata[query][hapid]["Sums"]["All_INS"] += fdata[query][hapid]["Sums"][f"{chromid}_INS"]
                 
                 bins_string = [f"({int(round(log_untransform(interval.left), 0))}, {int(round(log_untransform(interval.right), 0))}]" for interval in bin_counts.index]
-                
+
                 if "index" in fdata: 
                     assert (np.array(fdata["index"])==np.array(bins_string)).all()
                 else:
@@ -108,10 +110,12 @@ def parse_tsv(file):
             
             all_chrom = pd.DataFrame.from_dict(tmp_data, orient='columns')
             all_chrom.index = fdata["index"]
-            fdata[query][hapid]["Counts"]["All"] = ';'.join([str(k) for k in list(all_chrom.sum(axis=1).values)])
-            
+
+            tmp_counts = [str(k) for k in list(all_chrom.sum(axis=1).values)]
+            tmp_counts = tmp_counts[:66] + ["null"] + tmp_counts[66:] # Adding null for -50;50bp variants
+            fdata[query][hapid]["Counts"]["All"] = ';'.join(tmp_counts)
     
-    fdata["index"] = ";".join(fdata["index"])
+    fdata["index"] = ";".join(fdata["index"][:66]+["(-50, 50]"]+fdata["index"][66:])
     return fdata
 
 ## Parsing all TSV and aggregating into a final dictionnary
-- 
GitLab
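
The "null" placeholder stands for the (-50, 50] size class that is filtered out upstream; a minimal sketch of the splice, assuming 66 deletion bins followed by 66 insertion bins:

    # Stand-in for the per-bin counts (made-up values)
    counts = [str(k) for k in range(132)]

    # Insert "null" between the deletion and insertion bins, i.e. at index 66
    counts = counts[:66] + ["null"] + counts[66:]
    assert len(counts) == 133 and counts[66] == "null"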


From f73883092e8408a19d0a926de7b8da1ece18aeed Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 5 Nov 2024 16:16:38 +0100
Subject: [PATCH 272/310] Updated graph_json script

---
 scripts/graph.pan1c_QC.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/scripts/graph.pan1c_QC.py b/scripts/graph.pan1c_QC.py
index 8c41d82..40f0554 100644
--- a/scripts/graph.pan1c_QC.py
+++ b/scripts/graph.pan1c_QC.py
@@ -46,14 +46,13 @@ arg_parser.add_argument(
     )
 args = arg_parser.parse_args()
 
-gtool_translation = {"pan1c": "PGGB", "MC": "MC"}
 
 ## General statistics
 
 gen_stats = {}
 
 for tsv in args.general:
-    gtool = gtool_translation[os.path.basename(tsv).split(".")[0]]
+    gtool = os.path.basename(tsv).split(".")[1]
     
     gen_stats[gtool] = pd.read_csv(tsv, sep="\t").drop(columns="Pangenome.name").set_index("Chr.id").to_dict(orient="index")
 
@@ -63,7 +62,7 @@ path_stats = {}
 shared_table = {} 
 
 for tsv in args.path:
-    gtool = gtool_translation[os.path.basename(tsv).split(".")[0]]
+    gtool = os.path.basename(tsv).split(".")[1]
     df = pd.read_csv(tsv, sep="\t").drop(columns="Pangenome.name")
     df["Path.name"] = df["Path.name"].str.rsplit("#", n=1).str[0]
 
@@ -109,12 +108,12 @@ Graph_JSON = {
     "Shared_content": shared_table,
 }
 
-avail_gtool = list(set([os.path.basename(figs).split('.')[0] for figs in args.odgifigs]))
+avail_gtool = list(set([os.path.basename(figs).split('.')[1] for figs in args.odgifigs]))
 
 Graph_JSON["odgi_figs"] = {
-    gtool_translation[gtool]: {
-        chrid : f"data/odgifigs/{gtool}.{args.name}.{chrid}.report.fig.png"
-        for chrid in gen_stats[gtool_translation[gtool]].keys()
+    gtool: {
+        chrid : f"data/odgifigs/Pan1c.{gtool}.{args.name}.{chrid}.report.fig.png"
+        for chrid in gen_stats[gtool].keys()
     }
     for gtool in avail_gtool
 }
-- 
GitLab


From 81062245a26fb66f25fef545c67c3fd62bcf03b5 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 5 Nov 2024 17:27:17 +0100
Subject: [PATCH 273/310] Updated Pan1c_View_data rule

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index ced46a8..95d2246 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1202,7 +1202,7 @@ rule Pan1c_View_data:
         mem_mb = 8000
     shell:
         """
-        tar --transform 's/output/data/' -czvf {output} {input}
+        tar --transform 's/output/data/' --transform 's/report\//odgifigs\//' -czvf {output} {input}
         """
     
 rule create_pan1c_report_fig:
-- 
GitLab


From 27bd6f0475204dc810baf05fd3c64b91972bfc38 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 5 Nov 2024 17:43:20 +0100
Subject: [PATCH 274/310] Changed VCF creation config

---
 Snakefile | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/Snakefile b/Snakefile
index 95d2246..430fdc9 100644
--- a/Snakefile
+++ b/Snakefile
@@ -79,6 +79,11 @@ def which_analysis():
 
         analysis_inputs.append("output/"+config['name']+".Pan1c-View.data.tar.gz")
 
+    if config["get_VCF"] == "True":
+            analysis_inputs.append(
+                expand("output/Pan1c.{gtool}."+config['name']+".vcf.gz", gtool=graph_tools)
+            )
+
     ### Legacy option, will soon be removed
 
     if config["create_report"] == "True": # Creating legacy report
@@ -86,10 +91,10 @@ def which_analysis():
             expand("output/Pan1c.{gtool}."+config['name']+".report.md", gtool=graph_tools)
         )
 
-    if config["get_VCF"] == "True": # VCF from the final graph against the "reference"
-        analysis_inputs.append(
-            expand("output/{gtool}.vcf.figs", gtool=graph_tools)
-        )
+        if config["get_VCF"] == "True": # VCF from the final graph against the "reference"
+            analysis_inputs.append(
+                expand("output/{gtool}.vcf.figs", gtool=graph_tools)
+            )
         
     return analysis_inputs
 
-- 
GitLab


From f637348597cb9006d1a4b1d2b6c570a566e9ee6b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Thu, 7 Nov 2024 15:49:29 +0100
Subject: [PATCH 275/310] Adding JSONs to Pan1c-View tarball

---
 Snakefile | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 430fdc9..6c488eb 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1172,6 +1172,13 @@ rule var_json:
 def Pan1c_view_data_inputs(wildcards):
     inputs = list()
 
+    inputs += [
+        "output/report_data/Pan1c."+config['name']+".assembly.json",
+        "output/report_data/Pan1c."+config['name']+".graph.json",
+        "output/report_data/Pan1c."+config['name']+".var.json",
+        "output/report_data/Pan1c."+config['name']+".tags.json"
+    ]
+
     if config["get_contig_pos"] == "True":
         inputs += expand(
             "output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png",
@@ -1207,7 +1214,10 @@ rule Pan1c_View_data:
         mem_mb = 8000
     shell:
         """
-        tar --transform 's/output/data/' --transform 's/report\//odgifigs\//' -czvf {output} {input}
+        tar --transform 's/output/data/' \
+            --transform 's/report\//odgifigs\//' \
+            --transform 's/output\/report_data\///' \
+            -czvf {output} {input}
         """
     
 rule create_pan1c_report_fig:
-- 
GitLab


From b2afb7a9cbbb5b96b8e1dd06dd433173afe78c2a Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 8 Nov 2024 15:01:18 +0100
Subject: [PATCH 276/310] Added paths to relevant files in summary json

---
 scripts/getTags.py | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/scripts/getTags.py b/scripts/getTags.py
index 8aefdeb..93289e9 100644
--- a/scripts/getTags.py
+++ b/scripts/getTags.py
@@ -139,10 +139,6 @@ for key in labels.keys():
     if ".Version" in key:
         tags["Tools"][key.lower().split(".")[0]] = labels[key]
 
-# Exporting tags to JSON
-with open(args.json, "w") as handle:
-    json.dump(tags, handle, indent=6)
-
 ## Exporting tags to stdout
 print("#\tThis graph have been created using the Pan1c workflow (https://forgemia.inra.fr/alexis.mergez/pan1c)\n#")
 print("#\tTool versions and commands\n#")
@@ -159,3 +155,27 @@ for section, svalues in tags.items():
         for key, value in svalues.items():
             print(f"#\t{key}: {value}")
         print('#')
+
+# Adding path to generated files
+gtools = ["PGGB"] + (tags["Parameters"]["get_MC"] == "True")*["MC"]
+
+tags["Files"] = {
+    "GFAv1": {
+        tool: f"data/Pan1c.{tool}.{tags['Pangenome']['name']}.gfa.gz" 
+        for tool in gtools    
+    }
+}
+
+if tags["Parameters"]["get_VCF"] == "True":
+    tags["Files"]["XG"] = {
+        tool: f"data/Pan1c.{tool}.{tags['Pangenome']['name']}.xg" 
+        for tool in gtools    
+    }
+    tags["Files"]["VCF"] = {
+        tool: f"data/Pan1c.{tool}.{tags['Pangenome']['name']}.vcf.gz" 
+        for tool in gtools    
+    }
+
+# Exporting tags to JSON
+with open(args.json, "w") as handle:
+    json.dump(tags, handle, indent=6)
\ No newline at end of file
-- 
GitLab
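
The gtools line above relies on a compact conditional-list idiom; a minimal sketch with a hypothetical get_MC value:

    # Multiplying a list by a bool (0 or 1) keeps or drops the optional "MC" entry
    get_mc = "True"  # hypothetical value of tags["Parameters"]["get_MC"]
    gtools = ["PGGB"] + (get_mc == "True") * ["MC"]
    print(gtools)  # ['PGGB', 'MC'] here; ['PGGB'] when get_MC is not "True"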


From 8f8c72c8670a15b251658c55d159d8bc974f068e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 8 Nov 2024 15:06:54 +0100
Subject: [PATCH 277/310] Adding graphs to Pan1c-View tarball

---
 Snakefile | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 6c488eb..26d0fd8 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1172,6 +1172,7 @@ rule var_json:
 def Pan1c_view_data_inputs(wildcards):
     inputs = list()
 
+    # Adding JSONs
     inputs += [
         "output/report_data/Pan1c."+config['name']+".assembly.json",
         "output/report_data/Pan1c."+config['name']+".graph.json",
@@ -1179,6 +1180,13 @@ def Pan1c_view_data_inputs(wildcards):
         "output/report_data/Pan1c."+config['name']+".tags.json"
     ]
 
+    # Adding GFAv1
+    inputs.append(expand(
+        "output/Pan1c.{gtool}."+config["name"]+".gfa.gz",
+        gtool=graph_tools
+    ))
+
+    # Adding optional figures
     if config["get_contig_pos"] == "True":
         inputs += expand(
             "output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png",
@@ -1197,6 +1205,14 @@ def Pan1c_view_data_inputs(wildcards):
             chromosome=CHRLIST
         )
 
+    if config["get_VCF"] == "True":
+        inputs += expand(
+            "output/Pan1c.{gtool}."+config["name"]+".{extension}",
+            gtool=graph_tools,
+            extension=["vcf.gz", "xg"] 
+        )
+
+    # Adding 1D viz
     inputs += expand(
         "output/report/Pan1c.{gtool}."+config['name']+".{chromosome}.report.fig.png", 
         gtool=graph_tools, chromosome=CHRLIST
@@ -1216,7 +1232,7 @@ rule Pan1c_View_data:
         """
         tar --transform 's/output/data/' \
             --transform 's/report\//odgifigs\//' \
-            --transform 's/output\/report_data\///' \
+            --transform 's/data\/report_data\///' \
             -czvf {output} {input}
         """
     
-- 
GitLab


From 15f7895c070bc34a7150e6062753b32827a7e77d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 8 Nov 2024 15:09:46 +0100
Subject: [PATCH 278/310] Fixed issue with pan1c-view rule inputs

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 26d0fd8..0f2ff43 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1181,10 +1181,10 @@ def Pan1c_view_data_inputs(wildcards):
     ]
 
     # Adding GFAv1
-    inputs.append(expand(
+    inputs += expand(
         "output/Pan1c.{gtool}."+config["name"]+".gfa.gz",
         gtool=graph_tools
-    ))
+    )
 
     # Adding optional figures
     if config["get_contig_pos"] == "True":
-- 
GitLab


From f69d3c3140ca045890908245e09cee064e911758 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 8 Nov 2024 15:34:19 +0100
Subject: [PATCH 279/310] Fixed syri_vcf_2_tsv

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 0f2ff43..2f7ba63 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1126,8 +1126,8 @@ rule syri_vcf_2_tsv:
         #% SyRI VCF MM2
         ## Going through all folders
         for vcf in $FOLDER/*.vcf.gz; do
-            THAP=$(basename $vcf .syri.vcf.gz | cut -f2 -d'.')
-            THAPN=$(basename $vcf .syri.vcf.gz | cut -f3 -d'.' | cut -f2 -d'p')
+            THAP=$(basename $vcf .syri.mm2.vcf.gz | cut -f3 -d'.')
+            THAPN=$(basename $vcf .syri.mm2.vcf.gz | cut -f4 -d'.' | cut -f2 -d'p')
 
             # Producing intermediate TSVs
             zcat $vcf | \
-- 
GitLab


From e8436731743dedbbefab08d26d8afc7fc255380d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 8 Nov 2024 15:38:46 +0100
Subject: [PATCH 280/310] Fixed path to figures in ASM_JSON

---
 Snakefile               | 10 +++++-----
 scripts/asm.pan1c_QC.py |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Snakefile b/Snakefile
index 2f7ba63..f2d79e1 100644
--- a/Snakefile
+++ b/Snakefile
@@ -67,7 +67,7 @@ def which_analysis():
         )
     if config["get_contig_pos"] == "True": # Chromosome decomposition into its contig figure
         analysis_inputs.append(
-            expand("output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png", chromosome=CHRLIST) 
+            expand("output/chr.contig/Pan1c."+config['name']+".{chromosome}.contig.png", chromosome=CHRLIST) 
         )
 
     if config["Pan1c-View_jsons"] == "True": # Creates JSONs for Pan1c-View
@@ -227,7 +227,7 @@ rule contig_positions:
         fa="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz",
         fai="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz.fai"
     output:
-        fig="output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png",
+        fig="output/chr.contig/Pan1c."+config['name']+".{chromosome}.contig.png",
         outdir=temp(directory("output/chr.contig/{chromosome}"))
     threads: 1
     resources:
@@ -392,7 +392,7 @@ def asm_json_inputs(wildcards):
 
     if config["get_contig_pos"] == "True":
         sections["contig_pos"] = expand(
-            "output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png",
+            "output/chr.contig/Pan1c."+config['name']+".{chromosome}.contig.png",
             chromosome=CHRLIST
         )
 
@@ -1189,7 +1189,7 @@ def Pan1c_view_data_inputs(wildcards):
     # Adding optional figures
     if config["get_contig_pos"] == "True":
         inputs += expand(
-            "output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png",
+            "output/chr.contig/Pan1c."+config['name']+".{chromosome}.contig.png",
             chromosome=CHRLIST
         )
 
@@ -1240,7 +1240,7 @@ rule create_pan1c_report_fig:
     # Produces a markdown report figure of chromosomes graphs
     input:
         graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.tmp.gfa',
-        contigfig="output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png",
+        contigfig="output/chr.contig/Pan1c."+config['name']+".{chromosome}.contig.png",
     output:
         odgifig=temp("tmp/Pan1c.{gtool}.{chromosome}.odgi.png"),
         namefig=temp("tmp/Pan1c.{gtool}.{chromosome}.name.png"),
diff --git a/scripts/asm.pan1c_QC.py b/scripts/asm.pan1c_QC.py
index 425103f..efa784d 100644
--- a/scripts/asm.pan1c_QC.py
+++ b/scripts/asm.pan1c_QC.py
@@ -142,17 +142,17 @@ if args.quast:
 
 if args.contig_pos:
     assembly["Contig_pos"] = {
-        chrid: f"data/chr.contig/{chrid}.contig.png" for chrid in cdf.index.get_level_values(0).unique()
+        chrid: f"data/chr.contig/Pan1c.{args.name}.{chrid}.contig.png" for chrid in cdf.index.get_level_values(0).unique()
     }
 
 if args.syri_asm:
     assembly["Syri_hap"] = {
-        hap: f"data/asm.syri.figs/{args.name}.{hap.replace('#', '.hap')}.syri.mm2.png" for hap in cdf.index.get_level_values(1).unique() if hap != args.ref
+        hap: f"data/asm.syri.figs/Pan1c.{args.name}.{hap.replace('#', '.hap')}.syri.mm2.png" for hap in cdf.index.get_level_values(1).unique() if hap != args.ref
     }
 
 if args.syri_chr:
     assembly["Syri_chr"] = {
-        chrid: f"data/chrInput.syri.figs/{args.name}.{chrid}.syri.png" for chrid in cdf.index.get_level_values(0).unique()
+        chrid: f"data/chrInput.syri.figs/Pan1c.{args.name}.{chrid}.syri.png" for chrid in cdf.index.get_level_values(0).unique()
     }
 
 with open(args.output, "w") as handle:
-- 
GitLab


From 606d6934ccfec1fca00a877726666c278f8b51af Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 8 Nov 2024 16:21:23 +0100
Subject: [PATCH 281/310] Added diff table to graph_json

---
 scripts/graph.pan1c_QC.py | 46 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 41 insertions(+), 5 deletions(-)

diff --git a/scripts/graph.pan1c_QC.py b/scripts/graph.pan1c_QC.py
index 40f0554..84c6799 100644
--- a/scripts/graph.pan1c_QC.py
+++ b/scripts/graph.pan1c_QC.py
@@ -48,7 +48,6 @@ args = arg_parser.parse_args()
 
 
 ## General statistics
-
 gen_stats = {}
 
 for tsv in args.general:
@@ -57,9 +56,9 @@ for tsv in args.general:
     gen_stats[gtool] = pd.read_csv(tsv, sep="\t").drop(columns="Pangenome.name").set_index("Chr.id").to_dict(orient="index")
 
 ## Path statistics and shared content
-
 path_stats = {}
 shared_table = {} 
+diff_table = {}
 
 for tsv in args.path:
     gtool = os.path.basename(tsv).split(".")[1]
@@ -72,9 +71,11 @@ for tsv in args.path:
     for chrid in df["Chr.id"].unique():
         path_stats[gtool][chrid] = df[df["Chr.id"] == chrid].drop(columns=["Chr.id", "Shared.content"]).set_index("Path.name").to_dict(orient="index")
 
-    # Shared content
+    # Shared content stats
     shared_table[gtool] = {} 
     shared_content = df.set_index(["Chr.id", "Path.name"]).loc[:, ["Path.length", "Shared.content"]].to_dict()
+
+    ## Creating a new dataframe from the concatenated string in the "Shared.content" column
     shared_dict = {}
     A = 0
     for key, value in shared_content["Shared.content"].items():
@@ -84,6 +85,8 @@ for tsv in args.path:
     
             shared_dict[A] = list(key)+[target]+[shared_content["Path.length"][key]]+[int(val) for val in stats.split(',')]
             A+=1
+    
+    ## Computing stats on shared content
     sdf = pd.DataFrame.from_dict(shared_dict, orient='index', columns = ["Chr.id", "Query.name", "Target.name", "Path.length", "Shared.nodes.count", "Shared.length", "Shared.R.length"])
     sdf.set_index(["Chr.id", "Query.name", "Target.name"], inplace=True)
     sdf.loc[:, "Shared.prop"] = sdf["Shared.length"]*100/sdf["Path.length"]
@@ -98,14 +101,47 @@ for tsv in args.path:
         for query in chrdf["Query.name"].unique():
             shared_table[gtool][chrid][query] = chrdf[chrdf["Query.name"] == query].drop(columns="Query.name").set_index("Target.name").to_dict(orient="index")
 
-
+    ## Computing difference between shared content heatmaps
+    dData = {"Qchr":[], "Tchr":[], "Diff":[]}
+
+    # Iterating over chromosomes twice to make pairs
+    for Qchr in shared_table[gtool].keys():
+        # Creating the Query shared length matrix
+        Qtable = np.array([
+            [ 
+                shared_table[gtool][Qchr][Qasm][Tasm]["Shared.prop"]
+                for Tasm in shared_table[gtool][Qchr][Qasm].keys()
+            ]
+            for Qasm in shared_table[gtool][Qchr].keys()
+        ])
+
+        for Tchr in shared_table[gtool].keys():
+            Ttable = np.array([
+                [
+                    shared_table[gtool][Tchr][Qasm][Tasm]["Shared.prop"] 
+                    for Tasm in shared_table[gtool][Tchr][Qasm].keys()
+                ]
+                for Qasm in shared_table[gtool][Tchr].keys()
+            ])
+
+            # Computing the Euclidean distance using the Frobenius norm
+            dData["Qchr"].append(Qchr)
+            dData["Tchr"].append(Tchr)
+            try : # Catching mismatched shapes in case some paths are not available in both matrices
+                dData["Diff"].append(np.linalg.norm(Qtable-Ttable, ord = 'fro'))
+            except : 
+                dData["Diff"].append(np.nan)
+
+    dData = pd.DataFrame.from_dict(dData).pivot(values=["Diff"], index=["Qchr"], columns=["Tchr"])
+    dData.columns = dData.columns.droplevel()
+    diff_table[gtool] = dData.to_dict(orient="index")
 
 ## Assembling output JSON
-
 Graph_JSON = {
     "General_stats": gen_stats,
     "Paths_stats": path_stats,
     "Shared_content": shared_table,
+    "Diff_shared_content": diff_table
 }
 
 avail_gtool = list(set([os.path.basename(figs).split('.')[1] for figs in args.odgifigs]))
-- 
GitLab
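
The diff table compares the shared-content heatmaps of two chromosomes through the Frobenius norm of their difference; a minimal numpy sketch with made-up Shared.prop matrices of matching shape:

    import numpy as np

    # Shared.prop matrices for two hypothetical chromosomes
    # (rows: query haplotypes, columns: target haplotypes)
    chr1 = np.array([[100.0, 82.5], [79.1, 100.0]])
    chr2 = np.array([[100.0, 75.0], [71.3, 100.0]])

    # Euclidean distance between the two heatmaps (Frobenius norm of the difference)
    diff = float(np.linalg.norm(chr1 - chr2, ord="fro"))
    print(round(diff, 2))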


From 95fac73ca01e858ba9e6d83699dac6e9f190ee97 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 8 Nov 2024 16:23:53 +0100
Subject: [PATCH 282/310] Fixed KeyError in getTags.py

---
 scripts/getTags.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/getTags.py b/scripts/getTags.py
index 93289e9..54a0483 100644
--- a/scripts/getTags.py
+++ b/scripts/getTags.py
@@ -161,18 +161,18 @@ gtools = ["PGGB"] + (tags["Parameters"]["get_MC"] == "True")*["MC"]
 
 tags["Files"] = {
     "GFAv1": {
-        tool: f"data/Pan1c.{tool}.{tags['Pangenome']['name']}.gfa.gz" 
+        tool: f"data/Pan1c.{tool}.{tags['Pangenome']['Name']}.gfa.gz" 
         for tool in gtools    
     }
 }
 
 if tags["Parameters"]["get_VCF"] == "True":
     tags["Files"]["XG"] = {
-        tool: f"data/Pan1c.{tool}.{tags['Pangenome']['name']}.xg" 
+        tool: f"data/Pan1c.{tool}.{tags['Pangenome']['Name']}.xg" 
         for tool in gtools    
     }
     tags["Files"]["VCF"] = {
-        tool: f"data/Pan1c.{tool}.{tags['Pangenome']['name']}.vcf.gz" 
+        tool: f"data/Pan1c.{tool}.{tags['Pangenome']['Name']}.vcf.gz" 
         for tool in gtools    
     }
 
-- 
GitLab


From dae94a53110fe5923ad8102960fafb46e17d5fbb Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 12 Nov 2024 09:22:57 +0100
Subject: [PATCH 283/310] Fixed missing numpy import

---
 scripts/graph.pan1c_QC.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/graph.pan1c_QC.py b/scripts/graph.pan1c_QC.py
index 84c6799..65b4155 100644
--- a/scripts/graph.pan1c_QC.py
+++ b/scripts/graph.pan1c_QC.py
@@ -8,6 +8,7 @@ Graph JSON creator for Pan1c-QC
 import os
 import argparse
 import pandas as pd
+import numpy as np
 import json
 
 ## Arguments
-- 
GitLab


From fa2f10b6c818c35dc4c06e0224178bd8c8a0de9e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 12 Nov 2024 15:26:57 +0100
Subject: [PATCH 284/310] Fixed SyRI figure paths in asm JSON

---
 scripts/asm.pan1c_QC.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/asm.pan1c_QC.py b/scripts/asm.pan1c_QC.py
index efa784d..8ba7cec 100644
--- a/scripts/asm.pan1c_QC.py
+++ b/scripts/asm.pan1c_QC.py
@@ -147,12 +147,12 @@ if args.contig_pos:
 
 if args.syri_asm:
     assembly["Syri_hap"] = {
-        hap: f"data/asm.syri.figs/Pan1c.{args.name}.{hap.replace('#', '.hap')}.syri.mm2.png" for hap in cdf.index.get_level_values(1).unique() if hap != args.ref
+        hap: f"data/asm.syri.figs/Pan1c.{args.name}.{hap.replace('#', '.hap')}.syri_mm2.png" for hap in cdf.index.get_level_values(1).unique() if hap != args.ref
     }
 
 if args.syri_chr:
     assembly["Syri_chr"] = {
-        chrid: f"data/chrInput.syri.figs/Pan1c.{args.name}.{chrid}.syri.png" for chrid in cdf.index.get_level_values(0).unique()
+        chrid: f"data/chrInput.syri.figs/Pan1c.{args.name}.{chrid}.syri_mm2.png" for chrid in cdf.index.get_level_values(0).unique()
     }
 
 with open(args.output, "w") as handle:
-- 
GitLab


From 76c228029510909cfb41b1cb2dc3395e8497d0a6 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 14 Nov 2024 13:57:15 +0100
Subject: [PATCH 285/310] Added MC apptainer image

---
 getApps.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/getApps.sh b/getApps.sh
index b182b0a..6162a6a 100755
--- a/getApps.sh
+++ b/getApps.sh
@@ -17,4 +17,5 @@ done
 # Script
 apptainer pull $appdir/PanGeTools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pangetools:latest  
 apptainer pull $appdir/pan1c-env.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cenv:latest  
-apptainer pull $appdir/pan1c-box.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cbox:latest 
+apptainer pull $appdir/pan1c-box.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cbox:latest
+apptainer pull $appdir/minigraph-cactus_v2.7.0.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/minigraph-cactus:latest
-- 
GitLab


From 1b0fceef3a25d6a4af62f82a5767255be01755de Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 15 Nov 2024 10:19:21 +0100
Subject: [PATCH 286/310] Started description JSON

---
 doc/description.json | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 doc/description.json

diff --git a/doc/description.json b/doc/description.json
new file mode 100644
index 0000000..5e24a35
--- /dev/null
+++ b/doc/description.json
@@ -0,0 +1,44 @@
+{
+    "Descriptions": {
+        "General": "Pan1c (Pangenome at Chromosome Scale) is a Snakemake workflow designed to simplify the construction and quality assessment of pangenome graphs. The workflow consists of splitting the overall graph construction into chromosome-level graph construction, followed by concatenation. This reduces the complexity of graph construction and exploration while enabling the use of cluster infrastructures, ultimately reducing the overall construction time.",
+        "Path composition": "The composition of each path is computed at the chromosome level and corresponds to a graph definition. The core group represents the cumulative length of nodes traversed by all paths/haplotypes (this length varies due to node repetition). The private group represents the cumulative length of nodes traversed by individual haplotypes.",
+        "Shared content": "Cumulative length of nodes shared pairwise between haplotypes.",
+        "Diff Shared content": "Pairwise Euclidean distance between 'shared content' matrices.",
+        "Odgi figures": "The figure shows a single chromosome. The top panel displays a decomposition of haplotype scaffolds into contigs, indicated by alternating shades of grey. The bottom panel provides a 1D visualization of the graph using Odgi. Syntenic regions are superimposed, while divergent regions appear as white gaps in specific haplotypes.",
+        "Syri figures": "The Syri tool produces a figure showing structural variants between haplotypes. Each comparison is pairwise, meaning that only two haplotypes are compared at a time.",
+        "Variant figures ": "Displays the variant count by size for deletions and insertions. All variants are detected relative to the reference set in the workflow. Two tools are used: Syri, which represents the standard method for variant detection by aligning each haplotype against the reference, and VG, which uses the pangenome graph (PGGB or Minigraph-Cactus) to detect variants."
+    },
+    "Metrics": {
+        "Path.length": "Length of the path",
+        "Path.nodes.count": "Number of unique node_ids in the path (without repeats)",
+        "Path.private.nodes.count": "Number of unique node_ids traversed only by this path (without repeats)",
+        "Path.core.nodes.count": "Number of node_ids traversed by all paths (without repeats)",
+        "Path.private.length": "Sum of the private node lengths in the path (without repeats)",
+        "Path.core.length": "Sum of the core node lengths in the path (without repeats)",
+        "Path.private.R.length": "Sum of the private node lengths in the path (with repeats)",
+        "Path.core.R.length": "Sum of the core node lengths in the path (with repeats)",
+        "Path.steps.count": "Number of nodes in the path (with repeats)",
+        "Path.nodes.R.size.mean": "Mean length of nodes in the path (with repeats)",
+        "Path.nodes.size.mean": "Mean length of nodes in the path (without repeats)",
+        "Path.nodes.R.size.median": "Median length of nodes in the path (with repeats)",
+        "Path.nodes.size.median": "Median length of nodes in the path (without repeats)",
+        "Path.degree.mean": "Mean degree of path nodes (without repeats)",
+        "Path.private.degree.mean": "Mean degree of private nodes of the path (without repeats)",
+        "Path.core.degree.mean": "Mean degree of core nodes of the path (without repeats)",
+        "Nodes.count": "Number of unique node_ids in the graph (without repeats)",
+        "Edges.count": "Number of edges in the graph (without repeats)",
+        "Path.count": "Number of paths in the graph",
+        "Path.length.mean": "Mean path lengths",
+        "Path.length.median": "Median path lengths",
+        "Nodes.private.count": "Number of private nodes (without repeats)",
+        "Nodes.core.count": "Number of core nodes (without repeats)",
+        "Steps.count": "Total number of nodes required to obtain all complete paths (with repeats)",
+        "Total.nodes.length": "Total length of nodes (without repeats)",
+        "Total.sequence.length": "Total sequence length stored in the graph",
+        "Compression.factor": "Size reduction factor between total sequence length and total node length",
+        "Nodes.length.mean": "Mean length of nodes (without repeats)",
+        "Nodes.length.median": "Median length of nodes (without repeats)",
+        "Degree.mean": "Mean number of edges per node. Edges are counted twice for start and end nodes",
+        "Degree.median": "Median number of edges per node. Edges are counted twice for start and end nodes"
+    }
+}
\ No newline at end of file
-- 
GitLab


From aa9880118a0234a700d779d1520c0dba7266bd1f Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Fri, 15 Nov 2024 11:40:14 +0100
Subject: [PATCH 287/310] New documentation for Pan1c-View

---
 doc/Pan1c.documentation.json | 67 ++++++++++++++++++++++++++++++++++++
 doc/description.json         | 44 -----------------------
 2 files changed, 67 insertions(+), 44 deletions(-)
 create mode 100644 doc/Pan1c.documentation.json
 delete mode 100644 doc/description.json

diff --git a/doc/Pan1c.documentation.json b/doc/Pan1c.documentation.json
new file mode 100644
index 0000000..75cbc85
--- /dev/null
+++ b/doc/Pan1c.documentation.json
@@ -0,0 +1,67 @@
+{
+    "Summary": {
+        "General Information": "Pan1c (Pangenome at Chromosome Scale) is a Snakemake workflow designed to simplify the construction and quality assessment of pangenome graphs. The workflow involves splitting the overall graph construction into chromosome-level graph construction, followed by concatenation. Thus, only intra-chromosomal variation is represented. Inter-chromosomal variation is not embedded into the graph but can be inferred when mapping onto it. RagTag is used to scaffold input assemblies and allows for clustering inputs by chromosome sequences.",
+        "Files": {
+            "GFAv1": "Graphical Fragment Assembly v1.0: Haplotypes are represented as P-lines. See https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md for more info.",
+            "XG": "Indexed graph used with the VG toolkit. See https://github.com/vgteam/vg/wiki/File-Formats#xg-xg-lightweight-graph--path-index for more info.",
+            "VCF": "Variant Calling Format generated using 'vg deconstruct' on the final graph. The reference is the reference haplotype set in the workflow config file."
+        }
+    },
+    "Assemblies": {
+        "Assemblathon": "Assemblathon is run on the input assemblies (before RagTag) to evaluate their quality.",
+        "Chromosome Length": "The length is retrieved from the FAI computed on clustered sequences (after RagTag).",
+        "Contig Position": "Each scaffolded haplotype (after RagTag) is split into its contigs (using N repeats as contig borders). Transitions between contigs are represented with gray color changes. A 'black' region indicates the presence of many small contigs.",
+        "SyRI": "SyRI detects structural variations between two genomes. It uses an alignment file (produced by Minimap2 here). Comparisons are pairwise between two haplotypes."
+    },
+    "Graph": {
+        "General Metrics": {
+            "Description": "Metrics are computed using GFAstats.py (included in the PanGeTools Apptainer image).",
+            "Metrics": {
+                "Path.length": "Length of the path.",
+                "Path.nodes.count": "Number of unique node_ids in the path (excluding repeats).",
+                "Path.private.nodes.count": "Number of unique node_ids traversed only by this path (excluding repeats).",
+                "Path.core.nodes.count": "Number of node_ids traversed by all paths (excluding repeats).",
+                "Path.private.length": "Sum of the lengths of private nodes in the path (excluding repeats).",
+                "Path.core.length": "Sum of the lengths of core nodes in the path (excluding repeats).",
+                "Path.private.R.length": "Sum of the lengths of private nodes in the path (including repeats).",
+                "Path.core.R.length": "Sum of the lengths of core nodes in the path (including repeats).",
+                "Path.steps.count": "Number of nodes in the path (including repeats).",
+                "Path.nodes.R.size.mean": "Mean length of nodes in the path (including repeats).",
+                "Path.nodes.size.mean": "Mean length of nodes in the path (excluding repeats).",
+                "Path.nodes.R.size.median": "Median length of nodes in the path (including repeats).",
+                "Path.nodes.size.median": "Median length of nodes in the path (excluding repeats).",
+                "Path.degree.mean": "Mean degree of path nodes (excluding repeats).",
+                "Path.private.degree.mean": "Mean degree of private nodes in the path (excluding repeats).",
+                "Path.core.degree.mean": "Mean degree of core nodes in the path (excluding repeats).",
+                "Nodes.count": "Number of unique node_ids in the graph (excluding repeats).",
+                "Edges.count": "Number of edges in the graph (excluding repeats).",
+                "Path.count": "Number of paths in the graph.",
+                "Path.length.mean": "Mean path length.",
+                "Path.length.median": "Median path length.",
+                "Nodes.private.count": "Number of private nodes (excluding repeats).",
+                "Nodes.core.count": "Number of core nodes (excluding repeats).",
+                "Steps.count": "Total number of nodes required to obtain all complete paths (including repeats).",
+                "Total.nodes.length": "Total length of nodes (excluding repeats).",
+                "Total.sequence.length": "Total sequence length stored in the graph.",
+                "Compression.factor": "Size reduction factor between total sequence length and total node length.",
+                "Nodes.length.mean": "Mean length of nodes (excluding repeats).",
+                "Nodes.length.median": "Median length of nodes (excluding repeats).",
+                "Degree.mean": "Mean number of edges per node. Edges are counted twice for start and end nodes.",
+                "Degree.median": "Median number of edges per node. Edges are counted twice for start and end nodes."
+            }
+        },
+        "Path Composition": "Using GFAstats.py, nodes are categorized into three groups at the chromosome scale: Core, Private, Other. Core corresponds to nodes traversed by all paths (haplotypes) from the chromosome. Private corresponds to nodes traversed by only one path (haplotype). Other includes the remaining nodes. For a given path, the proportion of its length by group is computed using the sequence length of the corresponding nodes, including repetition.",
+        "Shared Content": {
+            "Description": "At the chromosome scale, the list of node_ids shared between each pair of paths (haplotypes) is computed. Several metrics are derived from this list.",
+            "Metrics": {
+                "Shared.nodes.count": "Number of nodes shared between a given pair of paths.",
+                "Shared.length": "Sum of the sequence lengths of shared nodes (excluding repeats).",
+                "Shared.R.length": "Sum of the sequence lengths of shared nodes (including repeats).",
+                "Shared.prop": "Proportion of the path that is shared (excluding repeats).",
+                "Shared.R.prop": "Proportion of the path that is shared (including repeats)."
+            }
+        },
+        "Odgi Figures": "The contig position figure from the Assembly section is superimposed with the Odgi 1D visualization for each chromosome graph. Coordinates are not strictly identical, as the Odgi figure aims to identify similar regions (indicated by corresponding blocks of color) and INDEL regions (indicated by a white line in a given haplotype), which can introduce gaps in haplotypes."
+    },
+    "Variants": "Variants are called from graphs (PGGB or MC) using 'vg deconstruct' and from linear haplotypes using SyRI. Only variants between 50bp and 100kbp in size are kept for statistics and comparisons. SyRI represents the classic method of detecting variants. In both cases, the reference is the same: the reference haplotype set in the workflow config file."
+}
diff --git a/doc/description.json b/doc/description.json
deleted file mode 100644
index 5e24a35..0000000
--- a/doc/description.json
+++ /dev/null
@@ -1,44 +0,0 @@
-{
-    "Descriptions": {
-        "General": "Pan1c (Pangenome at Chromosome Scale) is a Snakemake workflow designed to simplify the construction and quality assessment of pangenome graphs. The workflow consists of splitting the overall graph construction into chromosome-level graph construction, followed by concatenation. This reduces the complexity of graph construction and exploration while enabling the use of cluster infrastructures, ultimately reducing the overall construction time.",
-        "Path composition": "The composition of each path is computed at the chromosome level and corresponds to a graph definition. The core group represents the cumulative length of nodes traversed by all paths/haplotypes (this length varies due to node repetition). The private group represents the cumulative length of nodes traversed by individual haplotypes.",
-        "Shared content": "Cumulative length of nodes shared pairwise between haplotypes.",
-        "Diff Shared content": "Pairwise Euclidean distance between 'shared content' matrices.",
-        "Odgi figures": "The figure shows a single chromosome. The top panel displays a decomposition of haplotype scaffolds into contigs, indicated by alternating shades of grey. The bottom panel provides a 1D visualization of the graph using Odgi. Syntenic regions are superimposed, while divergent regions appear as white gaps in specific haplotypes.",
-        "Syri figures": "The Syri tool produces a figure showing structural variants between haplotypes. Each comparison is pairwise, meaning that only two haplotypes are compared at a time.",
-        "Variant figures ": "Displays the variant count by size for deletions and insertions. All variants are detected relative to the reference set in the workflow. Two tools are used: Syri, which represents the standard method for variant detection by aligning each haplotype against the reference, and VG, which uses the pangenome graph (PGGB or Minigraph-Cactus) to detect variants."
-    },
-    "Metrics": {
-        "Path.length": "Length of the path",
-        "Path.nodes.count": "Number of unique node_ids in the path (without repeats)",
-        "Path.private.nodes.count": "Number of unique node_ids traversed only by this path (without repeats)",
-        "Path.core.nodes.count": "Number of node_ids traversed by all paths (without repeats)",
-        "Path.private.length": "Sum of the private node lengths in the path (without repeats)",
-        "Path.core.length": "Sum of the core node lengths in the path (without repeats)",
-        "Path.private.R.length": "Sum of the private node lengths in the path (with repeats)",
-        "Path.core.R.length": "Sum of the core node lengths in the path (with repeats)",
-        "Path.steps.count": "Number of nodes in the path (with repeats)",
-        "Path.nodes.R.size.mean": "Mean length of nodes in the path (with repeats)",
-        "Path.nodes.size.mean": "Mean length of nodes in the path (without repeats)",
-        "Path.nodes.R.size.median": "Median length of nodes in the path (with repeats)",
-        "Path.nodes.size.median": "Median length of nodes in the path (without repeats)",
-        "Path.degree.mean": "Mean degree of path nodes (without repeats)",
-        "Path.private.degree.mean": "Mean degree of private nodes of the path (without repeats)",
-        "Path.core.degree.mean": "Mean degree of core nodes of the path (without repeats)",
-        "Nodes.count": "Number of unique node_ids in the graph (without repeats)",
-        "Edges.count": "Number of edges in the graph (without repeats)",
-        "Path.count": "Number of paths in the graph",
-        "Path.length.mean": "Mean path lengths",
-        "Path.length.median": "Median path lengths",
-        "Nodes.private.count": "Number of private nodes (without repeats)",
-        "Nodes.core.count": "Number of core nodes (without repeats)",
-        "Steps.count": "Total number of nodes required to obtain all complete paths (with repeats)",
-        "Total.nodes.length": "Total length of nodes (without repeats)",
-        "Total.sequence.length": "Total sequence length stored in the graph",
-        "Compression.factor": "Size reduction factor between total sequence length and total node length",
-        "Nodes.length.mean": "Mean length of nodes (without repeats)",
-        "Nodes.length.median": "Median length of nodes (without repeats)",
-        "Degree.mean": "Mean number of edges per node. Edges are counted twice for start and end nodes",
-        "Degree.median": "Median number of edges per node. Edges are counted twice for start and end nodes"
-    }
-}
\ No newline at end of file
-- 
GitLab

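Note: the "Path Composition" entry added above describes how GFAstats.py buckets nodes into Core, Private and Other at the chromosome scale and then weights each bucket by sequence length. A minimal, hypothetical Python sketch of that idea (paths, node ids and lengths invented; this is not the actual GFAstats.py code):

# Classify nodes by how many paths traverse them, then report the share of
# each path's length (repeats included) that falls in the Core and Private groups.
paths = {                                  # path name -> traversed node ids, with repeats
    "sampleA#1": [1, 2, 3, 2],
    "sampleB#1": [1, 3, 4],
}
node_len = {1: 100, 2: 50, 3: 10, 4: 200}  # node id -> sequence length (bp)

presence = {n: {p for p, nodes in paths.items() if n in nodes} for n in node_len}
core = {n for n, ps in presence.items() if len(ps) == len(paths)}      # traversed by every path
private = {n for n, ps in presence.items() if len(ps) == 1}            # traversed by one path only

for path, nodes in paths.items():
    total = sum(node_len[n] for n in nodes)                            # path length, with repeats
    for group, members in (("Core", core), ("Private", private)):
        group_len = sum(node_len[n] for n in nodes if n in members)
        print(path, group, round(100 * group_len / total, 1))          # % of the path in the group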

From 0849383da8f354e7c2b1e0280d751aaf40376e0e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Fri, 15 Nov 2024 14:43:16 +0100
Subject: [PATCH 288/310] Some changes for Pan1c-View

---
 Snakefile                 | 2 +-
 scripts/graph.pan1c_QC.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index f2d79e1..057685c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1224,7 +1224,7 @@ rule Pan1c_View_data:
     input:
         Pan1c_view_data_inputs
     output:
-        "output/"+config['name']+".Pan1c-View.data.tar.gz"
+        "output/"+config['name']+".Pan1c-View.tar.gz"
     threads: 1
     resources:
         mem_mb = 8000
diff --git a/scripts/graph.pan1c_QC.py b/scripts/graph.pan1c_QC.py
index 65b4155..51fba13 100644
--- a/scripts/graph.pan1c_QC.py
+++ b/scripts/graph.pan1c_QC.py
@@ -92,7 +92,6 @@ for tsv in args.path:
     sdf.set_index(["Chr.id", "Query.name", "Target.name"], inplace=True)
     sdf.loc[:, "Shared.prop"] = sdf["Shared.length"]*100/sdf["Path.length"]
     sdf.loc[:, "Shared.R.prop"] = sdf["Shared.R.length"]*100/sdf["Path.length"]
-    sdf.loc[:, "Shared.length.mb"] = sdf["Shared.length"]/1000000
     sdf.reset_index(inplace=True)
 
     for chrid in sdf["Chr.id"].unique():
-- 
GitLab

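Note: the two "Shared.prop" assignments kept in graph.pan1c_QC.py above express the pairwise shared length as a percentage of the query path length; the dropped "Shared.length.mb" column was only a unit conversion. A small pandas sketch of the same computation on invented values (column names taken from the script, data hypothetical):

import pandas as pd

# Toy shared-content table; columns mirror those used in graph.pan1c_QC.py.
sdf = pd.DataFrame({
    "Chr.id": ["chr1", "chr1"],
    "Query.name": ["sampleA#1", "sampleB#1"],
    "Target.name": ["sampleB#1", "sampleA#1"],
    "Shared.length": [800_000, 800_000],    # shared node length, without repeats
    "Shared.R.length": [950_000, 900_000],  # shared node length, with repeats
    "Path.length": [1_200_000, 1_000_000],  # length of the query path
})

sdf.set_index(["Chr.id", "Query.name", "Target.name"], inplace=True)
sdf.loc[:, "Shared.prop"] = sdf["Shared.length"] * 100 / sdf["Path.length"]
sdf.loc[:, "Shared.R.prop"] = sdf["Shared.R.length"] * 100 / sdf["Path.length"]
sdf.reset_index(inplace=True)
print(sdf[["Query.name", "Target.name", "Shared.prop", "Shared.R.prop"]])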

From 727da110c1aa835b1301b10e7dbbdf2781b1e9e5 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Fri, 15 Nov 2024 14:48:35 +0100
Subject: [PATCH 289/310] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 057685c..192f48b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1224,7 +1224,7 @@ rule Pan1c_View_data:
     input:
         Pan1c_view_data_inputs
     output:
-        "output/"+config['name']+".Pan1c-View.tar.gz"
+        "output/"+config['name']+".Pan1c_View.tar.gz"
     threads: 1
     resources:
         mem_mb = 8000
-- 
GitLab


From 9ed9e724c20cd39c713344f1daa000edca99ed5e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Fri, 15 Nov 2024 14:51:51 +0100
Subject: [PATCH 290/310] Added doc to Pan1c-view tarball

---
 Snakefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 192f48b..391c972 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1177,7 +1177,8 @@ def Pan1c_view_data_inputs(wildcards):
         "output/report_data/Pan1c."+config['name']+".assembly.json",
         "output/report_data/Pan1c."+config['name']+".graph.json",
         "output/report_data/Pan1c."+config['name']+".var.json",
-        "output/report_data/Pan1c."+config['name']+".tags.json"
+        "output/report_data/Pan1c."+config['name']+".tags.json",
+        "doc/Pan1c.documentation.json"
     ]
 
     # Adding GFAv1
-- 
GitLab


From 666ca9bdad853a8908ee3775e1c21f2b7b08af59 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 15 Nov 2024 16:08:41 +0100
Subject: [PATCH 291/310] Fixed rule all

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 391c972..8af05c0 100644
--- a/Snakefile
+++ b/Snakefile
@@ -77,7 +77,7 @@ def which_analysis():
         if config["get_VCF"] == "True":
             analysis_inputs.append("output/report_data/Pan1c."+config['name']+".var.json")
 
-        analysis_inputs.append("output/"+config['name']+".Pan1c-View.data.tar.gz")
+        analysis_inputs.append("output/"+config['name']+".Pan1c-View.tar.gz")
 
     if config["get_VCF"] == "True":
             analysis_inputs.append(
-- 
GitLab


From a10e5a2a53f9e1117185133a8da191b0caaf0f9e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 15 Nov 2024 16:12:26 +0100
Subject: [PATCH 292/310] Typo

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 8af05c0..a626f96 100644
--- a/Snakefile
+++ b/Snakefile
@@ -77,7 +77,7 @@ def which_analysis():
         if config["get_VCF"] == "True":
             analysis_inputs.append("output/report_data/Pan1c."+config['name']+".var.json")
 
-        analysis_inputs.append("output/"+config['name']+".Pan1c-View.tar.gz")
+        analysis_inputs.append("output/"+config['name']+".Pan1c_View.tar.gz")
 
     if config["get_VCF"] == "True":
             analysis_inputs.append(
-- 
GitLab

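Note: patches 288-292 settle the name of the Pan1c-View archive on <name>.Pan1c_View.tar.gz; the rule that builds it (visible in context further down) packs the report JSONs with GNU tar and strips the report_data/ directory prefix via --transform. An equivalent, hypothetical sketch using Python's tarfile, where arcname plays the role of --transform (paths invented):

import tarfile

# Pack report files into a .tar.gz while dropping a directory prefix,
# like tar --transform 's/data\/report_data\///' does in the rule.
def make_view_tarball(files, out_path, strip_prefix="data/report_data/"):
    with tarfile.open(out_path, "w:gz") as tar:
        for path in files:
            arcname = path[len(strip_prefix):] if path.startswith(strip_prefix) else path
            tar.add(path, arcname=arcname)

# make_view_tarball(["data/report_data/Pan1c.demo.assembly.json"],
#                   "output/demo.Pan1c_View.tar.gz")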

From e243329a0cc5529793fb3cff3cd22cf0ccc36022 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 18 Nov 2024 10:53:00 +0100
Subject: [PATCH 293/310] Removed legacy report and added benchmark for
 snakemake

---
 Snakefile                   | 320 +++++-------------------------------
 scripts/sr_mapping2graph.sh |  32 ----
 2 files changed, 38 insertions(+), 314 deletions(-)
 delete mode 100755 scripts/sr_mapping2graph.sh

diff --git a/Snakefile b/Snakefile
index a626f96..cb199ea 100644
--- a/Snakefile
+++ b/Snakefile
@@ -84,18 +84,6 @@ def which_analysis():
                 expand("output/Pan1c.{gtool}."+config['name']+".vcf.gz", gtool=graph_tools)
             )
 
-    ### Legacy option, will soon be removed
-
-    if config["create_report"] == "True": # Creating legacy report
-        analysis_inputs.append(
-            expand("output/Pan1c.{gtool}."+config['name']+".report.md", gtool=graph_tools)
-        )
-
-        if config["get_VCF"] == "True": # VCF from the final graph against the "reference"
-            analysis_inputs.append(
-                expand("output/{gtool}.vcf.figs", gtool=graph_tools)
-            )
-        
     return analysis_inputs
 
 """
@@ -128,6 +116,8 @@ rule ragtag_scaffolding:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000 
     retries: 1
     priority: 100
+    benchmark:
+        "statistics/ragtag.{haplotype}.txt"
     params:
         app_path=config["app.path"],
         mm2_config=config["ragtag_mm2_conf"],
@@ -169,6 +159,8 @@ rule quast_stats:
         app_path=config["app.path"],
         pan_name=config["name"],
         tmp_dir="output/quast"
+    benchmark:
+        "statistics/quast.{haplotype}.txt"
     log: 
         cmd="logs/quast/quast.cmd.log",
         time="logs/quast/quast.time.log"
@@ -205,6 +197,8 @@ rule assemblathon_stats:
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * 16000
+    benchmark:
+        "statistics/assemblathon.{haplotype}.txt"
     params:
         app_path=config["app.path"]
     shell:
@@ -232,6 +226,8 @@ rule contig_positions:
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
+    benchmark:
+        "statistics/contig_pos.{chromosome}.txt"
     params:
         app_path=config["app.path"]
     shell:
@@ -281,6 +277,8 @@ rule chromosome_clustering:
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
     priority: 100
+    benchmark:
+        "statistics/chromosome_clustering.txt"
     params:
         app_path=config["app.path"],
         pan_name=config["name"]
@@ -306,6 +304,8 @@ rule SyRI_on_ASM_mm2:
     threads: 4
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000
+    benchmark:
+        "statistics/syri_asm_mm2.{haplotype}.txt"
     params:
         app_path=config["app.path"],
         wrk_dir="data/asm.syri.mm2",
@@ -351,6 +351,8 @@ rule SyRI_on_ASM_wfm:
     threads: 4
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000
+    benchmark:
+        "statistics/syri_asm_wfmash.{haplotype}.txt"
     params:
         app_path=config["app.path"],
         wrk_dir="data/asm.syri.wfm",
@@ -456,6 +458,8 @@ rule SyRI_on_chrInput:
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000
+    benchmark:
+        "statistics/syri_chr_mm2.{chromosome}.txt"
     params:
         app_path=config["app.path"],
         ref=config['reference'],
@@ -518,6 +522,8 @@ rule wfmash_on_chr:
         segment_length=config['wfmash.segment_length'],
         mapping_id=config['wfmash.mapping_id'],
         wfmash_sec=config['wfmash.secondary']
+    benchmark:
+        "statistics/wfmash.{chromosome}.txt"
     log: 
         cmd_map="logs/pggb/{chromosome}.wfmash_mapping.cmd.log",
         time_map="logs/pggb/{chromosome}.wfmash_mapping.time.log",
@@ -566,6 +572,8 @@ rule seqwish:
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
     priority: 100
+    benchmark:
+        "statistics/seqwish.{chromosome}.txt"
     params:
         app_path=config['app.path'],
         seqwish=config['seqwish.params']
@@ -599,6 +607,8 @@ rule gfaffix_on_chr:
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 24000
     priority: 100
+    benchmark:
+        "statistics/gfaffix.{chromosome}.txt"
     params:
         app_path=config['app.path']
     log: 
@@ -632,6 +642,8 @@ rule odgi_postprocessing:
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
+    benchmark:
+        "statistics/odgi_postprocessing.{chromosome}.txt"
     priority: 100
     params:
         app_path=config['app.path']
@@ -701,6 +713,8 @@ rule MC_graph:
     threads: 16
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000
+    benchmark:
+        "statistics/MC.{chromosome}.txt"
     params:
         mc_dir='data/chrGraphs/MC.{chromosome}',
         ref_name=config['reference'],
@@ -783,6 +797,8 @@ rule graph_squeeze:
     threads: 16
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
+    benchmark:
+        "statistics/graph_squeeze.{gtool}.{chromosome}.txt"
     priority: 100
     params:
         app_path=config['app.path']
@@ -818,6 +834,8 @@ rule graph_stats:
     threads: 4
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000
+    benchmark:
+        "statistics/graph_stats.{gtool}.{chromosome}.txt"
     params:
         app_path=config['app.path'],
         pan_name=config['name']
@@ -946,6 +964,8 @@ rule graph_json:
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
+    benchmark:
+        "statistics/graph_json.{gtool}.{chromosome}.txt"
     params:
         app_path=config["app.path"],
         pan_name=config["name"]
@@ -998,6 +1018,8 @@ rule panacus_stats:
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
+    benchmark:
+        "statistics/panacus.{gtool}.{chromosome}.txt"
     shell:
         """
         /usr/bin/time -v -o {log.time} \
@@ -1020,6 +1042,8 @@ rule vg_deconstruct:
     threads: 8
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000
+    benchmark:
+        "statistics/vg_deconstruct.{gtool}.txt"
     params:
         app_path=config['app.path'],
         ref=config['reference']
@@ -1159,6 +1183,8 @@ rule var_json:
     threads: 1
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 48000
+    benchmark:
+        "statistics/var_json.txt"
     params:
         app_path=config['app.path'],
         refname=config['reference']
@@ -1236,273 +1262,3 @@ rule Pan1c_View_data:
             --transform 's/data\/report_data\///' \
             -czvf {output} {input}
         """
-    
-rule create_pan1c_report_fig:
-    # Produces a markdown report figure of chromosomes graphs
-    input:
-        graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.tmp.gfa',
-        contigfig="output/chr.contig/Pan1c."+config['name']+".{chromosome}.contig.png",
-    output:
-        odgifig=temp("tmp/Pan1c.{gtool}.{chromosome}.odgi.png"),
-        namefig=temp("tmp/Pan1c.{gtool}.{chromosome}.name.png"),
-        reportfig="output/report/Pan1c.{gtool}."+config['name']+".{chromosome}.report.fig.png"
-    threads: 4
-    resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
-    params:
-        app_path=config['app.path']
-    shell:
-        """
-        ## Odgi 1D viz
-        apptainer run --app odgi {params.app_path}/PanGeTools.sif \
-            viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P
-
-        ## Getting legend from contig figure
-        convert {input.contigfig} -crop 790x+0+0 +repage {output.namefig}
-
-        odgheight=$(identify -ping -format '%h' {output.odgifig})
-        ctgheight=$(identify -ping -format '%h' {input.contigfig})
-
-        combinedheight=$(($odgheight+$ctgheight))
-
-        echo -e "[Debug::Report fig]\t$odgheight\t$ctgheight\t$combinedheight"
-
-        ## Creating empty canvas ad adding other figures to it
-        convert -size "3300x$combinedheight" xc:white {output.reportfig}
-        composite -geometry +0+0 {input.contigfig} {output.reportfig} {output.reportfig}
-        composite -geometry "+0+$ctgheight" {output.namefig} {output.reportfig} {output.reportfig}
-        composite -geometry "+790+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig}
-        """
-
-rule create_chrGraphs_figs:
-    # Produce figures based on aggregated path stats
-    input:
-        pathstats="output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.path.stats.tsv"
-    output:
-        barplots=expand("output/chrGraphs.stats.figs/Pan1c.{{gtool}}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST),
-        scatters=expand("output/chrGraphs.stats.figs/Pan1c.{{gtool}}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST),
-        heatmaps=expand("output/chrGraphs.stats.figs/Pan1c.{{gtool}}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST),
-        barplot_mean="output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".path.decomp.mean.png",
-        scatter_mean="output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".2D.scatter.mean.png",
-        heatmap_diff="output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".shared.content.diff.png"
-    threads: 1
-    resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
-    params:
-        app_path=config['app.path'],
-        pan_name=config['name'],
-        ref_name=config['reference']
-    shell:
-        """
-        mkdir -p $(dirname {output.barplot_mean})
-
-        ref="$(basename {params.ref_name} .fa.gz | cut -f1 -d'.')#$(basename {params.ref_name} .fa.gz | cut -f2 -d'.' | cut -f2 -d'p')"
-
-        apptainer run {params.app_path}/pan1c-env.sif python scripts/chrGraphs.stats_figs.py \
-            --input {input.pathstats} --output_dir $(dirname {output.barplot_mean}) \
-            --panname {params.pan_name} --reference "$ref" --grapher {wildcards.gtool}
-        """
-
-def get_report_sections(wildcards):
-    """
-    Return 'create_pan1c_report' optional inputs to add them to the final report.
-    For example :
-        - SyRI figures on Assemblies
-    """
-    sections = dict()
-
-    sections["metadata"] = "output/Pan1c."+config['name']+".gfa.metadata"
-    sections["odgifigs"] = expand("output/report/Pan1c.{gtool}."+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
-    sections["genstats"] = f"output/stats/Pan1c.{wildcards.gtool}."+config['name']+".chrGraph.general.stats.tsv"
-    sections["pathstats"] = f"output/stats/Pan1c.{wildcards.gtool}."+config['name']+".chrGraph.path.stats.tsv"
-    sections["barplots"] = expand("output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
-    sections["scatters"] = expand("output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
-    sections["heatmaps"] = expand("output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool])
-    sections["barplot_mean"] = f"output/chrGraphs.stats.figs/Pan1c.{wildcards.gtool}."+config['name']+".path.decomp.mean.png"
-    sections["scatter_mean"] = f"output/chrGraphs.stats.figs/Pan1c.{wildcards.gtool}."+config['name']+".2D.scatter.mean.png"
-    sections["heatmap_diff"] = f"output/chrGraphs.stats.figs/Pan1c.{wildcards.gtool}."+config['name']+".shared.content.diff.png"
-
-    if config["get_ASMs_SyRI"] == "True":
-        sections["SyRI_on_ASMs_figs"] = expand(
-            "output/asm.syri.figs/Pan1c."+config['name']+".{haplotype}.syri_mm2.png", 
-            haplotype=SAMPLES_NOREF
-            )
-
-    if config["get_chrInputs_SyRI"] == "True":
-        sections["SyRI_on_chrInputs_figs"] = expand(
-            "output/chrInput.syri.figs/Pan1c."+config['name']+".{chromosome}.syri_mm2.png", 
-            chromosome=CHRLIST
-            )
-
-    if config['get_VCF'] == "True":
-        sections['VCF_figs'] = f"output/{wildcards.gtool}.vcf.figs"
-
-    return sections      
-
-rule create_pan1c_report:
-    # Produces a markdown report of chromosomes graphs
-    input:
-        unpack(get_report_sections)
-    output:
-        report="output/Pan1c.{gtool}."+config['name']+".report.md",
-        html="output/Pan1c.{gtool}."+config['name']+".report.html"
-    threads: 4
-    resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 500
-    params:
-        app_path=config['app.path'],
-        add_ASMs_SyRI=config['get_ASMs_SyRI'],
-        add_chrInputs_SyRI=config['get_chrInputs_SyRI'],
-        add_VCF_fig=config['get_VCF']
-    run:
-        shell("touch {output.report}")
-
-        # Adding Summary
-        shell("echo '# Summary' >> {output.report}")
-        shell("echo '- [Graph metadata](#graph-metadata)' >> {output.report}")
-        shell("echo '- [General stats](#general-stats)' >> {output.report}")
-        shell("echo '- [Path stats](#path-stats)' >> {output.report}")
-        shell("echo '- [Chromosome-scale odgi graphs](#chromosome-scale-odgi-graphs)' >> {output.report}")
-        if params.add_ASMs_SyRI == "True":
-            shell("echo '- [SyRI on input assemblies](#syri-on-input-assemblies)' >> {output.report}")
-        shell("echo '' >> {output.report}")
-        
-        # Adding graph construction info
-        ## WIP
-
-        # Adding metadata
-        shell("echo '# Graph metadata' >> {output.report}")
-        shell("echo '```' >> {output.report}")
-        shell("cat {input.metadata} >> {output.report}")
-        shell("echo '```' >> {output.report}")
-        shell("echo '' >> {output.report}")
-
-        # Adding General stats
-        shell("echo '# General stats' >> {output.report}")
-        shell("cat {input.genstats} | apptainer run {params.app_path}/pan1c-env.sif csv2md -d $'\\t' >> {output.report}")
-        shell("echo '' >> {output.report}")
-
-        # Adding Path stats
-        shell("echo '# Path stats' >> {output.report}")
-
-        ## Barplots sub section
-        shell("echo '## Path composition' >> {output.report}")
-        shell("echo -e 'First, the list of unique node ids belonging to each group is computed :\n- Core : node ids found in every path of the graph\n- Private : node ids found in one and only path of the graph\n- Other : node ids that does not belong to previous groups\n\nEach group length is computed using the sum of node lengths, weighted by the number of copies found in a given Path (left panel). This sum is then divided by the Path length (right panel).' >> {output.report}")
-        _basename = os.path.basename(input.barplot_mean)
-        shell("echo '![{_basename}](./chrGraphs.stats.figs/{_basename})' >> {output.report}")
-
-        barplot_figs_list = [fig for fig in input.barplots]
-        barplot_figs_list.sort()
-
-        for i in range(len(barplot_figs_list)):
-            barplot_basename=os.path.basename(barplot_figs_list[i])
-            chr_name=barplot_basename.split('.')[-2]
-
-            shell("echo '### {chr_name}' >> {output.report}")
-            shell("echo '![{barplot_basename}](./chrGraphs.stats.figs/{barplot_basename})' >> {output.report}")
-        
-        ## 2D scatter sub section
-        shell("echo '## Core vs Private' >> {output.report}")
-        shell("echo -e 'Using the same method to compute the proportion of Core and Private as in the previous figures, Private is plotted against Core. Basically, if the Path is in the lower right, it tends to have more Private than Core sequences compared to other Paths. The clustering is done using DBSCAN.' >> {output.report}")
-        _basename = os.path.basename(input.scatter_mean)
-        shell("echo '![{_basename}](./chrGraphs.stats.figs/{_basename})' >> {output.report}")
-
-        scatter_figs_list = [fig for fig in input.scatters]
-        scatter_figs_list.sort()
-
-        for i in range(len(scatter_figs_list)):
-            scatter_basename=os.path.basename(scatter_figs_list[i])
-            chr_name=scatter_basename.split('.')[-2]
-
-            shell("echo '### {chr_name}' >> {output.report}")
-            shell("echo '![{scatter_basename}](./chrGraphs.stats.figs/{scatter_basename})' >> {output.report}")
-
-        ## Heatmap section
-        shell("echo '## Pairwise shared content' >> {output.report}")
-        shell("echo -e 'For each pair of paths, the length and relative proportion of shared nodes is computed. The left panel shows the relative proportion in the Query path, including repetitions. The right panel shows the absolute length of shared nodes, without repeats.' >> {output.report}")
-        shell("echo '### Pairwise euclid distance betwwen chromosomes ' >> {output.report}")
-        _basename = os.path.basename(input.heatmap_diff)
-        shell("echo '![{_basename}](./chrGraphs.stats.figs/{_basename})' >> {output.report}")
-
-        hm_figs_list = [fig for fig in input.heatmaps]
-        hm_figs_list.sort()
-
-        for i in range(len(hm_figs_list)):
-            hm_basename=os.path.basename(hm_figs_list[i])
-            chr_name=hm_basename.split('.')[-2]
-
-            shell("echo '### {chr_name}' >> {output.report}")
-            shell("echo '![{hm_basename}](./chrGraphs.stats.figs/{hm_basename})' >> {output.report}")
-
-        #shell("cat {input.pathstats} | apptainer run {params.app_path}/pan1c-env.sif csv2md -d $'\\t' >> {output.report}")
-        shell("echo '' >> {output.report}")
-
-        # Adding chromosomes figures
-        odgi_figs_list = [fig for fig in input.odgifigs]
-        odgi_figs_list.sort()
-
-        if params.add_chrInputs_SyRI == "True":
-            syri_figs_dict = {
-                os.path.basename(fig).split('.')[1]:os.path.basename(fig) 
-                for fig in input.SyRI_on_chrInputs_figs
-                }
-        
-        shell("echo '# Chromosome-scale odgi graphs' >> {output.report}")
-        for i in range(len(odgi_figs_list)):
-            odgi_basename=os.path.basename(odgi_figs_list[i])
-            chr_name=odgi_basename.split('.')[2]
-            
-            shell("echo '## {chr_name}' >> {output.report}")
-            shell("echo '![{odgi_basename}](./report/{odgi_basename})' >> {output.report}")
-            if params.add_chrInputs_SyRI == "True":
-                syri_fig = syri_figs_dict[chr_name]
-                shell("echo '![{syri_fig}](./chrInput.syri.figs/{syri_fig})' >> {output.report}")
-        shell("echo '' >> {output.report}")
-
-        # Adding SyRI figs if produced
-        if params.add_ASMs_SyRI == "True":
-
-            fig_list = [fig for fig in input.SyRI_on_ASMs_figs]
-            fig_list.sort()
-
-            shell("echo '# SyRI on input assemblies' >> {output.report}")
-            for fig in fig_list:
-                basename = os.path.basename(fig)
-                hap_name = basename.split('.')[1:3]
-
-                shell("echo '## {hap_name[0]}, {hap_name[1]}' >> {output.report}")
-                shell("echo '![{basename}](./asm.syri.figs/{basename})' >> {output.report}")
-
-            shell("echo '' >> {output.report}")
-
-        # Adding VCF figure from vg
-        if params.add_VCF_fig == "True":
-            shell("echo '# INS/DEL length distribution' >> {output.report}")
-            figures = [
-                fig for fig in os.listdir(input.VCF_figs)
-                if fig[-3:] == "png"
-                and fig.split('.')[2][:7] != "General"
-            ]
-            figures.sort()
-
-            # Adding back 'general' figures
-            figures = [
-                f"pan1c.{config['name']}.General.vcf.{tool}.png" 
-                for tool in ["vg", "syri"]
-            ] + figures
-
-            general_fig = None
-            for basename in figures:
-                name = basename.split('.')[2]
-
-                if general_fig is None :
-                    shell("echo '## {name}' >> {output.report}")
-                    if name[:7] == "General": general_fig = name
-                else :
-                    general_fig = None
-
-                shell("echo '![{basename}](./{wildcards.gtool}.vcf.figs/{basename})' >> {output.report}")
-                shell("echo '' >> {output.report}")
-
-        # Converting to HTML
-        shell("pandoc --standalone -c src/github-markdown.css -f gfm -t html {output.report} > {output.html}")
\ No newline at end of file
diff --git a/scripts/sr_mapping2graph.sh b/scripts/sr_mapping2graph.sh
deleted file mode 100755
index e203385..0000000
--- a/scripts/sr_mapping2graph.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-# Map given shrot reads to the pangenome graph using vg giraffe. See wiki for detailled explanation.
-
-# Initializing arguments
-shortreads=""   # Fastq file to map against the graph
-appdir=""       # Directory containing apptainer images
-threads=""      # Threads count
-graph=""        # Pangenome graph
-output=""       # Output gam
-
-## Getting arguments
-while getopts "a:t:g:f:o:" option; do
-    case "$option" in
-        a) appdir="$OPTARG";;
-        t) threads="$OPTARG";;
-        g) graph="$OPTARG";;
-        f) shortreads="$OPTARG";;
-        o) output="$OPTARG";;
-        \?) echo "Usage: $0 [-a apptainer dir] [-t threads] [-g graph] [-f fastq] [-o output gam]" >&2
-            exit 1;;
-    esac
-done
-
-## Main script
-apptainer run --app vg $appdir/PanGeTools.sif \
-    autoindex -t $threads -g $graph --workflow giraffe -p $(basename $graph)
-
-apptainer run --app vg $appdir/PanGeTools.sif \
-    giraffe -Z ${graph}.giraffe.gbz -m ${graph}.min -d ${graph}.dist -f $shortreads -t $threads -p > $output
-
-apptainer run --app vg $appdir/PanGeTools.sif \
-    stats -a $output
-- 
GitLab

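Note: the benchmark: directives added throughout patch 293 make Snakemake write one tab-separated file per job under statistics/, including wall-clock time and peak memory. A hypothetical aggregation sketch (file pattern taken from the patch; the 's' and 'max_rss' column names are those produced by recent Snakemake versions):

import glob
import pandas as pd

# Collect every per-rule benchmark file written under statistics/.
frames = []
for path in glob.glob("statistics/*.txt"):
    df = pd.read_csv(path, sep="\t")
    df["rule"] = path.split("/")[-1].removesuffix(".txt")
    frames.append(df)

bench = pd.concat(frames, ignore_index=True)
# 's' is wall-clock time in seconds, 'max_rss' peak resident memory in MB.
print(bench.groupby("rule")[["s", "max_rss"]].max().sort_values("s", ascending=False))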

From e724c442d24e003356d4a6ea8bb7cbf0d671cc96 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 18 Nov 2024 10:55:05 +0100
Subject: [PATCH 294/310] Updated config files

---
 config.yaml              | 3 +--
 example/config_CICD.yaml | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/config.yaml b/config.yaml
index 1ed52d8..9b53b68 100644
--- a/config.yaml
+++ b/config.yaml
@@ -43,6 +43,5 @@ get_ASMs_SyRI: 'False' # Haplotype vs Reference
 get_chrInputs_SyRI: 'False' # SyRI on chrInputs
 # Producing VCF from Pangenome graph
 get_VCF: 'False'
-# Creating final report
-create_report: 'True'
+# Creating Pan1c_View tarball
 Pan1c-View_jsons: 'True'
diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml
index 30a7503..b10bc1b 100644
--- a/example/config_CICD.yaml
+++ b/example/config_CICD.yaml
@@ -44,6 +44,5 @@ get_ASMs_SyRI: 'True' # Haplotype vs Reference
 get_chrInputs_SyRI: 'True' # SyRI on chrInputs
 # Producing VCF and its associated INS/DEL figure
 get_VCF: 'True'
-# Creating final report
-create_report: 'True'
+# Creating Pan1c_View tarball
 Pan1c-View_jsons: 'True'
-- 
GitLab


From 2b31ff7c614bd3f6dddb1ab38223bb6291ec49fe Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 18 Nov 2024 10:55:35 +0100
Subject: [PATCH 295/310] Fixed quast rule

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index cb199ea..1c55377 100644
--- a/Snakefile
+++ b/Snakefile
@@ -160,7 +160,7 @@ rule quast_stats:
         pan_name=config["name"],
         tmp_dir="output/quast"
     benchmark:
-        "statistics/quast.{haplotype}.txt"
+        "statistics/quast.txt"
     log: 
         cmd="logs/quast/quast.cmd.log",
         time="logs/quast/quast.time.log"
-- 
GitLab


From 2dc055bf06690c63f53793ee0365640e95d7380a Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 18 Nov 2024 10:56:41 +0100
Subject: [PATCH 296/310] Fixed graph_squeeze rule

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 1c55377..af6254e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -798,7 +798,7 @@ rule graph_squeeze:
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
     benchmark:
-        "statistics/graph_squeeze.{gtool}.{chromosome}.txt"
+        "statistics/graph_squeeze.{gtool}.txt"
     priority: 100
     params:
         app_path=config['app.path']
-- 
GitLab


From 1934af2d2cde1dfcfdaedb6fa43286c275d11862 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 18 Nov 2024 10:57:32 +0100
Subject: [PATCH 297/310] Fixed graph_json rule

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index af6254e..0a7854e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -965,7 +965,7 @@ rule graph_json:
     resources:
         mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000
     benchmark:
-        "statistics/graph_json.{gtool}.{chromosome}.txt"
+        "statistics/graph_json.txt"
     params:
         app_path=config["app.path"],
         pan_name=config["name"]
-- 
GitLab

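Note: patches 295-297 drop wildcards from benchmark paths when the rule's outputs do not carry them; a benchmark file may only use the wildcards of its own rule, otherwise Snakemake cannot resolve one benchmark path per job. A minimal, hypothetical illustration (rule names, paths and commands invented):

rule per_chromosome:
    input:
        "data/{chromosome}.fa.gz"
    output:
        "stats/{chromosome}.tsv"
    benchmark:
        "statistics/per_chromosome.{chromosome}.txt"  # one benchmark per chromosome job
    shell:
        "compute_stats {input} > {output}"

rule aggregate:
    input:
        expand("stats/{chromosome}.tsv", chromosome=["chr1", "chr2"])
    output:
        "stats/all.tsv"
    benchmark:
        "statistics/aggregate.txt"                    # no wildcard: the rule runs once
    shell:
        "cat {input} > {output}"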

From 19cb012711e5d2678fdb435fba277b533cba49ca Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 18 Nov 2024 11:00:35 +0100
Subject: [PATCH 298/310] Readded report_fig rule

---
 Snakefile | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/Snakefile b/Snakefile
index 0a7854e..a663e1b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1195,6 +1195,43 @@ rule var_json:
             --output {output.json}
         """
 
+rule create_pan1c_report_fig:
+    # Produces a markdown report figure of chromosome graphs
+    input:
+        graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.tmp.gfa',
+        contigfig="output/chr.contig/Pan1c."+config['name']+".{chromosome}.contig.png",
+    output:
+        odgifig=temp("tmp/Pan1c.{gtool}.{chromosome}.odgi.png"),
+        namefig=temp("tmp/Pan1c.{gtool}.{chromosome}.name.png"),
+        reportfig="output/report/Pan1c.{gtool}."+config['name']+".{chromosome}.report.fig.png"
+    threads: 4
+    resources:
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
+    params:
+        app_path=config['app.path']
+    shell:
+        """
+        ## Odgi 1D viz
+        apptainer run --app odgi {params.app_path}/PanGeTools.sif \
+            viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P
+
+        ## Getting legend from contig figure
+        convert {input.contigfig} -crop 790x+0+0 +repage {output.namefig}
+
+        odgheight=$(identify -ping -format '%h' {output.odgifig})
+        ctgheight=$(identify -ping -format '%h' {input.contigfig})
+
+        combinedheight=$(($odgheight+$ctgheight))
+
+        echo -e "[Debug::Report fig]\t$odgheight\t$ctgheight\t$combinedheight"
+
+        ## Creating empty canvas and adding other figures to it
+        convert -size "3300x$combinedheight" xc:white {output.reportfig}
+        composite -geometry +0+0 {input.contigfig} {output.reportfig} {output.reportfig}
+        composite -geometry "+0+$ctgheight" {output.namefig} {output.reportfig} {output.reportfig}
+        composite -geometry "+790+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig}
+        """
+
 def Pan1c_view_data_inputs(wildcards):
     inputs = list()
 
-- 
GitLab

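Note: the re-added create_pan1c_report_fig rule measures the heights of the contig and Odgi figures with ImageMagick's identify, then composites them onto a white canvas. A hypothetical Pillow equivalent of the vertical stacking (paths and the simplified layout are illustrative, not the rule's exact output):

from PIL import Image

# Stack the contig figure on top of the Odgi 1D visualization on a white canvas,
# shifting the Odgi panel right by the legend width, as the composite calls do above.
def stack_report_fig(contig_png, odgi_png, out_png, canvas_width=3300, legend_width=790):
    contig = Image.open(contig_png)
    odgi = Image.open(odgi_png)
    canvas = Image.new("RGB", (canvas_width, contig.height + odgi.height), "white")
    canvas.paste(contig, (0, 0))
    canvas.paste(odgi, (legend_width, contig.height))
    canvas.save(out_png)

# stack_report_fig("chr1.contig.png", "chr1.odgi.png", "chr1.report.fig.png")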

From 54884d8b3601f0eb93ef8dc850cdc4e290d7de0b Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 19 Nov 2024 16:16:00 +0100
Subject: [PATCH 299/310] Fixed MC_graph memory request

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index a663e1b..70f62be 100644
--- a/Snakefile
+++ b/Snakefile
@@ -712,7 +712,7 @@ rule MC_graph:
         gfa_gz='data/chrGraphs/Pan1c.MC.'+config['name']+'.{chromosome}.gfa.gz'
     threads: 16
     resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000
+        mem_mb = lambda wildcards, threads: config["mem_multiplier"] * 32000
     benchmark:
         "statistics/MC.{chromosome}.txt"
     params:
-- 
GitLab

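Note: patch 299 stops multiplying the MC_graph memory request by the thread count; with 16 threads the old formula asked the job scheduler for about sixteen times more memory than intended. A quick check of the two lambdas (mem_multiplier assumed to be 1):

# Old formula: scaled with the 16 threads of MC_graph.
old_mem_mb = lambda wildcards, threads: threads * 1 * 32000
# New formula: fixed per-job request, as in the patch.
new_mem_mb = lambda wildcards, threads: 1 * 32000

print(old_mem_mb(None, 16))  # 512000 MB, roughly 512 GB
print(new_mem_mb(None, 16))  # 32000 MB, i.e. 32 GB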

From c214ca049fee6787e5ec6a75d94cb41fb9c3285c Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Fri, 29 Nov 2024 13:47:17 +0100
Subject: [PATCH 300/310] Reworked config.yaml

- Reordered parameters
- Added option to run PGGB
- PGGB can now be easily deactivated
---
 Snakefile                |  2 +-
 config.yaml              | 24 ++++++++++++------------
 example/config_CICD.yaml | 35 +++++++++++++++++------------------
 3 files changed, 30 insertions(+), 31 deletions(-)

diff --git a/Snakefile b/Snakefile
index a663e1b..2decccd 100644
--- a/Snakefile
+++ b/Snakefile
@@ -36,7 +36,7 @@ nHAP = len(SAMPLES)
 with gzip.open("data/haplotypes/"+config['reference'], "r") as handle:
     CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"]
 
-graph_tools = ["PGGB"] + (config["get_MC"] == "True")*["MC"] 
+graph_tools = (config["run_PGGB"] == "True")*["PGGB"] + (config["run_MC"] == "True")*["MC"] 
 
 # Adding optionnal output based on config.yaml, using the following function
 def which_analysis():
diff --git a/config.yaml b/config.yaml
index 9b53b68..53eeff8 100644
--- a/config.yaml
+++ b/config.yaml
@@ -1,39 +1,39 @@
-## Main parameters
+#% Main parameters
 # Pangenome name 
 name: '<pangenome name>'
 # Reference fasta (BGziped)
 reference: '<reference_name>'
 # Directory of apptainer images (downloaded with getApps.sh)
 app.path: '<path>'
-
-## Resources
 # Memory multiplier (increase when OOM). Formula : job_default_mem * n_retries * mem_multiplier * threads
 mem_multiplier: 1
 
-# Core parameters
-# RagTag parameters (see: https://github.com/malonge/RagTag/wiki/scaffold)
+#% RagTag parameters (see: https://github.com/malonge/RagTag/wiki/scaffold)
 ragtag_args: '-i 0.4'
 ragtag_mm2_conf: '-x asm5'
 ## Add -f 0.02 for large genomes
 
+#% PGGB parameters
+run_PGGB: 'True'
 # Wfmash alignement parameters :
 wfmash.segment_length: 10000
 wfmash.mapping_id: 95
 wfmash.secondary: '-k 19 -H 0.001'
-
 # Seqwish parameters
 seqwish.params: '-B 10000000 -k 19 -f 0'
 
-# Odgi 1D and path coverage viz parameters 
+#% MC parameters
+run_MC: 'False'
+# MC arguments passed to 'cactus-pangenome' 
+MC.params: '--clip 0 --filter 0'
+
+#% Odgi 1D and path coverage viz parameters 
 odgi.1Dviz.params: '-x 2000 -b'
 odgi.pcov.params: '-x 2000 -O'
 
-## Optional parts of the workflow
-# Running Quast to get statistics on input haplotypes
+#% Optional parts of the workflow
+# Running Quast to get more statistics on input haplotypes
 run_Quast: 'False'
-# Make Minigraph-Cactus graph using the same method (chromosome level)
-get_MC: 'False'
-MC.params: '--clip 0 --filter 0'
 # Getting figures showing chromosome decomposition into contigs
 get_contig_pos: 'True'
 # Computes Presence Absence Variant matrices for Panache (not recommended; very long)
diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml
index b10bc1b..af7baa0 100644
--- a/example/config_CICD.yaml
+++ b/example/config_CICD.yaml
@@ -5,44 +5,43 @@ name: '03SC_CICD'
 reference: 'R64.hap1.fa.gz'
 # Directory of apptainer images (downloaded with getApps.sh)
 app.path: 'appimgs/'
-
-## Resources
 # Memory multiplier (increase when OOM). Formula : job_default_mem * n_retries * mem_multiplier * threads
 mem_multiplier: 1
 
-# Core parameters
-# RagTag parameters (see: https://github.com/malonge/RagTag/wiki/scaffold)
+#% RagTag parameters (see: https://github.com/malonge/RagTag/wiki/scaffold)
 ragtag_args: '-i 0.4'
 ragtag_mm2_conf: '-x asm5'
-## Add -f 0.0002 for large genomes
+## Add -f 0.02 for large genomes
 
+#% PGGB parameters
+run_PGGB: 'True'
 # Wfmash alignement parameters :
 wfmash.segment_length: 5000
-wfmash.mapping_id: 90
-wfmash.secondary: '-k 19 -H 0.001 -X'
-
+wfmash.mapping_id: 95
+wfmash.secondary: '-k 19 -H 0.001'
 # Seqwish parameters
 seqwish.params: '-B 10000000 -k 19 -f 0'
 
-# Odgi 1D and path coverage viz parameters 
-odgi.1Dviz.params: '-x 2000 -a 25 -b'
-odgi.pcov.params: '-x 2000 -a 25 -O'
+#% MC parameters
+run_MC: 'False'
+# MC arguments passed to 'cactus-pangenome' 
+MC.params: '--clip 0 --filter 0'
 
-## Optional parts of the workflow
-# Running Quast to get statistics on input haplotypes
+#% Odgi 1D and path coverage viz parameters 
+odgi.1Dviz.params: '-x 2000 -b'
+odgi.pcov.params: '-x 2000 -O'
+
+#% Optional parts of the workflow
+# Running Quast to get more statistics on input haplotypes
 run_Quast: 'True'
-# Make Minigraph-Cactus graph using the same method (chromosome level)
-get_MC: 'True'
-MC.params: '--clip 0 --filter 0'
 # Getting figures showing chromosome decomposition into contigs
 get_contig_pos: 'True'
 # Computes Presence Absence Variant matrices for Panache (not recommended; very long)
 get_PAV: 'False'
 # Computes SyRI figures for haplotypes 
-#get_allASM_SyRI: 'False' # All vs all
 get_ASMs_SyRI: 'True' # Haplotype vs Reference
 get_chrInputs_SyRI: 'True' # SyRI on chrInputs
-# Producing VCF and its associated INS/DEL figure
+# Producing VCF from Pangenome graph
 get_VCF: 'True'
 # Creating Pan1c_View tarball
 Pan1c-View_jsons: 'True'
-- 
GitLab

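Note: the run_PGGB and run_MC switches introduced above feed the graph_tools line of the Snakefile, which relies on Python multiplying a list by a boolean (treated as 0 or 1). A quick illustration:

# config.yaml stores the switches as strings, hence the == "True" comparisons.
config = {"run_PGGB": "True", "run_MC": "False"}

graph_tools = (config["run_PGGB"] == "True") * ["PGGB"] + (config["run_MC"] == "True") * ["MC"]
print(graph_tools)  # ['PGGB'] -- a False comparison yields 0 * ["MC"], i.e. []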

From acb604d78a9462243a76a9c0a39da52ac78e27f6 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Fri, 29 Nov 2024 17:27:39 +0100
Subject: [PATCH 301/310] Adding some documentation to rules

---
 Snakefile   | 251 +++++++++++++++++++++++++++++++++++++++-------------
 config.yaml |   2 +
 2 files changed, 191 insertions(+), 62 deletions(-)

diff --git a/Snakefile b/Snakefile
index 8ad8f58..d8d1e3c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -102,7 +102,23 @@ Pre-processing section : preparing pggb inputs  --------------------------------
 """
 
 rule ragtag_scaffolding:
-    # Scaffold input haplotype against the reference to infer chromosome scale sequences
+    """
+    Scaffold a haplotype against the reference assembly set in the config file.
+
+    Input :
+        - Reference assembly (.fa.gz)
+        - Haplotype assembly (<haplotype>.fa.gz)
+        - Fasta index of the reference (.fa.gz.fai and .fa.gz.gzi)
+    Output :
+        - Scaffolded haplotype (<haplotype>.ragtagged.fa) [Temporary]
+        - Tarball with RagTag temporary files (<haplotype>.tar.gz)
+    Threads : 8
+    Memory : n_threads * mem_multiplier * 4Gb
+    Parameters :
+        - app.path
+        - ragtag_args
+        - ragtag_mm2_conf
+    """
     input:
         ref="data/haplotypes/"+config['reference'],
         reffai="data/haplotypes/"+config['reference']+".fai",
@@ -146,7 +162,20 @@ rule ragtag_scaffolding:
         """
 
 rule quast_stats:
-    # Run Quast on ragtagged genomes
+    """
+    Run Quast on raw input assemblies. [Optional]
+
+    Input :
+        - Reference assembly (.fa.gz)
+        - All haplotypes (.fa.gz)
+    Output :
+        - Quast HTML report
+    Threads : 16
+    Memory : n_threads * mem_multiplier * 4Gb
+    Parameters :
+        - app.path
+        - Pangenome name
+    """
     input:
         fas=expand("data/haplotypes/{haplotype}.fa.gz", haplotype=SAMPLES_NOREF),
         ref="data/haplotypes/"+config['reference']
@@ -189,6 +218,19 @@ rule quast_stats:
         """
 
 rule assemblathon_stats:
+    """
+    Run Assemblathon_stats on a haplotype.
+
+    Input :
+        - Haplotype (<haplotype>.fa.gz)
+        - Fasta index of the haplotype (<haplotype>.fa.gz.fai)
+    Output :
+        - Tab-delimited table with assemblathon stats (<haplotype>.stats.csv)
+    Threads : 1
+    Memory : n_threads * 16Gb
+    Parameters :
+        - app.path
+    """
     input:
         fa="data/haplotypes/{haplotype}.fa.gz",
         reffai="data/haplotypes/"+config['reference']+".fai"
@@ -216,7 +258,20 @@ rule assemblathon_stats:
         """
 
 rule contig_positions:
-    # Produce figures with contig positions
+    """
+    Unscaffold an assembly to get its contig delimitations. [Optional]
+
+    Input :
+        - Chromosome fasta (<chromosome>.fa.gz)
+        - Chromosome fasta index (<chromosome>.fa.gz.fai)
+    Output :
+        - Figure of contig decomposition (<chromosome>.contig.png)
+        - Temporary directory [Temporary]
+    Threads : 1
+    Memory : n_threads * mem_multiplier * 16Gb
+    Parameters :
+        - app.path
+    """
     input:
         fa="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz",
         fai="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz.fai"
@@ -268,7 +323,19 @@ rule contig_positions:
         """
 
 rule chromosome_clustering:
-    # Read ragtagged fastas and split chromosome sequences into according FASTA files
+    """
+    Read ragtagged fastas and split chromosome sequences into per-chromosome FASTA files.
+
+    Input :
+        - All ragtagged haplotype fastas (<haplotype>.ragtagged.fa.gz)
+    Output :
+        - All chromosome fastas (<chromosome>.fa) [Temporary]
+    Threads : 1
+    Memory : n_threads * mem_multiplier * 16Gb
+    Parameters :
+        - app.path
+        - Pangenome name
+    """
     input:
         expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
     output:
@@ -290,8 +357,21 @@ rule chromosome_clustering:
         """
 
 rule SyRI_on_ASM_mm2:
-    # Run SyRI on a single assembly. 
-    # The assembly is mapped on the 'reference' with Minimap2 and SyRI search for SV.
+    """
+    Run SyRI on a ragtagged haplotype assembly using Minimap2 alignments. [Optional]
+
+    Input :
+        - Reference assembly (.ragtagged.fa.gz)
+        - Haplotype assembly (<haplotype>.ragtagged.fa.gz)
+    Output :
+        - SyRI figure (<haplotype>.syri_mm2.png)
+        - SyRI VCF (<haplotype>.syri_mm2.vcf.gz)
+    Threads : 4
+    Memory : n_threads * mem_multiplier * 12Gb
+    Parameters :
+        - app.path
+        - Plotsr config file (src/plotsr-base.cfg)
+    """
     input:
         ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
         qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
@@ -337,8 +417,21 @@ rule SyRI_on_ASM_mm2:
         """
     
 rule SyRI_on_ASM_wfm:
-    # Run SyRI on a single assembly. 
-    # The assembly is mapped on the 'reference' with Wfmash and SyRI search for SV.
+    """
+    Run SyRI on a ragtagged haplotype assembly using WFmash alignments. [Optional] [WIP, broken]
+
+    Input :
+        - Reference assembly (.ragtagged.fa.gz)
+        - Haplotype assembly (<haplotype>.ragtagged.fa.gz)
+    Output :
+        - SyRI figure (<haplotype>.syri_wfm.png)
+        - SyRI VCF (<haplotype>.syri_wfm.vcf.gz)
+    Threads : 4
+    Memory : n_threads * mem_multiplier * 12Gb
+    Parameters :
+        - app.path
+        - Plotsr config file (src/plotsr-base.cfg)
+    """
     input:
         ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
         qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
@@ -386,7 +479,76 @@ rule SyRI_on_ASM_wfm:
         rm -r $dir
         """
 
+rule SyRI_on_chrInput:
+    """
+    Run SyRI on a chromosome fasta using Minimap2 alignments. [Optional]
+
+    Input :
+        - Chromosome fasta (<chromosome>.fa.gz)
+    Output :
+        - SyRI figure (<chromosome>.syri_mm2.png)
+    Threads : 8
+    Memory : n_threads * mem_multiplier * 12Gb
+    Parameters :
+        - app.path
+        - Plotsr config file (src/plotsr-base.cfg)
+    """
+    input:
+        fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz'
+    output:
+        fig="output/chrInput.syri.figs/Pan1c."+config['name']+".{chromosome}.syri_mm2.png"
+    threads: 8
+    resources:
+        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000
+    benchmark:
+        "statistics/syri_chr_mm2.{chromosome}.txt"
+    params:
+        app_path=config["app.path"],
+        ref=config['reference'],
+        wrk_dir="data/chrInput.syri",
+        plotsr_cfg="src/plotsr-base.cfg"
+    shell:
+        """
+        dir="{params.wrk_dir}/{wildcards.chromosome}"
+
+        mkdir -p $dir
+        refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1,2)
+
+        ## Creating single fasta from multifasta
+        zcat {input.fasta} | awk -F"#" -v DIR=$dir \
+            '/^>/ {{OUT= DIR "/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}'
+
+        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
+            -@ {threads} $dir/*.fa
+
+        ## Getting the list of sequences
+        AllAsmList=()
+        for file in $dir/*.fa.gz; do
+            asm="$(basename $file .fa.gz | cut -f1,2 -d"#" | sed 's/#/\\.hap/').fa.gz"
+            mv $file "$(dirname $file)/$asm"
+            AllAsmList+=("$(dirname $file)/$asm")
+        done
+        
+        #echo "The ASM Array : ${{AllAsmList[@]}}"
+
+        bash scripts/Syri.figs_mm2.sh \
+            -a {params.app_path} \
+            -t {threads} \
+            -d $dir \
+            -o $(basename {output.fig}) \
+            -r "${{dir}}/${{refname}}.fa.gz" \
+            -q "${{AllAsmList[*]}}" \
+            -c {params.plotsr_cfg} \
+            -h 10 -w 20 -s "0.9" -f 10
+
+        mv $dir/$(basename {output.fig}) {output.fig}
+        rm -r $dir
+        """
+
 def asm_json_inputs(wildcards):
+    """
+    Creates the input list for the asm_json rule.
+    """
     sections = dict()
 
     sections["csv"] = expand("data/haplotypes/{haplotype}.stats.csv", haplotype=SAMPLES)
@@ -413,7 +575,25 @@ def asm_json_inputs(wildcards):
     return sections
 
 rule asm_json:
-    # Produce the Assembly JSON for Pan1c QC
+    """
+    Produce the assembly JSON for Pan1c-View. [Optional]
+
+    Input :
+        - Tab-delimited table with assemblathon stats (<haplotype>.stats.csv)
+        - All chromosome fasta indexes (.fa.gz.fai)
+        - All figures of contig decomposition (.contig.png) [Optional]
+        - SyRI figure with MM2 (<haplotype>.syri_mm2.png) [Optional]
+
+
+    Output :
+        - Assembly JSON for Pan1c-View
+          (output/report_data/Pan1c.<pangenome_name>.assembly.json)
+    Threads : 4
+    Memory : n_threads * mem_multiplier * 12Gb
+    Parameters :
+        - app.path
+        - Plotsr config file (src/plotsr-base.cfg)
+    """
     input:
         unpack(asm_json_inputs)
     output:
@@ -450,59 +630,6 @@ rule asm_json:
 Core section : Running PGGB
 """
 
-rule SyRI_on_chrInput:
-    input:
-        fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz'
-    output:
-        fig="output/chrInput.syri.figs/Pan1c."+config['name']+".{chromosome}.syri_mm2.png"
-    threads: 8
-    resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000
-    benchmark:
-        "statistics/syri_chr_mm2.{chromosome}.txt"
-    params:
-        app_path=config["app.path"],
-        ref=config['reference'],
-        wrk_dir="data/chrInput.syri",
-        plotsr_cfg="src/plotsr-base.cfg"
-    shell:
-        """
-        dir="{params.wrk_dir}/{wildcards.chromosome}"
-
-        mkdir -p $dir
-        refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1,2)
-
-        ## Creating single fasta from multifasta
-        zcat {input.fasta} | awk -F"#" -v DIR=$dir \
-            '/^>/ {{OUT= DIR "/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}'
-
-        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
-            -@ {threads} $dir/*.fa
-
-        ## Getting the list of sequences
-        AllAsmList=()
-        for file in $dir/*.fa.gz; do
-            asm="$(basename $file .fa.gz | cut -f1,2 -d"#" | sed 's/#/\\.hap/').fa.gz"
-            mv $file "$(dirname $file)/$asm"
-            AllAsmList+=("$(dirname $file)/$asm")
-        done
-        
-        #echo "The ASM Array : ${{AllAsmList[@]}}"
-
-        bash scripts/Syri.figs_mm2.sh \
-            -a {params.app_path} \
-            -t {threads} \
-            -d $dir \
-            -o $(basename {output.fig}) \
-            -r "${{dir}}/${{refname}}.fa.gz" \
-            -q "${{AllAsmList[*]}}" \
-            -c {params.plotsr_cfg} \
-            -h 10 -w 20 -s "0.9" -f 10
-
-        mv $dir/$(basename {output.fig}) {output.fig}
-        rm -r $dir
-        """
-
 rule wfmash_on_chr:
     # Run wfmash on a specific chromosome input
     input:
diff --git a/config.yaml b/config.yaml
index 53eeff8..d9de5cd 100644
--- a/config.yaml
+++ b/config.yaml
@@ -9,7 +9,9 @@ app.path: '<path>'
 mem_multiplier: 1
 
 #% RagTag parameters (see: https://github.com/malonge/RagTag/wiki/scaffold)
+# Main RagTag parameters
 ragtag_args: '-i 0.4'
+# Minimap2 parameters piped through RagTag
 ragtag_mm2_conf: '-x asm5'
 ## Add -f 0.02 for large genomes
 
-- 
GitLab

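Note: the relocated SyRI_on_chrInput rule splits a PanSN-named chromosome multifasta (>sample#haplotype#chromosome headers) into one fasta per haplotype with an awk one-liner before calling the SyRI helper script. A hypothetical Python re-implementation of that splitting step (function name and output naming simplified):

import gzip
import os

# Write each record of a PanSN-named multifasta to <sample>.hap<haplotype>.fa in out_dir.
def split_chromosome_multifasta(multifasta_gz, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    out = None
    with gzip.open(multifasta_gz, "rt") as handle:
        for line in handle:
            if line.startswith(">"):
                sample, haplotype = line[1:].strip().split("#")[:2]
                if out:
                    out.close()
                out = open(os.path.join(out_dir, f"{sample}.hap{haplotype}.fa"), "w")
            out.write(line)
    if out:
        out.close()

# split_chromosome_multifasta("data/chrInputs/demo.chr1.fa.gz", "data/chrInput.syri/chr1")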

From 65b91c09e3ba5cf8cc6d1a45c3fce89ff413a024 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 29 Nov 2024 18:02:57 +0100
Subject: [PATCH 302/310] Continuing the documentation

---
 Snakefile | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 74 insertions(+), 4 deletions(-)

diff --git a/Snakefile b/Snakefile
index d8d1e3c..fb48ce0 100644
--- a/Snakefile
+++ b/Snakefile
@@ -631,7 +631,25 @@ Core section : Running PGGB
 """
 
 rule wfmash_on_chr:
-    # Run wfmash on a specific chromosome input
+    """
+    Run WFmash on a chromosome fasta (First PGGB step).
+
+    Input :
+        - Chromosome fasta (<chromosome>.fa.gz)
+        - Chromosome fasta index (<chromosome>.fa.gz.fai)
+    Output :
+        - Mapping (<chromosome>.wfmash.mapping.paf) [Temporary]
+        - Compressed mapping (<chromosome>.wfmash.mapping.paf.gz)
+        - Alignment (<chromosome>.wfmash.aln.paf) [Temporary]
+        - Compressed alignment (<chromosome>.wfmash.aln.paf.gz)
+    Threads : 16
+    Memory : n_threads * mem_multiplier * 2Gb
+    Parameters :
+        - app.path
+        - wfmash.segment_length
+        - wfmash.mapping_id
+        - wfmash.secondary
+    """
     input:
         fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz',
         fai='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai'
@@ -689,7 +707,20 @@ rule wfmash_on_chr:
         """
 
 rule seqwish:
-    # Run seqwish on alignement produced by wfmash
+    """
+    Run seqwish on the alignment file produced by WFmash for a given chromosome.
+
+    Input :
+        - Chromosome fasta (<chromosome>.fa.gz)
+        - Alignment file (<chromosome>.wfmash.aln.paf.gz)
+    Output :
+        - Seqwish graph (<chromosome>.seqwish.gfa.gz) [GFAv1.0]
+    Threads : 8
+    Memory : n_threads * mem_multiplier * 4Gb
+    Parameters :
+        - app.path
+        - seqwish.params
+    """
     input:
         fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz',
         aln=rules.wfmash_on_chr.output.aln_gz
@@ -724,7 +755,19 @@ rule seqwish:
         """
 
 rule gfaffix_on_chr:
-    # Run gfaffix on seqwish graph
+    """
+    Run GFAffix on the Seqwish graph for a given chromosome.
+
+    Input :
+        - Seqwish graph (<chromosome>.seqwish.gfa.gz) [GFAv1.0]
+    Output :
+        - GFAffixed graph (<chromosome>.seqwish.gfaffixD.gfa.gz) [GFAv1.0]
+        - Transform logs (<chromosome>.seqwish.gfaffixD.transform.txt)
+    Threads : 1
+    Memory : n_threads * mem_multiplier * 24Gb
+    Parameters :
+        - app.path
+    """
     input:
         rules.seqwish.output.gfa_gz
     output:
@@ -760,7 +803,19 @@ rule gfaffix_on_chr:
         """
 
 rule odgi_postprocessing:
-    # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph
+    """
+    ODGI postprocessing on the GFAffixed graph for a given chromosome.
+
+    Input :
+        - Graph metadata (Pan1c.<pangenome_name>.gfa.metadata)
+        - GFAffixed graph (<chromosome>.seqwish.gfaffixD.gfa.gz)
+    Output :
+        - Chromosome graph (<chromosome>.gfa.gz) [GFAv1.0]
+    Threads : 8
+    Memory : n_threads * mem_multiplier * 4Gb
+    Parameters :
+        - app.path
+    """
     input:
         tags="output/Pan1c."+config['name']+".gfa.metadata",
         gfa_gz=rules.gfaffix_on_chr.output.gfa_gz
@@ -832,6 +887,21 @@ rule odgi_postprocessing:
         """
 
 rule MC_graph:
+    """
+    Run the Cactus-Pangenome workflow on a given chromosome fasta.
+
+    Input :
+        - Graph metadata (Pan1c.<pangenome_name>.gfa.metadata)
+        - Chromosome fasta (<chromosome>.fa.gz)
+    Output :
+        - Minigraph-Cactus graph (<chromosome>.gfa.gz) [GFAv1.0 (converted from GFAv1.1 with GFAvc)]
+    Threads : 16
+    Memory : mem_multiplier * 32Gb
+    Parameters :
+        - app.path
+        - reference
+        - MC.params
+    """
     input:
         tags="output/Pan1c."+config['name']+".gfa.metadata",
         fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz'
-- 
GitLab
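
A note on the `Memory` lines used in these docstrings: they mirror the `resources` lambdas already present in the Snakefile, where "n_threads * mem_multiplier * 2Gb" corresponds to `mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000`. A minimal Python sketch of the convention, with illustrative values:

```python
# Sketch of the memory convention used in the rule docstrings:
# "Memory : n_threads * mem_multiplier * XGb" maps to a Snakemake resources lambda
# of the form `threads * config["mem_multiplier"] * X000` (value in megabytes).
def mem_mb(threads: int, mem_multiplier: int, gb_per_thread: int) -> int:
    """Memory requested for a rule, in megabytes."""
    return threads * mem_multiplier * gb_per_thread * 1000

# Example: wfmash_on_chr with 16 threads, mem_multiplier = 1 and 2 Gb per thread.
print(mem_mb(16, 1, 2))  # 32000
```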


From dc144d3291ff7d41e8f6412ae122bb5cea05a128 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 3 Dec 2024 10:35:08 +0100
Subject: [PATCH 303/310] Continuing to add documentation

- Removing unused rules (PAV, core statistics, ...)
---
 Snakefile | 183 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 117 insertions(+), 66 deletions(-)

diff --git a/Snakefile b/Snakefile
index fb48ce0..e1a61d2 100644
--- a/Snakefile
+++ b/Snakefile
@@ -43,7 +43,7 @@ def which_analysis():
     
     ## Default analysis
     analysis_inputs = [     
-        expand("output/stats/Pan1c.{gtool}."+config['name']+".core.stats.tsv", gtool=graph_tools), # core stats
+        #expand("output/stats/Pan1c.{gtool}."+config['name']+".core.stats.tsv", gtool=graph_tools), # core stats
         expand("output/panacus.reports/Pan1c.{gtool}."+config['name']+".{chromosome}.histgrowth.html", chromosome=CHRLIST, gtool=graph_tools), # panacus histgrowth 
         expand("output/chrGraphs.figs/Pan1c.{gtool}."+config['name']+".{chromosome}.1Dviz.png", chromosome=CHRLIST, gtool=graph_tools), # visualizations from odgi on chromosome graphs
         expand("output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.general.stats.tsv", gtool=graph_tools) # chromosomes graph statistics
@@ -966,14 +966,23 @@ rule MC_graph:
         """
 
 rule generate_graph_list:
-    # Generate a text file containing all created graphs
+    """
+    Generate the list of chromosome graphs built by a given tool. Used to trigger the construction of all graphs.
+
+    Input :
+        - Chromosome graphs (<gtool>.<chromosome>.tmp.gfa) [GFAv1.0]
+    Output :
+        - List of graphs (graphsList.<gtool>.txt)
+    Threads : 1
+    Memory : 4Gb
+    """
     input:
         gfas=expand('data/chrGraphs/Pan1c.{{gtool}}.'+config['name']+'.{chromosome}.tmp.gfa', chromosome=CHRLIST)
     output:
         temp("data/chrGraphs/graphsList.{gtool}.txt")
     threads: 1
     resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000
+        mem_mb = lambda wildcards, threads: 4000
     priority: 100
     run:
         with open(output[0], "w") as handle:
@@ -981,7 +990,21 @@ rule generate_graph_list:
                 handle.write(file+"\n")
 
 rule graph_squeeze:
-    # Using odgi to merge every subgraphs into a final one
+    """
+    Concatenate chromosome graphs into a final graph using the odgi squeeze command.
+    It is only run for a single tool (PGGB or MC) at a time.
+
+    Input :
+        - List of graphs (graphsList.<gtool>.txt)
+        - Graph metadata (Pan1c.<pangenome_name>.gfa.metadata)
+        - Chromosome graphs (Pan1c.<gtool>.<pangenome_name>.<chromosome>.tmp.gfa) [GFAv1.0]
+    Output :
+        - Final graph (<gtool>.<pangenome_name>.gfa.gz) [GFAv1.0]
+    Threads : 16
+    Memory : n_threads * mem_multiplier * 2Gb
+    Parameters :
+        - app.path
+    """
     input:
         glist="data/chrGraphs/graphsList.{gtool}.txt",
         tags="output/Pan1c."+config['name']+".gfa.metadata",
@@ -1022,7 +1045,22 @@ rule graph_squeeze:
         """
 
 rule graph_stats:
-    # Using GFAstats to produce stats on every chromosome graphs
+    """
+    Compute statistics on a chromosome graph using a custom script.
+    The script can be found [here](https://forgemia.inra.fr/alexis.mergez/pangetools/-/blob/main/GFAstats.py?ref_type=heads). 
+    See the list of metrics [here](https://forgemia.inra.fr/alexis.mergez/pangetools/-/wikis/GFAstats---Documentation).
+
+    Input :
+        - A single chromosome graph (<gtool>.<pangenome_name>.<chromosome>.gfa.gz) [GFAv1.0]
+    Output :
+        - General statistics (<chromosome>.general.stats.tsv)
+        - Path statistics (<chromosome>.path.stats.tsv)
+    Threads : 4
+    Memory : n_threads * mem_multiplier * 8Gb
+    Parameters :
+        - app.path
+        - name
+    """
     input:
         graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.gfa.gz'
     output:
@@ -1045,7 +1083,21 @@ rule graph_stats:
         """
 
 rule graph_figs:
-    # Creating figures using odgi viz 
+    """
+    Create 1D viz of a chromosome graph using Odgi viz.
+
+    Input :
+        - Chromosome graph (<gtool>.<pangenome_name>.<chromosome>.tmp.gfa)
+    Output :
+        - 1D viz (<gtool>.<pangenome_name>.<chromosome>.1Dviz.png)
+        - 1D path coverage (<gtool>.<pangenome_name>.<chromosome>.pcov.png)
+    Threads : 4
+    Memory : n_threads * mem_multiplier * 4Gb
+    Parameters :
+        - app.path
+        - odgi.1Dviz.params
+        - odgi.pcov.params
+    """
     input:
         graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.tmp.gfa'
     output:
@@ -1071,7 +1123,21 @@ rule graph_figs:
         """
 
 rule aggregate_graphs_stats:
-    # Reading and merging all stats files from chromosome graphs into a .tsv.
+    """
+    Aggregate graph statistics produced for each chromosome graph, given a graph tool.
+
+    Input :
+        - Chromosome graphs general statistics (<gtool>.<pangenome_name>.<chromosome>.general.stats.tsv)
+        - Chromosome graphs paths statistics (<gtool>.<pangenome_name>.<chromosome>.path.stats.tsv)
+    Output :
+        - General statistics (<gtool>.<pangenome_name>.chrGraph.general.stats.tsv)
+        - Paths statistics (<gtool>.<pangenome_name>.chrGraph.path.stats.tsv)
+    Threads : 1
+    Memory : n_threads * mem_multiplier * 2Gb
+    Parameters :
+        - app.path
+        - name
+    """
     input:
         genstats=expand("output/stats/chrGraphs.{{gtool}}/Pan1c.{{gtool}}."+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST)
     output:
@@ -1093,7 +1159,20 @@ rule aggregate_graphs_stats:
         """
 
 rule get_graph_tags:
-    # Add metadata to the final GFA
+    """
+    Generate the graph metadata file using tool versions, passed arguments, etc.
+
+    Input :
+        - config.yaml
+    Output :
+        - Graph metadata (Pan1c.<pangenome_name>.gfa.metadata)
+        - Tags JSON for Pan1c-View (summary section)
+    Threads : 1
+    Memory : n_threads * mem_multiplier * 8Gb
+    Parameters :
+        - app.path
+        - name
+    """
     input:
         "config.yaml"
     output:
@@ -1112,46 +1191,22 @@ rule get_graph_tags:
             --appdir {params.app_path} --config-file config.yaml --json {output.json} > {output.md}
         """
 
-rule pggb_input_stats:
-    # Produces statistics on pggb input sequences
-    input:
-        flag="output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.general.stats.tsv"
-    output:
-        "output/stats/Pan1c.{gtool}."+config['name']+".chrInput.stats.tsv"
-    threads: 1
-    resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000
-    params:
-        app_path=config['app.path'],
-        pan_name=config['name']
-    shell:
-        """
-        apptainer run {params.app_path}/pan1c-env.sif python scripts/chrInput.stats_compute.py \
-            -f data/chrInputs/*.fa.gz -o {output} -p {params.pan_name}
-        """
-
-rule core_statistics:
-    # Aggregate chrInput, chrGraph and pggb statistics into a single tsv 
-    input:
-        chrInputStats = "output/stats/Pan1c.{gtool}."+config['name']+".chrInput.stats.tsv",
-        chrGraphStats = "output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.general.stats.tsv"
-    output:
-        tsv = "output/stats/Pan1c.{gtool}."+config['name']+".core.stats.tsv"
-    threads: 1
-    resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
-    params:
-        app_path=config['app.path'],
-        pan_name=config['name']
-    shell:
-        """
-        apptainer run {params.app_path}/pan1c-env.sif python scripts/core.stats_compute.py \
-            --pggbStats logs/pggb --chrInputStats {input.chrInputStats} \
-            --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f /dev/null -p {params.pan_name}
-        """
-
 rule graph_json:
-    # Produce the Graph JSON for Pan1c QC
+    """
+    Generate graph JSON for Pan1c-View graph section. [Optional]
+
+    Input :
+        - Aggregated general statistics for all graph tools (*.chrGraph.general.stats.tsv)
+        - Aggregated path statistics for all graph tools (*.chrGraph.path.stats.tsv)
+        - Contig-Odgi figures (*.report.fig.png) 
+    Output :
+        - Graph JSON (Pan1c.<pangenome_name>.graph.json)
+    Threads : 1
+    Memory : n_threads * mem_multiplier * 16Gb
+    Parameters :
+        - app.path
+        - name
+    """
     input:
         genstats = expand("output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.general.stats.tsv", gtool=graph_tools),
         pathstats = expand("output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.path.stats.tsv", gtool=graph_tools),
@@ -1179,27 +1234,23 @@ rule graph_json:
 """
 Post-processing section
 """
-rule get_pav:
-    # Create PAV matrix readable by panache for a given chromosome scale graph
-    input:
-        "data/chrGraphs/graphsList.{gtool}.txt"
-    output:
-        directory("output/pav.matrices")
-    threads: 16
-    resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000
-    params:
-        app_path=config['app.path']
-    run:
-        shell("mkdir {output}")
-        # Getting the list of graphs
-        with open(input[0]) as handle:
-            graphList = [graph.rstrip("\n") for graph in handle.readlines()]
-        # Iterating over graphs
-        for graph in graphList:
-            shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.app_path} -t {threads}")
 
 rule panacus_stats:
+    """
+    Generate the Panacus histgrowth report for a chromosome graph.
+
+    Input :
+        - Chromosome graph (<gtool>.<pangenome_name>.<chromosome>.tmp.gfa)
+    Output :
+        - Panacus histgrowth report (<gtool>.<pangenome_name>.<chromosome>.histgrowth.html)
+    Threads : 1
+    Memory : n_threads * mem_multiplier * 16Gb
+    Parameters :
+        - app.path
+        - name
+    """
     # Produces panacus reports for a chromosome graph
     input:
         graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.tmp.gfa'
-- 
GitLab


From f34b5a8a32b91c8fc84a509a518395af850311fd Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 3 Dec 2024 11:24:16 +0100
Subject: [PATCH 304/310] Documented every rule

---
 Snakefile | 149 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 90 insertions(+), 59 deletions(-)

diff --git a/Snakefile b/Snakefile
index e1a61d2..9f064ad 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1251,7 +1251,6 @@ rule panacus_stats:
         - app.path
         - name
     """
-    # Produces panacus reports for a chromosome graph
     input:
         graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.tmp.gfa'
     output:
@@ -1282,7 +1281,19 @@ rule panacus_stats:
         """
 
 rule vg_deconstruct:
-    # Produce a VCF based on the "reference" haplotype
+    """
+    Use VG deconstruct to call variants from the graph. The reference haplotype set for the workflow is used as reference. [Optional]
+
+    Input :
+        - Final graph in XG format (Pan1c.<gtool>.<pangenome_name>.xg)
+    Output :
+        - VCF (Pan1c.<gtool>.<pangenome_name>.vcf) [Temporary]
+    Threads : 8
+    Memory : n_threads * mem_multiplier * 32Gb
+    Parameters :
+        - app.path
+        - reference
+    """
     input:
         graph="output/Pan1c.{gtool}."+config['name']+".xg",
     output:
@@ -1310,61 +1321,17 @@ rule vg_deconstruct:
                 2> >(tee {log.cmd} >&2)
         """
 
-rule vcf_fig:
-    # Produce a figure describing INS/DEL length distribution from vg deconstruct and SyRI
-    input:
-        vg="output/Pan1c.{gtool}."+config['name']+".vcf.gz",
-        syris_mm2=expand("data/asm.syri.mm2/Pan1c."+config['name']+".{haplotype}.syri.mm2.vcf.gz", haplotype=SAMPLES_NOREF)
-    output:
-        vcf_fig=directory("output/{gtool}.vcf.figs")
-    threads: 1
-    resources:
-        mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 20000
-    params:
-        app_path=config['app.path'],
-        pan_name=config['name'],
-        refname=config['reference']
-    shell:
-        """
-        mkdir -p {output.vcf_fig}
-
-        RHAP=$(basename {params.refname} .fa.gz | cut -f1 -d'.')
-        RHAPN=$(basename {params.refname} .fa.gz | cut -f2 -d'.' | cut -f2 -d'p')
-        FOLDER=$(dirname {input.syris_mm2[0]})
-
-        #% SyRI VCF MM2
-        ## Going through all folders
-        for vcf in $FOLDER/*.vcf.gz; do
-            THAP=$(basename $vcf .syri.vcf.gz | cut -f2 -d'.')
-            THAPN=$(basename $vcf .syri.vcf.gz | cut -f3 -d'.' | cut -f2 -d'p')
-
-            # Producing intermediate TSVs
-            zcat $vcf | \
-                awk -v THAP=$THAP -v THAPN=$THAPN -v RHAP=$RHAP -v RHAPN=$RHAPN -f scripts/vcf_2_tsv_syri.awk \
-                > $FOLDER/$(basename $vcf .gz).tsv
-        done
-
-        ## Merging TSVs
-        head -n1 $FOLDER/$(basename $vcf .gz).tsv > {output.vcf_fig}/syri.tsv
-        tail -n +2  -q $FOLDER/*.vcf.tsv >> {output.vcf_fig}/syri.tsv
-
-        rm $FOLDER/*.tsv
-
-        #% VG VCF
-        ## Producing TSV for the figures
-        zcat {input.vg} | awk -f scripts/vcf_2_tsv_vg.awk > {output.vcf_fig}/vg.tsv
-
-        #% Running R to get the figures
-        apptainer run {params.app_path}/pan1c-env.sif python scripts/VCF.stats_figs.py \
-            --vg {output.vcf_fig}/vg.tsv \
-            --syri {output.vcf_fig}/syri.tsv \
-            --output_dir {output.vcf_fig} \
-            --panname {params.pan_name}
-
-        #rm {output.vcf_fig}/*.tsv
-        """
-
 rule vg_vcf_2_tsv:
+    """
+    Convert a VCF from VG deconstruct into a TSV. [Optional]
+
+    Input :
+        - VCF from VG deconstruct (Pan1c.<gtool>.<pangenome_name>.vcf.gz)
+    Output :
+        - TSV (vg_<gtool>.tsv) [Temporary]
+    Threads : 1
+    Memory : n_threads * mem_multiplier * 8Gb
+    """
     input:
         "output/Pan1c.{gtool}."+config['name']+".vcf.gz"
     output:
@@ -1378,6 +1345,16 @@ rule vg_vcf_2_tsv:
         """
 
 rule syri_vcf_2_tsv:
+    """
+    Convert all VCFs from SyRI into a single TSV. [Optional]
+
+    Input :
+        - VCFs from SyRI (<haplotypes>.syri.mm2.vcf.gz)
+    Output :
+        - TSV (syri_mm2.tsv) [Temporary]
+    Threads : 1
+    Memory : n_threads * mem_multiplier * 8Gb
+    """
     input:
         expand("data/asm.syri.mm2/Pan1c."+config['name']+".{haplotype}.syri.mm2.vcf.gz", haplotype=SAMPLES_NOREF)
     output:
@@ -1415,7 +1392,9 @@ rule syri_vcf_2_tsv:
         """
 
 def var_json_inputs(wildcards):
-    # Inputs for var_json rule
+    """
+    Create the inputs for the var_json rule.
+    """
     inputs = {}
     inputs["vg"] = expand("tmp/var_json/vg_{gtool}.tsv", gtool=graph_tools)
     inputs["syri_mm2"] = "tmp/var_json/syri_mm2.tsv"
@@ -1423,7 +1402,20 @@ def var_json_inputs(wildcards):
     return inputs
 
 rule var_json:
-    # Produce the Assembly JSON for Pan1c QC
+    """
+    Produce the variant JSON for Pan1c-View (Variant section) [Optional]
+
+    Input :
+        - TSVs from VG deconstruct VCF (vg_<gtool>.tsv)
+        - TSV from SyRI with minimap2 (syri_mm2.tsv)
+    Output :
+        - Variant JSON (Pan1c.<pangenome_name>.var.json)
+    Threads : 1
+    Memory : n_threads * mem_multiplier * 48Gb
+    Parameters :
+        - app.path
+        - reference
+    """
     input:
         unpack(var_json_inputs)
     output:
@@ -1444,7 +1436,21 @@ rule var_json:
         """
 
 rule create_pan1c_report_fig:
-    # Produces a markdown report figure of chromosomes graphs
+    """
+    Create a figure combining the contig composition of the assemblies with the odgi 1D visualization, for a chromosome graph. [Optional]
+
+    Input :
+        - Chromosome graph (.<chromosome>.tmp.gfa)
+        - Contig decomposition figures (.<chromosome>.contig.png)
+    Output :
+        - Odgi 1D figure (.<gtool>.<chromosome>.odgi.png) [Temporary]
+        - Name figure (.<gtool>.<chromosome>.name.png)
+        - Contig-Odgi figures (<gtool>.<pangenome_name>.<chromosome>.report.fig.png)
+    Threads : 4
+    Memory : n_threads * mem_multiplier * 2Gb
+    Parameters :
+        - app.path
+    """
     input:
         graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.tmp.gfa',
         contigfig="output/chr.contig/Pan1c."+config['name']+".{chromosome}.contig.png",
@@ -1481,6 +1487,10 @@ rule create_pan1c_report_fig:
         """
 
 def Pan1c_view_data_inputs(wildcards):
+    """
+    Function to gather all files to include in the Pan1c-View tarball.
+    Inputs depend on the optional parts of the workflow.
+    """
     inputs = list()
 
     # Adding JSONs
@@ -1533,6 +1543,27 @@ def Pan1c_view_data_inputs(wildcards):
     return inputs
 
 rule Pan1c_View_data:
+    """
+    Generate a final Tarball for Pan1c-View. [Optional]
+
+    Input :
+        - Tags JSON (.tags.json)
+        - Assembly JSON (.assembly.json)
+        - Graph JSON (.graph.json)
+        - Variant JSON (.var.json)
+        - Documentation JSON (doc/Pan1c.documentation.json)
+        - Final graphs (*.gfa.gz) [GFAv1.0]
+        - Contig-Odgi figures (<gtool>.<pangenome_name>.<chromosome>.report.fig.png)
+        - Contig decomposition figures (.<chromosome>.contig.png) [Optional]
+        - SyRI figure on whole haplotypes, with minimap2 (.<haplotype>.syri_mm2.png) [Optional]
+        - SyRI figure on chromosomes, with minimap2 (<chromosome>.syri_mm2.png) [Optional]
+        - VCF(s) from VG deconstruct (*.vcf.gz) [Optional]
+        - XG indexed graphs (*.xg) [Optional]
+    Output :
+        - Tarball (<pangenome_name>.Pan1c_View.tar.gz)
+    Threads : 1
+    Memory : 8Gb
+    """
     input:
         Pan1c_view_data_inputs
     output:
-- 
GitLab
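
On the two input functions documented in this patch (`var_json_inputs`, `Pan1c_view_data_inputs`): both assemble their file lists from the config so that optional products are only requested when the corresponding option is enabled. A simplified sketch of that idea, using config keys that appear in this series; the paths are abbreviated placeholders, not the exact workflow targets:

```python
# Simplified sketch of the conditional-input pattern behind var_json_inputs()
# and Pan1c_view_data_inputs(). Paths are shortened placeholders for illustration.
def pan1c_view_inputs(config: dict) -> list:
    inputs = [
        "output/Pan1c.<pangenome_name>.tags.json",   # Tags JSON
        "output/Pan1c.<pangenome_name>.graph.json",  # Graph JSON
    ]
    if config.get("get_ASMs_SyRI") == "True":
        inputs.append("output/asm.syri.figs")        # SyRI figures, optional
    if config.get("get_chrInputs_SyRI") == "True":
        inputs.append("output/chrInput.syri.figs")   # SyRI on chrInputs, optional
    return inputs

print(pan1c_view_inputs({"get_ASMs_SyRI": "True", "get_chrInputs_SyRI": "False"}))
```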


From 6ee34789c39752b9997a8fbb04e2a02377df7ee4 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 3 Dec 2024 16:07:01 +0100
Subject: [PATCH 305/310] Removed PAV generation rule

---
 Snakefile                | 4 ----
 config.yaml              | 2 --
 example/config_CICD.yaml | 2 --
 3 files changed, 8 deletions(-)

diff --git a/Snakefile b/Snakefile
index 9f064ad..0a8a02f 100644
--- a/Snakefile
+++ b/Snakefile
@@ -43,16 +43,12 @@ def which_analysis():
     
     ## Default analysis
     analysis_inputs = [     
-        #expand("output/stats/Pan1c.{gtool}."+config['name']+".core.stats.tsv", gtool=graph_tools), # core stats
         expand("output/panacus.reports/Pan1c.{gtool}."+config['name']+".{chromosome}.histgrowth.html", chromosome=CHRLIST, gtool=graph_tools), # panacus histgrowth 
         expand("output/chrGraphs.figs/Pan1c.{gtool}."+config['name']+".{chromosome}.1Dviz.png", chromosome=CHRLIST, gtool=graph_tools), # visualizations from odgi on chromosome graphs
         expand("output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.general.stats.tsv", gtool=graph_tools) # chromosomes graph statistics
     ]
     
     ## Optionals analysis steps
-    if config["get_PAV"] == "True": # Adding PAV matrix creation
-        analysis_inputs.append("output/pav.matrices")
-
     if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each input assembly 
         analysis_inputs.append(
             expand("output/asm.syri.figs/Pan1c."+config['name']+".{haplotype}.syri_{tool}.png", haplotype=SAMPLES_NOREF, tool=["mm2"])
diff --git a/config.yaml b/config.yaml
index d9de5cd..ebca56a 100644
--- a/config.yaml
+++ b/config.yaml
@@ -38,8 +38,6 @@ odgi.pcov.params: '-x 2000 -O'
 run_Quast: 'False'
 # Getting figures showing chromosome decomposition into contigs
 get_contig_pos: 'True'
-# Computes Presence Absence Variant matrices for Panache (not recommended; very long)
-get_PAV: 'False'
 # Computes SyRI figures for haplotypes 
 get_ASMs_SyRI: 'False' # Haplotype vs Reference
 get_chrInputs_SyRI: 'False' # SyRI on chrInputs
diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml
index af7baa0..4ba75da 100644
--- a/example/config_CICD.yaml
+++ b/example/config_CICD.yaml
@@ -36,8 +36,6 @@ odgi.pcov.params: '-x 2000 -O'
 run_Quast: 'True'
 # Getting figures showing chromosome decomposition into contigs
 get_contig_pos: 'True'
-# Computes Presence Absence Variant matrices for Panache (not recommended; very long)
-get_PAV: 'False'
 # Computes SyRI figures for haplotypes 
 get_ASMs_SyRI: 'True' # Haplotype vs Reference
 get_chrInputs_SyRI: 'True' # SyRI on chrInputs
-- 
GitLab


From f228fc62ae3b5b150b2e66318c6a4b94da81af57 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Tue, 3 Dec 2024 19:01:07 +0100
Subject: [PATCH 306/310] Adapted docstring syntax

---
 Snakefile | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 107 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 0a8a02f..c1da51c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -105,11 +105,15 @@ rule ragtag_scaffolding:
         - Reference assembly (.fa.gz)
         - Haplotype assembly (<haplotype>.fa.gz)
         - Fasta index of the reference (.fa.gz.fai and .fa.gz.gzi)
+
     Output :
         - Scaffolded haplotype (<haplotype>.ragtagged.fa) [Temporary]
         - Tarball with RagTag temporary files (<haplotype>.tar.gz)
+
     Threads : 8
+
     Memory : n_threads * mem_multiplier * 4Gb
+
     Parameters :
         - app.path
         - ragtag_args
@@ -164,10 +168,14 @@ rule quast_stats:
     Input :
         - Reference assembly (.fa.gz)
         - All haplotypes (.fa.gz)
+
     Output :
         - Quast HTML report
+
     Threads : 16
+
     Memory : n_threads * mem_multiplier * 4Gb
+
     Parameters :
         - app.path
         - Pangenome name
@@ -220,10 +228,14 @@ rule assemblathon_stats:
     Input :
         - Haplotype (<haplotype>.fa.gz)
         - Fasta index of the haplotype (<haplotype>.fa.gz.fai)
+
     Output :
         - Tab-delimited table with assemblathon stats (<haplotype>.stats.csv)
+
     Threads : 1
+
     Memory : n_threads * 16Gb
+
     Parameters :
         - app.path
     """
@@ -260,11 +272,15 @@ rule contig_positions:
     Input :
         - Chromosome fasta (<chromosome>.fa.gz)
         - Chromosome fasta index (<chromosome>.fa.gz.fai)
+
     Output :
         - Figure of contig decomposition (<chromosome>.contig.png)
         - Temporary directory [Temporary]
+
     Threads : 1
+
     Memory : n_threads * mem_multiplier * 16Gb
+
     Parameters :
         - app.path
     """
@@ -324,10 +340,14 @@ rule chromosome_clustering:
 
     Input :
         - All ragtagged haplotype fasta (<haplotype>.ragtagged.fa.gz)
+
     Output :
         - All chromosome fastas (<chromosome>.fa) [Temporary]
+
     Threads : 1
+
     Memory : n_threads * mem_multiplier * 16Gb
+
     Parameters :
         - app.path
         - Pangenome name
@@ -359,11 +379,15 @@ rule SyRI_on_ASM_mm2:
     Input :
         - Reference assembly (.ragtagged.fa.gz)
         - Haplotype assembly (<haplotype>.ragtagged.fa.gz)
+
     Output :
         - SyRI figure (<haplotype>.syri_mm2.png)
         - SyRI VCF (<haplotype>.syri_mm2.vcf.gz)
+
     Threads : 4
+
     Memory : n_threads * mem_multiplier * 12Gb
+
     Parameters :
         - app.path
         - Plotsr config file (src/plotsr-base.cfg)
@@ -419,11 +443,15 @@ rule SyRI_on_ASM_wfm:
     Input :
         - Reference assembly (.ragtagged.fa.gz)
         - Haplotype assembly (<haplotype>.ragtagged.fa.gz)
+
     Output :
         - SyRI figure (<haplotype>.syri_wfm.png)
         - SyRI VCF (<haplotype>.syri_wfm.vcf.gz)
+
     Threads : 4
+
     Memory : n_threads * mem_multiplier * 12Gb
+
     Parameters :
         - app.path
         - Plotsr config file (src/plotsr-base.cfg)
@@ -481,10 +509,14 @@ rule SyRI_on_chrInput:
 
     Input :
         - Chromosome fasta (<chromosome>.fa.gz)
+
     Output :
         - SyRI figure (<chromosome>.syri_mm2.png)
+
     Threads : 4
+
     Memory : n_threads * mem_multiplier * 12Gb
+
     Parameters :
         - app.path
         - Plotsr config file (src/plotsr-base.cfg)
@@ -580,12 +612,14 @@ rule asm_json:
         - All figures of contig decomposition (.contig.png) [Optional]
         - SyRI figure with MM2 (<haplotype>.syri_mm2.png) [Optional]
 
-
     Output :
         - SyRI figure (<haplotype>.syri_mm2.png)
         - SyRI VCF (<haplotype>.syri_mm2.vcf.gz)
+
     Threads : 4
+
     Memory : n_threads * mem_multiplier * 12Gb
+
     Parameters :
         - app.path
         - Plotsr config file (src/plotsr-base.cfg)
@@ -633,13 +667,17 @@ rule wfmash_on_chr:
     Input :
         - Chromosome fasta (<chromosome>.fa.gz)
         - Chromosome fasta index (<chromosome>.fa.gz.fai)
+
     Output :
         - Mapping (<chromosome>.wfmash.mapping.paf) [Temporary]
         - Compressed mapping (<chromosome>.wfmash.mapping.paf.gz)
         - Alignment (<chromosome>.wfmash.aln.paf) [Temporary]
         - Compressed alignment (<chromosome>.wfmash.aln.paf.gz)
+
     Threads : 16
+
     Memory : n_threads * mem_multiplier * 2Gb
+
     Parameters :
         - app.path
         - wfmash.segment_length
@@ -709,10 +747,14 @@ rule seqwish:
     Input :
         - Chromosome fasta (<chromosome>.fa.gz)
         - Alignment file (<chromosome>.wfmash.aln.paf.gz)
+
     Output :
         - Seqwish graph (<chromosome>.seqwish.gfa.gz) [GFAv1.0]
+
     Threads : 8
+
     Memory : n_threads * mem_multiplier * 4Gb
+
     Parameters :
         - app.path
         - seqwish.params
@@ -756,11 +798,15 @@ rule gfaffix_on_chr:
 
     Input :
         - Seqwish graph (<chromosome>.seqwish.gfa.gz) [GFAv1.0]
+
     Output :
         - GFAffixed graph (<chromosome>.seqwish.gfaffixD.gfa.gz) [GFAv1.0]
         - Transform logs (<chromosome>.seqwish.gfaffixD.transform.txt)
+
     Threads : 1
+
     Memory : n_threads * mem_multiplier * 24Gb
+
     Parameters :
         - app.path
     """
@@ -805,10 +851,14 @@ rule odgi_postprocessing:
     Input :
         - Graph metadata (Pan1c.<pangenome_name>.gfa.metadata)
         - GFAffixed graph (<chromosome>.seqwish.gfaffixD.gfa.gz)
+
     Output :
         - Chromosome graph (<chromosome>.gfa.gz) [GFAv1.0]
+
     Threads : 8
+
     Memory : n_threads * mem_multiplier * 4Gb
+
     Parameters :
         - app.path
     """
@@ -889,10 +939,14 @@ rule MC_graph:
     Input :
         - Graph metadata (Pan1c.<pangenome_name>.gfa.metadata)
         - Chromosome fasta (<chromosome>.fa.gz)
+
     Output :
         - Minigraph-Cactus graph (<chromosome>.gfa.gz) [GFAv1.0 (converted from GFAv1.1 with GFAvc)]
+
     Threads : 16
+
     Memory : mem_multiplier * 32Gb
+
     Parameters :
         - app.path
         - reference
@@ -967,9 +1021,12 @@ rule generate_graph_list:
 
     Input :
         - Chromosome graphs (<gtool>.<chromosome>.tmp.gfa) [GFAv1.0]
+
     Output :
         - List of graphs (graphsList.<gtool>.txt)
+
     Threads : 1
+
     Memory : 4Gb
     """
     input:
@@ -994,10 +1051,14 @@ rule graph_squeeze:
         - List of graphs (graphsList.<gtool>.txt)
         - Graph metadata (Pan1c.<pangenome_name>.gfa.metadata)
         - Chromosome graphs (Pan1c.<gtool>.<pangenome_name>.<chromosome>.tmp.gfa) [GFAv1.0]
+
     Output :
         - Final graph (<gtool>.<pangenome_name>.gfa.gz) [GFAv1.0]
+
     Threads : 16
+
     Memory : n_threads * mem_multiplier * 2Gb
+
     Parameters :
         - app.path
     """
@@ -1048,11 +1109,15 @@ rule graph_stats:
 
     Input :
         - A single chromosome graph (<gtool>.<pangenome_name>.<chromosome>.gfa.gz) [GFAv1.0]
+
     Output :
         - General statistics (<chromosome>.general.stats.tsv)
         - Path statistics (<chromosome>.path.stats.tsv)
+
     Threads : 4
+
     Memory : n_threads * mem_multiplier * 8Gb
+
     Parameters :
         - app.path
         - name
@@ -1084,11 +1149,15 @@ rule graph_figs:
 
     Input :
         - Chromosome graph (<gtool>.<pangenome_name>.<chromosome>.tmp.gfa)
+
     Output :
         - 1D viz (<gtool>.<pangenome_name>.<chromosome>.1Dviz.png)
         - 1D path coverage (<gtool>.<pangenome_name>.<chromosome>.pcov.png)
+
     Threads : 4
+
     Memory : n_threads * mem_multiplier * 4Gb
+
     Parameters :
         - app.path
         - odgi.1Dviz.params
@@ -1125,11 +1194,15 @@ rule aggregate_graphs_stats:
     Input :
         - Chromosome graphs general statistics (<gtool>.<pangenome_name>.<chromosome>.general.stats.tsv)
         - Chromosome graphs paths statistics (<gtool>.<pangenome_name>.<chromosome>.path.stats.tsv)
+
     Output :
         - General statistics (<gtool>.<pangenome_name>.chrGraph.general.stats.tsv)
         - Paths statistics (<gtool>.<pangenome_name>.chrGraph.path.stats.tsv)
+
     Threads : 1
+
     Memory : n_threads * mem_multiplier * 2Gb
+
     Parameters :
         - app.path
         - name
@@ -1160,11 +1233,15 @@ rule get_graph_tags:
 
     Input :
         - config.yaml
+    
     Output :
         - Graph metadata (Pan1c.<pangenome_name>.gfa.metadata)
         - Tags JSON for Pan1c-View (summary section)
+
     Threads : 1
+
     Memory : n_threads * mem_multiplier * 8Gb
+
     Parameters :
         - app.path
         - name
@@ -1195,10 +1272,14 @@ rule graph_json:
         - Aggregated general statistics for all graph tools (*.chrGraph.general.stats.tsv)
         - Aggregated path statistics for all graph tools (*.chrGraph.path.stats.tsv)
         - Contig-Odgi figures (*.report.fig.png) 
+
     Output :
         - Graph JSON (Pan1c.<pangenome_name>.graph.json)
+
     Threads : 1
+
     Memory : n_threads * mem_multiplier * 16Gb
+
     Parameters :
         - app.path
         - name
@@ -1239,10 +1320,14 @@ rule panacus_stats:
         - Chromosome graph (<gtool>.<pangenome_name>.<chromosome>.tmp.gfa)
+
     Output :
         - Panacus histgrowth report (<gtool>.<pangenome_name>.<chromosome>.histgrowth.html)
+
     Threads : 1
+
     Memory : n_threads * mem_multiplier * 16Gb
+
     Parameters :
         - app.path
         - name
@@ -1282,10 +1367,14 @@ rule vg_deconstruct:
 
     Input :
         - Final graph in XG format (Pan1c.<gtool>.<pangenome_name>.xg)
+
     Output :
         - VCF (Pan1c.<gtool>.<pangenome_name>.vcf) [Temporary]
+
     Threads : 8
+
     Memory : n_threads * mem_multiplier * 32Gb
+
     Parameters :
         - app.path
         - reference
@@ -1323,9 +1412,12 @@ rule vg_vcf_2_tsv:
 
     Input :
         - VCF from VG deconstruct (Pan1c.<gtool>.<pangenome_name>.vcf.gz)
+
     Output :
         - TSV (vg_<gtool>.tsv) [Temporary]
+
     Threads : 1
+
     Memory : n_threads * mem_multiplier * 8Gb
     """
     input:
@@ -1346,9 +1438,12 @@ rule syri_vcf_2_tsv:
 
     Input :
         - VCFs from SyRI (<haplotypes>.syri.mm2.vcf.gz)
+
     Output :
         - TSV (syri_mm2.tsv) [Temporary]
+
     Threads : 1
+
     Memory : n_threads * mem_multiplier * 8Gb
     """
     input:
@@ -1404,10 +1499,14 @@ rule var_json:
     Input :
         - TSVs from VG deconstruct VCF (vg_<gtool>.tsv)
         - TSV from SyRI with minimap2 (syri_mm2.tsv)
+
     Output :
         - Variant JSON (Pan1c.<pangenome_name>.var.json)
+
     Threads : 1
+
     Memory : n_threads * mem_multiplier * 48Gb
+
     Parameters :
         - app.path
         - reference
@@ -1438,12 +1537,16 @@ rule create_pan1c_report_fig:
     Input :
         - Chromosome graph (.<chromosome>.tmp.gfa)
         - Contig decomposition figures (.<chromosome>.contig.png)
+
     Output :
         - Odgi 1D figure (.<gtool>.<chromosome>.odgi.png) [Temporary]
         - Name figure (.<gtool>.<chromosome>.name.png)
         - Contig-Odgi figures (<gtool>.<pangenome_name>.<chromosome>.report.fig.png)
+
     Threads : 4
+
     Memory : n_threads * mem_multiplier * 2Gb
+
     Parameters :
         - app.path
     """
@@ -1555,9 +1658,12 @@ rule Pan1c_View_data:
         - SyRI figure on chromosomes, with minimap2 (<chromosome>.syri_mm2.png) [Optional]
         - VCF(s) from VG deconstruct (*.vcf.gz) [Optional]
         - XG indexed graphs (*.xg) [Optional]
+
     Output :
         - Tarball (<pangenome_name>.Pan1c_View.tar.gz)
+
     Threads : 1
+
     Memory : 8Gb
     """
     input:
-- 
GitLab


From cc3ccbcf752f4d3ae111aa3f27b384efed951c35 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Fri, 6 Dec 2024 15:13:29 +0100
Subject: [PATCH 307/310] Typo

BGZIP2 instead of BGZIP
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d100eaa..baebd74 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ The workflow use a set of apptainer images :
 
 # Prepare your data
 This workflow can take chromosome level assemblies as well as contig level assemblies but requires a reference assembly.  
-**Fasta files need to be compressed** using **bgzip2** (included in [PanGeTools](https://forgemia.inra.fr/alexis.mergez/pangetools)).
+**Fasta files need to be compressed** using **bgzip** (included in [PanGeTools](https://forgemia.inra.fr/alexis.mergez/pangetools)).
 Sequences names of the reference **must follow this pattern** : `<sample>#<haplotype>#<contig or chromosome name>`.  
 For example, CHM13 chromosomes (haploid) must be named `CHM13#1#chr..`. Only the reference needs to follow this pattern for its sequence names. Other haplotypes' sequences will be renamed based on the reference and their respective fasta file names. 
 Fasta files **must also follow a pattern** : `<sample>.hap<haplotype>.fa.gz`. Once again with CHM13, the fasta file should be named : `CHM13.hap1.fa.gz`.  
-- 
GitLab
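
The paragraph corrected above describes two naming conventions: bgzip-compressed fasta files named `<sample>.hap<haplotype>.fa.gz` and reference sequence names of the form `<sample>#<haplotype>#<contig or chromosome name>`. A purely illustrative check of both patterns (the regular expressions below are an assumption added for clarity, not code shipped with the workflow):

```python
# Illustrative check of the Pan1c input naming conventions described in the README.
# These regexes are a sketch only; they are not part of the workflow.
import re

FASTA_NAME = re.compile(r"^(?P<sample>[^.#]+)\.hap(?P<haplotype>\d+)\.fa\.gz$")
SEQ_NAME = re.compile(r"^(?P<sample>[^#]+)#(?P<haplotype>[^#]+)#(?P<sequence>.+)$")

assert FASTA_NAME.match("CHM13.hap1.fa.gz")      # file name, as in the README example
assert SEQ_NAME.match("CHM13#1#chr1")            # reference sequence name (PanSN-like)
assert not FASTA_NAME.match("CHM13_hap1.fasta")  # would not be accepted
print("naming conventions OK")
```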


From a4228141ef2ea01b025f48c1100bc7710cd10925 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 9 Dec 2024 13:51:19 +0100
Subject: [PATCH 308/310] Fixed get_MC error in getTags.py

---
 scripts/getTags.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/getTags.py b/scripts/getTags.py
index 54a0483..22a844c 100644
--- a/scripts/getTags.py
+++ b/scripts/getTags.py
@@ -157,7 +157,7 @@ for section, svalues in tags.items():
         print('#')
 
 # Adding path to generated files
-gtools = ["PGGB"] + (tags["Parameters"]["get_MC"] == "True")*["MC"]
+gtools = (tags["Parameters"]["run_PGGB"] == "True")*["PGGB"] + (tags["Parameters"]["run_MC"] == "True")*["MC"]
 
 tags["Files"] = {
     "GFAv1": {
-- 
GitLab
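
The corrected line relies on Python treating a boolean as 0 or 1 when multiplying a list: each tool name is kept only if its `run_*` flag is set to `'True'` in the config. A quick illustration of the expression:

```python
# How the fixed getTags.py line builds the tool list: multiplying a list by a
# boolean keeps it (True == 1) or drops it (False == 0), so only enabled tools
# survive the concatenation. `params` stands in for tags["Parameters"].
params = {"run_PGGB": "True", "run_MC": "False"}

gtools = (params["run_PGGB"] == "True") * ["PGGB"] + (params["run_MC"] == "True") * ["MC"]
print(gtools)  # ['PGGB']
```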


From b3c7eb905bc0c00d97316b41dd0ad98d52cb3184 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 11 Dec 2024 15:48:56 +0100
Subject: [PATCH 309/310] Relaxed naming scheme for sequences

---
 scripts/ragtagChromInfer.sh | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh
index c163697..52c9924 100755
--- a/scripts/ragtagChromInfer.sh
+++ b/scripts/ragtagChromInfer.sh
@@ -42,12 +42,8 @@ apptainer run $appdir/pan1c-env.sif ragtag.py scaffold \
 
 # Renaming sequence according to naming scheme
 echo -e "\nRenaming sequences\n"
-grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \
-    sed "s/${ref}#chr\([^_]*\)_RagTag/${hapID}#chr\1/g" > $tmpdir/${sample}.ragtagged.fa
-
-# Compressing fasta
-# apptainer run --app bgzip $appdir/PanGeTools.sif \
-#     -@ $threads $tmpdir/${sample}.ragtagged.fa
+grep ">${ref}" -A1 $tmpdir/ragtag.scaffold.fasta | \
+    sed "s/${ref}#\(.*\)_RagTag/${hapID}#\1/g" > $tmpdir/${sample}.ragtagged.fa
 
 # Moving fa.gz to output dir
 echo -e "\nMoving final file\n"
-- 
GitLab
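
For readers less familiar with sed: the relaxed substitution above now rewrites any `_RagTag`-suffixed sequence name carrying the reference prefix, not only `chr*` ones. An equivalent of the substitution in Python; the values used for `ref` and `hapID` are illustrative assumptions, the script derives them from its own arguments:

```python
# Python equivalent of: sed "s/${ref}#\(.*\)_RagTag/${hapID}#\1/g"
# ref and hapID below are example values for illustration only.
import re

ref, hapID = "CHM13#1", "SAMPLE#1"
name = ">CHM13#1#chr1_RagTag"

renamed = re.sub(rf"{re.escape(ref)}#(.*)_RagTag", rf"{hapID}#\1", name)
print(renamed)  # >SAMPLE#1#chr1
```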


From 08cc84d7a53001d9a4469c9bfbba843c7ef7ec5a Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Wed, 11 Dec 2024 16:20:03 +0100
Subject: [PATCH 310/310] Updated README

---
 README.md | 106 +++++++++++++++++++++++-------------------------------
 Snakefile |   2 +-
 2 files changed, 46 insertions(+), 62 deletions(-)

diff --git a/README.md b/README.md
index baebd74..6915e8b 100644
--- a/README.md
+++ b/README.md
@@ -8,37 +8,49 @@ The workflow use a set of apptainer images :
 > An example of input files and a config file is available in `example/`.  
 
 # Minimum image version
-- PanGeTools >= v1.10.0
-- Pan1c-env >= v1.0.14
-- Pan1c-box >= v1.0.14
+
+- PanGeTools >= v1.10.10
+- Pan1c-env >= v1.1.1
+- Pan1c-box >= v1.1.2
+- minigraph-cactus >= v2.1.4b
 
 # Prepare your data
+
 This workflow can take chromosome level assemblies as well as contig level assemblies but requires a reference assembly.  
 **Fasta files need to be compressed** using **bgzip** (included in [PanGeTools](https://forgemia.inra.fr/alexis.mergez/pangetools)).
-Sequences names of the reference **must follow this pattern** : `<sample>#<haplotype>#<contig or chromosome name>`.  
+Sequence names of the reference **must follow this pattern** :  
+`<sample>#<haplotype>#<contig or chromosome name>`  
 For example, CHM13 chromosomes (haploid) must be named `CHM13#1#chr..`. Only the reference needs to follow this pattern for its sequence names. Other haplotypes' sequences will be renamed based on the reference and their respective fasta file names. 
-Fasta files **must also follow a pattern** : `<sample>.hap<haplotype>.fa.gz`. Once again with CHM13, the fasta file should be named : `CHM13.hap1.fa.gz`.  
+Fasta files **must also follow a pattern** :  
+`<sample>.hap<haplotype>.fa.gz`  
+Once again with CHM13, the fasta file should be named : `CHM13.hap1.fa.gz`.  
 
 See [PanSN](https://github.com/pangenome/PanSN-spec) for more info on sequence naming.  
 
 You should only provide chromosome-level assemblies, but, as the haplotypes are renamed using RagTag, it is possible to give scaffold- or contig-level assemblies. Since RagTag scaffolds each assembly using the "reference" haplotype, it can scaffold chromosome-level assemblies that also contain non-placed scaffolds/contigs. If you don't want this behavior, remove any non-chromosome-level sequences from your FASTAs **before** providing them to Pan1c.
 
 # Download apptainer images
+
 Before running the workflow, some apptainer images need to be downloaded. Use the script getApps.sh to do so :
 ```
 ./getApps.sh -a <apps directory>
 ``` 
 
-> Make sure to use the latest version or the workflow might return you errors !
+> Make sure to use the latest versions or the workflow might return errors.
 
 # Running the workflow
+
 Clone this repository and create a `data/haplotypes` directory where you will place all your haplotypes.  
 Update the reference name and the apptainer image directory in `config.yaml`.  
 Then, modify the variables in `runSnakemake.sh` to match your requirements (number of threads, memory, job name, email, etc.).  
+
 ## Single machine mode
+
 Navigate to the root directory of the repository and execute `sbatch runSnakemake.sh`!
 The default script uses a single node and runs everything on it. This method only requires apptainer to run but isn't the most efficient for job distribution.
+
 ## Cluster execution
+
 To execute each step as a job with SLURM, install a custom conda environment with this command : 
 ```
 conda create -n Pan1c -c conda-forge -c bioconda snakemake=8.4.7 snakemake-executor-plugin-slurm
@@ -46,20 +58,26 @@ conda create -n Pan1c -c conda-forge -c bioconda snakemake=8.4.7 snakemake-execu
 This works by having a job that runs snakemake which will submit other jobs. To do so, configure `runSnakemakeSLURM.sh` and submit it using `sbatch`.
 > If you get OOM errors, use the mem_multiplier in `config.yaml` to allocate more memory for jobs.
 
-# Outputs
+# Pan1c_View
+
+[Pan1c_View](https://forgemia.inra.fr/philippe.bardou/pan1c_view) is an interface developed by Philippe Bardou, used to visualize the statistics generated from Pan1c pangenome graphs.  
+To use it, extract the Pan1c_View tarball generated by the workflow into the `project` folder of Pan1c_View, then follow [these](https://forgemia.inra.fr/philippe.bardou/pan1c_view#installation) instructions. 
+
+# Main outputs
+
 The workflow generates several key files :
-- Aggregated graph including every chromosome scale graphs (`output/pan1c.<panname>.gfa`)  
-- Chromosome scale graphs (`data/chrGraphs/chr<id>.gfa`)  
-- Panacus html reports for each chromosome level graph (`output/panacus.reports/chr<id>.histgrowth.html`)  
+- Aggregated graph including every chromosome-scale graph (`output/Pan1c.<gtool>.<pangenome_name>.gfa.gz`)  
+- Chromosome-scale graphs (`data/chrGraphs/Pan1c.<gtool>.<pangenome_name>.<chromosome>.gfa.gz`)  
+- Panacus HTML reports for each chromosome-level graph (`output/panacus.reports/Pan1c.<gtool>.<pangenome_name>.<chromosome>.histgrowth.html`)  
 - Statistics on input sequences, graphs and resources used by the workflow (`output/stats`)
 - Odgi 1D visualization of chromosome level graphs (`output/chrGraphs.figs`)
-- (Optional) SyRI structural variant figures (`output/asm.syri.figs`) 
-- (Optional) Quast results on your input haplotypes (`output/quast`)
-- (Optional) Contig composition of chromosomes of your input haplotypes (`output/hap.contig`) 
-- (optional) PAV matrices for each chromosome graph (`output/pav.matrices/chr<id>.pav.matrix.tsv`)
+- (Optional) Pan1c-View tarball (`output/<pangenome_name>.Pan1c-View.data.tar.gz`)
+- (Optional) SyRI structural variant figures (`output/asm.syri.figs`, `chrInput.syri.figs`) 
+- (Optional) Quast results on your input haplotypes (`output/Pan1c.<pangenome_name>.quast.report.html`)
+- (Optional) Contig composition of chromosomes of your input haplotypes (`output/chr.contig`) 
 
 # File architecture
-## Before running the workflow
+
 ```
 Pan1c/
 ├── config.yaml
@@ -73,60 +91,26 @@ Pan1c/
 ├── getApps.sh
 ├── README.md
 ├── runSnakemake.sh
+├── runSnakemakeSLURM.sh
 ├── scripts
 │   └── ...
 └── Snakefile
 ```
-## After the workflow (Arabidopsis Thaliana example)
-The following tree is non-exhaustive for clarity. Temporary files are not listed, but key files are included.
-The name of the pangenome is `06AT-v3`.
-```
-Pan1c-06AT-v3
-├── chrInputs
-│   
-├── config.yaml
-├── data
-│   ├── chrGraphs
-│   │   ├── chr<id>
-│   │   ├── chr<id>.gfa
-│   │   └── graphsList.txt
-│   ├── chrInputs
-│   │   └── chr<id>.fa.gz
-│   ├── haplotypes
-│   └── hap.ragtagged
-│       ├── <sample>.hap<hid>
-│       └── <sample>.hap<hid>.ragtagged.fa.gz
-├── logs
-│   ├── pan1c.pggb.06AT-v3.logs.tar.gz
-│   └── pggb
-│       ├── chr<id>.pggb.cmd.log
-│       └── chr<id>.pggb.time.log
-├── output
-│   ├── figures
-│   │   ├── chr<id>.1Dviz.png
-│   │   └── chr<id>.pcov.png
-│   ├── stats
-│   │   ├── pan1c.pggb.06AT-v3.core.stats.tsv
-│   │   ├── pan1c.pggb.06AT-v3.chrGraph.general.stats.tsv
-│   │   └── pan1c.pggb.06AT-v3.chrGraph.path.stats.tsv
-│   ├── pan1c.pggb.06AT-v3.gfa
-│   ├── panacus.reports
-│   │   └── chr<id>.histgrowth.html
-│   └── chrGraphs.stats
-│       └── chr<id>.stats.tsv
-├── Pan1c-06AT-v3.log
-├── README.md
-├── runSnakemake.sh
-├── scripts
-│   └── ...
-├── Snakefile
-└── workflow.svg
-```
 
 # Example DAG (Saccharomyces cerevisiae example)
+
 This DAG shows the workflow for a pangenome of `Saccharomyces cerevisiae` using the `R64` reference.
 ![Workflow DAG](example/workflow.svg)
 
+# Authors and acknowledgment
+
+- Alexis Mergez
+- Martin Racoupeau
+- Christophe Klopp
+- Christine Gaspin
+- Fabrice Legeai
+
 # Contact
-[alexis.mergez@inrae.fr](mailto:alexis.mergez@inrae.fr) 
+
+[pan1c@inrae.fr](mailto:pan1c@inrae.fr) 
 
diff --git a/Snakefile b/Snakefile
index c1da51c..5b2642e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1053,7 +1053,7 @@ rule graph_squeeze:
         - Chromosome graphs (Pan1c.<gtool>.<pangenome_name>.<chromosome>.tmp.gfa) [GFAv1.0]
 
     Output :
-        - Final graph (<gtool>.<pangenome_name>.gfa.gz) [GFAv1.0]
+        - Final graph (Pan1c.<gtool>.<pangenome_name>.gfa.gz) [GFAv1.0]
 
     Threads : 16
 
-- 
GitLab